shulijia commited on
Commit
f651489
·
verified ·
1 Parent(s): 1cb69bf

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c218477cd2c0625663cdfb2563492eb7172c89c7303421743a59f7d281ceb01
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a49264be75b7765ea678b9b59b3a697ee1aafcfc6ba700a2bb5029482f8d7bf
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c07d7eb4be73f387d83f3548ea91e7d24dab91c8fc3a76c775a21b4b4623cefe
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f742f053229dbe9bb02cee7d2b024a0b812cecc2ca49ca71a2d415a2c2912a70
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bbed8446741b35ce14e30a783ce1c024c4e38dd948dbe1505d21c5937fe6077
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f59d5835f9b632408ae6ab68738905e125bd09ad3ec01adcf60d2207ea126d
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.403943255590286,
6
  "eval_steps": 100,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2258,6 +2258,456 @@
2258
  "mean_token_accuracy": 0.8223948117345572,
2259
  "num_tokens": 20475904.0,
2260
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2261
  }
2262
  ],
2263
  "logging_steps": 10,
@@ -2277,7 +2727,7 @@
2277
  "attributes": {}
2278
  }
2279
  },
2280
- "total_flos": 5.411381606508134e+16,
2281
  "train_batch_size": 2,
2282
  "trial_name": null,
2283
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.88482808367396,
6
  "eval_steps": 100,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2258
  "mean_token_accuracy": 0.8223948117345572,
2259
  "num_tokens": 20475904.0,
2260
  "step": 2500
2261
+ },
2262
+ {
2263
+ "epoch": 2.4135609521519594,
2264
+ "grad_norm": 1.350571870803833,
2265
+ "learning_rate": 2.175925925925926e-06,
2266
+ "loss": 0.1403,
2267
+ "mean_token_accuracy": 0.829684441536665,
2268
+ "num_tokens": 20557824.0,
2269
+ "step": 2510
2270
+ },
2271
+ {
2272
+ "epoch": 2.423178648713633,
2273
+ "grad_norm": 1.4614524841308594,
2274
+ "learning_rate": 2.1403133903133905e-06,
2275
+ "loss": 0.1627,
2276
+ "mean_token_accuracy": 0.8001100782305002,
2277
+ "num_tokens": 20639744.0,
2278
+ "step": 2520
2279
+ },
2280
+ {
2281
+ "epoch": 2.4327963452753067,
2282
+ "grad_norm": 1.424734115600586,
2283
+ "learning_rate": 2.104700854700855e-06,
2284
+ "loss": 0.1476,
2285
+ "mean_token_accuracy": 0.8220401186496019,
2286
+ "num_tokens": 20721664.0,
2287
+ "step": 2530
2288
+ },
2289
+ {
2290
+ "epoch": 2.44241404183698,
2291
+ "grad_norm": 1.10490882396698,
2292
+ "learning_rate": 2.069088319088319e-06,
2293
+ "loss": 0.1294,
2294
+ "mean_token_accuracy": 0.83066291436553,
2295
+ "num_tokens": 20803584.0,
2296
+ "step": 2540
2297
+ },
2298
+ {
2299
+ "epoch": 2.4520317383986536,
2300
+ "grad_norm": 1.5091724395751953,
2301
+ "learning_rate": 2.033475783475784e-06,
2302
+ "loss": 0.122,
2303
+ "mean_token_accuracy": 0.8367049902677536,
2304
+ "num_tokens": 20885504.0,
2305
+ "step": 2550
2306
+ },
2307
+ {
2308
+ "epoch": 2.461649434960327,
2309
+ "grad_norm": 1.088440179824829,
2310
+ "learning_rate": 1.9978632478632483e-06,
2311
+ "loss": 0.1233,
2312
+ "mean_token_accuracy": 0.8360934432595968,
2313
+ "num_tokens": 20967424.0,
2314
+ "step": 2560
2315
+ },
2316
+ {
2317
+ "epoch": 2.4712671315220005,
2318
+ "grad_norm": 1.1838353872299194,
2319
+ "learning_rate": 1.962250712250712e-06,
2320
+ "loss": 0.1428,
2321
+ "mean_token_accuracy": 0.8347969647496939,
2322
+ "num_tokens": 21049344.0,
2323
+ "step": 2570
2324
+ },
2325
+ {
2326
+ "epoch": 2.480884828083674,
2327
+ "grad_norm": 1.687991738319397,
2328
+ "learning_rate": 1.9266381766381765e-06,
2329
+ "loss": 0.1543,
2330
+ "mean_token_accuracy": 0.808084636554122,
2331
+ "num_tokens": 21131264.0,
2332
+ "step": 2580
2333
+ },
2334
+ {
2335
+ "epoch": 2.4905025246453474,
2336
+ "grad_norm": 0.9223553538322449,
2337
+ "learning_rate": 1.891025641025641e-06,
2338
+ "loss": 0.1155,
2339
+ "mean_token_accuracy": 0.8487157512456178,
2340
+ "num_tokens": 21213184.0,
2341
+ "step": 2590
2342
+ },
2343
+ {
2344
+ "epoch": 2.500120221207021,
2345
+ "grad_norm": 1.4413843154907227,
2346
+ "learning_rate": 1.8554131054131056e-06,
2347
+ "loss": 0.135,
2348
+ "mean_token_accuracy": 0.8379280813038349,
2349
+ "num_tokens": 21295104.0,
2350
+ "step": 2600
2351
+ },
2352
+ {
2353
+ "epoch": 2.5097379177686943,
2354
+ "grad_norm": 1.0525134801864624,
2355
+ "learning_rate": 1.81980056980057e-06,
2356
+ "loss": 0.14,
2357
+ "mean_token_accuracy": 0.8360812149941921,
2358
+ "num_tokens": 21377024.0,
2359
+ "step": 2610
2360
+ },
2361
+ {
2362
+ "epoch": 2.519355614330368,
2363
+ "grad_norm": 1.246717095375061,
2364
+ "learning_rate": 1.7841880341880343e-06,
2365
+ "loss": 0.1511,
2366
+ "mean_token_accuracy": 0.827531798556447,
2367
+ "num_tokens": 21458944.0,
2368
+ "step": 2620
2369
+ },
2370
+ {
2371
+ "epoch": 2.5289733108920416,
2372
+ "grad_norm": 1.5476717948913574,
2373
+ "learning_rate": 1.7485754985754989e-06,
2374
+ "loss": 0.1471,
2375
+ "mean_token_accuracy": 0.8342710375785828,
2376
+ "num_tokens": 21540864.0,
2377
+ "step": 2630
2378
+ },
2379
+ {
2380
+ "epoch": 2.538591007453715,
2381
+ "grad_norm": 0.9907192587852478,
2382
+ "learning_rate": 1.7129629629629632e-06,
2383
+ "loss": 0.1281,
2384
+ "mean_token_accuracy": 0.8439823880791664,
2385
+ "num_tokens": 21622784.0,
2386
+ "step": 2640
2387
+ },
2388
+ {
2389
+ "epoch": 2.5482087040153885,
2390
+ "grad_norm": 1.137152910232544,
2391
+ "learning_rate": 1.6773504273504274e-06,
2392
+ "loss": 0.1273,
2393
+ "mean_token_accuracy": 0.8406555730849504,
2394
+ "num_tokens": 21704704.0,
2395
+ "step": 2650
2396
+ },
2397
+ {
2398
+ "epoch": 2.5578264005770617,
2399
+ "grad_norm": 1.1246287822723389,
2400
+ "learning_rate": 1.6417378917378917e-06,
2401
+ "loss": 0.1161,
2402
+ "mean_token_accuracy": 0.8306873764842748,
2403
+ "num_tokens": 21786624.0,
2404
+ "step": 2660
2405
+ },
2406
+ {
2407
+ "epoch": 2.5674440971387353,
2408
+ "grad_norm": 1.2270443439483643,
2409
+ "learning_rate": 1.6061253561253563e-06,
2410
+ "loss": 0.1478,
2411
+ "mean_token_accuracy": 0.8372553810477257,
2412
+ "num_tokens": 21868544.0,
2413
+ "step": 2670
2414
+ },
2415
+ {
2416
+ "epoch": 2.577061793700409,
2417
+ "grad_norm": 1.3623789548873901,
2418
+ "learning_rate": 1.5705128205128206e-06,
2419
+ "loss": 0.1489,
2420
+ "mean_token_accuracy": 0.8291340474039316,
2421
+ "num_tokens": 21950464.0,
2422
+ "step": 2680
2423
+ },
2424
+ {
2425
+ "epoch": 2.586679490262082,
2426
+ "grad_norm": 0.8751484751701355,
2427
+ "learning_rate": 1.534900284900285e-06,
2428
+ "loss": 0.1659,
2429
+ "mean_token_accuracy": 0.8064823867753148,
2430
+ "num_tokens": 22032384.0,
2431
+ "step": 2690
2432
+ },
2433
+ {
2434
+ "epoch": 2.596297186823756,
2435
+ "grad_norm": 1.0951944589614868,
2436
+ "learning_rate": 1.4992877492877495e-06,
2437
+ "loss": 0.1433,
2438
+ "mean_token_accuracy": 0.828779349476099,
2439
+ "num_tokens": 22114304.0,
2440
+ "step": 2700
2441
+ },
2442
+ {
2443
+ "epoch": 2.605914883385429,
2444
+ "grad_norm": 1.2346906661987305,
2445
+ "learning_rate": 1.4636752136752138e-06,
2446
+ "loss": 0.1502,
2447
+ "mean_token_accuracy": 0.8060298431664705,
2448
+ "num_tokens": 22196224.0,
2449
+ "step": 2710
2450
+ },
2451
+ {
2452
+ "epoch": 2.6155325799471028,
2453
+ "grad_norm": 1.6823463439941406,
2454
+ "learning_rate": 1.4280626780626784e-06,
2455
+ "loss": 0.1778,
2456
+ "mean_token_accuracy": 0.7855185937136412,
2457
+ "num_tokens": 22278144.0,
2458
+ "step": 2720
2459
+ },
2460
+ {
2461
+ "epoch": 2.6251502765087764,
2462
+ "grad_norm": 1.1732553243637085,
2463
+ "learning_rate": 1.3924501424501425e-06,
2464
+ "loss": 0.1529,
2465
+ "mean_token_accuracy": 0.8153375726193189,
2466
+ "num_tokens": 22360064.0,
2467
+ "step": 2730
2468
+ },
2469
+ {
2470
+ "epoch": 2.6347679730704496,
2471
+ "grad_norm": 0.921587347984314,
2472
+ "learning_rate": 1.3568376068376069e-06,
2473
+ "loss": 0.135,
2474
+ "mean_token_accuracy": 0.8219545006752014,
2475
+ "num_tokens": 22441984.0,
2476
+ "step": 2740
2477
+ },
2478
+ {
2479
+ "epoch": 2.644385669632123,
2480
+ "grad_norm": 1.4507887363433838,
2481
+ "learning_rate": 1.3212250712250712e-06,
2482
+ "loss": 0.1432,
2483
+ "mean_token_accuracy": 0.8294031292200088,
2484
+ "num_tokens": 22523904.0,
2485
+ "step": 2750
2486
+ },
2487
+ {
2488
+ "epoch": 2.6540033661937965,
2489
+ "grad_norm": 1.614626169204712,
2490
+ "learning_rate": 1.2856125356125358e-06,
2491
+ "loss": 0.1658,
2492
+ "mean_token_accuracy": 0.8020303327590227,
2493
+ "num_tokens": 22605824.0,
2494
+ "step": 2760
2495
+ },
2496
+ {
2497
+ "epoch": 2.66362106275547,
2498
+ "grad_norm": 0.9506202340126038,
2499
+ "learning_rate": 1.25e-06,
2500
+ "loss": 0.1229,
2501
+ "mean_token_accuracy": 0.8390044014900923,
2502
+ "num_tokens": 22687744.0,
2503
+ "step": 2770
2504
+ },
2505
+ {
2506
+ "epoch": 2.6732387593171434,
2507
+ "grad_norm": 1.1267729997634888,
2508
+ "learning_rate": 1.2143874643874644e-06,
2509
+ "loss": 0.1409,
2510
+ "mean_token_accuracy": 0.8417808238416911,
2511
+ "num_tokens": 22769664.0,
2512
+ "step": 2780
2513
+ },
2514
+ {
2515
+ "epoch": 2.682856455878817,
2516
+ "grad_norm": 0.9731321930885315,
2517
+ "learning_rate": 1.178774928774929e-06,
2518
+ "loss": 0.1481,
2519
+ "mean_token_accuracy": 0.8307118374854326,
2520
+ "num_tokens": 22851584.0,
2521
+ "step": 2790
2522
+ },
2523
+ {
2524
+ "epoch": 2.6924741524404903,
2525
+ "grad_norm": 1.5113604068756104,
2526
+ "learning_rate": 1.1431623931623931e-06,
2527
+ "loss": 0.1493,
2528
+ "mean_token_accuracy": 0.8170865952968598,
2529
+ "num_tokens": 22933504.0,
2530
+ "step": 2800
2531
+ },
2532
+ {
2533
+ "epoch": 2.702091849002164,
2534
+ "grad_norm": 1.5031578540802002,
2535
+ "learning_rate": 1.1075498575498577e-06,
2536
+ "loss": 0.1333,
2537
+ "mean_token_accuracy": 0.8457436379045248,
2538
+ "num_tokens": 23015424.0,
2539
+ "step": 2810
2540
+ },
2541
+ {
2542
+ "epoch": 2.7117095455638376,
2543
+ "grad_norm": 0.9475900530815125,
2544
+ "learning_rate": 1.071937321937322e-06,
2545
+ "loss": 0.1533,
2546
+ "mean_token_accuracy": 0.8175391405820847,
2547
+ "num_tokens": 23097344.0,
2548
+ "step": 2820
2549
+ },
2550
+ {
2551
+ "epoch": 2.721327242125511,
2552
+ "grad_norm": 1.1415244340896606,
2553
+ "learning_rate": 1.0363247863247866e-06,
2554
+ "loss": 0.1325,
2555
+ "mean_token_accuracy": 0.8296232886612416,
2556
+ "num_tokens": 23179264.0,
2557
+ "step": 2830
2558
+ },
2559
+ {
2560
+ "epoch": 2.7309449386871845,
2561
+ "grad_norm": 1.2204571962356567,
2562
+ "learning_rate": 1.0007122507122507e-06,
2563
+ "loss": 0.1314,
2564
+ "mean_token_accuracy": 0.8463918767869473,
2565
+ "num_tokens": 23261184.0,
2566
+ "step": 2840
2567
+ },
2568
+ {
2569
+ "epoch": 2.7405626352488577,
2570
+ "grad_norm": 1.4655503034591675,
2571
+ "learning_rate": 9.650997150997153e-07,
2572
+ "loss": 0.1492,
2573
+ "mean_token_accuracy": 0.8253546953201294,
2574
+ "num_tokens": 23343104.0,
2575
+ "step": 2850
2576
+ },
2577
+ {
2578
+ "epoch": 2.7501803318105313,
2579
+ "grad_norm": 1.8217484951019287,
2580
+ "learning_rate": 9.294871794871796e-07,
2581
+ "loss": 0.148,
2582
+ "mean_token_accuracy": 0.8358488243073225,
2583
+ "num_tokens": 23425024.0,
2584
+ "step": 2860
2585
+ },
2586
+ {
2587
+ "epoch": 2.759798028372205,
2588
+ "grad_norm": 1.4163944721221924,
2589
+ "learning_rate": 8.938746438746441e-07,
2590
+ "loss": 0.1626,
2591
+ "mean_token_accuracy": 0.8236545998603105,
2592
+ "num_tokens": 23506944.0,
2593
+ "step": 2870
2594
+ },
2595
+ {
2596
+ "epoch": 2.769415724933878,
2597
+ "grad_norm": 1.2180320024490356,
2598
+ "learning_rate": 8.582621082621083e-07,
2599
+ "loss": 0.1444,
2600
+ "mean_token_accuracy": 0.817331212759018,
2601
+ "num_tokens": 23588864.0,
2602
+ "step": 2880
2603
+ },
2604
+ {
2605
+ "epoch": 2.779033421495552,
2606
+ "grad_norm": 1.457688808441162,
2607
+ "learning_rate": 8.226495726495727e-07,
2608
+ "loss": 0.1272,
2609
+ "mean_token_accuracy": 0.8441291574388743,
2610
+ "num_tokens": 23670784.0,
2611
+ "step": 2890
2612
+ },
2613
+ {
2614
+ "epoch": 2.788651118057225,
2615
+ "grad_norm": 1.5352418422698975,
2616
+ "learning_rate": 7.870370370370371e-07,
2617
+ "loss": 0.1599,
2618
+ "mean_token_accuracy": 0.8156800389289856,
2619
+ "num_tokens": 23752704.0,
2620
+ "step": 2900
2621
+ },
2622
+ {
2623
+ "epoch": 2.7982688146188988,
2624
+ "grad_norm": 1.1528174877166748,
2625
+ "learning_rate": 7.514245014245015e-07,
2626
+ "loss": 0.142,
2627
+ "mean_token_accuracy": 0.8310543041676283,
2628
+ "num_tokens": 23834624.0,
2629
+ "step": 2910
2630
+ },
2631
+ {
2632
+ "epoch": 2.8078865111805724,
2633
+ "grad_norm": 0.9919131994247437,
2634
+ "learning_rate": 7.158119658119659e-07,
2635
+ "loss": 0.1517,
2636
+ "mean_token_accuracy": 0.8280088048428297,
2637
+ "num_tokens": 23916544.0,
2638
+ "step": 2920
2639
+ },
2640
+ {
2641
+ "epoch": 2.8175042077422456,
2642
+ "grad_norm": 1.1298660039901733,
2643
+ "learning_rate": 6.801994301994302e-07,
2644
+ "loss": 0.1282,
2645
+ "mean_token_accuracy": 0.8396893307566643,
2646
+ "num_tokens": 23998464.0,
2647
+ "step": 2930
2648
+ },
2649
+ {
2650
+ "epoch": 2.8271219043039193,
2651
+ "grad_norm": 1.4493669271469116,
2652
+ "learning_rate": 6.445868945868947e-07,
2653
+ "loss": 0.1474,
2654
+ "mean_token_accuracy": 0.8369129169732332,
2655
+ "num_tokens": 24080384.0,
2656
+ "step": 2940
2657
+ },
2658
+ {
2659
+ "epoch": 2.8367396008655925,
2660
+ "grad_norm": 1.313490390777588,
2661
+ "learning_rate": 6.08974358974359e-07,
2662
+ "loss": 0.1356,
2663
+ "mean_token_accuracy": 0.8197651654481888,
2664
+ "num_tokens": 24162304.0,
2665
+ "step": 2950
2666
+ },
2667
+ {
2668
+ "epoch": 2.846357297427266,
2669
+ "grad_norm": 1.632154107093811,
2670
+ "learning_rate": 5.733618233618235e-07,
2671
+ "loss": 0.1636,
2672
+ "mean_token_accuracy": 0.8243272993713617,
2673
+ "num_tokens": 24244224.0,
2674
+ "step": 2960
2675
+ },
2676
+ {
2677
+ "epoch": 2.85597499398894,
2678
+ "grad_norm": 1.3894017934799194,
2679
+ "learning_rate": 5.377492877492878e-07,
2680
+ "loss": 0.16,
2681
+ "mean_token_accuracy": 0.8183586075901985,
2682
+ "num_tokens": 24326144.0,
2683
+ "step": 2970
2684
+ },
2685
+ {
2686
+ "epoch": 2.865592690550613,
2687
+ "grad_norm": 1.0274466276168823,
2688
+ "learning_rate": 5.021367521367522e-07,
2689
+ "loss": 0.1246,
2690
+ "mean_token_accuracy": 0.8431629177182913,
2691
+ "num_tokens": 24408064.0,
2692
+ "step": 2980
2693
+ },
2694
+ {
2695
+ "epoch": 2.8752103871122867,
2696
+ "grad_norm": 1.2301019430160522,
2697
+ "learning_rate": 4.6652421652421653e-07,
2698
+ "loss": 0.1283,
2699
+ "mean_token_accuracy": 0.8367906074970961,
2700
+ "num_tokens": 24489984.0,
2701
+ "step": 2990
2702
+ },
2703
+ {
2704
+ "epoch": 2.88482808367396,
2705
+ "grad_norm": 1.6906356811523438,
2706
+ "learning_rate": 4.30911680911681e-07,
2707
+ "loss": 0.1407,
2708
+ "mean_token_accuracy": 0.8170743655413389,
2709
+ "num_tokens": 24571904.0,
2710
+ "step": 3000
2711
  }
2712
  ],
2713
  "logging_steps": 10,
 
2727
  "attributes": {}
2728
  }
2729
  },
2730
+ "total_flos": 6.493874426373734e+16,
2731
  "train_batch_size": 2,
2732
  "trial_name": null,
2733
  "trial_params": null