shulijia commited on
Commit
bf78cb8
·
verified ·
1 Parent(s): 2b3b840

Training in progress, step 6500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a51051b34edc25e0d64c30ca7f33b83a0761c13d9072778abfd10448c19f3f5e
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f7f960c526aaddf3dc6988e73942fac836299ec8e275266c1eba5701ae94d95
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f3b41dca90dac11ca9ea5d9671e951dd4dabbed860e4fb7230df978f7f0f912
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1ba615b7681549fb237aead953796280a2ad4be16081ccbd5f79689ec8c3f9c
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf6efc206c57420111096224e0f9dccda6dffe818f96d885666ae79f4ad31671
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edf1be9c157afb4ca46e7843711b38a681d679ee3bcd0c31f21d197c72d6bbf2
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.8068669527896994,
6
  "eval_steps": 100,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5408,6 +5408,456 @@
5408
  "mean_token_accuracy": 0.7743395321071148,
5409
  "num_tokens": 49145856.0,
5410
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5411
  }
5412
  ],
5413
  "logging_steps": 10,
@@ -5427,7 +5877,7 @@
5427
  "attributes": {}
5428
  }
5429
  },
5430
- "total_flos": 1.2988290099157402e+17,
5431
  "train_batch_size": 2,
5432
  "trial_name": null,
5433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.9574580227392515,
6
  "eval_steps": 100,
7
+ "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5408
  "mean_token_accuracy": 0.7743395321071148,
5409
  "num_tokens": 49145856.0,
5410
  "step": 6000
5411
+ },
5412
+ {
5413
+ "epoch": 1.8098787741886906,
5414
+ "grad_norm": 1.119452714920044,
5415
+ "learning_rate": 4.409993308052644e-06,
5416
+ "loss": 0.1267,
5417
+ "mean_token_accuracy": 0.7942514635622502,
5418
+ "num_tokens": 49227776.0,
5419
+ "step": 6010
5420
+ },
5421
+ {
5422
+ "epoch": 1.8128905955876817,
5423
+ "grad_norm": 1.5210676193237305,
5424
+ "learning_rate": 4.398840062458175e-06,
5425
+ "loss": 0.1288,
5426
+ "mean_token_accuracy": 0.7662426587194204,
5427
+ "num_tokens": 49309696.0,
5428
+ "step": 6020
5429
+ },
5430
+ {
5431
+ "epoch": 1.8159024169866727,
5432
+ "grad_norm": 1.1394039392471313,
5433
+ "learning_rate": 4.387686816863708e-06,
5434
+ "loss": 0.1204,
5435
+ "mean_token_accuracy": 0.7840264186263084,
5436
+ "num_tokens": 49391616.0,
5437
+ "step": 6030
5438
+ },
5439
+ {
5440
+ "epoch": 1.8189142383856636,
5441
+ "grad_norm": 1.3435912132263184,
5442
+ "learning_rate": 4.37653357126924e-06,
5443
+ "loss": 0.1307,
5444
+ "mean_token_accuracy": 0.7883317038416863,
5445
+ "num_tokens": 49473536.0,
5446
+ "step": 6040
5447
+ },
5448
+ {
5449
+ "epoch": 1.8219260597846547,
5450
+ "grad_norm": 1.6107054948806763,
5451
+ "learning_rate": 4.365380325674772e-06,
5452
+ "loss": 0.1364,
5453
+ "mean_token_accuracy": 0.7881237775087356,
5454
+ "num_tokens": 49555456.0,
5455
+ "step": 6050
5456
+ },
5457
+ {
5458
+ "epoch": 1.8249378811836459,
5459
+ "grad_norm": 1.3873751163482666,
5460
+ "learning_rate": 4.3542270800803035e-06,
5461
+ "loss": 0.1151,
5462
+ "mean_token_accuracy": 0.7963184919208288,
5463
+ "num_tokens": 49637376.0,
5464
+ "step": 6060
5465
+ },
5466
+ {
5467
+ "epoch": 1.8279497025826368,
5468
+ "grad_norm": 1.1877696514129639,
5469
+ "learning_rate": 4.343073834485836e-06,
5470
+ "loss": 0.116,
5471
+ "mean_token_accuracy": 0.7753791596740485,
5472
+ "num_tokens": 49719296.0,
5473
+ "step": 6070
5474
+ },
5475
+ {
5476
+ "epoch": 1.830961523981628,
5477
+ "grad_norm": 1.254225492477417,
5478
+ "learning_rate": 4.331920588891368e-06,
5479
+ "loss": 0.1247,
5480
+ "mean_token_accuracy": 0.7898238748311996,
5481
+ "num_tokens": 49801216.0,
5482
+ "step": 6080
5483
+ },
5484
+ {
5485
+ "epoch": 1.833973345380619,
5486
+ "grad_norm": 0.9596796631813049,
5487
+ "learning_rate": 4.3207673432969e-06,
5488
+ "loss": 0.1179,
5489
+ "mean_token_accuracy": 0.7971868872642517,
5490
+ "num_tokens": 49883136.0,
5491
+ "step": 6090
5492
+ },
5493
+ {
5494
+ "epoch": 1.83698516677961,
5495
+ "grad_norm": 1.2079626321792603,
5496
+ "learning_rate": 4.309614097702432e-06,
5497
+ "loss": 0.1226,
5498
+ "mean_token_accuracy": 0.7858732901513577,
5499
+ "num_tokens": 49965056.0,
5500
+ "step": 6100
5501
+ },
5502
+ {
5503
+ "epoch": 1.839996988178601,
5504
+ "grad_norm": 0.9496937990188599,
5505
+ "learning_rate": 4.298460852107964e-06,
5506
+ "loss": 0.1228,
5507
+ "mean_token_accuracy": 0.7744006861001254,
5508
+ "num_tokens": 50046976.0,
5509
+ "step": 6110
5510
+ },
5511
+ {
5512
+ "epoch": 1.843008809577592,
5513
+ "grad_norm": 1.2540092468261719,
5514
+ "learning_rate": 4.287307606513495e-06,
5515
+ "loss": 0.146,
5516
+ "mean_token_accuracy": 0.7829867884516716,
5517
+ "num_tokens": 50128896.0,
5518
+ "step": 6120
5519
+ },
5520
+ {
5521
+ "epoch": 1.8460206309765832,
5522
+ "grad_norm": 1.411310076713562,
5523
+ "learning_rate": 4.276154360919028e-06,
5524
+ "loss": 0.1577,
5525
+ "mean_token_accuracy": 0.7729085143655539,
5526
+ "num_tokens": 50210816.0,
5527
+ "step": 6130
5528
+ },
5529
+ {
5530
+ "epoch": 1.8490324523755741,
5531
+ "grad_norm": 1.2968723773956299,
5532
+ "learning_rate": 4.26500111532456e-06,
5533
+ "loss": 0.1357,
5534
+ "mean_token_accuracy": 0.7781433466821909,
5535
+ "num_tokens": 50292736.0,
5536
+ "step": 6140
5537
+ },
5538
+ {
5539
+ "epoch": 1.852044273774565,
5540
+ "grad_norm": 1.5124306678771973,
5541
+ "learning_rate": 4.253847869730091e-06,
5542
+ "loss": 0.127,
5543
+ "mean_token_accuracy": 0.7770914871245622,
5544
+ "num_tokens": 50374656.0,
5545
+ "step": 6150
5546
+ },
5547
+ {
5548
+ "epoch": 1.8550560951735562,
5549
+ "grad_norm": 1.213470220565796,
5550
+ "learning_rate": 4.2426946241356236e-06,
5551
+ "loss": 0.1265,
5552
+ "mean_token_accuracy": 0.7825953986495733,
5553
+ "num_tokens": 50456576.0,
5554
+ "step": 6160
5555
+ },
5556
+ {
5557
+ "epoch": 1.8580679165725473,
5558
+ "grad_norm": 1.0956401824951172,
5559
+ "learning_rate": 4.231541378541156e-06,
5560
+ "loss": 0.1175,
5561
+ "mean_token_accuracy": 0.7812500018626451,
5562
+ "num_tokens": 50538496.0,
5563
+ "step": 6170
5564
+ },
5565
+ {
5566
+ "epoch": 1.8610797379715383,
5567
+ "grad_norm": 1.4625799655914307,
5568
+ "learning_rate": 4.220388132946688e-06,
5569
+ "loss": 0.1221,
5570
+ "mean_token_accuracy": 0.7790484338998794,
5571
+ "num_tokens": 50620416.0,
5572
+ "step": 6180
5573
+ },
5574
+ {
5575
+ "epoch": 1.8640915593705292,
5576
+ "grad_norm": 1.459068775177002,
5577
+ "learning_rate": 4.2092348873522195e-06,
5578
+ "loss": 0.1256,
5579
+ "mean_token_accuracy": 0.7763820964843035,
5580
+ "num_tokens": 50702336.0,
5581
+ "step": 6190
5582
+ },
5583
+ {
5584
+ "epoch": 1.8671033807695203,
5585
+ "grad_norm": 1.2089377641677856,
5586
+ "learning_rate": 4.198081641757752e-06,
5587
+ "loss": 0.1144,
5588
+ "mean_token_accuracy": 0.7932118389755487,
5589
+ "num_tokens": 50784256.0,
5590
+ "step": 6200
5591
+ },
5592
+ {
5593
+ "epoch": 1.8701152021685115,
5594
+ "grad_norm": 1.3430728912353516,
5595
+ "learning_rate": 4.186928396163284e-06,
5596
+ "loss": 0.1357,
5597
+ "mean_token_accuracy": 0.7570083189755679,
5598
+ "num_tokens": 50866176.0,
5599
+ "step": 6210
5600
+ },
5601
+ {
5602
+ "epoch": 1.8731270235675024,
5603
+ "grad_norm": 1.1253952980041504,
5604
+ "learning_rate": 4.175775150568816e-06,
5605
+ "loss": 0.1388,
5606
+ "mean_token_accuracy": 0.770560173690319,
5607
+ "num_tokens": 50948096.0,
5608
+ "step": 6220
5609
+ },
5610
+ {
5611
+ "epoch": 1.8761388449664935,
5612
+ "grad_norm": 1.4846373796463013,
5613
+ "learning_rate": 4.164621904974348e-06,
5614
+ "loss": 0.1169,
5615
+ "mean_token_accuracy": 0.790007334202528,
5616
+ "num_tokens": 51030016.0,
5617
+ "step": 6230
5618
+ },
5619
+ {
5620
+ "epoch": 1.8791506663654847,
5621
+ "grad_norm": 1.1216601133346558,
5622
+ "learning_rate": 4.15346865937988e-06,
5623
+ "loss": 0.1067,
5624
+ "mean_token_accuracy": 0.7991316046565771,
5625
+ "num_tokens": 51111936.0,
5626
+ "step": 6240
5627
+ },
5628
+ {
5629
+ "epoch": 1.8821624877644756,
5630
+ "grad_norm": 1.0364607572555542,
5631
+ "learning_rate": 4.142315413785412e-06,
5632
+ "loss": 0.1319,
5633
+ "mean_token_accuracy": 0.7956824835389853,
5634
+ "num_tokens": 51193856.0,
5635
+ "step": 6250
5636
+ },
5637
+ {
5638
+ "epoch": 1.8851743091634665,
5639
+ "grad_norm": 1.2253397703170776,
5640
+ "learning_rate": 4.1311621681909444e-06,
5641
+ "loss": 0.106,
5642
+ "mean_token_accuracy": 0.7979452051222324,
5643
+ "num_tokens": 51275776.0,
5644
+ "step": 6260
5645
+ },
5646
+ {
5647
+ "epoch": 1.8881861305624577,
5648
+ "grad_norm": 1.4938507080078125,
5649
+ "learning_rate": 4.120008922596476e-06,
5650
+ "loss": 0.1251,
5651
+ "mean_token_accuracy": 0.7749999970197677,
5652
+ "num_tokens": 51357696.0,
5653
+ "step": 6270
5654
+ },
5655
+ {
5656
+ "epoch": 1.8911979519614488,
5657
+ "grad_norm": 0.9345296025276184,
5658
+ "learning_rate": 4.108855677002008e-06,
5659
+ "loss": 0.1378,
5660
+ "mean_token_accuracy": 0.7624633058905601,
5661
+ "num_tokens": 51439616.0,
5662
+ "step": 6280
5663
+ },
5664
+ {
5665
+ "epoch": 1.8942097733604397,
5666
+ "grad_norm": 1.692668080329895,
5667
+ "learning_rate": 4.0977024314075395e-06,
5668
+ "loss": 0.1278,
5669
+ "mean_token_accuracy": 0.7732876695692539,
5670
+ "num_tokens": 51521536.0,
5671
+ "step": 6290
5672
+ },
5673
+ {
5674
+ "epoch": 1.8972215947594306,
5675
+ "grad_norm": 1.1990948915481567,
5676
+ "learning_rate": 4.086549185813072e-06,
5677
+ "loss": 0.1033,
5678
+ "mean_token_accuracy": 0.7984222084283829,
5679
+ "num_tokens": 51603456.0,
5680
+ "step": 6300
5681
+ },
5682
+ {
5683
+ "epoch": 1.9002334161584218,
5684
+ "grad_norm": 1.023429274559021,
5685
+ "learning_rate": 4.075395940218604e-06,
5686
+ "loss": 0.141,
5687
+ "mean_token_accuracy": 0.7674779828637839,
5688
+ "num_tokens": 51685376.0,
5689
+ "step": 6310
5690
+ },
5691
+ {
5692
+ "epoch": 1.903245237557413,
5693
+ "grad_norm": 1.4536356925964355,
5694
+ "learning_rate": 4.064242694624136e-06,
5695
+ "loss": 0.1262,
5696
+ "mean_token_accuracy": 0.7853106647729874,
5697
+ "num_tokens": 51767296.0,
5698
+ "step": 6320
5699
+ },
5700
+ {
5701
+ "epoch": 1.9062570589564038,
5702
+ "grad_norm": 1.2798579931259155,
5703
+ "learning_rate": 4.053089449029668e-06,
5704
+ "loss": 0.136,
5705
+ "mean_token_accuracy": 0.7931629169732333,
5706
+ "num_tokens": 51849216.0,
5707
+ "step": 6330
5708
+ },
5709
+ {
5710
+ "epoch": 1.9092688803553948,
5711
+ "grad_norm": 1.8882237672805786,
5712
+ "learning_rate": 4.0419362034352e-06,
5713
+ "loss": 0.1245,
5714
+ "mean_token_accuracy": 0.7892612550407648,
5715
+ "num_tokens": 51931136.0,
5716
+ "step": 6340
5717
+ },
5718
+ {
5719
+ "epoch": 1.912280701754386,
5720
+ "grad_norm": 0.9331277012825012,
5721
+ "learning_rate": 4.030782957840732e-06,
5722
+ "loss": 0.1328,
5723
+ "mean_token_accuracy": 0.7794153604656457,
5724
+ "num_tokens": 52013056.0,
5725
+ "step": 6350
5726
+ },
5727
+ {
5728
+ "epoch": 1.915292523153377,
5729
+ "grad_norm": 1.0210968255996704,
5730
+ "learning_rate": 4.0196297122462645e-06,
5731
+ "loss": 0.1339,
5732
+ "mean_token_accuracy": 0.7672945164144039,
5733
+ "num_tokens": 52094976.0,
5734
+ "step": 6360
5735
+ },
5736
+ {
5737
+ "epoch": 1.918304344552368,
5738
+ "grad_norm": 0.8210415244102478,
5739
+ "learning_rate": 4.008476466651796e-06,
5740
+ "loss": 0.1383,
5741
+ "mean_token_accuracy": 0.7825097866356373,
5742
+ "num_tokens": 52176896.0,
5743
+ "step": 6370
5744
+ },
5745
+ {
5746
+ "epoch": 1.9213161659513591,
5747
+ "grad_norm": 1.4494845867156982,
5748
+ "learning_rate": 3.997323221057328e-06,
5749
+ "loss": 0.1251,
5750
+ "mean_token_accuracy": 0.7858732867985964,
5751
+ "num_tokens": 52258816.0,
5752
+ "step": 6380
5753
+ },
5754
+ {
5755
+ "epoch": 1.9243279873503503,
5756
+ "grad_norm": 1.3196386098861694,
5757
+ "learning_rate": 3.98616997546286e-06,
5758
+ "loss": 0.1248,
5759
+ "mean_token_accuracy": 0.7867539167404175,
5760
+ "num_tokens": 52340736.0,
5761
+ "step": 6390
5762
+ },
5763
+ {
5764
+ "epoch": 1.9273398087493412,
5765
+ "grad_norm": 1.0273571014404297,
5766
+ "learning_rate": 3.975016729868393e-06,
5767
+ "loss": 0.1305,
5768
+ "mean_token_accuracy": 0.7854329772293568,
5769
+ "num_tokens": 52422656.0,
5770
+ "step": 6400
5771
+ },
5772
+ {
5773
+ "epoch": 1.930351630148332,
5774
+ "grad_norm": 0.8488328456878662,
5775
+ "learning_rate": 3.963863484273924e-06,
5776
+ "loss": 0.1263,
5777
+ "mean_token_accuracy": 0.7624266136437654,
5778
+ "num_tokens": 52504576.0,
5779
+ "step": 6410
5780
+ },
5781
+ {
5782
+ "epoch": 1.9333634515473233,
5783
+ "grad_norm": 1.2516889572143555,
5784
+ "learning_rate": 3.9527102386794555e-06,
5785
+ "loss": 0.1152,
5786
+ "mean_token_accuracy": 0.7868517614901066,
5787
+ "num_tokens": 52586496.0,
5788
+ "step": 6420
5789
+ },
5790
+ {
5791
+ "epoch": 1.9363752729463144,
5792
+ "grad_norm": 0.8944329619407654,
5793
+ "learning_rate": 3.941556993084988e-06,
5794
+ "loss": 0.1433,
5795
+ "mean_token_accuracy": 0.7754770018160343,
5796
+ "num_tokens": 52668416.0,
5797
+ "step": 6430
5798
+ },
5799
+ {
5800
+ "epoch": 1.9393870943453053,
5801
+ "grad_norm": 1.3723251819610596,
5802
+ "learning_rate": 3.93040374749052e-06,
5803
+ "loss": 0.1303,
5804
+ "mean_token_accuracy": 0.7749755352735519,
5805
+ "num_tokens": 52750336.0,
5806
+ "step": 6440
5807
+ },
5808
+ {
5809
+ "epoch": 1.9423989157442962,
5810
+ "grad_norm": 1.1744338274002075,
5811
+ "learning_rate": 3.919250501896052e-06,
5812
+ "loss": 0.1262,
5813
+ "mean_token_accuracy": 0.7595768127590418,
5814
+ "num_tokens": 52832256.0,
5815
+ "step": 6450
5816
+ },
5817
+ {
5818
+ "epoch": 1.9454107371432874,
5819
+ "grad_norm": 1.4083287715911865,
5820
+ "learning_rate": 3.908097256301584e-06,
5821
+ "loss": 0.1206,
5822
+ "mean_token_accuracy": 0.7865826826542616,
5823
+ "num_tokens": 52914176.0,
5824
+ "step": 6460
5825
+ },
5826
+ {
5827
+ "epoch": 1.9484225585422785,
5828
+ "grad_norm": 1.0532681941986084,
5829
+ "learning_rate": 3.896944010707116e-06,
5830
+ "loss": 0.1319,
5831
+ "mean_token_accuracy": 0.7665606673806906,
5832
+ "num_tokens": 52996096.0,
5833
+ "step": 6470
5834
+ },
5835
+ {
5836
+ "epoch": 1.9514343799412694,
5837
+ "grad_norm": 1.3897960186004639,
5838
+ "learning_rate": 3.885790765112648e-06,
5839
+ "loss": 0.1262,
5840
+ "mean_token_accuracy": 0.7697407025843859,
5841
+ "num_tokens": 53078016.0,
5842
+ "step": 6480
5843
+ },
5844
+ {
5845
+ "epoch": 1.9544462013402604,
5846
+ "grad_norm": 1.6499828100204468,
5847
+ "learning_rate": 3.8746375195181804e-06,
5848
+ "loss": 0.1129,
5849
+ "mean_token_accuracy": 0.8042808264493942,
5850
+ "num_tokens": 53159936.0,
5851
+ "step": 6490
5852
+ },
5853
+ {
5854
+ "epoch": 1.9574580227392515,
5855
+ "grad_norm": 1.6160123348236084,
5856
+ "learning_rate": 3.863484273923712e-06,
5857
+ "loss": 0.1315,
5858
+ "mean_token_accuracy": 0.7979574371129274,
5859
+ "num_tokens": 53241856.0,
5860
+ "step": 6500
5861
  }
5862
  ],
5863
  "logging_steps": 10,
 
5877
  "attributes": {}
5878
  }
5879
  },
5880
+ "total_flos": 1.4070782919023002e+17,
5881
  "train_batch_size": 2,
5882
  "trial_name": null,
5883
  "trial_params": null