Wilsonwin commited on
Commit
6811e45
·
verified ·
1 Parent(s): fb62d79

Training in progress, step 8000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c46250bd52dcffd2137953f30321a4ed3d622b1bca6be15bc5f8f084e4fc31f
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87f17ddb27c78df3df9ebbdcf34ba0e534bed6b159e38ed164a359bf939c519b
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71e649d70bbaea3f3c60f2aa0818a879521dffa0038d58ed1695489f8bca966b
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d926ca500ef14515d2d612de8ff61253060acb30bb272798b045d8da75c1e72
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7105421fba4235e8fc90f3dbc4569b85e884f75c3232217a25f8f5042cf8247a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:329a377c90ca49d3bcb8c01bcb7bdf9bc769af05915d36720b3201a9c222f867
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1a5b64fb90c999b23793906d64020914f128f72d1523c4f0f8e8ea53ab2425c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:578bef6269d270c9ba7be042609ff28604e2fee3538e234c365c9aa652e62f33
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2671059300557528,
6
  "eval_steps": 500,
7
- "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5385,6 +5385,364 @@
5385
  "eval_samples_per_second": 270.066,
5386
  "eval_steps_per_second": 5.671,
5387
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5388
  }
5389
  ],
5390
  "logging_steps": 10,
@@ -5404,7 +5762,7 @@
5404
  "attributes": {}
5405
  }
5406
  },
5407
- "total_flos": 2.5084052528902963e+17,
5408
  "train_batch_size": 48,
5409
  "trial_name": null,
5410
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.3515796587261362,
6
  "eval_steps": 500,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5385
  "eval_samples_per_second": 270.066,
5386
  "eval_steps_per_second": 5.671,
5387
  "step": 7500
5388
+ },
5389
+ {
5390
+ "epoch": 1.2687954046291603,
5391
+ "grad_norm": 0.4743061661720276,
5392
+ "learning_rate": 0.00012190597732468595,
5393
+ "loss": 4.406225204467773,
5394
+ "step": 7510
5395
+ },
5396
+ {
5397
+ "epoch": 1.270484879202568,
5398
+ "grad_norm": 0.47182679176330566,
5399
+ "learning_rate": 0.00012143559904956533,
5400
+ "loss": 4.388427734375,
5401
+ "step": 7520
5402
+ },
5403
+ {
5404
+ "epoch": 1.2721743537759758,
5405
+ "grad_norm": 0.4888850152492523,
5406
+ "learning_rate": 0.00012096551205457511,
5407
+ "loss": 4.383931350708008,
5408
+ "step": 7530
5409
+ },
5410
+ {
5411
+ "epoch": 1.2738638283493833,
5412
+ "grad_norm": 0.49569153785705566,
5413
+ "learning_rate": 0.00012049572113333949,
5414
+ "loss": 4.372357177734375,
5415
+ "step": 7540
5416
+ },
5417
+ {
5418
+ "epoch": 1.275553302922791,
5419
+ "grad_norm": 0.47200775146484375,
5420
+ "learning_rate": 0.00012002623107646327,
5421
+ "loss": 4.393280410766602,
5422
+ "step": 7550
5423
+ },
5424
+ {
5425
+ "epoch": 1.2772427774961987,
5426
+ "grad_norm": 0.4849415123462677,
5427
+ "learning_rate": 0.00011955704667148361,
5428
+ "loss": 4.397673416137695,
5429
+ "step": 7560
5430
+ },
5431
+ {
5432
+ "epoch": 1.2789322520696063,
5433
+ "grad_norm": 0.5245839357376099,
5434
+ "learning_rate": 0.00011908817270282048,
5435
+ "loss": 4.423194122314453,
5436
+ "step": 7570
5437
+ },
5438
+ {
5439
+ "epoch": 1.280621726643014,
5440
+ "grad_norm": 0.47324326634407043,
5441
+ "learning_rate": 0.00011861961395172844,
5442
+ "loss": 4.417614364624024,
5443
+ "step": 7580
5444
+ },
5445
+ {
5446
+ "epoch": 1.2823112012164217,
5447
+ "grad_norm": 0.46404823660850525,
5448
+ "learning_rate": 0.00011815137519624767,
5449
+ "loss": 4.41317138671875,
5450
+ "step": 7590
5451
+ },
5452
+ {
5453
+ "epoch": 1.2840006757898292,
5454
+ "grad_norm": 0.46742913126945496,
5455
+ "learning_rate": 0.0001176834612111551,
5456
+ "loss": 4.3837333679199215,
5457
+ "step": 7600
5458
+ },
5459
+ {
5460
+ "epoch": 1.285690150363237,
5461
+ "grad_norm": 0.49143141508102417,
5462
+ "learning_rate": 0.0001172158767679161,
5463
+ "loss": 4.37153205871582,
5464
+ "step": 7610
5465
+ },
5466
+ {
5467
+ "epoch": 1.2873796249366447,
5468
+ "grad_norm": 0.5067402720451355,
5469
+ "learning_rate": 0.00011674862663463538,
5470
+ "loss": 4.377300262451172,
5471
+ "step": 7620
5472
+ },
5473
+ {
5474
+ "epoch": 1.2890690995100524,
5475
+ "grad_norm": 0.4705940783023834,
5476
+ "learning_rate": 0.00011628171557600869,
5477
+ "loss": 4.387155914306641,
5478
+ "step": 7630
5479
+ },
5480
+ {
5481
+ "epoch": 1.2907585740834602,
5482
+ "grad_norm": 0.5150594711303711,
5483
+ "learning_rate": 0.0001158151483532742,
5484
+ "loss": 4.366217041015625,
5485
+ "step": 7640
5486
+ },
5487
+ {
5488
+ "epoch": 1.2924480486568677,
5489
+ "grad_norm": 0.4702792465686798,
5490
+ "learning_rate": 0.00011534892972416382,
5491
+ "loss": 4.3991741180419925,
5492
+ "step": 7650
5493
+ },
5494
+ {
5495
+ "epoch": 1.2941375232302754,
5496
+ "grad_norm": 0.5021132230758667,
5497
+ "learning_rate": 0.00011488306444285465,
5498
+ "loss": 4.408302307128906,
5499
+ "step": 7660
5500
+ },
5501
+ {
5502
+ "epoch": 1.2958269978036832,
5503
+ "grad_norm": 0.46044430136680603,
5504
+ "learning_rate": 0.0001144175572599207,
5505
+ "loss": 4.390703582763672,
5506
+ "step": 7670
5507
+ },
5508
+ {
5509
+ "epoch": 1.2975164723770907,
5510
+ "grad_norm": 0.501556396484375,
5511
+ "learning_rate": 0.00011395241292228435,
5512
+ "loss": 4.350027847290039,
5513
+ "step": 7680
5514
+ },
5515
+ {
5516
+ "epoch": 1.2992059469504984,
5517
+ "grad_norm": 0.4919809103012085,
5518
+ "learning_rate": 0.00011348763617316781,
5519
+ "loss": 4.407309341430664,
5520
+ "step": 7690
5521
+ },
5522
+ {
5523
+ "epoch": 1.3008954215239061,
5524
+ "grad_norm": 0.4790572226047516,
5525
+ "learning_rate": 0.00011302323175204497,
5526
+ "loss": 4.382944107055664,
5527
+ "step": 7700
5528
+ },
5529
+ {
5530
+ "epoch": 1.3025848960973136,
5531
+ "grad_norm": 0.4744700491428375,
5532
+ "learning_rate": 0.00011255920439459302,
5533
+ "loss": 4.376468658447266,
5534
+ "step": 7710
5535
+ },
5536
+ {
5537
+ "epoch": 1.3042743706707214,
5538
+ "grad_norm": 0.4606315791606903,
5539
+ "learning_rate": 0.00011209555883264406,
5540
+ "loss": 4.396706771850586,
5541
+ "step": 7720
5542
+ },
5543
+ {
5544
+ "epoch": 1.3059638452441291,
5545
+ "grad_norm": 0.46377789974212646,
5546
+ "learning_rate": 0.00011163229979413685,
5547
+ "loss": 4.378550720214844,
5548
+ "step": 7730
5549
+ },
5550
+ {
5551
+ "epoch": 1.3076533198175366,
5552
+ "grad_norm": 0.4852358102798462,
5553
+ "learning_rate": 0.00011116943200306871,
5554
+ "loss": 4.369587326049805,
5555
+ "step": 7740
5556
+ },
5557
+ {
5558
+ "epoch": 1.3093427943909444,
5559
+ "grad_norm": 0.47700658440589905,
5560
+ "learning_rate": 0.00011070696017944728,
5561
+ "loss": 4.420570755004883,
5562
+ "step": 7750
5563
+ },
5564
+ {
5565
+ "epoch": 1.311032268964352,
5566
+ "grad_norm": 0.49495404958724976,
5567
+ "learning_rate": 0.00011024488903924235,
5568
+ "loss": 4.396437072753907,
5569
+ "step": 7760
5570
+ },
5571
+ {
5572
+ "epoch": 1.3127217435377598,
5573
+ "grad_norm": 0.4793596863746643,
5574
+ "learning_rate": 0.00010978322329433796,
5575
+ "loss": 4.368120574951172,
5576
+ "step": 7770
5577
+ },
5578
+ {
5579
+ "epoch": 1.3144112181111673,
5580
+ "grad_norm": 0.47642168402671814,
5581
+ "learning_rate": 0.00010932196765248396,
5582
+ "loss": 4.349945449829102,
5583
+ "step": 7780
5584
+ },
5585
+ {
5586
+ "epoch": 1.316100692684575,
5587
+ "grad_norm": 0.470968633890152,
5588
+ "learning_rate": 0.0001088611268172485,
5589
+ "loss": 4.380016326904297,
5590
+ "step": 7790
5591
+ },
5592
+ {
5593
+ "epoch": 1.3177901672579828,
5594
+ "grad_norm": 0.4649656116962433,
5595
+ "learning_rate": 0.00010840070548796967,
5596
+ "loss": 4.385102844238281,
5597
+ "step": 7800
5598
+ },
5599
+ {
5600
+ "epoch": 1.3194796418313905,
5601
+ "grad_norm": 0.4923093318939209,
5602
+ "learning_rate": 0.00010794070835970782,
5603
+ "loss": 4.3910682678222654,
5604
+ "step": 7810
5605
+ },
5606
+ {
5607
+ "epoch": 1.321169116404798,
5608
+ "grad_norm": 0.4957950711250305,
5609
+ "learning_rate": 0.00010748114012319747,
5610
+ "loss": 4.3693492889404295,
5611
+ "step": 7820
5612
+ },
5613
+ {
5614
+ "epoch": 1.3228585909782058,
5615
+ "grad_norm": 0.4974667727947235,
5616
+ "learning_rate": 0.0001070220054647997,
5617
+ "loss": 4.351894760131836,
5618
+ "step": 7830
5619
+ },
5620
+ {
5621
+ "epoch": 1.3245480655516135,
5622
+ "grad_norm": 0.47996777296066284,
5623
+ "learning_rate": 0.00010656330906645422,
5624
+ "loss": 4.377753067016601,
5625
+ "step": 7840
5626
+ },
5627
+ {
5628
+ "epoch": 1.326237540125021,
5629
+ "grad_norm": 0.496206134557724,
5630
+ "learning_rate": 0.00010610505560563163,
5631
+ "loss": 4.364116668701172,
5632
+ "step": 7850
5633
+ },
5634
+ {
5635
+ "epoch": 1.3279270146984288,
5636
+ "grad_norm": 0.4971146881580353,
5637
+ "learning_rate": 0.00010564724975528584,
5638
+ "loss": 4.382867813110352,
5639
+ "step": 7860
5640
+ },
5641
+ {
5642
+ "epoch": 1.3296164892718365,
5643
+ "grad_norm": 0.500912606716156,
5644
+ "learning_rate": 0.00010518989618380632,
5645
+ "loss": 4.386323165893555,
5646
+ "step": 7870
5647
+ },
5648
+ {
5649
+ "epoch": 1.331305963845244,
5650
+ "grad_norm": 0.48914024233818054,
5651
+ "learning_rate": 0.00010473299955497044,
5652
+ "loss": 4.394392013549805,
5653
+ "step": 7880
5654
+ },
5655
+ {
5656
+ "epoch": 1.3329954384186518,
5657
+ "grad_norm": 0.5237129926681519,
5658
+ "learning_rate": 0.000104276564527896,
5659
+ "loss": 4.340487289428711,
5660
+ "step": 7890
5661
+ },
5662
+ {
5663
+ "epoch": 1.3346849129920595,
5664
+ "grad_norm": 0.5235133171081543,
5665
+ "learning_rate": 0.0001038205957569938,
5666
+ "loss": 4.360867309570312,
5667
+ "step": 7900
5668
+ },
5669
+ {
5670
+ "epoch": 1.336374387565467,
5671
+ "grad_norm": 0.506583571434021,
5672
+ "learning_rate": 0.00010336509789191994,
5673
+ "loss": 4.368827056884766,
5674
+ "step": 7910
5675
+ },
5676
+ {
5677
+ "epoch": 1.3380638621388747,
5678
+ "grad_norm": 0.4910382330417633,
5679
+ "learning_rate": 0.00010291007557752861,
5680
+ "loss": 4.372630310058594,
5681
+ "step": 7920
5682
+ },
5683
+ {
5684
+ "epoch": 1.3397533367122825,
5685
+ "grad_norm": 0.4972977340221405,
5686
+ "learning_rate": 0.00010245553345382467,
5687
+ "loss": 4.360449981689453,
5688
+ "step": 7930
5689
+ },
5690
+ {
5691
+ "epoch": 1.3414428112856902,
5692
+ "grad_norm": 0.4779084324836731,
5693
+ "learning_rate": 0.00010200147615591643,
5694
+ "loss": 4.333792877197266,
5695
+ "step": 7940
5696
+ },
5697
+ {
5698
+ "epoch": 1.343132285859098,
5699
+ "grad_norm": 0.48390597105026245,
5700
+ "learning_rate": 0.00010154790831396805,
5701
+ "loss": 4.4007080078125,
5702
+ "step": 7950
5703
+ },
5704
+ {
5705
+ "epoch": 1.3448217604325055,
5706
+ "grad_norm": 0.47715461254119873,
5707
+ "learning_rate": 0.00010109483455315269,
5708
+ "loss": 4.379761123657227,
5709
+ "step": 7960
5710
+ },
5711
+ {
5712
+ "epoch": 1.3465112350059132,
5713
+ "grad_norm": 0.4725069999694824,
5714
+ "learning_rate": 0.00010064225949360525,
5715
+ "loss": 4.364334487915039,
5716
+ "step": 7970
5717
+ },
5718
+ {
5719
+ "epoch": 1.348200709579321,
5720
+ "grad_norm": 0.4733550250530243,
5721
+ "learning_rate": 0.00010019018775037509,
5722
+ "loss": 4.398410034179688,
5723
+ "step": 7980
5724
+ },
5725
+ {
5726
+ "epoch": 1.3498901841527284,
5727
+ "grad_norm": 0.49187690019607544,
5728
+ "learning_rate": 9.973862393337925e-05,
5729
+ "loss": 4.387223815917968,
5730
+ "step": 7990
5731
+ },
5732
+ {
5733
+ "epoch": 1.3515796587261362,
5734
+ "grad_norm": 0.4789150357246399,
5735
+ "learning_rate": 9.928757264735506e-05,
5736
+ "loss": 4.403899383544922,
5737
+ "step": 8000
5738
+ },
5739
+ {
5740
+ "epoch": 1.3515796587261362,
5741
+ "eval_loss": 4.3366875648498535,
5742
+ "eval_runtime": 3.6684,
5743
+ "eval_samples_per_second": 272.602,
5744
+ "eval_steps_per_second": 5.725,
5745
+ "step": 8000
5746
  }
5747
  ],
5748
  "logging_steps": 10,
 
5762
  "attributes": {}
5763
  }
5764
  },
5765
+ "total_flos": 2.6756336633197363e+17,
5766
  "train_batch_size": 48,
5767
  "trial_name": null,
5768
  "trial_params": null