Wilsonwin commited on
Commit
a0e13cc
·
verified ·
1 Parent(s): 7c1ef73

Training in progress, step 8000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:152c34cc1ef8eea86d84f7b0351d9f983b40e24507e8054571349aacd4aba343
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a5b1264363800d835097d941071eaf668b648591456cb18035122aa338a30b9
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8012a529a81b3f92efa4c79d19d5460d546f7ff16907210ecdb6456891de9745
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfe3532ddb10671229c77a55f85cca973229a308c2faa98d60ea12da855a7153
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b50695bbf99bef39c4d13662a35b1f845a2b2c6b19490939ad9cc39127e32ab1
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ef5699c401ab357753367766bad7490c0997d4f3cbc8e6689c7f21d470f2f2
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1a5b64fb90c999b23793906d64020914f128f72d1523c4f0f8e8ea53ab2425c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:578bef6269d270c9ba7be042609ff28604e2fee3538e234c365c9aa652e62f33
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2671059300557528,
6
  "eval_steps": 500,
7
- "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5385,6 +5385,364 @@
5385
  "eval_samples_per_second": 266.869,
5386
  "eval_steps_per_second": 5.604,
5387
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5388
  }
5389
  ],
5390
  "logging_steps": 10,
@@ -5404,7 +5762,7 @@
5404
  "attributes": {}
5405
  }
5406
  },
5407
- "total_flos": 2.5084052528902963e+17,
5408
  "train_batch_size": 48,
5409
  "trial_name": null,
5410
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.3515796587261362,
6
  "eval_steps": 500,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5385
  "eval_samples_per_second": 266.869,
5386
  "eval_steps_per_second": 5.604,
5387
  "step": 7500
5388
+ },
5389
+ {
5390
+ "epoch": 1.2687954046291603,
5391
+ "grad_norm": 0.48278650641441345,
5392
+ "learning_rate": 0.00012190597732468595,
5393
+ "loss": 4.407323837280273,
5394
+ "step": 7510
5395
+ },
5396
+ {
5397
+ "epoch": 1.270484879202568,
5398
+ "grad_norm": 0.48528528213500977,
5399
+ "learning_rate": 0.00012143559904956533,
5400
+ "loss": 4.389751815795899,
5401
+ "step": 7520
5402
+ },
5403
+ {
5404
+ "epoch": 1.2721743537759758,
5405
+ "grad_norm": 0.4944697320461273,
5406
+ "learning_rate": 0.00012096551205457511,
5407
+ "loss": 4.385165786743164,
5408
+ "step": 7530
5409
+ },
5410
+ {
5411
+ "epoch": 1.2738638283493833,
5412
+ "grad_norm": 0.5002730488777161,
5413
+ "learning_rate": 0.00012049572113333949,
5414
+ "loss": 4.374062347412109,
5415
+ "step": 7540
5416
+ },
5417
+ {
5418
+ "epoch": 1.275553302922791,
5419
+ "grad_norm": 0.46715047955513,
5420
+ "learning_rate": 0.00012002623107646327,
5421
+ "loss": 4.394298553466797,
5422
+ "step": 7550
5423
+ },
5424
+ {
5425
+ "epoch": 1.2772427774961987,
5426
+ "grad_norm": 0.4903099834918976,
5427
+ "learning_rate": 0.00011955704667148361,
5428
+ "loss": 4.400055694580078,
5429
+ "step": 7560
5430
+ },
5431
+ {
5432
+ "epoch": 1.2789322520696063,
5433
+ "grad_norm": 0.5333164930343628,
5434
+ "learning_rate": 0.00011908817270282048,
5435
+ "loss": 4.424139404296875,
5436
+ "step": 7570
5437
+ },
5438
+ {
5439
+ "epoch": 1.280621726643014,
5440
+ "grad_norm": 0.47946473956108093,
5441
+ "learning_rate": 0.00011861961395172844,
5442
+ "loss": 4.419405746459961,
5443
+ "step": 7580
5444
+ },
5445
+ {
5446
+ "epoch": 1.2823112012164217,
5447
+ "grad_norm": 0.4778226315975189,
5448
+ "learning_rate": 0.00011815137519624767,
5449
+ "loss": 4.414478302001953,
5450
+ "step": 7590
5451
+ },
5452
+ {
5453
+ "epoch": 1.2840006757898292,
5454
+ "grad_norm": 0.4878886342048645,
5455
+ "learning_rate": 0.0001176834612111551,
5456
+ "loss": 4.384803009033203,
5457
+ "step": 7600
5458
+ },
5459
+ {
5460
+ "epoch": 1.285690150363237,
5461
+ "grad_norm": 0.4819967746734619,
5462
+ "learning_rate": 0.0001172158767679161,
5463
+ "loss": 4.3730110168457035,
5464
+ "step": 7610
5465
+ },
5466
+ {
5467
+ "epoch": 1.2873796249366447,
5468
+ "grad_norm": 0.4928823411464691,
5469
+ "learning_rate": 0.00011674862663463538,
5470
+ "loss": 4.3778236389160154,
5471
+ "step": 7620
5472
+ },
5473
+ {
5474
+ "epoch": 1.2890690995100524,
5475
+ "grad_norm": 0.4724312722682953,
5476
+ "learning_rate": 0.00011628171557600869,
5477
+ "loss": 4.387655639648438,
5478
+ "step": 7630
5479
+ },
5480
+ {
5481
+ "epoch": 1.2907585740834602,
5482
+ "grad_norm": 0.5023632049560547,
5483
+ "learning_rate": 0.0001158151483532742,
5484
+ "loss": 4.366682052612305,
5485
+ "step": 7640
5486
+ },
5487
+ {
5488
+ "epoch": 1.2924480486568677,
5489
+ "grad_norm": 0.47042906284332275,
5490
+ "learning_rate": 0.00011534892972416382,
5491
+ "loss": 4.3992149353027346,
5492
+ "step": 7650
5493
+ },
5494
+ {
5495
+ "epoch": 1.2941375232302754,
5496
+ "grad_norm": 0.5019961595535278,
5497
+ "learning_rate": 0.00011488306444285465,
5498
+ "loss": 4.408546829223633,
5499
+ "step": 7660
5500
+ },
5501
+ {
5502
+ "epoch": 1.2958269978036832,
5503
+ "grad_norm": 0.4686186909675598,
5504
+ "learning_rate": 0.0001144175572599207,
5505
+ "loss": 4.392362976074219,
5506
+ "step": 7670
5507
+ },
5508
+ {
5509
+ "epoch": 1.2975164723770907,
5510
+ "grad_norm": 0.5097217559814453,
5511
+ "learning_rate": 0.00011395241292228435,
5512
+ "loss": 4.350882339477539,
5513
+ "step": 7680
5514
+ },
5515
+ {
5516
+ "epoch": 1.2992059469504984,
5517
+ "grad_norm": 0.5009888410568237,
5518
+ "learning_rate": 0.00011348763617316781,
5519
+ "loss": 4.407807159423828,
5520
+ "step": 7690
5521
+ },
5522
+ {
5523
+ "epoch": 1.3008954215239061,
5524
+ "grad_norm": 0.4623536765575409,
5525
+ "learning_rate": 0.00011302323175204497,
5526
+ "loss": 4.383738708496094,
5527
+ "step": 7700
5528
+ },
5529
+ {
5530
+ "epoch": 1.3025848960973136,
5531
+ "grad_norm": 0.49098923802375793,
5532
+ "learning_rate": 0.00011255920439459302,
5533
+ "loss": 4.3777015686035154,
5534
+ "step": 7710
5535
+ },
5536
+ {
5537
+ "epoch": 1.3042743706707214,
5538
+ "grad_norm": 0.47158893942832947,
5539
+ "learning_rate": 0.00011209555883264406,
5540
+ "loss": 4.398603439331055,
5541
+ "step": 7720
5542
+ },
5543
+ {
5544
+ "epoch": 1.3059638452441291,
5545
+ "grad_norm": 0.4723564684391022,
5546
+ "learning_rate": 0.00011163229979413685,
5547
+ "loss": 4.379953384399414,
5548
+ "step": 7730
5549
+ },
5550
+ {
5551
+ "epoch": 1.3076533198175366,
5552
+ "grad_norm": 0.478575199842453,
5553
+ "learning_rate": 0.00011116943200306871,
5554
+ "loss": 4.369690322875977,
5555
+ "step": 7740
5556
+ },
5557
+ {
5558
+ "epoch": 1.3093427943909444,
5559
+ "grad_norm": 0.4801791310310364,
5560
+ "learning_rate": 0.00011070696017944728,
5561
+ "loss": 4.421099853515625,
5562
+ "step": 7750
5563
+ },
5564
+ {
5565
+ "epoch": 1.311032268964352,
5566
+ "grad_norm": 0.5147274732589722,
5567
+ "learning_rate": 0.00011024488903924235,
5568
+ "loss": 4.396934127807617,
5569
+ "step": 7760
5570
+ },
5571
+ {
5572
+ "epoch": 1.3127217435377598,
5573
+ "grad_norm": 0.4905327558517456,
5574
+ "learning_rate": 0.00010978322329433796,
5575
+ "loss": 4.368836975097656,
5576
+ "step": 7770
5577
+ },
5578
+ {
5579
+ "epoch": 1.3144112181111673,
5580
+ "grad_norm": 0.47583821415901184,
5581
+ "learning_rate": 0.00010932196765248396,
5582
+ "loss": 4.351024627685547,
5583
+ "step": 7780
5584
+ },
5585
+ {
5586
+ "epoch": 1.316100692684575,
5587
+ "grad_norm": 0.4749636650085449,
5588
+ "learning_rate": 0.0001088611268172485,
5589
+ "loss": 4.381603622436524,
5590
+ "step": 7790
5591
+ },
5592
+ {
5593
+ "epoch": 1.3177901672579828,
5594
+ "grad_norm": 0.47106119990348816,
5595
+ "learning_rate": 0.00010840070548796967,
5596
+ "loss": 4.386127471923828,
5597
+ "step": 7800
5598
+ },
5599
+ {
5600
+ "epoch": 1.3194796418313905,
5601
+ "grad_norm": 0.49278977513313293,
5602
+ "learning_rate": 0.00010794070835970782,
5603
+ "loss": 4.393439865112304,
5604
+ "step": 7810
5605
+ },
5606
+ {
5607
+ "epoch": 1.321169116404798,
5608
+ "grad_norm": 0.49596497416496277,
5609
+ "learning_rate": 0.00010748114012319747,
5610
+ "loss": 4.369705581665039,
5611
+ "step": 7820
5612
+ },
5613
+ {
5614
+ "epoch": 1.3228585909782058,
5615
+ "grad_norm": 0.48959940671920776,
5616
+ "learning_rate": 0.0001070220054647997,
5617
+ "loss": 4.353339767456054,
5618
+ "step": 7830
5619
+ },
5620
+ {
5621
+ "epoch": 1.3245480655516135,
5622
+ "grad_norm": 0.4975447952747345,
5623
+ "learning_rate": 0.00010656330906645422,
5624
+ "loss": 4.378279113769532,
5625
+ "step": 7840
5626
+ },
5627
+ {
5628
+ "epoch": 1.326237540125021,
5629
+ "grad_norm": 0.48734408617019653,
5630
+ "learning_rate": 0.00010610505560563163,
5631
+ "loss": 4.365981674194336,
5632
+ "step": 7850
5633
+ },
5634
+ {
5635
+ "epoch": 1.3279270146984288,
5636
+ "grad_norm": 0.4985700845718384,
5637
+ "learning_rate": 0.00010564724975528584,
5638
+ "loss": 4.384627151489258,
5639
+ "step": 7860
5640
+ },
5641
+ {
5642
+ "epoch": 1.3296164892718365,
5643
+ "grad_norm": 0.48617759346961975,
5644
+ "learning_rate": 0.00010518989618380632,
5645
+ "loss": 4.387208938598633,
5646
+ "step": 7870
5647
+ },
5648
+ {
5649
+ "epoch": 1.331305963845244,
5650
+ "grad_norm": 0.479184091091156,
5651
+ "learning_rate": 0.00010473299955497044,
5652
+ "loss": 4.39497184753418,
5653
+ "step": 7880
5654
+ },
5655
+ {
5656
+ "epoch": 1.3329954384186518,
5657
+ "grad_norm": 0.5024631023406982,
5658
+ "learning_rate": 0.000104276564527896,
5659
+ "loss": 4.341180801391602,
5660
+ "step": 7890
5661
+ },
5662
+ {
5663
+ "epoch": 1.3346849129920595,
5664
+ "grad_norm": 0.5147078633308411,
5665
+ "learning_rate": 0.0001038205957569938,
5666
+ "loss": 4.36151008605957,
5667
+ "step": 7900
5668
+ },
5669
+ {
5670
+ "epoch": 1.336374387565467,
5671
+ "grad_norm": 0.4864480197429657,
5672
+ "learning_rate": 0.00010336509789191994,
5673
+ "loss": 4.3700817108154295,
5674
+ "step": 7910
5675
+ },
5676
+ {
5677
+ "epoch": 1.3380638621388747,
5678
+ "grad_norm": 0.48009052872657776,
5679
+ "learning_rate": 0.00010291007557752861,
5680
+ "loss": 4.372967910766602,
5681
+ "step": 7920
5682
+ },
5683
+ {
5684
+ "epoch": 1.3397533367122825,
5685
+ "grad_norm": 0.4770645499229431,
5686
+ "learning_rate": 0.00010245553345382467,
5687
+ "loss": 4.361065673828125,
5688
+ "step": 7930
5689
+ },
5690
+ {
5691
+ "epoch": 1.3414428112856902,
5692
+ "grad_norm": 0.47222378849983215,
5693
+ "learning_rate": 0.00010200147615591643,
5694
+ "loss": 4.3356986999511715,
5695
+ "step": 7940
5696
+ },
5697
+ {
5698
+ "epoch": 1.343132285859098,
5699
+ "grad_norm": 0.513080894947052,
5700
+ "learning_rate": 0.00010154790831396805,
5701
+ "loss": 4.402030181884766,
5702
+ "step": 7950
5703
+ },
5704
+ {
5705
+ "epoch": 1.3448217604325055,
5706
+ "grad_norm": 0.48416030406951904,
5707
+ "learning_rate": 0.00010109483455315269,
5708
+ "loss": 4.381985855102539,
5709
+ "step": 7960
5710
+ },
5711
+ {
5712
+ "epoch": 1.3465112350059132,
5713
+ "grad_norm": 0.46342408657073975,
5714
+ "learning_rate": 0.00010064225949360525,
5715
+ "loss": 4.364437103271484,
5716
+ "step": 7970
5717
+ },
5718
+ {
5719
+ "epoch": 1.348200709579321,
5720
+ "grad_norm": 0.4690420925617218,
5721
+ "learning_rate": 0.00010019018775037509,
5722
+ "loss": 4.399689102172852,
5723
+ "step": 7980
5724
+ },
5725
+ {
5726
+ "epoch": 1.3498901841527284,
5727
+ "grad_norm": 0.47876372933387756,
5728
+ "learning_rate": 9.973862393337925e-05,
5729
+ "loss": 4.388835144042969,
5730
+ "step": 7990
5731
+ },
5732
+ {
5733
+ "epoch": 1.3515796587261362,
5734
+ "grad_norm": 0.48350629210472107,
5735
+ "learning_rate": 9.928757264735506e-05,
5736
+ "loss": 4.405188751220703,
5737
+ "step": 8000
5738
+ },
5739
+ {
5740
+ "epoch": 1.3515796587261362,
5741
+ "eval_loss": 4.357097148895264,
5742
+ "eval_runtime": 3.734,
5743
+ "eval_samples_per_second": 267.812,
5744
+ "eval_steps_per_second": 5.624,
5745
+ "step": 8000
5746
  }
5747
  ],
5748
  "logging_steps": 10,
 
5762
  "attributes": {}
5763
  }
5764
  },
5765
+ "total_flos": 2.6756336633197363e+17,
5766
  "train_batch_size": 48,
5767
  "trial_name": null,
5768
  "trial_params": null