shulijia commited on
Commit
fb5d877
·
verified ·
1 Parent(s): b75295d

Training in progress, step 837, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4f5d6d88a0e413748b8b3c5d48ffbea60eac06f86956a8e64d18bc171212e24
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9c582d51bb0658b0c32b078d8c78b44fb00469e6045c8930ede2512d8c1d1fe
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e60235e199813f24de8d00342d55ded11c4aa787cb4f3f7c19238310ec942c02
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de1325120a711c47e73344865a95485440a588b6fcfd5476fd0dabc048e29658
3
  size 4768663315
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:480389ce7f683504c393112df2c8045b3bbba2e7bfbed923d3dbd1ed09e2f087
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30a6f8e8a66f8439d9c1b53fe36cb4eed9513b935f5bc249a5747a3e5927b2a1
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72f1a554a7d63eaa7f3dc5d08c7c6817aa6f6251549a335627b56fecedbc6007
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0170101ba8630236270ece77dab1f5651cfe5eb2cebf976f243d3e6b8f8b254
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.7924697445091886,
6
  "eval_steps": 100,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -498,6 +498,327 @@
498
  "eval_samples_per_second": 30.215,
499
  "eval_steps_per_second": 1.888,
500
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  }
502
  ],
503
  "logging_steps": 10,
@@ -512,12 +833,12 @@
512
  "should_evaluate": false,
513
  "should_log": false,
514
  "should_save": true,
515
- "should_training_stop": false
516
  },
517
  "attributes": {}
518
  }
519
  },
520
- "total_flos": 1.0820868850581504e+16,
521
  "train_batch_size": 2,
522
  "trial_name": null,
523
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 100,
7
+ "global_step": 837,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
498
  "eval_samples_per_second": 30.215,
499
  "eval_steps_per_second": 1.888,
500
  "step": 500
501
+ },
502
+ {
503
+ "epoch": 1.8283281039892425,
504
+ "grad_norm": 0.9853012561798096,
505
+ "learning_rate": 4.355909694555113e-06,
506
+ "loss": 0.0821,
507
+ "mean_token_accuracy": 0.927525683119893,
508
+ "num_tokens": 4176384.0,
509
+ "step": 510
510
+ },
511
+ {
512
+ "epoch": 1.8641864634692964,
513
+ "grad_norm": 1.3522577285766602,
514
+ "learning_rate": 4.223107569721116e-06,
515
+ "loss": 0.091,
516
+ "mean_token_accuracy": 0.9234466701745987,
517
+ "num_tokens": 4258304.0,
518
+ "step": 520
519
+ },
520
+ {
521
+ "epoch": 1.90004482294935,
522
+ "grad_norm": 1.1387897729873657,
523
+ "learning_rate": 4.090305444887119e-06,
524
+ "loss": 0.0841,
525
+ "mean_token_accuracy": 0.9275195710361004,
526
+ "num_tokens": 4340224.0,
527
+ "step": 530
528
+ },
529
+ {
530
+ "epoch": 1.9359031824294037,
531
+ "grad_norm": 1.279567837715149,
532
+ "learning_rate": 3.957503320053121e-06,
533
+ "loss": 0.0786,
534
+ "mean_token_accuracy": 0.929244127124548,
535
+ "num_tokens": 4422144.0,
536
+ "step": 540
537
+ },
538
+ {
539
+ "epoch": 1.9717615419094576,
540
+ "grad_norm": 1.068411111831665,
541
+ "learning_rate": 3.824701195219123e-06,
542
+ "loss": 0.079,
543
+ "mean_token_accuracy": 0.9259050883352756,
544
+ "num_tokens": 4504064.0,
545
+ "step": 550
546
+ },
547
+ {
548
+ "epoch": 2.007171671896011,
549
+ "grad_norm": 1.2037402391433716,
550
+ "learning_rate": 3.6918990703851264e-06,
551
+ "loss": 0.0838,
552
+ "mean_token_accuracy": 0.9283732553071613,
553
+ "num_tokens": 4584448.0,
554
+ "step": 560
555
+ },
556
+ {
557
+ "epoch": 2.0430300313760648,
558
+ "grad_norm": 1.110350489616394,
559
+ "learning_rate": 3.5590969455511294e-06,
560
+ "loss": 0.0789,
561
+ "mean_token_accuracy": 0.928657042235136,
562
+ "num_tokens": 4666368.0,
563
+ "step": 570
564
+ },
565
+ {
566
+ "epoch": 2.078888390856118,
567
+ "grad_norm": 0.8649426102638245,
568
+ "learning_rate": 3.4262948207171315e-06,
569
+ "loss": 0.0741,
570
+ "mean_token_accuracy": 0.9291340477764607,
571
+ "num_tokens": 4748288.0,
572
+ "step": 580
573
+ },
574
+ {
575
+ "epoch": 2.114746750336172,
576
+ "grad_norm": 1.0113835334777832,
577
+ "learning_rate": 3.293492695883134e-06,
578
+ "loss": 0.0765,
579
+ "mean_token_accuracy": 0.9294398218393326,
580
+ "num_tokens": 4830208.0,
581
+ "step": 590
582
+ },
583
+ {
584
+ "epoch": 2.150605109816226,
585
+ "grad_norm": 1.8120030164718628,
586
+ "learning_rate": 3.160690571049137e-06,
587
+ "loss": 0.0675,
588
+ "step": 600
589
+ },
590
+ {
591
+ "epoch": 2.150605109816226,
592
+ "eval_loss": 0.18555927276611328,
593
+ "eval_mean_token_accuracy": 0.9658441774306759,
594
+ "eval_num_tokens": 4912128.0,
595
+ "eval_runtime": 16.4214,
596
+ "eval_samples_per_second": 30.205,
597
+ "eval_steps_per_second": 1.888,
598
+ "step": 600
599
+ },
600
+ {
601
+ "epoch": 2.18646346929628,
602
+ "grad_norm": 1.118471384048462,
603
+ "learning_rate": 3.0278884462151397e-06,
604
+ "loss": 0.0819,
605
+ "mean_token_accuracy": 0.9301736779510975,
606
+ "num_tokens": 4994048.0,
607
+ "step": 610
608
+ },
609
+ {
610
+ "epoch": 2.2223218287763333,
611
+ "grad_norm": 1.2974787950515747,
612
+ "learning_rate": 2.8950863213811427e-06,
613
+ "loss": 0.0781,
614
+ "mean_token_accuracy": 0.9276907995343209,
615
+ "num_tokens": 5075968.0,
616
+ "step": 620
617
+ },
618
+ {
619
+ "epoch": 2.258180188256387,
620
+ "grad_norm": 1.191937804222107,
621
+ "learning_rate": 2.762284196547145e-06,
622
+ "loss": 0.0738,
623
+ "mean_token_accuracy": 0.9292808197438717,
624
+ "num_tokens": 5157888.0,
625
+ "step": 630
626
+ },
627
+ {
628
+ "epoch": 2.294038547736441,
629
+ "grad_norm": 1.2355011701583862,
630
+ "learning_rate": 2.6294820717131474e-06,
631
+ "loss": 0.0776,
632
+ "mean_token_accuracy": 0.9282044991850853,
633
+ "num_tokens": 5239808.0,
634
+ "step": 640
635
+ },
636
+ {
637
+ "epoch": 2.329896907216495,
638
+ "grad_norm": 1.0226362943649292,
639
+ "learning_rate": 2.4966799468791504e-06,
640
+ "loss": 0.0742,
641
+ "mean_token_accuracy": 0.9314701519906521,
642
+ "num_tokens": 5321728.0,
643
+ "step": 650
644
+ },
645
+ {
646
+ "epoch": 2.3657552666965485,
647
+ "grad_norm": 0.7803632020950317,
648
+ "learning_rate": 2.363877822045153e-06,
649
+ "loss": 0.0756,
650
+ "mean_token_accuracy": 0.9292930528521538,
651
+ "num_tokens": 5403648.0,
652
+ "step": 660
653
+ },
654
+ {
655
+ "epoch": 2.4016136261766023,
656
+ "grad_norm": 1.070591688156128,
657
+ "learning_rate": 2.2310756972111555e-06,
658
+ "loss": 0.071,
659
+ "mean_token_accuracy": 0.9280210383236408,
660
+ "num_tokens": 5485568.0,
661
+ "step": 670
662
+ },
663
+ {
664
+ "epoch": 2.4374719856566562,
665
+ "grad_norm": 1.4033000469207764,
666
+ "learning_rate": 2.098273572377158e-06,
667
+ "loss": 0.0718,
668
+ "mean_token_accuracy": 0.9286203525960446,
669
+ "num_tokens": 5567488.0,
670
+ "step": 680
671
+ },
672
+ {
673
+ "epoch": 2.47333034513671,
674
+ "grad_norm": 1.2492733001708984,
675
+ "learning_rate": 1.9654714475431607e-06,
676
+ "loss": 0.0733,
677
+ "mean_token_accuracy": 0.9287304326891899,
678
+ "num_tokens": 5649408.0,
679
+ "step": 690
680
+ },
681
+ {
682
+ "epoch": 2.509188704616764,
683
+ "grad_norm": 0.885611891746521,
684
+ "learning_rate": 1.8326693227091634e-06,
685
+ "loss": 0.0769,
686
+ "step": 700
687
+ },
688
+ {
689
+ "epoch": 2.509188704616764,
690
+ "eval_loss": 0.1842765361070633,
691
+ "eval_mean_token_accuracy": 0.9662663398250457,
692
+ "eval_num_tokens": 5731328.0,
693
+ "eval_runtime": 16.3741,
694
+ "eval_samples_per_second": 30.292,
695
+ "eval_steps_per_second": 1.893,
696
+ "step": 700
697
+ },
698
+ {
699
+ "epoch": 2.5450470640968175,
700
+ "grad_norm": 1.087609887123108,
701
+ "learning_rate": 1.699867197875166e-06,
702
+ "loss": 0.0702,
703
+ "mean_token_accuracy": 0.9273666802793741,
704
+ "num_tokens": 5813248.0,
705
+ "step": 710
706
+ },
707
+ {
708
+ "epoch": 2.5809054235768714,
709
+ "grad_norm": 0.9095642566680908,
710
+ "learning_rate": 1.5670650730411688e-06,
711
+ "loss": 0.0732,
712
+ "mean_token_accuracy": 0.9297089017927647,
713
+ "num_tokens": 5895168.0,
714
+ "step": 720
715
+ },
716
+ {
717
+ "epoch": 2.6167637830569253,
718
+ "grad_norm": 1.0973559617996216,
719
+ "learning_rate": 1.4342629482071716e-06,
720
+ "loss": 0.0719,
721
+ "mean_token_accuracy": 0.9309075310826301,
722
+ "num_tokens": 5977088.0,
723
+ "step": 730
724
+ },
725
+ {
726
+ "epoch": 2.6526221425369787,
727
+ "grad_norm": 1.3313140869140625,
728
+ "learning_rate": 1.301460823373174e-06,
729
+ "loss": 0.0742,
730
+ "mean_token_accuracy": 0.9301125220954418,
731
+ "num_tokens": 6059008.0,
732
+ "step": 740
733
+ },
734
+ {
735
+ "epoch": 2.6884805020170326,
736
+ "grad_norm": 1.0324218273162842,
737
+ "learning_rate": 1.1686586985391767e-06,
738
+ "loss": 0.0693,
739
+ "mean_token_accuracy": 0.9307362966239452,
740
+ "num_tokens": 6140928.0,
741
+ "step": 750
742
+ },
743
+ {
744
+ "epoch": 2.7243388614970865,
745
+ "grad_norm": 0.874535858631134,
746
+ "learning_rate": 1.0358565737051795e-06,
747
+ "loss": 0.0742,
748
+ "mean_token_accuracy": 0.9291340492665767,
749
+ "num_tokens": 6222848.0,
750
+ "step": 760
751
+ },
752
+ {
753
+ "epoch": 2.7601972209771404,
754
+ "grad_norm": 0.8414442539215088,
755
+ "learning_rate": 9.030544488711821e-07,
756
+ "loss": 0.0797,
757
+ "mean_token_accuracy": 0.9262475520372391,
758
+ "num_tokens": 6304768.0,
759
+ "step": 770
760
+ },
761
+ {
762
+ "epoch": 2.7960555804571943,
763
+ "grad_norm": 1.106335163116455,
764
+ "learning_rate": 7.702523240371847e-07,
765
+ "loss": 0.0797,
766
+ "mean_token_accuracy": 0.9259662419557572,
767
+ "num_tokens": 6386688.0,
768
+ "step": 780
769
+ },
770
+ {
771
+ "epoch": 2.8319139399372477,
772
+ "grad_norm": 1.4045188426971436,
773
+ "learning_rate": 6.374501992031873e-07,
774
+ "loss": 0.0806,
775
+ "mean_token_accuracy": 0.9263087011873722,
776
+ "num_tokens": 6468608.0,
777
+ "step": 790
778
+ },
779
+ {
780
+ "epoch": 2.8677722994173016,
781
+ "grad_norm": 0.9607040286064148,
782
+ "learning_rate": 5.046480743691899e-07,
783
+ "loss": 0.0666,
784
+ "step": 800
785
+ },
786
+ {
787
+ "epoch": 2.8677722994173016,
788
+ "eval_loss": 0.18328328430652618,
789
+ "eval_mean_token_accuracy": 0.9665898680686951,
790
+ "eval_num_tokens": 6550528.0,
791
+ "eval_runtime": 16.369,
792
+ "eval_samples_per_second": 30.301,
793
+ "eval_steps_per_second": 1.894,
794
+ "step": 800
795
+ },
796
+ {
797
+ "epoch": 2.9036306588973555,
798
+ "grad_norm": 0.9709027409553528,
799
+ "learning_rate": 3.718459495351926e-07,
800
+ "loss": 0.0721,
801
+ "mean_token_accuracy": 0.9297394804656506,
802
+ "num_tokens": 6632448.0,
803
+ "step": 810
804
+ },
805
+ {
806
+ "epoch": 2.9394890183774094,
807
+ "grad_norm": 1.0785083770751953,
808
+ "learning_rate": 2.390438247011952e-07,
809
+ "loss": 0.0746,
810
+ "mean_token_accuracy": 0.9277519538998604,
811
+ "num_tokens": 6714368.0,
812
+ "step": 820
813
+ },
814
+ {
815
+ "epoch": 2.975347377857463,
816
+ "grad_norm": 1.141538381576538,
817
+ "learning_rate": 1.0624169986719788e-07,
818
+ "loss": 0.0714,
819
+ "mean_token_accuracy": 0.927385026961565,
820
+ "num_tokens": 6796288.0,
821
+ "step": 830
822
  }
823
  ],
824
  "logging_steps": 10,
 
833
  "should_evaluate": false,
834
  "should_log": false,
835
  "should_save": true,
836
+ "should_training_stop": true
837
  },
838
  "attributes": {}
839
  }
840
  },
841
+ "total_flos": 1.8108751760326656e+16,
842
  "train_batch_size": 2,
843
  "trial_name": null,
844
  "trial_params": null