TweedleDeepLearnings commited on
Commit
cd267e2
·
verified ·
1 Parent(s): 56270b8

Training in progress, step 1050, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f565cb9d9aa20d3fc0c4cf21dc1af5220363606104db9abb5d4f89f7001196b
3
  size 1047100024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f30fd72029c85ef1034060b6a05f13a18f6f374cbcf84ea05e5ac07059de9bf
3
  size 1047100024
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52ac987815df5e61bc63573356f0459dfb09134d5989cd7a883ac3b44f899fa6
3
  size 2027092538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edbbe1503e5f2d7f92e8091c19d5b8613b3fca14aaa57a6dfe7ff45fbf26731a
3
  size 2027092538
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c834ec5bbb245414ce634f25cb531bf19a3d11dbafca153709906b07ea0138c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4a8c6500d1d10db1384ce2cca16f709390d5090ab3697e89287dc445b9fabd4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0967ac1f523632f67d95657fd1fbf687c0e98c17b5efba20ddfc48b60eebb9ed
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fed480f2c3a03c35e03e4ee1faa1f3587e2694d78b3b9e74d3abb3f0e0a5d9b
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.3061332702636719,
3
- "best_model_checkpoint": "./output/checkpoint-750",
4
- "epoch": 1.7162471395881007,
5
  "eval_steps": 150,
6
- "global_step": 750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -572,6 +572,232 @@
572
  "eval_samples_per_second": 12.975,
573
  "eval_steps_per_second": 12.975,
574
  "step": 750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  }
576
  ],
577
  "logging_steps": 10,
@@ -591,7 +817,7 @@
591
  "attributes": {}
592
  }
593
  },
594
- "total_flos": 3.88467426249769e+16,
595
  "train_batch_size": 4,
596
  "trial_name": null,
597
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.3020451068878174,
3
+ "best_model_checkpoint": "./output/checkpoint-1050",
4
+ "epoch": 2.402745995423341,
5
  "eval_steps": 150,
6
+ "global_step": 1050,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
572
  "eval_samples_per_second": 12.975,
573
  "eval_steps_per_second": 12.975,
574
  "step": 750
575
+ },
576
+ {
577
+ "epoch": 1.7391304347826086,
578
+ "grad_norm": 3.9558420181274414,
579
+ "learning_rate": 7.647194453650227e-06,
580
+ "loss": 1.2581,
581
+ "step": 760
582
+ },
583
+ {
584
+ "epoch": 1.7620137299771166,
585
+ "grad_norm": 4.010689735412598,
586
+ "learning_rate": 7.636588493358299e-06,
587
+ "loss": 1.2551,
588
+ "step": 770
589
+ },
590
+ {
591
+ "epoch": 1.7848970251716247,
592
+ "grad_norm": 3.8898725509643555,
593
+ "learning_rate": 7.625833047156952e-06,
594
+ "loss": 1.2165,
595
+ "step": 780
596
+ },
597
+ {
598
+ "epoch": 1.8077803203661327,
599
+ "grad_norm": 2.9613306522369385,
600
+ "learning_rate": 7.614928557160453e-06,
601
+ "loss": 1.2551,
602
+ "step": 790
603
+ },
604
+ {
605
+ "epoch": 1.8306636155606406,
606
+ "grad_norm": 7.471399784088135,
607
+ "learning_rate": 7.6038754716096755e-06,
608
+ "loss": 1.2807,
609
+ "step": 800
610
+ },
611
+ {
612
+ "epoch": 1.8535469107551488,
613
+ "grad_norm": 2.4937212467193604,
614
+ "learning_rate": 7.592674244853676e-06,
615
+ "loss": 1.1959,
616
+ "step": 810
617
+ },
618
+ {
619
+ "epoch": 1.8764302059496567,
620
+ "grad_norm": 3.251375675201416,
621
+ "learning_rate": 7.5813253373310125e-06,
622
+ "loss": 1.1757,
623
+ "step": 820
624
+ },
625
+ {
626
+ "epoch": 1.8993135011441646,
627
+ "grad_norm": 4.890213489532471,
628
+ "learning_rate": 7.5698292155508235e-06,
629
+ "loss": 1.2003,
630
+ "step": 830
631
+ },
632
+ {
633
+ "epoch": 1.9221967963386728,
634
+ "grad_norm": 2.9511072635650635,
635
+ "learning_rate": 7.558186352073647e-06,
636
+ "loss": 1.2203,
637
+ "step": 840
638
+ },
639
+ {
640
+ "epoch": 1.9450800915331807,
641
+ "grad_norm": 3.5382401943206787,
642
+ "learning_rate": 7.546397225492001e-06,
643
+ "loss": 1.146,
644
+ "step": 850
645
+ },
646
+ {
647
+ "epoch": 1.9679633867276887,
648
+ "grad_norm": 3.678964138031006,
649
+ "learning_rate": 7.534462320410701e-06,
650
+ "loss": 1.1553,
651
+ "step": 860
652
+ },
653
+ {
654
+ "epoch": 1.9908466819221968,
655
+ "grad_norm": 3.4112305641174316,
656
+ "learning_rate": 7.5223821274269514e-06,
657
+ "loss": 1.3744,
658
+ "step": 870
659
+ },
660
+ {
661
+ "epoch": 2.013729977116705,
662
+ "grad_norm": 4.165940761566162,
663
+ "learning_rate": 7.510157143110172e-06,
664
+ "loss": 1.2193,
665
+ "step": 880
666
+ },
667
+ {
668
+ "epoch": 2.0366132723112127,
669
+ "grad_norm": 3.76263165473938,
670
+ "learning_rate": 7.497787869981582e-06,
671
+ "loss": 1.1421,
672
+ "step": 890
673
+ },
674
+ {
675
+ "epoch": 2.059496567505721,
676
+ "grad_norm": 4.58417272567749,
677
+ "learning_rate": 7.485274816493557e-06,
678
+ "loss": 1.3133,
679
+ "step": 900
680
+ },
681
+ {
682
+ "epoch": 2.059496567505721,
683
+ "eval_loss": 1.3053786754608154,
684
+ "eval_runtime": 15.8699,
685
+ "eval_samples_per_second": 12.224,
686
+ "eval_steps_per_second": 12.224,
687
+ "step": 900
688
+ },
689
+ {
690
+ "epoch": 2.082379862700229,
691
+ "grad_norm": 3.815793752670288,
692
+ "learning_rate": 7.472618497008713e-06,
693
+ "loss": 1.2318,
694
+ "step": 910
695
+ },
696
+ {
697
+ "epoch": 2.1052631578947367,
698
+ "grad_norm": 3.645143747329712,
699
+ "learning_rate": 7.459819431778774e-06,
700
+ "loss": 1.1741,
701
+ "step": 920
702
+ },
703
+ {
704
+ "epoch": 2.128146453089245,
705
+ "grad_norm": 5.823740005493164,
706
+ "learning_rate": 7.4468781469231794e-06,
707
+ "loss": 1.1319,
708
+ "step": 930
709
+ },
710
+ {
711
+ "epoch": 2.151029748283753,
712
+ "grad_norm": 4.464242458343506,
713
+ "learning_rate": 7.433795174407464e-06,
714
+ "loss": 1.2388,
715
+ "step": 940
716
+ },
717
+ {
718
+ "epoch": 2.1739130434782608,
719
+ "grad_norm": 3.9899096488952637,
720
+ "learning_rate": 7.420571052021385e-06,
721
+ "loss": 1.1491,
722
+ "step": 950
723
+ },
724
+ {
725
+ "epoch": 2.196796338672769,
726
+ "grad_norm": 3.726358652114868,
727
+ "learning_rate": 7.407206323356817e-06,
728
+ "loss": 1.2732,
729
+ "step": 960
730
+ },
731
+ {
732
+ "epoch": 2.219679633867277,
733
+ "grad_norm": 3.940854787826538,
734
+ "learning_rate": 7.39370153778541e-06,
735
+ "loss": 1.1286,
736
+ "step": 970
737
+ },
738
+ {
739
+ "epoch": 2.242562929061785,
740
+ "grad_norm": 4.5767107009887695,
741
+ "learning_rate": 7.380057250436005e-06,
742
+ "loss": 1.1473,
743
+ "step": 980
744
+ },
745
+ {
746
+ "epoch": 2.265446224256293,
747
+ "grad_norm": 3.9847497940063477,
748
+ "learning_rate": 7.366274022171812e-06,
749
+ "loss": 1.1198,
750
+ "step": 990
751
+ },
752
+ {
753
+ "epoch": 2.288329519450801,
754
+ "grad_norm": 4.3240838050842285,
755
+ "learning_rate": 7.352352419567362e-06,
756
+ "loss": 1.1236,
757
+ "step": 1000
758
+ },
759
+ {
760
+ "epoch": 2.311212814645309,
761
+ "grad_norm": 3.4978835582733154,
762
+ "learning_rate": 7.33829301488521e-06,
763
+ "loss": 1.0911,
764
+ "step": 1010
765
+ },
766
+ {
767
+ "epoch": 2.334096109839817,
768
+ "grad_norm": 3.878068447113037,
769
+ "learning_rate": 7.324096386052415e-06,
770
+ "loss": 1.1133,
771
+ "step": 1020
772
+ },
773
+ {
774
+ "epoch": 2.356979405034325,
775
+ "grad_norm": 5.018012523651123,
776
+ "learning_rate": 7.309763116636785e-06,
777
+ "loss": 1.1869,
778
+ "step": 1030
779
+ },
780
+ {
781
+ "epoch": 2.379862700228833,
782
+ "grad_norm": 3.2581946849823,
783
+ "learning_rate": 7.295293795822886e-06,
784
+ "loss": 1.076,
785
+ "step": 1040
786
+ },
787
+ {
788
+ "epoch": 2.402745995423341,
789
+ "grad_norm": 5.764566421508789,
790
+ "learning_rate": 7.280689018387823e-06,
791
+ "loss": 1.2117,
792
+ "step": 1050
793
+ },
794
+ {
795
+ "epoch": 2.402745995423341,
796
+ "eval_loss": 1.3020451068878174,
797
+ "eval_runtime": 15.0321,
798
+ "eval_samples_per_second": 12.906,
799
+ "eval_steps_per_second": 12.906,
800
+ "step": 1050
801
  }
802
  ],
803
  "logging_steps": 10,
 
817
  "attributes": {}
818
  }
819
  },
820
+ "total_flos": 5.438212807302758e+16,
821
  "train_batch_size": 4,
822
  "trial_name": null,
823
  "trial_params": null