3N3G commited on
Commit
dedaca6
·
verified ·
1 Parent(s): 5728b80

Training in progress, step 112, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5696b0909841c30ce657da7fef89d416fda37a2d1d0a8e66831a5ea7676d6e4
3
  size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e47b82aa2abc24774e65bf3c840b73254af400c018b54a4b74076c10b7aa50f1
3
  size 4969539560
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a7223ea8f99f7b799b24686be78454f9f72e8d03fef9f83c56d83584086be14
3
  size 1912795688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3948e1712d66e4846b20e1f82e841d2b592341e95b95f7284d25901e27823131
3
  size 1912795688
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 20.0,
6
  "eval_steps": 16,
7
- "global_step": 80,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -608,6 +608,246 @@
608
  "eval_samples_per_second": 17.305,
609
  "eval_steps_per_second": 17.305,
610
  "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  }
612
  ],
613
  "logging_steps": 1,
@@ -627,7 +867,7 @@
627
  "attributes": {}
628
  }
629
  },
630
- "total_flos": 2.68306572017664e+16,
631
  "train_batch_size": 1,
632
  "trial_name": null,
633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 28.0,
6
  "eval_steps": 16,
7
+ "global_step": 112,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
608
  "eval_samples_per_second": 17.305,
609
  "eval_steps_per_second": 17.305,
610
  "step": 80
611
+ },
612
+ {
613
+ "epoch": 20.29090909090909,
614
+ "grad_norm": 9.43545913696289,
615
+ "learning_rate": 9.728616793536587e-08,
616
+ "loss": 0.726,
617
+ "step": 81
618
+ },
619
+ {
620
+ "epoch": 20.581818181818182,
621
+ "grad_norm": 8.36042308807373,
622
+ "learning_rate": 9.715024851617789e-08,
623
+ "loss": 0.7908,
624
+ "step": 82
625
+ },
626
+ {
627
+ "epoch": 20.87272727272727,
628
+ "grad_norm": 9.46149730682373,
629
+ "learning_rate": 9.701111919237408e-08,
630
+ "loss": 0.8219,
631
+ "step": 83
632
+ },
633
+ {
634
+ "epoch": 21.0,
635
+ "grad_norm": 9.277331352233887,
636
+ "learning_rate": 9.68687905591911e-08,
637
+ "loss": 0.7955,
638
+ "step": 84
639
+ },
640
+ {
641
+ "epoch": 21.29090909090909,
642
+ "grad_norm": 9.980899810791016,
643
+ "learning_rate": 9.672327345550542e-08,
644
+ "loss": 0.8459,
645
+ "step": 85
646
+ },
647
+ {
648
+ "epoch": 21.581818181818182,
649
+ "grad_norm": 8.734892845153809,
650
+ "learning_rate": 9.65745789630079e-08,
651
+ "loss": 0.7952,
652
+ "step": 86
653
+ },
654
+ {
655
+ "epoch": 21.87272727272727,
656
+ "grad_norm": 7.979213714599609,
657
+ "learning_rate": 9.642271840535982e-08,
658
+ "loss": 0.6928,
659
+ "step": 87
660
+ },
661
+ {
662
+ "epoch": 22.0,
663
+ "grad_norm": 9.570889472961426,
664
+ "learning_rate": 9.626770334733058e-08,
665
+ "loss": 0.7813,
666
+ "step": 88
667
+ },
668
+ {
669
+ "epoch": 22.29090909090909,
670
+ "grad_norm": 9.478497505187988,
671
+ "learning_rate": 9.610954559391703e-08,
672
+ "loss": 0.783,
673
+ "step": 89
674
+ },
675
+ {
676
+ "epoch": 22.581818181818182,
677
+ "grad_norm": 8.57199478149414,
678
+ "learning_rate": 9.594825718944444e-08,
679
+ "loss": 0.7859,
680
+ "step": 90
681
+ },
682
+ {
683
+ "epoch": 22.87272727272727,
684
+ "grad_norm": 8.782203674316406,
685
+ "learning_rate": 9.578385041664925e-08,
686
+ "loss": 0.7784,
687
+ "step": 91
688
+ },
689
+ {
690
+ "epoch": 23.0,
691
+ "grad_norm": 9.160470008850098,
692
+ "learning_rate": 9.561633779574373e-08,
693
+ "loss": 0.7613,
694
+ "step": 92
695
+ },
696
+ {
697
+ "epoch": 23.29090909090909,
698
+ "grad_norm": 8.80034065246582,
699
+ "learning_rate": 9.544573208346251e-08,
700
+ "loss": 0.7708,
701
+ "step": 93
702
+ },
703
+ {
704
+ "epoch": 23.581818181818182,
705
+ "grad_norm": 9.001204490661621,
706
+ "learning_rate": 9.527204627209113e-08,
707
+ "loss": 0.7975,
708
+ "step": 94
709
+ },
710
+ {
711
+ "epoch": 23.87272727272727,
712
+ "grad_norm": 8.64294147491455,
713
+ "learning_rate": 9.509529358847655e-08,
714
+ "loss": 0.7533,
715
+ "step": 95
716
+ },
717
+ {
718
+ "epoch": 24.0,
719
+ "grad_norm": 9.539164543151855,
720
+ "learning_rate": 9.491548749301997e-08,
721
+ "loss": 0.8112,
722
+ "step": 96
723
+ },
724
+ {
725
+ "epoch": 24.0,
726
+ "eval_loss": 0.7388671040534973,
727
+ "eval_runtime": 0.7379,
728
+ "eval_samples_per_second": 17.617,
729
+ "eval_steps_per_second": 17.617,
730
+ "step": 96
731
+ },
732
+ {
733
+ "epoch": 24.29090909090909,
734
+ "grad_norm": 8.564647674560547,
735
+ "learning_rate": 9.473264167865172e-08,
736
+ "loss": 0.779,
737
+ "step": 97
738
+ },
739
+ {
740
+ "epoch": 24.581818181818182,
741
+ "grad_norm": 8.466269493103027,
742
+ "learning_rate": 9.454677006978843e-08,
743
+ "loss": 0.7427,
744
+ "step": 98
745
+ },
746
+ {
747
+ "epoch": 24.87272727272727,
748
+ "grad_norm": 9.549156188964844,
749
+ "learning_rate": 9.435788682127281e-08,
750
+ "loss": 0.7749,
751
+ "step": 99
752
+ },
753
+ {
754
+ "epoch": 25.0,
755
+ "grad_norm": 8.791007041931152,
756
+ "learning_rate": 9.416600631729548e-08,
757
+ "loss": 0.8413,
758
+ "step": 100
759
+ },
760
+ {
761
+ "epoch": 25.29090909090909,
762
+ "grad_norm": 8.481273651123047,
763
+ "learning_rate": 9.397114317029974e-08,
764
+ "loss": 0.7987,
765
+ "step": 101
766
+ },
767
+ {
768
+ "epoch": 25.581818181818182,
769
+ "grad_norm": 7.957334518432617,
770
+ "learning_rate": 9.377331221986867e-08,
771
+ "loss": 0.7579,
772
+ "step": 102
773
+ },
774
+ {
775
+ "epoch": 25.87272727272727,
776
+ "grad_norm": 7.695952415466309,
777
+ "learning_rate": 9.357252853159505e-08,
778
+ "loss": 0.7138,
779
+ "step": 103
780
+ },
781
+ {
782
+ "epoch": 26.0,
783
+ "grad_norm": 8.535294532775879,
784
+ "learning_rate": 9.336880739593415e-08,
785
+ "loss": 0.8143,
786
+ "step": 104
787
+ },
788
+ {
789
+ "epoch": 26.29090909090909,
790
+ "grad_norm": 7.785234451293945,
791
+ "learning_rate": 9.316216432703917e-08,
792
+ "loss": 0.7595,
793
+ "step": 105
794
+ },
795
+ {
796
+ "epoch": 26.581818181818182,
797
+ "grad_norm": 7.210692882537842,
798
+ "learning_rate": 9.295261506157986e-08,
799
+ "loss": 0.6892,
800
+ "step": 106
801
+ },
802
+ {
803
+ "epoch": 26.87272727272727,
804
+ "grad_norm": 7.439105033874512,
805
+ "learning_rate": 9.274017555754408e-08,
806
+ "loss": 0.7828,
807
+ "step": 107
808
+ },
809
+ {
810
+ "epoch": 27.0,
811
+ "grad_norm": 8.5601167678833,
812
+ "learning_rate": 9.252486199302256e-08,
813
+ "loss": 0.8267,
814
+ "step": 108
815
+ },
816
+ {
817
+ "epoch": 27.29090909090909,
818
+ "grad_norm": 7.751751899719238,
819
+ "learning_rate": 9.230669076497686e-08,
820
+ "loss": 0.7837,
821
+ "step": 109
822
+ },
823
+ {
824
+ "epoch": 27.581818181818182,
825
+ "grad_norm": 7.58750057220459,
826
+ "learning_rate": 9.20856784879907e-08,
827
+ "loss": 0.7629,
828
+ "step": 110
829
+ },
830
+ {
831
+ "epoch": 27.87272727272727,
832
+ "grad_norm": 7.078155040740967,
833
+ "learning_rate": 9.186184199300463e-08,
834
+ "loss": 0.732,
835
+ "step": 111
836
+ },
837
+ {
838
+ "epoch": 28.0,
839
+ "grad_norm": 7.075254440307617,
840
+ "learning_rate": 9.163519832603437e-08,
841
+ "loss": 0.708,
842
+ "step": 112
843
+ },
844
+ {
845
+ "epoch": 28.0,
846
+ "eval_loss": 0.7153984904289246,
847
+ "eval_runtime": 0.7384,
848
+ "eval_samples_per_second": 17.605,
849
+ "eval_steps_per_second": 17.605,
850
+ "step": 112
851
  }
852
  ],
853
  "logging_steps": 1,
 
867
  "attributes": {}
868
  }
869
  },
870
+ "total_flos": 3.756292008247296e+16,
871
  "train_batch_size": 1,
872
  "trial_name": null,
873
  "trial_params": null