LLJYY commited on
Commit
46b5873
·
verified ·
1 Parent(s): 3e7440f

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dd24ee6828501b624fa6d66fd1194cee27acdf6fbf4040fa3393ed025f1e0b8
3
  size 174663600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e2039cdf0d4ca2fe364489b9a26fd1582201d0fd89c103dcb7a06c7a3083e2c
3
  size 174663600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57c4a4dab1575e19036cc179b540af28afc954075c76fa1c3f74f467b18a0a54
3
  size 177908997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0df68d466562b2d42ea22f3c6f25f54c0ba6fc82f68b90e89c45cb2738741123
3
  size 177908997
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04eefe07496c9ea6eacb03b570d4b4b5896211d650c0810a1180d502bea3bcc3
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4004b539c016dc3dd1a46f0cfd51bdccd67571231886c54485e4a0726c042be8
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:119c8b8031efeada1dd54137e4c5ca8dc90f054b53a8f73cacb65b1b4acc4f58
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba16ce8cf2517b7afd4a4313c86e62d498e4965522f1c59e111da3f1986b5604
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7086335183654187,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -608,6 +608,206 @@
608
  "mean_token_accuracy": 0.9487812982499599,
609
  "num_tokens": 24793698.0,
610
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  }
612
  ],
613
  "logging_steps": 25,
@@ -627,7 +827,7 @@
627
  "attributes": {}
628
  }
629
  },
630
- "total_flos": 1.587361750240276e+18,
631
  "train_batch_size": 2,
632
  "trial_name": null,
633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9448446911538916,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
608
  "mean_token_accuracy": 0.9487812982499599,
609
  "num_tokens": 24793698.0,
610
  "step": 1500
611
+ },
612
+ {
613
+ "entropy": 0.20117157969623803,
614
+ "epoch": 0.7204440770048424,
615
+ "grad_norm": 0.212890625,
616
+ "learning_rate": 9.416027663269881e-05,
617
+ "loss": 0.199,
618
+ "mean_token_accuracy": 0.9457039895653725,
619
+ "num_tokens": 25203851.0,
620
+ "step": 1525
621
+ },
622
+ {
623
+ "entropy": 0.18685766063630582,
624
+ "epoch": 0.732254635644266,
625
+ "grad_norm": 0.220703125,
626
+ "learning_rate": 9.383385876187659e-05,
627
+ "loss": 0.1848,
628
+ "mean_token_accuracy": 0.9489575871825218,
629
+ "num_tokens": 25608056.0,
630
+ "step": 1550
631
+ },
632
+ {
633
+ "entropy": 0.20038649912923576,
634
+ "epoch": 0.7440651942836897,
635
+ "grad_norm": 0.203125,
636
+ "learning_rate": 9.34991624194776e-05,
637
+ "loss": 0.1964,
638
+ "mean_token_accuracy": 0.9469015775620937,
639
+ "num_tokens": 26027627.0,
640
+ "step": 1575
641
+ },
642
+ {
643
+ "entropy": 0.19791467829607426,
644
+ "epoch": 0.7558757529231133,
645
+ "grad_norm": 0.1953125,
646
+ "learning_rate": 9.315625081632191e-05,
647
+ "loss": 0.1949,
648
+ "mean_token_accuracy": 0.9461945466697216,
649
+ "num_tokens": 26437140.0,
650
+ "step": 1600
651
+ },
652
+ {
653
+ "entropy": 0.1835308167617768,
654
+ "epoch": 0.7676863115625369,
655
+ "grad_norm": 0.19140625,
656
+ "learning_rate": 9.280518871476536e-05,
657
+ "loss": 0.182,
658
+ "mean_token_accuracy": 0.9500955049693585,
659
+ "num_tokens": 26845418.0,
660
+ "step": 1625
661
+ },
662
+ {
663
+ "entropy": 0.17514564257115126,
664
+ "epoch": 0.7794968702019606,
665
+ "grad_norm": 0.16015625,
666
+ "learning_rate": 9.244604241646864e-05,
667
+ "loss": 0.1703,
668
+ "mean_token_accuracy": 0.9527664017677308,
669
+ "num_tokens": 27259457.0,
670
+ "step": 1650
671
+ },
672
+ {
673
+ "entropy": 0.1950150650832802,
674
+ "epoch": 0.7913074288413842,
675
+ "grad_norm": 0.322265625,
676
+ "learning_rate": 9.207887974987546e-05,
677
+ "loss": 0.1933,
678
+ "mean_token_accuracy": 0.9472205652296544,
679
+ "num_tokens": 27681677.0,
680
+ "step": 1675
681
+ },
682
+ {
683
+ "entropy": 0.18074996698647738,
684
+ "epoch": 0.8031179874808079,
685
+ "grad_norm": 0.240234375,
686
+ "learning_rate": 9.170377005740251e-05,
687
+ "loss": 0.1771,
688
+ "mean_token_accuracy": 0.9505787827074528,
689
+ "num_tokens": 28105249.0,
690
+ "step": 1700
691
+ },
692
+ {
693
+ "entropy": 0.1879336739424616,
694
+ "epoch": 0.8149285461202315,
695
+ "grad_norm": 0.19140625,
696
+ "learning_rate": 9.132078418234344e-05,
697
+ "loss": 0.1849,
698
+ "mean_token_accuracy": 0.9493447379767894,
699
+ "num_tokens": 28530587.0,
700
+ "step": 1725
701
+ },
702
+ {
703
+ "entropy": 0.1837541355099529,
704
+ "epoch": 0.8267391047596552,
705
+ "grad_norm": 0.158203125,
706
+ "learning_rate": 9.09299944554893e-05,
707
+ "loss": 0.1806,
708
+ "mean_token_accuracy": 0.9498767641186714,
709
+ "num_tokens": 28958518.0,
710
+ "step": 1750
711
+ },
712
+ {
713
+ "entropy": 0.18400955947116018,
714
+ "epoch": 0.8385496633990788,
715
+ "grad_norm": 0.169921875,
716
+ "learning_rate": 9.05314746814683e-05,
717
+ "loss": 0.1825,
718
+ "mean_token_accuracy": 0.9495964366197586,
719
+ "num_tokens": 29370081.0,
720
+ "step": 1775
721
+ },
722
+ {
723
+ "entropy": 0.19354972328990697,
724
+ "epoch": 0.8503602220385024,
725
+ "grad_norm": 0.193359375,
726
+ "learning_rate": 9.012530012480684e-05,
727
+ "loss": 0.1909,
728
+ "mean_token_accuracy": 0.9474910768866539,
729
+ "num_tokens": 29782814.0,
730
+ "step": 1800
731
+ },
732
+ {
733
+ "entropy": 0.17223400254733862,
734
+ "epoch": 0.8621707806779261,
735
+ "grad_norm": 0.2216796875,
736
+ "learning_rate": 8.971154749571522e-05,
737
+ "loss": 0.171,
738
+ "mean_token_accuracy": 0.9524741047620773,
739
+ "num_tokens": 30188402.0,
740
+ "step": 1825
741
+ },
742
+ {
743
+ "entropy": 0.19484048396348952,
744
+ "epoch": 0.8739813393173497,
745
+ "grad_norm": 0.19140625,
746
+ "learning_rate": 8.92902949356e-05,
747
+ "loss": 0.1928,
748
+ "mean_token_accuracy": 0.947279536575079,
749
+ "num_tokens": 30600792.0,
750
+ "step": 1850
751
+ },
752
+ {
753
+ "entropy": 0.19088726976886392,
754
+ "epoch": 0.8857918979567734,
755
+ "grad_norm": 0.181640625,
756
+ "learning_rate": 8.886162200230628e-05,
757
+ "loss": 0.1894,
758
+ "mean_token_accuracy": 0.9477717036008835,
759
+ "num_tokens": 31020474.0,
760
+ "step": 1875
761
+ },
762
+ {
763
+ "entropy": 0.17441698019392787,
764
+ "epoch": 0.897602456596197,
765
+ "grad_norm": 0.1845703125,
766
+ "learning_rate": 8.84256096550924e-05,
767
+ "loss": 0.1717,
768
+ "mean_token_accuracy": 0.95237029671669,
769
+ "num_tokens": 31443050.0,
770
+ "step": 1900
771
+ },
772
+ {
773
+ "entropy": 0.18448449746705592,
774
+ "epoch": 0.9094130152356207,
775
+ "grad_norm": 0.1748046875,
776
+ "learning_rate": 8.798234023933985e-05,
777
+ "loss": 0.1825,
778
+ "mean_token_accuracy": 0.9495246517658233,
779
+ "num_tokens": 31852766.0,
780
+ "step": 1925
781
+ },
782
+ {
783
+ "entropy": 0.18353000645525752,
784
+ "epoch": 0.9212235738750443,
785
+ "grad_norm": 0.146484375,
786
+ "learning_rate": 8.753189747100161e-05,
787
+ "loss": 0.1808,
788
+ "mean_token_accuracy": 0.9494968324899673,
789
+ "num_tokens": 32264050.0,
790
+ "step": 1950
791
+ },
792
+ {
793
+ "entropy": 0.18177078458480536,
794
+ "epoch": 0.933034132514468,
795
+ "grad_norm": 0.1455078125,
796
+ "learning_rate": 8.707436642079154e-05,
797
+ "loss": 0.181,
798
+ "mean_token_accuracy": 0.9502438700199127,
799
+ "num_tokens": 32670263.0,
800
+ "step": 1975
801
+ },
802
+ {
803
+ "entropy": 0.18713734617456793,
804
+ "epoch": 0.9448446911538916,
805
+ "grad_norm": 0.189453125,
806
+ "learning_rate": 8.660983349811783e-05,
807
+ "loss": 0.1845,
808
+ "mean_token_accuracy": 0.9492973360419273,
809
+ "num_tokens": 33086791.0,
810
+ "step": 2000
811
  }
812
  ],
813
  "logging_steps": 25,
 
827
  "attributes": {}
828
  }
829
  },
830
+ "total_flos": 2.1213520186826977e+18,
831
  "train_batch_size": 2,
832
  "trial_name": null,
833
  "trial_params": null