0xSero commited on
Commit
1951ffd
·
verified ·
1 Parent(s): 0caaed5

Training in progress, step 800, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2b0c9743b534c9f3b61729ce907957871628f6c6bd4a91c86f7774e533d19f8
3
  size 513878424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99d655d301ea858be3358895b900f83f3546df34d2ad0f6a2f427b9e8efe0dac
3
  size 513878424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cd2bc46275d71c4d0fd88a709f3e830e5dc437ac7ec70e1383abf0ea62ea7dd
3
  size 1028086195
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1bf9643fafa9e9bb5ec0bb44ef18204a17f56b0fcf6fb4603dad61af264a8a
3
  size 1028086195
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50352f21cd324123077e7f0c0cb69ef633f7a882bdc66e19fe26f10515185db8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94892e7742afdef406160d78bd9e53f540b3471184bc2f154b90e6e632d989ef
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0d8ff2fcedc81c60725ad66fe21c76176599c9cb208a592f57fd543d82628a5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57723e8fe5027a1976ba3682a23ecfd5fc8498547c1dca4dadc9ac83caee2746
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 600,
3
- "best_metric": 0.7431696057319641,
4
- "best_model_checkpoint": "./outputs/sero-nouscoder-14b-sft/checkpoint-600",
5
- "epoch": 1.8948616600790515,
6
  "eval_steps": 200,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -641,6 +641,217 @@
641
  "eval_samples_per_second": 0.84,
642
  "eval_steps_per_second": 0.429,
643
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644
  }
645
  ],
646
  "logging_steps": 10,
@@ -660,7 +871,7 @@
660
  "attributes": {}
661
  }
662
  },
663
- "total_flos": 3.318632687376384e+18,
664
  "train_batch_size": 2,
665
  "trial_name": null,
666
  "trial_params": null
 
1
  {
2
+ "best_global_step": 800,
3
+ "best_metric": 0.737969696521759,
4
+ "best_model_checkpoint": "./outputs/sero-nouscoder-14b-sft/checkpoint-800",
5
+ "epoch": 2.524901185770751,
6
  "eval_steps": 200,
7
+ "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
641
  "eval_samples_per_second": 0.84,
642
  "eval_steps_per_second": 0.429,
643
  "step": 600
644
+ },
645
+ {
646
+ "entropy": 0.7412703175097703,
647
+ "epoch": 1.9264822134387352,
648
+ "grad_norm": 0.171875,
649
+ "learning_rate": 6.281907376577316e-06,
650
+ "loss": 0.7424,
651
+ "mean_token_accuracy": 0.8079155292361975,
652
+ "num_tokens": 39471809.0,
653
+ "step": 610
654
+ },
655
+ {
656
+ "entropy": 0.7081608459353447,
657
+ "epoch": 1.958102766798419,
658
+ "grad_norm": 0.1455078125,
659
+ "learning_rate": 5.961257989545189e-06,
660
+ "loss": 0.7057,
661
+ "mean_token_accuracy": 0.811659300327301,
662
+ "num_tokens": 40121372.0,
663
+ "step": 620
664
+ },
665
+ {
666
+ "entropy": 0.7572397343814373,
667
+ "epoch": 1.9897233201581028,
668
+ "grad_norm": 0.146484375,
669
+ "learning_rate": 5.645496550218089e-06,
670
+ "loss": 0.7629,
671
+ "mean_token_accuracy": 0.7998820699751377,
672
+ "num_tokens": 40772512.0,
673
+ "step": 630
674
+ },
675
+ {
676
+ "entropy": 0.7029830978528874,
677
+ "epoch": 2.01897233201581,
678
+ "grad_norm": 0.142578125,
679
+ "learning_rate": 5.3350052135835616e-06,
680
+ "loss": 0.6927,
681
+ "mean_token_accuracy": 0.8122583684083577,
682
+ "num_tokens": 41369804.0,
683
+ "step": 640
684
+ },
685
+ {
686
+ "entropy": 0.7205941841006279,
687
+ "epoch": 2.0505928853754942,
688
+ "grad_norm": 0.1357421875,
689
+ "learning_rate": 5.0301597564088245e-06,
690
+ "loss": 0.7221,
691
+ "mean_token_accuracy": 0.8073350362479687,
692
+ "num_tokens": 42022341.0,
693
+ "step": 650
694
+ },
695
+ {
696
+ "entropy": 0.7175642896443606,
697
+ "epoch": 2.082213438735178,
698
+ "grad_norm": 0.1416015625,
699
+ "learning_rate": 4.7313291224513494e-06,
700
+ "loss": 0.7146,
701
+ "mean_token_accuracy": 0.8162445619702339,
702
+ "num_tokens": 42669879.0,
703
+ "step": 660
704
+ },
705
+ {
706
+ "entropy": 0.7316870277747511,
707
+ "epoch": 2.1138339920948614,
708
+ "grad_norm": 0.1396484375,
709
+ "learning_rate": 4.438874975939176e-06,
710
+ "loss": 0.735,
711
+ "mean_token_accuracy": 0.8043030217289925,
712
+ "num_tokens": 43316130.0,
713
+ "step": 670
714
+ },
715
+ {
716
+ "entropy": 0.7129190620034933,
717
+ "epoch": 2.1454545454545455,
718
+ "grad_norm": 0.1494140625,
719
+ "learning_rate": 4.153151263861379e-06,
720
+ "loss": 0.716,
721
+ "mean_token_accuracy": 0.8076732002198697,
722
+ "num_tokens": 43961436.0,
723
+ "step": 680
724
+ },
725
+ {
726
+ "entropy": 0.6951379429548978,
727
+ "epoch": 2.177075098814229,
728
+ "grad_norm": 0.140625,
729
+ "learning_rate": 3.874503787598461e-06,
730
+ "loss": 0.6945,
731
+ "mean_token_accuracy": 0.8157570861279965,
732
+ "num_tokens": 44610983.0,
733
+ "step": 690
734
+ },
735
+ {
736
+ "entropy": 0.740327725932002,
737
+ "epoch": 2.208695652173913,
738
+ "grad_norm": 0.166015625,
739
+ "learning_rate": 3.6032697844110896e-06,
740
+ "loss": 0.7392,
741
+ "mean_token_accuracy": 0.8023809418082237,
742
+ "num_tokens": 45261676.0,
743
+ "step": 700
744
+ },
745
+ {
746
+ "entropy": 0.705286979302764,
747
+ "epoch": 2.240316205533597,
748
+ "grad_norm": 0.1474609375,
749
+ "learning_rate": 3.3397775192936465e-06,
750
+ "loss": 0.7086,
751
+ "mean_token_accuracy": 0.8123329438269138,
752
+ "num_tokens": 45910102.0,
753
+ "step": 710
754
+ },
755
+ {
756
+ "entropy": 0.7086669180542231,
757
+ "epoch": 2.271936758893281,
758
+ "grad_norm": 0.13671875,
759
+ "learning_rate": 3.084345887686655e-06,
760
+ "loss": 0.711,
761
+ "mean_token_accuracy": 0.8112940810620785,
762
+ "num_tokens": 46558151.0,
763
+ "step": 720
764
+ },
765
+ {
766
+ "entropy": 0.685754819586873,
767
+ "epoch": 2.3035573122529645,
768
+ "grad_norm": 0.134765625,
769
+ "learning_rate": 2.8372840295288106e-06,
770
+ "loss": 0.685,
771
+ "mean_token_accuracy": 0.8164379067718983,
772
+ "num_tokens": 47205220.0,
773
+ "step": 730
774
+ },
775
+ {
776
+ "entropy": 0.7092605076730252,
777
+ "epoch": 2.335177865612648,
778
+ "grad_norm": 0.1376953125,
779
+ "learning_rate": 2.598890955115757e-06,
780
+ "loss": 0.7098,
781
+ "mean_token_accuracy": 0.8080747678875924,
782
+ "num_tokens": 47855619.0,
783
+ "step": 740
784
+ },
785
+ {
786
+ "entropy": 0.7719928354024888,
787
+ "epoch": 2.366798418972332,
788
+ "grad_norm": 0.158203125,
789
+ "learning_rate": 2.369455183218423e-06,
790
+ "loss": 0.7762,
791
+ "mean_token_accuracy": 0.8013517506420612,
792
+ "num_tokens": 48507209.0,
793
+ "step": 750
794
+ },
795
+ {
796
+ "entropy": 0.7657644001767039,
797
+ "epoch": 2.3984189723320157,
798
+ "grad_norm": 0.166015625,
799
+ "learning_rate": 2.1492543918988906e-06,
800
+ "loss": 0.7674,
801
+ "mean_token_accuracy": 0.800662949681282,
802
+ "num_tokens": 49154758.0,
803
+ "step": 760
804
+ },
805
+ {
806
+ "entropy": 0.7193233577534557,
807
+ "epoch": 2.4300395256917,
808
+ "grad_norm": 0.1435546875,
809
+ "learning_rate": 1.9385550824463727e-06,
810
+ "loss": 0.7214,
811
+ "mean_token_accuracy": 0.8076900616288185,
812
+ "num_tokens": 49802558.0,
813
+ "step": 770
814
+ },
815
+ {
816
+ "entropy": 0.6996176840737462,
817
+ "epoch": 2.4616600790513834,
818
+ "grad_norm": 0.1435546875,
819
+ "learning_rate": 1.7376122568400533e-06,
820
+ "loss": 0.7008,
821
+ "mean_token_accuracy": 0.813380641490221,
822
+ "num_tokens": 50449669.0,
823
+ "step": 780
824
+ },
825
+ {
826
+ "entropy": 0.7119161710143089,
827
+ "epoch": 2.493280632411067,
828
+ "grad_norm": 0.1552734375,
829
+ "learning_rate": 1.5466691091291452e-06,
830
+ "loss": 0.7119,
831
+ "mean_token_accuracy": 0.8110212564468384,
832
+ "num_tokens": 51099730.0,
833
+ "step": 790
834
+ },
835
+ {
836
+ "entropy": 0.705672075971961,
837
+ "epoch": 2.524901185770751,
838
+ "grad_norm": 0.146484375,
839
+ "learning_rate": 1.3659567311036804e-06,
840
+ "loss": 0.7046,
841
+ "mean_token_accuracy": 0.8123706214129924,
842
+ "num_tokens": 51750159.0,
843
+ "step": 800
844
+ },
845
+ {
846
+ "epoch": 2.524901185770751,
847
+ "eval_entropy": 0.7314845383167267,
848
+ "eval_loss": 0.737969696521759,
849
+ "eval_mean_token_accuracy": 0.802791440486908,
850
+ "eval_num_tokens": 51750159.0,
851
+ "eval_runtime": 58.415,
852
+ "eval_samples_per_second": 0.839,
853
+ "eval_steps_per_second": 0.428,
854
+ "step": 800
855
  }
856
  ],
857
  "logging_steps": 10,
 
871
  "attributes": {}
872
  }
873
  },
874
+ "total_flos": 4.423798999920384e+18,
875
  "train_batch_size": 2,
876
  "trial_name": null,
877
  "trial_params": null