Kudod committed on
Commit
b8ef482
·
verified ·
1 Parent(s): f0a3bd9

Training in progress, step 60000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4430b2cd91f8f5fd01fd03553f76ef9f10a36827da138a3e8eaa981ae6a46670
3
  size 357393656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:738e31afb26cb178bc406668bb49ad13f5b128cdd690fb5d4b785da8ca0bb63c
3
  size 357393656
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e65862f074b26014a8698b6bea02fcb81eff286edfc108c5226df5fd63301594
3
  size 714965067
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06a3f9ac205780c80a9050ff5ee89e3f122c188aaf1cfbe22549b997f3194fa8
3
  size 714965067
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:406ccc095198a9b51f73271d2ea161ceb1c293768b592772f3eeb691591f2264
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51d5845e27caf8cfea1d2b1f8892eb08f3013019008d987ba6f3d71ae629c686
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16ef4994c96a5a240fd745bfffb644fd64784bbe7855b0c7aab4eb65aed0aa2d
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2df85f60e785e2e08c732a0ce809bd89b9448d10319df74b89110aea5ec1d783
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53ce6824ad446863a69db60dfce5a739e9186a8a15f324d6c03901630309b780
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c022a44c12e767c6893f92275f46ad4b324df28421791b06ec9aa6234af9b0cb
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.157658406785298,
6
  "eval_steps": 10000,
7
- "global_step": 50000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -748,6 +748,154 @@
748
  "eval_samples_per_second": 137.102,
749
  "eval_steps_per_second": 4.285,
750
  "step": 50000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  }
752
  ],
753
  "logging_steps": 500,
@@ -767,7 +915,7 @@
767
  "attributes": {}
768
  }
769
  },
770
- "total_flos": 5.6054486632842035e+17,
771
  "train_batch_size": 32,
772
  "trial_name": null,
773
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.989190088142358,
6
  "eval_steps": 10000,
7
+ "global_step": 60000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
748
  "eval_samples_per_second": 137.102,
749
  "eval_steps_per_second": 4.285,
750
  "step": 50000
751
+ },
752
+ {
753
+ "epoch": 4.199234990853151,
754
+ "grad_norm": 0.676832377910614,
755
+ "learning_rate": 8.165060037206157e-05,
756
+ "loss": 7.9251,
757
+ "step": 50500
758
+ },
759
+ {
760
+ "epoch": 4.240811574921005,
761
+ "grad_norm": 0.6083265542984009,
762
+ "learning_rate": 7.742262810755962e-05,
763
+ "loss": 7.9429,
764
+ "step": 51000
765
+ },
766
+ {
767
+ "epoch": 4.282388158988858,
768
+ "grad_norm": 8.456089973449707,
769
+ "learning_rate": 7.319465584305767e-05,
770
+ "loss": 7.9295,
771
+ "step": 51500
772
+ },
773
+ {
774
+ "epoch": 4.3239647430567105,
775
+ "grad_norm": 80.9195785522461,
776
+ "learning_rate": 6.896668357855572e-05,
777
+ "loss": 7.9159,
778
+ "step": 52000
779
+ },
780
+ {
781
+ "epoch": 4.365541327124563,
782
+ "grad_norm": 1.259993314743042,
783
+ "learning_rate": 6.473871131405378e-05,
784
+ "loss": 7.9322,
785
+ "step": 52500
786
+ },
787
+ {
788
+ "epoch": 4.407117911192416,
789
+ "grad_norm": 1.4960211515426636,
790
+ "learning_rate": 6.051073904955184e-05,
791
+ "loss": 7.9287,
792
+ "step": 53000
793
+ },
794
+ {
795
+ "epoch": 4.44869449526027,
796
+ "grad_norm": 0.4545043706893921,
797
+ "learning_rate": 5.6291222729578894e-05,
798
+ "loss": 7.9237,
799
+ "step": 53500
800
+ },
801
+ {
802
+ "epoch": 4.490271079328123,
803
+ "grad_norm": 2.8742592334747314,
804
+ "learning_rate": 5.207170640960596e-05,
805
+ "loss": 7.9185,
806
+ "step": 54000
807
+ },
808
+ {
809
+ "epoch": 4.5318476633959754,
810
+ "grad_norm": 2.539797306060791,
811
+ "learning_rate": 4.784373414510401e-05,
812
+ "loss": 7.9342,
813
+ "step": 54500
814
+ },
815
+ {
816
+ "epoch": 4.573424247463828,
817
+ "grad_norm": 1.3232216835021973,
818
+ "learning_rate": 4.3615761880602066e-05,
819
+ "loss": 7.9254,
820
+ "step": 55000
821
+ },
822
+ {
823
+ "epoch": 4.615000831531681,
824
+ "grad_norm": 3.313217878341675,
825
+ "learning_rate": 3.938778961610012e-05,
826
+ "loss": 7.9267,
827
+ "step": 55500
828
+ },
829
+ {
830
+ "epoch": 4.656577415599534,
831
+ "grad_norm": 1.0304898023605347,
832
+ "learning_rate": 3.5159817351598174e-05,
833
+ "loss": 7.9303,
834
+ "step": 56000
835
+ },
836
+ {
837
+ "epoch": 4.698153999667388,
838
+ "grad_norm": 10.963839530944824,
839
+ "learning_rate": 3.093184508709623e-05,
840
+ "loss": 7.9318,
841
+ "step": 56500
842
+ },
843
+ {
844
+ "epoch": 4.73973058373524,
845
+ "grad_norm": 4.864618301391602,
846
+ "learning_rate": 2.672078471165229e-05,
847
+ "loss": 7.9217,
848
+ "step": 57000
849
+ },
850
+ {
851
+ "epoch": 4.781307167803093,
852
+ "grad_norm": 0.6966050863265991,
853
+ "learning_rate": 2.2492812447150345e-05,
854
+ "loss": 7.9273,
855
+ "step": 57500
856
+ },
857
+ {
858
+ "epoch": 4.822883751870946,
859
+ "grad_norm": 1.1059428453445435,
860
+ "learning_rate": 1.8264840182648402e-05,
861
+ "loss": 7.9342,
862
+ "step": 58000
863
+ },
864
+ {
865
+ "epoch": 4.8644603359388,
866
+ "grad_norm": 1.2508047819137573,
867
+ "learning_rate": 1.4036867918146456e-05,
868
+ "loss": 7.9339,
869
+ "step": 58500
870
+ },
871
+ {
872
+ "epoch": 4.9060369200066525,
873
+ "grad_norm": 0.4586002826690674,
874
+ "learning_rate": 9.808895653644512e-06,
875
+ "loss": 7.9219,
876
+ "step": 59000
877
+ },
878
+ {
879
+ "epoch": 4.947613504074505,
880
+ "grad_norm": 1.337792158126831,
881
+ "learning_rate": 5.580923389142567e-06,
882
+ "loss": 7.9324,
883
+ "step": 59500
884
+ },
885
+ {
886
+ "epoch": 4.989190088142358,
887
+ "grad_norm": 2.416313409805298,
888
+ "learning_rate": 1.3529511246406224e-06,
889
+ "loss": 7.9342,
890
+ "step": 60000
891
+ },
892
+ {
893
+ "epoch": 4.989190088142358,
894
+ "eval_loss": 8.904105186462402,
895
+ "eval_runtime": 2799.8597,
896
+ "eval_samples_per_second": 137.444,
897
+ "eval_steps_per_second": 4.295,
898
+ "step": 60000
899
  }
900
  ],
901
  "logging_steps": 500,
 
915
  "attributes": {}
916
  }
917
  },
918
+ "total_flos": 6.726635198108539e+17,
919
  "train_batch_size": 32,
920
  "trial_name": null,
921
  "trial_params": null