Kudod commited on
Commit
de720a1
·
verified ·
1 Parent(s): 35390fb

Training in progress, step 60000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:469e1f89eccfb155338fd5db39f32fd0c4eb9dd326d3a7d29a4d2672a07d4b21
3
  size 357393656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05a30dcc4066c74987493dbc2a267fc6d98cb8159f7cd132cf51d35e7905c50a
3
  size 357393656
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:131fac4f70e3c78c776224a1d93835a3169eb37f76ab0c797fa8b50cabbb2ff2
3
  size 714964666
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a41286438e9fd35f9bdf47cbadeb04807c4c2d4f0a9fb94e912dce25f586e5
3
  size 714964666
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9080fe56036d364eeed3a3e17bc011e2720636fbd1cd74b0b67060e1c6f9fd0b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c1934cdc6a6281fbbbb65f91e6a59588aa3aa21c101b0948ed814af372256f2
3
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b5a1f66ccb3d7e0c17977550a1bc585bff50e438a8704fb2fe2f661fb58928b
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b426d42a4e9592ccb13e91eff14490693255fb2d0c2b3d028db69ce729eb760f
3
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17767a352adb7f4edfdb7e9a788ffce018ee7072087581f483d4c5cf01d238db
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20ca3cb91477d18e43f14bf0f696487d6551646ca2aee7b0317f7c8df3b4406b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.157658406785298,
6
  "eval_steps": 10000,
7
- "global_step": 50000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -748,6 +748,154 @@
748
  "eval_samples_per_second": 133.107,
749
  "eval_steps_per_second": 4.16,
750
  "step": 50000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  }
752
  ],
753
  "logging_steps": 500,
@@ -767,7 +915,7 @@
767
  "attributes": {}
768
  }
769
  },
770
- "total_flos": 5.6054932837341696e+17,
771
  "train_batch_size": 32,
772
  "trial_name": null,
773
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.989190088142358,
6
  "eval_steps": 10000,
7
+ "global_step": 60000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
748
  "eval_samples_per_second": 133.107,
749
  "eval_steps_per_second": 4.16,
750
  "step": 50000
751
+ },
752
+ {
753
+ "epoch": 4.199234990853151,
754
+ "grad_norm": 0.21643032133579254,
755
+ "learning_rate": 8.159986470488753e-05,
756
+ "loss": 7.9383,
757
+ "step": 50500
758
+ },
759
+ {
760
+ "epoch": 4.240811574921005,
761
+ "grad_norm": 0.670906662940979,
762
+ "learning_rate": 7.737189244038559e-05,
763
+ "loss": 7.9348,
764
+ "step": 51000
765
+ },
766
+ {
767
+ "epoch": 4.282388158988858,
768
+ "grad_norm": 0.21291261911392212,
769
+ "learning_rate": 7.315237612041264e-05,
770
+ "loss": 7.9352,
771
+ "step": 51500
772
+ },
773
+ {
774
+ "epoch": 4.3239647430567105,
775
+ "grad_norm": 0.330913782119751,
776
+ "learning_rate": 6.89244038559107e-05,
777
+ "loss": 7.9266,
778
+ "step": 52000
779
+ },
780
+ {
781
+ "epoch": 4.365541327124563,
782
+ "grad_norm": 0.37971287965774536,
783
+ "learning_rate": 6.469643159140875e-05,
784
+ "loss": 7.9368,
785
+ "step": 52500
786
+ },
787
+ {
788
+ "epoch": 4.407117911192416,
789
+ "grad_norm": 0.46552345156669617,
790
+ "learning_rate": 6.0468459326906816e-05,
791
+ "loss": 7.9407,
792
+ "step": 53000
793
+ },
794
+ {
795
+ "epoch": 4.44869449526027,
796
+ "grad_norm": 0.32077756524086,
797
+ "learning_rate": 5.624048706240487e-05,
798
+ "loss": 7.9331,
799
+ "step": 53500
800
+ },
801
+ {
802
+ "epoch": 4.490271079328123,
803
+ "grad_norm": 0.5039647221565247,
804
+ "learning_rate": 5.201251479790293e-05,
805
+ "loss": 7.9327,
806
+ "step": 54000
807
+ },
808
+ {
809
+ "epoch": 4.5318476633959754,
810
+ "grad_norm": 0.7789810299873352,
811
+ "learning_rate": 4.779299847792998e-05,
812
+ "loss": 7.9297,
813
+ "step": 54500
814
+ },
815
+ {
816
+ "epoch": 4.573424247463828,
817
+ "grad_norm": 0.33232608437538147,
818
+ "learning_rate": 4.356502621342804e-05,
819
+ "loss": 7.9354,
820
+ "step": 55000
821
+ },
822
+ {
823
+ "epoch": 4.615000831531681,
824
+ "grad_norm": 0.9029014706611633,
825
+ "learning_rate": 3.9337053948926096e-05,
826
+ "loss": 7.941,
827
+ "step": 55500
828
+ },
829
+ {
830
+ "epoch": 4.656577415599534,
831
+ "grad_norm": 0.39622828364372253,
832
+ "learning_rate": 3.510908168442415e-05,
833
+ "loss": 7.9325,
834
+ "step": 56000
835
+ },
836
+ {
837
+ "epoch": 4.698153999667388,
838
+ "grad_norm": 0.19301092624664307,
839
+ "learning_rate": 3.0881109419922204e-05,
840
+ "loss": 7.9323,
841
+ "step": 56500
842
+ },
843
+ {
844
+ "epoch": 4.73973058373524,
845
+ "grad_norm": 0.8889003992080688,
846
+ "learning_rate": 2.6661593099949264e-05,
847
+ "loss": 7.9267,
848
+ "step": 57000
849
+ },
850
+ {
851
+ "epoch": 4.781307167803093,
852
+ "grad_norm": 0.31362804770469666,
853
+ "learning_rate": 2.243362083544732e-05,
854
+ "loss": 7.9302,
855
+ "step": 57500
856
+ },
857
+ {
858
+ "epoch": 4.822883751870946,
859
+ "grad_norm": 0.329970121383667,
860
+ "learning_rate": 1.8205648570945376e-05,
861
+ "loss": 7.933,
862
+ "step": 58000
863
+ },
864
+ {
865
+ "epoch": 4.8644603359388,
866
+ "grad_norm": 0.5594154596328735,
867
+ "learning_rate": 1.397767630644343e-05,
868
+ "loss": 7.9314,
869
+ "step": 58500
870
+ },
871
+ {
872
+ "epoch": 4.9060369200066525,
873
+ "grad_norm": 0.4558698236942291,
874
+ "learning_rate": 9.749704041941485e-06,
875
+ "loss": 7.927,
876
+ "step": 59000
877
+ },
878
+ {
879
+ "epoch": 4.947613504074505,
880
+ "grad_norm": 0.39327356219291687,
881
+ "learning_rate": 5.530187721968544e-06,
882
+ "loss": 7.9275,
883
+ "step": 59500
884
+ },
885
+ {
886
+ "epoch": 4.989190088142358,
887
+ "grad_norm": 0.227029949426651,
888
+ "learning_rate": 1.3022154574665992e-06,
889
+ "loss": 7.9352,
890
+ "step": 60000
891
+ },
892
+ {
893
+ "epoch": 4.989190088142358,
894
+ "eval_loss": 9.034906387329102,
895
+ "eval_runtime": 2885.2471,
896
+ "eval_samples_per_second": 133.376,
897
+ "eval_steps_per_second": 4.168,
898
+ "step": 60000
899
  }
900
  ],
901
  "logging_steps": 500,
 
915
  "attributes": {}
916
  }
917
  },
918
+ "total_flos": 6.726749069113344e+17,
919
  "train_batch_size": 32,
920
  "trial_name": null,
921
  "trial_params": null