AlanFeder commited on
Commit
3f07d6d
·
verified ·
1 Parent(s): 702838d

Training in progress, step 138, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:faaccbff72b5ac06b6a0df5c9080f2abfc0357d5ced0d4f8cd41a0132fbc01c9
3
  size 2436951232
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f719a64470755771748becc2d79750c1dfc3c56ede76336615f2287e3183de55
3
  size 2436951232
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:623d23ee9bd96430d20c02db1931c340263ee334ae7e77686bf643c2ef657f8a
3
  size 2274077596
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b98a96d8f832a2978139f64d417323559b2351dd714affdd97f8810a53a1c30
3
  size 2274077596
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc504633953a1bc6ad5282190e1dfa0d9a6e9c0298769de5bbe61202fcee389d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:734e89105df261b24dbde0e06f6215c07754ac7f546e434ffc90b4c4aeac7c05
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:194d2ac2c6cfc971682599aa0b4aa84395dc81930a4609447db40281d4881264
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21d7f598026e67cff206e8b45c13b9fe02c682beb205dd1fd163b157e36c8d6c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.855614973262032,
5
  "eval_steps": 12,
6
- "global_step": 92,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -715,6 +715,360 @@
715
  "learning_rate": 0.00010901530811120655,
716
  "loss": 0.283,
717
  "step": 92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 1,
@@ -734,7 +1088,7 @@
734
  "attributes": {}
735
  }
736
  },
737
- "total_flos": 9173011097714688.0,
738
  "train_batch_size": 2,
739
  "trial_name": null,
740
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.732620320855615,
5
  "eval_steps": 12,
6
+ "global_step": 138,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
715
  "learning_rate": 0.00010901530811120655,
716
  "loss": 0.283,
717
  "step": 92
718
+ },
719
+ {
720
+ "epoch": 1.8770053475935828,
721
+ "grad_norm": 4.964606285095215,
722
+ "learning_rate": 0.00010721577558006164,
723
+ "loss": 0.2673,
724
+ "step": 93
725
+ },
726
+ {
727
+ "epoch": 1.8983957219251337,
728
+ "grad_norm": 8.798148155212402,
729
+ "learning_rate": 0.00010541389085854176,
730
+ "loss": 0.2055,
731
+ "step": 94
732
+ },
733
+ {
734
+ "epoch": 1.9197860962566846,
735
+ "grad_norm": 5.59722375869751,
736
+ "learning_rate": 0.00010361024132299364,
737
+ "loss": 0.209,
738
+ "step": 95
739
+ },
740
+ {
741
+ "epoch": 1.9411764705882353,
742
+ "grad_norm": 8.256745338439941,
743
+ "learning_rate": 0.00010180541492505604,
744
+ "loss": 0.1079,
745
+ "step": 96
746
+ },
747
+ {
748
+ "epoch": 1.9411764705882353,
749
+ "eval_loss": 0.7684900760650635,
750
+ "eval_runtime": 17.746,
751
+ "eval_samples_per_second": 16.905,
752
+ "eval_steps_per_second": 8.453,
753
+ "step": 96
754
+ },
755
+ {
756
+ "epoch": 1.962566844919786,
757
+ "grad_norm": 4.481995582580566,
758
+ "learning_rate": 0.0001,
759
+ "loss": 0.1824,
760
+ "step": 97
761
+ },
762
+ {
763
+ "epoch": 1.9839572192513368,
764
+ "grad_norm": 3.852792263031006,
765
+ "learning_rate": 9.819458507494394e-05,
766
+ "loss": 0.124,
767
+ "step": 98
768
+ },
769
+ {
770
+ "epoch": 2.0053475935828877,
771
+ "grad_norm": 29.360998153686523,
772
+ "learning_rate": 9.638975867700638e-05,
773
+ "loss": 0.1844,
774
+ "step": 99
775
+ },
776
+ {
777
+ "epoch": 2.0267379679144386,
778
+ "grad_norm": 4.80811071395874,
779
+ "learning_rate": 9.458610914145826e-05,
780
+ "loss": 0.1347,
781
+ "step": 100
782
+ },
783
+ {
784
+ "epoch": 2.048128342245989,
785
+ "grad_norm": 17.53403091430664,
786
+ "learning_rate": 9.27842244199384e-05,
787
+ "loss": 0.3378,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 2.06951871657754,
792
+ "grad_norm": 7.34214973449707,
793
+ "learning_rate": 9.098469188879349e-05,
794
+ "loss": 0.2045,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 2.090909090909091,
799
+ "grad_norm": 4.968944072723389,
800
+ "learning_rate": 8.918809815760585e-05,
801
+ "loss": 0.2284,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 2.0053475935828877,
806
+ "grad_norm": 4.202042579650879,
807
+ "learning_rate": 8.739502887797107e-05,
808
+ "loss": 0.1655,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 2.0267379679144386,
813
+ "grad_norm": 3.561790704727173,
814
+ "learning_rate": 8.560606855258808e-05,
815
+ "loss": 0.0442,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 2.0481283422459895,
820
+ "grad_norm": 3.8292624950408936,
821
+ "learning_rate": 8.382180034472353e-05,
822
+ "loss": 0.0821,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 2.06951871657754,
827
+ "grad_norm": 2.1650640964508057,
828
+ "learning_rate": 8.204280588811283e-05,
829
+ "loss": 0.0384,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 2.090909090909091,
834
+ "grad_norm": 1.6922334432601929,
835
+ "learning_rate": 8.026966509736001e-05,
836
+ "loss": 0.0342,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 2.090909090909091,
841
+ "eval_loss": 0.7716657519340515,
842
+ "eval_runtime": 17.9156,
843
+ "eval_samples_per_second": 16.745,
844
+ "eval_steps_per_second": 8.373,
845
+ "step": 108
846
+ },
847
+ {
848
+ "epoch": 2.1122994652406417,
849
+ "grad_norm": 1.0654356479644775,
850
+ "learning_rate": 7.85029559788976e-05,
851
+ "loss": 0.0184,
852
+ "step": 109
853
+ },
854
+ {
855
+ "epoch": 2.1336898395721926,
856
+ "grad_norm": 3.1057019233703613,
857
+ "learning_rate": 7.674325444256899e-05,
858
+ "loss": 0.0417,
859
+ "step": 110
860
+ },
861
+ {
862
+ "epoch": 2.1550802139037435,
863
+ "grad_norm": 0.19042205810546875,
864
+ "learning_rate": 7.499113411389371e-05,
865
+ "loss": 0.0026,
866
+ "step": 111
867
+ },
868
+ {
869
+ "epoch": 2.176470588235294,
870
+ "grad_norm": 1.5116851329803467,
871
+ "learning_rate": 7.324716614707793e-05,
872
+ "loss": 0.0089,
873
+ "step": 112
874
+ },
875
+ {
876
+ "epoch": 2.197860962566845,
877
+ "grad_norm": 2.5151679515838623,
878
+ "learning_rate": 7.151191903883001e-05,
879
+ "loss": 0.0357,
880
+ "step": 113
881
+ },
882
+ {
883
+ "epoch": 2.2192513368983957,
884
+ "grad_norm": 2.838503837585449,
885
+ "learning_rate": 6.978595844304271e-05,
886
+ "loss": 0.0366,
887
+ "step": 114
888
+ },
889
+ {
890
+ "epoch": 2.2406417112299466,
891
+ "grad_norm": 3.835000514984131,
892
+ "learning_rate": 6.806984698640202e-05,
893
+ "loss": 0.1412,
894
+ "step": 115
895
+ },
896
+ {
897
+ "epoch": 2.2620320855614975,
898
+ "grad_norm": 3.4443538188934326,
899
+ "learning_rate": 6.636414408498249e-05,
900
+ "loss": 0.0707,
901
+ "step": 116
902
+ },
903
+ {
904
+ "epoch": 2.283422459893048,
905
+ "grad_norm": 2.701524496078491,
906
+ "learning_rate": 6.466940576188977e-05,
907
+ "loss": 0.0497,
908
+ "step": 117
909
+ },
910
+ {
911
+ "epoch": 2.304812834224599,
912
+ "grad_norm": 2.612593412399292,
913
+ "learning_rate": 6.298618446600856e-05,
914
+ "loss": 0.052,
915
+ "step": 118
916
+ },
917
+ {
918
+ "epoch": 2.3262032085561497,
919
+ "grad_norm": 4.986962795257568,
920
+ "learning_rate": 6.13150288919161e-05,
921
+ "loss": 0.1255,
922
+ "step": 119
923
+ },
924
+ {
925
+ "epoch": 2.3475935828877006,
926
+ "grad_norm": 1.8598374128341675,
927
+ "learning_rate": 5.965648380101916e-05,
928
+ "loss": 0.0309,
929
+ "step": 120
930
+ },
931
+ {
932
+ "epoch": 2.3475935828877006,
933
+ "eval_loss": 0.785007119178772,
934
+ "eval_runtime": 17.7923,
935
+ "eval_samples_per_second": 16.861,
936
+ "eval_steps_per_second": 8.431,
937
+ "step": 120
938
+ },
939
+ {
940
+ "epoch": 2.3689839572192515,
941
+ "grad_norm": 1.5813214778900146,
942
+ "learning_rate": 5.801108984397354e-05,
943
+ "loss": 0.0201,
944
+ "step": 121
945
+ },
946
+ {
947
+ "epoch": 2.3903743315508024,
948
+ "grad_norm": 0.13843385875225067,
949
+ "learning_rate": 5.6379383384443255e-05,
950
+ "loss": 0.0018,
951
+ "step": 122
952
+ },
953
+ {
954
+ "epoch": 2.411764705882353,
955
+ "grad_norm": 4.4155707359313965,
956
+ "learning_rate": 5.476189632425732e-05,
957
+ "loss": 0.0326,
958
+ "step": 123
959
+ },
960
+ {
961
+ "epoch": 2.4331550802139037,
962
+ "grad_norm": 3.5101325511932373,
963
+ "learning_rate": 5.3159155930021e-05,
964
+ "loss": 0.0259,
965
+ "step": 124
966
+ },
967
+ {
968
+ "epoch": 2.4545454545454546,
969
+ "grad_norm": 5.201532363891602,
970
+ "learning_rate": 5.1571684661238075e-05,
971
+ "loss": 0.0761,
972
+ "step": 125
973
+ },
974
+ {
975
+ "epoch": 2.4759358288770055,
976
+ "grad_norm": 2.48543119430542,
977
+ "learning_rate": 5.000000000000002e-05,
978
+ "loss": 0.0587,
979
+ "step": 126
980
+ },
981
+ {
982
+ "epoch": 2.497326203208556,
983
+ "grad_norm": 7.39755916595459,
984
+ "learning_rate": 4.844461428229782e-05,
985
+ "loss": 0.0391,
986
+ "step": 127
987
+ },
988
+ {
989
+ "epoch": 2.518716577540107,
990
+ "grad_norm": 4.151485443115234,
991
+ "learning_rate": 4.6906034531011346e-05,
992
+ "loss": 0.0982,
993
+ "step": 128
994
+ },
995
+ {
996
+ "epoch": 2.5401069518716577,
997
+ "grad_norm": 4.144845485687256,
998
+ "learning_rate": 4.53847622906303e-05,
999
+ "loss": 0.0707,
1000
+ "step": 129
1001
+ },
1002
+ {
1003
+ "epoch": 2.5614973262032086,
1004
+ "grad_norm": 7.3682732582092285,
1005
+ "learning_rate": 4.388129346376178e-05,
1006
+ "loss": 0.0455,
1007
+ "step": 130
1008
+ },
1009
+ {
1010
+ "epoch": 2.5828877005347595,
1011
+ "grad_norm": 4.947929382324219,
1012
+ "learning_rate": 4.239611814947605e-05,
1013
+ "loss": 0.033,
1014
+ "step": 131
1015
+ },
1016
+ {
1017
+ "epoch": 2.6042780748663104,
1018
+ "grad_norm": 3.0208606719970703,
1019
+ "learning_rate": 4.092972048354491e-05,
1020
+ "loss": 0.0373,
1021
+ "step": 132
1022
+ },
1023
+ {
1024
+ "epoch": 2.6042780748663104,
1025
+ "eval_loss": 0.776565432548523,
1026
+ "eval_runtime": 17.3019,
1027
+ "eval_samples_per_second": 17.339,
1028
+ "eval_steps_per_second": 8.67,
1029
+ "step": 132
1030
+ },
1031
+ {
1032
+ "epoch": 2.625668449197861,
1033
+ "grad_norm": 7.514610290527344,
1034
+ "learning_rate": 3.948257848062351e-05,
1035
+ "loss": 0.0323,
1036
+ "step": 133
1037
+ },
1038
+ {
1039
+ "epoch": 2.6470588235294117,
1040
+ "grad_norm": 1.8352607488632202,
1041
+ "learning_rate": 3.80551638784277e-05,
1042
+ "loss": 0.043,
1043
+ "step": 134
1044
+ },
1045
+ {
1046
+ "epoch": 2.6684491978609626,
1047
+ "grad_norm": 3.525506019592285,
1048
+ "learning_rate": 3.664794198395764e-05,
1049
+ "loss": 0.0643,
1050
+ "step": 135
1051
+ },
1052
+ {
1053
+ "epoch": 2.6898395721925135,
1054
+ "grad_norm": 5.074891567230225,
1055
+ "learning_rate": 3.5261371521817244e-05,
1056
+ "loss": 0.0658,
1057
+ "step": 136
1058
+ },
1059
+ {
1060
+ "epoch": 2.711229946524064,
1061
+ "grad_norm": 3.6220922470092773,
1062
+ "learning_rate": 3.3895904484679984e-05,
1063
+ "loss": 0.1535,
1064
+ "step": 137
1065
+ },
1066
+ {
1067
+ "epoch": 2.732620320855615,
1068
+ "grad_norm": 3.9044840335845947,
1069
+ "learning_rate": 3.2551985985948616e-05,
1070
+ "loss": 0.0572,
1071
+ "step": 138
1072
  }
1073
  ],
1074
  "logging_steps": 1,
 
1088
  "attributes": {}
1089
  }
1090
  },
1091
+ "total_flos": 1.3759516646572032e+16,
1092
  "train_batch_size": 2,
1093
  "trial_name": null,
1094
  "trial_params": null