ljcamargo commited on
Commit
b6914ee
·
verified ·
1 Parent(s): 9a0ea0a

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f18f2b21d1eb9893ef7d432745ca210cc86cd300d6d237450504c29478453fb
3
  size 3237818848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0cd03d3a9be3023a0e3720a6e91db11f47e20d8e7bec88e3c1220ca8a10eaa
3
  size 3237818848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffd49387501c08473c006cb3983fe8e3572862f34ccc79a00ee2957719d3508e
3
  size 2062251569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0523d436c3449c90448d00f0c9ea8840e7e341f44632cc2e10b78b0d80da3e7c
3
  size 2062251569
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a602fcddae5166b23f64a1263af24cb60ac56e25cf7aa91c125f6b46213120d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:191b991347426ecc0aa235378fd9d2fce0ab0d707a85beb25ac14245f68ee477
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b04ef7af3a89dd0eb8778c7ed7d28aeab310d9f53593d47cc2bdc9458a253ac
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c90e73b569a38f99c2197447433676c2eaa22ce221aeecf0a7d6e7d0501c17
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:370edc1f7812cd81a8eae6fcade42c3407f4dcaf97659f9602f84f2549a0a41c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8c49d54f38ea4c21892dfde13ddaac2daecfb954dcbad06d74b64fe3dec95fd
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5333333333333333,
6
  "eval_steps": 300,
7
- "global_step": 1200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -860,6 +860,216 @@
860
  "learning_rate": 9.302750961496888e-05,
861
  "loss": 1.0333,
862
  "step": 1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  }
864
  ],
865
  "logging_steps": 10,
@@ -879,7 +1089,7 @@
879
  "attributes": {}
880
  }
881
  },
882
- "total_flos": 3.2735062130688e+19,
883
  "train_batch_size": 4,
884
  "trial_name": null,
885
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6666666666666666,
6
  "eval_steps": 300,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
860
  "learning_rate": 9.302750961496888e-05,
861
  "loss": 1.0333,
862
  "step": 1200
863
+ },
864
+ {
865
+ "epoch": 0.5377777777777778,
866
+ "grad_norm": 10.483113288879395,
867
+ "learning_rate": 9.160761787396665e-05,
868
+ "loss": 0.9749,
869
+ "step": 1210
870
+ },
871
+ {
872
+ "epoch": 0.5422222222222223,
873
+ "grad_norm": 9.348003387451172,
874
+ "learning_rate": 9.018942816072545e-05,
875
+ "loss": 0.9837,
876
+ "step": 1220
877
+ },
878
+ {
879
+ "epoch": 0.5466666666666666,
880
+ "grad_norm": 9.57206916809082,
881
+ "learning_rate": 8.87732280930188e-05,
882
+ "loss": 1.0002,
883
+ "step": 1230
884
+ },
885
+ {
886
+ "epoch": 0.5511111111111111,
887
+ "grad_norm": 9.370091438293457,
888
+ "learning_rate": 8.735930488510774e-05,
889
+ "loss": 1.0049,
890
+ "step": 1240
891
+ },
892
+ {
893
+ "epoch": 0.5555555555555556,
894
+ "grad_norm": 9.066927909851074,
895
+ "learning_rate": 8.594794528949183e-05,
896
+ "loss": 0.9549,
897
+ "step": 1250
898
+ },
899
+ {
900
+ "epoch": 0.56,
901
+ "grad_norm": 13.580326080322266,
902
+ "learning_rate": 8.453943553875392e-05,
903
+ "loss": 1.0505,
904
+ "step": 1260
905
+ },
906
+ {
907
+ "epoch": 0.5644444444444444,
908
+ "grad_norm": 9.729880332946777,
909
+ "learning_rate": 8.313406128751049e-05,
910
+ "loss": 1.0413,
911
+ "step": 1270
912
+ },
913
+ {
914
+ "epoch": 0.5688888888888889,
915
+ "grad_norm": 10.354995727539062,
916
+ "learning_rate": 8.173210755447905e-05,
917
+ "loss": 1.033,
918
+ "step": 1280
919
+ },
920
+ {
921
+ "epoch": 0.5733333333333334,
922
+ "grad_norm": 10.784231185913086,
923
+ "learning_rate": 8.033385866467444e-05,
924
+ "loss": 1.0747,
925
+ "step": 1290
926
+ },
927
+ {
928
+ "epoch": 0.5777777777777777,
929
+ "grad_norm": 8.267210006713867,
930
+ "learning_rate": 7.893959819174619e-05,
931
+ "loss": 0.9777,
932
+ "step": 1300
933
+ },
934
+ {
935
+ "epoch": 0.5822222222222222,
936
+ "grad_norm": 8.181448936462402,
937
+ "learning_rate": 7.754960890046785e-05,
938
+ "loss": 0.9738,
939
+ "step": 1310
940
+ },
941
+ {
942
+ "epoch": 0.5866666666666667,
943
+ "grad_norm": 6.555637836456299,
944
+ "learning_rate": 7.616417268939037e-05,
945
+ "loss": 0.9659,
946
+ "step": 1320
947
+ },
948
+ {
949
+ "epoch": 0.5911111111111111,
950
+ "grad_norm": 8.430340766906738,
951
+ "learning_rate": 7.47835705336716e-05,
952
+ "loss": 0.999,
953
+ "step": 1330
954
+ },
955
+ {
956
+ "epoch": 0.5955555555555555,
957
+ "grad_norm": 7.698472023010254,
958
+ "learning_rate": 7.340808242809264e-05,
959
+ "loss": 0.9666,
960
+ "step": 1340
961
+ },
962
+ {
963
+ "epoch": 0.6,
964
+ "grad_norm": 6.320609092712402,
965
+ "learning_rate": 7.203798733027304e-05,
966
+ "loss": 0.9954,
967
+ "step": 1350
968
+ },
969
+ {
970
+ "epoch": 0.6044444444444445,
971
+ "grad_norm": 7.057352542877197,
972
+ "learning_rate": 7.067356310409659e-05,
973
+ "loss": 0.9971,
974
+ "step": 1360
975
+ },
976
+ {
977
+ "epoch": 0.6088888888888889,
978
+ "grad_norm": 10.81286334991455,
979
+ "learning_rate": 6.931508646335874e-05,
980
+ "loss": 0.9931,
981
+ "step": 1370
982
+ },
983
+ {
984
+ "epoch": 0.6133333333333333,
985
+ "grad_norm": 7.427656173706055,
986
+ "learning_rate": 6.796283291564722e-05,
987
+ "loss": 0.9491,
988
+ "step": 1380
989
+ },
990
+ {
991
+ "epoch": 0.6177777777777778,
992
+ "grad_norm": 7.356409072875977,
993
+ "learning_rate": 6.66170767064675e-05,
994
+ "loss": 1.0202,
995
+ "step": 1390
996
+ },
997
+ {
998
+ "epoch": 0.6222222222222222,
999
+ "grad_norm": 8.578875541687012,
1000
+ "learning_rate": 6.527809076362399e-05,
1001
+ "loss": 1.0542,
1002
+ "step": 1400
1003
+ },
1004
+ {
1005
+ "epoch": 0.6266666666666667,
1006
+ "grad_norm": 8.644619941711426,
1007
+ "learning_rate": 6.394614664186862e-05,
1008
+ "loss": 1.0267,
1009
+ "step": 1410
1010
+ },
1011
+ {
1012
+ "epoch": 0.6311111111111111,
1013
+ "grad_norm": 9.160662651062012,
1014
+ "learning_rate": 6.262151446782785e-05,
1015
+ "loss": 0.9914,
1016
+ "step": 1420
1017
+ },
1018
+ {
1019
+ "epoch": 0.6355555555555555,
1020
+ "grad_norm": 7.767285346984863,
1021
+ "learning_rate": 6.130446288521915e-05,
1022
+ "loss": 0.987,
1023
+ "step": 1430
1024
+ },
1025
+ {
1026
+ "epoch": 0.64,
1027
+ "grad_norm": 12.88818073272705,
1028
+ "learning_rate": 5.999525900036855e-05,
1029
+ "loss": 0.9676,
1030
+ "step": 1440
1031
+ },
1032
+ {
1033
+ "epoch": 0.6444444444444445,
1034
+ "grad_norm": 10.068846702575684,
1035
+ "learning_rate": 5.86941683280398e-05,
1036
+ "loss": 0.9942,
1037
+ "step": 1450
1038
+ },
1039
+ {
1040
+ "epoch": 0.6488888888888888,
1041
+ "grad_norm": 8.70479679107666,
1042
+ "learning_rate": 5.7401454737586055e-05,
1043
+ "loss": 0.9848,
1044
+ "step": 1460
1045
+ },
1046
+ {
1047
+ "epoch": 0.6533333333333333,
1048
+ "grad_norm": 10.635972023010254,
1049
+ "learning_rate": 5.6117380399435826e-05,
1050
+ "loss": 0.9892,
1051
+ "step": 1470
1052
+ },
1053
+ {
1054
+ "epoch": 0.6577777777777778,
1055
+ "grad_norm": 6.84842586517334,
1056
+ "learning_rate": 5.484220573192307e-05,
1057
+ "loss": 0.961,
1058
+ "step": 1480
1059
+ },
1060
+ {
1061
+ "epoch": 0.6622222222222223,
1062
+ "grad_norm": 6.793154716491699,
1063
+ "learning_rate": 5.3576189348472526e-05,
1064
+ "loss": 0.9772,
1065
+ "step": 1490
1066
+ },
1067
+ {
1068
+ "epoch": 0.6666666666666666,
1069
+ "grad_norm": 10.692822456359863,
1070
+ "learning_rate": 5.231958800515164e-05,
1071
+ "loss": 1.0044,
1072
+ "step": 1500
1073
  }
1074
  ],
1075
  "logging_steps": 10,
 
1089
  "attributes": {}
1090
  }
1091
  },
1092
+ "total_flos": 4.091882766336e+19,
1093
  "train_batch_size": 4,
1094
  "trial_name": null,
1095
  "trial_params": null