Azrail commited on
Commit
1709054
·
verified ·
1 Parent(s): e8b23ec

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37296df7790e03d83312df3152295ea5675574fc24606e23051f92ba2a8785cd
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e4b4cd151eaa16ec86c3f95ae395991c7622f58c2e7d2e74c474e3b36e760
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddcb0501b4cfee967f8db5bb4fa8fb92b655ff610b064b96b43d66b2ba0fdac4
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37d9c729674e95fb1e29d44967d84df6c7f3c27e76670f3f7d480a455ded0987
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4c247dc7c4172df7c1d104b1da0eaec0df0b665cbc24707f3227675351f1df9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f55de071972b9763cbcf2a8de91813bfc16f9cfb1e09299e92ce7c238a6f40c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ad5c996b0875772675f1bc75e15a0dbeb09c5ba7146d169befa6908149e4159
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae5b4037609f8d0983c46601237411cdbc2481ef3d858df1d7dd4ab2f6d6072
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.10983021621450939,
6
  "eval_steps": 500,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -898,11 +898,189 @@
898
  "eval_steps_per_second": 19.021,
899
  "num_input_tokens_seen": 5242880000,
900
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901
  }
902
  ],
903
  "logging_steps": 50,
904
  "max_steps": 200000,
905
- "num_input_tokens_seen": 5242880000,
906
  "num_train_epochs": 5,
907
  "save_steps": 1000,
908
  "stateful_callbacks": {
@@ -917,7 +1095,7 @@
917
  "attributes": {}
918
  }
919
  },
920
- "total_flos": 2.98585857982464e+18,
921
  "train_batch_size": 64,
922
  "trial_name": null,
923
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.13179625945741127,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
898
  "eval_steps_per_second": 19.021,
899
  "num_input_tokens_seen": 5242880000,
900
  "step": 5000
901
+ },
902
+ {
903
+ "epoch": 0.11092851837665449,
904
+ "grad_norm": 0.15302155911922455,
905
+ "learning_rate": 0.001,
906
+ "loss": 3.0037,
907
+ "num_input_tokens_seen": 5295308800,
908
+ "step": 5050
909
+ },
910
+ {
911
+ "epoch": 0.11202682053879959,
912
+ "grad_norm": 0.1474563181400299,
913
+ "learning_rate": 0.001,
914
+ "loss": 3.0063,
915
+ "num_input_tokens_seen": 5347737600,
916
+ "step": 5100
917
+ },
918
+ {
919
+ "epoch": 0.11312512270094467,
920
+ "grad_norm": 0.14318443834781647,
921
+ "learning_rate": 0.001,
922
+ "loss": 3.0011,
923
+ "num_input_tokens_seen": 5400166400,
924
+ "step": 5150
925
+ },
926
+ {
927
+ "epoch": 0.11422342486308977,
928
+ "grad_norm": 0.1521013379096985,
929
+ "learning_rate": 0.001,
930
+ "loss": 2.9946,
931
+ "num_input_tokens_seen": 5452595200,
932
+ "step": 5200
933
+ },
934
+ {
935
+ "epoch": 0.11532172702523487,
936
+ "grad_norm": 0.14434175193309784,
937
+ "learning_rate": 0.001,
938
+ "loss": 2.9909,
939
+ "num_input_tokens_seen": 5505024000,
940
+ "step": 5250
941
+ },
942
+ {
943
+ "epoch": 0.11642002918737995,
944
+ "grad_norm": 0.16284991800785065,
945
+ "learning_rate": 0.001,
946
+ "loss": 2.9846,
947
+ "num_input_tokens_seen": 5557452800,
948
+ "step": 5300
949
+ },
950
+ {
951
+ "epoch": 0.11751833134952505,
952
+ "grad_norm": 0.15281164646148682,
953
+ "learning_rate": 0.001,
954
+ "loss": 2.9843,
955
+ "num_input_tokens_seen": 5609881600,
956
+ "step": 5350
957
+ },
958
+ {
959
+ "epoch": 0.11861663351167015,
960
+ "grad_norm": 0.1227719634771347,
961
+ "learning_rate": 0.001,
962
+ "loss": 2.9778,
963
+ "num_input_tokens_seen": 5662310400,
964
+ "step": 5400
965
+ },
966
+ {
967
+ "epoch": 0.11971493567381523,
968
+ "grad_norm": 0.1346055269241333,
969
+ "learning_rate": 0.001,
970
+ "loss": 2.9745,
971
+ "num_input_tokens_seen": 5714739200,
972
+ "step": 5450
973
+ },
974
+ {
975
+ "epoch": 0.12081323783596033,
976
+ "grad_norm": 0.15828204154968262,
977
+ "learning_rate": 0.001,
978
+ "loss": 2.9723,
979
+ "num_input_tokens_seen": 5767168000,
980
+ "step": 5500
981
+ },
982
+ {
983
+ "epoch": 0.12081323783596033,
984
+ "eval_loss": 2.8801000118255615,
985
+ "eval_runtime": 65.3935,
986
+ "eval_samples_per_second": 76.46,
987
+ "eval_steps_per_second": 19.115,
988
+ "num_input_tokens_seen": 5767168000,
989
+ "step": 5500
990
+ },
991
+ {
992
+ "epoch": 0.12191153999810543,
993
+ "grad_norm": 0.1391400694847107,
994
+ "learning_rate": 0.001,
995
+ "loss": 2.9609,
996
+ "num_input_tokens_seen": 5819596800,
997
+ "step": 5550
998
+ },
999
+ {
1000
+ "epoch": 0.12300984216025053,
1001
+ "grad_norm": 0.14347107708454132,
1002
+ "learning_rate": 0.001,
1003
+ "loss": 2.9697,
1004
+ "num_input_tokens_seen": 5872025600,
1005
+ "step": 5600
1006
+ },
1007
+ {
1008
+ "epoch": 0.12410814432239561,
1009
+ "grad_norm": 0.13779127597808838,
1010
+ "learning_rate": 0.001,
1011
+ "loss": 2.9609,
1012
+ "num_input_tokens_seen": 5924454400,
1013
+ "step": 5650
1014
+ },
1015
+ {
1016
+ "epoch": 0.1252064464845407,
1017
+ "grad_norm": 0.13017955422401428,
1018
+ "learning_rate": 0.001,
1019
+ "loss": 2.9545,
1020
+ "num_input_tokens_seen": 5976883200,
1021
+ "step": 5700
1022
+ },
1023
+ {
1024
+ "epoch": 0.1263047486466858,
1025
+ "grad_norm": 0.12697578966617584,
1026
+ "learning_rate": 0.001,
1027
+ "loss": 2.9563,
1028
+ "num_input_tokens_seen": 6029312000,
1029
+ "step": 5750
1030
+ },
1031
+ {
1032
+ "epoch": 0.1274030508088309,
1033
+ "grad_norm": 0.15175020694732666,
1034
+ "learning_rate": 0.001,
1035
+ "loss": 2.9502,
1036
+ "num_input_tokens_seen": 6081740800,
1037
+ "step": 5800
1038
+ },
1039
+ {
1040
+ "epoch": 0.12850135297097598,
1041
+ "grad_norm": 0.1209852397441864,
1042
+ "learning_rate": 0.001,
1043
+ "loss": 2.9516,
1044
+ "num_input_tokens_seen": 6134169600,
1045
+ "step": 5850
1046
+ },
1047
+ {
1048
+ "epoch": 0.12959965513312108,
1049
+ "grad_norm": 0.16521666944026947,
1050
+ "learning_rate": 0.001,
1051
+ "loss": 2.9528,
1052
+ "num_input_tokens_seen": 6186598400,
1053
+ "step": 5900
1054
+ },
1055
+ {
1056
+ "epoch": 0.13069795729526618,
1057
+ "grad_norm": 0.12271756678819656,
1058
+ "learning_rate": 0.001,
1059
+ "loss": 2.9382,
1060
+ "num_input_tokens_seen": 6239027200,
1061
+ "step": 5950
1062
+ },
1063
+ {
1064
+ "epoch": 0.13179625945741127,
1065
+ "grad_norm": 0.1376461535692215,
1066
+ "learning_rate": 0.001,
1067
+ "loss": 2.9464,
1068
+ "num_input_tokens_seen": 6291456000,
1069
+ "step": 6000
1070
+ },
1071
+ {
1072
+ "epoch": 0.13179625945741127,
1073
+ "eval_loss": 2.84769606590271,
1074
+ "eval_runtime": 65.8814,
1075
+ "eval_samples_per_second": 75.894,
1076
+ "eval_steps_per_second": 18.973,
1077
+ "num_input_tokens_seen": 6291456000,
1078
+ "step": 6000
1079
  }
1080
  ],
1081
  "logging_steps": 50,
1082
  "max_steps": 200000,
1083
+ "num_input_tokens_seen": 6291456000,
1084
  "num_train_epochs": 5,
1085
  "save_steps": 1000,
1086
  "stateful_callbacks": {
 
1095
  "attributes": {}
1096
  }
1097
  },
1098
+ "total_flos": 3.583030295789568e+18,
1099
  "train_batch_size": 64,
1100
  "trial_name": null,
1101
  "trial_params": null