Azrail commited on
Commit
12f5941
·
verified ·
1 Parent(s): 3193144

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:205a3991d60a5c28fb1c39f7dbf7a515c4fd4b6685d8240efe51c017acfa36b1
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcb3e56d4c71b4fe3907ac3f7a21f7c5b645b6f7b5077a46679eb62578db2183
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d50c4bfe654d987803fd6a6960587e83c15bda23187fdd2e49b310d524ae5ac
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cb3c4740043f09a91cb2957024d48735e7f0fa83989925c2f015f5c9071410b
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2fab474267dfdb6f9f735fba3b6956eaa8395da984c318144fab7c0aefa914f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73c97fed542b1263f810594e9084ec5dd9fdff08a7e12c9f17ae2b74518f1304
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1982f4530393fe0c872036d1fa81199d7b6bd002acf4619fae87ec9de696f64d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c4ce97b38b7fb778eb543562838e653bbb8adc096b47968a332aa5700d8c5ce
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9656441902922582,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -880,11 +880,229 @@
880
  "eval_steps_per_second": 21.156,
881
  "num_input_tokens_seen": 1932223680,
882
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
883
  }
884
  ],
885
  "logging_steps": 50,
886
  "max_steps": 16568,
887
- "num_input_tokens_seen": 1932223680,
888
  "num_train_epochs": 4,
889
  "save_steps": 1000,
890
  "stateful_callbacks": {
@@ -899,7 +1117,7 @@
899
  "attributes": {}
900
  }
901
  },
902
- "total_flos": 5.168886929031168e+17,
903
  "train_batch_size": 16,
904
  "trial_name": null,
905
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2068892677701164,
6
  "eval_steps": 500,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
880
  "eval_steps_per_second": 21.156,
881
  "num_input_tokens_seen": 1932223680,
882
  "step": 4000
883
+ },
884
+ {
885
+ "epoch": 0.9777147426709115,
886
+ "grad_norm": 0.298828125,
887
+ "learning_rate": 4.722347970424023e-05,
888
+ "loss": 2.1476,
889
+ "mean_token_accuracy": 0.5491000188142061,
890
+ "num_input_tokens_seen": 1956345120,
891
+ "num_tokens": 824824558.0,
892
+ "step": 4050
893
+ },
894
+ {
895
+ "epoch": 0.9897852950495647,
896
+ "grad_norm": 0.2890625,
897
+ "learning_rate": 4.703485740153916e-05,
898
+ "loss": 2.1336,
899
+ "mean_token_accuracy": 0.5505272497236728,
900
+ "num_input_tokens_seen": 1980539728,
901
+ "num_tokens": 835004823.0,
902
+ "step": 4100
903
+ },
904
+ {
905
+ "epoch": 1.0016898773330114,
906
+ "grad_norm": 0.2890625,
907
+ "learning_rate": 4.684623509883809e-05,
908
+ "loss": 2.1376,
909
+ "mean_token_accuracy": 0.5500758526367531,
910
+ "num_input_tokens_seen": 2004388912,
911
+ "num_tokens": 844972763.0,
912
+ "step": 4150
913
+ },
914
+ {
915
+ "epoch": 1.0137604297116647,
916
+ "grad_norm": 0.275390625,
917
+ "learning_rate": 4.665761279613702e-05,
918
+ "loss": 2.1349,
919
+ "mean_token_accuracy": 0.5500743924826383,
920
+ "num_input_tokens_seen": 2028622064,
921
+ "num_tokens": 855126584.0,
922
+ "step": 4200
923
+ },
924
+ {
925
+ "epoch": 1.025830982090318,
926
+ "grad_norm": 0.283203125,
927
+ "learning_rate": 4.646899049343595e-05,
928
+ "loss": 2.1248,
929
+ "mean_token_accuracy": 0.5514456473290921,
930
+ "num_input_tokens_seen": 2052718336,
931
+ "num_tokens": 865332386.0,
932
+ "step": 4250
933
+ },
934
+ {
935
+ "epoch": 1.037901534468971,
936
+ "grad_norm": 0.28125,
937
+ "learning_rate": 4.6280368190734876e-05,
938
+ "loss": 2.1088,
939
+ "mean_token_accuracy": 0.5532256289571523,
940
+ "num_input_tokens_seen": 2076571680,
941
+ "num_tokens": 875448332.0,
942
+ "step": 4300
943
+ },
944
+ {
945
+ "epoch": 1.0499720868476243,
946
+ "grad_norm": 0.326171875,
947
+ "learning_rate": 4.60917458880338e-05,
948
+ "loss": 2.1184,
949
+ "mean_token_accuracy": 0.5509732039645314,
950
+ "num_input_tokens_seen": 2100726912,
951
+ "num_tokens": 885623694.0,
952
+ "step": 4350
953
+ },
954
+ {
955
+ "epoch": 1.0620426392262776,
956
+ "grad_norm": 0.310546875,
957
+ "learning_rate": 4.590312358533273e-05,
958
+ "loss": 2.1324,
959
+ "mean_token_accuracy": 0.5498137963563203,
960
+ "num_input_tokens_seen": 2124980016,
961
+ "num_tokens": 895774422.0,
962
+ "step": 4400
963
+ },
964
+ {
965
+ "epoch": 1.074113191604931,
966
+ "grad_norm": 0.32421875,
967
+ "learning_rate": 4.571450128263166e-05,
968
+ "loss": 2.1195,
969
+ "mean_token_accuracy": 0.551388250514865,
970
+ "num_input_tokens_seen": 2149237504,
971
+ "num_tokens": 905968753.0,
972
+ "step": 4450
973
+ },
974
+ {
975
+ "epoch": 1.086183743983584,
976
+ "grad_norm": 0.296875,
977
+ "learning_rate": 4.552587897993059e-05,
978
+ "loss": 2.1195,
979
+ "num_input_tokens_seen": 2173337456,
980
+ "step": 4500
981
+ },
982
+ {
983
+ "epoch": 1.086183743983584,
984
+ "eval_loss": 1.989871859550476,
985
+ "eval_mean_token_accuracy": 0.5754866465826013,
986
+ "eval_num_tokens": 916079112.0,
987
+ "eval_runtime": 128.4454,
988
+ "eval_samples_per_second": 83.397,
989
+ "eval_steps_per_second": 20.849,
990
+ "num_input_tokens_seen": 2173337456,
991
+ "step": 4500
992
+ },
993
+ {
994
+ "epoch": 1.0982542963622373,
995
+ "grad_norm": 0.287109375,
996
+ "learning_rate": 4.5337256677229516e-05,
997
+ "loss": 2.1218,
998
+ "mean_token_accuracy": 0.5514280049689114,
999
+ "num_input_tokens_seen": 2197505712,
1000
+ "num_tokens": 926254107.0,
1001
+ "step": 4550
1002
+ },
1003
+ {
1004
+ "epoch": 1.1103248487408905,
1005
+ "grad_norm": 0.291015625,
1006
+ "learning_rate": 4.514863437452845e-05,
1007
+ "loss": 2.1132,
1008
+ "mean_token_accuracy": 0.5515907733514905,
1009
+ "num_input_tokens_seen": 2221716688,
1010
+ "num_tokens": 936449430.0,
1011
+ "step": 4600
1012
+ },
1013
+ {
1014
+ "epoch": 1.1223954011195438,
1015
+ "grad_norm": 0.296875,
1016
+ "learning_rate": 4.4960012071827373e-05,
1017
+ "loss": 2.1142,
1018
+ "mean_token_accuracy": 0.5520639397203922,
1019
+ "num_input_tokens_seen": 2245565536,
1020
+ "num_tokens": 946528658.0,
1021
+ "step": 4650
1022
+ },
1023
+ {
1024
+ "epoch": 1.134465953498197,
1025
+ "grad_norm": 0.2734375,
1026
+ "learning_rate": 4.4771389769126305e-05,
1027
+ "loss": 2.1275,
1028
+ "mean_token_accuracy": 0.5497148666903376,
1029
+ "num_input_tokens_seen": 2269696864,
1030
+ "num_tokens": 956594209.0,
1031
+ "step": 4700
1032
+ },
1033
+ {
1034
+ "epoch": 1.1465365058768502,
1035
+ "grad_norm": 0.279296875,
1036
+ "learning_rate": 4.458276746642524e-05,
1037
+ "loss": 2.1065,
1038
+ "mean_token_accuracy": 0.5532364987954498,
1039
+ "num_input_tokens_seen": 2293845360,
1040
+ "num_tokens": 966701814.0,
1041
+ "step": 4750
1042
+ },
1043
+ {
1044
+ "epoch": 1.1586070582555035,
1045
+ "grad_norm": 0.259765625,
1046
+ "learning_rate": 4.439414516372416e-05,
1047
+ "loss": 2.1133,
1048
+ "mean_token_accuracy": 0.5517958915606141,
1049
+ "num_input_tokens_seen": 2318062016,
1050
+ "num_tokens": 976956133.0,
1051
+ "step": 4800
1052
+ },
1053
+ {
1054
+ "epoch": 1.1706776106341565,
1055
+ "grad_norm": 0.314453125,
1056
+ "learning_rate": 4.420552286102309e-05,
1057
+ "loss": 2.1083,
1058
+ "mean_token_accuracy": 0.5527382261306047,
1059
+ "num_input_tokens_seen": 2342152464,
1060
+ "num_tokens": 987113621.0,
1061
+ "step": 4850
1062
+ },
1063
+ {
1064
+ "epoch": 1.1827481630128098,
1065
+ "grad_norm": 0.26953125,
1066
+ "learning_rate": 4.401690055832201e-05,
1067
+ "loss": 2.1084,
1068
+ "mean_token_accuracy": 0.5531642048805953,
1069
+ "num_input_tokens_seen": 2366342016,
1070
+ "num_tokens": 997304128.0,
1071
+ "step": 4900
1072
+ },
1073
+ {
1074
+ "epoch": 1.1948187153914631,
1075
+ "grad_norm": 0.263671875,
1076
+ "learning_rate": 4.3828278255620945e-05,
1077
+ "loss": 2.1129,
1078
+ "mean_token_accuracy": 0.5526808862015605,
1079
+ "num_input_tokens_seen": 2390580560,
1080
+ "num_tokens": 1007600120.0,
1081
+ "step": 4950
1082
+ },
1083
+ {
1084
+ "epoch": 1.2068892677701164,
1085
+ "grad_norm": 0.271484375,
1086
+ "learning_rate": 4.363965595291988e-05,
1087
+ "loss": 2.1136,
1088
+ "num_input_tokens_seen": 2414871648,
1089
+ "step": 5000
1090
+ },
1091
+ {
1092
+ "epoch": 1.2068892677701164,
1093
+ "eval_loss": 1.9823503494262695,
1094
+ "eval_mean_token_accuracy": 0.5763351263685668,
1095
+ "eval_num_tokens": 1017920689.0,
1096
+ "eval_runtime": 131.1681,
1097
+ "eval_samples_per_second": 81.666,
1098
+ "eval_steps_per_second": 20.417,
1099
+ "num_input_tokens_seen": 2414871648,
1100
+ "step": 5000
1101
  }
1102
  ],
1103
  "logging_steps": 50,
1104
  "max_steps": 16568,
1105
+ "num_input_tokens_seen": 2414871648,
1106
  "num_train_epochs": 4,
1107
  "save_steps": 1000,
1108
  "stateful_callbacks": {
 
1117
  "attributes": {}
1118
  }
1119
  },
1120
+ "total_flos": 6.460017349872845e+17,
1121
  "train_batch_size": 16,
1122
  "trial_name": null,
1123
  "trial_params": null