Training in progress, step 46000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa89571eb3340eba1a67ab65cc95a52de52c688ab135a582ba9671de6b4b9b2b
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee8593e17fbb590b6be9983a2252f2eb629b591782e538eabf2da48b5e3443f7
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fdfef9e83b1fd0865026b3e547285feb0ce1b439ee58282cde4fbaa3e21a682
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c85b77405559b6f9d3b974ee441baee89ea00505d86e9a6015f23da9cbeb2cb5
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8018,11 +8018,189 @@
|
|
| 8018 |
"eval_steps_per_second": 24.43,
|
| 8019 |
"num_input_tokens_seen": 11796475456,
|
| 8020 |
"step": 45000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8021 |
}
|
| 8022 |
],
|
| 8023 |
"logging_steps": 50,
|
| 8024 |
"max_steps": 70000,
|
| 8025 |
-
"num_input_tokens_seen":
|
| 8026 |
"num_train_epochs": 1,
|
| 8027 |
"save_steps": 1000,
|
| 8028 |
"stateful_callbacks": {
|
|
@@ -8037,7 +8215,7 @@
|
|
| 8037 |
"attributes": {}
|
| 8038 |
}
|
| 8039 |
},
|
| 8040 |
-
"total_flos": 3.
|
| 8041 |
"train_batch_size": 64,
|
| 8042 |
"trial_name": null,
|
| 8043 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.21942115744660554,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 46000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8018 |
"eval_steps_per_second": 24.43,
|
| 8019 |
"num_input_tokens_seen": 11796475456,
|
| 8020 |
"step": 45000
|
| 8021 |
+
},
|
| 8022 |
+
{
|
| 8023 |
+
"epoch": 0.21488963354281693,
|
| 8024 |
+
"grad_norm": 0.226406991481781,
|
| 8025 |
+
"learning_rate": 0.001,
|
| 8026 |
+
"loss": 2.626,
|
| 8027 |
+
"num_input_tokens_seen": 11809582656,
|
| 8028 |
+
"step": 45050
|
| 8029 |
+
},
|
| 8030 |
+
{
|
| 8031 |
+
"epoch": 0.21512813480091109,
|
| 8032 |
+
"grad_norm": 0.20505741238594055,
|
| 8033 |
+
"learning_rate": 0.001,
|
| 8034 |
+
"loss": 2.6095,
|
| 8035 |
+
"num_input_tokens_seen": 11822689856,
|
| 8036 |
+
"step": 45100
|
| 8037 |
+
},
|
| 8038 |
+
{
|
| 8039 |
+
"epoch": 0.21536663605900522,
|
| 8040 |
+
"grad_norm": 0.2917146682739258,
|
| 8041 |
+
"learning_rate": 0.001,
|
| 8042 |
+
"loss": 2.6439,
|
| 8043 |
+
"num_input_tokens_seen": 11835797056,
|
| 8044 |
+
"step": 45150
|
| 8045 |
+
},
|
| 8046 |
+
{
|
| 8047 |
+
"epoch": 0.21560513731709935,
|
| 8048 |
+
"grad_norm": 0.24030283093452454,
|
| 8049 |
+
"learning_rate": 0.001,
|
| 8050 |
+
"loss": 2.6386,
|
| 8051 |
+
"num_input_tokens_seen": 11848904256,
|
| 8052 |
+
"step": 45200
|
| 8053 |
+
},
|
| 8054 |
+
{
|
| 8055 |
+
"epoch": 0.21584363857519348,
|
| 8056 |
+
"grad_norm": 0.1799454241991043,
|
| 8057 |
+
"learning_rate": 0.001,
|
| 8058 |
+
"loss": 2.6344,
|
| 8059 |
+
"num_input_tokens_seen": 11862011456,
|
| 8060 |
+
"step": 45250
|
| 8061 |
+
},
|
| 8062 |
+
{
|
| 8063 |
+
"epoch": 0.2160821398332876,
|
| 8064 |
+
"grad_norm": 0.2093718945980072,
|
| 8065 |
+
"learning_rate": 0.001,
|
| 8066 |
+
"loss": 2.6152,
|
| 8067 |
+
"num_input_tokens_seen": 11875118656,
|
| 8068 |
+
"step": 45300
|
| 8069 |
+
},
|
| 8070 |
+
{
|
| 8071 |
+
"epoch": 0.21632064109138174,
|
| 8072 |
+
"grad_norm": 0.19477079808712006,
|
| 8073 |
+
"learning_rate": 0.001,
|
| 8074 |
+
"loss": 2.622,
|
| 8075 |
+
"num_input_tokens_seen": 11888225856,
|
| 8076 |
+
"step": 45350
|
| 8077 |
+
},
|
| 8078 |
+
{
|
| 8079 |
+
"epoch": 0.2165591423494759,
|
| 8080 |
+
"grad_norm": 0.2764741778373718,
|
| 8081 |
+
"learning_rate": 0.001,
|
| 8082 |
+
"loss": 2.5951,
|
| 8083 |
+
"num_input_tokens_seen": 11901333056,
|
| 8084 |
+
"step": 45400
|
| 8085 |
+
},
|
| 8086 |
+
{
|
| 8087 |
+
"epoch": 0.21679764360757003,
|
| 8088 |
+
"grad_norm": 0.2127208709716797,
|
| 8089 |
+
"learning_rate": 0.001,
|
| 8090 |
+
"loss": 2.6231,
|
| 8091 |
+
"num_input_tokens_seen": 11914440256,
|
| 8092 |
+
"step": 45450
|
| 8093 |
+
},
|
| 8094 |
+
{
|
| 8095 |
+
"epoch": 0.21703614486566417,
|
| 8096 |
+
"grad_norm": 0.21089383959770203,
|
| 8097 |
+
"learning_rate": 0.001,
|
| 8098 |
+
"loss": 2.6099,
|
| 8099 |
+
"num_input_tokens_seen": 11927547456,
|
| 8100 |
+
"step": 45500
|
| 8101 |
+
},
|
| 8102 |
+
{
|
| 8103 |
+
"epoch": 0.21703614486566417,
|
| 8104 |
+
"eval_loss": 2.502464771270752,
|
| 8105 |
+
"eval_runtime": 50.946,
|
| 8106 |
+
"eval_samples_per_second": 98.143,
|
| 8107 |
+
"eval_steps_per_second": 24.536,
|
| 8108 |
+
"num_input_tokens_seen": 11927547456,
|
| 8109 |
+
"step": 45500
|
| 8110 |
+
},
|
| 8111 |
+
{
|
| 8112 |
+
"epoch": 0.2172746461237583,
|
| 8113 |
+
"grad_norm": 0.19550016522407532,
|
| 8114 |
+
"learning_rate": 0.001,
|
| 8115 |
+
"loss": 2.6365,
|
| 8116 |
+
"num_input_tokens_seen": 11940654656,
|
| 8117 |
+
"step": 45550
|
| 8118 |
+
},
|
| 8119 |
+
{
|
| 8120 |
+
"epoch": 0.21751314738185243,
|
| 8121 |
+
"grad_norm": 0.18284358084201813,
|
| 8122 |
+
"learning_rate": 0.001,
|
| 8123 |
+
"loss": 2.6358,
|
| 8124 |
+
"num_input_tokens_seen": 11953761856,
|
| 8125 |
+
"step": 45600
|
| 8126 |
+
},
|
| 8127 |
+
{
|
| 8128 |
+
"epoch": 0.2177516486399466,
|
| 8129 |
+
"grad_norm": 0.21821847558021545,
|
| 8130 |
+
"learning_rate": 0.001,
|
| 8131 |
+
"loss": 2.607,
|
| 8132 |
+
"num_input_tokens_seen": 11966869056,
|
| 8133 |
+
"step": 45650
|
| 8134 |
+
},
|
| 8135 |
+
{
|
| 8136 |
+
"epoch": 0.21799014989804072,
|
| 8137 |
+
"grad_norm": 0.2195073515176773,
|
| 8138 |
+
"learning_rate": 0.001,
|
| 8139 |
+
"loss": 2.6195,
|
| 8140 |
+
"num_input_tokens_seen": 11979976256,
|
| 8141 |
+
"step": 45700
|
| 8142 |
+
},
|
| 8143 |
+
{
|
| 8144 |
+
"epoch": 0.21822865115613485,
|
| 8145 |
+
"grad_norm": 0.19679750502109528,
|
| 8146 |
+
"learning_rate": 0.001,
|
| 8147 |
+
"loss": 2.6259,
|
| 8148 |
+
"num_input_tokens_seen": 11993083456,
|
| 8149 |
+
"step": 45750
|
| 8150 |
+
},
|
| 8151 |
+
{
|
| 8152 |
+
"epoch": 0.21846715241422898,
|
| 8153 |
+
"grad_norm": 0.1985604166984558,
|
| 8154 |
+
"learning_rate": 0.001,
|
| 8155 |
+
"loss": 2.6224,
|
| 8156 |
+
"num_input_tokens_seen": 12006190656,
|
| 8157 |
+
"step": 45800
|
| 8158 |
+
},
|
| 8159 |
+
{
|
| 8160 |
+
"epoch": 0.2187056536723231,
|
| 8161 |
+
"grad_norm": 0.18398787081241608,
|
| 8162 |
+
"learning_rate": 0.001,
|
| 8163 |
+
"loss": 2.6215,
|
| 8164 |
+
"num_input_tokens_seen": 12019297856,
|
| 8165 |
+
"step": 45850
|
| 8166 |
+
},
|
| 8167 |
+
{
|
| 8168 |
+
"epoch": 0.21894415493041725,
|
| 8169 |
+
"grad_norm": 0.2306145578622818,
|
| 8170 |
+
"learning_rate": 0.001,
|
| 8171 |
+
"loss": 2.6346,
|
| 8172 |
+
"num_input_tokens_seen": 12032405056,
|
| 8173 |
+
"step": 45900
|
| 8174 |
+
},
|
| 8175 |
+
{
|
| 8176 |
+
"epoch": 0.2191826561885114,
|
| 8177 |
+
"grad_norm": 0.21335257589817047,
|
| 8178 |
+
"learning_rate": 0.001,
|
| 8179 |
+
"loss": 2.6232,
|
| 8180 |
+
"num_input_tokens_seen": 12045512256,
|
| 8181 |
+
"step": 45950
|
| 8182 |
+
},
|
| 8183 |
+
{
|
| 8184 |
+
"epoch": 0.21942115744660554,
|
| 8185 |
+
"grad_norm": 0.22988814115524292,
|
| 8186 |
+
"learning_rate": 0.001,
|
| 8187 |
+
"loss": 2.6132,
|
| 8188 |
+
"num_input_tokens_seen": 12058619456,
|
| 8189 |
+
"step": 46000
|
| 8190 |
+
},
|
| 8191 |
+
{
|
| 8192 |
+
"epoch": 0.21942115744660554,
|
| 8193 |
+
"eval_loss": 2.499041795730591,
|
| 8194 |
+
"eval_runtime": 50.6868,
|
| 8195 |
+
"eval_samples_per_second": 98.645,
|
| 8196 |
+
"eval_steps_per_second": 24.661,
|
| 8197 |
+
"num_input_tokens_seen": 12058619456,
|
| 8198 |
+
"step": 46000
|
| 8199 |
}
|
| 8200 |
],
|
| 8201 |
"logging_steps": 50,
|
| 8202 |
"max_steps": 70000,
|
| 8203 |
+
"num_input_tokens_seen": 12058619456,
|
| 8204 |
"num_train_epochs": 1,
|
| 8205 |
"save_steps": 1000,
|
| 8206 |
"stateful_callbacks": {
|
|
|
|
| 8215 |
"attributes": {}
|
| 8216 |
}
|
| 8217 |
},
|
| 8218 |
+
"total_flos": 3.2257983966058906e+18,
|
| 8219 |
"train_batch_size": 64,
|
| 8220 |
"trial_name": null,
|
| 8221 |
"trial_params": null
|