Training in progress, step 52000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17f79bfd92c936c07be11debb700728ae4b7e0771937dc9aee38748f4dc80dc3
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdcb0e96beb98fcdfd50cc3b612cd068e544f01ef0961afbf353f3d6eabba3ce
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:746267b8ba996549a033d105e363328c635034a7afa0e3070ea8447957aaca5a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24b3fcbecd3d55078c913506015bb6e1182f04ee52bf4c0845fc043823a61161
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9086,11 +9086,189 @@
|
|
| 9086 |
"eval_steps_per_second": 23.543,
|
| 9087 |
"num_input_tokens_seen": 13369344000,
|
| 9088 |
"step": 51000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9089 |
}
|
| 9090 |
],
|
| 9091 |
"logging_steps": 50,
|
| 9092 |
"max_steps": 60000,
|
| 9093 |
-
"num_input_tokens_seen":
|
| 9094 |
"num_train_epochs": 1,
|
| 9095 |
"save_steps": 1000,
|
| 9096 |
"stateful_callbacks": {
|
|
@@ -9105,7 +9283,7 @@
|
|
| 9105 |
"attributes": {}
|
| 9106 |
}
|
| 9107 |
},
|
| 9108 |
-
"total_flos": 3.
|
| 9109 |
"train_batch_size": 64,
|
| 9110 |
"trial_name": null,
|
| 9111 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.34977794145591706,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 52000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9086 |
"eval_steps_per_second": 23.543,
|
| 9087 |
"num_input_tokens_seen": 13369344000,
|
| 9088 |
"step": 51000
|
| 9089 |
+
},
|
| 9090 |
+
{
|
| 9091 |
+
"epoch": 0.3433877675254724,
|
| 9092 |
+
"grad_norm": 0.19399498403072357,
|
| 9093 |
+
"learning_rate": 0.001,
|
| 9094 |
+
"loss": 3.0583,
|
| 9095 |
+
"num_input_tokens_seen": 13382451200,
|
| 9096 |
+
"step": 51050
|
| 9097 |
+
},
|
| 9098 |
+
{
|
| 9099 |
+
"epoch": 0.34372409246918,
|
| 9100 |
+
"grad_norm": 0.19893072545528412,
|
| 9101 |
+
"learning_rate": 0.001,
|
| 9102 |
+
"loss": 3.0505,
|
| 9103 |
+
"num_input_tokens_seen": 13395558400,
|
| 9104 |
+
"step": 51100
|
| 9105 |
+
},
|
| 9106 |
+
{
|
| 9107 |
+
"epoch": 0.3440604174128876,
|
| 9108 |
+
"grad_norm": 0.17791305482387543,
|
| 9109 |
+
"learning_rate": 0.001,
|
| 9110 |
+
"loss": 3.0504,
|
| 9111 |
+
"num_input_tokens_seen": 13408665600,
|
| 9112 |
+
"step": 51150
|
| 9113 |
+
},
|
| 9114 |
+
{
|
| 9115 |
+
"epoch": 0.34439674235659523,
|
| 9116 |
+
"grad_norm": 0.7631425261497498,
|
| 9117 |
+
"learning_rate": 0.001,
|
| 9118 |
+
"loss": 3.0483,
|
| 9119 |
+
"num_input_tokens_seen": 13421772800,
|
| 9120 |
+
"step": 51200
|
| 9121 |
+
},
|
| 9122 |
+
{
|
| 9123 |
+
"epoch": 0.34473306730030284,
|
| 9124 |
+
"grad_norm": 0.22620978951454163,
|
| 9125 |
+
"learning_rate": 0.001,
|
| 9126 |
+
"loss": 3.0512,
|
| 9127 |
+
"num_input_tokens_seen": 13434880000,
|
| 9128 |
+
"step": 51250
|
| 9129 |
+
},
|
| 9130 |
+
{
|
| 9131 |
+
"epoch": 0.34506939224401045,
|
| 9132 |
+
"grad_norm": 0.219919815659523,
|
| 9133 |
+
"learning_rate": 0.001,
|
| 9134 |
+
"loss": 3.0415,
|
| 9135 |
+
"num_input_tokens_seen": 13447987200,
|
| 9136 |
+
"step": 51300
|
| 9137 |
+
},
|
| 9138 |
+
{
|
| 9139 |
+
"epoch": 0.34540571718771806,
|
| 9140 |
+
"grad_norm": 0.21654649078845978,
|
| 9141 |
+
"learning_rate": 0.001,
|
| 9142 |
+
"loss": 3.062,
|
| 9143 |
+
"num_input_tokens_seen": 13461094400,
|
| 9144 |
+
"step": 51350
|
| 9145 |
+
},
|
| 9146 |
+
{
|
| 9147 |
+
"epoch": 0.3457420421314257,
|
| 9148 |
+
"grad_norm": 0.2439095377922058,
|
| 9149 |
+
"learning_rate": 0.001,
|
| 9150 |
+
"loss": 3.0478,
|
| 9151 |
+
"num_input_tokens_seen": 13474201600,
|
| 9152 |
+
"step": 51400
|
| 9153 |
+
},
|
| 9154 |
+
{
|
| 9155 |
+
"epoch": 0.3460783670751333,
|
| 9156 |
+
"grad_norm": 0.19535380601882935,
|
| 9157 |
+
"learning_rate": 0.001,
|
| 9158 |
+
"loss": 3.0444,
|
| 9159 |
+
"num_input_tokens_seen": 13487308800,
|
| 9160 |
+
"step": 51450
|
| 9161 |
+
},
|
| 9162 |
+
{
|
| 9163 |
+
"epoch": 0.3464146920188409,
|
| 9164 |
+
"grad_norm": 0.1964534968137741,
|
| 9165 |
+
"learning_rate": 0.001,
|
| 9166 |
+
"loss": 3.049,
|
| 9167 |
+
"num_input_tokens_seen": 13500416000,
|
| 9168 |
+
"step": 51500
|
| 9169 |
+
},
|
| 9170 |
+
{
|
| 9171 |
+
"epoch": 0.3464146920188409,
|
| 9172 |
+
"eval_loss": 2.945749044418335,
|
| 9173 |
+
"eval_runtime": 53.0447,
|
| 9174 |
+
"eval_samples_per_second": 94.26,
|
| 9175 |
+
"eval_steps_per_second": 23.565,
|
| 9176 |
+
"num_input_tokens_seen": 13500416000,
|
| 9177 |
+
"step": 51500
|
| 9178 |
+
},
|
| 9179 |
+
{
|
| 9180 |
+
"epoch": 0.3467510169625485,
|
| 9181 |
+
"grad_norm": 0.2085062563419342,
|
| 9182 |
+
"learning_rate": 0.001,
|
| 9183 |
+
"loss": 3.0582,
|
| 9184 |
+
"num_input_tokens_seen": 13513523200,
|
| 9185 |
+
"step": 51550
|
| 9186 |
+
},
|
| 9187 |
+
{
|
| 9188 |
+
"epoch": 0.3470873419062562,
|
| 9189 |
+
"grad_norm": 0.1903097778558731,
|
| 9190 |
+
"learning_rate": 0.001,
|
| 9191 |
+
"loss": 3.0488,
|
| 9192 |
+
"num_input_tokens_seen": 13526630400,
|
| 9193 |
+
"step": 51600
|
| 9194 |
+
},
|
| 9195 |
+
{
|
| 9196 |
+
"epoch": 0.3474236668499638,
|
| 9197 |
+
"grad_norm": 0.20101405680179596,
|
| 9198 |
+
"learning_rate": 0.001,
|
| 9199 |
+
"loss": 3.0573,
|
| 9200 |
+
"num_input_tokens_seen": 13539737600,
|
| 9201 |
+
"step": 51650
|
| 9202 |
+
},
|
| 9203 |
+
{
|
| 9204 |
+
"epoch": 0.3477599917936714,
|
| 9205 |
+
"grad_norm": 0.6418889164924622,
|
| 9206 |
+
"learning_rate": 0.001,
|
| 9207 |
+
"loss": 3.0513,
|
| 9208 |
+
"num_input_tokens_seen": 13552844800,
|
| 9209 |
+
"step": 51700
|
| 9210 |
+
},
|
| 9211 |
+
{
|
| 9212 |
+
"epoch": 0.348096316737379,
|
| 9213 |
+
"grad_norm": 0.22524093091487885,
|
| 9214 |
+
"learning_rate": 0.001,
|
| 9215 |
+
"loss": 3.0567,
|
| 9216 |
+
"num_input_tokens_seen": 13565952000,
|
| 9217 |
+
"step": 51750
|
| 9218 |
+
},
|
| 9219 |
+
{
|
| 9220 |
+
"epoch": 0.3484326416810866,
|
| 9221 |
+
"grad_norm": 0.21830599009990692,
|
| 9222 |
+
"learning_rate": 0.001,
|
| 9223 |
+
"loss": 3.0538,
|
| 9224 |
+
"num_input_tokens_seen": 13579059200,
|
| 9225 |
+
"step": 51800
|
| 9226 |
+
},
|
| 9227 |
+
{
|
| 9228 |
+
"epoch": 0.34876896662479423,
|
| 9229 |
+
"grad_norm": 0.6111611127853394,
|
| 9230 |
+
"learning_rate": 0.001,
|
| 9231 |
+
"loss": 3.0581,
|
| 9232 |
+
"num_input_tokens_seen": 13592166400,
|
| 9233 |
+
"step": 51850
|
| 9234 |
+
},
|
| 9235 |
+
{
|
| 9236 |
+
"epoch": 0.34910529156850184,
|
| 9237 |
+
"grad_norm": 0.3782864511013031,
|
| 9238 |
+
"learning_rate": 0.001,
|
| 9239 |
+
"loss": 3.0694,
|
| 9240 |
+
"num_input_tokens_seen": 13605273600,
|
| 9241 |
+
"step": 51900
|
| 9242 |
+
},
|
| 9243 |
+
{
|
| 9244 |
+
"epoch": 0.34944161651220945,
|
| 9245 |
+
"grad_norm": 0.23944802582263947,
|
| 9246 |
+
"learning_rate": 0.001,
|
| 9247 |
+
"loss": 3.0683,
|
| 9248 |
+
"num_input_tokens_seen": 13618380800,
|
| 9249 |
+
"step": 51950
|
| 9250 |
+
},
|
| 9251 |
+
{
|
| 9252 |
+
"epoch": 0.34977794145591706,
|
| 9253 |
+
"grad_norm": 0.20257577300071716,
|
| 9254 |
+
"learning_rate": 0.001,
|
| 9255 |
+
"loss": 3.0509,
|
| 9256 |
+
"num_input_tokens_seen": 13631488000,
|
| 9257 |
+
"step": 52000
|
| 9258 |
+
},
|
| 9259 |
+
{
|
| 9260 |
+
"epoch": 0.34977794145591706,
|
| 9261 |
+
"eval_loss": 2.94769287109375,
|
| 9262 |
+
"eval_runtime": 53.1351,
|
| 9263 |
+
"eval_samples_per_second": 94.1,
|
| 9264 |
+
"eval_steps_per_second": 23.525,
|
| 9265 |
+
"num_input_tokens_seen": 13631488000,
|
| 9266 |
+
"step": 52000
|
| 9267 |
}
|
| 9268 |
],
|
| 9269 |
"logging_steps": 50,
|
| 9270 |
"max_steps": 60000,
|
| 9271 |
+
"num_input_tokens_seen": 13631488000,
|
| 9272 |
"num_train_epochs": 1,
|
| 9273 |
"save_steps": 1000,
|
| 9274 |
"stateful_callbacks": {
|
|
|
|
| 9283 |
"attributes": {}
|
| 9284 |
}
|
| 9285 |
},
|
| 9286 |
+
"total_flos": 3.64655608332288e+18,
|
| 9287 |
"train_batch_size": 64,
|
| 9288 |
"trial_name": null,
|
| 9289 |
"trial_params": null
|