Training in progress, step 73000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82e96b382e85cf4f91a0957df390eab642f1a5b90594b054112e585987e922fb
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86831135ab2a33d7609f755ab5e685a1ac6602cf0ed6e3f717ff3cd6a64064f2
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dbf5a8e94cdeb9d71543994044a1496c0b99dc653812727d1f2b5879319264c4
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e585ae00a418f8315b98a87df365e3f31023ec6747db05d48bdc24ed26af3666
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c65df296955d0ea7a8b7df67d30426101d0bc72ddcf4935d0366aeb81991dd30
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75f3a690a6b3c19beeba0982e2eceaedb3e05582e018ecc3f8710afa643876ad
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1971585f96833288fec52d3fdc773fe9f57b50e9c45dc3d75ed2e10f5ab3dca7
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -25208,6 +25208,356 @@
|
|
| 25208 |
"learning_rate": 0.00048234355429470035,
|
| 25209 |
"loss": 16.5261,
|
| 25210 |
"step": 72000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25211 |
}
|
| 25212 |
],
|
| 25213 |
"logging_steps": 20,
|
|
@@ -25227,7 +25577,7 @@
|
|
| 25227 |
"attributes": {}
|
| 25228 |
}
|
| 25229 |
},
|
| 25230 |
-
"total_flos": 5.
|
| 25231 |
"train_batch_size": 48,
|
| 25232 |
"trial_name": null,
|
| 25233 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.10813597283861373,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 73000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 25208 |
"learning_rate": 0.00048234355429470035,
|
| 25209 |
"loss": 16.5261,
|
| 25210 |
"step": 72000
|
| 25211 |
+
},
|
| 25212 |
+
{
|
| 25213 |
+
"epoch": 0.10668428443612275,
|
| 25214 |
+
"grad_norm": 6.65625,
|
| 25215 |
+
"learning_rate": 0.00048233861535969274,
|
| 25216 |
+
"loss": 16.4916,
|
| 25217 |
+
"step": 72020
|
| 25218 |
+
},
|
| 25219 |
+
{
|
| 25220 |
+
"epoch": 0.10671391073005114,
|
| 25221 |
+
"grad_norm": 6.6875,
|
| 25222 |
+
"learning_rate": 0.0004823336764246852,
|
| 25223 |
+
"loss": 16.4841,
|
| 25224 |
+
"step": 72040
|
| 25225 |
+
},
|
| 25226 |
+
{
|
| 25227 |
+
"epoch": 0.10674353702397953,
|
| 25228 |
+
"grad_norm": 6.09375,
|
| 25229 |
+
"learning_rate": 0.00048232873748967764,
|
| 25230 |
+
"loss": 16.485,
|
| 25231 |
+
"step": 72060
|
| 25232 |
+
},
|
| 25233 |
+
{
|
| 25234 |
+
"epoch": 0.10677316331790791,
|
| 25235 |
+
"grad_norm": 6.375,
|
| 25236 |
+
"learning_rate": 0.0004823237985546701,
|
| 25237 |
+
"loss": 16.4513,
|
| 25238 |
+
"step": 72080
|
| 25239 |
+
},
|
| 25240 |
+
{
|
| 25241 |
+
"epoch": 0.1068027896118363,
|
| 25242 |
+
"grad_norm": 7.78125,
|
| 25243 |
+
"learning_rate": 0.0004823188596196625,
|
| 25244 |
+
"loss": 16.4513,
|
| 25245 |
+
"step": 72100
|
| 25246 |
+
},
|
| 25247 |
+
{
|
| 25248 |
+
"epoch": 0.10683241590576469,
|
| 25249 |
+
"grad_norm": 7.0,
|
| 25250 |
+
"learning_rate": 0.000482313920684655,
|
| 25251 |
+
"loss": 16.4768,
|
| 25252 |
+
"step": 72120
|
| 25253 |
+
},
|
| 25254 |
+
{
|
| 25255 |
+
"epoch": 0.10686204219969307,
|
| 25256 |
+
"grad_norm": 6.5625,
|
| 25257 |
+
"learning_rate": 0.0004823089817496474,
|
| 25258 |
+
"loss": 16.4426,
|
| 25259 |
+
"step": 72140
|
| 25260 |
+
},
|
| 25261 |
+
{
|
| 25262 |
+
"epoch": 0.10689166849362146,
|
| 25263 |
+
"grad_norm": 7.46875,
|
| 25264 |
+
"learning_rate": 0.0004823040428146398,
|
| 25265 |
+
"loss": 16.4733,
|
| 25266 |
+
"step": 72160
|
| 25267 |
+
},
|
| 25268 |
+
{
|
| 25269 |
+
"epoch": 0.10692129478754984,
|
| 25270 |
+
"grad_norm": 7.625,
|
| 25271 |
+
"learning_rate": 0.0004822991038796322,
|
| 25272 |
+
"loss": 16.459,
|
| 25273 |
+
"step": 72180
|
| 25274 |
+
},
|
| 25275 |
+
{
|
| 25276 |
+
"epoch": 0.10695092108147823,
|
| 25277 |
+
"grad_norm": 6.0,
|
| 25278 |
+
"learning_rate": 0.0004822941649446247,
|
| 25279 |
+
"loss": 16.4675,
|
| 25280 |
+
"step": 72200
|
| 25281 |
+
},
|
| 25282 |
+
{
|
| 25283 |
+
"epoch": 0.10698054737540662,
|
| 25284 |
+
"grad_norm": 6.28125,
|
| 25285 |
+
"learning_rate": 0.0004822892260096171,
|
| 25286 |
+
"loss": 16.517,
|
| 25287 |
+
"step": 72220
|
| 25288 |
+
},
|
| 25289 |
+
{
|
| 25290 |
+
"epoch": 0.107010173669335,
|
| 25291 |
+
"grad_norm": 6.375,
|
| 25292 |
+
"learning_rate": 0.0004822842870746095,
|
| 25293 |
+
"loss": 16.4864,
|
| 25294 |
+
"step": 72240
|
| 25295 |
+
},
|
| 25296 |
+
{
|
| 25297 |
+
"epoch": 0.10703979996326339,
|
| 25298 |
+
"grad_norm": 6.4375,
|
| 25299 |
+
"learning_rate": 0.00048227934813960195,
|
| 25300 |
+
"loss": 16.4141,
|
| 25301 |
+
"step": 72260
|
| 25302 |
+
},
|
| 25303 |
+
{
|
| 25304 |
+
"epoch": 0.10706942625719178,
|
| 25305 |
+
"grad_norm": 6.875,
|
| 25306 |
+
"learning_rate": 0.0004822744092045944,
|
| 25307 |
+
"loss": 16.5031,
|
| 25308 |
+
"step": 72280
|
| 25309 |
+
},
|
| 25310 |
+
{
|
| 25311 |
+
"epoch": 0.10709905255112016,
|
| 25312 |
+
"grad_norm": 6.4375,
|
| 25313 |
+
"learning_rate": 0.00048226947026958685,
|
| 25314 |
+
"loss": 16.4786,
|
| 25315 |
+
"step": 72300
|
| 25316 |
+
},
|
| 25317 |
+
{
|
| 25318 |
+
"epoch": 0.10712867884504856,
|
| 25319 |
+
"grad_norm": 6.4375,
|
| 25320 |
+
"learning_rate": 0.00048226453133457924,
|
| 25321 |
+
"loss": 16.4936,
|
| 25322 |
+
"step": 72320
|
| 25323 |
+
},
|
| 25324 |
+
{
|
| 25325 |
+
"epoch": 0.10715830513897695,
|
| 25326 |
+
"grad_norm": 7.3125,
|
| 25327 |
+
"learning_rate": 0.0004822595923995717,
|
| 25328 |
+
"loss": 16.4618,
|
| 25329 |
+
"step": 72340
|
| 25330 |
+
},
|
| 25331 |
+
{
|
| 25332 |
+
"epoch": 0.10718793143290534,
|
| 25333 |
+
"grad_norm": 6.28125,
|
| 25334 |
+
"learning_rate": 0.00048225465346456414,
|
| 25335 |
+
"loss": 16.4663,
|
| 25336 |
+
"step": 72360
|
| 25337 |
+
},
|
| 25338 |
+
{
|
| 25339 |
+
"epoch": 0.10721755772683372,
|
| 25340 |
+
"grad_norm": 6.25,
|
| 25341 |
+
"learning_rate": 0.0004822497145295566,
|
| 25342 |
+
"loss": 16.4996,
|
| 25343 |
+
"step": 72380
|
| 25344 |
+
},
|
| 25345 |
+
{
|
| 25346 |
+
"epoch": 0.10724718402076211,
|
| 25347 |
+
"grad_norm": 6.25,
|
| 25348 |
+
"learning_rate": 0.000482244775594549,
|
| 25349 |
+
"loss": 16.4604,
|
| 25350 |
+
"step": 72400
|
| 25351 |
+
},
|
| 25352 |
+
{
|
| 25353 |
+
"epoch": 0.1072768103146905,
|
| 25354 |
+
"grad_norm": 6.6875,
|
| 25355 |
+
"learning_rate": 0.0004822398366595415,
|
| 25356 |
+
"loss": 16.4964,
|
| 25357 |
+
"step": 72420
|
| 25358 |
+
},
|
| 25359 |
+
{
|
| 25360 |
+
"epoch": 0.10730643660861888,
|
| 25361 |
+
"grad_norm": 6.8125,
|
| 25362 |
+
"learning_rate": 0.0004822348977245339,
|
| 25363 |
+
"loss": 16.4576,
|
| 25364 |
+
"step": 72440
|
| 25365 |
+
},
|
| 25366 |
+
{
|
| 25367 |
+
"epoch": 0.10733606290254727,
|
| 25368 |
+
"grad_norm": 6.65625,
|
| 25369 |
+
"learning_rate": 0.0004822299587895263,
|
| 25370 |
+
"loss": 16.4561,
|
| 25371 |
+
"step": 72460
|
| 25372 |
+
},
|
| 25373 |
+
{
|
| 25374 |
+
"epoch": 0.10736568919647566,
|
| 25375 |
+
"grad_norm": 6.125,
|
| 25376 |
+
"learning_rate": 0.0004822250198545187,
|
| 25377 |
+
"loss": 16.4686,
|
| 25378 |
+
"step": 72480
|
| 25379 |
+
},
|
| 25380 |
+
{
|
| 25381 |
+
"epoch": 0.10739531549040404,
|
| 25382 |
+
"grad_norm": 6.4375,
|
| 25383 |
+
"learning_rate": 0.0004822200809195112,
|
| 25384 |
+
"loss": 16.477,
|
| 25385 |
+
"step": 72500
|
| 25386 |
+
},
|
| 25387 |
+
{
|
| 25388 |
+
"epoch": 0.10742494178433243,
|
| 25389 |
+
"grad_norm": 7.0,
|
| 25390 |
+
"learning_rate": 0.0004822151419845036,
|
| 25391 |
+
"loss": 16.4681,
|
| 25392 |
+
"step": 72520
|
| 25393 |
+
},
|
| 25394 |
+
{
|
| 25395 |
+
"epoch": 0.10745456807826081,
|
| 25396 |
+
"grad_norm": 6.59375,
|
| 25397 |
+
"learning_rate": 0.000482210203049496,
|
| 25398 |
+
"loss": 16.4467,
|
| 25399 |
+
"step": 72540
|
| 25400 |
+
},
|
| 25401 |
+
{
|
| 25402 |
+
"epoch": 0.1074841943721892,
|
| 25403 |
+
"grad_norm": 6.40625,
|
| 25404 |
+
"learning_rate": 0.00048220526411448845,
|
| 25405 |
+
"loss": 16.4432,
|
| 25406 |
+
"step": 72560
|
| 25407 |
+
},
|
| 25408 |
+
{
|
| 25409 |
+
"epoch": 0.10751382066611759,
|
| 25410 |
+
"grad_norm": 6.46875,
|
| 25411 |
+
"learning_rate": 0.0004822003251794809,
|
| 25412 |
+
"loss": 16.4936,
|
| 25413 |
+
"step": 72580
|
| 25414 |
+
},
|
| 25415 |
+
{
|
| 25416 |
+
"epoch": 0.10754344696004597,
|
| 25417 |
+
"grad_norm": 6.46875,
|
| 25418 |
+
"learning_rate": 0.00048219538624447335,
|
| 25419 |
+
"loss": 16.4488,
|
| 25420 |
+
"step": 72600
|
| 25421 |
+
},
|
| 25422 |
+
{
|
| 25423 |
+
"epoch": 0.10757307325397436,
|
| 25424 |
+
"grad_norm": 6.0,
|
| 25425 |
+
"learning_rate": 0.00048219044730946574,
|
| 25426 |
+
"loss": 16.4461,
|
| 25427 |
+
"step": 72620
|
| 25428 |
+
},
|
| 25429 |
+
{
|
| 25430 |
+
"epoch": 0.10760269954790276,
|
| 25431 |
+
"grad_norm": 6.625,
|
| 25432 |
+
"learning_rate": 0.0004821855083744582,
|
| 25433 |
+
"loss": 16.5009,
|
| 25434 |
+
"step": 72640
|
| 25435 |
+
},
|
| 25436 |
+
{
|
| 25437 |
+
"epoch": 0.10763232584183115,
|
| 25438 |
+
"grad_norm": 7.21875,
|
| 25439 |
+
"learning_rate": 0.00048218056943945064,
|
| 25440 |
+
"loss": 16.4108,
|
| 25441 |
+
"step": 72660
|
| 25442 |
+
},
|
| 25443 |
+
{
|
| 25444 |
+
"epoch": 0.10766195213575953,
|
| 25445 |
+
"grad_norm": 6.40625,
|
| 25446 |
+
"learning_rate": 0.0004821756305044431,
|
| 25447 |
+
"loss": 16.4139,
|
| 25448 |
+
"step": 72680
|
| 25449 |
+
},
|
| 25450 |
+
{
|
| 25451 |
+
"epoch": 0.10769157842968792,
|
| 25452 |
+
"grad_norm": 6.90625,
|
| 25453 |
+
"learning_rate": 0.0004821706915694355,
|
| 25454 |
+
"loss": 16.4898,
|
| 25455 |
+
"step": 72700
|
| 25456 |
+
},
|
| 25457 |
+
{
|
| 25458 |
+
"epoch": 0.1077212047236163,
|
| 25459 |
+
"grad_norm": 7.40625,
|
| 25460 |
+
"learning_rate": 0.000482165752634428,
|
| 25461 |
+
"loss": 16.5149,
|
| 25462 |
+
"step": 72720
|
| 25463 |
+
},
|
| 25464 |
+
{
|
| 25465 |
+
"epoch": 0.1077508310175447,
|
| 25466 |
+
"grad_norm": 6.5,
|
| 25467 |
+
"learning_rate": 0.0004821608136994204,
|
| 25468 |
+
"loss": 16.4915,
|
| 25469 |
+
"step": 72740
|
| 25470 |
+
},
|
| 25471 |
+
{
|
| 25472 |
+
"epoch": 0.10778045731147308,
|
| 25473 |
+
"grad_norm": 7.5,
|
| 25474 |
+
"learning_rate": 0.0004821558747644128,
|
| 25475 |
+
"loss": 16.428,
|
| 25476 |
+
"step": 72760
|
| 25477 |
+
},
|
| 25478 |
+
{
|
| 25479 |
+
"epoch": 0.10781008360540147,
|
| 25480 |
+
"grad_norm": 7.21875,
|
| 25481 |
+
"learning_rate": 0.0004821509358294052,
|
| 25482 |
+
"loss": 16.4355,
|
| 25483 |
+
"step": 72780
|
| 25484 |
+
},
|
| 25485 |
+
{
|
| 25486 |
+
"epoch": 0.10783970989932985,
|
| 25487 |
+
"grad_norm": 6.34375,
|
| 25488 |
+
"learning_rate": 0.0004821459968943977,
|
| 25489 |
+
"loss": 16.4379,
|
| 25490 |
+
"step": 72800
|
| 25491 |
+
},
|
| 25492 |
+
{
|
| 25493 |
+
"epoch": 0.10786933619325824,
|
| 25494 |
+
"grad_norm": 6.53125,
|
| 25495 |
+
"learning_rate": 0.0004821410579593901,
|
| 25496 |
+
"loss": 16.4117,
|
| 25497 |
+
"step": 72820
|
| 25498 |
+
},
|
| 25499 |
+
{
|
| 25500 |
+
"epoch": 0.10789896248718663,
|
| 25501 |
+
"grad_norm": 6.59375,
|
| 25502 |
+
"learning_rate": 0.00048213611902438256,
|
| 25503 |
+
"loss": 16.4863,
|
| 25504 |
+
"step": 72840
|
| 25505 |
+
},
|
| 25506 |
+
{
|
| 25507 |
+
"epoch": 0.10792858878111501,
|
| 25508 |
+
"grad_norm": 6.875,
|
| 25509 |
+
"learning_rate": 0.00048213118008937495,
|
| 25510 |
+
"loss": 16.4081,
|
| 25511 |
+
"step": 72860
|
| 25512 |
+
},
|
| 25513 |
+
{
|
| 25514 |
+
"epoch": 0.1079582150750434,
|
| 25515 |
+
"grad_norm": 6.03125,
|
| 25516 |
+
"learning_rate": 0.0004821262411543674,
|
| 25517 |
+
"loss": 16.4631,
|
| 25518 |
+
"step": 72880
|
| 25519 |
+
},
|
| 25520 |
+
{
|
| 25521 |
+
"epoch": 0.10798784136897178,
|
| 25522 |
+
"grad_norm": 6.34375,
|
| 25523 |
+
"learning_rate": 0.00048212130221935985,
|
| 25524 |
+
"loss": 16.4837,
|
| 25525 |
+
"step": 72900
|
| 25526 |
+
},
|
| 25527 |
+
{
|
| 25528 |
+
"epoch": 0.10801746766290017,
|
| 25529 |
+
"grad_norm": 6.53125,
|
| 25530 |
+
"learning_rate": 0.00048211636328435224,
|
| 25531 |
+
"loss": 16.4245,
|
| 25532 |
+
"step": 72920
|
| 25533 |
+
},
|
| 25534 |
+
{
|
| 25535 |
+
"epoch": 0.10804709395682856,
|
| 25536 |
+
"grad_norm": 6.90625,
|
| 25537 |
+
"learning_rate": 0.0004821114243493447,
|
| 25538 |
+
"loss": 16.4369,
|
| 25539 |
+
"step": 72940
|
| 25540 |
+
},
|
| 25541 |
+
{
|
| 25542 |
+
"epoch": 0.10807672025075696,
|
| 25543 |
+
"grad_norm": 6.8125,
|
| 25544 |
+
"learning_rate": 0.00048210648541433714,
|
| 25545 |
+
"loss": 16.4274,
|
| 25546 |
+
"step": 72960
|
| 25547 |
+
},
|
| 25548 |
+
{
|
| 25549 |
+
"epoch": 0.10810634654468534,
|
| 25550 |
+
"grad_norm": 5.90625,
|
| 25551 |
+
"learning_rate": 0.0004821015464793296,
|
| 25552 |
+
"loss": 16.4141,
|
| 25553 |
+
"step": 72980
|
| 25554 |
+
},
|
| 25555 |
+
{
|
| 25556 |
+
"epoch": 0.10813597283861373,
|
| 25557 |
+
"grad_norm": 6.6875,
|
| 25558 |
+
"learning_rate": 0.000482096607544322,
|
| 25559 |
+
"loss": 16.4235,
|
| 25560 |
+
"step": 73000
|
| 25561 |
}
|
| 25562 |
],
|
| 25563 |
"logging_steps": 20,
|
|
|
|
| 25577 |
"attributes": {}
|
| 25578 |
}
|
| 25579 |
},
|
| 25580 |
+
"total_flos": 5.367243712484711e+19,
|
| 25581 |
"train_batch_size": 48,
|
| 25582 |
"trial_name": null,
|
| 25583 |
"trial_params": null
|