Training in progress, step 19000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01e4827000f30108c5db6d9ab6168d6e7dfecf37eef3edc1465363ee9ea8e490
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:795bb5905ce658a665a647e1035b68562ea8227998cfd6cdd93e835459408e5d
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6872023f654a65ebb855f875663f2550ec7c7270f37183aedc09afdf3151f71c
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d749134b574c8d566f1f7b1e5e174cfc46c406c32210d882ffb530c2f402814
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7b2317285b7aac6485bde8423b9bd42301b29e0cd0b6a3f299d06ddf3270099
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a512863aeac154eb9ea09654b5c57fb002e6788836adf8be9c2844cb710adf1
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b73090e5ff4d77e40aae33305c58d2deda13e4f4510f1c076acf40a9f8a97bef
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6308,6 +6308,356 @@
|
|
| 6308 |
"learning_rate": 0.0004943110480558603,
|
| 6309 |
"loss": 19.8528,
|
| 6310 |
"step": 18000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6311 |
}
|
| 6312 |
],
|
| 6313 |
"logging_steps": 20,
|
|
@@ -6327,7 +6677,7 @@
|
|
| 6327 |
"attributes": {}
|
| 6328 |
}
|
| 6329 |
},
|
| 6330 |
-
"total_flos": 1.
|
| 6331 |
"train_batch_size": 48,
|
| 6332 |
"trial_name": null,
|
| 6333 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.03704949520062789,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 19000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6308 |
"learning_rate": 0.0004943110480558603,
|
| 6309 |
"loss": 19.8528,
|
| 6310 |
"step": 18000
|
| 6311 |
+
},
|
| 6312 |
+
{
|
| 6313 |
+
"epoch": 0.035138521237648136,
|
| 6314 |
+
"grad_norm": 12.25,
|
| 6315 |
+
"learning_rate": 0.0004943045460249509,
|
| 6316 |
+
"loss": 19.9118,
|
| 6317 |
+
"step": 18020
|
| 6318 |
+
},
|
| 6319 |
+
{
|
| 6320 |
+
"epoch": 0.03517752070628038,
|
| 6321 |
+
"grad_norm": 12.0625,
|
| 6322 |
+
"learning_rate": 0.0004942980439940416,
|
| 6323 |
+
"loss": 19.7998,
|
| 6324 |
+
"step": 18040
|
| 6325 |
+
},
|
| 6326 |
+
{
|
| 6327 |
+
"epoch": 0.03521652017491262,
|
| 6328 |
+
"grad_norm": 10.5625,
|
| 6329 |
+
"learning_rate": 0.0004942915419631321,
|
| 6330 |
+
"loss": 19.8966,
|
| 6331 |
+
"step": 18060
|
| 6332 |
+
},
|
| 6333 |
+
{
|
| 6334 |
+
"epoch": 0.035255519643544855,
|
| 6335 |
+
"grad_norm": 10.375,
|
| 6336 |
+
"learning_rate": 0.0004942850399322228,
|
| 6337 |
+
"loss": 19.8631,
|
| 6338 |
+
"step": 18080
|
| 6339 |
+
},
|
| 6340 |
+
{
|
| 6341 |
+
"epoch": 0.0352945191121771,
|
| 6342 |
+
"grad_norm": 10.3125,
|
| 6343 |
+
"learning_rate": 0.0004942785379013135,
|
| 6344 |
+
"loss": 19.8226,
|
| 6345 |
+
"step": 18100
|
| 6346 |
+
},
|
| 6347 |
+
{
|
| 6348 |
+
"epoch": 0.03533351858080934,
|
| 6349 |
+
"grad_norm": 9.9375,
|
| 6350 |
+
"learning_rate": 0.0004942720358704041,
|
| 6351 |
+
"loss": 19.8466,
|
| 6352 |
+
"step": 18120
|
| 6353 |
+
},
|
| 6354 |
+
{
|
| 6355 |
+
"epoch": 0.035372518049441574,
|
| 6356 |
+
"grad_norm": 10.875,
|
| 6357 |
+
"learning_rate": 0.0004942655338394948,
|
| 6358 |
+
"loss": 19.8142,
|
| 6359 |
+
"step": 18140
|
| 6360 |
+
},
|
| 6361 |
+
{
|
| 6362 |
+
"epoch": 0.035411517518073816,
|
| 6363 |
+
"grad_norm": 10.0,
|
| 6364 |
+
"learning_rate": 0.0004942590318085854,
|
| 6365 |
+
"loss": 19.8257,
|
| 6366 |
+
"step": 18160
|
| 6367 |
+
},
|
| 6368 |
+
{
|
| 6369 |
+
"epoch": 0.03545051698670606,
|
| 6370 |
+
"grad_norm": 10.125,
|
| 6371 |
+
"learning_rate": 0.0004942525297776761,
|
| 6372 |
+
"loss": 19.7939,
|
| 6373 |
+
"step": 18180
|
| 6374 |
+
},
|
| 6375 |
+
{
|
| 6376 |
+
"epoch": 0.035489516455338294,
|
| 6377 |
+
"grad_norm": 9.5,
|
| 6378 |
+
"learning_rate": 0.0004942460277467667,
|
| 6379 |
+
"loss": 19.8764,
|
| 6380 |
+
"step": 18200
|
| 6381 |
+
},
|
| 6382 |
+
{
|
| 6383 |
+
"epoch": 0.035528515923970536,
|
| 6384 |
+
"grad_norm": 10.375,
|
| 6385 |
+
"learning_rate": 0.0004942395257158573,
|
| 6386 |
+
"loss": 19.8394,
|
| 6387 |
+
"step": 18220
|
| 6388 |
+
},
|
| 6389 |
+
{
|
| 6390 |
+
"epoch": 0.03556751539260278,
|
| 6391 |
+
"grad_norm": 10.375,
|
| 6392 |
+
"learning_rate": 0.000494233023684948,
|
| 6393 |
+
"loss": 19.7666,
|
| 6394 |
+
"step": 18240
|
| 6395 |
+
},
|
| 6396 |
+
{
|
| 6397 |
+
"epoch": 0.03560651486123501,
|
| 6398 |
+
"grad_norm": 10.625,
|
| 6399 |
+
"learning_rate": 0.0004942265216540386,
|
| 6400 |
+
"loss": 19.8165,
|
| 6401 |
+
"step": 18260
|
| 6402 |
+
},
|
| 6403 |
+
{
|
| 6404 |
+
"epoch": 0.035645514329867255,
|
| 6405 |
+
"grad_norm": 9.8125,
|
| 6406 |
+
"learning_rate": 0.0004942200196231293,
|
| 6407 |
+
"loss": 19.9201,
|
| 6408 |
+
"step": 18280
|
| 6409 |
+
},
|
| 6410 |
+
{
|
| 6411 |
+
"epoch": 0.0356845137984995,
|
| 6412 |
+
"grad_norm": 11.0625,
|
| 6413 |
+
"learning_rate": 0.0004942135175922199,
|
| 6414 |
+
"loss": 19.8705,
|
| 6415 |
+
"step": 18300
|
| 6416 |
+
},
|
| 6417 |
+
{
|
| 6418 |
+
"epoch": 0.03572351326713174,
|
| 6419 |
+
"grad_norm": 10.9375,
|
| 6420 |
+
"learning_rate": 0.0004942070155613106,
|
| 6421 |
+
"loss": 19.7906,
|
| 6422 |
+
"step": 18320
|
| 6423 |
+
},
|
| 6424 |
+
{
|
| 6425 |
+
"epoch": 0.035762512735763974,
|
| 6426 |
+
"grad_norm": 10.375,
|
| 6427 |
+
"learning_rate": 0.0004942005135304012,
|
| 6428 |
+
"loss": 19.7983,
|
| 6429 |
+
"step": 18340
|
| 6430 |
+
},
|
| 6431 |
+
{
|
| 6432 |
+
"epoch": 0.035801512204396216,
|
| 6433 |
+
"grad_norm": 10.5,
|
| 6434 |
+
"learning_rate": 0.0004941940114994919,
|
| 6435 |
+
"loss": 19.7921,
|
| 6436 |
+
"step": 18360
|
| 6437 |
+
},
|
| 6438 |
+
{
|
| 6439 |
+
"epoch": 0.03584051167302846,
|
| 6440 |
+
"grad_norm": 11.1875,
|
| 6441 |
+
"learning_rate": 0.0004941875094685825,
|
| 6442 |
+
"loss": 19.6906,
|
| 6443 |
+
"step": 18380
|
| 6444 |
+
},
|
| 6445 |
+
{
|
| 6446 |
+
"epoch": 0.03587951114166069,
|
| 6447 |
+
"grad_norm": 10.375,
|
| 6448 |
+
"learning_rate": 0.0004941810074376732,
|
| 6449 |
+
"loss": 19.8046,
|
| 6450 |
+
"step": 18400
|
| 6451 |
+
},
|
| 6452 |
+
{
|
| 6453 |
+
"epoch": 0.035918510610292935,
|
| 6454 |
+
"grad_norm": 10.5625,
|
| 6455 |
+
"learning_rate": 0.0004941745054067638,
|
| 6456 |
+
"loss": 19.8274,
|
| 6457 |
+
"step": 18420
|
| 6458 |
+
},
|
| 6459 |
+
{
|
| 6460 |
+
"epoch": 0.03595751007892518,
|
| 6461 |
+
"grad_norm": 11.25,
|
| 6462 |
+
"learning_rate": 0.0004941680033758544,
|
| 6463 |
+
"loss": 19.7977,
|
| 6464 |
+
"step": 18440
|
| 6465 |
+
},
|
| 6466 |
+
{
|
| 6467 |
+
"epoch": 0.03599650954755741,
|
| 6468 |
+
"grad_norm": 11.1875,
|
| 6469 |
+
"learning_rate": 0.0004941615013449451,
|
| 6470 |
+
"loss": 19.7892,
|
| 6471 |
+
"step": 18460
|
| 6472 |
+
},
|
| 6473 |
+
{
|
| 6474 |
+
"epoch": 0.036035509016189654,
|
| 6475 |
+
"grad_norm": 10.625,
|
| 6476 |
+
"learning_rate": 0.0004941549993140357,
|
| 6477 |
+
"loss": 19.6819,
|
| 6478 |
+
"step": 18480
|
| 6479 |
+
},
|
| 6480 |
+
{
|
| 6481 |
+
"epoch": 0.036074508484821896,
|
| 6482 |
+
"grad_norm": 12.3125,
|
| 6483 |
+
"learning_rate": 0.0004941484972831264,
|
| 6484 |
+
"loss": 19.7501,
|
| 6485 |
+
"step": 18500
|
| 6486 |
+
},
|
| 6487 |
+
{
|
| 6488 |
+
"epoch": 0.03611350795345413,
|
| 6489 |
+
"grad_norm": 10.0625,
|
| 6490 |
+
"learning_rate": 0.000494141995252217,
|
| 6491 |
+
"loss": 19.7792,
|
| 6492 |
+
"step": 18520
|
| 6493 |
+
},
|
| 6494 |
+
{
|
| 6495 |
+
"epoch": 0.03615250742208637,
|
| 6496 |
+
"grad_norm": 9.3125,
|
| 6497 |
+
"learning_rate": 0.0004941354932213077,
|
| 6498 |
+
"loss": 19.7753,
|
| 6499 |
+
"step": 18540
|
| 6500 |
+
},
|
| 6501 |
+
{
|
| 6502 |
+
"epoch": 0.036191506890718615,
|
| 6503 |
+
"grad_norm": 10.4375,
|
| 6504 |
+
"learning_rate": 0.0004941289911903983,
|
| 6505 |
+
"loss": 19.8244,
|
| 6506 |
+
"step": 18560
|
| 6507 |
+
},
|
| 6508 |
+
{
|
| 6509 |
+
"epoch": 0.03623050635935086,
|
| 6510 |
+
"grad_norm": 10.75,
|
| 6511 |
+
"learning_rate": 0.000494122489159489,
|
| 6512 |
+
"loss": 19.7036,
|
| 6513 |
+
"step": 18580
|
| 6514 |
+
},
|
| 6515 |
+
{
|
| 6516 |
+
"epoch": 0.03626950582798309,
|
| 6517 |
+
"grad_norm": 8.5625,
|
| 6518 |
+
"learning_rate": 0.0004941159871285797,
|
| 6519 |
+
"loss": 19.7776,
|
| 6520 |
+
"step": 18600
|
| 6521 |
+
},
|
| 6522 |
+
{
|
| 6523 |
+
"epoch": 0.036308505296615334,
|
| 6524 |
+
"grad_norm": 10.8125,
|
| 6525 |
+
"learning_rate": 0.0004941094850976703,
|
| 6526 |
+
"loss": 19.8757,
|
| 6527 |
+
"step": 18620
|
| 6528 |
+
},
|
| 6529 |
+
{
|
| 6530 |
+
"epoch": 0.036347504765247576,
|
| 6531 |
+
"grad_norm": 9.625,
|
| 6532 |
+
"learning_rate": 0.000494102983066761,
|
| 6533 |
+
"loss": 19.8473,
|
| 6534 |
+
"step": 18640
|
| 6535 |
+
},
|
| 6536 |
+
{
|
| 6537 |
+
"epoch": 0.03638650423387981,
|
| 6538 |
+
"grad_norm": 14.3125,
|
| 6539 |
+
"learning_rate": 0.0004940964810358516,
|
| 6540 |
+
"loss": 19.6959,
|
| 6541 |
+
"step": 18660
|
| 6542 |
+
},
|
| 6543 |
+
{
|
| 6544 |
+
"epoch": 0.03642550370251205,
|
| 6545 |
+
"grad_norm": 12.25,
|
| 6546 |
+
"learning_rate": 0.0004940899790049422,
|
| 6547 |
+
"loss": 19.7052,
|
| 6548 |
+
"step": 18680
|
| 6549 |
+
},
|
| 6550 |
+
{
|
| 6551 |
+
"epoch": 0.036464503171144295,
|
| 6552 |
+
"grad_norm": 10.625,
|
| 6553 |
+
"learning_rate": 0.0004940834769740328,
|
| 6554 |
+
"loss": 19.7221,
|
| 6555 |
+
"step": 18700
|
| 6556 |
+
},
|
| 6557 |
+
{
|
| 6558 |
+
"epoch": 0.03650350263977653,
|
| 6559 |
+
"grad_norm": 9.125,
|
| 6560 |
+
"learning_rate": 0.0004940769749431235,
|
| 6561 |
+
"loss": 19.636,
|
| 6562 |
+
"step": 18720
|
| 6563 |
+
},
|
| 6564 |
+
{
|
| 6565 |
+
"epoch": 0.03654250210840877,
|
| 6566 |
+
"grad_norm": 9.8125,
|
| 6567 |
+
"learning_rate": 0.0004940704729122142,
|
| 6568 |
+
"loss": 19.7428,
|
| 6569 |
+
"step": 18740
|
| 6570 |
+
},
|
| 6571 |
+
{
|
| 6572 |
+
"epoch": 0.036581501577041015,
|
| 6573 |
+
"grad_norm": 10.5,
|
| 6574 |
+
"learning_rate": 0.0004940639708813048,
|
| 6575 |
+
"loss": 19.7076,
|
| 6576 |
+
"step": 18760
|
| 6577 |
+
},
|
| 6578 |
+
{
|
| 6579 |
+
"epoch": 0.03662050104567325,
|
| 6580 |
+
"grad_norm": 10.125,
|
| 6581 |
+
"learning_rate": 0.0004940574688503955,
|
| 6582 |
+
"loss": 19.6721,
|
| 6583 |
+
"step": 18780
|
| 6584 |
+
},
|
| 6585 |
+
{
|
| 6586 |
+
"epoch": 0.03665950051430549,
|
| 6587 |
+
"grad_norm": 12.4375,
|
| 6588 |
+
"learning_rate": 0.0004940509668194861,
|
| 6589 |
+
"loss": 19.7135,
|
| 6590 |
+
"step": 18800
|
| 6591 |
+
},
|
| 6592 |
+
{
|
| 6593 |
+
"epoch": 0.036698499982937734,
|
| 6594 |
+
"grad_norm": 9.5,
|
| 6595 |
+
"learning_rate": 0.0004940444647885768,
|
| 6596 |
+
"loss": 19.591,
|
| 6597 |
+
"step": 18820
|
| 6598 |
+
},
|
| 6599 |
+
{
|
| 6600 |
+
"epoch": 0.036737499451569976,
|
| 6601 |
+
"grad_norm": 9.3125,
|
| 6602 |
+
"learning_rate": 0.0004940379627576673,
|
| 6603 |
+
"loss": 19.595,
|
| 6604 |
+
"step": 18840
|
| 6605 |
+
},
|
| 6606 |
+
{
|
| 6607 |
+
"epoch": 0.03677649892020221,
|
| 6608 |
+
"grad_norm": 10.375,
|
| 6609 |
+
"learning_rate": 0.000494031460726758,
|
| 6610 |
+
"loss": 19.7223,
|
| 6611 |
+
"step": 18860
|
| 6612 |
+
},
|
| 6613 |
+
{
|
| 6614 |
+
"epoch": 0.03681549838883445,
|
| 6615 |
+
"grad_norm": 12.4375,
|
| 6616 |
+
"learning_rate": 0.0004940249586958486,
|
| 6617 |
+
"loss": 19.6684,
|
| 6618 |
+
"step": 18880
|
| 6619 |
+
},
|
| 6620 |
+
{
|
| 6621 |
+
"epoch": 0.036854497857466695,
|
| 6622 |
+
"grad_norm": 10.0625,
|
| 6623 |
+
"learning_rate": 0.0004940184566649393,
|
| 6624 |
+
"loss": 19.6797,
|
| 6625 |
+
"step": 18900
|
| 6626 |
+
},
|
| 6627 |
+
{
|
| 6628 |
+
"epoch": 0.03689349732609893,
|
| 6629 |
+
"grad_norm": 10.0,
|
| 6630 |
+
"learning_rate": 0.00049401195463403,
|
| 6631 |
+
"loss": 19.712,
|
| 6632 |
+
"step": 18920
|
| 6633 |
+
},
|
| 6634 |
+
{
|
| 6635 |
+
"epoch": 0.03693249679473117,
|
| 6636 |
+
"grad_norm": 10.25,
|
| 6637 |
+
"learning_rate": 0.0004940054526031206,
|
| 6638 |
+
"loss": 19.5788,
|
| 6639 |
+
"step": 18940
|
| 6640 |
+
},
|
| 6641 |
+
{
|
| 6642 |
+
"epoch": 0.036971496263363414,
|
| 6643 |
+
"grad_norm": 11.8125,
|
| 6644 |
+
"learning_rate": 0.0004939989505722113,
|
| 6645 |
+
"loss": 19.6803,
|
| 6646 |
+
"step": 18960
|
| 6647 |
+
},
|
| 6648 |
+
{
|
| 6649 |
+
"epoch": 0.03701049573199565,
|
| 6650 |
+
"grad_norm": 10.5,
|
| 6651 |
+
"learning_rate": 0.0004939924485413019,
|
| 6652 |
+
"loss": 19.705,
|
| 6653 |
+
"step": 18980
|
| 6654 |
+
},
|
| 6655 |
+
{
|
| 6656 |
+
"epoch": 0.03704949520062789,
|
| 6657 |
+
"grad_norm": 9.25,
|
| 6658 |
+
"learning_rate": 0.0004939859465103925,
|
| 6659 |
+
"loss": 19.6594,
|
| 6660 |
+
"step": 19000
|
| 6661 |
}
|
| 6662 |
],
|
| 6663 |
"logging_steps": 20,
|
|
|
|
| 6677 |
"attributes": {}
|
| 6678 |
}
|
| 6679 |
},
|
| 6680 |
+
"total_flos": 1.3968203395446604e+19,
|
| 6681 |
"train_batch_size": 48,
|
| 6682 |
"trial_name": null,
|
| 6683 |
"trial_params": null
|