Training in progress, step 121000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71fae22dcd21758bd18c93255be6587d157b9938e670e9b4e1e58707f826293b
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c78fd0c407d20f07636b49b2421a64b67521b73a2c07508922e8bab006631080
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d05682589c4464dbd9ebcfc283944f7611626ce7745ad85f4042e5c5171b5198
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e5601bca8adb9619336ad1a8f8dd5a3bb4b196a7ee7870568f8cb821d9554477
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -21368,11 +21368,189 @@
|
|
| 21368 |
"eval_steps_per_second": 15.087,
|
| 21369 |
"num_input_tokens_seen": 62904447680,
|
| 21370 |
"step": 120000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21371 |
}
|
| 21372 |
],
|
| 21373 |
"logging_steps": 50,
|
| 21374 |
"max_steps": 140000,
|
| 21375 |
-
"num_input_tokens_seen":
|
| 21376 |
"num_train_epochs": 2,
|
| 21377 |
"save_steps": 1000,
|
| 21378 |
"stateful_callbacks": {
|
|
@@ -21387,7 +21565,7 @@
|
|
| 21387 |
"attributes": {}
|
| 21388 |
}
|
| 21389 |
},
|
| 21390 |
-
"total_flos": 1.
|
| 21391 |
"train_batch_size": 32,
|
| 21392 |
"trial_name": null,
|
| 21393 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1543484741882013,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 121000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 21368 |
"eval_steps_per_second": 15.087,
|
| 21369 |
"num_input_tokens_seen": 62904447680,
|
| 21370 |
"step": 120000
|
| 21371 |
+
},
|
| 21372 |
+
{
|
| 21373 |
+
"epoch": 1.1452854263806242,
|
| 21374 |
+
"grad_norm": 0.15100175142288208,
|
| 21375 |
+
"learning_rate": 0.0008095469746549171,
|
| 21376 |
+
"loss": 2.0793,
|
| 21377 |
+
"num_input_tokens_seen": 62930656352,
|
| 21378 |
+
"step": 120050
|
| 21379 |
+
},
|
| 21380 |
+
{
|
| 21381 |
+
"epoch": 1.1457624288968125,
|
| 21382 |
+
"grad_norm": 0.14095434546470642,
|
| 21383 |
+
"learning_rate": 0.0008073393063582386,
|
| 21384 |
+
"loss": 2.0828,
|
| 21385 |
+
"num_input_tokens_seen": 62956868576,
|
| 21386 |
+
"step": 120100
|
| 21387 |
+
},
|
| 21388 |
+
{
|
| 21389 |
+
"epoch": 1.1462394314130007,
|
| 21390 |
+
"grad_norm": 0.15013264119625092,
|
| 21391 |
+
"learning_rate": 0.0008051219655187818,
|
| 21392 |
+
"loss": 2.0711,
|
| 21393 |
+
"num_input_tokens_seen": 62983080544,
|
| 21394 |
+
"step": 120150
|
| 21395 |
+
},
|
| 21396 |
+
{
|
| 21397 |
+
"epoch": 1.146716433929189,
|
| 21398 |
+
"grad_norm": 0.1443673074245453,
|
| 21399 |
+
"learning_rate": 0.00080289502192041,
|
| 21400 |
+
"loss": 2.0764,
|
| 21401 |
+
"num_input_tokens_seen": 63009276608,
|
| 21402 |
+
"step": 120200
|
| 21403 |
+
},
|
| 21404 |
+
{
|
| 21405 |
+
"epoch": 1.1471934364453773,
|
| 21406 |
+
"grad_norm": 0.13627703487873077,
|
| 21407 |
+
"learning_rate": 0.0008006585456492029,
|
| 21408 |
+
"loss": 2.0805,
|
| 21409 |
+
"num_input_tokens_seen": 63035488032,
|
| 21410 |
+
"step": 120250
|
| 21411 |
+
},
|
| 21412 |
+
{
|
| 21413 |
+
"epoch": 1.1476704389615655,
|
| 21414 |
+
"grad_norm": 0.14744721353054047,
|
| 21415 |
+
"learning_rate": 0.0007984126070912518,
|
| 21416 |
+
"loss": 2.0691,
|
| 21417 |
+
"num_input_tokens_seen": 63061701600,
|
| 21418 |
+
"step": 120300
|
| 21419 |
+
},
|
| 21420 |
+
{
|
| 21421 |
+
"epoch": 1.1481474414777537,
|
| 21422 |
+
"grad_norm": 0.14301970601081848,
|
| 21423 |
+
"learning_rate": 0.0007961572769304437,
|
| 21424 |
+
"loss": 2.0788,
|
| 21425 |
+
"num_input_tokens_seen": 63087914624,
|
| 21426 |
+
"step": 120350
|
| 21427 |
+
},
|
| 21428 |
+
{
|
| 21429 |
+
"epoch": 1.1486244439939421,
|
| 21430 |
+
"grad_norm": 0.13261480629444122,
|
| 21431 |
+
"learning_rate": 0.0007938926261462366,
|
| 21432 |
+
"loss": 2.0802,
|
| 21433 |
+
"num_input_tokens_seen": 63114128096,
|
| 21434 |
+
"step": 120400
|
| 21435 |
+
},
|
| 21436 |
+
{
|
| 21437 |
+
"epoch": 1.1491014465101304,
|
| 21438 |
+
"grad_norm": 0.14857733249664307,
|
| 21439 |
+
"learning_rate": 0.0007916187260114262,
|
| 21440 |
+
"loss": 2.0773,
|
| 21441 |
+
"num_input_tokens_seen": 63140341024,
|
| 21442 |
+
"step": 120450
|
| 21443 |
+
},
|
| 21444 |
+
{
|
| 21445 |
+
"epoch": 1.1495784490263186,
|
| 21446 |
+
"grad_norm": 0.13263733685016632,
|
| 21447 |
+
"learning_rate": 0.000789335648089903,
|
| 21448 |
+
"loss": 2.0796,
|
| 21449 |
+
"num_input_tokens_seen": 63166554368,
|
| 21450 |
+
"step": 120500
|
| 21451 |
+
},
|
| 21452 |
+
{
|
| 21453 |
+
"epoch": 1.1495784490263186,
|
| 21454 |
+
"eval_loss": 1.9961134195327759,
|
| 21455 |
+
"eval_runtime": 82.5305,
|
| 21456 |
+
"eval_samples_per_second": 60.584,
|
| 21457 |
+
"eval_steps_per_second": 15.146,
|
| 21458 |
+
"num_input_tokens_seen": 63166554368,
|
| 21459 |
+
"step": 120500
|
| 21460 |
+
},
|
| 21461 |
+
{
|
| 21462 |
+
"epoch": 1.150055451542507,
|
| 21463 |
+
"grad_norm": 0.13879702985286713,
|
| 21464 |
+
"learning_rate": 0.0007870434642343984,
|
| 21465 |
+
"loss": 2.0783,
|
| 21466 |
+
"num_input_tokens_seen": 63192764288,
|
| 21467 |
+
"step": 120550
|
| 21468 |
+
},
|
| 21469 |
+
{
|
| 21470 |
+
"epoch": 1.1505324540586952,
|
| 21471 |
+
"grad_norm": 0.13164860010147095,
|
| 21472 |
+
"learning_rate": 0.000784742246584226,
|
| 21473 |
+
"loss": 2.081,
|
| 21474 |
+
"num_input_tokens_seen": 63218969504,
|
| 21475 |
+
"step": 120600
|
| 21476 |
+
},
|
| 21477 |
+
{
|
| 21478 |
+
"epoch": 1.1510094565748834,
|
| 21479 |
+
"grad_norm": 0.1406654268503189,
|
| 21480 |
+
"learning_rate": 0.0007824320675630089,
|
| 21481 |
+
"loss": 2.0704,
|
| 21482 |
+
"num_input_tokens_seen": 63245179680,
|
| 21483 |
+
"step": 120650
|
| 21484 |
+
},
|
| 21485 |
+
{
|
| 21486 |
+
"epoch": 1.1514864590910716,
|
| 21487 |
+
"grad_norm": 0.13722951710224152,
|
| 21488 |
+
"learning_rate": 0.0007801129998764014,
|
| 21489 |
+
"loss": 2.0693,
|
| 21490 |
+
"num_input_tokens_seen": 63271389024,
|
| 21491 |
+
"step": 120700
|
| 21492 |
+
},
|
| 21493 |
+
{
|
| 21494 |
+
"epoch": 1.15196346160726,
|
| 21495 |
+
"grad_norm": 0.15168820321559906,
|
| 21496 |
+
"learning_rate": 0.0007777851165098011,
|
| 21497 |
+
"loss": 2.0813,
|
| 21498 |
+
"num_input_tokens_seen": 63297594624,
|
| 21499 |
+
"step": 120750
|
| 21500 |
+
},
|
| 21501 |
+
{
|
| 21502 |
+
"epoch": 1.1524404641234482,
|
| 21503 |
+
"grad_norm": 0.13907547295093536,
|
| 21504 |
+
"learning_rate": 0.0007754484907260512,
|
| 21505 |
+
"loss": 2.0747,
|
| 21506 |
+
"num_input_tokens_seen": 63323809024,
|
| 21507 |
+
"step": 120800
|
| 21508 |
+
},
|
| 21509 |
+
{
|
| 21510 |
+
"epoch": 1.1529174666396365,
|
| 21511 |
+
"grad_norm": 0.13827022910118103,
|
| 21512 |
+
"learning_rate": 0.0007731031960631354,
|
| 21513 |
+
"loss": 2.079,
|
| 21514 |
+
"num_input_tokens_seen": 63350015808,
|
| 21515 |
+
"step": 120850
|
| 21516 |
+
},
|
| 21517 |
+
{
|
| 21518 |
+
"epoch": 1.1533944691558249,
|
| 21519 |
+
"grad_norm": 0.1326221376657486,
|
| 21520 |
+
"learning_rate": 0.0007707493063318629,
|
| 21521 |
+
"loss": 2.0856,
|
| 21522 |
+
"num_input_tokens_seen": 63376227968,
|
| 21523 |
+
"step": 120900
|
| 21524 |
+
},
|
| 21525 |
+
{
|
| 21526 |
+
"epoch": 1.153871471672013,
|
| 21527 |
+
"grad_norm": 0.13669894635677338,
|
| 21528 |
+
"learning_rate": 0.000768386895613546,
|
| 21529 |
+
"loss": 2.0691,
|
| 21530 |
+
"num_input_tokens_seen": 63402433504,
|
| 21531 |
+
"step": 120950
|
| 21532 |
+
},
|
| 21533 |
+
{
|
| 21534 |
+
"epoch": 1.1543484741882013,
|
| 21535 |
+
"grad_norm": 0.1403321623802185,
|
| 21536 |
+
"learning_rate": 0.0007660160382576683,
|
| 21537 |
+
"loss": 2.077,
|
| 21538 |
+
"num_input_tokens_seen": 63428647904,
|
| 21539 |
+
"step": 121000
|
| 21540 |
+
},
|
| 21541 |
+
{
|
| 21542 |
+
"epoch": 1.1543484741882013,
|
| 21543 |
+
"eval_loss": 1.9939944744110107,
|
| 21544 |
+
"eval_runtime": 82.7663,
|
| 21545 |
+
"eval_samples_per_second": 60.411,
|
| 21546 |
+
"eval_steps_per_second": 15.103,
|
| 21547 |
+
"num_input_tokens_seen": 63428647904,
|
| 21548 |
+
"step": 121000
|
| 21549 |
}
|
| 21550 |
],
|
| 21551 |
"logging_steps": 50,
|
| 21552 |
"max_steps": 140000,
|
| 21553 |
+
"num_input_tokens_seen": 63428647904,
|
| 21554 |
"num_train_epochs": 2,
|
| 21555 |
"save_steps": 1000,
|
| 21556 |
"stateful_callbacks": {
|
|
|
|
| 21565 |
"attributes": {}
|
| 21566 |
}
|
| 21567 |
},
|
| 21568 |
+
"total_flos": 1.1225713740470231e+20,
|
| 21569 |
"train_batch_size": 32,
|
| 21570 |
"trial_name": null,
|
| 21571 |
"trial_params": null
|