Training in progress, epoch 0, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step3050/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step3050/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step3050/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step3050/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step3050/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +316 -4
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1037269336
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f371c80024225652f3c86237652abc582dcf0f83241d7917c781b6818ee9f107
|
| 3 |
size 1037269336
|
last-checkpoint/global_step3050/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9702b049d245d16c002c1456338f72710c06f952b0478544008ad989a5de7e07
|
| 3 |
+
size 781993445
|
last-checkpoint/global_step3050/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4294cab9b940c0087225b9c7649512d19f24fa6a3f5f01b7538b149f5d7be8ab
|
| 3 |
+
size 781993509
|
last-checkpoint/global_step3050/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11d7132340fd716524b3218b067e868d891edb7ac95bd8b57e53e48ee29e0838
|
| 3 |
+
size 781993509
|
last-checkpoint/global_step3050/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6c4a5a81b58fe1331524664aed42e5082b13c66d3fadfb6645ff1316cf85beb
|
| 3 |
+
size 781993509
|
last-checkpoint/global_step3050/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2d5e8ed0bb6d30443a8b0f9ac2b192e359d428409f7528391fcd07030056005
|
| 3 |
+
size 2610290277
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step3050
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e467591174f4d5f061fdc6867a8959bae4dd3ff9f561e079a51d1986c3871bef
|
| 3 |
size 15429
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2af72cce7586fc024c88a31600f7b9bd8f97fac8953bf342b40bab89d92f4d3d
|
| 3 |
size 15429
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:267b8e17d32bdaab462ce2a11855474cca07a7c3d899baff6bd1f852d0f4b42e
|
| 3 |
size 15429
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69cff40eb67e607ef56c9df4fce05c9d4f61aef835fb92458f77bb2b8ff22109
|
| 3 |
size 15429
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1401
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e4f4eb8437c35cc3bc21ff2f135541f1bea2ca5b0d67f12d8ea935606929e82
|
| 3 |
size 1401
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"best_global_step": null,
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 50,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4454,6 +4454,318 @@
|
|
| 4454 |
"eval_samples_per_second": 173.899,
|
| 4455 |
"eval_steps_per_second": 10.905,
|
| 4456 |
"step": 2850
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4457 |
}
|
| 4458 |
],
|
| 4459 |
"logging_steps": 5,
|
|
@@ -4482,7 +4794,7 @@
|
|
| 4482 |
"attributes": {}
|
| 4483 |
}
|
| 4484 |
},
|
| 4485 |
-
"total_flos": 7.
|
| 4486 |
"train_batch_size": 4,
|
| 4487 |
"trial_name": null,
|
| 4488 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_global_step": null,
|
| 3 |
+
"best_metric": 2.019763946533203,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.44337839802296847,
|
| 6 |
"eval_steps": 50,
|
| 7 |
+
"global_step": 3050,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4454 |
"eval_samples_per_second": 173.899,
|
| 4455 |
"eval_steps_per_second": 10.905,
|
| 4456 |
"step": 2850
|
| 4457 |
+
},
|
| 4458 |
+
{
|
| 4459 |
+
"epoch": 0.4150312545428115,
|
| 4460 |
+
"grad_norm": 2.5634236335754395,
|
| 4461 |
+
"learning_rate": 8.225223779709644e-05,
|
| 4462 |
+
"loss": 2.143,
|
| 4463 |
+
"step": 2855
|
| 4464 |
+
},
|
| 4465 |
+
{
|
| 4466 |
+
"epoch": 0.415758104375636,
|
| 4467 |
+
"grad_norm": 2.3738303184509277,
|
| 4468 |
+
"learning_rate": 8.219124026368057e-05,
|
| 4469 |
+
"loss": 2.2716,
|
| 4470 |
+
"step": 2860
|
| 4471 |
+
},
|
| 4472 |
+
{
|
| 4473 |
+
"epoch": 0.41648495420846054,
|
| 4474 |
+
"grad_norm": 2.5236852169036865,
|
| 4475 |
+
"learning_rate": 8.213016193649599e-05,
|
| 4476 |
+
"loss": 2.1629,
|
| 4477 |
+
"step": 2865
|
| 4478 |
+
},
|
| 4479 |
+
{
|
| 4480 |
+
"epoch": 0.41721180404128505,
|
| 4481 |
+
"grad_norm": 2.6418023109436035,
|
| 4482 |
+
"learning_rate": 8.206900296930731e-05,
|
| 4483 |
+
"loss": 2.2035,
|
| 4484 |
+
"step": 2870
|
| 4485 |
+
},
|
| 4486 |
+
{
|
| 4487 |
+
"epoch": 0.4179386538741096,
|
| 4488 |
+
"grad_norm": 2.70849609375,
|
| 4489 |
+
"learning_rate": 8.200776351608213e-05,
|
| 4490 |
+
"loss": 2.1833,
|
| 4491 |
+
"step": 2875
|
| 4492 |
+
},
|
| 4493 |
+
{
|
| 4494 |
+
"epoch": 0.4186655037069342,
|
| 4495 |
+
"grad_norm": 2.2143619060516357,
|
| 4496 |
+
"learning_rate": 8.194644373099076e-05,
|
| 4497 |
+
"loss": 2.1152,
|
| 4498 |
+
"step": 2880
|
| 4499 |
+
},
|
| 4500 |
+
{
|
| 4501 |
+
"epoch": 0.4193923535397587,
|
| 4502 |
+
"grad_norm": 2.5225560665130615,
|
| 4503 |
+
"learning_rate": 8.18850437684056e-05,
|
| 4504 |
+
"loss": 2.2915,
|
| 4505 |
+
"step": 2885
|
| 4506 |
+
},
|
| 4507 |
+
{
|
| 4508 |
+
"epoch": 0.42011920337258324,
|
| 4509 |
+
"grad_norm": 2.643038511276245,
|
| 4510 |
+
"learning_rate": 8.182356378290107e-05,
|
| 4511 |
+
"loss": 2.2131,
|
| 4512 |
+
"step": 2890
|
| 4513 |
+
},
|
| 4514 |
+
{
|
| 4515 |
+
"epoch": 0.42084605320540774,
|
| 4516 |
+
"grad_norm": 2.9499423503875732,
|
| 4517 |
+
"learning_rate": 8.17620039292529e-05,
|
| 4518 |
+
"loss": 2.2959,
|
| 4519 |
+
"step": 2895
|
| 4520 |
+
},
|
| 4521 |
+
{
|
| 4522 |
+
"epoch": 0.4215729030382323,
|
| 4523 |
+
"grad_norm": 2.53491473197937,
|
| 4524 |
+
"learning_rate": 8.170036436243797e-05,
|
| 4525 |
+
"loss": 2.1247,
|
| 4526 |
+
"step": 2900
|
| 4527 |
+
},
|
| 4528 |
+
{
|
| 4529 |
+
"epoch": 0.4215729030382323,
|
| 4530 |
+
"eval_loss": 2.030867576599121,
|
| 4531 |
+
"eval_runtime": 21.6628,
|
| 4532 |
+
"eval_samples_per_second": 152.381,
|
| 4533 |
+
"eval_steps_per_second": 9.556,
|
| 4534 |
+
"step": 2900
|
| 4535 |
+
},
|
| 4536 |
+
{
|
| 4537 |
+
"epoch": 0.42229975287105687,
|
| 4538 |
+
"grad_norm": 2.75742769241333,
|
| 4539 |
+
"learning_rate": 8.163864523763382e-05,
|
| 4540 |
+
"loss": 1.9965,
|
| 4541 |
+
"step": 2905
|
| 4542 |
+
},
|
| 4543 |
+
{
|
| 4544 |
+
"epoch": 0.42302660270388137,
|
| 4545 |
+
"grad_norm": 4.27183198928833,
|
| 4546 |
+
"learning_rate": 8.157684671021828e-05,
|
| 4547 |
+
"loss": 2.1029,
|
| 4548 |
+
"step": 2910
|
| 4549 |
+
},
|
| 4550 |
+
{
|
| 4551 |
+
"epoch": 0.42375345253670593,
|
| 4552 |
+
"grad_norm": 2.9568264484405518,
|
| 4553 |
+
"learning_rate": 8.151496893576904e-05,
|
| 4554 |
+
"loss": 2.2166,
|
| 4555 |
+
"step": 2915
|
| 4556 |
+
},
|
| 4557 |
+
{
|
| 4558 |
+
"epoch": 0.42448030236953044,
|
| 4559 |
+
"grad_norm": 2.716278314590454,
|
| 4560 |
+
"learning_rate": 8.145301207006335e-05,
|
| 4561 |
+
"loss": 2.1629,
|
| 4562 |
+
"step": 2920
|
| 4563 |
+
},
|
| 4564 |
+
{
|
| 4565 |
+
"epoch": 0.425207152202355,
|
| 4566 |
+
"grad_norm": 2.635277032852173,
|
| 4567 |
+
"learning_rate": 8.139097626907753e-05,
|
| 4568 |
+
"loss": 2.2077,
|
| 4569 |
+
"step": 2925
|
| 4570 |
+
},
|
| 4571 |
+
{
|
| 4572 |
+
"epoch": 0.4259340020351795,
|
| 4573 |
+
"grad_norm": 2.677725076675415,
|
| 4574 |
+
"learning_rate": 8.132886168898666e-05,
|
| 4575 |
+
"loss": 2.2313,
|
| 4576 |
+
"step": 2930
|
| 4577 |
+
},
|
| 4578 |
+
{
|
| 4579 |
+
"epoch": 0.42666085186800407,
|
| 4580 |
+
"grad_norm": 2.510044813156128,
|
| 4581 |
+
"learning_rate": 8.12666684861641e-05,
|
| 4582 |
+
"loss": 2.03,
|
| 4583 |
+
"step": 2935
|
| 4584 |
+
},
|
| 4585 |
+
{
|
| 4586 |
+
"epoch": 0.4273877017008286,
|
| 4587 |
+
"grad_norm": 2.279388904571533,
|
| 4588 |
+
"learning_rate": 8.120439681718117e-05,
|
| 4589 |
+
"loss": 2.1885,
|
| 4590 |
+
"step": 2940
|
| 4591 |
+
},
|
| 4592 |
+
{
|
| 4593 |
+
"epoch": 0.42811455153365313,
|
| 4594 |
+
"grad_norm": 2.61489200592041,
|
| 4595 |
+
"learning_rate": 8.114204683880671e-05,
|
| 4596 |
+
"loss": 2.2475,
|
| 4597 |
+
"step": 2945
|
| 4598 |
+
},
|
| 4599 |
+
{
|
| 4600 |
+
"epoch": 0.4288414013664777,
|
| 4601 |
+
"grad_norm": 2.7564356327056885,
|
| 4602 |
+
"learning_rate": 8.107961870800672e-05,
|
| 4603 |
+
"loss": 2.2717,
|
| 4604 |
+
"step": 2950
|
| 4605 |
+
},
|
| 4606 |
+
{
|
| 4607 |
+
"epoch": 0.4288414013664777,
|
| 4608 |
+
"eval_loss": 2.0410735607147217,
|
| 4609 |
+
"eval_runtime": 19.0203,
|
| 4610 |
+
"eval_samples_per_second": 173.552,
|
| 4611 |
+
"eval_steps_per_second": 10.883,
|
| 4612 |
+
"step": 2950
|
| 4613 |
+
},
|
| 4614 |
+
{
|
| 4615 |
+
"epoch": 0.4295682511993022,
|
| 4616 |
+
"grad_norm": 2.354588270187378,
|
| 4617 |
+
"learning_rate": 8.101711258194397e-05,
|
| 4618 |
+
"loss": 2.0337,
|
| 4619 |
+
"step": 2955
|
| 4620 |
+
},
|
| 4621 |
+
{
|
| 4622 |
+
"epoch": 0.43029510103212676,
|
| 4623 |
+
"grad_norm": 2.4436914920806885,
|
| 4624 |
+
"learning_rate": 8.095452861797751e-05,
|
| 4625 |
+
"loss": 2.0731,
|
| 4626 |
+
"step": 2960
|
| 4627 |
+
},
|
| 4628 |
+
{
|
| 4629 |
+
"epoch": 0.4310219508649513,
|
| 4630 |
+
"grad_norm": 2.4441328048706055,
|
| 4631 |
+
"learning_rate": 8.089186697366247e-05,
|
| 4632 |
+
"loss": 2.0913,
|
| 4633 |
+
"step": 2965
|
| 4634 |
+
},
|
| 4635 |
+
{
|
| 4636 |
+
"epoch": 0.4317488006977758,
|
| 4637 |
+
"grad_norm": 2.439755916595459,
|
| 4638 |
+
"learning_rate": 8.082912780674939e-05,
|
| 4639 |
+
"loss": 1.9794,
|
| 4640 |
+
"step": 2970
|
| 4641 |
+
},
|
| 4642 |
+
{
|
| 4643 |
+
"epoch": 0.4324756505306004,
|
| 4644 |
+
"grad_norm": 3.0894908905029297,
|
| 4645 |
+
"learning_rate": 8.076631127518407e-05,
|
| 4646 |
+
"loss": 2.2068,
|
| 4647 |
+
"step": 2975
|
| 4648 |
+
},
|
| 4649 |
+
{
|
| 4650 |
+
"epoch": 0.4332025003634249,
|
| 4651 |
+
"grad_norm": 2.3073198795318604,
|
| 4652 |
+
"learning_rate": 8.070341753710708e-05,
|
| 4653 |
+
"loss": 2.153,
|
| 4654 |
+
"step": 2980
|
| 4655 |
+
},
|
| 4656 |
+
{
|
| 4657 |
+
"epoch": 0.43392935019624945,
|
| 4658 |
+
"grad_norm": 2.387176513671875,
|
| 4659 |
+
"learning_rate": 8.06404467508533e-05,
|
| 4660 |
+
"loss": 2.0941,
|
| 4661 |
+
"step": 2985
|
| 4662 |
+
},
|
| 4663 |
+
{
|
| 4664 |
+
"epoch": 0.434656200029074,
|
| 4665 |
+
"grad_norm": 2.364358425140381,
|
| 4666 |
+
"learning_rate": 8.057739907495163e-05,
|
| 4667 |
+
"loss": 2.1182,
|
| 4668 |
+
"step": 2990
|
| 4669 |
+
},
|
| 4670 |
+
{
|
| 4671 |
+
"epoch": 0.4353830498618985,
|
| 4672 |
+
"grad_norm": 2.8649942874908447,
|
| 4673 |
+
"learning_rate": 8.05142746681245e-05,
|
| 4674 |
+
"loss": 2.0715,
|
| 4675 |
+
"step": 2995
|
| 4676 |
+
},
|
| 4677 |
+
{
|
| 4678 |
+
"epoch": 0.4361098996947231,
|
| 4679 |
+
"grad_norm": 2.504004716873169,
|
| 4680 |
+
"learning_rate": 8.045107368928755e-05,
|
| 4681 |
+
"loss": 2.183,
|
| 4682 |
+
"step": 3000
|
| 4683 |
+
},
|
| 4684 |
+
{
|
| 4685 |
+
"epoch": 0.4361098996947231,
|
| 4686 |
+
"eval_loss": 2.0339367389678955,
|
| 4687 |
+
"eval_runtime": 19.159,
|
| 4688 |
+
"eval_samples_per_second": 172.295,
|
| 4689 |
+
"eval_steps_per_second": 10.804,
|
| 4690 |
+
"step": 3000
|
| 4691 |
+
},
|
| 4692 |
+
{
|
| 4693 |
+
"epoch": 0.4368367495275476,
|
| 4694 |
+
"grad_norm": 2.4988174438476562,
|
| 4695 |
+
"learning_rate": 8.038779629754915e-05,
|
| 4696 |
+
"loss": 2.1443,
|
| 4697 |
+
"step": 3005
|
| 4698 |
+
},
|
| 4699 |
+
{
|
| 4700 |
+
"epoch": 0.43756359936037215,
|
| 4701 |
+
"grad_norm": 2.5082359313964844,
|
| 4702 |
+
"learning_rate": 8.032444265221006e-05,
|
| 4703 |
+
"loss": 2.0544,
|
| 4704 |
+
"step": 3010
|
| 4705 |
+
},
|
| 4706 |
+
{
|
| 4707 |
+
"epoch": 0.4382904491931967,
|
| 4708 |
+
"grad_norm": 2.3334364891052246,
|
| 4709 |
+
"learning_rate": 8.026101291276302e-05,
|
| 4710 |
+
"loss": 2.1904,
|
| 4711 |
+
"step": 3015
|
| 4712 |
+
},
|
| 4713 |
+
{
|
| 4714 |
+
"epoch": 0.4390172990260212,
|
| 4715 |
+
"grad_norm": 2.405759572982788,
|
| 4716 |
+
"learning_rate": 8.019750723889232e-05,
|
| 4717 |
+
"loss": 2.0836,
|
| 4718 |
+
"step": 3020
|
| 4719 |
+
},
|
| 4720 |
+
{
|
| 4721 |
+
"epoch": 0.4397441488588458,
|
| 4722 |
+
"grad_norm": 2.2676541805267334,
|
| 4723 |
+
"learning_rate": 8.013392579047339e-05,
|
| 4724 |
+
"loss": 2.1745,
|
| 4725 |
+
"step": 3025
|
| 4726 |
+
},
|
| 4727 |
+
{
|
| 4728 |
+
"epoch": 0.4404709986916703,
|
| 4729 |
+
"grad_norm": 2.144158124923706,
|
| 4730 |
+
"learning_rate": 8.00702687275725e-05,
|
| 4731 |
+
"loss": 2.2107,
|
| 4732 |
+
"step": 3030
|
| 4733 |
+
},
|
| 4734 |
+
{
|
| 4735 |
+
"epoch": 0.44119784852449484,
|
| 4736 |
+
"grad_norm": 2.9987900257110596,
|
| 4737 |
+
"learning_rate": 8.000653621044621e-05,
|
| 4738 |
+
"loss": 2.1826,
|
| 4739 |
+
"step": 3035
|
| 4740 |
+
},
|
| 4741 |
+
{
|
| 4742 |
+
"epoch": 0.4419246983573194,
|
| 4743 |
+
"grad_norm": 2.3955495357513428,
|
| 4744 |
+
"learning_rate": 7.994272839954103e-05,
|
| 4745 |
+
"loss": 2.1445,
|
| 4746 |
+
"step": 3040
|
| 4747 |
+
},
|
| 4748 |
+
{
|
| 4749 |
+
"epoch": 0.4426515481901439,
|
| 4750 |
+
"grad_norm": 3.0471301078796387,
|
| 4751 |
+
"learning_rate": 7.987884545549309e-05,
|
| 4752 |
+
"loss": 2.1338,
|
| 4753 |
+
"step": 3045
|
| 4754 |
+
},
|
| 4755 |
+
{
|
| 4756 |
+
"epoch": 0.44337839802296847,
|
| 4757 |
+
"grad_norm": 2.6408660411834717,
|
| 4758 |
+
"learning_rate": 7.981488753912759e-05,
|
| 4759 |
+
"loss": 2.1363,
|
| 4760 |
+
"step": 3050
|
| 4761 |
+
},
|
| 4762 |
+
{
|
| 4763 |
+
"epoch": 0.44337839802296847,
|
| 4764 |
+
"eval_loss": 2.019763946533203,
|
| 4765 |
+
"eval_runtime": 18.8959,
|
| 4766 |
+
"eval_samples_per_second": 174.694,
|
| 4767 |
+
"eval_steps_per_second": 10.955,
|
| 4768 |
+
"step": 3050
|
| 4769 |
}
|
| 4770 |
],
|
| 4771 |
"logging_steps": 5,
|
|
|
|
| 4794 |
"attributes": {}
|
| 4795 |
}
|
| 4796 |
},
|
| 4797 |
+
"total_flos": 7.952395068977971e+17,
|
| 4798 |
"train_batch_size": 4,
|
| 4799 |
"trial_name": null,
|
| 4800 |
"trial_params": null
|