Training in progress, step 16500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step16500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step16500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +203 -3
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84490a0ba5e68272e718e1b587f957c02192d8adb3af51f4ee9f3e16c57d8791
|
| 3 |
size 12017472
|
last-checkpoint/global_step16500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ecddbe1aff6e55389d660e2ef4b3e28b3ddd7f5e51d8ff2861b1f4dfb4439e89
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step16500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a3e559e7998fe0b3e9109e5a1fcca3316d4f86d200dd5db4d8ea7b7da1c59c0
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step16500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25f34d42adfb703043e77cccecb301eb61eb2fb980871ddcbacc2771de139a9c
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 16000,
|
| 3 |
"best_metric": 0.5378558039665222,
|
| 4 |
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-16000",
|
| 5 |
-
"epoch": 11.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6417,6 +6417,206 @@
|
|
| 6417 |
"eval_samples_per_second": 43.873,
|
| 6418 |
"eval_steps_per_second": 5.491,
|
| 6419 |
"step": 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6420 |
}
|
| 6421 |
],
|
| 6422 |
"logging_steps": 25,
|
|
@@ -6436,7 +6636,7 @@
|
|
| 6436 |
"attributes": {}
|
| 6437 |
}
|
| 6438 |
},
|
| 6439 |
-
"total_flos":
|
| 6440 |
"train_batch_size": 4,
|
| 6441 |
"trial_name": null,
|
| 6442 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 16000,
|
| 3 |
"best_metric": 0.5378558039665222,
|
| 4 |
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-16000",
|
| 5 |
+
"epoch": 11.991819669151063,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 16500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6417 |
"eval_samples_per_second": 43.873,
|
| 6418 |
"eval_steps_per_second": 5.491,
|
| 6419 |
"step": 16000
|
| 6420 |
+
},
|
| 6421 |
+
{
|
| 6422 |
+
"epoch": 11.646427922195965,
|
| 6423 |
+
"grad_norm": 0.8521804213523865,
|
| 6424 |
+
"learning_rate": 1.7368012049631344e-07,
|
| 6425 |
+
"loss": 0.5054,
|
| 6426 |
+
"mean_token_accuracy": 0.8433844763040542,
|
| 6427 |
+
"num_tokens": 352828019.0,
|
| 6428 |
+
"step": 16025
|
| 6429 |
+
},
|
| 6430 |
+
{
|
| 6431 |
+
"epoch": 11.664606435193601,
|
| 6432 |
+
"grad_norm": 0.8544715642929077,
|
| 6433 |
+
"learning_rate": 1.5631750216815733e-07,
|
| 6434 |
+
"loss": 0.5127,
|
| 6435 |
+
"mean_token_accuracy": 0.8406408429145813,
|
| 6436 |
+
"num_tokens": 353383848.0,
|
| 6437 |
+
"step": 16050
|
| 6438 |
+
},
|
| 6439 |
+
{
|
| 6440 |
+
"epoch": 11.682784948191237,
|
| 6441 |
+
"grad_norm": 0.8408353924751282,
|
| 6442 |
+
"learning_rate": 1.398673471025891e-07,
|
| 6443 |
+
"loss": 0.5119,
|
| 6444 |
+
"mean_token_accuracy": 0.8391477057337761,
|
| 6445 |
+
"num_tokens": 353947699.0,
|
| 6446 |
+
"step": 16075
|
| 6447 |
+
},
|
| 6448 |
+
{
|
| 6449 |
+
"epoch": 11.700963461188875,
|
| 6450 |
+
"grad_norm": 0.9233898520469666,
|
| 6451 |
+
"learning_rate": 1.2433003202588113e-07,
|
| 6452 |
+
"loss": 0.5064,
|
| 6453 |
+
"mean_token_accuracy": 0.8429214411973953,
|
| 6454 |
+
"num_tokens": 354490412.0,
|
| 6455 |
+
"step": 16100
|
| 6456 |
+
},
|
| 6457 |
+
{
|
| 6458 |
+
"epoch": 11.719141974186511,
|
| 6459 |
+
"grad_norm": 0.8499307036399841,
|
| 6460 |
+
"learning_rate": 1.0970591275929476e-07,
|
| 6461 |
+
"loss": 0.4985,
|
| 6462 |
+
"mean_token_accuracy": 0.8445121836662293,
|
| 6463 |
+
"num_tokens": 355031092.0,
|
| 6464 |
+
"step": 16125
|
| 6465 |
+
},
|
| 6466 |
+
{
|
| 6467 |
+
"epoch": 11.737320487184148,
|
| 6468 |
+
"grad_norm": 0.814639687538147,
|
| 6469 |
+
"learning_rate": 9.599532421092239e-08,
|
| 6470 |
+
"loss": 0.4986,
|
| 6471 |
+
"mean_token_accuracy": 0.8441943645477294,
|
| 6472 |
+
"num_tokens": 355575121.0,
|
| 6473 |
+
"step": 16150
|
| 6474 |
+
},
|
| 6475 |
+
{
|
| 6476 |
+
"epoch": 11.755499000181786,
|
| 6477 |
+
"grad_norm": 0.9159456491470337,
|
| 6478 |
+
"learning_rate": 8.31985803680091e-08,
|
| 6479 |
+
"loss": 0.5077,
|
| 6480 |
+
"mean_token_accuracy": 0.8415706121921539,
|
| 6481 |
+
"num_tokens": 356143674.0,
|
| 6482 |
+
"step": 16175
|
| 6483 |
+
},
|
| 6484 |
+
{
|
| 6485 |
+
"epoch": 11.773677513179422,
|
| 6486 |
+
"grad_norm": 0.9033915996551514,
|
| 6487 |
+
"learning_rate": 7.13159742897851e-08,
|
| 6488 |
+
"loss": 0.5093,
|
| 6489 |
+
"mean_token_accuracy": 0.8420270484685898,
|
| 6490 |
+
"num_tokens": 356687741.0,
|
| 6491 |
+
"step": 16200
|
| 6492 |
+
},
|
| 6493 |
+
{
|
| 6494 |
+
"epoch": 11.791856026177058,
|
| 6495 |
+
"grad_norm": 0.9012133479118347,
|
| 6496 |
+
"learning_rate": 6.034777810072445e-08,
|
| 6497 |
+
"loss": 0.5157,
|
| 6498 |
+
"mean_token_accuracy": 0.8388536632061004,
|
| 6499 |
+
"num_tokens": 357249933.0,
|
| 6500 |
+
"step": 16225
|
| 6501 |
+
},
|
| 6502 |
+
{
|
| 6503 |
+
"epoch": 11.810034539174696,
|
| 6504 |
+
"grad_norm": 0.8706744909286499,
|
| 6505 |
+
"learning_rate": 5.0294242984345595e-08,
|
| 6506 |
+
"loss": 0.5077,
|
| 6507 |
+
"mean_token_accuracy": 0.8408184266090393,
|
| 6508 |
+
"num_tokens": 357820486.0,
|
| 6509 |
+
"step": 16250
|
| 6510 |
+
},
|
| 6511 |
+
{
|
| 6512 |
+
"epoch": 11.810034539174696,
|
| 6513 |
+
"eval_loss": 0.5378739237785339,
|
| 6514 |
+
"eval_mean_token_accuracy": 0.8327080023054984,
|
| 6515 |
+
"eval_num_tokens": 357820486.0,
|
| 6516 |
+
"eval_runtime": 111.4072,
|
| 6517 |
+
"eval_samples_per_second": 43.893,
|
| 6518 |
+
"eval_steps_per_second": 5.493,
|
| 6519 |
+
"step": 16250
|
| 6520 |
+
},
|
| 6521 |
+
{
|
| 6522 |
+
"epoch": 11.828213052172332,
|
| 6523 |
+
"grad_norm": 0.8344452381134033,
|
| 6524 |
+
"learning_rate": 4.1155599177433725e-08,
|
| 6525 |
+
"loss": 0.5054,
|
| 6526 |
+
"mean_token_accuracy": 0.842702434360981,
|
| 6527 |
+
"num_tokens": 358376267.0,
|
| 6528 |
+
"step": 16275
|
| 6529 |
+
},
|
| 6530 |
+
{
|
| 6531 |
+
"epoch": 11.846391565169968,
|
| 6532 |
+
"grad_norm": 0.9577372074127197,
|
| 6533 |
+
"learning_rate": 3.293205596477833e-08,
|
| 6534 |
+
"loss": 0.5119,
|
| 6535 |
+
"mean_token_accuracy": 0.8426962018013,
|
| 6536 |
+
"num_tokens": 358925005.0,
|
| 6537 |
+
"step": 16300
|
| 6538 |
+
},
|
| 6539 |
+
{
|
| 6540 |
+
"epoch": 11.864570078167606,
|
| 6541 |
+
"grad_norm": 0.8274015188217163,
|
| 6542 |
+
"learning_rate": 2.5623801674381498e-08,
|
| 6543 |
+
"loss": 0.5108,
|
| 6544 |
+
"mean_token_accuracy": 0.8408406323194504,
|
| 6545 |
+
"num_tokens": 359493299.0,
|
| 6546 |
+
"step": 16325
|
| 6547 |
+
},
|
| 6548 |
+
{
|
| 6549 |
+
"epoch": 11.882748591165242,
|
| 6550 |
+
"grad_norm": 0.8797160387039185,
|
| 6551 |
+
"learning_rate": 1.9231003673145788e-08,
|
| 6552 |
+
"loss": 0.5047,
|
| 6553 |
+
"mean_token_accuracy": 0.842636145055294,
|
| 6554 |
+
"num_tokens": 360053289.0,
|
| 6555 |
+
"step": 16350
|
| 6556 |
+
},
|
| 6557 |
+
{
|
| 6558 |
+
"epoch": 11.900927104162879,
|
| 6559 |
+
"grad_norm": 0.8720398545265198,
|
| 6560 |
+
"learning_rate": 1.375380836302398e-08,
|
| 6561 |
+
"loss": 0.5006,
|
| 6562 |
+
"mean_token_accuracy": 0.8434941950440407,
|
| 6563 |
+
"num_tokens": 360612686.0,
|
| 6564 |
+
"step": 16375
|
| 6565 |
+
},
|
| 6566 |
+
{
|
| 6567 |
+
"epoch": 11.919105617160517,
|
| 6568 |
+
"grad_norm": 0.951812207698822,
|
| 6569 |
+
"learning_rate": 9.192341177697295e-09,
|
| 6570 |
+
"loss": 0.5031,
|
| 6571 |
+
"mean_token_accuracy": 0.8426809054613114,
|
| 6572 |
+
"num_tokens": 361163960.0,
|
| 6573 |
+
"step": 16400
|
| 6574 |
+
},
|
| 6575 |
+
{
|
| 6576 |
+
"epoch": 11.937284130158153,
|
| 6577 |
+
"grad_norm": 0.8881998062133789,
|
| 6578 |
+
"learning_rate": 5.546706579679928e-09,
|
| 6579 |
+
"loss": 0.5146,
|
| 6580 |
+
"mean_token_accuracy": 0.8401396802067757,
|
| 6581 |
+
"num_tokens": 361709569.0,
|
| 6582 |
+
"step": 16425
|
| 6583 |
+
},
|
| 6584 |
+
{
|
| 6585 |
+
"epoch": 11.95546264315579,
|
| 6586 |
+
"grad_norm": 0.864645779132843,
|
| 6587 |
+
"learning_rate": 2.816988057929848e-09,
|
| 6588 |
+
"loss": 0.516,
|
| 6589 |
+
"mean_token_accuracy": 0.838928511440754,
|
| 6590 |
+
"num_tokens": 362266243.0,
|
| 6591 |
+
"step": 16450
|
| 6592 |
+
},
|
| 6593 |
+
{
|
| 6594 |
+
"epoch": 11.973641156153427,
|
| 6595 |
+
"grad_norm": 0.8667661547660828,
|
| 6596 |
+
"learning_rate": 1.0032481259436566e-09,
|
| 6597 |
+
"loss": 0.5195,
|
| 6598 |
+
"mean_token_accuracy": 0.8384470102190972,
|
| 6599 |
+
"num_tokens": 362824221.0,
|
| 6600 |
+
"step": 16475
|
| 6601 |
+
},
|
| 6602 |
+
{
|
| 6603 |
+
"epoch": 11.991819669151063,
|
| 6604 |
+
"grad_norm": 0.8674113154411316,
|
| 6605 |
+
"learning_rate": 1.0552832032217908e-10,
|
| 6606 |
+
"loss": 0.508,
|
| 6607 |
+
"mean_token_accuracy": 0.8425227817893028,
|
| 6608 |
+
"num_tokens": 363367808.0,
|
| 6609 |
+
"step": 16500
|
| 6610 |
+
},
|
| 6611 |
+
{
|
| 6612 |
+
"epoch": 11.991819669151063,
|
| 6613 |
+
"eval_loss": 0.5378732681274414,
|
| 6614 |
+
"eval_mean_token_accuracy": 0.8326478998840244,
|
| 6615 |
+
"eval_num_tokens": 363367808.0,
|
| 6616 |
+
"eval_runtime": 111.938,
|
| 6617 |
+
"eval_samples_per_second": 43.685,
|
| 6618 |
+
"eval_steps_per_second": 5.467,
|
| 6619 |
+
"step": 16500
|
| 6620 |
}
|
| 6621 |
],
|
| 6622 |
"logging_steps": 25,
|
|
|
|
| 6636 |
"attributes": {}
|
| 6637 |
}
|
| 6638 |
},
|
| 6639 |
+
"total_flos": 9.164882254873231e+17,
|
| 6640 |
"train_batch_size": 4,
|
| 6641 |
"trial_name": null,
|
| 6642 |
"trial_params": null
|