Training in progress, step 11500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step11500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step11500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d739a46be07afc08058bcee6abb1772a84e044deaf39817666f3049bcf653c23
|
| 3 |
size 12017472
|
last-checkpoint/global_step11500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d600239e729c4ded64651931f6cb684445e13a28cc9c8180766ac66dd15525f4
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step11500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f2820de752ec7bbd3367caf1e9cd13773cad956bad3832cf6ffd7af2da666c1
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step11500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d17e6956d333adf450e550fb2bbfe82bc47be67acb5350845a13faa81c890b40
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4417,6 +4417,206 @@
|
|
| 4417 |
"eval_samples_per_second": 43.083,
|
| 4418 |
"eval_steps_per_second": 5.392,
|
| 4419 |
"step": 11000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4420 |
}
|
| 4421 |
],
|
| 4422 |
"logging_steps": 25,
|
|
@@ -4436,7 +4636,7 @@
|
|
| 4436 |
"attributes": {}
|
| 4437 |
}
|
| 4438 |
},
|
| 4439 |
-
"total_flos": 6.
|
| 4440 |
"train_batch_size": 4,
|
| 4441 |
"trial_name": null,
|
| 4442 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 11500,
|
| 3 |
+
"best_metric": 0.544745683670044,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-11500",
|
| 5 |
+
"epoch": 8.357753135793493,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 11500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4417 |
"eval_samples_per_second": 43.083,
|
| 4418 |
"eval_steps_per_second": 5.392,
|
| 4419 |
"step": 11000
|
| 4420 |
+
},
|
| 4421 |
+
{
|
| 4422 |
+
"epoch": 8.012361388838393,
|
| 4423 |
+
"grad_norm": 0.8800711035728455,
|
| 4424 |
+
"learning_rate": 2.0108403898867298e-05,
|
| 4425 |
+
"loss": 0.511,
|
| 4426 |
+
"mean_token_accuracy": 0.8405692936833372,
|
| 4427 |
+
"num_tokens": 242787188.0,
|
| 4428 |
+
"step": 11025
|
| 4429 |
+
},
|
| 4430 |
+
{
|
| 4431 |
+
"epoch": 8.030539901836029,
|
| 4432 |
+
"grad_norm": 0.8134773969650269,
|
| 4433 |
+
"learning_rate": 1.994255888710432e-05,
|
| 4434 |
+
"loss": 0.5161,
|
| 4435 |
+
"mean_token_accuracy": 0.8396863287687302,
|
| 4436 |
+
"num_tokens": 243323359.0,
|
| 4437 |
+
"step": 11050
|
| 4438 |
+
},
|
| 4439 |
+
{
|
| 4440 |
+
"epoch": 8.048718414833667,
|
| 4441 |
+
"grad_norm": 0.8506718277931213,
|
| 4442 |
+
"learning_rate": 1.977717321235564e-05,
|
| 4443 |
+
"loss": 0.5184,
|
| 4444 |
+
"mean_token_accuracy": 0.8389563143253327,
|
| 4445 |
+
"num_tokens": 243872329.0,
|
| 4446 |
+
"step": 11075
|
| 4447 |
+
},
|
| 4448 |
+
{
|
| 4449 |
+
"epoch": 8.066896927831303,
|
| 4450 |
+
"grad_norm": 0.8440150022506714,
|
| 4451 |
+
"learning_rate": 1.9612250662131406e-05,
|
| 4452 |
+
"loss": 0.5154,
|
| 4453 |
+
"mean_token_accuracy": 0.8382254421710968,
|
| 4454 |
+
"num_tokens": 244414078.0,
|
| 4455 |
+
"step": 11100
|
| 4456 |
+
},
|
| 4457 |
+
{
|
| 4458 |
+
"epoch": 8.08507544082894,
|
| 4459 |
+
"grad_norm": 0.8506153225898743,
|
| 4460 |
+
"learning_rate": 1.9447795013335734e-05,
|
| 4461 |
+
"loss": 0.5232,
|
| 4462 |
+
"mean_token_accuracy": 0.8369153061509133,
|
| 4463 |
+
"num_tokens": 244971752.0,
|
| 4464 |
+
"step": 11125
|
| 4465 |
+
},
|
| 4466 |
+
{
|
| 4467 |
+
"epoch": 8.103253953826577,
|
| 4468 |
+
"grad_norm": 0.8672150373458862,
|
| 4469 |
+
"learning_rate": 1.9283810032180205e-05,
|
| 4470 |
+
"loss": 0.512,
|
| 4471 |
+
"mean_token_accuracy": 0.8409202411770821,
|
| 4472 |
+
"num_tokens": 245520458.0,
|
| 4473 |
+
"step": 11150
|
| 4474 |
+
},
|
| 4475 |
+
{
|
| 4476 |
+
"epoch": 8.121432466824213,
|
| 4477 |
+
"grad_norm": 0.8128538727760315,
|
| 4478 |
+
"learning_rate": 1.9120299474097583e-05,
|
| 4479 |
+
"loss": 0.5209,
|
| 4480 |
+
"mean_token_accuracy": 0.837473659813404,
|
| 4481 |
+
"num_tokens": 246088615.0,
|
| 4482 |
+
"step": 11175
|
| 4483 |
+
},
|
| 4484 |
+
{
|
| 4485 |
+
"epoch": 8.139610979821851,
|
| 4486 |
+
"grad_norm": 0.7617402076721191,
|
| 4487 |
+
"learning_rate": 1.8957267083655835e-05,
|
| 4488 |
+
"loss": 0.5153,
|
| 4489 |
+
"mean_token_accuracy": 0.8394093406200409,
|
| 4490 |
+
"num_tokens": 246630403.0,
|
| 4491 |
+
"step": 11200
|
| 4492 |
+
},
|
| 4493 |
+
{
|
| 4494 |
+
"epoch": 8.157789492819488,
|
| 4495 |
+
"grad_norm": 0.8872790932655334,
|
| 4496 |
+
"learning_rate": 1.8794716594472376e-05,
|
| 4497 |
+
"loss": 0.5179,
|
| 4498 |
+
"mean_token_accuracy": 0.838570873439312,
|
| 4499 |
+
"num_tokens": 247175258.0,
|
| 4500 |
+
"step": 11225
|
| 4501 |
+
},
|
| 4502 |
+
{
|
| 4503 |
+
"epoch": 8.175968005817124,
|
| 4504 |
+
"grad_norm": 0.7966537475585938,
|
| 4505 |
+
"learning_rate": 1.8632651729128564e-05,
|
| 4506 |
+
"loss": 0.5209,
|
| 4507 |
+
"mean_token_accuracy": 0.8365825054049492,
|
| 4508 |
+
"num_tokens": 247743507.0,
|
| 4509 |
+
"step": 11250
|
| 4510 |
+
},
|
| 4511 |
+
{
|
| 4512 |
+
"epoch": 8.175968005817124,
|
| 4513 |
+
"eval_loss": 0.5448639392852783,
|
| 4514 |
+
"eval_mean_token_accuracy": 0.8303035672973184,
|
| 4515 |
+
"eval_num_tokens": 247743507.0,
|
| 4516 |
+
"eval_runtime": 113.7219,
|
| 4517 |
+
"eval_samples_per_second": 43.0,
|
| 4518 |
+
"eval_steps_per_second": 5.382,
|
| 4519 |
+
"step": 11250
|
| 4520 |
+
},
|
| 4521 |
+
{
|
| 4522 |
+
"epoch": 8.194146518814762,
|
| 4523 |
+
"grad_norm": 0.9003967642784119,
|
| 4524 |
+
"learning_rate": 1.847107619908445e-05,
|
| 4525 |
+
"loss": 0.5157,
|
| 4526 |
+
"mean_token_accuracy": 0.8391850134730339,
|
| 4527 |
+
"num_tokens": 248275961.0,
|
| 4528 |
+
"step": 11275
|
| 4529 |
+
},
|
| 4530 |
+
{
|
| 4531 |
+
"epoch": 8.212325031812398,
|
| 4532 |
+
"grad_norm": 0.7678829431533813,
|
| 4533 |
+
"learning_rate": 1.8309993704593756e-05,
|
| 4534 |
+
"loss": 0.5175,
|
| 4535 |
+
"mean_token_accuracy": 0.8387827044725418,
|
| 4536 |
+
"num_tokens": 248835571.0,
|
| 4537 |
+
"step": 11300
|
| 4538 |
+
},
|
| 4539 |
+
{
|
| 4540 |
+
"epoch": 8.230503544810034,
|
| 4541 |
+
"grad_norm": 0.8297247290611267,
|
| 4542 |
+
"learning_rate": 1.8149407934619215e-05,
|
| 4543 |
+
"loss": 0.5213,
|
| 4544 |
+
"mean_token_accuracy": 0.8382138457894325,
|
| 4545 |
+
"num_tokens": 249386561.0,
|
| 4546 |
+
"step": 11325
|
| 4547 |
+
},
|
| 4548 |
+
{
|
| 4549 |
+
"epoch": 8.248682057807672,
|
| 4550 |
+
"grad_norm": 0.8659992218017578,
|
| 4551 |
+
"learning_rate": 1.798932256674798e-05,
|
| 4552 |
+
"loss": 0.5181,
|
| 4553 |
+
"mean_token_accuracy": 0.8384436306357383,
|
| 4554 |
+
"num_tokens": 249964812.0,
|
| 4555 |
+
"step": 11350
|
| 4556 |
+
},
|
| 4557 |
+
{
|
| 4558 |
+
"epoch": 8.266860570805308,
|
| 4559 |
+
"grad_norm": 0.8487904071807861,
|
| 4560 |
+
"learning_rate": 1.782974126710748e-05,
|
| 4561 |
+
"loss": 0.5243,
|
| 4562 |
+
"mean_token_accuracy": 0.8366836148500443,
|
| 4563 |
+
"num_tokens": 250524273.0,
|
| 4564 |
+
"step": 11375
|
| 4565 |
+
},
|
| 4566 |
+
{
|
| 4567 |
+
"epoch": 8.285039083802944,
|
| 4568 |
+
"grad_norm": 0.8609278202056885,
|
| 4569 |
+
"learning_rate": 1.767066769028143e-05,
|
| 4570 |
+
"loss": 0.521,
|
| 4571 |
+
"mean_token_accuracy": 0.8375069627165794,
|
| 4572 |
+
"num_tokens": 251087296.0,
|
| 4573 |
+
"step": 11400
|
| 4574 |
+
},
|
| 4575 |
+
{
|
| 4576 |
+
"epoch": 8.303217596800582,
|
| 4577 |
+
"grad_norm": 0.8295932412147522,
|
| 4578 |
+
"learning_rate": 1.7512105479226144e-05,
|
| 4579 |
+
"loss": 0.5205,
|
| 4580 |
+
"mean_token_accuracy": 0.8369895967841149,
|
| 4581 |
+
"num_tokens": 251637750.0,
|
| 4582 |
+
"step": 11425
|
| 4583 |
+
},
|
| 4584 |
+
{
|
| 4585 |
+
"epoch": 8.321396109798219,
|
| 4586 |
+
"grad_norm": 0.8182777166366577,
|
| 4587 |
+
"learning_rate": 1.7354058265187116e-05,
|
| 4588 |
+
"loss": 0.5224,
|
| 4589 |
+
"mean_token_accuracy": 0.8378088471293449,
|
| 4590 |
+
"num_tokens": 252191575.0,
|
| 4591 |
+
"step": 11450
|
| 4592 |
+
},
|
| 4593 |
+
{
|
| 4594 |
+
"epoch": 8.339574622795855,
|
| 4595 |
+
"grad_norm": 0.7869584560394287,
|
| 4596 |
+
"learning_rate": 1.7196529667615838e-05,
|
| 4597 |
+
"loss": 0.518,
|
| 4598 |
+
"mean_token_accuracy": 0.8380302327871323,
|
| 4599 |
+
"num_tokens": 252747951.0,
|
| 4600 |
+
"step": 11475
|
| 4601 |
+
},
|
| 4602 |
+
{
|
| 4603 |
+
"epoch": 8.357753135793493,
|
| 4604 |
+
"grad_norm": 0.7901642918586731,
|
| 4605 |
+
"learning_rate": 1.7039523294086968e-05,
|
| 4606 |
+
"loss": 0.5188,
|
| 4607 |
+
"mean_token_accuracy": 0.8381170380115509,
|
| 4608 |
+
"num_tokens": 253308593.0,
|
| 4609 |
+
"step": 11500
|
| 4610 |
+
},
|
| 4611 |
+
{
|
| 4612 |
+
"epoch": 8.357753135793493,
|
| 4613 |
+
"eval_loss": 0.544745683670044,
|
| 4614 |
+
"eval_mean_token_accuracy": 0.8305268148386401,
|
| 4615 |
+
"eval_num_tokens": 253308593.0,
|
| 4616 |
+
"eval_runtime": 114.0858,
|
| 4617 |
+
"eval_samples_per_second": 42.862,
|
| 4618 |
+
"eval_steps_per_second": 5.364,
|
| 4619 |
+
"step": 11500
|
| 4620 |
}
|
| 4621 |
],
|
| 4622 |
"logging_steps": 25,
|
|
|
|
| 4636 |
"attributes": {}
|
| 4637 |
}
|
| 4638 |
},
|
| 4639 |
+
"total_flos": 6.387220694035333e+17,
|
| 4640 |
"train_batch_size": 4,
|
| 4641 |
"trial_name": null,
|
| 4642 |
"trial_params": null
|