Instructions to use rovdetection/code-1b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rovdetection/code-1b-instruct with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("rovdetection/code-1b-instruct", dtype="auto") - Notebooks
- Google Colab
- Kaggle
Training in progress, step 4000, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9446744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:835db88c2c568a2a5b9eecd0ca20228d562ccd37375f6d5e37ee4f667bd5c028
|
| 3 |
size 9446744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4879947
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bde2b53b9a0c26662086027ef84b0578651b731c913f116872da22f0740efeab
|
| 3 |
size 4879947
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa8f41c51c4c045061b2c14ad0e244d1f18ea14e355c0937c51abc1c22235765
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fcdef9cce1358b15f98ec011b2742b883d23020479104f9b5467277f0c257b88
|
| 3 |
size 14917
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0033c7745b46bdca3ecab5787678834ca68f7f7e1288869dceeb38812abc253
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5ee800e7df74b641553b418c04566b716dade6c517cb6fd519bb2168d1739f3
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 6.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3508,6 +3508,506 @@
|
|
| 3508 |
"mean_token_accuracy": 0.6643109286760355,
|
| 3509 |
"num_tokens": 20795175.0,
|
| 3510 |
"step": 3500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3511 |
}
|
| 3512 |
],
|
| 3513 |
"logging_steps": 10,
|
|
@@ -3527,7 +4027,7 @@
|
|
| 3527 |
"attributes": {}
|
| 3528 |
}
|
| 3529 |
},
|
| 3530 |
-
"total_flos": 1.
|
| 3531 |
"train_batch_size": 2,
|
| 3532 |
"trial_name": null,
|
| 3533 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 6.873415001074576,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 4000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3508 |
"mean_token_accuracy": 0.6643109286760355,
|
| 3509 |
"num_tokens": 20795175.0,
|
| 3510 |
"step": 3500
|
| 3511 |
+
},
|
| 3512 |
+
{
|
| 3513 |
+
"entropy": 1.687648557126522,
|
| 3514 |
+
"epoch": 6.0309477756286265,
|
| 3515 |
+
"grad_norm": 0.8348304629325867,
|
| 3516 |
+
"learning_rate": 5.9640000000000005e-05,
|
| 3517 |
+
"loss": 1.7558349609375,
|
| 3518 |
+
"mean_token_accuracy": 0.6784385897219181,
|
| 3519 |
+
"num_tokens": 20852486.0,
|
| 3520 |
+
"step": 3510
|
| 3521 |
+
},
|
| 3522 |
+
{
|
| 3523 |
+
"entropy": 1.6863658234477044,
|
| 3524 |
+
"epoch": 6.048140984311197,
|
| 3525 |
+
"grad_norm": 0.7642632126808167,
|
| 3526 |
+
"learning_rate": 5.924000000000001e-05,
|
| 3527 |
+
"loss": 1.6536775588989259,
|
| 3528 |
+
"mean_token_accuracy": 0.680523382127285,
|
| 3529 |
+
"num_tokens": 20908597.0,
|
| 3530 |
+
"step": 3520
|
| 3531 |
+
},
|
| 3532 |
+
{
|
| 3533 |
+
"entropy": 1.6652932062745094,
|
| 3534 |
+
"epoch": 6.065334192993768,
|
| 3535 |
+
"grad_norm": 0.8676924109458923,
|
| 3536 |
+
"learning_rate": 5.8840000000000006e-05,
|
| 3537 |
+
"loss": 1.7443069458007812,
|
| 3538 |
+
"mean_token_accuracy": 0.6719188451766968,
|
| 3539 |
+
"num_tokens": 20966567.0,
|
| 3540 |
+
"step": 3530
|
| 3541 |
+
},
|
| 3542 |
+
{
|
| 3543 |
+
"entropy": 1.7391631960868836,
|
| 3544 |
+
"epoch": 6.082527401676338,
|
| 3545 |
+
"grad_norm": 0.8444374799728394,
|
| 3546 |
+
"learning_rate": 5.844e-05,
|
| 3547 |
+
"loss": 1.7849775314331056,
|
| 3548 |
+
"mean_token_accuracy": 0.672398941218853,
|
| 3549 |
+
"num_tokens": 21023832.0,
|
| 3550 |
+
"step": 3540
|
| 3551 |
+
},
|
| 3552 |
+
{
|
| 3553 |
+
"entropy": 1.7432220742106437,
|
| 3554 |
+
"epoch": 6.099720610358908,
|
| 3555 |
+
"grad_norm": 0.7972187995910645,
|
| 3556 |
+
"learning_rate": 5.804000000000001e-05,
|
| 3557 |
+
"loss": 1.8264921188354493,
|
| 3558 |
+
"mean_token_accuracy": 0.6713483344763518,
|
| 3559 |
+
"num_tokens": 21080325.0,
|
| 3560 |
+
"step": 3550
|
| 3561 |
+
},
|
| 3562 |
+
{
|
| 3563 |
+
"entropy": 1.7394985787570476,
|
| 3564 |
+
"epoch": 6.1169138190414785,
|
| 3565 |
+
"grad_norm": 0.8266369700431824,
|
| 3566 |
+
"learning_rate": 5.7640000000000004e-05,
|
| 3567 |
+
"loss": 1.819821548461914,
|
| 3568 |
+
"mean_token_accuracy": 0.6708907049149275,
|
| 3569 |
+
"num_tokens": 21143316.0,
|
| 3570 |
+
"step": 3560
|
| 3571 |
+
},
|
| 3572 |
+
{
|
| 3573 |
+
"entropy": 1.7923602670431138,
|
| 3574 |
+
"epoch": 6.134107027724049,
|
| 3575 |
+
"grad_norm": 0.8315872550010681,
|
| 3576 |
+
"learning_rate": 5.724000000000001e-05,
|
| 3577 |
+
"loss": 1.8086809158325194,
|
| 3578 |
+
"mean_token_accuracy": 0.665992408245802,
|
| 3579 |
+
"num_tokens": 21203848.0,
|
| 3580 |
+
"step": 3570
|
| 3581 |
+
},
|
| 3582 |
+
{
|
| 3583 |
+
"entropy": 1.711188006401062,
|
| 3584 |
+
"epoch": 6.15130023640662,
|
| 3585 |
+
"grad_norm": 0.8174048066139221,
|
| 3586 |
+
"learning_rate": 5.6840000000000005e-05,
|
| 3587 |
+
"loss": 1.7656991958618165,
|
| 3588 |
+
"mean_token_accuracy": 0.6711975857615471,
|
| 3589 |
+
"num_tokens": 21266260.0,
|
| 3590 |
+
"step": 3580
|
| 3591 |
+
},
|
| 3592 |
+
{
|
| 3593 |
+
"entropy": 1.8437035098671912,
|
| 3594 |
+
"epoch": 6.16849344508919,
|
| 3595 |
+
"grad_norm": 0.8155949711799622,
|
| 3596 |
+
"learning_rate": 5.644e-05,
|
| 3597 |
+
"loss": 1.877999496459961,
|
| 3598 |
+
"mean_token_accuracy": 0.6532085236161947,
|
| 3599 |
+
"num_tokens": 21326008.0,
|
| 3600 |
+
"step": 3590
|
| 3601 |
+
},
|
| 3602 |
+
{
|
| 3603 |
+
"entropy": 1.7264528393745422,
|
| 3604 |
+
"epoch": 6.18568665377176,
|
| 3605 |
+
"grad_norm": 0.7951272130012512,
|
| 3606 |
+
"learning_rate": 5.6040000000000006e-05,
|
| 3607 |
+
"loss": 1.747119140625,
|
| 3608 |
+
"mean_token_accuracy": 0.6696909107267857,
|
| 3609 |
+
"num_tokens": 21385356.0,
|
| 3610 |
+
"step": 3600
|
| 3611 |
+
},
|
| 3612 |
+
{
|
| 3613 |
+
"entropy": 1.68227918446064,
|
| 3614 |
+
"epoch": 6.20287986245433,
|
| 3615 |
+
"grad_norm": 0.779587984085083,
|
| 3616 |
+
"learning_rate": 5.564e-05,
|
| 3617 |
+
"loss": 1.7062965393066407,
|
| 3618 |
+
"mean_token_accuracy": 0.6786911800503731,
|
| 3619 |
+
"num_tokens": 21443231.0,
|
| 3620 |
+
"step": 3610
|
| 3621 |
+
},
|
| 3622 |
+
{
|
| 3623 |
+
"entropy": 1.7644565671682357,
|
| 3624 |
+
"epoch": 6.220073071136901,
|
| 3625 |
+
"grad_norm": 0.9153981804847717,
|
| 3626 |
+
"learning_rate": 5.524e-05,
|
| 3627 |
+
"loss": 1.8082721710205079,
|
| 3628 |
+
"mean_token_accuracy": 0.6671201888471842,
|
| 3629 |
+
"num_tokens": 21499309.0,
|
| 3630 |
+
"step": 3620
|
| 3631 |
+
},
|
| 3632 |
+
{
|
| 3633 |
+
"entropy": 1.7211210913956165,
|
| 3634 |
+
"epoch": 6.237266279819472,
|
| 3635 |
+
"grad_norm": 0.8166586756706238,
|
| 3636 |
+
"learning_rate": 5.4840000000000003e-05,
|
| 3637 |
+
"loss": 1.769371795654297,
|
| 3638 |
+
"mean_token_accuracy": 0.6694241009652615,
|
| 3639 |
+
"num_tokens": 21558565.0,
|
| 3640 |
+
"step": 3630
|
| 3641 |
+
},
|
| 3642 |
+
{
|
| 3643 |
+
"entropy": 1.7693689942359925,
|
| 3644 |
+
"epoch": 6.254459488502041,
|
| 3645 |
+
"grad_norm": 0.7773623466491699,
|
| 3646 |
+
"learning_rate": 5.444e-05,
|
| 3647 |
+
"loss": 1.848412322998047,
|
| 3648 |
+
"mean_token_accuracy": 0.66685731112957,
|
| 3649 |
+
"num_tokens": 21618504.0,
|
| 3650 |
+
"step": 3640
|
| 3651 |
+
},
|
| 3652 |
+
{
|
| 3653 |
+
"entropy": 1.8090675905346871,
|
| 3654 |
+
"epoch": 6.271652697184612,
|
| 3655 |
+
"grad_norm": 0.9420453310012817,
|
| 3656 |
+
"learning_rate": 5.4040000000000004e-05,
|
| 3657 |
+
"loss": 1.8266836166381837,
|
| 3658 |
+
"mean_token_accuracy": 0.6643423162400722,
|
| 3659 |
+
"num_tokens": 21676861.0,
|
| 3660 |
+
"step": 3650
|
| 3661 |
+
},
|
| 3662 |
+
{
|
| 3663 |
+
"entropy": 1.7340097561478616,
|
| 3664 |
+
"epoch": 6.288845905867182,
|
| 3665 |
+
"grad_norm": 0.805880069732666,
|
| 3666 |
+
"learning_rate": 5.364e-05,
|
| 3667 |
+
"loss": 1.7760274887084961,
|
| 3668 |
+
"mean_token_accuracy": 0.6729184173047542,
|
| 3669 |
+
"num_tokens": 21734874.0,
|
| 3670 |
+
"step": 3660
|
| 3671 |
+
},
|
| 3672 |
+
{
|
| 3673 |
+
"entropy": 1.733542764186859,
|
| 3674 |
+
"epoch": 6.306039114549753,
|
| 3675 |
+
"grad_norm": 0.7459798455238342,
|
| 3676 |
+
"learning_rate": 5.324e-05,
|
| 3677 |
+
"loss": 1.7874065399169923,
|
| 3678 |
+
"mean_token_accuracy": 0.6733234331011773,
|
| 3679 |
+
"num_tokens": 21797467.0,
|
| 3680 |
+
"step": 3670
|
| 3681 |
+
},
|
| 3682 |
+
{
|
| 3683 |
+
"entropy": 1.6855479300022125,
|
| 3684 |
+
"epoch": 6.3232323232323235,
|
| 3685 |
+
"grad_norm": 0.7362611889839172,
|
| 3686 |
+
"learning_rate": 5.284e-05,
|
| 3687 |
+
"loss": 1.7557338714599608,
|
| 3688 |
+
"mean_token_accuracy": 0.6742986045777798,
|
| 3689 |
+
"num_tokens": 21856704.0,
|
| 3690 |
+
"step": 3680
|
| 3691 |
+
},
|
| 3692 |
+
{
|
| 3693 |
+
"entropy": 1.762756396830082,
|
| 3694 |
+
"epoch": 6.340425531914893,
|
| 3695 |
+
"grad_norm": 0.8349901437759399,
|
| 3696 |
+
"learning_rate": 5.244e-05,
|
| 3697 |
+
"loss": 1.784174346923828,
|
| 3698 |
+
"mean_token_accuracy": 0.6732991166412831,
|
| 3699 |
+
"num_tokens": 21915781.0,
|
| 3700 |
+
"step": 3690
|
| 3701 |
+
},
|
| 3702 |
+
{
|
| 3703 |
+
"entropy": 1.7664957396686076,
|
| 3704 |
+
"epoch": 6.357618740597464,
|
| 3705 |
+
"grad_norm": 0.8295337557792664,
|
| 3706 |
+
"learning_rate": 5.204e-05,
|
| 3707 |
+
"loss": 1.8338695526123048,
|
| 3708 |
+
"mean_token_accuracy": 0.6659718155860901,
|
| 3709 |
+
"num_tokens": 21973568.0,
|
| 3710 |
+
"step": 3700
|
| 3711 |
+
},
|
| 3712 |
+
{
|
| 3713 |
+
"entropy": 1.7744196206331253,
|
| 3714 |
+
"epoch": 6.374811949280034,
|
| 3715 |
+
"grad_norm": 0.739115297794342,
|
| 3716 |
+
"learning_rate": 5.164e-05,
|
| 3717 |
+
"loss": 1.8148929595947265,
|
| 3718 |
+
"mean_token_accuracy": 0.6660460762679576,
|
| 3719 |
+
"num_tokens": 22032979.0,
|
| 3720 |
+
"step": 3710
|
| 3721 |
+
},
|
| 3722 |
+
{
|
| 3723 |
+
"entropy": 1.7459667712450027,
|
| 3724 |
+
"epoch": 6.392005157962605,
|
| 3725 |
+
"grad_norm": 0.7716593146324158,
|
| 3726 |
+
"learning_rate": 5.124e-05,
|
| 3727 |
+
"loss": 1.8079204559326172,
|
| 3728 |
+
"mean_token_accuracy": 0.66551748290658,
|
| 3729 |
+
"num_tokens": 22092283.0,
|
| 3730 |
+
"step": 3720
|
| 3731 |
+
},
|
| 3732 |
+
{
|
| 3733 |
+
"entropy": 1.7491293936967849,
|
| 3734 |
+
"epoch": 6.4091983666451755,
|
| 3735 |
+
"grad_norm": 0.8270374536514282,
|
| 3736 |
+
"learning_rate": 5.084e-05,
|
| 3737 |
+
"loss": 1.8020380020141602,
|
| 3738 |
+
"mean_token_accuracy": 0.6673273537307978,
|
| 3739 |
+
"num_tokens": 22150667.0,
|
| 3740 |
+
"step": 3730
|
| 3741 |
+
},
|
| 3742 |
+
{
|
| 3743 |
+
"entropy": 1.6887403331696986,
|
| 3744 |
+
"epoch": 6.426391575327745,
|
| 3745 |
+
"grad_norm": 0.8306758403778076,
|
| 3746 |
+
"learning_rate": 5.044e-05,
|
| 3747 |
+
"loss": 1.7328964233398438,
|
| 3748 |
+
"mean_token_accuracy": 0.676455694437027,
|
| 3749 |
+
"num_tokens": 22211170.0,
|
| 3750 |
+
"step": 3740
|
| 3751 |
+
},
|
| 3752 |
+
{
|
| 3753 |
+
"entropy": 1.8332835257053375,
|
| 3754 |
+
"epoch": 6.443584784010316,
|
| 3755 |
+
"grad_norm": 0.8369497656822205,
|
| 3756 |
+
"learning_rate": 5.0039999999999995e-05,
|
| 3757 |
+
"loss": 1.913273239135742,
|
| 3758 |
+
"mean_token_accuracy": 0.656198850646615,
|
| 3759 |
+
"num_tokens": 22269928.0,
|
| 3760 |
+
"step": 3750
|
| 3761 |
+
},
|
| 3762 |
+
{
|
| 3763 |
+
"entropy": 1.6914366707205772,
|
| 3764 |
+
"epoch": 6.460777992692886,
|
| 3765 |
+
"grad_norm": 0.7562059164047241,
|
| 3766 |
+
"learning_rate": 4.9640000000000006e-05,
|
| 3767 |
+
"loss": 1.7506240844726562,
|
| 3768 |
+
"mean_token_accuracy": 0.67936124317348,
|
| 3769 |
+
"num_tokens": 22328611.0,
|
| 3770 |
+
"step": 3760
|
| 3771 |
+
},
|
| 3772 |
+
{
|
| 3773 |
+
"entropy": 1.7604179099202155,
|
| 3774 |
+
"epoch": 6.477971201375457,
|
| 3775 |
+
"grad_norm": 0.7541300058364868,
|
| 3776 |
+
"learning_rate": 4.924e-05,
|
| 3777 |
+
"loss": 1.8065948486328125,
|
| 3778 |
+
"mean_token_accuracy": 0.6697364591062069,
|
| 3779 |
+
"num_tokens": 22389219.0,
|
| 3780 |
+
"step": 3770
|
| 3781 |
+
},
|
| 3782 |
+
{
|
| 3783 |
+
"entropy": 1.731757602095604,
|
| 3784 |
+
"epoch": 6.4951644100580275,
|
| 3785 |
+
"grad_norm": 0.8319364190101624,
|
| 3786 |
+
"learning_rate": 4.884e-05,
|
| 3787 |
+
"loss": 1.7902181625366211,
|
| 3788 |
+
"mean_token_accuracy": 0.6673447206616402,
|
| 3789 |
+
"num_tokens": 22449858.0,
|
| 3790 |
+
"step": 3780
|
| 3791 |
+
},
|
| 3792 |
+
{
|
| 3793 |
+
"entropy": 1.7152166068553925,
|
| 3794 |
+
"epoch": 6.512357618740597,
|
| 3795 |
+
"grad_norm": 0.8575091361999512,
|
| 3796 |
+
"learning_rate": 4.8440000000000004e-05,
|
| 3797 |
+
"loss": 1.7424659729003906,
|
| 3798 |
+
"mean_token_accuracy": 0.6707747709006071,
|
| 3799 |
+
"num_tokens": 22509375.0,
|
| 3800 |
+
"step": 3790
|
| 3801 |
+
},
|
| 3802 |
+
{
|
| 3803 |
+
"entropy": 1.6641680032014847,
|
| 3804 |
+
"epoch": 6.529550827423168,
|
| 3805 |
+
"grad_norm": 0.7516652345657349,
|
| 3806 |
+
"learning_rate": 4.804e-05,
|
| 3807 |
+
"loss": 1.6937873840332032,
|
| 3808 |
+
"mean_token_accuracy": 0.6811798132956028,
|
| 3809 |
+
"num_tokens": 22566440.0,
|
| 3810 |
+
"step": 3800
|
| 3811 |
+
},
|
| 3812 |
+
{
|
| 3813 |
+
"entropy": 1.7551555022597314,
|
| 3814 |
+
"epoch": 6.546744036105738,
|
| 3815 |
+
"grad_norm": 0.817863941192627,
|
| 3816 |
+
"learning_rate": 4.7640000000000005e-05,
|
| 3817 |
+
"loss": 1.8282489776611328,
|
| 3818 |
+
"mean_token_accuracy": 0.6655839093029499,
|
| 3819 |
+
"num_tokens": 22627900.0,
|
| 3820 |
+
"step": 3810
|
| 3821 |
+
},
|
| 3822 |
+
{
|
| 3823 |
+
"entropy": 1.7025569766759872,
|
| 3824 |
+
"epoch": 6.563937244788309,
|
| 3825 |
+
"grad_norm": 0.757764458656311,
|
| 3826 |
+
"learning_rate": 4.724e-05,
|
| 3827 |
+
"loss": 1.7325496673583984,
|
| 3828 |
+
"mean_token_accuracy": 0.6785391330718994,
|
| 3829 |
+
"num_tokens": 22685738.0,
|
| 3830 |
+
"step": 3820
|
| 3831 |
+
},
|
| 3832 |
+
{
|
| 3833 |
+
"entropy": 1.699775031208992,
|
| 3834 |
+
"epoch": 6.5811304534708785,
|
| 3835 |
+
"grad_norm": 0.7960421442985535,
|
| 3836 |
+
"learning_rate": 4.684e-05,
|
| 3837 |
+
"loss": 1.7602745056152345,
|
| 3838 |
+
"mean_token_accuracy": 0.6698532458394766,
|
| 3839 |
+
"num_tokens": 22745696.0,
|
| 3840 |
+
"step": 3830
|
| 3841 |
+
},
|
| 3842 |
+
{
|
| 3843 |
+
"entropy": 1.8100605458021164,
|
| 3844 |
+
"epoch": 6.598323662153449,
|
| 3845 |
+
"grad_norm": 0.8477244973182678,
|
| 3846 |
+
"learning_rate": 4.644e-05,
|
| 3847 |
+
"loss": 1.8226333618164063,
|
| 3848 |
+
"mean_token_accuracy": 0.6646727129817009,
|
| 3849 |
+
"num_tokens": 22805783.0,
|
| 3850 |
+
"step": 3840
|
| 3851 |
+
},
|
| 3852 |
+
{
|
| 3853 |
+
"entropy": 1.7685839846730231,
|
| 3854 |
+
"epoch": 6.61551687083602,
|
| 3855 |
+
"grad_norm": 0.7853493690490723,
|
| 3856 |
+
"learning_rate": 4.604e-05,
|
| 3857 |
+
"loss": 1.8230281829833985,
|
| 3858 |
+
"mean_token_accuracy": 0.664577030390501,
|
| 3859 |
+
"num_tokens": 22866822.0,
|
| 3860 |
+
"step": 3850
|
| 3861 |
+
},
|
| 3862 |
+
{
|
| 3863 |
+
"entropy": 1.7810854628682136,
|
| 3864 |
+
"epoch": 6.63271007951859,
|
| 3865 |
+
"grad_norm": 0.7139444351196289,
|
| 3866 |
+
"learning_rate": 4.564e-05,
|
| 3867 |
+
"loss": 1.855198287963867,
|
| 3868 |
+
"mean_token_accuracy": 0.6652711797505617,
|
| 3869 |
+
"num_tokens": 22928790.0,
|
| 3870 |
+
"step": 3860
|
| 3871 |
+
},
|
| 3872 |
+
{
|
| 3873 |
+
"entropy": 1.7815292954444886,
|
| 3874 |
+
"epoch": 6.649903288201161,
|
| 3875 |
+
"grad_norm": 0.7039018869400024,
|
| 3876 |
+
"learning_rate": 4.524000000000001e-05,
|
| 3877 |
+
"loss": 1.845859909057617,
|
| 3878 |
+
"mean_token_accuracy": 0.6595252249389887,
|
| 3879 |
+
"num_tokens": 22990170.0,
|
| 3880 |
+
"step": 3870
|
| 3881 |
+
},
|
| 3882 |
+
{
|
| 3883 |
+
"entropy": 1.7107908308506012,
|
| 3884 |
+
"epoch": 6.667096496883731,
|
| 3885 |
+
"grad_norm": 0.7651708126068115,
|
| 3886 |
+
"learning_rate": 4.4840000000000004e-05,
|
| 3887 |
+
"loss": 1.7340824127197265,
|
| 3888 |
+
"mean_token_accuracy": 0.6750431463122368,
|
| 3889 |
+
"num_tokens": 23047902.0,
|
| 3890 |
+
"step": 3880
|
| 3891 |
+
},
|
| 3892 |
+
{
|
| 3893 |
+
"entropy": 1.7069460928440094,
|
| 3894 |
+
"epoch": 6.684289705566301,
|
| 3895 |
+
"grad_norm": 0.7385950088500977,
|
| 3896 |
+
"learning_rate": 4.444e-05,
|
| 3897 |
+
"loss": 1.758881187438965,
|
| 3898 |
+
"mean_token_accuracy": 0.6745327576994896,
|
| 3899 |
+
"num_tokens": 23112106.0,
|
| 3900 |
+
"step": 3890
|
| 3901 |
+
},
|
| 3902 |
+
{
|
| 3903 |
+
"entropy": 1.821124967932701,
|
| 3904 |
+
"epoch": 6.701482914248872,
|
| 3905 |
+
"grad_norm": 0.7827627658843994,
|
| 3906 |
+
"learning_rate": 4.4040000000000005e-05,
|
| 3907 |
+
"loss": 1.913480567932129,
|
| 3908 |
+
"mean_token_accuracy": 0.6593531377613544,
|
| 3909 |
+
"num_tokens": 23170056.0,
|
| 3910 |
+
"step": 3900
|
| 3911 |
+
},
|
| 3912 |
+
{
|
| 3913 |
+
"entropy": 1.7924881175160408,
|
| 3914 |
+
"epoch": 6.718676122931442,
|
| 3915 |
+
"grad_norm": 0.8166612386703491,
|
| 3916 |
+
"learning_rate": 4.364e-05,
|
| 3917 |
+
"loss": 1.855017852783203,
|
| 3918 |
+
"mean_token_accuracy": 0.6593458168208599,
|
| 3919 |
+
"num_tokens": 23228582.0,
|
| 3920 |
+
"step": 3910
|
| 3921 |
+
},
|
| 3922 |
+
{
|
| 3923 |
+
"entropy": 1.736910080909729,
|
| 3924 |
+
"epoch": 6.735869331614013,
|
| 3925 |
+
"grad_norm": 0.779629647731781,
|
| 3926 |
+
"learning_rate": 4.324e-05,
|
| 3927 |
+
"loss": 1.7581821441650392,
|
| 3928 |
+
"mean_token_accuracy": 0.6779871381819248,
|
| 3929 |
+
"num_tokens": 23288702.0,
|
| 3930 |
+
"step": 3920
|
| 3931 |
+
},
|
| 3932 |
+
{
|
| 3933 |
+
"entropy": 1.6776573412120341,
|
| 3934 |
+
"epoch": 6.7530625402965825,
|
| 3935 |
+
"grad_norm": 0.7625913619995117,
|
| 3936 |
+
"learning_rate": 4.284e-05,
|
| 3937 |
+
"loss": 1.7102031707763672,
|
| 3938 |
+
"mean_token_accuracy": 0.6794889360666275,
|
| 3939 |
+
"num_tokens": 23349004.0,
|
| 3940 |
+
"step": 3930
|
| 3941 |
+
},
|
| 3942 |
+
{
|
| 3943 |
+
"entropy": 1.8100020587444305,
|
| 3944 |
+
"epoch": 6.770255748979153,
|
| 3945 |
+
"grad_norm": 0.7499405145645142,
|
| 3946 |
+
"learning_rate": 4.244e-05,
|
| 3947 |
+
"loss": 1.8514158248901367,
|
| 3948 |
+
"mean_token_accuracy": 0.6620845705270767,
|
| 3949 |
+
"num_tokens": 23410874.0,
|
| 3950 |
+
"step": 3940
|
| 3951 |
+
},
|
| 3952 |
+
{
|
| 3953 |
+
"entropy": 1.697011759877205,
|
| 3954 |
+
"epoch": 6.787448957661724,
|
| 3955 |
+
"grad_norm": 0.736323893070221,
|
| 3956 |
+
"learning_rate": 4.2040000000000004e-05,
|
| 3957 |
+
"loss": 1.7609180450439452,
|
| 3958 |
+
"mean_token_accuracy": 0.6772994473576546,
|
| 3959 |
+
"num_tokens": 23472518.0,
|
| 3960 |
+
"step": 3950
|
| 3961 |
+
},
|
| 3962 |
+
{
|
| 3963 |
+
"entropy": 1.764576494693756,
|
| 3964 |
+
"epoch": 6.804642166344294,
|
| 3965 |
+
"grad_norm": 0.8523833751678467,
|
| 3966 |
+
"learning_rate": 4.164e-05,
|
| 3967 |
+
"loss": 1.81484375,
|
| 3968 |
+
"mean_token_accuracy": 0.6644324712455273,
|
| 3969 |
+
"num_tokens": 23531203.0,
|
| 3970 |
+
"step": 3960
|
| 3971 |
+
},
|
| 3972 |
+
{
|
| 3973 |
+
"entropy": 1.7241224959492683,
|
| 3974 |
+
"epoch": 6.821835375026865,
|
| 3975 |
+
"grad_norm": 0.8820350766181946,
|
| 3976 |
+
"learning_rate": 4.124e-05,
|
| 3977 |
+
"loss": 1.739130401611328,
|
| 3978 |
+
"mean_token_accuracy": 0.6771424360573292,
|
| 3979 |
+
"num_tokens": 23590289.0,
|
| 3980 |
+
"step": 3970
|
| 3981 |
+
},
|
| 3982 |
+
{
|
| 3983 |
+
"entropy": 1.6967746496200562,
|
| 3984 |
+
"epoch": 6.8390285837094345,
|
| 3985 |
+
"grad_norm": 0.8161067962646484,
|
| 3986 |
+
"learning_rate": 4.084e-05,
|
| 3987 |
+
"loss": 1.7659534454345702,
|
| 3988 |
+
"mean_token_accuracy": 0.6744477659463882,
|
| 3989 |
+
"num_tokens": 23647985.0,
|
| 3990 |
+
"step": 3980
|
| 3991 |
+
},
|
| 3992 |
+
{
|
| 3993 |
+
"entropy": 1.8578275874257089,
|
| 3994 |
+
"epoch": 6.856221792392005,
|
| 3995 |
+
"grad_norm": 0.778160810470581,
|
| 3996 |
+
"learning_rate": 4.044e-05,
|
| 3997 |
+
"loss": 1.9046249389648438,
|
| 3998 |
+
"mean_token_accuracy": 0.6525318272411823,
|
| 3999 |
+
"num_tokens": 23707387.0,
|
| 4000 |
+
"step": 3990
|
| 4001 |
+
},
|
| 4002 |
+
{
|
| 4003 |
+
"entropy": 1.781902502477169,
|
| 4004 |
+
"epoch": 6.873415001074576,
|
| 4005 |
+
"grad_norm": 0.9398592710494995,
|
| 4006 |
+
"learning_rate": 4.004e-05,
|
| 4007 |
+
"loss": 1.8081722259521484,
|
| 4008 |
+
"mean_token_accuracy": 0.6625144556164742,
|
| 4009 |
+
"num_tokens": 23764831.0,
|
| 4010 |
+
"step": 4000
|
| 4011 |
}
|
| 4012 |
],
|
| 4013 |
"logging_steps": 10,
|
|
|
|
| 4027 |
"attributes": {}
|
| 4028 |
}
|
| 4029 |
},
|
| 4030 |
+
"total_flos": 1.951545327353856e+17,
|
| 4031 |
"train_batch_size": 2,
|
| 4032 |
"trial_name": null,
|
| 4033 |
"trial_params": null
|