Instructions to use moos124/code-reasoning-0.5b with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use moos124/code-reasoning-0.5b with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("moos124/code-reasoning-0.5b", dtype="auto") - Notebooks
- Google Colab
- Kaggle
Training in progress, step 4680, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 70430032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a6492ef322d5597fea48d081cb22f028e55d23c7221c1f5d2c0d52b36383977
|
| 3 |
size 70430032
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 141058579
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c9b537d59fc8f54ed6f9bc67a07461d112b9d35e7bdba97c29c8aeae6b9c47d
|
| 3 |
size 141058579
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:627c156134610fd8bc9611a809d7d0e96a5b62384327d5b22ed21fa23dd24cb0
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a7021d506222316729103157c09f6bde2051538c7c9b802480f486047db26ae
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4408,6 +4408,296 @@
|
|
| 4408 |
"mean_token_accuracy": 0.764773941040039,
|
| 4409 |
"num_tokens": 20432372.0,
|
| 4410 |
"step": 4390
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4411 |
}
|
| 4412 |
],
|
| 4413 |
"logging_steps": 10,
|
|
@@ -4427,7 +4717,7 @@
|
|
| 4427 |
"attributes": {}
|
| 4428 |
}
|
| 4429 |
},
|
| 4430 |
-
"total_flos":
|
| 4431 |
"train_batch_size": 4,
|
| 4432 |
"trial_name": null,
|
| 4433 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.9984,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 4680,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4408 |
"mean_token_accuracy": 0.764773941040039,
|
| 4409 |
"num_tokens": 20432372.0,
|
| 4410 |
"step": 4390
|
| 4411 |
+
},
|
| 4412 |
+
{
|
| 4413 |
+
"entropy": 0.9848338901996613,
|
| 4414 |
+
"epoch": 0.9386666666666666,
|
| 4415 |
+
"grad_norm": 0.2765245735645294,
|
| 4416 |
+
"learning_rate": 5.756862957908433e-05,
|
| 4417 |
+
"loss": 1.1192432403564454,
|
| 4418 |
+
"mean_token_accuracy": 0.7547446370124817,
|
| 4419 |
+
"num_tokens": 20481366.0,
|
| 4420 |
+
"step": 4400
|
| 4421 |
+
},
|
| 4422 |
+
{
|
| 4423 |
+
"entropy": 0.9790939651429653,
|
| 4424 |
+
"epoch": 0.9408,
|
| 4425 |
+
"grad_norm": 0.23915551602840424,
|
| 4426 |
+
"learning_rate": 5.739750748666606e-05,
|
| 4427 |
+
"loss": 1.036961555480957,
|
| 4428 |
+
"mean_token_accuracy": 0.7573970347642899,
|
| 4429 |
+
"num_tokens": 20526985.0,
|
| 4430 |
+
"step": 4410
|
| 4431 |
+
},
|
| 4432 |
+
{
|
| 4433 |
+
"entropy": 0.9054527454078197,
|
| 4434 |
+
"epoch": 0.9429333333333333,
|
| 4435 |
+
"grad_norm": 0.24054944515228271,
|
| 4436 |
+
"learning_rate": 5.7226296761186274e-05,
|
| 4437 |
+
"loss": 0.9758554458618164,
|
| 4438 |
+
"mean_token_accuracy": 0.7724366948008538,
|
| 4439 |
+
"num_tokens": 20571815.0,
|
| 4440 |
+
"step": 4420
|
| 4441 |
+
},
|
| 4442 |
+
{
|
| 4443 |
+
"entropy": 0.9364707127213479,
|
| 4444 |
+
"epoch": 0.9450666666666667,
|
| 4445 |
+
"grad_norm": 0.28272607922554016,
|
| 4446 |
+
"learning_rate": 5.705499945400223e-05,
|
| 4447 |
+
"loss": 1.0225676536560058,
|
| 4448 |
+
"mean_token_accuracy": 0.7622330486774445,
|
| 4449 |
+
"num_tokens": 20615072.0,
|
| 4450 |
+
"step": 4430
|
| 4451 |
+
},
|
| 4452 |
+
{
|
| 4453 |
+
"entropy": 1.0657535366714002,
|
| 4454 |
+
"epoch": 0.9472,
|
| 4455 |
+
"grad_norm": 0.23734600841999054,
|
| 4456 |
+
"learning_rate": 5.688361761750861e-05,
|
| 4457 |
+
"loss": 1.1335111618041993,
|
| 4458 |
+
"mean_token_accuracy": 0.7402229458093643,
|
| 4459 |
+
"num_tokens": 20666534.0,
|
| 4460 |
+
"step": 4440
|
| 4461 |
+
},
|
| 4462 |
+
{
|
| 4463 |
+
"entropy": 0.9826746597886086,
|
| 4464 |
+
"epoch": 0.9493333333333334,
|
| 4465 |
+
"grad_norm": 0.28600969910621643,
|
| 4466 |
+
"learning_rate": 5.671215330511283e-05,
|
| 4467 |
+
"loss": 1.066628646850586,
|
| 4468 |
+
"mean_token_accuracy": 0.7560828119516373,
|
| 4469 |
+
"num_tokens": 20715376.0,
|
| 4470 |
+
"step": 4450
|
| 4471 |
+
},
|
| 4472 |
+
{
|
| 4473 |
+
"entropy": 0.9109843887388707,
|
| 4474 |
+
"epoch": 0.9514666666666667,
|
| 4475 |
+
"grad_norm": 0.2514685392379761,
|
| 4476 |
+
"learning_rate": 5.65406085712105e-05,
|
| 4477 |
+
"loss": 1.0114540100097655,
|
| 4478 |
+
"mean_token_accuracy": 0.7724284827709198,
|
| 4479 |
+
"num_tokens": 20758838.0,
|
| 4480 |
+
"step": 4460
|
| 4481 |
+
},
|
| 4482 |
+
{
|
| 4483 |
+
"entropy": 0.8498819716274738,
|
| 4484 |
+
"epoch": 0.9536,
|
| 4485 |
+
"grad_norm": 0.28889158368110657,
|
| 4486 |
+
"learning_rate": 5.6368985471160804e-05,
|
| 4487 |
+
"loss": 0.9062424659729004,
|
| 4488 |
+
"mean_token_accuracy": 0.785689315199852,
|
| 4489 |
+
"num_tokens": 20799444.0,
|
| 4490 |
+
"step": 4470
|
| 4491 |
+
},
|
| 4492 |
+
{
|
| 4493 |
+
"entropy": 0.8840778715908527,
|
| 4494 |
+
"epoch": 0.9557333333333333,
|
| 4495 |
+
"grad_norm": 0.2577449083328247,
|
| 4496 |
+
"learning_rate": 5.6197286061261875e-05,
|
| 4497 |
+
"loss": 0.9439300537109375,
|
| 4498 |
+
"mean_token_accuracy": 0.7696003526449203,
|
| 4499 |
+
"num_tokens": 20843766.0,
|
| 4500 |
+
"step": 4480
|
| 4501 |
+
},
|
| 4502 |
+
{
|
| 4503 |
+
"entropy": 0.8888865426182747,
|
| 4504 |
+
"epoch": 0.9578666666666666,
|
| 4505 |
+
"grad_norm": 0.27302756905555725,
|
| 4506 |
+
"learning_rate": 5.602551239872616e-05,
|
| 4507 |
+
"loss": 0.9372305870056152,
|
| 4508 |
+
"mean_token_accuracy": 0.7730641543865204,
|
| 4509 |
+
"num_tokens": 20888764.0,
|
| 4510 |
+
"step": 4490
|
| 4511 |
+
},
|
| 4512 |
+
{
|
| 4513 |
+
"entropy": 0.9558203481137753,
|
| 4514 |
+
"epoch": 0.96,
|
| 4515 |
+
"grad_norm": 0.3576233386993408,
|
| 4516 |
+
"learning_rate": 5.58536665416557e-05,
|
| 4517 |
+
"loss": 1.0556070327758789,
|
| 4518 |
+
"mean_token_accuracy": 0.762606156617403,
|
| 4519 |
+
"num_tokens": 20936028.0,
|
| 4520 |
+
"step": 4500
|
| 4521 |
+
},
|
| 4522 |
+
{
|
| 4523 |
+
"entropy": 0.9054192140698433,
|
| 4524 |
+
"epoch": 0.9621333333333333,
|
| 4525 |
+
"grad_norm": 0.2521965205669403,
|
| 4526 |
+
"learning_rate": 5.568175054901763e-05,
|
| 4527 |
+
"loss": 0.9705222129821778,
|
| 4528 |
+
"mean_token_accuracy": 0.7672724887728691,
|
| 4529 |
+
"num_tokens": 20985057.0,
|
| 4530 |
+
"step": 4510
|
| 4531 |
+
},
|
| 4532 |
+
{
|
| 4533 |
+
"entropy": 0.9011006608605385,
|
| 4534 |
+
"epoch": 0.9642666666666667,
|
| 4535 |
+
"grad_norm": 0.27024832367897034,
|
| 4536 |
+
"learning_rate": 5.550976648061934e-05,
|
| 4537 |
+
"loss": 0.9830186843872071,
|
| 4538 |
+
"mean_token_accuracy": 0.7754541039466858,
|
| 4539 |
+
"num_tokens": 21028567.0,
|
| 4540 |
+
"step": 4520
|
| 4541 |
+
},
|
| 4542 |
+
{
|
| 4543 |
+
"entropy": 0.9991332605481148,
|
| 4544 |
+
"epoch": 0.9664,
|
| 4545 |
+
"grad_norm": 0.2703147828578949,
|
| 4546 |
+
"learning_rate": 5.533771639708388e-05,
|
| 4547 |
+
"loss": 1.1589097023010253,
|
| 4548 |
+
"mean_token_accuracy": 0.7532796613872051,
|
| 4549 |
+
"num_tokens": 21072699.0,
|
| 4550 |
+
"step": 4530
|
| 4551 |
+
},
|
| 4552 |
+
{
|
| 4553 |
+
"entropy": 0.9183724671602249,
|
| 4554 |
+
"epoch": 0.9685333333333334,
|
| 4555 |
+
"grad_norm": 0.2243046760559082,
|
| 4556 |
+
"learning_rate": 5.516560235982527e-05,
|
| 4557 |
+
"loss": 0.9856460571289063,
|
| 4558 |
+
"mean_token_accuracy": 0.771567003428936,
|
| 4559 |
+
"num_tokens": 21121413.0,
|
| 4560 |
+
"step": 4540
|
| 4561 |
+
},
|
| 4562 |
+
{
|
| 4563 |
+
"entropy": 0.8655671834945678,
|
| 4564 |
+
"epoch": 0.9706666666666667,
|
| 4565 |
+
"grad_norm": 0.3306775987148285,
|
| 4566 |
+
"learning_rate": 5.499342643102381e-05,
|
| 4567 |
+
"loss": 0.9172829627990723,
|
| 4568 |
+
"mean_token_accuracy": 0.777653044462204,
|
| 4569 |
+
"num_tokens": 21162927.0,
|
| 4570 |
+
"step": 4550
|
| 4571 |
+
},
|
| 4572 |
+
{
|
| 4573 |
+
"entropy": 0.9436637915670871,
|
| 4574 |
+
"epoch": 0.9728,
|
| 4575 |
+
"grad_norm": 0.2542389929294586,
|
| 4576 |
+
"learning_rate": 5.482119067360132e-05,
|
| 4577 |
+
"loss": 1.0658721923828125,
|
| 4578 |
+
"mean_token_accuracy": 0.767835621535778,
|
| 4579 |
+
"num_tokens": 21206936.0,
|
| 4580 |
+
"step": 4560
|
| 4581 |
+
},
|
| 4582 |
+
{
|
| 4583 |
+
"entropy": 0.7974261797964572,
|
| 4584 |
+
"epoch": 0.9749333333333333,
|
| 4585 |
+
"grad_norm": 0.24307052791118622,
|
| 4586 |
+
"learning_rate": 5.4648897151196455e-05,
|
| 4587 |
+
"loss": 0.8578211784362793,
|
| 4588 |
+
"mean_token_accuracy": 0.7923481151461601,
|
| 4589 |
+
"num_tokens": 21252732.0,
|
| 4590 |
+
"step": 4570
|
| 4591 |
+
},
|
| 4592 |
+
{
|
| 4593 |
+
"entropy": 0.9691430673003196,
|
| 4594 |
+
"epoch": 0.9770666666666666,
|
| 4595 |
+
"grad_norm": 0.2720329165458679,
|
| 4596 |
+
"learning_rate": 5.447654792814e-05,
|
| 4597 |
+
"loss": 1.0459741592407226,
|
| 4598 |
+
"mean_token_accuracy": 0.7617560073733329,
|
| 4599 |
+
"num_tokens": 21298972.0,
|
| 4600 |
+
"step": 4580
|
| 4601 |
+
},
|
| 4602 |
+
{
|
| 4603 |
+
"entropy": 0.9178217075765133,
|
| 4604 |
+
"epoch": 0.9792,
|
| 4605 |
+
"grad_norm": 0.2640475630760193,
|
| 4606 |
+
"learning_rate": 5.4304145069430115e-05,
|
| 4607 |
+
"loss": 1.0324625015258788,
|
| 4608 |
+
"mean_token_accuracy": 0.7745086327195168,
|
| 4609 |
+
"num_tokens": 21348870.0,
|
| 4610 |
+
"step": 4590
|
| 4611 |
+
},
|
| 4612 |
+
{
|
| 4613 |
+
"entropy": 0.8973256818950176,
|
| 4614 |
+
"epoch": 0.9813333333333333,
|
| 4615 |
+
"grad_norm": 0.2828875184059143,
|
| 4616 |
+
"learning_rate": 5.4131690640707574e-05,
|
| 4617 |
+
"loss": 0.9894962310791016,
|
| 4618 |
+
"mean_token_accuracy": 0.7752941563725472,
|
| 4619 |
+
"num_tokens": 21390716.0,
|
| 4620 |
+
"step": 4600
|
| 4621 |
+
},
|
| 4622 |
+
{
|
| 4623 |
+
"entropy": 0.9490196861326694,
|
| 4624 |
+
"epoch": 0.9834666666666667,
|
| 4625 |
+
"grad_norm": 0.27414020895957947,
|
| 4626 |
+
"learning_rate": 5.3959186708231046e-05,
|
| 4627 |
+
"loss": 1.0264591217041015,
|
| 4628 |
+
"mean_token_accuracy": 0.7639399319887161,
|
| 4629 |
+
"num_tokens": 21440700.0,
|
| 4630 |
+
"step": 4610
|
| 4631 |
+
},
|
| 4632 |
+
{
|
| 4633 |
+
"entropy": 0.9219519071280956,
|
| 4634 |
+
"epoch": 0.9856,
|
| 4635 |
+
"grad_norm": 0.2545549273490906,
|
| 4636 |
+
"learning_rate": 5.3786635338852346e-05,
|
| 4637 |
+
"loss": 1.0511361122131349,
|
| 4638 |
+
"mean_token_accuracy": 0.7739394150674344,
|
| 4639 |
+
"num_tokens": 21483867.0,
|
| 4640 |
+
"step": 4620
|
| 4641 |
+
},
|
| 4642 |
+
{
|
| 4643 |
+
"entropy": 0.99324054941535,
|
| 4644 |
+
"epoch": 0.9877333333333334,
|
| 4645 |
+
"grad_norm": 0.272182434797287,
|
| 4646 |
+
"learning_rate": 5.361403859999161e-05,
|
| 4647 |
+
"loss": 1.116584587097168,
|
| 4648 |
+
"mean_token_accuracy": 0.7553175091743469,
|
| 4649 |
+
"num_tokens": 21535354.0,
|
| 4650 |
+
"step": 4630
|
| 4651 |
+
},
|
| 4652 |
+
{
|
| 4653 |
+
"entropy": 0.8828953221440315,
|
| 4654 |
+
"epoch": 0.9898666666666667,
|
| 4655 |
+
"grad_norm": 0.29537713527679443,
|
| 4656 |
+
"learning_rate": 5.344139855961262e-05,
|
| 4657 |
+
"loss": 0.9682372093200684,
|
| 4658 |
+
"mean_token_accuracy": 0.7781552016735077,
|
| 4659 |
+
"num_tokens": 21578265.0,
|
| 4660 |
+
"step": 4640
|
| 4661 |
+
},
|
| 4662 |
+
{
|
| 4663 |
+
"entropy": 0.9005228154361248,
|
| 4664 |
+
"epoch": 0.992,
|
| 4665 |
+
"grad_norm": 0.3032234013080597,
|
| 4666 |
+
"learning_rate": 5.3268717286197945e-05,
|
| 4667 |
+
"loss": 0.9423254013061524,
|
| 4668 |
+
"mean_token_accuracy": 0.7735077708959579,
|
| 4669 |
+
"num_tokens": 21618545.0,
|
| 4670 |
+
"step": 4650
|
| 4671 |
+
},
|
| 4672 |
+
{
|
| 4673 |
+
"entropy": 0.8464630447328091,
|
| 4674 |
+
"epoch": 0.9941333333333333,
|
| 4675 |
+
"grad_norm": 0.32000964879989624,
|
| 4676 |
+
"learning_rate": 5.3095996848724184e-05,
|
| 4677 |
+
"loss": 0.9030919075012207,
|
| 4678 |
+
"mean_token_accuracy": 0.7863337904214859,
|
| 4679 |
+
"num_tokens": 21657735.0,
|
| 4680 |
+
"step": 4660
|
| 4681 |
+
},
|
| 4682 |
+
{
|
| 4683 |
+
"entropy": 0.8923816077411175,
|
| 4684 |
+
"epoch": 0.9962666666666666,
|
| 4685 |
+
"grad_norm": 0.3551577627658844,
|
| 4686 |
+
"learning_rate": 5.292323931663719e-05,
|
| 4687 |
+
"loss": 0.9792759895324707,
|
| 4688 |
+
"mean_token_accuracy": 0.7739578939974308,
|
| 4689 |
+
"num_tokens": 21705183.0,
|
| 4690 |
+
"step": 4670
|
| 4691 |
+
},
|
| 4692 |
+
{
|
| 4693 |
+
"entropy": 0.9760521411895752,
|
| 4694 |
+
"epoch": 0.9984,
|
| 4695 |
+
"grad_norm": 0.2613706886768341,
|
| 4696 |
+
"learning_rate": 5.275044675982724e-05,
|
| 4697 |
+
"loss": 1.055685043334961,
|
| 4698 |
+
"mean_token_accuracy": 0.7623668745160103,
|
| 4699 |
+
"num_tokens": 21747104.0,
|
| 4700 |
+
"step": 4680
|
| 4701 |
}
|
| 4702 |
],
|
| 4703 |
"logging_steps": 10,
|
|
|
|
| 4717 |
"attributes": {}
|
| 4718 |
}
|
| 4719 |
},
|
| 4720 |
+
"total_flos": 1.0298188561140326e+17,
|
| 4721 |
"train_batch_size": 4,
|
| 4722 |
"trial_name": null,
|
| 4723 |
"trial_params": null
|