Instructions to use rovdetection/code-1b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rovdetection/code-1b-instruct with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("rovdetection/code-1b-instruct", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Training in progress, step 3000, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9446744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c4bc2b38adc32706e2acf860899364c389c44795cde5d43402ad1b3e24719a4
|
| 3 |
size 9446744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4879947
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa9401bbac3284c4bb6a11169e446baad7a8e6fc00ab949bbe417b3d7a375a77
|
| 3 |
size 4879947
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2194fb161e52a47a7f6b1734e178985577fd22e6aae4a22215e086c0248266b
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:431821c493d4002c62a876cbdeb3eade105892abe1c599865b041dfe28827339
|
| 3 |
size 14917
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ac1c46a2776d12775d23d0f587efc112188137ce2140da35bc15d301c9f620e
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad46a212d4576c083702df279951b960843d734a5cd61ac93041cad4b1712452
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2508,6 +2508,506 @@
|
|
| 2508 |
"mean_token_accuracy": 0.6467559643089771,
|
| 2509 |
"num_tokens": 14861262.0,
|
| 2510 |
"step": 2500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2511 |
}
|
| 2512 |
],
|
| 2513 |
"logging_steps": 10,
|
|
@@ -2527,7 +3027,7 @@
|
|
| 2527 |
"attributes": {}
|
| 2528 |
}
|
| 2529 |
},
|
| 2530 |
-
"total_flos": 1.
|
| 2531 |
"train_batch_size": 2,
|
| 2532 |
"trial_name": null,
|
| 2533 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 5.154738878143133,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 3000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2508 |
"mean_token_accuracy": 0.6467559643089771,
|
| 2509 |
"num_tokens": 14861262.0,
|
| 2510 |
"step": 2500
|
| 2511 |
+
},
|
| 2512 |
+
{
|
| 2513 |
+
"entropy": 1.8234948687255383,
|
| 2514 |
+
"epoch": 4.312916398022781,
|
| 2515 |
+
"grad_norm": 0.778538167476654,
|
| 2516 |
+
"learning_rate": 9.964e-05,
|
| 2517 |
+
"loss": 1.8733020782470704,
|
| 2518 |
+
"mean_token_accuracy": 0.6553889319300652,
|
| 2519 |
+
"num_tokens": 14920923.0,
|
| 2520 |
+
"step": 2510
|
| 2521 |
+
},
|
| 2522 |
+
{
|
| 2523 |
+
"entropy": 1.812998068332672,
|
| 2524 |
+
"epoch": 4.330109606705351,
|
| 2525 |
+
"grad_norm": 0.7861834764480591,
|
| 2526 |
+
"learning_rate": 9.924e-05,
|
| 2527 |
+
"loss": 1.8699317932128907,
|
| 2528 |
+
"mean_token_accuracy": 0.6555795632302761,
|
| 2529 |
+
"num_tokens": 14978173.0,
|
| 2530 |
+
"step": 2520
|
| 2531 |
+
},
|
| 2532 |
+
{
|
| 2533 |
+
"entropy": 1.8013822883367538,
|
| 2534 |
+
"epoch": 4.347302815387922,
|
| 2535 |
+
"grad_norm": 0.751916229724884,
|
| 2536 |
+
"learning_rate": 9.884e-05,
|
| 2537 |
+
"loss": 1.8372121810913087,
|
| 2538 |
+
"mean_token_accuracy": 0.664341426640749,
|
| 2539 |
+
"num_tokens": 15034480.0,
|
| 2540 |
+
"step": 2530
|
| 2541 |
+
},
|
| 2542 |
+
{
|
| 2543 |
+
"entropy": 1.7700918450951577,
|
| 2544 |
+
"epoch": 4.364496024070492,
|
| 2545 |
+
"grad_norm": 0.7365695834159851,
|
| 2546 |
+
"learning_rate": 9.844000000000001e-05,
|
| 2547 |
+
"loss": 1.8166645050048829,
|
| 2548 |
+
"mean_token_accuracy": 0.6654425717890262,
|
| 2549 |
+
"num_tokens": 15093226.0,
|
| 2550 |
+
"step": 2540
|
| 2551 |
+
},
|
| 2552 |
+
{
|
| 2553 |
+
"entropy": 1.7808674454689026,
|
| 2554 |
+
"epoch": 4.381689232753063,
|
| 2555 |
+
"grad_norm": 0.7306393980979919,
|
| 2556 |
+
"learning_rate": 9.804e-05,
|
| 2557 |
+
"loss": 1.8363780975341797,
|
| 2558 |
+
"mean_token_accuracy": 0.6601886965334416,
|
| 2559 |
+
"num_tokens": 15149937.0,
|
| 2560 |
+
"step": 2550
|
| 2561 |
+
},
|
| 2562 |
+
{
|
| 2563 |
+
"entropy": 1.7890540674328803,
|
| 2564 |
+
"epoch": 4.398882441435633,
|
| 2565 |
+
"grad_norm": 0.7466715574264526,
|
| 2566 |
+
"learning_rate": 9.764000000000001e-05,
|
| 2567 |
+
"loss": 1.847653579711914,
|
| 2568 |
+
"mean_token_accuracy": 0.6586611110717058,
|
| 2569 |
+
"num_tokens": 15210500.0,
|
| 2570 |
+
"step": 2560
|
| 2571 |
+
},
|
| 2572 |
+
{
|
| 2573 |
+
"entropy": 1.7866264268755914,
|
| 2574 |
+
"epoch": 4.416075650118203,
|
| 2575 |
+
"grad_norm": 0.7825273871421814,
|
| 2576 |
+
"learning_rate": 9.724000000000001e-05,
|
| 2577 |
+
"loss": 1.82576904296875,
|
| 2578 |
+
"mean_token_accuracy": 0.6592508733272553,
|
| 2579 |
+
"num_tokens": 15268262.0,
|
| 2580 |
+
"step": 2570
|
| 2581 |
+
},
|
| 2582 |
+
{
|
| 2583 |
+
"entropy": 1.8321722269058227,
|
| 2584 |
+
"epoch": 4.433268858800774,
|
| 2585 |
+
"grad_norm": 0.7158058285713196,
|
| 2586 |
+
"learning_rate": 9.684000000000001e-05,
|
| 2587 |
+
"loss": 1.8807327270507812,
|
| 2588 |
+
"mean_token_accuracy": 0.6545467376708984,
|
| 2589 |
+
"num_tokens": 15330745.0,
|
| 2590 |
+
"step": 2580
|
| 2591 |
+
},
|
| 2592 |
+
{
|
| 2593 |
+
"entropy": 1.739266212284565,
|
| 2594 |
+
"epoch": 4.450462067483344,
|
| 2595 |
+
"grad_norm": 0.7281847596168518,
|
| 2596 |
+
"learning_rate": 9.644e-05,
|
| 2597 |
+
"loss": 1.7686588287353515,
|
| 2598 |
+
"mean_token_accuracy": 0.6666045777499676,
|
| 2599 |
+
"num_tokens": 15391266.0,
|
| 2600 |
+
"step": 2590
|
| 2601 |
+
},
|
| 2602 |
+
{
|
| 2603 |
+
"entropy": 1.8295569285750388,
|
| 2604 |
+
"epoch": 4.467655276165915,
|
| 2605 |
+
"grad_norm": 0.7166727781295776,
|
| 2606 |
+
"learning_rate": 9.604000000000001e-05,
|
| 2607 |
+
"loss": 1.9156217575073242,
|
| 2608 |
+
"mean_token_accuracy": 0.655017600953579,
|
| 2609 |
+
"num_tokens": 15449819.0,
|
| 2610 |
+
"step": 2600
|
| 2611 |
+
},
|
| 2612 |
+
{
|
| 2613 |
+
"entropy": 1.8236071288585662,
|
| 2614 |
+
"epoch": 4.484848484848484,
|
| 2615 |
+
"grad_norm": 0.6946532726287842,
|
| 2616 |
+
"learning_rate": 9.564000000000001e-05,
|
| 2617 |
+
"loss": 1.9035514831542968,
|
| 2618 |
+
"mean_token_accuracy": 0.649907086789608,
|
| 2619 |
+
"num_tokens": 15513231.0,
|
| 2620 |
+
"step": 2610
|
| 2621 |
+
},
|
| 2622 |
+
{
|
| 2623 |
+
"entropy": 1.7869442969560623,
|
| 2624 |
+
"epoch": 4.502041693531055,
|
| 2625 |
+
"grad_norm": 0.7257023453712463,
|
| 2626 |
+
"learning_rate": 9.524e-05,
|
| 2627 |
+
"loss": 1.841336441040039,
|
| 2628 |
+
"mean_token_accuracy": 0.6655759517103433,
|
| 2629 |
+
"num_tokens": 15568973.0,
|
| 2630 |
+
"step": 2620
|
| 2631 |
+
},
|
| 2632 |
+
{
|
| 2633 |
+
"entropy": 1.7462848544120788,
|
| 2634 |
+
"epoch": 4.519234902213626,
|
| 2635 |
+
"grad_norm": 0.7239391803741455,
|
| 2636 |
+
"learning_rate": 9.484e-05,
|
| 2637 |
+
"loss": 1.7989360809326171,
|
| 2638 |
+
"mean_token_accuracy": 0.6646886244416237,
|
| 2639 |
+
"num_tokens": 15627655.0,
|
| 2640 |
+
"step": 2630
|
| 2641 |
+
},
|
| 2642 |
+
{
|
| 2643 |
+
"entropy": 1.7926493644714356,
|
| 2644 |
+
"epoch": 4.536428110896196,
|
| 2645 |
+
"grad_norm": 0.7628325819969177,
|
| 2646 |
+
"learning_rate": 9.444000000000001e-05,
|
| 2647 |
+
"loss": 1.8627632141113282,
|
| 2648 |
+
"mean_token_accuracy": 0.654141866415739,
|
| 2649 |
+
"num_tokens": 15687626.0,
|
| 2650 |
+
"step": 2640
|
| 2651 |
+
},
|
| 2652 |
+
{
|
| 2653 |
+
"entropy": 1.7928333327174186,
|
| 2654 |
+
"epoch": 4.553621319578767,
|
| 2655 |
+
"grad_norm": 0.629107654094696,
|
| 2656 |
+
"learning_rate": 9.404e-05,
|
| 2657 |
+
"loss": 1.8784042358398438,
|
| 2658 |
+
"mean_token_accuracy": 0.6618591919541359,
|
| 2659 |
+
"num_tokens": 15750035.0,
|
| 2660 |
+
"step": 2650
|
| 2661 |
+
},
|
| 2662 |
+
{
|
| 2663 |
+
"entropy": 1.7438783437013625,
|
| 2664 |
+
"epoch": 4.570814528261336,
|
| 2665 |
+
"grad_norm": 0.6948845982551575,
|
| 2666 |
+
"learning_rate": 9.364e-05,
|
| 2667 |
+
"loss": 1.7456579208374023,
|
| 2668 |
+
"mean_token_accuracy": 0.6722261719405651,
|
| 2669 |
+
"num_tokens": 15809533.0,
|
| 2670 |
+
"step": 2660
|
| 2671 |
+
},
|
| 2672 |
+
{
|
| 2673 |
+
"entropy": 1.7451874181628226,
|
| 2674 |
+
"epoch": 4.588007736943907,
|
| 2675 |
+
"grad_norm": 0.7213107943534851,
|
| 2676 |
+
"learning_rate": 9.324000000000001e-05,
|
| 2677 |
+
"loss": 1.8111917495727539,
|
| 2678 |
+
"mean_token_accuracy": 0.6621977139264346,
|
| 2679 |
+
"num_tokens": 15866570.0,
|
| 2680 |
+
"step": 2670
|
| 2681 |
+
},
|
| 2682 |
+
{
|
| 2683 |
+
"entropy": 1.806991095095873,
|
| 2684 |
+
"epoch": 4.6052009456264775,
|
| 2685 |
+
"grad_norm": 0.9146936535835266,
|
| 2686 |
+
"learning_rate": 9.284e-05,
|
| 2687 |
+
"loss": 1.8761199951171874,
|
| 2688 |
+
"mean_token_accuracy": 0.6552402298897505,
|
| 2689 |
+
"num_tokens": 15923681.0,
|
| 2690 |
+
"step": 2680
|
| 2691 |
+
},
|
| 2692 |
+
{
|
| 2693 |
+
"entropy": 1.854476225376129,
|
| 2694 |
+
"epoch": 4.622394154309048,
|
| 2695 |
+
"grad_norm": 0.675061047077179,
|
| 2696 |
+
"learning_rate": 9.244e-05,
|
| 2697 |
+
"loss": 1.8601364135742187,
|
| 2698 |
+
"mean_token_accuracy": 0.656403211131692,
|
| 2699 |
+
"num_tokens": 15979879.0,
|
| 2700 |
+
"step": 2690
|
| 2701 |
+
},
|
| 2702 |
+
{
|
| 2703 |
+
"entropy": 1.8345128282904626,
|
| 2704 |
+
"epoch": 4.639587362991619,
|
| 2705 |
+
"grad_norm": 0.7702699303627014,
|
| 2706 |
+
"learning_rate": 9.204e-05,
|
| 2707 |
+
"loss": 1.9170707702636718,
|
| 2708 |
+
"mean_token_accuracy": 0.6507652081549168,
|
| 2709 |
+
"num_tokens": 16040136.0,
|
| 2710 |
+
"step": 2700
|
| 2711 |
+
},
|
| 2712 |
+
{
|
| 2713 |
+
"entropy": 1.8444690719246863,
|
| 2714 |
+
"epoch": 4.656780571674188,
|
| 2715 |
+
"grad_norm": 0.7249677181243896,
|
| 2716 |
+
"learning_rate": 9.164000000000001e-05,
|
| 2717 |
+
"loss": 1.9021928787231446,
|
| 2718 |
+
"mean_token_accuracy": 0.6553504541516304,
|
| 2719 |
+
"num_tokens": 16097652.0,
|
| 2720 |
+
"step": 2710
|
| 2721 |
+
},
|
| 2722 |
+
{
|
| 2723 |
+
"entropy": 1.8083212688565253,
|
| 2724 |
+
"epoch": 4.673973780356759,
|
| 2725 |
+
"grad_norm": 0.7018275260925293,
|
| 2726 |
+
"learning_rate": 9.124e-05,
|
| 2727 |
+
"loss": 1.87921199798584,
|
| 2728 |
+
"mean_token_accuracy": 0.6609590038657188,
|
| 2729 |
+
"num_tokens": 16159014.0,
|
| 2730 |
+
"step": 2720
|
| 2731 |
+
},
|
| 2732 |
+
{
|
| 2733 |
+
"entropy": 1.793540646135807,
|
| 2734 |
+
"epoch": 4.6911669890393295,
|
| 2735 |
+
"grad_norm": 0.731863796710968,
|
| 2736 |
+
"learning_rate": 9.084e-05,
|
| 2737 |
+
"loss": 1.847224807739258,
|
| 2738 |
+
"mean_token_accuracy": 0.6638176888227463,
|
| 2739 |
+
"num_tokens": 16223636.0,
|
| 2740 |
+
"step": 2730
|
| 2741 |
+
},
|
| 2742 |
+
{
|
| 2743 |
+
"entropy": 1.7947301134467124,
|
| 2744 |
+
"epoch": 4.7083601977219,
|
| 2745 |
+
"grad_norm": 0.7208489775657654,
|
| 2746 |
+
"learning_rate": 9.044000000000001e-05,
|
| 2747 |
+
"loss": 1.8400375366210937,
|
| 2748 |
+
"mean_token_accuracy": 0.6600434482097626,
|
| 2749 |
+
"num_tokens": 16281647.0,
|
| 2750 |
+
"step": 2740
|
| 2751 |
+
},
|
| 2752 |
+
{
|
| 2753 |
+
"entropy": 1.8043948471546174,
|
| 2754 |
+
"epoch": 4.725553406404471,
|
| 2755 |
+
"grad_norm": 0.7633848190307617,
|
| 2756 |
+
"learning_rate": 9.004e-05,
|
| 2757 |
+
"loss": 1.8509382247924804,
|
| 2758 |
+
"mean_token_accuracy": 0.6632162068039179,
|
| 2759 |
+
"num_tokens": 16340706.0,
|
| 2760 |
+
"step": 2750
|
| 2761 |
+
},
|
| 2762 |
+
{
|
| 2763 |
+
"entropy": 1.8240734949707984,
|
| 2764 |
+
"epoch": 4.74274661508704,
|
| 2765 |
+
"grad_norm": 0.7516812086105347,
|
| 2766 |
+
"learning_rate": 8.964e-05,
|
| 2767 |
+
"loss": 1.9139686584472657,
|
| 2768 |
+
"mean_token_accuracy": 0.6504824224859476,
|
| 2769 |
+
"num_tokens": 16398077.0,
|
| 2770 |
+
"step": 2760
|
| 2771 |
+
},
|
| 2772 |
+
{
|
| 2773 |
+
"entropy": 1.7775158017873764,
|
| 2774 |
+
"epoch": 4.759939823769611,
|
| 2775 |
+
"grad_norm": 0.7677133679389954,
|
| 2776 |
+
"learning_rate": 8.924e-05,
|
| 2777 |
+
"loss": 1.8351661682128906,
|
| 2778 |
+
"mean_token_accuracy": 0.6568478621542454,
|
| 2779 |
+
"num_tokens": 16458898.0,
|
| 2780 |
+
"step": 2770
|
| 2781 |
+
},
|
| 2782 |
+
{
|
| 2783 |
+
"entropy": 1.8671277523040772,
|
| 2784 |
+
"epoch": 4.7771330324521815,
|
| 2785 |
+
"grad_norm": 0.750451385974884,
|
| 2786 |
+
"learning_rate": 8.884e-05,
|
| 2787 |
+
"loss": 1.9589305877685548,
|
| 2788 |
+
"mean_token_accuracy": 0.6506143860518933,
|
| 2789 |
+
"num_tokens": 16519496.0,
|
| 2790 |
+
"step": 2780
|
| 2791 |
+
},
|
| 2792 |
+
{
|
| 2793 |
+
"entropy": 1.7745324671268463,
|
| 2794 |
+
"epoch": 4.794326241134752,
|
| 2795 |
+
"grad_norm": 0.8302338719367981,
|
| 2796 |
+
"learning_rate": 8.844e-05,
|
| 2797 |
+
"loss": 1.8637496948242187,
|
| 2798 |
+
"mean_token_accuracy": 0.6621543657034635,
|
| 2799 |
+
"num_tokens": 16579080.0,
|
| 2800 |
+
"step": 2790
|
| 2801 |
+
},
|
| 2802 |
+
{
|
| 2803 |
+
"entropy": 1.73246541172266,
|
| 2804 |
+
"epoch": 4.811519449817322,
|
| 2805 |
+
"grad_norm": 0.778176486492157,
|
| 2806 |
+
"learning_rate": 8.804e-05,
|
| 2807 |
+
"loss": 1.752696418762207,
|
| 2808 |
+
"mean_token_accuracy": 0.6727286443114281,
|
| 2809 |
+
"num_tokens": 16640932.0,
|
| 2810 |
+
"step": 2800
|
| 2811 |
+
},
|
| 2812 |
+
{
|
| 2813 |
+
"entropy": 1.8060437709093093,
|
| 2814 |
+
"epoch": 4.828712658499892,
|
| 2815 |
+
"grad_norm": 0.9019444584846497,
|
| 2816 |
+
"learning_rate": 8.764e-05,
|
| 2817 |
+
"loss": 1.9031681060791015,
|
| 2818 |
+
"mean_token_accuracy": 0.6563040159642697,
|
| 2819 |
+
"num_tokens": 16702244.0,
|
| 2820 |
+
"step": 2810
|
| 2821 |
+
},
|
| 2822 |
+
{
|
| 2823 |
+
"entropy": 1.8732322439551354,
|
| 2824 |
+
"epoch": 4.845905867182463,
|
| 2825 |
+
"grad_norm": 0.7397829294204712,
|
| 2826 |
+
"learning_rate": 8.724e-05,
|
| 2827 |
+
"loss": 1.9326038360595703,
|
| 2828 |
+
"mean_token_accuracy": 0.6478111572563648,
|
| 2829 |
+
"num_tokens": 16764555.0,
|
| 2830 |
+
"step": 2820
|
| 2831 |
+
},
|
| 2832 |
+
{
|
| 2833 |
+
"entropy": 1.842681024968624,
|
| 2834 |
+
"epoch": 4.863099075865033,
|
| 2835 |
+
"grad_norm": 0.8511717915534973,
|
| 2836 |
+
"learning_rate": 8.684e-05,
|
| 2837 |
+
"loss": 1.9107376098632813,
|
| 2838 |
+
"mean_token_accuracy": 0.6531910292804242,
|
| 2839 |
+
"num_tokens": 16821936.0,
|
| 2840 |
+
"step": 2830
|
| 2841 |
+
},
|
| 2842 |
+
{
|
| 2843 |
+
"entropy": 1.7571960732340812,
|
| 2844 |
+
"epoch": 4.880292284547604,
|
| 2845 |
+
"grad_norm": 0.7064304947853088,
|
| 2846 |
+
"learning_rate": 8.643999999999999e-05,
|
| 2847 |
+
"loss": 1.7985404968261718,
|
| 2848 |
+
"mean_token_accuracy": 0.6667480751872062,
|
| 2849 |
+
"num_tokens": 16882205.0,
|
| 2850 |
+
"step": 2840
|
| 2851 |
+
},
|
| 2852 |
+
{
|
| 2853 |
+
"entropy": 1.8695308573544025,
|
| 2854 |
+
"epoch": 4.897485493230175,
|
| 2855 |
+
"grad_norm": 0.7386742234230042,
|
| 2856 |
+
"learning_rate": 8.604000000000001e-05,
|
| 2857 |
+
"loss": 1.9543342590332031,
|
| 2858 |
+
"mean_token_accuracy": 0.6496741093695164,
|
| 2859 |
+
"num_tokens": 16939799.0,
|
| 2860 |
+
"step": 2850
|
| 2861 |
+
},
|
| 2862 |
+
{
|
| 2863 |
+
"entropy": 1.7877972453832627,
|
| 2864 |
+
"epoch": 4.914678701912744,
|
| 2865 |
+
"grad_norm": 0.7687976956367493,
|
| 2866 |
+
"learning_rate": 8.564000000000001e-05,
|
| 2867 |
+
"loss": 1.7994373321533204,
|
| 2868 |
+
"mean_token_accuracy": 0.6637697361409665,
|
| 2869 |
+
"num_tokens": 16997716.0,
|
| 2870 |
+
"step": 2860
|
| 2871 |
+
},
|
| 2872 |
+
{
|
| 2873 |
+
"entropy": 1.761916320025921,
|
| 2874 |
+
"epoch": 4.931871910595315,
|
| 2875 |
+
"grad_norm": 0.7507193088531494,
|
| 2876 |
+
"learning_rate": 8.524e-05,
|
| 2877 |
+
"loss": 1.788670539855957,
|
| 2878 |
+
"mean_token_accuracy": 0.6648910716176033,
|
| 2879 |
+
"num_tokens": 17057260.0,
|
| 2880 |
+
"step": 2870
|
| 2881 |
+
},
|
| 2882 |
+
{
|
| 2883 |
+
"entropy": 1.804823537170887,
|
| 2884 |
+
"epoch": 4.949065119277885,
|
| 2885 |
+
"grad_norm": 0.727188229560852,
|
| 2886 |
+
"learning_rate": 8.484000000000001e-05,
|
| 2887 |
+
"loss": 1.855522346496582,
|
| 2888 |
+
"mean_token_accuracy": 0.657912939786911,
|
| 2889 |
+
"num_tokens": 17116073.0,
|
| 2890 |
+
"step": 2880
|
| 2891 |
+
},
|
| 2892 |
+
{
|
| 2893 |
+
"entropy": 1.8259041801095008,
|
| 2894 |
+
"epoch": 4.966258327960456,
|
| 2895 |
+
"grad_norm": 0.7195336818695068,
|
| 2896 |
+
"learning_rate": 8.444000000000001e-05,
|
| 2897 |
+
"loss": 1.8942272186279296,
|
| 2898 |
+
"mean_token_accuracy": 0.6546841934323311,
|
| 2899 |
+
"num_tokens": 17174141.0,
|
| 2900 |
+
"step": 2890
|
| 2901 |
+
},
|
| 2902 |
+
{
|
| 2903 |
+
"entropy": 1.7153871595859527,
|
| 2904 |
+
"epoch": 4.983451536643026,
|
| 2905 |
+
"grad_norm": 0.7093940377235413,
|
| 2906 |
+
"learning_rate": 8.404e-05,
|
| 2907 |
+
"loss": 1.7350996017456055,
|
| 2908 |
+
"mean_token_accuracy": 0.6728265054523945,
|
| 2909 |
+
"num_tokens": 17233307.0,
|
| 2910 |
+
"step": 2900
|
| 2911 |
+
},
|
| 2912 |
+
{
|
| 2913 |
+
"entropy": 1.7630670566063422,
|
| 2914 |
+
"epoch": 5.0,
|
| 2915 |
+
"grad_norm": 0.979345440864563,
|
| 2916 |
+
"learning_rate": 8.364e-05,
|
| 2917 |
+
"loss": 1.8098876953125,
|
| 2918 |
+
"mean_token_accuracy": 0.6604567510741097,
|
| 2919 |
+
"num_tokens": 17289810.0,
|
| 2920 |
+
"step": 2910
|
| 2921 |
+
},
|
| 2922 |
+
{
|
| 2923 |
+
"entropy": 1.8877688512206077,
|
| 2924 |
+
"epoch": 5.017193208682571,
|
| 2925 |
+
"grad_norm": 0.8140257596969604,
|
| 2926 |
+
"learning_rate": 8.324000000000001e-05,
|
| 2927 |
+
"loss": 1.9562681198120118,
|
| 2928 |
+
"mean_token_accuracy": 0.6476880256086588,
|
| 2929 |
+
"num_tokens": 17349922.0,
|
| 2930 |
+
"step": 2920
|
| 2931 |
+
},
|
| 2932 |
+
{
|
| 2933 |
+
"entropy": 1.6694072388112544,
|
| 2934 |
+
"epoch": 5.034386417365141,
|
| 2935 |
+
"grad_norm": 0.7486578226089478,
|
| 2936 |
+
"learning_rate": 8.284000000000001e-05,
|
| 2937 |
+
"loss": 1.71788330078125,
|
| 2938 |
+
"mean_token_accuracy": 0.6781885512173176,
|
| 2939 |
+
"num_tokens": 17409363.0,
|
| 2940 |
+
"step": 2930
|
| 2941 |
+
},
|
| 2942 |
+
{
|
| 2943 |
+
"entropy": 1.8061093628406524,
|
| 2944 |
+
"epoch": 5.051579626047711,
|
| 2945 |
+
"grad_norm": 0.8148984313011169,
|
| 2946 |
+
"learning_rate": 8.244e-05,
|
| 2947 |
+
"loss": 1.8484228134155274,
|
| 2948 |
+
"mean_token_accuracy": 0.6591597832739353,
|
| 2949 |
+
"num_tokens": 17468218.0,
|
| 2950 |
+
"step": 2940
|
| 2951 |
+
},
|
| 2952 |
+
{
|
| 2953 |
+
"entropy": 1.7561381176114081,
|
| 2954 |
+
"epoch": 5.068772834730281,
|
| 2955 |
+
"grad_norm": 0.7412339448928833,
|
| 2956 |
+
"learning_rate": 8.204000000000001e-05,
|
| 2957 |
+
"loss": 1.8109855651855469,
|
| 2958 |
+
"mean_token_accuracy": 0.6648329850286245,
|
| 2959 |
+
"num_tokens": 17529603.0,
|
| 2960 |
+
"step": 2950
|
| 2961 |
+
},
|
| 2962 |
+
{
|
| 2963 |
+
"entropy": 1.7058369636535644,
|
| 2964 |
+
"epoch": 5.085966043412852,
|
| 2965 |
+
"grad_norm": 0.7845883369445801,
|
| 2966 |
+
"learning_rate": 8.164000000000001e-05,
|
| 2967 |
+
"loss": 1.7577402114868164,
|
| 2968 |
+
"mean_token_accuracy": 0.675883399322629,
|
| 2969 |
+
"num_tokens": 17587275.0,
|
| 2970 |
+
"step": 2960
|
| 2971 |
+
},
|
| 2972 |
+
{
|
| 2973 |
+
"entropy": 1.7319279327988624,
|
| 2974 |
+
"epoch": 5.1031592520954225,
|
| 2975 |
+
"grad_norm": 0.7546029090881348,
|
| 2976 |
+
"learning_rate": 8.124e-05,
|
| 2977 |
+
"loss": 1.8096488952636718,
|
| 2978 |
+
"mean_token_accuracy": 0.668717809766531,
|
| 2979 |
+
"num_tokens": 17647368.0,
|
| 2980 |
+
"step": 2970
|
| 2981 |
+
},
|
| 2982 |
+
{
|
| 2983 |
+
"entropy": 1.7872621923685075,
|
| 2984 |
+
"epoch": 5.120352460777993,
|
| 2985 |
+
"grad_norm": 0.7214957475662231,
|
| 2986 |
+
"learning_rate": 8.084e-05,
|
| 2987 |
+
"loss": 1.7827239990234376,
|
| 2988 |
+
"mean_token_accuracy": 0.663322826102376,
|
| 2989 |
+
"num_tokens": 17708210.0,
|
| 2990 |
+
"step": 2980
|
| 2991 |
+
},
|
| 2992 |
+
{
|
| 2993 |
+
"entropy": 1.7479579642415046,
|
| 2994 |
+
"epoch": 5.137545669460563,
|
| 2995 |
+
"grad_norm": 0.6938044428825378,
|
| 2996 |
+
"learning_rate": 8.044000000000001e-05,
|
| 2997 |
+
"loss": 1.837489700317383,
|
| 2998 |
+
"mean_token_accuracy": 0.666904554143548,
|
| 2999 |
+
"num_tokens": 17770498.0,
|
| 3000 |
+
"step": 2990
|
| 3001 |
+
},
|
| 3002 |
+
{
|
| 3003 |
+
"entropy": 1.760008592903614,
|
| 3004 |
+
"epoch": 5.154738878143133,
|
| 3005 |
+
"grad_norm": 0.7440096139907837,
|
| 3006 |
+
"learning_rate": 8.004e-05,
|
| 3007 |
+
"loss": 1.7957250595092773,
|
| 3008 |
+
"mean_token_accuracy": 0.6704145818948746,
|
| 3009 |
+
"num_tokens": 17831493.0,
|
| 3010 |
+
"step": 3000
|
| 3011 |
}
|
| 3012 |
],
|
| 3013 |
"logging_steps": 10,
|
|
|
|
| 3027 |
"attributes": {}
|
| 3028 |
}
|
| 3029 |
},
|
| 3030 |
+
"total_flos": 1.4643251157506458e+17,
|
| 3031 |
"train_batch_size": 2,
|
| 3032 |
"trial_name": null,
|
| 3033 |
"trial_params": null
|