Training in progress, step 11000, checkpoint
Browse files- last-checkpoint/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step11000/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/model.safetensors +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +585 -5
last-checkpoint/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8e043ccfd2d14162108046118260c4a11838198a3378b8c63aef14e884f315e
|
| 3 |
+
size 5117197489
|
last-checkpoint/global_step11000/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45f6d58392e57f60153009c206c846732ef428fe79bd9f765140b63722b1c39e
|
| 3 |
+
size 859127933
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step11000
|
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 962205216
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55de6622ea2c12f2865659952fae3e7645ab102a38297690cb4fdbaeb6a9d78f
|
| 3 |
size 962205216
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a42b77849766d934d44019f3aaacdcb7addb89613853b8085a0f3dbdc6ec32df
|
| 3 |
size 14709
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:493d0f530ff7fc5bb7b7e09a1475f8ed1e6010e09c7b8eee02f261c6c00502eb
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
-
"best_metric":
|
| 3 |
-
"best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-
|
| 4 |
-
"epoch":
|
| 5 |
"eval_steps": 1000,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -2617,6 +2617,586 @@
|
|
| 2617 |
"eval_steps_per_second": 2.033,
|
| 2618 |
"eval_wer": 84.72418670438473,
|
| 2619 |
"step": 9000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2620 |
}
|
| 2621 |
],
|
| 2622 |
"logging_steps": 25,
|
|
@@ -2636,7 +3216,7 @@
|
|
| 2636 |
"attributes": {}
|
| 2637 |
}
|
| 2638 |
},
|
| 2639 |
-
"total_flos": 1.
|
| 2640 |
"train_batch_size": 4,
|
| 2641 |
"trial_name": null,
|
| 2642 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_metric": 83.86610089580387,
|
| 3 |
+
"best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-11000",
|
| 4 |
+
"epoch": 12.114537444933921,
|
| 5 |
"eval_steps": 1000,
|
| 6 |
+
"global_step": 11000,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 2617 |
"eval_steps_per_second": 2.033,
|
| 2618 |
"eval_wer": 84.72418670438473,
|
| 2619 |
"step": 9000
|
| 2620 |
+
},
|
| 2621 |
+
{
|
| 2622 |
+
"epoch": 9.939427312775331,
|
| 2623 |
+
"grad_norm": 0.9093891382217407,
|
| 2624 |
+
"learning_rate": 1.1256410256410258e-05,
|
| 2625 |
+
"loss": 0.0354,
|
| 2626 |
+
"step": 9025
|
| 2627 |
+
},
|
| 2628 |
+
{
|
| 2629 |
+
"epoch": 9.966960352422907,
|
| 2630 |
+
"grad_norm": 0.526305615901947,
|
| 2631 |
+
"learning_rate": 1.1230769230769232e-05,
|
| 2632 |
+
"loss": 0.04,
|
| 2633 |
+
"step": 9050
|
| 2634 |
+
},
|
| 2635 |
+
{
|
| 2636 |
+
"epoch": 9.994493392070485,
|
| 2637 |
+
"grad_norm": 0.4748174846172333,
|
| 2638 |
+
"learning_rate": 1.1205128205128205e-05,
|
| 2639 |
+
"loss": 0.0405,
|
| 2640 |
+
"step": 9075
|
| 2641 |
+
},
|
| 2642 |
+
{
|
| 2643 |
+
"epoch": 10.022026431718063,
|
| 2644 |
+
"grad_norm": 0.23602962493896484,
|
| 2645 |
+
"learning_rate": 1.117948717948718e-05,
|
| 2646 |
+
"loss": 0.0245,
|
| 2647 |
+
"step": 9100
|
| 2648 |
+
},
|
| 2649 |
+
{
|
| 2650 |
+
"epoch": 10.049559471365638,
|
| 2651 |
+
"grad_norm": 0.2989708185195923,
|
| 2652 |
+
"learning_rate": 1.1153846153846154e-05,
|
| 2653 |
+
"loss": 0.0231,
|
| 2654 |
+
"step": 9125
|
| 2655 |
+
},
|
| 2656 |
+
{
|
| 2657 |
+
"epoch": 10.077092511013216,
|
| 2658 |
+
"grad_norm": 0.34653839468955994,
|
| 2659 |
+
"learning_rate": 1.112820512820513e-05,
|
| 2660 |
+
"loss": 0.0306,
|
| 2661 |
+
"step": 9150
|
| 2662 |
+
},
|
| 2663 |
+
{
|
| 2664 |
+
"epoch": 10.104625550660794,
|
| 2665 |
+
"grad_norm": 0.4413544535636902,
|
| 2666 |
+
"learning_rate": 1.1102564102564103e-05,
|
| 2667 |
+
"loss": 0.0242,
|
| 2668 |
+
"step": 9175
|
| 2669 |
+
},
|
| 2670 |
+
{
|
| 2671 |
+
"epoch": 10.13215859030837,
|
| 2672 |
+
"grad_norm": 0.44882041215896606,
|
| 2673 |
+
"learning_rate": 1.1076923076923079e-05,
|
| 2674 |
+
"loss": 0.036,
|
| 2675 |
+
"step": 9200
|
| 2676 |
+
},
|
| 2677 |
+
{
|
| 2678 |
+
"epoch": 10.159691629955947,
|
| 2679 |
+
"grad_norm": 0.049951497465372086,
|
| 2680 |
+
"learning_rate": 1.1051282051282052e-05,
|
| 2681 |
+
"loss": 0.0249,
|
| 2682 |
+
"step": 9225
|
| 2683 |
+
},
|
| 2684 |
+
{
|
| 2685 |
+
"epoch": 10.187224669603523,
|
| 2686 |
+
"grad_norm": 0.34928587079048157,
|
| 2687 |
+
"learning_rate": 1.1025641025641028e-05,
|
| 2688 |
+
"loss": 0.0322,
|
| 2689 |
+
"step": 9250
|
| 2690 |
+
},
|
| 2691 |
+
{
|
| 2692 |
+
"epoch": 10.214757709251101,
|
| 2693 |
+
"grad_norm": 0.18765118718147278,
|
| 2694 |
+
"learning_rate": 1.1000000000000001e-05,
|
| 2695 |
+
"loss": 0.0249,
|
| 2696 |
+
"step": 9275
|
| 2697 |
+
},
|
| 2698 |
+
{
|
| 2699 |
+
"epoch": 10.242290748898679,
|
| 2700 |
+
"grad_norm": 0.09570558369159698,
|
| 2701 |
+
"learning_rate": 1.0974358974358977e-05,
|
| 2702 |
+
"loss": 0.0241,
|
| 2703 |
+
"step": 9300
|
| 2704 |
+
},
|
| 2705 |
+
{
|
| 2706 |
+
"epoch": 10.269823788546255,
|
| 2707 |
+
"grad_norm": 0.36708030104637146,
|
| 2708 |
+
"learning_rate": 1.094871794871795e-05,
|
| 2709 |
+
"loss": 0.0267,
|
| 2710 |
+
"step": 9325
|
| 2711 |
+
},
|
| 2712 |
+
{
|
| 2713 |
+
"epoch": 10.297356828193832,
|
| 2714 |
+
"grad_norm": 0.6306156516075134,
|
| 2715 |
+
"learning_rate": 1.0923076923076922e-05,
|
| 2716 |
+
"loss": 0.028,
|
| 2717 |
+
"step": 9350
|
| 2718 |
+
},
|
| 2719 |
+
{
|
| 2720 |
+
"epoch": 10.32488986784141,
|
| 2721 |
+
"grad_norm": 0.47958239912986755,
|
| 2722 |
+
"learning_rate": 1.0897435897435898e-05,
|
| 2723 |
+
"loss": 0.0374,
|
| 2724 |
+
"step": 9375
|
| 2725 |
+
},
|
| 2726 |
+
{
|
| 2727 |
+
"epoch": 10.352422907488986,
|
| 2728 |
+
"grad_norm": 0.5049773454666138,
|
| 2729 |
+
"learning_rate": 1.0871794871794871e-05,
|
| 2730 |
+
"loss": 0.0252,
|
| 2731 |
+
"step": 9400
|
| 2732 |
+
},
|
| 2733 |
+
{
|
| 2734 |
+
"epoch": 10.379955947136564,
|
| 2735 |
+
"grad_norm": 0.18035492300987244,
|
| 2736 |
+
"learning_rate": 1.0846153846153847e-05,
|
| 2737 |
+
"loss": 0.032,
|
| 2738 |
+
"step": 9425
|
| 2739 |
+
},
|
| 2740 |
+
{
|
| 2741 |
+
"epoch": 10.407488986784141,
|
| 2742 |
+
"grad_norm": 0.40862882137298584,
|
| 2743 |
+
"learning_rate": 1.082051282051282e-05,
|
| 2744 |
+
"loss": 0.0317,
|
| 2745 |
+
"step": 9450
|
| 2746 |
+
},
|
| 2747 |
+
{
|
| 2748 |
+
"epoch": 10.435022026431717,
|
| 2749 |
+
"grad_norm": 0.4345795512199402,
|
| 2750 |
+
"learning_rate": 1.0794871794871796e-05,
|
| 2751 |
+
"loss": 0.0227,
|
| 2752 |
+
"step": 9475
|
| 2753 |
+
},
|
| 2754 |
+
{
|
| 2755 |
+
"epoch": 10.462555066079295,
|
| 2756 |
+
"grad_norm": 0.32652077078819275,
|
| 2757 |
+
"learning_rate": 1.076923076923077e-05,
|
| 2758 |
+
"loss": 0.0274,
|
| 2759 |
+
"step": 9500
|
| 2760 |
+
},
|
| 2761 |
+
{
|
| 2762 |
+
"epoch": 10.490088105726873,
|
| 2763 |
+
"grad_norm": 0.49059435725212097,
|
| 2764 |
+
"learning_rate": 1.0743589743589745e-05,
|
| 2765 |
+
"loss": 0.0336,
|
| 2766 |
+
"step": 9525
|
| 2767 |
+
},
|
| 2768 |
+
{
|
| 2769 |
+
"epoch": 10.517621145374449,
|
| 2770 |
+
"grad_norm": 0.14571261405944824,
|
| 2771 |
+
"learning_rate": 1.0717948717948718e-05,
|
| 2772 |
+
"loss": 0.0244,
|
| 2773 |
+
"step": 9550
|
| 2774 |
+
},
|
| 2775 |
+
{
|
| 2776 |
+
"epoch": 10.545154185022026,
|
| 2777 |
+
"grad_norm": 0.2149128019809723,
|
| 2778 |
+
"learning_rate": 1.0692307692307694e-05,
|
| 2779 |
+
"loss": 0.0252,
|
| 2780 |
+
"step": 9575
|
| 2781 |
+
},
|
| 2782 |
+
{
|
| 2783 |
+
"epoch": 10.572687224669604,
|
| 2784 |
+
"grad_norm": 0.20995257794857025,
|
| 2785 |
+
"learning_rate": 1.0666666666666667e-05,
|
| 2786 |
+
"loss": 0.0311,
|
| 2787 |
+
"step": 9600
|
| 2788 |
+
},
|
| 2789 |
+
{
|
| 2790 |
+
"epoch": 10.60022026431718,
|
| 2791 |
+
"grad_norm": 0.4227479100227356,
|
| 2792 |
+
"learning_rate": 1.0641025641025643e-05,
|
| 2793 |
+
"loss": 0.0261,
|
| 2794 |
+
"step": 9625
|
| 2795 |
+
},
|
| 2796 |
+
{
|
| 2797 |
+
"epoch": 10.627753303964758,
|
| 2798 |
+
"grad_norm": 0.1345728039741516,
|
| 2799 |
+
"learning_rate": 1.0615384615384616e-05,
|
| 2800 |
+
"loss": 0.026,
|
| 2801 |
+
"step": 9650
|
| 2802 |
+
},
|
| 2803 |
+
{
|
| 2804 |
+
"epoch": 10.655286343612335,
|
| 2805 |
+
"grad_norm": 0.5568249821662903,
|
| 2806 |
+
"learning_rate": 1.058974358974359e-05,
|
| 2807 |
+
"loss": 0.0275,
|
| 2808 |
+
"step": 9675
|
| 2809 |
+
},
|
| 2810 |
+
{
|
| 2811 |
+
"epoch": 10.682819383259911,
|
| 2812 |
+
"grad_norm": 0.5649207234382629,
|
| 2813 |
+
"learning_rate": 1.0564102564102565e-05,
|
| 2814 |
+
"loss": 0.03,
|
| 2815 |
+
"step": 9700
|
| 2816 |
+
},
|
| 2817 |
+
{
|
| 2818 |
+
"epoch": 10.710352422907489,
|
| 2819 |
+
"grad_norm": 0.23224163055419922,
|
| 2820 |
+
"learning_rate": 1.0538461538461539e-05,
|
| 2821 |
+
"loss": 0.0292,
|
| 2822 |
+
"step": 9725
|
| 2823 |
+
},
|
| 2824 |
+
{
|
| 2825 |
+
"epoch": 10.737885462555067,
|
| 2826 |
+
"grad_norm": 0.2227552831172943,
|
| 2827 |
+
"learning_rate": 1.0512820512820514e-05,
|
| 2828 |
+
"loss": 0.028,
|
| 2829 |
+
"step": 9750
|
| 2830 |
+
},
|
| 2831 |
+
{
|
| 2832 |
+
"epoch": 10.765418502202643,
|
| 2833 |
+
"grad_norm": 0.07342702895402908,
|
| 2834 |
+
"learning_rate": 1.0487179487179488e-05,
|
| 2835 |
+
"loss": 0.0227,
|
| 2836 |
+
"step": 9775
|
| 2837 |
+
},
|
| 2838 |
+
{
|
| 2839 |
+
"epoch": 10.79295154185022,
|
| 2840 |
+
"grad_norm": 0.3385262191295624,
|
| 2841 |
+
"learning_rate": 1.0461538461538463e-05,
|
| 2842 |
+
"loss": 0.0325,
|
| 2843 |
+
"step": 9800
|
| 2844 |
+
},
|
| 2845 |
+
{
|
| 2846 |
+
"epoch": 10.820484581497798,
|
| 2847 |
+
"grad_norm": 0.2666647434234619,
|
| 2848 |
+
"learning_rate": 1.0435897435897437e-05,
|
| 2849 |
+
"loss": 0.0264,
|
| 2850 |
+
"step": 9825
|
| 2851 |
+
},
|
| 2852 |
+
{
|
| 2853 |
+
"epoch": 10.848017621145374,
|
| 2854 |
+
"grad_norm": 0.13147205114364624,
|
| 2855 |
+
"learning_rate": 1.0410256410256412e-05,
|
| 2856 |
+
"loss": 0.0184,
|
| 2857 |
+
"step": 9850
|
| 2858 |
+
},
|
| 2859 |
+
{
|
| 2860 |
+
"epoch": 10.875550660792952,
|
| 2861 |
+
"grad_norm": 0.24823608994483948,
|
| 2862 |
+
"learning_rate": 1.0384615384615386e-05,
|
| 2863 |
+
"loss": 0.0249,
|
| 2864 |
+
"step": 9875
|
| 2865 |
+
},
|
| 2866 |
+
{
|
| 2867 |
+
"epoch": 10.90308370044053,
|
| 2868 |
+
"grad_norm": 0.265788197517395,
|
| 2869 |
+
"learning_rate": 1.0358974358974361e-05,
|
| 2870 |
+
"loss": 0.0217,
|
| 2871 |
+
"step": 9900
|
| 2872 |
+
},
|
| 2873 |
+
{
|
| 2874 |
+
"epoch": 10.930616740088105,
|
| 2875 |
+
"grad_norm": 0.2914508879184723,
|
| 2876 |
+
"learning_rate": 1.0333333333333335e-05,
|
| 2877 |
+
"loss": 0.0199,
|
| 2878 |
+
"step": 9925
|
| 2879 |
+
},
|
| 2880 |
+
{
|
| 2881 |
+
"epoch": 10.958149779735683,
|
| 2882 |
+
"grad_norm": 0.19100092351436615,
|
| 2883 |
+
"learning_rate": 1.0307692307692307e-05,
|
| 2884 |
+
"loss": 0.0232,
|
| 2885 |
+
"step": 9950
|
| 2886 |
+
},
|
| 2887 |
+
{
|
| 2888 |
+
"epoch": 10.98568281938326,
|
| 2889 |
+
"grad_norm": 0.2141091227531433,
|
| 2890 |
+
"learning_rate": 1.0282051282051282e-05,
|
| 2891 |
+
"loss": 0.0276,
|
| 2892 |
+
"step": 9975
|
| 2893 |
+
},
|
| 2894 |
+
{
|
| 2895 |
+
"epoch": 11.013215859030836,
|
| 2896 |
+
"grad_norm": 0.09335622936487198,
|
| 2897 |
+
"learning_rate": 1.0256410256410256e-05,
|
| 2898 |
+
"loss": 0.0186,
|
| 2899 |
+
"step": 10000
|
| 2900 |
+
},
|
| 2901 |
+
{
|
| 2902 |
+
"epoch": 11.013215859030836,
|
| 2903 |
+
"eval_cer": 25.171093508190705,
|
| 2904 |
+
"eval_loss": 0.8366118669509888,
|
| 2905 |
+
"eval_runtime": 1307.8053,
|
| 2906 |
+
"eval_samples_per_second": 8.091,
|
| 2907 |
+
"eval_steps_per_second": 2.023,
|
| 2908 |
+
"eval_wer": 84.47901933050449,
|
| 2909 |
+
"step": 10000
|
| 2910 |
+
},
|
| 2911 |
+
{
|
| 2912 |
+
"epoch": 11.040748898678414,
|
| 2913 |
+
"grad_norm": 0.29987862706184387,
|
| 2914 |
+
"learning_rate": 1.0230769230769231e-05,
|
| 2915 |
+
"loss": 0.0117,
|
| 2916 |
+
"step": 10025
|
| 2917 |
+
},
|
| 2918 |
+
{
|
| 2919 |
+
"epoch": 11.068281938325992,
|
| 2920 |
+
"grad_norm": 0.22261077165603638,
|
| 2921 |
+
"learning_rate": 1.0205128205128205e-05,
|
| 2922 |
+
"loss": 0.0199,
|
| 2923 |
+
"step": 10050
|
| 2924 |
+
},
|
| 2925 |
+
{
|
| 2926 |
+
"epoch": 11.095814977973568,
|
| 2927 |
+
"grad_norm": 0.7212164402008057,
|
| 2928 |
+
"learning_rate": 1.017948717948718e-05,
|
| 2929 |
+
"loss": 0.0194,
|
| 2930 |
+
"step": 10075
|
| 2931 |
+
},
|
| 2932 |
+
{
|
| 2933 |
+
"epoch": 11.123348017621145,
|
| 2934 |
+
"grad_norm": 0.18654099106788635,
|
| 2935 |
+
"learning_rate": 1.0153846153846154e-05,
|
| 2936 |
+
"loss": 0.0191,
|
| 2937 |
+
"step": 10100
|
| 2938 |
+
},
|
| 2939 |
+
{
|
| 2940 |
+
"epoch": 11.150881057268723,
|
| 2941 |
+
"grad_norm": 0.1351199895143509,
|
| 2942 |
+
"learning_rate": 1.012820512820513e-05,
|
| 2943 |
+
"loss": 0.0151,
|
| 2944 |
+
"step": 10125
|
| 2945 |
+
},
|
| 2946 |
+
{
|
| 2947 |
+
"epoch": 11.178414096916299,
|
| 2948 |
+
"grad_norm": 0.24383758008480072,
|
| 2949 |
+
"learning_rate": 1.0102564102564103e-05,
|
| 2950 |
+
"loss": 0.0142,
|
| 2951 |
+
"step": 10150
|
| 2952 |
+
},
|
| 2953 |
+
{
|
| 2954 |
+
"epoch": 11.205947136563877,
|
| 2955 |
+
"grad_norm": 0.1962803304195404,
|
| 2956 |
+
"learning_rate": 1.0076923076923078e-05,
|
| 2957 |
+
"loss": 0.0159,
|
| 2958 |
+
"step": 10175
|
| 2959 |
+
},
|
| 2960 |
+
{
|
| 2961 |
+
"epoch": 11.233480176211454,
|
| 2962 |
+
"grad_norm": 0.1277613639831543,
|
| 2963 |
+
"learning_rate": 1.0051282051282052e-05,
|
| 2964 |
+
"loss": 0.018,
|
| 2965 |
+
"step": 10200
|
| 2966 |
+
},
|
| 2967 |
+
{
|
| 2968 |
+
"epoch": 11.26101321585903,
|
| 2969 |
+
"grad_norm": 0.17365778982639313,
|
| 2970 |
+
"learning_rate": 1.0025641025641027e-05,
|
| 2971 |
+
"loss": 0.0198,
|
| 2972 |
+
"step": 10225
|
| 2973 |
+
},
|
| 2974 |
+
{
|
| 2975 |
+
"epoch": 11.288546255506608,
|
| 2976 |
+
"grad_norm": 0.5494518876075745,
|
| 2977 |
+
"learning_rate": 1e-05,
|
| 2978 |
+
"loss": 0.0157,
|
| 2979 |
+
"step": 10250
|
| 2980 |
+
},
|
| 2981 |
+
{
|
| 2982 |
+
"epoch": 11.316079295154186,
|
| 2983 |
+
"grad_norm": 0.11686886101961136,
|
| 2984 |
+
"learning_rate": 9.974358974358974e-06,
|
| 2985 |
+
"loss": 0.024,
|
| 2986 |
+
"step": 10275
|
| 2987 |
+
},
|
| 2988 |
+
{
|
| 2989 |
+
"epoch": 11.343612334801762,
|
| 2990 |
+
"grad_norm": 0.15467554330825806,
|
| 2991 |
+
"learning_rate": 9.94871794871795e-06,
|
| 2992 |
+
"loss": 0.0174,
|
| 2993 |
+
"step": 10300
|
| 2994 |
+
},
|
| 2995 |
+
{
|
| 2996 |
+
"epoch": 11.37114537444934,
|
| 2997 |
+
"grad_norm": 0.10721301287412643,
|
| 2998 |
+
"learning_rate": 9.923076923076923e-06,
|
| 2999 |
+
"loss": 0.0169,
|
| 3000 |
+
"step": 10325
|
| 3001 |
+
},
|
| 3002 |
+
{
|
| 3003 |
+
"epoch": 11.398678414096917,
|
| 3004 |
+
"grad_norm": 0.1287498027086258,
|
| 3005 |
+
"learning_rate": 9.897435897435899e-06,
|
| 3006 |
+
"loss": 0.0202,
|
| 3007 |
+
"step": 10350
|
| 3008 |
+
},
|
| 3009 |
+
{
|
| 3010 |
+
"epoch": 11.426211453744493,
|
| 3011 |
+
"grad_norm": 0.4366730749607086,
|
| 3012 |
+
"learning_rate": 9.871794871794872e-06,
|
| 3013 |
+
"loss": 0.0166,
|
| 3014 |
+
"step": 10375
|
| 3015 |
+
},
|
| 3016 |
+
{
|
| 3017 |
+
"epoch": 11.45374449339207,
|
| 3018 |
+
"grad_norm": 0.12972579896450043,
|
| 3019 |
+
"learning_rate": 9.846153846153848e-06,
|
| 3020 |
+
"loss": 0.0177,
|
| 3021 |
+
"step": 10400
|
| 3022 |
+
},
|
| 3023 |
+
{
|
| 3024 |
+
"epoch": 11.481277533039648,
|
| 3025 |
+
"grad_norm": 0.810859203338623,
|
| 3026 |
+
"learning_rate": 9.820512820512821e-06,
|
| 3027 |
+
"loss": 0.0173,
|
| 3028 |
+
"step": 10425
|
| 3029 |
+
},
|
| 3030 |
+
{
|
| 3031 |
+
"epoch": 11.508810572687224,
|
| 3032 |
+
"grad_norm": 0.1165216714143753,
|
| 3033 |
+
"learning_rate": 9.794871794871795e-06,
|
| 3034 |
+
"loss": 0.0194,
|
| 3035 |
+
"step": 10450
|
| 3036 |
+
},
|
| 3037 |
+
{
|
| 3038 |
+
"epoch": 11.536343612334802,
|
| 3039 |
+
"grad_norm": 0.16423256695270538,
|
| 3040 |
+
"learning_rate": 9.76923076923077e-06,
|
| 3041 |
+
"loss": 0.017,
|
| 3042 |
+
"step": 10475
|
| 3043 |
+
},
|
| 3044 |
+
{
|
| 3045 |
+
"epoch": 11.56387665198238,
|
| 3046 |
+
"grad_norm": 0.6200090050697327,
|
| 3047 |
+
"learning_rate": 9.743589743589744e-06,
|
| 3048 |
+
"loss": 0.0233,
|
| 3049 |
+
"step": 10500
|
| 3050 |
+
},
|
| 3051 |
+
{
|
| 3052 |
+
"epoch": 11.591409691629956,
|
| 3053 |
+
"grad_norm": 0.3650573790073395,
|
| 3054 |
+
"learning_rate": 9.71794871794872e-06,
|
| 3055 |
+
"loss": 0.0188,
|
| 3056 |
+
"step": 10525
|
| 3057 |
+
},
|
| 3058 |
+
{
|
| 3059 |
+
"epoch": 11.618942731277533,
|
| 3060 |
+
"grad_norm": 0.23086689412593842,
|
| 3061 |
+
"learning_rate": 9.692307692307693e-06,
|
| 3062 |
+
"loss": 0.0166,
|
| 3063 |
+
"step": 10550
|
| 3064 |
+
},
|
| 3065 |
+
{
|
| 3066 |
+
"epoch": 11.646475770925111,
|
| 3067 |
+
"grad_norm": 0.28406432271003723,
|
| 3068 |
+
"learning_rate": 9.666666666666667e-06,
|
| 3069 |
+
"loss": 0.0199,
|
| 3070 |
+
"step": 10575
|
| 3071 |
+
},
|
| 3072 |
+
{
|
| 3073 |
+
"epoch": 11.674008810572687,
|
| 3074 |
+
"grad_norm": 0.13203246891498566,
|
| 3075 |
+
"learning_rate": 9.641025641025642e-06,
|
| 3076 |
+
"loss": 0.0169,
|
| 3077 |
+
"step": 10600
|
| 3078 |
+
},
|
| 3079 |
+
{
|
| 3080 |
+
"epoch": 11.701541850220265,
|
| 3081 |
+
"grad_norm": 0.3809435963630676,
|
| 3082 |
+
"learning_rate": 9.615384615384616e-06,
|
| 3083 |
+
"loss": 0.0167,
|
| 3084 |
+
"step": 10625
|
| 3085 |
+
},
|
| 3086 |
+
{
|
| 3087 |
+
"epoch": 11.729074889867842,
|
| 3088 |
+
"grad_norm": 0.2622781991958618,
|
| 3089 |
+
"learning_rate": 9.589743589743591e-06,
|
| 3090 |
+
"loss": 0.023,
|
| 3091 |
+
"step": 10650
|
| 3092 |
+
},
|
| 3093 |
+
{
|
| 3094 |
+
"epoch": 11.756607929515418,
|
| 3095 |
+
"grad_norm": 0.3118574321269989,
|
| 3096 |
+
"learning_rate": 9.564102564102565e-06,
|
| 3097 |
+
"loss": 0.0162,
|
| 3098 |
+
"step": 10675
|
| 3099 |
+
},
|
| 3100 |
+
{
|
| 3101 |
+
"epoch": 11.784140969162996,
|
| 3102 |
+
"grad_norm": 0.29195636510849,
|
| 3103 |
+
"learning_rate": 9.53846153846154e-06,
|
| 3104 |
+
"loss": 0.0166,
|
| 3105 |
+
"step": 10700
|
| 3106 |
+
},
|
| 3107 |
+
{
|
| 3108 |
+
"epoch": 11.811674008810574,
|
| 3109 |
+
"grad_norm": 0.16257286071777344,
|
| 3110 |
+
"learning_rate": 9.512820512820514e-06,
|
| 3111 |
+
"loss": 0.0186,
|
| 3112 |
+
"step": 10725
|
| 3113 |
+
},
|
| 3114 |
+
{
|
| 3115 |
+
"epoch": 11.83920704845815,
|
| 3116 |
+
"grad_norm": 0.2690454125404358,
|
| 3117 |
+
"learning_rate": 9.487179487179487e-06,
|
| 3118 |
+
"loss": 0.0184,
|
| 3119 |
+
"step": 10750
|
| 3120 |
+
},
|
| 3121 |
+
{
|
| 3122 |
+
"epoch": 11.866740088105727,
|
| 3123 |
+
"grad_norm": 0.07074102014303207,
|
| 3124 |
+
"learning_rate": 9.461538461538463e-06,
|
| 3125 |
+
"loss": 0.0147,
|
| 3126 |
+
"step": 10775
|
| 3127 |
+
},
|
| 3128 |
+
{
|
| 3129 |
+
"epoch": 11.894273127753303,
|
| 3130 |
+
"grad_norm": 0.0660664364695549,
|
| 3131 |
+
"learning_rate": 9.435897435897436e-06,
|
| 3132 |
+
"loss": 0.017,
|
| 3133 |
+
"step": 10800
|
| 3134 |
+
},
|
| 3135 |
+
{
|
| 3136 |
+
"epoch": 11.92180616740088,
|
| 3137 |
+
"grad_norm": 0.42482617497444153,
|
| 3138 |
+
"learning_rate": 9.410256410256412e-06,
|
| 3139 |
+
"loss": 0.0164,
|
| 3140 |
+
"step": 10825
|
| 3141 |
+
},
|
| 3142 |
+
{
|
| 3143 |
+
"epoch": 11.949339207048459,
|
| 3144 |
+
"grad_norm": 0.16394160687923431,
|
| 3145 |
+
"learning_rate": 9.384615384615385e-06,
|
| 3146 |
+
"loss": 0.0154,
|
| 3147 |
+
"step": 10850
|
| 3148 |
+
},
|
| 3149 |
+
{
|
| 3150 |
+
"epoch": 11.976872246696034,
|
| 3151 |
+
"grad_norm": 0.39682498574256897,
|
| 3152 |
+
"learning_rate": 9.358974358974359e-06,
|
| 3153 |
+
"loss": 0.0198,
|
| 3154 |
+
"step": 10875
|
| 3155 |
+
},
|
| 3156 |
+
{
|
| 3157 |
+
"epoch": 12.004405286343612,
|
| 3158 |
+
"grad_norm": 0.1381184458732605,
|
| 3159 |
+
"learning_rate": 9.333333333333334e-06,
|
| 3160 |
+
"loss": 0.0193,
|
| 3161 |
+
"step": 10900
|
| 3162 |
+
},
|
| 3163 |
+
{
|
| 3164 |
+
"epoch": 12.03193832599119,
|
| 3165 |
+
"grad_norm": 0.15030303597450256,
|
| 3166 |
+
"learning_rate": 9.307692307692308e-06,
|
| 3167 |
+
"loss": 0.0199,
|
| 3168 |
+
"step": 10925
|
| 3169 |
+
},
|
| 3170 |
+
{
|
| 3171 |
+
"epoch": 12.059471365638766,
|
| 3172 |
+
"grad_norm": 0.5344926714897156,
|
| 3173 |
+
"learning_rate": 9.282051282051283e-06,
|
| 3174 |
+
"loss": 0.0197,
|
| 3175 |
+
"step": 10950
|
| 3176 |
+
},
|
| 3177 |
+
{
|
| 3178 |
+
"epoch": 12.087004405286343,
|
| 3179 |
+
"grad_norm": 0.18761467933654785,
|
| 3180 |
+
"learning_rate": 9.256410256410257e-06,
|
| 3181 |
+
"loss": 0.0166,
|
| 3182 |
+
"step": 10975
|
| 3183 |
+
},
|
| 3184 |
+
{
|
| 3185 |
+
"epoch": 12.114537444933921,
|
| 3186 |
+
"grad_norm": 0.22124651074409485,
|
| 3187 |
+
"learning_rate": 9.230769230769232e-06,
|
| 3188 |
+
"loss": 0.0123,
|
| 3189 |
+
"step": 11000
|
| 3190 |
+
},
|
| 3191 |
+
{
|
| 3192 |
+
"epoch": 12.114537444933921,
|
| 3193 |
+
"eval_cer": 24.39787695023672,
|
| 3194 |
+
"eval_loss": 0.8476730585098267,
|
| 3195 |
+
"eval_runtime": 1307.0774,
|
| 3196 |
+
"eval_samples_per_second": 8.095,
|
| 3197 |
+
"eval_steps_per_second": 2.024,
|
| 3198 |
+
"eval_wer": 83.86610089580387,
|
| 3199 |
+
"step": 11000
|
| 3200 |
}
|
| 3201 |
],
|
| 3202 |
"logging_steps": 25,
|
|
|
|
| 3216 |
"attributes": {}
|
| 3217 |
}
|
| 3218 |
},
|
| 3219 |
+
"total_flos": 1.8899575051391074e+20,
|
| 3220 |
"train_batch_size": 4,
|
| 3221 |
"trial_name": null,
|
| 3222 |
"trial_params": null
|