Training in progress, step 3500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f96aaa5e97f3f83387afc0775efd5e922752a17138c7276a9efe7c9ff0bbeee
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:451881f3cab07a4e85e5f970801619f2d6aa94fada708d3b827ca3fafa636054
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cae1361ad95b650252f8194ff20a5669981349cd4f0f59f3528fb4497ea319b8
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2708,6 +2708,456 @@
|
|
| 2708 |
"mean_token_accuracy": 0.7911386549472809,
|
| 2709 |
"num_tokens": 3316348.0,
|
| 2710 |
"step": 3000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2711 |
}
|
| 2712 |
],
|
| 2713 |
"logging_steps": 10,
|
|
@@ -2727,7 +3177,7 @@
|
|
| 2727 |
"attributes": {}
|
| 2728 |
}
|
| 2729 |
},
|
| 2730 |
-
"total_flos":
|
| 2731 |
"train_batch_size": 8,
|
| 2732 |
"trial_name": null,
|
| 2733 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.7052186177715092,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 3500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2708 |
"mean_token_accuracy": 0.7911386549472809,
|
| 2709 |
"num_tokens": 3316348.0,
|
| 2710 |
"step": 3000
|
| 2711 |
+
},
|
| 2712 |
+
{
|
| 2713 |
+
"epoch": 0.6064880112834978,
|
| 2714 |
+
"grad_norm": 11.25,
|
| 2715 |
+
"learning_rate": 1.595808986500101e-05,
|
| 2716 |
+
"loss": 0.9079,
|
| 2717 |
+
"mean_token_accuracy": 0.7789463937282562,
|
| 2718 |
+
"num_tokens": 3326996.0,
|
| 2719 |
+
"step": 3010
|
| 2720 |
+
},
|
| 2721 |
+
{
|
| 2722 |
+
"epoch": 0.6085029216199879,
|
| 2723 |
+
"grad_norm": 11.75,
|
| 2724 |
+
"learning_rate": 1.5944657129424407e-05,
|
| 2725 |
+
"loss": 0.8986,
|
| 2726 |
+
"mean_token_accuracy": 0.7775606334209442,
|
| 2727 |
+
"num_tokens": 3339244.0,
|
| 2728 |
+
"step": 3020
|
| 2729 |
+
},
|
| 2730 |
+
{
|
| 2731 |
+
"epoch": 0.6105178319564779,
|
| 2732 |
+
"grad_norm": 11.1875,
|
| 2733 |
+
"learning_rate": 1.593122439384781e-05,
|
| 2734 |
+
"loss": 0.8756,
|
| 2735 |
+
"mean_token_accuracy": 0.7890569746494294,
|
| 2736 |
+
"num_tokens": 3349636.0,
|
| 2737 |
+
"step": 3030
|
| 2738 |
+
},
|
| 2739 |
+
{
|
| 2740 |
+
"epoch": 0.612532742292968,
|
| 2741 |
+
"grad_norm": 11.9375,
|
| 2742 |
+
"learning_rate": 1.591779165827121e-05,
|
| 2743 |
+
"loss": 0.8937,
|
| 2744 |
+
"mean_token_accuracy": 0.7829440474510193,
|
| 2745 |
+
"num_tokens": 3360711.0,
|
| 2746 |
+
"step": 3040
|
| 2747 |
+
},
|
| 2748 |
+
{
|
| 2749 |
+
"epoch": 0.614547652629458,
|
| 2750 |
+
"grad_norm": 10.25,
|
| 2751 |
+
"learning_rate": 1.5904358922694607e-05,
|
| 2752 |
+
"loss": 0.9303,
|
| 2753 |
+
"mean_token_accuracy": 0.7729782402515412,
|
| 2754 |
+
"num_tokens": 3372520.0,
|
| 2755 |
+
"step": 3050
|
| 2756 |
+
},
|
| 2757 |
+
{
|
| 2758 |
+
"epoch": 0.616562562965948,
|
| 2759 |
+
"grad_norm": 10.625,
|
| 2760 |
+
"learning_rate": 1.5890926187118006e-05,
|
| 2761 |
+
"loss": 0.9247,
|
| 2762 |
+
"mean_token_accuracy": 0.7741464018821717,
|
| 2763 |
+
"num_tokens": 3384007.0,
|
| 2764 |
+
"step": 3060
|
| 2765 |
+
},
|
| 2766 |
+
{
|
| 2767 |
+
"epoch": 0.6185774733024381,
|
| 2768 |
+
"grad_norm": 9.75,
|
| 2769 |
+
"learning_rate": 1.5877493451541408e-05,
|
| 2770 |
+
"loss": 0.7869,
|
| 2771 |
+
"mean_token_accuracy": 0.8034534513950348,
|
| 2772 |
+
"num_tokens": 3395421.0,
|
| 2773 |
+
"step": 3070
|
| 2774 |
+
},
|
| 2775 |
+
{
|
| 2776 |
+
"epoch": 0.6205923836389281,
|
| 2777 |
+
"grad_norm": 12.0,
|
| 2778 |
+
"learning_rate": 1.5864060715964807e-05,
|
| 2779 |
+
"loss": 0.8645,
|
| 2780 |
+
"mean_token_accuracy": 0.7838839650154114,
|
| 2781 |
+
"num_tokens": 3405277.0,
|
| 2782 |
+
"step": 3080
|
| 2783 |
+
},
|
| 2784 |
+
{
|
| 2785 |
+
"epoch": 0.6226072939754181,
|
| 2786 |
+
"grad_norm": 11.375,
|
| 2787 |
+
"learning_rate": 1.5850627980388206e-05,
|
| 2788 |
+
"loss": 0.8441,
|
| 2789 |
+
"mean_token_accuracy": 0.7875894546508789,
|
| 2790 |
+
"num_tokens": 3417672.0,
|
| 2791 |
+
"step": 3090
|
| 2792 |
+
},
|
| 2793 |
+
{
|
| 2794 |
+
"epoch": 0.6246222043119081,
|
| 2795 |
+
"grad_norm": 11.5625,
|
| 2796 |
+
"learning_rate": 1.5837195244811608e-05,
|
| 2797 |
+
"loss": 0.876,
|
| 2798 |
+
"mean_token_accuracy": 0.7891711950302124,
|
| 2799 |
+
"num_tokens": 3428808.0,
|
| 2800 |
+
"step": 3100
|
| 2801 |
+
},
|
| 2802 |
+
{
|
| 2803 |
+
"epoch": 0.6266371146483981,
|
| 2804 |
+
"grad_norm": 10.6875,
|
| 2805 |
+
"learning_rate": 1.5823762509235007e-05,
|
| 2806 |
+
"loss": 0.8585,
|
| 2807 |
+
"mean_token_accuracy": 0.790976220369339,
|
| 2808 |
+
"num_tokens": 3440047.0,
|
| 2809 |
+
"step": 3110
|
| 2810 |
+
},
|
| 2811 |
+
{
|
| 2812 |
+
"epoch": 0.6286520249848881,
|
| 2813 |
+
"grad_norm": 11.8125,
|
| 2814 |
+
"learning_rate": 1.5810329773658406e-05,
|
| 2815 |
+
"loss": 0.8671,
|
| 2816 |
+
"mean_token_accuracy": 0.7851876437664032,
|
| 2817 |
+
"num_tokens": 3450270.0,
|
| 2818 |
+
"step": 3120
|
| 2819 |
+
},
|
| 2820 |
+
{
|
| 2821 |
+
"epoch": 0.6306669353213782,
|
| 2822 |
+
"grad_norm": 13.125,
|
| 2823 |
+
"learning_rate": 1.5796897038081808e-05,
|
| 2824 |
+
"loss": 0.9273,
|
| 2825 |
+
"mean_token_accuracy": 0.7657091677188873,
|
| 2826 |
+
"num_tokens": 3462054.0,
|
| 2827 |
+
"step": 3130
|
| 2828 |
+
},
|
| 2829 |
+
{
|
| 2830 |
+
"epoch": 0.6326818456578682,
|
| 2831 |
+
"grad_norm": 10.9375,
|
| 2832 |
+
"learning_rate": 1.5783464302505207e-05,
|
| 2833 |
+
"loss": 0.8185,
|
| 2834 |
+
"mean_token_accuracy": 0.7917792797088623,
|
| 2835 |
+
"num_tokens": 3472333.0,
|
| 2836 |
+
"step": 3140
|
| 2837 |
+
},
|
| 2838 |
+
{
|
| 2839 |
+
"epoch": 0.6346967559943583,
|
| 2840 |
+
"grad_norm": 12.0625,
|
| 2841 |
+
"learning_rate": 1.5770031566928606e-05,
|
| 2842 |
+
"loss": 0.9939,
|
| 2843 |
+
"mean_token_accuracy": 0.7620590627193451,
|
| 2844 |
+
"num_tokens": 3484753.0,
|
| 2845 |
+
"step": 3150
|
| 2846 |
+
},
|
| 2847 |
+
{
|
| 2848 |
+
"epoch": 0.6367116663308483,
|
| 2849 |
+
"grad_norm": 14.1875,
|
| 2850 |
+
"learning_rate": 1.5756598831352005e-05,
|
| 2851 |
+
"loss": 0.8723,
|
| 2852 |
+
"mean_token_accuracy": 0.7846651554107666,
|
| 2853 |
+
"num_tokens": 3496220.0,
|
| 2854 |
+
"step": 3160
|
| 2855 |
+
},
|
| 2856 |
+
{
|
| 2857 |
+
"epoch": 0.6387265766673383,
|
| 2858 |
+
"grad_norm": 11.6875,
|
| 2859 |
+
"learning_rate": 1.5743166095775407e-05,
|
| 2860 |
+
"loss": 0.9375,
|
| 2861 |
+
"mean_token_accuracy": 0.7770143210887909,
|
| 2862 |
+
"num_tokens": 3508224.0,
|
| 2863 |
+
"step": 3170
|
| 2864 |
+
},
|
| 2865 |
+
{
|
| 2866 |
+
"epoch": 0.6407414870038284,
|
| 2867 |
+
"grad_norm": 11.0625,
|
| 2868 |
+
"learning_rate": 1.5729733360198806e-05,
|
| 2869 |
+
"loss": 0.8789,
|
| 2870 |
+
"mean_token_accuracy": 0.7903493702411651,
|
| 2871 |
+
"num_tokens": 3519271.0,
|
| 2872 |
+
"step": 3180
|
| 2873 |
+
},
|
| 2874 |
+
{
|
| 2875 |
+
"epoch": 0.6427563973403183,
|
| 2876 |
+
"grad_norm": 16.25,
|
| 2877 |
+
"learning_rate": 1.5716300624622204e-05,
|
| 2878 |
+
"loss": 0.9003,
|
| 2879 |
+
"mean_token_accuracy": 0.7797963619232178,
|
| 2880 |
+
"num_tokens": 3530537.0,
|
| 2881 |
+
"step": 3190
|
| 2882 |
+
},
|
| 2883 |
+
{
|
| 2884 |
+
"epoch": 0.6447713076768083,
|
| 2885 |
+
"grad_norm": 10.75,
|
| 2886 |
+
"learning_rate": 1.5702867889045607e-05,
|
| 2887 |
+
"loss": 0.9229,
|
| 2888 |
+
"mean_token_accuracy": 0.7731367945671082,
|
| 2889 |
+
"num_tokens": 3540961.0,
|
| 2890 |
+
"step": 3200
|
| 2891 |
+
},
|
| 2892 |
+
{
|
| 2893 |
+
"epoch": 0.6467862180132984,
|
| 2894 |
+
"grad_norm": 11.75,
|
| 2895 |
+
"learning_rate": 1.5689435153469006e-05,
|
| 2896 |
+
"loss": 0.9519,
|
| 2897 |
+
"mean_token_accuracy": 0.766649729013443,
|
| 2898 |
+
"num_tokens": 3552392.0,
|
| 2899 |
+
"step": 3210
|
| 2900 |
+
},
|
| 2901 |
+
{
|
| 2902 |
+
"epoch": 0.6488011283497884,
|
| 2903 |
+
"grad_norm": 11.375,
|
| 2904 |
+
"learning_rate": 1.5676002417892404e-05,
|
| 2905 |
+
"loss": 0.8958,
|
| 2906 |
+
"mean_token_accuracy": 0.7798868775367737,
|
| 2907 |
+
"num_tokens": 3563665.0,
|
| 2908 |
+
"step": 3220
|
| 2909 |
+
},
|
| 2910 |
+
{
|
| 2911 |
+
"epoch": 0.6508160386862785,
|
| 2912 |
+
"grad_norm": 10.875,
|
| 2913 |
+
"learning_rate": 1.5662569682315803e-05,
|
| 2914 |
+
"loss": 0.9158,
|
| 2915 |
+
"mean_token_accuracy": 0.7784943222999573,
|
| 2916 |
+
"num_tokens": 3575115.0,
|
| 2917 |
+
"step": 3230
|
| 2918 |
+
},
|
| 2919 |
+
{
|
| 2920 |
+
"epoch": 0.6528309490227685,
|
| 2921 |
+
"grad_norm": 10.1875,
|
| 2922 |
+
"learning_rate": 1.5649136946739205e-05,
|
| 2923 |
+
"loss": 0.8092,
|
| 2924 |
+
"mean_token_accuracy": 0.7988557398319245,
|
| 2925 |
+
"num_tokens": 3585453.0,
|
| 2926 |
+
"step": 3240
|
| 2927 |
+
},
|
| 2928 |
+
{
|
| 2929 |
+
"epoch": 0.6548458593592585,
|
| 2930 |
+
"grad_norm": 12.8125,
|
| 2931 |
+
"learning_rate": 1.5635704211162604e-05,
|
| 2932 |
+
"loss": 0.8562,
|
| 2933 |
+
"mean_token_accuracy": 0.7906098127365112,
|
| 2934 |
+
"num_tokens": 3595472.0,
|
| 2935 |
+
"step": 3250
|
| 2936 |
+
},
|
| 2937 |
+
{
|
| 2938 |
+
"epoch": 0.6568607696957486,
|
| 2939 |
+
"grad_norm": 10.9375,
|
| 2940 |
+
"learning_rate": 1.5622271475586003e-05,
|
| 2941 |
+
"loss": 0.9317,
|
| 2942 |
+
"mean_token_accuracy": 0.776879757642746,
|
| 2943 |
+
"num_tokens": 3607704.0,
|
| 2944 |
+
"step": 3260
|
| 2945 |
+
},
|
| 2946 |
+
{
|
| 2947 |
+
"epoch": 0.6588756800322386,
|
| 2948 |
+
"grad_norm": 9.6875,
|
| 2949 |
+
"learning_rate": 1.5608838740009405e-05,
|
| 2950 |
+
"loss": 0.8642,
|
| 2951 |
+
"mean_token_accuracy": 0.7901065409183502,
|
| 2952 |
+
"num_tokens": 3618233.0,
|
| 2953 |
+
"step": 3270
|
| 2954 |
+
},
|
| 2955 |
+
{
|
| 2956 |
+
"epoch": 0.6608905903687285,
|
| 2957 |
+
"grad_norm": 13.8125,
|
| 2958 |
+
"learning_rate": 1.5595406004432804e-05,
|
| 2959 |
+
"loss": 0.9939,
|
| 2960 |
+
"mean_token_accuracy": 0.7686895251274108,
|
| 2961 |
+
"num_tokens": 3628902.0,
|
| 2962 |
+
"step": 3280
|
| 2963 |
+
},
|
| 2964 |
+
{
|
| 2965 |
+
"epoch": 0.6629055007052186,
|
| 2966 |
+
"grad_norm": 12.25,
|
| 2967 |
+
"learning_rate": 1.5581973268856203e-05,
|
| 2968 |
+
"loss": 0.8935,
|
| 2969 |
+
"mean_token_accuracy": 0.7827515482902527,
|
| 2970 |
+
"num_tokens": 3640225.0,
|
| 2971 |
+
"step": 3290
|
| 2972 |
+
},
|
| 2973 |
+
{
|
| 2974 |
+
"epoch": 0.6649204110417086,
|
| 2975 |
+
"grad_norm": 13.8125,
|
| 2976 |
+
"learning_rate": 1.5568540533279605e-05,
|
| 2977 |
+
"loss": 0.8856,
|
| 2978 |
+
"mean_token_accuracy": 0.7820924818515778,
|
| 2979 |
+
"num_tokens": 3651992.0,
|
| 2980 |
+
"step": 3300
|
| 2981 |
+
},
|
| 2982 |
+
{
|
| 2983 |
+
"epoch": 0.6669353213781987,
|
| 2984 |
+
"grad_norm": 11.0,
|
| 2985 |
+
"learning_rate": 1.5555107797703004e-05,
|
| 2986 |
+
"loss": 0.9789,
|
| 2987 |
+
"mean_token_accuracy": 0.7679969072341919,
|
| 2988 |
+
"num_tokens": 3663369.0,
|
| 2989 |
+
"step": 3310
|
| 2990 |
+
},
|
| 2991 |
+
{
|
| 2992 |
+
"epoch": 0.6689502317146887,
|
| 2993 |
+
"grad_norm": 10.625,
|
| 2994 |
+
"learning_rate": 1.5541675062126403e-05,
|
| 2995 |
+
"loss": 0.9536,
|
| 2996 |
+
"mean_token_accuracy": 0.7675111889839172,
|
| 2997 |
+
"num_tokens": 3674969.0,
|
| 2998 |
+
"step": 3320
|
| 2999 |
+
},
|
| 3000 |
+
{
|
| 3001 |
+
"epoch": 0.6709651420511787,
|
| 3002 |
+
"grad_norm": 10.375,
|
| 3003 |
+
"learning_rate": 1.5528242326549802e-05,
|
| 3004 |
+
"loss": 0.917,
|
| 3005 |
+
"mean_token_accuracy": 0.7766897320747376,
|
| 3006 |
+
"num_tokens": 3685794.0,
|
| 3007 |
+
"step": 3330
|
| 3008 |
+
},
|
| 3009 |
+
{
|
| 3010 |
+
"epoch": 0.6729800523876688,
|
| 3011 |
+
"grad_norm": 12.6875,
|
| 3012 |
+
"learning_rate": 1.5514809590973204e-05,
|
| 3013 |
+
"loss": 0.8213,
|
| 3014 |
+
"mean_token_accuracy": 0.798279982805252,
|
| 3015 |
+
"num_tokens": 3698384.0,
|
| 3016 |
+
"step": 3340
|
| 3017 |
+
},
|
| 3018 |
+
{
|
| 3019 |
+
"epoch": 0.6749949627241588,
|
| 3020 |
+
"grad_norm": 14.125,
|
| 3021 |
+
"learning_rate": 1.5501376855396603e-05,
|
| 3022 |
+
"loss": 0.9941,
|
| 3023 |
+
"mean_token_accuracy": 0.7723333060741424,
|
| 3024 |
+
"num_tokens": 3709110.0,
|
| 3025 |
+
"step": 3350
|
| 3026 |
+
},
|
| 3027 |
+
{
|
| 3028 |
+
"epoch": 0.6770098730606487,
|
| 3029 |
+
"grad_norm": 10.875,
|
| 3030 |
+
"learning_rate": 1.548794411982e-05,
|
| 3031 |
+
"loss": 0.9428,
|
| 3032 |
+
"mean_token_accuracy": 0.7793790519237518,
|
| 3033 |
+
"num_tokens": 3720500.0,
|
| 3034 |
+
"step": 3360
|
| 3035 |
+
},
|
| 3036 |
+
{
|
| 3037 |
+
"epoch": 0.6790247833971388,
|
| 3038 |
+
"grad_norm": 10.5625,
|
| 3039 |
+
"learning_rate": 1.5474511384243404e-05,
|
| 3040 |
+
"loss": 0.9055,
|
| 3041 |
+
"mean_token_accuracy": 0.7757111012935638,
|
| 3042 |
+
"num_tokens": 3733649.0,
|
| 3043 |
+
"step": 3370
|
| 3044 |
+
},
|
| 3045 |
+
{
|
| 3046 |
+
"epoch": 0.6810396937336288,
|
| 3047 |
+
"grad_norm": 11.625,
|
| 3048 |
+
"learning_rate": 1.5461078648666803e-05,
|
| 3049 |
+
"loss": 1.0494,
|
| 3050 |
+
"mean_token_accuracy": 0.748576694726944,
|
| 3051 |
+
"num_tokens": 3744082.0,
|
| 3052 |
+
"step": 3380
|
| 3053 |
+
},
|
| 3054 |
+
{
|
| 3055 |
+
"epoch": 0.6830546040701189,
|
| 3056 |
+
"grad_norm": 11.0,
|
| 3057 |
+
"learning_rate": 1.54476459130902e-05,
|
| 3058 |
+
"loss": 0.9308,
|
| 3059 |
+
"mean_token_accuracy": 0.7800273001194,
|
| 3060 |
+
"num_tokens": 3755970.0,
|
| 3061 |
+
"step": 3390
|
| 3062 |
+
},
|
| 3063 |
+
{
|
| 3064 |
+
"epoch": 0.6850695144066089,
|
| 3065 |
+
"grad_norm": 10.4375,
|
| 3066 |
+
"learning_rate": 1.54342131775136e-05,
|
| 3067 |
+
"loss": 0.8138,
|
| 3068 |
+
"mean_token_accuracy": 0.7963871121406555,
|
| 3069 |
+
"num_tokens": 3766720.0,
|
| 3070 |
+
"step": 3400
|
| 3071 |
+
},
|
| 3072 |
+
{
|
| 3073 |
+
"epoch": 0.6870844247430989,
|
| 3074 |
+
"grad_norm": 11.5625,
|
| 3075 |
+
"learning_rate": 1.5420780441937003e-05,
|
| 3076 |
+
"loss": 0.8504,
|
| 3077 |
+
"mean_token_accuracy": 0.7921045780181885,
|
| 3078 |
+
"num_tokens": 3777338.0,
|
| 3079 |
+
"step": 3410
|
| 3080 |
+
},
|
| 3081 |
+
{
|
| 3082 |
+
"epoch": 0.689099335079589,
|
| 3083 |
+
"grad_norm": 11.625,
|
| 3084 |
+
"learning_rate": 1.54073477063604e-05,
|
| 3085 |
+
"loss": 0.9142,
|
| 3086 |
+
"mean_token_accuracy": 0.7711953699588776,
|
| 3087 |
+
"num_tokens": 3788475.0,
|
| 3088 |
+
"step": 3420
|
| 3089 |
+
},
|
| 3090 |
+
{
|
| 3091 |
+
"epoch": 0.691114245416079,
|
| 3092 |
+
"grad_norm": 11.1875,
|
| 3093 |
+
"learning_rate": 1.53939149707838e-05,
|
| 3094 |
+
"loss": 0.9957,
|
| 3095 |
+
"mean_token_accuracy": 0.7668613314628601,
|
| 3096 |
+
"num_tokens": 3800222.0,
|
| 3097 |
+
"step": 3430
|
| 3098 |
+
},
|
| 3099 |
+
{
|
| 3100 |
+
"epoch": 0.6931291557525691,
|
| 3101 |
+
"grad_norm": 13.125,
|
| 3102 |
+
"learning_rate": 1.5380482235207202e-05,
|
| 3103 |
+
"loss": 0.8789,
|
| 3104 |
+
"mean_token_accuracy": 0.7842870116233825,
|
| 3105 |
+
"num_tokens": 3811363.0,
|
| 3106 |
+
"step": 3440
|
| 3107 |
+
},
|
| 3108 |
+
{
|
| 3109 |
+
"epoch": 0.695144066089059,
|
| 3110 |
+
"grad_norm": 14.4375,
|
| 3111 |
+
"learning_rate": 1.53670494996306e-05,
|
| 3112 |
+
"loss": 0.7952,
|
| 3113 |
+
"mean_token_accuracy": 0.8054643094539642,
|
| 3114 |
+
"num_tokens": 3821569.0,
|
| 3115 |
+
"step": 3450
|
| 3116 |
+
},
|
| 3117 |
+
{
|
| 3118 |
+
"epoch": 0.697158976425549,
|
| 3119 |
+
"grad_norm": 9.6875,
|
| 3120 |
+
"learning_rate": 1.5353616764054e-05,
|
| 3121 |
+
"loss": 0.8705,
|
| 3122 |
+
"mean_token_accuracy": 0.7873030543327332,
|
| 3123 |
+
"num_tokens": 3833270.0,
|
| 3124 |
+
"step": 3460
|
| 3125 |
+
},
|
| 3126 |
+
{
|
| 3127 |
+
"epoch": 0.6991738867620391,
|
| 3128 |
+
"grad_norm": 10.6875,
|
| 3129 |
+
"learning_rate": 1.53401840284774e-05,
|
| 3130 |
+
"loss": 0.9286,
|
| 3131 |
+
"mean_token_accuracy": 0.7691307544708252,
|
| 3132 |
+
"num_tokens": 3844114.0,
|
| 3133 |
+
"step": 3470
|
| 3134 |
+
},
|
| 3135 |
+
{
|
| 3136 |
+
"epoch": 0.7011887970985291,
|
| 3137 |
+
"grad_norm": 11.4375,
|
| 3138 |
+
"learning_rate": 1.53267512929008e-05,
|
| 3139 |
+
"loss": 0.993,
|
| 3140 |
+
"mean_token_accuracy": 0.7620323598384857,
|
| 3141 |
+
"num_tokens": 3856040.0,
|
| 3142 |
+
"step": 3480
|
| 3143 |
+
},
|
| 3144 |
+
{
|
| 3145 |
+
"epoch": 0.7032037074350191,
|
| 3146 |
+
"grad_norm": 11.5,
|
| 3147 |
+
"learning_rate": 1.53133185573242e-05,
|
| 3148 |
+
"loss": 0.8668,
|
| 3149 |
+
"mean_token_accuracy": 0.7913073658943176,
|
| 3150 |
+
"num_tokens": 3867207.0,
|
| 3151 |
+
"step": 3490
|
| 3152 |
+
},
|
| 3153 |
+
{
|
| 3154 |
+
"epoch": 0.7052186177715092,
|
| 3155 |
+
"grad_norm": 9.25,
|
| 3156 |
+
"learning_rate": 1.52998858217476e-05,
|
| 3157 |
+
"loss": 0.8715,
|
| 3158 |
+
"mean_token_accuracy": 0.7891253709793091,
|
| 3159 |
+
"num_tokens": 3879065.0,
|
| 3160 |
+
"step": 3500
|
| 3161 |
}
|
| 3162 |
],
|
| 3163 |
"logging_steps": 10,
|
|
|
|
| 3177 |
"attributes": {}
|
| 3178 |
}
|
| 3179 |
},
|
| 3180 |
+
"total_flos": 4699418269335552.0,
|
| 3181 |
"train_batch_size": 8,
|
| 3182 |
"trial_name": null,
|
| 3183 |
"trial_params": null
|