Training in progress, step 3270, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2384234968
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b681f70180e1b9b225d43794577d9735c1e90ae1f568ab2e1fb38b668291955
|
| 3 |
size 2384234968
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4768663315
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4baa6e79b984a80f604cda311b89dfe7d2e9a825e68647fc5d3797a8b813e2ea
|
| 3 |
size 4768663315
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:860913dca1e25255803c968661dca63cd03ec08cdee939cf5f78b0d42cbe6907
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2708,6 +2708,249 @@
|
|
| 2708 |
"mean_token_accuracy": 0.8454745601862669,
|
| 2709 |
"num_tokens": 24560640.0,
|
| 2710 |
"step": 3000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2711 |
}
|
| 2712 |
],
|
| 2713 |
"logging_steps": 10,
|
|
@@ -2722,12 +2965,12 @@
|
|
| 2722 |
"should_evaluate": false,
|
| 2723 |
"should_log": false,
|
| 2724 |
"should_save": true,
|
| 2725 |
-
"should_training_stop":
|
| 2726 |
},
|
| 2727 |
"attributes": {}
|
| 2728 |
}
|
| 2729 |
},
|
| 2730 |
-
"total_flos":
|
| 2731 |
"train_batch_size": 2,
|
| 2732 |
"trial_name": null,
|
| 2733 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 3270,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2708 |
"mean_token_accuracy": 0.8454745601862669,
|
| 2709 |
"num_tokens": 24560640.0,
|
| 2710 |
"step": 3000
|
| 2711 |
+
},
|
| 2712 |
+
{
|
| 2713 |
+
"epoch": 2.7620796510960632,
|
| 2714 |
+
"grad_norm": 1.08968985080719,
|
| 2715 |
+
"learning_rate": 8.868501529051989e-07,
|
| 2716 |
+
"loss": 0.1619,
|
| 2717 |
+
"mean_token_accuracy": 0.8165728993713856,
|
| 2718 |
+
"num_tokens": 24642560.0,
|
| 2719 |
+
"step": 3010
|
| 2720 |
+
},
|
| 2721 |
+
{
|
| 2722 |
+
"epoch": 2.7712613336393894,
|
| 2723 |
+
"grad_norm": 0.9702316522598267,
|
| 2724 |
+
"learning_rate": 8.528712198436969e-07,
|
| 2725 |
+
"loss": 0.1506,
|
| 2726 |
+
"mean_token_accuracy": 0.8149584170430899,
|
| 2727 |
+
"num_tokens": 24724480.0,
|
| 2728 |
+
"step": 3020
|
| 2729 |
+
},
|
| 2730 |
+
{
|
| 2731 |
+
"epoch": 2.7804430161827156,
|
| 2732 |
+
"grad_norm": 1.215406060218811,
|
| 2733 |
+
"learning_rate": 8.188922867821951e-07,
|
| 2734 |
+
"loss": 0.1273,
|
| 2735 |
+
"mean_token_accuracy": 0.8383683957159519,
|
| 2736 |
+
"num_tokens": 24806400.0,
|
| 2737 |
+
"step": 3030
|
| 2738 |
+
},
|
| 2739 |
+
{
|
| 2740 |
+
"epoch": 2.7896246987260414,
|
| 2741 |
+
"grad_norm": 1.3644214868545532,
|
| 2742 |
+
"learning_rate": 7.849133537206933e-07,
|
| 2743 |
+
"loss": 0.1361,
|
| 2744 |
+
"mean_token_accuracy": 0.823642372712493,
|
| 2745 |
+
"num_tokens": 24888320.0,
|
| 2746 |
+
"step": 3040
|
| 2747 |
+
},
|
| 2748 |
+
{
|
| 2749 |
+
"epoch": 2.7988063812693675,
|
| 2750 |
+
"grad_norm": 1.827764630317688,
|
| 2751 |
+
"learning_rate": 7.509344206591913e-07,
|
| 2752 |
+
"loss": 0.1394,
|
| 2753 |
+
"mean_token_accuracy": 0.8243884552270174,
|
| 2754 |
+
"num_tokens": 24970240.0,
|
| 2755 |
+
"step": 3050
|
| 2756 |
+
},
|
| 2757 |
+
{
|
| 2758 |
+
"epoch": 2.8079880638126937,
|
| 2759 |
+
"grad_norm": 1.2215831279754639,
|
| 2760 |
+
"learning_rate": 7.169554875976895e-07,
|
| 2761 |
+
"loss": 0.1345,
|
| 2762 |
+
"mean_token_accuracy": 0.8343688864260912,
|
| 2763 |
+
"num_tokens": 25052160.0,
|
| 2764 |
+
"step": 3060
|
| 2765 |
+
},
|
| 2766 |
+
{
|
| 2767 |
+
"epoch": 2.8171697463560195,
|
| 2768 |
+
"grad_norm": 1.2051235437393188,
|
| 2769 |
+
"learning_rate": 6.829765545361876e-07,
|
| 2770 |
+
"loss": 0.1448,
|
| 2771 |
+
"mean_token_accuracy": 0.8268346361815929,
|
| 2772 |
+
"num_tokens": 25134080.0,
|
| 2773 |
+
"step": 3070
|
| 2774 |
+
},
|
| 2775 |
+
{
|
| 2776 |
+
"epoch": 2.8263514288993457,
|
| 2777 |
+
"grad_norm": 1.358314037322998,
|
| 2778 |
+
"learning_rate": 6.489976214746857e-07,
|
| 2779 |
+
"loss": 0.1401,
|
| 2780 |
+
"mean_token_accuracy": 0.8394080217927694,
|
| 2781 |
+
"num_tokens": 25216000.0,
|
| 2782 |
+
"step": 3080
|
| 2783 |
+
},
|
| 2784 |
+
{
|
| 2785 |
+
"epoch": 2.835533111442672,
|
| 2786 |
+
"grad_norm": 1.5445815324783325,
|
| 2787 |
+
"learning_rate": 6.150186884131839e-07,
|
| 2788 |
+
"loss": 0.1444,
|
| 2789 |
+
"mean_token_accuracy": 0.8344789650291204,
|
| 2790 |
+
"num_tokens": 25297920.0,
|
| 2791 |
+
"step": 3090
|
| 2792 |
+
},
|
| 2793 |
+
{
|
| 2794 |
+
"epoch": 2.844714793985998,
|
| 2795 |
+
"grad_norm": 0.9232423305511475,
|
| 2796 |
+
"learning_rate": 5.81039755351682e-07,
|
| 2797 |
+
"loss": 0.1197,
|
| 2798 |
+
"mean_token_accuracy": 0.8397871796041727,
|
| 2799 |
+
"num_tokens": 25379840.0,
|
| 2800 |
+
"step": 3100
|
| 2801 |
+
},
|
| 2802 |
+
{
|
| 2803 |
+
"epoch": 2.853896476529324,
|
| 2804 |
+
"grad_norm": 1.2474477291107178,
|
| 2805 |
+
"learning_rate": 5.470608222901801e-07,
|
| 2806 |
+
"loss": 0.1358,
|
| 2807 |
+
"mean_token_accuracy": 0.830565071478486,
|
| 2808 |
+
"num_tokens": 25461760.0,
|
| 2809 |
+
"step": 3110
|
| 2810 |
+
},
|
| 2811 |
+
{
|
| 2812 |
+
"epoch": 2.86307815907265,
|
| 2813 |
+
"grad_norm": 1.3741815090179443,
|
| 2814 |
+
"learning_rate": 5.130818892286782e-07,
|
| 2815 |
+
"loss": 0.1367,
|
| 2816 |
+
"mean_token_accuracy": 0.822761744633317,
|
| 2817 |
+
"num_tokens": 25543680.0,
|
| 2818 |
+
"step": 3120
|
| 2819 |
+
},
|
| 2820 |
+
{
|
| 2821 |
+
"epoch": 2.872259841615976,
|
| 2822 |
+
"grad_norm": 0.8645684719085693,
|
| 2823 |
+
"learning_rate": 4.791029561671764e-07,
|
| 2824 |
+
"loss": 0.1414,
|
| 2825 |
+
"mean_token_accuracy": 0.8360322870314121,
|
| 2826 |
+
"num_tokens": 25625600.0,
|
| 2827 |
+
"step": 3130
|
| 2828 |
+
},
|
| 2829 |
+
{
|
| 2830 |
+
"epoch": 2.8814415241593023,
|
| 2831 |
+
"grad_norm": 1.3521939516067505,
|
| 2832 |
+
"learning_rate": 4.451240231056745e-07,
|
| 2833 |
+
"loss": 0.1368,
|
| 2834 |
+
"mean_token_accuracy": 0.8312010750174522,
|
| 2835 |
+
"num_tokens": 25707520.0,
|
| 2836 |
+
"step": 3140
|
| 2837 |
+
},
|
| 2838 |
+
{
|
| 2839 |
+
"epoch": 2.890623206702628,
|
| 2840 |
+
"grad_norm": 1.3353580236434937,
|
| 2841 |
+
"learning_rate": 4.111450900441726e-07,
|
| 2842 |
+
"loss": 0.1192,
|
| 2843 |
+
"mean_token_accuracy": 0.8368639908730984,
|
| 2844 |
+
"num_tokens": 25789440.0,
|
| 2845 |
+
"step": 3150
|
| 2846 |
+
},
|
| 2847 |
+
{
|
| 2848 |
+
"epoch": 2.8998048892459543,
|
| 2849 |
+
"grad_norm": 1.0142643451690674,
|
| 2850 |
+
"learning_rate": 3.7716615698267073e-07,
|
| 2851 |
+
"loss": 0.1391,
|
| 2852 |
+
"mean_token_accuracy": 0.8295865952968597,
|
| 2853 |
+
"num_tokens": 25871360.0,
|
| 2854 |
+
"step": 3160
|
| 2855 |
+
},
|
| 2856 |
+
{
|
| 2857 |
+
"epoch": 2.9089865717892804,
|
| 2858 |
+
"grad_norm": 1.3363066911697388,
|
| 2859 |
+
"learning_rate": 3.4318722392116895e-07,
|
| 2860 |
+
"loss": 0.15,
|
| 2861 |
+
"mean_token_accuracy": 0.8194104671478272,
|
| 2862 |
+
"num_tokens": 25953280.0,
|
| 2863 |
+
"step": 3170
|
| 2864 |
+
},
|
| 2865 |
+
{
|
| 2866 |
+
"epoch": 2.918168254332606,
|
| 2867 |
+
"grad_norm": 1.1663857698440552,
|
| 2868 |
+
"learning_rate": 3.09208290859667e-07,
|
| 2869 |
+
"loss": 0.1391,
|
| 2870 |
+
"mean_token_accuracy": 0.8333170261234045,
|
| 2871 |
+
"num_tokens": 26035200.0,
|
| 2872 |
+
"step": 3180
|
| 2873 |
+
},
|
| 2874 |
+
{
|
| 2875 |
+
"epoch": 2.9273499368759324,
|
| 2876 |
+
"grad_norm": 1.1857463121414185,
|
| 2877 |
+
"learning_rate": 2.752293577981652e-07,
|
| 2878 |
+
"loss": 0.1508,
|
| 2879 |
+
"mean_token_accuracy": 0.8284246563911438,
|
| 2880 |
+
"num_tokens": 26117120.0,
|
| 2881 |
+
"step": 3190
|
| 2882 |
+
},
|
| 2883 |
+
{
|
| 2884 |
+
"epoch": 2.9365316194192586,
|
| 2885 |
+
"grad_norm": 1.3892704248428345,
|
| 2886 |
+
"learning_rate": 2.412504247366633e-07,
|
| 2887 |
+
"loss": 0.1399,
|
| 2888 |
+
"mean_token_accuracy": 0.8355797432363034,
|
| 2889 |
+
"num_tokens": 26199040.0,
|
| 2890 |
+
"step": 3200
|
| 2891 |
+
},
|
| 2892 |
+
{
|
| 2893 |
+
"epoch": 2.9457133019625847,
|
| 2894 |
+
"grad_norm": 1.4766535758972168,
|
| 2895 |
+
"learning_rate": 2.0727149167516142e-07,
|
| 2896 |
+
"loss": 0.1713,
|
| 2897 |
+
"mean_token_accuracy": 0.8024461850523948,
|
| 2898 |
+
"num_tokens": 26280960.0,
|
| 2899 |
+
"step": 3210
|
| 2900 |
+
},
|
| 2901 |
+
{
|
| 2902 |
+
"epoch": 2.954894984505911,
|
| 2903 |
+
"grad_norm": 1.0841213464736938,
|
| 2904 |
+
"learning_rate": 1.7329255861365954e-07,
|
| 2905 |
+
"loss": 0.1464,
|
| 2906 |
+
"mean_token_accuracy": 0.822847356274724,
|
| 2907 |
+
"num_tokens": 26362880.0,
|
| 2908 |
+
"step": 3220
|
| 2909 |
+
},
|
| 2910 |
+
{
|
| 2911 |
+
"epoch": 2.9640766670492367,
|
| 2912 |
+
"grad_norm": 1.0091631412506104,
|
| 2913 |
+
"learning_rate": 1.3931362555215769e-07,
|
| 2914 |
+
"loss": 0.1402,
|
| 2915 |
+
"mean_token_accuracy": 0.8205234851688147,
|
| 2916 |
+
"num_tokens": 26444800.0,
|
| 2917 |
+
"step": 3230
|
| 2918 |
+
},
|
| 2919 |
+
{
|
| 2920 |
+
"epoch": 2.973258349592563,
|
| 2921 |
+
"grad_norm": 1.2437331676483154,
|
| 2922 |
+
"learning_rate": 1.053346924906558e-07,
|
| 2923 |
+
"loss": 0.1487,
|
| 2924 |
+
"mean_token_accuracy": 0.8193126212805509,
|
| 2925 |
+
"num_tokens": 26526720.0,
|
| 2926 |
+
"step": 3240
|
| 2927 |
+
},
|
| 2928 |
+
{
|
| 2929 |
+
"epoch": 2.982440032135889,
|
| 2930 |
+
"grad_norm": 1.2033171653747559,
|
| 2931 |
+
"learning_rate": 7.135575942915393e-08,
|
| 2932 |
+
"loss": 0.1081,
|
| 2933 |
+
"mean_token_accuracy": 0.8560909986495971,
|
| 2934 |
+
"num_tokens": 26608640.0,
|
| 2935 |
+
"step": 3250
|
| 2936 |
+
},
|
| 2937 |
+
{
|
| 2938 |
+
"epoch": 2.991621714679215,
|
| 2939 |
+
"grad_norm": 1.167435884475708,
|
| 2940 |
+
"learning_rate": 3.737682636765206e-08,
|
| 2941 |
+
"loss": 0.1496,
|
| 2942 |
+
"mean_token_accuracy": 0.8093933459371329,
|
| 2943 |
+
"num_tokens": 26690560.0,
|
| 2944 |
+
"step": 3260
|
| 2945 |
+
},
|
| 2946 |
+
{
|
| 2947 |
+
"epoch": 3.0,
|
| 2948 |
+
"grad_norm": 7.105273246765137,
|
| 2949 |
+
"learning_rate": 3.3978933061501875e-09,
|
| 2950 |
+
"loss": 0.1229,
|
| 2951 |
+
"mean_token_accuracy": 0.8537249038480732,
|
| 2952 |
+
"num_tokens": 26764800.0,
|
| 2953 |
+
"step": 3270
|
| 2954 |
}
|
| 2955 |
],
|
| 2956 |
"logging_steps": 10,
|
|
|
|
| 2965 |
"should_evaluate": false,
|
| 2966 |
"should_log": false,
|
| 2967 |
"should_save": true,
|
| 2968 |
+
"should_training_stop": true
|
| 2969 |
},
|
| 2970 |
"attributes": {}
|
| 2971 |
}
|
| 2972 |
},
|
| 2973 |
+
"total_flos": 7.07341401980928e+16,
|
| 2974 |
"train_batch_size": 2,
|
| 2975 |
"trial_name": null,
|
| 2976 |
"trial_params": null
|