Training in progress, step 14000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 150625560
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0579f8b01bd92a4b6d4d9542187f9f6be5d525493ee4cacf89313462b0d4fc29
|
| 3 |
size 150625560
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1ac4e5f1a091d05231fad9fd4f9941afbf6737a4f9256414d7439dd21637791
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dac96a69b6625532fa7a1849a782b63a79e8d1b28e764bc8297e354d748f16c9
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:081dc59c3c452b8ce89bfce5eae0952bf765aeed7903bbba40be0fb195d20006
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2842,11 +2842,229 @@
|
|
| 2842 |
"eval_steps_per_second": 20.539,
|
| 2843 |
"num_input_tokens_seen": 6280158129,
|
| 2844 |
"step": 13000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2845 |
}
|
| 2846 |
],
|
| 2847 |
"logging_steps": 50,
|
| 2848 |
"max_steps": 16568,
|
| 2849 |
-
"num_input_tokens_seen":
|
| 2850 |
"num_train_epochs": 4,
|
| 2851 |
"save_steps": 1000,
|
| 2852 |
"stateful_callbacks": {
|
|
@@ -2861,7 +3079,7 @@
|
|
| 2861 |
"attributes": {}
|
| 2862 |
}
|
| 2863 |
},
|
| 2864 |
-
"total_flos": 1.
|
| 2865 |
"train_batch_size": 16,
|
| 2866 |
"trial_name": null,
|
| 2867 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.3792567557372846,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 14000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2842 |
"eval_steps_per_second": 20.539,
|
| 2843 |
"num_input_tokens_seen": 6280158129,
|
| 2844 |
"step": 13000
|
| 2845 |
+
},
|
| 2846 |
+
{
|
| 2847 |
+
"epoch": 3.149916260542873,
|
| 2848 |
+
"grad_norm": 0.25390625,
|
| 2849 |
+
"learning_rate": 1.3271465218047383e-05,
|
| 2850 |
+
"loss": 2.0974,
|
| 2851 |
+
"mean_token_accuracy": 0.5548031070828437,
|
| 2852 |
+
"num_input_tokens_seen": 6304365713,
|
| 2853 |
+
"num_tokens": 2656912031.0,
|
| 2854 |
+
"step": 13050
|
| 2855 |
+
},
|
| 2856 |
+
{
|
| 2857 |
+
"epoch": 3.1619868129215263,
|
| 2858 |
+
"grad_norm": 0.24609375,
|
| 2859 |
+
"learning_rate": 1.3082842915346311e-05,
|
| 2860 |
+
"loss": 2.0981,
|
| 2861 |
+
"mean_token_accuracy": 0.5543636172637343,
|
| 2862 |
+
"num_input_tokens_seen": 6328561217,
|
| 2863 |
+
"num_tokens": 2667181848.0,
|
| 2864 |
+
"step": 13100
|
| 2865 |
+
},
|
| 2866 |
+
{
|
| 2867 |
+
"epoch": 3.1740573653001793,
|
| 2868 |
+
"grad_norm": 0.236328125,
|
| 2869 |
+
"learning_rate": 1.2894220612645238e-05,
|
| 2870 |
+
"loss": 2.093,
|
| 2871 |
+
"mean_token_accuracy": 0.5551713344082236,
|
| 2872 |
+
"num_input_tokens_seen": 6352657569,
|
| 2873 |
+
"num_tokens": 2677374089.0,
|
| 2874 |
+
"step": 13150
|
| 2875 |
+
},
|
| 2876 |
+
{
|
| 2877 |
+
"epoch": 3.186127917678833,
|
| 2878 |
+
"grad_norm": 0.267578125,
|
| 2879 |
+
"learning_rate": 1.2705598309944169e-05,
|
| 2880 |
+
"loss": 2.084,
|
| 2881 |
+
"mean_token_accuracy": 0.5568741805478931,
|
| 2882 |
+
"num_input_tokens_seen": 6376750801,
|
| 2883 |
+
"num_tokens": 2687517529.0,
|
| 2884 |
+
"step": 13200
|
| 2885 |
+
},
|
| 2886 |
+
{
|
| 2887 |
+
"epoch": 3.198198470057486,
|
| 2888 |
+
"grad_norm": 0.2578125,
|
| 2889 |
+
"learning_rate": 1.2516976007243097e-05,
|
| 2890 |
+
"loss": 2.0985,
|
| 2891 |
+
"mean_token_accuracy": 0.5545465455949307,
|
| 2892 |
+
"num_input_tokens_seen": 6400738145,
|
| 2893 |
+
"num_tokens": 2697615714.0,
|
| 2894 |
+
"step": 13250
|
| 2895 |
+
},
|
| 2896 |
+
{
|
| 2897 |
+
"epoch": 3.2102690224361394,
|
| 2898 |
+
"grad_norm": 0.2451171875,
|
| 2899 |
+
"learning_rate": 1.2328353704542026e-05,
|
| 2900 |
+
"loss": 2.0969,
|
| 2901 |
+
"mean_token_accuracy": 0.5544571406021714,
|
| 2902 |
+
"num_input_tokens_seen": 6424909057,
|
| 2903 |
+
"num_tokens": 2707784293.0,
|
| 2904 |
+
"step": 13300
|
| 2905 |
+
},
|
| 2906 |
+
{
|
| 2907 |
+
"epoch": 3.2223395748147925,
|
| 2908 |
+
"grad_norm": 0.302734375,
|
| 2909 |
+
"learning_rate": 1.2139731401840953e-05,
|
| 2910 |
+
"loss": 2.0932,
|
| 2911 |
+
"mean_token_accuracy": 0.5548350306227803,
|
| 2912 |
+
"num_input_tokens_seen": 6449111825,
|
| 2913 |
+
"num_tokens": 2717984302.0,
|
| 2914 |
+
"step": 13350
|
| 2915 |
+
},
|
| 2916 |
+
{
|
| 2917 |
+
"epoch": 3.2344101271934456,
|
| 2918 |
+
"grad_norm": 0.228515625,
|
| 2919 |
+
"learning_rate": 1.1951109099139883e-05,
|
| 2920 |
+
"loss": 2.1012,
|
| 2921 |
+
"mean_token_accuracy": 0.5535725425183773,
|
| 2922 |
+
"num_input_tokens_seen": 6473257953,
|
| 2923 |
+
"num_tokens": 2728233467.0,
|
| 2924 |
+
"step": 13400
|
| 2925 |
+
},
|
| 2926 |
+
{
|
| 2927 |
+
"epoch": 3.246480679572099,
|
| 2928 |
+
"grad_norm": 0.2578125,
|
| 2929 |
+
"learning_rate": 1.1762486796438812e-05,
|
| 2930 |
+
"loss": 2.0985,
|
| 2931 |
+
"mean_token_accuracy": 0.5541856496781111,
|
| 2932 |
+
"num_input_tokens_seen": 6497464865,
|
| 2933 |
+
"num_tokens": 2738326366.0,
|
| 2934 |
+
"step": 13450
|
| 2935 |
+
},
|
| 2936 |
+
{
|
| 2937 |
+
"epoch": 3.258551231950752,
|
| 2938 |
+
"grad_norm": 0.2412109375,
|
| 2939 |
+
"learning_rate": 1.157386449373774e-05,
|
| 2940 |
+
"loss": 2.0911,
|
| 2941 |
+
"num_input_tokens_seen": 6521634753,
|
| 2942 |
+
"step": 13500
|
| 2943 |
+
},
|
| 2944 |
+
{
|
| 2945 |
+
"epoch": 3.258551231950752,
|
| 2946 |
+
"eval_loss": 1.9680596590042114,
|
| 2947 |
+
"eval_mean_token_accuracy": 0.5785238554199033,
|
| 2948 |
+
"eval_num_tokens": 2748403907.0,
|
| 2949 |
+
"eval_runtime": 130.2372,
|
| 2950 |
+
"eval_samples_per_second": 82.25,
|
| 2951 |
+
"eval_steps_per_second": 20.562,
|
| 2952 |
+
"num_input_tokens_seen": 6521634753,
|
| 2953 |
+
"step": 13500
|
| 2954 |
+
},
|
| 2955 |
+
{
|
| 2956 |
+
"epoch": 3.270621784329405,
|
| 2957 |
+
"grad_norm": 0.251953125,
|
| 2958 |
+
"learning_rate": 1.1385242191036669e-05,
|
| 2959 |
+
"loss": 2.0844,
|
| 2960 |
+
"mean_token_accuracy": 0.5562084444984794,
|
| 2961 |
+
"num_input_tokens_seen": 6545823777,
|
| 2962 |
+
"num_tokens": 2758638062.0,
|
| 2963 |
+
"step": 13550
|
| 2964 |
+
},
|
| 2965 |
+
{
|
| 2966 |
+
"epoch": 3.2826923367080587,
|
| 2967 |
+
"grad_norm": 0.24609375,
|
| 2968 |
+
"learning_rate": 1.1196619888335598e-05,
|
| 2969 |
+
"loss": 2.089,
|
| 2970 |
+
"mean_token_accuracy": 0.5565486250445246,
|
| 2971 |
+
"num_input_tokens_seen": 6569949777,
|
| 2972 |
+
"num_tokens": 2768698376.0,
|
| 2973 |
+
"step": 13600
|
| 2974 |
+
},
|
| 2975 |
+
{
|
| 2976 |
+
"epoch": 3.2947628890867118,
|
| 2977 |
+
"grad_norm": 0.2431640625,
|
| 2978 |
+
"learning_rate": 1.1007997585634526e-05,
|
| 2979 |
+
"loss": 2.0915,
|
| 2980 |
+
"mean_token_accuracy": 0.5548499751463533,
|
| 2981 |
+
"num_input_tokens_seen": 6593997425,
|
| 2982 |
+
"num_tokens": 2778806953.0,
|
| 2983 |
+
"step": 13650
|
| 2984 |
+
},
|
| 2985 |
+
{
|
| 2986 |
+
"epoch": 3.306833441465365,
|
| 2987 |
+
"grad_norm": 0.330078125,
|
| 2988 |
+
"learning_rate": 1.0819375282933455e-05,
|
| 2989 |
+
"loss": 2.0875,
|
| 2990 |
+
"mean_token_accuracy": 0.5560770154371858,
|
| 2991 |
+
"num_input_tokens_seen": 6618153121,
|
| 2992 |
+
"num_tokens": 2789046249.0,
|
| 2993 |
+
"step": 13700
|
| 2994 |
+
},
|
| 2995 |
+
{
|
| 2996 |
+
"epoch": 3.3189039938440184,
|
| 2997 |
+
"grad_norm": 0.26171875,
|
| 2998 |
+
"learning_rate": 1.0630752980232384e-05,
|
| 2999 |
+
"loss": 2.0974,
|
| 3000 |
+
"mean_token_accuracy": 0.5540758088976144,
|
| 3001 |
+
"num_input_tokens_seen": 6642228561,
|
| 3002 |
+
"num_tokens": 2799134100.0,
|
| 3003 |
+
"step": 13750
|
| 3004 |
+
},
|
| 3005 |
+
{
|
| 3006 |
+
"epoch": 3.3309745462226714,
|
| 3007 |
+
"grad_norm": 0.2578125,
|
| 3008 |
+
"learning_rate": 1.0442130677531312e-05,
|
| 3009 |
+
"loss": 2.0837,
|
| 3010 |
+
"mean_token_accuracy": 0.5564264697581529,
|
| 3011 |
+
"num_input_tokens_seen": 6666487089,
|
| 3012 |
+
"num_tokens": 2809333203.0,
|
| 3013 |
+
"step": 13800
|
| 3014 |
+
},
|
| 3015 |
+
{
|
| 3016 |
+
"epoch": 3.343045098601325,
|
| 3017 |
+
"grad_norm": 0.271484375,
|
| 3018 |
+
"learning_rate": 1.025350837483024e-05,
|
| 3019 |
+
"loss": 2.0804,
|
| 3020 |
+
"mean_token_accuracy": 0.5564664682373405,
|
| 3021 |
+
"num_input_tokens_seen": 6690592209,
|
| 3022 |
+
"num_tokens": 2819507621.0,
|
| 3023 |
+
"step": 13850
|
| 3024 |
+
},
|
| 3025 |
+
{
|
| 3026 |
+
"epoch": 3.355115650979978,
|
| 3027 |
+
"grad_norm": 0.2578125,
|
| 3028 |
+
"learning_rate": 1.006488607212917e-05,
|
| 3029 |
+
"loss": 2.0875,
|
| 3030 |
+
"mean_token_accuracy": 0.5563617146387696,
|
| 3031 |
+
"num_input_tokens_seen": 6714782033,
|
| 3032 |
+
"num_tokens": 2829715451.0,
|
| 3033 |
+
"step": 13900
|
| 3034 |
+
},
|
| 3035 |
+
{
|
| 3036 |
+
"epoch": 3.367186203358631,
|
| 3037 |
+
"grad_norm": 0.26171875,
|
| 3038 |
+
"learning_rate": 9.876263769428096e-06,
|
| 3039 |
+
"loss": 2.1015,
|
| 3040 |
+
"mean_token_accuracy": 0.5533242063969374,
|
| 3041 |
+
"num_input_tokens_seen": 6738954721,
|
| 3042 |
+
"num_tokens": 2839876349.0,
|
| 3043 |
+
"step": 13950
|
| 3044 |
+
},
|
| 3045 |
+
{
|
| 3046 |
+
"epoch": 3.3792567557372846,
|
| 3047 |
+
"grad_norm": 0.2578125,
|
| 3048 |
+
"learning_rate": 9.687641466727027e-06,
|
| 3049 |
+
"loss": 2.1018,
|
| 3050 |
+
"num_input_tokens_seen": 6763271617,
|
| 3051 |
+
"step": 14000
|
| 3052 |
+
},
|
| 3053 |
+
{
|
| 3054 |
+
"epoch": 3.3792567557372846,
|
| 3055 |
+
"eval_loss": 1.9681081771850586,
|
| 3056 |
+
"eval_mean_token_accuracy": 0.5785279828634967,
|
| 3057 |
+
"eval_num_tokens": 2850147053.0,
|
| 3058 |
+
"eval_runtime": 131.6179,
|
| 3059 |
+
"eval_samples_per_second": 81.387,
|
| 3060 |
+
"eval_steps_per_second": 20.347,
|
| 3061 |
+
"num_input_tokens_seen": 6763271617,
|
| 3062 |
+
"step": 14000
|
| 3063 |
}
|
| 3064 |
],
|
| 3065 |
"logging_steps": 50,
|
| 3066 |
"max_steps": 16568,
|
| 3067 |
+
"num_input_tokens_seen": 6763271617,
|
| 3068 |
"num_train_epochs": 4,
|
| 3069 |
"save_steps": 1000,
|
| 3070 |
"stateful_callbacks": {
|
|
|
|
| 3079 |
"attributes": {}
|
| 3080 |
}
|
| 3081 |
},
|
| 3082 |
+
"total_flos": 1.809241167078482e+18,
|
| 3083 |
"train_batch_size": 16,
|
| 3084 |
"trial_name": null,
|
| 3085 |
"trial_params": null
|