Training in progress, step 17000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebeffb2c037b50b65a6c0dee470a06e80bc04cb18fc25d36c6c23ebbfb1bfdb7
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56670a00c0a6655472a5df0bad61f805ef42230ce33d41f768bab9a708635a97
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b383a5a228123d48b81ff62301f8c357c6f3a9cd7484f11e193f37bbe5162530
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2af187ec456db07cec83217e48a58a7d4609355155eba34a029dc1dd312e2a7
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2856,11 +2856,189 @@
|
|
| 2856 |
"eval_steps_per_second": 18.912,
|
| 2857 |
"num_input_tokens_seen": 16777216000,
|
| 2858 |
"step": 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2859 |
}
|
| 2860 |
],
|
| 2861 |
"logging_steps": 50,
|
| 2862 |
"max_steps": 200000,
|
| 2863 |
-
"num_input_tokens_seen":
|
| 2864 |
"num_train_epochs": 5,
|
| 2865 |
"save_steps": 1000,
|
| 2866 |
"stateful_callbacks": {
|
|
@@ -2875,7 +3053,7 @@
|
|
| 2875 |
"attributes": {}
|
| 2876 |
}
|
| 2877 |
},
|
| 2878 |
-
"total_flos":
|
| 2879 |
"train_batch_size": 64,
|
| 2880 |
"trial_name": null,
|
| 2881 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.37342273512933194,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 17000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2856 |
"eval_steps_per_second": 18.912,
|
| 2857 |
"num_input_tokens_seen": 16777216000,
|
| 2858 |
"step": 16000
|
| 2859 |
+
},
|
| 2860 |
+
{
|
| 2861 |
+
"epoch": 0.35255499404857515,
|
| 2862 |
+
"grad_norm": 0.1446143537759781,
|
| 2863 |
+
"learning_rate": 0.001,
|
| 2864 |
+
"loss": 2.7837,
|
| 2865 |
+
"num_input_tokens_seen": 16829644800,
|
| 2866 |
+
"step": 16050
|
| 2867 |
+
},
|
| 2868 |
+
{
|
| 2869 |
+
"epoch": 0.3536532962107203,
|
| 2870 |
+
"grad_norm": 0.12466421723365784,
|
| 2871 |
+
"learning_rate": 0.001,
|
| 2872 |
+
"loss": 2.7808,
|
| 2873 |
+
"num_input_tokens_seen": 16882073600,
|
| 2874 |
+
"step": 16100
|
| 2875 |
+
},
|
| 2876 |
+
{
|
| 2877 |
+
"epoch": 0.35475159837286535,
|
| 2878 |
+
"grad_norm": 0.13154324889183044,
|
| 2879 |
+
"learning_rate": 0.001,
|
| 2880 |
+
"loss": 2.7608,
|
| 2881 |
+
"num_input_tokens_seen": 16934502400,
|
| 2882 |
+
"step": 16150
|
| 2883 |
+
},
|
| 2884 |
+
{
|
| 2885 |
+
"epoch": 0.3558499005350104,
|
| 2886 |
+
"grad_norm": 0.12929347157478333,
|
| 2887 |
+
"learning_rate": 0.001,
|
| 2888 |
+
"loss": 2.7599,
|
| 2889 |
+
"num_input_tokens_seen": 16986931200,
|
| 2890 |
+
"step": 16200
|
| 2891 |
+
},
|
| 2892 |
+
{
|
| 2893 |
+
"epoch": 0.35694820269715555,
|
| 2894 |
+
"grad_norm": 0.12805528938770294,
|
| 2895 |
+
"learning_rate": 0.001,
|
| 2896 |
+
"loss": 2.7562,
|
| 2897 |
+
"num_input_tokens_seen": 17039360000,
|
| 2898 |
+
"step": 16250
|
| 2899 |
+
},
|
| 2900 |
+
{
|
| 2901 |
+
"epoch": 0.3580465048593006,
|
| 2902 |
+
"grad_norm": 0.12885579466819763,
|
| 2903 |
+
"learning_rate": 0.001,
|
| 2904 |
+
"loss": 2.7498,
|
| 2905 |
+
"num_input_tokens_seen": 17091788800,
|
| 2906 |
+
"step": 16300
|
| 2907 |
+
},
|
| 2908 |
+
{
|
| 2909 |
+
"epoch": 0.35914480702144574,
|
| 2910 |
+
"grad_norm": 0.14422497153282166,
|
| 2911 |
+
"learning_rate": 0.001,
|
| 2912 |
+
"loss": 2.7518,
|
| 2913 |
+
"num_input_tokens_seen": 17144217600,
|
| 2914 |
+
"step": 16350
|
| 2915 |
+
},
|
| 2916 |
+
{
|
| 2917 |
+
"epoch": 0.3602431091835908,
|
| 2918 |
+
"grad_norm": 0.13284224271774292,
|
| 2919 |
+
"learning_rate": 0.001,
|
| 2920 |
+
"loss": 2.7453,
|
| 2921 |
+
"num_input_tokens_seen": 17196646400,
|
| 2922 |
+
"step": 16400
|
| 2923 |
+
},
|
| 2924 |
+
{
|
| 2925 |
+
"epoch": 0.3613414113457359,
|
| 2926 |
+
"grad_norm": 0.1408185362815857,
|
| 2927 |
+
"learning_rate": 0.001,
|
| 2928 |
+
"loss": 2.7422,
|
| 2929 |
+
"num_input_tokens_seen": 17249075200,
|
| 2930 |
+
"step": 16450
|
| 2931 |
+
},
|
| 2932 |
+
{
|
| 2933 |
+
"epoch": 0.362439713507881,
|
| 2934 |
+
"grad_norm": 0.1295713484287262,
|
| 2935 |
+
"learning_rate": 0.001,
|
| 2936 |
+
"loss": 2.7394,
|
| 2937 |
+
"num_input_tokens_seen": 17301504000,
|
| 2938 |
+
"step": 16500
|
| 2939 |
+
},
|
| 2940 |
+
{
|
| 2941 |
+
"epoch": 0.362439713507881,
|
| 2942 |
+
"eval_loss": 2.6431446075439453,
|
| 2943 |
+
"eval_runtime": 65.9239,
|
| 2944 |
+
"eval_samples_per_second": 75.845,
|
| 2945 |
+
"eval_steps_per_second": 18.961,
|
| 2946 |
+
"num_input_tokens_seen": 17301504000,
|
| 2947 |
+
"step": 16500
|
| 2948 |
+
},
|
| 2949 |
+
{
|
| 2950 |
+
"epoch": 0.3635380156700261,
|
| 2951 |
+
"grad_norm": 0.1245918869972229,
|
| 2952 |
+
"learning_rate": 0.001,
|
| 2953 |
+
"loss": 2.7434,
|
| 2954 |
+
"num_input_tokens_seen": 17353932800,
|
| 2955 |
+
"step": 16550
|
| 2956 |
+
},
|
| 2957 |
+
{
|
| 2958 |
+
"epoch": 0.3646363178321712,
|
| 2959 |
+
"grad_norm": 0.15865615010261536,
|
| 2960 |
+
"learning_rate": 0.001,
|
| 2961 |
+
"loss": 2.7378,
|
| 2962 |
+
"num_input_tokens_seen": 17406361600,
|
| 2963 |
+
"step": 16600
|
| 2964 |
+
},
|
| 2965 |
+
{
|
| 2966 |
+
"epoch": 0.3657346199943163,
|
| 2967 |
+
"grad_norm": 0.1391313523054123,
|
| 2968 |
+
"learning_rate": 0.001,
|
| 2969 |
+
"loss": 2.7415,
|
| 2970 |
+
"num_input_tokens_seen": 17458790400,
|
| 2971 |
+
"step": 16650
|
| 2972 |
+
},
|
| 2973 |
+
{
|
| 2974 |
+
"epoch": 0.3668329221564614,
|
| 2975 |
+
"grad_norm": 0.13604389131069183,
|
| 2976 |
+
"learning_rate": 0.001,
|
| 2977 |
+
"loss": 2.7394,
|
| 2978 |
+
"num_input_tokens_seen": 17511219200,
|
| 2979 |
+
"step": 16700
|
| 2980 |
+
},
|
| 2981 |
+
{
|
| 2982 |
+
"epoch": 0.3679312243186065,
|
| 2983 |
+
"grad_norm": 0.14926299452781677,
|
| 2984 |
+
"learning_rate": 0.001,
|
| 2985 |
+
"loss": 2.732,
|
| 2986 |
+
"num_input_tokens_seen": 17563648000,
|
| 2987 |
+
"step": 16750
|
| 2988 |
+
},
|
| 2989 |
+
{
|
| 2990 |
+
"epoch": 0.36902952648075155,
|
| 2991 |
+
"grad_norm": 0.12619628012180328,
|
| 2992 |
+
"learning_rate": 0.001,
|
| 2993 |
+
"loss": 2.7275,
|
| 2994 |
+
"num_input_tokens_seen": 17616076800,
|
| 2995 |
+
"step": 16800
|
| 2996 |
+
},
|
| 2997 |
+
{
|
| 2998 |
+
"epoch": 0.3701278286428967,
|
| 2999 |
+
"grad_norm": 0.1268402636051178,
|
| 3000 |
+
"learning_rate": 0.001,
|
| 3001 |
+
"loss": 2.7309,
|
| 3002 |
+
"num_input_tokens_seen": 17668505600,
|
| 3003 |
+
"step": 16850
|
| 3004 |
+
},
|
| 3005 |
+
{
|
| 3006 |
+
"epoch": 0.37122613080504174,
|
| 3007 |
+
"grad_norm": 0.1379624754190445,
|
| 3008 |
+
"learning_rate": 0.001,
|
| 3009 |
+
"loss": 2.7266,
|
| 3010 |
+
"num_input_tokens_seen": 17720934400,
|
| 3011 |
+
"step": 16900
|
| 3012 |
+
},
|
| 3013 |
+
{
|
| 3014 |
+
"epoch": 0.37232443296718687,
|
| 3015 |
+
"grad_norm": 0.1443478763103485,
|
| 3016 |
+
"learning_rate": 0.001,
|
| 3017 |
+
"loss": 2.7321,
|
| 3018 |
+
"num_input_tokens_seen": 17773363200,
|
| 3019 |
+
"step": 16950
|
| 3020 |
+
},
|
| 3021 |
+
{
|
| 3022 |
+
"epoch": 0.37342273512933194,
|
| 3023 |
+
"grad_norm": 0.15214091539382935,
|
| 3024 |
+
"learning_rate": 0.001,
|
| 3025 |
+
"loss": 2.7284,
|
| 3026 |
+
"num_input_tokens_seen": 17825792000,
|
| 3027 |
+
"step": 17000
|
| 3028 |
+
},
|
| 3029 |
+
{
|
| 3030 |
+
"epoch": 0.37342273512933194,
|
| 3031 |
+
"eval_loss": 2.63478946685791,
|
| 3032 |
+
"eval_runtime": 65.141,
|
| 3033 |
+
"eval_samples_per_second": 76.757,
|
| 3034 |
+
"eval_steps_per_second": 19.189,
|
| 3035 |
+
"num_input_tokens_seen": 17825792000,
|
| 3036 |
+
"step": 17000
|
| 3037 |
}
|
| 3038 |
],
|
| 3039 |
"logging_steps": 50,
|
| 3040 |
"max_steps": 200000,
|
| 3041 |
+
"num_input_tokens_seen": 17825792000,
|
| 3042 |
"num_train_epochs": 5,
|
| 3043 |
"save_steps": 1000,
|
| 3044 |
"stateful_callbacks": {
|
|
|
|
| 3053 |
"attributes": {}
|
| 3054 |
}
|
| 3055 |
},
|
| 3056 |
+
"total_flos": 1.0151919171403776e+19,
|
| 3057 |
"train_batch_size": 64,
|
| 3058 |
"trial_name": null,
|
| 3059 |
"trial_params": null
|