Training in progress, step 15000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 150625560
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15f637ff72e852c00df336464cba31267a78c2fec942618a4cf3dbc081150cb8
|
| 3 |
size 150625560
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11255a9366d03d2ecf115313602ca401e81860858d4e1ecad341feef41b0e95b
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea828a56e17bf773dc8e4fa2c22d13619b805c6b9321028dd494ff57e5daf8e6
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1819c72414dd202fc7a5b387187559436ac1d66f4c4de3f13c18065ffbdf0216
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3060,11 +3060,229 @@
|
|
| 3060 |
"eval_steps_per_second": 20.347,
|
| 3061 |
"num_input_tokens_seen": 6763271617,
|
| 3062 |
"step": 14000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3063 |
}
|
| 3064 |
],
|
| 3065 |
"logging_steps": 50,
|
| 3066 |
"max_steps": 16568,
|
| 3067 |
-
"num_input_tokens_seen":
|
| 3068 |
"num_train_epochs": 4,
|
| 3069 |
"save_steps": 1000,
|
| 3070 |
"stateful_callbacks": {
|
|
@@ -3079,7 +3297,7 @@
|
|
| 3079 |
"attributes": {}
|
| 3080 |
}
|
| 3081 |
},
|
| 3082 |
-
"total_flos": 1.
|
| 3083 |
"train_batch_size": 16,
|
| 3084 |
"trial_name": null,
|
| 3085 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.620667803310349,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 15000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3060 |
"eval_steps_per_second": 20.347,
|
| 3061 |
"num_input_tokens_seen": 6763271617,
|
| 3062 |
"step": 14000
|
| 3063 |
+
},
|
| 3064 |
+
{
|
| 3065 |
+
"epoch": 3.3913273081159376,
|
| 3066 |
+
"grad_norm": 0.25,
|
| 3067 |
+
"learning_rate": 9.499019164025955e-06,
|
| 3068 |
+
"loss": 2.0975,
|
| 3069 |
+
"mean_token_accuracy": 0.5536782286874949,
|
| 3070 |
+
"num_input_tokens_seen": 6787391841,
|
| 3071 |
+
"num_tokens": 2860344977.0,
|
| 3072 |
+
"step": 14050
|
| 3073 |
+
},
|
| 3074 |
+
{
|
| 3075 |
+
"epoch": 3.4033978604945907,
|
| 3076 |
+
"grad_norm": 0.25,
|
| 3077 |
+
"learning_rate": 9.310396861324884e-06,
|
| 3078 |
+
"loss": 2.1022,
|
| 3079 |
+
"mean_token_accuracy": 0.5538700968772173,
|
| 3080 |
+
"num_input_tokens_seen": 6811630961,
|
| 3081 |
+
"num_tokens": 2870526506.0,
|
| 3082 |
+
"step": 14100
|
| 3083 |
+
},
|
| 3084 |
+
{
|
| 3085 |
+
"epoch": 3.415468412873244,
|
| 3086 |
+
"grad_norm": 0.2431640625,
|
| 3087 |
+
"learning_rate": 9.121774558623813e-06,
|
| 3088 |
+
"loss": 2.0934,
|
| 3089 |
+
"mean_token_accuracy": 0.5550757900252938,
|
| 3090 |
+
"num_input_tokens_seen": 6835811825,
|
| 3091 |
+
"num_tokens": 2880722898.0,
|
| 3092 |
+
"step": 14150
|
| 3093 |
+
},
|
| 3094 |
+
{
|
| 3095 |
+
"epoch": 3.4275389652518973,
|
| 3096 |
+
"grad_norm": 0.2578125,
|
| 3097 |
+
"learning_rate": 8.93315225592274e-06,
|
| 3098 |
+
"loss": 2.0875,
|
| 3099 |
+
"mean_token_accuracy": 0.5558257311582565,
|
| 3100 |
+
"num_input_tokens_seen": 6859918049,
|
| 3101 |
+
"num_tokens": 2890925546.0,
|
| 3102 |
+
"step": 14200
|
| 3103 |
+
},
|
| 3104 |
+
{
|
| 3105 |
+
"epoch": 3.439609517630551,
|
| 3106 |
+
"grad_norm": 0.2294921875,
|
| 3107 |
+
"learning_rate": 8.74452995322167e-06,
|
| 3108 |
+
"loss": 2.0969,
|
| 3109 |
+
"mean_token_accuracy": 0.5544555878639221,
|
| 3110 |
+
"num_input_tokens_seen": 6883973409,
|
| 3111 |
+
"num_tokens": 2901007248.0,
|
| 3112 |
+
"step": 14250
|
| 3113 |
+
},
|
| 3114 |
+
{
|
| 3115 |
+
"epoch": 3.451680070009204,
|
| 3116 |
+
"grad_norm": 0.25390625,
|
| 3117 |
+
"learning_rate": 8.555907650520598e-06,
|
| 3118 |
+
"loss": 2.0987,
|
| 3119 |
+
"mean_token_accuracy": 0.5544828617200256,
|
| 3120 |
+
"num_input_tokens_seen": 6908263985,
|
| 3121 |
+
"num_tokens": 2911355124.0,
|
| 3122 |
+
"step": 14300
|
| 3123 |
+
},
|
| 3124 |
+
{
|
| 3125 |
+
"epoch": 3.463750622387857,
|
| 3126 |
+
"grad_norm": 0.271484375,
|
| 3127 |
+
"learning_rate": 8.367285347819527e-06,
|
| 3128 |
+
"loss": 2.0889,
|
| 3129 |
+
"mean_token_accuracy": 0.5557316156104207,
|
| 3130 |
+
"num_input_tokens_seen": 6932344993,
|
| 3131 |
+
"num_tokens": 2921442830.0,
|
| 3132 |
+
"step": 14350
|
| 3133 |
+
},
|
| 3134 |
+
{
|
| 3135 |
+
"epoch": 3.4758211747665104,
|
| 3136 |
+
"grad_norm": 0.255859375,
|
| 3137 |
+
"learning_rate": 8.178663045118456e-06,
|
| 3138 |
+
"loss": 2.0979,
|
| 3139 |
+
"mean_token_accuracy": 0.5547628674656153,
|
| 3140 |
+
"num_input_tokens_seen": 6956417041,
|
| 3141 |
+
"num_tokens": 2931461417.0,
|
| 3142 |
+
"step": 14400
|
| 3143 |
+
},
|
| 3144 |
+
{
|
| 3145 |
+
"epoch": 3.4878917271451635,
|
| 3146 |
+
"grad_norm": 0.234375,
|
| 3147 |
+
"learning_rate": 7.990040742417383e-06,
|
| 3148 |
+
"loss": 2.1005,
|
| 3149 |
+
"mean_token_accuracy": 0.5539160283654928,
|
| 3150 |
+
"num_input_tokens_seen": 6980421889,
|
| 3151 |
+
"num_tokens": 2941531928.0,
|
| 3152 |
+
"step": 14450
|
| 3153 |
+
},
|
| 3154 |
+
{
|
| 3155 |
+
"epoch": 3.4999622795238166,
|
| 3156 |
+
"grad_norm": 0.275390625,
|
| 3157 |
+
"learning_rate": 7.801418439716313e-06,
|
| 3158 |
+
"loss": 2.1017,
|
| 3159 |
+
"num_input_tokens_seen": 7004552193,
|
| 3160 |
+
"step": 14500
|
| 3161 |
+
},
|
| 3162 |
+
{
|
| 3163 |
+
"epoch": 3.4999622795238166,
|
| 3164 |
+
"eval_loss": 1.9681284427642822,
|
| 3165 |
+
"eval_mean_token_accuracy": 0.5785388401566912,
|
| 3166 |
+
"eval_num_tokens": 2951727207.0,
|
| 3167 |
+
"eval_runtime": 131.2276,
|
| 3168 |
+
"eval_samples_per_second": 81.629,
|
| 3169 |
+
"eval_steps_per_second": 20.407,
|
| 3170 |
+
"num_input_tokens_seen": 7004552193,
|
| 3171 |
+
"step": 14500
|
| 3172 |
+
},
|
| 3173 |
+
{
|
| 3174 |
+
"epoch": 3.51203283190247,
|
| 3175 |
+
"grad_norm": 0.267578125,
|
| 3176 |
+
"learning_rate": 7.612796137015241e-06,
|
| 3177 |
+
"loss": 2.09,
|
| 3178 |
+
"mean_token_accuracy": 0.5543626462481916,
|
| 3179 |
+
"num_input_tokens_seen": 7028775953,
|
| 3180 |
+
"num_tokens": 2961945579.0,
|
| 3181 |
+
"step": 14550
|
| 3182 |
+
},
|
| 3183 |
+
{
|
| 3184 |
+
"epoch": 3.524103384281123,
|
| 3185 |
+
"grad_norm": 0.26171875,
|
| 3186 |
+
"learning_rate": 7.42417383431417e-06,
|
| 3187 |
+
"loss": 2.0978,
|
| 3188 |
+
"mean_token_accuracy": 0.5544422981515527,
|
| 3189 |
+
"num_input_tokens_seen": 7052883457,
|
| 3190 |
+
"num_tokens": 2972173798.0,
|
| 3191 |
+
"step": 14600
|
| 3192 |
+
},
|
| 3193 |
+
{
|
| 3194 |
+
"epoch": 3.536173936659776,
|
| 3195 |
+
"grad_norm": 0.251953125,
|
| 3196 |
+
"learning_rate": 7.235551531613098e-06,
|
| 3197 |
+
"loss": 2.0915,
|
| 3198 |
+
"mean_token_accuracy": 0.5559014651551842,
|
| 3199 |
+
"num_input_tokens_seen": 7077135185,
|
| 3200 |
+
"num_tokens": 2982315453.0,
|
| 3201 |
+
"step": 14650
|
| 3202 |
+
},
|
| 3203 |
+
{
|
| 3204 |
+
"epoch": 3.5482444890384297,
|
| 3205 |
+
"grad_norm": 0.310546875,
|
| 3206 |
+
"learning_rate": 7.0469292289120274e-06,
|
| 3207 |
+
"loss": 2.0932,
|
| 3208 |
+
"mean_token_accuracy": 0.5552764968574047,
|
| 3209 |
+
"num_input_tokens_seen": 7101260305,
|
| 3210 |
+
"num_tokens": 2992557355.0,
|
| 3211 |
+
"step": 14700
|
| 3212 |
+
},
|
| 3213 |
+
{
|
| 3214 |
+
"epoch": 3.5603150414170828,
|
| 3215 |
+
"grad_norm": 0.25390625,
|
| 3216 |
+
"learning_rate": 6.858306926210955e-06,
|
| 3217 |
+
"loss": 2.0959,
|
| 3218 |
+
"mean_token_accuracy": 0.555088207796216,
|
| 3219 |
+
"num_input_tokens_seen": 7125198545,
|
| 3220 |
+
"num_tokens": 3002657117.0,
|
| 3221 |
+
"step": 14750
|
| 3222 |
+
},
|
| 3223 |
+
{
|
| 3224 |
+
"epoch": 3.572385593795736,
|
| 3225 |
+
"grad_norm": 0.2314453125,
|
| 3226 |
+
"learning_rate": 6.669684623509884e-06,
|
| 3227 |
+
"loss": 2.0933,
|
| 3228 |
+
"mean_token_accuracy": 0.5554985254630447,
|
| 3229 |
+
"num_input_tokens_seen": 7149297905,
|
| 3230 |
+
"num_tokens": 3012818977.0,
|
| 3231 |
+
"step": 14800
|
| 3232 |
+
},
|
| 3233 |
+
{
|
| 3234 |
+
"epoch": 3.5844561461743893,
|
| 3235 |
+
"grad_norm": 0.23828125,
|
| 3236 |
+
"learning_rate": 6.481062320808813e-06,
|
| 3237 |
+
"loss": 2.0901,
|
| 3238 |
+
"mean_token_accuracy": 0.5556722393259406,
|
| 3239 |
+
"num_input_tokens_seen": 7173408417,
|
| 3240 |
+
"num_tokens": 3022993500.0,
|
| 3241 |
+
"step": 14850
|
| 3242 |
+
},
|
| 3243 |
+
{
|
| 3244 |
+
"epoch": 3.5965266985530424,
|
| 3245 |
+
"grad_norm": 0.279296875,
|
| 3246 |
+
"learning_rate": 6.292440018107741e-06,
|
| 3247 |
+
"loss": 2.0862,
|
| 3248 |
+
"mean_token_accuracy": 0.5560053834319114,
|
| 3249 |
+
"num_input_tokens_seen": 7197689201,
|
| 3250 |
+
"num_tokens": 3033239485.0,
|
| 3251 |
+
"step": 14900
|
| 3252 |
+
},
|
| 3253 |
+
{
|
| 3254 |
+
"epoch": 3.608597250931696,
|
| 3255 |
+
"grad_norm": 0.265625,
|
| 3256 |
+
"learning_rate": 6.10381771540667e-06,
|
| 3257 |
+
"loss": 2.093,
|
| 3258 |
+
"mean_token_accuracy": 0.5550377672165632,
|
| 3259 |
+
"num_input_tokens_seen": 7221805553,
|
| 3260 |
+
"num_tokens": 3043356374.0,
|
| 3261 |
+
"step": 14950
|
| 3262 |
+
},
|
| 3263 |
+
{
|
| 3264 |
+
"epoch": 3.620667803310349,
|
| 3265 |
+
"grad_norm": 0.24609375,
|
| 3266 |
+
"learning_rate": 5.915195412705598e-06,
|
| 3267 |
+
"loss": 2.0994,
|
| 3268 |
+
"num_input_tokens_seen": 7245951473,
|
| 3269 |
+
"step": 15000
|
| 3270 |
+
},
|
| 3271 |
+
{
|
| 3272 |
+
"epoch": 3.620667803310349,
|
| 3273 |
+
"eval_loss": 1.9680702686309814,
|
| 3274 |
+
"eval_mean_token_accuracy": 0.5785124528710748,
|
| 3275 |
+
"eval_num_tokens": 3053564564.0,
|
| 3276 |
+
"eval_runtime": 130.6855,
|
| 3277 |
+
"eval_samples_per_second": 81.968,
|
| 3278 |
+
"eval_steps_per_second": 20.492,
|
| 3279 |
+
"num_input_tokens_seen": 7245951473,
|
| 3280 |
+
"step": 15000
|
| 3281 |
}
|
| 3282 |
],
|
| 3283 |
"logging_steps": 50,
|
| 3284 |
"max_steps": 16568,
|
| 3285 |
+
"num_input_tokens_seen": 7245951473,
|
| 3286 |
"num_train_epochs": 4,
|
| 3287 |
"save_steps": 1000,
|
| 3288 |
"stateful_callbacks": {
|
|
|
|
| 3297 |
"attributes": {}
|
| 3298 |
}
|
| 3299 |
},
|
| 3300 |
+
"total_flos": 1.9383627395138765e+18,
|
| 3301 |
"train_batch_size": 16,
|
| 3302 |
"trial_name": null,
|
| 3303 |
"trial_params": null
|