Training in progress, step 3750, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 527048968
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0f2359775ec058105a768a27e0aec2fd7b09c0fef450becc3fea6a2140d5551
|
| 3 |
size 527048968
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1054135994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22b0e07d88b69f37af8463bb1ac2f6ff8e912db26c18c2ee123c3a1948596d38
|
| 3 |
size 1054135994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da0e93581e91c352d5ee493f505f8757c94a31fb5b16f71a9d85577535431525
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30b7c8be324c8b4289d82c59d6cbd2a46df58415895691106518590654dd09ba
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": 0.7166205048561096,
|
| 3 |
"best_model_checkpoint": "./output/checkpoint-450",
|
| 4 |
-
"epoch":
|
| 5 |
"eval_steps": 150,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -3197,6 +3197,441 @@
|
|
| 3197 |
"EMA_steps_per_second": 25.834,
|
| 3198 |
"epoch": 143.47826086956522,
|
| 3199 |
"step": 3300
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3200 |
}
|
| 3201 |
],
|
| 3202 |
"logging_steps": 10,
|
|
@@ -3216,7 +3651,7 @@
|
|
| 3216 |
"attributes": {}
|
| 3217 |
}
|
| 3218 |
},
|
| 3219 |
-
"total_flos":
|
| 3220 |
"train_batch_size": 4,
|
| 3221 |
"trial_name": null,
|
| 3222 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": 0.7166205048561096,
|
| 3 |
"best_model_checkpoint": "./output/checkpoint-450",
|
| 4 |
+
"epoch": 163.04347826086956,
|
| 5 |
"eval_steps": 150,
|
| 6 |
+
"global_step": 3750,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 3197 |
"EMA_steps_per_second": 25.834,
|
| 3198 |
"epoch": 143.47826086956522,
|
| 3199 |
"step": 3300
|
| 3200 |
+
},
|
| 3201 |
+
{
|
| 3202 |
+
"epoch": 143.91304347826087,
|
| 3203 |
+
"grad_norm": 1.9049878120422363,
|
| 3204 |
+
"learning_rate": 3.9382995689756636e-06,
|
| 3205 |
+
"loss": 0.2537,
|
| 3206 |
+
"step": 3310
|
| 3207 |
+
},
|
| 3208 |
+
{
|
| 3209 |
+
"epoch": 144.34782608695653,
|
| 3210 |
+
"grad_norm": 1.5125168561935425,
|
| 3211 |
+
"learning_rate": 3.9382593694923146e-06,
|
| 3212 |
+
"loss": 0.2142,
|
| 3213 |
+
"step": 3320
|
| 3214 |
+
},
|
| 3215 |
+
{
|
| 3216 |
+
"epoch": 144.7826086956522,
|
| 3217 |
+
"grad_norm": 1.737127661705017,
|
| 3218 |
+
"learning_rate": 3.938218389718042e-06,
|
| 3219 |
+
"loss": 0.2706,
|
| 3220 |
+
"step": 3330
|
| 3221 |
+
},
|
| 3222 |
+
{
|
| 3223 |
+
"epoch": 145.2173913043478,
|
| 3224 |
+
"grad_norm": 2.886361837387085,
|
| 3225 |
+
"learning_rate": 3.938176629669088e-06,
|
| 3226 |
+
"loss": 0.2079,
|
| 3227 |
+
"step": 3340
|
| 3228 |
+
},
|
| 3229 |
+
{
|
| 3230 |
+
"epoch": 145.65217391304347,
|
| 3231 |
+
"grad_norm": 1.8378046751022339,
|
| 3232 |
+
"learning_rate": 3.938134089362005e-06,
|
| 3233 |
+
"loss": 0.2378,
|
| 3234 |
+
"step": 3350
|
| 3235 |
+
},
|
| 3236 |
+
{
|
| 3237 |
+
"epoch": 146.08695652173913,
|
| 3238 |
+
"grad_norm": 1.9865158796310425,
|
| 3239 |
+
"learning_rate": 3.938090768813655e-06,
|
| 3240 |
+
"loss": 0.2649,
|
| 3241 |
+
"step": 3360
|
| 3242 |
+
},
|
| 3243 |
+
{
|
| 3244 |
+
"epoch": 146.52173913043478,
|
| 3245 |
+
"grad_norm": 1.9572851657867432,
|
| 3246 |
+
"learning_rate": 3.938046668041207e-06,
|
| 3247 |
+
"loss": 0.2268,
|
| 3248 |
+
"step": 3370
|
| 3249 |
+
},
|
| 3250 |
+
{
|
| 3251 |
+
"epoch": 146.95652173913044,
|
| 3252 |
+
"grad_norm": 1.6120030879974365,
|
| 3253 |
+
"learning_rate": 3.9380017870621435e-06,
|
| 3254 |
+
"loss": 0.2154,
|
| 3255 |
+
"step": 3380
|
| 3256 |
+
},
|
| 3257 |
+
{
|
| 3258 |
+
"epoch": 147.3913043478261,
|
| 3259 |
+
"grad_norm": 1.6852221488952637,
|
| 3260 |
+
"learning_rate": 3.9379561258942536e-06,
|
| 3261 |
+
"loss": 0.2284,
|
| 3262 |
+
"step": 3390
|
| 3263 |
+
},
|
| 3264 |
+
{
|
| 3265 |
+
"epoch": 147.82608695652175,
|
| 3266 |
+
"grad_norm": 1.568108081817627,
|
| 3267 |
+
"learning_rate": 3.937909684555634e-06,
|
| 3268 |
+
"loss": 0.2535,
|
| 3269 |
+
"step": 3400
|
| 3270 |
+
},
|
| 3271 |
+
{
|
| 3272 |
+
"epoch": 148.2608695652174,
|
| 3273 |
+
"grad_norm": 1.4495244026184082,
|
| 3274 |
+
"learning_rate": 3.937862463064695e-06,
|
| 3275 |
+
"loss": 0.2152,
|
| 3276 |
+
"step": 3410
|
| 3277 |
+
},
|
| 3278 |
+
{
|
| 3279 |
+
"epoch": 148.69565217391303,
|
| 3280 |
+
"grad_norm": 1.8378851413726807,
|
| 3281 |
+
"learning_rate": 3.937814461440151e-06,
|
| 3282 |
+
"loss": 0.2494,
|
| 3283 |
+
"step": 3420
|
| 3284 |
+
},
|
| 3285 |
+
{
|
| 3286 |
+
"epoch": 149.1304347826087,
|
| 3287 |
+
"grad_norm": 1.866101622581482,
|
| 3288 |
+
"learning_rate": 3.937765679701031e-06,
|
| 3289 |
+
"loss": 0.2711,
|
| 3290 |
+
"step": 3430
|
| 3291 |
+
},
|
| 3292 |
+
{
|
| 3293 |
+
"epoch": 149.56521739130434,
|
| 3294 |
+
"grad_norm": 2.2176806926727295,
|
| 3295 |
+
"learning_rate": 3.937716117866669e-06,
|
| 3296 |
+
"loss": 0.2648,
|
| 3297 |
+
"step": 3440
|
| 3298 |
+
},
|
| 3299 |
+
{
|
| 3300 |
+
"epoch": 150.0,
|
| 3301 |
+
"grad_norm": 3.262206792831421,
|
| 3302 |
+
"learning_rate": 3.93766577595671e-06,
|
| 3303 |
+
"loss": 0.2203,
|
| 3304 |
+
"step": 3450
|
| 3305 |
+
},
|
| 3306 |
+
{
|
| 3307 |
+
"epoch": 150.0,
|
| 3308 |
+
"eval_loss": 0.9511697888374329,
|
| 3309 |
+
"eval_runtime": 0.4583,
|
| 3310 |
+
"eval_samples_per_second": 21.821,
|
| 3311 |
+
"eval_steps_per_second": 21.821,
|
| 3312 |
+
"step": 3450
|
| 3313 |
+
},
|
| 3314 |
+
{
|
| 3315 |
+
"Start_State_loss": 0.8609819412231445,
|
| 3316 |
+
"Start_State_runtime": 0.5187,
|
| 3317 |
+
"Start_State_samples_per_second": 19.279,
|
| 3318 |
+
"Start_State_steps_per_second": 19.279,
|
| 3319 |
+
"epoch": 150.0,
|
| 3320 |
+
"step": 3450
|
| 3321 |
+
},
|
| 3322 |
+
{
|
| 3323 |
+
"Raw_Model_loss": 0.9511697888374329,
|
| 3324 |
+
"Raw_Model_runtime": 0.5075,
|
| 3325 |
+
"Raw_Model_samples_per_second": 19.705,
|
| 3326 |
+
"Raw_Model_steps_per_second": 19.705,
|
| 3327 |
+
"epoch": 150.0,
|
| 3328 |
+
"step": 3450
|
| 3329 |
+
},
|
| 3330 |
+
{
|
| 3331 |
+
"SWA_loss": 0.7842515707015991,
|
| 3332 |
+
"SWA_runtime": 0.4974,
|
| 3333 |
+
"SWA_samples_per_second": 20.104,
|
| 3334 |
+
"SWA_steps_per_second": 20.104,
|
| 3335 |
+
"epoch": 150.0,
|
| 3336 |
+
"step": 3450
|
| 3337 |
+
},
|
| 3338 |
+
{
|
| 3339 |
+
"EMA_loss": 0.8596795201301575,
|
| 3340 |
+
"EMA_runtime": 0.437,
|
| 3341 |
+
"EMA_samples_per_second": 22.881,
|
| 3342 |
+
"EMA_steps_per_second": 22.881,
|
| 3343 |
+
"epoch": 150.0,
|
| 3344 |
+
"step": 3450
|
| 3345 |
+
},
|
| 3346 |
+
{
|
| 3347 |
+
"epoch": 150.43478260869566,
|
| 3348 |
+
"grad_norm": 1.7477214336395264,
|
| 3349 |
+
"learning_rate": 2.5260336320414934e-07,
|
| 3350 |
+
"loss": 0.2137,
|
| 3351 |
+
"step": 3460
|
| 3352 |
+
},
|
| 3353 |
+
{
|
| 3354 |
+
"epoch": 150.8695652173913,
|
| 3355 |
+
"grad_norm": 1.9981499910354614,
|
| 3356 |
+
"learning_rate": 5.052067264082987e-07,
|
| 3357 |
+
"loss": 0.262,
|
| 3358 |
+
"step": 3470
|
| 3359 |
+
},
|
| 3360 |
+
{
|
| 3361 |
+
"epoch": 151.30434782608697,
|
| 3362 |
+
"grad_norm": 1.6229016780853271,
|
| 3363 |
+
"learning_rate": 7.57810089612448e-07,
|
| 3364 |
+
"loss": 0.1996,
|
| 3365 |
+
"step": 3480
|
| 3366 |
+
},
|
| 3367 |
+
{
|
| 3368 |
+
"epoch": 151.7391304347826,
|
| 3369 |
+
"grad_norm": 2.360182046890259,
|
| 3370 |
+
"learning_rate": 1.0104134528165973e-06,
|
| 3371 |
+
"loss": 0.2474,
|
| 3372 |
+
"step": 3490
|
| 3373 |
+
},
|
| 3374 |
+
{
|
| 3375 |
+
"epoch": 152.17391304347825,
|
| 3376 |
+
"grad_norm": 2.097730875015259,
|
| 3377 |
+
"learning_rate": 1.2630168160207466e-06,
|
| 3378 |
+
"loss": 0.2421,
|
| 3379 |
+
"step": 3500
|
| 3380 |
+
},
|
| 3381 |
+
{
|
| 3382 |
+
"epoch": 152.6086956521739,
|
| 3383 |
+
"grad_norm": 1.616011381149292,
|
| 3384 |
+
"learning_rate": 1.515620179224896e-06,
|
| 3385 |
+
"loss": 0.2398,
|
| 3386 |
+
"step": 3510
|
| 3387 |
+
},
|
| 3388 |
+
{
|
| 3389 |
+
"epoch": 153.04347826086956,
|
| 3390 |
+
"grad_norm": 1.5673476457595825,
|
| 3391 |
+
"learning_rate": 1.7682235424290452e-06,
|
| 3392 |
+
"loss": 0.2065,
|
| 3393 |
+
"step": 3520
|
| 3394 |
+
},
|
| 3395 |
+
{
|
| 3396 |
+
"epoch": 153.47826086956522,
|
| 3397 |
+
"grad_norm": 2.3053834438323975,
|
| 3398 |
+
"learning_rate": 2.0208269056331947e-06,
|
| 3399 |
+
"loss": 0.2502,
|
| 3400 |
+
"step": 3530
|
| 3401 |
+
},
|
| 3402 |
+
{
|
| 3403 |
+
"epoch": 153.91304347826087,
|
| 3404 |
+
"grad_norm": 2.665015697479248,
|
| 3405 |
+
"learning_rate": 2.273430268837344e-06,
|
| 3406 |
+
"loss": 0.2317,
|
| 3407 |
+
"step": 3540
|
| 3408 |
+
},
|
| 3409 |
+
{
|
| 3410 |
+
"epoch": 154.34782608695653,
|
| 3411 |
+
"grad_norm": 2.2935352325439453,
|
| 3412 |
+
"learning_rate": 2.5260336320414932e-06,
|
| 3413 |
+
"loss": 0.2402,
|
| 3414 |
+
"step": 3550
|
| 3415 |
+
},
|
| 3416 |
+
{
|
| 3417 |
+
"epoch": 154.7826086956522,
|
| 3418 |
+
"grad_norm": 2.005519151687622,
|
| 3419 |
+
"learning_rate": 2.5260333817317373e-06,
|
| 3420 |
+
"loss": 0.2341,
|
| 3421 |
+
"step": 3560
|
| 3422 |
+
},
|
| 3423 |
+
{
|
| 3424 |
+
"epoch": 155.2173913043478,
|
| 3425 |
+
"grad_norm": 1.6518237590789795,
|
| 3426 |
+
"learning_rate": 2.5260326308025684e-06,
|
| 3427 |
+
"loss": 0.1959,
|
| 3428 |
+
"step": 3570
|
| 3429 |
+
},
|
| 3430 |
+
{
|
| 3431 |
+
"epoch": 155.65217391304347,
|
| 3432 |
+
"grad_norm": 2.093646287918091,
|
| 3433 |
+
"learning_rate": 2.526031379254284e-06,
|
| 3434 |
+
"loss": 0.2666,
|
| 3435 |
+
"step": 3580
|
| 3436 |
+
},
|
| 3437 |
+
{
|
| 3438 |
+
"epoch": 156.08695652173913,
|
| 3439 |
+
"grad_norm": 1.6480534076690674,
|
| 3440 |
+
"learning_rate": 2.5260296270873804e-06,
|
| 3441 |
+
"loss": 0.203,
|
| 3442 |
+
"step": 3590
|
| 3443 |
+
},
|
| 3444 |
+
{
|
| 3445 |
+
"epoch": 156.52173913043478,
|
| 3446 |
+
"grad_norm": 2.494234323501587,
|
| 3447 |
+
"learning_rate": 2.5260273743025526e-06,
|
| 3448 |
+
"loss": 0.2677,
|
| 3449 |
+
"step": 3600
|
| 3450 |
+
},
|
| 3451 |
+
{
|
| 3452 |
+
"epoch": 156.52173913043478,
|
| 3453 |
+
"eval_loss": 0.9589301347732544,
|
| 3454 |
+
"eval_runtime": 0.5211,
|
| 3455 |
+
"eval_samples_per_second": 19.192,
|
| 3456 |
+
"eval_steps_per_second": 19.192,
|
| 3457 |
+
"step": 3600
|
| 3458 |
+
},
|
| 3459 |
+
{
|
| 3460 |
+
"Start_State_loss": 0.8609819412231445,
|
| 3461 |
+
"Start_State_runtime": 0.5189,
|
| 3462 |
+
"Start_State_samples_per_second": 19.273,
|
| 3463 |
+
"Start_State_steps_per_second": 19.273,
|
| 3464 |
+
"epoch": 156.52173913043478,
|
| 3465 |
+
"step": 3600
|
| 3466 |
+
},
|
| 3467 |
+
{
|
| 3468 |
+
"Raw_Model_loss": 0.9589301347732544,
|
| 3469 |
+
"Raw_Model_runtime": 0.4778,
|
| 3470 |
+
"Raw_Model_samples_per_second": 20.928,
|
| 3471 |
+
"Raw_Model_steps_per_second": 20.928,
|
| 3472 |
+
"epoch": 156.52173913043478,
|
| 3473 |
+
"step": 3600
|
| 3474 |
+
},
|
| 3475 |
+
{
|
| 3476 |
+
"SWA_loss": 0.7895848155021667,
|
| 3477 |
+
"SWA_runtime": 0.4495,
|
| 3478 |
+
"SWA_samples_per_second": 22.249,
|
| 3479 |
+
"SWA_steps_per_second": 22.249,
|
| 3480 |
+
"epoch": 156.52173913043478,
|
| 3481 |
+
"step": 3600
|
| 3482 |
+
},
|
| 3483 |
+
{
|
| 3484 |
+
"EMA_loss": 0.8595443964004517,
|
| 3485 |
+
"EMA_runtime": 0.4523,
|
| 3486 |
+
"EMA_samples_per_second": 22.111,
|
| 3487 |
+
"EMA_steps_per_second": 22.111,
|
| 3488 |
+
"epoch": 156.52173913043478,
|
| 3489 |
+
"step": 3600
|
| 3490 |
+
},
|
| 3491 |
+
{
|
| 3492 |
+
"epoch": 156.95652173913044,
|
| 3493 |
+
"grad_norm": 2.5807197093963623,
|
| 3494 |
+
"learning_rate": 2.526024620900692e-06,
|
| 3495 |
+
"loss": 0.1972,
|
| 3496 |
+
"step": 3610
|
| 3497 |
+
},
|
| 3498 |
+
{
|
| 3499 |
+
"epoch": 157.3913043478261,
|
| 3500 |
+
"grad_norm": 1.868238091468811,
|
| 3501 |
+
"learning_rate": 2.526021366882892e-06,
|
| 3502 |
+
"loss": 0.2551,
|
| 3503 |
+
"step": 3620
|
| 3504 |
+
},
|
| 3505 |
+
{
|
| 3506 |
+
"epoch": 157.82608695652175,
|
| 3507 |
+
"grad_norm": 1.9588141441345215,
|
| 3508 |
+
"learning_rate": 2.526017612250441e-06,
|
| 3509 |
+
"loss": 0.2346,
|
| 3510 |
+
"step": 3630
|
| 3511 |
+
},
|
| 3512 |
+
{
|
| 3513 |
+
"epoch": 158.2608695652174,
|
| 3514 |
+
"grad_norm": 2.2502245903015137,
|
| 3515 |
+
"learning_rate": 2.5260133570048273e-06,
|
| 3516 |
+
"loss": 0.2246,
|
| 3517 |
+
"step": 3640
|
| 3518 |
+
},
|
| 3519 |
+
{
|
| 3520 |
+
"epoch": 158.69565217391303,
|
| 3521 |
+
"grad_norm": 3.502547025680542,
|
| 3522 |
+
"learning_rate": 2.526008601147738e-06,
|
| 3523 |
+
"loss": 0.2274,
|
| 3524 |
+
"step": 3650
|
| 3525 |
+
},
|
| 3526 |
+
{
|
| 3527 |
+
"epoch": 159.1304347826087,
|
| 3528 |
+
"grad_norm": 2.578259229660034,
|
| 3529 |
+
"learning_rate": 2.526003344681058e-06,
|
| 3530 |
+
"loss": 0.2527,
|
| 3531 |
+
"step": 3660
|
| 3532 |
+
},
|
| 3533 |
+
{
|
| 3534 |
+
"epoch": 159.56521739130434,
|
| 3535 |
+
"grad_norm": 1.8786590099334717,
|
| 3536 |
+
"learning_rate": 2.5259975876068714e-06,
|
| 3537 |
+
"loss": 0.2368,
|
| 3538 |
+
"step": 3670
|
| 3539 |
+
},
|
| 3540 |
+
{
|
| 3541 |
+
"epoch": 160.0,
|
| 3542 |
+
"grad_norm": 2.6274077892303467,
|
| 3543 |
+
"learning_rate": 2.525991329927459e-06,
|
| 3544 |
+
"loss": 0.2028,
|
| 3545 |
+
"step": 3680
|
| 3546 |
+
},
|
| 3547 |
+
{
|
| 3548 |
+
"epoch": 160.43478260869566,
|
| 3549 |
+
"grad_norm": 2.002021551132202,
|
| 3550 |
+
"learning_rate": 2.5259845716453015e-06,
|
| 3551 |
+
"loss": 0.2279,
|
| 3552 |
+
"step": 3690
|
| 3553 |
+
},
|
| 3554 |
+
{
|
| 3555 |
+
"epoch": 160.8695652173913,
|
| 3556 |
+
"grad_norm": 1.5384571552276611,
|
| 3557 |
+
"learning_rate": 2.525977312763078e-06,
|
| 3558 |
+
"loss": 0.2436,
|
| 3559 |
+
"step": 3700
|
| 3560 |
+
},
|
| 3561 |
+
{
|
| 3562 |
+
"epoch": 161.30434782608697,
|
| 3563 |
+
"grad_norm": 1.5973771810531616,
|
| 3564 |
+
"learning_rate": 2.5259695532836654e-06,
|
| 3565 |
+
"loss": 0.2762,
|
| 3566 |
+
"step": 3710
|
| 3567 |
+
},
|
| 3568 |
+
{
|
| 3569 |
+
"epoch": 161.7391304347826,
|
| 3570 |
+
"grad_norm": 2.0545079708099365,
|
| 3571 |
+
"learning_rate": 2.52596129321014e-06,
|
| 3572 |
+
"loss": 0.2278,
|
| 3573 |
+
"step": 3720
|
| 3574 |
+
},
|
| 3575 |
+
{
|
| 3576 |
+
"epoch": 162.17391304347825,
|
| 3577 |
+
"grad_norm": 2.2520041465759277,
|
| 3578 |
+
"learning_rate": 2.525952532545775e-06,
|
| 3579 |
+
"loss": 0.2153,
|
| 3580 |
+
"step": 3730
|
| 3581 |
+
},
|
| 3582 |
+
{
|
| 3583 |
+
"epoch": 162.6086956521739,
|
| 3584 |
+
"grad_norm": 2.081439971923828,
|
| 3585 |
+
"learning_rate": 2.5259432712940426e-06,
|
| 3586 |
+
"loss": 0.2231,
|
| 3587 |
+
"step": 3740
|
| 3588 |
+
},
|
| 3589 |
+
{
|
| 3590 |
+
"epoch": 163.04347826086956,
|
| 3591 |
+
"grad_norm": 2.37165904045105,
|
| 3592 |
+
"learning_rate": 2.5259335094586143e-06,
|
| 3593 |
+
"loss": 0.2235,
|
| 3594 |
+
"step": 3750
|
| 3595 |
+
},
|
| 3596 |
+
{
|
| 3597 |
+
"epoch": 163.04347826086956,
|
| 3598 |
+
"eval_loss": 0.9614953994750977,
|
| 3599 |
+
"eval_runtime": 0.407,
|
| 3600 |
+
"eval_samples_per_second": 24.568,
|
| 3601 |
+
"eval_steps_per_second": 24.568,
|
| 3602 |
+
"step": 3750
|
| 3603 |
+
},
|
| 3604 |
+
{
|
| 3605 |
+
"Start_State_loss": 0.8609819412231445,
|
| 3606 |
+
"Start_State_runtime": 0.3898,
|
| 3607 |
+
"Start_State_samples_per_second": 25.657,
|
| 3608 |
+
"Start_State_steps_per_second": 25.657,
|
| 3609 |
+
"epoch": 163.04347826086956,
|
| 3610 |
+
"step": 3750
|
| 3611 |
+
},
|
| 3612 |
+
{
|
| 3613 |
+
"Raw_Model_loss": 0.9614953994750977,
|
| 3614 |
+
"Raw_Model_runtime": 0.386,
|
| 3615 |
+
"Raw_Model_samples_per_second": 25.905,
|
| 3616 |
+
"Raw_Model_steps_per_second": 25.905,
|
| 3617 |
+
"epoch": 163.04347826086956,
|
| 3618 |
+
"step": 3750
|
| 3619 |
+
},
|
| 3620 |
+
{
|
| 3621 |
+
"SWA_loss": 0.7928785681724548,
|
| 3622 |
+
"SWA_runtime": 0.3893,
|
| 3623 |
+
"SWA_samples_per_second": 25.686,
|
| 3624 |
+
"SWA_steps_per_second": 25.686,
|
| 3625 |
+
"epoch": 163.04347826086956,
|
| 3626 |
+
"step": 3750
|
| 3627 |
+
},
|
| 3628 |
+
{
|
| 3629 |
+
"EMA_loss": 0.860231876373291,
|
| 3630 |
+
"EMA_runtime": 0.386,
|
| 3631 |
+
"EMA_samples_per_second": 25.904,
|
| 3632 |
+
"EMA_steps_per_second": 25.904,
|
| 3633 |
+
"epoch": 163.04347826086956,
|
| 3634 |
+
"step": 3750
|
| 3635 |
}
|
| 3636 |
],
|
| 3637 |
"logging_steps": 10,
|
|
|
|
| 3651 |
"attributes": {}
|
| 3652 |
}
|
| 3653 |
},
|
| 3654 |
+
"total_flos": 9.668631592798618e+16,
|
| 3655 |
"train_batch_size": 4,
|
| 3656 |
"trial_name": null,
|
| 3657 |
"trial_params": null
|