Training in progress, epoch 25, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1227009528
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebf6b113805e8d5c18f20cc3a7f743cea1ac029ed8f4448a7b46de82a6c516e9
|
| 3 |
size 1227009528
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2454133690
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7534969a274bb3fa8023a906c28ee9fb96fa28e85e22f56fbd7e7b549d41dd80
|
| 3 |
size 2454133690
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f7a8d60a2f79f9fed2cea73d23dd3dfda5f5e479acfb4213a6f2e863cb76904
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cbd4af8c4ab3cb75893cf7c4c12466d6c795077167416da697449ce4a12b474
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": 34.53865432739258,
|
| 3 |
"best_model_checkpoint": "/kaggle/working/output/checkpoint-28710",
|
| 4 |
-
"epoch":
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -2291,6 +2291,204 @@
|
|
| 2291 |
"eval_samples_per_second": 26.45,
|
| 2292 |
"eval_steps_per_second": 3.324,
|
| 2293 |
"step": 30015
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2294 |
}
|
| 2295 |
],
|
| 2296 |
"logging_steps": 100,
|
|
@@ -2305,7 +2503,7 @@
|
|
| 2305 |
"early_stopping_threshold": 0.0
|
| 2306 |
},
|
| 2307 |
"attributes": {
|
| 2308 |
-
"early_stopping_patience_counter":
|
| 2309 |
}
|
| 2310 |
},
|
| 2311 |
"TrainerControl": {
|
|
@@ -2319,7 +2517,7 @@
|
|
| 2319 |
"attributes": {}
|
| 2320 |
}
|
| 2321 |
},
|
| 2322 |
-
"total_flos": 3.
|
| 2323 |
"train_batch_size": 8,
|
| 2324 |
"trial_name": null,
|
| 2325 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": 34.53865432739258,
|
| 3 |
"best_model_checkpoint": "/kaggle/working/output/checkpoint-28710",
|
| 4 |
+
"epoch": 25.0,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 32625,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 2291 |
"eval_samples_per_second": 26.45,
|
| 2292 |
"eval_steps_per_second": 3.324,
|
| 2293 |
"step": 30015
|
| 2294 |
+
},
|
| 2295 |
+
{
|
| 2296 |
+
"epoch": 23.06513409961686,
|
| 2297 |
+
"grad_norm": 1.9898459911346436,
|
| 2298 |
+
"learning_rate": 3.559099616858237e-05,
|
| 2299 |
+
"loss": 33.2659,
|
| 2300 |
+
"step": 30100
|
| 2301 |
+
},
|
| 2302 |
+
{
|
| 2303 |
+
"epoch": 23.14176245210728,
|
| 2304 |
+
"grad_norm": 3.3541698455810547,
|
| 2305 |
+
"learning_rate": 3.554310344827587e-05,
|
| 2306 |
+
"loss": 33.4747,
|
| 2307 |
+
"step": 30200
|
| 2308 |
+
},
|
| 2309 |
+
{
|
| 2310 |
+
"epoch": 23.2183908045977,
|
| 2311 |
+
"grad_norm": 2.298229694366455,
|
| 2312 |
+
"learning_rate": 3.5495210727969354e-05,
|
| 2313 |
+
"loss": 33.8791,
|
| 2314 |
+
"step": 30300
|
| 2315 |
+
},
|
| 2316 |
+
{
|
| 2317 |
+
"epoch": 23.295019157088124,
|
| 2318 |
+
"grad_norm": 3.9336183071136475,
|
| 2319 |
+
"learning_rate": 3.544731800766284e-05,
|
| 2320 |
+
"loss": 33.8427,
|
| 2321 |
+
"step": 30400
|
| 2322 |
+
},
|
| 2323 |
+
{
|
| 2324 |
+
"epoch": 23.371647509578544,
|
| 2325 |
+
"grad_norm": 2.9286720752716064,
|
| 2326 |
+
"learning_rate": 3.539942528735633e-05,
|
| 2327 |
+
"loss": 33.9572,
|
| 2328 |
+
"step": 30500
|
| 2329 |
+
},
|
| 2330 |
+
{
|
| 2331 |
+
"epoch": 23.448275862068964,
|
| 2332 |
+
"grad_norm": 2.9716665744781494,
|
| 2333 |
+
"learning_rate": 3.5351532567049814e-05,
|
| 2334 |
+
"loss": 32.5295,
|
| 2335 |
+
"step": 30600
|
| 2336 |
+
},
|
| 2337 |
+
{
|
| 2338 |
+
"epoch": 23.52490421455939,
|
| 2339 |
+
"grad_norm": 3.5073654651641846,
|
| 2340 |
+
"learning_rate": 3.5303639846743294e-05,
|
| 2341 |
+
"loss": 33.3511,
|
| 2342 |
+
"step": 30700
|
| 2343 |
+
},
|
| 2344 |
+
{
|
| 2345 |
+
"epoch": 23.60153256704981,
|
| 2346 |
+
"grad_norm": 4.5670084953308105,
|
| 2347 |
+
"learning_rate": 3.525574712643678e-05,
|
| 2348 |
+
"loss": 33.4249,
|
| 2349 |
+
"step": 30800
|
| 2350 |
+
},
|
| 2351 |
+
{
|
| 2352 |
+
"epoch": 23.67816091954023,
|
| 2353 |
+
"grad_norm": 2.563405990600586,
|
| 2354 |
+
"learning_rate": 3.520785440613027e-05,
|
| 2355 |
+
"loss": 33.821,
|
| 2356 |
+
"step": 30900
|
| 2357 |
+
},
|
| 2358 |
+
{
|
| 2359 |
+
"epoch": 23.754789272030653,
|
| 2360 |
+
"grad_norm": 3.5928332805633545,
|
| 2361 |
+
"learning_rate": 3.5159961685823755e-05,
|
| 2362 |
+
"loss": 32.9252,
|
| 2363 |
+
"step": 31000
|
| 2364 |
+
},
|
| 2365 |
+
{
|
| 2366 |
+
"epoch": 23.831417624521073,
|
| 2367 |
+
"grad_norm": 3.2677550315856934,
|
| 2368 |
+
"learning_rate": 3.511206896551724e-05,
|
| 2369 |
+
"loss": 33.4694,
|
| 2370 |
+
"step": 31100
|
| 2371 |
+
},
|
| 2372 |
+
{
|
| 2373 |
+
"epoch": 23.908045977011493,
|
| 2374 |
+
"grad_norm": 3.8751015663146973,
|
| 2375 |
+
"learning_rate": 3.506417624521073e-05,
|
| 2376 |
+
"loss": 32.7835,
|
| 2377 |
+
"step": 31200
|
| 2378 |
+
},
|
| 2379 |
+
{
|
| 2380 |
+
"epoch": 23.984674329501917,
|
| 2381 |
+
"grad_norm": 3.955101490020752,
|
| 2382 |
+
"learning_rate": 3.5016283524904216e-05,
|
| 2383 |
+
"loss": 32.6658,
|
| 2384 |
+
"step": 31300
|
| 2385 |
+
},
|
| 2386 |
+
{
|
| 2387 |
+
"epoch": 24.0,
|
| 2388 |
+
"eval_loss": 34.550262451171875,
|
| 2389 |
+
"eval_runtime": 49.3313,
|
| 2390 |
+
"eval_samples_per_second": 26.454,
|
| 2391 |
+
"eval_steps_per_second": 3.324,
|
| 2392 |
+
"step": 31320
|
| 2393 |
+
},
|
| 2394 |
+
{
|
| 2395 |
+
"epoch": 24.061302681992338,
|
| 2396 |
+
"grad_norm": 3.885087013244629,
|
| 2397 |
+
"learning_rate": 3.49683908045977e-05,
|
| 2398 |
+
"loss": 33.5285,
|
| 2399 |
+
"step": 31400
|
| 2400 |
+
},
|
| 2401 |
+
{
|
| 2402 |
+
"epoch": 24.137931034482758,
|
| 2403 |
+
"grad_norm": 8.908398628234863,
|
| 2404 |
+
"learning_rate": 3.4920977011494254e-05,
|
| 2405 |
+
"loss": 33.1673,
|
| 2406 |
+
"step": 31500
|
| 2407 |
+
},
|
| 2408 |
+
{
|
| 2409 |
+
"epoch": 24.21455938697318,
|
| 2410 |
+
"grad_norm": 4.042150974273682,
|
| 2411 |
+
"learning_rate": 3.487308429118774e-05,
|
| 2412 |
+
"loss": 33.0384,
|
| 2413 |
+
"step": 31600
|
| 2414 |
+
},
|
| 2415 |
+
{
|
| 2416 |
+
"epoch": 24.291187739463602,
|
| 2417 |
+
"grad_norm": 4.992551803588867,
|
| 2418 |
+
"learning_rate": 3.482519157088123e-05,
|
| 2419 |
+
"loss": 33.7439,
|
| 2420 |
+
"step": 31700
|
| 2421 |
+
},
|
| 2422 |
+
{
|
| 2423 |
+
"epoch": 24.367816091954023,
|
| 2424 |
+
"grad_norm": 5.118918418884277,
|
| 2425 |
+
"learning_rate": 3.4777298850574715e-05,
|
| 2426 |
+
"loss": 33.5604,
|
| 2427 |
+
"step": 31800
|
| 2428 |
+
},
|
| 2429 |
+
{
|
| 2430 |
+
"epoch": 24.444444444444443,
|
| 2431 |
+
"grad_norm": 3.2756083011627197,
|
| 2432 |
+
"learning_rate": 3.47294061302682e-05,
|
| 2433 |
+
"loss": 33.6225,
|
| 2434 |
+
"step": 31900
|
| 2435 |
+
},
|
| 2436 |
+
{
|
| 2437 |
+
"epoch": 24.521072796934867,
|
| 2438 |
+
"grad_norm": 2.9864351749420166,
|
| 2439 |
+
"learning_rate": 3.468151340996169e-05,
|
| 2440 |
+
"loss": 34.0539,
|
| 2441 |
+
"step": 32000
|
| 2442 |
+
},
|
| 2443 |
+
{
|
| 2444 |
+
"epoch": 24.597701149425287,
|
| 2445 |
+
"grad_norm": 2.945171356201172,
|
| 2446 |
+
"learning_rate": 3.463362068965517e-05,
|
| 2447 |
+
"loss": 33.2655,
|
| 2448 |
+
"step": 32100
|
| 2449 |
+
},
|
| 2450 |
+
{
|
| 2451 |
+
"epoch": 24.674329501915707,
|
| 2452 |
+
"grad_norm": 4.09877347946167,
|
| 2453 |
+
"learning_rate": 3.4585727969348656e-05,
|
| 2454 |
+
"loss": 33.239,
|
| 2455 |
+
"step": 32200
|
| 2456 |
+
},
|
| 2457 |
+
{
|
| 2458 |
+
"epoch": 24.75095785440613,
|
| 2459 |
+
"grad_norm": 3.7949306964874268,
|
| 2460 |
+
"learning_rate": 3.453783524904215e-05,
|
| 2461 |
+
"loss": 32.7246,
|
| 2462 |
+
"step": 32300
|
| 2463 |
+
},
|
| 2464 |
+
{
|
| 2465 |
+
"epoch": 24.82758620689655,
|
| 2466 |
+
"grad_norm": 3.8750340938568115,
|
| 2467 |
+
"learning_rate": 3.4489942528735636e-05,
|
| 2468 |
+
"loss": 32.5477,
|
| 2469 |
+
"step": 32400
|
| 2470 |
+
},
|
| 2471 |
+
{
|
| 2472 |
+
"epoch": 24.904214559386972,
|
| 2473 |
+
"grad_norm": 3.84676456451416,
|
| 2474 |
+
"learning_rate": 3.444204980842912e-05,
|
| 2475 |
+
"loss": 33.5781,
|
| 2476 |
+
"step": 32500
|
| 2477 |
+
},
|
| 2478 |
+
{
|
| 2479 |
+
"epoch": 24.980842911877396,
|
| 2480 |
+
"grad_norm": 2.3316519260406494,
|
| 2481 |
+
"learning_rate": 3.439415708812261e-05,
|
| 2482 |
+
"loss": 33.0241,
|
| 2483 |
+
"step": 32600
|
| 2484 |
+
},
|
| 2485 |
+
{
|
| 2486 |
+
"epoch": 25.0,
|
| 2487 |
+
"eval_loss": 34.565101623535156,
|
| 2488 |
+
"eval_runtime": 49.343,
|
| 2489 |
+
"eval_samples_per_second": 26.448,
|
| 2490 |
+
"eval_steps_per_second": 3.324,
|
| 2491 |
+
"step": 32625
|
| 2492 |
}
|
| 2493 |
],
|
| 2494 |
"logging_steps": 100,
|
|
|
|
| 2503 |
"early_stopping_threshold": 0.0
|
| 2504 |
},
|
| 2505 |
"attributes": {
|
| 2506 |
+
"early_stopping_patience_counter": 3
|
| 2507 |
}
|
| 2508 |
},
|
| 2509 |
"TrainerControl": {
|
|
|
|
| 2517 |
"attributes": {}
|
| 2518 |
}
|
| 2519 |
},
|
| 2520 |
+
"total_flos": 3.51827725761792e+16,
|
| 2521 |
"train_batch_size": 8,
|
| 2522 |
"trial_name": null,
|
| 2523 |
"trial_params": null
|