Training in progress, step 3750, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3809184360
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb01dfc1c2e8ff7808bbee052468667857cd0cc81291988921d045c2d7906b09
|
| 3 |
size 3809184360
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2458291491
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21ad5221e1f7ef28d1eefb19cd0fee3fef5ece52f6f27106c14b5c6e4c86bc91
|
| 3 |
size 2458291491
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fdbd0a64c232d94ea2546e029d229e872e7990a092a5c5c86566d2492a53f3b
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64a6a4e6bbe43acf16ab43cab9cd60b8f70df5b08ce4b3f2a9b7397dc1ce58b0
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2458,6 +2458,181 @@
|
|
| 2458 |
"learning_rate": 3.436657681940701e-06,
|
| 2459 |
"loss": 0.1441,
|
| 2460 |
"step": 3500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2461 |
}
|
| 2462 |
],
|
| 2463 |
"logging_steps": 10,
|
|
@@ -2472,12 +2647,12 @@
|
|
| 2472 |
"should_evaluate": false,
|
| 2473 |
"should_log": false,
|
| 2474 |
"should_save": true,
|
| 2475 |
-
"should_training_stop":
|
| 2476 |
},
|
| 2477 |
"attributes": {}
|
| 2478 |
}
|
| 2479 |
},
|
| 2480 |
-
"total_flos": 6.
|
| 2481 |
"train_batch_size": 2,
|
| 2482 |
"trial_name": null,
|
| 2483 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 3750,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2458 |
"learning_rate": 3.436657681940701e-06,
|
| 2459 |
"loss": 0.1441,
|
| 2460 |
"step": 3500
|
| 2461 |
+
},
|
| 2462 |
+
{
|
| 2463 |
+
"epoch": 0.936,
|
| 2464 |
+
"grad_norm": 9.182714462280273,
|
| 2465 |
+
"learning_rate": 3.30188679245283e-06,
|
| 2466 |
+
"loss": 0.2332,
|
| 2467 |
+
"step": 3510
|
| 2468 |
+
},
|
| 2469 |
+
{
|
| 2470 |
+
"epoch": 0.9386666666666666,
|
| 2471 |
+
"grad_norm": 6.365874290466309,
|
| 2472 |
+
"learning_rate": 3.1671159029649594e-06,
|
| 2473 |
+
"loss": 0.1443,
|
| 2474 |
+
"step": 3520
|
| 2475 |
+
},
|
| 2476 |
+
{
|
| 2477 |
+
"epoch": 0.9413333333333334,
|
| 2478 |
+
"grad_norm": 6.266571521759033,
|
| 2479 |
+
"learning_rate": 3.032345013477089e-06,
|
| 2480 |
+
"loss": 0.1959,
|
| 2481 |
+
"step": 3530
|
| 2482 |
+
},
|
| 2483 |
+
{
|
| 2484 |
+
"epoch": 0.944,
|
| 2485 |
+
"grad_norm": 7.494802474975586,
|
| 2486 |
+
"learning_rate": 2.8975741239892183e-06,
|
| 2487 |
+
"loss": 0.149,
|
| 2488 |
+
"step": 3540
|
| 2489 |
+
},
|
| 2490 |
+
{
|
| 2491 |
+
"epoch": 0.9466666666666667,
|
| 2492 |
+
"grad_norm": 5.22160005569458,
|
| 2493 |
+
"learning_rate": 2.762803234501348e-06,
|
| 2494 |
+
"loss": 0.3431,
|
| 2495 |
+
"step": 3550
|
| 2496 |
+
},
|
| 2497 |
+
{
|
| 2498 |
+
"epoch": 0.9493333333333334,
|
| 2499 |
+
"grad_norm": 11.847735404968262,
|
| 2500 |
+
"learning_rate": 2.628032345013477e-06,
|
| 2501 |
+
"loss": 0.2068,
|
| 2502 |
+
"step": 3560
|
| 2503 |
+
},
|
| 2504 |
+
{
|
| 2505 |
+
"epoch": 0.952,
|
| 2506 |
+
"grad_norm": 41.45210647583008,
|
| 2507 |
+
"learning_rate": 2.4932614555256068e-06,
|
| 2508 |
+
"loss": 0.2057,
|
| 2509 |
+
"step": 3570
|
| 2510 |
+
},
|
| 2511 |
+
{
|
| 2512 |
+
"epoch": 0.9546666666666667,
|
| 2513 |
+
"grad_norm": 8.89501953125,
|
| 2514 |
+
"learning_rate": 2.358490566037736e-06,
|
| 2515 |
+
"loss": 0.5128,
|
| 2516 |
+
"step": 3580
|
| 2517 |
+
},
|
| 2518 |
+
{
|
| 2519 |
+
"epoch": 0.9573333333333334,
|
| 2520 |
+
"grad_norm": 6.3149261474609375,
|
| 2521 |
+
"learning_rate": 2.223719676549865e-06,
|
| 2522 |
+
"loss": 0.1869,
|
| 2523 |
+
"step": 3590
|
| 2524 |
+
},
|
| 2525 |
+
{
|
| 2526 |
+
"epoch": 0.96,
|
| 2527 |
+
"grad_norm": 5.511444091796875,
|
| 2528 |
+
"learning_rate": 2.088948787061995e-06,
|
| 2529 |
+
"loss": 0.2311,
|
| 2530 |
+
"step": 3600
|
| 2531 |
+
},
|
| 2532 |
+
{
|
| 2533 |
+
"epoch": 0.9626666666666667,
|
| 2534 |
+
"grad_norm": 6.782158851623535,
|
| 2535 |
+
"learning_rate": 1.954177897574124e-06,
|
| 2536 |
+
"loss": 0.1655,
|
| 2537 |
+
"step": 3610
|
| 2538 |
+
},
|
| 2539 |
+
{
|
| 2540 |
+
"epoch": 0.9653333333333334,
|
| 2541 |
+
"grad_norm": 6.828353404998779,
|
| 2542 |
+
"learning_rate": 1.8194070080862537e-06,
|
| 2543 |
+
"loss": 0.1694,
|
| 2544 |
+
"step": 3620
|
| 2545 |
+
},
|
| 2546 |
+
{
|
| 2547 |
+
"epoch": 0.968,
|
| 2548 |
+
"grad_norm": 2.4872541427612305,
|
| 2549 |
+
"learning_rate": 1.6846361185983827e-06,
|
| 2550 |
+
"loss": 0.1647,
|
| 2551 |
+
"step": 3630
|
| 2552 |
+
},
|
| 2553 |
+
{
|
| 2554 |
+
"epoch": 0.9706666666666667,
|
| 2555 |
+
"grad_norm": 8.890005111694336,
|
| 2556 |
+
"learning_rate": 1.5498652291105121e-06,
|
| 2557 |
+
"loss": 0.1979,
|
| 2558 |
+
"step": 3640
|
| 2559 |
+
},
|
| 2560 |
+
{
|
| 2561 |
+
"epoch": 0.9733333333333334,
|
| 2562 |
+
"grad_norm": 7.4598259925842285,
|
| 2563 |
+
"learning_rate": 1.4150943396226415e-06,
|
| 2564 |
+
"loss": 0.3526,
|
| 2565 |
+
"step": 3650
|
| 2566 |
+
},
|
| 2567 |
+
{
|
| 2568 |
+
"epoch": 0.976,
|
| 2569 |
+
"grad_norm": 4.237139701843262,
|
| 2570 |
+
"learning_rate": 1.280323450134771e-06,
|
| 2571 |
+
"loss": 0.2159,
|
| 2572 |
+
"step": 3660
|
| 2573 |
+
},
|
| 2574 |
+
{
|
| 2575 |
+
"epoch": 0.9786666666666667,
|
| 2576 |
+
"grad_norm": 5.643311500549316,
|
| 2577 |
+
"learning_rate": 1.1455525606469004e-06,
|
| 2578 |
+
"loss": 0.1425,
|
| 2579 |
+
"step": 3670
|
| 2580 |
+
},
|
| 2581 |
+
{
|
| 2582 |
+
"epoch": 0.9813333333333333,
|
| 2583 |
+
"grad_norm": 7.4330267906188965,
|
| 2584 |
+
"learning_rate": 1.0107816711590296e-06,
|
| 2585 |
+
"loss": 0.1761,
|
| 2586 |
+
"step": 3680
|
| 2587 |
+
},
|
| 2588 |
+
{
|
| 2589 |
+
"epoch": 0.984,
|
| 2590 |
+
"grad_norm": 12.03699779510498,
|
| 2591 |
+
"learning_rate": 8.76010781671159e-07,
|
| 2592 |
+
"loss": 0.2607,
|
| 2593 |
+
"step": 3690
|
| 2594 |
+
},
|
| 2595 |
+
{
|
| 2596 |
+
"epoch": 0.9866666666666667,
|
| 2597 |
+
"grad_norm": 6.911093235015869,
|
| 2598 |
+
"learning_rate": 7.412398921832885e-07,
|
| 2599 |
+
"loss": 0.1755,
|
| 2600 |
+
"step": 3700
|
| 2601 |
+
},
|
| 2602 |
+
{
|
| 2603 |
+
"epoch": 0.9893333333333333,
|
| 2604 |
+
"grad_norm": 6.668974876403809,
|
| 2605 |
+
"learning_rate": 6.064690026954178e-07,
|
| 2606 |
+
"loss": 0.2031,
|
| 2607 |
+
"step": 3710
|
| 2608 |
+
},
|
| 2609 |
+
{
|
| 2610 |
+
"epoch": 0.992,
|
| 2611 |
+
"grad_norm": 11.474651336669922,
|
| 2612 |
+
"learning_rate": 4.7169811320754717e-07,
|
| 2613 |
+
"loss": 0.2236,
|
| 2614 |
+
"step": 3720
|
| 2615 |
+
},
|
| 2616 |
+
{
|
| 2617 |
+
"epoch": 0.9946666666666667,
|
| 2618 |
+
"grad_norm": 9.00444507598877,
|
| 2619 |
+
"learning_rate": 3.369272237196766e-07,
|
| 2620 |
+
"loss": 0.1306,
|
| 2621 |
+
"step": 3730
|
| 2622 |
+
},
|
| 2623 |
+
{
|
| 2624 |
+
"epoch": 0.9973333333333333,
|
| 2625 |
+
"grad_norm": 52.68935012817383,
|
| 2626 |
+
"learning_rate": 2.0215633423180594e-07,
|
| 2627 |
+
"loss": 0.231,
|
| 2628 |
+
"step": 3740
|
| 2629 |
+
},
|
| 2630 |
+
{
|
| 2631 |
+
"epoch": 1.0,
|
| 2632 |
+
"grad_norm": 5.777242183685303,
|
| 2633 |
+
"learning_rate": 6.738544474393531e-08,
|
| 2634 |
+
"loss": 0.1794,
|
| 2635 |
+
"step": 3750
|
| 2636 |
}
|
| 2637 |
],
|
| 2638 |
"logging_steps": 10,
|
|
|
|
| 2647 |
"should_evaluate": false,
|
| 2648 |
"should_log": false,
|
| 2649 |
"should_save": true,
|
| 2650 |
+
"should_training_stop": true
|
| 2651 |
},
|
| 2652 |
"attributes": {}
|
| 2653 |
}
|
| 2654 |
},
|
| 2655 |
+
"total_flos": 6.77976396217344e+16,
|
| 2656 |
"train_batch_size": 2,
|
| 2657 |
"trial_name": null,
|
| 2658 |
"trial_params": null
|