Training in progress, step 4000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 328277848
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd5257ed25b3deedcdfbd77b311ce64f39ce97cab4262552a2cce890d0e1ed2f
|
| 3 |
size 328277848
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 318646859
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38c6d9ddeda93bf2814232d10b3b4a6111c3ba43c271d5af0fe9ac07ad7bdf8f
|
| 3 |
size 318646859
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8647979d889bb2b15d0a3e8961a7e547be28d07767d240f858bd959476bb870c
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f34721a2fd924d02bdad3691f09e25bcb5ed140f7982be7b710c4ccbd2538c0
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2521,6 +2521,364 @@
|
|
| 2521 |
"eval_samples_per_second": 278.824,
|
| 2522 |
"eval_steps_per_second": 5.855,
|
| 2523 |
"step": 3500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2524 |
}
|
| 2525 |
],
|
| 2526 |
"logging_steps": 10,
|
|
@@ -2540,7 +2898,7 @@
|
|
| 2540 |
"attributes": {}
|
| 2541 |
}
|
| 2542 |
},
|
| 2543 |
-
"total_flos": 1.
|
| 2544 |
"train_batch_size": 48,
|
| 2545 |
"trial_name": null,
|
| 2546 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.6757898293630681,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 4000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2521 |
"eval_samples_per_second": 278.824,
|
| 2522 |
"eval_steps_per_second": 5.855,
|
| 2523 |
"step": 3500
|
| 2524 |
+
},
|
| 2525 |
+
{
|
| 2526 |
+
"epoch": 0.5930055752660922,
|
| 2527 |
+
"grad_norm": 0.6327577233314514,
|
| 2528 |
+
"learning_rate": 0.00028291929128998293,
|
| 2529 |
+
"loss": 4.818768310546875,
|
| 2530 |
+
"step": 3510
|
| 2531 |
+
},
|
| 2532 |
+
{
|
| 2533 |
+
"epoch": 0.5946950498395,
|
| 2534 |
+
"grad_norm": 0.5604032278060913,
|
| 2535 |
+
"learning_rate": 0.00028269662709221635,
|
| 2536 |
+
"loss": 4.822237777709961,
|
| 2537 |
+
"step": 3520
|
| 2538 |
+
},
|
| 2539 |
+
{
|
| 2540 |
+
"epoch": 0.5963845244129076,
|
| 2541 |
+
"grad_norm": 0.5533990859985352,
|
| 2542 |
+
"learning_rate": 0.00028247260974544037,
|
| 2543 |
+
"loss": 4.827470779418945,
|
| 2544 |
+
"step": 3530
|
| 2545 |
+
},
|
| 2546 |
+
{
|
| 2547 |
+
"epoch": 0.5980739989863153,
|
| 2548 |
+
"grad_norm": 0.613385796546936,
|
| 2549 |
+
"learning_rate": 0.00028224724153403015,
|
| 2550 |
+
"loss": 4.831082153320312,
|
| 2551 |
+
"step": 3540
|
| 2552 |
+
},
|
| 2553 |
+
{
|
| 2554 |
+
"epoch": 0.5997634735597229,
|
| 2555 |
+
"grad_norm": 0.5741211175918579,
|
| 2556 |
+
"learning_rate": 0.0002820205247561356,
|
| 2557 |
+
"loss": 4.8289031982421875,
|
| 2558 |
+
"step": 3550
|
| 2559 |
+
},
|
| 2560 |
+
{
|
| 2561 |
+
"epoch": 0.6014529481331305,
|
| 2562 |
+
"grad_norm": 0.5649030804634094,
|
| 2563 |
+
"learning_rate": 0.0002817924617236587,
|
| 2564 |
+
"loss": 4.8354541778564455,
|
| 2565 |
+
"step": 3560
|
| 2566 |
+
},
|
| 2567 |
+
{
|
| 2568 |
+
"epoch": 0.6031424227065383,
|
| 2569 |
+
"grad_norm": 0.5777986645698547,
|
| 2570 |
+
"learning_rate": 0.00028156305476222966,
|
| 2571 |
+
"loss": 4.831108856201172,
|
| 2572 |
+
"step": 3570
|
| 2573 |
+
},
|
| 2574 |
+
{
|
| 2575 |
+
"epoch": 0.6048318972799459,
|
| 2576 |
+
"grad_norm": 0.5951328277587891,
|
| 2577 |
+
"learning_rate": 0.0002813323062111828,
|
| 2578 |
+
"loss": 4.814881896972656,
|
| 2579 |
+
"step": 3580
|
| 2580 |
+
},
|
| 2581 |
+
{
|
| 2582 |
+
"epoch": 0.6065213718533536,
|
| 2583 |
+
"grad_norm": 0.6194175481796265,
|
| 2584 |
+
"learning_rate": 0.0002811002184235334,
|
| 2585 |
+
"loss": 4.799482345581055,
|
| 2586 |
+
"step": 3590
|
| 2587 |
+
},
|
| 2588 |
+
{
|
| 2589 |
+
"epoch": 0.6082108464267613,
|
| 2590 |
+
"grad_norm": 0.6000851392745972,
|
| 2591 |
+
"learning_rate": 0.00028086679376595314,
|
| 2592 |
+
"loss": 4.825896072387695,
|
| 2593 |
+
"step": 3600
|
| 2594 |
+
},
|
| 2595 |
+
{
|
| 2596 |
+
"epoch": 0.609900321000169,
|
| 2597 |
+
"grad_norm": 0.5737459659576416,
|
| 2598 |
+
"learning_rate": 0.00028063203461874635,
|
| 2599 |
+
"loss": 4.819542312622071,
|
| 2600 |
+
"step": 3610
|
| 2601 |
+
},
|
| 2602 |
+
{
|
| 2603 |
+
"epoch": 0.6115897955735766,
|
| 2604 |
+
"grad_norm": 0.5582530498504639,
|
| 2605 |
+
"learning_rate": 0.0002803959433758254,
|
| 2606 |
+
"loss": 4.792166137695313,
|
| 2607 |
+
"step": 3620
|
| 2608 |
+
},
|
| 2609 |
+
{
|
| 2610 |
+
"epoch": 0.6132792701469842,
|
| 2611 |
+
"grad_norm": 0.6597942113876343,
|
| 2612 |
+
"learning_rate": 0.0002801585224446866,
|
| 2613 |
+
"loss": 4.817278671264648,
|
| 2614 |
+
"step": 3630
|
| 2615 |
+
},
|
| 2616 |
+
{
|
| 2617 |
+
"epoch": 0.614968744720392,
|
| 2618 |
+
"grad_norm": 0.6341045498847961,
|
| 2619 |
+
"learning_rate": 0.0002799197742463854,
|
| 2620 |
+
"loss": 4.801259613037109,
|
| 2621 |
+
"step": 3640
|
| 2622 |
+
},
|
| 2623 |
+
{
|
| 2624 |
+
"epoch": 0.6166582192937996,
|
| 2625 |
+
"grad_norm": 0.5537974834442139,
|
| 2626 |
+
"learning_rate": 0.0002796797012155118,
|
| 2627 |
+
"loss": 4.792682266235351,
|
| 2628 |
+
"step": 3650
|
| 2629 |
+
},
|
| 2630 |
+
{
|
| 2631 |
+
"epoch": 0.6183476938672073,
|
| 2632 |
+
"grad_norm": 0.5912588238716125,
|
| 2633 |
+
"learning_rate": 0.0002794383058001657,
|
| 2634 |
+
"loss": 4.815021514892578,
|
| 2635 |
+
"step": 3660
|
| 2636 |
+
},
|
| 2637 |
+
{
|
| 2638 |
+
"epoch": 0.620037168440615,
|
| 2639 |
+
"grad_norm": 0.5857402086257935,
|
| 2640 |
+
"learning_rate": 0.00027919559046193156,
|
| 2641 |
+
"loss": 4.807975006103516,
|
| 2642 |
+
"step": 3670
|
| 2643 |
+
},
|
| 2644 |
+
{
|
| 2645 |
+
"epoch": 0.6217266430140226,
|
| 2646 |
+
"grad_norm": 0.5567502379417419,
|
| 2647 |
+
"learning_rate": 0.0002789515576758536,
|
| 2648 |
+
"loss": 4.815755081176758,
|
| 2649 |
+
"step": 3680
|
| 2650 |
+
},
|
| 2651 |
+
{
|
| 2652 |
+
"epoch": 0.6234161175874303,
|
| 2653 |
+
"grad_norm": 0.5999053120613098,
|
| 2654 |
+
"learning_rate": 0.00027870620993041055,
|
| 2655 |
+
"loss": 4.76678237915039,
|
| 2656 |
+
"step": 3690
|
| 2657 |
+
},
|
| 2658 |
+
{
|
| 2659 |
+
"epoch": 0.6251055921608379,
|
| 2660 |
+
"grad_norm": 0.6173041462898254,
|
| 2661 |
+
"learning_rate": 0.00027845954972749004,
|
| 2662 |
+
"loss": 4.78552474975586,
|
| 2663 |
+
"step": 3700
|
| 2664 |
+
},
|
| 2665 |
+
{
|
| 2666 |
+
"epoch": 0.6267950667342457,
|
| 2667 |
+
"grad_norm": 0.620463490486145,
|
| 2668 |
+
"learning_rate": 0.0002782115795823633,
|
| 2669 |
+
"loss": 4.800563812255859,
|
| 2670 |
+
"step": 3710
|
| 2671 |
+
},
|
| 2672 |
+
{
|
| 2673 |
+
"epoch": 0.6284845413076533,
|
| 2674 |
+
"grad_norm": 0.5800994038581848,
|
| 2675 |
+
"learning_rate": 0.0002779623020236594,
|
| 2676 |
+
"loss": 4.786049270629883,
|
| 2677 |
+
"step": 3720
|
| 2678 |
+
},
|
| 2679 |
+
{
|
| 2680 |
+
"epoch": 0.630174015881061,
|
| 2681 |
+
"grad_norm": 0.6238998174667358,
|
| 2682 |
+
"learning_rate": 0.00027771171959333976,
|
| 2683 |
+
"loss": 4.825713348388672,
|
| 2684 |
+
"step": 3730
|
| 2685 |
+
},
|
| 2686 |
+
{
|
| 2687 |
+
"epoch": 0.6318634904544687,
|
| 2688 |
+
"grad_norm": 0.5553771257400513,
|
| 2689 |
+
"learning_rate": 0.00027745983484667164,
|
| 2690 |
+
"loss": 4.7812854766845705,
|
| 2691 |
+
"step": 3740
|
| 2692 |
+
},
|
| 2693 |
+
{
|
| 2694 |
+
"epoch": 0.6335529650278763,
|
| 2695 |
+
"grad_norm": 0.5241145491600037,
|
| 2696 |
+
"learning_rate": 0.0002772066503522026,
|
| 2697 |
+
"loss": 4.793375396728516,
|
| 2698 |
+
"step": 3750
|
| 2699 |
+
},
|
| 2700 |
+
{
|
| 2701 |
+
"epoch": 0.635242439601284,
|
| 2702 |
+
"grad_norm": 0.6024186015129089,
|
| 2703 |
+
"learning_rate": 0.00027695216869173415,
|
| 2704 |
+
"loss": 4.763290786743164,
|
| 2705 |
+
"step": 3760
|
| 2706 |
+
},
|
| 2707 |
+
{
|
| 2708 |
+
"epoch": 0.6369319141746916,
|
| 2709 |
+
"grad_norm": 0.6140456199645996,
|
| 2710 |
+
"learning_rate": 0.0002766963924602953,
|
| 2711 |
+
"loss": 4.783917236328125,
|
| 2712 |
+
"step": 3770
|
| 2713 |
+
},
|
| 2714 |
+
{
|
| 2715 |
+
"epoch": 0.6386213887480994,
|
| 2716 |
+
"grad_norm": 0.58733731508255,
|
| 2717 |
+
"learning_rate": 0.00027643932426611647,
|
| 2718 |
+
"loss": 4.783477401733398,
|
| 2719 |
+
"step": 3780
|
| 2720 |
+
},
|
| 2721 |
+
{
|
| 2722 |
+
"epoch": 0.640310863321507,
|
| 2723 |
+
"grad_norm": 0.5765292048454285,
|
| 2724 |
+
"learning_rate": 0.0002761809667306022,
|
| 2725 |
+
"loss": 4.76646842956543,
|
| 2726 |
+
"step": 3790
|
| 2727 |
+
},
|
| 2728 |
+
{
|
| 2729 |
+
"epoch": 0.6420003378949147,
|
| 2730 |
+
"grad_norm": 0.6646907329559326,
|
| 2731 |
+
"learning_rate": 0.00027592132248830526,
|
| 2732 |
+
"loss": 4.773694229125977,
|
| 2733 |
+
"step": 3800
|
| 2734 |
+
},
|
| 2735 |
+
{
|
| 2736 |
+
"epoch": 0.6436898124683224,
|
| 2737 |
+
"grad_norm": 0.5674440264701843,
|
| 2738 |
+
"learning_rate": 0.00027566039418689905,
|
| 2739 |
+
"loss": 4.759212493896484,
|
| 2740 |
+
"step": 3810
|
| 2741 |
+
},
|
| 2742 |
+
{
|
| 2743 |
+
"epoch": 0.64537928704173,
|
| 2744 |
+
"grad_norm": 0.5485038161277771,
|
| 2745 |
+
"learning_rate": 0.00027539818448715124,
|
| 2746 |
+
"loss": 4.7634929656982425,
|
| 2747 |
+
"step": 3820
|
| 2748 |
+
},
|
| 2749 |
+
{
|
| 2750 |
+
"epoch": 0.6470687616151377,
|
| 2751 |
+
"grad_norm": 0.601775586605072,
|
| 2752 |
+
"learning_rate": 0.000275134696062896,
|
| 2753 |
+
"loss": 4.772266006469726,
|
| 2754 |
+
"step": 3830
|
| 2755 |
+
},
|
| 2756 |
+
{
|
| 2757 |
+
"epoch": 0.6487582361885453,
|
| 2758 |
+
"grad_norm": 0.5745565891265869,
|
| 2759 |
+
"learning_rate": 0.0002748699316010073,
|
| 2760 |
+
"loss": 4.750381851196289,
|
| 2761 |
+
"step": 3840
|
| 2762 |
+
},
|
| 2763 |
+
{
|
| 2764 |
+
"epoch": 0.6504477107619531,
|
| 2765 |
+
"grad_norm": 0.611356258392334,
|
| 2766 |
+
"learning_rate": 0.000274603893801371,
|
| 2767 |
+
"loss": 4.7574516296386715,
|
| 2768 |
+
"step": 3850
|
| 2769 |
+
},
|
| 2770 |
+
{
|
| 2771 |
+
"epoch": 0.6521371853353607,
|
| 2772 |
+
"grad_norm": 0.5577182173728943,
|
| 2773 |
+
"learning_rate": 0.000274336585376858,
|
| 2774 |
+
"loss": 4.7543701171875,
|
| 2775 |
+
"step": 3860
|
| 2776 |
+
},
|
| 2777 |
+
{
|
| 2778 |
+
"epoch": 0.6538266599087684,
|
| 2779 |
+
"grad_norm": 0.5428398251533508,
|
| 2780 |
+
"learning_rate": 0.0002740680090532958,
|
| 2781 |
+
"loss": 4.768844223022461,
|
| 2782 |
+
"step": 3870
|
| 2783 |
+
},
|
| 2784 |
+
{
|
| 2785 |
+
"epoch": 0.655516134482176,
|
| 2786 |
+
"grad_norm": 0.5419198870658875,
|
| 2787 |
+
"learning_rate": 0.0002737981675694411,
|
| 2788 |
+
"loss": 4.7659965515136715,
|
| 2789 |
+
"step": 3880
|
| 2790 |
+
},
|
| 2791 |
+
{
|
| 2792 |
+
"epoch": 0.6572056090555837,
|
| 2793 |
+
"grad_norm": 0.5463002920150757,
|
| 2794 |
+
"learning_rate": 0.00027352706367695203,
|
| 2795 |
+
"loss": 4.753771591186523,
|
| 2796 |
+
"step": 3890
|
| 2797 |
+
},
|
| 2798 |
+
{
|
| 2799 |
+
"epoch": 0.6588950836289914,
|
| 2800 |
+
"grad_norm": 0.5442675948143005,
|
| 2801 |
+
"learning_rate": 0.00027325470014035965,
|
| 2802 |
+
"loss": 4.76721076965332,
|
| 2803 |
+
"step": 3900
|
| 2804 |
+
},
|
| 2805 |
+
{
|
| 2806 |
+
"epoch": 0.660584558202399,
|
| 2807 |
+
"grad_norm": 0.54521244764328,
|
| 2808 |
+
"learning_rate": 0.0002729810797370402,
|
| 2809 |
+
"loss": 4.7523548126220705,
|
| 2810 |
+
"step": 3910
|
| 2811 |
+
},
|
| 2812 |
+
{
|
| 2813 |
+
"epoch": 0.6622740327758068,
|
| 2814 |
+
"grad_norm": 0.5418686866760254,
|
| 2815 |
+
"learning_rate": 0.00027270620525718647,
|
| 2816 |
+
"loss": 4.739299774169922,
|
| 2817 |
+
"step": 3920
|
| 2818 |
+
},
|
| 2819 |
+
{
|
| 2820 |
+
"epoch": 0.6639635073492144,
|
| 2821 |
+
"grad_norm": 0.540998101234436,
|
| 2822 |
+
"learning_rate": 0.0002724300795037796,
|
| 2823 |
+
"loss": 4.779198455810547,
|
| 2824 |
+
"step": 3930
|
| 2825 |
+
},
|
| 2826 |
+
{
|
| 2827 |
+
"epoch": 0.665652981922622,
|
| 2828 |
+
"grad_norm": 0.5339457392692566,
|
| 2829 |
+
"learning_rate": 0.00027215270529256015,
|
| 2830 |
+
"loss": 4.735797882080078,
|
| 2831 |
+
"step": 3940
|
| 2832 |
+
},
|
| 2833 |
+
{
|
| 2834 |
+
"epoch": 0.6673424564960297,
|
| 2835 |
+
"grad_norm": 0.5907135009765625,
|
| 2836 |
+
"learning_rate": 0.00027187408545199977,
|
| 2837 |
+
"loss": 4.734383392333984,
|
| 2838 |
+
"step": 3950
|
| 2839 |
+
},
|
| 2840 |
+
{
|
| 2841 |
+
"epoch": 0.6690319310694374,
|
| 2842 |
+
"grad_norm": 0.5747363567352295,
|
| 2843 |
+
"learning_rate": 0.00027159422282327204,
|
| 2844 |
+
"loss": 4.746397399902344,
|
| 2845 |
+
"step": 3960
|
| 2846 |
+
},
|
| 2847 |
+
{
|
| 2848 |
+
"epoch": 0.6707214056428451,
|
| 2849 |
+
"grad_norm": 0.5700234770774841,
|
| 2850 |
+
"learning_rate": 0.0002713131202602238,
|
| 2851 |
+
"loss": 4.765536499023438,
|
| 2852 |
+
"step": 3970
|
| 2853 |
+
},
|
| 2854 |
+
{
|
| 2855 |
+
"epoch": 0.6724108802162527,
|
| 2856 |
+
"grad_norm": 0.5308473706245422,
|
| 2857 |
+
"learning_rate": 0.0002710307806293458,
|
| 2858 |
+
"loss": 4.714194107055664,
|
| 2859 |
+
"step": 3980
|
| 2860 |
+
},
|
| 2861 |
+
{
|
| 2862 |
+
"epoch": 0.6741003547896605,
|
| 2863 |
+
"grad_norm": 0.5844800472259521,
|
| 2864 |
+
"learning_rate": 0.0002707472068097435,
|
| 2865 |
+
"loss": 4.748070526123047,
|
| 2866 |
+
"step": 3990
|
| 2867 |
+
},
|
| 2868 |
+
{
|
| 2869 |
+
"epoch": 0.6757898293630681,
|
| 2870 |
+
"grad_norm": 0.5399011373519897,
|
| 2871 |
+
"learning_rate": 0.0002704624016931079,
|
| 2872 |
+
"loss": 4.742585754394531,
|
| 2873 |
+
"step": 4000
|
| 2874 |
+
},
|
| 2875 |
+
{
|
| 2876 |
+
"epoch": 0.6757898293630681,
|
| 2877 |
+
"eval_loss": 4.705080509185791,
|
| 2878 |
+
"eval_runtime": 3.6213,
|
| 2879 |
+
"eval_samples_per_second": 276.145,
|
| 2880 |
+
"eval_steps_per_second": 5.799,
|
| 2881 |
+
"step": 4000
|
| 2882 |
}
|
| 2883 |
],
|
| 2884 |
"logging_steps": 10,
|
|
|
|
| 2898 |
"attributes": {}
|
| 2899 |
}
|
| 2900 |
},
|
| 2901 |
+
"total_flos": 1.33782728343552e+17,
|
| 2902 |
"train_batch_size": 48,
|
| 2903 |
"trial_name": null,
|
| 2904 |
"trial_params": null
|