Training in progress, step 13000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 150625560
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23be2c11c244c72601ea6f47dd507781736231ff1da2289fe5f8ba433277cb99
|
| 3 |
size 150625560
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04943bdcad0923c88796f61e80a911b94cde9c121a1bb27006e82c8a584a0c44
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff84b2998c9ce4e6e3eaf03e775fc93a7c11be8195c0bb3abb7a8b9a1cec86e5
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71a524f67e79e2b512d6d818f94e2b528e5b7f4447259f3966ae44cdba439db5
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2624,11 +2624,229 @@
|
|
| 2624 |
"eval_steps_per_second": 20.507,
|
| 2625 |
"num_input_tokens_seen": 5797304337,
|
| 2626 |
"step": 12000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2627 |
}
|
| 2628 |
],
|
| 2629 |
"logging_steps": 50,
|
| 2630 |
"max_steps": 16568,
|
| 2631 |
-
"num_input_tokens_seen":
|
| 2632 |
"num_train_epochs": 4,
|
| 2633 |
"save_steps": 1000,
|
| 2634 |
"stateful_callbacks": {
|
|
@@ -2643,7 +2861,7 @@
|
|
| 2643 |
"attributes": {}
|
| 2644 |
}
|
| 2645 |
},
|
| 2646 |
-
"total_flos": 1.
|
| 2647 |
"train_batch_size": 16,
|
| 2648 |
"trial_name": null,
|
| 2649 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.1378457081642197,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 13000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2624 |
"eval_steps_per_second": 20.507,
|
| 2625 |
"num_input_tokens_seen": 5797304337,
|
| 2626 |
"step": 12000
|
| 2627 |
+
},
|
| 2628 |
+
{
|
| 2629 |
+
"epoch": 2.908671183065015,
|
| 2630 |
+
"grad_norm": 0.2578125,
|
| 2631 |
+
"learning_rate": 1.704391127206881e-05,
|
| 2632 |
+
"loss": 2.0958,
|
| 2633 |
+
"mean_token_accuracy": 0.5546299646422267,
|
| 2634 |
+
"num_input_tokens_seen": 5821442385,
|
| 2635 |
+
"num_tokens": 2453453845.0,
|
| 2636 |
+
"step": 12050
|
| 2637 |
+
},
|
| 2638 |
+
{
|
| 2639 |
+
"epoch": 2.920741735443668,
|
| 2640 |
+
"grad_norm": 0.26171875,
|
| 2641 |
+
"learning_rate": 1.685528896936774e-05,
|
| 2642 |
+
"loss": 2.0926,
|
| 2643 |
+
"mean_token_accuracy": 0.5549974143505096,
|
| 2644 |
+
"num_input_tokens_seen": 5845686961,
|
| 2645 |
+
"num_tokens": 2463776050.0,
|
| 2646 |
+
"step": 12100
|
| 2647 |
+
},
|
| 2648 |
+
{
|
| 2649 |
+
"epoch": 2.9328122878223217,
|
| 2650 |
+
"grad_norm": 0.263671875,
|
| 2651 |
+
"learning_rate": 1.6666666666666667e-05,
|
| 2652 |
+
"loss": 2.1015,
|
| 2653 |
+
"mean_token_accuracy": 0.5541527543962002,
|
| 2654 |
+
"num_input_tokens_seen": 5869745137,
|
| 2655 |
+
"num_tokens": 2473828195.0,
|
| 2656 |
+
"step": 12150
|
| 2657 |
+
},
|
| 2658 |
+
{
|
| 2659 |
+
"epoch": 2.944882840200975,
|
| 2660 |
+
"grad_norm": 0.26953125,
|
| 2661 |
+
"learning_rate": 1.6478044363965596e-05,
|
| 2662 |
+
"loss": 2.1041,
|
| 2663 |
+
"mean_token_accuracy": 0.5541104365140199,
|
| 2664 |
+
"num_input_tokens_seen": 5893803025,
|
| 2665 |
+
"num_tokens": 2483915340.0,
|
| 2666 |
+
"step": 12200
|
| 2667 |
+
},
|
| 2668 |
+
{
|
| 2669 |
+
"epoch": 2.956953392579628,
|
| 2670 |
+
"grad_norm": 0.2333984375,
|
| 2671 |
+
"learning_rate": 1.6289422061264525e-05,
|
| 2672 |
+
"loss": 2.0922,
|
| 2673 |
+
"mean_token_accuracy": 0.5555301706120371,
|
| 2674 |
+
"num_input_tokens_seen": 5918068641,
|
| 2675 |
+
"num_tokens": 2494208052.0,
|
| 2676 |
+
"step": 12250
|
| 2677 |
+
},
|
| 2678 |
+
{
|
| 2679 |
+
"epoch": 2.9690239449582814,
|
| 2680 |
+
"grad_norm": 0.2490234375,
|
| 2681 |
+
"learning_rate": 1.6100799758563453e-05,
|
| 2682 |
+
"loss": 2.0938,
|
| 2683 |
+
"mean_token_accuracy": 0.5548020200431347,
|
| 2684 |
+
"num_input_tokens_seen": 5942257041,
|
| 2685 |
+
"num_tokens": 2504393372.0,
|
| 2686 |
+
"step": 12300
|
| 2687 |
+
},
|
| 2688 |
+
{
|
| 2689 |
+
"epoch": 2.9810944973369344,
|
| 2690 |
+
"grad_norm": 0.2890625,
|
| 2691 |
+
"learning_rate": 1.5912177455862382e-05,
|
| 2692 |
+
"loss": 2.0843,
|
| 2693 |
+
"mean_token_accuracy": 0.5566597804427147,
|
| 2694 |
+
"num_input_tokens_seen": 5966422081,
|
| 2695 |
+
"num_tokens": 2514627798.0,
|
| 2696 |
+
"step": 12350
|
| 2697 |
+
},
|
| 2698 |
+
{
|
| 2699 |
+
"epoch": 2.9931650497155875,
|
| 2700 |
+
"grad_norm": 0.2734375,
|
| 2701 |
+
"learning_rate": 1.572355515316131e-05,
|
| 2702 |
+
"loss": 2.0886,
|
| 2703 |
+
"mean_token_accuracy": 0.5566082544624805,
|
| 2704 |
+
"num_input_tokens_seen": 5990574321,
|
| 2705 |
+
"num_tokens": 2524805007.0,
|
| 2706 |
+
"step": 12400
|
| 2707 |
+
},
|
| 2708 |
+
{
|
| 2709 |
+
"epoch": 3.005069631999034,
|
| 2710 |
+
"grad_norm": 0.26171875,
|
| 2711 |
+
"learning_rate": 1.553493285046024e-05,
|
| 2712 |
+
"loss": 2.1001,
|
| 2713 |
+
"mean_token_accuracy": 0.5549402527407246,
|
| 2714 |
+
"num_input_tokens_seen": 6014380145,
|
| 2715 |
+
"num_tokens": 2534738802.0,
|
| 2716 |
+
"step": 12450
|
| 2717 |
+
},
|
| 2718 |
+
{
|
| 2719 |
+
"epoch": 3.0171401843776877,
|
| 2720 |
+
"grad_norm": 0.2314453125,
|
| 2721 |
+
"learning_rate": 1.5346310547759168e-05,
|
| 2722 |
+
"loss": 2.092,
|
| 2723 |
+
"num_input_tokens_seen": 6038556753,
|
| 2724 |
+
"step": 12500
|
| 2725 |
+
},
|
| 2726 |
+
{
|
| 2727 |
+
"epoch": 3.0171401843776877,
|
| 2728 |
+
"eval_loss": 1.9681233167648315,
|
| 2729 |
+
"eval_mean_token_accuracy": 0.5784891846355349,
|
| 2730 |
+
"eval_num_tokens": 2544886437.0,
|
| 2731 |
+
"eval_runtime": 130.6689,
|
| 2732 |
+
"eval_samples_per_second": 81.978,
|
| 2733 |
+
"eval_steps_per_second": 20.495,
|
| 2734 |
+
"num_input_tokens_seen": 6038556753,
|
| 2735 |
+
"step": 12500
|
| 2736 |
+
},
|
| 2737 |
+
{
|
| 2738 |
+
"epoch": 3.029210736756341,
|
| 2739 |
+
"grad_norm": 0.25390625,
|
| 2740 |
+
"learning_rate": 1.5157688245058096e-05,
|
| 2741 |
+
"loss": 2.0925,
|
| 2742 |
+
"mean_token_accuracy": 0.5550393326207995,
|
| 2743 |
+
"num_input_tokens_seen": 6062857617,
|
| 2744 |
+
"num_tokens": 2555112151.0,
|
| 2745 |
+
"step": 12550
|
| 2746 |
+
},
|
| 2747 |
+
{
|
| 2748 |
+
"epoch": 3.041281289134994,
|
| 2749 |
+
"grad_norm": 0.38671875,
|
| 2750 |
+
"learning_rate": 1.4969065942357025e-05,
|
| 2751 |
+
"loss": 2.0957,
|
| 2752 |
+
"mean_token_accuracy": 0.5551863227039575,
|
| 2753 |
+
"num_input_tokens_seen": 6087077841,
|
| 2754 |
+
"num_tokens": 2565388515.0,
|
| 2755 |
+
"step": 12600
|
| 2756 |
+
},
|
| 2757 |
+
{
|
| 2758 |
+
"epoch": 3.0533518415136474,
|
| 2759 |
+
"grad_norm": 0.279296875,
|
| 2760 |
+
"learning_rate": 1.4780443639655952e-05,
|
| 2761 |
+
"loss": 2.0858,
|
| 2762 |
+
"mean_token_accuracy": 0.5563259933143854,
|
| 2763 |
+
"num_input_tokens_seen": 6111161617,
|
| 2764 |
+
"num_tokens": 2575504513.0,
|
| 2765 |
+
"step": 12650
|
| 2766 |
+
},
|
| 2767 |
+
{
|
| 2768 |
+
"epoch": 3.0654223938923004,
|
| 2769 |
+
"grad_norm": 0.25,
|
| 2770 |
+
"learning_rate": 1.4591821336954884e-05,
|
| 2771 |
+
"loss": 2.101,
|
| 2772 |
+
"mean_token_accuracy": 0.5549140437319875,
|
| 2773 |
+
"num_input_tokens_seen": 6135170369,
|
| 2774 |
+
"num_tokens": 2585570498.0,
|
| 2775 |
+
"step": 12700
|
| 2776 |
+
},
|
| 2777 |
+
{
|
| 2778 |
+
"epoch": 3.077492946270954,
|
| 2779 |
+
"grad_norm": 0.263671875,
|
| 2780 |
+
"learning_rate": 1.4403199034253811e-05,
|
| 2781 |
+
"loss": 2.0935,
|
| 2782 |
+
"mean_token_accuracy": 0.5543564364686608,
|
| 2783 |
+
"num_input_tokens_seen": 6159397985,
|
| 2784 |
+
"num_tokens": 2595740107.0,
|
| 2785 |
+
"step": 12750
|
| 2786 |
+
},
|
| 2787 |
+
{
|
| 2788 |
+
"epoch": 3.089563498649607,
|
| 2789 |
+
"grad_norm": 0.265625,
|
| 2790 |
+
"learning_rate": 1.421457673155274e-05,
|
| 2791 |
+
"loss": 2.0928,
|
| 2792 |
+
"mean_token_accuracy": 0.5548016136884689,
|
| 2793 |
+
"num_input_tokens_seen": 6183511137,
|
| 2794 |
+
"num_tokens": 2605900153.0,
|
| 2795 |
+
"step": 12800
|
| 2796 |
+
},
|
| 2797 |
+
{
|
| 2798 |
+
"epoch": 3.10163405102826,
|
| 2799 |
+
"grad_norm": 0.2890625,
|
| 2800 |
+
"learning_rate": 1.4025954428851668e-05,
|
| 2801 |
+
"loss": 2.0862,
|
| 2802 |
+
"mean_token_accuracy": 0.5555924268066883,
|
| 2803 |
+
"num_input_tokens_seen": 6207630993,
|
| 2804 |
+
"num_tokens": 2616105592.0,
|
| 2805 |
+
"step": 12850
|
| 2806 |
+
},
|
| 2807 |
+
{
|
| 2808 |
+
"epoch": 3.1137046034069136,
|
| 2809 |
+
"grad_norm": 0.248046875,
|
| 2810 |
+
"learning_rate": 1.3837332126150595e-05,
|
| 2811 |
+
"loss": 2.0938,
|
| 2812 |
+
"mean_token_accuracy": 0.554584386125207,
|
| 2813 |
+
"num_input_tokens_seen": 6231763217,
|
| 2814 |
+
"num_tokens": 2626268256.0,
|
| 2815 |
+
"step": 12900
|
| 2816 |
+
},
|
| 2817 |
+
{
|
| 2818 |
+
"epoch": 3.1257751557855666,
|
| 2819 |
+
"grad_norm": 0.251953125,
|
| 2820 |
+
"learning_rate": 1.3648709823449527e-05,
|
| 2821 |
+
"loss": 2.1042,
|
| 2822 |
+
"mean_token_accuracy": 0.553115917481482,
|
| 2823 |
+
"num_input_tokens_seen": 6255995041,
|
| 2824 |
+
"num_tokens": 2636461653.0,
|
| 2825 |
+
"step": 12950
|
| 2826 |
+
},
|
| 2827 |
+
{
|
| 2828 |
+
"epoch": 3.1378457081642197,
|
| 2829 |
+
"grad_norm": 0.25390625,
|
| 2830 |
+
"learning_rate": 1.3460087520748454e-05,
|
| 2831 |
+
"loss": 2.0952,
|
| 2832 |
+
"num_input_tokens_seen": 6280158129,
|
| 2833 |
+
"step": 13000
|
| 2834 |
+
},
|
| 2835 |
+
{
|
| 2836 |
+
"epoch": 3.1378457081642197,
|
| 2837 |
+
"eval_loss": 1.9681209325790405,
|
| 2838 |
+
"eval_mean_token_accuracy": 0.5785721040555485,
|
| 2839 |
+
"eval_num_tokens": 2646712354.0,
|
| 2840 |
+
"eval_runtime": 130.3881,
|
| 2841 |
+
"eval_samples_per_second": 82.155,
|
| 2842 |
+
"eval_steps_per_second": 20.539,
|
| 2843 |
+
"num_input_tokens_seen": 6280158129,
|
| 2844 |
+
"step": 13000
|
| 2845 |
}
|
| 2846 |
],
|
| 2847 |
"logging_steps": 50,
|
| 2848 |
"max_steps": 16568,
|
| 2849 |
+
"num_input_tokens_seen": 6280158129,
|
| 2850 |
"num_train_epochs": 4,
|
| 2851 |
"save_steps": 1000,
|
| 2852 |
"stateful_callbacks": {
|
|
|
|
| 2861 |
"attributes": {}
|
| 2862 |
}
|
| 2863 |
},
|
| 2864 |
+
"total_flos": 1.680003593850839e+18,
|
| 2865 |
"train_batch_size": 16,
|
| 2866 |
"trial_name": null,
|
| 2867 |
"trial_params": null
|