Training in progress, step 7500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step7500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step7500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e218a5bbad6f972caad0894b0b220511e0cb0cb3787c44a85e875a3ce67f3813
|
| 3 |
size 12017472
|
last-checkpoint/global_step7500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f0c6a776f557f95f4a5de2e5594e410d029df9d00a110f71924423c987a5e8b
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step7500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc6aedf984eb49a766b5397998d1ccdee863f9d0b635a66e9096c9fb5555965a
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step7500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5c58bb9b510ddfd192f4d2021c0156080905e1ad4e17052f1d4a70bda5c74ec
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch": 5.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2817,6 +2817,206 @@
|
|
| 2817 |
"eval_samples_per_second": 43.506,
|
| 2818 |
"eval_steps_per_second": 5.445,
|
| 2819 |
"step": 7000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2820 |
}
|
| 2821 |
],
|
| 2822 |
"logging_steps": 25,
|
|
@@ -2836,7 +3036,7 @@
|
|
| 2836 |
"attributes": {}
|
| 2837 |
}
|
| 2838 |
},
|
| 2839 |
-
"total_flos":
|
| 2840 |
"train_batch_size": 4,
|
| 2841 |
"trial_name": null,
|
| 2842 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 7500,
|
| 3 |
+
"best_metric": 0.5647426843643188,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-7500",
|
| 5 |
+
"epoch": 5.450827122341392,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 7500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2817 |
"eval_samples_per_second": 43.506,
|
| 2818 |
"eval_steps_per_second": 5.445,
|
| 2819 |
"step": 7000
|
| 2820 |
+
},
|
| 2821 |
+
{
|
| 2822 |
+
"epoch": 5.1054353753862936,
|
| 2823 |
+
"grad_norm": 0.7935928702354431,
|
| 2824 |
+
"learning_rate": 4.971038696306446e-05,
|
| 2825 |
+
"loss": 0.5501,
|
| 2826 |
+
"mean_token_accuracy": 0.8285538706183434,
|
| 2827 |
+
"num_tokens": 154698091.0,
|
| 2828 |
+
"step": 7025
|
| 2829 |
+
},
|
| 2830 |
+
{
|
| 2831 |
+
"epoch": 5.123613888383931,
|
| 2832 |
+
"grad_norm": 0.7594472169876099,
|
| 2833 |
+
"learning_rate": 4.952458208239385e-05,
|
| 2834 |
+
"loss": 0.5487,
|
| 2835 |
+
"mean_token_accuracy": 0.8303073984384537,
|
| 2836 |
+
"num_tokens": 155238389.0,
|
| 2837 |
+
"step": 7050
|
| 2838 |
+
},
|
| 2839 |
+
{
|
| 2840 |
+
"epoch": 5.141792401381567,
|
| 2841 |
+
"grad_norm": 0.7793622016906738,
|
| 2842 |
+
"learning_rate": 4.933855907853041e-05,
|
| 2843 |
+
"loss": 0.5526,
|
| 2844 |
+
"mean_token_accuracy": 0.828688297867775,
|
| 2845 |
+
"num_tokens": 155796109.0,
|
| 2846 |
+
"step": 7075
|
| 2847 |
+
},
|
| 2848 |
+
{
|
| 2849 |
+
"epoch": 5.159970914379204,
|
| 2850 |
+
"grad_norm": 0.7963124513626099,
|
| 2851 |
+
"learning_rate": 4.9152322211601326e-05,
|
| 2852 |
+
"loss": 0.5617,
|
| 2853 |
+
"mean_token_accuracy": 0.8250745138525963,
|
| 2854 |
+
"num_tokens": 156367817.0,
|
| 2855 |
+
"step": 7100
|
| 2856 |
+
},
|
| 2857 |
+
{
|
| 2858 |
+
"epoch": 5.178149427376841,
|
| 2859 |
+
"grad_norm": 0.815303385257721,
|
| 2860 |
+
"learning_rate": 4.8965875746631553e-05,
|
| 2861 |
+
"loss": 0.5527,
|
| 2862 |
+
"mean_token_accuracy": 0.8272364658117294,
|
| 2863 |
+
"num_tokens": 156937564.0,
|
| 2864 |
+
"step": 7125
|
| 2865 |
+
},
|
| 2866 |
+
{
|
| 2867 |
+
"epoch": 5.196327940374477,
|
| 2868 |
+
"grad_norm": 0.7769586443901062,
|
| 2869 |
+
"learning_rate": 4.8779223953446054e-05,
|
| 2870 |
+
"loss": 0.5539,
|
| 2871 |
+
"mean_token_accuracy": 0.8281649795174598,
|
| 2872 |
+
"num_tokens": 157487986.0,
|
| 2873 |
+
"step": 7150
|
| 2874 |
+
},
|
| 2875 |
+
{
|
| 2876 |
+
"epoch": 5.214506453372114,
|
| 2877 |
+
"grad_norm": 0.7640786170959473,
|
| 2878 |
+
"learning_rate": 4.8592371106571984e-05,
|
| 2879 |
+
"loss": 0.5553,
|
| 2880 |
+
"mean_token_accuracy": 0.8278635969758034,
|
| 2881 |
+
"num_tokens": 158049502.0,
|
| 2882 |
+
"step": 7175
|
| 2883 |
+
},
|
| 2884 |
+
{
|
| 2885 |
+
"epoch": 5.232684966369751,
|
| 2886 |
+
"grad_norm": 0.7943294644355774,
|
| 2887 |
+
"learning_rate": 4.8405321485140926e-05,
|
| 2888 |
+
"loss": 0.5515,
|
| 2889 |
+
"mean_token_accuracy": 0.8292607891559601,
|
| 2890 |
+
"num_tokens": 158573810.0,
|
| 2891 |
+
"step": 7200
|
| 2892 |
+
},
|
| 2893 |
+
{
|
| 2894 |
+
"epoch": 5.250863479367387,
|
| 2895 |
+
"grad_norm": 0.8353666067123413,
|
| 2896 |
+
"learning_rate": 4.821807937279074e-05,
|
| 2897 |
+
"loss": 0.5493,
|
| 2898 |
+
"mean_token_accuracy": 0.8291581255197525,
|
| 2899 |
+
"num_tokens": 159126021.0,
|
| 2900 |
+
"step": 7225
|
| 2901 |
+
},
|
| 2902 |
+
{
|
| 2903 |
+
"epoch": 5.2690419923650245,
|
| 2904 |
+
"grad_norm": 0.8323714137077332,
|
| 2905 |
+
"learning_rate": 4.8030649057567545e-05,
|
| 2906 |
+
"loss": 0.5574,
|
| 2907 |
+
"mean_token_accuracy": 0.8271696311235428,
|
| 2908 |
+
"num_tokens": 159687774.0,
|
| 2909 |
+
"step": 7250
|
| 2910 |
+
},
|
| 2911 |
+
{
|
| 2912 |
+
"epoch": 5.2690419923650245,
|
| 2913 |
+
"eval_loss": 0.5662592053413391,
|
| 2914 |
+
"eval_mean_token_accuracy": 0.8236163130967445,
|
| 2915 |
+
"eval_num_tokens": 159687774.0,
|
| 2916 |
+
"eval_runtime": 112.894,
|
| 2917 |
+
"eval_samples_per_second": 43.315,
|
| 2918 |
+
"eval_steps_per_second": 5.421,
|
| 2919 |
+
"step": 7250
|
| 2920 |
+
},
|
| 2921 |
+
{
|
| 2922 |
+
"epoch": 5.2872205053626615,
|
| 2923 |
+
"grad_norm": 0.7205966114997864,
|
| 2924 |
+
"learning_rate": 4.784303483182755e-05,
|
| 2925 |
+
"loss": 0.553,
|
| 2926 |
+
"mean_token_accuracy": 0.8278142037987709,
|
| 2927 |
+
"num_tokens": 160241228.0,
|
| 2928 |
+
"step": 7275
|
| 2929 |
+
},
|
| 2930 |
+
{
|
| 2931 |
+
"epoch": 5.305399018360298,
|
| 2932 |
+
"grad_norm": 0.8180447816848755,
|
| 2933 |
+
"learning_rate": 4.7655240992138677e-05,
|
| 2934 |
+
"loss": 0.5491,
|
| 2935 |
+
"mean_token_accuracy": 0.829489229619503,
|
| 2936 |
+
"num_tokens": 160767430.0,
|
| 2937 |
+
"step": 7300
|
| 2938 |
+
},
|
| 2939 |
+
{
|
| 2940 |
+
"epoch": 5.323577531357935,
|
| 2941 |
+
"grad_norm": 0.7637699842453003,
|
| 2942 |
+
"learning_rate": 4.746727183918221e-05,
|
| 2943 |
+
"loss": 0.5595,
|
| 2944 |
+
"mean_token_accuracy": 0.8261320424079895,
|
| 2945 |
+
"num_tokens": 161318820.0,
|
| 2946 |
+
"step": 7325
|
| 2947 |
+
},
|
| 2948 |
+
{
|
| 2949 |
+
"epoch": 5.341756044355572,
|
| 2950 |
+
"grad_norm": 0.7907775640487671,
|
| 2951 |
+
"learning_rate": 4.727913167765431e-05,
|
| 2952 |
+
"loss": 0.5525,
|
| 2953 |
+
"mean_token_accuracy": 0.8275396654009819,
|
| 2954 |
+
"num_tokens": 161877946.0,
|
| 2955 |
+
"step": 7350
|
| 2956 |
+
},
|
| 2957 |
+
{
|
| 2958 |
+
"epoch": 5.359934557353209,
|
| 2959 |
+
"grad_norm": 0.7488855719566345,
|
| 2960 |
+
"learning_rate": 4.7090824816167384e-05,
|
| 2961 |
+
"loss": 0.5516,
|
| 2962 |
+
"mean_token_accuracy": 0.8294497436285019,
|
| 2963 |
+
"num_tokens": 162425201.0,
|
| 2964 |
+
"step": 7375
|
| 2965 |
+
},
|
| 2966 |
+
{
|
| 2967 |
+
"epoch": 5.378113070350845,
|
| 2968 |
+
"grad_norm": 0.8791596293449402,
|
| 2969 |
+
"learning_rate": 4.6902355567151486e-05,
|
| 2970 |
+
"loss": 0.5533,
|
| 2971 |
+
"mean_token_accuracy": 0.8279849541187286,
|
| 2972 |
+
"num_tokens": 162980062.0,
|
| 2973 |
+
"step": 7400
|
| 2974 |
+
},
|
| 2975 |
+
{
|
| 2976 |
+
"epoch": 5.396291583348482,
|
| 2977 |
+
"grad_norm": 0.8288064002990723,
|
| 2978 |
+
"learning_rate": 4.671372824675549e-05,
|
| 2979 |
+
"loss": 0.5463,
|
| 2980 |
+
"mean_token_accuracy": 0.8298707720637322,
|
| 2981 |
+
"num_tokens": 163536063.0,
|
| 2982 |
+
"step": 7425
|
| 2983 |
+
},
|
| 2984 |
+
{
|
| 2985 |
+
"epoch": 5.414470096346119,
|
| 2986 |
+
"grad_norm": 0.8334540724754333,
|
| 2987 |
+
"learning_rate": 4.65249471747483e-05,
|
| 2988 |
+
"loss": 0.5488,
|
| 2989 |
+
"mean_token_accuracy": 0.8298037537932396,
|
| 2990 |
+
"num_tokens": 164096275.0,
|
| 2991 |
+
"step": 7450
|
| 2992 |
+
},
|
| 2993 |
+
{
|
| 2994 |
+
"epoch": 5.432648609343755,
|
| 2995 |
+
"grad_norm": 0.7712641358375549,
|
| 2996 |
+
"learning_rate": 4.6336016674419886e-05,
|
| 2997 |
+
"loss": 0.5423,
|
| 2998 |
+
"mean_token_accuracy": 0.8307902818918228,
|
| 2999 |
+
"num_tokens": 164633060.0,
|
| 3000 |
+
"step": 7475
|
| 3001 |
+
},
|
| 3002 |
+
{
|
| 3003 |
+
"epoch": 5.450827122341392,
|
| 3004 |
+
"grad_norm": 0.7974975109100342,
|
| 3005 |
+
"learning_rate": 4.614694107248228e-05,
|
| 3006 |
+
"loss": 0.5527,
|
| 3007 |
+
"mean_token_accuracy": 0.8289200633764267,
|
| 3008 |
+
"num_tokens": 165169760.0,
|
| 3009 |
+
"step": 7500
|
| 3010 |
+
},
|
| 3011 |
+
{
|
| 3012 |
+
"epoch": 5.450827122341392,
|
| 3013 |
+
"eval_loss": 0.5647426843643188,
|
| 3014 |
+
"eval_mean_token_accuracy": 0.82398138930595,
|
| 3015 |
+
"eval_num_tokens": 165169760.0,
|
| 3016 |
+
"eval_runtime": 113.2742,
|
| 3017 |
+
"eval_samples_per_second": 43.17,
|
| 3018 |
+
"eval_steps_per_second": 5.403,
|
| 3019 |
+
"step": 7500
|
| 3020 |
}
|
| 3021 |
],
|
| 3022 |
"logging_steps": 25,
|
|
|
|
| 3036 |
"attributes": {}
|
| 3037 |
}
|
| 3038 |
},
|
| 3039 |
+
"total_flos": 4.16634638434304e+17,
|
| 3040 |
"train_batch_size": 4,
|
| 3041 |
"trial_name": null,
|
| 3042 |
"trial_params": null
|