Training checkpoint at step 8500
Browse files- trainer_state.json +186 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2888,6 +2888,186 @@
|
|
| 2888 |
"eval_samples_per_second": 2.309,
|
| 2889 |
"eval_steps_per_second": 1.155,
|
| 2890 |
"step": 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2891 |
}
|
| 2892 |
],
|
| 2893 |
"logging_steps": 25,
|
|
@@ -2907,7 +3087,7 @@
|
|
| 2907 |
"attributes": {}
|
| 2908 |
}
|
| 2909 |
},
|
| 2910 |
-
"total_flos": 1.
|
| 2911 |
"train_batch_size": 1,
|
| 2912 |
"trial_name": null,
|
| 2913 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 8500,
|
| 3 |
+
"best_metric": 2.564678430557251,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-8500",
|
| 5 |
+
"epoch": 0.17,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 8500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2888 |
"eval_samples_per_second": 2.309,
|
| 2889 |
"eval_steps_per_second": 1.155,
|
| 2890 |
"step": 8000
|
| 2891 |
+
},
|
| 2892 |
+
{
|
| 2893 |
+
"epoch": 0.1605,
|
| 2894 |
+
"grad_norm": 2.6560367873629835,
|
| 2895 |
+
"learning_rate": 9.328000000000001e-06,
|
| 2896 |
+
"loss": 2.5588,
|
| 2897 |
+
"step": 8025
|
| 2898 |
+
},
|
| 2899 |
+
{
|
| 2900 |
+
"epoch": 0.161,
|
| 2901 |
+
"grad_norm": 2.2401297319157614,
|
| 2902 |
+
"learning_rate": 9.322444444444445e-06,
|
| 2903 |
+
"loss": 2.564,
|
| 2904 |
+
"step": 8050
|
| 2905 |
+
},
|
| 2906 |
+
{
|
| 2907 |
+
"epoch": 0.1615,
|
| 2908 |
+
"grad_norm": 2.2847898029930653,
|
| 2909 |
+
"learning_rate": 9.31688888888889e-06,
|
| 2910 |
+
"loss": 2.5643,
|
| 2911 |
+
"step": 8075
|
| 2912 |
+
},
|
| 2913 |
+
{
|
| 2914 |
+
"epoch": 0.162,
|
| 2915 |
+
"grad_norm": 2.798251121826375,
|
| 2916 |
+
"learning_rate": 9.311333333333335e-06,
|
| 2917 |
+
"loss": 2.5577,
|
| 2918 |
+
"step": 8100
|
| 2919 |
+
},
|
| 2920 |
+
{
|
| 2921 |
+
"epoch": 0.162,
|
| 2922 |
+
"eval_loss": 2.568058967590332,
|
| 2923 |
+
"eval_runtime": 42.5915,
|
| 2924 |
+
"eval_samples_per_second": 2.442,
|
| 2925 |
+
"eval_steps_per_second": 1.221,
|
| 2926 |
+
"step": 8100
|
| 2927 |
+
},
|
| 2928 |
+
{
|
| 2929 |
+
"epoch": 0.1625,
|
| 2930 |
+
"grad_norm": 2.0139748360698895,
|
| 2931 |
+
"learning_rate": 9.305777777777779e-06,
|
| 2932 |
+
"loss": 2.5716,
|
| 2933 |
+
"step": 8125
|
| 2934 |
+
},
|
| 2935 |
+
{
|
| 2936 |
+
"epoch": 0.163,
|
| 2937 |
+
"grad_norm": 2.052859658987244,
|
| 2938 |
+
"learning_rate": 9.300222222222222e-06,
|
| 2939 |
+
"loss": 2.5555,
|
| 2940 |
+
"step": 8150
|
| 2941 |
+
},
|
| 2942 |
+
{
|
| 2943 |
+
"epoch": 0.1635,
|
| 2944 |
+
"grad_norm": 2.6452792973388584,
|
| 2945 |
+
"learning_rate": 9.294666666666668e-06,
|
| 2946 |
+
"loss": 2.5545,
|
| 2947 |
+
"step": 8175
|
| 2948 |
+
},
|
| 2949 |
+
{
|
| 2950 |
+
"epoch": 0.164,
|
| 2951 |
+
"grad_norm": 2.8085427073848543,
|
| 2952 |
+
"learning_rate": 9.289111111111113e-06,
|
| 2953 |
+
"loss": 2.5575,
|
| 2954 |
+
"step": 8200
|
| 2955 |
+
},
|
| 2956 |
+
{
|
| 2957 |
+
"epoch": 0.164,
|
| 2958 |
+
"eval_loss": 2.56640625,
|
| 2959 |
+
"eval_runtime": 42.2476,
|
| 2960 |
+
"eval_samples_per_second": 2.462,
|
| 2961 |
+
"eval_steps_per_second": 1.231,
|
| 2962 |
+
"step": 8200
|
| 2963 |
+
},
|
| 2964 |
+
{
|
| 2965 |
+
"epoch": 0.1645,
|
| 2966 |
+
"grad_norm": 1.994417686652318,
|
| 2967 |
+
"learning_rate": 9.283555555555556e-06,
|
| 2968 |
+
"loss": 2.5634,
|
| 2969 |
+
"step": 8225
|
| 2970 |
+
},
|
| 2971 |
+
{
|
| 2972 |
+
"epoch": 0.165,
|
| 2973 |
+
"grad_norm": 2.8569259303287917,
|
| 2974 |
+
"learning_rate": 9.278e-06,
|
| 2975 |
+
"loss": 2.5711,
|
| 2976 |
+
"step": 8250
|
| 2977 |
+
},
|
| 2978 |
+
{
|
| 2979 |
+
"epoch": 0.1655,
|
| 2980 |
+
"grad_norm": 2.15031573602464,
|
| 2981 |
+
"learning_rate": 9.272444444444445e-06,
|
| 2982 |
+
"loss": 2.5515,
|
| 2983 |
+
"step": 8275
|
| 2984 |
+
},
|
| 2985 |
+
{
|
| 2986 |
+
"epoch": 0.166,
|
| 2987 |
+
"grad_norm": 2.1903087160864234,
|
| 2988 |
+
"learning_rate": 9.26688888888889e-06,
|
| 2989 |
+
"loss": 2.5588,
|
| 2990 |
+
"step": 8300
|
| 2991 |
+
},
|
| 2992 |
+
{
|
| 2993 |
+
"epoch": 0.166,
|
| 2994 |
+
"eval_loss": 2.565354585647583,
|
| 2995 |
+
"eval_runtime": 42.2533,
|
| 2996 |
+
"eval_samples_per_second": 2.461,
|
| 2997 |
+
"eval_steps_per_second": 1.231,
|
| 2998 |
+
"step": 8300
|
| 2999 |
+
},
|
| 3000 |
+
{
|
| 3001 |
+
"epoch": 0.1665,
|
| 3002 |
+
"grad_norm": 2.1661066402797697,
|
| 3003 |
+
"learning_rate": 9.261333333333334e-06,
|
| 3004 |
+
"loss": 2.5582,
|
| 3005 |
+
"step": 8325
|
| 3006 |
+
},
|
| 3007 |
+
{
|
| 3008 |
+
"epoch": 0.167,
|
| 3009 |
+
"grad_norm": 2.3738673472152603,
|
| 3010 |
+
"learning_rate": 9.25577777777778e-06,
|
| 3011 |
+
"loss": 2.5598,
|
| 3012 |
+
"step": 8350
|
| 3013 |
+
},
|
| 3014 |
+
{
|
| 3015 |
+
"epoch": 0.1675,
|
| 3016 |
+
"grad_norm": 1.893415788443222,
|
| 3017 |
+
"learning_rate": 9.250222222222223e-06,
|
| 3018 |
+
"loss": 2.5553,
|
| 3019 |
+
"step": 8375
|
| 3020 |
+
},
|
| 3021 |
+
{
|
| 3022 |
+
"epoch": 0.168,
|
| 3023 |
+
"grad_norm": 3.245074933027149,
|
| 3024 |
+
"learning_rate": 9.244666666666668e-06,
|
| 3025 |
+
"loss": 2.5632,
|
| 3026 |
+
"step": 8400
|
| 3027 |
+
},
|
| 3028 |
+
{
|
| 3029 |
+
"epoch": 0.168,
|
| 3030 |
+
"eval_loss": 2.565354585647583,
|
| 3031 |
+
"eval_runtime": 42.2015,
|
| 3032 |
+
"eval_samples_per_second": 2.464,
|
| 3033 |
+
"eval_steps_per_second": 1.232,
|
| 3034 |
+
"step": 8400
|
| 3035 |
+
},
|
| 3036 |
+
{
|
| 3037 |
+
"epoch": 0.1685,
|
| 3038 |
+
"grad_norm": 2.359910509969222,
|
| 3039 |
+
"learning_rate": 9.239111111111112e-06,
|
| 3040 |
+
"loss": 2.5564,
|
| 3041 |
+
"step": 8425
|
| 3042 |
+
},
|
| 3043 |
+
{
|
| 3044 |
+
"epoch": 0.169,
|
| 3045 |
+
"grad_norm": 2.1851033577602355,
|
| 3046 |
+
"learning_rate": 9.233555555555557e-06,
|
| 3047 |
+
"loss": 2.5532,
|
| 3048 |
+
"step": 8450
|
| 3049 |
+
},
|
| 3050 |
+
{
|
| 3051 |
+
"epoch": 0.1695,
|
| 3052 |
+
"grad_norm": 2.0954334474208443,
|
| 3053 |
+
"learning_rate": 9.228e-06,
|
| 3054 |
+
"loss": 2.5585,
|
| 3055 |
+
"step": 8475
|
| 3056 |
+
},
|
| 3057 |
+
{
|
| 3058 |
+
"epoch": 0.17,
|
| 3059 |
+
"grad_norm": 2.326393982849659,
|
| 3060 |
+
"learning_rate": 9.222444444444446e-06,
|
| 3061 |
+
"loss": 2.5639,
|
| 3062 |
+
"step": 8500
|
| 3063 |
+
},
|
| 3064 |
+
{
|
| 3065 |
+
"epoch": 0.17,
|
| 3066 |
+
"eval_loss": 2.564678430557251,
|
| 3067 |
+
"eval_runtime": 42.3289,
|
| 3068 |
+
"eval_samples_per_second": 2.457,
|
| 3069 |
+
"eval_steps_per_second": 1.228,
|
| 3070 |
+
"step": 8500
|
| 3071 |
}
|
| 3072 |
],
|
| 3073 |
"logging_steps": 25,
|
|
|
|
| 3087 |
"attributes": {}
|
| 3088 |
}
|
| 3089 |
},
|
| 3090 |
+
"total_flos": 1.9075864440776688e+19,
|
| 3091 |
"train_batch_size": 1,
|
| 3092 |
"trial_name": null,
|
| 3093 |
"trial_params": null
|