irodkin commited on
Commit
d933cc5
·
verified ·
1 Parent(s): c14afa1

Training checkpoint at step 9000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 8000,
3
- "best_metric": 2.4125914573669434,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-8000",
5
- "epoch": 0.16,
6
  "eval_steps": 100,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2888,6 +2888,366 @@
2888
  "eval_samples_per_second": 3.199,
2889
  "eval_steps_per_second": 1.6,
2890
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2891
  }
2892
  ],
2893
  "logging_steps": 25,
@@ -2907,7 +3267,7 @@
2907
  "attributes": {}
2908
  }
2909
  },
2910
- "total_flos": 2.546561838661763e+19,
2911
  "train_batch_size": 1,
2912
  "trial_name": null,
2913
  "trial_params": null
 
1
  {
2
+ "best_global_step": 9000,
3
+ "best_metric": 2.410008430480957,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-9000",
5
+ "epoch": 0.18,
6
  "eval_steps": 100,
7
+ "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2888
  "eval_samples_per_second": 3.199,
2889
  "eval_steps_per_second": 1.6,
2890
  "step": 8000
2891
+ },
2892
+ {
2893
+ "epoch": 0.1605,
2894
+ "grad_norm": 0.5861319558312379,
2895
+ "learning_rate": 9.328000000000001e-06,
2896
+ "loss": 2.3883,
2897
+ "step": 8025
2898
+ },
2899
+ {
2900
+ "epoch": 0.161,
2901
+ "grad_norm": 0.5836976562991806,
2902
+ "learning_rate": 9.322444444444445e-06,
2903
+ "loss": 2.3858,
2904
+ "step": 8050
2905
+ },
2906
+ {
2907
+ "epoch": 0.1615,
2908
+ "grad_norm": 0.5844356099514875,
2909
+ "learning_rate": 9.31688888888889e-06,
2910
+ "loss": 2.408,
2911
+ "step": 8075
2912
+ },
2913
+ {
2914
+ "epoch": 0.162,
2915
+ "grad_norm": 0.5898038882596441,
2916
+ "learning_rate": 9.311333333333335e-06,
2917
+ "loss": 2.3979,
2918
+ "step": 8100
2919
+ },
2920
+ {
2921
+ "epoch": 0.162,
2922
+ "eval_loss": 2.4123263359069824,
2923
+ "eval_runtime": 31.7798,
2924
+ "eval_samples_per_second": 3.21,
2925
+ "eval_steps_per_second": 1.605,
2926
+ "step": 8100
2927
+ },
2928
+ {
2929
+ "epoch": 0.1625,
2930
+ "grad_norm": 0.6072648398087778,
2931
+ "learning_rate": 9.305777777777779e-06,
2932
+ "loss": 2.3904,
2933
+ "step": 8125
2934
+ },
2935
+ {
2936
+ "epoch": 0.163,
2937
+ "grad_norm": 0.5947190221089934,
2938
+ "learning_rate": 9.300222222222222e-06,
2939
+ "loss": 2.3908,
2940
+ "step": 8150
2941
+ },
2942
+ {
2943
+ "epoch": 0.1635,
2944
+ "grad_norm": 0.5923294532719955,
2945
+ "learning_rate": 9.294666666666668e-06,
2946
+ "loss": 2.3994,
2947
+ "step": 8175
2948
+ },
2949
+ {
2950
+ "epoch": 0.164,
2951
+ "grad_norm": 0.6238957997579533,
2952
+ "learning_rate": 9.289111111111113e-06,
2953
+ "loss": 2.3935,
2954
+ "step": 8200
2955
+ },
2956
+ {
2957
+ "epoch": 0.164,
2958
+ "eval_loss": 2.4118340015411377,
2959
+ "eval_runtime": 31.8145,
2960
+ "eval_samples_per_second": 3.206,
2961
+ "eval_steps_per_second": 1.603,
2962
+ "step": 8200
2963
+ },
2964
+ {
2965
+ "epoch": 0.1645,
2966
+ "grad_norm": 0.576622489198895,
2967
+ "learning_rate": 9.283555555555556e-06,
2968
+ "loss": 2.396,
2969
+ "step": 8225
2970
+ },
2971
+ {
2972
+ "epoch": 0.165,
2973
+ "grad_norm": 0.6185118704471244,
2974
+ "learning_rate": 9.278e-06,
2975
+ "loss": 2.4035,
2976
+ "step": 8250
2977
+ },
2978
+ {
2979
+ "epoch": 0.1655,
2980
+ "grad_norm": 0.5796535805449304,
2981
+ "learning_rate": 9.272444444444445e-06,
2982
+ "loss": 2.3943,
2983
+ "step": 8275
2984
+ },
2985
+ {
2986
+ "epoch": 0.166,
2987
+ "grad_norm": 0.6173375014397958,
2988
+ "learning_rate": 9.26688888888889e-06,
2989
+ "loss": 2.3935,
2990
+ "step": 8300
2991
+ },
2992
+ {
2993
+ "epoch": 0.166,
2994
+ "eval_loss": 2.4114973545074463,
2995
+ "eval_runtime": 31.7754,
2996
+ "eval_samples_per_second": 3.21,
2997
+ "eval_steps_per_second": 1.605,
2998
+ "step": 8300
2999
+ },
3000
+ {
3001
+ "epoch": 0.1665,
3002
+ "grad_norm": 0.5618534321843206,
3003
+ "learning_rate": 9.261333333333334e-06,
3004
+ "loss": 2.3974,
3005
+ "step": 8325
3006
+ },
3007
+ {
3008
+ "epoch": 0.167,
3009
+ "grad_norm": 0.6009214777241336,
3010
+ "learning_rate": 9.25577777777778e-06,
3011
+ "loss": 2.4,
3012
+ "step": 8350
3013
+ },
3014
+ {
3015
+ "epoch": 0.1675,
3016
+ "grad_norm": 0.5772198441104387,
3017
+ "learning_rate": 9.250222222222223e-06,
3018
+ "loss": 2.3991,
3019
+ "step": 8375
3020
+ },
3021
+ {
3022
+ "epoch": 0.168,
3023
+ "grad_norm": 0.5740163940994337,
3024
+ "learning_rate": 9.244666666666668e-06,
3025
+ "loss": 2.3947,
3026
+ "step": 8400
3027
+ },
3028
+ {
3029
+ "epoch": 0.168,
3030
+ "eval_loss": 2.411425828933716,
3031
+ "eval_runtime": 31.5099,
3032
+ "eval_samples_per_second": 3.237,
3033
+ "eval_steps_per_second": 1.619,
3034
+ "step": 8400
3035
+ },
3036
+ {
3037
+ "epoch": 0.1685,
3038
+ "grad_norm": 0.5687873679002051,
3039
+ "learning_rate": 9.239111111111112e-06,
3040
+ "loss": 2.3966,
3041
+ "step": 8425
3042
+ },
3043
+ {
3044
+ "epoch": 0.169,
3045
+ "grad_norm": 0.5610136891748577,
3046
+ "learning_rate": 9.233555555555557e-06,
3047
+ "loss": 2.3998,
3048
+ "step": 8450
3049
+ },
3050
+ {
3051
+ "epoch": 0.1695,
3052
+ "grad_norm": 0.6032713755890403,
3053
+ "learning_rate": 9.228e-06,
3054
+ "loss": 2.3943,
3055
+ "step": 8475
3056
+ },
3057
+ {
3058
+ "epoch": 0.17,
3059
+ "grad_norm": 0.5964144518891603,
3060
+ "learning_rate": 9.222444444444446e-06,
3061
+ "loss": 2.3883,
3062
+ "step": 8500
3063
+ },
3064
+ {
3065
+ "epoch": 0.17,
3066
+ "eval_loss": 2.411017656326294,
3067
+ "eval_runtime": 31.5307,
3068
+ "eval_samples_per_second": 3.235,
3069
+ "eval_steps_per_second": 1.617,
3070
+ "step": 8500
3071
+ },
3072
+ {
3073
+ "epoch": 0.1705,
3074
+ "grad_norm": 0.6150332993234658,
3075
+ "learning_rate": 9.21688888888889e-06,
3076
+ "loss": 2.3947,
3077
+ "step": 8525
3078
+ },
3079
+ {
3080
+ "epoch": 0.171,
3081
+ "grad_norm": 0.5996705331900282,
3082
+ "learning_rate": 9.211333333333334e-06,
3083
+ "loss": 2.3767,
3084
+ "step": 8550
3085
+ },
3086
+ {
3087
+ "epoch": 0.1715,
3088
+ "grad_norm": 0.5824632831455251,
3089
+ "learning_rate": 9.20577777777778e-06,
3090
+ "loss": 2.3872,
3091
+ "step": 8575
3092
+ },
3093
+ {
3094
+ "epoch": 0.172,
3095
+ "grad_norm": 0.606207861483595,
3096
+ "learning_rate": 9.200222222222223e-06,
3097
+ "loss": 2.4039,
3098
+ "step": 8600
3099
+ },
3100
+ {
3101
+ "epoch": 0.172,
3102
+ "eval_loss": 2.4107751846313477,
3103
+ "eval_runtime": 31.4387,
3104
+ "eval_samples_per_second": 3.244,
3105
+ "eval_steps_per_second": 1.622,
3106
+ "step": 8600
3107
+ },
3108
+ {
3109
+ "epoch": 0.1725,
3110
+ "grad_norm": 0.576823131255562,
3111
+ "learning_rate": 9.194666666666667e-06,
3112
+ "loss": 2.3954,
3113
+ "step": 8625
3114
+ },
3115
+ {
3116
+ "epoch": 0.173,
3117
+ "grad_norm": 0.56597712239854,
3118
+ "learning_rate": 9.189111111111112e-06,
3119
+ "loss": 2.4072,
3120
+ "step": 8650
3121
+ },
3122
+ {
3123
+ "epoch": 0.1735,
3124
+ "grad_norm": 0.5825959007699376,
3125
+ "learning_rate": 9.183555555555557e-06,
3126
+ "loss": 2.4081,
3127
+ "step": 8675
3128
+ },
3129
+ {
3130
+ "epoch": 0.174,
3131
+ "grad_norm": 0.5776918671405765,
3132
+ "learning_rate": 9.178000000000001e-06,
3133
+ "loss": 2.4091,
3134
+ "step": 8700
3135
+ },
3136
+ {
3137
+ "epoch": 0.174,
3138
+ "eval_loss": 2.410761594772339,
3139
+ "eval_runtime": 31.7246,
3140
+ "eval_samples_per_second": 3.215,
3141
+ "eval_steps_per_second": 1.608,
3142
+ "step": 8700
3143
+ },
3144
+ {
3145
+ "epoch": 0.1745,
3146
+ "grad_norm": 0.6256369047041809,
3147
+ "learning_rate": 9.172444444444444e-06,
3148
+ "loss": 2.3953,
3149
+ "step": 8725
3150
+ },
3151
+ {
3152
+ "epoch": 0.175,
3153
+ "grad_norm": 0.5964709475887552,
3154
+ "learning_rate": 9.16688888888889e-06,
3155
+ "loss": 2.39,
3156
+ "step": 8750
3157
+ },
3158
+ {
3159
+ "epoch": 0.1755,
3160
+ "grad_norm": 0.5775755843795828,
3161
+ "learning_rate": 9.161333333333335e-06,
3162
+ "loss": 2.391,
3163
+ "step": 8775
3164
+ },
3165
+ {
3166
+ "epoch": 0.176,
3167
+ "grad_norm": 0.6655706627980364,
3168
+ "learning_rate": 9.155777777777779e-06,
3169
+ "loss": 2.4048,
3170
+ "step": 8800
3171
+ },
3172
+ {
3173
+ "epoch": 0.176,
3174
+ "eval_loss": 2.4105958938598633,
3175
+ "eval_runtime": 31.4248,
3176
+ "eval_samples_per_second": 3.246,
3177
+ "eval_steps_per_second": 1.623,
3178
+ "step": 8800
3179
+ },
3180
+ {
3181
+ "epoch": 0.1765,
3182
+ "grad_norm": 0.5865172878151053,
3183
+ "learning_rate": 9.150222222222222e-06,
3184
+ "loss": 2.3878,
3185
+ "step": 8825
3186
+ },
3187
+ {
3188
+ "epoch": 0.177,
3189
+ "grad_norm": 0.584391124965856,
3190
+ "learning_rate": 9.144666666666667e-06,
3191
+ "loss": 2.401,
3192
+ "step": 8850
3193
+ },
3194
+ {
3195
+ "epoch": 0.1775,
3196
+ "grad_norm": 0.5726598382185046,
3197
+ "learning_rate": 9.139111111111113e-06,
3198
+ "loss": 2.4018,
3199
+ "step": 8875
3200
+ },
3201
+ {
3202
+ "epoch": 0.178,
3203
+ "grad_norm": 0.5690725395770588,
3204
+ "learning_rate": 9.133555555555556e-06,
3205
+ "loss": 2.4034,
3206
+ "step": 8900
3207
+ },
3208
+ {
3209
+ "epoch": 0.178,
3210
+ "eval_loss": 2.4101033210754395,
3211
+ "eval_runtime": 31.4686,
3212
+ "eval_samples_per_second": 3.241,
3213
+ "eval_steps_per_second": 1.621,
3214
+ "step": 8900
3215
+ },
3216
+ {
3217
+ "epoch": 0.1785,
3218
+ "grad_norm": 0.5978143013011991,
3219
+ "learning_rate": 9.128e-06,
3220
+ "loss": 2.4014,
3221
+ "step": 8925
3222
+ },
3223
+ {
3224
+ "epoch": 0.179,
3225
+ "grad_norm": 0.6085180927490662,
3226
+ "learning_rate": 9.122444444444445e-06,
3227
+ "loss": 2.3924,
3228
+ "step": 8950
3229
+ },
3230
+ {
3231
+ "epoch": 0.1795,
3232
+ "grad_norm": 0.5720265034599029,
3233
+ "learning_rate": 9.11688888888889e-06,
3234
+ "loss": 2.3977,
3235
+ "step": 8975
3236
+ },
3237
+ {
3238
+ "epoch": 0.18,
3239
+ "grad_norm": 0.5739306861609581,
3240
+ "learning_rate": 9.111333333333334e-06,
3241
+ "loss": 2.3992,
3242
+ "step": 9000
3243
+ },
3244
+ {
3245
+ "epoch": 0.18,
3246
+ "eval_loss": 2.410008430480957,
3247
+ "eval_runtime": 32.192,
3248
+ "eval_samples_per_second": 3.168,
3249
+ "eval_steps_per_second": 1.584,
3250
+ "step": 9000
3251
  }
3252
  ],
3253
  "logging_steps": 25,
 
3267
  "attributes": {}
3268
  }
3269
  },
3270
+ "total_flos": 2.8648820684944835e+19,
3271
  "train_batch_size": 1,
3272
  "trial_name": null,
3273
  "trial_params": null