irodkin committed on
Commit
177cbcf
·
verified ·
1 Parent(s): 01307ea

Training checkpoint at step 8500

Browse files
Files changed (1) hide show
  1. trainer_state.json +186 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 8000,
3
- "best_metric": 2.568809986114502,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-8000",
5
- "epoch": 0.16,
6
  "eval_steps": 100,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2888,6 +2888,186 @@
2888
  "eval_samples_per_second": 2.309,
2889
  "eval_steps_per_second": 1.155,
2890
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2891
  }
2892
  ],
2893
  "logging_steps": 25,
@@ -2907,7 +3087,7 @@
2907
  "attributes": {}
2908
  }
2909
  },
2910
- "total_flos": 1.7953754756673438e+19,
2911
  "train_batch_size": 1,
2912
  "trial_name": null,
2913
  "trial_params": null
 
1
  {
2
+ "best_global_step": 8500,
3
+ "best_metric": 2.564678430557251,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-8500",
5
+ "epoch": 0.17,
6
  "eval_steps": 100,
7
+ "global_step": 8500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2888
  "eval_samples_per_second": 2.309,
2889
  "eval_steps_per_second": 1.155,
2890
  "step": 8000
2891
+ },
2892
+ {
2893
+ "epoch": 0.1605,
2894
+ "grad_norm": 2.6560367873629835,
2895
+ "learning_rate": 9.328000000000001e-06,
2896
+ "loss": 2.5588,
2897
+ "step": 8025
2898
+ },
2899
+ {
2900
+ "epoch": 0.161,
2901
+ "grad_norm": 2.2401297319157614,
2902
+ "learning_rate": 9.322444444444445e-06,
2903
+ "loss": 2.564,
2904
+ "step": 8050
2905
+ },
2906
+ {
2907
+ "epoch": 0.1615,
2908
+ "grad_norm": 2.2847898029930653,
2909
+ "learning_rate": 9.31688888888889e-06,
2910
+ "loss": 2.5643,
2911
+ "step": 8075
2912
+ },
2913
+ {
2914
+ "epoch": 0.162,
2915
+ "grad_norm": 2.798251121826375,
2916
+ "learning_rate": 9.311333333333335e-06,
2917
+ "loss": 2.5577,
2918
+ "step": 8100
2919
+ },
2920
+ {
2921
+ "epoch": 0.162,
2922
+ "eval_loss": 2.568058967590332,
2923
+ "eval_runtime": 42.5915,
2924
+ "eval_samples_per_second": 2.442,
2925
+ "eval_steps_per_second": 1.221,
2926
+ "step": 8100
2927
+ },
2928
+ {
2929
+ "epoch": 0.1625,
2930
+ "grad_norm": 2.0139748360698895,
2931
+ "learning_rate": 9.305777777777779e-06,
2932
+ "loss": 2.5716,
2933
+ "step": 8125
2934
+ },
2935
+ {
2936
+ "epoch": 0.163,
2937
+ "grad_norm": 2.052859658987244,
2938
+ "learning_rate": 9.300222222222222e-06,
2939
+ "loss": 2.5555,
2940
+ "step": 8150
2941
+ },
2942
+ {
2943
+ "epoch": 0.1635,
2944
+ "grad_norm": 2.6452792973388584,
2945
+ "learning_rate": 9.294666666666668e-06,
2946
+ "loss": 2.5545,
2947
+ "step": 8175
2948
+ },
2949
+ {
2950
+ "epoch": 0.164,
2951
+ "grad_norm": 2.8085427073848543,
2952
+ "learning_rate": 9.289111111111113e-06,
2953
+ "loss": 2.5575,
2954
+ "step": 8200
2955
+ },
2956
+ {
2957
+ "epoch": 0.164,
2958
+ "eval_loss": 2.56640625,
2959
+ "eval_runtime": 42.2476,
2960
+ "eval_samples_per_second": 2.462,
2961
+ "eval_steps_per_second": 1.231,
2962
+ "step": 8200
2963
+ },
2964
+ {
2965
+ "epoch": 0.1645,
2966
+ "grad_norm": 1.994417686652318,
2967
+ "learning_rate": 9.283555555555556e-06,
2968
+ "loss": 2.5634,
2969
+ "step": 8225
2970
+ },
2971
+ {
2972
+ "epoch": 0.165,
2973
+ "grad_norm": 2.8569259303287917,
2974
+ "learning_rate": 9.278e-06,
2975
+ "loss": 2.5711,
2976
+ "step": 8250
2977
+ },
2978
+ {
2979
+ "epoch": 0.1655,
2980
+ "grad_norm": 2.15031573602464,
2981
+ "learning_rate": 9.272444444444445e-06,
2982
+ "loss": 2.5515,
2983
+ "step": 8275
2984
+ },
2985
+ {
2986
+ "epoch": 0.166,
2987
+ "grad_norm": 2.1903087160864234,
2988
+ "learning_rate": 9.26688888888889e-06,
2989
+ "loss": 2.5588,
2990
+ "step": 8300
2991
+ },
2992
+ {
2993
+ "epoch": 0.166,
2994
+ "eval_loss": 2.565354585647583,
2995
+ "eval_runtime": 42.2533,
2996
+ "eval_samples_per_second": 2.461,
2997
+ "eval_steps_per_second": 1.231,
2998
+ "step": 8300
2999
+ },
3000
+ {
3001
+ "epoch": 0.1665,
3002
+ "grad_norm": 2.1661066402797697,
3003
+ "learning_rate": 9.261333333333334e-06,
3004
+ "loss": 2.5582,
3005
+ "step": 8325
3006
+ },
3007
+ {
3008
+ "epoch": 0.167,
3009
+ "grad_norm": 2.3738673472152603,
3010
+ "learning_rate": 9.25577777777778e-06,
3011
+ "loss": 2.5598,
3012
+ "step": 8350
3013
+ },
3014
+ {
3015
+ "epoch": 0.1675,
3016
+ "grad_norm": 1.893415788443222,
3017
+ "learning_rate": 9.250222222222223e-06,
3018
+ "loss": 2.5553,
3019
+ "step": 8375
3020
+ },
3021
+ {
3022
+ "epoch": 0.168,
3023
+ "grad_norm": 3.245074933027149,
3024
+ "learning_rate": 9.244666666666668e-06,
3025
+ "loss": 2.5632,
3026
+ "step": 8400
3027
+ },
3028
+ {
3029
+ "epoch": 0.168,
3030
+ "eval_loss": 2.565354585647583,
3031
+ "eval_runtime": 42.2015,
3032
+ "eval_samples_per_second": 2.464,
3033
+ "eval_steps_per_second": 1.232,
3034
+ "step": 8400
3035
+ },
3036
+ {
3037
+ "epoch": 0.1685,
3038
+ "grad_norm": 2.359910509969222,
3039
+ "learning_rate": 9.239111111111112e-06,
3040
+ "loss": 2.5564,
3041
+ "step": 8425
3042
+ },
3043
+ {
3044
+ "epoch": 0.169,
3045
+ "grad_norm": 2.1851033577602355,
3046
+ "learning_rate": 9.233555555555557e-06,
3047
+ "loss": 2.5532,
3048
+ "step": 8450
3049
+ },
3050
+ {
3051
+ "epoch": 0.1695,
3052
+ "grad_norm": 2.0954334474208443,
3053
+ "learning_rate": 9.228e-06,
3054
+ "loss": 2.5585,
3055
+ "step": 8475
3056
+ },
3057
+ {
3058
+ "epoch": 0.17,
3059
+ "grad_norm": 2.326393982849659,
3060
+ "learning_rate": 9.222444444444446e-06,
3061
+ "loss": 2.5639,
3062
+ "step": 8500
3063
+ },
3064
+ {
3065
+ "epoch": 0.17,
3066
+ "eval_loss": 2.564678430557251,
3067
+ "eval_runtime": 42.3289,
3068
+ "eval_samples_per_second": 2.457,
3069
+ "eval_steps_per_second": 1.228,
3070
+ "step": 8500
3071
  }
3072
  ],
3073
  "logging_steps": 25,
 
3087
  "attributes": {}
3088
  }
3089
  },
3090
+ "total_flos": 1.9075864440776688e+19,
3091
  "train_batch_size": 1,
3092
  "trial_name": null,
3093
  "trial_params": null