CocoRoF commited on
Commit
1e2c9c5
·
verified ·
1 Parent(s): 0661a07

Training in progress, step 2282, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:524178631dea28d6a1be6624311bcec723e346dc14c66b3a74890d48f74535c0
3
  size 791781368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5c239fe048ae787ac9ba2071d45d41a1e490518eddc34d3cfaf3afacba9c7a8
3
  size 791781368
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7a1e9dee126e6e10988635b663718974d07307a09a0ef8f8e1a040065575d1d
3
  size 2375487866
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bec5a99e167b104da1b38d30711ce7795a0e35b3f0e9682e07d97909de250cc
3
  size 2375487866
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd44b9ad3ef5591f2a0671f1ec04c21ad479cbd1d478859e3ba017f1c74bf027
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d305aeb5f66edef533fe495bedd6ae8dca8a6220560f62265310b5e9dbe7ba24
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8762202050902917,
5
  "eval_steps": 1000,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2823,6 +2823,398 @@
2823
  "eval_samples_per_second": 988.943,
2824
  "eval_steps_per_second": 30.906,
2825
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2826
  }
2827
  ],
2828
  "logging_steps": 5,
@@ -2837,12 +3229,12 @@
2837
  "should_evaluate": false,
2838
  "should_log": false,
2839
  "should_save": true,
2840
- "should_training_stop": false
2841
  },
2842
  "attributes": {}
2843
  }
2844
  },
2845
- "total_flos": 1.732943025316823e+19,
2846
  "train_batch_size": 8,
2847
  "trial_name": null,
2848
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9997672540080229,
5
  "eval_steps": 1000,
6
+ "global_step": 2282,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2823
  "eval_samples_per_second": 988.943,
2824
  "eval_steps_per_second": 30.906,
2825
  "step": 2000
2826
+ },
2827
+ {
2828
+ "epoch": 0.8784107556030175,
2829
+ "grad_norm": 29.203125,
2830
+ "learning_rate": 2.698490014612762e-06,
2831
+ "loss": 24.1809,
2832
+ "step": 2005
2833
+ },
2834
+ {
2835
+ "epoch": 0.8806013061157432,
2836
+ "grad_norm": 28.46875,
2837
+ "learning_rate": 2.6497808085728203e-06,
2838
+ "loss": 23.4948,
2839
+ "step": 2010
2840
+ },
2841
+ {
2842
+ "epoch": 0.8827918566284689,
2843
+ "grad_norm": 16.171875,
2844
+ "learning_rate": 2.601071602532879e-06,
2845
+ "loss": 23.6219,
2846
+ "step": 2015
2847
+ },
2848
+ {
2849
+ "epoch": 0.8849824071411947,
2850
+ "grad_norm": 31.9375,
2851
+ "learning_rate": 2.5523623964929375e-06,
2852
+ "loss": 22.9886,
2853
+ "step": 2020
2854
+ },
2855
+ {
2856
+ "epoch": 0.8871729576539203,
2857
+ "grad_norm": 13.4453125,
2858
+ "learning_rate": 2.503653190452996e-06,
2859
+ "loss": 23.6685,
2860
+ "step": 2025
2861
+ },
2862
+ {
2863
+ "epoch": 0.8893635081666461,
2864
+ "grad_norm": 27.390625,
2865
+ "learning_rate": 2.4549439844130542e-06,
2866
+ "loss": 23.4918,
2867
+ "step": 2030
2868
+ },
2869
+ {
2870
+ "epoch": 0.8915540586793719,
2871
+ "grad_norm": 53.375,
2872
+ "learning_rate": 2.4062347783731126e-06,
2873
+ "loss": 23.38,
2874
+ "step": 2035
2875
+ },
2876
+ {
2877
+ "epoch": 0.8937446091920976,
2878
+ "grad_norm": 18.734375,
2879
+ "learning_rate": 2.357525572333171e-06,
2880
+ "loss": 23.5452,
2881
+ "step": 2040
2882
+ },
2883
+ {
2884
+ "epoch": 0.8959351597048233,
2885
+ "grad_norm": 25.671875,
2886
+ "learning_rate": 2.3088163662932294e-06,
2887
+ "loss": 23.5461,
2888
+ "step": 2045
2889
+ },
2890
+ {
2891
+ "epoch": 0.8981257102175491,
2892
+ "grad_norm": 28.953125,
2893
+ "learning_rate": 2.260107160253288e-06,
2894
+ "loss": 23.2701,
2895
+ "step": 2050
2896
+ },
2897
+ {
2898
+ "epoch": 0.9003162607302748,
2899
+ "grad_norm": 16.4375,
2900
+ "learning_rate": 2.2113979542133465e-06,
2901
+ "loss": 23.5859,
2902
+ "step": 2055
2903
+ },
2904
+ {
2905
+ "epoch": 0.9025068112430005,
2906
+ "grad_norm": 20.59375,
2907
+ "learning_rate": 2.162688748173405e-06,
2908
+ "loss": 22.9554,
2909
+ "step": 2060
2910
+ },
2911
+ {
2912
+ "epoch": 0.9046973617557262,
2913
+ "grad_norm": 31.21875,
2914
+ "learning_rate": 2.1139795421334633e-06,
2915
+ "loss": 23.3924,
2916
+ "step": 2065
2917
+ },
2918
+ {
2919
+ "epoch": 0.906887912268452,
2920
+ "grad_norm": 29.359375,
2921
+ "learning_rate": 2.065270336093522e-06,
2922
+ "loss": 23.0099,
2923
+ "step": 2070
2924
+ },
2925
+ {
2926
+ "epoch": 0.9090784627811777,
2927
+ "grad_norm": 22.734375,
2928
+ "learning_rate": 2.0165611300535805e-06,
2929
+ "loss": 23.4626,
2930
+ "step": 2075
2931
+ },
2932
+ {
2933
+ "epoch": 0.9112690132939034,
2934
+ "grad_norm": 59.6875,
2935
+ "learning_rate": 1.967851924013639e-06,
2936
+ "loss": 22.6226,
2937
+ "step": 2080
2938
+ },
2939
+ {
2940
+ "epoch": 0.9134595638066292,
2941
+ "grad_norm": 37.71875,
2942
+ "learning_rate": 1.9191427179736972e-06,
2943
+ "loss": 22.4083,
2944
+ "step": 2085
2945
+ },
2946
+ {
2947
+ "epoch": 0.9156501143193548,
2948
+ "grad_norm": 23.28125,
2949
+ "learning_rate": 1.8704335119337556e-06,
2950
+ "loss": 23.4292,
2951
+ "step": 2090
2952
+ },
2953
+ {
2954
+ "epoch": 0.9178406648320806,
2955
+ "grad_norm": 31.0625,
2956
+ "learning_rate": 1.8217243058938142e-06,
2957
+ "loss": 23.3258,
2958
+ "step": 2095
2959
+ },
2960
+ {
2961
+ "epoch": 0.9200312153448064,
2962
+ "grad_norm": 16.359375,
2963
+ "learning_rate": 1.7730150998538726e-06,
2964
+ "loss": 23.5227,
2965
+ "step": 2100
2966
+ },
2967
+ {
2968
+ "epoch": 0.922221765857532,
2969
+ "grad_norm": 63.59375,
2970
+ "learning_rate": 1.724305893813931e-06,
2971
+ "loss": 23.928,
2972
+ "step": 2105
2973
+ },
2974
+ {
2975
+ "epoch": 0.9244123163702578,
2976
+ "grad_norm": 30.21875,
2977
+ "learning_rate": 1.6755966877739893e-06,
2978
+ "loss": 22.57,
2979
+ "step": 2110
2980
+ },
2981
+ {
2982
+ "epoch": 0.9266028668829835,
2983
+ "grad_norm": 28.078125,
2984
+ "learning_rate": 1.626887481734048e-06,
2985
+ "loss": 23.4965,
2986
+ "step": 2115
2987
+ },
2988
+ {
2989
+ "epoch": 0.9287934173957093,
2990
+ "grad_norm": 21.421875,
2991
+ "learning_rate": 1.5781782756941063e-06,
2992
+ "loss": 23.3915,
2993
+ "step": 2120
2994
+ },
2995
+ {
2996
+ "epoch": 0.930983967908435,
2997
+ "grad_norm": 14.703125,
2998
+ "learning_rate": 1.5294690696541647e-06,
2999
+ "loss": 23.4732,
3000
+ "step": 2125
3001
+ },
3002
+ {
3003
+ "epoch": 0.9331745184211607,
3004
+ "grad_norm": 21.5625,
3005
+ "learning_rate": 1.480759863614223e-06,
3006
+ "loss": 23.06,
3007
+ "step": 2130
3008
+ },
3009
+ {
3010
+ "epoch": 0.9353650689338865,
3011
+ "grad_norm": 26.3125,
3012
+ "learning_rate": 1.4320506575742814e-06,
3013
+ "loss": 22.6973,
3014
+ "step": 2135
3015
+ },
3016
+ {
3017
+ "epoch": 0.9375556194466121,
3018
+ "grad_norm": 19.078125,
3019
+ "learning_rate": 1.3833414515343402e-06,
3020
+ "loss": 22.8054,
3021
+ "step": 2140
3022
+ },
3023
+ {
3024
+ "epoch": 0.9397461699593379,
3025
+ "grad_norm": 30.84375,
3026
+ "learning_rate": 1.3346322454943986e-06,
3027
+ "loss": 23.4105,
3028
+ "step": 2145
3029
+ },
3030
+ {
3031
+ "epoch": 0.9419367204720637,
3032
+ "grad_norm": 62.21875,
3033
+ "learning_rate": 1.285923039454457e-06,
3034
+ "loss": 22.8058,
3035
+ "step": 2150
3036
+ },
3037
+ {
3038
+ "epoch": 0.9441272709847893,
3039
+ "grad_norm": 15.203125,
3040
+ "learning_rate": 1.2372138334145156e-06,
3041
+ "loss": 23.0896,
3042
+ "step": 2155
3043
+ },
3044
+ {
3045
+ "epoch": 0.9463178214975151,
3046
+ "grad_norm": 27.8125,
3047
+ "learning_rate": 1.188504627374574e-06,
3048
+ "loss": 23.386,
3049
+ "step": 2160
3050
+ },
3051
+ {
3052
+ "epoch": 0.9485083720102409,
3053
+ "grad_norm": 17.546875,
3054
+ "learning_rate": 1.1397954213346323e-06,
3055
+ "loss": 23.0428,
3056
+ "step": 2165
3057
+ },
3058
+ {
3059
+ "epoch": 0.9506989225229665,
3060
+ "grad_norm": 20.015625,
3061
+ "learning_rate": 1.0910862152946907e-06,
3062
+ "loss": 23.0953,
3063
+ "step": 2170
3064
+ },
3065
+ {
3066
+ "epoch": 0.9528894730356923,
3067
+ "grad_norm": 24.59375,
3068
+ "learning_rate": 1.0423770092547493e-06,
3069
+ "loss": 22.9106,
3070
+ "step": 2175
3071
+ },
3072
+ {
3073
+ "epoch": 0.955080023548418,
3074
+ "grad_norm": 27.9375,
3075
+ "learning_rate": 9.936678032148077e-07,
3076
+ "loss": 22.686,
3077
+ "step": 2180
3078
+ },
3079
+ {
3080
+ "epoch": 0.9572705740611437,
3081
+ "grad_norm": 22.078125,
3082
+ "learning_rate": 9.44958597174866e-07,
3083
+ "loss": 23.1452,
3084
+ "step": 2185
3085
+ },
3086
+ {
3087
+ "epoch": 0.9594611245738695,
3088
+ "grad_norm": 20.65625,
3089
+ "learning_rate": 8.962493911349246e-07,
3090
+ "loss": 22.4537,
3091
+ "step": 2190
3092
+ },
3093
+ {
3094
+ "epoch": 0.9616516750865952,
3095
+ "grad_norm": 114.75,
3096
+ "learning_rate": 8.47540185094983e-07,
3097
+ "loss": 23.5824,
3098
+ "step": 2195
3099
+ },
3100
+ {
3101
+ "epoch": 0.963842225599321,
3102
+ "grad_norm": 21.65625,
3103
+ "learning_rate": 7.988309790550415e-07,
3104
+ "loss": 23.2908,
3105
+ "step": 2200
3106
+ },
3107
+ {
3108
+ "epoch": 0.9660327761120466,
3109
+ "grad_norm": 14.1875,
3110
+ "learning_rate": 7.501217730150999e-07,
3111
+ "loss": 22.8018,
3112
+ "step": 2205
3113
+ },
3114
+ {
3115
+ "epoch": 0.9682233266247724,
3116
+ "grad_norm": 15.3125,
3117
+ "learning_rate": 7.014125669751585e-07,
3118
+ "loss": 22.5503,
3119
+ "step": 2210
3120
+ },
3121
+ {
3122
+ "epoch": 0.9704138771374982,
3123
+ "grad_norm": 33.15625,
3124
+ "learning_rate": 6.527033609352168e-07,
3125
+ "loss": 22.7779,
3126
+ "step": 2215
3127
+ },
3128
+ {
3129
+ "epoch": 0.9726044276502238,
3130
+ "grad_norm": 14.2890625,
3131
+ "learning_rate": 6.039941548952752e-07,
3132
+ "loss": 22.3891,
3133
+ "step": 2220
3134
+ },
3135
+ {
3136
+ "epoch": 0.9747949781629496,
3137
+ "grad_norm": 61.40625,
3138
+ "learning_rate": 5.552849488553337e-07,
3139
+ "loss": 22.7335,
3140
+ "step": 2225
3141
+ },
3142
+ {
3143
+ "epoch": 0.9769855286756753,
3144
+ "grad_norm": 48.46875,
3145
+ "learning_rate": 5.065757428153922e-07,
3146
+ "loss": 23.5739,
3147
+ "step": 2230
3148
+ },
3149
+ {
3150
+ "epoch": 0.979176079188401,
3151
+ "grad_norm": 15.5703125,
3152
+ "learning_rate": 4.578665367754506e-07,
3153
+ "loss": 22.5284,
3154
+ "step": 2235
3155
+ },
3156
+ {
3157
+ "epoch": 0.9813666297011268,
3158
+ "grad_norm": 21.296875,
3159
+ "learning_rate": 4.091573307355091e-07,
3160
+ "loss": 23.336,
3161
+ "step": 2240
3162
+ },
3163
+ {
3164
+ "epoch": 0.9835571802138525,
3165
+ "grad_norm": 22.671875,
3166
+ "learning_rate": 3.6044812469556747e-07,
3167
+ "loss": 22.2249,
3168
+ "step": 2245
3169
+ },
3170
+ {
3171
+ "epoch": 0.9857477307265782,
3172
+ "grad_norm": 31.71875,
3173
+ "learning_rate": 3.1173891865562595e-07,
3174
+ "loss": 22.8697,
3175
+ "step": 2250
3176
+ },
3177
+ {
3178
+ "epoch": 0.9879382812393039,
3179
+ "grad_norm": 17.65625,
3180
+ "learning_rate": 2.630297126156844e-07,
3181
+ "loss": 22.5376,
3182
+ "step": 2255
3183
+ },
3184
+ {
3185
+ "epoch": 0.9901288317520297,
3186
+ "grad_norm": 25.34375,
3187
+ "learning_rate": 2.1432050657574284e-07,
3188
+ "loss": 22.7322,
3189
+ "step": 2260
3190
+ },
3191
+ {
3192
+ "epoch": 0.9923193822647554,
3193
+ "grad_norm": 57.375,
3194
+ "learning_rate": 1.6561130053580127e-07,
3195
+ "loss": 22.5554,
3196
+ "step": 2265
3197
+ },
3198
+ {
3199
+ "epoch": 0.9945099327774811,
3200
+ "grad_norm": 26.25,
3201
+ "learning_rate": 1.1690209449585972e-07,
3202
+ "loss": 22.8525,
3203
+ "step": 2270
3204
+ },
3205
+ {
3206
+ "epoch": 0.9967004832902069,
3207
+ "grad_norm": 32.375,
3208
+ "learning_rate": 6.819288845591817e-08,
3209
+ "loss": 22.8014,
3210
+ "step": 2275
3211
+ },
3212
+ {
3213
+ "epoch": 0.9988910338029326,
3214
+ "grad_norm": 21.90625,
3215
+ "learning_rate": 1.9483682415976622e-08,
3216
+ "loss": 22.4073,
3217
+ "step": 2280
3218
  }
3219
  ],
3220
  "logging_steps": 5,
 
3229
  "should_evaluate": false,
3230
  "should_log": false,
3231
  "should_save": true,
3232
+ "should_training_stop": true
3233
  },
3234
  "attributes": {}
3235
  }
3236
  },
3237
+ "total_flos": 1.9772879918220706e+19,
3238
  "train_batch_size": 8,
3239
  "trial_name": null,
3240
  "trial_params": null