CocoRoF commited on
Commit
7c96fec
·
verified ·
1 Parent(s): 33743cf

Training in progress, step 2282, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:320c69b34dca6d0448bda999d261b9b7dc2acc0901006dcd199ce13e45d754f1
3
  size 791781368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc189396d250cd02b58eccc2bb364409eaee0c8425639afa61aa38ebc2574ad2
3
  size 791781368
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2219206f12e11aa97a1823b9d064ff66778aab5d93dea10195714a4421d40f4d
3
  size 2375487866
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a0e5ae6c5f327214099f137a7cc8083d4122fae307d1b2016880d7e75c10cd
3
  size 2375487866
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd44b9ad3ef5591f2a0671f1ec04c21ad479cbd1d478859e3ba017f1c74bf027
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d305aeb5f66edef533fe495bedd6ae8dca8a6220560f62265310b5e9dbe7ba24
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8762202050902917,
5
  "eval_steps": 1000,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2823,6 +2823,398 @@
2823
  "eval_samples_per_second": 1013.774,
2824
  "eval_steps_per_second": 31.682,
2825
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2826
  }
2827
  ],
2828
  "logging_steps": 5,
@@ -2837,12 +3229,12 @@
2837
  "should_evaluate": false,
2838
  "should_log": false,
2839
  "should_save": true,
2840
- "should_training_stop": false
2841
  },
2842
  "attributes": {}
2843
  }
2844
  },
2845
- "total_flos": 1.732943025316823e+19,
2846
  "train_batch_size": 8,
2847
  "trial_name": null,
2848
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9997672540080229,
5
  "eval_steps": 1000,
6
+ "global_step": 2282,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2823
  "eval_samples_per_second": 1013.774,
2824
  "eval_steps_per_second": 31.682,
2825
  "step": 2000
2826
+ },
2827
+ {
2828
+ "epoch": 0.8784107556030175,
2829
+ "grad_norm": 20.109375,
2830
+ "learning_rate": 2.698490014612762e-06,
2831
+ "loss": 25.8034,
2832
+ "step": 2005
2833
+ },
2834
+ {
2835
+ "epoch": 0.8806013061157432,
2836
+ "grad_norm": 10.953125,
2837
+ "learning_rate": 2.6497808085728203e-06,
2838
+ "loss": 25.0685,
2839
+ "step": 2010
2840
+ },
2841
+ {
2842
+ "epoch": 0.8827918566284689,
2843
+ "grad_norm": 31.0625,
2844
+ "learning_rate": 2.601071602532879e-06,
2845
+ "loss": 25.7096,
2846
+ "step": 2015
2847
+ },
2848
+ {
2849
+ "epoch": 0.8849824071411947,
2850
+ "grad_norm": 24.046875,
2851
+ "learning_rate": 2.5523623964929375e-06,
2852
+ "loss": 25.3376,
2853
+ "step": 2020
2854
+ },
2855
+ {
2856
+ "epoch": 0.8871729576539203,
2857
+ "grad_norm": 27.65625,
2858
+ "learning_rate": 2.503653190452996e-06,
2859
+ "loss": 24.6723,
2860
+ "step": 2025
2861
+ },
2862
+ {
2863
+ "epoch": 0.8893635081666461,
2864
+ "grad_norm": 12.5078125,
2865
+ "learning_rate": 2.4549439844130542e-06,
2866
+ "loss": 25.1757,
2867
+ "step": 2030
2868
+ },
2869
+ {
2870
+ "epoch": 0.8915540586793719,
2871
+ "grad_norm": 15.1015625,
2872
+ "learning_rate": 2.4062347783731126e-06,
2873
+ "loss": 25.6934,
2874
+ "step": 2035
2875
+ },
2876
+ {
2877
+ "epoch": 0.8937446091920976,
2878
+ "grad_norm": 28.75,
2879
+ "learning_rate": 2.357525572333171e-06,
2880
+ "loss": 25.4336,
2881
+ "step": 2040
2882
+ },
2883
+ {
2884
+ "epoch": 0.8959351597048233,
2885
+ "grad_norm": 18.53125,
2886
+ "learning_rate": 2.3088163662932294e-06,
2887
+ "loss": 24.4317,
2888
+ "step": 2045
2889
+ },
2890
+ {
2891
+ "epoch": 0.8981257102175491,
2892
+ "grad_norm": 15.265625,
2893
+ "learning_rate": 2.260107160253288e-06,
2894
+ "loss": 24.6006,
2895
+ "step": 2050
2896
+ },
2897
+ {
2898
+ "epoch": 0.9003162607302748,
2899
+ "grad_norm": 12.1796875,
2900
+ "learning_rate": 2.2113979542133465e-06,
2901
+ "loss": 24.7936,
2902
+ "step": 2055
2903
+ },
2904
+ {
2905
+ "epoch": 0.9025068112430005,
2906
+ "grad_norm": 12.21875,
2907
+ "learning_rate": 2.162688748173405e-06,
2908
+ "loss": 26.129,
2909
+ "step": 2060
2910
+ },
2911
+ {
2912
+ "epoch": 0.9046973617557262,
2913
+ "grad_norm": 12.984375,
2914
+ "learning_rate": 2.1139795421334633e-06,
2915
+ "loss": 24.6225,
2916
+ "step": 2065
2917
+ },
2918
+ {
2919
+ "epoch": 0.906887912268452,
2920
+ "grad_norm": 15.65625,
2921
+ "learning_rate": 2.065270336093522e-06,
2922
+ "loss": 24.7391,
2923
+ "step": 2070
2924
+ },
2925
+ {
2926
+ "epoch": 0.9090784627811777,
2927
+ "grad_norm": 13.3046875,
2928
+ "learning_rate": 2.0165611300535805e-06,
2929
+ "loss": 24.7855,
2930
+ "step": 2075
2931
+ },
2932
+ {
2933
+ "epoch": 0.9112690132939034,
2934
+ "grad_norm": 15.015625,
2935
+ "learning_rate": 1.967851924013639e-06,
2936
+ "loss": 24.7362,
2937
+ "step": 2080
2938
+ },
2939
+ {
2940
+ "epoch": 0.9134595638066292,
2941
+ "grad_norm": 18.53125,
2942
+ "learning_rate": 1.9191427179736972e-06,
2943
+ "loss": 24.8633,
2944
+ "step": 2085
2945
+ },
2946
+ {
2947
+ "epoch": 0.9156501143193548,
2948
+ "grad_norm": 19.0625,
2949
+ "learning_rate": 1.8704335119337556e-06,
2950
+ "loss": 24.7622,
2951
+ "step": 2090
2952
+ },
2953
+ {
2954
+ "epoch": 0.9178406648320806,
2955
+ "grad_norm": 22.96875,
2956
+ "learning_rate": 1.8217243058938142e-06,
2957
+ "loss": 24.8469,
2958
+ "step": 2095
2959
+ },
2960
+ {
2961
+ "epoch": 0.9200312153448064,
2962
+ "grad_norm": 16.546875,
2963
+ "learning_rate": 1.7730150998538726e-06,
2964
+ "loss": 24.4148,
2965
+ "step": 2100
2966
+ },
2967
+ {
2968
+ "epoch": 0.922221765857532,
2969
+ "grad_norm": 17.15625,
2970
+ "learning_rate": 1.724305893813931e-06,
2971
+ "loss": 24.5911,
2972
+ "step": 2105
2973
+ },
2974
+ {
2975
+ "epoch": 0.9244123163702578,
2976
+ "grad_norm": 32.375,
2977
+ "learning_rate": 1.6755966877739893e-06,
2978
+ "loss": 24.3472,
2979
+ "step": 2110
2980
+ },
2981
+ {
2982
+ "epoch": 0.9266028668829835,
2983
+ "grad_norm": 11.1796875,
2984
+ "learning_rate": 1.626887481734048e-06,
2985
+ "loss": 24.272,
2986
+ "step": 2115
2987
+ },
2988
+ {
2989
+ "epoch": 0.9287934173957093,
2990
+ "grad_norm": 15.828125,
2991
+ "learning_rate": 1.5781782756941063e-06,
2992
+ "loss": 24.1789,
2993
+ "step": 2120
2994
+ },
2995
+ {
2996
+ "epoch": 0.930983967908435,
2997
+ "grad_norm": 13.609375,
2998
+ "learning_rate": 1.5294690696541647e-06,
2999
+ "loss": 24.5268,
3000
+ "step": 2125
3001
+ },
3002
+ {
3003
+ "epoch": 0.9331745184211607,
3004
+ "grad_norm": 11.7890625,
3005
+ "learning_rate": 1.480759863614223e-06,
3006
+ "loss": 24.4547,
3007
+ "step": 2130
3008
+ },
3009
+ {
3010
+ "epoch": 0.9353650689338865,
3011
+ "grad_norm": 14.46875,
3012
+ "learning_rate": 1.4320506575742814e-06,
3013
+ "loss": 24.5342,
3014
+ "step": 2135
3015
+ },
3016
+ {
3017
+ "epoch": 0.9375556194466121,
3018
+ "grad_norm": 12.9765625,
3019
+ "learning_rate": 1.3833414515343402e-06,
3020
+ "loss": 24.1581,
3021
+ "step": 2140
3022
+ },
3023
+ {
3024
+ "epoch": 0.9397461699593379,
3025
+ "grad_norm": 13.4140625,
3026
+ "learning_rate": 1.3346322454943986e-06,
3027
+ "loss": 24.0952,
3028
+ "step": 2145
3029
+ },
3030
+ {
3031
+ "epoch": 0.9419367204720637,
3032
+ "grad_norm": 11.921875,
3033
+ "learning_rate": 1.285923039454457e-06,
3034
+ "loss": 24.6826,
3035
+ "step": 2150
3036
+ },
3037
+ {
3038
+ "epoch": 0.9441272709847893,
3039
+ "grad_norm": 13.7734375,
3040
+ "learning_rate": 1.2372138334145156e-06,
3041
+ "loss": 23.6338,
3042
+ "step": 2155
3043
+ },
3044
+ {
3045
+ "epoch": 0.9463178214975151,
3046
+ "grad_norm": 12.1015625,
3047
+ "learning_rate": 1.188504627374574e-06,
3048
+ "loss": 24.3637,
3049
+ "step": 2160
3050
+ },
3051
+ {
3052
+ "epoch": 0.9485083720102409,
3053
+ "grad_norm": 14.578125,
3054
+ "learning_rate": 1.1397954213346323e-06,
3055
+ "loss": 24.6624,
3056
+ "step": 2165
3057
+ },
3058
+ {
3059
+ "epoch": 0.9506989225229665,
3060
+ "grad_norm": 12.7578125,
3061
+ "learning_rate": 1.0910862152946907e-06,
3062
+ "loss": 23.9978,
3063
+ "step": 2170
3064
+ },
3065
+ {
3066
+ "epoch": 0.9528894730356923,
3067
+ "grad_norm": 13.2265625,
3068
+ "learning_rate": 1.0423770092547493e-06,
3069
+ "loss": 24.4468,
3070
+ "step": 2175
3071
+ },
3072
+ {
3073
+ "epoch": 0.955080023548418,
3074
+ "grad_norm": 11.7890625,
3075
+ "learning_rate": 9.936678032148077e-07,
3076
+ "loss": 24.303,
3077
+ "step": 2180
3078
+ },
3079
+ {
3080
+ "epoch": 0.9572705740611437,
3081
+ "grad_norm": 16.25,
3082
+ "learning_rate": 9.44958597174866e-07,
3083
+ "loss": 24.5248,
3084
+ "step": 2185
3085
+ },
3086
+ {
3087
+ "epoch": 0.9594611245738695,
3088
+ "grad_norm": 18.859375,
3089
+ "learning_rate": 8.962493911349246e-07,
3090
+ "loss": 24.4316,
3091
+ "step": 2190
3092
+ },
3093
+ {
3094
+ "epoch": 0.9616516750865952,
3095
+ "grad_norm": 10.6796875,
3096
+ "learning_rate": 8.47540185094983e-07,
3097
+ "loss": 23.8851,
3098
+ "step": 2195
3099
+ },
3100
+ {
3101
+ "epoch": 0.963842225599321,
3102
+ "grad_norm": 12.140625,
3103
+ "learning_rate": 7.988309790550415e-07,
3104
+ "loss": 24.2927,
3105
+ "step": 2200
3106
+ },
3107
+ {
3108
+ "epoch": 0.9660327761120466,
3109
+ "grad_norm": 18.078125,
3110
+ "learning_rate": 7.501217730150999e-07,
3111
+ "loss": 24.1125,
3112
+ "step": 2205
3113
+ },
3114
+ {
3115
+ "epoch": 0.9682233266247724,
3116
+ "grad_norm": 9.7109375,
3117
+ "learning_rate": 7.014125669751585e-07,
3118
+ "loss": 24.1763,
3119
+ "step": 2210
3120
+ },
3121
+ {
3122
+ "epoch": 0.9704138771374982,
3123
+ "grad_norm": 12.6953125,
3124
+ "learning_rate": 6.527033609352168e-07,
3125
+ "loss": 23.1948,
3126
+ "step": 2215
3127
+ },
3128
+ {
3129
+ "epoch": 0.9726044276502238,
3130
+ "grad_norm": 18.671875,
3131
+ "learning_rate": 6.039941548952752e-07,
3132
+ "loss": 24.2813,
3133
+ "step": 2220
3134
+ },
3135
+ {
3136
+ "epoch": 0.9747949781629496,
3137
+ "grad_norm": 9.96875,
3138
+ "learning_rate": 5.552849488553337e-07,
3139
+ "loss": 23.7533,
3140
+ "step": 2225
3141
+ },
3142
+ {
3143
+ "epoch": 0.9769855286756753,
3144
+ "grad_norm": 13.71875,
3145
+ "learning_rate": 5.065757428153922e-07,
3146
+ "loss": 24.5382,
3147
+ "step": 2230
3148
+ },
3149
+ {
3150
+ "epoch": 0.979176079188401,
3151
+ "grad_norm": 13.4296875,
3152
+ "learning_rate": 4.578665367754506e-07,
3153
+ "loss": 23.7636,
3154
+ "step": 2235
3155
+ },
3156
+ {
3157
+ "epoch": 0.9813666297011268,
3158
+ "grad_norm": 13.640625,
3159
+ "learning_rate": 4.091573307355091e-07,
3160
+ "loss": 23.785,
3161
+ "step": 2240
3162
+ },
3163
+ {
3164
+ "epoch": 0.9835571802138525,
3165
+ "grad_norm": 27.078125,
3166
+ "learning_rate": 3.6044812469556747e-07,
3167
+ "loss": 23.8269,
3168
+ "step": 2245
3169
+ },
3170
+ {
3171
+ "epoch": 0.9857477307265782,
3172
+ "grad_norm": 13.4609375,
3173
+ "learning_rate": 3.1173891865562595e-07,
3174
+ "loss": 24.0428,
3175
+ "step": 2250
3176
+ },
3177
+ {
3178
+ "epoch": 0.9879382812393039,
3179
+ "grad_norm": 12.109375,
3180
+ "learning_rate": 2.630297126156844e-07,
3181
+ "loss": 24.3293,
3182
+ "step": 2255
3183
+ },
3184
+ {
3185
+ "epoch": 0.9901288317520297,
3186
+ "grad_norm": 20.84375,
3187
+ "learning_rate": 2.1432050657574284e-07,
3188
+ "loss": 24.0019,
3189
+ "step": 2260
3190
+ },
3191
+ {
3192
+ "epoch": 0.9923193822647554,
3193
+ "grad_norm": 16.265625,
3194
+ "learning_rate": 1.6561130053580127e-07,
3195
+ "loss": 24.3604,
3196
+ "step": 2265
3197
+ },
3198
+ {
3199
+ "epoch": 0.9945099327774811,
3200
+ "grad_norm": 19.96875,
3201
+ "learning_rate": 1.1690209449585972e-07,
3202
+ "loss": 24.1925,
3203
+ "step": 2270
3204
+ },
3205
+ {
3206
+ "epoch": 0.9967004832902069,
3207
+ "grad_norm": 16.953125,
3208
+ "learning_rate": 6.819288845591817e-08,
3209
+ "loss": 23.6996,
3210
+ "step": 2275
3211
+ },
3212
+ {
3213
+ "epoch": 0.9988910338029326,
3214
+ "grad_norm": 19.6875,
3215
+ "learning_rate": 1.9483682415976622e-08,
3216
+ "loss": 23.9507,
3217
+ "step": 2280
3218
  }
3219
  ],
3220
  "logging_steps": 5,
 
3229
  "should_evaluate": false,
3230
  "should_log": false,
3231
  "should_save": true,
3232
+ "should_training_stop": true
3233
  },
3234
  "attributes": {}
3235
  }
3236
  },
3237
+ "total_flos": 1.9772879918220706e+19,
3238
  "train_batch_size": 8,
3239
  "trial_name": null,
3240
  "trial_params": null