minpeter commited on
Commit
71fe9f4
·
verified ·
1 Parent(s): 537c77d

Training in progress, step 481, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4452d8ee5d1c4b3f248050b462aef67647d83e8aa2c819475c2561ad6988260f
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d861b0c356377800f143df821f91f101184e3745994cd74adf5133106d79bde2
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abc52b4912bc5a69beadb79f29cd708ffaef6d5ab82fd19a670038c92f29c313
3
  size 373225675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799ccb4c58e58c79e5f8070f3d59b69f01779859a514d0911cf768c6e286cb76
3
  size 373225675
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f438d73941ac2939699522d3048115527267b7c8c06f9f728e1517b0c3c16832
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8b4903fb7be2c884aa62b335e9291d92240420b2bf52100c824048b850730b6
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8316008316008316,
6
  "eval_steps": 100,
7
- "global_step": 400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2840,6 +2840,573 @@
2840
  "eval_samples_per_second": 22.501,
2841
  "eval_steps_per_second": 2.813,
2842
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2843
  }
2844
  ],
2845
  "logging_steps": 1,
@@ -2854,12 +3421,12 @@
2854
  "should_evaluate": false,
2855
  "should_log": false,
2856
  "should_save": true,
2857
- "should_training_stop": false
2858
  },
2859
  "attributes": {}
2860
  }
2861
  },
2862
- "total_flos": 1.018894554759168e+17,
2863
  "train_batch_size": 32,
2864
  "trial_name": null,
2865
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 100,
7
+ "global_step": 481,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2840
  "eval_samples_per_second": 22.501,
2841
  "eval_steps_per_second": 2.813,
2842
  "step": 400
2843
+ },
2844
+ {
2845
+ "epoch": 0.8336798336798337,
2846
+ "grad_norm": 0.27734375,
2847
+ "learning_rate": 7.585430144121319e-05,
2848
+ "loss": 5.5678,
2849
+ "step": 401
2850
+ },
2851
+ {
2852
+ "epoch": 0.8357588357588358,
2853
+ "grad_norm": 0.275390625,
2854
+ "learning_rate": 7.404029558083653e-05,
2855
+ "loss": 5.4751,
2856
+ "step": 402
2857
+ },
2858
+ {
2859
+ "epoch": 0.8378378378378378,
2860
+ "grad_norm": 0.275390625,
2861
+ "learning_rate": 7.224650765840613e-05,
2862
+ "loss": 5.4895,
2863
+ "step": 403
2864
+ },
2865
+ {
2866
+ "epoch": 0.83991683991684,
2867
+ "grad_norm": 0.2890625,
2868
+ "learning_rate": 7.047302281505735e-05,
2869
+ "loss": 5.381,
2870
+ "step": 404
2871
+ },
2872
+ {
2873
+ "epoch": 0.841995841995842,
2874
+ "grad_norm": 0.64453125,
2875
+ "learning_rate": 6.871992522825182e-05,
2876
+ "loss": 5.3161,
2877
+ "step": 405
2878
+ },
2879
+ {
2880
+ "epoch": 0.8440748440748441,
2881
+ "grad_norm": 1.078125,
2882
+ "learning_rate": 6.698729810778065e-05,
2883
+ "loss": 5.3665,
2884
+ "step": 406
2885
+ },
2886
+ {
2887
+ "epoch": 0.8461538461538461,
2888
+ "grad_norm": 1.0859375,
2889
+ "learning_rate": 6.527522369181655e-05,
2890
+ "loss": 5.545,
2891
+ "step": 407
2892
+ },
2893
+ {
2894
+ "epoch": 0.8482328482328483,
2895
+ "grad_norm": 0.259765625,
2896
+ "learning_rate": 6.358378324300929e-05,
2897
+ "loss": 5.1788,
2898
+ "step": 408
2899
+ },
2900
+ {
2901
+ "epoch": 0.8503118503118503,
2902
+ "grad_norm": 0.26953125,
2903
+ "learning_rate": 6.191305704462896e-05,
2904
+ "loss": 5.2483,
2905
+ "step": 409
2906
+ },
2907
+ {
2908
+ "epoch": 0.8523908523908524,
2909
+ "grad_norm": 0.28125,
2910
+ "learning_rate": 6.026312439675552e-05,
2911
+ "loss": 4.8411,
2912
+ "step": 410
2913
+ },
2914
+ {
2915
+ "epoch": 0.8544698544698545,
2916
+ "grad_norm": 0.3359375,
2917
+ "learning_rate": 5.863406361251472e-05,
2918
+ "loss": 5.1144,
2919
+ "step": 411
2920
+ },
2921
+ {
2922
+ "epoch": 0.8565488565488566,
2923
+ "grad_norm": 1.0546875,
2924
+ "learning_rate": 5.7025952014361004e-05,
2925
+ "loss": 5.1504,
2926
+ "step": 412
2927
+ },
2928
+ {
2929
+ "epoch": 0.8586278586278586,
2930
+ "grad_norm": 0.337890625,
2931
+ "learning_rate": 5.543886593040737e-05,
2932
+ "loss": 5.2093,
2933
+ "step": 413
2934
+ },
2935
+ {
2936
+ "epoch": 0.8607068607068608,
2937
+ "grad_norm": 0.287109375,
2938
+ "learning_rate": 5.387288069080298e-05,
2939
+ "loss": 5.0402,
2940
+ "step": 414
2941
+ },
2942
+ {
2943
+ "epoch": 0.8627858627858628,
2944
+ "grad_norm": 0.32421875,
2945
+ "learning_rate": 5.23280706241569e-05,
2946
+ "loss": 4.7027,
2947
+ "step": 415
2948
+ },
2949
+ {
2950
+ "epoch": 0.8648648648648649,
2951
+ "grad_norm": 0.25,
2952
+ "learning_rate": 5.080450905401057e-05,
2953
+ "loss": 5.2766,
2954
+ "step": 416
2955
+ },
2956
+ {
2957
+ "epoch": 0.8669438669438669,
2958
+ "grad_norm": 0.28515625,
2959
+ "learning_rate": 4.930226829535767e-05,
2960
+ "loss": 5.3266,
2961
+ "step": 417
2962
+ },
2963
+ {
2964
+ "epoch": 0.8690228690228691,
2965
+ "grad_norm": 0.279296875,
2966
+ "learning_rate": 4.7821419651211284e-05,
2967
+ "loss": 5.1296,
2968
+ "step": 418
2969
+ },
2970
+ {
2971
+ "epoch": 0.8711018711018711,
2972
+ "grad_norm": 0.27734375,
2973
+ "learning_rate": 4.636203340922007e-05,
2974
+ "loss": 5.5194,
2975
+ "step": 419
2976
+ },
2977
+ {
2978
+ "epoch": 0.8731808731808732,
2979
+ "grad_norm": 0.2578125,
2980
+ "learning_rate": 4.492417883833155e-05,
2981
+ "loss": 4.968,
2982
+ "step": 420
2983
+ },
2984
+ {
2985
+ "epoch": 0.8752598752598753,
2986
+ "grad_norm": 0.2578125,
2987
+ "learning_rate": 4.350792418550509e-05,
2988
+ "loss": 5.6204,
2989
+ "step": 421
2990
+ },
2991
+ {
2992
+ "epoch": 0.8773388773388774,
2993
+ "grad_norm": 0.2451171875,
2994
+ "learning_rate": 4.211333667247125e-05,
2995
+ "loss": 5.2991,
2996
+ "step": 422
2997
+ },
2998
+ {
2999
+ "epoch": 0.8794178794178794,
3000
+ "grad_norm": 0.267578125,
3001
+ "learning_rate": 4.074048249254286e-05,
3002
+ "loss": 5.3253,
3003
+ "step": 423
3004
+ },
3005
+ {
3006
+ "epoch": 0.8814968814968815,
3007
+ "grad_norm": 1.0546875,
3008
+ "learning_rate": 3.938942680747176e-05,
3009
+ "loss": 4.9877,
3010
+ "step": 424
3011
+ },
3012
+ {
3013
+ "epoch": 0.8835758835758836,
3014
+ "grad_norm": 0.2373046875,
3015
+ "learning_rate": 3.806023374435663e-05,
3016
+ "loss": 5.5203,
3017
+ "step": 425
3018
+ },
3019
+ {
3020
+ "epoch": 0.8856548856548857,
3021
+ "grad_norm": 0.2890625,
3022
+ "learning_rate": 3.675296639259912e-05,
3023
+ "loss": 4.9162,
3024
+ "step": 426
3025
+ },
3026
+ {
3027
+ "epoch": 0.8877338877338877,
3028
+ "grad_norm": 0.267578125,
3029
+ "learning_rate": 3.546768680090934e-05,
3030
+ "loss": 5.3487,
3031
+ "step": 427
3032
+ },
3033
+ {
3034
+ "epoch": 0.8898128898128899,
3035
+ "grad_norm": 0.26171875,
3036
+ "learning_rate": 3.420445597436056e-05,
3037
+ "loss": 4.8628,
3038
+ "step": 428
3039
+ },
3040
+ {
3041
+ "epoch": 0.8918918918918919,
3042
+ "grad_norm": 0.271484375,
3043
+ "learning_rate": 3.296333387149392e-05,
3044
+ "loss": 5.2627,
3045
+ "step": 429
3046
+ },
3047
+ {
3048
+ "epoch": 0.893970893970894,
3049
+ "grad_norm": 0.27734375,
3050
+ "learning_rate": 3.174437940147268e-05,
3051
+ "loss": 5.3865,
3052
+ "step": 430
3053
+ },
3054
+ {
3055
+ "epoch": 0.896049896049896,
3056
+ "grad_norm": 0.296875,
3057
+ "learning_rate": 3.054765042128521e-05,
3058
+ "loss": 5.1347,
3059
+ "step": 431
3060
+ },
3061
+ {
3062
+ "epoch": 0.8981288981288982,
3063
+ "grad_norm": 0.296875,
3064
+ "learning_rate": 2.9373203733000232e-05,
3065
+ "loss": 5.308,
3066
+ "step": 432
3067
+ },
3068
+ {
3069
+ "epoch": 0.9002079002079002,
3070
+ "grad_norm": 0.26171875,
3071
+ "learning_rate": 2.8221095081069513e-05,
3072
+ "loss": 5.0021,
3073
+ "step": 433
3074
+ },
3075
+ {
3076
+ "epoch": 0.9022869022869023,
3077
+ "grad_norm": 1.046875,
3078
+ "learning_rate": 2.709137914968268e-05,
3079
+ "loss": 5.0882,
3080
+ "step": 434
3081
+ },
3082
+ {
3083
+ "epoch": 0.9043659043659044,
3084
+ "grad_norm": 1.0546875,
3085
+ "learning_rate": 2.5984109560171388e-05,
3086
+ "loss": 5.4556,
3087
+ "step": 435
3088
+ },
3089
+ {
3090
+ "epoch": 0.9064449064449065,
3091
+ "grad_norm": 0.248046875,
3092
+ "learning_rate": 2.4899338868464407e-05,
3093
+ "loss": 5.4887,
3094
+ "step": 436
3095
+ },
3096
+ {
3097
+ "epoch": 0.9085239085239085,
3098
+ "grad_norm": 0.2734375,
3099
+ "learning_rate": 2.3837118562592797e-05,
3100
+ "loss": 5.3905,
3101
+ "step": 437
3102
+ },
3103
+ {
3104
+ "epoch": 0.9106029106029107,
3105
+ "grad_norm": 0.283203125,
3106
+ "learning_rate": 2.2797499060246252e-05,
3107
+ "loss": 5.1305,
3108
+ "step": 438
3109
+ },
3110
+ {
3111
+ "epoch": 0.9126819126819127,
3112
+ "grad_norm": 0.2412109375,
3113
+ "learning_rate": 2.1780529706380336e-05,
3114
+ "loss": 5.3754,
3115
+ "step": 439
3116
+ },
3117
+ {
3118
+ "epoch": 0.9147609147609148,
3119
+ "grad_norm": 0.27734375,
3120
+ "learning_rate": 2.0786258770873646e-05,
3121
+ "loss": 5.2316,
3122
+ "step": 440
3123
+ },
3124
+ {
3125
+ "epoch": 0.9168399168399168,
3126
+ "grad_norm": 0.294921875,
3127
+ "learning_rate": 1.9814733446237355e-05,
3128
+ "loss": 5.37,
3129
+ "step": 441
3130
+ },
3131
+ {
3132
+ "epoch": 0.918918918918919,
3133
+ "grad_norm": 0.2392578125,
3134
+ "learning_rate": 1.886599984537479e-05,
3135
+ "loss": 5.4565,
3136
+ "step": 442
3137
+ },
3138
+ {
3139
+ "epoch": 0.920997920997921,
3140
+ "grad_norm": 0.31640625,
3141
+ "learning_rate": 1.7940102999393192e-05,
3142
+ "loss": 5.1685,
3143
+ "step": 443
3144
+ },
3145
+ {
3146
+ "epoch": 0.9230769230769231,
3147
+ "grad_norm": 0.275390625,
3148
+ "learning_rate": 1.70370868554659e-05,
3149
+ "loss": 5.2651,
3150
+ "step": 444
3151
+ },
3152
+ {
3153
+ "epoch": 0.9251559251559252,
3154
+ "grad_norm": 0.29296875,
3155
+ "learning_rate": 1.6156994274746485e-05,
3156
+ "loss": 5.1014,
3157
+ "step": 445
3158
+ },
3159
+ {
3160
+ "epoch": 0.9272349272349273,
3161
+ "grad_norm": 0.328125,
3162
+ "learning_rate": 1.5299867030334813e-05,
3163
+ "loss": 4.4473,
3164
+ "step": 446
3165
+ },
3166
+ {
3167
+ "epoch": 0.9293139293139293,
3168
+ "grad_norm": 0.275390625,
3169
+ "learning_rate": 1.4465745805293584e-05,
3170
+ "loss": 5.4957,
3171
+ "step": 447
3172
+ },
3173
+ {
3174
+ "epoch": 0.9313929313929314,
3175
+ "grad_norm": 0.259765625,
3176
+ "learning_rate": 1.3654670190718033e-05,
3177
+ "loss": 5.4438,
3178
+ "step": 448
3179
+ },
3180
+ {
3181
+ "epoch": 0.9334719334719335,
3182
+ "grad_norm": 0.263671875,
3183
+ "learning_rate": 1.286667868385627e-05,
3184
+ "loss": 5.1253,
3185
+ "step": 449
3186
+ },
3187
+ {
3188
+ "epoch": 0.9355509355509356,
3189
+ "grad_norm": 0.25390625,
3190
+ "learning_rate": 1.210180868628219e-05,
3191
+ "loss": 5.2587,
3192
+ "step": 450
3193
+ },
3194
+ {
3195
+ "epoch": 0.9376299376299376,
3196
+ "grad_norm": 0.28125,
3197
+ "learning_rate": 1.1360096502120388e-05,
3198
+ "loss": 5.2587,
3199
+ "step": 451
3200
+ },
3201
+ {
3202
+ "epoch": 0.9397089397089398,
3203
+ "grad_norm": 0.349609375,
3204
+ "learning_rate": 1.064157733632276e-05,
3205
+ "loss": 5.0973,
3206
+ "step": 452
3207
+ },
3208
+ {
3209
+ "epoch": 0.9417879417879418,
3210
+ "grad_norm": 0.263671875,
3211
+ "learning_rate": 9.94628529299768e-06,
3212
+ "loss": 5.1763,
3213
+ "step": 453
3214
+ },
3215
+ {
3216
+ "epoch": 0.9438669438669439,
3217
+ "grad_norm": 0.271484375,
3218
+ "learning_rate": 9.274253373791064e-06,
3219
+ "loss": 5.2711,
3220
+ "step": 454
3221
+ },
3222
+ {
3223
+ "epoch": 0.9459459459459459,
3224
+ "grad_norm": 0.279296875,
3225
+ "learning_rate": 8.62551347632029e-06,
3226
+ "loss": 5.1639,
3227
+ "step": 455
3228
+ },
3229
+ {
3230
+ "epoch": 0.9480249480249481,
3231
+ "grad_norm": 0.26171875,
3232
+ "learning_rate": 8.000096392660028e-06,
3233
+ "loss": 5.2604,
3234
+ "step": 456
3235
+ },
3236
+ {
3237
+ "epoch": 0.9501039501039501,
3238
+ "grad_norm": 0.7890625,
3239
+ "learning_rate": 7.398031807880457e-06,
3240
+ "loss": 4.9583,
3241
+ "step": 457
3242
+ },
3243
+ {
3244
+ "epoch": 0.9521829521829522,
3245
+ "grad_norm": 0.28125,
3246
+ "learning_rate": 6.819348298638839e-06,
3247
+ "loss": 5.5333,
3248
+ "step": 458
3249
+ },
3250
+ {
3251
+ "epoch": 0.9542619542619543,
3252
+ "grad_norm": 0.28515625,
3253
+ "learning_rate": 6.264073331822551e-06,
3254
+ "loss": 5.0472,
3255
+ "step": 459
3256
+ },
3257
+ {
3258
+ "epoch": 0.9563409563409564,
3259
+ "grad_norm": 0.373046875,
3260
+ "learning_rate": 5.732233263245845e-06,
3261
+ "loss": 4.8395,
3262
+ "step": 460
3263
+ },
3264
+ {
3265
+ "epoch": 0.9584199584199584,
3266
+ "grad_norm": 0.3046875,
3267
+ "learning_rate": 5.223853336398632e-06,
3268
+ "loss": 5.3617,
3269
+ "step": 461
3270
+ },
3271
+ {
3272
+ "epoch": 0.9604989604989606,
3273
+ "grad_norm": 0.380859375,
3274
+ "learning_rate": 4.738957681248379e-06,
3275
+ "loss": 4.9956,
3276
+ "step": 462
3277
+ },
3278
+ {
3279
+ "epoch": 0.9625779625779626,
3280
+ "grad_norm": 0.296875,
3281
+ "learning_rate": 4.277569313094809e-06,
3282
+ "loss": 5.0861,
3283
+ "step": 463
3284
+ },
3285
+ {
3286
+ "epoch": 0.9646569646569647,
3287
+ "grad_norm": 0.267578125,
3288
+ "learning_rate": 3.839710131477492e-06,
3289
+ "loss": 5.1291,
3290
+ "step": 464
3291
+ },
3292
+ {
3293
+ "epoch": 0.9667359667359667,
3294
+ "grad_norm": 0.26953125,
3295
+ "learning_rate": 3.4254009191363455e-06,
3296
+ "loss": 4.9485,
3297
+ "step": 465
3298
+ },
3299
+ {
3300
+ "epoch": 0.9688149688149689,
3301
+ "grad_norm": 0.263671875,
3302
+ "learning_rate": 3.034661341025258e-06,
3303
+ "loss": 5.0859,
3304
+ "step": 466
3305
+ },
3306
+ {
3307
+ "epoch": 0.9708939708939709,
3308
+ "grad_norm": 0.68359375,
3309
+ "learning_rate": 2.6675099433787208e-06,
3310
+ "loss": 5.1618,
3311
+ "step": 467
3312
+ },
3313
+ {
3314
+ "epoch": 0.972972972972973,
3315
+ "grad_norm": 0.263671875,
3316
+ "learning_rate": 2.323964152831426e-06,
3317
+ "loss": 5.5762,
3318
+ "step": 468
3319
+ },
3320
+ {
3321
+ "epoch": 0.975051975051975,
3322
+ "grad_norm": 0.36328125,
3323
+ "learning_rate": 2.0040402755912013e-06,
3324
+ "loss": 4.9616,
3325
+ "step": 469
3326
+ },
3327
+ {
3328
+ "epoch": 0.9771309771309772,
3329
+ "grad_norm": 0.283203125,
3330
+ "learning_rate": 1.7077534966650766e-06,
3331
+ "loss": 5.3332,
3332
+ "step": 470
3333
+ },
3334
+ {
3335
+ "epoch": 0.9792099792099792,
3336
+ "grad_norm": 0.26171875,
3337
+ "learning_rate": 1.43511787913847e-06,
3338
+ "loss": 4.9868,
3339
+ "step": 471
3340
+ },
3341
+ {
3342
+ "epoch": 0.9812889812889813,
3343
+ "grad_norm": 0.251953125,
3344
+ "learning_rate": 1.1861463635077786e-06,
3345
+ "loss": 5.3489,
3346
+ "step": 472
3347
+ },
3348
+ {
3349
+ "epoch": 0.9833679833679834,
3350
+ "grad_norm": 0.25390625,
3351
+ "learning_rate": 9.60850767065924e-07,
3352
+ "loss": 5.1146,
3353
+ "step": 473
3354
+ },
3355
+ {
3356
+ "epoch": 0.9854469854469855,
3357
+ "grad_norm": 0.66015625,
3358
+ "learning_rate": 7.592417833419129e-07,
3359
+ "loss": 5.0727,
3360
+ "step": 474
3361
+ },
3362
+ {
3363
+ "epoch": 0.9875259875259875,
3364
+ "grad_norm": 0.2421875,
3365
+ "learning_rate": 5.81328981592688e-07,
3366
+ "loss": 5.5923,
3367
+ "step": 475
3368
+ },
3369
+ {
3370
+ "epoch": 0.9896049896049897,
3371
+ "grad_norm": 0.259765625,
3372
+ "learning_rate": 4.2712080634949023e-07,
3373
+ "loss": 5.2152,
3374
+ "step": 476
3375
+ },
3376
+ {
3377
+ "epoch": 0.9916839916839917,
3378
+ "grad_norm": 0.25,
3379
+ "learning_rate": 2.966245770166243e-07,
3380
+ "loss": 5.2777,
3381
+ "step": 477
3382
+ },
3383
+ {
3384
+ "epoch": 0.9937629937629938,
3385
+ "grad_norm": 0.279296875,
3386
+ "learning_rate": 1.8984648752429223e-07,
3387
+ "loss": 5.2911,
3388
+ "step": 478
3389
+ },
3390
+ {
3391
+ "epoch": 0.9958419958419958,
3392
+ "grad_norm": 0.25390625,
3393
+ "learning_rate": 1.0679160603449533e-07,
3394
+ "loss": 5.1634,
3395
+ "step": 479
3396
+ },
3397
+ {
3398
+ "epoch": 0.997920997920998,
3399
+ "grad_norm": 0.2470703125,
3400
+ "learning_rate": 4.746387470044855e-08,
3401
+ "loss": 5.0967,
3402
+ "step": 480
3403
+ },
3404
+ {
3405
+ "epoch": 1.0,
3406
+ "grad_norm": 0.25390625,
3407
+ "learning_rate": 1.1866109479674591e-08,
3408
+ "loss": 5.2176,
3409
+ "step": 481
3410
  }
3411
  ],
3412
  "logging_steps": 1,
 
3421
  "should_evaluate": false,
3422
  "should_log": false,
3423
  "should_save": true,
3424
+ "should_training_stop": true
3425
  },
3426
  "attributes": {}
3427
  }
3428
  },
3429
+ "total_flos": 1.2252207020978995e+17,
3430
  "train_batch_size": 32,
3431
  "trial_name": null,
3432
  "trial_params": null