mtzig commited on
Commit
51a8e90
·
verified ·
1 Parent(s): 80fd6e0

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cd061667d62cdf6b8e885ae672a4fa1817a64172c0dc13f261537a3e31f28db
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:046ad15b3a172be6c8a55556a3c20f15ef4ee714b05b61a7d6c92d4c6c9e3474
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87e886c2d84e224bcf754622b2803a8b4b64da30bce7eb4e4a3fb75b1b091c5e
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99483ad02ea22340771991cc6669d1256f76d10e032f69951c7480de7534bf0
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d01731a1af4d71978b316124603e4caa090cc86ccd121d20f40ef90314e39721
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15aeeac899877619eba9935ba0590b8f1fa55e2d75c220d96a220798bf78d453
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6befdc931c99a6a9572bf364f4fbf3a16a16ac047bda664b290f7eaf2d6f0509
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c991dfb7fae6e7e8e823a6ff78f7059aa6c3e2ee08cfa323cee3a4c276002a52
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d08e96af21e8b93e1cae1c1f298c74bd5cb903e59a95e666fe5d23d7c34e828
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2efd166763c9a22763a8e34b47f368dce987da9bde3aa0da236e9078b6b587f0
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:800dcee3d49bf7c4fb9af44a7247d8c8a98f39fbe21de15901e57a24fee6d511
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e892fe343502bd3fc0cfd63b8565786111da6ae6996697589256c318e3c3076d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95e1a7c487043377d57b4e529a8c41b121f1a82a2bf5513187f81cd357b2a6fd
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e97d6a38ca3edf5744f51d03cec6812554f609ad1d7c762e2e3dcca3bc8af260
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cc70fc94ad214460b8f53afbc67815e264058229327612b212b333c955747d2
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13a2e456b9f475387054566e8f129204eb628e3726aee77c4412ff11fc720706
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ffe6f9ada3514f92495fec3edd9e5bfa7e16527e9f4d407a243ffca3a335369
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99b43935a8d0c3ec7f6a15b5d02d38b25daf586d495f97529bae66a69e46d216
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea26ba6138daf1586403be19f69bfdf220f2970f3306409052e7562dbee71e8f
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31626c13f5c4cf0cf88e6b691ef4408c4d52105b3855f7889d25ca5f4f0a0734
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:350fe26b744c676e14aee27774ebc4bcf6a2961db0854ea02d257bc061e2b80c
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04807176bb2c22eeb6b0258c9226b2dfd4f8b8398c96841c95458ef393e0f56a
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8aa0e29a92b309693c410bb08006a182de233c5acd31c16b7450cbb9c31feeb
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40e15fa019a19c6cd7ce2d72f5afc609c5c0b834df5220f887d5ba71dae814ca
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f4d7d51569641046d070ffaf530561887033fff68178c32329f5f5841b1a076
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:689f27834221968cbb24970b2a0ef37515a668dd8bd2e8a00c81e11a90d7d4a4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.502092050209205,
5
  "eval_steps": 20,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3059,6 +3059,766 @@
3059
  "eval_samples_per_second": 5.361,
3060
  "eval_steps_per_second": 0.174,
3061
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3062
  }
3063
  ],
3064
  "logging_steps": 1,
@@ -3078,7 +3838,7 @@
3078
  "attributes": {}
3079
  }
3080
  },
3081
- "total_flos": 1.4611649697467597e+17,
3082
  "train_batch_size": 6,
3083
  "trial_name": null,
3084
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6276150627615062,
5
  "eval_steps": 20,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3059
  "eval_samples_per_second": 5.361,
3060
  "eval_steps_per_second": 0.174,
3061
  "step": 400
3062
+ },
3063
+ {
3064
+ "epoch": 0.5033472803347281,
3065
+ "grad_norm": 3.5723674297332764,
3066
+ "learning_rate": 1.1616326878747115e-05,
3067
+ "loss": 0.2269,
3068
+ "step": 401
3069
+ },
3070
+ {
3071
+ "epoch": 0.5046025104602511,
3072
+ "grad_norm": 3.2594027519226074,
3073
+ "learning_rate": 1.1573011405714214e-05,
3074
+ "loss": 0.2535,
3075
+ "step": 402
3076
+ },
3077
+ {
3078
+ "epoch": 0.505857740585774,
3079
+ "grad_norm": 4.008416652679443,
3080
+ "learning_rate": 1.1529665649264388e-05,
3081
+ "loss": 0.2676,
3082
+ "step": 403
3083
+ },
3084
+ {
3085
+ "epoch": 0.507112970711297,
3086
+ "grad_norm": 4.401585102081299,
3087
+ "learning_rate": 1.1486290443884666e-05,
3088
+ "loss": 0.2613,
3089
+ "step": 404
3090
+ },
3091
+ {
3092
+ "epoch": 0.5083682008368201,
3093
+ "grad_norm": 4.80834436416626,
3094
+ "learning_rate": 1.1442886624629035e-05,
3095
+ "loss": 0.3053,
3096
+ "step": 405
3097
+ },
3098
+ {
3099
+ "epoch": 0.5096234309623431,
3100
+ "grad_norm": 4.229012489318848,
3101
+ "learning_rate": 1.1399455027102327e-05,
3102
+ "loss": 0.241,
3103
+ "step": 406
3104
+ },
3105
+ {
3106
+ "epoch": 0.5108786610878661,
3107
+ "grad_norm": 3.3926737308502197,
3108
+ "learning_rate": 1.1355996487444178e-05,
3109
+ "loss": 0.1874,
3110
+ "step": 407
3111
+ },
3112
+ {
3113
+ "epoch": 0.5121338912133891,
3114
+ "grad_norm": 6.564388751983643,
3115
+ "learning_rate": 1.131251184231291e-05,
3116
+ "loss": 0.2087,
3117
+ "step": 408
3118
+ },
3119
+ {
3120
+ "epoch": 0.5133891213389121,
3121
+ "grad_norm": 4.077323913574219,
3122
+ "learning_rate": 1.1269001928869414e-05,
3123
+ "loss": 0.2787,
3124
+ "step": 409
3125
+ },
3126
+ {
3127
+ "epoch": 0.5146443514644351,
3128
+ "grad_norm": 4.169933319091797,
3129
+ "learning_rate": 1.122546758476105e-05,
3130
+ "loss": 0.2701,
3131
+ "step": 410
3132
+ },
3133
+ {
3134
+ "epoch": 0.5158995815899582,
3135
+ "grad_norm": 4.548664093017578,
3136
+ "learning_rate": 1.1181909648105511e-05,
3137
+ "loss": 0.2711,
3138
+ "step": 411
3139
+ },
3140
+ {
3141
+ "epoch": 0.5171548117154812,
3142
+ "grad_norm": 7.507246971130371,
3143
+ "learning_rate": 1.1138328957474691e-05,
3144
+ "loss": 0.2761,
3145
+ "step": 412
3146
+ },
3147
+ {
3148
+ "epoch": 0.5184100418410041,
3149
+ "grad_norm": 4.293572425842285,
3150
+ "learning_rate": 1.1094726351878549e-05,
3151
+ "loss": 0.2451,
3152
+ "step": 413
3153
+ },
3154
+ {
3155
+ "epoch": 0.5196652719665272,
3156
+ "grad_norm": 4.692587852478027,
3157
+ "learning_rate": 1.1051102670748939e-05,
3158
+ "loss": 0.2841,
3159
+ "step": 414
3160
+ },
3161
+ {
3162
+ "epoch": 0.5209205020920502,
3163
+ "grad_norm": 3.3623526096343994,
3164
+ "learning_rate": 1.1007458753923455e-05,
3165
+ "loss": 0.245,
3166
+ "step": 415
3167
+ },
3168
+ {
3169
+ "epoch": 0.5221757322175732,
3170
+ "grad_norm": 4.157695770263672,
3171
+ "learning_rate": 1.0963795441629275e-05,
3172
+ "loss": 0.2193,
3173
+ "step": 416
3174
+ },
3175
+ {
3176
+ "epoch": 0.5234309623430963,
3177
+ "grad_norm": 5.14530086517334,
3178
+ "learning_rate": 1.0920113574466975e-05,
3179
+ "loss": 0.3033,
3180
+ "step": 417
3181
+ },
3182
+ {
3183
+ "epoch": 0.5246861924686193,
3184
+ "grad_norm": 4.993683338165283,
3185
+ "learning_rate": 1.0876413993394346e-05,
3186
+ "loss": 0.2947,
3187
+ "step": 418
3188
+ },
3189
+ {
3190
+ "epoch": 0.5259414225941422,
3191
+ "grad_norm": 6.235597610473633,
3192
+ "learning_rate": 1.0832697539710197e-05,
3193
+ "loss": 0.2765,
3194
+ "step": 419
3195
+ },
3196
+ {
3197
+ "epoch": 0.5271966527196653,
3198
+ "grad_norm": 3.5556070804595947,
3199
+ "learning_rate": 1.0788965055038179e-05,
3200
+ "loss": 0.2743,
3201
+ "step": 420
3202
+ },
3203
+ {
3204
+ "epoch": 0.5271966527196653,
3205
+ "eval_accuracy": 0.8543046357615894,
3206
+ "eval_f1": 0.695852534562212,
3207
+ "eval_loss": 0.30108267068862915,
3208
+ "eval_precision": 0.8388888888888889,
3209
+ "eval_recall": 0.594488188976378,
3210
+ "eval_runtime": 53.2652,
3211
+ "eval_samples_per_second": 5.2,
3212
+ "eval_steps_per_second": 0.169,
3213
+ "step": 420
3214
+ },
3215
+ {
3216
+ "epoch": 0.5284518828451883,
3217
+ "grad_norm": 3.676478624343872,
3218
+ "learning_rate": 1.0745217381310562e-05,
3219
+ "loss": 0.2493,
3220
+ "step": 421
3221
+ },
3222
+ {
3223
+ "epoch": 0.5297071129707113,
3224
+ "grad_norm": 2.608015775680542,
3225
+ "learning_rate": 1.0701455360752038e-05,
3226
+ "loss": 0.2325,
3227
+ "step": 422
3228
+ },
3229
+ {
3230
+ "epoch": 0.5309623430962344,
3231
+ "grad_norm": 3.949383497238159,
3232
+ "learning_rate": 1.0657679835863497e-05,
3233
+ "loss": 0.3002,
3234
+ "step": 423
3235
+ },
3236
+ {
3237
+ "epoch": 0.5322175732217573,
3238
+ "grad_norm": 4.198700904846191,
3239
+ "learning_rate": 1.0613891649405816e-05,
3240
+ "loss": 0.1867,
3241
+ "step": 424
3242
+ },
3243
+ {
3244
+ "epoch": 0.5334728033472803,
3245
+ "grad_norm": 4.353850364685059,
3246
+ "learning_rate": 1.0570091644383631e-05,
3247
+ "loss": 0.2605,
3248
+ "step": 425
3249
+ },
3250
+ {
3251
+ "epoch": 0.5347280334728034,
3252
+ "grad_norm": 3.590062379837036,
3253
+ "learning_rate": 1.0526280664029105e-05,
3254
+ "loss": 0.2438,
3255
+ "step": 426
3256
+ },
3257
+ {
3258
+ "epoch": 0.5359832635983264,
3259
+ "grad_norm": 3.3330376148223877,
3260
+ "learning_rate": 1.0482459551785705e-05,
3261
+ "loss": 0.1865,
3262
+ "step": 427
3263
+ },
3264
+ {
3265
+ "epoch": 0.5372384937238494,
3266
+ "grad_norm": 3.8933749198913574,
3267
+ "learning_rate": 1.0438629151291944e-05,
3268
+ "loss": 0.3091,
3269
+ "step": 428
3270
+ },
3271
+ {
3272
+ "epoch": 0.5384937238493723,
3273
+ "grad_norm": 3.451763153076172,
3274
+ "learning_rate": 1.0394790306365154e-05,
3275
+ "loss": 0.2454,
3276
+ "step": 429
3277
+ },
3278
+ {
3279
+ "epoch": 0.5397489539748954,
3280
+ "grad_norm": 3.9449098110198975,
3281
+ "learning_rate": 1.0350943860985249e-05,
3282
+ "loss": 0.2863,
3283
+ "step": 430
3284
+ },
3285
+ {
3286
+ "epoch": 0.5410041841004184,
3287
+ "grad_norm": 3.1234138011932373,
3288
+ "learning_rate": 1.0307090659278453e-05,
3289
+ "loss": 0.2382,
3290
+ "step": 431
3291
+ },
3292
+ {
3293
+ "epoch": 0.5422594142259414,
3294
+ "grad_norm": 4.012730598449707,
3295
+ "learning_rate": 1.0263231545501068e-05,
3296
+ "loss": 0.1927,
3297
+ "step": 432
3298
+ },
3299
+ {
3300
+ "epoch": 0.5435146443514645,
3301
+ "grad_norm": 3.3617918491363525,
3302
+ "learning_rate": 1.0219367364023216e-05,
3303
+ "loss": 0.2557,
3304
+ "step": 433
3305
+ },
3306
+ {
3307
+ "epoch": 0.5447698744769874,
3308
+ "grad_norm": 3.0772593021392822,
3309
+ "learning_rate": 1.0175498959312572e-05,
3310
+ "loss": 0.2144,
3311
+ "step": 434
3312
+ },
3313
+ {
3314
+ "epoch": 0.5460251046025104,
3315
+ "grad_norm": 3.1349897384643555,
3316
+ "learning_rate": 1.013162717591813e-05,
3317
+ "loss": 0.2583,
3318
+ "step": 435
3319
+ },
3320
+ {
3321
+ "epoch": 0.5472803347280335,
3322
+ "grad_norm": 3.4480161666870117,
3323
+ "learning_rate": 1.0087752858453923e-05,
3324
+ "loss": 0.2406,
3325
+ "step": 436
3326
+ },
3327
+ {
3328
+ "epoch": 0.5485355648535565,
3329
+ "grad_norm": 7.350139617919922,
3330
+ "learning_rate": 1.0043876851582763e-05,
3331
+ "loss": 0.2542,
3332
+ "step": 437
3333
+ },
3334
+ {
3335
+ "epoch": 0.5497907949790795,
3336
+ "grad_norm": 3.588282346725464,
3337
+ "learning_rate": 1e-05,
3338
+ "loss": 0.2763,
3339
+ "step": 438
3340
+ },
3341
+ {
3342
+ "epoch": 0.5510460251046025,
3343
+ "grad_norm": 4.360295295715332,
3344
+ "learning_rate": 9.956123148417239e-06,
3345
+ "loss": 0.3057,
3346
+ "step": 439
3347
+ },
3348
+ {
3349
+ "epoch": 0.5523012552301255,
3350
+ "grad_norm": 2.705050468444824,
3351
+ "learning_rate": 9.91224714154608e-06,
3352
+ "loss": 0.2248,
3353
+ "step": 440
3354
+ },
3355
+ {
3356
+ "epoch": 0.5523012552301255,
3357
+ "eval_accuracy": 0.8532008830022075,
3358
+ "eval_f1": 0.6928406466512702,
3359
+ "eval_loss": 0.3030659258365631,
3360
+ "eval_precision": 0.8379888268156425,
3361
+ "eval_recall": 0.5905511811023622,
3362
+ "eval_runtime": 53.3187,
3363
+ "eval_samples_per_second": 5.195,
3364
+ "eval_steps_per_second": 0.169,
3365
+ "step": 440
3366
+ },
3367
+ {
3368
+ "epoch": 0.5535564853556485,
3369
+ "grad_norm": 8.555684089660645,
3370
+ "learning_rate": 9.86837282408187e-06,
3371
+ "loss": 0.2456,
3372
+ "step": 441
3373
+ },
3374
+ {
3375
+ "epoch": 0.5548117154811716,
3376
+ "grad_norm": 4.036064624786377,
3377
+ "learning_rate": 9.82450104068743e-06,
3378
+ "loss": 0.2197,
3379
+ "step": 442
3380
+ },
3381
+ {
3382
+ "epoch": 0.5560669456066946,
3383
+ "grad_norm": 3.9498164653778076,
3384
+ "learning_rate": 9.78063263597679e-06,
3385
+ "loss": 0.2475,
3386
+ "step": 443
3387
+ },
3388
+ {
3389
+ "epoch": 0.5573221757322175,
3390
+ "grad_norm": 5.120787620544434,
3391
+ "learning_rate": 9.736768454498935e-06,
3392
+ "loss": 0.2361,
3393
+ "step": 444
3394
+ },
3395
+ {
3396
+ "epoch": 0.5585774058577406,
3397
+ "grad_norm": 3.8310952186584473,
3398
+ "learning_rate": 9.692909340721549e-06,
3399
+ "loss": 0.1948,
3400
+ "step": 445
3401
+ },
3402
+ {
3403
+ "epoch": 0.5598326359832636,
3404
+ "grad_norm": 7.724740982055664,
3405
+ "learning_rate": 9.649056139014754e-06,
3406
+ "loss": 0.2821,
3407
+ "step": 446
3408
+ },
3409
+ {
3410
+ "epoch": 0.5610878661087866,
3411
+ "grad_norm": 3.3741142749786377,
3412
+ "learning_rate": 9.605209693634849e-06,
3413
+ "loss": 0.2063,
3414
+ "step": 447
3415
+ },
3416
+ {
3417
+ "epoch": 0.5623430962343097,
3418
+ "grad_norm": 4.161517143249512,
3419
+ "learning_rate": 9.561370848708061e-06,
3420
+ "loss": 0.2638,
3421
+ "step": 448
3422
+ },
3423
+ {
3424
+ "epoch": 0.5635983263598326,
3425
+ "grad_norm": 4.094989776611328,
3426
+ "learning_rate": 9.517540448214299e-06,
3427
+ "loss": 0.2374,
3428
+ "step": 449
3429
+ },
3430
+ {
3431
+ "epoch": 0.5648535564853556,
3432
+ "grad_norm": 5.402541160583496,
3433
+ "learning_rate": 9.473719335970896e-06,
3434
+ "loss": 0.2456,
3435
+ "step": 450
3436
+ },
3437
+ {
3438
+ "epoch": 0.5661087866108787,
3439
+ "grad_norm": 3.0757880210876465,
3440
+ "learning_rate": 9.429908355616372e-06,
3441
+ "loss": 0.1735,
3442
+ "step": 451
3443
+ },
3444
+ {
3445
+ "epoch": 0.5673640167364017,
3446
+ "grad_norm": 3.2094337940216064,
3447
+ "learning_rate": 9.38610835059419e-06,
3448
+ "loss": 0.1759,
3449
+ "step": 452
3450
+ },
3451
+ {
3452
+ "epoch": 0.5686192468619247,
3453
+ "grad_norm": 6.4844489097595215,
3454
+ "learning_rate": 9.342320164136506e-06,
3455
+ "loss": 0.274,
3456
+ "step": 453
3457
+ },
3458
+ {
3459
+ "epoch": 0.5698744769874478,
3460
+ "grad_norm": 3.759528636932373,
3461
+ "learning_rate": 9.298544639247965e-06,
3462
+ "loss": 0.2074,
3463
+ "step": 454
3464
+ },
3465
+ {
3466
+ "epoch": 0.5711297071129707,
3467
+ "grad_norm": 4.877233028411865,
3468
+ "learning_rate": 9.25478261868944e-06,
3469
+ "loss": 0.2442,
3470
+ "step": 455
3471
+ },
3472
+ {
3473
+ "epoch": 0.5723849372384937,
3474
+ "grad_norm": 4.176396369934082,
3475
+ "learning_rate": 9.211034944961825e-06,
3476
+ "loss": 0.2082,
3477
+ "step": 456
3478
+ },
3479
+ {
3480
+ "epoch": 0.5736401673640167,
3481
+ "grad_norm": 4.096301078796387,
3482
+ "learning_rate": 9.167302460289804e-06,
3483
+ "loss": 0.2556,
3484
+ "step": 457
3485
+ },
3486
+ {
3487
+ "epoch": 0.5748953974895398,
3488
+ "grad_norm": 3.9278416633605957,
3489
+ "learning_rate": 9.123586006605658e-06,
3490
+ "loss": 0.1989,
3491
+ "step": 458
3492
+ },
3493
+ {
3494
+ "epoch": 0.5761506276150627,
3495
+ "grad_norm": 6.082350254058838,
3496
+ "learning_rate": 9.079886425533026e-06,
3497
+ "loss": 0.3095,
3498
+ "step": 459
3499
+ },
3500
+ {
3501
+ "epoch": 0.5774058577405857,
3502
+ "grad_norm": 4.512117862701416,
3503
+ "learning_rate": 9.036204558370725e-06,
3504
+ "loss": 0.2149,
3505
+ "step": 460
3506
+ },
3507
+ {
3508
+ "epoch": 0.5774058577405857,
3509
+ "eval_accuracy": 0.8609271523178808,
3510
+ "eval_f1": 0.7307692307692307,
3511
+ "eval_loss": 0.28679677844047546,
3512
+ "eval_precision": 0.7990654205607477,
3513
+ "eval_recall": 0.6732283464566929,
3514
+ "eval_runtime": 52.9315,
3515
+ "eval_samples_per_second": 5.233,
3516
+ "eval_steps_per_second": 0.17,
3517
+ "step": 460
3518
+ },
3519
+ {
3520
+ "epoch": 0.5786610878661088,
3521
+ "grad_norm": 5.169907569885254,
3522
+ "learning_rate": 8.992541246076552e-06,
3523
+ "loss": 0.2715,
3524
+ "step": 461
3525
+ },
3526
+ {
3527
+ "epoch": 0.5799163179916318,
3528
+ "grad_norm": 7.113213539123535,
3529
+ "learning_rate": 8.948897329251066e-06,
3530
+ "loss": 0.2462,
3531
+ "step": 462
3532
+ },
3533
+ {
3534
+ "epoch": 0.5811715481171548,
3535
+ "grad_norm": 6.822581768035889,
3536
+ "learning_rate": 8.905273648121455e-06,
3537
+ "loss": 0.2062,
3538
+ "step": 463
3539
+ },
3540
+ {
3541
+ "epoch": 0.5824267782426779,
3542
+ "grad_norm": 4.880428314208984,
3543
+ "learning_rate": 8.861671042525312e-06,
3544
+ "loss": 0.2825,
3545
+ "step": 464
3546
+ },
3547
+ {
3548
+ "epoch": 0.5836820083682008,
3549
+ "grad_norm": 4.777677536010742,
3550
+ "learning_rate": 8.818090351894492e-06,
3551
+ "loss": 0.2439,
3552
+ "step": 465
3553
+ },
3554
+ {
3555
+ "epoch": 0.5849372384937238,
3556
+ "grad_norm": 4.444671154022217,
3557
+ "learning_rate": 8.774532415238954e-06,
3558
+ "loss": 0.2612,
3559
+ "step": 466
3560
+ },
3561
+ {
3562
+ "epoch": 0.5861924686192469,
3563
+ "grad_norm": 4.537267208099365,
3564
+ "learning_rate": 8.730998071130589e-06,
3565
+ "loss": 0.2002,
3566
+ "step": 467
3567
+ },
3568
+ {
3569
+ "epoch": 0.5874476987447699,
3570
+ "grad_norm": 2.443470001220703,
3571
+ "learning_rate": 8.68748815768709e-06,
3572
+ "loss": 0.205,
3573
+ "step": 468
3574
+ },
3575
+ {
3576
+ "epoch": 0.5887029288702929,
3577
+ "grad_norm": 3.0930683612823486,
3578
+ "learning_rate": 8.64400351255582e-06,
3579
+ "loss": 0.1858,
3580
+ "step": 469
3581
+ },
3582
+ {
3583
+ "epoch": 0.5899581589958159,
3584
+ "grad_norm": 4.16900634765625,
3585
+ "learning_rate": 8.600544972897678e-06,
3586
+ "loss": 0.2589,
3587
+ "step": 470
3588
+ },
3589
+ {
3590
+ "epoch": 0.5912133891213389,
3591
+ "grad_norm": 3.5443623065948486,
3592
+ "learning_rate": 8.55711337537097e-06,
3593
+ "loss": 0.2245,
3594
+ "step": 471
3595
+ },
3596
+ {
3597
+ "epoch": 0.5924686192468619,
3598
+ "grad_norm": 4.173429012298584,
3599
+ "learning_rate": 8.513709556115335e-06,
3600
+ "loss": 0.2116,
3601
+ "step": 472
3602
+ },
3603
+ {
3604
+ "epoch": 0.593723849372385,
3605
+ "grad_norm": 4.342430114746094,
3606
+ "learning_rate": 8.470334350735615e-06,
3607
+ "loss": 0.2919,
3608
+ "step": 473
3609
+ },
3610
+ {
3611
+ "epoch": 0.594979079497908,
3612
+ "grad_norm": 3.233147144317627,
3613
+ "learning_rate": 8.42698859428579e-06,
3614
+ "loss": 0.2359,
3615
+ "step": 474
3616
+ },
3617
+ {
3618
+ "epoch": 0.5962343096234309,
3619
+ "grad_norm": 3.438584327697754,
3620
+ "learning_rate": 8.383673121252887e-06,
3621
+ "loss": 0.2049,
3622
+ "step": 475
3623
+ },
3624
+ {
3625
+ "epoch": 0.597489539748954,
3626
+ "grad_norm": 6.184849739074707,
3627
+ "learning_rate": 8.340388765540923e-06,
3628
+ "loss": 0.2746,
3629
+ "step": 476
3630
+ },
3631
+ {
3632
+ "epoch": 0.598744769874477,
3633
+ "grad_norm": 3.2504115104675293,
3634
+ "learning_rate": 8.297136360454844e-06,
3635
+ "loss": 0.2135,
3636
+ "step": 477
3637
+ },
3638
+ {
3639
+ "epoch": 0.6,
3640
+ "grad_norm": 5.307207107543945,
3641
+ "learning_rate": 8.253916738684497e-06,
3642
+ "loss": 0.2292,
3643
+ "step": 478
3644
+ },
3645
+ {
3646
+ "epoch": 0.6012552301255231,
3647
+ "grad_norm": 8.800402641296387,
3648
+ "learning_rate": 8.21073073228858e-06,
3649
+ "loss": 0.3836,
3650
+ "step": 479
3651
+ },
3652
+ {
3653
+ "epoch": 0.602510460251046,
3654
+ "grad_norm": 3.201681613922119,
3655
+ "learning_rate": 8.16757917267863e-06,
3656
+ "loss": 0.1998,
3657
+ "step": 480
3658
+ },
3659
+ {
3660
+ "epoch": 0.602510460251046,
3661
+ "eval_accuracy": 0.8587196467991169,
3662
+ "eval_f1": 0.7117117117117117,
3663
+ "eval_loss": 0.2975335419178009,
3664
+ "eval_precision": 0.8315789473684211,
3665
+ "eval_recall": 0.6220472440944882,
3666
+ "eval_runtime": 52.1942,
3667
+ "eval_samples_per_second": 5.307,
3668
+ "eval_steps_per_second": 0.172,
3669
+ "step": 480
3670
+ },
3671
+ {
3672
+ "epoch": 0.603765690376569,
3673
+ "grad_norm": 4.214263916015625,
3674
+ "learning_rate": 8.124462890603027e-06,
3675
+ "loss": 0.2576,
3676
+ "step": 481
3677
+ },
3678
+ {
3679
+ "epoch": 0.605020920502092,
3680
+ "grad_norm": 4.688704490661621,
3681
+ "learning_rate": 8.081382716130982e-06,
3682
+ "loss": 0.2174,
3683
+ "step": 482
3684
+ },
3685
+ {
3686
+ "epoch": 0.6062761506276151,
3687
+ "grad_norm": 4.108887672424316,
3688
+ "learning_rate": 8.038339478636581e-06,
3689
+ "loss": 0.2709,
3690
+ "step": 483
3691
+ },
3692
+ {
3693
+ "epoch": 0.6075313807531381,
3694
+ "grad_norm": 3.246297597885132,
3695
+ "learning_rate": 7.995334006782793e-06,
3696
+ "loss": 0.2215,
3697
+ "step": 484
3698
+ },
3699
+ {
3700
+ "epoch": 0.608786610878661,
3701
+ "grad_norm": 4.0921454429626465,
3702
+ "learning_rate": 7.95236712850553e-06,
3703
+ "loss": 0.2766,
3704
+ "step": 485
3705
+ },
3706
+ {
3707
+ "epoch": 0.6100418410041841,
3708
+ "grad_norm": 3.6497576236724854,
3709
+ "learning_rate": 7.909439670997706e-06,
3710
+ "loss": 0.2337,
3711
+ "step": 486
3712
+ },
3713
+ {
3714
+ "epoch": 0.6112970711297071,
3715
+ "grad_norm": 3.3551504611968994,
3716
+ "learning_rate": 7.866552460693314e-06,
3717
+ "loss": 0.208,
3718
+ "step": 487
3719
+ },
3720
+ {
3721
+ "epoch": 0.6125523012552301,
3722
+ "grad_norm": 4.46877384185791,
3723
+ "learning_rate": 7.823706323251512e-06,
3724
+ "loss": 0.2493,
3725
+ "step": 488
3726
+ },
3727
+ {
3728
+ "epoch": 0.6138075313807532,
3729
+ "grad_norm": 4.38779878616333,
3730
+ "learning_rate": 7.78090208354072e-06,
3731
+ "loss": 0.2635,
3732
+ "step": 489
3733
+ },
3734
+ {
3735
+ "epoch": 0.6150627615062761,
3736
+ "grad_norm": 5.72041130065918,
3737
+ "learning_rate": 7.738140565622758e-06,
3738
+ "loss": 0.2348,
3739
+ "step": 490
3740
+ },
3741
+ {
3742
+ "epoch": 0.6163179916317991,
3743
+ "grad_norm": 5.17399263381958,
3744
+ "learning_rate": 7.69542259273697e-06,
3745
+ "loss": 0.2208,
3746
+ "step": 491
3747
+ },
3748
+ {
3749
+ "epoch": 0.6175732217573222,
3750
+ "grad_norm": 3.7989094257354736,
3751
+ "learning_rate": 7.652748987284375e-06,
3752
+ "loss": 0.256,
3753
+ "step": 492
3754
+ },
3755
+ {
3756
+ "epoch": 0.6188284518828452,
3757
+ "grad_norm": 3.5693840980529785,
3758
+ "learning_rate": 7.610120570811833e-06,
3759
+ "loss": 0.2408,
3760
+ "step": 493
3761
+ },
3762
+ {
3763
+ "epoch": 0.6200836820083682,
3764
+ "grad_norm": 3.3571958541870117,
3765
+ "learning_rate": 7.567538163996237e-06,
3766
+ "loss": 0.2263,
3767
+ "step": 494
3768
+ },
3769
+ {
3770
+ "epoch": 0.6213389121338913,
3771
+ "grad_norm": 5.469134330749512,
3772
+ "learning_rate": 7.525002586628707e-06,
3773
+ "loss": 0.2335,
3774
+ "step": 495
3775
+ },
3776
+ {
3777
+ "epoch": 0.6225941422594142,
3778
+ "grad_norm": 2.846597909927368,
3779
+ "learning_rate": 7.4825146575988e-06,
3780
+ "loss": 0.204,
3781
+ "step": 496
3782
+ },
3783
+ {
3784
+ "epoch": 0.6238493723849372,
3785
+ "grad_norm": 6.401832103729248,
3786
+ "learning_rate": 7.440075194878769e-06,
3787
+ "loss": 0.2643,
3788
+ "step": 497
3789
+ },
3790
+ {
3791
+ "epoch": 0.6251046025104603,
3792
+ "grad_norm": 4.148714065551758,
3793
+ "learning_rate": 7.397685015507781e-06,
3794
+ "loss": 0.1882,
3795
+ "step": 498
3796
+ },
3797
+ {
3798
+ "epoch": 0.6263598326359833,
3799
+ "grad_norm": 3.8023147583007812,
3800
+ "learning_rate": 7.355344935576221e-06,
3801
+ "loss": 0.179,
3802
+ "step": 499
3803
+ },
3804
+ {
3805
+ "epoch": 0.6276150627615062,
3806
+ "grad_norm": 3.7073490619659424,
3807
+ "learning_rate": 7.313055770209961e-06,
3808
+ "loss": 0.2459,
3809
+ "step": 500
3810
+ },
3811
+ {
3812
+ "epoch": 0.6276150627615062,
3813
+ "eval_accuracy": 0.8509933774834437,
3814
+ "eval_f1": 0.6882217090069284,
3815
+ "eval_loss": 0.29779428243637085,
3816
+ "eval_precision": 0.8324022346368715,
3817
+ "eval_recall": 0.5866141732283464,
3818
+ "eval_runtime": 52.9024,
3819
+ "eval_samples_per_second": 5.236,
3820
+ "eval_steps_per_second": 0.17,
3821
+ "step": 500
3822
  }
3823
  ],
3824
  "logging_steps": 1,
 
3838
  "attributes": {}
3839
  }
3840
  },
3841
+ "total_flos": 1.8275482733012582e+17,
3842
  "train_batch_size": 6,
3843
  "trial_name": null,
3844
  "trial_params": null