mtzig commited on
Commit
2ff964d
·
verified ·
1 Parent(s): ed54abb

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2685cd63188a12b6e51a5f87202565fae64cf192dedbad3017d3d9a00011716
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee6fb7682ac2960073f7f7f514a487e812e315d44419de7236d03c8ab15aadb0
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a564ea22e8120a98d07382ac39eaaeaf5ddb197a900e74e310ad7f4d8286fb1e
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e008363e8993235e6eb2af6c9a5ff56f447d8bdc2cf16eca2f0422b1ede8c6
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc676c0cbe36a418bebb6a770754ec6e6b4550ec6cc2173fee8f8473cfd8acb2
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b477d5172b476bbf9b578ef9878ce39658de519ce36fd3743830da1d68fefd
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e0189f4aacb249403224b0e30e9c8aa2b90c60a248a90d9f5fdee546b7c9452
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145633610b9b152b0d2cfc8f1f6615f8471936dfa77ce591ae7e2e811ac751bb
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edae67eb4587445733f88b71e1245cfb3f5164c085e6d89fa55d75c346a20afb
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265e64cfbd04131bffb2a5617a8c2c4c2ea31a7fad0d26e752c26ca24a5c050c
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee7eeca1e6b6ba7221886609291f76f4c0ee14fdab353839097d8c6fbf7ee572
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a960df6918c81f11623085f85ea94df1837af4b8f4267a1629fb6bab5c469523
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:daabfcc6e1a3483f4db21790ddb98974904e8bc9e77135d80c25a77935fcb7db
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5862dd9e056331bb039021a6505bb7a3f1fa98b8a9f5c3fda8a0888d65d5b053
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c434609d68ad98f7889ff98e8f1dc2f6ac56615be0728c4c1a08a0e95e8e983
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc9bd789414240ee2b44965c46a333328df7f9fce29ffffe29646f97e82894b8
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:672bc765d367d7d4ffae64a9a7720cba4580c54ee7b0c79f366c5fcfec50f7ae
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c60c3492b6f46efa4d46f08722f1ca59fb0c21715cf617886bb5ba859c7d9d5
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a9b6f6648a3802b1d7df894c5bbd13bfaaff3cdef6fb263aaa54636226f2ff6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b3388ef358d7f5f11e1879a04d7711953bb9f05b7b304558dfcff8e1df0bd0
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:742fcffc6aba3ef3530cc33f6ad6b89ccb93cdf34124275e474711f05d532257
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7c4655d165a45ceaebd2333d540d7ca911eb47056d42042ac402dbf7a3b1875
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56b41b47676e62b604b32ae1968e1fffa4fc215a0c59a21338a7a142638b146f
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:076dbf8750dbf683323b1e43171b411204fb0c6b72da57c223dae871c8d1c08e
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e3dd1bae2a933732e2e6fffe805496ee2173debbdafb8e9682fed9cdf6cd22a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5b83b9e4abe0e439076bc90dc25d49fcd5d4c81c9c012e6730d7f022accf132
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.37105751391465674,
5
  "eval_steps": 20,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3059,6 +3059,766 @@
3059
  "eval_samples_per_second": 5.761,
3060
  "eval_steps_per_second": 0.188,
3061
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3062
  }
3063
  ],
3064
  "logging_steps": 1,
@@ -3078,7 +3838,7 @@
3078
  "attributes": {}
3079
  }
3080
  },
3081
- "total_flos": 1.2786427319078093e+17,
3082
  "train_batch_size": 8,
3083
  "trial_name": null,
3084
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.46382189239332094,
5
  "eval_steps": 20,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3059
  "eval_samples_per_second": 5.761,
3060
  "eval_steps_per_second": 0.188,
3061
  "step": 400
3062
+ },
3063
+ {
3064
+ "epoch": 0.3719851576994434,
3065
+ "grad_norm": 4.4434332847595215,
3066
+ "learning_rate": 1.582532541615122e-05,
3067
+ "loss": 0.1563,
3068
+ "step": 401
3069
+ },
3070
+ {
3071
+ "epoch": 0.37291280148423006,
3072
+ "grad_norm": 2.928924083709717,
3073
+ "learning_rate": 1.57989700674967e-05,
3074
+ "loss": 0.1211,
3075
+ "step": 402
3076
+ },
3077
+ {
3078
+ "epoch": 0.3738404452690167,
3079
+ "grad_norm": 3.329848527908325,
3080
+ "learning_rate": 1.5772553890390196e-05,
3081
+ "loss": 0.2248,
3082
+ "step": 403
3083
+ },
3084
+ {
3085
+ "epoch": 0.3747680890538033,
3086
+ "grad_norm": 4.852266311645508,
3087
+ "learning_rate": 1.5746077161924905e-05,
3088
+ "loss": 0.2121,
3089
+ "step": 404
3090
+ },
3091
+ {
3092
+ "epoch": 0.37569573283859,
3093
+ "grad_norm": 8.039565086364746,
3094
+ "learning_rate": 1.5719540159829185e-05,
3095
+ "loss": 0.1864,
3096
+ "step": 405
3097
+ },
3098
+ {
3099
+ "epoch": 0.37662337662337664,
3100
+ "grad_norm": 4.178286552429199,
3101
+ "learning_rate": 1.5692943162463628e-05,
3102
+ "loss": 0.1164,
3103
+ "step": 406
3104
+ },
3105
+ {
3106
+ "epoch": 0.37755102040816324,
3107
+ "grad_norm": 4.181092739105225,
3108
+ "learning_rate": 1.5666286448818152e-05,
3109
+ "loss": 0.136,
3110
+ "step": 407
3111
+ },
3112
+ {
3113
+ "epoch": 0.3784786641929499,
3114
+ "grad_norm": 4.847424030303955,
3115
+ "learning_rate": 1.5639570298509067e-05,
3116
+ "loss": 0.1692,
3117
+ "step": 408
3118
+ },
3119
+ {
3120
+ "epoch": 0.37940630797773656,
3121
+ "grad_norm": 5.589816570281982,
3122
+ "learning_rate": 1.5612794991776147e-05,
3123
+ "loss": 0.1375,
3124
+ "step": 409
3125
+ },
3126
+ {
3127
+ "epoch": 0.3803339517625232,
3128
+ "grad_norm": 3.174445629119873,
3129
+ "learning_rate": 1.5585960809479698e-05,
3130
+ "loss": 0.1683,
3131
+ "step": 410
3132
+ },
3133
+ {
3134
+ "epoch": 0.3812615955473098,
3135
+ "grad_norm": 7.2739739418029785,
3136
+ "learning_rate": 1.5559068033097583e-05,
3137
+ "loss": 0.2135,
3138
+ "step": 411
3139
+ },
3140
+ {
3141
+ "epoch": 0.3821892393320965,
3142
+ "grad_norm": 5.514982223510742,
3143
+ "learning_rate": 1.5532116944722308e-05,
3144
+ "loss": 0.2366,
3145
+ "step": 412
3146
+ },
3147
+ {
3148
+ "epoch": 0.38311688311688313,
3149
+ "grad_norm": 4.4258036613464355,
3150
+ "learning_rate": 1.5505107827058038e-05,
3151
+ "loss": 0.1667,
3152
+ "step": 413
3153
+ },
3154
+ {
3155
+ "epoch": 0.38404452690166974,
3156
+ "grad_norm": 4.089566230773926,
3157
+ "learning_rate": 1.547804096341763e-05,
3158
+ "loss": 0.1647,
3159
+ "step": 414
3160
+ },
3161
+ {
3162
+ "epoch": 0.3849721706864564,
3163
+ "grad_norm": 3.6559548377990723,
3164
+ "learning_rate": 1.5450916637719683e-05,
3165
+ "loss": 0.1705,
3166
+ "step": 415
3167
+ },
3168
+ {
3169
+ "epoch": 0.38589981447124305,
3170
+ "grad_norm": 4.045246124267578,
3171
+ "learning_rate": 1.5423735134485537e-05,
3172
+ "loss": 0.1915,
3173
+ "step": 416
3174
+ },
3175
+ {
3176
+ "epoch": 0.3868274582560297,
3177
+ "grad_norm": 3.6718714237213135,
3178
+ "learning_rate": 1.5396496738836292e-05,
3179
+ "loss": 0.1532,
3180
+ "step": 417
3181
+ },
3182
+ {
3183
+ "epoch": 0.3877551020408163,
3184
+ "grad_norm": 4.593350887298584,
3185
+ "learning_rate": 1.536920173648984e-05,
3186
+ "loss": 0.1815,
3187
+ "step": 418
3188
+ },
3189
+ {
3190
+ "epoch": 0.38868274582560297,
3191
+ "grad_norm": 3.913667917251587,
3192
+ "learning_rate": 1.5341850413757834e-05,
3193
+ "loss": 0.1448,
3194
+ "step": 419
3195
+ },
3196
+ {
3197
+ "epoch": 0.38961038961038963,
3198
+ "grad_norm": 4.148723125457764,
3199
+ "learning_rate": 1.5314443057542703e-05,
3200
+ "loss": 0.2082,
3201
+ "step": 420
3202
+ },
3203
+ {
3204
+ "epoch": 0.38961038961038963,
3205
+ "eval_accuracy": 0.8603104212860311,
3206
+ "eval_f1": 0.7136363636363636,
3207
+ "eval_loss": 0.31429344415664673,
3208
+ "eval_precision": 0.839572192513369,
3209
+ "eval_recall": 0.6205533596837944,
3210
+ "eval_runtime": 48.5179,
3211
+ "eval_samples_per_second": 5.689,
3212
+ "eval_steps_per_second": 0.185,
3213
+ "step": 420
3214
+ },
3215
+ {
3216
+ "epoch": 0.39053803339517623,
3217
+ "grad_norm": 4.14030122756958,
3218
+ "learning_rate": 1.5286979955334655e-05,
3219
+ "loss": 0.1749,
3220
+ "step": 421
3221
+ },
3222
+ {
3223
+ "epoch": 0.3914656771799629,
3224
+ "grad_norm": 3.4345805644989014,
3225
+ "learning_rate": 1.5259461395208628e-05,
3226
+ "loss": 0.1712,
3227
+ "step": 422
3228
+ },
3229
+ {
3230
+ "epoch": 0.39239332096474955,
3231
+ "grad_norm": 3.9379382133483887,
3232
+ "learning_rate": 1.52318876658213e-05,
3233
+ "loss": 0.1658,
3234
+ "step": 423
3235
+ },
3236
+ {
3237
+ "epoch": 0.39332096474953615,
3238
+ "grad_norm": 3.1555135250091553,
3239
+ "learning_rate": 1.5204259056408046e-05,
3240
+ "loss": 0.1537,
3241
+ "step": 424
3242
+ },
3243
+ {
3244
+ "epoch": 0.3942486085343228,
3245
+ "grad_norm": 3.893655776977539,
3246
+ "learning_rate": 1.5176575856779904e-05,
3247
+ "loss": 0.1509,
3248
+ "step": 425
3249
+ },
3250
+ {
3251
+ "epoch": 0.39517625231910947,
3252
+ "grad_norm": 4.423066139221191,
3253
+ "learning_rate": 1.5148838357320537e-05,
3254
+ "loss": 0.1501,
3255
+ "step": 426
3256
+ },
3257
+ {
3258
+ "epoch": 0.3961038961038961,
3259
+ "grad_norm": 5.078852653503418,
3260
+ "learning_rate": 1.512104684898319e-05,
3261
+ "loss": 0.2835,
3262
+ "step": 427
3263
+ },
3264
+ {
3265
+ "epoch": 0.3970315398886827,
3266
+ "grad_norm": 3.377333164215088,
3267
+ "learning_rate": 1.5093201623287631e-05,
3268
+ "loss": 0.1393,
3269
+ "step": 428
3270
+ },
3271
+ {
3272
+ "epoch": 0.3979591836734694,
3273
+ "grad_norm": 4.0545196533203125,
3274
+ "learning_rate": 1.5065302972317108e-05,
3275
+ "loss": 0.1567,
3276
+ "step": 429
3277
+ },
3278
+ {
3279
+ "epoch": 0.39888682745825604,
3280
+ "grad_norm": 6.253279685974121,
3281
+ "learning_rate": 1.5037351188715265e-05,
3282
+ "loss": 0.2155,
3283
+ "step": 430
3284
+ },
3285
+ {
3286
+ "epoch": 0.39981447124304265,
3287
+ "grad_norm": 2.698172092437744,
3288
+ "learning_rate": 1.5009346565683088e-05,
3289
+ "loss": 0.1102,
3290
+ "step": 431
3291
+ },
3292
+ {
3293
+ "epoch": 0.4007421150278293,
3294
+ "grad_norm": 4.845376014709473,
3295
+ "learning_rate": 1.4981289396975818e-05,
3296
+ "loss": 0.2186,
3297
+ "step": 432
3298
+ },
3299
+ {
3300
+ "epoch": 0.40166975881261596,
3301
+ "grad_norm": 2.9927968978881836,
3302
+ "learning_rate": 1.4953179976899878e-05,
3303
+ "loss": 0.1365,
3304
+ "step": 433
3305
+ },
3306
+ {
3307
+ "epoch": 0.4025974025974026,
3308
+ "grad_norm": 3.484133720397949,
3309
+ "learning_rate": 1.4925018600309784e-05,
3310
+ "loss": 0.18,
3311
+ "step": 434
3312
+ },
3313
+ {
3314
+ "epoch": 0.4035250463821892,
3315
+ "grad_norm": 2.8350846767425537,
3316
+ "learning_rate": 1.4896805562605052e-05,
3317
+ "loss": 0.1644,
3318
+ "step": 435
3319
+ },
3320
+ {
3321
+ "epoch": 0.4044526901669759,
3322
+ "grad_norm": 3.7044577598571777,
3323
+ "learning_rate": 1.4868541159727097e-05,
3324
+ "loss": 0.1758,
3325
+ "step": 436
3326
+ },
3327
+ {
3328
+ "epoch": 0.40538033395176254,
3329
+ "grad_norm": 3.7919528484344482,
3330
+ "learning_rate": 1.4840225688156132e-05,
3331
+ "loss": 0.1609,
3332
+ "step": 437
3333
+ },
3334
+ {
3335
+ "epoch": 0.40630797773654914,
3336
+ "grad_norm": 3.7263407707214355,
3337
+ "learning_rate": 1.4811859444908053e-05,
3338
+ "loss": 0.1674,
3339
+ "step": 438
3340
+ },
3341
+ {
3342
+ "epoch": 0.4072356215213358,
3343
+ "grad_norm": 3.5427684783935547,
3344
+ "learning_rate": 1.4783442727531328e-05,
3345
+ "loss": 0.212,
3346
+ "step": 439
3347
+ },
3348
+ {
3349
+ "epoch": 0.40816326530612246,
3350
+ "grad_norm": 5.29186487197876,
3351
+ "learning_rate": 1.4754975834103877e-05,
3352
+ "loss": 0.2051,
3353
+ "step": 440
3354
+ },
3355
+ {
3356
+ "epoch": 0.40816326530612246,
3357
+ "eval_accuracy": 0.8569844789356984,
3358
+ "eval_f1": 0.6978922716627635,
3359
+ "eval_loss": 0.3138832747936249,
3360
+ "eval_precision": 0.8563218390804598,
3361
+ "eval_recall": 0.5889328063241107,
3362
+ "eval_runtime": 48.1239,
3363
+ "eval_samples_per_second": 5.735,
3364
+ "eval_steps_per_second": 0.187,
3365
+ "step": 440
3366
+ },
3367
+ {
3368
+ "epoch": 0.4090909090909091,
3369
+ "grad_norm": 6.476135730743408,
3370
+ "learning_rate": 1.4726459063229946e-05,
3371
+ "loss": 0.1586,
3372
+ "step": 441
3373
+ },
3374
+ {
3375
+ "epoch": 0.4100185528756957,
3376
+ "grad_norm": 4.70169734954834,
3377
+ "learning_rate": 1.4697892714036959e-05,
3378
+ "loss": 0.1604,
3379
+ "step": 442
3380
+ },
3381
+ {
3382
+ "epoch": 0.4109461966604824,
3383
+ "grad_norm": 3.1262881755828857,
3384
+ "learning_rate": 1.4669277086172406e-05,
3385
+ "loss": 0.1282,
3386
+ "step": 443
3387
+ },
3388
+ {
3389
+ "epoch": 0.41187384044526903,
3390
+ "grad_norm": 3.5137503147125244,
3391
+ "learning_rate": 1.4640612479800686e-05,
3392
+ "loss": 0.179,
3393
+ "step": 444
3394
+ },
3395
+ {
3396
+ "epoch": 0.41280148423005564,
3397
+ "grad_norm": 5.388190269470215,
3398
+ "learning_rate": 1.4611899195599952e-05,
3399
+ "loss": 0.2404,
3400
+ "step": 445
3401
+ },
3402
+ {
3403
+ "epoch": 0.4137291280148423,
3404
+ "grad_norm": 4.668002128601074,
3405
+ "learning_rate": 1.4583137534758968e-05,
3406
+ "loss": 0.1781,
3407
+ "step": 446
3408
+ },
3409
+ {
3410
+ "epoch": 0.41465677179962895,
3411
+ "grad_norm": 4.363613128662109,
3412
+ "learning_rate": 1.455432779897395e-05,
3413
+ "loss": 0.1636,
3414
+ "step": 447
3415
+ },
3416
+ {
3417
+ "epoch": 0.4155844155844156,
3418
+ "grad_norm": 3.608628749847412,
3419
+ "learning_rate": 1.4525470290445392e-05,
3420
+ "loss": 0.1604,
3421
+ "step": 448
3422
+ },
3423
+ {
3424
+ "epoch": 0.4165120593692022,
3425
+ "grad_norm": 4.1195387840271,
3426
+ "learning_rate": 1.4496565311874902e-05,
3427
+ "loss": 0.1752,
3428
+ "step": 449
3429
+ },
3430
+ {
3431
+ "epoch": 0.4174397031539889,
3432
+ "grad_norm": 2.9940760135650635,
3433
+ "learning_rate": 1.4467613166462024e-05,
3434
+ "loss": 0.1579,
3435
+ "step": 450
3436
+ },
3437
+ {
3438
+ "epoch": 0.41836734693877553,
3439
+ "grad_norm": 3.3826465606689453,
3440
+ "learning_rate": 1.4438614157901073e-05,
3441
+ "loss": 0.1529,
3442
+ "step": 451
3443
+ },
3444
+ {
3445
+ "epoch": 0.41929499072356213,
3446
+ "grad_norm": 2.9892001152038574,
3447
+ "learning_rate": 1.4409568590377918e-05,
3448
+ "loss": 0.1435,
3449
+ "step": 452
3450
+ },
3451
+ {
3452
+ "epoch": 0.4202226345083488,
3453
+ "grad_norm": 3.4909090995788574,
3454
+ "learning_rate": 1.4380476768566825e-05,
3455
+ "loss": 0.1879,
3456
+ "step": 453
3457
+ },
3458
+ {
3459
+ "epoch": 0.42115027829313545,
3460
+ "grad_norm": 3.8178699016571045,
3461
+ "learning_rate": 1.4351338997627233e-05,
3462
+ "loss": 0.1693,
3463
+ "step": 454
3464
+ },
3465
+ {
3466
+ "epoch": 0.42207792207792205,
3467
+ "grad_norm": 3.101930618286133,
3468
+ "learning_rate": 1.4322155583200577e-05,
3469
+ "loss": 0.1315,
3470
+ "step": 455
3471
+ },
3472
+ {
3473
+ "epoch": 0.4230055658627087,
3474
+ "grad_norm": 3.344278335571289,
3475
+ "learning_rate": 1.429292683140706e-05,
3476
+ "loss": 0.1539,
3477
+ "step": 456
3478
+ },
3479
+ {
3480
+ "epoch": 0.42393320964749537,
3481
+ "grad_norm": 2.970942497253418,
3482
+ "learning_rate": 1.4263653048842461e-05,
3483
+ "loss": 0.1374,
3484
+ "step": 457
3485
+ },
3486
+ {
3487
+ "epoch": 0.424860853432282,
3488
+ "grad_norm": 3.1692464351654053,
3489
+ "learning_rate": 1.4234334542574906e-05,
3490
+ "loss": 0.2035,
3491
+ "step": 458
3492
+ },
3493
+ {
3494
+ "epoch": 0.42578849721706863,
3495
+ "grad_norm": 2.829171895980835,
3496
+ "learning_rate": 1.4204971620141648e-05,
3497
+ "loss": 0.1483,
3498
+ "step": 459
3499
+ },
3500
+ {
3501
+ "epoch": 0.4267161410018553,
3502
+ "grad_norm": 3.1080052852630615,
3503
+ "learning_rate": 1.4175564589545853e-05,
3504
+ "loss": 0.0959,
3505
+ "step": 460
3506
+ },
3507
+ {
3508
+ "epoch": 0.4267161410018553,
3509
+ "eval_accuracy": 0.8569844789356984,
3510
+ "eval_f1": 0.6993006993006993,
3511
+ "eval_loss": 0.3129526674747467,
3512
+ "eval_precision": 0.8522727272727273,
3513
+ "eval_recall": 0.5928853754940712,
3514
+ "eval_runtime": 48.0013,
3515
+ "eval_samples_per_second": 5.75,
3516
+ "eval_steps_per_second": 0.187,
3517
+ "step": 460
3518
+ },
3519
+ {
3520
+ "epoch": 0.42764378478664195,
3521
+ "grad_norm": 4.785598278045654,
3522
+ "learning_rate": 1.4146113759253362e-05,
3523
+ "loss": 0.1891,
3524
+ "step": 461
3525
+ },
3526
+ {
3527
+ "epoch": 0.42857142857142855,
3528
+ "grad_norm": 4.229030609130859,
3529
+ "learning_rate": 1.411661943818944e-05,
3530
+ "loss": 0.1546,
3531
+ "step": 462
3532
+ },
3533
+ {
3534
+ "epoch": 0.4294990723562152,
3535
+ "grad_norm": 4.43852424621582,
3536
+ "learning_rate": 1.4087081935735565e-05,
3537
+ "loss": 0.1769,
3538
+ "step": 463
3539
+ },
3540
+ {
3541
+ "epoch": 0.43042671614100186,
3542
+ "grad_norm": 4.6244049072265625,
3543
+ "learning_rate": 1.4057501561726157e-05,
3544
+ "loss": 0.2404,
3545
+ "step": 464
3546
+ },
3547
+ {
3548
+ "epoch": 0.4313543599257885,
3549
+ "grad_norm": 4.3941168785095215,
3550
+ "learning_rate": 1.4027878626445339e-05,
3551
+ "loss": 0.1781,
3552
+ "step": 465
3553
+ },
3554
+ {
3555
+ "epoch": 0.4322820037105751,
3556
+ "grad_norm": 6.995157718658447,
3557
+ "learning_rate": 1.3998213440623691e-05,
3558
+ "loss": 0.2318,
3559
+ "step": 466
3560
+ },
3561
+ {
3562
+ "epoch": 0.4332096474953618,
3563
+ "grad_norm": 2.781472682952881,
3564
+ "learning_rate": 1.3968506315434973e-05,
3565
+ "loss": 0.0949,
3566
+ "step": 467
3567
+ },
3568
+ {
3569
+ "epoch": 0.43413729128014844,
3570
+ "grad_norm": 2.9485185146331787,
3571
+ "learning_rate": 1.3938757562492873e-05,
3572
+ "loss": 0.1268,
3573
+ "step": 468
3574
+ },
3575
+ {
3576
+ "epoch": 0.43506493506493504,
3577
+ "grad_norm": 6.8653130531311035,
3578
+ "learning_rate": 1.390896749384773e-05,
3579
+ "loss": 0.2818,
3580
+ "step": 469
3581
+ },
3582
+ {
3583
+ "epoch": 0.4359925788497217,
3584
+ "grad_norm": 5.269331932067871,
3585
+ "learning_rate": 1.3879136421983265e-05,
3586
+ "loss": 0.1386,
3587
+ "step": 470
3588
+ },
3589
+ {
3590
+ "epoch": 0.43692022263450836,
3591
+ "grad_norm": 5.839674472808838,
3592
+ "learning_rate": 1.3849264659813314e-05,
3593
+ "loss": 0.1739,
3594
+ "step": 471
3595
+ },
3596
+ {
3597
+ "epoch": 0.437847866419295,
3598
+ "grad_norm": 4.306594371795654,
3599
+ "learning_rate": 1.3819352520678519e-05,
3600
+ "loss": 0.1562,
3601
+ "step": 472
3602
+ },
3603
+ {
3604
+ "epoch": 0.4387755102040816,
3605
+ "grad_norm": 4.80615234375,
3606
+ "learning_rate": 1.378940031834307e-05,
3607
+ "loss": 0.2049,
3608
+ "step": 473
3609
+ },
3610
+ {
3611
+ "epoch": 0.4397031539888683,
3612
+ "grad_norm": 4.824836730957031,
3613
+ "learning_rate": 1.3759408366991391e-05,
3614
+ "loss": 0.1667,
3615
+ "step": 474
3616
+ },
3617
+ {
3618
+ "epoch": 0.44063079777365494,
3619
+ "grad_norm": 4.227314472198486,
3620
+ "learning_rate": 1.3729376981224869e-05,
3621
+ "loss": 0.1783,
3622
+ "step": 475
3623
+ },
3624
+ {
3625
+ "epoch": 0.44155844155844154,
3626
+ "grad_norm": 4.170743942260742,
3627
+ "learning_rate": 1.3699306476058523e-05,
3628
+ "loss": 0.1724,
3629
+ "step": 476
3630
+ },
3631
+ {
3632
+ "epoch": 0.4424860853432282,
3633
+ "grad_norm": 6.098860263824463,
3634
+ "learning_rate": 1.3669197166917723e-05,
3635
+ "loss": 0.2301,
3636
+ "step": 477
3637
+ },
3638
+ {
3639
+ "epoch": 0.44341372912801486,
3640
+ "grad_norm": 3.2894742488861084,
3641
+ "learning_rate": 1.3639049369634878e-05,
3642
+ "loss": 0.1636,
3643
+ "step": 478
3644
+ },
3645
+ {
3646
+ "epoch": 0.44434137291280146,
3647
+ "grad_norm": 3.913362503051758,
3648
+ "learning_rate": 1.3608863400446113e-05,
3649
+ "loss": 0.1762,
3650
+ "step": 479
3651
+ },
3652
+ {
3653
+ "epoch": 0.4452690166975881,
3654
+ "grad_norm": 4.007498741149902,
3655
+ "learning_rate": 1.357863957598796e-05,
3656
+ "loss": 0.1955,
3657
+ "step": 480
3658
+ },
3659
+ {
3660
+ "epoch": 0.4452690166975881,
3661
+ "eval_accuracy": 0.8592017738359202,
3662
+ "eval_f1": 0.7080459770114943,
3663
+ "eval_loss": 0.3043546974658966,
3664
+ "eval_precision": 0.8461538461538461,
3665
+ "eval_recall": 0.6086956521739131,
3666
+ "eval_runtime": 48.2191,
3667
+ "eval_samples_per_second": 5.724,
3668
+ "eval_steps_per_second": 0.187,
3669
+ "step": 480
3670
+ },
3671
+ {
3672
+ "epoch": 0.4461966604823748,
3673
+ "grad_norm": 4.113521575927734,
3674
+ "learning_rate": 1.3548378213294042e-05,
3675
+ "loss": 0.1875,
3676
+ "step": 481
3677
+ },
3678
+ {
3679
+ "epoch": 0.44712430426716143,
3680
+ "grad_norm": 4.532953262329102,
3681
+ "learning_rate": 1.3518079629791725e-05,
3682
+ "loss": 0.1425,
3683
+ "step": 482
3684
+ },
3685
+ {
3686
+ "epoch": 0.44805194805194803,
3687
+ "grad_norm": 2.7280144691467285,
3688
+ "learning_rate": 1.3487744143298822e-05,
3689
+ "loss": 0.0783,
3690
+ "step": 483
3691
+ },
3692
+ {
3693
+ "epoch": 0.4489795918367347,
3694
+ "grad_norm": 3.562812566757202,
3695
+ "learning_rate": 1.345737207202023e-05,
3696
+ "loss": 0.1569,
3697
+ "step": 484
3698
+ },
3699
+ {
3700
+ "epoch": 0.44990723562152135,
3701
+ "grad_norm": 5.523044109344482,
3702
+ "learning_rate": 1.3426963734544601e-05,
3703
+ "loss": 0.2227,
3704
+ "step": 485
3705
+ },
3706
+ {
3707
+ "epoch": 0.45083487940630795,
3708
+ "grad_norm": 4.155531883239746,
3709
+ "learning_rate": 1.3396519449841006e-05,
3710
+ "loss": 0.2297,
3711
+ "step": 486
3712
+ },
3713
+ {
3714
+ "epoch": 0.4517625231910946,
3715
+ "grad_norm": 2.8421530723571777,
3716
+ "learning_rate": 1.3366039537255589e-05,
3717
+ "loss": 0.1699,
3718
+ "step": 487
3719
+ },
3720
+ {
3721
+ "epoch": 0.45269016697588127,
3722
+ "grad_norm": 3.845489263534546,
3723
+ "learning_rate": 1.3335524316508208e-05,
3724
+ "loss": 0.1773,
3725
+ "step": 488
3726
+ },
3727
+ {
3728
+ "epoch": 0.4536178107606679,
3729
+ "grad_norm": 3.0157673358917236,
3730
+ "learning_rate": 1.3304974107689088e-05,
3731
+ "loss": 0.1379,
3732
+ "step": 489
3733
+ },
3734
+ {
3735
+ "epoch": 0.45454545454545453,
3736
+ "grad_norm": 2.74364972114563,
3737
+ "learning_rate": 1.3274389231255466e-05,
3738
+ "loss": 0.1265,
3739
+ "step": 490
3740
+ },
3741
+ {
3742
+ "epoch": 0.4554730983302412,
3743
+ "grad_norm": 6.820478439331055,
3744
+ "learning_rate": 1.3243770008028225e-05,
3745
+ "loss": 0.1849,
3746
+ "step": 491
3747
+ },
3748
+ {
3749
+ "epoch": 0.45640074211502785,
3750
+ "grad_norm": 2.741809368133545,
3751
+ "learning_rate": 1.3213116759188525e-05,
3752
+ "loss": 0.1295,
3753
+ "step": 492
3754
+ },
3755
+ {
3756
+ "epoch": 0.45732838589981445,
3757
+ "grad_norm": 3.473961353302002,
3758
+ "learning_rate": 1.3182429806274442e-05,
3759
+ "loss": 0.099,
3760
+ "step": 493
3761
+ },
3762
+ {
3763
+ "epoch": 0.4582560296846011,
3764
+ "grad_norm": 5.271576881408691,
3765
+ "learning_rate": 1.3151709471177589e-05,
3766
+ "loss": 0.1753,
3767
+ "step": 494
3768
+ },
3769
+ {
3770
+ "epoch": 0.45918367346938777,
3771
+ "grad_norm": 6.4196577072143555,
3772
+ "learning_rate": 1.3120956076139746e-05,
3773
+ "loss": 0.2568,
3774
+ "step": 495
3775
+ },
3776
+ {
3777
+ "epoch": 0.4601113172541744,
3778
+ "grad_norm": 5.339606761932373,
3779
+ "learning_rate": 1.3090169943749475e-05,
3780
+ "loss": 0.1535,
3781
+ "step": 496
3782
+ },
3783
+ {
3784
+ "epoch": 0.461038961038961,
3785
+ "grad_norm": 4.3615593910217285,
3786
+ "learning_rate": 1.305935139693874e-05,
3787
+ "loss": 0.1362,
3788
+ "step": 497
3789
+ },
3790
+ {
3791
+ "epoch": 0.4619666048237477,
3792
+ "grad_norm": 4.690557956695557,
3793
+ "learning_rate": 1.3028500758979507e-05,
3794
+ "loss": 0.1776,
3795
+ "step": 498
3796
+ },
3797
+ {
3798
+ "epoch": 0.46289424860853434,
3799
+ "grad_norm": 4.32938814163208,
3800
+ "learning_rate": 1.299761835348038e-05,
3801
+ "loss": 0.1429,
3802
+ "step": 499
3803
+ },
3804
+ {
3805
+ "epoch": 0.46382189239332094,
3806
+ "grad_norm": 3.8397958278656006,
3807
+ "learning_rate": 1.296670450438317e-05,
3808
+ "loss": 0.1904,
3809
+ "step": 500
3810
+ },
3811
+ {
3812
+ "epoch": 0.46382189239332094,
3813
+ "eval_accuracy": 0.8403547671840355,
3814
+ "eval_f1": 0.6381909547738693,
3815
+ "eval_loss": 0.3388740122318268,
3816
+ "eval_precision": 0.8758620689655172,
3817
+ "eval_recall": 0.5019762845849802,
3818
+ "eval_runtime": 48.3395,
3819
+ "eval_samples_per_second": 5.71,
3820
+ "eval_steps_per_second": 0.186,
3821
+ "step": 500
3822
  }
3823
  ],
3824
  "logging_steps": 1,
 
3838
  "attributes": {}
3839
  }
3840
  },
3841
+ "total_flos": 1.5957678843389542e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null