ErrorAI commited on
Commit
2699d37
·
verified ·
1 Parent(s): 119b79c

Training in progress, step 572, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b3c14f2a7cefaed118eec467a328dafde9262b2083b6c3d3b13f56930546ab7
3
  size 5327744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83da110c38b42c8f26332874e8bb627d6ead7522e50fc51e39079d4f9c35796a
3
  size 5327744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eef80311cf6487d9649388953f0b17a845c1d2a58891b6fd75a929528a6c0196
3
  size 2857850
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b3646603ee02c3a1dc6142029b48d761a5d129039bb1a5936f191072a5323f6
3
  size 2857850
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:269d8e525caae1cce17fdcc8f66b6c9fe5c358eb983e1f28c6bd81602e6038b2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:691d0261780df62255146217a0fa66594f0de41a9521d0f54f49d2368cb05292
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ee1869594529a13e34b89e7bfe7be5bc83ad15c1d5f0963d178cc0ef9c1351e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63ce17dd2c32e1042039dfe648c482c9ff0032ac68df46007019bf1f153ddc3e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.75032794053345,
5
  "eval_steps": 143,
6
- "global_step": 429,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3042,6 +3042,1015 @@
3042
  "eval_samples_per_second": 110.64,
3043
  "eval_steps_per_second": 55.55,
3044
  "step": 429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3045
  }
3046
  ],
3047
  "logging_steps": 1,
@@ -3056,12 +4065,12 @@
3056
  "should_evaluate": false,
3057
  "should_log": false,
3058
  "should_save": true,
3059
- "should_training_stop": false
3060
  },
3061
  "attributes": {}
3062
  }
3063
  },
3064
- "total_flos": 1821492098629632.0,
3065
  "train_batch_size": 2,
3066
  "trial_name": null,
3067
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0008745080891999,
5
  "eval_steps": 143,
6
+ "global_step": 572,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3042
  "eval_samples_per_second": 110.64,
3043
  "eval_steps_per_second": 55.55,
3044
  "step": 429
3045
+ },
3046
+ {
3047
+ "epoch": 0.7520769567118496,
3048
+ "grad_norm": 6.074078559875488,
3049
+ "learning_rate": 2.9884712134280324e-05,
3050
+ "loss": 6.6575,
3051
+ "step": 430
3052
+ },
3053
+ {
3054
+ "epoch": 0.7538259728902492,
3055
+ "grad_norm": 6.187041759490967,
3056
+ "learning_rate": 2.948723496423379e-05,
3057
+ "loss": 7.9171,
3058
+ "step": 431
3059
+ },
3060
+ {
3061
+ "epoch": 0.7555749890686488,
3062
+ "grad_norm": 6.312707901000977,
3063
+ "learning_rate": 2.909196119613218e-05,
3064
+ "loss": 8.3438,
3065
+ "step": 432
3066
+ },
3067
+ {
3068
+ "epoch": 0.7573240052470486,
3069
+ "grad_norm": 5.885359287261963,
3070
+ "learning_rate": 2.8698903181597127e-05,
3071
+ "loss": 7.2738,
3072
+ "step": 433
3073
+ },
3074
+ {
3075
+ "epoch": 0.7590730214254482,
3076
+ "grad_norm": 6.485308647155762,
3077
+ "learning_rate": 2.8308073203011663e-05,
3078
+ "loss": 8.3997,
3079
+ "step": 434
3080
+ },
3081
+ {
3082
+ "epoch": 0.7608220376038478,
3083
+ "grad_norm": 5.899681091308594,
3084
+ "learning_rate": 2.7919483473136676e-05,
3085
+ "loss": 6.9346,
3086
+ "step": 435
3087
+ },
3088
+ {
3089
+ "epoch": 0.7625710537822474,
3090
+ "grad_norm": 6.48416805267334,
3091
+ "learning_rate": 2.753314613472906e-05,
3092
+ "loss": 8.0168,
3093
+ "step": 436
3094
+ },
3095
+ {
3096
+ "epoch": 0.7643200699606472,
3097
+ "grad_norm": 6.464148998260498,
3098
+ "learning_rate": 2.7149073260162416e-05,
3099
+ "loss": 7.6221,
3100
+ "step": 437
3101
+ },
3102
+ {
3103
+ "epoch": 0.7660690861390468,
3104
+ "grad_norm": 6.158417224884033,
3105
+ "learning_rate": 2.6767276851049816e-05,
3106
+ "loss": 8.3056,
3107
+ "step": 438
3108
+ },
3109
+ {
3110
+ "epoch": 0.7678181023174464,
3111
+ "grad_norm": 6.386941432952881,
3112
+ "learning_rate": 2.6387768837868597e-05,
3113
+ "loss": 7.7409,
3114
+ "step": 439
3115
+ },
3116
+ {
3117
+ "epoch": 0.7695671184958461,
3118
+ "grad_norm": 5.983461380004883,
3119
+ "learning_rate": 2.6010561079587813e-05,
3120
+ "loss": 8.2641,
3121
+ "step": 440
3122
+ },
3123
+ {
3124
+ "epoch": 0.7713161346742458,
3125
+ "grad_norm": 6.103724479675293,
3126
+ "learning_rate": 2.5635665363297422e-05,
3127
+ "loss": 7.4958,
3128
+ "step": 441
3129
+ },
3130
+ {
3131
+ "epoch": 0.7730651508526454,
3132
+ "grad_norm": 6.250931739807129,
3133
+ "learning_rate": 2.5263093403840142e-05,
3134
+ "loss": 8.1695,
3135
+ "step": 442
3136
+ },
3137
+ {
3138
+ "epoch": 0.774814167031045,
3139
+ "grad_norm": 6.299392223358154,
3140
+ "learning_rate": 2.4892856843445322e-05,
3141
+ "loss": 8.1588,
3142
+ "step": 443
3143
+ },
3144
+ {
3145
+ "epoch": 0.7765631832094447,
3146
+ "grad_norm": 7.157833099365234,
3147
+ "learning_rate": 2.4524967251365026e-05,
3148
+ "loss": 8.4203,
3149
+ "step": 444
3150
+ },
3151
+ {
3152
+ "epoch": 0.7783121993878444,
3153
+ "grad_norm": 6.5352349281311035,
3154
+ "learning_rate": 2.4159436123512735e-05,
3155
+ "loss": 8.4377,
3156
+ "step": 445
3157
+ },
3158
+ {
3159
+ "epoch": 0.780061215566244,
3160
+ "grad_norm": 5.913573265075684,
3161
+ "learning_rate": 2.3796274882103964e-05,
3162
+ "loss": 6.8979,
3163
+ "step": 446
3164
+ },
3165
+ {
3166
+ "epoch": 0.7818102317446436,
3167
+ "grad_norm": 6.062626838684082,
3168
+ "learning_rate": 2.3435494875299314e-05,
3169
+ "loss": 9.2065,
3170
+ "step": 447
3171
+ },
3172
+ {
3173
+ "epoch": 0.7835592479230433,
3174
+ "grad_norm": 6.175731658935547,
3175
+ "learning_rate": 2.3077107376850005e-05,
3176
+ "loss": 7.3399,
3177
+ "step": 448
3178
+ },
3179
+ {
3180
+ "epoch": 0.7853082641014429,
3181
+ "grad_norm": 5.536900997161865,
3182
+ "learning_rate": 2.2721123585745507e-05,
3183
+ "loss": 7.0864,
3184
+ "step": 449
3185
+ },
3186
+ {
3187
+ "epoch": 0.7870572802798426,
3188
+ "grad_norm": 6.569960117340088,
3189
+ "learning_rate": 2.2367554625863497e-05,
3190
+ "loss": 7.0612,
3191
+ "step": 450
3192
+ },
3193
+ {
3194
+ "epoch": 0.7888062964582422,
3195
+ "grad_norm": 6.351140022277832,
3196
+ "learning_rate": 2.2016411545622495e-05,
3197
+ "loss": 7.9918,
3198
+ "step": 451
3199
+ },
3200
+ {
3201
+ "epoch": 0.7905553126366419,
3202
+ "grad_norm": 6.007497787475586,
3203
+ "learning_rate": 2.166770531763633e-05,
3204
+ "loss": 8.2044,
3205
+ "step": 452
3206
+ },
3207
+ {
3208
+ "epoch": 0.7923043288150415,
3209
+ "grad_norm": 6.685032844543457,
3210
+ "learning_rate": 2.132144683837155e-05,
3211
+ "loss": 8.174,
3212
+ "step": 453
3213
+ },
3214
+ {
3215
+ "epoch": 0.7940533449934412,
3216
+ "grad_norm": 5.885077953338623,
3217
+ "learning_rate": 2.0977646927806683e-05,
3218
+ "loss": 7.6945,
3219
+ "step": 454
3220
+ },
3221
+ {
3222
+ "epoch": 0.7958023611718409,
3223
+ "grad_norm": 6.740251064300537,
3224
+ "learning_rate": 2.0636316329094317e-05,
3225
+ "loss": 7.4873,
3226
+ "step": 455
3227
+ },
3228
+ {
3229
+ "epoch": 0.7975513773502405,
3230
+ "grad_norm": 6.132180690765381,
3231
+ "learning_rate": 2.0297465708225238e-05,
3232
+ "loss": 8.0784,
3233
+ "step": 456
3234
+ },
3235
+ {
3236
+ "epoch": 0.7993003935286401,
3237
+ "grad_norm": 5.6954779624938965,
3238
+ "learning_rate": 1.9961105653695266e-05,
3239
+ "loss": 8.4652,
3240
+ "step": 457
3241
+ },
3242
+ {
3243
+ "epoch": 0.8010494097070398,
3244
+ "grad_norm": 6.492379665374756,
3245
+ "learning_rate": 1.962724667617436e-05,
3246
+ "loss": 8.178,
3247
+ "step": 458
3248
+ },
3249
+ {
3250
+ "epoch": 0.8027984258854395,
3251
+ "grad_norm": 6.245090007781982,
3252
+ "learning_rate": 1.929589920817806e-05,
3253
+ "loss": 6.97,
3254
+ "step": 459
3255
+ },
3256
+ {
3257
+ "epoch": 0.8045474420638391,
3258
+ "grad_norm": 5.776219367980957,
3259
+ "learning_rate": 1.896707360374167e-05,
3260
+ "loss": 8.1005,
3261
+ "step": 460
3262
+ },
3263
+ {
3264
+ "epoch": 0.8062964582422387,
3265
+ "grad_norm": 6.456238746643066,
3266
+ "learning_rate": 1.8640780138096513e-05,
3267
+ "loss": 7.4033,
3268
+ "step": 461
3269
+ },
3270
+ {
3271
+ "epoch": 0.8080454744206383,
3272
+ "grad_norm": 6.484393119812012,
3273
+ "learning_rate": 1.8317029007349085e-05,
3274
+ "loss": 8.597,
3275
+ "step": 462
3276
+ },
3277
+ {
3278
+ "epoch": 0.8097944905990381,
3279
+ "grad_norm": 6.799935817718506,
3280
+ "learning_rate": 1.799583032816219e-05,
3281
+ "loss": 6.3503,
3282
+ "step": 463
3283
+ },
3284
+ {
3285
+ "epoch": 0.8115435067774377,
3286
+ "grad_norm": 5.950386047363281,
3287
+ "learning_rate": 1.7677194137439035e-05,
3288
+ "loss": 7.3482,
3289
+ "step": 464
3290
+ },
3291
+ {
3292
+ "epoch": 0.8132925229558373,
3293
+ "grad_norm": 6.03711462020874,
3294
+ "learning_rate": 1.7361130392009407e-05,
3295
+ "loss": 8.2516,
3296
+ "step": 465
3297
+ },
3298
+ {
3299
+ "epoch": 0.8150415391342369,
3300
+ "grad_norm": 6.34808874130249,
3301
+ "learning_rate": 1.7047648968318698e-05,
3302
+ "loss": 8.1899,
3303
+ "step": 466
3304
+ },
3305
+ {
3306
+ "epoch": 0.8167905553126367,
3307
+ "grad_norm": 5.498723030090332,
3308
+ "learning_rate": 1.6736759662119183e-05,
3309
+ "loss": 8.3873,
3310
+ "step": 467
3311
+ },
3312
+ {
3313
+ "epoch": 0.8185395714910363,
3314
+ "grad_norm": 6.472886562347412,
3315
+ "learning_rate": 1.642847218816398e-05,
3316
+ "loss": 8.2182,
3317
+ "step": 468
3318
+ },
3319
+ {
3320
+ "epoch": 0.8202885876694359,
3321
+ "grad_norm": 7.618271827697754,
3322
+ "learning_rate": 1.6122796179903354e-05,
3323
+ "loss": 8.3238,
3324
+ "step": 469
3325
+ },
3326
+ {
3327
+ "epoch": 0.8220376038478356,
3328
+ "grad_norm": 6.079611778259277,
3329
+ "learning_rate": 1.58197411891839e-05,
3330
+ "loss": 7.5876,
3331
+ "step": 470
3332
+ },
3333
+ {
3334
+ "epoch": 0.8237866200262353,
3335
+ "grad_norm": 6.017045497894287,
3336
+ "learning_rate": 1.5519316685949903e-05,
3337
+ "loss": 7.4218,
3338
+ "step": 471
3339
+ },
3340
+ {
3341
+ "epoch": 0.8255356362046349,
3342
+ "grad_norm": 5.659122943878174,
3343
+ "learning_rate": 1.5221532057947419e-05,
3344
+ "loss": 8.3443,
3345
+ "step": 472
3346
+ },
3347
+ {
3348
+ "epoch": 0.8272846523830345,
3349
+ "grad_norm": 7.566473484039307,
3350
+ "learning_rate": 1.4926396610431059e-05,
3351
+ "loss": 7.7792,
3352
+ "step": 473
3353
+ },
3354
+ {
3355
+ "epoch": 0.8290336685614342,
3356
+ "grad_norm": 6.2454729080200195,
3357
+ "learning_rate": 1.4633919565873033e-05,
3358
+ "loss": 8.3766,
3359
+ "step": 474
3360
+ },
3361
+ {
3362
+ "epoch": 0.8307826847398339,
3363
+ "grad_norm": 6.340610980987549,
3364
+ "learning_rate": 1.4344110063675142e-05,
3365
+ "loss": 7.4127,
3366
+ "step": 475
3367
+ },
3368
+ {
3369
+ "epoch": 0.8325317009182335,
3370
+ "grad_norm": 5.981843948364258,
3371
+ "learning_rate": 1.4056977159883012e-05,
3372
+ "loss": 7.7706,
3373
+ "step": 476
3374
+ },
3375
+ {
3376
+ "epoch": 0.8342807170966331,
3377
+ "grad_norm": 6.074410438537598,
3378
+ "learning_rate": 1.3772529826903269e-05,
3379
+ "loss": 7.0402,
3380
+ "step": 477
3381
+ },
3382
+ {
3383
+ "epoch": 0.8360297332750328,
3384
+ "grad_norm": 6.144327163696289,
3385
+ "learning_rate": 1.3490776953223105e-05,
3386
+ "loss": 7.6445,
3387
+ "step": 478
3388
+ },
3389
+ {
3390
+ "epoch": 0.8377787494534324,
3391
+ "grad_norm": 5.650998115539551,
3392
+ "learning_rate": 1.321172734313244e-05,
3393
+ "loss": 8.1351,
3394
+ "step": 479
3395
+ },
3396
+ {
3397
+ "epoch": 0.8395277656318321,
3398
+ "grad_norm": 6.38914680480957,
3399
+ "learning_rate": 1.2935389716448976e-05,
3400
+ "loss": 8.9728,
3401
+ "step": 480
3402
+ },
3403
+ {
3404
+ "epoch": 0.8412767818102318,
3405
+ "grad_norm": 6.161402225494385,
3406
+ "learning_rate": 1.2661772708245535e-05,
3407
+ "loss": 8.9174,
3408
+ "step": 481
3409
+ },
3410
+ {
3411
+ "epoch": 0.8430257979886314,
3412
+ "grad_norm": 5.9002685546875,
3413
+ "learning_rate": 1.23908848685804e-05,
3414
+ "loss": 6.2995,
3415
+ "step": 482
3416
+ },
3417
+ {
3418
+ "epoch": 0.844774814167031,
3419
+ "grad_norm": 5.791549205780029,
3420
+ "learning_rate": 1.2122734662229984e-05,
3421
+ "loss": 6.7385,
3422
+ "step": 483
3423
+ },
3424
+ {
3425
+ "epoch": 0.8465238303454307,
3426
+ "grad_norm": 7.234724521636963,
3427
+ "learning_rate": 1.1857330468424466e-05,
3428
+ "loss": 6.838,
3429
+ "step": 484
3430
+ },
3431
+ {
3432
+ "epoch": 0.8482728465238304,
3433
+ "grad_norm": 5.775229454040527,
3434
+ "learning_rate": 1.1594680580585814e-05,
3435
+ "loss": 8.2034,
3436
+ "step": 485
3437
+ },
3438
+ {
3439
+ "epoch": 0.85002186270223,
3440
+ "grad_norm": 6.084395885467529,
3441
+ "learning_rate": 1.133479320606874e-05,
3442
+ "loss": 8.3378,
3443
+ "step": 486
3444
+ },
3445
+ {
3446
+ "epoch": 0.8517708788806296,
3447
+ "grad_norm": 6.647040843963623,
3448
+ "learning_rate": 1.1077676465904208e-05,
3449
+ "loss": 8.3666,
3450
+ "step": 487
3451
+ },
3452
+ {
3453
+ "epoch": 0.8535198950590293,
3454
+ "grad_norm": 6.034310340881348,
3455
+ "learning_rate": 1.082333839454559e-05,
3456
+ "loss": 8.4993,
3457
+ "step": 488
3458
+ },
3459
+ {
3460
+ "epoch": 0.855268911237429,
3461
+ "grad_norm": 6.046447277069092,
3462
+ "learning_rate": 1.0571786939617712e-05,
3463
+ "loss": 6.8808,
3464
+ "step": 489
3465
+ },
3466
+ {
3467
+ "epoch": 0.8570179274158286,
3468
+ "grad_norm": 7.073398590087891,
3469
+ "learning_rate": 1.0323029961668462e-05,
3470
+ "loss": 8.4804,
3471
+ "step": 490
3472
+ },
3473
+ {
3474
+ "epoch": 0.8587669435942282,
3475
+ "grad_norm": 7.022246837615967,
3476
+ "learning_rate": 1.0077075233923116e-05,
3477
+ "loss": 8.4708,
3478
+ "step": 491
3479
+ },
3480
+ {
3481
+ "epoch": 0.8605159597726278,
3482
+ "grad_norm": 6.369495868682861,
3483
+ "learning_rate": 9.833930442041506e-06,
3484
+ "loss": 8.9675,
3485
+ "step": 492
3486
+ },
3487
+ {
3488
+ "epoch": 0.8622649759510276,
3489
+ "grad_norm": 7.497471809387207,
3490
+ "learning_rate": 9.593603183877841e-06,
3491
+ "loss": 9.2559,
3492
+ "step": 493
3493
+ },
3494
+ {
3495
+ "epoch": 0.8640139921294272,
3496
+ "grad_norm": 6.291429042816162,
3497
+ "learning_rate": 9.35610096924323e-06,
3498
+ "loss": 8.7969,
3499
+ "step": 494
3500
+ },
3501
+ {
3502
+ "epoch": 0.8657630083078268,
3503
+ "grad_norm": 5.909082412719727,
3504
+ "learning_rate": 9.121431219671095e-06,
3505
+ "loss": 9.1292,
3506
+ "step": 495
3507
+ },
3508
+ {
3509
+ "epoch": 0.8675120244862266,
3510
+ "grad_norm": 6.053621292114258,
3511
+ "learning_rate": 8.889601268185232e-06,
3512
+ "loss": 8.1841,
3513
+ "step": 496
3514
+ },
3515
+ {
3516
+ "epoch": 0.8692610406646262,
3517
+ "grad_norm": 7.89780855178833,
3518
+ "learning_rate": 8.660618359070604e-06,
3519
+ "loss": 8.2031,
3520
+ "step": 497
3521
+ },
3522
+ {
3523
+ "epoch": 0.8710100568430258,
3524
+ "grad_norm": 5.878389835357666,
3525
+ "learning_rate": 8.434489647647092e-06,
3526
+ "loss": 7.5146,
3527
+ "step": 498
3528
+ },
3529
+ {
3530
+ "epoch": 0.8727590730214254,
3531
+ "grad_norm": 6.576045513153076,
3532
+ "learning_rate": 8.211222200045788e-06,
3533
+ "loss": 6.9011,
3534
+ "step": 499
3535
+ },
3536
+ {
3537
+ "epoch": 0.8745080891998251,
3538
+ "grad_norm": 6.503655433654785,
3539
+ "learning_rate": 7.990822992988267e-06,
3540
+ "loss": 7.2335,
3541
+ "step": 500
3542
+ },
3543
+ {
3544
+ "epoch": 0.8762571053782248,
3545
+ "grad_norm": 6.5767998695373535,
3546
+ "learning_rate": 7.773298913568505e-06,
3547
+ "loss": 8.625,
3548
+ "step": 501
3549
+ },
3550
+ {
3551
+ "epoch": 0.8780061215566244,
3552
+ "grad_norm": 6.215456485748291,
3553
+ "learning_rate": 7.558656759037797e-06,
3554
+ "loss": 6.6789,
3555
+ "step": 502
3556
+ },
3557
+ {
3558
+ "epoch": 0.879755137735024,
3559
+ "grad_norm": 6.190433502197266,
3560
+ "learning_rate": 7.346903236592162e-06,
3561
+ "loss": 7.0194,
3562
+ "step": 503
3563
+ },
3564
+ {
3565
+ "epoch": 0.8815041539134237,
3566
+ "grad_norm": 6.697807788848877,
3567
+ "learning_rate": 7.13804496316296e-06,
3568
+ "loss": 7.5084,
3569
+ "step": 504
3570
+ },
3571
+ {
3572
+ "epoch": 0.8832531700918234,
3573
+ "grad_norm": 6.033225059509277,
3574
+ "learning_rate": 6.9320884652099406e-06,
3575
+ "loss": 7.5227,
3576
+ "step": 505
3577
+ },
3578
+ {
3579
+ "epoch": 0.885002186270223,
3580
+ "grad_norm": 6.665908336639404,
3581
+ "learning_rate": 6.729040178517454e-06,
3582
+ "loss": 7.7323,
3583
+ "step": 506
3584
+ },
3585
+ {
3586
+ "epoch": 0.8867512024486226,
3587
+ "grad_norm": 6.476802825927734,
3588
+ "learning_rate": 6.528906447993288e-06,
3589
+ "loss": 7.7431,
3590
+ "step": 507
3591
+ },
3592
+ {
3593
+ "epoch": 0.8885002186270223,
3594
+ "grad_norm": 5.98831844329834,
3595
+ "learning_rate": 6.331693527470306e-06,
3596
+ "loss": 7.3852,
3597
+ "step": 508
3598
+ },
3599
+ {
3600
+ "epoch": 0.890249234805422,
3601
+ "grad_norm": 6.885638236999512,
3602
+ "learning_rate": 6.137407579511212e-06,
3603
+ "loss": 7.667,
3604
+ "step": 509
3605
+ },
3606
+ {
3607
+ "epoch": 0.8919982509838216,
3608
+ "grad_norm": 7.12498140335083,
3609
+ "learning_rate": 5.946054675215784e-06,
3610
+ "loss": 8.1353,
3611
+ "step": 510
3612
+ },
3613
+ {
3614
+ "epoch": 0.8937472671622213,
3615
+ "grad_norm": 6.516235828399658,
3616
+ "learning_rate": 5.757640794031361e-06,
3617
+ "loss": 7.4876,
3618
+ "step": 511
3619
+ },
3620
+ {
3621
+ "epoch": 0.8954962833406209,
3622
+ "grad_norm": 6.076444149017334,
3623
+ "learning_rate": 5.572171823565797e-06,
3624
+ "loss": 9.0379,
3625
+ "step": 512
3626
+ },
3627
+ {
3628
+ "epoch": 0.8972452995190205,
3629
+ "grad_norm": 5.914377689361572,
3630
+ "learning_rate": 5.389653559403629e-06,
3631
+ "loss": 8.1439,
3632
+ "step": 513
3633
+ },
3634
+ {
3635
+ "epoch": 0.8989943156974202,
3636
+ "grad_norm": 6.191368579864502,
3637
+ "learning_rate": 5.210091704924946e-06,
3638
+ "loss": 8.6825,
3639
+ "step": 514
3640
+ },
3641
+ {
3642
+ "epoch": 0.9007433318758199,
3643
+ "grad_norm": 6.621984958648682,
3644
+ "learning_rate": 5.033491871127105e-06,
3645
+ "loss": 8.3616,
3646
+ "step": 515
3647
+ },
3648
+ {
3649
+ "epoch": 0.9024923480542195,
3650
+ "grad_norm": 6.113316059112549,
3651
+ "learning_rate": 4.859859576449444e-06,
3652
+ "loss": 7.6225,
3653
+ "step": 516
3654
+ },
3655
+ {
3656
+ "epoch": 0.9042413642326191,
3657
+ "grad_norm": 6.2419915199279785,
3658
+ "learning_rate": 4.689200246600867e-06,
3659
+ "loss": 7.8226,
3660
+ "step": 517
3661
+ },
3662
+ {
3663
+ "epoch": 0.9059903804110188,
3664
+ "grad_norm": 6.267433166503906,
3665
+ "learning_rate": 4.521519214390257e-06,
3666
+ "loss": 7.5588,
3667
+ "step": 518
3668
+ },
3669
+ {
3670
+ "epoch": 0.9077393965894185,
3671
+ "grad_norm": 7.470646381378174,
3672
+ "learning_rate": 4.356821719559812e-06,
3673
+ "loss": 7.5397,
3674
+ "step": 519
3675
+ },
3676
+ {
3677
+ "epoch": 0.9094884127678181,
3678
+ "grad_norm": 5.909048080444336,
3679
+ "learning_rate": 4.195112908621402e-06,
3680
+ "loss": 8.8447,
3681
+ "step": 520
3682
+ },
3683
+ {
3684
+ "epoch": 0.9112374289462177,
3685
+ "grad_norm": 6.860408782958984,
3686
+ "learning_rate": 4.03639783469566e-06,
3687
+ "loss": 8.8253,
3688
+ "step": 521
3689
+ },
3690
+ {
3691
+ "epoch": 0.9129864451246174,
3692
+ "grad_norm": 7.22110652923584,
3693
+ "learning_rate": 3.880681457354118e-06,
3694
+ "loss": 7.9479,
3695
+ "step": 522
3696
+ },
3697
+ {
3698
+ "epoch": 0.9147354613030171,
3699
+ "grad_norm": 5.659202575683594,
3700
+ "learning_rate": 3.727968642464241e-06,
3701
+ "loss": 7.4659,
3702
+ "step": 523
3703
+ },
3704
+ {
3705
+ "epoch": 0.9164844774814167,
3706
+ "grad_norm": 7.78839111328125,
3707
+ "learning_rate": 3.578264162037348e-06,
3708
+ "loss": 7.8924,
3709
+ "step": 524
3710
+ },
3711
+ {
3712
+ "epoch": 0.9182334936598163,
3713
+ "grad_norm": 6.77100133895874,
3714
+ "learning_rate": 3.4315726940795433e-06,
3715
+ "loss": 8.6822,
3716
+ "step": 525
3717
+ },
3718
+ {
3719
+ "epoch": 0.9199825098382161,
3720
+ "grad_norm": 7.059048175811768,
3721
+ "learning_rate": 3.2878988224454344e-06,
3722
+ "loss": 8.3176,
3723
+ "step": 526
3724
+ },
3725
+ {
3726
+ "epoch": 0.9217315260166157,
3727
+ "grad_norm": 5.931784152984619,
3728
+ "learning_rate": 3.1472470366950334e-06,
3729
+ "loss": 7.5494,
3730
+ "step": 527
3731
+ },
3732
+ {
3733
+ "epoch": 0.9234805421950153,
3734
+ "grad_norm": 6.578647613525391,
3735
+ "learning_rate": 3.0096217319533382e-06,
3736
+ "loss": 8.5582,
3737
+ "step": 528
3738
+ },
3739
+ {
3740
+ "epoch": 0.9252295583734149,
3741
+ "grad_norm": 5.951269626617432,
3742
+ "learning_rate": 2.875027208773118e-06,
3743
+ "loss": 7.3472,
3744
+ "step": 529
3745
+ },
3746
+ {
3747
+ "epoch": 0.9269785745518146,
3748
+ "grad_norm": 6.003902912139893,
3749
+ "learning_rate": 2.7434676730003884e-06,
3750
+ "loss": 8.754,
3751
+ "step": 530
3752
+ },
3753
+ {
3754
+ "epoch": 0.9287275907302143,
3755
+ "grad_norm": 6.373345851898193,
3756
+ "learning_rate": 2.614947235643106e-06,
3757
+ "loss": 7.5758,
3758
+ "step": 531
3759
+ },
3760
+ {
3761
+ "epoch": 0.9304766069086139,
3762
+ "grad_norm": 6.780086517333984,
3763
+ "learning_rate": 2.4894699127426367e-06,
3764
+ "loss": 9.3402,
3765
+ "step": 532
3766
+ },
3767
+ {
3768
+ "epoch": 0.9322256230870135,
3769
+ "grad_norm": 6.614811897277832,
3770
+ "learning_rate": 2.367039625248302e-06,
3771
+ "loss": 7.8778,
3772
+ "step": 533
3773
+ },
3774
+ {
3775
+ "epoch": 0.9339746392654132,
3776
+ "grad_norm": 6.098567008972168,
3777
+ "learning_rate": 2.2476601988947966e-06,
3778
+ "loss": 7.928,
3779
+ "step": 534
3780
+ },
3781
+ {
3782
+ "epoch": 0.9357236554438129,
3783
+ "grad_norm": 5.995659828186035,
3784
+ "learning_rate": 2.1313353640827206e-06,
3785
+ "loss": 9.0382,
3786
+ "step": 535
3787
+ },
3788
+ {
3789
+ "epoch": 0.9374726716222125,
3790
+ "grad_norm": 7.359274387359619,
3791
+ "learning_rate": 2.0180687557619816e-06,
3792
+ "loss": 8.395,
3793
+ "step": 536
3794
+ },
3795
+ {
3796
+ "epoch": 0.9392216878006121,
3797
+ "grad_norm": 6.481335639953613,
3798
+ "learning_rate": 1.907863913318153e-06,
3799
+ "loss": 8.3956,
3800
+ "step": 537
3801
+ },
3802
+ {
3803
+ "epoch": 0.9409707039790118,
3804
+ "grad_norm": 5.920598983764648,
3805
+ "learning_rate": 1.8007242804619628e-06,
3806
+ "loss": 7.8273,
3807
+ "step": 538
3808
+ },
3809
+ {
3810
+ "epoch": 0.9427197201574115,
3811
+ "grad_norm": 6.889771461486816,
3812
+ "learning_rate": 1.696653205121612e-06,
3813
+ "loss": 7.9301,
3814
+ "step": 539
3815
+ },
3816
+ {
3817
+ "epoch": 0.9444687363358111,
3818
+ "grad_norm": 7.978548049926758,
3819
+ "learning_rate": 1.595653939338204e-06,
3820
+ "loss": 8.1209,
3821
+ "step": 540
3822
+ },
3823
+ {
3824
+ "epoch": 0.9462177525142108,
3825
+ "grad_norm": 6.091990947723389,
3826
+ "learning_rate": 1.4977296391641026e-06,
3827
+ "loss": 7.4813,
3828
+ "step": 541
3829
+ },
3830
+ {
3831
+ "epoch": 0.9479667686926104,
3832
+ "grad_norm": 5.8564372062683105,
3833
+ "learning_rate": 1.4028833645643113e-06,
3834
+ "loss": 8.299,
3835
+ "step": 542
3836
+ },
3837
+ {
3838
+ "epoch": 0.94971578487101,
3839
+ "grad_norm": 6.623847484588623,
3840
+ "learning_rate": 1.31111807932085e-06,
3841
+ "loss": 7.3619,
3842
+ "step": 543
3843
+ },
3844
+ {
3845
+ "epoch": 0.9514648010494097,
3846
+ "grad_norm": 6.130828380584717,
3847
+ "learning_rate": 1.222436650940173e-06,
3848
+ "loss": 8.2312,
3849
+ "step": 544
3850
+ },
3851
+ {
3852
+ "epoch": 0.9532138172278094,
3853
+ "grad_norm": 5.993081569671631,
3854
+ "learning_rate": 1.1368418505635302e-06,
3855
+ "loss": 9.4749,
3856
+ "step": 545
3857
+ },
3858
+ {
3859
+ "epoch": 0.954962833406209,
3860
+ "grad_norm": 6.714028835296631,
3861
+ "learning_rate": 1.0543363528803696e-06,
3862
+ "loss": 8.6106,
3863
+ "step": 546
3864
+ },
3865
+ {
3866
+ "epoch": 0.9567118495846086,
3867
+ "grad_norm": 6.759970664978027,
3868
+ "learning_rate": 9.749227360448143e-07,
3869
+ "loss": 8.9938,
3870
+ "step": 547
3871
+ },
3872
+ {
3873
+ "epoch": 0.9584608657630083,
3874
+ "grad_norm": 6.770102500915527,
3875
+ "learning_rate": 8.986034815950172e-07,
3876
+ "loss": 9.3766,
3877
+ "step": 548
3878
+ },
3879
+ {
3880
+ "epoch": 0.960209881941408,
3881
+ "grad_norm": 5.988943576812744,
3882
+ "learning_rate": 8.253809743756668e-07,
3883
+ "loss": 8.0326,
3884
+ "step": 549
3885
+ },
3886
+ {
3887
+ "epoch": 0.9619588981198076,
3888
+ "grad_norm": 6.561304092407227,
3889
+ "learning_rate": 7.552575024634689e-07,
3890
+ "loss": 7.3145,
3891
+ "step": 550
3892
+ },
3893
+ {
3894
+ "epoch": 0.9637079142982072,
3895
+ "grad_norm": 6.857665061950684,
3896
+ "learning_rate": 6.882352570956485e-07,
3897
+ "loss": 8.4195,
3898
+ "step": 551
3899
+ },
3900
+ {
3901
+ "epoch": 0.9654569304766069,
3902
+ "grad_norm": 7.414341926574707,
3903
+ "learning_rate": 6.243163326014267e-07,
3904
+ "loss": 7.5319,
3905
+ "step": 552
3906
+ },
3907
+ {
3908
+ "epoch": 0.9672059466550066,
3909
+ "grad_norm": 7.155992031097412,
3910
+ "learning_rate": 5.635027263366399e-07,
3911
+ "loss": 7.9228,
3912
+ "step": 553
3913
+ },
3914
+ {
3915
+ "epoch": 0.9689549628334062,
3916
+ "grad_norm": 5.8317551612854,
3917
+ "learning_rate": 5.057963386213116e-07,
3918
+ "loss": 8.3518,
3919
+ "step": 554
3920
+ },
3921
+ {
3922
+ "epoch": 0.9707039790118058,
3923
+ "grad_norm": 6.625657081604004,
3924
+ "learning_rate": 4.5119897268023347e-07,
3925
+ "loss": 8.2842,
3926
+ "step": 555
3927
+ },
3928
+ {
3929
+ "epoch": 0.9724529951902056,
3930
+ "grad_norm": 6.393413543701172,
3931
+ "learning_rate": 3.9971233458665493e-07,
3932
+ "loss": 8.9058,
3933
+ "step": 556
3934
+ },
3935
+ {
3936
+ "epoch": 0.9742020113686052,
3937
+ "grad_norm": 6.962483882904053,
3938
+ "learning_rate": 3.5133803320896994e-07,
3939
+ "loss": 7.8907,
3940
+ "step": 557
3941
+ },
3942
+ {
3943
+ "epoch": 0.9759510275470048,
3944
+ "grad_norm": 6.612078666687012,
3945
+ "learning_rate": 3.060775801604354e-07,
3946
+ "loss": 8.0385,
3947
+ "step": 558
3948
+ },
3949
+ {
3950
+ "epoch": 0.9777000437254044,
3951
+ "grad_norm": 5.987767696380615,
3952
+ "learning_rate": 2.639323897518975e-07,
3953
+ "loss": 8.6059,
3954
+ "step": 559
3955
+ },
3956
+ {
3957
+ "epoch": 0.9794490599038042,
3958
+ "grad_norm": 5.532016277313232,
3959
+ "learning_rate": 2.2490377894768267e-07,
3960
+ "loss": 7.1836,
3961
+ "step": 560
3962
+ },
3963
+ {
3964
+ "epoch": 0.9811980760822038,
3965
+ "grad_norm": 6.169713020324707,
3966
+ "learning_rate": 1.889929673243529e-07,
3967
+ "loss": 7.9154,
3968
+ "step": 561
3969
+ },
3970
+ {
3971
+ "epoch": 0.9829470922606034,
3972
+ "grad_norm": 6.610177040100098,
3973
+ "learning_rate": 1.562010770326916e-07,
3974
+ "loss": 7.9639,
3975
+ "step": 562
3976
+ },
3977
+ {
3978
+ "epoch": 0.984696108439003,
3979
+ "grad_norm": 5.848151683807373,
3980
+ "learning_rate": 1.2652913276250955e-07,
3981
+ "loss": 7.69,
3982
+ "step": 563
3983
+ },
3984
+ {
3985
+ "epoch": 0.9864451246174027,
3986
+ "grad_norm": 6.159895420074463,
3987
+ "learning_rate": 9.99780617107815e-08,
3988
+ "loss": 7.9916,
3989
+ "step": 564
3990
+ },
3991
+ {
3992
+ "epoch": 0.9881941407958024,
3993
+ "grad_norm": 6.261981964111328,
3994
+ "learning_rate": 7.654869355252504e-08,
3995
+ "loss": 8.4931,
3996
+ "step": 565
3997
+ },
3998
+ {
3999
+ "epoch": 0.989943156974202,
4000
+ "grad_norm": 6.078785419464111,
4001
+ "learning_rate": 5.6241760414987856e-08,
4002
+ "loss": 9.2733,
4003
+ "step": 566
4004
+ },
4005
+ {
4006
+ "epoch": 0.9916921731526016,
4007
+ "grad_norm": 6.701304912567139,
4008
+ "learning_rate": 3.905789685471062e-08,
4009
+ "loss": 8.1601,
4010
+ "step": 567
4011
+ },
4012
+ {
4013
+ "epoch": 0.9934411893310013,
4014
+ "grad_norm": 6.677995681762695,
4015
+ "learning_rate": 2.4997639837687213e-08,
4016
+ "loss": 7.0004,
4017
+ "step": 568
4018
+ },
4019
+ {
4020
+ "epoch": 0.995190205509401,
4021
+ "grad_norm": 6.922513484954834,
4022
+ "learning_rate": 1.4061428722633718e-08,
4023
+ "loss": 8.7729,
4024
+ "step": 569
4025
+ },
4026
+ {
4027
+ "epoch": 0.9969392216878006,
4028
+ "grad_norm": 6.470855236053467,
4029
+ "learning_rate": 6.2496052472549304e-09,
4030
+ "loss": 8.6932,
4031
+ "step": 570
4032
+ },
4033
+ {
4034
+ "epoch": 0.9986882378662003,
4035
+ "grad_norm": 6.063665390014648,
4036
+ "learning_rate": 1.5624135174974186e-09,
4037
+ "loss": 8.2856,
4038
+ "step": 571
4039
+ },
4040
+ {
4041
+ "epoch": 1.0008745080891999,
4042
+ "grad_norm": 6.530557155609131,
4043
+ "learning_rate": 0.0,
4044
+ "loss": 8.5752,
4045
+ "step": 572
4046
+ },
4047
+ {
4048
+ "epoch": 1.0008745080891999,
4049
+ "eval_loss": 1.9150428771972656,
4050
+ "eval_runtime": 2.1677,
4051
+ "eval_samples_per_second": 111.18,
4052
+ "eval_steps_per_second": 55.821,
4053
+ "step": 572
4054
  }
4055
  ],
4056
  "logging_steps": 1,
 
4065
  "should_evaluate": false,
4066
  "should_log": false,
4067
  "should_save": true,
4068
+ "should_training_stop": true
4069
  },
4070
  "attributes": {}
4071
  }
4072
  },
4073
+ "total_flos": 2428656131506176.0,
4074
  "train_batch_size": 2,
4075
  "trial_name": null,
4076
  "trial_params": null