ErrorAI commited on
Commit
f27dec5
·
verified ·
1 Parent(s): c279f75

Training in progress, step 779, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b9e0eb8f0f1bf3b9da8a67aa754874f64bf7870c210bc4154f7d4ec9efdc9cd
3
  size 114106856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7e94715efa8923f83c90e8a4b164c445876d355075323cb068661e7c275ac8a
3
  size 114106856
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1528b404176076693f1c51dde3fcac8dd5077ff771426d00bea016a79cb2356
3
  size 58562836
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dcb761caed78bf48b94ced600453a1e977fa69914ed3c89ca938afcc739adab
3
  size 58562836
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dffe560a8e36d35e20f716232ccb147d69e438665908498107e980253447b7d1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f152be734e491ea28e2f66d9760c5b92c8c1c304987e832953ad75be8eaa1da9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94f9b0e3f5895d4bd35cc6f06585ba82ee519051b9f5519626392bea6dd86e01
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63400ae03f6cd4101f3500643a1b7255f746b2b1cc2a30342cfacd4858d6273c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7512038523274478,
5
  "eval_steps": 500,
6
- "global_step": 585,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4102,6 +4102,1372 @@
4102
  "learning_rate": 1.47164856854565e-05,
4103
  "loss": 3.4533,
4104
  "step": 585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4105
  }
4106
  ],
4107
  "logging_steps": 1,
@@ -4116,12 +5482,12 @@
4116
  "should_evaluate": false,
4117
  "should_log": false,
4118
  "should_save": true,
4119
- "should_training_stop": false
4120
  },
4121
  "attributes": {}
4122
  }
4123
  },
4124
- "total_flos": 6.896364316891546e+17,
4125
  "train_batch_size": 4,
4126
  "trial_name": null,
4127
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0009630818619584,
5
  "eval_steps": 500,
6
+ "global_step": 779,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4102
  "learning_rate": 1.47164856854565e-05,
4103
  "loss": 3.4533,
4104
  "step": 585
4105
+ },
4106
+ {
4107
+ "epoch": 0.7524879614767255,
4108
+ "grad_norm": 6.916558742523193,
4109
+ "learning_rate": 1.4572981813093507e-05,
4110
+ "loss": 3.6141,
4111
+ "step": 586
4112
+ },
4113
+ {
4114
+ "epoch": 0.7537720706260033,
4115
+ "grad_norm": 5.969259738922119,
4116
+ "learning_rate": 1.4430061589841121e-05,
4117
+ "loss": 2.0467,
4118
+ "step": 587
4119
+ },
4120
+ {
4121
+ "epoch": 0.755056179775281,
4122
+ "grad_norm": 5.210697174072266,
4123
+ "learning_rate": 1.4287727370265558e-05,
4124
+ "loss": 1.8851,
4125
+ "step": 588
4126
+ },
4127
+ {
4128
+ "epoch": 0.7563402889245586,
4129
+ "grad_norm": 7.276077747344971,
4130
+ "learning_rate": 1.4145981499278876e-05,
4131
+ "loss": 3.3363,
4132
+ "step": 589
4133
+ },
4134
+ {
4135
+ "epoch": 0.7576243980738363,
4136
+ "grad_norm": 9.250455856323242,
4137
+ "learning_rate": 1.4004826312100216e-05,
4138
+ "loss": 4.4776,
4139
+ "step": 590
4140
+ },
4141
+ {
4142
+ "epoch": 0.758908507223114,
4143
+ "grad_norm": 6.841946125030518,
4144
+ "learning_rate": 1.386426413421738e-05,
4145
+ "loss": 2.9102,
4146
+ "step": 591
4147
+ },
4148
+ {
4149
+ "epoch": 0.7601926163723917,
4150
+ "grad_norm": 6.715099334716797,
4151
+ "learning_rate": 1.3724297281348592e-05,
4152
+ "loss": 2.9798,
4153
+ "step": 592
4154
+ },
4155
+ {
4156
+ "epoch": 0.7614767255216693,
4157
+ "grad_norm": 7.211644649505615,
4158
+ "learning_rate": 1.3584928059404205e-05,
4159
+ "loss": 3.9018,
4160
+ "step": 593
4161
+ },
4162
+ {
4163
+ "epoch": 0.762760834670947,
4164
+ "grad_norm": 6.415737628936768,
4165
+ "learning_rate": 1.3446158764448841e-05,
4166
+ "loss": 2.4854,
4167
+ "step": 594
4168
+ },
4169
+ {
4170
+ "epoch": 0.7640449438202247,
4171
+ "grad_norm": 8.730350494384766,
4172
+ "learning_rate": 1.3307991682663462e-05,
4173
+ "loss": 3.7137,
4174
+ "step": 595
4175
+ },
4176
+ {
4177
+ "epoch": 0.7653290529695024,
4178
+ "grad_norm": 10.936077117919922,
4179
+ "learning_rate": 1.3170429090307823e-05,
4180
+ "loss": 4.2927,
4181
+ "step": 596
4182
+ },
4183
+ {
4184
+ "epoch": 0.7666131621187801,
4185
+ "grad_norm": 10.573296546936035,
4186
+ "learning_rate": 1.3033473253682848e-05,
4187
+ "loss": 4.0148,
4188
+ "step": 597
4189
+ },
4190
+ {
4191
+ "epoch": 0.7678972712680577,
4192
+ "grad_norm": 7.918160915374756,
4193
+ "learning_rate": 1.2897126429093354e-05,
4194
+ "loss": 3.0252,
4195
+ "step": 598
4196
+ },
4197
+ {
4198
+ "epoch": 0.7691813804173355,
4199
+ "grad_norm": 8.217205047607422,
4200
+ "learning_rate": 1.2761390862810907e-05,
4201
+ "loss": 3.4848,
4202
+ "step": 599
4203
+ },
4204
+ {
4205
+ "epoch": 0.7704654895666132,
4206
+ "grad_norm": 9.806517601013184,
4207
+ "learning_rate": 1.2626268791036767e-05,
4208
+ "loss": 4.4466,
4209
+ "step": 600
4210
+ },
4211
+ {
4212
+ "epoch": 0.7717495987158909,
4213
+ "grad_norm": 11.450250625610352,
4214
+ "learning_rate": 1.2491762439865035e-05,
4215
+ "loss": 4.424,
4216
+ "step": 601
4217
+ },
4218
+ {
4219
+ "epoch": 0.7730337078651686,
4220
+ "grad_norm": 9.043597221374512,
4221
+ "learning_rate": 1.235787402524603e-05,
4222
+ "loss": 4.3972,
4223
+ "step": 602
4224
+ },
4225
+ {
4226
+ "epoch": 0.7743178170144462,
4227
+ "grad_norm": 7.3961005210876465,
4228
+ "learning_rate": 1.2224605752949786e-05,
4229
+ "loss": 3.427,
4230
+ "step": 603
4231
+ },
4232
+ {
4233
+ "epoch": 0.7756019261637239,
4234
+ "grad_norm": 6.560033798217773,
4235
+ "learning_rate": 1.2091959818529636e-05,
4236
+ "loss": 2.9121,
4237
+ "step": 604
4238
+ },
4239
+ {
4240
+ "epoch": 0.7768860353130016,
4241
+ "grad_norm": 6.344895839691162,
4242
+ "learning_rate": 1.1959938407286097e-05,
4243
+ "loss": 2.7389,
4244
+ "step": 605
4245
+ },
4246
+ {
4247
+ "epoch": 0.7781701444622793,
4248
+ "grad_norm": 6.930209159851074,
4249
+ "learning_rate": 1.1828543694230909e-05,
4250
+ "loss": 2.8555,
4251
+ "step": 606
4252
+ },
4253
+ {
4254
+ "epoch": 0.779454253611557,
4255
+ "grad_norm": 8.534188270568848,
4256
+ "learning_rate": 1.1697777844051105e-05,
4257
+ "loss": 3.666,
4258
+ "step": 607
4259
+ },
4260
+ {
4261
+ "epoch": 0.7807383627608346,
4262
+ "grad_norm": 7.170388698577881,
4263
+ "learning_rate": 1.1567643011073392e-05,
4264
+ "loss": 3.4359,
4265
+ "step": 608
4266
+ },
4267
+ {
4268
+ "epoch": 0.7820224719101123,
4269
+ "grad_norm": 9.166104316711426,
4270
+ "learning_rate": 1.143814133922872e-05,
4271
+ "loss": 4.8905,
4272
+ "step": 609
4273
+ },
4274
+ {
4275
+ "epoch": 0.78330658105939,
4276
+ "grad_norm": 6.033200740814209,
4277
+ "learning_rate": 1.1309274962016852e-05,
4278
+ "loss": 2.4617,
4279
+ "step": 610
4280
+ },
4281
+ {
4282
+ "epoch": 0.7845906902086678,
4283
+ "grad_norm": 6.07381534576416,
4284
+ "learning_rate": 1.118104600247129e-05,
4285
+ "loss": 2.538,
4286
+ "step": 611
4287
+ },
4288
+ {
4289
+ "epoch": 0.7858747993579455,
4290
+ "grad_norm": 5.244448184967041,
4291
+ "learning_rate": 1.105345657312427e-05,
4292
+ "loss": 2.54,
4293
+ "step": 612
4294
+ },
4295
+ {
4296
+ "epoch": 0.7871589085072231,
4297
+ "grad_norm": 5.321899890899658,
4298
+ "learning_rate": 1.0926508775971994e-05,
4299
+ "loss": 2.1132,
4300
+ "step": 613
4301
+ },
4302
+ {
4303
+ "epoch": 0.7884430176565008,
4304
+ "grad_norm": 6.152393817901611,
4305
+ "learning_rate": 1.0800204702439937e-05,
4306
+ "loss": 2.3172,
4307
+ "step": 614
4308
+ },
4309
+ {
4310
+ "epoch": 0.7897271268057785,
4311
+ "grad_norm": 6.6167755126953125,
4312
+ "learning_rate": 1.0674546433348454e-05,
4313
+ "loss": 3.0088,
4314
+ "step": 615
4315
+ },
4316
+ {
4317
+ "epoch": 0.7910112359550562,
4318
+ "grad_norm": 6.641218185424805,
4319
+ "learning_rate": 1.0549536038878432e-05,
4320
+ "loss": 2.767,
4321
+ "step": 616
4322
+ },
4323
+ {
4324
+ "epoch": 0.7922953451043339,
4325
+ "grad_norm": 6.559841632843018,
4326
+ "learning_rate": 1.0425175578537299e-05,
4327
+ "loss": 2.868,
4328
+ "step": 617
4329
+ },
4330
+ {
4331
+ "epoch": 0.7935794542536115,
4332
+ "grad_norm": 5.474334716796875,
4333
+ "learning_rate": 1.0301467101124957e-05,
4334
+ "loss": 2.4926,
4335
+ "step": 618
4336
+ },
4337
+ {
4338
+ "epoch": 0.7948635634028892,
4339
+ "grad_norm": 6.981647491455078,
4340
+ "learning_rate": 1.0178412644700092e-05,
4341
+ "loss": 2.4924,
4342
+ "step": 619
4343
+ },
4344
+ {
4345
+ "epoch": 0.7961476725521669,
4346
+ "grad_norm": 7.217685222625732,
4347
+ "learning_rate": 1.0056014236546646e-05,
4348
+ "loss": 2.7151,
4349
+ "step": 620
4350
+ },
4351
+ {
4352
+ "epoch": 0.7974317817014446,
4353
+ "grad_norm": 6.473022937774658,
4354
+ "learning_rate": 9.934273893140334e-06,
4355
+ "loss": 2.993,
4356
+ "step": 621
4357
+ },
4358
+ {
4359
+ "epoch": 0.7987158908507224,
4360
+ "grad_norm": 7.541585922241211,
4361
+ "learning_rate": 9.813193620115447e-06,
4362
+ "loss": 3.0707,
4363
+ "step": 622
4364
+ },
4365
+ {
4366
+ "epoch": 0.8,
4367
+ "grad_norm": 7.679656982421875,
4368
+ "learning_rate": 9.692775412231863e-06,
4369
+ "loss": 2.8438,
4370
+ "step": 623
4371
+ },
4372
+ {
4373
+ "epoch": 0.8012841091492777,
4374
+ "grad_norm": 8.271646499633789,
4375
+ "learning_rate": 9.573021253342112e-06,
4376
+ "loss": 3.336,
4377
+ "step": 624
4378
+ },
4379
+ {
4380
+ "epoch": 0.8025682182985554,
4381
+ "grad_norm": 6.968911170959473,
4382
+ "learning_rate": 9.453933116358715e-06,
4383
+ "loss": 3.3099,
4384
+ "step": 625
4385
+ },
4386
+ {
4387
+ "epoch": 0.8038523274478331,
4388
+ "grad_norm": 8.06809139251709,
4389
+ "learning_rate": 9.335512963221732e-06,
4390
+ "loss": 3.7685,
4391
+ "step": 626
4392
+ },
4393
+ {
4394
+ "epoch": 0.8051364365971108,
4395
+ "grad_norm": 5.143070697784424,
4396
+ "learning_rate": 9.21776274486636e-06,
4397
+ "loss": 2.0179,
4398
+ "step": 627
4399
+ },
4400
+ {
4401
+ "epoch": 0.8064205457463884,
4402
+ "grad_norm": 9.07247543334961,
4403
+ "learning_rate": 9.100684401190828e-06,
4404
+ "loss": 4.0971,
4405
+ "step": 628
4406
+ },
4407
+ {
4408
+ "epoch": 0.8077046548956661,
4409
+ "grad_norm": 7.609170436859131,
4410
+ "learning_rate": 8.984279861024453e-06,
4411
+ "loss": 3.4357,
4412
+ "step": 629
4413
+ },
4414
+ {
4415
+ "epoch": 0.8089887640449438,
4416
+ "grad_norm": 6.985535144805908,
4417
+ "learning_rate": 8.868551042095851e-06,
4418
+ "loss": 3.1968,
4419
+ "step": 630
4420
+ },
4421
+ {
4422
+ "epoch": 0.8102728731942215,
4423
+ "grad_norm": 7.812018394470215,
4424
+ "learning_rate": 8.75349985100134e-06,
4425
+ "loss": 3.5456,
4426
+ "step": 631
4427
+ },
4428
+ {
4429
+ "epoch": 0.8115569823434992,
4430
+ "grad_norm": 8.600101470947266,
4431
+ "learning_rate": 8.639128183173518e-06,
4432
+ "loss": 3.4957,
4433
+ "step": 632
4434
+ },
4435
+ {
4436
+ "epoch": 0.8128410914927768,
4437
+ "grad_norm": 8.320466995239258,
4438
+ "learning_rate": 8.525437922850032e-06,
4439
+ "loss": 2.7815,
4440
+ "step": 633
4441
+ },
4442
+ {
4443
+ "epoch": 0.8141252006420546,
4444
+ "grad_norm": 7.145605564117432,
4445
+ "learning_rate": 8.412430943042615e-06,
4446
+ "loss": 3.4794,
4447
+ "step": 634
4448
+ },
4449
+ {
4450
+ "epoch": 0.8154093097913323,
4451
+ "grad_norm": 5.847816467285156,
4452
+ "learning_rate": 8.30010910550611e-06,
4453
+ "loss": 2.7875,
4454
+ "step": 635
4455
+ },
4456
+ {
4457
+ "epoch": 0.81669341894061,
4458
+ "grad_norm": 7.271595001220703,
4459
+ "learning_rate": 8.188474260707858e-06,
4460
+ "loss": 3.5987,
4461
+ "step": 636
4462
+ },
4463
+ {
4464
+ "epoch": 0.8179775280898877,
4465
+ "grad_norm": 8.630016326904297,
4466
+ "learning_rate": 8.077528247797234e-06,
4467
+ "loss": 3.823,
4468
+ "step": 637
4469
+ },
4470
+ {
4471
+ "epoch": 0.8192616372391653,
4472
+ "grad_norm": 9.090782165527344,
4473
+ "learning_rate": 7.967272894575312e-06,
4474
+ "loss": 4.3018,
4475
+ "step": 638
4476
+ },
4477
+ {
4478
+ "epoch": 0.820545746388443,
4479
+ "grad_norm": 9.1552152633667,
4480
+ "learning_rate": 7.857710017464737e-06,
4481
+ "loss": 4.3033,
4482
+ "step": 639
4483
+ },
4484
+ {
4485
+ "epoch": 0.8218298555377207,
4486
+ "grad_norm": 8.0325288772583,
4487
+ "learning_rate": 7.748841421479874e-06,
4488
+ "loss": 3.1759,
4489
+ "step": 640
4490
+ },
4491
+ {
4492
+ "epoch": 0.8231139646869984,
4493
+ "grad_norm": 11.123332023620605,
4494
+ "learning_rate": 7.640668900196984e-06,
4495
+ "loss": 4.3347,
4496
+ "step": 641
4497
+ },
4498
+ {
4499
+ "epoch": 0.8243980738362761,
4500
+ "grad_norm": 6.174449920654297,
4501
+ "learning_rate": 7.533194235724728e-06,
4502
+ "loss": 2.8528,
4503
+ "step": 642
4504
+ },
4505
+ {
4506
+ "epoch": 0.8256821829855537,
4507
+ "grad_norm": 7.3835554122924805,
4508
+ "learning_rate": 7.426419198674772e-06,
4509
+ "loss": 3.7275,
4510
+ "step": 643
4511
+ },
4512
+ {
4513
+ "epoch": 0.8269662921348314,
4514
+ "grad_norm": 7.971776962280273,
4515
+ "learning_rate": 7.320345548132679e-06,
4516
+ "loss": 3.6571,
4517
+ "step": 644
4518
+ },
4519
+ {
4520
+ "epoch": 0.8282504012841091,
4521
+ "grad_norm": 7.133792400360107,
4522
+ "learning_rate": 7.214975031628857e-06,
4523
+ "loss": 3.1931,
4524
+ "step": 645
4525
+ },
4526
+ {
4527
+ "epoch": 0.8295345104333869,
4528
+ "grad_norm": 6.824871063232422,
4529
+ "learning_rate": 7.110309385109803e-06,
4530
+ "loss": 2.9311,
4531
+ "step": 646
4532
+ },
4533
+ {
4534
+ "epoch": 0.8308186195826646,
4535
+ "grad_norm": 9.289607048034668,
4536
+ "learning_rate": 7.006350332909495e-06,
4537
+ "loss": 4.1846,
4538
+ "step": 647
4539
+ },
4540
+ {
4541
+ "epoch": 0.8321027287319422,
4542
+ "grad_norm": 7.833906173706055,
4543
+ "learning_rate": 6.9030995877210236e-06,
4544
+ "loss": 4.1159,
4545
+ "step": 648
4546
+ },
4547
+ {
4548
+ "epoch": 0.8333868378812199,
4549
+ "grad_norm": 8.37000846862793,
4550
+ "learning_rate": 6.800558850568295e-06,
4551
+ "loss": 4.2116,
4552
+ "step": 649
4553
+ },
4554
+ {
4555
+ "epoch": 0.8346709470304976,
4556
+ "grad_norm": 10.978693008422852,
4557
+ "learning_rate": 6.698729810778065e-06,
4558
+ "loss": 3.4674,
4559
+ "step": 650
4560
+ },
4561
+ {
4562
+ "epoch": 0.8359550561797753,
4563
+ "grad_norm": 7.171913146972656,
4564
+ "learning_rate": 6.5976141459521355e-06,
4565
+ "loss": 3.5828,
4566
+ "step": 651
4567
+ },
4568
+ {
4569
+ "epoch": 0.837239165329053,
4570
+ "grad_norm": 41.70615005493164,
4571
+ "learning_rate": 6.497213521939638e-06,
4572
+ "loss": 5.1122,
4573
+ "step": 652
4574
+ },
4575
+ {
4576
+ "epoch": 0.8385232744783306,
4577
+ "grad_norm": 13.99992847442627,
4578
+ "learning_rate": 6.397529592809614e-06,
4579
+ "loss": 4.6054,
4580
+ "step": 653
4581
+ },
4582
+ {
4583
+ "epoch": 0.8398073836276083,
4584
+ "grad_norm": 8.354331016540527,
4585
+ "learning_rate": 6.298564000823848e-06,
4586
+ "loss": 3.5946,
4587
+ "step": 654
4588
+ },
4589
+ {
4590
+ "epoch": 0.841091492776886,
4591
+ "grad_norm": 7.070506572723389,
4592
+ "learning_rate": 6.2003183764096695e-06,
4593
+ "loss": 3.7046,
4594
+ "step": 655
4595
+ },
4596
+ {
4597
+ "epoch": 0.8423756019261637,
4598
+ "grad_norm": 5.7081098556518555,
4599
+ "learning_rate": 6.102794338133194e-06,
4600
+ "loss": 2.483,
4601
+ "step": 656
4602
+ },
4603
+ {
4604
+ "epoch": 0.8436597110754415,
4605
+ "grad_norm": 9.342689514160156,
4606
+ "learning_rate": 6.005993492672657e-06,
4607
+ "loss": 2.9348,
4608
+ "step": 657
4609
+ },
4610
+ {
4611
+ "epoch": 0.8449438202247191,
4612
+ "grad_norm": 7.0432448387146,
4613
+ "learning_rate": 5.909917434791884e-06,
4614
+ "loss": 2.8985,
4615
+ "step": 658
4616
+ },
4617
+ {
4618
+ "epoch": 0.8462279293739968,
4619
+ "grad_norm": 6.8054914474487305,
4620
+ "learning_rate": 5.814567747314048e-06,
4621
+ "loss": 2.8271,
4622
+ "step": 659
4623
+ },
4624
+ {
4625
+ "epoch": 0.8475120385232745,
4626
+ "grad_norm": 8.824766159057617,
4627
+ "learning_rate": 5.719946001095616e-06,
4628
+ "loss": 2.989,
4629
+ "step": 660
4630
+ },
4631
+ {
4632
+ "epoch": 0.8487961476725522,
4633
+ "grad_norm": 9.073323249816895,
4634
+ "learning_rate": 5.626053755000421e-06,
4635
+ "loss": 4.1554,
4636
+ "step": 661
4637
+ },
4638
+ {
4639
+ "epoch": 0.8500802568218299,
4640
+ "grad_norm": 7.761005878448486,
4641
+ "learning_rate": 5.532892555874059e-06,
4642
+ "loss": 3.0684,
4643
+ "step": 662
4644
+ },
4645
+ {
4646
+ "epoch": 0.8513643659711075,
4647
+ "grad_norm": 6.79564905166626,
4648
+ "learning_rate": 5.440463938518303e-06,
4649
+ "loss": 3.2882,
4650
+ "step": 663
4651
+ },
4652
+ {
4653
+ "epoch": 0.8526484751203852,
4654
+ "grad_norm": 6.906652927398682,
4655
+ "learning_rate": 5.348769425665884e-06,
4656
+ "loss": 2.5132,
4657
+ "step": 664
4658
+ },
4659
+ {
4660
+ "epoch": 0.8539325842696629,
4661
+ "grad_norm": 9.796558380126953,
4662
+ "learning_rate": 5.257810527955409e-06,
4663
+ "loss": 3.4539,
4664
+ "step": 665
4665
+ },
4666
+ {
4667
+ "epoch": 0.8552166934189406,
4668
+ "grad_norm": 6.042295932769775,
4669
+ "learning_rate": 5.167588743906432e-06,
4670
+ "loss": 3.2793,
4671
+ "step": 666
4672
+ },
4673
+ {
4674
+ "epoch": 0.8565008025682183,
4675
+ "grad_norm": 5.436481475830078,
4676
+ "learning_rate": 5.078105559894791e-06,
4677
+ "loss": 2.2979,
4678
+ "step": 667
4679
+ },
4680
+ {
4681
+ "epoch": 0.8577849117174959,
4682
+ "grad_norm": 6.132842540740967,
4683
+ "learning_rate": 4.989362450128132e-06,
4684
+ "loss": 3.2762,
4685
+ "step": 668
4686
+ },
4687
+ {
4688
+ "epoch": 0.8590690208667737,
4689
+ "grad_norm": 19.214025497436523,
4690
+ "learning_rate": 4.901360876621597e-06,
4691
+ "loss": 4.0499,
4692
+ "step": 669
4693
+ },
4694
+ {
4695
+ "epoch": 0.8603531300160514,
4696
+ "grad_norm": 8.764758110046387,
4697
+ "learning_rate": 4.814102289173733e-06,
4698
+ "loss": 4.1739,
4699
+ "step": 670
4700
+ },
4701
+ {
4702
+ "epoch": 0.8616372391653291,
4703
+ "grad_norm": 6.873488426208496,
4704
+ "learning_rate": 4.727588125342669e-06,
4705
+ "loss": 3.2478,
4706
+ "step": 671
4707
+ },
4708
+ {
4709
+ "epoch": 0.8629213483146068,
4710
+ "grad_norm": 7.347663402557373,
4711
+ "learning_rate": 4.641819810422343e-06,
4712
+ "loss": 3.4422,
4713
+ "step": 672
4714
+ },
4715
+ {
4716
+ "epoch": 0.8642054574638844,
4717
+ "grad_norm": 7.408380031585693,
4718
+ "learning_rate": 4.556798757419068e-06,
4719
+ "loss": 2.9899,
4720
+ "step": 673
4721
+ },
4722
+ {
4723
+ "epoch": 0.8654895666131621,
4724
+ "grad_norm": 8.367959976196289,
4725
+ "learning_rate": 4.4725263670282905e-06,
4726
+ "loss": 3.5648,
4727
+ "step": 674
4728
+ },
4729
+ {
4730
+ "epoch": 0.8667736757624398,
4731
+ "grad_norm": 7.556360721588135,
4732
+ "learning_rate": 4.389004027611404e-06,
4733
+ "loss": 2.9127,
4734
+ "step": 675
4735
+ },
4736
+ {
4737
+ "epoch": 0.8680577849117175,
4738
+ "grad_norm": 6.504035949707031,
4739
+ "learning_rate": 4.3062331151730085e-06,
4740
+ "loss": 3.0646,
4741
+ "step": 676
4742
+ },
4743
+ {
4744
+ "epoch": 0.8693418940609952,
4745
+ "grad_norm": 7.055627346038818,
4746
+ "learning_rate": 4.224214993338149e-06,
4747
+ "loss": 2.8481,
4748
+ "step": 677
4749
+ },
4750
+ {
4751
+ "epoch": 0.8706260032102728,
4752
+ "grad_norm": 7.115128993988037,
4753
+ "learning_rate": 4.142951013329871e-06,
4754
+ "loss": 4.4346,
4755
+ "step": 678
4756
+ },
4757
+ {
4758
+ "epoch": 0.8719101123595505,
4759
+ "grad_norm": 8.57867431640625,
4760
+ "learning_rate": 4.062442513947007e-06,
4761
+ "loss": 3.6861,
4762
+ "step": 679
4763
+ },
4764
+ {
4765
+ "epoch": 0.8731942215088283,
4766
+ "grad_norm": 6.359896659851074,
4767
+ "learning_rate": 3.982690821542035e-06,
4768
+ "loss": 2.2618,
4769
+ "step": 680
4770
+ },
4771
+ {
4772
+ "epoch": 0.874478330658106,
4773
+ "grad_norm": 12.26174545288086,
4774
+ "learning_rate": 3.903697249999289e-06,
4775
+ "loss": 4.1746,
4776
+ "step": 681
4777
+ },
4778
+ {
4779
+ "epoch": 0.8757624398073837,
4780
+ "grad_norm": 7.479450225830078,
4781
+ "learning_rate": 3.8254631007133165e-06,
4782
+ "loss": 3.7854,
4783
+ "step": 682
4784
+ },
4785
+ {
4786
+ "epoch": 0.8770465489566613,
4787
+ "grad_norm": 8.125298500061035,
4788
+ "learning_rate": 3.7479896625674027e-06,
4789
+ "loss": 4.0026,
4790
+ "step": 683
4791
+ },
4792
+ {
4793
+ "epoch": 0.878330658105939,
4794
+ "grad_norm": 7.174749374389648,
4795
+ "learning_rate": 3.671278211912338e-06,
4796
+ "loss": 2.8277,
4797
+ "step": 684
4798
+ },
4799
+ {
4800
+ "epoch": 0.8796147672552167,
4801
+ "grad_norm": 7.114166736602783,
4802
+ "learning_rate": 3.595330012545445e-06,
4803
+ "loss": 3.7241,
4804
+ "step": 685
4805
+ },
4806
+ {
4807
+ "epoch": 0.8808988764044944,
4808
+ "grad_norm": 6.707666397094727,
4809
+ "learning_rate": 3.520146315689693e-06,
4810
+ "loss": 2.3535,
4811
+ "step": 686
4812
+ },
4813
+ {
4814
+ "epoch": 0.8821829855537721,
4815
+ "grad_norm": 7.33445930480957,
4816
+ "learning_rate": 3.445728359973094e-06,
4817
+ "loss": 3.2334,
4818
+ "step": 687
4819
+ },
4820
+ {
4821
+ "epoch": 0.8834670947030497,
4822
+ "grad_norm": 8.785782814025879,
4823
+ "learning_rate": 3.372077371408361e-06,
4824
+ "loss": 4.1721,
4825
+ "step": 688
4826
+ },
4827
+ {
4828
+ "epoch": 0.8847512038523274,
4829
+ "grad_norm": 6.775081634521484,
4830
+ "learning_rate": 3.299194563372604e-06,
4831
+ "loss": 2.6997,
4832
+ "step": 689
4833
+ },
4834
+ {
4835
+ "epoch": 0.8860353130016051,
4836
+ "grad_norm": 6.534388542175293,
4837
+ "learning_rate": 3.22708113658744e-06,
4838
+ "loss": 2.798,
4839
+ "step": 690
4840
+ },
4841
+ {
4842
+ "epoch": 0.8873194221508828,
4843
+ "grad_norm": 7.509188175201416,
4844
+ "learning_rate": 3.1557382790991687e-06,
4845
+ "loss": 3.993,
4846
+ "step": 691
4847
+ },
4848
+ {
4849
+ "epoch": 0.8886035313001606,
4850
+ "grad_norm": 6.793269157409668,
4851
+ "learning_rate": 3.085167166259162e-06,
4852
+ "loss": 2.6374,
4853
+ "step": 692
4854
+ },
4855
+ {
4856
+ "epoch": 0.8898876404494382,
4857
+ "grad_norm": 7.3369340896606445,
4858
+ "learning_rate": 3.0153689607045845e-06,
4859
+ "loss": 2.9641,
4860
+ "step": 693
4861
+ },
4862
+ {
4863
+ "epoch": 0.8911717495987159,
4864
+ "grad_norm": 8.02035140991211,
4865
+ "learning_rate": 2.9463448123391634e-06,
4866
+ "loss": 4.3613,
4867
+ "step": 694
4868
+ },
4869
+ {
4870
+ "epoch": 0.8924558587479936,
4871
+ "grad_norm": 6.194982051849365,
4872
+ "learning_rate": 2.878095858314278e-06,
4873
+ "loss": 2.8135,
4874
+ "step": 695
4875
+ },
4876
+ {
4877
+ "epoch": 0.8937399678972713,
4878
+ "grad_norm": 8.533773422241211,
4879
+ "learning_rate": 2.8106232230102448e-06,
4880
+ "loss": 3.7117,
4881
+ "step": 696
4882
+ },
4883
+ {
4884
+ "epoch": 0.895024077046549,
4885
+ "grad_norm": 7.960330009460449,
4886
+ "learning_rate": 2.743928018017744e-06,
4887
+ "loss": 2.6817,
4888
+ "step": 697
4889
+ },
4890
+ {
4891
+ "epoch": 0.8963081861958266,
4892
+ "grad_norm": 8.209793090820312,
4893
+ "learning_rate": 2.6780113421195298e-06,
4894
+ "loss": 2.9455,
4895
+ "step": 698
4896
+ },
4897
+ {
4898
+ "epoch": 0.8975922953451043,
4899
+ "grad_norm": 7.603484153747559,
4900
+ "learning_rate": 2.6128742812723704e-06,
4901
+ "loss": 2.6488,
4902
+ "step": 699
4903
+ },
4904
+ {
4905
+ "epoch": 0.898876404494382,
4906
+ "grad_norm": 7.542942523956299,
4907
+ "learning_rate": 2.5485179085890767e-06,
4908
+ "loss": 3.0132,
4909
+ "step": 700
4910
+ },
4911
+ {
4912
+ "epoch": 0.9001605136436597,
4913
+ "grad_norm": 8.356619834899902,
4914
+ "learning_rate": 2.4849432843208785e-06,
4915
+ "loss": 3.2918,
4916
+ "step": 701
4917
+ },
4918
+ {
4919
+ "epoch": 0.9014446227929374,
4920
+ "grad_norm": 8.363799095153809,
4921
+ "learning_rate": 2.4221514558399548e-06,
4922
+ "loss": 2.7588,
4923
+ "step": 702
4924
+ },
4925
+ {
4926
+ "epoch": 0.902728731942215,
4927
+ "grad_norm": 5.096377372741699,
4928
+ "learning_rate": 2.3601434576221546e-06,
4929
+ "loss": 2.2989,
4930
+ "step": 703
4931
+ },
4932
+ {
4933
+ "epoch": 0.9040128410914928,
4934
+ "grad_norm": 6.197229385375977,
4935
+ "learning_rate": 2.2989203112299684e-06,
4936
+ "loss": 3.1503,
4937
+ "step": 704
4938
+ },
4939
+ {
4940
+ "epoch": 0.9052969502407705,
4941
+ "grad_norm": 6.665518760681152,
4942
+ "learning_rate": 2.238483025295707e-06,
4943
+ "loss": 3.6999,
4944
+ "step": 705
4945
+ },
4946
+ {
4947
+ "epoch": 0.9065810593900482,
4948
+ "grad_norm": 5.712645053863525,
4949
+ "learning_rate": 2.178832595504854e-06,
4950
+ "loss": 2.3308,
4951
+ "step": 706
4952
+ },
4953
+ {
4954
+ "epoch": 0.9078651685393259,
4955
+ "grad_norm": 8.328536033630371,
4956
+ "learning_rate": 2.1199700045797077e-06,
4957
+ "loss": 4.041,
4958
+ "step": 707
4959
+ },
4960
+ {
4961
+ "epoch": 0.9091492776886035,
4962
+ "grad_norm": 7.73337984085083,
4963
+ "learning_rate": 2.0618962222631432e-06,
4964
+ "loss": 3.2939,
4965
+ "step": 708
4966
+ },
4967
+ {
4968
+ "epoch": 0.9104333868378812,
4969
+ "grad_norm": 5.95501184463501,
4970
+ "learning_rate": 2.0046122053026694e-06,
4971
+ "loss": 3.0356,
4972
+ "step": 709
4973
+ },
4974
+ {
4975
+ "epoch": 0.9117174959871589,
4976
+ "grad_norm": 7.161213397979736,
4977
+ "learning_rate": 1.9481188974346696e-06,
4978
+ "loss": 3.4678,
4979
+ "step": 710
4980
+ },
4981
+ {
4982
+ "epoch": 0.9130016051364366,
4983
+ "grad_norm": 5.240903377532959,
4984
+ "learning_rate": 1.8924172293688147e-06,
4985
+ "loss": 2.3559,
4986
+ "step": 711
4987
+ },
4988
+ {
4989
+ "epoch": 0.9142857142857143,
4990
+ "grad_norm": 7.932479381561279,
4991
+ "learning_rate": 1.8375081187727683e-06,
4992
+ "loss": 3.328,
4993
+ "step": 712
4994
+ },
4995
+ {
4996
+ "epoch": 0.9155698234349919,
4997
+ "grad_norm": 7.0898051261901855,
4998
+ "learning_rate": 1.7833924702570725e-06,
4999
+ "loss": 3.2187,
5000
+ "step": 713
5001
+ },
5002
+ {
5003
+ "epoch": 0.9168539325842696,
5004
+ "grad_norm": 6.591296195983887,
5005
+ "learning_rate": 1.7300711753601983e-06,
5006
+ "loss": 2.724,
5007
+ "step": 714
5008
+ },
5009
+ {
5010
+ "epoch": 0.9181380417335474,
5011
+ "grad_norm": 6.064736843109131,
5012
+ "learning_rate": 1.6775451125338959e-06,
5013
+ "loss": 2.4792,
5014
+ "step": 715
5015
+ },
5016
+ {
5017
+ "epoch": 0.9194221508828251,
5018
+ "grad_norm": 6.841074466705322,
5019
+ "learning_rate": 1.6258151471287396e-06,
5020
+ "loss": 3.3866,
5021
+ "step": 716
5022
+ },
5023
+ {
5024
+ "epoch": 0.9207062600321028,
5025
+ "grad_norm": 7.046119689941406,
5026
+ "learning_rate": 1.5748821313798124e-06,
5027
+ "loss": 3.0957,
5028
+ "step": 717
5029
+ },
5030
+ {
5031
+ "epoch": 0.9219903691813804,
5032
+ "grad_norm": 9.995711326599121,
5033
+ "learning_rate": 1.5247469043927155e-06,
5034
+ "loss": 3.2111,
5035
+ "step": 718
5036
+ },
5037
+ {
5038
+ "epoch": 0.9232744783306581,
5039
+ "grad_norm": 7.515043258666992,
5040
+ "learning_rate": 1.4754102921297364e-06,
5041
+ "loss": 2.9457,
5042
+ "step": 719
5043
+ },
5044
+ {
5045
+ "epoch": 0.9245585874799358,
5046
+ "grad_norm": 6.644869804382324,
5047
+ "learning_rate": 1.4268731073962094e-06,
5048
+ "loss": 3.0214,
5049
+ "step": 720
5050
+ },
5051
+ {
5052
+ "epoch": 0.9258426966292135,
5053
+ "grad_norm": 6.054432392120361,
5054
+ "learning_rate": 1.3791361498271705e-06,
5055
+ "loss": 2.3658,
5056
+ "step": 721
5057
+ },
5058
+ {
5059
+ "epoch": 0.9271268057784912,
5060
+ "grad_norm": 6.487936019897461,
5061
+ "learning_rate": 1.3322002058741678e-06,
5062
+ "loss": 2.7158,
5063
+ "step": 722
5064
+ },
5065
+ {
5066
+ "epoch": 0.9284109149277688,
5067
+ "grad_norm": 9.501916885375977,
5068
+ "learning_rate": 1.2860660487922616e-06,
5069
+ "loss": 3.9914,
5070
+ "step": 723
5071
+ },
5072
+ {
5073
+ "epoch": 0.9296950240770465,
5074
+ "grad_norm": 6.846279621124268,
5075
+ "learning_rate": 1.2407344386273611e-06,
5076
+ "loss": 2.7458,
5077
+ "step": 724
5078
+ },
5079
+ {
5080
+ "epoch": 0.9309791332263242,
5081
+ "grad_norm": 7.539971828460693,
5082
+ "learning_rate": 1.196206122203647e-06,
5083
+ "loss": 3.3559,
5084
+ "step": 725
5085
+ },
5086
+ {
5087
+ "epoch": 0.9322632423756019,
5088
+ "grad_norm": 8.702104568481445,
5089
+ "learning_rate": 1.1524818331112851e-06,
5090
+ "loss": 3.0192,
5091
+ "step": 726
5092
+ },
5093
+ {
5094
+ "epoch": 0.9335473515248797,
5095
+ "grad_norm": 7.85031270980835,
5096
+ "learning_rate": 1.1095622916943494e-06,
5097
+ "loss": 3.9229,
5098
+ "step": 727
5099
+ },
5100
+ {
5101
+ "epoch": 0.9348314606741573,
5102
+ "grad_norm": 6.479403972625732,
5103
+ "learning_rate": 1.0674482050389455e-06,
5104
+ "loss": 2.739,
5105
+ "step": 728
5106
+ },
5107
+ {
5108
+ "epoch": 0.936115569823435,
5109
+ "grad_norm": 7.294396877288818,
5110
+ "learning_rate": 1.0261402669615505e-06,
5111
+ "loss": 3.6074,
5112
+ "step": 729
5113
+ },
5114
+ {
5115
+ "epoch": 0.9373996789727127,
5116
+ "grad_norm": 7.156967639923096,
5117
+ "learning_rate": 9.856391579976032e-07,
5118
+ "loss": 3.3275,
5119
+ "step": 730
5120
+ },
5121
+ {
5122
+ "epoch": 0.9386837881219904,
5123
+ "grad_norm": 7.538506031036377,
5124
+ "learning_rate": 9.459455453902866e-07,
5125
+ "loss": 3.2643,
5126
+ "step": 731
5127
+ },
5128
+ {
5129
+ "epoch": 0.9399678972712681,
5130
+ "grad_norm": 8.024343490600586,
5131
+ "learning_rate": 9.070600830795251e-07,
5132
+ "loss": 2.7308,
5133
+ "step": 732
5134
+ },
5135
+ {
5136
+ "epoch": 0.9412520064205457,
5137
+ "grad_norm": 9.4879150390625,
5138
+ "learning_rate": 8.68983411691221e-07,
5139
+ "loss": 4.2723,
5140
+ "step": 733
5141
+ },
5142
+ {
5143
+ "epoch": 0.9425361155698234,
5144
+ "grad_norm": 5.0413713455200195,
5145
+ "learning_rate": 8.317161585266964e-07,
5146
+ "loss": 2.3525,
5147
+ "step": 734
5148
+ },
5149
+ {
5150
+ "epoch": 0.9438202247191011,
5151
+ "grad_norm": 8.031405448913574,
5152
+ "learning_rate": 7.952589375523567e-07,
5153
+ "loss": 3.5764,
5154
+ "step": 735
5155
+ },
5156
+ {
5157
+ "epoch": 0.9451043338683788,
5158
+ "grad_norm": 8.198379516601562,
5159
+ "learning_rate": 7.596123493895991e-07,
5160
+ "loss": 4.5045,
5161
+ "step": 736
5162
+ },
5163
+ {
5164
+ "epoch": 0.9463884430176565,
5165
+ "grad_norm": 8.198650360107422,
5166
+ "learning_rate": 7.247769813048644e-07,
5167
+ "loss": 3.1831,
5168
+ "step": 737
5169
+ },
5170
+ {
5171
+ "epoch": 0.9476725521669341,
5172
+ "grad_norm": 6.928763389587402,
5173
+ "learning_rate": 6.907534072000177e-07,
5174
+ "loss": 3.261,
5175
+ "step": 738
5176
+ },
5177
+ {
5178
+ "epoch": 0.9489566613162119,
5179
+ "grad_norm": 6.926176071166992,
5180
+ "learning_rate": 6.57542187602872e-07,
5181
+ "loss": 3.2459,
5182
+ "step": 739
5183
+ },
5184
+ {
5185
+ "epoch": 0.9502407704654896,
5186
+ "grad_norm": 7.637943267822266,
5187
+ "learning_rate": 6.251438696579293e-07,
5188
+ "loss": 3.2871,
5189
+ "step": 740
5190
+ },
5191
+ {
5192
+ "epoch": 0.9515248796147673,
5193
+ "grad_norm": 6.943028450012207,
5194
+ "learning_rate": 5.935589871174208e-07,
5195
+ "loss": 3.2108,
5196
+ "step": 741
5197
+ },
5198
+ {
5199
+ "epoch": 0.952808988764045,
5200
+ "grad_norm": 7.411741256713867,
5201
+ "learning_rate": 5.627880603324532e-07,
5202
+ "loss": 2.5244,
5203
+ "step": 742
5204
+ },
5205
+ {
5206
+ "epoch": 0.9540930979133226,
5207
+ "grad_norm": 8.225117683410645,
5208
+ "learning_rate": 5.328315962444874e-07,
5209
+ "loss": 3.3371,
5210
+ "step": 743
5211
+ },
5212
+ {
5213
+ "epoch": 0.9553772070626003,
5214
+ "grad_norm": 8.055129051208496,
5215
+ "learning_rate": 5.036900883769624e-07,
5216
+ "loss": 3.4518,
5217
+ "step": 744
5218
+ },
5219
+ {
5220
+ "epoch": 0.956661316211878,
5221
+ "grad_norm": 7.234530925750732,
5222
+ "learning_rate": 4.753640168271456e-07,
5223
+ "loss": 3.6194,
5224
+ "step": 745
5225
+ },
5226
+ {
5227
+ "epoch": 0.9579454253611557,
5228
+ "grad_norm": 7.3645920753479,
5229
+ "learning_rate": 4.4785384825826173e-07,
5230
+ "loss": 2.6069,
5231
+ "step": 746
5232
+ },
5233
+ {
5234
+ "epoch": 0.9592295345104334,
5235
+ "grad_norm": 8.200539588928223,
5236
+ "learning_rate": 4.2116003589179887e-07,
5237
+ "loss": 3.3361,
5238
+ "step": 747
5239
+ },
5240
+ {
5241
+ "epoch": 0.960513643659711,
5242
+ "grad_norm": 7.350730895996094,
5243
+ "learning_rate": 3.9528301950000347e-07,
5244
+ "loss": 3.3126,
5245
+ "step": 748
5246
+ },
5247
+ {
5248
+ "epoch": 0.9617977528089887,
5249
+ "grad_norm": 9.777095794677734,
5250
+ "learning_rate": 3.702232253986804e-07,
5251
+ "loss": 3.9927,
5252
+ "step": 749
5253
+ },
5254
+ {
5255
+ "epoch": 0.9630818619582665,
5256
+ "grad_norm": 7.824852466583252,
5257
+ "learning_rate": 3.459810664401486e-07,
5258
+ "loss": 2.5065,
5259
+ "step": 750
5260
+ },
5261
+ {
5262
+ "epoch": 0.9643659711075442,
5263
+ "grad_norm": 14.027108192443848,
5264
+ "learning_rate": 3.2255694200643006e-07,
5265
+ "loss": 4.6722,
5266
+ "step": 751
5267
+ },
5268
+ {
5269
+ "epoch": 0.9656500802568219,
5270
+ "grad_norm": 8.372529983520508,
5271
+ "learning_rate": 2.9995123800270476e-07,
5272
+ "loss": 3.2234,
5273
+ "step": 752
5274
+ },
5275
+ {
5276
+ "epoch": 0.9669341894060995,
5277
+ "grad_norm": 6.728063583374023,
5278
+ "learning_rate": 2.78164326850916e-07,
5279
+ "loss": 3.0884,
5280
+ "step": 753
5281
+ },
5282
+ {
5283
+ "epoch": 0.9682182985553772,
5284
+ "grad_norm": 6.5443902015686035,
5285
+ "learning_rate": 2.571965674836418e-07,
5286
+ "loss": 3.1012,
5287
+ "step": 754
5288
+ },
5289
+ {
5290
+ "epoch": 0.9695024077046549,
5291
+ "grad_norm": 5.537796497344971,
5292
+ "learning_rate": 2.3704830533821108e-07,
5293
+ "loss": 2.5468,
5294
+ "step": 755
5295
+ },
5296
+ {
5297
+ "epoch": 0.9707865168539326,
5298
+ "grad_norm": 5.125178813934326,
5299
+ "learning_rate": 2.177198723509688e-07,
5300
+ "loss": 2.5329,
5301
+ "step": 756
5302
+ },
5303
+ {
5304
+ "epoch": 0.9720706260032103,
5305
+ "grad_norm": 5.382153034210205,
5306
+ "learning_rate": 1.9921158695184738e-07,
5307
+ "loss": 2.6962,
5308
+ "step": 757
5309
+ },
5310
+ {
5311
+ "epoch": 0.9733547351524879,
5312
+ "grad_norm": 8.632124900817871,
5313
+ "learning_rate": 1.8152375405909305e-07,
5314
+ "loss": 4.1156,
5315
+ "step": 758
5316
+ },
5317
+ {
5318
+ "epoch": 0.9746388443017656,
5319
+ "grad_norm": 8.175101280212402,
5320
+ "learning_rate": 1.6465666507425315e-07,
5321
+ "loss": 3.4293,
5322
+ "step": 759
5323
+ },
5324
+ {
5325
+ "epoch": 0.9759229534510433,
5326
+ "grad_norm": 6.334762096405029,
5327
+ "learning_rate": 1.4861059787736885e-07,
5328
+ "loss": 3.3443,
5329
+ "step": 760
5330
+ },
5331
+ {
5332
+ "epoch": 0.977207062600321,
5333
+ "grad_norm": 7.494119644165039,
5334
+ "learning_rate": 1.333858168224178e-07,
5335
+ "loss": 2.7613,
5336
+ "step": 761
5337
+ },
5338
+ {
5339
+ "epoch": 0.9784911717495988,
5340
+ "grad_norm": 10.954715728759766,
5341
+ "learning_rate": 1.1898257273292857e-07,
5342
+ "loss": 2.9989,
5343
+ "step": 762
5344
+ },
5345
+ {
5346
+ "epoch": 0.9797752808988764,
5347
+ "grad_norm": 6.305700302124023,
5348
+ "learning_rate": 1.0540110289786742e-07,
5349
+ "loss": 3.4268,
5350
+ "step": 763
5351
+ },
5352
+ {
5353
+ "epoch": 0.9810593900481541,
5354
+ "grad_norm": 8.201444625854492,
5355
+ "learning_rate": 9.264163106774137e-08,
5356
+ "loss": 3.5135,
5357
+ "step": 764
5358
+ },
5359
+ {
5360
+ "epoch": 0.9823434991974318,
5361
+ "grad_norm": 7.041131019592285,
5362
+ "learning_rate": 8.07043674508623e-08,
5363
+ "loss": 3.0868,
5364
+ "step": 765
5365
+ },
5366
+ {
5367
+ "epoch": 0.9836276083467095,
5368
+ "grad_norm": 6.509793281555176,
5369
+ "learning_rate": 6.958950870994963e-08,
5370
+ "loss": 3.5145,
5371
+ "step": 766
5372
+ },
5373
+ {
5374
+ "epoch": 0.9849117174959872,
5375
+ "grad_norm": 7.05448055267334,
5376
+ "learning_rate": 5.929723795884967e-08,
5377
+ "loss": 2.7421,
5378
+ "step": 767
5379
+ },
5380
+ {
5381
+ "epoch": 0.9861958266452648,
5382
+ "grad_norm": 6.827080249786377,
5383
+ "learning_rate": 4.982772475951025e-08,
5384
+ "loss": 2.4727,
5385
+ "step": 768
5386
+ },
5387
+ {
5388
+ "epoch": 0.9874799357945425,
5389
+ "grad_norm": 9.33382511138916,
5390
+ "learning_rate": 4.1181125119221787e-08,
5391
+ "loss": 5.2741,
5392
+ "step": 769
5393
+ },
5394
+ {
5395
+ "epoch": 0.9887640449438202,
5396
+ "grad_norm": 6.9964447021484375,
5397
+ "learning_rate": 3.3357581488030475e-08,
5398
+ "loss": 2.8354,
5399
+ "step": 770
5400
+ },
5401
+ {
5402
+ "epoch": 0.9900481540930979,
5403
+ "grad_norm": 7.825475215911865,
5404
+ "learning_rate": 2.6357222756384636e-08,
5405
+ "loss": 2.8096,
5406
+ "step": 771
5407
+ },
5408
+ {
5409
+ "epoch": 0.9913322632423756,
5410
+ "grad_norm": 6.928106784820557,
5411
+ "learning_rate": 2.0180164253008614e-08,
5412
+ "loss": 3.5979,
5413
+ "step": 772
5414
+ },
5415
+ {
5416
+ "epoch": 0.9926163723916533,
5417
+ "grad_norm": 7.775272369384766,
5418
+ "learning_rate": 1.482650774303207e-08,
5419
+ "loss": 2.3856,
5420
+ "step": 773
5421
+ },
5422
+ {
5423
+ "epoch": 0.993900481540931,
5424
+ "grad_norm": 10.018780708312988,
5425
+ "learning_rate": 1.0296341426274669e-08,
5426
+ "loss": 4.6775,
5427
+ "step": 774
5428
+ },
5429
+ {
5430
+ "epoch": 0.9951845906902087,
5431
+ "grad_norm": 8.294422149658203,
5432
+ "learning_rate": 6.589739935819461e-09,
5433
+ "loss": 3.1598,
5434
+ "step": 775
5435
+ },
5436
+ {
5437
+ "epoch": 0.9964686998394864,
5438
+ "grad_norm": 8.607256889343262,
5439
+ "learning_rate": 3.7067643367749705e-09,
5440
+ "loss": 3.5999,
5441
+ "step": 776
5442
+ },
5443
+ {
5444
+ "epoch": 0.9977528089887641,
5445
+ "grad_norm": 7.560439586639404,
5446
+ "learning_rate": 1.6474621252704493e-09,
5447
+ "loss": 3.439,
5448
+ "step": 777
5449
+ },
5450
+ {
5451
+ "epoch": 0.9990369181380417,
5452
+ "grad_norm": 11.580015182495117,
5453
+ "learning_rate": 4.118672276620661e-10,
5454
+ "loss": 4.1616,
5455
+ "step": 778
5456
+ },
5457
+ {
5458
+ "epoch": 0.9990369181380417,
5459
+ "eval_loss": 0.8082237243652344,
5460
+ "eval_runtime": 52.1681,
5461
+ "eval_samples_per_second": 12.575,
5462
+ "eval_steps_per_second": 3.144,
5463
+ "step": 778
5464
+ },
5465
+ {
5466
+ "epoch": 1.0009630818619584,
5467
+ "grad_norm": 9.079451560974121,
5468
+ "learning_rate": 0.0,
5469
+ "loss": 4.5426,
5470
+ "step": 779
5471
  }
5472
  ],
5473
  "logging_steps": 1,
 
5482
  "should_evaluate": false,
5483
  "should_log": false,
5484
  "should_save": true,
5485
+ "should_training_stop": true
5486
  },
5487
  "attributes": {}
5488
  }
5489
  },
5490
+ "total_flos": 9.195152422522061e+17,
5491
  "train_batch_size": 4,
5492
  "trial_name": null,
5493
  "trial_params": null