error577 commited on
Commit
63fe405
·
verified ·
1 Parent(s): 86d644f

Training in progress, step 773, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef7cdebd2bed018b33b74e69d70d0b8f51e5885357807742e255db45a166d518
3
  size 1134088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dfe6177dd8e99f8251ddff2bd7c294f0cee74189477b02d858b7d75e148fab5
3
  size 1134088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c59585a9cc2b2a94cc5213a6b1c4ba8d1c93af07d2a2191a22b2ce124f63ac1
3
  size 608186
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45b3328583c18197b000ed09534cc64ce0599779b8718ca17d5cddb71214c0d7
3
  size 608186
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fa808660ddbc52908e9ed2eaba07f255839ebe308cc6b503b8b517a081559eb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d25b65c8871ace8f29fc01c33af8c9535181702318707d9198298d9aa6cccf
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30616cefc52eeaa674739292fccd99b6c970a424ce347526e4f095c2b1685119
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45719461b132f33086549160c158249554ace1a4b2303f17dcf1d7d8dd8db700
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7536419553253481,
5
  "eval_steps": 155,
6
- "global_step": 582,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4113,6 +4113,1351 @@
4113
  "learning_rate": 1.4681072873881312e-05,
4114
  "loss": 1.0705,
4115
  "step": 582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4116
  }
4117
  ],
4118
  "logging_steps": 1,
@@ -4127,12 +5472,12 @@
4127
  "should_evaluate": false,
4128
  "should_log": false,
4129
  "should_save": true,
4130
- "should_training_stop": false
4131
  },
4132
  "attributes": {}
4133
  }
4134
  },
4135
- "total_flos": 819593242214400.0,
4136
  "train_batch_size": 2,
4137
  "trial_name": null,
4138
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0009711880867596,
5
  "eval_steps": 155,
6
+ "global_step": 773,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4113
  "learning_rate": 1.4681072873881312e-05,
4114
  "loss": 1.0705,
4115
  "step": 582
4116
+ },
4117
+ {
4118
+ "epoch": 0.7549368727743606,
4119
+ "grad_norm": 5.166491508483887,
4120
+ "learning_rate": 1.4535650064427003e-05,
4121
+ "loss": 0.6913,
4122
+ "step": 583
4123
+ },
4124
+ {
4125
+ "epoch": 0.7562317902233733,
4126
+ "grad_norm": 13.889178276062012,
4127
+ "learning_rate": 1.439082848693406e-05,
4128
+ "loss": 3.66,
4129
+ "step": 584
4130
+ },
4131
+ {
4132
+ "epoch": 0.7575267076723858,
4133
+ "grad_norm": 2.9717559814453125,
4134
+ "learning_rate": 1.4246610596582444e-05,
4135
+ "loss": 0.9647,
4136
+ "step": 585
4137
+ },
4138
+ {
4139
+ "epoch": 0.7588216251213985,
4140
+ "grad_norm": 3.951253652572632,
4141
+ "learning_rate": 1.41029988383177e-05,
4142
+ "loss": 0.7419,
4143
+ "step": 586
4144
+ },
4145
+ {
4146
+ "epoch": 0.7601165425704112,
4147
+ "grad_norm": 2.8635647296905518,
4148
+ "learning_rate": 1.3959995646809549e-05,
4149
+ "loss": 0.614,
4150
+ "step": 587
4151
+ },
4152
+ {
4153
+ "epoch": 0.7614114600194237,
4154
+ "grad_norm": 4.437415599822998,
4155
+ "learning_rate": 1.381760344641061e-05,
4156
+ "loss": 0.7816,
4157
+ "step": 588
4158
+ },
4159
+ {
4160
+ "epoch": 0.7627063774684364,
4161
+ "grad_norm": 3.6199607849121094,
4162
+ "learning_rate": 1.3675824651115276e-05,
4163
+ "loss": 0.659,
4164
+ "step": 589
4165
+ },
4166
+ {
4167
+ "epoch": 0.7640012949174491,
4168
+ "grad_norm": 6.909400939941406,
4169
+ "learning_rate": 1.3534661664518817e-05,
4170
+ "loss": 1.5943,
4171
+ "step": 590
4172
+ },
4173
+ {
4174
+ "epoch": 0.7652962123664616,
4175
+ "grad_norm": 6.746818542480469,
4176
+ "learning_rate": 1.339411687977657e-05,
4177
+ "loss": 1.505,
4178
+ "step": 591
4179
+ },
4180
+ {
4181
+ "epoch": 0.7665911298154743,
4182
+ "grad_norm": 4.21103048324585,
4183
+ "learning_rate": 1.325419267956346e-05,
4184
+ "loss": 0.4796,
4185
+ "step": 592
4186
+ },
4187
+ {
4188
+ "epoch": 0.7678860472644868,
4189
+ "grad_norm": 4.808207988739014,
4190
+ "learning_rate": 1.3114891436033522e-05,
4191
+ "loss": 0.7325,
4192
+ "step": 593
4193
+ },
4194
+ {
4195
+ "epoch": 0.7691809647134995,
4196
+ "grad_norm": 5.3486833572387695,
4197
+ "learning_rate": 1.2976215510779755e-05,
4198
+ "loss": 0.4682,
4199
+ "step": 594
4200
+ },
4201
+ {
4202
+ "epoch": 0.7704758821625122,
4203
+ "grad_norm": 7.150143623352051,
4204
+ "learning_rate": 1.2838167254794004e-05,
4205
+ "loss": 0.8242,
4206
+ "step": 595
4207
+ },
4208
+ {
4209
+ "epoch": 0.7717707996115247,
4210
+ "grad_norm": 7.019624710083008,
4211
+ "learning_rate": 1.2700749008427205e-05,
4212
+ "loss": 1.0202,
4213
+ "step": 596
4214
+ },
4215
+ {
4216
+ "epoch": 0.7730657170605374,
4217
+ "grad_norm": 5.221418857574463,
4218
+ "learning_rate": 1.2563963101349619e-05,
4219
+ "loss": 1.5878,
4220
+ "step": 597
4221
+ },
4222
+ {
4223
+ "epoch": 0.7743606345095501,
4224
+ "grad_norm": 2.91672682762146,
4225
+ "learning_rate": 1.2427811852511395e-05,
4226
+ "loss": 0.4305,
4227
+ "step": 598
4228
+ },
4229
+ {
4230
+ "epoch": 0.7756555519585626,
4231
+ "grad_norm": 9.53405475616455,
4232
+ "learning_rate": 1.2292297570103229e-05,
4233
+ "loss": 1.2377,
4234
+ "step": 599
4235
+ },
4236
+ {
4237
+ "epoch": 0.7769504694075753,
4238
+ "grad_norm": 4.454233646392822,
4239
+ "learning_rate": 1.2157422551517228e-05,
4240
+ "loss": 0.7973,
4241
+ "step": 600
4242
+ },
4243
+ {
4244
+ "epoch": 0.7782453868565878,
4245
+ "grad_norm": 6.495676040649414,
4246
+ "learning_rate": 1.202318908330795e-05,
4247
+ "loss": 1.1238,
4248
+ "step": 601
4249
+ },
4250
+ {
4251
+ "epoch": 0.7795403043056005,
4252
+ "grad_norm": 5.350216388702393,
4253
+ "learning_rate": 1.188959944115372e-05,
4254
+ "loss": 0.8506,
4255
+ "step": 602
4256
+ },
4257
+ {
4258
+ "epoch": 0.7808352217546132,
4259
+ "grad_norm": 3.3894073963165283,
4260
+ "learning_rate": 1.1756655889817953e-05,
4261
+ "loss": 0.9475,
4262
+ "step": 603
4263
+ },
4264
+ {
4265
+ "epoch": 0.7821301392036257,
4266
+ "grad_norm": 4.518886089324951,
4267
+ "learning_rate": 1.1624360683110819e-05,
4268
+ "loss": 1.1395,
4269
+ "step": 604
4270
+ },
4271
+ {
4272
+ "epoch": 0.7834250566526384,
4273
+ "grad_norm": 10.47011947631836,
4274
+ "learning_rate": 1.1492716063850973e-05,
4275
+ "loss": 1.136,
4276
+ "step": 605
4277
+ },
4278
+ {
4279
+ "epoch": 0.7847199741016511,
4280
+ "grad_norm": 5.102293491363525,
4281
+ "learning_rate": 1.1361724263827633e-05,
4282
+ "loss": 0.9341,
4283
+ "step": 606
4284
+ },
4285
+ {
4286
+ "epoch": 0.7860148915506636,
4287
+ "grad_norm": 4.892009258270264,
4288
+ "learning_rate": 1.123138750376262e-05,
4289
+ "loss": 1.017,
4290
+ "step": 607
4291
+ },
4292
+ {
4293
+ "epoch": 0.7873098089996763,
4294
+ "grad_norm": 5.155287265777588,
4295
+ "learning_rate": 1.1101707993272825e-05,
4296
+ "loss": 1.0451,
4297
+ "step": 608
4298
+ },
4299
+ {
4300
+ "epoch": 0.7886047264486888,
4301
+ "grad_norm": 8.183491706848145,
4302
+ "learning_rate": 1.097268793083266e-05,
4303
+ "loss": 1.3968,
4304
+ "step": 609
4305
+ },
4306
+ {
4307
+ "epoch": 0.7898996438977015,
4308
+ "grad_norm": 3.7006969451904297,
4309
+ "learning_rate": 1.084432950373685e-05,
4310
+ "loss": 0.9212,
4311
+ "step": 610
4312
+ },
4313
+ {
4314
+ "epoch": 0.7911945613467142,
4315
+ "grad_norm": 4.199628829956055,
4316
+ "learning_rate": 1.071663488806331e-05,
4317
+ "loss": 0.9636,
4318
+ "step": 611
4319
+ },
4320
+ {
4321
+ "epoch": 0.7924894787957267,
4322
+ "grad_norm": 3.8933045864105225,
4323
+ "learning_rate": 1.0589606248636292e-05,
4324
+ "loss": 0.4748,
4325
+ "step": 612
4326
+ },
4327
+ {
4328
+ "epoch": 0.7937843962447394,
4329
+ "grad_norm": 3.9218475818634033,
4330
+ "learning_rate": 1.0463245738989636e-05,
4331
+ "loss": 0.7236,
4332
+ "step": 613
4333
+ },
4334
+ {
4335
+ "epoch": 0.7950793136937521,
4336
+ "grad_norm": 5.374284744262695,
4337
+ "learning_rate": 1.0337555501330281e-05,
4338
+ "loss": 1.0386,
4339
+ "step": 614
4340
+ },
4341
+ {
4342
+ "epoch": 0.7963742311427646,
4343
+ "grad_norm": 5.497282028198242,
4344
+ "learning_rate": 1.0212537666501976e-05,
4345
+ "loss": 0.9231,
4346
+ "step": 615
4347
+ },
4348
+ {
4349
+ "epoch": 0.7976691485917773,
4350
+ "grad_norm": 7.081708908081055,
4351
+ "learning_rate": 1.0088194353949137e-05,
4352
+ "loss": 0.93,
4353
+ "step": 616
4354
+ },
4355
+ {
4356
+ "epoch": 0.7989640660407898,
4357
+ "grad_norm": 4.324736595153809,
4358
+ "learning_rate": 9.96452767168089e-06,
4359
+ "loss": 1.1612,
4360
+ "step": 617
4361
+ },
4362
+ {
4363
+ "epoch": 0.8002589834898025,
4364
+ "grad_norm": 5.2725605964660645,
4365
+ "learning_rate": 9.841539716235387e-06,
4366
+ "loss": 0.7365,
4367
+ "step": 618
4368
+ },
4369
+ {
4370
+ "epoch": 0.8015539009388152,
4371
+ "grad_norm": 6.695886135101318,
4372
+ "learning_rate": 9.719232572644187e-06,
4373
+ "loss": 2.2744,
4374
+ "step": 619
4375
+ },
4376
+ {
4377
+ "epoch": 0.8028488183878277,
4378
+ "grad_norm": 2.6052026748657227,
4379
+ "learning_rate": 9.597608314396978e-06,
4380
+ "loss": 0.4759,
4381
+ "step": 620
4382
+ },
4383
+ {
4384
+ "epoch": 0.8028488183878277,
4385
+ "eval_loss": 1.2390819787979126,
4386
+ "eval_runtime": 2.0056,
4387
+ "eval_samples_per_second": 162.545,
4388
+ "eval_steps_per_second": 81.272,
4389
+ "step": 620
4390
+ },
4391
+ {
4392
+ "epoch": 0.8041437358368404,
4393
+ "grad_norm": 4.534181594848633,
4394
+ "learning_rate": 9.476669003406403e-06,
4395
+ "loss": 1.3455,
4396
+ "step": 621
4397
+ },
4398
+ {
4399
+ "epoch": 0.8054386532858531,
4400
+ "grad_norm": 6.6253252029418945,
4401
+ "learning_rate": 9.356416689973108e-06,
4402
+ "loss": 1.6427,
4403
+ "step": 622
4404
+ },
4405
+ {
4406
+ "epoch": 0.8067335707348656,
4407
+ "grad_norm": 4.596927642822266,
4408
+ "learning_rate": 9.236853412750935e-06,
4409
+ "loss": 0.9381,
4410
+ "step": 623
4411
+ },
4412
+ {
4413
+ "epoch": 0.8080284881838783,
4414
+ "grad_norm": 12.092345237731934,
4415
+ "learning_rate": 9.11798119871245e-06,
4416
+ "loss": 3.2316,
4417
+ "step": 624
4418
+ },
4419
+ {
4420
+ "epoch": 0.809323405632891,
4421
+ "grad_norm": 3.8456172943115234,
4422
+ "learning_rate": 8.99980206311452e-06,
4423
+ "loss": 0.7777,
4424
+ "step": 625
4425
+ },
4426
+ {
4427
+ "epoch": 0.8106183230819035,
4428
+ "grad_norm": 4.333143711090088,
4429
+ "learning_rate": 8.882318009464125e-06,
4430
+ "loss": 0.8885,
4431
+ "step": 626
4432
+ },
4433
+ {
4434
+ "epoch": 0.8119132405309162,
4435
+ "grad_norm": 6.766184329986572,
4436
+ "learning_rate": 8.765531029484476e-06,
4437
+ "loss": 0.5351,
4438
+ "step": 627
4439
+ },
4440
+ {
4441
+ "epoch": 0.8132081579799287,
4442
+ "grad_norm": 5.541638374328613,
4443
+ "learning_rate": 8.64944310308114e-06,
4444
+ "loss": 1.9105,
4445
+ "step": 628
4446
+ },
4447
+ {
4448
+ "epoch": 0.8145030754289414,
4449
+ "grad_norm": 9.491156578063965,
4450
+ "learning_rate": 8.534056198308582e-06,
4451
+ "loss": 2.0113,
4452
+ "step": 629
4453
+ },
4454
+ {
4455
+ "epoch": 0.8157979928779541,
4456
+ "grad_norm": 4.046790599822998,
4457
+ "learning_rate": 8.419372271336745e-06,
4458
+ "loss": 0.7802,
4459
+ "step": 630
4460
+ },
4461
+ {
4462
+ "epoch": 0.8170929103269666,
4463
+ "grad_norm": 4.86843204498291,
4464
+ "learning_rate": 8.305393266417887e-06,
4465
+ "loss": 1.4356,
4466
+ "step": 631
4467
+ },
4468
+ {
4469
+ "epoch": 0.8183878277759793,
4470
+ "grad_norm": 5.277727127075195,
4471
+ "learning_rate": 8.192121115853602e-06,
4472
+ "loss": 0.5158,
4473
+ "step": 632
4474
+ },
4475
+ {
4476
+ "epoch": 0.819682745224992,
4477
+ "grad_norm": 4.455538272857666,
4478
+ "learning_rate": 8.079557739962128e-06,
4479
+ "loss": 0.6981,
4480
+ "step": 633
4481
+ },
4482
+ {
4483
+ "epoch": 0.8209776626740045,
4484
+ "grad_norm": 5.631432056427002,
4485
+ "learning_rate": 7.967705047045715e-06,
4486
+ "loss": 0.7477,
4487
+ "step": 634
4488
+ },
4489
+ {
4490
+ "epoch": 0.8222725801230172,
4491
+ "grad_norm": 5.253812789916992,
4492
+ "learning_rate": 7.856564933358324e-06,
4493
+ "loss": 0.9068,
4494
+ "step": 635
4495
+ },
4496
+ {
4497
+ "epoch": 0.8235674975720297,
4498
+ "grad_norm": 10.097738265991211,
4499
+ "learning_rate": 7.746139283073473e-06,
4500
+ "loss": 1.5746,
4501
+ "step": 636
4502
+ },
4503
+ {
4504
+ "epoch": 0.8248624150210424,
4505
+ "grad_norm": 4.023595333099365,
4506
+ "learning_rate": 7.636429968252257e-06,
4507
+ "loss": 0.9911,
4508
+ "step": 637
4509
+ },
4510
+ {
4511
+ "epoch": 0.8261573324700551,
4512
+ "grad_norm": 6.050657749176025,
4513
+ "learning_rate": 7.527438848811652e-06,
4514
+ "loss": 1.0283,
4515
+ "step": 638
4516
+ },
4517
+ {
4518
+ "epoch": 0.8274522499190676,
4519
+ "grad_norm": 6.443034648895264,
4520
+ "learning_rate": 7.4191677724929906e-06,
4521
+ "loss": 2.4053,
4522
+ "step": 639
4523
+ },
4524
+ {
4525
+ "epoch": 0.8287471673680803,
4526
+ "grad_norm": 9.725380897521973,
4527
+ "learning_rate": 7.31161857483057e-06,
4528
+ "loss": 1.0468,
4529
+ "step": 640
4530
+ },
4531
+ {
4532
+ "epoch": 0.830042084817093,
4533
+ "grad_norm": 5.43861722946167,
4534
+ "learning_rate": 7.204793079120636e-06,
4535
+ "loss": 1.044,
4536
+ "step": 641
4537
+ },
4538
+ {
4539
+ "epoch": 0.8313370022661055,
4540
+ "grad_norm": 4.303345680236816,
4541
+ "learning_rate": 7.0986930963903575e-06,
4542
+ "loss": 0.9896,
4543
+ "step": 642
4544
+ },
4545
+ {
4546
+ "epoch": 0.8326319197151182,
4547
+ "grad_norm": 5.471158981323242,
4548
+ "learning_rate": 6.993320425367222e-06,
4549
+ "loss": 1.501,
4550
+ "step": 643
4551
+ },
4552
+ {
4553
+ "epoch": 0.8339268371641307,
4554
+ "grad_norm": 4.747450828552246,
4555
+ "learning_rate": 6.8886768524485e-06,
4556
+ "loss": 0.6522,
4557
+ "step": 644
4558
+ },
4559
+ {
4560
+ "epoch": 0.8352217546131434,
4561
+ "grad_norm": 4.842288494110107,
4562
+ "learning_rate": 6.7847641516709635e-06,
4563
+ "loss": 0.819,
4564
+ "step": 645
4565
+ },
4566
+ {
4567
+ "epoch": 0.8365166720621561,
4568
+ "grad_norm": 8.534642219543457,
4569
+ "learning_rate": 6.681584084680787e-06,
4570
+ "loss": 0.7393,
4571
+ "step": 646
4572
+ },
4573
+ {
4574
+ "epoch": 0.8378115895111686,
4575
+ "grad_norm": 6.525353908538818,
4576
+ "learning_rate": 6.579138400703716e-06,
4577
+ "loss": 1.0387,
4578
+ "step": 647
4579
+ },
4580
+ {
4581
+ "epoch": 0.8391065069601813,
4582
+ "grad_norm": 6.289246082305908,
4583
+ "learning_rate": 6.4774288365154035e-06,
4584
+ "loss": 1.2105,
4585
+ "step": 648
4586
+ },
4587
+ {
4588
+ "epoch": 0.840401424409194,
4589
+ "grad_norm": 3.268260955810547,
4590
+ "learning_rate": 6.376457116411971e-06,
4591
+ "loss": 1.1416,
4592
+ "step": 649
4593
+ },
4594
+ {
4595
+ "epoch": 0.8416963418582065,
4596
+ "grad_norm": 4.153942108154297,
4597
+ "learning_rate": 6.2762249521807645e-06,
4598
+ "loss": 1.3022,
4599
+ "step": 650
4600
+ },
4601
+ {
4602
+ "epoch": 0.8429912593072192,
4603
+ "grad_norm": 4.653594493865967,
4604
+ "learning_rate": 6.17673404307132e-06,
4605
+ "loss": 1.1183,
4606
+ "step": 651
4607
+ },
4608
+ {
4609
+ "epoch": 0.8442861767562317,
4610
+ "grad_norm": 5.605881690979004,
4611
+ "learning_rate": 6.077986075766612e-06,
4612
+ "loss": 0.7487,
4613
+ "step": 652
4614
+ },
4615
+ {
4616
+ "epoch": 0.8455810942052444,
4617
+ "grad_norm": 4.79271936416626,
4618
+ "learning_rate": 5.979982724354366e-06,
4619
+ "loss": 1.7176,
4620
+ "step": 653
4621
+ },
4622
+ {
4623
+ "epoch": 0.8468760116542571,
4624
+ "grad_norm": 6.354664325714111,
4625
+ "learning_rate": 5.882725650298787e-06,
4626
+ "loss": 1.5522,
4627
+ "step": 654
4628
+ },
4629
+ {
4630
+ "epoch": 0.8481709291032696,
4631
+ "grad_norm": 4.5003862380981445,
4632
+ "learning_rate": 5.7862165024123175e-06,
4633
+ "loss": 1.0054,
4634
+ "step": 655
4635
+ },
4636
+ {
4637
+ "epoch": 0.8494658465522823,
4638
+ "grad_norm": 4.5881781578063965,
4639
+ "learning_rate": 5.690456916827691e-06,
4640
+ "loss": 0.8891,
4641
+ "step": 656
4642
+ },
4643
+ {
4644
+ "epoch": 0.850760764001295,
4645
+ "grad_norm": 4.843465328216553,
4646
+ "learning_rate": 5.5954485169702306e-06,
4647
+ "loss": 0.8689,
4648
+ "step": 657
4649
+ },
4650
+ {
4651
+ "epoch": 0.8520556814503075,
4652
+ "grad_norm": 4.317074775695801,
4653
+ "learning_rate": 5.501192913530301e-06,
4654
+ "loss": 0.963,
4655
+ "step": 658
4656
+ },
4657
+ {
4658
+ "epoch": 0.8533505988993202,
4659
+ "grad_norm": 14.343049049377441,
4660
+ "learning_rate": 5.407691704435991e-06,
4661
+ "loss": 1.5631,
4662
+ "step": 659
4663
+ },
4664
+ {
4665
+ "epoch": 0.8546455163483327,
4666
+ "grad_norm": 5.277613162994385,
4667
+ "learning_rate": 5.314946474826066e-06,
4668
+ "loss": 1.0714,
4669
+ "step": 660
4670
+ },
4671
+ {
4672
+ "epoch": 0.8559404337973454,
4673
+ "grad_norm": 5.00631046295166,
4674
+ "learning_rate": 5.222958797023036e-06,
4675
+ "loss": 0.4646,
4676
+ "step": 661
4677
+ },
4678
+ {
4679
+ "epoch": 0.8572353512463581,
4680
+ "grad_norm": 3.9715874195098877,
4681
+ "learning_rate": 5.13173023050656e-06,
4682
+ "loss": 0.5706,
4683
+ "step": 662
4684
+ },
4685
+ {
4686
+ "epoch": 0.8585302686953706,
4687
+ "grad_norm": 7.35955286026001,
4688
+ "learning_rate": 5.041262321886958e-06,
4689
+ "loss": 2.2692,
4690
+ "step": 663
4691
+ },
4692
+ {
4693
+ "epoch": 0.8598251861443833,
4694
+ "grad_norm": 3.029388666152954,
4695
+ "learning_rate": 4.951556604879048e-06,
4696
+ "loss": 0.6182,
4697
+ "step": 664
4698
+ },
4699
+ {
4700
+ "epoch": 0.861120103593396,
4701
+ "grad_norm": 5.505150318145752,
4702
+ "learning_rate": 4.862614600276061e-06,
4703
+ "loss": 0.4528,
4704
+ "step": 665
4705
+ },
4706
+ {
4707
+ "epoch": 0.8624150210424085,
4708
+ "grad_norm": 4.694687366485596,
4709
+ "learning_rate": 4.774437815923938e-06,
4710
+ "loss": 1.1174,
4711
+ "step": 666
4712
+ },
4713
+ {
4714
+ "epoch": 0.8637099384914212,
4715
+ "grad_norm": 5.113190650939941,
4716
+ "learning_rate": 4.687027746695727e-06,
4717
+ "loss": 1.0881,
4718
+ "step": 667
4719
+ },
4720
+ {
4721
+ "epoch": 0.8650048559404337,
4722
+ "grad_norm": 5.682245254516602,
4723
+ "learning_rate": 4.600385874466256e-06,
4724
+ "loss": 0.9992,
4725
+ "step": 668
4726
+ },
4727
+ {
4728
+ "epoch": 0.8662997733894464,
4729
+ "grad_norm": 5.2173285484313965,
4730
+ "learning_rate": 4.514513668087011e-06,
4731
+ "loss": 0.8637,
4732
+ "step": 669
4733
+ },
4734
+ {
4735
+ "epoch": 0.8675946908384591,
4736
+ "grad_norm": 6.744502067565918,
4737
+ "learning_rate": 4.429412583361209e-06,
4738
+ "loss": 0.8903,
4739
+ "step": 670
4740
+ },
4741
+ {
4742
+ "epoch": 0.8688896082874716,
4743
+ "grad_norm": 4.4934916496276855,
4744
+ "learning_rate": 4.34508406301915e-06,
4745
+ "loss": 0.8,
4746
+ "step": 671
4747
+ },
4748
+ {
4749
+ "epoch": 0.8701845257364843,
4750
+ "grad_norm": 5.831895351409912,
4751
+ "learning_rate": 4.261529536693737e-06,
4752
+ "loss": 1.1527,
4753
+ "step": 672
4754
+ },
4755
+ {
4756
+ "epoch": 0.871479443185497,
4757
+ "grad_norm": 6.142891883850098,
4758
+ "learning_rate": 4.178750420896255e-06,
4759
+ "loss": 0.6969,
4760
+ "step": 673
4761
+ },
4762
+ {
4763
+ "epoch": 0.8727743606345095,
4764
+ "grad_norm": 7.165826797485352,
4765
+ "learning_rate": 4.0967481189923384e-06,
4766
+ "loss": 1.0799,
4767
+ "step": 674
4768
+ },
4769
+ {
4770
+ "epoch": 0.8740692780835222,
4771
+ "grad_norm": 7.070796012878418,
4772
+ "learning_rate": 4.015524021178196e-06,
4773
+ "loss": 1.2246,
4774
+ "step": 675
4775
+ },
4776
+ {
4777
+ "epoch": 0.8753641955325348,
4778
+ "grad_norm": 5.325906276702881,
4779
+ "learning_rate": 3.935079504457034e-06,
4780
+ "loss": 0.6554,
4781
+ "step": 676
4782
+ },
4783
+ {
4784
+ "epoch": 0.8766591129815474,
4785
+ "grad_norm": 11.728653907775879,
4786
+ "learning_rate": 3.8554159326157304e-06,
4787
+ "loss": 2.4503,
4788
+ "step": 677
4789
+ },
4790
+ {
4791
+ "epoch": 0.8779540304305601,
4792
+ "grad_norm": 4.562655925750732,
4793
+ "learning_rate": 3.7765346562016744e-06,
4794
+ "loss": 1.2593,
4795
+ "step": 678
4796
+ },
4797
+ {
4798
+ "epoch": 0.8792489478795726,
4799
+ "grad_norm": 4.027012348175049,
4800
+ "learning_rate": 3.6984370124999058e-06,
4801
+ "loss": 0.9379,
4802
+ "step": 679
4803
+ },
4804
+ {
4805
+ "epoch": 0.8805438653285853,
4806
+ "grad_norm": 3.9469587802886963,
4807
+ "learning_rate": 3.621124325510422e-06,
4808
+ "loss": 1.204,
4809
+ "step": 680
4810
+ },
4811
+ {
4812
+ "epoch": 0.881838782777598,
4813
+ "grad_norm": 5.622085094451904,
4814
+ "learning_rate": 3.5445979059257505e-06,
4815
+ "loss": 1.4393,
4816
+ "step": 681
4817
+ },
4818
+ {
4819
+ "epoch": 0.8831337002266105,
4820
+ "grad_norm": 6.133405685424805,
4821
+ "learning_rate": 3.4688590511087304e-06,
4822
+ "loss": 1.0091,
4823
+ "step": 682
4824
+ },
4825
+ {
4826
+ "epoch": 0.8844286176756232,
4827
+ "grad_norm": 6.239793300628662,
4828
+ "learning_rate": 3.3939090450704925e-06,
4829
+ "loss": 1.4701,
4830
+ "step": 683
4831
+ },
4832
+ {
4833
+ "epoch": 0.8857235351246358,
4834
+ "grad_norm": 11.061779975891113,
4835
+ "learning_rate": 3.3197491584487093e-06,
4836
+ "loss": 1.2142,
4837
+ "step": 684
4838
+ },
4839
+ {
4840
+ "epoch": 0.8870184525736484,
4841
+ "grad_norm": 7.947810173034668,
4842
+ "learning_rate": 3.246380648486058e-06,
4843
+ "loss": 2.459,
4844
+ "step": 685
4845
+ },
4846
+ {
4847
+ "epoch": 0.8883133700226611,
4848
+ "grad_norm": 3.787471294403076,
4849
+ "learning_rate": 3.1738047590088803e-06,
4850
+ "loss": 0.5336,
4851
+ "step": 686
4852
+ },
4853
+ {
4854
+ "epoch": 0.8896082874716736,
4855
+ "grad_norm": 6.404763698577881,
4856
+ "learning_rate": 3.10202272040615e-06,
4857
+ "loss": 1.0348,
4858
+ "step": 687
4859
+ },
4860
+ {
4861
+ "epoch": 0.8909032049206863,
4862
+ "grad_norm": 3.7612321376800537,
4863
+ "learning_rate": 3.0310357496085405e-06,
4864
+ "loss": 0.5957,
4865
+ "step": 688
4866
+ },
4867
+ {
4868
+ "epoch": 0.892198122369699,
4869
+ "grad_norm": 4.630037784576416,
4870
+ "learning_rate": 2.9608450500678565e-06,
4871
+ "loss": 1.1506,
4872
+ "step": 689
4873
+ },
4874
+ {
4875
+ "epoch": 0.8934930398187115,
4876
+ "grad_norm": 4.205752372741699,
4877
+ "learning_rate": 2.8914518117366006e-06,
4878
+ "loss": 0.9329,
4879
+ "step": 690
4880
+ },
4881
+ {
4882
+ "epoch": 0.8947879572677242,
4883
+ "grad_norm": 3.5641722679138184,
4884
+ "learning_rate": 2.8228572110478133e-06,
4885
+ "loss": 0.7643,
4886
+ "step": 691
4887
+ },
4888
+ {
4889
+ "epoch": 0.8960828747167368,
4890
+ "grad_norm": 4.8683342933654785,
4891
+ "learning_rate": 2.755062410895104e-06,
4892
+ "loss": 1.3503,
4893
+ "step": 692
4894
+ },
4895
+ {
4896
+ "epoch": 0.8973777921657494,
4897
+ "grad_norm": 4.980081558227539,
4898
+ "learning_rate": 2.6880685606129664e-06,
4899
+ "loss": 0.9362,
4900
+ "step": 693
4901
+ },
4902
+ {
4903
+ "epoch": 0.8986727096147621,
4904
+ "grad_norm": 3.7096731662750244,
4905
+ "learning_rate": 2.62187679595729e-06,
4906
+ "loss": 0.735,
4907
+ "step": 694
4908
+ },
4909
+ {
4910
+ "epoch": 0.8999676270637746,
4911
+ "grad_norm": 6.497191905975342,
4912
+ "learning_rate": 2.55648823908608e-06,
4913
+ "loss": 1.1868,
4914
+ "step": 695
4915
+ },
4916
+ {
4917
+ "epoch": 0.9012625445127873,
4918
+ "grad_norm": 9.368871688842773,
4919
+ "learning_rate": 2.4919039985404626e-06,
4920
+ "loss": 0.6176,
4921
+ "step": 696
4922
+ },
4923
+ {
4924
+ "epoch": 0.9025574619618,
4925
+ "grad_norm": 2.9543919563293457,
4926
+ "learning_rate": 2.428125169225881e-06,
4927
+ "loss": 0.5762,
4928
+ "step": 697
4929
+ },
4930
+ {
4931
+ "epoch": 0.9038523794108125,
4932
+ "grad_norm": 5.527708530426025,
4933
+ "learning_rate": 2.36515283239353e-06,
4934
+ "loss": 0.8727,
4935
+ "step": 698
4936
+ },
4937
+ {
4938
+ "epoch": 0.9051472968598252,
4939
+ "grad_norm": 7.263698577880859,
4940
+ "learning_rate": 2.3029880556220074e-06,
4941
+ "loss": 0.5661,
4942
+ "step": 699
4943
+ },
4944
+ {
4945
+ "epoch": 0.9064422143088378,
4946
+ "grad_norm": 4.572330951690674,
4947
+ "learning_rate": 2.241631892799262e-06,
4948
+ "loss": 0.6105,
4949
+ "step": 700
4950
+ },
4951
+ {
4952
+ "epoch": 0.9077371317578504,
4953
+ "grad_norm": 7.1983795166015625,
4954
+ "learning_rate": 2.181085384104703e-06,
4955
+ "loss": 1.0509,
4956
+ "step": 701
4957
+ },
4958
+ {
4959
+ "epoch": 0.9090320492068631,
4960
+ "grad_norm": 4.944607734680176,
4961
+ "learning_rate": 2.121349555991525e-06,
4962
+ "loss": 1.3179,
4963
+ "step": 702
4964
+ },
4965
+ {
4966
+ "epoch": 0.9103269666558756,
4967
+ "grad_norm": 3.876669406890869,
4968
+ "learning_rate": 2.0624254211693894e-06,
4969
+ "loss": 0.5404,
4970
+ "step": 703
4971
+ },
4972
+ {
4973
+ "epoch": 0.9116218841048883,
4974
+ "grad_norm": 3.4817299842834473,
4975
+ "learning_rate": 2.004313978587186e-06,
4976
+ "loss": 0.881,
4977
+ "step": 704
4978
+ },
4979
+ {
4980
+ "epoch": 0.912916801553901,
4981
+ "grad_norm": 5.139196395874023,
4982
+ "learning_rate": 1.9470162134161143e-06,
4983
+ "loss": 0.7959,
4984
+ "step": 705
4985
+ },
4986
+ {
4987
+ "epoch": 0.9142117190029135,
4988
+ "grad_norm": 6.930759429931641,
4989
+ "learning_rate": 1.8905330970330259e-06,
4990
+ "loss": 1.6162,
4991
+ "step": 706
4992
+ },
4993
+ {
4994
+ "epoch": 0.9155066364519262,
4995
+ "grad_norm": 4.119734764099121,
4996
+ "learning_rate": 1.83486558700387e-06,
4997
+ "loss": 0.984,
4998
+ "step": 707
4999
+ },
5000
+ {
5001
+ "epoch": 0.9168015539009389,
5002
+ "grad_norm": 4.680385112762451,
5003
+ "learning_rate": 1.78001462706755e-06,
5004
+ "loss": 0.8123,
5005
+ "step": 708
5006
+ },
5007
+ {
5008
+ "epoch": 0.9180964713499514,
5009
+ "grad_norm": 6.069380283355713,
5010
+ "learning_rate": 1.7259811471198706e-06,
5011
+ "loss": 0.9954,
5012
+ "step": 709
5013
+ },
5014
+ {
5015
+ "epoch": 0.9193913887989641,
5016
+ "grad_norm": 3.990244150161743,
5017
+ "learning_rate": 1.6727660631977893e-06,
5018
+ "loss": 0.6567,
5019
+ "step": 710
5020
+ },
5021
+ {
5022
+ "epoch": 0.9206863062479766,
5023
+ "grad_norm": 4.472250461578369,
5024
+ "learning_rate": 1.620370277463884e-06,
5025
+ "loss": 1.8166,
5026
+ "step": 711
5027
+ },
5028
+ {
5029
+ "epoch": 0.9219812236969893,
5030
+ "grad_norm": 4.742919921875,
5031
+ "learning_rate": 1.5687946781910378e-06,
5032
+ "loss": 0.8593,
5033
+ "step": 712
5034
+ },
5035
+ {
5036
+ "epoch": 0.923276141146002,
5037
+ "grad_norm": 4.736976146697998,
5038
+ "learning_rate": 1.5180401397474343e-06,
5039
+ "loss": 1.4691,
5040
+ "step": 713
5041
+ },
5042
+ {
5043
+ "epoch": 0.9245710585950145,
5044
+ "grad_norm": 4.94817590713501,
5045
+ "learning_rate": 1.4681075225816854e-06,
5046
+ "loss": 1.0589,
5047
+ "step": 714
5048
+ },
5049
+ {
5050
+ "epoch": 0.9258659760440272,
5051
+ "grad_norm": 7.17746639251709,
5052
+ "learning_rate": 1.4189976732082666e-06,
5053
+ "loss": 2.4874,
5054
+ "step": 715
5055
+ },
5056
+ {
5057
+ "epoch": 0.9271608934930399,
5058
+ "grad_norm": 4.172902584075928,
5059
+ "learning_rate": 1.3707114241931328e-06,
5060
+ "loss": 0.9651,
5061
+ "step": 716
5062
+ },
5063
+ {
5064
+ "epoch": 0.9284558109420524,
5065
+ "grad_norm": 4.561225414276123,
5066
+ "learning_rate": 1.3232495941396639e-06,
5067
+ "loss": 0.8338,
5068
+ "step": 717
5069
+ },
5070
+ {
5071
+ "epoch": 0.9297507283910651,
5072
+ "grad_norm": 4.700865745544434,
5073
+ "learning_rate": 1.2766129876747413e-06,
5074
+ "loss": 0.6137,
5075
+ "step": 718
5076
+ },
5077
+ {
5078
+ "epoch": 0.9310456458400777,
5079
+ "grad_norm": 4.913844108581543,
5080
+ "learning_rate": 1.2308023954351043e-06,
5081
+ "loss": 1.0453,
5082
+ "step": 719
5083
+ },
5084
+ {
5085
+ "epoch": 0.9323405632890903,
5086
+ "grad_norm": 3.081576108932495,
5087
+ "learning_rate": 1.1858185940539779e-06,
5088
+ "loss": 0.738,
5089
+ "step": 720
5090
+ },
5091
+ {
5092
+ "epoch": 0.933635480738103,
5093
+ "grad_norm": 6.5089006423950195,
5094
+ "learning_rate": 1.1416623461478704e-06,
5095
+ "loss": 1.3315,
5096
+ "step": 721
5097
+ },
5098
+ {
5099
+ "epoch": 0.9349303981871155,
5100
+ "grad_norm": 4.095083236694336,
5101
+ "learning_rate": 1.0983344003036912e-06,
5102
+ "loss": 0.7988,
5103
+ "step": 722
5104
+ },
5105
+ {
5106
+ "epoch": 0.9362253156361282,
5107
+ "grad_norm": 4.421083927154541,
5108
+ "learning_rate": 1.055835491066004e-06,
5109
+ "loss": 0.6599,
5110
+ "step": 723
5111
+ },
5112
+ {
5113
+ "epoch": 0.9375202330851409,
5114
+ "grad_norm": 6.9717698097229,
5115
+ "learning_rate": 1.014166338924627e-06,
5116
+ "loss": 1.9028,
5117
+ "step": 724
5118
+ },
5119
+ {
5120
+ "epoch": 0.9388151505341534,
5121
+ "grad_norm": 5.759119033813477,
5122
+ "learning_rate": 9.733276503023692e-07,
5123
+ "loss": 1.0143,
5124
+ "step": 725
5125
+ },
5126
+ {
5127
+ "epoch": 0.9401100679831661,
5128
+ "grad_norm": 7.723138809204102,
5129
+ "learning_rate": 9.33320117543085e-07,
5130
+ "loss": 1.5553,
5131
+ "step": 726
5132
+ },
5133
+ {
5134
+ "epoch": 0.9414049854321787,
5135
+ "grad_norm": 5.182671070098877,
5136
+ "learning_rate": 8.941444188999393e-07,
5137
+ "loss": 1.2996,
5138
+ "step": 727
5139
+ },
5140
+ {
5141
+ "epoch": 0.9426999028811913,
5142
+ "grad_norm": 8.686777114868164,
5143
+ "learning_rate": 8.558012185238939e-07,
5144
+ "loss": 1.1213,
5145
+ "step": 728
5146
+ },
5147
+ {
5148
+ "epoch": 0.943994820330204,
5149
+ "grad_norm": 8.205387115478516,
5150
+ "learning_rate": 8.182911664524562e-07,
5151
+ "loss": 0.7117,
5152
+ "step": 729
5153
+ },
5154
+ {
5155
+ "epoch": 0.9452897377792165,
5156
+ "grad_norm": 5.972929000854492,
5157
+ "learning_rate": 7.816148985986483e-07,
5158
+ "loss": 0.9229,
5159
+ "step": 730
5160
+ },
5161
+ {
5162
+ "epoch": 0.9465846552282292,
5163
+ "grad_norm": 8.060487747192383,
5164
+ "learning_rate": 7.457730367402549e-07,
5165
+ "loss": 1.5945,
5166
+ "step": 731
5167
+ },
5168
+ {
5169
+ "epoch": 0.9478795726772419,
5170
+ "grad_norm": 4.760429382324219,
5171
+ "learning_rate": 7.107661885092321e-07,
5172
+ "loss": 0.9784,
5173
+ "step": 732
5174
+ },
5175
+ {
5176
+ "epoch": 0.9491744901262544,
5177
+ "grad_norm": 4.759274482727051,
5178
+ "learning_rate": 6.765949473814648e-07,
5179
+ "loss": 1.095,
5180
+ "step": 733
5181
+ },
5182
+ {
5183
+ "epoch": 0.9504694075752671,
5184
+ "grad_norm": 4.729125022888184,
5185
+ "learning_rate": 6.432598926666589e-07,
5186
+ "loss": 0.7815,
5187
+ "step": 734
5188
+ },
5189
+ {
5190
+ "epoch": 0.9517643250242797,
5191
+ "grad_norm": 5.148903846740723,
5192
+ "learning_rate": 6.107615894985375e-07,
5193
+ "loss": 1.2755,
5194
+ "step": 735
5195
+ },
5196
+ {
5197
+ "epoch": 0.9530592424732923,
5198
+ "grad_norm": 4.272920608520508,
5199
+ "learning_rate": 5.791005888252765e-07,
5200
+ "loss": 0.6422,
5201
+ "step": 736
5202
+ },
5203
+ {
5204
+ "epoch": 0.954354159922305,
5205
+ "grad_norm": 3.1463730335235596,
5206
+ "learning_rate": 5.482774274001401e-07,
5207
+ "loss": 0.7711,
5208
+ "step": 737
5209
+ },
5210
+ {
5211
+ "epoch": 0.9556490773713175,
5212
+ "grad_norm": 4.3048906326293945,
5213
+ "learning_rate": 5.18292627772382e-07,
5214
+ "loss": 0.6693,
5215
+ "step": 738
5216
+ },
5217
+ {
5218
+ "epoch": 0.9569439948203302,
5219
+ "grad_norm": 6.0783209800720215,
5220
+ "learning_rate": 4.891466982783977e-07,
5221
+ "loss": 0.8769,
5222
+ "step": 739
5223
+ },
5224
+ {
5225
+ "epoch": 0.9582389122693429,
5226
+ "grad_norm": 5.600512981414795,
5227
+ "learning_rate": 4.60840133033108e-07,
5228
+ "loss": 1.1665,
5229
+ "step": 740
5230
+ },
5231
+ {
5232
+ "epoch": 0.9595338297183554,
5233
+ "grad_norm": 9.313074111938477,
5234
+ "learning_rate": 4.3337341192157265e-07,
5235
+ "loss": 1.9236,
5236
+ "step": 741
5237
+ },
5238
+ {
5239
+ "epoch": 0.9608287471673681,
5240
+ "grad_norm": 7.646642208099365,
5241
+ "learning_rate": 4.067470005908625e-07,
5242
+ "loss": 0.8392,
5243
+ "step": 742
5244
+ },
5245
+ {
5246
+ "epoch": 0.9621236646163807,
5247
+ "grad_norm": 9.741737365722656,
5248
+ "learning_rate": 3.809613504421661e-07,
5249
+ "loss": 1.3232,
5250
+ "step": 743
5251
+ },
5252
+ {
5253
+ "epoch": 0.9634185820653933,
5254
+ "grad_norm": 6.090597152709961,
5255
+ "learning_rate": 3.5601689862311826e-07,
5256
+ "loss": 1.1143,
5257
+ "step": 744
5258
+ },
5259
+ {
5260
+ "epoch": 0.964713499514406,
5261
+ "grad_norm": 5.305496692657471,
5262
+ "learning_rate": 3.3191406802041693e-07,
5263
+ "loss": 0.7994,
5264
+ "step": 745
5265
+ },
5266
+ {
5267
+ "epoch": 0.9660084169634185,
5268
+ "grad_norm": 5.51359748840332,
5269
+ "learning_rate": 3.0865326725263435e-07,
5270
+ "loss": 2.0726,
5271
+ "step": 746
5272
+ },
5273
+ {
5274
+ "epoch": 0.9673033344124312,
5275
+ "grad_norm": 2.874615430831909,
5276
+ "learning_rate": 2.8623489066329503e-07,
5277
+ "loss": 0.6604,
5278
+ "step": 747
5279
+ },
5280
+ {
5281
+ "epoch": 0.9685982518614439,
5282
+ "grad_norm": 5.645188808441162,
5283
+ "learning_rate": 2.646593183142088e-07,
5284
+ "loss": 0.9807,
5285
+ "step": 748
5286
+ },
5287
+ {
5288
+ "epoch": 0.9698931693104564,
5289
+ "grad_norm": 3.8236138820648193,
5290
+ "learning_rate": 2.4392691597898143e-07,
5291
+ "loss": 0.923,
5292
+ "step": 749
5293
+ },
5294
+ {
5295
+ "epoch": 0.9711880867594691,
5296
+ "grad_norm": 4.602663993835449,
5297
+ "learning_rate": 2.2403803513686428e-07,
5298
+ "loss": 0.5873,
5299
+ "step": 750
5300
+ },
5301
+ {
5302
+ "epoch": 0.9724830042084817,
5303
+ "grad_norm": 3.112046718597412,
5304
+ "learning_rate": 2.0499301296676432e-07,
5305
+ "loss": 0.6582,
5306
+ "step": 751
5307
+ },
5308
+ {
5309
+ "epoch": 0.9737779216574943,
5310
+ "grad_norm": 5.2788920402526855,
5311
+ "learning_rate": 1.8679217234154334e-07,
5312
+ "loss": 1.0278,
5313
+ "step": 752
5314
+ },
5315
+ {
5316
+ "epoch": 0.975072839106507,
5317
+ "grad_norm": 6.206789493560791,
5318
+ "learning_rate": 1.6943582182253336e-07,
5319
+ "loss": 1.0729,
5320
+ "step": 753
5321
+ },
5322
+ {
5323
+ "epoch": 0.9763677565555196,
5324
+ "grad_norm": 7.300841331481934,
5325
+ "learning_rate": 1.5292425565430757e-07,
5326
+ "loss": 1.3212,
5327
+ "step": 754
5328
+ },
5329
+ {
5330
+ "epoch": 0.9776626740045322,
5331
+ "grad_norm": 4.0115275382995605,
5332
+ "learning_rate": 1.372577537597064e-07,
5333
+ "loss": 0.9395,
5334
+ "step": 755
5335
+ },
5336
+ {
5337
+ "epoch": 0.9789575914535449,
5338
+ "grad_norm": 7.623907089233398,
5339
+ "learning_rate": 1.224365817350692e-07,
5340
+ "loss": 1.3756,
5341
+ "step": 756
5342
+ },
5343
+ {
5344
+ "epoch": 0.9802525089025574,
5345
+ "grad_norm": 20.821199417114258,
5346
+ "learning_rate": 1.0846099084574346e-07,
5347
+ "loss": 3.2897,
5348
+ "step": 757
5349
+ },
5350
+ {
5351
+ "epoch": 0.9815474263515701,
5352
+ "grad_norm": 5.356776714324951,
5353
+ "learning_rate": 9.533121802183797e-08,
5354
+ "loss": 0.9017,
5355
+ "step": 758
5356
+ },
5357
+ {
5358
+ "epoch": 0.9828423438005827,
5359
+ "grad_norm": 6.1148505210876465,
5360
+ "learning_rate": 8.304748585417078e-08,
5361
+ "loss": 1.4205,
5362
+ "step": 759
5363
+ },
5364
+ {
5365
+ "epoch": 0.9841372612495953,
5366
+ "grad_norm": 6.684154033660889,
5367
+ "learning_rate": 7.161000259053308e-08,
5368
+ "loss": 1.3984,
5369
+ "step": 760
5370
+ },
5371
+ {
5372
+ "epoch": 0.985432178698608,
5373
+ "grad_norm": 2.9924120903015137,
5374
+ "learning_rate": 6.10189621321422e-08,
5375
+ "loss": 0.6213,
5376
+ "step": 761
5377
+ },
5378
+ {
5379
+ "epoch": 0.9867270961476206,
5380
+ "grad_norm": 4.251294136047363,
5381
+ "learning_rate": 5.127454403034415e-08,
5382
+ "loss": 1.0332,
5383
+ "step": 762
5384
+ },
5385
+ {
5386
+ "epoch": 0.9880220135966332,
5387
+ "grad_norm": 4.321407794952393,
5388
+ "learning_rate": 4.2376913483599404e-08,
5389
+ "loss": 1.1482,
5390
+ "step": 763
5391
+ },
5392
+ {
5393
+ "epoch": 0.9893169310456459,
5394
+ "grad_norm": 8.435138702392578,
5395
+ "learning_rate": 3.4326221334640695e-08,
5396
+ "loss": 2.5909,
5397
+ "step": 764
5398
+ },
5399
+ {
5400
+ "epoch": 0.9906118484946584,
5401
+ "grad_norm": 4.1094560623168945,
5402
+ "learning_rate": 2.712260406795286e-08,
5403
+ "loss": 0.4297,
5404
+ "step": 765
5405
+ },
5406
+ {
5407
+ "epoch": 0.9919067659436711,
5408
+ "grad_norm": 4.245459079742432,
5409
+ "learning_rate": 2.076618380744133e-08,
5410
+ "loss": 1.6027,
5411
+ "step": 766
5412
+ },
5413
+ {
5414
+ "epoch": 0.9932016833926837,
5415
+ "grad_norm": 4.5725297927856445,
5416
+ "learning_rate": 1.525706831437268e-08,
5417
+ "loss": 1.3605,
5418
+ "step": 767
5419
+ },
5420
+ {
5421
+ "epoch": 0.9944966008416963,
5422
+ "grad_norm": 3.8891658782958984,
5423
+ "learning_rate": 1.0595350985526109e-08,
5424
+ "loss": 0.8142,
5425
+ "step": 768
5426
+ },
5427
+ {
5428
+ "epoch": 0.995791518290709,
5429
+ "grad_norm": 4.054940223693848,
5430
+ "learning_rate": 6.781110851633576e-09,
5431
+ "loss": 1.1612,
5432
+ "step": 769
5433
+ },
5434
+ {
5435
+ "epoch": 0.9970864357397216,
5436
+ "grad_norm": 7.459210395812988,
5437
+ "learning_rate": 3.814412576025328e-09,
5438
+ "loss": 1.4391,
5439
+ "step": 770
5440
+ },
5441
+ {
5442
+ "epoch": 0.9983813531887342,
5443
+ "grad_norm": 4.039707183837891,
5444
+ "learning_rate": 1.6953064535474295e-09,
5445
+ "loss": 0.799,
5446
+ "step": 771
5447
+ },
5448
+ {
5449
+ "epoch": 0.9996762706377469,
5450
+ "grad_norm": 3.708043098449707,
5451
+ "learning_rate": 4.238284096902412e-10,
5452
+ "loss": 0.8702,
5453
+ "step": 772
5454
+ },
5455
+ {
5456
+ "epoch": 1.0009711880867596,
5457
+ "grad_norm": 27.973024368286133,
5458
+ "learning_rate": 0.0,
5459
+ "loss": 3.9517,
5460
+ "step": 773
5461
  }
5462
  ],
5463
  "logging_steps": 1,
 
5472
  "should_evaluate": false,
5473
  "should_log": false,
5474
  "should_save": true,
5475
+ "should_training_stop": true
5476
  },
5477
  "attributes": {}
5478
  }
5479
  },
5480
+ "total_flos": 1084147192037376.0,
5481
  "train_batch_size": 2,
5482
  "trial_name": null,
5483
  "trial_params": null