CocoRoF commited on
Commit
2afe695
·
verified ·
1 Parent(s): a72b5bb

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0c398bc843eea803ee6d700d1537de79fdc32bede5d65c5c9f86d67c40a71de
3
  size 737632172
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbfded01e29c2f16226927197c7b53cb17e6b0e25f4e77f11587c6e4e8cecdca
3
  size 737632172
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09e62b87c8cdb6fbdf1ca09dea3f1e41c6f59a2ccbb87b7b025db5eb51b5fa0d
3
  size 1475354682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c9c2501538e245b7dc88214c50a178f28922e91dc78a7635e1dfef030205c3
3
  size 1475354682
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2fbcd26bac3ea7dc02fc9ede5b8a1914ca51611473722a11a969e1f26ac0ee
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d97b511d2fdb8061e5bf72c139923941c148260fac1caedd654028da6986c1
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3839473129eb8c438ab312370daa55eb10a0790f33d38fc5eaa24859b54b0d1f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5088a0d34c7015afe60457fbb3f0a4740839369017a42ea4b3250322c2d63ceb
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9cac0eb25286b75549fa2030810940adf357064a83facaf5c58ebe37190b6ac
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0a57d29811122d52bd53f81af680412b91dde1cd2a12fa885d8a54388be8e2d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c90ab29b255eaf920ecc1cba0b586e426f8e2db67b44a65576693f84178a04f
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4efbfa3cfb1bb8fb9c3380e65959a8b4eaf3bceb0507a26ffba1a3e4636ddb1
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c9ee2be288a50938aa76c672a598bafcd789d6a5d6e08c069ef8e7d474b5cd2
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b579900d94a8c528190bb9fc0315439f3c057f344b31a3968eaa60ed56b9c9f5
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.3328506998185964,
5
  "eval_steps": 1000,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5313,6 +5313,1780 @@
5313
  "learning_rate": 9.98699802855441e-06,
5314
  "loss": 11.8236,
5315
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5316
  }
5317
  ],
5318
  "logging_steps": 10,
@@ -5332,7 +7106,7 @@
5332
  "attributes": {}
5333
  }
5334
  },
5335
- "total_flos": 2.617292403769344e+18,
5336
  "train_batch_size": 4,
5337
  "trial_name": null,
5338
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.44380093309146185,
5
  "eval_steps": 1000,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5313
  "learning_rate": 9.98699802855441e-06,
5314
  "loss": 11.8236,
5315
  "step": 7500
5316
+ },
5317
+ {
5318
+ "epoch": 0.3332945007516878,
5319
+ "grad_norm": 76.12377166748047,
5320
+ "learning_rate": 9.986980692592483e-06,
5321
+ "loss": 11.2932,
5322
+ "step": 7510
5323
+ },
5324
+ {
5325
+ "epoch": 0.3337383016847793,
5326
+ "grad_norm": 94.77151489257812,
5327
+ "learning_rate": 9.986963356630556e-06,
5328
+ "loss": 12.1823,
5329
+ "step": 7520
5330
+ },
5331
+ {
5332
+ "epoch": 0.33418210261787074,
5333
+ "grad_norm": 100.01810455322266,
5334
+ "learning_rate": 9.986946020668629e-06,
5335
+ "loss": 12.6788,
5336
+ "step": 7530
5337
+ },
5338
+ {
5339
+ "epoch": 0.3346259035509622,
5340
+ "grad_norm": 132.56448364257812,
5341
+ "learning_rate": 9.9869286847067e-06,
5342
+ "loss": 12.149,
5343
+ "step": 7540
5344
+ },
5345
+ {
5346
+ "epoch": 0.33506970448405365,
5347
+ "grad_norm": 113.610107421875,
5348
+ "learning_rate": 9.986911348744773e-06,
5349
+ "loss": 11.8811,
5350
+ "step": 7550
5351
+ },
5352
+ {
5353
+ "epoch": 0.33551350541714514,
5354
+ "grad_norm": 94.93478393554688,
5355
+ "learning_rate": 9.986894012782846e-06,
5356
+ "loss": 11.5749,
5357
+ "step": 7560
5358
+ },
5359
+ {
5360
+ "epoch": 0.33595730635023663,
5361
+ "grad_norm": 92.85311126708984,
5362
+ "learning_rate": 9.986876676820918e-06,
5363
+ "loss": 11.7571,
5364
+ "step": 7570
5365
+ },
5366
+ {
5367
+ "epoch": 0.33640110728332806,
5368
+ "grad_norm": 79.2991943359375,
5369
+ "learning_rate": 9.986859340858991e-06,
5370
+ "loss": 11.3534,
5371
+ "step": 7580
5372
+ },
5373
+ {
5374
+ "epoch": 0.33684490821641955,
5375
+ "grad_norm": 103.73065185546875,
5376
+ "learning_rate": 9.986842004897064e-06,
5377
+ "loss": 11.6249,
5378
+ "step": 7590
5379
+ },
5380
+ {
5381
+ "epoch": 0.337288709149511,
5382
+ "grad_norm": 79.18226623535156,
5383
+ "learning_rate": 9.986824668935135e-06,
5384
+ "loss": 11.634,
5385
+ "step": 7600
5386
+ },
5387
+ {
5388
+ "epoch": 0.33773251008260247,
5389
+ "grad_norm": 100.51226806640625,
5390
+ "learning_rate": 9.986807332973208e-06,
5391
+ "loss": 11.2835,
5392
+ "step": 7610
5393
+ },
5394
+ {
5395
+ "epoch": 0.3381763110156939,
5396
+ "grad_norm": 92.74190521240234,
5397
+ "learning_rate": 9.986789997011281e-06,
5398
+ "loss": 11.3273,
5399
+ "step": 7620
5400
+ },
5401
+ {
5402
+ "epoch": 0.3386201119487854,
5403
+ "grad_norm": 89.3246841430664,
5404
+ "learning_rate": 9.986772661049353e-06,
5405
+ "loss": 12.6279,
5406
+ "step": 7630
5407
+ },
5408
+ {
5409
+ "epoch": 0.3390639128818768,
5410
+ "grad_norm": 117.41140747070312,
5411
+ "learning_rate": 9.986755325087426e-06,
5412
+ "loss": 11.9215,
5413
+ "step": 7640
5414
+ },
5415
+ {
5416
+ "epoch": 0.3395077138149683,
5417
+ "grad_norm": 105.69800567626953,
5418
+ "learning_rate": 9.986737989125499e-06,
5419
+ "loss": 11.4488,
5420
+ "step": 7650
5421
+ },
5422
+ {
5423
+ "epoch": 0.33995151474805974,
5424
+ "grad_norm": 99.95671844482422,
5425
+ "learning_rate": 9.98672065316357e-06,
5426
+ "loss": 11.5145,
5427
+ "step": 7660
5428
+ },
5429
+ {
5430
+ "epoch": 0.3403953156811512,
5431
+ "grad_norm": 85.2020263671875,
5432
+ "learning_rate": 9.986703317201643e-06,
5433
+ "loss": 11.0807,
5434
+ "step": 7670
5435
+ },
5436
+ {
5437
+ "epoch": 0.34083911661424265,
5438
+ "grad_norm": 92.34159088134766,
5439
+ "learning_rate": 9.986685981239717e-06,
5440
+ "loss": 11.5631,
5441
+ "step": 7680
5442
+ },
5443
+ {
5444
+ "epoch": 0.34128291754733414,
5445
+ "grad_norm": 89.49488067626953,
5446
+ "learning_rate": 9.986668645277788e-06,
5447
+ "loss": 11.5445,
5448
+ "step": 7690
5449
+ },
5450
+ {
5451
+ "epoch": 0.34172671848042563,
5452
+ "grad_norm": 96.70406341552734,
5453
+ "learning_rate": 9.986651309315861e-06,
5454
+ "loss": 11.5824,
5455
+ "step": 7700
5456
+ },
5457
+ {
5458
+ "epoch": 0.34217051941351706,
5459
+ "grad_norm": 87.1051254272461,
5460
+ "learning_rate": 9.986633973353934e-06,
5461
+ "loss": 11.9654,
5462
+ "step": 7710
5463
+ },
5464
+ {
5465
+ "epoch": 0.34261432034660855,
5466
+ "grad_norm": 94.70158386230469,
5467
+ "learning_rate": 9.986616637392005e-06,
5468
+ "loss": 10.8494,
5469
+ "step": 7720
5470
+ },
5471
+ {
5472
+ "epoch": 0.3430581212797,
5473
+ "grad_norm": 110.57974243164062,
5474
+ "learning_rate": 9.986599301430079e-06,
5475
+ "loss": 11.244,
5476
+ "step": 7730
5477
+ },
5478
+ {
5479
+ "epoch": 0.34350192221279147,
5480
+ "grad_norm": 87.20235443115234,
5481
+ "learning_rate": 9.986581965468152e-06,
5482
+ "loss": 11.3683,
5483
+ "step": 7740
5484
+ },
5485
+ {
5486
+ "epoch": 0.3439457231458829,
5487
+ "grad_norm": 89.35726165771484,
5488
+ "learning_rate": 9.986564629506225e-06,
5489
+ "loss": 11.1392,
5490
+ "step": 7750
5491
+ },
5492
+ {
5493
+ "epoch": 0.3443895240789744,
5494
+ "grad_norm": 97.75891876220703,
5495
+ "learning_rate": 9.986547293544296e-06,
5496
+ "loss": 11.761,
5497
+ "step": 7760
5498
+ },
5499
+ {
5500
+ "epoch": 0.3448333250120658,
5501
+ "grad_norm": 84.02690887451172,
5502
+ "learning_rate": 9.98652995758237e-06,
5503
+ "loss": 11.2544,
5504
+ "step": 7770
5505
+ },
5506
+ {
5507
+ "epoch": 0.3452771259451573,
5508
+ "grad_norm": 89.77345275878906,
5509
+ "learning_rate": 9.986512621620442e-06,
5510
+ "loss": 12.2648,
5511
+ "step": 7780
5512
+ },
5513
+ {
5514
+ "epoch": 0.34572092687824874,
5515
+ "grad_norm": 96.23056030273438,
5516
+ "learning_rate": 9.986495285658514e-06,
5517
+ "loss": 11.7363,
5518
+ "step": 7790
5519
+ },
5520
+ {
5521
+ "epoch": 0.3461647278113402,
5522
+ "grad_norm": 83.2893295288086,
5523
+ "learning_rate": 9.986477949696587e-06,
5524
+ "loss": 11.6393,
5525
+ "step": 7800
5526
+ },
5527
+ {
5528
+ "epoch": 0.3466085287444317,
5529
+ "grad_norm": 88.02169036865234,
5530
+ "learning_rate": 9.98646061373466e-06,
5531
+ "loss": 11.9822,
5532
+ "step": 7810
5533
+ },
5534
+ {
5535
+ "epoch": 0.34705232967752314,
5536
+ "grad_norm": 90.59603881835938,
5537
+ "learning_rate": 9.986443277772731e-06,
5538
+ "loss": 11.1564,
5539
+ "step": 7820
5540
+ },
5541
+ {
5542
+ "epoch": 0.34749613061061463,
5543
+ "grad_norm": 80.67443084716797,
5544
+ "learning_rate": 9.986425941810804e-06,
5545
+ "loss": 11.7602,
5546
+ "step": 7830
5547
+ },
5548
+ {
5549
+ "epoch": 0.34793993154370606,
5550
+ "grad_norm": 110.11127471923828,
5551
+ "learning_rate": 9.986408605848877e-06,
5552
+ "loss": 11.5633,
5553
+ "step": 7840
5554
+ },
5555
+ {
5556
+ "epoch": 0.34838373247679755,
5557
+ "grad_norm": 93.8271255493164,
5558
+ "learning_rate": 9.986391269886949e-06,
5559
+ "loss": 11.7892,
5560
+ "step": 7850
5561
+ },
5562
+ {
5563
+ "epoch": 0.348827533409889,
5564
+ "grad_norm": 108.33939361572266,
5565
+ "learning_rate": 9.986373933925022e-06,
5566
+ "loss": 11.5551,
5567
+ "step": 7860
5568
+ },
5569
+ {
5570
+ "epoch": 0.34927133434298047,
5571
+ "grad_norm": 92.38509368896484,
5572
+ "learning_rate": 9.986356597963095e-06,
5573
+ "loss": 11.3955,
5574
+ "step": 7870
5575
+ },
5576
+ {
5577
+ "epoch": 0.3497151352760719,
5578
+ "grad_norm": 83.159423828125,
5579
+ "learning_rate": 9.986339262001166e-06,
5580
+ "loss": 11.7543,
5581
+ "step": 7880
5582
+ },
5583
+ {
5584
+ "epoch": 0.3501589362091634,
5585
+ "grad_norm": 90.84649658203125,
5586
+ "learning_rate": 9.98632192603924e-06,
5587
+ "loss": 11.3032,
5588
+ "step": 7890
5589
+ },
5590
+ {
5591
+ "epoch": 0.3506027371422548,
5592
+ "grad_norm": 98.68833923339844,
5593
+ "learning_rate": 9.986304590077312e-06,
5594
+ "loss": 11.6455,
5595
+ "step": 7900
5596
+ },
5597
+ {
5598
+ "epoch": 0.3510465380753463,
5599
+ "grad_norm": 96.10926055908203,
5600
+ "learning_rate": 9.986287254115384e-06,
5601
+ "loss": 11.6115,
5602
+ "step": 7910
5603
+ },
5604
+ {
5605
+ "epoch": 0.3514903390084378,
5606
+ "grad_norm": 108.88908386230469,
5607
+ "learning_rate": 9.986269918153457e-06,
5608
+ "loss": 11.0329,
5609
+ "step": 7920
5610
+ },
5611
+ {
5612
+ "epoch": 0.3519341399415292,
5613
+ "grad_norm": 85.24256896972656,
5614
+ "learning_rate": 9.98625258219153e-06,
5615
+ "loss": 11.3518,
5616
+ "step": 7930
5617
+ },
5618
+ {
5619
+ "epoch": 0.3523779408746207,
5620
+ "grad_norm": 100.0363540649414,
5621
+ "learning_rate": 9.986235246229601e-06,
5622
+ "loss": 11.4287,
5623
+ "step": 7940
5624
+ },
5625
+ {
5626
+ "epoch": 0.35282174180771214,
5627
+ "grad_norm": 101.91362762451172,
5628
+ "learning_rate": 9.986217910267674e-06,
5629
+ "loss": 10.8322,
5630
+ "step": 7950
5631
+ },
5632
+ {
5633
+ "epoch": 0.35326554274080363,
5634
+ "grad_norm": 78.89401245117188,
5635
+ "learning_rate": 9.986200574305748e-06,
5636
+ "loss": 11.2172,
5637
+ "step": 7960
5638
+ },
5639
+ {
5640
+ "epoch": 0.35370934367389506,
5641
+ "grad_norm": 99.1776123046875,
5642
+ "learning_rate": 9.98618323834382e-06,
5643
+ "loss": 11.6107,
5644
+ "step": 7970
5645
+ },
5646
+ {
5647
+ "epoch": 0.35415314460698655,
5648
+ "grad_norm": 88.57538604736328,
5649
+ "learning_rate": 9.986165902381892e-06,
5650
+ "loss": 11.4944,
5651
+ "step": 7980
5652
+ },
5653
+ {
5654
+ "epoch": 0.354596945540078,
5655
+ "grad_norm": 91.77239227294922,
5656
+ "learning_rate": 9.986148566419965e-06,
5657
+ "loss": 11.4736,
5658
+ "step": 7990
5659
+ },
5660
+ {
5661
+ "epoch": 0.35504074647316947,
5662
+ "grad_norm": 118.60701751708984,
5663
+ "learning_rate": 9.986131230458038e-06,
5664
+ "loss": 11.3651,
5665
+ "step": 8000
5666
+ },
5667
+ {
5668
+ "epoch": 0.35504074647316947,
5669
+ "eval_loss": 0.35784557461738586,
5670
+ "eval_runtime": 673.1581,
5671
+ "eval_samples_per_second": 1804.021,
5672
+ "eval_steps_per_second": 56.376,
5673
+ "step": 8000
5674
+ },
5675
+ {
5676
+ "epoch": 0.3554845474062609,
5677
+ "grad_norm": 78.45508575439453,
5678
+ "learning_rate": 9.98611389449611e-06,
5679
+ "loss": 11.5346,
5680
+ "step": 8010
5681
+ },
5682
+ {
5683
+ "epoch": 0.3559283483393524,
5684
+ "grad_norm": 93.72156524658203,
5685
+ "learning_rate": 9.986096558534183e-06,
5686
+ "loss": 11.7047,
5687
+ "step": 8020
5688
+ },
5689
+ {
5690
+ "epoch": 0.35637214927244387,
5691
+ "grad_norm": 108.60897064208984,
5692
+ "learning_rate": 9.986079222572256e-06,
5693
+ "loss": 11.5851,
5694
+ "step": 8030
5695
+ },
5696
+ {
5697
+ "epoch": 0.3568159502055353,
5698
+ "grad_norm": 98.00389862060547,
5699
+ "learning_rate": 9.986061886610327e-06,
5700
+ "loss": 11.8359,
5701
+ "step": 8040
5702
+ },
5703
+ {
5704
+ "epoch": 0.3572597511386268,
5705
+ "grad_norm": 96.44226837158203,
5706
+ "learning_rate": 9.9860445506484e-06,
5707
+ "loss": 11.6475,
5708
+ "step": 8050
5709
+ },
5710
+ {
5711
+ "epoch": 0.3577035520717182,
5712
+ "grad_norm": 80.3302993774414,
5713
+ "learning_rate": 9.986027214686473e-06,
5714
+ "loss": 11.3945,
5715
+ "step": 8060
5716
+ },
5717
+ {
5718
+ "epoch": 0.3581473530048097,
5719
+ "grad_norm": 96.11526489257812,
5720
+ "learning_rate": 9.986009878724545e-06,
5721
+ "loss": 11.0479,
5722
+ "step": 8070
5723
+ },
5724
+ {
5725
+ "epoch": 0.35859115393790114,
5726
+ "grad_norm": 97.33485412597656,
5727
+ "learning_rate": 9.985992542762618e-06,
5728
+ "loss": 11.8224,
5729
+ "step": 8080
5730
+ },
5731
+ {
5732
+ "epoch": 0.35903495487099263,
5733
+ "grad_norm": 82.29984283447266,
5734
+ "learning_rate": 9.98597520680069e-06,
5735
+ "loss": 11.5318,
5736
+ "step": 8090
5737
+ },
5738
+ {
5739
+ "epoch": 0.35947875580408406,
5740
+ "grad_norm": 93.84577941894531,
5741
+ "learning_rate": 9.985957870838762e-06,
5742
+ "loss": 11.4744,
5743
+ "step": 8100
5744
+ },
5745
+ {
5746
+ "epoch": 0.35992255673717555,
5747
+ "grad_norm": 84.21646881103516,
5748
+ "learning_rate": 9.985940534876835e-06,
5749
+ "loss": 10.9926,
5750
+ "step": 8110
5751
+ },
5752
+ {
5753
+ "epoch": 0.360366357670267,
5754
+ "grad_norm": 83.08773040771484,
5755
+ "learning_rate": 9.985923198914908e-06,
5756
+ "loss": 10.9091,
5757
+ "step": 8120
5758
+ },
5759
+ {
5760
+ "epoch": 0.36081015860335847,
5761
+ "grad_norm": 79.4489974975586,
5762
+ "learning_rate": 9.98590586295298e-06,
5763
+ "loss": 10.9493,
5764
+ "step": 8130
5765
+ },
5766
+ {
5767
+ "epoch": 0.3612539595364499,
5768
+ "grad_norm": 85.71393585205078,
5769
+ "learning_rate": 9.985888526991053e-06,
5770
+ "loss": 11.4266,
5771
+ "step": 8140
5772
+ },
5773
+ {
5774
+ "epoch": 0.3616977604695414,
5775
+ "grad_norm": 81.63021087646484,
5776
+ "learning_rate": 9.985871191029126e-06,
5777
+ "loss": 11.6757,
5778
+ "step": 8150
5779
+ },
5780
+ {
5781
+ "epoch": 0.36214156140263287,
5782
+ "grad_norm": 91.55906677246094,
5783
+ "learning_rate": 9.985853855067197e-06,
5784
+ "loss": 11.6191,
5785
+ "step": 8160
5786
+ },
5787
+ {
5788
+ "epoch": 0.3625853623357243,
5789
+ "grad_norm": 80.61488342285156,
5790
+ "learning_rate": 9.98583651910527e-06,
5791
+ "loss": 11.5607,
5792
+ "step": 8170
5793
+ },
5794
+ {
5795
+ "epoch": 0.3630291632688158,
5796
+ "grad_norm": 100.6302261352539,
5797
+ "learning_rate": 9.985819183143343e-06,
5798
+ "loss": 11.6338,
5799
+ "step": 8180
5800
+ },
5801
+ {
5802
+ "epoch": 0.3634729642019072,
5803
+ "grad_norm": 98.94048309326172,
5804
+ "learning_rate": 9.985801847181416e-06,
5805
+ "loss": 11.8984,
5806
+ "step": 8190
5807
+ },
5808
+ {
5809
+ "epoch": 0.3639167651349987,
5810
+ "grad_norm": 94.3434066772461,
5811
+ "learning_rate": 9.985784511219488e-06,
5812
+ "loss": 11.4629,
5813
+ "step": 8200
5814
+ },
5815
+ {
5816
+ "epoch": 0.36436056606809014,
5817
+ "grad_norm": 117.29963684082031,
5818
+ "learning_rate": 9.985767175257561e-06,
5819
+ "loss": 11.5717,
5820
+ "step": 8210
5821
+ },
5822
+ {
5823
+ "epoch": 0.36480436700118163,
5824
+ "grad_norm": 96.46138763427734,
5825
+ "learning_rate": 9.985749839295634e-06,
5826
+ "loss": 11.2935,
5827
+ "step": 8220
5828
+ },
5829
+ {
5830
+ "epoch": 0.36524816793427306,
5831
+ "grad_norm": 88.48851776123047,
5832
+ "learning_rate": 9.985732503333705e-06,
5833
+ "loss": 11.4625,
5834
+ "step": 8230
5835
+ },
5836
+ {
5837
+ "epoch": 0.36569196886736455,
5838
+ "grad_norm": 90.18971252441406,
5839
+ "learning_rate": 9.985715167371778e-06,
5840
+ "loss": 11.2449,
5841
+ "step": 8240
5842
+ },
5843
+ {
5844
+ "epoch": 0.366135769800456,
5845
+ "grad_norm": 87.7426986694336,
5846
+ "learning_rate": 9.985697831409852e-06,
5847
+ "loss": 11.3782,
5848
+ "step": 8250
5849
+ },
5850
+ {
5851
+ "epoch": 0.36657957073354747,
5852
+ "grad_norm": 97.00252532958984,
5853
+ "learning_rate": 9.985680495447923e-06,
5854
+ "loss": 11.3337,
5855
+ "step": 8260
5856
+ },
5857
+ {
5858
+ "epoch": 0.36702337166663895,
5859
+ "grad_norm": 109.61273193359375,
5860
+ "learning_rate": 9.985663159485996e-06,
5861
+ "loss": 11.0625,
5862
+ "step": 8270
5863
+ },
5864
+ {
5865
+ "epoch": 0.3674671725997304,
5866
+ "grad_norm": 86.43873596191406,
5867
+ "learning_rate": 9.985645823524069e-06,
5868
+ "loss": 10.7205,
5869
+ "step": 8280
5870
+ },
5871
+ {
5872
+ "epoch": 0.36791097353282187,
5873
+ "grad_norm": 110.65450286865234,
5874
+ "learning_rate": 9.98562848756214e-06,
5875
+ "loss": 11.3779,
5876
+ "step": 8290
5877
+ },
5878
+ {
5879
+ "epoch": 0.3683547744659133,
5880
+ "grad_norm": 97.357421875,
5881
+ "learning_rate": 9.985611151600214e-06,
5882
+ "loss": 11.049,
5883
+ "step": 8300
5884
+ },
5885
+ {
5886
+ "epoch": 0.3687985753990048,
5887
+ "grad_norm": 72.67398834228516,
5888
+ "learning_rate": 9.985593815638287e-06,
5889
+ "loss": 10.9327,
5890
+ "step": 8310
5891
+ },
5892
+ {
5893
+ "epoch": 0.3692423763320962,
5894
+ "grad_norm": 80.5442886352539,
5895
+ "learning_rate": 9.985576479676358e-06,
5896
+ "loss": 11.3067,
5897
+ "step": 8320
5898
+ },
5899
+ {
5900
+ "epoch": 0.3696861772651877,
5901
+ "grad_norm": 104.49150848388672,
5902
+ "learning_rate": 9.985559143714431e-06,
5903
+ "loss": 11.7007,
5904
+ "step": 8330
5905
+ },
5906
+ {
5907
+ "epoch": 0.37012997819827914,
5908
+ "grad_norm": 93.68840789794922,
5909
+ "learning_rate": 9.985541807752504e-06,
5910
+ "loss": 12.1018,
5911
+ "step": 8340
5912
+ },
5913
+ {
5914
+ "epoch": 0.37057377913137063,
5915
+ "grad_norm": 92.62474060058594,
5916
+ "learning_rate": 9.985524471790576e-06,
5917
+ "loss": 11.2158,
5918
+ "step": 8350
5919
+ },
5920
+ {
5921
+ "epoch": 0.37101758006446206,
5922
+ "grad_norm": 94.18134307861328,
5923
+ "learning_rate": 9.985507135828649e-06,
5924
+ "loss": 11.551,
5925
+ "step": 8360
5926
+ },
5927
+ {
5928
+ "epoch": 0.37146138099755355,
5929
+ "grad_norm": 97.85765838623047,
5930
+ "learning_rate": 9.985489799866722e-06,
5931
+ "loss": 11.6042,
5932
+ "step": 8370
5933
+ },
5934
+ {
5935
+ "epoch": 0.37190518193064503,
5936
+ "grad_norm": 88.52871704101562,
5937
+ "learning_rate": 9.985472463904795e-06,
5938
+ "loss": 11.8907,
5939
+ "step": 8380
5940
+ },
5941
+ {
5942
+ "epoch": 0.37234898286373647,
5943
+ "grad_norm": 95.93720245361328,
5944
+ "learning_rate": 9.985455127942866e-06,
5945
+ "loss": 11.1994,
5946
+ "step": 8390
5947
+ },
5948
+ {
5949
+ "epoch": 0.37279278379682795,
5950
+ "grad_norm": 73.47252655029297,
5951
+ "learning_rate": 9.98543779198094e-06,
5952
+ "loss": 11.1229,
5953
+ "step": 8400
5954
+ },
5955
+ {
5956
+ "epoch": 0.3732365847299194,
5957
+ "grad_norm": 87.63044738769531,
5958
+ "learning_rate": 9.985420456019012e-06,
5959
+ "loss": 11.2802,
5960
+ "step": 8410
5961
+ },
5962
+ {
5963
+ "epoch": 0.37368038566301087,
5964
+ "grad_norm": 85.62527465820312,
5965
+ "learning_rate": 9.985403120057084e-06,
5966
+ "loss": 11.4917,
5967
+ "step": 8420
5968
+ },
5969
+ {
5970
+ "epoch": 0.3741241865961023,
5971
+ "grad_norm": 84.97439575195312,
5972
+ "learning_rate": 9.985385784095157e-06,
5973
+ "loss": 11.4111,
5974
+ "step": 8430
5975
+ },
5976
+ {
5977
+ "epoch": 0.3745679875291938,
5978
+ "grad_norm": 91.50364685058594,
5979
+ "learning_rate": 9.98536844813323e-06,
5980
+ "loss": 11.4548,
5981
+ "step": 8440
5982
+ },
5983
+ {
5984
+ "epoch": 0.3750117884622852,
5985
+ "grad_norm": 91.25043487548828,
5986
+ "learning_rate": 9.985351112171301e-06,
5987
+ "loss": 11.793,
5988
+ "step": 8450
5989
+ },
5990
+ {
5991
+ "epoch": 0.3754555893953767,
5992
+ "grad_norm": 93.69058227539062,
5993
+ "learning_rate": 9.985333776209374e-06,
5994
+ "loss": 11.0371,
5995
+ "step": 8460
5996
+ },
5997
+ {
5998
+ "epoch": 0.37589939032846814,
5999
+ "grad_norm": 89.45205688476562,
6000
+ "learning_rate": 9.985316440247447e-06,
6001
+ "loss": 11.3039,
6002
+ "step": 8470
6003
+ },
6004
+ {
6005
+ "epoch": 0.37634319126155963,
6006
+ "grad_norm": 97.13536071777344,
6007
+ "learning_rate": 9.98529910428552e-06,
6008
+ "loss": 11.1143,
6009
+ "step": 8480
6010
+ },
6011
+ {
6012
+ "epoch": 0.3767869921946511,
6013
+ "grad_norm": 95.88386535644531,
6014
+ "learning_rate": 9.985281768323592e-06,
6015
+ "loss": 10.9521,
6016
+ "step": 8490
6017
+ },
6018
+ {
6019
+ "epoch": 0.37723079312774255,
6020
+ "grad_norm": 107.3424072265625,
6021
+ "learning_rate": 9.985264432361665e-06,
6022
+ "loss": 11.4563,
6023
+ "step": 8500
6024
+ },
6025
+ {
6026
+ "epoch": 0.37767459406083403,
6027
+ "grad_norm": 78.75535583496094,
6028
+ "learning_rate": 9.985247096399738e-06,
6029
+ "loss": 11.6325,
6030
+ "step": 8510
6031
+ },
6032
+ {
6033
+ "epoch": 0.37811839499392547,
6034
+ "grad_norm": 93.4799575805664,
6035
+ "learning_rate": 9.98522976043781e-06,
6036
+ "loss": 11.5845,
6037
+ "step": 8520
6038
+ },
6039
+ {
6040
+ "epoch": 0.37856219592701695,
6041
+ "grad_norm": 82.9742202758789,
6042
+ "learning_rate": 9.985212424475882e-06,
6043
+ "loss": 10.9067,
6044
+ "step": 8530
6045
+ },
6046
+ {
6047
+ "epoch": 0.3790059968601084,
6048
+ "grad_norm": 86.02015686035156,
6049
+ "learning_rate": 9.985195088513956e-06,
6050
+ "loss": 11.7097,
6051
+ "step": 8540
6052
+ },
6053
+ {
6054
+ "epoch": 0.37944979779319987,
6055
+ "grad_norm": 78.73582458496094,
6056
+ "learning_rate": 9.985177752552027e-06,
6057
+ "loss": 11.3756,
6058
+ "step": 8550
6059
+ },
6060
+ {
6061
+ "epoch": 0.3798935987262913,
6062
+ "grad_norm": 86.17765808105469,
6063
+ "learning_rate": 9.9851604165901e-06,
6064
+ "loss": 11.1242,
6065
+ "step": 8560
6066
+ },
6067
+ {
6068
+ "epoch": 0.3803373996593828,
6069
+ "grad_norm": 103.56576538085938,
6070
+ "learning_rate": 9.985143080628173e-06,
6071
+ "loss": 11.5288,
6072
+ "step": 8570
6073
+ },
6074
+ {
6075
+ "epoch": 0.3807812005924742,
6076
+ "grad_norm": 96.16366577148438,
6077
+ "learning_rate": 9.985125744666244e-06,
6078
+ "loss": 11.3524,
6079
+ "step": 8580
6080
+ },
6081
+ {
6082
+ "epoch": 0.3812250015255657,
6083
+ "grad_norm": 79.89984893798828,
6084
+ "learning_rate": 9.985108408704318e-06,
6085
+ "loss": 11.2257,
6086
+ "step": 8590
6087
+ },
6088
+ {
6089
+ "epoch": 0.3816688024586572,
6090
+ "grad_norm": 91.93770599365234,
6091
+ "learning_rate": 9.98509107274239e-06,
6092
+ "loss": 11.5471,
6093
+ "step": 8600
6094
+ },
6095
+ {
6096
+ "epoch": 0.38211260339174863,
6097
+ "grad_norm": 87.27505493164062,
6098
+ "learning_rate": 9.985073736780464e-06,
6099
+ "loss": 11.2449,
6100
+ "step": 8610
6101
+ },
6102
+ {
6103
+ "epoch": 0.3825564043248401,
6104
+ "grad_norm": 93.93415069580078,
6105
+ "learning_rate": 9.985056400818535e-06,
6106
+ "loss": 11.0035,
6107
+ "step": 8620
6108
+ },
6109
+ {
6110
+ "epoch": 0.38300020525793155,
6111
+ "grad_norm": 88.42649841308594,
6112
+ "learning_rate": 9.985039064856608e-06,
6113
+ "loss": 11.3949,
6114
+ "step": 8630
6115
+ },
6116
+ {
6117
+ "epoch": 0.38344400619102303,
6118
+ "grad_norm": 87.21992492675781,
6119
+ "learning_rate": 9.985021728894681e-06,
6120
+ "loss": 11.419,
6121
+ "step": 8640
6122
+ },
6123
+ {
6124
+ "epoch": 0.38388780712411447,
6125
+ "grad_norm": 96.35975646972656,
6126
+ "learning_rate": 9.985004392932753e-06,
6127
+ "loss": 10.8626,
6128
+ "step": 8650
6129
+ },
6130
+ {
6131
+ "epoch": 0.38433160805720595,
6132
+ "grad_norm": 90.79749298095703,
6133
+ "learning_rate": 9.984987056970826e-06,
6134
+ "loss": 11.9004,
6135
+ "step": 8660
6136
+ },
6137
+ {
6138
+ "epoch": 0.3847754089902974,
6139
+ "grad_norm": 88.30585479736328,
6140
+ "learning_rate": 9.984969721008899e-06,
6141
+ "loss": 11.8872,
6142
+ "step": 8670
6143
+ },
6144
+ {
6145
+ "epoch": 0.38521920992338887,
6146
+ "grad_norm": 68.6938247680664,
6147
+ "learning_rate": 9.98495238504697e-06,
6148
+ "loss": 11.1578,
6149
+ "step": 8680
6150
+ },
6151
+ {
6152
+ "epoch": 0.3856630108564803,
6153
+ "grad_norm": 87.89897918701172,
6154
+ "learning_rate": 9.984935049085043e-06,
6155
+ "loss": 11.0376,
6156
+ "step": 8690
6157
+ },
6158
+ {
6159
+ "epoch": 0.3861068117895718,
6160
+ "grad_norm": 103.39437103271484,
6161
+ "learning_rate": 9.984917713123116e-06,
6162
+ "loss": 11.6238,
6163
+ "step": 8700
6164
+ },
6165
+ {
6166
+ "epoch": 0.3865506127226632,
6167
+ "grad_norm": 82.58814239501953,
6168
+ "learning_rate": 9.984900377161188e-06,
6169
+ "loss": 11.2413,
6170
+ "step": 8710
6171
+ },
6172
+ {
6173
+ "epoch": 0.3869944136557547,
6174
+ "grad_norm": 73.86261749267578,
6175
+ "learning_rate": 9.98488304119926e-06,
6176
+ "loss": 10.9585,
6177
+ "step": 8720
6178
+ },
6179
+ {
6180
+ "epoch": 0.3874382145888462,
6181
+ "grad_norm": 100.28836822509766,
6182
+ "learning_rate": 9.984865705237334e-06,
6183
+ "loss": 10.9422,
6184
+ "step": 8730
6185
+ },
6186
+ {
6187
+ "epoch": 0.38788201552193763,
6188
+ "grad_norm": 88.46509552001953,
6189
+ "learning_rate": 9.984848369275407e-06,
6190
+ "loss": 11.1186,
6191
+ "step": 8740
6192
+ },
6193
+ {
6194
+ "epoch": 0.3883258164550291,
6195
+ "grad_norm": 90.18559265136719,
6196
+ "learning_rate": 9.984831033313478e-06,
6197
+ "loss": 11.438,
6198
+ "step": 8750
6199
+ },
6200
+ {
6201
+ "epoch": 0.38876961738812055,
6202
+ "grad_norm": 99.76158142089844,
6203
+ "learning_rate": 9.984813697351551e-06,
6204
+ "loss": 11.6788,
6205
+ "step": 8760
6206
+ },
6207
+ {
6208
+ "epoch": 0.38921341832121203,
6209
+ "grad_norm": 78.58843994140625,
6210
+ "learning_rate": 9.984796361389624e-06,
6211
+ "loss": 11.2294,
6212
+ "step": 8770
6213
+ },
6214
+ {
6215
+ "epoch": 0.38965721925430347,
6216
+ "grad_norm": 117.36835479736328,
6217
+ "learning_rate": 9.984779025427696e-06,
6218
+ "loss": 11.3633,
6219
+ "step": 8780
6220
+ },
6221
+ {
6222
+ "epoch": 0.39010102018739495,
6223
+ "grad_norm": 81.84542846679688,
6224
+ "learning_rate": 9.984761689465769e-06,
6225
+ "loss": 11.9371,
6226
+ "step": 8790
6227
+ },
6228
+ {
6229
+ "epoch": 0.3905448211204864,
6230
+ "grad_norm": 84.65067291259766,
6231
+ "learning_rate": 9.984744353503842e-06,
6232
+ "loss": 11.3477,
6233
+ "step": 8800
6234
+ },
6235
+ {
6236
+ "epoch": 0.3909886220535779,
6237
+ "grad_norm": 86.41151428222656,
6238
+ "learning_rate": 9.984727017541913e-06,
6239
+ "loss": 11.537,
6240
+ "step": 8810
6241
+ },
6242
+ {
6243
+ "epoch": 0.3914324229866693,
6244
+ "grad_norm": 85.5174331665039,
6245
+ "learning_rate": 9.984709681579986e-06,
6246
+ "loss": 11.5844,
6247
+ "step": 8820
6248
+ },
6249
+ {
6250
+ "epoch": 0.3918762239197608,
6251
+ "grad_norm": 103.24414825439453,
6252
+ "learning_rate": 9.98469234561806e-06,
6253
+ "loss": 10.8962,
6254
+ "step": 8830
6255
+ },
6256
+ {
6257
+ "epoch": 0.3923200248528523,
6258
+ "grad_norm": 101.08570098876953,
6259
+ "learning_rate": 9.984675009656131e-06,
6260
+ "loss": 11.1054,
6261
+ "step": 8840
6262
+ },
6263
+ {
6264
+ "epoch": 0.3927638257859437,
6265
+ "grad_norm": 89.28972625732422,
6266
+ "learning_rate": 9.984657673694204e-06,
6267
+ "loss": 10.9952,
6268
+ "step": 8850
6269
+ },
6270
+ {
6271
+ "epoch": 0.3932076267190352,
6272
+ "grad_norm": 97.5589370727539,
6273
+ "learning_rate": 9.984640337732277e-06,
6274
+ "loss": 11.2572,
6275
+ "step": 8860
6276
+ },
6277
+ {
6278
+ "epoch": 0.39365142765212663,
6279
+ "grad_norm": 85.52608489990234,
6280
+ "learning_rate": 9.98462300177035e-06,
6281
+ "loss": 11.2462,
6282
+ "step": 8870
6283
+ },
6284
+ {
6285
+ "epoch": 0.3940952285852181,
6286
+ "grad_norm": 88.88489532470703,
6287
+ "learning_rate": 9.984605665808422e-06,
6288
+ "loss": 11.1294,
6289
+ "step": 8880
6290
+ },
6291
+ {
6292
+ "epoch": 0.39453902951830955,
6293
+ "grad_norm": 98.8681411743164,
6294
+ "learning_rate": 9.984588329846495e-06,
6295
+ "loss": 11.6495,
6296
+ "step": 8890
6297
+ },
6298
+ {
6299
+ "epoch": 0.39498283045140103,
6300
+ "grad_norm": 91.84007263183594,
6301
+ "learning_rate": 9.984570993884568e-06,
6302
+ "loss": 11.3472,
6303
+ "step": 8900
6304
+ },
6305
+ {
6306
+ "epoch": 0.39542663138449247,
6307
+ "grad_norm": 68.3472671508789,
6308
+ "learning_rate": 9.984553657922639e-06,
6309
+ "loss": 11.424,
6310
+ "step": 8910
6311
+ },
6312
+ {
6313
+ "epoch": 0.39587043231758395,
6314
+ "grad_norm": 83.57421875,
6315
+ "learning_rate": 9.984536321960712e-06,
6316
+ "loss": 11.078,
6317
+ "step": 8920
6318
+ },
6319
+ {
6320
+ "epoch": 0.3963142332506754,
6321
+ "grad_norm": 87.4074935913086,
6322
+ "learning_rate": 9.984518985998785e-06,
6323
+ "loss": 11.178,
6324
+ "step": 8930
6325
+ },
6326
+ {
6327
+ "epoch": 0.3967580341837669,
6328
+ "grad_norm": 73.35061645507812,
6329
+ "learning_rate": 9.984501650036857e-06,
6330
+ "loss": 11.0786,
6331
+ "step": 8940
6332
+ },
6333
+ {
6334
+ "epoch": 0.39720183511685836,
6335
+ "grad_norm": 79.557861328125,
6336
+ "learning_rate": 9.98448431407493e-06,
6337
+ "loss": 11.3864,
6338
+ "step": 8950
6339
+ },
6340
+ {
6341
+ "epoch": 0.3976456360499498,
6342
+ "grad_norm": 86.81566619873047,
6343
+ "learning_rate": 9.984466978113003e-06,
6344
+ "loss": 10.7141,
6345
+ "step": 8960
6346
+ },
6347
+ {
6348
+ "epoch": 0.3980894369830413,
6349
+ "grad_norm": 86.90424346923828,
6350
+ "learning_rate": 9.984449642151074e-06,
6351
+ "loss": 10.7673,
6352
+ "step": 8970
6353
+ },
6354
+ {
6355
+ "epoch": 0.3985332379161327,
6356
+ "grad_norm": 92.93916320800781,
6357
+ "learning_rate": 9.984432306189147e-06,
6358
+ "loss": 11.2373,
6359
+ "step": 8980
6360
+ },
6361
+ {
6362
+ "epoch": 0.3989770388492242,
6363
+ "grad_norm": 96.02029418945312,
6364
+ "learning_rate": 9.98441497022722e-06,
6365
+ "loss": 11.1749,
6366
+ "step": 8990
6367
+ },
6368
+ {
6369
+ "epoch": 0.39942083978231563,
6370
+ "grad_norm": 83.88191223144531,
6371
+ "learning_rate": 9.984397634265293e-06,
6372
+ "loss": 11.7742,
6373
+ "step": 9000
6374
+ },
6375
+ {
6376
+ "epoch": 0.39942083978231563,
6377
+ "eval_loss": 0.35243040323257446,
6378
+ "eval_runtime": 674.8358,
6379
+ "eval_samples_per_second": 1799.535,
6380
+ "eval_steps_per_second": 56.236,
6381
+ "step": 9000
6382
+ },
6383
+ {
6384
+ "epoch": 0.3998646407154071,
6385
+ "grad_norm": 73.33333587646484,
6386
+ "learning_rate": 9.984380298303365e-06,
6387
+ "loss": 10.9059,
6388
+ "step": 9010
6389
+ },
6390
+ {
6391
+ "epoch": 0.40030844164849855,
6392
+ "grad_norm": 87.8101806640625,
6393
+ "learning_rate": 9.984362962341438e-06,
6394
+ "loss": 11.2553,
6395
+ "step": 9020
6396
+ },
6397
+ {
6398
+ "epoch": 0.40075224258159003,
6399
+ "grad_norm": 79.9136734008789,
6400
+ "learning_rate": 9.984345626379511e-06,
6401
+ "loss": 11.2405,
6402
+ "step": 9030
6403
+ },
6404
+ {
6405
+ "epoch": 0.40119604351468147,
6406
+ "grad_norm": 92.52330017089844,
6407
+ "learning_rate": 9.984328290417582e-06,
6408
+ "loss": 10.9439,
6409
+ "step": 9040
6410
+ },
6411
+ {
6412
+ "epoch": 0.40163984444777295,
6413
+ "grad_norm": 92.92615509033203,
6414
+ "learning_rate": 9.984310954455655e-06,
6415
+ "loss": 11.3613,
6416
+ "step": 9050
6417
+ },
6418
+ {
6419
+ "epoch": 0.40208364538086444,
6420
+ "grad_norm": 87.73091125488281,
6421
+ "learning_rate": 9.984293618493729e-06,
6422
+ "loss": 11.2295,
6423
+ "step": 9060
6424
+ },
6425
+ {
6426
+ "epoch": 0.4025274463139559,
6427
+ "grad_norm": 94.13227081298828,
6428
+ "learning_rate": 9.9842762825318e-06,
6429
+ "loss": 11.305,
6430
+ "step": 9070
6431
+ },
6432
+ {
6433
+ "epoch": 0.40297124724704736,
6434
+ "grad_norm": 77.8934097290039,
6435
+ "learning_rate": 9.984258946569873e-06,
6436
+ "loss": 11.0494,
6437
+ "step": 9080
6438
+ },
6439
+ {
6440
+ "epoch": 0.4034150481801388,
6441
+ "grad_norm": 76.98465728759766,
6442
+ "learning_rate": 9.984241610607946e-06,
6443
+ "loss": 11.0135,
6444
+ "step": 9090
6445
+ },
6446
+ {
6447
+ "epoch": 0.4038588491132303,
6448
+ "grad_norm": 86.28607177734375,
6449
+ "learning_rate": 9.984224274646019e-06,
6450
+ "loss": 11.2476,
6451
+ "step": 9100
6452
+ },
6453
+ {
6454
+ "epoch": 0.4043026500463217,
6455
+ "grad_norm": 83.22606658935547,
6456
+ "learning_rate": 9.98420693868409e-06,
6457
+ "loss": 11.472,
6458
+ "step": 9110
6459
+ },
6460
+ {
6461
+ "epoch": 0.4047464509794132,
6462
+ "grad_norm": 87.87577819824219,
6463
+ "learning_rate": 9.984189602722164e-06,
6464
+ "loss": 11.237,
6465
+ "step": 9120
6466
+ },
6467
+ {
6468
+ "epoch": 0.40519025191250463,
6469
+ "grad_norm": 93.7770004272461,
6470
+ "learning_rate": 9.984172266760237e-06,
6471
+ "loss": 11.2706,
6472
+ "step": 9130
6473
+ },
6474
+ {
6475
+ "epoch": 0.4056340528455961,
6476
+ "grad_norm": 80.65582275390625,
6477
+ "learning_rate": 9.984154930798308e-06,
6478
+ "loss": 11.3943,
6479
+ "step": 9140
6480
+ },
6481
+ {
6482
+ "epoch": 0.40607785377868755,
6483
+ "grad_norm": 86.08631896972656,
6484
+ "learning_rate": 9.984137594836381e-06,
6485
+ "loss": 11.2704,
6486
+ "step": 9150
6487
+ },
6488
+ {
6489
+ "epoch": 0.40652165471177903,
6490
+ "grad_norm": 79.62726593017578,
6491
+ "learning_rate": 9.984120258874454e-06,
6492
+ "loss": 11.5724,
6493
+ "step": 9160
6494
+ },
6495
+ {
6496
+ "epoch": 0.40696545564487047,
6497
+ "grad_norm": 88.90939331054688,
6498
+ "learning_rate": 9.984102922912526e-06,
6499
+ "loss": 11.2739,
6500
+ "step": 9170
6501
+ },
6502
+ {
6503
+ "epoch": 0.40740925657796195,
6504
+ "grad_norm": 92.309814453125,
6505
+ "learning_rate": 9.984085586950599e-06,
6506
+ "loss": 11.535,
6507
+ "step": 9180
6508
+ },
6509
+ {
6510
+ "epoch": 0.40785305751105344,
6511
+ "grad_norm": 83.53838348388672,
6512
+ "learning_rate": 9.984068250988672e-06,
6513
+ "loss": 11.3033,
6514
+ "step": 9190
6515
+ },
6516
+ {
6517
+ "epoch": 0.4082968584441449,
6518
+ "grad_norm": 92.0191421508789,
6519
+ "learning_rate": 9.984050915026743e-06,
6520
+ "loss": 11.8525,
6521
+ "step": 9200
6522
+ },
6523
+ {
6524
+ "epoch": 0.40874065937723636,
6525
+ "grad_norm": 96.62713623046875,
6526
+ "learning_rate": 9.984033579064816e-06,
6527
+ "loss": 10.9853,
6528
+ "step": 9210
6529
+ },
6530
+ {
6531
+ "epoch": 0.4091844603103278,
6532
+ "grad_norm": 92.35746765136719,
6533
+ "learning_rate": 9.98401624310289e-06,
6534
+ "loss": 11.1736,
6535
+ "step": 9220
6536
+ },
6537
+ {
6538
+ "epoch": 0.4096282612434193,
6539
+ "grad_norm": 87.0098876953125,
6540
+ "learning_rate": 9.98399890714096e-06,
6541
+ "loss": 11.3787,
6542
+ "step": 9230
6543
+ },
6544
+ {
6545
+ "epoch": 0.4100720621765107,
6546
+ "grad_norm": 75.16019439697266,
6547
+ "learning_rate": 9.983981571179034e-06,
6548
+ "loss": 11.0318,
6549
+ "step": 9240
6550
+ },
6551
+ {
6552
+ "epoch": 0.4105158631096022,
6553
+ "grad_norm": 89.38213348388672,
6554
+ "learning_rate": 9.983964235217107e-06,
6555
+ "loss": 10.8708,
6556
+ "step": 9250
6557
+ },
6558
+ {
6559
+ "epoch": 0.41095966404269363,
6560
+ "grad_norm": 78.94715881347656,
6561
+ "learning_rate": 9.983946899255178e-06,
6562
+ "loss": 11.1103,
6563
+ "step": 9260
6564
+ },
6565
+ {
6566
+ "epoch": 0.4114034649757851,
6567
+ "grad_norm": 93.04794311523438,
6568
+ "learning_rate": 9.983929563293251e-06,
6569
+ "loss": 11.2785,
6570
+ "step": 9270
6571
+ },
6572
+ {
6573
+ "epoch": 0.41184726590887655,
6574
+ "grad_norm": 91.3328857421875,
6575
+ "learning_rate": 9.983912227331324e-06,
6576
+ "loss": 11.3683,
6577
+ "step": 9280
6578
+ },
6579
+ {
6580
+ "epoch": 0.41229106684196803,
6581
+ "grad_norm": 83.09625244140625,
6582
+ "learning_rate": 9.983894891369397e-06,
6583
+ "loss": 10.9742,
6584
+ "step": 9290
6585
+ },
6586
+ {
6587
+ "epoch": 0.4127348677750595,
6588
+ "grad_norm": 88.08326721191406,
6589
+ "learning_rate": 9.983877555407469e-06,
6590
+ "loss": 11.1625,
6591
+ "step": 9300
6592
+ },
6593
+ {
6594
+ "epoch": 0.41317866870815095,
6595
+ "grad_norm": 102.78692626953125,
6596
+ "learning_rate": 9.983860219445542e-06,
6597
+ "loss": 11.5739,
6598
+ "step": 9310
6599
+ },
6600
+ {
6601
+ "epoch": 0.41362246964124244,
6602
+ "grad_norm": 95.94892883300781,
6603
+ "learning_rate": 9.983842883483615e-06,
6604
+ "loss": 10.9203,
6605
+ "step": 9320
6606
+ },
6607
+ {
6608
+ "epoch": 0.4140662705743339,
6609
+ "grad_norm": 116.64979553222656,
6610
+ "learning_rate": 9.983825547521686e-06,
6611
+ "loss": 11.2924,
6612
+ "step": 9330
6613
+ },
6614
+ {
6615
+ "epoch": 0.41451007150742536,
6616
+ "grad_norm": 83.40705871582031,
6617
+ "learning_rate": 9.98380821155976e-06,
6618
+ "loss": 12.1134,
6619
+ "step": 9340
6620
+ },
6621
+ {
6622
+ "epoch": 0.4149538724405168,
6623
+ "grad_norm": 92.19294738769531,
6624
+ "learning_rate": 9.983790875597833e-06,
6625
+ "loss": 11.2714,
6626
+ "step": 9350
6627
+ },
6628
+ {
6629
+ "epoch": 0.4153976733736083,
6630
+ "grad_norm": 78.88662719726562,
6631
+ "learning_rate": 9.983773539635904e-06,
6632
+ "loss": 10.875,
6633
+ "step": 9360
6634
+ },
6635
+ {
6636
+ "epoch": 0.4158414743066997,
6637
+ "grad_norm": 82.31551361083984,
6638
+ "learning_rate": 9.983756203673977e-06,
6639
+ "loss": 10.9331,
6640
+ "step": 9370
6641
+ },
6642
+ {
6643
+ "epoch": 0.4162852752397912,
6644
+ "grad_norm": 92.06917572021484,
6645
+ "learning_rate": 9.98373886771205e-06,
6646
+ "loss": 11.0438,
6647
+ "step": 9380
6648
+ },
6649
+ {
6650
+ "epoch": 0.41672907617288263,
6651
+ "grad_norm": 81.9530029296875,
6652
+ "learning_rate": 9.983721531750121e-06,
6653
+ "loss": 11.4246,
6654
+ "step": 9390
6655
+ },
6656
+ {
6657
+ "epoch": 0.4171728771059741,
6658
+ "grad_norm": 88.1327896118164,
6659
+ "learning_rate": 9.983704195788195e-06,
6660
+ "loss": 11.0952,
6661
+ "step": 9400
6662
+ },
6663
+ {
6664
+ "epoch": 0.4176166780390656,
6665
+ "grad_norm": 98.14168548583984,
6666
+ "learning_rate": 9.983686859826268e-06,
6667
+ "loss": 10.9578,
6668
+ "step": 9410
6669
+ },
6670
+ {
6671
+ "epoch": 0.41806047897215703,
6672
+ "grad_norm": 98.64930725097656,
6673
+ "learning_rate": 9.983669523864339e-06,
6674
+ "loss": 12.0788,
6675
+ "step": 9420
6676
+ },
6677
+ {
6678
+ "epoch": 0.4185042799052485,
6679
+ "grad_norm": 90.62484741210938,
6680
+ "learning_rate": 9.983652187902412e-06,
6681
+ "loss": 11.4321,
6682
+ "step": 9430
6683
+ },
6684
+ {
6685
+ "epoch": 0.41894808083833995,
6686
+ "grad_norm": 86.90058898925781,
6687
+ "learning_rate": 9.983634851940485e-06,
6688
+ "loss": 11.3379,
6689
+ "step": 9440
6690
+ },
6691
+ {
6692
+ "epoch": 0.41939188177143144,
6693
+ "grad_norm": 95.19513702392578,
6694
+ "learning_rate": 9.983617515978557e-06,
6695
+ "loss": 11.4713,
6696
+ "step": 9450
6697
+ },
6698
+ {
6699
+ "epoch": 0.4198356827045229,
6700
+ "grad_norm": 86.9664535522461,
6701
+ "learning_rate": 9.98360018001663e-06,
6702
+ "loss": 11.6491,
6703
+ "step": 9460
6704
+ },
6705
+ {
6706
+ "epoch": 0.42027948363761436,
6707
+ "grad_norm": 83.81656646728516,
6708
+ "learning_rate": 9.983582844054703e-06,
6709
+ "loss": 10.9501,
6710
+ "step": 9470
6711
+ },
6712
+ {
6713
+ "epoch": 0.4207232845707058,
6714
+ "grad_norm": 80.9144058227539,
6715
+ "learning_rate": 9.983565508092774e-06,
6716
+ "loss": 11.2825,
6717
+ "step": 9480
6718
+ },
6719
+ {
6720
+ "epoch": 0.4211670855037973,
6721
+ "grad_norm": 85.7936782836914,
6722
+ "learning_rate": 9.983548172130847e-06,
6723
+ "loss": 10.8854,
6724
+ "step": 9490
6725
+ },
6726
+ {
6727
+ "epoch": 0.4216108864368887,
6728
+ "grad_norm": 94.24036407470703,
6729
+ "learning_rate": 9.98353083616892e-06,
6730
+ "loss": 11.5424,
6731
+ "step": 9500
6732
+ },
6733
+ {
6734
+ "epoch": 0.4220546873699802,
6735
+ "grad_norm": 86.44004821777344,
6736
+ "learning_rate": 9.983513500206993e-06,
6737
+ "loss": 11.6961,
6738
+ "step": 9510
6739
+ },
6740
+ {
6741
+ "epoch": 0.4224984883030717,
6742
+ "grad_norm": 79.09918975830078,
6743
+ "learning_rate": 9.983496164245065e-06,
6744
+ "loss": 11.5077,
6745
+ "step": 9520
6746
+ },
6747
+ {
6748
+ "epoch": 0.4229422892361631,
6749
+ "grad_norm": 106.33499908447266,
6750
+ "learning_rate": 9.983478828283138e-06,
6751
+ "loss": 11.7945,
6752
+ "step": 9530
6753
+ },
6754
+ {
6755
+ "epoch": 0.4233860901692546,
6756
+ "grad_norm": 89.4134292602539,
6757
+ "learning_rate": 9.983461492321211e-06,
6758
+ "loss": 11.2233,
6759
+ "step": 9540
6760
+ },
6761
+ {
6762
+ "epoch": 0.42382989110234603,
6763
+ "grad_norm": 85.91355895996094,
6764
+ "learning_rate": 9.983444156359282e-06,
6765
+ "loss": 11.2504,
6766
+ "step": 9550
6767
+ },
6768
+ {
6769
+ "epoch": 0.4242736920354375,
6770
+ "grad_norm": 74.06096649169922,
6771
+ "learning_rate": 9.983426820397355e-06,
6772
+ "loss": 11.6198,
6773
+ "step": 9560
6774
+ },
6775
+ {
6776
+ "epoch": 0.42471749296852895,
6777
+ "grad_norm": 81.07852172851562,
6778
+ "learning_rate": 9.983409484435428e-06,
6779
+ "loss": 11.7593,
6780
+ "step": 9570
6781
+ },
6782
+ {
6783
+ "epoch": 0.42516129390162044,
6784
+ "grad_norm": 87.0907211303711,
6785
+ "learning_rate": 9.9833921484735e-06,
6786
+ "loss": 11.0808,
6787
+ "step": 9580
6788
+ },
6789
+ {
6790
+ "epoch": 0.4256050948347119,
6791
+ "grad_norm": 95.18062591552734,
6792
+ "learning_rate": 9.983374812511573e-06,
6793
+ "loss": 11.0543,
6794
+ "step": 9590
6795
+ },
6796
+ {
6797
+ "epoch": 0.42604889576780336,
6798
+ "grad_norm": 85.12203979492188,
6799
+ "learning_rate": 9.983357476549646e-06,
6800
+ "loss": 11.0986,
6801
+ "step": 9600
6802
+ },
6803
+ {
6804
+ "epoch": 0.4264926967008948,
6805
+ "grad_norm": 80.9763412475586,
6806
+ "learning_rate": 9.983340140587717e-06,
6807
+ "loss": 11.1147,
6808
+ "step": 9610
6809
+ },
6810
+ {
6811
+ "epoch": 0.4269364976339863,
6812
+ "grad_norm": 80.79862213134766,
6813
+ "learning_rate": 9.98332280462579e-06,
6814
+ "loss": 10.9127,
6815
+ "step": 9620
6816
+ },
6817
+ {
6818
+ "epoch": 0.4273802985670777,
6819
+ "grad_norm": 93.28567504882812,
6820
+ "learning_rate": 9.983305468663863e-06,
6821
+ "loss": 11.6276,
6822
+ "step": 9630
6823
+ },
6824
+ {
6825
+ "epoch": 0.4278240995001692,
6826
+ "grad_norm": 92.6642074584961,
6827
+ "learning_rate": 9.983288132701935e-06,
6828
+ "loss": 11.6817,
6829
+ "step": 9640
6830
+ },
6831
+ {
6832
+ "epoch": 0.4282679004332607,
6833
+ "grad_norm": 84.80957794189453,
6834
+ "learning_rate": 9.983270796740008e-06,
6835
+ "loss": 11.0266,
6836
+ "step": 9650
6837
+ },
6838
+ {
6839
+ "epoch": 0.4287117013663521,
6840
+ "grad_norm": 80.59945678710938,
6841
+ "learning_rate": 9.983253460778081e-06,
6842
+ "loss": 11.211,
6843
+ "step": 9660
6844
+ },
6845
+ {
6846
+ "epoch": 0.4291555022994436,
6847
+ "grad_norm": 83.12669372558594,
6848
+ "learning_rate": 9.983236124816152e-06,
6849
+ "loss": 11.2012,
6850
+ "step": 9670
6851
+ },
6852
+ {
6853
+ "epoch": 0.42959930323253503,
6854
+ "grad_norm": 92.85382080078125,
6855
+ "learning_rate": 9.983218788854225e-06,
6856
+ "loss": 10.8469,
6857
+ "step": 9680
6858
+ },
6859
+ {
6860
+ "epoch": 0.4300431041656265,
6861
+ "grad_norm": 89.2972640991211,
6862
+ "learning_rate": 9.983201452892299e-06,
6863
+ "loss": 11.5653,
6864
+ "step": 9690
6865
+ },
6866
+ {
6867
+ "epoch": 0.43048690509871795,
6868
+ "grad_norm": 82.58189392089844,
6869
+ "learning_rate": 9.98318411693037e-06,
6870
+ "loss": 11.5281,
6871
+ "step": 9700
6872
+ },
6873
+ {
6874
+ "epoch": 0.43093070603180944,
6875
+ "grad_norm": 75.15813446044922,
6876
+ "learning_rate": 9.983166780968443e-06,
6877
+ "loss": 11.4775,
6878
+ "step": 9710
6879
+ },
6880
+ {
6881
+ "epoch": 0.4313745069649009,
6882
+ "grad_norm": 87.26850128173828,
6883
+ "learning_rate": 9.983149445006516e-06,
6884
+ "loss": 11.4795,
6885
+ "step": 9720
6886
+ },
6887
+ {
6888
+ "epoch": 0.43181830789799236,
6889
+ "grad_norm": 80.3275375366211,
6890
+ "learning_rate": 9.98313210904459e-06,
6891
+ "loss": 10.6572,
6892
+ "step": 9730
6893
+ },
6894
+ {
6895
+ "epoch": 0.4322621088310838,
6896
+ "grad_norm": 77.77581024169922,
6897
+ "learning_rate": 9.98311477308266e-06,
6898
+ "loss": 11.1713,
6899
+ "step": 9740
6900
+ },
6901
+ {
6902
+ "epoch": 0.4327059097641753,
6903
+ "grad_norm": 86.08430480957031,
6904
+ "learning_rate": 9.983097437120734e-06,
6905
+ "loss": 11.2765,
6906
+ "step": 9750
6907
+ },
6908
+ {
6909
+ "epoch": 0.43314971069726677,
6910
+ "grad_norm": 80.2632827758789,
6911
+ "learning_rate": 9.983080101158807e-06,
6912
+ "loss": 11.1026,
6913
+ "step": 9760
6914
+ },
6915
+ {
6916
+ "epoch": 0.4335935116303582,
6917
+ "grad_norm": 90.77334594726562,
6918
+ "learning_rate": 9.983062765196878e-06,
6919
+ "loss": 11.2743,
6920
+ "step": 9770
6921
+ },
6922
+ {
6923
+ "epoch": 0.4340373125634497,
6924
+ "grad_norm": 85.51402282714844,
6925
+ "learning_rate": 9.983045429234951e-06,
6926
+ "loss": 11.1924,
6927
+ "step": 9780
6928
+ },
6929
+ {
6930
+ "epoch": 0.4344811134965411,
6931
+ "grad_norm": 87.34100341796875,
6932
+ "learning_rate": 9.983028093273024e-06,
6933
+ "loss": 11.5827,
6934
+ "step": 9790
6935
+ },
6936
+ {
6937
+ "epoch": 0.4349249144296326,
6938
+ "grad_norm": 104.64212036132812,
6939
+ "learning_rate": 9.983010757311096e-06,
6940
+ "loss": 11.2375,
6941
+ "step": 9800
6942
+ },
6943
+ {
6944
+ "epoch": 0.43536871536272403,
6945
+ "grad_norm": 70.56439208984375,
6946
+ "learning_rate": 9.982993421349169e-06,
6947
+ "loss": 11.2364,
6948
+ "step": 9810
6949
+ },
6950
+ {
6951
+ "epoch": 0.4358125162958155,
6952
+ "grad_norm": 83.87458038330078,
6953
+ "learning_rate": 9.982976085387242e-06,
6954
+ "loss": 11.6634,
6955
+ "step": 9820
6956
+ },
6957
+ {
6958
+ "epoch": 0.43625631722890695,
6959
+ "grad_norm": 85.76553344726562,
6960
+ "learning_rate": 9.982958749425313e-06,
6961
+ "loss": 11.5362,
6962
+ "step": 9830
6963
+ },
6964
+ {
6965
+ "epoch": 0.43670011816199844,
6966
+ "grad_norm": 89.0379867553711,
6967
+ "learning_rate": 9.982941413463386e-06,
6968
+ "loss": 11.1996,
6969
+ "step": 9840
6970
+ },
6971
+ {
6972
+ "epoch": 0.4371439190950899,
6973
+ "grad_norm": 103.95638275146484,
6974
+ "learning_rate": 9.98292407750146e-06,
6975
+ "loss": 10.9446,
6976
+ "step": 9850
6977
+ },
6978
+ {
6979
+ "epoch": 0.43758772002818136,
6980
+ "grad_norm": 82.41026306152344,
6981
+ "learning_rate": 9.98290674153953e-06,
6982
+ "loss": 11.219,
6983
+ "step": 9860
6984
+ },
6985
+ {
6986
+ "epoch": 0.43803152096127285,
6987
+ "grad_norm": 86.83589172363281,
6988
+ "learning_rate": 9.982889405577604e-06,
6989
+ "loss": 11.787,
6990
+ "step": 9870
6991
+ },
6992
+ {
6993
+ "epoch": 0.4384753218943643,
6994
+ "grad_norm": 80.76217651367188,
6995
+ "learning_rate": 9.982872069615677e-06,
6996
+ "loss": 10.6874,
6997
+ "step": 9880
6998
+ },
6999
+ {
7000
+ "epoch": 0.43891912282745577,
7001
+ "grad_norm": 79.48180389404297,
7002
+ "learning_rate": 9.982854733653748e-06,
7003
+ "loss": 11.0988,
7004
+ "step": 9890
7005
+ },
7006
+ {
7007
+ "epoch": 0.4393629237605472,
7008
+ "grad_norm": 96.88164520263672,
7009
+ "learning_rate": 9.982837397691821e-06,
7010
+ "loss": 11.2804,
7011
+ "step": 9900
7012
+ },
7013
+ {
7014
+ "epoch": 0.4398067246936387,
7015
+ "grad_norm": 78.75862121582031,
7016
+ "learning_rate": 9.982820061729894e-06,
7017
+ "loss": 10.8656,
7018
+ "step": 9910
7019
+ },
7020
+ {
7021
+ "epoch": 0.4402505256267301,
7022
+ "grad_norm": 74.44935607910156,
7023
+ "learning_rate": 9.982802725767966e-06,
7024
+ "loss": 11.1552,
7025
+ "step": 9920
7026
+ },
7027
+ {
7028
+ "epoch": 0.4406943265598216,
7029
+ "grad_norm": 90.43840789794922,
7030
+ "learning_rate": 9.982785389806039e-06,
7031
+ "loss": 10.7856,
7032
+ "step": 9930
7033
+ },
7034
+ {
7035
+ "epoch": 0.44113812749291303,
7036
+ "grad_norm": 100.49180603027344,
7037
+ "learning_rate": 9.982768053844112e-06,
7038
+ "loss": 11.1607,
7039
+ "step": 9940
7040
+ },
7041
+ {
7042
+ "epoch": 0.4415819284260045,
7043
+ "grad_norm": 78.78294372558594,
7044
+ "learning_rate": 9.982750717882185e-06,
7045
+ "loss": 11.0682,
7046
+ "step": 9950
7047
+ },
7048
+ {
7049
+ "epoch": 0.44202572935909595,
7050
+ "grad_norm": 98.88738250732422,
7051
+ "learning_rate": 9.982733381920256e-06,
7052
+ "loss": 11.5788,
7053
+ "step": 9960
7054
+ },
7055
+ {
7056
+ "epoch": 0.44246953029218744,
7057
+ "grad_norm": 83.48590087890625,
7058
+ "learning_rate": 9.98271604595833e-06,
7059
+ "loss": 11.1786,
7060
+ "step": 9970
7061
+ },
7062
+ {
7063
+ "epoch": 0.44291333122527893,
7064
+ "grad_norm": 71.23033142089844,
7065
+ "learning_rate": 9.982698709996403e-06,
7066
+ "loss": 10.6031,
7067
+ "step": 9980
7068
+ },
7069
+ {
7070
+ "epoch": 0.44335713215837036,
7071
+ "grad_norm": 92.292236328125,
7072
+ "learning_rate": 9.982681374034474e-06,
7073
+ "loss": 11.3427,
7074
+ "step": 9990
7075
+ },
7076
+ {
7077
+ "epoch": 0.44380093309146185,
7078
+ "grad_norm": 85.08587646484375,
7079
+ "learning_rate": 9.982664038072547e-06,
7080
+ "loss": 11.022,
7081
+ "step": 10000
7082
+ },
7083
+ {
7084
+ "epoch": 0.44380093309146185,
7085
+ "eval_loss": 0.34827741980552673,
7086
+ "eval_runtime": 673.7997,
7087
+ "eval_samples_per_second": 1802.303,
7088
+ "eval_steps_per_second": 56.322,
7089
+ "step": 10000
7090
  }
7091
  ],
7092
  "logging_steps": 10,
 
7106
  "attributes": {}
7107
  }
7108
  },
7109
+ "total_flos": 3.489723205025792e+18,
7110
  "train_batch_size": 4,
7111
  "trial_name": null,
7112
  "trial_params": null