minpeter commited on
Commit
6a8b246
·
verified ·
1 Parent(s): 5444174

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:134e1237af4835ec7c09b2ddfb5c01c7318849193ac07665aeebcb20602a7c35
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa231a4fb18485169d08c9d1e7878f2c6c2747cf33272ebb7b91a615a73da69f
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68a372c381851fbd34898ce48030e1bd1636e6f679c35483af912455776bdccf
3
  size 422377675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff95fd30c41364a06356f6550493cfc79f8b5f14e8279f05b156b0d50603cfb7
3
  size 422377675
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:229d1a5c5efb9c108732a998af8aefc8b44d8ea5fb5a5844f3f1fd3716527d07
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36f1c8cafda7ec05bcf717e4cbc9d475e180378b36391598d72523001d0947ee
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3d5de6a60842cf6ceecc89c24a89facabb892cb2713ea74e41bdeaeb1177e51
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b636e22decc0690abb4217d3b016f329ae73b4d12bae4602c74bba0c4d4ffdc1
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6230529595015576,
6
  "eval_steps": 100,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4256,6 +4256,714 @@
4256
  "eval_samples_per_second": 9.525,
4257
  "eval_steps_per_second": 1.191,
4258
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4259
  }
4260
  ],
4261
  "logging_steps": 1,
@@ -4275,7 +4983,7 @@
4275
  "attributes": {}
4276
  }
4277
  },
4278
- "total_flos": 7.64170916069376e+16,
4279
  "train_batch_size": 16,
4280
  "trial_name": null,
4281
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.726895119418484,
6
  "eval_steps": 100,
7
+ "global_step": 700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4256
  "eval_samples_per_second": 9.525,
4257
  "eval_steps_per_second": 1.191,
4258
  "step": 600
4259
+ },
4260
+ {
4261
+ "epoch": 0.6240913811007269,
4262
+ "grad_norm": 1.609375,
4263
+ "learning_rate": 0.0003412480773376864,
4264
+ "loss": 6.2849,
4265
+ "step": 601
4266
+ },
4267
+ {
4268
+ "epoch": 0.6251298026998962,
4269
+ "grad_norm": 1.5234375,
4270
+ "learning_rate": 0.00033961934808841024,
4271
+ "loss": 6.1491,
4272
+ "step": 602
4273
+ },
4274
+ {
4275
+ "epoch": 0.6261682242990654,
4276
+ "grad_norm": 1.3203125,
4277
+ "learning_rate": 0.0003379925136196088,
4278
+ "loss": 6.281,
4279
+ "step": 603
4280
+ },
4281
+ {
4282
+ "epoch": 0.6272066458982347,
4283
+ "grad_norm": 1.5625,
4284
+ "learning_rate": 0.0003363675931511455,
4285
+ "loss": 6.2496,
4286
+ "step": 604
4287
+ },
4288
+ {
4289
+ "epoch": 0.6282450674974039,
4290
+ "grad_norm": 1.3046875,
4291
+ "learning_rate": 0.0003347446058802708,
4292
+ "loss": 6.2776,
4293
+ "step": 605
4294
+ },
4295
+ {
4296
+ "epoch": 0.6292834890965732,
4297
+ "grad_norm": 1.46875,
4298
+ "learning_rate": 0.00033312357098139617,
4299
+ "loss": 6.3928,
4300
+ "step": 606
4301
+ },
4302
+ {
4303
+ "epoch": 0.6303219106957425,
4304
+ "grad_norm": 1.34375,
4305
+ "learning_rate": 0.0003315045076058671,
4306
+ "loss": 6.2803,
4307
+ "step": 607
4308
+ },
4309
+ {
4310
+ "epoch": 0.6313603322949117,
4311
+ "grad_norm": 1.4296875,
4312
+ "learning_rate": 0.00032988743488173697,
4313
+ "loss": 6.2263,
4314
+ "step": 608
4315
+ },
4316
+ {
4317
+ "epoch": 0.632398753894081,
4318
+ "grad_norm": 1.46875,
4319
+ "learning_rate": 0.000328272371913541,
4320
+ "loss": 6.1383,
4321
+ "step": 609
4322
+ },
4323
+ {
4324
+ "epoch": 0.6334371754932503,
4325
+ "grad_norm": 1.265625,
4326
+ "learning_rate": 0.0003266593377820708,
4327
+ "loss": 6.2603,
4328
+ "step": 610
4329
+ },
4330
+ {
4331
+ "epoch": 0.6344755970924195,
4332
+ "grad_norm": 2.25,
4333
+ "learning_rate": 0.0003250483515441485,
4334
+ "loss": 6.3359,
4335
+ "step": 611
4336
+ },
4337
+ {
4338
+ "epoch": 0.6355140186915887,
4339
+ "grad_norm": 1.6015625,
4340
+ "learning_rate": 0.0003234394322324019,
4341
+ "loss": 6.1653,
4342
+ "step": 612
4343
+ },
4344
+ {
4345
+ "epoch": 0.6365524402907581,
4346
+ "grad_norm": 2.125,
4347
+ "learning_rate": 0.00032183259885504,
4348
+ "loss": 6.2869,
4349
+ "step": 613
4350
+ },
4351
+ {
4352
+ "epoch": 0.6375908618899273,
4353
+ "grad_norm": 1.3515625,
4354
+ "learning_rate": 0.00032022787039562745,
4355
+ "loss": 6.3017,
4356
+ "step": 614
4357
+ },
4358
+ {
4359
+ "epoch": 0.6386292834890965,
4360
+ "grad_norm": 4.3125,
4361
+ "learning_rate": 0.0003186252658128611,
4362
+ "loss": 6.1045,
4363
+ "step": 615
4364
+ },
4365
+ {
4366
+ "epoch": 0.6396677050882659,
4367
+ "grad_norm": 1.6171875,
4368
+ "learning_rate": 0.00031702480404034565,
4369
+ "loss": 6.3121,
4370
+ "step": 616
4371
+ },
4372
+ {
4373
+ "epoch": 0.6407061266874351,
4374
+ "grad_norm": 1.8515625,
4375
+ "learning_rate": 0.00031542650398637016,
4376
+ "loss": 6.1043,
4377
+ "step": 617
4378
+ },
4379
+ {
4380
+ "epoch": 0.6417445482866043,
4381
+ "grad_norm": 1.4375,
4382
+ "learning_rate": 0.0003138303845336844,
4383
+ "loss": 6.2402,
4384
+ "step": 618
4385
+ },
4386
+ {
4387
+ "epoch": 0.6427829698857737,
4388
+ "grad_norm": 1.46875,
4389
+ "learning_rate": 0.0003122364645392762,
4390
+ "loss": 6.2972,
4391
+ "step": 619
4392
+ },
4393
+ {
4394
+ "epoch": 0.6438213914849429,
4395
+ "grad_norm": 2.234375,
4396
+ "learning_rate": 0.0003106447628341482,
4397
+ "loss": 6.2454,
4398
+ "step": 620
4399
+ },
4400
+ {
4401
+ "epoch": 0.6448598130841121,
4402
+ "grad_norm": 1.4921875,
4403
+ "learning_rate": 0.0003090552982230954,
4404
+ "loss": 6.1745,
4405
+ "step": 621
4406
+ },
4407
+ {
4408
+ "epoch": 0.6458982346832814,
4409
+ "grad_norm": 1.3984375,
4410
+ "learning_rate": 0.00030746808948448366,
4411
+ "loss": 6.224,
4412
+ "step": 622
4413
+ },
4414
+ {
4415
+ "epoch": 0.6469366562824507,
4416
+ "grad_norm": 1.4921875,
4417
+ "learning_rate": 0.0003058831553700268,
4418
+ "loss": 6.3142,
4419
+ "step": 623
4420
+ },
4421
+ {
4422
+ "epoch": 0.6479750778816199,
4423
+ "grad_norm": 1.9140625,
4424
+ "learning_rate": 0.00030430051460456596,
4425
+ "loss": 6.2258,
4426
+ "step": 624
4427
+ },
4428
+ {
4429
+ "epoch": 0.6490134994807892,
4430
+ "grad_norm": 1.6171875,
4431
+ "learning_rate": 0.0003027201858858479,
4432
+ "loss": 6.1406,
4433
+ "step": 625
4434
+ },
4435
+ {
4436
+ "epoch": 0.6500519210799585,
4437
+ "grad_norm": 1.59375,
4438
+ "learning_rate": 0.00030114218788430437,
4439
+ "loss": 5.7233,
4440
+ "step": 626
4441
+ },
4442
+ {
4443
+ "epoch": 0.6510903426791277,
4444
+ "grad_norm": 1.5234375,
4445
+ "learning_rate": 0.0002995665392428313,
4446
+ "loss": 6.0472,
4447
+ "step": 627
4448
+ },
4449
+ {
4450
+ "epoch": 0.652128764278297,
4451
+ "grad_norm": 2.265625,
4452
+ "learning_rate": 0.00029799325857656855,
4453
+ "loss": 6.161,
4454
+ "step": 628
4455
+ },
4456
+ {
4457
+ "epoch": 0.6531671858774662,
4458
+ "grad_norm": 1.53125,
4459
+ "learning_rate": 0.00029642236447268024,
4460
+ "loss": 6.112,
4461
+ "step": 629
4462
+ },
4463
+ {
4464
+ "epoch": 0.6542056074766355,
4465
+ "grad_norm": 1.4140625,
4466
+ "learning_rate": 0.00029485387549013485,
4467
+ "loss": 6.2132,
4468
+ "step": 630
4469
+ },
4470
+ {
4471
+ "epoch": 0.6552440290758048,
4472
+ "grad_norm": 1.1875,
4473
+ "learning_rate": 0.00029328781015948625,
4474
+ "loss": 6.2657,
4475
+ "step": 631
4476
+ },
4477
+ {
4478
+ "epoch": 0.656282450674974,
4479
+ "grad_norm": 1.5078125,
4480
+ "learning_rate": 0.00029172418698265444,
4481
+ "loss": 6.1711,
4482
+ "step": 632
4483
+ },
4484
+ {
4485
+ "epoch": 0.6573208722741433,
4486
+ "grad_norm": 1.5859375,
4487
+ "learning_rate": 0.0002901630244327075,
4488
+ "loss": 6.1935,
4489
+ "step": 633
4490
+ },
4491
+ {
4492
+ "epoch": 0.6583592938733126,
4493
+ "grad_norm": 1.6484375,
4494
+ "learning_rate": 0.00028860434095364263,
4495
+ "loss": 6.4055,
4496
+ "step": 634
4497
+ },
4498
+ {
4499
+ "epoch": 0.6593977154724818,
4500
+ "grad_norm": 1.96875,
4501
+ "learning_rate": 0.00028704815496016875,
4502
+ "loss": 5.9916,
4503
+ "step": 635
4504
+ },
4505
+ {
4506
+ "epoch": 0.660436137071651,
4507
+ "grad_norm": 1.3359375,
4508
+ "learning_rate": 0.00028549448483748886,
4509
+ "loss": 6.1303,
4510
+ "step": 636
4511
+ },
4512
+ {
4513
+ "epoch": 0.6614745586708204,
4514
+ "grad_norm": 1.65625,
4515
+ "learning_rate": 0.0002839433489410828,
4516
+ "loss": 6.1363,
4517
+ "step": 637
4518
+ },
4519
+ {
4520
+ "epoch": 0.6625129802699896,
4521
+ "grad_norm": 1.734375,
4522
+ "learning_rate": 0.0002823947655964901,
4523
+ "loss": 6.1624,
4524
+ "step": 638
4525
+ },
4526
+ {
4527
+ "epoch": 0.6635514018691588,
4528
+ "grad_norm": 1.453125,
4529
+ "learning_rate": 0.000280848753099094,
4530
+ "loss": 5.9139,
4531
+ "step": 639
4532
+ },
4533
+ {
4534
+ "epoch": 0.6645898234683282,
4535
+ "grad_norm": 2.625,
4536
+ "learning_rate": 0.0002793053297139054,
4537
+ "loss": 6.3661,
4538
+ "step": 640
4539
+ },
4540
+ {
4541
+ "epoch": 0.6656282450674974,
4542
+ "grad_norm": 1.7578125,
4543
+ "learning_rate": 0.0002777645136753459,
4544
+ "loss": 6.3797,
4545
+ "step": 641
4546
+ },
4547
+ {
4548
+ "epoch": 0.6666666666666666,
4549
+ "grad_norm": 1.8671875,
4550
+ "learning_rate": 0.0002762263231870339,
4551
+ "loss": 6.0671,
4552
+ "step": 642
4553
+ },
4554
+ {
4555
+ "epoch": 0.667705088265836,
4556
+ "grad_norm": 1.6328125,
4557
+ "learning_rate": 0.00027469077642156844,
4558
+ "loss": 6.1843,
4559
+ "step": 643
4560
+ },
4561
+ {
4562
+ "epoch": 0.6687435098650052,
4563
+ "grad_norm": 1.625,
4564
+ "learning_rate": 0.000273157891520315,
4565
+ "loss": 6.4473,
4566
+ "step": 644
4567
+ },
4568
+ {
4569
+ "epoch": 0.6697819314641744,
4570
+ "grad_norm": 1.6953125,
4571
+ "learning_rate": 0.00027162768659319114,
4572
+ "loss": 6.3164,
4573
+ "step": 645
4574
+ },
4575
+ {
4576
+ "epoch": 0.6708203530633438,
4577
+ "grad_norm": 1.6328125,
4578
+ "learning_rate": 0.00027010017971845264,
4579
+ "loss": 6.4004,
4580
+ "step": 646
4581
+ },
4582
+ {
4583
+ "epoch": 0.671858774662513,
4584
+ "grad_norm": 1.6171875,
4585
+ "learning_rate": 0.00026857538894247947,
4586
+ "loss": 6.242,
4587
+ "step": 647
4588
+ },
4589
+ {
4590
+ "epoch": 0.6728971962616822,
4591
+ "grad_norm": 1.28125,
4592
+ "learning_rate": 0.00026705333227956303,
4593
+ "loss": 6.2597,
4594
+ "step": 648
4595
+ },
4596
+ {
4597
+ "epoch": 0.6739356178608515,
4598
+ "grad_norm": 1.2734375,
4599
+ "learning_rate": 0.000265534027711693,
4600
+ "loss": 6.3972,
4601
+ "step": 649
4602
+ },
4603
+ {
4604
+ "epoch": 0.6749740394600208,
4605
+ "grad_norm": 1.90625,
4606
+ "learning_rate": 0.00026401749318834527,
4607
+ "loss": 6.4521,
4608
+ "step": 650
4609
+ },
4610
+ {
4611
+ "epoch": 0.67601246105919,
4612
+ "grad_norm": 1.40625,
4613
+ "learning_rate": 0.0002625037466262696,
4614
+ "loss": 5.9594,
4615
+ "step": 651
4616
+ },
4617
+ {
4618
+ "epoch": 0.6770508826583593,
4619
+ "grad_norm": 1.6015625,
4620
+ "learning_rate": 0.0002609928059092779,
4621
+ "loss": 6.2181,
4622
+ "step": 652
4623
+ },
4624
+ {
4625
+ "epoch": 0.6780893042575286,
4626
+ "grad_norm": 1.3828125,
4627
+ "learning_rate": 0.00025948468888803324,
4628
+ "loss": 6.2781,
4629
+ "step": 653
4630
+ },
4631
+ {
4632
+ "epoch": 0.6791277258566978,
4633
+ "grad_norm": 1.65625,
4634
+ "learning_rate": 0.00025797941337983875,
4635
+ "loss": 6.1757,
4636
+ "step": 654
4637
+ },
4638
+ {
4639
+ "epoch": 0.6801661474558671,
4640
+ "grad_norm": 1.4609375,
4641
+ "learning_rate": 0.0002564769971684271,
4642
+ "loss": 6.266,
4643
+ "step": 655
4644
+ },
4645
+ {
4646
+ "epoch": 0.6812045690550363,
4647
+ "grad_norm": 1.390625,
4648
+ "learning_rate": 0.00025497745800375036,
4649
+ "loss": 6.3151,
4650
+ "step": 656
4651
+ },
4652
+ {
4653
+ "epoch": 0.6822429906542056,
4654
+ "grad_norm": 1.3671875,
4655
+ "learning_rate": 0.0002534808136017707,
4656
+ "loss": 6.2002,
4657
+ "step": 657
4658
+ },
4659
+ {
4660
+ "epoch": 0.6832814122533749,
4661
+ "grad_norm": 1.3671875,
4662
+ "learning_rate": 0.00025198708164425045,
4663
+ "loss": 6.1243,
4664
+ "step": 658
4665
+ },
4666
+ {
4667
+ "epoch": 0.6843198338525441,
4668
+ "grad_norm": 1.3671875,
4669
+ "learning_rate": 0.0002504962797785435,
4670
+ "loss": 6.2666,
4671
+ "step": 659
4672
+ },
4673
+ {
4674
+ "epoch": 0.6853582554517134,
4675
+ "grad_norm": 4.4375,
4676
+ "learning_rate": 0.00024900842561738736,
4677
+ "loss": 5.9076,
4678
+ "step": 660
4679
+ },
4680
+ {
4681
+ "epoch": 0.6863966770508827,
4682
+ "grad_norm": 1.484375,
4683
+ "learning_rate": 0.00024752353673869405,
4684
+ "loss": 6.1968,
4685
+ "step": 661
4686
+ },
4687
+ {
4688
+ "epoch": 0.6874350986500519,
4689
+ "grad_norm": 1.203125,
4690
+ "learning_rate": 0.00024604163068534315,
4691
+ "loss": 6.2919,
4692
+ "step": 662
4693
+ },
4694
+ {
4695
+ "epoch": 0.6884735202492211,
4696
+ "grad_norm": 1.46875,
4697
+ "learning_rate": 0.00024456272496497415,
4698
+ "loss": 6.1599,
4699
+ "step": 663
4700
+ },
4701
+ {
4702
+ "epoch": 0.6895119418483905,
4703
+ "grad_norm": 1.8984375,
4704
+ "learning_rate": 0.00024308683704978002,
4705
+ "loss": 6.2052,
4706
+ "step": 664
4707
+ },
4708
+ {
4709
+ "epoch": 0.6905503634475597,
4710
+ "grad_norm": 1.375,
4711
+ "learning_rate": 0.00024161398437630045,
4712
+ "loss": 6.3025,
4713
+ "step": 665
4714
+ },
4715
+ {
4716
+ "epoch": 0.6915887850467289,
4717
+ "grad_norm": 1.296875,
4718
+ "learning_rate": 0.0002401441843452159,
4719
+ "loss": 6.1938,
4720
+ "step": 666
4721
+ },
4722
+ {
4723
+ "epoch": 0.6926272066458983,
4724
+ "grad_norm": 1.46875,
4725
+ "learning_rate": 0.0002386774543211423,
4726
+ "loss": 6.3049,
4727
+ "step": 667
4728
+ },
4729
+ {
4730
+ "epoch": 0.6936656282450675,
4731
+ "grad_norm": 1.8515625,
4732
+ "learning_rate": 0.0002372138116324254,
4733
+ "loss": 6.3624,
4734
+ "step": 668
4735
+ },
4736
+ {
4737
+ "epoch": 0.6947040498442367,
4738
+ "grad_norm": 1.84375,
4739
+ "learning_rate": 0.00023575327357093658,
4740
+ "loss": 6.3294,
4741
+ "step": 669
4742
+ },
4743
+ {
4744
+ "epoch": 0.6957424714434061,
4745
+ "grad_norm": 1.296875,
4746
+ "learning_rate": 0.0002342958573918682,
4747
+ "loss": 6.097,
4748
+ "step": 670
4749
+ },
4750
+ {
4751
+ "epoch": 0.6967808930425753,
4752
+ "grad_norm": 1.8203125,
4753
+ "learning_rate": 0.0002328415803135298,
4754
+ "loss": 6.1121,
4755
+ "step": 671
4756
+ },
4757
+ {
4758
+ "epoch": 0.6978193146417445,
4759
+ "grad_norm": 1.5859375,
4760
+ "learning_rate": 0.0002313904595171447,
4761
+ "loss": 6.1288,
4762
+ "step": 672
4763
+ },
4764
+ {
4765
+ "epoch": 0.6988577362409139,
4766
+ "grad_norm": 1.5,
4767
+ "learning_rate": 0.0002299425121466475,
4768
+ "loss": 6.0291,
4769
+ "step": 673
4770
+ },
4771
+ {
4772
+ "epoch": 0.6998961578400831,
4773
+ "grad_norm": 1.5,
4774
+ "learning_rate": 0.00022849775530848056,
4775
+ "loss": 6.3553,
4776
+ "step": 674
4777
+ },
4778
+ {
4779
+ "epoch": 0.7009345794392523,
4780
+ "grad_norm": 1.4765625,
4781
+ "learning_rate": 0.00022705620607139254,
4782
+ "loss": 6.2154,
4783
+ "step": 675
4784
+ },
4785
+ {
4786
+ "epoch": 0.7019730010384216,
4787
+ "grad_norm": 1.3984375,
4788
+ "learning_rate": 0.00022561788146623679,
4789
+ "loss": 6.1831,
4790
+ "step": 676
4791
+ },
4792
+ {
4793
+ "epoch": 0.7030114226375909,
4794
+ "grad_norm": 1.34375,
4795
+ "learning_rate": 0.0002241827984857698,
4796
+ "loss": 6.1788,
4797
+ "step": 677
4798
+ },
4799
+ {
4800
+ "epoch": 0.7040498442367601,
4801
+ "grad_norm": 1.6875,
4802
+ "learning_rate": 0.00022275097408445076,
4803
+ "loss": 6.3325,
4804
+ "step": 678
4805
+ },
4806
+ {
4807
+ "epoch": 0.7050882658359294,
4808
+ "grad_norm": 1.3203125,
4809
+ "learning_rate": 0.00022132242517824115,
4810
+ "loss": 6.2826,
4811
+ "step": 679
4812
+ },
4813
+ {
4814
+ "epoch": 0.7061266874350987,
4815
+ "grad_norm": 1.3359375,
4816
+ "learning_rate": 0.0002198971686444047,
4817
+ "loss": 6.1395,
4818
+ "step": 680
4819
+ },
4820
+ {
4821
+ "epoch": 0.7071651090342679,
4822
+ "grad_norm": 1.4296875,
4823
+ "learning_rate": 0.00021847522132130827,
4824
+ "loss": 6.3967,
4825
+ "step": 681
4826
+ },
4827
+ {
4828
+ "epoch": 0.7082035306334372,
4829
+ "grad_norm": 2.234375,
4830
+ "learning_rate": 0.00021705660000822285,
4831
+ "loss": 6.4163,
4832
+ "step": 682
4833
+ },
4834
+ {
4835
+ "epoch": 0.7092419522326064,
4836
+ "grad_norm": 1.5546875,
4837
+ "learning_rate": 0.00021564132146512495,
4838
+ "loss": 6.1454,
4839
+ "step": 683
4840
+ },
4841
+ {
4842
+ "epoch": 0.7102803738317757,
4843
+ "grad_norm": 1.9296875,
4844
+ "learning_rate": 0.00021422940241249872,
4845
+ "loss": 6.2889,
4846
+ "step": 684
4847
+ },
4848
+ {
4849
+ "epoch": 0.711318795430945,
4850
+ "grad_norm": 1.4765625,
4851
+ "learning_rate": 0.0002128208595311384,
4852
+ "loss": 6.227,
4853
+ "step": 685
4854
+ },
4855
+ {
4856
+ "epoch": 0.7123572170301142,
4857
+ "grad_norm": 1.5078125,
4858
+ "learning_rate": 0.00021141570946195105,
4859
+ "loss": 6.2655,
4860
+ "step": 686
4861
+ },
4862
+ {
4863
+ "epoch": 0.7133956386292835,
4864
+ "grad_norm": 1.4140625,
4865
+ "learning_rate": 0.00021001396880576063,
4866
+ "loss": 6.1813,
4867
+ "step": 687
4868
+ },
4869
+ {
4870
+ "epoch": 0.7144340602284528,
4871
+ "grad_norm": 1.1875,
4872
+ "learning_rate": 0.0002086156541231109,
4873
+ "loss": 6.3148,
4874
+ "step": 688
4875
+ },
4876
+ {
4877
+ "epoch": 0.715472481827622,
4878
+ "grad_norm": 1.6015625,
4879
+ "learning_rate": 0.00020722078193407035,
4880
+ "loss": 6.2127,
4881
+ "step": 689
4882
+ },
4883
+ {
4884
+ "epoch": 0.7165109034267912,
4885
+ "grad_norm": 1.53125,
4886
+ "learning_rate": 0.00020582936871803693,
4887
+ "loss": 6.2863,
4888
+ "step": 690
4889
+ },
4890
+ {
4891
+ "epoch": 0.7175493250259606,
4892
+ "grad_norm": 1.203125,
4893
+ "learning_rate": 0.0002044414309135434,
4894
+ "loss": 6.236,
4895
+ "step": 691
4896
+ },
4897
+ {
4898
+ "epoch": 0.7185877466251298,
4899
+ "grad_norm": 1.5546875,
4900
+ "learning_rate": 0.00020305698491806295,
4901
+ "loss": 6.3918,
4902
+ "step": 692
4903
+ },
4904
+ {
4905
+ "epoch": 0.719626168224299,
4906
+ "grad_norm": 1.59375,
4907
+ "learning_rate": 0.0002016760470878158,
4908
+ "loss": 5.9227,
4909
+ "step": 693
4910
+ },
4911
+ {
4912
+ "epoch": 0.7206645898234684,
4913
+ "grad_norm": 1.4921875,
4914
+ "learning_rate": 0.0002002986337375755,
4915
+ "loss": 6.0943,
4916
+ "step": 694
4917
+ },
4918
+ {
4919
+ "epoch": 0.7217030114226376,
4920
+ "grad_norm": 1.4921875,
4921
+ "learning_rate": 0.00019892476114047664,
4922
+ "loss": 6.2113,
4923
+ "step": 695
4924
+ },
4925
+ {
4926
+ "epoch": 0.7227414330218068,
4927
+ "grad_norm": 1.4921875,
4928
+ "learning_rate": 0.00019755444552782225,
4929
+ "loss": 6.3502,
4930
+ "step": 696
4931
+ },
4932
+ {
4933
+ "epoch": 0.7237798546209762,
4934
+ "grad_norm": 1.515625,
4935
+ "learning_rate": 0.00019618770308889227,
4936
+ "loss": 5.9658,
4937
+ "step": 697
4938
+ },
4939
+ {
4940
+ "epoch": 0.7248182762201454,
4941
+ "grad_norm": 1.5,
4942
+ "learning_rate": 0.00019482454997075228,
4943
+ "loss": 6.1518,
4944
+ "step": 698
4945
+ },
4946
+ {
4947
+ "epoch": 0.7258566978193146,
4948
+ "grad_norm": 1.4609375,
4949
+ "learning_rate": 0.00019346500227806218,
4950
+ "loss": 6.1651,
4951
+ "step": 699
4952
+ },
4953
+ {
4954
+ "epoch": 0.726895119418484,
4955
+ "grad_norm": 1.359375,
4956
+ "learning_rate": 0.00019210907607288723,
4957
+ "loss": 6.2656,
4958
+ "step": 700
4959
+ },
4960
+ {
4961
+ "epoch": 0.726895119418484,
4962
+ "eval_loss": 6.260857105255127,
4963
+ "eval_runtime": 1.646,
4964
+ "eval_samples_per_second": 9.72,
4965
+ "eval_steps_per_second": 1.215,
4966
+ "step": 700
4967
  }
4968
  ],
4969
  "logging_steps": 1,
 
4983
  "attributes": {}
4984
  }
4985
  },
4986
+ "total_flos": 8.91532735414272e+16,
4987
  "train_batch_size": 16,
4988
  "trial_name": null,
4989
  "trial_params": null