CocoRoF commited on
Commit
83e69d3
·
verified ·
1 Parent(s): 45be345

Training in progress, step 34605, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:055db62e7afaf72547d8020ffa4c60d79b2df7d5d99747310e09d238a4ba1fa7
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9964a1a24656f4a90b748a0d19a5c15f9373211cfeeb8598c8bdac5b70bf7fe
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c57ff1606838ae93b0606705e53592c3c93bfa3a777074b3409ef82ed78e848
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:507be84a97a2ba7522e34ce2a578a239f9ad03fbfe2399c46a9b1d5e76cdcb68
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a2fbcd26bac3ea7dc02fc9ede5b8a1914ca51611473722a11a969e1f26ac0ee
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5f18f2e1ac96b2a7568111293c1bcd8d35880ccf3f2819600a270caeb99015d
3
+ size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66d97b511d2fdb8061e5bf72c139923941c148260fac1caedd654028da6986c1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcff98006be86afc3f75b37d6113fdf5b62db51c94b6f68b33f555f4ac346822
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3839473129eb8c438ab312370daa55eb10a0790f33d38fc5eaa24859b54b0d1f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f220fd74a6757e167d014f721e96b7e5710e8f5c97f48c9fe6d72e19ebbbd65c
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5088a0d34c7015afe60457fbb3f0a4740839369017a42ea4b3250322c2d63ceb
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21565575b5db0aa139865ffb0d9df6ceb55078dc7b218f601419cc3d7b873134
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9cac0eb25286b75549fa2030810940adf357064a83facaf5c58ebe37190b6ac
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:487a03a3b6c36091572b8fbb74add1eb3c753efe5ab0eee791c8d03f495e5c98
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a57d29811122d52bd53f81af680412b91dde1cd2a12fa885d8a54388be8e2d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f8c6e22cfd0b3668705becc42fb2c443ef5e4cfe38d4ba5e3dfdc565094143
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c90ab29b255eaf920ecc1cba0b586e426f8e2db67b44a65576693f84178a04f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:379eebc7ccebea3c24281c6604242d09589a64d4774ea37b6d5cf6e7bbece645
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4efbfa3cfb1bb8fb9c3380e65959a8b4eaf3bceb0507a26ffba1a3e4636ddb1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3668b553f323a1aa5806c5d8feff7c926f6116dc2b7f961e9746634c8e825c0
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4460050461ccd15ef821d88f33ca8aec62edc9562663da8bad202acbfef43bd7
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8199a654ab4fa9a6ef354ce8db82e82a6447b8c307bd6b0f26113c9bdf05dcf1
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8669079560477666,
5
  "eval_steps": 3000,
6
- "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4287,6 +4287,658 @@
4287
  "eval_samples_per_second": 2664.65,
4288
  "eval_steps_per_second": 41.636,
4289
  "step": 30000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4290
  }
4291
  ],
4292
  "logging_steps": 50,
@@ -4301,12 +4953,12 @@
4301
  "should_evaluate": false,
4302
  "should_log": false,
4303
  "should_save": true,
4304
- "should_training_stop": false
4305
  },
4306
  "attributes": {}
4307
  }
4308
  },
4309
- "total_flos": 5.238481846114714e+18,
4310
  "train_batch_size": 8,
4311
  "trial_name": null,
4312
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9999783273010988,
5
  "eval_steps": 3000,
6
+ "global_step": 34605,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4287
  "eval_samples_per_second": 2664.65,
4288
  "eval_steps_per_second": 41.636,
4289
  "step": 30000
4290
+ },
4291
+ {
4292
+ "epoch": 0.8683528026411795,
4293
+ "grad_norm": 21.8125,
4294
+ "learning_rate": 1.3568061550687207e-07,
4295
+ "loss": 15.327,
4296
+ "step": 30050
4297
+ },
4298
+ {
4299
+ "epoch": 0.8697976492345925,
4300
+ "grad_norm": 22.8125,
4301
+ "learning_rate": 1.3590637360255738e-07,
4302
+ "loss": 15.3229,
4303
+ "step": 30100
4304
+ },
4305
+ {
4306
+ "epoch": 0.8712424958280055,
4307
+ "grad_norm": 23.671875,
4308
+ "learning_rate": 1.361321316982427e-07,
4309
+ "loss": 15.3576,
4310
+ "step": 30150
4311
+ },
4312
+ {
4313
+ "epoch": 0.8726873424214184,
4314
+ "grad_norm": 22.609375,
4315
+ "learning_rate": 1.36357889793928e-07,
4316
+ "loss": 15.3539,
4317
+ "step": 30200
4318
+ },
4319
+ {
4320
+ "epoch": 0.8741321890148314,
4321
+ "grad_norm": 20.65625,
4322
+ "learning_rate": 1.365836478896133e-07,
4323
+ "loss": 15.2693,
4324
+ "step": 30250
4325
+ },
4326
+ {
4327
+ "epoch": 0.8755770356082443,
4328
+ "grad_norm": 22.5625,
4329
+ "learning_rate": 1.3680940598529864e-07,
4330
+ "loss": 15.4018,
4331
+ "step": 30300
4332
+ },
4333
+ {
4334
+ "epoch": 0.8770218822016572,
4335
+ "grad_norm": 19.875,
4336
+ "learning_rate": 1.3703516408098394e-07,
4337
+ "loss": 15.4242,
4338
+ "step": 30350
4339
+ },
4340
+ {
4341
+ "epoch": 0.8784667287950702,
4342
+ "grad_norm": 27.234375,
4343
+ "learning_rate": 1.3726092217666924e-07,
4344
+ "loss": 15.3294,
4345
+ "step": 30400
4346
+ },
4347
+ {
4348
+ "epoch": 0.8799115753884831,
4349
+ "grad_norm": 23.375,
4350
+ "learning_rate": 1.3748668027235455e-07,
4351
+ "loss": 15.3841,
4352
+ "step": 30450
4353
+ },
4354
+ {
4355
+ "epoch": 0.8813564219818961,
4356
+ "grad_norm": 23.125,
4357
+ "learning_rate": 1.3771243836803988e-07,
4358
+ "loss": 15.3368,
4359
+ "step": 30500
4360
+ },
4361
+ {
4362
+ "epoch": 0.882801268575309,
4363
+ "grad_norm": 23.171875,
4364
+ "learning_rate": 1.3793819646372518e-07,
4365
+ "loss": 15.343,
4366
+ "step": 30550
4367
+ },
4368
+ {
4369
+ "epoch": 0.884246115168722,
4370
+ "grad_norm": 29.78125,
4371
+ "learning_rate": 1.3816395455941048e-07,
4372
+ "loss": 15.3782,
4373
+ "step": 30600
4374
+ },
4375
+ {
4376
+ "epoch": 0.8856909617621349,
4377
+ "grad_norm": 22.453125,
4378
+ "learning_rate": 1.383897126550958e-07,
4379
+ "loss": 15.4537,
4380
+ "step": 30650
4381
+ },
4382
+ {
4383
+ "epoch": 0.8871358083555478,
4384
+ "grad_norm": 21.265625,
4385
+ "learning_rate": 1.386154707507811e-07,
4386
+ "loss": 15.4048,
4387
+ "step": 30700
4388
+ },
4389
+ {
4390
+ "epoch": 0.8885806549489608,
4391
+ "grad_norm": 24.25,
4392
+ "learning_rate": 1.3884122884646644e-07,
4393
+ "loss": 15.3433,
4394
+ "step": 30750
4395
+ },
4396
+ {
4397
+ "epoch": 0.8900255015423737,
4398
+ "grad_norm": 25.1875,
4399
+ "learning_rate": 1.3906698694215174e-07,
4400
+ "loss": 15.3141,
4401
+ "step": 30800
4402
+ },
4403
+ {
4404
+ "epoch": 0.8914703481357866,
4405
+ "grad_norm": 24.25,
4406
+ "learning_rate": 1.3929274503783704e-07,
4407
+ "loss": 15.2703,
4408
+ "step": 30850
4409
+ },
4410
+ {
4411
+ "epoch": 0.8929151947291997,
4412
+ "grad_norm": 22.3125,
4413
+ "learning_rate": 1.3951850313352237e-07,
4414
+ "loss": 15.4022,
4415
+ "step": 30900
4416
+ },
4417
+ {
4418
+ "epoch": 0.8943600413226126,
4419
+ "grad_norm": 19.859375,
4420
+ "learning_rate": 1.3974426122920768e-07,
4421
+ "loss": 15.2936,
4422
+ "step": 30950
4423
+ },
4424
+ {
4425
+ "epoch": 0.8958048879160255,
4426
+ "grad_norm": 20.5,
4427
+ "learning_rate": 1.39970019324893e-07,
4428
+ "loss": 15.3219,
4429
+ "step": 31000
4430
+ },
4431
+ {
4432
+ "epoch": 0.8972497345094385,
4433
+ "grad_norm": 21.71875,
4434
+ "learning_rate": 1.4019577742057828e-07,
4435
+ "loss": 15.2468,
4436
+ "step": 31050
4437
+ },
4438
+ {
4439
+ "epoch": 0.8986945811028514,
4440
+ "grad_norm": 23.421875,
4441
+ "learning_rate": 1.4042153551626358e-07,
4442
+ "loss": 15.2591,
4443
+ "step": 31100
4444
+ },
4445
+ {
4446
+ "epoch": 0.9001394276962643,
4447
+ "grad_norm": 23.09375,
4448
+ "learning_rate": 1.406472936119489e-07,
4449
+ "loss": 15.3318,
4450
+ "step": 31150
4451
+ },
4452
+ {
4453
+ "epoch": 0.9015842742896772,
4454
+ "grad_norm": 24.09375,
4455
+ "learning_rate": 1.4087305170763421e-07,
4456
+ "loss": 15.2105,
4457
+ "step": 31200
4458
+ },
4459
+ {
4460
+ "epoch": 0.9030291208830903,
4461
+ "grad_norm": 22.671875,
4462
+ "learning_rate": 1.4109880980331954e-07,
4463
+ "loss": 15.2557,
4464
+ "step": 31250
4465
+ },
4466
+ {
4467
+ "epoch": 0.9044739674765032,
4468
+ "grad_norm": 22.0625,
4469
+ "learning_rate": 1.4132456789900484e-07,
4470
+ "loss": 15.4014,
4471
+ "step": 31300
4472
+ },
4473
+ {
4474
+ "epoch": 0.9059188140699161,
4475
+ "grad_norm": 21.796875,
4476
+ "learning_rate": 1.4155032599469017e-07,
4477
+ "loss": 15.2382,
4478
+ "step": 31350
4479
+ },
4480
+ {
4481
+ "epoch": 0.9073636606633291,
4482
+ "grad_norm": 24.5,
4483
+ "learning_rate": 1.4177608409037548e-07,
4484
+ "loss": 15.395,
4485
+ "step": 31400
4486
+ },
4487
+ {
4488
+ "epoch": 0.908808507256742,
4489
+ "grad_norm": 21.828125,
4490
+ "learning_rate": 1.4200184218606078e-07,
4491
+ "loss": 15.2785,
4492
+ "step": 31450
4493
+ },
4494
+ {
4495
+ "epoch": 0.9102533538501549,
4496
+ "grad_norm": 22.5625,
4497
+ "learning_rate": 1.422276002817461e-07,
4498
+ "loss": 15.2983,
4499
+ "step": 31500
4500
+ },
4501
+ {
4502
+ "epoch": 0.9116982004435679,
4503
+ "grad_norm": 21.328125,
4504
+ "learning_rate": 1.424533583774314e-07,
4505
+ "loss": 15.382,
4506
+ "step": 31550
4507
+ },
4508
+ {
4509
+ "epoch": 0.9131430470369809,
4510
+ "grad_norm": 21.3125,
4511
+ "learning_rate": 1.4267911647311674e-07,
4512
+ "loss": 15.2084,
4513
+ "step": 31600
4514
+ },
4515
+ {
4516
+ "epoch": 0.9145878936303938,
4517
+ "grad_norm": 22.6875,
4518
+ "learning_rate": 1.4290487456880204e-07,
4519
+ "loss": 15.2803,
4520
+ "step": 31650
4521
+ },
4522
+ {
4523
+ "epoch": 0.9160327402238068,
4524
+ "grad_norm": 20.953125,
4525
+ "learning_rate": 1.4313063266448734e-07,
4526
+ "loss": 15.3734,
4527
+ "step": 31700
4528
+ },
4529
+ {
4530
+ "epoch": 0.9174775868172197,
4531
+ "grad_norm": 22.765625,
4532
+ "learning_rate": 1.4335639076017265e-07,
4533
+ "loss": 15.3248,
4534
+ "step": 31750
4535
+ },
4536
+ {
4537
+ "epoch": 0.9189224334106326,
4538
+ "grad_norm": 21.640625,
4539
+ "learning_rate": 1.4358214885585795e-07,
4540
+ "loss": 15.2958,
4541
+ "step": 31800
4542
+ },
4543
+ {
4544
+ "epoch": 0.9203672800040456,
4545
+ "grad_norm": 21.53125,
4546
+ "learning_rate": 1.4380790695154328e-07,
4547
+ "loss": 15.2854,
4548
+ "step": 31850
4549
+ },
4550
+ {
4551
+ "epoch": 0.9218121265974585,
4552
+ "grad_norm": 21.265625,
4553
+ "learning_rate": 1.4403366504722858e-07,
4554
+ "loss": 15.2983,
4555
+ "step": 31900
4556
+ },
4557
+ {
4558
+ "epoch": 0.9232569731908714,
4559
+ "grad_norm": 26.265625,
4560
+ "learning_rate": 1.4425942314291388e-07,
4561
+ "loss": 15.1967,
4562
+ "step": 31950
4563
+ },
4564
+ {
4565
+ "epoch": 0.9247018197842845,
4566
+ "grad_norm": 21.078125,
4567
+ "learning_rate": 1.444851812385992e-07,
4568
+ "loss": 15.2861,
4569
+ "step": 32000
4570
+ },
4571
+ {
4572
+ "epoch": 0.9261466663776974,
4573
+ "grad_norm": 22.53125,
4574
+ "learning_rate": 1.447109393342845e-07,
4575
+ "loss": 15.203,
4576
+ "step": 32050
4577
+ },
4578
+ {
4579
+ "epoch": 0.9275915129711103,
4580
+ "grad_norm": 20.46875,
4581
+ "learning_rate": 1.4493669742996984e-07,
4582
+ "loss": 15.3343,
4583
+ "step": 32100
4584
+ },
4585
+ {
4586
+ "epoch": 0.9290363595645232,
4587
+ "grad_norm": 21.5625,
4588
+ "learning_rate": 1.4516245552565514e-07,
4589
+ "loss": 15.1377,
4590
+ "step": 32150
4591
+ },
4592
+ {
4593
+ "epoch": 0.9304812061579362,
4594
+ "grad_norm": 23.609375,
4595
+ "learning_rate": 1.4538821362134047e-07,
4596
+ "loss": 15.267,
4597
+ "step": 32200
4598
+ },
4599
+ {
4600
+ "epoch": 0.9319260527513491,
4601
+ "grad_norm": 22.59375,
4602
+ "learning_rate": 1.4561397171702577e-07,
4603
+ "loss": 15.3935,
4604
+ "step": 32250
4605
+ },
4606
+ {
4607
+ "epoch": 0.933370899344762,
4608
+ "grad_norm": 23.90625,
4609
+ "learning_rate": 1.4583972981271108e-07,
4610
+ "loss": 15.2605,
4611
+ "step": 32300
4612
+ },
4613
+ {
4614
+ "epoch": 0.9348157459381751,
4615
+ "grad_norm": 23.171875,
4616
+ "learning_rate": 1.460654879083964e-07,
4617
+ "loss": 15.2479,
4618
+ "step": 32350
4619
+ },
4620
+ {
4621
+ "epoch": 0.936260592531588,
4622
+ "grad_norm": 22.734375,
4623
+ "learning_rate": 1.4629124600408168e-07,
4624
+ "loss": 15.1586,
4625
+ "step": 32400
4626
+ },
4627
+ {
4628
+ "epoch": 0.9377054391250009,
4629
+ "grad_norm": 22.078125,
4630
+ "learning_rate": 1.46517004099767e-07,
4631
+ "loss": 15.2709,
4632
+ "step": 32450
4633
+ },
4634
+ {
4635
+ "epoch": 0.9391502857184139,
4636
+ "grad_norm": 20.484375,
4637
+ "learning_rate": 1.467427621954523e-07,
4638
+ "loss": 15.3932,
4639
+ "step": 32500
4640
+ },
4641
+ {
4642
+ "epoch": 0.9405951323118268,
4643
+ "grad_norm": 20.171875,
4644
+ "learning_rate": 1.4696852029113762e-07,
4645
+ "loss": 15.3021,
4646
+ "step": 32550
4647
+ },
4648
+ {
4649
+ "epoch": 0.9420399789052397,
4650
+ "grad_norm": 20.375,
4651
+ "learning_rate": 1.4719427838682294e-07,
4652
+ "loss": 15.3676,
4653
+ "step": 32600
4654
+ },
4655
+ {
4656
+ "epoch": 0.9434848254986526,
4657
+ "grad_norm": 18.40625,
4658
+ "learning_rate": 1.4742003648250825e-07,
4659
+ "loss": 15.225,
4660
+ "step": 32650
4661
+ },
4662
+ {
4663
+ "epoch": 0.9449296720920656,
4664
+ "grad_norm": 22.921875,
4665
+ "learning_rate": 1.4764579457819357e-07,
4666
+ "loss": 15.2833,
4667
+ "step": 32700
4668
+ },
4669
+ {
4670
+ "epoch": 0.9463745186854786,
4671
+ "grad_norm": 21.359375,
4672
+ "learning_rate": 1.4787155267387888e-07,
4673
+ "loss": 15.2648,
4674
+ "step": 32750
4675
+ },
4676
+ {
4677
+ "epoch": 0.9478193652788915,
4678
+ "grad_norm": 18.03125,
4679
+ "learning_rate": 1.480973107695642e-07,
4680
+ "loss": 15.1701,
4681
+ "step": 32800
4682
+ },
4683
+ {
4684
+ "epoch": 0.9492642118723045,
4685
+ "grad_norm": 20.734375,
4686
+ "learning_rate": 1.483230688652495e-07,
4687
+ "loss": 15.3169,
4688
+ "step": 32850
4689
+ },
4690
+ {
4691
+ "epoch": 0.9507090584657174,
4692
+ "grad_norm": 30.40625,
4693
+ "learning_rate": 1.485488269609348e-07,
4694
+ "loss": 15.1193,
4695
+ "step": 32900
4696
+ },
4697
+ {
4698
+ "epoch": 0.9521539050591303,
4699
+ "grad_norm": 18.96875,
4700
+ "learning_rate": 1.4877458505662014e-07,
4701
+ "loss": 15.1477,
4702
+ "step": 32950
4703
+ },
4704
+ {
4705
+ "epoch": 0.9535987516525433,
4706
+ "grad_norm": 22.125,
4707
+ "learning_rate": 1.4900034315230544e-07,
4708
+ "loss": 15.1363,
4709
+ "step": 33000
4710
+ },
4711
+ {
4712
+ "epoch": 0.9535987516525433,
4713
+ "eval_loss": 1.9005507230758667,
4714
+ "eval_runtime": 343.9939,
4715
+ "eval_samples_per_second": 2710.903,
4716
+ "eval_steps_per_second": 42.358,
4717
+ "step": 33000
4718
+ },
4719
+ {
4720
+ "epoch": 0.9550435982459562,
4721
+ "grad_norm": 20.171875,
4722
+ "learning_rate": 1.4922610124799077e-07,
4723
+ "loss": 15.2704,
4724
+ "step": 33050
4725
+ },
4726
+ {
4727
+ "epoch": 0.9564884448393692,
4728
+ "grad_norm": 25.21875,
4729
+ "learning_rate": 1.4945185934367605e-07,
4730
+ "loss": 15.2413,
4731
+ "step": 33100
4732
+ },
4733
+ {
4734
+ "epoch": 0.9579332914327822,
4735
+ "grad_norm": 20.984375,
4736
+ "learning_rate": 1.4967761743936135e-07,
4737
+ "loss": 15.0358,
4738
+ "step": 33150
4739
+ },
4740
+ {
4741
+ "epoch": 0.9593781380261951,
4742
+ "grad_norm": 22.65625,
4743
+ "learning_rate": 1.4990337553504668e-07,
4744
+ "loss": 15.148,
4745
+ "step": 33200
4746
+ },
4747
+ {
4748
+ "epoch": 0.960822984619608,
4749
+ "grad_norm": 24.671875,
4750
+ "learning_rate": 1.5012913363073198e-07,
4751
+ "loss": 15.0575,
4752
+ "step": 33250
4753
+ },
4754
+ {
4755
+ "epoch": 0.962267831213021,
4756
+ "grad_norm": 21.28125,
4757
+ "learning_rate": 1.503548917264173e-07,
4758
+ "loss": 15.1119,
4759
+ "step": 33300
4760
+ },
4761
+ {
4762
+ "epoch": 0.9637126778064339,
4763
+ "grad_norm": 24.21875,
4764
+ "learning_rate": 1.505806498221026e-07,
4765
+ "loss": 15.21,
4766
+ "step": 33350
4767
+ },
4768
+ {
4769
+ "epoch": 0.9651575243998468,
4770
+ "grad_norm": 21.6875,
4771
+ "learning_rate": 1.5080640791778791e-07,
4772
+ "loss": 15.1355,
4773
+ "step": 33400
4774
+ },
4775
+ {
4776
+ "epoch": 0.9666023709932597,
4777
+ "grad_norm": 24.390625,
4778
+ "learning_rate": 1.5103216601347324e-07,
4779
+ "loss": 15.2218,
4780
+ "step": 33450
4781
+ },
4782
+ {
4783
+ "epoch": 0.9680472175866728,
4784
+ "grad_norm": 19.25,
4785
+ "learning_rate": 1.5125792410915854e-07,
4786
+ "loss": 15.1256,
4787
+ "step": 33500
4788
+ },
4789
+ {
4790
+ "epoch": 0.9694920641800857,
4791
+ "grad_norm": 19.984375,
4792
+ "learning_rate": 1.5148368220484387e-07,
4793
+ "loss": 15.1171,
4794
+ "step": 33550
4795
+ },
4796
+ {
4797
+ "epoch": 0.9709369107734986,
4798
+ "grad_norm": 19.640625,
4799
+ "learning_rate": 1.5170944030052918e-07,
4800
+ "loss": 15.0999,
4801
+ "step": 33600
4802
+ },
4803
+ {
4804
+ "epoch": 0.9723817573669116,
4805
+ "grad_norm": 24.265625,
4806
+ "learning_rate": 1.519351983962145e-07,
4807
+ "loss": 15.2255,
4808
+ "step": 33650
4809
+ },
4810
+ {
4811
+ "epoch": 0.9738266039603245,
4812
+ "grad_norm": 25.546875,
4813
+ "learning_rate": 1.521609564918998e-07,
4814
+ "loss": 15.0743,
4815
+ "step": 33700
4816
+ },
4817
+ {
4818
+ "epoch": 0.9752714505537374,
4819
+ "grad_norm": 21.578125,
4820
+ "learning_rate": 1.5238671458758508e-07,
4821
+ "loss": 15.145,
4822
+ "step": 33750
4823
+ },
4824
+ {
4825
+ "epoch": 0.9767162971471504,
4826
+ "grad_norm": 24.46875,
4827
+ "learning_rate": 1.526124726832704e-07,
4828
+ "loss": 15.2408,
4829
+ "step": 33800
4830
+ },
4831
+ {
4832
+ "epoch": 0.9781611437405634,
4833
+ "grad_norm": 21.984375,
4834
+ "learning_rate": 1.5283823077895571e-07,
4835
+ "loss": 15.1413,
4836
+ "step": 33850
4837
+ },
4838
+ {
4839
+ "epoch": 0.9796059903339763,
4840
+ "grad_norm": 21.828125,
4841
+ "learning_rate": 1.5306398887464104e-07,
4842
+ "loss": 15.1452,
4843
+ "step": 33900
4844
+ },
4845
+ {
4846
+ "epoch": 0.9810508369273893,
4847
+ "grad_norm": 22.125,
4848
+ "learning_rate": 1.5328974697032635e-07,
4849
+ "loss": 15.1786,
4850
+ "step": 33950
4851
+ },
4852
+ {
4853
+ "epoch": 0.9824956835208022,
4854
+ "grad_norm": 27.046875,
4855
+ "learning_rate": 1.5351550506601165e-07,
4856
+ "loss": 15.069,
4857
+ "step": 34000
4858
+ },
4859
+ {
4860
+ "epoch": 0.9839405301142151,
4861
+ "grad_norm": 21.65625,
4862
+ "learning_rate": 1.5374126316169698e-07,
4863
+ "loss": 15.1776,
4864
+ "step": 34050
4865
+ },
4866
+ {
4867
+ "epoch": 0.985385376707628,
4868
+ "grad_norm": 21.953125,
4869
+ "learning_rate": 1.5396702125738228e-07,
4870
+ "loss": 15.1561,
4871
+ "step": 34100
4872
+ },
4873
+ {
4874
+ "epoch": 0.986830223301041,
4875
+ "grad_norm": 25.75,
4876
+ "learning_rate": 1.541927793530676e-07,
4877
+ "loss": 15.2242,
4878
+ "step": 34150
4879
+ },
4880
+ {
4881
+ "epoch": 0.9882750698944539,
4882
+ "grad_norm": 23.484375,
4883
+ "learning_rate": 1.544185374487529e-07,
4884
+ "loss": 15.1583,
4885
+ "step": 34200
4886
+ },
4887
+ {
4888
+ "epoch": 0.989719916487867,
4889
+ "grad_norm": 27.984375,
4890
+ "learning_rate": 1.546442955444382e-07,
4891
+ "loss": 15.1159,
4892
+ "step": 34250
4893
+ },
4894
+ {
4895
+ "epoch": 0.9911647630812799,
4896
+ "grad_norm": 21.34375,
4897
+ "learning_rate": 1.5487005364012354e-07,
4898
+ "loss": 15.0641,
4899
+ "step": 34300
4900
+ },
4901
+ {
4902
+ "epoch": 0.9926096096746928,
4903
+ "grad_norm": 20.25,
4904
+ "learning_rate": 1.5509581173580884e-07,
4905
+ "loss": 15.139,
4906
+ "step": 34350
4907
+ },
4908
+ {
4909
+ "epoch": 0.9940544562681057,
4910
+ "grad_norm": 21.640625,
4911
+ "learning_rate": 1.5532156983149415e-07,
4912
+ "loss": 15.0966,
4913
+ "step": 34400
4914
+ },
4915
+ {
4916
+ "epoch": 0.9954993028615187,
4917
+ "grad_norm": 22.125,
4918
+ "learning_rate": 1.5554732792717945e-07,
4919
+ "loss": 15.1072,
4920
+ "step": 34450
4921
+ },
4922
+ {
4923
+ "epoch": 0.9969441494549316,
4924
+ "grad_norm": 21.859375,
4925
+ "learning_rate": 1.5577308602286478e-07,
4926
+ "loss": 15.1392,
4927
+ "step": 34500
4928
+ },
4929
+ {
4930
+ "epoch": 0.9983889960483445,
4931
+ "grad_norm": 21.703125,
4932
+ "learning_rate": 1.5599884411855008e-07,
4933
+ "loss": 15.2097,
4934
+ "step": 34550
4935
+ },
4936
+ {
4937
+ "epoch": 0.9998338426417576,
4938
+ "grad_norm": 23.140625,
4939
+ "learning_rate": 1.5622460221423538e-07,
4940
+ "loss": 15.0667,
4941
+ "step": 34600
4942
  }
4943
  ],
4944
  "logging_steps": 50,
 
4953
  "should_evaluate": false,
4954
  "should_log": false,
4955
  "should_save": true,
4956
+ "should_training_stop": true
4957
  },
4958
  "attributes": {}
4959
  }
4960
  },
4961
+ "total_flos": 6.042588809533587e+18,
4962
  "train_batch_size": 8,
4963
  "trial_name": null,
4964
  "trial_params": null