ToastyPigeon commited on
Commit
9701dad
·
verified ·
1 Parent(s): 0d8d4b4

Training in progress, step 855, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37ddee871f5c8860c14421aa33d7cdbaa19f59b850fbe75607f8e04d53149ec8
3
  size 1527066456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0355df4e5063562f37d09abcb14b93955eed834c29f96e9873c1ae6867862c3c
3
  size 1527066456
last-checkpoint/global_step855/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e3946b3681927eb61dbc0943eac914d926599e3e93d2bb344c31a171fb3f88a
3
+ size 2303346228
last-checkpoint/global_step855/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957045c26127b5ae0cf698962b6f3bea091dd1f36a574987ff44d98775cd7601
3
+ size 2303346228
last-checkpoint/global_step855/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0edccf7dfc2621373d5aad8800a44738a6fd5044232f274d3c92fe9166a15764
3
+ size 354242335
last-checkpoint/global_step855/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b813789517429aedbb8c970fc2103dd038913fa55a904f2f73ed1c12a4f2da
3
+ size 354242335
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step760
 
1
+ global_step855
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd02523e9a2772795dcb8a5377c30dd5d755619121ed3b3ff62475427588a9c3
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03593c226bb4a31df5c4e26aaac693fbb9741d071e04c601b5ac1382f878f52b
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de3e6ca92eb194178d24b38dccc3684b16157506fcf7c4fed89bc27d96db71bd
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d987383f11815234d8954c0b84d9c5f500a84408fd66a9795cd135ccfaa9f970
3
  size 14917
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ec9f9edd6063090e9cce93303e9da244a2299a1a318fe32b73067a10396350a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:515697a4ba82550e0f5b33a4815d00a842066a9ee3219fde5b4a86b4c7e265ef
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.804871591209955,
6
  "eval_steps": 95,
7
- "global_step": 760,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5400,6 +5400,679 @@
5400
  "eval_samples_per_second": 0.578,
5401
  "eval_steps_per_second": 0.29,
5402
  "step": 760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5403
  }
5404
  ],
5405
  "logging_steps": 1,
@@ -5419,7 +6092,7 @@
5419
  "attributes": {}
5420
  }
5421
  },
5422
- "total_flos": 3.5199521730330624e+17,
5423
  "train_batch_size": 1,
5424
  "trial_name": null,
5425
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9054805401111994,
6
  "eval_steps": 95,
7
+ "global_step": 855,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5400
  "eval_samples_per_second": 0.578,
5401
  "eval_steps_per_second": 0.29,
5402
  "step": 760
5403
+ },
5404
+ {
5405
+ "epoch": 0.8059306327773366,
5406
+ "grad_norm": 0.6803131636311378,
5407
+ "learning_rate": 7.140070230198985e-06,
5408
+ "loss": 1.8325,
5409
+ "step": 761
5410
+ },
5411
+ {
5412
+ "epoch": 0.806989674344718,
5413
+ "grad_norm": 0.5797389494956084,
5414
+ "learning_rate": 7.126076742364918e-06,
5415
+ "loss": 1.9813,
5416
+ "step": 762
5417
+ },
5418
+ {
5419
+ "epoch": 0.8080487159120996,
5420
+ "grad_norm": 0.44784413151847136,
5421
+ "learning_rate": 7.111984282907661e-06,
5422
+ "loss": 2.3973,
5423
+ "step": 763
5424
+ },
5425
+ {
5426
+ "epoch": 0.809107757479481,
5427
+ "grad_norm": 1.5430000937697363,
5428
+ "learning_rate": 7.097791798107255e-06,
5429
+ "loss": 2.147,
5430
+ "step": 764
5431
+ },
5432
+ {
5433
+ "epoch": 0.8101667990468626,
5434
+ "grad_norm": 0.5136408504852492,
5435
+ "learning_rate": 7.083498219232293e-06,
5436
+ "loss": 1.848,
5437
+ "step": 765
5438
+ },
5439
+ {
5440
+ "epoch": 0.8112258406142441,
5441
+ "grad_norm": 0.5467920174538149,
5442
+ "learning_rate": 7.069102462271644e-06,
5443
+ "loss": 2.2383,
5444
+ "step": 766
5445
+ },
5446
+ {
5447
+ "epoch": 0.8122848821816256,
5448
+ "grad_norm": 0.5125300215381279,
5449
+ "learning_rate": 7.054603427660422e-06,
5450
+ "loss": 2.1579,
5451
+ "step": 767
5452
+ },
5453
+ {
5454
+ "epoch": 0.8133439237490071,
5455
+ "grad_norm": 0.8105607354385294,
5456
+ "learning_rate": 7.04e-06,
5457
+ "loss": 2.236,
5458
+ "step": 768
5459
+ },
5460
+ {
5461
+ "epoch": 0.8144029653163887,
5462
+ "grad_norm": 0.46810244908141757,
5463
+ "learning_rate": 7.02529104777198e-06,
5464
+ "loss": 2.1892,
5465
+ "step": 769
5466
+ },
5467
+ {
5468
+ "epoch": 0.8154620068837702,
5469
+ "grad_norm": 0.4455237449570357,
5470
+ "learning_rate": 7.0104754230459316e-06,
5471
+ "loss": 2.2361,
5472
+ "step": 770
5473
+ },
5474
+ {
5475
+ "epoch": 0.8165210484511517,
5476
+ "grad_norm": 0.5665884230798665,
5477
+ "learning_rate": 6.995551961180752e-06,
5478
+ "loss": 1.9058,
5479
+ "step": 771
5480
+ },
5481
+ {
5482
+ "epoch": 0.8175800900185333,
5483
+ "grad_norm": 0.4633746613411758,
5484
+ "learning_rate": 6.980519480519481e-06,
5485
+ "loss": 2.1833,
5486
+ "step": 772
5487
+ },
5488
+ {
5489
+ "epoch": 0.8186391315859147,
5490
+ "grad_norm": 0.44219293669742443,
5491
+ "learning_rate": 6.965376782077392e-06,
5492
+ "loss": 2.434,
5493
+ "step": 773
5494
+ },
5495
+ {
5496
+ "epoch": 0.8196981731532963,
5497
+ "grad_norm": 0.42875965911174113,
5498
+ "learning_rate": 6.950122649223222e-06,
5499
+ "loss": 2.1409,
5500
+ "step": 774
5501
+ },
5502
+ {
5503
+ "epoch": 0.8207572147206778,
5504
+ "grad_norm": 0.47230346181676486,
5505
+ "learning_rate": 6.934755847353305e-06,
5506
+ "loss": 2.1609,
5507
+ "step": 775
5508
+ },
5509
+ {
5510
+ "epoch": 0.8218162562880593,
5511
+ "grad_norm": 0.47784292161116215,
5512
+ "learning_rate": 6.919275123558486e-06,
5513
+ "loss": 2.0429,
5514
+ "step": 776
5515
+ },
5516
+ {
5517
+ "epoch": 0.8228752978554408,
5518
+ "grad_norm": 1.1727376426648415,
5519
+ "learning_rate": 6.903679206283588e-06,
5520
+ "loss": 1.9888,
5521
+ "step": 777
5522
+ },
5523
+ {
5524
+ "epoch": 0.8239343394228223,
5525
+ "grad_norm": 2.1540603636137363,
5526
+ "learning_rate": 6.887966804979253e-06,
5527
+ "loss": 2.2259,
5528
+ "step": 778
5529
+ },
5530
+ {
5531
+ "epoch": 0.8249933809902039,
5532
+ "grad_norm": 0.39665352588092573,
5533
+ "learning_rate": 6.8721366097459395e-06,
5534
+ "loss": 2.1907,
5535
+ "step": 779
5536
+ },
5537
+ {
5538
+ "epoch": 0.8260524225575854,
5539
+ "grad_norm": 0.41043373870823585,
5540
+ "learning_rate": 6.8561872909699e-06,
5541
+ "loss": 2.2407,
5542
+ "step": 780
5543
+ },
5544
+ {
5545
+ "epoch": 0.8271114641249669,
5546
+ "grad_norm": 0.5256052719666096,
5547
+ "learning_rate": 6.840117498950903e-06,
5548
+ "loss": 1.9323,
5549
+ "step": 781
5550
+ },
5551
+ {
5552
+ "epoch": 0.8281705056923484,
5553
+ "grad_norm": 0.43680474985593193,
5554
+ "learning_rate": 6.823925863521483e-06,
5555
+ "loss": 2.3057,
5556
+ "step": 782
5557
+ },
5558
+ {
5559
+ "epoch": 0.82922954725973,
5560
+ "grad_norm": 0.56024481376916,
5561
+ "learning_rate": 6.807610993657505e-06,
5562
+ "loss": 2.283,
5563
+ "step": 783
5564
+ },
5565
+ {
5566
+ "epoch": 0.8302885888271114,
5567
+ "grad_norm": 2.461245897952525,
5568
+ "learning_rate": 6.7911714770797965e-06,
5569
+ "loss": 2.2754,
5570
+ "step": 784
5571
+ },
5572
+ {
5573
+ "epoch": 0.831347630394493,
5574
+ "grad_norm": 0.4239538684026029,
5575
+ "learning_rate": 6.774605879846613e-06,
5576
+ "loss": 2.1561,
5577
+ "step": 785
5578
+ },
5579
+ {
5580
+ "epoch": 0.8324066719618745,
5581
+ "grad_norm": 0.5086795267311749,
5582
+ "learning_rate": 6.757912745936699e-06,
5583
+ "loss": 1.9561,
5584
+ "step": 786
5585
+ },
5586
+ {
5587
+ "epoch": 0.833465713529256,
5588
+ "grad_norm": 0.529180140080738,
5589
+ "learning_rate": 6.7410905968226705e-06,
5590
+ "loss": 2.2472,
5591
+ "step": 787
5592
+ },
5593
+ {
5594
+ "epoch": 0.8345247550966376,
5595
+ "grad_norm": 0.906770591412783,
5596
+ "learning_rate": 6.724137931034482e-06,
5597
+ "loss": 2.0894,
5598
+ "step": 788
5599
+ },
5600
+ {
5601
+ "epoch": 0.835583796664019,
5602
+ "grad_norm": 0.4763073163865911,
5603
+ "learning_rate": 6.707053223712678e-06,
5604
+ "loss": 2.1889,
5605
+ "step": 789
5606
+ },
5607
+ {
5608
+ "epoch": 0.8366428382314006,
5609
+ "grad_norm": 0.41043514041608953,
5610
+ "learning_rate": 6.689834926151174e-06,
5611
+ "loss": 2.2596,
5612
+ "step": 790
5613
+ },
5614
+ {
5615
+ "epoch": 0.8377018797987821,
5616
+ "grad_norm": 0.4875150728481836,
5617
+ "learning_rate": 6.672481465329265e-06,
5618
+ "loss": 2.171,
5619
+ "step": 791
5620
+ },
5621
+ {
5622
+ "epoch": 0.8387609213661636,
5623
+ "grad_norm": 0.4563054157354168,
5624
+ "learning_rate": 6.654991243432575e-06,
5625
+ "loss": 2.0479,
5626
+ "step": 792
5627
+ },
5628
+ {
5629
+ "epoch": 0.8398199629335451,
5630
+ "grad_norm": 0.40698891088293526,
5631
+ "learning_rate": 6.637362637362638e-06,
5632
+ "loss": 2.1311,
5633
+ "step": 793
5634
+ },
5635
+ {
5636
+ "epoch": 0.8408790045009267,
5637
+ "grad_norm": 1.3105970922128873,
5638
+ "learning_rate": 6.619593998234774e-06,
5639
+ "loss": 2.1572,
5640
+ "step": 794
5641
+ },
5642
+ {
5643
+ "epoch": 0.8419380460683081,
5644
+ "grad_norm": 0.5405021865928985,
5645
+ "learning_rate": 6.601683650863979e-06,
5646
+ "loss": 2.0705,
5647
+ "step": 795
5648
+ },
5649
+ {
5650
+ "epoch": 0.8429970876356897,
5651
+ "grad_norm": 0.4641185563260275,
5652
+ "learning_rate": 6.5836298932384346e-06,
5653
+ "loss": 2.2504,
5654
+ "step": 796
5655
+ },
5656
+ {
5657
+ "epoch": 0.8440561292030713,
5658
+ "grad_norm": 0.42421600635418305,
5659
+ "learning_rate": 6.565430995980349e-06,
5660
+ "loss": 2.2085,
5661
+ "step": 797
5662
+ },
5663
+ {
5664
+ "epoch": 0.8451151707704527,
5665
+ "grad_norm": 0.49354822735708614,
5666
+ "learning_rate": 6.547085201793723e-06,
5667
+ "loss": 2.1794,
5668
+ "step": 798
5669
+ },
5670
+ {
5671
+ "epoch": 0.8461742123378343,
5672
+ "grad_norm": 0.3921126516756366,
5673
+ "learning_rate": 6.528590724898695e-06,
5674
+ "loss": 2.227,
5675
+ "step": 799
5676
+ },
5677
+ {
5678
+ "epoch": 0.8472332539052158,
5679
+ "grad_norm": 0.6764827329334605,
5680
+ "learning_rate": 6.50994575045208e-06,
5681
+ "loss": 2.1332,
5682
+ "step": 800
5683
+ },
5684
+ {
5685
+ "epoch": 0.8482922954725973,
5686
+ "grad_norm": 0.4061658039995484,
5687
+ "learning_rate": 6.4911484339537e-06,
5688
+ "loss": 2.3831,
5689
+ "step": 801
5690
+ },
5691
+ {
5692
+ "epoch": 0.8493513370399788,
5693
+ "grad_norm": 0.5826701454101317,
5694
+ "learning_rate": 6.4721969006381045e-06,
5695
+ "loss": 2.3832,
5696
+ "step": 802
5697
+ },
5698
+ {
5699
+ "epoch": 0.8504103786073604,
5700
+ "grad_norm": 0.41435088004419374,
5701
+ "learning_rate": 6.453089244851258e-06,
5702
+ "loss": 2.2416,
5703
+ "step": 803
5704
+ },
5705
+ {
5706
+ "epoch": 0.8514694201747418,
5707
+ "grad_norm": 0.4150959050115928,
5708
+ "learning_rate": 6.433823529411766e-06,
5709
+ "loss": 2.0757,
5710
+ "step": 804
5711
+ },
5712
+ {
5713
+ "epoch": 0.8525284617421234,
5714
+ "grad_norm": 0.43352625752977775,
5715
+ "learning_rate": 6.414397784956161e-06,
5716
+ "loss": 1.9316,
5717
+ "step": 805
5718
+ },
5719
+ {
5720
+ "epoch": 0.853587503309505,
5721
+ "grad_norm": 0.5885629954685914,
5722
+ "learning_rate": 6.39481000926784e-06,
5723
+ "loss": 2.283,
5724
+ "step": 806
5725
+ },
5726
+ {
5727
+ "epoch": 0.8546465448768864,
5728
+ "grad_norm": 0.6425274615571129,
5729
+ "learning_rate": 6.375058166589111e-06,
5730
+ "loss": 2.1418,
5731
+ "step": 807
5732
+ },
5733
+ {
5734
+ "epoch": 0.855705586444268,
5735
+ "grad_norm": 0.3921348356319304,
5736
+ "learning_rate": 6.355140186915888e-06,
5737
+ "loss": 2.367,
5738
+ "step": 808
5739
+ },
5740
+ {
5741
+ "epoch": 0.8567646280116494,
5742
+ "grad_norm": 0.48187968387901975,
5743
+ "learning_rate": 6.3350539652745195e-06,
5744
+ "loss": 2.2415,
5745
+ "step": 809
5746
+ },
5747
+ {
5748
+ "epoch": 0.857823669579031,
5749
+ "grad_norm": 0.5163238323570051,
5750
+ "learning_rate": 6.3147973609802075e-06,
5751
+ "loss": 2.096,
5752
+ "step": 810
5753
+ },
5754
+ {
5755
+ "epoch": 0.8588827111464125,
5756
+ "grad_norm": 0.42537069143517753,
5757
+ "learning_rate": 6.294368196876479e-06,
5758
+ "loss": 2.2303,
5759
+ "step": 811
5760
+ },
5761
+ {
5762
+ "epoch": 0.859941752713794,
5763
+ "grad_norm": 0.5111353722058767,
5764
+ "learning_rate": 6.273764258555133e-06,
5765
+ "loss": 2.1343,
5766
+ "step": 812
5767
+ },
5768
+ {
5769
+ "epoch": 0.8610007942811755,
5770
+ "grad_norm": 0.4948365929852451,
5771
+ "learning_rate": 6.252983293556086e-06,
5772
+ "loss": 2.2779,
5773
+ "step": 813
5774
+ },
5775
+ {
5776
+ "epoch": 0.862059835848557,
5777
+ "grad_norm": 0.42418098706232704,
5778
+ "learning_rate": 6.232023010546501e-06,
5779
+ "loss": 2.2429,
5780
+ "step": 814
5781
+ },
5782
+ {
5783
+ "epoch": 0.8631188774159386,
5784
+ "grad_norm": 0.592767643023693,
5785
+ "learning_rate": 6.210881078478576e-06,
5786
+ "loss": 2.1041,
5787
+ "step": 815
5788
+ },
5789
+ {
5790
+ "epoch": 0.8641779189833201,
5791
+ "grad_norm": 1.111541479840316,
5792
+ "learning_rate": 6.189555125725339e-06,
5793
+ "loss": 2.2028,
5794
+ "step": 816
5795
+ },
5796
+ {
5797
+ "epoch": 0.8652369605507017,
5798
+ "grad_norm": 0.39821783655109405,
5799
+ "learning_rate": 6.168042739193783e-06,
5800
+ "loss": 2.2218,
5801
+ "step": 817
5802
+ },
5803
+ {
5804
+ "epoch": 0.8662960021180831,
5805
+ "grad_norm": 1.003472324980434,
5806
+ "learning_rate": 6.1463414634146346e-06,
5807
+ "loss": 2.1237,
5808
+ "step": 818
5809
+ },
5810
+ {
5811
+ "epoch": 0.8673550436854647,
5812
+ "grad_norm": 0.5140913920253216,
5813
+ "learning_rate": 6.124448799608036e-06,
5814
+ "loss": 2.2554,
5815
+ "step": 819
5816
+ },
5817
+ {
5818
+ "epoch": 0.8684140852528461,
5819
+ "grad_norm": 1.304653340463634,
5820
+ "learning_rate": 6.1023622047244104e-06,
5821
+ "loss": 2.1231,
5822
+ "step": 820
5823
+ },
5824
+ {
5825
+ "epoch": 0.8694731268202277,
5826
+ "grad_norm": 0.46644604217264457,
5827
+ "learning_rate": 6.080079090459714e-06,
5828
+ "loss": 2.068,
5829
+ "step": 821
5830
+ },
5831
+ {
5832
+ "epoch": 0.8705321683876092,
5833
+ "grad_norm": 0.3784392165588507,
5834
+ "learning_rate": 6.05759682224429e-06,
5835
+ "loss": 2.3086,
5836
+ "step": 822
5837
+ },
5838
+ {
5839
+ "epoch": 0.8715912099549907,
5840
+ "grad_norm": 0.42750467258430275,
5841
+ "learning_rate": 6.03491271820449e-06,
5842
+ "loss": 2.3433,
5843
+ "step": 823
5844
+ },
5845
+ {
5846
+ "epoch": 0.8726502515223723,
5847
+ "grad_norm": 0.3855344094267301,
5848
+ "learning_rate": 6.0120240480961935e-06,
5849
+ "loss": 2.2661,
5850
+ "step": 824
5851
+ },
5852
+ {
5853
+ "epoch": 0.8737092930897538,
5854
+ "grad_norm": 0.43412569682236263,
5855
+ "learning_rate": 5.9889280322093616e-06,
5856
+ "loss": 2.0809,
5857
+ "step": 825
5858
+ },
5859
+ {
5860
+ "epoch": 0.8747683346571353,
5861
+ "grad_norm": 0.4567508929265723,
5862
+ "learning_rate": 5.96562184024267e-06,
5863
+ "loss": 1.9931,
5864
+ "step": 826
5865
+ },
5866
+ {
5867
+ "epoch": 0.8758273762245168,
5868
+ "grad_norm": 0.7858185134588669,
5869
+ "learning_rate": 5.942102590147283e-06,
5870
+ "loss": 1.8915,
5871
+ "step": 827
5872
+ },
5873
+ {
5874
+ "epoch": 0.8768864177918984,
5875
+ "grad_norm": 0.4865728159534291,
5876
+ "learning_rate": 5.918367346938776e-06,
5877
+ "loss": 2.1063,
5878
+ "step": 828
5879
+ },
5880
+ {
5881
+ "epoch": 0.8779454593592798,
5882
+ "grad_norm": 0.7998515233126545,
5883
+ "learning_rate": 5.894413121476167e-06,
5884
+ "loss": 1.7472,
5885
+ "step": 829
5886
+ },
5887
+ {
5888
+ "epoch": 0.8790045009266614,
5889
+ "grad_norm": 0.4524437200975804,
5890
+ "learning_rate": 5.870236869207003e-06,
5891
+ "loss": 2.2934,
5892
+ "step": 830
5893
+ },
5894
+ {
5895
+ "epoch": 0.8800635424940428,
5896
+ "grad_norm": 0.4057519289307801,
5897
+ "learning_rate": 5.845835488877393e-06,
5898
+ "loss": 2.1486,
5899
+ "step": 831
5900
+ },
5901
+ {
5902
+ "epoch": 0.8811225840614244,
5903
+ "grad_norm": 0.4391117370907218,
5904
+ "learning_rate": 5.821205821205822e-06,
5905
+ "loss": 2.262,
5906
+ "step": 832
5907
+ },
5908
+ {
5909
+ "epoch": 0.882181625628806,
5910
+ "grad_norm": 0.6376172649412307,
5911
+ "learning_rate": 5.7963446475195825e-06,
5912
+ "loss": 2.155,
5913
+ "step": 833
5914
+ },
5915
+ {
5916
+ "epoch": 0.8832406671961874,
5917
+ "grad_norm": 0.37373198069593205,
5918
+ "learning_rate": 5.771248688352571e-06,
5919
+ "loss": 2.1126,
5920
+ "step": 834
5921
+ },
5922
+ {
5923
+ "epoch": 0.884299708763569,
5924
+ "grad_norm": 0.5897412582783383,
5925
+ "learning_rate": 5.745914602003163e-06,
5926
+ "loss": 2.2602,
5927
+ "step": 835
5928
+ },
5929
+ {
5930
+ "epoch": 0.8853587503309505,
5931
+ "grad_norm": 0.47097565886520626,
5932
+ "learning_rate": 5.720338983050848e-06,
5933
+ "loss": 1.8018,
5934
+ "step": 836
5935
+ },
5936
+ {
5937
+ "epoch": 0.886417791898332,
5938
+ "grad_norm": 0.4395931623754186,
5939
+ "learning_rate": 5.694518360830229e-06,
5940
+ "loss": 2.1072,
5941
+ "step": 837
5942
+ },
5943
+ {
5944
+ "epoch": 0.8874768334657135,
5945
+ "grad_norm": 0.46574222534352466,
5946
+ "learning_rate": 5.6684491978609635e-06,
5947
+ "loss": 2.1773,
5948
+ "step": 838
5949
+ },
5950
+ {
5951
+ "epoch": 0.8885358750330951,
5952
+ "grad_norm": 0.4551573251462227,
5953
+ "learning_rate": 5.642127888232134e-06,
5954
+ "loss": 2.3387,
5955
+ "step": 839
5956
+ },
5957
+ {
5958
+ "epoch": 0.8895949166004765,
5959
+ "grad_norm": 0.4493811713490901,
5960
+ "learning_rate": 5.615550755939525e-06,
5961
+ "loss": 2.166,
5962
+ "step": 840
5963
+ },
5964
+ {
5965
+ "epoch": 0.8906539581678581,
5966
+ "grad_norm": 0.39495794701015247,
5967
+ "learning_rate": 5.588714053174173e-06,
5968
+ "loss": 2.1217,
5969
+ "step": 841
5970
+ },
5971
+ {
5972
+ "epoch": 0.8917129997352397,
5973
+ "grad_norm": 0.5230334500510622,
5974
+ "learning_rate": 5.5616139585605235e-06,
5975
+ "loss": 2.208,
5976
+ "step": 842
5977
+ },
5978
+ {
5979
+ "epoch": 0.8927720413026211,
5980
+ "grad_norm": 0.5613270004933778,
5981
+ "learning_rate": 5.534246575342466e-06,
5982
+ "loss": 2.1997,
5983
+ "step": 843
5984
+ },
5985
+ {
5986
+ "epoch": 0.8938310828700027,
5987
+ "grad_norm": 0.5650905985537464,
5988
+ "learning_rate": 5.506607929515418e-06,
5989
+ "loss": 2.198,
5990
+ "step": 844
5991
+ },
5992
+ {
5993
+ "epoch": 0.8948901244373841,
5994
+ "grad_norm": 0.3895565844023344,
5995
+ "learning_rate": 5.4786939679026e-06,
5996
+ "loss": 2.1742,
5997
+ "step": 845
5998
+ },
5999
+ {
6000
+ "epoch": 0.8959491660047657,
6001
+ "grad_norm": 0.42224606882848037,
6002
+ "learning_rate": 5.450500556173527e-06,
6003
+ "loss": 2.0029,
6004
+ "step": 846
6005
+ },
6006
+ {
6007
+ "epoch": 0.8970082075721472,
6008
+ "grad_norm": 0.4528295653901916,
6009
+ "learning_rate": 5.422023476802684e-06,
6010
+ "loss": 2.2477,
6011
+ "step": 847
6012
+ },
6013
+ {
6014
+ "epoch": 0.8980672491395287,
6015
+ "grad_norm": 0.42318696679446294,
6016
+ "learning_rate": 5.393258426966292e-06,
6017
+ "loss": 2.2269,
6018
+ "step": 848
6019
+ },
6020
+ {
6021
+ "epoch": 0.8991262907069102,
6022
+ "grad_norm": 0.5825734030559547,
6023
+ "learning_rate": 5.36420101637493e-06,
6024
+ "loss": 1.9258,
6025
+ "step": 849
6026
+ },
6027
+ {
6028
+ "epoch": 0.9001853322742918,
6029
+ "grad_norm": 0.5182408045493935,
6030
+ "learning_rate": 5.334846765039727e-06,
6031
+ "loss": 2.3554,
6032
+ "step": 850
6033
+ },
6034
+ {
6035
+ "epoch": 0.9012443738416733,
6036
+ "grad_norm": 0.38168303079212984,
6037
+ "learning_rate": 5.305191100969766e-06,
6038
+ "loss": 2.21,
6039
+ "step": 851
6040
+ },
6041
+ {
6042
+ "epoch": 0.9023034154090548,
6043
+ "grad_norm": 0.5296996772260512,
6044
+ "learning_rate": 5.275229357798165e-06,
6045
+ "loss": 2.1481,
6046
+ "step": 852
6047
+ },
6048
+ {
6049
+ "epoch": 0.9033624569764364,
6050
+ "grad_norm": 0.44191291244492953,
6051
+ "learning_rate": 5.244956772334294e-06,
6052
+ "loss": 2.292,
6053
+ "step": 853
6054
+ },
6055
+ {
6056
+ "epoch": 0.9044214985438178,
6057
+ "grad_norm": 0.39411649769962676,
6058
+ "learning_rate": 5.214368482039398e-06,
6059
+ "loss": 2.1063,
6060
+ "step": 854
6061
+ },
6062
+ {
6063
+ "epoch": 0.9054805401111994,
6064
+ "grad_norm": 0.4979125672714714,
6065
+ "learning_rate": 5.18345952242283e-06,
6066
+ "loss": 2.1021,
6067
+ "step": 855
6068
+ },
6069
+ {
6070
+ "epoch": 0.9054805401111994,
6071
+ "eval_loss": 2.1856369972229004,
6072
+ "eval_runtime": 560.4973,
6073
+ "eval_samples_per_second": 0.58,
6074
+ "eval_steps_per_second": 0.291,
6075
+ "step": 855
6076
  }
6077
  ],
6078
  "logging_steps": 1,
 
6092
  "attributes": {}
6093
  }
6094
  },
6095
+ "total_flos": 3.9599464066842624e+17,
6096
  "train_batch_size": 1,
6097
  "trial_name": null,
6098
  "trial_params": null