error577 commited on
Commit
76b9aca
·
verified ·
1 Parent(s): f71ba12

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f62a3669267cf71d82d7613940fda8316817761a312976ab3eaf115e310acc5
3
  size 671149168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef365db25022057197b4b51b720031484568324974a69eac1519e50b0f27180e
3
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:028cc8a1a5f704ba3fd8b0a0c40fff116ce068a6aeb95280523010390d01a4f7
3
  size 179316182
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c797a7af223641265261c12a7109ff441b5a98196dd5655fd2f5252583938e32
3
  size 179316182
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71352a8a29de772e57a41bb86fae428e2d7704c4a8210b01b9dc37d6bce4b251
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:566bad833abc8e24d98072c362038aaf0a56ffba2d3a483eddb4f69725e48c7a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b438af450d5f669b412dd3e9981bf7d3209f28ad248f243baf4956b744ebafc5
3
  size 2080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51377c3c97ab55526aa4f5f8b0c3786821eafd572b923cffeba3b976830c2d0f
3
  size 2080
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.7760652303695679,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.03883570554768054,
5
  "eval_steps": 100,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3555,6 +3555,714 @@
3555
  "eval_samples_per_second": 9.359,
3556
  "eval_steps_per_second": 9.359,
3557
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3558
  }
3559
  ],
3560
  "logging_steps": 1,
@@ -3569,7 +4277,7 @@
3569
  "early_stopping_threshold": 0.0
3570
  },
3571
  "attributes": {
3572
- "early_stopping_patience_counter": 3
3573
  }
3574
  },
3575
  "TrainerControl": {
@@ -3578,12 +4286,12 @@
3578
  "should_evaluate": false,
3579
  "should_log": false,
3580
  "should_save": true,
3581
- "should_training_stop": false
3582
  },
3583
  "attributes": {}
3584
  }
3585
  },
3586
- "total_flos": 9.450601865596109e+16,
3587
  "train_batch_size": 1,
3588
  "trial_name": null,
3589
  "trial_params": null
 
1
  {
2
  "best_metric": 1.7760652303695679,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
+ "epoch": 0.046602846657216646,
5
  "eval_steps": 100,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3555
  "eval_samples_per_second": 9.359,
3556
  "eval_steps_per_second": 9.359,
3557
  "step": 500
3558
+ },
3559
+ {
3560
+ "epoch": 0.0389133769587759,
3561
+ "grad_norm": 1.7078206539154053,
3562
+ "learning_rate": 0.00019933712610509247,
3563
+ "loss": 1.8207,
3564
+ "step": 501
3565
+ },
3566
+ {
3567
+ "epoch": 0.03899104836987126,
3568
+ "grad_norm": 1.6380209922790527,
3569
+ "learning_rate": 0.00019933430303353816,
3570
+ "loss": 1.7694,
3571
+ "step": 502
3572
+ },
3573
+ {
3574
+ "epoch": 0.03906871978096662,
3575
+ "grad_norm": 1.7188258171081543,
3576
+ "learning_rate": 0.00019933147996198386,
3577
+ "loss": 1.7103,
3578
+ "step": 503
3579
+ },
3580
+ {
3581
+ "epoch": 0.03914639119206198,
3582
+ "grad_norm": 1.780552625656128,
3583
+ "learning_rate": 0.00019932865689042956,
3584
+ "loss": 2.0916,
3585
+ "step": 504
3586
+ },
3587
+ {
3588
+ "epoch": 0.03922406260315734,
3589
+ "grad_norm": 1.7240533828735352,
3590
+ "learning_rate": 0.00019932583381887525,
3591
+ "loss": 1.7221,
3592
+ "step": 505
3593
+ },
3594
+ {
3595
+ "epoch": 0.0393017340142527,
3596
+ "grad_norm": 1.8025785684585571,
3597
+ "learning_rate": 0.00019932299619540572,
3598
+ "loss": 1.7756,
3599
+ "step": 506
3600
+ },
3601
+ {
3602
+ "epoch": 0.03937940542534806,
3603
+ "grad_norm": 1.6106324195861816,
3604
+ "learning_rate": 0.00019932014402002096,
3605
+ "loss": 1.7496,
3606
+ "step": 507
3607
+ },
3608
+ {
3609
+ "epoch": 0.03945707683644343,
3610
+ "grad_norm": 1.6666816473007202,
3611
+ "learning_rate": 0.0001993172918446362,
3612
+ "loss": 1.658,
3613
+ "step": 508
3614
+ },
3615
+ {
3616
+ "epoch": 0.03953474824753879,
3617
+ "grad_norm": 1.9957692623138428,
3618
+ "learning_rate": 0.00019931443966925144,
3619
+ "loss": 1.8459,
3620
+ "step": 509
3621
+ },
3622
+ {
3623
+ "epoch": 0.03961241965863415,
3624
+ "grad_norm": 1.49708890914917,
3625
+ "learning_rate": 0.00019931157294195145,
3626
+ "loss": 1.4847,
3627
+ "step": 510
3628
+ },
3629
+ {
3630
+ "epoch": 0.03969009106972951,
3631
+ "grad_norm": 1.823359489440918,
3632
+ "learning_rate": 0.00019930870621465147,
3633
+ "loss": 1.7518,
3634
+ "step": 511
3635
+ },
3636
+ {
3637
+ "epoch": 0.03976776248082487,
3638
+ "grad_norm": 1.6615630388259888,
3639
+ "learning_rate": 0.00019930583948735148,
3640
+ "loss": 1.6027,
3641
+ "step": 512
3642
+ },
3643
+ {
3644
+ "epoch": 0.03984543389192023,
3645
+ "grad_norm": 2.001208782196045,
3646
+ "learning_rate": 0.00019930295820813626,
3647
+ "loss": 1.8741,
3648
+ "step": 513
3649
+ },
3650
+ {
3651
+ "epoch": 0.03992310530301559,
3652
+ "grad_norm": 1.87227463722229,
3653
+ "learning_rate": 0.00019930006237700582,
3654
+ "loss": 1.9748,
3655
+ "step": 514
3656
+ },
3657
+ {
3658
+ "epoch": 0.04000077671411095,
3659
+ "grad_norm": 1.8258823156356812,
3660
+ "learning_rate": 0.00019929716654587537,
3661
+ "loss": 1.8829,
3662
+ "step": 515
3663
+ },
3664
+ {
3665
+ "epoch": 0.04007844812520631,
3666
+ "grad_norm": 2.002673387527466,
3667
+ "learning_rate": 0.00019929427071474493,
3668
+ "loss": 1.8528,
3669
+ "step": 516
3670
+ },
3671
+ {
3672
+ "epoch": 0.040156119536301674,
3673
+ "grad_norm": 1.7360410690307617,
3674
+ "learning_rate": 0.00019929137488361448,
3675
+ "loss": 1.8319,
3676
+ "step": 517
3677
+ },
3678
+ {
3679
+ "epoch": 0.040233790947397034,
3680
+ "grad_norm": 1.4678514003753662,
3681
+ "learning_rate": 0.0001992884645005688,
3682
+ "loss": 1.7465,
3683
+ "step": 518
3684
+ },
3685
+ {
3686
+ "epoch": 0.040311462358492395,
3687
+ "grad_norm": 1.7761526107788086,
3688
+ "learning_rate": 0.00019928555411752313,
3689
+ "loss": 2.01,
3690
+ "step": 519
3691
+ },
3692
+ {
3693
+ "epoch": 0.04038913376958776,
3694
+ "grad_norm": 1.6452142000198364,
3695
+ "learning_rate": 0.00019928262918256223,
3696
+ "loss": 1.7229,
3697
+ "step": 520
3698
+ },
3699
+ {
3700
+ "epoch": 0.04046680518068312,
3701
+ "grad_norm": 1.6721060276031494,
3702
+ "learning_rate": 0.0001992796896956861,
3703
+ "loss": 1.8213,
3704
+ "step": 521
3705
+ },
3706
+ {
3707
+ "epoch": 0.04054447659177848,
3708
+ "grad_norm": 1.492349624633789,
3709
+ "learning_rate": 0.0001992767647607252,
3710
+ "loss": 1.7843,
3711
+ "step": 522
3712
+ },
3713
+ {
3714
+ "epoch": 0.040622148002873844,
3715
+ "grad_norm": 1.6278711557388306,
3716
+ "learning_rate": 0.00019927382527384907,
3717
+ "loss": 1.4859,
3718
+ "step": 523
3719
+ },
3720
+ {
3721
+ "epoch": 0.040699819413969204,
3722
+ "grad_norm": 1.5856826305389404,
3723
+ "learning_rate": 0.00019927088578697294,
3724
+ "loss": 1.7912,
3725
+ "step": 524
3726
+ },
3727
+ {
3728
+ "epoch": 0.040777490825064565,
3729
+ "grad_norm": 1.6515257358551025,
3730
+ "learning_rate": 0.00019926793174818158,
3731
+ "loss": 1.7693,
3732
+ "step": 525
3733
+ },
3734
+ {
3735
+ "epoch": 0.040855162236159925,
3736
+ "grad_norm": 1.8710856437683105,
3737
+ "learning_rate": 0.00019926497770939022,
3738
+ "loss": 1.8219,
3739
+ "step": 526
3740
+ },
3741
+ {
3742
+ "epoch": 0.040932833647255286,
3743
+ "grad_norm": 1.731695532798767,
3744
+ "learning_rate": 0.00019926200911868364,
3745
+ "loss": 1.7707,
3746
+ "step": 527
3747
+ },
3748
+ {
3749
+ "epoch": 0.041010505058350646,
3750
+ "grad_norm": 1.7034029960632324,
3751
+ "learning_rate": 0.00019925904052797705,
3752
+ "loss": 1.5997,
3753
+ "step": 528
3754
+ },
3755
+ {
3756
+ "epoch": 0.04108817646944601,
3757
+ "grad_norm": 1.6124377250671387,
3758
+ "learning_rate": 0.00019925607193727046,
3759
+ "loss": 1.657,
3760
+ "step": 529
3761
+ },
3762
+ {
3763
+ "epoch": 0.04116584788054137,
3764
+ "grad_norm": 1.540908694267273,
3765
+ "learning_rate": 0.00019925308879464865,
3766
+ "loss": 1.7859,
3767
+ "step": 530
3768
+ },
3769
+ {
3770
+ "epoch": 0.04124351929163673,
3771
+ "grad_norm": 1.5464683771133423,
3772
+ "learning_rate": 0.0001992500911001116,
3773
+ "loss": 1.7216,
3774
+ "step": 531
3775
+ },
3776
+ {
3777
+ "epoch": 0.04132119070273209,
3778
+ "grad_norm": 1.6259061098098755,
3779
+ "learning_rate": 0.0001992471079574898,
3780
+ "loss": 1.7018,
3781
+ "step": 532
3782
+ },
3783
+ {
3784
+ "epoch": 0.041398862113827456,
3785
+ "grad_norm": 1.667738914489746,
3786
+ "learning_rate": 0.00019924411026295274,
3787
+ "loss": 1.5638,
3788
+ "step": 533
3789
+ },
3790
+ {
3791
+ "epoch": 0.041476533524922816,
3792
+ "grad_norm": 1.5789062976837158,
3793
+ "learning_rate": 0.00019924109801650047,
3794
+ "loss": 1.7942,
3795
+ "step": 534
3796
+ },
3797
+ {
3798
+ "epoch": 0.04155420493601818,
3799
+ "grad_norm": 1.6071553230285645,
3800
+ "learning_rate": 0.0001992380857700482,
3801
+ "loss": 1.9271,
3802
+ "step": 535
3803
+ },
3804
+ {
3805
+ "epoch": 0.04163187634711354,
3806
+ "grad_norm": 1.4592769145965576,
3807
+ "learning_rate": 0.00019923507352359593,
3808
+ "loss": 1.7467,
3809
+ "step": 536
3810
+ },
3811
+ {
3812
+ "epoch": 0.0417095477582089,
3813
+ "grad_norm": 1.689941644668579,
3814
+ "learning_rate": 0.00019923204672522843,
3815
+ "loss": 1.936,
3816
+ "step": 537
3817
+ },
3818
+ {
3819
+ "epoch": 0.04178721916930426,
3820
+ "grad_norm": 1.7450604438781738,
3821
+ "learning_rate": 0.00019922901992686093,
3822
+ "loss": 2.0821,
3823
+ "step": 538
3824
+ },
3825
+ {
3826
+ "epoch": 0.04186489058039962,
3827
+ "grad_norm": 1.4920010566711426,
3828
+ "learning_rate": 0.00019922599312849343,
3829
+ "loss": 1.7365,
3830
+ "step": 539
3831
+ },
3832
+ {
3833
+ "epoch": 0.04194256199149498,
3834
+ "grad_norm": 1.7665345668792725,
3835
+ "learning_rate": 0.0001992229517782107,
3836
+ "loss": 1.8057,
3837
+ "step": 540
3838
+ },
3839
+ {
3840
+ "epoch": 0.04202023340259034,
3841
+ "grad_norm": 1.5312821865081787,
3842
+ "learning_rate": 0.00019921989587601274,
3843
+ "loss": 1.8117,
3844
+ "step": 541
3845
+ },
3846
+ {
3847
+ "epoch": 0.0420979048136857,
3848
+ "grad_norm": 1.6132386922836304,
3849
+ "learning_rate": 0.00019921685452573001,
3850
+ "loss": 1.7944,
3851
+ "step": 542
3852
+ },
3853
+ {
3854
+ "epoch": 0.04217557622478106,
3855
+ "grad_norm": 1.7471600770950317,
3856
+ "learning_rate": 0.00019921379862353206,
3857
+ "loss": 1.751,
3858
+ "step": 543
3859
+ },
3860
+ {
3861
+ "epoch": 0.04225324763587642,
3862
+ "grad_norm": 1.647037148475647,
3863
+ "learning_rate": 0.00019921072816941887,
3864
+ "loss": 1.9872,
3865
+ "step": 544
3866
+ },
3867
+ {
3868
+ "epoch": 0.04233091904697179,
3869
+ "grad_norm": 1.8728786706924438,
3870
+ "learning_rate": 0.00019920765771530569,
3871
+ "loss": 1.7143,
3872
+ "step": 545
3873
+ },
3874
+ {
3875
+ "epoch": 0.04240859045806715,
3876
+ "grad_norm": 1.4720885753631592,
3877
+ "learning_rate": 0.0001992045872611925,
3878
+ "loss": 1.4329,
3879
+ "step": 546
3880
+ },
3881
+ {
3882
+ "epoch": 0.04248626186916251,
3883
+ "grad_norm": 1.9787479639053345,
3884
+ "learning_rate": 0.0001992015022551641,
3885
+ "loss": 1.8716,
3886
+ "step": 547
3887
+ },
3888
+ {
3889
+ "epoch": 0.04256393328025787,
3890
+ "grad_norm": 1.5727592706680298,
3891
+ "learning_rate": 0.00019919840269722044,
3892
+ "loss": 1.7307,
3893
+ "step": 548
3894
+ },
3895
+ {
3896
+ "epoch": 0.04264160469135323,
3897
+ "grad_norm": 1.695572853088379,
3898
+ "learning_rate": 0.00019919531769119203,
3899
+ "loss": 1.8648,
3900
+ "step": 549
3901
+ },
3902
+ {
3903
+ "epoch": 0.04271927610244859,
3904
+ "grad_norm": 1.5870908498764038,
3905
+ "learning_rate": 0.0001991922181332484,
3906
+ "loss": 1.8435,
3907
+ "step": 550
3908
+ },
3909
+ {
3910
+ "epoch": 0.04279694751354395,
3911
+ "grad_norm": 1.7436878681182861,
3912
+ "learning_rate": 0.00019918910402338952,
3913
+ "loss": 1.9561,
3914
+ "step": 551
3915
+ },
3916
+ {
3917
+ "epoch": 0.04287461892463931,
3918
+ "grad_norm": 1.8657782077789307,
3919
+ "learning_rate": 0.00019918600446544588,
3920
+ "loss": 1.9407,
3921
+ "step": 552
3922
+ },
3923
+ {
3924
+ "epoch": 0.04295229033573467,
3925
+ "grad_norm": 1.8304105997085571,
3926
+ "learning_rate": 0.000199182890355587,
3927
+ "loss": 1.7994,
3928
+ "step": 553
3929
+ },
3930
+ {
3931
+ "epoch": 0.043029961746830034,
3932
+ "grad_norm": 1.7628461122512817,
3933
+ "learning_rate": 0.0001991797616938129,
3934
+ "loss": 1.8482,
3935
+ "step": 554
3936
+ },
3937
+ {
3938
+ "epoch": 0.043107633157925394,
3939
+ "grad_norm": 1.6865324974060059,
3940
+ "learning_rate": 0.0001991766330320388,
3941
+ "loss": 1.8126,
3942
+ "step": 555
3943
+ },
3944
+ {
3945
+ "epoch": 0.043185304569020755,
3946
+ "grad_norm": 1.8172606229782104,
3947
+ "learning_rate": 0.00019917348981834948,
3948
+ "loss": 1.7682,
3949
+ "step": 556
3950
+ },
3951
+ {
3952
+ "epoch": 0.043262975980116115,
3953
+ "grad_norm": 1.6614211797714233,
3954
+ "learning_rate": 0.00019917036115657538,
3955
+ "loss": 1.8664,
3956
+ "step": 557
3957
+ },
3958
+ {
3959
+ "epoch": 0.04334064739121148,
3960
+ "grad_norm": 1.6559243202209473,
3961
+ "learning_rate": 0.00019916721794288605,
3962
+ "loss": 1.8261,
3963
+ "step": 558
3964
+ },
3965
+ {
3966
+ "epoch": 0.04341831880230684,
3967
+ "grad_norm": 1.5907464027404785,
3968
+ "learning_rate": 0.00019916404562536627,
3969
+ "loss": 1.6371,
3970
+ "step": 559
3971
+ },
3972
+ {
3973
+ "epoch": 0.043495990213402204,
3974
+ "grad_norm": 1.774848461151123,
3975
+ "learning_rate": 0.00019916088785976171,
3976
+ "loss": 1.8444,
3977
+ "step": 560
3978
+ },
3979
+ {
3980
+ "epoch": 0.043573661624497564,
3981
+ "grad_norm": 1.6430023908615112,
3982
+ "learning_rate": 0.00019915773009415716,
3983
+ "loss": 1.8364,
3984
+ "step": 561
3985
+ },
3986
+ {
3987
+ "epoch": 0.043651333035592925,
3988
+ "grad_norm": 1.5221737623214722,
3989
+ "learning_rate": 0.00019915455777663738,
3990
+ "loss": 1.8028,
3991
+ "step": 562
3992
+ },
3993
+ {
3994
+ "epoch": 0.043729004446688285,
3995
+ "grad_norm": 1.879007339477539,
3996
+ "learning_rate": 0.0001991513854591176,
3997
+ "loss": 1.892,
3998
+ "step": 563
3999
+ },
4000
+ {
4001
+ "epoch": 0.043806675857783646,
4002
+ "grad_norm": 1.5331951379776,
4003
+ "learning_rate": 0.00019914819858968258,
4004
+ "loss": 1.7295,
4005
+ "step": 564
4006
+ },
4007
+ {
4008
+ "epoch": 0.043884347268879006,
4009
+ "grad_norm": 1.6735893487930298,
4010
+ "learning_rate": 0.00019914501172024757,
4011
+ "loss": 1.7931,
4012
+ "step": 565
4013
+ },
4014
+ {
4015
+ "epoch": 0.04396201867997437,
4016
+ "grad_norm": 1.6903493404388428,
4017
+ "learning_rate": 0.00019914182485081255,
4018
+ "loss": 1.6104,
4019
+ "step": 566
4020
+ },
4021
+ {
4022
+ "epoch": 0.04403969009106973,
4023
+ "grad_norm": 1.9910458326339722,
4024
+ "learning_rate": 0.00019913860887754709,
4025
+ "loss": 1.8777,
4026
+ "step": 567
4027
+ },
4028
+ {
4029
+ "epoch": 0.04411736150216509,
4030
+ "grad_norm": 1.6240683794021606,
4031
+ "learning_rate": 0.00019913540745619684,
4032
+ "loss": 1.7052,
4033
+ "step": 568
4034
+ },
4035
+ {
4036
+ "epoch": 0.04419503291326045,
4037
+ "grad_norm": 1.6185333728790283,
4038
+ "learning_rate": 0.0001991322060348466,
4039
+ "loss": 1.6544,
4040
+ "step": 569
4041
+ },
4042
+ {
4043
+ "epoch": 0.044272704324355816,
4044
+ "grad_norm": 1.7077709436416626,
4045
+ "learning_rate": 0.0001991289755096659,
4046
+ "loss": 1.5459,
4047
+ "step": 570
4048
+ },
4049
+ {
4050
+ "epoch": 0.04435037573545118,
4051
+ "grad_norm": 1.8203749656677246,
4052
+ "learning_rate": 0.00019912575953640044,
4053
+ "loss": 2.0252,
4054
+ "step": 571
4055
+ },
4056
+ {
4057
+ "epoch": 0.04442804714654654,
4058
+ "grad_norm": 1.8006839752197266,
4059
+ "learning_rate": 0.00019912252901121974,
4060
+ "loss": 2.0242,
4061
+ "step": 572
4062
+ },
4063
+ {
4064
+ "epoch": 0.0445057185576419,
4065
+ "grad_norm": 1.777665615081787,
4066
+ "learning_rate": 0.00019911928393412381,
4067
+ "loss": 1.8088,
4068
+ "step": 573
4069
+ },
4070
+ {
4071
+ "epoch": 0.04458338996873726,
4072
+ "grad_norm": 1.7679201364517212,
4073
+ "learning_rate": 0.00019911605340894312,
4074
+ "loss": 1.849,
4075
+ "step": 574
4076
+ },
4077
+ {
4078
+ "epoch": 0.04466106137983262,
4079
+ "grad_norm": 1.9602786302566528,
4080
+ "learning_rate": 0.0001991128083318472,
4081
+ "loss": 1.6431,
4082
+ "step": 575
4083
+ },
4084
+ {
4085
+ "epoch": 0.04473873279092798,
4086
+ "grad_norm": 1.8283534049987793,
4087
+ "learning_rate": 0.00019910954870283604,
4088
+ "loss": 1.7402,
4089
+ "step": 576
4090
+ },
4091
+ {
4092
+ "epoch": 0.04481640420202334,
4093
+ "grad_norm": 1.6526250839233398,
4094
+ "learning_rate": 0.00019910628907382488,
4095
+ "loss": 1.7082,
4096
+ "step": 577
4097
+ },
4098
+ {
4099
+ "epoch": 0.0448940756131187,
4100
+ "grad_norm": 2.265866756439209,
4101
+ "learning_rate": 0.0001991030148928985,
4102
+ "loss": 1.7652,
4103
+ "step": 578
4104
+ },
4105
+ {
4106
+ "epoch": 0.04497174702421406,
4107
+ "grad_norm": 1.7171560525894165,
4108
+ "learning_rate": 0.00019909975526388735,
4109
+ "loss": 1.8516,
4110
+ "step": 579
4111
+ },
4112
+ {
4113
+ "epoch": 0.04504941843530942,
4114
+ "grad_norm": 1.8056362867355347,
4115
+ "learning_rate": 0.00019909646653104573,
4116
+ "loss": 1.8199,
4117
+ "step": 580
4118
+ },
4119
+ {
4120
+ "epoch": 0.04512708984640478,
4121
+ "grad_norm": 1.7205848693847656,
4122
+ "learning_rate": 0.00019909319235011935,
4123
+ "loss": 1.6558,
4124
+ "step": 581
4125
+ },
4126
+ {
4127
+ "epoch": 0.04520476125750014,
4128
+ "grad_norm": 1.722912311553955,
4129
+ "learning_rate": 0.00019908990361727774,
4130
+ "loss": 1.8511,
4131
+ "step": 582
4132
+ },
4133
+ {
4134
+ "epoch": 0.04528243266859551,
4135
+ "grad_norm": 1.8617427349090576,
4136
+ "learning_rate": 0.0001990866003325209,
4137
+ "loss": 1.7757,
4138
+ "step": 583
4139
+ },
4140
+ {
4141
+ "epoch": 0.04536010407969087,
4142
+ "grad_norm": 1.748336672782898,
4143
+ "learning_rate": 0.00019908329704776406,
4144
+ "loss": 1.7258,
4145
+ "step": 584
4146
+ },
4147
+ {
4148
+ "epoch": 0.04543777549078623,
4149
+ "grad_norm": 1.6378040313720703,
4150
+ "learning_rate": 0.00019907999376300722,
4151
+ "loss": 1.7745,
4152
+ "step": 585
4153
+ },
4154
+ {
4155
+ "epoch": 0.04551544690188159,
4156
+ "grad_norm": 1.869744062423706,
4157
+ "learning_rate": 0.00019907669047825038,
4158
+ "loss": 1.9481,
4159
+ "step": 586
4160
+ },
4161
+ {
4162
+ "epoch": 0.04559311831297695,
4163
+ "grad_norm": 1.8933027982711792,
4164
+ "learning_rate": 0.0001990733580896631,
4165
+ "loss": 2.0266,
4166
+ "step": 587
4167
+ },
4168
+ {
4169
+ "epoch": 0.04567078972407231,
4170
+ "grad_norm": 1.7739107608795166,
4171
+ "learning_rate": 0.0001990700257010758,
4172
+ "loss": 1.974,
4173
+ "step": 588
4174
+ },
4175
+ {
4176
+ "epoch": 0.04574846113516767,
4177
+ "grad_norm": 1.6742160320281982,
4178
+ "learning_rate": 0.00019906670786440372,
4179
+ "loss": 1.7525,
4180
+ "step": 589
4181
+ },
4182
+ {
4183
+ "epoch": 0.04582613254626303,
4184
+ "grad_norm": 1.7374258041381836,
4185
+ "learning_rate": 0.0001990633609239012,
4186
+ "loss": 1.6707,
4187
+ "step": 590
4188
+ },
4189
+ {
4190
+ "epoch": 0.045903803957358394,
4191
+ "grad_norm": 1.5789968967437744,
4192
+ "learning_rate": 0.0001990600285353139,
4193
+ "loss": 1.7975,
4194
+ "step": 591
4195
+ },
4196
+ {
4197
+ "epoch": 0.045981475368453754,
4198
+ "grad_norm": 1.609497308731079,
4199
+ "learning_rate": 0.00019905668159481138,
4200
+ "loss": 1.6833,
4201
+ "step": 592
4202
+ },
4203
+ {
4204
+ "epoch": 0.046059146779549115,
4205
+ "grad_norm": 1.6820900440216064,
4206
+ "learning_rate": 0.00019905332010239363,
4207
+ "loss": 1.7528,
4208
+ "step": 593
4209
+ },
4210
+ {
4211
+ "epoch": 0.046136818190644475,
4212
+ "grad_norm": 1.68113374710083,
4213
+ "learning_rate": 0.00019904995860997587,
4214
+ "loss": 1.8833,
4215
+ "step": 594
4216
+ },
4217
+ {
4218
+ "epoch": 0.04621448960173984,
4219
+ "grad_norm": 1.8364982604980469,
4220
+ "learning_rate": 0.00019904659711755812,
4221
+ "loss": 1.851,
4222
+ "step": 595
4223
+ },
4224
+ {
4225
+ "epoch": 0.0462921610128352,
4226
+ "grad_norm": 1.7463794946670532,
4227
+ "learning_rate": 0.00019904322107322514,
4228
+ "loss": 1.6327,
4229
+ "step": 596
4230
+ },
4231
+ {
4232
+ "epoch": 0.046369832423930564,
4233
+ "grad_norm": 2.0899975299835205,
4234
+ "learning_rate": 0.00019903984502889216,
4235
+ "loss": 1.9714,
4236
+ "step": 597
4237
+ },
4238
+ {
4239
+ "epoch": 0.046447503835025925,
4240
+ "grad_norm": 1.9922312498092651,
4241
+ "learning_rate": 0.00019903646898455918,
4242
+ "loss": 1.8302,
4243
+ "step": 598
4244
+ },
4245
+ {
4246
+ "epoch": 0.046525175246121285,
4247
+ "grad_norm": 1.7716776132583618,
4248
+ "learning_rate": 0.00019903306383639574,
4249
+ "loss": 1.8257,
4250
+ "step": 599
4251
+ },
4252
+ {
4253
+ "epoch": 0.046602846657216646,
4254
+ "grad_norm": 1.7818366289138794,
4255
+ "learning_rate": 0.00019902967324014753,
4256
+ "loss": 1.8584,
4257
+ "step": 600
4258
+ },
4259
+ {
4260
+ "epoch": 0.046602846657216646,
4261
+ "eval_loss": 1.8018074035644531,
4262
+ "eval_runtime": 22.0543,
4263
+ "eval_samples_per_second": 9.386,
4264
+ "eval_steps_per_second": 9.386,
4265
+ "step": 600
4266
  }
4267
  ],
4268
  "logging_steps": 1,
 
4277
  "early_stopping_threshold": 0.0
4278
  },
4279
  "attributes": {
4280
+ "early_stopping_patience_counter": 4
4281
  }
4282
  },
4283
  "TrainerControl": {
 
4286
  "should_evaluate": false,
4287
  "should_log": false,
4288
  "should_save": true,
4289
+ "should_training_stop": true
4290
  },
4291
  "attributes": {}
4292
  }
4293
  },
4294
+ "total_flos": 1.1341547207078707e+17,
4295
  "train_batch_size": 1,
4296
  "trial_name": null,
4297
  "trial_params": null