irishprancer commited on
Commit
2b24c26
·
verified ·
1 Parent(s): 5fe3361

Training in progress, step 4050, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a7db4ae93951b0eb394bb0a363f73cd5df34f9278223503ea607797313cdef9
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93acfdd5cdd6ef89b1ed91c73d19149a93a528cd9738fccb13d443f45881ee04
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:550ddc0253077b9ade8068188ab7383f87735a416196347e817b58cdd6eecfa7
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beb77df5cad9d7d40202a2e651689ae053bffaaee0955f5237af06baab810e66
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79ae35034e3077f87418b20f4a24e69590c4f56a313fa0284d685c7f3a1b03d8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13bbe2b3622d3ee43d6da056e5089821b40ad6dddafb42e72943c6257b2fb9fb
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92a0ba1807c4ff64f4d8fc6d84a7a517689523073c7ea31a60948b80a14d9e61
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a3a16384602460333a9f0cd2b323d7cca6df1f8b990046ccd0e2a05526d44a1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.717534065246582,
3
  "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 156.52173913043478,
5
  "eval_steps": 150,
6
- "global_step": 3600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3447,6 +3447,441 @@
3447
  "EMA_steps_per_second": 24.896,
3448
  "epoch": 156.52173913043478,
3449
  "step": 3600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3450
  }
3451
  ],
3452
  "logging_steps": 10,
@@ -3466,7 +3901,7 @@
3466
  "attributes": {}
3467
  }
3468
  },
3469
- "total_flos": 9.28760054861906e+16,
3470
  "train_batch_size": 4,
3471
  "trial_name": null,
3472
  "trial_params": null
 
1
  {
2
  "best_metric": 0.717534065246582,
3
  "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 176.08695652173913,
5
  "eval_steps": 150,
6
+ "global_step": 4050,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3447
  "EMA_steps_per_second": 24.896,
3448
  "epoch": 156.52173913043478,
3449
  "step": 3600
3450
+ },
3451
+ {
3452
+ "epoch": 156.95652173913044,
3453
+ "grad_norm": 2.645265579223633,
3454
+ "learning_rate": 2.4671994267584554e-06,
3455
+ "loss": 0.198,
3456
+ "step": 3610
3457
+ },
3458
+ {
3459
+ "epoch": 157.3913043478261,
3460
+ "grad_norm": 1.8821053504943848,
3461
+ "learning_rate": 2.467196248519105e-06,
3462
+ "loss": 0.2556,
3463
+ "step": 3620
3464
+ },
3465
+ {
3466
+ "epoch": 157.82608695652175,
3467
+ "grad_norm": 1.9857944250106812,
3468
+ "learning_rate": 2.467192581323245e-06,
3469
+ "loss": 0.2351,
3470
+ "step": 3630
3471
+ },
3472
+ {
3473
+ "epoch": 158.2608695652174,
3474
+ "grad_norm": 2.166137218475342,
3475
+ "learning_rate": 2.4671884251723307e-06,
3476
+ "loss": 0.2247,
3477
+ "step": 3640
3478
+ },
3479
+ {
3480
+ "epoch": 158.69565217391303,
3481
+ "grad_norm": 3.4636070728302,
3482
+ "learning_rate": 2.467183780068008e-06,
3483
+ "loss": 0.2279,
3484
+ "step": 3650
3485
+ },
3486
+ {
3487
+ "epoch": 159.1304347826087,
3488
+ "grad_norm": 2.6010055541992188,
3489
+ "learning_rate": 2.46717864601212e-06,
3490
+ "loss": 0.2532,
3491
+ "step": 3660
3492
+ },
3493
+ {
3494
+ "epoch": 159.56521739130434,
3495
+ "grad_norm": 1.8783644437789917,
3496
+ "learning_rate": 2.4671730230067e-06,
3497
+ "loss": 0.2374,
3498
+ "step": 3670
3499
+ },
3500
+ {
3501
+ "epoch": 160.0,
3502
+ "grad_norm": 2.6553757190704346,
3503
+ "learning_rate": 2.467166911053978e-06,
3504
+ "loss": 0.2033,
3505
+ "step": 3680
3506
+ },
3507
+ {
3508
+ "epoch": 160.43478260869566,
3509
+ "grad_norm": 2.024184465408325,
3510
+ "learning_rate": 2.467160310156375e-06,
3511
+ "loss": 0.2283,
3512
+ "step": 3690
3513
+ },
3514
+ {
3515
+ "epoch": 160.8695652173913,
3516
+ "grad_norm": 1.5430588722229004,
3517
+ "learning_rate": 2.4671532203165083e-06,
3518
+ "loss": 0.2442,
3519
+ "step": 3700
3520
+ },
3521
+ {
3522
+ "epoch": 161.30434782608697,
3523
+ "grad_norm": 1.56803560256958,
3524
+ "learning_rate": 2.4671456415371886e-06,
3525
+ "loss": 0.277,
3526
+ "step": 3710
3527
+ },
3528
+ {
3529
+ "epoch": 161.7391304347826,
3530
+ "grad_norm": 2.1057708263397217,
3531
+ "learning_rate": 2.4671375738214194e-06,
3532
+ "loss": 0.2282,
3533
+ "step": 3720
3534
+ },
3535
+ {
3536
+ "epoch": 162.17391304347825,
3537
+ "grad_norm": 2.3112103939056396,
3538
+ "learning_rate": 2.4671290171723985e-06,
3539
+ "loss": 0.2158,
3540
+ "step": 3730
3541
+ },
3542
+ {
3543
+ "epoch": 162.6086956521739,
3544
+ "grad_norm": 2.1062066555023193,
3545
+ "learning_rate": 2.467119971593517e-06,
3546
+ "loss": 0.2238,
3547
+ "step": 3740
3548
+ },
3549
+ {
3550
+ "epoch": 163.04347826086956,
3551
+ "grad_norm": 2.408543586730957,
3552
+ "learning_rate": 2.4671104370883605e-06,
3553
+ "loss": 0.2239,
3554
+ "step": 3750
3555
+ },
3556
+ {
3557
+ "epoch": 163.04347826086956,
3558
+ "eval_loss": 0.9615678787231445,
3559
+ "eval_runtime": 0.5503,
3560
+ "eval_samples_per_second": 18.171,
3561
+ "eval_steps_per_second": 18.171,
3562
+ "step": 3750
3563
+ },
3564
+ {
3565
+ "Start_State_loss": 0.7309322357177734,
3566
+ "Start_State_runtime": 0.5599,
3567
+ "Start_State_samples_per_second": 17.861,
3568
+ "Start_State_steps_per_second": 17.861,
3569
+ "epoch": 163.04347826086956,
3570
+ "step": 3750
3571
+ },
3572
+ {
3573
+ "Raw_Model_loss": 0.9615678787231445,
3574
+ "Raw_Model_runtime": 0.5603,
3575
+ "Raw_Model_samples_per_second": 17.847,
3576
+ "Raw_Model_steps_per_second": 17.847,
3577
+ "epoch": 163.04347826086956,
3578
+ "step": 3750
3579
+ },
3580
+ {
3581
+ "SWA_loss": 0.8130480051040649,
3582
+ "SWA_runtime": 0.5108,
3583
+ "SWA_samples_per_second": 19.577,
3584
+ "SWA_steps_per_second": 19.577,
3585
+ "epoch": 163.04347826086956,
3586
+ "step": 3750
3587
+ },
3588
+ {
3589
+ "EMA_loss": 0.7312911748886108,
3590
+ "EMA_runtime": 0.4587,
3591
+ "EMA_samples_per_second": 21.799,
3592
+ "EMA_steps_per_second": 21.799,
3593
+ "epoch": 163.04347826086956,
3594
+ "step": 3750
3595
+ },
3596
+ {
3597
+ "epoch": 163.47826086956522,
3598
+ "grad_norm": 1.994370698928833,
3599
+ "learning_rate": 2.467100413660709e-06,
3600
+ "loss": 0.2369,
3601
+ "step": 3760
3602
+ },
3603
+ {
3604
+ "epoch": 163.91304347826087,
3605
+ "grad_norm": 2.161099672317505,
3606
+ "learning_rate": 2.467089901314535e-06,
3607
+ "loss": 0.2382,
3608
+ "step": 3770
3609
+ },
3610
+ {
3611
+ "epoch": 164.34782608695653,
3612
+ "grad_norm": 1.2869818210601807,
3613
+ "learning_rate": 2.467078900054005e-06,
3614
+ "loss": 0.2242,
3615
+ "step": 3780
3616
+ },
3617
+ {
3618
+ "epoch": 164.7826086956522,
3619
+ "grad_norm": 2.0708134174346924,
3620
+ "learning_rate": 2.4670674098834794e-06,
3621
+ "loss": 0.2331,
3622
+ "step": 3790
3623
+ },
3624
+ {
3625
+ "epoch": 165.2173913043478,
3626
+ "grad_norm": 2.1501667499542236,
3627
+ "learning_rate": 2.467055430807513e-06,
3628
+ "loss": 0.2612,
3629
+ "step": 3800
3630
+ },
3631
+ {
3632
+ "epoch": 165.65217391304347,
3633
+ "grad_norm": 1.9154974222183228,
3634
+ "learning_rate": 2.4670429628308534e-06,
3635
+ "loss": 0.2377,
3636
+ "step": 3810
3637
+ },
3638
+ {
3639
+ "epoch": 166.08695652173913,
3640
+ "grad_norm": 2.9878361225128174,
3641
+ "learning_rate": 2.4670300059584434e-06,
3642
+ "loss": 0.1821,
3643
+ "step": 3820
3644
+ },
3645
+ {
3646
+ "epoch": 166.52173913043478,
3647
+ "grad_norm": 2.1303274631500244,
3648
+ "learning_rate": 2.4670165601954175e-06,
3649
+ "loss": 0.2576,
3650
+ "step": 3830
3651
+ },
3652
+ {
3653
+ "epoch": 166.95652173913044,
3654
+ "grad_norm": 1.6496716737747192,
3655
+ "learning_rate": 2.467002625547106e-06,
3656
+ "loss": 0.236,
3657
+ "step": 3840
3658
+ },
3659
+ {
3660
+ "epoch": 167.3913043478261,
3661
+ "grad_norm": 2.8413376808166504,
3662
+ "learning_rate": 2.466988202019032e-06,
3663
+ "loss": 0.2196,
3664
+ "step": 3850
3665
+ },
3666
+ {
3667
+ "epoch": 167.82608695652175,
3668
+ "grad_norm": 3.063800096511841,
3669
+ "learning_rate": 2.4669732896169126e-06,
3670
+ "loss": 0.2183,
3671
+ "step": 3860
3672
+ },
3673
+ {
3674
+ "epoch": 168.2608695652174,
3675
+ "grad_norm": 2.015465259552002,
3676
+ "learning_rate": 2.4669578883466584e-06,
3677
+ "loss": 0.2874,
3678
+ "step": 3870
3679
+ },
3680
+ {
3681
+ "epoch": 168.69565217391303,
3682
+ "grad_norm": 1.853532314300537,
3683
+ "learning_rate": 2.466941998214374e-06,
3684
+ "loss": 0.2132,
3685
+ "step": 3880
3686
+ },
3687
+ {
3688
+ "epoch": 169.1304347826087,
3689
+ "grad_norm": 2.128767251968384,
3690
+ "learning_rate": 2.466925619226358e-06,
3691
+ "loss": 0.1985,
3692
+ "step": 3890
3693
+ },
3694
+ {
3695
+ "epoch": 169.56521739130434,
3696
+ "grad_norm": 1.844394564628601,
3697
+ "learning_rate": 2.466908751389102e-06,
3698
+ "loss": 0.2458,
3699
+ "step": 3900
3700
+ },
3701
+ {
3702
+ "epoch": 169.56521739130434,
3703
+ "eval_loss": 0.9633600115776062,
3704
+ "eval_runtime": 0.5853,
3705
+ "eval_samples_per_second": 17.085,
3706
+ "eval_steps_per_second": 17.085,
3707
+ "step": 3900
3708
+ },
3709
+ {
3710
+ "Start_State_loss": 0.7309322357177734,
3711
+ "Start_State_runtime": 0.4434,
3712
+ "Start_State_samples_per_second": 22.551,
3713
+ "Start_State_steps_per_second": 22.551,
3714
+ "epoch": 169.56521739130434,
3715
+ "step": 3900
3716
+ },
3717
+ {
3718
+ "Raw_Model_loss": 0.9633600115776062,
3719
+ "Raw_Model_runtime": 0.5336,
3720
+ "Raw_Model_samples_per_second": 18.74,
3721
+ "Raw_Model_steps_per_second": 18.74,
3722
+ "epoch": 169.56521739130434,
3723
+ "step": 3900
3724
+ },
3725
+ {
3726
+ "SWA_loss": 0.8171514272689819,
3727
+ "SWA_runtime": 0.5299,
3728
+ "SWA_samples_per_second": 18.872,
3729
+ "SWA_steps_per_second": 18.872,
3730
+ "epoch": 169.56521739130434,
3731
+ "step": 3900
3732
+ },
3733
+ {
3734
+ "EMA_loss": 0.7304112315177917,
3735
+ "EMA_runtime": 0.4785,
3736
+ "EMA_samples_per_second": 20.898,
3737
+ "EMA_steps_per_second": 20.898,
3738
+ "epoch": 169.56521739130434,
3739
+ "step": 3900
3740
+ },
3741
+ {
3742
+ "epoch": 170.0,
3743
+ "grad_norm": 3.224315881729126,
3744
+ "learning_rate": 2.4668913947092922e-06,
3745
+ "loss": 0.2318,
3746
+ "step": 3910
3747
+ },
3748
+ {
3749
+ "epoch": 170.43478260869566,
3750
+ "grad_norm": 1.8782731294631958,
3751
+ "learning_rate": 2.466873549193808e-06,
3752
+ "loss": 0.2355,
3753
+ "step": 3920
3754
+ },
3755
+ {
3756
+ "epoch": 170.8695652173913,
3757
+ "grad_norm": 2.6162474155426025,
3758
+ "learning_rate": 2.4668552148497236e-06,
3759
+ "loss": 0.2346,
3760
+ "step": 3930
3761
+ },
3762
+ {
3763
+ "epoch": 171.30434782608697,
3764
+ "grad_norm": 2.095191478729248,
3765
+ "learning_rate": 2.466836391684305e-06,
3766
+ "loss": 0.2302,
3767
+ "step": 3940
3768
+ },
3769
+ {
3770
+ "epoch": 171.7391304347826,
3771
+ "grad_norm": 1.8082070350646973,
3772
+ "learning_rate": 2.4668170797050144e-06,
3773
+ "loss": 0.2509,
3774
+ "step": 3950
3775
+ },
3776
+ {
3777
+ "epoch": 172.17391304347825,
3778
+ "grad_norm": 2.0523226261138916,
3779
+ "learning_rate": 2.466797278919505e-06,
3780
+ "loss": 0.2415,
3781
+ "step": 3960
3782
+ },
3783
+ {
3784
+ "epoch": 172.6086956521739,
3785
+ "grad_norm": 2.007352352142334,
3786
+ "learning_rate": 2.466776989335626e-06,
3787
+ "loss": 0.2341,
3788
+ "step": 3970
3789
+ },
3790
+ {
3791
+ "epoch": 173.04347826086956,
3792
+ "grad_norm": 1.9718679189682007,
3793
+ "learning_rate": 2.4667562109614197e-06,
3794
+ "loss": 0.2143,
3795
+ "step": 3980
3796
+ },
3797
+ {
3798
+ "epoch": 173.47826086956522,
3799
+ "grad_norm": 2.318025827407837,
3800
+ "learning_rate": 2.466734943805121e-06,
3801
+ "loss": 0.2391,
3802
+ "step": 3990
3803
+ },
3804
+ {
3805
+ "epoch": 173.91304347826087,
3806
+ "grad_norm": 2.0616047382354736,
3807
+ "learning_rate": 2.466713187875161e-06,
3808
+ "loss": 0.2474,
3809
+ "step": 4000
3810
+ },
3811
+ {
3812
+ "epoch": 174.34782608695653,
3813
+ "grad_norm": 2.416261672973633,
3814
+ "learning_rate": 2.4666909431801617e-06,
3815
+ "loss": 0.2285,
3816
+ "step": 4010
3817
+ },
3818
+ {
3819
+ "epoch": 174.7826086956522,
3820
+ "grad_norm": 2.434659719467163,
3821
+ "learning_rate": 2.466668209728941e-06,
3822
+ "loss": 0.2296,
3823
+ "step": 4020
3824
+ },
3825
+ {
3826
+ "epoch": 175.2173913043478,
3827
+ "grad_norm": 1.9978010654449463,
3828
+ "learning_rate": 2.46664498753051e-06,
3829
+ "loss": 0.2333,
3830
+ "step": 4030
3831
+ },
3832
+ {
3833
+ "epoch": 175.65217391304347,
3834
+ "grad_norm": 2.245964765548706,
3835
+ "learning_rate": 2.466621276594072e-06,
3836
+ "loss": 0.2148,
3837
+ "step": 4040
3838
+ },
3839
+ {
3840
+ "epoch": 176.08695652173913,
3841
+ "grad_norm": 1.842283010482788,
3842
+ "learning_rate": 2.466597076929027e-06,
3843
+ "loss": 0.2419,
3844
+ "step": 4050
3845
+ },
3846
+ {
3847
+ "epoch": 176.08695652173913,
3848
+ "eval_loss": 0.97156822681427,
3849
+ "eval_runtime": 0.4085,
3850
+ "eval_samples_per_second": 24.483,
3851
+ "eval_steps_per_second": 24.483,
3852
+ "step": 4050
3853
+ },
3854
+ {
3855
+ "Start_State_loss": 0.7309322357177734,
3856
+ "Start_State_runtime": 0.397,
3857
+ "Start_State_samples_per_second": 25.187,
3858
+ "Start_State_steps_per_second": 25.187,
3859
+ "epoch": 176.08695652173913,
3860
+ "step": 4050
3861
+ },
3862
+ {
3863
+ "Raw_Model_loss": 0.97156822681427,
3864
+ "Raw_Model_runtime": 0.4199,
3865
+ "Raw_Model_samples_per_second": 23.816,
3866
+ "Raw_Model_steps_per_second": 23.816,
3867
+ "epoch": 176.08695652173913,
3868
+ "step": 4050
3869
+ },
3870
+ {
3871
+ "SWA_loss": 0.8219975233078003,
3872
+ "SWA_runtime": 0.4136,
3873
+ "SWA_samples_per_second": 24.176,
3874
+ "SWA_steps_per_second": 24.176,
3875
+ "epoch": 176.08695652173913,
3876
+ "step": 4050
3877
+ },
3878
+ {
3879
+ "EMA_loss": 0.7310546040534973,
3880
+ "EMA_runtime": 0.3998,
3881
+ "EMA_samples_per_second": 25.013,
3882
+ "EMA_steps_per_second": 25.013,
3883
+ "epoch": 176.08695652173913,
3884
+ "step": 4050
3885
  }
3886
  ],
3887
  "logging_steps": 10,
 
3901
  "attributes": {}
3902
  }
3903
  },
3904
+ "total_flos": 1.0440404596622131e+17,
3905
  "train_batch_size": 4,
3906
  "trial_name": null,
3907
  "trial_params": null