Sabbir772 commited on
Commit
508eaf1
·
verified ·
1 Parent(s): f2340e0

Training in progress, step 62500

Browse files
last-checkpoint/config.json CHANGED
@@ -9,7 +9,6 @@
9
  "decoder_start_token_id": 0,
10
  "dense_act_fn": "gelu_new",
11
  "dropout_rate": 0.1,
12
- "dtype": "float32",
13
  "eos_token_id": 1,
14
  "feed_forward_proj": "gated-gelu",
15
  "gradient_checkpointing": false,
@@ -26,7 +25,8 @@
26
  "relative_attention_max_distance": 128,
27
  "relative_attention_num_buckets": 32,
28
  "tie_word_embeddings": false,
29
- "transformers_version": "4.57.1",
 
30
  "use_cache": true,
31
  "vocab_size": 32102
32
  }
 
9
  "decoder_start_token_id": 0,
10
  "dense_act_fn": "gelu_new",
11
  "dropout_rate": 0.1,
 
12
  "eos_token_id": 1,
13
  "feed_forward_proj": "gated-gelu",
14
  "gradient_checkpointing": false,
 
25
  "relative_attention_max_distance": 128,
26
  "relative_attention_num_buckets": 32,
27
  "tie_word_embeddings": false,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.54.1",
30
  "use_cache": true,
31
  "vocab_size": 32102
32
  }
last-checkpoint/generation_config.json CHANGED
@@ -1,8 +1,7 @@
1
  {
 
2
  "decoder_start_token_id": 0,
3
- "eos_token_id": [
4
- 1
5
- ],
6
  "pad_token_id": 0,
7
- "transformers_version": "4.57.1"
8
  }
 
1
  {
2
+ "_from_model_config": true,
3
  "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
 
 
5
  "pad_token_id": 0,
6
+ "transformers_version": "4.54.1"
7
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aabc3fced6b151fc05539357053d33bcd4755e4e98846d06507236d52dd7e5c3
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7d9005889552e029ae0f4ad6c88f14926c5f00dadfc9d159f3073b9bd1ed7e5
3
  size 990185320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8c1b4f6030f249330c726f7011b5a52b7644567b3e4c984cbf366482b29f109
3
- size 1980541387
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbc2f647bccc068bd4031b56c17c84cc33de7c9cbadfbc3408e9aad88ac8b8cf
3
+ size 1980540922
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9dafbbb5ddc2c29002833b6ed5711dc2af1472fffb6629ad247f578cd6fa9666
3
- size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3a3008648501d24764fe333294c85c928d239dc3b0530f6dceccd81d60bc59
3
+ size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7ae60dd4b0e7ec1532f15a7f6a52644b055c6120f550f8d7c91916c45516a59
3
- size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25f0268140834c71b91899b9158c0b00cd62b9c624ee206b9aa4ff7a0e9ff469
3
+ size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 22.878228782287824,
6
  "eval_steps": 500,
7
- "global_step": 62000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3696,1064 +3696,12 @@
3696
  "eval_samples_per_second": 47.97,
3697
  "eval_steps_per_second": 6.004,
3698
  "step": 50420
3699
- },
3700
- {
3701
- "epoch": 18.634686346863468,
3702
- "grad_norm": 6.383838176727295,
3703
- "learning_rate": 2.3379546652609387e-05,
3704
- "loss": 1.0453,
3705
- "step": 50500
3706
- },
3707
- {
3708
- "epoch": 18.634686346863468,
3709
- "eval_bleu": 45.041038890307775,
3710
- "eval_chrf": 69.41451182032911,
3711
- "eval_loss": 1.023424744606018,
3712
- "eval_runtime": 93.238,
3713
- "eval_samples_per_second": 8.752,
3714
- "eval_steps_per_second": 1.094,
3715
- "step": 50500
3716
- },
3717
- {
3718
- "epoch": 18.671586715867157,
3719
- "grad_norm": 3.4755825996398926,
3720
- "learning_rate": 2.332683183974697e-05,
3721
- "loss": 1.0585,
3722
- "step": 50600
3723
- },
3724
- {
3725
- "epoch": 18.70848708487085,
3726
- "grad_norm": 4.731332302093506,
3727
- "learning_rate": 2.3274117026884556e-05,
3728
- "loss": 1.0681,
3729
- "step": 50700
3730
- },
3731
- {
3732
- "epoch": 18.74538745387454,
3733
- "grad_norm": 3.3240673542022705,
3734
- "learning_rate": 2.3221402214022142e-05,
3735
- "loss": 0.9811,
3736
- "step": 50800
3737
- },
3738
- {
3739
- "epoch": 18.782287822878228,
3740
- "grad_norm": 4.01174783706665,
3741
- "learning_rate": 2.316868740115973e-05,
3742
- "loss": 0.9955,
3743
- "step": 50900
3744
- },
3745
- {
3746
- "epoch": 18.81918819188192,
3747
- "grad_norm": 3.45139741897583,
3748
- "learning_rate": 2.3115972588297315e-05,
3749
- "loss": 1.0291,
3750
- "step": 51000
3751
- },
3752
- {
3753
- "epoch": 18.81918819188192,
3754
- "eval_bleu": 45.05869645905028,
3755
- "eval_chrf": 69.40206039350805,
3756
- "eval_loss": 1.0173133611679077,
3757
- "eval_runtime": 93.4838,
3758
- "eval_samples_per_second": 8.729,
3759
- "eval_steps_per_second": 1.091,
3760
- "step": 51000
3761
- },
3762
- {
3763
- "epoch": 18.85608856088561,
3764
- "grad_norm": 5.509322643280029,
3765
- "learning_rate": 2.3063257775434898e-05,
3766
- "loss": 1.0201,
3767
- "step": 51100
3768
- },
3769
- {
3770
- "epoch": 18.8929889298893,
3771
- "grad_norm": 3.0491347312927246,
3772
- "learning_rate": 2.3010542962572484e-05,
3773
- "loss": 1.0014,
3774
- "step": 51200
3775
- },
3776
- {
3777
- "epoch": 18.929889298892988,
3778
- "grad_norm": 2.939685821533203,
3779
- "learning_rate": 2.295782814971007e-05,
3780
- "loss": 1.0372,
3781
- "step": 51300
3782
- },
3783
- {
3784
- "epoch": 18.96678966789668,
3785
- "grad_norm": 6.572051525115967,
3786
- "learning_rate": 2.2905113336847657e-05,
3787
- "loss": 1.0221,
3788
- "step": 51400
3789
- },
3790
- {
3791
- "epoch": 19.00369003690037,
3792
- "grad_norm": 6.473498821258545,
3793
- "learning_rate": 2.285239852398524e-05,
3794
- "loss": 1.0437,
3795
- "step": 51500
3796
- },
3797
- {
3798
- "epoch": 19.00369003690037,
3799
- "eval_bleu": 45.377464139800466,
3800
- "eval_chrf": 69.57987345624291,
3801
- "eval_loss": 1.0092017650604248,
3802
- "eval_runtime": 92.7751,
3803
- "eval_samples_per_second": 8.795,
3804
- "eval_steps_per_second": 1.099,
3805
- "step": 51500
3806
- },
3807
- {
3808
- "epoch": 19.04059040590406,
3809
- "grad_norm": 5.248044967651367,
3810
- "learning_rate": 2.2799683711122826e-05,
3811
- "loss": 1.0291,
3812
- "step": 51600
3813
- },
3814
- {
3815
- "epoch": 19.077490774907748,
3816
- "grad_norm": 3.91925311088562,
3817
- "learning_rate": 2.2746968898260412e-05,
3818
- "loss": 1.0228,
3819
- "step": 51700
3820
- },
3821
- {
3822
- "epoch": 19.11439114391144,
3823
- "grad_norm": 4.581681728363037,
3824
- "learning_rate": 2.2694254085398e-05,
3825
- "loss": 0.9896,
3826
- "step": 51800
3827
- },
3828
- {
3829
- "epoch": 19.15129151291513,
3830
- "grad_norm": 3.2478437423706055,
3831
- "learning_rate": 2.2641539272535585e-05,
3832
- "loss": 0.9898,
3833
- "step": 51900
3834
- },
3835
- {
3836
- "epoch": 19.18819188191882,
3837
- "grad_norm": 4.589653015136719,
3838
- "learning_rate": 2.2588824459673168e-05,
3839
- "loss": 0.9883,
3840
- "step": 52000
3841
- },
3842
- {
3843
- "epoch": 19.18819188191882,
3844
- "eval_bleu": 45.182278437943154,
3845
- "eval_chrf": 69.60795584033274,
3846
- "eval_loss": 0.9994527101516724,
3847
- "eval_runtime": 93.0344,
3848
- "eval_samples_per_second": 8.771,
3849
- "eval_steps_per_second": 1.096,
3850
- "step": 52000
3851
- },
3852
- {
3853
- "epoch": 19.225092250922508,
3854
- "grad_norm": 3.865722179412842,
3855
- "learning_rate": 2.2536109646810754e-05,
3856
- "loss": 0.9701,
3857
- "step": 52100
3858
- },
3859
- {
3860
- "epoch": 19.2619926199262,
3861
- "grad_norm": 3.5048811435699463,
3862
- "learning_rate": 2.248339483394834e-05,
3863
- "loss": 0.986,
3864
- "step": 52200
3865
- },
3866
- {
3867
- "epoch": 19.29889298892989,
3868
- "grad_norm": 4.171955585479736,
3869
- "learning_rate": 2.2430680021085927e-05,
3870
- "loss": 1.0095,
3871
- "step": 52300
3872
- },
3873
- {
3874
- "epoch": 19.33579335793358,
3875
- "grad_norm": 2.502441644668579,
3876
- "learning_rate": 2.237796520822351e-05,
3877
- "loss": 0.9996,
3878
- "step": 52400
3879
- },
3880
- {
3881
- "epoch": 19.372693726937268,
3882
- "grad_norm": 4.4848737716674805,
3883
- "learning_rate": 2.2325250395361096e-05,
3884
- "loss": 0.9496,
3885
- "step": 52500
3886
- },
3887
- {
3888
- "epoch": 19.372693726937268,
3889
- "eval_bleu": 45.2363393703102,
3890
- "eval_chrf": 69.37454521026567,
3891
- "eval_loss": 0.9982830882072449,
3892
- "eval_runtime": 93.5422,
3893
- "eval_samples_per_second": 8.723,
3894
- "eval_steps_per_second": 1.09,
3895
- "step": 52500
3896
- },
3897
- {
3898
- "epoch": 19.40959409594096,
3899
- "grad_norm": 4.602016925811768,
3900
- "learning_rate": 2.2272535582498682e-05,
3901
- "loss": 0.9874,
3902
- "step": 52600
3903
- },
3904
- {
3905
- "epoch": 19.44649446494465,
3906
- "grad_norm": 3.7375121116638184,
3907
- "learning_rate": 2.221982076963627e-05,
3908
- "loss": 0.9843,
3909
- "step": 52700
3910
- },
3911
- {
3912
- "epoch": 19.48339483394834,
3913
- "grad_norm": 3.5808184146881104,
3914
- "learning_rate": 2.2167105956773855e-05,
3915
- "loss": 1.0236,
3916
- "step": 52800
3917
- },
3918
- {
3919
- "epoch": 19.52029520295203,
3920
- "grad_norm": 1.8931940793991089,
3921
- "learning_rate": 2.2114391143911438e-05,
3922
- "loss": 1.0546,
3923
- "step": 52900
3924
- },
3925
- {
3926
- "epoch": 19.55719557195572,
3927
- "grad_norm": 3.6316375732421875,
3928
- "learning_rate": 2.2061676331049024e-05,
3929
- "loss": 0.989,
3930
- "step": 53000
3931
- },
3932
- {
3933
- "epoch": 19.55719557195572,
3934
- "eval_bleu": 45.49473361839618,
3935
- "eval_chrf": 69.6430207436413,
3936
- "eval_loss": 0.9908942580223083,
3937
- "eval_runtime": 94.2497,
3938
- "eval_samples_per_second": 8.658,
3939
- "eval_steps_per_second": 1.082,
3940
- "step": 53000
3941
- },
3942
- {
3943
- "epoch": 19.59409594095941,
3944
- "grad_norm": 4.76518440246582,
3945
- "learning_rate": 2.200896151818661e-05,
3946
- "loss": 1.0023,
3947
- "step": 53100
3948
- },
3949
- {
3950
- "epoch": 19.6309963099631,
3951
- "grad_norm": 3.2376883029937744,
3952
- "learning_rate": 2.1956246705324197e-05,
3953
- "loss": 1.0358,
3954
- "step": 53200
3955
- },
3956
- {
3957
- "epoch": 19.66789667896679,
3958
- "grad_norm": 4.3444318771362305,
3959
- "learning_rate": 2.190353189246178e-05,
3960
- "loss": 1.0315,
3961
- "step": 53300
3962
- },
3963
- {
3964
- "epoch": 19.70479704797048,
3965
- "grad_norm": 5.0184102058410645,
3966
- "learning_rate": 2.1850817079599366e-05,
3967
- "loss": 1.0637,
3968
- "step": 53400
3969
- },
3970
- {
3971
- "epoch": 19.74169741697417,
3972
- "grad_norm": 3.515033483505249,
3973
- "learning_rate": 2.1798102266736953e-05,
3974
- "loss": 1.0379,
3975
- "step": 53500
3976
- },
3977
- {
3978
- "epoch": 19.74169741697417,
3979
- "eval_bleu": 45.418676252884964,
3980
- "eval_chrf": 69.78137735109217,
3981
- "eval_loss": 0.9870654940605164,
3982
- "eval_runtime": 93.9848,
3983
- "eval_samples_per_second": 8.682,
3984
- "eval_steps_per_second": 1.085,
3985
- "step": 53500
3986
- },
3987
- {
3988
- "epoch": 19.77859778597786,
3989
- "grad_norm": 3.5330288410186768,
3990
- "learning_rate": 2.1745387453874542e-05,
3991
- "loss": 0.9752,
3992
- "step": 53600
3993
- },
3994
- {
3995
- "epoch": 19.81549815498155,
3996
- "grad_norm": 4.465782642364502,
3997
- "learning_rate": 2.1692672641012125e-05,
3998
- "loss": 1.0044,
3999
- "step": 53700
4000
- },
4001
- {
4002
- "epoch": 19.85239852398524,
4003
- "grad_norm": 5.873017311096191,
4004
- "learning_rate": 2.163995782814971e-05,
4005
- "loss": 1.0744,
4006
- "step": 53800
4007
- },
4008
- {
4009
- "epoch": 19.88929889298893,
4010
- "grad_norm": 3.305344581604004,
4011
- "learning_rate": 2.1587243015287298e-05,
4012
- "loss": 0.8986,
4013
- "step": 53900
4014
- },
4015
- {
4016
- "epoch": 19.92619926199262,
4017
- "grad_norm": 3.4520583152770996,
4018
- "learning_rate": 2.1534528202424884e-05,
4019
- "loss": 1.0354,
4020
- "step": 54000
4021
- },
4022
- {
4023
- "epoch": 19.92619926199262,
4024
- "eval_bleu": 45.40658414024313,
4025
- "eval_chrf": 69.79824864043482,
4026
- "eval_loss": 0.9898651242256165,
4027
- "eval_runtime": 93.077,
4028
- "eval_samples_per_second": 8.767,
4029
- "eval_steps_per_second": 1.096,
4030
- "step": 54000
4031
- },
4032
- {
4033
- "epoch": 19.96309963099631,
4034
- "grad_norm": 4.757823944091797,
4035
- "learning_rate": 2.148181338956247e-05,
4036
- "loss": 1.0031,
4037
- "step": 54100
4038
- },
4039
- {
4040
- "epoch": 20.0,
4041
- "grad_norm": 3.4500021934509277,
4042
- "learning_rate": 2.1429098576700054e-05,
4043
- "loss": 1.0126,
4044
- "step": 54200
4045
- },
4046
- {
4047
- "epoch": 20.03690036900369,
4048
- "grad_norm": 3.2020909786224365,
4049
- "learning_rate": 2.137638376383764e-05,
4050
- "loss": 0.96,
4051
- "step": 54300
4052
- },
4053
- {
4054
- "epoch": 20.07380073800738,
4055
- "grad_norm": 4.481809616088867,
4056
- "learning_rate": 2.1323668950975226e-05,
4057
- "loss": 0.9614,
4058
- "step": 54400
4059
- },
4060
- {
4061
- "epoch": 20.11070110701107,
4062
- "grad_norm": 3.2812538146972656,
4063
- "learning_rate": 2.1270954138112813e-05,
4064
- "loss": 0.9636,
4065
- "step": 54500
4066
- },
4067
- {
4068
- "epoch": 20.11070110701107,
4069
- "eval_bleu": 45.41666654146133,
4070
- "eval_chrf": 69.96897005750033,
4071
- "eval_loss": 0.9812939167022705,
4072
- "eval_runtime": 93.5969,
4073
- "eval_samples_per_second": 8.718,
4074
- "eval_steps_per_second": 1.09,
4075
- "step": 54500
4076
- },
4077
- {
4078
- "epoch": 20.14760147601476,
4079
- "grad_norm": 4.628011703491211,
4080
- "learning_rate": 2.1218239325250395e-05,
4081
- "loss": 0.9792,
4082
- "step": 54600
4083
- },
4084
- {
4085
- "epoch": 20.18450184501845,
4086
- "grad_norm": 4.964925765991211,
4087
- "learning_rate": 2.1165524512387982e-05,
4088
- "loss": 0.9477,
4089
- "step": 54700
4090
- },
4091
- {
4092
- "epoch": 20.22140221402214,
4093
- "grad_norm": 3.1874473094940186,
4094
- "learning_rate": 2.1112809699525568e-05,
4095
- "loss": 0.9928,
4096
- "step": 54800
4097
- },
4098
- {
4099
- "epoch": 20.25830258302583,
4100
- "grad_norm": 3.8035147190093994,
4101
- "learning_rate": 2.1060094886663154e-05,
4102
- "loss": 0.9688,
4103
- "step": 54900
4104
- },
4105
- {
4106
- "epoch": 20.29520295202952,
4107
- "grad_norm": 4.595950603485107,
4108
- "learning_rate": 2.100738007380074e-05,
4109
- "loss": 0.9575,
4110
- "step": 55000
4111
- },
4112
- {
4113
- "epoch": 20.29520295202952,
4114
- "eval_bleu": 45.26791746653591,
4115
- "eval_chrf": 69.59072341748964,
4116
- "eval_loss": 0.9762688279151917,
4117
- "eval_runtime": 93.4589,
4118
- "eval_samples_per_second": 8.731,
4119
- "eval_steps_per_second": 1.091,
4120
- "step": 55000
4121
- },
4122
- {
4123
- "epoch": 20.33210332103321,
4124
- "grad_norm": 4.018571853637695,
4125
- "learning_rate": 2.0954665260938324e-05,
4126
- "loss": 0.9425,
4127
- "step": 55100
4128
- },
4129
- {
4130
- "epoch": 20.3690036900369,
4131
- "grad_norm": 5.99881649017334,
4132
- "learning_rate": 2.090195044807591e-05,
4133
- "loss": 0.9874,
4134
- "step": 55200
4135
- },
4136
- {
4137
- "epoch": 20.40590405904059,
4138
- "grad_norm": 3.563143253326416,
4139
- "learning_rate": 2.0849235635213496e-05,
4140
- "loss": 0.9795,
4141
- "step": 55300
4142
- },
4143
- {
4144
- "epoch": 20.44280442804428,
4145
- "grad_norm": 5.876094341278076,
4146
- "learning_rate": 2.0796520822351083e-05,
4147
- "loss": 0.9751,
4148
- "step": 55400
4149
- },
4150
- {
4151
- "epoch": 20.47970479704797,
4152
- "grad_norm": 2.8638973236083984,
4153
- "learning_rate": 2.0743806009488666e-05,
4154
- "loss": 0.9223,
4155
- "step": 55500
4156
- },
4157
- {
4158
- "epoch": 20.47970479704797,
4159
- "eval_bleu": 45.916257953363164,
4160
- "eval_chrf": 69.76962902573563,
4161
- "eval_loss": 0.9757766127586365,
4162
- "eval_runtime": 92.5471,
4163
- "eval_samples_per_second": 8.817,
4164
- "eval_steps_per_second": 1.102,
4165
- "step": 55500
4166
- },
4167
- {
4168
- "epoch": 20.51660516605166,
4169
- "grad_norm": 6.471508502960205,
4170
- "learning_rate": 2.0691091196626252e-05,
4171
- "loss": 0.9933,
4172
- "step": 55600
4173
- },
4174
- {
4175
- "epoch": 20.55350553505535,
4176
- "grad_norm": 3.317532539367676,
4177
- "learning_rate": 2.0638376383763838e-05,
4178
- "loss": 0.9708,
4179
- "step": 55700
4180
- },
4181
- {
4182
- "epoch": 20.59040590405904,
4183
- "grad_norm": 3.4863741397857666,
4184
- "learning_rate": 2.0585661570901425e-05,
4185
- "loss": 0.9717,
4186
- "step": 55800
4187
- },
4188
- {
4189
- "epoch": 20.627306273062732,
4190
- "grad_norm": 3.6253013610839844,
4191
- "learning_rate": 2.053294675803901e-05,
4192
- "loss": 0.9628,
4193
- "step": 55900
4194
- },
4195
- {
4196
- "epoch": 20.66420664206642,
4197
- "grad_norm": 4.725039958953857,
4198
- "learning_rate": 2.0480231945176594e-05,
4199
- "loss": 0.9179,
4200
- "step": 56000
4201
- },
4202
- {
4203
- "epoch": 20.66420664206642,
4204
- "eval_bleu": 46.28172619461589,
4205
- "eval_chrf": 70.11693794292695,
4206
- "eval_loss": 0.968273401260376,
4207
- "eval_runtime": 93.3175,
4208
- "eval_samples_per_second": 8.744,
4209
- "eval_steps_per_second": 1.093,
4210
- "step": 56000
4211
- },
4212
- {
4213
- "epoch": 20.70110701107011,
4214
- "grad_norm": 2.480011463165283,
4215
- "learning_rate": 2.042751713231418e-05,
4216
- "loss": 0.9707,
4217
- "step": 56100
4218
- },
4219
- {
4220
- "epoch": 20.7380073800738,
4221
- "grad_norm": 4.387946605682373,
4222
- "learning_rate": 2.0374802319451766e-05,
4223
- "loss": 0.9964,
4224
- "step": 56200
4225
- },
4226
- {
4227
- "epoch": 20.774907749077492,
4228
- "grad_norm": 4.427938461303711,
4229
- "learning_rate": 2.0322087506589353e-05,
4230
- "loss": 0.947,
4231
- "step": 56300
4232
- },
4233
- {
4234
- "epoch": 20.81180811808118,
4235
- "grad_norm": 3.5348544120788574,
4236
- "learning_rate": 2.0269372693726936e-05,
4237
- "loss": 0.9539,
4238
- "step": 56400
4239
- },
4240
- {
4241
- "epoch": 20.84870848708487,
4242
- "grad_norm": 3.754854440689087,
4243
- "learning_rate": 2.0216657880864522e-05,
4244
- "loss": 0.9696,
4245
- "step": 56500
4246
- },
4247
- {
4248
- "epoch": 20.84870848708487,
4249
- "eval_bleu": 46.21577470581903,
4250
- "eval_chrf": 70.26264135399155,
4251
- "eval_loss": 0.9621157646179199,
4252
- "eval_runtime": 92.5871,
4253
- "eval_samples_per_second": 8.813,
4254
- "eval_steps_per_second": 1.102,
4255
- "step": 56500
4256
- },
4257
- {
4258
- "epoch": 20.88560885608856,
4259
- "grad_norm": 6.947758197784424,
4260
- "learning_rate": 2.016394306800211e-05,
4261
- "loss": 1.0204,
4262
- "step": 56600
4263
- },
4264
- {
4265
- "epoch": 20.922509225092252,
4266
- "grad_norm": 4.733431339263916,
4267
- "learning_rate": 2.0111228255139695e-05,
4268
- "loss": 1.0111,
4269
- "step": 56700
4270
- },
4271
- {
4272
- "epoch": 20.95940959409594,
4273
- "grad_norm": 4.140303134918213,
4274
- "learning_rate": 2.005851344227728e-05,
4275
- "loss": 0.9599,
4276
- "step": 56800
4277
- },
4278
- {
4279
- "epoch": 20.99630996309963,
4280
- "grad_norm": 6.344222068786621,
4281
- "learning_rate": 2.0005798629414864e-05,
4282
- "loss": 0.9768,
4283
- "step": 56900
4284
- },
4285
- {
4286
- "epoch": 21.03321033210332,
4287
- "grad_norm": 4.12951135635376,
4288
- "learning_rate": 1.995308381655245e-05,
4289
- "loss": 1.002,
4290
- "step": 57000
4291
- },
4292
- {
4293
- "epoch": 21.03321033210332,
4294
- "eval_bleu": 45.90122957003376,
4295
- "eval_chrf": 70.13557288840417,
4296
- "eval_loss": 0.9641706943511963,
4297
- "eval_runtime": 92.0161,
4298
- "eval_samples_per_second": 8.868,
4299
- "eval_steps_per_second": 1.109,
4300
- "step": 57000
4301
- },
4302
- {
4303
- "epoch": 21.070110701107012,
4304
- "grad_norm": 2.7685956954956055,
4305
- "learning_rate": 1.990036900369004e-05,
4306
- "loss": 0.9171,
4307
- "step": 57100
4308
- },
4309
- {
4310
- "epoch": 21.1070110701107,
4311
- "grad_norm": 3.8239712715148926,
4312
- "learning_rate": 1.9847654190827626e-05,
4313
- "loss": 0.935,
4314
- "step": 57200
4315
- },
4316
- {
4317
- "epoch": 21.14391143911439,
4318
- "grad_norm": 3.2187681198120117,
4319
- "learning_rate": 1.979493937796521e-05,
4320
- "loss": 0.9205,
4321
- "step": 57300
4322
- },
4323
- {
4324
- "epoch": 21.18081180811808,
4325
- "grad_norm": 3.0021488666534424,
4326
- "learning_rate": 1.9742224565102796e-05,
4327
- "loss": 0.9241,
4328
- "step": 57400
4329
- },
4330
- {
4331
- "epoch": 21.217712177121772,
4332
- "grad_norm": 2.5294923782348633,
4333
- "learning_rate": 1.9689509752240382e-05,
4334
- "loss": 0.9327,
4335
- "step": 57500
4336
- },
4337
- {
4338
- "epoch": 21.217712177121772,
4339
- "eval_bleu": 46.11265350722499,
4340
- "eval_chrf": 70.36260746480464,
4341
- "eval_loss": 0.9620640873908997,
4342
- "eval_runtime": 93.0763,
4343
- "eval_samples_per_second": 8.767,
4344
- "eval_steps_per_second": 1.096,
4345
- "step": 57500
4346
- },
4347
- {
4348
- "epoch": 21.25461254612546,
4349
- "grad_norm": 3.535879135131836,
4350
- "learning_rate": 1.9636794939377968e-05,
4351
- "loss": 0.9289,
4352
- "step": 57600
4353
- },
4354
- {
4355
- "epoch": 21.29151291512915,
4356
- "grad_norm": 4.959736347198486,
4357
- "learning_rate": 1.958408012651555e-05,
4358
- "loss": 0.9379,
4359
- "step": 57700
4360
- },
4361
- {
4362
- "epoch": 21.328413284132843,
4363
- "grad_norm": 4.482137203216553,
4364
- "learning_rate": 1.9531365313653138e-05,
4365
- "loss": 0.9173,
4366
- "step": 57800
4367
- },
4368
- {
4369
- "epoch": 21.365313653136532,
4370
- "grad_norm": 4.553799629211426,
4371
- "learning_rate": 1.9478650500790724e-05,
4372
- "loss": 0.8799,
4373
- "step": 57900
4374
- },
4375
- {
4376
- "epoch": 21.40221402214022,
4377
- "grad_norm": 5.203136444091797,
4378
- "learning_rate": 1.942593568792831e-05,
4379
- "loss": 0.8895,
4380
- "step": 58000
4381
- },
4382
- {
4383
- "epoch": 21.40221402214022,
4384
- "eval_bleu": 46.48714823641416,
4385
- "eval_chrf": 70.45381756168473,
4386
- "eval_loss": 0.9568957090377808,
4387
- "eval_runtime": 92.5969,
4388
- "eval_samples_per_second": 8.812,
4389
- "eval_steps_per_second": 1.102,
4390
- "step": 58000
4391
- },
4392
- {
4393
- "epoch": 21.43911439114391,
4394
- "grad_norm": 4.262024879455566,
4395
- "learning_rate": 1.9373220875065897e-05,
4396
- "loss": 0.959,
4397
- "step": 58100
4398
- },
4399
- {
4400
- "epoch": 21.476014760147603,
4401
- "grad_norm": 3.5024478435516357,
4402
- "learning_rate": 1.932050606220348e-05,
4403
- "loss": 0.9557,
4404
- "step": 58200
4405
- },
4406
- {
4407
- "epoch": 21.512915129151292,
4408
- "grad_norm": 5.715458393096924,
4409
- "learning_rate": 1.9267791249341066e-05,
4410
- "loss": 0.9356,
4411
- "step": 58300
4412
- },
4413
- {
4414
- "epoch": 21.54981549815498,
4415
- "grad_norm": 3.2510526180267334,
4416
- "learning_rate": 1.9215076436478652e-05,
4417
- "loss": 0.9177,
4418
- "step": 58400
4419
- },
4420
- {
4421
- "epoch": 21.58671586715867,
4422
- "grad_norm": 3.2294719219207764,
4423
- "learning_rate": 1.916236162361624e-05,
4424
- "loss": 0.955,
4425
- "step": 58500
4426
- },
4427
- {
4428
- "epoch": 21.58671586715867,
4429
- "eval_bleu": 46.45543563139011,
4430
- "eval_chrf": 70.41267683825878,
4431
- "eval_loss": 0.9556043148040771,
4432
- "eval_runtime": 93.5764,
4433
- "eval_samples_per_second": 8.72,
4434
- "eval_steps_per_second": 1.09,
4435
- "step": 58500
4436
- },
4437
- {
4438
- "epoch": 21.623616236162363,
4439
- "grad_norm": 4.483983039855957,
4440
- "learning_rate": 1.910964681075382e-05,
4441
- "loss": 0.9296,
4442
- "step": 58600
4443
- },
4444
- {
4445
- "epoch": 21.660516605166052,
4446
- "grad_norm": 5.226687431335449,
4447
- "learning_rate": 1.9056931997891408e-05,
4448
- "loss": 0.9644,
4449
- "step": 58700
4450
- },
4451
- {
4452
- "epoch": 21.69741697416974,
4453
- "grad_norm": 4.515336036682129,
4454
- "learning_rate": 1.9004217185028994e-05,
4455
- "loss": 0.9485,
4456
- "step": 58800
4457
- },
4458
- {
4459
- "epoch": 21.73431734317343,
4460
- "grad_norm": 4.825827121734619,
4461
- "learning_rate": 1.895150237216658e-05,
4462
- "loss": 0.9554,
4463
- "step": 58900
4464
- },
4465
- {
4466
- "epoch": 21.771217712177123,
4467
- "grad_norm": 3.3719112873077393,
4468
- "learning_rate": 1.8898787559304167e-05,
4469
- "loss": 0.937,
4470
- "step": 59000
4471
- },
4472
- {
4473
- "epoch": 21.771217712177123,
4474
- "eval_bleu": 46.87817019053264,
4475
- "eval_chrf": 70.79154520929303,
4476
- "eval_loss": 0.9485617280006409,
4477
- "eval_runtime": 92.225,
4478
- "eval_samples_per_second": 8.848,
4479
- "eval_steps_per_second": 1.106,
4480
- "step": 59000
4481
- },
4482
- {
4483
- "epoch": 21.80811808118081,
4484
- "grad_norm": 3.7261431217193604,
4485
- "learning_rate": 1.884607274644175e-05,
4486
- "loss": 0.983,
4487
- "step": 59100
4488
- },
4489
- {
4490
- "epoch": 21.8450184501845,
4491
- "grad_norm": 5.664323329925537,
4492
- "learning_rate": 1.8793357933579336e-05,
4493
- "loss": 0.9468,
4494
- "step": 59200
4495
- },
4496
- {
4497
- "epoch": 21.881918819188193,
4498
- "grad_norm": 3.194990873336792,
4499
- "learning_rate": 1.8740643120716922e-05,
4500
- "loss": 0.9457,
4501
- "step": 59300
4502
- },
4503
- {
4504
- "epoch": 21.918819188191883,
4505
- "grad_norm": 6.040603160858154,
4506
- "learning_rate": 1.868792830785451e-05,
4507
- "loss": 0.8814,
4508
- "step": 59400
4509
- },
4510
- {
4511
- "epoch": 21.95571955719557,
4512
- "grad_norm": 3.8153860569000244,
4513
- "learning_rate": 1.863521349499209e-05,
4514
- "loss": 0.8858,
4515
- "step": 59500
4516
- },
4517
- {
4518
- "epoch": 21.95571955719557,
4519
- "eval_bleu": 47.37172520377558,
4520
- "eval_chrf": 71.22515832818081,
4521
- "eval_loss": 0.9359919428825378,
4522
- "eval_runtime": 92.8284,
4523
- "eval_samples_per_second": 8.79,
4524
- "eval_steps_per_second": 1.099,
4525
- "step": 59500
4526
- },
4527
- {
4528
- "epoch": 21.99261992619926,
4529
- "grad_norm": 6.741827487945557,
4530
- "learning_rate": 1.8582498682129678e-05,
4531
- "loss": 0.9004,
4532
- "step": 59600
4533
- },
4534
- {
4535
- "epoch": 22.029520295202953,
4536
- "grad_norm": 3.961733818054199,
4537
- "learning_rate": 1.8529783869267264e-05,
4538
- "loss": 0.9092,
4539
- "step": 59700
4540
- },
4541
- {
4542
- "epoch": 22.066420664206642,
4543
- "grad_norm": 3.397977352142334,
4544
- "learning_rate": 1.847706905640485e-05,
4545
- "loss": 0.9474,
4546
- "step": 59800
4547
- },
4548
- {
4549
- "epoch": 22.10332103321033,
4550
- "grad_norm": 4.655407428741455,
4551
- "learning_rate": 1.8424354243542437e-05,
4552
- "loss": 0.9012,
4553
- "step": 59900
4554
- },
4555
- {
4556
- "epoch": 22.14022140221402,
4557
- "grad_norm": 5.649438381195068,
4558
- "learning_rate": 1.837163943068002e-05,
4559
- "loss": 0.8825,
4560
- "step": 60000
4561
- },
4562
- {
4563
- "epoch": 22.14022140221402,
4564
- "eval_bleu": 46.5005676140855,
4565
- "eval_chrf": 70.5285416310898,
4566
- "eval_loss": 0.9432012438774109,
4567
- "eval_runtime": 93.2807,
4568
- "eval_samples_per_second": 8.748,
4569
- "eval_steps_per_second": 1.093,
4570
- "step": 60000
4571
- },
4572
- {
4573
- "epoch": 22.177121771217713,
4574
- "grad_norm": 4.629878520965576,
4575
- "learning_rate": 1.8318924617817606e-05,
4576
- "loss": 0.942,
4577
- "step": 60100
4578
- },
4579
- {
4580
- "epoch": 22.214022140221402,
4581
- "grad_norm": 3.1749112606048584,
4582
- "learning_rate": 1.8266209804955192e-05,
4583
- "loss": 0.8875,
4584
- "step": 60200
4585
- },
4586
- {
4587
- "epoch": 22.25092250922509,
4588
- "grad_norm": 4.075937271118164,
4589
- "learning_rate": 1.821349499209278e-05,
4590
- "loss": 0.8974,
4591
- "step": 60300
4592
- },
4593
- {
4594
- "epoch": 22.28782287822878,
4595
- "grad_norm": 4.900486946105957,
4596
- "learning_rate": 1.816078017923036e-05,
4597
- "loss": 0.8756,
4598
- "step": 60400
4599
- },
4600
- {
4601
- "epoch": 22.324723247232473,
4602
- "grad_norm": 5.85085391998291,
4603
- "learning_rate": 1.8108065366367948e-05,
4604
- "loss": 0.9037,
4605
- "step": 60500
4606
- },
4607
- {
4608
- "epoch": 22.324723247232473,
4609
- "eval_bleu": 46.80960057807192,
4610
- "eval_chrf": 70.97934673749698,
4611
- "eval_loss": 0.9416308403015137,
4612
- "eval_runtime": 93.7729,
4613
- "eval_samples_per_second": 8.702,
4614
- "eval_steps_per_second": 1.088,
4615
- "step": 60500
4616
- },
4617
- {
4618
- "epoch": 22.361623616236162,
4619
- "grad_norm": 3.0147836208343506,
4620
- "learning_rate": 1.8055350553505538e-05,
4621
- "loss": 0.9062,
4622
- "step": 60600
4623
- },
4624
- {
4625
- "epoch": 22.39852398523985,
4626
- "grad_norm": 4.866212368011475,
4627
- "learning_rate": 1.8002635740643124e-05,
4628
- "loss": 0.8591,
4629
- "step": 60700
4630
- },
4631
- {
4632
- "epoch": 22.435424354243544,
4633
- "grad_norm": 5.024223804473877,
4634
- "learning_rate": 1.7949920927780707e-05,
4635
- "loss": 0.9472,
4636
- "step": 60800
4637
- },
4638
- {
4639
- "epoch": 22.472324723247233,
4640
- "grad_norm": 4.542778968811035,
4641
- "learning_rate": 1.7897206114918293e-05,
4642
- "loss": 0.8968,
4643
- "step": 60900
4644
- },
4645
- {
4646
- "epoch": 22.509225092250922,
4647
- "grad_norm": 4.99670934677124,
4648
- "learning_rate": 1.784449130205588e-05,
4649
- "loss": 0.8718,
4650
- "step": 61000
4651
- },
4652
- {
4653
- "epoch": 22.509225092250922,
4654
- "eval_bleu": 47.081381775180894,
4655
- "eval_chrf": 70.98627232269999,
4656
- "eval_loss": 0.9361989498138428,
4657
- "eval_runtime": 92.7139,
4658
- "eval_samples_per_second": 8.801,
4659
- "eval_steps_per_second": 1.1,
4660
- "step": 61000
4661
- },
4662
- {
4663
- "epoch": 22.54612546125461,
4664
- "grad_norm": 4.439650535583496,
4665
- "learning_rate": 1.7791776489193466e-05,
4666
- "loss": 0.9003,
4667
- "step": 61100
4668
- },
4669
- {
4670
- "epoch": 22.583025830258304,
4671
- "grad_norm": 3.1378018856048584,
4672
- "learning_rate": 1.7739061676331052e-05,
4673
- "loss": 0.8969,
4674
- "step": 61200
4675
- },
4676
- {
4677
- "epoch": 22.619926199261993,
4678
- "grad_norm": 4.780954837799072,
4679
- "learning_rate": 1.7686346863468635e-05,
4680
- "loss": 0.923,
4681
- "step": 61300
4682
- },
4683
- {
4684
- "epoch": 22.656826568265682,
4685
- "grad_norm": 3.4595189094543457,
4686
- "learning_rate": 1.763363205060622e-05,
4687
- "loss": 0.8594,
4688
- "step": 61400
4689
- },
4690
- {
4691
- "epoch": 22.69372693726937,
4692
- "grad_norm": 3.2997348308563232,
4693
- "learning_rate": 1.7580917237743808e-05,
4694
- "loss": 0.9022,
4695
- "step": 61500
4696
- },
4697
- {
4698
- "epoch": 22.69372693726937,
4699
- "eval_bleu": 47.1028817880757,
4700
- "eval_chrf": 71.06093863096652,
4701
- "eval_loss": 0.9276468753814697,
4702
- "eval_runtime": 93.0419,
4703
- "eval_samples_per_second": 8.77,
4704
- "eval_steps_per_second": 1.096,
4705
- "step": 61500
4706
- },
4707
- {
4708
- "epoch": 22.730627306273064,
4709
- "grad_norm": 3.217003583908081,
4710
- "learning_rate": 1.7528202424881394e-05,
4711
- "loss": 0.9786,
4712
- "step": 61600
4713
- },
4714
- {
4715
- "epoch": 22.767527675276753,
4716
- "grad_norm": 3.641460418701172,
4717
- "learning_rate": 1.7475487612018977e-05,
4718
- "loss": 0.9548,
4719
- "step": 61700
4720
- },
4721
- {
4722
- "epoch": 22.804428044280442,
4723
- "grad_norm": 4.382227420806885,
4724
- "learning_rate": 1.7422772799156563e-05,
4725
- "loss": 0.885,
4726
- "step": 61800
4727
- },
4728
- {
4729
- "epoch": 22.84132841328413,
4730
- "grad_norm": 4.368484973907471,
4731
- "learning_rate": 1.737005798629415e-05,
4732
- "loss": 0.8851,
4733
- "step": 61900
4734
- },
4735
- {
4736
- "epoch": 22.878228782287824,
4737
- "grad_norm": 3.0234997272491455,
4738
- "learning_rate": 1.7317343173431736e-05,
4739
- "loss": 0.8995,
4740
- "step": 62000
4741
- },
4742
- {
4743
- "epoch": 22.878228782287824,
4744
- "eval_bleu": 46.956009820086116,
4745
- "eval_chrf": 71.03846001924904,
4746
- "eval_loss": 0.9235355257987976,
4747
- "eval_runtime": 92.5602,
4748
- "eval_samples_per_second": 8.816,
4749
- "eval_steps_per_second": 1.102,
4750
- "step": 62000
4751
  }
4752
  ],
4753
  "logging_steps": 100,
4754
- "max_steps": 94850,
4755
  "num_input_tokens_seen": 0,
4756
- "num_train_epochs": 35,
4757
  "save_steps": 500,
4758
  "stateful_callbacks": {
4759
  "TrainerControl": {
@@ -4762,12 +3710,12 @@
4762
  "should_evaluate": false,
4763
  "should_log": false,
4764
  "should_save": true,
4765
- "should_training_stop": false
4766
  },
4767
  "attributes": {}
4768
  }
4769
  },
4770
- "total_flos": 1.0073932406194176e+17,
4771
  "train_batch_size": 8,
4772
  "trial_name": null,
4773
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 20.0,
6
  "eval_steps": 500,
7
+ "global_step": 50420,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3696
  "eval_samples_per_second": 47.97,
3697
  "eval_steps_per_second": 6.004,
3698
  "step": 50420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3699
  }
3700
  ],
3701
  "logging_steps": 100,
3702
+ "max_steps": 50420,
3703
  "num_input_tokens_seen": 0,
3704
+ "num_train_epochs": 20,
3705
  "save_steps": 500,
3706
  "stateful_callbacks": {
3707
  "TrainerControl": {
 
3710
  "should_evaluate": false,
3711
  "should_log": false,
3712
  "should_save": true,
3713
+ "should_training_stop": true
3714
  },
3715
  "attributes": {}
3716
  }
3717
  },
3718
+ "total_flos": 6.902420484390912e+16,
3719
  "train_batch_size": 8,
3720
  "trial_name": null,
3721
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f204478991a1e565717112446a50fbdebbfc0c2fa1e8885ad052d434d413f2f
3
- size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75188bd643b1477939c9a6819e9a5ad3a7a388c5c13a37c2fcbd210bd1b5fc26
3
+ size 5496
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aabc3fced6b151fc05539357053d33bcd4755e4e98846d06507236d52dd7e5c3
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a29e0afbdf7dacbda2541986ccaba1c2707ac1b4969c8e3a880d544bb15612e
3
  size 990185320