ErrorAI commited on
Commit
3f27c6e
·
verified ·
1 Parent(s): 7e22008

Training in progress, step 725, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd49a981d6e5d7b8d35b0518f84de65b60fa8f580c113bab8306646c42d29b5f
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a9728ad149b17d6e388d14fc0092f7866b47970310ba1ee971994a0c412caa0
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b42fbc772764711e018b1d49ab028749e0abf7ebe824e91ade036bc285d757d
3
  size 43123028
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a6be952359022b7f3bd188ecda8b1550bad03314ad34d49b5081a5290540fb3
3
  size 43123028
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1158e552c046d0c3fab18114f93625ecd57a165df28fe206cbc92bec0657cc87
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cf6ba00f59cf81a848257ca605964ca94bd8077ad494096d04d397120668e04
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7499ce06c57eaf59267f0dd19afaded848accd9f8015ec916809ba0f27f1fc8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb0dd6f595260943e4a46a32458e4a547c9bb110c47e094cd85eeb5c578a29ae
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7536231884057971,
5
  "eval_steps": 182,
6
- "global_step": 546,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3861,6 +3861,1259 @@
3861
  "eval_samples_per_second": 31.469,
3862
  "eval_steps_per_second": 15.786,
3863
  "step": 546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3864
  }
3865
  ],
3866
  "logging_steps": 1,
@@ -3875,12 +5128,12 @@
3875
  "should_evaluate": false,
3876
  "should_log": false,
3877
  "should_save": true,
3878
- "should_training_stop": false
3879
  },
3880
  "attributes": {}
3881
  }
3882
  },
3883
- "total_flos": 9.569582527492915e+16,
3884
  "train_batch_size": 2,
3885
  "trial_name": null,
3886
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0006901311249137,
5
  "eval_steps": 182,
6
+ "global_step": 725,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3861
  "eval_samples_per_second": 31.469,
3862
  "eval_steps_per_second": 15.786,
3863
  "step": 546
3864
+ },
3865
+ {
3866
+ "epoch": 0.7550034506556246,
3867
+ "grad_norm": 19.898162841796875,
3868
+ "learning_rate": 2.9056687908446102e-05,
3869
+ "loss": 4.1283,
3870
+ "step": 547
3871
+ },
3872
+ {
3873
+ "epoch": 0.756383712905452,
3874
+ "grad_norm": 20.785324096679688,
3875
+ "learning_rate": 2.8747708127098593e-05,
3876
+ "loss": 5.1127,
3877
+ "step": 548
3878
+ },
3879
+ {
3880
+ "epoch": 0.7577639751552795,
3881
+ "grad_norm": 18.006441116333008,
3882
+ "learning_rate": 2.84401039255879e-05,
3883
+ "loss": 4.9055,
3884
+ "step": 549
3885
+ },
3886
+ {
3887
+ "epoch": 0.759144237405107,
3888
+ "grad_norm": 24.335613250732422,
3889
+ "learning_rate": 2.813388124244778e-05,
3890
+ "loss": 6.7394,
3891
+ "step": 550
3892
+ },
3893
+ {
3894
+ "epoch": 0.7605244996549344,
3895
+ "grad_norm": 22.286670684814453,
3896
+ "learning_rate": 2.7829045989540593e-05,
3897
+ "loss": 5.5805,
3898
+ "step": 551
3899
+ },
3900
+ {
3901
+ "epoch": 0.7619047619047619,
3902
+ "grad_norm": 15.699090003967285,
3903
+ "learning_rate": 2.752560405194351e-05,
3904
+ "loss": 2.9697,
3905
+ "step": 552
3906
+ },
3907
+ {
3908
+ "epoch": 0.7632850241545893,
3909
+ "grad_norm": 19.606931686401367,
3910
+ "learning_rate": 2.7223561287834465e-05,
3911
+ "loss": 3.3231,
3912
+ "step": 553
3913
+ },
3914
+ {
3915
+ "epoch": 0.7646652864044169,
3916
+ "grad_norm": 17.65702247619629,
3917
+ "learning_rate": 2.692292352837942e-05,
3918
+ "loss": 6.1613,
3919
+ "step": 554
3920
+ },
3921
+ {
3922
+ "epoch": 0.7660455486542443,
3923
+ "grad_norm": 21.681224822998047,
3924
+ "learning_rate": 2.6623696577619627e-05,
3925
+ "loss": 3.443,
3926
+ "step": 555
3927
+ },
3928
+ {
3929
+ "epoch": 0.7674258109040718,
3930
+ "grad_norm": 22.102741241455078,
3931
+ "learning_rate": 2.6325886212359498e-05,
3932
+ "loss": 4.5611,
3933
+ "step": 556
3934
+ },
3935
+ {
3936
+ "epoch": 0.7688060731538993,
3937
+ "grad_norm": 21.978052139282227,
3938
+ "learning_rate": 2.602949818205539e-05,
3939
+ "loss": 5.8679,
3940
+ "step": 557
3941
+ },
3942
+ {
3943
+ "epoch": 0.7701863354037267,
3944
+ "grad_norm": 19.260032653808594,
3945
+ "learning_rate": 2.5734538208704195e-05,
3946
+ "loss": 3.9815,
3947
+ "step": 558
3948
+ },
3949
+ {
3950
+ "epoch": 0.7715665976535542,
3951
+ "grad_norm": 17.845951080322266,
3952
+ "learning_rate": 2.5441011986733166e-05,
3953
+ "loss": 3.3853,
3954
+ "step": 559
3955
+ },
3956
+ {
3957
+ "epoch": 0.7729468599033816,
3958
+ "grad_norm": 23.304794311523438,
3959
+ "learning_rate": 2.514892518288988e-05,
3960
+ "loss": 5.8599,
3961
+ "step": 560
3962
+ },
3963
+ {
3964
+ "epoch": 0.7743271221532091,
3965
+ "grad_norm": 19.359304428100586,
3966
+ "learning_rate": 2.485828343613288e-05,
3967
+ "loss": 4.678,
3968
+ "step": 561
3969
+ },
3970
+ {
3971
+ "epoch": 0.7757073844030365,
3972
+ "grad_norm": 22.408767700195312,
3973
+ "learning_rate": 2.456909235752276e-05,
3974
+ "loss": 5.0614,
3975
+ "step": 562
3976
+ },
3977
+ {
3978
+ "epoch": 0.777087646652864,
3979
+ "grad_norm": 21.053815841674805,
3980
+ "learning_rate": 2.42813575301138e-05,
3981
+ "loss": 4.851,
3982
+ "step": 563
3983
+ },
3984
+ {
3985
+ "epoch": 0.7784679089026915,
3986
+ "grad_norm": 19.501859664916992,
3987
+ "learning_rate": 2.399508450884631e-05,
3988
+ "loss": 3.8913,
3989
+ "step": 564
3990
+ },
3991
+ {
3992
+ "epoch": 0.779848171152519,
3993
+ "grad_norm": 26.093942642211914,
3994
+ "learning_rate": 2.3710278820439313e-05,
3995
+ "loss": 4.5444,
3996
+ "step": 565
3997
+ },
3998
+ {
3999
+ "epoch": 0.7812284334023465,
4000
+ "grad_norm": 16.253053665161133,
4001
+ "learning_rate": 2.3426945963283853e-05,
4002
+ "loss": 3.8306,
4003
+ "step": 566
4004
+ },
4005
+ {
4006
+ "epoch": 0.782608695652174,
4007
+ "grad_norm": 22.449371337890625,
4008
+ "learning_rate": 2.3145091407336784e-05,
4009
+ "loss": 3.9088,
4010
+ "step": 567
4011
+ },
4012
+ {
4013
+ "epoch": 0.7839889579020014,
4014
+ "grad_norm": 18.785884857177734,
4015
+ "learning_rate": 2.2864720594015288e-05,
4016
+ "loss": 3.6168,
4017
+ "step": 568
4018
+ },
4019
+ {
4020
+ "epoch": 0.7853692201518289,
4021
+ "grad_norm": 25.96500015258789,
4022
+ "learning_rate": 2.2585838936091754e-05,
4023
+ "loss": 4.7778,
4024
+ "step": 569
4025
+ },
4026
+ {
4027
+ "epoch": 0.7867494824016563,
4028
+ "grad_norm": 32.132694244384766,
4029
+ "learning_rate": 2.2308451817589283e-05,
4030
+ "loss": 7.8387,
4031
+ "step": 570
4032
+ },
4033
+ {
4034
+ "epoch": 0.7881297446514838,
4035
+ "grad_norm": 19.92507553100586,
4036
+ "learning_rate": 2.2032564593677774e-05,
4037
+ "loss": 3.7773,
4038
+ "step": 571
4039
+ },
4040
+ {
4041
+ "epoch": 0.7895100069013112,
4042
+ "grad_norm": 15.472801208496094,
4043
+ "learning_rate": 2.175818259057045e-05,
4044
+ "loss": 3.1225,
4045
+ "step": 572
4046
+ },
4047
+ {
4048
+ "epoch": 0.7908902691511387,
4049
+ "grad_norm": 18.66532325744629,
4050
+ "learning_rate": 2.148531110542118e-05,
4051
+ "loss": 3.4943,
4052
+ "step": 573
4053
+ },
4054
+ {
4055
+ "epoch": 0.7922705314009661,
4056
+ "grad_norm": 19.573230743408203,
4057
+ "learning_rate": 2.1213955406222074e-05,
4058
+ "loss": 3.7143,
4059
+ "step": 574
4060
+ },
4061
+ {
4062
+ "epoch": 0.7936507936507936,
4063
+ "grad_norm": 24.81309700012207,
4064
+ "learning_rate": 2.09441207317019e-05,
4065
+ "loss": 4.6328,
4066
+ "step": 575
4067
+ },
4068
+ {
4069
+ "epoch": 0.7950310559006211,
4070
+ "grad_norm": 21.88273811340332,
4071
+ "learning_rate": 2.0675812291224793e-05,
4072
+ "loss": 4.0822,
4073
+ "step": 576
4074
+ },
4075
+ {
4076
+ "epoch": 0.7964113181504486,
4077
+ "grad_norm": 29.533687591552734,
4078
+ "learning_rate": 2.0409035264689857e-05,
4079
+ "loss": 5.4004,
4080
+ "step": 577
4081
+ },
4082
+ {
4083
+ "epoch": 0.7977915804002761,
4084
+ "grad_norm": 21.00408172607422,
4085
+ "learning_rate": 2.0143794802431047e-05,
4086
+ "loss": 3.9792,
4087
+ "step": 578
4088
+ },
4089
+ {
4090
+ "epoch": 0.7991718426501035,
4091
+ "grad_norm": 16.558748245239258,
4092
+ "learning_rate": 1.988009602511779e-05,
4093
+ "loss": 3.084,
4094
+ "step": 579
4095
+ },
4096
+ {
4097
+ "epoch": 0.800552104899931,
4098
+ "grad_norm": 20.905330657958984,
4099
+ "learning_rate": 1.961794402365611e-05,
4100
+ "loss": 4.823,
4101
+ "step": 580
4102
+ },
4103
+ {
4104
+ "epoch": 0.8019323671497585,
4105
+ "grad_norm": 16.98253631591797,
4106
+ "learning_rate": 1.935734385909028e-05,
4107
+ "loss": 3.5759,
4108
+ "step": 581
4109
+ },
4110
+ {
4111
+ "epoch": 0.8033126293995859,
4112
+ "grad_norm": 24.210498809814453,
4113
+ "learning_rate": 1.9098300562505266e-05,
4114
+ "loss": 6.3733,
4115
+ "step": 582
4116
+ },
4117
+ {
4118
+ "epoch": 0.8046928916494134,
4119
+ "grad_norm": 18.199722290039062,
4120
+ "learning_rate": 1.8840819134929465e-05,
4121
+ "loss": 3.6873,
4122
+ "step": 583
4123
+ },
4124
+ {
4125
+ "epoch": 0.8060731538992408,
4126
+ "grad_norm": 19.356443405151367,
4127
+ "learning_rate": 1.8584904547238212e-05,
4128
+ "loss": 4.3978,
4129
+ "step": 584
4130
+ },
4131
+ {
4132
+ "epoch": 0.8074534161490683,
4133
+ "grad_norm": 21.787994384765625,
4134
+ "learning_rate": 1.833056174005784e-05,
4135
+ "loss": 4.2661,
4136
+ "step": 585
4137
+ },
4138
+ {
4139
+ "epoch": 0.8088336783988958,
4140
+ "grad_norm": 24.865665435791016,
4141
+ "learning_rate": 1.8077795623670137e-05,
4142
+ "loss": 5.5552,
4143
+ "step": 586
4144
+ },
4145
+ {
4146
+ "epoch": 0.8102139406487232,
4147
+ "grad_norm": 21.28284454345703,
4148
+ "learning_rate": 1.7826611077917842e-05,
4149
+ "loss": 5.044,
4150
+ "step": 587
4151
+ },
4152
+ {
4153
+ "epoch": 0.8115942028985508,
4154
+ "grad_norm": 17.053913116455078,
4155
+ "learning_rate": 1.757701295211014e-05,
4156
+ "loss": 3.1183,
4157
+ "step": 588
4158
+ },
4159
+ {
4160
+ "epoch": 0.8129744651483782,
4161
+ "grad_norm": 20.05781364440918,
4162
+ "learning_rate": 1.7329006064929232e-05,
4163
+ "loss": 3.9421,
4164
+ "step": 589
4165
+ },
4166
+ {
4167
+ "epoch": 0.8143547273982057,
4168
+ "grad_norm": 26.919628143310547,
4169
+ "learning_rate": 1.7082595204337182e-05,
4170
+ "loss": 7.2336,
4171
+ "step": 590
4172
+ },
4173
+ {
4174
+ "epoch": 0.8157349896480331,
4175
+ "grad_norm": 24.17670249938965,
4176
+ "learning_rate": 1.6837785127483618e-05,
4177
+ "loss": 5.4203,
4178
+ "step": 591
4179
+ },
4180
+ {
4181
+ "epoch": 0.8171152518978606,
4182
+ "grad_norm": 18.08180809020996,
4183
+ "learning_rate": 1.659458056061378e-05,
4184
+ "loss": 3.136,
4185
+ "step": 592
4186
+ },
4187
+ {
4188
+ "epoch": 0.8184955141476881,
4189
+ "grad_norm": 25.226892471313477,
4190
+ "learning_rate": 1.6352986198977325e-05,
4191
+ "loss": 5.6002,
4192
+ "step": 593
4193
+ },
4194
+ {
4195
+ "epoch": 0.8198757763975155,
4196
+ "grad_norm": 21.082447052001953,
4197
+ "learning_rate": 1.6113006706737667e-05,
4198
+ "loss": 3.8211,
4199
+ "step": 594
4200
+ },
4201
+ {
4202
+ "epoch": 0.821256038647343,
4203
+ "grad_norm": 23.459060668945312,
4204
+ "learning_rate": 1.587464671688187e-05,
4205
+ "loss": 5.1864,
4206
+ "step": 595
4207
+ },
4208
+ {
4209
+ "epoch": 0.8226363008971704,
4210
+ "grad_norm": 19.926836013793945,
4211
+ "learning_rate": 1.563791083113142e-05,
4212
+ "loss": 2.9439,
4213
+ "step": 596
4214
+ },
4215
+ {
4216
+ "epoch": 0.8240165631469979,
4217
+ "grad_norm": 21.065765380859375,
4218
+ "learning_rate": 1.540280361985308e-05,
4219
+ "loss": 5.3703,
4220
+ "step": 597
4221
+ },
4222
+ {
4223
+ "epoch": 0.8253968253968254,
4224
+ "grad_norm": 24.38242530822754,
4225
+ "learning_rate": 1.5169329621970918e-05,
4226
+ "loss": 4.0207,
4227
+ "step": 598
4228
+ },
4229
+ {
4230
+ "epoch": 0.8267770876466529,
4231
+ "grad_norm": 24.726024627685547,
4232
+ "learning_rate": 1.4937493344878473e-05,
4233
+ "loss": 6.1451,
4234
+ "step": 599
4235
+ },
4236
+ {
4237
+ "epoch": 0.8281573498964804,
4238
+ "grad_norm": 20.777873992919922,
4239
+ "learning_rate": 1.4707299264351915e-05,
4240
+ "loss": 4.3696,
4241
+ "step": 600
4242
+ },
4243
+ {
4244
+ "epoch": 0.8295376121463078,
4245
+ "grad_norm": 22.057493209838867,
4246
+ "learning_rate": 1.4478751824463543e-05,
4247
+ "loss": 5.1269,
4248
+ "step": 601
4249
+ },
4250
+ {
4251
+ "epoch": 0.8309178743961353,
4252
+ "grad_norm": 21.792823791503906,
4253
+ "learning_rate": 1.4251855437495975e-05,
4254
+ "loss": 5.1566,
4255
+ "step": 602
4256
+ },
4257
+ {
4258
+ "epoch": 0.8322981366459627,
4259
+ "grad_norm": 18.181209564208984,
4260
+ "learning_rate": 1.4026614483857036e-05,
4261
+ "loss": 4.1663,
4262
+ "step": 603
4263
+ },
4264
+ {
4265
+ "epoch": 0.8336783988957902,
4266
+ "grad_norm": 23.090852737426758,
4267
+ "learning_rate": 1.3803033311995072e-05,
4268
+ "loss": 4.3103,
4269
+ "step": 604
4270
+ },
4271
+ {
4272
+ "epoch": 0.8350586611456177,
4273
+ "grad_norm": 22.44392967224121,
4274
+ "learning_rate": 1.3581116238315195e-05,
4275
+ "loss": 4.3511,
4276
+ "step": 605
4277
+ },
4278
+ {
4279
+ "epoch": 0.8364389233954451,
4280
+ "grad_norm": 19.312456130981445,
4281
+ "learning_rate": 1.336086754709569e-05,
4282
+ "loss": 3.5947,
4283
+ "step": 606
4284
+ },
4285
+ {
4286
+ "epoch": 0.8378191856452726,
4287
+ "grad_norm": 25.088590621948242,
4288
+ "learning_rate": 1.3142291490405568e-05,
4289
+ "loss": 4.6283,
4290
+ "step": 607
4291
+ },
4292
+ {
4293
+ "epoch": 0.8391994478951,
4294
+ "grad_norm": 20.34014129638672,
4295
+ "learning_rate": 1.2925392288022298e-05,
4296
+ "loss": 4.1621,
4297
+ "step": 608
4298
+ },
4299
+ {
4300
+ "epoch": 0.8405797101449275,
4301
+ "grad_norm": 18.029075622558594,
4302
+ "learning_rate": 1.2710174127350361e-05,
4303
+ "loss": 4.1052,
4304
+ "step": 609
4305
+ },
4306
+ {
4307
+ "epoch": 0.841959972394755,
4308
+ "grad_norm": 26.527965545654297,
4309
+ "learning_rate": 1.2496641163340562e-05,
4310
+ "loss": 5.4877,
4311
+ "step": 610
4312
+ },
4313
+ {
4314
+ "epoch": 0.8433402346445825,
4315
+ "grad_norm": 17.80951690673828,
4316
+ "learning_rate": 1.2284797518409575e-05,
4317
+ "loss": 3.6808,
4318
+ "step": 611
4319
+ },
4320
+ {
4321
+ "epoch": 0.84472049689441,
4322
+ "grad_norm": 23.62249183654785,
4323
+ "learning_rate": 1.2074647282360574e-05,
4324
+ "loss": 3.8818,
4325
+ "step": 612
4326
+ },
4327
+ {
4328
+ "epoch": 0.8461007591442374,
4329
+ "grad_norm": 22.320627212524414,
4330
+ "learning_rate": 1.1866194512304073e-05,
4331
+ "loss": 4.7351,
4332
+ "step": 613
4333
+ },
4334
+ {
4335
+ "epoch": 0.8474810213940649,
4336
+ "grad_norm": 25.21207046508789,
4337
+ "learning_rate": 1.1659443232579858e-05,
4338
+ "loss": 6.1288,
4339
+ "step": 614
4340
+ },
4341
+ {
4342
+ "epoch": 0.8488612836438924,
4343
+ "grad_norm": 21.264904022216797,
4344
+ "learning_rate": 1.1454397434679021e-05,
4345
+ "loss": 4.2007,
4346
+ "step": 615
4347
+ },
4348
+ {
4349
+ "epoch": 0.8502415458937198,
4350
+ "grad_norm": 20.78594398498535,
4351
+ "learning_rate": 1.125106107716708e-05,
4352
+ "loss": 4.3806,
4353
+ "step": 616
4354
+ },
4355
+ {
4356
+ "epoch": 0.8516218081435473,
4357
+ "grad_norm": 20.06401824951172,
4358
+ "learning_rate": 1.10494380856075e-05,
4359
+ "loss": 3.2476,
4360
+ "step": 617
4361
+ },
4362
+ {
4363
+ "epoch": 0.8530020703933747,
4364
+ "grad_norm": 26.73411750793457,
4365
+ "learning_rate": 1.0849532352485903e-05,
4366
+ "loss": 5.4257,
4367
+ "step": 618
4368
+ },
4369
+ {
4370
+ "epoch": 0.8543823326432022,
4371
+ "grad_norm": 21.02422332763672,
4372
+ "learning_rate": 1.0651347737134965e-05,
4373
+ "loss": 3.4578,
4374
+ "step": 619
4375
+ },
4376
+ {
4377
+ "epoch": 0.8557625948930296,
4378
+ "grad_norm": 25.98722267150879,
4379
+ "learning_rate": 1.0454888065659774e-05,
4380
+ "loss": 4.4609,
4381
+ "step": 620
4382
+ },
4383
+ {
4384
+ "epoch": 0.8571428571428571,
4385
+ "grad_norm": 20.993576049804688,
4386
+ "learning_rate": 1.026015713086418e-05,
4387
+ "loss": 3.4471,
4388
+ "step": 621
4389
+ },
4390
+ {
4391
+ "epoch": 0.8585231193926847,
4392
+ "grad_norm": 19.924787521362305,
4393
+ "learning_rate": 1.0067158692177326e-05,
4394
+ "loss": 3.8117,
4395
+ "step": 622
4396
+ },
4397
+ {
4398
+ "epoch": 0.8599033816425121,
4399
+ "grad_norm": 21.03854751586914,
4400
+ "learning_rate": 9.875896475581348e-06,
4401
+ "loss": 4.3201,
4402
+ "step": 623
4403
+ },
4404
+ {
4405
+ "epoch": 0.8612836438923396,
4406
+ "grad_norm": 21.847026824951172,
4407
+ "learning_rate": 9.686374173539148e-06,
4408
+ "loss": 4.7991,
4409
+ "step": 624
4410
+ },
4411
+ {
4412
+ "epoch": 0.862663906142167,
4413
+ "grad_norm": 24.594127655029297,
4414
+ "learning_rate": 9.49859544492332e-06,
4415
+ "loss": 5.826,
4416
+ "step": 625
4417
+ },
4418
+ {
4419
+ "epoch": 0.8640441683919945,
4420
+ "grad_norm": 21.561784744262695,
4421
+ "learning_rate": 9.31256391494546e-06,
4422
+ "loss": 4.593,
4423
+ "step": 626
4424
+ },
4425
+ {
4426
+ "epoch": 0.865424430641822,
4427
+ "grad_norm": 22.014583587646484,
4428
+ "learning_rate": 9.128283175086105e-06,
4429
+ "loss": 4.8819,
4430
+ "step": 627
4431
+ },
4432
+ {
4433
+ "epoch": 0.8668046928916494,
4434
+ "grad_norm": 26.051342010498047,
4435
+ "learning_rate": 8.945756783025527e-06,
4436
+ "loss": 5.2834,
4437
+ "step": 628
4438
+ },
4439
+ {
4440
+ "epoch": 0.8681849551414769,
4441
+ "grad_norm": 23.40290069580078,
4442
+ "learning_rate": 8.76498826257488e-06,
4443
+ "loss": 4.1225,
4444
+ "step": 629
4445
+ },
4446
+ {
4447
+ "epoch": 0.8695652173913043,
4448
+ "grad_norm": 23.62264060974121,
4449
+ "learning_rate": 8.585981103608342e-06,
4450
+ "loss": 4.5137,
4451
+ "step": 630
4452
+ },
4453
+ {
4454
+ "epoch": 0.8709454796411318,
4455
+ "grad_norm": 17.327234268188477,
4456
+ "learning_rate": 8.408738761995649e-06,
4457
+ "loss": 3.1094,
4458
+ "step": 631
4459
+ },
4460
+ {
4461
+ "epoch": 0.8723257418909592,
4462
+ "grad_norm": 17.089147567749023,
4463
+ "learning_rate": 8.233264659535367e-06,
4464
+ "loss": 3.8834,
4465
+ "step": 632
4466
+ },
4467
+ {
4468
+ "epoch": 0.8737060041407867,
4469
+ "grad_norm": 17.930011749267578,
4470
+ "learning_rate": 8.059562183888902e-06,
4471
+ "loss": 4.0732,
4472
+ "step": 633
4473
+ },
4474
+ {
4475
+ "epoch": 0.8750862663906143,
4476
+ "grad_norm": 17.872472763061523,
4477
+ "learning_rate": 7.887634688515e-06,
4478
+ "loss": 3.4639,
4479
+ "step": 634
4480
+ },
4481
+ {
4482
+ "epoch": 0.8764665286404417,
4483
+ "grad_norm": 29.12419319152832,
4484
+ "learning_rate": 7.71748549260507e-06,
4485
+ "loss": 5.0055,
4486
+ "step": 635
4487
+ },
4488
+ {
4489
+ "epoch": 0.8778467908902692,
4490
+ "grad_norm": 15.905129432678223,
4491
+ "learning_rate": 7.549117881019141e-06,
4492
+ "loss": 4.0129,
4493
+ "step": 636
4494
+ },
4495
+ {
4496
+ "epoch": 0.8792270531400966,
4497
+ "grad_norm": 18.55198097229004,
4498
+ "learning_rate": 7.382535104222366e-06,
4499
+ "loss": 4.0759,
4500
+ "step": 637
4501
+ },
4502
+ {
4503
+ "epoch": 0.8806073153899241,
4504
+ "grad_norm": 15.301910400390625,
4505
+ "learning_rate": 7.21774037822226e-06,
4506
+ "loss": 2.5912,
4507
+ "step": 638
4508
+ },
4509
+ {
4510
+ "epoch": 0.8819875776397516,
4511
+ "grad_norm": 23.361509323120117,
4512
+ "learning_rate": 7.054736884506719e-06,
4513
+ "loss": 5.7187,
4514
+ "step": 639
4515
+ },
4516
+ {
4517
+ "epoch": 0.883367839889579,
4518
+ "grad_norm": 18.59641456604004,
4519
+ "learning_rate": 6.893527769982499e-06,
4520
+ "loss": 3.4449,
4521
+ "step": 640
4522
+ },
4523
+ {
4524
+ "epoch": 0.8847481021394065,
4525
+ "grad_norm": 25.18804931640625,
4526
+ "learning_rate": 6.734116146914516e-06,
4527
+ "loss": 4.9088,
4528
+ "step": 641
4529
+ },
4530
+ {
4531
+ "epoch": 0.8861283643892339,
4532
+ "grad_norm": 22.016386032104492,
4533
+ "learning_rate": 6.576505092865748e-06,
4534
+ "loss": 4.6404,
4535
+ "step": 642
4536
+ },
4537
+ {
4538
+ "epoch": 0.8875086266390614,
4539
+ "grad_norm": 22.610445022583008,
4540
+ "learning_rate": 6.420697650637752e-06,
4541
+ "loss": 5.5401,
4542
+ "step": 643
4543
+ },
4544
+ {
4545
+ "epoch": 0.8888888888888888,
4546
+ "grad_norm": 22.13422966003418,
4547
+ "learning_rate": 6.266696828212071e-06,
4548
+ "loss": 3.569,
4549
+ "step": 644
4550
+ },
4551
+ {
4552
+ "epoch": 0.8902691511387164,
4553
+ "grad_norm": 20.980058670043945,
4554
+ "learning_rate": 6.1145055986920105e-06,
4555
+ "loss": 5.7634,
4556
+ "step": 645
4557
+ },
4558
+ {
4559
+ "epoch": 0.8916494133885439,
4560
+ "grad_norm": 21.698143005371094,
4561
+ "learning_rate": 5.964126900245359e-06,
4562
+ "loss": 3.6239,
4563
+ "step": 646
4564
+ },
4565
+ {
4566
+ "epoch": 0.8930296756383713,
4567
+ "grad_norm": 19.81194305419922,
4568
+ "learning_rate": 5.8155636360475385e-06,
4569
+ "loss": 4.6039,
4570
+ "step": 647
4571
+ },
4572
+ {
4573
+ "epoch": 0.8944099378881988,
4574
+ "grad_norm": 15.131420135498047,
4575
+ "learning_rate": 5.668818674225685e-06,
4576
+ "loss": 2.5671,
4577
+ "step": 648
4578
+ },
4579
+ {
4580
+ "epoch": 0.8957902001380262,
4581
+ "grad_norm": 17.206186294555664,
4582
+ "learning_rate": 5.523894847803235e-06,
4583
+ "loss": 3.0977,
4584
+ "step": 649
4585
+ },
4586
+ {
4587
+ "epoch": 0.8971704623878537,
4588
+ "grad_norm": 18.428306579589844,
4589
+ "learning_rate": 5.3807949546451404e-06,
4590
+ "loss": 3.2133,
4591
+ "step": 650
4592
+ },
4593
+ {
4594
+ "epoch": 0.8985507246376812,
4595
+ "grad_norm": 15.980635643005371,
4596
+ "learning_rate": 5.23952175740402e-06,
4597
+ "loss": 3.5965,
4598
+ "step": 651
4599
+ },
4600
+ {
4601
+ "epoch": 0.8999309868875086,
4602
+ "grad_norm": 22.935367584228516,
4603
+ "learning_rate": 5.100077983466667e-06,
4604
+ "loss": 7.5044,
4605
+ "step": 652
4606
+ },
4607
+ {
4608
+ "epoch": 0.9013112491373361,
4609
+ "grad_norm": 18.72053337097168,
4610
+ "learning_rate": 4.9624663249014825e-06,
4611
+ "loss": 3.108,
4612
+ "step": 653
4613
+ },
4614
+ {
4615
+ "epoch": 0.9026915113871635,
4616
+ "grad_norm": 20.732444763183594,
4617
+ "learning_rate": 4.826689438406495e-06,
4618
+ "loss": 4.3188,
4619
+ "step": 654
4620
+ },
4621
+ {
4622
+ "epoch": 0.904071773636991,
4623
+ "grad_norm": 18.4609375,
4624
+ "learning_rate": 4.692749945258057e-06,
4625
+ "loss": 2.5593,
4626
+ "step": 655
4627
+ },
4628
+ {
4629
+ "epoch": 0.9054520358868184,
4630
+ "grad_norm": 21.71295738220215,
4631
+ "learning_rate": 4.560650431260239e-06,
4632
+ "loss": 4.629,
4633
+ "step": 656
4634
+ },
4635
+ {
4636
+ "epoch": 0.906832298136646,
4637
+ "grad_norm": 22.27686309814453,
4638
+ "learning_rate": 4.4303934466948805e-06,
4639
+ "loss": 5.1413,
4640
+ "step": 657
4641
+ },
4642
+ {
4643
+ "epoch": 0.9082125603864735,
4644
+ "grad_norm": 19.66610336303711,
4645
+ "learning_rate": 4.301981506272457e-06,
4646
+ "loss": 3.9924,
4647
+ "step": 658
4648
+ },
4649
+ {
4650
+ "epoch": 0.9095928226363009,
4651
+ "grad_norm": 17.331315994262695,
4652
+ "learning_rate": 4.175417089083378e-06,
4653
+ "loss": 3.9328,
4654
+ "step": 659
4655
+ },
4656
+ {
4657
+ "epoch": 0.9109730848861284,
4658
+ "grad_norm": 16.983257293701172,
4659
+ "learning_rate": 4.050702638550275e-06,
4660
+ "loss": 4.0252,
4661
+ "step": 660
4662
+ },
4663
+ {
4664
+ "epoch": 0.9123533471359558,
4665
+ "grad_norm": 19.764429092407227,
4666
+ "learning_rate": 3.927840562380692e-06,
4667
+ "loss": 2.7445,
4668
+ "step": 661
4669
+ },
4670
+ {
4671
+ "epoch": 0.9137336093857833,
4672
+ "grad_norm": 18.17449378967285,
4673
+ "learning_rate": 3.8068332325207457e-06,
4674
+ "loss": 3.8537,
4675
+ "step": 662
4676
+ },
4677
+ {
4678
+ "epoch": 0.9151138716356108,
4679
+ "grad_norm": 20.223976135253906,
4680
+ "learning_rate": 3.6876829851092087e-06,
4681
+ "loss": 3.5495,
4682
+ "step": 663
4683
+ },
4684
+ {
4685
+ "epoch": 0.9164941338854382,
4686
+ "grad_norm": 25.060184478759766,
4687
+ "learning_rate": 3.5703921204324865e-06,
4688
+ "loss": 5.1044,
4689
+ "step": 664
4690
+ },
4691
+ {
4692
+ "epoch": 0.9178743961352657,
4693
+ "grad_norm": 21.926902770996094,
4694
+ "learning_rate": 3.454962902880199e-06,
4695
+ "loss": 5.1936,
4696
+ "step": 665
4697
+ },
4698
+ {
4699
+ "epoch": 0.9192546583850931,
4700
+ "grad_norm": 23.566516876220703,
4701
+ "learning_rate": 3.3413975609013713e-06,
4702
+ "loss": 6.3522,
4703
+ "step": 666
4704
+ },
4705
+ {
4706
+ "epoch": 0.9206349206349206,
4707
+ "grad_norm": 30.516544342041016,
4708
+ "learning_rate": 3.2296982869616133e-06,
4709
+ "loss": 6.1352,
4710
+ "step": 667
4711
+ },
4712
+ {
4713
+ "epoch": 0.9220151828847482,
4714
+ "grad_norm": 27.719097137451172,
4715
+ "learning_rate": 3.11986723750054e-06,
4716
+ "loss": 6.3657,
4717
+ "step": 668
4718
+ },
4719
+ {
4720
+ "epoch": 0.9233954451345756,
4721
+ "grad_norm": 21.81649398803711,
4722
+ "learning_rate": 3.011906532890352e-06,
4723
+ "loss": 4.6634,
4724
+ "step": 669
4725
+ },
4726
+ {
4727
+ "epoch": 0.9247757073844031,
4728
+ "grad_norm": 19.389427185058594,
4729
+ "learning_rate": 2.905818257394799e-06,
4730
+ "loss": 3.3689,
4731
+ "step": 670
4732
+ },
4733
+ {
4734
+ "epoch": 0.9261559696342305,
4735
+ "grad_norm": 22.784374237060547,
4736
+ "learning_rate": 2.801604459128926e-06,
4737
+ "loss": 3.927,
4738
+ "step": 671
4739
+ },
4740
+ {
4741
+ "epoch": 0.927536231884058,
4742
+ "grad_norm": 23.757984161376953,
4743
+ "learning_rate": 2.6992671500196132e-06,
4744
+ "loss": 4.9386,
4745
+ "step": 672
4746
+ },
4747
+ {
4748
+ "epoch": 0.9289164941338854,
4749
+ "grad_norm": 24.24261474609375,
4750
+ "learning_rate": 2.5988083057666533e-06,
4751
+ "loss": 4.3493,
4752
+ "step": 673
4753
+ },
4754
+ {
4755
+ "epoch": 0.9302967563837129,
4756
+ "grad_norm": 21.458877563476562,
4757
+ "learning_rate": 2.5002298658046487e-06,
4758
+ "loss": 4.5866,
4759
+ "step": 674
4760
+ },
4761
+ {
4762
+ "epoch": 0.9316770186335404,
4763
+ "grad_norm": 19.24845314025879,
4764
+ "learning_rate": 2.4035337332655506e-06,
4765
+ "loss": 5.569,
4766
+ "step": 675
4767
+ },
4768
+ {
4769
+ "epoch": 0.9330572808833678,
4770
+ "grad_norm": 27.00035285949707,
4771
+ "learning_rate": 2.308721774941991e-06,
4772
+ "loss": 4.5707,
4773
+ "step": 676
4774
+ },
4775
+ {
4776
+ "epoch": 0.9344375431331953,
4777
+ "grad_norm": 23.597713470458984,
4778
+ "learning_rate": 2.2157958212510877e-06,
4779
+ "loss": 3.8271,
4780
+ "step": 677
4781
+ },
4782
+ {
4783
+ "epoch": 0.9358178053830227,
4784
+ "grad_norm": 18.218555450439453,
4785
+ "learning_rate": 2.124757666199273e-06,
4786
+ "loss": 3.6037,
4787
+ "step": 678
4788
+ },
4789
+ {
4790
+ "epoch": 0.9371980676328503,
4791
+ "grad_norm": 13.22622299194336,
4792
+ "learning_rate": 2.035609067347566e-06,
4793
+ "loss": 2.1873,
4794
+ "step": 679
4795
+ },
4796
+ {
4797
+ "epoch": 0.9385783298826778,
4798
+ "grad_norm": 18.116043090820312,
4799
+ "learning_rate": 1.9483517457776436e-06,
4800
+ "loss": 4.0654,
4801
+ "step": 680
4802
+ },
4803
+ {
4804
+ "epoch": 0.9399585921325052,
4805
+ "grad_norm": 17.52006721496582,
4806
+ "learning_rate": 1.8629873860586566e-06,
4807
+ "loss": 3.4956,
4808
+ "step": 681
4809
+ },
4810
+ {
4811
+ "epoch": 0.9413388543823327,
4812
+ "grad_norm": 22.48879051208496,
4813
+ "learning_rate": 1.779517636214678e-06,
4814
+ "loss": 3.2213,
4815
+ "step": 682
4816
+ },
4817
+ {
4818
+ "epoch": 0.9427191166321601,
4819
+ "grad_norm": 16.940576553344727,
4820
+ "learning_rate": 1.6979441076928836e-06,
4821
+ "loss": 3.7022,
4822
+ "step": 683
4823
+ },
4824
+ {
4825
+ "epoch": 0.9440993788819876,
4826
+ "grad_norm": 19.83792495727539,
4827
+ "learning_rate": 1.6182683753324435e-06,
4828
+ "loss": 6.5674,
4829
+ "step": 684
4830
+ },
4831
+ {
4832
+ "epoch": 0.945479641131815,
4833
+ "grad_norm": 24.09896469116211,
4834
+ "learning_rate": 1.5404919773341576e-06,
4835
+ "loss": 3.5441,
4836
+ "step": 685
4837
+ },
4838
+ {
4839
+ "epoch": 0.9468599033816425,
4840
+ "grad_norm": 22.972850799560547,
4841
+ "learning_rate": 1.4646164152307018e-06,
4842
+ "loss": 6.0495,
4843
+ "step": 686
4844
+ },
4845
+ {
4846
+ "epoch": 0.94824016563147,
4847
+ "grad_norm": 28.845718383789062,
4848
+ "learning_rate": 1.3906431538576626e-06,
4849
+ "loss": 6.6508,
4850
+ "step": 687
4851
+ },
4852
+ {
4853
+ "epoch": 0.9496204278812974,
4854
+ "grad_norm": 19.374427795410156,
4855
+ "learning_rate": 1.3185736213252809e-06,
4856
+ "loss": 3.758,
4857
+ "step": 688
4858
+ },
4859
+ {
4860
+ "epoch": 0.9510006901311249,
4861
+ "grad_norm": 24.994707107543945,
4862
+ "learning_rate": 1.2484092089908305e-06,
4863
+ "loss": 5.0294,
4864
+ "step": 689
4865
+ },
4866
+ {
4867
+ "epoch": 0.9523809523809523,
4868
+ "grad_norm": 23.40306282043457,
4869
+ "learning_rate": 1.1801512714318285e-06,
4870
+ "loss": 4.6954,
4871
+ "step": 690
4872
+ },
4873
+ {
4874
+ "epoch": 0.9537612146307799,
4875
+ "grad_norm": 28.18086051940918,
4876
+ "learning_rate": 1.113801126419789e-06,
4877
+ "loss": 5.1102,
4878
+ "step": 691
4879
+ },
4880
+ {
4881
+ "epoch": 0.9551414768806074,
4882
+ "grad_norm": 23.489946365356445,
4883
+ "learning_rate": 1.0493600548948878e-06,
4884
+ "loss": 4.7169,
4885
+ "step": 692
4886
+ },
4887
+ {
4888
+ "epoch": 0.9565217391304348,
4889
+ "grad_norm": 19.926145553588867,
4890
+ "learning_rate": 9.8682930094115e-07,
4891
+ "loss": 4.3657,
4892
+ "step": 693
4893
+ },
4894
+ {
4895
+ "epoch": 0.9579020013802623,
4896
+ "grad_norm": 21.520130157470703,
4897
+ "learning_rate": 9.262100717624677e-07,
4898
+ "loss": 3.7835,
4899
+ "step": 694
4900
+ },
4901
+ {
4902
+ "epoch": 0.9592822636300897,
4903
+ "grad_norm": 23.506120681762695,
4904
+ "learning_rate": 8.675035376593088e-07,
4905
+ "loss": 5.4438,
4906
+ "step": 695
4907
+ },
4908
+ {
4909
+ "epoch": 0.9606625258799172,
4910
+ "grad_norm": 39.26706314086914,
4911
+ "learning_rate": 8.107108320060674e-07,
4912
+ "loss": 5.6457,
4913
+ "step": 696
4914
+ },
4915
+ {
4916
+ "epoch": 0.9620427881297446,
4917
+ "grad_norm": 20.84963607788086,
4918
+ "learning_rate": 7.558330512292378e-07,
4919
+ "loss": 5.1845,
4920
+ "step": 697
4921
+ },
4922
+ {
4923
+ "epoch": 0.9634230503795721,
4924
+ "grad_norm": 20.25655174255371,
4925
+ "learning_rate": 7.028712547862526e-07,
4926
+ "loss": 3.9212,
4927
+ "step": 698
4928
+ },
4929
+ {
4930
+ "epoch": 0.9648033126293996,
4931
+ "grad_norm": 21.61455535888672,
4932
+ "learning_rate": 6.518264651449779e-07,
4933
+ "loss": 5.3809,
4934
+ "step": 699
4935
+ },
4936
+ {
4937
+ "epoch": 0.966183574879227,
4938
+ "grad_norm": 19.310720443725586,
4939
+ "learning_rate": 6.026996677640062e-07,
4940
+ "loss": 3.6578,
4941
+ "step": 700
4942
+ },
4943
+ {
4944
+ "epoch": 0.9675638371290545,
4945
+ "grad_norm": 20.59223175048828,
4946
+ "learning_rate": 5.554918110736273e-07,
4947
+ "loss": 4.8361,
4948
+ "step": 701
4949
+ },
4950
+ {
4951
+ "epoch": 0.968944099378882,
4952
+ "grad_norm": 18.125131607055664,
4953
+ "learning_rate": 5.102038064575099e-07,
4954
+ "loss": 2.8242,
4955
+ "step": 702
4956
+ },
4957
+ {
4958
+ "epoch": 0.9703243616287095,
4959
+ "grad_norm": 19.294221878051758,
4960
+ "learning_rate": 4.668365282351372e-07,
4961
+ "loss": 3.2637,
4962
+ "step": 703
4963
+ },
4964
+ {
4965
+ "epoch": 0.971704623878537,
4966
+ "grad_norm": 15.094534873962402,
4967
+ "learning_rate": 4.2539081364488807e-07,
4968
+ "loss": 3.3754,
4969
+ "step": 704
4970
+ },
4971
+ {
4972
+ "epoch": 0.9730848861283644,
4973
+ "grad_norm": 19.29479217529297,
4974
+ "learning_rate": 3.8586746282788244e-07,
4975
+ "loss": 4.4401,
4976
+ "step": 705
4977
+ },
4978
+ {
4979
+ "epoch": 0.9744651483781919,
4980
+ "grad_norm": 27.921985626220703,
4981
+ "learning_rate": 3.4826723881257186e-07,
4982
+ "loss": 5.3932,
4983
+ "step": 706
4984
+ },
4985
+ {
4986
+ "epoch": 0.9758454106280193,
4987
+ "grad_norm": 26.087324142456055,
4988
+ "learning_rate": 3.1259086749992895e-07,
4989
+ "loss": 5.4936,
4990
+ "step": 707
4991
+ },
4992
+ {
4993
+ "epoch": 0.9772256728778468,
4994
+ "grad_norm": 19.32054901123047,
4995
+ "learning_rate": 2.7883903764953644e-07,
4996
+ "loss": 3.5865,
4997
+ "step": 708
4998
+ },
4999
+ {
5000
+ "epoch": 0.9786059351276742,
5001
+ "grad_norm": 19.498537063598633,
5002
+ "learning_rate": 2.470124008661978e-07,
5003
+ "loss": 3.1686,
5004
+ "step": 709
5005
+ },
5006
+ {
5007
+ "epoch": 0.9799861973775017,
5008
+ "grad_norm": 22.79661750793457,
5009
+ "learning_rate": 2.171115715874139e-07,
5010
+ "loss": 4.4749,
5011
+ "step": 710
5012
+ },
5013
+ {
5014
+ "epoch": 0.9813664596273292,
5015
+ "grad_norm": 16.39584732055664,
5016
+ "learning_rate": 1.8913712707149256e-07,
5017
+ "loss": 3.414,
5018
+ "step": 711
5019
+ },
5020
+ {
5021
+ "epoch": 0.9827467218771566,
5022
+ "grad_norm": 19.80657196044922,
5023
+ "learning_rate": 1.630896073864352e-07,
5024
+ "loss": 4.0456,
5025
+ "step": 712
5026
+ },
5027
+ {
5028
+ "epoch": 0.9841269841269841,
5029
+ "grad_norm": 24.56593894958496,
5030
+ "learning_rate": 1.3896951539945636e-07,
5031
+ "loss": 5.6586,
5032
+ "step": 713
5033
+ },
5034
+ {
5035
+ "epoch": 0.9855072463768116,
5036
+ "grad_norm": 23.019926071166992,
5037
+ "learning_rate": 1.1677731676733584e-07,
5038
+ "loss": 4.3073,
5039
+ "step": 714
5040
+ },
5041
+ {
5042
+ "epoch": 0.9868875086266391,
5043
+ "grad_norm": 26.53896713256836,
5044
+ "learning_rate": 9.65134399274037e-08,
5045
+ "loss": 5.958,
5046
+ "step": 715
5047
+ },
5048
+ {
5049
+ "epoch": 0.9882677708764666,
5050
+ "grad_norm": 15.73496150970459,
5051
+ "learning_rate": 7.817827608924688e-08,
5052
+ "loss": 2.9358,
5053
+ "step": 716
5054
+ },
5055
+ {
5056
+ "epoch": 0.989648033126294,
5057
+ "grad_norm": 23.03923797607422,
5058
+ "learning_rate": 6.177217922718192e-08,
5059
+ "loss": 4.2186,
5060
+ "step": 717
5061
+ },
5062
+ {
5063
+ "epoch": 0.9910282953761215,
5064
+ "grad_norm": 20.209564208984375,
5065
+ "learning_rate": 4.7295466073427053e-08,
5066
+ "loss": 4.3526,
5067
+ "step": 718
5068
+ },
5069
+ {
5070
+ "epoch": 0.9924085576259489,
5071
+ "grad_norm": 17.545196533203125,
5072
+ "learning_rate": 3.474841611197377e-08,
5073
+ "loss": 3.5361,
5074
+ "step": 719
5075
+ },
5076
+ {
5077
+ "epoch": 0.9937888198757764,
5078
+ "grad_norm": 16.188581466674805,
5079
+ "learning_rate": 2.4131271573191172e-08,
5080
+ "loss": 2.5456,
5081
+ "step": 720
5082
+ },
5083
+ {
5084
+ "epoch": 0.9951690821256038,
5085
+ "grad_norm": 17.67585563659668,
5086
+ "learning_rate": 1.5444237429140807e-08,
5087
+ "loss": 3.162,
5088
+ "step": 721
5089
+ },
5090
+ {
5091
+ "epoch": 0.9965493443754313,
5092
+ "grad_norm": 21.626819610595703,
5093
+ "learning_rate": 8.687481389657582e-09,
5094
+ "loss": 4.8709,
5095
+ "step": 722
5096
+ },
5097
+ {
5098
+ "epoch": 0.9979296066252588,
5099
+ "grad_norm": 18.768949508666992,
5100
+ "learning_rate": 3.861133899063507e-09,
5101
+ "loss": 3.8211,
5102
+ "step": 723
5103
+ },
5104
+ {
5105
+ "epoch": 0.9993098688750862,
5106
+ "grad_norm": 20.41644287109375,
5107
+ "learning_rate": 9.65288133669695e-10,
5108
+ "loss": 3.3884,
5109
+ "step": 724
5110
+ },
5111
+ {
5112
+ "epoch": 1.0006901311249137,
5113
+ "grad_norm": 21.81919288635254,
5114
+ "learning_rate": 0.0,
5115
+ "loss": 3.69,
5116
+ "step": 725
5117
  }
5118
  ],
5119
  "logging_steps": 1,
 
5128
  "should_evaluate": false,
5129
  "should_log": false,
5130
  "should_save": true,
5131
+ "should_training_stop": true
5132
  },
5133
  "attributes": {}
5134
  }
5135
  },
5136
+ "total_flos": 1.2704672407722394e+17,
5137
  "train_batch_size": 2,
5138
  "trial_name": null,
5139
  "trial_params": null