ErrorAI commited on
Commit
e89d7e2
·
verified ·
1 Parent(s): 8b15882

Training in progress, step 831, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:966d2a3ec027061a7d2959b9f01d6883af2d5ddcefab3bd12005c1dcd090bd08
3
  size 25271744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1dbd19107319c058bd00055bb44a8c7f5eb208b888d6b52575bd7776291612d
3
  size 25271744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de283ffa0aeffaa0d88721ab5957b4cf88289ac7ca5d625abe8c3aac9ad3350b
3
  size 13685836
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70faa927d2d54c8ea7e7448c78084d6e6bba62b12460f6e68ddb25a4eda68fbd
3
  size 13685836
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bc6b2e5f5d5204fd29592bb4982b6258c4188ad9b74db13340f4a4a5085bb01
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2738afd85fa000763436a2d636f7e58177e0ad5cf69c1697cda8d874f4897ae9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f385ce5b80ae8a55b98f980bb378ed51c18254c7dfa913dbd38fc5080060a2a7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:896d4cf2739e2a4f3e21a7eda08b38acf92ea653e52c8f1d68438ebfb7a26d99
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5002257336343116,
5
  "eval_steps": 277,
6
- "global_step": 554,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3909,6 +3909,1953 @@
3909
  "eval_samples_per_second": 55.531,
3910
  "eval_steps_per_second": 27.825,
3911
  "step": 554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3912
  }
3913
  ],
3914
  "logging_steps": 1,
@@ -3928,7 +5875,7 @@
3928
  "attributes": {}
3929
  }
3930
  },
3931
- "total_flos": 1.4170875595063296e+16,
3932
  "train_batch_size": 2,
3933
  "trial_name": null,
3934
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7503386004514673,
5
  "eval_steps": 277,
6
+ "global_step": 831,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3909
  "eval_samples_per_second": 55.531,
3910
  "eval_steps_per_second": 27.825,
3911
  "step": 554
3912
+ },
3913
+ {
3914
+ "epoch": 0.5011286681715575,
3915
+ "grad_norm": 1.9571224451065063,
3916
+ "learning_rate": 0.00010114445321365482,
3917
+ "loss": 1.4151,
3918
+ "step": 555
3919
+ },
3920
+ {
3921
+ "epoch": 0.5020316027088036,
3922
+ "grad_norm": 1.3597534894943237,
3923
+ "learning_rate": 0.00010085834810820871,
3924
+ "loss": 1.1126,
3925
+ "step": 556
3926
+ },
3927
+ {
3928
+ "epoch": 0.5029345372460496,
3929
+ "grad_norm": 1.4733638763427734,
3930
+ "learning_rate": 0.00010057223597595243,
3931
+ "loss": 1.2403,
3932
+ "step": 557
3933
+ },
3934
+ {
3935
+ "epoch": 0.5038374717832957,
3936
+ "grad_norm": 1.2308521270751953,
3937
+ "learning_rate": 0.00010028611915912405,
3938
+ "loss": 0.7117,
3939
+ "step": 558
3940
+ },
3941
+ {
3942
+ "epoch": 0.5047404063205417,
3943
+ "grad_norm": 0.9335219860076904,
3944
+ "learning_rate": 0.0001,
3945
+ "loss": 0.6408,
3946
+ "step": 559
3947
+ },
3948
+ {
3949
+ "epoch": 0.5056433408577878,
3950
+ "grad_norm": 1.7657417058944702,
3951
+ "learning_rate": 9.9713880840876e-05,
3952
+ "loss": 1.53,
3953
+ "step": 560
3954
+ },
3955
+ {
3956
+ "epoch": 0.5065462753950338,
3957
+ "grad_norm": 1.1229721307754517,
3958
+ "learning_rate": 9.94277640240476e-05,
3959
+ "loss": 0.6362,
3960
+ "step": 561
3961
+ },
3962
+ {
3963
+ "epoch": 0.5074492099322799,
3964
+ "grad_norm": 1.421522855758667,
3965
+ "learning_rate": 9.914165189179131e-05,
3966
+ "loss": 1.0754,
3967
+ "step": 562
3968
+ },
3969
+ {
3970
+ "epoch": 0.5083521444695259,
3971
+ "grad_norm": 1.4801396131515503,
3972
+ "learning_rate": 9.885554678634523e-05,
3973
+ "loss": 1.2265,
3974
+ "step": 563
3975
+ },
3976
+ {
3977
+ "epoch": 0.509255079006772,
3978
+ "grad_norm": 1.18939208984375,
3979
+ "learning_rate": 9.856945104988989e-05,
3980
+ "loss": 1.0753,
3981
+ "step": 564
3982
+ },
3983
+ {
3984
+ "epoch": 0.510158013544018,
3985
+ "grad_norm": 1.6806690692901611,
3986
+ "learning_rate": 9.828336702452927e-05,
3987
+ "loss": 1.4325,
3988
+ "step": 565
3989
+ },
3990
+ {
3991
+ "epoch": 0.5110609480812641,
3992
+ "grad_norm": 1.2649235725402832,
3993
+ "learning_rate": 9.799729705227133e-05,
3994
+ "loss": 0.9571,
3995
+ "step": 566
3996
+ },
3997
+ {
3998
+ "epoch": 0.5119638826185101,
3999
+ "grad_norm": 1.5114171504974365,
4000
+ "learning_rate": 9.771124347500903e-05,
4001
+ "loss": 1.3416,
4002
+ "step": 567
4003
+ },
4004
+ {
4005
+ "epoch": 0.5128668171557562,
4006
+ "grad_norm": 1.5993719100952148,
4007
+ "learning_rate": 9.742520863450115e-05,
4008
+ "loss": 1.4492,
4009
+ "step": 568
4010
+ },
4011
+ {
4012
+ "epoch": 0.5137697516930022,
4013
+ "grad_norm": 1.540785789489746,
4014
+ "learning_rate": 9.713919487235306e-05,
4015
+ "loss": 1.4828,
4016
+ "step": 569
4017
+ },
4018
+ {
4019
+ "epoch": 0.5146726862302483,
4020
+ "grad_norm": 1.5753827095031738,
4021
+ "learning_rate": 9.685320452999751e-05,
4022
+ "loss": 1.2541,
4023
+ "step": 570
4024
+ },
4025
+ {
4026
+ "epoch": 0.5155756207674943,
4027
+ "grad_norm": 1.5678349733352661,
4028
+ "learning_rate": 9.656723994867566e-05,
4029
+ "loss": 1.6312,
4030
+ "step": 571
4031
+ },
4032
+ {
4033
+ "epoch": 0.5164785553047404,
4034
+ "grad_norm": 1.3026212453842163,
4035
+ "learning_rate": 9.628130346941767e-05,
4036
+ "loss": 1.431,
4037
+ "step": 572
4038
+ },
4039
+ {
4040
+ "epoch": 0.5173814898419865,
4041
+ "grad_norm": 1.6213833093643188,
4042
+ "learning_rate": 9.599539743302364e-05,
4043
+ "loss": 1.4334,
4044
+ "step": 573
4045
+ },
4046
+ {
4047
+ "epoch": 0.5182844243792325,
4048
+ "grad_norm": 1.3841042518615723,
4049
+ "learning_rate": 9.570952418004455e-05,
4050
+ "loss": 1.2536,
4051
+ "step": 574
4052
+ },
4053
+ {
4054
+ "epoch": 0.5191873589164786,
4055
+ "grad_norm": 1.253650426864624,
4056
+ "learning_rate": 9.542368605076296e-05,
4057
+ "loss": 1.1555,
4058
+ "step": 575
4059
+ },
4060
+ {
4061
+ "epoch": 0.5200902934537246,
4062
+ "grad_norm": 1.06034255027771,
4063
+ "learning_rate": 9.513788538517375e-05,
4064
+ "loss": 0.8516,
4065
+ "step": 576
4066
+ },
4067
+ {
4068
+ "epoch": 0.5209932279909707,
4069
+ "grad_norm": 1.22532320022583,
4070
+ "learning_rate": 9.485212452296535e-05,
4071
+ "loss": 0.7823,
4072
+ "step": 577
4073
+ },
4074
+ {
4075
+ "epoch": 0.5218961625282167,
4076
+ "grad_norm": 1.5111454725265503,
4077
+ "learning_rate": 9.456640580350018e-05,
4078
+ "loss": 1.3708,
4079
+ "step": 578
4080
+ },
4081
+ {
4082
+ "epoch": 0.5227990970654628,
4083
+ "grad_norm": 1.2933988571166992,
4084
+ "learning_rate": 9.428073156579569e-05,
4085
+ "loss": 1.0624,
4086
+ "step": 579
4087
+ },
4088
+ {
4089
+ "epoch": 0.5237020316027088,
4090
+ "grad_norm": 1.409776210784912,
4091
+ "learning_rate": 9.399510414850518e-05,
4092
+ "loss": 1.2465,
4093
+ "step": 580
4094
+ },
4095
+ {
4096
+ "epoch": 0.5246049661399549,
4097
+ "grad_norm": 1.200989842414856,
4098
+ "learning_rate": 9.370952588989872e-05,
4099
+ "loss": 0.9357,
4100
+ "step": 581
4101
+ },
4102
+ {
4103
+ "epoch": 0.5255079006772009,
4104
+ "grad_norm": 1.2518566846847534,
4105
+ "learning_rate": 9.342399912784386e-05,
4106
+ "loss": 1.2201,
4107
+ "step": 582
4108
+ },
4109
+ {
4110
+ "epoch": 0.526410835214447,
4111
+ "grad_norm": 1.6580489873886108,
4112
+ "learning_rate": 9.313852619978659e-05,
4113
+ "loss": 1.7361,
4114
+ "step": 583
4115
+ },
4116
+ {
4117
+ "epoch": 0.527313769751693,
4118
+ "grad_norm": 1.4227832555770874,
4119
+ "learning_rate": 9.285310944273232e-05,
4120
+ "loss": 1.2134,
4121
+ "step": 584
4122
+ },
4123
+ {
4124
+ "epoch": 0.5282167042889391,
4125
+ "grad_norm": 1.3826041221618652,
4126
+ "learning_rate": 9.256775119322642e-05,
4127
+ "loss": 1.0884,
4128
+ "step": 585
4129
+ },
4130
+ {
4131
+ "epoch": 0.5291196388261851,
4132
+ "grad_norm": 1.5409297943115234,
4133
+ "learning_rate": 9.228245378733537e-05,
4134
+ "loss": 1.0612,
4135
+ "step": 586
4136
+ },
4137
+ {
4138
+ "epoch": 0.5300225733634312,
4139
+ "grad_norm": 1.4004051685333252,
4140
+ "learning_rate": 9.199721956062766e-05,
4141
+ "loss": 1.0878,
4142
+ "step": 587
4143
+ },
4144
+ {
4145
+ "epoch": 0.5309255079006772,
4146
+ "grad_norm": 1.718218445777893,
4147
+ "learning_rate": 9.171205084815444e-05,
4148
+ "loss": 1.6136,
4149
+ "step": 588
4150
+ },
4151
+ {
4152
+ "epoch": 0.5318284424379233,
4153
+ "grad_norm": 1.3253271579742432,
4154
+ "learning_rate": 9.142694998443056e-05,
4155
+ "loss": 1.0533,
4156
+ "step": 589
4157
+ },
4158
+ {
4159
+ "epoch": 0.5327313769751693,
4160
+ "grad_norm": 1.449233055114746,
4161
+ "learning_rate": 9.11419193034155e-05,
4162
+ "loss": 1.4656,
4163
+ "step": 590
4164
+ },
4165
+ {
4166
+ "epoch": 0.5336343115124154,
4167
+ "grad_norm": 1.109061360359192,
4168
+ "learning_rate": 9.08569611384941e-05,
4169
+ "loss": 0.8056,
4170
+ "step": 591
4171
+ },
4172
+ {
4173
+ "epoch": 0.5345372460496614,
4174
+ "grad_norm": 1.4240176677703857,
4175
+ "learning_rate": 9.057207782245757e-05,
4176
+ "loss": 0.8339,
4177
+ "step": 592
4178
+ },
4179
+ {
4180
+ "epoch": 0.5354401805869075,
4181
+ "grad_norm": 1.1838033199310303,
4182
+ "learning_rate": 9.028727168748444e-05,
4183
+ "loss": 0.6906,
4184
+ "step": 593
4185
+ },
4186
+ {
4187
+ "epoch": 0.5363431151241534,
4188
+ "grad_norm": 1.3202743530273438,
4189
+ "learning_rate": 9.000254506512133e-05,
4190
+ "loss": 1.0769,
4191
+ "step": 594
4192
+ },
4193
+ {
4194
+ "epoch": 0.5372460496613995,
4195
+ "grad_norm": 1.3041194677352905,
4196
+ "learning_rate": 8.971790028626395e-05,
4197
+ "loss": 1.1342,
4198
+ "step": 595
4199
+ },
4200
+ {
4201
+ "epoch": 0.5381489841986457,
4202
+ "grad_norm": 1.382552981376648,
4203
+ "learning_rate": 8.943333968113808e-05,
4204
+ "loss": 1.0642,
4205
+ "step": 596
4206
+ },
4207
+ {
4208
+ "epoch": 0.5390519187358916,
4209
+ "grad_norm": 1.3533730506896973,
4210
+ "learning_rate": 8.914886557928031e-05,
4211
+ "loss": 1.3536,
4212
+ "step": 597
4213
+ },
4214
+ {
4215
+ "epoch": 0.5399548532731377,
4216
+ "grad_norm": 1.3678534030914307,
4217
+ "learning_rate": 8.886448030951912e-05,
4218
+ "loss": 0.9511,
4219
+ "step": 598
4220
+ },
4221
+ {
4222
+ "epoch": 0.5408577878103837,
4223
+ "grad_norm": 1.6321760416030884,
4224
+ "learning_rate": 8.85801861999558e-05,
4225
+ "loss": 1.0163,
4226
+ "step": 599
4227
+ },
4228
+ {
4229
+ "epoch": 0.5417607223476298,
4230
+ "grad_norm": 1.5338650941848755,
4231
+ "learning_rate": 8.82959855779453e-05,
4232
+ "loss": 1.3266,
4233
+ "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.5426636568848758,
4237
+ "grad_norm": 1.57759690284729,
4238
+ "learning_rate": 8.801188077007728e-05,
4239
+ "loss": 0.9428,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.5435665914221219,
4244
+ "grad_norm": 1.4672127962112427,
4245
+ "learning_rate": 8.772787410215707e-05,
4246
+ "loss": 1.3282,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.5444695259593679,
4251
+ "grad_norm": 1.4892001152038574,
4252
+ "learning_rate": 8.744396789918647e-05,
4253
+ "loss": 1.0562,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.545372460496614,
4258
+ "grad_norm": 1.5642857551574707,
4259
+ "learning_rate": 8.71601644853449e-05,
4260
+ "loss": 1.0113,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.54627539503386,
4265
+ "grad_norm": 1.4271188974380493,
4266
+ "learning_rate": 8.687646618397035e-05,
4267
+ "loss": 0.8131,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.5471783295711061,
4272
+ "grad_norm": 1.384011149406433,
4273
+ "learning_rate": 8.659287531754024e-05,
4274
+ "loss": 0.9166,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.5480812641083521,
4279
+ "grad_norm": 1.609860897064209,
4280
+ "learning_rate": 8.630939420765247e-05,
4281
+ "loss": 1.2092,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.5489841986455982,
4286
+ "grad_norm": 1.5530509948730469,
4287
+ "learning_rate": 8.602602517500651e-05,
4288
+ "loss": 1.0349,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.5498871331828442,
4293
+ "grad_norm": 1.5369367599487305,
4294
+ "learning_rate": 8.574277053938423e-05,
4295
+ "loss": 1.2942,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.5507900677200903,
4300
+ "grad_norm": 1.3397136926651,
4301
+ "learning_rate": 8.545963261963102e-05,
4302
+ "loss": 0.9546,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.5516930022573363,
4307
+ "grad_norm": 1.3543927669525146,
4308
+ "learning_rate": 8.517661373363684e-05,
4309
+ "loss": 1.0404,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.5525959367945824,
4314
+ "grad_norm": 1.440779685974121,
4315
+ "learning_rate": 8.48937161983171e-05,
4316
+ "loss": 1.0322,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.5534988713318284,
4321
+ "grad_norm": 1.3192452192306519,
4322
+ "learning_rate": 8.461094232959381e-05,
4323
+ "loss": 0.8567,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.5544018058690745,
4328
+ "grad_norm": 1.2406502962112427,
4329
+ "learning_rate": 8.432829444237666e-05,
4330
+ "loss": 1.0435,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.5553047404063205,
4335
+ "grad_norm": 1.4534803628921509,
4336
+ "learning_rate": 8.404577485054394e-05,
4337
+ "loss": 1.0001,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.5562076749435666,
4342
+ "grad_norm": 1.1070985794067383,
4343
+ "learning_rate": 8.376338586692366e-05,
4344
+ "loss": 0.5828,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.5571106094808126,
4349
+ "grad_norm": 1.6241154670715332,
4350
+ "learning_rate": 8.348112980327454e-05,
4351
+ "loss": 1.3879,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.5580135440180587,
4356
+ "grad_norm": 1.4151923656463623,
4357
+ "learning_rate": 8.319900897026733e-05,
4358
+ "loss": 1.2522,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.5589164785553047,
4363
+ "grad_norm": 1.5467820167541504,
4364
+ "learning_rate": 8.29170256774656e-05,
4365
+ "loss": 1.3313,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.5598194130925508,
4370
+ "grad_norm": 1.7408809661865234,
4371
+ "learning_rate": 8.263518223330697e-05,
4372
+ "loss": 1.3344,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.5607223476297969,
4377
+ "grad_norm": 1.752617597579956,
4378
+ "learning_rate": 8.235348094508426e-05,
4379
+ "loss": 1.1067,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.5616252821670429,
4384
+ "grad_norm": 1.5512912273406982,
4385
+ "learning_rate": 8.207192411892646e-05,
4386
+ "loss": 1.2465,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.562528216704289,
4391
+ "grad_norm": 1.5489177703857422,
4392
+ "learning_rate": 8.179051405977993e-05,
4393
+ "loss": 1.1418,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.563431151241535,
4398
+ "grad_norm": 1.3127493858337402,
4399
+ "learning_rate": 8.150925307138969e-05,
4400
+ "loss": 0.7634,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.5643340857787811,
4405
+ "grad_norm": 1.4094018936157227,
4406
+ "learning_rate": 8.122814345628016e-05,
4407
+ "loss": 0.8071,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.5652370203160271,
4412
+ "grad_norm": 1.5054543018341064,
4413
+ "learning_rate": 8.094718751573667e-05,
4414
+ "loss": 1.0688,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.5661399548532732,
4419
+ "grad_norm": 1.7666995525360107,
4420
+ "learning_rate": 8.06663875497866e-05,
4421
+ "loss": 1.1685,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.5670428893905192,
4426
+ "grad_norm": 1.767767071723938,
4427
+ "learning_rate": 8.038574585718032e-05,
4428
+ "loss": 1.6177,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.5679458239277653,
4433
+ "grad_norm": 1.3216131925582886,
4434
+ "learning_rate": 8.010526473537249e-05,
4435
+ "loss": 0.673,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.5688487584650113,
4440
+ "grad_norm": 1.6139782667160034,
4441
+ "learning_rate": 7.98249464805034e-05,
4442
+ "loss": 1.2622,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.5697516930022574,
4447
+ "grad_norm": 1.6862844228744507,
4448
+ "learning_rate": 7.954479338737995e-05,
4449
+ "loss": 1.4653,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.5706546275395034,
4454
+ "grad_norm": 1.3713010549545288,
4455
+ "learning_rate": 7.926480774945689e-05,
4456
+ "loss": 0.9346,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.5715575620767495,
4461
+ "grad_norm": 1.6012693643569946,
4462
+ "learning_rate": 7.89849918588183e-05,
4463
+ "loss": 0.862,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.5724604966139955,
4468
+ "grad_norm": 1.5387890338897705,
4469
+ "learning_rate": 7.870534800615845e-05,
4470
+ "loss": 1.0758,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.5733634311512416,
4475
+ "grad_norm": 1.8564848899841309,
4476
+ "learning_rate": 7.84258784807633e-05,
4477
+ "loss": 1.4123,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.5742663656884875,
4482
+ "grad_norm": 1.7306365966796875,
4483
+ "learning_rate": 7.814658557049175e-05,
4484
+ "loss": 0.9185,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.5751693002257336,
4489
+ "grad_norm": 1.6491857767105103,
4490
+ "learning_rate": 7.786747156175676e-05,
4491
+ "loss": 1.067,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.5760722347629796,
4496
+ "grad_norm": 1.5658364295959473,
4497
+ "learning_rate": 7.758853873950676e-05,
4498
+ "loss": 1.2779,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.5769751693002257,
4503
+ "grad_norm": 1.6873422861099243,
4504
+ "learning_rate": 7.730978938720693e-05,
4505
+ "loss": 1.2657,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.5778781038374717,
4510
+ "grad_norm": 1.5114984512329102,
4511
+ "learning_rate": 7.703122578682046e-05,
4512
+ "loss": 1.1484,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.5787810383747178,
4517
+ "grad_norm": 1.3105080127716064,
4518
+ "learning_rate": 7.675285021878989e-05,
4519
+ "loss": 0.9168,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.5796839729119638,
4524
+ "grad_norm": 1.3600938320159912,
4525
+ "learning_rate": 7.647466496201847e-05,
4526
+ "loss": 0.9933,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.5805869074492099,
4531
+ "grad_norm": 1.7162373065948486,
4532
+ "learning_rate": 7.619667229385146e-05,
4533
+ "loss": 1.456,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.581489841986456,
4538
+ "grad_norm": 1.4823421239852905,
4539
+ "learning_rate": 7.591887449005748e-05,
4540
+ "loss": 0.9799,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.582392776523702,
4545
+ "grad_norm": 1.467466115951538,
4546
+ "learning_rate": 7.564127382481e-05,
4547
+ "loss": 1.2861,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.5832957110609481,
4552
+ "grad_norm": 1.736372470855713,
4553
+ "learning_rate": 7.536387257066854e-05,
4554
+ "loss": 1.4079,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.5841986455981941,
4559
+ "grad_norm": 1.3974703550338745,
4560
+ "learning_rate": 7.508667299856014e-05,
4561
+ "loss": 0.6523,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.5851015801354402,
4566
+ "grad_norm": 1.7177207469940186,
4567
+ "learning_rate": 7.480967737776089e-05,
4568
+ "loss": 1.2047,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.5860045146726862,
4573
+ "grad_norm": 1.6860179901123047,
4574
+ "learning_rate": 7.453288797587714e-05,
4575
+ "loss": 1.6312,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.5869074492099323,
4580
+ "grad_norm": 1.3072704076766968,
4581
+ "learning_rate": 7.425630705882707e-05,
4582
+ "loss": 0.8851,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.5878103837471783,
4587
+ "grad_norm": 1.2708418369293213,
4588
+ "learning_rate": 7.397993689082216e-05,
4589
+ "loss": 0.826,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.5887133182844244,
4594
+ "grad_norm": 1.4069938659667969,
4595
+ "learning_rate": 7.370377973434855e-05,
4596
+ "loss": 1.0625,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.5896162528216704,
4601
+ "grad_norm": 1.5313855409622192,
4602
+ "learning_rate": 7.34278378501486e-05,
4603
+ "loss": 0.9191,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.5905191873589165,
4608
+ "grad_norm": 1.6954371929168701,
4609
+ "learning_rate": 7.31521134972023e-05,
4610
+ "loss": 1.564,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.5914221218961625,
4615
+ "grad_norm": 1.5678001642227173,
4616
+ "learning_rate": 7.2876608932709e-05,
4617
+ "loss": 1.1355,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.5923250564334086,
4622
+ "grad_norm": 1.3186235427856445,
4623
+ "learning_rate": 7.26013264120686e-05,
4624
+ "loss": 0.842,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.5932279909706546,
4629
+ "grad_norm": 1.612156629562378,
4630
+ "learning_rate": 7.232626818886326e-05,
4631
+ "loss": 0.8356,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.5941309255079007,
4636
+ "grad_norm": 1.3746870756149292,
4637
+ "learning_rate": 7.205143651483906e-05,
4638
+ "loss": 0.7663,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.5950338600451467,
4643
+ "grad_norm": 1.4078235626220703,
4644
+ "learning_rate": 7.177683363988736e-05,
4645
+ "loss": 0.9621,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.5959367945823928,
4650
+ "grad_norm": 1.5795187950134277,
4651
+ "learning_rate": 7.150246181202648e-05,
4652
+ "loss": 0.8875,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.5968397291196388,
4657
+ "grad_norm": 1.5938857793807983,
4658
+ "learning_rate": 7.122832327738331e-05,
4659
+ "loss": 0.9685,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.5977426636568849,
4664
+ "grad_norm": 1.5371569395065308,
4665
+ "learning_rate": 7.095442028017492e-05,
4666
+ "loss": 1.0103,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.5986455981941309,
4671
+ "grad_norm": 1.641914963722229,
4672
+ "learning_rate": 7.068075506269005e-05,
4673
+ "loss": 1.1853,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.599548532731377,
4678
+ "grad_norm": 1.546448826789856,
4679
+ "learning_rate": 7.040732986527108e-05,
4680
+ "loss": 0.9724,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.600451467268623,
4685
+ "grad_norm": 1.414829134941101,
4686
+ "learning_rate": 7.01341469262953e-05,
4687
+ "loss": 1.0056,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.6013544018058691,
4692
+ "grad_norm": 2.0793347358703613,
4693
+ "learning_rate": 6.986120848215678e-05,
4694
+ "loss": 1.5461,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.6022573363431152,
4699
+ "grad_norm": 1.2860586643218994,
4700
+ "learning_rate": 6.958851676724823e-05,
4701
+ "loss": 0.7181,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.6031602708803612,
4706
+ "grad_norm": 1.9850184917449951,
4707
+ "learning_rate": 6.931607401394229e-05,
4708
+ "loss": 1.6305,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.6040632054176073,
4713
+ "grad_norm": 1.7549471855163574,
4714
+ "learning_rate": 6.904388245257363e-05,
4715
+ "loss": 1.4215,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.6049661399548533,
4720
+ "grad_norm": 1.667541742324829,
4721
+ "learning_rate": 6.877194431142055e-05,
4722
+ "loss": 1.323,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.6058690744920994,
4727
+ "grad_norm": 1.388028860092163,
4728
+ "learning_rate": 6.850026181668668e-05,
4729
+ "loss": 0.9346,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.6067720090293454,
4734
+ "grad_norm": 1.8620973825454712,
4735
+ "learning_rate": 6.822883719248283e-05,
4736
+ "loss": 1.5254,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.6076749435665915,
4741
+ "grad_norm": 1.5007909536361694,
4742
+ "learning_rate": 6.79576726608089e-05,
4743
+ "loss": 1.1383,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.6085778781038375,
4748
+ "grad_norm": 1.584130048751831,
4749
+ "learning_rate": 6.768677044153535e-05,
4750
+ "loss": 1.0465,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.6094808126410836,
4755
+ "grad_norm": 1.9776028394699097,
4756
+ "learning_rate": 6.741613275238534e-05,
4757
+ "loss": 1.3916,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.6103837471783295,
4762
+ "grad_norm": 2.143301248550415,
4763
+ "learning_rate": 6.714576180891654e-05,
4764
+ "loss": 1.4605,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.6112866817155757,
4769
+ "grad_norm": 1.706349492073059,
4770
+ "learning_rate": 6.68756598245028e-05,
4771
+ "loss": 1.248,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.6121896162528216,
4776
+ "grad_norm": 1.7645604610443115,
4777
+ "learning_rate": 6.660582901031621e-05,
4778
+ "loss": 1.3162,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.6130925507900677,
4783
+ "grad_norm": 0.9834024906158447,
4784
+ "learning_rate": 6.633627157530899e-05,
4785
+ "loss": 0.4117,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.6139954853273137,
4790
+ "grad_norm": 1.4688466787338257,
4791
+ "learning_rate": 6.606698972619526e-05,
4792
+ "loss": 0.9471,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.6148984198645598,
4797
+ "grad_norm": 1.369699239730835,
4798
+ "learning_rate": 6.579798566743314e-05,
4799
+ "loss": 0.8842,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.6158013544018058,
4804
+ "grad_norm": 1.4613699913024902,
4805
+ "learning_rate": 6.552926160120663e-05,
4806
+ "loss": 1.0846,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.6167042889390519,
4811
+ "grad_norm": 1.527610182762146,
4812
+ "learning_rate": 6.526081972740758e-05,
4813
+ "loss": 1.0844,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.6176072234762979,
4818
+ "grad_norm": 1.742908239364624,
4819
+ "learning_rate": 6.499266224361767e-05,
4820
+ "loss": 1.3409,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.618510158013544,
4825
+ "grad_norm": 1.8235154151916504,
4826
+ "learning_rate": 6.472479134509052e-05,
4827
+ "loss": 1.308,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.61941309255079,
4832
+ "grad_norm": 1.399803876876831,
4833
+ "learning_rate": 6.445720922473355e-05,
4834
+ "loss": 0.9267,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.6203160270880361,
4839
+ "grad_norm": 1.8101701736450195,
4840
+ "learning_rate": 6.418991807309012e-05,
4841
+ "loss": 0.9509,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.6212189616252821,
4846
+ "grad_norm": 1.5924134254455566,
4847
+ "learning_rate": 6.392292007832168e-05,
4848
+ "loss": 1.2864,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.6221218961625282,
4853
+ "grad_norm": 1.2724056243896484,
4854
+ "learning_rate": 6.365621742618966e-05,
4855
+ "loss": 0.7186,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.6230248306997742,
4860
+ "grad_norm": 1.3141125440597534,
4861
+ "learning_rate": 6.338981230003778e-05,
4862
+ "loss": 0.9604,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.6239277652370203,
4867
+ "grad_norm": 1.656320571899414,
4868
+ "learning_rate": 6.312370688077399e-05,
4869
+ "loss": 1.2993,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.6248306997742664,
4874
+ "grad_norm": 1.3529797792434692,
4875
+ "learning_rate": 6.285790334685282e-05,
4876
+ "loss": 0.4246,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.6257336343115124,
4881
+ "grad_norm": 1.2153328657150269,
4882
+ "learning_rate": 6.259240387425736e-05,
4883
+ "loss": 0.9525,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.6266365688487585,
4888
+ "grad_norm": 1.6419172286987305,
4889
+ "learning_rate": 6.232721063648148e-05,
4890
+ "loss": 0.9395,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.6275395033860045,
4895
+ "grad_norm": 1.4371968507766724,
4896
+ "learning_rate": 6.206232580451225e-05,
4897
+ "loss": 1.4503,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.6284424379232506,
4902
+ "grad_norm": 2.1209614276885986,
4903
+ "learning_rate": 6.179775154681184e-05,
4904
+ "loss": 1.5918,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.6293453724604966,
4909
+ "grad_norm": 1.8328834772109985,
4910
+ "learning_rate": 6.153349002929987e-05,
4911
+ "loss": 1.1838,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.6302483069977427,
4916
+ "grad_norm": 1.7110613584518433,
4917
+ "learning_rate": 6.126954341533599e-05,
4918
+ "loss": 1.4122,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.6311512415349887,
4923
+ "grad_norm": 1.826737642288208,
4924
+ "learning_rate": 6.100591386570167e-05,
4925
+ "loss": 1.5058,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.6320541760722348,
4930
+ "grad_norm": 1.8801661729812622,
4931
+ "learning_rate": 6.0742603538582835e-05,
4932
+ "loss": 1.5475,
4933
+ "step": 700
4934
+ },
4935
+ {
4936
+ "epoch": 0.6329571106094808,
4937
+ "grad_norm": 1.763385534286499,
4938
+ "learning_rate": 6.047961458955214e-05,
4939
+ "loss": 1.3673,
4940
+ "step": 701
4941
+ },
4942
+ {
4943
+ "epoch": 0.6338600451467269,
4944
+ "grad_norm": 1.508298397064209,
4945
+ "learning_rate": 6.021694917155129e-05,
4946
+ "loss": 1.1175,
4947
+ "step": 702
4948
+ },
4949
+ {
4950
+ "epoch": 0.6347629796839729,
4951
+ "grad_norm": 1.5054030418395996,
4952
+ "learning_rate": 5.9954609434873344e-05,
4953
+ "loss": 1.32,
4954
+ "step": 703
4955
+ },
4956
+ {
4957
+ "epoch": 0.635665914221219,
4958
+ "grad_norm": 1.6304901838302612,
4959
+ "learning_rate": 5.9692597527145354e-05,
4960
+ "loss": 0.9387,
4961
+ "step": 704
4962
+ },
4963
+ {
4964
+ "epoch": 0.636568848758465,
4965
+ "grad_norm": 1.3840839862823486,
4966
+ "learning_rate": 5.943091559331053e-05,
4967
+ "loss": 0.627,
4968
+ "step": 705
4969
+ },
4970
+ {
4971
+ "epoch": 0.6374717832957111,
4972
+ "grad_norm": 1.8724223375320435,
4973
+ "learning_rate": 5.9169565775610656e-05,
4974
+ "loss": 1.6199,
4975
+ "step": 706
4976
+ },
4977
+ {
4978
+ "epoch": 0.6383747178329571,
4979
+ "grad_norm": 1.563077449798584,
4980
+ "learning_rate": 5.890855021356891e-05,
4981
+ "loss": 1.1355,
4982
+ "step": 707
4983
+ },
4984
+ {
4985
+ "epoch": 0.6392776523702032,
4986
+ "grad_norm": 1.436528205871582,
4987
+ "learning_rate": 5.864787104397194e-05,
4988
+ "loss": 1.3004,
4989
+ "step": 708
4990
+ },
4991
+ {
4992
+ "epoch": 0.6401805869074492,
4993
+ "grad_norm": 1.5161725282669067,
4994
+ "learning_rate": 5.838753040085256e-05,
4995
+ "loss": 0.8622,
4996
+ "step": 709
4997
+ },
4998
+ {
4999
+ "epoch": 0.6410835214446953,
5000
+ "grad_norm": 1.5266764163970947,
5001
+ "learning_rate": 5.81275304154723e-05,
5002
+ "loss": 0.9748,
5003
+ "step": 710
5004
+ },
5005
+ {
5006
+ "epoch": 0.6419864559819413,
5007
+ "grad_norm": 1.5332485437393188,
5008
+ "learning_rate": 5.786787321630394e-05,
5009
+ "loss": 0.8654,
5010
+ "step": 711
5011
+ },
5012
+ {
5013
+ "epoch": 0.6428893905191874,
5014
+ "grad_norm": 1.5081924200057983,
5015
+ "learning_rate": 5.7608560929013946e-05,
5016
+ "loss": 1.2023,
5017
+ "step": 712
5018
+ },
5019
+ {
5020
+ "epoch": 0.6437923250564334,
5021
+ "grad_norm": 1.880600094795227,
5022
+ "learning_rate": 5.7349595676445286e-05,
5023
+ "loss": 1.3637,
5024
+ "step": 713
5025
+ },
5026
+ {
5027
+ "epoch": 0.6446952595936795,
5028
+ "grad_norm": 1.742201328277588,
5029
+ "learning_rate": 5.709097957860001e-05,
5030
+ "loss": 1.4513,
5031
+ "step": 714
5032
+ },
5033
+ {
5034
+ "epoch": 0.6455981941309256,
5035
+ "grad_norm": 1.4408034086227417,
5036
+ "learning_rate": 5.683271475262164e-05,
5037
+ "loss": 1.0124,
5038
+ "step": 715
5039
+ },
5040
+ {
5041
+ "epoch": 0.6465011286681716,
5042
+ "grad_norm": 1.2453595399856567,
5043
+ "learning_rate": 5.657480331277819e-05,
5044
+ "loss": 0.908,
5045
+ "step": 716
5046
+ },
5047
+ {
5048
+ "epoch": 0.6474040632054177,
5049
+ "grad_norm": 1.4883902072906494,
5050
+ "learning_rate": 5.6317247370444715e-05,
5051
+ "loss": 1.15,
5052
+ "step": 717
5053
+ },
5054
+ {
5055
+ "epoch": 0.6483069977426636,
5056
+ "grad_norm": 1.3922057151794434,
5057
+ "learning_rate": 5.6060049034085815e-05,
5058
+ "loss": 1.2913,
5059
+ "step": 718
5060
+ },
5061
+ {
5062
+ "epoch": 0.6492099322799098,
5063
+ "grad_norm": 1.6199557781219482,
5064
+ "learning_rate": 5.58032104092389e-05,
5065
+ "loss": 1.0053,
5066
+ "step": 719
5067
+ },
5068
+ {
5069
+ "epoch": 0.6501128668171557,
5070
+ "grad_norm": 1.8710253238677979,
5071
+ "learning_rate": 5.5546733598496314e-05,
5072
+ "loss": 1.4156,
5073
+ "step": 720
5074
+ },
5075
+ {
5076
+ "epoch": 0.6510158013544018,
5077
+ "grad_norm": 1.3759031295776367,
5078
+ "learning_rate": 5.5290620701488594e-05,
5079
+ "loss": 0.9242,
5080
+ "step": 721
5081
+ },
5082
+ {
5083
+ "epoch": 0.6519187358916478,
5084
+ "grad_norm": 1.869899034500122,
5085
+ "learning_rate": 5.5034873814867125e-05,
5086
+ "loss": 1.4667,
5087
+ "step": 722
5088
+ },
5089
+ {
5090
+ "epoch": 0.6528216704288939,
5091
+ "grad_norm": 1.673433780670166,
5092
+ "learning_rate": 5.477949503228686e-05,
5093
+ "loss": 1.4162,
5094
+ "step": 723
5095
+ },
5096
+ {
5097
+ "epoch": 0.6537246049661399,
5098
+ "grad_norm": 1.4881922006607056,
5099
+ "learning_rate": 5.452448644438946e-05,
5100
+ "loss": 1.4349,
5101
+ "step": 724
5102
+ },
5103
+ {
5104
+ "epoch": 0.654627539503386,
5105
+ "grad_norm": 1.577567219734192,
5106
+ "learning_rate": 5.42698501387858e-05,
5107
+ "loss": 1.3597,
5108
+ "step": 725
5109
+ },
5110
+ {
5111
+ "epoch": 0.655530474040632,
5112
+ "grad_norm": 1.6177291870117188,
5113
+ "learning_rate": 5.401558820003929e-05,
5114
+ "loss": 1.5005,
5115
+ "step": 726
5116
+ },
5117
+ {
5118
+ "epoch": 0.6564334085778781,
5119
+ "grad_norm": 1.8385038375854492,
5120
+ "learning_rate": 5.3761702709648556e-05,
5121
+ "loss": 1.3802,
5122
+ "step": 727
5123
+ },
5124
+ {
5125
+ "epoch": 0.6573363431151241,
5126
+ "grad_norm": 1.299546718597412,
5127
+ "learning_rate": 5.3508195746030296e-05,
5128
+ "loss": 1.1285,
5129
+ "step": 728
5130
+ },
5131
+ {
5132
+ "epoch": 0.6582392776523702,
5133
+ "grad_norm": 1.2800817489624023,
5134
+ "learning_rate": 5.325506938450273e-05,
5135
+ "loss": 0.8695,
5136
+ "step": 729
5137
+ },
5138
+ {
5139
+ "epoch": 0.6591422121896162,
5140
+ "grad_norm": 1.4967060089111328,
5141
+ "learning_rate": 5.300232569726804e-05,
5142
+ "loss": 1.4879,
5143
+ "step": 730
5144
+ },
5145
+ {
5146
+ "epoch": 0.6600451467268623,
5147
+ "grad_norm": 1.702681064605713,
5148
+ "learning_rate": 5.2749966753395686e-05,
5149
+ "loss": 1.2565,
5150
+ "step": 731
5151
+ },
5152
+ {
5153
+ "epoch": 0.6609480812641083,
5154
+ "grad_norm": 1.6170333623886108,
5155
+ "learning_rate": 5.249799461880569e-05,
5156
+ "loss": 1.2811,
5157
+ "step": 732
5158
+ },
5159
+ {
5160
+ "epoch": 0.6618510158013544,
5161
+ "grad_norm": 1.6071534156799316,
5162
+ "learning_rate": 5.224641135625119e-05,
5163
+ "loss": 1.3454,
5164
+ "step": 733
5165
+ },
5166
+ {
5167
+ "epoch": 0.6627539503386004,
5168
+ "grad_norm": 1.8343545198440552,
5169
+ "learning_rate": 5.1995219025302025e-05,
5170
+ "loss": 1.6234,
5171
+ "step": 734
5172
+ },
5173
+ {
5174
+ "epoch": 0.6636568848758465,
5175
+ "grad_norm": 1.4651812314987183,
5176
+ "learning_rate": 5.174441968232768e-05,
5177
+ "loss": 1.2592,
5178
+ "step": 735
5179
+ },
5180
+ {
5181
+ "epoch": 0.6645598194130925,
5182
+ "grad_norm": 1.6615885496139526,
5183
+ "learning_rate": 5.1494015380480396e-05,
5184
+ "loss": 0.9647,
5185
+ "step": 736
5186
+ },
5187
+ {
5188
+ "epoch": 0.6654627539503386,
5189
+ "grad_norm": 1.6668320894241333,
5190
+ "learning_rate": 5.1244008169678495e-05,
5191
+ "loss": 0.8748,
5192
+ "step": 737
5193
+ },
5194
+ {
5195
+ "epoch": 0.6663656884875846,
5196
+ "grad_norm": 1.682271122932434,
5197
+ "learning_rate": 5.099440009658957e-05,
5198
+ "loss": 1.2871,
5199
+ "step": 738
5200
+ },
5201
+ {
5202
+ "epoch": 0.6672686230248307,
5203
+ "grad_norm": 1.3818674087524414,
5204
+ "learning_rate": 5.074519320461357e-05,
5205
+ "loss": 1.0104,
5206
+ "step": 739
5207
+ },
5208
+ {
5209
+ "epoch": 0.6681715575620768,
5210
+ "grad_norm": 1.4028613567352295,
5211
+ "learning_rate": 5.049638953386635e-05,
5212
+ "loss": 1.2135,
5213
+ "step": 740
5214
+ },
5215
+ {
5216
+ "epoch": 0.6690744920993228,
5217
+ "grad_norm": 1.6344983577728271,
5218
+ "learning_rate": 5.024799112116274e-05,
5219
+ "loss": 1.1836,
5220
+ "step": 741
5221
+ },
5222
+ {
5223
+ "epoch": 0.6699774266365689,
5224
+ "grad_norm": 1.5904343128204346,
5225
+ "learning_rate": 5.000000000000002e-05,
5226
+ "loss": 1.2949,
5227
+ "step": 742
5228
+ },
5229
+ {
5230
+ "epoch": 0.6708803611738149,
5231
+ "grad_norm": 1.2303252220153809,
5232
+ "learning_rate": 4.975241820054107e-05,
5233
+ "loss": 0.8138,
5234
+ "step": 743
5235
+ },
5236
+ {
5237
+ "epoch": 0.671783295711061,
5238
+ "grad_norm": 1.8008294105529785,
5239
+ "learning_rate": 4.9505247749598006e-05,
5240
+ "loss": 1.9048,
5241
+ "step": 744
5242
+ },
5243
+ {
5244
+ "epoch": 0.672686230248307,
5245
+ "grad_norm": 1.6088261604309082,
5246
+ "learning_rate": 4.9258490670615475e-05,
5247
+ "loss": 1.6434,
5248
+ "step": 745
5249
+ },
5250
+ {
5251
+ "epoch": 0.6735891647855531,
5252
+ "grad_norm": 1.501248836517334,
5253
+ "learning_rate": 4.9012148983653964e-05,
5254
+ "loss": 1.176,
5255
+ "step": 746
5256
+ },
5257
+ {
5258
+ "epoch": 0.6744920993227991,
5259
+ "grad_norm": 1.6089529991149902,
5260
+ "learning_rate": 4.8766224705373476e-05,
5261
+ "loss": 1.0786,
5262
+ "step": 747
5263
+ },
5264
+ {
5265
+ "epoch": 0.6753950338600452,
5266
+ "grad_norm": 1.3817884922027588,
5267
+ "learning_rate": 4.852071984901696e-05,
5268
+ "loss": 0.8196,
5269
+ "step": 748
5270
+ },
5271
+ {
5272
+ "epoch": 0.6762979683972912,
5273
+ "grad_norm": 1.625156044960022,
5274
+ "learning_rate": 4.827563642439361e-05,
5275
+ "loss": 1.337,
5276
+ "step": 749
5277
+ },
5278
+ {
5279
+ "epoch": 0.6772009029345373,
5280
+ "grad_norm": 1.7386070489883423,
5281
+ "learning_rate": 4.803097643786288e-05,
5282
+ "loss": 1.5089,
5283
+ "step": 750
5284
+ },
5285
+ {
5286
+ "epoch": 0.6781038374717833,
5287
+ "grad_norm": 1.3682126998901367,
5288
+ "learning_rate": 4.778674189231751e-05,
5289
+ "loss": 0.9877,
5290
+ "step": 751
5291
+ },
5292
+ {
5293
+ "epoch": 0.6790067720090294,
5294
+ "grad_norm": 1.2330286502838135,
5295
+ "learning_rate": 4.754293478716755e-05,
5296
+ "loss": 0.864,
5297
+ "step": 752
5298
+ },
5299
+ {
5300
+ "epoch": 0.6799097065462754,
5301
+ "grad_norm": 1.4554107189178467,
5302
+ "learning_rate": 4.7299557118323814e-05,
5303
+ "loss": 1.3097,
5304
+ "step": 753
5305
+ },
5306
+ {
5307
+ "epoch": 0.6808126410835215,
5308
+ "grad_norm": 1.650094985961914,
5309
+ "learning_rate": 4.7056610878181486e-05,
5310
+ "loss": 1.2152,
5311
+ "step": 754
5312
+ },
5313
+ {
5314
+ "epoch": 0.6817155756207675,
5315
+ "grad_norm": 1.435055136680603,
5316
+ "learning_rate": 4.681409805560396e-05,
5317
+ "loss": 0.6956,
5318
+ "step": 755
5319
+ },
5320
+ {
5321
+ "epoch": 0.6826185101580136,
5322
+ "grad_norm": 1.376915454864502,
5323
+ "learning_rate": 4.657202063590653e-05,
5324
+ "loss": 0.8902,
5325
+ "step": 756
5326
+ },
5327
+ {
5328
+ "epoch": 0.6835214446952596,
5329
+ "grad_norm": 1.2419427633285522,
5330
+ "learning_rate": 4.633038060083996e-05,
5331
+ "loss": 0.6594,
5332
+ "step": 757
5333
+ },
5334
+ {
5335
+ "epoch": 0.6844243792325057,
5336
+ "grad_norm": 1.9876960515975952,
5337
+ "learning_rate": 4.6089179928574487e-05,
5338
+ "loss": 1.1539,
5339
+ "step": 758
5340
+ },
5341
+ {
5342
+ "epoch": 0.6853273137697516,
5343
+ "grad_norm": 1.5876964330673218,
5344
+ "learning_rate": 4.584842059368354e-05,
5345
+ "loss": 1.0687,
5346
+ "step": 759
5347
+ },
5348
+ {
5349
+ "epoch": 0.6862302483069977,
5350
+ "grad_norm": 1.317826271057129,
5351
+ "learning_rate": 4.560810456712754e-05,
5352
+ "loss": 0.9303,
5353
+ "step": 760
5354
+ },
5355
+ {
5356
+ "epoch": 0.6871331828442437,
5357
+ "grad_norm": 1.5974258184432983,
5358
+ "learning_rate": 4.536823381623779e-05,
5359
+ "loss": 1.2335,
5360
+ "step": 761
5361
+ },
5362
+ {
5363
+ "epoch": 0.6880361173814898,
5364
+ "grad_norm": 1.6230992078781128,
5365
+ "learning_rate": 4.512881030470031e-05,
5366
+ "loss": 1.3595,
5367
+ "step": 762
5368
+ },
5369
+ {
5370
+ "epoch": 0.688939051918736,
5371
+ "grad_norm": 1.4521753787994385,
5372
+ "learning_rate": 4.488983599254001e-05,
5373
+ "loss": 1.0362,
5374
+ "step": 763
5375
+ },
5376
+ {
5377
+ "epoch": 0.6898419864559819,
5378
+ "grad_norm": 1.566362977027893,
5379
+ "learning_rate": 4.4651312836104254e-05,
5380
+ "loss": 1.2261,
5381
+ "step": 764
5382
+ },
5383
+ {
5384
+ "epoch": 0.690744920993228,
5385
+ "grad_norm": 1.9560340642929077,
5386
+ "learning_rate": 4.441324278804717e-05,
5387
+ "loss": 1.4274,
5388
+ "step": 765
5389
+ },
5390
+ {
5391
+ "epoch": 0.691647855530474,
5392
+ "grad_norm": 1.6547048091888428,
5393
+ "learning_rate": 4.417562779731355e-05,
5394
+ "loss": 0.8835,
5395
+ "step": 766
5396
+ },
5397
+ {
5398
+ "epoch": 0.6925507900677201,
5399
+ "grad_norm": 1.3919938802719116,
5400
+ "learning_rate": 4.3938469809122776e-05,
5401
+ "loss": 0.776,
5402
+ "step": 767
5403
+ },
5404
+ {
5405
+ "epoch": 0.6934537246049661,
5406
+ "grad_norm": 1.4622743129730225,
5407
+ "learning_rate": 4.3701770764953124e-05,
5408
+ "loss": 0.9917,
5409
+ "step": 768
5410
+ },
5411
+ {
5412
+ "epoch": 0.6943566591422122,
5413
+ "grad_norm": 1.9236985445022583,
5414
+ "learning_rate": 4.346553260252574e-05,
5415
+ "loss": 1.7837,
5416
+ "step": 769
5417
+ },
5418
+ {
5419
+ "epoch": 0.6952595936794582,
5420
+ "grad_norm": 1.6482877731323242,
5421
+ "learning_rate": 4.32297572557887e-05,
5422
+ "loss": 1.66,
5423
+ "step": 770
5424
+ },
5425
+ {
5426
+ "epoch": 0.6961625282167043,
5427
+ "grad_norm": 1.2067683935165405,
5428
+ "learning_rate": 4.299444665490139e-05,
5429
+ "loss": 0.7246,
5430
+ "step": 771
5431
+ },
5432
+ {
5433
+ "epoch": 0.6970654627539503,
5434
+ "grad_norm": 1.3889422416687012,
5435
+ "learning_rate": 4.275960272621852e-05,
5436
+ "loss": 0.7338,
5437
+ "step": 772
5438
+ },
5439
+ {
5440
+ "epoch": 0.6979683972911964,
5441
+ "grad_norm": 1.8344866037368774,
5442
+ "learning_rate": 4.252522739227445e-05,
5443
+ "loss": 1.1381,
5444
+ "step": 773
5445
+ },
5446
+ {
5447
+ "epoch": 0.6988713318284424,
5448
+ "grad_norm": 1.595993161201477,
5449
+ "learning_rate": 4.2291322571767344e-05,
5450
+ "loss": 0.9127,
5451
+ "step": 774
5452
+ },
5453
+ {
5454
+ "epoch": 0.6997742663656885,
5455
+ "grad_norm": 1.143455982208252,
5456
+ "learning_rate": 4.205789017954364e-05,
5457
+ "loss": 0.4581,
5458
+ "step": 775
5459
+ },
5460
+ {
5461
+ "epoch": 0.7006772009029345,
5462
+ "grad_norm": 1.399437427520752,
5463
+ "learning_rate": 4.182493212658224e-05,
5464
+ "loss": 0.7188,
5465
+ "step": 776
5466
+ },
5467
+ {
5468
+ "epoch": 0.7015801354401806,
5469
+ "grad_norm": 1.8559757471084595,
5470
+ "learning_rate": 4.159245031997881e-05,
5471
+ "loss": 1.1238,
5472
+ "step": 777
5473
+ },
5474
+ {
5475
+ "epoch": 0.7024830699774266,
5476
+ "grad_norm": 1.4119492769241333,
5477
+ "learning_rate": 4.136044666293044e-05,
5478
+ "loss": 1.0352,
5479
+ "step": 778
5480
+ },
5481
+ {
5482
+ "epoch": 0.7033860045146727,
5483
+ "grad_norm": 1.46328604221344,
5484
+ "learning_rate": 4.112892305471974e-05,
5485
+ "loss": 1.0075,
5486
+ "step": 779
5487
+ },
5488
+ {
5489
+ "epoch": 0.7042889390519187,
5490
+ "grad_norm": 1.7197778224945068,
5491
+ "learning_rate": 4.0897881390699356e-05,
5492
+ "loss": 1.067,
5493
+ "step": 780
5494
+ },
5495
+ {
5496
+ "epoch": 0.7051918735891648,
5497
+ "grad_norm": 1.2641351222991943,
5498
+ "learning_rate": 4.0667323562276814e-05,
5499
+ "loss": 0.6254,
5500
+ "step": 781
5501
+ },
5502
+ {
5503
+ "epoch": 0.7060948081264108,
5504
+ "grad_norm": 1.173400640487671,
5505
+ "learning_rate": 4.043725145689846e-05,
5506
+ "loss": 0.8572,
5507
+ "step": 782
5508
+ },
5509
+ {
5510
+ "epoch": 0.7069977426636569,
5511
+ "grad_norm": 1.3645060062408447,
5512
+ "learning_rate": 4.0207666958034465e-05,
5513
+ "loss": 1.0241,
5514
+ "step": 783
5515
+ },
5516
+ {
5517
+ "epoch": 0.7079006772009029,
5518
+ "grad_norm": 1.5815194845199585,
5519
+ "learning_rate": 3.997857194516319e-05,
5520
+ "loss": 1.0335,
5521
+ "step": 784
5522
+ },
5523
+ {
5524
+ "epoch": 0.708803611738149,
5525
+ "grad_norm": 2.148763418197632,
5526
+ "learning_rate": 3.9749968293755834e-05,
5527
+ "loss": 1.6093,
5528
+ "step": 785
5529
+ },
5530
+ {
5531
+ "epoch": 0.7097065462753951,
5532
+ "grad_norm": 1.3801579475402832,
5533
+ "learning_rate": 3.9521857875261114e-05,
5534
+ "loss": 0.912,
5535
+ "step": 786
5536
+ },
5537
+ {
5538
+ "epoch": 0.7106094808126411,
5539
+ "grad_norm": 1.2974218130111694,
5540
+ "learning_rate": 3.929424255708999e-05,
5541
+ "loss": 0.7171,
5542
+ "step": 787
5543
+ },
5544
+ {
5545
+ "epoch": 0.7115124153498872,
5546
+ "grad_norm": 2.0133774280548096,
5547
+ "learning_rate": 3.9067124202600194e-05,
5548
+ "loss": 1.4236,
5549
+ "step": 788
5550
+ },
5551
+ {
5552
+ "epoch": 0.7124153498871332,
5553
+ "grad_norm": 1.2373929023742676,
5554
+ "learning_rate": 3.884050467108117e-05,
5555
+ "loss": 0.69,
5556
+ "step": 789
5557
+ },
5558
+ {
5559
+ "epoch": 0.7133182844243793,
5560
+ "grad_norm": 1.5660927295684814,
5561
+ "learning_rate": 3.8614385817738794e-05,
5562
+ "loss": 1.1102,
5563
+ "step": 790
5564
+ },
5565
+ {
5566
+ "epoch": 0.7142212189616253,
5567
+ "grad_norm": 1.4448158740997314,
5568
+ "learning_rate": 3.83887694936802e-05,
5569
+ "loss": 0.8871,
5570
+ "step": 791
5571
+ },
5572
+ {
5573
+ "epoch": 0.7151241534988714,
5574
+ "grad_norm": 1.6486666202545166,
5575
+ "learning_rate": 3.816365754589845e-05,
5576
+ "loss": 1.2077,
5577
+ "step": 792
5578
+ },
5579
+ {
5580
+ "epoch": 0.7160270880361174,
5581
+ "grad_norm": 2.0939183235168457,
5582
+ "learning_rate": 3.793905181725772e-05,
5583
+ "loss": 1.3465,
5584
+ "step": 793
5585
+ },
5586
+ {
5587
+ "epoch": 0.7169300225733635,
5588
+ "grad_norm": 1.6600865125656128,
5589
+ "learning_rate": 3.771495414647802e-05,
5590
+ "loss": 0.9234,
5591
+ "step": 794
5592
+ },
5593
+ {
5594
+ "epoch": 0.7178329571106095,
5595
+ "grad_norm": 1.938519835472107,
5596
+ "learning_rate": 3.7491366368120104e-05,
5597
+ "loss": 1.4282,
5598
+ "step": 795
5599
+ },
5600
+ {
5601
+ "epoch": 0.7187358916478556,
5602
+ "grad_norm": 1.9457025527954102,
5603
+ "learning_rate": 3.726829031257062e-05,
5604
+ "loss": 1.3572,
5605
+ "step": 796
5606
+ },
5607
+ {
5608
+ "epoch": 0.7196388261851016,
5609
+ "grad_norm": 1.8345212936401367,
5610
+ "learning_rate": 3.704572780602701e-05,
5611
+ "loss": 1.0558,
5612
+ "step": 797
5613
+ },
5614
+ {
5615
+ "epoch": 0.7205417607223477,
5616
+ "grad_norm": 1.7764463424682617,
5617
+ "learning_rate": 3.6823680670482485e-05,
5618
+ "loss": 1.0214,
5619
+ "step": 798
5620
+ },
5621
+ {
5622
+ "epoch": 0.7214446952595936,
5623
+ "grad_norm": 1.5642400979995728,
5624
+ "learning_rate": 3.660215072371135e-05,
5625
+ "loss": 1.0181,
5626
+ "step": 799
5627
+ },
5628
+ {
5629
+ "epoch": 0.7223476297968398,
5630
+ "grad_norm": 1.6106574535369873,
5631
+ "learning_rate": 3.638113977925387e-05,
5632
+ "loss": 0.9659,
5633
+ "step": 800
5634
+ },
5635
+ {
5636
+ "epoch": 0.7232505643340857,
5637
+ "grad_norm": 1.6759979724884033,
5638
+ "learning_rate": 3.6160649646401635e-05,
5639
+ "loss": 1.0394,
5640
+ "step": 801
5641
+ },
5642
+ {
5643
+ "epoch": 0.7241534988713318,
5644
+ "grad_norm": 1.783288598060608,
5645
+ "learning_rate": 3.594068213018249e-05,
5646
+ "loss": 1.3223,
5647
+ "step": 802
5648
+ },
5649
+ {
5650
+ "epoch": 0.7250564334085778,
5651
+ "grad_norm": 1.7645738124847412,
5652
+ "learning_rate": 3.5721239031346066e-05,
5653
+ "loss": 1.0735,
5654
+ "step": 803
5655
+ },
5656
+ {
5657
+ "epoch": 0.7259593679458239,
5658
+ "grad_norm": 1.9430018663406372,
5659
+ "learning_rate": 3.550232214634884e-05,
5660
+ "loss": 1.5089,
5661
+ "step": 804
5662
+ },
5663
+ {
5664
+ "epoch": 0.7268623024830699,
5665
+ "grad_norm": 1.5665555000305176,
5666
+ "learning_rate": 3.528393326733941e-05,
5667
+ "loss": 1.0646,
5668
+ "step": 805
5669
+ },
5670
+ {
5671
+ "epoch": 0.727765237020316,
5672
+ "grad_norm": 1.7760690450668335,
5673
+ "learning_rate": 3.506607418214395e-05,
5674
+ "loss": 1.5192,
5675
+ "step": 806
5676
+ },
5677
+ {
5678
+ "epoch": 0.728668171557562,
5679
+ "grad_norm": 1.5534496307373047,
5680
+ "learning_rate": 3.4848746674251545e-05,
5681
+ "loss": 0.9254,
5682
+ "step": 807
5683
+ },
5684
+ {
5685
+ "epoch": 0.7295711060948081,
5686
+ "grad_norm": 1.872107744216919,
5687
+ "learning_rate": 3.463195252279939e-05,
5688
+ "loss": 1.2138,
5689
+ "step": 808
5690
+ },
5691
+ {
5692
+ "epoch": 0.7304740406320541,
5693
+ "grad_norm": 1.5574711561203003,
5694
+ "learning_rate": 3.441569350255868e-05,
5695
+ "loss": 1.206,
5696
+ "step": 809
5697
+ },
5698
+ {
5699
+ "epoch": 0.7313769751693002,
5700
+ "grad_norm": 1.5422251224517822,
5701
+ "learning_rate": 3.419997138391954e-05,
5702
+ "loss": 0.9381,
5703
+ "step": 810
5704
+ },
5705
+ {
5706
+ "epoch": 0.7322799097065463,
5707
+ "grad_norm": 1.3428008556365967,
5708
+ "learning_rate": 3.3984787932876814e-05,
5709
+ "loss": 0.982,
5710
+ "step": 811
5711
+ },
5712
+ {
5713
+ "epoch": 0.7331828442437923,
5714
+ "grad_norm": 1.870396614074707,
5715
+ "learning_rate": 3.377014491101577e-05,
5716
+ "loss": 1.3272,
5717
+ "step": 812
5718
+ },
5719
+ {
5720
+ "epoch": 0.7340857787810384,
5721
+ "grad_norm": 2.201111078262329,
5722
+ "learning_rate": 3.355604407549725e-05,
5723
+ "loss": 1.5567,
5724
+ "step": 813
5725
+ },
5726
+ {
5727
+ "epoch": 0.7349887133182844,
5728
+ "grad_norm": 1.5531777143478394,
5729
+ "learning_rate": 3.334248717904368e-05,
5730
+ "loss": 1.0115,
5731
+ "step": 814
5732
+ },
5733
+ {
5734
+ "epoch": 0.7358916478555305,
5735
+ "grad_norm": 1.5128211975097656,
5736
+ "learning_rate": 3.3129475969924526e-05,
5737
+ "loss": 0.929,
5738
+ "step": 815
5739
+ },
5740
+ {
5741
+ "epoch": 0.7367945823927765,
5742
+ "grad_norm": 1.2759476900100708,
5743
+ "learning_rate": 3.291701219194195e-05,
5744
+ "loss": 0.6896,
5745
+ "step": 816
5746
+ },
5747
+ {
5748
+ "epoch": 0.7376975169300226,
5749
+ "grad_norm": 1.4616997241973877,
5750
+ "learning_rate": 3.270509758441671e-05,
5751
+ "loss": 1.0415,
5752
+ "step": 817
5753
+ },
5754
+ {
5755
+ "epoch": 0.7386004514672686,
5756
+ "grad_norm": 1.445452094078064,
5757
+ "learning_rate": 3.24937338821738e-05,
5758
+ "loss": 1.0383,
5759
+ "step": 818
5760
+ },
5761
+ {
5762
+ "epoch": 0.7395033860045147,
5763
+ "grad_norm": 1.2269552946090698,
5764
+ "learning_rate": 3.2282922815528135e-05,
5765
+ "loss": 0.6495,
5766
+ "step": 819
5767
+ },
5768
+ {
5769
+ "epoch": 0.7404063205417607,
5770
+ "grad_norm": 1.6146233081817627,
5771
+ "learning_rate": 3.207266611027069e-05,
5772
+ "loss": 0.9821,
5773
+ "step": 820
5774
+ },
5775
+ {
5776
+ "epoch": 0.7413092550790068,
5777
+ "grad_norm": 1.2971818447113037,
5778
+ "learning_rate": 3.186296548765411e-05,
5779
+ "loss": 0.7807,
5780
+ "step": 821
5781
+ },
5782
+ {
5783
+ "epoch": 0.7422121896162528,
5784
+ "grad_norm": 1.6828197240829468,
5785
+ "learning_rate": 3.165382266437874e-05,
5786
+ "loss": 1.0824,
5787
+ "step": 822
5788
+ },
5789
+ {
5790
+ "epoch": 0.7431151241534989,
5791
+ "grad_norm": 1.1024787425994873,
5792
+ "learning_rate": 3.144523935257846e-05,
5793
+ "loss": 0.5144,
5794
+ "step": 823
5795
+ },
5796
+ {
5797
+ "epoch": 0.7440180586907449,
5798
+ "grad_norm": 1.3970650434494019,
5799
+ "learning_rate": 3.123721725980683e-05,
5800
+ "loss": 0.9,
5801
+ "step": 824
5802
+ },
5803
+ {
5804
+ "epoch": 0.744920993227991,
5805
+ "grad_norm": 1.3488242626190186,
5806
+ "learning_rate": 3.102975808902303e-05,
5807
+ "loss": 1.0081,
5808
+ "step": 825
5809
+ },
5810
+ {
5811
+ "epoch": 0.745823927765237,
5812
+ "grad_norm": 1.3460159301757812,
5813
+ "learning_rate": 3.082286353857782e-05,
5814
+ "loss": 1.2013,
5815
+ "step": 826
5816
+ },
5817
+ {
5818
+ "epoch": 0.7467268623024831,
5819
+ "grad_norm": 1.8649550676345825,
5820
+ "learning_rate": 3.061653530219983e-05,
5821
+ "loss": 1.3269,
5822
+ "step": 827
5823
+ },
5824
+ {
5825
+ "epoch": 0.7476297968397291,
5826
+ "grad_norm": 1.3285624980926514,
5827
+ "learning_rate": 3.0410775068981613e-05,
5828
+ "loss": 0.8373,
5829
+ "step": 828
5830
+ },
5831
+ {
5832
+ "epoch": 0.7485327313769752,
5833
+ "grad_norm": 1.1073821783065796,
5834
+ "learning_rate": 3.0205584523365626e-05,
5835
+ "loss": 0.6991,
5836
+ "step": 829
5837
+ },
5838
+ {
5839
+ "epoch": 0.7494356659142212,
5840
+ "grad_norm": 1.5901031494140625,
5841
+ "learning_rate": 3.0000965345130903e-05,
5842
+ "loss": 1.1712,
5843
+ "step": 830
5844
+ },
5845
+ {
5846
+ "epoch": 0.7503386004514673,
5847
+ "grad_norm": 1.817070722579956,
5848
+ "learning_rate": 2.979691920937875e-05,
5849
+ "loss": 1.4764,
5850
+ "step": 831
5851
+ },
5852
+ {
5853
+ "epoch": 0.7503386004514673,
5854
+ "eval_loss": 1.0855916738510132,
5855
+ "eval_runtime": 8.3193,
5856
+ "eval_samples_per_second": 56.135,
5857
+ "eval_steps_per_second": 28.127,
5858
+ "step": 831
5859
  }
5860
  ],
5861
  "logging_steps": 1,
 
5875
  "attributes": {}
5876
  }
5877
  },
5878
+ "total_flos": 2.1256313392594944e+16,
5879
  "train_batch_size": 2,
5880
  "trial_name": null,
5881
  "trial_params": null