{
"best_global_step": 7032,
"best_metric": 0.1703886240720749,
"best_model_checkpoint": "runs/de_biolord/checkpoint-7032",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 7032,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002133560913164071,
"grad_norm": 1407.41943359375,
"learning_rate": 1.1363636363636364e-07,
"loss": 260.9562,
"step": 5
},
{
"epoch": 0.004267121826328142,
"grad_norm": 1352.476806640625,
"learning_rate": 2.556818181818182e-07,
"loss": 243.0724,
"step": 10
},
{
"epoch": 0.006400682739492213,
"grad_norm": 1270.781982421875,
"learning_rate": 3.9772727272727276e-07,
"loss": 226.9033,
"step": 15
},
{
"epoch": 0.008534243652656284,
"grad_norm": 1122.8021240234375,
"learning_rate": 5.397727272727273e-07,
"loss": 247.598,
"step": 20
},
{
"epoch": 0.010667804565820354,
"grad_norm": 1343.4222412109375,
"learning_rate": 6.818181818181818e-07,
"loss": 237.7835,
"step": 25
},
{
"epoch": 0.012801365478984425,
"grad_norm": 1176.0780029296875,
"learning_rate": 8.238636363636364e-07,
"loss": 218.3724,
"step": 30
},
{
"epoch": 0.014934926392148495,
"grad_norm": 1119.2242431640625,
"learning_rate": 9.65909090909091e-07,
"loss": 208.53,
"step": 35
},
{
"epoch": 0.017068487305312567,
"grad_norm": 1353.07958984375,
"learning_rate": 1.1079545454545456e-06,
"loss": 224.4951,
"step": 40
},
{
"epoch": 0.01920204821847664,
"grad_norm": 1231.170166015625,
"learning_rate": 1.25e-06,
"loss": 207.6082,
"step": 45
},
{
"epoch": 0.021335609131640707,
"grad_norm": 1226.5369873046875,
"learning_rate": 1.3920454545454546e-06,
"loss": 215.5779,
"step": 50
},
{
"epoch": 0.02346917004480478,
"grad_norm": 1271.869873046875,
"learning_rate": 1.5340909090909093e-06,
"loss": 235.3491,
"step": 55
},
{
"epoch": 0.02560273095796885,
"grad_norm": 1287.97900390625,
"learning_rate": 1.6761363636363636e-06,
"loss": 221.6164,
"step": 60
},
{
"epoch": 0.027736291871132922,
"grad_norm": 1170.921630859375,
"learning_rate": 1.8181818181818183e-06,
"loss": 212.6889,
"step": 65
},
{
"epoch": 0.02986985278429699,
"grad_norm": 1338.1890869140625,
"learning_rate": 1.9602272727272728e-06,
"loss": 216.2898,
"step": 70
},
{
"epoch": 0.03200341369746106,
"grad_norm": 1271.138916015625,
"learning_rate": 2.1022727272727277e-06,
"loss": 231.7364,
"step": 75
},
{
"epoch": 0.034136974610625134,
"grad_norm": 1132.897705078125,
"learning_rate": 2.2443181818181818e-06,
"loss": 206.7381,
"step": 80
},
{
"epoch": 0.036270535523789206,
"grad_norm": 1292.3948974609375,
"learning_rate": 2.3863636363636367e-06,
"loss": 194.5393,
"step": 85
},
{
"epoch": 0.03840409643695328,
"grad_norm": 1246.8048095703125,
"learning_rate": 2.528409090909091e-06,
"loss": 212.0394,
"step": 90
},
{
"epoch": 0.04053765735011734,
"grad_norm": 970.118896484375,
"learning_rate": 2.6704545454545457e-06,
"loss": 199.709,
"step": 95
},
{
"epoch": 0.042671218263281414,
"grad_norm": 1266.0760498046875,
"learning_rate": 2.8125e-06,
"loss": 208.5693,
"step": 100
},
{
"epoch": 0.044804779176445486,
"grad_norm": 1050.2158203125,
"learning_rate": 2.954545454545455e-06,
"loss": 185.4214,
"step": 105
},
{
"epoch": 0.04693834008960956,
"grad_norm": 1134.134033203125,
"learning_rate": 3.096590909090909e-06,
"loss": 207.6509,
"step": 110
},
{
"epoch": 0.04907190100277363,
"grad_norm": 1127.47021484375,
"learning_rate": 3.2386363636363637e-06,
"loss": 207.7226,
"step": 115
},
{
"epoch": 0.0512054619159377,
"grad_norm": 1074.081787109375,
"learning_rate": 3.3806818181818186e-06,
"loss": 176.0556,
"step": 120
},
{
"epoch": 0.05333902282910177,
"grad_norm": 1120.4603271484375,
"learning_rate": 3.522727272727273e-06,
"loss": 194.877,
"step": 125
},
{
"epoch": 0.055472583742265845,
"grad_norm": 1111.07080078125,
"learning_rate": 3.6647727272727276e-06,
"loss": 194.9979,
"step": 130
},
{
"epoch": 0.05760614465542991,
"grad_norm": 1141.6068115234375,
"learning_rate": 3.806818181818182e-06,
"loss": 201.5107,
"step": 135
},
{
"epoch": 0.05973970556859398,
"grad_norm": 1131.10986328125,
"learning_rate": 3.9488636363636366e-06,
"loss": 191.0996,
"step": 140
},
{
"epoch": 0.06187326648175805,
"grad_norm": 966.1857299804688,
"learning_rate": 4.0909090909090915e-06,
"loss": 156.6461,
"step": 145
},
{
"epoch": 0.06400682739492213,
"grad_norm": 1031.0262451171875,
"learning_rate": 4.2329545454545455e-06,
"loss": 163.2723,
"step": 150
},
{
"epoch": 0.06614038830808619,
"grad_norm": 1065.27392578125,
"learning_rate": 4.3750000000000005e-06,
"loss": 160.0,
"step": 155
},
{
"epoch": 0.06827394922125027,
"grad_norm": 1084.3116455078125,
"learning_rate": 4.517045454545455e-06,
"loss": 142.0651,
"step": 160
},
{
"epoch": 0.07040751013441433,
"grad_norm": 986.8313598632812,
"learning_rate": 4.6590909090909095e-06,
"loss": 147.7512,
"step": 165
},
{
"epoch": 0.07254107104757841,
"grad_norm": 877.9501342773438,
"learning_rate": 4.8011363636363635e-06,
"loss": 140.8079,
"step": 170
},
{
"epoch": 0.07467463196074248,
"grad_norm": 947.1781005859375,
"learning_rate": 4.9431818181818184e-06,
"loss": 125.8033,
"step": 175
},
{
"epoch": 0.07680819287390656,
"grad_norm": 901.2457885742188,
"learning_rate": 5.085227272727273e-06,
"loss": 139.7856,
"step": 180
},
{
"epoch": 0.07894175378707062,
"grad_norm": 741.2592163085938,
"learning_rate": 5.2272727272727274e-06,
"loss": 116.0057,
"step": 185
},
{
"epoch": 0.08107531470023469,
"grad_norm": 747.7615966796875,
"learning_rate": 5.369318181818182e-06,
"loss": 110.8626,
"step": 190
},
{
"epoch": 0.08320887561339876,
"grad_norm": 747.3321533203125,
"learning_rate": 5.511363636363637e-06,
"loss": 109.0995,
"step": 195
},
{
"epoch": 0.08534243652656283,
"grad_norm": 780.7266845703125,
"learning_rate": 5.653409090909091e-06,
"loss": 119.4655,
"step": 200
},
{
"epoch": 0.08747599743972691,
"grad_norm": 695.3804321289062,
"learning_rate": 5.795454545454546e-06,
"loss": 110.2229,
"step": 205
},
{
"epoch": 0.08960955835289097,
"grad_norm": 789.1644897460938,
"learning_rate": 5.9375e-06,
"loss": 112.9283,
"step": 210
},
{
"epoch": 0.09174311926605505,
"grad_norm": 767.181396484375,
"learning_rate": 6.079545454545454e-06,
"loss": 112.8705,
"step": 215
},
{
"epoch": 0.09387668017921912,
"grad_norm": 720.9426879882812,
"learning_rate": 6.22159090909091e-06,
"loss": 99.1093,
"step": 220
},
{
"epoch": 0.09601024109238318,
"grad_norm": 646.9892578125,
"learning_rate": 6.363636363636364e-06,
"loss": 84.201,
"step": 225
},
{
"epoch": 0.09814380200554726,
"grad_norm": 596.7828979492188,
"learning_rate": 6.505681818181818e-06,
"loss": 85.2275,
"step": 230
},
{
"epoch": 0.10027736291871132,
"grad_norm": 511.76837158203125,
"learning_rate": 6.647727272727273e-06,
"loss": 73.787,
"step": 235
},
{
"epoch": 0.1024109238318754,
"grad_norm": 587.9999389648438,
"learning_rate": 6.789772727272727e-06,
"loss": 79.8435,
"step": 240
},
{
"epoch": 0.10454448474503947,
"grad_norm": 537.0763549804688,
"learning_rate": 6.931818181818183e-06,
"loss": 67.2795,
"step": 245
},
{
"epoch": 0.10667804565820355,
"grad_norm": 509.71734619140625,
"learning_rate": 7.073863636363637e-06,
"loss": 56.0195,
"step": 250
},
{
"epoch": 0.10881160657136761,
"grad_norm": 365.5326232910156,
"learning_rate": 7.215909090909091e-06,
"loss": 49.6022,
"step": 255
},
{
"epoch": 0.11094516748453169,
"grad_norm": 407.3646240234375,
"learning_rate": 7.357954545454546e-06,
"loss": 52.5486,
"step": 260
},
{
"epoch": 0.11307872839769575,
"grad_norm": 401.7477722167969,
"learning_rate": 7.500000000000001e-06,
"loss": 42.2398,
"step": 265
},
{
"epoch": 0.11521228931085982,
"grad_norm": 341.845947265625,
"learning_rate": 7.642045454545454e-06,
"loss": 40.5546,
"step": 270
},
{
"epoch": 0.1173458502240239,
"grad_norm": 294.14501953125,
"learning_rate": 7.784090909090911e-06,
"loss": 40.2339,
"step": 275
},
{
"epoch": 0.11947941113718796,
"grad_norm": 306.5284423828125,
"learning_rate": 7.926136363636364e-06,
"loss": 33.0703,
"step": 280
},
{
"epoch": 0.12161297205035204,
"grad_norm": 261.7529296875,
"learning_rate": 8.068181818181819e-06,
"loss": 28.2356,
"step": 285
},
{
"epoch": 0.1237465329635161,
"grad_norm": 180.16305541992188,
"learning_rate": 8.210227272727274e-06,
"loss": 25.7024,
"step": 290
},
{
"epoch": 0.12588009387668017,
"grad_norm": 198.47352600097656,
"learning_rate": 8.352272727272727e-06,
"loss": 21.9268,
"step": 295
},
{
"epoch": 0.12801365478984425,
"grad_norm": 155.0713653564453,
"learning_rate": 8.494318181818184e-06,
"loss": 18.9837,
"step": 300
},
{
"epoch": 0.13014721570300833,
"grad_norm": 114.19086456298828,
"learning_rate": 8.636363636363637e-06,
"loss": 14.5069,
"step": 305
},
{
"epoch": 0.13228077661617238,
"grad_norm": 127.5108413696289,
"learning_rate": 8.778409090909092e-06,
"loss": 13.9749,
"step": 310
},
{
"epoch": 0.13441433752933646,
"grad_norm": 101.1385726928711,
"learning_rate": 8.920454545454547e-06,
"loss": 10.4993,
"step": 315
},
{
"epoch": 0.13654789844250054,
"grad_norm": 102.41287994384766,
"learning_rate": 9.0625e-06,
"loss": 10.8756,
"step": 320
},
{
"epoch": 0.13868145935566462,
"grad_norm": 82.27557373046875,
"learning_rate": 9.204545454545455e-06,
"loss": 8.448,
"step": 325
},
{
"epoch": 0.14081502026882867,
"grad_norm": 59.02256774902344,
"learning_rate": 9.34659090909091e-06,
"loss": 7.3985,
"step": 330
},
{
"epoch": 0.14294858118199275,
"grad_norm": 55.171485900878906,
"learning_rate": 9.488636363636365e-06,
"loss": 6.7277,
"step": 335
},
{
"epoch": 0.14508214209515682,
"grad_norm": 51.14794921875,
"learning_rate": 9.630681818181818e-06,
"loss": 5.369,
"step": 340
},
{
"epoch": 0.14721570300832088,
"grad_norm": 47.7487678527832,
"learning_rate": 9.772727272727273e-06,
"loss": 5.1479,
"step": 345
},
{
"epoch": 0.14934926392148495,
"grad_norm": 34.616519927978516,
"learning_rate": 9.914772727272728e-06,
"loss": 3.9802,
"step": 350
},
{
"epoch": 0.15148282483464903,
"grad_norm": 28.53857421875,
"learning_rate": 1.0056818181818183e-05,
"loss": 3.3223,
"step": 355
},
{
"epoch": 0.1536163857478131,
"grad_norm": 26.984251022338867,
"learning_rate": 1.0198863636363636e-05,
"loss": 3.148,
"step": 360
},
{
"epoch": 0.15574994666097716,
"grad_norm": 25.347354888916016,
"learning_rate": 1.0340909090909093e-05,
"loss": 3.1232,
"step": 365
},
{
"epoch": 0.15788350757414124,
"grad_norm": 21.828699111938477,
"learning_rate": 1.0482954545454548e-05,
"loss": 2.6608,
"step": 370
},
{
"epoch": 0.16001706848730532,
"grad_norm": 19.460886001586914,
"learning_rate": 1.0625e-05,
"loss": 2.6175,
"step": 375
},
{
"epoch": 0.16215062940046937,
"grad_norm": 14.729393005371094,
"learning_rate": 1.0767045454545456e-05,
"loss": 2.1366,
"step": 380
},
{
"epoch": 0.16428419031363345,
"grad_norm": 15.992349624633789,
"learning_rate": 1.0909090909090909e-05,
"loss": 2.2121,
"step": 385
},
{
"epoch": 0.16641775122679753,
"grad_norm": 14.3778715133667,
"learning_rate": 1.1051136363636366e-05,
"loss": 2.2454,
"step": 390
},
{
"epoch": 0.1685513121399616,
"grad_norm": 14.635651588439941,
"learning_rate": 1.119318181818182e-05,
"loss": 2.1155,
"step": 395
},
{
"epoch": 0.17068487305312566,
"grad_norm": 14.150617599487305,
"learning_rate": 1.1335227272727274e-05,
"loss": 1.988,
"step": 400
},
{
"epoch": 0.17281843396628974,
"grad_norm": 13.40462875366211,
"learning_rate": 1.1477272727272729e-05,
"loss": 1.965,
"step": 405
},
{
"epoch": 0.17495199487945381,
"grad_norm": 12.215423583984375,
"learning_rate": 1.1619318181818182e-05,
"loss": 1.8777,
"step": 410
},
{
"epoch": 0.17708555579261787,
"grad_norm": 14.43690013885498,
"learning_rate": 1.1761363636363637e-05,
"loss": 1.9024,
"step": 415
},
{
"epoch": 0.17921911670578194,
"grad_norm": 11.984452247619629,
"learning_rate": 1.1903409090909093e-05,
"loss": 1.7564,
"step": 420
},
{
"epoch": 0.18135267761894602,
"grad_norm": 11.757567405700684,
"learning_rate": 1.2045454545454547e-05,
"loss": 1.7513,
"step": 425
},
{
"epoch": 0.1834862385321101,
"grad_norm": 12.034418106079102,
"learning_rate": 1.2187500000000001e-05,
"loss": 1.8298,
"step": 430
},
{
"epoch": 0.18561979944527415,
"grad_norm": 9.716742515563965,
"learning_rate": 1.2329545454545455e-05,
"loss": 1.5716,
"step": 435
},
{
"epoch": 0.18775336035843823,
"grad_norm": 9.575140953063965,
"learning_rate": 1.247159090909091e-05,
"loss": 1.5193,
"step": 440
},
{
"epoch": 0.1898869212716023,
"grad_norm": 10.11562442779541,
"learning_rate": 1.2613636363636366e-05,
"loss": 1.4442,
"step": 445
},
{
"epoch": 0.19202048218476636,
"grad_norm": 9.89345645904541,
"learning_rate": 1.275568181818182e-05,
"loss": 1.5014,
"step": 450
},
{
"epoch": 0.19415404309793044,
"grad_norm": 9.934481620788574,
"learning_rate": 1.2897727272727274e-05,
"loss": 1.4301,
"step": 455
},
{
"epoch": 0.19628760401109452,
"grad_norm": 9.267789840698242,
"learning_rate": 1.3039772727272728e-05,
"loss": 1.5181,
"step": 460
},
{
"epoch": 0.1984211649242586,
"grad_norm": 8.098825454711914,
"learning_rate": 1.3181818181818183e-05,
"loss": 1.328,
"step": 465
},
{
"epoch": 0.20055472583742265,
"grad_norm": 8.572412490844727,
"learning_rate": 1.3323863636363636e-05,
"loss": 1.2831,
"step": 470
},
{
"epoch": 0.20268828675058673,
"grad_norm": 9.359737396240234,
"learning_rate": 1.3465909090909092e-05,
"loss": 1.4169,
"step": 475
},
{
"epoch": 0.2048218476637508,
"grad_norm": 11.379630088806152,
"learning_rate": 1.3607954545454547e-05,
"loss": 1.6943,
"step": 480
},
{
"epoch": 0.20695540857691488,
"grad_norm": 8.644891738891602,
"learning_rate": 1.375e-05,
"loss": 1.2827,
"step": 485
},
{
"epoch": 0.20908896949007894,
"grad_norm": 8.371868133544922,
"learning_rate": 1.3892045454545455e-05,
"loss": 1.3442,
"step": 490
},
{
"epoch": 0.21122253040324301,
"grad_norm": 8.641389846801758,
"learning_rate": 1.4034090909090909e-05,
"loss": 1.3282,
"step": 495
},
{
"epoch": 0.2133560913164071,
"grad_norm": 8.642644882202148,
"learning_rate": 1.4176136363636365e-05,
"loss": 1.2025,
"step": 500
},
{
"epoch": 0.21548965222957114,
"grad_norm": 8.546757698059082,
"learning_rate": 1.431818181818182e-05,
"loss": 1.4431,
"step": 505
},
{
"epoch": 0.21762321314273522,
"grad_norm": 9.831377983093262,
"learning_rate": 1.4460227272727273e-05,
"loss": 1.2434,
"step": 510
},
{
"epoch": 0.2197567740558993,
"grad_norm": 8.539069175720215,
"learning_rate": 1.4602272727272728e-05,
"loss": 1.1499,
"step": 515
},
{
"epoch": 0.22189033496906338,
"grad_norm": 7.561169147491455,
"learning_rate": 1.4744318181818183e-05,
"loss": 1.0773,
"step": 520
},
{
"epoch": 0.22402389588222743,
"grad_norm": 8.947616577148438,
"learning_rate": 1.4886363636363636e-05,
"loss": 1.2212,
"step": 525
},
{
"epoch": 0.2261574567953915,
"grad_norm": 9.044981002807617,
"learning_rate": 1.5028409090909093e-05,
"loss": 1.2723,
"step": 530
},
{
"epoch": 0.2282910177085556,
"grad_norm": 8.213238716125488,
"learning_rate": 1.5170454545454546e-05,
"loss": 1.2088,
"step": 535
},
{
"epoch": 0.23042457862171964,
"grad_norm": 7.895395278930664,
"learning_rate": 1.5312500000000003e-05,
"loss": 1.1508,
"step": 540
},
{
"epoch": 0.23255813953488372,
"grad_norm": 7.332098007202148,
"learning_rate": 1.5454545454545454e-05,
"loss": 1.1426,
"step": 545
},
{
"epoch": 0.2346917004480478,
"grad_norm": 7.83052921295166,
"learning_rate": 1.559659090909091e-05,
"loss": 1.1775,
"step": 550
},
{
"epoch": 0.23682526136121188,
"grad_norm": 8.019132614135742,
"learning_rate": 1.5738636363636364e-05,
"loss": 1.2678,
"step": 555
},
{
"epoch": 0.23895882227437593,
"grad_norm": 7.728061199188232,
"learning_rate": 1.588068181818182e-05,
"loss": 1.1158,
"step": 560
},
{
"epoch": 0.24109238318754,
"grad_norm": 6.924149513244629,
"learning_rate": 1.6022727272727274e-05,
"loss": 1.0517,
"step": 565
},
{
"epoch": 0.24322594410070408,
"grad_norm": 7.8373894691467285,
"learning_rate": 1.616477272727273e-05,
"loss": 1.1448,
"step": 570
},
{
"epoch": 0.24535950501386813,
"grad_norm": 8.716146469116211,
"learning_rate": 1.6306818181818184e-05,
"loss": 1.16,
"step": 575
},
{
"epoch": 0.2474930659270322,
"grad_norm": 6.993872165679932,
"learning_rate": 1.6448863636363635e-05,
"loss": 1.0683,
"step": 580
},
{
"epoch": 0.2496266268401963,
"grad_norm": 6.647220134735107,
"learning_rate": 1.6590909090909094e-05,
"loss": 1.0468,
"step": 585
},
{
"epoch": 0.25176018775336034,
"grad_norm": 6.776019096374512,
"learning_rate": 1.673295454545455e-05,
"loss": 0.9907,
"step": 590
},
{
"epoch": 0.2538937486665244,
"grad_norm": 8.398134231567383,
"learning_rate": 1.6875e-05,
"loss": 1.1522,
"step": 595
},
{
"epoch": 0.2560273095796885,
"grad_norm": 7.639389514923096,
"learning_rate": 1.7017045454545455e-05,
"loss": 1.0523,
"step": 600
},
{
"epoch": 0.2581608704928526,
"grad_norm": 7.4302263259887695,
"learning_rate": 1.715909090909091e-05,
"loss": 1.0185,
"step": 605
},
{
"epoch": 0.26029443140601666,
"grad_norm": 6.816908836364746,
"learning_rate": 1.7301136363636365e-05,
"loss": 1.0248,
"step": 610
},
{
"epoch": 0.26242799231918074,
"grad_norm": 5.953906536102295,
"learning_rate": 1.744318181818182e-05,
"loss": 0.9046,
"step": 615
},
{
"epoch": 0.26456155323234476,
"grad_norm": 7.096611022949219,
"learning_rate": 1.7585227272727275e-05,
"loss": 0.9747,
"step": 620
},
{
"epoch": 0.26669511414550884,
"grad_norm": 6.390513896942139,
"learning_rate": 1.772727272727273e-05,
"loss": 0.9869,
"step": 625
},
{
"epoch": 0.2688286750586729,
"grad_norm": 7.380507946014404,
"learning_rate": 1.786931818181818e-05,
"loss": 1.0425,
"step": 630
},
{
"epoch": 0.270962235971837,
"grad_norm": 6.44447660446167,
"learning_rate": 1.8011363636363636e-05,
"loss": 0.9651,
"step": 635
},
{
"epoch": 0.2730957968850011,
"grad_norm": 6.748442649841309,
"learning_rate": 1.8153409090909094e-05,
"loss": 1.0077,
"step": 640
},
{
"epoch": 0.27522935779816515,
"grad_norm": 6.084346771240234,
"learning_rate": 1.8295454545454546e-05,
"loss": 0.8222,
"step": 645
},
{
"epoch": 0.27736291871132923,
"grad_norm": 6.8281073570251465,
"learning_rate": 1.84375e-05,
"loss": 0.8685,
"step": 650
},
{
"epoch": 0.27949647962449325,
"grad_norm": 7.083287715911865,
"learning_rate": 1.8579545454545456e-05,
"loss": 0.8913,
"step": 655
},
{
"epoch": 0.28163004053765733,
"grad_norm": 6.964037895202637,
"learning_rate": 1.872159090909091e-05,
"loss": 0.935,
"step": 660
},
{
"epoch": 0.2837636014508214,
"grad_norm": 6.590587139129639,
"learning_rate": 1.8863636363636366e-05,
"loss": 0.967,
"step": 665
},
{
"epoch": 0.2858971623639855,
"grad_norm": 5.6939921379089355,
"learning_rate": 1.900568181818182e-05,
"loss": 0.9005,
"step": 670
},
{
"epoch": 0.28803072327714957,
"grad_norm": 6.028235912322998,
"learning_rate": 1.9147727272727276e-05,
"loss": 0.8015,
"step": 675
},
{
"epoch": 0.29016428419031365,
"grad_norm": 7.533740043640137,
"learning_rate": 1.9289772727272727e-05,
"loss": 0.912,
"step": 680
},
{
"epoch": 0.2922978451034777,
"grad_norm": 6.917770862579346,
"learning_rate": 1.9431818181818182e-05,
"loss": 0.9849,
"step": 685
},
{
"epoch": 0.29443140601664175,
"grad_norm": 8.427461624145508,
"learning_rate": 1.9573863636363637e-05,
"loss": 1.0628,
"step": 690
},
{
"epoch": 0.29656496692980583,
"grad_norm": 5.852340221405029,
"learning_rate": 1.9715909090909092e-05,
"loss": 0.8542,
"step": 695
},
{
"epoch": 0.2986985278429699,
"grad_norm": 6.263002395629883,
"learning_rate": 1.9857954545454547e-05,
"loss": 0.911,
"step": 700
},
{
"epoch": 0.300832088756134,
"grad_norm": 6.515467643737793,
"learning_rate": 2e-05,
"loss": 0.9248,
"step": 705
},
{
"epoch": 0.30296564966929806,
"grad_norm": 5.9300217628479,
"learning_rate": 1.9984197218710496e-05,
"loss": 0.8878,
"step": 710
},
{
"epoch": 0.30509921058246214,
"grad_norm": 6.8438801765441895,
"learning_rate": 1.9968394437420987e-05,
"loss": 0.9006,
"step": 715
},
{
"epoch": 0.3072327714956262,
"grad_norm": 6.085014820098877,
"learning_rate": 1.9952591656131482e-05,
"loss": 0.9037,
"step": 720
},
{
"epoch": 0.30936633240879025,
"grad_norm": 6.003873348236084,
"learning_rate": 1.9936788874841973e-05,
"loss": 0.8492,
"step": 725
},
{
"epoch": 0.3114998933219543,
"grad_norm": 6.141636848449707,
"learning_rate": 1.9920986093552468e-05,
"loss": 0.8886,
"step": 730
},
{
"epoch": 0.3136334542351184,
"grad_norm": 6.797048091888428,
"learning_rate": 1.990518331226296e-05,
"loss": 0.8558,
"step": 735
},
{
"epoch": 0.3157670151482825,
"grad_norm": 5.367070198059082,
"learning_rate": 1.9889380530973453e-05,
"loss": 0.8353,
"step": 740
},
{
"epoch": 0.31790057606144656,
"grad_norm": 6.601510524749756,
"learning_rate": 1.9873577749683945e-05,
"loss": 0.8278,
"step": 745
},
{
"epoch": 0.32003413697461064,
"grad_norm": 6.147311687469482,
"learning_rate": 1.985777496839444e-05,
"loss": 0.8283,
"step": 750
},
{
"epoch": 0.3221676978877747,
"grad_norm": 6.131889820098877,
"learning_rate": 1.984197218710493e-05,
"loss": 0.867,
"step": 755
},
{
"epoch": 0.32430125880093874,
"grad_norm": 6.493585109710693,
"learning_rate": 1.9826169405815425e-05,
"loss": 0.7298,
"step": 760
},
{
"epoch": 0.3264348197141028,
"grad_norm": 6.721904277801514,
"learning_rate": 1.981036662452592e-05,
"loss": 0.874,
"step": 765
},
{
"epoch": 0.3285683806272669,
"grad_norm": 5.271955490112305,
"learning_rate": 1.979456384323641e-05,
"loss": 0.7672,
"step": 770
},
{
"epoch": 0.330701941540431,
"grad_norm": 5.658133506774902,
"learning_rate": 1.9778761061946905e-05,
"loss": 0.8225,
"step": 775
},
{
"epoch": 0.33283550245359506,
"grad_norm": 5.675405025482178,
"learning_rate": 1.97629582806574e-05,
"loss": 0.8248,
"step": 780
},
{
"epoch": 0.33496906336675913,
"grad_norm": 6.578061580657959,
"learning_rate": 1.974715549936789e-05,
"loss": 0.8961,
"step": 785
},
{
"epoch": 0.3371026242799232,
"grad_norm": 5.728764057159424,
"learning_rate": 1.9731352718078382e-05,
"loss": 0.8242,
"step": 790
},
{
"epoch": 0.33923618519308724,
"grad_norm": 6.199042320251465,
"learning_rate": 1.9715549936788877e-05,
"loss": 0.764,
"step": 795
},
{
"epoch": 0.3413697461062513,
"grad_norm": 5.93390417098999,
"learning_rate": 1.9699747155499368e-05,
"loss": 0.7479,
"step": 800
},
{
"epoch": 0.3435033070194154,
"grad_norm": 6.78892183303833,
"learning_rate": 1.9683944374209863e-05,
"loss": 0.9007,
"step": 805
},
{
"epoch": 0.3456368679325795,
"grad_norm": 5.648488998413086,
"learning_rate": 1.9668141592920357e-05,
"loss": 0.8867,
"step": 810
},
{
"epoch": 0.34777042884574355,
"grad_norm": 6.640005588531494,
"learning_rate": 1.9652338811630848e-05,
"loss": 0.8493,
"step": 815
},
{
"epoch": 0.34990398975890763,
"grad_norm": 6.431436061859131,
"learning_rate": 1.9636536030341343e-05,
"loss": 0.7158,
"step": 820
},
{
"epoch": 0.3520375506720717,
"grad_norm": 6.325982093811035,
"learning_rate": 1.9620733249051834e-05,
"loss": 0.7642,
"step": 825
},
{
"epoch": 0.35417111158523573,
"grad_norm": 5.668117046356201,
"learning_rate": 1.960493046776233e-05,
"loss": 0.7993,
"step": 830
},
{
"epoch": 0.3563046724983998,
"grad_norm": 5.590553283691406,
"learning_rate": 1.9589127686472823e-05,
"loss": 0.7103,
"step": 835
},
{
"epoch": 0.3584382334115639,
"grad_norm": 5.006702423095703,
"learning_rate": 1.9573324905183314e-05,
"loss": 0.7772,
"step": 840
},
{
"epoch": 0.36057179432472797,
"grad_norm": 5.906203269958496,
"learning_rate": 1.9557522123893806e-05,
"loss": 0.8199,
"step": 845
},
{
"epoch": 0.36270535523789205,
"grad_norm": 5.0988264083862305,
"learning_rate": 1.95417193426043e-05,
"loss": 0.9472,
"step": 850
},
{
"epoch": 0.3648389161510561,
"grad_norm": 5.713870525360107,
"learning_rate": 1.952591656131479e-05,
"loss": 0.7318,
"step": 855
},
{
"epoch": 0.3669724770642202,
"grad_norm": 5.6474432945251465,
"learning_rate": 1.9510113780025286e-05,
"loss": 0.6891,
"step": 860
},
{
"epoch": 0.3691060379773842,
"grad_norm": 5.688441753387451,
"learning_rate": 1.949431099873578e-05,
"loss": 0.8185,
"step": 865
},
{
"epoch": 0.3712395988905483,
"grad_norm": 5.583171367645264,
"learning_rate": 1.947850821744627e-05,
"loss": 0.7771,
"step": 870
},
{
"epoch": 0.3733731598037124,
"grad_norm": 6.464980125427246,
"learning_rate": 1.9462705436156766e-05,
"loss": 0.7638,
"step": 875
},
{
"epoch": 0.37550672071687646,
"grad_norm": 6.436215877532959,
"learning_rate": 1.944690265486726e-05,
"loss": 1.1106,
"step": 880
},
{
"epoch": 0.37764028163004054,
"grad_norm": 6.303576946258545,
"learning_rate": 1.9431099873577752e-05,
"loss": 0.8603,
"step": 885
},
{
"epoch": 0.3797738425432046,
"grad_norm": 5.866418838500977,
"learning_rate": 1.9415297092288243e-05,
"loss": 0.9143,
"step": 890
},
{
"epoch": 0.3819074034563687,
"grad_norm": 5.543066024780273,
"learning_rate": 1.9399494310998738e-05,
"loss": 0.7764,
"step": 895
},
{
"epoch": 0.3840409643695327,
"grad_norm": 5.441503047943115,
"learning_rate": 1.938369152970923e-05,
"loss": 0.8121,
"step": 900
},
{
"epoch": 0.3861745252826968,
"grad_norm": 5.828523635864258,
"learning_rate": 1.9367888748419723e-05,
"loss": 0.7664,
"step": 905
},
{
"epoch": 0.3883080861958609,
"grad_norm": 5.6354804039001465,
"learning_rate": 1.9352085967130218e-05,
"loss": 0.7424,
"step": 910
},
{
"epoch": 0.39044164710902496,
"grad_norm": 6.025097846984863,
"learning_rate": 1.933628318584071e-05,
"loss": 0.7377,
"step": 915
},
{
"epoch": 0.39257520802218904,
"grad_norm": 7.527228355407715,
"learning_rate": 1.9320480404551204e-05,
"loss": 0.8329,
"step": 920
},
{
"epoch": 0.3947087689353531,
"grad_norm": 5.251444339752197,
"learning_rate": 1.9304677623261695e-05,
"loss": 0.6529,
"step": 925
},
{
"epoch": 0.3968423298485172,
"grad_norm": 5.458242416381836,
"learning_rate": 1.928887484197219e-05,
"loss": 0.653,
"step": 930
},
{
"epoch": 0.3989758907616813,
"grad_norm": 5.475050926208496,
"learning_rate": 1.9273072060682684e-05,
"loss": 0.6713,
"step": 935
},
{
"epoch": 0.4011094516748453,
"grad_norm": 5.815523624420166,
"learning_rate": 1.9257269279393175e-05,
"loss": 0.8805,
"step": 940
},
{
"epoch": 0.4032430125880094,
"grad_norm": 5.540900707244873,
"learning_rate": 1.9241466498103666e-05,
"loss": 0.7712,
"step": 945
},
{
"epoch": 0.40537657350117345,
"grad_norm": 6.0890045166015625,
"learning_rate": 1.922566371681416e-05,
"loss": 0.7399,
"step": 950
},
{
"epoch": 0.40751013441433753,
"grad_norm": 6.03472375869751,
"learning_rate": 1.9209860935524652e-05,
"loss": 0.7024,
"step": 955
},
{
"epoch": 0.4096436953275016,
"grad_norm": 6.81519079208374,
"learning_rate": 1.9194058154235147e-05,
"loss": 0.7185,
"step": 960
},
{
"epoch": 0.4117772562406657,
"grad_norm": 4.7606425285339355,
"learning_rate": 1.917825537294564e-05,
"loss": 0.6943,
"step": 965
},
{
"epoch": 0.41391081715382977,
"grad_norm": 5.0476555824279785,
"learning_rate": 1.9162452591656132e-05,
"loss": 0.7553,
"step": 970
},
{
"epoch": 0.4160443780669938,
"grad_norm": 4.818223476409912,
"learning_rate": 1.9146649810366627e-05,
"loss": 0.5834,
"step": 975
},
{
"epoch": 0.41817793898015787,
"grad_norm": 5.1618499755859375,
"learning_rate": 1.9130847029077118e-05,
"loss": 0.6494,
"step": 980
},
{
"epoch": 0.42031149989332195,
"grad_norm": 5.9180731773376465,
"learning_rate": 1.9115044247787613e-05,
"loss": 0.7961,
"step": 985
},
{
"epoch": 0.42244506080648603,
"grad_norm": 5.7833051681518555,
"learning_rate": 1.9099241466498107e-05,
"loss": 0.6523,
"step": 990
},
{
"epoch": 0.4245786217196501,
"grad_norm": 6.119492053985596,
"learning_rate": 1.90834386852086e-05,
"loss": 0.9619,
"step": 995
},
{
"epoch": 0.4267121826328142,
"grad_norm": 5.727389812469482,
"learning_rate": 1.906763590391909e-05,
"loss": 0.7668,
"step": 1000
},
{
"epoch": 0.42884574354597826,
"grad_norm": 4.694662570953369,
"learning_rate": 1.9051833122629584e-05,
"loss": 0.6901,
"step": 1005
},
{
"epoch": 0.4309793044591423,
"grad_norm": 5.501200199127197,
"learning_rate": 1.9036030341340075e-05,
"loss": 0.6974,
"step": 1010
},
{
"epoch": 0.43311286537230637,
"grad_norm": 8.685210227966309,
"learning_rate": 1.902022756005057e-05,
"loss": 0.6458,
"step": 1015
},
{
"epoch": 0.43524642628547044,
"grad_norm": 5.705277919769287,
"learning_rate": 1.9004424778761065e-05,
"loss": 0.664,
"step": 1020
},
{
"epoch": 0.4373799871986345,
"grad_norm": 5.0383453369140625,
"learning_rate": 1.8988621997471556e-05,
"loss": 0.639,
"step": 1025
},
{
"epoch": 0.4395135481117986,
"grad_norm": 5.572895526885986,
"learning_rate": 1.897281921618205e-05,
"loss": 0.7748,
"step": 1030
},
{
"epoch": 0.4416471090249627,
"grad_norm": 5.489956378936768,
"learning_rate": 1.8957016434892545e-05,
"loss": 0.666,
"step": 1035
},
{
"epoch": 0.44378066993812676,
"grad_norm": 5.743871212005615,
"learning_rate": 1.8941213653603036e-05,
"loss": 0.7466,
"step": 1040
},
{
"epoch": 0.4459142308512908,
"grad_norm": 5.379419803619385,
"learning_rate": 1.8925410872313527e-05,
"loss": 0.6738,
"step": 1045
},
{
"epoch": 0.44804779176445486,
"grad_norm": 4.491464614868164,
"learning_rate": 1.8909608091024022e-05,
"loss": 0.769,
"step": 1050
},
{
"epoch": 0.45018135267761894,
"grad_norm": 5.5537848472595215,
"learning_rate": 1.8893805309734513e-05,
"loss": 0.6594,
"step": 1055
},
{
"epoch": 0.452314913590783,
"grad_norm": 5.834591388702393,
"learning_rate": 1.8878002528445008e-05,
"loss": 0.7834,
"step": 1060
},
{
"epoch": 0.4544484745039471,
"grad_norm": 5.1776509284973145,
"learning_rate": 1.8862199747155502e-05,
"loss": 0.5834,
"step": 1065
},
{
"epoch": 0.4565820354171112,
"grad_norm": 5.313874244689941,
"learning_rate": 1.8846396965865993e-05,
"loss": 0.5964,
"step": 1070
},
{
"epoch": 0.45871559633027525,
"grad_norm": 5.8994574546813965,
"learning_rate": 1.8830594184576488e-05,
"loss": 0.673,
"step": 1075
},
{
"epoch": 0.4608491572434393,
"grad_norm": 5.38604736328125,
"learning_rate": 1.881479140328698e-05,
"loss": 0.6506,
"step": 1080
},
{
"epoch": 0.46298271815660336,
"grad_norm": 4.179570198059082,
"learning_rate": 1.8798988621997474e-05,
"loss": 0.6009,
"step": 1085
},
{
"epoch": 0.46511627906976744,
"grad_norm": 5.218341827392578,
"learning_rate": 1.8783185840707968e-05,
"loss": 0.628,
"step": 1090
},
{
"epoch": 0.4672498399829315,
"grad_norm": 5.472168922424316,
"learning_rate": 1.876738305941846e-05,
"loss": 0.7201,
"step": 1095
},
{
"epoch": 0.4693834008960956,
"grad_norm": 4.910761833190918,
"learning_rate": 1.875158027812895e-05,
"loss": 0.6409,
"step": 1100
},
{
"epoch": 0.47151696180925967,
"grad_norm": 5.117238998413086,
"learning_rate": 1.8735777496839445e-05,
"loss": 0.5792,
"step": 1105
},
{
"epoch": 0.47365052272242375,
"grad_norm": 5.522660732269287,
"learning_rate": 1.8719974715549936e-05,
"loss": 0.5869,
"step": 1110
},
{
"epoch": 0.4757840836355878,
"grad_norm": 4.868000507354736,
"learning_rate": 1.870417193426043e-05,
"loss": 0.6068,
"step": 1115
},
{
"epoch": 0.47791764454875185,
"grad_norm": 4.8891921043396,
"learning_rate": 1.8688369152970925e-05,
"loss": 0.6184,
"step": 1120
},
{
"epoch": 0.48005120546191593,
"grad_norm": 5.064128398895264,
"learning_rate": 1.8672566371681417e-05,
"loss": 0.62,
"step": 1125
},
{
"epoch": 0.48218476637508,
"grad_norm": 5.008705139160156,
"learning_rate": 1.865676359039191e-05,
"loss": 0.6352,
"step": 1130
},
{
"epoch": 0.4843183272882441,
"grad_norm": 4.730052471160889,
"learning_rate": 1.8640960809102406e-05,
"loss": 0.6614,
"step": 1135
},
{
"epoch": 0.48645188820140817,
"grad_norm": 4.680159091949463,
"learning_rate": 1.8625158027812897e-05,
"loss": 0.6086,
"step": 1140
},
{
"epoch": 0.48858544911457225,
"grad_norm": 5.151489734649658,
"learning_rate": 1.860935524652339e-05,
"loss": 0.6339,
"step": 1145
},
{
"epoch": 0.49071901002773627,
"grad_norm": 5.28818941116333,
"learning_rate": 1.8593552465233883e-05,
"loss": 0.6728,
"step": 1150
},
{
"epoch": 0.49285257094090035,
"grad_norm": 5.3361687660217285,
"learning_rate": 1.8577749683944374e-05,
"loss": 0.6508,
"step": 1155
},
{
"epoch": 0.4949861318540644,
"grad_norm": 4.8041090965271,
"learning_rate": 1.856194690265487e-05,
"loss": 0.6442,
"step": 1160
},
{
"epoch": 0.4971196927672285,
"grad_norm": 4.674312591552734,
"learning_rate": 1.8546144121365363e-05,
"loss": 0.7078,
"step": 1165
},
{
"epoch": 0.4992532536803926,
"grad_norm": 4.928088665008545,
"learning_rate": 1.8530341340075854e-05,
"loss": 0.7788,
"step": 1170
},
{
"epoch": 0.5013868145935566,
"grad_norm": 4.974058628082275,
"learning_rate": 1.851453855878635e-05,
"loss": 0.7097,
"step": 1175
},
{
"epoch": 0.5035203755067207,
"grad_norm": 4.790674209594727,
"learning_rate": 1.849873577749684e-05,
"loss": 0.5971,
"step": 1180
},
{
"epoch": 0.5056539364198848,
"grad_norm": 5.095616817474365,
"learning_rate": 1.8482932996207335e-05,
"loss": 0.6299,
"step": 1185
},
{
"epoch": 0.5077874973330488,
"grad_norm": 4.432722091674805,
"learning_rate": 1.846713021491783e-05,
"loss": 0.6348,
"step": 1190
},
{
"epoch": 0.5099210582462129,
"grad_norm": 4.803964614868164,
"learning_rate": 1.845132743362832e-05,
"loss": 0.6403,
"step": 1195
},
{
"epoch": 0.512054619159377,
"grad_norm": 5.166224956512451,
"learning_rate": 1.843552465233881e-05,
"loss": 0.655,
"step": 1200
},
{
"epoch": 0.5141881800725411,
"grad_norm": 5.379065990447998,
"learning_rate": 1.8419721871049306e-05,
"loss": 0.617,
"step": 1205
},
{
"epoch": 0.5163217409857052,
"grad_norm": 5.9183735847473145,
"learning_rate": 1.8403919089759797e-05,
"loss": 0.586,
"step": 1210
},
{
"epoch": 0.5184553018988692,
"grad_norm": 5.269925117492676,
"learning_rate": 1.8388116308470292e-05,
"loss": 0.6441,
"step": 1215
},
{
"epoch": 0.5205888628120333,
"grad_norm": 4.2534003257751465,
"learning_rate": 1.8372313527180786e-05,
"loss": 0.5532,
"step": 1220
},
{
"epoch": 0.5227224237251974,
"grad_norm": 4.715899467468262,
"learning_rate": 1.8356510745891278e-05,
"loss": 0.5795,
"step": 1225
},
{
"epoch": 0.5248559846383615,
"grad_norm": 5.345327377319336,
"learning_rate": 1.8340707964601772e-05,
"loss": 0.652,
"step": 1230
},
{
"epoch": 0.5269895455515256,
"grad_norm": 4.632909297943115,
"learning_rate": 1.8324905183312263e-05,
"loss": 0.6023,
"step": 1235
},
{
"epoch": 0.5291231064646895,
"grad_norm": 5.490262031555176,
"learning_rate": 1.8309102402022758e-05,
"loss": 0.6742,
"step": 1240
},
{
"epoch": 0.5312566673778536,
"grad_norm": 4.147167205810547,
"learning_rate": 1.8293299620733252e-05,
"loss": 0.5938,
"step": 1245
},
{
"epoch": 0.5333902282910177,
"grad_norm": 4.870655059814453,
"learning_rate": 1.8277496839443744e-05,
"loss": 0.5831,
"step": 1250
},
{
"epoch": 0.5355237892041818,
"grad_norm": 5.774129390716553,
"learning_rate": 1.8261694058154235e-05,
"loss": 0.6278,
"step": 1255
},
{
"epoch": 0.5376573501173458,
"grad_norm": 4.7453999519348145,
"learning_rate": 1.824589127686473e-05,
"loss": 0.5228,
"step": 1260
},
{
"epoch": 0.5397909110305099,
"grad_norm": 5.04367733001709,
"learning_rate": 1.823008849557522e-05,
"loss": 0.5514,
"step": 1265
},
{
"epoch": 0.541924471943674,
"grad_norm": 4.159360408782959,
"learning_rate": 1.8214285714285715e-05,
"loss": 0.6121,
"step": 1270
},
{
"epoch": 0.5440580328568381,
"grad_norm": 4.117012023925781,
"learning_rate": 1.819848293299621e-05,
"loss": 0.5538,
"step": 1275
},
{
"epoch": 0.5461915937700021,
"grad_norm": 4.434225082397461,
"learning_rate": 1.81826801517067e-05,
"loss": 0.5912,
"step": 1280
},
{
"epoch": 0.5483251546831662,
"grad_norm": 5.941255569458008,
"learning_rate": 1.8166877370417195e-05,
"loss": 0.5777,
"step": 1285
},
{
"epoch": 0.5504587155963303,
"grad_norm": 5.3827738761901855,
"learning_rate": 1.815107458912769e-05,
"loss": 0.5657,
"step": 1290
},
{
"epoch": 0.5525922765094944,
"grad_norm": 4.9497575759887695,
"learning_rate": 1.813527180783818e-05,
"loss": 0.6132,
"step": 1295
},
{
"epoch": 0.5547258374226585,
"grad_norm": 4.846462249755859,
"learning_rate": 1.8119469026548676e-05,
"loss": 0.5407,
"step": 1300
},
{
"epoch": 0.5568593983358225,
"grad_norm": 5.622977256774902,
"learning_rate": 1.8103666245259167e-05,
"loss": 0.5939,
"step": 1305
},
{
"epoch": 0.5589929592489865,
"grad_norm": 5.20183801651001,
"learning_rate": 1.8087863463969658e-05,
"loss": 0.6412,
"step": 1310
},
{
"epoch": 0.5611265201621506,
"grad_norm": 4.769690990447998,
"learning_rate": 1.8072060682680153e-05,
"loss": 0.5422,
"step": 1315
},
{
"epoch": 0.5632600810753147,
"grad_norm": 5.3674516677856445,
"learning_rate": 1.8056257901390647e-05,
"loss": 0.6404,
"step": 1320
},
{
"epoch": 0.5653936419884787,
"grad_norm": 4.712153911590576,
"learning_rate": 1.804045512010114e-05,
"loss": 0.6208,
"step": 1325
},
{
"epoch": 0.5675272029016428,
"grad_norm": 4.993585586547852,
"learning_rate": 1.8024652338811633e-05,
"loss": 0.5722,
"step": 1330
},
{
"epoch": 0.5696607638148069,
"grad_norm": 4.3871307373046875,
"learning_rate": 1.8008849557522124e-05,
"loss": 0.5092,
"step": 1335
},
{
"epoch": 0.571794324727971,
"grad_norm": 4.752344131469727,
"learning_rate": 1.799304677623262e-05,
"loss": 0.6061,
"step": 1340
},
{
"epoch": 0.5739278856411351,
"grad_norm": 5.579290390014648,
"learning_rate": 1.7977243994943113e-05,
"loss": 0.6441,
"step": 1345
},
{
"epoch": 0.5760614465542991,
"grad_norm": 4.870927810668945,
"learning_rate": 1.7961441213653604e-05,
"loss": 0.5957,
"step": 1350
},
{
"epoch": 0.5781950074674632,
"grad_norm": 5.871009349822998,
"learning_rate": 1.79456384323641e-05,
"loss": 0.6249,
"step": 1355
},
{
"epoch": 0.5803285683806273,
"grad_norm": 5.544130325317383,
"learning_rate": 1.792983565107459e-05,
"loss": 0.6002,
"step": 1360
},
{
"epoch": 0.5824621292937914,
"grad_norm": 4.897180080413818,
"learning_rate": 1.791403286978508e-05,
"loss": 0.6096,
"step": 1365
},
{
"epoch": 0.5845956902069555,
"grad_norm": 4.563004016876221,
"learning_rate": 1.7898230088495576e-05,
"loss": 0.5466,
"step": 1370
},
{
"epoch": 0.5867292511201195,
"grad_norm": 4.7146430015563965,
"learning_rate": 1.788242730720607e-05,
"loss": 0.5664,
"step": 1375
},
{
"epoch": 0.5888628120332835,
"grad_norm": 5.06727933883667,
"learning_rate": 1.7866624525916562e-05,
"loss": 0.6218,
"step": 1380
},
{
"epoch": 0.5909963729464476,
"grad_norm": 4.825130462646484,
"learning_rate": 1.7850821744627056e-05,
"loss": 0.513,
"step": 1385
},
{
"epoch": 0.5931299338596117,
"grad_norm": 5.076493740081787,
"learning_rate": 1.783501896333755e-05,
"loss": 0.6036,
"step": 1390
},
{
"epoch": 0.5952634947727757,
"grad_norm": 4.400293350219727,
"learning_rate": 1.7819216182048042e-05,
"loss": 0.5427,
"step": 1395
},
{
"epoch": 0.5973970556859398,
"grad_norm": 4.34608268737793,
"learning_rate": 1.7803413400758537e-05,
"loss": 0.4911,
"step": 1400
},
{
"epoch": 0.5995306165991039,
"grad_norm": 4.652610778808594,
"learning_rate": 1.7787610619469028e-05,
"loss": 0.5789,
"step": 1405
},
{
"epoch": 0.601664177512268,
"grad_norm": 5.2942914962768555,
"learning_rate": 1.777180783817952e-05,
"loss": 0.5746,
"step": 1410
},
{
"epoch": 0.603797738425432,
"grad_norm": 4.799631118774414,
"learning_rate": 1.7756005056890014e-05,
"loss": 0.6073,
"step": 1415
},
{
"epoch": 0.6059312993385961,
"grad_norm": 4.464809417724609,
"learning_rate": 1.7740202275600508e-05,
"loss": 0.5064,
"step": 1420
},
{
"epoch": 0.6080648602517602,
"grad_norm": 5.692066192626953,
"learning_rate": 1.7724399494311e-05,
"loss": 0.5846,
"step": 1425
},
{
"epoch": 0.6101984211649243,
"grad_norm": 4.2043890953063965,
"learning_rate": 1.7708596713021494e-05,
"loss": 0.5368,
"step": 1430
},
{
"epoch": 0.6123319820780884,
"grad_norm": 5.393744468688965,
"learning_rate": 1.7692793931731985e-05,
"loss": 0.5735,
"step": 1435
},
{
"epoch": 0.6144655429912524,
"grad_norm": 4.345822334289551,
"learning_rate": 1.767699115044248e-05,
"loss": 0.5119,
"step": 1440
},
{
"epoch": 0.6165991039044165,
"grad_norm": 4.4069647789001465,
"learning_rate": 1.7661188369152974e-05,
"loss": 0.5429,
"step": 1445
},
{
"epoch": 0.6187326648175805,
"grad_norm": 4.706110000610352,
"learning_rate": 1.7645385587863465e-05,
"loss": 0.6141,
"step": 1450
},
{
"epoch": 0.6208662257307446,
"grad_norm": 4.513293266296387,
"learning_rate": 1.762958280657396e-05,
"loss": 0.5224,
"step": 1455
},
{
"epoch": 0.6229997866439086,
"grad_norm": 4.868618488311768,
"learning_rate": 1.761378002528445e-05,
"loss": 0.5656,
"step": 1460
},
{
"epoch": 0.6251333475570727,
"grad_norm": 4.385486602783203,
"learning_rate": 1.7597977243994942e-05,
"loss": 0.567,
"step": 1465
},
{
"epoch": 0.6272669084702368,
"grad_norm": 5.354750633239746,
"learning_rate": 1.7582174462705437e-05,
"loss": 0.5517,
"step": 1470
},
{
"epoch": 0.6294004693834009,
"grad_norm": 5.496529579162598,
"learning_rate": 1.756637168141593e-05,
"loss": 0.6311,
"step": 1475
},
{
"epoch": 0.631534030296565,
"grad_norm": 5.110471725463867,
"learning_rate": 1.7550568900126423e-05,
"loss": 0.5862,
"step": 1480
},
{
"epoch": 0.633667591209729,
"grad_norm": 4.875749111175537,
"learning_rate": 1.7534766118836917e-05,
"loss": 0.5699,
"step": 1485
},
{
"epoch": 0.6358011521228931,
"grad_norm": 4.413814544677734,
"learning_rate": 1.751896333754741e-05,
"loss": 0.574,
"step": 1490
},
{
"epoch": 0.6379347130360572,
"grad_norm": 4.2545857429504395,
"learning_rate": 1.7503160556257903e-05,
"loss": 0.5611,
"step": 1495
},
{
"epoch": 0.6400682739492213,
"grad_norm": 5.520027160644531,
"learning_rate": 1.7487357774968398e-05,
"loss": 0.4966,
"step": 1500
},
{
"epoch": 0.6422018348623854,
"grad_norm": 5.082278728485107,
"learning_rate": 1.747155499367889e-05,
"loss": 0.5305,
"step": 1505
},
{
"epoch": 0.6443353957755494,
"grad_norm": 4.388136863708496,
"learning_rate": 1.7455752212389383e-05,
"loss": 0.5228,
"step": 1510
},
{
"epoch": 0.6464689566887135,
"grad_norm": 5.492458343505859,
"learning_rate": 1.7439949431099874e-05,
"loss": 0.5816,
"step": 1515
},
{
"epoch": 0.6486025176018775,
"grad_norm": 4.697801113128662,
"learning_rate": 1.7424146649810366e-05,
"loss": 0.5538,
"step": 1520
},
{
"epoch": 0.6507360785150416,
"grad_norm": 4.820540904998779,
"learning_rate": 1.740834386852086e-05,
"loss": 0.4972,
"step": 1525
},
{
"epoch": 0.6528696394282056,
"grad_norm": 4.574212551116943,
"learning_rate": 1.7392541087231355e-05,
"loss": 0.5677,
"step": 1530
},
{
"epoch": 0.6550032003413697,
"grad_norm": 5.26272439956665,
"learning_rate": 1.7376738305941846e-05,
"loss": 0.5891,
"step": 1535
},
{
"epoch": 0.6571367612545338,
"grad_norm": 4.426779747009277,
"learning_rate": 1.736093552465234e-05,
"loss": 0.5463,
"step": 1540
},
{
"epoch": 0.6592703221676979,
"grad_norm": 4.585545063018799,
"learning_rate": 1.7345132743362835e-05,
"loss": 0.6041,
"step": 1545
},
{
"epoch": 0.661403883080862,
"grad_norm": 5.416412353515625,
"learning_rate": 1.7329329962073326e-05,
"loss": 0.7248,
"step": 1550
},
{
"epoch": 0.663537443994026,
"grad_norm": 4.863205909729004,
"learning_rate": 1.731352718078382e-05,
"loss": 0.6366,
"step": 1555
},
{
"epoch": 0.6656710049071901,
"grad_norm": 4.924341201782227,
"learning_rate": 1.7297724399494312e-05,
"loss": 0.6056,
"step": 1560
},
{
"epoch": 0.6678045658203542,
"grad_norm": 4.459977149963379,
"learning_rate": 1.7281921618204803e-05,
"loss": 0.5468,
"step": 1565
},
{
"epoch": 0.6699381267335183,
"grad_norm": 4.483630657196045,
"learning_rate": 1.7266118836915298e-05,
"loss": 0.5638,
"step": 1570
},
{
"epoch": 0.6720716876466823,
"grad_norm": 5.0850605964660645,
"learning_rate": 1.7250316055625792e-05,
"loss": 0.5833,
"step": 1575
},
{
"epoch": 0.6742052485598464,
"grad_norm": 5.052149295806885,
"learning_rate": 1.7234513274336284e-05,
"loss": 0.6128,
"step": 1580
},
{
"epoch": 0.6763388094730105,
"grad_norm": 4.196040153503418,
"learning_rate": 1.7218710493046778e-05,
"loss": 0.5525,
"step": 1585
},
{
"epoch": 0.6784723703861745,
"grad_norm": 4.805011749267578,
"learning_rate": 1.720290771175727e-05,
"loss": 0.5735,
"step": 1590
},
{
"epoch": 0.6806059312993386,
"grad_norm": 4.9073405265808105,
"learning_rate": 1.7187104930467764e-05,
"loss": 0.5409,
"step": 1595
},
{
"epoch": 0.6827394922125026,
"grad_norm": 5.15145206451416,
"learning_rate": 1.717130214917826e-05,
"loss": 0.545,
"step": 1600
},
{
"epoch": 0.6848730531256667,
"grad_norm": 4.6511945724487305,
"learning_rate": 1.715549936788875e-05,
"loss": 0.4922,
"step": 1605
},
{
"epoch": 0.6870066140388308,
"grad_norm": 4.7381591796875,
"learning_rate": 1.7139696586599244e-05,
"loss": 0.6188,
"step": 1610
},
{
"epoch": 0.6891401749519949,
"grad_norm": 5.406619071960449,
"learning_rate": 1.7123893805309735e-05,
"loss": 0.5705,
"step": 1615
},
{
"epoch": 0.691273735865159,
"grad_norm": 5.582167625427246,
"learning_rate": 1.7108091024020227e-05,
"loss": 0.5205,
"step": 1620
},
{
"epoch": 0.693407296778323,
"grad_norm": 4.347970962524414,
"learning_rate": 1.709228824273072e-05,
"loss": 0.5568,
"step": 1625
},
{
"epoch": 0.6955408576914871,
"grad_norm": 4.038663387298584,
"learning_rate": 1.7076485461441216e-05,
"loss": 0.5565,
"step": 1630
},
{
"epoch": 0.6976744186046512,
"grad_norm": 4.6226911544799805,
"learning_rate": 1.7060682680151707e-05,
"loss": 0.5122,
"step": 1635
},
{
"epoch": 0.6998079795178153,
"grad_norm": 5.612771034240723,
"learning_rate": 1.70448798988622e-05,
"loss": 0.5414,
"step": 1640
},
{
"epoch": 0.7019415404309793,
"grad_norm": 4.659555912017822,
"learning_rate": 1.7029077117572696e-05,
"loss": 0.5126,
"step": 1645
},
{
"epoch": 0.7040751013441434,
"grad_norm": 4.515445709228516,
"learning_rate": 1.7013274336283187e-05,
"loss": 0.5426,
"step": 1650
},
{
"epoch": 0.7062086622573075,
"grad_norm": 5.030318260192871,
"learning_rate": 1.6997471554993682e-05,
"loss": 0.5764,
"step": 1655
},
{
"epoch": 0.7083422231704715,
"grad_norm": 4.8684587478637695,
"learning_rate": 1.6981668773704173e-05,
"loss": 0.5159,
"step": 1660
},
{
"epoch": 0.7104757840836355,
"grad_norm": 5.1901092529296875,
"learning_rate": 1.6965865992414667e-05,
"loss": 0.576,
"step": 1665
},
{
"epoch": 0.7126093449967996,
"grad_norm": 4.969446659088135,
"learning_rate": 1.695006321112516e-05,
"loss": 0.5781,
"step": 1670
},
{
"epoch": 0.7147429059099637,
"grad_norm": 4.46921443939209,
"learning_rate": 1.6934260429835653e-05,
"loss": 0.5164,
"step": 1675
},
{
"epoch": 0.7168764668231278,
"grad_norm": 5.267427444458008,
"learning_rate": 1.6918457648546144e-05,
"loss": 0.5278,
"step": 1680
},
{
"epoch": 0.7190100277362919,
"grad_norm": 4.0481390953063965,
"learning_rate": 1.690265486725664e-05,
"loss": 0.5667,
"step": 1685
},
{
"epoch": 0.7211435886494559,
"grad_norm": 4.155159950256348,
"learning_rate": 1.688685208596713e-05,
"loss": 0.4889,
"step": 1690
},
{
"epoch": 0.72327714956262,
"grad_norm": 4.967867851257324,
"learning_rate": 1.6871049304677625e-05,
"loss": 0.5163,
"step": 1695
},
{
"epoch": 0.7254107104757841,
"grad_norm": 4.598382472991943,
"learning_rate": 1.685524652338812e-05,
"loss": 0.523,
"step": 1700
},
{
"epoch": 0.7275442713889482,
"grad_norm": 4.9795756340026855,
"learning_rate": 1.683944374209861e-05,
"loss": 0.5503,
"step": 1705
},
{
"epoch": 0.7296778323021123,
"grad_norm": 4.119642734527588,
"learning_rate": 1.6823640960809105e-05,
"loss": 0.4932,
"step": 1710
},
{
"epoch": 0.7318113932152763,
"grad_norm": 3.7534000873565674,
"learning_rate": 1.6807838179519596e-05,
"loss": 0.4973,
"step": 1715
},
{
"epoch": 0.7339449541284404,
"grad_norm": 4.928293704986572,
"learning_rate": 1.6792035398230087e-05,
"loss": 0.5625,
"step": 1720
},
{
"epoch": 0.7360785150416045,
"grad_norm": 5.265719890594482,
"learning_rate": 1.6776232616940582e-05,
"loss": 0.5335,
"step": 1725
},
{
"epoch": 0.7382120759547685,
"grad_norm": 4.559508323669434,
"learning_rate": 1.6760429835651077e-05,
"loss": 0.4446,
"step": 1730
},
{
"epoch": 0.7403456368679325,
"grad_norm": 5.78938627243042,
"learning_rate": 1.6744627054361568e-05,
"loss": 0.5377,
"step": 1735
},
{
"epoch": 0.7424791977810966,
"grad_norm": 4.455636501312256,
"learning_rate": 1.6728824273072062e-05,
"loss": 0.5036,
"step": 1740
},
{
"epoch": 0.7446127586942607,
"grad_norm": 5.289336204528809,
"learning_rate": 1.6713021491782553e-05,
"loss": 0.6074,
"step": 1745
},
{
"epoch": 0.7467463196074248,
"grad_norm": 5.169851303100586,
"learning_rate": 1.6697218710493048e-05,
"loss": 0.5982,
"step": 1750
},
{
"epoch": 0.7488798805205888,
"grad_norm": 5.2046217918396,
"learning_rate": 1.6681415929203543e-05,
"loss": 0.5762,
"step": 1755
},
{
"epoch": 0.7510134414337529,
"grad_norm": 4.986084461212158,
"learning_rate": 1.6665613147914034e-05,
"loss": 0.5379,
"step": 1760
},
{
"epoch": 0.753147002346917,
"grad_norm": 5.0369791984558105,
"learning_rate": 1.664981036662453e-05,
"loss": 0.5702,
"step": 1765
},
{
"epoch": 0.7552805632600811,
"grad_norm": 4.595948696136475,
"learning_rate": 1.6634007585335023e-05,
"loss": 0.4272,
"step": 1770
},
{
"epoch": 0.7574141241732452,
"grad_norm": 4.182741641998291,
"learning_rate": 1.6618204804045514e-05,
"loss": 0.4805,
"step": 1775
},
{
"epoch": 0.7595476850864092,
"grad_norm": 4.699618816375732,
"learning_rate": 1.6602402022756005e-05,
"loss": 0.4963,
"step": 1780
},
{
"epoch": 0.7616812459995733,
"grad_norm": 5.032837867736816,
"learning_rate": 1.65865992414665e-05,
"loss": 0.4846,
"step": 1785
},
{
"epoch": 0.7638148069127374,
"grad_norm": 5.413885116577148,
"learning_rate": 1.657079646017699e-05,
"loss": 0.584,
"step": 1790
},
{
"epoch": 0.7659483678259015,
"grad_norm": 5.019171237945557,
"learning_rate": 1.6554993678887486e-05,
"loss": 0.5158,
"step": 1795
},
{
"epoch": 0.7680819287390654,
"grad_norm": 5.328768730163574,
"learning_rate": 1.653919089759798e-05,
"loss": 0.5821,
"step": 1800
},
{
"epoch": 0.7702154896522295,
"grad_norm": 5.146249771118164,
"learning_rate": 1.652338811630847e-05,
"loss": 0.4609,
"step": 1805
},
{
"epoch": 0.7723490505653936,
"grad_norm": 4.079854488372803,
"learning_rate": 1.6507585335018966e-05,
"loss": 0.4335,
"step": 1810
},
{
"epoch": 0.7744826114785577,
"grad_norm": 5.487088203430176,
"learning_rate": 1.6491782553729457e-05,
"loss": 0.5672,
"step": 1815
},
{
"epoch": 0.7766161723917218,
"grad_norm": 4.999423980712891,
"learning_rate": 1.647597977243995e-05,
"loss": 0.467,
"step": 1820
},
{
"epoch": 0.7787497333048858,
"grad_norm": 4.289427280426025,
"learning_rate": 1.6460176991150443e-05,
"loss": 0.4924,
"step": 1825
},
{
"epoch": 0.7808832942180499,
"grad_norm": 4.848026752471924,
"learning_rate": 1.6444374209860937e-05,
"loss": 0.4786,
"step": 1830
},
{
"epoch": 0.783016855131214,
"grad_norm": 4.637599945068359,
"learning_rate": 1.642857142857143e-05,
"loss": 0.5365,
"step": 1835
},
{
"epoch": 0.7851504160443781,
"grad_norm": 4.305636405944824,
"learning_rate": 1.6412768647281923e-05,
"loss": 0.5199,
"step": 1840
},
{
"epoch": 0.7872839769575422,
"grad_norm": 4.758025169372559,
"learning_rate": 1.6396965865992414e-05,
"loss": 0.5243,
"step": 1845
},
{
"epoch": 0.7894175378707062,
"grad_norm": 5.047544479370117,
"learning_rate": 1.638116308470291e-05,
"loss": 0.4685,
"step": 1850
},
{
"epoch": 0.7915510987838703,
"grad_norm": 4.558396816253662,
"learning_rate": 1.6365360303413403e-05,
"loss": 0.6146,
"step": 1855
},
{
"epoch": 0.7936846596970344,
"grad_norm": 5.177937984466553,
"learning_rate": 1.6349557522123895e-05,
"loss": 0.5446,
"step": 1860
},
{
"epoch": 0.7958182206101985,
"grad_norm": 5.484969139099121,
"learning_rate": 1.633375474083439e-05,
"loss": 0.5278,
"step": 1865
},
{
"epoch": 0.7979517815233625,
"grad_norm": 5.092281341552734,
"learning_rate": 1.631795195954488e-05,
"loss": 0.5452,
"step": 1870
},
{
"epoch": 0.8000853424365265,
"grad_norm": 4.620217323303223,
"learning_rate": 1.6302149178255375e-05,
"loss": 0.5923,
"step": 1875
},
{
"epoch": 0.8022189033496906,
"grad_norm": 4.69462776184082,
"learning_rate": 1.6286346396965866e-05,
"loss": 0.4944,
"step": 1880
},
{
"epoch": 0.8043524642628547,
"grad_norm": 4.684912204742432,
"learning_rate": 1.627054361567636e-05,
"loss": 0.5447,
"step": 1885
},
{
"epoch": 0.8064860251760188,
"grad_norm": 5.183788299560547,
"learning_rate": 1.6254740834386852e-05,
"loss": 0.5575,
"step": 1890
},
{
"epoch": 0.8086195860891828,
"grad_norm": 4.963923931121826,
"learning_rate": 1.6238938053097346e-05,
"loss": 0.5922,
"step": 1895
},
{
"epoch": 0.8107531470023469,
"grad_norm": 5.103732585906982,
"learning_rate": 1.622313527180784e-05,
"loss": 0.498,
"step": 1900
},
{
"epoch": 0.812886707915511,
"grad_norm": 5.205685615539551,
"learning_rate": 1.6207332490518332e-05,
"loss": 0.4965,
"step": 1905
},
{
"epoch": 0.8150202688286751,
"grad_norm": 5.297272682189941,
"learning_rate": 1.6191529709228827e-05,
"loss": 0.5762,
"step": 1910
},
{
"epoch": 0.8171538297418391,
"grad_norm": 3.759392023086548,
"learning_rate": 1.6175726927939318e-05,
"loss": 0.5036,
"step": 1915
},
{
"epoch": 0.8192873906550032,
"grad_norm": 4.451435089111328,
"learning_rate": 1.6159924146649813e-05,
"loss": 0.5089,
"step": 1920
},
{
"epoch": 0.8214209515681673,
"grad_norm": 5.265392780303955,
"learning_rate": 1.6144121365360307e-05,
"loss": 0.5219,
"step": 1925
},
{
"epoch": 0.8235545124813314,
"grad_norm": 5.984591960906982,
"learning_rate": 1.6128318584070798e-05,
"loss": 0.5821,
"step": 1930
},
{
"epoch": 0.8256880733944955,
"grad_norm": 4.420891761779785,
"learning_rate": 1.611251580278129e-05,
"loss": 0.4672,
"step": 1935
},
{
"epoch": 0.8278216343076595,
"grad_norm": 3.7576348781585693,
"learning_rate": 1.6096713021491784e-05,
"loss": 0.4639,
"step": 1940
},
{
"epoch": 0.8299551952208235,
"grad_norm": 5.487934112548828,
"learning_rate": 1.6080910240202275e-05,
"loss": 0.518,
"step": 1945
},
{
"epoch": 0.8320887561339876,
"grad_norm": 4.478968620300293,
"learning_rate": 1.606510745891277e-05,
"loss": 0.4295,
"step": 1950
},
{
"epoch": 0.8342223170471517,
"grad_norm": 5.025543212890625,
"learning_rate": 1.6049304677623264e-05,
"loss": 0.4736,
"step": 1955
},
{
"epoch": 0.8363558779603157,
"grad_norm": 4.544827461242676,
"learning_rate": 1.6033501896333756e-05,
"loss": 0.5154,
"step": 1960
},
{
"epoch": 0.8384894388734798,
"grad_norm": 5.281660079956055,
"learning_rate": 1.601769911504425e-05,
"loss": 0.598,
"step": 1965
},
{
"epoch": 0.8406229997866439,
"grad_norm": 4.328815460205078,
"learning_rate": 1.600189633375474e-05,
"loss": 0.52,
"step": 1970
},
{
"epoch": 0.842756560699808,
"grad_norm": 4.648770332336426,
"learning_rate": 1.5986093552465236e-05,
"loss": 0.438,
"step": 1975
},
{
"epoch": 0.8448901216129721,
"grad_norm": 4.124217987060547,
"learning_rate": 1.5970290771175727e-05,
"loss": 0.4639,
"step": 1980
},
{
"epoch": 0.8470236825261361,
"grad_norm": 4.548424243927002,
"learning_rate": 1.595448798988622e-05,
"loss": 0.5121,
"step": 1985
},
{
"epoch": 0.8491572434393002,
"grad_norm": 4.506327152252197,
"learning_rate": 1.5938685208596713e-05,
"loss": 0.4759,
"step": 1990
},
{
"epoch": 0.8512908043524643,
"grad_norm": 4.667163848876953,
"learning_rate": 1.5922882427307207e-05,
"loss": 0.5611,
"step": 1995
},
{
"epoch": 0.8534243652656284,
"grad_norm": 4.311827182769775,
"learning_rate": 1.59070796460177e-05,
"loss": 0.5262,
"step": 2000
},
{
"epoch": 0.8555579261787924,
"grad_norm": 4.407560348510742,
"learning_rate": 1.5891276864728193e-05,
"loss": 0.5217,
"step": 2005
},
{
"epoch": 0.8576914870919565,
"grad_norm": 5.029135704040527,
"learning_rate": 1.5875474083438688e-05,
"loss": 0.4964,
"step": 2010
},
{
"epoch": 0.8598250480051205,
"grad_norm": 4.426456451416016,
"learning_rate": 1.585967130214918e-05,
"loss": 0.4598,
"step": 2015
},
{
"epoch": 0.8619586089182846,
"grad_norm": 5.080218315124512,
"learning_rate": 1.5843868520859673e-05,
"loss": 0.525,
"step": 2020
},
{
"epoch": 0.8640921698314487,
"grad_norm": 4.200524806976318,
"learning_rate": 1.5828065739570168e-05,
"loss": 0.4956,
"step": 2025
},
{
"epoch": 0.8662257307446127,
"grad_norm": 5.445434093475342,
"learning_rate": 1.581226295828066e-05,
"loss": 0.5389,
"step": 2030
},
{
"epoch": 0.8683592916577768,
"grad_norm": 4.662383556365967,
"learning_rate": 1.579646017699115e-05,
"loss": 0.4701,
"step": 2035
},
{
"epoch": 0.8704928525709409,
"grad_norm": 4.182600498199463,
"learning_rate": 1.5780657395701645e-05,
"loss": 0.5139,
"step": 2040
},
{
"epoch": 0.872626413484105,
"grad_norm": 4.490233421325684,
"learning_rate": 1.5764854614412136e-05,
"loss": 0.4604,
"step": 2045
},
{
"epoch": 0.874759974397269,
"grad_norm": 5.007779598236084,
"learning_rate": 1.574905183312263e-05,
"loss": 0.5102,
"step": 2050
},
{
"epoch": 0.8768935353104331,
"grad_norm": 5.043194770812988,
"learning_rate": 1.5733249051833125e-05,
"loss": 0.4905,
"step": 2055
},
{
"epoch": 0.8790270962235972,
"grad_norm": 4.701770782470703,
"learning_rate": 1.5717446270543616e-05,
"loss": 0.4595,
"step": 2060
},
{
"epoch": 0.8811606571367613,
"grad_norm": 5.017876148223877,
"learning_rate": 1.570164348925411e-05,
"loss": 0.5007,
"step": 2065
},
{
"epoch": 0.8832942180499254,
"grad_norm": 3.628516435623169,
"learning_rate": 1.5685840707964602e-05,
"loss": 0.438,
"step": 2070
},
{
"epoch": 0.8854277789630894,
"grad_norm": 4.184997081756592,
"learning_rate": 1.5670037926675097e-05,
"loss": 0.4689,
"step": 2075
},
{
"epoch": 0.8875613398762535,
"grad_norm": 4.757014751434326,
"learning_rate": 1.565423514538559e-05,
"loss": 0.4569,
"step": 2080
},
{
"epoch": 0.8896949007894175,
"grad_norm": 3.718052625656128,
"learning_rate": 1.5638432364096082e-05,
"loss": 0.5259,
"step": 2085
},
{
"epoch": 0.8918284617025816,
"grad_norm": 3.926389217376709,
"learning_rate": 1.5622629582806574e-05,
"loss": 0.4039,
"step": 2090
},
{
"epoch": 0.8939620226157456,
"grad_norm": 5.952889919281006,
"learning_rate": 1.5606826801517068e-05,
"loss": 0.4846,
"step": 2095
},
{
"epoch": 0.8960955835289097,
"grad_norm": 4.66580057144165,
"learning_rate": 1.559102402022756e-05,
"loss": 0.4887,
"step": 2100
},
{
"epoch": 0.8982291444420738,
"grad_norm": 4.7099103927612305,
"learning_rate": 1.5575221238938054e-05,
"loss": 0.5263,
"step": 2105
},
{
"epoch": 0.9003627053552379,
"grad_norm": 4.66274881362915,
"learning_rate": 1.555941845764855e-05,
"loss": 0.4924,
"step": 2110
},
{
"epoch": 0.902496266268402,
"grad_norm": 4.509885311126709,
"learning_rate": 1.554361567635904e-05,
"loss": 0.4973,
"step": 2115
},
{
"epoch": 0.904629827181566,
"grad_norm": 4.4420647621154785,
"learning_rate": 1.5527812895069534e-05,
"loss": 0.4772,
"step": 2120
},
{
"epoch": 0.9067633880947301,
"grad_norm": 4.7722554206848145,
"learning_rate": 1.551201011378003e-05,
"loss": 0.5006,
"step": 2125
},
{
"epoch": 0.9088969490078942,
"grad_norm": 3.9744837284088135,
"learning_rate": 1.549620733249052e-05,
"loss": 0.4475,
"step": 2130
},
{
"epoch": 0.9110305099210583,
"grad_norm": 4.898085594177246,
"learning_rate": 1.5480404551201015e-05,
"loss": 0.497,
"step": 2135
},
{
"epoch": 0.9131640708342224,
"grad_norm": 4.290428161621094,
"learning_rate": 1.5464601769911506e-05,
"loss": 0.4469,
"step": 2140
},
{
"epoch": 0.9152976317473864,
"grad_norm": 4.357161998748779,
"learning_rate": 1.5448798988621997e-05,
"loss": 0.4795,
"step": 2145
},
{
"epoch": 0.9174311926605505,
"grad_norm": 4.403848171234131,
"learning_rate": 1.543299620733249e-05,
"loss": 0.5059,
"step": 2150
},
{
"epoch": 0.9195647535737145,
"grad_norm": 4.178798675537109,
"learning_rate": 1.5417193426042986e-05,
"loss": 0.4438,
"step": 2155
},
{
"epoch": 0.9216983144868786,
"grad_norm": 5.097316265106201,
"learning_rate": 1.5401390644753477e-05,
"loss": 0.4385,
"step": 2160
},
{
"epoch": 0.9238318754000426,
"grad_norm": 4.63726282119751,
"learning_rate": 1.5385587863463972e-05,
"loss": 0.5495,
"step": 2165
},
{
"epoch": 0.9259654363132067,
"grad_norm": 3.993021011352539,
"learning_rate": 1.5369785082174463e-05,
"loss": 0.45,
"step": 2170
},
{
"epoch": 0.9280989972263708,
"grad_norm": 4.778984069824219,
"learning_rate": 1.5353982300884958e-05,
"loss": 0.5123,
"step": 2175
},
{
"epoch": 0.9302325581395349,
"grad_norm": 4.495462894439697,
"learning_rate": 1.5338179519595452e-05,
"loss": 0.4433,
"step": 2180
},
{
"epoch": 0.932366119052699,
"grad_norm": 4.673117160797119,
"learning_rate": 1.5322376738305943e-05,
"loss": 0.4567,
"step": 2185
},
{
"epoch": 0.934499679965863,
"grad_norm": 4.821684837341309,
"learning_rate": 1.5306573957016435e-05,
"loss": 0.4398,
"step": 2190
},
{
"epoch": 0.9366332408790271,
"grad_norm": 4.811855316162109,
"learning_rate": 1.529077117572693e-05,
"loss": 0.4711,
"step": 2195
},
{
"epoch": 0.9387668017921912,
"grad_norm": 4.224590301513672,
"learning_rate": 1.527496839443742e-05,
"loss": 0.4646,
"step": 2200
},
{
"epoch": 0.9409003627053553,
"grad_norm": 5.050698280334473,
"learning_rate": 1.5259165613147915e-05,
"loss": 0.4591,
"step": 2205
},
{
"epoch": 0.9430339236185193,
"grad_norm": 4.678995609283447,
"learning_rate": 1.5243362831858408e-05,
"loss": 0.4937,
"step": 2210
},
{
"epoch": 0.9451674845316834,
"grad_norm": 3.495490789413452,
"learning_rate": 1.52275600505689e-05,
"loss": 0.4774,
"step": 2215
},
{
"epoch": 0.9473010454448475,
"grad_norm": 4.670935153961182,
"learning_rate": 1.5211757269279395e-05,
"loss": 0.4905,
"step": 2220
},
{
"epoch": 0.9494346063580115,
"grad_norm": 4.865047454833984,
"learning_rate": 1.5195954487989888e-05,
"loss": 0.4965,
"step": 2225
},
{
"epoch": 0.9515681672711755,
"grad_norm": 3.9217941761016846,
"learning_rate": 1.5180151706700381e-05,
"loss": 0.4581,
"step": 2230
},
{
"epoch": 0.9537017281843396,
"grad_norm": 4.600928783416748,
"learning_rate": 1.5164348925410874e-05,
"loss": 0.4821,
"step": 2235
},
{
"epoch": 0.9558352890975037,
"grad_norm": 4.538738250732422,
"learning_rate": 1.5148546144121368e-05,
"loss": 0.4719,
"step": 2240
},
{
"epoch": 0.9579688500106678,
"grad_norm": 4.893208026885986,
"learning_rate": 1.513274336283186e-05,
"loss": 0.522,
"step": 2245
},
{
"epoch": 0.9601024109238319,
"grad_norm": 4.285687446594238,
"learning_rate": 1.5116940581542352e-05,
"loss": 0.5162,
"step": 2250
},
{
"epoch": 0.9622359718369959,
"grad_norm": 4.50706148147583,
"learning_rate": 1.5101137800252845e-05,
"loss": 0.5248,
"step": 2255
},
{
"epoch": 0.96436953275016,
"grad_norm": 4.416393756866455,
"learning_rate": 1.5085335018963338e-05,
"loss": 0.5118,
"step": 2260
},
{
"epoch": 0.9665030936633241,
"grad_norm": 3.7533748149871826,
"learning_rate": 1.5069532237673831e-05,
"loss": 0.4359,
"step": 2265
},
{
"epoch": 0.9686366545764882,
"grad_norm": 4.33950662612915,
"learning_rate": 1.5053729456384326e-05,
"loss": 0.5012,
"step": 2270
},
{
"epoch": 0.9707702154896523,
"grad_norm": 4.270744800567627,
"learning_rate": 1.5037926675094818e-05,
"loss": 0.4344,
"step": 2275
},
{
"epoch": 0.9729037764028163,
"grad_norm": 4.739436626434326,
"learning_rate": 1.5022123893805311e-05,
"loss": 0.5507,
"step": 2280
},
{
"epoch": 0.9750373373159804,
"grad_norm": 4.521956920623779,
"learning_rate": 1.5006321112515804e-05,
"loss": 0.4657,
"step": 2285
},
{
"epoch": 0.9771708982291445,
"grad_norm": 4.782139778137207,
"learning_rate": 1.4990518331226299e-05,
"loss": 0.471,
"step": 2290
},
{
"epoch": 0.9793044591423085,
"grad_norm": 3.9926064014434814,
"learning_rate": 1.4974715549936788e-05,
"loss": 0.4363,
"step": 2295
},
{
"epoch": 0.9814380200554725,
"grad_norm": 4.13399076461792,
"learning_rate": 1.4958912768647283e-05,
"loss": 0.4178,
"step": 2300
},
{
"epoch": 0.9835715809686366,
"grad_norm": 4.709731101989746,
"learning_rate": 1.4943109987357776e-05,
"loss": 0.4703,
"step": 2305
},
{
"epoch": 0.9857051418818007,
"grad_norm": 4.387861251831055,
"learning_rate": 1.4927307206068269e-05,
"loss": 0.4736,
"step": 2310
},
{
"epoch": 0.9878387027949648,
"grad_norm": 4.607449531555176,
"learning_rate": 1.4911504424778761e-05,
"loss": 0.4432,
"step": 2315
},
{
"epoch": 0.9899722637081289,
"grad_norm": 3.77618145942688,
"learning_rate": 1.4895701643489256e-05,
"loss": 0.4397,
"step": 2320
},
{
"epoch": 0.9921058246212929,
"grad_norm": 4.32602071762085,
"learning_rate": 1.4879898862199749e-05,
"loss": 0.4539,
"step": 2325
},
{
"epoch": 0.994239385534457,
"grad_norm": 4.454192161560059,
"learning_rate": 1.4864096080910242e-05,
"loss": 0.4944,
"step": 2330
},
{
"epoch": 0.9963729464476211,
"grad_norm": 4.736626625061035,
"learning_rate": 1.4848293299620735e-05,
"loss": 0.553,
"step": 2335
},
{
"epoch": 0.9985065073607852,
"grad_norm": 4.773657321929932,
"learning_rate": 1.483249051833123e-05,
"loss": 0.5004,
"step": 2340
},
{
"epoch": 1.0,
"eval_evaluator": 0.9877204489141523,
"eval_loss": 0.20730140805244446,
"eval_runtime": 125.912,
"eval_samples_per_second": 18.163,
"eval_steps_per_second": 2.271,
"step": 2344
},
{
"epoch": 1.000426712182633,
"grad_norm": 4.5521745681762695,
"learning_rate": 1.4816687737041719e-05,
"loss": 0.4931,
"step": 2345
},
{
"epoch": 1.002560273095797,
"grad_norm": 4.24761438369751,
"learning_rate": 1.4800884955752213e-05,
"loss": 0.5427,
"step": 2350
},
{
"epoch": 1.004693834008961,
"grad_norm": 4.0507612228393555,
"learning_rate": 1.4785082174462706e-05,
"loss": 0.5448,
"step": 2355
},
{
"epoch": 1.006827394922125,
"grad_norm": 4.63831901550293,
"learning_rate": 1.4769279393173199e-05,
"loss": 0.4535,
"step": 2360
},
{
"epoch": 1.008960955835289,
"grad_norm": 3.7772696018218994,
"learning_rate": 1.4753476611883692e-05,
"loss": 0.4225,
"step": 2365
},
{
"epoch": 1.011094516748453,
"grad_norm": 5.008701324462891,
"learning_rate": 1.4737673830594187e-05,
"loss": 0.4968,
"step": 2370
},
{
"epoch": 1.0132280776616172,
"grad_norm": 4.774904727935791,
"learning_rate": 1.472187104930468e-05,
"loss": 0.487,
"step": 2375
},
{
"epoch": 1.0153616385747812,
"grad_norm": 4.383615970611572,
"learning_rate": 1.4706068268015172e-05,
"loss": 0.554,
"step": 2380
},
{
"epoch": 1.0174951994879453,
"grad_norm": 5.2985663414001465,
"learning_rate": 1.4690265486725665e-05,
"loss": 0.5981,
"step": 2385
},
{
"epoch": 1.0196287604011094,
"grad_norm": 4.582570552825928,
"learning_rate": 1.4674462705436158e-05,
"loss": 0.4725,
"step": 2390
},
{
"epoch": 1.0217623213142735,
"grad_norm": 4.422050476074219,
"learning_rate": 1.4658659924146653e-05,
"loss": 0.595,
"step": 2395
},
{
"epoch": 1.0238958822274375,
"grad_norm": 4.118880748748779,
"learning_rate": 1.4642857142857144e-05,
"loss": 0.5523,
"step": 2400
},
{
"epoch": 1.0260294431406016,
"grad_norm": 5.410613536834717,
"learning_rate": 1.4627054361567637e-05,
"loss": 0.5019,
"step": 2405
},
{
"epoch": 1.0281630040537657,
"grad_norm": 4.719892978668213,
"learning_rate": 1.461125158027813e-05,
"loss": 0.4736,
"step": 2410
},
{
"epoch": 1.0302965649669298,
"grad_norm": 3.992363452911377,
"learning_rate": 1.4595448798988622e-05,
"loss": 0.5149,
"step": 2415
},
{
"epoch": 1.0324301258800939,
"grad_norm": 4.250847816467285,
"learning_rate": 1.4579646017699117e-05,
"loss": 0.4451,
"step": 2420
},
{
"epoch": 1.034563686793258,
"grad_norm": 3.982151508331299,
"learning_rate": 1.456384323640961e-05,
"loss": 0.437,
"step": 2425
},
{
"epoch": 1.036697247706422,
"grad_norm": 4.611386299133301,
"learning_rate": 1.4548040455120103e-05,
"loss": 0.5862,
"step": 2430
},
{
"epoch": 1.038830808619586,
"grad_norm": 4.45111608505249,
"learning_rate": 1.4532237673830596e-05,
"loss": 0.488,
"step": 2435
},
{
"epoch": 1.0409643695327502,
"grad_norm": 4.2482757568359375,
"learning_rate": 1.4516434892541088e-05,
"loss": 0.4786,
"step": 2440
},
{
"epoch": 1.0430979304459143,
"grad_norm": 4.836545467376709,
"learning_rate": 1.4500632111251583e-05,
"loss": 0.5068,
"step": 2445
},
{
"epoch": 1.0452314913590783,
"grad_norm": 4.293904781341553,
"learning_rate": 1.4484829329962074e-05,
"loss": 0.4296,
"step": 2450
},
{
"epoch": 1.0473650522722424,
"grad_norm": 4.53025484085083,
"learning_rate": 1.4469026548672567e-05,
"loss": 0.4776,
"step": 2455
},
{
"epoch": 1.0494986131854065,
"grad_norm": 3.6319494247436523,
"learning_rate": 1.445322376738306e-05,
"loss": 0.4776,
"step": 2460
},
{
"epoch": 1.0516321740985706,
"grad_norm": 4.650353908538818,
"learning_rate": 1.4437420986093553e-05,
"loss": 0.5713,
"step": 2465
},
{
"epoch": 1.0537657350117346,
"grad_norm": 4.444876194000244,
"learning_rate": 1.4421618204804046e-05,
"loss": 0.4727,
"step": 2470
},
{
"epoch": 1.0558992959248987,
"grad_norm": 4.457956314086914,
"learning_rate": 1.440581542351454e-05,
"loss": 0.5128,
"step": 2475
},
{
"epoch": 1.0580328568380628,
"grad_norm": 5.207244873046875,
"learning_rate": 1.4390012642225033e-05,
"loss": 0.5382,
"step": 2480
},
{
"epoch": 1.0601664177512269,
"grad_norm": 4.9665398597717285,
"learning_rate": 1.4374209860935526e-05,
"loss": 0.4477,
"step": 2485
},
{
"epoch": 1.062299978664391,
"grad_norm": 4.235536575317383,
"learning_rate": 1.4358407079646019e-05,
"loss": 0.4534,
"step": 2490
},
{
"epoch": 1.0644335395775548,
"grad_norm": 4.579362869262695,
"learning_rate": 1.4342604298356513e-05,
"loss": 0.5227,
"step": 2495
},
{
"epoch": 1.066567100490719,
"grad_norm": 5.049173355102539,
"learning_rate": 1.4326801517067006e-05,
"loss": 0.4626,
"step": 2500
},
{
"epoch": 1.068700661403883,
"grad_norm": 3.9807817935943604,
"learning_rate": 1.4310998735777498e-05,
"loss": 0.5692,
"step": 2505
},
{
"epoch": 1.070834222317047,
"grad_norm": 4.704874515533447,
"learning_rate": 1.429519595448799e-05,
"loss": 0.4777,
"step": 2510
},
{
"epoch": 1.0729677832302111,
"grad_norm": 4.278888702392578,
"learning_rate": 1.4279393173198483e-05,
"loss": 0.4565,
"step": 2515
},
{
"epoch": 1.0751013441433752,
"grad_norm": 4.737946510314941,
"learning_rate": 1.4263590391908976e-05,
"loss": 0.5691,
"step": 2520
},
{
"epoch": 1.0772349050565393,
"grad_norm": 4.856266498565674,
"learning_rate": 1.424778761061947e-05,
"loss": 0.6618,
"step": 2525
},
{
"epoch": 1.0793684659697034,
"grad_norm": 4.112383842468262,
"learning_rate": 1.4231984829329964e-05,
"loss": 0.513,
"step": 2530
},
{
"epoch": 1.0815020268828675,
"grad_norm": 4.354598522186279,
"learning_rate": 1.4216182048040456e-05,
"loss": 0.4544,
"step": 2535
},
{
"epoch": 1.0836355877960315,
"grad_norm": 4.665713787078857,
"learning_rate": 1.420037926675095e-05,
"loss": 0.476,
"step": 2540
},
{
"epoch": 1.0857691487091956,
"grad_norm": 4.962756633758545,
"learning_rate": 1.4184576485461444e-05,
"loss": 0.4708,
"step": 2545
},
{
"epoch": 1.0879027096223597,
"grad_norm": 5.993206024169922,
"learning_rate": 1.4168773704171937e-05,
"loss": 0.5527,
"step": 2550
},
{
"epoch": 1.0900362705355238,
"grad_norm": 4.374881744384766,
"learning_rate": 1.4152970922882428e-05,
"loss": 0.4507,
"step": 2555
},
{
"epoch": 1.0921698314486878,
"grad_norm": 4.0649590492248535,
"learning_rate": 1.413716814159292e-05,
"loss": 0.5308,
"step": 2560
},
{
"epoch": 1.094303392361852,
"grad_norm": 4.3973917961120605,
"learning_rate": 1.4121365360303414e-05,
"loss": 0.461,
"step": 2565
},
{
"epoch": 1.096436953275016,
"grad_norm": 3.8453307151794434,
"learning_rate": 1.4105562579013907e-05,
"loss": 0.4482,
"step": 2570
},
{
"epoch": 1.09857051418818,
"grad_norm": 4.03675651550293,
"learning_rate": 1.4089759797724401e-05,
"loss": 0.4656,
"step": 2575
},
{
"epoch": 1.1007040751013442,
"grad_norm": 4.694682598114014,
"learning_rate": 1.4073957016434894e-05,
"loss": 0.5048,
"step": 2580
},
{
"epoch": 1.1028376360145082,
"grad_norm": 4.792722225189209,
"learning_rate": 1.4058154235145387e-05,
"loss": 0.4857,
"step": 2585
},
{
"epoch": 1.1049711969276723,
"grad_norm": 4.986560821533203,
"learning_rate": 1.404235145385588e-05,
"loss": 0.5365,
"step": 2590
},
{
"epoch": 1.1071047578408364,
"grad_norm": 3.972780466079712,
"learning_rate": 1.4026548672566374e-05,
"loss": 0.4044,
"step": 2595
},
{
"epoch": 1.1092383187540005,
"grad_norm": 4.662002086639404,
"learning_rate": 1.4010745891276867e-05,
"loss": 0.4476,
"step": 2600
},
{
"epoch": 1.1113718796671646,
"grad_norm": 5.053280353546143,
"learning_rate": 1.3994943109987358e-05,
"loss": 0.6451,
"step": 2605
},
{
"epoch": 1.1135054405803286,
"grad_norm": 3.8143022060394287,
"learning_rate": 1.3979140328697851e-05,
"loss": 0.4513,
"step": 2610
},
{
"epoch": 1.1156390014934927,
"grad_norm": 3.985607385635376,
"learning_rate": 1.3963337547408344e-05,
"loss": 0.3723,
"step": 2615
},
{
"epoch": 1.1177725624066568,
"grad_norm": 4.714868068695068,
"learning_rate": 1.3947534766118837e-05,
"loss": 0.4665,
"step": 2620
},
{
"epoch": 1.1199061233198209,
"grad_norm": 4.590309143066406,
"learning_rate": 1.3931731984829332e-05,
"loss": 0.522,
"step": 2625
},
{
"epoch": 1.122039684232985,
"grad_norm": 4.820577144622803,
"learning_rate": 1.3915929203539824e-05,
"loss": 0.5245,
"step": 2630
},
{
"epoch": 1.124173245146149,
"grad_norm": 4.013303756713867,
"learning_rate": 1.3900126422250317e-05,
"loss": 0.4309,
"step": 2635
},
{
"epoch": 1.126306806059313,
"grad_norm": 4.0792717933654785,
"learning_rate": 1.388432364096081e-05,
"loss": 0.4553,
"step": 2640
},
{
"epoch": 1.1284403669724772,
"grad_norm": 4.793567180633545,
"learning_rate": 1.3868520859671303e-05,
"loss": 0.4862,
"step": 2645
},
{
"epoch": 1.130573927885641,
"grad_norm": 4.6147260665893555,
"learning_rate": 1.3852718078381798e-05,
"loss": 0.5337,
"step": 2650
},
{
"epoch": 1.1327074887988051,
"grad_norm": 3.888552665710449,
"learning_rate": 1.383691529709229e-05,
"loss": 0.4743,
"step": 2655
},
{
"epoch": 1.1348410497119692,
"grad_norm": 4.152586460113525,
"learning_rate": 1.3821112515802782e-05,
"loss": 0.4623,
"step": 2660
},
{
"epoch": 1.1369746106251333,
"grad_norm": 4.437031269073486,
"learning_rate": 1.3805309734513275e-05,
"loss": 0.5064,
"step": 2665
},
{
"epoch": 1.1391081715382974,
"grad_norm": 4.659839153289795,
"learning_rate": 1.3789506953223767e-05,
"loss": 0.503,
"step": 2670
},
{
"epoch": 1.1412417324514614,
"grad_norm": 4.968484401702881,
"learning_rate": 1.3773704171934262e-05,
"loss": 0.5103,
"step": 2675
},
{
"epoch": 1.1433752933646255,
"grad_norm": 4.300429344177246,
"learning_rate": 1.3757901390644755e-05,
"loss": 0.4481,
"step": 2680
},
{
"epoch": 1.1455088542777896,
"grad_norm": 4.291085243225098,
"learning_rate": 1.3742098609355248e-05,
"loss": 0.4804,
"step": 2685
},
{
"epoch": 1.1476424151909537,
"grad_norm": 5.287739276885986,
"learning_rate": 1.372629582806574e-05,
"loss": 0.4587,
"step": 2690
},
{
"epoch": 1.1497759761041177,
"grad_norm": 4.765116214752197,
"learning_rate": 1.3710493046776234e-05,
"loss": 0.5718,
"step": 2695
},
{
"epoch": 1.1519095370172818,
"grad_norm": 4.568051815032959,
"learning_rate": 1.3694690265486728e-05,
"loss": 0.428,
"step": 2700
},
{
"epoch": 1.154043097930446,
"grad_norm": 5.483386516571045,
"learning_rate": 1.3678887484197221e-05,
"loss": 0.4927,
"step": 2705
},
{
"epoch": 1.15617665884361,
"grad_norm": 3.856008529663086,
"learning_rate": 1.3663084702907712e-05,
"loss": 0.4218,
"step": 2710
},
{
"epoch": 1.158310219756774,
"grad_norm": 4.166346549987793,
"learning_rate": 1.3647281921618205e-05,
"loss": 0.4965,
"step": 2715
},
{
"epoch": 1.1604437806699381,
"grad_norm": 3.648595094680786,
"learning_rate": 1.3631479140328698e-05,
"loss": 0.4098,
"step": 2720
},
{
"epoch": 1.1625773415831022,
"grad_norm": 4.0944695472717285,
"learning_rate": 1.361567635903919e-05,
"loss": 0.4597,
"step": 2725
},
{
"epoch": 1.1647109024962663,
"grad_norm": 4.698819160461426,
"learning_rate": 1.3599873577749685e-05,
"loss": 0.4558,
"step": 2730
},
{
"epoch": 1.1668444634094304,
"grad_norm": 4.123109340667725,
"learning_rate": 1.3584070796460178e-05,
"loss": 0.4923,
"step": 2735
},
{
"epoch": 1.1689780243225945,
"grad_norm": 3.7588372230529785,
"learning_rate": 1.3568268015170671e-05,
"loss": 0.5261,
"step": 2740
},
{
"epoch": 1.1711115852357585,
"grad_norm": 4.115803241729736,
"learning_rate": 1.3552465233881164e-05,
"loss": 0.5161,
"step": 2745
},
{
"epoch": 1.1732451461489226,
"grad_norm": 4.102150917053223,
"learning_rate": 1.3536662452591659e-05,
"loss": 0.524,
"step": 2750
},
{
"epoch": 1.1753787070620867,
"grad_norm": 5.2263264656066895,
"learning_rate": 1.3520859671302151e-05,
"loss": 0.4531,
"step": 2755
},
{
"epoch": 1.1775122679752508,
"grad_norm": 4.422746181488037,
"learning_rate": 1.3505056890012644e-05,
"loss": 0.5125,
"step": 2760
},
{
"epoch": 1.1796458288884148,
"grad_norm": 5.224121570587158,
"learning_rate": 1.3489254108723135e-05,
"loss": 0.5122,
"step": 2765
},
{
"epoch": 1.181779389801579,
"grad_norm": 4.187285423278809,
"learning_rate": 1.3473451327433628e-05,
"loss": 0.4124,
"step": 2770
},
{
"epoch": 1.1839129507147428,
"grad_norm": 4.32193660736084,
"learning_rate": 1.3457648546144121e-05,
"loss": 0.7222,
"step": 2775
},
{
"epoch": 1.1860465116279069,
"grad_norm": 4.2760539054870605,
"learning_rate": 1.3441845764854616e-05,
"loss": 0.4534,
"step": 2780
},
{
"epoch": 1.188180072541071,
"grad_norm": 5.836961269378662,
"learning_rate": 1.3426042983565109e-05,
"loss": 0.5466,
"step": 2785
},
{
"epoch": 1.190313633454235,
"grad_norm": 3.944058656692505,
"learning_rate": 1.3410240202275602e-05,
"loss": 0.5265,
"step": 2790
},
{
"epoch": 1.192447194367399,
"grad_norm": 3.726409912109375,
"learning_rate": 1.3394437420986094e-05,
"loss": 0.4826,
"step": 2795
},
{
"epoch": 1.1945807552805632,
"grad_norm": 4.134906768798828,
"learning_rate": 1.3378634639696589e-05,
"loss": 0.3948,
"step": 2800
},
{
"epoch": 1.1967143161937273,
"grad_norm": 4.931777000427246,
"learning_rate": 1.3362831858407082e-05,
"loss": 0.5772,
"step": 2805
},
{
"epoch": 1.1988478771068913,
"grad_norm": 4.482186794281006,
"learning_rate": 1.3347029077117575e-05,
"loss": 0.4961,
"step": 2810
},
{
"epoch": 1.2009814380200554,
"grad_norm": 4.433976173400879,
"learning_rate": 1.3331226295828066e-05,
"loss": 0.4958,
"step": 2815
},
{
"epoch": 1.2031149989332195,
"grad_norm": 4.163667678833008,
"learning_rate": 1.3315423514538559e-05,
"loss": 0.4373,
"step": 2820
},
{
"epoch": 1.2052485598463836,
"grad_norm": 4.757978916168213,
"learning_rate": 1.3299620733249052e-05,
"loss": 0.4558,
"step": 2825
},
{
"epoch": 1.2073821207595477,
"grad_norm": 5.00388240814209,
"learning_rate": 1.3283817951959546e-05,
"loss": 0.5777,
"step": 2830
},
{
"epoch": 1.2095156816727117,
"grad_norm": 3.7060387134552,
"learning_rate": 1.3268015170670039e-05,
"loss": 0.4237,
"step": 2835
},
{
"epoch": 1.2116492425858758,
"grad_norm": 3.6805360317230225,
"learning_rate": 1.3252212389380532e-05,
"loss": 0.435,
"step": 2840
},
{
"epoch": 1.2137828034990399,
"grad_norm": 5.00631046295166,
"learning_rate": 1.3236409608091025e-05,
"loss": 0.5261,
"step": 2845
},
{
"epoch": 1.215916364412204,
"grad_norm": 4.413496971130371,
"learning_rate": 1.322060682680152e-05,
"loss": 0.486,
"step": 2850
},
{
"epoch": 1.218049925325368,
"grad_norm": 4.5624098777771,
"learning_rate": 1.3204804045512012e-05,
"loss": 0.4067,
"step": 2855
},
{
"epoch": 1.2201834862385321,
"grad_norm": 4.407283782958984,
"learning_rate": 1.3189001264222505e-05,
"loss": 0.5407,
"step": 2860
},
{
"epoch": 1.2223170471516962,
"grad_norm": 4.349571228027344,
"learning_rate": 1.3173198482932996e-05,
"loss": 0.4533,
"step": 2865
},
{
"epoch": 1.2244506080648603,
"grad_norm": 4.270420074462891,
"learning_rate": 1.315739570164349e-05,
"loss": 0.4071,
"step": 2870
},
{
"epoch": 1.2265841689780244,
"grad_norm": 4.370794296264648,
"learning_rate": 1.3141592920353982e-05,
"loss": 0.591,
"step": 2875
},
{
"epoch": 1.2287177298911884,
"grad_norm": 4.321391582489014,
"learning_rate": 1.3125790139064477e-05,
"loss": 0.6181,
"step": 2880
},
{
"epoch": 1.2308512908043525,
"grad_norm": 4.3742523193359375,
"learning_rate": 1.310998735777497e-05,
"loss": 0.4433,
"step": 2885
},
{
"epoch": 1.2329848517175166,
"grad_norm": 4.781976699829102,
"learning_rate": 1.3094184576485462e-05,
"loss": 0.4551,
"step": 2890
},
{
"epoch": 1.2351184126306807,
"grad_norm": 4.505169868469238,
"learning_rate": 1.3078381795195955e-05,
"loss": 0.4241,
"step": 2895
},
{
"epoch": 1.2372519735438448,
"grad_norm": 4.392031669616699,
"learning_rate": 1.3062579013906448e-05,
"loss": 0.4712,
"step": 2900
},
{
"epoch": 1.2393855344570088,
"grad_norm": 4.21456241607666,
"learning_rate": 1.3046776232616943e-05,
"loss": 0.4937,
"step": 2905
},
{
"epoch": 1.241519095370173,
"grad_norm": 5.207596778869629,
"learning_rate": 1.3030973451327436e-05,
"loss": 0.5162,
"step": 2910
},
{
"epoch": 1.243652656283337,
"grad_norm": 4.438053607940674,
"learning_rate": 1.3015170670037928e-05,
"loss": 0.4604,
"step": 2915
},
{
"epoch": 1.245786217196501,
"grad_norm": 4.927883148193359,
"learning_rate": 1.299936788874842e-05,
"loss": 0.5275,
"step": 2920
},
{
"epoch": 1.2479197781096651,
"grad_norm": 4.349573135375977,
"learning_rate": 1.2983565107458913e-05,
"loss": 0.4295,
"step": 2925
},
{
"epoch": 1.2500533390228292,
"grad_norm": 3.778911590576172,
"learning_rate": 1.2967762326169407e-05,
"loss": 0.3889,
"step": 2930
},
{
"epoch": 1.2521868999359933,
"grad_norm": 3.9124295711517334,
"learning_rate": 1.29519595448799e-05,
"loss": 0.4719,
"step": 2935
},
{
"epoch": 1.2543204608491574,
"grad_norm": 4.444380283355713,
"learning_rate": 1.2936156763590393e-05,
"loss": 0.4852,
"step": 2940
},
{
"epoch": 1.2564540217623212,
"grad_norm": 4.332208633422852,
"learning_rate": 1.2920353982300886e-05,
"loss": 0.4006,
"step": 2945
},
{
"epoch": 1.2585875826754853,
"grad_norm": 4.473910331726074,
"learning_rate": 1.2904551201011379e-05,
"loss": 0.4696,
"step": 2950
},
{
"epoch": 1.2607211435886494,
"grad_norm": 4.439748287200928,
"learning_rate": 1.2888748419721873e-05,
"loss": 0.426,
"step": 2955
},
{
"epoch": 1.2628547045018135,
"grad_norm": 4.909393310546875,
"learning_rate": 1.2872945638432366e-05,
"loss": 0.4443,
"step": 2960
},
{
"epoch": 1.2649882654149776,
"grad_norm": 3.3987629413604736,
"learning_rate": 1.2857142857142859e-05,
"loss": 0.4202,
"step": 2965
},
{
"epoch": 1.2671218263281416,
"grad_norm": 5.1597418785095215,
"learning_rate": 1.284134007585335e-05,
"loss": 0.4982,
"step": 2970
},
{
"epoch": 1.2692553872413057,
"grad_norm": 4.151483535766602,
"learning_rate": 1.2825537294563843e-05,
"loss": 0.4503,
"step": 2975
},
{
"epoch": 1.2713889481544698,
"grad_norm": 4.21342658996582,
"learning_rate": 1.2809734513274338e-05,
"loss": 0.4007,
"step": 2980
},
{
"epoch": 1.2735225090676339,
"grad_norm": 4.098394870758057,
"learning_rate": 1.279393173198483e-05,
"loss": 0.4376,
"step": 2985
},
{
"epoch": 1.275656069980798,
"grad_norm": 4.1846723556518555,
"learning_rate": 1.2778128950695323e-05,
"loss": 0.4763,
"step": 2990
},
{
"epoch": 1.277789630893962,
"grad_norm": 4.545650482177734,
"learning_rate": 1.2762326169405816e-05,
"loss": 0.4709,
"step": 2995
},
{
"epoch": 1.279923191807126,
"grad_norm": 4.151042461395264,
"learning_rate": 1.2746523388116309e-05,
"loss": 0.5067,
"step": 3000
},
{
"epoch": 1.2820567527202902,
"grad_norm": 3.928866386413574,
"learning_rate": 1.2730720606826804e-05,
"loss": 0.4276,
"step": 3005
},
{
"epoch": 1.2841903136334543,
"grad_norm": 4.5378851890563965,
"learning_rate": 1.2714917825537296e-05,
"loss": 0.4038,
"step": 3010
},
{
"epoch": 1.2863238745466183,
"grad_norm": 4.427021026611328,
"learning_rate": 1.269911504424779e-05,
"loss": 0.5393,
"step": 3015
},
{
"epoch": 1.2884574354597824,
"grad_norm": 4.889599323272705,
"learning_rate": 1.2683312262958282e-05,
"loss": 0.4085,
"step": 3020
},
{
"epoch": 1.2905909963729465,
"grad_norm": 3.6527562141418457,
"learning_rate": 1.2667509481668773e-05,
"loss": 0.415,
"step": 3025
},
{
"epoch": 1.2927245572861106,
"grad_norm": 4.558052062988281,
"learning_rate": 1.2651706700379266e-05,
"loss": 0.5042,
"step": 3030
},
{
"epoch": 1.2948581181992747,
"grad_norm": 5.053831577301025,
"learning_rate": 1.2635903919089761e-05,
"loss": 0.6117,
"step": 3035
},
{
"epoch": 1.2969916791124387,
"grad_norm": 4.6014723777771,
"learning_rate": 1.2620101137800254e-05,
"loss": 0.4961,
"step": 3040
},
{
"epoch": 1.2991252400256028,
"grad_norm": 4.463820934295654,
"learning_rate": 1.2604298356510747e-05,
"loss": 0.4577,
"step": 3045
},
{
"epoch": 1.3012588009387667,
"grad_norm": 3.9237937927246094,
"learning_rate": 1.258849557522124e-05,
"loss": 0.4806,
"step": 3050
},
{
"epoch": 1.3033923618519307,
"grad_norm": 4.813193321228027,
"learning_rate": 1.2572692793931734e-05,
"loss": 0.5106,
"step": 3055
},
{
"epoch": 1.3055259227650948,
"grad_norm": 5.197639465332031,
"learning_rate": 1.2556890012642227e-05,
"loss": 0.4831,
"step": 3060
},
{
"epoch": 1.307659483678259,
"grad_norm": 3.4804067611694336,
"learning_rate": 1.254108723135272e-05,
"loss": 0.463,
"step": 3065
},
{
"epoch": 1.309793044591423,
"grad_norm": 5.394224166870117,
"learning_rate": 1.2525284450063213e-05,
"loss": 0.562,
"step": 3070
},
{
"epoch": 1.311926605504587,
"grad_norm": 3.6379518508911133,
"learning_rate": 1.2509481668773704e-05,
"loss": 0.407,
"step": 3075
},
{
"epoch": 1.3140601664177511,
"grad_norm": 3.515030860900879,
"learning_rate": 1.2493678887484197e-05,
"loss": 0.371,
"step": 3080
},
{
"epoch": 1.3161937273309152,
"grad_norm": 4.32614803314209,
"learning_rate": 1.2477876106194691e-05,
"loss": 0.4977,
"step": 3085
},
{
"epoch": 1.3183272882440793,
"grad_norm": 3.8737730979919434,
"learning_rate": 1.2462073324905184e-05,
"loss": 0.4592,
"step": 3090
},
{
"epoch": 1.3204608491572434,
"grad_norm": 5.182056903839111,
"learning_rate": 1.2446270543615677e-05,
"loss": 0.4373,
"step": 3095
},
{
"epoch": 1.3225944100704075,
"grad_norm": 4.620556831359863,
"learning_rate": 1.243046776232617e-05,
"loss": 0.41,
"step": 3100
},
{
"epoch": 1.3247279709835715,
"grad_norm": 3.9290201663970947,
"learning_rate": 1.2414664981036664e-05,
"loss": 0.4909,
"step": 3105
},
{
"epoch": 1.3268615318967356,
"grad_norm": 4.072641849517822,
"learning_rate": 1.2398862199747157e-05,
"loss": 0.4441,
"step": 3110
},
{
"epoch": 1.3289950928098997,
"grad_norm": 3.8127377033233643,
"learning_rate": 1.238305941845765e-05,
"loss": 0.4104,
"step": 3115
},
{
"epoch": 1.3311286537230638,
"grad_norm": 3.3174235820770264,
"learning_rate": 1.2367256637168143e-05,
"loss": 0.5863,
"step": 3120
},
{
"epoch": 1.3332622146362278,
"grad_norm": 4.908815383911133,
"learning_rate": 1.2351453855878634e-05,
"loss": 0.4861,
"step": 3125
},
{
"epoch": 1.335395775549392,
"grad_norm": 4.559378623962402,
"learning_rate": 1.2335651074589127e-05,
"loss": 0.448,
"step": 3130
},
{
"epoch": 1.337529336462556,
"grad_norm": 5.478936195373535,
"learning_rate": 1.2319848293299622e-05,
"loss": 0.4856,
"step": 3135
},
{
"epoch": 1.33966289737572,
"grad_norm": 3.962242364883423,
"learning_rate": 1.2304045512010115e-05,
"loss": 0.4659,
"step": 3140
},
{
"epoch": 1.3417964582888842,
"grad_norm": 4.678815841674805,
"learning_rate": 1.2288242730720607e-05,
"loss": 0.4927,
"step": 3145
},
{
"epoch": 1.3439300192020482,
"grad_norm": 4.564944744110107,
"learning_rate": 1.22724399494311e-05,
"loss": 0.3763,
"step": 3150
},
{
"epoch": 1.3460635801152123,
"grad_norm": 4.308345794677734,
"learning_rate": 1.2256637168141595e-05,
"loss": 0.3982,
"step": 3155
},
{
"epoch": 1.3481971410283764,
"grad_norm": 4.139796257019043,
"learning_rate": 1.2240834386852088e-05,
"loss": 0.4306,
"step": 3160
},
{
"epoch": 1.3503307019415405,
"grad_norm": 3.8987877368927,
"learning_rate": 1.222503160556258e-05,
"loss": 0.4321,
"step": 3165
},
{
"epoch": 1.3524642628547046,
"grad_norm": 4.241171360015869,
"learning_rate": 1.2209228824273074e-05,
"loss": 0.4122,
"step": 3170
},
{
"epoch": 1.3545978237678686,
"grad_norm": 4.369818210601807,
"learning_rate": 1.2193426042983566e-05,
"loss": 0.4976,
"step": 3175
},
{
"epoch": 1.3567313846810327,
"grad_norm": 4.028532028198242,
"learning_rate": 1.2177623261694058e-05,
"loss": 0.468,
"step": 3180
},
{
"epoch": 1.3588649455941968,
"grad_norm": 5.026562690734863,
"learning_rate": 1.2161820480404552e-05,
"loss": 0.5257,
"step": 3185
},
{
"epoch": 1.3609985065073609,
"grad_norm": 4.444199085235596,
"learning_rate": 1.2146017699115045e-05,
"loss": 0.4232,
"step": 3190
},
{
"epoch": 1.363132067420525,
"grad_norm": 3.5891497135162354,
"learning_rate": 1.2130214917825538e-05,
"loss": 0.4477,
"step": 3195
},
{
"epoch": 1.365265628333689,
"grad_norm": 4.004302024841309,
"learning_rate": 1.211441213653603e-05,
"loss": 0.4556,
"step": 3200
},
{
"epoch": 1.367399189246853,
"grad_norm": 4.825848579406738,
"learning_rate": 1.2098609355246524e-05,
"loss": 0.5006,
"step": 3205
},
{
"epoch": 1.3695327501600172,
"grad_norm": 3.6671814918518066,
"learning_rate": 1.2082806573957018e-05,
"loss": 0.4101,
"step": 3210
},
{
"epoch": 1.3716663110731813,
"grad_norm": 3.879122018814087,
"learning_rate": 1.2067003792667511e-05,
"loss": 0.3969,
"step": 3215
},
{
"epoch": 1.3737998719863453,
"grad_norm": 4.866029262542725,
"learning_rate": 1.2051201011378004e-05,
"loss": 0.3966,
"step": 3220
},
{
"epoch": 1.3759334328995092,
"grad_norm": 5.581394672393799,
"learning_rate": 1.2035398230088497e-05,
"loss": 0.494,
"step": 3225
},
{
"epoch": 1.3780669938126733,
"grad_norm": 4.49777364730835,
"learning_rate": 1.2019595448798988e-05,
"loss": 0.4584,
"step": 3230
},
{
"epoch": 1.3802005547258374,
"grad_norm": 4.342195510864258,
"learning_rate": 1.2003792667509483e-05,
"loss": 0.5692,
"step": 3235
},
{
"epoch": 1.3823341156390014,
"grad_norm": 4.519298076629639,
"learning_rate": 1.1987989886219975e-05,
"loss": 0.44,
"step": 3240
},
{
"epoch": 1.3844676765521655,
"grad_norm": 3.987589120864868,
"learning_rate": 1.1972187104930468e-05,
"loss": 0.4181,
"step": 3245
},
{
"epoch": 1.3866012374653296,
"grad_norm": 5.103501319885254,
"learning_rate": 1.1956384323640961e-05,
"loss": 0.486,
"step": 3250
},
{
"epoch": 1.3887347983784937,
"grad_norm": 4.8116631507873535,
"learning_rate": 1.1940581542351454e-05,
"loss": 0.4839,
"step": 3255
},
{
"epoch": 1.3908683592916578,
"grad_norm": 3.0158679485321045,
"learning_rate": 1.1924778761061949e-05,
"loss": 0.4425,
"step": 3260
},
{
"epoch": 1.3930019202048218,
"grad_norm": 4.129778861999512,
"learning_rate": 1.1908975979772442e-05,
"loss": 0.4286,
"step": 3265
},
{
"epoch": 1.395135481117986,
"grad_norm": 3.9445719718933105,
"learning_rate": 1.1893173198482934e-05,
"loss": 0.4605,
"step": 3270
},
{
"epoch": 1.39726904203115,
"grad_norm": 4.372819900512695,
"learning_rate": 1.1877370417193427e-05,
"loss": 0.4891,
"step": 3275
},
{
"epoch": 1.399402602944314,
"grad_norm": 4.265586853027344,
"learning_rate": 1.1861567635903922e-05,
"loss": 0.4053,
"step": 3280
},
{
"epoch": 1.4015361638574781,
"grad_norm": 4.457062244415283,
"learning_rate": 1.1845764854614411e-05,
"loss": 0.4438,
"step": 3285
},
{
"epoch": 1.4036697247706422,
"grad_norm": 3.898768663406372,
"learning_rate": 1.1829962073324906e-05,
"loss": 0.4735,
"step": 3290
},
{
"epoch": 1.4058032856838063,
"grad_norm": 5.223904132843018,
"learning_rate": 1.1814159292035399e-05,
"loss": 0.5695,
"step": 3295
},
{
"epoch": 1.4079368465969704,
"grad_norm": 4.149979591369629,
"learning_rate": 1.1798356510745892e-05,
"loss": 0.464,
"step": 3300
},
{
"epoch": 1.4100704075101345,
"grad_norm": 4.598405838012695,
"learning_rate": 1.1782553729456385e-05,
"loss": 0.4787,
"step": 3305
},
{
"epoch": 1.4122039684232985,
"grad_norm": 4.7623982429504395,
"learning_rate": 1.1766750948166879e-05,
"loss": 0.4302,
"step": 3310
},
{
"epoch": 1.4143375293364626,
"grad_norm": 3.785916805267334,
"learning_rate": 1.1750948166877372e-05,
"loss": 0.4341,
"step": 3315
},
{
"epoch": 1.4164710902496267,
"grad_norm": 3.600949764251709,
"learning_rate": 1.1735145385587865e-05,
"loss": 0.367,
"step": 3320
},
{
"epoch": 1.4186046511627908,
"grad_norm": 4.522639751434326,
"learning_rate": 1.1719342604298358e-05,
"loss": 0.4708,
"step": 3325
},
{
"epoch": 1.4207382120759546,
"grad_norm": 3.9708735942840576,
"learning_rate": 1.1703539823008852e-05,
"loss": 0.4751,
"step": 3330
},
{
"epoch": 1.4228717729891187,
"grad_norm": 4.261918067932129,
"learning_rate": 1.1687737041719342e-05,
"loss": 0.4474,
"step": 3335
},
{
"epoch": 1.4250053339022828,
"grad_norm": 4.602210521697998,
"learning_rate": 1.1671934260429836e-05,
"loss": 0.4168,
"step": 3340
},
{
"epoch": 1.4271388948154469,
"grad_norm": 4.670740604400635,
"learning_rate": 1.165613147914033e-05,
"loss": 0.4383,
"step": 3345
},
{
"epoch": 1.429272455728611,
"grad_norm": 4.135427951812744,
"learning_rate": 1.1640328697850822e-05,
"loss": 0.438,
"step": 3350
},
{
"epoch": 1.431406016641775,
"grad_norm": 4.086421012878418,
"learning_rate": 1.1624525916561315e-05,
"loss": 0.4385,
"step": 3355
},
{
"epoch": 1.433539577554939,
"grad_norm": 4.7353692054748535,
"learning_rate": 1.160872313527181e-05,
"loss": 0.4577,
"step": 3360
},
{
"epoch": 1.4356731384681032,
"grad_norm": 4.442506313323975,
"learning_rate": 1.1592920353982302e-05,
"loss": 0.435,
"step": 3365
},
{
"epoch": 1.4378066993812673,
"grad_norm": 4.130180358886719,
"learning_rate": 1.1577117572692795e-05,
"loss": 0.4775,
"step": 3370
},
{
"epoch": 1.4399402602944313,
"grad_norm": 3.9793550968170166,
"learning_rate": 1.1561314791403288e-05,
"loss": 0.4046,
"step": 3375
},
{
"epoch": 1.4420738212075954,
"grad_norm": 5.080319881439209,
"learning_rate": 1.1545512010113781e-05,
"loss": 0.4921,
"step": 3380
},
{
"epoch": 1.4442073821207595,
"grad_norm": 4.066112041473389,
"learning_rate": 1.1529709228824276e-05,
"loss": 0.4148,
"step": 3385
},
{
"epoch": 1.4463409430339236,
"grad_norm": 4.144713878631592,
"learning_rate": 1.1513906447534767e-05,
"loss": 0.4061,
"step": 3390
},
{
"epoch": 1.4484745039470877,
"grad_norm": 4.271573543548584,
"learning_rate": 1.149810366624526e-05,
"loss": 0.3859,
"step": 3395
},
{
"epoch": 1.4506080648602517,
"grad_norm": 3.9691481590270996,
"learning_rate": 1.1482300884955753e-05,
"loss": 0.5035,
"step": 3400
},
{
"epoch": 1.4527416257734158,
"grad_norm": 5.163878917694092,
"learning_rate": 1.1466498103666245e-05,
"loss": 0.4537,
"step": 3405
},
{
"epoch": 1.45487518668658,
"grad_norm": 3.944066286087036,
"learning_rate": 1.145069532237674e-05,
"loss": 0.4655,
"step": 3410
},
{
"epoch": 1.457008747599744,
"grad_norm": 4.504640102386475,
"learning_rate": 1.1434892541087233e-05,
"loss": 0.3928,
"step": 3415
},
{
"epoch": 1.459142308512908,
"grad_norm": 4.684928894042969,
"learning_rate": 1.1419089759797726e-05,
"loss": 0.4426,
"step": 3420
},
{
"epoch": 1.4612758694260721,
"grad_norm": 4.373624801635742,
"learning_rate": 1.1403286978508219e-05,
"loss": 0.506,
"step": 3425
},
{
"epoch": 1.4634094303392362,
"grad_norm": 4.624093532562256,
"learning_rate": 1.1387484197218712e-05,
"loss": 0.3835,
"step": 3430
},
{
"epoch": 1.4655429912524003,
"grad_norm": 4.191517353057861,
"learning_rate": 1.1371681415929206e-05,
"loss": 0.5111,
"step": 3435
},
{
"epoch": 1.4676765521655644,
"grad_norm": 4.49861478805542,
"learning_rate": 1.1355878634639697e-05,
"loss": 0.3948,
"step": 3440
},
{
"epoch": 1.4698101130787284,
"grad_norm": 4.452141284942627,
"learning_rate": 1.134007585335019e-05,
"loss": 0.4282,
"step": 3445
},
{
"epoch": 1.4719436739918925,
"grad_norm": 4.915762424468994,
"learning_rate": 1.1324273072060683e-05,
"loss": 0.4408,
"step": 3450
},
{
"epoch": 1.4740772349050566,
"grad_norm": 3.6840269565582275,
"learning_rate": 1.1308470290771176e-05,
"loss": 0.4192,
"step": 3455
},
{
"epoch": 1.4762107958182207,
"grad_norm": 3.9970285892486572,
"learning_rate": 1.1292667509481669e-05,
"loss": 0.4471,
"step": 3460
},
{
"epoch": 1.4783443567313848,
"grad_norm": 3.540578603744507,
"learning_rate": 1.1276864728192163e-05,
"loss": 0.4319,
"step": 3465
},
{
"epoch": 1.4804779176445488,
"grad_norm": 4.35250186920166,
"learning_rate": 1.1261061946902656e-05,
"loss": 0.4367,
"step": 3470
},
{
"epoch": 1.482611478557713,
"grad_norm": 4.181733131408691,
"learning_rate": 1.1245259165613149e-05,
"loss": 0.4216,
"step": 3475
},
{
"epoch": 1.484745039470877,
"grad_norm": 4.138208389282227,
"learning_rate": 1.1229456384323642e-05,
"loss": 0.4344,
"step": 3480
},
{
"epoch": 1.486878600384041,
"grad_norm": 4.309459209442139,
"learning_rate": 1.1213653603034137e-05,
"loss": 0.4263,
"step": 3485
},
{
"epoch": 1.4890121612972052,
"grad_norm": 4.881312370300293,
"learning_rate": 1.1197850821744628e-05,
"loss": 0.4619,
"step": 3490
},
{
"epoch": 1.4911457222103692,
"grad_norm": 3.9794363975524902,
"learning_rate": 1.118204804045512e-05,
"loss": 0.5105,
"step": 3495
},
{
"epoch": 1.4932792831235333,
"grad_norm": 4.884064674377441,
"learning_rate": 1.1166245259165613e-05,
"loss": 0.4722,
"step": 3500
},
{
"epoch": 1.4954128440366974,
"grad_norm": 4.475371360778809,
"learning_rate": 1.1150442477876106e-05,
"loss": 0.4152,
"step": 3505
},
{
"epoch": 1.4975464049498612,
"grad_norm": 3.525907039642334,
"learning_rate": 1.11346396965866e-05,
"loss": 0.4017,
"step": 3510
},
{
"epoch": 1.4996799658630253,
"grad_norm": 4.34111213684082,
"learning_rate": 1.1118836915297094e-05,
"loss": 0.4496,
"step": 3515
},
{
"epoch": 1.5018135267761896,
"grad_norm": 4.264297008514404,
"learning_rate": 1.1103034134007587e-05,
"loss": 0.4965,
"step": 3520
},
{
"epoch": 1.5039470876893535,
"grad_norm": 3.88047194480896,
"learning_rate": 1.108723135271808e-05,
"loss": 0.4009,
"step": 3525
},
{
"epoch": 1.5060806486025176,
"grad_norm": 4.288839340209961,
"learning_rate": 1.1071428571428572e-05,
"loss": 0.5115,
"step": 3530
},
{
"epoch": 1.5082142095156816,
"grad_norm": 4.137423992156982,
"learning_rate": 1.1055625790139067e-05,
"loss": 0.4007,
"step": 3535
},
{
"epoch": 1.5103477704288457,
"grad_norm": 4.802023887634277,
"learning_rate": 1.103982300884956e-05,
"loss": 0.4195,
"step": 3540
},
{
"epoch": 1.5124813313420098,
"grad_norm": 4.864555358886719,
"learning_rate": 1.1024020227560051e-05,
"loss": 0.4412,
"step": 3545
},
{
"epoch": 1.5146148922551739,
"grad_norm": 4.190124988555908,
"learning_rate": 1.1008217446270544e-05,
"loss": 0.5177,
"step": 3550
},
{
"epoch": 1.516748453168338,
"grad_norm": 3.876676321029663,
"learning_rate": 1.0992414664981037e-05,
"loss": 0.4553,
"step": 3555
},
{
"epoch": 1.518882014081502,
"grad_norm": 4.408483505249023,
"learning_rate": 1.097661188369153e-05,
"loss": 0.4378,
"step": 3560
},
{
"epoch": 1.521015574994666,
"grad_norm": 4.281198501586914,
"learning_rate": 1.0960809102402024e-05,
"loss": 0.4296,
"step": 3565
},
{
"epoch": 1.5231491359078302,
"grad_norm": 4.272386074066162,
"learning_rate": 1.0945006321112517e-05,
"loss": 0.406,
"step": 3570
},
{
"epoch": 1.5252826968209943,
"grad_norm": 4.343184947967529,
"learning_rate": 1.092920353982301e-05,
"loss": 0.4636,
"step": 3575
},
{
"epoch": 1.5274162577341583,
"grad_norm": 4.392275333404541,
"learning_rate": 1.0913400758533503e-05,
"loss": 0.4244,
"step": 3580
},
{
"epoch": 1.5295498186473224,
"grad_norm": 4.830089569091797,
"learning_rate": 1.0897597977243997e-05,
"loss": 0.5027,
"step": 3585
},
{
"epoch": 1.5316833795604865,
"grad_norm": 4.782260894775391,
"learning_rate": 1.088179519595449e-05,
"loss": 0.4284,
"step": 3590
},
{
"epoch": 1.5338169404736504,
"grad_norm": 4.546128273010254,
"learning_rate": 1.0865992414664981e-05,
"loss": 0.3893,
"step": 3595
},
{
"epoch": 1.5359505013868144,
"grad_norm": 4.104976177215576,
"learning_rate": 1.0850189633375474e-05,
"loss": 0.515,
"step": 3600
},
{
"epoch": 1.5380840622999785,
"grad_norm": 4.276109218597412,
"learning_rate": 1.0834386852085967e-05,
"loss": 0.3867,
"step": 3605
},
{
"epoch": 1.5402176232131426,
"grad_norm": 5.1553192138671875,
"learning_rate": 1.081858407079646e-05,
"loss": 0.5842,
"step": 3610
},
{
"epoch": 1.5423511841263067,
"grad_norm": 4.334433555603027,
"learning_rate": 1.0802781289506955e-05,
"loss": 0.3753,
"step": 3615
},
{
"epoch": 1.5444847450394708,
"grad_norm": 4.369014263153076,
"learning_rate": 1.0786978508217448e-05,
"loss": 0.4834,
"step": 3620
},
{
"epoch": 1.5466183059526348,
"grad_norm": 3.7355666160583496,
"learning_rate": 1.077117572692794e-05,
"loss": 0.4547,
"step": 3625
},
{
"epoch": 1.548751866865799,
"grad_norm": 4.238455295562744,
"learning_rate": 1.0755372945638433e-05,
"loss": 0.4224,
"step": 3630
},
{
"epoch": 1.550885427778963,
"grad_norm": 4.657512664794922,
"learning_rate": 1.0739570164348926e-05,
"loss": 0.4341,
"step": 3635
},
{
"epoch": 1.553018988692127,
"grad_norm": 4.200009346008301,
"learning_rate": 1.072376738305942e-05,
"loss": 0.4038,
"step": 3640
},
{
"epoch": 1.5551525496052911,
"grad_norm": 3.9212570190429688,
"learning_rate": 1.0707964601769914e-05,
"loss": 0.3606,
"step": 3645
},
{
"epoch": 1.5572861105184552,
"grad_norm": 4.267481327056885,
"learning_rate": 1.0692161820480405e-05,
"loss": 0.4427,
"step": 3650
},
{
"epoch": 1.5594196714316193,
"grad_norm": 4.647229194641113,
"learning_rate": 1.0676359039190898e-05,
"loss": 0.4919,
"step": 3655
},
{
"epoch": 1.5615532323447834,
"grad_norm": 4.122211456298828,
"learning_rate": 1.066055625790139e-05,
"loss": 0.362,
"step": 3660
},
{
"epoch": 1.5636867932579475,
"grad_norm": 4.275843143463135,
"learning_rate": 1.0644753476611885e-05,
"loss": 0.4261,
"step": 3665
},
{
"epoch": 1.5658203541711115,
"grad_norm": 4.351519584655762,
"learning_rate": 1.0628950695322378e-05,
"loss": 0.4269,
"step": 3670
},
{
"epoch": 1.5679539150842756,
"grad_norm": 4.048177242279053,
"learning_rate": 1.061314791403287e-05,
"loss": 0.4291,
"step": 3675
},
{
"epoch": 1.5700874759974397,
"grad_norm": 3.983682632446289,
"learning_rate": 1.0597345132743364e-05,
"loss": 0.3967,
"step": 3680
},
{
"epoch": 1.5722210369106038,
"grad_norm": 4.443732738494873,
"learning_rate": 1.0581542351453857e-05,
"loss": 0.4645,
"step": 3685
},
{
"epoch": 1.5743545978237679,
"grad_norm": 4.516783237457275,
"learning_rate": 1.0565739570164351e-05,
"loss": 0.4052,
"step": 3690
},
{
"epoch": 1.576488158736932,
"grad_norm": 4.3395538330078125,
"learning_rate": 1.0549936788874844e-05,
"loss": 0.4627,
"step": 3695
},
{
"epoch": 1.578621719650096,
"grad_norm": 3.796954393386841,
"learning_rate": 1.0534134007585335e-05,
"loss": 0.3856,
"step": 3700
},
{
"epoch": 1.58075528056326,
"grad_norm": 4.659563064575195,
"learning_rate": 1.0518331226295828e-05,
"loss": 0.4917,
"step": 3705
},
{
"epoch": 1.5828888414764242,
"grad_norm": 4.653059482574463,
"learning_rate": 1.0502528445006321e-05,
"loss": 0.4104,
"step": 3710
},
{
"epoch": 1.5850224023895882,
"grad_norm": 3.088853120803833,
"learning_rate": 1.0486725663716814e-05,
"loss": 0.3659,
"step": 3715
},
{
"epoch": 1.5871559633027523,
"grad_norm": 5.347471714019775,
"learning_rate": 1.0470922882427308e-05,
"loss": 0.5145,
"step": 3720
},
{
"epoch": 1.5892895242159164,
"grad_norm": 3.9844131469726562,
"learning_rate": 1.0455120101137801e-05,
"loss": 0.433,
"step": 3725
},
{
"epoch": 1.5914230851290805,
"grad_norm": 3.934298038482666,
"learning_rate": 1.0439317319848294e-05,
"loss": 0.3861,
"step": 3730
},
{
"epoch": 1.5935566460422446,
"grad_norm": 5.001667499542236,
"learning_rate": 1.0423514538558787e-05,
"loss": 0.4274,
"step": 3735
},
{
"epoch": 1.5956902069554086,
"grad_norm": 4.3066606521606445,
"learning_rate": 1.0407711757269282e-05,
"loss": 0.4857,
"step": 3740
},
{
"epoch": 1.5978237678685727,
"grad_norm": 4.791617393493652,
"learning_rate": 1.0391908975979774e-05,
"loss": 0.5526,
"step": 3745
},
{
"epoch": 1.5999573287817368,
"grad_norm": 5.158261299133301,
"learning_rate": 1.0376106194690266e-05,
"loss": 0.4052,
"step": 3750
},
{
"epoch": 1.6020908896949009,
"grad_norm": 4.077587127685547,
"learning_rate": 1.0360303413400759e-05,
"loss": 0.4087,
"step": 3755
},
{
"epoch": 1.604224450608065,
"grad_norm": 4.145094394683838,
"learning_rate": 1.0344500632111251e-05,
"loss": 0.3831,
"step": 3760
},
{
"epoch": 1.606358011521229,
"grad_norm": 4.910739421844482,
"learning_rate": 1.0328697850821744e-05,
"loss": 0.4532,
"step": 3765
},
{
"epoch": 1.6084915724343931,
"grad_norm": 3.4111921787261963,
"learning_rate": 1.0312895069532239e-05,
"loss": 0.4158,
"step": 3770
},
{
"epoch": 1.6106251333475572,
"grad_norm": 4.56186056137085,
"learning_rate": 1.0297092288242732e-05,
"loss": 0.3978,
"step": 3775
},
{
"epoch": 1.6127586942607213,
"grad_norm": 4.2800822257995605,
"learning_rate": 1.0281289506953225e-05,
"loss": 0.4082,
"step": 3780
},
{
"epoch": 1.6148922551738853,
"grad_norm": 4.143041133880615,
"learning_rate": 1.0265486725663717e-05,
"loss": 0.37,
"step": 3785
},
{
"epoch": 1.6170258160870494,
"grad_norm": 4.374189853668213,
"learning_rate": 1.0249683944374212e-05,
"loss": 0.4613,
"step": 3790
},
{
"epoch": 1.6191593770002135,
"grad_norm": 4.439857006072998,
"learning_rate": 1.0233881163084705e-05,
"loss": 0.4321,
"step": 3795
},
{
"epoch": 1.6212929379133776,
"grad_norm": 4.954322338104248,
"learning_rate": 1.0218078381795198e-05,
"loss": 0.4867,
"step": 3800
},
{
"epoch": 1.6234264988265417,
"grad_norm": 4.941915035247803,
"learning_rate": 1.0202275600505689e-05,
"loss": 0.4404,
"step": 3805
},
{
"epoch": 1.6255600597397055,
"grad_norm": 3.809436082839966,
"learning_rate": 1.0186472819216182e-05,
"loss": 0.4435,
"step": 3810
},
{
"epoch": 1.6276936206528696,
"grad_norm": 3.886300802230835,
"learning_rate": 1.0170670037926675e-05,
"loss": 0.4115,
"step": 3815
},
{
"epoch": 1.6298271815660337,
"grad_norm": 5.091635227203369,
"learning_rate": 1.015486725663717e-05,
"loss": 0.4811,
"step": 3820
},
{
"epoch": 1.6319607424791978,
"grad_norm": 4.407512664794922,
"learning_rate": 1.0139064475347662e-05,
"loss": 0.4128,
"step": 3825
},
{
"epoch": 1.6340943033923618,
"grad_norm": 4.914755821228027,
"learning_rate": 1.0123261694058155e-05,
"loss": 0.4569,
"step": 3830
},
{
"epoch": 1.636227864305526,
"grad_norm": 4.0821428298950195,
"learning_rate": 1.0107458912768648e-05,
"loss": 0.4714,
"step": 3835
},
{
"epoch": 1.63836142521869,
"grad_norm": 3.7787981033325195,
"learning_rate": 1.0091656131479142e-05,
"loss": 0.3844,
"step": 3840
},
{
"epoch": 1.640494986131854,
"grad_norm": 4.078645706176758,
"learning_rate": 1.0075853350189635e-05,
"loss": 0.3967,
"step": 3845
},
{
"epoch": 1.6426285470450182,
"grad_norm": 4.631909370422363,
"learning_rate": 1.0060050568900128e-05,
"loss": 0.3869,
"step": 3850
},
{
"epoch": 1.6447621079581822,
"grad_norm": 3.907121419906616,
"learning_rate": 1.004424778761062e-05,
"loss": 0.4722,
"step": 3855
},
{
"epoch": 1.6468956688713463,
"grad_norm": 3.532806634902954,
"learning_rate": 1.0028445006321112e-05,
"loss": 0.3377,
"step": 3860
},
{
"epoch": 1.6490292297845104,
"grad_norm": 3.775578260421753,
"learning_rate": 1.0012642225031605e-05,
"loss": 0.4017,
"step": 3865
},
{
"epoch": 1.6511627906976745,
"grad_norm": 4.62343168258667,
"learning_rate": 9.9968394437421e-06,
"loss": 0.3972,
"step": 3870
},
{
"epoch": 1.6532963516108385,
"grad_norm": 4.519557476043701,
"learning_rate": 9.981036662452593e-06,
"loss": 0.4669,
"step": 3875
},
{
"epoch": 1.6554299125240024,
"grad_norm": 4.216794013977051,
"learning_rate": 9.965233881163085e-06,
"loss": 0.4569,
"step": 3880
},
{
"epoch": 1.6575634734371665,
"grad_norm": 3.9565041065216064,
"learning_rate": 9.949431099873578e-06,
"loss": 0.4096,
"step": 3885
},
{
"epoch": 1.6596970343503306,
"grad_norm": 3.795283317565918,
"learning_rate": 9.933628318584071e-06,
"loss": 0.4411,
"step": 3890
},
{
"epoch": 1.6618305952634946,
"grad_norm": 4.511595726013184,
"learning_rate": 9.917825537294564e-06,
"loss": 0.4277,
"step": 3895
},
{
"epoch": 1.6639641561766587,
"grad_norm": 4.257791519165039,
"learning_rate": 9.902022756005057e-06,
"loss": 0.3477,
"step": 3900
},
{
"epoch": 1.6660977170898228,
"grad_norm": 5.031363010406494,
"learning_rate": 9.886219974715552e-06,
"loss": 0.4967,
"step": 3905
},
{
"epoch": 1.6682312780029869,
"grad_norm": 4.413186073303223,
"learning_rate": 9.870417193426044e-06,
"loss": 0.371,
"step": 3910
},
{
"epoch": 1.670364838916151,
"grad_norm": 4.389219760894775,
"learning_rate": 9.854614412136537e-06,
"loss": 0.4501,
"step": 3915
},
{
"epoch": 1.672498399829315,
"grad_norm": 4.395909309387207,
"learning_rate": 9.83881163084703e-06,
"loss": 0.4275,
"step": 3920
},
{
"epoch": 1.674631960742479,
"grad_norm": 4.407324314117432,
"learning_rate": 9.823008849557523e-06,
"loss": 0.423,
"step": 3925
},
{
"epoch": 1.6767655216556432,
"grad_norm": 4.677619934082031,
"learning_rate": 9.807206068268016e-06,
"loss": 0.4042,
"step": 3930
},
{
"epoch": 1.6788990825688073,
"grad_norm": 3.673964262008667,
"learning_rate": 9.791403286978509e-06,
"loss": 0.4142,
"step": 3935
},
{
"epoch": 1.6810326434819713,
"grad_norm": 4.465169429779053,
"learning_rate": 9.775600505689002e-06,
"loss": 0.4424,
"step": 3940
},
{
"epoch": 1.6831662043951354,
"grad_norm": 4.479079246520996,
"learning_rate": 9.759797724399495e-06,
"loss": 0.4194,
"step": 3945
},
{
"epoch": 1.6852997653082995,
"grad_norm": 4.500971794128418,
"learning_rate": 9.743994943109987e-06,
"loss": 0.456,
"step": 3950
},
{
"epoch": 1.6874333262214636,
"grad_norm": 3.6217551231384277,
"learning_rate": 9.72819216182048e-06,
"loss": 0.3881,
"step": 3955
},
{
"epoch": 1.6895668871346277,
"grad_norm": 3.604400873184204,
"learning_rate": 9.712389380530975e-06,
"loss": 0.3924,
"step": 3960
},
{
"epoch": 1.6917004480477917,
"grad_norm": 3.9066615104675293,
"learning_rate": 9.696586599241468e-06,
"loss": 0.4241,
"step": 3965
},
{
"epoch": 1.6938340089609558,
"grad_norm": 4.320919513702393,
"learning_rate": 9.68078381795196e-06,
"loss": 0.4007,
"step": 3970
},
{
"epoch": 1.69596756987412,
"grad_norm": 4.506808757781982,
"learning_rate": 9.664981036662453e-06,
"loss": 0.4394,
"step": 3975
},
{
"epoch": 1.698101130787284,
"grad_norm": 4.534313201904297,
"learning_rate": 9.649178255372946e-06,
"loss": 0.4414,
"step": 3980
},
{
"epoch": 1.700234691700448,
"grad_norm": 3.8250062465667725,
"learning_rate": 9.63337547408344e-06,
"loss": 0.4157,
"step": 3985
},
{
"epoch": 1.7023682526136121,
"grad_norm": 4.336977005004883,
"learning_rate": 9.617572692793932e-06,
"loss": 0.4267,
"step": 3990
},
{
"epoch": 1.7045018135267762,
"grad_norm": 3.7711403369903564,
"learning_rate": 9.601769911504427e-06,
"loss": 0.399,
"step": 3995
},
{
"epoch": 1.7066353744399403,
"grad_norm": 3.7794551849365234,
"learning_rate": 9.585967130214918e-06,
"loss": 0.4125,
"step": 4000
},
{
"epoch": 1.7087689353531044,
"grad_norm": 3.5244486331939697,
"learning_rate": 9.57016434892541e-06,
"loss": 0.4284,
"step": 4005
},
{
"epoch": 1.7109024962662684,
"grad_norm": 4.090603351593018,
"learning_rate": 9.554361567635905e-06,
"loss": 0.3937,
"step": 4010
},
{
"epoch": 1.7130360571794325,
"grad_norm": 5.152937412261963,
"learning_rate": 9.538558786346398e-06,
"loss": 0.4293,
"step": 4015
},
{
"epoch": 1.7151696180925966,
"grad_norm": 5.739314079284668,
"learning_rate": 9.522756005056891e-06,
"loss": 0.4916,
"step": 4020
},
{
"epoch": 1.7173031790057607,
"grad_norm": 3.626616954803467,
"learning_rate": 9.506953223767384e-06,
"loss": 0.3178,
"step": 4025
},
{
"epoch": 1.7194367399189248,
"grad_norm": 4.267282962799072,
"learning_rate": 9.491150442477877e-06,
"loss": 0.4212,
"step": 4030
},
{
"epoch": 1.7215703008320888,
"grad_norm": 3.8010354042053223,
"learning_rate": 9.47534766118837e-06,
"loss": 0.4214,
"step": 4035
},
{
"epoch": 1.723703861745253,
"grad_norm": 4.761562347412109,
"learning_rate": 9.459544879898863e-06,
"loss": 0.4117,
"step": 4040
},
{
"epoch": 1.725837422658417,
"grad_norm": 4.024481773376465,
"learning_rate": 9.443742098609357e-06,
"loss": 0.4416,
"step": 4045
},
{
"epoch": 1.727970983571581,
"grad_norm": 4.028324127197266,
"learning_rate": 9.427939317319848e-06,
"loss": 0.4346,
"step": 4050
},
{
"epoch": 1.7301045444847452,
"grad_norm": 4.539582252502441,
"learning_rate": 9.412136536030341e-06,
"loss": 0.4518,
"step": 4055
},
{
"epoch": 1.7322381053979092,
"grad_norm": 3.911097526550293,
"learning_rate": 9.396333754740836e-06,
"loss": 0.4419,
"step": 4060
},
{
"epoch": 1.7343716663110733,
"grad_norm": 4.783831596374512,
"learning_rate": 9.380530973451329e-06,
"loss": 0.4222,
"step": 4065
},
{
"epoch": 1.7365052272242374,
"grad_norm": 3.7832529544830322,
"learning_rate": 9.364728192161821e-06,
"loss": 0.4169,
"step": 4070
},
{
"epoch": 1.7386387881374015,
"grad_norm": 4.095695495605469,
"learning_rate": 9.348925410872314e-06,
"loss": 0.4259,
"step": 4075
},
{
"epoch": 1.7407723490505655,
"grad_norm": 4.235688209533691,
"learning_rate": 9.333122629582807e-06,
"loss": 0.3874,
"step": 4080
},
{
"epoch": 1.7429059099637296,
"grad_norm": 4.536983489990234,
"learning_rate": 9.3173198482933e-06,
"loss": 0.4318,
"step": 4085
},
{
"epoch": 1.7450394708768935,
"grad_norm": 3.5148367881774902,
"learning_rate": 9.301517067003793e-06,
"loss": 0.4566,
"step": 4090
},
{
"epoch": 1.7471730317900576,
"grad_norm": 4.49671745300293,
"learning_rate": 9.285714285714288e-06,
"loss": 0.4351,
"step": 4095
},
{
"epoch": 1.7493065927032216,
"grad_norm": 6.083174705505371,
"learning_rate": 9.26991150442478e-06,
"loss": 0.5365,
"step": 4100
},
{
"epoch": 1.7514401536163857,
"grad_norm": 3.449429512023926,
"learning_rate": 9.254108723135272e-06,
"loss": 0.3924,
"step": 4105
},
{
"epoch": 1.7535737145295498,
"grad_norm": 3.685215950012207,
"learning_rate": 9.238305941845766e-06,
"loss": 0.4072,
"step": 4110
},
{
"epoch": 1.7557072754427139,
"grad_norm": 4.891070365905762,
"learning_rate": 9.222503160556259e-06,
"loss": 0.4324,
"step": 4115
},
{
"epoch": 1.757840836355878,
"grad_norm": 4.432384014129639,
"learning_rate": 9.206700379266752e-06,
"loss": 0.3614,
"step": 4120
},
{
"epoch": 1.759974397269042,
"grad_norm": 4.073850631713867,
"learning_rate": 9.190897597977245e-06,
"loss": 0.4256,
"step": 4125
},
{
"epoch": 1.7621079581822061,
"grad_norm": 4.328255653381348,
"learning_rate": 9.175094816687738e-06,
"loss": 0.4156,
"step": 4130
},
{
"epoch": 1.7642415190953702,
"grad_norm": 4.667512893676758,
"learning_rate": 9.15929203539823e-06,
"loss": 0.4392,
"step": 4135
},
{
"epoch": 1.7663750800085343,
"grad_norm": 3.1964938640594482,
"learning_rate": 9.143489254108723e-06,
"loss": 0.3626,
"step": 4140
},
{
"epoch": 1.7685086409216983,
"grad_norm": 4.073289394378662,
"learning_rate": 9.127686472819218e-06,
"loss": 0.4007,
"step": 4145
},
{
"epoch": 1.7706422018348624,
"grad_norm": 4.5602545738220215,
"learning_rate": 9.111883691529711e-06,
"loss": 0.4329,
"step": 4150
},
{
"epoch": 1.7727757627480265,
"grad_norm": 4.590991497039795,
"learning_rate": 9.096080910240202e-06,
"loss": 0.4734,
"step": 4155
},
{
"epoch": 1.7749093236611904,
"grad_norm": 4.509527683258057,
"learning_rate": 9.080278128950697e-06,
"loss": 0.3963,
"step": 4160
},
{
"epoch": 1.7770428845743544,
"grad_norm": 4.132796287536621,
"learning_rate": 9.06447534766119e-06,
"loss": 0.3708,
"step": 4165
},
{
"epoch": 1.7791764454875185,
"grad_norm": 3.8218822479248047,
"learning_rate": 9.048672566371682e-06,
"loss": 0.385,
"step": 4170
},
{
"epoch": 1.7813100064006826,
"grad_norm": 4.133715629577637,
"learning_rate": 9.032869785082175e-06,
"loss": 0.3566,
"step": 4175
},
{
"epoch": 1.7834435673138467,
"grad_norm": 4.322253227233887,
"learning_rate": 9.017067003792668e-06,
"loss": 0.4153,
"step": 4180
},
{
"epoch": 1.7855771282270108,
"grad_norm": 4.001209259033203,
"learning_rate": 9.001264222503161e-06,
"loss": 0.3932,
"step": 4185
},
{
"epoch": 1.7877106891401748,
"grad_norm": 3.911928176879883,
"learning_rate": 8.985461441213654e-06,
"loss": 0.421,
"step": 4190
},
{
"epoch": 1.789844250053339,
"grad_norm": 4.439943313598633,
"learning_rate": 8.969658659924147e-06,
"loss": 0.411,
"step": 4195
},
{
"epoch": 1.791977810966503,
"grad_norm": 4.437414169311523,
"learning_rate": 8.953855878634641e-06,
"loss": 0.4133,
"step": 4200
},
{
"epoch": 1.794111371879667,
"grad_norm": 4.42112398147583,
"learning_rate": 8.938053097345133e-06,
"loss": 0.4671,
"step": 4205
},
{
"epoch": 1.7962449327928312,
"grad_norm": 4.949692726135254,
"learning_rate": 8.922250316055625e-06,
"loss": 0.4151,
"step": 4210
},
{
"epoch": 1.7983784937059952,
"grad_norm": 4.196777820587158,
"learning_rate": 8.90644753476612e-06,
"loss": 0.3906,
"step": 4215
},
{
"epoch": 1.8005120546191593,
"grad_norm": 3.9743521213531494,
"learning_rate": 8.890644753476613e-06,
"loss": 0.38,
"step": 4220
},
{
"epoch": 1.8026456155323234,
"grad_norm": 3.6272308826446533,
"learning_rate": 8.874841972187106e-06,
"loss": 0.4217,
"step": 4225
},
{
"epoch": 1.8047791764454875,
"grad_norm": 4.151697635650635,
"learning_rate": 8.859039190897599e-06,
"loss": 0.391,
"step": 4230
},
{
"epoch": 1.8069127373586515,
"grad_norm": 3.6812775135040283,
"learning_rate": 8.843236409608091e-06,
"loss": 0.4046,
"step": 4235
},
{
"epoch": 1.8090462982718156,
"grad_norm": 3.767869234085083,
"learning_rate": 8.827433628318584e-06,
"loss": 0.4448,
"step": 4240
},
{
"epoch": 1.8111798591849797,
"grad_norm": 4.005570411682129,
"learning_rate": 8.811630847029077e-06,
"loss": 0.4197,
"step": 4245
},
{
"epoch": 1.8133134200981438,
"grad_norm": 3.979504108428955,
"learning_rate": 8.795828065739572e-06,
"loss": 0.4442,
"step": 4250
},
{
"epoch": 1.8154469810113079,
"grad_norm": 4.660085201263428,
"learning_rate": 8.780025284450065e-06,
"loss": 0.4299,
"step": 4255
},
{
"epoch": 1.817580541924472,
"grad_norm": 3.891530990600586,
"learning_rate": 8.764222503160556e-06,
"loss": 0.3981,
"step": 4260
},
{
"epoch": 1.819714102837636,
"grad_norm": 4.534486293792725,
"learning_rate": 8.74841972187105e-06,
"loss": 0.4499,
"step": 4265
},
{
"epoch": 1.8218476637508,
"grad_norm": 5.114262580871582,
"learning_rate": 8.732616940581543e-06,
"loss": 0.4081,
"step": 4270
},
{
"epoch": 1.8239812246639642,
"grad_norm": 3.8600802421569824,
"learning_rate": 8.716814159292036e-06,
"loss": 0.389,
"step": 4275
},
{
"epoch": 1.8261147855771283,
"grad_norm": 3.8939297199249268,
"learning_rate": 8.701011378002529e-06,
"loss": 0.4191,
"step": 4280
},
{
"epoch": 1.8282483464902923,
"grad_norm": 4.046480655670166,
"learning_rate": 8.685208596713022e-06,
"loss": 0.3995,
"step": 4285
},
{
"epoch": 1.8303819074034564,
"grad_norm": 3.921691417694092,
"learning_rate": 8.669405815423515e-06,
"loss": 0.3994,
"step": 4290
},
{
"epoch": 1.8325154683166205,
"grad_norm": 4.322704792022705,
"learning_rate": 8.653603034134008e-06,
"loss": 0.4107,
"step": 4295
},
{
"epoch": 1.8346490292297846,
"grad_norm": 4.012720108032227,
"learning_rate": 8.637800252844502e-06,
"loss": 0.3564,
"step": 4300
},
{
"epoch": 1.8367825901429486,
"grad_norm": 4.165811061859131,
"learning_rate": 8.621997471554995e-06,
"loss": 0.403,
"step": 4305
},
{
"epoch": 1.8389161510561127,
"grad_norm": 3.617009401321411,
"learning_rate": 8.606194690265486e-06,
"loss": 0.3079,
"step": 4310
},
{
"epoch": 1.8410497119692768,
"grad_norm": 3.6533565521240234,
"learning_rate": 8.59039190897598e-06,
"loss": 0.3903,
"step": 4315
},
{
"epoch": 1.8431832728824409,
"grad_norm": 4.798677444458008,
"learning_rate": 8.574589127686474e-06,
"loss": 0.4295,
"step": 4320
},
{
"epoch": 1.845316833795605,
"grad_norm": 4.112203598022461,
"learning_rate": 8.558786346396967e-06,
"loss": 0.4108,
"step": 4325
},
{
"epoch": 1.847450394708769,
"grad_norm": 3.938917398452759,
"learning_rate": 8.54298356510746e-06,
"loss": 0.3975,
"step": 4330
},
{
"epoch": 1.8495839556219331,
"grad_norm": 3.279505968093872,
"learning_rate": 8.527180783817952e-06,
"loss": 0.36,
"step": 4335
},
{
"epoch": 1.8517175165350972,
"grad_norm": 4.134898662567139,
"learning_rate": 8.511378002528445e-06,
"loss": 0.3914,
"step": 4340
},
{
"epoch": 1.8538510774482613,
"grad_norm": 4.558252811431885,
"learning_rate": 8.495575221238938e-06,
"loss": 0.3827,
"step": 4345
},
{
"epoch": 1.8559846383614254,
"grad_norm": 3.923495292663574,
"learning_rate": 8.479772439949433e-06,
"loss": 0.4142,
"step": 4350
},
{
"epoch": 1.8581181992745894,
"grad_norm": 3.759852170944214,
"learning_rate": 8.463969658659926e-06,
"loss": 0.3415,
"step": 4355
},
{
"epoch": 1.8602517601877535,
"grad_norm": 3.701773166656494,
"learning_rate": 8.448166877370418e-06,
"loss": 0.3879,
"step": 4360
},
{
"epoch": 1.8623853211009176,
"grad_norm": 4.116669654846191,
"learning_rate": 8.432364096080911e-06,
"loss": 0.4991,
"step": 4365
},
{
"epoch": 1.8645188820140814,
"grad_norm": 4.087536811828613,
"learning_rate": 8.416561314791404e-06,
"loss": 0.3334,
"step": 4370
},
{
"epoch": 1.8666524429272455,
"grad_norm": 4.180550575256348,
"learning_rate": 8.400758533501897e-06,
"loss": 0.3978,
"step": 4375
},
{
"epoch": 1.8687860038404096,
"grad_norm": 4.742728233337402,
"learning_rate": 8.38495575221239e-06,
"loss": 0.4217,
"step": 4380
},
{
"epoch": 1.8709195647535737,
"grad_norm": 5.107389450073242,
"learning_rate": 8.369152970922883e-06,
"loss": 0.4304,
"step": 4385
},
{
"epoch": 1.8730531256667378,
"grad_norm": 4.502194404602051,
"learning_rate": 8.353350189633376e-06,
"loss": 0.3924,
"step": 4390
},
{
"epoch": 1.8751866865799018,
"grad_norm": 4.717820167541504,
"learning_rate": 8.337547408343869e-06,
"loss": 0.4051,
"step": 4395
},
{
"epoch": 1.877320247493066,
"grad_norm": 4.939509391784668,
"learning_rate": 8.321744627054363e-06,
"loss": 0.4178,
"step": 4400
},
{
"epoch": 1.87945380840623,
"grad_norm": 4.088109493255615,
"learning_rate": 8.305941845764856e-06,
"loss": 0.3778,
"step": 4405
},
{
"epoch": 1.881587369319394,
"grad_norm": 3.666149854660034,
"learning_rate": 8.290139064475349e-06,
"loss": 0.3979,
"step": 4410
},
{
"epoch": 1.8837209302325582,
"grad_norm": 4.25274658203125,
"learning_rate": 8.274336283185842e-06,
"loss": 0.3942,
"step": 4415
},
{
"epoch": 1.8858544911457222,
"grad_norm": 3.761164665222168,
"learning_rate": 8.258533501896335e-06,
"loss": 0.3896,
"step": 4420
},
{
"epoch": 1.8879880520588863,
"grad_norm": 3.869654417037964,
"learning_rate": 8.242730720606827e-06,
"loss": 0.4409,
"step": 4425
},
{
"epoch": 1.8901216129720504,
"grad_norm": 3.9715282917022705,
"learning_rate": 8.22692793931732e-06,
"loss": 0.4299,
"step": 4430
},
{
"epoch": 1.8922551738852145,
"grad_norm": 3.939626455307007,
"learning_rate": 8.211125158027813e-06,
"loss": 0.399,
"step": 4435
},
{
"epoch": 1.8943887347983785,
"grad_norm": 3.9082634449005127,
"learning_rate": 8.195322376738306e-06,
"loss": 0.4148,
"step": 4440
},
{
"epoch": 1.8965222957115424,
"grad_norm": 4.443459510803223,
"learning_rate": 8.179519595448799e-06,
"loss": 0.4501,
"step": 4445
},
{
"epoch": 1.8986558566247065,
"grad_norm": 3.7164127826690674,
"learning_rate": 8.163716814159292e-06,
"loss": 0.3909,
"step": 4450
},
{
"epoch": 1.9007894175378706,
"grad_norm": 4.089094161987305,
"learning_rate": 8.147914032869786e-06,
"loss": 0.4316,
"step": 4455
},
{
"epoch": 1.9029229784510346,
"grad_norm": 4.615549087524414,
"learning_rate": 8.13211125158028e-06,
"loss": 0.3662,
"step": 4460
},
{
"epoch": 1.9050565393641987,
"grad_norm": 4.402790069580078,
"learning_rate": 8.11630847029077e-06,
"loss": 0.4446,
"step": 4465
},
{
"epoch": 1.9071901002773628,
"grad_norm": 3.7550132274627686,
"learning_rate": 8.100505689001265e-06,
"loss": 0.3931,
"step": 4470
},
{
"epoch": 1.9093236611905269,
"grad_norm": 3.8456757068634033,
"learning_rate": 8.084702907711758e-06,
"loss": 0.3947,
"step": 4475
},
{
"epoch": 1.911457222103691,
"grad_norm": 3.9646549224853516,
"learning_rate": 8.06890012642225e-06,
"loss": 0.3415,
"step": 4480
},
{
"epoch": 1.913590783016855,
"grad_norm": 4.79493522644043,
"learning_rate": 8.053097345132744e-06,
"loss": 0.3661,
"step": 4485
},
{
"epoch": 1.9157243439300191,
"grad_norm": 3.39487886428833,
"learning_rate": 8.037294563843238e-06,
"loss": 0.3743,
"step": 4490
},
{
"epoch": 1.9178579048431832,
"grad_norm": 3.981194496154785,
"learning_rate": 8.02149178255373e-06,
"loss": 0.3918,
"step": 4495
},
{
"epoch": 1.9199914657563473,
"grad_norm": 3.72912859916687,
"learning_rate": 8.005689001264222e-06,
"loss": 0.3459,
"step": 4500
},
{
"epoch": 1.9221250266695113,
"grad_norm": 4.027529239654541,
"learning_rate": 7.989886219974717e-06,
"loss": 0.3944,
"step": 4505
},
{
"epoch": 1.9242585875826754,
"grad_norm": 3.9047298431396484,
"learning_rate": 7.97408343868521e-06,
"loss": 0.3727,
"step": 4510
},
{
"epoch": 1.9263921484958395,
"grad_norm": 4.601792335510254,
"learning_rate": 7.958280657395703e-06,
"loss": 0.3956,
"step": 4515
},
{
"epoch": 1.9285257094090036,
"grad_norm": 4.3443756103515625,
"learning_rate": 7.942477876106195e-06,
"loss": 0.4233,
"step": 4520
},
{
"epoch": 1.9306592703221677,
"grad_norm": 4.115539073944092,
"learning_rate": 7.926675094816688e-06,
"loss": 0.3929,
"step": 4525
},
{
"epoch": 1.9327928312353317,
"grad_norm": 4.373830795288086,
"learning_rate": 7.910872313527181e-06,
"loss": 0.4206,
"step": 4530
},
{
"epoch": 1.9349263921484958,
"grad_norm": 4.398421287536621,
"learning_rate": 7.895069532237674e-06,
"loss": 0.3835,
"step": 4535
},
{
"epoch": 1.93705995306166,
"grad_norm": 4.316149711608887,
"learning_rate": 7.879266750948169e-06,
"loss": 0.459,
"step": 4540
},
{
"epoch": 1.939193513974824,
"grad_norm": 4.977426052093506,
"learning_rate": 7.86346396965866e-06,
"loss": 0.4094,
"step": 4545
},
{
"epoch": 1.941327074887988,
"grad_norm": 3.934713125228882,
"learning_rate": 7.847661188369153e-06,
"loss": 0.3996,
"step": 4550
},
{
"epoch": 1.9434606358011521,
"grad_norm": 5.019830226898193,
"learning_rate": 7.831858407079647e-06,
"loss": 0.4309,
"step": 4555
},
{
"epoch": 1.9455941967143162,
"grad_norm": 3.6168367862701416,
"learning_rate": 7.81605562579014e-06,
"loss": 0.3579,
"step": 4560
},
{
"epoch": 1.9477277576274803,
"grad_norm": 4.30971097946167,
"learning_rate": 7.800252844500633e-06,
"loss": 0.4028,
"step": 4565
},
{
"epoch": 1.9498613185406444,
"grad_norm": 4.342047214508057,
"learning_rate": 7.784450063211126e-06,
"loss": 0.3659,
"step": 4570
},
{
"epoch": 1.9519948794538085,
"grad_norm": 4.505122184753418,
"learning_rate": 7.768647281921619e-06,
"loss": 0.3697,
"step": 4575
},
{
"epoch": 1.9541284403669725,
"grad_norm": 4.1636271476745605,
"learning_rate": 7.752844500632112e-06,
"loss": 0.3926,
"step": 4580
},
{
"epoch": 1.9562620012801366,
"grad_norm": 4.6492719650268555,
"learning_rate": 7.737041719342605e-06,
"loss": 0.3827,
"step": 4585
},
{
"epoch": 1.9583955621933007,
"grad_norm": 4.6462812423706055,
"learning_rate": 7.721238938053099e-06,
"loss": 0.4603,
"step": 4590
},
{
"epoch": 1.9605291231064648,
"grad_norm": 4.198066711425781,
"learning_rate": 7.70543615676359e-06,
"loss": 0.4473,
"step": 4595
},
{
"epoch": 1.9626626840196288,
"grad_norm": 3.5620765686035156,
"learning_rate": 7.689633375474083e-06,
"loss": 0.4027,
"step": 4600
},
{
"epoch": 1.964796244932793,
"grad_norm": 4.463738918304443,
"learning_rate": 7.673830594184578e-06,
"loss": 0.4164,
"step": 4605
},
{
"epoch": 1.966929805845957,
"grad_norm": 4.4071364402771,
"learning_rate": 7.65802781289507e-06,
"loss": 0.4502,
"step": 4610
},
{
"epoch": 1.969063366759121,
"grad_norm": 4.269393444061279,
"learning_rate": 7.642225031605563e-06,
"loss": 0.3708,
"step": 4615
},
{
"epoch": 1.9711969276722852,
"grad_norm": 3.918552875518799,
"learning_rate": 7.626422250316056e-06,
"loss": 0.3747,
"step": 4620
},
{
"epoch": 1.9733304885854492,
"grad_norm": 3.7500672340393066,
"learning_rate": 7.610619469026549e-06,
"loss": 0.3385,
"step": 4625
},
{
"epoch": 1.9754640494986133,
"grad_norm": 5.05972957611084,
"learning_rate": 7.594816687737042e-06,
"loss": 0.3845,
"step": 4630
},
{
"epoch": 1.9775976104117774,
"grad_norm": 4.773119926452637,
"learning_rate": 7.579013906447536e-06,
"loss": 0.4506,
"step": 4635
},
{
"epoch": 1.9797311713249415,
"grad_norm": 4.228442192077637,
"learning_rate": 7.563211125158029e-06,
"loss": 0.3889,
"step": 4640
},
{
"epoch": 1.9818647322381056,
"grad_norm": 4.035905838012695,
"learning_rate": 7.5474083438685216e-06,
"loss": 0.4104,
"step": 4645
},
{
"epoch": 1.9839982931512696,
"grad_norm": 5.019458293914795,
"learning_rate": 7.5316055625790144e-06,
"loss": 0.4283,
"step": 4650
},
{
"epoch": 1.9861318540644335,
"grad_norm": 4.0822978019714355,
"learning_rate": 7.515802781289507e-06,
"loss": 0.3305,
"step": 4655
},
{
"epoch": 1.9882654149775976,
"grad_norm": 3.953634023666382,
"learning_rate": 7.500000000000001e-06,
"loss": 0.393,
"step": 4660
},
{
"epoch": 1.9903989758907616,
"grad_norm": 4.482757091522217,
"learning_rate": 7.484197218710494e-06,
"loss": 0.416,
"step": 4665
},
{
"epoch": 1.9925325368039257,
"grad_norm": 3.983945369720459,
"learning_rate": 7.468394437420987e-06,
"loss": 0.3541,
"step": 4670
},
{
"epoch": 1.9946660977170898,
"grad_norm": 3.568307638168335,
"learning_rate": 7.45259165613148e-06,
"loss": 0.4027,
"step": 4675
},
{
"epoch": 1.9967996586302539,
"grad_norm": 4.563660621643066,
"learning_rate": 7.4367888748419725e-06,
"loss": 0.3685,
"step": 4680
},
{
"epoch": 1.998933219543418,
"grad_norm": 4.3235554695129395,
"learning_rate": 7.420986093552465e-06,
"loss": 0.3796,
"step": 4685
},
{
"epoch": 2.0,
"eval_evaluator": 0.9877204489141523,
"eval_loss": 0.1771988570690155,
"eval_runtime": 125.952,
"eval_samples_per_second": 18.158,
"eval_steps_per_second": 2.271,
"step": 4688
},
{
"epoch": 2.000853424365266,
"grad_norm": 3.9099795818328857,
"learning_rate": 7.405183312262959e-06,
"loss": 0.3541,
"step": 4690
},
{
"epoch": 2.00298698527843,
"grad_norm": 4.449174404144287,
"learning_rate": 7.389380530973452e-06,
"loss": 0.4464,
"step": 4695
},
{
"epoch": 2.005120546191594,
"grad_norm": 3.8515255451202393,
"learning_rate": 7.373577749683945e-06,
"loss": 0.3864,
"step": 4700
},
{
"epoch": 2.007254107104758,
"grad_norm": 3.965477466583252,
"learning_rate": 7.357774968394438e-06,
"loss": 0.3213,
"step": 4705
},
{
"epoch": 2.009387668017922,
"grad_norm": 4.251551628112793,
"learning_rate": 7.341972187104931e-06,
"loss": 0.3773,
"step": 4710
},
{
"epoch": 2.011521228931086,
"grad_norm": 4.710624694824219,
"learning_rate": 7.326169405815424e-06,
"loss": 0.4576,
"step": 4715
},
{
"epoch": 2.01365478984425,
"grad_norm": 3.9441611766815186,
"learning_rate": 7.310366624525917e-06,
"loss": 0.3428,
"step": 4720
},
{
"epoch": 2.015788350757414,
"grad_norm": 3.8340277671813965,
"learning_rate": 7.294563843236411e-06,
"loss": 0.4218,
"step": 4725
},
{
"epoch": 2.017921911670578,
"grad_norm": 5.2679219245910645,
"learning_rate": 7.278761061946903e-06,
"loss": 0.4831,
"step": 4730
},
{
"epoch": 2.020055472583742,
"grad_norm": 4.511500835418701,
"learning_rate": 7.262958280657396e-06,
"loss": 0.4228,
"step": 4735
},
{
"epoch": 2.022189033496906,
"grad_norm": 3.8335185050964355,
"learning_rate": 7.2471554993678896e-06,
"loss": 0.3894,
"step": 4740
},
{
"epoch": 2.0243225944100702,
"grad_norm": 3.6677894592285156,
"learning_rate": 7.2313527180783824e-06,
"loss": 0.4975,
"step": 4745
},
{
"epoch": 2.0264561553232343,
"grad_norm": 3.9892594814300537,
"learning_rate": 7.215549936788876e-06,
"loss": 0.3693,
"step": 4750
},
{
"epoch": 2.0285897162363984,
"grad_norm": 3.551366090774536,
"learning_rate": 7.199747155499368e-06,
"loss": 0.3899,
"step": 4755
},
{
"epoch": 2.0307232771495625,
"grad_norm": 3.675837516784668,
"learning_rate": 7.183944374209861e-06,
"loss": 0.365,
"step": 4760
},
{
"epoch": 2.0328568380627265,
"grad_norm": 4.143002033233643,
"learning_rate": 7.168141592920355e-06,
"loss": 0.4794,
"step": 4765
},
{
"epoch": 2.0349903989758906,
"grad_norm": 3.7219038009643555,
"learning_rate": 7.152338811630848e-06,
"loss": 0.4105,
"step": 4770
},
{
"epoch": 2.0371239598890547,
"grad_norm": 4.915626525878906,
"learning_rate": 7.136536030341341e-06,
"loss": 0.4166,
"step": 4775
},
{
"epoch": 2.039257520802219,
"grad_norm": 4.077551364898682,
"learning_rate": 7.120733249051833e-06,
"loss": 0.3861,
"step": 4780
},
{
"epoch": 2.041391081715383,
"grad_norm": 4.394146919250488,
"learning_rate": 7.104930467762326e-06,
"loss": 0.4576,
"step": 4785
},
{
"epoch": 2.043524642628547,
"grad_norm": 4.272018909454346,
"learning_rate": 7.08912768647282e-06,
"loss": 0.4476,
"step": 4790
},
{
"epoch": 2.045658203541711,
"grad_norm": 4.265909194946289,
"learning_rate": 7.073324905183313e-06,
"loss": 0.4411,
"step": 4795
},
{
"epoch": 2.047791764454875,
"grad_norm": 3.3345210552215576,
"learning_rate": 7.057522123893807e-06,
"loss": 0.3745,
"step": 4800
},
{
"epoch": 2.049925325368039,
"grad_norm": 3.579895496368408,
"learning_rate": 7.041719342604299e-06,
"loss": 0.5016,
"step": 4805
},
{
"epoch": 2.0520588862812033,
"grad_norm": 3.8164703845977783,
"learning_rate": 7.0259165613147915e-06,
"loss": 0.4101,
"step": 4810
},
{
"epoch": 2.0541924471943673,
"grad_norm": 3.7674551010131836,
"learning_rate": 7.010113780025285e-06,
"loss": 0.4508,
"step": 4815
},
{
"epoch": 2.0563260081075314,
"grad_norm": 4.42473030090332,
"learning_rate": 6.994310998735778e-06,
"loss": 0.4173,
"step": 4820
},
{
"epoch": 2.0584595690206955,
"grad_norm": 3.532865285873413,
"learning_rate": 6.978508217446272e-06,
"loss": 0.4008,
"step": 4825
},
{
"epoch": 2.0605931299338596,
"grad_norm": 3.7293918132781982,
"learning_rate": 6.962705436156764e-06,
"loss": 0.4128,
"step": 4830
},
{
"epoch": 2.0627266908470236,
"grad_norm": 4.818244457244873,
"learning_rate": 6.946902654867257e-06,
"loss": 0.4003,
"step": 4835
},
{
"epoch": 2.0648602517601877,
"grad_norm": 3.0435068607330322,
"learning_rate": 6.9310998735777505e-06,
"loss": 0.389,
"step": 4840
},
{
"epoch": 2.066993812673352,
"grad_norm": 3.7885820865631104,
"learning_rate": 6.915297092288243e-06,
"loss": 0.3549,
"step": 4845
},
{
"epoch": 2.069127373586516,
"grad_norm": 3.8366339206695557,
"learning_rate": 6.899494310998737e-06,
"loss": 0.4706,
"step": 4850
},
{
"epoch": 2.07126093449968,
"grad_norm": 4.349545955657959,
"learning_rate": 6.88369152970923e-06,
"loss": 0.398,
"step": 4855
},
{
"epoch": 2.073394495412844,
"grad_norm": 4.000646591186523,
"learning_rate": 6.867888748419722e-06,
"loss": 0.3775,
"step": 4860
},
{
"epoch": 2.075528056326008,
"grad_norm": 4.560521602630615,
"learning_rate": 6.852085967130216e-06,
"loss": 0.4547,
"step": 4865
},
{
"epoch": 2.077661617239172,
"grad_norm": 4.749170780181885,
"learning_rate": 6.8362831858407086e-06,
"loss": 0.4048,
"step": 4870
},
{
"epoch": 2.0797951781523363,
"grad_norm": 4.038886070251465,
"learning_rate": 6.820480404551202e-06,
"loss": 0.4562,
"step": 4875
},
{
"epoch": 2.0819287390655004,
"grad_norm": 4.166180610656738,
"learning_rate": 6.804677623261695e-06,
"loss": 0.4327,
"step": 4880
},
{
"epoch": 2.0840622999786644,
"grad_norm": 4.0146050453186035,
"learning_rate": 6.788874841972187e-06,
"loss": 0.49,
"step": 4885
},
{
"epoch": 2.0861958608918285,
"grad_norm": 3.8984649181365967,
"learning_rate": 6.773072060682681e-06,
"loss": 0.3935,
"step": 4890
},
{
"epoch": 2.0883294218049926,
"grad_norm": 4.444411754608154,
"learning_rate": 6.757269279393174e-06,
"loss": 0.3395,
"step": 4895
},
{
"epoch": 2.0904629827181567,
"grad_norm": 4.304215908050537,
"learning_rate": 6.741466498103667e-06,
"loss": 0.4126,
"step": 4900
},
{
"epoch": 2.0925965436313207,
"grad_norm": 4.9501118659973145,
"learning_rate": 6.72566371681416e-06,
"loss": 0.5132,
"step": 4905
},
{
"epoch": 2.094730104544485,
"grad_norm": 3.9908359050750732,
"learning_rate": 6.709860935524652e-06,
"loss": 0.4168,
"step": 4910
},
{
"epoch": 2.096863665457649,
"grad_norm": 3.9207379817962646,
"learning_rate": 6.694058154235146e-06,
"loss": 0.4265,
"step": 4915
},
{
"epoch": 2.098997226370813,
"grad_norm": 3.6123504638671875,
"learning_rate": 6.678255372945639e-06,
"loss": 0.4056,
"step": 4920
},
{
"epoch": 2.101130787283977,
"grad_norm": 3.6765084266662598,
"learning_rate": 6.662452591656132e-06,
"loss": 0.391,
"step": 4925
},
{
"epoch": 2.103264348197141,
"grad_norm": 3.6983890533447266,
"learning_rate": 6.646649810366626e-06,
"loss": 0.452,
"step": 4930
},
{
"epoch": 2.105397909110305,
"grad_norm": 4.123793125152588,
"learning_rate": 6.630847029077118e-06,
"loss": 0.3674,
"step": 4935
},
{
"epoch": 2.1075314700234693,
"grad_norm": 4.987173080444336,
"learning_rate": 6.6150442477876105e-06,
"loss": 0.4484,
"step": 4940
},
{
"epoch": 2.1096650309366334,
"grad_norm": 4.321348667144775,
"learning_rate": 6.599241466498104e-06,
"loss": 0.4447,
"step": 4945
},
{
"epoch": 2.1117985918497975,
"grad_norm": 3.74847149848938,
"learning_rate": 6.583438685208597e-06,
"loss": 0.4224,
"step": 4950
},
{
"epoch": 2.1139321527629615,
"grad_norm": 4.59506368637085,
"learning_rate": 6.567635903919091e-06,
"loss": 0.4074,
"step": 4955
},
{
"epoch": 2.1160657136761256,
"grad_norm": 5.061137676239014,
"learning_rate": 6.551833122629583e-06,
"loss": 0.4031,
"step": 4960
},
{
"epoch": 2.1181992745892897,
"grad_norm": 4.153493881225586,
"learning_rate": 6.536030341340076e-06,
"loss": 0.4308,
"step": 4965
},
{
"epoch": 2.1203328355024538,
"grad_norm": 4.935019493103027,
"learning_rate": 6.5202275600505694e-06,
"loss": 0.5257,
"step": 4970
},
{
"epoch": 2.122466396415618,
"grad_norm": 4.236289978027344,
"learning_rate": 6.504424778761062e-06,
"loss": 0.4075,
"step": 4975
},
{
"epoch": 2.124599957328782,
"grad_norm": 4.530546188354492,
"learning_rate": 6.488621997471556e-06,
"loss": 0.3787,
"step": 4980
},
{
"epoch": 2.126733518241946,
"grad_norm": 3.604985237121582,
"learning_rate": 6.472819216182049e-06,
"loss": 0.415,
"step": 4985
},
{
"epoch": 2.1288670791551096,
"grad_norm": 3.9142749309539795,
"learning_rate": 6.457016434892541e-06,
"loss": 0.4389,
"step": 4990
},
{
"epoch": 2.131000640068274,
"grad_norm": 4.416208744049072,
"learning_rate": 6.441213653603035e-06,
"loss": 0.4248,
"step": 4995
},
{
"epoch": 2.133134200981438,
"grad_norm": 4.188726902008057,
"learning_rate": 6.4254108723135275e-06,
"loss": 0.4361,
"step": 5000
},
{
"epoch": 2.1352677618946023,
"grad_norm": 3.7509398460388184,
"learning_rate": 6.409608091024021e-06,
"loss": 0.3702,
"step": 5005
},
{
"epoch": 2.137401322807766,
"grad_norm": 3.6640818119049072,
"learning_rate": 6.393805309734514e-06,
"loss": 0.397,
"step": 5010
},
{
"epoch": 2.13953488372093,
"grad_norm": 3.572796106338501,
"learning_rate": 6.378002528445006e-06,
"loss": 0.4284,
"step": 5015
},
{
"epoch": 2.141668444634094,
"grad_norm": 3.946179151535034,
"learning_rate": 6.3621997471555e-06,
"loss": 0.3721,
"step": 5020
},
{
"epoch": 2.143802005547258,
"grad_norm": 4.065572738647461,
"learning_rate": 6.346396965865993e-06,
"loss": 0.3834,
"step": 5025
},
{
"epoch": 2.1459355664604223,
"grad_norm": 4.278841018676758,
"learning_rate": 6.3305941845764865e-06,
"loss": 0.3937,
"step": 5030
},
{
"epoch": 2.1480691273735864,
"grad_norm": 3.989811658859253,
"learning_rate": 6.314791403286979e-06,
"loss": 0.3983,
"step": 5035
},
{
"epoch": 2.1502026882867504,
"grad_norm": 3.7060208320617676,
"learning_rate": 6.298988621997471e-06,
"loss": 0.3478,
"step": 5040
},
{
"epoch": 2.1523362491999145,
"grad_norm": 4.620419502258301,
"learning_rate": 6.283185840707965e-06,
"loss": 0.4092,
"step": 5045
},
{
"epoch": 2.1544698101130786,
"grad_norm": 3.965667247772217,
"learning_rate": 6.267383059418458e-06,
"loss": 0.4197,
"step": 5050
},
{
"epoch": 2.1566033710262427,
"grad_norm": 3.439267158508301,
"learning_rate": 6.251580278128952e-06,
"loss": 0.38,
"step": 5055
},
{
"epoch": 2.1587369319394067,
"grad_norm": 3.8684778213500977,
"learning_rate": 6.2357774968394446e-06,
"loss": 0.3638,
"step": 5060
},
{
"epoch": 2.160870492852571,
"grad_norm": 4.314586639404297,
"learning_rate": 6.219974715549937e-06,
"loss": 0.4944,
"step": 5065
},
{
"epoch": 2.163004053765735,
"grad_norm": 4.4946722984313965,
"learning_rate": 6.20417193426043e-06,
"loss": 0.4456,
"step": 5070
},
{
"epoch": 2.165137614678899,
"grad_norm": 4.2454071044921875,
"learning_rate": 6.188369152970923e-06,
"loss": 0.4251,
"step": 5075
},
{
"epoch": 2.167271175592063,
"grad_norm": 4.110542297363281,
"learning_rate": 6.172566371681417e-06,
"loss": 0.4029,
"step": 5080
},
{
"epoch": 2.169404736505227,
"grad_norm": 4.185825347900391,
"learning_rate": 6.15676359039191e-06,
"loss": 0.3532,
"step": 5085
},
{
"epoch": 2.171538297418391,
"grad_norm": 3.9194812774658203,
"learning_rate": 6.140960809102402e-06,
"loss": 0.3824,
"step": 5090
},
{
"epoch": 2.1736718583315553,
"grad_norm": 3.871528387069702,
"learning_rate": 6.1251580278128955e-06,
"loss": 0.4084,
"step": 5095
},
{
"epoch": 2.1758054192447194,
"grad_norm": 4.807036876678467,
"learning_rate": 6.109355246523388e-06,
"loss": 0.4154,
"step": 5100
},
{
"epoch": 2.1779389801578835,
"grad_norm": 3.8847007751464844,
"learning_rate": 6.093552465233882e-06,
"loss": 0.3933,
"step": 5105
},
{
"epoch": 2.1800725410710475,
"grad_norm": 3.88389253616333,
"learning_rate": 6.077749683944375e-06,
"loss": 0.392,
"step": 5110
},
{
"epoch": 2.1822061019842116,
"grad_norm": 3.9758973121643066,
"learning_rate": 6.061946902654868e-06,
"loss": 0.3911,
"step": 5115
},
{
"epoch": 2.1843396628973757,
"grad_norm": 6.718324661254883,
"learning_rate": 6.046144121365361e-06,
"loss": 0.338,
"step": 5120
},
{
"epoch": 2.1864732238105398,
"grad_norm": 4.2538981437683105,
"learning_rate": 6.030341340075854e-06,
"loss": 0.3823,
"step": 5125
},
{
"epoch": 2.188606784723704,
"grad_norm": 4.390100955963135,
"learning_rate": 6.014538558786347e-06,
"loss": 0.4173,
"step": 5130
},
{
"epoch": 2.190740345636868,
"grad_norm": 3.906677722930908,
"learning_rate": 5.99873577749684e-06,
"loss": 0.3648,
"step": 5135
},
{
"epoch": 2.192873906550032,
"grad_norm": 4.540939807891846,
"learning_rate": 5.982932996207333e-06,
"loss": 0.4877,
"step": 5140
},
{
"epoch": 2.195007467463196,
"grad_norm": 4.823376655578613,
"learning_rate": 5.967130214917826e-06,
"loss": 0.4069,
"step": 5145
},
{
"epoch": 2.19714102837636,
"grad_norm": 3.3051562309265137,
"learning_rate": 5.951327433628319e-06,
"loss": 0.447,
"step": 5150
},
{
"epoch": 2.1992745892895242,
"grad_norm": 4.335805892944336,
"learning_rate": 5.935524652338812e-06,
"loss": 0.4186,
"step": 5155
},
{
"epoch": 2.2014081502026883,
"grad_norm": 4.150125980377197,
"learning_rate": 5.9197218710493054e-06,
"loss": 0.4088,
"step": 5160
},
{
"epoch": 2.2035417111158524,
"grad_norm": 6.423460960388184,
"learning_rate": 5.903919089759798e-06,
"loss": 0.5046,
"step": 5165
},
{
"epoch": 2.2056752720290165,
"grad_norm": 4.221232891082764,
"learning_rate": 5.888116308470291e-06,
"loss": 0.409,
"step": 5170
},
{
"epoch": 2.2078088329421806,
"grad_norm": 4.568662643432617,
"learning_rate": 5.872313527180784e-06,
"loss": 0.4102,
"step": 5175
},
{
"epoch": 2.2099423938553446,
"grad_norm": 4.5810089111328125,
"learning_rate": 5.856510745891277e-06,
"loss": 0.4145,
"step": 5180
},
{
"epoch": 2.2120759547685087,
"grad_norm": 3.7417185306549072,
"learning_rate": 5.840707964601771e-06,
"loss": 0.4229,
"step": 5185
},
{
"epoch": 2.214209515681673,
"grad_norm": 3.5175845623016357,
"learning_rate": 5.8249051833122635e-06,
"loss": 0.4613,
"step": 5190
},
{
"epoch": 2.216343076594837,
"grad_norm": 3.3224520683288574,
"learning_rate": 5.809102402022756e-06,
"loss": 0.413,
"step": 5195
},
{
"epoch": 2.218476637508001,
"grad_norm": 3.738093852996826,
"learning_rate": 5.793299620733249e-06,
"loss": 0.3725,
"step": 5200
},
{
"epoch": 2.220610198421165,
"grad_norm": 4.55635929107666,
"learning_rate": 5.777496839443742e-06,
"loss": 0.4417,
"step": 5205
},
{
"epoch": 2.222743759334329,
"grad_norm": 3.722395658493042,
"learning_rate": 5.761694058154236e-06,
"loss": 0.3696,
"step": 5210
},
{
"epoch": 2.224877320247493,
"grad_norm": 4.449429988861084,
"learning_rate": 5.745891276864729e-06,
"loss": 0.4294,
"step": 5215
},
{
"epoch": 2.2270108811606573,
"grad_norm": 5.1202778816223145,
"learning_rate": 5.730088495575221e-06,
"loss": 0.4408,
"step": 5220
},
{
"epoch": 2.2291444420738213,
"grad_norm": 4.097009181976318,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.4007,
"step": 5225
},
{
"epoch": 2.2312780029869854,
"grad_norm": 3.9691903591156006,
"learning_rate": 5.698482932996207e-06,
"loss": 0.4927,
"step": 5230
},
{
"epoch": 2.2334115639001495,
"grad_norm": 4.4439921379089355,
"learning_rate": 5.682680151706701e-06,
"loss": 0.3488,
"step": 5235
},
{
"epoch": 2.2355451248133136,
"grad_norm": 3.679431438446045,
"learning_rate": 5.666877370417194e-06,
"loss": 0.4898,
"step": 5240
},
{
"epoch": 2.2376786857264777,
"grad_norm": 4.508449077606201,
"learning_rate": 5.651074589127688e-06,
"loss": 0.43,
"step": 5245
},
{
"epoch": 2.2398122466396417,
"grad_norm": 4.32645320892334,
"learning_rate": 5.63527180783818e-06,
"loss": 0.4281,
"step": 5250
},
{
"epoch": 2.241945807552806,
"grad_norm": 3.590062141418457,
"learning_rate": 5.619469026548673e-06,
"loss": 0.4307,
"step": 5255
},
{
"epoch": 2.24407936846597,
"grad_norm": 4.356314182281494,
"learning_rate": 5.603666245259166e-06,
"loss": 0.4487,
"step": 5260
},
{
"epoch": 2.246212929379134,
"grad_norm": 4.716103553771973,
"learning_rate": 5.587863463969659e-06,
"loss": 0.4379,
"step": 5265
},
{
"epoch": 2.248346490292298,
"grad_norm": 4.041739463806152,
"learning_rate": 5.572060682680153e-06,
"loss": 0.3901,
"step": 5270
},
{
"epoch": 2.2504800512054617,
"grad_norm": 3.673758029937744,
"learning_rate": 5.556257901390645e-06,
"loss": 0.4434,
"step": 5275
},
{
"epoch": 2.252613612118626,
"grad_norm": 4.207825660705566,
"learning_rate": 5.540455120101138e-06,
"loss": 0.3758,
"step": 5280
},
{
"epoch": 2.25474717303179,
"grad_norm": 4.471085548400879,
"learning_rate": 5.5246523388116315e-06,
"loss": 0.3693,
"step": 5285
},
{
"epoch": 2.2568807339449544,
"grad_norm": 4.205175399780273,
"learning_rate": 5.508849557522124e-06,
"loss": 0.3965,
"step": 5290
},
{
"epoch": 2.259014294858118,
"grad_norm": 4.6766533851623535,
"learning_rate": 5.493046776232618e-06,
"loss": 0.3508,
"step": 5295
},
{
"epoch": 2.261147855771282,
"grad_norm": 4.840975284576416,
"learning_rate": 5.47724399494311e-06,
"loss": 0.4771,
"step": 5300
},
{
"epoch": 2.263281416684446,
"grad_norm": 3.9258434772491455,
"learning_rate": 5.461441213653603e-06,
"loss": 0.3601,
"step": 5305
},
{
"epoch": 2.2654149775976102,
"grad_norm": 4.151968479156494,
"learning_rate": 5.445638432364097e-06,
"loss": 0.4161,
"step": 5310
},
{
"epoch": 2.2675485385107743,
"grad_norm": 4.543685436248779,
"learning_rate": 5.42983565107459e-06,
"loss": 0.4813,
"step": 5315
},
{
"epoch": 2.2696820994239384,
"grad_norm": 3.733811140060425,
"learning_rate": 5.414032869785083e-06,
"loss": 0.3715,
"step": 5320
},
{
"epoch": 2.2718156603371025,
"grad_norm": 3.2990171909332275,
"learning_rate": 5.398230088495575e-06,
"loss": 0.3873,
"step": 5325
},
{
"epoch": 2.2739492212502666,
"grad_norm": 4.1041436195373535,
"learning_rate": 5.382427307206068e-06,
"loss": 0.4092,
"step": 5330
},
{
"epoch": 2.2760827821634306,
"grad_norm": 4.364267349243164,
"learning_rate": 5.366624525916562e-06,
"loss": 0.4073,
"step": 5335
},
{
"epoch": 2.2782163430765947,
"grad_norm": 3.123594045639038,
"learning_rate": 5.350821744627055e-06,
"loss": 0.3975,
"step": 5340
},
{
"epoch": 2.280349903989759,
"grad_norm": 4.143852710723877,
"learning_rate": 5.335018963337549e-06,
"loss": 0.3485,
"step": 5345
},
{
"epoch": 2.282483464902923,
"grad_norm": 4.994611740112305,
"learning_rate": 5.319216182048041e-06,
"loss": 0.3857,
"step": 5350
},
{
"epoch": 2.284617025816087,
"grad_norm": 4.080162525177002,
"learning_rate": 5.3034134007585335e-06,
"loss": 0.3158,
"step": 5355
},
{
"epoch": 2.286750586729251,
"grad_norm": 4.270036220550537,
"learning_rate": 5.287610619469027e-06,
"loss": 0.4476,
"step": 5360
},
{
"epoch": 2.288884147642415,
"grad_norm": 4.030431270599365,
"learning_rate": 5.27180783817952e-06,
"loss": 0.4784,
"step": 5365
},
{
"epoch": 2.291017708555579,
"grad_norm": 4.152432441711426,
"learning_rate": 5.256005056890014e-06,
"loss": 0.4499,
"step": 5370
},
{
"epoch": 2.2931512694687433,
"grad_norm": 4.297497749328613,
"learning_rate": 5.240202275600507e-06,
"loss": 0.3852,
"step": 5375
},
{
"epoch": 2.2952848303819073,
"grad_norm": 4.033621311187744,
"learning_rate": 5.224399494310999e-06,
"loss": 0.4927,
"step": 5380
},
{
"epoch": 2.2974183912950714,
"grad_norm": 4.530407428741455,
"learning_rate": 5.2085967130214924e-06,
"loss": 0.5091,
"step": 5385
},
{
"epoch": 2.2995519522082355,
"grad_norm": 4.24984073638916,
"learning_rate": 5.192793931731985e-06,
"loss": 0.4201,
"step": 5390
},
{
"epoch": 2.3016855131213996,
"grad_norm": 4.040515899658203,
"learning_rate": 5.176991150442478e-06,
"loss": 0.406,
"step": 5395
},
{
"epoch": 2.3038190740345637,
"grad_norm": 4.016532897949219,
"learning_rate": 5.161188369152972e-06,
"loss": 0.3989,
"step": 5400
},
{
"epoch": 2.3059526349477277,
"grad_norm": 4.394332408905029,
"learning_rate": 5.145385587863464e-06,
"loss": 0.4134,
"step": 5405
},
{
"epoch": 2.308086195860892,
"grad_norm": 4.987480640411377,
"learning_rate": 5.129582806573958e-06,
"loss": 0.4595,
"step": 5410
},
{
"epoch": 2.310219756774056,
"grad_norm": 4.412329196929932,
"learning_rate": 5.1137800252844505e-06,
"loss": 0.4157,
"step": 5415
},
{
"epoch": 2.31235331768722,
"grad_norm": 4.324588775634766,
"learning_rate": 5.097977243994943e-06,
"loss": 0.4703,
"step": 5420
},
{
"epoch": 2.314486878600384,
"grad_norm": 4.05607795715332,
"learning_rate": 5.082174462705437e-06,
"loss": 0.4543,
"step": 5425
},
{
"epoch": 2.316620439513548,
"grad_norm": 4.508897304534912,
"learning_rate": 5.066371681415929e-06,
"loss": 0.4207,
"step": 5430
},
{
"epoch": 2.318754000426712,
"grad_norm": 4.336424350738525,
"learning_rate": 5.050568900126422e-06,
"loss": 0.4049,
"step": 5435
},
{
"epoch": 2.3208875613398763,
"grad_norm": 4.038027286529541,
"learning_rate": 5.034766118836916e-06,
"loss": 0.4193,
"step": 5440
},
{
"epoch": 2.3230211222530404,
"grad_norm": 3.8997299671173096,
"learning_rate": 5.018963337547409e-06,
"loss": 0.3917,
"step": 5445
},
{
"epoch": 2.3251546831662044,
"grad_norm": 3.959617853164673,
"learning_rate": 5.003160556257902e-06,
"loss": 0.4742,
"step": 5450
},
{
"epoch": 2.3272882440793685,
"grad_norm": 4.371520042419434,
"learning_rate": 4.987357774968395e-06,
"loss": 0.3965,
"step": 5455
},
{
"epoch": 2.3294218049925326,
"grad_norm": 4.2197394371032715,
"learning_rate": 4.971554993678888e-06,
"loss": 0.3876,
"step": 5460
},
{
"epoch": 2.3315553659056967,
"grad_norm": 3.5643868446350098,
"learning_rate": 4.955752212389381e-06,
"loss": 0.4568,
"step": 5465
},
{
"epoch": 2.3336889268188608,
"grad_norm": 4.49404764175415,
"learning_rate": 4.939949431099874e-06,
"loss": 0.418,
"step": 5470
},
{
"epoch": 2.335822487732025,
"grad_norm": 3.6703479290008545,
"learning_rate": 4.924146649810367e-06,
"loss": 0.3959,
"step": 5475
},
{
"epoch": 2.337956048645189,
"grad_norm": 5.356283664703369,
"learning_rate": 4.9083438685208604e-06,
"loss": 0.4605,
"step": 5480
},
{
"epoch": 2.340089609558353,
"grad_norm": 4.413150310516357,
"learning_rate": 4.892541087231353e-06,
"loss": 0.3838,
"step": 5485
},
{
"epoch": 2.342223170471517,
"grad_norm": 5.07502555847168,
"learning_rate": 4.876738305941846e-06,
"loss": 0.4498,
"step": 5490
},
{
"epoch": 2.344356731384681,
"grad_norm": 4.0401458740234375,
"learning_rate": 4.860935524652339e-06,
"loss": 0.4091,
"step": 5495
},
{
"epoch": 2.3464902922978452,
"grad_norm": 3.7641711235046387,
"learning_rate": 4.845132743362832e-06,
"loss": 0.3803,
"step": 5500
},
{
"epoch": 2.3486238532110093,
"grad_norm": 4.7735595703125,
"learning_rate": 4.829329962073326e-06,
"loss": 0.3776,
"step": 5505
},
{
"epoch": 2.3507574141241734,
"grad_norm": 3.3556971549987793,
"learning_rate": 4.8135271807838185e-06,
"loss": 0.4464,
"step": 5510
},
{
"epoch": 2.3528909750373375,
"grad_norm": 3.9931693077087402,
"learning_rate": 4.797724399494311e-06,
"loss": 0.461,
"step": 5515
},
{
"epoch": 2.3550245359505015,
"grad_norm": 4.047415733337402,
"learning_rate": 4.781921618204804e-06,
"loss": 0.4251,
"step": 5520
},
{
"epoch": 2.3571580968636656,
"grad_norm": 3.7913167476654053,
"learning_rate": 4.766118836915298e-06,
"loss": 0.3529,
"step": 5525
},
{
"epoch": 2.3592916577768297,
"grad_norm": 3.9271814823150635,
"learning_rate": 4.750316055625791e-06,
"loss": 0.3897,
"step": 5530
},
{
"epoch": 2.3614252186899938,
"grad_norm": 3.8952436447143555,
"learning_rate": 4.734513274336284e-06,
"loss": 0.3916,
"step": 5535
},
{
"epoch": 2.363558779603158,
"grad_norm": 4.268993854522705,
"learning_rate": 4.718710493046777e-06,
"loss": 0.3637,
"step": 5540
},
{
"epoch": 2.365692340516322,
"grad_norm": 4.283237934112549,
"learning_rate": 4.7029077117572695e-06,
"loss": 0.387,
"step": 5545
},
{
"epoch": 2.3678259014294856,
"grad_norm": 4.506287097930908,
"learning_rate": 4.687104930467763e-06,
"loss": 0.3847,
"step": 5550
},
{
"epoch": 2.36995946234265,
"grad_norm": 4.476803779602051,
"learning_rate": 4.671302149178255e-06,
"loss": 0.4186,
"step": 5555
},
{
"epoch": 2.3720930232558137,
"grad_norm": 4.122984409332275,
"learning_rate": 4.655499367888749e-06,
"loss": 0.4446,
"step": 5560
},
{
"epoch": 2.3742265841689782,
"grad_norm": 4.000293731689453,
"learning_rate": 4.639696586599242e-06,
"loss": 0.3906,
"step": 5565
},
{
"epoch": 2.376360145082142,
"grad_norm": 3.5631563663482666,
"learning_rate": 4.623893805309735e-06,
"loss": 0.3741,
"step": 5570
},
{
"epoch": 2.3784937059953064,
"grad_norm": 3.814074754714966,
"learning_rate": 4.6080910240202284e-06,
"loss": 0.385,
"step": 5575
},
{
"epoch": 2.38062726690847,
"grad_norm": 3.454033374786377,
"learning_rate": 4.5922882427307205e-06,
"loss": 0.3454,
"step": 5580
},
{
"epoch": 2.382760827821634,
"grad_norm": 4.314971446990967,
"learning_rate": 4.576485461441214e-06,
"loss": 0.4328,
"step": 5585
},
{
"epoch": 2.384894388734798,
"grad_norm": 4.24696683883667,
"learning_rate": 4.560682680151707e-06,
"loss": 0.3027,
"step": 5590
},
{
"epoch": 2.3870279496479623,
"grad_norm": 3.906006336212158,
"learning_rate": 4.5448798988622e-06,
"loss": 0.3366,
"step": 5595
},
{
"epoch": 2.3891615105611264,
"grad_norm": 4.338901996612549,
"learning_rate": 4.529077117572694e-06,
"loss": 0.4076,
"step": 5600
},
{
"epoch": 2.3912950714742904,
"grad_norm": 3.6618947982788086,
"learning_rate": 4.513274336283186e-06,
"loss": 0.3898,
"step": 5605
},
{
"epoch": 2.3934286323874545,
"grad_norm": 4.170609474182129,
"learning_rate": 4.497471554993679e-06,
"loss": 0.3639,
"step": 5610
},
{
"epoch": 2.3955621933006186,
"grad_norm": 4.2701287269592285,
"learning_rate": 4.481668773704172e-06,
"loss": 0.4414,
"step": 5615
},
{
"epoch": 2.3976957542137827,
"grad_norm": 3.552507162094116,
"learning_rate": 4.465865992414665e-06,
"loss": 0.3376,
"step": 5620
},
{
"epoch": 2.3998293151269467,
"grad_norm": 3.7421629428863525,
"learning_rate": 4.450063211125159e-06,
"loss": 0.3839,
"step": 5625
},
{
"epoch": 2.401962876040111,
"grad_norm": 3.554124593734741,
"learning_rate": 4.434260429835651e-06,
"loss": 0.3679,
"step": 5630
},
{
"epoch": 2.404096436953275,
"grad_norm": 4.3994140625,
"learning_rate": 4.418457648546145e-06,
"loss": 0.3895,
"step": 5635
},
{
"epoch": 2.406229997866439,
"grad_norm": 3.282832622528076,
"learning_rate": 4.4026548672566375e-06,
"loss": 0.3644,
"step": 5640
},
{
"epoch": 2.408363558779603,
"grad_norm": 4.626904010772705,
"learning_rate": 4.38685208596713e-06,
"loss": 0.4035,
"step": 5645
},
{
"epoch": 2.410497119692767,
"grad_norm": 4.141104698181152,
"learning_rate": 4.371049304677623e-06,
"loss": 0.5727,
"step": 5650
},
{
"epoch": 2.412630680605931,
"grad_norm": 3.8444101810455322,
"learning_rate": 4.355246523388117e-06,
"loss": 0.4091,
"step": 5655
},
{
"epoch": 2.4147642415190953,
"grad_norm": 3.6316781044006348,
"learning_rate": 4.33944374209861e-06,
"loss": 0.4028,
"step": 5660
},
{
"epoch": 2.4168978024322594,
"grad_norm": 4.120214462280273,
"learning_rate": 4.323640960809103e-06,
"loss": 0.4611,
"step": 5665
},
{
"epoch": 2.4190313633454235,
"grad_norm": 4.207626819610596,
"learning_rate": 4.307838179519596e-06,
"loss": 0.4054,
"step": 5670
},
{
"epoch": 2.4211649242585875,
"grad_norm": 4.325310707092285,
"learning_rate": 4.2920353982300885e-06,
"loss": 0.489,
"step": 5675
},
{
"epoch": 2.4232984851717516,
"grad_norm": 3.8266632556915283,
"learning_rate": 4.276232616940582e-06,
"loss": 0.3478,
"step": 5680
},
{
"epoch": 2.4254320460849157,
"grad_norm": 4.00970983505249,
"learning_rate": 4.260429835651075e-06,
"loss": 0.3713,
"step": 5685
},
{
"epoch": 2.4275656069980798,
"grad_norm": 3.556880235671997,
"learning_rate": 4.244627054361568e-06,
"loss": 0.4119,
"step": 5690
},
{
"epoch": 2.429699167911244,
"grad_norm": 4.504357814788818,
"learning_rate": 4.228824273072061e-06,
"loss": 0.4012,
"step": 5695
},
{
"epoch": 2.431832728824408,
"grad_norm": 4.244117736816406,
"learning_rate": 4.213021491782554e-06,
"loss": 0.3511,
"step": 5700
},
{
"epoch": 2.433966289737572,
"grad_norm": 4.272021770477295,
"learning_rate": 4.197218710493047e-06,
"loss": 0.466,
"step": 5705
},
{
"epoch": 2.436099850650736,
"grad_norm": 3.49529767036438,
"learning_rate": 4.18141592920354e-06,
"loss": 0.4053,
"step": 5710
},
{
"epoch": 2.4382334115639,
"grad_norm": 3.5074594020843506,
"learning_rate": 4.165613147914033e-06,
"loss": 0.3865,
"step": 5715
},
{
"epoch": 2.4403669724770642,
"grad_norm": 4.044558525085449,
"learning_rate": 4.149810366624527e-06,
"loss": 0.3319,
"step": 5720
},
{
"epoch": 2.4425005333902283,
"grad_norm": 4.200631618499756,
"learning_rate": 4.134007585335019e-06,
"loss": 0.3993,
"step": 5725
},
{
"epoch": 2.4446340943033924,
"grad_norm": 4.75645112991333,
"learning_rate": 4.118204804045513e-06,
"loss": 0.4318,
"step": 5730
},
{
"epoch": 2.4467676552165565,
"grad_norm": 4.220727920532227,
"learning_rate": 4.1024020227560055e-06,
"loss": 0.4326,
"step": 5735
},
{
"epoch": 2.4489012161297206,
"grad_norm": 3.325622797012329,
"learning_rate": 4.086599241466498e-06,
"loss": 0.3337,
"step": 5740
},
{
"epoch": 2.4510347770428846,
"grad_norm": 3.2215867042541504,
"learning_rate": 4.070796460176992e-06,
"loss": 0.3598,
"step": 5745
},
{
"epoch": 2.4531683379560487,
"grad_norm": 3.8645408153533936,
"learning_rate": 4.054993678887484e-06,
"loss": 0.3828,
"step": 5750
},
{
"epoch": 2.455301898869213,
"grad_norm": 3.710660696029663,
"learning_rate": 4.039190897597978e-06,
"loss": 0.3431,
"step": 5755
},
{
"epoch": 2.457435459782377,
"grad_norm": 4.17116641998291,
"learning_rate": 4.023388116308471e-06,
"loss": 0.4396,
"step": 5760
},
{
"epoch": 2.459569020695541,
"grad_norm": 3.2954678535461426,
"learning_rate": 4.007585335018964e-06,
"loss": 0.3572,
"step": 5765
},
{
"epoch": 2.461702581608705,
"grad_norm": 3.6551170349121094,
"learning_rate": 3.9917825537294565e-06,
"loss": 0.3988,
"step": 5770
},
{
"epoch": 2.463836142521869,
"grad_norm": 5.171153545379639,
"learning_rate": 3.975979772439949e-06,
"loss": 0.4699,
"step": 5775
},
{
"epoch": 2.465969703435033,
"grad_norm": 4.080657482147217,
"learning_rate": 3.960176991150443e-06,
"loss": 0.4798,
"step": 5780
},
{
"epoch": 2.4681032643481973,
"grad_norm": 3.91977596282959,
"learning_rate": 3.944374209860936e-06,
"loss": 0.4607,
"step": 5785
},
{
"epoch": 2.4702368252613613,
"grad_norm": 3.9406278133392334,
"learning_rate": 3.928571428571429e-06,
"loss": 0.3383,
"step": 5790
},
{
"epoch": 2.4723703861745254,
"grad_norm": 3.8521249294281006,
"learning_rate": 3.912768647281922e-06,
"loss": 0.3431,
"step": 5795
},
{
"epoch": 2.4745039470876895,
"grad_norm": 4.789560794830322,
"learning_rate": 3.896965865992415e-06,
"loss": 0.4159,
"step": 5800
},
{
"epoch": 2.4766375080008536,
"grad_norm": 4.372644901275635,
"learning_rate": 3.881163084702908e-06,
"loss": 0.5409,
"step": 5805
},
{
"epoch": 2.4787710689140177,
"grad_norm": 4.152048587799072,
"learning_rate": 3.865360303413401e-06,
"loss": 0.4068,
"step": 5810
},
{
"epoch": 2.4809046298271817,
"grad_norm": 4.586190223693848,
"learning_rate": 3.849557522123894e-06,
"loss": 0.4264,
"step": 5815
},
{
"epoch": 2.483038190740346,
"grad_norm": 4.218143463134766,
"learning_rate": 3.833754740834387e-06,
"loss": 0.4537,
"step": 5820
},
{
"epoch": 2.48517175165351,
"grad_norm": 4.002030372619629,
"learning_rate": 3.81795195954488e-06,
"loss": 0.3723,
"step": 5825
},
{
"epoch": 2.487305312566674,
"grad_norm": 4.468664646148682,
"learning_rate": 3.802149178255373e-06,
"loss": 0.3822,
"step": 5830
},
{
"epoch": 2.4894388734798376,
"grad_norm": 3.4531240463256836,
"learning_rate": 3.7863463969658664e-06,
"loss": 0.3885,
"step": 5835
},
{
"epoch": 2.491572434393002,
"grad_norm": 4.368101596832275,
"learning_rate": 3.7705436156763593e-06,
"loss": 0.3656,
"step": 5840
},
{
"epoch": 2.4937059953061658,
"grad_norm": 4.486471176147461,
"learning_rate": 3.7547408343868526e-06,
"loss": 0.4383,
"step": 5845
},
{
"epoch": 2.4958395562193303,
"grad_norm": 3.639932632446289,
"learning_rate": 3.738938053097346e-06,
"loss": 0.3617,
"step": 5850
},
{
"epoch": 2.497973117132494,
"grad_norm": 4.590126991271973,
"learning_rate": 3.7231352718078383e-06,
"loss": 0.3459,
"step": 5855
},
{
"epoch": 2.5001066780456584,
"grad_norm": 4.252352237701416,
"learning_rate": 3.7073324905183316e-06,
"loss": 0.4128,
"step": 5860
},
{
"epoch": 2.502240238958822,
"grad_norm": 5.0256805419921875,
"learning_rate": 3.6915297092288245e-06,
"loss": 0.347,
"step": 5865
},
{
"epoch": 2.5043737998719866,
"grad_norm": 4.06841516494751,
"learning_rate": 3.675726927939318e-06,
"loss": 0.414,
"step": 5870
},
{
"epoch": 2.5065073607851502,
"grad_norm": 4.037867069244385,
"learning_rate": 3.659924146649811e-06,
"loss": 0.3732,
"step": 5875
},
{
"epoch": 2.5086409216983148,
"grad_norm": 3.40893292427063,
"learning_rate": 3.6441213653603035e-06,
"loss": 0.3981,
"step": 5880
},
{
"epoch": 2.5107744826114784,
"grad_norm": 4.29473876953125,
"learning_rate": 3.628318584070797e-06,
"loss": 0.3326,
"step": 5885
},
{
"epoch": 2.5129080435246425,
"grad_norm": 4.168461799621582,
"learning_rate": 3.6125158027812897e-06,
"loss": 0.4446,
"step": 5890
},
{
"epoch": 2.5150416044378066,
"grad_norm": 4.106093883514404,
"learning_rate": 3.596713021491783e-06,
"loss": 0.3966,
"step": 5895
},
{
"epoch": 2.5171751653509706,
"grad_norm": 4.701243877410889,
"learning_rate": 3.580910240202276e-06,
"loss": 0.3961,
"step": 5900
},
{
"epoch": 2.5193087262641347,
"grad_norm": 3.523848295211792,
"learning_rate": 3.5651074589127688e-06,
"loss": 0.3975,
"step": 5905
},
{
"epoch": 2.521442287177299,
"grad_norm": 4.162672519683838,
"learning_rate": 3.549304677623262e-06,
"loss": 0.4075,
"step": 5910
},
{
"epoch": 2.523575848090463,
"grad_norm": 4.189688205718994,
"learning_rate": 3.5335018963337554e-06,
"loss": 0.3733,
"step": 5915
},
{
"epoch": 2.525709409003627,
"grad_norm": 3.7579400539398193,
"learning_rate": 3.517699115044248e-06,
"loss": 0.4036,
"step": 5920
},
{
"epoch": 2.527842969916791,
"grad_norm": 4.20217227935791,
"learning_rate": 3.501896333754741e-06,
"loss": 0.3915,
"step": 5925
},
{
"epoch": 2.529976530829955,
"grad_norm": 4.595426082611084,
"learning_rate": 3.486093552465234e-06,
"loss": 0.4529,
"step": 5930
},
{
"epoch": 2.532110091743119,
"grad_norm": 3.9579648971557617,
"learning_rate": 3.4702907711757273e-06,
"loss": 0.3848,
"step": 5935
},
{
"epoch": 2.5342436526562833,
"grad_norm": 4.378324508666992,
"learning_rate": 3.4544879898862206e-06,
"loss": 0.4371,
"step": 5940
},
{
"epoch": 2.5363772135694473,
"grad_norm": 4.3715996742248535,
"learning_rate": 3.438685208596713e-06,
"loss": 0.374,
"step": 5945
},
{
"epoch": 2.5385107744826114,
"grad_norm": 4.015042304992676,
"learning_rate": 3.4228824273072063e-06,
"loss": 0.4176,
"step": 5950
},
{
"epoch": 2.5406443353957755,
"grad_norm": 3.8862552642822266,
"learning_rate": 3.407079646017699e-06,
"loss": 0.3825,
"step": 5955
},
{
"epoch": 2.5427778963089396,
"grad_norm": 4.243908882141113,
"learning_rate": 3.3912768647281925e-06,
"loss": 0.3904,
"step": 5960
},
{
"epoch": 2.5449114572221037,
"grad_norm": 3.6207668781280518,
"learning_rate": 3.375474083438686e-06,
"loss": 0.3464,
"step": 5965
},
{
"epoch": 2.5470450181352677,
"grad_norm": 4.175275802612305,
"learning_rate": 3.3596713021491783e-06,
"loss": 0.3689,
"step": 5970
},
{
"epoch": 2.549178579048432,
"grad_norm": 5.810354709625244,
"learning_rate": 3.3438685208596715e-06,
"loss": 0.4211,
"step": 5975
},
{
"epoch": 2.551312139961596,
"grad_norm": 3.521697998046875,
"learning_rate": 3.328065739570165e-06,
"loss": 0.3731,
"step": 5980
},
{
"epoch": 2.55344570087476,
"grad_norm": 4.137831211090088,
"learning_rate": 3.3122629582806577e-06,
"loss": 0.3762,
"step": 5985
},
{
"epoch": 2.555579261787924,
"grad_norm": 3.7665231227874756,
"learning_rate": 3.296460176991151e-06,
"loss": 0.3827,
"step": 5990
},
{
"epoch": 2.557712822701088,
"grad_norm": 3.6081109046936035,
"learning_rate": 3.2806573957016435e-06,
"loss": 0.3539,
"step": 5995
},
{
"epoch": 2.559846383614252,
"grad_norm": 3.835299491882324,
"learning_rate": 3.2648546144121368e-06,
"loss": 0.4552,
"step": 6000
},
{
"epoch": 2.5619799445274163,
"grad_norm": 4.092353343963623,
"learning_rate": 3.24905183312263e-06,
"loss": 0.3902,
"step": 6005
},
{
"epoch": 2.5641135054405804,
"grad_norm": 3.8998003005981445,
"learning_rate": 3.233249051833123e-06,
"loss": 0.3735,
"step": 6010
},
{
"epoch": 2.5662470663537444,
"grad_norm": 3.8976283073425293,
"learning_rate": 3.217446270543616e-06,
"loss": 0.4999,
"step": 6015
},
{
"epoch": 2.5683806272669085,
"grad_norm": 3.637864351272583,
"learning_rate": 3.2016434892541087e-06,
"loss": 0.3277,
"step": 6020
},
{
"epoch": 2.5705141881800726,
"grad_norm": 4.05525016784668,
"learning_rate": 3.185840707964602e-06,
"loss": 0.3806,
"step": 6025
},
{
"epoch": 2.5726477490932367,
"grad_norm": 3.850229501724243,
"learning_rate": 3.1700379266750953e-06,
"loss": 0.423,
"step": 6030
},
{
"epoch": 2.5747813100064008,
"grad_norm": 4.023748874664307,
"learning_rate": 3.1542351453855877e-06,
"loss": 0.4111,
"step": 6035
},
{
"epoch": 2.576914870919565,
"grad_norm": 4.824608325958252,
"learning_rate": 3.138432364096081e-06,
"loss": 0.3823,
"step": 6040
},
{
"epoch": 2.579048431832729,
"grad_norm": 4.730124473571777,
"learning_rate": 3.1226295828065743e-06,
"loss": 0.4564,
"step": 6045
},
{
"epoch": 2.581181992745893,
"grad_norm": 3.960134506225586,
"learning_rate": 3.106826801517067e-06,
"loss": 0.4409,
"step": 6050
},
{
"epoch": 2.583315553659057,
"grad_norm": 4.118432521820068,
"learning_rate": 3.0910240202275605e-06,
"loss": 0.4298,
"step": 6055
},
{
"epoch": 2.585449114572221,
"grad_norm": 4.568760395050049,
"learning_rate": 3.075221238938053e-06,
"loss": 0.4348,
"step": 6060
},
{
"epoch": 2.5875826754853852,
"grad_norm": 4.839498996734619,
"learning_rate": 3.0594184576485463e-06,
"loss": 0.3997,
"step": 6065
},
{
"epoch": 2.5897162363985493,
"grad_norm": 4.513376235961914,
"learning_rate": 3.0436156763590396e-06,
"loss": 0.4664,
"step": 6070
},
{
"epoch": 2.5918497973117134,
"grad_norm": 4.747183799743652,
"learning_rate": 3.0278128950695324e-06,
"loss": 0.4231,
"step": 6075
},
{
"epoch": 2.5939833582248775,
"grad_norm": 4.03349494934082,
"learning_rate": 3.0120101137800257e-06,
"loss": 0.3717,
"step": 6080
},
{
"epoch": 2.5961169191380415,
"grad_norm": 3.8517165184020996,
"learning_rate": 2.996207332490518e-06,
"loss": 0.4009,
"step": 6085
},
{
"epoch": 2.5982504800512056,
"grad_norm": 4.061285018920898,
"learning_rate": 2.9804045512010115e-06,
"loss": 0.3993,
"step": 6090
},
{
"epoch": 2.6003840409643697,
"grad_norm": 3.6961746215820312,
"learning_rate": 2.9646017699115048e-06,
"loss": 0.3631,
"step": 6095
},
{
"epoch": 2.6025176018775333,
"grad_norm": 4.444972515106201,
"learning_rate": 2.9487989886219977e-06,
"loss": 0.405,
"step": 6100
},
{
"epoch": 2.604651162790698,
"grad_norm": 3.410012722015381,
"learning_rate": 2.932996207332491e-06,
"loss": 0.3838,
"step": 6105
},
{
"epoch": 2.6067847237038615,
"grad_norm": 4.757806301116943,
"learning_rate": 2.9171934260429842e-06,
"loss": 0.4622,
"step": 6110
},
{
"epoch": 2.608918284617026,
"grad_norm": 6.638218402862549,
"learning_rate": 2.9013906447534767e-06,
"loss": 0.5596,
"step": 6115
},
{
"epoch": 2.6110518455301897,
"grad_norm": 3.4803130626678467,
"learning_rate": 2.88558786346397e-06,
"loss": 0.4108,
"step": 6120
},
{
"epoch": 2.613185406443354,
"grad_norm": 4.698441028594971,
"learning_rate": 2.869785082174463e-06,
"loss": 0.4462,
"step": 6125
},
{
"epoch": 2.615318967356518,
"grad_norm": 3.7789926528930664,
"learning_rate": 2.853982300884956e-06,
"loss": 0.3573,
"step": 6130
},
{
"epoch": 2.6174525282696823,
"grad_norm": 4.063235282897949,
"learning_rate": 2.838179519595449e-06,
"loss": 0.4009,
"step": 6135
},
{
"epoch": 2.619586089182846,
"grad_norm": 3.718618869781494,
"learning_rate": 2.822376738305942e-06,
"loss": 0.4118,
"step": 6140
},
{
"epoch": 2.6217196500960105,
"grad_norm": 4.963037014007568,
"learning_rate": 2.8065739570164352e-06,
"loss": 0.3667,
"step": 6145
},
{
"epoch": 2.623853211009174,
"grad_norm": 3.7930514812469482,
"learning_rate": 2.790771175726928e-06,
"loss": 0.3552,
"step": 6150
},
{
"epoch": 2.6259867719223386,
"grad_norm": 3.7048425674438477,
"learning_rate": 2.774968394437421e-06,
"loss": 0.4853,
"step": 6155
},
{
"epoch": 2.6281203328355023,
"grad_norm": 4.370087623596191,
"learning_rate": 2.7591656131479143e-06,
"loss": 0.4286,
"step": 6160
},
{
"epoch": 2.6302538937486664,
"grad_norm": 3.973276376724243,
"learning_rate": 2.743362831858407e-06,
"loss": 0.4329,
"step": 6165
},
{
"epoch": 2.6323874546618304,
"grad_norm": 4.450080871582031,
"learning_rate": 2.7275600505689004e-06,
"loss": 0.375,
"step": 6170
},
{
"epoch": 2.6345210155749945,
"grad_norm": 4.750911712646484,
"learning_rate": 2.7117572692793937e-06,
"loss": 0.4039,
"step": 6175
},
{
"epoch": 2.6366545764881586,
"grad_norm": 3.761721611022949,
"learning_rate": 2.695954487989886e-06,
"loss": 0.4411,
"step": 6180
},
{
"epoch": 2.6387881374013227,
"grad_norm": 3.7865817546844482,
"learning_rate": 2.6801517067003795e-06,
"loss": 0.3401,
"step": 6185
},
{
"epoch": 2.6409216983144868,
"grad_norm": 5.113328456878662,
"learning_rate": 2.6643489254108724e-06,
"loss": 0.4211,
"step": 6190
},
{
"epoch": 2.643055259227651,
"grad_norm": 4.039052486419678,
"learning_rate": 2.6485461441213657e-06,
"loss": 0.4059,
"step": 6195
},
{
"epoch": 2.645188820140815,
"grad_norm": 3.8695180416107178,
"learning_rate": 2.632743362831859e-06,
"loss": 0.3441,
"step": 6200
},
{
"epoch": 2.647322381053979,
"grad_norm": 3.8771300315856934,
"learning_rate": 2.6169405815423514e-06,
"loss": 0.3342,
"step": 6205
},
{
"epoch": 2.649455941967143,
"grad_norm": 3.479433298110962,
"learning_rate": 2.6011378002528447e-06,
"loss": 0.3825,
"step": 6210
},
{
"epoch": 2.651589502880307,
"grad_norm": 4.334216594696045,
"learning_rate": 2.5853350189633376e-06,
"loss": 0.4421,
"step": 6215
},
{
"epoch": 2.6537230637934712,
"grad_norm": 3.76131272315979,
"learning_rate": 2.569532237673831e-06,
"loss": 0.3765,
"step": 6220
},
{
"epoch": 2.6558566247066353,
"grad_norm": 3.95220685005188,
"learning_rate": 2.553729456384324e-06,
"loss": 0.3601,
"step": 6225
},
{
"epoch": 2.6579901856197994,
"grad_norm": 4.079404354095459,
"learning_rate": 2.5379266750948166e-06,
"loss": 0.397,
"step": 6230
},
{
"epoch": 2.6601237465329635,
"grad_norm": 4.499898433685303,
"learning_rate": 2.52212389380531e-06,
"loss": 0.4327,
"step": 6235
},
{
"epoch": 2.6622573074461275,
"grad_norm": 3.1831462383270264,
"learning_rate": 2.5063211125158032e-06,
"loss": 0.3815,
"step": 6240
},
{
"epoch": 2.6643908683592916,
"grad_norm": 4.471576690673828,
"learning_rate": 2.490518331226296e-06,
"loss": 0.4483,
"step": 6245
},
{
"epoch": 2.6665244292724557,
"grad_norm": 4.825654983520508,
"learning_rate": 2.4747155499367894e-06,
"loss": 0.3829,
"step": 6250
},
{
"epoch": 2.6686579901856198,
"grad_norm": 4.086824893951416,
"learning_rate": 2.4589127686472823e-06,
"loss": 0.4072,
"step": 6255
},
{
"epoch": 2.670791551098784,
"grad_norm": 3.817394495010376,
"learning_rate": 2.443109987357775e-06,
"loss": 0.3995,
"step": 6260
},
{
"epoch": 2.672925112011948,
"grad_norm": 3.841449022293091,
"learning_rate": 2.427307206068268e-06,
"loss": 0.3702,
"step": 6265
},
{
"epoch": 2.675058672925112,
"grad_norm": 4.572795867919922,
"learning_rate": 2.4115044247787613e-06,
"loss": 0.3905,
"step": 6270
},
{
"epoch": 2.677192233838276,
"grad_norm": 4.133564472198486,
"learning_rate": 2.395701643489254e-06,
"loss": 0.4461,
"step": 6275
},
{
"epoch": 2.67932579475144,
"grad_norm": 4.098212242126465,
"learning_rate": 2.3798988621997475e-06,
"loss": 0.3363,
"step": 6280
},
{
"epoch": 2.6814593556646042,
"grad_norm": 4.384210586547852,
"learning_rate": 2.3640960809102404e-06,
"loss": 0.3668,
"step": 6285
},
{
"epoch": 2.6835929165777683,
"grad_norm": 3.9941020011901855,
"learning_rate": 2.3482932996207332e-06,
"loss": 0.3367,
"step": 6290
},
{
"epoch": 2.6857264774909324,
"grad_norm": 4.1736650466918945,
"learning_rate": 2.3324905183312265e-06,
"loss": 0.3845,
"step": 6295
},
{
"epoch": 2.6878600384040965,
"grad_norm": 2.97011137008667,
"learning_rate": 2.3166877370417194e-06,
"loss": 0.3561,
"step": 6300
},
{
"epoch": 2.6899935993172606,
"grad_norm": 4.362311840057373,
"learning_rate": 2.3008849557522127e-06,
"loss": 0.3932,
"step": 6305
},
{
"epoch": 2.6921271602304246,
"grad_norm": 3.9310810565948486,
"learning_rate": 2.2850821744627056e-06,
"loss": 0.4074,
"step": 6310
},
{
"epoch": 2.6942607211435887,
"grad_norm": 3.619035005569458,
"learning_rate": 2.269279393173199e-06,
"loss": 0.3642,
"step": 6315
},
{
"epoch": 2.696394282056753,
"grad_norm": 3.4863860607147217,
"learning_rate": 2.2534766118836918e-06,
"loss": 0.3365,
"step": 6320
},
{
"epoch": 2.698527842969917,
"grad_norm": 4.252373218536377,
"learning_rate": 2.2376738305941846e-06,
"loss": 0.3501,
"step": 6325
},
{
"epoch": 2.700661403883081,
"grad_norm": 3.3830983638763428,
"learning_rate": 2.221871049304678e-06,
"loss": 0.3824,
"step": 6330
},
{
"epoch": 2.702794964796245,
"grad_norm": 4.0998382568359375,
"learning_rate": 2.206068268015171e-06,
"loss": 0.4223,
"step": 6335
},
{
"epoch": 2.704928525709409,
"grad_norm": 3.9439263343811035,
"learning_rate": 2.190265486725664e-06,
"loss": 0.3694,
"step": 6340
},
{
"epoch": 2.707062086622573,
"grad_norm": 5.345850944519043,
"learning_rate": 2.174462705436157e-06,
"loss": 0.4587,
"step": 6345
},
{
"epoch": 2.7091956475357373,
"grad_norm": 3.7166903018951416,
"learning_rate": 2.15865992414665e-06,
"loss": 0.3643,
"step": 6350
},
{
"epoch": 2.7113292084489014,
"grad_norm": 3.7905008792877197,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.3443,
"step": 6355
},
{
"epoch": 2.7134627693620654,
"grad_norm": 4.21207857131958,
"learning_rate": 2.127054361567636e-06,
"loss": 0.3883,
"step": 6360
},
{
"epoch": 2.7155963302752295,
"grad_norm": 3.345935583114624,
"learning_rate": 2.1112515802781293e-06,
"loss": 0.3858,
"step": 6365
},
{
"epoch": 2.7177298911883936,
"grad_norm": 3.8819267749786377,
"learning_rate": 2.095448798988622e-06,
"loss": 0.3656,
"step": 6370
},
{
"epoch": 2.7198634521015577,
"grad_norm": 4.006298065185547,
"learning_rate": 2.079646017699115e-06,
"loss": 0.3921,
"step": 6375
},
{
"epoch": 2.7219970130147217,
"grad_norm": 4.349729061126709,
"learning_rate": 2.0638432364096084e-06,
"loss": 0.3966,
"step": 6380
},
{
"epoch": 2.7241305739278854,
"grad_norm": 4.517011642456055,
"learning_rate": 2.0480404551201012e-06,
"loss": 0.4512,
"step": 6385
},
{
"epoch": 2.72626413484105,
"grad_norm": 3.898801565170288,
"learning_rate": 2.0322376738305945e-06,
"loss": 0.36,
"step": 6390
},
{
"epoch": 2.7283976957542135,
"grad_norm": 3.9057013988494873,
"learning_rate": 2.0164348925410874e-06,
"loss": 0.4329,
"step": 6395
},
{
"epoch": 2.730531256667378,
"grad_norm": 3.8034722805023193,
"learning_rate": 2.0006321112515807e-06,
"loss": 0.3722,
"step": 6400
},
{
"epoch": 2.7326648175805417,
"grad_norm": 4.9013895988464355,
"learning_rate": 1.9848293299620736e-06,
"loss": 0.3646,
"step": 6405
},
{
"epoch": 2.734798378493706,
"grad_norm": 3.664039134979248,
"learning_rate": 1.9690265486725665e-06,
"loss": 0.393,
"step": 6410
},
{
"epoch": 2.73693193940687,
"grad_norm": 3.9663522243499756,
"learning_rate": 1.9532237673830593e-06,
"loss": 0.4266,
"step": 6415
},
{
"epoch": 2.7390655003200344,
"grad_norm": 3.832275152206421,
"learning_rate": 1.9374209860935526e-06,
"loss": 0.363,
"step": 6420
},
{
"epoch": 2.741199061233198,
"grad_norm": 3.2930123805999756,
"learning_rate": 1.921618204804046e-06,
"loss": 0.3979,
"step": 6425
},
{
"epoch": 2.7433326221463625,
"grad_norm": 4.9362053871154785,
"learning_rate": 1.9058154235145388e-06,
"loss": 0.379,
"step": 6430
},
{
"epoch": 2.745466183059526,
"grad_norm": 4.307873249053955,
"learning_rate": 1.8900126422250317e-06,
"loss": 0.3853,
"step": 6435
},
{
"epoch": 2.7475997439726907,
"grad_norm": 4.564358234405518,
"learning_rate": 1.8742098609355248e-06,
"loss": 0.3632,
"step": 6440
},
{
"epoch": 2.7497333048858543,
"grad_norm": 3.611410617828369,
"learning_rate": 1.8584070796460179e-06,
"loss": 0.4081,
"step": 6445
},
{
"epoch": 2.7518668657990184,
"grad_norm": 4.470289707183838,
"learning_rate": 1.842604298356511e-06,
"loss": 0.4033,
"step": 6450
},
{
"epoch": 2.7540004267121825,
"grad_norm": 3.99035906791687,
"learning_rate": 1.8268015170670038e-06,
"loss": 0.3946,
"step": 6455
},
{
"epoch": 2.7561339876253466,
"grad_norm": 4.113675117492676,
"learning_rate": 1.810998735777497e-06,
"loss": 0.3884,
"step": 6460
},
{
"epoch": 2.7582675485385106,
"grad_norm": 3.7869863510131836,
"learning_rate": 1.7951959544879902e-06,
"loss": 0.4043,
"step": 6465
},
{
"epoch": 2.7604011094516747,
"grad_norm": 3.4098997116088867,
"learning_rate": 1.779393173198483e-06,
"loss": 0.3765,
"step": 6470
},
{
"epoch": 2.762534670364839,
"grad_norm": 4.8651123046875,
"learning_rate": 1.7635903919089762e-06,
"loss": 0.4379,
"step": 6475
},
{
"epoch": 2.764668231278003,
"grad_norm": 3.7873408794403076,
"learning_rate": 1.747787610619469e-06,
"loss": 0.3688,
"step": 6480
},
{
"epoch": 2.766801792191167,
"grad_norm": 3.624573230743408,
"learning_rate": 1.7319848293299621e-06,
"loss": 0.351,
"step": 6485
},
{
"epoch": 2.768935353104331,
"grad_norm": 3.621868848800659,
"learning_rate": 1.7161820480404554e-06,
"loss": 0.3465,
"step": 6490
},
{
"epoch": 2.771068914017495,
"grad_norm": 4.348837852478027,
"learning_rate": 1.7003792667509483e-06,
"loss": 0.424,
"step": 6495
},
{
"epoch": 2.773202474930659,
"grad_norm": 4.25526237487793,
"learning_rate": 1.6845764854614414e-06,
"loss": 0.3487,
"step": 6500
},
{
"epoch": 2.7753360358438233,
"grad_norm": 4.0821757316589355,
"learning_rate": 1.6687737041719343e-06,
"loss": 0.343,
"step": 6505
},
{
"epoch": 2.7774695967569873,
"grad_norm": 3.5596158504486084,
"learning_rate": 1.6529709228824276e-06,
"loss": 0.3745,
"step": 6510
},
{
"epoch": 2.7796031576701514,
"grad_norm": 3.369270086288452,
"learning_rate": 1.6371681415929204e-06,
"loss": 0.3482,
"step": 6515
},
{
"epoch": 2.7817367185833155,
"grad_norm": 5.09242582321167,
"learning_rate": 1.6213653603034135e-06,
"loss": 0.419,
"step": 6520
},
{
"epoch": 2.7838702794964796,
"grad_norm": 4.409122943878174,
"learning_rate": 1.6055625790139064e-06,
"loss": 0.4107,
"step": 6525
},
{
"epoch": 2.7860038404096437,
"grad_norm": 3.660127878189087,
"learning_rate": 1.5897597977243997e-06,
"loss": 0.3103,
"step": 6530
},
{
"epoch": 2.7881374013228077,
"grad_norm": 4.103714942932129,
"learning_rate": 1.5739570164348928e-06,
"loss": 0.3836,
"step": 6535
},
{
"epoch": 2.790270962235972,
"grad_norm": 4.289230823516846,
"learning_rate": 1.5581542351453857e-06,
"loss": 0.3736,
"step": 6540
},
{
"epoch": 2.792404523149136,
"grad_norm": 3.898728370666504,
"learning_rate": 1.5423514538558787e-06,
"loss": 0.3926,
"step": 6545
},
{
"epoch": 2.7945380840623,
"grad_norm": 4.545414924621582,
"learning_rate": 1.5265486725663716e-06,
"loss": 0.384,
"step": 6550
},
{
"epoch": 2.796671644975464,
"grad_norm": 3.980564832687378,
"learning_rate": 1.510745891276865e-06,
"loss": 0.3862,
"step": 6555
},
{
"epoch": 2.798805205888628,
"grad_norm": 4.078009605407715,
"learning_rate": 1.494943109987358e-06,
"loss": 0.41,
"step": 6560
},
{
"epoch": 2.800938766801792,
"grad_norm": 5.77632999420166,
"learning_rate": 1.4791403286978509e-06,
"loss": 0.4223,
"step": 6565
},
{
"epoch": 2.8030723277149563,
"grad_norm": 3.2947072982788086,
"learning_rate": 1.463337547408344e-06,
"loss": 0.385,
"step": 6570
},
{
"epoch": 2.8052058886281204,
"grad_norm": 4.24403715133667,
"learning_rate": 1.447534766118837e-06,
"loss": 0.3875,
"step": 6575
},
{
"epoch": 2.8073394495412844,
"grad_norm": 4.188129425048828,
"learning_rate": 1.4317319848293301e-06,
"loss": 0.3847,
"step": 6580
},
{
"epoch": 2.8094730104544485,
"grad_norm": 4.100208282470703,
"learning_rate": 1.415929203539823e-06,
"loss": 0.4115,
"step": 6585
},
{
"epoch": 2.8116065713676126,
"grad_norm": 3.90510630607605,
"learning_rate": 1.400126422250316e-06,
"loss": 0.3578,
"step": 6590
},
{
"epoch": 2.8137401322807767,
"grad_norm": 4.556890487670898,
"learning_rate": 1.3843236409608094e-06,
"loss": 0.3868,
"step": 6595
},
{
"epoch": 2.8158736931939408,
"grad_norm": 3.50069260597229,
"learning_rate": 1.3685208596713023e-06,
"loss": 0.3581,
"step": 6600
},
{
"epoch": 2.818007254107105,
"grad_norm": 4.0225749015808105,
"learning_rate": 1.3527180783817954e-06,
"loss": 0.3467,
"step": 6605
},
{
"epoch": 2.820140815020269,
"grad_norm": 3.903599500656128,
"learning_rate": 1.3369152970922882e-06,
"loss": 0.3521,
"step": 6610
},
{
"epoch": 2.822274375933433,
"grad_norm": 4.0492634773254395,
"learning_rate": 1.3211125158027813e-06,
"loss": 0.3841,
"step": 6615
},
{
"epoch": 2.824407936846597,
"grad_norm": 4.392933368682861,
"learning_rate": 1.3053097345132746e-06,
"loss": 0.3963,
"step": 6620
},
{
"epoch": 2.826541497759761,
"grad_norm": 4.814109802246094,
"learning_rate": 1.2895069532237675e-06,
"loss": 0.4293,
"step": 6625
},
{
"epoch": 2.8286750586729252,
"grad_norm": 4.4884724617004395,
"learning_rate": 1.2737041719342606e-06,
"loss": 0.4127,
"step": 6630
},
{
"epoch": 2.8308086195860893,
"grad_norm": 4.531534671783447,
"learning_rate": 1.2579013906447535e-06,
"loss": 0.3977,
"step": 6635
},
{
"epoch": 2.8329421804992534,
"grad_norm": 4.81414270401001,
"learning_rate": 1.2420986093552465e-06,
"loss": 0.3416,
"step": 6640
},
{
"epoch": 2.8350757414124175,
"grad_norm": 4.427682399749756,
"learning_rate": 1.2262958280657396e-06,
"loss": 0.4633,
"step": 6645
},
{
"epoch": 2.8372093023255816,
"grad_norm": 3.9264962673187256,
"learning_rate": 1.2104930467762327e-06,
"loss": 0.3987,
"step": 6650
},
{
"epoch": 2.8393428632387456,
"grad_norm": 4.122159957885742,
"learning_rate": 1.1946902654867258e-06,
"loss": 0.3409,
"step": 6655
},
{
"epoch": 2.8414764241519093,
"grad_norm": 4.230727672576904,
"learning_rate": 1.1788874841972189e-06,
"loss": 0.4259,
"step": 6660
},
{
"epoch": 2.843609985065074,
"grad_norm": 4.188933849334717,
"learning_rate": 1.163084702907712e-06,
"loss": 0.3581,
"step": 6665
},
{
"epoch": 2.8457435459782374,
"grad_norm": 3.4019768238067627,
"learning_rate": 1.1472819216182048e-06,
"loss": 0.439,
"step": 6670
},
{
"epoch": 2.847877106891402,
"grad_norm": 4.435439109802246,
"learning_rate": 1.131479140328698e-06,
"loss": 0.4366,
"step": 6675
},
{
"epoch": 2.8500106678045656,
"grad_norm": 3.905317783355713,
"learning_rate": 1.115676359039191e-06,
"loss": 0.3568,
"step": 6680
},
{
"epoch": 2.85214422871773,
"grad_norm": 4.476743698120117,
"learning_rate": 1.0998735777496839e-06,
"loss": 0.4044,
"step": 6685
},
{
"epoch": 2.8542777896308937,
"grad_norm": 4.263827323913574,
"learning_rate": 1.0840707964601772e-06,
"loss": 0.3394,
"step": 6690
},
{
"epoch": 2.8564113505440583,
"grad_norm": 4.395534515380859,
"learning_rate": 1.06826801517067e-06,
"loss": 0.3631,
"step": 6695
},
{
"epoch": 2.858544911457222,
"grad_norm": 3.917830228805542,
"learning_rate": 1.0524652338811632e-06,
"loss": 0.3771,
"step": 6700
},
{
"epoch": 2.8606784723703864,
"grad_norm": 3.799147367477417,
"learning_rate": 1.0366624525916562e-06,
"loss": 0.3673,
"step": 6705
},
{
"epoch": 2.86281203328355,
"grad_norm": 4.744193077087402,
"learning_rate": 1.0208596713021493e-06,
"loss": 0.431,
"step": 6710
},
{
"epoch": 2.8649455941967146,
"grad_norm": 4.385441780090332,
"learning_rate": 1.0050568900126422e-06,
"loss": 0.3884,
"step": 6715
},
{
"epoch": 2.867079155109878,
"grad_norm": 3.170003890991211,
"learning_rate": 9.892541087231355e-07,
"loss": 0.3317,
"step": 6720
},
{
"epoch": 2.8692127160230427,
"grad_norm": 4.889204978942871,
"learning_rate": 9.734513274336284e-07,
"loss": 0.4444,
"step": 6725
},
{
"epoch": 2.8713462769362064,
"grad_norm": 4.576068878173828,
"learning_rate": 9.576485461441215e-07,
"loss": 0.3629,
"step": 6730
},
{
"epoch": 2.8734798378493704,
"grad_norm": 3.7488820552825928,
"learning_rate": 9.418457648546144e-07,
"loss": 0.3985,
"step": 6735
},
{
"epoch": 2.8756133987625345,
"grad_norm": 4.805878639221191,
"learning_rate": 9.260429835651074e-07,
"loss": 0.3658,
"step": 6740
},
{
"epoch": 2.8777469596756986,
"grad_norm": 4.053656101226807,
"learning_rate": 9.102402022756006e-07,
"loss": 0.4409,
"step": 6745
},
{
"epoch": 2.8798805205888627,
"grad_norm": 3.8377509117126465,
"learning_rate": 8.944374209860936e-07,
"loss": 0.3299,
"step": 6750
},
{
"epoch": 2.8820140815020268,
"grad_norm": 3.885563611984253,
"learning_rate": 8.786346396965867e-07,
"loss": 0.4275,
"step": 6755
},
{
"epoch": 2.884147642415191,
"grad_norm": 4.142803192138672,
"learning_rate": 8.628318584070797e-07,
"loss": 0.398,
"step": 6760
},
{
"epoch": 2.886281203328355,
"grad_norm": 4.158621788024902,
"learning_rate": 8.470290771175727e-07,
"loss": 0.4017,
"step": 6765
},
{
"epoch": 2.888414764241519,
"grad_norm": 3.649864912033081,
"learning_rate": 8.312262958280657e-07,
"loss": 0.4163,
"step": 6770
},
{
"epoch": 2.890548325154683,
"grad_norm": 3.5101370811462402,
"learning_rate": 8.154235145385589e-07,
"loss": 0.4025,
"step": 6775
},
{
"epoch": 2.892681886067847,
"grad_norm": 4.628073215484619,
"learning_rate": 7.996207332490519e-07,
"loss": 0.4233,
"step": 6780
},
{
"epoch": 2.8948154469810112,
"grad_norm": 4.449802875518799,
"learning_rate": 7.83817951959545e-07,
"loss": 0.4032,
"step": 6785
},
{
"epoch": 2.8969490078941753,
"grad_norm": 4.176910877227783,
"learning_rate": 7.68015170670038e-07,
"loss": 0.3724,
"step": 6790
},
{
"epoch": 2.8990825688073394,
"grad_norm": 3.777665138244629,
"learning_rate": 7.522123893805311e-07,
"loss": 0.3426,
"step": 6795
},
{
"epoch": 2.9012161297205035,
"grad_norm": 4.363525390625,
"learning_rate": 7.36409608091024e-07,
"loss": 0.4207,
"step": 6800
},
{
"epoch": 2.9033496906336675,
"grad_norm": 3.7728726863861084,
"learning_rate": 7.206068268015172e-07,
"loss": 0.394,
"step": 6805
},
{
"epoch": 2.9054832515468316,
"grad_norm": 4.496046543121338,
"learning_rate": 7.048040455120102e-07,
"loss": 0.3566,
"step": 6810
},
{
"epoch": 2.9076168124599957,
"grad_norm": 3.437410354614258,
"learning_rate": 6.890012642225032e-07,
"loss": 0.3491,
"step": 6815
},
{
"epoch": 2.90975037337316,
"grad_norm": 3.4725382328033447,
"learning_rate": 6.731984829329963e-07,
"loss": 0.4027,
"step": 6820
},
{
"epoch": 2.911883934286324,
"grad_norm": 3.788536787033081,
"learning_rate": 6.573957016434893e-07,
"loss": 0.3844,
"step": 6825
},
{
"epoch": 2.914017495199488,
"grad_norm": 3.9548215866088867,
"learning_rate": 6.415929203539823e-07,
"loss": 0.3469,
"step": 6830
},
{
"epoch": 2.916151056112652,
"grad_norm": 3.581763505935669,
"learning_rate": 6.257901390644753e-07,
"loss": 0.368,
"step": 6835
},
{
"epoch": 2.918284617025816,
"grad_norm": 3.8996713161468506,
"learning_rate": 6.099873577749684e-07,
"loss": 0.3673,
"step": 6840
},
{
"epoch": 2.92041817793898,
"grad_norm": 3.3684163093566895,
"learning_rate": 5.941845764854615e-07,
"loss": 0.3668,
"step": 6845
},
{
"epoch": 2.9225517388521443,
"grad_norm": 3.806769609451294,
"learning_rate": 5.783817951959545e-07,
"loss": 0.3892,
"step": 6850
},
{
"epoch": 2.9246852997653083,
"grad_norm": 4.019096374511719,
"learning_rate": 5.625790139064476e-07,
"loss": 0.3819,
"step": 6855
},
{
"epoch": 2.9268188606784724,
"grad_norm": 4.2586164474487305,
"learning_rate": 5.467762326169406e-07,
"loss": 0.3969,
"step": 6860
},
{
"epoch": 2.9289524215916365,
"grad_norm": 4.982457637786865,
"learning_rate": 5.309734513274336e-07,
"loss": 0.3556,
"step": 6865
},
{
"epoch": 2.9310859825048006,
"grad_norm": 3.756347179412842,
"learning_rate": 5.151706700379267e-07,
"loss": 0.3972,
"step": 6870
},
{
"epoch": 2.9332195434179646,
"grad_norm": 4.047727584838867,
"learning_rate": 4.993678887484198e-07,
"loss": 0.3881,
"step": 6875
},
{
"epoch": 2.9353531043311287,
"grad_norm": 4.2889862060546875,
"learning_rate": 4.835651074589128e-07,
"loss": 0.3569,
"step": 6880
},
{
"epoch": 2.937486665244293,
"grad_norm": 4.17496395111084,
"learning_rate": 4.6776232616940587e-07,
"loss": 0.3775,
"step": 6885
},
{
"epoch": 2.939620226157457,
"grad_norm": 4.326033115386963,
"learning_rate": 4.519595448798989e-07,
"loss": 0.3723,
"step": 6890
},
{
"epoch": 2.941753787070621,
"grad_norm": 3.82330322265625,
"learning_rate": 4.36156763590392e-07,
"loss": 0.4026,
"step": 6895
},
{
"epoch": 2.943887347983785,
"grad_norm": 3.445920944213867,
"learning_rate": 4.20353982300885e-07,
"loss": 0.3598,
"step": 6900
},
{
"epoch": 2.946020908896949,
"grad_norm": 4.0058698654174805,
"learning_rate": 4.0455120101137806e-07,
"loss": 0.396,
"step": 6905
},
{
"epoch": 2.948154469810113,
"grad_norm": 3.8937366008758545,
"learning_rate": 3.8874841972187104e-07,
"loss": 0.4212,
"step": 6910
},
{
"epoch": 2.9502880307232773,
"grad_norm": 3.048259735107422,
"learning_rate": 3.729456384323641e-07,
"loss": 0.3016,
"step": 6915
},
{
"epoch": 2.9524215916364414,
"grad_norm": 4.167364597320557,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.3691,
"step": 6920
},
{
"epoch": 2.9545551525496054,
"grad_norm": 3.733313798904419,
"learning_rate": 3.413400758533502e-07,
"loss": 0.4057,
"step": 6925
},
{
"epoch": 2.9566887134627695,
"grad_norm": 3.94075608253479,
"learning_rate": 3.255372945638433e-07,
"loss": 0.3758,
"step": 6930
},
{
"epoch": 2.9588222743759336,
"grad_norm": 4.1353535652160645,
"learning_rate": 3.097345132743363e-07,
"loss": 0.4074,
"step": 6935
},
{
"epoch": 2.9609558352890977,
"grad_norm": 4.167180061340332,
"learning_rate": 2.9393173198482934e-07,
"loss": 0.3908,
"step": 6940
},
{
"epoch": 2.9630893962022613,
"grad_norm": 3.7538034915924072,
"learning_rate": 2.7812895069532243e-07,
"loss": 0.3797,
"step": 6945
},
{
"epoch": 2.965222957115426,
"grad_norm": 3.6969258785247803,
"learning_rate": 2.6232616940581546e-07,
"loss": 0.3643,
"step": 6950
},
{
"epoch": 2.9673565180285895,
"grad_norm": 3.8094379901885986,
"learning_rate": 2.465233881163085e-07,
"loss": 0.3865,
"step": 6955
},
{
"epoch": 2.969490078941754,
"grad_norm": 4.114381790161133,
"learning_rate": 2.3072060682680153e-07,
"loss": 0.3911,
"step": 6960
},
{
"epoch": 2.9716236398549176,
"grad_norm": 3.7223260402679443,
"learning_rate": 2.1491782553729456e-07,
"loss": 0.3594,
"step": 6965
},
{
"epoch": 2.973757200768082,
"grad_norm": 3.774750232696533,
"learning_rate": 1.9911504424778762e-07,
"loss": 0.3772,
"step": 6970
},
{
"epoch": 2.9758907616812458,
"grad_norm": 3.5546209812164307,
"learning_rate": 1.8331226295828068e-07,
"loss": 0.405,
"step": 6975
},
{
"epoch": 2.9780243225944103,
"grad_norm": 4.0937180519104,
"learning_rate": 1.6750948166877372e-07,
"loss": 0.3537,
"step": 6980
},
{
"epoch": 2.980157883507574,
"grad_norm": 4.477363109588623,
"learning_rate": 1.5170670037926675e-07,
"loss": 0.3708,
"step": 6985
},
{
"epoch": 2.9822914444207385,
"grad_norm": 4.295274257659912,
"learning_rate": 1.359039190897598e-07,
"loss": 0.3649,
"step": 6990
},
{
"epoch": 2.984425005333902,
"grad_norm": 4.677340507507324,
"learning_rate": 1.2010113780025287e-07,
"loss": 0.3878,
"step": 6995
},
{
"epoch": 2.9865585662470666,
"grad_norm": 3.6075425148010254,
"learning_rate": 1.0429835651074589e-07,
"loss": 0.3452,
"step": 7000
},
{
"epoch": 2.9886921271602302,
"grad_norm": 3.9504892826080322,
"learning_rate": 8.849557522123894e-08,
"loss": 0.3655,
"step": 7005
},
{
"epoch": 2.9908256880733948,
"grad_norm": 4.34686803817749,
"learning_rate": 7.2692793931732e-08,
"loss": 0.3878,
"step": 7010
},
{
"epoch": 2.9929592489865584,
"grad_norm": 4.2292046546936035,
"learning_rate": 5.689001264222504e-08,
"loss": 0.3704,
"step": 7015
},
{
"epoch": 2.9950928098997225,
"grad_norm": 3.9971399307250977,
"learning_rate": 4.108723135271808e-08,
"loss": 0.3715,
"step": 7020
},
{
"epoch": 2.9972263708128866,
"grad_norm": 4.397487640380859,
"learning_rate": 2.528445006321113e-08,
"loss": 0.3896,
"step": 7025
},
{
"epoch": 2.9993599317260506,
"grad_norm": 3.856173276901245,
"learning_rate": 9.481668773704172e-09,
"loss": 0.4088,
"step": 7030
},
{
"epoch": 3.0,
"eval_evaluator": 0.9877204489141523,
"eval_loss": 0.1703886240720749,
"eval_runtime": 127.056,
"eval_samples_per_second": 18.0,
"eval_steps_per_second": 2.251,
"step": 7032
}
],
"logging_steps": 5,
"max_steps": 7032,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}