{ "best_global_step": 7032, "best_metric": 0.1703886240720749, "best_model_checkpoint": "runs/de_biolord/checkpoint-7032", "epoch": 3.0, "eval_steps": 500, "global_step": 7032, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002133560913164071, "grad_norm": 1407.41943359375, "learning_rate": 1.1363636363636364e-07, "loss": 260.9562, "step": 5 }, { "epoch": 0.004267121826328142, "grad_norm": 1352.476806640625, "learning_rate": 2.556818181818182e-07, "loss": 243.0724, "step": 10 }, { "epoch": 0.006400682739492213, "grad_norm": 1270.781982421875, "learning_rate": 3.9772727272727276e-07, "loss": 226.9033, "step": 15 }, { "epoch": 0.008534243652656284, "grad_norm": 1122.8021240234375, "learning_rate": 5.397727272727273e-07, "loss": 247.598, "step": 20 }, { "epoch": 0.010667804565820354, "grad_norm": 1343.4222412109375, "learning_rate": 6.818181818181818e-07, "loss": 237.7835, "step": 25 }, { "epoch": 0.012801365478984425, "grad_norm": 1176.0780029296875, "learning_rate": 8.238636363636364e-07, "loss": 218.3724, "step": 30 }, { "epoch": 0.014934926392148495, "grad_norm": 1119.2242431640625, "learning_rate": 9.65909090909091e-07, "loss": 208.53, "step": 35 }, { "epoch": 0.017068487305312567, "grad_norm": 1353.07958984375, "learning_rate": 1.1079545454545456e-06, "loss": 224.4951, "step": 40 }, { "epoch": 0.01920204821847664, "grad_norm": 1231.170166015625, "learning_rate": 1.25e-06, "loss": 207.6082, "step": 45 }, { "epoch": 0.021335609131640707, "grad_norm": 1226.5369873046875, "learning_rate": 1.3920454545454546e-06, "loss": 215.5779, "step": 50 }, { "epoch": 0.02346917004480478, "grad_norm": 1271.869873046875, "learning_rate": 1.5340909090909093e-06, "loss": 235.3491, "step": 55 }, { "epoch": 0.02560273095796885, "grad_norm": 1287.97900390625, "learning_rate": 1.6761363636363636e-06, "loss": 221.6164, "step": 60 }, { "epoch": 0.027736291871132922, "grad_norm": 1170.921630859375, "learning_rate": 1.8181818181818183e-06, "loss": 212.6889, "step": 65 }, { "epoch": 0.02986985278429699, "grad_norm": 1338.1890869140625, "learning_rate": 1.9602272727272728e-06, "loss": 216.2898, "step": 70 }, { "epoch": 0.03200341369746106, "grad_norm": 1271.138916015625, "learning_rate": 2.1022727272727277e-06, "loss": 231.7364, "step": 75 }, { "epoch": 0.034136974610625134, "grad_norm": 1132.897705078125, "learning_rate": 2.2443181818181818e-06, "loss": 206.7381, "step": 80 }, { "epoch": 0.036270535523789206, "grad_norm": 1292.3948974609375, "learning_rate": 2.3863636363636367e-06, "loss": 194.5393, "step": 85 }, { "epoch": 0.03840409643695328, "grad_norm": 1246.8048095703125, "learning_rate": 2.528409090909091e-06, "loss": 212.0394, "step": 90 }, { "epoch": 0.04053765735011734, "grad_norm": 970.118896484375, "learning_rate": 2.6704545454545457e-06, "loss": 199.709, "step": 95 }, { "epoch": 0.042671218263281414, "grad_norm": 1266.0760498046875, "learning_rate": 2.8125e-06, "loss": 208.5693, "step": 100 }, { "epoch": 0.044804779176445486, "grad_norm": 1050.2158203125, "learning_rate": 2.954545454545455e-06, "loss": 185.4214, "step": 105 }, { "epoch": 0.04693834008960956, "grad_norm": 1134.134033203125, "learning_rate": 3.096590909090909e-06, "loss": 207.6509, "step": 110 }, { "epoch": 0.04907190100277363, "grad_norm": 1127.47021484375, "learning_rate": 3.2386363636363637e-06, "loss": 207.7226, "step": 115 }, { "epoch": 0.0512054619159377, "grad_norm": 1074.081787109375, "learning_rate": 3.3806818181818186e-06, "loss": 176.0556, "step": 120 }, { "epoch": 0.05333902282910177, "grad_norm": 1120.4603271484375, "learning_rate": 3.522727272727273e-06, "loss": 194.877, "step": 125 }, { "epoch": 0.055472583742265845, "grad_norm": 1111.07080078125, "learning_rate": 3.6647727272727276e-06, "loss": 194.9979, "step": 130 }, { "epoch": 0.05760614465542991, "grad_norm": 1141.6068115234375, "learning_rate": 3.806818181818182e-06, "loss": 201.5107, "step": 135 }, { "epoch": 0.05973970556859398, "grad_norm": 1131.10986328125, "learning_rate": 3.9488636363636366e-06, "loss": 191.0996, "step": 140 }, { "epoch": 0.06187326648175805, "grad_norm": 966.1857299804688, "learning_rate": 4.0909090909090915e-06, "loss": 156.6461, "step": 145 }, { "epoch": 0.06400682739492213, "grad_norm": 1031.0262451171875, "learning_rate": 4.2329545454545455e-06, "loss": 163.2723, "step": 150 }, { "epoch": 0.06614038830808619, "grad_norm": 1065.27392578125, "learning_rate": 4.3750000000000005e-06, "loss": 160.0, "step": 155 }, { "epoch": 0.06827394922125027, "grad_norm": 1084.3116455078125, "learning_rate": 4.517045454545455e-06, "loss": 142.0651, "step": 160 }, { "epoch": 0.07040751013441433, "grad_norm": 986.8313598632812, "learning_rate": 4.6590909090909095e-06, "loss": 147.7512, "step": 165 }, { "epoch": 0.07254107104757841, "grad_norm": 877.9501342773438, "learning_rate": 4.8011363636363635e-06, "loss": 140.8079, "step": 170 }, { "epoch": 0.07467463196074248, "grad_norm": 947.1781005859375, "learning_rate": 4.9431818181818184e-06, "loss": 125.8033, "step": 175 }, { "epoch": 0.07680819287390656, "grad_norm": 901.2457885742188, "learning_rate": 5.085227272727273e-06, "loss": 139.7856, "step": 180 }, { "epoch": 0.07894175378707062, "grad_norm": 741.2592163085938, "learning_rate": 5.2272727272727274e-06, "loss": 116.0057, "step": 185 }, { "epoch": 0.08107531470023469, "grad_norm": 747.7615966796875, "learning_rate": 5.369318181818182e-06, "loss": 110.8626, "step": 190 }, { "epoch": 0.08320887561339876, "grad_norm": 747.3321533203125, "learning_rate": 5.511363636363637e-06, "loss": 109.0995, "step": 195 }, { "epoch": 0.08534243652656283, "grad_norm": 780.7266845703125, "learning_rate": 5.653409090909091e-06, "loss": 119.4655, "step": 200 }, { "epoch": 0.08747599743972691, "grad_norm": 695.3804321289062, "learning_rate": 5.795454545454546e-06, "loss": 110.2229, "step": 205 }, { "epoch": 0.08960955835289097, "grad_norm": 789.1644897460938, "learning_rate": 5.9375e-06, "loss": 112.9283, "step": 210 }, { "epoch": 0.09174311926605505, "grad_norm": 767.181396484375, "learning_rate": 6.079545454545454e-06, "loss": 112.8705, "step": 215 }, { "epoch": 0.09387668017921912, "grad_norm": 720.9426879882812, "learning_rate": 6.22159090909091e-06, "loss": 99.1093, "step": 220 }, { "epoch": 0.09601024109238318, "grad_norm": 646.9892578125, "learning_rate": 6.363636363636364e-06, "loss": 84.201, "step": 225 }, { "epoch": 0.09814380200554726, "grad_norm": 596.7828979492188, "learning_rate": 6.505681818181818e-06, "loss": 85.2275, "step": 230 }, { "epoch": 0.10027736291871132, "grad_norm": 511.76837158203125, "learning_rate": 6.647727272727273e-06, "loss": 73.787, "step": 235 }, { "epoch": 0.1024109238318754, "grad_norm": 587.9999389648438, "learning_rate": 6.789772727272727e-06, "loss": 79.8435, "step": 240 }, { "epoch": 0.10454448474503947, "grad_norm": 537.0763549804688, "learning_rate": 6.931818181818183e-06, "loss": 67.2795, "step": 245 }, { "epoch": 0.10667804565820355, "grad_norm": 509.71734619140625, "learning_rate": 7.073863636363637e-06, "loss": 56.0195, "step": 250 }, { "epoch": 0.10881160657136761, "grad_norm": 365.5326232910156, "learning_rate": 7.215909090909091e-06, "loss": 49.6022, "step": 255 }, { "epoch": 0.11094516748453169, "grad_norm": 407.3646240234375, "learning_rate": 7.357954545454546e-06, "loss": 52.5486, "step": 260 }, { "epoch": 0.11307872839769575, "grad_norm": 401.7477722167969, "learning_rate": 7.500000000000001e-06, "loss": 42.2398, "step": 265 }, { "epoch": 0.11521228931085982, "grad_norm": 341.845947265625, "learning_rate": 7.642045454545454e-06, "loss": 40.5546, "step": 270 }, { "epoch": 0.1173458502240239, "grad_norm": 294.14501953125, "learning_rate": 7.784090909090911e-06, "loss": 40.2339, "step": 275 }, { "epoch": 0.11947941113718796, "grad_norm": 306.5284423828125, "learning_rate": 7.926136363636364e-06, "loss": 33.0703, "step": 280 }, { "epoch": 0.12161297205035204, "grad_norm": 261.7529296875, "learning_rate": 8.068181818181819e-06, "loss": 28.2356, "step": 285 }, { "epoch": 0.1237465329635161, "grad_norm": 180.16305541992188, "learning_rate": 8.210227272727274e-06, "loss": 25.7024, "step": 290 }, { "epoch": 0.12588009387668017, "grad_norm": 198.47352600097656, "learning_rate": 8.352272727272727e-06, "loss": 21.9268, "step": 295 }, { "epoch": 0.12801365478984425, "grad_norm": 155.0713653564453, "learning_rate": 8.494318181818184e-06, "loss": 18.9837, "step": 300 }, { "epoch": 0.13014721570300833, "grad_norm": 114.19086456298828, "learning_rate": 8.636363636363637e-06, "loss": 14.5069, "step": 305 }, { "epoch": 0.13228077661617238, "grad_norm": 127.5108413696289, "learning_rate": 8.778409090909092e-06, "loss": 13.9749, "step": 310 }, { "epoch": 0.13441433752933646, "grad_norm": 101.1385726928711, "learning_rate": 8.920454545454547e-06, "loss": 10.4993, "step": 315 }, { "epoch": 0.13654789844250054, "grad_norm": 102.41287994384766, "learning_rate": 9.0625e-06, "loss": 10.8756, "step": 320 }, { "epoch": 0.13868145935566462, "grad_norm": 82.27557373046875, "learning_rate": 9.204545454545455e-06, "loss": 8.448, "step": 325 }, { "epoch": 0.14081502026882867, "grad_norm": 59.02256774902344, "learning_rate": 9.34659090909091e-06, "loss": 7.3985, "step": 330 }, { "epoch": 0.14294858118199275, "grad_norm": 55.171485900878906, "learning_rate": 9.488636363636365e-06, "loss": 6.7277, "step": 335 }, { "epoch": 0.14508214209515682, "grad_norm": 51.14794921875, "learning_rate": 9.630681818181818e-06, "loss": 5.369, "step": 340 }, { "epoch": 0.14721570300832088, "grad_norm": 47.7487678527832, "learning_rate": 9.772727272727273e-06, "loss": 5.1479, "step": 345 }, { "epoch": 0.14934926392148495, "grad_norm": 34.616519927978516, "learning_rate": 9.914772727272728e-06, "loss": 3.9802, "step": 350 }, { "epoch": 0.15148282483464903, "grad_norm": 28.53857421875, "learning_rate": 1.0056818181818183e-05, "loss": 3.3223, "step": 355 }, { "epoch": 0.1536163857478131, "grad_norm": 26.984251022338867, "learning_rate": 1.0198863636363636e-05, "loss": 3.148, "step": 360 }, { "epoch": 0.15574994666097716, "grad_norm": 25.347354888916016, "learning_rate": 1.0340909090909093e-05, "loss": 3.1232, "step": 365 }, { "epoch": 0.15788350757414124, "grad_norm": 21.828699111938477, "learning_rate": 1.0482954545454548e-05, "loss": 2.6608, "step": 370 }, { "epoch": 0.16001706848730532, "grad_norm": 19.460886001586914, "learning_rate": 1.0625e-05, "loss": 2.6175, "step": 375 }, { "epoch": 0.16215062940046937, "grad_norm": 14.729393005371094, "learning_rate": 1.0767045454545456e-05, "loss": 2.1366, "step": 380 }, { "epoch": 0.16428419031363345, "grad_norm": 15.992349624633789, "learning_rate": 1.0909090909090909e-05, "loss": 2.2121, "step": 385 }, { "epoch": 0.16641775122679753, "grad_norm": 14.3778715133667, "learning_rate": 1.1051136363636366e-05, "loss": 2.2454, "step": 390 }, { "epoch": 0.1685513121399616, "grad_norm": 14.635651588439941, "learning_rate": 1.119318181818182e-05, "loss": 2.1155, "step": 395 }, { "epoch": 0.17068487305312566, "grad_norm": 14.150617599487305, "learning_rate": 1.1335227272727274e-05, "loss": 1.988, "step": 400 }, { "epoch": 0.17281843396628974, "grad_norm": 13.40462875366211, "learning_rate": 1.1477272727272729e-05, "loss": 1.965, "step": 405 }, { "epoch": 0.17495199487945381, "grad_norm": 12.215423583984375, "learning_rate": 1.1619318181818182e-05, "loss": 1.8777, "step": 410 }, { "epoch": 0.17708555579261787, "grad_norm": 14.43690013885498, "learning_rate": 1.1761363636363637e-05, "loss": 1.9024, "step": 415 }, { "epoch": 0.17921911670578194, "grad_norm": 11.984452247619629, "learning_rate": 1.1903409090909093e-05, "loss": 1.7564, "step": 420 }, { "epoch": 0.18135267761894602, "grad_norm": 11.757567405700684, "learning_rate": 1.2045454545454547e-05, "loss": 1.7513, "step": 425 }, { "epoch": 0.1834862385321101, "grad_norm": 12.034418106079102, "learning_rate": 1.2187500000000001e-05, "loss": 1.8298, "step": 430 }, { "epoch": 0.18561979944527415, "grad_norm": 9.716742515563965, "learning_rate": 1.2329545454545455e-05, "loss": 1.5716, "step": 435 }, { "epoch": 0.18775336035843823, "grad_norm": 9.575140953063965, "learning_rate": 1.247159090909091e-05, "loss": 1.5193, "step": 440 }, { "epoch": 0.1898869212716023, "grad_norm": 10.11562442779541, "learning_rate": 1.2613636363636366e-05, "loss": 1.4442, "step": 445 }, { "epoch": 0.19202048218476636, "grad_norm": 9.89345645904541, "learning_rate": 1.275568181818182e-05, "loss": 1.5014, "step": 450 }, { "epoch": 0.19415404309793044, "grad_norm": 9.934481620788574, "learning_rate": 1.2897727272727274e-05, "loss": 1.4301, "step": 455 }, { "epoch": 0.19628760401109452, "grad_norm": 9.267789840698242, "learning_rate": 1.3039772727272728e-05, "loss": 1.5181, "step": 460 }, { "epoch": 0.1984211649242586, "grad_norm": 8.098825454711914, "learning_rate": 1.3181818181818183e-05, "loss": 1.328, "step": 465 }, { "epoch": 0.20055472583742265, "grad_norm": 8.572412490844727, "learning_rate": 1.3323863636363636e-05, "loss": 1.2831, "step": 470 }, { "epoch": 0.20268828675058673, "grad_norm": 9.359737396240234, "learning_rate": 1.3465909090909092e-05, "loss": 1.4169, "step": 475 }, { "epoch": 0.2048218476637508, "grad_norm": 11.379630088806152, "learning_rate": 1.3607954545454547e-05, "loss": 1.6943, "step": 480 }, { "epoch": 0.20695540857691488, "grad_norm": 8.644891738891602, "learning_rate": 1.375e-05, "loss": 1.2827, "step": 485 }, { "epoch": 0.20908896949007894, "grad_norm": 8.371868133544922, "learning_rate": 1.3892045454545455e-05, "loss": 1.3442, "step": 490 }, { "epoch": 0.21122253040324301, "grad_norm": 8.641389846801758, "learning_rate": 1.4034090909090909e-05, "loss": 1.3282, "step": 495 }, { "epoch": 0.2133560913164071, "grad_norm": 8.642644882202148, "learning_rate": 1.4176136363636365e-05, "loss": 1.2025, "step": 500 }, { "epoch": 0.21548965222957114, "grad_norm": 8.546757698059082, "learning_rate": 1.431818181818182e-05, "loss": 1.4431, "step": 505 }, { "epoch": 0.21762321314273522, "grad_norm": 9.831377983093262, "learning_rate": 1.4460227272727273e-05, "loss": 1.2434, "step": 510 }, { "epoch": 0.2197567740558993, "grad_norm": 8.539069175720215, "learning_rate": 1.4602272727272728e-05, "loss": 1.1499, "step": 515 }, { "epoch": 0.22189033496906338, "grad_norm": 7.561169147491455, "learning_rate": 1.4744318181818183e-05, "loss": 1.0773, "step": 520 }, { "epoch": 0.22402389588222743, "grad_norm": 8.947616577148438, "learning_rate": 1.4886363636363636e-05, "loss": 1.2212, "step": 525 }, { "epoch": 0.2261574567953915, "grad_norm": 9.044981002807617, "learning_rate": 1.5028409090909093e-05, "loss": 1.2723, "step": 530 }, { "epoch": 0.2282910177085556, "grad_norm": 8.213238716125488, "learning_rate": 1.5170454545454546e-05, "loss": 1.2088, "step": 535 }, { "epoch": 0.23042457862171964, "grad_norm": 7.895395278930664, "learning_rate": 1.5312500000000003e-05, "loss": 1.1508, "step": 540 }, { "epoch": 0.23255813953488372, "grad_norm": 7.332098007202148, "learning_rate": 1.5454545454545454e-05, "loss": 1.1426, "step": 545 }, { "epoch": 0.2346917004480478, "grad_norm": 7.83052921295166, "learning_rate": 1.559659090909091e-05, "loss": 1.1775, "step": 550 }, { "epoch": 0.23682526136121188, "grad_norm": 8.019132614135742, "learning_rate": 1.5738636363636364e-05, "loss": 1.2678, "step": 555 }, { "epoch": 0.23895882227437593, "grad_norm": 7.728061199188232, "learning_rate": 1.588068181818182e-05, "loss": 1.1158, "step": 560 }, { "epoch": 0.24109238318754, "grad_norm": 6.924149513244629, "learning_rate": 1.6022727272727274e-05, "loss": 1.0517, "step": 565 }, { "epoch": 0.24322594410070408, "grad_norm": 7.8373894691467285, "learning_rate": 1.616477272727273e-05, "loss": 1.1448, "step": 570 }, { "epoch": 0.24535950501386813, "grad_norm": 8.716146469116211, "learning_rate": 1.6306818181818184e-05, "loss": 1.16, "step": 575 }, { "epoch": 0.2474930659270322, "grad_norm": 6.993872165679932, "learning_rate": 1.6448863636363635e-05, "loss": 1.0683, "step": 580 }, { "epoch": 0.2496266268401963, "grad_norm": 6.647220134735107, "learning_rate": 1.6590909090909094e-05, "loss": 1.0468, "step": 585 }, { "epoch": 0.25176018775336034, "grad_norm": 6.776019096374512, "learning_rate": 1.673295454545455e-05, "loss": 0.9907, "step": 590 }, { "epoch": 0.2538937486665244, "grad_norm": 8.398134231567383, "learning_rate": 1.6875e-05, "loss": 1.1522, "step": 595 }, { "epoch": 0.2560273095796885, "grad_norm": 7.639389514923096, "learning_rate": 1.7017045454545455e-05, "loss": 1.0523, "step": 600 }, { "epoch": 0.2581608704928526, "grad_norm": 7.4302263259887695, "learning_rate": 1.715909090909091e-05, "loss": 1.0185, "step": 605 }, { "epoch": 0.26029443140601666, "grad_norm": 6.816908836364746, "learning_rate": 1.7301136363636365e-05, "loss": 1.0248, "step": 610 }, { "epoch": 0.26242799231918074, "grad_norm": 5.953906536102295, "learning_rate": 1.744318181818182e-05, "loss": 0.9046, "step": 615 }, { "epoch": 0.26456155323234476, "grad_norm": 7.096611022949219, "learning_rate": 1.7585227272727275e-05, "loss": 0.9747, "step": 620 }, { "epoch": 0.26669511414550884, "grad_norm": 6.390513896942139, "learning_rate": 1.772727272727273e-05, "loss": 0.9869, "step": 625 }, { "epoch": 0.2688286750586729, "grad_norm": 7.380507946014404, "learning_rate": 1.786931818181818e-05, "loss": 1.0425, "step": 630 }, { "epoch": 0.270962235971837, "grad_norm": 6.44447660446167, "learning_rate": 1.8011363636363636e-05, "loss": 0.9651, "step": 635 }, { "epoch": 0.2730957968850011, "grad_norm": 6.748442649841309, "learning_rate": 1.8153409090909094e-05, "loss": 1.0077, "step": 640 }, { "epoch": 0.27522935779816515, "grad_norm": 6.084346771240234, "learning_rate": 1.8295454545454546e-05, "loss": 0.8222, "step": 645 }, { "epoch": 0.27736291871132923, "grad_norm": 6.8281073570251465, "learning_rate": 1.84375e-05, "loss": 0.8685, "step": 650 }, { "epoch": 0.27949647962449325, "grad_norm": 7.083287715911865, "learning_rate": 1.8579545454545456e-05, "loss": 0.8913, "step": 655 }, { "epoch": 0.28163004053765733, "grad_norm": 6.964037895202637, "learning_rate": 1.872159090909091e-05, "loss": 0.935, "step": 660 }, { "epoch": 0.2837636014508214, "grad_norm": 6.590587139129639, "learning_rate": 1.8863636363636366e-05, "loss": 0.967, "step": 665 }, { "epoch": 0.2858971623639855, "grad_norm": 5.6939921379089355, "learning_rate": 1.900568181818182e-05, "loss": 0.9005, "step": 670 }, { "epoch": 0.28803072327714957, "grad_norm": 6.028235912322998, "learning_rate": 1.9147727272727276e-05, "loss": 0.8015, "step": 675 }, { "epoch": 0.29016428419031365, "grad_norm": 7.533740043640137, "learning_rate": 1.9289772727272727e-05, "loss": 0.912, "step": 680 }, { "epoch": 0.2922978451034777, "grad_norm": 6.917770862579346, "learning_rate": 1.9431818181818182e-05, "loss": 0.9849, "step": 685 }, { "epoch": 0.29443140601664175, "grad_norm": 8.427461624145508, "learning_rate": 1.9573863636363637e-05, "loss": 1.0628, "step": 690 }, { "epoch": 0.29656496692980583, "grad_norm": 5.852340221405029, "learning_rate": 1.9715909090909092e-05, "loss": 0.8542, "step": 695 }, { "epoch": 0.2986985278429699, "grad_norm": 6.263002395629883, "learning_rate": 1.9857954545454547e-05, "loss": 0.911, "step": 700 }, { "epoch": 0.300832088756134, "grad_norm": 6.515467643737793, "learning_rate": 2e-05, "loss": 0.9248, "step": 705 }, { "epoch": 0.30296564966929806, "grad_norm": 5.9300217628479, "learning_rate": 1.9984197218710496e-05, "loss": 0.8878, "step": 710 }, { "epoch": 0.30509921058246214, "grad_norm": 6.8438801765441895, "learning_rate": 1.9968394437420987e-05, "loss": 0.9006, "step": 715 }, { "epoch": 0.3072327714956262, "grad_norm": 6.085014820098877, "learning_rate": 1.9952591656131482e-05, "loss": 0.9037, "step": 720 }, { "epoch": 0.30936633240879025, "grad_norm": 6.003873348236084, "learning_rate": 1.9936788874841973e-05, "loss": 0.8492, "step": 725 }, { "epoch": 0.3114998933219543, "grad_norm": 6.141636848449707, "learning_rate": 1.9920986093552468e-05, "loss": 0.8886, "step": 730 }, { "epoch": 0.3136334542351184, "grad_norm": 6.797048091888428, "learning_rate": 1.990518331226296e-05, "loss": 0.8558, "step": 735 }, { "epoch": 0.3157670151482825, "grad_norm": 5.367070198059082, "learning_rate": 1.9889380530973453e-05, "loss": 0.8353, "step": 740 }, { "epoch": 0.31790057606144656, "grad_norm": 6.601510524749756, "learning_rate": 1.9873577749683945e-05, "loss": 0.8278, "step": 745 }, { "epoch": 0.32003413697461064, "grad_norm": 6.147311687469482, "learning_rate": 1.985777496839444e-05, "loss": 0.8283, "step": 750 }, { "epoch": 0.3221676978877747, "grad_norm": 6.131889820098877, "learning_rate": 1.984197218710493e-05, "loss": 0.867, "step": 755 }, { "epoch": 0.32430125880093874, "grad_norm": 6.493585109710693, "learning_rate": 1.9826169405815425e-05, "loss": 0.7298, "step": 760 }, { "epoch": 0.3264348197141028, "grad_norm": 6.721904277801514, "learning_rate": 1.981036662452592e-05, "loss": 0.874, "step": 765 }, { "epoch": 0.3285683806272669, "grad_norm": 5.271955490112305, "learning_rate": 1.979456384323641e-05, "loss": 0.7672, "step": 770 }, { "epoch": 0.330701941540431, "grad_norm": 5.658133506774902, "learning_rate": 1.9778761061946905e-05, "loss": 0.8225, "step": 775 }, { "epoch": 0.33283550245359506, "grad_norm": 5.675405025482178, "learning_rate": 1.97629582806574e-05, "loss": 0.8248, "step": 780 }, { "epoch": 0.33496906336675913, "grad_norm": 6.578061580657959, "learning_rate": 1.974715549936789e-05, "loss": 0.8961, "step": 785 }, { "epoch": 0.3371026242799232, "grad_norm": 5.728764057159424, "learning_rate": 1.9731352718078382e-05, "loss": 0.8242, "step": 790 }, { "epoch": 0.33923618519308724, "grad_norm": 6.199042320251465, "learning_rate": 1.9715549936788877e-05, "loss": 0.764, "step": 795 }, { "epoch": 0.3413697461062513, "grad_norm": 5.93390417098999, "learning_rate": 1.9699747155499368e-05, "loss": 0.7479, "step": 800 }, { "epoch": 0.3435033070194154, "grad_norm": 6.78892183303833, "learning_rate": 1.9683944374209863e-05, "loss": 0.9007, "step": 805 }, { "epoch": 0.3456368679325795, "grad_norm": 5.648488998413086, "learning_rate": 1.9668141592920357e-05, "loss": 0.8867, "step": 810 }, { "epoch": 0.34777042884574355, "grad_norm": 6.640005588531494, "learning_rate": 1.9652338811630848e-05, "loss": 0.8493, "step": 815 }, { "epoch": 0.34990398975890763, "grad_norm": 6.431436061859131, "learning_rate": 1.9636536030341343e-05, "loss": 0.7158, "step": 820 }, { "epoch": 0.3520375506720717, "grad_norm": 6.325982093811035, "learning_rate": 1.9620733249051834e-05, "loss": 0.7642, "step": 825 }, { "epoch": 0.35417111158523573, "grad_norm": 5.668117046356201, "learning_rate": 1.960493046776233e-05, "loss": 0.7993, "step": 830 }, { "epoch": 0.3563046724983998, "grad_norm": 5.590553283691406, "learning_rate": 1.9589127686472823e-05, "loss": 0.7103, "step": 835 }, { "epoch": 0.3584382334115639, "grad_norm": 5.006702423095703, "learning_rate": 1.9573324905183314e-05, "loss": 0.7772, "step": 840 }, { "epoch": 0.36057179432472797, "grad_norm": 5.906203269958496, "learning_rate": 1.9557522123893806e-05, "loss": 0.8199, "step": 845 }, { "epoch": 0.36270535523789205, "grad_norm": 5.0988264083862305, "learning_rate": 1.95417193426043e-05, "loss": 0.9472, "step": 850 }, { "epoch": 0.3648389161510561, "grad_norm": 5.713870525360107, "learning_rate": 1.952591656131479e-05, "loss": 0.7318, "step": 855 }, { "epoch": 0.3669724770642202, "grad_norm": 5.6474432945251465, "learning_rate": 1.9510113780025286e-05, "loss": 0.6891, "step": 860 }, { "epoch": 0.3691060379773842, "grad_norm": 5.688441753387451, "learning_rate": 1.949431099873578e-05, "loss": 0.8185, "step": 865 }, { "epoch": 0.3712395988905483, "grad_norm": 5.583171367645264, "learning_rate": 1.947850821744627e-05, "loss": 0.7771, "step": 870 }, { "epoch": 0.3733731598037124, "grad_norm": 6.464980125427246, "learning_rate": 1.9462705436156766e-05, "loss": 0.7638, "step": 875 }, { "epoch": 0.37550672071687646, "grad_norm": 6.436215877532959, "learning_rate": 1.944690265486726e-05, "loss": 1.1106, "step": 880 }, { "epoch": 0.37764028163004054, "grad_norm": 6.303576946258545, "learning_rate": 1.9431099873577752e-05, "loss": 0.8603, "step": 885 }, { "epoch": 0.3797738425432046, "grad_norm": 5.866418838500977, "learning_rate": 1.9415297092288243e-05, "loss": 0.9143, "step": 890 }, { "epoch": 0.3819074034563687, "grad_norm": 5.543066024780273, "learning_rate": 1.9399494310998738e-05, "loss": 0.7764, "step": 895 }, { "epoch": 0.3840409643695327, "grad_norm": 5.441503047943115, "learning_rate": 1.938369152970923e-05, "loss": 0.8121, "step": 900 }, { "epoch": 0.3861745252826968, "grad_norm": 5.828523635864258, "learning_rate": 1.9367888748419723e-05, "loss": 0.7664, "step": 905 }, { "epoch": 0.3883080861958609, "grad_norm": 5.6354804039001465, "learning_rate": 1.9352085967130218e-05, "loss": 0.7424, "step": 910 }, { "epoch": 0.39044164710902496, "grad_norm": 6.025097846984863, "learning_rate": 1.933628318584071e-05, "loss": 0.7377, "step": 915 }, { "epoch": 0.39257520802218904, "grad_norm": 7.527228355407715, "learning_rate": 1.9320480404551204e-05, "loss": 0.8329, "step": 920 }, { "epoch": 0.3947087689353531, "grad_norm": 5.251444339752197, "learning_rate": 1.9304677623261695e-05, "loss": 0.6529, "step": 925 }, { "epoch": 0.3968423298485172, "grad_norm": 5.458242416381836, "learning_rate": 1.928887484197219e-05, "loss": 0.653, "step": 930 }, { "epoch": 0.3989758907616813, "grad_norm": 5.475050926208496, "learning_rate": 1.9273072060682684e-05, "loss": 0.6713, "step": 935 }, { "epoch": 0.4011094516748453, "grad_norm": 5.815523624420166, "learning_rate": 1.9257269279393175e-05, "loss": 0.8805, "step": 940 }, { "epoch": 0.4032430125880094, "grad_norm": 5.540900707244873, "learning_rate": 1.9241466498103666e-05, "loss": 0.7712, "step": 945 }, { "epoch": 0.40537657350117345, "grad_norm": 6.0890045166015625, "learning_rate": 1.922566371681416e-05, "loss": 0.7399, "step": 950 }, { "epoch": 0.40751013441433753, "grad_norm": 6.03472375869751, "learning_rate": 1.9209860935524652e-05, "loss": 0.7024, "step": 955 }, { "epoch": 0.4096436953275016, "grad_norm": 6.81519079208374, "learning_rate": 1.9194058154235147e-05, "loss": 0.7185, "step": 960 }, { "epoch": 0.4117772562406657, "grad_norm": 4.7606425285339355, "learning_rate": 1.917825537294564e-05, "loss": 0.6943, "step": 965 }, { "epoch": 0.41391081715382977, "grad_norm": 5.0476555824279785, "learning_rate": 1.9162452591656132e-05, "loss": 0.7553, "step": 970 }, { "epoch": 0.4160443780669938, "grad_norm": 4.818223476409912, "learning_rate": 1.9146649810366627e-05, "loss": 0.5834, "step": 975 }, { "epoch": 0.41817793898015787, "grad_norm": 5.1618499755859375, "learning_rate": 1.9130847029077118e-05, "loss": 0.6494, "step": 980 }, { "epoch": 0.42031149989332195, "grad_norm": 5.9180731773376465, "learning_rate": 1.9115044247787613e-05, "loss": 0.7961, "step": 985 }, { "epoch": 0.42244506080648603, "grad_norm": 5.7833051681518555, "learning_rate": 1.9099241466498107e-05, "loss": 0.6523, "step": 990 }, { "epoch": 0.4245786217196501, "grad_norm": 6.119492053985596, "learning_rate": 1.90834386852086e-05, "loss": 0.9619, "step": 995 }, { "epoch": 0.4267121826328142, "grad_norm": 5.727389812469482, "learning_rate": 1.906763590391909e-05, "loss": 0.7668, "step": 1000 }, { "epoch": 0.42884574354597826, "grad_norm": 4.694662570953369, "learning_rate": 1.9051833122629584e-05, "loss": 0.6901, "step": 1005 }, { "epoch": 0.4309793044591423, "grad_norm": 5.501200199127197, "learning_rate": 1.9036030341340075e-05, "loss": 0.6974, "step": 1010 }, { "epoch": 0.43311286537230637, "grad_norm": 8.685210227966309, "learning_rate": 1.902022756005057e-05, "loss": 0.6458, "step": 1015 }, { "epoch": 0.43524642628547044, "grad_norm": 5.705277919769287, "learning_rate": 1.9004424778761065e-05, "loss": 0.664, "step": 1020 }, { "epoch": 0.4373799871986345, "grad_norm": 5.0383453369140625, "learning_rate": 1.8988621997471556e-05, "loss": 0.639, "step": 1025 }, { "epoch": 0.4395135481117986, "grad_norm": 5.572895526885986, "learning_rate": 1.897281921618205e-05, "loss": 0.7748, "step": 1030 }, { "epoch": 0.4416471090249627, "grad_norm": 5.489956378936768, "learning_rate": 1.8957016434892545e-05, "loss": 0.666, "step": 1035 }, { "epoch": 0.44378066993812676, "grad_norm": 5.743871212005615, "learning_rate": 1.8941213653603036e-05, "loss": 0.7466, "step": 1040 }, { "epoch": 0.4459142308512908, "grad_norm": 5.379419803619385, "learning_rate": 1.8925410872313527e-05, "loss": 0.6738, "step": 1045 }, { "epoch": 0.44804779176445486, "grad_norm": 4.491464614868164, "learning_rate": 1.8909608091024022e-05, "loss": 0.769, "step": 1050 }, { "epoch": 0.45018135267761894, "grad_norm": 5.5537848472595215, "learning_rate": 1.8893805309734513e-05, "loss": 0.6594, "step": 1055 }, { "epoch": 0.452314913590783, "grad_norm": 5.834591388702393, "learning_rate": 1.8878002528445008e-05, "loss": 0.7834, "step": 1060 }, { "epoch": 0.4544484745039471, "grad_norm": 5.1776509284973145, "learning_rate": 1.8862199747155502e-05, "loss": 0.5834, "step": 1065 }, { "epoch": 0.4565820354171112, "grad_norm": 5.313874244689941, "learning_rate": 1.8846396965865993e-05, "loss": 0.5964, "step": 1070 }, { "epoch": 0.45871559633027525, "grad_norm": 5.8994574546813965, "learning_rate": 1.8830594184576488e-05, "loss": 0.673, "step": 1075 }, { "epoch": 0.4608491572434393, "grad_norm": 5.38604736328125, "learning_rate": 1.881479140328698e-05, "loss": 0.6506, "step": 1080 }, { "epoch": 0.46298271815660336, "grad_norm": 4.179570198059082, "learning_rate": 1.8798988621997474e-05, "loss": 0.6009, "step": 1085 }, { "epoch": 0.46511627906976744, "grad_norm": 5.218341827392578, "learning_rate": 1.8783185840707968e-05, "loss": 0.628, "step": 1090 }, { "epoch": 0.4672498399829315, "grad_norm": 5.472168922424316, "learning_rate": 1.876738305941846e-05, "loss": 0.7201, "step": 1095 }, { "epoch": 0.4693834008960956, "grad_norm": 4.910761833190918, "learning_rate": 1.875158027812895e-05, "loss": 0.6409, "step": 1100 }, { "epoch": 0.47151696180925967, "grad_norm": 5.117238998413086, "learning_rate": 1.8735777496839445e-05, "loss": 0.5792, "step": 1105 }, { "epoch": 0.47365052272242375, "grad_norm": 5.522660732269287, "learning_rate": 1.8719974715549936e-05, "loss": 0.5869, "step": 1110 }, { "epoch": 0.4757840836355878, "grad_norm": 4.868000507354736, "learning_rate": 1.870417193426043e-05, "loss": 0.6068, "step": 1115 }, { "epoch": 0.47791764454875185, "grad_norm": 4.8891921043396, "learning_rate": 1.8688369152970925e-05, "loss": 0.6184, "step": 1120 }, { "epoch": 0.48005120546191593, "grad_norm": 5.064128398895264, "learning_rate": 1.8672566371681417e-05, "loss": 0.62, "step": 1125 }, { "epoch": 0.48218476637508, "grad_norm": 5.008705139160156, "learning_rate": 1.865676359039191e-05, "loss": 0.6352, "step": 1130 }, { "epoch": 0.4843183272882441, "grad_norm": 4.730052471160889, "learning_rate": 1.8640960809102406e-05, "loss": 0.6614, "step": 1135 }, { "epoch": 0.48645188820140817, "grad_norm": 4.680159091949463, "learning_rate": 1.8625158027812897e-05, "loss": 0.6086, "step": 1140 }, { "epoch": 0.48858544911457225, "grad_norm": 5.151489734649658, "learning_rate": 1.860935524652339e-05, "loss": 0.6339, "step": 1145 }, { "epoch": 0.49071901002773627, "grad_norm": 5.28818941116333, "learning_rate": 1.8593552465233883e-05, "loss": 0.6728, "step": 1150 }, { "epoch": 0.49285257094090035, "grad_norm": 5.3361687660217285, "learning_rate": 1.8577749683944374e-05, "loss": 0.6508, "step": 1155 }, { "epoch": 0.4949861318540644, "grad_norm": 4.8041090965271, "learning_rate": 1.856194690265487e-05, "loss": 0.6442, "step": 1160 }, { "epoch": 0.4971196927672285, "grad_norm": 4.674312591552734, "learning_rate": 1.8546144121365363e-05, "loss": 0.7078, "step": 1165 }, { "epoch": 0.4992532536803926, "grad_norm": 4.928088665008545, "learning_rate": 1.8530341340075854e-05, "loss": 0.7788, "step": 1170 }, { "epoch": 0.5013868145935566, "grad_norm": 4.974058628082275, "learning_rate": 1.851453855878635e-05, "loss": 0.7097, "step": 1175 }, { "epoch": 0.5035203755067207, "grad_norm": 4.790674209594727, "learning_rate": 1.849873577749684e-05, "loss": 0.5971, "step": 1180 }, { "epoch": 0.5056539364198848, "grad_norm": 5.095616817474365, "learning_rate": 1.8482932996207335e-05, "loss": 0.6299, "step": 1185 }, { "epoch": 0.5077874973330488, "grad_norm": 4.432722091674805, "learning_rate": 1.846713021491783e-05, "loss": 0.6348, "step": 1190 }, { "epoch": 0.5099210582462129, "grad_norm": 4.803964614868164, "learning_rate": 1.845132743362832e-05, "loss": 0.6403, "step": 1195 }, { "epoch": 0.512054619159377, "grad_norm": 5.166224956512451, "learning_rate": 1.843552465233881e-05, "loss": 0.655, "step": 1200 }, { "epoch": 0.5141881800725411, "grad_norm": 5.379065990447998, "learning_rate": 1.8419721871049306e-05, "loss": 0.617, "step": 1205 }, { "epoch": 0.5163217409857052, "grad_norm": 5.9183735847473145, "learning_rate": 1.8403919089759797e-05, "loss": 0.586, "step": 1210 }, { "epoch": 0.5184553018988692, "grad_norm": 5.269925117492676, "learning_rate": 1.8388116308470292e-05, "loss": 0.6441, "step": 1215 }, { "epoch": 0.5205888628120333, "grad_norm": 4.2534003257751465, "learning_rate": 1.8372313527180786e-05, "loss": 0.5532, "step": 1220 }, { "epoch": 0.5227224237251974, "grad_norm": 4.715899467468262, "learning_rate": 1.8356510745891278e-05, "loss": 0.5795, "step": 1225 }, { "epoch": 0.5248559846383615, "grad_norm": 5.345327377319336, "learning_rate": 1.8340707964601772e-05, "loss": 0.652, "step": 1230 }, { "epoch": 0.5269895455515256, "grad_norm": 4.632909297943115, "learning_rate": 1.8324905183312263e-05, "loss": 0.6023, "step": 1235 }, { "epoch": 0.5291231064646895, "grad_norm": 5.490262031555176, "learning_rate": 1.8309102402022758e-05, "loss": 0.6742, "step": 1240 }, { "epoch": 0.5312566673778536, "grad_norm": 4.147167205810547, "learning_rate": 1.8293299620733252e-05, "loss": 0.5938, "step": 1245 }, { "epoch": 0.5333902282910177, "grad_norm": 4.870655059814453, "learning_rate": 1.8277496839443744e-05, "loss": 0.5831, "step": 1250 }, { "epoch": 0.5355237892041818, "grad_norm": 5.774129390716553, "learning_rate": 1.8261694058154235e-05, "loss": 0.6278, "step": 1255 }, { "epoch": 0.5376573501173458, "grad_norm": 4.7453999519348145, "learning_rate": 1.824589127686473e-05, "loss": 0.5228, "step": 1260 }, { "epoch": 0.5397909110305099, "grad_norm": 5.04367733001709, "learning_rate": 1.823008849557522e-05, "loss": 0.5514, "step": 1265 }, { "epoch": 0.541924471943674, "grad_norm": 4.159360408782959, "learning_rate": 1.8214285714285715e-05, "loss": 0.6121, "step": 1270 }, { "epoch": 0.5440580328568381, "grad_norm": 4.117012023925781, "learning_rate": 1.819848293299621e-05, "loss": 0.5538, "step": 1275 }, { "epoch": 0.5461915937700021, "grad_norm": 4.434225082397461, "learning_rate": 1.81826801517067e-05, "loss": 0.5912, "step": 1280 }, { "epoch": 0.5483251546831662, "grad_norm": 5.941255569458008, "learning_rate": 1.8166877370417195e-05, "loss": 0.5777, "step": 1285 }, { "epoch": 0.5504587155963303, "grad_norm": 5.3827738761901855, "learning_rate": 1.815107458912769e-05, "loss": 0.5657, "step": 1290 }, { "epoch": 0.5525922765094944, "grad_norm": 4.9497575759887695, "learning_rate": 1.813527180783818e-05, "loss": 0.6132, "step": 1295 }, { "epoch": 0.5547258374226585, "grad_norm": 4.846462249755859, "learning_rate": 1.8119469026548676e-05, "loss": 0.5407, "step": 1300 }, { "epoch": 0.5568593983358225, "grad_norm": 5.622977256774902, "learning_rate": 1.8103666245259167e-05, "loss": 0.5939, "step": 1305 }, { "epoch": 0.5589929592489865, "grad_norm": 5.20183801651001, "learning_rate": 1.8087863463969658e-05, "loss": 0.6412, "step": 1310 }, { "epoch": 0.5611265201621506, "grad_norm": 4.769690990447998, "learning_rate": 1.8072060682680153e-05, "loss": 0.5422, "step": 1315 }, { "epoch": 0.5632600810753147, "grad_norm": 5.3674516677856445, "learning_rate": 1.8056257901390647e-05, "loss": 0.6404, "step": 1320 }, { "epoch": 0.5653936419884787, "grad_norm": 4.712153911590576, "learning_rate": 1.804045512010114e-05, "loss": 0.6208, "step": 1325 }, { "epoch": 0.5675272029016428, "grad_norm": 4.993585586547852, "learning_rate": 1.8024652338811633e-05, "loss": 0.5722, "step": 1330 }, { "epoch": 0.5696607638148069, "grad_norm": 4.3871307373046875, "learning_rate": 1.8008849557522124e-05, "loss": 0.5092, "step": 1335 }, { "epoch": 0.571794324727971, "grad_norm": 4.752344131469727, "learning_rate": 1.799304677623262e-05, "loss": 0.6061, "step": 1340 }, { "epoch": 0.5739278856411351, "grad_norm": 5.579290390014648, "learning_rate": 1.7977243994943113e-05, "loss": 0.6441, "step": 1345 }, { "epoch": 0.5760614465542991, "grad_norm": 4.870927810668945, "learning_rate": 1.7961441213653604e-05, "loss": 0.5957, "step": 1350 }, { "epoch": 0.5781950074674632, "grad_norm": 5.871009349822998, "learning_rate": 1.79456384323641e-05, "loss": 0.6249, "step": 1355 }, { "epoch": 0.5803285683806273, "grad_norm": 5.544130325317383, "learning_rate": 1.792983565107459e-05, "loss": 0.6002, "step": 1360 }, { "epoch": 0.5824621292937914, "grad_norm": 4.897180080413818, "learning_rate": 1.791403286978508e-05, "loss": 0.6096, "step": 1365 }, { "epoch": 0.5845956902069555, "grad_norm": 4.563004016876221, "learning_rate": 1.7898230088495576e-05, "loss": 0.5466, "step": 1370 }, { "epoch": 0.5867292511201195, "grad_norm": 4.7146430015563965, "learning_rate": 1.788242730720607e-05, "loss": 0.5664, "step": 1375 }, { "epoch": 0.5888628120332835, "grad_norm": 5.06727933883667, "learning_rate": 1.7866624525916562e-05, "loss": 0.6218, "step": 1380 }, { "epoch": 0.5909963729464476, "grad_norm": 4.825130462646484, "learning_rate": 1.7850821744627056e-05, "loss": 0.513, "step": 1385 }, { "epoch": 0.5931299338596117, "grad_norm": 5.076493740081787, "learning_rate": 1.783501896333755e-05, "loss": 0.6036, "step": 1390 }, { "epoch": 0.5952634947727757, "grad_norm": 4.400293350219727, "learning_rate": 1.7819216182048042e-05, "loss": 0.5427, "step": 1395 }, { "epoch": 0.5973970556859398, "grad_norm": 4.34608268737793, "learning_rate": 1.7803413400758537e-05, "loss": 0.4911, "step": 1400 }, { "epoch": 0.5995306165991039, "grad_norm": 4.652610778808594, "learning_rate": 1.7787610619469028e-05, "loss": 0.5789, "step": 1405 }, { "epoch": 0.601664177512268, "grad_norm": 5.2942914962768555, "learning_rate": 1.777180783817952e-05, "loss": 0.5746, "step": 1410 }, { "epoch": 0.603797738425432, "grad_norm": 4.799631118774414, "learning_rate": 1.7756005056890014e-05, "loss": 0.6073, "step": 1415 }, { "epoch": 0.6059312993385961, "grad_norm": 4.464809417724609, "learning_rate": 1.7740202275600508e-05, "loss": 0.5064, "step": 1420 }, { "epoch": 0.6080648602517602, "grad_norm": 5.692066192626953, "learning_rate": 1.7724399494311e-05, "loss": 0.5846, "step": 1425 }, { "epoch": 0.6101984211649243, "grad_norm": 4.2043890953063965, "learning_rate": 1.7708596713021494e-05, "loss": 0.5368, "step": 1430 }, { "epoch": 0.6123319820780884, "grad_norm": 5.393744468688965, "learning_rate": 1.7692793931731985e-05, "loss": 0.5735, "step": 1435 }, { "epoch": 0.6144655429912524, "grad_norm": 4.345822334289551, "learning_rate": 1.767699115044248e-05, "loss": 0.5119, "step": 1440 }, { "epoch": 0.6165991039044165, "grad_norm": 4.4069647789001465, "learning_rate": 1.7661188369152974e-05, "loss": 0.5429, "step": 1445 }, { "epoch": 0.6187326648175805, "grad_norm": 4.706110000610352, "learning_rate": 1.7645385587863465e-05, "loss": 0.6141, "step": 1450 }, { "epoch": 0.6208662257307446, "grad_norm": 4.513293266296387, "learning_rate": 1.762958280657396e-05, "loss": 0.5224, "step": 1455 }, { "epoch": 0.6229997866439086, "grad_norm": 4.868618488311768, "learning_rate": 1.761378002528445e-05, "loss": 0.5656, "step": 1460 }, { "epoch": 0.6251333475570727, "grad_norm": 4.385486602783203, "learning_rate": 1.7597977243994942e-05, "loss": 0.567, "step": 1465 }, { "epoch": 0.6272669084702368, "grad_norm": 5.354750633239746, "learning_rate": 1.7582174462705437e-05, "loss": 0.5517, "step": 1470 }, { "epoch": 0.6294004693834009, "grad_norm": 5.496529579162598, "learning_rate": 1.756637168141593e-05, "loss": 0.6311, "step": 1475 }, { "epoch": 0.631534030296565, "grad_norm": 5.110471725463867, "learning_rate": 1.7550568900126423e-05, "loss": 0.5862, "step": 1480 }, { "epoch": 0.633667591209729, "grad_norm": 4.875749111175537, "learning_rate": 1.7534766118836917e-05, "loss": 0.5699, "step": 1485 }, { "epoch": 0.6358011521228931, "grad_norm": 4.413814544677734, "learning_rate": 1.751896333754741e-05, "loss": 0.574, "step": 1490 }, { "epoch": 0.6379347130360572, "grad_norm": 4.2545857429504395, "learning_rate": 1.7503160556257903e-05, "loss": 0.5611, "step": 1495 }, { "epoch": 0.6400682739492213, "grad_norm": 5.520027160644531, "learning_rate": 1.7487357774968398e-05, "loss": 0.4966, "step": 1500 }, { "epoch": 0.6422018348623854, "grad_norm": 5.082278728485107, "learning_rate": 1.747155499367889e-05, "loss": 0.5305, "step": 1505 }, { "epoch": 0.6443353957755494, "grad_norm": 4.388136863708496, "learning_rate": 1.7455752212389383e-05, "loss": 0.5228, "step": 1510 }, { "epoch": 0.6464689566887135, "grad_norm": 5.492458343505859, "learning_rate": 1.7439949431099874e-05, "loss": 0.5816, "step": 1515 }, { "epoch": 0.6486025176018775, "grad_norm": 4.697801113128662, "learning_rate": 1.7424146649810366e-05, "loss": 0.5538, "step": 1520 }, { "epoch": 0.6507360785150416, "grad_norm": 4.820540904998779, "learning_rate": 1.740834386852086e-05, "loss": 0.4972, "step": 1525 }, { "epoch": 0.6528696394282056, "grad_norm": 4.574212551116943, "learning_rate": 1.7392541087231355e-05, "loss": 0.5677, "step": 1530 }, { "epoch": 0.6550032003413697, "grad_norm": 5.26272439956665, "learning_rate": 1.7376738305941846e-05, "loss": 0.5891, "step": 1535 }, { "epoch": 0.6571367612545338, "grad_norm": 4.426779747009277, "learning_rate": 1.736093552465234e-05, "loss": 0.5463, "step": 1540 }, { "epoch": 0.6592703221676979, "grad_norm": 4.585545063018799, "learning_rate": 1.7345132743362835e-05, "loss": 0.6041, "step": 1545 }, { "epoch": 0.661403883080862, "grad_norm": 5.416412353515625, "learning_rate": 1.7329329962073326e-05, "loss": 0.7248, "step": 1550 }, { "epoch": 0.663537443994026, "grad_norm": 4.863205909729004, "learning_rate": 1.731352718078382e-05, "loss": 0.6366, "step": 1555 }, { "epoch": 0.6656710049071901, "grad_norm": 4.924341201782227, "learning_rate": 1.7297724399494312e-05, "loss": 0.6056, "step": 1560 }, { "epoch": 0.6678045658203542, "grad_norm": 4.459977149963379, "learning_rate": 1.7281921618204803e-05, "loss": 0.5468, "step": 1565 }, { "epoch": 0.6699381267335183, "grad_norm": 4.483630657196045, "learning_rate": 1.7266118836915298e-05, "loss": 0.5638, "step": 1570 }, { "epoch": 0.6720716876466823, "grad_norm": 5.0850605964660645, "learning_rate": 1.7250316055625792e-05, "loss": 0.5833, "step": 1575 }, { "epoch": 0.6742052485598464, "grad_norm": 5.052149295806885, "learning_rate": 1.7234513274336284e-05, "loss": 0.6128, "step": 1580 }, { "epoch": 0.6763388094730105, "grad_norm": 4.196040153503418, "learning_rate": 1.7218710493046778e-05, "loss": 0.5525, "step": 1585 }, { "epoch": 0.6784723703861745, "grad_norm": 4.805011749267578, "learning_rate": 1.720290771175727e-05, "loss": 0.5735, "step": 1590 }, { "epoch": 0.6806059312993386, "grad_norm": 4.9073405265808105, "learning_rate": 1.7187104930467764e-05, "loss": 0.5409, "step": 1595 }, { "epoch": 0.6827394922125026, "grad_norm": 5.15145206451416, "learning_rate": 1.717130214917826e-05, "loss": 0.545, "step": 1600 }, { "epoch": 0.6848730531256667, "grad_norm": 4.6511945724487305, "learning_rate": 1.715549936788875e-05, "loss": 0.4922, "step": 1605 }, { "epoch": 0.6870066140388308, "grad_norm": 4.7381591796875, "learning_rate": 1.7139696586599244e-05, "loss": 0.6188, "step": 1610 }, { "epoch": 0.6891401749519949, "grad_norm": 5.406619071960449, "learning_rate": 1.7123893805309735e-05, "loss": 0.5705, "step": 1615 }, { "epoch": 0.691273735865159, "grad_norm": 5.582167625427246, "learning_rate": 1.7108091024020227e-05, "loss": 0.5205, "step": 1620 }, { "epoch": 0.693407296778323, "grad_norm": 4.347970962524414, "learning_rate": 1.709228824273072e-05, "loss": 0.5568, "step": 1625 }, { "epoch": 0.6955408576914871, "grad_norm": 4.038663387298584, "learning_rate": 1.7076485461441216e-05, "loss": 0.5565, "step": 1630 }, { "epoch": 0.6976744186046512, "grad_norm": 4.6226911544799805, "learning_rate": 1.7060682680151707e-05, "loss": 0.5122, "step": 1635 }, { "epoch": 0.6998079795178153, "grad_norm": 5.612771034240723, "learning_rate": 1.70448798988622e-05, "loss": 0.5414, "step": 1640 }, { "epoch": 0.7019415404309793, "grad_norm": 4.659555912017822, "learning_rate": 1.7029077117572696e-05, "loss": 0.5126, "step": 1645 }, { "epoch": 0.7040751013441434, "grad_norm": 4.515445709228516, "learning_rate": 1.7013274336283187e-05, "loss": 0.5426, "step": 1650 }, { "epoch": 0.7062086622573075, "grad_norm": 5.030318260192871, "learning_rate": 1.6997471554993682e-05, "loss": 0.5764, "step": 1655 }, { "epoch": 0.7083422231704715, "grad_norm": 4.8684587478637695, "learning_rate": 1.6981668773704173e-05, "loss": 0.5159, "step": 1660 }, { "epoch": 0.7104757840836355, "grad_norm": 5.1901092529296875, "learning_rate": 1.6965865992414667e-05, "loss": 0.576, "step": 1665 }, { "epoch": 0.7126093449967996, "grad_norm": 4.969446659088135, "learning_rate": 1.695006321112516e-05, "loss": 0.5781, "step": 1670 }, { "epoch": 0.7147429059099637, "grad_norm": 4.46921443939209, "learning_rate": 1.6934260429835653e-05, "loss": 0.5164, "step": 1675 }, { "epoch": 0.7168764668231278, "grad_norm": 5.267427444458008, "learning_rate": 1.6918457648546144e-05, "loss": 0.5278, "step": 1680 }, { "epoch": 0.7190100277362919, "grad_norm": 4.0481390953063965, "learning_rate": 1.690265486725664e-05, "loss": 0.5667, "step": 1685 }, { "epoch": 0.7211435886494559, "grad_norm": 4.155159950256348, "learning_rate": 1.688685208596713e-05, "loss": 0.4889, "step": 1690 }, { "epoch": 0.72327714956262, "grad_norm": 4.967867851257324, "learning_rate": 1.6871049304677625e-05, "loss": 0.5163, "step": 1695 }, { "epoch": 0.7254107104757841, "grad_norm": 4.598382472991943, "learning_rate": 1.685524652338812e-05, "loss": 0.523, "step": 1700 }, { "epoch": 0.7275442713889482, "grad_norm": 4.9795756340026855, "learning_rate": 1.683944374209861e-05, "loss": 0.5503, "step": 1705 }, { "epoch": 0.7296778323021123, "grad_norm": 4.119642734527588, "learning_rate": 1.6823640960809105e-05, "loss": 0.4932, "step": 1710 }, { "epoch": 0.7318113932152763, "grad_norm": 3.7534000873565674, "learning_rate": 1.6807838179519596e-05, "loss": 0.4973, "step": 1715 }, { "epoch": 0.7339449541284404, "grad_norm": 4.928293704986572, "learning_rate": 1.6792035398230087e-05, "loss": 0.5625, "step": 1720 }, { "epoch": 0.7360785150416045, "grad_norm": 5.265719890594482, "learning_rate": 1.6776232616940582e-05, "loss": 0.5335, "step": 1725 }, { "epoch": 0.7382120759547685, "grad_norm": 4.559508323669434, "learning_rate": 1.6760429835651077e-05, "loss": 0.4446, "step": 1730 }, { "epoch": 0.7403456368679325, "grad_norm": 5.78938627243042, "learning_rate": 1.6744627054361568e-05, "loss": 0.5377, "step": 1735 }, { "epoch": 0.7424791977810966, "grad_norm": 4.455636501312256, "learning_rate": 1.6728824273072062e-05, "loss": 0.5036, "step": 1740 }, { "epoch": 0.7446127586942607, "grad_norm": 5.289336204528809, "learning_rate": 1.6713021491782553e-05, "loss": 0.6074, "step": 1745 }, { "epoch": 0.7467463196074248, "grad_norm": 5.169851303100586, "learning_rate": 1.6697218710493048e-05, "loss": 0.5982, "step": 1750 }, { "epoch": 0.7488798805205888, "grad_norm": 5.2046217918396, "learning_rate": 1.6681415929203543e-05, "loss": 0.5762, "step": 1755 }, { "epoch": 0.7510134414337529, "grad_norm": 4.986084461212158, "learning_rate": 1.6665613147914034e-05, "loss": 0.5379, "step": 1760 }, { "epoch": 0.753147002346917, "grad_norm": 5.0369791984558105, "learning_rate": 1.664981036662453e-05, "loss": 0.5702, "step": 1765 }, { "epoch": 0.7552805632600811, "grad_norm": 4.595948696136475, "learning_rate": 1.6634007585335023e-05, "loss": 0.4272, "step": 1770 }, { "epoch": 0.7574141241732452, "grad_norm": 4.182741641998291, "learning_rate": 1.6618204804045514e-05, "loss": 0.4805, "step": 1775 }, { "epoch": 0.7595476850864092, "grad_norm": 4.699618816375732, "learning_rate": 1.6602402022756005e-05, "loss": 0.4963, "step": 1780 }, { "epoch": 0.7616812459995733, "grad_norm": 5.032837867736816, "learning_rate": 1.65865992414665e-05, "loss": 0.4846, "step": 1785 }, { "epoch": 0.7638148069127374, "grad_norm": 5.413885116577148, "learning_rate": 1.657079646017699e-05, "loss": 0.584, "step": 1790 }, { "epoch": 0.7659483678259015, "grad_norm": 5.019171237945557, "learning_rate": 1.6554993678887486e-05, "loss": 0.5158, "step": 1795 }, { "epoch": 0.7680819287390654, "grad_norm": 5.328768730163574, "learning_rate": 1.653919089759798e-05, "loss": 0.5821, "step": 1800 }, { "epoch": 0.7702154896522295, "grad_norm": 5.146249771118164, "learning_rate": 1.652338811630847e-05, "loss": 0.4609, "step": 1805 }, { "epoch": 0.7723490505653936, "grad_norm": 4.079854488372803, "learning_rate": 1.6507585335018966e-05, "loss": 0.4335, "step": 1810 }, { "epoch": 0.7744826114785577, "grad_norm": 5.487088203430176, "learning_rate": 1.6491782553729457e-05, "loss": 0.5672, "step": 1815 }, { "epoch": 0.7766161723917218, "grad_norm": 4.999423980712891, "learning_rate": 1.647597977243995e-05, "loss": 0.467, "step": 1820 }, { "epoch": 0.7787497333048858, "grad_norm": 4.289427280426025, "learning_rate": 1.6460176991150443e-05, "loss": 0.4924, "step": 1825 }, { "epoch": 0.7808832942180499, "grad_norm": 4.848026752471924, "learning_rate": 1.6444374209860937e-05, "loss": 0.4786, "step": 1830 }, { "epoch": 0.783016855131214, "grad_norm": 4.637599945068359, "learning_rate": 1.642857142857143e-05, "loss": 0.5365, "step": 1835 }, { "epoch": 0.7851504160443781, "grad_norm": 4.305636405944824, "learning_rate": 1.6412768647281923e-05, "loss": 0.5199, "step": 1840 }, { "epoch": 0.7872839769575422, "grad_norm": 4.758025169372559, "learning_rate": 1.6396965865992414e-05, "loss": 0.5243, "step": 1845 }, { "epoch": 0.7894175378707062, "grad_norm": 5.047544479370117, "learning_rate": 1.638116308470291e-05, "loss": 0.4685, "step": 1850 }, { "epoch": 0.7915510987838703, "grad_norm": 4.558396816253662, "learning_rate": 1.6365360303413403e-05, "loss": 0.6146, "step": 1855 }, { "epoch": 0.7936846596970344, "grad_norm": 5.177937984466553, "learning_rate": 1.6349557522123895e-05, "loss": 0.5446, "step": 1860 }, { "epoch": 0.7958182206101985, "grad_norm": 5.484969139099121, "learning_rate": 1.633375474083439e-05, "loss": 0.5278, "step": 1865 }, { "epoch": 0.7979517815233625, "grad_norm": 5.092281341552734, "learning_rate": 1.631795195954488e-05, "loss": 0.5452, "step": 1870 }, { "epoch": 0.8000853424365265, "grad_norm": 4.620217323303223, "learning_rate": 1.6302149178255375e-05, "loss": 0.5923, "step": 1875 }, { "epoch": 0.8022189033496906, "grad_norm": 4.69462776184082, "learning_rate": 1.6286346396965866e-05, "loss": 0.4944, "step": 1880 }, { "epoch": 0.8043524642628547, "grad_norm": 4.684912204742432, "learning_rate": 1.627054361567636e-05, "loss": 0.5447, "step": 1885 }, { "epoch": 0.8064860251760188, "grad_norm": 5.183788299560547, "learning_rate": 1.6254740834386852e-05, "loss": 0.5575, "step": 1890 }, { "epoch": 0.8086195860891828, "grad_norm": 4.963923931121826, "learning_rate": 1.6238938053097346e-05, "loss": 0.5922, "step": 1895 }, { "epoch": 0.8107531470023469, "grad_norm": 5.103732585906982, "learning_rate": 1.622313527180784e-05, "loss": 0.498, "step": 1900 }, { "epoch": 0.812886707915511, "grad_norm": 5.205685615539551, "learning_rate": 1.6207332490518332e-05, "loss": 0.4965, "step": 1905 }, { "epoch": 0.8150202688286751, "grad_norm": 5.297272682189941, "learning_rate": 1.6191529709228827e-05, "loss": 0.5762, "step": 1910 }, { "epoch": 0.8171538297418391, "grad_norm": 3.759392023086548, "learning_rate": 1.6175726927939318e-05, "loss": 0.5036, "step": 1915 }, { "epoch": 0.8192873906550032, "grad_norm": 4.451435089111328, "learning_rate": 1.6159924146649813e-05, "loss": 0.5089, "step": 1920 }, { "epoch": 0.8214209515681673, "grad_norm": 5.265392780303955, "learning_rate": 1.6144121365360307e-05, "loss": 0.5219, "step": 1925 }, { "epoch": 0.8235545124813314, "grad_norm": 5.984591960906982, "learning_rate": 1.6128318584070798e-05, "loss": 0.5821, "step": 1930 }, { "epoch": 0.8256880733944955, "grad_norm": 4.420891761779785, "learning_rate": 1.611251580278129e-05, "loss": 0.4672, "step": 1935 }, { "epoch": 0.8278216343076595, "grad_norm": 3.7576348781585693, "learning_rate": 1.6096713021491784e-05, "loss": 0.4639, "step": 1940 }, { "epoch": 0.8299551952208235, "grad_norm": 5.487934112548828, "learning_rate": 1.6080910240202275e-05, "loss": 0.518, "step": 1945 }, { "epoch": 0.8320887561339876, "grad_norm": 4.478968620300293, "learning_rate": 1.606510745891277e-05, "loss": 0.4295, "step": 1950 }, { "epoch": 0.8342223170471517, "grad_norm": 5.025543212890625, "learning_rate": 1.6049304677623264e-05, "loss": 0.4736, "step": 1955 }, { "epoch": 0.8363558779603157, "grad_norm": 4.544827461242676, "learning_rate": 1.6033501896333756e-05, "loss": 0.5154, "step": 1960 }, { "epoch": 0.8384894388734798, "grad_norm": 5.281660079956055, "learning_rate": 1.601769911504425e-05, "loss": 0.598, "step": 1965 }, { "epoch": 0.8406229997866439, "grad_norm": 4.328815460205078, "learning_rate": 1.600189633375474e-05, "loss": 0.52, "step": 1970 }, { "epoch": 0.842756560699808, "grad_norm": 4.648770332336426, "learning_rate": 1.5986093552465236e-05, "loss": 0.438, "step": 1975 }, { "epoch": 0.8448901216129721, "grad_norm": 4.124217987060547, "learning_rate": 1.5970290771175727e-05, "loss": 0.4639, "step": 1980 }, { "epoch": 0.8470236825261361, "grad_norm": 4.548424243927002, "learning_rate": 1.595448798988622e-05, "loss": 0.5121, "step": 1985 }, { "epoch": 0.8491572434393002, "grad_norm": 4.506327152252197, "learning_rate": 1.5938685208596713e-05, "loss": 0.4759, "step": 1990 }, { "epoch": 0.8512908043524643, "grad_norm": 4.667163848876953, "learning_rate": 1.5922882427307207e-05, "loss": 0.5611, "step": 1995 }, { "epoch": 0.8534243652656284, "grad_norm": 4.311827182769775, "learning_rate": 1.59070796460177e-05, "loss": 0.5262, "step": 2000 }, { "epoch": 0.8555579261787924, "grad_norm": 4.407560348510742, "learning_rate": 1.5891276864728193e-05, "loss": 0.5217, "step": 2005 }, { "epoch": 0.8576914870919565, "grad_norm": 5.029135704040527, "learning_rate": 1.5875474083438688e-05, "loss": 0.4964, "step": 2010 }, { "epoch": 0.8598250480051205, "grad_norm": 4.426456451416016, "learning_rate": 1.585967130214918e-05, "loss": 0.4598, "step": 2015 }, { "epoch": 0.8619586089182846, "grad_norm": 5.080218315124512, "learning_rate": 1.5843868520859673e-05, "loss": 0.525, "step": 2020 }, { "epoch": 0.8640921698314487, "grad_norm": 4.200524806976318, "learning_rate": 1.5828065739570168e-05, "loss": 0.4956, "step": 2025 }, { "epoch": 0.8662257307446127, "grad_norm": 5.445434093475342, "learning_rate": 1.581226295828066e-05, "loss": 0.5389, "step": 2030 }, { "epoch": 0.8683592916577768, "grad_norm": 4.662383556365967, "learning_rate": 1.579646017699115e-05, "loss": 0.4701, "step": 2035 }, { "epoch": 0.8704928525709409, "grad_norm": 4.182600498199463, "learning_rate": 1.5780657395701645e-05, "loss": 0.5139, "step": 2040 }, { "epoch": 0.872626413484105, "grad_norm": 4.490233421325684, "learning_rate": 1.5764854614412136e-05, "loss": 0.4604, "step": 2045 }, { "epoch": 0.874759974397269, "grad_norm": 5.007779598236084, "learning_rate": 1.574905183312263e-05, "loss": 0.5102, "step": 2050 }, { "epoch": 0.8768935353104331, "grad_norm": 5.043194770812988, "learning_rate": 1.5733249051833125e-05, "loss": 0.4905, "step": 2055 }, { "epoch": 0.8790270962235972, "grad_norm": 4.701770782470703, "learning_rate": 1.5717446270543616e-05, "loss": 0.4595, "step": 2060 }, { "epoch": 0.8811606571367613, "grad_norm": 5.017876148223877, "learning_rate": 1.570164348925411e-05, "loss": 0.5007, "step": 2065 }, { "epoch": 0.8832942180499254, "grad_norm": 3.628516435623169, "learning_rate": 1.5685840707964602e-05, "loss": 0.438, "step": 2070 }, { "epoch": 0.8854277789630894, "grad_norm": 4.184997081756592, "learning_rate": 1.5670037926675097e-05, "loss": 0.4689, "step": 2075 }, { "epoch": 0.8875613398762535, "grad_norm": 4.757014751434326, "learning_rate": 1.565423514538559e-05, "loss": 0.4569, "step": 2080 }, { "epoch": 0.8896949007894175, "grad_norm": 3.718052625656128, "learning_rate": 1.5638432364096082e-05, "loss": 0.5259, "step": 2085 }, { "epoch": 0.8918284617025816, "grad_norm": 3.926389217376709, "learning_rate": 1.5622629582806574e-05, "loss": 0.4039, "step": 2090 }, { "epoch": 0.8939620226157456, "grad_norm": 5.952889919281006, "learning_rate": 1.5606826801517068e-05, "loss": 0.4846, "step": 2095 }, { "epoch": 0.8960955835289097, "grad_norm": 4.66580057144165, "learning_rate": 1.559102402022756e-05, "loss": 0.4887, "step": 2100 }, { "epoch": 0.8982291444420738, "grad_norm": 4.7099103927612305, "learning_rate": 1.5575221238938054e-05, "loss": 0.5263, "step": 2105 }, { "epoch": 0.9003627053552379, "grad_norm": 4.66274881362915, "learning_rate": 1.555941845764855e-05, "loss": 0.4924, "step": 2110 }, { "epoch": 0.902496266268402, "grad_norm": 4.509885311126709, "learning_rate": 1.554361567635904e-05, "loss": 0.4973, "step": 2115 }, { "epoch": 0.904629827181566, "grad_norm": 4.4420647621154785, "learning_rate": 1.5527812895069534e-05, "loss": 0.4772, "step": 2120 }, { "epoch": 0.9067633880947301, "grad_norm": 4.7722554206848145, "learning_rate": 1.551201011378003e-05, "loss": 0.5006, "step": 2125 }, { "epoch": 0.9088969490078942, "grad_norm": 3.9744837284088135, "learning_rate": 1.549620733249052e-05, "loss": 0.4475, "step": 2130 }, { "epoch": 0.9110305099210583, "grad_norm": 4.898085594177246, "learning_rate": 1.5480404551201015e-05, "loss": 0.497, "step": 2135 }, { "epoch": 0.9131640708342224, "grad_norm": 4.290428161621094, "learning_rate": 1.5464601769911506e-05, "loss": 0.4469, "step": 2140 }, { "epoch": 0.9152976317473864, "grad_norm": 4.357161998748779, "learning_rate": 1.5448798988621997e-05, "loss": 0.4795, "step": 2145 }, { "epoch": 0.9174311926605505, "grad_norm": 4.403848171234131, "learning_rate": 1.543299620733249e-05, "loss": 0.5059, "step": 2150 }, { "epoch": 0.9195647535737145, "grad_norm": 4.178798675537109, "learning_rate": 1.5417193426042986e-05, "loss": 0.4438, "step": 2155 }, { "epoch": 0.9216983144868786, "grad_norm": 5.097316265106201, "learning_rate": 1.5401390644753477e-05, "loss": 0.4385, "step": 2160 }, { "epoch": 0.9238318754000426, "grad_norm": 4.63726282119751, "learning_rate": 1.5385587863463972e-05, "loss": 0.5495, "step": 2165 }, { "epoch": 0.9259654363132067, "grad_norm": 3.993021011352539, "learning_rate": 1.5369785082174463e-05, "loss": 0.45, "step": 2170 }, { "epoch": 0.9280989972263708, "grad_norm": 4.778984069824219, "learning_rate": 1.5353982300884958e-05, "loss": 0.5123, "step": 2175 }, { "epoch": 0.9302325581395349, "grad_norm": 4.495462894439697, "learning_rate": 1.5338179519595452e-05, "loss": 0.4433, "step": 2180 }, { "epoch": 0.932366119052699, "grad_norm": 4.673117160797119, "learning_rate": 1.5322376738305943e-05, "loss": 0.4567, "step": 2185 }, { "epoch": 0.934499679965863, "grad_norm": 4.821684837341309, "learning_rate": 1.5306573957016435e-05, "loss": 0.4398, "step": 2190 }, { "epoch": 0.9366332408790271, "grad_norm": 4.811855316162109, "learning_rate": 1.529077117572693e-05, "loss": 0.4711, "step": 2195 }, { "epoch": 0.9387668017921912, "grad_norm": 4.224590301513672, "learning_rate": 1.527496839443742e-05, "loss": 0.4646, "step": 2200 }, { "epoch": 0.9409003627053553, "grad_norm": 5.050698280334473, "learning_rate": 1.5259165613147915e-05, "loss": 0.4591, "step": 2205 }, { "epoch": 0.9430339236185193, "grad_norm": 4.678995609283447, "learning_rate": 1.5243362831858408e-05, "loss": 0.4937, "step": 2210 }, { "epoch": 0.9451674845316834, "grad_norm": 3.495490789413452, "learning_rate": 1.52275600505689e-05, "loss": 0.4774, "step": 2215 }, { "epoch": 0.9473010454448475, "grad_norm": 4.670935153961182, "learning_rate": 1.5211757269279395e-05, "loss": 0.4905, "step": 2220 }, { "epoch": 0.9494346063580115, "grad_norm": 4.865047454833984, "learning_rate": 1.5195954487989888e-05, "loss": 0.4965, "step": 2225 }, { "epoch": 0.9515681672711755, "grad_norm": 3.9217941761016846, "learning_rate": 1.5180151706700381e-05, "loss": 0.4581, "step": 2230 }, { "epoch": 0.9537017281843396, "grad_norm": 4.600928783416748, "learning_rate": 1.5164348925410874e-05, "loss": 0.4821, "step": 2235 }, { "epoch": 0.9558352890975037, "grad_norm": 4.538738250732422, "learning_rate": 1.5148546144121368e-05, "loss": 0.4719, "step": 2240 }, { "epoch": 0.9579688500106678, "grad_norm": 4.893208026885986, "learning_rate": 1.513274336283186e-05, "loss": 0.522, "step": 2245 }, { "epoch": 0.9601024109238319, "grad_norm": 4.285687446594238, "learning_rate": 1.5116940581542352e-05, "loss": 0.5162, "step": 2250 }, { "epoch": 0.9622359718369959, "grad_norm": 4.50706148147583, "learning_rate": 1.5101137800252845e-05, "loss": 0.5248, "step": 2255 }, { "epoch": 0.96436953275016, "grad_norm": 4.416393756866455, "learning_rate": 1.5085335018963338e-05, "loss": 0.5118, "step": 2260 }, { "epoch": 0.9665030936633241, "grad_norm": 3.7533748149871826, "learning_rate": 1.5069532237673831e-05, "loss": 0.4359, "step": 2265 }, { "epoch": 0.9686366545764882, "grad_norm": 4.33950662612915, "learning_rate": 1.5053729456384326e-05, "loss": 0.5012, "step": 2270 }, { "epoch": 0.9707702154896523, "grad_norm": 4.270744800567627, "learning_rate": 1.5037926675094818e-05, "loss": 0.4344, "step": 2275 }, { "epoch": 0.9729037764028163, "grad_norm": 4.739436626434326, "learning_rate": 1.5022123893805311e-05, "loss": 0.5507, "step": 2280 }, { "epoch": 0.9750373373159804, "grad_norm": 4.521956920623779, "learning_rate": 1.5006321112515804e-05, "loss": 0.4657, "step": 2285 }, { "epoch": 0.9771708982291445, "grad_norm": 4.782139778137207, "learning_rate": 1.4990518331226299e-05, "loss": 0.471, "step": 2290 }, { "epoch": 0.9793044591423085, "grad_norm": 3.9926064014434814, "learning_rate": 1.4974715549936788e-05, "loss": 0.4363, "step": 2295 }, { "epoch": 0.9814380200554725, "grad_norm": 4.13399076461792, "learning_rate": 1.4958912768647283e-05, "loss": 0.4178, "step": 2300 }, { "epoch": 0.9835715809686366, "grad_norm": 4.709731101989746, "learning_rate": 1.4943109987357776e-05, "loss": 0.4703, "step": 2305 }, { "epoch": 0.9857051418818007, "grad_norm": 4.387861251831055, "learning_rate": 1.4927307206068269e-05, "loss": 0.4736, "step": 2310 }, { "epoch": 0.9878387027949648, "grad_norm": 4.607449531555176, "learning_rate": 1.4911504424778761e-05, "loss": 0.4432, "step": 2315 }, { "epoch": 0.9899722637081289, "grad_norm": 3.77618145942688, "learning_rate": 1.4895701643489256e-05, "loss": 0.4397, "step": 2320 }, { "epoch": 0.9921058246212929, "grad_norm": 4.32602071762085, "learning_rate": 1.4879898862199749e-05, "loss": 0.4539, "step": 2325 }, { "epoch": 0.994239385534457, "grad_norm": 4.454192161560059, "learning_rate": 1.4864096080910242e-05, "loss": 0.4944, "step": 2330 }, { "epoch": 0.9963729464476211, "grad_norm": 4.736626625061035, "learning_rate": 1.4848293299620735e-05, "loss": 0.553, "step": 2335 }, { "epoch": 0.9985065073607852, "grad_norm": 4.773657321929932, "learning_rate": 1.483249051833123e-05, "loss": 0.5004, "step": 2340 }, { "epoch": 1.0, "eval_evaluator": 0.9877204489141523, "eval_loss": 0.20730140805244446, "eval_runtime": 125.912, "eval_samples_per_second": 18.163, "eval_steps_per_second": 2.271, "step": 2344 }, { "epoch": 1.000426712182633, "grad_norm": 4.5521745681762695, "learning_rate": 1.4816687737041719e-05, "loss": 0.4931, "step": 2345 }, { "epoch": 1.002560273095797, "grad_norm": 4.24761438369751, "learning_rate": 1.4800884955752213e-05, "loss": 0.5427, "step": 2350 }, { "epoch": 1.004693834008961, "grad_norm": 4.0507612228393555, "learning_rate": 1.4785082174462706e-05, "loss": 0.5448, "step": 2355 }, { "epoch": 1.006827394922125, "grad_norm": 4.63831901550293, "learning_rate": 1.4769279393173199e-05, "loss": 0.4535, "step": 2360 }, { "epoch": 1.008960955835289, "grad_norm": 3.7772696018218994, "learning_rate": 1.4753476611883692e-05, "loss": 0.4225, "step": 2365 }, { "epoch": 1.011094516748453, "grad_norm": 5.008701324462891, "learning_rate": 1.4737673830594187e-05, "loss": 0.4968, "step": 2370 }, { "epoch": 1.0132280776616172, "grad_norm": 4.774904727935791, "learning_rate": 1.472187104930468e-05, "loss": 0.487, "step": 2375 }, { "epoch": 1.0153616385747812, "grad_norm": 4.383615970611572, "learning_rate": 1.4706068268015172e-05, "loss": 0.554, "step": 2380 }, { "epoch": 1.0174951994879453, "grad_norm": 5.2985663414001465, "learning_rate": 1.4690265486725665e-05, "loss": 0.5981, "step": 2385 }, { "epoch": 1.0196287604011094, "grad_norm": 4.582570552825928, "learning_rate": 1.4674462705436158e-05, "loss": 0.4725, "step": 2390 }, { "epoch": 1.0217623213142735, "grad_norm": 4.422050476074219, "learning_rate": 1.4658659924146653e-05, "loss": 0.595, "step": 2395 }, { "epoch": 1.0238958822274375, "grad_norm": 4.118880748748779, "learning_rate": 1.4642857142857144e-05, "loss": 0.5523, "step": 2400 }, { "epoch": 1.0260294431406016, "grad_norm": 5.410613536834717, "learning_rate": 1.4627054361567637e-05, "loss": 0.5019, "step": 2405 }, { "epoch": 1.0281630040537657, "grad_norm": 4.719892978668213, "learning_rate": 1.461125158027813e-05, "loss": 0.4736, "step": 2410 }, { "epoch": 1.0302965649669298, "grad_norm": 3.992363452911377, "learning_rate": 1.4595448798988622e-05, "loss": 0.5149, "step": 2415 }, { "epoch": 1.0324301258800939, "grad_norm": 4.250847816467285, "learning_rate": 1.4579646017699117e-05, "loss": 0.4451, "step": 2420 }, { "epoch": 1.034563686793258, "grad_norm": 3.982151508331299, "learning_rate": 1.456384323640961e-05, "loss": 0.437, "step": 2425 }, { "epoch": 1.036697247706422, "grad_norm": 4.611386299133301, "learning_rate": 1.4548040455120103e-05, "loss": 0.5862, "step": 2430 }, { "epoch": 1.038830808619586, "grad_norm": 4.45111608505249, "learning_rate": 1.4532237673830596e-05, "loss": 0.488, "step": 2435 }, { "epoch": 1.0409643695327502, "grad_norm": 4.2482757568359375, "learning_rate": 1.4516434892541088e-05, "loss": 0.4786, "step": 2440 }, { "epoch": 1.0430979304459143, "grad_norm": 4.836545467376709, "learning_rate": 1.4500632111251583e-05, "loss": 0.5068, "step": 2445 }, { "epoch": 1.0452314913590783, "grad_norm": 4.293904781341553, "learning_rate": 1.4484829329962074e-05, "loss": 0.4296, "step": 2450 }, { "epoch": 1.0473650522722424, "grad_norm": 4.53025484085083, "learning_rate": 1.4469026548672567e-05, "loss": 0.4776, "step": 2455 }, { "epoch": 1.0494986131854065, "grad_norm": 3.6319494247436523, "learning_rate": 1.445322376738306e-05, "loss": 0.4776, "step": 2460 }, { "epoch": 1.0516321740985706, "grad_norm": 4.650353908538818, "learning_rate": 1.4437420986093553e-05, "loss": 0.5713, "step": 2465 }, { "epoch": 1.0537657350117346, "grad_norm": 4.444876194000244, "learning_rate": 1.4421618204804046e-05, "loss": 0.4727, "step": 2470 }, { "epoch": 1.0558992959248987, "grad_norm": 4.457956314086914, "learning_rate": 1.440581542351454e-05, "loss": 0.5128, "step": 2475 }, { "epoch": 1.0580328568380628, "grad_norm": 5.207244873046875, "learning_rate": 1.4390012642225033e-05, "loss": 0.5382, "step": 2480 }, { "epoch": 1.0601664177512269, "grad_norm": 4.9665398597717285, "learning_rate": 1.4374209860935526e-05, "loss": 0.4477, "step": 2485 }, { "epoch": 1.062299978664391, "grad_norm": 4.235536575317383, "learning_rate": 1.4358407079646019e-05, "loss": 0.4534, "step": 2490 }, { "epoch": 1.0644335395775548, "grad_norm": 4.579362869262695, "learning_rate": 1.4342604298356513e-05, "loss": 0.5227, "step": 2495 }, { "epoch": 1.066567100490719, "grad_norm": 5.049173355102539, "learning_rate": 1.4326801517067006e-05, "loss": 0.4626, "step": 2500 }, { "epoch": 1.068700661403883, "grad_norm": 3.9807817935943604, "learning_rate": 1.4310998735777498e-05, "loss": 0.5692, "step": 2505 }, { "epoch": 1.070834222317047, "grad_norm": 4.704874515533447, "learning_rate": 1.429519595448799e-05, "loss": 0.4777, "step": 2510 }, { "epoch": 1.0729677832302111, "grad_norm": 4.278888702392578, "learning_rate": 1.4279393173198483e-05, "loss": 0.4565, "step": 2515 }, { "epoch": 1.0751013441433752, "grad_norm": 4.737946510314941, "learning_rate": 1.4263590391908976e-05, "loss": 0.5691, "step": 2520 }, { "epoch": 1.0772349050565393, "grad_norm": 4.856266498565674, "learning_rate": 1.424778761061947e-05, "loss": 0.6618, "step": 2525 }, { "epoch": 1.0793684659697034, "grad_norm": 4.112383842468262, "learning_rate": 1.4231984829329964e-05, "loss": 0.513, "step": 2530 }, { "epoch": 1.0815020268828675, "grad_norm": 4.354598522186279, "learning_rate": 1.4216182048040456e-05, "loss": 0.4544, "step": 2535 }, { "epoch": 1.0836355877960315, "grad_norm": 4.665713787078857, "learning_rate": 1.420037926675095e-05, "loss": 0.476, "step": 2540 }, { "epoch": 1.0857691487091956, "grad_norm": 4.962756633758545, "learning_rate": 1.4184576485461444e-05, "loss": 0.4708, "step": 2545 }, { "epoch": 1.0879027096223597, "grad_norm": 5.993206024169922, "learning_rate": 1.4168773704171937e-05, "loss": 0.5527, "step": 2550 }, { "epoch": 1.0900362705355238, "grad_norm": 4.374881744384766, "learning_rate": 1.4152970922882428e-05, "loss": 0.4507, "step": 2555 }, { "epoch": 1.0921698314486878, "grad_norm": 4.0649590492248535, "learning_rate": 1.413716814159292e-05, "loss": 0.5308, "step": 2560 }, { "epoch": 1.094303392361852, "grad_norm": 4.3973917961120605, "learning_rate": 1.4121365360303414e-05, "loss": 0.461, "step": 2565 }, { "epoch": 1.096436953275016, "grad_norm": 3.8453307151794434, "learning_rate": 1.4105562579013907e-05, "loss": 0.4482, "step": 2570 }, { "epoch": 1.09857051418818, "grad_norm": 4.03675651550293, "learning_rate": 1.4089759797724401e-05, "loss": 0.4656, "step": 2575 }, { "epoch": 1.1007040751013442, "grad_norm": 4.694682598114014, "learning_rate": 1.4073957016434894e-05, "loss": 0.5048, "step": 2580 }, { "epoch": 1.1028376360145082, "grad_norm": 4.792722225189209, "learning_rate": 1.4058154235145387e-05, "loss": 0.4857, "step": 2585 }, { "epoch": 1.1049711969276723, "grad_norm": 4.986560821533203, "learning_rate": 1.404235145385588e-05, "loss": 0.5365, "step": 2590 }, { "epoch": 1.1071047578408364, "grad_norm": 3.972780466079712, "learning_rate": 1.4026548672566374e-05, "loss": 0.4044, "step": 2595 }, { "epoch": 1.1092383187540005, "grad_norm": 4.662002086639404, "learning_rate": 1.4010745891276867e-05, "loss": 0.4476, "step": 2600 }, { "epoch": 1.1113718796671646, "grad_norm": 5.053280353546143, "learning_rate": 1.3994943109987358e-05, "loss": 0.6451, "step": 2605 }, { "epoch": 1.1135054405803286, "grad_norm": 3.8143022060394287, "learning_rate": 1.3979140328697851e-05, "loss": 0.4513, "step": 2610 }, { "epoch": 1.1156390014934927, "grad_norm": 3.985607385635376, "learning_rate": 1.3963337547408344e-05, "loss": 0.3723, "step": 2615 }, { "epoch": 1.1177725624066568, "grad_norm": 4.714868068695068, "learning_rate": 1.3947534766118837e-05, "loss": 0.4665, "step": 2620 }, { "epoch": 1.1199061233198209, "grad_norm": 4.590309143066406, "learning_rate": 1.3931731984829332e-05, "loss": 0.522, "step": 2625 }, { "epoch": 1.122039684232985, "grad_norm": 4.820577144622803, "learning_rate": 1.3915929203539824e-05, "loss": 0.5245, "step": 2630 }, { "epoch": 1.124173245146149, "grad_norm": 4.013303756713867, "learning_rate": 1.3900126422250317e-05, "loss": 0.4309, "step": 2635 }, { "epoch": 1.126306806059313, "grad_norm": 4.0792717933654785, "learning_rate": 1.388432364096081e-05, "loss": 0.4553, "step": 2640 }, { "epoch": 1.1284403669724772, "grad_norm": 4.793567180633545, "learning_rate": 1.3868520859671303e-05, "loss": 0.4862, "step": 2645 }, { "epoch": 1.130573927885641, "grad_norm": 4.6147260665893555, "learning_rate": 1.3852718078381798e-05, "loss": 0.5337, "step": 2650 }, { "epoch": 1.1327074887988051, "grad_norm": 3.888552665710449, "learning_rate": 1.383691529709229e-05, "loss": 0.4743, "step": 2655 }, { "epoch": 1.1348410497119692, "grad_norm": 4.152586460113525, "learning_rate": 1.3821112515802782e-05, "loss": 0.4623, "step": 2660 }, { "epoch": 1.1369746106251333, "grad_norm": 4.437031269073486, "learning_rate": 1.3805309734513275e-05, "loss": 0.5064, "step": 2665 }, { "epoch": 1.1391081715382974, "grad_norm": 4.659839153289795, "learning_rate": 1.3789506953223767e-05, "loss": 0.503, "step": 2670 }, { "epoch": 1.1412417324514614, "grad_norm": 4.968484401702881, "learning_rate": 1.3773704171934262e-05, "loss": 0.5103, "step": 2675 }, { "epoch": 1.1433752933646255, "grad_norm": 4.300429344177246, "learning_rate": 1.3757901390644755e-05, "loss": 0.4481, "step": 2680 }, { "epoch": 1.1455088542777896, "grad_norm": 4.291085243225098, "learning_rate": 1.3742098609355248e-05, "loss": 0.4804, "step": 2685 }, { "epoch": 1.1476424151909537, "grad_norm": 5.287739276885986, "learning_rate": 1.372629582806574e-05, "loss": 0.4587, "step": 2690 }, { "epoch": 1.1497759761041177, "grad_norm": 4.765116214752197, "learning_rate": 1.3710493046776234e-05, "loss": 0.5718, "step": 2695 }, { "epoch": 1.1519095370172818, "grad_norm": 4.568051815032959, "learning_rate": 1.3694690265486728e-05, "loss": 0.428, "step": 2700 }, { "epoch": 1.154043097930446, "grad_norm": 5.483386516571045, "learning_rate": 1.3678887484197221e-05, "loss": 0.4927, "step": 2705 }, { "epoch": 1.15617665884361, "grad_norm": 3.856008529663086, "learning_rate": 1.3663084702907712e-05, "loss": 0.4218, "step": 2710 }, { "epoch": 1.158310219756774, "grad_norm": 4.166346549987793, "learning_rate": 1.3647281921618205e-05, "loss": 0.4965, "step": 2715 }, { "epoch": 1.1604437806699381, "grad_norm": 3.648595094680786, "learning_rate": 1.3631479140328698e-05, "loss": 0.4098, "step": 2720 }, { "epoch": 1.1625773415831022, "grad_norm": 4.0944695472717285, "learning_rate": 1.361567635903919e-05, "loss": 0.4597, "step": 2725 }, { "epoch": 1.1647109024962663, "grad_norm": 4.698819160461426, "learning_rate": 1.3599873577749685e-05, "loss": 0.4558, "step": 2730 }, { "epoch": 1.1668444634094304, "grad_norm": 4.123109340667725, "learning_rate": 1.3584070796460178e-05, "loss": 0.4923, "step": 2735 }, { "epoch": 1.1689780243225945, "grad_norm": 3.7588372230529785, "learning_rate": 1.3568268015170671e-05, "loss": 0.5261, "step": 2740 }, { "epoch": 1.1711115852357585, "grad_norm": 4.115803241729736, "learning_rate": 1.3552465233881164e-05, "loss": 0.5161, "step": 2745 }, { "epoch": 1.1732451461489226, "grad_norm": 4.102150917053223, "learning_rate": 1.3536662452591659e-05, "loss": 0.524, "step": 2750 }, { "epoch": 1.1753787070620867, "grad_norm": 5.2263264656066895, "learning_rate": 1.3520859671302151e-05, "loss": 0.4531, "step": 2755 }, { "epoch": 1.1775122679752508, "grad_norm": 4.422746181488037, "learning_rate": 1.3505056890012644e-05, "loss": 0.5125, "step": 2760 }, { "epoch": 1.1796458288884148, "grad_norm": 5.224121570587158, "learning_rate": 1.3489254108723135e-05, "loss": 0.5122, "step": 2765 }, { "epoch": 1.181779389801579, "grad_norm": 4.187285423278809, "learning_rate": 1.3473451327433628e-05, "loss": 0.4124, "step": 2770 }, { "epoch": 1.1839129507147428, "grad_norm": 4.32193660736084, "learning_rate": 1.3457648546144121e-05, "loss": 0.7222, "step": 2775 }, { "epoch": 1.1860465116279069, "grad_norm": 4.2760539054870605, "learning_rate": 1.3441845764854616e-05, "loss": 0.4534, "step": 2780 }, { "epoch": 1.188180072541071, "grad_norm": 5.836961269378662, "learning_rate": 1.3426042983565109e-05, "loss": 0.5466, "step": 2785 }, { "epoch": 1.190313633454235, "grad_norm": 3.944058656692505, "learning_rate": 1.3410240202275602e-05, "loss": 0.5265, "step": 2790 }, { "epoch": 1.192447194367399, "grad_norm": 3.726409912109375, "learning_rate": 1.3394437420986094e-05, "loss": 0.4826, "step": 2795 }, { "epoch": 1.1945807552805632, "grad_norm": 4.134906768798828, "learning_rate": 1.3378634639696589e-05, "loss": 0.3948, "step": 2800 }, { "epoch": 1.1967143161937273, "grad_norm": 4.931777000427246, "learning_rate": 1.3362831858407082e-05, "loss": 0.5772, "step": 2805 }, { "epoch": 1.1988478771068913, "grad_norm": 4.482186794281006, "learning_rate": 1.3347029077117575e-05, "loss": 0.4961, "step": 2810 }, { "epoch": 1.2009814380200554, "grad_norm": 4.433976173400879, "learning_rate": 1.3331226295828066e-05, "loss": 0.4958, "step": 2815 }, { "epoch": 1.2031149989332195, "grad_norm": 4.163667678833008, "learning_rate": 1.3315423514538559e-05, "loss": 0.4373, "step": 2820 }, { "epoch": 1.2052485598463836, "grad_norm": 4.757978916168213, "learning_rate": 1.3299620733249052e-05, "loss": 0.4558, "step": 2825 }, { "epoch": 1.2073821207595477, "grad_norm": 5.00388240814209, "learning_rate": 1.3283817951959546e-05, "loss": 0.5777, "step": 2830 }, { "epoch": 1.2095156816727117, "grad_norm": 3.7060387134552, "learning_rate": 1.3268015170670039e-05, "loss": 0.4237, "step": 2835 }, { "epoch": 1.2116492425858758, "grad_norm": 3.6805360317230225, "learning_rate": 1.3252212389380532e-05, "loss": 0.435, "step": 2840 }, { "epoch": 1.2137828034990399, "grad_norm": 5.00631046295166, "learning_rate": 1.3236409608091025e-05, "loss": 0.5261, "step": 2845 }, { "epoch": 1.215916364412204, "grad_norm": 4.413496971130371, "learning_rate": 1.322060682680152e-05, "loss": 0.486, "step": 2850 }, { "epoch": 1.218049925325368, "grad_norm": 4.5624098777771, "learning_rate": 1.3204804045512012e-05, "loss": 0.4067, "step": 2855 }, { "epoch": 1.2201834862385321, "grad_norm": 4.407283782958984, "learning_rate": 1.3189001264222505e-05, "loss": 0.5407, "step": 2860 }, { "epoch": 1.2223170471516962, "grad_norm": 4.349571228027344, "learning_rate": 1.3173198482932996e-05, "loss": 0.4533, "step": 2865 }, { "epoch": 1.2244506080648603, "grad_norm": 4.270420074462891, "learning_rate": 1.315739570164349e-05, "loss": 0.4071, "step": 2870 }, { "epoch": 1.2265841689780244, "grad_norm": 4.370794296264648, "learning_rate": 1.3141592920353982e-05, "loss": 0.591, "step": 2875 }, { "epoch": 1.2287177298911884, "grad_norm": 4.321391582489014, "learning_rate": 1.3125790139064477e-05, "loss": 0.6181, "step": 2880 }, { "epoch": 1.2308512908043525, "grad_norm": 4.3742523193359375, "learning_rate": 1.310998735777497e-05, "loss": 0.4433, "step": 2885 }, { "epoch": 1.2329848517175166, "grad_norm": 4.781976699829102, "learning_rate": 1.3094184576485462e-05, "loss": 0.4551, "step": 2890 }, { "epoch": 1.2351184126306807, "grad_norm": 4.505169868469238, "learning_rate": 1.3078381795195955e-05, "loss": 0.4241, "step": 2895 }, { "epoch": 1.2372519735438448, "grad_norm": 4.392031669616699, "learning_rate": 1.3062579013906448e-05, "loss": 0.4712, "step": 2900 }, { "epoch": 1.2393855344570088, "grad_norm": 4.21456241607666, "learning_rate": 1.3046776232616943e-05, "loss": 0.4937, "step": 2905 }, { "epoch": 1.241519095370173, "grad_norm": 5.207596778869629, "learning_rate": 1.3030973451327436e-05, "loss": 0.5162, "step": 2910 }, { "epoch": 1.243652656283337, "grad_norm": 4.438053607940674, "learning_rate": 1.3015170670037928e-05, "loss": 0.4604, "step": 2915 }, { "epoch": 1.245786217196501, "grad_norm": 4.927883148193359, "learning_rate": 1.299936788874842e-05, "loss": 0.5275, "step": 2920 }, { "epoch": 1.2479197781096651, "grad_norm": 4.349573135375977, "learning_rate": 1.2983565107458913e-05, "loss": 0.4295, "step": 2925 }, { "epoch": 1.2500533390228292, "grad_norm": 3.778911590576172, "learning_rate": 1.2967762326169407e-05, "loss": 0.3889, "step": 2930 }, { "epoch": 1.2521868999359933, "grad_norm": 3.9124295711517334, "learning_rate": 1.29519595448799e-05, "loss": 0.4719, "step": 2935 }, { "epoch": 1.2543204608491574, "grad_norm": 4.444380283355713, "learning_rate": 1.2936156763590393e-05, "loss": 0.4852, "step": 2940 }, { "epoch": 1.2564540217623212, "grad_norm": 4.332208633422852, "learning_rate": 1.2920353982300886e-05, "loss": 0.4006, "step": 2945 }, { "epoch": 1.2585875826754853, "grad_norm": 4.473910331726074, "learning_rate": 1.2904551201011379e-05, "loss": 0.4696, "step": 2950 }, { "epoch": 1.2607211435886494, "grad_norm": 4.439748287200928, "learning_rate": 1.2888748419721873e-05, "loss": 0.426, "step": 2955 }, { "epoch": 1.2628547045018135, "grad_norm": 4.909393310546875, "learning_rate": 1.2872945638432366e-05, "loss": 0.4443, "step": 2960 }, { "epoch": 1.2649882654149776, "grad_norm": 3.3987629413604736, "learning_rate": 1.2857142857142859e-05, "loss": 0.4202, "step": 2965 }, { "epoch": 1.2671218263281416, "grad_norm": 5.1597418785095215, "learning_rate": 1.284134007585335e-05, "loss": 0.4982, "step": 2970 }, { "epoch": 1.2692553872413057, "grad_norm": 4.151483535766602, "learning_rate": 1.2825537294563843e-05, "loss": 0.4503, "step": 2975 }, { "epoch": 1.2713889481544698, "grad_norm": 4.21342658996582, "learning_rate": 1.2809734513274338e-05, "loss": 0.4007, "step": 2980 }, { "epoch": 1.2735225090676339, "grad_norm": 4.098394870758057, "learning_rate": 1.279393173198483e-05, "loss": 0.4376, "step": 2985 }, { "epoch": 1.275656069980798, "grad_norm": 4.1846723556518555, "learning_rate": 1.2778128950695323e-05, "loss": 0.4763, "step": 2990 }, { "epoch": 1.277789630893962, "grad_norm": 4.545650482177734, "learning_rate": 1.2762326169405816e-05, "loss": 0.4709, "step": 2995 }, { "epoch": 1.279923191807126, "grad_norm": 4.151042461395264, "learning_rate": 1.2746523388116309e-05, "loss": 0.5067, "step": 3000 }, { "epoch": 1.2820567527202902, "grad_norm": 3.928866386413574, "learning_rate": 1.2730720606826804e-05, "loss": 0.4276, "step": 3005 }, { "epoch": 1.2841903136334543, "grad_norm": 4.5378851890563965, "learning_rate": 1.2714917825537296e-05, "loss": 0.4038, "step": 3010 }, { "epoch": 1.2863238745466183, "grad_norm": 4.427021026611328, "learning_rate": 1.269911504424779e-05, "loss": 0.5393, "step": 3015 }, { "epoch": 1.2884574354597824, "grad_norm": 4.889599323272705, "learning_rate": 1.2683312262958282e-05, "loss": 0.4085, "step": 3020 }, { "epoch": 1.2905909963729465, "grad_norm": 3.6527562141418457, "learning_rate": 1.2667509481668773e-05, "loss": 0.415, "step": 3025 }, { "epoch": 1.2927245572861106, "grad_norm": 4.558052062988281, "learning_rate": 1.2651706700379266e-05, "loss": 0.5042, "step": 3030 }, { "epoch": 1.2948581181992747, "grad_norm": 5.053831577301025, "learning_rate": 1.2635903919089761e-05, "loss": 0.6117, "step": 3035 }, { "epoch": 1.2969916791124387, "grad_norm": 4.6014723777771, "learning_rate": 1.2620101137800254e-05, "loss": 0.4961, "step": 3040 }, { "epoch": 1.2991252400256028, "grad_norm": 4.463820934295654, "learning_rate": 1.2604298356510747e-05, "loss": 0.4577, "step": 3045 }, { "epoch": 1.3012588009387667, "grad_norm": 3.9237937927246094, "learning_rate": 1.258849557522124e-05, "loss": 0.4806, "step": 3050 }, { "epoch": 1.3033923618519307, "grad_norm": 4.813193321228027, "learning_rate": 1.2572692793931734e-05, "loss": 0.5106, "step": 3055 }, { "epoch": 1.3055259227650948, "grad_norm": 5.197639465332031, "learning_rate": 1.2556890012642227e-05, "loss": 0.4831, "step": 3060 }, { "epoch": 1.307659483678259, "grad_norm": 3.4804067611694336, "learning_rate": 1.254108723135272e-05, "loss": 0.463, "step": 3065 }, { "epoch": 1.309793044591423, "grad_norm": 5.394224166870117, "learning_rate": 1.2525284450063213e-05, "loss": 0.562, "step": 3070 }, { "epoch": 1.311926605504587, "grad_norm": 3.6379518508911133, "learning_rate": 1.2509481668773704e-05, "loss": 0.407, "step": 3075 }, { "epoch": 1.3140601664177511, "grad_norm": 3.515030860900879, "learning_rate": 1.2493678887484197e-05, "loss": 0.371, "step": 3080 }, { "epoch": 1.3161937273309152, "grad_norm": 4.32614803314209, "learning_rate": 1.2477876106194691e-05, "loss": 0.4977, "step": 3085 }, { "epoch": 1.3183272882440793, "grad_norm": 3.8737730979919434, "learning_rate": 1.2462073324905184e-05, "loss": 0.4592, "step": 3090 }, { "epoch": 1.3204608491572434, "grad_norm": 5.182056903839111, "learning_rate": 1.2446270543615677e-05, "loss": 0.4373, "step": 3095 }, { "epoch": 1.3225944100704075, "grad_norm": 4.620556831359863, "learning_rate": 1.243046776232617e-05, "loss": 0.41, "step": 3100 }, { "epoch": 1.3247279709835715, "grad_norm": 3.9290201663970947, "learning_rate": 1.2414664981036664e-05, "loss": 0.4909, "step": 3105 }, { "epoch": 1.3268615318967356, "grad_norm": 4.072641849517822, "learning_rate": 1.2398862199747157e-05, "loss": 0.4441, "step": 3110 }, { "epoch": 1.3289950928098997, "grad_norm": 3.8127377033233643, "learning_rate": 1.238305941845765e-05, "loss": 0.4104, "step": 3115 }, { "epoch": 1.3311286537230638, "grad_norm": 3.3174235820770264, "learning_rate": 1.2367256637168143e-05, "loss": 0.5863, "step": 3120 }, { "epoch": 1.3332622146362278, "grad_norm": 4.908815383911133, "learning_rate": 1.2351453855878634e-05, "loss": 0.4861, "step": 3125 }, { "epoch": 1.335395775549392, "grad_norm": 4.559378623962402, "learning_rate": 1.2335651074589127e-05, "loss": 0.448, "step": 3130 }, { "epoch": 1.337529336462556, "grad_norm": 5.478936195373535, "learning_rate": 1.2319848293299622e-05, "loss": 0.4856, "step": 3135 }, { "epoch": 1.33966289737572, "grad_norm": 3.962242364883423, "learning_rate": 1.2304045512010115e-05, "loss": 0.4659, "step": 3140 }, { "epoch": 1.3417964582888842, "grad_norm": 4.678815841674805, "learning_rate": 1.2288242730720607e-05, "loss": 0.4927, "step": 3145 }, { "epoch": 1.3439300192020482, "grad_norm": 4.564944744110107, "learning_rate": 1.22724399494311e-05, "loss": 0.3763, "step": 3150 }, { "epoch": 1.3460635801152123, "grad_norm": 4.308345794677734, "learning_rate": 1.2256637168141595e-05, "loss": 0.3982, "step": 3155 }, { "epoch": 1.3481971410283764, "grad_norm": 4.139796257019043, "learning_rate": 1.2240834386852088e-05, "loss": 0.4306, "step": 3160 }, { "epoch": 1.3503307019415405, "grad_norm": 3.8987877368927, "learning_rate": 1.222503160556258e-05, "loss": 0.4321, "step": 3165 }, { "epoch": 1.3524642628547046, "grad_norm": 4.241171360015869, "learning_rate": 1.2209228824273074e-05, "loss": 0.4122, "step": 3170 }, { "epoch": 1.3545978237678686, "grad_norm": 4.369818210601807, "learning_rate": 1.2193426042983566e-05, "loss": 0.4976, "step": 3175 }, { "epoch": 1.3567313846810327, "grad_norm": 4.028532028198242, "learning_rate": 1.2177623261694058e-05, "loss": 0.468, "step": 3180 }, { "epoch": 1.3588649455941968, "grad_norm": 5.026562690734863, "learning_rate": 1.2161820480404552e-05, "loss": 0.5257, "step": 3185 }, { "epoch": 1.3609985065073609, "grad_norm": 4.444199085235596, "learning_rate": 1.2146017699115045e-05, "loss": 0.4232, "step": 3190 }, { "epoch": 1.363132067420525, "grad_norm": 3.5891497135162354, "learning_rate": 1.2130214917825538e-05, "loss": 0.4477, "step": 3195 }, { "epoch": 1.365265628333689, "grad_norm": 4.004302024841309, "learning_rate": 1.211441213653603e-05, "loss": 0.4556, "step": 3200 }, { "epoch": 1.367399189246853, "grad_norm": 4.825848579406738, "learning_rate": 1.2098609355246524e-05, "loss": 0.5006, "step": 3205 }, { "epoch": 1.3695327501600172, "grad_norm": 3.6671814918518066, "learning_rate": 1.2082806573957018e-05, "loss": 0.4101, "step": 3210 }, { "epoch": 1.3716663110731813, "grad_norm": 3.879122018814087, "learning_rate": 1.2067003792667511e-05, "loss": 0.3969, "step": 3215 }, { "epoch": 1.3737998719863453, "grad_norm": 4.866029262542725, "learning_rate": 1.2051201011378004e-05, "loss": 0.3966, "step": 3220 }, { "epoch": 1.3759334328995092, "grad_norm": 5.581394672393799, "learning_rate": 1.2035398230088497e-05, "loss": 0.494, "step": 3225 }, { "epoch": 1.3780669938126733, "grad_norm": 4.49777364730835, "learning_rate": 1.2019595448798988e-05, "loss": 0.4584, "step": 3230 }, { "epoch": 1.3802005547258374, "grad_norm": 4.342195510864258, "learning_rate": 1.2003792667509483e-05, "loss": 0.5692, "step": 3235 }, { "epoch": 1.3823341156390014, "grad_norm": 4.519298076629639, "learning_rate": 1.1987989886219975e-05, "loss": 0.44, "step": 3240 }, { "epoch": 1.3844676765521655, "grad_norm": 3.987589120864868, "learning_rate": 1.1972187104930468e-05, "loss": 0.4181, "step": 3245 }, { "epoch": 1.3866012374653296, "grad_norm": 5.103501319885254, "learning_rate": 1.1956384323640961e-05, "loss": 0.486, "step": 3250 }, { "epoch": 1.3887347983784937, "grad_norm": 4.8116631507873535, "learning_rate": 1.1940581542351454e-05, "loss": 0.4839, "step": 3255 }, { "epoch": 1.3908683592916578, "grad_norm": 3.0158679485321045, "learning_rate": 1.1924778761061949e-05, "loss": 0.4425, "step": 3260 }, { "epoch": 1.3930019202048218, "grad_norm": 4.129778861999512, "learning_rate": 1.1908975979772442e-05, "loss": 0.4286, "step": 3265 }, { "epoch": 1.395135481117986, "grad_norm": 3.9445719718933105, "learning_rate": 1.1893173198482934e-05, "loss": 0.4605, "step": 3270 }, { "epoch": 1.39726904203115, "grad_norm": 4.372819900512695, "learning_rate": 1.1877370417193427e-05, "loss": 0.4891, "step": 3275 }, { "epoch": 1.399402602944314, "grad_norm": 4.265586853027344, "learning_rate": 1.1861567635903922e-05, "loss": 0.4053, "step": 3280 }, { "epoch": 1.4015361638574781, "grad_norm": 4.457062244415283, "learning_rate": 1.1845764854614411e-05, "loss": 0.4438, "step": 3285 }, { "epoch": 1.4036697247706422, "grad_norm": 3.898768663406372, "learning_rate": 1.1829962073324906e-05, "loss": 0.4735, "step": 3290 }, { "epoch": 1.4058032856838063, "grad_norm": 5.223904132843018, "learning_rate": 1.1814159292035399e-05, "loss": 0.5695, "step": 3295 }, { "epoch": 1.4079368465969704, "grad_norm": 4.149979591369629, "learning_rate": 1.1798356510745892e-05, "loss": 0.464, "step": 3300 }, { "epoch": 1.4100704075101345, "grad_norm": 4.598405838012695, "learning_rate": 1.1782553729456385e-05, "loss": 0.4787, "step": 3305 }, { "epoch": 1.4122039684232985, "grad_norm": 4.7623982429504395, "learning_rate": 1.1766750948166879e-05, "loss": 0.4302, "step": 3310 }, { "epoch": 1.4143375293364626, "grad_norm": 3.785916805267334, "learning_rate": 1.1750948166877372e-05, "loss": 0.4341, "step": 3315 }, { "epoch": 1.4164710902496267, "grad_norm": 3.600949764251709, "learning_rate": 1.1735145385587865e-05, "loss": 0.367, "step": 3320 }, { "epoch": 1.4186046511627908, "grad_norm": 4.522639751434326, "learning_rate": 1.1719342604298358e-05, "loss": 0.4708, "step": 3325 }, { "epoch": 1.4207382120759546, "grad_norm": 3.9708735942840576, "learning_rate": 1.1703539823008852e-05, "loss": 0.4751, "step": 3330 }, { "epoch": 1.4228717729891187, "grad_norm": 4.261918067932129, "learning_rate": 1.1687737041719342e-05, "loss": 0.4474, "step": 3335 }, { "epoch": 1.4250053339022828, "grad_norm": 4.602210521697998, "learning_rate": 1.1671934260429836e-05, "loss": 0.4168, "step": 3340 }, { "epoch": 1.4271388948154469, "grad_norm": 4.670740604400635, "learning_rate": 1.165613147914033e-05, "loss": 0.4383, "step": 3345 }, { "epoch": 1.429272455728611, "grad_norm": 4.135427951812744, "learning_rate": 1.1640328697850822e-05, "loss": 0.438, "step": 3350 }, { "epoch": 1.431406016641775, "grad_norm": 4.086421012878418, "learning_rate": 1.1624525916561315e-05, "loss": 0.4385, "step": 3355 }, { "epoch": 1.433539577554939, "grad_norm": 4.7353692054748535, "learning_rate": 1.160872313527181e-05, "loss": 0.4577, "step": 3360 }, { "epoch": 1.4356731384681032, "grad_norm": 4.442506313323975, "learning_rate": 1.1592920353982302e-05, "loss": 0.435, "step": 3365 }, { "epoch": 1.4378066993812673, "grad_norm": 4.130180358886719, "learning_rate": 1.1577117572692795e-05, "loss": 0.4775, "step": 3370 }, { "epoch": 1.4399402602944313, "grad_norm": 3.9793550968170166, "learning_rate": 1.1561314791403288e-05, "loss": 0.4046, "step": 3375 }, { "epoch": 1.4420738212075954, "grad_norm": 5.080319881439209, "learning_rate": 1.1545512010113781e-05, "loss": 0.4921, "step": 3380 }, { "epoch": 1.4442073821207595, "grad_norm": 4.066112041473389, "learning_rate": 1.1529709228824276e-05, "loss": 0.4148, "step": 3385 }, { "epoch": 1.4463409430339236, "grad_norm": 4.144713878631592, "learning_rate": 1.1513906447534767e-05, "loss": 0.4061, "step": 3390 }, { "epoch": 1.4484745039470877, "grad_norm": 4.271573543548584, "learning_rate": 1.149810366624526e-05, "loss": 0.3859, "step": 3395 }, { "epoch": 1.4506080648602517, "grad_norm": 3.9691481590270996, "learning_rate": 1.1482300884955753e-05, "loss": 0.5035, "step": 3400 }, { "epoch": 1.4527416257734158, "grad_norm": 5.163878917694092, "learning_rate": 1.1466498103666245e-05, "loss": 0.4537, "step": 3405 }, { "epoch": 1.45487518668658, "grad_norm": 3.944066286087036, "learning_rate": 1.145069532237674e-05, "loss": 0.4655, "step": 3410 }, { "epoch": 1.457008747599744, "grad_norm": 4.504640102386475, "learning_rate": 1.1434892541087233e-05, "loss": 0.3928, "step": 3415 }, { "epoch": 1.459142308512908, "grad_norm": 4.684928894042969, "learning_rate": 1.1419089759797726e-05, "loss": 0.4426, "step": 3420 }, { "epoch": 1.4612758694260721, "grad_norm": 4.373624801635742, "learning_rate": 1.1403286978508219e-05, "loss": 0.506, "step": 3425 }, { "epoch": 1.4634094303392362, "grad_norm": 4.624093532562256, "learning_rate": 1.1387484197218712e-05, "loss": 0.3835, "step": 3430 }, { "epoch": 1.4655429912524003, "grad_norm": 4.191517353057861, "learning_rate": 1.1371681415929206e-05, "loss": 0.5111, "step": 3435 }, { "epoch": 1.4676765521655644, "grad_norm": 4.49861478805542, "learning_rate": 1.1355878634639697e-05, "loss": 0.3948, "step": 3440 }, { "epoch": 1.4698101130787284, "grad_norm": 4.452141284942627, "learning_rate": 1.134007585335019e-05, "loss": 0.4282, "step": 3445 }, { "epoch": 1.4719436739918925, "grad_norm": 4.915762424468994, "learning_rate": 1.1324273072060683e-05, "loss": 0.4408, "step": 3450 }, { "epoch": 1.4740772349050566, "grad_norm": 3.6840269565582275, "learning_rate": 1.1308470290771176e-05, "loss": 0.4192, "step": 3455 }, { "epoch": 1.4762107958182207, "grad_norm": 3.9970285892486572, "learning_rate": 1.1292667509481669e-05, "loss": 0.4471, "step": 3460 }, { "epoch": 1.4783443567313848, "grad_norm": 3.540578603744507, "learning_rate": 1.1276864728192163e-05, "loss": 0.4319, "step": 3465 }, { "epoch": 1.4804779176445488, "grad_norm": 4.35250186920166, "learning_rate": 1.1261061946902656e-05, "loss": 0.4367, "step": 3470 }, { "epoch": 1.482611478557713, "grad_norm": 4.181733131408691, "learning_rate": 1.1245259165613149e-05, "loss": 0.4216, "step": 3475 }, { "epoch": 1.484745039470877, "grad_norm": 4.138208389282227, "learning_rate": 1.1229456384323642e-05, "loss": 0.4344, "step": 3480 }, { "epoch": 1.486878600384041, "grad_norm": 4.309459209442139, "learning_rate": 1.1213653603034137e-05, "loss": 0.4263, "step": 3485 }, { "epoch": 1.4890121612972052, "grad_norm": 4.881312370300293, "learning_rate": 1.1197850821744628e-05, "loss": 0.4619, "step": 3490 }, { "epoch": 1.4911457222103692, "grad_norm": 3.9794363975524902, "learning_rate": 1.118204804045512e-05, "loss": 0.5105, "step": 3495 }, { "epoch": 1.4932792831235333, "grad_norm": 4.884064674377441, "learning_rate": 1.1166245259165613e-05, "loss": 0.4722, "step": 3500 }, { "epoch": 1.4954128440366974, "grad_norm": 4.475371360778809, "learning_rate": 1.1150442477876106e-05, "loss": 0.4152, "step": 3505 }, { "epoch": 1.4975464049498612, "grad_norm": 3.525907039642334, "learning_rate": 1.11346396965866e-05, "loss": 0.4017, "step": 3510 }, { "epoch": 1.4996799658630253, "grad_norm": 4.34111213684082, "learning_rate": 1.1118836915297094e-05, "loss": 0.4496, "step": 3515 }, { "epoch": 1.5018135267761896, "grad_norm": 4.264297008514404, "learning_rate": 1.1103034134007587e-05, "loss": 0.4965, "step": 3520 }, { "epoch": 1.5039470876893535, "grad_norm": 3.88047194480896, "learning_rate": 1.108723135271808e-05, "loss": 0.4009, "step": 3525 }, { "epoch": 1.5060806486025176, "grad_norm": 4.288839340209961, "learning_rate": 1.1071428571428572e-05, "loss": 0.5115, "step": 3530 }, { "epoch": 1.5082142095156816, "grad_norm": 4.137423992156982, "learning_rate": 1.1055625790139067e-05, "loss": 0.4007, "step": 3535 }, { "epoch": 1.5103477704288457, "grad_norm": 4.802023887634277, "learning_rate": 1.103982300884956e-05, "loss": 0.4195, "step": 3540 }, { "epoch": 1.5124813313420098, "grad_norm": 4.864555358886719, "learning_rate": 1.1024020227560051e-05, "loss": 0.4412, "step": 3545 }, { "epoch": 1.5146148922551739, "grad_norm": 4.190124988555908, "learning_rate": 1.1008217446270544e-05, "loss": 0.5177, "step": 3550 }, { "epoch": 1.516748453168338, "grad_norm": 3.876676321029663, "learning_rate": 1.0992414664981037e-05, "loss": 0.4553, "step": 3555 }, { "epoch": 1.518882014081502, "grad_norm": 4.408483505249023, "learning_rate": 1.097661188369153e-05, "loss": 0.4378, "step": 3560 }, { "epoch": 1.521015574994666, "grad_norm": 4.281198501586914, "learning_rate": 1.0960809102402024e-05, "loss": 0.4296, "step": 3565 }, { "epoch": 1.5231491359078302, "grad_norm": 4.272386074066162, "learning_rate": 1.0945006321112517e-05, "loss": 0.406, "step": 3570 }, { "epoch": 1.5252826968209943, "grad_norm": 4.343184947967529, "learning_rate": 1.092920353982301e-05, "loss": 0.4636, "step": 3575 }, { "epoch": 1.5274162577341583, "grad_norm": 4.392275333404541, "learning_rate": 1.0913400758533503e-05, "loss": 0.4244, "step": 3580 }, { "epoch": 1.5295498186473224, "grad_norm": 4.830089569091797, "learning_rate": 1.0897597977243997e-05, "loss": 0.5027, "step": 3585 }, { "epoch": 1.5316833795604865, "grad_norm": 4.782260894775391, "learning_rate": 1.088179519595449e-05, "loss": 0.4284, "step": 3590 }, { "epoch": 1.5338169404736504, "grad_norm": 4.546128273010254, "learning_rate": 1.0865992414664981e-05, "loss": 0.3893, "step": 3595 }, { "epoch": 1.5359505013868144, "grad_norm": 4.104976177215576, "learning_rate": 1.0850189633375474e-05, "loss": 0.515, "step": 3600 }, { "epoch": 1.5380840622999785, "grad_norm": 4.276109218597412, "learning_rate": 1.0834386852085967e-05, "loss": 0.3867, "step": 3605 }, { "epoch": 1.5402176232131426, "grad_norm": 5.1553192138671875, "learning_rate": 1.081858407079646e-05, "loss": 0.5842, "step": 3610 }, { "epoch": 1.5423511841263067, "grad_norm": 4.334433555603027, "learning_rate": 1.0802781289506955e-05, "loss": 0.3753, "step": 3615 }, { "epoch": 1.5444847450394708, "grad_norm": 4.369014263153076, "learning_rate": 1.0786978508217448e-05, "loss": 0.4834, "step": 3620 }, { "epoch": 1.5466183059526348, "grad_norm": 3.7355666160583496, "learning_rate": 1.077117572692794e-05, "loss": 0.4547, "step": 3625 }, { "epoch": 1.548751866865799, "grad_norm": 4.238455295562744, "learning_rate": 1.0755372945638433e-05, "loss": 0.4224, "step": 3630 }, { "epoch": 1.550885427778963, "grad_norm": 4.657512664794922, "learning_rate": 1.0739570164348926e-05, "loss": 0.4341, "step": 3635 }, { "epoch": 1.553018988692127, "grad_norm": 4.200009346008301, "learning_rate": 1.072376738305942e-05, "loss": 0.4038, "step": 3640 }, { "epoch": 1.5551525496052911, "grad_norm": 3.9212570190429688, "learning_rate": 1.0707964601769914e-05, "loss": 0.3606, "step": 3645 }, { "epoch": 1.5572861105184552, "grad_norm": 4.267481327056885, "learning_rate": 1.0692161820480405e-05, "loss": 0.4427, "step": 3650 }, { "epoch": 1.5594196714316193, "grad_norm": 4.647229194641113, "learning_rate": 1.0676359039190898e-05, "loss": 0.4919, "step": 3655 }, { "epoch": 1.5615532323447834, "grad_norm": 4.122211456298828, "learning_rate": 1.066055625790139e-05, "loss": 0.362, "step": 3660 }, { "epoch": 1.5636867932579475, "grad_norm": 4.275843143463135, "learning_rate": 1.0644753476611885e-05, "loss": 0.4261, "step": 3665 }, { "epoch": 1.5658203541711115, "grad_norm": 4.351519584655762, "learning_rate": 1.0628950695322378e-05, "loss": 0.4269, "step": 3670 }, { "epoch": 1.5679539150842756, "grad_norm": 4.048177242279053, "learning_rate": 1.061314791403287e-05, "loss": 0.4291, "step": 3675 }, { "epoch": 1.5700874759974397, "grad_norm": 3.983682632446289, "learning_rate": 1.0597345132743364e-05, "loss": 0.3967, "step": 3680 }, { "epoch": 1.5722210369106038, "grad_norm": 4.443732738494873, "learning_rate": 1.0581542351453857e-05, "loss": 0.4645, "step": 3685 }, { "epoch": 1.5743545978237679, "grad_norm": 4.516783237457275, "learning_rate": 1.0565739570164351e-05, "loss": 0.4052, "step": 3690 }, { "epoch": 1.576488158736932, "grad_norm": 4.3395538330078125, "learning_rate": 1.0549936788874844e-05, "loss": 0.4627, "step": 3695 }, { "epoch": 1.578621719650096, "grad_norm": 3.796954393386841, "learning_rate": 1.0534134007585335e-05, "loss": 0.3856, "step": 3700 }, { "epoch": 1.58075528056326, "grad_norm": 4.659563064575195, "learning_rate": 1.0518331226295828e-05, "loss": 0.4917, "step": 3705 }, { "epoch": 1.5828888414764242, "grad_norm": 4.653059482574463, "learning_rate": 1.0502528445006321e-05, "loss": 0.4104, "step": 3710 }, { "epoch": 1.5850224023895882, "grad_norm": 3.088853120803833, "learning_rate": 1.0486725663716814e-05, "loss": 0.3659, "step": 3715 }, { "epoch": 1.5871559633027523, "grad_norm": 5.347471714019775, "learning_rate": 1.0470922882427308e-05, "loss": 0.5145, "step": 3720 }, { "epoch": 1.5892895242159164, "grad_norm": 3.9844131469726562, "learning_rate": 1.0455120101137801e-05, "loss": 0.433, "step": 3725 }, { "epoch": 1.5914230851290805, "grad_norm": 3.934298038482666, "learning_rate": 1.0439317319848294e-05, "loss": 0.3861, "step": 3730 }, { "epoch": 1.5935566460422446, "grad_norm": 5.001667499542236, "learning_rate": 1.0423514538558787e-05, "loss": 0.4274, "step": 3735 }, { "epoch": 1.5956902069554086, "grad_norm": 4.3066606521606445, "learning_rate": 1.0407711757269282e-05, "loss": 0.4857, "step": 3740 }, { "epoch": 1.5978237678685727, "grad_norm": 4.791617393493652, "learning_rate": 1.0391908975979774e-05, "loss": 0.5526, "step": 3745 }, { "epoch": 1.5999573287817368, "grad_norm": 5.158261299133301, "learning_rate": 1.0376106194690266e-05, "loss": 0.4052, "step": 3750 }, { "epoch": 1.6020908896949009, "grad_norm": 4.077587127685547, "learning_rate": 1.0360303413400759e-05, "loss": 0.4087, "step": 3755 }, { "epoch": 1.604224450608065, "grad_norm": 4.145094394683838, "learning_rate": 1.0344500632111251e-05, "loss": 0.3831, "step": 3760 }, { "epoch": 1.606358011521229, "grad_norm": 4.910739421844482, "learning_rate": 1.0328697850821744e-05, "loss": 0.4532, "step": 3765 }, { "epoch": 1.6084915724343931, "grad_norm": 3.4111921787261963, "learning_rate": 1.0312895069532239e-05, "loss": 0.4158, "step": 3770 }, { "epoch": 1.6106251333475572, "grad_norm": 4.56186056137085, "learning_rate": 1.0297092288242732e-05, "loss": 0.3978, "step": 3775 }, { "epoch": 1.6127586942607213, "grad_norm": 4.2800822257995605, "learning_rate": 1.0281289506953225e-05, "loss": 0.4082, "step": 3780 }, { "epoch": 1.6148922551738853, "grad_norm": 4.143041133880615, "learning_rate": 1.0265486725663717e-05, "loss": 0.37, "step": 3785 }, { "epoch": 1.6170258160870494, "grad_norm": 4.374189853668213, "learning_rate": 1.0249683944374212e-05, "loss": 0.4613, "step": 3790 }, { "epoch": 1.6191593770002135, "grad_norm": 4.439857006072998, "learning_rate": 1.0233881163084705e-05, "loss": 0.4321, "step": 3795 }, { "epoch": 1.6212929379133776, "grad_norm": 4.954322338104248, "learning_rate": 1.0218078381795198e-05, "loss": 0.4867, "step": 3800 }, { "epoch": 1.6234264988265417, "grad_norm": 4.941915035247803, "learning_rate": 1.0202275600505689e-05, "loss": 0.4404, "step": 3805 }, { "epoch": 1.6255600597397055, "grad_norm": 3.809436082839966, "learning_rate": 1.0186472819216182e-05, "loss": 0.4435, "step": 3810 }, { "epoch": 1.6276936206528696, "grad_norm": 3.886300802230835, "learning_rate": 1.0170670037926675e-05, "loss": 0.4115, "step": 3815 }, { "epoch": 1.6298271815660337, "grad_norm": 5.091635227203369, "learning_rate": 1.015486725663717e-05, "loss": 0.4811, "step": 3820 }, { "epoch": 1.6319607424791978, "grad_norm": 4.407512664794922, "learning_rate": 1.0139064475347662e-05, "loss": 0.4128, "step": 3825 }, { "epoch": 1.6340943033923618, "grad_norm": 4.914755821228027, "learning_rate": 1.0123261694058155e-05, "loss": 0.4569, "step": 3830 }, { "epoch": 1.636227864305526, "grad_norm": 4.0821428298950195, "learning_rate": 1.0107458912768648e-05, "loss": 0.4714, "step": 3835 }, { "epoch": 1.63836142521869, "grad_norm": 3.7787981033325195, "learning_rate": 1.0091656131479142e-05, "loss": 0.3844, "step": 3840 }, { "epoch": 1.640494986131854, "grad_norm": 4.078645706176758, "learning_rate": 1.0075853350189635e-05, "loss": 0.3967, "step": 3845 }, { "epoch": 1.6426285470450182, "grad_norm": 4.631909370422363, "learning_rate": 1.0060050568900128e-05, "loss": 0.3869, "step": 3850 }, { "epoch": 1.6447621079581822, "grad_norm": 3.907121419906616, "learning_rate": 1.004424778761062e-05, "loss": 0.4722, "step": 3855 }, { "epoch": 1.6468956688713463, "grad_norm": 3.532806634902954, "learning_rate": 1.0028445006321112e-05, "loss": 0.3377, "step": 3860 }, { "epoch": 1.6490292297845104, "grad_norm": 3.775578260421753, "learning_rate": 1.0012642225031605e-05, "loss": 0.4017, "step": 3865 }, { "epoch": 1.6511627906976745, "grad_norm": 4.62343168258667, "learning_rate": 9.9968394437421e-06, "loss": 0.3972, "step": 3870 }, { "epoch": 1.6532963516108385, "grad_norm": 4.519557476043701, "learning_rate": 9.981036662452593e-06, "loss": 0.4669, "step": 3875 }, { "epoch": 1.6554299125240024, "grad_norm": 4.216794013977051, "learning_rate": 9.965233881163085e-06, "loss": 0.4569, "step": 3880 }, { "epoch": 1.6575634734371665, "grad_norm": 3.9565041065216064, "learning_rate": 9.949431099873578e-06, "loss": 0.4096, "step": 3885 }, { "epoch": 1.6596970343503306, "grad_norm": 3.795283317565918, "learning_rate": 9.933628318584071e-06, "loss": 0.4411, "step": 3890 }, { "epoch": 1.6618305952634946, "grad_norm": 4.511595726013184, "learning_rate": 9.917825537294564e-06, "loss": 0.4277, "step": 3895 }, { "epoch": 1.6639641561766587, "grad_norm": 4.257791519165039, "learning_rate": 9.902022756005057e-06, "loss": 0.3477, "step": 3900 }, { "epoch": 1.6660977170898228, "grad_norm": 5.031363010406494, "learning_rate": 9.886219974715552e-06, "loss": 0.4967, "step": 3905 }, { "epoch": 1.6682312780029869, "grad_norm": 4.413186073303223, "learning_rate": 9.870417193426044e-06, "loss": 0.371, "step": 3910 }, { "epoch": 1.670364838916151, "grad_norm": 4.389219760894775, "learning_rate": 9.854614412136537e-06, "loss": 0.4501, "step": 3915 }, { "epoch": 1.672498399829315, "grad_norm": 4.395909309387207, "learning_rate": 9.83881163084703e-06, "loss": 0.4275, "step": 3920 }, { "epoch": 1.674631960742479, "grad_norm": 4.407324314117432, "learning_rate": 9.823008849557523e-06, "loss": 0.423, "step": 3925 }, { "epoch": 1.6767655216556432, "grad_norm": 4.677619934082031, "learning_rate": 9.807206068268016e-06, "loss": 0.4042, "step": 3930 }, { "epoch": 1.6788990825688073, "grad_norm": 3.673964262008667, "learning_rate": 9.791403286978509e-06, "loss": 0.4142, "step": 3935 }, { "epoch": 1.6810326434819713, "grad_norm": 4.465169429779053, "learning_rate": 9.775600505689002e-06, "loss": 0.4424, "step": 3940 }, { "epoch": 1.6831662043951354, "grad_norm": 4.479079246520996, "learning_rate": 9.759797724399495e-06, "loss": 0.4194, "step": 3945 }, { "epoch": 1.6852997653082995, "grad_norm": 4.500971794128418, "learning_rate": 9.743994943109987e-06, "loss": 0.456, "step": 3950 }, { "epoch": 1.6874333262214636, "grad_norm": 3.6217551231384277, "learning_rate": 9.72819216182048e-06, "loss": 0.3881, "step": 3955 }, { "epoch": 1.6895668871346277, "grad_norm": 3.604400873184204, "learning_rate": 9.712389380530975e-06, "loss": 0.3924, "step": 3960 }, { "epoch": 1.6917004480477917, "grad_norm": 3.9066615104675293, "learning_rate": 9.696586599241468e-06, "loss": 0.4241, "step": 3965 }, { "epoch": 1.6938340089609558, "grad_norm": 4.320919513702393, "learning_rate": 9.68078381795196e-06, "loss": 0.4007, "step": 3970 }, { "epoch": 1.69596756987412, "grad_norm": 4.506808757781982, "learning_rate": 9.664981036662453e-06, "loss": 0.4394, "step": 3975 }, { "epoch": 1.698101130787284, "grad_norm": 4.534313201904297, "learning_rate": 9.649178255372946e-06, "loss": 0.4414, "step": 3980 }, { "epoch": 1.700234691700448, "grad_norm": 3.8250062465667725, "learning_rate": 9.63337547408344e-06, "loss": 0.4157, "step": 3985 }, { "epoch": 1.7023682526136121, "grad_norm": 4.336977005004883, "learning_rate": 9.617572692793932e-06, "loss": 0.4267, "step": 3990 }, { "epoch": 1.7045018135267762, "grad_norm": 3.7711403369903564, "learning_rate": 9.601769911504427e-06, "loss": 0.399, "step": 3995 }, { "epoch": 1.7066353744399403, "grad_norm": 3.7794551849365234, "learning_rate": 9.585967130214918e-06, "loss": 0.4125, "step": 4000 }, { "epoch": 1.7087689353531044, "grad_norm": 3.5244486331939697, "learning_rate": 9.57016434892541e-06, "loss": 0.4284, "step": 4005 }, { "epoch": 1.7109024962662684, "grad_norm": 4.090603351593018, "learning_rate": 9.554361567635905e-06, "loss": 0.3937, "step": 4010 }, { "epoch": 1.7130360571794325, "grad_norm": 5.152937412261963, "learning_rate": 9.538558786346398e-06, "loss": 0.4293, "step": 4015 }, { "epoch": 1.7151696180925966, "grad_norm": 5.739314079284668, "learning_rate": 9.522756005056891e-06, "loss": 0.4916, "step": 4020 }, { "epoch": 1.7173031790057607, "grad_norm": 3.626616954803467, "learning_rate": 9.506953223767384e-06, "loss": 0.3178, "step": 4025 }, { "epoch": 1.7194367399189248, "grad_norm": 4.267282962799072, "learning_rate": 9.491150442477877e-06, "loss": 0.4212, "step": 4030 }, { "epoch": 1.7215703008320888, "grad_norm": 3.8010354042053223, "learning_rate": 9.47534766118837e-06, "loss": 0.4214, "step": 4035 }, { "epoch": 1.723703861745253, "grad_norm": 4.761562347412109, "learning_rate": 9.459544879898863e-06, "loss": 0.4117, "step": 4040 }, { "epoch": 1.725837422658417, "grad_norm": 4.024481773376465, "learning_rate": 9.443742098609357e-06, "loss": 0.4416, "step": 4045 }, { "epoch": 1.727970983571581, "grad_norm": 4.028324127197266, "learning_rate": 9.427939317319848e-06, "loss": 0.4346, "step": 4050 }, { "epoch": 1.7301045444847452, "grad_norm": 4.539582252502441, "learning_rate": 9.412136536030341e-06, "loss": 0.4518, "step": 4055 }, { "epoch": 1.7322381053979092, "grad_norm": 3.911097526550293, "learning_rate": 9.396333754740836e-06, "loss": 0.4419, "step": 4060 }, { "epoch": 1.7343716663110733, "grad_norm": 4.783831596374512, "learning_rate": 9.380530973451329e-06, "loss": 0.4222, "step": 4065 }, { "epoch": 1.7365052272242374, "grad_norm": 3.7832529544830322, "learning_rate": 9.364728192161821e-06, "loss": 0.4169, "step": 4070 }, { "epoch": 1.7386387881374015, "grad_norm": 4.095695495605469, "learning_rate": 9.348925410872314e-06, "loss": 0.4259, "step": 4075 }, { "epoch": 1.7407723490505655, "grad_norm": 4.235688209533691, "learning_rate": 9.333122629582807e-06, "loss": 0.3874, "step": 4080 }, { "epoch": 1.7429059099637296, "grad_norm": 4.536983489990234, "learning_rate": 9.3173198482933e-06, "loss": 0.4318, "step": 4085 }, { "epoch": 1.7450394708768935, "grad_norm": 3.5148367881774902, "learning_rate": 9.301517067003793e-06, "loss": 0.4566, "step": 4090 }, { "epoch": 1.7471730317900576, "grad_norm": 4.49671745300293, "learning_rate": 9.285714285714288e-06, "loss": 0.4351, "step": 4095 }, { "epoch": 1.7493065927032216, "grad_norm": 6.083174705505371, "learning_rate": 9.26991150442478e-06, "loss": 0.5365, "step": 4100 }, { "epoch": 1.7514401536163857, "grad_norm": 3.449429512023926, "learning_rate": 9.254108723135272e-06, "loss": 0.3924, "step": 4105 }, { "epoch": 1.7535737145295498, "grad_norm": 3.685215950012207, "learning_rate": 9.238305941845766e-06, "loss": 0.4072, "step": 4110 }, { "epoch": 1.7557072754427139, "grad_norm": 4.891070365905762, "learning_rate": 9.222503160556259e-06, "loss": 0.4324, "step": 4115 }, { "epoch": 1.757840836355878, "grad_norm": 4.432384014129639, "learning_rate": 9.206700379266752e-06, "loss": 0.3614, "step": 4120 }, { "epoch": 1.759974397269042, "grad_norm": 4.073850631713867, "learning_rate": 9.190897597977245e-06, "loss": 0.4256, "step": 4125 }, { "epoch": 1.7621079581822061, "grad_norm": 4.328255653381348, "learning_rate": 9.175094816687738e-06, "loss": 0.4156, "step": 4130 }, { "epoch": 1.7642415190953702, "grad_norm": 4.667512893676758, "learning_rate": 9.15929203539823e-06, "loss": 0.4392, "step": 4135 }, { "epoch": 1.7663750800085343, "grad_norm": 3.1964938640594482, "learning_rate": 9.143489254108723e-06, "loss": 0.3626, "step": 4140 }, { "epoch": 1.7685086409216983, "grad_norm": 4.073289394378662, "learning_rate": 9.127686472819218e-06, "loss": 0.4007, "step": 4145 }, { "epoch": 1.7706422018348624, "grad_norm": 4.5602545738220215, "learning_rate": 9.111883691529711e-06, "loss": 0.4329, "step": 4150 }, { "epoch": 1.7727757627480265, "grad_norm": 4.590991497039795, "learning_rate": 9.096080910240202e-06, "loss": 0.4734, "step": 4155 }, { "epoch": 1.7749093236611904, "grad_norm": 4.509527683258057, "learning_rate": 9.080278128950697e-06, "loss": 0.3963, "step": 4160 }, { "epoch": 1.7770428845743544, "grad_norm": 4.132796287536621, "learning_rate": 9.06447534766119e-06, "loss": 0.3708, "step": 4165 }, { "epoch": 1.7791764454875185, "grad_norm": 3.8218822479248047, "learning_rate": 9.048672566371682e-06, "loss": 0.385, "step": 4170 }, { "epoch": 1.7813100064006826, "grad_norm": 4.133715629577637, "learning_rate": 9.032869785082175e-06, "loss": 0.3566, "step": 4175 }, { "epoch": 1.7834435673138467, "grad_norm": 4.322253227233887, "learning_rate": 9.017067003792668e-06, "loss": 0.4153, "step": 4180 }, { "epoch": 1.7855771282270108, "grad_norm": 4.001209259033203, "learning_rate": 9.001264222503161e-06, "loss": 0.3932, "step": 4185 }, { "epoch": 1.7877106891401748, "grad_norm": 3.911928176879883, "learning_rate": 8.985461441213654e-06, "loss": 0.421, "step": 4190 }, { "epoch": 1.789844250053339, "grad_norm": 4.439943313598633, "learning_rate": 8.969658659924147e-06, "loss": 0.411, "step": 4195 }, { "epoch": 1.791977810966503, "grad_norm": 4.437414169311523, "learning_rate": 8.953855878634641e-06, "loss": 0.4133, "step": 4200 }, { "epoch": 1.794111371879667, "grad_norm": 4.42112398147583, "learning_rate": 8.938053097345133e-06, "loss": 0.4671, "step": 4205 }, { "epoch": 1.7962449327928312, "grad_norm": 4.949692726135254, "learning_rate": 8.922250316055625e-06, "loss": 0.4151, "step": 4210 }, { "epoch": 1.7983784937059952, "grad_norm": 4.196777820587158, "learning_rate": 8.90644753476612e-06, "loss": 0.3906, "step": 4215 }, { "epoch": 1.8005120546191593, "grad_norm": 3.9743521213531494, "learning_rate": 8.890644753476613e-06, "loss": 0.38, "step": 4220 }, { "epoch": 1.8026456155323234, "grad_norm": 3.6272308826446533, "learning_rate": 8.874841972187106e-06, "loss": 0.4217, "step": 4225 }, { "epoch": 1.8047791764454875, "grad_norm": 4.151697635650635, "learning_rate": 8.859039190897599e-06, "loss": 0.391, "step": 4230 }, { "epoch": 1.8069127373586515, "grad_norm": 3.6812775135040283, "learning_rate": 8.843236409608091e-06, "loss": 0.4046, "step": 4235 }, { "epoch": 1.8090462982718156, "grad_norm": 3.767869234085083, "learning_rate": 8.827433628318584e-06, "loss": 0.4448, "step": 4240 }, { "epoch": 1.8111798591849797, "grad_norm": 4.005570411682129, "learning_rate": 8.811630847029077e-06, "loss": 0.4197, "step": 4245 }, { "epoch": 1.8133134200981438, "grad_norm": 3.979504108428955, "learning_rate": 8.795828065739572e-06, "loss": 0.4442, "step": 4250 }, { "epoch": 1.8154469810113079, "grad_norm": 4.660085201263428, "learning_rate": 8.780025284450065e-06, "loss": 0.4299, "step": 4255 }, { "epoch": 1.817580541924472, "grad_norm": 3.891530990600586, "learning_rate": 8.764222503160556e-06, "loss": 0.3981, "step": 4260 }, { "epoch": 1.819714102837636, "grad_norm": 4.534486293792725, "learning_rate": 8.74841972187105e-06, "loss": 0.4499, "step": 4265 }, { "epoch": 1.8218476637508, "grad_norm": 5.114262580871582, "learning_rate": 8.732616940581543e-06, "loss": 0.4081, "step": 4270 }, { "epoch": 1.8239812246639642, "grad_norm": 3.8600802421569824, "learning_rate": 8.716814159292036e-06, "loss": 0.389, "step": 4275 }, { "epoch": 1.8261147855771283, "grad_norm": 3.8939297199249268, "learning_rate": 8.701011378002529e-06, "loss": 0.4191, "step": 4280 }, { "epoch": 1.8282483464902923, "grad_norm": 4.046480655670166, "learning_rate": 8.685208596713022e-06, "loss": 0.3995, "step": 4285 }, { "epoch": 1.8303819074034564, "grad_norm": 3.921691417694092, "learning_rate": 8.669405815423515e-06, "loss": 0.3994, "step": 4290 }, { "epoch": 1.8325154683166205, "grad_norm": 4.322704792022705, "learning_rate": 8.653603034134008e-06, "loss": 0.4107, "step": 4295 }, { "epoch": 1.8346490292297846, "grad_norm": 4.012720108032227, "learning_rate": 8.637800252844502e-06, "loss": 0.3564, "step": 4300 }, { "epoch": 1.8367825901429486, "grad_norm": 4.165811061859131, "learning_rate": 8.621997471554995e-06, "loss": 0.403, "step": 4305 }, { "epoch": 1.8389161510561127, "grad_norm": 3.617009401321411, "learning_rate": 8.606194690265486e-06, "loss": 0.3079, "step": 4310 }, { "epoch": 1.8410497119692768, "grad_norm": 3.6533565521240234, "learning_rate": 8.59039190897598e-06, "loss": 0.3903, "step": 4315 }, { "epoch": 1.8431832728824409, "grad_norm": 4.798677444458008, "learning_rate": 8.574589127686474e-06, "loss": 0.4295, "step": 4320 }, { "epoch": 1.845316833795605, "grad_norm": 4.112203598022461, "learning_rate": 8.558786346396967e-06, "loss": 0.4108, "step": 4325 }, { "epoch": 1.847450394708769, "grad_norm": 3.938917398452759, "learning_rate": 8.54298356510746e-06, "loss": 0.3975, "step": 4330 }, { "epoch": 1.8495839556219331, "grad_norm": 3.279505968093872, "learning_rate": 8.527180783817952e-06, "loss": 0.36, "step": 4335 }, { "epoch": 1.8517175165350972, "grad_norm": 4.134898662567139, "learning_rate": 8.511378002528445e-06, "loss": 0.3914, "step": 4340 }, { "epoch": 1.8538510774482613, "grad_norm": 4.558252811431885, "learning_rate": 8.495575221238938e-06, "loss": 0.3827, "step": 4345 }, { "epoch": 1.8559846383614254, "grad_norm": 3.923495292663574, "learning_rate": 8.479772439949433e-06, "loss": 0.4142, "step": 4350 }, { "epoch": 1.8581181992745894, "grad_norm": 3.759852170944214, "learning_rate": 8.463969658659926e-06, "loss": 0.3415, "step": 4355 }, { "epoch": 1.8602517601877535, "grad_norm": 3.701773166656494, "learning_rate": 8.448166877370418e-06, "loss": 0.3879, "step": 4360 }, { "epoch": 1.8623853211009176, "grad_norm": 4.116669654846191, "learning_rate": 8.432364096080911e-06, "loss": 0.4991, "step": 4365 }, { "epoch": 1.8645188820140814, "grad_norm": 4.087536811828613, "learning_rate": 8.416561314791404e-06, "loss": 0.3334, "step": 4370 }, { "epoch": 1.8666524429272455, "grad_norm": 4.180550575256348, "learning_rate": 8.400758533501897e-06, "loss": 0.3978, "step": 4375 }, { "epoch": 1.8687860038404096, "grad_norm": 4.742728233337402, "learning_rate": 8.38495575221239e-06, "loss": 0.4217, "step": 4380 }, { "epoch": 1.8709195647535737, "grad_norm": 5.107389450073242, "learning_rate": 8.369152970922883e-06, "loss": 0.4304, "step": 4385 }, { "epoch": 1.8730531256667378, "grad_norm": 4.502194404602051, "learning_rate": 8.353350189633376e-06, "loss": 0.3924, "step": 4390 }, { "epoch": 1.8751866865799018, "grad_norm": 4.717820167541504, "learning_rate": 8.337547408343869e-06, "loss": 0.4051, "step": 4395 }, { "epoch": 1.877320247493066, "grad_norm": 4.939509391784668, "learning_rate": 8.321744627054363e-06, "loss": 0.4178, "step": 4400 }, { "epoch": 1.87945380840623, "grad_norm": 4.088109493255615, "learning_rate": 8.305941845764856e-06, "loss": 0.3778, "step": 4405 }, { "epoch": 1.881587369319394, "grad_norm": 3.666149854660034, "learning_rate": 8.290139064475349e-06, "loss": 0.3979, "step": 4410 }, { "epoch": 1.8837209302325582, "grad_norm": 4.25274658203125, "learning_rate": 8.274336283185842e-06, "loss": 0.3942, "step": 4415 }, { "epoch": 1.8858544911457222, "grad_norm": 3.761164665222168, "learning_rate": 8.258533501896335e-06, "loss": 0.3896, "step": 4420 }, { "epoch": 1.8879880520588863, "grad_norm": 3.869654417037964, "learning_rate": 8.242730720606827e-06, "loss": 0.4409, "step": 4425 }, { "epoch": 1.8901216129720504, "grad_norm": 3.9715282917022705, "learning_rate": 8.22692793931732e-06, "loss": 0.4299, "step": 4430 }, { "epoch": 1.8922551738852145, "grad_norm": 3.939626455307007, "learning_rate": 8.211125158027813e-06, "loss": 0.399, "step": 4435 }, { "epoch": 1.8943887347983785, "grad_norm": 3.9082634449005127, "learning_rate": 8.195322376738306e-06, "loss": 0.4148, "step": 4440 }, { "epoch": 1.8965222957115424, "grad_norm": 4.443459510803223, "learning_rate": 8.179519595448799e-06, "loss": 0.4501, "step": 4445 }, { "epoch": 1.8986558566247065, "grad_norm": 3.7164127826690674, "learning_rate": 8.163716814159292e-06, "loss": 0.3909, "step": 4450 }, { "epoch": 1.9007894175378706, "grad_norm": 4.089094161987305, "learning_rate": 8.147914032869786e-06, "loss": 0.4316, "step": 4455 }, { "epoch": 1.9029229784510346, "grad_norm": 4.615549087524414, "learning_rate": 8.13211125158028e-06, "loss": 0.3662, "step": 4460 }, { "epoch": 1.9050565393641987, "grad_norm": 4.402790069580078, "learning_rate": 8.11630847029077e-06, "loss": 0.4446, "step": 4465 }, { "epoch": 1.9071901002773628, "grad_norm": 3.7550132274627686, "learning_rate": 8.100505689001265e-06, "loss": 0.3931, "step": 4470 }, { "epoch": 1.9093236611905269, "grad_norm": 3.8456757068634033, "learning_rate": 8.084702907711758e-06, "loss": 0.3947, "step": 4475 }, { "epoch": 1.911457222103691, "grad_norm": 3.9646549224853516, "learning_rate": 8.06890012642225e-06, "loss": 0.3415, "step": 4480 }, { "epoch": 1.913590783016855, "grad_norm": 4.79493522644043, "learning_rate": 8.053097345132744e-06, "loss": 0.3661, "step": 4485 }, { "epoch": 1.9157243439300191, "grad_norm": 3.39487886428833, "learning_rate": 8.037294563843238e-06, "loss": 0.3743, "step": 4490 }, { "epoch": 1.9178579048431832, "grad_norm": 3.981194496154785, "learning_rate": 8.02149178255373e-06, "loss": 0.3918, "step": 4495 }, { "epoch": 1.9199914657563473, "grad_norm": 3.72912859916687, "learning_rate": 8.005689001264222e-06, "loss": 0.3459, "step": 4500 }, { "epoch": 1.9221250266695113, "grad_norm": 4.027529239654541, "learning_rate": 7.989886219974717e-06, "loss": 0.3944, "step": 4505 }, { "epoch": 1.9242585875826754, "grad_norm": 3.9047298431396484, "learning_rate": 7.97408343868521e-06, "loss": 0.3727, "step": 4510 }, { "epoch": 1.9263921484958395, "grad_norm": 4.601792335510254, "learning_rate": 7.958280657395703e-06, "loss": 0.3956, "step": 4515 }, { "epoch": 1.9285257094090036, "grad_norm": 4.3443756103515625, "learning_rate": 7.942477876106195e-06, "loss": 0.4233, "step": 4520 }, { "epoch": 1.9306592703221677, "grad_norm": 4.115539073944092, "learning_rate": 7.926675094816688e-06, "loss": 0.3929, "step": 4525 }, { "epoch": 1.9327928312353317, "grad_norm": 4.373830795288086, "learning_rate": 7.910872313527181e-06, "loss": 0.4206, "step": 4530 }, { "epoch": 1.9349263921484958, "grad_norm": 4.398421287536621, "learning_rate": 7.895069532237674e-06, "loss": 0.3835, "step": 4535 }, { "epoch": 1.93705995306166, "grad_norm": 4.316149711608887, "learning_rate": 7.879266750948169e-06, "loss": 0.459, "step": 4540 }, { "epoch": 1.939193513974824, "grad_norm": 4.977426052093506, "learning_rate": 7.86346396965866e-06, "loss": 0.4094, "step": 4545 }, { "epoch": 1.941327074887988, "grad_norm": 3.934713125228882, "learning_rate": 7.847661188369153e-06, "loss": 0.3996, "step": 4550 }, { "epoch": 1.9434606358011521, "grad_norm": 5.019830226898193, "learning_rate": 7.831858407079647e-06, "loss": 0.4309, "step": 4555 }, { "epoch": 1.9455941967143162, "grad_norm": 3.6168367862701416, "learning_rate": 7.81605562579014e-06, "loss": 0.3579, "step": 4560 }, { "epoch": 1.9477277576274803, "grad_norm": 4.30971097946167, "learning_rate": 7.800252844500633e-06, "loss": 0.4028, "step": 4565 }, { "epoch": 1.9498613185406444, "grad_norm": 4.342047214508057, "learning_rate": 7.784450063211126e-06, "loss": 0.3659, "step": 4570 }, { "epoch": 1.9519948794538085, "grad_norm": 4.505122184753418, "learning_rate": 7.768647281921619e-06, "loss": 0.3697, "step": 4575 }, { "epoch": 1.9541284403669725, "grad_norm": 4.1636271476745605, "learning_rate": 7.752844500632112e-06, "loss": 0.3926, "step": 4580 }, { "epoch": 1.9562620012801366, "grad_norm": 4.6492719650268555, "learning_rate": 7.737041719342605e-06, "loss": 0.3827, "step": 4585 }, { "epoch": 1.9583955621933007, "grad_norm": 4.6462812423706055, "learning_rate": 7.721238938053099e-06, "loss": 0.4603, "step": 4590 }, { "epoch": 1.9605291231064648, "grad_norm": 4.198066711425781, "learning_rate": 7.70543615676359e-06, "loss": 0.4473, "step": 4595 }, { "epoch": 1.9626626840196288, "grad_norm": 3.5620765686035156, "learning_rate": 7.689633375474083e-06, "loss": 0.4027, "step": 4600 }, { "epoch": 1.964796244932793, "grad_norm": 4.463738918304443, "learning_rate": 7.673830594184578e-06, "loss": 0.4164, "step": 4605 }, { "epoch": 1.966929805845957, "grad_norm": 4.4071364402771, "learning_rate": 7.65802781289507e-06, "loss": 0.4502, "step": 4610 }, { "epoch": 1.969063366759121, "grad_norm": 4.269393444061279, "learning_rate": 7.642225031605563e-06, "loss": 0.3708, "step": 4615 }, { "epoch": 1.9711969276722852, "grad_norm": 3.918552875518799, "learning_rate": 7.626422250316056e-06, "loss": 0.3747, "step": 4620 }, { "epoch": 1.9733304885854492, "grad_norm": 3.7500672340393066, "learning_rate": 7.610619469026549e-06, "loss": 0.3385, "step": 4625 }, { "epoch": 1.9754640494986133, "grad_norm": 5.05972957611084, "learning_rate": 7.594816687737042e-06, "loss": 0.3845, "step": 4630 }, { "epoch": 1.9775976104117774, "grad_norm": 4.773119926452637, "learning_rate": 7.579013906447536e-06, "loss": 0.4506, "step": 4635 }, { "epoch": 1.9797311713249415, "grad_norm": 4.228442192077637, "learning_rate": 7.563211125158029e-06, "loss": 0.3889, "step": 4640 }, { "epoch": 1.9818647322381056, "grad_norm": 4.035905838012695, "learning_rate": 7.5474083438685216e-06, "loss": 0.4104, "step": 4645 }, { "epoch": 1.9839982931512696, "grad_norm": 5.019458293914795, "learning_rate": 7.5316055625790144e-06, "loss": 0.4283, "step": 4650 }, { "epoch": 1.9861318540644335, "grad_norm": 4.0822978019714355, "learning_rate": 7.515802781289507e-06, "loss": 0.3305, "step": 4655 }, { "epoch": 1.9882654149775976, "grad_norm": 3.953634023666382, "learning_rate": 7.500000000000001e-06, "loss": 0.393, "step": 4660 }, { "epoch": 1.9903989758907616, "grad_norm": 4.482757091522217, "learning_rate": 7.484197218710494e-06, "loss": 0.416, "step": 4665 }, { "epoch": 1.9925325368039257, "grad_norm": 3.983945369720459, "learning_rate": 7.468394437420987e-06, "loss": 0.3541, "step": 4670 }, { "epoch": 1.9946660977170898, "grad_norm": 3.568307638168335, "learning_rate": 7.45259165613148e-06, "loss": 0.4027, "step": 4675 }, { "epoch": 1.9967996586302539, "grad_norm": 4.563660621643066, "learning_rate": 7.4367888748419725e-06, "loss": 0.3685, "step": 4680 }, { "epoch": 1.998933219543418, "grad_norm": 4.3235554695129395, "learning_rate": 7.420986093552465e-06, "loss": 0.3796, "step": 4685 }, { "epoch": 2.0, "eval_evaluator": 0.9877204489141523, "eval_loss": 0.1771988570690155, "eval_runtime": 125.952, "eval_samples_per_second": 18.158, "eval_steps_per_second": 2.271, "step": 4688 }, { "epoch": 2.000853424365266, "grad_norm": 3.9099795818328857, "learning_rate": 7.405183312262959e-06, "loss": 0.3541, "step": 4690 }, { "epoch": 2.00298698527843, "grad_norm": 4.449174404144287, "learning_rate": 7.389380530973452e-06, "loss": 0.4464, "step": 4695 }, { "epoch": 2.005120546191594, "grad_norm": 3.8515255451202393, "learning_rate": 7.373577749683945e-06, "loss": 0.3864, "step": 4700 }, { "epoch": 2.007254107104758, "grad_norm": 3.965477466583252, "learning_rate": 7.357774968394438e-06, "loss": 0.3213, "step": 4705 }, { "epoch": 2.009387668017922, "grad_norm": 4.251551628112793, "learning_rate": 7.341972187104931e-06, "loss": 0.3773, "step": 4710 }, { "epoch": 2.011521228931086, "grad_norm": 4.710624694824219, "learning_rate": 7.326169405815424e-06, "loss": 0.4576, "step": 4715 }, { "epoch": 2.01365478984425, "grad_norm": 3.9441611766815186, "learning_rate": 7.310366624525917e-06, "loss": 0.3428, "step": 4720 }, { "epoch": 2.015788350757414, "grad_norm": 3.8340277671813965, "learning_rate": 7.294563843236411e-06, "loss": 0.4218, "step": 4725 }, { "epoch": 2.017921911670578, "grad_norm": 5.2679219245910645, "learning_rate": 7.278761061946903e-06, "loss": 0.4831, "step": 4730 }, { "epoch": 2.020055472583742, "grad_norm": 4.511500835418701, "learning_rate": 7.262958280657396e-06, "loss": 0.4228, "step": 4735 }, { "epoch": 2.022189033496906, "grad_norm": 3.8335185050964355, "learning_rate": 7.2471554993678896e-06, "loss": 0.3894, "step": 4740 }, { "epoch": 2.0243225944100702, "grad_norm": 3.6677894592285156, "learning_rate": 7.2313527180783824e-06, "loss": 0.4975, "step": 4745 }, { "epoch": 2.0264561553232343, "grad_norm": 3.9892594814300537, "learning_rate": 7.215549936788876e-06, "loss": 0.3693, "step": 4750 }, { "epoch": 2.0285897162363984, "grad_norm": 3.551366090774536, "learning_rate": 7.199747155499368e-06, "loss": 0.3899, "step": 4755 }, { "epoch": 2.0307232771495625, "grad_norm": 3.675837516784668, "learning_rate": 7.183944374209861e-06, "loss": 0.365, "step": 4760 }, { "epoch": 2.0328568380627265, "grad_norm": 4.143002033233643, "learning_rate": 7.168141592920355e-06, "loss": 0.4794, "step": 4765 }, { "epoch": 2.0349903989758906, "grad_norm": 3.7219038009643555, "learning_rate": 7.152338811630848e-06, "loss": 0.4105, "step": 4770 }, { "epoch": 2.0371239598890547, "grad_norm": 4.915626525878906, "learning_rate": 7.136536030341341e-06, "loss": 0.4166, "step": 4775 }, { "epoch": 2.039257520802219, "grad_norm": 4.077551364898682, "learning_rate": 7.120733249051833e-06, "loss": 0.3861, "step": 4780 }, { "epoch": 2.041391081715383, "grad_norm": 4.394146919250488, "learning_rate": 7.104930467762326e-06, "loss": 0.4576, "step": 4785 }, { "epoch": 2.043524642628547, "grad_norm": 4.272018909454346, "learning_rate": 7.08912768647282e-06, "loss": 0.4476, "step": 4790 }, { "epoch": 2.045658203541711, "grad_norm": 4.265909194946289, "learning_rate": 7.073324905183313e-06, "loss": 0.4411, "step": 4795 }, { "epoch": 2.047791764454875, "grad_norm": 3.3345210552215576, "learning_rate": 7.057522123893807e-06, "loss": 0.3745, "step": 4800 }, { "epoch": 2.049925325368039, "grad_norm": 3.579895496368408, "learning_rate": 7.041719342604299e-06, "loss": 0.5016, "step": 4805 }, { "epoch": 2.0520588862812033, "grad_norm": 3.8164703845977783, "learning_rate": 7.0259165613147915e-06, "loss": 0.4101, "step": 4810 }, { "epoch": 2.0541924471943673, "grad_norm": 3.7674551010131836, "learning_rate": 7.010113780025285e-06, "loss": 0.4508, "step": 4815 }, { "epoch": 2.0563260081075314, "grad_norm": 4.42473030090332, "learning_rate": 6.994310998735778e-06, "loss": 0.4173, "step": 4820 }, { "epoch": 2.0584595690206955, "grad_norm": 3.532865285873413, "learning_rate": 6.978508217446272e-06, "loss": 0.4008, "step": 4825 }, { "epoch": 2.0605931299338596, "grad_norm": 3.7293918132781982, "learning_rate": 6.962705436156764e-06, "loss": 0.4128, "step": 4830 }, { "epoch": 2.0627266908470236, "grad_norm": 4.818244457244873, "learning_rate": 6.946902654867257e-06, "loss": 0.4003, "step": 4835 }, { "epoch": 2.0648602517601877, "grad_norm": 3.0435068607330322, "learning_rate": 6.9310998735777505e-06, "loss": 0.389, "step": 4840 }, { "epoch": 2.066993812673352, "grad_norm": 3.7885820865631104, "learning_rate": 6.915297092288243e-06, "loss": 0.3549, "step": 4845 }, { "epoch": 2.069127373586516, "grad_norm": 3.8366339206695557, "learning_rate": 6.899494310998737e-06, "loss": 0.4706, "step": 4850 }, { "epoch": 2.07126093449968, "grad_norm": 4.349545955657959, "learning_rate": 6.88369152970923e-06, "loss": 0.398, "step": 4855 }, { "epoch": 2.073394495412844, "grad_norm": 4.000646591186523, "learning_rate": 6.867888748419722e-06, "loss": 0.3775, "step": 4860 }, { "epoch": 2.075528056326008, "grad_norm": 4.560521602630615, "learning_rate": 6.852085967130216e-06, "loss": 0.4547, "step": 4865 }, { "epoch": 2.077661617239172, "grad_norm": 4.749170780181885, "learning_rate": 6.8362831858407086e-06, "loss": 0.4048, "step": 4870 }, { "epoch": 2.0797951781523363, "grad_norm": 4.038886070251465, "learning_rate": 6.820480404551202e-06, "loss": 0.4562, "step": 4875 }, { "epoch": 2.0819287390655004, "grad_norm": 4.166180610656738, "learning_rate": 6.804677623261695e-06, "loss": 0.4327, "step": 4880 }, { "epoch": 2.0840622999786644, "grad_norm": 4.0146050453186035, "learning_rate": 6.788874841972187e-06, "loss": 0.49, "step": 4885 }, { "epoch": 2.0861958608918285, "grad_norm": 3.8984649181365967, "learning_rate": 6.773072060682681e-06, "loss": 0.3935, "step": 4890 }, { "epoch": 2.0883294218049926, "grad_norm": 4.444411754608154, "learning_rate": 6.757269279393174e-06, "loss": 0.3395, "step": 4895 }, { "epoch": 2.0904629827181567, "grad_norm": 4.304215908050537, "learning_rate": 6.741466498103667e-06, "loss": 0.4126, "step": 4900 }, { "epoch": 2.0925965436313207, "grad_norm": 4.9501118659973145, "learning_rate": 6.72566371681416e-06, "loss": 0.5132, "step": 4905 }, { "epoch": 2.094730104544485, "grad_norm": 3.9908359050750732, "learning_rate": 6.709860935524652e-06, "loss": 0.4168, "step": 4910 }, { "epoch": 2.096863665457649, "grad_norm": 3.9207379817962646, "learning_rate": 6.694058154235146e-06, "loss": 0.4265, "step": 4915 }, { "epoch": 2.098997226370813, "grad_norm": 3.6123504638671875, "learning_rate": 6.678255372945639e-06, "loss": 0.4056, "step": 4920 }, { "epoch": 2.101130787283977, "grad_norm": 3.6765084266662598, "learning_rate": 6.662452591656132e-06, "loss": 0.391, "step": 4925 }, { "epoch": 2.103264348197141, "grad_norm": 3.6983890533447266, "learning_rate": 6.646649810366626e-06, "loss": 0.452, "step": 4930 }, { "epoch": 2.105397909110305, "grad_norm": 4.123793125152588, "learning_rate": 6.630847029077118e-06, "loss": 0.3674, "step": 4935 }, { "epoch": 2.1075314700234693, "grad_norm": 4.987173080444336, "learning_rate": 6.6150442477876105e-06, "loss": 0.4484, "step": 4940 }, { "epoch": 2.1096650309366334, "grad_norm": 4.321348667144775, "learning_rate": 6.599241466498104e-06, "loss": 0.4447, "step": 4945 }, { "epoch": 2.1117985918497975, "grad_norm": 3.74847149848938, "learning_rate": 6.583438685208597e-06, "loss": 0.4224, "step": 4950 }, { "epoch": 2.1139321527629615, "grad_norm": 4.59506368637085, "learning_rate": 6.567635903919091e-06, "loss": 0.4074, "step": 4955 }, { "epoch": 2.1160657136761256, "grad_norm": 5.061137676239014, "learning_rate": 6.551833122629583e-06, "loss": 0.4031, "step": 4960 }, { "epoch": 2.1181992745892897, "grad_norm": 4.153493881225586, "learning_rate": 6.536030341340076e-06, "loss": 0.4308, "step": 4965 }, { "epoch": 2.1203328355024538, "grad_norm": 4.935019493103027, "learning_rate": 6.5202275600505694e-06, "loss": 0.5257, "step": 4970 }, { "epoch": 2.122466396415618, "grad_norm": 4.236289978027344, "learning_rate": 6.504424778761062e-06, "loss": 0.4075, "step": 4975 }, { "epoch": 2.124599957328782, "grad_norm": 4.530546188354492, "learning_rate": 6.488621997471556e-06, "loss": 0.3787, "step": 4980 }, { "epoch": 2.126733518241946, "grad_norm": 3.604985237121582, "learning_rate": 6.472819216182049e-06, "loss": 0.415, "step": 4985 }, { "epoch": 2.1288670791551096, "grad_norm": 3.9142749309539795, "learning_rate": 6.457016434892541e-06, "loss": 0.4389, "step": 4990 }, { "epoch": 2.131000640068274, "grad_norm": 4.416208744049072, "learning_rate": 6.441213653603035e-06, "loss": 0.4248, "step": 4995 }, { "epoch": 2.133134200981438, "grad_norm": 4.188726902008057, "learning_rate": 6.4254108723135275e-06, "loss": 0.4361, "step": 5000 }, { "epoch": 2.1352677618946023, "grad_norm": 3.7509398460388184, "learning_rate": 6.409608091024021e-06, "loss": 0.3702, "step": 5005 }, { "epoch": 2.137401322807766, "grad_norm": 3.6640818119049072, "learning_rate": 6.393805309734514e-06, "loss": 0.397, "step": 5010 }, { "epoch": 2.13953488372093, "grad_norm": 3.572796106338501, "learning_rate": 6.378002528445006e-06, "loss": 0.4284, "step": 5015 }, { "epoch": 2.141668444634094, "grad_norm": 3.946179151535034, "learning_rate": 6.3621997471555e-06, "loss": 0.3721, "step": 5020 }, { "epoch": 2.143802005547258, "grad_norm": 4.065572738647461, "learning_rate": 6.346396965865993e-06, "loss": 0.3834, "step": 5025 }, { "epoch": 2.1459355664604223, "grad_norm": 4.278841018676758, "learning_rate": 6.3305941845764865e-06, "loss": 0.3937, "step": 5030 }, { "epoch": 2.1480691273735864, "grad_norm": 3.989811658859253, "learning_rate": 6.314791403286979e-06, "loss": 0.3983, "step": 5035 }, { "epoch": 2.1502026882867504, "grad_norm": 3.7060208320617676, "learning_rate": 6.298988621997471e-06, "loss": 0.3478, "step": 5040 }, { "epoch": 2.1523362491999145, "grad_norm": 4.620419502258301, "learning_rate": 6.283185840707965e-06, "loss": 0.4092, "step": 5045 }, { "epoch": 2.1544698101130786, "grad_norm": 3.965667247772217, "learning_rate": 6.267383059418458e-06, "loss": 0.4197, "step": 5050 }, { "epoch": 2.1566033710262427, "grad_norm": 3.439267158508301, "learning_rate": 6.251580278128952e-06, "loss": 0.38, "step": 5055 }, { "epoch": 2.1587369319394067, "grad_norm": 3.8684778213500977, "learning_rate": 6.2357774968394446e-06, "loss": 0.3638, "step": 5060 }, { "epoch": 2.160870492852571, "grad_norm": 4.314586639404297, "learning_rate": 6.219974715549937e-06, "loss": 0.4944, "step": 5065 }, { "epoch": 2.163004053765735, "grad_norm": 4.4946722984313965, "learning_rate": 6.20417193426043e-06, "loss": 0.4456, "step": 5070 }, { "epoch": 2.165137614678899, "grad_norm": 4.2454071044921875, "learning_rate": 6.188369152970923e-06, "loss": 0.4251, "step": 5075 }, { "epoch": 2.167271175592063, "grad_norm": 4.110542297363281, "learning_rate": 6.172566371681417e-06, "loss": 0.4029, "step": 5080 }, { "epoch": 2.169404736505227, "grad_norm": 4.185825347900391, "learning_rate": 6.15676359039191e-06, "loss": 0.3532, "step": 5085 }, { "epoch": 2.171538297418391, "grad_norm": 3.9194812774658203, "learning_rate": 6.140960809102402e-06, "loss": 0.3824, "step": 5090 }, { "epoch": 2.1736718583315553, "grad_norm": 3.871528387069702, "learning_rate": 6.1251580278128955e-06, "loss": 0.4084, "step": 5095 }, { "epoch": 2.1758054192447194, "grad_norm": 4.807036876678467, "learning_rate": 6.109355246523388e-06, "loss": 0.4154, "step": 5100 }, { "epoch": 2.1779389801578835, "grad_norm": 3.8847007751464844, "learning_rate": 6.093552465233882e-06, "loss": 0.3933, "step": 5105 }, { "epoch": 2.1800725410710475, "grad_norm": 3.88389253616333, "learning_rate": 6.077749683944375e-06, "loss": 0.392, "step": 5110 }, { "epoch": 2.1822061019842116, "grad_norm": 3.9758973121643066, "learning_rate": 6.061946902654868e-06, "loss": 0.3911, "step": 5115 }, { "epoch": 2.1843396628973757, "grad_norm": 6.718324661254883, "learning_rate": 6.046144121365361e-06, "loss": 0.338, "step": 5120 }, { "epoch": 2.1864732238105398, "grad_norm": 4.2538981437683105, "learning_rate": 6.030341340075854e-06, "loss": 0.3823, "step": 5125 }, { "epoch": 2.188606784723704, "grad_norm": 4.390100955963135, "learning_rate": 6.014538558786347e-06, "loss": 0.4173, "step": 5130 }, { "epoch": 2.190740345636868, "grad_norm": 3.906677722930908, "learning_rate": 5.99873577749684e-06, "loss": 0.3648, "step": 5135 }, { "epoch": 2.192873906550032, "grad_norm": 4.540939807891846, "learning_rate": 5.982932996207333e-06, "loss": 0.4877, "step": 5140 }, { "epoch": 2.195007467463196, "grad_norm": 4.823376655578613, "learning_rate": 5.967130214917826e-06, "loss": 0.4069, "step": 5145 }, { "epoch": 2.19714102837636, "grad_norm": 3.3051562309265137, "learning_rate": 5.951327433628319e-06, "loss": 0.447, "step": 5150 }, { "epoch": 2.1992745892895242, "grad_norm": 4.335805892944336, "learning_rate": 5.935524652338812e-06, "loss": 0.4186, "step": 5155 }, { "epoch": 2.2014081502026883, "grad_norm": 4.150125980377197, "learning_rate": 5.9197218710493054e-06, "loss": 0.4088, "step": 5160 }, { "epoch": 2.2035417111158524, "grad_norm": 6.423460960388184, "learning_rate": 5.903919089759798e-06, "loss": 0.5046, "step": 5165 }, { "epoch": 2.2056752720290165, "grad_norm": 4.221232891082764, "learning_rate": 5.888116308470291e-06, "loss": 0.409, "step": 5170 }, { "epoch": 2.2078088329421806, "grad_norm": 4.568662643432617, "learning_rate": 5.872313527180784e-06, "loss": 0.4102, "step": 5175 }, { "epoch": 2.2099423938553446, "grad_norm": 4.5810089111328125, "learning_rate": 5.856510745891277e-06, "loss": 0.4145, "step": 5180 }, { "epoch": 2.2120759547685087, "grad_norm": 3.7417185306549072, "learning_rate": 5.840707964601771e-06, "loss": 0.4229, "step": 5185 }, { "epoch": 2.214209515681673, "grad_norm": 3.5175845623016357, "learning_rate": 5.8249051833122635e-06, "loss": 0.4613, "step": 5190 }, { "epoch": 2.216343076594837, "grad_norm": 3.3224520683288574, "learning_rate": 5.809102402022756e-06, "loss": 0.413, "step": 5195 }, { "epoch": 2.218476637508001, "grad_norm": 3.738093852996826, "learning_rate": 5.793299620733249e-06, "loss": 0.3725, "step": 5200 }, { "epoch": 2.220610198421165, "grad_norm": 4.55635929107666, "learning_rate": 5.777496839443742e-06, "loss": 0.4417, "step": 5205 }, { "epoch": 2.222743759334329, "grad_norm": 3.722395658493042, "learning_rate": 5.761694058154236e-06, "loss": 0.3696, "step": 5210 }, { "epoch": 2.224877320247493, "grad_norm": 4.449429988861084, "learning_rate": 5.745891276864729e-06, "loss": 0.4294, "step": 5215 }, { "epoch": 2.2270108811606573, "grad_norm": 5.1202778816223145, "learning_rate": 5.730088495575221e-06, "loss": 0.4408, "step": 5220 }, { "epoch": 2.2291444420738213, "grad_norm": 4.097009181976318, "learning_rate": 5.7142857142857145e-06, "loss": 0.4007, "step": 5225 }, { "epoch": 2.2312780029869854, "grad_norm": 3.9691903591156006, "learning_rate": 5.698482932996207e-06, "loss": 0.4927, "step": 5230 }, { "epoch": 2.2334115639001495, "grad_norm": 4.4439921379089355, "learning_rate": 5.682680151706701e-06, "loss": 0.3488, "step": 5235 }, { "epoch": 2.2355451248133136, "grad_norm": 3.679431438446045, "learning_rate": 5.666877370417194e-06, "loss": 0.4898, "step": 5240 }, { "epoch": 2.2376786857264777, "grad_norm": 4.508449077606201, "learning_rate": 5.651074589127688e-06, "loss": 0.43, "step": 5245 }, { "epoch": 2.2398122466396417, "grad_norm": 4.32645320892334, "learning_rate": 5.63527180783818e-06, "loss": 0.4281, "step": 5250 }, { "epoch": 2.241945807552806, "grad_norm": 3.590062141418457, "learning_rate": 5.619469026548673e-06, "loss": 0.4307, "step": 5255 }, { "epoch": 2.24407936846597, "grad_norm": 4.356314182281494, "learning_rate": 5.603666245259166e-06, "loss": 0.4487, "step": 5260 }, { "epoch": 2.246212929379134, "grad_norm": 4.716103553771973, "learning_rate": 5.587863463969659e-06, "loss": 0.4379, "step": 5265 }, { "epoch": 2.248346490292298, "grad_norm": 4.041739463806152, "learning_rate": 5.572060682680153e-06, "loss": 0.3901, "step": 5270 }, { "epoch": 2.2504800512054617, "grad_norm": 3.673758029937744, "learning_rate": 5.556257901390645e-06, "loss": 0.4434, "step": 5275 }, { "epoch": 2.252613612118626, "grad_norm": 4.207825660705566, "learning_rate": 5.540455120101138e-06, "loss": 0.3758, "step": 5280 }, { "epoch": 2.25474717303179, "grad_norm": 4.471085548400879, "learning_rate": 5.5246523388116315e-06, "loss": 0.3693, "step": 5285 }, { "epoch": 2.2568807339449544, "grad_norm": 4.205175399780273, "learning_rate": 5.508849557522124e-06, "loss": 0.3965, "step": 5290 }, { "epoch": 2.259014294858118, "grad_norm": 4.6766533851623535, "learning_rate": 5.493046776232618e-06, "loss": 0.3508, "step": 5295 }, { "epoch": 2.261147855771282, "grad_norm": 4.840975284576416, "learning_rate": 5.47724399494311e-06, "loss": 0.4771, "step": 5300 }, { "epoch": 2.263281416684446, "grad_norm": 3.9258434772491455, "learning_rate": 5.461441213653603e-06, "loss": 0.3601, "step": 5305 }, { "epoch": 2.2654149775976102, "grad_norm": 4.151968479156494, "learning_rate": 5.445638432364097e-06, "loss": 0.4161, "step": 5310 }, { "epoch": 2.2675485385107743, "grad_norm": 4.543685436248779, "learning_rate": 5.42983565107459e-06, "loss": 0.4813, "step": 5315 }, { "epoch": 2.2696820994239384, "grad_norm": 3.733811140060425, "learning_rate": 5.414032869785083e-06, "loss": 0.3715, "step": 5320 }, { "epoch": 2.2718156603371025, "grad_norm": 3.2990171909332275, "learning_rate": 5.398230088495575e-06, "loss": 0.3873, "step": 5325 }, { "epoch": 2.2739492212502666, "grad_norm": 4.1041436195373535, "learning_rate": 5.382427307206068e-06, "loss": 0.4092, "step": 5330 }, { "epoch": 2.2760827821634306, "grad_norm": 4.364267349243164, "learning_rate": 5.366624525916562e-06, "loss": 0.4073, "step": 5335 }, { "epoch": 2.2782163430765947, "grad_norm": 3.123594045639038, "learning_rate": 5.350821744627055e-06, "loss": 0.3975, "step": 5340 }, { "epoch": 2.280349903989759, "grad_norm": 4.143852710723877, "learning_rate": 5.335018963337549e-06, "loss": 0.3485, "step": 5345 }, { "epoch": 2.282483464902923, "grad_norm": 4.994611740112305, "learning_rate": 5.319216182048041e-06, "loss": 0.3857, "step": 5350 }, { "epoch": 2.284617025816087, "grad_norm": 4.080162525177002, "learning_rate": 5.3034134007585335e-06, "loss": 0.3158, "step": 5355 }, { "epoch": 2.286750586729251, "grad_norm": 4.270036220550537, "learning_rate": 5.287610619469027e-06, "loss": 0.4476, "step": 5360 }, { "epoch": 2.288884147642415, "grad_norm": 4.030431270599365, "learning_rate": 5.27180783817952e-06, "loss": 0.4784, "step": 5365 }, { "epoch": 2.291017708555579, "grad_norm": 4.152432441711426, "learning_rate": 5.256005056890014e-06, "loss": 0.4499, "step": 5370 }, { "epoch": 2.2931512694687433, "grad_norm": 4.297497749328613, "learning_rate": 5.240202275600507e-06, "loss": 0.3852, "step": 5375 }, { "epoch": 2.2952848303819073, "grad_norm": 4.033621311187744, "learning_rate": 5.224399494310999e-06, "loss": 0.4927, "step": 5380 }, { "epoch": 2.2974183912950714, "grad_norm": 4.530407428741455, "learning_rate": 5.2085967130214924e-06, "loss": 0.5091, "step": 5385 }, { "epoch": 2.2995519522082355, "grad_norm": 4.24984073638916, "learning_rate": 5.192793931731985e-06, "loss": 0.4201, "step": 5390 }, { "epoch": 2.3016855131213996, "grad_norm": 4.040515899658203, "learning_rate": 5.176991150442478e-06, "loss": 0.406, "step": 5395 }, { "epoch": 2.3038190740345637, "grad_norm": 4.016532897949219, "learning_rate": 5.161188369152972e-06, "loss": 0.3989, "step": 5400 }, { "epoch": 2.3059526349477277, "grad_norm": 4.394332408905029, "learning_rate": 5.145385587863464e-06, "loss": 0.4134, "step": 5405 }, { "epoch": 2.308086195860892, "grad_norm": 4.987480640411377, "learning_rate": 5.129582806573958e-06, "loss": 0.4595, "step": 5410 }, { "epoch": 2.310219756774056, "grad_norm": 4.412329196929932, "learning_rate": 5.1137800252844505e-06, "loss": 0.4157, "step": 5415 }, { "epoch": 2.31235331768722, "grad_norm": 4.324588775634766, "learning_rate": 5.097977243994943e-06, "loss": 0.4703, "step": 5420 }, { "epoch": 2.314486878600384, "grad_norm": 4.05607795715332, "learning_rate": 5.082174462705437e-06, "loss": 0.4543, "step": 5425 }, { "epoch": 2.316620439513548, "grad_norm": 4.508897304534912, "learning_rate": 5.066371681415929e-06, "loss": 0.4207, "step": 5430 }, { "epoch": 2.318754000426712, "grad_norm": 4.336424350738525, "learning_rate": 5.050568900126422e-06, "loss": 0.4049, "step": 5435 }, { "epoch": 2.3208875613398763, "grad_norm": 4.038027286529541, "learning_rate": 5.034766118836916e-06, "loss": 0.4193, "step": 5440 }, { "epoch": 2.3230211222530404, "grad_norm": 3.8997299671173096, "learning_rate": 5.018963337547409e-06, "loss": 0.3917, "step": 5445 }, { "epoch": 2.3251546831662044, "grad_norm": 3.959617853164673, "learning_rate": 5.003160556257902e-06, "loss": 0.4742, "step": 5450 }, { "epoch": 2.3272882440793685, "grad_norm": 4.371520042419434, "learning_rate": 4.987357774968395e-06, "loss": 0.3965, "step": 5455 }, { "epoch": 2.3294218049925326, "grad_norm": 4.2197394371032715, "learning_rate": 4.971554993678888e-06, "loss": 0.3876, "step": 5460 }, { "epoch": 2.3315553659056967, "grad_norm": 3.5643868446350098, "learning_rate": 4.955752212389381e-06, "loss": 0.4568, "step": 5465 }, { "epoch": 2.3336889268188608, "grad_norm": 4.49404764175415, "learning_rate": 4.939949431099874e-06, "loss": 0.418, "step": 5470 }, { "epoch": 2.335822487732025, "grad_norm": 3.6703479290008545, "learning_rate": 4.924146649810367e-06, "loss": 0.3959, "step": 5475 }, { "epoch": 2.337956048645189, "grad_norm": 5.356283664703369, "learning_rate": 4.9083438685208604e-06, "loss": 0.4605, "step": 5480 }, { "epoch": 2.340089609558353, "grad_norm": 4.413150310516357, "learning_rate": 4.892541087231353e-06, "loss": 0.3838, "step": 5485 }, { "epoch": 2.342223170471517, "grad_norm": 5.07502555847168, "learning_rate": 4.876738305941846e-06, "loss": 0.4498, "step": 5490 }, { "epoch": 2.344356731384681, "grad_norm": 4.0401458740234375, "learning_rate": 4.860935524652339e-06, "loss": 0.4091, "step": 5495 }, { "epoch": 2.3464902922978452, "grad_norm": 3.7641711235046387, "learning_rate": 4.845132743362832e-06, "loss": 0.3803, "step": 5500 }, { "epoch": 2.3486238532110093, "grad_norm": 4.7735595703125, "learning_rate": 4.829329962073326e-06, "loss": 0.3776, "step": 5505 }, { "epoch": 2.3507574141241734, "grad_norm": 3.3556971549987793, "learning_rate": 4.8135271807838185e-06, "loss": 0.4464, "step": 5510 }, { "epoch": 2.3528909750373375, "grad_norm": 3.9931693077087402, "learning_rate": 4.797724399494311e-06, "loss": 0.461, "step": 5515 }, { "epoch": 2.3550245359505015, "grad_norm": 4.047415733337402, "learning_rate": 4.781921618204804e-06, "loss": 0.4251, "step": 5520 }, { "epoch": 2.3571580968636656, "grad_norm": 3.7913167476654053, "learning_rate": 4.766118836915298e-06, "loss": 0.3529, "step": 5525 }, { "epoch": 2.3592916577768297, "grad_norm": 3.9271814823150635, "learning_rate": 4.750316055625791e-06, "loss": 0.3897, "step": 5530 }, { "epoch": 2.3614252186899938, "grad_norm": 3.8952436447143555, "learning_rate": 4.734513274336284e-06, "loss": 0.3916, "step": 5535 }, { "epoch": 2.363558779603158, "grad_norm": 4.268993854522705, "learning_rate": 4.718710493046777e-06, "loss": 0.3637, "step": 5540 }, { "epoch": 2.365692340516322, "grad_norm": 4.283237934112549, "learning_rate": 4.7029077117572695e-06, "loss": 0.387, "step": 5545 }, { "epoch": 2.3678259014294856, "grad_norm": 4.506287097930908, "learning_rate": 4.687104930467763e-06, "loss": 0.3847, "step": 5550 }, { "epoch": 2.36995946234265, "grad_norm": 4.476803779602051, "learning_rate": 4.671302149178255e-06, "loss": 0.4186, "step": 5555 }, { "epoch": 2.3720930232558137, "grad_norm": 4.122984409332275, "learning_rate": 4.655499367888749e-06, "loss": 0.4446, "step": 5560 }, { "epoch": 2.3742265841689782, "grad_norm": 4.000293731689453, "learning_rate": 4.639696586599242e-06, "loss": 0.3906, "step": 5565 }, { "epoch": 2.376360145082142, "grad_norm": 3.5631563663482666, "learning_rate": 4.623893805309735e-06, "loss": 0.3741, "step": 5570 }, { "epoch": 2.3784937059953064, "grad_norm": 3.814074754714966, "learning_rate": 4.6080910240202284e-06, "loss": 0.385, "step": 5575 }, { "epoch": 2.38062726690847, "grad_norm": 3.454033374786377, "learning_rate": 4.5922882427307205e-06, "loss": 0.3454, "step": 5580 }, { "epoch": 2.382760827821634, "grad_norm": 4.314971446990967, "learning_rate": 4.576485461441214e-06, "loss": 0.4328, "step": 5585 }, { "epoch": 2.384894388734798, "grad_norm": 4.24696683883667, "learning_rate": 4.560682680151707e-06, "loss": 0.3027, "step": 5590 }, { "epoch": 2.3870279496479623, "grad_norm": 3.906006336212158, "learning_rate": 4.5448798988622e-06, "loss": 0.3366, "step": 5595 }, { "epoch": 2.3891615105611264, "grad_norm": 4.338901996612549, "learning_rate": 4.529077117572694e-06, "loss": 0.4076, "step": 5600 }, { "epoch": 2.3912950714742904, "grad_norm": 3.6618947982788086, "learning_rate": 4.513274336283186e-06, "loss": 0.3898, "step": 5605 }, { "epoch": 2.3934286323874545, "grad_norm": 4.170609474182129, "learning_rate": 4.497471554993679e-06, "loss": 0.3639, "step": 5610 }, { "epoch": 2.3955621933006186, "grad_norm": 4.2701287269592285, "learning_rate": 4.481668773704172e-06, "loss": 0.4414, "step": 5615 }, { "epoch": 2.3976957542137827, "grad_norm": 3.552507162094116, "learning_rate": 4.465865992414665e-06, "loss": 0.3376, "step": 5620 }, { "epoch": 2.3998293151269467, "grad_norm": 3.7421629428863525, "learning_rate": 4.450063211125159e-06, "loss": 0.3839, "step": 5625 }, { "epoch": 2.401962876040111, "grad_norm": 3.554124593734741, "learning_rate": 4.434260429835651e-06, "loss": 0.3679, "step": 5630 }, { "epoch": 2.404096436953275, "grad_norm": 4.3994140625, "learning_rate": 4.418457648546145e-06, "loss": 0.3895, "step": 5635 }, { "epoch": 2.406229997866439, "grad_norm": 3.282832622528076, "learning_rate": 4.4026548672566375e-06, "loss": 0.3644, "step": 5640 }, { "epoch": 2.408363558779603, "grad_norm": 4.626904010772705, "learning_rate": 4.38685208596713e-06, "loss": 0.4035, "step": 5645 }, { "epoch": 2.410497119692767, "grad_norm": 4.141104698181152, "learning_rate": 4.371049304677623e-06, "loss": 0.5727, "step": 5650 }, { "epoch": 2.412630680605931, "grad_norm": 3.8444101810455322, "learning_rate": 4.355246523388117e-06, "loss": 0.4091, "step": 5655 }, { "epoch": 2.4147642415190953, "grad_norm": 3.6316781044006348, "learning_rate": 4.33944374209861e-06, "loss": 0.4028, "step": 5660 }, { "epoch": 2.4168978024322594, "grad_norm": 4.120214462280273, "learning_rate": 4.323640960809103e-06, "loss": 0.4611, "step": 5665 }, { "epoch": 2.4190313633454235, "grad_norm": 4.207626819610596, "learning_rate": 4.307838179519596e-06, "loss": 0.4054, "step": 5670 }, { "epoch": 2.4211649242585875, "grad_norm": 4.325310707092285, "learning_rate": 4.2920353982300885e-06, "loss": 0.489, "step": 5675 }, { "epoch": 2.4232984851717516, "grad_norm": 3.8266632556915283, "learning_rate": 4.276232616940582e-06, "loss": 0.3478, "step": 5680 }, { "epoch": 2.4254320460849157, "grad_norm": 4.00970983505249, "learning_rate": 4.260429835651075e-06, "loss": 0.3713, "step": 5685 }, { "epoch": 2.4275656069980798, "grad_norm": 3.556880235671997, "learning_rate": 4.244627054361568e-06, "loss": 0.4119, "step": 5690 }, { "epoch": 2.429699167911244, "grad_norm": 4.504357814788818, "learning_rate": 4.228824273072061e-06, "loss": 0.4012, "step": 5695 }, { "epoch": 2.431832728824408, "grad_norm": 4.244117736816406, "learning_rate": 4.213021491782554e-06, "loss": 0.3511, "step": 5700 }, { "epoch": 2.433966289737572, "grad_norm": 4.272021770477295, "learning_rate": 4.197218710493047e-06, "loss": 0.466, "step": 5705 }, { "epoch": 2.436099850650736, "grad_norm": 3.49529767036438, "learning_rate": 4.18141592920354e-06, "loss": 0.4053, "step": 5710 }, { "epoch": 2.4382334115639, "grad_norm": 3.5074594020843506, "learning_rate": 4.165613147914033e-06, "loss": 0.3865, "step": 5715 }, { "epoch": 2.4403669724770642, "grad_norm": 4.044558525085449, "learning_rate": 4.149810366624527e-06, "loss": 0.3319, "step": 5720 }, { "epoch": 2.4425005333902283, "grad_norm": 4.200631618499756, "learning_rate": 4.134007585335019e-06, "loss": 0.3993, "step": 5725 }, { "epoch": 2.4446340943033924, "grad_norm": 4.75645112991333, "learning_rate": 4.118204804045513e-06, "loss": 0.4318, "step": 5730 }, { "epoch": 2.4467676552165565, "grad_norm": 4.220727920532227, "learning_rate": 4.1024020227560055e-06, "loss": 0.4326, "step": 5735 }, { "epoch": 2.4489012161297206, "grad_norm": 3.325622797012329, "learning_rate": 4.086599241466498e-06, "loss": 0.3337, "step": 5740 }, { "epoch": 2.4510347770428846, "grad_norm": 3.2215867042541504, "learning_rate": 4.070796460176992e-06, "loss": 0.3598, "step": 5745 }, { "epoch": 2.4531683379560487, "grad_norm": 3.8645408153533936, "learning_rate": 4.054993678887484e-06, "loss": 0.3828, "step": 5750 }, { "epoch": 2.455301898869213, "grad_norm": 3.710660696029663, "learning_rate": 4.039190897597978e-06, "loss": 0.3431, "step": 5755 }, { "epoch": 2.457435459782377, "grad_norm": 4.17116641998291, "learning_rate": 4.023388116308471e-06, "loss": 0.4396, "step": 5760 }, { "epoch": 2.459569020695541, "grad_norm": 3.2954678535461426, "learning_rate": 4.007585335018964e-06, "loss": 0.3572, "step": 5765 }, { "epoch": 2.461702581608705, "grad_norm": 3.6551170349121094, "learning_rate": 3.9917825537294565e-06, "loss": 0.3988, "step": 5770 }, { "epoch": 2.463836142521869, "grad_norm": 5.171153545379639, "learning_rate": 3.975979772439949e-06, "loss": 0.4699, "step": 5775 }, { "epoch": 2.465969703435033, "grad_norm": 4.080657482147217, "learning_rate": 3.960176991150443e-06, "loss": 0.4798, "step": 5780 }, { "epoch": 2.4681032643481973, "grad_norm": 3.91977596282959, "learning_rate": 3.944374209860936e-06, "loss": 0.4607, "step": 5785 }, { "epoch": 2.4702368252613613, "grad_norm": 3.9406278133392334, "learning_rate": 3.928571428571429e-06, "loss": 0.3383, "step": 5790 }, { "epoch": 2.4723703861745254, "grad_norm": 3.8521249294281006, "learning_rate": 3.912768647281922e-06, "loss": 0.3431, "step": 5795 }, { "epoch": 2.4745039470876895, "grad_norm": 4.789560794830322, "learning_rate": 3.896965865992415e-06, "loss": 0.4159, "step": 5800 }, { "epoch": 2.4766375080008536, "grad_norm": 4.372644901275635, "learning_rate": 3.881163084702908e-06, "loss": 0.5409, "step": 5805 }, { "epoch": 2.4787710689140177, "grad_norm": 4.152048587799072, "learning_rate": 3.865360303413401e-06, "loss": 0.4068, "step": 5810 }, { "epoch": 2.4809046298271817, "grad_norm": 4.586190223693848, "learning_rate": 3.849557522123894e-06, "loss": 0.4264, "step": 5815 }, { "epoch": 2.483038190740346, "grad_norm": 4.218143463134766, "learning_rate": 3.833754740834387e-06, "loss": 0.4537, "step": 5820 }, { "epoch": 2.48517175165351, "grad_norm": 4.002030372619629, "learning_rate": 3.81795195954488e-06, "loss": 0.3723, "step": 5825 }, { "epoch": 2.487305312566674, "grad_norm": 4.468664646148682, "learning_rate": 3.802149178255373e-06, "loss": 0.3822, "step": 5830 }, { "epoch": 2.4894388734798376, "grad_norm": 3.4531240463256836, "learning_rate": 3.7863463969658664e-06, "loss": 0.3885, "step": 5835 }, { "epoch": 2.491572434393002, "grad_norm": 4.368101596832275, "learning_rate": 3.7705436156763593e-06, "loss": 0.3656, "step": 5840 }, { "epoch": 2.4937059953061658, "grad_norm": 4.486471176147461, "learning_rate": 3.7547408343868526e-06, "loss": 0.4383, "step": 5845 }, { "epoch": 2.4958395562193303, "grad_norm": 3.639932632446289, "learning_rate": 3.738938053097346e-06, "loss": 0.3617, "step": 5850 }, { "epoch": 2.497973117132494, "grad_norm": 4.590126991271973, "learning_rate": 3.7231352718078383e-06, "loss": 0.3459, "step": 5855 }, { "epoch": 2.5001066780456584, "grad_norm": 4.252352237701416, "learning_rate": 3.7073324905183316e-06, "loss": 0.4128, "step": 5860 }, { "epoch": 2.502240238958822, "grad_norm": 5.0256805419921875, "learning_rate": 3.6915297092288245e-06, "loss": 0.347, "step": 5865 }, { "epoch": 2.5043737998719866, "grad_norm": 4.06841516494751, "learning_rate": 3.675726927939318e-06, "loss": 0.414, "step": 5870 }, { "epoch": 2.5065073607851502, "grad_norm": 4.037867069244385, "learning_rate": 3.659924146649811e-06, "loss": 0.3732, "step": 5875 }, { "epoch": 2.5086409216983148, "grad_norm": 3.40893292427063, "learning_rate": 3.6441213653603035e-06, "loss": 0.3981, "step": 5880 }, { "epoch": 2.5107744826114784, "grad_norm": 4.29473876953125, "learning_rate": 3.628318584070797e-06, "loss": 0.3326, "step": 5885 }, { "epoch": 2.5129080435246425, "grad_norm": 4.168461799621582, "learning_rate": 3.6125158027812897e-06, "loss": 0.4446, "step": 5890 }, { "epoch": 2.5150416044378066, "grad_norm": 4.106093883514404, "learning_rate": 3.596713021491783e-06, "loss": 0.3966, "step": 5895 }, { "epoch": 2.5171751653509706, "grad_norm": 4.701243877410889, "learning_rate": 3.580910240202276e-06, "loss": 0.3961, "step": 5900 }, { "epoch": 2.5193087262641347, "grad_norm": 3.523848295211792, "learning_rate": 3.5651074589127688e-06, "loss": 0.3975, "step": 5905 }, { "epoch": 2.521442287177299, "grad_norm": 4.162672519683838, "learning_rate": 3.549304677623262e-06, "loss": 0.4075, "step": 5910 }, { "epoch": 2.523575848090463, "grad_norm": 4.189688205718994, "learning_rate": 3.5335018963337554e-06, "loss": 0.3733, "step": 5915 }, { "epoch": 2.525709409003627, "grad_norm": 3.7579400539398193, "learning_rate": 3.517699115044248e-06, "loss": 0.4036, "step": 5920 }, { "epoch": 2.527842969916791, "grad_norm": 4.20217227935791, "learning_rate": 3.501896333754741e-06, "loss": 0.3915, "step": 5925 }, { "epoch": 2.529976530829955, "grad_norm": 4.595426082611084, "learning_rate": 3.486093552465234e-06, "loss": 0.4529, "step": 5930 }, { "epoch": 2.532110091743119, "grad_norm": 3.9579648971557617, "learning_rate": 3.4702907711757273e-06, "loss": 0.3848, "step": 5935 }, { "epoch": 2.5342436526562833, "grad_norm": 4.378324508666992, "learning_rate": 3.4544879898862206e-06, "loss": 0.4371, "step": 5940 }, { "epoch": 2.5363772135694473, "grad_norm": 4.3715996742248535, "learning_rate": 3.438685208596713e-06, "loss": 0.374, "step": 5945 }, { "epoch": 2.5385107744826114, "grad_norm": 4.015042304992676, "learning_rate": 3.4228824273072063e-06, "loss": 0.4176, "step": 5950 }, { "epoch": 2.5406443353957755, "grad_norm": 3.8862552642822266, "learning_rate": 3.407079646017699e-06, "loss": 0.3825, "step": 5955 }, { "epoch": 2.5427778963089396, "grad_norm": 4.243908882141113, "learning_rate": 3.3912768647281925e-06, "loss": 0.3904, "step": 5960 }, { "epoch": 2.5449114572221037, "grad_norm": 3.6207668781280518, "learning_rate": 3.375474083438686e-06, "loss": 0.3464, "step": 5965 }, { "epoch": 2.5470450181352677, "grad_norm": 4.175275802612305, "learning_rate": 3.3596713021491783e-06, "loss": 0.3689, "step": 5970 }, { "epoch": 2.549178579048432, "grad_norm": 5.810354709625244, "learning_rate": 3.3438685208596715e-06, "loss": 0.4211, "step": 5975 }, { "epoch": 2.551312139961596, "grad_norm": 3.521697998046875, "learning_rate": 3.328065739570165e-06, "loss": 0.3731, "step": 5980 }, { "epoch": 2.55344570087476, "grad_norm": 4.137831211090088, "learning_rate": 3.3122629582806577e-06, "loss": 0.3762, "step": 5985 }, { "epoch": 2.555579261787924, "grad_norm": 3.7665231227874756, "learning_rate": 3.296460176991151e-06, "loss": 0.3827, "step": 5990 }, { "epoch": 2.557712822701088, "grad_norm": 3.6081109046936035, "learning_rate": 3.2806573957016435e-06, "loss": 0.3539, "step": 5995 }, { "epoch": 2.559846383614252, "grad_norm": 3.835299491882324, "learning_rate": 3.2648546144121368e-06, "loss": 0.4552, "step": 6000 }, { "epoch": 2.5619799445274163, "grad_norm": 4.092353343963623, "learning_rate": 3.24905183312263e-06, "loss": 0.3902, "step": 6005 }, { "epoch": 2.5641135054405804, "grad_norm": 3.8998003005981445, "learning_rate": 3.233249051833123e-06, "loss": 0.3735, "step": 6010 }, { "epoch": 2.5662470663537444, "grad_norm": 3.8976283073425293, "learning_rate": 3.217446270543616e-06, "loss": 0.4999, "step": 6015 }, { "epoch": 2.5683806272669085, "grad_norm": 3.637864351272583, "learning_rate": 3.2016434892541087e-06, "loss": 0.3277, "step": 6020 }, { "epoch": 2.5705141881800726, "grad_norm": 4.05525016784668, "learning_rate": 3.185840707964602e-06, "loss": 0.3806, "step": 6025 }, { "epoch": 2.5726477490932367, "grad_norm": 3.850229501724243, "learning_rate": 3.1700379266750953e-06, "loss": 0.423, "step": 6030 }, { "epoch": 2.5747813100064008, "grad_norm": 4.023748874664307, "learning_rate": 3.1542351453855877e-06, "loss": 0.4111, "step": 6035 }, { "epoch": 2.576914870919565, "grad_norm": 4.824608325958252, "learning_rate": 3.138432364096081e-06, "loss": 0.3823, "step": 6040 }, { "epoch": 2.579048431832729, "grad_norm": 4.730124473571777, "learning_rate": 3.1226295828065743e-06, "loss": 0.4564, "step": 6045 }, { "epoch": 2.581181992745893, "grad_norm": 3.960134506225586, "learning_rate": 3.106826801517067e-06, "loss": 0.4409, "step": 6050 }, { "epoch": 2.583315553659057, "grad_norm": 4.118432521820068, "learning_rate": 3.0910240202275605e-06, "loss": 0.4298, "step": 6055 }, { "epoch": 2.585449114572221, "grad_norm": 4.568760395050049, "learning_rate": 3.075221238938053e-06, "loss": 0.4348, "step": 6060 }, { "epoch": 2.5875826754853852, "grad_norm": 4.839498996734619, "learning_rate": 3.0594184576485463e-06, "loss": 0.3997, "step": 6065 }, { "epoch": 2.5897162363985493, "grad_norm": 4.513376235961914, "learning_rate": 3.0436156763590396e-06, "loss": 0.4664, "step": 6070 }, { "epoch": 2.5918497973117134, "grad_norm": 4.747183799743652, "learning_rate": 3.0278128950695324e-06, "loss": 0.4231, "step": 6075 }, { "epoch": 2.5939833582248775, "grad_norm": 4.03349494934082, "learning_rate": 3.0120101137800257e-06, "loss": 0.3717, "step": 6080 }, { "epoch": 2.5961169191380415, "grad_norm": 3.8517165184020996, "learning_rate": 2.996207332490518e-06, "loss": 0.4009, "step": 6085 }, { "epoch": 2.5982504800512056, "grad_norm": 4.061285018920898, "learning_rate": 2.9804045512010115e-06, "loss": 0.3993, "step": 6090 }, { "epoch": 2.6003840409643697, "grad_norm": 3.6961746215820312, "learning_rate": 2.9646017699115048e-06, "loss": 0.3631, "step": 6095 }, { "epoch": 2.6025176018775333, "grad_norm": 4.444972515106201, "learning_rate": 2.9487989886219977e-06, "loss": 0.405, "step": 6100 }, { "epoch": 2.604651162790698, "grad_norm": 3.410012722015381, "learning_rate": 2.932996207332491e-06, "loss": 0.3838, "step": 6105 }, { "epoch": 2.6067847237038615, "grad_norm": 4.757806301116943, "learning_rate": 2.9171934260429842e-06, "loss": 0.4622, "step": 6110 }, { "epoch": 2.608918284617026, "grad_norm": 6.638218402862549, "learning_rate": 2.9013906447534767e-06, "loss": 0.5596, "step": 6115 }, { "epoch": 2.6110518455301897, "grad_norm": 3.4803130626678467, "learning_rate": 2.88558786346397e-06, "loss": 0.4108, "step": 6120 }, { "epoch": 2.613185406443354, "grad_norm": 4.698441028594971, "learning_rate": 2.869785082174463e-06, "loss": 0.4462, "step": 6125 }, { "epoch": 2.615318967356518, "grad_norm": 3.7789926528930664, "learning_rate": 2.853982300884956e-06, "loss": 0.3573, "step": 6130 }, { "epoch": 2.6174525282696823, "grad_norm": 4.063235282897949, "learning_rate": 2.838179519595449e-06, "loss": 0.4009, "step": 6135 }, { "epoch": 2.619586089182846, "grad_norm": 3.718618869781494, "learning_rate": 2.822376738305942e-06, "loss": 0.4118, "step": 6140 }, { "epoch": 2.6217196500960105, "grad_norm": 4.963037014007568, "learning_rate": 2.8065739570164352e-06, "loss": 0.3667, "step": 6145 }, { "epoch": 2.623853211009174, "grad_norm": 3.7930514812469482, "learning_rate": 2.790771175726928e-06, "loss": 0.3552, "step": 6150 }, { "epoch": 2.6259867719223386, "grad_norm": 3.7048425674438477, "learning_rate": 2.774968394437421e-06, "loss": 0.4853, "step": 6155 }, { "epoch": 2.6281203328355023, "grad_norm": 4.370087623596191, "learning_rate": 2.7591656131479143e-06, "loss": 0.4286, "step": 6160 }, { "epoch": 2.6302538937486664, "grad_norm": 3.973276376724243, "learning_rate": 2.743362831858407e-06, "loss": 0.4329, "step": 6165 }, { "epoch": 2.6323874546618304, "grad_norm": 4.450080871582031, "learning_rate": 2.7275600505689004e-06, "loss": 0.375, "step": 6170 }, { "epoch": 2.6345210155749945, "grad_norm": 4.750911712646484, "learning_rate": 2.7117572692793937e-06, "loss": 0.4039, "step": 6175 }, { "epoch": 2.6366545764881586, "grad_norm": 3.761721611022949, "learning_rate": 2.695954487989886e-06, "loss": 0.4411, "step": 6180 }, { "epoch": 2.6387881374013227, "grad_norm": 3.7865817546844482, "learning_rate": 2.6801517067003795e-06, "loss": 0.3401, "step": 6185 }, { "epoch": 2.6409216983144868, "grad_norm": 5.113328456878662, "learning_rate": 2.6643489254108724e-06, "loss": 0.4211, "step": 6190 }, { "epoch": 2.643055259227651, "grad_norm": 4.039052486419678, "learning_rate": 2.6485461441213657e-06, "loss": 0.4059, "step": 6195 }, { "epoch": 2.645188820140815, "grad_norm": 3.8695180416107178, "learning_rate": 2.632743362831859e-06, "loss": 0.3441, "step": 6200 }, { "epoch": 2.647322381053979, "grad_norm": 3.8771300315856934, "learning_rate": 2.6169405815423514e-06, "loss": 0.3342, "step": 6205 }, { "epoch": 2.649455941967143, "grad_norm": 3.479433298110962, "learning_rate": 2.6011378002528447e-06, "loss": 0.3825, "step": 6210 }, { "epoch": 2.651589502880307, "grad_norm": 4.334216594696045, "learning_rate": 2.5853350189633376e-06, "loss": 0.4421, "step": 6215 }, { "epoch": 2.6537230637934712, "grad_norm": 3.76131272315979, "learning_rate": 2.569532237673831e-06, "loss": 0.3765, "step": 6220 }, { "epoch": 2.6558566247066353, "grad_norm": 3.95220685005188, "learning_rate": 2.553729456384324e-06, "loss": 0.3601, "step": 6225 }, { "epoch": 2.6579901856197994, "grad_norm": 4.079404354095459, "learning_rate": 2.5379266750948166e-06, "loss": 0.397, "step": 6230 }, { "epoch": 2.6601237465329635, "grad_norm": 4.499898433685303, "learning_rate": 2.52212389380531e-06, "loss": 0.4327, "step": 6235 }, { "epoch": 2.6622573074461275, "grad_norm": 3.1831462383270264, "learning_rate": 2.5063211125158032e-06, "loss": 0.3815, "step": 6240 }, { "epoch": 2.6643908683592916, "grad_norm": 4.471576690673828, "learning_rate": 2.490518331226296e-06, "loss": 0.4483, "step": 6245 }, { "epoch": 2.6665244292724557, "grad_norm": 4.825654983520508, "learning_rate": 2.4747155499367894e-06, "loss": 0.3829, "step": 6250 }, { "epoch": 2.6686579901856198, "grad_norm": 4.086824893951416, "learning_rate": 2.4589127686472823e-06, "loss": 0.4072, "step": 6255 }, { "epoch": 2.670791551098784, "grad_norm": 3.817394495010376, "learning_rate": 2.443109987357775e-06, "loss": 0.3995, "step": 6260 }, { "epoch": 2.672925112011948, "grad_norm": 3.841449022293091, "learning_rate": 2.427307206068268e-06, "loss": 0.3702, "step": 6265 }, { "epoch": 2.675058672925112, "grad_norm": 4.572795867919922, "learning_rate": 2.4115044247787613e-06, "loss": 0.3905, "step": 6270 }, { "epoch": 2.677192233838276, "grad_norm": 4.133564472198486, "learning_rate": 2.395701643489254e-06, "loss": 0.4461, "step": 6275 }, { "epoch": 2.67932579475144, "grad_norm": 4.098212242126465, "learning_rate": 2.3798988621997475e-06, "loss": 0.3363, "step": 6280 }, { "epoch": 2.6814593556646042, "grad_norm": 4.384210586547852, "learning_rate": 2.3640960809102404e-06, "loss": 0.3668, "step": 6285 }, { "epoch": 2.6835929165777683, "grad_norm": 3.9941020011901855, "learning_rate": 2.3482932996207332e-06, "loss": 0.3367, "step": 6290 }, { "epoch": 2.6857264774909324, "grad_norm": 4.1736650466918945, "learning_rate": 2.3324905183312265e-06, "loss": 0.3845, "step": 6295 }, { "epoch": 2.6878600384040965, "grad_norm": 2.97011137008667, "learning_rate": 2.3166877370417194e-06, "loss": 0.3561, "step": 6300 }, { "epoch": 2.6899935993172606, "grad_norm": 4.362311840057373, "learning_rate": 2.3008849557522127e-06, "loss": 0.3932, "step": 6305 }, { "epoch": 2.6921271602304246, "grad_norm": 3.9310810565948486, "learning_rate": 2.2850821744627056e-06, "loss": 0.4074, "step": 6310 }, { "epoch": 2.6942607211435887, "grad_norm": 3.619035005569458, "learning_rate": 2.269279393173199e-06, "loss": 0.3642, "step": 6315 }, { "epoch": 2.696394282056753, "grad_norm": 3.4863860607147217, "learning_rate": 2.2534766118836918e-06, "loss": 0.3365, "step": 6320 }, { "epoch": 2.698527842969917, "grad_norm": 4.252373218536377, "learning_rate": 2.2376738305941846e-06, "loss": 0.3501, "step": 6325 }, { "epoch": 2.700661403883081, "grad_norm": 3.3830983638763428, "learning_rate": 2.221871049304678e-06, "loss": 0.3824, "step": 6330 }, { "epoch": 2.702794964796245, "grad_norm": 4.0998382568359375, "learning_rate": 2.206068268015171e-06, "loss": 0.4223, "step": 6335 }, { "epoch": 2.704928525709409, "grad_norm": 3.9439263343811035, "learning_rate": 2.190265486725664e-06, "loss": 0.3694, "step": 6340 }, { "epoch": 2.707062086622573, "grad_norm": 5.345850944519043, "learning_rate": 2.174462705436157e-06, "loss": 0.4587, "step": 6345 }, { "epoch": 2.7091956475357373, "grad_norm": 3.7166903018951416, "learning_rate": 2.15865992414665e-06, "loss": 0.3643, "step": 6350 }, { "epoch": 2.7113292084489014, "grad_norm": 3.7905008792877197, "learning_rate": 2.1428571428571427e-06, "loss": 0.3443, "step": 6355 }, { "epoch": 2.7134627693620654, "grad_norm": 4.21207857131958, "learning_rate": 2.127054361567636e-06, "loss": 0.3883, "step": 6360 }, { "epoch": 2.7155963302752295, "grad_norm": 3.345935583114624, "learning_rate": 2.1112515802781293e-06, "loss": 0.3858, "step": 6365 }, { "epoch": 2.7177298911883936, "grad_norm": 3.8819267749786377, "learning_rate": 2.095448798988622e-06, "loss": 0.3656, "step": 6370 }, { "epoch": 2.7198634521015577, "grad_norm": 4.006298065185547, "learning_rate": 2.079646017699115e-06, "loss": 0.3921, "step": 6375 }, { "epoch": 2.7219970130147217, "grad_norm": 4.349729061126709, "learning_rate": 2.0638432364096084e-06, "loss": 0.3966, "step": 6380 }, { "epoch": 2.7241305739278854, "grad_norm": 4.517011642456055, "learning_rate": 2.0480404551201012e-06, "loss": 0.4512, "step": 6385 }, { "epoch": 2.72626413484105, "grad_norm": 3.898801565170288, "learning_rate": 2.0322376738305945e-06, "loss": 0.36, "step": 6390 }, { "epoch": 2.7283976957542135, "grad_norm": 3.9057013988494873, "learning_rate": 2.0164348925410874e-06, "loss": 0.4329, "step": 6395 }, { "epoch": 2.730531256667378, "grad_norm": 3.8034722805023193, "learning_rate": 2.0006321112515807e-06, "loss": 0.3722, "step": 6400 }, { "epoch": 2.7326648175805417, "grad_norm": 4.9013895988464355, "learning_rate": 1.9848293299620736e-06, "loss": 0.3646, "step": 6405 }, { "epoch": 2.734798378493706, "grad_norm": 3.664039134979248, "learning_rate": 1.9690265486725665e-06, "loss": 0.393, "step": 6410 }, { "epoch": 2.73693193940687, "grad_norm": 3.9663522243499756, "learning_rate": 1.9532237673830593e-06, "loss": 0.4266, "step": 6415 }, { "epoch": 2.7390655003200344, "grad_norm": 3.832275152206421, "learning_rate": 1.9374209860935526e-06, "loss": 0.363, "step": 6420 }, { "epoch": 2.741199061233198, "grad_norm": 3.2930123805999756, "learning_rate": 1.921618204804046e-06, "loss": 0.3979, "step": 6425 }, { "epoch": 2.7433326221463625, "grad_norm": 4.9362053871154785, "learning_rate": 1.9058154235145388e-06, "loss": 0.379, "step": 6430 }, { "epoch": 2.745466183059526, "grad_norm": 4.307873249053955, "learning_rate": 1.8900126422250317e-06, "loss": 0.3853, "step": 6435 }, { "epoch": 2.7475997439726907, "grad_norm": 4.564358234405518, "learning_rate": 1.8742098609355248e-06, "loss": 0.3632, "step": 6440 }, { "epoch": 2.7497333048858543, "grad_norm": 3.611410617828369, "learning_rate": 1.8584070796460179e-06, "loss": 0.4081, "step": 6445 }, { "epoch": 2.7518668657990184, "grad_norm": 4.470289707183838, "learning_rate": 1.842604298356511e-06, "loss": 0.4033, "step": 6450 }, { "epoch": 2.7540004267121825, "grad_norm": 3.99035906791687, "learning_rate": 1.8268015170670038e-06, "loss": 0.3946, "step": 6455 }, { "epoch": 2.7561339876253466, "grad_norm": 4.113675117492676, "learning_rate": 1.810998735777497e-06, "loss": 0.3884, "step": 6460 }, { "epoch": 2.7582675485385106, "grad_norm": 3.7869863510131836, "learning_rate": 1.7951959544879902e-06, "loss": 0.4043, "step": 6465 }, { "epoch": 2.7604011094516747, "grad_norm": 3.4098997116088867, "learning_rate": 1.779393173198483e-06, "loss": 0.3765, "step": 6470 }, { "epoch": 2.762534670364839, "grad_norm": 4.8651123046875, "learning_rate": 1.7635903919089762e-06, "loss": 0.4379, "step": 6475 }, { "epoch": 2.764668231278003, "grad_norm": 3.7873408794403076, "learning_rate": 1.747787610619469e-06, "loss": 0.3688, "step": 6480 }, { "epoch": 2.766801792191167, "grad_norm": 3.624573230743408, "learning_rate": 1.7319848293299621e-06, "loss": 0.351, "step": 6485 }, { "epoch": 2.768935353104331, "grad_norm": 3.621868848800659, "learning_rate": 1.7161820480404554e-06, "loss": 0.3465, "step": 6490 }, { "epoch": 2.771068914017495, "grad_norm": 4.348837852478027, "learning_rate": 1.7003792667509483e-06, "loss": 0.424, "step": 6495 }, { "epoch": 2.773202474930659, "grad_norm": 4.25526237487793, "learning_rate": 1.6845764854614414e-06, "loss": 0.3487, "step": 6500 }, { "epoch": 2.7753360358438233, "grad_norm": 4.0821757316589355, "learning_rate": 1.6687737041719343e-06, "loss": 0.343, "step": 6505 }, { "epoch": 2.7774695967569873, "grad_norm": 3.5596158504486084, "learning_rate": 1.6529709228824276e-06, "loss": 0.3745, "step": 6510 }, { "epoch": 2.7796031576701514, "grad_norm": 3.369270086288452, "learning_rate": 1.6371681415929204e-06, "loss": 0.3482, "step": 6515 }, { "epoch": 2.7817367185833155, "grad_norm": 5.09242582321167, "learning_rate": 1.6213653603034135e-06, "loss": 0.419, "step": 6520 }, { "epoch": 2.7838702794964796, "grad_norm": 4.409122943878174, "learning_rate": 1.6055625790139064e-06, "loss": 0.4107, "step": 6525 }, { "epoch": 2.7860038404096437, "grad_norm": 3.660127878189087, "learning_rate": 1.5897597977243997e-06, "loss": 0.3103, "step": 6530 }, { "epoch": 2.7881374013228077, "grad_norm": 4.103714942932129, "learning_rate": 1.5739570164348928e-06, "loss": 0.3836, "step": 6535 }, { "epoch": 2.790270962235972, "grad_norm": 4.289230823516846, "learning_rate": 1.5581542351453857e-06, "loss": 0.3736, "step": 6540 }, { "epoch": 2.792404523149136, "grad_norm": 3.898728370666504, "learning_rate": 1.5423514538558787e-06, "loss": 0.3926, "step": 6545 }, { "epoch": 2.7945380840623, "grad_norm": 4.545414924621582, "learning_rate": 1.5265486725663716e-06, "loss": 0.384, "step": 6550 }, { "epoch": 2.796671644975464, "grad_norm": 3.980564832687378, "learning_rate": 1.510745891276865e-06, "loss": 0.3862, "step": 6555 }, { "epoch": 2.798805205888628, "grad_norm": 4.078009605407715, "learning_rate": 1.494943109987358e-06, "loss": 0.41, "step": 6560 }, { "epoch": 2.800938766801792, "grad_norm": 5.77632999420166, "learning_rate": 1.4791403286978509e-06, "loss": 0.4223, "step": 6565 }, { "epoch": 2.8030723277149563, "grad_norm": 3.2947072982788086, "learning_rate": 1.463337547408344e-06, "loss": 0.385, "step": 6570 }, { "epoch": 2.8052058886281204, "grad_norm": 4.24403715133667, "learning_rate": 1.447534766118837e-06, "loss": 0.3875, "step": 6575 }, { "epoch": 2.8073394495412844, "grad_norm": 4.188129425048828, "learning_rate": 1.4317319848293301e-06, "loss": 0.3847, "step": 6580 }, { "epoch": 2.8094730104544485, "grad_norm": 4.100208282470703, "learning_rate": 1.415929203539823e-06, "loss": 0.4115, "step": 6585 }, { "epoch": 2.8116065713676126, "grad_norm": 3.90510630607605, "learning_rate": 1.400126422250316e-06, "loss": 0.3578, "step": 6590 }, { "epoch": 2.8137401322807767, "grad_norm": 4.556890487670898, "learning_rate": 1.3843236409608094e-06, "loss": 0.3868, "step": 6595 }, { "epoch": 2.8158736931939408, "grad_norm": 3.50069260597229, "learning_rate": 1.3685208596713023e-06, "loss": 0.3581, "step": 6600 }, { "epoch": 2.818007254107105, "grad_norm": 4.0225749015808105, "learning_rate": 1.3527180783817954e-06, "loss": 0.3467, "step": 6605 }, { "epoch": 2.820140815020269, "grad_norm": 3.903599500656128, "learning_rate": 1.3369152970922882e-06, "loss": 0.3521, "step": 6610 }, { "epoch": 2.822274375933433, "grad_norm": 4.0492634773254395, "learning_rate": 1.3211125158027813e-06, "loss": 0.3841, "step": 6615 }, { "epoch": 2.824407936846597, "grad_norm": 4.392933368682861, "learning_rate": 1.3053097345132746e-06, "loss": 0.3963, "step": 6620 }, { "epoch": 2.826541497759761, "grad_norm": 4.814109802246094, "learning_rate": 1.2895069532237675e-06, "loss": 0.4293, "step": 6625 }, { "epoch": 2.8286750586729252, "grad_norm": 4.4884724617004395, "learning_rate": 1.2737041719342606e-06, "loss": 0.4127, "step": 6630 }, { "epoch": 2.8308086195860893, "grad_norm": 4.531534671783447, "learning_rate": 1.2579013906447535e-06, "loss": 0.3977, "step": 6635 }, { "epoch": 2.8329421804992534, "grad_norm": 4.81414270401001, "learning_rate": 1.2420986093552465e-06, "loss": 0.3416, "step": 6640 }, { "epoch": 2.8350757414124175, "grad_norm": 4.427682399749756, "learning_rate": 1.2262958280657396e-06, "loss": 0.4633, "step": 6645 }, { "epoch": 2.8372093023255816, "grad_norm": 3.9264962673187256, "learning_rate": 1.2104930467762327e-06, "loss": 0.3987, "step": 6650 }, { "epoch": 2.8393428632387456, "grad_norm": 4.122159957885742, "learning_rate": 1.1946902654867258e-06, "loss": 0.3409, "step": 6655 }, { "epoch": 2.8414764241519093, "grad_norm": 4.230727672576904, "learning_rate": 1.1788874841972189e-06, "loss": 0.4259, "step": 6660 }, { "epoch": 2.843609985065074, "grad_norm": 4.188933849334717, "learning_rate": 1.163084702907712e-06, "loss": 0.3581, "step": 6665 }, { "epoch": 2.8457435459782374, "grad_norm": 3.4019768238067627, "learning_rate": 1.1472819216182048e-06, "loss": 0.439, "step": 6670 }, { "epoch": 2.847877106891402, "grad_norm": 4.435439109802246, "learning_rate": 1.131479140328698e-06, "loss": 0.4366, "step": 6675 }, { "epoch": 2.8500106678045656, "grad_norm": 3.905317783355713, "learning_rate": 1.115676359039191e-06, "loss": 0.3568, "step": 6680 }, { "epoch": 2.85214422871773, "grad_norm": 4.476743698120117, "learning_rate": 1.0998735777496839e-06, "loss": 0.4044, "step": 6685 }, { "epoch": 2.8542777896308937, "grad_norm": 4.263827323913574, "learning_rate": 1.0840707964601772e-06, "loss": 0.3394, "step": 6690 }, { "epoch": 2.8564113505440583, "grad_norm": 4.395534515380859, "learning_rate": 1.06826801517067e-06, "loss": 0.3631, "step": 6695 }, { "epoch": 2.858544911457222, "grad_norm": 3.917830228805542, "learning_rate": 1.0524652338811632e-06, "loss": 0.3771, "step": 6700 }, { "epoch": 2.8606784723703864, "grad_norm": 3.799147367477417, "learning_rate": 1.0366624525916562e-06, "loss": 0.3673, "step": 6705 }, { "epoch": 2.86281203328355, "grad_norm": 4.744193077087402, "learning_rate": 1.0208596713021493e-06, "loss": 0.431, "step": 6710 }, { "epoch": 2.8649455941967146, "grad_norm": 4.385441780090332, "learning_rate": 1.0050568900126422e-06, "loss": 0.3884, "step": 6715 }, { "epoch": 2.867079155109878, "grad_norm": 3.170003890991211, "learning_rate": 9.892541087231355e-07, "loss": 0.3317, "step": 6720 }, { "epoch": 2.8692127160230427, "grad_norm": 4.889204978942871, "learning_rate": 9.734513274336284e-07, "loss": 0.4444, "step": 6725 }, { "epoch": 2.8713462769362064, "grad_norm": 4.576068878173828, "learning_rate": 9.576485461441215e-07, "loss": 0.3629, "step": 6730 }, { "epoch": 2.8734798378493704, "grad_norm": 3.7488820552825928, "learning_rate": 9.418457648546144e-07, "loss": 0.3985, "step": 6735 }, { "epoch": 2.8756133987625345, "grad_norm": 4.805878639221191, "learning_rate": 9.260429835651074e-07, "loss": 0.3658, "step": 6740 }, { "epoch": 2.8777469596756986, "grad_norm": 4.053656101226807, "learning_rate": 9.102402022756006e-07, "loss": 0.4409, "step": 6745 }, { "epoch": 2.8798805205888627, "grad_norm": 3.8377509117126465, "learning_rate": 8.944374209860936e-07, "loss": 0.3299, "step": 6750 }, { "epoch": 2.8820140815020268, "grad_norm": 3.885563611984253, "learning_rate": 8.786346396965867e-07, "loss": 0.4275, "step": 6755 }, { "epoch": 2.884147642415191, "grad_norm": 4.142803192138672, "learning_rate": 8.628318584070797e-07, "loss": 0.398, "step": 6760 }, { "epoch": 2.886281203328355, "grad_norm": 4.158621788024902, "learning_rate": 8.470290771175727e-07, "loss": 0.4017, "step": 6765 }, { "epoch": 2.888414764241519, "grad_norm": 3.649864912033081, "learning_rate": 8.312262958280657e-07, "loss": 0.4163, "step": 6770 }, { "epoch": 2.890548325154683, "grad_norm": 3.5101370811462402, "learning_rate": 8.154235145385589e-07, "loss": 0.4025, "step": 6775 }, { "epoch": 2.892681886067847, "grad_norm": 4.628073215484619, "learning_rate": 7.996207332490519e-07, "loss": 0.4233, "step": 6780 }, { "epoch": 2.8948154469810112, "grad_norm": 4.449802875518799, "learning_rate": 7.83817951959545e-07, "loss": 0.4032, "step": 6785 }, { "epoch": 2.8969490078941753, "grad_norm": 4.176910877227783, "learning_rate": 7.68015170670038e-07, "loss": 0.3724, "step": 6790 }, { "epoch": 2.8990825688073394, "grad_norm": 3.777665138244629, "learning_rate": 7.522123893805311e-07, "loss": 0.3426, "step": 6795 }, { "epoch": 2.9012161297205035, "grad_norm": 4.363525390625, "learning_rate": 7.36409608091024e-07, "loss": 0.4207, "step": 6800 }, { "epoch": 2.9033496906336675, "grad_norm": 3.7728726863861084, "learning_rate": 7.206068268015172e-07, "loss": 0.394, "step": 6805 }, { "epoch": 2.9054832515468316, "grad_norm": 4.496046543121338, "learning_rate": 7.048040455120102e-07, "loss": 0.3566, "step": 6810 }, { "epoch": 2.9076168124599957, "grad_norm": 3.437410354614258, "learning_rate": 6.890012642225032e-07, "loss": 0.3491, "step": 6815 }, { "epoch": 2.90975037337316, "grad_norm": 3.4725382328033447, "learning_rate": 6.731984829329963e-07, "loss": 0.4027, "step": 6820 }, { "epoch": 2.911883934286324, "grad_norm": 3.788536787033081, "learning_rate": 6.573957016434893e-07, "loss": 0.3844, "step": 6825 }, { "epoch": 2.914017495199488, "grad_norm": 3.9548215866088867, "learning_rate": 6.415929203539823e-07, "loss": 0.3469, "step": 6830 }, { "epoch": 2.916151056112652, "grad_norm": 3.581763505935669, "learning_rate": 6.257901390644753e-07, "loss": 0.368, "step": 6835 }, { "epoch": 2.918284617025816, "grad_norm": 3.8996713161468506, "learning_rate": 6.099873577749684e-07, "loss": 0.3673, "step": 6840 }, { "epoch": 2.92041817793898, "grad_norm": 3.3684163093566895, "learning_rate": 5.941845764854615e-07, "loss": 0.3668, "step": 6845 }, { "epoch": 2.9225517388521443, "grad_norm": 3.806769609451294, "learning_rate": 5.783817951959545e-07, "loss": 0.3892, "step": 6850 }, { "epoch": 2.9246852997653083, "grad_norm": 4.019096374511719, "learning_rate": 5.625790139064476e-07, "loss": 0.3819, "step": 6855 }, { "epoch": 2.9268188606784724, "grad_norm": 4.2586164474487305, "learning_rate": 5.467762326169406e-07, "loss": 0.3969, "step": 6860 }, { "epoch": 2.9289524215916365, "grad_norm": 4.982457637786865, "learning_rate": 5.309734513274336e-07, "loss": 0.3556, "step": 6865 }, { "epoch": 2.9310859825048006, "grad_norm": 3.756347179412842, "learning_rate": 5.151706700379267e-07, "loss": 0.3972, "step": 6870 }, { "epoch": 2.9332195434179646, "grad_norm": 4.047727584838867, "learning_rate": 4.993678887484198e-07, "loss": 0.3881, "step": 6875 }, { "epoch": 2.9353531043311287, "grad_norm": 4.2889862060546875, "learning_rate": 4.835651074589128e-07, "loss": 0.3569, "step": 6880 }, { "epoch": 2.937486665244293, "grad_norm": 4.17496395111084, "learning_rate": 4.6776232616940587e-07, "loss": 0.3775, "step": 6885 }, { "epoch": 2.939620226157457, "grad_norm": 4.326033115386963, "learning_rate": 4.519595448798989e-07, "loss": 0.3723, "step": 6890 }, { "epoch": 2.941753787070621, "grad_norm": 3.82330322265625, "learning_rate": 4.36156763590392e-07, "loss": 0.4026, "step": 6895 }, { "epoch": 2.943887347983785, "grad_norm": 3.445920944213867, "learning_rate": 4.20353982300885e-07, "loss": 0.3598, "step": 6900 }, { "epoch": 2.946020908896949, "grad_norm": 4.0058698654174805, "learning_rate": 4.0455120101137806e-07, "loss": 0.396, "step": 6905 }, { "epoch": 2.948154469810113, "grad_norm": 3.8937366008758545, "learning_rate": 3.8874841972187104e-07, "loss": 0.4212, "step": 6910 }, { "epoch": 2.9502880307232773, "grad_norm": 3.048259735107422, "learning_rate": 3.729456384323641e-07, "loss": 0.3016, "step": 6915 }, { "epoch": 2.9524215916364414, "grad_norm": 4.167364597320557, "learning_rate": 3.5714285714285716e-07, "loss": 0.3691, "step": 6920 }, { "epoch": 2.9545551525496054, "grad_norm": 3.733313798904419, "learning_rate": 3.413400758533502e-07, "loss": 0.4057, "step": 6925 }, { "epoch": 2.9566887134627695, "grad_norm": 3.94075608253479, "learning_rate": 3.255372945638433e-07, "loss": 0.3758, "step": 6930 }, { "epoch": 2.9588222743759336, "grad_norm": 4.1353535652160645, "learning_rate": 3.097345132743363e-07, "loss": 0.4074, "step": 6935 }, { "epoch": 2.9609558352890977, "grad_norm": 4.167180061340332, "learning_rate": 2.9393173198482934e-07, "loss": 0.3908, "step": 6940 }, { "epoch": 2.9630893962022613, "grad_norm": 3.7538034915924072, "learning_rate": 2.7812895069532243e-07, "loss": 0.3797, "step": 6945 }, { "epoch": 2.965222957115426, "grad_norm": 3.6969258785247803, "learning_rate": 2.6232616940581546e-07, "loss": 0.3643, "step": 6950 }, { "epoch": 2.9673565180285895, "grad_norm": 3.8094379901885986, "learning_rate": 2.465233881163085e-07, "loss": 0.3865, "step": 6955 }, { "epoch": 2.969490078941754, "grad_norm": 4.114381790161133, "learning_rate": 2.3072060682680153e-07, "loss": 0.3911, "step": 6960 }, { "epoch": 2.9716236398549176, "grad_norm": 3.7223260402679443, "learning_rate": 2.1491782553729456e-07, "loss": 0.3594, "step": 6965 }, { "epoch": 2.973757200768082, "grad_norm": 3.774750232696533, "learning_rate": 1.9911504424778762e-07, "loss": 0.3772, "step": 6970 }, { "epoch": 2.9758907616812458, "grad_norm": 3.5546209812164307, "learning_rate": 1.8331226295828068e-07, "loss": 0.405, "step": 6975 }, { "epoch": 2.9780243225944103, "grad_norm": 4.0937180519104, "learning_rate": 1.6750948166877372e-07, "loss": 0.3537, "step": 6980 }, { "epoch": 2.980157883507574, "grad_norm": 4.477363109588623, "learning_rate": 1.5170670037926675e-07, "loss": 0.3708, "step": 6985 }, { "epoch": 2.9822914444207385, "grad_norm": 4.295274257659912, "learning_rate": 1.359039190897598e-07, "loss": 0.3649, "step": 6990 }, { "epoch": 2.984425005333902, "grad_norm": 4.677340507507324, "learning_rate": 1.2010113780025287e-07, "loss": 0.3878, "step": 6995 }, { "epoch": 2.9865585662470666, "grad_norm": 3.6075425148010254, "learning_rate": 1.0429835651074589e-07, "loss": 0.3452, "step": 7000 }, { "epoch": 2.9886921271602302, "grad_norm": 3.9504892826080322, "learning_rate": 8.849557522123894e-08, "loss": 0.3655, "step": 7005 }, { "epoch": 2.9908256880733948, "grad_norm": 4.34686803817749, "learning_rate": 7.2692793931732e-08, "loss": 0.3878, "step": 7010 }, { "epoch": 2.9929592489865584, "grad_norm": 4.2292046546936035, "learning_rate": 5.689001264222504e-08, "loss": 0.3704, "step": 7015 }, { "epoch": 2.9950928098997225, "grad_norm": 3.9971399307250977, "learning_rate": 4.108723135271808e-08, "loss": 0.3715, "step": 7020 }, { "epoch": 2.9972263708128866, "grad_norm": 4.397487640380859, "learning_rate": 2.528445006321113e-08, "loss": 0.3896, "step": 7025 }, { "epoch": 2.9993599317260506, "grad_norm": 3.856173276901245, "learning_rate": 9.481668773704172e-09, "loss": 0.4088, "step": 7030 }, { "epoch": 3.0, "eval_evaluator": 0.9877204489141523, "eval_loss": 0.1703886240720749, "eval_runtime": 127.056, "eval_samples_per_second": 18.0, "eval_steps_per_second": 2.251, "step": 7032 } ], "logging_steps": 5, "max_steps": 7032, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }