{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9000513434879343, "eval_steps": 877, "global_step": 5259, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017114495978093444, "grad_norm": NaN, "learning_rate": 0.0, "loss": 9.5502, "step": 1 }, { "epoch": 0.0003422899195618689, "grad_norm": NaN, "learning_rate": 0.0, "loss": 17.5546, "step": 2 }, { "epoch": 0.0005134348793428033, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 25.9988, "step": 3 }, { "epoch": 0.0006845798391237378, "grad_norm": 60.18782043457031, "learning_rate": 5.704506560182544e-09, "loss": 9.6042, "step": 4 }, { "epoch": 0.0008557247989046722, "grad_norm": Infinity, "learning_rate": 5.704506560182544e-09, "loss": 17.9169, "step": 5 }, { "epoch": 0.0010268697586856067, "grad_norm": 84.36212158203125, "learning_rate": 1.1409013120365088e-08, "loss": 10.1633, "step": 6 }, { "epoch": 0.001198014718466541, "grad_norm": 82.20382690429688, "learning_rate": 1.711351968054763e-08, "loss": 8.4392, "step": 7 }, { "epoch": 0.0013691596782474755, "grad_norm": 16.687606811523438, "learning_rate": 2.2818026240730176e-08, "loss": 6.4113, "step": 8 }, { "epoch": 0.00154030463802841, "grad_norm": NaN, "learning_rate": 2.2818026240730176e-08, "loss": 17.185, "step": 9 }, { "epoch": 0.0017114495978093444, "grad_norm": 48.527183532714844, "learning_rate": 2.852253280091272e-08, "loss": 7.868, "step": 10 }, { "epoch": 0.0018825945575902789, "grad_norm": 76.19969177246094, "learning_rate": 3.422703936109526e-08, "loss": 25.5097, "step": 11 }, { "epoch": 0.0020537395173712133, "grad_norm": 44.624080657958984, "learning_rate": 3.9931545921277814e-08, "loss": 8.533, "step": 12 }, { "epoch": 0.002224884477152148, "grad_norm": 68.30242156982422, "learning_rate": 4.563605248146035e-08, "loss": 12.1618, "step": 13 }, { "epoch": 0.002396029436933082, "grad_norm": 77.02323913574219, "learning_rate": 5.1340559041642904e-08, "loss": 10.1531, "step": 14 }, { "epoch": 0.002567174396714017, "grad_norm": 37.27943420410156, "learning_rate": 5.704506560182544e-08, "loss": 8.2236, "step": 15 }, { "epoch": 0.002738319356494951, "grad_norm": 47.14276123046875, "learning_rate": 6.274957216200798e-08, "loss": 6.6764, "step": 16 }, { "epoch": 0.0029094643162758858, "grad_norm": 68.288818359375, "learning_rate": 6.845407872219053e-08, "loss": 9.6404, "step": 17 }, { "epoch": 0.00308060927605682, "grad_norm": 72.5563735961914, "learning_rate": 7.415858528237308e-08, "loss": 7.2553, "step": 18 }, { "epoch": 0.0032517542358377546, "grad_norm": 133.2047576904297, "learning_rate": 7.986309184255563e-08, "loss": 17.6761, "step": 19 }, { "epoch": 0.003422899195618689, "grad_norm": 96.22279357910156, "learning_rate": 8.556759840273816e-08, "loss": 23.6993, "step": 20 }, { "epoch": 0.0035940441553996235, "grad_norm": 72.40164947509766, "learning_rate": 9.12721049629207e-08, "loss": 12.5069, "step": 21 }, { "epoch": 0.0037651891151805577, "grad_norm": 58.689125061035156, "learning_rate": 9.697661152310325e-08, "loss": 10.4915, "step": 22 }, { "epoch": 0.003936334074961492, "grad_norm": 41.391170501708984, "learning_rate": 1.0268111808328581e-07, "loss": 8.2323, "step": 23 }, { "epoch": 0.004107479034742427, "grad_norm": 60.9368896484375, "learning_rate": 1.0838562464346835e-07, "loss": 9.4007, "step": 24 }, { "epoch": 0.004278623994523361, "grad_norm": 132.59597778320312, "learning_rate": 1.1409013120365088e-07, "loss": 16.9119, "step": 25 }, { "epoch": 0.004449768954304296, "grad_norm": 72.2205581665039, "learning_rate": 1.1979463776383346e-07, "loss": 12.5137, "step": 26 }, { "epoch": 0.00462091391408523, "grad_norm": 69.2486572265625, "learning_rate": 1.2549914432401596e-07, "loss": 10.2477, "step": 27 }, { "epoch": 0.004792058873866164, "grad_norm": 30.01091194152832, "learning_rate": 1.3120365088419852e-07, "loss": 6.6456, "step": 28 }, { "epoch": 0.0049632038336470995, "grad_norm": 75.28530883789062, "learning_rate": 1.3690815744438105e-07, "loss": 9.7946, "step": 29 }, { "epoch": 0.005134348793428034, "grad_norm": 70.37921142578125, "learning_rate": 1.426126640045636e-07, "loss": 12.4969, "step": 30 }, { "epoch": 0.005305493753208968, "grad_norm": 90.83671569824219, "learning_rate": 1.4831717056474617e-07, "loss": 7.6589, "step": 31 }, { "epoch": 0.005476638712989902, "grad_norm": 65.92588806152344, "learning_rate": 1.540216771249287e-07, "loss": 9.7764, "step": 32 }, { "epoch": 0.005647783672770837, "grad_norm": 86.14967346191406, "learning_rate": 1.5972618368511126e-07, "loss": 8.2129, "step": 33 }, { "epoch": 0.0058189286325517715, "grad_norm": 145.2432098388672, "learning_rate": 1.654306902452938e-07, "loss": 17.7377, "step": 34 }, { "epoch": 0.005990073592332706, "grad_norm": 64.69364929199219, "learning_rate": 1.7113519680547632e-07, "loss": 9.9994, "step": 35 }, { "epoch": 0.00616121855211364, "grad_norm": 58.662803649902344, "learning_rate": 1.7683970336565888e-07, "loss": 11.459, "step": 36 }, { "epoch": 0.006332363511894575, "grad_norm": 123.47699737548828, "learning_rate": 1.825442099258414e-07, "loss": 16.278, "step": 37 }, { "epoch": 0.006503508471675509, "grad_norm": 40.24553680419922, "learning_rate": 1.8824871648602397e-07, "loss": 8.1959, "step": 38 }, { "epoch": 0.0066746534314564435, "grad_norm": 71.55098724365234, "learning_rate": 1.939532230462065e-07, "loss": 12.4429, "step": 39 }, { "epoch": 0.006845798391237378, "grad_norm": 73.94329833984375, "learning_rate": 1.9965772960638906e-07, "loss": 12.4672, "step": 40 }, { "epoch": 0.007016943351018313, "grad_norm": 78.10585021972656, "learning_rate": 2.0536223616657162e-07, "loss": 12.6372, "step": 41 }, { "epoch": 0.007188088310799247, "grad_norm": 154.09982299804688, "learning_rate": 2.1106674272675415e-07, "loss": 18.2102, "step": 42 }, { "epoch": 0.007359233270580181, "grad_norm": 69.93523406982422, "learning_rate": 2.167712492869367e-07, "loss": 10.2355, "step": 43 }, { "epoch": 0.0075303782303611155, "grad_norm": 53.971500396728516, "learning_rate": 2.224757558471192e-07, "loss": 11.0821, "step": 44 }, { "epoch": 0.007701523190142051, "grad_norm": 79.0840835571289, "learning_rate": 2.2818026240730177e-07, "loss": 12.838, "step": 45 }, { "epoch": 0.007872668149922985, "grad_norm": 68.8064956665039, "learning_rate": 2.3388476896748433e-07, "loss": 11.9172, "step": 46 }, { "epoch": 0.00804381310970392, "grad_norm": 114.59835815429688, "learning_rate": 2.395892755276669e-07, "loss": 16.4603, "step": 47 }, { "epoch": 0.008214958069484853, "grad_norm": 76.0896987915039, "learning_rate": 2.452937820878494e-07, "loss": 12.7007, "step": 48 }, { "epoch": 0.008386103029265788, "grad_norm": 45.47982406616211, "learning_rate": 2.509982886480319e-07, "loss": 10.2213, "step": 49 }, { "epoch": 0.008557247989046722, "grad_norm": 136.47836303710938, "learning_rate": 2.567027952082145e-07, "loss": 17.2777, "step": 50 }, { "epoch": 0.008728392948827657, "grad_norm": 57.07033920288086, "learning_rate": 2.6240730176839704e-07, "loss": 10.8704, "step": 51 }, { "epoch": 0.008899537908608592, "grad_norm": 50.97236633300781, "learning_rate": 2.681118083285796e-07, "loss": 10.8266, "step": 52 }, { "epoch": 0.009070682868389525, "grad_norm": 166.14840698242188, "learning_rate": 2.738163148887621e-07, "loss": 17.2345, "step": 53 }, { "epoch": 0.00924182782817046, "grad_norm": 154.5965576171875, "learning_rate": 2.795208214489447e-07, "loss": 18.5922, "step": 54 }, { "epoch": 0.009412972787951395, "grad_norm": 61.19700622558594, "learning_rate": 2.852253280091272e-07, "loss": 9.3514, "step": 55 }, { "epoch": 0.009584117747732329, "grad_norm": 64.54351806640625, "learning_rate": 2.909298345693098e-07, "loss": 7.0867, "step": 56 }, { "epoch": 0.009755262707513264, "grad_norm": 41.97494888305664, "learning_rate": 2.9663434112949233e-07, "loss": 8.319, "step": 57 }, { "epoch": 0.009926407667294199, "grad_norm": 158.86936950683594, "learning_rate": 3.023388476896748e-07, "loss": 18.7347, "step": 58 }, { "epoch": 0.010097552627075132, "grad_norm": 93.42990112304688, "learning_rate": 3.080433542498574e-07, "loss": 24.4121, "step": 59 }, { "epoch": 0.010268697586856067, "grad_norm": 40.078529357910156, "learning_rate": 3.1374786081003993e-07, "loss": 8.4254, "step": 60 }, { "epoch": 0.010439842546637, "grad_norm": 172.25357055664062, "learning_rate": 3.194523673702225e-07, "loss": 17.8112, "step": 61 }, { "epoch": 0.010610987506417936, "grad_norm": 64.05378723144531, "learning_rate": 3.2515687393040504e-07, "loss": 11.648, "step": 62 }, { "epoch": 0.010782132466198871, "grad_norm": 31.915542602539062, "learning_rate": 3.308613804905876e-07, "loss": 6.576, "step": 63 }, { "epoch": 0.010953277425979804, "grad_norm": 97.90230560302734, "learning_rate": 3.365658870507701e-07, "loss": 24.4094, "step": 64 }, { "epoch": 0.01112442238576074, "grad_norm": 49.965126037597656, "learning_rate": 3.4227039361095264e-07, "loss": 10.7296, "step": 65 }, { "epoch": 0.011295567345541675, "grad_norm": 103.74539947509766, "learning_rate": 3.479749001711352e-07, "loss": 24.3446, "step": 66 }, { "epoch": 0.011466712305322608, "grad_norm": 100.20292663574219, "learning_rate": 3.5367940673131776e-07, "loss": 24.4799, "step": 67 }, { "epoch": 0.011637857265103543, "grad_norm": 87.23709869384766, "learning_rate": 3.593839132915003e-07, "loss": 7.8589, "step": 68 }, { "epoch": 0.011809002224884476, "grad_norm": 105.97203063964844, "learning_rate": 3.650884198516828e-07, "loss": 24.5711, "step": 69 }, { "epoch": 0.011980147184665411, "grad_norm": 139.98709106445312, "learning_rate": 3.707929264118654e-07, "loss": 18.0228, "step": 70 }, { "epoch": 0.012151292144446347, "grad_norm": 32.724159240722656, "learning_rate": 3.7649743297204793e-07, "loss": 8.2503, "step": 71 }, { "epoch": 0.01232243710422728, "grad_norm": 55.843509674072266, "learning_rate": 3.822019395322305e-07, "loss": 7.1578, "step": 72 }, { "epoch": 0.012493582064008215, "grad_norm": 183.3024444580078, "learning_rate": 3.87906446092413e-07, "loss": 18.0044, "step": 73 }, { "epoch": 0.01266472702378915, "grad_norm": 118.88136291503906, "learning_rate": 3.9361095265259553e-07, "loss": 15.586, "step": 74 }, { "epoch": 0.012835871983570083, "grad_norm": 64.66754150390625, "learning_rate": 3.993154592127781e-07, "loss": 12.3289, "step": 75 }, { "epoch": 0.013007016943351019, "grad_norm": 62.58869552612305, "learning_rate": 4.0501996577296065e-07, "loss": 8.9949, "step": 76 }, { "epoch": 0.013178161903131952, "grad_norm": 66.82809448242188, "learning_rate": 4.1072447233314323e-07, "loss": 12.1411, "step": 77 }, { "epoch": 0.013349306862912887, "grad_norm": 144.4269256591797, "learning_rate": 4.164289788933257e-07, "loss": 18.5364, "step": 78 }, { "epoch": 0.013520451822693822, "grad_norm": 60.626834869384766, "learning_rate": 4.221334854535083e-07, "loss": 11.9262, "step": 79 }, { "epoch": 0.013691596782474755, "grad_norm": 139.23158264160156, "learning_rate": 4.278379920136908e-07, "loss": 17.5537, "step": 80 }, { "epoch": 0.01386274174225569, "grad_norm": 74.82394409179688, "learning_rate": 4.335424985738734e-07, "loss": 12.6047, "step": 81 }, { "epoch": 0.014033886702036626, "grad_norm": 128.57716369628906, "learning_rate": 4.3924700513405594e-07, "loss": 17.2671, "step": 82 }, { "epoch": 0.014205031661817559, "grad_norm": 50.15757369995117, "learning_rate": 4.449515116942384e-07, "loss": 9.8308, "step": 83 }, { "epoch": 0.014376176621598494, "grad_norm": 46.99615478515625, "learning_rate": 4.50656018254421e-07, "loss": 8.6575, "step": 84 }, { "epoch": 0.01454732158137943, "grad_norm": 61.31080627441406, "learning_rate": 4.5636052481460354e-07, "loss": 6.8446, "step": 85 }, { "epoch": 0.014718466541160363, "grad_norm": 60.068443298339844, "learning_rate": 4.620650313747861e-07, "loss": 11.2968, "step": 86 }, { "epoch": 0.014889611500941298, "grad_norm": 67.9156265258789, "learning_rate": 4.6776953793496865e-07, "loss": 12.672, "step": 87 }, { "epoch": 0.015060756460722231, "grad_norm": 49.77825164794922, "learning_rate": 4.734740444951512e-07, "loss": 9.5329, "step": 88 }, { "epoch": 0.015231901420503166, "grad_norm": 57.99795913696289, "learning_rate": 4.791785510553338e-07, "loss": 11.3883, "step": 89 }, { "epoch": 0.015403046380284101, "grad_norm": 54.070491790771484, "learning_rate": 4.848830576155162e-07, "loss": 11.282, "step": 90 }, { "epoch": 0.015574191340065035, "grad_norm": 46.44948959350586, "learning_rate": 4.905875641756988e-07, "loss": 10.4933, "step": 91 }, { "epoch": 0.01574533629984597, "grad_norm": 91.90750885009766, "learning_rate": 4.962920707358814e-07, "loss": 14.1568, "step": 92 }, { "epoch": 0.015916481259626903, "grad_norm": 45.950286865234375, "learning_rate": 5.019965772960638e-07, "loss": 8.6589, "step": 93 }, { "epoch": 0.01608762621940784, "grad_norm": 172.71951293945312, "learning_rate": 5.077010838562465e-07, "loss": 16.7382, "step": 94 }, { "epoch": 0.016258771179188773, "grad_norm": 54.993247985839844, "learning_rate": 5.13405590416429e-07, "loss": 11.4407, "step": 95 }, { "epoch": 0.016429916138969707, "grad_norm": 110.25296020507812, "learning_rate": 5.191100969766115e-07, "loss": 23.8689, "step": 96 }, { "epoch": 0.016601061098750643, "grad_norm": 43.9764289855957, "learning_rate": 5.248146035367941e-07, "loss": 7.3984, "step": 97 }, { "epoch": 0.016772206058531577, "grad_norm": 72.5784912109375, "learning_rate": 5.305191100969766e-07, "loss": 7.2256, "step": 98 }, { "epoch": 0.01694335101831251, "grad_norm": 35.58343505859375, "learning_rate": 5.362236166571592e-07, "loss": 7.8513, "step": 99 }, { "epoch": 0.017114495978093443, "grad_norm": 69.7222900390625, "learning_rate": 5.419281232173417e-07, "loss": 11.8726, "step": 100 }, { "epoch": 0.01728564093787438, "grad_norm": 54.240943908691406, "learning_rate": 5.476326297775242e-07, "loss": 11.6433, "step": 101 }, { "epoch": 0.017456785897655314, "grad_norm": 40.061763763427734, "learning_rate": 5.533371363377068e-07, "loss": 6.452, "step": 102 }, { "epoch": 0.017627930857436247, "grad_norm": 43.3102912902832, "learning_rate": 5.590416428978894e-07, "loss": 10.9229, "step": 103 }, { "epoch": 0.017799075817217184, "grad_norm": 48.96671676635742, "learning_rate": 5.647461494580719e-07, "loss": 10.9523, "step": 104 }, { "epoch": 0.017970220776998117, "grad_norm": 107.66687774658203, "learning_rate": 5.704506560182544e-07, "loss": 15.756, "step": 105 }, { "epoch": 0.01814136573677905, "grad_norm": 50.87533950805664, "learning_rate": 5.76155162578437e-07, "loss": 9.8941, "step": 106 }, { "epoch": 0.018312510696559987, "grad_norm": 142.70115661621094, "learning_rate": 5.818596691386196e-07, "loss": 16.205, "step": 107 }, { "epoch": 0.01848365565634092, "grad_norm": 62.69704818725586, "learning_rate": 5.87564175698802e-07, "loss": 9.7933, "step": 108 }, { "epoch": 0.018654800616121854, "grad_norm": 52.710227966308594, "learning_rate": 5.932686822589847e-07, "loss": 10.7189, "step": 109 }, { "epoch": 0.01882594557590279, "grad_norm": 131.87474060058594, "learning_rate": 5.989731888191672e-07, "loss": 24.6335, "step": 110 }, { "epoch": 0.018997090535683724, "grad_norm": 105.79902648925781, "learning_rate": 6.046776953793496e-07, "loss": 16.133, "step": 111 }, { "epoch": 0.019168235495464658, "grad_norm": 56.011474609375, "learning_rate": 6.103822019395323e-07, "loss": 11.9402, "step": 112 }, { "epoch": 0.019339380455245594, "grad_norm": 97.4761962890625, "learning_rate": 6.160867084997148e-07, "loss": 14.1356, "step": 113 }, { "epoch": 0.019510525415026528, "grad_norm": 52.7200813293457, "learning_rate": 6.217912150598974e-07, "loss": 11.4511, "step": 114 }, { "epoch": 0.01968167037480746, "grad_norm": 42.6909065246582, "learning_rate": 6.274957216200799e-07, "loss": 8.5194, "step": 115 }, { "epoch": 0.019852815334588398, "grad_norm": 44.77908706665039, "learning_rate": 6.332002281802624e-07, "loss": 11.437, "step": 116 }, { "epoch": 0.02002396029436933, "grad_norm": 136.7108612060547, "learning_rate": 6.38904734740445e-07, "loss": 23.712, "step": 117 }, { "epoch": 0.020195105254150265, "grad_norm": 44.484893798828125, "learning_rate": 6.446092413006275e-07, "loss": 10.4753, "step": 118 }, { "epoch": 0.020366250213931198, "grad_norm": 57.66374206542969, "learning_rate": 6.503137478608101e-07, "loss": 11.2803, "step": 119 }, { "epoch": 0.020537395173712135, "grad_norm": 110.59872436523438, "learning_rate": 6.560182544209926e-07, "loss": 15.7956, "step": 120 }, { "epoch": 0.02070854013349307, "grad_norm": 33.806732177734375, "learning_rate": 6.617227609811752e-07, "loss": 10.327, "step": 121 }, { "epoch": 0.020879685093274, "grad_norm": 52.41442108154297, "learning_rate": 6.674272675413577e-07, "loss": 10.5716, "step": 122 }, { "epoch": 0.02105083005305494, "grad_norm": 40.95213317871094, "learning_rate": 6.731317741015402e-07, "loss": 11.6353, "step": 123 }, { "epoch": 0.021221975012835872, "grad_norm": 43.268775939941406, "learning_rate": 6.788362806617229e-07, "loss": 11.2876, "step": 124 }, { "epoch": 0.021393119972616805, "grad_norm": 102.84829711914062, "learning_rate": 6.845407872219053e-07, "loss": 15.5444, "step": 125 }, { "epoch": 0.021564264932397742, "grad_norm": 56.55605697631836, "learning_rate": 6.902452937820878e-07, "loss": 10.2011, "step": 126 }, { "epoch": 0.021735409892178675, "grad_norm": 37.294795989990234, "learning_rate": 6.959498003422704e-07, "loss": 8.1014, "step": 127 }, { "epoch": 0.02190655485195961, "grad_norm": 55.67061233520508, "learning_rate": 7.01654306902453e-07, "loss": 11.638, "step": 128 }, { "epoch": 0.022077699811740546, "grad_norm": 67.4786605834961, "learning_rate": 7.073588134626355e-07, "loss": 6.8779, "step": 129 }, { "epoch": 0.02224884477152148, "grad_norm": 30.9260196685791, "learning_rate": 7.13063320022818e-07, "loss": 9.4357, "step": 130 }, { "epoch": 0.022419989731302412, "grad_norm": 100.22219848632812, "learning_rate": 7.187678265830006e-07, "loss": 14.6476, "step": 131 }, { "epoch": 0.02259113469108335, "grad_norm": 29.24936294555664, "learning_rate": 7.244723331431832e-07, "loss": 6.5599, "step": 132 }, { "epoch": 0.022762279650864282, "grad_norm": 31.59239959716797, "learning_rate": 7.301768397033656e-07, "loss": 6.6734, "step": 133 }, { "epoch": 0.022933424610645216, "grad_norm": 42.93860626220703, "learning_rate": 7.358813462635483e-07, "loss": 9.125, "step": 134 }, { "epoch": 0.023104569570426153, "grad_norm": 46.8751335144043, "learning_rate": 7.415858528237308e-07, "loss": 10.3878, "step": 135 }, { "epoch": 0.023275714530207086, "grad_norm": 106.5069351196289, "learning_rate": 7.472903593839132e-07, "loss": 14.3693, "step": 136 }, { "epoch": 0.02344685948998802, "grad_norm": 126.05512237548828, "learning_rate": 7.529948659440959e-07, "loss": 22.8488, "step": 137 }, { "epoch": 0.023618004449768953, "grad_norm": 89.99185180664062, "learning_rate": 7.586993725042784e-07, "loss": 13.3307, "step": 138 }, { "epoch": 0.02378914940954989, "grad_norm": 35.95622253417969, "learning_rate": 7.64403879064461e-07, "loss": 9.7673, "step": 139 }, { "epoch": 0.023960294369330823, "grad_norm": 31.504724502563477, "learning_rate": 7.701083856246435e-07, "loss": 9.0915, "step": 140 }, { "epoch": 0.024131439329111756, "grad_norm": 74.6131591796875, "learning_rate": 7.75812892184826e-07, "loss": 12.7378, "step": 141 }, { "epoch": 0.024302584288892693, "grad_norm": 40.63880920410156, "learning_rate": 7.815173987450086e-07, "loss": 8.9323, "step": 142 }, { "epoch": 0.024473729248673626, "grad_norm": 101.69804382324219, "learning_rate": 7.872219053051911e-07, "loss": 14.547, "step": 143 }, { "epoch": 0.02464487420845456, "grad_norm": 131.046142578125, "learning_rate": 7.929264118653737e-07, "loss": 23.0012, "step": 144 }, { "epoch": 0.024816019168235497, "grad_norm": 74.94658660888672, "learning_rate": 7.986309184255562e-07, "loss": 12.8286, "step": 145 }, { "epoch": 0.02498716412801643, "grad_norm": 50.227718353271484, "learning_rate": 8.043354249857388e-07, "loss": 10.9684, "step": 146 }, { "epoch": 0.025158309087797363, "grad_norm": 162.12603759765625, "learning_rate": 8.100399315459213e-07, "loss": 23.0181, "step": 147 }, { "epoch": 0.0253294540475783, "grad_norm": 34.11660385131836, "learning_rate": 8.157444381061038e-07, "loss": 9.2808, "step": 148 }, { "epoch": 0.025500599007359234, "grad_norm": 34.07155990600586, "learning_rate": 8.214489446662865e-07, "loss": 11.3059, "step": 149 }, { "epoch": 0.025671743967140167, "grad_norm": 42.31085968017578, "learning_rate": 8.271534512264689e-07, "loss": 8.7746, "step": 150 }, { "epoch": 0.025842888926921104, "grad_norm": 132.6522216796875, "learning_rate": 8.328579577866514e-07, "loss": 16.6408, "step": 151 }, { "epoch": 0.026014033886702037, "grad_norm": 21.736328125, "learning_rate": 8.385624643468341e-07, "loss": 9.6907, "step": 152 }, { "epoch": 0.02618517884648297, "grad_norm": 64.54568481445312, "learning_rate": 8.442669709070166e-07, "loss": 12.8573, "step": 153 }, { "epoch": 0.026356323806263904, "grad_norm": 32.484703063964844, "learning_rate": 8.499714774671991e-07, "loss": 7.4299, "step": 154 }, { "epoch": 0.02652746876604484, "grad_norm": 86.55378723144531, "learning_rate": 8.556759840273817e-07, "loss": 13.347, "step": 155 }, { "epoch": 0.026698613725825774, "grad_norm": 32.97962188720703, "learning_rate": 8.613804905875642e-07, "loss": 9.2004, "step": 156 }, { "epoch": 0.026869758685606707, "grad_norm": 66.87654113769531, "learning_rate": 8.670849971477468e-07, "loss": 6.6107, "step": 157 }, { "epoch": 0.027040903645387644, "grad_norm": 56.53002166748047, "learning_rate": 8.727895037079292e-07, "loss": 6.0957, "step": 158 }, { "epoch": 0.027212048605168578, "grad_norm": 37.223453521728516, "learning_rate": 8.784940102681119e-07, "loss": 8.9968, "step": 159 }, { "epoch": 0.02738319356494951, "grad_norm": 30.637619018554688, "learning_rate": 8.841985168282944e-07, "loss": 8.9597, "step": 160 }, { "epoch": 0.027554338524730448, "grad_norm": 22.8154354095459, "learning_rate": 8.899030233884768e-07, "loss": 9.2794, "step": 161 }, { "epoch": 0.02772548348451138, "grad_norm": 64.24419403076172, "learning_rate": 8.956075299486595e-07, "loss": 12.9209, "step": 162 }, { "epoch": 0.027896628444292314, "grad_norm": 27.159826278686523, "learning_rate": 9.01312036508842e-07, "loss": 10.7092, "step": 163 }, { "epoch": 0.02806777340407325, "grad_norm": 29.741992950439453, "learning_rate": 9.070165430690246e-07, "loss": 10.1098, "step": 164 }, { "epoch": 0.028238918363854185, "grad_norm": 61.4916877746582, "learning_rate": 9.127210496292071e-07, "loss": 12.5023, "step": 165 }, { "epoch": 0.028410063323635118, "grad_norm": 21.36608123779297, "learning_rate": 9.184255561893896e-07, "loss": 7.2161, "step": 166 }, { "epoch": 0.028581208283416055, "grad_norm": 51.13070297241211, "learning_rate": 9.241300627495722e-07, "loss": 5.5324, "step": 167 }, { "epoch": 0.028752353243196988, "grad_norm": 27.232070922851562, "learning_rate": 9.298345693097547e-07, "loss": 9.3162, "step": 168 }, { "epoch": 0.02892349820297792, "grad_norm": 51.84492111206055, "learning_rate": 9.355390758699373e-07, "loss": 6.0306, "step": 169 }, { "epoch": 0.02909464316275886, "grad_norm": 24.21738052368164, "learning_rate": 9.412435824301197e-07, "loss": 6.6994, "step": 170 }, { "epoch": 0.02926578812253979, "grad_norm": 27.428897857666016, "learning_rate": 9.469480889903024e-07, "loss": 10.5412, "step": 171 }, { "epoch": 0.029436933082320725, "grad_norm": 123.71875762939453, "learning_rate": 9.526525955504849e-07, "loss": 15.9849, "step": 172 }, { "epoch": 0.02960807804210166, "grad_norm": 34.90501403808594, "learning_rate": 9.583571021106676e-07, "loss": 9.2574, "step": 173 }, { "epoch": 0.029779223001882595, "grad_norm": 26.623390197753906, "learning_rate": 9.6406160867085e-07, "loss": 8.7904, "step": 174 }, { "epoch": 0.02995036796166353, "grad_norm": 21.868566513061523, "learning_rate": 9.697661152310325e-07, "loss": 9.2638, "step": 175 }, { "epoch": 0.030121512921444462, "grad_norm": 28.389110565185547, "learning_rate": 9.754706217912152e-07, "loss": 5.9086, "step": 176 }, { "epoch": 0.0302926578812254, "grad_norm": 51.29762649536133, "learning_rate": 9.811751283513976e-07, "loss": 5.9646, "step": 177 }, { "epoch": 0.030463802841006332, "grad_norm": 28.91325569152832, "learning_rate": 9.8687963491158e-07, "loss": 6.0877, "step": 178 }, { "epoch": 0.030634947800787266, "grad_norm": 66.74105834960938, "learning_rate": 9.925841414717628e-07, "loss": 12.4348, "step": 179 }, { "epoch": 0.030806092760568202, "grad_norm": 19.138124465942383, "learning_rate": 9.982886480319452e-07, "loss": 9.5496, "step": 180 }, { "epoch": 0.030977237720349136, "grad_norm": 43.17308044433594, "learning_rate": 1.0039931545921277e-06, "loss": 5.5641, "step": 181 }, { "epoch": 0.03114838268013007, "grad_norm": 32.97599411010742, "learning_rate": 1.0096976611523104e-06, "loss": 9.0529, "step": 182 }, { "epoch": 0.031319527639911006, "grad_norm": 56.315521240234375, "learning_rate": 1.015402167712493e-06, "loss": 12.0747, "step": 183 }, { "epoch": 0.03149067259969194, "grad_norm": 76.77662658691406, "learning_rate": 1.0211066742726755e-06, "loss": 13.0892, "step": 184 }, { "epoch": 0.03166181755947287, "grad_norm": 25.544397354125977, "learning_rate": 1.026811180832858e-06, "loss": 7.7117, "step": 185 }, { "epoch": 0.031832962519253806, "grad_norm": 24.205764770507812, "learning_rate": 1.0325156873930406e-06, "loss": 6.6426, "step": 186 }, { "epoch": 0.03200410747903474, "grad_norm": 25.586280822753906, "learning_rate": 1.038220193953223e-06, "loss": 10.4785, "step": 187 }, { "epoch": 0.03217525243881568, "grad_norm": 68.83911895751953, "learning_rate": 1.0439247005134056e-06, "loss": 12.2132, "step": 188 }, { "epoch": 0.03234639739859661, "grad_norm": 24.825489044189453, "learning_rate": 1.0496292070735881e-06, "loss": 6.3336, "step": 189 }, { "epoch": 0.032517542358377546, "grad_norm": 28.293699264526367, "learning_rate": 1.0553337136337707e-06, "loss": 8.5374, "step": 190 }, { "epoch": 0.03268868731815848, "grad_norm": 28.26664924621582, "learning_rate": 1.0610382201939532e-06, "loss": 9.7218, "step": 191 }, { "epoch": 0.03285983227793941, "grad_norm": 84.32862854003906, "learning_rate": 1.0667427267541357e-06, "loss": 12.782, "step": 192 }, { "epoch": 0.033030977237720346, "grad_norm": 26.818071365356445, "learning_rate": 1.0724472333143185e-06, "loss": 7.3125, "step": 193 }, { "epoch": 0.03320212219750129, "grad_norm": 16.650196075439453, "learning_rate": 1.0781517398745008e-06, "loss": 9.0232, "step": 194 }, { "epoch": 0.03337326715728222, "grad_norm": 22.659135818481445, "learning_rate": 1.0838562464346833e-06, "loss": 6.2787, "step": 195 }, { "epoch": 0.03354441211706315, "grad_norm": 24.644168853759766, "learning_rate": 1.089560752994866e-06, "loss": 6.0047, "step": 196 }, { "epoch": 0.03371555707684409, "grad_norm": 32.078712463378906, "learning_rate": 1.0952652595550484e-06, "loss": 7.5748, "step": 197 }, { "epoch": 0.03388670203662502, "grad_norm": 55.345855712890625, "learning_rate": 1.1009697661152311e-06, "loss": 11.8703, "step": 198 }, { "epoch": 0.034057846996405954, "grad_norm": 70.49486541748047, "learning_rate": 1.1066742726754137e-06, "loss": 11.7983, "step": 199 }, { "epoch": 0.03422899195618689, "grad_norm": 29.946758270263672, "learning_rate": 1.112378779235596e-06, "loss": 7.6286, "step": 200 }, { "epoch": 0.03440013691596783, "grad_norm": 278.3395080566406, "learning_rate": 1.1180832857957787e-06, "loss": 17.6192, "step": 201 }, { "epoch": 0.03457128187574876, "grad_norm": 310.682861328125, "learning_rate": 1.1237877923559613e-06, "loss": 17.6315, "step": 202 }, { "epoch": 0.034742426835529694, "grad_norm": 46.159568786621094, "learning_rate": 1.1294922989161438e-06, "loss": 11.6001, "step": 203 }, { "epoch": 0.03491357179531063, "grad_norm": 20.635892868041992, "learning_rate": 1.1351968054763263e-06, "loss": 9.4128, "step": 204 }, { "epoch": 0.03508471675509156, "grad_norm": 143.4097137451172, "learning_rate": 1.1409013120365089e-06, "loss": 16.3943, "step": 205 }, { "epoch": 0.035255861714872494, "grad_norm": 265.5577087402344, "learning_rate": 1.1466058185966914e-06, "loss": 18.6869, "step": 206 }, { "epoch": 0.035427006674653434, "grad_norm": 19.766063690185547, "learning_rate": 1.152310325156874e-06, "loss": 8.6515, "step": 207 }, { "epoch": 0.03559815163443437, "grad_norm": 43.8801383972168, "learning_rate": 1.1580148317170565e-06, "loss": 11.424, "step": 208 }, { "epoch": 0.0357692965942153, "grad_norm": 12.928386688232422, "learning_rate": 1.1637193382772392e-06, "loss": 5.5902, "step": 209 }, { "epoch": 0.035940441553996234, "grad_norm": 123.55076599121094, "learning_rate": 1.1694238448374215e-06, "loss": 15.6958, "step": 210 }, { "epoch": 0.03611158651377717, "grad_norm": 44.79010772705078, "learning_rate": 1.175128351397604e-06, "loss": 11.1894, "step": 211 }, { "epoch": 0.0362827314735581, "grad_norm": 26.461137771606445, "learning_rate": 1.1808328579577868e-06, "loss": 7.3237, "step": 212 }, { "epoch": 0.03645387643333904, "grad_norm": 24.63947296142578, "learning_rate": 1.1865373645179693e-06, "loss": 5.7252, "step": 213 }, { "epoch": 0.036625021393119975, "grad_norm": 17.151113510131836, "learning_rate": 1.1922418710781517e-06, "loss": 9.0419, "step": 214 }, { "epoch": 0.03679616635290091, "grad_norm": 26.69593620300293, "learning_rate": 1.1979463776383344e-06, "loss": 9.4836, "step": 215 }, { "epoch": 0.03696731131268184, "grad_norm": 50.901573181152344, "learning_rate": 1.203650884198517e-06, "loss": 11.2858, "step": 216 }, { "epoch": 0.037138456272462775, "grad_norm": 48.110328674316406, "learning_rate": 1.2093553907586992e-06, "loss": 11.5594, "step": 217 }, { "epoch": 0.03730960123224371, "grad_norm": 51.77389907836914, "learning_rate": 1.215059897318882e-06, "loss": 11.6974, "step": 218 }, { "epoch": 0.03748074619202464, "grad_norm": 23.52347183227539, "learning_rate": 1.2207644038790645e-06, "loss": 9.5737, "step": 219 }, { "epoch": 0.03765189115180558, "grad_norm": 20.402074813842773, "learning_rate": 1.2264689104392468e-06, "loss": 6.1995, "step": 220 }, { "epoch": 0.037823036111586515, "grad_norm": 18.76962661743164, "learning_rate": 1.2321734169994296e-06, "loss": 7.1013, "step": 221 }, { "epoch": 0.03799418107136745, "grad_norm": 21.817501068115234, "learning_rate": 1.2378779235596121e-06, "loss": 9.3332, "step": 222 }, { "epoch": 0.03816532603114838, "grad_norm": 11.452000617980957, "learning_rate": 1.2435824301197949e-06, "loss": 6.2887, "step": 223 }, { "epoch": 0.038336470990929315, "grad_norm": 22.69776153564453, "learning_rate": 1.2492869366799772e-06, "loss": 7.9947, "step": 224 }, { "epoch": 0.03850761595071025, "grad_norm": 25.39488410949707, "learning_rate": 1.2549914432401597e-06, "loss": 5.1894, "step": 225 }, { "epoch": 0.03867876091049119, "grad_norm": 17.65719223022461, "learning_rate": 1.2606959498003425e-06, "loss": 7.4931, "step": 226 }, { "epoch": 0.03884990587027212, "grad_norm": 23.45711898803711, "learning_rate": 1.2664004563605248e-06, "loss": 9.6157, "step": 227 }, { "epoch": 0.039021050830053056, "grad_norm": 29.114194869995117, "learning_rate": 1.2721049629207073e-06, "loss": 10.4857, "step": 228 }, { "epoch": 0.03919219578983399, "grad_norm": 46.365013122558594, "learning_rate": 1.27780946948089e-06, "loss": 11.9216, "step": 229 }, { "epoch": 0.03936334074961492, "grad_norm": 23.066879272460938, "learning_rate": 1.2835139760410724e-06, "loss": 9.4344, "step": 230 }, { "epoch": 0.039534485709395856, "grad_norm": 15.414644241333008, "learning_rate": 1.289218482601255e-06, "loss": 6.4409, "step": 231 }, { "epoch": 0.039705630669176796, "grad_norm": 16.58795166015625, "learning_rate": 1.2949229891614376e-06, "loss": 7.3307, "step": 232 }, { "epoch": 0.03987677562895773, "grad_norm": 36.44779968261719, "learning_rate": 1.3006274957216202e-06, "loss": 11.1388, "step": 233 }, { "epoch": 0.04004792058873866, "grad_norm": 20.902912139892578, "learning_rate": 1.3063320022818027e-06, "loss": 7.378, "step": 234 }, { "epoch": 0.040219065548519596, "grad_norm": 20.50259017944336, "learning_rate": 1.3120365088419852e-06, "loss": 6.156, "step": 235 }, { "epoch": 0.04039021050830053, "grad_norm": 22.57229995727539, "learning_rate": 1.3177410154021678e-06, "loss": 7.0029, "step": 236 }, { "epoch": 0.04056135546808146, "grad_norm": 25.610868453979492, "learning_rate": 1.3234455219623503e-06, "loss": 8.7721, "step": 237 }, { "epoch": 0.040732500427862396, "grad_norm": 278.795654296875, "learning_rate": 1.3291500285225328e-06, "loss": 15.9633, "step": 238 }, { "epoch": 0.040903645387643336, "grad_norm": 11.644048690795898, "learning_rate": 1.3348545350827154e-06, "loss": 6.3707, "step": 239 }, { "epoch": 0.04107479034742427, "grad_norm": 36.32057189941406, "learning_rate": 1.340559041642898e-06, "loss": 10.7901, "step": 240 }, { "epoch": 0.0412459353072052, "grad_norm": 22.911476135253906, "learning_rate": 1.3462635482030804e-06, "loss": 9.4097, "step": 241 }, { "epoch": 0.04141708026698614, "grad_norm": 24.35552406311035, "learning_rate": 1.351968054763263e-06, "loss": 9.081, "step": 242 }, { "epoch": 0.04158822522676707, "grad_norm": 18.466432571411133, "learning_rate": 1.3576725613234457e-06, "loss": 7.4805, "step": 243 }, { "epoch": 0.041759370186548, "grad_norm": 44.41029357910156, "learning_rate": 1.363377067883628e-06, "loss": 11.2131, "step": 244 }, { "epoch": 0.041930515146328944, "grad_norm": 15.328824043273926, "learning_rate": 1.3690815744438106e-06, "loss": 8.2706, "step": 245 }, { "epoch": 0.04210166010610988, "grad_norm": 274.3642578125, "learning_rate": 1.3747860810039933e-06, "loss": 15.8791, "step": 246 }, { "epoch": 0.04227280506589081, "grad_norm": 18.105318069458008, "learning_rate": 1.3804905875641756e-06, "loss": 8.9079, "step": 247 }, { "epoch": 0.042443950025671744, "grad_norm": 22.90168571472168, "learning_rate": 1.3861950941243584e-06, "loss": 6.6905, "step": 248 }, { "epoch": 0.04261509498545268, "grad_norm": 16.96687126159668, "learning_rate": 1.391899600684541e-06, "loss": 8.5567, "step": 249 }, { "epoch": 0.04278623994523361, "grad_norm": 283.76409912109375, "learning_rate": 1.3976041072447232e-06, "loss": 14.4204, "step": 250 }, { "epoch": 0.04295738490501455, "grad_norm": 22.41378402709961, "learning_rate": 1.403308613804906e-06, "loss": 9.6063, "step": 251 }, { "epoch": 0.043128529864795484, "grad_norm": 23.26137924194336, "learning_rate": 1.4090131203650885e-06, "loss": 9.9569, "step": 252 }, { "epoch": 0.04329967482457642, "grad_norm": 19.40400505065918, "learning_rate": 1.414717626925271e-06, "loss": 6.4322, "step": 253 }, { "epoch": 0.04347081978435735, "grad_norm": 21.541933059692383, "learning_rate": 1.4204221334854536e-06, "loss": 4.5325, "step": 254 }, { "epoch": 0.043641964744138284, "grad_norm": 17.52275276184082, "learning_rate": 1.426126640045636e-06, "loss": 8.3479, "step": 255 }, { "epoch": 0.04381310970391922, "grad_norm": 125.6756591796875, "learning_rate": 1.4318311466058186e-06, "loss": 15.4145, "step": 256 }, { "epoch": 0.04398425466370015, "grad_norm": 18.166152954101562, "learning_rate": 1.4375356531660011e-06, "loss": 4.2531, "step": 257 }, { "epoch": 0.04415539962348109, "grad_norm": 25.4247989654541, "learning_rate": 1.4432401597261837e-06, "loss": 10.4856, "step": 258 }, { "epoch": 0.044326544583262024, "grad_norm": 17.259897232055664, "learning_rate": 1.4489446662863664e-06, "loss": 8.6032, "step": 259 }, { "epoch": 0.04449768954304296, "grad_norm": 23.197059631347656, "learning_rate": 1.4546491728465487e-06, "loss": 7.8062, "step": 260 }, { "epoch": 0.04466883450282389, "grad_norm": 43.4500617980957, "learning_rate": 1.4603536794067313e-06, "loss": 11.1986, "step": 261 }, { "epoch": 0.044839979462604825, "grad_norm": 122.06368255615234, "learning_rate": 1.466058185966914e-06, "loss": 15.5832, "step": 262 }, { "epoch": 0.04501112442238576, "grad_norm": 16.506317138671875, "learning_rate": 1.4717626925270965e-06, "loss": 8.7747, "step": 263 }, { "epoch": 0.0451822693821667, "grad_norm": 19.03982162475586, "learning_rate": 1.4774671990872789e-06, "loss": 7.6134, "step": 264 }, { "epoch": 0.04535341434194763, "grad_norm": 33.20307540893555, "learning_rate": 1.4831717056474616e-06, "loss": 10.3325, "step": 265 }, { "epoch": 0.045524559301728565, "grad_norm": 16.946876525878906, "learning_rate": 1.4888762122076441e-06, "loss": 8.2866, "step": 266 }, { "epoch": 0.0456957042615095, "grad_norm": 25.170318603515625, "learning_rate": 1.4945807187678265e-06, "loss": 10.2958, "step": 267 }, { "epoch": 0.04586684922129043, "grad_norm": 16.860721588134766, "learning_rate": 1.5002852253280092e-06, "loss": 8.5198, "step": 268 }, { "epoch": 0.046037994181071365, "grad_norm": 18.003284454345703, "learning_rate": 1.5059897318881917e-06, "loss": 8.8484, "step": 269 }, { "epoch": 0.046209139140852305, "grad_norm": 17.796016693115234, "learning_rate": 1.511694238448374e-06, "loss": 6.2495, "step": 270 }, { "epoch": 0.04638028410063324, "grad_norm": 23.97182846069336, "learning_rate": 1.5173987450085568e-06, "loss": 7.0879, "step": 271 }, { "epoch": 0.04655142906041417, "grad_norm": 213.1482696533203, "learning_rate": 1.5231032515687393e-06, "loss": 12.8754, "step": 272 }, { "epoch": 0.046722574020195105, "grad_norm": 25.503662109375, "learning_rate": 1.528807758128922e-06, "loss": 7.9125, "step": 273 }, { "epoch": 0.04689371897997604, "grad_norm": 19.832860946655273, "learning_rate": 1.5345122646891044e-06, "loss": 9.0794, "step": 274 }, { "epoch": 0.04706486393975697, "grad_norm": 32.311920166015625, "learning_rate": 1.540216771249287e-06, "loss": 10.648, "step": 275 }, { "epoch": 0.047236008899537905, "grad_norm": 39.916603088378906, "learning_rate": 1.5459212778094697e-06, "loss": 10.9246, "step": 276 }, { "epoch": 0.047407153859318846, "grad_norm": 21.337602615356445, "learning_rate": 1.551625784369652e-06, "loss": 9.2191, "step": 277 }, { "epoch": 0.04757829881909978, "grad_norm": 25.114675521850586, "learning_rate": 1.5573302909298345e-06, "loss": 10.2576, "step": 278 }, { "epoch": 0.04774944377888071, "grad_norm": 14.945568084716797, "learning_rate": 1.5630347974900173e-06, "loss": 8.4857, "step": 279 }, { "epoch": 0.047920588738661646, "grad_norm": 33.542449951171875, "learning_rate": 1.5687393040501996e-06, "loss": 10.9193, "step": 280 }, { "epoch": 0.04809173369844258, "grad_norm": 27.331628799438477, "learning_rate": 1.5744438106103821e-06, "loss": 9.9441, "step": 281 }, { "epoch": 0.04826287865822351, "grad_norm": 17.784677505493164, "learning_rate": 1.5801483171705649e-06, "loss": 6.4105, "step": 282 }, { "epoch": 0.04843402361800445, "grad_norm": 46.38033676147461, "learning_rate": 1.5858528237307474e-06, "loss": 10.5075, "step": 283 }, { "epoch": 0.048605168577785386, "grad_norm": 13.535309791564941, "learning_rate": 1.59155733029093e-06, "loss": 4.4568, "step": 284 }, { "epoch": 0.04877631353756632, "grad_norm": 27.45166015625, "learning_rate": 1.5972618368511125e-06, "loss": 10.2344, "step": 285 }, { "epoch": 0.04894745849734725, "grad_norm": 16.50087547302246, "learning_rate": 1.602966343411295e-06, "loss": 8.5428, "step": 286 }, { "epoch": 0.049118603457128186, "grad_norm": 42.31341552734375, "learning_rate": 1.6086708499714775e-06, "loss": 10.0868, "step": 287 }, { "epoch": 0.04928974841690912, "grad_norm": 17.977153778076172, "learning_rate": 1.61437535653166e-06, "loss": 9.012, "step": 288 }, { "epoch": 0.04946089337669006, "grad_norm": 104.7464828491211, "learning_rate": 1.6200798630918426e-06, "loss": 14.6671, "step": 289 }, { "epoch": 0.04963203833647099, "grad_norm": 17.432056427001953, "learning_rate": 1.6257843696520251e-06, "loss": 6.8872, "step": 290 }, { "epoch": 0.04980318329625193, "grad_norm": 242.7275390625, "learning_rate": 1.6314888762122076e-06, "loss": 11.2526, "step": 291 }, { "epoch": 0.04997432825603286, "grad_norm": 15.779862403869629, "learning_rate": 1.6371933827723902e-06, "loss": 8.7887, "step": 292 }, { "epoch": 0.05014547321581379, "grad_norm": 13.621806144714355, "learning_rate": 1.642897889332573e-06, "loss": 7.0578, "step": 293 }, { "epoch": 0.05031661817559473, "grad_norm": 14.4631986618042, "learning_rate": 1.6486023958927552e-06, "loss": 8.2147, "step": 294 }, { "epoch": 0.05048776313537566, "grad_norm": 18.11038589477539, "learning_rate": 1.6543069024529378e-06, "loss": 6.4308, "step": 295 }, { "epoch": 0.0506589080951566, "grad_norm": 16.797258377075195, "learning_rate": 1.6600114090131205e-06, "loss": 6.3738, "step": 296 }, { "epoch": 0.050830053054937534, "grad_norm": 17.457462310791016, "learning_rate": 1.6657159155733028e-06, "loss": 6.3681, "step": 297 }, { "epoch": 0.05100119801471847, "grad_norm": 14.502140045166016, "learning_rate": 1.6714204221334856e-06, "loss": 7.1297, "step": 298 }, { "epoch": 0.0511723429744994, "grad_norm": 14.4544677734375, "learning_rate": 1.6771249286936681e-06, "loss": 8.5584, "step": 299 }, { "epoch": 0.051343487934280334, "grad_norm": 13.313618659973145, "learning_rate": 1.6828294352538504e-06, "loss": 8.1348, "step": 300 }, { "epoch": 0.05151463289406127, "grad_norm": 91.8434829711914, "learning_rate": 1.6885339418140332e-06, "loss": 13.9421, "step": 301 }, { "epoch": 0.05168577785384221, "grad_norm": 39.31818389892578, "learning_rate": 1.6942384483742157e-06, "loss": 10.3291, "step": 302 }, { "epoch": 0.05185692281362314, "grad_norm": 16.320667266845703, "learning_rate": 1.6999429549343982e-06, "loss": 4.6866, "step": 303 }, { "epoch": 0.052028067773404074, "grad_norm": 13.367071151733398, "learning_rate": 1.7056474614945808e-06, "loss": 8.1535, "step": 304 }, { "epoch": 0.05219921273318501, "grad_norm": 186.96824645996094, "learning_rate": 1.7113519680547633e-06, "loss": 10.6341, "step": 305 }, { "epoch": 0.05237035769296594, "grad_norm": 28.400169372558594, "learning_rate": 1.7170564746149458e-06, "loss": 9.7369, "step": 306 }, { "epoch": 0.052541502652746874, "grad_norm": 15.559652328491211, "learning_rate": 1.7227609811751284e-06, "loss": 7.1427, "step": 307 }, { "epoch": 0.05271264761252781, "grad_norm": 5.730342864990234, "learning_rate": 1.728465487735311e-06, "loss": 5.4861, "step": 308 }, { "epoch": 0.05288379257230875, "grad_norm": 19.06242561340332, "learning_rate": 1.7341699942954936e-06, "loss": 9.0657, "step": 309 }, { "epoch": 0.05305493753208968, "grad_norm": 18.580720901489258, "learning_rate": 1.739874500855676e-06, "loss": 5.9947, "step": 310 }, { "epoch": 0.053226082491870615, "grad_norm": 13.939530372619629, "learning_rate": 1.7455790074158585e-06, "loss": 7.1715, "step": 311 }, { "epoch": 0.05339722745165155, "grad_norm": 12.347646713256836, "learning_rate": 1.7512835139760412e-06, "loss": 4.5087, "step": 312 }, { "epoch": 0.05356837241143248, "grad_norm": 16.251863479614258, "learning_rate": 1.7569880205362238e-06, "loss": 8.7544, "step": 313 }, { "epoch": 0.053739517371213415, "grad_norm": 18.887571334838867, "learning_rate": 1.762692527096406e-06, "loss": 7.1006, "step": 314 }, { "epoch": 0.053910662330994355, "grad_norm": 29.57771873474121, "learning_rate": 1.7683970336565888e-06, "loss": 10.2554, "step": 315 }, { "epoch": 0.05408180729077529, "grad_norm": 215.26080322265625, "learning_rate": 1.7741015402167714e-06, "loss": 10.6589, "step": 316 }, { "epoch": 0.05425295225055622, "grad_norm": 6.18715763092041, "learning_rate": 1.7798060467769537e-06, "loss": 5.3794, "step": 317 }, { "epoch": 0.054424097210337155, "grad_norm": 30.351348876953125, "learning_rate": 1.7855105533371364e-06, "loss": 10.3749, "step": 318 }, { "epoch": 0.05459524217011809, "grad_norm": 16.978347778320312, "learning_rate": 1.791215059897319e-06, "loss": 6.2012, "step": 319 }, { "epoch": 0.05476638712989902, "grad_norm": 19.239072799682617, "learning_rate": 1.7969195664575015e-06, "loss": 9.1925, "step": 320 }, { "epoch": 0.05493753208967996, "grad_norm": 20.378984451293945, "learning_rate": 1.802624073017684e-06, "loss": 8.7484, "step": 321 }, { "epoch": 0.055108677049460895, "grad_norm": 11.863981246948242, "learning_rate": 1.8083285795778666e-06, "loss": 6.308, "step": 322 }, { "epoch": 0.05527982200924183, "grad_norm": 15.815791130065918, "learning_rate": 1.8140330861380493e-06, "loss": 8.9935, "step": 323 }, { "epoch": 0.05545096696902276, "grad_norm": 31.865665435791016, "learning_rate": 1.8197375926982316e-06, "loss": 10.1397, "step": 324 }, { "epoch": 0.055622111928803696, "grad_norm": 160.87301635742188, "learning_rate": 1.8254420992584141e-06, "loss": 9.2965, "step": 325 }, { "epoch": 0.05579325688858463, "grad_norm": 16.763856887817383, "learning_rate": 1.8311466058185969e-06, "loss": 6.6638, "step": 326 }, { "epoch": 0.05596440184836556, "grad_norm": 12.291769981384277, "learning_rate": 1.8368511123787792e-06, "loss": 8.2182, "step": 327 }, { "epoch": 0.0561355468081465, "grad_norm": 20.839473724365234, "learning_rate": 1.8425556189389617e-06, "loss": 5.9446, "step": 328 }, { "epoch": 0.056306691767927436, "grad_norm": 41.371337890625, "learning_rate": 1.8482601254991445e-06, "loss": 10.0738, "step": 329 }, { "epoch": 0.05647783672770837, "grad_norm": 12.416519165039062, "learning_rate": 1.8539646320593268e-06, "loss": 7.9372, "step": 330 }, { "epoch": 0.0566489816874893, "grad_norm": 12.856998443603516, "learning_rate": 1.8596691386195093e-06, "loss": 8.5894, "step": 331 }, { "epoch": 0.056820126647270236, "grad_norm": 28.67165184020996, "learning_rate": 1.865373645179692e-06, "loss": 9.856, "step": 332 }, { "epoch": 0.05699127160705117, "grad_norm": 17.425006866455078, "learning_rate": 1.8710781517398746e-06, "loss": 7.6487, "step": 333 }, { "epoch": 0.05716241656683211, "grad_norm": 29.102951049804688, "learning_rate": 1.8767826583000571e-06, "loss": 9.7985, "step": 334 }, { "epoch": 0.05733356152661304, "grad_norm": 15.120597839355469, "learning_rate": 1.8824871648602395e-06, "loss": 8.856, "step": 335 }, { "epoch": 0.057504706486393976, "grad_norm": 188.02642822265625, "learning_rate": 1.8881916714204222e-06, "loss": 9.3274, "step": 336 }, { "epoch": 0.05767585144617491, "grad_norm": 14.4713134765625, "learning_rate": 1.8938961779806047e-06, "loss": 8.8408, "step": 337 }, { "epoch": 0.05784699640595584, "grad_norm": 27.848546981811523, "learning_rate": 1.8996006845407875e-06, "loss": 10.1598, "step": 338 }, { "epoch": 0.058018141365736776, "grad_norm": 12.024163246154785, "learning_rate": 1.9053051911009698e-06, "loss": 6.2088, "step": 339 }, { "epoch": 0.05818928632551772, "grad_norm": 11.968954086303711, "learning_rate": 1.9110096976611523e-06, "loss": 7.3791, "step": 340 }, { "epoch": 0.05836043128529865, "grad_norm": 27.01519775390625, "learning_rate": 1.9167142042213353e-06, "loss": 10.5111, "step": 341 }, { "epoch": 0.05853157624507958, "grad_norm": 13.136455535888672, "learning_rate": 1.9224187107815174e-06, "loss": 4.512, "step": 342 }, { "epoch": 0.05870272120486052, "grad_norm": 16.26902198791504, "learning_rate": 1.9281232173417e-06, "loss": 6.8285, "step": 343 }, { "epoch": 0.05887386616464145, "grad_norm": 16.47487449645996, "learning_rate": 1.933827723901883e-06, "loss": 8.8793, "step": 344 }, { "epoch": 0.059045011124422384, "grad_norm": 32.750850677490234, "learning_rate": 1.939532230462065e-06, "loss": 10.0536, "step": 345 }, { "epoch": 0.05921615608420332, "grad_norm": 18.996196746826172, "learning_rate": 1.9452367370222475e-06, "loss": 6.1966, "step": 346 }, { "epoch": 0.05938730104398426, "grad_norm": 24.546964645385742, "learning_rate": 1.9509412435824305e-06, "loss": 9.4677, "step": 347 }, { "epoch": 0.05955844600376519, "grad_norm": 84.20301055908203, "learning_rate": 1.9566457501426126e-06, "loss": 14.0007, "step": 348 }, { "epoch": 0.059729590963546124, "grad_norm": 18.845518112182617, "learning_rate": 1.962350256702795e-06, "loss": 6.1715, "step": 349 }, { "epoch": 0.05990073592332706, "grad_norm": 32.177085876464844, "learning_rate": 1.968054763262978e-06, "loss": 10.5934, "step": 350 }, { "epoch": 0.06007188088310799, "grad_norm": 24.051923751831055, "learning_rate": 1.97375926982316e-06, "loss": 9.6501, "step": 351 }, { "epoch": 0.060243025842888924, "grad_norm": 13.522736549377441, "learning_rate": 1.9794637763833427e-06, "loss": 8.8967, "step": 352 }, { "epoch": 0.060414170802669864, "grad_norm": 21.437868118286133, "learning_rate": 1.9851682829435257e-06, "loss": 9.8243, "step": 353 }, { "epoch": 0.0605853157624508, "grad_norm": 30.177589416503906, "learning_rate": 1.9908727895037078e-06, "loss": 9.5401, "step": 354 }, { "epoch": 0.06075646072223173, "grad_norm": 12.939532279968262, "learning_rate": 1.9965772960638903e-06, "loss": 6.4143, "step": 355 }, { "epoch": 0.060927605682012664, "grad_norm": 18.022136688232422, "learning_rate": 2.0022818026240733e-06, "loss": 9.6522, "step": 356 }, { "epoch": 0.0610987506417936, "grad_norm": 12.483067512512207, "learning_rate": 2.0079863091842554e-06, "loss": 9.2254, "step": 357 }, { "epoch": 0.06126989560157453, "grad_norm": 19.432615280151367, "learning_rate": 2.0136908157444383e-06, "loss": 6.2483, "step": 358 }, { "epoch": 0.06144104056135547, "grad_norm": 177.3258819580078, "learning_rate": 2.019395322304621e-06, "loss": 9.2975, "step": 359 }, { "epoch": 0.061612185521136405, "grad_norm": 14.458636283874512, "learning_rate": 2.025099828864803e-06, "loss": 8.5874, "step": 360 }, { "epoch": 0.06178333048091734, "grad_norm": 21.112350463867188, "learning_rate": 2.030804335424986e-06, "loss": 9.4896, "step": 361 }, { "epoch": 0.06195447544069827, "grad_norm": 15.956084251403809, "learning_rate": 2.0365088419851685e-06, "loss": 9.3311, "step": 362 }, { "epoch": 0.062125620400479205, "grad_norm": 11.96216869354248, "learning_rate": 2.042213348545351e-06, "loss": 8.2885, "step": 363 }, { "epoch": 0.06229676536026014, "grad_norm": 16.588687896728516, "learning_rate": 2.0479178551055335e-06, "loss": 8.5745, "step": 364 }, { "epoch": 0.06246791032004107, "grad_norm": 20.95501708984375, "learning_rate": 2.053622361665716e-06, "loss": 9.5327, "step": 365 }, { "epoch": 0.06263905527982201, "grad_norm": 14.255351066589355, "learning_rate": 2.0593268682258986e-06, "loss": 9.1372, "step": 366 }, { "epoch": 0.06281020023960295, "grad_norm": 17.529571533203125, "learning_rate": 2.065031374786081e-06, "loss": 6.9098, "step": 367 }, { "epoch": 0.06298134519938388, "grad_norm": 23.381641387939453, "learning_rate": 2.0707358813462636e-06, "loss": 9.4994, "step": 368 }, { "epoch": 0.06315249015916481, "grad_norm": 152.30535888671875, "learning_rate": 2.076440387906446e-06, "loss": 8.5952, "step": 369 }, { "epoch": 0.06332363511894575, "grad_norm": 15.447931289672852, "learning_rate": 2.0821448944666287e-06, "loss": 7.1287, "step": 370 }, { "epoch": 0.06349478007872668, "grad_norm": 13.553053855895996, "learning_rate": 2.0878494010268112e-06, "loss": 8.0622, "step": 371 }, { "epoch": 0.06366592503850761, "grad_norm": 13.198517799377441, "learning_rate": 2.0935539075869938e-06, "loss": 8.3527, "step": 372 }, { "epoch": 0.06383706999828855, "grad_norm": 21.851369857788086, "learning_rate": 2.0992584141471763e-06, "loss": 6.7771, "step": 373 }, { "epoch": 0.06400821495806948, "grad_norm": 30.56134605407715, "learning_rate": 2.104962920707359e-06, "loss": 9.666, "step": 374 }, { "epoch": 0.06417935991785043, "grad_norm": 18.76494026184082, "learning_rate": 2.1106674272675414e-06, "loss": 9.3941, "step": 375 }, { "epoch": 0.06435050487763136, "grad_norm": 19.92658805847168, "learning_rate": 2.116371933827724e-06, "loss": 9.3741, "step": 376 }, { "epoch": 0.06452164983741229, "grad_norm": 10.430363655090332, "learning_rate": 2.1220764403879064e-06, "loss": 7.8113, "step": 377 }, { "epoch": 0.06469279479719323, "grad_norm": 18.093847274780273, "learning_rate": 2.1277809469480894e-06, "loss": 6.1706, "step": 378 }, { "epoch": 0.06486393975697416, "grad_norm": 21.807714462280273, "learning_rate": 2.1334854535082715e-06, "loss": 9.471, "step": 379 }, { "epoch": 0.06503508471675509, "grad_norm": 10.38511848449707, "learning_rate": 2.139189960068454e-06, "loss": 4.2784, "step": 380 }, { "epoch": 0.06520622967653603, "grad_norm": 18.564613342285156, "learning_rate": 2.144894466628637e-06, "loss": 9.548, "step": 381 }, { "epoch": 0.06537737463631696, "grad_norm": 13.890935897827148, "learning_rate": 2.150598973188819e-06, "loss": 7.9354, "step": 382 }, { "epoch": 0.06554851959609789, "grad_norm": 18.593252182006836, "learning_rate": 2.1563034797490016e-06, "loss": 6.34, "step": 383 }, { "epoch": 0.06571966455587883, "grad_norm": 10.455931663513184, "learning_rate": 2.1620079863091846e-06, "loss": 6.2716, "step": 384 }, { "epoch": 0.06589080951565976, "grad_norm": 21.231943130493164, "learning_rate": 2.1677124928693667e-06, "loss": 5.9761, "step": 385 }, { "epoch": 0.06606195447544069, "grad_norm": 11.568195343017578, "learning_rate": 2.173416999429549e-06, "loss": 4.4776, "step": 386 }, { "epoch": 0.06623309943522163, "grad_norm": 23.829204559326172, "learning_rate": 2.179121505989732e-06, "loss": 9.648, "step": 387 }, { "epoch": 0.06640424439500257, "grad_norm": 10.398987770080566, "learning_rate": 2.1848260125499147e-06, "loss": 4.7062, "step": 388 }, { "epoch": 0.0665753893547835, "grad_norm": 11.396307945251465, "learning_rate": 2.190530519110097e-06, "loss": 8.1087, "step": 389 }, { "epoch": 0.06674653431456444, "grad_norm": 18.780866622924805, "learning_rate": 2.1962350256702798e-06, "loss": 6.196, "step": 390 }, { "epoch": 0.06691767927434537, "grad_norm": 18.36736488342285, "learning_rate": 2.2019395322304623e-06, "loss": 6.2459, "step": 391 }, { "epoch": 0.0670888242341263, "grad_norm": 18.681446075439453, "learning_rate": 2.2076440387906444e-06, "loss": 9.4161, "step": 392 }, { "epoch": 0.06725996919390724, "grad_norm": 15.113629341125488, "learning_rate": 2.2133485453508274e-06, "loss": 6.3517, "step": 393 }, { "epoch": 0.06743111415368817, "grad_norm": 11.273137092590332, "learning_rate": 2.21905305191101e-06, "loss": 8.0886, "step": 394 }, { "epoch": 0.06760225911346911, "grad_norm": 17.580646514892578, "learning_rate": 2.224757558471192e-06, "loss": 6.5059, "step": 395 }, { "epoch": 0.06777340407325004, "grad_norm": 15.864416122436523, "learning_rate": 2.230462065031375e-06, "loss": 8.5624, "step": 396 }, { "epoch": 0.06794454903303097, "grad_norm": 11.407431602478027, "learning_rate": 2.2361665715915575e-06, "loss": 7.853, "step": 397 }, { "epoch": 0.06811569399281191, "grad_norm": 28.192079544067383, "learning_rate": 2.24187107815174e-06, "loss": 9.4467, "step": 398 }, { "epoch": 0.06828683895259284, "grad_norm": 19.4180965423584, "learning_rate": 2.2475755847119225e-06, "loss": 5.6605, "step": 399 }, { "epoch": 0.06845798391237377, "grad_norm": 19.75929069519043, "learning_rate": 2.253280091272105e-06, "loss": 9.4512, "step": 400 }, { "epoch": 0.06862912887215472, "grad_norm": 10.311906814575195, "learning_rate": 2.2589845978322876e-06, "loss": 7.9644, "step": 401 }, { "epoch": 0.06880027383193565, "grad_norm": 20.4741268157959, "learning_rate": 2.26468910439247e-06, "loss": 9.4577, "step": 402 }, { "epoch": 0.06897141879171659, "grad_norm": 25.65606117248535, "learning_rate": 2.2703936109526527e-06, "loss": 9.6371, "step": 403 }, { "epoch": 0.06914256375149752, "grad_norm": 26.26441192626953, "learning_rate": 2.276098117512835e-06, "loss": 9.5365, "step": 404 }, { "epoch": 0.06931370871127845, "grad_norm": 14.249612808227539, "learning_rate": 2.2818026240730177e-06, "loss": 8.5897, "step": 405 }, { "epoch": 0.06948485367105939, "grad_norm": 17.306989669799805, "learning_rate": 2.2875071306332003e-06, "loss": 6.9065, "step": 406 }, { "epoch": 0.06965599863084032, "grad_norm": 10.925597190856934, "learning_rate": 2.293211637193383e-06, "loss": 4.4132, "step": 407 }, { "epoch": 0.06982714359062125, "grad_norm": 20.995426177978516, "learning_rate": 2.2989161437535653e-06, "loss": 9.6018, "step": 408 }, { "epoch": 0.06999828855040219, "grad_norm": 13.343510627746582, "learning_rate": 2.304620650313748e-06, "loss": 8.3354, "step": 409 }, { "epoch": 0.07016943351018312, "grad_norm": 21.461809158325195, "learning_rate": 2.3103251568739304e-06, "loss": 9.3101, "step": 410 }, { "epoch": 0.07034057846996405, "grad_norm": 25.428903579711914, "learning_rate": 2.316029663434113e-06, "loss": 9.4155, "step": 411 }, { "epoch": 0.07051172342974499, "grad_norm": 22.469390869140625, "learning_rate": 2.3217341699942955e-06, "loss": 6.4331, "step": 412 }, { "epoch": 0.07068286838952594, "grad_norm": 157.02752685546875, "learning_rate": 2.3274386765544784e-06, "loss": 7.6313, "step": 413 }, { "epoch": 0.07085401334930687, "grad_norm": 12.20741081237793, "learning_rate": 2.3331431831146605e-06, "loss": 4.2273, "step": 414 }, { "epoch": 0.0710251583090878, "grad_norm": 19.81876564025879, "learning_rate": 2.338847689674843e-06, "loss": 9.5364, "step": 415 }, { "epoch": 0.07119630326886874, "grad_norm": 17.362276077270508, "learning_rate": 2.344552196235026e-06, "loss": 9.4605, "step": 416 }, { "epoch": 0.07136744822864967, "grad_norm": 22.898147583007812, "learning_rate": 2.350256702795208e-06, "loss": 9.5846, "step": 417 }, { "epoch": 0.0715385931884306, "grad_norm": 17.685535430908203, "learning_rate": 2.3559612093553906e-06, "loss": 8.0604, "step": 418 }, { "epoch": 0.07170973814821154, "grad_norm": 16.97225570678711, "learning_rate": 2.3616657159155736e-06, "loss": 9.0822, "step": 419 }, { "epoch": 0.07188088310799247, "grad_norm": 21.690431594848633, "learning_rate": 2.3673702224757557e-06, "loss": 5.9587, "step": 420 }, { "epoch": 0.0720520280677734, "grad_norm": 20.209810256958008, "learning_rate": 2.3730747290359387e-06, "loss": 6.1507, "step": 421 }, { "epoch": 0.07222317302755434, "grad_norm": 19.15233039855957, "learning_rate": 2.378779235596121e-06, "loss": 9.5222, "step": 422 }, { "epoch": 0.07239431798733527, "grad_norm": 15.19393539428711, "learning_rate": 2.3844837421563033e-06, "loss": 8.3063, "step": 423 }, { "epoch": 0.0725654629471162, "grad_norm": 14.138923645019531, "learning_rate": 2.3901882487164863e-06, "loss": 8.3802, "step": 424 }, { "epoch": 0.07273660790689714, "grad_norm": 23.83425521850586, "learning_rate": 2.395892755276669e-06, "loss": 9.554, "step": 425 }, { "epoch": 0.07290775286667808, "grad_norm": 19.778850555419922, "learning_rate": 2.401597261836851e-06, "loss": 6.0866, "step": 426 }, { "epoch": 0.07307889782645902, "grad_norm": 12.418360710144043, "learning_rate": 2.407301768397034e-06, "loss": 7.7723, "step": 427 }, { "epoch": 0.07325004278623995, "grad_norm": 21.105587005615234, "learning_rate": 2.4130062749572164e-06, "loss": 6.011, "step": 428 }, { "epoch": 0.07342118774602088, "grad_norm": 18.78055763244629, "learning_rate": 2.4187107815173985e-06, "loss": 6.4389, "step": 429 }, { "epoch": 0.07359233270580182, "grad_norm": 17.227916717529297, "learning_rate": 2.4244152880775814e-06, "loss": 6.6973, "step": 430 }, { "epoch": 0.07376347766558275, "grad_norm": 21.845876693725586, "learning_rate": 2.430119794637764e-06, "loss": 5.9158, "step": 431 }, { "epoch": 0.07393462262536368, "grad_norm": 14.355096817016602, "learning_rate": 2.435824301197946e-06, "loss": 8.6576, "step": 432 }, { "epoch": 0.07410576758514462, "grad_norm": 149.28054809570312, "learning_rate": 2.441528807758129e-06, "loss": 7.7649, "step": 433 }, { "epoch": 0.07427691254492555, "grad_norm": 18.152389526367188, "learning_rate": 2.4472333143183116e-06, "loss": 6.6434, "step": 434 }, { "epoch": 0.07444805750470648, "grad_norm": 17.05584716796875, "learning_rate": 2.4529378208784937e-06, "loss": 9.1462, "step": 435 }, { "epoch": 0.07461920246448742, "grad_norm": 11.82278060913086, "learning_rate": 2.4586423274386766e-06, "loss": 8.2832, "step": 436 }, { "epoch": 0.07479034742426835, "grad_norm": 17.951648712158203, "learning_rate": 2.464346833998859e-06, "loss": 8.4052, "step": 437 }, { "epoch": 0.07496149238404928, "grad_norm": 31.258188247680664, "learning_rate": 2.4700513405590417e-06, "loss": 9.4477, "step": 438 }, { "epoch": 0.07513263734383023, "grad_norm": 138.91761779785156, "learning_rate": 2.4757558471192242e-06, "loss": 8.3869, "step": 439 }, { "epoch": 0.07530378230361116, "grad_norm": 17.930551528930664, "learning_rate": 2.4814603536794068e-06, "loss": 9.1768, "step": 440 }, { "epoch": 0.0754749272633921, "grad_norm": 10.999883651733398, "learning_rate": 2.4871648602395897e-06, "loss": 4.1341, "step": 441 }, { "epoch": 0.07564607222317303, "grad_norm": 19.707490921020508, "learning_rate": 2.492869366799772e-06, "loss": 6.0241, "step": 442 }, { "epoch": 0.07581721718295396, "grad_norm": 19.63069725036621, "learning_rate": 2.4985738733599544e-06, "loss": 9.5659, "step": 443 }, { "epoch": 0.0759883621427349, "grad_norm": 19.783658981323242, "learning_rate": 2.5042783799201373e-06, "loss": 6.632, "step": 444 }, { "epoch": 0.07615950710251583, "grad_norm": 11.193924903869629, "learning_rate": 2.5099828864803194e-06, "loss": 4.213, "step": 445 }, { "epoch": 0.07633065206229676, "grad_norm": 65.09992218017578, "learning_rate": 2.515687393040502e-06, "loss": 13.1721, "step": 446 }, { "epoch": 0.0765017970220777, "grad_norm": 19.081214904785156, "learning_rate": 2.521391899600685e-06, "loss": 8.7605, "step": 447 }, { "epoch": 0.07667294198185863, "grad_norm": 17.08602523803711, "learning_rate": 2.527096406160867e-06, "loss": 8.4352, "step": 448 }, { "epoch": 0.07684408694163956, "grad_norm": 11.796391487121582, "learning_rate": 2.5328009127210495e-06, "loss": 7.9838, "step": 449 }, { "epoch": 0.0770152319014205, "grad_norm": 17.306316375732422, "learning_rate": 2.5385054192812325e-06, "loss": 7.9123, "step": 450 }, { "epoch": 0.07718637686120144, "grad_norm": 11.991724014282227, "learning_rate": 2.5442099258414146e-06, "loss": 7.904, "step": 451 }, { "epoch": 0.07735752182098238, "grad_norm": 18.394563674926758, "learning_rate": 2.549914432401597e-06, "loss": 6.7541, "step": 452 }, { "epoch": 0.07752866678076331, "grad_norm": 21.436811447143555, "learning_rate": 2.55561893896178e-06, "loss": 5.5488, "step": 453 }, { "epoch": 0.07769981174054424, "grad_norm": 15.822162628173828, "learning_rate": 2.561323445521962e-06, "loss": 7.7392, "step": 454 }, { "epoch": 0.07787095670032518, "grad_norm": 19.68645668029785, "learning_rate": 2.5670279520821447e-06, "loss": 6.6529, "step": 455 }, { "epoch": 0.07804210166010611, "grad_norm": 18.808198928833008, "learning_rate": 2.5727324586423277e-06, "loss": 8.784, "step": 456 }, { "epoch": 0.07821324661988704, "grad_norm": 131.1753692626953, "learning_rate": 2.57843696520251e-06, "loss": 7.8706, "step": 457 }, { "epoch": 0.07838439157966798, "grad_norm": 11.708639144897461, "learning_rate": 2.5841414717626923e-06, "loss": 7.7402, "step": 458 }, { "epoch": 0.07855553653944891, "grad_norm": 15.965631484985352, "learning_rate": 2.5898459783228753e-06, "loss": 8.301, "step": 459 }, { "epoch": 0.07872668149922984, "grad_norm": 14.710309982299805, "learning_rate": 2.5955504848830574e-06, "loss": 7.9566, "step": 460 }, { "epoch": 0.07889782645901078, "grad_norm": 15.00783634185791, "learning_rate": 2.6012549914432404e-06, "loss": 8.488, "step": 461 }, { "epoch": 0.07906897141879171, "grad_norm": 13.231627464294434, "learning_rate": 2.606959498003423e-06, "loss": 8.1184, "step": 462 }, { "epoch": 0.07924011637857264, "grad_norm": 170.4566192626953, "learning_rate": 2.6126640045636054e-06, "loss": 8.1805, "step": 463 }, { "epoch": 0.07941126133835359, "grad_norm": 23.66990089416504, "learning_rate": 2.618368511123788e-06, "loss": 9.2852, "step": 464 }, { "epoch": 0.07958240629813453, "grad_norm": 20.218496322631836, "learning_rate": 2.6240730176839705e-06, "loss": 6.264, "step": 465 }, { "epoch": 0.07975355125791546, "grad_norm": 27.905323028564453, "learning_rate": 2.629777524244153e-06, "loss": 10.0002, "step": 466 }, { "epoch": 0.07992469621769639, "grad_norm": 22.043649673461914, "learning_rate": 2.6354820308043355e-06, "loss": 8.6303, "step": 467 }, { "epoch": 0.08009584117747733, "grad_norm": 20.095890045166016, "learning_rate": 2.641186537364518e-06, "loss": 8.7857, "step": 468 }, { "epoch": 0.08026698613725826, "grad_norm": 30.715435028076172, "learning_rate": 2.6468910439247006e-06, "loss": 9.6486, "step": 469 }, { "epoch": 0.08043813109703919, "grad_norm": 18.83611488342285, "learning_rate": 2.652595550484883e-06, "loss": 7.9544, "step": 470 }, { "epoch": 0.08060927605682013, "grad_norm": 20.929931640625, "learning_rate": 2.6583000570450657e-06, "loss": 6.2772, "step": 471 }, { "epoch": 0.08078042101660106, "grad_norm": 18.414594650268555, "learning_rate": 2.664004563605248e-06, "loss": 6.1477, "step": 472 }, { "epoch": 0.08095156597638199, "grad_norm": 18.188846588134766, "learning_rate": 2.6697090701654307e-06, "loss": 7.099, "step": 473 }, { "epoch": 0.08112271093616293, "grad_norm": 8.666217803955078, "learning_rate": 2.6754135767256133e-06, "loss": 5.0929, "step": 474 }, { "epoch": 0.08129385589594386, "grad_norm": 15.457167625427246, "learning_rate": 2.681118083285796e-06, "loss": 7.8706, "step": 475 }, { "epoch": 0.08146500085572479, "grad_norm": 17.11892318725586, "learning_rate": 2.6868225898459783e-06, "loss": 8.5293, "step": 476 }, { "epoch": 0.08163614581550574, "grad_norm": 28.18759536743164, "learning_rate": 2.692527096406161e-06, "loss": 5.7448, "step": 477 }, { "epoch": 0.08180729077528667, "grad_norm": 19.842830657958984, "learning_rate": 2.6982316029663434e-06, "loss": 8.5854, "step": 478 }, { "epoch": 0.0819784357350676, "grad_norm": 59.76820373535156, "learning_rate": 2.703936109526526e-06, "loss": 12.4879, "step": 479 }, { "epoch": 0.08214958069484854, "grad_norm": 15.530830383300781, "learning_rate": 2.7096406160867085e-06, "loss": 8.191, "step": 480 }, { "epoch": 0.08232072565462947, "grad_norm": 21.211435317993164, "learning_rate": 2.7153451226468914e-06, "loss": 9.4326, "step": 481 }, { "epoch": 0.0824918706144104, "grad_norm": 16.38536834716797, "learning_rate": 2.7210496292070735e-06, "loss": 6.3342, "step": 482 }, { "epoch": 0.08266301557419134, "grad_norm": 30.17742919921875, "learning_rate": 2.726754135767256e-06, "loss": 5.5068, "step": 483 }, { "epoch": 0.08283416053397227, "grad_norm": 27.44713020324707, "learning_rate": 2.732458642327439e-06, "loss": 9.4586, "step": 484 }, { "epoch": 0.0830053054937532, "grad_norm": 59.46120071411133, "learning_rate": 2.738163148887621e-06, "loss": 12.3889, "step": 485 }, { "epoch": 0.08317645045353414, "grad_norm": 26.801589965820312, "learning_rate": 2.7438676554478036e-06, "loss": 5.3141, "step": 486 }, { "epoch": 0.08334759541331507, "grad_norm": 32.20411682128906, "learning_rate": 2.7495721620079866e-06, "loss": 5.4274, "step": 487 }, { "epoch": 0.083518740373096, "grad_norm": 16.14412498474121, "learning_rate": 2.755276668568169e-06, "loss": 8.3447, "step": 488 }, { "epoch": 0.08368988533287694, "grad_norm": 16.79600715637207, "learning_rate": 2.7609811751283512e-06, "loss": 7.7737, "step": 489 }, { "epoch": 0.08386103029265789, "grad_norm": 171.59872436523438, "learning_rate": 2.766685681688534e-06, "loss": 8.277, "step": 490 }, { "epoch": 0.08403217525243882, "grad_norm": 29.80289649963379, "learning_rate": 2.7723901882487167e-06, "loss": 5.273, "step": 491 }, { "epoch": 0.08420332021221975, "grad_norm": 15.38176155090332, "learning_rate": 2.778094694808899e-06, "loss": 7.8611, "step": 492 }, { "epoch": 0.08437446517200069, "grad_norm": 19.766082763671875, "learning_rate": 2.783799201369082e-06, "loss": 7.7926, "step": 493 }, { "epoch": 0.08454561013178162, "grad_norm": 13.274962425231934, "learning_rate": 2.7895037079292643e-06, "loss": 4.1215, "step": 494 }, { "epoch": 0.08471675509156255, "grad_norm": 29.015403747558594, "learning_rate": 2.7952082144894464e-06, "loss": 5.4146, "step": 495 }, { "epoch": 0.08488790005134349, "grad_norm": 22.243703842163086, "learning_rate": 2.8009127210496294e-06, "loss": 5.753, "step": 496 }, { "epoch": 0.08505904501112442, "grad_norm": 23.75475311279297, "learning_rate": 2.806617227609812e-06, "loss": 5.7119, "step": 497 }, { "epoch": 0.08523018997090535, "grad_norm": 19.524032592773438, "learning_rate": 2.812321734169994e-06, "loss": 8.9719, "step": 498 }, { "epoch": 0.08540133493068629, "grad_norm": 22.207155227661133, "learning_rate": 2.818026240730177e-06, "loss": 8.5433, "step": 499 }, { "epoch": 0.08557247989046722, "grad_norm": 20.369564056396484, "learning_rate": 2.8237307472903595e-06, "loss": 9.2212, "step": 500 }, { "epoch": 0.08574362485024815, "grad_norm": 12.617632865905762, "learning_rate": 2.829435253850542e-06, "loss": 7.5878, "step": 501 }, { "epoch": 0.0859147698100291, "grad_norm": 16.92389678955078, "learning_rate": 2.8351397604107246e-06, "loss": 8.1394, "step": 502 }, { "epoch": 0.08608591476981003, "grad_norm": 52.22781753540039, "learning_rate": 2.840844266970907e-06, "loss": 11.9304, "step": 503 }, { "epoch": 0.08625705972959097, "grad_norm": 19.299196243286133, "learning_rate": 2.8465487735310896e-06, "loss": 7.5487, "step": 504 }, { "epoch": 0.0864282046893719, "grad_norm": 25.007366180419922, "learning_rate": 2.852253280091272e-06, "loss": 6.4953, "step": 505 }, { "epoch": 0.08659934964915283, "grad_norm": 44.58477020263672, "learning_rate": 2.8579577866514547e-06, "loss": 11.543, "step": 506 }, { "epoch": 0.08677049460893377, "grad_norm": 18.95302963256836, "learning_rate": 2.8636622932116372e-06, "loss": 7.7713, "step": 507 }, { "epoch": 0.0869416395687147, "grad_norm": 15.56648063659668, "learning_rate": 2.8693667997718198e-06, "loss": 8.5567, "step": 508 }, { "epoch": 0.08711278452849563, "grad_norm": 20.78284454345703, "learning_rate": 2.8750713063320023e-06, "loss": 7.7135, "step": 509 }, { "epoch": 0.08728392948827657, "grad_norm": 23.176607131958008, "learning_rate": 2.880775812892185e-06, "loss": 8.2685, "step": 510 }, { "epoch": 0.0874550744480575, "grad_norm": 25.212718963623047, "learning_rate": 2.8864803194523674e-06, "loss": 8.9983, "step": 511 }, { "epoch": 0.08762621940783843, "grad_norm": 27.220836639404297, "learning_rate": 2.89218482601255e-06, "loss": 6.6334, "step": 512 }, { "epoch": 0.08779736436761937, "grad_norm": 13.128168106079102, "learning_rate": 2.897889332572733e-06, "loss": 3.847, "step": 513 }, { "epoch": 0.0879685093274003, "grad_norm": 19.84160614013672, "learning_rate": 2.903593839132915e-06, "loss": 8.0045, "step": 514 }, { "epoch": 0.08813965428718125, "grad_norm": 15.77076530456543, "learning_rate": 2.9092983456930975e-06, "loss": 7.8019, "step": 515 }, { "epoch": 0.08831079924696218, "grad_norm": 158.41465759277344, "learning_rate": 2.9150028522532804e-06, "loss": 8.6448, "step": 516 }, { "epoch": 0.08848194420674312, "grad_norm": 23.563339233398438, "learning_rate": 2.9207073588134625e-06, "loss": 8.8163, "step": 517 }, { "epoch": 0.08865308916652405, "grad_norm": 30.82549476623535, "learning_rate": 2.926411865373645e-06, "loss": 8.627, "step": 518 }, { "epoch": 0.08882423412630498, "grad_norm": 24.138612747192383, "learning_rate": 2.932116371933828e-06, "loss": 6.2112, "step": 519 }, { "epoch": 0.08899537908608592, "grad_norm": 42.6961784362793, "learning_rate": 2.93782087849401e-06, "loss": 11.5101, "step": 520 }, { "epoch": 0.08916652404586685, "grad_norm": 16.58330726623535, "learning_rate": 2.943525385054193e-06, "loss": 7.885, "step": 521 }, { "epoch": 0.08933766900564778, "grad_norm": 17.490467071533203, "learning_rate": 2.9492298916143756e-06, "loss": 7.6631, "step": 522 }, { "epoch": 0.08950881396542872, "grad_norm": 24.303665161132812, "learning_rate": 2.9549343981745577e-06, "loss": 8.5512, "step": 523 }, { "epoch": 0.08967995892520965, "grad_norm": 14.5447416305542, "learning_rate": 2.9606389047347407e-06, "loss": 3.9844, "step": 524 }, { "epoch": 0.08985110388499058, "grad_norm": 28.421756744384766, "learning_rate": 2.9663434112949232e-06, "loss": 9.2179, "step": 525 }, { "epoch": 0.09002224884477152, "grad_norm": 20.097034454345703, "learning_rate": 2.9720479178551053e-06, "loss": 9.2422, "step": 526 }, { "epoch": 0.09019339380455245, "grad_norm": 20.862869262695312, "learning_rate": 2.9777524244152883e-06, "loss": 8.2303, "step": 527 }, { "epoch": 0.0903645387643334, "grad_norm": 30.980390548706055, "learning_rate": 2.983456930975471e-06, "loss": 9.1253, "step": 528 }, { "epoch": 0.09053568372411433, "grad_norm": 29.973567962646484, "learning_rate": 2.989161437535653e-06, "loss": 4.8928, "step": 529 }, { "epoch": 0.09070682868389526, "grad_norm": 35.399349212646484, "learning_rate": 2.994865944095836e-06, "loss": 4.5559, "step": 530 }, { "epoch": 0.0908779736436762, "grad_norm": 21.178098678588867, "learning_rate": 3.0005704506560184e-06, "loss": 8.8876, "step": 531 }, { "epoch": 0.09104911860345713, "grad_norm": 24.755205154418945, "learning_rate": 3.0062749572162005e-06, "loss": 7.5809, "step": 532 }, { "epoch": 0.09122026356323806, "grad_norm": 23.76934051513672, "learning_rate": 3.0119794637763835e-06, "loss": 8.5706, "step": 533 }, { "epoch": 0.091391408523019, "grad_norm": 40.431190490722656, "learning_rate": 3.017683970336566e-06, "loss": 11.3342, "step": 534 }, { "epoch": 0.09156255348279993, "grad_norm": 22.674354553222656, "learning_rate": 3.023388476896748e-06, "loss": 8.8756, "step": 535 }, { "epoch": 0.09173369844258086, "grad_norm": 33.92606735229492, "learning_rate": 3.029092983456931e-06, "loss": 4.738, "step": 536 }, { "epoch": 0.0919048434023618, "grad_norm": 27.1170711517334, "learning_rate": 3.0347974900171136e-06, "loss": 6.4223, "step": 537 }, { "epoch": 0.09207598836214273, "grad_norm": 25.11066246032715, "learning_rate": 3.040501996577296e-06, "loss": 9.0946, "step": 538 }, { "epoch": 0.09224713332192366, "grad_norm": 23.894901275634766, "learning_rate": 3.0462065031374787e-06, "loss": 6.3395, "step": 539 }, { "epoch": 0.09241827828170461, "grad_norm": 20.199861526489258, "learning_rate": 3.051911009697661e-06, "loss": 6.0524, "step": 540 }, { "epoch": 0.09258942324148554, "grad_norm": 22.757362365722656, "learning_rate": 3.057615516257844e-06, "loss": 7.1293, "step": 541 }, { "epoch": 0.09276056820126648, "grad_norm": 22.62543487548828, "learning_rate": 3.0633200228180263e-06, "loss": 7.5053, "step": 542 }, { "epoch": 0.09293171316104741, "grad_norm": 16.598411560058594, "learning_rate": 3.069024529378209e-06, "loss": 7.8322, "step": 543 }, { "epoch": 0.09310285812082834, "grad_norm": 20.656627655029297, "learning_rate": 3.0747290359383917e-06, "loss": 8.8013, "step": 544 }, { "epoch": 0.09327400308060928, "grad_norm": 20.95423126220703, "learning_rate": 3.080433542498574e-06, "loss": 6.1923, "step": 545 }, { "epoch": 0.09344514804039021, "grad_norm": 175.26722717285156, "learning_rate": 3.0861380490587564e-06, "loss": 10.2252, "step": 546 }, { "epoch": 0.09361629300017114, "grad_norm": 21.737558364868164, "learning_rate": 3.0918425556189393e-06, "loss": 7.7486, "step": 547 }, { "epoch": 0.09378743795995208, "grad_norm": 41.67558288574219, "learning_rate": 3.0975470621791215e-06, "loss": 11.1347, "step": 548 }, { "epoch": 0.09395858291973301, "grad_norm": 24.20724868774414, "learning_rate": 3.103251568739304e-06, "loss": 8.1228, "step": 549 }, { "epoch": 0.09412972787951394, "grad_norm": 23.995750427246094, "learning_rate": 3.108956075299487e-06, "loss": 8.1871, "step": 550 }, { "epoch": 0.09430087283929488, "grad_norm": 18.58646583557129, "learning_rate": 3.114660581859669e-06, "loss": 7.4311, "step": 551 }, { "epoch": 0.09447201779907581, "grad_norm": 26.01420021057129, "learning_rate": 3.1203650884198516e-06, "loss": 9.3426, "step": 552 }, { "epoch": 0.09464316275885676, "grad_norm": 18.335588455200195, "learning_rate": 3.1260695949800345e-06, "loss": 8.9575, "step": 553 }, { "epoch": 0.09481430771863769, "grad_norm": 21.414621353149414, "learning_rate": 3.1317741015402166e-06, "loss": 7.744, "step": 554 }, { "epoch": 0.09498545267841862, "grad_norm": 15.28297233581543, "learning_rate": 3.137478608100399e-06, "loss": 8.0683, "step": 555 }, { "epoch": 0.09515659763819956, "grad_norm": 20.182992935180664, "learning_rate": 3.143183114660582e-06, "loss": 7.5161, "step": 556 }, { "epoch": 0.09532774259798049, "grad_norm": 22.94892120361328, "learning_rate": 3.1488876212207642e-06, "loss": 7.728, "step": 557 }, { "epoch": 0.09549888755776142, "grad_norm": 16.93927764892578, "learning_rate": 3.1545921277809468e-06, "loss": 7.7731, "step": 558 }, { "epoch": 0.09567003251754236, "grad_norm": 21.27535629272461, "learning_rate": 3.1602966343411297e-06, "loss": 8.7629, "step": 559 }, { "epoch": 0.09584117747732329, "grad_norm": 20.056377410888672, "learning_rate": 3.166001140901312e-06, "loss": 7.3943, "step": 560 }, { "epoch": 0.09601232243710422, "grad_norm": 37.84750747680664, "learning_rate": 3.1717056474614948e-06, "loss": 10.5613, "step": 561 }, { "epoch": 0.09618346739688516, "grad_norm": 19.577177047729492, "learning_rate": 3.1774101540216773e-06, "loss": 7.6761, "step": 562 }, { "epoch": 0.09635461235666609, "grad_norm": 22.209712982177734, "learning_rate": 3.18311466058186e-06, "loss": 6.9584, "step": 563 }, { "epoch": 0.09652575731644703, "grad_norm": 25.258302688598633, "learning_rate": 3.1888191671420424e-06, "loss": 8.0565, "step": 564 }, { "epoch": 0.09669690227622796, "grad_norm": 15.993329048156738, "learning_rate": 3.194523673702225e-06, "loss": 4.2519, "step": 565 }, { "epoch": 0.0968680472360089, "grad_norm": 18.609046936035156, "learning_rate": 3.2002281802624074e-06, "loss": 8.0901, "step": 566 }, { "epoch": 0.09703919219578984, "grad_norm": 39.24065017700195, "learning_rate": 3.20593268682259e-06, "loss": 4.2716, "step": 567 }, { "epoch": 0.09721033715557077, "grad_norm": 158.3350067138672, "learning_rate": 3.2116371933827725e-06, "loss": 9.5332, "step": 568 }, { "epoch": 0.0973814821153517, "grad_norm": 59.29450607299805, "learning_rate": 3.217341699942955e-06, "loss": 10.343, "step": 569 }, { "epoch": 0.09755262707513264, "grad_norm": 16.113664627075195, "learning_rate": 3.2230462065031376e-06, "loss": 3.4944, "step": 570 }, { "epoch": 0.09772377203491357, "grad_norm": 23.105350494384766, "learning_rate": 3.22875071306332e-06, "loss": 8.2848, "step": 571 }, { "epoch": 0.0978949169946945, "grad_norm": 21.425796508789062, "learning_rate": 3.2344552196235026e-06, "loss": 8.7655, "step": 572 }, { "epoch": 0.09806606195447544, "grad_norm": 22.587278366088867, "learning_rate": 3.240159726183685e-06, "loss": 5.394, "step": 573 }, { "epoch": 0.09823720691425637, "grad_norm": 37.69017028808594, "learning_rate": 3.2458642327438677e-06, "loss": 9.9902, "step": 574 }, { "epoch": 0.0984083518740373, "grad_norm": 25.255393981933594, "learning_rate": 3.2515687393040502e-06, "loss": 8.3951, "step": 575 }, { "epoch": 0.09857949683381824, "grad_norm": 18.790040969848633, "learning_rate": 3.2572732458642328e-06, "loss": 8.2647, "step": 576 }, { "epoch": 0.09875064179359917, "grad_norm": 18.215757369995117, "learning_rate": 3.2629777524244153e-06, "loss": 7.8027, "step": 577 }, { "epoch": 0.09892178675338012, "grad_norm": 17.263294219970703, "learning_rate": 3.268682258984598e-06, "loss": 7.7728, "step": 578 }, { "epoch": 0.09909293171316105, "grad_norm": 18.384496688842773, "learning_rate": 3.2743867655447804e-06, "loss": 7.9191, "step": 579 }, { "epoch": 0.09926407667294199, "grad_norm": 32.68930435180664, "learning_rate": 3.280091272104963e-06, "loss": 7.8591, "step": 580 }, { "epoch": 0.09943522163272292, "grad_norm": 31.514266967773438, "learning_rate": 3.285795778665146e-06, "loss": 5.5621, "step": 581 }, { "epoch": 0.09960636659250385, "grad_norm": 18.912736892700195, "learning_rate": 3.291500285225328e-06, "loss": 7.6952, "step": 582 }, { "epoch": 0.09977751155228479, "grad_norm": 37.68309783935547, "learning_rate": 3.2972047917855105e-06, "loss": 4.1392, "step": 583 }, { "epoch": 0.09994865651206572, "grad_norm": 31.56082534790039, "learning_rate": 3.3029092983456934e-06, "loss": 4.3801, "step": 584 }, { "epoch": 0.10011980147184665, "grad_norm": 24.57911491394043, "learning_rate": 3.3086138049058755e-06, "loss": 8.6316, "step": 585 }, { "epoch": 0.10029094643162759, "grad_norm": 23.80208396911621, "learning_rate": 3.314318311466058e-06, "loss": 7.9077, "step": 586 }, { "epoch": 0.10046209139140852, "grad_norm": 16.849803924560547, "learning_rate": 3.320022818026241e-06, "loss": 7.6992, "step": 587 }, { "epoch": 0.10063323635118945, "grad_norm": 18.981300354003906, "learning_rate": 3.3257273245864236e-06, "loss": 8.6023, "step": 588 }, { "epoch": 0.10080438131097039, "grad_norm": 24.398378372192383, "learning_rate": 3.3314318311466057e-06, "loss": 7.0319, "step": 589 }, { "epoch": 0.10097552627075132, "grad_norm": 14.639533996582031, "learning_rate": 3.3371363377067886e-06, "loss": 4.7698, "step": 590 }, { "epoch": 0.10114667123053227, "grad_norm": 25.046255111694336, "learning_rate": 3.342840844266971e-06, "loss": 7.1782, "step": 591 }, { "epoch": 0.1013178161903132, "grad_norm": 20.012542724609375, "learning_rate": 3.3485453508271533e-06, "loss": 7.3167, "step": 592 }, { "epoch": 0.10148896115009413, "grad_norm": 27.766891479492188, "learning_rate": 3.3542498573873362e-06, "loss": 4.6521, "step": 593 }, { "epoch": 0.10166010610987507, "grad_norm": 30.79694175720215, "learning_rate": 3.3599543639475188e-06, "loss": 5.1196, "step": 594 }, { "epoch": 0.101831251069656, "grad_norm": 24.9854736328125, "learning_rate": 3.365658870507701e-06, "loss": 7.8056, "step": 595 }, { "epoch": 0.10200239602943693, "grad_norm": 30.63117218017578, "learning_rate": 3.371363377067884e-06, "loss": 4.7903, "step": 596 }, { "epoch": 0.10217354098921787, "grad_norm": 34.852256774902344, "learning_rate": 3.3770678836280663e-06, "loss": 4.1994, "step": 597 }, { "epoch": 0.1023446859489988, "grad_norm": 26.979557037353516, "learning_rate": 3.3827723901882485e-06, "loss": 8.5955, "step": 598 }, { "epoch": 0.10251583090877973, "grad_norm": 21.797626495361328, "learning_rate": 3.3884768967484314e-06, "loss": 7.5743, "step": 599 }, { "epoch": 0.10268697586856067, "grad_norm": 37.774139404296875, "learning_rate": 3.394181403308614e-06, "loss": 6.7935, "step": 600 }, { "epoch": 0.1028581208283416, "grad_norm": 27.917823791503906, "learning_rate": 3.3998859098687965e-06, "loss": 7.9018, "step": 601 }, { "epoch": 0.10302926578812253, "grad_norm": 28.479934692382812, "learning_rate": 3.405590416428979e-06, "loss": 4.566, "step": 602 }, { "epoch": 0.10320041074790347, "grad_norm": 33.35675811767578, "learning_rate": 3.4112949229891615e-06, "loss": 4.1986, "step": 603 }, { "epoch": 0.10337155570768441, "grad_norm": 17.17736053466797, "learning_rate": 3.416999429549344e-06, "loss": 4.4831, "step": 604 }, { "epoch": 0.10354270066746535, "grad_norm": 33.52507781982422, "learning_rate": 3.4227039361095266e-06, "loss": 5.2033, "step": 605 }, { "epoch": 0.10371384562724628, "grad_norm": 38.001678466796875, "learning_rate": 3.428408442669709e-06, "loss": 4.2796, "step": 606 }, { "epoch": 0.10388499058702722, "grad_norm": 27.487375259399414, "learning_rate": 3.4341129492298917e-06, "loss": 4.3677, "step": 607 }, { "epoch": 0.10405613554680815, "grad_norm": 43.33926010131836, "learning_rate": 3.439817455790074e-06, "loss": 8.1044, "step": 608 }, { "epoch": 0.10422728050658908, "grad_norm": 19.231143951416016, "learning_rate": 3.4455219623502567e-06, "loss": 3.8195, "step": 609 }, { "epoch": 0.10439842546637002, "grad_norm": 51.54021453857422, "learning_rate": 3.4512264689104393e-06, "loss": 8.4844, "step": 610 }, { "epoch": 0.10456957042615095, "grad_norm": 18.752532958984375, "learning_rate": 3.456930975470622e-06, "loss": 4.7959, "step": 611 }, { "epoch": 0.10474071538593188, "grad_norm": 31.644916534423828, "learning_rate": 3.4626354820308043e-06, "loss": 3.5838, "step": 612 }, { "epoch": 0.10491186034571282, "grad_norm": 29.887203216552734, "learning_rate": 3.4683399885909873e-06, "loss": 8.1693, "step": 613 }, { "epoch": 0.10508300530549375, "grad_norm": 26.583890914916992, "learning_rate": 3.4740444951511694e-06, "loss": 6.6237, "step": 614 }, { "epoch": 0.10525415026527468, "grad_norm": 30.845338821411133, "learning_rate": 3.479749001711352e-06, "loss": 3.4824, "step": 615 }, { "epoch": 0.10542529522505562, "grad_norm": 38.40910339355469, "learning_rate": 3.485453508271535e-06, "loss": 3.9889, "step": 616 }, { "epoch": 0.10559644018483656, "grad_norm": 39.16193389892578, "learning_rate": 3.491158014831717e-06, "loss": 8.4244, "step": 617 }, { "epoch": 0.1057675851446175, "grad_norm": 17.97920036315918, "learning_rate": 3.4968625213918995e-06, "loss": 3.2418, "step": 618 }, { "epoch": 0.10593873010439843, "grad_norm": 176.26966857910156, "learning_rate": 3.5025670279520825e-06, "loss": 7.2639, "step": 619 }, { "epoch": 0.10610987506417936, "grad_norm": 31.25491714477539, "learning_rate": 3.5082715345122646e-06, "loss": 7.428, "step": 620 }, { "epoch": 0.1062810200239603, "grad_norm": 291.8013916015625, "learning_rate": 3.5139760410724475e-06, "loss": 12.9756, "step": 621 }, { "epoch": 0.10645216498374123, "grad_norm": 34.713497161865234, "learning_rate": 3.51968054763263e-06, "loss": 4.2338, "step": 622 }, { "epoch": 0.10662330994352216, "grad_norm": 30.151296615600586, "learning_rate": 3.525385054192812e-06, "loss": 8.7988, "step": 623 }, { "epoch": 0.1067944549033031, "grad_norm": 36.128414154052734, "learning_rate": 3.531089560752995e-06, "loss": 8.0574, "step": 624 }, { "epoch": 0.10696559986308403, "grad_norm": 15.85501480102539, "learning_rate": 3.5367940673131777e-06, "loss": 3.4627, "step": 625 }, { "epoch": 0.10713674482286496, "grad_norm": 296.1280212402344, "learning_rate": 3.5424985738733598e-06, "loss": 11.8753, "step": 626 }, { "epoch": 0.1073078897826459, "grad_norm": 30.480113983154297, "learning_rate": 3.5482030804335427e-06, "loss": 6.966, "step": 627 }, { "epoch": 0.10747903474242683, "grad_norm": 34.42314529418945, "learning_rate": 3.5539075869937253e-06, "loss": 6.8004, "step": 628 }, { "epoch": 0.10765017970220778, "grad_norm": 239.69007873535156, "learning_rate": 3.5596120935539074e-06, "loss": 10.727, "step": 629 }, { "epoch": 0.10782132466198871, "grad_norm": 42.74559783935547, "learning_rate": 3.5653166001140903e-06, "loss": 7.948, "step": 630 }, { "epoch": 0.10799246962176964, "grad_norm": 24.176240921020508, "learning_rate": 3.571021106674273e-06, "loss": 5.4348, "step": 631 }, { "epoch": 0.10816361458155058, "grad_norm": 32.64130783081055, "learning_rate": 3.576725613234455e-06, "loss": 8.4, "step": 632 }, { "epoch": 0.10833475954133151, "grad_norm": 32.354248046875, "learning_rate": 3.582430119794638e-06, "loss": 6.4397, "step": 633 }, { "epoch": 0.10850590450111244, "grad_norm": 25.767475128173828, "learning_rate": 3.5881346263548204e-06, "loss": 5.7136, "step": 634 }, { "epoch": 0.10867704946089338, "grad_norm": 28.90591812133789, "learning_rate": 3.593839132915003e-06, "loss": 5.9131, "step": 635 }, { "epoch": 0.10884819442067431, "grad_norm": 32.62278747558594, "learning_rate": 3.5995436394751855e-06, "loss": 7.6547, "step": 636 }, { "epoch": 0.10901933938045524, "grad_norm": 30.387760162353516, "learning_rate": 3.605248146035368e-06, "loss": 4.7063, "step": 637 }, { "epoch": 0.10919048434023618, "grad_norm": 33.034420013427734, "learning_rate": 3.6109526525955506e-06, "loss": 6.8851, "step": 638 }, { "epoch": 0.10936162930001711, "grad_norm": 31.42691421508789, "learning_rate": 3.616657159155733e-06, "loss": 4.1348, "step": 639 }, { "epoch": 0.10953277425979804, "grad_norm": 32.439395904541016, "learning_rate": 3.6223616657159156e-06, "loss": 6.3162, "step": 640 }, { "epoch": 0.10970391921957898, "grad_norm": 26.49324607849121, "learning_rate": 3.6280661722760986e-06, "loss": 5.1818, "step": 641 }, { "epoch": 0.10987506417935992, "grad_norm": 25.558427810668945, "learning_rate": 3.6337706788362807e-06, "loss": 6.7631, "step": 642 }, { "epoch": 0.11004620913914086, "grad_norm": 24.655729293823242, "learning_rate": 3.6394751853964632e-06, "loss": 7.4925, "step": 643 }, { "epoch": 0.11021735409892179, "grad_norm": 28.129770278930664, "learning_rate": 3.645179691956646e-06, "loss": 7.4969, "step": 644 }, { "epoch": 0.11038849905870272, "grad_norm": 14.367050170898438, "learning_rate": 3.6508841985168283e-06, "loss": 2.9942, "step": 645 }, { "epoch": 0.11055964401848366, "grad_norm": 17.681976318359375, "learning_rate": 3.656588705077011e-06, "loss": 3.4156, "step": 646 }, { "epoch": 0.11073078897826459, "grad_norm": 16.25703239440918, "learning_rate": 3.6622932116371938e-06, "loss": 4.095, "step": 647 }, { "epoch": 0.11090193393804552, "grad_norm": 26.604623794555664, "learning_rate": 3.667997718197376e-06, "loss": 7.7521, "step": 648 }, { "epoch": 0.11107307889782646, "grad_norm": 24.250492095947266, "learning_rate": 3.6737022247575584e-06, "loss": 5.5501, "step": 649 }, { "epoch": 0.11124422385760739, "grad_norm": 31.94316864013672, "learning_rate": 3.6794067313177414e-06, "loss": 6.24, "step": 650 }, { "epoch": 0.11141536881738832, "grad_norm": 18.14836883544922, "learning_rate": 3.6851112378779235e-06, "loss": 3.024, "step": 651 }, { "epoch": 0.11158651377716926, "grad_norm": 25.239274978637695, "learning_rate": 3.690815744438106e-06, "loss": 7.4699, "step": 652 }, { "epoch": 0.11175765873695019, "grad_norm": 66.97354125976562, "learning_rate": 3.696520250998289e-06, "loss": 11.3565, "step": 653 }, { "epoch": 0.11192880369673112, "grad_norm": 30.029356002807617, "learning_rate": 3.702224757558471e-06, "loss": 7.0383, "step": 654 }, { "epoch": 0.11209994865651207, "grad_norm": 22.021820068359375, "learning_rate": 3.7079292641186536e-06, "loss": 5.1655, "step": 655 }, { "epoch": 0.112271093616293, "grad_norm": 40.79402160644531, "learning_rate": 3.7136337706788366e-06, "loss": 7.3738, "step": 656 }, { "epoch": 0.11244223857607394, "grad_norm": 49.726810455322266, "learning_rate": 3.7193382772390187e-06, "loss": 10.1751, "step": 657 }, { "epoch": 0.11261338353585487, "grad_norm": 34.322078704833984, "learning_rate": 3.725042783799201e-06, "loss": 8.127, "step": 658 }, { "epoch": 0.1127845284956358, "grad_norm": 31.094890594482422, "learning_rate": 3.730747290359384e-06, "loss": 7.2578, "step": 659 }, { "epoch": 0.11295567345541674, "grad_norm": 17.61489486694336, "learning_rate": 3.7364517969195667e-06, "loss": 2.9985, "step": 660 }, { "epoch": 0.11312681841519767, "grad_norm": 31.467206954956055, "learning_rate": 3.7421563034797492e-06, "loss": 3.7049, "step": 661 }, { "epoch": 0.1132979633749786, "grad_norm": 24.94162368774414, "learning_rate": 3.7478608100399318e-06, "loss": 5.2144, "step": 662 }, { "epoch": 0.11346910833475954, "grad_norm": 61.16570281982422, "learning_rate": 3.7535653166001143e-06, "loss": 10.6124, "step": 663 }, { "epoch": 0.11364025329454047, "grad_norm": 30.18357276916504, "learning_rate": 3.7592698231602964e-06, "loss": 7.2241, "step": 664 }, { "epoch": 0.1138113982543214, "grad_norm": 40.68777847290039, "learning_rate": 3.764974329720479e-06, "loss": 7.604, "step": 665 }, { "epoch": 0.11398254321410234, "grad_norm": 24.30128288269043, "learning_rate": 3.7706788362806623e-06, "loss": 4.6842, "step": 666 }, { "epoch": 0.11415368817388329, "grad_norm": 33.77325439453125, "learning_rate": 3.7763833428408444e-06, "loss": 7.3903, "step": 667 }, { "epoch": 0.11432483313366422, "grad_norm": 30.10031509399414, "learning_rate": 3.782087849401027e-06, "loss": 7.0533, "step": 668 }, { "epoch": 0.11449597809344515, "grad_norm": 34.8586540222168, "learning_rate": 3.7877923559612095e-06, "loss": 7.5631, "step": 669 }, { "epoch": 0.11466712305322609, "grad_norm": 33.20988082885742, "learning_rate": 3.7934968625213916e-06, "loss": 3.7029, "step": 670 }, { "epoch": 0.11483826801300702, "grad_norm": 31.075176239013672, "learning_rate": 3.799201369081575e-06, "loss": 8.2182, "step": 671 }, { "epoch": 0.11500941297278795, "grad_norm": 30.962139129638672, "learning_rate": 3.8049058756417575e-06, "loss": 6.5288, "step": 672 }, { "epoch": 0.11518055793256889, "grad_norm": 37.01807403564453, "learning_rate": 3.8106103822019396e-06, "loss": 7.8449, "step": 673 }, { "epoch": 0.11535170289234982, "grad_norm": 35.002742767333984, "learning_rate": 3.816314888762122e-06, "loss": 6.4509, "step": 674 }, { "epoch": 0.11552284785213075, "grad_norm": 51.06761169433594, "learning_rate": 3.822019395322305e-06, "loss": 10.6236, "step": 675 }, { "epoch": 0.11569399281191169, "grad_norm": 37.48448181152344, "learning_rate": 3.827723901882487e-06, "loss": 6.7785, "step": 676 }, { "epoch": 0.11586513777169262, "grad_norm": 35.638832092285156, "learning_rate": 3.8334284084426706e-06, "loss": 7.6172, "step": 677 }, { "epoch": 0.11603628273147355, "grad_norm": 35.00564956665039, "learning_rate": 3.839132915002852e-06, "loss": 7.1866, "step": 678 }, { "epoch": 0.11620742769125449, "grad_norm": 31.42662811279297, "learning_rate": 3.844837421563035e-06, "loss": 3.2075, "step": 679 }, { "epoch": 0.11637857265103543, "grad_norm": 16.111412048339844, "learning_rate": 3.850541928123217e-06, "loss": 4.0132, "step": 680 }, { "epoch": 0.11654971761081637, "grad_norm": 29.7305850982666, "learning_rate": 3.8562464346834e-06, "loss": 7.1253, "step": 681 }, { "epoch": 0.1167208625705973, "grad_norm": 28.033987045288086, "learning_rate": 3.861950941243582e-06, "loss": 3.1947, "step": 682 }, { "epoch": 0.11689200753037823, "grad_norm": 31.460405349731445, "learning_rate": 3.867655447803766e-06, "loss": 7.1427, "step": 683 }, { "epoch": 0.11706315249015917, "grad_norm": 269.6858825683594, "learning_rate": 3.8733599543639474e-06, "loss": 11.9947, "step": 684 }, { "epoch": 0.1172342974499401, "grad_norm": 35.384727478027344, "learning_rate": 3.87906446092413e-06, "loss": 6.8334, "step": 685 }, { "epoch": 0.11740544240972103, "grad_norm": 25.98334312438965, "learning_rate": 3.8847689674843125e-06, "loss": 7.2384, "step": 686 }, { "epoch": 0.11757658736950197, "grad_norm": 33.84842300415039, "learning_rate": 3.890473474044495e-06, "loss": 5.9906, "step": 687 }, { "epoch": 0.1177477323292829, "grad_norm": 41.04487609863281, "learning_rate": 3.8961779806046776e-06, "loss": 6.2759, "step": 688 }, { "epoch": 0.11791887728906383, "grad_norm": 16.468915939331055, "learning_rate": 3.901882487164861e-06, "loss": 3.8545, "step": 689 }, { "epoch": 0.11809002224884477, "grad_norm": 27.10782241821289, "learning_rate": 3.907586993725043e-06, "loss": 7.596, "step": 690 }, { "epoch": 0.1182611672086257, "grad_norm": 25.684919357299805, "learning_rate": 3.913291500285225e-06, "loss": 4.2235, "step": 691 }, { "epoch": 0.11843231216840663, "grad_norm": 27.2288761138916, "learning_rate": 3.918996006845408e-06, "loss": 6.9975, "step": 692 }, { "epoch": 0.11860345712818758, "grad_norm": 33.26142120361328, "learning_rate": 3.92470051340559e-06, "loss": 6.5592, "step": 693 }, { "epoch": 0.11877460208796851, "grad_norm": 38.89694595336914, "learning_rate": 3.930405019965774e-06, "loss": 7.2757, "step": 694 }, { "epoch": 0.11894574704774945, "grad_norm": 27.99795150756836, "learning_rate": 3.936109526525956e-06, "loss": 6.7422, "step": 695 }, { "epoch": 0.11911689200753038, "grad_norm": 24.109289169311523, "learning_rate": 3.941814033086138e-06, "loss": 4.9175, "step": 696 }, { "epoch": 0.11928803696731131, "grad_norm": 15.462040901184082, "learning_rate": 3.94751853964632e-06, "loss": 2.7684, "step": 697 }, { "epoch": 0.11945918192709225, "grad_norm": 39.17838668823242, "learning_rate": 3.953223046206503e-06, "loss": 7.1518, "step": 698 }, { "epoch": 0.11963032688687318, "grad_norm": 30.83951759338379, "learning_rate": 3.958927552766685e-06, "loss": 3.8832, "step": 699 }, { "epoch": 0.11980147184665411, "grad_norm": 26.964744567871094, "learning_rate": 3.964632059326869e-06, "loss": 7.224, "step": 700 }, { "epoch": 0.11997261680643505, "grad_norm": 36.607975006103516, "learning_rate": 3.970336565887051e-06, "loss": 7.3389, "step": 701 }, { "epoch": 0.12014376176621598, "grad_norm": 37.18532180786133, "learning_rate": 3.976041072447234e-06, "loss": 6.1083, "step": 702 }, { "epoch": 0.12031490672599691, "grad_norm": 29.550649642944336, "learning_rate": 3.9817455790074155e-06, "loss": 5.1898, "step": 703 }, { "epoch": 0.12048605168577785, "grad_norm": 24.146198272705078, "learning_rate": 3.987450085567598e-06, "loss": 4.6196, "step": 704 }, { "epoch": 0.1206571966455588, "grad_norm": 25.126737594604492, "learning_rate": 3.993154592127781e-06, "loss": 2.7422, "step": 705 }, { "epoch": 0.12082834160533973, "grad_norm": 18.79334259033203, "learning_rate": 3.998859098687964e-06, "loss": 2.6716, "step": 706 }, { "epoch": 0.12099948656512066, "grad_norm": 33.249168395996094, "learning_rate": 4.0045636052481465e-06, "loss": 5.3053, "step": 707 }, { "epoch": 0.1211706315249016, "grad_norm": 26.934682846069336, "learning_rate": 4.010268111808329e-06, "loss": 4.514, "step": 708 }, { "epoch": 0.12134177648468253, "grad_norm": 44.88846206665039, "learning_rate": 4.015972618368511e-06, "loss": 6.4733, "step": 709 }, { "epoch": 0.12151292144446346, "grad_norm": 41.93711471557617, "learning_rate": 4.021677124928693e-06, "loss": 7.4558, "step": 710 }, { "epoch": 0.1216840664042444, "grad_norm": 41.59209060668945, "learning_rate": 4.027381631488877e-06, "loss": 7.0372, "step": 711 }, { "epoch": 0.12185521136402533, "grad_norm": 41.47358703613281, "learning_rate": 4.033086138049059e-06, "loss": 7.2868, "step": 712 }, { "epoch": 0.12202635632380626, "grad_norm": 41.380741119384766, "learning_rate": 4.038790644609242e-06, "loss": 7.6225, "step": 713 }, { "epoch": 0.1221975012835872, "grad_norm": 40.343788146972656, "learning_rate": 4.044495151169424e-06, "loss": 7.2916, "step": 714 }, { "epoch": 0.12236864624336813, "grad_norm": 30.69339370727539, "learning_rate": 4.050199657729606e-06, "loss": 5.8809, "step": 715 }, { "epoch": 0.12253979120314906, "grad_norm": 25.84669303894043, "learning_rate": 4.0559041642897885e-06, "loss": 2.8596, "step": 716 }, { "epoch": 0.12271093616293, "grad_norm": 37.5709114074707, "learning_rate": 4.061608670849972e-06, "loss": 6.5536, "step": 717 }, { "epoch": 0.12288208112271094, "grad_norm": 44.87430191040039, "learning_rate": 4.067313177410154e-06, "loss": 7.8952, "step": 718 }, { "epoch": 0.12305322608249188, "grad_norm": 29.630413055419922, "learning_rate": 4.073017683970337e-06, "loss": 7.2852, "step": 719 }, { "epoch": 0.12322437104227281, "grad_norm": 38.17768096923828, "learning_rate": 4.0787221905305194e-06, "loss": 7.2025, "step": 720 }, { "epoch": 0.12339551600205374, "grad_norm": 31.9378719329834, "learning_rate": 4.084426697090702e-06, "loss": 3.3231, "step": 721 }, { "epoch": 0.12356666096183468, "grad_norm": 15.323390007019043, "learning_rate": 4.090131203650884e-06, "loss": 3.5441, "step": 722 }, { "epoch": 0.12373780592161561, "grad_norm": 32.09744644165039, "learning_rate": 4.095835710211067e-06, "loss": 5.9737, "step": 723 }, { "epoch": 0.12390895088139654, "grad_norm": 32.49777603149414, "learning_rate": 4.1015402167712496e-06, "loss": 5.6116, "step": 724 }, { "epoch": 0.12408009584117748, "grad_norm": 32.568031311035156, "learning_rate": 4.107244723331432e-06, "loss": 6.8512, "step": 725 }, { "epoch": 0.12425124080095841, "grad_norm": 27.68449592590332, "learning_rate": 4.112949229891615e-06, "loss": 3.8807, "step": 726 }, { "epoch": 0.12442238576073934, "grad_norm": 28.595746994018555, "learning_rate": 4.118653736451797e-06, "loss": 2.7513, "step": 727 }, { "epoch": 0.12459353072052028, "grad_norm": 40.44917678833008, "learning_rate": 4.124358243011979e-06, "loss": 6.9441, "step": 728 }, { "epoch": 0.12476467568030121, "grad_norm": 34.75537872314453, "learning_rate": 4.130062749572162e-06, "loss": 6.2836, "step": 729 }, { "epoch": 0.12493582064008214, "grad_norm": 32.49576950073242, "learning_rate": 4.135767256132345e-06, "loss": 3.6985, "step": 730 }, { "epoch": 0.1251069655998631, "grad_norm": 33.09941482543945, "learning_rate": 4.141471762692527e-06, "loss": 6.414, "step": 731 }, { "epoch": 0.12527811055964402, "grad_norm": 33.988101959228516, "learning_rate": 4.14717626925271e-06, "loss": 6.8846, "step": 732 }, { "epoch": 0.12544925551942496, "grad_norm": 34.69337844848633, "learning_rate": 4.152880775812892e-06, "loss": 5.9908, "step": 733 }, { "epoch": 0.1256204004792059, "grad_norm": 42.33815383911133, "learning_rate": 4.158585282373075e-06, "loss": 7.2186, "step": 734 }, { "epoch": 0.12579154543898682, "grad_norm": 21.35869598388672, "learning_rate": 4.164289788933257e-06, "loss": 3.2239, "step": 735 }, { "epoch": 0.12596269039876776, "grad_norm": 34.62517166137695, "learning_rate": 4.16999429549344e-06, "loss": 6.2926, "step": 736 }, { "epoch": 0.1261338353585487, "grad_norm": 32.758544921875, "learning_rate": 4.1756988020536225e-06, "loss": 6.2203, "step": 737 }, { "epoch": 0.12630498031832962, "grad_norm": 17.39285659790039, "learning_rate": 4.181403308613805e-06, "loss": 3.2563, "step": 738 }, { "epoch": 0.12647612527811056, "grad_norm": 32.22175598144531, "learning_rate": 4.1871078151739875e-06, "loss": 5.3068, "step": 739 }, { "epoch": 0.1266472702378915, "grad_norm": 38.13700485229492, "learning_rate": 4.19281232173417e-06, "loss": 7.6128, "step": 740 }, { "epoch": 0.12681841519767242, "grad_norm": 35.74038314819336, "learning_rate": 4.198516828294353e-06, "loss": 6.6528, "step": 741 }, { "epoch": 0.12698956015745336, "grad_norm": 12.027849197387695, "learning_rate": 4.204221334854535e-06, "loss": 2.255, "step": 742 }, { "epoch": 0.1271607051172343, "grad_norm": 36.75061798095703, "learning_rate": 4.209925841414718e-06, "loss": 6.2444, "step": 743 }, { "epoch": 0.12733185007701522, "grad_norm": 43.853187561035156, "learning_rate": 4.2156303479749e-06, "loss": 6.3509, "step": 744 }, { "epoch": 0.12750299503679616, "grad_norm": 31.670143127441406, "learning_rate": 4.221334854535083e-06, "loss": 7.3242, "step": 745 }, { "epoch": 0.1276741399965771, "grad_norm": 24.049455642700195, "learning_rate": 4.227039361095265e-06, "loss": 4.5557, "step": 746 }, { "epoch": 0.12784528495635802, "grad_norm": 22.603431701660156, "learning_rate": 4.232743867655448e-06, "loss": 4.3686, "step": 747 }, { "epoch": 0.12801642991613896, "grad_norm": 33.28196716308594, "learning_rate": 4.23844837421563e-06, "loss": 7.897, "step": 748 }, { "epoch": 0.1281875748759199, "grad_norm": 14.154582023620605, "learning_rate": 4.244152880775813e-06, "loss": 2.5184, "step": 749 }, { "epoch": 0.12835871983570085, "grad_norm": 34.31758117675781, "learning_rate": 4.249857387335995e-06, "loss": 6.4663, "step": 750 }, { "epoch": 0.12852986479548179, "grad_norm": 29.4487361907959, "learning_rate": 4.255561893896179e-06, "loss": 6.4908, "step": 751 }, { "epoch": 0.12870100975526272, "grad_norm": 26.144145965576172, "learning_rate": 4.261266400456361e-06, "loss": 2.7209, "step": 752 }, { "epoch": 0.12887215471504365, "grad_norm": 32.20002746582031, "learning_rate": 4.266970907016543e-06, "loss": 5.3474, "step": 753 }, { "epoch": 0.12904329967482459, "grad_norm": 22.889114379882812, "learning_rate": 4.2726754135767255e-06, "loss": 4.439, "step": 754 }, { "epoch": 0.12921444463460552, "grad_norm": 29.033794403076172, "learning_rate": 4.278379920136908e-06, "loss": 3.1768, "step": 755 }, { "epoch": 0.12938558959438645, "grad_norm": 36.977718353271484, "learning_rate": 4.2840844266970906e-06, "loss": 6.725, "step": 756 }, { "epoch": 0.12955673455416739, "grad_norm": 24.76682472229004, "learning_rate": 4.289788933257274e-06, "loss": 2.3437, "step": 757 }, { "epoch": 0.12972787951394832, "grad_norm": 16.016826629638672, "learning_rate": 4.2954934398174565e-06, "loss": 2.8201, "step": 758 }, { "epoch": 0.12989902447372925, "grad_norm": 14.587915420532227, "learning_rate": 4.301197946377638e-06, "loss": 3.5146, "step": 759 }, { "epoch": 0.13007016943351019, "grad_norm": 26.081321716308594, "learning_rate": 4.306902452937821e-06, "loss": 2.6582, "step": 760 }, { "epoch": 0.13024131439329112, "grad_norm": 16.497404098510742, "learning_rate": 4.312606959498003e-06, "loss": 2.6039, "step": 761 }, { "epoch": 0.13041245935307205, "grad_norm": 30.642013549804688, "learning_rate": 4.318311466058186e-06, "loss": 5.6791, "step": 762 }, { "epoch": 0.13058360431285299, "grad_norm": 78.80982971191406, "learning_rate": 4.324015972618369e-06, "loss": 7.0474, "step": 763 }, { "epoch": 0.13075474927263392, "grad_norm": 31.678878784179688, "learning_rate": 4.329720479178552e-06, "loss": 7.2185, "step": 764 }, { "epoch": 0.13092589423241485, "grad_norm": 67.14193725585938, "learning_rate": 4.335424985738733e-06, "loss": 11.1752, "step": 765 }, { "epoch": 0.13109703919219579, "grad_norm": 30.7507381439209, "learning_rate": 4.341129492298916e-06, "loss": 5.3445, "step": 766 }, { "epoch": 0.13126818415197672, "grad_norm": 295.94195556640625, "learning_rate": 4.346833998859098e-06, "loss": 14.8463, "step": 767 }, { "epoch": 0.13143932911175765, "grad_norm": 31.96709442138672, "learning_rate": 4.352538505419281e-06, "loss": 5.9573, "step": 768 }, { "epoch": 0.13161047407153859, "grad_norm": 21.086137771606445, "learning_rate": 4.358243011979464e-06, "loss": 2.51, "step": 769 }, { "epoch": 0.13178161903131952, "grad_norm": 23.69211196899414, "learning_rate": 4.363947518539647e-06, "loss": 3.9384, "step": 770 }, { "epoch": 0.13195276399110045, "grad_norm": 29.09503173828125, "learning_rate": 4.369652025099829e-06, "loss": 2.6477, "step": 771 }, { "epoch": 0.13212390895088139, "grad_norm": 34.086483001708984, "learning_rate": 4.375356531660011e-06, "loss": 6.8362, "step": 772 }, { "epoch": 0.13229505391066232, "grad_norm": 22.358131408691406, "learning_rate": 4.381061038220194e-06, "loss": 2.5553, "step": 773 }, { "epoch": 0.13246619887044325, "grad_norm": 32.83020782470703, "learning_rate": 4.386765544780377e-06, "loss": 5.6526, "step": 774 }, { "epoch": 0.1326373438302242, "grad_norm": 32.111629486083984, "learning_rate": 4.3924700513405595e-06, "loss": 7.0077, "step": 775 }, { "epoch": 0.13280848879000515, "grad_norm": 28.587032318115234, "learning_rate": 4.398174557900742e-06, "loss": 3.9449, "step": 776 }, { "epoch": 0.13297963374978608, "grad_norm": 28.547178268432617, "learning_rate": 4.403879064460925e-06, "loss": 5.1286, "step": 777 }, { "epoch": 0.133150778709567, "grad_norm": 31.409543991088867, "learning_rate": 4.409583571021106e-06, "loss": 5.3343, "step": 778 }, { "epoch": 0.13332192366934795, "grad_norm": 33.33236312866211, "learning_rate": 4.415288077581289e-06, "loss": 6.5061, "step": 779 }, { "epoch": 0.13349306862912888, "grad_norm": 226.51580810546875, "learning_rate": 4.420992584141472e-06, "loss": 13.5725, "step": 780 }, { "epoch": 0.1336642135889098, "grad_norm": 29.707599639892578, "learning_rate": 4.426697090701655e-06, "loss": 3.5054, "step": 781 }, { "epoch": 0.13383535854869075, "grad_norm": 29.84592628479004, "learning_rate": 4.432401597261837e-06, "loss": 5.5461, "step": 782 }, { "epoch": 0.13400650350847168, "grad_norm": 23.87710189819336, "learning_rate": 4.43810610382202e-06, "loss": 2.7581, "step": 783 }, { "epoch": 0.1341776484682526, "grad_norm": 27.90047264099121, "learning_rate": 4.4438106103822015e-06, "loss": 5.0602, "step": 784 }, { "epoch": 0.13434879342803355, "grad_norm": 229.59202575683594, "learning_rate": 4.449515116942384e-06, "loss": 8.4299, "step": 785 }, { "epoch": 0.13451993838781448, "grad_norm": 35.904483795166016, "learning_rate": 4.455219623502567e-06, "loss": 6.6261, "step": 786 }, { "epoch": 0.13469108334759541, "grad_norm": 13.451172828674316, "learning_rate": 4.46092413006275e-06, "loss": 3.6844, "step": 787 }, { "epoch": 0.13486222830737635, "grad_norm": 220.5408172607422, "learning_rate": 4.4666286366229324e-06, "loss": 12.0786, "step": 788 }, { "epoch": 0.13503337326715728, "grad_norm": 30.378768920898438, "learning_rate": 4.472333143183115e-06, "loss": 6.4301, "step": 789 }, { "epoch": 0.13520451822693821, "grad_norm": 24.894784927368164, "learning_rate": 4.478037649743297e-06, "loss": 4.0456, "step": 790 }, { "epoch": 0.13537566318671915, "grad_norm": 63.11225509643555, "learning_rate": 4.48374215630348e-06, "loss": 10.8823, "step": 791 }, { "epoch": 0.13554680814650008, "grad_norm": 30.484046936035156, "learning_rate": 4.4894466628636626e-06, "loss": 4.5215, "step": 792 }, { "epoch": 0.13571795310628101, "grad_norm": 33.15967559814453, "learning_rate": 4.495151169423845e-06, "loss": 6.1466, "step": 793 }, { "epoch": 0.13588909806606195, "grad_norm": 31.415679931640625, "learning_rate": 4.500855675984028e-06, "loss": 5.0997, "step": 794 }, { "epoch": 0.13606024302584288, "grad_norm": 29.878276824951172, "learning_rate": 4.50656018254421e-06, "loss": 6.9375, "step": 795 }, { "epoch": 0.13623138798562381, "grad_norm": 33.10092544555664, "learning_rate": 4.512264689104393e-06, "loss": 6.2231, "step": 796 }, { "epoch": 0.13640253294540475, "grad_norm": 21.412826538085938, "learning_rate": 4.517969195664575e-06, "loss": 1.8474, "step": 797 }, { "epoch": 0.13657367790518568, "grad_norm": 31.297100067138672, "learning_rate": 4.523673702224758e-06, "loss": 5.4762, "step": 798 }, { "epoch": 0.13674482286496661, "grad_norm": 234.58111572265625, "learning_rate": 4.52937820878494e-06, "loss": 11.739, "step": 799 }, { "epoch": 0.13691596782474755, "grad_norm": 204.88748168945312, "learning_rate": 4.535082715345123e-06, "loss": 13.5482, "step": 800 }, { "epoch": 0.1370871127845285, "grad_norm": 33.66855239868164, "learning_rate": 4.540787221905305e-06, "loss": 5.9551, "step": 801 }, { "epoch": 0.13725825774430944, "grad_norm": 30.423555374145508, "learning_rate": 4.546491728465488e-06, "loss": 6.3931, "step": 802 }, { "epoch": 0.13742940270409038, "grad_norm": 30.737445831298828, "learning_rate": 4.55219623502567e-06, "loss": 4.7871, "step": 803 }, { "epoch": 0.1376005476638713, "grad_norm": 95.86985778808594, "learning_rate": 4.557900741585853e-06, "loss": 6.8129, "step": 804 }, { "epoch": 0.13777169262365224, "grad_norm": 36.5138053894043, "learning_rate": 4.5636052481460355e-06, "loss": 6.4333, "step": 805 }, { "epoch": 0.13794283758343318, "grad_norm": 31.310596466064453, "learning_rate": 4.569309754706218e-06, "loss": 6.1982, "step": 806 }, { "epoch": 0.1381139825432141, "grad_norm": 32.40011978149414, "learning_rate": 4.5750142612664005e-06, "loss": 6.5281, "step": 807 }, { "epoch": 0.13828512750299504, "grad_norm": 33.58089828491211, "learning_rate": 4.580718767826583e-06, "loss": 5.0059, "step": 808 }, { "epoch": 0.13845627246277598, "grad_norm": 46.53955841064453, "learning_rate": 4.586423274386766e-06, "loss": 10.3345, "step": 809 }, { "epoch": 0.1386274174225569, "grad_norm": 23.006080627441406, "learning_rate": 4.592127780946948e-06, "loss": 2.1468, "step": 810 }, { "epoch": 0.13879856238233784, "grad_norm": 21.113685607910156, "learning_rate": 4.597832287507131e-06, "loss": 2.0972, "step": 811 }, { "epoch": 0.13896970734211878, "grad_norm": 29.228193283081055, "learning_rate": 4.603536794067313e-06, "loss": 2.9408, "step": 812 }, { "epoch": 0.1391408523018997, "grad_norm": 39.542686462402344, "learning_rate": 4.609241300627496e-06, "loss": 6.4624, "step": 813 }, { "epoch": 0.13931199726168064, "grad_norm": 42.17389678955078, "learning_rate": 4.614945807187679e-06, "loss": 7.4244, "step": 814 }, { "epoch": 0.13948314222146158, "grad_norm": 31.26105308532715, "learning_rate": 4.620650313747861e-06, "loss": 6.5606, "step": 815 }, { "epoch": 0.1396542871812425, "grad_norm": 40.22693634033203, "learning_rate": 4.626354820308043e-06, "loss": 6.2725, "step": 816 }, { "epoch": 0.13982543214102344, "grad_norm": 25.14350700378418, "learning_rate": 4.632059326868226e-06, "loss": 4.0754, "step": 817 }, { "epoch": 0.13999657710080438, "grad_norm": 23.578937530517578, "learning_rate": 4.637763833428408e-06, "loss": 4.1309, "step": 818 }, { "epoch": 0.1401677220605853, "grad_norm": 37.57481002807617, "learning_rate": 4.643468339988591e-06, "loss": 5.8135, "step": 819 }, { "epoch": 0.14033886702036624, "grad_norm": 35.21710205078125, "learning_rate": 4.649172846548774e-06, "loss": 6.6982, "step": 820 }, { "epoch": 0.14051001198014718, "grad_norm": 14.915112495422363, "learning_rate": 4.654877353108957e-06, "loss": 2.1068, "step": 821 }, { "epoch": 0.1406811569399281, "grad_norm": 27.366252899169922, "learning_rate": 4.6605818596691385e-06, "loss": 3.2475, "step": 822 }, { "epoch": 0.14085230189970904, "grad_norm": 36.40489196777344, "learning_rate": 4.666286366229321e-06, "loss": 6.7448, "step": 823 }, { "epoch": 0.14102344685948998, "grad_norm": 37.40996551513672, "learning_rate": 4.6719908727895036e-06, "loss": 6.8328, "step": 824 }, { "epoch": 0.1411945918192709, "grad_norm": 255.09320068359375, "learning_rate": 4.677695379349686e-06, "loss": 12.0992, "step": 825 }, { "epoch": 0.14136573677905187, "grad_norm": 41.39365768432617, "learning_rate": 4.6833998859098695e-06, "loss": 6.1908, "step": 826 }, { "epoch": 0.1415368817388328, "grad_norm": 14.086997032165527, "learning_rate": 4.689104392470052e-06, "loss": 3.3856, "step": 827 }, { "epoch": 0.14170802669861374, "grad_norm": 33.170352935791016, "learning_rate": 4.694808899030234e-06, "loss": 6.9479, "step": 828 }, { "epoch": 0.14187917165839467, "grad_norm": 37.625064849853516, "learning_rate": 4.700513405590416e-06, "loss": 7.6713, "step": 829 }, { "epoch": 0.1420503166181756, "grad_norm": 25.476303100585938, "learning_rate": 4.706217912150599e-06, "loss": 4.2481, "step": 830 }, { "epoch": 0.14222146157795654, "grad_norm": 27.399072647094727, "learning_rate": 4.711922418710781e-06, "loss": 5.508, "step": 831 }, { "epoch": 0.14239260653773747, "grad_norm": 31.020893096923828, "learning_rate": 4.717626925270965e-06, "loss": 5.8831, "step": 832 }, { "epoch": 0.1425637514975184, "grad_norm": 26.108135223388672, "learning_rate": 4.723331431831147e-06, "loss": 3.5932, "step": 833 }, { "epoch": 0.14273489645729934, "grad_norm": 35.8662109375, "learning_rate": 4.729035938391329e-06, "loss": 5.1499, "step": 834 }, { "epoch": 0.14290604141708027, "grad_norm": 34.714324951171875, "learning_rate": 4.734740444951511e-06, "loss": 5.9969, "step": 835 }, { "epoch": 0.1430771863768612, "grad_norm": 34.023067474365234, "learning_rate": 4.740444951511694e-06, "loss": 6.4575, "step": 836 }, { "epoch": 0.14324833133664214, "grad_norm": 17.601118087768555, "learning_rate": 4.746149458071877e-06, "loss": 2.5208, "step": 837 }, { "epoch": 0.14341947629642307, "grad_norm": 19.672815322875977, "learning_rate": 4.75185396463206e-06, "loss": 2.1216, "step": 838 }, { "epoch": 0.143590621256204, "grad_norm": 25.771137237548828, "learning_rate": 4.757558471192242e-06, "loss": 2.6146, "step": 839 }, { "epoch": 0.14376176621598494, "grad_norm": 32.17550277709961, "learning_rate": 4.763262977752424e-06, "loss": 5.8516, "step": 840 }, { "epoch": 0.14393291117576587, "grad_norm": 72.34523010253906, "learning_rate": 4.768967484312607e-06, "loss": 11.0212, "step": 841 }, { "epoch": 0.1441040561355468, "grad_norm": 22.756717681884766, "learning_rate": 4.774671990872789e-06, "loss": 2.3166, "step": 842 }, { "epoch": 0.14427520109532774, "grad_norm": 22.13291358947754, "learning_rate": 4.7803764974329725e-06, "loss": 2.2079, "step": 843 }, { "epoch": 0.14444634605510867, "grad_norm": 65.32748413085938, "learning_rate": 4.786081003993155e-06, "loss": 6.3309, "step": 844 }, { "epoch": 0.1446174910148896, "grad_norm": 242.9714813232422, "learning_rate": 4.791785510553338e-06, "loss": 11.9379, "step": 845 }, { "epoch": 0.14478863597467054, "grad_norm": 21.737802505493164, "learning_rate": 4.79749001711352e-06, "loss": 3.806, "step": 846 }, { "epoch": 0.14495978093445147, "grad_norm": 29.438758850097656, "learning_rate": 4.803194523673702e-06, "loss": 5.7729, "step": 847 }, { "epoch": 0.1451309258942324, "grad_norm": 25.701087951660156, "learning_rate": 4.808899030233884e-06, "loss": 2.8536, "step": 848 }, { "epoch": 0.14530207085401334, "grad_norm": 130.01524353027344, "learning_rate": 4.814603536794068e-06, "loss": 7.5391, "step": 849 }, { "epoch": 0.14547321581379427, "grad_norm": 30.284828186035156, "learning_rate": 4.82030804335425e-06, "loss": 3.806, "step": 850 }, { "epoch": 0.1456443607735752, "grad_norm": 23.351642608642578, "learning_rate": 4.826012549914433e-06, "loss": 4.2263, "step": 851 }, { "epoch": 0.14581550573335617, "grad_norm": 216.2431182861328, "learning_rate": 4.831717056474615e-06, "loss": 9.159, "step": 852 }, { "epoch": 0.1459866506931371, "grad_norm": 35.071754455566406, "learning_rate": 4.837421563034797e-06, "loss": 6.503, "step": 853 }, { "epoch": 0.14615779565291803, "grad_norm": 34.0211296081543, "learning_rate": 4.84312606959498e-06, "loss": 6.4636, "step": 854 }, { "epoch": 0.14632894061269897, "grad_norm": 17.20896339416504, "learning_rate": 4.848830576155163e-06, "loss": 2.7218, "step": 855 }, { "epoch": 0.1465000855724799, "grad_norm": 136.72647094726562, "learning_rate": 4.8545350827153454e-06, "loss": 7.7082, "step": 856 }, { "epoch": 0.14667123053226083, "grad_norm": 53.50956344604492, "learning_rate": 4.860239589275528e-06, "loss": 10.0171, "step": 857 }, { "epoch": 0.14684237549204177, "grad_norm": 21.030473709106445, "learning_rate": 4.8659440958357105e-06, "loss": 4.1916, "step": 858 }, { "epoch": 0.1470135204518227, "grad_norm": 34.38727569580078, "learning_rate": 4.871648602395892e-06, "loss": 5.969, "step": 859 }, { "epoch": 0.14718466541160363, "grad_norm": 22.703882217407227, "learning_rate": 4.8773531089560756e-06, "loss": 2.4073, "step": 860 }, { "epoch": 0.14735581037138457, "grad_norm": 33.388858795166016, "learning_rate": 4.883057615516258e-06, "loss": 5.7571, "step": 861 }, { "epoch": 0.1475269553311655, "grad_norm": 35.79853820800781, "learning_rate": 4.888762122076441e-06, "loss": 5.9363, "step": 862 }, { "epoch": 0.14769810029094643, "grad_norm": 20.656721115112305, "learning_rate": 4.894466628636623e-06, "loss": 2.0406, "step": 863 }, { "epoch": 0.14786924525072737, "grad_norm": 35.20976638793945, "learning_rate": 4.900171135196806e-06, "loss": 5.8613, "step": 864 }, { "epoch": 0.1480403902105083, "grad_norm": 22.342880249023438, "learning_rate": 4.905875641756987e-06, "loss": 4.0119, "step": 865 }, { "epoch": 0.14821153517028923, "grad_norm": 33.253292083740234, "learning_rate": 4.911580148317171e-06, "loss": 4.62, "step": 866 }, { "epoch": 0.14838268013007017, "grad_norm": 186.65093994140625, "learning_rate": 4.917284654877353e-06, "loss": 11.2662, "step": 867 }, { "epoch": 0.1485538250898511, "grad_norm": 15.842426300048828, "learning_rate": 4.922989161437536e-06, "loss": 2.0607, "step": 868 }, { "epoch": 0.14872497004963203, "grad_norm": 26.70699119567871, "learning_rate": 4.928693667997718e-06, "loss": 3.1737, "step": 869 }, { "epoch": 0.14889611500941297, "grad_norm": 33.37158966064453, "learning_rate": 4.934398174557901e-06, "loss": 4.7352, "step": 870 }, { "epoch": 0.1490672599691939, "grad_norm": 26.4490966796875, "learning_rate": 4.940102681118083e-06, "loss": 4.2178, "step": 871 }, { "epoch": 0.14923840492897483, "grad_norm": 33.25678634643555, "learning_rate": 4.945807187678266e-06, "loss": 5.0764, "step": 872 }, { "epoch": 0.14940954988875577, "grad_norm": 38.204769134521484, "learning_rate": 4.9515116942384485e-06, "loss": 5.8078, "step": 873 }, { "epoch": 0.1495806948485367, "grad_norm": 27.79875946044922, "learning_rate": 4.957216200798631e-06, "loss": 5.6432, "step": 874 }, { "epoch": 0.14975183980831763, "grad_norm": 32.442115783691406, "learning_rate": 4.9629207073588135e-06, "loss": 5.7378, "step": 875 }, { "epoch": 0.14992298476809857, "grad_norm": 57.06877517700195, "learning_rate": 4.968625213918996e-06, "loss": 10.3136, "step": 876 }, { "epoch": 0.15009412972787953, "grad_norm": 32.131187438964844, "learning_rate": 4.9743297204791794e-06, "loss": 4.6921, "step": 877 }, { "epoch": 0.15009412972787953, "eval_nli-pairs_loss": 5.535374164581299, "eval_nli-pairs_runtime": 4.3709, "eval_nli-pairs_samples_per_second": 45.757, "eval_nli-pairs_steps_per_second": 1.601, "eval_sts-test_pearson_cosine": 0.6147169012893178, "eval_sts-test_pearson_dot": 0.4334302941897573, "eval_sts-test_pearson_euclidean": 0.6082490673246602, "eval_sts-test_pearson_manhattan": 0.616700428941834, "eval_sts-test_pearson_max": 0.616700428941834, "eval_sts-test_spearman_cosine": 0.5972327557562241, "eval_sts-test_spearman_dot": 0.41946207508864325, "eval_sts-test_spearman_euclidean": 0.5959187544369754, "eval_sts-test_spearman_manhattan": 0.6029031731511296, "eval_sts-test_spearman_max": 0.6029031731511296, "step": 877 }, { "epoch": 0.15009412972787953, "eval_vitaminc-pairs_loss": 3.619838237762451, "eval_vitaminc-pairs_runtime": 2.7372, "eval_vitaminc-pairs_samples_per_second": 73.068, "eval_vitaminc-pairs_steps_per_second": 2.557, "step": 877 }, { "epoch": 0.15009412972787953, "eval_qnli-contrastive_loss": 12.3779878616333, "eval_qnli-contrastive_runtime": 0.6382, "eval_qnli-contrastive_samples_per_second": 313.373, "eval_qnli-contrastive_steps_per_second": 10.968, "step": 877 }, { "epoch": 0.15009412972787953, "eval_scitail-pairs-qa_loss": 1.6706750392913818, "eval_scitail-pairs-qa_runtime": 1.6279, "eval_scitail-pairs-qa_samples_per_second": 122.855, "eval_scitail-pairs-qa_steps_per_second": 4.3, "step": 877 }, { "epoch": 0.15009412972787953, "eval_scitail-pairs-pos_loss": 3.0242857933044434, "eval_scitail-pairs-pos_runtime": 2.6188, "eval_scitail-pairs-pos_samples_per_second": 76.369, "eval_scitail-pairs-pos_steps_per_second": 2.673, "step": 877 }, { "epoch": 0.15009412972787953, "eval_xsum-pairs_loss": 3.0581634044647217, "eval_xsum-pairs_runtime": 2.6458, "eval_xsum-pairs_samples_per_second": 66.142, "eval_xsum-pairs_steps_per_second": 2.268, "step": 877 }, { "epoch": 0.15009412972787953, "eval_compression-pairs_loss": 1.9685934782028198, "eval_compression-pairs_runtime": 0.5084, "eval_compression-pairs_samples_per_second": 393.398, "eval_compression-pairs_steps_per_second": 13.769, "step": 877 }, { "epoch": 0.15009412972787953, "eval_sciq_pairs_loss": 6.824851989746094, "eval_sciq_pairs_runtime": 9.1685, "eval_sciq_pairs_samples_per_second": 21.814, "eval_sciq_pairs_steps_per_second": 0.763, "step": 877 }, { "epoch": 0.15009412972787953, "eval_qasc_pairs_loss": 10.253314018249512, "eval_qasc_pairs_runtime": 2.6538, "eval_qasc_pairs_samples_per_second": 75.363, "eval_qasc_pairs_steps_per_second": 2.638, "step": 877 }, { "epoch": 0.15009412972787953, "eval_openbookqa_pairs_loss": 5.933743953704834, "eval_openbookqa_pairs_runtime": 0.6418, "eval_openbookqa_pairs_samples_per_second": 107.513, "eval_openbookqa_pairs_steps_per_second": 4.674, "step": 877 }, { "epoch": 0.15009412972787953, "eval_msmarco_pairs_loss": 5.185385704040527, "eval_msmarco_pairs_runtime": 3.9947, "eval_msmarco_pairs_samples_per_second": 50.067, "eval_msmarco_pairs_steps_per_second": 1.752, "step": 877 }, { "epoch": 0.15009412972787953, "eval_nq_pairs_loss": 6.44993782043457, "eval_nq_pairs_runtime": 8.638, "eval_nq_pairs_samples_per_second": 23.153, "eval_nq_pairs_steps_per_second": 0.81, "step": 877 }, { "epoch": 0.15009412972787953, "eval_trivia_pairs_loss": 6.129721641540527, "eval_trivia_pairs_runtime": 12.8296, "eval_trivia_pairs_samples_per_second": 15.589, "eval_trivia_pairs_steps_per_second": 0.546, "step": 877 }, { "epoch": 0.15009412972787953, "eval_quora_pairs_loss": 1.7218067646026611, "eval_quora_pairs_runtime": 1.5931, "eval_quora_pairs_samples_per_second": 125.544, "eval_quora_pairs_steps_per_second": 4.394, "step": 877 }, { "epoch": 0.15009412972787953, "eval_gooaq_pairs_loss": 4.168159008026123, "eval_gooaq_pairs_runtime": 2.6679, "eval_gooaq_pairs_samples_per_second": 74.966, "eval_gooaq_pairs_steps_per_second": 2.624, "step": 877 }, { "epoch": 0.15026527468766046, "grad_norm": 29.085119247436523, "learning_rate": 4.980034227039361e-06, "loss": 5.8249, "step": 878 }, { "epoch": 0.1504364196474414, "grad_norm": 35.45232009887695, "learning_rate": 4.985738733599544e-06, "loss": 6.378, "step": 879 }, { "epoch": 0.15060756460722233, "grad_norm": 34.018470764160156, "learning_rate": 4.991443240159726e-06, "loss": 5.326, "step": 880 }, { "epoch": 0.15077870956700326, "grad_norm": 22.30814552307129, "learning_rate": 4.997147746719909e-06, "loss": 2.6674, "step": 881 }, { "epoch": 0.1509498545267842, "grad_norm": 36.679046630859375, "learning_rate": 5.002852253280091e-06, "loss": 6.6655, "step": 882 }, { "epoch": 0.15112099948656513, "grad_norm": 36.78900146484375, "learning_rate": 5.008556759840275e-06, "loss": 4.5851, "step": 883 }, { "epoch": 0.15129214444634606, "grad_norm": 46.770057678222656, "learning_rate": 5.014261266400456e-06, "loss": 9.9308, "step": 884 }, { "epoch": 0.151463289406127, "grad_norm": 27.262338638305664, "learning_rate": 5.019965772960639e-06, "loss": 2.2515, "step": 885 }, { "epoch": 0.15163443436590793, "grad_norm": 193.24122619628906, "learning_rate": 5.025670279520821e-06, "loss": 10.7631, "step": 886 }, { "epoch": 0.15180557932568886, "grad_norm": 30.53336524963379, "learning_rate": 5.031374786081004e-06, "loss": 3.9297, "step": 887 }, { "epoch": 0.1519767242854698, "grad_norm": 13.035544395446777, "learning_rate": 5.0370792926411864e-06, "loss": 3.16, "step": 888 }, { "epoch": 0.15214786924525073, "grad_norm": 27.65202522277832, "learning_rate": 5.04278379920137e-06, "loss": 3.1012, "step": 889 }, { "epoch": 0.15231901420503166, "grad_norm": 28.412954330444336, "learning_rate": 5.0484883057615515e-06, "loss": 2.4251, "step": 890 }, { "epoch": 0.1524901591648126, "grad_norm": 35.567386627197266, "learning_rate": 5.054192812321734e-06, "loss": 5.1793, "step": 891 }, { "epoch": 0.15266130412459353, "grad_norm": 31.945302963256836, "learning_rate": 5.0598973188819166e-06, "loss": 4.9138, "step": 892 }, { "epoch": 0.15283244908437446, "grad_norm": 30.31682014465332, "learning_rate": 5.065601825442099e-06, "loss": 4.8582, "step": 893 }, { "epoch": 0.1530035940441554, "grad_norm": 22.3225040435791, "learning_rate": 5.0713063320022825e-06, "loss": 2.003, "step": 894 }, { "epoch": 0.15317473900393633, "grad_norm": 23.375139236450195, "learning_rate": 5.077010838562465e-06, "loss": 2.3547, "step": 895 }, { "epoch": 0.15334588396371726, "grad_norm": 32.41263198852539, "learning_rate": 5.0827153451226475e-06, "loss": 6.2287, "step": 896 }, { "epoch": 0.1535170289234982, "grad_norm": 20.43022346496582, "learning_rate": 5.088419851682829e-06, "loss": 2.1189, "step": 897 }, { "epoch": 0.15368817388327913, "grad_norm": 37.203250885009766, "learning_rate": 5.094124358243012e-06, "loss": 6.3629, "step": 898 }, { "epoch": 0.15385931884306006, "grad_norm": 19.725624084472656, "learning_rate": 5.099828864803194e-06, "loss": 2.2277, "step": 899 }, { "epoch": 0.154030463802841, "grad_norm": 27.29782485961914, "learning_rate": 5.105533371363378e-06, "loss": 2.8851, "step": 900 }, { "epoch": 0.15420160876262193, "grad_norm": 172.8111572265625, "learning_rate": 5.11123787792356e-06, "loss": 9.9783, "step": 901 }, { "epoch": 0.1543727537224029, "grad_norm": 56.5546875, "learning_rate": 5.116942384483743e-06, "loss": 10.3301, "step": 902 }, { "epoch": 0.15454389868218382, "grad_norm": 32.12007522583008, "learning_rate": 5.122646891043924e-06, "loss": 3.3146, "step": 903 }, { "epoch": 0.15471504364196476, "grad_norm": 197.39170837402344, "learning_rate": 5.128351397604107e-06, "loss": 11.016, "step": 904 }, { "epoch": 0.1548861886017457, "grad_norm": 36.48847579956055, "learning_rate": 5.1340559041642895e-06, "loss": 4.8215, "step": 905 }, { "epoch": 0.15505733356152662, "grad_norm": 31.014644622802734, "learning_rate": 5.139760410724473e-06, "loss": 4.7237, "step": 906 }, { "epoch": 0.15522847852130756, "grad_norm": 31.436952590942383, "learning_rate": 5.145464917284655e-06, "loss": 4.6175, "step": 907 }, { "epoch": 0.1553996234810885, "grad_norm": 27.38591194152832, "learning_rate": 5.151169423844838e-06, "loss": 4.0958, "step": 908 }, { "epoch": 0.15557076844086942, "grad_norm": 31.732324600219727, "learning_rate": 5.15687393040502e-06, "loss": 4.4682, "step": 909 }, { "epoch": 0.15574191340065036, "grad_norm": 15.360635757446289, "learning_rate": 5.162578436965202e-06, "loss": 2.4148, "step": 910 }, { "epoch": 0.1559130583604313, "grad_norm": 172.3378448486328, "learning_rate": 5.168282943525385e-06, "loss": 9.8466, "step": 911 }, { "epoch": 0.15608420332021222, "grad_norm": 31.59737777709961, "learning_rate": 5.173987450085568e-06, "loss": 6.1221, "step": 912 }, { "epoch": 0.15625534827999316, "grad_norm": 20.06523323059082, "learning_rate": 5.179691956645751e-06, "loss": 2.0035, "step": 913 }, { "epoch": 0.1564264932397741, "grad_norm": 25.82581329345703, "learning_rate": 5.185396463205933e-06, "loss": 4.7388, "step": 914 }, { "epoch": 0.15659763819955502, "grad_norm": 13.644715309143066, "learning_rate": 5.191100969766115e-06, "loss": 2.1442, "step": 915 }, { "epoch": 0.15676878315933596, "grad_norm": 36.4990119934082, "learning_rate": 5.196805476326297e-06, "loss": 6.2552, "step": 916 }, { "epoch": 0.1569399281191169, "grad_norm": 35.6190185546875, "learning_rate": 5.202509982886481e-06, "loss": 6.3529, "step": 917 }, { "epoch": 0.15711107307889782, "grad_norm": 13.495047569274902, "learning_rate": 5.208214489446663e-06, "loss": 3.5731, "step": 918 }, { "epoch": 0.15728221803867876, "grad_norm": 236.7681121826172, "learning_rate": 5.213918996006846e-06, "loss": 10.5726, "step": 919 }, { "epoch": 0.1574533629984597, "grad_norm": 34.39946746826172, "learning_rate": 5.219623502567028e-06, "loss": 6.0673, "step": 920 }, { "epoch": 0.15762450795824062, "grad_norm": 12.590995788574219, "learning_rate": 5.225328009127211e-06, "loss": 2.77, "step": 921 }, { "epoch": 0.15779565291802156, "grad_norm": 31.968891143798828, "learning_rate": 5.2310325156873925e-06, "loss": 4.1677, "step": 922 }, { "epoch": 0.1579667978778025, "grad_norm": 31.067489624023438, "learning_rate": 5.236737022247576e-06, "loss": 4.716, "step": 923 }, { "epoch": 0.15813794283758342, "grad_norm": 36.08390808105469, "learning_rate": 5.2424415288077584e-06, "loss": 6.528, "step": 924 }, { "epoch": 0.15830908779736436, "grad_norm": 34.2723274230957, "learning_rate": 5.248146035367941e-06, "loss": 6.4655, "step": 925 }, { "epoch": 0.1584802327571453, "grad_norm": 43.43145751953125, "learning_rate": 5.2538505419281235e-06, "loss": 5.6795, "step": 926 }, { "epoch": 0.15865137771692622, "grad_norm": 32.78499221801758, "learning_rate": 5.259555048488306e-06, "loss": 5.6396, "step": 927 }, { "epoch": 0.15882252267670718, "grad_norm": 35.156925201416016, "learning_rate": 5.265259555048488e-06, "loss": 4.7143, "step": 928 }, { "epoch": 0.15899366763648812, "grad_norm": 34.6341552734375, "learning_rate": 5.270964061608671e-06, "loss": 5.6931, "step": 929 }, { "epoch": 0.15916481259626905, "grad_norm": 35.668331146240234, "learning_rate": 5.276668568168854e-06, "loss": 5.6404, "step": 930 }, { "epoch": 0.15933595755604998, "grad_norm": 34.62514877319336, "learning_rate": 5.282373074729036e-06, "loss": 5.0469, "step": 931 }, { "epoch": 0.15950710251583092, "grad_norm": 37.79499435424805, "learning_rate": 5.288077581289219e-06, "loss": 5.3761, "step": 932 }, { "epoch": 0.15967824747561185, "grad_norm": 40.4017333984375, "learning_rate": 5.293782087849401e-06, "loss": 5.6738, "step": 933 }, { "epoch": 0.15984939243539278, "grad_norm": 35.31856155395508, "learning_rate": 5.299486594409584e-06, "loss": 6.4936, "step": 934 }, { "epoch": 0.16002053739517372, "grad_norm": 126.11963653564453, "learning_rate": 5.305191100969766e-06, "loss": 9.9326, "step": 935 }, { "epoch": 0.16019168235495465, "grad_norm": 34.740753173828125, "learning_rate": 5.310895607529949e-06, "loss": 2.0987, "step": 936 }, { "epoch": 0.16036282731473558, "grad_norm": 34.9671745300293, "learning_rate": 5.316600114090131e-06, "loss": 6.2338, "step": 937 }, { "epoch": 0.16053397227451652, "grad_norm": 21.198925018310547, "learning_rate": 5.322304620650314e-06, "loss": 3.5463, "step": 938 }, { "epoch": 0.16070511723429745, "grad_norm": 30.98229217529297, "learning_rate": 5.328009127210496e-06, "loss": 4.7342, "step": 939 }, { "epoch": 0.16087626219407838, "grad_norm": 41.88993835449219, "learning_rate": 5.333713633770679e-06, "loss": 6.5058, "step": 940 }, { "epoch": 0.16104740715385932, "grad_norm": 24.218576431274414, "learning_rate": 5.3394181403308615e-06, "loss": 2.0172, "step": 941 }, { "epoch": 0.16121855211364025, "grad_norm": 32.891719818115234, "learning_rate": 5.345122646891044e-06, "loss": 5.893, "step": 942 }, { "epoch": 0.16138969707342118, "grad_norm": 38.93867874145508, "learning_rate": 5.3508271534512265e-06, "loss": 5.8157, "step": 943 }, { "epoch": 0.16156084203320212, "grad_norm": 31.02938461303711, "learning_rate": 5.356531660011409e-06, "loss": 5.529, "step": 944 }, { "epoch": 0.16173198699298305, "grad_norm": 36.240440368652344, "learning_rate": 5.362236166571592e-06, "loss": 4.7931, "step": 945 }, { "epoch": 0.16190313195276398, "grad_norm": 23.227556228637695, "learning_rate": 5.367940673131775e-06, "loss": 2.1265, "step": 946 }, { "epoch": 0.16207427691254492, "grad_norm": 40.07374954223633, "learning_rate": 5.373645179691957e-06, "loss": 5.8823, "step": 947 }, { "epoch": 0.16224542187232585, "grad_norm": 29.960735321044922, "learning_rate": 5.379349686252139e-06, "loss": 4.6281, "step": 948 }, { "epoch": 0.16241656683210678, "grad_norm": 173.5910186767578, "learning_rate": 5.385054192812322e-06, "loss": 10.3282, "step": 949 }, { "epoch": 0.16258771179188772, "grad_norm": 37.48442840576172, "learning_rate": 5.390758699372504e-06, "loss": 6.1584, "step": 950 }, { "epoch": 0.16275885675166865, "grad_norm": 39.48939514160156, "learning_rate": 5.396463205932687e-06, "loss": 5.655, "step": 951 }, { "epoch": 0.16293000171144958, "grad_norm": 34.57015609741211, "learning_rate": 5.40216771249287e-06, "loss": 5.4251, "step": 952 }, { "epoch": 0.16310114667123055, "grad_norm": 51.02991485595703, "learning_rate": 5.407872219053052e-06, "loss": 10.2283, "step": 953 }, { "epoch": 0.16327229163101148, "grad_norm": 31.77302360534668, "learning_rate": 5.413576725613234e-06, "loss": 4.0174, "step": 954 }, { "epoch": 0.1634434365907924, "grad_norm": 31.242929458618164, "learning_rate": 5.419281232173417e-06, "loss": 5.5883, "step": 955 }, { "epoch": 0.16361458155057335, "grad_norm": 31.789701461791992, "learning_rate": 5.4249857387335994e-06, "loss": 4.5646, "step": 956 }, { "epoch": 0.16378572651035428, "grad_norm": 34.09980392456055, "learning_rate": 5.430690245293783e-06, "loss": 4.9872, "step": 957 }, { "epoch": 0.1639568714701352, "grad_norm": 31.57735252380371, "learning_rate": 5.436394751853965e-06, "loss": 5.158, "step": 958 }, { "epoch": 0.16412801642991615, "grad_norm": 32.941917419433594, "learning_rate": 5.442099258414147e-06, "loss": 5.4497, "step": 959 }, { "epoch": 0.16429916138969708, "grad_norm": 200.919921875, "learning_rate": 5.4478037649743296e-06, "loss": 9.7888, "step": 960 }, { "epoch": 0.164470306349478, "grad_norm": 28.78856658935547, "learning_rate": 5.453508271534512e-06, "loss": 5.0757, "step": 961 }, { "epoch": 0.16464145130925895, "grad_norm": 22.877927780151367, "learning_rate": 5.459212778094695e-06, "loss": 3.6177, "step": 962 }, { "epoch": 0.16481259626903988, "grad_norm": 24.904977798461914, "learning_rate": 5.464917284654878e-06, "loss": 4.2287, "step": 963 }, { "epoch": 0.1649837412288208, "grad_norm": 35.849124908447266, "learning_rate": 5.4706217912150605e-06, "loss": 5.1121, "step": 964 }, { "epoch": 0.16515488618860175, "grad_norm": 31.580976486206055, "learning_rate": 5.476326297775242e-06, "loss": 4.4859, "step": 965 }, { "epoch": 0.16532603114838268, "grad_norm": 30.3056697845459, "learning_rate": 5.482030804335425e-06, "loss": 4.5076, "step": 966 }, { "epoch": 0.1654971761081636, "grad_norm": 34.674468994140625, "learning_rate": 5.487735310895607e-06, "loss": 5.7789, "step": 967 }, { "epoch": 0.16566832106794455, "grad_norm": 28.0445556640625, "learning_rate": 5.49343981745579e-06, "loss": 2.7613, "step": 968 }, { "epoch": 0.16583946602772548, "grad_norm": 33.28575134277344, "learning_rate": 5.499144324015973e-06, "loss": 5.1032, "step": 969 }, { "epoch": 0.1660106109875064, "grad_norm": 35.53700637817383, "learning_rate": 5.504848830576156e-06, "loss": 5.2129, "step": 970 }, { "epoch": 0.16618175594728735, "grad_norm": 33.2183952331543, "learning_rate": 5.510553337136338e-06, "loss": 5.6908, "step": 971 }, { "epoch": 0.16635290090706828, "grad_norm": 30.640926361083984, "learning_rate": 5.51625784369652e-06, "loss": 4.4325, "step": 972 }, { "epoch": 0.1665240458668492, "grad_norm": 24.672338485717773, "learning_rate": 5.5219623502567025e-06, "loss": 3.9552, "step": 973 }, { "epoch": 0.16669519082663015, "grad_norm": 33.66337585449219, "learning_rate": 5.527666856816886e-06, "loss": 5.4014, "step": 974 }, { "epoch": 0.16686633578641108, "grad_norm": 32.082942962646484, "learning_rate": 5.533371363377068e-06, "loss": 5.9258, "step": 975 }, { "epoch": 0.167037480746192, "grad_norm": 37.91094970703125, "learning_rate": 5.539075869937251e-06, "loss": 5.717, "step": 976 }, { "epoch": 0.16720862570597295, "grad_norm": 20.26280975341797, "learning_rate": 5.5447803764974335e-06, "loss": 2.2263, "step": 977 }, { "epoch": 0.16737977066575388, "grad_norm": 48.14308547973633, "learning_rate": 5.550484883057615e-06, "loss": 9.6938, "step": 978 }, { "epoch": 0.16755091562553484, "grad_norm": 22.81192970275879, "learning_rate": 5.556189389617798e-06, "loss": 3.7015, "step": 979 }, { "epoch": 0.16772206058531577, "grad_norm": 27.474571228027344, "learning_rate": 5.561893896177981e-06, "loss": 2.9404, "step": 980 }, { "epoch": 0.1678932055450967, "grad_norm": 25.376007080078125, "learning_rate": 5.567598402738164e-06, "loss": 2.3926, "step": 981 }, { "epoch": 0.16806435050487764, "grad_norm": 31.575468063354492, "learning_rate": 5.573302909298346e-06, "loss": 4.7349, "step": 982 }, { "epoch": 0.16823549546465857, "grad_norm": 194.93817138671875, "learning_rate": 5.579007415858529e-06, "loss": 9.7172, "step": 983 }, { "epoch": 0.1684066404244395, "grad_norm": 31.26558494567871, "learning_rate": 5.58471192241871e-06, "loss": 3.9837, "step": 984 }, { "epoch": 0.16857778538422044, "grad_norm": 32.1373405456543, "learning_rate": 5.590416428978893e-06, "loss": 5.0026, "step": 985 }, { "epoch": 0.16874893034400137, "grad_norm": 37.07416915893555, "learning_rate": 5.596120935539076e-06, "loss": 5.8572, "step": 986 }, { "epoch": 0.1689200753037823, "grad_norm": 35.09983825683594, "learning_rate": 5.601825442099259e-06, "loss": 5.6302, "step": 987 }, { "epoch": 0.16909122026356324, "grad_norm": 46.96855926513672, "learning_rate": 5.607529948659441e-06, "loss": 9.6255, "step": 988 }, { "epoch": 0.16926236522334417, "grad_norm": 36.15262985229492, "learning_rate": 5.613234455219624e-06, "loss": 5.5484, "step": 989 }, { "epoch": 0.1694335101831251, "grad_norm": 33.642967224121094, "learning_rate": 5.6189389617798055e-06, "loss": 5.5827, "step": 990 }, { "epoch": 0.16960465514290604, "grad_norm": 27.581716537475586, "learning_rate": 5.624643468339988e-06, "loss": 2.9652, "step": 991 }, { "epoch": 0.16977580010268697, "grad_norm": 19.107044219970703, "learning_rate": 5.6303479749001714e-06, "loss": 1.7442, "step": 992 }, { "epoch": 0.1699469450624679, "grad_norm": 165.6937255859375, "learning_rate": 5.636052481460354e-06, "loss": 10.2439, "step": 993 }, { "epoch": 0.17011809002224884, "grad_norm": 171.38658142089844, "learning_rate": 5.6417569880205365e-06, "loss": 10.7544, "step": 994 }, { "epoch": 0.17028923498202977, "grad_norm": 29.20503807067871, "learning_rate": 5.647461494580719e-06, "loss": 4.176, "step": 995 }, { "epoch": 0.1704603799418107, "grad_norm": 29.09612274169922, "learning_rate": 5.6531660011409016e-06, "loss": 4.1945, "step": 996 }, { "epoch": 0.17063152490159164, "grad_norm": 39.78682327270508, "learning_rate": 5.658870507701084e-06, "loss": 6.4205, "step": 997 }, { "epoch": 0.17080266986137257, "grad_norm": 13.687639236450195, "learning_rate": 5.664575014261267e-06, "loss": 3.468, "step": 998 }, { "epoch": 0.1709738148211535, "grad_norm": 41.89799118041992, "learning_rate": 5.670279520821449e-06, "loss": 7.13, "step": 999 }, { "epoch": 0.17114495978093444, "grad_norm": 22.78835678100586, "learning_rate": 5.675984027381632e-06, "loss": 2.7249, "step": 1000 }, { "epoch": 0.17131610474071537, "grad_norm": 26.538780212402344, "learning_rate": 5.681688533941814e-06, "loss": 3.2385, "step": 1001 }, { "epoch": 0.1714872497004963, "grad_norm": 24.171205520629883, "learning_rate": 5.687393040501997e-06, "loss": 3.7183, "step": 1002 }, { "epoch": 0.17165839466027724, "grad_norm": 35.46499252319336, "learning_rate": 5.693097547062179e-06, "loss": 5.4996, "step": 1003 }, { "epoch": 0.1718295396200582, "grad_norm": 15.119646072387695, "learning_rate": 5.698802053622362e-06, "loss": 2.4476, "step": 1004 }, { "epoch": 0.17200068457983914, "grad_norm": 43.560546875, "learning_rate": 5.704506560182544e-06, "loss": 9.1856, "step": 1005 }, { "epoch": 0.17217182953962007, "grad_norm": 42.41808319091797, "learning_rate": 5.710211066742727e-06, "loss": 5.6756, "step": 1006 }, { "epoch": 0.172342974499401, "grad_norm": 34.344207763671875, "learning_rate": 5.715915573302909e-06, "loss": 5.2383, "step": 1007 }, { "epoch": 0.17251411945918194, "grad_norm": 19.511310577392578, "learning_rate": 5.721620079863092e-06, "loss": 3.3214, "step": 1008 }, { "epoch": 0.17268526441896287, "grad_norm": 33.06563949584961, "learning_rate": 5.7273245864232745e-06, "loss": 5.6944, "step": 1009 }, { "epoch": 0.1728564093787438, "grad_norm": 38.382041931152344, "learning_rate": 5.733029092983457e-06, "loss": 5.9898, "step": 1010 }, { "epoch": 0.17302755433852474, "grad_norm": 28.5861759185791, "learning_rate": 5.7387335995436395e-06, "loss": 5.2048, "step": 1011 }, { "epoch": 0.17319869929830567, "grad_norm": 31.76646614074707, "learning_rate": 5.744438106103822e-06, "loss": 6.0811, "step": 1012 }, { "epoch": 0.1733698442580866, "grad_norm": 37.81482696533203, "learning_rate": 5.750142612664005e-06, "loss": 4.8642, "step": 1013 }, { "epoch": 0.17354098921786754, "grad_norm": 45.32394790649414, "learning_rate": 5.755847119224188e-06, "loss": 9.5803, "step": 1014 }, { "epoch": 0.17371213417764847, "grad_norm": 35.39071273803711, "learning_rate": 5.76155162578437e-06, "loss": 4.3758, "step": 1015 }, { "epoch": 0.1738832791374294, "grad_norm": 31.971323013305664, "learning_rate": 5.767256132344552e-06, "loss": 4.2616, "step": 1016 }, { "epoch": 0.17405442409721034, "grad_norm": 29.855161666870117, "learning_rate": 5.772960638904735e-06, "loss": 5.5371, "step": 1017 }, { "epoch": 0.17422556905699127, "grad_norm": 21.00974464416504, "learning_rate": 5.778665145464917e-06, "loss": 1.9809, "step": 1018 }, { "epoch": 0.1743967140167722, "grad_norm": 23.60835075378418, "learning_rate": 5.7843696520251e-06, "loss": 2.5916, "step": 1019 }, { "epoch": 0.17456785897655314, "grad_norm": 36.11520767211914, "learning_rate": 5.790074158585283e-06, "loss": 4.9198, "step": 1020 }, { "epoch": 0.17473900393633407, "grad_norm": 21.838703155517578, "learning_rate": 5.795778665145466e-06, "loss": 2.1235, "step": 1021 }, { "epoch": 0.174910148896115, "grad_norm": 28.41387367248535, "learning_rate": 5.801483171705647e-06, "loss": 5.0401, "step": 1022 }, { "epoch": 0.17508129385589594, "grad_norm": 28.482187271118164, "learning_rate": 5.80718767826583e-06, "loss": 4.7167, "step": 1023 }, { "epoch": 0.17525243881567687, "grad_norm": 33.954307556152344, "learning_rate": 5.8128921848260124e-06, "loss": 4.9666, "step": 1024 }, { "epoch": 0.1754235837754578, "grad_norm": 33.401920318603516, "learning_rate": 5.818596691386195e-06, "loss": 6.3783, "step": 1025 }, { "epoch": 0.17559472873523874, "grad_norm": 37.047691345214844, "learning_rate": 5.824301197946378e-06, "loss": 5.5925, "step": 1026 }, { "epoch": 0.17576587369501967, "grad_norm": 30.060083389282227, "learning_rate": 5.830005704506561e-06, "loss": 3.8415, "step": 1027 }, { "epoch": 0.1759370186548006, "grad_norm": 30.832544326782227, "learning_rate": 5.8357102110667426e-06, "loss": 4.9379, "step": 1028 }, { "epoch": 0.17610816361458156, "grad_norm": 30.651966094970703, "learning_rate": 5.841414717626925e-06, "loss": 3.9393, "step": 1029 }, { "epoch": 0.1762793085743625, "grad_norm": 12.284616470336914, "learning_rate": 5.847119224187108e-06, "loss": 2.7979, "step": 1030 }, { "epoch": 0.17645045353414343, "grad_norm": 25.138864517211914, "learning_rate": 5.85282373074729e-06, "loss": 3.6294, "step": 1031 }, { "epoch": 0.17662159849392436, "grad_norm": 19.136524200439453, "learning_rate": 5.8585282373074735e-06, "loss": 1.5926, "step": 1032 }, { "epoch": 0.1767927434537053, "grad_norm": 36.646968841552734, "learning_rate": 5.864232743867656e-06, "loss": 5.8265, "step": 1033 }, { "epoch": 0.17696388841348623, "grad_norm": 17.363170623779297, "learning_rate": 5.869937250427838e-06, "loss": 1.7465, "step": 1034 }, { "epoch": 0.17713503337326716, "grad_norm": 29.55439567565918, "learning_rate": 5.87564175698802e-06, "loss": 3.617, "step": 1035 }, { "epoch": 0.1773061783330481, "grad_norm": 203.16549682617188, "learning_rate": 5.881346263548203e-06, "loss": 7.9826, "step": 1036 }, { "epoch": 0.17747732329282903, "grad_norm": 17.790836334228516, "learning_rate": 5.887050770108386e-06, "loss": 2.1574, "step": 1037 }, { "epoch": 0.17764846825260996, "grad_norm": 40.40040969848633, "learning_rate": 5.892755276668569e-06, "loss": 5.5116, "step": 1038 }, { "epoch": 0.1778196132123909, "grad_norm": 30.316959381103516, "learning_rate": 5.898459783228751e-06, "loss": 4.4268, "step": 1039 }, { "epoch": 0.17799075817217183, "grad_norm": 34.86418151855469, "learning_rate": 5.904164289788933e-06, "loss": 4.9673, "step": 1040 }, { "epoch": 0.17816190313195276, "grad_norm": 198.34268188476562, "learning_rate": 5.9098687963491155e-06, "loss": 10.3881, "step": 1041 }, { "epoch": 0.1783330480917337, "grad_norm": 29.608211517333984, "learning_rate": 5.915573302909298e-06, "loss": 3.9641, "step": 1042 }, { "epoch": 0.17850419305151463, "grad_norm": 28.76857566833496, "learning_rate": 5.921277809469481e-06, "loss": 4.0211, "step": 1043 }, { "epoch": 0.17867533801129556, "grad_norm": 26.37080955505371, "learning_rate": 5.926982316029664e-06, "loss": 4.6642, "step": 1044 }, { "epoch": 0.1788464829710765, "grad_norm": 32.01490020751953, "learning_rate": 5.9326868225898464e-06, "loss": 5.5217, "step": 1045 }, { "epoch": 0.17901762793085743, "grad_norm": 22.62516212463379, "learning_rate": 5.938391329150029e-06, "loss": 1.9563, "step": 1046 }, { "epoch": 0.17918877289063836, "grad_norm": 40.089229583740234, "learning_rate": 5.944095835710211e-06, "loss": 5.9567, "step": 1047 }, { "epoch": 0.1793599178504193, "grad_norm": 22.854562759399414, "learning_rate": 5.949800342270393e-06, "loss": 1.9063, "step": 1048 }, { "epoch": 0.17953106281020023, "grad_norm": 99.86076354980469, "learning_rate": 5.9555048488305766e-06, "loss": 6.6872, "step": 1049 }, { "epoch": 0.17970220776998116, "grad_norm": 42.04011154174805, "learning_rate": 5.961209355390759e-06, "loss": 6.4974, "step": 1050 }, { "epoch": 0.1798733527297621, "grad_norm": 26.85508155822754, "learning_rate": 5.966913861950942e-06, "loss": 4.3443, "step": 1051 }, { "epoch": 0.18004449768954303, "grad_norm": 29.8301944732666, "learning_rate": 5.972618368511124e-06, "loss": 5.0599, "step": 1052 }, { "epoch": 0.18021564264932396, "grad_norm": 50.89991760253906, "learning_rate": 5.978322875071306e-06, "loss": 9.764, "step": 1053 }, { "epoch": 0.1803867876091049, "grad_norm": 32.19784927368164, "learning_rate": 5.984027381631489e-06, "loss": 4.1811, "step": 1054 }, { "epoch": 0.18055793256888586, "grad_norm": 46.780487060546875, "learning_rate": 5.989731888191672e-06, "loss": 9.4505, "step": 1055 }, { "epoch": 0.1807290775286668, "grad_norm": 17.571828842163086, "learning_rate": 5.995436394751854e-06, "loss": 1.8957, "step": 1056 }, { "epoch": 0.18090022248844773, "grad_norm": 30.740095138549805, "learning_rate": 6.001140901312037e-06, "loss": 4.0522, "step": 1057 }, { "epoch": 0.18107136744822866, "grad_norm": 36.38762283325195, "learning_rate": 6.006845407872219e-06, "loss": 5.546, "step": 1058 }, { "epoch": 0.1812425124080096, "grad_norm": 37.66824722290039, "learning_rate": 6.012549914432401e-06, "loss": 4.7406, "step": 1059 }, { "epoch": 0.18141365736779053, "grad_norm": 33.9829216003418, "learning_rate": 6.018254420992584e-06, "loss": 4.8123, "step": 1060 }, { "epoch": 0.18158480232757146, "grad_norm": 25.99117088317871, "learning_rate": 6.023958927552767e-06, "loss": 4.6063, "step": 1061 }, { "epoch": 0.1817559472873524, "grad_norm": 29.198394775390625, "learning_rate": 6.0296634341129495e-06, "loss": 5.0514, "step": 1062 }, { "epoch": 0.18192709224713333, "grad_norm": 14.127655982971191, "learning_rate": 6.035367940673132e-06, "loss": 1.3962, "step": 1063 }, { "epoch": 0.18209823720691426, "grad_norm": 12.10257339477539, "learning_rate": 6.0410724472333145e-06, "loss": 2.0181, "step": 1064 }, { "epoch": 0.1822693821666952, "grad_norm": 19.635854721069336, "learning_rate": 6.046776953793496e-06, "loss": 1.7151, "step": 1065 }, { "epoch": 0.18244052712647613, "grad_norm": 189.35772705078125, "learning_rate": 6.05248146035368e-06, "loss": 9.8327, "step": 1066 }, { "epoch": 0.18261167208625706, "grad_norm": 34.833229064941406, "learning_rate": 6.058185966913862e-06, "loss": 5.6448, "step": 1067 }, { "epoch": 0.182782817046038, "grad_norm": 24.17336654663086, "learning_rate": 6.063890473474045e-06, "loss": 3.8977, "step": 1068 }, { "epoch": 0.18295396200581893, "grad_norm": 32.84638214111328, "learning_rate": 6.069594980034227e-06, "loss": 5.7649, "step": 1069 }, { "epoch": 0.18312510696559986, "grad_norm": 46.32835388183594, "learning_rate": 6.07529948659441e-06, "loss": 9.2569, "step": 1070 }, { "epoch": 0.1832962519253808, "grad_norm": 15.697673797607422, "learning_rate": 6.081003993154592e-06, "loss": 1.6445, "step": 1071 }, { "epoch": 0.18346739688516173, "grad_norm": 31.891868591308594, "learning_rate": 6.086708499714775e-06, "loss": 5.4669, "step": 1072 }, { "epoch": 0.18363854184494266, "grad_norm": 29.735248565673828, "learning_rate": 6.092413006274957e-06, "loss": 5.0552, "step": 1073 }, { "epoch": 0.1838096868047236, "grad_norm": 15.486328125, "learning_rate": 6.09811751283514e-06, "loss": 2.2292, "step": 1074 }, { "epoch": 0.18398083176450453, "grad_norm": 24.518693923950195, "learning_rate": 6.103822019395322e-06, "loss": 3.5355, "step": 1075 }, { "epoch": 0.18415197672428546, "grad_norm": 27.474645614624023, "learning_rate": 6.109526525955505e-06, "loss": 2.0704, "step": 1076 }, { "epoch": 0.1843231216840664, "grad_norm": 21.003856658935547, "learning_rate": 6.115231032515688e-06, "loss": 2.0773, "step": 1077 }, { "epoch": 0.18449426664384733, "grad_norm": 12.948555946350098, "learning_rate": 6.12093553907587e-06, "loss": 1.9105, "step": 1078 }, { "epoch": 0.18466541160362826, "grad_norm": 28.35967254638672, "learning_rate": 6.1266400456360525e-06, "loss": 5.1778, "step": 1079 }, { "epoch": 0.18483655656340922, "grad_norm": 28.59235954284668, "learning_rate": 6.132344552196235e-06, "loss": 3.9724, "step": 1080 }, { "epoch": 0.18500770152319015, "grad_norm": 32.077518463134766, "learning_rate": 6.138049058756418e-06, "loss": 4.2397, "step": 1081 }, { "epoch": 0.1851788464829711, "grad_norm": 34.8428955078125, "learning_rate": 6.1437535653166e-06, "loss": 4.3906, "step": 1082 }, { "epoch": 0.18534999144275202, "grad_norm": 36.8244743347168, "learning_rate": 6.1494580718767835e-06, "loss": 4.6433, "step": 1083 }, { "epoch": 0.18552113640253295, "grad_norm": 34.37318420410156, "learning_rate": 6.155162578436965e-06, "loss": 4.7285, "step": 1084 }, { "epoch": 0.1856922813623139, "grad_norm": 34.02301025390625, "learning_rate": 6.160867084997148e-06, "loss": 5.1995, "step": 1085 }, { "epoch": 0.18586342632209482, "grad_norm": 15.779897689819336, "learning_rate": 6.16657159155733e-06, "loss": 1.5138, "step": 1086 }, { "epoch": 0.18603457128187575, "grad_norm": 45.183841705322266, "learning_rate": 6.172276098117513e-06, "loss": 6.6194, "step": 1087 }, { "epoch": 0.1862057162416567, "grad_norm": 15.437774658203125, "learning_rate": 6.177980604677695e-06, "loss": 1.4242, "step": 1088 }, { "epoch": 0.18637686120143762, "grad_norm": 246.0555419921875, "learning_rate": 6.183685111237879e-06, "loss": 10.7677, "step": 1089 }, { "epoch": 0.18654800616121855, "grad_norm": 8.7081937789917, "learning_rate": 6.18938961779806e-06, "loss": 2.3527, "step": 1090 }, { "epoch": 0.1867191511209995, "grad_norm": 35.0928840637207, "learning_rate": 6.195094124358243e-06, "loss": 5.4856, "step": 1091 }, { "epoch": 0.18689029608078042, "grad_norm": 36.24078369140625, "learning_rate": 6.2007986309184254e-06, "loss": 5.1105, "step": 1092 }, { "epoch": 0.18706144104056135, "grad_norm": 41.07029724121094, "learning_rate": 6.206503137478608e-06, "loss": 5.543, "step": 1093 }, { "epoch": 0.1872325860003423, "grad_norm": 36.27534484863281, "learning_rate": 6.212207644038791e-06, "loss": 4.4058, "step": 1094 }, { "epoch": 0.18740373096012322, "grad_norm": 34.61309814453125, "learning_rate": 6.217912150598974e-06, "loss": 4.9065, "step": 1095 }, { "epoch": 0.18757487591990415, "grad_norm": 36.856388092041016, "learning_rate": 6.223616657159156e-06, "loss": 4.8059, "step": 1096 }, { "epoch": 0.1877460208796851, "grad_norm": 39.40951156616211, "learning_rate": 6.229321163719338e-06, "loss": 5.8853, "step": 1097 }, { "epoch": 0.18791716583946602, "grad_norm": 30.013790130615234, "learning_rate": 6.235025670279521e-06, "loss": 4.1051, "step": 1098 }, { "epoch": 0.18808831079924695, "grad_norm": 27.43667984008789, "learning_rate": 6.240730176839703e-06, "loss": 3.661, "step": 1099 }, { "epoch": 0.1882594557590279, "grad_norm": 22.01202964782715, "learning_rate": 6.2464346833998865e-06, "loss": 2.0165, "step": 1100 }, { "epoch": 0.18843060071880882, "grad_norm": 23.981887817382812, "learning_rate": 6.252139189960069e-06, "loss": 1.8586, "step": 1101 }, { "epoch": 0.18860174567858976, "grad_norm": 221.93540954589844, "learning_rate": 6.257843696520252e-06, "loss": 8.0869, "step": 1102 }, { "epoch": 0.1887728906383707, "grad_norm": 32.2524299621582, "learning_rate": 6.263548203080433e-06, "loss": 4.6553, "step": 1103 }, { "epoch": 0.18894403559815162, "grad_norm": 14.555329322814941, "learning_rate": 6.269252709640616e-06, "loss": 2.0657, "step": 1104 }, { "epoch": 0.18911518055793256, "grad_norm": 27.233903884887695, "learning_rate": 6.274957216200798e-06, "loss": 3.7143, "step": 1105 }, { "epoch": 0.18928632551771352, "grad_norm": 15.294402122497559, "learning_rate": 6.280661722760982e-06, "loss": 1.4409, "step": 1106 }, { "epoch": 0.18945747047749445, "grad_norm": 223.1316375732422, "learning_rate": 6.286366229321164e-06, "loss": 9.676, "step": 1107 }, { "epoch": 0.18962861543727538, "grad_norm": 36.643463134765625, "learning_rate": 6.292070735881347e-06, "loss": 4.7202, "step": 1108 }, { "epoch": 0.18979976039705632, "grad_norm": 37.47721481323242, "learning_rate": 6.2977752424415285e-06, "loss": 4.8366, "step": 1109 }, { "epoch": 0.18997090535683725, "grad_norm": 34.74982833862305, "learning_rate": 6.303479749001711e-06, "loss": 4.6667, "step": 1110 }, { "epoch": 0.19014205031661818, "grad_norm": 38.055728912353516, "learning_rate": 6.3091842555618935e-06, "loss": 5.3396, "step": 1111 }, { "epoch": 0.19031319527639912, "grad_norm": 33.44966506958008, "learning_rate": 6.314888762122077e-06, "loss": 5.0909, "step": 1112 }, { "epoch": 0.19048434023618005, "grad_norm": 34.397132873535156, "learning_rate": 6.3205932686822594e-06, "loss": 5.3514, "step": 1113 }, { "epoch": 0.19065548519596098, "grad_norm": 39.06338119506836, "learning_rate": 6.326297775242442e-06, "loss": 6.3797, "step": 1114 }, { "epoch": 0.19082663015574192, "grad_norm": 40.017799377441406, "learning_rate": 6.332002281802624e-06, "loss": 5.5943, "step": 1115 }, { "epoch": 0.19099777511552285, "grad_norm": 11.964347839355469, "learning_rate": 6.337706788362806e-06, "loss": 1.8095, "step": 1116 }, { "epoch": 0.19116892007530378, "grad_norm": 12.956400871276855, "learning_rate": 6.3434112949229896e-06, "loss": 1.3529, "step": 1117 }, { "epoch": 0.19134006503508472, "grad_norm": 36.93289566040039, "learning_rate": 6.349115801483172e-06, "loss": 6.0492, "step": 1118 }, { "epoch": 0.19151120999486565, "grad_norm": 33.92202377319336, "learning_rate": 6.354820308043355e-06, "loss": 5.9093, "step": 1119 }, { "epoch": 0.19168235495464658, "grad_norm": 37.51108169555664, "learning_rate": 6.360524814603537e-06, "loss": 5.5156, "step": 1120 }, { "epoch": 0.19185349991442752, "grad_norm": 23.369075775146484, "learning_rate": 6.36622932116372e-06, "loss": 3.9585, "step": 1121 }, { "epoch": 0.19202464487420845, "grad_norm": 27.76898765563965, "learning_rate": 6.371933827723901e-06, "loss": 4.0578, "step": 1122 }, { "epoch": 0.19219578983398938, "grad_norm": 21.719980239868164, "learning_rate": 6.377638334284085e-06, "loss": 1.6746, "step": 1123 }, { "epoch": 0.19236693479377032, "grad_norm": 32.65765380859375, "learning_rate": 6.383342840844267e-06, "loss": 4.4355, "step": 1124 }, { "epoch": 0.19253807975355125, "grad_norm": 31.302228927612305, "learning_rate": 6.38904734740445e-06, "loss": 4.3111, "step": 1125 }, { "epoch": 0.19270922471333218, "grad_norm": 36.785396575927734, "learning_rate": 6.394751853964632e-06, "loss": 5.3737, "step": 1126 }, { "epoch": 0.19288036967311312, "grad_norm": 32.185787200927734, "learning_rate": 6.400456360524815e-06, "loss": 4.2842, "step": 1127 }, { "epoch": 0.19305151463289405, "grad_norm": 49.154666900634766, "learning_rate": 6.4061608670849966e-06, "loss": 8.8989, "step": 1128 }, { "epoch": 0.19322265959267498, "grad_norm": 31.552207946777344, "learning_rate": 6.41186537364518e-06, "loss": 4.2685, "step": 1129 }, { "epoch": 0.19339380455245592, "grad_norm": 21.41136932373047, "learning_rate": 6.4175698802053625e-06, "loss": 2.3051, "step": 1130 }, { "epoch": 0.19356494951223688, "grad_norm": 13.525940895080566, "learning_rate": 6.423274386765545e-06, "loss": 2.1123, "step": 1131 }, { "epoch": 0.1937360944720178, "grad_norm": 37.48530960083008, "learning_rate": 6.4289788933257275e-06, "loss": 4.8037, "step": 1132 }, { "epoch": 0.19390723943179874, "grad_norm": 38.14132308959961, "learning_rate": 6.43468339988591e-06, "loss": 6.2294, "step": 1133 }, { "epoch": 0.19407838439157968, "grad_norm": 33.01750183105469, "learning_rate": 6.440387906446093e-06, "loss": 4.9204, "step": 1134 }, { "epoch": 0.1942495293513606, "grad_norm": 36.364158630371094, "learning_rate": 6.446092413006275e-06, "loss": 4.5797, "step": 1135 }, { "epoch": 0.19442067431114154, "grad_norm": 46.81378173828125, "learning_rate": 6.451796919566458e-06, "loss": 6.538, "step": 1136 }, { "epoch": 0.19459181927092248, "grad_norm": 23.135957717895508, "learning_rate": 6.45750142612664e-06, "loss": 4.3991, "step": 1137 }, { "epoch": 0.1947629642307034, "grad_norm": 25.031917572021484, "learning_rate": 6.463205932686823e-06, "loss": 2.3886, "step": 1138 }, { "epoch": 0.19493410919048434, "grad_norm": 35.31920623779297, "learning_rate": 6.468910439247005e-06, "loss": 6.0172, "step": 1139 }, { "epoch": 0.19510525415026528, "grad_norm": 36.97047424316406, "learning_rate": 6.474614945807188e-06, "loss": 5.4822, "step": 1140 }, { "epoch": 0.1952763991100462, "grad_norm": 31.77883529663086, "learning_rate": 6.48031945236737e-06, "loss": 4.7072, "step": 1141 }, { "epoch": 0.19544754406982714, "grad_norm": 28.897930145263672, "learning_rate": 6.486023958927553e-06, "loss": 3.7105, "step": 1142 }, { "epoch": 0.19561868902960808, "grad_norm": 29.99696922302246, "learning_rate": 6.491728465487735e-06, "loss": 4.5102, "step": 1143 }, { "epoch": 0.195789833989389, "grad_norm": 25.783557891845703, "learning_rate": 6.497432972047918e-06, "loss": 3.6023, "step": 1144 }, { "epoch": 0.19596097894916994, "grad_norm": 35.004642486572266, "learning_rate": 6.5031374786081005e-06, "loss": 4.1587, "step": 1145 }, { "epoch": 0.19613212390895088, "grad_norm": 173.46754455566406, "learning_rate": 6.508841985168284e-06, "loss": 7.5547, "step": 1146 }, { "epoch": 0.1963032688687318, "grad_norm": 18.749853134155273, "learning_rate": 6.5145464917284655e-06, "loss": 1.7298, "step": 1147 }, { "epoch": 0.19647441382851275, "grad_norm": 31.15353012084961, "learning_rate": 6.520250998288648e-06, "loss": 5.4053, "step": 1148 }, { "epoch": 0.19664555878829368, "grad_norm": 21.659912109375, "learning_rate": 6.525955504848831e-06, "loss": 1.8891, "step": 1149 }, { "epoch": 0.1968167037480746, "grad_norm": 23.412139892578125, "learning_rate": 6.531660011409013e-06, "loss": 3.8619, "step": 1150 }, { "epoch": 0.19698784870785555, "grad_norm": 22.16069221496582, "learning_rate": 6.537364517969196e-06, "loss": 2.0106, "step": 1151 }, { "epoch": 0.19715899366763648, "grad_norm": 33.494136810302734, "learning_rate": 6.543069024529379e-06, "loss": 5.4958, "step": 1152 }, { "epoch": 0.1973301386274174, "grad_norm": 32.96882629394531, "learning_rate": 6.548773531089561e-06, "loss": 4.5927, "step": 1153 }, { "epoch": 0.19750128358719835, "grad_norm": 36.14384078979492, "learning_rate": 6.554478037649743e-06, "loss": 5.6357, "step": 1154 }, { "epoch": 0.19767242854697928, "grad_norm": 23.875118255615234, "learning_rate": 6.560182544209926e-06, "loss": 3.158, "step": 1155 }, { "epoch": 0.19784357350676024, "grad_norm": 23.001026153564453, "learning_rate": 6.565887050770108e-06, "loss": 1.8949, "step": 1156 }, { "epoch": 0.19801471846654117, "grad_norm": 46.26600646972656, "learning_rate": 6.571591557330292e-06, "loss": 9.1329, "step": 1157 }, { "epoch": 0.1981858634263221, "grad_norm": 16.32296371459961, "learning_rate": 6.577296063890474e-06, "loss": 1.5302, "step": 1158 }, { "epoch": 0.19835700838610304, "grad_norm": 26.114614486694336, "learning_rate": 6.583000570450656e-06, "loss": 2.3763, "step": 1159 }, { "epoch": 0.19852815334588397, "grad_norm": 37.42622756958008, "learning_rate": 6.5887050770108384e-06, "loss": 5.5999, "step": 1160 }, { "epoch": 0.1986992983056649, "grad_norm": 21.48786735534668, "learning_rate": 6.594409583571021e-06, "loss": 3.4369, "step": 1161 }, { "epoch": 0.19887044326544584, "grad_norm": 24.472808837890625, "learning_rate": 6.6001140901312035e-06, "loss": 2.0175, "step": 1162 }, { "epoch": 0.19904158822522677, "grad_norm": 25.275909423828125, "learning_rate": 6.605818596691387e-06, "loss": 2.6992, "step": 1163 }, { "epoch": 0.1992127331850077, "grad_norm": 29.439197540283203, "learning_rate": 6.611523103251569e-06, "loss": 4.4373, "step": 1164 }, { "epoch": 0.19938387814478864, "grad_norm": 224.64663696289062, "learning_rate": 6.617227609811751e-06, "loss": 10.3737, "step": 1165 }, { "epoch": 0.19955502310456957, "grad_norm": 34.043575286865234, "learning_rate": 6.622932116371934e-06, "loss": 5.0921, "step": 1166 }, { "epoch": 0.1997261680643505, "grad_norm": 11.060107231140137, "learning_rate": 6.628636622932116e-06, "loss": 1.2996, "step": 1167 }, { "epoch": 0.19989731302413144, "grad_norm": 32.19368362426758, "learning_rate": 6.634341129492299e-06, "loss": 4.2537, "step": 1168 }, { "epoch": 0.20006845798391237, "grad_norm": 48.267578125, "learning_rate": 6.640045636052482e-06, "loss": 9.335, "step": 1169 }, { "epoch": 0.2002396029436933, "grad_norm": 19.327762603759766, "learning_rate": 6.645750142612665e-06, "loss": 1.8859, "step": 1170 }, { "epoch": 0.20041074790347424, "grad_norm": 28.81614875793457, "learning_rate": 6.651454649172847e-06, "loss": 3.8125, "step": 1171 }, { "epoch": 0.20058189286325517, "grad_norm": 24.971960067749023, "learning_rate": 6.657159155733029e-06, "loss": 3.0816, "step": 1172 }, { "epoch": 0.2007530378230361, "grad_norm": 154.4432373046875, "learning_rate": 6.662863662293211e-06, "loss": 8.568, "step": 1173 }, { "epoch": 0.20092418278281704, "grad_norm": 47.04978942871094, "learning_rate": 6.668568168853395e-06, "loss": 5.1816, "step": 1174 }, { "epoch": 0.20109532774259797, "grad_norm": 24.374345779418945, "learning_rate": 6.674272675413577e-06, "loss": 2.6078, "step": 1175 }, { "epoch": 0.2012664727023789, "grad_norm": 36.597232818603516, "learning_rate": 6.67997718197376e-06, "loss": 5.5402, "step": 1176 }, { "epoch": 0.20143761766215984, "grad_norm": 36.612060546875, "learning_rate": 6.685681688533942e-06, "loss": 5.17, "step": 1177 }, { "epoch": 0.20160876262194077, "grad_norm": 39.452117919921875, "learning_rate": 6.691386195094124e-06, "loss": 6.2861, "step": 1178 }, { "epoch": 0.2017799075817217, "grad_norm": 35.985816955566406, "learning_rate": 6.6970907016543065e-06, "loss": 5.7763, "step": 1179 }, { "epoch": 0.20195105254150264, "grad_norm": 11.960805892944336, "learning_rate": 6.70279520821449e-06, "loss": 2.7312, "step": 1180 }, { "epoch": 0.20212219750128357, "grad_norm": 154.7554168701172, "learning_rate": 6.7084997147746724e-06, "loss": 9.5806, "step": 1181 }, { "epoch": 0.20229334246106453, "grad_norm": 31.713943481445312, "learning_rate": 6.714204221334855e-06, "loss": 4.9006, "step": 1182 }, { "epoch": 0.20246448742084547, "grad_norm": 11.431591987609863, "learning_rate": 6.7199087278950375e-06, "loss": 3.1028, "step": 1183 }, { "epoch": 0.2026356323806264, "grad_norm": 208.2880859375, "learning_rate": 6.725613234455219e-06, "loss": 8.5447, "step": 1184 }, { "epoch": 0.20280677734040733, "grad_norm": 32.78763198852539, "learning_rate": 6.731317741015402e-06, "loss": 5.0437, "step": 1185 }, { "epoch": 0.20297792230018827, "grad_norm": 31.15655517578125, "learning_rate": 6.737022247575585e-06, "loss": 4.1921, "step": 1186 }, { "epoch": 0.2031490672599692, "grad_norm": 12.072607040405273, "learning_rate": 6.742726754135768e-06, "loss": 1.9291, "step": 1187 }, { "epoch": 0.20332021221975013, "grad_norm": 46.76679992675781, "learning_rate": 6.74843126069595e-06, "loss": 9.0577, "step": 1188 }, { "epoch": 0.20349135717953107, "grad_norm": 28.912738800048828, "learning_rate": 6.754135767256133e-06, "loss": 4.3274, "step": 1189 }, { "epoch": 0.203662502139312, "grad_norm": 151.7112579345703, "learning_rate": 6.759840273816315e-06, "loss": 8.1049, "step": 1190 }, { "epoch": 0.20383364709909293, "grad_norm": 19.557729721069336, "learning_rate": 6.765544780376497e-06, "loss": 1.6717, "step": 1191 }, { "epoch": 0.20400479205887387, "grad_norm": 37.28075408935547, "learning_rate": 6.77124928693668e-06, "loss": 5.6393, "step": 1192 }, { "epoch": 0.2041759370186548, "grad_norm": 33.639183044433594, "learning_rate": 6.776953793496863e-06, "loss": 4.9937, "step": 1193 }, { "epoch": 0.20434708197843574, "grad_norm": 16.514705657958984, "learning_rate": 6.782658300057045e-06, "loss": 2.2396, "step": 1194 }, { "epoch": 0.20451822693821667, "grad_norm": 29.29157066345215, "learning_rate": 6.788362806617228e-06, "loss": 4.5062, "step": 1195 }, { "epoch": 0.2046893718979976, "grad_norm": 24.25420570373535, "learning_rate": 6.79406731317741e-06, "loss": 2.5282, "step": 1196 }, { "epoch": 0.20486051685777854, "grad_norm": 21.87625503540039, "learning_rate": 6.799771819737593e-06, "loss": 2.2101, "step": 1197 }, { "epoch": 0.20503166181755947, "grad_norm": 29.727163314819336, "learning_rate": 6.8054763262977755e-06, "loss": 3.5679, "step": 1198 }, { "epoch": 0.2052028067773404, "grad_norm": 23.502267837524414, "learning_rate": 6.811180832857958e-06, "loss": 3.9821, "step": 1199 }, { "epoch": 0.20537395173712134, "grad_norm": 31.961931228637695, "learning_rate": 6.8168853394181405e-06, "loss": 4.6, "step": 1200 }, { "epoch": 0.20554509669690227, "grad_norm": 27.584300994873047, "learning_rate": 6.822589845978323e-06, "loss": 3.389, "step": 1201 }, { "epoch": 0.2057162416566832, "grad_norm": 34.41096115112305, "learning_rate": 6.828294352538506e-06, "loss": 4.722, "step": 1202 }, { "epoch": 0.20588738661646414, "grad_norm": 41.341312408447266, "learning_rate": 6.833998859098688e-06, "loss": 6.7225, "step": 1203 }, { "epoch": 0.20605853157624507, "grad_norm": 160.5906982421875, "learning_rate": 6.839703365658871e-06, "loss": 9.8412, "step": 1204 }, { "epoch": 0.206229676536026, "grad_norm": 23.49472999572754, "learning_rate": 6.845407872219053e-06, "loss": 3.6378, "step": 1205 }, { "epoch": 0.20640082149580694, "grad_norm": 31.307947158813477, "learning_rate": 6.851112378779236e-06, "loss": 3.6813, "step": 1206 }, { "epoch": 0.2065719664555879, "grad_norm": 27.893850326538086, "learning_rate": 6.856816885339418e-06, "loss": 4.5216, "step": 1207 }, { "epoch": 0.20674311141536883, "grad_norm": 32.200157165527344, "learning_rate": 6.862521391899601e-06, "loss": 4.5525, "step": 1208 }, { "epoch": 0.20691425637514976, "grad_norm": 31.765216827392578, "learning_rate": 6.868225898459783e-06, "loss": 5.2865, "step": 1209 }, { "epoch": 0.2070854013349307, "grad_norm": 35.562294006347656, "learning_rate": 6.873930405019966e-06, "loss": 5.0758, "step": 1210 }, { "epoch": 0.20725654629471163, "grad_norm": 44.582786560058594, "learning_rate": 6.879634911580148e-06, "loss": 8.7973, "step": 1211 }, { "epoch": 0.20742769125449256, "grad_norm": 29.667964935302734, "learning_rate": 6.885339418140331e-06, "loss": 3.7483, "step": 1212 }, { "epoch": 0.2075988362142735, "grad_norm": 33.826454162597656, "learning_rate": 6.8910439247005135e-06, "loss": 5.321, "step": 1213 }, { "epoch": 0.20776998117405443, "grad_norm": 36.56757354736328, "learning_rate": 6.896748431260697e-06, "loss": 4.6366, "step": 1214 }, { "epoch": 0.20794112613383536, "grad_norm": 21.483030319213867, "learning_rate": 6.9024529378208785e-06, "loss": 1.7844, "step": 1215 }, { "epoch": 0.2081122710936163, "grad_norm": 22.398630142211914, "learning_rate": 6.908157444381061e-06, "loss": 2.9002, "step": 1216 }, { "epoch": 0.20828341605339723, "grad_norm": 16.41680145263672, "learning_rate": 6.913861950941244e-06, "loss": 1.5466, "step": 1217 }, { "epoch": 0.20845456101317816, "grad_norm": 22.448949813842773, "learning_rate": 6.919566457501426e-06, "loss": 3.4011, "step": 1218 }, { "epoch": 0.2086257059729591, "grad_norm": 35.074989318847656, "learning_rate": 6.925270964061609e-06, "loss": 4.4769, "step": 1219 }, { "epoch": 0.20879685093274003, "grad_norm": 29.737442016601562, "learning_rate": 6.930975470621792e-06, "loss": 4.6152, "step": 1220 }, { "epoch": 0.20896799589252096, "grad_norm": 29.097299575805664, "learning_rate": 6.9366799771819746e-06, "loss": 3.8591, "step": 1221 }, { "epoch": 0.2091391408523019, "grad_norm": 22.356008529663086, "learning_rate": 6.942384483742156e-06, "loss": 3.6379, "step": 1222 }, { "epoch": 0.20931028581208283, "grad_norm": 29.412656784057617, "learning_rate": 6.948088990302339e-06, "loss": 3.5976, "step": 1223 }, { "epoch": 0.20948143077186376, "grad_norm": 19.5412654876709, "learning_rate": 6.953793496862521e-06, "loss": 2.0718, "step": 1224 }, { "epoch": 0.2096525757316447, "grad_norm": 17.43561363220215, "learning_rate": 6.959498003422704e-06, "loss": 1.5389, "step": 1225 }, { "epoch": 0.20982372069142563, "grad_norm": 34.85890579223633, "learning_rate": 6.965202509982887e-06, "loss": 4.4105, "step": 1226 }, { "epoch": 0.20999486565120656, "grad_norm": 33.83147430419922, "learning_rate": 6.97090701654307e-06, "loss": 4.108, "step": 1227 }, { "epoch": 0.2101660106109875, "grad_norm": 33.77149963378906, "learning_rate": 6.9766115231032514e-06, "loss": 4.4198, "step": 1228 }, { "epoch": 0.21033715557076843, "grad_norm": 12.30455207824707, "learning_rate": 6.982316029663434e-06, "loss": 1.7759, "step": 1229 }, { "epoch": 0.21050830053054936, "grad_norm": 34.55380630493164, "learning_rate": 6.9880205362236165e-06, "loss": 4.4813, "step": 1230 }, { "epoch": 0.2106794454903303, "grad_norm": 23.975025177001953, "learning_rate": 6.993725042783799e-06, "loss": 3.728, "step": 1231 }, { "epoch": 0.21085059045011123, "grad_norm": 190.6012725830078, "learning_rate": 6.999429549343982e-06, "loss": 10.1602, "step": 1232 }, { "epoch": 0.2110217354098922, "grad_norm": 34.527076721191406, "learning_rate": 7.005134055904165e-06, "loss": 4.7483, "step": 1233 }, { "epoch": 0.21119288036967312, "grad_norm": 35.65943908691406, "learning_rate": 7.010838562464347e-06, "loss": 5.5499, "step": 1234 }, { "epoch": 0.21136402532945406, "grad_norm": 34.03565216064453, "learning_rate": 7.016543069024529e-06, "loss": 4.7829, "step": 1235 }, { "epoch": 0.211535170289235, "grad_norm": 20.10201072692871, "learning_rate": 7.022247575584712e-06, "loss": 2.9853, "step": 1236 }, { "epoch": 0.21170631524901593, "grad_norm": 72.77118682861328, "learning_rate": 7.027952082144895e-06, "loss": 6.8184, "step": 1237 }, { "epoch": 0.21187746020879686, "grad_norm": 32.084381103515625, "learning_rate": 7.033656588705078e-06, "loss": 5.0572, "step": 1238 }, { "epoch": 0.2120486051685778, "grad_norm": 28.180423736572266, "learning_rate": 7.03936109526526e-06, "loss": 3.8185, "step": 1239 }, { "epoch": 0.21221975012835873, "grad_norm": 20.687843322753906, "learning_rate": 7.045065601825443e-06, "loss": 2.1643, "step": 1240 }, { "epoch": 0.21239089508813966, "grad_norm": 15.380537033081055, "learning_rate": 7.050770108385624e-06, "loss": 1.6453, "step": 1241 }, { "epoch": 0.2125620400479206, "grad_norm": 38.16814422607422, "learning_rate": 7.056474614945807e-06, "loss": 5.8775, "step": 1242 }, { "epoch": 0.21273318500770153, "grad_norm": 43.55405807495117, "learning_rate": 7.06217912150599e-06, "loss": 5.1528, "step": 1243 }, { "epoch": 0.21290432996748246, "grad_norm": 30.40400505065918, "learning_rate": 7.067883628066173e-06, "loss": 4.155, "step": 1244 }, { "epoch": 0.2130754749272634, "grad_norm": 39.55487823486328, "learning_rate": 7.073588134626355e-06, "loss": 6.8649, "step": 1245 }, { "epoch": 0.21324661988704433, "grad_norm": 46.886600494384766, "learning_rate": 7.079292641186538e-06, "loss": 4.8251, "step": 1246 }, { "epoch": 0.21341776484682526, "grad_norm": 35.842594146728516, "learning_rate": 7.0849971477467195e-06, "loss": 5.3382, "step": 1247 }, { "epoch": 0.2135889098066062, "grad_norm": 10.459444999694824, "learning_rate": 7.090701654306902e-06, "loss": 1.1781, "step": 1248 }, { "epoch": 0.21376005476638713, "grad_norm": 31.134531021118164, "learning_rate": 7.0964061608670854e-06, "loss": 3.3419, "step": 1249 }, { "epoch": 0.21393119972616806, "grad_norm": 32.50645065307617, "learning_rate": 7.102110667427268e-06, "loss": 4.1592, "step": 1250 }, { "epoch": 0.214102344685949, "grad_norm": 38.065643310546875, "learning_rate": 7.1078151739874505e-06, "loss": 6.1903, "step": 1251 }, { "epoch": 0.21427348964572993, "grad_norm": 32.13066482543945, "learning_rate": 7.113519680547633e-06, "loss": 3.8917, "step": 1252 }, { "epoch": 0.21444463460551086, "grad_norm": 22.333932876586914, "learning_rate": 7.119224187107815e-06, "loss": 3.308, "step": 1253 }, { "epoch": 0.2146157795652918, "grad_norm": 8.437789916992188, "learning_rate": 7.124928693667997e-06, "loss": 2.2375, "step": 1254 }, { "epoch": 0.21478692452507273, "grad_norm": 32.72603225708008, "learning_rate": 7.130633200228181e-06, "loss": 4.8237, "step": 1255 }, { "epoch": 0.21495806948485366, "grad_norm": 34.640647888183594, "learning_rate": 7.136337706788363e-06, "loss": 5.2757, "step": 1256 }, { "epoch": 0.2151292144446346, "grad_norm": 20.100618362426758, "learning_rate": 7.142042213348546e-06, "loss": 2.961, "step": 1257 }, { "epoch": 0.21530035940441555, "grad_norm": 43.29427719116211, "learning_rate": 7.147746719908728e-06, "loss": 8.933, "step": 1258 }, { "epoch": 0.2154715043641965, "grad_norm": 33.56546401977539, "learning_rate": 7.15345122646891e-06, "loss": 4.6558, "step": 1259 }, { "epoch": 0.21564264932397742, "grad_norm": 33.7791633605957, "learning_rate": 7.159155733029093e-06, "loss": 4.183, "step": 1260 }, { "epoch": 0.21581379428375835, "grad_norm": 33.235233306884766, "learning_rate": 7.164860239589276e-06, "loss": 3.7487, "step": 1261 }, { "epoch": 0.2159849392435393, "grad_norm": 140.30621337890625, "learning_rate": 7.170564746149458e-06, "loss": 9.0381, "step": 1262 }, { "epoch": 0.21615608420332022, "grad_norm": 20.70719337463379, "learning_rate": 7.176269252709641e-06, "loss": 1.7769, "step": 1263 }, { "epoch": 0.21632722916310115, "grad_norm": 36.93478012084961, "learning_rate": 7.181973759269823e-06, "loss": 4.5665, "step": 1264 }, { "epoch": 0.2164983741228821, "grad_norm": 81.26618957519531, "learning_rate": 7.187678265830006e-06, "loss": 7.0141, "step": 1265 }, { "epoch": 0.21666951908266302, "grad_norm": 33.15439224243164, "learning_rate": 7.1933827723901885e-06, "loss": 4.5814, "step": 1266 }, { "epoch": 0.21684066404244395, "grad_norm": 26.268171310424805, "learning_rate": 7.199087278950371e-06, "loss": 3.0891, "step": 1267 }, { "epoch": 0.2170118090022249, "grad_norm": 35.35780715942383, "learning_rate": 7.2047917855105535e-06, "loss": 4.8355, "step": 1268 }, { "epoch": 0.21718295396200582, "grad_norm": 21.87150764465332, "learning_rate": 7.210496292070736e-06, "loss": 1.7614, "step": 1269 }, { "epoch": 0.21735409892178675, "grad_norm": 36.49989318847656, "learning_rate": 7.216200798630919e-06, "loss": 5.8824, "step": 1270 }, { "epoch": 0.2175252438815677, "grad_norm": 11.613662719726562, "learning_rate": 7.221905305191101e-06, "loss": 1.7057, "step": 1271 }, { "epoch": 0.21769638884134862, "grad_norm": 28.447458267211914, "learning_rate": 7.227609811751284e-06, "loss": 4.3815, "step": 1272 }, { "epoch": 0.21786753380112955, "grad_norm": 34.95615005493164, "learning_rate": 7.233314318311466e-06, "loss": 4.7223, "step": 1273 }, { "epoch": 0.2180386787609105, "grad_norm": 36.12034606933594, "learning_rate": 7.239018824871649e-06, "loss": 5.4639, "step": 1274 }, { "epoch": 0.21820982372069142, "grad_norm": 29.200042724609375, "learning_rate": 7.244723331431831e-06, "loss": 3.9203, "step": 1275 }, { "epoch": 0.21838096868047235, "grad_norm": 173.54055786132812, "learning_rate": 7.250427837992014e-06, "loss": 9.2819, "step": 1276 }, { "epoch": 0.2185521136402533, "grad_norm": 30.67865562438965, "learning_rate": 7.256132344552197e-06, "loss": 4.7412, "step": 1277 }, { "epoch": 0.21872325860003422, "grad_norm": 35.703468322753906, "learning_rate": 7.261836851112379e-06, "loss": 5.3418, "step": 1278 }, { "epoch": 0.21889440355981515, "grad_norm": 35.29546356201172, "learning_rate": 7.267541357672561e-06, "loss": 5.1735, "step": 1279 }, { "epoch": 0.2190655485195961, "grad_norm": 20.382551193237305, "learning_rate": 7.273245864232744e-06, "loss": 1.8851, "step": 1280 }, { "epoch": 0.21923669347937702, "grad_norm": 20.68045997619629, "learning_rate": 7.2789503707929265e-06, "loss": 2.681, "step": 1281 }, { "epoch": 0.21940783843915795, "grad_norm": 37.52497482299805, "learning_rate": 7.284654877353109e-06, "loss": 5.9113, "step": 1282 }, { "epoch": 0.21957898339893892, "grad_norm": 154.6285858154297, "learning_rate": 7.290359383913292e-06, "loss": 8.0077, "step": 1283 }, { "epoch": 0.21975012835871985, "grad_norm": 28.380836486816406, "learning_rate": 7.296063890473474e-06, "loss": 3.5758, "step": 1284 }, { "epoch": 0.21992127331850078, "grad_norm": 13.987469673156738, "learning_rate": 7.301768397033657e-06, "loss": 1.4051, "step": 1285 }, { "epoch": 0.22009241827828172, "grad_norm": 21.18030548095703, "learning_rate": 7.307472903593839e-06, "loss": 3.1844, "step": 1286 }, { "epoch": 0.22026356323806265, "grad_norm": 13.61611270904541, "learning_rate": 7.313177410154022e-06, "loss": 1.4008, "step": 1287 }, { "epoch": 0.22043470819784358, "grad_norm": 32.63056182861328, "learning_rate": 7.318881916714204e-06, "loss": 5.485, "step": 1288 }, { "epoch": 0.22060585315762452, "grad_norm": 12.39704704284668, "learning_rate": 7.3245864232743876e-06, "loss": 2.8362, "step": 1289 }, { "epoch": 0.22077699811740545, "grad_norm": 160.39300537109375, "learning_rate": 7.33029092983457e-06, "loss": 9.3207, "step": 1290 }, { "epoch": 0.22094814307718638, "grad_norm": 35.63487243652344, "learning_rate": 7.335995436394752e-06, "loss": 4.3364, "step": 1291 }, { "epoch": 0.22111928803696732, "grad_norm": 18.865745544433594, "learning_rate": 7.341699942954934e-06, "loss": 1.9152, "step": 1292 }, { "epoch": 0.22129043299674825, "grad_norm": 34.95203399658203, "learning_rate": 7.347404449515117e-06, "loss": 4.2394, "step": 1293 }, { "epoch": 0.22146157795652918, "grad_norm": 32.99889373779297, "learning_rate": 7.353108956075299e-06, "loss": 5.7603, "step": 1294 }, { "epoch": 0.22163272291631012, "grad_norm": 31.541820526123047, "learning_rate": 7.358813462635483e-06, "loss": 4.7464, "step": 1295 }, { "epoch": 0.22180386787609105, "grad_norm": 22.86473274230957, "learning_rate": 7.364517969195665e-06, "loss": 3.2885, "step": 1296 }, { "epoch": 0.22197501283587198, "grad_norm": 34.75326919555664, "learning_rate": 7.370222475755847e-06, "loss": 4.4337, "step": 1297 }, { "epoch": 0.22214615779565292, "grad_norm": 33.42300796508789, "learning_rate": 7.3759269823160295e-06, "loss": 4.8641, "step": 1298 }, { "epoch": 0.22231730275543385, "grad_norm": 40.14048385620117, "learning_rate": 7.381631488876212e-06, "loss": 5.3092, "step": 1299 }, { "epoch": 0.22248844771521478, "grad_norm": 33.59206008911133, "learning_rate": 7.387335995436395e-06, "loss": 4.6114, "step": 1300 }, { "epoch": 0.22265959267499572, "grad_norm": 32.96902084350586, "learning_rate": 7.393040501996578e-06, "loss": 4.9559, "step": 1301 }, { "epoch": 0.22283073763477665, "grad_norm": 76.84076690673828, "learning_rate": 7.3987450085567605e-06, "loss": 7.2409, "step": 1302 }, { "epoch": 0.22300188259455758, "grad_norm": 29.227497100830078, "learning_rate": 7.404449515116942e-06, "loss": 3.4494, "step": 1303 }, { "epoch": 0.22317302755433852, "grad_norm": 34.10039520263672, "learning_rate": 7.410154021677125e-06, "loss": 4.6513, "step": 1304 }, { "epoch": 0.22334417251411945, "grad_norm": 43.62645721435547, "learning_rate": 7.415858528237307e-06, "loss": 6.1141, "step": 1305 }, { "epoch": 0.22351531747390038, "grad_norm": 29.59916877746582, "learning_rate": 7.421563034797491e-06, "loss": 4.5189, "step": 1306 }, { "epoch": 0.22368646243368132, "grad_norm": 32.00434494018555, "learning_rate": 7.427267541357673e-06, "loss": 3.7625, "step": 1307 }, { "epoch": 0.22385760739346225, "grad_norm": 12.214600563049316, "learning_rate": 7.432972047917856e-06, "loss": 1.6093, "step": 1308 }, { "epoch": 0.2240287523532432, "grad_norm": 13.289321899414062, "learning_rate": 7.438676554478037e-06, "loss": 1.8433, "step": 1309 }, { "epoch": 0.22419989731302414, "grad_norm": 12.391509056091309, "learning_rate": 7.44438106103822e-06, "loss": 1.8211, "step": 1310 }, { "epoch": 0.22437104227280508, "grad_norm": 31.827852249145508, "learning_rate": 7.450085567598402e-06, "loss": 3.5607, "step": 1311 }, { "epoch": 0.224542187232586, "grad_norm": 172.93185424804688, "learning_rate": 7.455790074158586e-06, "loss": 9.5445, "step": 1312 }, { "epoch": 0.22471333219236694, "grad_norm": 18.688396453857422, "learning_rate": 7.461494580718768e-06, "loss": 1.9759, "step": 1313 }, { "epoch": 0.22488447715214788, "grad_norm": 26.364185333251953, "learning_rate": 7.467199087278951e-06, "loss": 3.2682, "step": 1314 }, { "epoch": 0.2250556221119288, "grad_norm": 8.573413848876953, "learning_rate": 7.472903593839133e-06, "loss": 1.3051, "step": 1315 }, { "epoch": 0.22522676707170974, "grad_norm": 24.913686752319336, "learning_rate": 7.478608100399315e-06, "loss": 2.4598, "step": 1316 }, { "epoch": 0.22539791203149068, "grad_norm": 30.283504486083984, "learning_rate": 7.4843126069594984e-06, "loss": 4.1503, "step": 1317 }, { "epoch": 0.2255690569912716, "grad_norm": 18.146724700927734, "learning_rate": 7.490017113519681e-06, "loss": 1.8957, "step": 1318 }, { "epoch": 0.22574020195105254, "grad_norm": 11.016623497009277, "learning_rate": 7.4957216200798635e-06, "loss": 2.636, "step": 1319 }, { "epoch": 0.22591134691083348, "grad_norm": 35.766883850097656, "learning_rate": 7.501426126640046e-06, "loss": 4.3588, "step": 1320 }, { "epoch": 0.2260824918706144, "grad_norm": 24.76753807067871, "learning_rate": 7.5071306332002286e-06, "loss": 3.2106, "step": 1321 }, { "epoch": 0.22625363683039534, "grad_norm": 35.969505310058594, "learning_rate": 7.51283513976041e-06, "loss": 4.5488, "step": 1322 }, { "epoch": 0.22642478179017628, "grad_norm": 13.215656280517578, "learning_rate": 7.518539646320593e-06, "loss": 1.7273, "step": 1323 }, { "epoch": 0.2265959267499572, "grad_norm": 32.75537872314453, "learning_rate": 7.524244152880775e-06, "loss": 4.442, "step": 1324 }, { "epoch": 0.22676707170973814, "grad_norm": 13.069498062133789, "learning_rate": 7.529948659440958e-06, "loss": 1.2864, "step": 1325 }, { "epoch": 0.22693821666951908, "grad_norm": 29.5541934967041, "learning_rate": 7.535653166001142e-06, "loss": 3.5993, "step": 1326 }, { "epoch": 0.2271093616293, "grad_norm": 36.506736755371094, "learning_rate": 7.541357672561325e-06, "loss": 4.7108, "step": 1327 }, { "epoch": 0.22728050658908094, "grad_norm": 30.510953903198242, "learning_rate": 7.547062179121506e-06, "loss": 4.168, "step": 1328 }, { "epoch": 0.22745165154886188, "grad_norm": 11.754740715026855, "learning_rate": 7.552766685681689e-06, "loss": 2.7865, "step": 1329 }, { "epoch": 0.2276227965086428, "grad_norm": 31.793643951416016, "learning_rate": 7.558471192241871e-06, "loss": 3.4931, "step": 1330 }, { "epoch": 0.22779394146842374, "grad_norm": 23.95293426513672, "learning_rate": 7.564175698802054e-06, "loss": 3.0252, "step": 1331 }, { "epoch": 0.22796508642820468, "grad_norm": 28.809511184692383, "learning_rate": 7.569880205362236e-06, "loss": 4.2144, "step": 1332 }, { "epoch": 0.2281362313879856, "grad_norm": 34.645267486572266, "learning_rate": 7.575584711922419e-06, "loss": 4.5155, "step": 1333 }, { "epoch": 0.22830737634776657, "grad_norm": 31.90658950805664, "learning_rate": 7.581289218482601e-06, "loss": 3.6445, "step": 1334 }, { "epoch": 0.2284785213075475, "grad_norm": 26.37479591369629, "learning_rate": 7.586993725042783e-06, "loss": 2.6728, "step": 1335 }, { "epoch": 0.22864966626732844, "grad_norm": 29.64954376220703, "learning_rate": 7.592698231602966e-06, "loss": 4.0421, "step": 1336 }, { "epoch": 0.22882081122710937, "grad_norm": 28.596891403198242, "learning_rate": 7.59840273816315e-06, "loss": 3.3059, "step": 1337 }, { "epoch": 0.2289919561868903, "grad_norm": 36.07052993774414, "learning_rate": 7.6041072447233325e-06, "loss": 4.2618, "step": 1338 }, { "epoch": 0.22916310114667124, "grad_norm": 50.589454650878906, "learning_rate": 7.609811751283515e-06, "loss": 9.3326, "step": 1339 }, { "epoch": 0.22933424610645217, "grad_norm": 31.4276180267334, "learning_rate": 7.6155162578436975e-06, "loss": 4.6035, "step": 1340 }, { "epoch": 0.2295053910662331, "grad_norm": 32.5452766418457, "learning_rate": 7.621220764403879e-06, "loss": 3.9264, "step": 1341 }, { "epoch": 0.22967653602601404, "grad_norm": 32.74778747558594, "learning_rate": 7.626925270964062e-06, "loss": 4.6618, "step": 1342 }, { "epoch": 0.22984768098579497, "grad_norm": 11.447990417480469, "learning_rate": 7.632629777524244e-06, "loss": 1.2467, "step": 1343 }, { "epoch": 0.2300188259455759, "grad_norm": 19.261301040649414, "learning_rate": 7.638334284084426e-06, "loss": 1.4108, "step": 1344 }, { "epoch": 0.23018997090535684, "grad_norm": 17.838138580322266, "learning_rate": 7.64403879064461e-06, "loss": 1.4796, "step": 1345 }, { "epoch": 0.23036111586513777, "grad_norm": 36.09761047363281, "learning_rate": 7.649743297204791e-06, "loss": 4.8769, "step": 1346 }, { "epoch": 0.2305322608249187, "grad_norm": 17.18463706970215, "learning_rate": 7.655447803764974e-06, "loss": 1.6009, "step": 1347 }, { "epoch": 0.23070340578469964, "grad_norm": 20.603784561157227, "learning_rate": 7.661152310325156e-06, "loss": 3.0856, "step": 1348 }, { "epoch": 0.23087455074448057, "grad_norm": 41.716590881347656, "learning_rate": 7.666856816885341e-06, "loss": 5.4047, "step": 1349 }, { "epoch": 0.2310456957042615, "grad_norm": 181.26748657226562, "learning_rate": 7.672561323445523e-06, "loss": 8.5903, "step": 1350 }, { "epoch": 0.23121684066404244, "grad_norm": 41.98673629760742, "learning_rate": 7.678265830005705e-06, "loss": 5.2161, "step": 1351 }, { "epoch": 0.23138798562382337, "grad_norm": 35.29446792602539, "learning_rate": 7.683970336565888e-06, "loss": 4.2135, "step": 1352 }, { "epoch": 0.2315591305836043, "grad_norm": 164.35977172851562, "learning_rate": 7.68967484312607e-06, "loss": 7.3687, "step": 1353 }, { "epoch": 0.23173027554338524, "grad_norm": 20.39377784729004, "learning_rate": 7.695379349686253e-06, "loss": 1.6669, "step": 1354 }, { "epoch": 0.23190142050316617, "grad_norm": 33.71407699584961, "learning_rate": 7.701083856246435e-06, "loss": 4.5662, "step": 1355 }, { "epoch": 0.2320725654629471, "grad_norm": 9.964597702026367, "learning_rate": 7.706788362806616e-06, "loss": 2.2199, "step": 1356 }, { "epoch": 0.23224371042272804, "grad_norm": 41.83567810058594, "learning_rate": 7.7124928693668e-06, "loss": 5.3555, "step": 1357 }, { "epoch": 0.23241485538250897, "grad_norm": 19.700429916381836, "learning_rate": 7.718197375926981e-06, "loss": 1.6864, "step": 1358 }, { "epoch": 0.2325860003422899, "grad_norm": 32.94630432128906, "learning_rate": 7.723901882487165e-06, "loss": 3.5872, "step": 1359 }, { "epoch": 0.23275714530207087, "grad_norm": 26.41133689880371, "learning_rate": 7.729606389047348e-06, "loss": 3.5806, "step": 1360 }, { "epoch": 0.2329282902618518, "grad_norm": 17.184593200683594, "learning_rate": 7.735310895607532e-06, "loss": 1.6527, "step": 1361 }, { "epoch": 0.23309943522163273, "grad_norm": 11.024751663208008, "learning_rate": 7.741015402167713e-06, "loss": 1.2203, "step": 1362 }, { "epoch": 0.23327058018141367, "grad_norm": 35.2708625793457, "learning_rate": 7.746719908727895e-06, "loss": 4.5728, "step": 1363 }, { "epoch": 0.2334417251411946, "grad_norm": 35.836387634277344, "learning_rate": 7.752424415288078e-06, "loss": 4.9165, "step": 1364 }, { "epoch": 0.23361287010097553, "grad_norm": 24.741012573242188, "learning_rate": 7.75812892184826e-06, "loss": 2.2991, "step": 1365 }, { "epoch": 0.23378401506075647, "grad_norm": 41.604007720947266, "learning_rate": 7.763833428408443e-06, "loss": 4.7384, "step": 1366 }, { "epoch": 0.2339551600205374, "grad_norm": 37.068485260009766, "learning_rate": 7.769537934968625e-06, "loss": 4.1609, "step": 1367 }, { "epoch": 0.23412630498031833, "grad_norm": 31.635995864868164, "learning_rate": 7.775242441528808e-06, "loss": 3.6394, "step": 1368 }, { "epoch": 0.23429744994009927, "grad_norm": 36.181602478027344, "learning_rate": 7.78094694808899e-06, "loss": 3.9604, "step": 1369 }, { "epoch": 0.2344685948998802, "grad_norm": 34.47708511352539, "learning_rate": 7.786651454649172e-06, "loss": 4.4621, "step": 1370 }, { "epoch": 0.23463973985966113, "grad_norm": 36.583919525146484, "learning_rate": 7.792355961209355e-06, "loss": 5.4214, "step": 1371 }, { "epoch": 0.23481088481944207, "grad_norm": 139.80113220214844, "learning_rate": 7.798060467769539e-06, "loss": 7.582, "step": 1372 }, { "epoch": 0.234982029779223, "grad_norm": 10.627038955688477, "learning_rate": 7.803764974329722e-06, "loss": 1.1265, "step": 1373 }, { "epoch": 0.23515317473900393, "grad_norm": 56.01224899291992, "learning_rate": 7.809469480889904e-06, "loss": 9.2401, "step": 1374 }, { "epoch": 0.23532431969878487, "grad_norm": 13.42536449432373, "learning_rate": 7.815173987450085e-06, "loss": 1.3022, "step": 1375 }, { "epoch": 0.2354954646585658, "grad_norm": 34.816341400146484, "learning_rate": 7.820878494010269e-06, "loss": 4.6249, "step": 1376 }, { "epoch": 0.23566660961834673, "grad_norm": 13.037670135498047, "learning_rate": 7.82658300057045e-06, "loss": 1.5747, "step": 1377 }, { "epoch": 0.23583775457812767, "grad_norm": 38.446537017822266, "learning_rate": 7.832287507130634e-06, "loss": 4.9983, "step": 1378 }, { "epoch": 0.2360088995379086, "grad_norm": 32.81908416748047, "learning_rate": 7.837992013690815e-06, "loss": 3.4363, "step": 1379 }, { "epoch": 0.23618004449768953, "grad_norm": 12.17697525024414, "learning_rate": 7.843696520250999e-06, "loss": 1.6211, "step": 1380 }, { "epoch": 0.23635118945747047, "grad_norm": 35.46131896972656, "learning_rate": 7.84940102681118e-06, "loss": 4.8981, "step": 1381 }, { "epoch": 0.2365223344172514, "grad_norm": 29.793787002563477, "learning_rate": 7.855105533371362e-06, "loss": 3.5648, "step": 1382 }, { "epoch": 0.23669347937703233, "grad_norm": 14.550475120544434, "learning_rate": 7.860810039931547e-06, "loss": 1.6714, "step": 1383 }, { "epoch": 0.23686462433681327, "grad_norm": 36.01753234863281, "learning_rate": 7.866514546491729e-06, "loss": 4.936, "step": 1384 }, { "epoch": 0.23703576929659423, "grad_norm": 21.261749267578125, "learning_rate": 7.872219053051912e-06, "loss": 2.3239, "step": 1385 }, { "epoch": 0.23720691425637516, "grad_norm": 160.96620178222656, "learning_rate": 7.877923559612094e-06, "loss": 7.9267, "step": 1386 }, { "epoch": 0.2373780592161561, "grad_norm": 34.994293212890625, "learning_rate": 7.883628066172276e-06, "loss": 4.6021, "step": 1387 }, { "epoch": 0.23754920417593703, "grad_norm": 32.08713912963867, "learning_rate": 7.889332572732459e-06, "loss": 4.0803, "step": 1388 }, { "epoch": 0.23772034913571796, "grad_norm": 36.49545669555664, "learning_rate": 7.89503707929264e-06, "loss": 4.4858, "step": 1389 }, { "epoch": 0.2378914940954989, "grad_norm": 146.2379608154297, "learning_rate": 7.900741585852824e-06, "loss": 8.1082, "step": 1390 }, { "epoch": 0.23806263905527983, "grad_norm": 31.705169677734375, "learning_rate": 7.906446092413006e-06, "loss": 4.1572, "step": 1391 }, { "epoch": 0.23823378401506076, "grad_norm": 13.439140319824219, "learning_rate": 7.91215059897319e-06, "loss": 1.1091, "step": 1392 }, { "epoch": 0.2384049289748417, "grad_norm": 51.37181854248047, "learning_rate": 7.91785510553337e-06, "loss": 9.8544, "step": 1393 }, { "epoch": 0.23857607393462263, "grad_norm": 16.763200759887695, "learning_rate": 7.923559612093553e-06, "loss": 1.4605, "step": 1394 }, { "epoch": 0.23874721889440356, "grad_norm": 32.19613265991211, "learning_rate": 7.929264118653738e-06, "loss": 4.0605, "step": 1395 }, { "epoch": 0.2389183638541845, "grad_norm": 36.1611442565918, "learning_rate": 7.93496862521392e-06, "loss": 4.1027, "step": 1396 }, { "epoch": 0.23908950881396543, "grad_norm": 36.234344482421875, "learning_rate": 7.940673131774103e-06, "loss": 5.0933, "step": 1397 }, { "epoch": 0.23926065377374636, "grad_norm": 39.589111328125, "learning_rate": 7.946377638334284e-06, "loss": 5.4176, "step": 1398 }, { "epoch": 0.2394317987335273, "grad_norm": 13.162062644958496, "learning_rate": 7.952082144894468e-06, "loss": 1.3262, "step": 1399 }, { "epoch": 0.23960294369330823, "grad_norm": 11.512036323547363, "learning_rate": 7.95778665145465e-06, "loss": 2.8916, "step": 1400 }, { "epoch": 0.23977408865308916, "grad_norm": 30.82523536682129, "learning_rate": 7.963491158014831e-06, "loss": 3.7983, "step": 1401 }, { "epoch": 0.2399452336128701, "grad_norm": 9.881488800048828, "learning_rate": 7.969195664575014e-06, "loss": 1.6009, "step": 1402 }, { "epoch": 0.24011637857265103, "grad_norm": 26.221534729003906, "learning_rate": 7.974900171135196e-06, "loss": 3.2459, "step": 1403 }, { "epoch": 0.24028752353243196, "grad_norm": 34.7869987487793, "learning_rate": 7.98060467769538e-06, "loss": 4.2736, "step": 1404 }, { "epoch": 0.2404586684922129, "grad_norm": 42.81889343261719, "learning_rate": 7.986309184255561e-06, "loss": 6.0254, "step": 1405 }, { "epoch": 0.24062981345199383, "grad_norm": 35.25808334350586, "learning_rate": 7.992013690815745e-06, "loss": 3.8331, "step": 1406 }, { "epoch": 0.24080095841177476, "grad_norm": 29.81654167175293, "learning_rate": 7.997718197375928e-06, "loss": 3.3841, "step": 1407 }, { "epoch": 0.2409721033715557, "grad_norm": 34.251243591308594, "learning_rate": 8.00342270393611e-06, "loss": 4.8157, "step": 1408 }, { "epoch": 0.24114324833133663, "grad_norm": 31.04636573791504, "learning_rate": 8.009127210496293e-06, "loss": 3.4431, "step": 1409 }, { "epoch": 0.2413143932911176, "grad_norm": 33.0612678527832, "learning_rate": 8.014831717056475e-06, "loss": 3.8054, "step": 1410 }, { "epoch": 0.24148553825089852, "grad_norm": 25.215789794921875, "learning_rate": 8.020536223616658e-06, "loss": 3.2052, "step": 1411 }, { "epoch": 0.24165668321067946, "grad_norm": 22.657257080078125, "learning_rate": 8.02624073017684e-06, "loss": 2.5621, "step": 1412 }, { "epoch": 0.2418278281704604, "grad_norm": 32.54667282104492, "learning_rate": 8.031945236737021e-06, "loss": 4.1257, "step": 1413 }, { "epoch": 0.24199897313024132, "grad_norm": 14.109042167663574, "learning_rate": 8.037649743297205e-06, "loss": 1.2616, "step": 1414 }, { "epoch": 0.24217011809002226, "grad_norm": 35.718116760253906, "learning_rate": 8.043354249857387e-06, "loss": 5.263, "step": 1415 }, { "epoch": 0.2423412630498032, "grad_norm": 10.830004692077637, "learning_rate": 8.04905875641757e-06, "loss": 1.6628, "step": 1416 }, { "epoch": 0.24251240800958412, "grad_norm": 21.519893646240234, "learning_rate": 8.054763262977753e-06, "loss": 2.2681, "step": 1417 }, { "epoch": 0.24268355296936506, "grad_norm": 16.527233123779297, "learning_rate": 8.060467769537937e-06, "loss": 1.7274, "step": 1418 }, { "epoch": 0.242854697929146, "grad_norm": 17.97334098815918, "learning_rate": 8.066172276098118e-06, "loss": 1.4341, "step": 1419 }, { "epoch": 0.24302584288892692, "grad_norm": 38.63325500488281, "learning_rate": 8.0718767826583e-06, "loss": 5.4521, "step": 1420 }, { "epoch": 0.24319698784870786, "grad_norm": 37.572818756103516, "learning_rate": 8.077581289218483e-06, "loss": 4.057, "step": 1421 }, { "epoch": 0.2433681328084888, "grad_norm": 36.495025634765625, "learning_rate": 8.083285795778665e-06, "loss": 5.3841, "step": 1422 }, { "epoch": 0.24353927776826972, "grad_norm": 46.322486877441406, "learning_rate": 8.088990302338848e-06, "loss": 9.2447, "step": 1423 }, { "epoch": 0.24371042272805066, "grad_norm": 32.26517868041992, "learning_rate": 8.09469480889903e-06, "loss": 3.4902, "step": 1424 }, { "epoch": 0.2438815676878316, "grad_norm": 29.286020278930664, "learning_rate": 8.100399315459212e-06, "loss": 3.6562, "step": 1425 }, { "epoch": 0.24405271264761252, "grad_norm": 9.768603324890137, "learning_rate": 8.106103822019395e-06, "loss": 1.0808, "step": 1426 }, { "epoch": 0.24422385760739346, "grad_norm": 40.53557205200195, "learning_rate": 8.111808328579577e-06, "loss": 5.3038, "step": 1427 }, { "epoch": 0.2443950025671744, "grad_norm": 36.29978561401367, "learning_rate": 8.11751283513976e-06, "loss": 4.9487, "step": 1428 }, { "epoch": 0.24456614752695532, "grad_norm": 50.365440368652344, "learning_rate": 8.123217341699944e-06, "loss": 9.1753, "step": 1429 }, { "epoch": 0.24473729248673626, "grad_norm": 25.204608917236328, "learning_rate": 8.128921848260127e-06, "loss": 3.06, "step": 1430 }, { "epoch": 0.2449084374465172, "grad_norm": 36.821929931640625, "learning_rate": 8.134626354820309e-06, "loss": 4.2367, "step": 1431 }, { "epoch": 0.24507958240629812, "grad_norm": 9.532563209533691, "learning_rate": 8.14033086138049e-06, "loss": 1.0511, "step": 1432 }, { "epoch": 0.24525072736607906, "grad_norm": 31.35403060913086, "learning_rate": 8.146035367940674e-06, "loss": 4.1655, "step": 1433 }, { "epoch": 0.24542187232586, "grad_norm": 29.057531356811523, "learning_rate": 8.151739874500855e-06, "loss": 3.6622, "step": 1434 }, { "epoch": 0.24559301728564092, "grad_norm": 18.69387435913086, "learning_rate": 8.157444381061039e-06, "loss": 1.6006, "step": 1435 }, { "epoch": 0.24576416224542189, "grad_norm": 27.337491989135742, "learning_rate": 8.16314888762122e-06, "loss": 2.1133, "step": 1436 }, { "epoch": 0.24593530720520282, "grad_norm": 59.810035705566406, "learning_rate": 8.168853394181404e-06, "loss": 9.2893, "step": 1437 }, { "epoch": 0.24610645216498375, "grad_norm": 34.85076141357422, "learning_rate": 8.174557900741586e-06, "loss": 4.76, "step": 1438 }, { "epoch": 0.24627759712476469, "grad_norm": 16.229951858520508, "learning_rate": 8.180262407301767e-06, "loss": 1.111, "step": 1439 }, { "epoch": 0.24644874208454562, "grad_norm": 191.14859008789062, "learning_rate": 8.185966913861952e-06, "loss": 8.6606, "step": 1440 }, { "epoch": 0.24661988704432655, "grad_norm": 25.192026138305664, "learning_rate": 8.191671420422134e-06, "loss": 2.2213, "step": 1441 }, { "epoch": 0.24679103200410749, "grad_norm": 16.577152252197266, "learning_rate": 8.197375926982317e-06, "loss": 1.4564, "step": 1442 }, { "epoch": 0.24696217696388842, "grad_norm": 37.47216796875, "learning_rate": 8.203080433542499e-06, "loss": 4.9652, "step": 1443 }, { "epoch": 0.24713332192366935, "grad_norm": 33.50614547729492, "learning_rate": 8.20878494010268e-06, "loss": 3.8217, "step": 1444 }, { "epoch": 0.24730446688345029, "grad_norm": 35.54981994628906, "learning_rate": 8.214489446662864e-06, "loss": 5.0781, "step": 1445 }, { "epoch": 0.24747561184323122, "grad_norm": 29.486570358276367, "learning_rate": 8.220193953223046e-06, "loss": 3.4324, "step": 1446 }, { "epoch": 0.24764675680301215, "grad_norm": 23.952808380126953, "learning_rate": 8.22589845978323e-06, "loss": 2.9791, "step": 1447 }, { "epoch": 0.24781790176279309, "grad_norm": 22.885963439941406, "learning_rate": 8.231602966343411e-06, "loss": 2.1029, "step": 1448 }, { "epoch": 0.24798904672257402, "grad_norm": 38.23826217651367, "learning_rate": 8.237307472903594e-06, "loss": 5.1107, "step": 1449 }, { "epoch": 0.24816019168235495, "grad_norm": 21.183773040771484, "learning_rate": 8.243011979463776e-06, "loss": 2.6462, "step": 1450 }, { "epoch": 0.24833133664213589, "grad_norm": 11.436287879943848, "learning_rate": 8.248716486023958e-06, "loss": 1.139, "step": 1451 }, { "epoch": 0.24850248160191682, "grad_norm": 21.1058349609375, "learning_rate": 8.254420992584143e-06, "loss": 2.6237, "step": 1452 }, { "epoch": 0.24867362656169775, "grad_norm": 29.661510467529297, "learning_rate": 8.260125499144324e-06, "loss": 3.9416, "step": 1453 }, { "epoch": 0.24884477152147869, "grad_norm": 25.654918670654297, "learning_rate": 8.265830005704508e-06, "loss": 2.9109, "step": 1454 }, { "epoch": 0.24901591648125962, "grad_norm": 29.254196166992188, "learning_rate": 8.27153451226469e-06, "loss": 3.9703, "step": 1455 }, { "epoch": 0.24918706144104055, "grad_norm": 15.34985065460205, "learning_rate": 8.277239018824871e-06, "loss": 1.277, "step": 1456 }, { "epoch": 0.24935820640082149, "grad_norm": 20.940813064575195, "learning_rate": 8.282943525385055e-06, "loss": 2.8225, "step": 1457 }, { "epoch": 0.24952935136060242, "grad_norm": 156.33163452148438, "learning_rate": 8.288648031945236e-06, "loss": 6.8667, "step": 1458 }, { "epoch": 0.24970049632038335, "grad_norm": 142.04833984375, "learning_rate": 8.29435253850542e-06, "loss": 7.7845, "step": 1459 }, { "epoch": 0.24987164128016429, "grad_norm": 52.80269241333008, "learning_rate": 8.300057045065601e-06, "loss": 9.2945, "step": 1460 }, { "epoch": 0.25004278623994525, "grad_norm": 36.25229263305664, "learning_rate": 8.305761551625785e-06, "loss": 4.1385, "step": 1461 }, { "epoch": 0.2502139311997262, "grad_norm": 32.63280487060547, "learning_rate": 8.311466058185966e-06, "loss": 4.9526, "step": 1462 }, { "epoch": 0.2503850761595071, "grad_norm": 36.09181213378906, "learning_rate": 8.31717056474615e-06, "loss": 4.9655, "step": 1463 }, { "epoch": 0.25055622111928805, "grad_norm": 13.666475296020508, "learning_rate": 8.322875071306333e-06, "loss": 1.2171, "step": 1464 }, { "epoch": 0.250727366079069, "grad_norm": 21.431262969970703, "learning_rate": 8.328579577866515e-06, "loss": 2.0253, "step": 1465 }, { "epoch": 0.2508985110388499, "grad_norm": 34.866493225097656, "learning_rate": 8.334284084426698e-06, "loss": 4.7963, "step": 1466 }, { "epoch": 0.25106965599863085, "grad_norm": 28.299697875976562, "learning_rate": 8.33998859098688e-06, "loss": 3.2393, "step": 1467 }, { "epoch": 0.2512408009584118, "grad_norm": 30.702220916748047, "learning_rate": 8.345693097547063e-06, "loss": 4.459, "step": 1468 }, { "epoch": 0.2514119459181927, "grad_norm": 35.572662353515625, "learning_rate": 8.351397604107245e-06, "loss": 4.0362, "step": 1469 }, { "epoch": 0.25158309087797365, "grad_norm": 31.228361129760742, "learning_rate": 8.357102110667427e-06, "loss": 3.7291, "step": 1470 }, { "epoch": 0.2517542358377546, "grad_norm": 158.43309020996094, "learning_rate": 8.36280661722761e-06, "loss": 7.5395, "step": 1471 }, { "epoch": 0.2519253807975355, "grad_norm": 26.111873626708984, "learning_rate": 8.368511123787792e-06, "loss": 3.2816, "step": 1472 }, { "epoch": 0.25209652575731645, "grad_norm": 152.1773681640625, "learning_rate": 8.374215630347975e-06, "loss": 9.2757, "step": 1473 }, { "epoch": 0.2522676707170974, "grad_norm": 28.91309928894043, "learning_rate": 8.379920136908157e-06, "loss": 3.8, "step": 1474 }, { "epoch": 0.2524388156768783, "grad_norm": 138.71820068359375, "learning_rate": 8.38562464346834e-06, "loss": 8.3701, "step": 1475 }, { "epoch": 0.25260996063665925, "grad_norm": 10.94738483428955, "learning_rate": 8.391329150028524e-06, "loss": 1.0987, "step": 1476 }, { "epoch": 0.2527811055964402, "grad_norm": 33.45675277709961, "learning_rate": 8.397033656588705e-06, "loss": 3.8679, "step": 1477 }, { "epoch": 0.2529522505562211, "grad_norm": 30.219728469848633, "learning_rate": 8.402738163148889e-06, "loss": 3.7668, "step": 1478 }, { "epoch": 0.25312339551600205, "grad_norm": 153.4755859375, "learning_rate": 8.40844266970907e-06, "loss": 8.493, "step": 1479 }, { "epoch": 0.253294540475783, "grad_norm": 27.030277252197266, "learning_rate": 8.414147176269254e-06, "loss": 3.6373, "step": 1480 }, { "epoch": 0.2534656854355639, "grad_norm": 26.931581497192383, "learning_rate": 8.419851682829435e-06, "loss": 2.4114, "step": 1481 }, { "epoch": 0.25363683039534485, "grad_norm": 33.86345672607422, "learning_rate": 8.425556189389617e-06, "loss": 4.18, "step": 1482 }, { "epoch": 0.2538079753551258, "grad_norm": 40.67789840698242, "learning_rate": 8.4312606959498e-06, "loss": 5.2501, "step": 1483 }, { "epoch": 0.2539791203149067, "grad_norm": 11.627734184265137, "learning_rate": 8.436965202509982e-06, "loss": 1.2352, "step": 1484 }, { "epoch": 0.25415026527468765, "grad_norm": 27.1390438079834, "learning_rate": 8.442669709070165e-06, "loss": 2.4447, "step": 1485 }, { "epoch": 0.2543214102344686, "grad_norm": 33.907615661621094, "learning_rate": 8.448374215630349e-06, "loss": 5.7028, "step": 1486 }, { "epoch": 0.2544925551942495, "grad_norm": 34.770687103271484, "learning_rate": 8.45407872219053e-06, "loss": 5.4022, "step": 1487 }, { "epoch": 0.25466370015403045, "grad_norm": 87.67970275878906, "learning_rate": 8.459783228750714e-06, "loss": 7.2429, "step": 1488 }, { "epoch": 0.2548348451138114, "grad_norm": 36.1263313293457, "learning_rate": 8.465487735310896e-06, "loss": 4.7788, "step": 1489 }, { "epoch": 0.2550059900735923, "grad_norm": 35.22165298461914, "learning_rate": 8.471192241871079e-06, "loss": 4.132, "step": 1490 }, { "epoch": 0.25517713503337325, "grad_norm": 28.420682907104492, "learning_rate": 8.47689674843126e-06, "loss": 3.6288, "step": 1491 }, { "epoch": 0.2553482799931542, "grad_norm": 36.37025451660156, "learning_rate": 8.482601254991444e-06, "loss": 5.1911, "step": 1492 }, { "epoch": 0.2555194249529351, "grad_norm": 40.647789001464844, "learning_rate": 8.488305761551626e-06, "loss": 5.5946, "step": 1493 }, { "epoch": 0.25569056991271605, "grad_norm": 19.504039764404297, "learning_rate": 8.494010268111807e-06, "loss": 1.7075, "step": 1494 }, { "epoch": 0.255861714872497, "grad_norm": 32.866695404052734, "learning_rate": 8.49971477467199e-06, "loss": 4.4763, "step": 1495 }, { "epoch": 0.2560328598322779, "grad_norm": 33.1104736328125, "learning_rate": 8.505419281232172e-06, "loss": 4.4053, "step": 1496 }, { "epoch": 0.25620400479205885, "grad_norm": 22.860944747924805, "learning_rate": 8.511123787792358e-06, "loss": 2.5604, "step": 1497 }, { "epoch": 0.2563751497518398, "grad_norm": 34.79046630859375, "learning_rate": 8.51682829435254e-06, "loss": 4.993, "step": 1498 }, { "epoch": 0.25654629471162077, "grad_norm": 28.405912399291992, "learning_rate": 8.522532800912723e-06, "loss": 3.3138, "step": 1499 }, { "epoch": 0.2567174396714017, "grad_norm": 32.89986038208008, "learning_rate": 8.528237307472904e-06, "loss": 3.1908, "step": 1500 }, { "epoch": 0.25688858463118264, "grad_norm": 20.201610565185547, "learning_rate": 8.533941814033086e-06, "loss": 1.974, "step": 1501 }, { "epoch": 0.25705972959096357, "grad_norm": 32.933231353759766, "learning_rate": 8.53964632059327e-06, "loss": 4.8342, "step": 1502 }, { "epoch": 0.2572308745507445, "grad_norm": 25.67669105529785, "learning_rate": 8.545350827153451e-06, "loss": 2.8345, "step": 1503 }, { "epoch": 0.25740201951052544, "grad_norm": 50.461097717285156, "learning_rate": 8.551055333713634e-06, "loss": 6.9385, "step": 1504 }, { "epoch": 0.25757316447030637, "grad_norm": 32.42000198364258, "learning_rate": 8.556759840273816e-06, "loss": 3.4542, "step": 1505 }, { "epoch": 0.2577443094300873, "grad_norm": 29.946523666381836, "learning_rate": 8.562464346833998e-06, "loss": 3.2486, "step": 1506 }, { "epoch": 0.25791545438986824, "grad_norm": 17.451496124267578, "learning_rate": 8.568168853394181e-06, "loss": 1.4946, "step": 1507 }, { "epoch": 0.25808659934964917, "grad_norm": 30.164350509643555, "learning_rate": 8.573873359954363e-06, "loss": 3.8272, "step": 1508 }, { "epoch": 0.2582577443094301, "grad_norm": 26.747682571411133, "learning_rate": 8.579577866514548e-06, "loss": 3.0653, "step": 1509 }, { "epoch": 0.25842888926921104, "grad_norm": 20.9317626953125, "learning_rate": 8.58528237307473e-06, "loss": 1.8431, "step": 1510 }, { "epoch": 0.25860003422899197, "grad_norm": 36.90618896484375, "learning_rate": 8.590986879634913e-06, "loss": 3.7371, "step": 1511 }, { "epoch": 0.2587711791887729, "grad_norm": 19.612281799316406, "learning_rate": 8.596691386195095e-06, "loss": 1.4799, "step": 1512 }, { "epoch": 0.25894232414855384, "grad_norm": 35.63535690307617, "learning_rate": 8.602395892755276e-06, "loss": 4.2458, "step": 1513 }, { "epoch": 0.25911346910833477, "grad_norm": 37.25559997558594, "learning_rate": 8.60810039931546e-06, "loss": 3.7735, "step": 1514 }, { "epoch": 0.2592846140681157, "grad_norm": 26.81685447692871, "learning_rate": 8.613804905875641e-06, "loss": 2.621, "step": 1515 }, { "epoch": 0.25945575902789664, "grad_norm": 22.918485641479492, "learning_rate": 8.619509412435825e-06, "loss": 1.6105, "step": 1516 }, { "epoch": 0.25962690398767757, "grad_norm": 12.06033992767334, "learning_rate": 8.625213918996006e-06, "loss": 1.1731, "step": 1517 }, { "epoch": 0.2597980489474585, "grad_norm": 35.15945053100586, "learning_rate": 8.63091842555619e-06, "loss": 3.8198, "step": 1518 }, { "epoch": 0.25996919390723944, "grad_norm": 13.90102767944336, "learning_rate": 8.636622932116372e-06, "loss": 2.736, "step": 1519 }, { "epoch": 0.26014033886702037, "grad_norm": 35.0964469909668, "learning_rate": 8.642327438676555e-06, "loss": 4.3737, "step": 1520 }, { "epoch": 0.2603114838268013, "grad_norm": 33.16070556640625, "learning_rate": 8.648031945236738e-06, "loss": 3.8065, "step": 1521 }, { "epoch": 0.26048262878658224, "grad_norm": 16.28618621826172, "learning_rate": 8.65373645179692e-06, "loss": 1.257, "step": 1522 }, { "epoch": 0.26065377374636317, "grad_norm": 28.174516677856445, "learning_rate": 8.659440958357103e-06, "loss": 3.7114, "step": 1523 }, { "epoch": 0.2608249187061441, "grad_norm": 26.44544792175293, "learning_rate": 8.665145464917285e-06, "loss": 2.829, "step": 1524 }, { "epoch": 0.26099606366592504, "grad_norm": 38.186378479003906, "learning_rate": 8.670849971477467e-06, "loss": 4.3011, "step": 1525 }, { "epoch": 0.26116720862570597, "grad_norm": 206.24801635742188, "learning_rate": 8.67655447803765e-06, "loss": 9.2851, "step": 1526 }, { "epoch": 0.2613383535854869, "grad_norm": 33.12008285522461, "learning_rate": 8.682258984597832e-06, "loss": 4.3036, "step": 1527 }, { "epoch": 0.26150949854526784, "grad_norm": 136.57029724121094, "learning_rate": 8.687963491158015e-06, "loss": 8.4189, "step": 1528 }, { "epoch": 0.26168064350504877, "grad_norm": 40.36309051513672, "learning_rate": 8.693667997718197e-06, "loss": 5.4948, "step": 1529 }, { "epoch": 0.2618517884648297, "grad_norm": 19.74286651611328, "learning_rate": 8.69937250427838e-06, "loss": 2.0893, "step": 1530 }, { "epoch": 0.26202293342461064, "grad_norm": 33.62118148803711, "learning_rate": 8.705077010838562e-06, "loss": 3.796, "step": 1531 }, { "epoch": 0.26219407838439157, "grad_norm": 36.64006805419922, "learning_rate": 8.710781517398745e-06, "loss": 3.9848, "step": 1532 }, { "epoch": 0.2623652233441725, "grad_norm": 12.980084419250488, "learning_rate": 8.716486023958929e-06, "loss": 1.1166, "step": 1533 }, { "epoch": 0.26253636830395344, "grad_norm": 35.808021545410156, "learning_rate": 8.72219053051911e-06, "loss": 4.3018, "step": 1534 }, { "epoch": 0.26270751326373437, "grad_norm": 51.2911491394043, "learning_rate": 8.727895037079294e-06, "loss": 9.237, "step": 1535 }, { "epoch": 0.2628786582235153, "grad_norm": 26.75223731994629, "learning_rate": 8.733599543639475e-06, "loss": 3.3625, "step": 1536 }, { "epoch": 0.26304980318329624, "grad_norm": 81.07520294189453, "learning_rate": 8.739304050199659e-06, "loss": 7.5686, "step": 1537 }, { "epoch": 0.26322094814307717, "grad_norm": 37.027191162109375, "learning_rate": 8.74500855675984e-06, "loss": 3.7701, "step": 1538 }, { "epoch": 0.2633920931028581, "grad_norm": 47.393333435058594, "learning_rate": 8.750713063320022e-06, "loss": 9.0139, "step": 1539 }, { "epoch": 0.26356323806263904, "grad_norm": 34.1210823059082, "learning_rate": 8.756417569880206e-06, "loss": 4.4995, "step": 1540 }, { "epoch": 0.26373438302241997, "grad_norm": 14.312548637390137, "learning_rate": 8.762122076440387e-06, "loss": 2.1827, "step": 1541 }, { "epoch": 0.2639055279822009, "grad_norm": 30.19961166381836, "learning_rate": 8.76782658300057e-06, "loss": 3.9737, "step": 1542 }, { "epoch": 0.26407667294198184, "grad_norm": 10.720991134643555, "learning_rate": 8.773531089560754e-06, "loss": 1.1108, "step": 1543 }, { "epoch": 0.26424781790176277, "grad_norm": 26.29660987854004, "learning_rate": 8.779235596120936e-06, "loss": 2.9509, "step": 1544 }, { "epoch": 0.2644189628615437, "grad_norm": 7.651371479034424, "learning_rate": 8.784940102681119e-06, "loss": 0.8929, "step": 1545 }, { "epoch": 0.26459010782132464, "grad_norm": 32.411407470703125, "learning_rate": 8.7906446092413e-06, "loss": 3.9279, "step": 1546 }, { "epoch": 0.26476125278110557, "grad_norm": 43.62602233886719, "learning_rate": 8.796349115801484e-06, "loss": 8.7932, "step": 1547 }, { "epoch": 0.2649323977408865, "grad_norm": 28.391075134277344, "learning_rate": 8.802053622361666e-06, "loss": 3.3049, "step": 1548 }, { "epoch": 0.26510354270066744, "grad_norm": 35.11864471435547, "learning_rate": 8.80775812892185e-06, "loss": 4.0323, "step": 1549 }, { "epoch": 0.2652746876604484, "grad_norm": 10.911874771118164, "learning_rate": 8.813462635482031e-06, "loss": 1.3744, "step": 1550 }, { "epoch": 0.26544583262022936, "grad_norm": 22.232980728149414, "learning_rate": 8.819167142042213e-06, "loss": 1.972, "step": 1551 }, { "epoch": 0.2656169775800103, "grad_norm": 171.640625, "learning_rate": 8.824871648602396e-06, "loss": 8.4712, "step": 1552 }, { "epoch": 0.2657881225397912, "grad_norm": 30.831897735595703, "learning_rate": 8.830576155162578e-06, "loss": 3.5869, "step": 1553 }, { "epoch": 0.26595926749957216, "grad_norm": 36.305782318115234, "learning_rate": 8.836280661722761e-06, "loss": 4.9009, "step": 1554 }, { "epoch": 0.2661304124593531, "grad_norm": 44.463626861572266, "learning_rate": 8.841985168282944e-06, "loss": 4.6015, "step": 1555 }, { "epoch": 0.266301557419134, "grad_norm": 22.66800308227539, "learning_rate": 8.847689674843126e-06, "loss": 2.1498, "step": 1556 }, { "epoch": 0.26647270237891496, "grad_norm": 30.886274337768555, "learning_rate": 8.85339418140331e-06, "loss": 4.3322, "step": 1557 }, { "epoch": 0.2666438473386959, "grad_norm": 34.30126190185547, "learning_rate": 8.859098687963491e-06, "loss": 4.5378, "step": 1558 }, { "epoch": 0.2668149922984768, "grad_norm": 36.92926025390625, "learning_rate": 8.864803194523674e-06, "loss": 4.2903, "step": 1559 }, { "epoch": 0.26698613725825776, "grad_norm": 34.588077545166016, "learning_rate": 8.870507701083856e-06, "loss": 4.9088, "step": 1560 }, { "epoch": 0.2671572822180387, "grad_norm": 30.621044158935547, "learning_rate": 8.87621220764404e-06, "loss": 3.6051, "step": 1561 }, { "epoch": 0.2673284271778196, "grad_norm": 30.107677459716797, "learning_rate": 8.881916714204221e-06, "loss": 3.4027, "step": 1562 }, { "epoch": 0.26749957213760056, "grad_norm": 16.614532470703125, "learning_rate": 8.887621220764403e-06, "loss": 1.5846, "step": 1563 }, { "epoch": 0.2676707170973815, "grad_norm": 35.577842712402344, "learning_rate": 8.893325727324586e-06, "loss": 4.2335, "step": 1564 }, { "epoch": 0.2678418620571624, "grad_norm": 33.13545227050781, "learning_rate": 8.899030233884768e-06, "loss": 4.6539, "step": 1565 }, { "epoch": 0.26801300701694336, "grad_norm": 170.64297485351562, "learning_rate": 8.904734740444953e-06, "loss": 9.3362, "step": 1566 }, { "epoch": 0.2681841519767243, "grad_norm": 12.3065185546875, "learning_rate": 8.910439247005135e-06, "loss": 1.573, "step": 1567 }, { "epoch": 0.2683552969365052, "grad_norm": 38.08529281616211, "learning_rate": 8.916143753565318e-06, "loss": 3.7314, "step": 1568 }, { "epoch": 0.26852644189628616, "grad_norm": 169.76089477539062, "learning_rate": 8.9218482601255e-06, "loss": 9.6942, "step": 1569 }, { "epoch": 0.2686975868560671, "grad_norm": 38.42169952392578, "learning_rate": 8.927552766685681e-06, "loss": 5.3158, "step": 1570 }, { "epoch": 0.268868731815848, "grad_norm": 14.410723686218262, "learning_rate": 8.933257273245865e-06, "loss": 1.2377, "step": 1571 }, { "epoch": 0.26903987677562896, "grad_norm": 52.682533264160156, "learning_rate": 8.938961779806047e-06, "loss": 6.516, "step": 1572 }, { "epoch": 0.2692110217354099, "grad_norm": 34.07759094238281, "learning_rate": 8.94466628636623e-06, "loss": 4.013, "step": 1573 }, { "epoch": 0.26938216669519083, "grad_norm": 29.74109649658203, "learning_rate": 8.950370792926412e-06, "loss": 3.4177, "step": 1574 }, { "epoch": 0.26955331165497176, "grad_norm": 35.098876953125, "learning_rate": 8.956075299486593e-06, "loss": 4.1055, "step": 1575 }, { "epoch": 0.2697244566147527, "grad_norm": 50.082366943359375, "learning_rate": 8.961779806046777e-06, "loss": 8.4876, "step": 1576 }, { "epoch": 0.26989560157453363, "grad_norm": 116.58244323730469, "learning_rate": 8.96748431260696e-06, "loss": 7.8558, "step": 1577 }, { "epoch": 0.27006674653431456, "grad_norm": 32.75837326049805, "learning_rate": 8.973188819167143e-06, "loss": 3.8977, "step": 1578 }, { "epoch": 0.2702378914940955, "grad_norm": 13.686226844787598, "learning_rate": 8.978893325727325e-06, "loss": 1.5984, "step": 1579 }, { "epoch": 0.27040903645387643, "grad_norm": 31.057418823242188, "learning_rate": 8.984597832287508e-06, "loss": 4.2033, "step": 1580 }, { "epoch": 0.27058018141365736, "grad_norm": 31.405447006225586, "learning_rate": 8.99030233884769e-06, "loss": 3.2895, "step": 1581 }, { "epoch": 0.2707513263734383, "grad_norm": 29.978918075561523, "learning_rate": 8.996006845407872e-06, "loss": 4.0648, "step": 1582 }, { "epoch": 0.27092247133321923, "grad_norm": 11.317312240600586, "learning_rate": 9.001711351968055e-06, "loss": 0.9835, "step": 1583 }, { "epoch": 0.27109361629300016, "grad_norm": 17.877771377563477, "learning_rate": 9.007415858528237e-06, "loss": 1.4293, "step": 1584 }, { "epoch": 0.2712647612527811, "grad_norm": 26.353673934936523, "learning_rate": 9.01312036508842e-06, "loss": 2.6549, "step": 1585 }, { "epoch": 0.27143590621256203, "grad_norm": 31.735876083374023, "learning_rate": 9.018824871648602e-06, "loss": 3.9997, "step": 1586 }, { "epoch": 0.27160705117234296, "grad_norm": 35.91917037963867, "learning_rate": 9.024529378208785e-06, "loss": 4.2824, "step": 1587 }, { "epoch": 0.2717781961321239, "grad_norm": 32.27674865722656, "learning_rate": 9.030233884768967e-06, "loss": 4.0964, "step": 1588 }, { "epoch": 0.27194934109190483, "grad_norm": 37.242549896240234, "learning_rate": 9.03593839132915e-06, "loss": 4.4567, "step": 1589 }, { "epoch": 0.27212048605168576, "grad_norm": 15.34211540222168, "learning_rate": 9.041642897889334e-06, "loss": 1.1567, "step": 1590 }, { "epoch": 0.2722916310114667, "grad_norm": 35.38195037841797, "learning_rate": 9.047347404449515e-06, "loss": 4.7975, "step": 1591 }, { "epoch": 0.27246277597124763, "grad_norm": 29.104900360107422, "learning_rate": 9.053051911009699e-06, "loss": 3.1354, "step": 1592 }, { "epoch": 0.27263392093102856, "grad_norm": 15.004528999328613, "learning_rate": 9.05875641756988e-06, "loss": 1.1248, "step": 1593 }, { "epoch": 0.2728050658908095, "grad_norm": 26.269655227661133, "learning_rate": 9.064460924130062e-06, "loss": 2.0743, "step": 1594 }, { "epoch": 0.27297621085059043, "grad_norm": 19.79959487915039, "learning_rate": 9.070165430690246e-06, "loss": 1.3031, "step": 1595 }, { "epoch": 0.27314735581037136, "grad_norm": 43.51731491088867, "learning_rate": 9.075869937250427e-06, "loss": 4.4293, "step": 1596 }, { "epoch": 0.2733185007701523, "grad_norm": 7.138434410095215, "learning_rate": 9.08157444381061e-06, "loss": 0.8485, "step": 1597 }, { "epoch": 0.27348964572993323, "grad_norm": 32.309593200683594, "learning_rate": 9.087278950370792e-06, "loss": 3.4497, "step": 1598 }, { "epoch": 0.27366079068971416, "grad_norm": 24.805715560913086, "learning_rate": 9.092983456930976e-06, "loss": 2.9256, "step": 1599 }, { "epoch": 0.2738319356494951, "grad_norm": 61.22898483276367, "learning_rate": 9.098687963491159e-06, "loss": 5.9283, "step": 1600 }, { "epoch": 0.2740030806092761, "grad_norm": 29.417680740356445, "learning_rate": 9.10439247005134e-06, "loss": 3.8084, "step": 1601 }, { "epoch": 0.274174225569057, "grad_norm": 34.00372314453125, "learning_rate": 9.110096976611524e-06, "loss": 3.4933, "step": 1602 }, { "epoch": 0.27434537052883795, "grad_norm": 14.374422073364258, "learning_rate": 9.115801483171706e-06, "loss": 1.4626, "step": 1603 }, { "epoch": 0.2745165154886189, "grad_norm": 12.729880332946777, "learning_rate": 9.12150598973189e-06, "loss": 1.1151, "step": 1604 }, { "epoch": 0.2746876604483998, "grad_norm": 17.94257164001465, "learning_rate": 9.127210496292071e-06, "loss": 1.3846, "step": 1605 }, { "epoch": 0.27485880540818075, "grad_norm": 38.29545974731445, "learning_rate": 9.132915002852253e-06, "loss": 4.5905, "step": 1606 }, { "epoch": 0.2750299503679617, "grad_norm": 35.37318420410156, "learning_rate": 9.138619509412436e-06, "loss": 4.3784, "step": 1607 }, { "epoch": 0.2752010953277426, "grad_norm": 35.77292251586914, "learning_rate": 9.144324015972618e-06, "loss": 3.315, "step": 1608 }, { "epoch": 0.27537224028752355, "grad_norm": 38.70093536376953, "learning_rate": 9.150028522532801e-06, "loss": 5.4718, "step": 1609 }, { "epoch": 0.2755433852473045, "grad_norm": 185.0310516357422, "learning_rate": 9.155733029092983e-06, "loss": 7.5009, "step": 1610 }, { "epoch": 0.2757145302070854, "grad_norm": 28.145288467407227, "learning_rate": 9.161437535653166e-06, "loss": 2.8764, "step": 1611 }, { "epoch": 0.27588567516686635, "grad_norm": 7.594282150268555, "learning_rate": 9.16714204221335e-06, "loss": 0.8713, "step": 1612 }, { "epoch": 0.2760568201266473, "grad_norm": 32.899845123291016, "learning_rate": 9.172846548773531e-06, "loss": 4.6094, "step": 1613 }, { "epoch": 0.2762279650864282, "grad_norm": 39.75630569458008, "learning_rate": 9.178551055333715e-06, "loss": 4.5632, "step": 1614 }, { "epoch": 0.27639911004620915, "grad_norm": 29.607851028442383, "learning_rate": 9.184255561893896e-06, "loss": 2.9606, "step": 1615 }, { "epoch": 0.2765702550059901, "grad_norm": 76.37677001953125, "learning_rate": 9.18996006845408e-06, "loss": 7.355, "step": 1616 }, { "epoch": 0.276741399965771, "grad_norm": 22.215526580810547, "learning_rate": 9.195664575014261e-06, "loss": 2.8241, "step": 1617 }, { "epoch": 0.27691254492555195, "grad_norm": 9.465276718139648, "learning_rate": 9.201369081574445e-06, "loss": 0.9882, "step": 1618 }, { "epoch": 0.2770836898853329, "grad_norm": 27.726600646972656, "learning_rate": 9.207073588134626e-06, "loss": 3.238, "step": 1619 }, { "epoch": 0.2772548348451138, "grad_norm": 35.69710922241211, "learning_rate": 9.212778094694808e-06, "loss": 4.4113, "step": 1620 }, { "epoch": 0.27742597980489475, "grad_norm": 34.97329330444336, "learning_rate": 9.218482601254991e-06, "loss": 5.005, "step": 1621 }, { "epoch": 0.2775971247646757, "grad_norm": 18.749282836914062, "learning_rate": 9.224187107815173e-06, "loss": 1.7009, "step": 1622 }, { "epoch": 0.2777682697244566, "grad_norm": 130.61004638671875, "learning_rate": 9.229891614375358e-06, "loss": 7.8661, "step": 1623 }, { "epoch": 0.27793941468423755, "grad_norm": 12.980770111083984, "learning_rate": 9.23559612093554e-06, "loss": 1.1125, "step": 1624 }, { "epoch": 0.2781105596440185, "grad_norm": 46.32781219482422, "learning_rate": 9.241300627495722e-06, "loss": 8.7552, "step": 1625 }, { "epoch": 0.2782817046037994, "grad_norm": 101.5696029663086, "learning_rate": 9.247005134055905e-06, "loss": 7.1054, "step": 1626 }, { "epoch": 0.27845284956358035, "grad_norm": 22.125795364379883, "learning_rate": 9.252709640616087e-06, "loss": 1.7911, "step": 1627 }, { "epoch": 0.2786239945233613, "grad_norm": 34.277095794677734, "learning_rate": 9.25841414717627e-06, "loss": 4.438, "step": 1628 }, { "epoch": 0.2787951394831422, "grad_norm": 22.72269058227539, "learning_rate": 9.264118653736452e-06, "loss": 1.7455, "step": 1629 }, { "epoch": 0.27896628444292315, "grad_norm": 30.11455726623535, "learning_rate": 9.269823160296635e-06, "loss": 3.3549, "step": 1630 }, { "epoch": 0.2791374294027041, "grad_norm": 34.13120651245117, "learning_rate": 9.275527666856817e-06, "loss": 3.7081, "step": 1631 }, { "epoch": 0.279308574362485, "grad_norm": 8.457001686096191, "learning_rate": 9.281232173416998e-06, "loss": 0.9564, "step": 1632 }, { "epoch": 0.27947971932226595, "grad_norm": 38.574615478515625, "learning_rate": 9.286936679977182e-06, "loss": 4.3973, "step": 1633 }, { "epoch": 0.2796508642820469, "grad_norm": 11.158347129821777, "learning_rate": 9.292641186537364e-06, "loss": 0.9696, "step": 1634 }, { "epoch": 0.2798220092418278, "grad_norm": 11.847931861877441, "learning_rate": 9.298345693097549e-06, "loss": 1.5377, "step": 1635 }, { "epoch": 0.27999315420160875, "grad_norm": 11.096319198608398, "learning_rate": 9.30405019965773e-06, "loss": 1.136, "step": 1636 }, { "epoch": 0.2801642991613897, "grad_norm": 36.63529586791992, "learning_rate": 9.309754706217914e-06, "loss": 5.2998, "step": 1637 }, { "epoch": 0.2803354441211706, "grad_norm": 30.421175003051758, "learning_rate": 9.315459212778095e-06, "loss": 3.5562, "step": 1638 }, { "epoch": 0.28050658908095155, "grad_norm": 34.89402770996094, "learning_rate": 9.321163719338277e-06, "loss": 4.9255, "step": 1639 }, { "epoch": 0.2806777340407325, "grad_norm": 28.486478805541992, "learning_rate": 9.32686822589846e-06, "loss": 3.4583, "step": 1640 }, { "epoch": 0.2808488790005134, "grad_norm": 7.498641490936279, "learning_rate": 9.332572732458642e-06, "loss": 0.8123, "step": 1641 }, { "epoch": 0.28102002396029435, "grad_norm": 47.50094223022461, "learning_rate": 9.338277239018825e-06, "loss": 7.894, "step": 1642 }, { "epoch": 0.2811911689200753, "grad_norm": 62.95503616333008, "learning_rate": 9.343981745579007e-06, "loss": 6.7316, "step": 1643 }, { "epoch": 0.2813623138798562, "grad_norm": 26.29498291015625, "learning_rate": 9.349686252139189e-06, "loss": 2.9299, "step": 1644 }, { "epoch": 0.28153345883963715, "grad_norm": 13.663917541503906, "learning_rate": 9.355390758699372e-06, "loss": 1.6658, "step": 1645 }, { "epoch": 0.2817046037994181, "grad_norm": 31.745132446289062, "learning_rate": 9.361095265259556e-06, "loss": 4.9097, "step": 1646 }, { "epoch": 0.281875748759199, "grad_norm": 16.757953643798828, "learning_rate": 9.366799771819739e-06, "loss": 1.4769, "step": 1647 }, { "epoch": 0.28204689371897995, "grad_norm": 21.601877212524414, "learning_rate": 9.37250427837992e-06, "loss": 1.7352, "step": 1648 }, { "epoch": 0.2822180386787609, "grad_norm": 38.61962127685547, "learning_rate": 9.378208784940104e-06, "loss": 4.4803, "step": 1649 }, { "epoch": 0.2823891836385418, "grad_norm": 31.342639923095703, "learning_rate": 9.383913291500286e-06, "loss": 3.8044, "step": 1650 }, { "epoch": 0.28256032859832275, "grad_norm": 9.416754722595215, "learning_rate": 9.389617798060467e-06, "loss": 0.804, "step": 1651 }, { "epoch": 0.28273147355810374, "grad_norm": 31.227413177490234, "learning_rate": 9.39532230462065e-06, "loss": 4.1229, "step": 1652 }, { "epoch": 0.2829026185178847, "grad_norm": 13.257563591003418, "learning_rate": 9.401026811180832e-06, "loss": 1.1089, "step": 1653 }, { "epoch": 0.2830737634776656, "grad_norm": 33.36773681640625, "learning_rate": 9.406731317741016e-06, "loss": 4.6453, "step": 1654 }, { "epoch": 0.28324490843744654, "grad_norm": 30.116289138793945, "learning_rate": 9.412435824301198e-06, "loss": 3.3475, "step": 1655 }, { "epoch": 0.2834160533972275, "grad_norm": 9.72807502746582, "learning_rate": 9.41814033086138e-06, "loss": 1.3987, "step": 1656 }, { "epoch": 0.2835871983570084, "grad_norm": 35.53730392456055, "learning_rate": 9.423844837421563e-06, "loss": 4.4274, "step": 1657 }, { "epoch": 0.28375834331678934, "grad_norm": 25.7310733795166, "learning_rate": 9.429549343981746e-06, "loss": 3.1681, "step": 1658 }, { "epoch": 0.2839294882765703, "grad_norm": 41.159175872802734, "learning_rate": 9.43525385054193e-06, "loss": 5.0249, "step": 1659 }, { "epoch": 0.2841006332363512, "grad_norm": 44.80512619018555, "learning_rate": 9.440958357102111e-06, "loss": 8.5485, "step": 1660 }, { "epoch": 0.28427177819613214, "grad_norm": 30.980173110961914, "learning_rate": 9.446662863662294e-06, "loss": 3.4326, "step": 1661 }, { "epoch": 0.2844429231559131, "grad_norm": 34.32295608520508, "learning_rate": 9.452367370222476e-06, "loss": 3.1846, "step": 1662 }, { "epoch": 0.284614068115694, "grad_norm": 31.66938591003418, "learning_rate": 9.458071876782658e-06, "loss": 3.5118, "step": 1663 }, { "epoch": 0.28478521307547494, "grad_norm": 32.4676513671875, "learning_rate": 9.463776383342841e-06, "loss": 4.7146, "step": 1664 }, { "epoch": 0.2849563580352559, "grad_norm": 10.913191795349121, "learning_rate": 9.469480889903023e-06, "loss": 0.9731, "step": 1665 }, { "epoch": 0.2851275029950368, "grad_norm": 35.5974006652832, "learning_rate": 9.475185396463206e-06, "loss": 4.3522, "step": 1666 }, { "epoch": 0.28529864795481774, "grad_norm": 33.59803771972656, "learning_rate": 9.480889903023388e-06, "loss": 3.3641, "step": 1667 }, { "epoch": 0.2854697929145987, "grad_norm": 35.429466247558594, "learning_rate": 9.486594409583571e-06, "loss": 3.9219, "step": 1668 }, { "epoch": 0.2856409378743796, "grad_norm": 13.85142707824707, "learning_rate": 9.492298916143755e-06, "loss": 1.3341, "step": 1669 }, { "epoch": 0.28581208283416054, "grad_norm": 9.107728004455566, "learning_rate": 9.498003422703936e-06, "loss": 0.8871, "step": 1670 }, { "epoch": 0.2859832277939415, "grad_norm": 35.564979553222656, "learning_rate": 9.50370792926412e-06, "loss": 4.1451, "step": 1671 }, { "epoch": 0.2861543727537224, "grad_norm": 27.561506271362305, "learning_rate": 9.509412435824301e-06, "loss": 3.3942, "step": 1672 }, { "epoch": 0.28632551771350334, "grad_norm": 35.57343292236328, "learning_rate": 9.515116942384485e-06, "loss": 3.7111, "step": 1673 }, { "epoch": 0.2864966626732843, "grad_norm": 39.25908279418945, "learning_rate": 9.520821448944666e-06, "loss": 4.3091, "step": 1674 }, { "epoch": 0.2866678076330652, "grad_norm": 41.76926803588867, "learning_rate": 9.526525955504848e-06, "loss": 4.7086, "step": 1675 }, { "epoch": 0.28683895259284614, "grad_norm": 30.626611709594727, "learning_rate": 9.532230462065032e-06, "loss": 3.1129, "step": 1676 }, { "epoch": 0.2870100975526271, "grad_norm": 15.441875457763672, "learning_rate": 9.537934968625213e-06, "loss": 1.2888, "step": 1677 }, { "epoch": 0.287181242512408, "grad_norm": 28.600982666015625, "learning_rate": 9.543639475185397e-06, "loss": 2.9279, "step": 1678 }, { "epoch": 0.28735238747218894, "grad_norm": 31.3085994720459, "learning_rate": 9.549343981745578e-06, "loss": 3.8741, "step": 1679 }, { "epoch": 0.2875235324319699, "grad_norm": 207.9216766357422, "learning_rate": 9.555048488305763e-06, "loss": 8.5201, "step": 1680 }, { "epoch": 0.2876946773917508, "grad_norm": 38.76487731933594, "learning_rate": 9.560752994865945e-06, "loss": 4.8258, "step": 1681 }, { "epoch": 0.28786582235153174, "grad_norm": 35.18633270263672, "learning_rate": 9.566457501426127e-06, "loss": 3.9555, "step": 1682 }, { "epoch": 0.2880369673113127, "grad_norm": 153.19830322265625, "learning_rate": 9.57216200798631e-06, "loss": 8.1017, "step": 1683 }, { "epoch": 0.2882081122710936, "grad_norm": 8.444355010986328, "learning_rate": 9.577866514546492e-06, "loss": 0.8761, "step": 1684 }, { "epoch": 0.28837925723087454, "grad_norm": 44.78715515136719, "learning_rate": 9.583571021106675e-06, "loss": 8.4681, "step": 1685 }, { "epoch": 0.2885504021906555, "grad_norm": 25.710901260375977, "learning_rate": 9.589275527666857e-06, "loss": 3.2682, "step": 1686 }, { "epoch": 0.2887215471504364, "grad_norm": 161.20376586914062, "learning_rate": 9.59498003422704e-06, "loss": 8.3231, "step": 1687 }, { "epoch": 0.28889269211021734, "grad_norm": 36.88936996459961, "learning_rate": 9.600684540787222e-06, "loss": 4.4629, "step": 1688 }, { "epoch": 0.2890638370699983, "grad_norm": 33.05325698852539, "learning_rate": 9.606389047347404e-06, "loss": 4.2398, "step": 1689 }, { "epoch": 0.2892349820297792, "grad_norm": 31.297021865844727, "learning_rate": 9.612093553907587e-06, "loss": 3.9676, "step": 1690 }, { "epoch": 0.28940612698956014, "grad_norm": 33.626365661621094, "learning_rate": 9.617798060467769e-06, "loss": 4.2342, "step": 1691 }, { "epoch": 0.2895772719493411, "grad_norm": 32.812740325927734, "learning_rate": 9.623502567027954e-06, "loss": 3.9633, "step": 1692 }, { "epoch": 0.289748416909122, "grad_norm": 16.281417846679688, "learning_rate": 9.629207073588135e-06, "loss": 1.3504, "step": 1693 }, { "epoch": 0.28991956186890294, "grad_norm": 36.80635070800781, "learning_rate": 9.634911580148317e-06, "loss": 4.8133, "step": 1694 }, { "epoch": 0.2900907068286839, "grad_norm": 36.548397064208984, "learning_rate": 9.6406160867085e-06, "loss": 4.0484, "step": 1695 }, { "epoch": 0.2902618517884648, "grad_norm": 35.513729095458984, "learning_rate": 9.646320593268682e-06, "loss": 4.3281, "step": 1696 }, { "epoch": 0.29043299674824574, "grad_norm": 33.258995056152344, "learning_rate": 9.652025099828866e-06, "loss": 4.3749, "step": 1697 }, { "epoch": 0.2906041417080267, "grad_norm": 30.854419708251953, "learning_rate": 9.657729606389047e-06, "loss": 3.4016, "step": 1698 }, { "epoch": 0.2907752866678076, "grad_norm": 8.308602333068848, "learning_rate": 9.66343411294923e-06, "loss": 1.221, "step": 1699 }, { "epoch": 0.29094643162758854, "grad_norm": 10.515448570251465, "learning_rate": 9.669138619509412e-06, "loss": 0.9434, "step": 1700 }, { "epoch": 0.2911175765873695, "grad_norm": 20.784067153930664, "learning_rate": 9.674843126069594e-06, "loss": 2.2873, "step": 1701 }, { "epoch": 0.2912887215471504, "grad_norm": 44.417884826660156, "learning_rate": 9.680547632629777e-06, "loss": 8.2452, "step": 1702 }, { "epoch": 0.2914598665069314, "grad_norm": 28.057279586791992, "learning_rate": 9.68625213918996e-06, "loss": 2.7355, "step": 1703 }, { "epoch": 0.29163101146671233, "grad_norm": 50.71089553833008, "learning_rate": 9.691956645750144e-06, "loss": 8.431, "step": 1704 }, { "epoch": 0.29180215642649326, "grad_norm": 44.918087005615234, "learning_rate": 9.697661152310326e-06, "loss": 7.8944, "step": 1705 }, { "epoch": 0.2919733013862742, "grad_norm": 143.09837341308594, "learning_rate": 9.703365658870507e-06, "loss": 7.8241, "step": 1706 }, { "epoch": 0.29214444634605513, "grad_norm": 22.652225494384766, "learning_rate": 9.709070165430691e-06, "loss": 2.888, "step": 1707 }, { "epoch": 0.29231559130583606, "grad_norm": 32.992774963378906, "learning_rate": 9.714774671990873e-06, "loss": 3.7048, "step": 1708 }, { "epoch": 0.292486736265617, "grad_norm": 30.531761169433594, "learning_rate": 9.720479178551056e-06, "loss": 3.8219, "step": 1709 }, { "epoch": 0.29265788122539793, "grad_norm": 39.57463073730469, "learning_rate": 9.726183685111238e-06, "loss": 4.5677, "step": 1710 }, { "epoch": 0.29282902618517886, "grad_norm": 32.177650451660156, "learning_rate": 9.731888191671421e-06, "loss": 3.9562, "step": 1711 }, { "epoch": 0.2930001711449598, "grad_norm": 11.071613311767578, "learning_rate": 9.737592698231603e-06, "loss": 1.3818, "step": 1712 }, { "epoch": 0.29317131610474073, "grad_norm": 183.07089233398438, "learning_rate": 9.743297204791784e-06, "loss": 8.202, "step": 1713 }, { "epoch": 0.29334246106452166, "grad_norm": 127.64228057861328, "learning_rate": 9.749001711351968e-06, "loss": 7.7497, "step": 1714 }, { "epoch": 0.2935136060243026, "grad_norm": 30.094449996948242, "learning_rate": 9.754706217912151e-06, "loss": 3.133, "step": 1715 }, { "epoch": 0.29368475098408353, "grad_norm": 33.56199264526367, "learning_rate": 9.760410724472334e-06, "loss": 3.9969, "step": 1716 }, { "epoch": 0.29385589594386446, "grad_norm": 30.969953536987305, "learning_rate": 9.766115231032516e-06, "loss": 3.2142, "step": 1717 }, { "epoch": 0.2940270409036454, "grad_norm": 26.745988845825195, "learning_rate": 9.7718197375927e-06, "loss": 2.5008, "step": 1718 }, { "epoch": 0.29419818586342633, "grad_norm": 19.184772491455078, "learning_rate": 9.777524244152881e-06, "loss": 1.7379, "step": 1719 }, { "epoch": 0.29436933082320726, "grad_norm": 30.3228759765625, "learning_rate": 9.783228750713063e-06, "loss": 3.6131, "step": 1720 }, { "epoch": 0.2945404757829882, "grad_norm": 30.700254440307617, "learning_rate": 9.788933257273246e-06, "loss": 3.4613, "step": 1721 }, { "epoch": 0.29471162074276913, "grad_norm": 35.11033248901367, "learning_rate": 9.794637763833428e-06, "loss": 4.2554, "step": 1722 }, { "epoch": 0.29488276570255006, "grad_norm": 47.508277893066406, "learning_rate": 9.800342270393611e-06, "loss": 8.2195, "step": 1723 }, { "epoch": 0.295053910662331, "grad_norm": 35.247528076171875, "learning_rate": 9.806046776953793e-06, "loss": 4.3316, "step": 1724 }, { "epoch": 0.29522505562211193, "grad_norm": 27.610280990600586, "learning_rate": 9.811751283513975e-06, "loss": 3.7587, "step": 1725 }, { "epoch": 0.29539620058189286, "grad_norm": 34.314918518066406, "learning_rate": 9.81745579007416e-06, "loss": 4.4546, "step": 1726 }, { "epoch": 0.2955673455416738, "grad_norm": 31.43994140625, "learning_rate": 9.823160296634341e-06, "loss": 3.8653, "step": 1727 }, { "epoch": 0.29573849050145473, "grad_norm": 15.655001640319824, "learning_rate": 9.828864803194525e-06, "loss": 1.2015, "step": 1728 }, { "epoch": 0.29590963546123566, "grad_norm": 13.799985885620117, "learning_rate": 9.834569309754707e-06, "loss": 1.6485, "step": 1729 }, { "epoch": 0.2960807804210166, "grad_norm": 35.408145904541016, "learning_rate": 9.84027381631489e-06, "loss": 4.6912, "step": 1730 }, { "epoch": 0.29625192538079753, "grad_norm": 33.258941650390625, "learning_rate": 9.845978322875072e-06, "loss": 3.8243, "step": 1731 }, { "epoch": 0.29642307034057847, "grad_norm": 34.537960052490234, "learning_rate": 9.851682829435253e-06, "loss": 3.6863, "step": 1732 }, { "epoch": 0.2965942153003594, "grad_norm": 25.667997360229492, "learning_rate": 9.857387335995437e-06, "loss": 3.21, "step": 1733 }, { "epoch": 0.29676536026014033, "grad_norm": 146.46380615234375, "learning_rate": 9.863091842555618e-06, "loss": 7.1136, "step": 1734 }, { "epoch": 0.29693650521992127, "grad_norm": 20.732595443725586, "learning_rate": 9.868796349115802e-06, "loss": 2.1932, "step": 1735 }, { "epoch": 0.2971076501797022, "grad_norm": 37.78299331665039, "learning_rate": 9.874500855675983e-06, "loss": 4.4365, "step": 1736 }, { "epoch": 0.29727879513948313, "grad_norm": 30.049827575683594, "learning_rate": 9.880205362236167e-06, "loss": 3.2983, "step": 1737 }, { "epoch": 0.29744994009926407, "grad_norm": 12.33377742767334, "learning_rate": 9.88590986879635e-06, "loss": 1.4362, "step": 1738 }, { "epoch": 0.297621085059045, "grad_norm": 24.165996551513672, "learning_rate": 9.891614375356532e-06, "loss": 2.7512, "step": 1739 }, { "epoch": 0.29779223001882593, "grad_norm": 34.980438232421875, "learning_rate": 9.897318881916715e-06, "loss": 3.4089, "step": 1740 }, { "epoch": 0.29796337497860687, "grad_norm": 52.22333526611328, "learning_rate": 9.903023388476897e-06, "loss": 8.55, "step": 1741 }, { "epoch": 0.2981345199383878, "grad_norm": 30.178720474243164, "learning_rate": 9.90872789503708e-06, "loss": 3.7629, "step": 1742 }, { "epoch": 0.29830566489816873, "grad_norm": 12.83564281463623, "learning_rate": 9.914432401597262e-06, "loss": 1.5206, "step": 1743 }, { "epoch": 0.29847680985794967, "grad_norm": 116.20635223388672, "learning_rate": 9.920136908157444e-06, "loss": 7.1701, "step": 1744 }, { "epoch": 0.2986479548177306, "grad_norm": 28.332143783569336, "learning_rate": 9.925841414717627e-06, "loss": 4.0808, "step": 1745 }, { "epoch": 0.29881909977751153, "grad_norm": 17.009302139282227, "learning_rate": 9.931545921277809e-06, "loss": 1.1237, "step": 1746 }, { "epoch": 0.29899024473729247, "grad_norm": 22.102079391479492, "learning_rate": 9.937250427837992e-06, "loss": 2.1591, "step": 1747 }, { "epoch": 0.2991613896970734, "grad_norm": 31.704936981201172, "learning_rate": 9.942954934398174e-06, "loss": 3.3555, "step": 1748 }, { "epoch": 0.29933253465685433, "grad_norm": 7.139681816101074, "learning_rate": 9.948659440958359e-06, "loss": 0.8492, "step": 1749 }, { "epoch": 0.29950367961663527, "grad_norm": 37.93485641479492, "learning_rate": 9.95436394751854e-06, "loss": 4.3445, "step": 1750 }, { "epoch": 0.2996748245764162, "grad_norm": 23.79175567626953, "learning_rate": 9.960068454078722e-06, "loss": 2.6891, "step": 1751 }, { "epoch": 0.29984596953619713, "grad_norm": 26.583223342895508, "learning_rate": 9.965772960638906e-06, "loss": 3.0936, "step": 1752 }, { "epoch": 0.3000171144959781, "grad_norm": 16.86503791809082, "learning_rate": 9.971477467199087e-06, "loss": 1.5367, "step": 1753 }, { "epoch": 0.30018825945575905, "grad_norm": 37.780025482177734, "learning_rate": 9.97718197375927e-06, "loss": 4.5862, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_nli-pairs_loss": 3.577563524246216, "eval_nli-pairs_runtime": 4.5158, "eval_nli-pairs_samples_per_second": 44.289, "eval_nli-pairs_steps_per_second": 1.55, "eval_sts-test_pearson_cosine": 0.7051574603634622, "eval_sts-test_pearson_dot": 0.5937802816639131, "eval_sts-test_pearson_euclidean": 0.7000060119936138, "eval_sts-test_pearson_manhattan": 0.7079127065958083, "eval_sts-test_pearson_max": 0.7079127065958083, "eval_sts-test_spearman_cosine": 0.6765504113809614, "eval_sts-test_spearman_dot": 0.5611218190113842, "eval_sts-test_spearman_euclidean": 0.6793571635918119, "eval_sts-test_spearman_manhattan": 0.6864576898108908, "eval_sts-test_spearman_max": 0.6864576898108908, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_vitaminc-pairs_loss": 2.382566213607788, "eval_vitaminc-pairs_runtime": 2.7572, "eval_vitaminc-pairs_samples_per_second": 72.538, "eval_vitaminc-pairs_steps_per_second": 2.539, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_qnli-contrastive_loss": 7.762363910675049, "eval_qnli-contrastive_runtime": 0.6686, "eval_qnli-contrastive_samples_per_second": 299.128, "eval_qnli-contrastive_steps_per_second": 10.469, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_scitail-pairs-qa_loss": 0.7197363972663879, "eval_scitail-pairs-qa_runtime": 1.7426, "eval_scitail-pairs-qa_samples_per_second": 114.768, "eval_scitail-pairs-qa_steps_per_second": 4.017, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_scitail-pairs-pos_loss": 2.2759039402008057, "eval_scitail-pairs-pos_runtime": 2.8206, "eval_scitail-pairs-pos_samples_per_second": 70.906, "eval_scitail-pairs-pos_steps_per_second": 2.482, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_xsum-pairs_loss": 2.1139955520629883, "eval_xsum-pairs_runtime": 2.6563, "eval_xsum-pairs_samples_per_second": 65.88, "eval_xsum-pairs_steps_per_second": 2.259, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_compression-pairs_loss": 1.1527378559112549, "eval_compression-pairs_runtime": 0.5278, "eval_compression-pairs_samples_per_second": 378.929, "eval_compression-pairs_steps_per_second": 13.263, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_sciq_pairs_loss": 6.166472434997559, "eval_sciq_pairs_runtime": 9.2821, "eval_sciq_pairs_samples_per_second": 21.547, "eval_sciq_pairs_steps_per_second": 0.754, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_qasc_pairs_loss": 8.247413635253906, "eval_qasc_pairs_runtime": 2.7444, "eval_qasc_pairs_samples_per_second": 72.876, "eval_qasc_pairs_steps_per_second": 2.551, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_openbookqa_pairs_loss": 4.27993631362915, "eval_openbookqa_pairs_runtime": 0.68, "eval_openbookqa_pairs_samples_per_second": 101.475, "eval_openbookqa_pairs_steps_per_second": 4.412, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_msmarco_pairs_loss": 3.4503884315490723, "eval_msmarco_pairs_runtime": 4.1424, "eval_msmarco_pairs_samples_per_second": 48.281, "eval_msmarco_pairs_steps_per_second": 1.69, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_nq_pairs_loss": 4.303767204284668, "eval_nq_pairs_runtime": 8.7194, "eval_nq_pairs_samples_per_second": 22.937, "eval_nq_pairs_steps_per_second": 0.803, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_trivia_pairs_loss": 3.893390417098999, "eval_trivia_pairs_runtime": 13.177, "eval_trivia_pairs_samples_per_second": 15.178, "eval_trivia_pairs_steps_per_second": 0.531, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_quora_pairs_loss": 1.0257954597473145, "eval_quora_pairs_runtime": 1.5896, "eval_quora_pairs_samples_per_second": 125.821, "eval_quora_pairs_steps_per_second": 4.404, "step": 1754 }, { "epoch": 0.30018825945575905, "eval_gooaq_pairs_loss": 2.6827940940856934, "eval_gooaq_pairs_runtime": 2.6669, "eval_gooaq_pairs_samples_per_second": 74.993, "eval_gooaq_pairs_steps_per_second": 2.625, "step": 1754 }, { "epoch": 0.30035940441554, "grad_norm": 32.57681655883789, "learning_rate": 9.982886480319452e-06, "loss": 4.3391, "step": 1755 }, { "epoch": 0.3005305493753209, "grad_norm": 26.65064811706543, "learning_rate": 9.988590986879634e-06, "loss": 2.7014, "step": 1756 }, { "epoch": 0.30070169433510185, "grad_norm": 33.25247573852539, "learning_rate": 9.994295493439817e-06, "loss": 4.1446, "step": 1757 }, { "epoch": 0.3008728392948828, "grad_norm": 25.792116165161133, "learning_rate": 9.999999999999999e-06, "loss": 2.7164, "step": 1758 }, { "epoch": 0.3010439842546637, "grad_norm": 28.707399368286133, "learning_rate": 1.0005704506560183e-05, "loss": 3.1937, "step": 1759 }, { "epoch": 0.30121512921444465, "grad_norm": 38.30696105957031, "learning_rate": 1.0011409013120366e-05, "loss": 4.2427, "step": 1760 }, { "epoch": 0.3013862741742256, "grad_norm": 26.254148483276367, "learning_rate": 1.001711351968055e-05, "loss": 2.5525, "step": 1761 }, { "epoch": 0.3015574191340065, "grad_norm": 7.5429487228393555, "learning_rate": 1.0022818026240731e-05, "loss": 0.8481, "step": 1762 }, { "epoch": 0.30172856409378745, "grad_norm": 45.37841796875, "learning_rate": 1.0028522532800913e-05, "loss": 6.5584, "step": 1763 }, { "epoch": 0.3018997090535684, "grad_norm": 17.617197036743164, "learning_rate": 1.0034227039361096e-05, "loss": 1.5689, "step": 1764 }, { "epoch": 0.3020708540133493, "grad_norm": 8.921030044555664, "learning_rate": 1.0039931545921278e-05, "loss": 1.9049, "step": 1765 }, { "epoch": 0.30224199897313025, "grad_norm": 11.456149101257324, "learning_rate": 1.0045636052481461e-05, "loss": 1.4351, "step": 1766 }, { "epoch": 0.3024131439329112, "grad_norm": 36.827125549316406, "learning_rate": 1.0051340559041643e-05, "loss": 3.8073, "step": 1767 }, { "epoch": 0.3025842888926921, "grad_norm": 31.50043296813965, "learning_rate": 1.0057045065601826e-05, "loss": 3.4761, "step": 1768 }, { "epoch": 0.30275543385247305, "grad_norm": 212.15618896484375, "learning_rate": 1.0062749572162008e-05, "loss": 8.804, "step": 1769 }, { "epoch": 0.302926578812254, "grad_norm": 11.170289039611816, "learning_rate": 1.006845407872219e-05, "loss": 1.5324, "step": 1770 }, { "epoch": 0.3030977237720349, "grad_norm": 11.275130271911621, "learning_rate": 1.0074158585282373e-05, "loss": 1.0326, "step": 1771 }, { "epoch": 0.30326886873181585, "grad_norm": 37.139068603515625, "learning_rate": 1.0079863091842556e-05, "loss": 4.5464, "step": 1772 }, { "epoch": 0.3034400136915968, "grad_norm": 24.030378341674805, "learning_rate": 1.008556759840274e-05, "loss": 1.9306, "step": 1773 }, { "epoch": 0.3036111586513777, "grad_norm": 23.25863265991211, "learning_rate": 1.0091272104962921e-05, "loss": 1.8897, "step": 1774 }, { "epoch": 0.30378230361115865, "grad_norm": 33.125823974609375, "learning_rate": 1.0096976611523103e-05, "loss": 3.4839, "step": 1775 }, { "epoch": 0.3039534485709396, "grad_norm": 21.4809627532959, "learning_rate": 1.0102681118083286e-05, "loss": 2.866, "step": 1776 }, { "epoch": 0.3041245935307205, "grad_norm": 54.2559928894043, "learning_rate": 1.0108385624643468e-05, "loss": 8.802, "step": 1777 }, { "epoch": 0.30429573849050146, "grad_norm": 39.62715148925781, "learning_rate": 1.0114090131203651e-05, "loss": 5.1068, "step": 1778 }, { "epoch": 0.3044668834502824, "grad_norm": 14.615751266479492, "learning_rate": 1.0119794637763833e-05, "loss": 1.2298, "step": 1779 }, { "epoch": 0.3046380284100633, "grad_norm": 36.6978874206543, "learning_rate": 1.0125499144324017e-05, "loss": 4.1995, "step": 1780 }, { "epoch": 0.30480917336984426, "grad_norm": 14.718832015991211, "learning_rate": 1.0131203650884198e-05, "loss": 1.1796, "step": 1781 }, { "epoch": 0.3049803183296252, "grad_norm": 36.830204010009766, "learning_rate": 1.013690815744438e-05, "loss": 4.1858, "step": 1782 }, { "epoch": 0.3051514632894061, "grad_norm": 23.391765594482422, "learning_rate": 1.0142612664004565e-05, "loss": 2.4115, "step": 1783 }, { "epoch": 0.30532260824918706, "grad_norm": 35.27947998046875, "learning_rate": 1.0148317170564747e-05, "loss": 4.8061, "step": 1784 }, { "epoch": 0.305493753208968, "grad_norm": 10.68021297454834, "learning_rate": 1.015402167712493e-05, "loss": 2.1324, "step": 1785 }, { "epoch": 0.3056648981687489, "grad_norm": 23.529436111450195, "learning_rate": 1.0159726183685112e-05, "loss": 2.7194, "step": 1786 }, { "epoch": 0.30583604312852986, "grad_norm": 32.76841354370117, "learning_rate": 1.0165430690245295e-05, "loss": 3.9735, "step": 1787 }, { "epoch": 0.3060071880883108, "grad_norm": 20.872732162475586, "learning_rate": 1.0171135196805477e-05, "loss": 2.3385, "step": 1788 }, { "epoch": 0.3061783330480917, "grad_norm": 14.08251953125, "learning_rate": 1.0176839703365658e-05, "loss": 1.8159, "step": 1789 }, { "epoch": 0.30634947800787266, "grad_norm": 39.58723831176758, "learning_rate": 1.0182544209925842e-05, "loss": 4.7749, "step": 1790 }, { "epoch": 0.3065206229676536, "grad_norm": 65.20591735839844, "learning_rate": 1.0188248716486024e-05, "loss": 6.4724, "step": 1791 }, { "epoch": 0.3066917679274345, "grad_norm": 44.97452926635742, "learning_rate": 1.0193953223046207e-05, "loss": 4.9313, "step": 1792 }, { "epoch": 0.30686291288721546, "grad_norm": 35.091163635253906, "learning_rate": 1.0199657729606389e-05, "loss": 3.4266, "step": 1793 }, { "epoch": 0.3070340578469964, "grad_norm": 17.238380432128906, "learning_rate": 1.020536223616657e-05, "loss": 1.4114, "step": 1794 }, { "epoch": 0.3072052028067773, "grad_norm": 12.661242485046387, "learning_rate": 1.0211066742726755e-05, "loss": 2.2799, "step": 1795 }, { "epoch": 0.30737634776655826, "grad_norm": 29.67556381225586, "learning_rate": 1.0216771249286937e-05, "loss": 2.9217, "step": 1796 }, { "epoch": 0.3075474927263392, "grad_norm": 34.465126037597656, "learning_rate": 1.022247575584712e-05, "loss": 3.9674, "step": 1797 }, { "epoch": 0.3077186376861201, "grad_norm": 66.6548080444336, "learning_rate": 1.0228180262407302e-05, "loss": 6.0514, "step": 1798 }, { "epoch": 0.30788978264590106, "grad_norm": 36.210044860839844, "learning_rate": 1.0233884768967485e-05, "loss": 4.2555, "step": 1799 }, { "epoch": 0.308060927605682, "grad_norm": 24.441967010498047, "learning_rate": 1.0239589275527667e-05, "loss": 2.5473, "step": 1800 }, { "epoch": 0.3082320725654629, "grad_norm": 20.574525833129883, "learning_rate": 1.0245293782087849e-05, "loss": 1.6693, "step": 1801 }, { "epoch": 0.30840321752524386, "grad_norm": 26.07015037536621, "learning_rate": 1.0250998288648032e-05, "loss": 2.7451, "step": 1802 }, { "epoch": 0.3085743624850248, "grad_norm": 29.663963317871094, "learning_rate": 1.0256702795208214e-05, "loss": 4.0482, "step": 1803 }, { "epoch": 0.3087455074448058, "grad_norm": 27.77281379699707, "learning_rate": 1.0262407301768397e-05, "loss": 3.0752, "step": 1804 }, { "epoch": 0.3089166524045867, "grad_norm": 34.827430725097656, "learning_rate": 1.0268111808328579e-05, "loss": 3.7669, "step": 1805 }, { "epoch": 0.30908779736436764, "grad_norm": 37.112361907958984, "learning_rate": 1.0273816314888762e-05, "loss": 4.7788, "step": 1806 }, { "epoch": 0.3092589423241486, "grad_norm": 53.2462272644043, "learning_rate": 1.0279520821448946e-05, "loss": 8.0593, "step": 1807 }, { "epoch": 0.3094300872839295, "grad_norm": 38.18441390991211, "learning_rate": 1.0285225328009127e-05, "loss": 4.2028, "step": 1808 }, { "epoch": 0.30960123224371044, "grad_norm": 13.605740547180176, "learning_rate": 1.029092983456931e-05, "loss": 2.5679, "step": 1809 }, { "epoch": 0.3097723772034914, "grad_norm": 37.292240142822266, "learning_rate": 1.0296634341129492e-05, "loss": 4.0864, "step": 1810 }, { "epoch": 0.3099435221632723, "grad_norm": 10.673694610595703, "learning_rate": 1.0302338847689676e-05, "loss": 0.953, "step": 1811 }, { "epoch": 0.31011466712305324, "grad_norm": 30.847604751586914, "learning_rate": 1.0308043354249858e-05, "loss": 4.4181, "step": 1812 }, { "epoch": 0.3102858120828342, "grad_norm": 25.303640365600586, "learning_rate": 1.031374786081004e-05, "loss": 3.0808, "step": 1813 }, { "epoch": 0.3104569570426151, "grad_norm": 31.284347534179688, "learning_rate": 1.0319452367370223e-05, "loss": 3.3148, "step": 1814 }, { "epoch": 0.31062810200239604, "grad_norm": 18.292266845703125, "learning_rate": 1.0325156873930404e-05, "loss": 1.4786, "step": 1815 }, { "epoch": 0.310799246962177, "grad_norm": 93.66471099853516, "learning_rate": 1.0330861380490588e-05, "loss": 6.8127, "step": 1816 }, { "epoch": 0.3109703919219579, "grad_norm": 38.12440872192383, "learning_rate": 1.033656588705077e-05, "loss": 5.0019, "step": 1817 }, { "epoch": 0.31114153688173884, "grad_norm": 32.61493682861328, "learning_rate": 1.0342270393610954e-05, "loss": 4.3171, "step": 1818 }, { "epoch": 0.3113126818415198, "grad_norm": 38.087646484375, "learning_rate": 1.0347974900171136e-05, "loss": 7.6945, "step": 1819 }, { "epoch": 0.3114838268013007, "grad_norm": 21.899497985839844, "learning_rate": 1.0353679406731318e-05, "loss": 1.7206, "step": 1820 }, { "epoch": 0.31165497176108165, "grad_norm": 113.81354522705078, "learning_rate": 1.0359383913291501e-05, "loss": 7.2513, "step": 1821 }, { "epoch": 0.3118261167208626, "grad_norm": 11.316397666931152, "learning_rate": 1.0365088419851683e-05, "loss": 2.1259, "step": 1822 }, { "epoch": 0.3119972616806435, "grad_norm": 26.67529296875, "learning_rate": 1.0370792926411866e-05, "loss": 3.1664, "step": 1823 }, { "epoch": 0.31216840664042445, "grad_norm": 25.253353118896484, "learning_rate": 1.0376497432972048e-05, "loss": 2.4222, "step": 1824 }, { "epoch": 0.3123395516002054, "grad_norm": 8.143440246582031, "learning_rate": 1.038220193953223e-05, "loss": 0.7973, "step": 1825 }, { "epoch": 0.3125106965599863, "grad_norm": 19.66392707824707, "learning_rate": 1.0387906446092413e-05, "loss": 1.5552, "step": 1826 }, { "epoch": 0.31268184151976725, "grad_norm": 23.67314910888672, "learning_rate": 1.0393610952652595e-05, "loss": 3.07, "step": 1827 }, { "epoch": 0.3128529864795482, "grad_norm": 26.236251831054688, "learning_rate": 1.0399315459212778e-05, "loss": 3.1091, "step": 1828 }, { "epoch": 0.3130241314393291, "grad_norm": 28.10502815246582, "learning_rate": 1.0405019965772961e-05, "loss": 3.0707, "step": 1829 }, { "epoch": 0.31319527639911005, "grad_norm": 34.508846282958984, "learning_rate": 1.0410724472333145e-05, "loss": 4.872, "step": 1830 }, { "epoch": 0.313366421358891, "grad_norm": 34.22414016723633, "learning_rate": 1.0416428978893326e-05, "loss": 3.3169, "step": 1831 }, { "epoch": 0.3135375663186719, "grad_norm": 46.06840515136719, "learning_rate": 1.0422133485453508e-05, "loss": 7.9438, "step": 1832 }, { "epoch": 0.31370871127845285, "grad_norm": 18.041322708129883, "learning_rate": 1.0427837992013692e-05, "loss": 1.629, "step": 1833 }, { "epoch": 0.3138798562382338, "grad_norm": 14.525741577148438, "learning_rate": 1.0433542498573873e-05, "loss": 1.1969, "step": 1834 }, { "epoch": 0.3140510011980147, "grad_norm": 4.135936260223389, "learning_rate": 1.0439247005134057e-05, "loss": 0.7184, "step": 1835 }, { "epoch": 0.31422214615779565, "grad_norm": 41.9599723815918, "learning_rate": 1.0444951511694238e-05, "loss": 4.2524, "step": 1836 }, { "epoch": 0.3143932911175766, "grad_norm": 7.373823642730713, "learning_rate": 1.0450656018254422e-05, "loss": 1.8983, "step": 1837 }, { "epoch": 0.3145644360773575, "grad_norm": 31.084392547607422, "learning_rate": 1.0456360524814603e-05, "loss": 4.0436, "step": 1838 }, { "epoch": 0.31473558103713845, "grad_norm": 11.967267036437988, "learning_rate": 1.0462065031374785e-05, "loss": 1.0282, "step": 1839 }, { "epoch": 0.3149067259969194, "grad_norm": 33.466121673583984, "learning_rate": 1.046776953793497e-05, "loss": 3.9262, "step": 1840 }, { "epoch": 0.3150778709567003, "grad_norm": 39.21562576293945, "learning_rate": 1.0473474044495152e-05, "loss": 4.844, "step": 1841 }, { "epoch": 0.31524901591648125, "grad_norm": 33.843055725097656, "learning_rate": 1.0479178551055335e-05, "loss": 3.5103, "step": 1842 }, { "epoch": 0.3154201608762622, "grad_norm": 35.37272644042969, "learning_rate": 1.0484883057615517e-05, "loss": 3.584, "step": 1843 }, { "epoch": 0.3155913058360431, "grad_norm": 17.376483917236328, "learning_rate": 1.0490587564175699e-05, "loss": 1.4993, "step": 1844 }, { "epoch": 0.31576245079582405, "grad_norm": 45.614688873291016, "learning_rate": 1.0496292070735882e-05, "loss": 8.1587, "step": 1845 }, { "epoch": 0.315933595755605, "grad_norm": 31.185443878173828, "learning_rate": 1.0501996577296064e-05, "loss": 4.1762, "step": 1846 }, { "epoch": 0.3161047407153859, "grad_norm": 33.703514099121094, "learning_rate": 1.0507701083856247e-05, "loss": 4.1885, "step": 1847 }, { "epoch": 0.31627588567516685, "grad_norm": 24.48247718811035, "learning_rate": 1.0513405590416429e-05, "loss": 2.7277, "step": 1848 }, { "epoch": 0.3164470306349478, "grad_norm": 25.966876983642578, "learning_rate": 1.0519110096976612e-05, "loss": 2.8921, "step": 1849 }, { "epoch": 0.3166181755947287, "grad_norm": 35.0124626159668, "learning_rate": 1.0524814603536794e-05, "loss": 4.3145, "step": 1850 }, { "epoch": 0.31678932055450965, "grad_norm": 33.62586975097656, "learning_rate": 1.0530519110096975e-05, "loss": 3.8524, "step": 1851 }, { "epoch": 0.3169604655142906, "grad_norm": 30.16233253479004, "learning_rate": 1.053622361665716e-05, "loss": 3.3166, "step": 1852 }, { "epoch": 0.3171316104740715, "grad_norm": 31.811193466186523, "learning_rate": 1.0541928123217342e-05, "loss": 3.5965, "step": 1853 }, { "epoch": 0.31730275543385245, "grad_norm": 35.756778717041016, "learning_rate": 1.0547632629777526e-05, "loss": 4.4027, "step": 1854 }, { "epoch": 0.31747390039363343, "grad_norm": 17.929304122924805, "learning_rate": 1.0553337136337707e-05, "loss": 2.2128, "step": 1855 }, { "epoch": 0.31764504535341437, "grad_norm": 29.329362869262695, "learning_rate": 1.0559041642897889e-05, "loss": 2.4503, "step": 1856 }, { "epoch": 0.3178161903131953, "grad_norm": 38.31791305541992, "learning_rate": 1.0564746149458072e-05, "loss": 4.1596, "step": 1857 }, { "epoch": 0.31798733527297623, "grad_norm": 26.978776931762695, "learning_rate": 1.0570450656018254e-05, "loss": 2.5148, "step": 1858 }, { "epoch": 0.31815848023275717, "grad_norm": 183.96864318847656, "learning_rate": 1.0576155162578437e-05, "loss": 7.8451, "step": 1859 }, { "epoch": 0.3183296251925381, "grad_norm": 34.898677825927734, "learning_rate": 1.0581859669138619e-05, "loss": 3.631, "step": 1860 }, { "epoch": 0.31850077015231903, "grad_norm": 18.749799728393555, "learning_rate": 1.0587564175698802e-05, "loss": 1.5066, "step": 1861 }, { "epoch": 0.31867191511209997, "grad_norm": 32.26422882080078, "learning_rate": 1.0593268682258984e-05, "loss": 4.0466, "step": 1862 }, { "epoch": 0.3188430600718809, "grad_norm": 9.538769721984863, "learning_rate": 1.0598973188819167e-05, "loss": 1.2133, "step": 1863 }, { "epoch": 0.31901420503166183, "grad_norm": 9.156614303588867, "learning_rate": 1.0604677695379351e-05, "loss": 0.9202, "step": 1864 }, { "epoch": 0.31918534999144277, "grad_norm": 137.56471252441406, "learning_rate": 1.0610382201939533e-05, "loss": 7.6205, "step": 1865 }, { "epoch": 0.3193564949512237, "grad_norm": 24.30291748046875, "learning_rate": 1.0616086708499716e-05, "loss": 2.5704, "step": 1866 }, { "epoch": 0.31952763991100464, "grad_norm": 32.78607940673828, "learning_rate": 1.0621791215059898e-05, "loss": 3.4866, "step": 1867 }, { "epoch": 0.31969878487078557, "grad_norm": 25.44717025756836, "learning_rate": 1.0627495721620081e-05, "loss": 2.8747, "step": 1868 }, { "epoch": 0.3198699298305665, "grad_norm": 71.5486831665039, "learning_rate": 1.0633200228180263e-05, "loss": 6.3834, "step": 1869 }, { "epoch": 0.32004107479034744, "grad_norm": 36.36513900756836, "learning_rate": 1.0638904734740444e-05, "loss": 3.8896, "step": 1870 }, { "epoch": 0.32021221975012837, "grad_norm": 14.369461059570312, "learning_rate": 1.0644609241300628e-05, "loss": 1.2576, "step": 1871 }, { "epoch": 0.3203833647099093, "grad_norm": 34.6867561340332, "learning_rate": 1.065031374786081e-05, "loss": 3.4093, "step": 1872 }, { "epoch": 0.32055450966969024, "grad_norm": 21.84122657775879, "learning_rate": 1.0656018254420993e-05, "loss": 2.2791, "step": 1873 }, { "epoch": 0.32072565462947117, "grad_norm": 21.254135131835938, "learning_rate": 1.0661722760981174e-05, "loss": 2.2054, "step": 1874 }, { "epoch": 0.3208967995892521, "grad_norm": 33.362220764160156, "learning_rate": 1.0667427267541358e-05, "loss": 4.1888, "step": 1875 }, { "epoch": 0.32106794454903304, "grad_norm": 63.412601470947266, "learning_rate": 1.0673131774101541e-05, "loss": 8.5606, "step": 1876 }, { "epoch": 0.32123908950881397, "grad_norm": 14.283455848693848, "learning_rate": 1.0678836280661723e-05, "loss": 0.9998, "step": 1877 }, { "epoch": 0.3214102344685949, "grad_norm": 35.16504669189453, "learning_rate": 1.0684540787221906e-05, "loss": 4.2321, "step": 1878 }, { "epoch": 0.32158137942837584, "grad_norm": 12.61963939666748, "learning_rate": 1.0690245293782088e-05, "loss": 1.5004, "step": 1879 }, { "epoch": 0.32175252438815677, "grad_norm": 32.174076080322266, "learning_rate": 1.0695949800342271e-05, "loss": 3.5576, "step": 1880 }, { "epoch": 0.3219236693479377, "grad_norm": 30.472043991088867, "learning_rate": 1.0701654306902453e-05, "loss": 3.4048, "step": 1881 }, { "epoch": 0.32209481430771864, "grad_norm": 84.8609848022461, "learning_rate": 1.0707358813462635e-05, "loss": 6.2658, "step": 1882 }, { "epoch": 0.32226595926749957, "grad_norm": 25.621240615844727, "learning_rate": 1.0713063320022818e-05, "loss": 2.6459, "step": 1883 }, { "epoch": 0.3224371042272805, "grad_norm": 79.82257080078125, "learning_rate": 1.0718767826583e-05, "loss": 6.3192, "step": 1884 }, { "epoch": 0.32260824918706144, "grad_norm": 7.729169845581055, "learning_rate": 1.0724472333143183e-05, "loss": 0.825, "step": 1885 }, { "epoch": 0.32277939414684237, "grad_norm": 29.313451766967773, "learning_rate": 1.0730176839703367e-05, "loss": 2.9915, "step": 1886 }, { "epoch": 0.3229505391066233, "grad_norm": 6.555768013000488, "learning_rate": 1.073588134626355e-05, "loss": 0.7525, "step": 1887 }, { "epoch": 0.32312168406640424, "grad_norm": 35.07060241699219, "learning_rate": 1.0741585852823732e-05, "loss": 4.2147, "step": 1888 }, { "epoch": 0.32329282902618517, "grad_norm": 10.583313941955566, "learning_rate": 1.0747290359383913e-05, "loss": 0.8557, "step": 1889 }, { "epoch": 0.3234639739859661, "grad_norm": 26.075578689575195, "learning_rate": 1.0752994865944097e-05, "loss": 2.9433, "step": 1890 }, { "epoch": 0.32363511894574704, "grad_norm": 17.7381591796875, "learning_rate": 1.0758699372504278e-05, "loss": 1.4998, "step": 1891 }, { "epoch": 0.32380626390552797, "grad_norm": 16.11162567138672, "learning_rate": 1.0764403879064462e-05, "loss": 1.2949, "step": 1892 }, { "epoch": 0.3239774088653089, "grad_norm": 28.165752410888672, "learning_rate": 1.0770108385624643e-05, "loss": 3.4363, "step": 1893 }, { "epoch": 0.32414855382508984, "grad_norm": 37.37394714355469, "learning_rate": 1.0775812892184825e-05, "loss": 4.7016, "step": 1894 }, { "epoch": 0.32431969878487077, "grad_norm": 35.620826721191406, "learning_rate": 1.0781517398745008e-05, "loss": 4.4153, "step": 1895 }, { "epoch": 0.3244908437446517, "grad_norm": 35.83405303955078, "learning_rate": 1.078722190530519e-05, "loss": 4.4295, "step": 1896 }, { "epoch": 0.32466198870443264, "grad_norm": 12.846619606018066, "learning_rate": 1.0792926411865374e-05, "loss": 1.4411, "step": 1897 }, { "epoch": 0.32483313366421357, "grad_norm": 11.455179214477539, "learning_rate": 1.0798630918425557e-05, "loss": 1.1335, "step": 1898 }, { "epoch": 0.3250042786239945, "grad_norm": 36.278289794921875, "learning_rate": 1.080433542498574e-05, "loss": 3.6505, "step": 1899 }, { "epoch": 0.32517542358377544, "grad_norm": 37.59969711303711, "learning_rate": 1.0810039931545922e-05, "loss": 5.1473, "step": 1900 }, { "epoch": 0.32534656854355637, "grad_norm": 27.851537704467773, "learning_rate": 1.0815744438106104e-05, "loss": 2.792, "step": 1901 }, { "epoch": 0.3255177135033373, "grad_norm": 20.874591827392578, "learning_rate": 1.0821448944666287e-05, "loss": 2.5421, "step": 1902 }, { "epoch": 0.32568885846311824, "grad_norm": 12.82272720336914, "learning_rate": 1.0827153451226469e-05, "loss": 0.9663, "step": 1903 }, { "epoch": 0.32586000342289917, "grad_norm": 27.367874145507812, "learning_rate": 1.0832857957786652e-05, "loss": 2.6934, "step": 1904 }, { "epoch": 0.3260311483826801, "grad_norm": 31.575483322143555, "learning_rate": 1.0838562464346834e-05, "loss": 3.3276, "step": 1905 }, { "epoch": 0.3262022933424611, "grad_norm": 36.26526641845703, "learning_rate": 1.0844266970907017e-05, "loss": 4.196, "step": 1906 }, { "epoch": 0.326373438302242, "grad_norm": 20.60125160217285, "learning_rate": 1.0849971477467199e-05, "loss": 1.5247, "step": 1907 }, { "epoch": 0.32654458326202296, "grad_norm": 19.104351043701172, "learning_rate": 1.085567598402738e-05, "loss": 1.9953, "step": 1908 }, { "epoch": 0.3267157282218039, "grad_norm": 31.618993759155273, "learning_rate": 1.0861380490587566e-05, "loss": 3.2496, "step": 1909 }, { "epoch": 0.3268868731815848, "grad_norm": 20.25756072998047, "learning_rate": 1.0867084997147747e-05, "loss": 1.4173, "step": 1910 }, { "epoch": 0.32705801814136576, "grad_norm": 19.579376220703125, "learning_rate": 1.087278950370793e-05, "loss": 1.4559, "step": 1911 }, { "epoch": 0.3272291631011467, "grad_norm": 33.51919174194336, "learning_rate": 1.0878494010268112e-05, "loss": 4.3546, "step": 1912 }, { "epoch": 0.3274003080609276, "grad_norm": 34.54380416870117, "learning_rate": 1.0884198516828294e-05, "loss": 3.8532, "step": 1913 }, { "epoch": 0.32757145302070856, "grad_norm": 43.39759063720703, "learning_rate": 1.0889903023388477e-05, "loss": 5.7, "step": 1914 }, { "epoch": 0.3277425979804895, "grad_norm": 31.343278884887695, "learning_rate": 1.0895607529948659e-05, "loss": 3.6086, "step": 1915 }, { "epoch": 0.3279137429402704, "grad_norm": 37.40540313720703, "learning_rate": 1.0901312036508843e-05, "loss": 3.6012, "step": 1916 }, { "epoch": 0.32808488790005136, "grad_norm": 10.474573135375977, "learning_rate": 1.0907016543069024e-05, "loss": 0.9649, "step": 1917 }, { "epoch": 0.3282560328598323, "grad_norm": 26.88408088684082, "learning_rate": 1.0912721049629208e-05, "loss": 2.6185, "step": 1918 }, { "epoch": 0.3284271778196132, "grad_norm": 24.986539840698242, "learning_rate": 1.091842555618939e-05, "loss": 2.0861, "step": 1919 }, { "epoch": 0.32859832277939416, "grad_norm": 36.754337310791016, "learning_rate": 1.0924130062749573e-05, "loss": 4.4734, "step": 1920 }, { "epoch": 0.3287694677391751, "grad_norm": 36.0711555480957, "learning_rate": 1.0929834569309756e-05, "loss": 3.7612, "step": 1921 }, { "epoch": 0.328940612698956, "grad_norm": 33.72808074951172, "learning_rate": 1.0935539075869938e-05, "loss": 3.6817, "step": 1922 }, { "epoch": 0.32911175765873696, "grad_norm": 31.21643829345703, "learning_rate": 1.0941243582430121e-05, "loss": 3.1247, "step": 1923 }, { "epoch": 0.3292829026185179, "grad_norm": 26.2045955657959, "learning_rate": 1.0946948088990303e-05, "loss": 3.1474, "step": 1924 }, { "epoch": 0.3294540475782988, "grad_norm": 30.681350708007812, "learning_rate": 1.0952652595550484e-05, "loss": 3.1958, "step": 1925 }, { "epoch": 0.32962519253807976, "grad_norm": 57.95525360107422, "learning_rate": 1.0958357102110668e-05, "loss": 8.8044, "step": 1926 }, { "epoch": 0.3297963374978607, "grad_norm": 178.06443786621094, "learning_rate": 1.096406160867085e-05, "loss": 8.7701, "step": 1927 }, { "epoch": 0.3299674824576416, "grad_norm": 35.5237922668457, "learning_rate": 1.0969766115231033e-05, "loss": 3.8513, "step": 1928 }, { "epoch": 0.33013862741742256, "grad_norm": 39.186771392822266, "learning_rate": 1.0975470621791215e-05, "loss": 4.4358, "step": 1929 }, { "epoch": 0.3303097723772035, "grad_norm": 25.387964248657227, "learning_rate": 1.0981175128351398e-05, "loss": 2.6496, "step": 1930 }, { "epoch": 0.3304809173369844, "grad_norm": 41.67265319824219, "learning_rate": 1.098687963491158e-05, "loss": 4.5891, "step": 1931 }, { "epoch": 0.33065206229676536, "grad_norm": 36.71438217163086, "learning_rate": 1.0992584141471763e-05, "loss": 4.1564, "step": 1932 }, { "epoch": 0.3308232072565463, "grad_norm": 12.194602012634277, "learning_rate": 1.0998288648031946e-05, "loss": 1.3654, "step": 1933 }, { "epoch": 0.3309943522163272, "grad_norm": 30.5019474029541, "learning_rate": 1.1003993154592128e-05, "loss": 2.9248, "step": 1934 }, { "epoch": 0.33116549717610816, "grad_norm": 30.596206665039062, "learning_rate": 1.1009697661152311e-05, "loss": 3.6483, "step": 1935 }, { "epoch": 0.3313366421358891, "grad_norm": 190.34573364257812, "learning_rate": 1.1015402167712493e-05, "loss": 9.976, "step": 1936 }, { "epoch": 0.33150778709567, "grad_norm": 23.65143585205078, "learning_rate": 1.1021106674272677e-05, "loss": 2.6501, "step": 1937 }, { "epoch": 0.33167893205545096, "grad_norm": 32.524288177490234, "learning_rate": 1.1026811180832858e-05, "loss": 3.6287, "step": 1938 }, { "epoch": 0.3318500770152319, "grad_norm": 24.90087890625, "learning_rate": 1.103251568739304e-05, "loss": 2.8126, "step": 1939 }, { "epoch": 0.3320212219750128, "grad_norm": 11.670059204101562, "learning_rate": 1.1038220193953223e-05, "loss": 0.9268, "step": 1940 }, { "epoch": 0.33219236693479376, "grad_norm": 20.560199737548828, "learning_rate": 1.1043924700513405e-05, "loss": 2.0298, "step": 1941 }, { "epoch": 0.3323635118945747, "grad_norm": 32.11676788330078, "learning_rate": 1.1049629207073588e-05, "loss": 3.379, "step": 1942 }, { "epoch": 0.3325346568543556, "grad_norm": 31.273881912231445, "learning_rate": 1.1055333713633772e-05, "loss": 3.6115, "step": 1943 }, { "epoch": 0.33270580181413656, "grad_norm": 76.62176513671875, "learning_rate": 1.1061038220193953e-05, "loss": 6.2689, "step": 1944 }, { "epoch": 0.3328769467739175, "grad_norm": 29.79790496826172, "learning_rate": 1.1066742726754137e-05, "loss": 2.9922, "step": 1945 }, { "epoch": 0.3330480917336984, "grad_norm": 28.528804779052734, "learning_rate": 1.1072447233314318e-05, "loss": 3.192, "step": 1946 }, { "epoch": 0.33321923669347936, "grad_norm": 101.99966430664062, "learning_rate": 1.1078151739874502e-05, "loss": 6.9582, "step": 1947 }, { "epoch": 0.3333903816532603, "grad_norm": 33.45838165283203, "learning_rate": 1.1083856246434684e-05, "loss": 4.3572, "step": 1948 }, { "epoch": 0.3335615266130412, "grad_norm": 31.591665267944336, "learning_rate": 1.1089560752994867e-05, "loss": 3.7906, "step": 1949 }, { "epoch": 0.33373267157282216, "grad_norm": 42.0833740234375, "learning_rate": 1.1095265259555049e-05, "loss": 4.95, "step": 1950 }, { "epoch": 0.3339038165326031, "grad_norm": 94.96964263916016, "learning_rate": 1.110096976611523e-05, "loss": 6.5888, "step": 1951 }, { "epoch": 0.334074961492384, "grad_norm": 35.450111389160156, "learning_rate": 1.1106674272675414e-05, "loss": 4.8891, "step": 1952 }, { "epoch": 0.33424610645216496, "grad_norm": 32.57542037963867, "learning_rate": 1.1112378779235595e-05, "loss": 4.2762, "step": 1953 }, { "epoch": 0.3344172514119459, "grad_norm": 24.635988235473633, "learning_rate": 1.1118083285795779e-05, "loss": 2.6646, "step": 1954 }, { "epoch": 0.3345883963717268, "grad_norm": 22.50608253479004, "learning_rate": 1.1123787792355962e-05, "loss": 2.1994, "step": 1955 }, { "epoch": 0.33475954133150776, "grad_norm": 35.915611267089844, "learning_rate": 1.1129492298916144e-05, "loss": 4.3539, "step": 1956 }, { "epoch": 0.33493068629128875, "grad_norm": 39.85637283325195, "learning_rate": 1.1135196805476327e-05, "loss": 8.0766, "step": 1957 }, { "epoch": 0.3351018312510697, "grad_norm": 31.60897445678711, "learning_rate": 1.1140901312036509e-05, "loss": 3.5052, "step": 1958 }, { "epoch": 0.3352729762108506, "grad_norm": 10.988346099853516, "learning_rate": 1.1146605818596692e-05, "loss": 2.0192, "step": 1959 }, { "epoch": 0.33544412117063155, "grad_norm": 77.31686401367188, "learning_rate": 1.1152310325156874e-05, "loss": 6.7873, "step": 1960 }, { "epoch": 0.3356152661304125, "grad_norm": 37.3287239074707, "learning_rate": 1.1158014831717057e-05, "loss": 4.5134, "step": 1961 }, { "epoch": 0.3357864110901934, "grad_norm": 28.940874099731445, "learning_rate": 1.1163719338277239e-05, "loss": 3.5488, "step": 1962 }, { "epoch": 0.33595755604997435, "grad_norm": 27.005020141601562, "learning_rate": 1.116942384483742e-05, "loss": 3.4131, "step": 1963 }, { "epoch": 0.3361287010097553, "grad_norm": 23.171354293823242, "learning_rate": 1.1175128351397604e-05, "loss": 3.0202, "step": 1964 }, { "epoch": 0.3362998459695362, "grad_norm": 33.08194351196289, "learning_rate": 1.1180832857957786e-05, "loss": 3.5406, "step": 1965 }, { "epoch": 0.33647099092931715, "grad_norm": 42.914058685302734, "learning_rate": 1.118653736451797e-05, "loss": 7.7143, "step": 1966 }, { "epoch": 0.3366421358890981, "grad_norm": 6.044030666351318, "learning_rate": 1.1192241871078152e-05, "loss": 1.0934, "step": 1967 }, { "epoch": 0.336813280848879, "grad_norm": 13.652383804321289, "learning_rate": 1.1197946377638336e-05, "loss": 1.2611, "step": 1968 }, { "epoch": 0.33698442580865995, "grad_norm": 120.25743103027344, "learning_rate": 1.1203650884198518e-05, "loss": 6.9692, "step": 1969 }, { "epoch": 0.3371555707684409, "grad_norm": 138.58935546875, "learning_rate": 1.12093553907587e-05, "loss": 6.7316, "step": 1970 }, { "epoch": 0.3373267157282218, "grad_norm": 30.030006408691406, "learning_rate": 1.1215059897318883e-05, "loss": 4.1817, "step": 1971 }, { "epoch": 0.33749786068800275, "grad_norm": 9.535407066345215, "learning_rate": 1.1220764403879064e-05, "loss": 0.9512, "step": 1972 }, { "epoch": 0.3376690056477837, "grad_norm": 25.748254776000977, "learning_rate": 1.1226468910439248e-05, "loss": 3.1973, "step": 1973 }, { "epoch": 0.3378401506075646, "grad_norm": 29.184724807739258, "learning_rate": 1.123217341699943e-05, "loss": 3.5403, "step": 1974 }, { "epoch": 0.33801129556734555, "grad_norm": 36.09633255004883, "learning_rate": 1.1237877923559611e-05, "loss": 4.1013, "step": 1975 }, { "epoch": 0.3381824405271265, "grad_norm": 31.967252731323242, "learning_rate": 1.1243582430119794e-05, "loss": 3.2354, "step": 1976 }, { "epoch": 0.3383535854869074, "grad_norm": 38.74686813354492, "learning_rate": 1.1249286936679976e-05, "loss": 4.5663, "step": 1977 }, { "epoch": 0.33852473044668835, "grad_norm": 30.3746395111084, "learning_rate": 1.1254991443240161e-05, "loss": 3.3973, "step": 1978 }, { "epoch": 0.3386958754064693, "grad_norm": 11.366987228393555, "learning_rate": 1.1260695949800343e-05, "loss": 0.8323, "step": 1979 }, { "epoch": 0.3388670203662502, "grad_norm": 20.15157699584961, "learning_rate": 1.1266400456360526e-05, "loss": 1.5111, "step": 1980 }, { "epoch": 0.33903816532603115, "grad_norm": 25.638330459594727, "learning_rate": 1.1272104962920708e-05, "loss": 2.7039, "step": 1981 }, { "epoch": 0.3392093102858121, "grad_norm": 30.38153839111328, "learning_rate": 1.127780946948089e-05, "loss": 3.6275, "step": 1982 }, { "epoch": 0.339380455245593, "grad_norm": 31.235469818115234, "learning_rate": 1.1283513976041073e-05, "loss": 4.032, "step": 1983 }, { "epoch": 0.33955160020537395, "grad_norm": 36.95757293701172, "learning_rate": 1.1289218482601255e-05, "loss": 4.052, "step": 1984 }, { "epoch": 0.3397227451651549, "grad_norm": 5.83810567855835, "learning_rate": 1.1294922989161438e-05, "loss": 0.7531, "step": 1985 }, { "epoch": 0.3398938901249358, "grad_norm": 187.32872009277344, "learning_rate": 1.130062749572162e-05, "loss": 8.1223, "step": 1986 }, { "epoch": 0.34006503508471675, "grad_norm": 10.221015930175781, "learning_rate": 1.1306332002281803e-05, "loss": 1.3128, "step": 1987 }, { "epoch": 0.3402361800444977, "grad_norm": 23.46990203857422, "learning_rate": 1.1312036508841985e-05, "loss": 2.2877, "step": 1988 }, { "epoch": 0.3404073250042786, "grad_norm": 204.71218872070312, "learning_rate": 1.1317741015402168e-05, "loss": 9.0911, "step": 1989 }, { "epoch": 0.34057846996405955, "grad_norm": 11.691418647766113, "learning_rate": 1.1323445521962352e-05, "loss": 2.0669, "step": 1990 }, { "epoch": 0.3407496149238405, "grad_norm": 34.32474899291992, "learning_rate": 1.1329150028522533e-05, "loss": 3.8131, "step": 1991 }, { "epoch": 0.3409207598836214, "grad_norm": 15.316189765930176, "learning_rate": 1.1334854535082717e-05, "loss": 1.4449, "step": 1992 }, { "epoch": 0.34109190484340235, "grad_norm": 33.847110748291016, "learning_rate": 1.1340559041642898e-05, "loss": 3.6209, "step": 1993 }, { "epoch": 0.3412630498031833, "grad_norm": 30.83047103881836, "learning_rate": 1.134626354820308e-05, "loss": 3.3044, "step": 1994 }, { "epoch": 0.3414341947629642, "grad_norm": 23.169050216674805, "learning_rate": 1.1351968054763263e-05, "loss": 2.7778, "step": 1995 }, { "epoch": 0.34160533972274515, "grad_norm": 28.009946823120117, "learning_rate": 1.1357672561323445e-05, "loss": 2.5658, "step": 1996 }, { "epoch": 0.3417764846825261, "grad_norm": 24.620206832885742, "learning_rate": 1.1363377067883628e-05, "loss": 2.8611, "step": 1997 }, { "epoch": 0.341947629642307, "grad_norm": 35.302894592285156, "learning_rate": 1.136908157444381e-05, "loss": 3.8368, "step": 1998 }, { "epoch": 0.34211877460208795, "grad_norm": 48.49169921875, "learning_rate": 1.1374786081003993e-05, "loss": 8.3039, "step": 1999 }, { "epoch": 0.3422899195618689, "grad_norm": 26.473003387451172, "learning_rate": 1.1380490587564177e-05, "loss": 2.6571, "step": 2000 }, { "epoch": 0.3424610645216498, "grad_norm": 8.975080490112305, "learning_rate": 1.1386195094124359e-05, "loss": 0.8311, "step": 2001 }, { "epoch": 0.34263220948143075, "grad_norm": 29.154399871826172, "learning_rate": 1.1391899600684542e-05, "loss": 3.3092, "step": 2002 }, { "epoch": 0.3428033544412117, "grad_norm": 9.116958618164062, "learning_rate": 1.1397604107244724e-05, "loss": 1.109, "step": 2003 }, { "epoch": 0.3429744994009926, "grad_norm": 150.9268341064453, "learning_rate": 1.1403308613804907e-05, "loss": 6.7063, "step": 2004 }, { "epoch": 0.34314564436077355, "grad_norm": 28.97213363647461, "learning_rate": 1.1409013120365089e-05, "loss": 3.4316, "step": 2005 }, { "epoch": 0.3433167893205545, "grad_norm": 35.343074798583984, "learning_rate": 1.1414717626925272e-05, "loss": 4.1921, "step": 2006 }, { "epoch": 0.34348793428033547, "grad_norm": 26.21539306640625, "learning_rate": 1.1420422133485454e-05, "loss": 2.8775, "step": 2007 }, { "epoch": 0.3436590792401164, "grad_norm": 24.8580322265625, "learning_rate": 1.1426126640045635e-05, "loss": 2.7428, "step": 2008 }, { "epoch": 0.34383022419989734, "grad_norm": 18.229679107666016, "learning_rate": 1.1431831146605819e-05, "loss": 2.1508, "step": 2009 }, { "epoch": 0.34400136915967827, "grad_norm": 12.01388168334961, "learning_rate": 1.1437535653166e-05, "loss": 1.002, "step": 2010 }, { "epoch": 0.3441725141194592, "grad_norm": 101.5674819946289, "learning_rate": 1.1443240159726184e-05, "loss": 6.9708, "step": 2011 }, { "epoch": 0.34434365907924014, "grad_norm": 135.65138244628906, "learning_rate": 1.1448944666286367e-05, "loss": 6.0953, "step": 2012 }, { "epoch": 0.34451480403902107, "grad_norm": 28.10844612121582, "learning_rate": 1.1454649172846549e-05, "loss": 3.5016, "step": 2013 }, { "epoch": 0.344685948998802, "grad_norm": 31.837894439697266, "learning_rate": 1.1460353679406732e-05, "loss": 3.2448, "step": 2014 }, { "epoch": 0.34485709395858294, "grad_norm": 28.26076889038086, "learning_rate": 1.1466058185966914e-05, "loss": 3.1378, "step": 2015 }, { "epoch": 0.34502823891836387, "grad_norm": 32.99501419067383, "learning_rate": 1.1471762692527097e-05, "loss": 3.4328, "step": 2016 }, { "epoch": 0.3451993838781448, "grad_norm": 31.268230438232422, "learning_rate": 1.1477467199087279e-05, "loss": 4.0378, "step": 2017 }, { "epoch": 0.34537052883792574, "grad_norm": 32.19254684448242, "learning_rate": 1.1483171705647462e-05, "loss": 4.356, "step": 2018 }, { "epoch": 0.34554167379770667, "grad_norm": 28.953779220581055, "learning_rate": 1.1488876212207644e-05, "loss": 3.8967, "step": 2019 }, { "epoch": 0.3457128187574876, "grad_norm": 26.264999389648438, "learning_rate": 1.1494580718767826e-05, "loss": 2.7881, "step": 2020 }, { "epoch": 0.34588396371726854, "grad_norm": 21.80779457092285, "learning_rate": 1.150028522532801e-05, "loss": 2.0569, "step": 2021 }, { "epoch": 0.34605510867704947, "grad_norm": 5.897726535797119, "learning_rate": 1.1505989731888191e-05, "loss": 0.6854, "step": 2022 }, { "epoch": 0.3462262536368304, "grad_norm": 18.685945510864258, "learning_rate": 1.1511694238448376e-05, "loss": 1.7189, "step": 2023 }, { "epoch": 0.34639739859661134, "grad_norm": 16.55164909362793, "learning_rate": 1.1517398745008558e-05, "loss": 1.6266, "step": 2024 }, { "epoch": 0.3465685435563923, "grad_norm": 26.497346878051758, "learning_rate": 1.152310325156874e-05, "loss": 3.1355, "step": 2025 }, { "epoch": 0.3467396885161732, "grad_norm": 36.22391128540039, "learning_rate": 1.1528807758128923e-05, "loss": 4.2871, "step": 2026 }, { "epoch": 0.34691083347595414, "grad_norm": 25.69757080078125, "learning_rate": 1.1534512264689104e-05, "loss": 2.4604, "step": 2027 }, { "epoch": 0.3470819784357351, "grad_norm": 34.47371292114258, "learning_rate": 1.1540216771249288e-05, "loss": 4.5727, "step": 2028 }, { "epoch": 0.347253123395516, "grad_norm": 25.829330444335938, "learning_rate": 1.154592127780947e-05, "loss": 2.3708, "step": 2029 }, { "epoch": 0.34742426835529694, "grad_norm": 23.152074813842773, "learning_rate": 1.1551625784369653e-05, "loss": 2.5885, "step": 2030 }, { "epoch": 0.3475954133150779, "grad_norm": 33.27009582519531, "learning_rate": 1.1557330290929834e-05, "loss": 4.0326, "step": 2031 }, { "epoch": 0.3477665582748588, "grad_norm": 11.642922401428223, "learning_rate": 1.1563034797490016e-05, "loss": 1.3036, "step": 2032 }, { "epoch": 0.34793770323463974, "grad_norm": 16.035924911499023, "learning_rate": 1.15687393040502e-05, "loss": 1.3584, "step": 2033 }, { "epoch": 0.3481088481944207, "grad_norm": 38.5884895324707, "learning_rate": 1.1574443810610381e-05, "loss": 5.2381, "step": 2034 }, { "epoch": 0.3482799931542016, "grad_norm": 34.79248046875, "learning_rate": 1.1580148317170566e-05, "loss": 3.4977, "step": 2035 }, { "epoch": 0.34845113811398254, "grad_norm": 24.086618423461914, "learning_rate": 1.1585852823730748e-05, "loss": 2.489, "step": 2036 }, { "epoch": 0.3486222830737635, "grad_norm": 17.970691680908203, "learning_rate": 1.1591557330290931e-05, "loss": 1.2174, "step": 2037 }, { "epoch": 0.3487934280335444, "grad_norm": 27.199962615966797, "learning_rate": 1.1597261836851113e-05, "loss": 2.4304, "step": 2038 }, { "epoch": 0.34896457299332534, "grad_norm": 36.157230377197266, "learning_rate": 1.1602966343411295e-05, "loss": 4.5914, "step": 2039 }, { "epoch": 0.3491357179531063, "grad_norm": 30.98073387145996, "learning_rate": 1.1608670849971478e-05, "loss": 3.1108, "step": 2040 }, { "epoch": 0.3493068629128872, "grad_norm": 4.110781192779541, "learning_rate": 1.161437535653166e-05, "loss": 0.6784, "step": 2041 }, { "epoch": 0.34947800787266814, "grad_norm": 7.259744644165039, "learning_rate": 1.1620079863091843e-05, "loss": 0.7546, "step": 2042 }, { "epoch": 0.3496491528324491, "grad_norm": 9.056280136108398, "learning_rate": 1.1625784369652025e-05, "loss": 0.8102, "step": 2043 }, { "epoch": 0.34982029779223, "grad_norm": 17.079927444458008, "learning_rate": 1.1631488876212207e-05, "loss": 1.8825, "step": 2044 }, { "epoch": 0.34999144275201094, "grad_norm": 5.583414077758789, "learning_rate": 1.163719338277239e-05, "loss": 0.6958, "step": 2045 }, { "epoch": 0.3501625877117919, "grad_norm": 32.52211380004883, "learning_rate": 1.1642897889332573e-05, "loss": 3.8308, "step": 2046 }, { "epoch": 0.3503337326715728, "grad_norm": 8.453152656555176, "learning_rate": 1.1648602395892757e-05, "loss": 0.9997, "step": 2047 }, { "epoch": 0.35050487763135374, "grad_norm": 17.828163146972656, "learning_rate": 1.1654306902452938e-05, "loss": 2.0197, "step": 2048 }, { "epoch": 0.3506760225911347, "grad_norm": 33.86958312988281, "learning_rate": 1.1660011409013122e-05, "loss": 3.5889, "step": 2049 }, { "epoch": 0.3508471675509156, "grad_norm": 39.53785705566406, "learning_rate": 1.1665715915573303e-05, "loss": 4.3322, "step": 2050 }, { "epoch": 0.35101831251069654, "grad_norm": 119.68132019042969, "learning_rate": 1.1671420422133485e-05, "loss": 8.5534, "step": 2051 }, { "epoch": 0.3511894574704775, "grad_norm": 20.703731536865234, "learning_rate": 1.1677124928693669e-05, "loss": 1.9145, "step": 2052 }, { "epoch": 0.3513606024302584, "grad_norm": 32.62479019165039, "learning_rate": 1.168282943525385e-05, "loss": 3.4411, "step": 2053 }, { "epoch": 0.35153174739003934, "grad_norm": 28.38721466064453, "learning_rate": 1.1688533941814034e-05, "loss": 2.913, "step": 2054 }, { "epoch": 0.3517028923498203, "grad_norm": 11.139078140258789, "learning_rate": 1.1694238448374215e-05, "loss": 1.2331, "step": 2055 }, { "epoch": 0.3518740373096012, "grad_norm": 36.095458984375, "learning_rate": 1.1699942954934399e-05, "loss": 4.4497, "step": 2056 }, { "epoch": 0.35204518226938214, "grad_norm": 17.7105655670166, "learning_rate": 1.170564746149458e-05, "loss": 1.341, "step": 2057 }, { "epoch": 0.35221632722916313, "grad_norm": 34.70029067993164, "learning_rate": 1.1711351968054764e-05, "loss": 3.8577, "step": 2058 }, { "epoch": 0.35238747218894406, "grad_norm": 30.967939376831055, "learning_rate": 1.1717056474614947e-05, "loss": 3.5998, "step": 2059 }, { "epoch": 0.352558617148725, "grad_norm": 175.67909240722656, "learning_rate": 1.1722760981175129e-05, "loss": 7.5725, "step": 2060 }, { "epoch": 0.35272976210850593, "grad_norm": 14.09093189239502, "learning_rate": 1.1728465487735312e-05, "loss": 1.1863, "step": 2061 }, { "epoch": 0.35290090706828686, "grad_norm": 16.4505672454834, "learning_rate": 1.1734169994295494e-05, "loss": 1.3923, "step": 2062 }, { "epoch": 0.3530720520280678, "grad_norm": 30.69254493713379, "learning_rate": 1.1739874500855676e-05, "loss": 4.0609, "step": 2063 }, { "epoch": 0.35324319698784873, "grad_norm": 35.82154846191406, "learning_rate": 1.1745579007415859e-05, "loss": 4.1915, "step": 2064 }, { "epoch": 0.35341434194762966, "grad_norm": 34.619754791259766, "learning_rate": 1.175128351397604e-05, "loss": 4.8903, "step": 2065 }, { "epoch": 0.3535854869074106, "grad_norm": 13.456661224365234, "learning_rate": 1.1756988020536224e-05, "loss": 1.4971, "step": 2066 }, { "epoch": 0.35375663186719153, "grad_norm": 34.76420974731445, "learning_rate": 1.1762692527096406e-05, "loss": 3.9249, "step": 2067 }, { "epoch": 0.35392777682697246, "grad_norm": 11.180761337280273, "learning_rate": 1.1768397033656589e-05, "loss": 0.7, "step": 2068 }, { "epoch": 0.3540989217867534, "grad_norm": 195.03485107421875, "learning_rate": 1.1774101540216772e-05, "loss": 6.9708, "step": 2069 }, { "epoch": 0.35427006674653433, "grad_norm": 34.15081787109375, "learning_rate": 1.1779806046776954e-05, "loss": 4.0197, "step": 2070 }, { "epoch": 0.35444121170631526, "grad_norm": 44.15553283691406, "learning_rate": 1.1785510553337137e-05, "loss": 8.034, "step": 2071 }, { "epoch": 0.3546123566660962, "grad_norm": 36.1580924987793, "learning_rate": 1.1791215059897319e-05, "loss": 4.3774, "step": 2072 }, { "epoch": 0.35478350162587713, "grad_norm": 37.583351135253906, "learning_rate": 1.1796919566457503e-05, "loss": 5.0443, "step": 2073 }, { "epoch": 0.35495464658565806, "grad_norm": 7.443456172943115, "learning_rate": 1.1802624073017684e-05, "loss": 0.7081, "step": 2074 }, { "epoch": 0.355125791545439, "grad_norm": 27.195236206054688, "learning_rate": 1.1808328579577866e-05, "loss": 2.7896, "step": 2075 }, { "epoch": 0.35529693650521993, "grad_norm": 10.81725788116455, "learning_rate": 1.181403308613805e-05, "loss": 2.1049, "step": 2076 }, { "epoch": 0.35546808146500086, "grad_norm": 32.889869689941406, "learning_rate": 1.1819737592698231e-05, "loss": 3.9205, "step": 2077 }, { "epoch": 0.3556392264247818, "grad_norm": 119.37525939941406, "learning_rate": 1.1825442099258414e-05, "loss": 7.0729, "step": 2078 }, { "epoch": 0.35581037138456273, "grad_norm": 13.211540222167969, "learning_rate": 1.1831146605818596e-05, "loss": 1.5046, "step": 2079 }, { "epoch": 0.35598151634434366, "grad_norm": 29.677011489868164, "learning_rate": 1.183685111237878e-05, "loss": 3.4441, "step": 2080 }, { "epoch": 0.3561526613041246, "grad_norm": 116.09097290039062, "learning_rate": 1.1842555618938963e-05, "loss": 6.9657, "step": 2081 }, { "epoch": 0.35632380626390553, "grad_norm": 36.9529914855957, "learning_rate": 1.1848260125499144e-05, "loss": 5.1966, "step": 2082 }, { "epoch": 0.35649495122368646, "grad_norm": 32.45378112792969, "learning_rate": 1.1853964632059328e-05, "loss": 3.8259, "step": 2083 }, { "epoch": 0.3566660961834674, "grad_norm": 28.279193878173828, "learning_rate": 1.185966913861951e-05, "loss": 3.0802, "step": 2084 }, { "epoch": 0.35683724114324833, "grad_norm": 16.36111831665039, "learning_rate": 1.1865373645179693e-05, "loss": 1.6254, "step": 2085 }, { "epoch": 0.35700838610302926, "grad_norm": 33.62881851196289, "learning_rate": 1.1871078151739875e-05, "loss": 3.482, "step": 2086 }, { "epoch": 0.3571795310628102, "grad_norm": 22.785282135009766, "learning_rate": 1.1876782658300058e-05, "loss": 2.5492, "step": 2087 }, { "epoch": 0.35735067602259113, "grad_norm": 18.783733367919922, "learning_rate": 1.188248716486024e-05, "loss": 2.1471, "step": 2088 }, { "epoch": 0.35752182098237206, "grad_norm": 25.175399780273438, "learning_rate": 1.1888191671420421e-05, "loss": 2.901, "step": 2089 }, { "epoch": 0.357692965942153, "grad_norm": 32.070228576660156, "learning_rate": 1.1893896177980605e-05, "loss": 4.0126, "step": 2090 }, { "epoch": 0.35786411090193393, "grad_norm": 30.165206909179688, "learning_rate": 1.1899600684540786e-05, "loss": 3.1196, "step": 2091 }, { "epoch": 0.35803525586171486, "grad_norm": 25.695375442504883, "learning_rate": 1.1905305191100971e-05, "loss": 2.5124, "step": 2092 }, { "epoch": 0.3582064008214958, "grad_norm": 7.505849838256836, "learning_rate": 1.1911009697661153e-05, "loss": 1.0043, "step": 2093 }, { "epoch": 0.35837754578127673, "grad_norm": 28.15729522705078, "learning_rate": 1.1916714204221335e-05, "loss": 3.8256, "step": 2094 }, { "epoch": 0.35854869074105766, "grad_norm": 15.077316284179688, "learning_rate": 1.1922418710781518e-05, "loss": 0.9039, "step": 2095 }, { "epoch": 0.3587198357008386, "grad_norm": 11.068819999694824, "learning_rate": 1.19281232173417e-05, "loss": 0.9256, "step": 2096 }, { "epoch": 0.35889098066061953, "grad_norm": 30.34836769104004, "learning_rate": 1.1933827723901883e-05, "loss": 3.3198, "step": 2097 }, { "epoch": 0.35906212562040046, "grad_norm": 92.60661315917969, "learning_rate": 1.1939532230462065e-05, "loss": 5.7395, "step": 2098 }, { "epoch": 0.3592332705801814, "grad_norm": 26.518394470214844, "learning_rate": 1.1945236737022248e-05, "loss": 2.7506, "step": 2099 }, { "epoch": 0.35940441553996233, "grad_norm": 4.0069780349731445, "learning_rate": 1.195094124358243e-05, "loss": 0.622, "step": 2100 }, { "epoch": 0.35957556049974326, "grad_norm": 25.66058349609375, "learning_rate": 1.1956645750142612e-05, "loss": 2.4436, "step": 2101 }, { "epoch": 0.3597467054595242, "grad_norm": 16.090246200561523, "learning_rate": 1.1962350256702795e-05, "loss": 1.4181, "step": 2102 }, { "epoch": 0.35991785041930513, "grad_norm": 9.653539657592773, "learning_rate": 1.1968054763262978e-05, "loss": 1.1303, "step": 2103 }, { "epoch": 0.36008899537908606, "grad_norm": 26.997007369995117, "learning_rate": 1.1973759269823162e-05, "loss": 2.8454, "step": 2104 }, { "epoch": 0.360260140338867, "grad_norm": 35.292945861816406, "learning_rate": 1.1979463776383344e-05, "loss": 4.4265, "step": 2105 }, { "epoch": 0.36043128529864793, "grad_norm": 9.962848663330078, "learning_rate": 1.1985168282943527e-05, "loss": 1.1083, "step": 2106 }, { "epoch": 0.36060243025842886, "grad_norm": 21.34442138671875, "learning_rate": 1.1990872789503709e-05, "loss": 1.9815, "step": 2107 }, { "epoch": 0.3607735752182098, "grad_norm": 63.102256774902344, "learning_rate": 1.199657729606389e-05, "loss": 8.2906, "step": 2108 }, { "epoch": 0.3609447201779908, "grad_norm": 31.640159606933594, "learning_rate": 1.2002281802624074e-05, "loss": 3.9734, "step": 2109 }, { "epoch": 0.3611158651377717, "grad_norm": 29.008909225463867, "learning_rate": 1.2007986309184255e-05, "loss": 2.8619, "step": 2110 }, { "epoch": 0.36128701009755265, "grad_norm": 158.99563598632812, "learning_rate": 1.2013690815744439e-05, "loss": 8.8876, "step": 2111 }, { "epoch": 0.3614581550573336, "grad_norm": 12.028635025024414, "learning_rate": 1.201939532230462e-05, "loss": 1.1747, "step": 2112 }, { "epoch": 0.3616293000171145, "grad_norm": 49.29413986206055, "learning_rate": 1.2025099828864802e-05, "loss": 8.4677, "step": 2113 }, { "epoch": 0.36180044497689545, "grad_norm": 35.586788177490234, "learning_rate": 1.2030804335424985e-05, "loss": 4.3141, "step": 2114 }, { "epoch": 0.3619715899366764, "grad_norm": 15.967235565185547, "learning_rate": 1.2036508841985169e-05, "loss": 1.4648, "step": 2115 }, { "epoch": 0.3621427348964573, "grad_norm": 116.31715393066406, "learning_rate": 1.2042213348545352e-05, "loss": 5.9115, "step": 2116 }, { "epoch": 0.36231387985623825, "grad_norm": 39.9970703125, "learning_rate": 1.2047917855105534e-05, "loss": 5.2751, "step": 2117 }, { "epoch": 0.3624850248160192, "grad_norm": 15.636171340942383, "learning_rate": 1.2053622361665717e-05, "loss": 1.1331, "step": 2118 }, { "epoch": 0.3626561697758001, "grad_norm": 29.51291847229004, "learning_rate": 1.2059326868225899e-05, "loss": 3.0782, "step": 2119 }, { "epoch": 0.36282731473558105, "grad_norm": 33.99169921875, "learning_rate": 1.206503137478608e-05, "loss": 3.4875, "step": 2120 }, { "epoch": 0.362998459695362, "grad_norm": 8.469818115234375, "learning_rate": 1.2070735881346264e-05, "loss": 0.9351, "step": 2121 }, { "epoch": 0.3631696046551429, "grad_norm": 87.96151733398438, "learning_rate": 1.2076440387906446e-05, "loss": 5.0553, "step": 2122 }, { "epoch": 0.36334074961492385, "grad_norm": 11.59670352935791, "learning_rate": 1.2082144894466629e-05, "loss": 1.314, "step": 2123 }, { "epoch": 0.3635118945747048, "grad_norm": 7.859058856964111, "learning_rate": 1.208784940102681e-05, "loss": 0.9692, "step": 2124 }, { "epoch": 0.3636830395344857, "grad_norm": 48.24964904785156, "learning_rate": 1.2093553907586992e-05, "loss": 5.6168, "step": 2125 }, { "epoch": 0.36385418449426665, "grad_norm": 35.264366149902344, "learning_rate": 1.2099258414147178e-05, "loss": 3.7475, "step": 2126 }, { "epoch": 0.3640253294540476, "grad_norm": 30.4807071685791, "learning_rate": 1.210496292070736e-05, "loss": 3.6681, "step": 2127 }, { "epoch": 0.3641964744138285, "grad_norm": 37.583274841308594, "learning_rate": 1.2110667427267543e-05, "loss": 7.6763, "step": 2128 }, { "epoch": 0.36436761937360945, "grad_norm": 10.553574562072754, "learning_rate": 1.2116371933827724e-05, "loss": 0.7119, "step": 2129 }, { "epoch": 0.3645387643333904, "grad_norm": 25.893739700317383, "learning_rate": 1.2122076440387908e-05, "loss": 2.7102, "step": 2130 }, { "epoch": 0.3647099092931713, "grad_norm": 37.81182861328125, "learning_rate": 1.212778094694809e-05, "loss": 7.7056, "step": 2131 }, { "epoch": 0.36488105425295225, "grad_norm": 24.436336517333984, "learning_rate": 1.2133485453508271e-05, "loss": 3.0385, "step": 2132 }, { "epoch": 0.3650521992127332, "grad_norm": 33.72613525390625, "learning_rate": 1.2139189960068454e-05, "loss": 3.625, "step": 2133 }, { "epoch": 0.3652233441725141, "grad_norm": 29.429370880126953, "learning_rate": 1.2144894466628636e-05, "loss": 3.2735, "step": 2134 }, { "epoch": 0.36539448913229505, "grad_norm": 29.37833595275879, "learning_rate": 1.215059897318882e-05, "loss": 3.3102, "step": 2135 }, { "epoch": 0.365565634092076, "grad_norm": 4.678672790527344, "learning_rate": 1.2156303479749001e-05, "loss": 0.6167, "step": 2136 }, { "epoch": 0.3657367790518569, "grad_norm": 13.350298881530762, "learning_rate": 1.2162007986309185e-05, "loss": 0.9838, "step": 2137 }, { "epoch": 0.36590792401163785, "grad_norm": 197.19981384277344, "learning_rate": 1.2167712492869368e-05, "loss": 10.1095, "step": 2138 }, { "epoch": 0.3660790689714188, "grad_norm": 33.24477767944336, "learning_rate": 1.217341699942955e-05, "loss": 3.6562, "step": 2139 }, { "epoch": 0.3662502139311997, "grad_norm": 31.698823928833008, "learning_rate": 1.2179121505989733e-05, "loss": 3.1984, "step": 2140 }, { "epoch": 0.36642135889098065, "grad_norm": 28.302553176879883, "learning_rate": 1.2184826012549915e-05, "loss": 2.9794, "step": 2141 }, { "epoch": 0.3665925038507616, "grad_norm": 26.840988159179688, "learning_rate": 1.2190530519110098e-05, "loss": 3.1451, "step": 2142 }, { "epoch": 0.3667636488105425, "grad_norm": 10.02106761932373, "learning_rate": 1.219623502567028e-05, "loss": 1.7728, "step": 2143 }, { "epoch": 0.36693479377032345, "grad_norm": 19.4163761138916, "learning_rate": 1.2201939532230461e-05, "loss": 1.4892, "step": 2144 }, { "epoch": 0.3671059387301044, "grad_norm": 117.40380096435547, "learning_rate": 1.2207644038790645e-05, "loss": 6.037, "step": 2145 }, { "epoch": 0.3672770836898853, "grad_norm": 36.802330017089844, "learning_rate": 1.2213348545350826e-05, "loss": 3.9472, "step": 2146 }, { "epoch": 0.36744822864966625, "grad_norm": 26.534914016723633, "learning_rate": 1.221905305191101e-05, "loss": 2.9076, "step": 2147 }, { "epoch": 0.3676193736094472, "grad_norm": 8.252175331115723, "learning_rate": 1.2224757558471192e-05, "loss": 1.7274, "step": 2148 }, { "epoch": 0.3677905185692281, "grad_norm": 36.72080993652344, "learning_rate": 1.2230462065031377e-05, "loss": 3.9691, "step": 2149 }, { "epoch": 0.36796166352900905, "grad_norm": 31.389694213867188, "learning_rate": 1.2236166571591558e-05, "loss": 3.662, "step": 2150 }, { "epoch": 0.36813280848879, "grad_norm": 17.889827728271484, "learning_rate": 1.224187107815174e-05, "loss": 1.3585, "step": 2151 }, { "epoch": 0.3683039534485709, "grad_norm": 37.55808639526367, "learning_rate": 1.2247575584711923e-05, "loss": 3.733, "step": 2152 }, { "epoch": 0.36847509840835185, "grad_norm": 28.830768585205078, "learning_rate": 1.2253280091272105e-05, "loss": 2.596, "step": 2153 }, { "epoch": 0.3686462433681328, "grad_norm": 11.456624031066895, "learning_rate": 1.2258984597832288e-05, "loss": 0.6827, "step": 2154 }, { "epoch": 0.3688173883279137, "grad_norm": 29.137744903564453, "learning_rate": 1.226468910439247e-05, "loss": 3.4631, "step": 2155 }, { "epoch": 0.36898853328769465, "grad_norm": 27.315082550048828, "learning_rate": 1.2270393610952653e-05, "loss": 2.4743, "step": 2156 }, { "epoch": 0.3691596782474756, "grad_norm": 15.013253211975098, "learning_rate": 1.2276098117512835e-05, "loss": 1.2812, "step": 2157 }, { "epoch": 0.3693308232072565, "grad_norm": 33.02097702026367, "learning_rate": 1.2281802624073017e-05, "loss": 3.1825, "step": 2158 }, { "epoch": 0.36950196816703745, "grad_norm": 37.75695037841797, "learning_rate": 1.22875071306332e-05, "loss": 3.9585, "step": 2159 }, { "epoch": 0.36967311312681844, "grad_norm": 33.56565475463867, "learning_rate": 1.2293211637193382e-05, "loss": 3.9576, "step": 2160 }, { "epoch": 0.3698442580865994, "grad_norm": 8.82251262664795, "learning_rate": 1.2298916143753567e-05, "loss": 1.045, "step": 2161 }, { "epoch": 0.3700154030463803, "grad_norm": 26.975778579711914, "learning_rate": 1.2304620650313749e-05, "loss": 2.5674, "step": 2162 }, { "epoch": 0.37018654800616124, "grad_norm": 136.73780822753906, "learning_rate": 1.231032515687393e-05, "loss": 5.8467, "step": 2163 }, { "epoch": 0.3703576929659422, "grad_norm": 29.269546508789062, "learning_rate": 1.2316029663434114e-05, "loss": 2.9019, "step": 2164 }, { "epoch": 0.3705288379257231, "grad_norm": 31.114402770996094, "learning_rate": 1.2321734169994295e-05, "loss": 3.803, "step": 2165 }, { "epoch": 0.37069998288550404, "grad_norm": 28.02252769470215, "learning_rate": 1.2327438676554479e-05, "loss": 3.334, "step": 2166 }, { "epoch": 0.370871127845285, "grad_norm": 36.24296951293945, "learning_rate": 1.233314318311466e-05, "loss": 4.0221, "step": 2167 }, { "epoch": 0.3710422728050659, "grad_norm": 42.49361801147461, "learning_rate": 1.2338847689674844e-05, "loss": 4.4893, "step": 2168 }, { "epoch": 0.37121341776484684, "grad_norm": 31.110870361328125, "learning_rate": 1.2344552196235026e-05, "loss": 3.2998, "step": 2169 }, { "epoch": 0.3713845627246278, "grad_norm": 38.54166030883789, "learning_rate": 1.2350256702795207e-05, "loss": 3.9307, "step": 2170 }, { "epoch": 0.3715557076844087, "grad_norm": 31.027143478393555, "learning_rate": 1.235596120935539e-05, "loss": 3.3663, "step": 2171 }, { "epoch": 0.37172685264418964, "grad_norm": 19.778564453125, "learning_rate": 1.2361665715915574e-05, "loss": 1.4132, "step": 2172 }, { "epoch": 0.3718979976039706, "grad_norm": 6.935482025146484, "learning_rate": 1.2367370222475757e-05, "loss": 0.7138, "step": 2173 }, { "epoch": 0.3720691425637515, "grad_norm": 17.002243041992188, "learning_rate": 1.2373074729035939e-05, "loss": 1.363, "step": 2174 }, { "epoch": 0.37224028752353244, "grad_norm": 36.1330451965332, "learning_rate": 1.237877923559612e-05, "loss": 4.304, "step": 2175 }, { "epoch": 0.3724114324833134, "grad_norm": 37.96760940551758, "learning_rate": 1.2384483742156304e-05, "loss": 4.1877, "step": 2176 }, { "epoch": 0.3725825774430943, "grad_norm": 37.3785400390625, "learning_rate": 1.2390188248716486e-05, "loss": 4.2806, "step": 2177 }, { "epoch": 0.37275372240287524, "grad_norm": 124.93565368652344, "learning_rate": 1.239589275527667e-05, "loss": 5.4911, "step": 2178 }, { "epoch": 0.3729248673626562, "grad_norm": 28.42656707763672, "learning_rate": 1.2401597261836851e-05, "loss": 2.685, "step": 2179 }, { "epoch": 0.3730960123224371, "grad_norm": 44.78040313720703, "learning_rate": 1.2407301768397034e-05, "loss": 8.0281, "step": 2180 }, { "epoch": 0.37326715728221804, "grad_norm": 106.5615005493164, "learning_rate": 1.2413006274957216e-05, "loss": 9.7692, "step": 2181 }, { "epoch": 0.373438302241999, "grad_norm": 32.70700454711914, "learning_rate": 1.2418710781517398e-05, "loss": 3.7167, "step": 2182 }, { "epoch": 0.3736094472017799, "grad_norm": 27.95832633972168, "learning_rate": 1.2424415288077583e-05, "loss": 3.4558, "step": 2183 }, { "epoch": 0.37378059216156084, "grad_norm": 51.62168502807617, "learning_rate": 1.2430119794637764e-05, "loss": 7.8843, "step": 2184 }, { "epoch": 0.3739517371213418, "grad_norm": 22.549152374267578, "learning_rate": 1.2435824301197948e-05, "loss": 2.2902, "step": 2185 }, { "epoch": 0.3741228820811227, "grad_norm": 49.26498031616211, "learning_rate": 1.244152880775813e-05, "loss": 7.9836, "step": 2186 }, { "epoch": 0.37429402704090364, "grad_norm": 32.918434143066406, "learning_rate": 1.2447233314318313e-05, "loss": 3.7321, "step": 2187 }, { "epoch": 0.3744651720006846, "grad_norm": 115.87164306640625, "learning_rate": 1.2452937820878495e-05, "loss": 6.1209, "step": 2188 }, { "epoch": 0.3746363169604655, "grad_norm": 32.60509490966797, "learning_rate": 1.2458642327438676e-05, "loss": 4.3652, "step": 2189 }, { "epoch": 0.37480746192024644, "grad_norm": 40.821720123291016, "learning_rate": 1.246434683399886e-05, "loss": 7.5982, "step": 2190 }, { "epoch": 0.3749786068800274, "grad_norm": 30.804649353027344, "learning_rate": 1.2470051340559041e-05, "loss": 3.6332, "step": 2191 }, { "epoch": 0.3751497518398083, "grad_norm": 28.10482406616211, "learning_rate": 1.2475755847119225e-05, "loss": 3.1805, "step": 2192 }, { "epoch": 0.37532089679958924, "grad_norm": 5.394840240478516, "learning_rate": 1.2481460353679406e-05, "loss": 0.6131, "step": 2193 }, { "epoch": 0.3754920417593702, "grad_norm": 22.42398452758789, "learning_rate": 1.2487164860239588e-05, "loss": 2.9095, "step": 2194 }, { "epoch": 0.3756631867191511, "grad_norm": 31.861984252929688, "learning_rate": 1.2492869366799773e-05, "loss": 3.7872, "step": 2195 }, { "epoch": 0.37583433167893204, "grad_norm": 30.0163631439209, "learning_rate": 1.2498573873359955e-05, "loss": 3.2556, "step": 2196 }, { "epoch": 0.376005476638713, "grad_norm": 43.01797103881836, "learning_rate": 1.2504278379920138e-05, "loss": 7.4534, "step": 2197 }, { "epoch": 0.3761766215984939, "grad_norm": 26.029483795166016, "learning_rate": 1.250998288648032e-05, "loss": 3.4138, "step": 2198 }, { "epoch": 0.37634776655827484, "grad_norm": 31.733152389526367, "learning_rate": 1.2515687393040503e-05, "loss": 3.7965, "step": 2199 }, { "epoch": 0.3765189115180558, "grad_norm": 29.86209487915039, "learning_rate": 1.2521391899600685e-05, "loss": 3.1073, "step": 2200 }, { "epoch": 0.3766900564778367, "grad_norm": 73.94261932373047, "learning_rate": 1.2527096406160867e-05, "loss": 6.7022, "step": 2201 }, { "epoch": 0.37686120143761764, "grad_norm": 33.266666412353516, "learning_rate": 1.253280091272105e-05, "loss": 3.467, "step": 2202 }, { "epoch": 0.3770323463973986, "grad_norm": 9.25309944152832, "learning_rate": 1.2538505419281232e-05, "loss": 0.9735, "step": 2203 }, { "epoch": 0.3772034913571795, "grad_norm": 32.7879753112793, "learning_rate": 1.2544209925841415e-05, "loss": 4.3873, "step": 2204 }, { "epoch": 0.37737463631696044, "grad_norm": 38.24089813232422, "learning_rate": 1.2549914432401597e-05, "loss": 4.1272, "step": 2205 }, { "epoch": 0.3775457812767414, "grad_norm": 11.10142707824707, "learning_rate": 1.2555618938961782e-05, "loss": 0.8028, "step": 2206 }, { "epoch": 0.3777169262365223, "grad_norm": 37.619815826416016, "learning_rate": 1.2561323445521963e-05, "loss": 3.8663, "step": 2207 }, { "epoch": 0.37788807119630324, "grad_norm": 43.338417053222656, "learning_rate": 1.2567027952082145e-05, "loss": 4.6084, "step": 2208 }, { "epoch": 0.3780592161560842, "grad_norm": 29.597476959228516, "learning_rate": 1.2572732458642329e-05, "loss": 4.3275, "step": 2209 }, { "epoch": 0.3782303611158651, "grad_norm": 111.00467681884766, "learning_rate": 1.257843696520251e-05, "loss": 6.2678, "step": 2210 }, { "epoch": 0.3784015060756461, "grad_norm": 28.328218460083008, "learning_rate": 1.2584141471762694e-05, "loss": 3.7021, "step": 2211 }, { "epoch": 0.37857265103542703, "grad_norm": 7.334059238433838, "learning_rate": 1.2589845978322875e-05, "loss": 1.0266, "step": 2212 }, { "epoch": 0.37874379599520797, "grad_norm": 12.333498001098633, "learning_rate": 1.2595550484883057e-05, "loss": 1.2937, "step": 2213 }, { "epoch": 0.3789149409549889, "grad_norm": 33.395259857177734, "learning_rate": 1.260125499144324e-05, "loss": 3.7564, "step": 2214 }, { "epoch": 0.37908608591476983, "grad_norm": 12.443466186523438, "learning_rate": 1.2606959498003422e-05, "loss": 1.1804, "step": 2215 }, { "epoch": 0.37925723087455077, "grad_norm": 29.2781982421875, "learning_rate": 1.2612664004563605e-05, "loss": 2.8089, "step": 2216 }, { "epoch": 0.3794283758343317, "grad_norm": 30.066843032836914, "learning_rate": 1.2618368511123787e-05, "loss": 3.678, "step": 2217 }, { "epoch": 0.37959952079411263, "grad_norm": 198.62889099121094, "learning_rate": 1.2624073017683972e-05, "loss": 10.5079, "step": 2218 }, { "epoch": 0.37977066575389357, "grad_norm": 36.29426574707031, "learning_rate": 1.2629777524244154e-05, "loss": 3.7594, "step": 2219 }, { "epoch": 0.3799418107136745, "grad_norm": 4.288938522338867, "learning_rate": 1.2635482030804336e-05, "loss": 0.6002, "step": 2220 }, { "epoch": 0.38011295567345543, "grad_norm": 16.282394409179688, "learning_rate": 1.2641186537364519e-05, "loss": 1.2398, "step": 2221 }, { "epoch": 0.38028410063323637, "grad_norm": 15.423003196716309, "learning_rate": 1.26468910439247e-05, "loss": 1.0447, "step": 2222 }, { "epoch": 0.3804552455930173, "grad_norm": 8.580951690673828, "learning_rate": 1.2652595550484884e-05, "loss": 1.0967, "step": 2223 }, { "epoch": 0.38062639055279823, "grad_norm": 23.481037139892578, "learning_rate": 1.2658300057045066e-05, "loss": 3.0215, "step": 2224 }, { "epoch": 0.38079753551257917, "grad_norm": 31.463350296020508, "learning_rate": 1.2664004563605247e-05, "loss": 3.9185, "step": 2225 }, { "epoch": 0.3809686804723601, "grad_norm": 33.95023727416992, "learning_rate": 1.266970907016543e-05, "loss": 4.4252, "step": 2226 }, { "epoch": 0.38113982543214103, "grad_norm": 32.201377868652344, "learning_rate": 1.2675413576725612e-05, "loss": 3.1638, "step": 2227 }, { "epoch": 0.38131097039192197, "grad_norm": 33.09391784667969, "learning_rate": 1.2681118083285796e-05, "loss": 4.5716, "step": 2228 }, { "epoch": 0.3814821153517029, "grad_norm": 89.28120422363281, "learning_rate": 1.2686822589845979e-05, "loss": 5.4798, "step": 2229 }, { "epoch": 0.38165326031148383, "grad_norm": 18.636362075805664, "learning_rate": 1.2692527096406163e-05, "loss": 1.3417, "step": 2230 }, { "epoch": 0.38182440527126477, "grad_norm": 108.82768249511719, "learning_rate": 1.2698231602966344e-05, "loss": 5.2101, "step": 2231 }, { "epoch": 0.3819955502310457, "grad_norm": 32.57135009765625, "learning_rate": 1.2703936109526526e-05, "loss": 4.3203, "step": 2232 }, { "epoch": 0.38216669519082663, "grad_norm": 33.27009963989258, "learning_rate": 1.270964061608671e-05, "loss": 3.9393, "step": 2233 }, { "epoch": 0.38233784015060757, "grad_norm": 16.50580406188965, "learning_rate": 1.2715345122646891e-05, "loss": 1.5263, "step": 2234 }, { "epoch": 0.3825089851103885, "grad_norm": 18.65876579284668, "learning_rate": 1.2721049629207074e-05, "loss": 1.948, "step": 2235 }, { "epoch": 0.38268013007016943, "grad_norm": 28.283248901367188, "learning_rate": 1.2726754135767256e-05, "loss": 2.8414, "step": 2236 }, { "epoch": 0.38285127502995037, "grad_norm": 118.61890411376953, "learning_rate": 1.273245864232744e-05, "loss": 7.9885, "step": 2237 }, { "epoch": 0.3830224199897313, "grad_norm": 16.00472640991211, "learning_rate": 1.2738163148887621e-05, "loss": 1.4454, "step": 2238 }, { "epoch": 0.38319356494951223, "grad_norm": 18.229719161987305, "learning_rate": 1.2743867655447803e-05, "loss": 1.5843, "step": 2239 }, { "epoch": 0.38336470990929317, "grad_norm": 26.571413040161133, "learning_rate": 1.2749572162007986e-05, "loss": 2.6752, "step": 2240 }, { "epoch": 0.3835358548690741, "grad_norm": 66.64990234375, "learning_rate": 1.275527666856817e-05, "loss": 5.1676, "step": 2241 }, { "epoch": 0.38370699982885503, "grad_norm": 19.84005355834961, "learning_rate": 1.2760981175128353e-05, "loss": 1.4974, "step": 2242 }, { "epoch": 0.38387814478863597, "grad_norm": 18.671689987182617, "learning_rate": 1.2766685681688535e-05, "loss": 1.9852, "step": 2243 }, { "epoch": 0.3840492897484169, "grad_norm": 98.68587493896484, "learning_rate": 1.2772390188248716e-05, "loss": 4.9745, "step": 2244 }, { "epoch": 0.38422043470819783, "grad_norm": 6.933028221130371, "learning_rate": 1.27780946948089e-05, "loss": 0.6802, "step": 2245 }, { "epoch": 0.38439157966797877, "grad_norm": 18.11700439453125, "learning_rate": 1.2783799201369081e-05, "loss": 1.5278, "step": 2246 }, { "epoch": 0.3845627246277597, "grad_norm": 18.046253204345703, "learning_rate": 1.2789503707929265e-05, "loss": 2.1576, "step": 2247 }, { "epoch": 0.38473386958754063, "grad_norm": 44.326602935791016, "learning_rate": 1.2795208214489446e-05, "loss": 7.2696, "step": 2248 }, { "epoch": 0.38490501454732157, "grad_norm": 95.7645034790039, "learning_rate": 1.280091272104963e-05, "loss": 6.0354, "step": 2249 }, { "epoch": 0.3850761595071025, "grad_norm": 7.336085796356201, "learning_rate": 1.2806617227609811e-05, "loss": 1.6021, "step": 2250 }, { "epoch": 0.38524730446688343, "grad_norm": 21.488544464111328, "learning_rate": 1.2812321734169993e-05, "loss": 1.9826, "step": 2251 }, { "epoch": 0.38541844942666437, "grad_norm": 34.97186279296875, "learning_rate": 1.2818026240730178e-05, "loss": 4.3046, "step": 2252 }, { "epoch": 0.3855895943864453, "grad_norm": 4.4676008224487305, "learning_rate": 1.282373074729036e-05, "loss": 0.6622, "step": 2253 }, { "epoch": 0.38576073934622623, "grad_norm": 6.151776313781738, "learning_rate": 1.2829435253850543e-05, "loss": 0.6381, "step": 2254 }, { "epoch": 0.38593188430600717, "grad_norm": 6.36190938949585, "learning_rate": 1.2835139760410725e-05, "loss": 0.6743, "step": 2255 }, { "epoch": 0.3861030292657881, "grad_norm": 24.97540283203125, "learning_rate": 1.2840844266970908e-05, "loss": 2.6396, "step": 2256 }, { "epoch": 0.38627417422556903, "grad_norm": 141.4521026611328, "learning_rate": 1.284654877353109e-05, "loss": 4.7593, "step": 2257 }, { "epoch": 0.38644531918534997, "grad_norm": 16.442359924316406, "learning_rate": 1.2852253280091272e-05, "loss": 1.3891, "step": 2258 }, { "epoch": 0.3866164641451309, "grad_norm": 26.524892807006836, "learning_rate": 1.2857957786651455e-05, "loss": 2.8351, "step": 2259 }, { "epoch": 0.38678760910491183, "grad_norm": 20.284482955932617, "learning_rate": 1.2863662293211637e-05, "loss": 2.2276, "step": 2260 }, { "epoch": 0.3869587540646928, "grad_norm": 13.217884063720703, "learning_rate": 1.286936679977182e-05, "loss": 0.9694, "step": 2261 }, { "epoch": 0.38712989902447376, "grad_norm": 33.80121612548828, "learning_rate": 1.2875071306332002e-05, "loss": 4.1736, "step": 2262 }, { "epoch": 0.3873010439842547, "grad_norm": 33.30670928955078, "learning_rate": 1.2880775812892185e-05, "loss": 3.5895, "step": 2263 }, { "epoch": 0.3874721889440356, "grad_norm": 24.27392578125, "learning_rate": 1.2886480319452369e-05, "loss": 2.6142, "step": 2264 }, { "epoch": 0.38764333390381656, "grad_norm": 4.387927055358887, "learning_rate": 1.289218482601255e-05, "loss": 0.5611, "step": 2265 }, { "epoch": 0.3878144788635975, "grad_norm": 11.723445892333984, "learning_rate": 1.2897889332572734e-05, "loss": 1.9691, "step": 2266 }, { "epoch": 0.3879856238233784, "grad_norm": 31.290142059326172, "learning_rate": 1.2903593839132915e-05, "loss": 4.4918, "step": 2267 }, { "epoch": 0.38815676878315936, "grad_norm": 29.301557540893555, "learning_rate": 1.2909298345693099e-05, "loss": 3.0265, "step": 2268 }, { "epoch": 0.3883279137429403, "grad_norm": 102.96603393554688, "learning_rate": 1.291500285225328e-05, "loss": 5.4491, "step": 2269 }, { "epoch": 0.3884990587027212, "grad_norm": 6.566992282867432, "learning_rate": 1.2920707358813462e-05, "loss": 0.6619, "step": 2270 }, { "epoch": 0.38867020366250216, "grad_norm": 20.63521385192871, "learning_rate": 1.2926411865373645e-05, "loss": 1.4227, "step": 2271 }, { "epoch": 0.3888413486222831, "grad_norm": 35.605445861816406, "learning_rate": 1.2932116371933827e-05, "loss": 3.0135, "step": 2272 }, { "epoch": 0.389012493582064, "grad_norm": 26.535554885864258, "learning_rate": 1.293782087849401e-05, "loss": 2.8951, "step": 2273 }, { "epoch": 0.38918363854184496, "grad_norm": 216.86865234375, "learning_rate": 1.2943525385054192e-05, "loss": 9.7157, "step": 2274 }, { "epoch": 0.3893547835016259, "grad_norm": 30.289108276367188, "learning_rate": 1.2949229891614376e-05, "loss": 3.7199, "step": 2275 }, { "epoch": 0.3895259284614068, "grad_norm": 70.54218292236328, "learning_rate": 1.2954934398174559e-05, "loss": 8.7961, "step": 2276 }, { "epoch": 0.38969707342118776, "grad_norm": 37.42404556274414, "learning_rate": 1.296063890473474e-05, "loss": 4.6408, "step": 2277 }, { "epoch": 0.3898682183809687, "grad_norm": 20.272388458251953, "learning_rate": 1.2966343411294924e-05, "loss": 2.2787, "step": 2278 }, { "epoch": 0.3900393633407496, "grad_norm": 21.717552185058594, "learning_rate": 1.2972047917855106e-05, "loss": 2.6312, "step": 2279 }, { "epoch": 0.39021050830053056, "grad_norm": 27.405563354492188, "learning_rate": 1.2977752424415289e-05, "loss": 2.922, "step": 2280 }, { "epoch": 0.3903816532603115, "grad_norm": 9.014309883117676, "learning_rate": 1.298345693097547e-05, "loss": 0.7394, "step": 2281 }, { "epoch": 0.3905527982200924, "grad_norm": 34.70540237426758, "learning_rate": 1.2989161437535652e-05, "loss": 4.5773, "step": 2282 }, { "epoch": 0.39072394317987336, "grad_norm": 17.615568161010742, "learning_rate": 1.2994865944095836e-05, "loss": 1.3793, "step": 2283 }, { "epoch": 0.3908950881396543, "grad_norm": 9.69536018371582, "learning_rate": 1.3000570450656018e-05, "loss": 1.4154, "step": 2284 }, { "epoch": 0.3910662330994352, "grad_norm": 42.174076080322266, "learning_rate": 1.3006274957216201e-05, "loss": 3.7989, "step": 2285 }, { "epoch": 0.39123737805921616, "grad_norm": 23.85903549194336, "learning_rate": 1.3011979463776384e-05, "loss": 2.7904, "step": 2286 }, { "epoch": 0.3914085230189971, "grad_norm": 12.722695350646973, "learning_rate": 1.3017683970336568e-05, "loss": 0.9675, "step": 2287 }, { "epoch": 0.391579667978778, "grad_norm": 29.10125160217285, "learning_rate": 1.302338847689675e-05, "loss": 3.3556, "step": 2288 }, { "epoch": 0.39175081293855896, "grad_norm": 28.335847854614258, "learning_rate": 1.3029092983456931e-05, "loss": 3.6913, "step": 2289 }, { "epoch": 0.3919219578983399, "grad_norm": 27.098371505737305, "learning_rate": 1.3034797490017114e-05, "loss": 3.4171, "step": 2290 }, { "epoch": 0.3920931028581208, "grad_norm": 24.61624526977539, "learning_rate": 1.3040501996577296e-05, "loss": 2.5601, "step": 2291 }, { "epoch": 0.39226424781790176, "grad_norm": 36.29865264892578, "learning_rate": 1.304620650313748e-05, "loss": 4.1205, "step": 2292 }, { "epoch": 0.3924353927776827, "grad_norm": Infinity, "learning_rate": 1.304620650313748e-05, "loss": 10.2854, "step": 2293 }, { "epoch": 0.3926065377374636, "grad_norm": 43.4719352722168, "learning_rate": 1.3051911009697661e-05, "loss": 5.7968, "step": 2294 }, { "epoch": 0.39277768269724456, "grad_norm": 34.03304672241211, "learning_rate": 1.3057615516257843e-05, "loss": 3.2482, "step": 2295 }, { "epoch": 0.3929488276570255, "grad_norm": 28.92998695373535, "learning_rate": 1.3063320022818026e-05, "loss": 3.1048, "step": 2296 }, { "epoch": 0.3931199726168064, "grad_norm": 30.764570236206055, "learning_rate": 1.3069024529378208e-05, "loss": 3.4204, "step": 2297 }, { "epoch": 0.39329111757658736, "grad_norm": 30.185405731201172, "learning_rate": 1.3074729035938391e-05, "loss": 3.6159, "step": 2298 }, { "epoch": 0.3934622625363683, "grad_norm": 15.160475730895996, "learning_rate": 1.3080433542498575e-05, "loss": 1.0351, "step": 2299 }, { "epoch": 0.3936334074961492, "grad_norm": 30.460662841796875, "learning_rate": 1.3086138049058758e-05, "loss": 3.0676, "step": 2300 }, { "epoch": 0.39380455245593016, "grad_norm": 31.176111221313477, "learning_rate": 1.309184255561894e-05, "loss": 3.07, "step": 2301 }, { "epoch": 0.3939756974157111, "grad_norm": 40.287208557128906, "learning_rate": 1.3097547062179121e-05, "loss": 4.0389, "step": 2302 }, { "epoch": 0.394146842375492, "grad_norm": 31.603471755981445, "learning_rate": 1.3103251568739305e-05, "loss": 3.1424, "step": 2303 }, { "epoch": 0.39431798733527296, "grad_norm": 27.959386825561523, "learning_rate": 1.3108956075299486e-05, "loss": 2.7499, "step": 2304 }, { "epoch": 0.3944891322950539, "grad_norm": 9.44000244140625, "learning_rate": 1.311466058185967e-05, "loss": 0.6843, "step": 2305 }, { "epoch": 0.3946602772548348, "grad_norm": 31.026531219482422, "learning_rate": 1.3120365088419852e-05, "loss": 3.0951, "step": 2306 }, { "epoch": 0.39483142221461576, "grad_norm": 28.429651260375977, "learning_rate": 1.3126069594980035e-05, "loss": 3.1954, "step": 2307 }, { "epoch": 0.3950025671743967, "grad_norm": 36.807884216308594, "learning_rate": 1.3131774101540217e-05, "loss": 3.5692, "step": 2308 }, { "epoch": 0.3951737121341776, "grad_norm": 27.523998260498047, "learning_rate": 1.3137478608100398e-05, "loss": 3.04, "step": 2309 }, { "epoch": 0.39534485709395856, "grad_norm": 22.569734573364258, "learning_rate": 1.3143183114660583e-05, "loss": 2.6063, "step": 2310 }, { "epoch": 0.3955160020537395, "grad_norm": 30.23894691467285, "learning_rate": 1.3148887621220765e-05, "loss": 2.6877, "step": 2311 }, { "epoch": 0.3956871470135205, "grad_norm": 32.485286712646484, "learning_rate": 1.3154592127780948e-05, "loss": 4.2341, "step": 2312 }, { "epoch": 0.3958582919733014, "grad_norm": 9.512272834777832, "learning_rate": 1.316029663434113e-05, "loss": 0.7438, "step": 2313 }, { "epoch": 0.39602943693308235, "grad_norm": 30.39967918395996, "learning_rate": 1.3166001140901312e-05, "loss": 2.678, "step": 2314 }, { "epoch": 0.3962005818928633, "grad_norm": 26.347349166870117, "learning_rate": 1.3171705647461495e-05, "loss": 2.6173, "step": 2315 }, { "epoch": 0.3963717268526442, "grad_norm": 11.27676010131836, "learning_rate": 1.3177410154021677e-05, "loss": 0.9184, "step": 2316 }, { "epoch": 0.39654287181242515, "grad_norm": 28.942106246948242, "learning_rate": 1.318311466058186e-05, "loss": 2.7751, "step": 2317 }, { "epoch": 0.3967140167722061, "grad_norm": 155.31259155273438, "learning_rate": 1.3188819167142042e-05, "loss": 7.0884, "step": 2318 }, { "epoch": 0.396885161731987, "grad_norm": 15.048434257507324, "learning_rate": 1.3194523673702225e-05, "loss": 1.0894, "step": 2319 }, { "epoch": 0.39705630669176795, "grad_norm": 29.555904388427734, "learning_rate": 1.3200228180262407e-05, "loss": 3.0407, "step": 2320 }, { "epoch": 0.3972274516515489, "grad_norm": 22.9705810546875, "learning_rate": 1.3205932686822589e-05, "loss": 2.1862, "step": 2321 }, { "epoch": 0.3973985966113298, "grad_norm": 31.04474449157715, "learning_rate": 1.3211637193382774e-05, "loss": 3.5704, "step": 2322 }, { "epoch": 0.39756974157111075, "grad_norm": 38.25536346435547, "learning_rate": 1.3217341699942955e-05, "loss": 3.7726, "step": 2323 }, { "epoch": 0.3977408865308917, "grad_norm": 24.22712516784668, "learning_rate": 1.3223046206503139e-05, "loss": 2.9952, "step": 2324 }, { "epoch": 0.3979120314906726, "grad_norm": 32.82272720336914, "learning_rate": 1.322875071306332e-05, "loss": 3.313, "step": 2325 }, { "epoch": 0.39808317645045355, "grad_norm": 29.25124168395996, "learning_rate": 1.3234455219623502e-05, "loss": 2.9707, "step": 2326 }, { "epoch": 0.3982543214102345, "grad_norm": 42.494041442871094, "learning_rate": 1.3240159726183686e-05, "loss": 4.4698, "step": 2327 }, { "epoch": 0.3984254663700154, "grad_norm": 32.5220947265625, "learning_rate": 1.3245864232743867e-05, "loss": 3.7016, "step": 2328 }, { "epoch": 0.39859661132979635, "grad_norm": 4.652500629425049, "learning_rate": 1.325156873930405e-05, "loss": 0.5571, "step": 2329 }, { "epoch": 0.3987677562895773, "grad_norm": 136.5018768310547, "learning_rate": 1.3257273245864232e-05, "loss": 5.8926, "step": 2330 }, { "epoch": 0.3989389012493582, "grad_norm": 51.504051208496094, "learning_rate": 1.3262977752424416e-05, "loss": 5.6096, "step": 2331 }, { "epoch": 0.39911004620913915, "grad_norm": 18.578340530395508, "learning_rate": 1.3268682258984597e-05, "loss": 2.2877, "step": 2332 }, { "epoch": 0.3992811911689201, "grad_norm": 26.573881149291992, "learning_rate": 1.327438676554478e-05, "loss": 3.3675, "step": 2333 }, { "epoch": 0.399452336128701, "grad_norm": 28.39176368713379, "learning_rate": 1.3280091272104964e-05, "loss": 3.4595, "step": 2334 }, { "epoch": 0.39962348108848195, "grad_norm": 27.315298080444336, "learning_rate": 1.3285795778665146e-05, "loss": 3.7229, "step": 2335 }, { "epoch": 0.3997946260482629, "grad_norm": 34.018280029296875, "learning_rate": 1.329150028522533e-05, "loss": 3.7833, "step": 2336 }, { "epoch": 0.3999657710080438, "grad_norm": 35.161949157714844, "learning_rate": 1.3297204791785511e-05, "loss": 3.6256, "step": 2337 }, { "epoch": 0.40013691596782475, "grad_norm": 54.34180450439453, "learning_rate": 1.3302909298345694e-05, "loss": 8.2972, "step": 2338 }, { "epoch": 0.4003080609276057, "grad_norm": 37.41242980957031, "learning_rate": 1.3308613804905876e-05, "loss": 4.1554, "step": 2339 }, { "epoch": 0.4004792058873866, "grad_norm": 24.117671966552734, "learning_rate": 1.3314318311466058e-05, "loss": 2.6779, "step": 2340 }, { "epoch": 0.40065035084716755, "grad_norm": 75.865234375, "learning_rate": 1.3320022818026241e-05, "loss": 5.2769, "step": 2341 }, { "epoch": 0.4008214958069485, "grad_norm": 10.710969924926758, "learning_rate": 1.3325727324586423e-05, "loss": 0.8464, "step": 2342 }, { "epoch": 0.4009926407667294, "grad_norm": 32.42598342895508, "learning_rate": 1.3331431831146606e-05, "loss": 4.2176, "step": 2343 }, { "epoch": 0.40116378572651035, "grad_norm": 31.665010452270508, "learning_rate": 1.333713633770679e-05, "loss": 3.6966, "step": 2344 }, { "epoch": 0.4013349306862913, "grad_norm": 19.899494171142578, "learning_rate": 1.3342840844266971e-05, "loss": 1.5844, "step": 2345 }, { "epoch": 0.4015060756460722, "grad_norm": 26.72218132019043, "learning_rate": 1.3348545350827155e-05, "loss": 2.6391, "step": 2346 }, { "epoch": 0.40167722060585315, "grad_norm": 67.58808135986328, "learning_rate": 1.3354249857387336e-05, "loss": 4.8317, "step": 2347 }, { "epoch": 0.4018483655656341, "grad_norm": 35.947166442871094, "learning_rate": 1.335995436394752e-05, "loss": 4.4359, "step": 2348 }, { "epoch": 0.402019510525415, "grad_norm": 33.10310745239258, "learning_rate": 1.3365658870507701e-05, "loss": 4.2287, "step": 2349 }, { "epoch": 0.40219065548519595, "grad_norm": 26.962339401245117, "learning_rate": 1.3371363377067885e-05, "loss": 2.8973, "step": 2350 }, { "epoch": 0.4023618004449769, "grad_norm": 10.832432746887207, "learning_rate": 1.3377067883628066e-05, "loss": 1.1334, "step": 2351 }, { "epoch": 0.4025329454047578, "grad_norm": 3.8930623531341553, "learning_rate": 1.3382772390188248e-05, "loss": 0.5691, "step": 2352 }, { "epoch": 0.40270409036453875, "grad_norm": 30.617422103881836, "learning_rate": 1.3388476896748431e-05, "loss": 4.1985, "step": 2353 }, { "epoch": 0.4028752353243197, "grad_norm": 29.522432327270508, "learning_rate": 1.3394181403308613e-05, "loss": 3.3929, "step": 2354 }, { "epoch": 0.4030463802841006, "grad_norm": 29.46415901184082, "learning_rate": 1.3399885909868796e-05, "loss": 3.0491, "step": 2355 }, { "epoch": 0.40321752524388155, "grad_norm": 28.462308883666992, "learning_rate": 1.340559041642898e-05, "loss": 3.4012, "step": 2356 }, { "epoch": 0.4033886702036625, "grad_norm": 26.383548736572266, "learning_rate": 1.3411294922989163e-05, "loss": 2.8104, "step": 2357 }, { "epoch": 0.4035598151634434, "grad_norm": 5.228971004486084, "learning_rate": 1.3416999429549345e-05, "loss": 0.6984, "step": 2358 }, { "epoch": 0.40373096012322435, "grad_norm": 65.2656021118164, "learning_rate": 1.3422703936109527e-05, "loss": 4.842, "step": 2359 }, { "epoch": 0.4039021050830053, "grad_norm": 9.332406044006348, "learning_rate": 1.342840844266971e-05, "loss": 1.1261, "step": 2360 }, { "epoch": 0.4040732500427862, "grad_norm": 19.021203994750977, "learning_rate": 1.3434112949229892e-05, "loss": 2.1595, "step": 2361 }, { "epoch": 0.40424439500256715, "grad_norm": 26.664148330688477, "learning_rate": 1.3439817455790075e-05, "loss": 3.1312, "step": 2362 }, { "epoch": 0.40441553996234814, "grad_norm": 14.464133262634277, "learning_rate": 1.3445521962350257e-05, "loss": 1.3528, "step": 2363 }, { "epoch": 0.40458668492212907, "grad_norm": 25.62092399597168, "learning_rate": 1.3451226468910438e-05, "loss": 2.6933, "step": 2364 }, { "epoch": 0.40475782988191, "grad_norm": 30.344446182250977, "learning_rate": 1.3456930975470622e-05, "loss": 3.5816, "step": 2365 }, { "epoch": 0.40492897484169094, "grad_norm": 34.0131950378418, "learning_rate": 1.3462635482030803e-05, "loss": 3.8169, "step": 2366 }, { "epoch": 0.40510011980147187, "grad_norm": 25.75012969970703, "learning_rate": 1.3468339988590989e-05, "loss": 2.9735, "step": 2367 }, { "epoch": 0.4052712647612528, "grad_norm": 31.590328216552734, "learning_rate": 1.347404449515117e-05, "loss": 3.4589, "step": 2368 }, { "epoch": 0.40544240972103374, "grad_norm": 24.881752014160156, "learning_rate": 1.3479749001711354e-05, "loss": 2.5041, "step": 2369 }, { "epoch": 0.40561355468081467, "grad_norm": 20.611392974853516, "learning_rate": 1.3485453508271535e-05, "loss": 2.0903, "step": 2370 }, { "epoch": 0.4057846996405956, "grad_norm": 35.99172592163086, "learning_rate": 1.3491158014831717e-05, "loss": 4.3005, "step": 2371 }, { "epoch": 0.40595584460037654, "grad_norm": 44.53636932373047, "learning_rate": 1.34968625213919e-05, "loss": 7.8911, "step": 2372 }, { "epoch": 0.40612698956015747, "grad_norm": 41.1456298828125, "learning_rate": 1.3502567027952082e-05, "loss": 7.4041, "step": 2373 }, { "epoch": 0.4062981345199384, "grad_norm": 24.72629737854004, "learning_rate": 1.3508271534512265e-05, "loss": 2.0511, "step": 2374 }, { "epoch": 0.40646927947971934, "grad_norm": 9.164275169372559, "learning_rate": 1.3513976041072447e-05, "loss": 1.0127, "step": 2375 }, { "epoch": 0.40664042443950027, "grad_norm": 55.97251892089844, "learning_rate": 1.351968054763263e-05, "loss": 7.7883, "step": 2376 }, { "epoch": 0.4068115693992812, "grad_norm": 15.729819297790527, "learning_rate": 1.3525385054192812e-05, "loss": 1.3747, "step": 2377 }, { "epoch": 0.40698271435906214, "grad_norm": 31.85474967956543, "learning_rate": 1.3531089560752994e-05, "loss": 3.7341, "step": 2378 }, { "epoch": 0.40715385931884307, "grad_norm": 32.369163513183594, "learning_rate": 1.3536794067313179e-05, "loss": 3.4044, "step": 2379 }, { "epoch": 0.407325004278624, "grad_norm": 26.481473922729492, "learning_rate": 1.354249857387336e-05, "loss": 2.7264, "step": 2380 }, { "epoch": 0.40749614923840494, "grad_norm": 36.87574005126953, "learning_rate": 1.3548203080433544e-05, "loss": 7.1091, "step": 2381 }, { "epoch": 0.40766729419818587, "grad_norm": 34.68164825439453, "learning_rate": 1.3553907586993726e-05, "loss": 3.5182, "step": 2382 }, { "epoch": 0.4078384391579668, "grad_norm": 4.539041042327881, "learning_rate": 1.3559612093553907e-05, "loss": 0.5712, "step": 2383 }, { "epoch": 0.40800958411774774, "grad_norm": 18.264692306518555, "learning_rate": 1.356531660011409e-05, "loss": 1.6827, "step": 2384 }, { "epoch": 0.40818072907752867, "grad_norm": 58.49655532836914, "learning_rate": 1.3571021106674272e-05, "loss": 4.7984, "step": 2385 }, { "epoch": 0.4083518740373096, "grad_norm": 38.31999969482422, "learning_rate": 1.3576725613234456e-05, "loss": 4.9844, "step": 2386 }, { "epoch": 0.40852301899709054, "grad_norm": 31.779747009277344, "learning_rate": 1.3582430119794637e-05, "loss": 4.109, "step": 2387 }, { "epoch": 0.40869416395687147, "grad_norm": 28.318117141723633, "learning_rate": 1.358813462635482e-05, "loss": 3.1768, "step": 2388 }, { "epoch": 0.4088653089166524, "grad_norm": 109.76797485351562, "learning_rate": 1.3593839132915003e-05, "loss": 8.1629, "step": 2389 }, { "epoch": 0.40903645387643334, "grad_norm": 29.490888595581055, "learning_rate": 1.3599543639475186e-05, "loss": 2.8556, "step": 2390 }, { "epoch": 0.40920759883621427, "grad_norm": 59.6926383972168, "learning_rate": 1.360524814603537e-05, "loss": 7.2838, "step": 2391 }, { "epoch": 0.4093787437959952, "grad_norm": 26.968727111816406, "learning_rate": 1.3610952652595551e-05, "loss": 3.1073, "step": 2392 }, { "epoch": 0.40954988875577614, "grad_norm": 14.444951057434082, "learning_rate": 1.3616657159155734e-05, "loss": 1.1151, "step": 2393 }, { "epoch": 0.40972103371555707, "grad_norm": 27.179691314697266, "learning_rate": 1.3622361665715916e-05, "loss": 3.4778, "step": 2394 }, { "epoch": 0.409892178675338, "grad_norm": 28.209474563598633, "learning_rate": 1.3628066172276098e-05, "loss": 3.1061, "step": 2395 }, { "epoch": 0.41006332363511894, "grad_norm": 28.115158081054688, "learning_rate": 1.3633770678836281e-05, "loss": 3.3303, "step": 2396 }, { "epoch": 0.41023446859489987, "grad_norm": 33.9571418762207, "learning_rate": 1.3639475185396463e-05, "loss": 3.476, "step": 2397 }, { "epoch": 0.4104056135546808, "grad_norm": 99.95455932617188, "learning_rate": 1.3645179691956646e-05, "loss": 4.891, "step": 2398 }, { "epoch": 0.41057675851446174, "grad_norm": 32.09910583496094, "learning_rate": 1.3650884198516828e-05, "loss": 4.5344, "step": 2399 }, { "epoch": 0.41074790347424267, "grad_norm": 22.752981185913086, "learning_rate": 1.3656588705077011e-05, "loss": 2.5455, "step": 2400 }, { "epoch": 0.4109190484340236, "grad_norm": 31.16071128845215, "learning_rate": 1.3662293211637193e-05, "loss": 3.5245, "step": 2401 }, { "epoch": 0.41109019339380454, "grad_norm": 16.054365158081055, "learning_rate": 1.3667997718197376e-05, "loss": 1.1714, "step": 2402 }, { "epoch": 0.41126133835358547, "grad_norm": 61.82563018798828, "learning_rate": 1.367370222475756e-05, "loss": 4.4332, "step": 2403 }, { "epoch": 0.4114324833133664, "grad_norm": 25.521482467651367, "learning_rate": 1.3679406731317741e-05, "loss": 3.1523, "step": 2404 }, { "epoch": 0.41160362827314734, "grad_norm": 28.02633285522461, "learning_rate": 1.3685111237877925e-05, "loss": 3.3077, "step": 2405 }, { "epoch": 0.41177477323292827, "grad_norm": 31.012575149536133, "learning_rate": 1.3690815744438106e-05, "loss": 3.2338, "step": 2406 }, { "epoch": 0.4119459181927092, "grad_norm": 24.693798065185547, "learning_rate": 1.369652025099829e-05, "loss": 3.1159, "step": 2407 }, { "epoch": 0.41211706315249014, "grad_norm": 28.928600311279297, "learning_rate": 1.3702224757558471e-05, "loss": 3.231, "step": 2408 }, { "epoch": 0.41228820811227107, "grad_norm": 30.929235458374023, "learning_rate": 1.3707929264118653e-05, "loss": 3.7399, "step": 2409 }, { "epoch": 0.412459353072052, "grad_norm": 29.809967041015625, "learning_rate": 1.3713633770678837e-05, "loss": 2.8299, "step": 2410 }, { "epoch": 0.41263049803183294, "grad_norm": 34.67237091064453, "learning_rate": 1.3719338277239018e-05, "loss": 3.7692, "step": 2411 }, { "epoch": 0.41280164299161387, "grad_norm": 29.03022575378418, "learning_rate": 1.3725042783799202e-05, "loss": 3.1456, "step": 2412 }, { "epoch": 0.4129727879513948, "grad_norm": 27.838979721069336, "learning_rate": 1.3730747290359385e-05, "loss": 2.7944, "step": 2413 }, { "epoch": 0.4131439329111758, "grad_norm": 27.87117576599121, "learning_rate": 1.3736451796919567e-05, "loss": 2.9217, "step": 2414 }, { "epoch": 0.4133150778709567, "grad_norm": 34.504295349121094, "learning_rate": 1.374215630347975e-05, "loss": 3.9797, "step": 2415 }, { "epoch": 0.41348622283073766, "grad_norm": 21.570331573486328, "learning_rate": 1.3747860810039932e-05, "loss": 2.3607, "step": 2416 }, { "epoch": 0.4136573677905186, "grad_norm": 132.19834899902344, "learning_rate": 1.3753565316600115e-05, "loss": 7.9895, "step": 2417 }, { "epoch": 0.4138285127502995, "grad_norm": 29.5281925201416, "learning_rate": 1.3759269823160297e-05, "loss": 3.3183, "step": 2418 }, { "epoch": 0.41399965771008046, "grad_norm": 7.409353256225586, "learning_rate": 1.376497432972048e-05, "loss": 0.6978, "step": 2419 }, { "epoch": 0.4141708026698614, "grad_norm": 23.6326847076416, "learning_rate": 1.3770678836280662e-05, "loss": 2.4085, "step": 2420 }, { "epoch": 0.4143419476296423, "grad_norm": 6.584427356719971, "learning_rate": 1.3776383342840844e-05, "loss": 0.7725, "step": 2421 }, { "epoch": 0.41451309258942326, "grad_norm": 5.124080181121826, "learning_rate": 1.3782087849401027e-05, "loss": 0.5998, "step": 2422 }, { "epoch": 0.4146842375492042, "grad_norm": 129.28781127929688, "learning_rate": 1.3787792355961209e-05, "loss": 5.2033, "step": 2423 }, { "epoch": 0.4148553825089851, "grad_norm": 30.348461151123047, "learning_rate": 1.3793496862521394e-05, "loss": 3.4905, "step": 2424 }, { "epoch": 0.41502652746876606, "grad_norm": 25.107507705688477, "learning_rate": 1.3799201369081575e-05, "loss": 2.4536, "step": 2425 }, { "epoch": 0.415197672428547, "grad_norm": 20.649410247802734, "learning_rate": 1.3804905875641757e-05, "loss": 2.11, "step": 2426 }, { "epoch": 0.4153688173883279, "grad_norm": 31.82566261291504, "learning_rate": 1.381061038220194e-05, "loss": 3.4624, "step": 2427 }, { "epoch": 0.41553996234810886, "grad_norm": 25.216468811035156, "learning_rate": 1.3816314888762122e-05, "loss": 2.7103, "step": 2428 }, { "epoch": 0.4157111073078898, "grad_norm": 62.44169616699219, "learning_rate": 1.3822019395322305e-05, "loss": 7.9435, "step": 2429 }, { "epoch": 0.4158822522676707, "grad_norm": 14.46311092376709, "learning_rate": 1.3827723901882487e-05, "loss": 1.3546, "step": 2430 }, { "epoch": 0.41605339722745166, "grad_norm": 21.584251403808594, "learning_rate": 1.383342840844267e-05, "loss": 2.3481, "step": 2431 }, { "epoch": 0.4162245421872326, "grad_norm": 28.41043472290039, "learning_rate": 1.3839132915002852e-05, "loss": 3.242, "step": 2432 }, { "epoch": 0.4163956871470135, "grad_norm": 57.48540496826172, "learning_rate": 1.3844837421563034e-05, "loss": 8.205, "step": 2433 }, { "epoch": 0.41656683210679446, "grad_norm": 20.560029983520508, "learning_rate": 1.3850541928123217e-05, "loss": 2.4152, "step": 2434 }, { "epoch": 0.4167379770665754, "grad_norm": 29.860027313232422, "learning_rate": 1.3856246434683399e-05, "loss": 2.9946, "step": 2435 }, { "epoch": 0.4169091220263563, "grad_norm": 34.29914855957031, "learning_rate": 1.3861950941243584e-05, "loss": 4.0154, "step": 2436 }, { "epoch": 0.41708026698613726, "grad_norm": 31.778980255126953, "learning_rate": 1.3867655447803766e-05, "loss": 3.279, "step": 2437 }, { "epoch": 0.4172514119459182, "grad_norm": 30.92992401123047, "learning_rate": 1.3873359954363949e-05, "loss": 3.7128, "step": 2438 }, { "epoch": 0.4174225569056991, "grad_norm": 30.067113876342773, "learning_rate": 1.387906446092413e-05, "loss": 3.3294, "step": 2439 }, { "epoch": 0.41759370186548006, "grad_norm": 78.94349670410156, "learning_rate": 1.3884768967484312e-05, "loss": 4.5151, "step": 2440 }, { "epoch": 0.417764846825261, "grad_norm": 35.60622787475586, "learning_rate": 1.3890473474044496e-05, "loss": 3.3575, "step": 2441 }, { "epoch": 0.4179359917850419, "grad_norm": 29.288429260253906, "learning_rate": 1.3896177980604678e-05, "loss": 3.0086, "step": 2442 }, { "epoch": 0.41810713674482286, "grad_norm": 29.106294631958008, "learning_rate": 1.3901882487164861e-05, "loss": 3.228, "step": 2443 }, { "epoch": 0.4182782817046038, "grad_norm": 20.533992767333984, "learning_rate": 1.3907586993725043e-05, "loss": 2.2425, "step": 2444 }, { "epoch": 0.4184494266643847, "grad_norm": 26.775163650512695, "learning_rate": 1.3913291500285224e-05, "loss": 2.6486, "step": 2445 }, { "epoch": 0.41862057162416566, "grad_norm": 23.887187957763672, "learning_rate": 1.3918996006845408e-05, "loss": 2.0254, "step": 2446 }, { "epoch": 0.4187917165839466, "grad_norm": 32.54766082763672, "learning_rate": 1.3924700513405591e-05, "loss": 4.0827, "step": 2447 }, { "epoch": 0.4189628615437275, "grad_norm": 24.691999435424805, "learning_rate": 1.3930405019965774e-05, "loss": 2.7637, "step": 2448 }, { "epoch": 0.41913400650350846, "grad_norm": 36.446842193603516, "learning_rate": 1.3936109526525956e-05, "loss": 7.2209, "step": 2449 }, { "epoch": 0.4193051514632894, "grad_norm": 24.245582580566406, "learning_rate": 1.394181403308614e-05, "loss": 2.5936, "step": 2450 }, { "epoch": 0.4194762964230703, "grad_norm": 34.520198822021484, "learning_rate": 1.3947518539646321e-05, "loss": 3.7529, "step": 2451 }, { "epoch": 0.41964744138285126, "grad_norm": 34.79539489746094, "learning_rate": 1.3953223046206503e-05, "loss": 4.0982, "step": 2452 }, { "epoch": 0.4198185863426322, "grad_norm": 6.38947057723999, "learning_rate": 1.3958927552766686e-05, "loss": 0.5817, "step": 2453 }, { "epoch": 0.4199897313024131, "grad_norm": 35.33879852294922, "learning_rate": 1.3964632059326868e-05, "loss": 3.3258, "step": 2454 }, { "epoch": 0.42016087626219406, "grad_norm": 8.833622932434082, "learning_rate": 1.3970336565887051e-05, "loss": 1.1178, "step": 2455 }, { "epoch": 0.420332021221975, "grad_norm": 25.07313346862793, "learning_rate": 1.3976041072447233e-05, "loss": 2.7209, "step": 2456 }, { "epoch": 0.4205031661817559, "grad_norm": 30.224679946899414, "learning_rate": 1.3981745579007416e-05, "loss": 3.4578, "step": 2457 }, { "epoch": 0.42067431114153686, "grad_norm": 26.421674728393555, "learning_rate": 1.3987450085567598e-05, "loss": 2.6023, "step": 2458 }, { "epoch": 0.4208454561013178, "grad_norm": 33.97099685668945, "learning_rate": 1.3993154592127781e-05, "loss": 4.066, "step": 2459 }, { "epoch": 0.4210166010610987, "grad_norm": 28.07750701904297, "learning_rate": 1.3998859098687965e-05, "loss": 3.1308, "step": 2460 }, { "epoch": 0.42118774602087966, "grad_norm": 33.50989532470703, "learning_rate": 1.4004563605248146e-05, "loss": 3.461, "step": 2461 }, { "epoch": 0.4213588909806606, "grad_norm": 16.63654136657715, "learning_rate": 1.401026811180833e-05, "loss": 1.3419, "step": 2462 }, { "epoch": 0.4215300359404415, "grad_norm": 16.42656707763672, "learning_rate": 1.4015972618368512e-05, "loss": 1.8682, "step": 2463 }, { "epoch": 0.42170118090022246, "grad_norm": 30.457616806030273, "learning_rate": 1.4021677124928693e-05, "loss": 3.1266, "step": 2464 }, { "epoch": 0.42187232586000345, "grad_norm": 20.4791202545166, "learning_rate": 1.4027381631488877e-05, "loss": 2.2995, "step": 2465 }, { "epoch": 0.4220434708197844, "grad_norm": 8.99075698852539, "learning_rate": 1.4033086138049058e-05, "loss": 1.6982, "step": 2466 }, { "epoch": 0.4222146157795653, "grad_norm": 57.64451599121094, "learning_rate": 1.4038790644609242e-05, "loss": 8.0313, "step": 2467 }, { "epoch": 0.42238576073934625, "grad_norm": 13.558103561401367, "learning_rate": 1.4044495151169423e-05, "loss": 1.2578, "step": 2468 }, { "epoch": 0.4225569056991272, "grad_norm": 21.366905212402344, "learning_rate": 1.4050199657729607e-05, "loss": 2.2006, "step": 2469 }, { "epoch": 0.4227280506589081, "grad_norm": 14.984084129333496, "learning_rate": 1.405590416428979e-05, "loss": 1.1421, "step": 2470 }, { "epoch": 0.42289919561868905, "grad_norm": 37.106781005859375, "learning_rate": 1.4061608670849972e-05, "loss": 3.448, "step": 2471 }, { "epoch": 0.42307034057847, "grad_norm": 103.56417083740234, "learning_rate": 1.4067313177410155e-05, "loss": 8.869, "step": 2472 }, { "epoch": 0.4232414855382509, "grad_norm": 34.15910339355469, "learning_rate": 1.4073017683970337e-05, "loss": 3.99, "step": 2473 }, { "epoch": 0.42341263049803185, "grad_norm": 9.371402740478516, "learning_rate": 1.407872219053052e-05, "loss": 1.7126, "step": 2474 }, { "epoch": 0.4235837754578128, "grad_norm": 5.610677719116211, "learning_rate": 1.4084426697090702e-05, "loss": 0.6139, "step": 2475 }, { "epoch": 0.4237549204175937, "grad_norm": 25.18387222290039, "learning_rate": 1.4090131203650885e-05, "loss": 3.0951, "step": 2476 }, { "epoch": 0.42392606537737465, "grad_norm": 21.81611442565918, "learning_rate": 1.4095835710211067e-05, "loss": 2.1739, "step": 2477 }, { "epoch": 0.4240972103371556, "grad_norm": 37.39387893676758, "learning_rate": 1.4101540216771249e-05, "loss": 4.5252, "step": 2478 }, { "epoch": 0.4242683552969365, "grad_norm": 5.823449611663818, "learning_rate": 1.4107244723331432e-05, "loss": 0.6128, "step": 2479 }, { "epoch": 0.42443950025671745, "grad_norm": 31.69689178466797, "learning_rate": 1.4112949229891614e-05, "loss": 2.9986, "step": 2480 }, { "epoch": 0.4246106452164984, "grad_norm": 35.987152099609375, "learning_rate": 1.4118653736451797e-05, "loss": 3.4619, "step": 2481 }, { "epoch": 0.4247817901762793, "grad_norm": 16.255069732666016, "learning_rate": 1.412435824301198e-05, "loss": 1.2687, "step": 2482 }, { "epoch": 0.42495293513606025, "grad_norm": 38.878501892089844, "learning_rate": 1.4130062749572162e-05, "loss": 4.4326, "step": 2483 }, { "epoch": 0.4251240800958412, "grad_norm": 33.7603759765625, "learning_rate": 1.4135767256132346e-05, "loss": 3.6103, "step": 2484 }, { "epoch": 0.4252952250556221, "grad_norm": 30.415058135986328, "learning_rate": 1.4141471762692527e-05, "loss": 2.9553, "step": 2485 }, { "epoch": 0.42546637001540305, "grad_norm": 18.42668914794922, "learning_rate": 1.414717626925271e-05, "loss": 1.6697, "step": 2486 }, { "epoch": 0.425637514975184, "grad_norm": 28.910137176513672, "learning_rate": 1.4152880775812892e-05, "loss": 2.7775, "step": 2487 }, { "epoch": 0.4258086599349649, "grad_norm": 24.642555236816406, "learning_rate": 1.4158585282373076e-05, "loss": 2.6428, "step": 2488 }, { "epoch": 0.42597980489474585, "grad_norm": 6.298614501953125, "learning_rate": 1.4164289788933257e-05, "loss": 0.5965, "step": 2489 }, { "epoch": 0.4261509498545268, "grad_norm": 134.2201690673828, "learning_rate": 1.4169994295493439e-05, "loss": 5.7153, "step": 2490 }, { "epoch": 0.4263220948143077, "grad_norm": 29.22636604309082, "learning_rate": 1.4175698802053622e-05, "loss": 3.1392, "step": 2491 }, { "epoch": 0.42649323977408865, "grad_norm": 29.945589065551758, "learning_rate": 1.4181403308613804e-05, "loss": 2.9141, "step": 2492 }, { "epoch": 0.4266643847338696, "grad_norm": 122.40117645263672, "learning_rate": 1.418710781517399e-05, "loss": 9.1376, "step": 2493 }, { "epoch": 0.4268355296936505, "grad_norm": 12.278093338012695, "learning_rate": 1.4192812321734171e-05, "loss": 0.8576, "step": 2494 }, { "epoch": 0.42700667465343145, "grad_norm": 29.33226776123047, "learning_rate": 1.4198516828294353e-05, "loss": 2.9443, "step": 2495 }, { "epoch": 0.4271778196132124, "grad_norm": 31.89412498474121, "learning_rate": 1.4204221334854536e-05, "loss": 3.7789, "step": 2496 }, { "epoch": 0.4273489645729933, "grad_norm": 30.404138565063477, "learning_rate": 1.4209925841414718e-05, "loss": 4.0024, "step": 2497 }, { "epoch": 0.42752010953277425, "grad_norm": 7.538527488708496, "learning_rate": 1.4215630347974901e-05, "loss": 0.6812, "step": 2498 }, { "epoch": 0.4276912544925552, "grad_norm": 194.7794952392578, "learning_rate": 1.4221334854535083e-05, "loss": 10.7557, "step": 2499 }, { "epoch": 0.4278623994523361, "grad_norm": 27.38447380065918, "learning_rate": 1.4227039361095266e-05, "loss": 3.0669, "step": 2500 }, { "epoch": 0.42803354441211705, "grad_norm": 36.52588653564453, "learning_rate": 1.4232743867655448e-05, "loss": 3.7922, "step": 2501 }, { "epoch": 0.428204689371898, "grad_norm": 14.776211738586426, "learning_rate": 1.423844837421563e-05, "loss": 1.0695, "step": 2502 }, { "epoch": 0.4283758343316789, "grad_norm": 22.516334533691406, "learning_rate": 1.4244152880775813e-05, "loss": 2.175, "step": 2503 }, { "epoch": 0.42854697929145985, "grad_norm": 31.414302825927734, "learning_rate": 1.4249857387335995e-05, "loss": 3.5488, "step": 2504 }, { "epoch": 0.4287181242512408, "grad_norm": 20.823116302490234, "learning_rate": 1.425556189389618e-05, "loss": 2.0128, "step": 2505 }, { "epoch": 0.4288892692110217, "grad_norm": 33.47979736328125, "learning_rate": 1.4261266400456361e-05, "loss": 3.6721, "step": 2506 }, { "epoch": 0.42906041417080265, "grad_norm": 33.771358489990234, "learning_rate": 1.4266970907016545e-05, "loss": 4.2083, "step": 2507 }, { "epoch": 0.4292315591305836, "grad_norm": 21.674623489379883, "learning_rate": 1.4272675413576726e-05, "loss": 1.8789, "step": 2508 }, { "epoch": 0.4294027040903645, "grad_norm": 31.44987678527832, "learning_rate": 1.4278379920136908e-05, "loss": 3.6812, "step": 2509 }, { "epoch": 0.42957384905014545, "grad_norm": 9.912192344665527, "learning_rate": 1.4284084426697091e-05, "loss": 1.2073, "step": 2510 }, { "epoch": 0.4297449940099264, "grad_norm": 26.342119216918945, "learning_rate": 1.4289788933257273e-05, "loss": 2.4193, "step": 2511 }, { "epoch": 0.4299161389697073, "grad_norm": 57.646331787109375, "learning_rate": 1.4295493439817456e-05, "loss": 8.0588, "step": 2512 }, { "epoch": 0.43008728392948825, "grad_norm": 25.247426986694336, "learning_rate": 1.4301197946377638e-05, "loss": 2.5184, "step": 2513 }, { "epoch": 0.4302584288892692, "grad_norm": 21.471519470214844, "learning_rate": 1.430690245293782e-05, "loss": 2.2334, "step": 2514 }, { "epoch": 0.4304295738490501, "grad_norm": 25.605525970458984, "learning_rate": 1.4312606959498003e-05, "loss": 2.584, "step": 2515 }, { "epoch": 0.4306007188088311, "grad_norm": 34.87372589111328, "learning_rate": 1.4318311466058187e-05, "loss": 3.7447, "step": 2516 }, { "epoch": 0.43077186376861204, "grad_norm": 28.899642944335938, "learning_rate": 1.432401597261837e-05, "loss": 2.9852, "step": 2517 }, { "epoch": 0.430943008728393, "grad_norm": 24.084014892578125, "learning_rate": 1.4329720479178552e-05, "loss": 2.5703, "step": 2518 }, { "epoch": 0.4311141536881739, "grad_norm": 13.15533447265625, "learning_rate": 1.4335424985738735e-05, "loss": 0.8629, "step": 2519 }, { "epoch": 0.43128529864795484, "grad_norm": 100.28350067138672, "learning_rate": 1.4341129492298917e-05, "loss": 5.5365, "step": 2520 }, { "epoch": 0.4314564436077358, "grad_norm": 25.63288116455078, "learning_rate": 1.4346833998859098e-05, "loss": 2.2759, "step": 2521 }, { "epoch": 0.4316275885675167, "grad_norm": 4.588881969451904, "learning_rate": 1.4352538505419282e-05, "loss": 0.5143, "step": 2522 }, { "epoch": 0.43179873352729764, "grad_norm": 31.304664611816406, "learning_rate": 1.4358243011979463e-05, "loss": 4.1463, "step": 2523 }, { "epoch": 0.4319698784870786, "grad_norm": 18.030874252319336, "learning_rate": 1.4363947518539647e-05, "loss": 1.8934, "step": 2524 }, { "epoch": 0.4321410234468595, "grad_norm": 29.338178634643555, "learning_rate": 1.4369652025099829e-05, "loss": 3.3346, "step": 2525 }, { "epoch": 0.43231216840664044, "grad_norm": 29.54951286315918, "learning_rate": 1.4375356531660012e-05, "loss": 2.695, "step": 2526 }, { "epoch": 0.4324833133664214, "grad_norm": 17.5317325592041, "learning_rate": 1.4381061038220195e-05, "loss": 1.4575, "step": 2527 }, { "epoch": 0.4326544583262023, "grad_norm": 32.96657943725586, "learning_rate": 1.4386765544780377e-05, "loss": 3.592, "step": 2528 }, { "epoch": 0.43282560328598324, "grad_norm": 19.32137107849121, "learning_rate": 1.439247005134056e-05, "loss": 1.2429, "step": 2529 }, { "epoch": 0.4329967482457642, "grad_norm": 8.846491813659668, "learning_rate": 1.4398174557900742e-05, "loss": 1.021, "step": 2530 }, { "epoch": 0.4331678932055451, "grad_norm": 4.180466651916504, "learning_rate": 1.4403879064460925e-05, "loss": 0.5108, "step": 2531 }, { "epoch": 0.43333903816532604, "grad_norm": 28.7572078704834, "learning_rate": 1.4409583571021107e-05, "loss": 2.694, "step": 2532 }, { "epoch": 0.433510183125107, "grad_norm": 34.48351287841797, "learning_rate": 1.4415288077581289e-05, "loss": 3.7674, "step": 2533 }, { "epoch": 0.4336813280848879, "grad_norm": 27.524559020996094, "learning_rate": 1.4420992584141472e-05, "loss": 2.768, "step": 2534 }, { "epoch": 0.43385247304466884, "grad_norm": 33.70855712890625, "learning_rate": 1.4426697090701654e-05, "loss": 3.1902, "step": 2535 }, { "epoch": 0.4340236180044498, "grad_norm": 30.53034210205078, "learning_rate": 1.4432401597261837e-05, "loss": 3.4593, "step": 2536 }, { "epoch": 0.4341947629642307, "grad_norm": 30.834991455078125, "learning_rate": 1.4438106103822019e-05, "loss": 3.439, "step": 2537 }, { "epoch": 0.43436590792401164, "grad_norm": 31.815725326538086, "learning_rate": 1.4443810610382202e-05, "loss": 3.683, "step": 2538 }, { "epoch": 0.4345370528837926, "grad_norm": 29.159996032714844, "learning_rate": 1.4449515116942386e-05, "loss": 2.9478, "step": 2539 }, { "epoch": 0.4347081978435735, "grad_norm": 81.45700073242188, "learning_rate": 1.4455219623502567e-05, "loss": 9.0416, "step": 2540 }, { "epoch": 0.43487934280335444, "grad_norm": 87.70926666259766, "learning_rate": 1.446092413006275e-05, "loss": 4.7629, "step": 2541 }, { "epoch": 0.4350504877631354, "grad_norm": 9.934538841247559, "learning_rate": 1.4466628636622932e-05, "loss": 1.8243, "step": 2542 }, { "epoch": 0.4352216327229163, "grad_norm": 9.613969802856445, "learning_rate": 1.4472333143183116e-05, "loss": 1.031, "step": 2543 }, { "epoch": 0.43539277768269724, "grad_norm": 45.231689453125, "learning_rate": 1.4478037649743297e-05, "loss": 7.5705, "step": 2544 }, { "epoch": 0.4355639226424782, "grad_norm": 9.317858695983887, "learning_rate": 1.4483742156303479e-05, "loss": 0.6934, "step": 2545 }, { "epoch": 0.4357350676022591, "grad_norm": 35.789794921875, "learning_rate": 1.4489446662863663e-05, "loss": 4.1345, "step": 2546 }, { "epoch": 0.43590621256204004, "grad_norm": 11.596151351928711, "learning_rate": 1.4495151169423844e-05, "loss": 1.3557, "step": 2547 }, { "epoch": 0.436077357521821, "grad_norm": 4.43747091293335, "learning_rate": 1.4500855675984028e-05, "loss": 0.4991, "step": 2548 }, { "epoch": 0.4362485024816019, "grad_norm": 29.71784019470215, "learning_rate": 1.450656018254421e-05, "loss": 3.192, "step": 2549 }, { "epoch": 0.43641964744138284, "grad_norm": 44.21783447265625, "learning_rate": 1.4512264689104394e-05, "loss": 3.6461, "step": 2550 }, { "epoch": 0.4365907924011638, "grad_norm": 27.61203384399414, "learning_rate": 1.4517969195664576e-05, "loss": 3.3135, "step": 2551 }, { "epoch": 0.4367619373609447, "grad_norm": 23.84665298461914, "learning_rate": 1.4523673702224758e-05, "loss": 2.5268, "step": 2552 }, { "epoch": 0.43693308232072564, "grad_norm": 29.368938446044922, "learning_rate": 1.4529378208784941e-05, "loss": 2.7211, "step": 2553 }, { "epoch": 0.4371042272805066, "grad_norm": 36.08073806762695, "learning_rate": 1.4535082715345123e-05, "loss": 3.7964, "step": 2554 }, { "epoch": 0.4372753722402875, "grad_norm": 32.68186950683594, "learning_rate": 1.4540787221905306e-05, "loss": 3.6173, "step": 2555 }, { "epoch": 0.43744651720006844, "grad_norm": 34.985904693603516, "learning_rate": 1.4546491728465488e-05, "loss": 4.2656, "step": 2556 }, { "epoch": 0.4376176621598494, "grad_norm": 129.27252197265625, "learning_rate": 1.4552196235025671e-05, "loss": 8.7164, "step": 2557 }, { "epoch": 0.4377888071196303, "grad_norm": 29.99295997619629, "learning_rate": 1.4557900741585853e-05, "loss": 3.6185, "step": 2558 }, { "epoch": 0.43795995207941124, "grad_norm": 28.371896743774414, "learning_rate": 1.4563605248146035e-05, "loss": 2.9256, "step": 2559 }, { "epoch": 0.4381310970391922, "grad_norm": 8.728231430053711, "learning_rate": 1.4569309754706218e-05, "loss": 0.9915, "step": 2560 }, { "epoch": 0.4383022419989731, "grad_norm": 31.164567947387695, "learning_rate": 1.45750142612664e-05, "loss": 3.8704, "step": 2561 }, { "epoch": 0.43847338695875404, "grad_norm": 32.18178176879883, "learning_rate": 1.4580718767826585e-05, "loss": 4.2259, "step": 2562 }, { "epoch": 0.438644531918535, "grad_norm": 25.499011993408203, "learning_rate": 1.4586423274386766e-05, "loss": 2.6854, "step": 2563 }, { "epoch": 0.4388156768783159, "grad_norm": 34.26057815551758, "learning_rate": 1.4592127780946948e-05, "loss": 3.4689, "step": 2564 }, { "epoch": 0.43898682183809684, "grad_norm": 33.73667526245117, "learning_rate": 1.4597832287507131e-05, "loss": 4.3474, "step": 2565 }, { "epoch": 0.43915796679787783, "grad_norm": 32.83565902709961, "learning_rate": 1.4603536794067313e-05, "loss": 3.475, "step": 2566 }, { "epoch": 0.43932911175765876, "grad_norm": 3.187453269958496, "learning_rate": 1.4609241300627497e-05, "loss": 0.4736, "step": 2567 }, { "epoch": 0.4395002567174397, "grad_norm": 19.98860740661621, "learning_rate": 1.4614945807187678e-05, "loss": 2.0086, "step": 2568 }, { "epoch": 0.43967140167722063, "grad_norm": 27.594697952270508, "learning_rate": 1.4620650313747862e-05, "loss": 3.2803, "step": 2569 }, { "epoch": 0.43984254663700156, "grad_norm": 3.9966156482696533, "learning_rate": 1.4626354820308043e-05, "loss": 0.568, "step": 2570 }, { "epoch": 0.4400136915967825, "grad_norm": 5.779835224151611, "learning_rate": 1.4632059326868225e-05, "loss": 0.648, "step": 2571 }, { "epoch": 0.44018483655656343, "grad_norm": 39.23750305175781, "learning_rate": 1.4637763833428408e-05, "loss": 7.0941, "step": 2572 }, { "epoch": 0.44035598151634436, "grad_norm": 28.068208694458008, "learning_rate": 1.4643468339988592e-05, "loss": 3.0381, "step": 2573 }, { "epoch": 0.4405271264761253, "grad_norm": 25.783096313476562, "learning_rate": 1.4649172846548775e-05, "loss": 3.0511, "step": 2574 }, { "epoch": 0.44069827143590623, "grad_norm": 29.101238250732422, "learning_rate": 1.4654877353108957e-05, "loss": 2.9123, "step": 2575 }, { "epoch": 0.44086941639568716, "grad_norm": 14.171677589416504, "learning_rate": 1.466058185966914e-05, "loss": 1.0138, "step": 2576 }, { "epoch": 0.4410405613554681, "grad_norm": 27.117347717285156, "learning_rate": 1.4666286366229322e-05, "loss": 3.1994, "step": 2577 }, { "epoch": 0.44121170631524903, "grad_norm": 29.480358123779297, "learning_rate": 1.4671990872789504e-05, "loss": 3.4766, "step": 2578 }, { "epoch": 0.44138285127502996, "grad_norm": 4.977560997009277, "learning_rate": 1.4677695379349687e-05, "loss": 0.7032, "step": 2579 }, { "epoch": 0.4415539962348109, "grad_norm": 31.941097259521484, "learning_rate": 1.4683399885909869e-05, "loss": 2.931, "step": 2580 }, { "epoch": 0.44172514119459183, "grad_norm": 136.83563232421875, "learning_rate": 1.4689104392470052e-05, "loss": 5.0846, "step": 2581 }, { "epoch": 0.44189628615437276, "grad_norm": 9.305535316467285, "learning_rate": 1.4694808899030234e-05, "loss": 0.911, "step": 2582 }, { "epoch": 0.4420674311141537, "grad_norm": 18.890281677246094, "learning_rate": 1.4700513405590415e-05, "loss": 1.4747, "step": 2583 }, { "epoch": 0.44223857607393463, "grad_norm": 46.04558563232422, "learning_rate": 1.4706217912150599e-05, "loss": 7.673, "step": 2584 }, { "epoch": 0.44240972103371556, "grad_norm": 24.37186050415039, "learning_rate": 1.4711922418710782e-05, "loss": 2.3299, "step": 2585 }, { "epoch": 0.4425808659934965, "grad_norm": 38.21072006225586, "learning_rate": 1.4717626925270965e-05, "loss": 7.1275, "step": 2586 }, { "epoch": 0.44275201095327743, "grad_norm": 77.47330474853516, "learning_rate": 1.4723331431831147e-05, "loss": 4.7878, "step": 2587 }, { "epoch": 0.44292315591305836, "grad_norm": 37.18149185180664, "learning_rate": 1.472903593839133e-05, "loss": 3.7571, "step": 2588 }, { "epoch": 0.4430943008728393, "grad_norm": 3.5262255668640137, "learning_rate": 1.4734740444951512e-05, "loss": 0.5224, "step": 2589 }, { "epoch": 0.44326544583262023, "grad_norm": 11.645423889160156, "learning_rate": 1.4740444951511694e-05, "loss": 1.0846, "step": 2590 }, { "epoch": 0.44343659079240116, "grad_norm": 6.892613410949707, "learning_rate": 1.4746149458071877e-05, "loss": 0.54, "step": 2591 }, { "epoch": 0.4436077357521821, "grad_norm": 8.752089500427246, "learning_rate": 1.4751853964632059e-05, "loss": 0.6267, "step": 2592 }, { "epoch": 0.44377888071196303, "grad_norm": 23.974550247192383, "learning_rate": 1.4757558471192242e-05, "loss": 2.6998, "step": 2593 }, { "epoch": 0.44395002567174396, "grad_norm": 7.374299049377441, "learning_rate": 1.4763262977752424e-05, "loss": 0.8349, "step": 2594 }, { "epoch": 0.4441211706315249, "grad_norm": 23.21881103515625, "learning_rate": 1.4768967484312606e-05, "loss": 2.7586, "step": 2595 }, { "epoch": 0.44429231559130583, "grad_norm": 170.6956024169922, "learning_rate": 1.477467199087279e-05, "loss": 9.1929, "step": 2596 }, { "epoch": 0.44446346055108676, "grad_norm": 20.11836814880371, "learning_rate": 1.4780376497432972e-05, "loss": 2.3475, "step": 2597 }, { "epoch": 0.4446346055108677, "grad_norm": 24.03493881225586, "learning_rate": 1.4786081003993156e-05, "loss": 2.9464, "step": 2598 }, { "epoch": 0.44480575047064863, "grad_norm": 27.76041603088379, "learning_rate": 1.4791785510553338e-05, "loss": 2.7217, "step": 2599 }, { "epoch": 0.44497689543042956, "grad_norm": 17.792516708374023, "learning_rate": 1.4797490017113521e-05, "loss": 1.6209, "step": 2600 }, { "epoch": 0.4451480403902105, "grad_norm": 34.788169860839844, "learning_rate": 1.4803194523673703e-05, "loss": 2.8761, "step": 2601 }, { "epoch": 0.44531918534999143, "grad_norm": 18.824007034301758, "learning_rate": 1.4808899030233884e-05, "loss": 2.5789, "step": 2602 }, { "epoch": 0.44549033030977236, "grad_norm": 19.51264190673828, "learning_rate": 1.4814603536794068e-05, "loss": 2.2163, "step": 2603 }, { "epoch": 0.4456614752695533, "grad_norm": 31.428625106811523, "learning_rate": 1.482030804335425e-05, "loss": 2.9496, "step": 2604 }, { "epoch": 0.44583262022933423, "grad_norm": 13.012333869934082, "learning_rate": 1.4826012549914433e-05, "loss": 0.8159, "step": 2605 }, { "epoch": 0.44600376518911516, "grad_norm": 23.477638244628906, "learning_rate": 1.4831717056474614e-05, "loss": 2.4937, "step": 2606 }, { "epoch": 0.4461749101488961, "grad_norm": 35.8111572265625, "learning_rate": 1.48374215630348e-05, "loss": 3.2833, "step": 2607 }, { "epoch": 0.44634605510867703, "grad_norm": 32.99673080444336, "learning_rate": 1.4843126069594981e-05, "loss": 3.5874, "step": 2608 }, { "epoch": 0.44651720006845796, "grad_norm": 3.853698253631592, "learning_rate": 1.4848830576155163e-05, "loss": 0.4709, "step": 2609 }, { "epoch": 0.4466883450282389, "grad_norm": 27.9306583404541, "learning_rate": 1.4854535082715346e-05, "loss": 2.6247, "step": 2610 }, { "epoch": 0.44685948998801983, "grad_norm": 11.854992866516113, "learning_rate": 1.4860239589275528e-05, "loss": 0.9967, "step": 2611 }, { "epoch": 0.44703063494780076, "grad_norm": 49.759117126464844, "learning_rate": 1.4865944095835711e-05, "loss": 7.1995, "step": 2612 }, { "epoch": 0.4472017799075817, "grad_norm": 31.380281448364258, "learning_rate": 1.4871648602395893e-05, "loss": 2.7301, "step": 2613 }, { "epoch": 0.44737292486736263, "grad_norm": 29.84979820251465, "learning_rate": 1.4877353108956075e-05, "loss": 3.1099, "step": 2614 }, { "epoch": 0.44754406982714356, "grad_norm": 13.841278076171875, "learning_rate": 1.4883057615516258e-05, "loss": 1.0569, "step": 2615 }, { "epoch": 0.4477152147869245, "grad_norm": 28.414051055908203, "learning_rate": 1.488876212207644e-05, "loss": 2.6359, "step": 2616 }, { "epoch": 0.4478863597467055, "grad_norm": 29.42824363708496, "learning_rate": 1.4894466628636623e-05, "loss": 3.3203, "step": 2617 }, { "epoch": 0.4480575047064864, "grad_norm": 33.065799713134766, "learning_rate": 1.4900171135196805e-05, "loss": 4.0124, "step": 2618 }, { "epoch": 0.44822864966626735, "grad_norm": 9.898391723632812, "learning_rate": 1.490587564175699e-05, "loss": 0.9107, "step": 2619 }, { "epoch": 0.4483997946260483, "grad_norm": 23.923398971557617, "learning_rate": 1.4911580148317172e-05, "loss": 2.5028, "step": 2620 }, { "epoch": 0.4485709395858292, "grad_norm": 25.825178146362305, "learning_rate": 1.4917284654877353e-05, "loss": 2.3647, "step": 2621 }, { "epoch": 0.44874208454561015, "grad_norm": 24.46117401123047, "learning_rate": 1.4922989161437537e-05, "loss": 2.66, "step": 2622 }, { "epoch": 0.4489132295053911, "grad_norm": 19.926624298095703, "learning_rate": 1.4928693667997718e-05, "loss": 1.771, "step": 2623 }, { "epoch": 0.449084374465172, "grad_norm": 107.68805694580078, "learning_rate": 1.4934398174557902e-05, "loss": 9.219, "step": 2624 }, { "epoch": 0.44925551942495295, "grad_norm": 18.121204376220703, "learning_rate": 1.4940102681118083e-05, "loss": 1.3726, "step": 2625 }, { "epoch": 0.4494266643847339, "grad_norm": 27.648178100585938, "learning_rate": 1.4945807187678267e-05, "loss": 2.6469, "step": 2626 }, { "epoch": 0.4495978093445148, "grad_norm": 28.146556854248047, "learning_rate": 1.4951511694238448e-05, "loss": 2.8926, "step": 2627 }, { "epoch": 0.44976895430429575, "grad_norm": 52.536190032958984, "learning_rate": 1.495721620079863e-05, "loss": 7.5002, "step": 2628 }, { "epoch": 0.4499400992640767, "grad_norm": 24.027881622314453, "learning_rate": 1.4962920707358814e-05, "loss": 2.3452, "step": 2629 }, { "epoch": 0.4501112442238576, "grad_norm": 34.977684020996094, "learning_rate": 1.4968625213918997e-05, "loss": 3.9508, "step": 2630 }, { "epoch": 0.45028238918363855, "grad_norm": 30.991193771362305, "learning_rate": 1.497432972047918e-05, "loss": 3.6064, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_nli-pairs_loss": 2.871744394302368, "eval_nli-pairs_runtime": 4.2947, "eval_nli-pairs_samples_per_second": 46.569, "eval_nli-pairs_steps_per_second": 1.63, "eval_sts-test_pearson_cosine": 0.7195428557259504, "eval_sts-test_pearson_dot": 0.6098064793689061, "eval_sts-test_pearson_euclidean": 0.7205423612792191, "eval_sts-test_pearson_manhattan": 0.7293110123887395, "eval_sts-test_pearson_max": 0.7293110123887395, "eval_sts-test_spearman_cosine": 0.6966954300008318, "eval_sts-test_spearman_dot": 0.5822364450229315, "eval_sts-test_spearman_euclidean": 0.7004689124572796, "eval_sts-test_spearman_manhattan": 0.7099498051685355, "eval_sts-test_spearman_max": 0.7099498051685355, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_vitaminc-pairs_loss": 1.8629615306854248, "eval_vitaminc-pairs_runtime": 2.7342, "eval_vitaminc-pairs_samples_per_second": 73.148, "eval_vitaminc-pairs_steps_per_second": 2.56, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_qnli-contrastive_loss": 5.418925762176514, "eval_qnli-contrastive_runtime": 0.6359, "eval_qnli-contrastive_samples_per_second": 314.496, "eval_qnli-contrastive_steps_per_second": 11.007, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_scitail-pairs-qa_loss": 0.4216327965259552, "eval_scitail-pairs-qa_runtime": 1.6135, "eval_scitail-pairs-qa_samples_per_second": 123.956, "eval_scitail-pairs-qa_steps_per_second": 4.338, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_scitail-pairs-pos_loss": 1.3018670082092285, "eval_scitail-pairs-pos_runtime": 2.6103, "eval_scitail-pairs-pos_samples_per_second": 76.619, "eval_scitail-pairs-pos_steps_per_second": 2.682, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_xsum-pairs_loss": 1.584064245223999, "eval_xsum-pairs_runtime": 2.6388, "eval_xsum-pairs_samples_per_second": 66.317, "eval_xsum-pairs_steps_per_second": 2.274, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_compression-pairs_loss": 0.7760603427886963, "eval_compression-pairs_runtime": 0.5146, "eval_compression-pairs_samples_per_second": 388.623, "eval_compression-pairs_steps_per_second": 13.602, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_sciq_pairs_loss": 5.851566314697266, "eval_sciq_pairs_runtime": 9.2089, "eval_sciq_pairs_samples_per_second": 21.718, "eval_sciq_pairs_steps_per_second": 0.76, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_qasc_pairs_loss": 7.442629814147949, "eval_qasc_pairs_runtime": 2.6477, "eval_qasc_pairs_samples_per_second": 75.537, "eval_qasc_pairs_steps_per_second": 2.644, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_openbookqa_pairs_loss": 4.049252033233643, "eval_openbookqa_pairs_runtime": 0.6399, "eval_openbookqa_pairs_samples_per_second": 107.834, "eval_openbookqa_pairs_steps_per_second": 4.688, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_msmarco_pairs_loss": 2.6957242488861084, "eval_msmarco_pairs_runtime": 3.9586, "eval_msmarco_pairs_samples_per_second": 50.523, "eval_msmarco_pairs_steps_per_second": 1.768, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_nq_pairs_loss": 3.332510471343994, "eval_nq_pairs_runtime": 8.6125, "eval_nq_pairs_samples_per_second": 23.222, "eval_nq_pairs_steps_per_second": 0.813, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_trivia_pairs_loss": 3.298595905303955, "eval_trivia_pairs_runtime": 12.8335, "eval_trivia_pairs_samples_per_second": 15.584, "eval_trivia_pairs_steps_per_second": 0.545, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_quora_pairs_loss": 0.6931056380271912, "eval_quora_pairs_runtime": 1.5975, "eval_quora_pairs_samples_per_second": 125.194, "eval_quora_pairs_steps_per_second": 4.382, "step": 2631 }, { "epoch": 0.45028238918363855, "eval_gooaq_pairs_loss": 2.1408634185791016, "eval_gooaq_pairs_runtime": 2.6505, "eval_gooaq_pairs_samples_per_second": 75.457, "eval_gooaq_pairs_steps_per_second": 2.641, "step": 2631 }, { "epoch": 0.4504535341434195, "grad_norm": 25.071407318115234, "learning_rate": 1.4980034227039362e-05, "loss": 2.3902, "step": 2632 }, { "epoch": 0.4506246791032004, "grad_norm": 14.988327026367188, "learning_rate": 1.4985738733599544e-05, "loss": 1.1409, "step": 2633 }, { "epoch": 0.45079582406298135, "grad_norm": 26.867197036743164, "learning_rate": 1.4991443240159727e-05, "loss": 2.932, "step": 2634 }, { "epoch": 0.4509669690227623, "grad_norm": 36.01612091064453, "learning_rate": 1.4997147746719909e-05, "loss": 4.7158, "step": 2635 }, { "epoch": 0.4511381139825432, "grad_norm": 16.741594314575195, "learning_rate": 1.5002852253280092e-05, "loss": 1.3786, "step": 2636 }, { "epoch": 0.45130925894232415, "grad_norm": 27.737268447875977, "learning_rate": 1.5008556759840274e-05, "loss": 3.3069, "step": 2637 }, { "epoch": 0.4514804039021051, "grad_norm": 12.152483940124512, "learning_rate": 1.5014261266400457e-05, "loss": 1.0332, "step": 2638 }, { "epoch": 0.451651548861886, "grad_norm": 12.2247314453125, "learning_rate": 1.5019965772960639e-05, "loss": 0.825, "step": 2639 }, { "epoch": 0.45182269382166695, "grad_norm": 10.578752517700195, "learning_rate": 1.502567027952082e-05, "loss": 1.097, "step": 2640 }, { "epoch": 0.4519938387814479, "grad_norm": 27.834949493408203, "learning_rate": 1.5031374786081004e-05, "loss": 3.3057, "step": 2641 }, { "epoch": 0.4521649837412288, "grad_norm": 31.40846824645996, "learning_rate": 1.5037079292641186e-05, "loss": 3.5691, "step": 2642 }, { "epoch": 0.45233612870100975, "grad_norm": 37.22605514526367, "learning_rate": 1.5042783799201369e-05, "loss": 3.8636, "step": 2643 }, { "epoch": 0.4525072736607907, "grad_norm": 10.362072944641113, "learning_rate": 1.504848830576155e-05, "loss": 0.9574, "step": 2644 }, { "epoch": 0.4526784186205716, "grad_norm": 27.246967315673828, "learning_rate": 1.5054192812321734e-05, "loss": 2.6717, "step": 2645 }, { "epoch": 0.45284956358035255, "grad_norm": 17.54155921936035, "learning_rate": 1.5059897318881916e-05, "loss": 1.1453, "step": 2646 }, { "epoch": 0.4530207085401335, "grad_norm": 27.662446975708008, "learning_rate": 1.50656018254421e-05, "loss": 2.3804, "step": 2647 }, { "epoch": 0.4531918534999144, "grad_norm": 94.5572738647461, "learning_rate": 1.5071306332002284e-05, "loss": 8.0449, "step": 2648 }, { "epoch": 0.45336299845969535, "grad_norm": 30.91036605834961, "learning_rate": 1.5077010838562466e-05, "loss": 3.4507, "step": 2649 }, { "epoch": 0.4535341434194763, "grad_norm": 24.88844108581543, "learning_rate": 1.508271534512265e-05, "loss": 2.4537, "step": 2650 }, { "epoch": 0.4537052883792572, "grad_norm": 36.03679656982422, "learning_rate": 1.5088419851682831e-05, "loss": 7.241, "step": 2651 }, { "epoch": 0.45387643333903815, "grad_norm": 25.070192337036133, "learning_rate": 1.5094124358243013e-05, "loss": 2.5557, "step": 2652 }, { "epoch": 0.4540475782988191, "grad_norm": 7.677706718444824, "learning_rate": 1.5099828864803196e-05, "loss": 0.5579, "step": 2653 }, { "epoch": 0.4542187232586, "grad_norm": 30.037338256835938, "learning_rate": 1.5105533371363378e-05, "loss": 3.2643, "step": 2654 }, { "epoch": 0.45438986821838095, "grad_norm": 25.368310928344727, "learning_rate": 1.5111237877923561e-05, "loss": 2.722, "step": 2655 }, { "epoch": 0.4545610131781619, "grad_norm": 36.92127990722656, "learning_rate": 1.5116942384483743e-05, "loss": 4.7207, "step": 2656 }, { "epoch": 0.4547321581379428, "grad_norm": 22.686552047729492, "learning_rate": 1.5122646891043926e-05, "loss": 2.1942, "step": 2657 }, { "epoch": 0.45490330309772375, "grad_norm": 53.640262603759766, "learning_rate": 1.5128351397604108e-05, "loss": 6.8632, "step": 2658 }, { "epoch": 0.4550744480575047, "grad_norm": 24.542247772216797, "learning_rate": 1.513405590416429e-05, "loss": 2.4562, "step": 2659 }, { "epoch": 0.4552455930172856, "grad_norm": 5.353951930999756, "learning_rate": 1.5139760410724473e-05, "loss": 0.5523, "step": 2660 }, { "epoch": 0.45541673797706655, "grad_norm": 32.79592514038086, "learning_rate": 1.5145464917284655e-05, "loss": 3.4424, "step": 2661 }, { "epoch": 0.4555878829368475, "grad_norm": 35.7240104675293, "learning_rate": 1.5151169423844838e-05, "loss": 3.5062, "step": 2662 }, { "epoch": 0.4557590278966284, "grad_norm": 30.997047424316406, "learning_rate": 1.515687393040502e-05, "loss": 3.9807, "step": 2663 }, { "epoch": 0.45593017285640935, "grad_norm": 41.52260208129883, "learning_rate": 1.5162578436965201e-05, "loss": 4.4682, "step": 2664 }, { "epoch": 0.4561013178161903, "grad_norm": 33.410797119140625, "learning_rate": 1.5168282943525385e-05, "loss": 3.3602, "step": 2665 }, { "epoch": 0.4562724627759712, "grad_norm": 22.308074951171875, "learning_rate": 1.5173987450085566e-05, "loss": 2.5111, "step": 2666 }, { "epoch": 0.45644360773575215, "grad_norm": 21.073827743530273, "learning_rate": 1.517969195664575e-05, "loss": 2.1332, "step": 2667 }, { "epoch": 0.45661475269553314, "grad_norm": 36.2976188659668, "learning_rate": 1.5185396463205931e-05, "loss": 4.8541, "step": 2668 }, { "epoch": 0.4567858976553141, "grad_norm": 37.76522445678711, "learning_rate": 1.5191100969766115e-05, "loss": 6.96, "step": 2669 }, { "epoch": 0.456957042615095, "grad_norm": 29.864612579345703, "learning_rate": 1.51968054763263e-05, "loss": 2.896, "step": 2670 }, { "epoch": 0.45712818757487594, "grad_norm": 22.04704475402832, "learning_rate": 1.5202509982886482e-05, "loss": 2.6772, "step": 2671 }, { "epoch": 0.4572993325346569, "grad_norm": 19.153793334960938, "learning_rate": 1.5208214489446665e-05, "loss": 1.7357, "step": 2672 }, { "epoch": 0.4574704774944378, "grad_norm": 30.495540618896484, "learning_rate": 1.5213918996006847e-05, "loss": 3.1067, "step": 2673 }, { "epoch": 0.45764162245421874, "grad_norm": 12.724396705627441, "learning_rate": 1.521962350256703e-05, "loss": 0.9931, "step": 2674 }, { "epoch": 0.4578127674139997, "grad_norm": 6.2942399978637695, "learning_rate": 1.5225328009127212e-05, "loss": 0.5454, "step": 2675 }, { "epoch": 0.4579839123737806, "grad_norm": 10.231136322021484, "learning_rate": 1.5231032515687395e-05, "loss": 1.696, "step": 2676 }, { "epoch": 0.45815505733356154, "grad_norm": 152.32469177246094, "learning_rate": 1.5236737022247577e-05, "loss": 8.8958, "step": 2677 }, { "epoch": 0.4583262022933425, "grad_norm": 38.06270980834961, "learning_rate": 1.5242441528807758e-05, "loss": 3.9409, "step": 2678 }, { "epoch": 0.4584973472531234, "grad_norm": 25.77074432373047, "learning_rate": 1.5248146035367942e-05, "loss": 2.6594, "step": 2679 }, { "epoch": 0.45866849221290434, "grad_norm": 29.309284210205078, "learning_rate": 1.5253850541928123e-05, "loss": 3.3099, "step": 2680 }, { "epoch": 0.4588396371726853, "grad_norm": 25.866558074951172, "learning_rate": 1.5259555048488305e-05, "loss": 3.4843, "step": 2681 }, { "epoch": 0.4590107821324662, "grad_norm": 23.736045837402344, "learning_rate": 1.526525955504849e-05, "loss": 2.5762, "step": 2682 }, { "epoch": 0.45918192709224714, "grad_norm": 24.48654556274414, "learning_rate": 1.5270964061608672e-05, "loss": 2.4442, "step": 2683 }, { "epoch": 0.4593530720520281, "grad_norm": 9.48880386352539, "learning_rate": 1.5276668568168852e-05, "loss": 1.0744, "step": 2684 }, { "epoch": 0.459524217011809, "grad_norm": 5.050061225891113, "learning_rate": 1.5282373074729035e-05, "loss": 0.4942, "step": 2685 }, { "epoch": 0.45969536197158994, "grad_norm": 22.153188705444336, "learning_rate": 1.528807758128922e-05, "loss": 2.3914, "step": 2686 }, { "epoch": 0.4598665069313709, "grad_norm": 3.6951212882995605, "learning_rate": 1.5293782087849402e-05, "loss": 0.4808, "step": 2687 }, { "epoch": 0.4600376518911518, "grad_norm": 11.074764251708984, "learning_rate": 1.5299486594409582e-05, "loss": 1.7231, "step": 2688 }, { "epoch": 0.46020879685093274, "grad_norm": 19.63498306274414, "learning_rate": 1.5305191100969765e-05, "loss": 2.1931, "step": 2689 }, { "epoch": 0.4603799418107137, "grad_norm": 30.376020431518555, "learning_rate": 1.531089560752995e-05, "loss": 3.2142, "step": 2690 }, { "epoch": 0.4605510867704946, "grad_norm": 39.05623245239258, "learning_rate": 1.531660011409013e-05, "loss": 4.3445, "step": 2691 }, { "epoch": 0.46072223173027554, "grad_norm": 34.95427703857422, "learning_rate": 1.5322304620650312e-05, "loss": 3.5087, "step": 2692 }, { "epoch": 0.4608933766900565, "grad_norm": 24.339468002319336, "learning_rate": 1.53280091272105e-05, "loss": 2.139, "step": 2693 }, { "epoch": 0.4610645216498374, "grad_norm": 36.85024642944336, "learning_rate": 1.5333713633770682e-05, "loss": 4.5667, "step": 2694 }, { "epoch": 0.46123566660961834, "grad_norm": 121.48307037353516, "learning_rate": 1.5339418140330862e-05, "loss": 5.9012, "step": 2695 }, { "epoch": 0.4614068115693993, "grad_norm": 7.473188877105713, "learning_rate": 1.5345122646891046e-05, "loss": 0.9301, "step": 2696 }, { "epoch": 0.4615779565291802, "grad_norm": 27.48497200012207, "learning_rate": 1.535082715345123e-05, "loss": 3.0351, "step": 2697 }, { "epoch": 0.46174910148896114, "grad_norm": 22.619394302368164, "learning_rate": 1.535653166001141e-05, "loss": 2.4788, "step": 2698 }, { "epoch": 0.4619202464487421, "grad_norm": 25.0198974609375, "learning_rate": 1.5362236166571592e-05, "loss": 2.3989, "step": 2699 }, { "epoch": 0.462091391408523, "grad_norm": 23.36564064025879, "learning_rate": 1.5367940673131776e-05, "loss": 2.4179, "step": 2700 }, { "epoch": 0.46226253636830394, "grad_norm": 29.04068946838379, "learning_rate": 1.537364517969196e-05, "loss": 3.1229, "step": 2701 }, { "epoch": 0.4624336813280849, "grad_norm": 27.629722595214844, "learning_rate": 1.537934968625214e-05, "loss": 2.7618, "step": 2702 }, { "epoch": 0.4626048262878658, "grad_norm": 23.081079483032227, "learning_rate": 1.5385054192812323e-05, "loss": 2.8201, "step": 2703 }, { "epoch": 0.46277597124764674, "grad_norm": 26.009172439575195, "learning_rate": 1.5390758699372506e-05, "loss": 3.1322, "step": 2704 }, { "epoch": 0.4629471162074277, "grad_norm": 18.447147369384766, "learning_rate": 1.5396463205932686e-05, "loss": 1.2356, "step": 2705 }, { "epoch": 0.4631182611672086, "grad_norm": 22.773012161254883, "learning_rate": 1.540216771249287e-05, "loss": 2.6551, "step": 2706 }, { "epoch": 0.46328940612698954, "grad_norm": 32.899314880371094, "learning_rate": 1.5407872219053053e-05, "loss": 3.7763, "step": 2707 }, { "epoch": 0.4634605510867705, "grad_norm": 97.4777603149414, "learning_rate": 1.5413576725613233e-05, "loss": 4.5767, "step": 2708 }, { "epoch": 0.4636316960465514, "grad_norm": 41.41079330444336, "learning_rate": 1.5419281232173416e-05, "loss": 7.352, "step": 2709 }, { "epoch": 0.46380284100633234, "grad_norm": 24.83094024658203, "learning_rate": 1.54249857387336e-05, "loss": 2.836, "step": 2710 }, { "epoch": 0.4639739859661133, "grad_norm": 12.101001739501953, "learning_rate": 1.5430690245293783e-05, "loss": 0.9624, "step": 2711 }, { "epoch": 0.4641451309258942, "grad_norm": 24.289182662963867, "learning_rate": 1.5436394751853963e-05, "loss": 2.3101, "step": 2712 }, { "epoch": 0.46431627588567514, "grad_norm": 23.911334991455078, "learning_rate": 1.5442099258414146e-05, "loss": 2.4969, "step": 2713 }, { "epoch": 0.4644874208454561, "grad_norm": 35.51081085205078, "learning_rate": 1.544780376497433e-05, "loss": 3.353, "step": 2714 }, { "epoch": 0.464658565805237, "grad_norm": 21.24627113342285, "learning_rate": 1.545350827153451e-05, "loss": 2.5466, "step": 2715 }, { "epoch": 0.46482971076501794, "grad_norm": 30.70880126953125, "learning_rate": 1.5459212778094696e-05, "loss": 3.8228, "step": 2716 }, { "epoch": 0.4650008557247989, "grad_norm": 25.956119537353516, "learning_rate": 1.546491728465488e-05, "loss": 2.6475, "step": 2717 }, { "epoch": 0.4651720006845798, "grad_norm": 37.32086944580078, "learning_rate": 1.5470621791215063e-05, "loss": 3.6192, "step": 2718 }, { "epoch": 0.4653431456443608, "grad_norm": 25.61843490600586, "learning_rate": 1.5476326297775243e-05, "loss": 2.336, "step": 2719 }, { "epoch": 0.46551429060414173, "grad_norm": 31.5511531829834, "learning_rate": 1.5482030804335426e-05, "loss": 3.1832, "step": 2720 }, { "epoch": 0.46568543556392267, "grad_norm": 35.96617889404297, "learning_rate": 1.548773531089561e-05, "loss": 4.2684, "step": 2721 }, { "epoch": 0.4658565805237036, "grad_norm": 12.214024543762207, "learning_rate": 1.549343981745579e-05, "loss": 0.8686, "step": 2722 }, { "epoch": 0.46602772548348453, "grad_norm": 3.517146110534668, "learning_rate": 1.5499144324015973e-05, "loss": 0.4999, "step": 2723 }, { "epoch": 0.46619887044326547, "grad_norm": 27.56136703491211, "learning_rate": 1.5504848830576157e-05, "loss": 2.7019, "step": 2724 }, { "epoch": 0.4663700154030464, "grad_norm": 4.812444686889648, "learning_rate": 1.551055333713634e-05, "loss": 0.5107, "step": 2725 }, { "epoch": 0.46654116036282733, "grad_norm": 30.523237228393555, "learning_rate": 1.551625784369652e-05, "loss": 3.2109, "step": 2726 }, { "epoch": 0.46671230532260827, "grad_norm": 28.326934814453125, "learning_rate": 1.5521962350256703e-05, "loss": 3.2289, "step": 2727 }, { "epoch": 0.4668834502823892, "grad_norm": 34.37868118286133, "learning_rate": 1.5527666856816887e-05, "loss": 3.6814, "step": 2728 }, { "epoch": 0.46705459524217013, "grad_norm": 30.16160774230957, "learning_rate": 1.5533371363377067e-05, "loss": 3.4049, "step": 2729 }, { "epoch": 0.46722574020195107, "grad_norm": 4.218698024749756, "learning_rate": 1.553907586993725e-05, "loss": 0.4987, "step": 2730 }, { "epoch": 0.467396885161732, "grad_norm": 23.180875778198242, "learning_rate": 1.5544780376497433e-05, "loss": 2.2238, "step": 2731 }, { "epoch": 0.46756803012151293, "grad_norm": 25.21503257751465, "learning_rate": 1.5550484883057617e-05, "loss": 2.4819, "step": 2732 }, { "epoch": 0.46773917508129387, "grad_norm": 30.37474822998047, "learning_rate": 1.5556189389617797e-05, "loss": 3.2935, "step": 2733 }, { "epoch": 0.4679103200410748, "grad_norm": 16.8712100982666, "learning_rate": 1.556189389617798e-05, "loss": 1.0892, "step": 2734 }, { "epoch": 0.46808146500085573, "grad_norm": 23.52683448791504, "learning_rate": 1.5567598402738164e-05, "loss": 2.3256, "step": 2735 }, { "epoch": 0.46825260996063667, "grad_norm": 37.76002502441406, "learning_rate": 1.5573302909298344e-05, "loss": 3.8535, "step": 2736 }, { "epoch": 0.4684237549204176, "grad_norm": 31.672475814819336, "learning_rate": 1.5579007415858527e-05, "loss": 2.5348, "step": 2737 }, { "epoch": 0.46859489988019853, "grad_norm": 59.173072814941406, "learning_rate": 1.558471192241871e-05, "loss": 7.7627, "step": 2738 }, { "epoch": 0.46876604483997947, "grad_norm": 23.428421020507812, "learning_rate": 1.5590416428978894e-05, "loss": 2.3317, "step": 2739 }, { "epoch": 0.4689371897997604, "grad_norm": 38.1778564453125, "learning_rate": 1.5596120935539077e-05, "loss": 7.6561, "step": 2740 }, { "epoch": 0.46910833475954133, "grad_norm": 10.163063049316406, "learning_rate": 1.560182544209926e-05, "loss": 0.7524, "step": 2741 }, { "epoch": 0.46927947971932227, "grad_norm": 3.395460367202759, "learning_rate": 1.5607529948659444e-05, "loss": 0.4881, "step": 2742 }, { "epoch": 0.4694506246791032, "grad_norm": 28.233747482299805, "learning_rate": 1.5613234455219624e-05, "loss": 2.606, "step": 2743 }, { "epoch": 0.46962176963888413, "grad_norm": 33.14704513549805, "learning_rate": 1.5618938961779807e-05, "loss": 3.4617, "step": 2744 }, { "epoch": 0.46979291459866507, "grad_norm": 4.885557651519775, "learning_rate": 1.562464346833999e-05, "loss": 0.5159, "step": 2745 }, { "epoch": 0.469964059558446, "grad_norm": 32.37671661376953, "learning_rate": 1.563034797490017e-05, "loss": 3.1744, "step": 2746 }, { "epoch": 0.47013520451822693, "grad_norm": 26.680980682373047, "learning_rate": 1.5636052481460354e-05, "loss": 2.8685, "step": 2747 }, { "epoch": 0.47030634947800787, "grad_norm": 27.004371643066406, "learning_rate": 1.5641756988020537e-05, "loss": 3.0092, "step": 2748 }, { "epoch": 0.4704774944377888, "grad_norm": 21.964834213256836, "learning_rate": 1.564746149458072e-05, "loss": 2.2193, "step": 2749 }, { "epoch": 0.47064863939756973, "grad_norm": 66.10285186767578, "learning_rate": 1.56531660011409e-05, "loss": 7.346, "step": 2750 }, { "epoch": 0.47081978435735067, "grad_norm": 6.018518924713135, "learning_rate": 1.5658870507701084e-05, "loss": 0.5488, "step": 2751 }, { "epoch": 0.4709909293171316, "grad_norm": 30.385318756103516, "learning_rate": 1.5664575014261267e-05, "loss": 3.0093, "step": 2752 }, { "epoch": 0.47116207427691253, "grad_norm": 13.027030944824219, "learning_rate": 1.5670279520821447e-05, "loss": 1.9682, "step": 2753 }, { "epoch": 0.47133321923669347, "grad_norm": 35.71416473388672, "learning_rate": 1.567598402738163e-05, "loss": 6.69, "step": 2754 }, { "epoch": 0.4715043641964744, "grad_norm": 29.253435134887695, "learning_rate": 1.5681688533941814e-05, "loss": 2.932, "step": 2755 }, { "epoch": 0.47167550915625533, "grad_norm": 30.07666778564453, "learning_rate": 1.5687393040501998e-05, "loss": 3.8444, "step": 2756 }, { "epoch": 0.47184665411603627, "grad_norm": 35.976871490478516, "learning_rate": 1.5693097547062178e-05, "loss": 4.9907, "step": 2757 }, { "epoch": 0.4720177990758172, "grad_norm": 87.46236419677734, "learning_rate": 1.569880205362236e-05, "loss": 4.691, "step": 2758 }, { "epoch": 0.47218894403559813, "grad_norm": 22.59965705871582, "learning_rate": 1.5704506560182544e-05, "loss": 2.1086, "step": 2759 }, { "epoch": 0.47236008899537907, "grad_norm": 8.256918907165527, "learning_rate": 1.5710211066742724e-05, "loss": 0.9678, "step": 2760 }, { "epoch": 0.47253123395516, "grad_norm": 38.63548278808594, "learning_rate": 1.5715915573302908e-05, "loss": 3.4149, "step": 2761 }, { "epoch": 0.47270237891494093, "grad_norm": 7.380117893218994, "learning_rate": 1.5721620079863094e-05, "loss": 0.6134, "step": 2762 }, { "epoch": 0.47287352387472187, "grad_norm": 27.26441764831543, "learning_rate": 1.5727324586423278e-05, "loss": 2.8164, "step": 2763 }, { "epoch": 0.4730446688345028, "grad_norm": 18.643917083740234, "learning_rate": 1.5733029092983458e-05, "loss": 1.9656, "step": 2764 }, { "epoch": 0.47321581379428374, "grad_norm": 27.289445877075195, "learning_rate": 1.573873359954364e-05, "loss": 2.8402, "step": 2765 }, { "epoch": 0.47338695875406467, "grad_norm": 6.67548942565918, "learning_rate": 1.5744438106103825e-05, "loss": 0.7842, "step": 2766 }, { "epoch": 0.4735581037138456, "grad_norm": 33.30831527709961, "learning_rate": 1.5750142612664005e-05, "loss": 3.6011, "step": 2767 }, { "epoch": 0.47372924867362654, "grad_norm": 104.88871765136719, "learning_rate": 1.5755847119224188e-05, "loss": 5.0141, "step": 2768 }, { "epoch": 0.47390039363340747, "grad_norm": 5.102696418762207, "learning_rate": 1.576155162578437e-05, "loss": 0.5258, "step": 2769 }, { "epoch": 0.47407153859318846, "grad_norm": 3.6947317123413086, "learning_rate": 1.576725613234455e-05, "loss": 0.4833, "step": 2770 }, { "epoch": 0.4742426835529694, "grad_norm": 32.43489074707031, "learning_rate": 1.5772960638904735e-05, "loss": 3.7272, "step": 2771 }, { "epoch": 0.4744138285127503, "grad_norm": 32.32176971435547, "learning_rate": 1.5778665145464918e-05, "loss": 3.4356, "step": 2772 }, { "epoch": 0.47458497347253126, "grad_norm": 2.583381175994873, "learning_rate": 1.57843696520251e-05, "loss": 0.4212, "step": 2773 }, { "epoch": 0.4747561184323122, "grad_norm": 38.029869079589844, "learning_rate": 1.579007415858528e-05, "loss": 7.2033, "step": 2774 }, { "epoch": 0.4749272633920931, "grad_norm": 10.577736854553223, "learning_rate": 1.5795778665145465e-05, "loss": 1.2395, "step": 2775 }, { "epoch": 0.47509840835187406, "grad_norm": 9.981147766113281, "learning_rate": 1.5801483171705648e-05, "loss": 1.4924, "step": 2776 }, { "epoch": 0.475269553311655, "grad_norm": 28.36383819580078, "learning_rate": 1.5807187678265828e-05, "loss": 3.8155, "step": 2777 }, { "epoch": 0.4754406982714359, "grad_norm": 6.329680442810059, "learning_rate": 1.581289218482601e-05, "loss": 0.4932, "step": 2778 }, { "epoch": 0.47561184323121686, "grad_norm": 17.587629318237305, "learning_rate": 1.5818596691386195e-05, "loss": 1.8358, "step": 2779 }, { "epoch": 0.4757829881909978, "grad_norm": 32.48772048950195, "learning_rate": 1.582430119794638e-05, "loss": 4.1859, "step": 2780 }, { "epoch": 0.4759541331507787, "grad_norm": 41.349056243896484, "learning_rate": 1.583000570450656e-05, "loss": 7.0338, "step": 2781 }, { "epoch": 0.47612527811055966, "grad_norm": 32.28718185424805, "learning_rate": 1.583571021106674e-05, "loss": 3.439, "step": 2782 }, { "epoch": 0.4762964230703406, "grad_norm": 52.53911209106445, "learning_rate": 1.5841414717626925e-05, "loss": 6.9516, "step": 2783 }, { "epoch": 0.4764675680301215, "grad_norm": 6.450766086578369, "learning_rate": 1.5847119224187105e-05, "loss": 0.4587, "step": 2784 }, { "epoch": 0.47663871298990246, "grad_norm": 31.295753479003906, "learning_rate": 1.5852823730747292e-05, "loss": 3.6037, "step": 2785 }, { "epoch": 0.4768098579496834, "grad_norm": 10.392585754394531, "learning_rate": 1.5858528237307475e-05, "loss": 0.7695, "step": 2786 }, { "epoch": 0.4769810029094643, "grad_norm": 31.578166961669922, "learning_rate": 1.586423274386766e-05, "loss": 3.4914, "step": 2787 }, { "epoch": 0.47715214786924526, "grad_norm": 35.540199279785156, "learning_rate": 1.586993725042784e-05, "loss": 4.0507, "step": 2788 }, { "epoch": 0.4773232928290262, "grad_norm": 30.065216064453125, "learning_rate": 1.5875641756988022e-05, "loss": 3.4183, "step": 2789 }, { "epoch": 0.4774944377888071, "grad_norm": 3.6649258136749268, "learning_rate": 1.5881346263548205e-05, "loss": 0.4127, "step": 2790 }, { "epoch": 0.47766558274858806, "grad_norm": 88.72532653808594, "learning_rate": 1.5887050770108385e-05, "loss": 4.5608, "step": 2791 }, { "epoch": 0.477836727708369, "grad_norm": 23.770221710205078, "learning_rate": 1.589275527666857e-05, "loss": 2.2223, "step": 2792 }, { "epoch": 0.4780078726681499, "grad_norm": 5.284163951873779, "learning_rate": 1.5898459783228752e-05, "loss": 0.5186, "step": 2793 }, { "epoch": 0.47817901762793086, "grad_norm": 29.41139793395996, "learning_rate": 1.5904164289788935e-05, "loss": 3.0647, "step": 2794 }, { "epoch": 0.4783501625877118, "grad_norm": 26.757612228393555, "learning_rate": 1.5909868796349115e-05, "loss": 2.3827, "step": 2795 }, { "epoch": 0.4785213075474927, "grad_norm": 12.758798599243164, "learning_rate": 1.59155733029093e-05, "loss": 0.818, "step": 2796 }, { "epoch": 0.47869245250727366, "grad_norm": 29.093143463134766, "learning_rate": 1.5921277809469482e-05, "loss": 2.9151, "step": 2797 }, { "epoch": 0.4788635974670546, "grad_norm": 126.96649932861328, "learning_rate": 1.5926982316029662e-05, "loss": 8.4343, "step": 2798 }, { "epoch": 0.4790347424268355, "grad_norm": 31.195518493652344, "learning_rate": 1.5932686822589846e-05, "loss": 3.7256, "step": 2799 }, { "epoch": 0.47920588738661646, "grad_norm": 28.148395538330078, "learning_rate": 1.593839132915003e-05, "loss": 2.8813, "step": 2800 }, { "epoch": 0.4793770323463974, "grad_norm": 36.015403747558594, "learning_rate": 1.5944095835710212e-05, "loss": 4.6005, "step": 2801 }, { "epoch": 0.4795481773061783, "grad_norm": 31.1592960357666, "learning_rate": 1.5949800342270392e-05, "loss": 3.1305, "step": 2802 }, { "epoch": 0.47971932226595926, "grad_norm": 128.36007690429688, "learning_rate": 1.5955504848830576e-05, "loss": 4.3169, "step": 2803 }, { "epoch": 0.4798904672257402, "grad_norm": 13.735505104064941, "learning_rate": 1.596120935539076e-05, "loss": 0.9245, "step": 2804 }, { "epoch": 0.4800616121855211, "grad_norm": 42.414024353027344, "learning_rate": 1.596691386195094e-05, "loss": 6.975, "step": 2805 }, { "epoch": 0.48023275714530206, "grad_norm": 31.763032913208008, "learning_rate": 1.5972618368511122e-05, "loss": 3.1284, "step": 2806 }, { "epoch": 0.480403902105083, "grad_norm": 27.716442108154297, "learning_rate": 1.597832287507131e-05, "loss": 3.0007, "step": 2807 }, { "epoch": 0.4805750470648639, "grad_norm": 32.059425354003906, "learning_rate": 1.598402738163149e-05, "loss": 4.0829, "step": 2808 }, { "epoch": 0.48074619202464486, "grad_norm": 36.31050491333008, "learning_rate": 1.5989731888191673e-05, "loss": 4.8666, "step": 2809 }, { "epoch": 0.4809173369844258, "grad_norm": 23.16267204284668, "learning_rate": 1.5995436394751856e-05, "loss": 2.0993, "step": 2810 }, { "epoch": 0.4810884819442067, "grad_norm": 12.366683006286621, "learning_rate": 1.600114090131204e-05, "loss": 0.913, "step": 2811 }, { "epoch": 0.48125962690398766, "grad_norm": 11.630936622619629, "learning_rate": 1.600684540787222e-05, "loss": 1.233, "step": 2812 }, { "epoch": 0.4814307718637686, "grad_norm": 5.433574676513672, "learning_rate": 1.6012549914432403e-05, "loss": 0.5408, "step": 2813 }, { "epoch": 0.4816019168235495, "grad_norm": 25.152584075927734, "learning_rate": 1.6018254420992586e-05, "loss": 2.7558, "step": 2814 }, { "epoch": 0.48177306178333046, "grad_norm": 32.2104377746582, "learning_rate": 1.6023958927552766e-05, "loss": 3.56, "step": 2815 }, { "epoch": 0.4819442067431114, "grad_norm": 21.62321662902832, "learning_rate": 1.602966343411295e-05, "loss": 2.626, "step": 2816 }, { "epoch": 0.4821153517028923, "grad_norm": 27.26594352722168, "learning_rate": 1.6035367940673133e-05, "loss": 3.057, "step": 2817 }, { "epoch": 0.48228649666267326, "grad_norm": 29.751848220825195, "learning_rate": 1.6041072447233316e-05, "loss": 3.0557, "step": 2818 }, { "epoch": 0.4824576416224542, "grad_norm": 28.00129508972168, "learning_rate": 1.6046776953793496e-05, "loss": 2.9606, "step": 2819 }, { "epoch": 0.4826287865822352, "grad_norm": 10.21130084991455, "learning_rate": 1.605248146035368e-05, "loss": 1.0526, "step": 2820 }, { "epoch": 0.4827999315420161, "grad_norm": 194.53099060058594, "learning_rate": 1.6058185966913863e-05, "loss": 9.7692, "step": 2821 }, { "epoch": 0.48297107650179705, "grad_norm": 20.116971969604492, "learning_rate": 1.6063890473474043e-05, "loss": 1.702, "step": 2822 }, { "epoch": 0.483142221461578, "grad_norm": 25.585695266723633, "learning_rate": 1.6069594980034226e-05, "loss": 3.1031, "step": 2823 }, { "epoch": 0.4833133664213589, "grad_norm": 10.690316200256348, "learning_rate": 1.607529948659441e-05, "loss": 1.112, "step": 2824 }, { "epoch": 0.48348451138113985, "grad_norm": 46.31101989746094, "learning_rate": 1.6081003993154593e-05, "loss": 7.0695, "step": 2825 }, { "epoch": 0.4836556563409208, "grad_norm": 2.7449769973754883, "learning_rate": 1.6086708499714773e-05, "loss": 0.412, "step": 2826 }, { "epoch": 0.4838268013007017, "grad_norm": 31.60761260986328, "learning_rate": 1.6092413006274956e-05, "loss": 3.5248, "step": 2827 }, { "epoch": 0.48399794626048265, "grad_norm": 30.04467010498047, "learning_rate": 1.609811751283514e-05, "loss": 3.5359, "step": 2828 }, { "epoch": 0.4841690912202636, "grad_norm": 10.859264373779297, "learning_rate": 1.610382201939532e-05, "loss": 0.9806, "step": 2829 }, { "epoch": 0.4843402361800445, "grad_norm": 24.42304229736328, "learning_rate": 1.6109526525955507e-05, "loss": 2.5163, "step": 2830 }, { "epoch": 0.48451138113982545, "grad_norm": 32.02371597290039, "learning_rate": 1.611523103251569e-05, "loss": 4.1818, "step": 2831 }, { "epoch": 0.4846825260996064, "grad_norm": 35.690147399902344, "learning_rate": 1.6120935539075873e-05, "loss": 3.3438, "step": 2832 }, { "epoch": 0.4848536710593873, "grad_norm": 25.543243408203125, "learning_rate": 1.6126640045636053e-05, "loss": 2.5981, "step": 2833 }, { "epoch": 0.48502481601916825, "grad_norm": 25.119115829467773, "learning_rate": 1.6132344552196237e-05, "loss": 2.1322, "step": 2834 }, { "epoch": 0.4851959609789492, "grad_norm": 23.112409591674805, "learning_rate": 1.613804905875642e-05, "loss": 2.5395, "step": 2835 }, { "epoch": 0.4853671059387301, "grad_norm": 91.41179656982422, "learning_rate": 1.61437535653166e-05, "loss": 4.2215, "step": 2836 }, { "epoch": 0.48553825089851105, "grad_norm": 34.66135787963867, "learning_rate": 1.6149458071876783e-05, "loss": 3.1988, "step": 2837 }, { "epoch": 0.485709395858292, "grad_norm": 28.888839721679688, "learning_rate": 1.6155162578436967e-05, "loss": 3.1345, "step": 2838 }, { "epoch": 0.4858805408180729, "grad_norm": 63.08065414428711, "learning_rate": 1.6160867084997147e-05, "loss": 8.1288, "step": 2839 }, { "epoch": 0.48605168577785385, "grad_norm": 148.98455810546875, "learning_rate": 1.616657159155733e-05, "loss": 4.9747, "step": 2840 }, { "epoch": 0.4862228307376348, "grad_norm": 29.048202514648438, "learning_rate": 1.6172276098117514e-05, "loss": 2.9531, "step": 2841 }, { "epoch": 0.4863939756974157, "grad_norm": 6.495917320251465, "learning_rate": 1.6177980604677697e-05, "loss": 0.5056, "step": 2842 }, { "epoch": 0.48656512065719665, "grad_norm": 8.356714248657227, "learning_rate": 1.6183685111237877e-05, "loss": 0.9125, "step": 2843 }, { "epoch": 0.4867362656169776, "grad_norm": 26.18461036682129, "learning_rate": 1.618938961779806e-05, "loss": 3.175, "step": 2844 }, { "epoch": 0.4869074105767585, "grad_norm": 9.202829360961914, "learning_rate": 1.6195094124358244e-05, "loss": 1.0864, "step": 2845 }, { "epoch": 0.48707855553653945, "grad_norm": 34.182373046875, "learning_rate": 1.6200798630918424e-05, "loss": 2.7523, "step": 2846 }, { "epoch": 0.4872497004963204, "grad_norm": 68.9462890625, "learning_rate": 1.6206503137478607e-05, "loss": 3.7044, "step": 2847 }, { "epoch": 0.4874208454561013, "grad_norm": 24.633121490478516, "learning_rate": 1.621220764403879e-05, "loss": 2.6342, "step": 2848 }, { "epoch": 0.48759199041588225, "grad_norm": 32.68869400024414, "learning_rate": 1.6217912150598974e-05, "loss": 4.6795, "step": 2849 }, { "epoch": 0.4877631353756632, "grad_norm": 28.001712799072266, "learning_rate": 1.6223616657159154e-05, "loss": 3.3885, "step": 2850 }, { "epoch": 0.4879342803354441, "grad_norm": 4.1197099685668945, "learning_rate": 1.6229321163719337e-05, "loss": 0.4097, "step": 2851 }, { "epoch": 0.48810542529522505, "grad_norm": 29.35110092163086, "learning_rate": 1.623502567027952e-05, "loss": 3.5865, "step": 2852 }, { "epoch": 0.488276570255006, "grad_norm": 26.92041778564453, "learning_rate": 1.6240730176839704e-05, "loss": 2.7247, "step": 2853 }, { "epoch": 0.4884477152147869, "grad_norm": 34.873775482177734, "learning_rate": 1.6246434683399887e-05, "loss": 7.1172, "step": 2854 }, { "epoch": 0.48861886017456785, "grad_norm": 24.180212020874023, "learning_rate": 1.625213918996007e-05, "loss": 2.4944, "step": 2855 }, { "epoch": 0.4887900051343488, "grad_norm": 28.294334411621094, "learning_rate": 1.6257843696520254e-05, "loss": 3.4049, "step": 2856 }, { "epoch": 0.4889611500941297, "grad_norm": 20.231170654296875, "learning_rate": 1.6263548203080434e-05, "loss": 2.2117, "step": 2857 }, { "epoch": 0.48913229505391065, "grad_norm": 21.00507164001465, "learning_rate": 1.6269252709640617e-05, "loss": 1.8153, "step": 2858 }, { "epoch": 0.4893034400136916, "grad_norm": 26.58632469177246, "learning_rate": 1.62749572162008e-05, "loss": 2.7509, "step": 2859 }, { "epoch": 0.4894745849734725, "grad_norm": 25.922264099121094, "learning_rate": 1.628066172276098e-05, "loss": 3.0767, "step": 2860 }, { "epoch": 0.48964572993325345, "grad_norm": 36.93525695800781, "learning_rate": 1.6286366229321164e-05, "loss": 6.587, "step": 2861 }, { "epoch": 0.4898168748930344, "grad_norm": 30.786312103271484, "learning_rate": 1.6292070735881348e-05, "loss": 4.5453, "step": 2862 }, { "epoch": 0.4899880198528153, "grad_norm": 10.850686073303223, "learning_rate": 1.629777524244153e-05, "loss": 0.8675, "step": 2863 }, { "epoch": 0.49015916481259625, "grad_norm": 22.04916763305664, "learning_rate": 1.630347974900171e-05, "loss": 2.1868, "step": 2864 }, { "epoch": 0.4903303097723772, "grad_norm": 27.125104904174805, "learning_rate": 1.6309184255561894e-05, "loss": 2.7107, "step": 2865 }, { "epoch": 0.4905014547321581, "grad_norm": 26.232017517089844, "learning_rate": 1.6314888762122078e-05, "loss": 3.0023, "step": 2866 }, { "epoch": 0.49067259969193905, "grad_norm": 2.6513617038726807, "learning_rate": 1.6320593268682258e-05, "loss": 0.4064, "step": 2867 }, { "epoch": 0.49084374465172, "grad_norm": 29.269208908081055, "learning_rate": 1.632629777524244e-05, "loss": 2.9074, "step": 2868 }, { "epoch": 0.4910148896115009, "grad_norm": 28.653419494628906, "learning_rate": 1.6332002281802624e-05, "loss": 2.749, "step": 2869 }, { "epoch": 0.49118603457128185, "grad_norm": 24.419513702392578, "learning_rate": 1.6337706788362808e-05, "loss": 2.4733, "step": 2870 }, { "epoch": 0.49135717953106284, "grad_norm": 31.81149673461914, "learning_rate": 1.6343411294922988e-05, "loss": 3.0798, "step": 2871 }, { "epoch": 0.49152832449084377, "grad_norm": 7.05307149887085, "learning_rate": 1.634911580148317e-05, "loss": 0.6362, "step": 2872 }, { "epoch": 0.4916994694506247, "grad_norm": 22.482975006103516, "learning_rate": 1.6354820308043355e-05, "loss": 2.0351, "step": 2873 }, { "epoch": 0.49187061441040564, "grad_norm": 9.290128707885742, "learning_rate": 1.6360524814603535e-05, "loss": 0.6272, "step": 2874 }, { "epoch": 0.49204175937018657, "grad_norm": 27.201467514038086, "learning_rate": 1.6366229321163718e-05, "loss": 2.6431, "step": 2875 }, { "epoch": 0.4922129043299675, "grad_norm": 44.08928298950195, "learning_rate": 1.6371933827723905e-05, "loss": 6.6881, "step": 2876 }, { "epoch": 0.49238404928974844, "grad_norm": 14.08613109588623, "learning_rate": 1.6377638334284085e-05, "loss": 1.0184, "step": 2877 }, { "epoch": 0.49255519424952937, "grad_norm": 19.89874839782715, "learning_rate": 1.6383342840844268e-05, "loss": 2.0983, "step": 2878 }, { "epoch": 0.4927263392093103, "grad_norm": 31.281314849853516, "learning_rate": 1.638904734740445e-05, "loss": 4.3604, "step": 2879 }, { "epoch": 0.49289748416909124, "grad_norm": 4.3934245109558105, "learning_rate": 1.6394751853964635e-05, "loss": 0.4535, "step": 2880 }, { "epoch": 0.49306862912887217, "grad_norm": 16.13640785217285, "learning_rate": 1.6400456360524815e-05, "loss": 1.4628, "step": 2881 }, { "epoch": 0.4932397740886531, "grad_norm": 2.4228832721710205, "learning_rate": 1.6406160867084998e-05, "loss": 0.3669, "step": 2882 }, { "epoch": 0.49341091904843404, "grad_norm": 39.298160552978516, "learning_rate": 1.641186537364518e-05, "loss": 5.1978, "step": 2883 }, { "epoch": 0.49358206400821497, "grad_norm": 7.103499889373779, "learning_rate": 1.641756988020536e-05, "loss": 0.7534, "step": 2884 }, { "epoch": 0.4937532089679959, "grad_norm": 36.24224090576172, "learning_rate": 1.6423274386765545e-05, "loss": 5.1747, "step": 2885 }, { "epoch": 0.49392435392777684, "grad_norm": 88.6714859008789, "learning_rate": 1.642897889332573e-05, "loss": 4.1515, "step": 2886 }, { "epoch": 0.49409549888755777, "grad_norm": 102.38868713378906, "learning_rate": 1.6434683399885912e-05, "loss": 4.1397, "step": 2887 }, { "epoch": 0.4942666438473387, "grad_norm": 32.09382247924805, "learning_rate": 1.6440387906446092e-05, "loss": 3.822, "step": 2888 }, { "epoch": 0.49443778880711964, "grad_norm": 27.632850646972656, "learning_rate": 1.6446092413006275e-05, "loss": 3.0071, "step": 2889 }, { "epoch": 0.49460893376690057, "grad_norm": 29.850147247314453, "learning_rate": 1.645179691956646e-05, "loss": 4.5876, "step": 2890 }, { "epoch": 0.4947800787266815, "grad_norm": 20.323644638061523, "learning_rate": 1.645750142612664e-05, "loss": 2.0093, "step": 2891 }, { "epoch": 0.49495122368646244, "grad_norm": 28.592273712158203, "learning_rate": 1.6463205932686822e-05, "loss": 2.6316, "step": 2892 }, { "epoch": 0.49512236864624337, "grad_norm": 29.890256881713867, "learning_rate": 1.6468910439247005e-05, "loss": 2.7351, "step": 2893 }, { "epoch": 0.4952935136060243, "grad_norm": 25.856136322021484, "learning_rate": 1.647461494580719e-05, "loss": 2.7318, "step": 2894 }, { "epoch": 0.49546465856580524, "grad_norm": 28.1647891998291, "learning_rate": 1.648031945236737e-05, "loss": 2.7787, "step": 2895 }, { "epoch": 0.49563580352558617, "grad_norm": 24.757694244384766, "learning_rate": 1.6486023958927552e-05, "loss": 2.7135, "step": 2896 }, { "epoch": 0.4958069484853671, "grad_norm": 42.44664764404297, "learning_rate": 1.6491728465487735e-05, "loss": 3.6649, "step": 2897 }, { "epoch": 0.49597809344514804, "grad_norm": 30.2053279876709, "learning_rate": 1.6497432972047915e-05, "loss": 4.0259, "step": 2898 }, { "epoch": 0.49614923840492897, "grad_norm": 12.054943084716797, "learning_rate": 1.6503137478608102e-05, "loss": 1.0105, "step": 2899 }, { "epoch": 0.4963203833647099, "grad_norm": 17.974079132080078, "learning_rate": 1.6508841985168286e-05, "loss": 2.0786, "step": 2900 }, { "epoch": 0.49649152832449084, "grad_norm": 12.725552558898926, "learning_rate": 1.651454649172847e-05, "loss": 1.0647, "step": 2901 }, { "epoch": 0.49666267328427177, "grad_norm": 22.831754684448242, "learning_rate": 1.652025099828865e-05, "loss": 2.2329, "step": 2902 }, { "epoch": 0.4968338182440527, "grad_norm": 21.267478942871094, "learning_rate": 1.6525955504848832e-05, "loss": 2.5314, "step": 2903 }, { "epoch": 0.49700496320383364, "grad_norm": 27.087793350219727, "learning_rate": 1.6531660011409016e-05, "loss": 2.8437, "step": 2904 }, { "epoch": 0.49717610816361457, "grad_norm": 19.73915672302246, "learning_rate": 1.6537364517969196e-05, "loss": 1.8543, "step": 2905 }, { "epoch": 0.4973472531233955, "grad_norm": 2.955650806427002, "learning_rate": 1.654306902452938e-05, "loss": 0.4054, "step": 2906 }, { "epoch": 0.49751839808317644, "grad_norm": 25.305593490600586, "learning_rate": 1.6548773531089562e-05, "loss": 2.5412, "step": 2907 }, { "epoch": 0.49768954304295737, "grad_norm": 29.378746032714844, "learning_rate": 1.6554478037649742e-05, "loss": 2.7018, "step": 2908 }, { "epoch": 0.4978606880027383, "grad_norm": 14.516071319580078, "learning_rate": 1.6560182544209926e-05, "loss": 1.9194, "step": 2909 }, { "epoch": 0.49803183296251924, "grad_norm": 25.602577209472656, "learning_rate": 1.656588705077011e-05, "loss": 2.1128, "step": 2910 }, { "epoch": 0.49820297792230017, "grad_norm": 109.72111511230469, "learning_rate": 1.6571591557330293e-05, "loss": 4.1774, "step": 2911 }, { "epoch": 0.4983741228820811, "grad_norm": 19.274553298950195, "learning_rate": 1.6577296063890472e-05, "loss": 1.5632, "step": 2912 }, { "epoch": 0.49854526784186204, "grad_norm": 29.17140007019043, "learning_rate": 1.6583000570450656e-05, "loss": 3.7158, "step": 2913 }, { "epoch": 0.49871641280164297, "grad_norm": 31.559934616088867, "learning_rate": 1.658870507701084e-05, "loss": 4.5437, "step": 2914 }, { "epoch": 0.4988875577614239, "grad_norm": 18.08380699157715, "learning_rate": 1.659440958357102e-05, "loss": 1.1722, "step": 2915 }, { "epoch": 0.49905870272120484, "grad_norm": 29.155492782592773, "learning_rate": 1.6600114090131203e-05, "loss": 3.2768, "step": 2916 }, { "epoch": 0.49922984768098577, "grad_norm": 36.51355743408203, "learning_rate": 1.6605818596691386e-05, "loss": 4.8346, "step": 2917 }, { "epoch": 0.4994009926407667, "grad_norm": 18.29048728942871, "learning_rate": 1.661152310325157e-05, "loss": 1.1614, "step": 2918 }, { "epoch": 0.49957213760054764, "grad_norm": 29.851797103881836, "learning_rate": 1.661722760981175e-05, "loss": 2.9554, "step": 2919 }, { "epoch": 0.49974328256032857, "grad_norm": 27.82573699951172, "learning_rate": 1.6622932116371933e-05, "loss": 3.4135, "step": 2920 }, { "epoch": 0.4999144275201095, "grad_norm": 26.42146110534668, "learning_rate": 1.6628636622932116e-05, "loss": 2.5056, "step": 2921 }, { "epoch": 0.5000855724798905, "grad_norm": 11.394399642944336, "learning_rate": 1.66343411294923e-05, "loss": 1.5378, "step": 2922 }, { "epoch": 0.5002567174396714, "grad_norm": 76.39617156982422, "learning_rate": 1.6640045636052483e-05, "loss": 7.2706, "step": 2923 }, { "epoch": 0.5004278623994524, "grad_norm": 30.514179229736328, "learning_rate": 1.6645750142612666e-05, "loss": 3.1234, "step": 2924 }, { "epoch": 0.5005990073592332, "grad_norm": 25.776514053344727, "learning_rate": 1.665145464917285e-05, "loss": 2.852, "step": 2925 }, { "epoch": 0.5007701523190142, "grad_norm": 33.94929122924805, "learning_rate": 1.665715915573303e-05, "loss": 4.5202, "step": 2926 }, { "epoch": 0.5009412972787951, "grad_norm": 42.92927551269531, "learning_rate": 1.6662863662293213e-05, "loss": 7.0151, "step": 2927 }, { "epoch": 0.5011124422385761, "grad_norm": 8.699772834777832, "learning_rate": 1.6668568168853396e-05, "loss": 0.8725, "step": 2928 }, { "epoch": 0.501283587198357, "grad_norm": 27.853302001953125, "learning_rate": 1.6674272675413576e-05, "loss": 2.2825, "step": 2929 }, { "epoch": 0.501454732158138, "grad_norm": 26.110185623168945, "learning_rate": 1.667997718197376e-05, "loss": 2.5107, "step": 2930 }, { "epoch": 0.5016258771179188, "grad_norm": 4.521554946899414, "learning_rate": 1.6685681688533943e-05, "loss": 0.4957, "step": 2931 }, { "epoch": 0.5017970220776998, "grad_norm": 42.245086669921875, "learning_rate": 1.6691386195094127e-05, "loss": 6.5318, "step": 2932 }, { "epoch": 0.5019681670374807, "grad_norm": 25.86848258972168, "learning_rate": 1.6697090701654307e-05, "loss": 2.2382, "step": 2933 }, { "epoch": 0.5021393119972617, "grad_norm": 48.50715637207031, "learning_rate": 1.670279520821449e-05, "loss": 7.148, "step": 2934 }, { "epoch": 0.5023104569570426, "grad_norm": 32.559574127197266, "learning_rate": 1.6708499714774673e-05, "loss": 3.6438, "step": 2935 }, { "epoch": 0.5024816019168236, "grad_norm": 24.84282112121582, "learning_rate": 1.6714204221334853e-05, "loss": 2.7729, "step": 2936 }, { "epoch": 0.5026527468766044, "grad_norm": 14.403919219970703, "learning_rate": 1.6719908727895037e-05, "loss": 1.217, "step": 2937 }, { "epoch": 0.5028238918363854, "grad_norm": 27.424219131469727, "learning_rate": 1.672561323445522e-05, "loss": 2.5197, "step": 2938 }, { "epoch": 0.5029950367961663, "grad_norm": 9.789163589477539, "learning_rate": 1.67313177410154e-05, "loss": 1.7931, "step": 2939 }, { "epoch": 0.5031661817559473, "grad_norm": 27.327239990234375, "learning_rate": 1.6737022247575583e-05, "loss": 3.3266, "step": 2940 }, { "epoch": 0.5033373267157282, "grad_norm": 19.182161331176758, "learning_rate": 1.6742726754135767e-05, "loss": 1.9967, "step": 2941 }, { "epoch": 0.5035084716755092, "grad_norm": 56.43001174926758, "learning_rate": 1.674843126069595e-05, "loss": 3.7189, "step": 2942 }, { "epoch": 0.50367961663529, "grad_norm": 19.654386520385742, "learning_rate": 1.675413576725613e-05, "loss": 1.9999, "step": 2943 }, { "epoch": 0.503850761595071, "grad_norm": 22.203187942504883, "learning_rate": 1.6759840273816314e-05, "loss": 2.1959, "step": 2944 }, { "epoch": 0.5040219065548519, "grad_norm": 6.563319683074951, "learning_rate": 1.67655447803765e-05, "loss": 0.6367, "step": 2945 }, { "epoch": 0.5041930515146329, "grad_norm": 10.192085266113281, "learning_rate": 1.677124928693668e-05, "loss": 0.7288, "step": 2946 }, { "epoch": 0.5043641964744139, "grad_norm": 32.45716094970703, "learning_rate": 1.6776953793496864e-05, "loss": 3.9021, "step": 2947 }, { "epoch": 0.5045353414341948, "grad_norm": 4.9417595863342285, "learning_rate": 1.6782658300057047e-05, "loss": 0.4681, "step": 2948 }, { "epoch": 0.5047064863939758, "grad_norm": 27.206302642822266, "learning_rate": 1.678836280661723e-05, "loss": 3.3352, "step": 2949 }, { "epoch": 0.5048776313537566, "grad_norm": 28.154144287109375, "learning_rate": 1.679406731317741e-05, "loss": 2.9377, "step": 2950 }, { "epoch": 0.5050487763135376, "grad_norm": 21.303789138793945, "learning_rate": 1.6799771819737594e-05, "loss": 2.785, "step": 2951 }, { "epoch": 0.5052199212733185, "grad_norm": 31.954051971435547, "learning_rate": 1.6805476326297777e-05, "loss": 3.1305, "step": 2952 }, { "epoch": 0.5053910662330995, "grad_norm": 10.69640827178955, "learning_rate": 1.6811180832857957e-05, "loss": 1.6799, "step": 2953 }, { "epoch": 0.5055622111928804, "grad_norm": 30.222347259521484, "learning_rate": 1.681688533941814e-05, "loss": 2.8247, "step": 2954 }, { "epoch": 0.5057333561526614, "grad_norm": 96.27491760253906, "learning_rate": 1.6822589845978324e-05, "loss": 4.6357, "step": 2955 }, { "epoch": 0.5059045011124422, "grad_norm": 28.582870483398438, "learning_rate": 1.6828294352538507e-05, "loss": 3.2733, "step": 2956 }, { "epoch": 0.5060756460722232, "grad_norm": 41.087825775146484, "learning_rate": 1.6833998859098687e-05, "loss": 7.1278, "step": 2957 }, { "epoch": 0.5062467910320041, "grad_norm": 7.500061511993408, "learning_rate": 1.683970336565887e-05, "loss": 0.8286, "step": 2958 }, { "epoch": 0.5064179359917851, "grad_norm": 26.969345092773438, "learning_rate": 1.6845407872219054e-05, "loss": 2.235, "step": 2959 }, { "epoch": 0.506589080951566, "grad_norm": 26.311525344848633, "learning_rate": 1.6851112378779234e-05, "loss": 3.0085, "step": 2960 }, { "epoch": 0.506760225911347, "grad_norm": 31.306970596313477, "learning_rate": 1.6856816885339417e-05, "loss": 2.5939, "step": 2961 }, { "epoch": 0.5069313708711278, "grad_norm": 24.608043670654297, "learning_rate": 1.68625213918996e-05, "loss": 2.3096, "step": 2962 }, { "epoch": 0.5071025158309088, "grad_norm": 27.197254180908203, "learning_rate": 1.6868225898459784e-05, "loss": 2.9187, "step": 2963 }, { "epoch": 0.5072736607906897, "grad_norm": 28.446548461914062, "learning_rate": 1.6873930405019964e-05, "loss": 3.2735, "step": 2964 }, { "epoch": 0.5074448057504707, "grad_norm": 32.15707778930664, "learning_rate": 1.6879634911580148e-05, "loss": 6.7019, "step": 2965 }, { "epoch": 0.5076159507102516, "grad_norm": 23.724163055419922, "learning_rate": 1.688533941814033e-05, "loss": 2.1627, "step": 2966 }, { "epoch": 0.5077870956700326, "grad_norm": 28.04530143737793, "learning_rate": 1.689104392470051e-05, "loss": 2.6273, "step": 2967 }, { "epoch": 0.5079582406298134, "grad_norm": 30.895709991455078, "learning_rate": 1.6896748431260698e-05, "loss": 3.8368, "step": 2968 }, { "epoch": 0.5081293855895944, "grad_norm": 14.024374961853027, "learning_rate": 1.690245293782088e-05, "loss": 1.0194, "step": 2969 }, { "epoch": 0.5083005305493753, "grad_norm": 29.09341049194336, "learning_rate": 1.690815744438106e-05, "loss": 3.5337, "step": 2970 }, { "epoch": 0.5084716755091563, "grad_norm": 28.34062385559082, "learning_rate": 1.6913861950941244e-05, "loss": 3.1743, "step": 2971 }, { "epoch": 0.5086428204689372, "grad_norm": 25.496129989624023, "learning_rate": 1.6919566457501428e-05, "loss": 2.898, "step": 2972 }, { "epoch": 0.5088139654287182, "grad_norm": 38.798343658447266, "learning_rate": 1.692527096406161e-05, "loss": 3.5201, "step": 2973 }, { "epoch": 0.508985110388499, "grad_norm": 10.149602890014648, "learning_rate": 1.693097547062179e-05, "loss": 0.7939, "step": 2974 }, { "epoch": 0.50915625534828, "grad_norm": 6.670815944671631, "learning_rate": 1.6936679977181975e-05, "loss": 0.854, "step": 2975 }, { "epoch": 0.5093274003080609, "grad_norm": 75.72901153564453, "learning_rate": 1.6942384483742158e-05, "loss": 2.9379, "step": 2976 }, { "epoch": 0.5094985452678419, "grad_norm": 26.788955688476562, "learning_rate": 1.6948088990302338e-05, "loss": 2.4457, "step": 2977 }, { "epoch": 0.5096696902276228, "grad_norm": 14.796418190002441, "learning_rate": 1.695379349686252e-05, "loss": 1.0122, "step": 2978 }, { "epoch": 0.5098408351874038, "grad_norm": 4.948236465454102, "learning_rate": 1.6959498003422705e-05, "loss": 0.5853, "step": 2979 }, { "epoch": 0.5100119801471846, "grad_norm": 182.9610595703125, "learning_rate": 1.6965202509982888e-05, "loss": 8.9776, "step": 2980 }, { "epoch": 0.5101831251069656, "grad_norm": 29.51963996887207, "learning_rate": 1.6970907016543068e-05, "loss": 2.9543, "step": 2981 }, { "epoch": 0.5103542700667465, "grad_norm": 28.639034271240234, "learning_rate": 1.697661152310325e-05, "loss": 3.2262, "step": 2982 }, { "epoch": 0.5105254150265275, "grad_norm": 29.50834846496582, "learning_rate": 1.6982316029663435e-05, "loss": 3.0001, "step": 2983 }, { "epoch": 0.5106965599863084, "grad_norm": 15.582537651062012, "learning_rate": 1.6988020536223615e-05, "loss": 1.1638, "step": 2984 }, { "epoch": 0.5108677049460894, "grad_norm": 27.667177200317383, "learning_rate": 1.6993725042783798e-05, "loss": 2.9351, "step": 2985 }, { "epoch": 0.5110388499058702, "grad_norm": 28.853923797607422, "learning_rate": 1.699942954934398e-05, "loss": 3.6286, "step": 2986 }, { "epoch": 0.5112099948656512, "grad_norm": 26.117013931274414, "learning_rate": 1.7005134055904165e-05, "loss": 2.8584, "step": 2987 }, { "epoch": 0.5113811398254321, "grad_norm": 34.81660842895508, "learning_rate": 1.7010838562464345e-05, "loss": 4.3968, "step": 2988 }, { "epoch": 0.5115522847852131, "grad_norm": 35.10283279418945, "learning_rate": 1.7016543069024528e-05, "loss": 6.599, "step": 2989 }, { "epoch": 0.511723429744994, "grad_norm": 19.16140365600586, "learning_rate": 1.7022247575584715e-05, "loss": 2.1204, "step": 2990 }, { "epoch": 0.511894574704775, "grad_norm": 22.029394149780273, "learning_rate": 1.7027952082144895e-05, "loss": 1.8696, "step": 2991 }, { "epoch": 0.5120657196645558, "grad_norm": 3.448702335357666, "learning_rate": 1.703365658870508e-05, "loss": 0.4607, "step": 2992 }, { "epoch": 0.5122368646243368, "grad_norm": 22.506763458251953, "learning_rate": 1.7039361095265262e-05, "loss": 2.1106, "step": 2993 }, { "epoch": 0.5124080095841177, "grad_norm": 31.842361450195312, "learning_rate": 1.7045065601825445e-05, "loss": 3.8676, "step": 2994 }, { "epoch": 0.5125791545438987, "grad_norm": 141.6663818359375, "learning_rate": 1.7050770108385625e-05, "loss": 8.5208, "step": 2995 }, { "epoch": 0.5127502995036796, "grad_norm": 62.276729583740234, "learning_rate": 1.705647461494581e-05, "loss": 3.2482, "step": 2996 }, { "epoch": 0.5129214444634606, "grad_norm": 22.119609832763672, "learning_rate": 1.7062179121505992e-05, "loss": 1.9903, "step": 2997 }, { "epoch": 0.5130925894232415, "grad_norm": 52.37403106689453, "learning_rate": 1.7067883628066172e-05, "loss": 6.8319, "step": 2998 }, { "epoch": 0.5132637343830224, "grad_norm": 12.259587287902832, "learning_rate": 1.7073588134626355e-05, "loss": 1.162, "step": 2999 }, { "epoch": 0.5134348793428034, "grad_norm": 8.290674209594727, "learning_rate": 1.707929264118654e-05, "loss": 0.9012, "step": 3000 }, { "epoch": 0.5136060243025843, "grad_norm": 32.74642562866211, "learning_rate": 1.7084997147746722e-05, "loss": 3.4785, "step": 3001 }, { "epoch": 0.5137771692623653, "grad_norm": 31.82801055908203, "learning_rate": 1.7090701654306902e-05, "loss": 4.2721, "step": 3002 }, { "epoch": 0.5139483142221462, "grad_norm": 32.273136138916016, "learning_rate": 1.7096406160867085e-05, "loss": 3.2625, "step": 3003 }, { "epoch": 0.5141194591819271, "grad_norm": 78.98668670654297, "learning_rate": 1.710211066742727e-05, "loss": 3.2698, "step": 3004 }, { "epoch": 0.514290604141708, "grad_norm": 30.16362762451172, "learning_rate": 1.710781517398745e-05, "loss": 3.9137, "step": 3005 }, { "epoch": 0.514461749101489, "grad_norm": 18.465227127075195, "learning_rate": 1.7113519680547632e-05, "loss": 1.8387, "step": 3006 }, { "epoch": 0.5146328940612699, "grad_norm": 3.536219358444214, "learning_rate": 1.7119224187107816e-05, "loss": 0.446, "step": 3007 }, { "epoch": 0.5148040390210509, "grad_norm": 17.390464782714844, "learning_rate": 1.7124928693667996e-05, "loss": 1.7668, "step": 3008 }, { "epoch": 0.5149751839808318, "grad_norm": 18.47218894958496, "learning_rate": 1.713063320022818e-05, "loss": 2.1817, "step": 3009 }, { "epoch": 0.5151463289406127, "grad_norm": 28.22992515563965, "learning_rate": 1.7136337706788362e-05, "loss": 2.9769, "step": 3010 }, { "epoch": 0.5153174739003936, "grad_norm": 62.36894989013672, "learning_rate": 1.7142042213348546e-05, "loss": 7.6922, "step": 3011 }, { "epoch": 0.5154886188601746, "grad_norm": 33.23900604248047, "learning_rate": 1.7147746719908726e-05, "loss": 3.3971, "step": 3012 }, { "epoch": 0.5156597638199555, "grad_norm": 2.5457472801208496, "learning_rate": 1.7153451226468912e-05, "loss": 0.4122, "step": 3013 }, { "epoch": 0.5158309087797365, "grad_norm": 26.533376693725586, "learning_rate": 1.7159155733029096e-05, "loss": 2.9528, "step": 3014 }, { "epoch": 0.5160020537395174, "grad_norm": 33.18933868408203, "learning_rate": 1.7164860239589276e-05, "loss": 3.7197, "step": 3015 }, { "epoch": 0.5161731986992983, "grad_norm": 25.48127555847168, "learning_rate": 1.717056474614946e-05, "loss": 2.8834, "step": 3016 }, { "epoch": 0.5163443436590792, "grad_norm": 32.51988983154297, "learning_rate": 1.7176269252709643e-05, "loss": 3.6681, "step": 3017 }, { "epoch": 0.5165154886188602, "grad_norm": 21.83390998840332, "learning_rate": 1.7181973759269826e-05, "loss": 2.3579, "step": 3018 }, { "epoch": 0.5166866335786411, "grad_norm": 21.106168746948242, "learning_rate": 1.7187678265830006e-05, "loss": 2.1503, "step": 3019 }, { "epoch": 0.5168577785384221, "grad_norm": 23.668697357177734, "learning_rate": 1.719338277239019e-05, "loss": 2.65, "step": 3020 }, { "epoch": 0.517028923498203, "grad_norm": 56.29466247558594, "learning_rate": 1.7199087278950373e-05, "loss": 7.269, "step": 3021 }, { "epoch": 0.5172000684579839, "grad_norm": 14.612650871276855, "learning_rate": 1.7204791785510553e-05, "loss": 1.5426, "step": 3022 }, { "epoch": 0.5173712134177648, "grad_norm": 28.365121841430664, "learning_rate": 1.7210496292070736e-05, "loss": 3.636, "step": 3023 }, { "epoch": 0.5175423583775458, "grad_norm": 25.329317092895508, "learning_rate": 1.721620079863092e-05, "loss": 2.3847, "step": 3024 }, { "epoch": 0.5177135033373267, "grad_norm": 32.05517578125, "learning_rate": 1.7221905305191103e-05, "loss": 3.7742, "step": 3025 }, { "epoch": 0.5178846482971077, "grad_norm": 11.009437561035156, "learning_rate": 1.7227609811751283e-05, "loss": 1.5541, "step": 3026 }, { "epoch": 0.5180557932568886, "grad_norm": 4.6759490966796875, "learning_rate": 1.7233314318311466e-05, "loss": 0.448, "step": 3027 }, { "epoch": 0.5182269382166695, "grad_norm": 23.18576431274414, "learning_rate": 1.723901882487165e-05, "loss": 2.3099, "step": 3028 }, { "epoch": 0.5183980831764504, "grad_norm": 21.823318481445312, "learning_rate": 1.724472333143183e-05, "loss": 2.0502, "step": 3029 }, { "epoch": 0.5185692281362314, "grad_norm": 33.11149215698242, "learning_rate": 1.7250427837992013e-05, "loss": 3.7448, "step": 3030 }, { "epoch": 0.5187403730960123, "grad_norm": 32.03651809692383, "learning_rate": 1.7256132344552196e-05, "loss": 3.5141, "step": 3031 }, { "epoch": 0.5189115180557933, "grad_norm": 29.257003784179688, "learning_rate": 1.726183685111238e-05, "loss": 3.5149, "step": 3032 }, { "epoch": 0.5190826630155742, "grad_norm": 6.367782115936279, "learning_rate": 1.726754135767256e-05, "loss": 1.1882, "step": 3033 }, { "epoch": 0.5192538079753551, "grad_norm": 21.6986083984375, "learning_rate": 1.7273245864232743e-05, "loss": 1.7948, "step": 3034 }, { "epoch": 0.519424952935136, "grad_norm": 14.612825393676758, "learning_rate": 1.7278950370792926e-05, "loss": 1.1659, "step": 3035 }, { "epoch": 0.519596097894917, "grad_norm": 28.725549697875977, "learning_rate": 1.728465487735311e-05, "loss": 2.7611, "step": 3036 }, { "epoch": 0.5197672428546979, "grad_norm": 30.985149383544922, "learning_rate": 1.7290359383913293e-05, "loss": 2.9706, "step": 3037 }, { "epoch": 0.5199383878144789, "grad_norm": 17.664464950561523, "learning_rate": 1.7296063890473477e-05, "loss": 1.6902, "step": 3038 }, { "epoch": 0.5201095327742598, "grad_norm": 32.17440414428711, "learning_rate": 1.7301768397033657e-05, "loss": 3.8811, "step": 3039 }, { "epoch": 0.5202806777340407, "grad_norm": 5.3300580978393555, "learning_rate": 1.730747290359384e-05, "loss": 0.4921, "step": 3040 }, { "epoch": 0.5204518226938216, "grad_norm": 38.537044525146484, "learning_rate": 1.7313177410154023e-05, "loss": 3.2832, "step": 3041 }, { "epoch": 0.5206229676536026, "grad_norm": 39.10978698730469, "learning_rate": 1.7318881916714207e-05, "loss": 3.8919, "step": 3042 }, { "epoch": 0.5207941126133835, "grad_norm": 29.357208251953125, "learning_rate": 1.7324586423274387e-05, "loss": 3.115, "step": 3043 }, { "epoch": 0.5209652575731645, "grad_norm": 15.655451774597168, "learning_rate": 1.733029092983457e-05, "loss": 1.4122, "step": 3044 }, { "epoch": 0.5211364025329454, "grad_norm": 28.293025970458984, "learning_rate": 1.7335995436394753e-05, "loss": 2.9349, "step": 3045 }, { "epoch": 0.5213075474927263, "grad_norm": 32.65211868286133, "learning_rate": 1.7341699942954933e-05, "loss": 3.2992, "step": 3046 }, { "epoch": 0.5214786924525072, "grad_norm": 23.2037296295166, "learning_rate": 1.7347404449515117e-05, "loss": 2.3879, "step": 3047 }, { "epoch": 0.5216498374122882, "grad_norm": 26.37859535217285, "learning_rate": 1.73531089560753e-05, "loss": 2.1987, "step": 3048 }, { "epoch": 0.5218209823720692, "grad_norm": 18.490966796875, "learning_rate": 1.7358813462635484e-05, "loss": 1.461, "step": 3049 }, { "epoch": 0.5219921273318501, "grad_norm": 31.97382354736328, "learning_rate": 1.7364517969195664e-05, "loss": 2.8849, "step": 3050 }, { "epoch": 0.5221632722916311, "grad_norm": 49.7996711730957, "learning_rate": 1.7370222475755847e-05, "loss": 6.9056, "step": 3051 }, { "epoch": 0.5223344172514119, "grad_norm": 28.981660842895508, "learning_rate": 1.737592698231603e-05, "loss": 3.0778, "step": 3052 }, { "epoch": 0.5225055622111929, "grad_norm": 120.67489624023438, "learning_rate": 1.738163148887621e-05, "loss": 4.2964, "step": 3053 }, { "epoch": 0.5226767071709738, "grad_norm": 158.1115264892578, "learning_rate": 1.7387335995436394e-05, "loss": 8.5312, "step": 3054 }, { "epoch": 0.5228478521307548, "grad_norm": 28.185558319091797, "learning_rate": 1.7393040501996577e-05, "loss": 3.537, "step": 3055 }, { "epoch": 0.5230189970905357, "grad_norm": 9.107454299926758, "learning_rate": 1.739874500855676e-05, "loss": 1.0684, "step": 3056 }, { "epoch": 0.5231901420503167, "grad_norm": 36.81668472290039, "learning_rate": 1.740444951511694e-05, "loss": 3.8926, "step": 3057 }, { "epoch": 0.5233612870100975, "grad_norm": 26.352327346801758, "learning_rate": 1.7410154021677124e-05, "loss": 2.664, "step": 3058 }, { "epoch": 0.5235324319698785, "grad_norm": 21.38902473449707, "learning_rate": 1.741585852823731e-05, "loss": 2.2566, "step": 3059 }, { "epoch": 0.5237035769296594, "grad_norm": 10.19254207611084, "learning_rate": 1.742156303479749e-05, "loss": 0.8717, "step": 3060 }, { "epoch": 0.5238747218894404, "grad_norm": 19.25916862487793, "learning_rate": 1.7427267541357674e-05, "loss": 1.3792, "step": 3061 }, { "epoch": 0.5240458668492213, "grad_norm": 21.88836669921875, "learning_rate": 1.7432972047917857e-05, "loss": 2.2543, "step": 3062 }, { "epoch": 0.5242170118090023, "grad_norm": 30.14661979675293, "learning_rate": 1.743867655447804e-05, "loss": 3.4672, "step": 3063 }, { "epoch": 0.5243881567687831, "grad_norm": 25.134571075439453, "learning_rate": 1.744438106103822e-05, "loss": 2.5836, "step": 3064 }, { "epoch": 0.5245593017285641, "grad_norm": 21.906818389892578, "learning_rate": 1.7450085567598404e-05, "loss": 2.4618, "step": 3065 }, { "epoch": 0.524730446688345, "grad_norm": 31.330976486206055, "learning_rate": 1.7455790074158587e-05, "loss": 3.1397, "step": 3066 }, { "epoch": 0.524901591648126, "grad_norm": 6.355524063110352, "learning_rate": 1.7461494580718767e-05, "loss": 0.5091, "step": 3067 }, { "epoch": 0.5250727366079069, "grad_norm": 110.98942565917969, "learning_rate": 1.746719908727895e-05, "loss": 7.9114, "step": 3068 }, { "epoch": 0.5252438815676879, "grad_norm": 31.17119789123535, "learning_rate": 1.7472903593839134e-05, "loss": 3.2594, "step": 3069 }, { "epoch": 0.5254150265274687, "grad_norm": 24.364032745361328, "learning_rate": 1.7478608100399318e-05, "loss": 1.9217, "step": 3070 }, { "epoch": 0.5255861714872497, "grad_norm": 34.264041900634766, "learning_rate": 1.7484312606959498e-05, "loss": 3.9812, "step": 3071 }, { "epoch": 0.5257573164470306, "grad_norm": 27.54375648498535, "learning_rate": 1.749001711351968e-05, "loss": 2.8373, "step": 3072 }, { "epoch": 0.5259284614068116, "grad_norm": 58.27510452270508, "learning_rate": 1.7495721620079864e-05, "loss": 7.2686, "step": 3073 }, { "epoch": 0.5260996063665925, "grad_norm": 27.861116409301758, "learning_rate": 1.7501426126640044e-05, "loss": 3.2877, "step": 3074 }, { "epoch": 0.5262707513263735, "grad_norm": 28.097177505493164, "learning_rate": 1.7507130633200228e-05, "loss": 2.3413, "step": 3075 }, { "epoch": 0.5264418962861543, "grad_norm": 30.74901008605957, "learning_rate": 1.751283513976041e-05, "loss": 3.2284, "step": 3076 }, { "epoch": 0.5266130412459353, "grad_norm": 5.434010982513428, "learning_rate": 1.751853964632059e-05, "loss": 0.5515, "step": 3077 }, { "epoch": 0.5267841862057162, "grad_norm": 19.591594696044922, "learning_rate": 1.7524244152880774e-05, "loss": 1.8104, "step": 3078 }, { "epoch": 0.5269553311654972, "grad_norm": 27.989707946777344, "learning_rate": 1.7529948659440958e-05, "loss": 2.4876, "step": 3079 }, { "epoch": 0.5271264761252781, "grad_norm": 45.50398635864258, "learning_rate": 1.753565316600114e-05, "loss": 6.9276, "step": 3080 }, { "epoch": 0.5272976210850591, "grad_norm": 29.907915115356445, "learning_rate": 1.754135767256132e-05, "loss": 3.8381, "step": 3081 }, { "epoch": 0.5274687660448399, "grad_norm": 22.03485679626465, "learning_rate": 1.7547062179121508e-05, "loss": 1.8432, "step": 3082 }, { "epoch": 0.5276399110046209, "grad_norm": 41.72187042236328, "learning_rate": 1.755276668568169e-05, "loss": 6.81, "step": 3083 }, { "epoch": 0.5278110559644018, "grad_norm": 15.85753345489502, "learning_rate": 1.755847119224187e-05, "loss": 1.1867, "step": 3084 }, { "epoch": 0.5279822009241828, "grad_norm": 14.52872085571289, "learning_rate": 1.7564175698802055e-05, "loss": 1.02, "step": 3085 }, { "epoch": 0.5281533458839637, "grad_norm": 47.226070404052734, "learning_rate": 1.7569880205362238e-05, "loss": 6.5701, "step": 3086 }, { "epoch": 0.5283244908437447, "grad_norm": 26.31117820739746, "learning_rate": 1.757558471192242e-05, "loss": 2.8588, "step": 3087 }, { "epoch": 0.5284956358035255, "grad_norm": 24.817096710205078, "learning_rate": 1.75812892184826e-05, "loss": 2.5557, "step": 3088 }, { "epoch": 0.5286667807633065, "grad_norm": 3.8697149753570557, "learning_rate": 1.7586993725042785e-05, "loss": 0.447, "step": 3089 }, { "epoch": 0.5288379257230874, "grad_norm": 27.01019287109375, "learning_rate": 1.7592698231602968e-05, "loss": 2.6394, "step": 3090 }, { "epoch": 0.5290090706828684, "grad_norm": 3.1552348136901855, "learning_rate": 1.7598402738163148e-05, "loss": 0.4523, "step": 3091 }, { "epoch": 0.5291802156426493, "grad_norm": 30.454021453857422, "learning_rate": 1.760410724472333e-05, "loss": 3.3332, "step": 3092 }, { "epoch": 0.5293513606024303, "grad_norm": 2.6408188343048096, "learning_rate": 1.7609811751283515e-05, "loss": 0.4075, "step": 3093 }, { "epoch": 0.5295225055622111, "grad_norm": 27.623132705688477, "learning_rate": 1.76155162578437e-05, "loss": 2.7079, "step": 3094 }, { "epoch": 0.5296936505219921, "grad_norm": 22.717605590820312, "learning_rate": 1.762122076440388e-05, "loss": 2.2378, "step": 3095 }, { "epoch": 0.529864795481773, "grad_norm": 50.63970184326172, "learning_rate": 1.7626925270964062e-05, "loss": 3.3046, "step": 3096 }, { "epoch": 0.530035940441554, "grad_norm": 47.14366912841797, "learning_rate": 1.7632629777524245e-05, "loss": 6.9276, "step": 3097 }, { "epoch": 0.5302070854013349, "grad_norm": 26.201753616333008, "learning_rate": 1.7638334284084425e-05, "loss": 2.3728, "step": 3098 }, { "epoch": 0.5303782303611159, "grad_norm": 33.462398529052734, "learning_rate": 1.764403879064461e-05, "loss": 6.8023, "step": 3099 }, { "epoch": 0.5305493753208969, "grad_norm": 32.51939010620117, "learning_rate": 1.7649743297204792e-05, "loss": 3.992, "step": 3100 }, { "epoch": 0.5307205202806777, "grad_norm": 14.161356925964355, "learning_rate": 1.7655447803764975e-05, "loss": 1.0058, "step": 3101 }, { "epoch": 0.5308916652404587, "grad_norm": 91.61168670654297, "learning_rate": 1.7661152310325155e-05, "loss": 6.6935, "step": 3102 }, { "epoch": 0.5310628102002396, "grad_norm": 26.40794563293457, "learning_rate": 1.766685681688534e-05, "loss": 2.5008, "step": 3103 }, { "epoch": 0.5312339551600206, "grad_norm": 21.793699264526367, "learning_rate": 1.7672561323445522e-05, "loss": 2.0443, "step": 3104 }, { "epoch": 0.5314051001198015, "grad_norm": 27.75925636291504, "learning_rate": 1.7678265830005705e-05, "loss": 2.893, "step": 3105 }, { "epoch": 0.5315762450795825, "grad_norm": 22.48872947692871, "learning_rate": 1.768397033656589e-05, "loss": 1.8274, "step": 3106 }, { "epoch": 0.5317473900393633, "grad_norm": 21.972978591918945, "learning_rate": 1.7689674843126072e-05, "loss": 2.0134, "step": 3107 }, { "epoch": 0.5319185349991443, "grad_norm": 4.393357753753662, "learning_rate": 1.7695379349686252e-05, "loss": 0.4006, "step": 3108 }, { "epoch": 0.5320896799589252, "grad_norm": 15.986166000366211, "learning_rate": 1.7701083856246435e-05, "loss": 1.0921, "step": 3109 }, { "epoch": 0.5322608249187062, "grad_norm": 23.317607879638672, "learning_rate": 1.770678836280662e-05, "loss": 2.5234, "step": 3110 }, { "epoch": 0.5324319698784871, "grad_norm": 81.8624267578125, "learning_rate": 1.7712492869366802e-05, "loss": 3.4206, "step": 3111 }, { "epoch": 0.532603114838268, "grad_norm": 46.01921844482422, "learning_rate": 1.7718197375926982e-05, "loss": 3.2694, "step": 3112 }, { "epoch": 0.5327742597980489, "grad_norm": 14.079997062683105, "learning_rate": 1.7723901882487166e-05, "loss": 1.1213, "step": 3113 }, { "epoch": 0.5329454047578299, "grad_norm": 27.70348358154297, "learning_rate": 1.772960638904735e-05, "loss": 2.9553, "step": 3114 }, { "epoch": 0.5331165497176108, "grad_norm": 13.08663558959961, "learning_rate": 1.773531089560753e-05, "loss": 0.9058, "step": 3115 }, { "epoch": 0.5332876946773918, "grad_norm": 5.895364761352539, "learning_rate": 1.7741015402167712e-05, "loss": 0.5572, "step": 3116 }, { "epoch": 0.5334588396371727, "grad_norm": 14.521390914916992, "learning_rate": 1.7746719908727896e-05, "loss": 1.2956, "step": 3117 }, { "epoch": 0.5336299845969537, "grad_norm": 5.561517238616943, "learning_rate": 1.775242441528808e-05, "loss": 0.7001, "step": 3118 }, { "epoch": 0.5338011295567345, "grad_norm": 12.158028602600098, "learning_rate": 1.775812892184826e-05, "loss": 0.8123, "step": 3119 }, { "epoch": 0.5339722745165155, "grad_norm": 32.72988510131836, "learning_rate": 1.7763833428408442e-05, "loss": 2.9845, "step": 3120 }, { "epoch": 0.5341434194762964, "grad_norm": 31.350831985473633, "learning_rate": 1.7769537934968626e-05, "loss": 3.6956, "step": 3121 }, { "epoch": 0.5343145644360774, "grad_norm": 19.63844871520996, "learning_rate": 1.7775242441528806e-05, "loss": 2.0279, "step": 3122 }, { "epoch": 0.5344857093958583, "grad_norm": 10.440444946289062, "learning_rate": 1.778094694808899e-05, "loss": 0.8947, "step": 3123 }, { "epoch": 0.5346568543556393, "grad_norm": 28.158235549926758, "learning_rate": 1.7786651454649173e-05, "loss": 2.9307, "step": 3124 }, { "epoch": 0.5348279993154201, "grad_norm": 25.009632110595703, "learning_rate": 1.7792355961209356e-05, "loss": 2.3439, "step": 3125 }, { "epoch": 0.5349991442752011, "grad_norm": 25.99068832397461, "learning_rate": 1.7798060467769536e-05, "loss": 2.681, "step": 3126 }, { "epoch": 0.535170289234982, "grad_norm": 16.541526794433594, "learning_rate": 1.780376497432972e-05, "loss": 1.7645, "step": 3127 }, { "epoch": 0.535341434194763, "grad_norm": 32.52701950073242, "learning_rate": 1.7809469480889906e-05, "loss": 3.4399, "step": 3128 }, { "epoch": 0.5355125791545439, "grad_norm": 1.9595259428024292, "learning_rate": 1.7815173987450086e-05, "loss": 0.3411, "step": 3129 }, { "epoch": 0.5356837241143249, "grad_norm": 22.871707916259766, "learning_rate": 1.782087849401027e-05, "loss": 2.7152, "step": 3130 }, { "epoch": 0.5358548690741057, "grad_norm": 30.88572120666504, "learning_rate": 1.7826583000570453e-05, "loss": 3.0383, "step": 3131 }, { "epoch": 0.5360260140338867, "grad_norm": 24.158727645874023, "learning_rate": 1.7832287507130636e-05, "loss": 2.7594, "step": 3132 }, { "epoch": 0.5361971589936676, "grad_norm": 19.16653823852539, "learning_rate": 1.7837992013690816e-05, "loss": 1.7749, "step": 3133 }, { "epoch": 0.5363683039534486, "grad_norm": 11.925354957580566, "learning_rate": 1.7843696520251e-05, "loss": 0.8569, "step": 3134 }, { "epoch": 0.5365394489132295, "grad_norm": 20.42278289794922, "learning_rate": 1.7849401026811183e-05, "loss": 1.9146, "step": 3135 }, { "epoch": 0.5367105938730105, "grad_norm": 36.13545227050781, "learning_rate": 1.7855105533371363e-05, "loss": 4.4798, "step": 3136 }, { "epoch": 0.5368817388327913, "grad_norm": 4.70065975189209, "learning_rate": 1.7860810039931546e-05, "loss": 0.4304, "step": 3137 }, { "epoch": 0.5370528837925723, "grad_norm": 24.28241539001465, "learning_rate": 1.786651454649173e-05, "loss": 2.4005, "step": 3138 }, { "epoch": 0.5372240287523532, "grad_norm": 14.650952339172363, "learning_rate": 1.787221905305191e-05, "loss": 0.968, "step": 3139 }, { "epoch": 0.5373951737121342, "grad_norm": 16.861696243286133, "learning_rate": 1.7877923559612093e-05, "loss": 1.7277, "step": 3140 }, { "epoch": 0.5375663186719151, "grad_norm": 5.233786106109619, "learning_rate": 1.7883628066172276e-05, "loss": 0.7488, "step": 3141 }, { "epoch": 0.537737463631696, "grad_norm": 32.38574981689453, "learning_rate": 1.788933257273246e-05, "loss": 3.4367, "step": 3142 }, { "epoch": 0.5379086085914769, "grad_norm": 75.13265991210938, "learning_rate": 1.789503707929264e-05, "loss": 3.4272, "step": 3143 }, { "epoch": 0.5380797535512579, "grad_norm": 23.56121063232422, "learning_rate": 1.7900741585852823e-05, "loss": 2.2267, "step": 3144 }, { "epoch": 0.5382508985110388, "grad_norm": 6.575436592102051, "learning_rate": 1.7906446092413007e-05, "loss": 0.7715, "step": 3145 }, { "epoch": 0.5384220434708198, "grad_norm": 30.233795166015625, "learning_rate": 1.7912150598973187e-05, "loss": 2.2247, "step": 3146 }, { "epoch": 0.5385931884306007, "grad_norm": 18.158550262451172, "learning_rate": 1.791785510553337e-05, "loss": 1.4116, "step": 3147 }, { "epoch": 0.5387643333903817, "grad_norm": 25.578800201416016, "learning_rate": 1.7923559612093553e-05, "loss": 2.237, "step": 3148 }, { "epoch": 0.5389354783501625, "grad_norm": 4.3977460861206055, "learning_rate": 1.7929264118653737e-05, "loss": 0.4421, "step": 3149 }, { "epoch": 0.5391066233099435, "grad_norm": 23.86539649963379, "learning_rate": 1.793496862521392e-05, "loss": 2.2867, "step": 3150 }, { "epoch": 0.5392777682697245, "grad_norm": 2.900665283203125, "learning_rate": 1.7940673131774103e-05, "loss": 0.3778, "step": 3151 }, { "epoch": 0.5394489132295054, "grad_norm": 28.02079200744629, "learning_rate": 1.7946377638334287e-05, "loss": 2.756, "step": 3152 }, { "epoch": 0.5396200581892864, "grad_norm": 27.565895080566406, "learning_rate": 1.7952082144894467e-05, "loss": 2.3044, "step": 3153 }, { "epoch": 0.5397912031490673, "grad_norm": 35.14018630981445, "learning_rate": 1.795778665145465e-05, "loss": 4.3437, "step": 3154 }, { "epoch": 0.5399623481088482, "grad_norm": 24.932573318481445, "learning_rate": 1.7963491158014834e-05, "loss": 2.2505, "step": 3155 }, { "epoch": 0.5401334930686291, "grad_norm": 26.866313934326172, "learning_rate": 1.7969195664575017e-05, "loss": 2.7324, "step": 3156 }, { "epoch": 0.5403046380284101, "grad_norm": 22.461328506469727, "learning_rate": 1.7974900171135197e-05, "loss": 2.1863, "step": 3157 }, { "epoch": 0.540475782988191, "grad_norm": 16.967121124267578, "learning_rate": 1.798060467769538e-05, "loss": 1.0429, "step": 3158 }, { "epoch": 0.540646927947972, "grad_norm": 116.18841552734375, "learning_rate": 1.7986309184255564e-05, "loss": 3.4443, "step": 3159 }, { "epoch": 0.5408180729077529, "grad_norm": 28.559480667114258, "learning_rate": 1.7992013690815744e-05, "loss": 2.4973, "step": 3160 }, { "epoch": 0.5409892178675338, "grad_norm": 3.590916395187378, "learning_rate": 1.7997718197375927e-05, "loss": 0.4966, "step": 3161 }, { "epoch": 0.5411603628273147, "grad_norm": 78.85108947753906, "learning_rate": 1.800342270393611e-05, "loss": 4.2312, "step": 3162 }, { "epoch": 0.5413315077870957, "grad_norm": 25.83539390563965, "learning_rate": 1.8009127210496294e-05, "loss": 2.4909, "step": 3163 }, { "epoch": 0.5415026527468766, "grad_norm": 4.292176246643066, "learning_rate": 1.8014831717056474e-05, "loss": 0.4314, "step": 3164 }, { "epoch": 0.5416737977066576, "grad_norm": 6.629253387451172, "learning_rate": 1.8020536223616657e-05, "loss": 0.7743, "step": 3165 }, { "epoch": 0.5418449426664385, "grad_norm": 22.770082473754883, "learning_rate": 1.802624073017684e-05, "loss": 2.2334, "step": 3166 }, { "epoch": 0.5420160876262194, "grad_norm": 26.48427963256836, "learning_rate": 1.803194523673702e-05, "loss": 3.4341, "step": 3167 }, { "epoch": 0.5421872325860003, "grad_norm": 9.429801940917969, "learning_rate": 1.8037649743297204e-05, "loss": 0.8092, "step": 3168 }, { "epoch": 0.5423583775457813, "grad_norm": 56.79134750366211, "learning_rate": 1.8043354249857387e-05, "loss": 7.2408, "step": 3169 }, { "epoch": 0.5425295225055622, "grad_norm": 26.484098434448242, "learning_rate": 1.804905875641757e-05, "loss": 2.4684, "step": 3170 }, { "epoch": 0.5427006674653432, "grad_norm": 21.694990158081055, "learning_rate": 1.805476326297775e-05, "loss": 2.1088, "step": 3171 }, { "epoch": 0.5428718124251241, "grad_norm": 23.824108123779297, "learning_rate": 1.8060467769537934e-05, "loss": 2.7028, "step": 3172 }, { "epoch": 0.543042957384905, "grad_norm": 23.9963321685791, "learning_rate": 1.806617227609812e-05, "loss": 2.1508, "step": 3173 }, { "epoch": 0.5432141023446859, "grad_norm": 23.810443878173828, "learning_rate": 1.80718767826583e-05, "loss": 3.0015, "step": 3174 }, { "epoch": 0.5433852473044669, "grad_norm": 38.47050857543945, "learning_rate": 1.8077581289218484e-05, "loss": 7.0851, "step": 3175 }, { "epoch": 0.5435563922642478, "grad_norm": 26.14175033569336, "learning_rate": 1.8083285795778668e-05, "loss": 2.8927, "step": 3176 }, { "epoch": 0.5437275372240288, "grad_norm": 34.895294189453125, "learning_rate": 1.8088990302338848e-05, "loss": 6.4431, "step": 3177 }, { "epoch": 0.5438986821838097, "grad_norm": 30.46366310119629, "learning_rate": 1.809469480889903e-05, "loss": 3.8774, "step": 3178 }, { "epoch": 0.5440698271435906, "grad_norm": 2.045729637145996, "learning_rate": 1.8100399315459214e-05, "loss": 0.3887, "step": 3179 }, { "epoch": 0.5442409721033715, "grad_norm": 23.526275634765625, "learning_rate": 1.8106103822019398e-05, "loss": 2.2568, "step": 3180 }, { "epoch": 0.5444121170631525, "grad_norm": 37.37553024291992, "learning_rate": 1.8111808328579578e-05, "loss": 5.5636, "step": 3181 }, { "epoch": 0.5445832620229334, "grad_norm": 2.853957176208496, "learning_rate": 1.811751283513976e-05, "loss": 0.3745, "step": 3182 }, { "epoch": 0.5447544069827144, "grad_norm": 3.5641119480133057, "learning_rate": 1.8123217341699944e-05, "loss": 0.424, "step": 3183 }, { "epoch": 0.5449255519424953, "grad_norm": 8.759005546569824, "learning_rate": 1.8128921848260124e-05, "loss": 0.7256, "step": 3184 }, { "epoch": 0.5450966969022762, "grad_norm": 54.43397521972656, "learning_rate": 1.8134626354820308e-05, "loss": 7.7319, "step": 3185 }, { "epoch": 0.5452678418620571, "grad_norm": 26.35443878173828, "learning_rate": 1.814033086138049e-05, "loss": 3.1477, "step": 3186 }, { "epoch": 0.5454389868218381, "grad_norm": 18.872291564941406, "learning_rate": 1.8146035367940675e-05, "loss": 1.8724, "step": 3187 }, { "epoch": 0.545610131781619, "grad_norm": 22.673784255981445, "learning_rate": 1.8151739874500855e-05, "loss": 1.9899, "step": 3188 }, { "epoch": 0.5457812767414, "grad_norm": 9.217958450317383, "learning_rate": 1.8157444381061038e-05, "loss": 0.726, "step": 3189 }, { "epoch": 0.5459524217011809, "grad_norm": 2.148630380630493, "learning_rate": 1.816314888762122e-05, "loss": 0.323, "step": 3190 }, { "epoch": 0.5461235666609618, "grad_norm": 26.988340377807617, "learning_rate": 1.81688533941814e-05, "loss": 2.586, "step": 3191 }, { "epoch": 0.5462947116207427, "grad_norm": 37.6932373046875, "learning_rate": 1.8174557900741585e-05, "loss": 2.9146, "step": 3192 }, { "epoch": 0.5464658565805237, "grad_norm": 74.42720794677734, "learning_rate": 1.8180262407301768e-05, "loss": 3.2535, "step": 3193 }, { "epoch": 0.5466370015403046, "grad_norm": 29.757360458374023, "learning_rate": 1.818596691386195e-05, "loss": 2.8882, "step": 3194 }, { "epoch": 0.5468081465000856, "grad_norm": 15.420557975769043, "learning_rate": 1.819167142042213e-05, "loss": 1.6278, "step": 3195 }, { "epoch": 0.5469792914598665, "grad_norm": 31.367387771606445, "learning_rate": 1.8197375926982318e-05, "loss": 2.9209, "step": 3196 }, { "epoch": 0.5471504364196474, "grad_norm": 28.30303382873535, "learning_rate": 1.82030804335425e-05, "loss": 3.066, "step": 3197 }, { "epoch": 0.5473215813794283, "grad_norm": 27.540369033813477, "learning_rate": 1.820878494010268e-05, "loss": 3.3267, "step": 3198 }, { "epoch": 0.5474927263392093, "grad_norm": 4.438743591308594, "learning_rate": 1.8214489446662865e-05, "loss": 0.3722, "step": 3199 }, { "epoch": 0.5476638712989902, "grad_norm": 29.85404396057129, "learning_rate": 1.822019395322305e-05, "loss": 4.0139, "step": 3200 }, { "epoch": 0.5478350162587712, "grad_norm": 28.56346893310547, "learning_rate": 1.8225898459783232e-05, "loss": 3.6119, "step": 3201 }, { "epoch": 0.5480061612185522, "grad_norm": 29.419742584228516, "learning_rate": 1.8231602966343412e-05, "loss": 2.9087, "step": 3202 }, { "epoch": 0.548177306178333, "grad_norm": 44.72222900390625, "learning_rate": 1.8237307472903595e-05, "loss": 6.7043, "step": 3203 }, { "epoch": 0.548348451138114, "grad_norm": 21.762168884277344, "learning_rate": 1.824301197946378e-05, "loss": 1.9849, "step": 3204 }, { "epoch": 0.5485195960978949, "grad_norm": 26.09598731994629, "learning_rate": 1.824871648602396e-05, "loss": 2.6737, "step": 3205 }, { "epoch": 0.5486907410576759, "grad_norm": 32.6449089050293, "learning_rate": 1.8254420992584142e-05, "loss": 2.7534, "step": 3206 }, { "epoch": 0.5488618860174568, "grad_norm": 20.140134811401367, "learning_rate": 1.8260125499144325e-05, "loss": 2.0024, "step": 3207 }, { "epoch": 0.5490330309772378, "grad_norm": 8.021845817565918, "learning_rate": 1.8265830005704505e-05, "loss": 0.8484, "step": 3208 }, { "epoch": 0.5492041759370186, "grad_norm": 23.706680297851562, "learning_rate": 1.827153451226469e-05, "loss": 2.5228, "step": 3209 }, { "epoch": 0.5493753208967996, "grad_norm": 25.105031967163086, "learning_rate": 1.8277239018824872e-05, "loss": 2.4342, "step": 3210 }, { "epoch": 0.5495464658565805, "grad_norm": 16.53352165222168, "learning_rate": 1.8282943525385055e-05, "loss": 1.4315, "step": 3211 }, { "epoch": 0.5497176108163615, "grad_norm": 24.55224609375, "learning_rate": 1.8288648031945235e-05, "loss": 2.469, "step": 3212 }, { "epoch": 0.5498887557761424, "grad_norm": 3.4264721870422363, "learning_rate": 1.829435253850542e-05, "loss": 0.3677, "step": 3213 }, { "epoch": 0.5500599007359234, "grad_norm": 20.305509567260742, "learning_rate": 1.8300057045065602e-05, "loss": 2.3214, "step": 3214 }, { "epoch": 0.5502310456957042, "grad_norm": 27.69756507873535, "learning_rate": 1.8305761551625782e-05, "loss": 2.8746, "step": 3215 }, { "epoch": 0.5504021906554852, "grad_norm": 100.86264038085938, "learning_rate": 1.8311466058185965e-05, "loss": 7.5686, "step": 3216 }, { "epoch": 0.5505733356152661, "grad_norm": 26.603628158569336, "learning_rate": 1.831717056474615e-05, "loss": 2.5639, "step": 3217 }, { "epoch": 0.5507444805750471, "grad_norm": 31.449655532836914, "learning_rate": 1.8322875071306332e-05, "loss": 3.9795, "step": 3218 }, { "epoch": 0.550915625534828, "grad_norm": 25.562639236450195, "learning_rate": 1.8328579577866516e-05, "loss": 3.0795, "step": 3219 }, { "epoch": 0.551086770494609, "grad_norm": 4.988560199737549, "learning_rate": 1.83342840844267e-05, "loss": 0.4445, "step": 3220 }, { "epoch": 0.5512579154543898, "grad_norm": 31.045183181762695, "learning_rate": 1.8339988590986882e-05, "loss": 6.4413, "step": 3221 }, { "epoch": 0.5514290604141708, "grad_norm": 32.938106536865234, "learning_rate": 1.8345693097547062e-05, "loss": 4.0821, "step": 3222 }, { "epoch": 0.5516002053739517, "grad_norm": 23.498254776000977, "learning_rate": 1.8351397604107246e-05, "loss": 2.8752, "step": 3223 }, { "epoch": 0.5517713503337327, "grad_norm": 27.559247970581055, "learning_rate": 1.835710211066743e-05, "loss": 3.3216, "step": 3224 }, { "epoch": 0.5519424952935136, "grad_norm": 46.420135498046875, "learning_rate": 1.8362806617227613e-05, "loss": 6.9903, "step": 3225 }, { "epoch": 0.5521136402532946, "grad_norm": 23.508155822753906, "learning_rate": 1.8368511123787793e-05, "loss": 2.1877, "step": 3226 }, { "epoch": 0.5522847852130754, "grad_norm": 20.4776611328125, "learning_rate": 1.8374215630347976e-05, "loss": 1.8942, "step": 3227 }, { "epoch": 0.5524559301728564, "grad_norm": 15.294054985046387, "learning_rate": 1.837992013690816e-05, "loss": 1.2082, "step": 3228 }, { "epoch": 0.5526270751326373, "grad_norm": 22.51180076599121, "learning_rate": 1.838562464346834e-05, "loss": 2.2929, "step": 3229 }, { "epoch": 0.5527982200924183, "grad_norm": 21.741634368896484, "learning_rate": 1.8391329150028523e-05, "loss": 1.884, "step": 3230 }, { "epoch": 0.5529693650521992, "grad_norm": 4.330467224121094, "learning_rate": 1.8397033656588706e-05, "loss": 0.4163, "step": 3231 }, { "epoch": 0.5531405100119802, "grad_norm": 26.344017028808594, "learning_rate": 1.840273816314889e-05, "loss": 2.5767, "step": 3232 }, { "epoch": 0.553311654971761, "grad_norm": 53.116172790527344, "learning_rate": 1.840844266970907e-05, "loss": 2.7426, "step": 3233 }, { "epoch": 0.553482799931542, "grad_norm": 15.442861557006836, "learning_rate": 1.8414147176269253e-05, "loss": 0.9586, "step": 3234 }, { "epoch": 0.5536539448913229, "grad_norm": 28.15229606628418, "learning_rate": 1.8419851682829436e-05, "loss": 2.9423, "step": 3235 }, { "epoch": 0.5538250898511039, "grad_norm": 21.91160011291504, "learning_rate": 1.8425556189389616e-05, "loss": 2.3152, "step": 3236 }, { "epoch": 0.5539962348108848, "grad_norm": 21.20878028869629, "learning_rate": 1.84312606959498e-05, "loss": 2.117, "step": 3237 }, { "epoch": 0.5541673797706658, "grad_norm": 1.7572641372680664, "learning_rate": 1.8436965202509983e-05, "loss": 0.3516, "step": 3238 }, { "epoch": 0.5543385247304466, "grad_norm": 25.218217849731445, "learning_rate": 1.8442669709070166e-05, "loss": 2.5554, "step": 3239 }, { "epoch": 0.5545096696902276, "grad_norm": 30.133291244506836, "learning_rate": 1.8448374215630346e-05, "loss": 3.078, "step": 3240 }, { "epoch": 0.5546808146500085, "grad_norm": 30.298227310180664, "learning_rate": 1.845407872219053e-05, "loss": 3.2276, "step": 3241 }, { "epoch": 0.5548519596097895, "grad_norm": 31.560077667236328, "learning_rate": 1.8459783228750716e-05, "loss": 3.713, "step": 3242 }, { "epoch": 0.5550231045695704, "grad_norm": 7.287442207336426, "learning_rate": 1.8465487735310896e-05, "loss": 0.4505, "step": 3243 }, { "epoch": 0.5551942495293514, "grad_norm": 12.331917762756348, "learning_rate": 1.847119224187108e-05, "loss": 0.8588, "step": 3244 }, { "epoch": 0.5553653944891322, "grad_norm": 44.76494216918945, "learning_rate": 1.8476896748431263e-05, "loss": 7.2343, "step": 3245 }, { "epoch": 0.5555365394489132, "grad_norm": 32.351600646972656, "learning_rate": 1.8482601254991443e-05, "loss": 4.0445, "step": 3246 }, { "epoch": 0.5557076844086941, "grad_norm": 28.279098510742188, "learning_rate": 1.8488305761551627e-05, "loss": 3.0796, "step": 3247 }, { "epoch": 0.5558788293684751, "grad_norm": 28.361543655395508, "learning_rate": 1.849401026811181e-05, "loss": 3.3123, "step": 3248 }, { "epoch": 0.556049974328256, "grad_norm": 41.510597229003906, "learning_rate": 1.8499714774671993e-05, "loss": 6.904, "step": 3249 }, { "epoch": 0.556221119288037, "grad_norm": 28.658105850219727, "learning_rate": 1.8505419281232173e-05, "loss": 3.0404, "step": 3250 }, { "epoch": 0.5563922642478178, "grad_norm": 28.752214431762695, "learning_rate": 1.8511123787792357e-05, "loss": 3.7672, "step": 3251 }, { "epoch": 0.5565634092075988, "grad_norm": 34.223060607910156, "learning_rate": 1.851682829435254e-05, "loss": 5.3153, "step": 3252 }, { "epoch": 0.5567345541673798, "grad_norm": 12.981977462768555, "learning_rate": 1.852253280091272e-05, "loss": 0.9685, "step": 3253 }, { "epoch": 0.5569056991271607, "grad_norm": 20.176815032958984, "learning_rate": 1.8528237307472903e-05, "loss": 1.9294, "step": 3254 }, { "epoch": 0.5570768440869417, "grad_norm": 24.55870246887207, "learning_rate": 1.8533941814033087e-05, "loss": 2.8725, "step": 3255 }, { "epoch": 0.5572479890467226, "grad_norm": 21.821077346801758, "learning_rate": 1.853964632059327e-05, "loss": 2.0436, "step": 3256 }, { "epoch": 0.5574191340065036, "grad_norm": 23.027362823486328, "learning_rate": 1.854535082715345e-05, "loss": 2.1357, "step": 3257 }, { "epoch": 0.5575902789662844, "grad_norm": 10.035751342773438, "learning_rate": 1.8551055333713634e-05, "loss": 0.6818, "step": 3258 }, { "epoch": 0.5577614239260654, "grad_norm": 4.451612949371338, "learning_rate": 1.8556759840273817e-05, "loss": 0.4069, "step": 3259 }, { "epoch": 0.5579325688858463, "grad_norm": 25.953588485717773, "learning_rate": 1.8562464346833997e-05, "loss": 2.8325, "step": 3260 }, { "epoch": 0.5581037138456273, "grad_norm": 63.914024353027344, "learning_rate": 1.856816885339418e-05, "loss": 8.0674, "step": 3261 }, { "epoch": 0.5582748588054082, "grad_norm": 11.405961990356445, "learning_rate": 1.8573873359954364e-05, "loss": 0.8008, "step": 3262 }, { "epoch": 0.5584460037651892, "grad_norm": 21.461894989013672, "learning_rate": 1.8579577866514547e-05, "loss": 2.1028, "step": 3263 }, { "epoch": 0.55861714872497, "grad_norm": 22.72207260131836, "learning_rate": 1.8585282373074727e-05, "loss": 2.0172, "step": 3264 }, { "epoch": 0.558788293684751, "grad_norm": 27.586618423461914, "learning_rate": 1.8590986879634914e-05, "loss": 2.0431, "step": 3265 }, { "epoch": 0.5589594386445319, "grad_norm": 26.892311096191406, "learning_rate": 1.8596691386195097e-05, "loss": 2.5198, "step": 3266 }, { "epoch": 0.5591305836043129, "grad_norm": 22.420379638671875, "learning_rate": 1.8602395892755277e-05, "loss": 2.155, "step": 3267 }, { "epoch": 0.5593017285640938, "grad_norm": 8.758207321166992, "learning_rate": 1.860810039931546e-05, "loss": 1.3864, "step": 3268 }, { "epoch": 0.5594728735238748, "grad_norm": 8.072163581848145, "learning_rate": 1.8613804905875644e-05, "loss": 0.5867, "step": 3269 }, { "epoch": 0.5596440184836556, "grad_norm": 6.457092761993408, "learning_rate": 1.8619509412435827e-05, "loss": 0.6464, "step": 3270 }, { "epoch": 0.5598151634434366, "grad_norm": 7.5770392417907715, "learning_rate": 1.8625213918996007e-05, "loss": 0.8377, "step": 3271 }, { "epoch": 0.5599863084032175, "grad_norm": 28.0118350982666, "learning_rate": 1.863091842555619e-05, "loss": 3.5048, "step": 3272 }, { "epoch": 0.5601574533629985, "grad_norm": 75.3667984008789, "learning_rate": 1.8636622932116374e-05, "loss": 7.3634, "step": 3273 }, { "epoch": 0.5603285983227794, "grad_norm": 6.486256122589111, "learning_rate": 1.8642327438676554e-05, "loss": 0.772, "step": 3274 }, { "epoch": 0.5604997432825604, "grad_norm": 18.678125381469727, "learning_rate": 1.8648031945236737e-05, "loss": 1.9006, "step": 3275 }, { "epoch": 0.5606708882423412, "grad_norm": 7.29653263092041, "learning_rate": 1.865373645179692e-05, "loss": 1.075, "step": 3276 }, { "epoch": 0.5608420332021222, "grad_norm": 2.164841890335083, "learning_rate": 1.86594409583571e-05, "loss": 0.3693, "step": 3277 }, { "epoch": 0.5610131781619031, "grad_norm": 21.217857360839844, "learning_rate": 1.8665145464917284e-05, "loss": 2.2186, "step": 3278 }, { "epoch": 0.5611843231216841, "grad_norm": 8.882852554321289, "learning_rate": 1.8670849971477468e-05, "loss": 0.9134, "step": 3279 }, { "epoch": 0.561355468081465, "grad_norm": 17.709449768066406, "learning_rate": 1.867655447803765e-05, "loss": 1.5424, "step": 3280 }, { "epoch": 0.561526613041246, "grad_norm": 5.205548286437988, "learning_rate": 1.868225898459783e-05, "loss": 0.6676, "step": 3281 }, { "epoch": 0.5616977580010268, "grad_norm": 52.959259033203125, "learning_rate": 1.8687963491158014e-05, "loss": 7.2033, "step": 3282 }, { "epoch": 0.5618689029608078, "grad_norm": 29.030555725097656, "learning_rate": 1.8693667997718198e-05, "loss": 2.9477, "step": 3283 }, { "epoch": 0.5620400479205887, "grad_norm": 27.745031356811523, "learning_rate": 1.8699372504278378e-05, "loss": 2.719, "step": 3284 }, { "epoch": 0.5622111928803697, "grad_norm": 25.971677780151367, "learning_rate": 1.870507701083856e-05, "loss": 2.706, "step": 3285 }, { "epoch": 0.5623823378401506, "grad_norm": 27.33722686767578, "learning_rate": 1.8710781517398744e-05, "loss": 2.568, "step": 3286 }, { "epoch": 0.5625534827999316, "grad_norm": 22.52666473388672, "learning_rate": 1.8716486023958928e-05, "loss": 2.3127, "step": 3287 }, { "epoch": 0.5627246277597124, "grad_norm": 10.016031265258789, "learning_rate": 1.872219053051911e-05, "loss": 1.4001, "step": 3288 }, { "epoch": 0.5628957727194934, "grad_norm": 24.30003547668457, "learning_rate": 1.8727895037079295e-05, "loss": 2.4201, "step": 3289 }, { "epoch": 0.5630669176792743, "grad_norm": 6.622725009918213, "learning_rate": 1.8733599543639478e-05, "loss": 0.7098, "step": 3290 }, { "epoch": 0.5632380626390553, "grad_norm": 24.1121883392334, "learning_rate": 1.8739304050199658e-05, "loss": 1.887, "step": 3291 }, { "epoch": 0.5634092075988362, "grad_norm": 31.559614181518555, "learning_rate": 1.874500855675984e-05, "loss": 3.5751, "step": 3292 }, { "epoch": 0.5635803525586172, "grad_norm": 33.04099655151367, "learning_rate": 1.8750713063320025e-05, "loss": 3.2233, "step": 3293 }, { "epoch": 0.563751497518398, "grad_norm": 81.57552337646484, "learning_rate": 1.8756417569880208e-05, "loss": 3.8443, "step": 3294 }, { "epoch": 0.563922642478179, "grad_norm": 30.438037872314453, "learning_rate": 1.8762122076440388e-05, "loss": 3.7816, "step": 3295 }, { "epoch": 0.5640937874379599, "grad_norm": 7.756313323974609, "learning_rate": 1.876782658300057e-05, "loss": 0.7225, "step": 3296 }, { "epoch": 0.5642649323977409, "grad_norm": 28.59238624572754, "learning_rate": 1.8773531089560755e-05, "loss": 3.1516, "step": 3297 }, { "epoch": 0.5644360773575218, "grad_norm": 25.167417526245117, "learning_rate": 1.8779235596120935e-05, "loss": 3.174, "step": 3298 }, { "epoch": 0.5646072223173028, "grad_norm": 86.82372283935547, "learning_rate": 1.8784940102681118e-05, "loss": 4.5193, "step": 3299 }, { "epoch": 0.5647783672770836, "grad_norm": 30.278440475463867, "learning_rate": 1.87906446092413e-05, "loss": 3.2045, "step": 3300 }, { "epoch": 0.5649495122368646, "grad_norm": 34.26241683959961, "learning_rate": 1.8796349115801485e-05, "loss": 3.7586, "step": 3301 }, { "epoch": 0.5651206571966455, "grad_norm": 20.874797821044922, "learning_rate": 1.8802053622361665e-05, "loss": 1.9123, "step": 3302 }, { "epoch": 0.5652918021564265, "grad_norm": 26.034624099731445, "learning_rate": 1.8807758128921848e-05, "loss": 2.522, "step": 3303 }, { "epoch": 0.5654629471162075, "grad_norm": 11.349614143371582, "learning_rate": 1.881346263548203e-05, "loss": 0.9236, "step": 3304 }, { "epoch": 0.5656340920759884, "grad_norm": 10.266570091247559, "learning_rate": 1.881916714204221e-05, "loss": 0.6643, "step": 3305 }, { "epoch": 0.5658052370357693, "grad_norm": 32.189842224121094, "learning_rate": 1.8824871648602395e-05, "loss": 4.5101, "step": 3306 }, { "epoch": 0.5659763819955502, "grad_norm": 24.921152114868164, "learning_rate": 1.883057615516258e-05, "loss": 2.9263, "step": 3307 }, { "epoch": 0.5661475269553312, "grad_norm": 35.14552307128906, "learning_rate": 1.883628066172276e-05, "loss": 4.0464, "step": 3308 }, { "epoch": 0.5663186719151121, "grad_norm": 37.087039947509766, "learning_rate": 1.8841985168282942e-05, "loss": 4.0199, "step": 3309 }, { "epoch": 0.5664898168748931, "grad_norm": 26.691438674926758, "learning_rate": 1.8847689674843125e-05, "loss": 3.2809, "step": 3310 }, { "epoch": 0.566660961834674, "grad_norm": 31.133575439453125, "learning_rate": 1.8853394181403312e-05, "loss": 3.4234, "step": 3311 }, { "epoch": 0.566832106794455, "grad_norm": 98.82320404052734, "learning_rate": 1.8859098687963492e-05, "loss": 7.426, "step": 3312 }, { "epoch": 0.5670032517542358, "grad_norm": 26.13225746154785, "learning_rate": 1.8864803194523675e-05, "loss": 2.5794, "step": 3313 }, { "epoch": 0.5671743967140168, "grad_norm": 28.947038650512695, "learning_rate": 1.887050770108386e-05, "loss": 2.7049, "step": 3314 }, { "epoch": 0.5673455416737977, "grad_norm": 23.491085052490234, "learning_rate": 1.887621220764404e-05, "loss": 2.1517, "step": 3315 }, { "epoch": 0.5675166866335787, "grad_norm": 17.27471351623535, "learning_rate": 1.8881916714204222e-05, "loss": 1.4919, "step": 3316 }, { "epoch": 0.5676878315933596, "grad_norm": 34.11532974243164, "learning_rate": 1.8887621220764405e-05, "loss": 4.5321, "step": 3317 }, { "epoch": 0.5678589765531405, "grad_norm": 19.040075302124023, "learning_rate": 1.889332572732459e-05, "loss": 1.6601, "step": 3318 }, { "epoch": 0.5680301215129214, "grad_norm": 18.085039138793945, "learning_rate": 1.889903023388477e-05, "loss": 1.5302, "step": 3319 }, { "epoch": 0.5682012664727024, "grad_norm": 27.968341827392578, "learning_rate": 1.8904734740444952e-05, "loss": 3.3977, "step": 3320 }, { "epoch": 0.5683724114324833, "grad_norm": 2.1676626205444336, "learning_rate": 1.8910439247005136e-05, "loss": 0.3746, "step": 3321 }, { "epoch": 0.5685435563922643, "grad_norm": 3.0772573947906494, "learning_rate": 1.8916143753565316e-05, "loss": 0.4364, "step": 3322 }, { "epoch": 0.5687147013520452, "grad_norm": 25.465309143066406, "learning_rate": 1.89218482601255e-05, "loss": 2.5141, "step": 3323 }, { "epoch": 0.5688858463118261, "grad_norm": 28.62706184387207, "learning_rate": 1.8927552766685682e-05, "loss": 2.9376, "step": 3324 }, { "epoch": 0.569056991271607, "grad_norm": 10.36950969696045, "learning_rate": 1.8933257273245866e-05, "loss": 0.9048, "step": 3325 }, { "epoch": 0.569228136231388, "grad_norm": 22.50096893310547, "learning_rate": 1.8938961779806046e-05, "loss": 2.3555, "step": 3326 }, { "epoch": 0.5693992811911689, "grad_norm": 25.440292358398438, "learning_rate": 1.894466628636623e-05, "loss": 2.5734, "step": 3327 }, { "epoch": 0.5695704261509499, "grad_norm": 5.363638401031494, "learning_rate": 1.8950370792926412e-05, "loss": 0.4315, "step": 3328 }, { "epoch": 0.5697415711107308, "grad_norm": 29.033611297607422, "learning_rate": 1.8956075299486592e-05, "loss": 2.7605, "step": 3329 }, { "epoch": 0.5699127160705117, "grad_norm": 6.961116790771484, "learning_rate": 1.8961779806046776e-05, "loss": 1.2524, "step": 3330 }, { "epoch": 0.5700838610302926, "grad_norm": 29.2668399810791, "learning_rate": 1.896748431260696e-05, "loss": 3.3542, "step": 3331 }, { "epoch": 0.5702550059900736, "grad_norm": 38.82827377319336, "learning_rate": 1.8973188819167143e-05, "loss": 4.5204, "step": 3332 }, { "epoch": 0.5704261509498545, "grad_norm": 32.07524871826172, "learning_rate": 1.8978893325727326e-05, "loss": 4.3224, "step": 3333 }, { "epoch": 0.5705972959096355, "grad_norm": 3.1426124572753906, "learning_rate": 1.898459783228751e-05, "loss": 0.4036, "step": 3334 }, { "epoch": 0.5707684408694164, "grad_norm": 19.389469146728516, "learning_rate": 1.8990302338847693e-05, "loss": 2.1048, "step": 3335 }, { "epoch": 0.5709395858291973, "grad_norm": 17.071313858032227, "learning_rate": 1.8996006845407873e-05, "loss": 1.6332, "step": 3336 }, { "epoch": 0.5711107307889782, "grad_norm": 7.998443603515625, "learning_rate": 1.9001711351968056e-05, "loss": 0.6125, "step": 3337 }, { "epoch": 0.5712818757487592, "grad_norm": 27.566017150878906, "learning_rate": 1.900741585852824e-05, "loss": 3.0635, "step": 3338 }, { "epoch": 0.5714530207085401, "grad_norm": 6.867462158203125, "learning_rate": 1.901312036508842e-05, "loss": 0.5083, "step": 3339 }, { "epoch": 0.5716241656683211, "grad_norm": 24.942699432373047, "learning_rate": 1.9018824871648603e-05, "loss": 2.4329, "step": 3340 }, { "epoch": 0.571795310628102, "grad_norm": 17.44595718383789, "learning_rate": 1.9024529378208786e-05, "loss": 1.2566, "step": 3341 }, { "epoch": 0.571966455587883, "grad_norm": 30.833187103271484, "learning_rate": 1.903023388476897e-05, "loss": 2.7353, "step": 3342 }, { "epoch": 0.5721376005476638, "grad_norm": 31.722270965576172, "learning_rate": 1.903593839132915e-05, "loss": 3.8463, "step": 3343 }, { "epoch": 0.5723087455074448, "grad_norm": 12.909158706665039, "learning_rate": 1.9041642897889333e-05, "loss": 1.0365, "step": 3344 }, { "epoch": 0.5724798904672257, "grad_norm": 32.17844772338867, "learning_rate": 1.9047347404449516e-05, "loss": 3.2414, "step": 3345 }, { "epoch": 0.5726510354270067, "grad_norm": 25.432022094726562, "learning_rate": 1.9053051911009696e-05, "loss": 2.4539, "step": 3346 }, { "epoch": 0.5728221803867876, "grad_norm": 2.373732805252075, "learning_rate": 1.905875641756988e-05, "loss": 0.3677, "step": 3347 }, { "epoch": 0.5729933253465685, "grad_norm": 40.49632263183594, "learning_rate": 1.9064460924130063e-05, "loss": 3.9831, "step": 3348 }, { "epoch": 0.5731644703063494, "grad_norm": 1.9657033681869507, "learning_rate": 1.9070165430690246e-05, "loss": 0.3775, "step": 3349 }, { "epoch": 0.5733356152661304, "grad_norm": 61.38923645019531, "learning_rate": 1.9075869937250426e-05, "loss": 7.6187, "step": 3350 }, { "epoch": 0.5735067602259113, "grad_norm": 24.892297744750977, "learning_rate": 1.908157444381061e-05, "loss": 2.5702, "step": 3351 }, { "epoch": 0.5736779051856923, "grad_norm": 27.634868621826172, "learning_rate": 1.9087278950370793e-05, "loss": 2.6693, "step": 3352 }, { "epoch": 0.5738490501454732, "grad_norm": 30.543689727783203, "learning_rate": 1.9092983456930973e-05, "loss": 2.881, "step": 3353 }, { "epoch": 0.5740201951052541, "grad_norm": 20.875457763671875, "learning_rate": 1.9098687963491157e-05, "loss": 2.0431, "step": 3354 }, { "epoch": 0.5741913400650351, "grad_norm": 10.260396003723145, "learning_rate": 1.910439247005134e-05, "loss": 1.0317, "step": 3355 }, { "epoch": 0.574362485024816, "grad_norm": 28.790538787841797, "learning_rate": 1.9110096976611527e-05, "loss": 2.8238, "step": 3356 }, { "epoch": 0.574533629984597, "grad_norm": 25.868772506713867, "learning_rate": 1.9115801483171707e-05, "loss": 2.5919, "step": 3357 }, { "epoch": 0.5747047749443779, "grad_norm": 25.83347511291504, "learning_rate": 1.912150598973189e-05, "loss": 3.1996, "step": 3358 }, { "epoch": 0.5748759199041589, "grad_norm": 29.329633712768555, "learning_rate": 1.9127210496292073e-05, "loss": 3.1316, "step": 3359 }, { "epoch": 0.5750470648639397, "grad_norm": 9.001529693603516, "learning_rate": 1.9132915002852253e-05, "loss": 1.2363, "step": 3360 }, { "epoch": 0.5752182098237207, "grad_norm": 5.358071804046631, "learning_rate": 1.9138619509412437e-05, "loss": 0.3949, "step": 3361 }, { "epoch": 0.5753893547835016, "grad_norm": 13.40963363647461, "learning_rate": 1.914432401597262e-05, "loss": 0.7941, "step": 3362 }, { "epoch": 0.5755604997432826, "grad_norm": 37.820556640625, "learning_rate": 1.9150028522532804e-05, "loss": 6.717, "step": 3363 }, { "epoch": 0.5757316447030635, "grad_norm": 181.16746520996094, "learning_rate": 1.9155733029092984e-05, "loss": 9.5897, "step": 3364 }, { "epoch": 0.5759027896628445, "grad_norm": 29.568683624267578, "learning_rate": 1.9161437535653167e-05, "loss": 2.4104, "step": 3365 }, { "epoch": 0.5760739346226253, "grad_norm": 10.582496643066406, "learning_rate": 1.916714204221335e-05, "loss": 0.5204, "step": 3366 }, { "epoch": 0.5762450795824063, "grad_norm": 37.75896072387695, "learning_rate": 1.917284654877353e-05, "loss": 6.4064, "step": 3367 }, { "epoch": 0.5764162245421872, "grad_norm": 23.44141960144043, "learning_rate": 1.9178551055333714e-05, "loss": 2.2333, "step": 3368 }, { "epoch": 0.5765873695019682, "grad_norm": 23.17081642150879, "learning_rate": 1.9184255561893897e-05, "loss": 2.3916, "step": 3369 }, { "epoch": 0.5767585144617491, "grad_norm": 22.356122970581055, "learning_rate": 1.918996006845408e-05, "loss": 2.259, "step": 3370 }, { "epoch": 0.5769296594215301, "grad_norm": 25.988954544067383, "learning_rate": 1.919566457501426e-05, "loss": 2.6056, "step": 3371 }, { "epoch": 0.577100804381311, "grad_norm": 17.81022071838379, "learning_rate": 1.9201369081574444e-05, "loss": 1.3797, "step": 3372 }, { "epoch": 0.5772719493410919, "grad_norm": 28.269866943359375, "learning_rate": 1.9207073588134627e-05, "loss": 2.3896, "step": 3373 }, { "epoch": 0.5774430943008728, "grad_norm": 24.576251983642578, "learning_rate": 1.9212778094694807e-05, "loss": 2.2518, "step": 3374 }, { "epoch": 0.5776142392606538, "grad_norm": 5.2097649574279785, "learning_rate": 1.921848260125499e-05, "loss": 0.3953, "step": 3375 }, { "epoch": 0.5777853842204347, "grad_norm": 3.1124250888824463, "learning_rate": 1.9224187107815174e-05, "loss": 0.3687, "step": 3376 }, { "epoch": 0.5779565291802157, "grad_norm": 20.81354331970215, "learning_rate": 1.9229891614375354e-05, "loss": 2.0595, "step": 3377 }, { "epoch": 0.5781276741399966, "grad_norm": 29.21316909790039, "learning_rate": 1.9235596120935537e-05, "loss": 3.7875, "step": 3378 }, { "epoch": 0.5782988190997775, "grad_norm": 84.69393157958984, "learning_rate": 1.9241300627495724e-05, "loss": 3.0112, "step": 3379 }, { "epoch": 0.5784699640595584, "grad_norm": 1.8985782861709595, "learning_rate": 1.9247005134055907e-05, "loss": 0.3148, "step": 3380 }, { "epoch": 0.5786411090193394, "grad_norm": 10.058646202087402, "learning_rate": 1.9252709640616087e-05, "loss": 0.8459, "step": 3381 }, { "epoch": 0.5788122539791203, "grad_norm": 27.1168270111084, "learning_rate": 1.925841414717627e-05, "loss": 2.5347, "step": 3382 }, { "epoch": 0.5789833989389013, "grad_norm": 89.62450408935547, "learning_rate": 1.9264118653736454e-05, "loss": 3.7248, "step": 3383 }, { "epoch": 0.5791545438986822, "grad_norm": 4.5566558837890625, "learning_rate": 1.9269823160296634e-05, "loss": 0.6092, "step": 3384 }, { "epoch": 0.5793256888584631, "grad_norm": 30.642803192138672, "learning_rate": 1.9275527666856818e-05, "loss": 3.5006, "step": 3385 }, { "epoch": 0.579496833818244, "grad_norm": 27.308584213256836, "learning_rate": 1.9281232173417e-05, "loss": 3.4485, "step": 3386 }, { "epoch": 0.579667978778025, "grad_norm": 29.646587371826172, "learning_rate": 1.9286936679977184e-05, "loss": 3.0531, "step": 3387 }, { "epoch": 0.5798391237378059, "grad_norm": 14.223383903503418, "learning_rate": 1.9292641186537364e-05, "loss": 1.5835, "step": 3388 }, { "epoch": 0.5800102686975869, "grad_norm": 24.695066452026367, "learning_rate": 1.9298345693097548e-05, "loss": 2.368, "step": 3389 }, { "epoch": 0.5801814136573678, "grad_norm": 26.341815948486328, "learning_rate": 1.930405019965773e-05, "loss": 2.3547, "step": 3390 }, { "epoch": 0.5803525586171487, "grad_norm": 18.38678741455078, "learning_rate": 1.930975470621791e-05, "loss": 1.7797, "step": 3391 }, { "epoch": 0.5805237035769296, "grad_norm": 25.61127471923828, "learning_rate": 1.9315459212778094e-05, "loss": 2.638, "step": 3392 }, { "epoch": 0.5806948485367106, "grad_norm": 2.290560007095337, "learning_rate": 1.9321163719338278e-05, "loss": 0.3367, "step": 3393 }, { "epoch": 0.5808659934964915, "grad_norm": 11.412469863891602, "learning_rate": 1.932686822589846e-05, "loss": 1.1406, "step": 3394 }, { "epoch": 0.5810371384562725, "grad_norm": 8.998905181884766, "learning_rate": 1.933257273245864e-05, "loss": 0.6117, "step": 3395 }, { "epoch": 0.5812082834160534, "grad_norm": 7.52636194229126, "learning_rate": 1.9338277239018825e-05, "loss": 0.653, "step": 3396 }, { "epoch": 0.5813794283758343, "grad_norm": 27.57058334350586, "learning_rate": 1.9343981745579008e-05, "loss": 3.0084, "step": 3397 }, { "epoch": 0.5815505733356152, "grad_norm": 26.775415420532227, "learning_rate": 1.9349686252139188e-05, "loss": 2.9132, "step": 3398 }, { "epoch": 0.5817217182953962, "grad_norm": 3.1826353073120117, "learning_rate": 1.935539075869937e-05, "loss": 0.3669, "step": 3399 }, { "epoch": 0.5818928632551771, "grad_norm": 6.152859210968018, "learning_rate": 1.9361095265259555e-05, "loss": 0.6359, "step": 3400 }, { "epoch": 0.5820640082149581, "grad_norm": 1.7208553552627563, "learning_rate": 1.9366799771819738e-05, "loss": 0.3746, "step": 3401 }, { "epoch": 0.582235153174739, "grad_norm": 7.883406162261963, "learning_rate": 1.937250427837992e-05, "loss": 1.0997, "step": 3402 }, { "epoch": 0.5824062981345199, "grad_norm": 26.301164627075195, "learning_rate": 1.9378208784940105e-05, "loss": 2.4291, "step": 3403 }, { "epoch": 0.5825774430943008, "grad_norm": 23.660444259643555, "learning_rate": 1.9383913291500288e-05, "loss": 2.6292, "step": 3404 }, { "epoch": 0.5827485880540818, "grad_norm": 17.410369873046875, "learning_rate": 1.9389617798060468e-05, "loss": 1.4877, "step": 3405 }, { "epoch": 0.5829197330138628, "grad_norm": 31.716928482055664, "learning_rate": 1.939532230462065e-05, "loss": 3.778, "step": 3406 }, { "epoch": 0.5830908779736437, "grad_norm": 39.23788833618164, "learning_rate": 1.9401026811180835e-05, "loss": 3.4125, "step": 3407 }, { "epoch": 0.5832620229334247, "grad_norm": 21.296669006347656, "learning_rate": 1.9406731317741015e-05, "loss": 1.9439, "step": 3408 }, { "epoch": 0.5834331678932055, "grad_norm": 4.249104022979736, "learning_rate": 1.94124358243012e-05, "loss": 0.5641, "step": 3409 }, { "epoch": 0.5836043128529865, "grad_norm": 25.32843780517578, "learning_rate": 1.9418140330861382e-05, "loss": 2.1923, "step": 3410 }, { "epoch": 0.5837754578127674, "grad_norm": 31.81114387512207, "learning_rate": 1.9423844837421565e-05, "loss": 6.2289, "step": 3411 }, { "epoch": 0.5839466027725484, "grad_norm": 34.15937423706055, "learning_rate": 1.9429549343981745e-05, "loss": 7.0035, "step": 3412 }, { "epoch": 0.5841177477323293, "grad_norm": 27.947298049926758, "learning_rate": 1.943525385054193e-05, "loss": 3.2937, "step": 3413 }, { "epoch": 0.5842888926921103, "grad_norm": 13.201940536499023, "learning_rate": 1.9440958357102112e-05, "loss": 0.9029, "step": 3414 }, { "epoch": 0.5844600376518911, "grad_norm": 21.287315368652344, "learning_rate": 1.9446662863662292e-05, "loss": 2.1476, "step": 3415 }, { "epoch": 0.5846311826116721, "grad_norm": 27.151569366455078, "learning_rate": 1.9452367370222475e-05, "loss": 2.8889, "step": 3416 }, { "epoch": 0.584802327571453, "grad_norm": 25.92886734008789, "learning_rate": 1.945807187678266e-05, "loss": 2.43, "step": 3417 }, { "epoch": 0.584973472531234, "grad_norm": 16.41077423095703, "learning_rate": 1.9463776383342842e-05, "loss": 1.1515, "step": 3418 }, { "epoch": 0.5851446174910149, "grad_norm": 7.387080669403076, "learning_rate": 1.9469480889903022e-05, "loss": 1.0438, "step": 3419 }, { "epoch": 0.5853157624507959, "grad_norm": 28.30823516845703, "learning_rate": 1.9475185396463205e-05, "loss": 2.4817, "step": 3420 }, { "epoch": 0.5854869074105767, "grad_norm": 19.957653045654297, "learning_rate": 1.948088990302339e-05, "loss": 2.492, "step": 3421 }, { "epoch": 0.5856580523703577, "grad_norm": 26.708097457885742, "learning_rate": 1.948659440958357e-05, "loss": 2.9457, "step": 3422 }, { "epoch": 0.5858291973301386, "grad_norm": 6.408317565917969, "learning_rate": 1.9492298916143752e-05, "loss": 0.7295, "step": 3423 }, { "epoch": 0.5860003422899196, "grad_norm": 30.148130416870117, "learning_rate": 1.9498003422703935e-05, "loss": 3.3614, "step": 3424 }, { "epoch": 0.5861714872497005, "grad_norm": 22.77581787109375, "learning_rate": 1.9503707929264122e-05, "loss": 1.9865, "step": 3425 }, { "epoch": 0.5863426322094815, "grad_norm": 27.753477096557617, "learning_rate": 1.9509412435824302e-05, "loss": 2.899, "step": 3426 }, { "epoch": 0.5865137771692623, "grad_norm": 6.288846015930176, "learning_rate": 1.9515116942384486e-05, "loss": 0.646, "step": 3427 }, { "epoch": 0.5866849221290433, "grad_norm": 24.92253303527832, "learning_rate": 1.952082144894467e-05, "loss": 2.5784, "step": 3428 }, { "epoch": 0.5868560670888242, "grad_norm": 24.49477767944336, "learning_rate": 1.952652595550485e-05, "loss": 2.1704, "step": 3429 }, { "epoch": 0.5870272120486052, "grad_norm": 24.100597381591797, "learning_rate": 1.9532230462065032e-05, "loss": 2.5277, "step": 3430 }, { "epoch": 0.5871983570083861, "grad_norm": 21.0911922454834, "learning_rate": 1.9537934968625216e-05, "loss": 2.2242, "step": 3431 }, { "epoch": 0.5873695019681671, "grad_norm": 22.534944534301758, "learning_rate": 1.95436394751854e-05, "loss": 2.3912, "step": 3432 }, { "epoch": 0.5875406469279479, "grad_norm": 22.132417678833008, "learning_rate": 1.954934398174558e-05, "loss": 2.3557, "step": 3433 }, { "epoch": 0.5877117918877289, "grad_norm": 21.22612953186035, "learning_rate": 1.9555048488305762e-05, "loss": 1.8585, "step": 3434 }, { "epoch": 0.5878829368475098, "grad_norm": 87.90875244140625, "learning_rate": 1.9560752994865946e-05, "loss": 4.0792, "step": 3435 }, { "epoch": 0.5880540818072908, "grad_norm": 89.19034576416016, "learning_rate": 1.9566457501426126e-05, "loss": 4.43, "step": 3436 }, { "epoch": 0.5882252267670717, "grad_norm": 19.258451461791992, "learning_rate": 1.957216200798631e-05, "loss": 1.7874, "step": 3437 }, { "epoch": 0.5883963717268527, "grad_norm": 1.7522574663162231, "learning_rate": 1.9577866514546493e-05, "loss": 0.3112, "step": 3438 }, { "epoch": 0.5885675166866335, "grad_norm": 18.229957580566406, "learning_rate": 1.9583571021106676e-05, "loss": 2.766, "step": 3439 }, { "epoch": 0.5887386616464145, "grad_norm": 36.58788299560547, "learning_rate": 1.9589275527666856e-05, "loss": 5.2847, "step": 3440 }, { "epoch": 0.5889098066061954, "grad_norm": 23.946247100830078, "learning_rate": 1.959498003422704e-05, "loss": 2.3658, "step": 3441 }, { "epoch": 0.5890809515659764, "grad_norm": 29.713180541992188, "learning_rate": 1.9600684540787223e-05, "loss": 3.6696, "step": 3442 }, { "epoch": 0.5892520965257573, "grad_norm": 22.247447967529297, "learning_rate": 1.9606389047347403e-05, "loss": 2.6781, "step": 3443 }, { "epoch": 0.5894232414855383, "grad_norm": 23.726993560791016, "learning_rate": 1.9612093553907586e-05, "loss": 2.765, "step": 3444 }, { "epoch": 0.5895943864453191, "grad_norm": 39.94513702392578, "learning_rate": 1.961779806046777e-05, "loss": 7.0047, "step": 3445 }, { "epoch": 0.5897655314051001, "grad_norm": 24.248090744018555, "learning_rate": 1.962350256702795e-05, "loss": 2.7753, "step": 3446 }, { "epoch": 0.589936676364881, "grad_norm": 3.6631691455841064, "learning_rate": 1.9629207073588133e-05, "loss": 0.3529, "step": 3447 }, { "epoch": 0.590107821324662, "grad_norm": 25.42365264892578, "learning_rate": 1.963491158014832e-05, "loss": 2.1961, "step": 3448 }, { "epoch": 0.5902789662844429, "grad_norm": 25.308515548706055, "learning_rate": 1.9640616086708503e-05, "loss": 2.3387, "step": 3449 }, { "epoch": 0.5904501112442239, "grad_norm": 19.806636810302734, "learning_rate": 1.9646320593268683e-05, "loss": 2.0071, "step": 3450 }, { "epoch": 0.5906212562040047, "grad_norm": 17.552900314331055, "learning_rate": 1.9652025099828866e-05, "loss": 1.4304, "step": 3451 }, { "epoch": 0.5907924011637857, "grad_norm": 23.210519790649414, "learning_rate": 1.965772960638905e-05, "loss": 2.1445, "step": 3452 }, { "epoch": 0.5909635461235666, "grad_norm": 25.595361709594727, "learning_rate": 1.966343411294923e-05, "loss": 3.0135, "step": 3453 }, { "epoch": 0.5911346910833476, "grad_norm": 3.9893271923065186, "learning_rate": 1.9669138619509413e-05, "loss": 0.4081, "step": 3454 }, { "epoch": 0.5913058360431286, "grad_norm": 2.2912561893463135, "learning_rate": 1.9674843126069596e-05, "loss": 0.3066, "step": 3455 }, { "epoch": 0.5914769810029095, "grad_norm": 23.45972442626953, "learning_rate": 1.968054763262978e-05, "loss": 2.4938, "step": 3456 }, { "epoch": 0.5916481259626905, "grad_norm": 24.78557777404785, "learning_rate": 1.968625213918996e-05, "loss": 2.2888, "step": 3457 }, { "epoch": 0.5918192709224713, "grad_norm": 56.51396560668945, "learning_rate": 1.9691956645750143e-05, "loss": 3.3839, "step": 3458 }, { "epoch": 0.5919904158822523, "grad_norm": 15.350875854492188, "learning_rate": 1.9697661152310327e-05, "loss": 1.0531, "step": 3459 }, { "epoch": 0.5921615608420332, "grad_norm": 73.21929931640625, "learning_rate": 1.9703365658870507e-05, "loss": 3.5199, "step": 3460 }, { "epoch": 0.5923327058018142, "grad_norm": 29.828990936279297, "learning_rate": 1.970907016543069e-05, "loss": 3.8054, "step": 3461 }, { "epoch": 0.5925038507615951, "grad_norm": 18.3194637298584, "learning_rate": 1.9714774671990873e-05, "loss": 1.7246, "step": 3462 }, { "epoch": 0.592674995721376, "grad_norm": 29.311429977416992, "learning_rate": 1.9720479178551057e-05, "loss": 3.2428, "step": 3463 }, { "epoch": 0.5928461406811569, "grad_norm": 1.9222893714904785, "learning_rate": 1.9726183685111237e-05, "loss": 0.3078, "step": 3464 }, { "epoch": 0.5930172856409379, "grad_norm": 6.286295413970947, "learning_rate": 1.973188819167142e-05, "loss": 0.6446, "step": 3465 }, { "epoch": 0.5931884306007188, "grad_norm": 29.647480010986328, "learning_rate": 1.9737592698231603e-05, "loss": 2.9621, "step": 3466 }, { "epoch": 0.5933595755604998, "grad_norm": 26.92269515991211, "learning_rate": 1.9743297204791783e-05, "loss": 2.5933, "step": 3467 }, { "epoch": 0.5935307205202807, "grad_norm": 50.6396484375, "learning_rate": 1.9749001711351967e-05, "loss": 6.8098, "step": 3468 }, { "epoch": 0.5937018654800617, "grad_norm": 25.224733352661133, "learning_rate": 1.975470621791215e-05, "loss": 2.4431, "step": 3469 }, { "epoch": 0.5938730104398425, "grad_norm": 17.845563888549805, "learning_rate": 1.9760410724472334e-05, "loss": 1.7165, "step": 3470 }, { "epoch": 0.5940441553996235, "grad_norm": 5.634066104888916, "learning_rate": 1.9766115231032517e-05, "loss": 0.5882, "step": 3471 }, { "epoch": 0.5942153003594044, "grad_norm": 34.622920989990234, "learning_rate": 1.97718197375927e-05, "loss": 3.5714, "step": 3472 }, { "epoch": 0.5943864453191854, "grad_norm": 63.40961837768555, "learning_rate": 1.9777524244152884e-05, "loss": 2.734, "step": 3473 }, { "epoch": 0.5945575902789663, "grad_norm": 29.88731575012207, "learning_rate": 1.9783228750713064e-05, "loss": 3.8436, "step": 3474 }, { "epoch": 0.5947287352387473, "grad_norm": 27.8708553314209, "learning_rate": 1.9788933257273247e-05, "loss": 2.3388, "step": 3475 }, { "epoch": 0.5948998801985281, "grad_norm": 25.777362823486328, "learning_rate": 1.979463776383343e-05, "loss": 2.3517, "step": 3476 }, { "epoch": 0.5950710251583091, "grad_norm": 14.805953979492188, "learning_rate": 1.980034227039361e-05, "loss": 1.5038, "step": 3477 }, { "epoch": 0.59524217011809, "grad_norm": 19.073440551757812, "learning_rate": 1.9806046776953794e-05, "loss": 1.8955, "step": 3478 }, { "epoch": 0.595413315077871, "grad_norm": 21.738014221191406, "learning_rate": 1.9811751283513977e-05, "loss": 2.3869, "step": 3479 }, { "epoch": 0.5955844600376519, "grad_norm": 2.9714324474334717, "learning_rate": 1.981745579007416e-05, "loss": 0.3378, "step": 3480 }, { "epoch": 0.5957556049974329, "grad_norm": 8.826178550720215, "learning_rate": 1.982316029663434e-05, "loss": 0.9817, "step": 3481 }, { "epoch": 0.5959267499572137, "grad_norm": 16.54644012451172, "learning_rate": 1.9828864803194524e-05, "loss": 1.2827, "step": 3482 }, { "epoch": 0.5960978949169947, "grad_norm": 9.384221076965332, "learning_rate": 1.9834569309754707e-05, "loss": 1.3316, "step": 3483 }, { "epoch": 0.5962690398767756, "grad_norm": 25.255199432373047, "learning_rate": 1.9840273816314887e-05, "loss": 2.1236, "step": 3484 }, { "epoch": 0.5964401848365566, "grad_norm": 27.23832893371582, "learning_rate": 1.984597832287507e-05, "loss": 2.8921, "step": 3485 }, { "epoch": 0.5966113297963375, "grad_norm": 31.743816375732422, "learning_rate": 1.9851682829435254e-05, "loss": 4.1041, "step": 3486 }, { "epoch": 0.5967824747561185, "grad_norm": 23.10817527770996, "learning_rate": 1.9857387335995437e-05, "loss": 1.973, "step": 3487 }, { "epoch": 0.5969536197158993, "grad_norm": 40.163639068603516, "learning_rate": 1.9863091842555617e-05, "loss": 6.3457, "step": 3488 }, { "epoch": 0.5971247646756803, "grad_norm": 29.302976608276367, "learning_rate": 1.98687963491158e-05, "loss": 2.8273, "step": 3489 }, { "epoch": 0.5972959096354612, "grad_norm": 29.635021209716797, "learning_rate": 1.9874500855675984e-05, "loss": 3.671, "step": 3490 }, { "epoch": 0.5974670545952422, "grad_norm": 21.227108001708984, "learning_rate": 1.9880205362236164e-05, "loss": 2.1672, "step": 3491 }, { "epoch": 0.5976381995550231, "grad_norm": 30.448522567749023, "learning_rate": 1.9885909868796348e-05, "loss": 3.0936, "step": 3492 }, { "epoch": 0.597809344514804, "grad_norm": 27.133663177490234, "learning_rate": 1.9891614375356534e-05, "loss": 2.4887, "step": 3493 }, { "epoch": 0.5979804894745849, "grad_norm": 39.466121673583984, "learning_rate": 1.9897318881916718e-05, "loss": 4.8888, "step": 3494 }, { "epoch": 0.5981516344343659, "grad_norm": 39.85908889770508, "learning_rate": 1.9903023388476898e-05, "loss": 6.6469, "step": 3495 }, { "epoch": 0.5983227793941468, "grad_norm": 19.293907165527344, "learning_rate": 1.990872789503708e-05, "loss": 2.0085, "step": 3496 }, { "epoch": 0.5984939243539278, "grad_norm": 30.540531158447266, "learning_rate": 1.9914432401597265e-05, "loss": 2.8524, "step": 3497 }, { "epoch": 0.5986650693137087, "grad_norm": 2.173297882080078, "learning_rate": 1.9920136908157444e-05, "loss": 0.4671, "step": 3498 }, { "epoch": 0.5988362142734897, "grad_norm": 23.616220474243164, "learning_rate": 1.9925841414717628e-05, "loss": 2.1941, "step": 3499 }, { "epoch": 0.5990073592332705, "grad_norm": 10.88476276397705, "learning_rate": 1.993154592127781e-05, "loss": 0.9849, "step": 3500 }, { "epoch": 0.5991785041930515, "grad_norm": 35.73077392578125, "learning_rate": 1.9937250427837995e-05, "loss": 3.3526, "step": 3501 }, { "epoch": 0.5993496491528324, "grad_norm": 16.617977142333984, "learning_rate": 1.9942954934398175e-05, "loss": 1.305, "step": 3502 }, { "epoch": 0.5995207941126134, "grad_norm": 18.637554168701172, "learning_rate": 1.9948659440958358e-05, "loss": 1.7833, "step": 3503 }, { "epoch": 0.5996919390723943, "grad_norm": 22.126482009887695, "learning_rate": 1.995436394751854e-05, "loss": 1.8701, "step": 3504 }, { "epoch": 0.5998630840321753, "grad_norm": 19.62862777709961, "learning_rate": 1.996006845407872e-05, "loss": 1.9236, "step": 3505 }, { "epoch": 0.6000342289919562, "grad_norm": 27.936777114868164, "learning_rate": 1.9965772960638905e-05, "loss": 2.4178, "step": 3506 }, { "epoch": 0.6002053739517371, "grad_norm": 19.932191848754883, "learning_rate": 1.9971477467199088e-05, "loss": 2.1511, "step": 3507 }, { "epoch": 0.6003765189115181, "grad_norm": 25.053146362304688, "learning_rate": 1.9977181973759268e-05, "loss": 2.4803, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_nli-pairs_loss": 2.4920291900634766, "eval_nli-pairs_runtime": 4.6698, "eval_nli-pairs_samples_per_second": 42.828, "eval_nli-pairs_steps_per_second": 1.499, "eval_sts-test_pearson_cosine": 0.7445126100709293, "eval_sts-test_pearson_dot": 0.6267026529286148, "eval_sts-test_pearson_euclidean": 0.7432252885023554, "eval_sts-test_pearson_manhattan": 0.7498148030136934, "eval_sts-test_pearson_max": 0.7498148030136934, "eval_sts-test_spearman_cosine": 0.7257459075346154, "eval_sts-test_spearman_dot": 0.6080996929747863, "eval_sts-test_spearman_euclidean": 0.7251182727779897, "eval_sts-test_spearman_manhattan": 0.7328124096687271, "eval_sts-test_spearman_max": 0.7328124096687271, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_vitaminc-pairs_loss": 1.5536390542984009, "eval_vitaminc-pairs_runtime": 2.8901, "eval_vitaminc-pairs_samples_per_second": 69.202, "eval_vitaminc-pairs_steps_per_second": 2.422, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_qnli-contrastive_loss": 3.72904109954834, "eval_qnli-contrastive_runtime": 0.7044, "eval_qnli-contrastive_samples_per_second": 283.946, "eval_qnli-contrastive_steps_per_second": 9.938, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_scitail-pairs-qa_loss": 0.28478389978408813, "eval_scitail-pairs-qa_runtime": 1.9184, "eval_scitail-pairs-qa_samples_per_second": 104.251, "eval_scitail-pairs-qa_steps_per_second": 3.649, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_scitail-pairs-pos_loss": 1.0560411214828491, "eval_scitail-pairs-pos_runtime": 2.9426, "eval_scitail-pairs-pos_samples_per_second": 67.966, "eval_scitail-pairs-pos_steps_per_second": 2.379, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_xsum-pairs_loss": 1.246793508529663, "eval_xsum-pairs_runtime": 2.6747, "eval_xsum-pairs_samples_per_second": 65.429, "eval_xsum-pairs_steps_per_second": 2.243, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_compression-pairs_loss": 0.5663356184959412, "eval_compression-pairs_runtime": 0.5441, "eval_compression-pairs_samples_per_second": 367.559, "eval_compression-pairs_steps_per_second": 12.865, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_sciq_pairs_loss": 5.566298484802246, "eval_sciq_pairs_runtime": 9.5047, "eval_sciq_pairs_samples_per_second": 21.042, "eval_sciq_pairs_steps_per_second": 0.736, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_qasc_pairs_loss": 6.534984588623047, "eval_qasc_pairs_runtime": 2.8892, "eval_qasc_pairs_samples_per_second": 69.224, "eval_qasc_pairs_steps_per_second": 2.423, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_openbookqa_pairs_loss": 3.5413291454315186, "eval_openbookqa_pairs_runtime": 0.7338, "eval_openbookqa_pairs_samples_per_second": 94.027, "eval_openbookqa_pairs_steps_per_second": 4.088, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_msmarco_pairs_loss": 2.2276792526245117, "eval_msmarco_pairs_runtime": 4.1013, "eval_msmarco_pairs_samples_per_second": 48.765, "eval_msmarco_pairs_steps_per_second": 1.707, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_nq_pairs_loss": 2.868544340133667, "eval_nq_pairs_runtime": 8.7773, "eval_nq_pairs_samples_per_second": 22.786, "eval_nq_pairs_steps_per_second": 0.798, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_trivia_pairs_loss": 2.8433399200439453, "eval_trivia_pairs_runtime": 12.7884, "eval_trivia_pairs_samples_per_second": 15.639, "eval_trivia_pairs_steps_per_second": 0.547, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_quora_pairs_loss": 0.5191998481750488, "eval_quora_pairs_runtime": 1.6069, "eval_quora_pairs_samples_per_second": 124.459, "eval_quora_pairs_steps_per_second": 4.356, "step": 3508 }, { "epoch": 0.6003765189115181, "eval_gooaq_pairs_loss": 1.7708619832992554, "eval_gooaq_pairs_runtime": 2.6531, "eval_gooaq_pairs_samples_per_second": 75.384, "eval_gooaq_pairs_steps_per_second": 2.638, "step": 3508 }, { "epoch": 0.600547663871299, "grad_norm": 25.026371002197266, "learning_rate": 1.998288648031945e-05, "loss": 2.5669, "step": 3509 }, { "epoch": 0.60071880883108, "grad_norm": 27.141857147216797, "learning_rate": 1.9988590986879635e-05, "loss": 3.5018, "step": 3510 }, { "epoch": 0.6008899537908609, "grad_norm": 14.711791038513184, "learning_rate": 1.9994295493439818e-05, "loss": 1.5597, "step": 3511 }, { "epoch": 0.6010610987506418, "grad_norm": 21.489046096801758, "learning_rate": 1.9999999999999998e-05, "loss": 1.9611, "step": 3512 }, { "epoch": 0.6012322437104227, "grad_norm": 25.058837890625, "learning_rate": 2.000570450656018e-05, "loss": 2.6108, "step": 3513 }, { "epoch": 0.6014033886702037, "grad_norm": 35.174922943115234, "learning_rate": 2.0011409013120365e-05, "loss": 4.3163, "step": 3514 }, { "epoch": 0.6015745336299846, "grad_norm": 24.40542984008789, "learning_rate": 2.0017113519680545e-05, "loss": 2.2731, "step": 3515 }, { "epoch": 0.6017456785897656, "grad_norm": 25.144128799438477, "learning_rate": 2.0022818026240732e-05, "loss": 2.9364, "step": 3516 }, { "epoch": 0.6019168235495465, "grad_norm": 24.48015594482422, "learning_rate": 2.0028522532800915e-05, "loss": 2.9065, "step": 3517 }, { "epoch": 0.6020879685093274, "grad_norm": 3.758195161819458, "learning_rate": 2.00342270393611e-05, "loss": 0.3902, "step": 3518 }, { "epoch": 0.6022591134691083, "grad_norm": 24.701047897338867, "learning_rate": 2.003993154592128e-05, "loss": 2.7503, "step": 3519 }, { "epoch": 0.6024302584288893, "grad_norm": 35.492271423339844, "learning_rate": 2.0045636052481462e-05, "loss": 3.48, "step": 3520 }, { "epoch": 0.6026014033886702, "grad_norm": 23.064668655395508, "learning_rate": 2.0051340559041645e-05, "loss": 2.0561, "step": 3521 }, { "epoch": 0.6027725483484512, "grad_norm": 20.811628341674805, "learning_rate": 2.0057045065601825e-05, "loss": 2.1345, "step": 3522 }, { "epoch": 0.602943693308232, "grad_norm": 29.092313766479492, "learning_rate": 2.006274957216201e-05, "loss": 3.3692, "step": 3523 }, { "epoch": 0.603114838268013, "grad_norm": 4.536581993103027, "learning_rate": 2.0068454078722192e-05, "loss": 0.3497, "step": 3524 }, { "epoch": 0.6032859832277939, "grad_norm": 30.311725616455078, "learning_rate": 2.0074158585282375e-05, "loss": 3.1903, "step": 3525 }, { "epoch": 0.6034571281875749, "grad_norm": 22.41107749938965, "learning_rate": 2.0079863091842555e-05, "loss": 1.8769, "step": 3526 }, { "epoch": 0.6036282731473558, "grad_norm": 110.15360260009766, "learning_rate": 2.008556759840274e-05, "loss": 8.2043, "step": 3527 }, { "epoch": 0.6037994181071368, "grad_norm": 22.728544235229492, "learning_rate": 2.0091272104962922e-05, "loss": 2.2539, "step": 3528 }, { "epoch": 0.6039705630669177, "grad_norm": 20.25188446044922, "learning_rate": 2.0096976611523102e-05, "loss": 2.1306, "step": 3529 }, { "epoch": 0.6041417080266986, "grad_norm": 25.181900024414062, "learning_rate": 2.0102681118083286e-05, "loss": 2.7009, "step": 3530 }, { "epoch": 0.6043128529864795, "grad_norm": 20.236555099487305, "learning_rate": 2.010838562464347e-05, "loss": 2.0943, "step": 3531 }, { "epoch": 0.6044839979462605, "grad_norm": 28.511568069458008, "learning_rate": 2.0114090131203652e-05, "loss": 3.4193, "step": 3532 }, { "epoch": 0.6046551429060414, "grad_norm": 20.58401870727539, "learning_rate": 2.0119794637763832e-05, "loss": 1.8277, "step": 3533 }, { "epoch": 0.6048262878658224, "grad_norm": 28.2340087890625, "learning_rate": 2.0125499144324016e-05, "loss": 3.125, "step": 3534 }, { "epoch": 0.6049974328256033, "grad_norm": 29.512651443481445, "learning_rate": 2.01312036508842e-05, "loss": 2.8504, "step": 3535 }, { "epoch": 0.6051685777853842, "grad_norm": 20.814037322998047, "learning_rate": 2.013690815744438e-05, "loss": 1.988, "step": 3536 }, { "epoch": 0.6053397227451651, "grad_norm": 31.458534240722656, "learning_rate": 2.0142612664004562e-05, "loss": 3.4279, "step": 3537 }, { "epoch": 0.6055108677049461, "grad_norm": 6.904580116271973, "learning_rate": 2.0148317170564746e-05, "loss": 0.596, "step": 3538 }, { "epoch": 0.605682012664727, "grad_norm": 4.964241981506348, "learning_rate": 2.015402167712493e-05, "loss": 0.5453, "step": 3539 }, { "epoch": 0.605853157624508, "grad_norm": 13.879029273986816, "learning_rate": 2.0159726183685113e-05, "loss": 0.8886, "step": 3540 }, { "epoch": 0.6060243025842889, "grad_norm": 5.926167964935303, "learning_rate": 2.0165430690245296e-05, "loss": 0.4682, "step": 3541 }, { "epoch": 0.6061954475440698, "grad_norm": 37.99885177612305, "learning_rate": 2.017113519680548e-05, "loss": 2.6702, "step": 3542 }, { "epoch": 0.6063665925038507, "grad_norm": 1.8707212209701538, "learning_rate": 2.017683970336566e-05, "loss": 0.3064, "step": 3543 }, { "epoch": 0.6065377374636317, "grad_norm": 19.801597595214844, "learning_rate": 2.0182544209925843e-05, "loss": 1.9881, "step": 3544 }, { "epoch": 0.6067088824234126, "grad_norm": 6.91411828994751, "learning_rate": 2.0188248716486026e-05, "loss": 0.6814, "step": 3545 }, { "epoch": 0.6068800273831936, "grad_norm": 24.066730499267578, "learning_rate": 2.0193953223046206e-05, "loss": 2.9773, "step": 3546 }, { "epoch": 0.6070511723429745, "grad_norm": 26.214096069335938, "learning_rate": 2.019965772960639e-05, "loss": 2.5413, "step": 3547 }, { "epoch": 0.6072223173027554, "grad_norm": 26.099639892578125, "learning_rate": 2.0205362236166573e-05, "loss": 3.1048, "step": 3548 }, { "epoch": 0.6073934622625363, "grad_norm": 5.310629844665527, "learning_rate": 2.0211066742726756e-05, "loss": 0.4841, "step": 3549 }, { "epoch": 0.6075646072223173, "grad_norm": 31.81784439086914, "learning_rate": 2.0216771249286936e-05, "loss": 2.6404, "step": 3550 }, { "epoch": 0.6077357521820982, "grad_norm": 20.07958221435547, "learning_rate": 2.022247575584712e-05, "loss": 2.0417, "step": 3551 }, { "epoch": 0.6079068971418792, "grad_norm": 23.589279174804688, "learning_rate": 2.0228180262407303e-05, "loss": 1.8377, "step": 3552 }, { "epoch": 0.60807804210166, "grad_norm": 7.810224533081055, "learning_rate": 2.0233884768967483e-05, "loss": 0.6155, "step": 3553 }, { "epoch": 0.608249187061441, "grad_norm": 25.48611068725586, "learning_rate": 2.0239589275527666e-05, "loss": 2.6452, "step": 3554 }, { "epoch": 0.6084203320212219, "grad_norm": 28.041179656982422, "learning_rate": 2.024529378208785e-05, "loss": 2.7049, "step": 3555 }, { "epoch": 0.6085914769810029, "grad_norm": 29.171598434448242, "learning_rate": 2.0250998288648033e-05, "loss": 2.9279, "step": 3556 }, { "epoch": 0.6087626219407839, "grad_norm": 31.75198745727539, "learning_rate": 2.0256702795208213e-05, "loss": 3.5333, "step": 3557 }, { "epoch": 0.6089337669005648, "grad_norm": 27.840137481689453, "learning_rate": 2.0262407301768396e-05, "loss": 2.582, "step": 3558 }, { "epoch": 0.6091049118603458, "grad_norm": 30.64188575744629, "learning_rate": 2.026811180832858e-05, "loss": 3.4939, "step": 3559 }, { "epoch": 0.6092760568201266, "grad_norm": 5.60610294342041, "learning_rate": 2.027381631488876e-05, "loss": 0.5944, "step": 3560 }, { "epoch": 0.6094472017799076, "grad_norm": 6.2669148445129395, "learning_rate": 2.0279520821448943e-05, "loss": 0.6124, "step": 3561 }, { "epoch": 0.6096183467396885, "grad_norm": 24.084915161132812, "learning_rate": 2.028522532800913e-05, "loss": 2.3808, "step": 3562 }, { "epoch": 0.6097894916994695, "grad_norm": 25.230403900146484, "learning_rate": 2.0290929834569313e-05, "loss": 2.2763, "step": 3563 }, { "epoch": 0.6099606366592504, "grad_norm": 3.4166452884674072, "learning_rate": 2.0296634341129493e-05, "loss": 0.3625, "step": 3564 }, { "epoch": 0.6101317816190314, "grad_norm": 43.064022064208984, "learning_rate": 2.0302338847689677e-05, "loss": 7.0079, "step": 3565 }, { "epoch": 0.6103029265788122, "grad_norm": 126.53868103027344, "learning_rate": 2.030804335424986e-05, "loss": 8.3917, "step": 3566 }, { "epoch": 0.6104740715385932, "grad_norm": 131.35928344726562, "learning_rate": 2.031374786081004e-05, "loss": 3.4001, "step": 3567 }, { "epoch": 0.6106452164983741, "grad_norm": 25.176708221435547, "learning_rate": 2.0319452367370223e-05, "loss": 2.6828, "step": 3568 }, { "epoch": 0.6108163614581551, "grad_norm": 10.270312309265137, "learning_rate": 2.0325156873930407e-05, "loss": 0.6769, "step": 3569 }, { "epoch": 0.610987506417936, "grad_norm": 134.7881317138672, "learning_rate": 2.033086138049059e-05, "loss": 9.4478, "step": 3570 }, { "epoch": 0.611158651377717, "grad_norm": 29.06338119506836, "learning_rate": 2.033656588705077e-05, "loss": 3.239, "step": 3571 }, { "epoch": 0.6113297963374978, "grad_norm": 28.089710235595703, "learning_rate": 2.0342270393610954e-05, "loss": 3.3457, "step": 3572 }, { "epoch": 0.6115009412972788, "grad_norm": 30.67607879638672, "learning_rate": 2.0347974900171137e-05, "loss": 3.2694, "step": 3573 }, { "epoch": 0.6116720862570597, "grad_norm": 22.92820167541504, "learning_rate": 2.0353679406731317e-05, "loss": 2.2235, "step": 3574 }, { "epoch": 0.6118432312168407, "grad_norm": 34.413116455078125, "learning_rate": 2.03593839132915e-05, "loss": 4.2986, "step": 3575 }, { "epoch": 0.6120143761766216, "grad_norm": 31.22587013244629, "learning_rate": 2.0365088419851684e-05, "loss": 3.0481, "step": 3576 }, { "epoch": 0.6121855211364026, "grad_norm": 25.429521560668945, "learning_rate": 2.0370792926411864e-05, "loss": 2.229, "step": 3577 }, { "epoch": 0.6123566660961834, "grad_norm": 11.0814208984375, "learning_rate": 2.0376497432972047e-05, "loss": 0.8888, "step": 3578 }, { "epoch": 0.6125278110559644, "grad_norm": 7.115586757659912, "learning_rate": 2.038220193953223e-05, "loss": 0.7247, "step": 3579 }, { "epoch": 0.6126989560157453, "grad_norm": 23.84605598449707, "learning_rate": 2.0387906446092414e-05, "loss": 2.5407, "step": 3580 }, { "epoch": 0.6128701009755263, "grad_norm": 29.189983367919922, "learning_rate": 2.0393610952652594e-05, "loss": 2.9275, "step": 3581 }, { "epoch": 0.6130412459353072, "grad_norm": 13.353384971618652, "learning_rate": 2.0399315459212777e-05, "loss": 1.2951, "step": 3582 }, { "epoch": 0.6132123908950882, "grad_norm": 19.755294799804688, "learning_rate": 2.040501996577296e-05, "loss": 1.69, "step": 3583 }, { "epoch": 0.613383535854869, "grad_norm": 42.32930374145508, "learning_rate": 2.041072447233314e-05, "loss": 6.7412, "step": 3584 }, { "epoch": 0.61355468081465, "grad_norm": 28.144733428955078, "learning_rate": 2.0416428978893327e-05, "loss": 3.0857, "step": 3585 }, { "epoch": 0.6137258257744309, "grad_norm": 9.064329147338867, "learning_rate": 2.042213348545351e-05, "loss": 0.6073, "step": 3586 }, { "epoch": 0.6138969707342119, "grad_norm": 13.769346237182617, "learning_rate": 2.0427837992013694e-05, "loss": 0.9597, "step": 3587 }, { "epoch": 0.6140681156939928, "grad_norm": 26.750154495239258, "learning_rate": 2.0433542498573874e-05, "loss": 2.5717, "step": 3588 }, { "epoch": 0.6142392606537738, "grad_norm": 21.833545684814453, "learning_rate": 2.0439247005134057e-05, "loss": 2.2722, "step": 3589 }, { "epoch": 0.6144104056135546, "grad_norm": 20.934206008911133, "learning_rate": 2.044495151169424e-05, "loss": 1.8977, "step": 3590 }, { "epoch": 0.6145815505733356, "grad_norm": 20.740619659423828, "learning_rate": 2.045065601825442e-05, "loss": 2.1618, "step": 3591 }, { "epoch": 0.6147526955331165, "grad_norm": 18.453035354614258, "learning_rate": 2.0456360524814604e-05, "loss": 1.5326, "step": 3592 }, { "epoch": 0.6149238404928975, "grad_norm": 21.29932403564453, "learning_rate": 2.0462065031374788e-05, "loss": 1.8663, "step": 3593 }, { "epoch": 0.6150949854526784, "grad_norm": 43.378116607666016, "learning_rate": 2.046776953793497e-05, "loss": 2.6436, "step": 3594 }, { "epoch": 0.6152661304124594, "grad_norm": 30.3048038482666, "learning_rate": 2.047347404449515e-05, "loss": 2.4363, "step": 3595 }, { "epoch": 0.6154372753722402, "grad_norm": 7.601318836212158, "learning_rate": 2.0479178551055334e-05, "loss": 0.5412, "step": 3596 }, { "epoch": 0.6156084203320212, "grad_norm": 22.385950088500977, "learning_rate": 2.0484883057615518e-05, "loss": 2.1345, "step": 3597 }, { "epoch": 0.6157795652918021, "grad_norm": 21.425384521484375, "learning_rate": 2.0490587564175698e-05, "loss": 2.3012, "step": 3598 }, { "epoch": 0.6159507102515831, "grad_norm": 35.983375549316406, "learning_rate": 2.049629207073588e-05, "loss": 3.4264, "step": 3599 }, { "epoch": 0.616121855211364, "grad_norm": 21.63048553466797, "learning_rate": 2.0501996577296064e-05, "loss": 1.8898, "step": 3600 }, { "epoch": 0.616293000171145, "grad_norm": 22.900203704833984, "learning_rate": 2.0507701083856248e-05, "loss": 2.0752, "step": 3601 }, { "epoch": 0.6164641451309258, "grad_norm": 36.20056915283203, "learning_rate": 2.0513405590416428e-05, "loss": 3.6793, "step": 3602 }, { "epoch": 0.6166352900907068, "grad_norm": 65.48631286621094, "learning_rate": 2.051911009697661e-05, "loss": 3.0171, "step": 3603 }, { "epoch": 0.6168064350504877, "grad_norm": 28.85053062438965, "learning_rate": 2.0524814603536795e-05, "loss": 2.9372, "step": 3604 }, { "epoch": 0.6169775800102687, "grad_norm": 22.775283813476562, "learning_rate": 2.0530519110096975e-05, "loss": 2.0222, "step": 3605 }, { "epoch": 0.6171487249700496, "grad_norm": 20.72616195678711, "learning_rate": 2.0536223616657158e-05, "loss": 1.9483, "step": 3606 }, { "epoch": 0.6173198699298306, "grad_norm": 82.96855926513672, "learning_rate": 2.054192812321734e-05, "loss": 3.4477, "step": 3607 }, { "epoch": 0.6174910148896116, "grad_norm": 31.961795806884766, "learning_rate": 2.0547632629777525e-05, "loss": 3.855, "step": 3608 }, { "epoch": 0.6176621598493924, "grad_norm": 8.887676239013672, "learning_rate": 2.0553337136337708e-05, "loss": 0.9918, "step": 3609 }, { "epoch": 0.6178333048091734, "grad_norm": 28.133888244628906, "learning_rate": 2.055904164289789e-05, "loss": 3.5373, "step": 3610 }, { "epoch": 0.6180044497689543, "grad_norm": 15.302783966064453, "learning_rate": 2.0564746149458075e-05, "loss": 1.4358, "step": 3611 }, { "epoch": 0.6181755947287353, "grad_norm": 1.7312853336334229, "learning_rate": 2.0570450656018255e-05, "loss": 0.3083, "step": 3612 }, { "epoch": 0.6183467396885162, "grad_norm": 57.10562515258789, "learning_rate": 2.0576155162578438e-05, "loss": 2.6412, "step": 3613 }, { "epoch": 0.6185178846482972, "grad_norm": 23.632875442504883, "learning_rate": 2.058185966913862e-05, "loss": 2.165, "step": 3614 }, { "epoch": 0.618689029608078, "grad_norm": 12.391186714172363, "learning_rate": 2.05875641756988e-05, "loss": 0.8518, "step": 3615 }, { "epoch": 0.618860174567859, "grad_norm": 33.55079650878906, "learning_rate": 2.0593268682258985e-05, "loss": 6.3741, "step": 3616 }, { "epoch": 0.6190313195276399, "grad_norm": 30.267724990844727, "learning_rate": 2.0598973188819168e-05, "loss": 3.5029, "step": 3617 }, { "epoch": 0.6192024644874209, "grad_norm": 23.680438995361328, "learning_rate": 2.060467769537935e-05, "loss": 2.4026, "step": 3618 }, { "epoch": 0.6193736094472018, "grad_norm": 24.904333114624023, "learning_rate": 2.061038220193953e-05, "loss": 2.6435, "step": 3619 }, { "epoch": 0.6195447544069828, "grad_norm": 16.03217124938965, "learning_rate": 2.0616086708499715e-05, "loss": 1.515, "step": 3620 }, { "epoch": 0.6197158993667636, "grad_norm": 20.824888229370117, "learning_rate": 2.06217912150599e-05, "loss": 1.8808, "step": 3621 }, { "epoch": 0.6198870443265446, "grad_norm": 18.110668182373047, "learning_rate": 2.062749572162008e-05, "loss": 1.6689, "step": 3622 }, { "epoch": 0.6200581892863255, "grad_norm": 21.38336753845215, "learning_rate": 2.0633200228180262e-05, "loss": 2.0968, "step": 3623 }, { "epoch": 0.6202293342461065, "grad_norm": 19.99571418762207, "learning_rate": 2.0638904734740445e-05, "loss": 1.7209, "step": 3624 }, { "epoch": 0.6204004792058874, "grad_norm": 26.232772827148438, "learning_rate": 2.064460924130063e-05, "loss": 2.5226, "step": 3625 }, { "epoch": 0.6205716241656684, "grad_norm": 2.9006407260894775, "learning_rate": 2.065031374786081e-05, "loss": 0.3453, "step": 3626 }, { "epoch": 0.6207427691254492, "grad_norm": 1.3548880815505981, "learning_rate": 2.0656018254420992e-05, "loss": 0.318, "step": 3627 }, { "epoch": 0.6209139140852302, "grad_norm": 28.67076301574707, "learning_rate": 2.0661722760981175e-05, "loss": 2.7404, "step": 3628 }, { "epoch": 0.6210850590450111, "grad_norm": 19.108945846557617, "learning_rate": 2.0667427267541355e-05, "loss": 1.9497, "step": 3629 }, { "epoch": 0.6212562040047921, "grad_norm": 19.495290756225586, "learning_rate": 2.067313177410154e-05, "loss": 1.7563, "step": 3630 }, { "epoch": 0.621427348964573, "grad_norm": 9.806105613708496, "learning_rate": 2.0678836280661725e-05, "loss": 0.7359, "step": 3631 }, { "epoch": 0.621598493924354, "grad_norm": 21.068401336669922, "learning_rate": 2.068454078722191e-05, "loss": 1.8909, "step": 3632 }, { "epoch": 0.6217696388841348, "grad_norm": 24.633346557617188, "learning_rate": 2.069024529378209e-05, "loss": 2.1693, "step": 3633 }, { "epoch": 0.6219407838439158, "grad_norm": 20.427921295166016, "learning_rate": 2.0695949800342272e-05, "loss": 1.9249, "step": 3634 }, { "epoch": 0.6221119288036967, "grad_norm": 31.32949447631836, "learning_rate": 2.0701654306902456e-05, "loss": 3.3678, "step": 3635 }, { "epoch": 0.6222830737634777, "grad_norm": 1.1559346914291382, "learning_rate": 2.0707358813462636e-05, "loss": 0.2955, "step": 3636 }, { "epoch": 0.6224542187232586, "grad_norm": 31.57821273803711, "learning_rate": 2.071306332002282e-05, "loss": 3.6773, "step": 3637 }, { "epoch": 0.6226253636830396, "grad_norm": 54.79661560058594, "learning_rate": 2.0718767826583002e-05, "loss": 2.7538, "step": 3638 }, { "epoch": 0.6227965086428204, "grad_norm": 22.394084930419922, "learning_rate": 2.0724472333143186e-05, "loss": 1.9239, "step": 3639 }, { "epoch": 0.6229676536026014, "grad_norm": 27.102100372314453, "learning_rate": 2.0730176839703366e-05, "loss": 2.4047, "step": 3640 }, { "epoch": 0.6231387985623823, "grad_norm": 10.830344200134277, "learning_rate": 2.073588134626355e-05, "loss": 0.9573, "step": 3641 }, { "epoch": 0.6233099435221633, "grad_norm": 36.204898834228516, "learning_rate": 2.0741585852823732e-05, "loss": 4.9509, "step": 3642 }, { "epoch": 0.6234810884819442, "grad_norm": 27.16095542907715, "learning_rate": 2.0747290359383912e-05, "loss": 3.0999, "step": 3643 }, { "epoch": 0.6236522334417252, "grad_norm": 25.359394073486328, "learning_rate": 2.0752994865944096e-05, "loss": 1.8888, "step": 3644 }, { "epoch": 0.623823378401506, "grad_norm": 27.978900909423828, "learning_rate": 2.075869937250428e-05, "loss": 2.641, "step": 3645 }, { "epoch": 0.623994523361287, "grad_norm": 22.19740867614746, "learning_rate": 2.076440387906446e-05, "loss": 2.2979, "step": 3646 }, { "epoch": 0.6241656683210679, "grad_norm": 21.350025177001953, "learning_rate": 2.0770108385624643e-05, "loss": 2.7609, "step": 3647 }, { "epoch": 0.6243368132808489, "grad_norm": 24.283403396606445, "learning_rate": 2.0775812892184826e-05, "loss": 2.2185, "step": 3648 }, { "epoch": 0.6245079582406298, "grad_norm": 24.77626609802246, "learning_rate": 2.078151739874501e-05, "loss": 3.0207, "step": 3649 }, { "epoch": 0.6246791032004108, "grad_norm": 25.94424819946289, "learning_rate": 2.078722190530519e-05, "loss": 3.1535, "step": 3650 }, { "epoch": 0.6248502481601916, "grad_norm": 23.725664138793945, "learning_rate": 2.0792926411865373e-05, "loss": 2.1339, "step": 3651 }, { "epoch": 0.6250213931199726, "grad_norm": 20.791276931762695, "learning_rate": 2.0798630918425556e-05, "loss": 2.2234, "step": 3652 }, { "epoch": 0.6251925380797535, "grad_norm": 18.761796951293945, "learning_rate": 2.0804335424985736e-05, "loss": 1.7418, "step": 3653 }, { "epoch": 0.6253636830395345, "grad_norm": 19.412919998168945, "learning_rate": 2.0810039931545923e-05, "loss": 1.6532, "step": 3654 }, { "epoch": 0.6255348279993154, "grad_norm": 34.13801193237305, "learning_rate": 2.0815744438106106e-05, "loss": 6.6811, "step": 3655 }, { "epoch": 0.6257059729590964, "grad_norm": 28.877214431762695, "learning_rate": 2.082144894466629e-05, "loss": 3.9092, "step": 3656 }, { "epoch": 0.6258771179188772, "grad_norm": 25.978158950805664, "learning_rate": 2.082715345122647e-05, "loss": 2.3646, "step": 3657 }, { "epoch": 0.6260482628786582, "grad_norm": 45.43318176269531, "learning_rate": 2.0832857957786653e-05, "loss": 2.9491, "step": 3658 }, { "epoch": 0.6262194078384392, "grad_norm": 25.172359466552734, "learning_rate": 2.0838562464346836e-05, "loss": 3.1014, "step": 3659 }, { "epoch": 0.6263905527982201, "grad_norm": 31.28904914855957, "learning_rate": 2.0844266970907016e-05, "loss": 3.1078, "step": 3660 }, { "epoch": 0.6265616977580011, "grad_norm": 28.047880172729492, "learning_rate": 2.08499714774672e-05, "loss": 3.2356, "step": 3661 }, { "epoch": 0.626732842717782, "grad_norm": 14.974970817565918, "learning_rate": 2.0855675984027383e-05, "loss": 1.3538, "step": 3662 }, { "epoch": 0.626903987677563, "grad_norm": 1.7350009679794312, "learning_rate": 2.0861380490587566e-05, "loss": 0.3083, "step": 3663 }, { "epoch": 0.6270751326373438, "grad_norm": 24.56871795654297, "learning_rate": 2.0867084997147746e-05, "loss": 2.5915, "step": 3664 }, { "epoch": 0.6272462775971248, "grad_norm": 24.486120223999023, "learning_rate": 2.087278950370793e-05, "loss": 2.461, "step": 3665 }, { "epoch": 0.6274174225569057, "grad_norm": 24.524600982666016, "learning_rate": 2.0878494010268113e-05, "loss": 2.5748, "step": 3666 }, { "epoch": 0.6275885675166867, "grad_norm": 5.067863464355469, "learning_rate": 2.0884198516828293e-05, "loss": 0.5522, "step": 3667 }, { "epoch": 0.6277597124764676, "grad_norm": 15.868297576904297, "learning_rate": 2.0889903023388477e-05, "loss": 1.6833, "step": 3668 }, { "epoch": 0.6279308574362485, "grad_norm": 25.489429473876953, "learning_rate": 2.089560752994866e-05, "loss": 2.5381, "step": 3669 }, { "epoch": 0.6281020023960294, "grad_norm": 26.983837127685547, "learning_rate": 2.0901312036508843e-05, "loss": 3.3307, "step": 3670 }, { "epoch": 0.6282731473558104, "grad_norm": 17.707273483276367, "learning_rate": 2.0907016543069023e-05, "loss": 1.8142, "step": 3671 }, { "epoch": 0.6284442923155913, "grad_norm": 23.989248275756836, "learning_rate": 2.0912721049629207e-05, "loss": 2.5138, "step": 3672 }, { "epoch": 0.6286154372753723, "grad_norm": 24.12046241760254, "learning_rate": 2.091842555618939e-05, "loss": 2.7494, "step": 3673 }, { "epoch": 0.6287865822351532, "grad_norm": 1.9827460050582886, "learning_rate": 2.092413006274957e-05, "loss": 0.2992, "step": 3674 }, { "epoch": 0.6289577271949341, "grad_norm": 15.272665977478027, "learning_rate": 2.0929834569309753e-05, "loss": 1.5809, "step": 3675 }, { "epoch": 0.629128872154715, "grad_norm": 7.758640289306641, "learning_rate": 2.093553907586994e-05, "loss": 0.5765, "step": 3676 }, { "epoch": 0.629300017114496, "grad_norm": 29.360862731933594, "learning_rate": 2.094124358243012e-05, "loss": 3.1532, "step": 3677 }, { "epoch": 0.6294711620742769, "grad_norm": 2.6255311965942383, "learning_rate": 2.0946948088990304e-05, "loss": 0.3356, "step": 3678 }, { "epoch": 0.6296423070340579, "grad_norm": 31.4219970703125, "learning_rate": 2.0952652595550487e-05, "loss": 6.6895, "step": 3679 }, { "epoch": 0.6298134519938388, "grad_norm": 26.191577911376953, "learning_rate": 2.095835710211067e-05, "loss": 2.5616, "step": 3680 }, { "epoch": 0.6299845969536197, "grad_norm": 22.00040054321289, "learning_rate": 2.096406160867085e-05, "loss": 2.0715, "step": 3681 }, { "epoch": 0.6301557419134006, "grad_norm": 18.956966400146484, "learning_rate": 2.0969766115231034e-05, "loss": 1.9574, "step": 3682 }, { "epoch": 0.6303268868731816, "grad_norm": 27.760032653808594, "learning_rate": 2.0975470621791217e-05, "loss": 3.7785, "step": 3683 }, { "epoch": 0.6304980318329625, "grad_norm": 10.644538879394531, "learning_rate": 2.0981175128351397e-05, "loss": 0.8287, "step": 3684 }, { "epoch": 0.6306691767927435, "grad_norm": 51.96141815185547, "learning_rate": 2.098687963491158e-05, "loss": 2.4722, "step": 3685 }, { "epoch": 0.6308403217525244, "grad_norm": 7.464876174926758, "learning_rate": 2.0992584141471764e-05, "loss": 0.6759, "step": 3686 }, { "epoch": 0.6310114667123053, "grad_norm": 18.195411682128906, "learning_rate": 2.0998288648031947e-05, "loss": 1.814, "step": 3687 }, { "epoch": 0.6311826116720862, "grad_norm": 35.01757049560547, "learning_rate": 2.1003993154592127e-05, "loss": 4.6493, "step": 3688 }, { "epoch": 0.6313537566318672, "grad_norm": 27.28526496887207, "learning_rate": 2.100969766115231e-05, "loss": 2.6136, "step": 3689 }, { "epoch": 0.6315249015916481, "grad_norm": 10.132340431213379, "learning_rate": 2.1015402167712494e-05, "loss": 1.4228, "step": 3690 }, { "epoch": 0.6316960465514291, "grad_norm": 29.740331649780273, "learning_rate": 2.1021106674272674e-05, "loss": 3.4955, "step": 3691 }, { "epoch": 0.63186719151121, "grad_norm": 6.783731937408447, "learning_rate": 2.1026811180832857e-05, "loss": 0.589, "step": 3692 }, { "epoch": 0.632038336470991, "grad_norm": 26.901226043701172, "learning_rate": 2.103251568739304e-05, "loss": 2.8409, "step": 3693 }, { "epoch": 0.6322094814307718, "grad_norm": 67.48046112060547, "learning_rate": 2.1038220193953224e-05, "loss": 2.4784, "step": 3694 }, { "epoch": 0.6323806263905528, "grad_norm": 16.813676834106445, "learning_rate": 2.1043924700513404e-05, "loss": 1.3682, "step": 3695 }, { "epoch": 0.6325517713503337, "grad_norm": 27.411855697631836, "learning_rate": 2.1049629207073587e-05, "loss": 2.8456, "step": 3696 }, { "epoch": 0.6327229163101147, "grad_norm": 73.62898254394531, "learning_rate": 2.105533371363377e-05, "loss": 3.2879, "step": 3697 }, { "epoch": 0.6328940612698956, "grad_norm": 22.297090530395508, "learning_rate": 2.106103822019395e-05, "loss": 2.3233, "step": 3698 }, { "epoch": 0.6330652062296765, "grad_norm": 24.923654556274414, "learning_rate": 2.1066742726754138e-05, "loss": 2.1826, "step": 3699 }, { "epoch": 0.6332363511894574, "grad_norm": 20.588891983032227, "learning_rate": 2.107244723331432e-05, "loss": 2.0226, "step": 3700 }, { "epoch": 0.6334074961492384, "grad_norm": 5.975876331329346, "learning_rate": 2.1078151739874504e-05, "loss": 0.6341, "step": 3701 }, { "epoch": 0.6335786411090193, "grad_norm": 21.88986587524414, "learning_rate": 2.1083856246434684e-05, "loss": 2.1575, "step": 3702 }, { "epoch": 0.6337497860688003, "grad_norm": 19.65184211730957, "learning_rate": 2.1089560752994868e-05, "loss": 1.663, "step": 3703 }, { "epoch": 0.6339209310285812, "grad_norm": 30.190269470214844, "learning_rate": 2.109526525955505e-05, "loss": 3.6871, "step": 3704 }, { "epoch": 0.6340920759883621, "grad_norm": 8.569622993469238, "learning_rate": 2.110096976611523e-05, "loss": 0.9122, "step": 3705 }, { "epoch": 0.634263220948143, "grad_norm": 59.63459777832031, "learning_rate": 2.1106674272675414e-05, "loss": 2.3494, "step": 3706 }, { "epoch": 0.634434365907924, "grad_norm": 26.309602737426758, "learning_rate": 2.1112378779235598e-05, "loss": 3.1797, "step": 3707 }, { "epoch": 0.6346055108677049, "grad_norm": 28.47893714904785, "learning_rate": 2.1118083285795778e-05, "loss": 3.2675, "step": 3708 }, { "epoch": 0.6347766558274859, "grad_norm": 24.453359603881836, "learning_rate": 2.112378779235596e-05, "loss": 2.5852, "step": 3709 }, { "epoch": 0.6349478007872669, "grad_norm": 24.291488647460938, "learning_rate": 2.1129492298916145e-05, "loss": 2.3409, "step": 3710 }, { "epoch": 0.6351189457470477, "grad_norm": 26.9472599029541, "learning_rate": 2.1135196805476328e-05, "loss": 3.1146, "step": 3711 }, { "epoch": 0.6352900907068287, "grad_norm": 20.753297805786133, "learning_rate": 2.1140901312036508e-05, "loss": 1.8245, "step": 3712 }, { "epoch": 0.6354612356666096, "grad_norm": 11.65485668182373, "learning_rate": 2.114660581859669e-05, "loss": 0.8492, "step": 3713 }, { "epoch": 0.6356323806263906, "grad_norm": 18.369417190551758, "learning_rate": 2.1152310325156875e-05, "loss": 1.8056, "step": 3714 }, { "epoch": 0.6358035255861715, "grad_norm": 3.041557788848877, "learning_rate": 2.1158014831717055e-05, "loss": 0.3459, "step": 3715 }, { "epoch": 0.6359746705459525, "grad_norm": 31.116910934448242, "learning_rate": 2.1163719338277238e-05, "loss": 2.4418, "step": 3716 }, { "epoch": 0.6361458155057333, "grad_norm": 26.295557022094727, "learning_rate": 2.116942384483742e-05, "loss": 1.8444, "step": 3717 }, { "epoch": 0.6363169604655143, "grad_norm": 25.38450813293457, "learning_rate": 2.1175128351397605e-05, "loss": 2.2447, "step": 3718 }, { "epoch": 0.6364881054252952, "grad_norm": 25.307218551635742, "learning_rate": 2.1180832857957785e-05, "loss": 2.5005, "step": 3719 }, { "epoch": 0.6366592503850762, "grad_norm": 2.224104642868042, "learning_rate": 2.1186537364517968e-05, "loss": 0.3241, "step": 3720 }, { "epoch": 0.6368303953448571, "grad_norm": 17.863842010498047, "learning_rate": 2.119224187107815e-05, "loss": 1.8059, "step": 3721 }, { "epoch": 0.6370015403046381, "grad_norm": 143.09255981445312, "learning_rate": 2.1197946377638335e-05, "loss": 7.8615, "step": 3722 }, { "epoch": 0.637172685264419, "grad_norm": 20.51776695251465, "learning_rate": 2.120365088419852e-05, "loss": 1.9465, "step": 3723 }, { "epoch": 0.6373438302241999, "grad_norm": 19.772676467895508, "learning_rate": 2.1209355390758702e-05, "loss": 1.874, "step": 3724 }, { "epoch": 0.6375149751839808, "grad_norm": 29.25998306274414, "learning_rate": 2.1215059897318885e-05, "loss": 3.1729, "step": 3725 }, { "epoch": 0.6376861201437618, "grad_norm": 68.94001770019531, "learning_rate": 2.1220764403879065e-05, "loss": 7.3574, "step": 3726 }, { "epoch": 0.6378572651035427, "grad_norm": 26.3350887298584, "learning_rate": 2.122646891043925e-05, "loss": 3.0566, "step": 3727 }, { "epoch": 0.6380284100633237, "grad_norm": 1.6111328601837158, "learning_rate": 2.1232173416999432e-05, "loss": 0.2934, "step": 3728 }, { "epoch": 0.6381995550231045, "grad_norm": 20.667644500732422, "learning_rate": 2.1237877923559612e-05, "loss": 2.0751, "step": 3729 }, { "epoch": 0.6383706999828855, "grad_norm": 14.264472961425781, "learning_rate": 2.1243582430119795e-05, "loss": 1.0027, "step": 3730 }, { "epoch": 0.6385418449426664, "grad_norm": 22.407548904418945, "learning_rate": 2.124928693667998e-05, "loss": 1.9791, "step": 3731 }, { "epoch": 0.6387129899024474, "grad_norm": 31.578723907470703, "learning_rate": 2.1254991443240162e-05, "loss": 2.7384, "step": 3732 }, { "epoch": 0.6388841348622283, "grad_norm": 2.103879690170288, "learning_rate": 2.1260695949800342e-05, "loss": 0.312, "step": 3733 }, { "epoch": 0.6390552798220093, "grad_norm": 68.59303283691406, "learning_rate": 2.1266400456360525e-05, "loss": 7.0889, "step": 3734 }, { "epoch": 0.6392264247817901, "grad_norm": 7.920656681060791, "learning_rate": 2.127210496292071e-05, "loss": 1.09, "step": 3735 }, { "epoch": 0.6393975697415711, "grad_norm": 29.83785057067871, "learning_rate": 2.127780946948089e-05, "loss": 2.6416, "step": 3736 }, { "epoch": 0.639568714701352, "grad_norm": 22.06658172607422, "learning_rate": 2.1283513976041072e-05, "loss": 2.3255, "step": 3737 }, { "epoch": 0.639739859661133, "grad_norm": 18.649507522583008, "learning_rate": 2.1289218482601255e-05, "loss": 1.5747, "step": 3738 }, { "epoch": 0.6399110046209139, "grad_norm": 25.857921600341797, "learning_rate": 2.129492298916144e-05, "loss": 2.3616, "step": 3739 }, { "epoch": 0.6400821495806949, "grad_norm": 27.74761390686035, "learning_rate": 2.130062749572162e-05, "loss": 3.559, "step": 3740 }, { "epoch": 0.6402532945404757, "grad_norm": 42.965763092041016, "learning_rate": 2.1306332002281802e-05, "loss": 6.2665, "step": 3741 }, { "epoch": 0.6404244395002567, "grad_norm": 13.754633903503418, "learning_rate": 2.1312036508841986e-05, "loss": 0.9964, "step": 3742 }, { "epoch": 0.6405955844600376, "grad_norm": 30.93828010559082, "learning_rate": 2.1317741015402166e-05, "loss": 2.1571, "step": 3743 }, { "epoch": 0.6407667294198186, "grad_norm": 13.315516471862793, "learning_rate": 2.132344552196235e-05, "loss": 0.9772, "step": 3744 }, { "epoch": 0.6409378743795995, "grad_norm": 25.19696617126465, "learning_rate": 2.1329150028522536e-05, "loss": 2.2795, "step": 3745 }, { "epoch": 0.6411090193393805, "grad_norm": 19.73836326599121, "learning_rate": 2.1334854535082716e-05, "loss": 2.1502, "step": 3746 }, { "epoch": 0.6412801642991613, "grad_norm": 13.088768005371094, "learning_rate": 2.13405590416429e-05, "loss": 0.6967, "step": 3747 }, { "epoch": 0.6414513092589423, "grad_norm": 8.49268627166748, "learning_rate": 2.1346263548203082e-05, "loss": 0.8886, "step": 3748 }, { "epoch": 0.6416224542187232, "grad_norm": 14.05295181274414, "learning_rate": 2.1351968054763266e-05, "loss": 1.1393, "step": 3749 }, { "epoch": 0.6417935991785042, "grad_norm": 35.74152755737305, "learning_rate": 2.1357672561323446e-05, "loss": 3.8807, "step": 3750 }, { "epoch": 0.6419647441382851, "grad_norm": 12.692075729370117, "learning_rate": 2.136337706788363e-05, "loss": 0.9175, "step": 3751 }, { "epoch": 0.6421358890980661, "grad_norm": 3.9487550258636475, "learning_rate": 2.1369081574443813e-05, "loss": 0.3691, "step": 3752 }, { "epoch": 0.642307034057847, "grad_norm": 25.308374404907227, "learning_rate": 2.1374786081003993e-05, "loss": 3.566, "step": 3753 }, { "epoch": 0.6424781790176279, "grad_norm": 27.15434455871582, "learning_rate": 2.1380490587564176e-05, "loss": 2.8604, "step": 3754 }, { "epoch": 0.6426493239774088, "grad_norm": 28.912559509277344, "learning_rate": 2.138619509412436e-05, "loss": 2.7714, "step": 3755 }, { "epoch": 0.6428204689371898, "grad_norm": 185.9123992919922, "learning_rate": 2.1391899600684543e-05, "loss": 11.7502, "step": 3756 }, { "epoch": 0.6429916138969707, "grad_norm": 33.01267623901367, "learning_rate": 2.1397604107244723e-05, "loss": 1.9282, "step": 3757 }, { "epoch": 0.6431627588567517, "grad_norm": 32.01244354248047, "learning_rate": 2.1403308613804906e-05, "loss": 3.0304, "step": 3758 }, { "epoch": 0.6433339038165325, "grad_norm": 26.142648696899414, "learning_rate": 2.140901312036509e-05, "loss": 2.5164, "step": 3759 }, { "epoch": 0.6435050487763135, "grad_norm": 24.568946838378906, "learning_rate": 2.141471762692527e-05, "loss": 2.3979, "step": 3760 }, { "epoch": 0.6436761937360945, "grad_norm": 26.051137924194336, "learning_rate": 2.1420422133485453e-05, "loss": 3.0876, "step": 3761 }, { "epoch": 0.6438473386958754, "grad_norm": 17.334243774414062, "learning_rate": 2.1426126640045636e-05, "loss": 1.4332, "step": 3762 }, { "epoch": 0.6440184836556564, "grad_norm": 29.278783798217773, "learning_rate": 2.143183114660582e-05, "loss": 2.8496, "step": 3763 }, { "epoch": 0.6441896286154373, "grad_norm": 24.168411254882812, "learning_rate": 2.1437535653166e-05, "loss": 2.4451, "step": 3764 }, { "epoch": 0.6443607735752183, "grad_norm": 31.55498504638672, "learning_rate": 2.1443240159726183e-05, "loss": 2.0817, "step": 3765 }, { "epoch": 0.6445319185349991, "grad_norm": 32.670005798339844, "learning_rate": 2.1448944666286366e-05, "loss": 6.7024, "step": 3766 }, { "epoch": 0.6447030634947801, "grad_norm": 2.9258251190185547, "learning_rate": 2.1454649172846546e-05, "loss": 0.3334, "step": 3767 }, { "epoch": 0.644874208454561, "grad_norm": 12.174300193786621, "learning_rate": 2.1460353679406733e-05, "loss": 0.8169, "step": 3768 }, { "epoch": 0.645045353414342, "grad_norm": 28.00621223449707, "learning_rate": 2.1466058185966917e-05, "loss": 3.2524, "step": 3769 }, { "epoch": 0.6452164983741229, "grad_norm": 26.712377548217773, "learning_rate": 2.14717626925271e-05, "loss": 2.7371, "step": 3770 }, { "epoch": 0.6453876433339039, "grad_norm": 21.200624465942383, "learning_rate": 2.147746719908728e-05, "loss": 2.6966, "step": 3771 }, { "epoch": 0.6455587882936847, "grad_norm": 11.284048080444336, "learning_rate": 2.1483171705647463e-05, "loss": 0.7751, "step": 3772 }, { "epoch": 0.6457299332534657, "grad_norm": 11.401342391967773, "learning_rate": 2.1488876212207647e-05, "loss": 0.7674, "step": 3773 }, { "epoch": 0.6459010782132466, "grad_norm": 6.0696587562561035, "learning_rate": 2.1494580718767827e-05, "loss": 0.6054, "step": 3774 }, { "epoch": 0.6460722231730276, "grad_norm": 27.059720993041992, "learning_rate": 2.150028522532801e-05, "loss": 3.0029, "step": 3775 }, { "epoch": 0.6462433681328085, "grad_norm": 20.365650177001953, "learning_rate": 2.1505989731888193e-05, "loss": 2.0427, "step": 3776 }, { "epoch": 0.6464145130925895, "grad_norm": 6.034448146820068, "learning_rate": 2.1511694238448373e-05, "loss": 0.6441, "step": 3777 }, { "epoch": 0.6465856580523703, "grad_norm": 23.394229888916016, "learning_rate": 2.1517398745008557e-05, "loss": 2.4589, "step": 3778 }, { "epoch": 0.6467568030121513, "grad_norm": 46.16388702392578, "learning_rate": 2.152310325156874e-05, "loss": 7.0883, "step": 3779 }, { "epoch": 0.6469279479719322, "grad_norm": 23.118371963500977, "learning_rate": 2.1528807758128924e-05, "loss": 2.5132, "step": 3780 }, { "epoch": 0.6470990929317132, "grad_norm": 29.06417465209961, "learning_rate": 2.1534512264689103e-05, "loss": 2.967, "step": 3781 }, { "epoch": 0.6472702378914941, "grad_norm": 4.700016498565674, "learning_rate": 2.1540216771249287e-05, "loss": 0.3476, "step": 3782 }, { "epoch": 0.6474413828512751, "grad_norm": 9.841592788696289, "learning_rate": 2.154592127780947e-05, "loss": 0.6132, "step": 3783 }, { "epoch": 0.6476125278110559, "grad_norm": 6.57474946975708, "learning_rate": 2.155162578436965e-05, "loss": 0.5898, "step": 3784 }, { "epoch": 0.6477836727708369, "grad_norm": 10.39101791381836, "learning_rate": 2.1557330290929834e-05, "loss": 0.9835, "step": 3785 }, { "epoch": 0.6479548177306178, "grad_norm": 23.276077270507812, "learning_rate": 2.1563034797490017e-05, "loss": 2.6405, "step": 3786 }, { "epoch": 0.6481259626903988, "grad_norm": 25.941986083984375, "learning_rate": 2.15687393040502e-05, "loss": 2.89, "step": 3787 }, { "epoch": 0.6482971076501797, "grad_norm": 32.4000129699707, "learning_rate": 2.157444381061038e-05, "loss": 3.63, "step": 3788 }, { "epoch": 0.6484682526099607, "grad_norm": 36.605247497558594, "learning_rate": 2.1580148317170564e-05, "loss": 6.3912, "step": 3789 }, { "epoch": 0.6486393975697415, "grad_norm": 31.485267639160156, "learning_rate": 2.1585852823730747e-05, "loss": 3.3366, "step": 3790 }, { "epoch": 0.6488105425295225, "grad_norm": 23.93595314025879, "learning_rate": 2.159155733029093e-05, "loss": 2.4522, "step": 3791 }, { "epoch": 0.6489816874893034, "grad_norm": 25.628398895263672, "learning_rate": 2.1597261836851114e-05, "loss": 3.0389, "step": 3792 }, { "epoch": 0.6491528324490844, "grad_norm": 25.61122703552246, "learning_rate": 2.1602966343411297e-05, "loss": 2.1422, "step": 3793 }, { "epoch": 0.6493239774088653, "grad_norm": 26.866369247436523, "learning_rate": 2.160867084997148e-05, "loss": 2.8132, "step": 3794 }, { "epoch": 0.6494951223686463, "grad_norm": 60.774818420410156, "learning_rate": 2.161437535653166e-05, "loss": 6.1755, "step": 3795 }, { "epoch": 0.6496662673284271, "grad_norm": 12.326183319091797, "learning_rate": 2.1620079863091844e-05, "loss": 0.839, "step": 3796 }, { "epoch": 0.6498374122882081, "grad_norm": 22.0472354888916, "learning_rate": 2.1625784369652027e-05, "loss": 2.4891, "step": 3797 }, { "epoch": 0.650008557247989, "grad_norm": 8.49109935760498, "learning_rate": 2.1631488876212207e-05, "loss": 0.7858, "step": 3798 }, { "epoch": 0.65017970220777, "grad_norm": 46.285579681396484, "learning_rate": 2.163719338277239e-05, "loss": 2.7199, "step": 3799 }, { "epoch": 0.6503508471675509, "grad_norm": 16.816619873046875, "learning_rate": 2.1642897889332574e-05, "loss": 1.5917, "step": 3800 }, { "epoch": 0.6505219921273319, "grad_norm": 28.71768569946289, "learning_rate": 2.1648602395892758e-05, "loss": 3.4323, "step": 3801 }, { "epoch": 0.6506931370871127, "grad_norm": 23.60746192932129, "learning_rate": 2.1654306902452938e-05, "loss": 2.0654, "step": 3802 }, { "epoch": 0.6508642820468937, "grad_norm": 26.327360153198242, "learning_rate": 2.166001140901312e-05, "loss": 1.8876, "step": 3803 }, { "epoch": 0.6510354270066746, "grad_norm": 44.482337951660156, "learning_rate": 2.1665715915573304e-05, "loss": 6.5739, "step": 3804 }, { "epoch": 0.6512065719664556, "grad_norm": 27.297197341918945, "learning_rate": 2.1671420422133484e-05, "loss": 2.1462, "step": 3805 }, { "epoch": 0.6513777169262365, "grad_norm": 11.890837669372559, "learning_rate": 2.1677124928693668e-05, "loss": 0.7348, "step": 3806 }, { "epoch": 0.6515488618860175, "grad_norm": 27.35932731628418, "learning_rate": 2.168282943525385e-05, "loss": 2.9561, "step": 3807 }, { "epoch": 0.6517200068457983, "grad_norm": 25.932842254638672, "learning_rate": 2.1688533941814034e-05, "loss": 2.5524, "step": 3808 }, { "epoch": 0.6518911518055793, "grad_norm": 7.344489574432373, "learning_rate": 2.1694238448374214e-05, "loss": 0.8045, "step": 3809 }, { "epoch": 0.6520622967653602, "grad_norm": 24.049985885620117, "learning_rate": 2.1699942954934398e-05, "loss": 2.3474, "step": 3810 }, { "epoch": 0.6522334417251412, "grad_norm": 25.154258728027344, "learning_rate": 2.170564746149458e-05, "loss": 2.8768, "step": 3811 }, { "epoch": 0.6524045866849222, "grad_norm": 35.475502014160156, "learning_rate": 2.171135196805476e-05, "loss": 5.6263, "step": 3812 }, { "epoch": 0.6525757316447031, "grad_norm": 18.898576736450195, "learning_rate": 2.1717056474614945e-05, "loss": 2.0987, "step": 3813 }, { "epoch": 0.652746876604484, "grad_norm": 64.42694091796875, "learning_rate": 2.172276098117513e-05, "loss": 1.9397, "step": 3814 }, { "epoch": 0.6529180215642649, "grad_norm": 51.23388671875, "learning_rate": 2.172846548773531e-05, "loss": 2.187, "step": 3815 }, { "epoch": 0.6530891665240459, "grad_norm": 24.042943954467773, "learning_rate": 2.1734169994295495e-05, "loss": 2.9904, "step": 3816 }, { "epoch": 0.6532603114838268, "grad_norm": 4.368581295013428, "learning_rate": 2.1739874500855678e-05, "loss": 0.442, "step": 3817 }, { "epoch": 0.6534314564436078, "grad_norm": 22.971675872802734, "learning_rate": 2.174557900741586e-05, "loss": 2.3311, "step": 3818 }, { "epoch": 0.6536026014033887, "grad_norm": 23.986604690551758, "learning_rate": 2.175128351397604e-05, "loss": 2.6135, "step": 3819 }, { "epoch": 0.6537737463631696, "grad_norm": 28.69915771484375, "learning_rate": 2.1756988020536225e-05, "loss": 3.5022, "step": 3820 }, { "epoch": 0.6539448913229505, "grad_norm": 8.601239204406738, "learning_rate": 2.1762692527096408e-05, "loss": 0.6592, "step": 3821 }, { "epoch": 0.6541160362827315, "grad_norm": 22.482227325439453, "learning_rate": 2.1768397033656588e-05, "loss": 2.4048, "step": 3822 }, { "epoch": 0.6542871812425124, "grad_norm": 25.31351089477539, "learning_rate": 2.177410154021677e-05, "loss": 3.4277, "step": 3823 }, { "epoch": 0.6544583262022934, "grad_norm": 20.58570671081543, "learning_rate": 2.1779806046776955e-05, "loss": 2.1318, "step": 3824 }, { "epoch": 0.6546294711620743, "grad_norm": 15.284663200378418, "learning_rate": 2.1785510553337138e-05, "loss": 1.6332, "step": 3825 }, { "epoch": 0.6548006161218553, "grad_norm": 22.946290969848633, "learning_rate": 2.1791215059897318e-05, "loss": 2.5015, "step": 3826 }, { "epoch": 0.6549717610816361, "grad_norm": 49.23842239379883, "learning_rate": 2.17969195664575e-05, "loss": 7.6205, "step": 3827 }, { "epoch": 0.6551429060414171, "grad_norm": 16.11168670654297, "learning_rate": 2.1802624073017685e-05, "loss": 1.5222, "step": 3828 }, { "epoch": 0.655314051001198, "grad_norm": 25.72747039794922, "learning_rate": 2.1808328579577865e-05, "loss": 2.7138, "step": 3829 }, { "epoch": 0.655485195960979, "grad_norm": 14.393827438354492, "learning_rate": 2.181403308613805e-05, "loss": 1.1036, "step": 3830 }, { "epoch": 0.6556563409207599, "grad_norm": 27.66619300842285, "learning_rate": 2.1819737592698232e-05, "loss": 2.5863, "step": 3831 }, { "epoch": 0.6558274858805409, "grad_norm": 34.8533935546875, "learning_rate": 2.1825442099258415e-05, "loss": 6.3297, "step": 3832 }, { "epoch": 0.6559986308403217, "grad_norm": 32.486045837402344, "learning_rate": 2.1831146605818595e-05, "loss": 3.5127, "step": 3833 }, { "epoch": 0.6561697758001027, "grad_norm": 22.248271942138672, "learning_rate": 2.183685111237878e-05, "loss": 2.0005, "step": 3834 }, { "epoch": 0.6563409207598836, "grad_norm": 10.389534950256348, "learning_rate": 2.1842555618938962e-05, "loss": 0.7072, "step": 3835 }, { "epoch": 0.6565120657196646, "grad_norm": 11.556964874267578, "learning_rate": 2.1848260125499145e-05, "loss": 0.8002, "step": 3836 }, { "epoch": 0.6566832106794455, "grad_norm": 41.708778381347656, "learning_rate": 2.185396463205933e-05, "loss": 6.6155, "step": 3837 }, { "epoch": 0.6568543556392265, "grad_norm": 26.74636459350586, "learning_rate": 2.1859669138619512e-05, "loss": 3.0992, "step": 3838 }, { "epoch": 0.6570255005990073, "grad_norm": 24.822227478027344, "learning_rate": 2.1865373645179695e-05, "loss": 1.993, "step": 3839 }, { "epoch": 0.6571966455587883, "grad_norm": 28.681447982788086, "learning_rate": 2.1871078151739875e-05, "loss": 2.4919, "step": 3840 }, { "epoch": 0.6573677905185692, "grad_norm": 24.745319366455078, "learning_rate": 2.187678265830006e-05, "loss": 2.4664, "step": 3841 }, { "epoch": 0.6575389354783502, "grad_norm": 21.362070083618164, "learning_rate": 2.1882487164860242e-05, "loss": 1.9891, "step": 3842 }, { "epoch": 0.6577100804381311, "grad_norm": 5.6285481452941895, "learning_rate": 2.1888191671420422e-05, "loss": 0.5344, "step": 3843 }, { "epoch": 0.657881225397912, "grad_norm": 22.420122146606445, "learning_rate": 2.1893896177980606e-05, "loss": 2.1797, "step": 3844 }, { "epoch": 0.6580523703576929, "grad_norm": 1.2516911029815674, "learning_rate": 2.189960068454079e-05, "loss": 0.284, "step": 3845 }, { "epoch": 0.6582235153174739, "grad_norm": 16.150665283203125, "learning_rate": 2.190530519110097e-05, "loss": 1.2482, "step": 3846 }, { "epoch": 0.6583946602772548, "grad_norm": 29.241182327270508, "learning_rate": 2.1911009697661152e-05, "loss": 3.0362, "step": 3847 }, { "epoch": 0.6585658052370358, "grad_norm": 12.874473571777344, "learning_rate": 2.1916714204221336e-05, "loss": 1.3586, "step": 3848 }, { "epoch": 0.6587369501968167, "grad_norm": 25.05373764038086, "learning_rate": 2.192241871078152e-05, "loss": 2.9891, "step": 3849 }, { "epoch": 0.6589080951565977, "grad_norm": 5.338008880615234, "learning_rate": 2.19281232173417e-05, "loss": 0.6053, "step": 3850 }, { "epoch": 0.6590792401163785, "grad_norm": 24.934619903564453, "learning_rate": 2.1933827723901882e-05, "loss": 3.4717, "step": 3851 }, { "epoch": 0.6592503850761595, "grad_norm": 18.92520523071289, "learning_rate": 2.1939532230462066e-05, "loss": 1.792, "step": 3852 }, { "epoch": 0.6594215300359404, "grad_norm": 22.526769638061523, "learning_rate": 2.1945236737022246e-05, "loss": 2.2758, "step": 3853 }, { "epoch": 0.6595926749957214, "grad_norm": 13.877473831176758, "learning_rate": 2.195094124358243e-05, "loss": 0.9918, "step": 3854 }, { "epoch": 0.6597638199555023, "grad_norm": 26.623685836791992, "learning_rate": 2.1956645750142613e-05, "loss": 3.1529, "step": 3855 }, { "epoch": 0.6599349649152833, "grad_norm": 9.52644157409668, "learning_rate": 2.1962350256702796e-05, "loss": 1.4529, "step": 3856 }, { "epoch": 0.6601061098750641, "grad_norm": 27.445514678955078, "learning_rate": 2.1968054763262976e-05, "loss": 3.2014, "step": 3857 }, { "epoch": 0.6602772548348451, "grad_norm": 26.250980377197266, "learning_rate": 2.197375926982316e-05, "loss": 2.4078, "step": 3858 }, { "epoch": 0.660448399794626, "grad_norm": 13.75600814819336, "learning_rate": 2.1979463776383346e-05, "loss": 1.2816, "step": 3859 }, { "epoch": 0.660619544754407, "grad_norm": 39.159141540527344, "learning_rate": 2.1985168282943526e-05, "loss": 7.109, "step": 3860 }, { "epoch": 0.6607906897141879, "grad_norm": 29.373641967773438, "learning_rate": 2.199087278950371e-05, "loss": 3.3187, "step": 3861 }, { "epoch": 0.6609618346739689, "grad_norm": 21.516931533813477, "learning_rate": 2.1996577296063893e-05, "loss": 2.3298, "step": 3862 }, { "epoch": 0.6611329796337498, "grad_norm": 95.70633697509766, "learning_rate": 2.2002281802624076e-05, "loss": 7.5026, "step": 3863 }, { "epoch": 0.6613041245935307, "grad_norm": 34.55537796020508, "learning_rate": 2.2007986309184256e-05, "loss": 4.4655, "step": 3864 }, { "epoch": 0.6614752695533117, "grad_norm": 6.342493534088135, "learning_rate": 2.201369081574444e-05, "loss": 0.7267, "step": 3865 }, { "epoch": 0.6616464145130926, "grad_norm": 21.994108200073242, "learning_rate": 2.2019395322304623e-05, "loss": 1.894, "step": 3866 }, { "epoch": 0.6618175594728736, "grad_norm": 62.81085205078125, "learning_rate": 2.2025099828864803e-05, "loss": 2.5491, "step": 3867 }, { "epoch": 0.6619887044326545, "grad_norm": 13.31482219696045, "learning_rate": 2.2030804335424986e-05, "loss": 1.2874, "step": 3868 }, { "epoch": 0.6621598493924354, "grad_norm": 37.990882873535156, "learning_rate": 2.203650884198517e-05, "loss": 6.5223, "step": 3869 }, { "epoch": 0.6623309943522163, "grad_norm": 7.525701522827148, "learning_rate": 2.2042213348545353e-05, "loss": 0.7646, "step": 3870 }, { "epoch": 0.6625021393119973, "grad_norm": 28.383960723876953, "learning_rate": 2.2047917855105533e-05, "loss": 3.8601, "step": 3871 }, { "epoch": 0.6626732842717782, "grad_norm": 25.914175033569336, "learning_rate": 2.2053622361665716e-05, "loss": 2.8097, "step": 3872 }, { "epoch": 0.6628444292315592, "grad_norm": 25.789932250976562, "learning_rate": 2.20593268682259e-05, "loss": 2.4306, "step": 3873 }, { "epoch": 0.66301557419134, "grad_norm": 25.461977005004883, "learning_rate": 2.206503137478608e-05, "loss": 3.0559, "step": 3874 }, { "epoch": 0.663186719151121, "grad_norm": 21.75908088684082, "learning_rate": 2.2070735881346263e-05, "loss": 2.186, "step": 3875 }, { "epoch": 0.6633578641109019, "grad_norm": 26.937217712402344, "learning_rate": 2.2076440387906447e-05, "loss": 3.0092, "step": 3876 }, { "epoch": 0.6635290090706829, "grad_norm": 23.69822883605957, "learning_rate": 2.2082144894466627e-05, "loss": 2.4399, "step": 3877 }, { "epoch": 0.6637001540304638, "grad_norm": 2.651796579360962, "learning_rate": 2.208784940102681e-05, "loss": 0.3089, "step": 3878 }, { "epoch": 0.6638712989902448, "grad_norm": 13.779840469360352, "learning_rate": 2.2093553907586993e-05, "loss": 1.4293, "step": 3879 }, { "epoch": 0.6640424439500257, "grad_norm": 9.433732032775879, "learning_rate": 2.2099258414147177e-05, "loss": 0.738, "step": 3880 }, { "epoch": 0.6642135889098066, "grad_norm": 8.053833961486816, "learning_rate": 2.2104962920707357e-05, "loss": 0.8599, "step": 3881 }, { "epoch": 0.6643847338695875, "grad_norm": 8.789999008178711, "learning_rate": 2.2110667427267543e-05, "loss": 1.2145, "step": 3882 }, { "epoch": 0.6645558788293685, "grad_norm": 35.39924240112305, "learning_rate": 2.2116371933827727e-05, "loss": 3.1608, "step": 3883 }, { "epoch": 0.6647270237891494, "grad_norm": 123.84489440917969, "learning_rate": 2.2122076440387907e-05, "loss": 6.8157, "step": 3884 }, { "epoch": 0.6648981687489304, "grad_norm": 24.360124588012695, "learning_rate": 2.212778094694809e-05, "loss": 2.1383, "step": 3885 }, { "epoch": 0.6650693137087113, "grad_norm": 26.01473045349121, "learning_rate": 2.2133485453508274e-05, "loss": 3.114, "step": 3886 }, { "epoch": 0.6652404586684922, "grad_norm": 27.838552474975586, "learning_rate": 2.2139189960068457e-05, "loss": 2.8416, "step": 3887 }, { "epoch": 0.6654116036282731, "grad_norm": 27.128395080566406, "learning_rate": 2.2144894466628637e-05, "loss": 2.7295, "step": 3888 }, { "epoch": 0.6655827485880541, "grad_norm": 20.852027893066406, "learning_rate": 2.215059897318882e-05, "loss": 2.2252, "step": 3889 }, { "epoch": 0.665753893547835, "grad_norm": 51.29511260986328, "learning_rate": 2.2156303479749004e-05, "loss": 2.4494, "step": 3890 }, { "epoch": 0.665925038507616, "grad_norm": 27.06675910949707, "learning_rate": 2.2162007986309184e-05, "loss": 2.8832, "step": 3891 }, { "epoch": 0.6660961834673969, "grad_norm": 8.248744010925293, "learning_rate": 2.2167712492869367e-05, "loss": 0.7378, "step": 3892 }, { "epoch": 0.6662673284271778, "grad_norm": 23.798690795898438, "learning_rate": 2.217341699942955e-05, "loss": 3.3288, "step": 3893 }, { "epoch": 0.6664384733869587, "grad_norm": 25.778766632080078, "learning_rate": 2.2179121505989734e-05, "loss": 3.0603, "step": 3894 }, { "epoch": 0.6666096183467397, "grad_norm": 2.8828747272491455, "learning_rate": 2.2184826012549914e-05, "loss": 0.3141, "step": 3895 }, { "epoch": 0.6667807633065206, "grad_norm": 26.101049423217773, "learning_rate": 2.2190530519110097e-05, "loss": 3.1024, "step": 3896 }, { "epoch": 0.6669519082663016, "grad_norm": 20.340776443481445, "learning_rate": 2.219623502567028e-05, "loss": 1.7807, "step": 3897 }, { "epoch": 0.6671230532260825, "grad_norm": 22.285655975341797, "learning_rate": 2.220193953223046e-05, "loss": 1.9623, "step": 3898 }, { "epoch": 0.6672941981858634, "grad_norm": 10.816023826599121, "learning_rate": 2.2207644038790644e-05, "loss": 0.7881, "step": 3899 }, { "epoch": 0.6674653431456443, "grad_norm": 25.382898330688477, "learning_rate": 2.2213348545350827e-05, "loss": 2.2422, "step": 3900 }, { "epoch": 0.6676364881054253, "grad_norm": 18.11640167236328, "learning_rate": 2.221905305191101e-05, "loss": 1.323, "step": 3901 }, { "epoch": 0.6678076330652062, "grad_norm": 30.607837677001953, "learning_rate": 2.222475755847119e-05, "loss": 2.3077, "step": 3902 }, { "epoch": 0.6679787780249872, "grad_norm": 22.524381637573242, "learning_rate": 2.2230462065031374e-05, "loss": 2.7118, "step": 3903 }, { "epoch": 0.668149922984768, "grad_norm": 22.379953384399414, "learning_rate": 2.2236166571591557e-05, "loss": 2.0846, "step": 3904 }, { "epoch": 0.668321067944549, "grad_norm": 4.688474655151367, "learning_rate": 2.224187107815174e-05, "loss": 0.5239, "step": 3905 }, { "epoch": 0.6684922129043299, "grad_norm": 26.99576759338379, "learning_rate": 2.2247575584711924e-05, "loss": 3.4886, "step": 3906 }, { "epoch": 0.6686633578641109, "grad_norm": 29.186248779296875, "learning_rate": 2.2253280091272108e-05, "loss": 3.6048, "step": 3907 }, { "epoch": 0.6688345028238918, "grad_norm": 37.26026916503906, "learning_rate": 2.2258984597832288e-05, "loss": 6.4346, "step": 3908 }, { "epoch": 0.6690056477836728, "grad_norm": 15.596887588500977, "learning_rate": 2.226468910439247e-05, "loss": 1.5087, "step": 3909 }, { "epoch": 0.6691767927434537, "grad_norm": 22.914793014526367, "learning_rate": 2.2270393610952654e-05, "loss": 2.4031, "step": 3910 }, { "epoch": 0.6693479377032346, "grad_norm": 34.148956298828125, "learning_rate": 2.2276098117512838e-05, "loss": 3.4389, "step": 3911 }, { "epoch": 0.6695190826630155, "grad_norm": 21.66793441772461, "learning_rate": 2.2281802624073018e-05, "loss": 2.514, "step": 3912 }, { "epoch": 0.6696902276227965, "grad_norm": 10.826380729675293, "learning_rate": 2.22875071306332e-05, "loss": 0.8586, "step": 3913 }, { "epoch": 0.6698613725825775, "grad_norm": 25.435211181640625, "learning_rate": 2.2293211637193384e-05, "loss": 2.6889, "step": 3914 }, { "epoch": 0.6700325175423584, "grad_norm": 35.62110900878906, "learning_rate": 2.2298916143753564e-05, "loss": 6.5931, "step": 3915 }, { "epoch": 0.6702036625021394, "grad_norm": 30.739681243896484, "learning_rate": 2.2304620650313748e-05, "loss": 2.809, "step": 3916 }, { "epoch": 0.6703748074619202, "grad_norm": 3.0653045177459717, "learning_rate": 2.231032515687393e-05, "loss": 0.3283, "step": 3917 }, { "epoch": 0.6705459524217012, "grad_norm": 29.558330535888672, "learning_rate": 2.2316029663434115e-05, "loss": 2.0042, "step": 3918 }, { "epoch": 0.6707170973814821, "grad_norm": 23.827219009399414, "learning_rate": 2.2321734169994295e-05, "loss": 2.737, "step": 3919 }, { "epoch": 0.6708882423412631, "grad_norm": 33.99700927734375, "learning_rate": 2.2327438676554478e-05, "loss": 6.5747, "step": 3920 }, { "epoch": 0.671059387301044, "grad_norm": 27.55402374267578, "learning_rate": 2.233314318311466e-05, "loss": 2.6091, "step": 3921 }, { "epoch": 0.671230532260825, "grad_norm": 31.95720672607422, "learning_rate": 2.233884768967484e-05, "loss": 3.6537, "step": 3922 }, { "epoch": 0.6714016772206058, "grad_norm": 25.14667510986328, "learning_rate": 2.2344552196235025e-05, "loss": 2.6334, "step": 3923 }, { "epoch": 0.6715728221803868, "grad_norm": 23.47039794921875, "learning_rate": 2.2350256702795208e-05, "loss": 2.1698, "step": 3924 }, { "epoch": 0.6717439671401677, "grad_norm": 18.31406021118164, "learning_rate": 2.235596120935539e-05, "loss": 1.5905, "step": 3925 }, { "epoch": 0.6719151120999487, "grad_norm": 23.610937118530273, "learning_rate": 2.236166571591557e-05, "loss": 2.4129, "step": 3926 }, { "epoch": 0.6720862570597296, "grad_norm": 26.94730567932129, "learning_rate": 2.2367370222475755e-05, "loss": 2.8646, "step": 3927 }, { "epoch": 0.6722574020195106, "grad_norm": 19.2611026763916, "learning_rate": 2.237307472903594e-05, "loss": 1.685, "step": 3928 }, { "epoch": 0.6724285469792914, "grad_norm": 7.879249095916748, "learning_rate": 2.237877923559612e-05, "loss": 0.6331, "step": 3929 }, { "epoch": 0.6725996919390724, "grad_norm": 24.44508171081543, "learning_rate": 2.2384483742156305e-05, "loss": 2.3878, "step": 3930 }, { "epoch": 0.6727708368988533, "grad_norm": 20.18474769592285, "learning_rate": 2.239018824871649e-05, "loss": 1.8989, "step": 3931 }, { "epoch": 0.6729419818586343, "grad_norm": 5.985182762145996, "learning_rate": 2.2395892755276672e-05, "loss": 0.6958, "step": 3932 }, { "epoch": 0.6731131268184152, "grad_norm": 17.770193099975586, "learning_rate": 2.240159726183685e-05, "loss": 1.5905, "step": 3933 }, { "epoch": 0.6732842717781962, "grad_norm": 28.44164276123047, "learning_rate": 2.2407301768397035e-05, "loss": 2.9128, "step": 3934 }, { "epoch": 0.673455416737977, "grad_norm": 27.433252334594727, "learning_rate": 2.241300627495722e-05, "loss": 2.608, "step": 3935 }, { "epoch": 0.673626561697758, "grad_norm": 27.8862247467041, "learning_rate": 2.24187107815174e-05, "loss": 3.3689, "step": 3936 }, { "epoch": 0.6737977066575389, "grad_norm": 1.7033040523529053, "learning_rate": 2.2424415288077582e-05, "loss": 0.3299, "step": 3937 }, { "epoch": 0.6739688516173199, "grad_norm": 24.00095558166504, "learning_rate": 2.2430119794637765e-05, "loss": 2.1404, "step": 3938 }, { "epoch": 0.6741399965771008, "grad_norm": 28.09699821472168, "learning_rate": 2.243582430119795e-05, "loss": 2.9765, "step": 3939 }, { "epoch": 0.6743111415368818, "grad_norm": 33.0010871887207, "learning_rate": 2.244152880775813e-05, "loss": 6.8094, "step": 3940 }, { "epoch": 0.6744822864966626, "grad_norm": 25.918590545654297, "learning_rate": 2.2447233314318312e-05, "loss": 2.3787, "step": 3941 }, { "epoch": 0.6746534314564436, "grad_norm": 6.518866062164307, "learning_rate": 2.2452937820878495e-05, "loss": 0.501, "step": 3942 }, { "epoch": 0.6748245764162245, "grad_norm": 22.836181640625, "learning_rate": 2.2458642327438675e-05, "loss": 1.9666, "step": 3943 }, { "epoch": 0.6749957213760055, "grad_norm": 21.074495315551758, "learning_rate": 2.246434683399886e-05, "loss": 1.9682, "step": 3944 }, { "epoch": 0.6751668663357864, "grad_norm": 24.013696670532227, "learning_rate": 2.2470051340559042e-05, "loss": 2.5082, "step": 3945 }, { "epoch": 0.6753380112955674, "grad_norm": 19.56346893310547, "learning_rate": 2.2475755847119222e-05, "loss": 2.1205, "step": 3946 }, { "epoch": 0.6755091562553482, "grad_norm": 22.354597091674805, "learning_rate": 2.2481460353679405e-05, "loss": 2.1432, "step": 3947 }, { "epoch": 0.6756803012151292, "grad_norm": 20.16799545288086, "learning_rate": 2.248716486023959e-05, "loss": 1.7877, "step": 3948 }, { "epoch": 0.6758514461749101, "grad_norm": 31.60150146484375, "learning_rate": 2.2492869366799772e-05, "loss": 3.0757, "step": 3949 }, { "epoch": 0.6760225911346911, "grad_norm": 27.673959732055664, "learning_rate": 2.2498573873359952e-05, "loss": 3.013, "step": 3950 }, { "epoch": 0.676193736094472, "grad_norm": 20.703968048095703, "learning_rate": 2.250427837992014e-05, "loss": 2.0428, "step": 3951 }, { "epoch": 0.676364881054253, "grad_norm": 1.652134656906128, "learning_rate": 2.2509982886480322e-05, "loss": 0.2892, "step": 3952 }, { "epoch": 0.6765360260140338, "grad_norm": 4.8036017417907715, "learning_rate": 2.2515687393040502e-05, "loss": 0.3591, "step": 3953 }, { "epoch": 0.6767071709738148, "grad_norm": 1.2371207475662231, "learning_rate": 2.2521391899600686e-05, "loss": 0.2731, "step": 3954 }, { "epoch": 0.6768783159335957, "grad_norm": 34.3635368347168, "learning_rate": 2.252709640616087e-05, "loss": 6.5111, "step": 3955 }, { "epoch": 0.6770494608933767, "grad_norm": 26.6789493560791, "learning_rate": 2.2532800912721052e-05, "loss": 2.1933, "step": 3956 }, { "epoch": 0.6772206058531576, "grad_norm": 2.4849586486816406, "learning_rate": 2.2538505419281232e-05, "loss": 0.3021, "step": 3957 }, { "epoch": 0.6773917508129386, "grad_norm": 17.35017204284668, "learning_rate": 2.2544209925841416e-05, "loss": 1.7005, "step": 3958 }, { "epoch": 0.6775628957727194, "grad_norm": 36.90235137939453, "learning_rate": 2.25499144324016e-05, "loss": 3.5547, "step": 3959 }, { "epoch": 0.6777340407325004, "grad_norm": 14.82332706451416, "learning_rate": 2.255561893896178e-05, "loss": 1.2344, "step": 3960 }, { "epoch": 0.6779051856922813, "grad_norm": 16.59126853942871, "learning_rate": 2.2561323445521963e-05, "loss": 1.6819, "step": 3961 }, { "epoch": 0.6780763306520623, "grad_norm": 35.49818420410156, "learning_rate": 2.2567027952082146e-05, "loss": 5.1844, "step": 3962 }, { "epoch": 0.6782474756118432, "grad_norm": 18.354028701782227, "learning_rate": 2.257273245864233e-05, "loss": 1.5253, "step": 3963 }, { "epoch": 0.6784186205716242, "grad_norm": 33.30420684814453, "learning_rate": 2.257843696520251e-05, "loss": 3.3261, "step": 3964 }, { "epoch": 0.6785897655314052, "grad_norm": 1.1292017698287964, "learning_rate": 2.2584141471762693e-05, "loss": 0.2504, "step": 3965 }, { "epoch": 0.678760910491186, "grad_norm": 11.09749984741211, "learning_rate": 2.2589845978322876e-05, "loss": 0.7005, "step": 3966 }, { "epoch": 0.678932055450967, "grad_norm": 26.572961807250977, "learning_rate": 2.2595550484883056e-05, "loss": 2.5559, "step": 3967 }, { "epoch": 0.6791032004107479, "grad_norm": 28.02602767944336, "learning_rate": 2.260125499144324e-05, "loss": 2.5011, "step": 3968 }, { "epoch": 0.6792743453705289, "grad_norm": 6.116679668426514, "learning_rate": 2.2606959498003423e-05, "loss": 0.6422, "step": 3969 }, { "epoch": 0.6794454903303098, "grad_norm": 49.190433502197266, "learning_rate": 2.2612664004563606e-05, "loss": 2.2484, "step": 3970 }, { "epoch": 0.6796166352900908, "grad_norm": 27.249277114868164, "learning_rate": 2.2618368511123786e-05, "loss": 2.8075, "step": 3971 }, { "epoch": 0.6797877802498716, "grad_norm": 9.899073600769043, "learning_rate": 2.262407301768397e-05, "loss": 0.7593, "step": 3972 }, { "epoch": 0.6799589252096526, "grad_norm": 32.11344528198242, "learning_rate": 2.2629777524244153e-05, "loss": 3.4356, "step": 3973 }, { "epoch": 0.6801300701694335, "grad_norm": 33.132877349853516, "learning_rate": 2.2635482030804336e-05, "loss": 6.1209, "step": 3974 }, { "epoch": 0.6803012151292145, "grad_norm": 23.018150329589844, "learning_rate": 2.264118653736452e-05, "loss": 2.6845, "step": 3975 }, { "epoch": 0.6804723600889954, "grad_norm": 19.093454360961914, "learning_rate": 2.2646891043924703e-05, "loss": 1.9479, "step": 3976 }, { "epoch": 0.6806435050487764, "grad_norm": 16.804319381713867, "learning_rate": 2.2652595550484883e-05, "loss": 1.7933, "step": 3977 }, { "epoch": 0.6808146500085572, "grad_norm": 14.481977462768555, "learning_rate": 2.2658300057045066e-05, "loss": 1.2585, "step": 3978 }, { "epoch": 0.6809857949683382, "grad_norm": 30.040294647216797, "learning_rate": 2.266400456360525e-05, "loss": 3.3274, "step": 3979 }, { "epoch": 0.6811569399281191, "grad_norm": 13.815556526184082, "learning_rate": 2.2669709070165433e-05, "loss": 1.3265, "step": 3980 }, { "epoch": 0.6813280848879001, "grad_norm": 6.664211273193359, "learning_rate": 2.2675413576725613e-05, "loss": 0.673, "step": 3981 }, { "epoch": 0.681499229847681, "grad_norm": 28.066905975341797, "learning_rate": 2.2681118083285797e-05, "loss": 3.1732, "step": 3982 }, { "epoch": 0.681670374807462, "grad_norm": 3.707343101501465, "learning_rate": 2.268682258984598e-05, "loss": 0.6108, "step": 3983 }, { "epoch": 0.6818415197672428, "grad_norm": 3.5951898097991943, "learning_rate": 2.269252709640616e-05, "loss": 0.3119, "step": 3984 }, { "epoch": 0.6820126647270238, "grad_norm": 34.33369064331055, "learning_rate": 2.2698231602966343e-05, "loss": 3.5062, "step": 3985 }, { "epoch": 0.6821838096868047, "grad_norm": 10.984424591064453, "learning_rate": 2.2703936109526527e-05, "loss": 1.2835, "step": 3986 }, { "epoch": 0.6823549546465857, "grad_norm": 44.93880844116211, "learning_rate": 2.270964061608671e-05, "loss": 2.0757, "step": 3987 }, { "epoch": 0.6825260996063666, "grad_norm": 25.658374786376953, "learning_rate": 2.271534512264689e-05, "loss": 2.4198, "step": 3988 }, { "epoch": 0.6826972445661476, "grad_norm": 23.74066162109375, "learning_rate": 2.2721049629207073e-05, "loss": 2.3887, "step": 3989 }, { "epoch": 0.6828683895259284, "grad_norm": 22.6767578125, "learning_rate": 2.2726754135767257e-05, "loss": 2.0854, "step": 3990 }, { "epoch": 0.6830395344857094, "grad_norm": 13.210683822631836, "learning_rate": 2.2732458642327437e-05, "loss": 0.7998, "step": 3991 }, { "epoch": 0.6832106794454903, "grad_norm": 17.68242073059082, "learning_rate": 2.273816314888762e-05, "loss": 1.6588, "step": 3992 }, { "epoch": 0.6833818244052713, "grad_norm": 29.65629005432129, "learning_rate": 2.2743867655447804e-05, "loss": 3.4329, "step": 3993 }, { "epoch": 0.6835529693650522, "grad_norm": 23.901870727539062, "learning_rate": 2.2749572162007987e-05, "loss": 2.0216, "step": 3994 }, { "epoch": 0.6837241143248332, "grad_norm": 26.25312614440918, "learning_rate": 2.2755276668568167e-05, "loss": 2.3758, "step": 3995 }, { "epoch": 0.683895259284614, "grad_norm": 21.86573028564453, "learning_rate": 2.2760981175128354e-05, "loss": 1.8268, "step": 3996 }, { "epoch": 0.684066404244395, "grad_norm": 32.05353546142578, "learning_rate": 2.2766685681688537e-05, "loss": 2.6875, "step": 3997 }, { "epoch": 0.6842375492041759, "grad_norm": 24.166894912719727, "learning_rate": 2.2772390188248717e-05, "loss": 2.9649, "step": 3998 }, { "epoch": 0.6844086941639569, "grad_norm": 2.158492088317871, "learning_rate": 2.27780946948089e-05, "loss": 0.3136, "step": 3999 }, { "epoch": 0.6845798391237378, "grad_norm": 3.6095380783081055, "learning_rate": 2.2783799201369084e-05, "loss": 0.4003, "step": 4000 }, { "epoch": 0.6847509840835188, "grad_norm": 15.331098556518555, "learning_rate": 2.2789503707929267e-05, "loss": 1.3303, "step": 4001 }, { "epoch": 0.6849221290432996, "grad_norm": 72.33516693115234, "learning_rate": 2.2795208214489447e-05, "loss": 2.7759, "step": 4002 }, { "epoch": 0.6850932740030806, "grad_norm": 28.182132720947266, "learning_rate": 2.280091272104963e-05, "loss": 2.6851, "step": 4003 }, { "epoch": 0.6852644189628615, "grad_norm": 28.051651000976562, "learning_rate": 2.2806617227609814e-05, "loss": 2.7711, "step": 4004 }, { "epoch": 0.6854355639226425, "grad_norm": 30.069196701049805, "learning_rate": 2.2812321734169994e-05, "loss": 3.0094, "step": 4005 }, { "epoch": 0.6856067088824234, "grad_norm": 22.24188232421875, "learning_rate": 2.2818026240730177e-05, "loss": 2.049, "step": 4006 }, { "epoch": 0.6857778538422044, "grad_norm": 26.202171325683594, "learning_rate": 2.282373074729036e-05, "loss": 2.9979, "step": 4007 }, { "epoch": 0.6859489988019852, "grad_norm": 30.185165405273438, "learning_rate": 2.2829435253850544e-05, "loss": 2.4561, "step": 4008 }, { "epoch": 0.6861201437617662, "grad_norm": 38.99590301513672, "learning_rate": 2.2835139760410724e-05, "loss": 6.2926, "step": 4009 }, { "epoch": 0.6862912887215471, "grad_norm": 22.699562072753906, "learning_rate": 2.2840844266970907e-05, "loss": 2.1231, "step": 4010 }, { "epoch": 0.6864624336813281, "grad_norm": 16.00046730041504, "learning_rate": 2.284654877353109e-05, "loss": 1.4517, "step": 4011 }, { "epoch": 0.686633578641109, "grad_norm": 2.1720409393310547, "learning_rate": 2.285225328009127e-05, "loss": 0.2857, "step": 4012 }, { "epoch": 0.68680472360089, "grad_norm": 24.013540267944336, "learning_rate": 2.2857957786651454e-05, "loss": 2.0506, "step": 4013 }, { "epoch": 0.6869758685606709, "grad_norm": 31.45075798034668, "learning_rate": 2.2863662293211638e-05, "loss": 3.625, "step": 4014 }, { "epoch": 0.6871470135204518, "grad_norm": 13.581439018249512, "learning_rate": 2.2869366799771818e-05, "loss": 1.0461, "step": 4015 }, { "epoch": 0.6873181584802328, "grad_norm": 4.323340892791748, "learning_rate": 2.2875071306332e-05, "loss": 0.4245, "step": 4016 }, { "epoch": 0.6874893034400137, "grad_norm": 22.576906204223633, "learning_rate": 2.2880775812892184e-05, "loss": 1.9977, "step": 4017 }, { "epoch": 0.6876604483997947, "grad_norm": 24.18263053894043, "learning_rate": 2.2886480319452368e-05, "loss": 2.257, "step": 4018 }, { "epoch": 0.6878315933595756, "grad_norm": 31.072450637817383, "learning_rate": 2.289218482601255e-05, "loss": 4.4122, "step": 4019 }, { "epoch": 0.6880027383193565, "grad_norm": 6.332727432250977, "learning_rate": 2.2897889332572734e-05, "loss": 0.5804, "step": 4020 }, { "epoch": 0.6881738832791374, "grad_norm": 28.49344825744629, "learning_rate": 2.2903593839132918e-05, "loss": 3.0976, "step": 4021 }, { "epoch": 0.6883450282389184, "grad_norm": 25.831926345825195, "learning_rate": 2.2909298345693098e-05, "loss": 2.437, "step": 4022 }, { "epoch": 0.6885161731986993, "grad_norm": 1.773380994796753, "learning_rate": 2.291500285225328e-05, "loss": 0.316, "step": 4023 }, { "epoch": 0.6886873181584803, "grad_norm": 28.22812843322754, "learning_rate": 2.2920707358813465e-05, "loss": 3.162, "step": 4024 }, { "epoch": 0.6888584631182612, "grad_norm": 25.598621368408203, "learning_rate": 2.2926411865373648e-05, "loss": 2.4509, "step": 4025 }, { "epoch": 0.6890296080780421, "grad_norm": 23.125686645507812, "learning_rate": 2.2932116371933828e-05, "loss": 2.4663, "step": 4026 }, { "epoch": 0.689200753037823, "grad_norm": 16.213899612426758, "learning_rate": 2.293782087849401e-05, "loss": 1.3174, "step": 4027 }, { "epoch": 0.689371897997604, "grad_norm": 21.237468719482422, "learning_rate": 2.2943525385054195e-05, "loss": 1.7555, "step": 4028 }, { "epoch": 0.6895430429573849, "grad_norm": 18.583372116088867, "learning_rate": 2.2949229891614375e-05, "loss": 2.1211, "step": 4029 }, { "epoch": 0.6897141879171659, "grad_norm": 23.49361228942871, "learning_rate": 2.2954934398174558e-05, "loss": 2.3383, "step": 4030 }, { "epoch": 0.6898853328769468, "grad_norm": 18.21615219116211, "learning_rate": 2.296063890473474e-05, "loss": 1.7035, "step": 4031 }, { "epoch": 0.6900564778367277, "grad_norm": 21.006032943725586, "learning_rate": 2.2966343411294925e-05, "loss": 1.9274, "step": 4032 }, { "epoch": 0.6902276227965086, "grad_norm": 33.84695816040039, "learning_rate": 2.2972047917855105e-05, "loss": 4.7633, "step": 4033 }, { "epoch": 0.6903987677562896, "grad_norm": 23.903696060180664, "learning_rate": 2.2977752424415288e-05, "loss": 2.2789, "step": 4034 }, { "epoch": 0.6905699127160705, "grad_norm": 32.980133056640625, "learning_rate": 2.298345693097547e-05, "loss": 4.0104, "step": 4035 }, { "epoch": 0.6907410576758515, "grad_norm": 26.50374984741211, "learning_rate": 2.298916143753565e-05, "loss": 3.0929, "step": 4036 }, { "epoch": 0.6909122026356324, "grad_norm": 18.61760902404785, "learning_rate": 2.2994865944095835e-05, "loss": 1.5559, "step": 4037 }, { "epoch": 0.6910833475954133, "grad_norm": 21.44686508178711, "learning_rate": 2.300057045065602e-05, "loss": 1.9526, "step": 4038 }, { "epoch": 0.6912544925551942, "grad_norm": 21.688053131103516, "learning_rate": 2.3006274957216202e-05, "loss": 2.2512, "step": 4039 }, { "epoch": 0.6914256375149752, "grad_norm": 16.904464721679688, "learning_rate": 2.3011979463776382e-05, "loss": 1.5846, "step": 4040 }, { "epoch": 0.6915967824747561, "grad_norm": 15.30504035949707, "learning_rate": 2.3017683970336565e-05, "loss": 1.4085, "step": 4041 }, { "epoch": 0.6917679274345371, "grad_norm": 38.0904655456543, "learning_rate": 2.3023388476896752e-05, "loss": 6.407, "step": 4042 }, { "epoch": 0.691939072394318, "grad_norm": 1.5367884635925293, "learning_rate": 2.3029092983456932e-05, "loss": 0.2607, "step": 4043 }, { "epoch": 0.6921102173540989, "grad_norm": 6.76278018951416, "learning_rate": 2.3034797490017115e-05, "loss": 0.5019, "step": 4044 }, { "epoch": 0.6922813623138798, "grad_norm": 28.894147872924805, "learning_rate": 2.30405019965773e-05, "loss": 3.1341, "step": 4045 }, { "epoch": 0.6924525072736608, "grad_norm": 21.855031967163086, "learning_rate": 2.304620650313748e-05, "loss": 2.6851, "step": 4046 }, { "epoch": 0.6926236522334417, "grad_norm": 21.59860610961914, "learning_rate": 2.3051911009697662e-05, "loss": 2.0748, "step": 4047 }, { "epoch": 0.6927947971932227, "grad_norm": 82.68946838378906, "learning_rate": 2.3057615516257845e-05, "loss": 2.8348, "step": 4048 }, { "epoch": 0.6929659421530036, "grad_norm": 47.771636962890625, "learning_rate": 2.306332002281803e-05, "loss": 6.3127, "step": 4049 }, { "epoch": 0.6931370871127845, "grad_norm": 19.974872589111328, "learning_rate": 2.306902452937821e-05, "loss": 2.0138, "step": 4050 }, { "epoch": 0.6933082320725654, "grad_norm": 28.05762481689453, "learning_rate": 2.3074729035938392e-05, "loss": 2.804, "step": 4051 }, { "epoch": 0.6934793770323464, "grad_norm": 26.722768783569336, "learning_rate": 2.3080433542498575e-05, "loss": 3.3621, "step": 4052 }, { "epoch": 0.6936505219921273, "grad_norm": 10.087456703186035, "learning_rate": 2.3086138049058755e-05, "loss": 0.6185, "step": 4053 }, { "epoch": 0.6938216669519083, "grad_norm": 19.535655975341797, "learning_rate": 2.309184255561894e-05, "loss": 2.3951, "step": 4054 }, { "epoch": 0.6939928119116892, "grad_norm": 24.08242416381836, "learning_rate": 2.3097547062179122e-05, "loss": 2.5265, "step": 4055 }, { "epoch": 0.6941639568714701, "grad_norm": 10.019787788391113, "learning_rate": 2.3103251568739306e-05, "loss": 1.679, "step": 4056 }, { "epoch": 0.694335101831251, "grad_norm": 23.744293212890625, "learning_rate": 2.3108956075299486e-05, "loss": 2.3542, "step": 4057 }, { "epoch": 0.694506246791032, "grad_norm": 4.5092244148254395, "learning_rate": 2.311466058185967e-05, "loss": 0.4777, "step": 4058 }, { "epoch": 0.6946773917508129, "grad_norm": 17.50345802307129, "learning_rate": 2.3120365088419852e-05, "loss": 1.8737, "step": 4059 }, { "epoch": 0.6948485367105939, "grad_norm": 23.234378814697266, "learning_rate": 2.3126069594980032e-05, "loss": 2.307, "step": 4060 }, { "epoch": 0.6950196816703748, "grad_norm": 2.192140579223633, "learning_rate": 2.3131774101540216e-05, "loss": 0.2844, "step": 4061 }, { "epoch": 0.6951908266301557, "grad_norm": 22.02082633972168, "learning_rate": 2.31374786081004e-05, "loss": 2.0206, "step": 4062 }, { "epoch": 0.6953619715899366, "grad_norm": 18.239028930664062, "learning_rate": 2.3143183114660582e-05, "loss": 1.5416, "step": 4063 }, { "epoch": 0.6955331165497176, "grad_norm": 8.209535598754883, "learning_rate": 2.3148887621220762e-05, "loss": 0.8478, "step": 4064 }, { "epoch": 0.6957042615094986, "grad_norm": 38.67818832397461, "learning_rate": 2.315459212778095e-05, "loss": 2.1182, "step": 4065 }, { "epoch": 0.6958754064692795, "grad_norm": 27.814809799194336, "learning_rate": 2.3160296634341133e-05, "loss": 3.3903, "step": 4066 }, { "epoch": 0.6960465514290605, "grad_norm": 1.4533177614212036, "learning_rate": 2.3166001140901313e-05, "loss": 0.2869, "step": 4067 }, { "epoch": 0.6962176963888413, "grad_norm": 10.602791786193848, "learning_rate": 2.3171705647461496e-05, "loss": 0.6294, "step": 4068 }, { "epoch": 0.6963888413486223, "grad_norm": 20.941123962402344, "learning_rate": 2.317741015402168e-05, "loss": 2.1678, "step": 4069 }, { "epoch": 0.6965599863084032, "grad_norm": 26.170923233032227, "learning_rate": 2.3183114660581863e-05, "loss": 2.4743, "step": 4070 }, { "epoch": 0.6967311312681842, "grad_norm": 24.81916618347168, "learning_rate": 2.3188819167142043e-05, "loss": 2.6961, "step": 4071 }, { "epoch": 0.6969022762279651, "grad_norm": 21.444581985473633, "learning_rate": 2.3194523673702226e-05, "loss": 2.297, "step": 4072 }, { "epoch": 0.6970734211877461, "grad_norm": 24.241352081298828, "learning_rate": 2.320022818026241e-05, "loss": 2.47, "step": 4073 }, { "epoch": 0.697244566147527, "grad_norm": 19.334854125976562, "learning_rate": 2.320593268682259e-05, "loss": 1.8912, "step": 4074 }, { "epoch": 0.6974157111073079, "grad_norm": 1.0382440090179443, "learning_rate": 2.3211637193382773e-05, "loss": 0.2496, "step": 4075 }, { "epoch": 0.6975868560670888, "grad_norm": 9.449311256408691, "learning_rate": 2.3217341699942956e-05, "loss": 0.7254, "step": 4076 }, { "epoch": 0.6977580010268698, "grad_norm": 73.3146743774414, "learning_rate": 2.3223046206503136e-05, "loss": 7.7671, "step": 4077 }, { "epoch": 0.6979291459866507, "grad_norm": 1.6645268201828003, "learning_rate": 2.322875071306332e-05, "loss": 0.2892, "step": 4078 }, { "epoch": 0.6981002909464317, "grad_norm": 4.770233631134033, "learning_rate": 2.3234455219623503e-05, "loss": 0.4124, "step": 4079 }, { "epoch": 0.6982714359062125, "grad_norm": 28.565988540649414, "learning_rate": 2.3240159726183686e-05, "loss": 1.4992, "step": 4080 }, { "epoch": 0.6984425808659935, "grad_norm": 12.26234245300293, "learning_rate": 2.3245864232743866e-05, "loss": 0.8612, "step": 4081 }, { "epoch": 0.6986137258257744, "grad_norm": 25.978046417236328, "learning_rate": 2.325156873930405e-05, "loss": 2.1483, "step": 4082 }, { "epoch": 0.6987848707855554, "grad_norm": 19.333969116210938, "learning_rate": 2.3257273245864233e-05, "loss": 2.0506, "step": 4083 }, { "epoch": 0.6989560157453363, "grad_norm": 28.06734275817871, "learning_rate": 2.3262977752424413e-05, "loss": 2.6222, "step": 4084 }, { "epoch": 0.6991271607051173, "grad_norm": 9.045817375183105, "learning_rate": 2.3268682258984596e-05, "loss": 0.8083, "step": 4085 }, { "epoch": 0.6992983056648981, "grad_norm": 6.522378444671631, "learning_rate": 2.327438676554478e-05, "loss": 0.5991, "step": 4086 }, { "epoch": 0.6994694506246791, "grad_norm": 24.520263671875, "learning_rate": 2.3280091272104963e-05, "loss": 1.9785, "step": 4087 }, { "epoch": 0.69964059558446, "grad_norm": 25.192462921142578, "learning_rate": 2.3285795778665147e-05, "loss": 1.9242, "step": 4088 }, { "epoch": 0.699811740544241, "grad_norm": 21.36008071899414, "learning_rate": 2.329150028522533e-05, "loss": 2.2738, "step": 4089 }, { "epoch": 0.6999828855040219, "grad_norm": 15.994437217712402, "learning_rate": 2.3297204791785513e-05, "loss": 1.5939, "step": 4090 }, { "epoch": 0.7001540304638029, "grad_norm": 13.80662727355957, "learning_rate": 2.3302909298345693e-05, "loss": 0.8196, "step": 4091 }, { "epoch": 0.7003251754235837, "grad_norm": 1.906544804573059, "learning_rate": 2.3308613804905877e-05, "loss": 0.2608, "step": 4092 }, { "epoch": 0.7004963203833647, "grad_norm": 6.288933753967285, "learning_rate": 2.331431831146606e-05, "loss": 0.8815, "step": 4093 }, { "epoch": 0.7006674653431456, "grad_norm": 25.848539352416992, "learning_rate": 2.3320022818026244e-05, "loss": 3.1319, "step": 4094 }, { "epoch": 0.7008386103029266, "grad_norm": 2.6723341941833496, "learning_rate": 2.3325727324586424e-05, "loss": 0.3466, "step": 4095 }, { "epoch": 0.7010097552627075, "grad_norm": 34.171104431152344, "learning_rate": 2.3331431831146607e-05, "loss": 6.0408, "step": 4096 }, { "epoch": 0.7011809002224885, "grad_norm": 6.2798662185668945, "learning_rate": 2.333713633770679e-05, "loss": 0.5715, "step": 4097 }, { "epoch": 0.7013520451822693, "grad_norm": 21.24723243713379, "learning_rate": 2.334284084426697e-05, "loss": 1.801, "step": 4098 }, { "epoch": 0.7015231901420503, "grad_norm": 2.3047332763671875, "learning_rate": 2.3348545350827154e-05, "loss": 0.2806, "step": 4099 }, { "epoch": 0.7016943351018312, "grad_norm": 35.639129638671875, "learning_rate": 2.3354249857387337e-05, "loss": 6.2857, "step": 4100 }, { "epoch": 0.7018654800616122, "grad_norm": 22.578310012817383, "learning_rate": 2.335995436394752e-05, "loss": 2.3757, "step": 4101 }, { "epoch": 0.7020366250213931, "grad_norm": 27.650184631347656, "learning_rate": 2.33656588705077e-05, "loss": 2.7948, "step": 4102 }, { "epoch": 0.7022077699811741, "grad_norm": 22.47934913635254, "learning_rate": 2.3371363377067884e-05, "loss": 2.9155, "step": 4103 }, { "epoch": 0.702378914940955, "grad_norm": 117.08856201171875, "learning_rate": 2.3377067883628067e-05, "loss": 8.9408, "step": 4104 }, { "epoch": 0.7025500599007359, "grad_norm": 6.346577167510986, "learning_rate": 2.3382772390188247e-05, "loss": 0.4762, "step": 4105 }, { "epoch": 0.7027212048605168, "grad_norm": 24.838397979736328, "learning_rate": 2.338847689674843e-05, "loss": 2.0305, "step": 4106 }, { "epoch": 0.7028923498202978, "grad_norm": 5.3628716468811035, "learning_rate": 2.3394181403308614e-05, "loss": 0.4928, "step": 4107 }, { "epoch": 0.7030634947800787, "grad_norm": 27.933374404907227, "learning_rate": 2.3399885909868797e-05, "loss": 2.7148, "step": 4108 }, { "epoch": 0.7032346397398597, "grad_norm": 61.49900436401367, "learning_rate": 2.3405590416428977e-05, "loss": 2.22, "step": 4109 }, { "epoch": 0.7034057846996405, "grad_norm": 35.6771354675293, "learning_rate": 2.341129492298916e-05, "loss": 2.7082, "step": 4110 }, { "epoch": 0.7035769296594215, "grad_norm": 6.308041095733643, "learning_rate": 2.3416999429549347e-05, "loss": 0.5861, "step": 4111 }, { "epoch": 0.7037480746192024, "grad_norm": 18.36146354675293, "learning_rate": 2.3422703936109527e-05, "loss": 1.5818, "step": 4112 }, { "epoch": 0.7039192195789834, "grad_norm": 26.6254940032959, "learning_rate": 2.342840844266971e-05, "loss": 2.5972, "step": 4113 }, { "epoch": 0.7040903645387643, "grad_norm": 41.90875244140625, "learning_rate": 2.3434112949229894e-05, "loss": 7.2004, "step": 4114 }, { "epoch": 0.7042615094985453, "grad_norm": 26.454225540161133, "learning_rate": 2.3439817455790074e-05, "loss": 3.0425, "step": 4115 }, { "epoch": 0.7044326544583263, "grad_norm": 67.04540252685547, "learning_rate": 2.3445521962350258e-05, "loss": 2.5417, "step": 4116 }, { "epoch": 0.7046037994181071, "grad_norm": 29.956275939941406, "learning_rate": 2.345122646891044e-05, "loss": 3.9422, "step": 4117 }, { "epoch": 0.7047749443778881, "grad_norm": 28.678985595703125, "learning_rate": 2.3456930975470624e-05, "loss": 2.8701, "step": 4118 }, { "epoch": 0.704946089337669, "grad_norm": 21.728593826293945, "learning_rate": 2.3462635482030804e-05, "loss": 1.9539, "step": 4119 }, { "epoch": 0.70511723429745, "grad_norm": 21.133193969726562, "learning_rate": 2.3468339988590988e-05, "loss": 2.0111, "step": 4120 }, { "epoch": 0.7052883792572309, "grad_norm": 57.13780975341797, "learning_rate": 2.347404449515117e-05, "loss": 2.4365, "step": 4121 }, { "epoch": 0.7054595242170119, "grad_norm": 1.5548804998397827, "learning_rate": 2.347974900171135e-05, "loss": 0.2656, "step": 4122 }, { "epoch": 0.7056306691767927, "grad_norm": 16.94969940185547, "learning_rate": 2.3485453508271534e-05, "loss": 1.5448, "step": 4123 }, { "epoch": 0.7058018141365737, "grad_norm": 29.882259368896484, "learning_rate": 2.3491158014831718e-05, "loss": 1.7722, "step": 4124 }, { "epoch": 0.7059729590963546, "grad_norm": 13.851731300354004, "learning_rate": 2.34968625213919e-05, "loss": 1.2664, "step": 4125 }, { "epoch": 0.7061441040561356, "grad_norm": 19.379756927490234, "learning_rate": 2.350256702795208e-05, "loss": 1.5466, "step": 4126 }, { "epoch": 0.7063152490159165, "grad_norm": 20.181297302246094, "learning_rate": 2.3508271534512265e-05, "loss": 2.0799, "step": 4127 }, { "epoch": 0.7064863939756975, "grad_norm": 18.206491470336914, "learning_rate": 2.3513976041072448e-05, "loss": 1.7423, "step": 4128 }, { "epoch": 0.7066575389354783, "grad_norm": 39.42982482910156, "learning_rate": 2.3519680547632628e-05, "loss": 6.003, "step": 4129 }, { "epoch": 0.7068286838952593, "grad_norm": 10.509867668151855, "learning_rate": 2.352538505419281e-05, "loss": 0.7636, "step": 4130 }, { "epoch": 0.7069998288550402, "grad_norm": 2.2939412593841553, "learning_rate": 2.3531089560752995e-05, "loss": 0.2719, "step": 4131 }, { "epoch": 0.7071709738148212, "grad_norm": 25.9577579498291, "learning_rate": 2.3536794067313178e-05, "loss": 2.4994, "step": 4132 }, { "epoch": 0.7073421187746021, "grad_norm": 72.57787322998047, "learning_rate": 2.3542498573873358e-05, "loss": 7.4552, "step": 4133 }, { "epoch": 0.7075132637343831, "grad_norm": 138.49476623535156, "learning_rate": 2.3548203080433545e-05, "loss": 8.7384, "step": 4134 }, { "epoch": 0.7076844086941639, "grad_norm": 18.31711769104004, "learning_rate": 2.3553907586993728e-05, "loss": 1.8353, "step": 4135 }, { "epoch": 0.7078555536539449, "grad_norm": 23.117900848388672, "learning_rate": 2.3559612093553908e-05, "loss": 2.2019, "step": 4136 }, { "epoch": 0.7080266986137258, "grad_norm": 29.79839515686035, "learning_rate": 2.356531660011409e-05, "loss": 3.6913, "step": 4137 }, { "epoch": 0.7081978435735068, "grad_norm": 6.226611614227295, "learning_rate": 2.3571021106674275e-05, "loss": 0.3817, "step": 4138 }, { "epoch": 0.7083689885332877, "grad_norm": 7.866857051849365, "learning_rate": 2.3576725613234458e-05, "loss": 0.7468, "step": 4139 }, { "epoch": 0.7085401334930687, "grad_norm": 13.393908500671387, "learning_rate": 2.3582430119794638e-05, "loss": 0.9038, "step": 4140 }, { "epoch": 0.7087112784528495, "grad_norm": 4.077215194702148, "learning_rate": 2.358813462635482e-05, "loss": 0.3156, "step": 4141 }, { "epoch": 0.7088824234126305, "grad_norm": 35.0775146484375, "learning_rate": 2.3593839132915005e-05, "loss": 6.4035, "step": 4142 }, { "epoch": 0.7090535683724114, "grad_norm": 27.827789306640625, "learning_rate": 2.3599543639475185e-05, "loss": 2.6875, "step": 4143 }, { "epoch": 0.7092247133321924, "grad_norm": 7.222084045410156, "learning_rate": 2.360524814603537e-05, "loss": 0.6337, "step": 4144 }, { "epoch": 0.7093958582919733, "grad_norm": 16.112009048461914, "learning_rate": 2.3610952652595552e-05, "loss": 1.5604, "step": 4145 }, { "epoch": 0.7095670032517543, "grad_norm": 33.34373092651367, "learning_rate": 2.3616657159155732e-05, "loss": 6.1852, "step": 4146 }, { "epoch": 0.7097381482115351, "grad_norm": 23.43242835998535, "learning_rate": 2.3622361665715915e-05, "loss": 2.2633, "step": 4147 }, { "epoch": 0.7099092931713161, "grad_norm": 23.43446159362793, "learning_rate": 2.36280661722761e-05, "loss": 2.2967, "step": 4148 }, { "epoch": 0.710080438131097, "grad_norm": 9.296281814575195, "learning_rate": 2.3633770678836282e-05, "loss": 0.9115, "step": 4149 }, { "epoch": 0.710251583090878, "grad_norm": 23.257322311401367, "learning_rate": 2.3639475185396462e-05, "loss": 2.2038, "step": 4150 }, { "epoch": 0.7104227280506589, "grad_norm": 30.016109466552734, "learning_rate": 2.3645179691956645e-05, "loss": 3.0839, "step": 4151 }, { "epoch": 0.7105938730104399, "grad_norm": 25.682640075683594, "learning_rate": 2.365088419851683e-05, "loss": 2.8843, "step": 4152 }, { "epoch": 0.7107650179702207, "grad_norm": 23.85866355895996, "learning_rate": 2.365658870507701e-05, "loss": 1.9993, "step": 4153 }, { "epoch": 0.7109361629300017, "grad_norm": 4.204997539520264, "learning_rate": 2.3662293211637192e-05, "loss": 0.5192, "step": 4154 }, { "epoch": 0.7111073078897826, "grad_norm": 24.741037368774414, "learning_rate": 2.3667997718197375e-05, "loss": 2.4707, "step": 4155 }, { "epoch": 0.7112784528495636, "grad_norm": 5.143214225769043, "learning_rate": 2.367370222475756e-05, "loss": 0.3212, "step": 4156 }, { "epoch": 0.7114495978093445, "grad_norm": 30.81825828552246, "learning_rate": 2.3679406731317742e-05, "loss": 2.3157, "step": 4157 }, { "epoch": 0.7116207427691255, "grad_norm": 25.597097396850586, "learning_rate": 2.3685111237877926e-05, "loss": 2.8225, "step": 4158 }, { "epoch": 0.7117918877289063, "grad_norm": 12.126123428344727, "learning_rate": 2.369081574443811e-05, "loss": 0.6904, "step": 4159 }, { "epoch": 0.7119630326886873, "grad_norm": 17.513898849487305, "learning_rate": 2.369652025099829e-05, "loss": 1.9711, "step": 4160 }, { "epoch": 0.7121341776484682, "grad_norm": 24.142879486083984, "learning_rate": 2.3702224757558472e-05, "loss": 2.5014, "step": 4161 }, { "epoch": 0.7123053226082492, "grad_norm": 26.637598037719727, "learning_rate": 2.3707929264118656e-05, "loss": 2.5949, "step": 4162 }, { "epoch": 0.7124764675680301, "grad_norm": 22.141407012939453, "learning_rate": 2.371363377067884e-05, "loss": 2.4929, "step": 4163 }, { "epoch": 0.7126476125278111, "grad_norm": 26.212926864624023, "learning_rate": 2.371933827723902e-05, "loss": 2.6947, "step": 4164 }, { "epoch": 0.7128187574875919, "grad_norm": 18.995336532592773, "learning_rate": 2.3725042783799202e-05, "loss": 1.6993, "step": 4165 }, { "epoch": 0.7129899024473729, "grad_norm": 27.700637817382812, "learning_rate": 2.3730747290359386e-05, "loss": 3.3321, "step": 4166 }, { "epoch": 0.7131610474071539, "grad_norm": 5.5868754386901855, "learning_rate": 2.3736451796919566e-05, "loss": 0.5355, "step": 4167 }, { "epoch": 0.7133321923669348, "grad_norm": 15.529837608337402, "learning_rate": 2.374215630347975e-05, "loss": 1.5037, "step": 4168 }, { "epoch": 0.7135033373267158, "grad_norm": 21.299524307250977, "learning_rate": 2.3747860810039933e-05, "loss": 1.9929, "step": 4169 }, { "epoch": 0.7136744822864967, "grad_norm": 45.21254348754883, "learning_rate": 2.3753565316600116e-05, "loss": 6.7553, "step": 4170 }, { "epoch": 0.7138456272462776, "grad_norm": 26.380695343017578, "learning_rate": 2.3759269823160296e-05, "loss": 2.6056, "step": 4171 }, { "epoch": 0.7140167722060585, "grad_norm": 3.5742998123168945, "learning_rate": 2.376497432972048e-05, "loss": 0.4821, "step": 4172 }, { "epoch": 0.7141879171658395, "grad_norm": 31.626493453979492, "learning_rate": 2.3770678836280663e-05, "loss": 3.368, "step": 4173 }, { "epoch": 0.7143590621256204, "grad_norm": 21.61958885192871, "learning_rate": 2.3776383342840843e-05, "loss": 1.709, "step": 4174 }, { "epoch": 0.7145302070854014, "grad_norm": 6.5677809715271, "learning_rate": 2.3782087849401026e-05, "loss": 0.5463, "step": 4175 }, { "epoch": 0.7147013520451823, "grad_norm": 151.24124145507812, "learning_rate": 2.378779235596121e-05, "loss": 8.6429, "step": 4176 }, { "epoch": 0.7148724970049632, "grad_norm": 34.68931579589844, "learning_rate": 2.3793496862521393e-05, "loss": 6.1887, "step": 4177 }, { "epoch": 0.7150436419647441, "grad_norm": 18.861997604370117, "learning_rate": 2.3799201369081573e-05, "loss": 1.8546, "step": 4178 }, { "epoch": 0.7152147869245251, "grad_norm": 19.337419509887695, "learning_rate": 2.380490587564176e-05, "loss": 1.8454, "step": 4179 }, { "epoch": 0.715385931884306, "grad_norm": 1.8883466720581055, "learning_rate": 2.3810610382201943e-05, "loss": 0.289, "step": 4180 }, { "epoch": 0.715557076844087, "grad_norm": 110.51686096191406, "learning_rate": 2.3816314888762123e-05, "loss": 7.8896, "step": 4181 }, { "epoch": 0.7157282218038679, "grad_norm": 69.93323516845703, "learning_rate": 2.3822019395322306e-05, "loss": 2.6675, "step": 4182 }, { "epoch": 0.7158993667636488, "grad_norm": 23.35276222229004, "learning_rate": 2.382772390188249e-05, "loss": 2.0773, "step": 4183 }, { "epoch": 0.7160705117234297, "grad_norm": 20.778461456298828, "learning_rate": 2.383342840844267e-05, "loss": 2.2745, "step": 4184 }, { "epoch": 0.7162416566832107, "grad_norm": 13.58486557006836, "learning_rate": 2.3839132915002853e-05, "loss": 1.2723, "step": 4185 }, { "epoch": 0.7164128016429916, "grad_norm": 6.069742202758789, "learning_rate": 2.3844837421563036e-05, "loss": 0.865, "step": 4186 }, { "epoch": 0.7165839466027726, "grad_norm": 21.17997169494629, "learning_rate": 2.385054192812322e-05, "loss": 1.8901, "step": 4187 }, { "epoch": 0.7167550915625535, "grad_norm": 24.12006187438965, "learning_rate": 2.38562464346834e-05, "loss": 2.7853, "step": 4188 }, { "epoch": 0.7169262365223344, "grad_norm": 13.66297721862793, "learning_rate": 2.3861950941243583e-05, "loss": 0.9545, "step": 4189 }, { "epoch": 0.7170973814821153, "grad_norm": 51.27836990356445, "learning_rate": 2.3867655447803767e-05, "loss": 1.9518, "step": 4190 }, { "epoch": 0.7172685264418963, "grad_norm": 7.221101760864258, "learning_rate": 2.3873359954363947e-05, "loss": 0.5895, "step": 4191 }, { "epoch": 0.7174396714016772, "grad_norm": 91.32466125488281, "learning_rate": 2.387906446092413e-05, "loss": 7.6073, "step": 4192 }, { "epoch": 0.7176108163614582, "grad_norm": 19.606168746948242, "learning_rate": 2.3884768967484313e-05, "loss": 1.8311, "step": 4193 }, { "epoch": 0.7177819613212391, "grad_norm": 8.86071491241455, "learning_rate": 2.3890473474044497e-05, "loss": 0.8949, "step": 4194 }, { "epoch": 0.71795310628102, "grad_norm": 21.827577590942383, "learning_rate": 2.3896177980604677e-05, "loss": 2.0555, "step": 4195 }, { "epoch": 0.7181242512408009, "grad_norm": 24.964656829833984, "learning_rate": 2.390188248716486e-05, "loss": 3.0599, "step": 4196 }, { "epoch": 0.7182953962005819, "grad_norm": 1.9594327211380005, "learning_rate": 2.3907586993725043e-05, "loss": 0.2823, "step": 4197 }, { "epoch": 0.7184665411603628, "grad_norm": 18.58384132385254, "learning_rate": 2.3913291500285223e-05, "loss": 1.8147, "step": 4198 }, { "epoch": 0.7186376861201438, "grad_norm": 22.12485122680664, "learning_rate": 2.3918996006845407e-05, "loss": 2.4456, "step": 4199 }, { "epoch": 0.7188088310799247, "grad_norm": 20.110654830932617, "learning_rate": 2.392470051340559e-05, "loss": 1.9338, "step": 4200 }, { "epoch": 0.7189799760397056, "grad_norm": 25.46828842163086, "learning_rate": 2.3930405019965774e-05, "loss": 1.6106, "step": 4201 }, { "epoch": 0.7191511209994865, "grad_norm": 10.579689979553223, "learning_rate": 2.3936109526525957e-05, "loss": 0.6947, "step": 4202 }, { "epoch": 0.7193222659592675, "grad_norm": 50.630008697509766, "learning_rate": 2.394181403308614e-05, "loss": 1.6672, "step": 4203 }, { "epoch": 0.7194934109190484, "grad_norm": 22.716222763061523, "learning_rate": 2.3947518539646324e-05, "loss": 2.0138, "step": 4204 }, { "epoch": 0.7196645558788294, "grad_norm": 16.84745216369629, "learning_rate": 2.3953223046206504e-05, "loss": 1.0839, "step": 4205 }, { "epoch": 0.7198357008386103, "grad_norm": 1.9779396057128906, "learning_rate": 2.3958927552766687e-05, "loss": 0.2798, "step": 4206 }, { "epoch": 0.7200068457983912, "grad_norm": 30.206113815307617, "learning_rate": 2.396463205932687e-05, "loss": 2.7617, "step": 4207 }, { "epoch": 0.7201779907581721, "grad_norm": 14.300946235656738, "learning_rate": 2.3970336565887054e-05, "loss": 1.4846, "step": 4208 }, { "epoch": 0.7203491357179531, "grad_norm": 14.153777122497559, "learning_rate": 2.3976041072447234e-05, "loss": 0.9053, "step": 4209 }, { "epoch": 0.720520280677734, "grad_norm": 30.488285064697266, "learning_rate": 2.3981745579007417e-05, "loss": 3.6317, "step": 4210 }, { "epoch": 0.720691425637515, "grad_norm": 40.644447326660156, "learning_rate": 2.39874500855676e-05, "loss": 6.598, "step": 4211 }, { "epoch": 0.7208625705972959, "grad_norm": 18.58864402770996, "learning_rate": 2.399315459212778e-05, "loss": 1.7715, "step": 4212 }, { "epoch": 0.7210337155570768, "grad_norm": 25.57322883605957, "learning_rate": 2.3998859098687964e-05, "loss": 2.5285, "step": 4213 }, { "epoch": 0.7212048605168577, "grad_norm": 21.300174713134766, "learning_rate": 2.4004563605248147e-05, "loss": 2.5907, "step": 4214 }, { "epoch": 0.7213760054766387, "grad_norm": 15.046341896057129, "learning_rate": 2.4010268111808327e-05, "loss": 1.2937, "step": 4215 }, { "epoch": 0.7215471504364196, "grad_norm": 23.890518188476562, "learning_rate": 2.401597261836851e-05, "loss": 2.1427, "step": 4216 }, { "epoch": 0.7217182953962006, "grad_norm": 12.539481163024902, "learning_rate": 2.4021677124928694e-05, "loss": 0.8432, "step": 4217 }, { "epoch": 0.7218894403559816, "grad_norm": 21.108722686767578, "learning_rate": 2.4027381631488877e-05, "loss": 2.3811, "step": 4218 }, { "epoch": 0.7220605853157624, "grad_norm": 18.81709861755371, "learning_rate": 2.4033086138049057e-05, "loss": 1.6914, "step": 4219 }, { "epoch": 0.7222317302755434, "grad_norm": 20.49457359313965, "learning_rate": 2.403879064460924e-05, "loss": 1.9627, "step": 4220 }, { "epoch": 0.7224028752353243, "grad_norm": 9.025213241577148, "learning_rate": 2.4044495151169424e-05, "loss": 0.6996, "step": 4221 }, { "epoch": 0.7225740201951053, "grad_norm": 2.5068697929382324, "learning_rate": 2.4050199657729604e-05, "loss": 0.3039, "step": 4222 }, { "epoch": 0.7227451651548862, "grad_norm": 9.711918830871582, "learning_rate": 2.4055904164289788e-05, "loss": 1.3666, "step": 4223 }, { "epoch": 0.7229163101146672, "grad_norm": 17.298583984375, "learning_rate": 2.406160867084997e-05, "loss": 1.5982, "step": 4224 }, { "epoch": 0.723087455074448, "grad_norm": 8.982203483581543, "learning_rate": 2.4067313177410158e-05, "loss": 0.6486, "step": 4225 }, { "epoch": 0.723258600034229, "grad_norm": 24.49521827697754, "learning_rate": 2.4073017683970338e-05, "loss": 3.2304, "step": 4226 }, { "epoch": 0.7234297449940099, "grad_norm": 21.123031616210938, "learning_rate": 2.407872219053052e-05, "loss": 1.8011, "step": 4227 }, { "epoch": 0.7236008899537909, "grad_norm": 9.760648727416992, "learning_rate": 2.4084426697090704e-05, "loss": 0.6659, "step": 4228 }, { "epoch": 0.7237720349135718, "grad_norm": 8.795894622802734, "learning_rate": 2.4090131203650884e-05, "loss": 0.8128, "step": 4229 }, { "epoch": 0.7239431798733528, "grad_norm": 5.41191291809082, "learning_rate": 2.4095835710211068e-05, "loss": 0.4972, "step": 4230 }, { "epoch": 0.7241143248331336, "grad_norm": 6.078951358795166, "learning_rate": 2.410154021677125e-05, "loss": 0.6283, "step": 4231 }, { "epoch": 0.7242854697929146, "grad_norm": 9.235028266906738, "learning_rate": 2.4107244723331435e-05, "loss": 1.1542, "step": 4232 }, { "epoch": 0.7244566147526955, "grad_norm": 18.9094295501709, "learning_rate": 2.4112949229891615e-05, "loss": 1.3619, "step": 4233 }, { "epoch": 0.7246277597124765, "grad_norm": 8.397602081298828, "learning_rate": 2.4118653736451798e-05, "loss": 0.5137, "step": 4234 }, { "epoch": 0.7247989046722574, "grad_norm": 35.325618743896484, "learning_rate": 2.412435824301198e-05, "loss": 5.9278, "step": 4235 }, { "epoch": 0.7249700496320384, "grad_norm": 7.471834182739258, "learning_rate": 2.413006274957216e-05, "loss": 0.5769, "step": 4236 }, { "epoch": 0.7251411945918192, "grad_norm": 10.854155540466309, "learning_rate": 2.4135767256132345e-05, "loss": 0.8743, "step": 4237 }, { "epoch": 0.7253123395516002, "grad_norm": 26.70799446105957, "learning_rate": 2.4141471762692528e-05, "loss": 2.6374, "step": 4238 }, { "epoch": 0.7254834845113811, "grad_norm": 23.99932861328125, "learning_rate": 2.414717626925271e-05, "loss": 1.9431, "step": 4239 }, { "epoch": 0.7256546294711621, "grad_norm": 7.719635963439941, "learning_rate": 2.415288077581289e-05, "loss": 0.5437, "step": 4240 }, { "epoch": 0.725825774430943, "grad_norm": 11.576183319091797, "learning_rate": 2.4158585282373075e-05, "loss": 0.8931, "step": 4241 }, { "epoch": 0.725996919390724, "grad_norm": 1.2357739210128784, "learning_rate": 2.4164289788933258e-05, "loss": 0.2441, "step": 4242 }, { "epoch": 0.7261680643505048, "grad_norm": 32.51354217529297, "learning_rate": 2.4169994295493438e-05, "loss": 5.9717, "step": 4243 }, { "epoch": 0.7263392093102858, "grad_norm": 20.202180862426758, "learning_rate": 2.417569880205362e-05, "loss": 1.909, "step": 4244 }, { "epoch": 0.7265103542700667, "grad_norm": 84.00714111328125, "learning_rate": 2.4181403308613805e-05, "loss": 1.9052, "step": 4245 }, { "epoch": 0.7266814992298477, "grad_norm": 28.061588287353516, "learning_rate": 2.4187107815173985e-05, "loss": 1.4887, "step": 4246 }, { "epoch": 0.7268526441896286, "grad_norm": 2.4384922981262207, "learning_rate": 2.419281232173417e-05, "loss": 0.2672, "step": 4247 }, { "epoch": 0.7270237891494096, "grad_norm": 2.515739679336548, "learning_rate": 2.4198516828294355e-05, "loss": 0.2884, "step": 4248 }, { "epoch": 0.7271949341091904, "grad_norm": 29.182708740234375, "learning_rate": 2.420422133485454e-05, "loss": 3.5217, "step": 4249 }, { "epoch": 0.7273660790689714, "grad_norm": 16.3360538482666, "learning_rate": 2.420992584141472e-05, "loss": 1.7574, "step": 4250 }, { "epoch": 0.7275372240287523, "grad_norm": 37.92715072631836, "learning_rate": 2.4215630347974902e-05, "loss": 3.4675, "step": 4251 }, { "epoch": 0.7277083689885333, "grad_norm": 26.36214256286621, "learning_rate": 2.4221334854535085e-05, "loss": 3.8431, "step": 4252 }, { "epoch": 0.7278795139483142, "grad_norm": 26.715503692626953, "learning_rate": 2.4227039361095265e-05, "loss": 2.7822, "step": 4253 }, { "epoch": 0.7280506589080952, "grad_norm": 27.2398624420166, "learning_rate": 2.423274386765545e-05, "loss": 3.1297, "step": 4254 }, { "epoch": 0.728221803867876, "grad_norm": 30.56747055053711, "learning_rate": 2.4238448374215632e-05, "loss": 5.8095, "step": 4255 }, { "epoch": 0.728392948827657, "grad_norm": 7.464992523193359, "learning_rate": 2.4244152880775815e-05, "loss": 0.6275, "step": 4256 }, { "epoch": 0.7285640937874379, "grad_norm": 12.612624168395996, "learning_rate": 2.4249857387335995e-05, "loss": 0.8099, "step": 4257 }, { "epoch": 0.7287352387472189, "grad_norm": 24.279560089111328, "learning_rate": 2.425556189389618e-05, "loss": 2.72, "step": 4258 }, { "epoch": 0.7289063837069998, "grad_norm": 21.02090835571289, "learning_rate": 2.4261266400456362e-05, "loss": 2.1918, "step": 4259 }, { "epoch": 0.7290775286667808, "grad_norm": 4.548274040222168, "learning_rate": 2.4266970907016542e-05, "loss": 0.3331, "step": 4260 }, { "epoch": 0.7292486736265616, "grad_norm": 20.990015029907227, "learning_rate": 2.4272675413576725e-05, "loss": 2.199, "step": 4261 }, { "epoch": 0.7294198185863426, "grad_norm": 3.5553269386291504, "learning_rate": 2.427837992013691e-05, "loss": 0.3964, "step": 4262 }, { "epoch": 0.7295909635461235, "grad_norm": 17.434099197387695, "learning_rate": 2.4284084426697092e-05, "loss": 1.7278, "step": 4263 }, { "epoch": 0.7297621085059045, "grad_norm": 26.248044967651367, "learning_rate": 2.4289788933257272e-05, "loss": 3.4657, "step": 4264 }, { "epoch": 0.7299332534656854, "grad_norm": 26.377473831176758, "learning_rate": 2.4295493439817456e-05, "loss": 2.8837, "step": 4265 }, { "epoch": 0.7301043984254664, "grad_norm": 29.46402931213379, "learning_rate": 2.430119794637764e-05, "loss": 2.9555, "step": 4266 }, { "epoch": 0.7302755433852472, "grad_norm": 27.894542694091797, "learning_rate": 2.430690245293782e-05, "loss": 2.4692, "step": 4267 }, { "epoch": 0.7304466883450282, "grad_norm": 23.581409454345703, "learning_rate": 2.4312606959498002e-05, "loss": 2.2207, "step": 4268 }, { "epoch": 0.7306178333048092, "grad_norm": 26.48499870300293, "learning_rate": 2.4318311466058186e-05, "loss": 3.1471, "step": 4269 }, { "epoch": 0.7307889782645901, "grad_norm": 1.8997199535369873, "learning_rate": 2.432401597261837e-05, "loss": 0.2555, "step": 4270 }, { "epoch": 0.7309601232243711, "grad_norm": 27.24410057067871, "learning_rate": 2.4329720479178552e-05, "loss": 5.7181, "step": 4271 }, { "epoch": 0.731131268184152, "grad_norm": 20.901037216186523, "learning_rate": 2.4335424985738736e-05, "loss": 1.8875, "step": 4272 }, { "epoch": 0.731302413143933, "grad_norm": 14.34110164642334, "learning_rate": 2.434112949229892e-05, "loss": 1.1739, "step": 4273 }, { "epoch": 0.7314735581037138, "grad_norm": 1.0765384435653687, "learning_rate": 2.43468339988591e-05, "loss": 0.2476, "step": 4274 }, { "epoch": 0.7316447030634948, "grad_norm": 32.98508834838867, "learning_rate": 2.4352538505419283e-05, "loss": 1.3042, "step": 4275 }, { "epoch": 0.7318158480232757, "grad_norm": 16.529098510742188, "learning_rate": 2.4358243011979466e-05, "loss": 1.3914, "step": 4276 }, { "epoch": 0.7319869929830567, "grad_norm": 25.088991165161133, "learning_rate": 2.4363947518539646e-05, "loss": 1.8855, "step": 4277 }, { "epoch": 0.7321581379428376, "grad_norm": 16.27592658996582, "learning_rate": 2.436965202509983e-05, "loss": 1.2685, "step": 4278 }, { "epoch": 0.7323292829026186, "grad_norm": 24.759716033935547, "learning_rate": 2.4375356531660013e-05, "loss": 2.4846, "step": 4279 }, { "epoch": 0.7325004278623994, "grad_norm": 30.153457641601562, "learning_rate": 2.4381061038220196e-05, "loss": 3.6713, "step": 4280 }, { "epoch": 0.7326715728221804, "grad_norm": 25.423839569091797, "learning_rate": 2.4386765544780376e-05, "loss": 2.4835, "step": 4281 }, { "epoch": 0.7328427177819613, "grad_norm": 17.405637741088867, "learning_rate": 2.439247005134056e-05, "loss": 1.6224, "step": 4282 }, { "epoch": 0.7330138627417423, "grad_norm": 21.038610458374023, "learning_rate": 2.4398174557900743e-05, "loss": 1.7716, "step": 4283 }, { "epoch": 0.7331850077015232, "grad_norm": 30.59052848815918, "learning_rate": 2.4403879064460923e-05, "loss": 2.7644, "step": 4284 }, { "epoch": 0.7333561526613042, "grad_norm": 22.464582443237305, "learning_rate": 2.4409583571021106e-05, "loss": 1.891, "step": 4285 }, { "epoch": 0.733527297621085, "grad_norm": 7.699871063232422, "learning_rate": 2.441528807758129e-05, "loss": 0.7411, "step": 4286 }, { "epoch": 0.733698442580866, "grad_norm": 4.126267910003662, "learning_rate": 2.4420992584141473e-05, "loss": 0.4547, "step": 4287 }, { "epoch": 0.7338695875406469, "grad_norm": 35.644535064697266, "learning_rate": 2.4426697090701653e-05, "loss": 6.2534, "step": 4288 }, { "epoch": 0.7340407325004279, "grad_norm": 28.631427764892578, "learning_rate": 2.4432401597261836e-05, "loss": 2.9862, "step": 4289 }, { "epoch": 0.7342118774602088, "grad_norm": 2.322627067565918, "learning_rate": 2.443810610382202e-05, "loss": 0.2372, "step": 4290 }, { "epoch": 0.7343830224199898, "grad_norm": 24.10704803466797, "learning_rate": 2.44438106103822e-05, "loss": 2.0144, "step": 4291 }, { "epoch": 0.7345541673797706, "grad_norm": 30.596025466918945, "learning_rate": 2.4449515116942383e-05, "loss": 4.3575, "step": 4292 }, { "epoch": 0.7347253123395516, "grad_norm": 16.235584259033203, "learning_rate": 2.4455219623502566e-05, "loss": 1.3492, "step": 4293 }, { "epoch": 0.7348964572993325, "grad_norm": 8.513513565063477, "learning_rate": 2.4460924130062753e-05, "loss": 0.6521, "step": 4294 }, { "epoch": 0.7350676022591135, "grad_norm": 20.195348739624023, "learning_rate": 2.4466628636622933e-05, "loss": 2.3305, "step": 4295 }, { "epoch": 0.7352387472188944, "grad_norm": 18.629962921142578, "learning_rate": 2.4472333143183117e-05, "loss": 1.8291, "step": 4296 }, { "epoch": 0.7354098921786754, "grad_norm": 26.3973388671875, "learning_rate": 2.44780376497433e-05, "loss": 3.334, "step": 4297 }, { "epoch": 0.7355810371384562, "grad_norm": 0.9989029765129089, "learning_rate": 2.448374215630348e-05, "loss": 0.2261, "step": 4298 }, { "epoch": 0.7357521820982372, "grad_norm": 22.981550216674805, "learning_rate": 2.4489446662863663e-05, "loss": 2.0884, "step": 4299 }, { "epoch": 0.7359233270580181, "grad_norm": 22.626544952392578, "learning_rate": 2.4495151169423847e-05, "loss": 2.1202, "step": 4300 }, { "epoch": 0.7360944720177991, "grad_norm": 16.54318618774414, "learning_rate": 2.450085567598403e-05, "loss": 1.5155, "step": 4301 }, { "epoch": 0.73626561697758, "grad_norm": 116.46958923339844, "learning_rate": 2.450656018254421e-05, "loss": 8.2721, "step": 4302 }, { "epoch": 0.736436761937361, "grad_norm": 19.956342697143555, "learning_rate": 2.4512264689104393e-05, "loss": 1.8512, "step": 4303 }, { "epoch": 0.7366079068971418, "grad_norm": 27.972797393798828, "learning_rate": 2.4517969195664577e-05, "loss": 2.7114, "step": 4304 }, { "epoch": 0.7367790518569228, "grad_norm": 11.407204627990723, "learning_rate": 2.4523673702224757e-05, "loss": 0.9488, "step": 4305 }, { "epoch": 0.7369501968167037, "grad_norm": 33.092105865478516, "learning_rate": 2.452937820878494e-05, "loss": 2.9253, "step": 4306 }, { "epoch": 0.7371213417764847, "grad_norm": 10.819000244140625, "learning_rate": 2.4535082715345124e-05, "loss": 0.7126, "step": 4307 }, { "epoch": 0.7372924867362656, "grad_norm": 7.100123882293701, "learning_rate": 2.4540787221905307e-05, "loss": 0.5677, "step": 4308 }, { "epoch": 0.7374636316960466, "grad_norm": 3.432849407196045, "learning_rate": 2.4546491728465487e-05, "loss": 0.2723, "step": 4309 }, { "epoch": 0.7376347766558274, "grad_norm": 22.43340492248535, "learning_rate": 2.455219623502567e-05, "loss": 2.1355, "step": 4310 }, { "epoch": 0.7378059216156084, "grad_norm": 22.277841567993164, "learning_rate": 2.4557900741585854e-05, "loss": 3.1528, "step": 4311 }, { "epoch": 0.7379770665753893, "grad_norm": 26.116939544677734, "learning_rate": 2.4563605248146034e-05, "loss": 2.7783, "step": 4312 }, { "epoch": 0.7381482115351703, "grad_norm": 21.692012786865234, "learning_rate": 2.4569309754706217e-05, "loss": 2.08, "step": 4313 }, { "epoch": 0.7383193564949512, "grad_norm": 2.461122751235962, "learning_rate": 2.45750142612664e-05, "loss": 0.2903, "step": 4314 }, { "epoch": 0.7384905014547322, "grad_norm": 26.099124908447266, "learning_rate": 2.458071876782658e-05, "loss": 2.0427, "step": 4315 }, { "epoch": 0.738661646414513, "grad_norm": 18.19993782043457, "learning_rate": 2.4586423274386764e-05, "loss": 1.5727, "step": 4316 }, { "epoch": 0.738832791374294, "grad_norm": 28.66299057006836, "learning_rate": 2.459212778094695e-05, "loss": 2.5081, "step": 4317 }, { "epoch": 0.7390039363340749, "grad_norm": 27.575727462768555, "learning_rate": 2.4597832287507134e-05, "loss": 2.7801, "step": 4318 }, { "epoch": 0.7391750812938559, "grad_norm": 46.224586486816406, "learning_rate": 2.4603536794067314e-05, "loss": 6.5383, "step": 4319 }, { "epoch": 0.7393462262536369, "grad_norm": 9.273065567016602, "learning_rate": 2.4609241300627497e-05, "loss": 1.2797, "step": 4320 }, { "epoch": 0.7395173712134178, "grad_norm": 12.391923904418945, "learning_rate": 2.461494580718768e-05, "loss": 1.0646, "step": 4321 }, { "epoch": 0.7396885161731988, "grad_norm": 33.82769012451172, "learning_rate": 2.462065031374786e-05, "loss": 4.4309, "step": 4322 }, { "epoch": 0.7398596611329796, "grad_norm": 9.532022476196289, "learning_rate": 2.4626354820308044e-05, "loss": 1.0837, "step": 4323 }, { "epoch": 0.7400308060927606, "grad_norm": 16.09440803527832, "learning_rate": 2.4632059326868227e-05, "loss": 1.2765, "step": 4324 }, { "epoch": 0.7402019510525415, "grad_norm": 1.2351820468902588, "learning_rate": 2.463776383342841e-05, "loss": 0.2564, "step": 4325 }, { "epoch": 0.7403730960123225, "grad_norm": 31.442874908447266, "learning_rate": 2.464346833998859e-05, "loss": 3.7296, "step": 4326 }, { "epoch": 0.7405442409721034, "grad_norm": 21.058252334594727, "learning_rate": 2.4649172846548774e-05, "loss": 1.7717, "step": 4327 }, { "epoch": 0.7407153859318844, "grad_norm": 25.62652015686035, "learning_rate": 2.4654877353108958e-05, "loss": 2.1194, "step": 4328 }, { "epoch": 0.7408865308916652, "grad_norm": 16.54743003845215, "learning_rate": 2.4660581859669138e-05, "loss": 1.6091, "step": 4329 }, { "epoch": 0.7410576758514462, "grad_norm": 9.183897018432617, "learning_rate": 2.466628636622932e-05, "loss": 0.7841, "step": 4330 }, { "epoch": 0.7412288208112271, "grad_norm": 12.366703987121582, "learning_rate": 2.4671990872789504e-05, "loss": 1.2934, "step": 4331 }, { "epoch": 0.7413999657710081, "grad_norm": 9.97996997833252, "learning_rate": 2.4677695379349688e-05, "loss": 0.648, "step": 4332 }, { "epoch": 0.741571110730789, "grad_norm": 4.609592437744141, "learning_rate": 2.4683399885909868e-05, "loss": 0.4228, "step": 4333 }, { "epoch": 0.74174225569057, "grad_norm": 40.74041748046875, "learning_rate": 2.468910439247005e-05, "loss": 6.2855, "step": 4334 }, { "epoch": 0.7419134006503508, "grad_norm": 34.59137725830078, "learning_rate": 2.4694808899030234e-05, "loss": 5.9909, "step": 4335 }, { "epoch": 0.7420845456101318, "grad_norm": 8.708815574645996, "learning_rate": 2.4700513405590414e-05, "loss": 0.7173, "step": 4336 }, { "epoch": 0.7422556905699127, "grad_norm": 16.282533645629883, "learning_rate": 2.4706217912150598e-05, "loss": 1.7374, "step": 4337 }, { "epoch": 0.7424268355296937, "grad_norm": 11.787439346313477, "learning_rate": 2.471192241871078e-05, "loss": 0.7362, "step": 4338 }, { "epoch": 0.7425979804894746, "grad_norm": 6.672769546508789, "learning_rate": 2.4717626925270968e-05, "loss": 0.8264, "step": 4339 }, { "epoch": 0.7427691254492556, "grad_norm": 11.766725540161133, "learning_rate": 2.4723331431831148e-05, "loss": 1.1676, "step": 4340 }, { "epoch": 0.7429402704090364, "grad_norm": 40.45454788208008, "learning_rate": 2.472903593839133e-05, "loss": 3.9712, "step": 4341 }, { "epoch": 0.7431114153688174, "grad_norm": 26.344924926757812, "learning_rate": 2.4734740444951515e-05, "loss": 5.5252, "step": 4342 }, { "epoch": 0.7432825603285983, "grad_norm": 23.83209228515625, "learning_rate": 2.4740444951511695e-05, "loss": 2.138, "step": 4343 }, { "epoch": 0.7434537052883793, "grad_norm": 31.80006980895996, "learning_rate": 2.4746149458071878e-05, "loss": 3.0915, "step": 4344 }, { "epoch": 0.7436248502481602, "grad_norm": 6.755828380584717, "learning_rate": 2.475185396463206e-05, "loss": 0.5986, "step": 4345 }, { "epoch": 0.7437959952079412, "grad_norm": 20.988128662109375, "learning_rate": 2.475755847119224e-05, "loss": 2.3074, "step": 4346 }, { "epoch": 0.743967140167722, "grad_norm": 15.407384872436523, "learning_rate": 2.4763262977752425e-05, "loss": 1.2107, "step": 4347 }, { "epoch": 0.744138285127503, "grad_norm": 189.76187133789062, "learning_rate": 2.4768967484312608e-05, "loss": 7.4887, "step": 4348 }, { "epoch": 0.7443094300872839, "grad_norm": 15.863279342651367, "learning_rate": 2.477467199087279e-05, "loss": 1.4751, "step": 4349 }, { "epoch": 0.7444805750470649, "grad_norm": 15.989771842956543, "learning_rate": 2.478037649743297e-05, "loss": 1.5714, "step": 4350 }, { "epoch": 0.7446517200068458, "grad_norm": 22.554706573486328, "learning_rate": 2.4786081003993155e-05, "loss": 2.0379, "step": 4351 }, { "epoch": 0.7448228649666268, "grad_norm": 19.29569435119629, "learning_rate": 2.479178551055334e-05, "loss": 1.6695, "step": 4352 }, { "epoch": 0.7449940099264076, "grad_norm": 32.85340881347656, "learning_rate": 2.479749001711352e-05, "loss": 6.1252, "step": 4353 }, { "epoch": 0.7451651548861886, "grad_norm": 7.867766380310059, "learning_rate": 2.4803194523673702e-05, "loss": 0.5292, "step": 4354 }, { "epoch": 0.7453362998459695, "grad_norm": 31.13640022277832, "learning_rate": 2.4808899030233885e-05, "loss": 3.6394, "step": 4355 }, { "epoch": 0.7455074448057505, "grad_norm": 29.437509536743164, "learning_rate": 2.481460353679407e-05, "loss": 5.7376, "step": 4356 }, { "epoch": 0.7456785897655314, "grad_norm": 22.40192222595215, "learning_rate": 2.482030804335425e-05, "loss": 2.1447, "step": 4357 }, { "epoch": 0.7458497347253124, "grad_norm": 16.596187591552734, "learning_rate": 2.4826012549914432e-05, "loss": 1.5623, "step": 4358 }, { "epoch": 0.7460208796850932, "grad_norm": 18.423601150512695, "learning_rate": 2.4831717056474615e-05, "loss": 1.8746, "step": 4359 }, { "epoch": 0.7461920246448742, "grad_norm": 27.663909912109375, "learning_rate": 2.4837421563034795e-05, "loss": 3.0094, "step": 4360 }, { "epoch": 0.7463631696046551, "grad_norm": 22.51800537109375, "learning_rate": 2.484312606959498e-05, "loss": 2.4867, "step": 4361 }, { "epoch": 0.7465343145644361, "grad_norm": 14.807696342468262, "learning_rate": 2.4848830576155165e-05, "loss": 1.3953, "step": 4362 }, { "epoch": 0.746705459524217, "grad_norm": 27.524484634399414, "learning_rate": 2.485453508271535e-05, "loss": 2.9434, "step": 4363 }, { "epoch": 0.746876604483998, "grad_norm": 20.50353240966797, "learning_rate": 2.486023958927553e-05, "loss": 1.9921, "step": 4364 }, { "epoch": 0.7470477494437788, "grad_norm": 8.223814010620117, "learning_rate": 2.4865944095835712e-05, "loss": 1.2869, "step": 4365 }, { "epoch": 0.7472188944035598, "grad_norm": 9.882466316223145, "learning_rate": 2.4871648602395896e-05, "loss": 0.8515, "step": 4366 }, { "epoch": 0.7473900393633407, "grad_norm": 9.574341773986816, "learning_rate": 2.4877353108956075e-05, "loss": 0.617, "step": 4367 }, { "epoch": 0.7475611843231217, "grad_norm": 65.22699737548828, "learning_rate": 2.488305761551626e-05, "loss": 3.1356, "step": 4368 }, { "epoch": 0.7477323292829026, "grad_norm": 3.961726427078247, "learning_rate": 2.4888762122076442e-05, "loss": 0.4726, "step": 4369 }, { "epoch": 0.7479034742426836, "grad_norm": 24.925546646118164, "learning_rate": 2.4894466628636626e-05, "loss": 2.5579, "step": 4370 }, { "epoch": 0.7480746192024645, "grad_norm": 18.773216247558594, "learning_rate": 2.4900171135196806e-05, "loss": 1.7584, "step": 4371 }, { "epoch": 0.7482457641622454, "grad_norm": 29.555532455444336, "learning_rate": 2.490587564175699e-05, "loss": 6.1397, "step": 4372 }, { "epoch": 0.7484169091220264, "grad_norm": 24.256031036376953, "learning_rate": 2.4911580148317172e-05, "loss": 1.9726, "step": 4373 }, { "epoch": 0.7485880540818073, "grad_norm": 16.05461883544922, "learning_rate": 2.4917284654877352e-05, "loss": 1.264, "step": 4374 }, { "epoch": 0.7487591990415883, "grad_norm": 24.213459014892578, "learning_rate": 2.4922989161437536e-05, "loss": 2.3189, "step": 4375 }, { "epoch": 0.7489303440013692, "grad_norm": 29.8961181640625, "learning_rate": 2.492869366799772e-05, "loss": 3.6839, "step": 4376 }, { "epoch": 0.7491014889611501, "grad_norm": 22.435625076293945, "learning_rate": 2.4934398174557903e-05, "loss": 2.2289, "step": 4377 }, { "epoch": 0.749272633920931, "grad_norm": 30.220745086669922, "learning_rate": 2.4940102681118082e-05, "loss": 4.4896, "step": 4378 }, { "epoch": 0.749443778880712, "grad_norm": 18.16118049621582, "learning_rate": 2.4945807187678266e-05, "loss": 1.8377, "step": 4379 }, { "epoch": 0.7496149238404929, "grad_norm": 27.19329833984375, "learning_rate": 2.495151169423845e-05, "loss": 2.8044, "step": 4380 }, { "epoch": 0.7497860688002739, "grad_norm": 22.332542419433594, "learning_rate": 2.495721620079863e-05, "loss": 1.8498, "step": 4381 }, { "epoch": 0.7499572137600548, "grad_norm": 4.663109302520752, "learning_rate": 2.4962920707358813e-05, "loss": 0.5393, "step": 4382 }, { "epoch": 0.7501283587198357, "grad_norm": 18.246469497680664, "learning_rate": 2.4968625213918996e-05, "loss": 1.5186, "step": 4383 }, { "epoch": 0.7502995036796166, "grad_norm": 28.977493286132812, "learning_rate": 2.4974329720479176e-05, "loss": 2.8719, "step": 4384 }, { "epoch": 0.7504706486393976, "grad_norm": 6.890769958496094, "learning_rate": 2.4980034227039363e-05, "loss": 0.601, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_nli-pairs_loss": 2.050740957260132, "eval_nli-pairs_runtime": 4.2795, "eval_nli-pairs_samples_per_second": 46.734, "eval_nli-pairs_steps_per_second": 1.636, "eval_sts-test_pearson_cosine": 0.756734064986887, "eval_sts-test_pearson_dot": 0.6528865740820513, "eval_sts-test_pearson_euclidean": 0.7545477323381371, "eval_sts-test_pearson_manhattan": 0.7602184258166524, "eval_sts-test_pearson_max": 0.7602184258166524, "eval_sts-test_spearman_cosine": 0.7444733315413253, "eval_sts-test_spearman_dot": 0.6319213377688324, "eval_sts-test_spearman_euclidean": 0.7398981584440489, "eval_sts-test_spearman_manhattan": 0.7468720146418238, "eval_sts-test_spearman_max": 0.7468720146418238, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_vitaminc-pairs_loss": 1.3987665176391602, "eval_vitaminc-pairs_runtime": 2.7296, "eval_vitaminc-pairs_samples_per_second": 73.272, "eval_vitaminc-pairs_steps_per_second": 2.565, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_qnli-contrastive_loss": 2.7516510486602783, "eval_qnli-contrastive_runtime": 0.6347, "eval_qnli-contrastive_samples_per_second": 315.112, "eval_qnli-contrastive_steps_per_second": 11.029, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_scitail-pairs-qa_loss": 0.22120414674282074, "eval_scitail-pairs-qa_runtime": 1.6102, "eval_scitail-pairs-qa_samples_per_second": 124.21, "eval_scitail-pairs-qa_steps_per_second": 4.347, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_scitail-pairs-pos_loss": 0.9076427817344666, "eval_scitail-pairs-pos_runtime": 2.6161, "eval_scitail-pairs-pos_samples_per_second": 76.449, "eval_scitail-pairs-pos_steps_per_second": 2.676, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_xsum-pairs_loss": 1.0805269479751587, "eval_xsum-pairs_runtime": 2.6446, "eval_xsum-pairs_samples_per_second": 66.172, "eval_xsum-pairs_steps_per_second": 2.269, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_compression-pairs_loss": 0.44501441717147827, "eval_compression-pairs_runtime": 0.5283, "eval_compression-pairs_samples_per_second": 378.589, "eval_compression-pairs_steps_per_second": 13.251, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_sciq_pairs_loss": 5.368130207061768, "eval_sciq_pairs_runtime": 9.1813, "eval_sciq_pairs_samples_per_second": 21.783, "eval_sciq_pairs_steps_per_second": 0.762, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_qasc_pairs_loss": 5.916055679321289, "eval_qasc_pairs_runtime": 2.6536, "eval_qasc_pairs_samples_per_second": 75.369, "eval_qasc_pairs_steps_per_second": 2.638, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_openbookqa_pairs_loss": 3.2691121101379395, "eval_openbookqa_pairs_runtime": 0.6379, "eval_openbookqa_pairs_samples_per_second": 108.16, "eval_openbookqa_pairs_steps_per_second": 4.703, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_msmarco_pairs_loss": 1.845609426498413, "eval_msmarco_pairs_runtime": 3.9718, "eval_msmarco_pairs_samples_per_second": 50.355, "eval_msmarco_pairs_steps_per_second": 1.762, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_nq_pairs_loss": 2.279620409011841, "eval_nq_pairs_runtime": 8.6017, "eval_nq_pairs_samples_per_second": 23.251, "eval_nq_pairs_steps_per_second": 0.814, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_trivia_pairs_loss": 2.464531421661377, "eval_trivia_pairs_runtime": 12.8394, "eval_trivia_pairs_samples_per_second": 15.577, "eval_trivia_pairs_steps_per_second": 0.545, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_quora_pairs_loss": 0.40776023268699646, "eval_quora_pairs_runtime": 1.5837, "eval_quora_pairs_samples_per_second": 126.29, "eval_quora_pairs_steps_per_second": 4.42, "step": 4385 }, { "epoch": 0.7504706486393976, "eval_gooaq_pairs_loss": 1.4506279230117798, "eval_gooaq_pairs_runtime": 2.6527, "eval_gooaq_pairs_samples_per_second": 75.396, "eval_gooaq_pairs_steps_per_second": 2.639, "step": 4385 }, { "epoch": 0.7506417935991785, "grad_norm": 29.356266021728516, "learning_rate": 2.4985738733599546e-05, "loss": 2.6466, "step": 4386 }, { "epoch": 0.7508129385589595, "grad_norm": 20.16213607788086, "learning_rate": 2.499144324015973e-05, "loss": 2.096, "step": 4387 }, { "epoch": 0.7509840835187404, "grad_norm": 6.681097984313965, "learning_rate": 2.499714774671991e-05, "loss": 0.4151, "step": 4388 }, { "epoch": 0.7511552284785213, "grad_norm": 28.097131729125977, "learning_rate": 2.5002852253280093e-05, "loss": 2.6453, "step": 4389 }, { "epoch": 0.7513263734383022, "grad_norm": 23.283533096313477, "learning_rate": 2.5008556759840276e-05, "loss": 1.8409, "step": 4390 }, { "epoch": 0.7514975183980832, "grad_norm": 17.86503791809082, "learning_rate": 2.5014261266400456e-05, "loss": 1.9202, "step": 4391 }, { "epoch": 0.7516686633578641, "grad_norm": 24.048988342285156, "learning_rate": 2.501996577296064e-05, "loss": 2.8925, "step": 4392 }, { "epoch": 0.7518398083176451, "grad_norm": 14.309977531433105, "learning_rate": 2.5025670279520823e-05, "loss": 1.2498, "step": 4393 }, { "epoch": 0.752010953277426, "grad_norm": 23.62467384338379, "learning_rate": 2.5031374786081006e-05, "loss": 2.1623, "step": 4394 }, { "epoch": 0.7521820982372069, "grad_norm": 13.426614761352539, "learning_rate": 2.5037079292641186e-05, "loss": 1.299, "step": 4395 }, { "epoch": 0.7523532431969878, "grad_norm": 8.141080856323242, "learning_rate": 2.504278379920137e-05, "loss": 1.2283, "step": 4396 }, { "epoch": 0.7525243881567688, "grad_norm": 21.36097526550293, "learning_rate": 2.5048488305761553e-05, "loss": 1.5297, "step": 4397 }, { "epoch": 0.7526955331165497, "grad_norm": 50.34371566772461, "learning_rate": 2.5054192812321733e-05, "loss": 2.2539, "step": 4398 }, { "epoch": 0.7528666780763307, "grad_norm": 1.270694613456726, "learning_rate": 2.5059897318881917e-05, "loss": 0.2376, "step": 4399 }, { "epoch": 0.7530378230361116, "grad_norm": 25.70792579650879, "learning_rate": 2.50656018254421e-05, "loss": 2.8688, "step": 4400 }, { "epoch": 0.7532089679958925, "grad_norm": 21.675689697265625, "learning_rate": 2.5071306332002283e-05, "loss": 2.2722, "step": 4401 }, { "epoch": 0.7533801129556734, "grad_norm": 0.9436509013175964, "learning_rate": 2.5077010838562463e-05, "loss": 0.2405, "step": 4402 }, { "epoch": 0.7535512579154544, "grad_norm": 17.242216110229492, "learning_rate": 2.5082715345122647e-05, "loss": 1.5298, "step": 4403 }, { "epoch": 0.7537224028752353, "grad_norm": 23.45809555053711, "learning_rate": 2.508841985168283e-05, "loss": 1.9691, "step": 4404 }, { "epoch": 0.7538935478350163, "grad_norm": 3.92891788482666, "learning_rate": 2.509412435824301e-05, "loss": 0.555, "step": 4405 }, { "epoch": 0.7540646927947972, "grad_norm": 22.707956314086914, "learning_rate": 2.5099828864803193e-05, "loss": 2.3504, "step": 4406 }, { "epoch": 0.7542358377545781, "grad_norm": 20.21977996826172, "learning_rate": 2.5105533371363377e-05, "loss": 2.0231, "step": 4407 }, { "epoch": 0.754406982714359, "grad_norm": 1.7529772520065308, "learning_rate": 2.5111237877923564e-05, "loss": 0.2485, "step": 4408 }, { "epoch": 0.75457812767414, "grad_norm": 1.6466134786605835, "learning_rate": 2.5116942384483744e-05, "loss": 0.2669, "step": 4409 }, { "epoch": 0.7547492726339209, "grad_norm": 17.274892807006836, "learning_rate": 2.5122646891043927e-05, "loss": 1.6309, "step": 4410 }, { "epoch": 0.7549204175937019, "grad_norm": 21.545635223388672, "learning_rate": 2.512835139760411e-05, "loss": 2.004, "step": 4411 }, { "epoch": 0.7550915625534828, "grad_norm": 12.705281257629395, "learning_rate": 2.513405590416429e-05, "loss": 0.8108, "step": 4412 }, { "epoch": 0.7552627075132637, "grad_norm": 21.702417373657227, "learning_rate": 2.5139760410724474e-05, "loss": 1.9521, "step": 4413 }, { "epoch": 0.7554338524730446, "grad_norm": 5.507816314697266, "learning_rate": 2.5145464917284657e-05, "loss": 0.4295, "step": 4414 }, { "epoch": 0.7556049974328256, "grad_norm": 44.05494689941406, "learning_rate": 2.5151169423844837e-05, "loss": 2.7, "step": 4415 }, { "epoch": 0.7557761423926065, "grad_norm": 23.11194610595703, "learning_rate": 2.515687393040502e-05, "loss": 2.84, "step": 4416 }, { "epoch": 0.7559472873523875, "grad_norm": 26.601444244384766, "learning_rate": 2.5162578436965204e-05, "loss": 2.6999, "step": 4417 }, { "epoch": 0.7561184323121684, "grad_norm": 21.454803466796875, "learning_rate": 2.5168282943525387e-05, "loss": 2.186, "step": 4418 }, { "epoch": 0.7562895772719493, "grad_norm": 19.278148651123047, "learning_rate": 2.5173987450085567e-05, "loss": 1.9123, "step": 4419 }, { "epoch": 0.7564607222317302, "grad_norm": 25.827354431152344, "learning_rate": 2.517969195664575e-05, "loss": 2.32, "step": 4420 }, { "epoch": 0.7566318671915112, "grad_norm": 24.202350616455078, "learning_rate": 2.5185396463205934e-05, "loss": 3.0039, "step": 4421 }, { "epoch": 0.7568030121512922, "grad_norm": 16.3718318939209, "learning_rate": 2.5191100969766114e-05, "loss": 1.56, "step": 4422 }, { "epoch": 0.7569741571110731, "grad_norm": 27.989944458007812, "learning_rate": 2.5196805476326297e-05, "loss": 3.6797, "step": 4423 }, { "epoch": 0.7571453020708541, "grad_norm": 29.627376556396484, "learning_rate": 2.520250998288648e-05, "loss": 3.0339, "step": 4424 }, { "epoch": 0.7573164470306349, "grad_norm": 23.632122039794922, "learning_rate": 2.5208214489446664e-05, "loss": 2.2779, "step": 4425 }, { "epoch": 0.7574875919904159, "grad_norm": 9.593125343322754, "learning_rate": 2.5213918996006844e-05, "loss": 0.7422, "step": 4426 }, { "epoch": 0.7576587369501968, "grad_norm": 17.23970603942871, "learning_rate": 2.5219623502567027e-05, "loss": 1.2251, "step": 4427 }, { "epoch": 0.7578298819099778, "grad_norm": 26.436206817626953, "learning_rate": 2.522532800912721e-05, "loss": 2.0061, "step": 4428 }, { "epoch": 0.7580010268697587, "grad_norm": 5.741855144500732, "learning_rate": 2.523103251568739e-05, "loss": 0.4587, "step": 4429 }, { "epoch": 0.7581721718295397, "grad_norm": 28.37969398498535, "learning_rate": 2.5236737022247574e-05, "loss": 2.671, "step": 4430 }, { "epoch": 0.7583433167893205, "grad_norm": 18.421100616455078, "learning_rate": 2.524244152880776e-05, "loss": 1.7614, "step": 4431 }, { "epoch": 0.7585144617491015, "grad_norm": 4.070728302001953, "learning_rate": 2.5248146035367944e-05, "loss": 0.2731, "step": 4432 }, { "epoch": 0.7586856067088824, "grad_norm": 1.1695162057876587, "learning_rate": 2.5253850541928124e-05, "loss": 0.2524, "step": 4433 }, { "epoch": 0.7588567516686634, "grad_norm": 22.150390625, "learning_rate": 2.5259555048488308e-05, "loss": 2.1763, "step": 4434 }, { "epoch": 0.7590278966284443, "grad_norm": 24.137784957885742, "learning_rate": 2.526525955504849e-05, "loss": 2.7464, "step": 4435 }, { "epoch": 0.7591990415882253, "grad_norm": 97.06147003173828, "learning_rate": 2.527096406160867e-05, "loss": 8.2917, "step": 4436 }, { "epoch": 0.7593701865480061, "grad_norm": 18.736169815063477, "learning_rate": 2.5276668568168854e-05, "loss": 1.8898, "step": 4437 }, { "epoch": 0.7595413315077871, "grad_norm": 6.186467170715332, "learning_rate": 2.5282373074729038e-05, "loss": 0.6734, "step": 4438 }, { "epoch": 0.759712476467568, "grad_norm": 2.713296413421631, "learning_rate": 2.528807758128922e-05, "loss": 0.2576, "step": 4439 }, { "epoch": 0.759883621427349, "grad_norm": 5.079885482788086, "learning_rate": 2.52937820878494e-05, "loss": 0.928, "step": 4440 }, { "epoch": 0.7600547663871299, "grad_norm": 7.678714275360107, "learning_rate": 2.5299486594409585e-05, "loss": 0.9906, "step": 4441 }, { "epoch": 0.7602259113469109, "grad_norm": 28.940954208374023, "learning_rate": 2.5305191100969768e-05, "loss": 2.2819, "step": 4442 }, { "epoch": 0.7603970563066917, "grad_norm": 19.63545799255371, "learning_rate": 2.5310895607529948e-05, "loss": 1.5862, "step": 4443 }, { "epoch": 0.7605682012664727, "grad_norm": 24.078157424926758, "learning_rate": 2.531660011409013e-05, "loss": 1.9171, "step": 4444 }, { "epoch": 0.7607393462262536, "grad_norm": 25.036775588989258, "learning_rate": 2.5322304620650315e-05, "loss": 2.7408, "step": 4445 }, { "epoch": 0.7609104911860346, "grad_norm": 25.063066482543945, "learning_rate": 2.5328009127210495e-05, "loss": 2.0789, "step": 4446 }, { "epoch": 0.7610816361458155, "grad_norm": 26.909889221191406, "learning_rate": 2.5333713633770678e-05, "loss": 3.0789, "step": 4447 }, { "epoch": 0.7612527811055965, "grad_norm": 55.91215896606445, "learning_rate": 2.533941814033086e-05, "loss": 2.2273, "step": 4448 }, { "epoch": 0.7614239260653773, "grad_norm": 16.553728103637695, "learning_rate": 2.5345122646891045e-05, "loss": 1.4893, "step": 4449 }, { "epoch": 0.7615950710251583, "grad_norm": 35.240394592285156, "learning_rate": 2.5350827153451225e-05, "loss": 6.035, "step": 4450 }, { "epoch": 0.7617662159849392, "grad_norm": 9.102727890014648, "learning_rate": 2.5356531660011408e-05, "loss": 0.5959, "step": 4451 }, { "epoch": 0.7619373609447202, "grad_norm": 19.193933486938477, "learning_rate": 2.536223616657159e-05, "loss": 1.4628, "step": 4452 }, { "epoch": 0.7621085059045011, "grad_norm": 31.62565803527832, "learning_rate": 2.536794067313177e-05, "loss": 5.7421, "step": 4453 }, { "epoch": 0.7622796508642821, "grad_norm": 19.388065338134766, "learning_rate": 2.5373645179691958e-05, "loss": 1.8378, "step": 4454 }, { "epoch": 0.7624507958240629, "grad_norm": 4.330984115600586, "learning_rate": 2.537934968625214e-05, "loss": 0.4705, "step": 4455 }, { "epoch": 0.7626219407838439, "grad_norm": 5.933356761932373, "learning_rate": 2.5385054192812325e-05, "loss": 0.5481, "step": 4456 }, { "epoch": 0.7627930857436248, "grad_norm": 30.84785270690918, "learning_rate": 2.5390758699372505e-05, "loss": 5.6798, "step": 4457 }, { "epoch": 0.7629642307034058, "grad_norm": 22.500408172607422, "learning_rate": 2.539646320593269e-05, "loss": 2.1908, "step": 4458 }, { "epoch": 0.7631353756631867, "grad_norm": 1.4484208822250366, "learning_rate": 2.5402167712492872e-05, "loss": 0.2429, "step": 4459 }, { "epoch": 0.7633065206229677, "grad_norm": 18.820663452148438, "learning_rate": 2.5407872219053052e-05, "loss": 1.5979, "step": 4460 }, { "epoch": 0.7634776655827485, "grad_norm": 29.224742889404297, "learning_rate": 2.5413576725613235e-05, "loss": 3.4263, "step": 4461 }, { "epoch": 0.7636488105425295, "grad_norm": 27.641237258911133, "learning_rate": 2.541928123217342e-05, "loss": 2.9448, "step": 4462 }, { "epoch": 0.7638199555023104, "grad_norm": 28.82822608947754, "learning_rate": 2.5424985738733602e-05, "loss": 3.3548, "step": 4463 }, { "epoch": 0.7639911004620914, "grad_norm": 31.449243545532227, "learning_rate": 2.5430690245293782e-05, "loss": 3.569, "step": 4464 }, { "epoch": 0.7641622454218723, "grad_norm": 22.96309471130371, "learning_rate": 2.5436394751853965e-05, "loss": 2.7067, "step": 4465 }, { "epoch": 0.7643333903816533, "grad_norm": 54.38570022583008, "learning_rate": 2.544209925841415e-05, "loss": 1.9878, "step": 4466 }, { "epoch": 0.7645045353414341, "grad_norm": 13.225268363952637, "learning_rate": 2.544780376497433e-05, "loss": 0.9805, "step": 4467 }, { "epoch": 0.7646756803012151, "grad_norm": 11.44771957397461, "learning_rate": 2.5453508271534512e-05, "loss": 0.8328, "step": 4468 }, { "epoch": 0.764846825260996, "grad_norm": 1.3261395692825317, "learning_rate": 2.5459212778094695e-05, "loss": 0.2307, "step": 4469 }, { "epoch": 0.765017970220777, "grad_norm": 2.594686269760132, "learning_rate": 2.546491728465488e-05, "loss": 0.2594, "step": 4470 }, { "epoch": 0.7651891151805579, "grad_norm": 1.5864319801330566, "learning_rate": 2.547062179121506e-05, "loss": 0.2351, "step": 4471 }, { "epoch": 0.7653602601403389, "grad_norm": 23.80762481689453, "learning_rate": 2.5476326297775242e-05, "loss": 2.3596, "step": 4472 }, { "epoch": 0.7655314051001199, "grad_norm": 0.9860055446624756, "learning_rate": 2.5482030804335426e-05, "loss": 0.2113, "step": 4473 }, { "epoch": 0.7657025500599007, "grad_norm": 24.054868698120117, "learning_rate": 2.5487735310895606e-05, "loss": 2.4686, "step": 4474 }, { "epoch": 0.7658736950196817, "grad_norm": 20.621498107910156, "learning_rate": 2.549343981745579e-05, "loss": 1.9467, "step": 4475 }, { "epoch": 0.7660448399794626, "grad_norm": 23.417434692382812, "learning_rate": 2.5499144324015972e-05, "loss": 2.6939, "step": 4476 }, { "epoch": 0.7662159849392436, "grad_norm": 35.24362564086914, "learning_rate": 2.5504848830576156e-05, "loss": 2.1405, "step": 4477 }, { "epoch": 0.7663871298990245, "grad_norm": 24.250762939453125, "learning_rate": 2.551055333713634e-05, "loss": 2.0974, "step": 4478 }, { "epoch": 0.7665582748588055, "grad_norm": 34.85161590576172, "learning_rate": 2.5516257843696522e-05, "loss": 2.6934, "step": 4479 }, { "epoch": 0.7667294198185863, "grad_norm": 23.28230857849121, "learning_rate": 2.5521962350256706e-05, "loss": 2.4125, "step": 4480 }, { "epoch": 0.7669005647783673, "grad_norm": 24.412673950195312, "learning_rate": 2.5527666856816886e-05, "loss": 2.0085, "step": 4481 }, { "epoch": 0.7670717097381482, "grad_norm": 5.911852836608887, "learning_rate": 2.553337136337707e-05, "loss": 0.4889, "step": 4482 }, { "epoch": 0.7672428546979292, "grad_norm": 15.7787504196167, "learning_rate": 2.5539075869937253e-05, "loss": 1.6727, "step": 4483 }, { "epoch": 0.7674139996577101, "grad_norm": 30.21489715576172, "learning_rate": 2.5544780376497433e-05, "loss": 3.3485, "step": 4484 }, { "epoch": 0.767585144617491, "grad_norm": 1.4315123558044434, "learning_rate": 2.5550484883057616e-05, "loss": 0.2253, "step": 4485 }, { "epoch": 0.7677562895772719, "grad_norm": 43.58045959472656, "learning_rate": 2.55561893896178e-05, "loss": 2.2129, "step": 4486 }, { "epoch": 0.7679274345370529, "grad_norm": 15.73321533203125, "learning_rate": 2.5561893896177983e-05, "loss": 1.8963, "step": 4487 }, { "epoch": 0.7680985794968338, "grad_norm": 15.624593734741211, "learning_rate": 2.5567598402738163e-05, "loss": 1.4826, "step": 4488 }, { "epoch": 0.7682697244566148, "grad_norm": 19.84630012512207, "learning_rate": 2.5573302909298346e-05, "loss": 1.9842, "step": 4489 }, { "epoch": 0.7684408694163957, "grad_norm": 23.67464828491211, "learning_rate": 2.557900741585853e-05, "loss": 1.9172, "step": 4490 }, { "epoch": 0.7686120143761767, "grad_norm": 26.324172973632812, "learning_rate": 2.558471192241871e-05, "loss": 2.4427, "step": 4491 }, { "epoch": 0.7687831593359575, "grad_norm": 29.327041625976562, "learning_rate": 2.5590416428978893e-05, "loss": 2.9564, "step": 4492 }, { "epoch": 0.7689543042957385, "grad_norm": 18.07971954345703, "learning_rate": 2.5596120935539076e-05, "loss": 1.5832, "step": 4493 }, { "epoch": 0.7691254492555194, "grad_norm": 25.531024932861328, "learning_rate": 2.560182544209926e-05, "loss": 2.5085, "step": 4494 }, { "epoch": 0.7692965942153004, "grad_norm": 26.901735305786133, "learning_rate": 2.560752994865944e-05, "loss": 2.7555, "step": 4495 }, { "epoch": 0.7694677391750813, "grad_norm": 1.0921388864517212, "learning_rate": 2.5613234455219623e-05, "loss": 0.2269, "step": 4496 }, { "epoch": 0.7696388841348623, "grad_norm": 24.066415786743164, "learning_rate": 2.5618938961779806e-05, "loss": 2.2307, "step": 4497 }, { "epoch": 0.7698100290946431, "grad_norm": 27.785593032836914, "learning_rate": 2.5624643468339986e-05, "loss": 3.4921, "step": 4498 }, { "epoch": 0.7699811740544241, "grad_norm": 29.524158477783203, "learning_rate": 2.563034797490017e-05, "loss": 3.7322, "step": 4499 }, { "epoch": 0.770152319014205, "grad_norm": 8.157841682434082, "learning_rate": 2.5636052481460356e-05, "loss": 0.6122, "step": 4500 }, { "epoch": 0.770323463973986, "grad_norm": 0.7895174026489258, "learning_rate": 2.564175698802054e-05, "loss": 0.2004, "step": 4501 }, { "epoch": 0.7704946089337669, "grad_norm": 19.612058639526367, "learning_rate": 2.564746149458072e-05, "loss": 1.4928, "step": 4502 }, { "epoch": 0.7706657538935479, "grad_norm": 24.910694122314453, "learning_rate": 2.5653166001140903e-05, "loss": 3.1004, "step": 4503 }, { "epoch": 0.7708368988533287, "grad_norm": 16.94511604309082, "learning_rate": 2.5658870507701087e-05, "loss": 1.5157, "step": 4504 }, { "epoch": 0.7710080438131097, "grad_norm": 18.35201072692871, "learning_rate": 2.5664575014261267e-05, "loss": 1.6228, "step": 4505 }, { "epoch": 0.7711791887728906, "grad_norm": 1.8921977281570435, "learning_rate": 2.567027952082145e-05, "loss": 0.2404, "step": 4506 }, { "epoch": 0.7713503337326716, "grad_norm": 15.37820053100586, "learning_rate": 2.5675984027381633e-05, "loss": 1.4636, "step": 4507 }, { "epoch": 0.7715214786924525, "grad_norm": 45.28339767456055, "learning_rate": 2.5681688533941817e-05, "loss": 5.8686, "step": 4508 }, { "epoch": 0.7716926236522335, "grad_norm": 10.306188583374023, "learning_rate": 2.5687393040501997e-05, "loss": 0.8692, "step": 4509 }, { "epoch": 0.7718637686120143, "grad_norm": 7.009439468383789, "learning_rate": 2.569309754706218e-05, "loss": 0.4821, "step": 4510 }, { "epoch": 0.7720349135717953, "grad_norm": 16.78397560119629, "learning_rate": 2.5698802053622363e-05, "loss": 1.4687, "step": 4511 }, { "epoch": 0.7722060585315762, "grad_norm": 27.457345962524414, "learning_rate": 2.5704506560182543e-05, "loss": 5.8033, "step": 4512 }, { "epoch": 0.7723772034913572, "grad_norm": 18.70033836364746, "learning_rate": 2.5710211066742727e-05, "loss": 1.7159, "step": 4513 }, { "epoch": 0.7725483484511381, "grad_norm": 6.5526204109191895, "learning_rate": 2.571591557330291e-05, "loss": 0.6285, "step": 4514 }, { "epoch": 0.7727194934109191, "grad_norm": 27.923290252685547, "learning_rate": 2.572162007986309e-05, "loss": 2.8709, "step": 4515 }, { "epoch": 0.7728906383706999, "grad_norm": 21.99191665649414, "learning_rate": 2.5727324586423274e-05, "loss": 1.7881, "step": 4516 }, { "epoch": 0.7730617833304809, "grad_norm": 23.24179458618164, "learning_rate": 2.5733029092983457e-05, "loss": 2.0511, "step": 4517 }, { "epoch": 0.7732329282902618, "grad_norm": 17.54611587524414, "learning_rate": 2.573873359954364e-05, "loss": 1.326, "step": 4518 }, { "epoch": 0.7734040732500428, "grad_norm": 30.525188446044922, "learning_rate": 2.574443810610382e-05, "loss": 5.8319, "step": 4519 }, { "epoch": 0.7735752182098237, "grad_norm": 22.358930587768555, "learning_rate": 2.5750142612664004e-05, "loss": 2.7106, "step": 4520 }, { "epoch": 0.7737463631696047, "grad_norm": 28.14476776123047, "learning_rate": 2.5755847119224187e-05, "loss": 2.6532, "step": 4521 }, { "epoch": 0.7739175081293856, "grad_norm": 14.640401840209961, "learning_rate": 2.576155162578437e-05, "loss": 1.2756, "step": 4522 }, { "epoch": 0.7740886530891665, "grad_norm": 22.830739974975586, "learning_rate": 2.5767256132344554e-05, "loss": 1.9665, "step": 4523 }, { "epoch": 0.7742597980489475, "grad_norm": 2.0804736614227295, "learning_rate": 2.5772960638904737e-05, "loss": 0.2506, "step": 4524 }, { "epoch": 0.7744309430087284, "grad_norm": 80.72746276855469, "learning_rate": 2.577866514546492e-05, "loss": 2.0284, "step": 4525 }, { "epoch": 0.7746020879685094, "grad_norm": 1.6171777248382568, "learning_rate": 2.57843696520251e-05, "loss": 0.2526, "step": 4526 }, { "epoch": 0.7747732329282903, "grad_norm": 1.1948031187057495, "learning_rate": 2.5790074158585284e-05, "loss": 0.2329, "step": 4527 }, { "epoch": 0.7749443778880712, "grad_norm": 16.24471664428711, "learning_rate": 2.5795778665145467e-05, "loss": 1.2654, "step": 4528 }, { "epoch": 0.7751155228478521, "grad_norm": 11.590794563293457, "learning_rate": 2.5801483171705647e-05, "loss": 0.7527, "step": 4529 }, { "epoch": 0.7752866678076331, "grad_norm": 21.045690536499023, "learning_rate": 2.580718767826583e-05, "loss": 2.1076, "step": 4530 }, { "epoch": 0.775457812767414, "grad_norm": 29.146739959716797, "learning_rate": 2.5812892184826014e-05, "loss": 2.3028, "step": 4531 }, { "epoch": 0.775628957727195, "grad_norm": 27.92205810546875, "learning_rate": 2.5818596691386197e-05, "loss": 2.6337, "step": 4532 }, { "epoch": 0.7758001026869759, "grad_norm": 4.507087707519531, "learning_rate": 2.5824301197946377e-05, "loss": 0.3644, "step": 4533 }, { "epoch": 0.7759712476467568, "grad_norm": 31.79628562927246, "learning_rate": 2.583000570450656e-05, "loss": 2.0363, "step": 4534 }, { "epoch": 0.7761423926065377, "grad_norm": 20.116243362426758, "learning_rate": 2.5835710211066744e-05, "loss": 2.037, "step": 4535 }, { "epoch": 0.7763135375663187, "grad_norm": 20.690771102905273, "learning_rate": 2.5841414717626924e-05, "loss": 2.5927, "step": 4536 }, { "epoch": 0.7764846825260996, "grad_norm": 26.378053665161133, "learning_rate": 2.5847119224187108e-05, "loss": 2.4488, "step": 4537 }, { "epoch": 0.7766558274858806, "grad_norm": 1.2249003648757935, "learning_rate": 2.585282373074729e-05, "loss": 0.2434, "step": 4538 }, { "epoch": 0.7768269724456615, "grad_norm": 6.3218793869018555, "learning_rate": 2.5858528237307474e-05, "loss": 0.5515, "step": 4539 }, { "epoch": 0.7769981174054424, "grad_norm": 2.316464424133301, "learning_rate": 2.5864232743867654e-05, "loss": 0.2183, "step": 4540 }, { "epoch": 0.7771692623652233, "grad_norm": 19.063066482543945, "learning_rate": 2.5869937250427838e-05, "loss": 1.6202, "step": 4541 }, { "epoch": 0.7773404073250043, "grad_norm": 13.33887767791748, "learning_rate": 2.587564175698802e-05, "loss": 1.0805, "step": 4542 }, { "epoch": 0.7775115522847852, "grad_norm": 60.49800109863281, "learning_rate": 2.58813462635482e-05, "loss": 2.2874, "step": 4543 }, { "epoch": 0.7776826972445662, "grad_norm": 1.045248031616211, "learning_rate": 2.5887050770108384e-05, "loss": 0.2115, "step": 4544 }, { "epoch": 0.7778538422043471, "grad_norm": 0.962874710559845, "learning_rate": 2.589275527666857e-05, "loss": 0.2111, "step": 4545 }, { "epoch": 0.778024987164128, "grad_norm": 9.272252082824707, "learning_rate": 2.589845978322875e-05, "loss": 0.6141, "step": 4546 }, { "epoch": 0.7781961321239089, "grad_norm": 1.7271431684494019, "learning_rate": 2.5904164289788935e-05, "loss": 0.2153, "step": 4547 }, { "epoch": 0.7783672770836899, "grad_norm": 10.894009590148926, "learning_rate": 2.5909868796349118e-05, "loss": 0.7097, "step": 4548 }, { "epoch": 0.7785384220434708, "grad_norm": 26.39044761657715, "learning_rate": 2.59155733029093e-05, "loss": 2.3284, "step": 4549 }, { "epoch": 0.7787095670032518, "grad_norm": 18.677852630615234, "learning_rate": 2.592127780946948e-05, "loss": 1.6088, "step": 4550 }, { "epoch": 0.7788807119630327, "grad_norm": 24.6732177734375, "learning_rate": 2.5926982316029665e-05, "loss": 3.2763, "step": 4551 }, { "epoch": 0.7790518569228136, "grad_norm": 19.83483123779297, "learning_rate": 2.5932686822589848e-05, "loss": 1.9389, "step": 4552 }, { "epoch": 0.7792230018825945, "grad_norm": 20.399280548095703, "learning_rate": 2.5938391329150028e-05, "loss": 1.7964, "step": 4553 }, { "epoch": 0.7793941468423755, "grad_norm": 17.091896057128906, "learning_rate": 2.594409583571021e-05, "loss": 1.3734, "step": 4554 }, { "epoch": 0.7795652918021564, "grad_norm": 31.487939834594727, "learning_rate": 2.5949800342270395e-05, "loss": 2.4755, "step": 4555 }, { "epoch": 0.7797364367619374, "grad_norm": 31.707317352294922, "learning_rate": 2.5955504848830578e-05, "loss": 4.5809, "step": 4556 }, { "epoch": 0.7799075817217183, "grad_norm": 14.886727333068848, "learning_rate": 2.5961209355390758e-05, "loss": 0.8313, "step": 4557 }, { "epoch": 0.7800787266814992, "grad_norm": 21.29567527770996, "learning_rate": 2.596691386195094e-05, "loss": 1.757, "step": 4558 }, { "epoch": 0.7802498716412801, "grad_norm": 68.5009765625, "learning_rate": 2.5972618368511125e-05, "loss": 2.4949, "step": 4559 }, { "epoch": 0.7804210166010611, "grad_norm": 34.35353088378906, "learning_rate": 2.5978322875071305e-05, "loss": 2.2289, "step": 4560 }, { "epoch": 0.780592161560842, "grad_norm": 23.028303146362305, "learning_rate": 2.598402738163149e-05, "loss": 2.4301, "step": 4561 }, { "epoch": 0.780763306520623, "grad_norm": 19.792776107788086, "learning_rate": 2.5989731888191672e-05, "loss": 1.9774, "step": 4562 }, { "epoch": 0.7809344514804039, "grad_norm": 21.630163192749023, "learning_rate": 2.5995436394751855e-05, "loss": 1.9028, "step": 4563 }, { "epoch": 0.7811055964401848, "grad_norm": 28.1812744140625, "learning_rate": 2.6001140901312035e-05, "loss": 3.1802, "step": 4564 }, { "epoch": 0.7812767413999657, "grad_norm": 24.079713821411133, "learning_rate": 2.600684540787222e-05, "loss": 2.3985, "step": 4565 }, { "epoch": 0.7814478863597467, "grad_norm": 20.737655639648438, "learning_rate": 2.6012549914432402e-05, "loss": 2.114, "step": 4566 }, { "epoch": 0.7816190313195276, "grad_norm": 24.673608779907227, "learning_rate": 2.6018254420992582e-05, "loss": 2.7534, "step": 4567 }, { "epoch": 0.7817901762793086, "grad_norm": 26.247447967529297, "learning_rate": 2.602395892755277e-05, "loss": 2.557, "step": 4568 }, { "epoch": 0.7819613212390895, "grad_norm": 5.863075256347656, "learning_rate": 2.6029663434112952e-05, "loss": 0.5004, "step": 4569 }, { "epoch": 0.7821324661988704, "grad_norm": 17.882400512695312, "learning_rate": 2.6035367940673135e-05, "loss": 1.7885, "step": 4570 }, { "epoch": 0.7823036111586513, "grad_norm": 38.73212814331055, "learning_rate": 2.6041072447233315e-05, "loss": 6.2463, "step": 4571 }, { "epoch": 0.7824747561184323, "grad_norm": 22.570146560668945, "learning_rate": 2.60467769537935e-05, "loss": 2.3429, "step": 4572 }, { "epoch": 0.7826459010782133, "grad_norm": 33.48434066772461, "learning_rate": 2.6052481460353682e-05, "loss": 4.7106, "step": 4573 }, { "epoch": 0.7828170460379942, "grad_norm": 8.184353828430176, "learning_rate": 2.6058185966913862e-05, "loss": 0.5866, "step": 4574 }, { "epoch": 0.7829881909977752, "grad_norm": 24.859272003173828, "learning_rate": 2.6063890473474045e-05, "loss": 2.7946, "step": 4575 }, { "epoch": 0.783159335957556, "grad_norm": 24.745906829833984, "learning_rate": 2.606959498003423e-05, "loss": 2.6296, "step": 4576 }, { "epoch": 0.783330480917337, "grad_norm": 21.468034744262695, "learning_rate": 2.6075299486594412e-05, "loss": 2.0762, "step": 4577 }, { "epoch": 0.7835016258771179, "grad_norm": 19.518083572387695, "learning_rate": 2.6081003993154592e-05, "loss": 1.7499, "step": 4578 }, { "epoch": 0.7836727708368989, "grad_norm": 25.370912551879883, "learning_rate": 2.6086708499714776e-05, "loss": 2.0335, "step": 4579 }, { "epoch": 0.7838439157966798, "grad_norm": 9.235052108764648, "learning_rate": 2.609241300627496e-05, "loss": 0.7306, "step": 4580 }, { "epoch": 0.7840150607564608, "grad_norm": 26.33196449279785, "learning_rate": 2.609811751283514e-05, "loss": 2.8747, "step": 4581 }, { "epoch": 0.7841862057162416, "grad_norm": 22.177465438842773, "learning_rate": 2.6103822019395322e-05, "loss": 1.9306, "step": 4582 }, { "epoch": 0.7843573506760226, "grad_norm": 15.260997772216797, "learning_rate": 2.6109526525955506e-05, "loss": 1.5436, "step": 4583 }, { "epoch": 0.7845284956358035, "grad_norm": 17.693405151367188, "learning_rate": 2.6115231032515686e-05, "loss": 1.7023, "step": 4584 }, { "epoch": 0.7846996405955845, "grad_norm": 19.666189193725586, "learning_rate": 2.612093553907587e-05, "loss": 1.7938, "step": 4585 }, { "epoch": 0.7848707855553654, "grad_norm": 1.3055251836776733, "learning_rate": 2.6126640045636052e-05, "loss": 0.2317, "step": 4586 }, { "epoch": 0.7850419305151464, "grad_norm": 23.14202880859375, "learning_rate": 2.6132344552196236e-05, "loss": 2.4039, "step": 4587 }, { "epoch": 0.7852130754749272, "grad_norm": 24.73307991027832, "learning_rate": 2.6138049058756416e-05, "loss": 2.5498, "step": 4588 }, { "epoch": 0.7853842204347082, "grad_norm": 14.071855545043945, "learning_rate": 2.61437535653166e-05, "loss": 1.1738, "step": 4589 }, { "epoch": 0.7855553653944891, "grad_norm": 31.454723358154297, "learning_rate": 2.6149458071876783e-05, "loss": 6.066, "step": 4590 }, { "epoch": 0.7857265103542701, "grad_norm": 10.21793270111084, "learning_rate": 2.6155162578436966e-05, "loss": 0.7021, "step": 4591 }, { "epoch": 0.785897655314051, "grad_norm": 0.9553564190864563, "learning_rate": 2.616086708499715e-05, "loss": 0.2104, "step": 4592 }, { "epoch": 0.786068800273832, "grad_norm": 10.402186393737793, "learning_rate": 2.6166571591557333e-05, "loss": 0.6858, "step": 4593 }, { "epoch": 0.7862399452336128, "grad_norm": 83.13465118408203, "learning_rate": 2.6172276098117516e-05, "loss": 8.0191, "step": 4594 }, { "epoch": 0.7864110901933938, "grad_norm": 79.72745513916016, "learning_rate": 2.6177980604677696e-05, "loss": 7.8977, "step": 4595 }, { "epoch": 0.7865822351531747, "grad_norm": 19.631969451904297, "learning_rate": 2.618368511123788e-05, "loss": 1.6448, "step": 4596 }, { "epoch": 0.7867533801129557, "grad_norm": 21.243122100830078, "learning_rate": 2.6189389617798063e-05, "loss": 2.725, "step": 4597 }, { "epoch": 0.7869245250727366, "grad_norm": 27.33664321899414, "learning_rate": 2.6195094124358243e-05, "loss": 6.0632, "step": 4598 }, { "epoch": 0.7870956700325176, "grad_norm": 27.157442092895508, "learning_rate": 2.6200798630918426e-05, "loss": 2.6574, "step": 4599 }, { "epoch": 0.7872668149922984, "grad_norm": 16.595340728759766, "learning_rate": 2.620650313747861e-05, "loss": 1.8749, "step": 4600 }, { "epoch": 0.7874379599520794, "grad_norm": 5.997491836547852, "learning_rate": 2.6212207644038793e-05, "loss": 0.8366, "step": 4601 }, { "epoch": 0.7876091049118603, "grad_norm": 8.601006507873535, "learning_rate": 2.6217912150598973e-05, "loss": 0.6105, "step": 4602 }, { "epoch": 0.7877802498716413, "grad_norm": 22.949264526367188, "learning_rate": 2.6223616657159156e-05, "loss": 2.0644, "step": 4603 }, { "epoch": 0.7879513948314222, "grad_norm": 20.955198287963867, "learning_rate": 2.622932116371934e-05, "loss": 1.9205, "step": 4604 }, { "epoch": 0.7881225397912032, "grad_norm": 4.057135105133057, "learning_rate": 2.623502567027952e-05, "loss": 0.531, "step": 4605 }, { "epoch": 0.788293684750984, "grad_norm": 38.576942443847656, "learning_rate": 2.6240730176839703e-05, "loss": 1.9383, "step": 4606 }, { "epoch": 0.788464829710765, "grad_norm": 28.404165267944336, "learning_rate": 2.6246434683399886e-05, "loss": 2.4168, "step": 4607 }, { "epoch": 0.7886359746705459, "grad_norm": 30.351381301879883, "learning_rate": 2.625213918996007e-05, "loss": 5.746, "step": 4608 }, { "epoch": 0.7888071196303269, "grad_norm": 6.266965389251709, "learning_rate": 2.625784369652025e-05, "loss": 0.5733, "step": 4609 }, { "epoch": 0.7889782645901078, "grad_norm": 20.09222984313965, "learning_rate": 2.6263548203080433e-05, "loss": 1.7396, "step": 4610 }, { "epoch": 0.7891494095498888, "grad_norm": 29.68480110168457, "learning_rate": 2.6269252709640617e-05, "loss": 3.2917, "step": 4611 }, { "epoch": 0.7893205545096696, "grad_norm": 28.49522590637207, "learning_rate": 2.6274957216200797e-05, "loss": 3.904, "step": 4612 }, { "epoch": 0.7894916994694506, "grad_norm": 24.60774803161621, "learning_rate": 2.628066172276098e-05, "loss": 2.4849, "step": 4613 }, { "epoch": 0.7896628444292315, "grad_norm": 19.17363929748535, "learning_rate": 2.6286366229321167e-05, "loss": 1.6276, "step": 4614 }, { "epoch": 0.7898339893890125, "grad_norm": 27.375503540039062, "learning_rate": 2.6292070735881347e-05, "loss": 3.1904, "step": 4615 }, { "epoch": 0.7900051343487934, "grad_norm": 7.622123718261719, "learning_rate": 2.629777524244153e-05, "loss": 0.6504, "step": 4616 }, { "epoch": 0.7901762793085744, "grad_norm": 18.307863235473633, "learning_rate": 2.6303479749001713e-05, "loss": 1.6608, "step": 4617 }, { "epoch": 0.7903474242683552, "grad_norm": 23.98590850830078, "learning_rate": 2.6309184255561897e-05, "loss": 2.2049, "step": 4618 }, { "epoch": 0.7905185692281362, "grad_norm": 5.612167835235596, "learning_rate": 2.6314888762122077e-05, "loss": 0.552, "step": 4619 }, { "epoch": 0.7906897141879171, "grad_norm": 26.797847747802734, "learning_rate": 2.632059326868226e-05, "loss": 2.891, "step": 4620 }, { "epoch": 0.7908608591476981, "grad_norm": 26.22957420349121, "learning_rate": 2.6326297775242444e-05, "loss": 2.4378, "step": 4621 }, { "epoch": 0.791032004107479, "grad_norm": 1.1691210269927979, "learning_rate": 2.6332002281802624e-05, "loss": 0.2558, "step": 4622 }, { "epoch": 0.79120314906726, "grad_norm": 3.431807041168213, "learning_rate": 2.6337706788362807e-05, "loss": 0.2987, "step": 4623 }, { "epoch": 0.791374294027041, "grad_norm": 9.09062385559082, "learning_rate": 2.634341129492299e-05, "loss": 1.2888, "step": 4624 }, { "epoch": 0.7915454389868218, "grad_norm": 15.95527172088623, "learning_rate": 2.6349115801483174e-05, "loss": 1.237, "step": 4625 }, { "epoch": 0.7917165839466028, "grad_norm": 15.407099723815918, "learning_rate": 2.6354820308043354e-05, "loss": 1.4981, "step": 4626 }, { "epoch": 0.7918877289063837, "grad_norm": 21.4428653717041, "learning_rate": 2.6360524814603537e-05, "loss": 1.9965, "step": 4627 }, { "epoch": 0.7920588738661647, "grad_norm": 32.93854522705078, "learning_rate": 2.636622932116372e-05, "loss": 1.6353, "step": 4628 }, { "epoch": 0.7922300188259456, "grad_norm": 17.02910804748535, "learning_rate": 2.63719338277239e-05, "loss": 1.6158, "step": 4629 }, { "epoch": 0.7924011637857266, "grad_norm": 18.701148986816406, "learning_rate": 2.6377638334284084e-05, "loss": 1.4652, "step": 4630 }, { "epoch": 0.7925723087455074, "grad_norm": 20.513490676879883, "learning_rate": 2.6383342840844267e-05, "loss": 2.1445, "step": 4631 }, { "epoch": 0.7927434537052884, "grad_norm": 33.653228759765625, "learning_rate": 2.638904734740445e-05, "loss": 3.4384, "step": 4632 }, { "epoch": 0.7929145986650693, "grad_norm": 22.135591506958008, "learning_rate": 2.639475185396463e-05, "loss": 1.9552, "step": 4633 }, { "epoch": 0.7930857436248503, "grad_norm": 7.528356552124023, "learning_rate": 2.6400456360524814e-05, "loss": 0.6067, "step": 4634 }, { "epoch": 0.7932568885846312, "grad_norm": 17.62458038330078, "learning_rate": 2.6406160867084997e-05, "loss": 1.4482, "step": 4635 }, { "epoch": 0.7934280335444122, "grad_norm": 1.2217499017715454, "learning_rate": 2.6411865373645177e-05, "loss": 0.2197, "step": 4636 }, { "epoch": 0.793599178504193, "grad_norm": 26.31184196472168, "learning_rate": 2.6417569880205364e-05, "loss": 2.5345, "step": 4637 }, { "epoch": 0.793770323463974, "grad_norm": 1.5256792306900024, "learning_rate": 2.6423274386765547e-05, "loss": 0.2251, "step": 4638 }, { "epoch": 0.7939414684237549, "grad_norm": 14.517436981201172, "learning_rate": 2.642897889332573e-05, "loss": 0.9695, "step": 4639 }, { "epoch": 0.7941126133835359, "grad_norm": 23.73163414001465, "learning_rate": 2.643468339988591e-05, "loss": 2.5979, "step": 4640 }, { "epoch": 0.7942837583433168, "grad_norm": 4.0032172203063965, "learning_rate": 2.6440387906446094e-05, "loss": 0.3854, "step": 4641 }, { "epoch": 0.7944549033030978, "grad_norm": 17.482643127441406, "learning_rate": 2.6446092413006278e-05, "loss": 1.5197, "step": 4642 }, { "epoch": 0.7946260482628786, "grad_norm": 14.50070571899414, "learning_rate": 2.6451796919566458e-05, "loss": 1.1852, "step": 4643 }, { "epoch": 0.7947971932226596, "grad_norm": 25.02318572998047, "learning_rate": 2.645750142612664e-05, "loss": 2.3457, "step": 4644 }, { "epoch": 0.7949683381824405, "grad_norm": 31.2506103515625, "learning_rate": 2.6463205932686824e-05, "loss": 2.6284, "step": 4645 }, { "epoch": 0.7951394831422215, "grad_norm": 7.928152084350586, "learning_rate": 2.6468910439247004e-05, "loss": 0.6422, "step": 4646 }, { "epoch": 0.7953106281020024, "grad_norm": 0.9403290152549744, "learning_rate": 2.6474614945807188e-05, "loss": 0.1969, "step": 4647 }, { "epoch": 0.7954817730617834, "grad_norm": 56.036251068115234, "learning_rate": 2.648031945236737e-05, "loss": 2.2416, "step": 4648 }, { "epoch": 0.7956529180215642, "grad_norm": 35.657833099365234, "learning_rate": 2.6486023958927554e-05, "loss": 6.1541, "step": 4649 }, { "epoch": 0.7958240629813452, "grad_norm": 14.264200210571289, "learning_rate": 2.6491728465487734e-05, "loss": 1.4948, "step": 4650 }, { "epoch": 0.7959952079411261, "grad_norm": 0.9517439603805542, "learning_rate": 2.6497432972047918e-05, "loss": 0.1973, "step": 4651 }, { "epoch": 0.7961663529009071, "grad_norm": 3.9652233123779297, "learning_rate": 2.65031374786081e-05, "loss": 0.3578, "step": 4652 }, { "epoch": 0.796337497860688, "grad_norm": 20.68568229675293, "learning_rate": 2.650884198516828e-05, "loss": 1.6687, "step": 4653 }, { "epoch": 0.796508642820469, "grad_norm": 36.3359375, "learning_rate": 2.6514546491728465e-05, "loss": 6.1452, "step": 4654 }, { "epoch": 0.7966797877802498, "grad_norm": 13.956791877746582, "learning_rate": 2.6520250998288648e-05, "loss": 0.8717, "step": 4655 }, { "epoch": 0.7968509327400308, "grad_norm": 30.4771671295166, "learning_rate": 2.652595550484883e-05, "loss": 3.0092, "step": 4656 }, { "epoch": 0.7970220776998117, "grad_norm": 6.4612226486206055, "learning_rate": 2.653166001140901e-05, "loss": 0.6398, "step": 4657 }, { "epoch": 0.7971932226595927, "grad_norm": 16.6127986907959, "learning_rate": 2.6537364517969195e-05, "loss": 1.8739, "step": 4658 }, { "epoch": 0.7973643676193736, "grad_norm": 21.27251625061035, "learning_rate": 2.6543069024529378e-05, "loss": 1.9702, "step": 4659 }, { "epoch": 0.7975355125791546, "grad_norm": 18.95979118347168, "learning_rate": 2.654877353108956e-05, "loss": 1.6295, "step": 4660 }, { "epoch": 0.7977066575389354, "grad_norm": 10.863266944885254, "learning_rate": 2.6554478037649745e-05, "loss": 0.7761, "step": 4661 }, { "epoch": 0.7978778024987164, "grad_norm": 10.78805160522461, "learning_rate": 2.6560182544209928e-05, "loss": 0.9169, "step": 4662 }, { "epoch": 0.7980489474584973, "grad_norm": 21.447656631469727, "learning_rate": 2.656588705077011e-05, "loss": 2.3001, "step": 4663 }, { "epoch": 0.7982200924182783, "grad_norm": 7.096908092498779, "learning_rate": 2.657159155733029e-05, "loss": 0.6332, "step": 4664 }, { "epoch": 0.7983912373780592, "grad_norm": 11.815482139587402, "learning_rate": 2.6577296063890475e-05, "loss": 0.9556, "step": 4665 }, { "epoch": 0.7985623823378402, "grad_norm": 25.208463668823242, "learning_rate": 2.658300057045066e-05, "loss": 2.7888, "step": 4666 }, { "epoch": 0.798733527297621, "grad_norm": 28.86720848083496, "learning_rate": 2.658870507701084e-05, "loss": 1.907, "step": 4667 }, { "epoch": 0.798904672257402, "grad_norm": 6.2774858474731445, "learning_rate": 2.6594409583571022e-05, "loss": 0.4982, "step": 4668 }, { "epoch": 0.7990758172171829, "grad_norm": 31.023252487182617, "learning_rate": 2.6600114090131205e-05, "loss": 5.9759, "step": 4669 }, { "epoch": 0.7992469621769639, "grad_norm": 22.48405647277832, "learning_rate": 2.660581859669139e-05, "loss": 2.4245, "step": 4670 }, { "epoch": 0.7994181071367448, "grad_norm": 21.80152130126953, "learning_rate": 2.661152310325157e-05, "loss": 2.3967, "step": 4671 }, { "epoch": 0.7995892520965258, "grad_norm": 24.37519645690918, "learning_rate": 2.6617227609811752e-05, "loss": 2.6876, "step": 4672 }, { "epoch": 0.7997603970563066, "grad_norm": 24.82401466369629, "learning_rate": 2.6622932116371935e-05, "loss": 3.2294, "step": 4673 }, { "epoch": 0.7999315420160876, "grad_norm": 0.9125476479530334, "learning_rate": 2.6628636622932115e-05, "loss": 0.2049, "step": 4674 }, { "epoch": 0.8001026869758686, "grad_norm": 20.301301956176758, "learning_rate": 2.66343411294923e-05, "loss": 2.0656, "step": 4675 }, { "epoch": 0.8002738319356495, "grad_norm": 17.966495513916016, "learning_rate": 2.6640045636052482e-05, "loss": 1.658, "step": 4676 }, { "epoch": 0.8004449768954305, "grad_norm": 0.8491156697273254, "learning_rate": 2.6645750142612665e-05, "loss": 0.2129, "step": 4677 }, { "epoch": 0.8006161218552114, "grad_norm": 21.60484504699707, "learning_rate": 2.6651454649172845e-05, "loss": 1.6655, "step": 4678 }, { "epoch": 0.8007872668149923, "grad_norm": 19.46196174621582, "learning_rate": 2.665715915573303e-05, "loss": 1.7872, "step": 4679 }, { "epoch": 0.8009584117747732, "grad_norm": 21.08289909362793, "learning_rate": 2.6662863662293212e-05, "loss": 2.101, "step": 4680 }, { "epoch": 0.8011295567345542, "grad_norm": 19.137561798095703, "learning_rate": 2.6668568168853392e-05, "loss": 1.9419, "step": 4681 }, { "epoch": 0.8013007016943351, "grad_norm": 22.642850875854492, "learning_rate": 2.667427267541358e-05, "loss": 2.2932, "step": 4682 }, { "epoch": 0.8014718466541161, "grad_norm": 31.17798614501953, "learning_rate": 2.6679977181973762e-05, "loss": 2.9005, "step": 4683 }, { "epoch": 0.801642991613897, "grad_norm": 21.248584747314453, "learning_rate": 2.6685681688533942e-05, "loss": 2.0125, "step": 4684 }, { "epoch": 0.801814136573678, "grad_norm": 2.3411998748779297, "learning_rate": 2.6691386195094126e-05, "loss": 0.2708, "step": 4685 }, { "epoch": 0.8019852815334588, "grad_norm": 1.263325810432434, "learning_rate": 2.669709070165431e-05, "loss": 0.2294, "step": 4686 }, { "epoch": 0.8021564264932398, "grad_norm": 24.95157814025879, "learning_rate": 2.6702795208214492e-05, "loss": 3.1625, "step": 4687 }, { "epoch": 0.8023275714530207, "grad_norm": 26.514177322387695, "learning_rate": 2.6708499714774672e-05, "loss": 3.3918, "step": 4688 }, { "epoch": 0.8024987164128017, "grad_norm": 12.605335235595703, "learning_rate": 2.6714204221334856e-05, "loss": 0.9508, "step": 4689 }, { "epoch": 0.8026698613725826, "grad_norm": 19.918142318725586, "learning_rate": 2.671990872789504e-05, "loss": 1.5214, "step": 4690 }, { "epoch": 0.8028410063323635, "grad_norm": 1.9992042779922485, "learning_rate": 2.672561323445522e-05, "loss": 0.2171, "step": 4691 }, { "epoch": 0.8030121512921444, "grad_norm": 9.957181930541992, "learning_rate": 2.6731317741015403e-05, "loss": 0.7101, "step": 4692 }, { "epoch": 0.8031832962519254, "grad_norm": 25.742799758911133, "learning_rate": 2.6737022247575586e-05, "loss": 2.705, "step": 4693 }, { "epoch": 0.8033544412117063, "grad_norm": 25.987510681152344, "learning_rate": 2.674272675413577e-05, "loss": 2.4936, "step": 4694 }, { "epoch": 0.8035255861714873, "grad_norm": 19.842357635498047, "learning_rate": 2.674843126069595e-05, "loss": 1.7686, "step": 4695 }, { "epoch": 0.8036967311312682, "grad_norm": 26.451980590820312, "learning_rate": 2.6754135767256133e-05, "loss": 2.3944, "step": 4696 }, { "epoch": 0.8038678760910491, "grad_norm": 15.866630554199219, "learning_rate": 2.6759840273816316e-05, "loss": 1.239, "step": 4697 }, { "epoch": 0.80403902105083, "grad_norm": 58.552215576171875, "learning_rate": 2.6765544780376496e-05, "loss": 1.9071, "step": 4698 }, { "epoch": 0.804210166010611, "grad_norm": 19.024993896484375, "learning_rate": 2.677124928693668e-05, "loss": 1.7101, "step": 4699 }, { "epoch": 0.8043813109703919, "grad_norm": 25.27145767211914, "learning_rate": 2.6776953793496863e-05, "loss": 2.543, "step": 4700 }, { "epoch": 0.8045524559301729, "grad_norm": 7.051549911499023, "learning_rate": 2.6782658300057046e-05, "loss": 0.5373, "step": 4701 }, { "epoch": 0.8047236008899538, "grad_norm": 24.07325553894043, "learning_rate": 2.6788362806617226e-05, "loss": 2.1983, "step": 4702 }, { "epoch": 0.8048947458497347, "grad_norm": 16.690013885498047, "learning_rate": 2.679406731317741e-05, "loss": 1.6365, "step": 4703 }, { "epoch": 0.8050658908095156, "grad_norm": 29.604305267333984, "learning_rate": 2.6799771819737593e-05, "loss": 1.4153, "step": 4704 }, { "epoch": 0.8052370357692966, "grad_norm": 10.602456092834473, "learning_rate": 2.6805476326297776e-05, "loss": 0.8433, "step": 4705 }, { "epoch": 0.8054081807290775, "grad_norm": 18.099092483520508, "learning_rate": 2.681118083285796e-05, "loss": 1.6778, "step": 4706 }, { "epoch": 0.8055793256888585, "grad_norm": 26.6840763092041, "learning_rate": 2.6816885339418143e-05, "loss": 2.8567, "step": 4707 }, { "epoch": 0.8057504706486394, "grad_norm": 25.60426139831543, "learning_rate": 2.6822589845978326e-05, "loss": 2.7501, "step": 4708 }, { "epoch": 0.8059216156084203, "grad_norm": 23.098224639892578, "learning_rate": 2.6828294352538506e-05, "loss": 2.5179, "step": 4709 }, { "epoch": 0.8060927605682012, "grad_norm": 5.202851295471191, "learning_rate": 2.683399885909869e-05, "loss": 0.4799, "step": 4710 }, { "epoch": 0.8062639055279822, "grad_norm": 1.1651976108551025, "learning_rate": 2.6839703365658873e-05, "loss": 0.2147, "step": 4711 }, { "epoch": 0.8064350504877631, "grad_norm": 24.902393341064453, "learning_rate": 2.6845407872219053e-05, "loss": 2.4519, "step": 4712 }, { "epoch": 0.8066061954475441, "grad_norm": 12.363120079040527, "learning_rate": 2.6851112378779237e-05, "loss": 0.9881, "step": 4713 }, { "epoch": 0.806777340407325, "grad_norm": 5.912435054779053, "learning_rate": 2.685681688533942e-05, "loss": 0.5938, "step": 4714 }, { "epoch": 0.806948485367106, "grad_norm": 106.52516174316406, "learning_rate": 2.68625213918996e-05, "loss": 7.5182, "step": 4715 }, { "epoch": 0.8071196303268868, "grad_norm": 1.668204665184021, "learning_rate": 2.6868225898459783e-05, "loss": 0.2216, "step": 4716 }, { "epoch": 0.8072907752866678, "grad_norm": 16.875843048095703, "learning_rate": 2.6873930405019967e-05, "loss": 1.6175, "step": 4717 }, { "epoch": 0.8074619202464487, "grad_norm": 25.818157196044922, "learning_rate": 2.687963491158015e-05, "loss": 3.323, "step": 4718 }, { "epoch": 0.8076330652062297, "grad_norm": 15.579858779907227, "learning_rate": 2.688533941814033e-05, "loss": 1.5144, "step": 4719 }, { "epoch": 0.8078042101660106, "grad_norm": 16.2536563873291, "learning_rate": 2.6891043924700513e-05, "loss": 1.7982, "step": 4720 }, { "epoch": 0.8079753551257916, "grad_norm": 24.011157989501953, "learning_rate": 2.6896748431260697e-05, "loss": 1.8936, "step": 4721 }, { "epoch": 0.8081465000855724, "grad_norm": 2.2636773586273193, "learning_rate": 2.6902452937820877e-05, "loss": 0.2311, "step": 4722 }, { "epoch": 0.8083176450453534, "grad_norm": 19.2104434967041, "learning_rate": 2.690815744438106e-05, "loss": 2.1646, "step": 4723 }, { "epoch": 0.8084887900051343, "grad_norm": 18.140581130981445, "learning_rate": 2.6913861950941244e-05, "loss": 1.5111, "step": 4724 }, { "epoch": 0.8086599349649153, "grad_norm": 17.46432113647461, "learning_rate": 2.6919566457501427e-05, "loss": 1.6828, "step": 4725 }, { "epoch": 0.8088310799246963, "grad_norm": 9.412572860717773, "learning_rate": 2.6925270964061607e-05, "loss": 0.6319, "step": 4726 }, { "epoch": 0.8090022248844772, "grad_norm": 27.534698486328125, "learning_rate": 2.693097547062179e-05, "loss": 3.4404, "step": 4727 }, { "epoch": 0.8091733698442581, "grad_norm": 20.456804275512695, "learning_rate": 2.6936679977181977e-05, "loss": 1.8323, "step": 4728 }, { "epoch": 0.809344514804039, "grad_norm": 29.691913604736328, "learning_rate": 2.6942384483742157e-05, "loss": 2.6556, "step": 4729 }, { "epoch": 0.80951565976382, "grad_norm": 88.99212646484375, "learning_rate": 2.694808899030234e-05, "loss": 8.2099, "step": 4730 }, { "epoch": 0.8096868047236009, "grad_norm": 24.28471565246582, "learning_rate": 2.6953793496862524e-05, "loss": 2.0042, "step": 4731 }, { "epoch": 0.8098579496833819, "grad_norm": 7.255256175994873, "learning_rate": 2.6959498003422707e-05, "loss": 0.5848, "step": 4732 }, { "epoch": 0.8100290946431628, "grad_norm": 57.900230407714844, "learning_rate": 2.6965202509982887e-05, "loss": 1.7669, "step": 4733 }, { "epoch": 0.8102002396029437, "grad_norm": 23.751659393310547, "learning_rate": 2.697090701654307e-05, "loss": 2.1911, "step": 4734 }, { "epoch": 0.8103713845627246, "grad_norm": 29.752071380615234, "learning_rate": 2.6976611523103254e-05, "loss": 2.9697, "step": 4735 }, { "epoch": 0.8105425295225056, "grad_norm": 24.04039192199707, "learning_rate": 2.6982316029663434e-05, "loss": 2.1467, "step": 4736 }, { "epoch": 0.8107136744822865, "grad_norm": 7.993510723114014, "learning_rate": 2.6988020536223617e-05, "loss": 0.5852, "step": 4737 }, { "epoch": 0.8108848194420675, "grad_norm": 20.55529022216797, "learning_rate": 2.69937250427838e-05, "loss": 1.6321, "step": 4738 }, { "epoch": 0.8110559644018484, "grad_norm": 10.042283058166504, "learning_rate": 2.6999429549343984e-05, "loss": 0.7064, "step": 4739 }, { "epoch": 0.8112271093616293, "grad_norm": 13.695612907409668, "learning_rate": 2.7005134055904164e-05, "loss": 1.1387, "step": 4740 }, { "epoch": 0.8113982543214102, "grad_norm": 21.712265014648438, "learning_rate": 2.7010838562464347e-05, "loss": 1.6631, "step": 4741 }, { "epoch": 0.8115693992811912, "grad_norm": 23.428848266601562, "learning_rate": 2.701654306902453e-05, "loss": 2.6433, "step": 4742 }, { "epoch": 0.8117405442409721, "grad_norm": 25.32332420349121, "learning_rate": 2.702224757558471e-05, "loss": 2.2108, "step": 4743 }, { "epoch": 0.8119116892007531, "grad_norm": 21.363313674926758, "learning_rate": 2.7027952082144894e-05, "loss": 2.5228, "step": 4744 }, { "epoch": 0.812082834160534, "grad_norm": 8.273282051086426, "learning_rate": 2.7033656588705078e-05, "loss": 0.5786, "step": 4745 }, { "epoch": 0.8122539791203149, "grad_norm": 14.859856605529785, "learning_rate": 2.703936109526526e-05, "loss": 1.1601, "step": 4746 }, { "epoch": 0.8124251240800958, "grad_norm": 22.67235565185547, "learning_rate": 2.704506560182544e-05, "loss": 2.1171, "step": 4747 }, { "epoch": 0.8125962690398768, "grad_norm": 20.38551139831543, "learning_rate": 2.7050770108385624e-05, "loss": 2.1415, "step": 4748 }, { "epoch": 0.8127674139996577, "grad_norm": 15.979461669921875, "learning_rate": 2.7056474614945808e-05, "loss": 1.4184, "step": 4749 }, { "epoch": 0.8129385589594387, "grad_norm": 15.520298957824707, "learning_rate": 2.7062179121505988e-05, "loss": 1.4953, "step": 4750 }, { "epoch": 0.8131097039192196, "grad_norm": 25.01488494873047, "learning_rate": 2.7067883628066174e-05, "loss": 2.7059, "step": 4751 }, { "epoch": 0.8132808488790005, "grad_norm": 5.247277736663818, "learning_rate": 2.7073588134626358e-05, "loss": 0.514, "step": 4752 }, { "epoch": 0.8134519938387814, "grad_norm": 9.065279006958008, "learning_rate": 2.7079292641186538e-05, "loss": 0.8338, "step": 4753 }, { "epoch": 0.8136231387985624, "grad_norm": 20.89956283569336, "learning_rate": 2.708499714774672e-05, "loss": 1.9556, "step": 4754 }, { "epoch": 0.8137942837583433, "grad_norm": 22.0202579498291, "learning_rate": 2.7090701654306905e-05, "loss": 1.9706, "step": 4755 }, { "epoch": 0.8139654287181243, "grad_norm": 23.732559204101562, "learning_rate": 2.7096406160867088e-05, "loss": 2.6891, "step": 4756 }, { "epoch": 0.8141365736779052, "grad_norm": 33.41729736328125, "learning_rate": 2.7102110667427268e-05, "loss": 6.6742, "step": 4757 }, { "epoch": 0.8143077186376861, "grad_norm": 24.37078285217285, "learning_rate": 2.710781517398745e-05, "loss": 2.2191, "step": 4758 }, { "epoch": 0.814478863597467, "grad_norm": 1.2146947383880615, "learning_rate": 2.7113519680547635e-05, "loss": 0.2281, "step": 4759 }, { "epoch": 0.814650008557248, "grad_norm": 18.136754989624023, "learning_rate": 2.7119224187107815e-05, "loss": 1.4566, "step": 4760 }, { "epoch": 0.8148211535170289, "grad_norm": 51.885501861572266, "learning_rate": 2.7124928693667998e-05, "loss": 6.6106, "step": 4761 }, { "epoch": 0.8149922984768099, "grad_norm": 23.535844802856445, "learning_rate": 2.713063320022818e-05, "loss": 2.0082, "step": 4762 }, { "epoch": 0.8151634434365908, "grad_norm": 21.241649627685547, "learning_rate": 2.7136337706788365e-05, "loss": 2.3369, "step": 4763 }, { "epoch": 0.8153345883963717, "grad_norm": 18.623498916625977, "learning_rate": 2.7142042213348545e-05, "loss": 1.9244, "step": 4764 }, { "epoch": 0.8155057333561526, "grad_norm": 5.655921936035156, "learning_rate": 2.7147746719908728e-05, "loss": 0.5875, "step": 4765 }, { "epoch": 0.8156768783159336, "grad_norm": 21.945968627929688, "learning_rate": 2.715345122646891e-05, "loss": 2.1567, "step": 4766 }, { "epoch": 0.8158480232757145, "grad_norm": 51.72159957885742, "learning_rate": 2.715915573302909e-05, "loss": 6.8426, "step": 4767 }, { "epoch": 0.8160191682354955, "grad_norm": 21.90216636657715, "learning_rate": 2.7164860239589275e-05, "loss": 2.084, "step": 4768 }, { "epoch": 0.8161903131952764, "grad_norm": 11.635622024536133, "learning_rate": 2.7170564746149458e-05, "loss": 0.8471, "step": 4769 }, { "epoch": 0.8163614581550573, "grad_norm": 4.031811714172363, "learning_rate": 2.717626925270964e-05, "loss": 0.4889, "step": 4770 }, { "epoch": 0.8165326031148382, "grad_norm": 30.011260986328125, "learning_rate": 2.718197375926982e-05, "loss": 2.7194, "step": 4771 }, { "epoch": 0.8167037480746192, "grad_norm": 18.62017250061035, "learning_rate": 2.7187678265830005e-05, "loss": 1.7315, "step": 4772 }, { "epoch": 0.8168748930344001, "grad_norm": 27.6317138671875, "learning_rate": 2.719338277239019e-05, "loss": 3.5465, "step": 4773 }, { "epoch": 0.8170460379941811, "grad_norm": 25.174705505371094, "learning_rate": 2.7199087278950372e-05, "loss": 2.5848, "step": 4774 }, { "epoch": 0.817217182953962, "grad_norm": 25.61824607849121, "learning_rate": 2.7204791785510555e-05, "loss": 2.3883, "step": 4775 }, { "epoch": 0.8173883279137429, "grad_norm": 23.317171096801758, "learning_rate": 2.721049629207074e-05, "loss": 1.9388, "step": 4776 }, { "epoch": 0.8175594728735239, "grad_norm": 19.05599021911621, "learning_rate": 2.7216200798630922e-05, "loss": 1.9781, "step": 4777 }, { "epoch": 0.8177306178333048, "grad_norm": 3.6496589183807373, "learning_rate": 2.7221905305191102e-05, "loss": 0.3759, "step": 4778 }, { "epoch": 0.8179017627930858, "grad_norm": 1.0984550714492798, "learning_rate": 2.7227609811751285e-05, "loss": 0.212, "step": 4779 }, { "epoch": 0.8180729077528667, "grad_norm": 6.329287052154541, "learning_rate": 2.723331431831147e-05, "loss": 0.6039, "step": 4780 }, { "epoch": 0.8182440527126477, "grad_norm": 2.0273239612579346, "learning_rate": 2.723901882487165e-05, "loss": 0.2497, "step": 4781 }, { "epoch": 0.8184151976724285, "grad_norm": 25.492948532104492, "learning_rate": 2.7244723331431832e-05, "loss": 2.387, "step": 4782 }, { "epoch": 0.8185863426322095, "grad_norm": 26.385509490966797, "learning_rate": 2.7250427837992015e-05, "loss": 2.7364, "step": 4783 }, { "epoch": 0.8187574875919904, "grad_norm": 3.4072940349578857, "learning_rate": 2.7256132344552195e-05, "loss": 0.3718, "step": 4784 }, { "epoch": 0.8189286325517714, "grad_norm": 14.639547348022461, "learning_rate": 2.726183685111238e-05, "loss": 1.3097, "step": 4785 }, { "epoch": 0.8190997775115523, "grad_norm": 22.575746536254883, "learning_rate": 2.7267541357672562e-05, "loss": 1.9179, "step": 4786 }, { "epoch": 0.8192709224713333, "grad_norm": 25.742076873779297, "learning_rate": 2.7273245864232746e-05, "loss": 3.313, "step": 4787 }, { "epoch": 0.8194420674311141, "grad_norm": 37.464515686035156, "learning_rate": 2.7278950370792926e-05, "loss": 6.2871, "step": 4788 }, { "epoch": 0.8196132123908951, "grad_norm": 18.994226455688477, "learning_rate": 2.728465487735311e-05, "loss": 1.8293, "step": 4789 }, { "epoch": 0.819784357350676, "grad_norm": 21.22791290283203, "learning_rate": 2.7290359383913292e-05, "loss": 2.0439, "step": 4790 }, { "epoch": 0.819955502310457, "grad_norm": 23.675783157348633, "learning_rate": 2.7296063890473472e-05, "loss": 2.7619, "step": 4791 }, { "epoch": 0.8201266472702379, "grad_norm": 20.29876708984375, "learning_rate": 2.7301768397033656e-05, "loss": 1.6941, "step": 4792 }, { "epoch": 0.8202977922300189, "grad_norm": 18.569841384887695, "learning_rate": 2.730747290359384e-05, "loss": 1.9272, "step": 4793 }, { "epoch": 0.8204689371897997, "grad_norm": 45.40720748901367, "learning_rate": 2.7313177410154022e-05, "loss": 2.4204, "step": 4794 }, { "epoch": 0.8206400821495807, "grad_norm": 20.549243927001953, "learning_rate": 2.7318881916714202e-05, "loss": 2.1771, "step": 4795 }, { "epoch": 0.8208112271093616, "grad_norm": 17.86515235900879, "learning_rate": 2.7324586423274386e-05, "loss": 1.666, "step": 4796 }, { "epoch": 0.8209823720691426, "grad_norm": 20.349185943603516, "learning_rate": 2.7330290929834573e-05, "loss": 1.7841, "step": 4797 }, { "epoch": 0.8211535170289235, "grad_norm": 20.81956672668457, "learning_rate": 2.7335995436394753e-05, "loss": 1.9564, "step": 4798 }, { "epoch": 0.8213246619887045, "grad_norm": 23.731029510498047, "learning_rate": 2.7341699942954936e-05, "loss": 1.8663, "step": 4799 }, { "epoch": 0.8214958069484853, "grad_norm": 20.08209991455078, "learning_rate": 2.734740444951512e-05, "loss": 1.9046, "step": 4800 }, { "epoch": 0.8216669519082663, "grad_norm": 153.8986053466797, "learning_rate": 2.7353108956075303e-05, "loss": 8.5195, "step": 4801 }, { "epoch": 0.8218380968680472, "grad_norm": 21.99418067932129, "learning_rate": 2.7358813462635483e-05, "loss": 2.2924, "step": 4802 }, { "epoch": 0.8220092418278282, "grad_norm": 20.278175354003906, "learning_rate": 2.7364517969195666e-05, "loss": 2.2382, "step": 4803 }, { "epoch": 0.8221803867876091, "grad_norm": 0.9226766228675842, "learning_rate": 2.737022247575585e-05, "loss": 0.2091, "step": 4804 }, { "epoch": 0.8223515317473901, "grad_norm": 25.265033721923828, "learning_rate": 2.737592698231603e-05, "loss": 2.7456, "step": 4805 }, { "epoch": 0.8225226767071709, "grad_norm": 17.61090660095215, "learning_rate": 2.7381631488876213e-05, "loss": 1.7355, "step": 4806 }, { "epoch": 0.8226938216669519, "grad_norm": 27.21466827392578, "learning_rate": 2.7387335995436396e-05, "loss": 2.3528, "step": 4807 }, { "epoch": 0.8228649666267328, "grad_norm": 1.4176955223083496, "learning_rate": 2.739304050199658e-05, "loss": 0.2325, "step": 4808 }, { "epoch": 0.8230361115865138, "grad_norm": 26.031166076660156, "learning_rate": 2.739874500855676e-05, "loss": 2.5746, "step": 4809 }, { "epoch": 0.8232072565462947, "grad_norm": 23.706703186035156, "learning_rate": 2.7404449515116943e-05, "loss": 2.6316, "step": 4810 }, { "epoch": 0.8233784015060757, "grad_norm": 2.113330841064453, "learning_rate": 2.7410154021677126e-05, "loss": 0.3267, "step": 4811 }, { "epoch": 0.8235495464658565, "grad_norm": 13.31404972076416, "learning_rate": 2.7415858528237306e-05, "loss": 1.3433, "step": 4812 }, { "epoch": 0.8237206914256375, "grad_norm": 22.22062110900879, "learning_rate": 2.742156303479749e-05, "loss": 2.5822, "step": 4813 }, { "epoch": 0.8238918363854184, "grad_norm": 17.15830421447754, "learning_rate": 2.7427267541357673e-05, "loss": 1.4505, "step": 4814 }, { "epoch": 0.8240629813451994, "grad_norm": 25.40969467163086, "learning_rate": 2.7432972047917853e-05, "loss": 1.8021, "step": 4815 }, { "epoch": 0.8242341263049803, "grad_norm": 9.526328086853027, "learning_rate": 2.7438676554478036e-05, "loss": 1.0048, "step": 4816 }, { "epoch": 0.8244052712647613, "grad_norm": 16.948484420776367, "learning_rate": 2.744438106103822e-05, "loss": 1.4061, "step": 4817 }, { "epoch": 0.8245764162245421, "grad_norm": 1.4328776597976685, "learning_rate": 2.7450085567598403e-05, "loss": 0.2117, "step": 4818 }, { "epoch": 0.8247475611843231, "grad_norm": 18.04222869873047, "learning_rate": 2.7455790074158583e-05, "loss": 1.676, "step": 4819 }, { "epoch": 0.824918706144104, "grad_norm": 21.746219635009766, "learning_rate": 2.746149458071877e-05, "loss": 2.1541, "step": 4820 }, { "epoch": 0.825089851103885, "grad_norm": 28.491777420043945, "learning_rate": 2.7467199087278953e-05, "loss": 4.0988, "step": 4821 }, { "epoch": 0.8252609960636659, "grad_norm": 4.467132091522217, "learning_rate": 2.7472903593839133e-05, "loss": 0.4074, "step": 4822 }, { "epoch": 0.8254321410234469, "grad_norm": 2.841317892074585, "learning_rate": 2.7478608100399317e-05, "loss": 0.2482, "step": 4823 }, { "epoch": 0.8256032859832277, "grad_norm": 25.000173568725586, "learning_rate": 2.74843126069595e-05, "loss": 2.6425, "step": 4824 }, { "epoch": 0.8257744309430087, "grad_norm": 22.400287628173828, "learning_rate": 2.7490017113519683e-05, "loss": 2.0586, "step": 4825 }, { "epoch": 0.8259455759027896, "grad_norm": 0.8643401265144348, "learning_rate": 2.7495721620079863e-05, "loss": 0.1913, "step": 4826 }, { "epoch": 0.8261167208625706, "grad_norm": 52.71999740600586, "learning_rate": 2.7501426126640047e-05, "loss": 1.9317, "step": 4827 }, { "epoch": 0.8262878658223516, "grad_norm": 16.348684310913086, "learning_rate": 2.750713063320023e-05, "loss": 1.3432, "step": 4828 }, { "epoch": 0.8264590107821325, "grad_norm": 15.317984580993652, "learning_rate": 2.751283513976041e-05, "loss": 1.2614, "step": 4829 }, { "epoch": 0.8266301557419135, "grad_norm": 5.9997406005859375, "learning_rate": 2.7518539646320594e-05, "loss": 0.5047, "step": 4830 }, { "epoch": 0.8268013007016943, "grad_norm": 21.828754425048828, "learning_rate": 2.7524244152880777e-05, "loss": 2.1042, "step": 4831 }, { "epoch": 0.8269724456614753, "grad_norm": 27.086246490478516, "learning_rate": 2.752994865944096e-05, "loss": 2.9267, "step": 4832 }, { "epoch": 0.8271435906212562, "grad_norm": 22.872150421142578, "learning_rate": 2.753565316600114e-05, "loss": 2.6579, "step": 4833 }, { "epoch": 0.8273147355810372, "grad_norm": 13.048178672790527, "learning_rate": 2.7541357672561324e-05, "loss": 1.2089, "step": 4834 }, { "epoch": 0.8274858805408181, "grad_norm": 8.70570182800293, "learning_rate": 2.7547062179121507e-05, "loss": 0.6508, "step": 4835 }, { "epoch": 0.827657025500599, "grad_norm": 6.766833782196045, "learning_rate": 2.7552766685681687e-05, "loss": 0.6133, "step": 4836 }, { "epoch": 0.8278281704603799, "grad_norm": 5.038801193237305, "learning_rate": 2.755847119224187e-05, "loss": 0.4408, "step": 4837 }, { "epoch": 0.8279993154201609, "grad_norm": 17.220415115356445, "learning_rate": 2.7564175698802054e-05, "loss": 1.5166, "step": 4838 }, { "epoch": 0.8281704603799418, "grad_norm": 4.953532695770264, "learning_rate": 2.7569880205362237e-05, "loss": 0.467, "step": 4839 }, { "epoch": 0.8283416053397228, "grad_norm": 1.3376152515411377, "learning_rate": 2.7575584711922417e-05, "loss": 0.2116, "step": 4840 }, { "epoch": 0.8285127502995037, "grad_norm": 12.627934455871582, "learning_rate": 2.75812892184826e-05, "loss": 0.7787, "step": 4841 }, { "epoch": 0.8286838952592847, "grad_norm": 24.3588809967041, "learning_rate": 2.7586993725042787e-05, "loss": 2.7193, "step": 4842 }, { "epoch": 0.8288550402190655, "grad_norm": 11.016646385192871, "learning_rate": 2.7592698231602967e-05, "loss": 0.5866, "step": 4843 }, { "epoch": 0.8290261851788465, "grad_norm": 29.477998733520508, "learning_rate": 2.759840273816315e-05, "loss": 3.4923, "step": 4844 }, { "epoch": 0.8291973301386274, "grad_norm": 44.269596099853516, "learning_rate": 2.7604107244723334e-05, "loss": 1.683, "step": 4845 }, { "epoch": 0.8293684750984084, "grad_norm": 1.1761341094970703, "learning_rate": 2.7609811751283514e-05, "loss": 0.2143, "step": 4846 }, { "epoch": 0.8295396200581893, "grad_norm": 0.9938428401947021, "learning_rate": 2.7615516257843697e-05, "loss": 0.1943, "step": 4847 }, { "epoch": 0.8297107650179703, "grad_norm": 24.60161590576172, "learning_rate": 2.762122076440388e-05, "loss": 1.7267, "step": 4848 }, { "epoch": 0.8298819099777511, "grad_norm": 24.709163665771484, "learning_rate": 2.7626925270964064e-05, "loss": 2.423, "step": 4849 }, { "epoch": 0.8300530549375321, "grad_norm": 7.876855850219727, "learning_rate": 2.7632629777524244e-05, "loss": 0.9071, "step": 4850 }, { "epoch": 0.830224199897313, "grad_norm": 6.107041358947754, "learning_rate": 2.7638334284084428e-05, "loss": 0.4236, "step": 4851 }, { "epoch": 0.830395344857094, "grad_norm": 2.59680438041687, "learning_rate": 2.764403879064461e-05, "loss": 0.2292, "step": 4852 }, { "epoch": 0.8305664898168749, "grad_norm": 17.364612579345703, "learning_rate": 2.764974329720479e-05, "loss": 1.3699, "step": 4853 }, { "epoch": 0.8307376347766559, "grad_norm": 19.087657928466797, "learning_rate": 2.7655447803764974e-05, "loss": 1.1447, "step": 4854 }, { "epoch": 0.8309087797364367, "grad_norm": 0.8207781910896301, "learning_rate": 2.7661152310325158e-05, "loss": 0.1838, "step": 4855 }, { "epoch": 0.8310799246962177, "grad_norm": 5.4272894859313965, "learning_rate": 2.766685681688534e-05, "loss": 0.5259, "step": 4856 }, { "epoch": 0.8312510696559986, "grad_norm": 21.006179809570312, "learning_rate": 2.767256132344552e-05, "loss": 1.4462, "step": 4857 }, { "epoch": 0.8314222146157796, "grad_norm": 27.480995178222656, "learning_rate": 2.7678265830005704e-05, "loss": 2.9023, "step": 4858 }, { "epoch": 0.8315933595755605, "grad_norm": 25.569726943969727, "learning_rate": 2.7683970336565888e-05, "loss": 2.3783, "step": 4859 }, { "epoch": 0.8317645045353415, "grad_norm": 21.275972366333008, "learning_rate": 2.7689674843126068e-05, "loss": 2.207, "step": 4860 }, { "epoch": 0.8319356494951223, "grad_norm": 18.127140045166016, "learning_rate": 2.769537934968625e-05, "loss": 1.7016, "step": 4861 }, { "epoch": 0.8321067944549033, "grad_norm": 8.969490051269531, "learning_rate": 2.7701083856246435e-05, "loss": 0.7251, "step": 4862 }, { "epoch": 0.8322779394146842, "grad_norm": 21.6286678314209, "learning_rate": 2.7706788362806618e-05, "loss": 2.1559, "step": 4863 }, { "epoch": 0.8324490843744652, "grad_norm": 22.46690559387207, "learning_rate": 2.7712492869366798e-05, "loss": 2.5798, "step": 4864 }, { "epoch": 0.8326202293342461, "grad_norm": 24.45953941345215, "learning_rate": 2.7718197375926985e-05, "loss": 2.7626, "step": 4865 }, { "epoch": 0.832791374294027, "grad_norm": 23.767484664916992, "learning_rate": 2.7723901882487168e-05, "loss": 2.5827, "step": 4866 }, { "epoch": 0.8329625192538079, "grad_norm": 12.12441349029541, "learning_rate": 2.7729606389047348e-05, "loss": 1.1174, "step": 4867 }, { "epoch": 0.8331336642135889, "grad_norm": 3.884326934814453, "learning_rate": 2.773531089560753e-05, "loss": 0.4211, "step": 4868 }, { "epoch": 0.8333048091733698, "grad_norm": 17.074594497680664, "learning_rate": 2.7741015402167715e-05, "loss": 1.4077, "step": 4869 }, { "epoch": 0.8334759541331508, "grad_norm": 13.826687812805176, "learning_rate": 2.7746719908727898e-05, "loss": 1.2421, "step": 4870 }, { "epoch": 0.8336470990929317, "grad_norm": 19.932655334472656, "learning_rate": 2.7752424415288078e-05, "loss": 1.6989, "step": 4871 }, { "epoch": 0.8338182440527127, "grad_norm": 10.384773254394531, "learning_rate": 2.775812892184826e-05, "loss": 0.9357, "step": 4872 }, { "epoch": 0.8339893890124935, "grad_norm": 21.14198112487793, "learning_rate": 2.7763833428408445e-05, "loss": 1.8342, "step": 4873 }, { "epoch": 0.8341605339722745, "grad_norm": 1.3305509090423584, "learning_rate": 2.7769537934968625e-05, "loss": 0.2297, "step": 4874 }, { "epoch": 0.8343316789320554, "grad_norm": 19.950328826904297, "learning_rate": 2.777524244152881e-05, "loss": 1.6679, "step": 4875 }, { "epoch": 0.8345028238918364, "grad_norm": 0.8316130042076111, "learning_rate": 2.7780946948088992e-05, "loss": 0.1935, "step": 4876 }, { "epoch": 0.8346739688516173, "grad_norm": 19.933990478515625, "learning_rate": 2.7786651454649175e-05, "loss": 1.7269, "step": 4877 }, { "epoch": 0.8348451138113983, "grad_norm": 26.975513458251953, "learning_rate": 2.7792355961209355e-05, "loss": 1.4489, "step": 4878 }, { "epoch": 0.8350162587711792, "grad_norm": 31.234407424926758, "learning_rate": 2.779806046776954e-05, "loss": 5.6514, "step": 4879 }, { "epoch": 0.8351874037309601, "grad_norm": 26.34868049621582, "learning_rate": 2.7803764974329722e-05, "loss": 2.7207, "step": 4880 }, { "epoch": 0.8353585486907411, "grad_norm": 22.602392196655273, "learning_rate": 2.7809469480889902e-05, "loss": 2.0605, "step": 4881 }, { "epoch": 0.835529693650522, "grad_norm": 2.0254154205322266, "learning_rate": 2.7815173987450085e-05, "loss": 0.2238, "step": 4882 }, { "epoch": 0.835700838610303, "grad_norm": 19.05372428894043, "learning_rate": 2.782087849401027e-05, "loss": 1.8702, "step": 4883 }, { "epoch": 0.8358719835700839, "grad_norm": 17.850038528442383, "learning_rate": 2.782658300057045e-05, "loss": 1.7083, "step": 4884 }, { "epoch": 0.8360431285298648, "grad_norm": 1.1266815662384033, "learning_rate": 2.7832287507130632e-05, "loss": 0.1847, "step": 4885 }, { "epoch": 0.8362142734896457, "grad_norm": 145.55615234375, "learning_rate": 2.7837992013690815e-05, "loss": 8.2946, "step": 4886 }, { "epoch": 0.8363854184494267, "grad_norm": 19.88404083251953, "learning_rate": 2.7843696520251e-05, "loss": 2.1403, "step": 4887 }, { "epoch": 0.8365565634092076, "grad_norm": 18.304166793823242, "learning_rate": 2.7849401026811182e-05, "loss": 1.3405, "step": 4888 }, { "epoch": 0.8367277083689886, "grad_norm": 31.058061599731445, "learning_rate": 2.7855105533371365e-05, "loss": 3.5569, "step": 4889 }, { "epoch": 0.8368988533287695, "grad_norm": 3.8306217193603516, "learning_rate": 2.786081003993155e-05, "loss": 0.3999, "step": 4890 }, { "epoch": 0.8370699982885504, "grad_norm": 21.254697799682617, "learning_rate": 2.786651454649173e-05, "loss": 2.2565, "step": 4891 }, { "epoch": 0.8372411432483313, "grad_norm": 24.048858642578125, "learning_rate": 2.7872219053051912e-05, "loss": 2.2869, "step": 4892 }, { "epoch": 0.8374122882081123, "grad_norm": 32.828433990478516, "learning_rate": 2.7877923559612096e-05, "loss": 6.1064, "step": 4893 }, { "epoch": 0.8375834331678932, "grad_norm": 12.22719669342041, "learning_rate": 2.788362806617228e-05, "loss": 1.727, "step": 4894 }, { "epoch": 0.8377545781276742, "grad_norm": 78.69731903076172, "learning_rate": 2.788933257273246e-05, "loss": 7.6272, "step": 4895 }, { "epoch": 0.837925723087455, "grad_norm": 27.703277587890625, "learning_rate": 2.7895037079292642e-05, "loss": 5.7777, "step": 4896 }, { "epoch": 0.838096868047236, "grad_norm": 31.29784393310547, "learning_rate": 2.7900741585852826e-05, "loss": 4.6196, "step": 4897 }, { "epoch": 0.8382680130070169, "grad_norm": 18.404560089111328, "learning_rate": 2.7906446092413006e-05, "loss": 1.4676, "step": 4898 }, { "epoch": 0.8384391579667979, "grad_norm": 14.947914123535156, "learning_rate": 2.791215059897319e-05, "loss": 1.4166, "step": 4899 }, { "epoch": 0.8386103029265788, "grad_norm": 13.326464653015137, "learning_rate": 2.7917855105533372e-05, "loss": 0.923, "step": 4900 }, { "epoch": 0.8387814478863598, "grad_norm": 19.72146987915039, "learning_rate": 2.7923559612093556e-05, "loss": 1.8728, "step": 4901 }, { "epoch": 0.8389525928461407, "grad_norm": 23.139135360717773, "learning_rate": 2.7929264118653736e-05, "loss": 2.6928, "step": 4902 }, { "epoch": 0.8391237378059216, "grad_norm": 24.41969871520996, "learning_rate": 2.793496862521392e-05, "loss": 2.5487, "step": 4903 }, { "epoch": 0.8392948827657025, "grad_norm": 18.01527214050293, "learning_rate": 2.7940673131774103e-05, "loss": 1.7845, "step": 4904 }, { "epoch": 0.8394660277254835, "grad_norm": 33.79884719848633, "learning_rate": 2.7946377638334283e-05, "loss": 2.9576, "step": 4905 }, { "epoch": 0.8396371726852644, "grad_norm": 20.432056427001953, "learning_rate": 2.7952082144894466e-05, "loss": 2.1053, "step": 4906 }, { "epoch": 0.8398083176450454, "grad_norm": 53.79264831542969, "learning_rate": 2.795778665145465e-05, "loss": 2.1853, "step": 4907 }, { "epoch": 0.8399794626048263, "grad_norm": 30.717485427856445, "learning_rate": 2.7963491158014833e-05, "loss": 5.9112, "step": 4908 }, { "epoch": 0.8401506075646072, "grad_norm": 24.705486297607422, "learning_rate": 2.7969195664575013e-05, "loss": 3.0099, "step": 4909 }, { "epoch": 0.8403217525243881, "grad_norm": 14.869185447692871, "learning_rate": 2.7974900171135196e-05, "loss": 1.0484, "step": 4910 }, { "epoch": 0.8404928974841691, "grad_norm": 5.514411449432373, "learning_rate": 2.7980604677695383e-05, "loss": 0.3749, "step": 4911 }, { "epoch": 0.84066404244395, "grad_norm": 25.00140380859375, "learning_rate": 2.7986309184255563e-05, "loss": 2.7309, "step": 4912 }, { "epoch": 0.840835187403731, "grad_norm": 16.862314224243164, "learning_rate": 2.7992013690815746e-05, "loss": 1.5645, "step": 4913 }, { "epoch": 0.8410063323635119, "grad_norm": 16.41586685180664, "learning_rate": 2.799771819737593e-05, "loss": 1.3737, "step": 4914 }, { "epoch": 0.8411774773232928, "grad_norm": 4.305161476135254, "learning_rate": 2.800342270393611e-05, "loss": 0.4024, "step": 4915 }, { "epoch": 0.8413486222830737, "grad_norm": 9.081815719604492, "learning_rate": 2.8009127210496293e-05, "loss": 0.5857, "step": 4916 }, { "epoch": 0.8415197672428547, "grad_norm": 23.440717697143555, "learning_rate": 2.8014831717056476e-05, "loss": 2.1649, "step": 4917 }, { "epoch": 0.8416909122026356, "grad_norm": 9.641824722290039, "learning_rate": 2.802053622361666e-05, "loss": 0.5756, "step": 4918 }, { "epoch": 0.8418620571624166, "grad_norm": 22.86307716369629, "learning_rate": 2.802624073017684e-05, "loss": 2.4724, "step": 4919 }, { "epoch": 0.8420332021221975, "grad_norm": 20.317447662353516, "learning_rate": 2.8031945236737023e-05, "loss": 2.1701, "step": 4920 }, { "epoch": 0.8422043470819784, "grad_norm": 5.848948955535889, "learning_rate": 2.8037649743297206e-05, "loss": 0.5332, "step": 4921 }, { "epoch": 0.8423754920417593, "grad_norm": 29.350927352905273, "learning_rate": 2.8043354249857386e-05, "loss": 1.6343, "step": 4922 }, { "epoch": 0.8425466370015403, "grad_norm": 23.29707145690918, "learning_rate": 2.804905875641757e-05, "loss": 2.0175, "step": 4923 }, { "epoch": 0.8427177819613212, "grad_norm": 21.697025299072266, "learning_rate": 2.8054763262977753e-05, "loss": 2.3221, "step": 4924 }, { "epoch": 0.8428889269211022, "grad_norm": 24.808443069458008, "learning_rate": 2.8060467769537937e-05, "loss": 2.1885, "step": 4925 }, { "epoch": 0.843060071880883, "grad_norm": 18.006011962890625, "learning_rate": 2.8066172276098117e-05, "loss": 1.2711, "step": 4926 }, { "epoch": 0.843231216840664, "grad_norm": 18.567319869995117, "learning_rate": 2.80718767826583e-05, "loss": 1.6861, "step": 4927 }, { "epoch": 0.8434023618004449, "grad_norm": 19.155935287475586, "learning_rate": 2.8077581289218483e-05, "loss": 1.5559, "step": 4928 }, { "epoch": 0.8435735067602259, "grad_norm": 14.273578643798828, "learning_rate": 2.8083285795778663e-05, "loss": 1.5563, "step": 4929 }, { "epoch": 0.8437446517200069, "grad_norm": 23.349454879760742, "learning_rate": 2.8088990302338847e-05, "loss": 2.4812, "step": 4930 }, { "epoch": 0.8439157966797878, "grad_norm": 1.6741917133331299, "learning_rate": 2.809469480889903e-05, "loss": 0.2197, "step": 4931 }, { "epoch": 0.8440869416395688, "grad_norm": 24.860034942626953, "learning_rate": 2.8100399315459213e-05, "loss": 2.4578, "step": 4932 }, { "epoch": 0.8442580865993496, "grad_norm": 14.901060104370117, "learning_rate": 2.8106103822019393e-05, "loss": 1.5235, "step": 4933 }, { "epoch": 0.8444292315591306, "grad_norm": 9.101666450500488, "learning_rate": 2.811180832857958e-05, "loss": 0.5367, "step": 4934 }, { "epoch": 0.8446003765189115, "grad_norm": 25.940553665161133, "learning_rate": 2.8117512835139764e-05, "loss": 2.8182, "step": 4935 }, { "epoch": 0.8447715214786925, "grad_norm": 21.500452041625977, "learning_rate": 2.8123217341699944e-05, "loss": 2.2284, "step": 4936 }, { "epoch": 0.8449426664384734, "grad_norm": 9.465425491333008, "learning_rate": 2.8128921848260127e-05, "loss": 0.8752, "step": 4937 }, { "epoch": 0.8451138113982544, "grad_norm": 12.658205032348633, "learning_rate": 2.813462635482031e-05, "loss": 0.7855, "step": 4938 }, { "epoch": 0.8452849563580352, "grad_norm": 3.6830475330352783, "learning_rate": 2.8140330861380494e-05, "loss": 0.3844, "step": 4939 }, { "epoch": 0.8454561013178162, "grad_norm": 79.1318130493164, "learning_rate": 2.8146035367940674e-05, "loss": 7.1857, "step": 4940 }, { "epoch": 0.8456272462775971, "grad_norm": 18.231271743774414, "learning_rate": 2.8151739874500857e-05, "loss": 1.4963, "step": 4941 }, { "epoch": 0.8457983912373781, "grad_norm": 30.293922424316406, "learning_rate": 2.815744438106104e-05, "loss": 3.6746, "step": 4942 }, { "epoch": 0.845969536197159, "grad_norm": 23.53990364074707, "learning_rate": 2.816314888762122e-05, "loss": 2.1727, "step": 4943 }, { "epoch": 0.84614068115694, "grad_norm": 45.553245544433594, "learning_rate": 2.8168853394181404e-05, "loss": 6.6332, "step": 4944 }, { "epoch": 0.8463118261167208, "grad_norm": 23.199127197265625, "learning_rate": 2.8174557900741587e-05, "loss": 1.8783, "step": 4945 }, { "epoch": 0.8464829710765018, "grad_norm": 29.977962493896484, "learning_rate": 2.818026240730177e-05, "loss": 3.5376, "step": 4946 }, { "epoch": 0.8466541160362827, "grad_norm": 13.712217330932617, "learning_rate": 2.818596691386195e-05, "loss": 1.2064, "step": 4947 }, { "epoch": 0.8468252609960637, "grad_norm": 21.567646026611328, "learning_rate": 2.8191671420422134e-05, "loss": 1.7774, "step": 4948 }, { "epoch": 0.8469964059558446, "grad_norm": 20.53908348083496, "learning_rate": 2.8197375926982317e-05, "loss": 1.5944, "step": 4949 }, { "epoch": 0.8471675509156256, "grad_norm": 57.32319641113281, "learning_rate": 2.8203080433542497e-05, "loss": 1.6839, "step": 4950 }, { "epoch": 0.8473386958754064, "grad_norm": 19.78857421875, "learning_rate": 2.820878494010268e-05, "loss": 1.645, "step": 4951 }, { "epoch": 0.8475098408351874, "grad_norm": 18.463855743408203, "learning_rate": 2.8214489446662864e-05, "loss": 1.8035, "step": 4952 }, { "epoch": 0.8476809857949683, "grad_norm": 3.614971160888672, "learning_rate": 2.8220193953223044e-05, "loss": 0.3524, "step": 4953 }, { "epoch": 0.8478521307547493, "grad_norm": 44.223514556884766, "learning_rate": 2.8225898459783227e-05, "loss": 1.5668, "step": 4954 }, { "epoch": 0.8480232757145302, "grad_norm": 28.211584091186523, "learning_rate": 2.823160296634341e-05, "loss": 2.7987, "step": 4955 }, { "epoch": 0.8481944206743112, "grad_norm": 6.280284881591797, "learning_rate": 2.8237307472903594e-05, "loss": 0.5097, "step": 4956 }, { "epoch": 0.848365565634092, "grad_norm": 27.26231575012207, "learning_rate": 2.8243011979463778e-05, "loss": 3.0276, "step": 4957 }, { "epoch": 0.848536710593873, "grad_norm": 23.75172996520996, "learning_rate": 2.824871648602396e-05, "loss": 2.684, "step": 4958 }, { "epoch": 0.8487078555536539, "grad_norm": 25.655179977416992, "learning_rate": 2.8254420992584144e-05, "loss": 2.2778, "step": 4959 }, { "epoch": 0.8488790005134349, "grad_norm": 1.2349088191986084, "learning_rate": 2.8260125499144324e-05, "loss": 0.2043, "step": 4960 }, { "epoch": 0.8490501454732158, "grad_norm": 29.569950103759766, "learning_rate": 2.8265830005704508e-05, "loss": 5.8835, "step": 4961 }, { "epoch": 0.8492212904329968, "grad_norm": 21.26607322692871, "learning_rate": 2.827153451226469e-05, "loss": 1.8996, "step": 4962 }, { "epoch": 0.8493924353927776, "grad_norm": 18.51241111755371, "learning_rate": 2.8277239018824875e-05, "loss": 1.6964, "step": 4963 }, { "epoch": 0.8495635803525586, "grad_norm": 129.38690185546875, "learning_rate": 2.8282943525385055e-05, "loss": 8.2196, "step": 4964 }, { "epoch": 0.8497347253123395, "grad_norm": 14.641566276550293, "learning_rate": 2.8288648031945238e-05, "loss": 1.8633, "step": 4965 }, { "epoch": 0.8499058702721205, "grad_norm": 31.69512176513672, "learning_rate": 2.829435253850542e-05, "loss": 3.3676, "step": 4966 }, { "epoch": 0.8500770152319014, "grad_norm": 1.0442372560501099, "learning_rate": 2.83000570450656e-05, "loss": 0.201, "step": 4967 }, { "epoch": 0.8502481601916824, "grad_norm": 20.171241760253906, "learning_rate": 2.8305761551625785e-05, "loss": 1.8819, "step": 4968 }, { "epoch": 0.8504193051514632, "grad_norm": 7.881687164306641, "learning_rate": 2.8311466058185968e-05, "loss": 0.5707, "step": 4969 }, { "epoch": 0.8505904501112442, "grad_norm": 5.979015350341797, "learning_rate": 2.831717056474615e-05, "loss": 0.5286, "step": 4970 }, { "epoch": 0.8507615950710251, "grad_norm": 0.8935146331787109, "learning_rate": 2.832287507130633e-05, "loss": 0.1914, "step": 4971 }, { "epoch": 0.8509327400308061, "grad_norm": 28.41226577758789, "learning_rate": 2.8328579577866515e-05, "loss": 3.873, "step": 4972 }, { "epoch": 0.851103884990587, "grad_norm": 23.3878116607666, "learning_rate": 2.8334284084426698e-05, "loss": 2.738, "step": 4973 }, { "epoch": 0.851275029950368, "grad_norm": 1.1107969284057617, "learning_rate": 2.8339988590986878e-05, "loss": 0.2044, "step": 4974 }, { "epoch": 0.8514461749101488, "grad_norm": 12.226590156555176, "learning_rate": 2.834569309754706e-05, "loss": 0.729, "step": 4975 }, { "epoch": 0.8516173198699298, "grad_norm": 24.799179077148438, "learning_rate": 2.8351397604107245e-05, "loss": 1.7961, "step": 4976 }, { "epoch": 0.8517884648297107, "grad_norm": 22.254865646362305, "learning_rate": 2.8357102110667428e-05, "loss": 2.345, "step": 4977 }, { "epoch": 0.8519596097894917, "grad_norm": 22.752105712890625, "learning_rate": 2.8362806617227608e-05, "loss": 2.2281, "step": 4978 }, { "epoch": 0.8521307547492726, "grad_norm": 2.3499200344085693, "learning_rate": 2.836851112378779e-05, "loss": 0.2585, "step": 4979 }, { "epoch": 0.8523018997090536, "grad_norm": 24.360803604125977, "learning_rate": 2.837421563034798e-05, "loss": 2.2443, "step": 4980 }, { "epoch": 0.8524730446688346, "grad_norm": 22.948808670043945, "learning_rate": 2.837992013690816e-05, "loss": 2.2422, "step": 4981 }, { "epoch": 0.8526441896286154, "grad_norm": 7.84674072265625, "learning_rate": 2.8385624643468342e-05, "loss": 0.6825, "step": 4982 }, { "epoch": 0.8528153345883964, "grad_norm": 11.925847053527832, "learning_rate": 2.8391329150028525e-05, "loss": 0.9807, "step": 4983 }, { "epoch": 0.8529864795481773, "grad_norm": 12.705161094665527, "learning_rate": 2.8397033656588705e-05, "loss": 1.0903, "step": 4984 }, { "epoch": 0.8531576245079583, "grad_norm": 22.5230770111084, "learning_rate": 2.840273816314889e-05, "loss": 2.2401, "step": 4985 }, { "epoch": 0.8533287694677392, "grad_norm": 20.577150344848633, "learning_rate": 2.8408442669709072e-05, "loss": 1.4695, "step": 4986 }, { "epoch": 0.8534999144275202, "grad_norm": 8.088134765625, "learning_rate": 2.8414147176269255e-05, "loss": 0.9019, "step": 4987 }, { "epoch": 0.853671059387301, "grad_norm": 2.71311354637146, "learning_rate": 2.8419851682829435e-05, "loss": 0.3453, "step": 4988 }, { "epoch": 0.853842204347082, "grad_norm": 22.019060134887695, "learning_rate": 2.842555618938962e-05, "loss": 1.6718, "step": 4989 }, { "epoch": 0.8540133493068629, "grad_norm": 30.339303970336914, "learning_rate": 2.8431260695949802e-05, "loss": 2.9397, "step": 4990 }, { "epoch": 0.8541844942666439, "grad_norm": 1.055464744567871, "learning_rate": 2.8436965202509982e-05, "loss": 0.1917, "step": 4991 }, { "epoch": 0.8543556392264248, "grad_norm": 19.96885871887207, "learning_rate": 2.8442669709070165e-05, "loss": 1.551, "step": 4992 }, { "epoch": 0.8545267841862058, "grad_norm": 31.464706420898438, "learning_rate": 2.844837421563035e-05, "loss": 1.7026, "step": 4993 }, { "epoch": 0.8546979291459866, "grad_norm": 6.268205165863037, "learning_rate": 2.8454078722190532e-05, "loss": 0.6232, "step": 4994 }, { "epoch": 0.8548690741057676, "grad_norm": 1.3084250688552856, "learning_rate": 2.8459783228750712e-05, "loss": 0.194, "step": 4995 }, { "epoch": 0.8550402190655485, "grad_norm": 24.96925163269043, "learning_rate": 2.8465487735310896e-05, "loss": 2.4543, "step": 4996 }, { "epoch": 0.8552113640253295, "grad_norm": 20.334264755249023, "learning_rate": 2.847119224187108e-05, "loss": 1.9524, "step": 4997 }, { "epoch": 0.8553825089851104, "grad_norm": 32.75785446166992, "learning_rate": 2.847689674843126e-05, "loss": 1.7313, "step": 4998 }, { "epoch": 0.8555536539448914, "grad_norm": 20.249191284179688, "learning_rate": 2.8482601254991442e-05, "loss": 1.7413, "step": 4999 }, { "epoch": 0.8557247989046722, "grad_norm": 9.610689163208008, "learning_rate": 2.8488305761551626e-05, "loss": 0.5728, "step": 5000 }, { "epoch": 0.8558959438644532, "grad_norm": 20.597206115722656, "learning_rate": 2.849401026811181e-05, "loss": 1.9744, "step": 5001 }, { "epoch": 0.8560670888242341, "grad_norm": 22.978784561157227, "learning_rate": 2.849971477467199e-05, "loss": 1.547, "step": 5002 }, { "epoch": 0.8562382337840151, "grad_norm": 15.783121109008789, "learning_rate": 2.8505419281232176e-05, "loss": 1.2838, "step": 5003 }, { "epoch": 0.856409378743796, "grad_norm": 30.393911361694336, "learning_rate": 2.851112378779236e-05, "loss": 5.1174, "step": 5004 }, { "epoch": 0.856580523703577, "grad_norm": 30.374027252197266, "learning_rate": 2.851682829435254e-05, "loss": 6.2091, "step": 5005 }, { "epoch": 0.8567516686633578, "grad_norm": 20.213665008544922, "learning_rate": 2.8522532800912723e-05, "loss": 1.4388, "step": 5006 }, { "epoch": 0.8569228136231388, "grad_norm": 22.47203254699707, "learning_rate": 2.8528237307472906e-05, "loss": 2.1662, "step": 5007 }, { "epoch": 0.8570939585829197, "grad_norm": 67.63754272460938, "learning_rate": 2.853394181403309e-05, "loss": 8.0588, "step": 5008 }, { "epoch": 0.8572651035427007, "grad_norm": 18.833168029785156, "learning_rate": 2.853964632059327e-05, "loss": 1.5184, "step": 5009 }, { "epoch": 0.8574362485024816, "grad_norm": 20.953800201416016, "learning_rate": 2.8545350827153453e-05, "loss": 1.9016, "step": 5010 }, { "epoch": 0.8576073934622626, "grad_norm": 11.872946739196777, "learning_rate": 2.8551055333713636e-05, "loss": 0.689, "step": 5011 }, { "epoch": 0.8577785384220434, "grad_norm": 20.248685836791992, "learning_rate": 2.8556759840273816e-05, "loss": 2.0556, "step": 5012 }, { "epoch": 0.8579496833818244, "grad_norm": 12.471953392028809, "learning_rate": 2.8562464346834e-05, "loss": 0.8929, "step": 5013 }, { "epoch": 0.8581208283416053, "grad_norm": 9.479846954345703, "learning_rate": 2.8568168853394183e-05, "loss": 0.8552, "step": 5014 }, { "epoch": 0.8582919733013863, "grad_norm": 25.91791343688965, "learning_rate": 2.8573873359954363e-05, "loss": 2.4487, "step": 5015 }, { "epoch": 0.8584631182611672, "grad_norm": 30.794086456298828, "learning_rate": 2.8579577866514546e-05, "loss": 1.6757, "step": 5016 }, { "epoch": 0.8586342632209482, "grad_norm": 30.115619659423828, "learning_rate": 2.858528237307473e-05, "loss": 2.2301, "step": 5017 }, { "epoch": 0.858805408180729, "grad_norm": 15.450135231018066, "learning_rate": 2.8590986879634913e-05, "loss": 1.3106, "step": 5018 }, { "epoch": 0.85897655314051, "grad_norm": 23.82478904724121, "learning_rate": 2.8596691386195093e-05, "loss": 2.1391, "step": 5019 }, { "epoch": 0.8591476981002909, "grad_norm": 21.615318298339844, "learning_rate": 2.8602395892755276e-05, "loss": 1.7556, "step": 5020 }, { "epoch": 0.8593188430600719, "grad_norm": 20.00986671447754, "learning_rate": 2.860810039931546e-05, "loss": 1.8048, "step": 5021 }, { "epoch": 0.8594899880198528, "grad_norm": 1.3434984683990479, "learning_rate": 2.861380490587564e-05, "loss": 0.2228, "step": 5022 }, { "epoch": 0.8596611329796338, "grad_norm": 12.534896850585938, "learning_rate": 2.8619509412435823e-05, "loss": 1.6978, "step": 5023 }, { "epoch": 0.8598322779394146, "grad_norm": 30.588682174682617, "learning_rate": 2.8625213918996006e-05, "loss": 2.4971, "step": 5024 }, { "epoch": 0.8600034228991956, "grad_norm": 16.27142333984375, "learning_rate": 2.8630918425556193e-05, "loss": 1.2722, "step": 5025 }, { "epoch": 0.8601745678589765, "grad_norm": 8.828721046447754, "learning_rate": 2.8636622932116373e-05, "loss": 0.4231, "step": 5026 }, { "epoch": 0.8603457128187575, "grad_norm": 11.027689933776855, "learning_rate": 2.8642327438676557e-05, "loss": 0.7595, "step": 5027 }, { "epoch": 0.8605168577785384, "grad_norm": 23.096948623657227, "learning_rate": 2.864803194523674e-05, "loss": 2.6978, "step": 5028 }, { "epoch": 0.8606880027383194, "grad_norm": 20.453035354614258, "learning_rate": 2.865373645179692e-05, "loss": 2.2458, "step": 5029 }, { "epoch": 0.8608591476981002, "grad_norm": 16.359405517578125, "learning_rate": 2.8659440958357103e-05, "loss": 1.6137, "step": 5030 }, { "epoch": 0.8610302926578812, "grad_norm": 21.571128845214844, "learning_rate": 2.8665145464917287e-05, "loss": 1.8346, "step": 5031 }, { "epoch": 0.8612014376176622, "grad_norm": 30.24646759033203, "learning_rate": 2.867084997147747e-05, "loss": 3.9606, "step": 5032 }, { "epoch": 0.8613725825774431, "grad_norm": 20.61684799194336, "learning_rate": 2.867655447803765e-05, "loss": 2.4751, "step": 5033 }, { "epoch": 0.8615437275372241, "grad_norm": 22.913393020629883, "learning_rate": 2.8682258984597833e-05, "loss": 2.5773, "step": 5034 }, { "epoch": 0.861714872497005, "grad_norm": 16.3863582611084, "learning_rate": 2.8687963491158017e-05, "loss": 1.2117, "step": 5035 }, { "epoch": 0.861886017456786, "grad_norm": 1.4799400568008423, "learning_rate": 2.8693667997718197e-05, "loss": 0.2188, "step": 5036 }, { "epoch": 0.8620571624165668, "grad_norm": 2.393829822540283, "learning_rate": 2.869937250427838e-05, "loss": 0.3249, "step": 5037 }, { "epoch": 0.8622283073763478, "grad_norm": 6.015737533569336, "learning_rate": 2.8705077010838564e-05, "loss": 0.475, "step": 5038 }, { "epoch": 0.8623994523361287, "grad_norm": 10.611120223999023, "learning_rate": 2.8710781517398747e-05, "loss": 0.6693, "step": 5039 }, { "epoch": 0.8625705972959097, "grad_norm": 40.11723709106445, "learning_rate": 2.8716486023958927e-05, "loss": 6.5789, "step": 5040 }, { "epoch": 0.8627417422556906, "grad_norm": 18.530126571655273, "learning_rate": 2.872219053051911e-05, "loss": 1.5516, "step": 5041 }, { "epoch": 0.8629128872154715, "grad_norm": 18.136308670043945, "learning_rate": 2.8727895037079294e-05, "loss": 1.4063, "step": 5042 }, { "epoch": 0.8630840321752524, "grad_norm": 25.583057403564453, "learning_rate": 2.8733599543639474e-05, "loss": 1.7697, "step": 5043 }, { "epoch": 0.8632551771350334, "grad_norm": 19.754838943481445, "learning_rate": 2.8739304050199657e-05, "loss": 1.9445, "step": 5044 }, { "epoch": 0.8634263220948143, "grad_norm": 24.454795837402344, "learning_rate": 2.874500855675984e-05, "loss": 2.0861, "step": 5045 }, { "epoch": 0.8635974670545953, "grad_norm": 21.876201629638672, "learning_rate": 2.8750713063320024e-05, "loss": 2.0072, "step": 5046 }, { "epoch": 0.8637686120143762, "grad_norm": 6.661749839782715, "learning_rate": 2.8756417569880204e-05, "loss": 0.5025, "step": 5047 }, { "epoch": 0.8639397569741571, "grad_norm": 190.44247436523438, "learning_rate": 2.876212207644039e-05, "loss": 8.5548, "step": 5048 }, { "epoch": 0.864110901933938, "grad_norm": 22.162609100341797, "learning_rate": 2.8767826583000574e-05, "loss": 2.0999, "step": 5049 }, { "epoch": 0.864282046893719, "grad_norm": 1.4531558752059937, "learning_rate": 2.8773531089560754e-05, "loss": 0.2048, "step": 5050 }, { "epoch": 0.8644531918534999, "grad_norm": 24.412981033325195, "learning_rate": 2.8779235596120937e-05, "loss": 2.3353, "step": 5051 }, { "epoch": 0.8646243368132809, "grad_norm": 31.54360580444336, "learning_rate": 2.878494010268112e-05, "loss": 1.6217, "step": 5052 }, { "epoch": 0.8647954817730618, "grad_norm": 23.30241584777832, "learning_rate": 2.87906446092413e-05, "loss": 2.1526, "step": 5053 }, { "epoch": 0.8649666267328427, "grad_norm": 18.61789321899414, "learning_rate": 2.8796349115801484e-05, "loss": 1.5916, "step": 5054 }, { "epoch": 0.8651377716926236, "grad_norm": 8.036980628967285, "learning_rate": 2.8802053622361667e-05, "loss": 0.6728, "step": 5055 }, { "epoch": 0.8653089166524046, "grad_norm": 1.1689972877502441, "learning_rate": 2.880775812892185e-05, "loss": 0.1996, "step": 5056 }, { "epoch": 0.8654800616121855, "grad_norm": 20.204479217529297, "learning_rate": 2.881346263548203e-05, "loss": 1.8613, "step": 5057 }, { "epoch": 0.8656512065719665, "grad_norm": 20.377853393554688, "learning_rate": 2.8819167142042214e-05, "loss": 1.9245, "step": 5058 }, { "epoch": 0.8658223515317474, "grad_norm": 32.21659851074219, "learning_rate": 2.8824871648602398e-05, "loss": 4.4384, "step": 5059 }, { "epoch": 0.8659934964915283, "grad_norm": 17.437664031982422, "learning_rate": 2.8830576155162578e-05, "loss": 1.8003, "step": 5060 }, { "epoch": 0.8661646414513092, "grad_norm": 8.696148872375488, "learning_rate": 2.883628066172276e-05, "loss": 0.8588, "step": 5061 }, { "epoch": 0.8663357864110902, "grad_norm": 38.84085464477539, "learning_rate": 2.8841985168282944e-05, "loss": 6.4101, "step": 5062 }, { "epoch": 0.8665069313708711, "grad_norm": 22.779674530029297, "learning_rate": 2.8847689674843128e-05, "loss": 2.1035, "step": 5063 }, { "epoch": 0.8666780763306521, "grad_norm": 20.018033981323242, "learning_rate": 2.8853394181403308e-05, "loss": 1.7867, "step": 5064 }, { "epoch": 0.866849221290433, "grad_norm": 20.817026138305664, "learning_rate": 2.885909868796349e-05, "loss": 1.8507, "step": 5065 }, { "epoch": 0.867020366250214, "grad_norm": 22.181928634643555, "learning_rate": 2.8864803194523674e-05, "loss": 2.3944, "step": 5066 }, { "epoch": 0.8671915112099948, "grad_norm": 21.743671417236328, "learning_rate": 2.8870507701083854e-05, "loss": 2.0094, "step": 5067 }, { "epoch": 0.8673626561697758, "grad_norm": 18.235118865966797, "learning_rate": 2.8876212207644038e-05, "loss": 1.7605, "step": 5068 }, { "epoch": 0.8675338011295567, "grad_norm": 16.165708541870117, "learning_rate": 2.888191671420422e-05, "loss": 1.5405, "step": 5069 }, { "epoch": 0.8677049460893377, "grad_norm": 33.633140563964844, "learning_rate": 2.8887621220764405e-05, "loss": 6.2029, "step": 5070 }, { "epoch": 0.8678760910491186, "grad_norm": 9.106796264648438, "learning_rate": 2.8893325727324588e-05, "loss": 0.5696, "step": 5071 }, { "epoch": 0.8680472360088995, "grad_norm": 10.702054023742676, "learning_rate": 2.889903023388477e-05, "loss": 0.8735, "step": 5072 }, { "epoch": 0.8682183809686804, "grad_norm": 34.0421142578125, "learning_rate": 2.8904734740444955e-05, "loss": 2.06, "step": 5073 }, { "epoch": 0.8683895259284614, "grad_norm": 26.271299362182617, "learning_rate": 2.8910439247005135e-05, "loss": 2.3461, "step": 5074 }, { "epoch": 0.8685606708882423, "grad_norm": 24.45592498779297, "learning_rate": 2.8916143753565318e-05, "loss": 2.7885, "step": 5075 }, { "epoch": 0.8687318158480233, "grad_norm": 29.799867630004883, "learning_rate": 2.89218482601255e-05, "loss": 3.7322, "step": 5076 }, { "epoch": 0.8689029608078042, "grad_norm": 50.683589935302734, "learning_rate": 2.8927552766685685e-05, "loss": 1.8316, "step": 5077 }, { "epoch": 0.8690741057675851, "grad_norm": 8.911881446838379, "learning_rate": 2.8933257273245865e-05, "loss": 0.9942, "step": 5078 }, { "epoch": 0.869245250727366, "grad_norm": 14.339098930358887, "learning_rate": 2.8938961779806048e-05, "loss": 1.3359, "step": 5079 }, { "epoch": 0.869416395687147, "grad_norm": 60.05573272705078, "learning_rate": 2.894466628636623e-05, "loss": 6.966, "step": 5080 }, { "epoch": 0.869587540646928, "grad_norm": 30.60601806640625, "learning_rate": 2.895037079292641e-05, "loss": 5.8897, "step": 5081 }, { "epoch": 0.8697586856067089, "grad_norm": 19.455875396728516, "learning_rate": 2.8956075299486595e-05, "loss": 1.5791, "step": 5082 }, { "epoch": 0.8699298305664899, "grad_norm": 11.915603637695312, "learning_rate": 2.896177980604678e-05, "loss": 0.9361, "step": 5083 }, { "epoch": 0.8701009755262707, "grad_norm": 20.202234268188477, "learning_rate": 2.8967484312606958e-05, "loss": 5.2419, "step": 5084 }, { "epoch": 0.8702721204860517, "grad_norm": 6.621884346008301, "learning_rate": 2.897318881916714e-05, "loss": 0.6413, "step": 5085 }, { "epoch": 0.8704432654458326, "grad_norm": 28.005216598510742, "learning_rate": 2.8978893325727325e-05, "loss": 2.8525, "step": 5086 }, { "epoch": 0.8706144104056136, "grad_norm": 20.600162506103516, "learning_rate": 2.898459783228751e-05, "loss": 1.7191, "step": 5087 }, { "epoch": 0.8707855553653945, "grad_norm": 6.3435821533203125, "learning_rate": 2.899030233884769e-05, "loss": 0.5639, "step": 5088 }, { "epoch": 0.8709567003251755, "grad_norm": 25.650978088378906, "learning_rate": 2.8996006845407872e-05, "loss": 5.7733, "step": 5089 }, { "epoch": 0.8711278452849563, "grad_norm": 22.818950653076172, "learning_rate": 2.9001711351968055e-05, "loss": 2.128, "step": 5090 }, { "epoch": 0.8712989902447373, "grad_norm": 5.952839374542236, "learning_rate": 2.9007415858528235e-05, "loss": 0.4348, "step": 5091 }, { "epoch": 0.8714701352045182, "grad_norm": 1.6554477214813232, "learning_rate": 2.901312036508842e-05, "loss": 0.2175, "step": 5092 }, { "epoch": 0.8716412801642992, "grad_norm": 22.035751342773438, "learning_rate": 2.9018824871648602e-05, "loss": 2.054, "step": 5093 }, { "epoch": 0.8718124251240801, "grad_norm": 1.5505925416946411, "learning_rate": 2.902452937820879e-05, "loss": 0.2221, "step": 5094 }, { "epoch": 0.8719835700838611, "grad_norm": 27.077693939208984, "learning_rate": 2.903023388476897e-05, "loss": 3.1948, "step": 5095 }, { "epoch": 0.872154715043642, "grad_norm": 22.83882713317871, "learning_rate": 2.9035938391329152e-05, "loss": 2.3321, "step": 5096 }, { "epoch": 0.8723258600034229, "grad_norm": 4.607493877410889, "learning_rate": 2.9041642897889335e-05, "loss": 0.4254, "step": 5097 }, { "epoch": 0.8724970049632038, "grad_norm": 17.07670783996582, "learning_rate": 2.9047347404449515e-05, "loss": 1.4832, "step": 5098 }, { "epoch": 0.8726681499229848, "grad_norm": 15.903471946716309, "learning_rate": 2.90530519110097e-05, "loss": 1.5948, "step": 5099 }, { "epoch": 0.8728392948827657, "grad_norm": 1.760149598121643, "learning_rate": 2.9058756417569882e-05, "loss": 0.2047, "step": 5100 }, { "epoch": 0.8730104398425467, "grad_norm": 6.830605983734131, "learning_rate": 2.9064460924130066e-05, "loss": 0.6232, "step": 5101 }, { "epoch": 0.8731815848023275, "grad_norm": 24.54201316833496, "learning_rate": 2.9070165430690246e-05, "loss": 2.2265, "step": 5102 }, { "epoch": 0.8733527297621085, "grad_norm": 20.139869689941406, "learning_rate": 2.907586993725043e-05, "loss": 2.3484, "step": 5103 }, { "epoch": 0.8735238747218894, "grad_norm": 17.505416870117188, "learning_rate": 2.9081574443810612e-05, "loss": 1.434, "step": 5104 }, { "epoch": 0.8736950196816704, "grad_norm": 33.22725296020508, "learning_rate": 2.9087278950370792e-05, "loss": 6.1542, "step": 5105 }, { "epoch": 0.8738661646414513, "grad_norm": 146.86790466308594, "learning_rate": 2.9092983456930976e-05, "loss": 9.0732, "step": 5106 }, { "epoch": 0.8740373096012323, "grad_norm": 16.295713424682617, "learning_rate": 2.909868796349116e-05, "loss": 1.4125, "step": 5107 }, { "epoch": 0.8742084545610131, "grad_norm": 14.710768699645996, "learning_rate": 2.9104392470051342e-05, "loss": 1.1648, "step": 5108 }, { "epoch": 0.8743795995207941, "grad_norm": 12.489028930664062, "learning_rate": 2.9110096976611522e-05, "loss": 1.268, "step": 5109 }, { "epoch": 0.874550744480575, "grad_norm": 13.691608428955078, "learning_rate": 2.9115801483171706e-05, "loss": 1.085, "step": 5110 }, { "epoch": 0.874721889440356, "grad_norm": 23.710769653320312, "learning_rate": 2.912150598973189e-05, "loss": 2.4187, "step": 5111 }, { "epoch": 0.8748930344001369, "grad_norm": 25.6949405670166, "learning_rate": 2.912721049629207e-05, "loss": 2.5228, "step": 5112 }, { "epoch": 0.8750641793599179, "grad_norm": 21.368337631225586, "learning_rate": 2.9132915002852253e-05, "loss": 1.8782, "step": 5113 }, { "epoch": 0.8752353243196987, "grad_norm": 82.89339447021484, "learning_rate": 2.9138619509412436e-05, "loss": 8.207, "step": 5114 }, { "epoch": 0.8754064692794797, "grad_norm": 16.71074676513672, "learning_rate": 2.914432401597262e-05, "loss": 1.4629, "step": 5115 }, { "epoch": 0.8755776142392606, "grad_norm": 2.993272542953491, "learning_rate": 2.91500285225328e-05, "loss": 0.394, "step": 5116 }, { "epoch": 0.8757487591990416, "grad_norm": 14.644696235656738, "learning_rate": 2.9155733029092986e-05, "loss": 1.3512, "step": 5117 }, { "epoch": 0.8759199041588225, "grad_norm": 19.816692352294922, "learning_rate": 2.916143753565317e-05, "loss": 2.3304, "step": 5118 }, { "epoch": 0.8760910491186035, "grad_norm": 33.53482437133789, "learning_rate": 2.916714204221335e-05, "loss": 6.1537, "step": 5119 }, { "epoch": 0.8762621940783843, "grad_norm": 18.77507972717285, "learning_rate": 2.9172846548773533e-05, "loss": 1.5366, "step": 5120 }, { "epoch": 0.8764333390381653, "grad_norm": 19.784685134887695, "learning_rate": 2.9178551055333716e-05, "loss": 2.1277, "step": 5121 }, { "epoch": 0.8766044839979462, "grad_norm": 9.049237251281738, "learning_rate": 2.9184255561893896e-05, "loss": 0.6134, "step": 5122 }, { "epoch": 0.8767756289577272, "grad_norm": 8.538041114807129, "learning_rate": 2.918996006845408e-05, "loss": 0.5802, "step": 5123 }, { "epoch": 0.8769467739175081, "grad_norm": 26.41038703918457, "learning_rate": 2.9195664575014263e-05, "loss": 2.5314, "step": 5124 }, { "epoch": 0.8771179188772891, "grad_norm": 9.25900936126709, "learning_rate": 2.9201369081574446e-05, "loss": 0.7571, "step": 5125 }, { "epoch": 0.87728906383707, "grad_norm": 28.40367317199707, "learning_rate": 2.9207073588134626e-05, "loss": 2.4607, "step": 5126 }, { "epoch": 0.8774602087968509, "grad_norm": 28.677005767822266, "learning_rate": 2.921277809469481e-05, "loss": 3.544, "step": 5127 }, { "epoch": 0.8776313537566318, "grad_norm": 1.4200770854949951, "learning_rate": 2.9218482601254993e-05, "loss": 0.2095, "step": 5128 }, { "epoch": 0.8778024987164128, "grad_norm": 18.276594161987305, "learning_rate": 2.9224187107815173e-05, "loss": 1.494, "step": 5129 }, { "epoch": 0.8779736436761937, "grad_norm": 24.763029098510742, "learning_rate": 2.9229891614375356e-05, "loss": 2.4242, "step": 5130 }, { "epoch": 0.8781447886359747, "grad_norm": 10.431611061096191, "learning_rate": 2.923559612093554e-05, "loss": 0.6153, "step": 5131 }, { "epoch": 0.8783159335957557, "grad_norm": 58.08489990234375, "learning_rate": 2.9241300627495723e-05, "loss": 1.706, "step": 5132 }, { "epoch": 0.8784870785555365, "grad_norm": 22.97351837158203, "learning_rate": 2.9247005134055903e-05, "loss": 2.2527, "step": 5133 }, { "epoch": 0.8786582235153175, "grad_norm": 27.15216827392578, "learning_rate": 2.9252709640616087e-05, "loss": 2.759, "step": 5134 }, { "epoch": 0.8788293684750984, "grad_norm": 22.588558197021484, "learning_rate": 2.925841414717627e-05, "loss": 2.6227, "step": 5135 }, { "epoch": 0.8790005134348794, "grad_norm": 33.35072326660156, "learning_rate": 2.926411865373645e-05, "loss": 1.2665, "step": 5136 }, { "epoch": 0.8791716583946603, "grad_norm": 41.13359451293945, "learning_rate": 2.9269823160296633e-05, "loss": 5.9406, "step": 5137 }, { "epoch": 0.8793428033544413, "grad_norm": 23.982492446899414, "learning_rate": 2.9275527666856817e-05, "loss": 2.9734, "step": 5138 }, { "epoch": 0.8795139483142221, "grad_norm": 16.065969467163086, "learning_rate": 2.9281232173417e-05, "loss": 1.4901, "step": 5139 }, { "epoch": 0.8796850932740031, "grad_norm": 29.961244583129883, "learning_rate": 2.9286936679977183e-05, "loss": 1.5246, "step": 5140 }, { "epoch": 0.879856238233784, "grad_norm": 33.30720138549805, "learning_rate": 2.9292641186537367e-05, "loss": 5.9995, "step": 5141 }, { "epoch": 0.880027383193565, "grad_norm": 24.118555068969727, "learning_rate": 2.929834569309755e-05, "loss": 2.6075, "step": 5142 }, { "epoch": 0.8801985281533459, "grad_norm": 19.221567153930664, "learning_rate": 2.930405019965773e-05, "loss": 1.7106, "step": 5143 }, { "epoch": 0.8803696731131269, "grad_norm": 22.26023292541504, "learning_rate": 2.9309754706217914e-05, "loss": 2.5151, "step": 5144 }, { "epoch": 0.8805408180729077, "grad_norm": 13.728766441345215, "learning_rate": 2.9315459212778097e-05, "loss": 1.1806, "step": 5145 }, { "epoch": 0.8807119630326887, "grad_norm": 20.878938674926758, "learning_rate": 2.932116371933828e-05, "loss": 2.107, "step": 5146 }, { "epoch": 0.8808831079924696, "grad_norm": 21.325651168823242, "learning_rate": 2.932686822589846e-05, "loss": 2.1829, "step": 5147 }, { "epoch": 0.8810542529522506, "grad_norm": 1.0309278964996338, "learning_rate": 2.9332572732458644e-05, "loss": 0.1782, "step": 5148 }, { "epoch": 0.8812253979120315, "grad_norm": 4.816126823425293, "learning_rate": 2.9338277239018827e-05, "loss": 0.3899, "step": 5149 }, { "epoch": 0.8813965428718125, "grad_norm": 1.190704584121704, "learning_rate": 2.9343981745579007e-05, "loss": 0.2021, "step": 5150 }, { "epoch": 0.8815676878315933, "grad_norm": 21.205900192260742, "learning_rate": 2.934968625213919e-05, "loss": 2.7627, "step": 5151 }, { "epoch": 0.8817388327913743, "grad_norm": 1.5020864009857178, "learning_rate": 2.9355390758699374e-05, "loss": 0.2301, "step": 5152 }, { "epoch": 0.8819099777511552, "grad_norm": 0.8392736315727234, "learning_rate": 2.9361095265259554e-05, "loss": 0.1906, "step": 5153 }, { "epoch": 0.8820811227109362, "grad_norm": 0.8165884017944336, "learning_rate": 2.9366799771819737e-05, "loss": 0.1924, "step": 5154 }, { "epoch": 0.8822522676707171, "grad_norm": 16.63069725036621, "learning_rate": 2.937250427837992e-05, "loss": 1.2813, "step": 5155 }, { "epoch": 0.8824234126304981, "grad_norm": 8.837629318237305, "learning_rate": 2.9378208784940104e-05, "loss": 0.5551, "step": 5156 }, { "epoch": 0.8825945575902789, "grad_norm": 30.035585403442383, "learning_rate": 2.9383913291500284e-05, "loss": 5.6298, "step": 5157 }, { "epoch": 0.8827657025500599, "grad_norm": 18.92460060119629, "learning_rate": 2.9389617798060467e-05, "loss": 2.1136, "step": 5158 }, { "epoch": 0.8829368475098408, "grad_norm": 18.019941329956055, "learning_rate": 2.939532230462065e-05, "loss": 1.701, "step": 5159 }, { "epoch": 0.8831079924696218, "grad_norm": 25.73262596130371, "learning_rate": 2.940102681118083e-05, "loss": 2.4158, "step": 5160 }, { "epoch": 0.8832791374294027, "grad_norm": 30.859712600708008, "learning_rate": 2.9406731317741014e-05, "loss": 5.5833, "step": 5161 }, { "epoch": 0.8834502823891837, "grad_norm": 18.802223205566406, "learning_rate": 2.9412435824301197e-05, "loss": 1.9511, "step": 5162 }, { "epoch": 0.8836214273489645, "grad_norm": 5.314499855041504, "learning_rate": 2.9418140330861384e-05, "loss": 0.4971, "step": 5163 }, { "epoch": 0.8837925723087455, "grad_norm": 27.17005157470703, "learning_rate": 2.9423844837421564e-05, "loss": 3.7148, "step": 5164 }, { "epoch": 0.8839637172685264, "grad_norm": 25.687992095947266, "learning_rate": 2.9429549343981748e-05, "loss": 3.388, "step": 5165 }, { "epoch": 0.8841348622283074, "grad_norm": 29.165775299072266, "learning_rate": 2.943525385054193e-05, "loss": 3.7435, "step": 5166 }, { "epoch": 0.8843060071880883, "grad_norm": 21.13896942138672, "learning_rate": 2.944095835710211e-05, "loss": 1.7033, "step": 5167 }, { "epoch": 0.8844771521478693, "grad_norm": 14.15404224395752, "learning_rate": 2.9446662863662294e-05, "loss": 1.156, "step": 5168 }, { "epoch": 0.8846482971076501, "grad_norm": 27.660737991333008, "learning_rate": 2.9452367370222478e-05, "loss": 1.8129, "step": 5169 }, { "epoch": 0.8848194420674311, "grad_norm": 24.719099044799805, "learning_rate": 2.945807187678266e-05, "loss": 2.121, "step": 5170 }, { "epoch": 0.884990587027212, "grad_norm": 21.35886001586914, "learning_rate": 2.946377638334284e-05, "loss": 2.0431, "step": 5171 }, { "epoch": 0.885161731986993, "grad_norm": 22.36219596862793, "learning_rate": 2.9469480889903024e-05, "loss": 2.134, "step": 5172 }, { "epoch": 0.8853328769467739, "grad_norm": 24.10101890563965, "learning_rate": 2.9475185396463208e-05, "loss": 2.071, "step": 5173 }, { "epoch": 0.8855040219065549, "grad_norm": 18.05703353881836, "learning_rate": 2.9480889903023388e-05, "loss": 1.558, "step": 5174 }, { "epoch": 0.8856751668663357, "grad_norm": 26.822391510009766, "learning_rate": 2.948659440958357e-05, "loss": 2.3976, "step": 5175 }, { "epoch": 0.8858463118261167, "grad_norm": 18.833152770996094, "learning_rate": 2.9492298916143755e-05, "loss": 1.6977, "step": 5176 }, { "epoch": 0.8860174567858976, "grad_norm": 19.91875648498535, "learning_rate": 2.9498003422703938e-05, "loss": 1.5125, "step": 5177 }, { "epoch": 0.8861886017456786, "grad_norm": 20.179113388061523, "learning_rate": 2.9503707929264118e-05, "loss": 1.9563, "step": 5178 }, { "epoch": 0.8863597467054595, "grad_norm": 22.212738037109375, "learning_rate": 2.95094124358243e-05, "loss": 1.8149, "step": 5179 }, { "epoch": 0.8865308916652405, "grad_norm": 19.415225982666016, "learning_rate": 2.9515116942384485e-05, "loss": 1.6777, "step": 5180 }, { "epoch": 0.8867020366250213, "grad_norm": 31.313318252563477, "learning_rate": 2.9520821448944665e-05, "loss": 4.1531, "step": 5181 }, { "epoch": 0.8868731815848023, "grad_norm": 34.813720703125, "learning_rate": 2.9526525955504848e-05, "loss": 5.9337, "step": 5182 }, { "epoch": 0.8870443265445833, "grad_norm": 5.619294166564941, "learning_rate": 2.953223046206503e-05, "loss": 0.4206, "step": 5183 }, { "epoch": 0.8872154715043642, "grad_norm": 18.10093116760254, "learning_rate": 2.953793496862521e-05, "loss": 1.8818, "step": 5184 }, { "epoch": 0.8873866164641452, "grad_norm": 22.031768798828125, "learning_rate": 2.9543639475185398e-05, "loss": 2.6025, "step": 5185 }, { "epoch": 0.8875577614239261, "grad_norm": 19.005178451538086, "learning_rate": 2.954934398174558e-05, "loss": 2.3032, "step": 5186 }, { "epoch": 0.887728906383707, "grad_norm": 2.4968888759613037, "learning_rate": 2.9555048488305765e-05, "loss": 0.2314, "step": 5187 }, { "epoch": 0.8879000513434879, "grad_norm": 15.055726051330566, "learning_rate": 2.9560752994865945e-05, "loss": 1.2134, "step": 5188 }, { "epoch": 0.8880711963032689, "grad_norm": 16.972787857055664, "learning_rate": 2.956645750142613e-05, "loss": 1.5495, "step": 5189 }, { "epoch": 0.8882423412630498, "grad_norm": 5.097226142883301, "learning_rate": 2.9572162007986312e-05, "loss": 0.4263, "step": 5190 }, { "epoch": 0.8884134862228308, "grad_norm": 23.945755004882812, "learning_rate": 2.9577866514546492e-05, "loss": 1.9982, "step": 5191 }, { "epoch": 0.8885846311826117, "grad_norm": 18.41358184814453, "learning_rate": 2.9583571021106675e-05, "loss": 1.5775, "step": 5192 }, { "epoch": 0.8887557761423927, "grad_norm": 20.26495361328125, "learning_rate": 2.958927552766686e-05, "loss": 2.0699, "step": 5193 }, { "epoch": 0.8889269211021735, "grad_norm": 21.891618728637695, "learning_rate": 2.9594980034227042e-05, "loss": 1.8668, "step": 5194 }, { "epoch": 0.8890980660619545, "grad_norm": 18.51753807067871, "learning_rate": 2.9600684540787222e-05, "loss": 1.7949, "step": 5195 }, { "epoch": 0.8892692110217354, "grad_norm": 21.540264129638672, "learning_rate": 2.9606389047347405e-05, "loss": 2.2159, "step": 5196 }, { "epoch": 0.8894403559815164, "grad_norm": 25.46014976501465, "learning_rate": 2.961209355390759e-05, "loss": 2.5065, "step": 5197 }, { "epoch": 0.8896115009412973, "grad_norm": 37.4268798828125, "learning_rate": 2.961779806046777e-05, "loss": 1.9917, "step": 5198 }, { "epoch": 0.8897826459010783, "grad_norm": 25.361825942993164, "learning_rate": 2.9623502567027952e-05, "loss": 2.6214, "step": 5199 }, { "epoch": 0.8899537908608591, "grad_norm": 8.111501693725586, "learning_rate": 2.9629207073588135e-05, "loss": 0.8144, "step": 5200 }, { "epoch": 0.8901249358206401, "grad_norm": 18.618261337280273, "learning_rate": 2.963491158014832e-05, "loss": 1.9664, "step": 5201 }, { "epoch": 0.890296080780421, "grad_norm": 17.81608009338379, "learning_rate": 2.96406160867085e-05, "loss": 1.5596, "step": 5202 }, { "epoch": 0.890467225740202, "grad_norm": 5.169036388397217, "learning_rate": 2.9646320593268682e-05, "loss": 0.4681, "step": 5203 }, { "epoch": 0.8906383706999829, "grad_norm": 21.34773826599121, "learning_rate": 2.9652025099828865e-05, "loss": 2.1088, "step": 5204 }, { "epoch": 0.8908095156597639, "grad_norm": 13.635762214660645, "learning_rate": 2.9657729606389045e-05, "loss": 1.147, "step": 5205 }, { "epoch": 0.8909806606195447, "grad_norm": 21.587596893310547, "learning_rate": 2.966343411294923e-05, "loss": 2.0986, "step": 5206 }, { "epoch": 0.8911518055793257, "grad_norm": 1.29149329662323, "learning_rate": 2.9669138619509412e-05, "loss": 0.2093, "step": 5207 }, { "epoch": 0.8913229505391066, "grad_norm": 20.227731704711914, "learning_rate": 2.96748431260696e-05, "loss": 1.682, "step": 5208 }, { "epoch": 0.8914940954988876, "grad_norm": 12.934089660644531, "learning_rate": 2.968054763262978e-05, "loss": 0.6278, "step": 5209 }, { "epoch": 0.8916652404586685, "grad_norm": 18.040390014648438, "learning_rate": 2.9686252139189962e-05, "loss": 1.561, "step": 5210 }, { "epoch": 0.8918363854184495, "grad_norm": 24.204835891723633, "learning_rate": 2.9691956645750146e-05, "loss": 1.7786, "step": 5211 }, { "epoch": 0.8920075303782303, "grad_norm": 23.571611404418945, "learning_rate": 2.9697661152310326e-05, "loss": 2.0676, "step": 5212 }, { "epoch": 0.8921786753380113, "grad_norm": 22.09473991394043, "learning_rate": 2.970336565887051e-05, "loss": 2.099, "step": 5213 }, { "epoch": 0.8923498202977922, "grad_norm": 1.3482792377471924, "learning_rate": 2.9709070165430692e-05, "loss": 0.2145, "step": 5214 }, { "epoch": 0.8925209652575732, "grad_norm": 21.764923095703125, "learning_rate": 2.9714774671990872e-05, "loss": 2.0434, "step": 5215 }, { "epoch": 0.8926921102173541, "grad_norm": 19.195314407348633, "learning_rate": 2.9720479178551056e-05, "loss": 1.4061, "step": 5216 }, { "epoch": 0.892863255177135, "grad_norm": 87.61866760253906, "learning_rate": 2.972618368511124e-05, "loss": 7.9821, "step": 5217 }, { "epoch": 0.8930344001369159, "grad_norm": 19.71204376220703, "learning_rate": 2.9731888191671423e-05, "loss": 1.7201, "step": 5218 }, { "epoch": 0.8932055450966969, "grad_norm": 25.464221954345703, "learning_rate": 2.9737592698231603e-05, "loss": 2.8268, "step": 5219 }, { "epoch": 0.8933766900564778, "grad_norm": 1.361387014389038, "learning_rate": 2.9743297204791786e-05, "loss": 0.2127, "step": 5220 }, { "epoch": 0.8935478350162588, "grad_norm": 18.932161331176758, "learning_rate": 2.974900171135197e-05, "loss": 1.7737, "step": 5221 }, { "epoch": 0.8937189799760397, "grad_norm": 6.523068904876709, "learning_rate": 2.975470621791215e-05, "loss": 0.5011, "step": 5222 }, { "epoch": 0.8938901249358207, "grad_norm": 21.28632926940918, "learning_rate": 2.9760410724472333e-05, "loss": 2.0839, "step": 5223 }, { "epoch": 0.8940612698956015, "grad_norm": 2.508774995803833, "learning_rate": 2.9766115231032516e-05, "loss": 0.2127, "step": 5224 }, { "epoch": 0.8942324148553825, "grad_norm": 22.384605407714844, "learning_rate": 2.97718197375927e-05, "loss": 1.7595, "step": 5225 }, { "epoch": 0.8944035598151634, "grad_norm": 13.072036743164062, "learning_rate": 2.977752424415288e-05, "loss": 1.0962, "step": 5226 }, { "epoch": 0.8945747047749444, "grad_norm": 27.329818725585938, "learning_rate": 2.9783228750713063e-05, "loss": 2.492, "step": 5227 }, { "epoch": 0.8947458497347253, "grad_norm": 0.9053159952163696, "learning_rate": 2.9788933257273246e-05, "loss": 0.1874, "step": 5228 }, { "epoch": 0.8949169946945063, "grad_norm": 17.836803436279297, "learning_rate": 2.9794637763833426e-05, "loss": 1.5566, "step": 5229 }, { "epoch": 0.8950881396542871, "grad_norm": 15.548909187316895, "learning_rate": 2.980034227039361e-05, "loss": 1.2955, "step": 5230 }, { "epoch": 0.8952592846140681, "grad_norm": 19.422529220581055, "learning_rate": 2.9806046776953796e-05, "loss": 2.5122, "step": 5231 }, { "epoch": 0.895430429573849, "grad_norm": 0.8730620741844177, "learning_rate": 2.981175128351398e-05, "loss": 0.1783, "step": 5232 }, { "epoch": 0.89560157453363, "grad_norm": 22.175594329833984, "learning_rate": 2.981745579007416e-05, "loss": 1.6344, "step": 5233 }, { "epoch": 0.895772719493411, "grad_norm": 4.08912467956543, "learning_rate": 2.9823160296634343e-05, "loss": 0.3297, "step": 5234 }, { "epoch": 0.8959438644531919, "grad_norm": 20.610801696777344, "learning_rate": 2.9828864803194527e-05, "loss": 2.0051, "step": 5235 }, { "epoch": 0.8961150094129728, "grad_norm": 19.67643165588379, "learning_rate": 2.9834569309754706e-05, "loss": 1.281, "step": 5236 }, { "epoch": 0.8962861543727537, "grad_norm": 22.13687515258789, "learning_rate": 2.984027381631489e-05, "loss": 2.5087, "step": 5237 }, { "epoch": 0.8964572993325347, "grad_norm": 5.210666656494141, "learning_rate": 2.9845978322875073e-05, "loss": 0.4318, "step": 5238 }, { "epoch": 0.8966284442923156, "grad_norm": 31.837879180908203, "learning_rate": 2.9851682829435257e-05, "loss": 3.9733, "step": 5239 }, { "epoch": 0.8967995892520966, "grad_norm": 24.229183197021484, "learning_rate": 2.9857387335995437e-05, "loss": 2.0104, "step": 5240 }, { "epoch": 0.8969707342118775, "grad_norm": 62.011810302734375, "learning_rate": 2.986309184255562e-05, "loss": 7.4416, "step": 5241 }, { "epoch": 0.8971418791716584, "grad_norm": 23.459850311279297, "learning_rate": 2.9868796349115803e-05, "loss": 2.7344, "step": 5242 }, { "epoch": 0.8973130241314393, "grad_norm": 106.41157531738281, "learning_rate": 2.9874500855675983e-05, "loss": 8.8184, "step": 5243 }, { "epoch": 0.8974841690912203, "grad_norm": 18.093460083007812, "learning_rate": 2.9880205362236167e-05, "loss": 1.3483, "step": 5244 }, { "epoch": 0.8976553140510012, "grad_norm": 6.958301544189453, "learning_rate": 2.988590986879635e-05, "loss": 0.5888, "step": 5245 }, { "epoch": 0.8978264590107822, "grad_norm": 28.051706314086914, "learning_rate": 2.9891614375356534e-05, "loss": 3.8837, "step": 5246 }, { "epoch": 0.897997603970563, "grad_norm": 16.499065399169922, "learning_rate": 2.9897318881916713e-05, "loss": 1.548, "step": 5247 }, { "epoch": 0.898168748930344, "grad_norm": 24.6423282623291, "learning_rate": 2.9903023388476897e-05, "loss": 2.0936, "step": 5248 }, { "epoch": 0.8983398938901249, "grad_norm": 23.600177764892578, "learning_rate": 2.990872789503708e-05, "loss": 2.4881, "step": 5249 }, { "epoch": 0.8985110388499059, "grad_norm": 6.3298163414001465, "learning_rate": 2.991443240159726e-05, "loss": 0.8614, "step": 5250 }, { "epoch": 0.8986821838096868, "grad_norm": 84.1204833984375, "learning_rate": 2.9920136908157444e-05, "loss": 7.2037, "step": 5251 }, { "epoch": 0.8988533287694678, "grad_norm": 13.504063606262207, "learning_rate": 2.9925841414717627e-05, "loss": 1.2107, "step": 5252 }, { "epoch": 0.8990244737292487, "grad_norm": 10.01652717590332, "learning_rate": 2.9931545921277807e-05, "loss": 0.6106, "step": 5253 }, { "epoch": 0.8991956186890296, "grad_norm": 139.29315185546875, "learning_rate": 2.9937250427837994e-05, "loss": 8.8993, "step": 5254 }, { "epoch": 0.8993667636488105, "grad_norm": 31.561298370361328, "learning_rate": 2.9942954934398177e-05, "loss": 3.2313, "step": 5255 }, { "epoch": 0.8995379086085915, "grad_norm": 2.1672093868255615, "learning_rate": 2.994865944095836e-05, "loss": 0.2206, "step": 5256 }, { "epoch": 0.8997090535683724, "grad_norm": 6.4866414070129395, "learning_rate": 2.995436394751854e-05, "loss": 0.5123, "step": 5257 }, { "epoch": 0.8998801985281534, "grad_norm": 12.993927955627441, "learning_rate": 2.9960068454078724e-05, "loss": 1.0112, "step": 5258 }, { "epoch": 0.9000513434879343, "grad_norm": 27.867324829101562, "learning_rate": 2.9965772960638907e-05, "loss": 3.0155, "step": 5259 } ], "logging_steps": 1, "max_steps": 17529, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1753, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }