{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.987886944818304, "eval_steps": 500, "global_step": 555, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005383580080753701, "grad_norm": 9.846100807189941, "learning_rate": 5.882352941176471e-07, "loss": 1.3462, "step": 1 }, { "epoch": 0.010767160161507403, "grad_norm": 10.834526062011719, "learning_rate": 1.1764705882352942e-06, "loss": 1.4331, "step": 2 }, { "epoch": 0.016150740242261104, "grad_norm": 8.454448699951172, "learning_rate": 1.7647058823529414e-06, "loss": 1.2743, "step": 3 }, { "epoch": 0.021534320323014805, "grad_norm": 9.057403564453125, "learning_rate": 2.3529411764705885e-06, "loss": 1.2552, "step": 4 }, { "epoch": 0.026917900403768506, "grad_norm": 7.3954267501831055, "learning_rate": 2.9411764705882355e-06, "loss": 1.2539, "step": 5 }, { "epoch": 0.03230148048452221, "grad_norm": 4.456072807312012, "learning_rate": 3.529411764705883e-06, "loss": 0.9338, "step": 6 }, { "epoch": 0.03768506056527591, "grad_norm": 1.6015737056732178, "learning_rate": 4.11764705882353e-06, "loss": 0.7819, "step": 7 }, { "epoch": 0.04306864064602961, "grad_norm": 1.523898959159851, "learning_rate": 4.705882352941177e-06, "loss": 0.6926, "step": 8 }, { "epoch": 0.04845222072678331, "grad_norm": 1.4889625310897827, "learning_rate": 5.294117647058824e-06, "loss": 0.6867, "step": 9 }, { "epoch": 0.05383580080753701, "grad_norm": 1.1910319328308105, "learning_rate": 5.882352941176471e-06, "loss": 0.5507, "step": 10 }, { "epoch": 0.059219380888290714, "grad_norm": 1.136652946472168, "learning_rate": 6.470588235294119e-06, "loss": 0.6116, "step": 11 }, { "epoch": 0.06460296096904442, "grad_norm": 1.6838195323944092, "learning_rate": 7.058823529411766e-06, "loss": 0.656, "step": 12 }, { "epoch": 0.06998654104979811, "grad_norm": 12.243623733520508, "learning_rate": 7.647058823529411e-06, "loss": 0.6089, "step": 13 }, { "epoch": 0.07537012113055182, "grad_norm": 2.4841673374176025, "learning_rate": 8.23529411764706e-06, "loss": 0.5457, "step": 14 }, { "epoch": 0.08075370121130551, "grad_norm": 1.2509280443191528, "learning_rate": 8.823529411764707e-06, "loss": 0.6457, "step": 15 }, { "epoch": 0.08613728129205922, "grad_norm": 1.0484827756881714, "learning_rate": 9.411764705882354e-06, "loss": 0.5583, "step": 16 }, { "epoch": 0.09152086137281291, "grad_norm": 0.8880680203437805, "learning_rate": 1e-05, "loss": 0.5628, "step": 17 }, { "epoch": 0.09690444145356662, "grad_norm": 0.9158921241760254, "learning_rate": 9.999914754008063e-06, "loss": 0.5625, "step": 18 }, { "epoch": 0.10228802153432032, "grad_norm": 0.9270734786987305, "learning_rate": 9.999659018938999e-06, "loss": 0.5319, "step": 19 }, { "epoch": 0.10767160161507403, "grad_norm": 0.9035325050354004, "learning_rate": 9.999232803512967e-06, "loss": 0.518, "step": 20 }, { "epoch": 0.11305518169582772, "grad_norm": 1.2811542749404907, "learning_rate": 9.998636122263227e-06, "loss": 0.5504, "step": 21 }, { "epoch": 0.11843876177658143, "grad_norm": 0.8128458261489868, "learning_rate": 9.997868995535658e-06, "loss": 0.5344, "step": 22 }, { "epoch": 0.12382234185733512, "grad_norm": 0.8413318991661072, "learning_rate": 9.996931449488046e-06, "loss": 0.5376, "step": 23 }, { "epoch": 0.12920592193808883, "grad_norm": 0.8115915656089783, "learning_rate": 9.99582351608921e-06, "loss": 0.5844, "step": 24 }, { "epoch": 0.13458950201884254, "grad_norm": 0.8173759579658508, "learning_rate": 9.994545233117904e-06, "loss": 0.5126, "step": 25 }, { "epoch": 0.13997308209959622, "grad_norm": 0.7367566823959351, "learning_rate": 9.993096644161526e-06, "loss": 0.5311, "step": 26 }, { "epoch": 0.14535666218034993, "grad_norm": 0.7710299491882324, "learning_rate": 9.991477798614638e-06, "loss": 0.5286, "step": 27 }, { "epoch": 0.15074024226110364, "grad_norm": 0.7534223794937134, "learning_rate": 9.989688751677277e-06, "loss": 0.5462, "step": 28 }, { "epoch": 0.15612382234185734, "grad_norm": 0.7281956672668457, "learning_rate": 9.987729564353077e-06, "loss": 0.5298, "step": 29 }, { "epoch": 0.16150740242261102, "grad_norm": 0.6779235601425171, "learning_rate": 9.985600303447185e-06, "loss": 0.4758, "step": 30 }, { "epoch": 0.16689098250336473, "grad_norm": 0.7668159008026123, "learning_rate": 9.98330104156398e-06, "loss": 0.5493, "step": 31 }, { "epoch": 0.17227456258411844, "grad_norm": 0.7769574522972107, "learning_rate": 9.980831857104612e-06, "loss": 0.5033, "step": 32 }, { "epoch": 0.17765814266487215, "grad_norm": 0.7682322263717651, "learning_rate": 9.978192834264307e-06, "loss": 0.4927, "step": 33 }, { "epoch": 0.18304172274562583, "grad_norm": 0.7225139737129211, "learning_rate": 9.975384063029516e-06, "loss": 0.4922, "step": 34 }, { "epoch": 0.18842530282637954, "grad_norm": 0.7247219681739807, "learning_rate": 9.972405639174833e-06, "loss": 0.5248, "step": 35 }, { "epoch": 0.19380888290713325, "grad_norm": 0.7795732617378235, "learning_rate": 9.96925766425974e-06, "loss": 0.5207, "step": 36 }, { "epoch": 0.19919246298788695, "grad_norm": 0.6990232467651367, "learning_rate": 9.965940245625131e-06, "loss": 0.5078, "step": 37 }, { "epoch": 0.20457604306864063, "grad_norm": 0.7676703929901123, "learning_rate": 9.962453496389665e-06, "loss": 0.4908, "step": 38 }, { "epoch": 0.20995962314939434, "grad_norm": 0.7075534462928772, "learning_rate": 9.958797535445898e-06, "loss": 0.5156, "step": 39 }, { "epoch": 0.21534320323014805, "grad_norm": 0.7213850021362305, "learning_rate": 9.95497248745624e-06, "loss": 0.5212, "step": 40 }, { "epoch": 0.22072678331090176, "grad_norm": 0.703669011592865, "learning_rate": 9.950978482848694e-06, "loss": 0.5124, "step": 41 }, { "epoch": 0.22611036339165544, "grad_norm": 0.7526930570602417, "learning_rate": 9.946815657812416e-06, "loss": 0.537, "step": 42 }, { "epoch": 0.23149394347240915, "grad_norm": 0.7019714117050171, "learning_rate": 9.94248415429306e-06, "loss": 0.5013, "step": 43 }, { "epoch": 0.23687752355316286, "grad_norm": 0.753139078617096, "learning_rate": 9.937984119987958e-06, "loss": 0.5205, "step": 44 }, { "epoch": 0.24226110363391656, "grad_norm": 0.7210888862609863, "learning_rate": 9.93331570834106e-06, "loss": 0.4658, "step": 45 }, { "epoch": 0.24764468371467024, "grad_norm": 0.6602186560630798, "learning_rate": 9.928479078537722e-06, "loss": 0.4819, "step": 46 }, { "epoch": 0.253028263795424, "grad_norm": 0.7594957947731018, "learning_rate": 9.923474395499266e-06, "loss": 0.5389, "step": 47 }, { "epoch": 0.25841184387617766, "grad_norm": 0.7201584577560425, "learning_rate": 9.91830182987736e-06, "loss": 0.5362, "step": 48 }, { "epoch": 0.26379542395693134, "grad_norm": 0.8874572515487671, "learning_rate": 9.912961558048196e-06, "loss": 0.5384, "step": 49 }, { "epoch": 0.2691790040376851, "grad_norm": 0.6909434199333191, "learning_rate": 9.907453762106484e-06, "loss": 0.5042, "step": 50 }, { "epoch": 0.27456258411843876, "grad_norm": 0.6489965319633484, "learning_rate": 9.901778629859236e-06, "loss": 0.4282, "step": 51 }, { "epoch": 0.27994616419919244, "grad_norm": 0.6962871551513672, "learning_rate": 9.895936354819362e-06, "loss": 0.549, "step": 52 }, { "epoch": 0.2853297442799462, "grad_norm": 0.6420189738273621, "learning_rate": 9.889927136199075e-06, "loss": 0.5255, "step": 53 }, { "epoch": 0.29071332436069985, "grad_norm": 0.6697545647621155, "learning_rate": 9.883751178903095e-06, "loss": 0.5122, "step": 54 }, { "epoch": 0.2960969044414536, "grad_norm": 0.6961387395858765, "learning_rate": 9.877408693521664e-06, "loss": 0.5277, "step": 55 }, { "epoch": 0.30148048452220727, "grad_norm": 0.721615195274353, "learning_rate": 9.870899896323368e-06, "loss": 0.5309, "step": 56 }, { "epoch": 0.30686406460296095, "grad_norm": 0.7135268449783325, "learning_rate": 9.864225009247753e-06, "loss": 0.5451, "step": 57 }, { "epoch": 0.3122476446837147, "grad_norm": 0.6227580904960632, "learning_rate": 9.857384259897768e-06, "loss": 0.4653, "step": 58 }, { "epoch": 0.31763122476446837, "grad_norm": 0.6683838963508606, "learning_rate": 9.850377881532e-06, "loss": 0.549, "step": 59 }, { "epoch": 0.32301480484522205, "grad_norm": 0.6848832964897156, "learning_rate": 9.843206113056715e-06, "loss": 0.4432, "step": 60 }, { "epoch": 0.3283983849259758, "grad_norm": 0.6483569145202637, "learning_rate": 9.835869199017725e-06, "loss": 0.467, "step": 61 }, { "epoch": 0.33378196500672946, "grad_norm": 0.7005964517593384, "learning_rate": 9.828367389592034e-06, "loss": 0.5185, "step": 62 }, { "epoch": 0.3391655450874832, "grad_norm": 0.6160753965377808, "learning_rate": 9.820700940579312e-06, "loss": 0.4116, "step": 63 }, { "epoch": 0.3445491251682369, "grad_norm": 0.6587129235267639, "learning_rate": 9.812870113393185e-06, "loss": 0.5197, "step": 64 }, { "epoch": 0.34993270524899056, "grad_norm": 1.4088473320007324, "learning_rate": 9.804875175052304e-06, "loss": 0.4992, "step": 65 }, { "epoch": 0.3553162853297443, "grad_norm": 0.7397728562355042, "learning_rate": 9.796716398171248e-06, "loss": 0.5006, "step": 66 }, { "epoch": 0.360699865410498, "grad_norm": 0.6741731762886047, "learning_rate": 9.788394060951228e-06, "loss": 0.4474, "step": 67 }, { "epoch": 0.36608344549125166, "grad_norm": 0.6397544741630554, "learning_rate": 9.779908447170602e-06, "loss": 0.4835, "step": 68 }, { "epoch": 0.3714670255720054, "grad_norm": 0.6985306739807129, "learning_rate": 9.771259846175195e-06, "loss": 0.476, "step": 69 }, { "epoch": 0.3768506056527591, "grad_norm": 0.64960116147995, "learning_rate": 9.762448552868433e-06, "loss": 0.4628, "step": 70 }, { "epoch": 0.3822341857335128, "grad_norm": 0.6351596713066101, "learning_rate": 9.753474867701294e-06, "loss": 0.4925, "step": 71 }, { "epoch": 0.3876177658142665, "grad_norm": 0.6702280640602112, "learning_rate": 9.744339096662056e-06, "loss": 0.482, "step": 72 }, { "epoch": 0.39300134589502017, "grad_norm": 0.5831217169761658, "learning_rate": 9.735041551265862e-06, "loss": 0.4794, "step": 73 }, { "epoch": 0.3983849259757739, "grad_norm": 0.6301687359809875, "learning_rate": 9.725582548544106e-06, "loss": 0.4483, "step": 74 }, { "epoch": 0.4037685060565276, "grad_norm": 0.6406306624412537, "learning_rate": 9.715962411033614e-06, "loss": 0.4514, "step": 75 }, { "epoch": 0.40915208613728127, "grad_norm": 0.6490384936332703, "learning_rate": 9.706181466765654e-06, "loss": 0.4615, "step": 76 }, { "epoch": 0.414535666218035, "grad_norm": 0.6236180663108826, "learning_rate": 9.696240049254744e-06, "loss": 0.4375, "step": 77 }, { "epoch": 0.4199192462987887, "grad_norm": 0.6604640483856201, "learning_rate": 9.686138497487282e-06, "loss": 0.3954, "step": 78 }, { "epoch": 0.4253028263795424, "grad_norm": 0.6148284673690796, "learning_rate": 9.675877155909989e-06, "loss": 0.4611, "step": 79 }, { "epoch": 0.4306864064602961, "grad_norm": 0.6233279705047607, "learning_rate": 9.66545637441816e-06, "loss": 0.4489, "step": 80 }, { "epoch": 0.4360699865410498, "grad_norm": 0.6342514157295227, "learning_rate": 9.654876508343739e-06, "loss": 0.4852, "step": 81 }, { "epoch": 0.4414535666218035, "grad_norm": 0.6237147450447083, "learning_rate": 9.644137918443198e-06, "loss": 0.4351, "step": 82 }, { "epoch": 0.4468371467025572, "grad_norm": 0.6277084946632385, "learning_rate": 9.633240970885231e-06, "loss": 0.4747, "step": 83 }, { "epoch": 0.4522207267833109, "grad_norm": 0.6557124257087708, "learning_rate": 9.622186037238286e-06, "loss": 0.475, "step": 84 }, { "epoch": 0.4576043068640646, "grad_norm": 0.6205596923828125, "learning_rate": 9.610973494457873e-06, "loss": 0.4732, "step": 85 }, { "epoch": 0.4629878869448183, "grad_norm": 0.6538224816322327, "learning_rate": 9.599603724873725e-06, "loss": 0.4817, "step": 86 }, { "epoch": 0.46837146702557203, "grad_norm": 0.6803449392318726, "learning_rate": 9.588077116176756e-06, "loss": 0.5178, "step": 87 }, { "epoch": 0.4737550471063257, "grad_norm": 0.6003801226615906, "learning_rate": 9.576394061405847e-06, "loss": 0.4771, "step": 88 }, { "epoch": 0.4791386271870794, "grad_norm": 0.6364747881889343, "learning_rate": 9.564554958934432e-06, "loss": 0.5041, "step": 89 }, { "epoch": 0.4845222072678331, "grad_norm": 0.6516885757446289, "learning_rate": 9.55256021245692e-06, "loss": 0.5322, "step": 90 }, { "epoch": 0.4899057873485868, "grad_norm": 0.6385886073112488, "learning_rate": 9.540410230974943e-06, "loss": 0.4747, "step": 91 }, { "epoch": 0.4952893674293405, "grad_norm": 0.6282247304916382, "learning_rate": 9.52810542878339e-06, "loss": 0.4859, "step": 92 }, { "epoch": 0.5006729475100942, "grad_norm": 0.6206268668174744, "learning_rate": 9.515646225456283e-06, "loss": 0.4458, "step": 93 }, { "epoch": 0.506056527590848, "grad_norm": 0.6404337882995605, "learning_rate": 9.503033045832484e-06, "loss": 0.5434, "step": 94 }, { "epoch": 0.5114401076716016, "grad_norm": 0.5463396906852722, "learning_rate": 9.490266320001195e-06, "loss": 0.4286, "step": 95 }, { "epoch": 0.5168236877523553, "grad_norm": 0.6801166534423828, "learning_rate": 9.4773464832873e-06, "loss": 0.4889, "step": 96 }, { "epoch": 0.522207267833109, "grad_norm": 0.6011826992034912, "learning_rate": 9.464273976236518e-06, "loss": 0.5188, "step": 97 }, { "epoch": 0.5275908479138627, "grad_norm": 0.5876375436782837, "learning_rate": 9.451049244600381e-06, "loss": 0.4622, "step": 98 }, { "epoch": 0.5329744279946165, "grad_norm": 0.582517147064209, "learning_rate": 9.437672739321034e-06, "loss": 0.4523, "step": 99 }, { "epoch": 0.5383580080753702, "grad_norm": 0.5836907029151917, "learning_rate": 9.424144916515863e-06, "loss": 0.498, "step": 100 }, { "epoch": 0.5437415881561238, "grad_norm": 0.5619045495986938, "learning_rate": 9.410466237461937e-06, "loss": 0.4475, "step": 101 }, { "epoch": 0.5491251682368775, "grad_norm": 0.6337983012199402, "learning_rate": 9.396637168580282e-06, "loss": 0.4562, "step": 102 }, { "epoch": 0.5545087483176312, "grad_norm": 0.6391755938529968, "learning_rate": 9.382658181419977e-06, "loss": 0.4738, "step": 103 }, { "epoch": 0.5598923283983849, "grad_norm": 0.6735963821411133, "learning_rate": 9.36852975264207e-06, "loss": 0.4888, "step": 104 }, { "epoch": 0.5652759084791387, "grad_norm": 0.5379722118377686, "learning_rate": 9.354252364003334e-06, "loss": 0.3988, "step": 105 }, { "epoch": 0.5706594885598923, "grad_norm": 0.6036385893821716, "learning_rate": 9.339826502339828e-06, "loss": 0.5088, "step": 106 }, { "epoch": 0.576043068640646, "grad_norm": 0.6139518022537231, "learning_rate": 9.32525265955031e-06, "loss": 0.4708, "step": 107 }, { "epoch": 0.5814266487213997, "grad_norm": 0.5770635008811951, "learning_rate": 9.310531332579453e-06, "loss": 0.4981, "step": 108 }, { "epoch": 0.5868102288021534, "grad_norm": 0.6464108228683472, "learning_rate": 9.295663023400907e-06, "loss": 0.5121, "step": 109 }, { "epoch": 0.5921938088829072, "grad_norm": 0.6004317402839661, "learning_rate": 9.280648239000174e-06, "loss": 0.4751, "step": 110 }, { "epoch": 0.5975773889636609, "grad_norm": 0.5914390683174133, "learning_rate": 9.265487491357334e-06, "loss": 0.4878, "step": 111 }, { "epoch": 0.6029609690444145, "grad_norm": 0.6945117712020874, "learning_rate": 9.250181297429573e-06, "loss": 0.4927, "step": 112 }, { "epoch": 0.6083445491251682, "grad_norm": 0.5963965058326721, "learning_rate": 9.234730179133564e-06, "loss": 0.4405, "step": 113 }, { "epoch": 0.6137281292059219, "grad_norm": 0.6108909845352173, "learning_rate": 9.219134663327672e-06, "loss": 0.5301, "step": 114 }, { "epoch": 0.6191117092866757, "grad_norm": 0.590741753578186, "learning_rate": 9.203395281793979e-06, "loss": 0.4701, "step": 115 }, { "epoch": 0.6244952893674294, "grad_norm": 0.5966534614562988, "learning_rate": 9.187512571220166e-06, "loss": 0.4829, "step": 116 }, { "epoch": 0.629878869448183, "grad_norm": 0.5713053941726685, "learning_rate": 9.171487073181198e-06, "loss": 0.4208, "step": 117 }, { "epoch": 0.6352624495289367, "grad_norm": 0.6419247388839722, "learning_rate": 9.155319334120864e-06, "loss": 0.4565, "step": 118 }, { "epoch": 0.6406460296096904, "grad_norm": 0.5234012007713318, "learning_rate": 9.139009905333147e-06, "loss": 0.3937, "step": 119 }, { "epoch": 0.6460296096904441, "grad_norm": 0.5776930451393127, "learning_rate": 9.122559342943423e-06, "loss": 0.4677, "step": 120 }, { "epoch": 0.6514131897711979, "grad_norm": 0.5588910579681396, "learning_rate": 9.105968207889493e-06, "loss": 0.4171, "step": 121 }, { "epoch": 0.6567967698519516, "grad_norm": 0.5887078046798706, "learning_rate": 9.089237065902464e-06, "loss": 0.4209, "step": 122 }, { "epoch": 0.6621803499327052, "grad_norm": 0.5707204937934875, "learning_rate": 9.072366487487451e-06, "loss": 0.4502, "step": 123 }, { "epoch": 0.6675639300134589, "grad_norm": 0.5806924104690552, "learning_rate": 9.055357047904133e-06, "loss": 0.4428, "step": 124 }, { "epoch": 0.6729475100942126, "grad_norm": 0.6028096079826355, "learning_rate": 9.038209327147134e-06, "loss": 0.4816, "step": 125 }, { "epoch": 0.6783310901749664, "grad_norm": 0.592367947101593, "learning_rate": 9.020923909926233e-06, "loss": 0.49, "step": 126 }, { "epoch": 0.6837146702557201, "grad_norm": 0.6010198593139648, "learning_rate": 9.00350138564645e-06, "loss": 0.4971, "step": 127 }, { "epoch": 0.6890982503364738, "grad_norm": 0.5716829299926758, "learning_rate": 8.985942348387926e-06, "loss": 0.4828, "step": 128 }, { "epoch": 0.6944818304172274, "grad_norm": 0.527796745300293, "learning_rate": 8.968247396885685e-06, "loss": 0.4113, "step": 129 }, { "epoch": 0.6998654104979811, "grad_norm": 0.5992532968521118, "learning_rate": 8.950417134509201e-06, "loss": 0.4487, "step": 130 }, { "epoch": 0.7052489905787349, "grad_norm": 0.5818247199058533, "learning_rate": 8.932452169241838e-06, "loss": 0.4804, "step": 131 }, { "epoch": 0.7106325706594886, "grad_norm": 0.6332154870033264, "learning_rate": 8.914353113660107e-06, "loss": 0.5535, "step": 132 }, { "epoch": 0.7160161507402423, "grad_norm": 0.5611910820007324, "learning_rate": 8.89612058491279e-06, "loss": 0.4464, "step": 133 }, { "epoch": 0.721399730820996, "grad_norm": 0.5586318969726562, "learning_rate": 8.877755204699883e-06, "loss": 0.4606, "step": 134 }, { "epoch": 0.7267833109017496, "grad_norm": 0.5422524809837341, "learning_rate": 8.859257599251408e-06, "loss": 0.4452, "step": 135 }, { "epoch": 0.7321668909825033, "grad_norm": 0.5787152051925659, "learning_rate": 8.840628399306056e-06, "loss": 0.4997, "step": 136 }, { "epoch": 0.7375504710632571, "grad_norm": 0.5561872720718384, "learning_rate": 8.821868240089676e-06, "loss": 0.4712, "step": 137 }, { "epoch": 0.7429340511440108, "grad_norm": 0.629596471786499, "learning_rate": 8.802977761293625e-06, "loss": 0.5005, "step": 138 }, { "epoch": 0.7483176312247645, "grad_norm": 0.5670992136001587, "learning_rate": 8.783957607052941e-06, "loss": 0.4594, "step": 139 }, { "epoch": 0.7537012113055181, "grad_norm": 0.6181672811508179, "learning_rate": 8.764808425924392e-06, "loss": 0.48, "step": 140 }, { "epoch": 0.7590847913862718, "grad_norm": 0.5901859998703003, "learning_rate": 8.745530870864351e-06, "loss": 0.4121, "step": 141 }, { "epoch": 0.7644683714670256, "grad_norm": 0.5341172218322754, "learning_rate": 8.726125599206543e-06, "loss": 0.4905, "step": 142 }, { "epoch": 0.7698519515477793, "grad_norm": 0.6587361097335815, "learning_rate": 8.706593272639616e-06, "loss": 0.4846, "step": 143 }, { "epoch": 0.775235531628533, "grad_norm": 0.5404164791107178, "learning_rate": 8.686934557184594e-06, "loss": 0.4265, "step": 144 }, { "epoch": 0.7806191117092867, "grad_norm": 0.6407716870307922, "learning_rate": 8.667150123172159e-06, "loss": 0.5006, "step": 145 }, { "epoch": 0.7860026917900403, "grad_norm": 0.5715042948722839, "learning_rate": 8.647240645219787e-06, "loss": 0.4388, "step": 146 }, { "epoch": 0.7913862718707941, "grad_norm": 0.575707197189331, "learning_rate": 8.62720680220876e-06, "loss": 0.4626, "step": 147 }, { "epoch": 0.7967698519515478, "grad_norm": 0.5612806677818298, "learning_rate": 8.607049277261005e-06, "loss": 0.4644, "step": 148 }, { "epoch": 0.8021534320323015, "grad_norm": 0.5671082735061646, "learning_rate": 8.586768757715806e-06, "loss": 0.4442, "step": 149 }, { "epoch": 0.8075370121130552, "grad_norm": 0.598675012588501, "learning_rate": 8.566365935106367e-06, "loss": 0.4802, "step": 150 }, { "epoch": 0.8129205921938089, "grad_norm": 0.546492338180542, "learning_rate": 8.545841505136224e-06, "loss": 0.4551, "step": 151 }, { "epoch": 0.8183041722745625, "grad_norm": 0.5794171094894409, "learning_rate": 8.525196167655539e-06, "loss": 0.4755, "step": 152 }, { "epoch": 0.8236877523553163, "grad_norm": 0.5300067663192749, "learning_rate": 8.504430626637215e-06, "loss": 0.4233, "step": 153 }, { "epoch": 0.82907133243607, "grad_norm": 0.5738832950592041, "learning_rate": 8.483545590152915e-06, "loss": 0.5016, "step": 154 }, { "epoch": 0.8344549125168237, "grad_norm": 0.5611905455589294, "learning_rate": 8.462541770348896e-06, "loss": 0.4444, "step": 155 }, { "epoch": 0.8398384925975774, "grad_norm": 0.554915189743042, "learning_rate": 8.441419883421742e-06, "loss": 0.4603, "step": 156 }, { "epoch": 0.845222072678331, "grad_norm": 0.5979538559913635, "learning_rate": 8.42018064959393e-06, "loss": 0.5154, "step": 157 }, { "epoch": 0.8506056527590848, "grad_norm": 0.54628986120224, "learning_rate": 8.398824793089287e-06, "loss": 0.3947, "step": 158 }, { "epoch": 0.8559892328398385, "grad_norm": 0.5486013889312744, "learning_rate": 8.377353042108278e-06, "loss": 0.4317, "step": 159 }, { "epoch": 0.8613728129205922, "grad_norm": 0.5597162246704102, "learning_rate": 8.355766128803192e-06, "loss": 0.4471, "step": 160 }, { "epoch": 0.8667563930013459, "grad_norm": 0.5271990895271301, "learning_rate": 8.334064789253157e-06, "loss": 0.3983, "step": 161 }, { "epoch": 0.8721399730820996, "grad_norm": 0.5897473692893982, "learning_rate": 8.312249763439066e-06, "loss": 0.4504, "step": 162 }, { "epoch": 0.8775235531628532, "grad_norm": 0.6026889085769653, "learning_rate": 8.29032179521832e-06, "loss": 0.4785, "step": 163 }, { "epoch": 0.882907133243607, "grad_norm": 0.5334970951080322, "learning_rate": 8.268281632299483e-06, "loss": 0.5166, "step": 164 }, { "epoch": 0.8882907133243607, "grad_norm": 0.568034827709198, "learning_rate": 8.246130026216777e-06, "loss": 0.4354, "step": 165 }, { "epoch": 0.8936742934051144, "grad_norm": 0.5437761545181274, "learning_rate": 8.22386773230445e-06, "loss": 0.4398, "step": 166 }, { "epoch": 0.8990578734858681, "grad_norm": 0.5542709231376648, "learning_rate": 8.201495509671036e-06, "loss": 0.4074, "step": 167 }, { "epoch": 0.9044414535666218, "grad_norm": 0.5601239800453186, "learning_rate": 8.179014121173461e-06, "loss": 0.4764, "step": 168 }, { "epoch": 0.9098250336473755, "grad_norm": 0.5747672319412231, "learning_rate": 8.156424333391026e-06, "loss": 0.4537, "step": 169 }, { "epoch": 0.9152086137281292, "grad_norm": 0.56292325258255, "learning_rate": 8.13372691659928e-06, "loss": 0.4641, "step": 170 }, { "epoch": 0.9205921938088829, "grad_norm": 0.5486699938774109, "learning_rate": 8.110922644743747e-06, "loss": 0.4489, "step": 171 }, { "epoch": 0.9259757738896366, "grad_norm": 0.5740337371826172, "learning_rate": 8.088012295413536e-06, "loss": 0.475, "step": 172 }, { "epoch": 0.9313593539703903, "grad_norm": 0.5686214566230774, "learning_rate": 8.064996649814826e-06, "loss": 0.4182, "step": 173 }, { "epoch": 0.9367429340511441, "grad_norm": 0.5474251508712769, "learning_rate": 8.041876492744239e-06, "loss": 0.4011, "step": 174 }, { "epoch": 0.9421265141318977, "grad_norm": 0.5313992500305176, "learning_rate": 8.018652612562061e-06, "loss": 0.4598, "step": 175 }, { "epoch": 0.9475100942126514, "grad_norm": 0.5516825914382935, "learning_rate": 7.99532580116537e-06, "loss": 0.3926, "step": 176 }, { "epoch": 0.9528936742934051, "grad_norm": 0.567688524723053, "learning_rate": 7.971896853961043e-06, "loss": 0.442, "step": 177 }, { "epoch": 0.9582772543741588, "grad_norm": 0.5734118819236755, "learning_rate": 7.948366569838612e-06, "loss": 0.4221, "step": 178 }, { "epoch": 0.9636608344549125, "grad_norm": 0.5655908584594727, "learning_rate": 7.924735751143044e-06, "loss": 0.51, "step": 179 }, { "epoch": 0.9690444145356663, "grad_norm": 0.5655565857887268, "learning_rate": 7.901005203647373e-06, "loss": 0.3944, "step": 180 }, { "epoch": 0.9744279946164199, "grad_norm": 0.6050511598587036, "learning_rate": 7.877175736525217e-06, "loss": 0.4433, "step": 181 }, { "epoch": 0.9798115746971736, "grad_norm": 0.5776525139808655, "learning_rate": 7.853248162323208e-06, "loss": 0.5174, "step": 182 }, { "epoch": 0.9851951547779273, "grad_norm": 0.5618104338645935, "learning_rate": 7.829223296933259e-06, "loss": 0.4297, "step": 183 }, { "epoch": 0.990578734858681, "grad_norm": 0.5539780855178833, "learning_rate": 7.805101959564768e-06, "loss": 0.4988, "step": 184 }, { "epoch": 0.9959623149394348, "grad_norm": 0.5038336515426636, "learning_rate": 7.780884972716663e-06, "loss": 0.3906, "step": 185 }, { "epoch": 1.0013458950201883, "grad_norm": 0.6332990527153015, "learning_rate": 7.75657316214937e-06, "loss": 0.4842, "step": 186 }, { "epoch": 1.0067294751009421, "grad_norm": 0.41341373324394226, "learning_rate": 7.732167356856656e-06, "loss": 0.2382, "step": 187 }, { "epoch": 1.012113055181696, "grad_norm": 0.5181017518043518, "learning_rate": 7.70766838903735e-06, "loss": 0.2906, "step": 188 }, { "epoch": 1.0174966352624495, "grad_norm": 0.4716527760028839, "learning_rate": 7.683077094066981e-06, "loss": 0.2688, "step": 189 }, { "epoch": 1.0228802153432033, "grad_norm": 0.48120298981666565, "learning_rate": 7.65839431046928e-06, "loss": 0.2854, "step": 190 }, { "epoch": 1.0282637954239569, "grad_norm": 0.4271540343761444, "learning_rate": 7.63362087988759e-06, "loss": 0.2093, "step": 191 }, { "epoch": 1.0336473755047106, "grad_norm": 0.5108612775802612, "learning_rate": 7.608757647056186e-06, "loss": 0.2317, "step": 192 }, { "epoch": 1.0390309555854644, "grad_norm": 0.4512535333633423, "learning_rate": 7.583805459771443e-06, "loss": 0.249, "step": 193 }, { "epoch": 1.044414535666218, "grad_norm": 0.4441206455230713, "learning_rate": 7.5587651688629405e-06, "loss": 0.2657, "step": 194 }, { "epoch": 1.0497981157469718, "grad_norm": 0.46206924319267273, "learning_rate": 7.533637628164456e-06, "loss": 0.2207, "step": 195 }, { "epoch": 1.0551816958277254, "grad_norm": 0.52704918384552, "learning_rate": 7.508423694484841e-06, "loss": 0.2705, "step": 196 }, { "epoch": 1.0605652759084792, "grad_norm": 0.5095883011817932, "learning_rate": 7.483124227578811e-06, "loss": 0.2428, "step": 197 }, { "epoch": 1.065948855989233, "grad_norm": 0.5210585594177246, "learning_rate": 7.457740090117627e-06, "loss": 0.2344, "step": 198 }, { "epoch": 1.0713324360699865, "grad_norm": 0.46602457761764526, "learning_rate": 7.432272147659678e-06, "loss": 0.241, "step": 199 }, { "epoch": 1.0767160161507403, "grad_norm": 0.4984048306941986, "learning_rate": 7.406721268620975e-06, "loss": 0.2388, "step": 200 }, { "epoch": 1.0820995962314939, "grad_norm": 0.5057407021522522, "learning_rate": 7.381088324245526e-06, "loss": 0.23, "step": 201 }, { "epoch": 1.0874831763122477, "grad_norm": 0.4600376784801483, "learning_rate": 7.355374188575639e-06, "loss": 0.2022, "step": 202 }, { "epoch": 1.0928667563930015, "grad_norm": 0.5112857818603516, "learning_rate": 7.3295797384221156e-06, "loss": 0.2333, "step": 203 }, { "epoch": 1.098250336473755, "grad_norm": 0.527310848236084, "learning_rate": 7.303705853334353e-06, "loss": 0.242, "step": 204 }, { "epoch": 1.1036339165545088, "grad_norm": 0.5270518660545349, "learning_rate": 7.277753415570349e-06, "loss": 0.2417, "step": 205 }, { "epoch": 1.1090174966352624, "grad_norm": 0.5107465386390686, "learning_rate": 7.2517233100666255e-06, "loss": 0.2162, "step": 206 }, { "epoch": 1.1144010767160162, "grad_norm": 0.5194461345672607, "learning_rate": 7.225616424408045e-06, "loss": 0.255, "step": 207 }, { "epoch": 1.1197846567967698, "grad_norm": 0.5149202346801758, "learning_rate": 7.199433648797558e-06, "loss": 0.2593, "step": 208 }, { "epoch": 1.1251682368775235, "grad_norm": 0.5071370005607605, "learning_rate": 7.1731758760258315e-06, "loss": 0.2427, "step": 209 }, { "epoch": 1.1305518169582773, "grad_norm": 0.4726599454879761, "learning_rate": 7.146844001440823e-06, "loss": 0.2344, "step": 210 }, { "epoch": 1.135935397039031, "grad_norm": 0.43700599670410156, "learning_rate": 7.120438922917237e-06, "loss": 0.1889, "step": 211 }, { "epoch": 1.1413189771197847, "grad_norm": 0.4685395359992981, "learning_rate": 7.09396154082592e-06, "loss": 0.2127, "step": 212 }, { "epoch": 1.1467025572005383, "grad_norm": 0.4829280972480774, "learning_rate": 7.067412758003154e-06, "loss": 0.2271, "step": 213 }, { "epoch": 1.152086137281292, "grad_norm": 0.4522843360900879, "learning_rate": 7.040793479719864e-06, "loss": 0.217, "step": 214 }, { "epoch": 1.1574697173620458, "grad_norm": 0.42811307311058044, "learning_rate": 7.014104613650767e-06, "loss": 0.1944, "step": 215 }, { "epoch": 1.1628532974427994, "grad_norm": 0.465836763381958, "learning_rate": 6.987347069843406e-06, "loss": 0.2352, "step": 216 }, { "epoch": 1.1682368775235532, "grad_norm": 0.5526953339576721, "learning_rate": 6.96052176068713e-06, "loss": 0.2839, "step": 217 }, { "epoch": 1.1736204576043068, "grad_norm": 0.5280203223228455, "learning_rate": 6.93362960088197e-06, "loss": 0.2398, "step": 218 }, { "epoch": 1.1790040376850606, "grad_norm": 0.4957825839519501, "learning_rate": 6.906671507407463e-06, "loss": 0.2391, "step": 219 }, { "epoch": 1.1843876177658144, "grad_norm": 0.47294560074806213, "learning_rate": 6.879648399491376e-06, "loss": 0.1976, "step": 220 }, { "epoch": 1.189771197846568, "grad_norm": 0.45914170145988464, "learning_rate": 6.852561198578364e-06, "loss": 0.1903, "step": 221 }, { "epoch": 1.1951547779273217, "grad_norm": 0.5234487652778625, "learning_rate": 6.825410828298552e-06, "loss": 0.2548, "step": 222 }, { "epoch": 1.2005383580080753, "grad_norm": 0.4907478094100952, "learning_rate": 6.79819821443604e-06, "loss": 0.2203, "step": 223 }, { "epoch": 1.205921938088829, "grad_norm": 0.488614559173584, "learning_rate": 6.7709242848973326e-06, "loss": 0.1889, "step": 224 }, { "epoch": 1.2113055181695827, "grad_norm": 0.42549803853034973, "learning_rate": 6.743589969679697e-06, "loss": 0.173, "step": 225 }, { "epoch": 1.2166890982503364, "grad_norm": 0.5077455639839172, "learning_rate": 6.716196200839465e-06, "loss": 0.2301, "step": 226 }, { "epoch": 1.2220726783310902, "grad_norm": 0.4867914915084839, "learning_rate": 6.6887439124602295e-06, "loss": 0.2455, "step": 227 }, { "epoch": 1.2274562584118438, "grad_norm": 0.4867931306362152, "learning_rate": 6.661234040621017e-06, "loss": 0.201, "step": 228 }, { "epoch": 1.2328398384925976, "grad_norm": 0.4922155737876892, "learning_rate": 6.63366752336435e-06, "loss": 0.2068, "step": 229 }, { "epoch": 1.2382234185733512, "grad_norm": 0.5053098797798157, "learning_rate": 6.606045300664272e-06, "loss": 0.2237, "step": 230 }, { "epoch": 1.243606998654105, "grad_norm": 0.5080535411834717, "learning_rate": 6.578368314394293e-06, "loss": 0.2189, "step": 231 }, { "epoch": 1.2489905787348587, "grad_norm": 0.4673517346382141, "learning_rate": 6.550637508295272e-06, "loss": 0.202, "step": 232 }, { "epoch": 1.2543741588156123, "grad_norm": 0.5345984697341919, "learning_rate": 6.52285382794324e-06, "loss": 0.2197, "step": 233 }, { "epoch": 1.259757738896366, "grad_norm": 0.4533955752849579, "learning_rate": 6.49501822071715e-06, "loss": 0.1996, "step": 234 }, { "epoch": 1.2651413189771197, "grad_norm": 0.48141008615493774, "learning_rate": 6.467131635766585e-06, "loss": 0.225, "step": 235 }, { "epoch": 1.2705248990578735, "grad_norm": 0.5605146288871765, "learning_rate": 6.439195023979381e-06, "loss": 0.2769, "step": 236 }, { "epoch": 1.2759084791386273, "grad_norm": 0.4871980845928192, "learning_rate": 6.411209337949214e-06, "loss": 0.2054, "step": 237 }, { "epoch": 1.2812920592193808, "grad_norm": 0.5211129784584045, "learning_rate": 6.383175531943106e-06, "loss": 0.2682, "step": 238 }, { "epoch": 1.2866756393001346, "grad_norm": 0.5319603085517883, "learning_rate": 6.355094561868902e-06, "loss": 0.2581, "step": 239 }, { "epoch": 1.2920592193808882, "grad_norm": 0.4909502863883972, "learning_rate": 6.3269673852426575e-06, "loss": 0.208, "step": 240 }, { "epoch": 1.297442799461642, "grad_norm": 0.5048267245292664, "learning_rate": 6.298794961156004e-06, "loss": 0.2213, "step": 241 }, { "epoch": 1.3028263795423958, "grad_norm": 0.45375633239746094, "learning_rate": 6.270578250243437e-06, "loss": 0.1804, "step": 242 }, { "epoch": 1.3082099596231493, "grad_norm": 0.4308919608592987, "learning_rate": 6.242318214649556e-06, "loss": 0.1866, "step": 243 }, { "epoch": 1.3135935397039031, "grad_norm": 0.6137887835502625, "learning_rate": 6.214015817996273e-06, "loss": 0.2951, "step": 244 }, { "epoch": 1.3189771197846567, "grad_norm": 0.5159800052642822, "learning_rate": 6.185672025349936e-06, "loss": 0.2405, "step": 245 }, { "epoch": 1.3243606998654105, "grad_norm": 0.5221627354621887, "learning_rate": 6.157287803188432e-06, "loss": 0.2361, "step": 246 }, { "epoch": 1.3297442799461643, "grad_norm": 0.5131467580795288, "learning_rate": 6.128864119368234e-06, "loss": 0.2467, "step": 247 }, { "epoch": 1.3351278600269179, "grad_norm": 0.5357580780982971, "learning_rate": 6.100401943091386e-06, "loss": 0.2142, "step": 248 }, { "epoch": 1.3405114401076716, "grad_norm": 0.5234276056289673, "learning_rate": 6.0719022448724705e-06, "loss": 0.2387, "step": 249 }, { "epoch": 1.3458950201884252, "grad_norm": 0.5050548911094666, "learning_rate": 6.043365996505506e-06, "loss": 0.2257, "step": 250 }, { "epoch": 1.351278600269179, "grad_norm": 0.5760233998298645, "learning_rate": 6.014794171030811e-06, "loss": 0.2929, "step": 251 }, { "epoch": 1.3566621803499328, "grad_norm": 0.5137818455696106, "learning_rate": 5.986187742701825e-06, "loss": 0.2604, "step": 252 }, { "epoch": 1.3620457604306864, "grad_norm": 0.4670131504535675, "learning_rate": 5.9575476869518945e-06, "loss": 0.2222, "step": 253 }, { "epoch": 1.3674293405114402, "grad_norm": 0.5121346116065979, "learning_rate": 5.928874980361005e-06, "loss": 0.254, "step": 254 }, { "epoch": 1.3728129205921937, "grad_norm": 0.47050395607948303, "learning_rate": 5.900170600622477e-06, "loss": 0.2295, "step": 255 }, { "epoch": 1.3781965006729475, "grad_norm": 0.5137650966644287, "learning_rate": 5.871435526509647e-06, "loss": 0.1969, "step": 256 }, { "epoch": 1.3835800807537013, "grad_norm": 0.5146386623382568, "learning_rate": 5.8426707378424675e-06, "loss": 0.2523, "step": 257 }, { "epoch": 1.3889636608344549, "grad_norm": 0.47957491874694824, "learning_rate": 5.813877215454118e-06, "loss": 0.2406, "step": 258 }, { "epoch": 1.3943472409152087, "grad_norm": 0.4431574046611786, "learning_rate": 5.78505594115755e-06, "loss": 0.2141, "step": 259 }, { "epoch": 1.3997308209959622, "grad_norm": 0.5288009643554688, "learning_rate": 5.756207897712011e-06, "loss": 0.2348, "step": 260 }, { "epoch": 1.405114401076716, "grad_norm": 0.47516876459121704, "learning_rate": 5.727334068789529e-06, "loss": 0.2324, "step": 261 }, { "epoch": 1.4104979811574698, "grad_norm": 0.4710802137851715, "learning_rate": 5.698435438941382e-06, "loss": 0.217, "step": 262 }, { "epoch": 1.4158815612382234, "grad_norm": 0.5013542175292969, "learning_rate": 5.669512993564517e-06, "loss": 0.2538, "step": 263 }, { "epoch": 1.4212651413189772, "grad_norm": 0.4954458773136139, "learning_rate": 5.640567718867951e-06, "loss": 0.2175, "step": 264 }, { "epoch": 1.4266487213997308, "grad_norm": 0.5086066126823425, "learning_rate": 5.611600601839144e-06, "loss": 0.2649, "step": 265 }, { "epoch": 1.4320323014804845, "grad_norm": 0.5038528442382812, "learning_rate": 5.582612630210349e-06, "loss": 0.2396, "step": 266 }, { "epoch": 1.4374158815612383, "grad_norm": 0.4795680642127991, "learning_rate": 5.553604792424923e-06, "loss": 0.2234, "step": 267 }, { "epoch": 1.442799461641992, "grad_norm": 0.553688645362854, "learning_rate": 5.524578077603627e-06, "loss": 0.2435, "step": 268 }, { "epoch": 1.4481830417227457, "grad_norm": 0.5056889057159424, "learning_rate": 5.495533475510901e-06, "loss": 0.2224, "step": 269 }, { "epoch": 1.4535666218034993, "grad_norm": 0.44364944100379944, "learning_rate": 5.4664719765211125e-06, "loss": 0.185, "step": 270 }, { "epoch": 1.458950201884253, "grad_norm": 0.5148865580558777, "learning_rate": 5.4373945715847845e-06, "loss": 0.2416, "step": 271 }, { "epoch": 1.4643337819650069, "grad_norm": 0.5296265482902527, "learning_rate": 5.408302252194806e-06, "loss": 0.2179, "step": 272 }, { "epoch": 1.4697173620457604, "grad_norm": 0.5192491412162781, "learning_rate": 5.379196010352629e-06, "loss": 0.2338, "step": 273 }, { "epoch": 1.4751009421265142, "grad_norm": 0.45017164945602417, "learning_rate": 5.3500768385344345e-06, "loss": 0.203, "step": 274 }, { "epoch": 1.4804845222072678, "grad_norm": 0.47436919808387756, "learning_rate": 5.320945729657299e-06, "loss": 0.2495, "step": 275 }, { "epoch": 1.4858681022880216, "grad_norm": 0.47932523488998413, "learning_rate": 5.2918036770453285e-06, "loss": 0.2123, "step": 276 }, { "epoch": 1.4912516823687754, "grad_norm": 0.5231288075447083, "learning_rate": 5.262651674395799e-06, "loss": 0.2636, "step": 277 }, { "epoch": 1.496635262449529, "grad_norm": 0.46927890181541443, "learning_rate": 5.2334907157452605e-06, "loss": 0.2045, "step": 278 }, { "epoch": 1.5020188425302825, "grad_norm": 0.5273484587669373, "learning_rate": 5.204321795435656e-06, "loss": 0.2352, "step": 279 }, { "epoch": 1.5074024226110363, "grad_norm": 0.4517362713813782, "learning_rate": 5.1751459080803986e-06, "loss": 0.2068, "step": 280 }, { "epoch": 1.51278600269179, "grad_norm": 0.5345643758773804, "learning_rate": 5.145964048530475e-06, "loss": 0.2578, "step": 281 }, { "epoch": 1.5181695827725439, "grad_norm": 0.6723287105560303, "learning_rate": 5.11677721184051e-06, "loss": 0.2362, "step": 282 }, { "epoch": 1.5235531628532974, "grad_norm": 0.4516390562057495, "learning_rate": 5.08758639323484e-06, "loss": 0.1979, "step": 283 }, { "epoch": 1.528936742934051, "grad_norm": 0.4627610445022583, "learning_rate": 5.058392588073583e-06, "loss": 0.2235, "step": 284 }, { "epoch": 1.5343203230148048, "grad_norm": 0.4922831356525421, "learning_rate": 5.029196791818688e-06, "loss": 0.2141, "step": 285 }, { "epoch": 1.5397039030955586, "grad_norm": 0.4735919237136841, "learning_rate": 5e-06, "loss": 0.2235, "step": 286 }, { "epoch": 1.5450874831763124, "grad_norm": 0.5311393737792969, "learning_rate": 4.970803208181315e-06, "loss": 0.2127, "step": 287 }, { "epoch": 1.550471063257066, "grad_norm": 0.5476110577583313, "learning_rate": 4.941607411926419e-06, "loss": 0.236, "step": 288 }, { "epoch": 1.5558546433378195, "grad_norm": 0.44367510080337524, "learning_rate": 4.9124136067651615e-06, "loss": 0.1843, "step": 289 }, { "epoch": 1.5612382234185733, "grad_norm": 0.5168237686157227, "learning_rate": 4.883222788159491e-06, "loss": 0.2349, "step": 290 }, { "epoch": 1.5666218034993271, "grad_norm": 0.5239467620849609, "learning_rate": 4.8540359514695266e-06, "loss": 0.2424, "step": 291 }, { "epoch": 1.572005383580081, "grad_norm": 0.5578256845474243, "learning_rate": 4.824854091919601e-06, "loss": 0.2492, "step": 292 }, { "epoch": 1.5773889636608345, "grad_norm": 0.5159158110618591, "learning_rate": 4.795678204564346e-06, "loss": 0.2031, "step": 293 }, { "epoch": 1.582772543741588, "grad_norm": 0.4600106179714203, "learning_rate": 4.766509284254739e-06, "loss": 0.2042, "step": 294 }, { "epoch": 1.5881561238223418, "grad_norm": 0.46104931831359863, "learning_rate": 4.737348325604203e-06, "loss": 0.1984, "step": 295 }, { "epoch": 1.5935397039030956, "grad_norm": 0.5123720765113831, "learning_rate": 4.708196322954673e-06, "loss": 0.2449, "step": 296 }, { "epoch": 1.5989232839838494, "grad_norm": 0.5240789651870728, "learning_rate": 4.679054270342703e-06, "loss": 0.1956, "step": 297 }, { "epoch": 1.604306864064603, "grad_norm": 0.5075330138206482, "learning_rate": 4.649923161465567e-06, "loss": 0.2318, "step": 298 }, { "epoch": 1.6096904441453566, "grad_norm": 0.5857378840446472, "learning_rate": 4.620803989647373e-06, "loss": 0.2623, "step": 299 }, { "epoch": 1.6150740242261103, "grad_norm": 0.5065007209777832, "learning_rate": 4.591697747805196e-06, "loss": 0.2171, "step": 300 }, { "epoch": 1.6204576043068641, "grad_norm": 0.47048458456993103, "learning_rate": 4.562605428415216e-06, "loss": 0.1985, "step": 301 }, { "epoch": 1.6258411843876177, "grad_norm": 0.4939180314540863, "learning_rate": 4.533528023478888e-06, "loss": 0.2162, "step": 302 }, { "epoch": 1.6312247644683715, "grad_norm": 0.5094431638717651, "learning_rate": 4.5044665244891e-06, "loss": 0.1996, "step": 303 }, { "epoch": 1.636608344549125, "grad_norm": 0.5184011459350586, "learning_rate": 4.475421922396375e-06, "loss": 0.2053, "step": 304 }, { "epoch": 1.6419919246298789, "grad_norm": 0.485853374004364, "learning_rate": 4.446395207575081e-06, "loss": 0.2063, "step": 305 }, { "epoch": 1.6473755047106327, "grad_norm": 0.48953792452812195, "learning_rate": 4.417387369789652e-06, "loss": 0.2208, "step": 306 }, { "epoch": 1.6527590847913862, "grad_norm": 0.48435530066490173, "learning_rate": 4.388399398160857e-06, "loss": 0.1991, "step": 307 }, { "epoch": 1.65814266487214, "grad_norm": 0.4711257219314575, "learning_rate": 4.359432281132051e-06, "loss": 0.1985, "step": 308 }, { "epoch": 1.6635262449528936, "grad_norm": 0.49920031428337097, "learning_rate": 4.330487006435485e-06, "loss": 0.2281, "step": 309 }, { "epoch": 1.6689098250336474, "grad_norm": 0.4793451428413391, "learning_rate": 4.301564561058618e-06, "loss": 0.2052, "step": 310 }, { "epoch": 1.6742934051144012, "grad_norm": 0.49276602268218994, "learning_rate": 4.272665931210472e-06, "loss": 0.2163, "step": 311 }, { "epoch": 1.6796769851951547, "grad_norm": 0.48469507694244385, "learning_rate": 4.243792102287991e-06, "loss": 0.214, "step": 312 }, { "epoch": 1.6850605652759085, "grad_norm": 0.5068939328193665, "learning_rate": 4.214944058842452e-06, "loss": 0.2463, "step": 313 }, { "epoch": 1.690444145356662, "grad_norm": 0.4834253489971161, "learning_rate": 4.186122784545885e-06, "loss": 0.2204, "step": 314 }, { "epoch": 1.695827725437416, "grad_norm": 0.7421865463256836, "learning_rate": 4.157329262157534e-06, "loss": 0.2297, "step": 315 }, { "epoch": 1.7012113055181697, "grad_norm": 0.5400863289833069, "learning_rate": 4.128564473490357e-06, "loss": 0.2784, "step": 316 }, { "epoch": 1.7065948855989233, "grad_norm": 0.46585744619369507, "learning_rate": 4.099829399377524e-06, "loss": 0.2039, "step": 317 }, { "epoch": 1.7119784656796768, "grad_norm": 0.45379072427749634, "learning_rate": 4.071125019638998e-06, "loss": 0.1987, "step": 318 }, { "epoch": 1.7173620457604306, "grad_norm": 0.5648776292800903, "learning_rate": 4.0424523130481055e-06, "loss": 0.2224, "step": 319 }, { "epoch": 1.7227456258411844, "grad_norm": 0.4834424555301666, "learning_rate": 4.013812257298175e-06, "loss": 0.2175, "step": 320 }, { "epoch": 1.7281292059219382, "grad_norm": 0.49235790967941284, "learning_rate": 3.985205828969191e-06, "loss": 0.1996, "step": 321 }, { "epoch": 1.7335127860026918, "grad_norm": 0.4619491994380951, "learning_rate": 3.956634003494496e-06, "loss": 0.2143, "step": 322 }, { "epoch": 1.7388963660834453, "grad_norm": 0.4783826172351837, "learning_rate": 3.9280977551275294e-06, "loss": 0.2154, "step": 323 }, { "epoch": 1.7442799461641991, "grad_norm": 0.5519052743911743, "learning_rate": 3.899598056908615e-06, "loss": 0.2516, "step": 324 }, { "epoch": 1.749663526244953, "grad_norm": 0.5011211633682251, "learning_rate": 3.871135880631769e-06, "loss": 0.2265, "step": 325 }, { "epoch": 1.7550471063257067, "grad_norm": 0.41989102959632874, "learning_rate": 3.842712196811569e-06, "loss": 0.1792, "step": 326 }, { "epoch": 1.7604306864064603, "grad_norm": 0.472318172454834, "learning_rate": 3.8143279746500665e-06, "loss": 0.2204, "step": 327 }, { "epoch": 1.7658142664872138, "grad_norm": 0.531564474105835, "learning_rate": 3.785984182003728e-06, "loss": 0.2012, "step": 328 }, { "epoch": 1.7711978465679676, "grad_norm": 0.5032511353492737, "learning_rate": 3.757681785350445e-06, "loss": 0.2242, "step": 329 }, { "epoch": 1.7765814266487214, "grad_norm": 0.48782920837402344, "learning_rate": 3.729421749756564e-06, "loss": 0.2187, "step": 330 }, { "epoch": 1.7819650067294752, "grad_norm": 0.4836859405040741, "learning_rate": 3.701205038843997e-06, "loss": 0.2194, "step": 331 }, { "epoch": 1.7873485868102288, "grad_norm": 0.49115753173828125, "learning_rate": 3.6730326147573425e-06, "loss": 0.1968, "step": 332 }, { "epoch": 1.7927321668909824, "grad_norm": 0.5141318440437317, "learning_rate": 3.6449054381311e-06, "loss": 0.2233, "step": 333 }, { "epoch": 1.7981157469717362, "grad_norm": 0.5064616799354553, "learning_rate": 3.616824468056896e-06, "loss": 0.2065, "step": 334 }, { "epoch": 1.80349932705249, "grad_norm": 0.47807809710502625, "learning_rate": 3.5887906620507877e-06, "loss": 0.2145, "step": 335 }, { "epoch": 1.8088829071332437, "grad_norm": 0.5218194723129272, "learning_rate": 3.5608049760206203e-06, "loss": 0.227, "step": 336 }, { "epoch": 1.8142664872139973, "grad_norm": 0.4956798851490021, "learning_rate": 3.532868364233416e-06, "loss": 0.2089, "step": 337 }, { "epoch": 1.8196500672947509, "grad_norm": 0.5096341967582703, "learning_rate": 3.504981779282852e-06, "loss": 0.2397, "step": 338 }, { "epoch": 1.8250336473755047, "grad_norm": 0.4995509684085846, "learning_rate": 3.4771461720567613e-06, "loss": 0.2397, "step": 339 }, { "epoch": 1.8304172274562585, "grad_norm": 0.4688532054424286, "learning_rate": 3.4493624917047284e-06, "loss": 0.2161, "step": 340 }, { "epoch": 1.8358008075370122, "grad_norm": 0.5076211094856262, "learning_rate": 3.4216316856057074e-06, "loss": 0.24, "step": 341 }, { "epoch": 1.8411843876177658, "grad_norm": 0.4792284667491913, "learning_rate": 3.3939546993357297e-06, "loss": 0.1995, "step": 342 }, { "epoch": 1.8465679676985194, "grad_norm": 0.4893110692501068, "learning_rate": 3.3663324766356524e-06, "loss": 0.2117, "step": 343 }, { "epoch": 1.8519515477792732, "grad_norm": 0.493745893239975, "learning_rate": 3.3387659593789845e-06, "loss": 0.2422, "step": 344 }, { "epoch": 1.857335127860027, "grad_norm": 0.494195818901062, "learning_rate": 3.3112560875397713e-06, "loss": 0.2344, "step": 345 }, { "epoch": 1.8627187079407808, "grad_norm": 0.47956109046936035, "learning_rate": 3.283803799160537e-06, "loss": 0.2228, "step": 346 }, { "epoch": 1.8681022880215343, "grad_norm": 0.4594026803970337, "learning_rate": 3.256410030320304e-06, "loss": 0.2119, "step": 347 }, { "epoch": 1.873485868102288, "grad_norm": 0.512570321559906, "learning_rate": 3.2290757151026687e-06, "loss": 0.2414, "step": 348 }, { "epoch": 1.8788694481830417, "grad_norm": 0.5020653605461121, "learning_rate": 3.2018017855639605e-06, "loss": 0.2425, "step": 349 }, { "epoch": 1.8842530282637955, "grad_norm": 0.46298474073410034, "learning_rate": 3.1745891717014477e-06, "loss": 0.2077, "step": 350 }, { "epoch": 1.8896366083445493, "grad_norm": 0.48863649368286133, "learning_rate": 3.147438801421638e-06, "loss": 0.2181, "step": 351 }, { "epoch": 1.8950201884253028, "grad_norm": 0.4544221758842468, "learning_rate": 3.1203516005086276e-06, "loss": 0.2052, "step": 352 }, { "epoch": 1.9004037685060564, "grad_norm": 0.4919374883174896, "learning_rate": 3.093328492592539e-06, "loss": 0.2266, "step": 353 }, { "epoch": 1.9057873485868102, "grad_norm": 0.5141823291778564, "learning_rate": 3.0663703991180318e-06, "loss": 0.2329, "step": 354 }, { "epoch": 1.911170928667564, "grad_norm": 0.46769434213638306, "learning_rate": 3.0394782393128713e-06, "loss": 0.2006, "step": 355 }, { "epoch": 1.9165545087483178, "grad_norm": 0.4760676622390747, "learning_rate": 3.0126529301565945e-06, "loss": 0.1909, "step": 356 }, { "epoch": 1.9219380888290714, "grad_norm": 0.4960988163948059, "learning_rate": 2.9858953863492334e-06, "loss": 0.2177, "step": 357 }, { "epoch": 1.927321668909825, "grad_norm": 0.5212114453315735, "learning_rate": 2.9592065202801374e-06, "loss": 0.2096, "step": 358 }, { "epoch": 1.9327052489905787, "grad_norm": 0.5346338152885437, "learning_rate": 2.9325872419968484e-06, "loss": 0.2391, "step": 359 }, { "epoch": 1.9380888290713325, "grad_norm": 0.4992043375968933, "learning_rate": 2.906038459174081e-06, "loss": 0.2113, "step": 360 }, { "epoch": 1.9434724091520863, "grad_norm": 0.4740796387195587, "learning_rate": 2.879561077082764e-06, "loss": 0.2178, "step": 361 }, { "epoch": 1.9488559892328399, "grad_norm": 0.512220025062561, "learning_rate": 2.853155998559179e-06, "loss": 0.2325, "step": 362 }, { "epoch": 1.9542395693135934, "grad_norm": 0.5286325216293335, "learning_rate": 2.826824123974171e-06, "loss": 0.2405, "step": 363 }, { "epoch": 1.9596231493943472, "grad_norm": 0.4532966911792755, "learning_rate": 2.800566351202443e-06, "loss": 0.1983, "step": 364 }, { "epoch": 1.965006729475101, "grad_norm": 0.5386168360710144, "learning_rate": 2.774383575591956e-06, "loss": 0.225, "step": 365 }, { "epoch": 1.9703903095558546, "grad_norm": 0.49068483710289, "learning_rate": 2.748276689933377e-06, "loss": 0.2142, "step": 366 }, { "epoch": 1.9757738896366084, "grad_norm": 0.5264994502067566, "learning_rate": 2.722246584429652e-06, "loss": 0.2197, "step": 367 }, { "epoch": 1.981157469717362, "grad_norm": 0.5036882162094116, "learning_rate": 2.6962941466656477e-06, "loss": 0.2153, "step": 368 }, { "epoch": 1.9865410497981157, "grad_norm": 0.46985024213790894, "learning_rate": 2.6704202615778844e-06, "loss": 0.216, "step": 369 }, { "epoch": 1.9919246298788695, "grad_norm": 0.5271331667900085, "learning_rate": 2.6446258114243633e-06, "loss": 0.2125, "step": 370 }, { "epoch": 1.997308209959623, "grad_norm": 0.5481729507446289, "learning_rate": 2.6189116757544765e-06, "loss": 0.2351, "step": 371 }, { "epoch": 2.0026917900403767, "grad_norm": 0.4495651125907898, "learning_rate": 2.593278731379027e-06, "loss": 0.1652, "step": 372 }, { "epoch": 2.0080753701211305, "grad_norm": 0.345325231552124, "learning_rate": 2.567727852340323e-06, "loss": 0.1108, "step": 373 }, { "epoch": 2.0134589502018843, "grad_norm": 0.29901817440986633, "learning_rate": 2.542259909882374e-06, "loss": 0.0865, "step": 374 }, { "epoch": 2.018842530282638, "grad_norm": 0.33557021617889404, "learning_rate": 2.51687577242119e-06, "loss": 0.107, "step": 375 }, { "epoch": 2.024226110363392, "grad_norm": 0.2968936264514923, "learning_rate": 2.4915763055151615e-06, "loss": 0.0858, "step": 376 }, { "epoch": 2.029609690444145, "grad_norm": 0.3676191568374634, "learning_rate": 2.4663623718355444e-06, "loss": 0.1066, "step": 377 }, { "epoch": 2.034993270524899, "grad_norm": 0.30083024501800537, "learning_rate": 2.4412348311370616e-06, "loss": 0.0871, "step": 378 }, { "epoch": 2.0403768506056528, "grad_norm": 0.2911483347415924, "learning_rate": 2.416194540228559e-06, "loss": 0.0808, "step": 379 }, { "epoch": 2.0457604306864066, "grad_norm": 0.31706151366233826, "learning_rate": 2.3912423529438145e-06, "loss": 0.0818, "step": 380 }, { "epoch": 2.0511440107671604, "grad_norm": 0.30930769443511963, "learning_rate": 2.3663791201124093e-06, "loss": 0.0812, "step": 381 }, { "epoch": 2.0565275908479137, "grad_norm": 0.35245367884635925, "learning_rate": 2.341605689530723e-06, "loss": 0.0856, "step": 382 }, { "epoch": 2.0619111709286675, "grad_norm": 0.3333040177822113, "learning_rate": 2.316922905933022e-06, "loss": 0.0745, "step": 383 }, { "epoch": 2.0672947510094213, "grad_norm": 0.3866671025753021, "learning_rate": 2.292331610962649e-06, "loss": 0.0844, "step": 384 }, { "epoch": 2.072678331090175, "grad_norm": 0.33665308356285095, "learning_rate": 2.2678326431433456e-06, "loss": 0.0773, "step": 385 }, { "epoch": 2.078061911170929, "grad_norm": 0.3511718809604645, "learning_rate": 2.243426837850631e-06, "loss": 0.0775, "step": 386 }, { "epoch": 2.083445491251682, "grad_norm": 0.3618534505367279, "learning_rate": 2.219115027283339e-06, "loss": 0.0812, "step": 387 }, { "epoch": 2.088829071332436, "grad_norm": 0.39068838953971863, "learning_rate": 2.194898040435234e-06, "loss": 0.0829, "step": 388 }, { "epoch": 2.09421265141319, "grad_norm": 0.47448840737342834, "learning_rate": 2.17077670306674e-06, "loss": 0.1055, "step": 389 }, { "epoch": 2.0995962314939436, "grad_norm": 0.3499176800251007, "learning_rate": 2.146751837676794e-06, "loss": 0.0677, "step": 390 }, { "epoch": 2.1049798115746974, "grad_norm": 0.39072269201278687, "learning_rate": 2.122824263474784e-06, "loss": 0.0754, "step": 391 }, { "epoch": 2.1103633916554507, "grad_norm": 0.33510833978652954, "learning_rate": 2.098994796352629e-06, "loss": 0.058, "step": 392 }, { "epoch": 2.1157469717362045, "grad_norm": 0.39688751101493835, "learning_rate": 2.0752642488569557e-06, "loss": 0.0728, "step": 393 }, { "epoch": 2.1211305518169583, "grad_norm": 0.389644593000412, "learning_rate": 2.0516334301613876e-06, "loss": 0.0815, "step": 394 }, { "epoch": 2.126514131897712, "grad_norm": 0.3516867160797119, "learning_rate": 2.028103146038958e-06, "loss": 0.0724, "step": 395 }, { "epoch": 2.131897711978466, "grad_norm": 0.3905945420265198, "learning_rate": 2.004674198834631e-06, "loss": 0.0792, "step": 396 }, { "epoch": 2.1372812920592192, "grad_norm": 0.46998897194862366, "learning_rate": 1.98134738743794e-06, "loss": 0.0793, "step": 397 }, { "epoch": 2.142664872139973, "grad_norm": 0.4259118139743805, "learning_rate": 1.9581235072557618e-06, "loss": 0.0916, "step": 398 }, { "epoch": 2.148048452220727, "grad_norm": 0.47033047676086426, "learning_rate": 1.935003350185174e-06, "loss": 0.0857, "step": 399 }, { "epoch": 2.1534320323014806, "grad_norm": 0.4288282096385956, "learning_rate": 1.911987704586466e-06, "loss": 0.0709, "step": 400 }, { "epoch": 2.1588156123822344, "grad_norm": 0.3920668661594391, "learning_rate": 1.8890773552562564e-06, "loss": 0.0722, "step": 401 }, { "epoch": 2.1641991924629878, "grad_norm": 0.35498660802841187, "learning_rate": 1.8662730834007204e-06, "loss": 0.0635, "step": 402 }, { "epoch": 2.1695827725437415, "grad_norm": 0.4081229269504547, "learning_rate": 1.843575666608976e-06, "loss": 0.0713, "step": 403 }, { "epoch": 2.1749663526244953, "grad_norm": 0.41039130091667175, "learning_rate": 1.8209858788265411e-06, "loss": 0.0838, "step": 404 }, { "epoch": 2.180349932705249, "grad_norm": 0.44797372817993164, "learning_rate": 1.7985044903289645e-06, "loss": 0.1013, "step": 405 }, { "epoch": 2.185733512786003, "grad_norm": 0.3503686785697937, "learning_rate": 1.7761322676955505e-06, "loss": 0.066, "step": 406 }, { "epoch": 2.1911170928667563, "grad_norm": 0.4590007960796356, "learning_rate": 1.7538699737832237e-06, "loss": 0.0772, "step": 407 }, { "epoch": 2.19650067294751, "grad_norm": 0.3556067943572998, "learning_rate": 1.7317183677005173e-06, "loss": 0.0648, "step": 408 }, { "epoch": 2.201884253028264, "grad_norm": 0.3512371778488159, "learning_rate": 1.7096782047816806e-06, "loss": 0.069, "step": 409 }, { "epoch": 2.2072678331090176, "grad_norm": 0.39259177446365356, "learning_rate": 1.687750236560936e-06, "loss": 0.0793, "step": 410 }, { "epoch": 2.2126514131897714, "grad_norm": 0.3561786711215973, "learning_rate": 1.665935210746844e-06, "loss": 0.0586, "step": 411 }, { "epoch": 2.218034993270525, "grad_norm": 0.35219818353652954, "learning_rate": 1.6442338711968102e-06, "loss": 0.0681, "step": 412 }, { "epoch": 2.2234185733512786, "grad_norm": 0.3837469220161438, "learning_rate": 1.622646957891722e-06, "loss": 0.0736, "step": 413 }, { "epoch": 2.2288021534320324, "grad_norm": 0.39585286378860474, "learning_rate": 1.601175206910715e-06, "loss": 0.0826, "step": 414 }, { "epoch": 2.234185733512786, "grad_norm": 0.33951419591903687, "learning_rate": 1.5798193504060693e-06, "loss": 0.0599, "step": 415 }, { "epoch": 2.2395693135935395, "grad_norm": 0.39095380902290344, "learning_rate": 1.5585801165782606e-06, "loss": 0.0724, "step": 416 }, { "epoch": 2.2449528936742933, "grad_norm": 0.3765682876110077, "learning_rate": 1.5374582296511054e-06, "loss": 0.0747, "step": 417 }, { "epoch": 2.250336473755047, "grad_norm": 0.3725675046443939, "learning_rate": 1.5164544098470862e-06, "loss": 0.0717, "step": 418 }, { "epoch": 2.255720053835801, "grad_norm": 0.37952670454978943, "learning_rate": 1.4955693733627869e-06, "loss": 0.0776, "step": 419 }, { "epoch": 2.2611036339165547, "grad_norm": 0.39090678095817566, "learning_rate": 1.474803832344463e-06, "loss": 0.0766, "step": 420 }, { "epoch": 2.2664872139973085, "grad_norm": 0.3887679874897003, "learning_rate": 1.4541584948637777e-06, "loss": 0.0868, "step": 421 }, { "epoch": 2.271870794078062, "grad_norm": 0.3668728768825531, "learning_rate": 1.4336340648936342e-06, "loss": 0.0797, "step": 422 }, { "epoch": 2.2772543741588156, "grad_norm": 0.3776654005050659, "learning_rate": 1.413231242284195e-06, "loss": 0.0775, "step": 423 }, { "epoch": 2.2826379542395694, "grad_norm": 0.43863725662231445, "learning_rate": 1.3929507227389954e-06, "loss": 0.0848, "step": 424 }, { "epoch": 2.288021534320323, "grad_norm": 0.3964315354824066, "learning_rate": 1.3727931977912406e-06, "loss": 0.0719, "step": 425 }, { "epoch": 2.2934051144010765, "grad_norm": 0.3711508810520172, "learning_rate": 1.352759354780215e-06, "loss": 0.0602, "step": 426 }, { "epoch": 2.2987886944818303, "grad_norm": 0.3771410584449768, "learning_rate": 1.332849876827842e-06, "loss": 0.0689, "step": 427 }, { "epoch": 2.304172274562584, "grad_norm": 0.45632028579711914, "learning_rate": 1.3130654428154066e-06, "loss": 0.0634, "step": 428 }, { "epoch": 2.309555854643338, "grad_norm": 0.40130868554115295, "learning_rate": 1.2934067273603855e-06, "loss": 0.0818, "step": 429 }, { "epoch": 2.3149394347240917, "grad_norm": 0.3942681849002838, "learning_rate": 1.2738744007934595e-06, "loss": 0.0843, "step": 430 }, { "epoch": 2.320323014804845, "grad_norm": 0.3565605580806732, "learning_rate": 1.2544691291356497e-06, "loss": 0.0584, "step": 431 }, { "epoch": 2.325706594885599, "grad_norm": 0.38263797760009766, "learning_rate": 1.2351915740756087e-06, "loss": 0.0652, "step": 432 }, { "epoch": 2.3310901749663526, "grad_norm": 0.4015883207321167, "learning_rate": 1.2160423929470584e-06, "loss": 0.0751, "step": 433 }, { "epoch": 2.3364737550471064, "grad_norm": 0.3580048680305481, "learning_rate": 1.1970222387063756e-06, "loss": 0.0624, "step": 434 }, { "epoch": 2.34185733512786, "grad_norm": 0.47708114981651306, "learning_rate": 1.1781317599103238e-06, "loss": 0.0829, "step": 435 }, { "epoch": 2.3472409152086136, "grad_norm": 0.3463763892650604, "learning_rate": 1.1593716006939455e-06, "loss": 0.0693, "step": 436 }, { "epoch": 2.3526244952893673, "grad_norm": 0.3862798810005188, "learning_rate": 1.140742400748593e-06, "loss": 0.0716, "step": 437 }, { "epoch": 2.358008075370121, "grad_norm": 0.3969804346561432, "learning_rate": 1.1222447953001182e-06, "loss": 0.0708, "step": 438 }, { "epoch": 2.363391655450875, "grad_norm": 0.3394986689090729, "learning_rate": 1.1038794150872117e-06, "loss": 0.0595, "step": 439 }, { "epoch": 2.3687752355316287, "grad_norm": 0.39073002338409424, "learning_rate": 1.0856468863398917e-06, "loss": 0.0634, "step": 440 }, { "epoch": 2.374158815612382, "grad_norm": 0.3924263119697571, "learning_rate": 1.0675478307581627e-06, "loss": 0.0725, "step": 441 }, { "epoch": 2.379542395693136, "grad_norm": 0.3952764868736267, "learning_rate": 1.0495828654907991e-06, "loss": 0.0663, "step": 442 }, { "epoch": 2.3849259757738897, "grad_norm": 0.37942010164260864, "learning_rate": 1.0317526031143161e-06, "loss": 0.0683, "step": 443 }, { "epoch": 2.3903095558546434, "grad_norm": 0.35665637254714966, "learning_rate": 1.014057651612076e-06, "loss": 0.0662, "step": 444 }, { "epoch": 2.3956931359353972, "grad_norm": 0.3667193651199341, "learning_rate": 9.964986143535515e-07, "loss": 0.0616, "step": 445 }, { "epoch": 2.4010767160161506, "grad_norm": 0.4359084367752075, "learning_rate": 9.790760900737683e-07, "loss": 0.0637, "step": 446 }, { "epoch": 2.4064602960969044, "grad_norm": 0.3700020909309387, "learning_rate": 9.61790672852868e-07, "loss": 0.0569, "step": 447 }, { "epoch": 2.411843876177658, "grad_norm": 0.4084100127220154, "learning_rate": 9.446429520958666e-07, "loss": 0.0708, "step": 448 }, { "epoch": 2.417227456258412, "grad_norm": 0.40237903594970703, "learning_rate": 9.276335125125502e-07, "loss": 0.0755, "step": 449 }, { "epoch": 2.4226110363391653, "grad_norm": 0.36956214904785156, "learning_rate": 9.107629340975388e-07, "loss": 0.0618, "step": 450 }, { "epoch": 2.427994616419919, "grad_norm": 0.38042622804641724, "learning_rate": 8.940317921105085e-07, "loss": 0.0579, "step": 451 }, { "epoch": 2.433378196500673, "grad_norm": 0.39496564865112305, "learning_rate": 8.774406570565791e-07, "loss": 0.0702, "step": 452 }, { "epoch": 2.4387617765814267, "grad_norm": 0.3166196942329407, "learning_rate": 8.609900946668536e-07, "loss": 0.0555, "step": 453 }, { "epoch": 2.4441453566621805, "grad_norm": 0.3680025637149811, "learning_rate": 8.446806658791373e-07, "loss": 0.0593, "step": 454 }, { "epoch": 2.449528936742934, "grad_norm": 0.39065518975257874, "learning_rate": 8.285129268188042e-07, "loss": 0.0708, "step": 455 }, { "epoch": 2.4549125168236876, "grad_norm": 0.40179872512817383, "learning_rate": 8.124874287798352e-07, "loss": 0.0773, "step": 456 }, { "epoch": 2.4602960969044414, "grad_norm": 0.33520442247390747, "learning_rate": 7.966047182060226e-07, "loss": 0.0573, "step": 457 }, { "epoch": 2.465679676985195, "grad_norm": 0.4467129111289978, "learning_rate": 7.808653366723296e-07, "loss": 0.0826, "step": 458 }, { "epoch": 2.471063257065949, "grad_norm": 0.3427630662918091, "learning_rate": 7.652698208664377e-07, "loss": 0.0657, "step": 459 }, { "epoch": 2.4764468371467023, "grad_norm": 0.3667747974395752, "learning_rate": 7.498187025704296e-07, "loss": 0.0649, "step": 460 }, { "epoch": 2.481830417227456, "grad_norm": 0.36384788155555725, "learning_rate": 7.345125086426675e-07, "loss": 0.0532, "step": 461 }, { "epoch": 2.48721399730821, "grad_norm": 0.40607815980911255, "learning_rate": 7.193517609998263e-07, "loss": 0.0796, "step": 462 }, { "epoch": 2.4925975773889637, "grad_norm": 0.36063507199287415, "learning_rate": 7.043369765990943e-07, "loss": 0.0639, "step": 463 }, { "epoch": 2.4979811574697175, "grad_norm": 0.3970508277416229, "learning_rate": 6.894686674205481e-07, "loss": 0.0688, "step": 464 }, { "epoch": 2.503364737550471, "grad_norm": 0.3685045540332794, "learning_rate": 6.747473404496902e-07, "loss": 0.0661, "step": 465 }, { "epoch": 2.5087483176312246, "grad_norm": 0.45861902832984924, "learning_rate": 6.601734976601737e-07, "loss": 0.0673, "step": 466 }, { "epoch": 2.5141318977119784, "grad_norm": 0.40021732449531555, "learning_rate": 6.457476359966685e-07, "loss": 0.0757, "step": 467 }, { "epoch": 2.519515477792732, "grad_norm": 0.3946848511695862, "learning_rate": 6.314702473579309e-07, "loss": 0.0654, "step": 468 }, { "epoch": 2.524899057873486, "grad_norm": 0.4420785903930664, "learning_rate": 6.17341818580024e-07, "loss": 0.0864, "step": 469 }, { "epoch": 2.5302826379542394, "grad_norm": 0.4311622679233551, "learning_rate": 6.033628314197176e-07, "loss": 0.0823, "step": 470 }, { "epoch": 2.535666218034993, "grad_norm": 0.4172739088535309, "learning_rate": 5.895337625380632e-07, "loss": 0.0892, "step": 471 }, { "epoch": 2.541049798115747, "grad_norm": 0.46117520332336426, "learning_rate": 5.758550834841381e-07, "loss": 0.0762, "step": 472 }, { "epoch": 2.5464333781965007, "grad_norm": 0.38010281324386597, "learning_rate": 5.62327260678967e-07, "loss": 0.0576, "step": 473 }, { "epoch": 2.5518169582772545, "grad_norm": 0.32299867272377014, "learning_rate": 5.489507553996204e-07, "loss": 0.0593, "step": 474 }, { "epoch": 2.557200538358008, "grad_norm": 0.39713406562805176, "learning_rate": 5.357260237634826e-07, "loss": 0.0742, "step": 475 }, { "epoch": 2.5625841184387617, "grad_norm": 0.4520042836666107, "learning_rate": 5.226535167127e-07, "loss": 0.0823, "step": 476 }, { "epoch": 2.5679676985195155, "grad_norm": 0.38494300842285156, "learning_rate": 5.097336799988067e-07, "loss": 0.0723, "step": 477 }, { "epoch": 2.5733512786002692, "grad_norm": 0.30375781655311584, "learning_rate": 4.96966954167517e-07, "loss": 0.0588, "step": 478 }, { "epoch": 2.578734858681023, "grad_norm": 0.35356128215789795, "learning_rate": 4.843537745437188e-07, "loss": 0.0653, "step": 479 }, { "epoch": 2.5841184387617764, "grad_norm": 0.3791372776031494, "learning_rate": 4.718945712166123e-07, "loss": 0.0715, "step": 480 }, { "epoch": 2.58950201884253, "grad_norm": 0.42902350425720215, "learning_rate": 4.595897690250567e-07, "loss": 0.0797, "step": 481 }, { "epoch": 2.594885598923284, "grad_norm": 0.39135926961898804, "learning_rate": 4.4743978754308027e-07, "loss": 0.0708, "step": 482 }, { "epoch": 2.6002691790040378, "grad_norm": 0.4254235625267029, "learning_rate": 4.3544504106557026e-07, "loss": 0.0802, "step": 483 }, { "epoch": 2.6056527590847915, "grad_norm": 0.3718099296092987, "learning_rate": 4.2360593859415433e-07, "loss": 0.0617, "step": 484 }, { "epoch": 2.611036339165545, "grad_norm": 0.4191717505455017, "learning_rate": 4.1192288382324363e-07, "loss": 0.0859, "step": 485 }, { "epoch": 2.6164199192462987, "grad_norm": 0.3816201388835907, "learning_rate": 4.003962751262763e-07, "loss": 0.0646, "step": 486 }, { "epoch": 2.6218034993270525, "grad_norm": 0.36653172969818115, "learning_rate": 3.890265055421283e-07, "loss": 0.0641, "step": 487 }, { "epoch": 2.6271870794078063, "grad_norm": 0.3723650276660919, "learning_rate": 3.77813962761715e-07, "loss": 0.075, "step": 488 }, { "epoch": 2.63257065948856, "grad_norm": 0.34589794278144836, "learning_rate": 3.6675902911476937e-07, "loss": 0.0595, "step": 489 }, { "epoch": 2.6379542395693134, "grad_norm": 0.4536292552947998, "learning_rate": 3.558620815568048e-07, "loss": 0.0766, "step": 490 }, { "epoch": 2.643337819650067, "grad_norm": 0.4030088782310486, "learning_rate": 3.451234916562618e-07, "loss": 0.0702, "step": 491 }, { "epoch": 2.648721399730821, "grad_norm": 0.6007040739059448, "learning_rate": 3.3454362558184075e-07, "loss": 0.0665, "step": 492 }, { "epoch": 2.654104979811575, "grad_norm": 0.3956087827682495, "learning_rate": 3.241228440900124e-07, "loss": 0.0669, "step": 493 }, { "epoch": 2.6594885598923286, "grad_norm": 0.4161822199821472, "learning_rate": 3.1386150251271897e-07, "loss": 0.0722, "step": 494 }, { "epoch": 2.664872139973082, "grad_norm": 0.36707159876823425, "learning_rate": 3.0375995074525764e-07, "loss": 0.0602, "step": 495 }, { "epoch": 2.6702557200538357, "grad_norm": 0.4103851318359375, "learning_rate": 2.9381853323434627e-07, "loss": 0.0898, "step": 496 }, { "epoch": 2.6756393001345895, "grad_norm": 0.3391963541507721, "learning_rate": 2.840375889663871e-07, "loss": 0.06, "step": 497 }, { "epoch": 2.6810228802153433, "grad_norm": 0.36111244559288025, "learning_rate": 2.744174514558956e-07, "loss": 0.0595, "step": 498 }, { "epoch": 2.686406460296097, "grad_norm": 0.33847886323928833, "learning_rate": 2.6495844873413944e-07, "loss": 0.0604, "step": 499 }, { "epoch": 2.6917900403768504, "grad_norm": 0.38463106751441956, "learning_rate": 2.556609033379459e-07, "loss": 0.0642, "step": 500 }, { "epoch": 2.6971736204576042, "grad_norm": 0.42590996623039246, "learning_rate": 2.465251322987061e-07, "loss": 0.0773, "step": 501 }, { "epoch": 2.702557200538358, "grad_norm": 0.4083699584007263, "learning_rate": 2.3755144713156819e-07, "loss": 0.0744, "step": 502 }, { "epoch": 2.707940780619112, "grad_norm": 0.34972718358039856, "learning_rate": 2.287401538248074e-07, "loss": 0.0631, "step": 503 }, { "epoch": 2.7133243606998656, "grad_norm": 0.3795744776725769, "learning_rate": 2.20091552829399e-07, "loss": 0.0611, "step": 504 }, { "epoch": 2.718707940780619, "grad_norm": 0.4142250120639801, "learning_rate": 2.1160593904877236e-07, "loss": 0.0755, "step": 505 }, { "epoch": 2.7240915208613727, "grad_norm": 0.3663713335990906, "learning_rate": 2.0328360182875262e-07, "loss": 0.0674, "step": 506 }, { "epoch": 2.7294751009421265, "grad_norm": 0.391294926404953, "learning_rate": 1.9512482494769613e-07, "loss": 0.0597, "step": 507 }, { "epoch": 2.7348586810228803, "grad_norm": 0.4010995328426361, "learning_rate": 1.8712988660681498e-07, "loss": 0.0702, "step": 508 }, { "epoch": 2.740242261103634, "grad_norm": 0.3831869065761566, "learning_rate": 1.7929905942068836e-07, "loss": 0.0618, "step": 509 }, { "epoch": 2.7456258411843875, "grad_norm": 0.34535014629364014, "learning_rate": 1.7163261040796797e-07, "loss": 0.0598, "step": 510 }, { "epoch": 2.7510094212651413, "grad_norm": 0.4758029282093048, "learning_rate": 1.6413080098227562e-07, "loss": 0.0895, "step": 511 }, { "epoch": 2.756393001345895, "grad_norm": 0.3582858741283417, "learning_rate": 1.5679388694328446e-07, "loss": 0.0603, "step": 512 }, { "epoch": 2.761776581426649, "grad_norm": 0.3556898832321167, "learning_rate": 1.4962211846800078e-07, "loss": 0.0551, "step": 513 }, { "epoch": 2.7671601615074026, "grad_norm": 0.4652111232280731, "learning_rate": 1.426157401022321e-07, "loss": 0.0935, "step": 514 }, { "epoch": 2.772543741588156, "grad_norm": 0.33882859349250793, "learning_rate": 1.3577499075224821e-07, "loss": 0.0569, "step": 515 }, { "epoch": 2.7779273216689098, "grad_norm": 0.3924010992050171, "learning_rate": 1.2910010367663317e-07, "loss": 0.0646, "step": 516 }, { "epoch": 2.7833109017496636, "grad_norm": 0.38544705510139465, "learning_rate": 1.2259130647833627e-07, "loss": 0.0836, "step": 517 }, { "epoch": 2.7886944818304173, "grad_norm": 0.4027419984340668, "learning_rate": 1.162488210969065e-07, "loss": 0.0653, "step": 518 }, { "epoch": 2.794078061911171, "grad_norm": 0.3646996021270752, "learning_rate": 1.100728638009263e-07, "loss": 0.0617, "step": 519 }, { "epoch": 2.7994616419919245, "grad_norm": 0.2809794247150421, "learning_rate": 1.0406364518063927e-07, "loss": 0.0394, "step": 520 }, { "epoch": 2.8048452220726783, "grad_norm": 0.3826785385608673, "learning_rate": 9.822137014076472e-08, "loss": 0.0793, "step": 521 }, { "epoch": 2.810228802153432, "grad_norm": 0.324332594871521, "learning_rate": 9.254623789351714e-08, "loss": 0.0598, "step": 522 }, { "epoch": 2.815612382234186, "grad_norm": 0.32736143469810486, "learning_rate": 8.703844195180555e-08, "loss": 0.056, "step": 523 }, { "epoch": 2.8209959623149397, "grad_norm": 0.3823404908180237, "learning_rate": 8.169817012264214e-08, "loss": 0.068, "step": 524 }, { "epoch": 2.826379542395693, "grad_norm": 0.4002109169960022, "learning_rate": 7.652560450073454e-08, "loss": 0.0803, "step": 525 }, { "epoch": 2.831763122476447, "grad_norm": 0.44719937443733215, "learning_rate": 7.152092146227806e-08, "loss": 0.0853, "step": 526 }, { "epoch": 2.8371467025572006, "grad_norm": 0.40953177213668823, "learning_rate": 6.668429165893996e-08, "loss": 0.0587, "step": 527 }, { "epoch": 2.8425302826379544, "grad_norm": 0.34889018535614014, "learning_rate": 6.20158800120435e-08, "loss": 0.0653, "step": 528 }, { "epoch": 2.847913862718708, "grad_norm": 0.4065254330635071, "learning_rate": 5.7515845706940246e-08, "loss": 0.0847, "step": 529 }, { "epoch": 2.8532974427994615, "grad_norm": 0.4163700342178345, "learning_rate": 5.31843421875855e-08, "loss": 0.0616, "step": 530 }, { "epoch": 2.8586810228802153, "grad_norm": 0.40107670426368713, "learning_rate": 4.9021517151305875e-08, "loss": 0.0793, "step": 531 }, { "epoch": 2.864064602960969, "grad_norm": 0.34356680512428284, "learning_rate": 4.502751254375992e-08, "loss": 0.0571, "step": 532 }, { "epoch": 2.869448183041723, "grad_norm": 0.4364492893218994, "learning_rate": 4.120246455410204e-08, "loss": 0.0545, "step": 533 }, { "epoch": 2.8748317631224767, "grad_norm": 0.3949114680290222, "learning_rate": 3.7546503610336183e-08, "loss": 0.0672, "step": 534 }, { "epoch": 2.88021534320323, "grad_norm": 0.35691335797309875, "learning_rate": 3.405975437486997e-08, "loss": 0.0646, "step": 535 }, { "epoch": 2.885598923283984, "grad_norm": 0.3505745828151703, "learning_rate": 3.074233574026087e-08, "loss": 0.0556, "step": 536 }, { "epoch": 2.8909825033647376, "grad_norm": 0.345758318901062, "learning_rate": 2.7594360825166644e-08, "loss": 0.0664, "step": 537 }, { "epoch": 2.8963660834454914, "grad_norm": 0.3653146028518677, "learning_rate": 2.4615936970485144e-08, "loss": 0.0568, "step": 538 }, { "epoch": 2.901749663526245, "grad_norm": 0.35214874148368835, "learning_rate": 2.180716573569386e-08, "loss": 0.0723, "step": 539 }, { "epoch": 2.9071332436069985, "grad_norm": 0.31391990184783936, "learning_rate": 1.9168142895389376e-08, "loss": 0.0511, "step": 540 }, { "epoch": 2.9125168236877523, "grad_norm": 0.3372190594673157, "learning_rate": 1.6698958436019986e-08, "loss": 0.0559, "step": 541 }, { "epoch": 2.917900403768506, "grad_norm": 0.32231083512306213, "learning_rate": 1.4399696552816477e-08, "loss": 0.0585, "step": 542 }, { "epoch": 2.92328398384926, "grad_norm": 0.4236755669116974, "learning_rate": 1.2270435646922763e-08, "loss": 0.0818, "step": 543 }, { "epoch": 2.9286675639300137, "grad_norm": 0.3500356078147888, "learning_rate": 1.031124832272301e-08, "loss": 0.0716, "step": 544 }, { "epoch": 2.934051144010767, "grad_norm": 0.38234201073646545, "learning_rate": 8.522201385362528e-09, "loss": 0.0632, "step": 545 }, { "epoch": 2.939434724091521, "grad_norm": 0.39198631048202515, "learning_rate": 6.903355838475123e-09, "loss": 0.0707, "step": 546 }, { "epoch": 2.9448183041722746, "grad_norm": 0.3228546977043152, "learning_rate": 5.454766882097007e-09, "loss": 0.058, "step": 547 }, { "epoch": 2.9502018842530284, "grad_norm": 0.35666099190711975, "learning_rate": 4.1764839107905074e-09, "loss": 0.061, "step": 548 }, { "epoch": 2.955585464333782, "grad_norm": 0.3645073175430298, "learning_rate": 3.068550511955426e-09, "loss": 0.061, "step": 549 }, { "epoch": 2.9609690444145356, "grad_norm": 0.34374579787254333, "learning_rate": 2.131004464343556e-09, "loss": 0.0671, "step": 550 }, { "epoch": 2.9663526244952894, "grad_norm": 0.38777437806129456, "learning_rate": 1.3638777367724898e-09, "loss": 0.0789, "step": 551 }, { "epoch": 2.971736204576043, "grad_norm": 0.35030388832092285, "learning_rate": 7.671964870337168e-10, "loss": 0.0649, "step": 552 }, { "epoch": 2.9771197846567965, "grad_norm": 0.39920809864997864, "learning_rate": 3.4098106100166616e-10, "loss": 0.0783, "step": 553 }, { "epoch": 2.9825033647375507, "grad_norm": 0.4224764406681061, "learning_rate": 8.52459919381543e-11, "loss": 0.0818, "step": 554 }, { "epoch": 2.987886944818304, "grad_norm": 0.34364205598831177, "learning_rate": 0.0, "loss": 0.0664, "step": 555 }, { "epoch": 2.987886944818304, "step": 555, "total_flos": 1.777533654151463e+18, "train_loss": 0.2704765519647448, "train_runtime": 4301.2874, "train_samples_per_second": 4.142, "train_steps_per_second": 0.129 } ], "logging_steps": 1.0, "max_steps": 555, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.777533654151463e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }