{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045495905368516835, "grad_norm": 9.460982644953551, "learning_rate": 1e-06, "loss": 0.1263, "step": 1 }, { "epoch": 0.0009099181073703367, "grad_norm": 8.496088007652286, "learning_rate": 9.999999795711289e-07, "loss": 0.1976, "step": 2 }, { "epoch": 0.001364877161055505, "grad_norm": 8.795727896186301, "learning_rate": 9.999999182845176e-07, "loss": 0.1936, "step": 3 }, { "epoch": 0.0018198362147406734, "grad_norm": 2.884208258076819, "learning_rate": 9.999998161401707e-07, "loss": 0.1341, "step": 4 }, { "epoch": 0.0022747952684258415, "grad_norm": 6.093272608844917, "learning_rate": 9.999996731380972e-07, "loss": 0.1653, "step": 5 }, { "epoch": 0.00272975432211101, "grad_norm": 6.206942040526342, "learning_rate": 9.99999489278308e-07, "loss": 0.1233, "step": 6 }, { "epoch": 0.0031847133757961785, "grad_norm": 5.623248620409181, "learning_rate": 9.999992645608189e-07, "loss": 0.11, "step": 7 }, { "epoch": 0.003639672429481347, "grad_norm": 5.6212334255704395, "learning_rate": 9.999989989856477e-07, "loss": 0.1256, "step": 8 }, { "epoch": 0.004094631483166515, "grad_norm": 5.115332368305895, "learning_rate": 9.999986925528161e-07, "loss": 0.0802, "step": 9 }, { "epoch": 0.004549590536851683, "grad_norm": 7.6754517372106115, "learning_rate": 9.999983452623496e-07, "loss": 0.1339, "step": 10 }, { "epoch": 0.005004549590536852, "grad_norm": 7.4519771920318885, "learning_rate": 9.999979571142762e-07, "loss": 0.1241, "step": 11 }, { "epoch": 0.00545950864422202, "grad_norm": 5.281625514057187, "learning_rate": 9.999975281086276e-07, "loss": 0.1225, "step": 12 }, { "epoch": 0.005914467697907188, "grad_norm": 2.826051783008299, "learning_rate": 9.99997058245439e-07, "loss": 0.0938, "step": 13 }, { "epoch": 0.006369426751592357, "grad_norm": 16.98790451752032, "learning_rate": 9.99996547524749e-07, "loss": 0.1451, "step": 14 }, { "epoch": 0.006824385805277525, "grad_norm": 3.879403880177224, "learning_rate": 9.999959959465988e-07, "loss": 0.1075, "step": 15 }, { "epoch": 0.007279344858962694, "grad_norm": 6.135164623225957, "learning_rate": 9.999954035110341e-07, "loss": 0.0868, "step": 16 }, { "epoch": 0.0077343039126478615, "grad_norm": 7.353193747413332, "learning_rate": 9.999947702181026e-07, "loss": 0.1358, "step": 17 }, { "epoch": 0.00818926296633303, "grad_norm": 4.665297196059896, "learning_rate": 9.999940960678568e-07, "loss": 0.1028, "step": 18 }, { "epoch": 0.008644222020018199, "grad_norm": 6.343910178022308, "learning_rate": 9.99993381060351e-07, "loss": 0.1079, "step": 19 }, { "epoch": 0.009099181073703366, "grad_norm": 3.1318769019186106, "learning_rate": 9.999926251956445e-07, "loss": 0.0762, "step": 20 }, { "epoch": 0.009554140127388535, "grad_norm": 5.639674558770591, "learning_rate": 9.999918284737984e-07, "loss": 0.1107, "step": 21 }, { "epoch": 0.010009099181073703, "grad_norm": 3.944694198517734, "learning_rate": 9.999909908948781e-07, "loss": 0.0792, "step": 22 }, { "epoch": 0.010464058234758872, "grad_norm": 5.277637634775166, "learning_rate": 9.999901124589518e-07, "loss": 0.0955, "step": 23 }, { "epoch": 0.01091901728844404, "grad_norm": 3.159102517555636, "learning_rate": 9.999891931660915e-07, "loss": 0.1177, "step": 24 }, { "epoch": 0.011373976342129208, "grad_norm": 3.876069018264601, "learning_rate": 9.999882330163725e-07, "loss": 0.0942, "step": 25 }, { "epoch": 0.011828935395814377, "grad_norm": 5.310251289002937, "learning_rate": 9.999872320098726e-07, "loss": 0.0772, "step": 26 }, { "epoch": 0.012283894449499545, "grad_norm": 2.4013872287511204, "learning_rate": 9.999861901466744e-07, "loss": 0.113, "step": 27 }, { "epoch": 0.012738853503184714, "grad_norm": 3.226872006603208, "learning_rate": 9.999851074268623e-07, "loss": 0.0948, "step": 28 }, { "epoch": 0.013193812556869881, "grad_norm": 4.948814654016484, "learning_rate": 9.999839838505255e-07, "loss": 0.1225, "step": 29 }, { "epoch": 0.01364877161055505, "grad_norm": 2.1915501350462883, "learning_rate": 9.999828194177554e-07, "loss": 0.1092, "step": 30 }, { "epoch": 0.014103730664240218, "grad_norm": 2.457727680328418, "learning_rate": 9.999816141286471e-07, "loss": 0.091, "step": 31 }, { "epoch": 0.014558689717925387, "grad_norm": 2.096610231076784, "learning_rate": 9.99980367983299e-07, "loss": 0.079, "step": 32 }, { "epoch": 0.015013648771610554, "grad_norm": 3.4789632001628985, "learning_rate": 9.999790809818133e-07, "loss": 0.069, "step": 33 }, { "epoch": 0.015468607825295723, "grad_norm": 2.432234105105967, "learning_rate": 9.99977753124295e-07, "loss": 0.0916, "step": 34 }, { "epoch": 0.01592356687898089, "grad_norm": 2.8811408833854797, "learning_rate": 9.999763844108525e-07, "loss": 0.0558, "step": 35 }, { "epoch": 0.01637852593266606, "grad_norm": 3.0517587459597832, "learning_rate": 9.99974974841598e-07, "loss": 0.0574, "step": 36 }, { "epoch": 0.01683348498635123, "grad_norm": 5.911523880873187, "learning_rate": 9.999735244166462e-07, "loss": 0.1149, "step": 37 }, { "epoch": 0.017288444040036398, "grad_norm": 2.2249115738169034, "learning_rate": 9.99972033136116e-07, "loss": 0.1024, "step": 38 }, { "epoch": 0.017743403093721567, "grad_norm": 18.233741266972995, "learning_rate": 9.99970501000129e-07, "loss": 0.0919, "step": 39 }, { "epoch": 0.018198362147406732, "grad_norm": 5.1798303460368755, "learning_rate": 9.999689280088103e-07, "loss": 0.0775, "step": 40 }, { "epoch": 0.0186533212010919, "grad_norm": 2.1459089246867316, "learning_rate": 9.99967314162289e-07, "loss": 0.0771, "step": 41 }, { "epoch": 0.01910828025477707, "grad_norm": 2.310140859334937, "learning_rate": 9.999656594606964e-07, "loss": 0.096, "step": 42 }, { "epoch": 0.019563239308462238, "grad_norm": 1.7877695653527323, "learning_rate": 9.99963963904168e-07, "loss": 0.0889, "step": 43 }, { "epoch": 0.020018198362147407, "grad_norm": 2.3036066458968403, "learning_rate": 9.999622274928424e-07, "loss": 0.0904, "step": 44 }, { "epoch": 0.020473157415832575, "grad_norm": 8.068779945527933, "learning_rate": 9.999604502268613e-07, "loss": 0.1085, "step": 45 }, { "epoch": 0.020928116469517744, "grad_norm": 4.428001428676284, "learning_rate": 9.999586321063698e-07, "loss": 0.0724, "step": 46 }, { "epoch": 0.021383075523202913, "grad_norm": 3.472814836739773, "learning_rate": 9.999567731315169e-07, "loss": 0.1807, "step": 47 }, { "epoch": 0.02183803457688808, "grad_norm": 4.53270932228853, "learning_rate": 9.999548733024543e-07, "loss": 0.183, "step": 48 }, { "epoch": 0.022292993630573247, "grad_norm": 4.635214304750299, "learning_rate": 9.99952932619337e-07, "loss": 0.0963, "step": 49 }, { "epoch": 0.022747952684258416, "grad_norm": 2.524081592582436, "learning_rate": 9.99950951082324e-07, "loss": 0.0768, "step": 50 }, { "epoch": 0.023202911737943584, "grad_norm": 2.4497670829961495, "learning_rate": 9.999489286915772e-07, "loss": 0.0754, "step": 51 }, { "epoch": 0.023657870791628753, "grad_norm": 2.3453588015577154, "learning_rate": 9.999468654472614e-07, "loss": 0.0742, "step": 52 }, { "epoch": 0.024112829845313922, "grad_norm": 1.0559086878519577, "learning_rate": 9.999447613495457e-07, "loss": 0.0561, "step": 53 }, { "epoch": 0.02456778889899909, "grad_norm": 4.180479602032368, "learning_rate": 9.999426163986018e-07, "loss": 0.0685, "step": 54 }, { "epoch": 0.02502274795268426, "grad_norm": 1.485403464030856, "learning_rate": 9.99940430594605e-07, "loss": 0.0595, "step": 55 }, { "epoch": 0.025477707006369428, "grad_norm": 3.818350504676734, "learning_rate": 9.999382039377338e-07, "loss": 0.0703, "step": 56 }, { "epoch": 0.025932666060054597, "grad_norm": 3.712467276552201, "learning_rate": 9.999359364281704e-07, "loss": 0.0465, "step": 57 }, { "epoch": 0.026387625113739762, "grad_norm": 2.5314824071119997, "learning_rate": 9.999336280660999e-07, "loss": 0.1118, "step": 58 }, { "epoch": 0.02684258416742493, "grad_norm": 2.01187791111904, "learning_rate": 9.99931278851711e-07, "loss": 0.0779, "step": 59 }, { "epoch": 0.0272975432211101, "grad_norm": 1.8034515649341665, "learning_rate": 9.999288887851956e-07, "loss": 0.0714, "step": 60 }, { "epoch": 0.027752502274795268, "grad_norm": 5.939307329468221, "learning_rate": 9.999264578667492e-07, "loss": 0.0624, "step": 61 }, { "epoch": 0.028207461328480437, "grad_norm": 3.5462049672421694, "learning_rate": 9.999239860965701e-07, "loss": 0.084, "step": 62 }, { "epoch": 0.028662420382165606, "grad_norm": 1.9190519391810084, "learning_rate": 9.999214734748607e-07, "loss": 0.0868, "step": 63 }, { "epoch": 0.029117379435850774, "grad_norm": 2.990182475432006, "learning_rate": 9.999189200018262e-07, "loss": 0.0687, "step": 64 }, { "epoch": 0.029572338489535943, "grad_norm": 3.6283565145413035, "learning_rate": 9.999163256776748e-07, "loss": 0.1066, "step": 65 }, { "epoch": 0.03002729754322111, "grad_norm": 2.46041046107172, "learning_rate": 9.999136905026192e-07, "loss": 0.1429, "step": 66 }, { "epoch": 0.030482256596906277, "grad_norm": 2.540310192062455, "learning_rate": 9.999110144768743e-07, "loss": 0.0871, "step": 67 }, { "epoch": 0.030937215650591446, "grad_norm": 2.3481485598576555, "learning_rate": 9.999082976006589e-07, "loss": 0.0708, "step": 68 }, { "epoch": 0.03139217470427662, "grad_norm": 1.5421437731161647, "learning_rate": 9.99905539874195e-07, "loss": 0.0646, "step": 69 }, { "epoch": 0.03184713375796178, "grad_norm": 1.862783458602375, "learning_rate": 9.99902741297708e-07, "loss": 0.073, "step": 70 }, { "epoch": 0.03230209281164695, "grad_norm": 2.7514307829983196, "learning_rate": 9.998999018714263e-07, "loss": 0.0636, "step": 71 }, { "epoch": 0.03275705186533212, "grad_norm": 2.088273099435258, "learning_rate": 9.998970215955822e-07, "loss": 0.0759, "step": 72 }, { "epoch": 0.033212010919017286, "grad_norm": 4.880835542241873, "learning_rate": 9.998941004704111e-07, "loss": 0.0607, "step": 73 }, { "epoch": 0.03366696997270246, "grad_norm": 2.294548042436939, "learning_rate": 9.998911384961517e-07, "loss": 0.0851, "step": 74 }, { "epoch": 0.034121929026387623, "grad_norm": 3.135395035411956, "learning_rate": 9.998881356730458e-07, "loss": 0.0615, "step": 75 }, { "epoch": 0.034576888080072796, "grad_norm": 2.575133750401631, "learning_rate": 9.998850920013388e-07, "loss": 0.0799, "step": 76 }, { "epoch": 0.03503184713375796, "grad_norm": 2.565050657771875, "learning_rate": 9.998820074812797e-07, "loss": 0.0696, "step": 77 }, { "epoch": 0.03548680618744313, "grad_norm": 2.2396897888125915, "learning_rate": 9.998788821131206e-07, "loss": 0.1037, "step": 78 }, { "epoch": 0.0359417652411283, "grad_norm": 1.617159978432207, "learning_rate": 9.998757158971164e-07, "loss": 0.0709, "step": 79 }, { "epoch": 0.036396724294813464, "grad_norm": 2.8489915433996913, "learning_rate": 9.998725088335263e-07, "loss": 0.0888, "step": 80 }, { "epoch": 0.036851683348498636, "grad_norm": 0.9923870511013093, "learning_rate": 9.99869260922612e-07, "loss": 0.0475, "step": 81 }, { "epoch": 0.0373066424021838, "grad_norm": 9.863429979788943, "learning_rate": 9.998659721646392e-07, "loss": 0.082, "step": 82 }, { "epoch": 0.03776160145586897, "grad_norm": 1.468013198811096, "learning_rate": 9.998626425598765e-07, "loss": 0.0783, "step": 83 }, { "epoch": 0.03821656050955414, "grad_norm": 2.6390574435332756, "learning_rate": 9.99859272108596e-07, "loss": 0.0803, "step": 84 }, { "epoch": 0.03867151956323931, "grad_norm": 1.4779232115601733, "learning_rate": 9.998558608110731e-07, "loss": 0.0871, "step": 85 }, { "epoch": 0.039126478616924476, "grad_norm": 1.5249547529775251, "learning_rate": 9.998524086675866e-07, "loss": 0.0723, "step": 86 }, { "epoch": 0.03958143767060965, "grad_norm": 2.119420999088343, "learning_rate": 9.998489156784186e-07, "loss": 0.0816, "step": 87 }, { "epoch": 0.040036396724294813, "grad_norm": 1.6071500805434977, "learning_rate": 9.998453818438546e-07, "loss": 0.0882, "step": 88 }, { "epoch": 0.04049135577797998, "grad_norm": 1.3520647569618234, "learning_rate": 9.998418071641832e-07, "loss": 0.0717, "step": 89 }, { "epoch": 0.04094631483166515, "grad_norm": 1.3581714785879615, "learning_rate": 9.998381916396965e-07, "loss": 0.0619, "step": 90 }, { "epoch": 0.041401273885350316, "grad_norm": 1.2211168419055232, "learning_rate": 9.9983453527069e-07, "loss": 0.049, "step": 91 }, { "epoch": 0.04185623293903549, "grad_norm": 1.4130875618701322, "learning_rate": 9.998308380574627e-07, "loss": 0.0603, "step": 92 }, { "epoch": 0.042311191992720654, "grad_norm": 2.5116395183528146, "learning_rate": 9.998271000003164e-07, "loss": 0.117, "step": 93 }, { "epoch": 0.042766151046405826, "grad_norm": 1.3221909432162122, "learning_rate": 9.998233210995568e-07, "loss": 0.0752, "step": 94 }, { "epoch": 0.04322111010009099, "grad_norm": 8.422591179527869, "learning_rate": 9.998195013554926e-07, "loss": 0.1043, "step": 95 }, { "epoch": 0.04367606915377616, "grad_norm": 1.3172883326606952, "learning_rate": 9.998156407684356e-07, "loss": 0.0932, "step": 96 }, { "epoch": 0.04413102820746133, "grad_norm": 1.6768409465958336, "learning_rate": 9.99811739338702e-07, "loss": 0.0542, "step": 97 }, { "epoch": 0.044585987261146494, "grad_norm": 2.7309524458462535, "learning_rate": 9.9980779706661e-07, "loss": 0.0695, "step": 98 }, { "epoch": 0.045040946314831666, "grad_norm": 1.7815589902074003, "learning_rate": 9.998038139524819e-07, "loss": 0.0677, "step": 99 }, { "epoch": 0.04549590536851683, "grad_norm": 2.0846413148949043, "learning_rate": 9.997997899966432e-07, "loss": 0.0721, "step": 100 }, { "epoch": 0.045950864422202004, "grad_norm": 1.6681584962433291, "learning_rate": 9.997957251994229e-07, "loss": 0.092, "step": 101 }, { "epoch": 0.04640582347588717, "grad_norm": 2.0217998424011934, "learning_rate": 9.99791619561153e-07, "loss": 0.0785, "step": 102 }, { "epoch": 0.04686078252957234, "grad_norm": 1.31297777091851, "learning_rate": 9.997874730821687e-07, "loss": 0.062, "step": 103 }, { "epoch": 0.047315741583257506, "grad_norm": 28.84255784761873, "learning_rate": 9.997832857628093e-07, "loss": 0.2905, "step": 104 }, { "epoch": 0.04777070063694268, "grad_norm": 2.1953371485715953, "learning_rate": 9.997790576034168e-07, "loss": 0.1315, "step": 105 }, { "epoch": 0.048225659690627844, "grad_norm": 1.752579441726796, "learning_rate": 9.997747886043366e-07, "loss": 0.0592, "step": 106 }, { "epoch": 0.04868061874431301, "grad_norm": 1.214479974144816, "learning_rate": 9.99770478765918e-07, "loss": 0.0658, "step": 107 }, { "epoch": 0.04913557779799818, "grad_norm": 2.119879894581063, "learning_rate": 9.997661280885123e-07, "loss": 0.1249, "step": 108 }, { "epoch": 0.049590536851683346, "grad_norm": 1.6123956274563962, "learning_rate": 9.997617365724757e-07, "loss": 0.0721, "step": 109 }, { "epoch": 0.05004549590536852, "grad_norm": 3.8956481181984954, "learning_rate": 9.997573042181672e-07, "loss": 0.0668, "step": 110 }, { "epoch": 0.050500454959053684, "grad_norm": 1.302369323422075, "learning_rate": 9.997528310259483e-07, "loss": 0.0978, "step": 111 }, { "epoch": 0.050955414012738856, "grad_norm": 1.9393382827627617, "learning_rate": 9.997483169961851e-07, "loss": 0.0919, "step": 112 }, { "epoch": 0.05141037306642402, "grad_norm": 1.8924498716761984, "learning_rate": 9.99743762129246e-07, "loss": 0.0737, "step": 113 }, { "epoch": 0.051865332120109194, "grad_norm": 1.5015649842043453, "learning_rate": 9.99739166425504e-07, "loss": 0.0614, "step": 114 }, { "epoch": 0.05232029117379436, "grad_norm": 2.1366922787715863, "learning_rate": 9.997345298853336e-07, "loss": 0.1374, "step": 115 }, { "epoch": 0.052775250227479524, "grad_norm": 1.9167133713490603, "learning_rate": 9.997298525091147e-07, "loss": 0.0726, "step": 116 }, { "epoch": 0.053230209281164696, "grad_norm": 1.340457987244246, "learning_rate": 9.997251342972286e-07, "loss": 0.0685, "step": 117 }, { "epoch": 0.05368516833484986, "grad_norm": 1.6965958368742107, "learning_rate": 9.997203752500615e-07, "loss": 0.074, "step": 118 }, { "epoch": 0.054140127388535034, "grad_norm": 2.096296179848149, "learning_rate": 9.99715575368002e-07, "loss": 0.0716, "step": 119 }, { "epoch": 0.0545950864422202, "grad_norm": 1.2376773021772507, "learning_rate": 9.997107346514424e-07, "loss": 0.066, "step": 120 }, { "epoch": 0.05505004549590537, "grad_norm": 0.9612721709748118, "learning_rate": 9.99705853100778e-07, "loss": 0.0512, "step": 121 }, { "epoch": 0.055505004549590536, "grad_norm": 1.4181375311562314, "learning_rate": 9.99700930716408e-07, "loss": 0.0823, "step": 122 }, { "epoch": 0.05595996360327571, "grad_norm": 3.5180526458549797, "learning_rate": 9.99695967498735e-07, "loss": 0.0983, "step": 123 }, { "epoch": 0.056414922656960874, "grad_norm": 4.112860504161507, "learning_rate": 9.996909634481637e-07, "loss": 0.0561, "step": 124 }, { "epoch": 0.05686988171064604, "grad_norm": 28.236450106120056, "learning_rate": 9.996859185651036e-07, "loss": 0.2988, "step": 125 }, { "epoch": 0.05732484076433121, "grad_norm": 1.5902388588480536, "learning_rate": 9.99680832849967e-07, "loss": 0.0975, "step": 126 }, { "epoch": 0.05777979981801638, "grad_norm": 1.2347324615542328, "learning_rate": 9.996757063031689e-07, "loss": 0.0494, "step": 127 }, { "epoch": 0.05823475887170155, "grad_norm": 4.028509250415612, "learning_rate": 9.996705389251287e-07, "loss": 0.0937, "step": 128 }, { "epoch": 0.058689717925386714, "grad_norm": 1.771230906552479, "learning_rate": 9.996653307162686e-07, "loss": 0.0639, "step": 129 }, { "epoch": 0.059144676979071886, "grad_norm": 2.795754260594168, "learning_rate": 9.996600816770142e-07, "loss": 0.0768, "step": 130 }, { "epoch": 0.05959963603275705, "grad_norm": 1.876401549309021, "learning_rate": 9.996547918077943e-07, "loss": 0.0952, "step": 131 }, { "epoch": 0.06005459508644222, "grad_norm": 1.6308837105022365, "learning_rate": 9.996494611090412e-07, "loss": 0.0774, "step": 132 }, { "epoch": 0.06050955414012739, "grad_norm": 1.0100069577180617, "learning_rate": 9.996440895811907e-07, "loss": 0.0523, "step": 133 }, { "epoch": 0.060964513193812554, "grad_norm": 1.9655730360996575, "learning_rate": 9.996386772246814e-07, "loss": 0.0858, "step": 134 }, { "epoch": 0.061419472247497726, "grad_norm": 2.098545470677641, "learning_rate": 9.996332240399558e-07, "loss": 0.0791, "step": 135 }, { "epoch": 0.06187443130118289, "grad_norm": 10.955612559167871, "learning_rate": 9.996277300274595e-07, "loss": 0.1788, "step": 136 }, { "epoch": 0.062329390354868064, "grad_norm": 1.389590670783533, "learning_rate": 9.996221951876414e-07, "loss": 0.1015, "step": 137 }, { "epoch": 0.06278434940855324, "grad_norm": 1.75423879868311, "learning_rate": 9.996166195209536e-07, "loss": 0.1009, "step": 138 }, { "epoch": 0.0632393084622384, "grad_norm": 1.1611431350130335, "learning_rate": 9.996110030278523e-07, "loss": 0.0542, "step": 139 }, { "epoch": 0.06369426751592357, "grad_norm": 1.177330204385083, "learning_rate": 9.996053457087956e-07, "loss": 0.0678, "step": 140 }, { "epoch": 0.06414922656960874, "grad_norm": 1.1343819646499882, "learning_rate": 9.995996475642466e-07, "loss": 0.0735, "step": 141 }, { "epoch": 0.0646041856232939, "grad_norm": 1.6851312977878308, "learning_rate": 9.995939085946703e-07, "loss": 0.0627, "step": 142 }, { "epoch": 0.06505914467697907, "grad_norm": 1.5381425131907351, "learning_rate": 9.995881288005362e-07, "loss": 0.0713, "step": 143 }, { "epoch": 0.06551410373066424, "grad_norm": 2.1112756199827003, "learning_rate": 9.995823081823161e-07, "loss": 0.0619, "step": 144 }, { "epoch": 0.06596906278434941, "grad_norm": 1.4393207733449387, "learning_rate": 9.99576446740486e-07, "loss": 0.0779, "step": 145 }, { "epoch": 0.06642402183803457, "grad_norm": 4.030195148512648, "learning_rate": 9.995705444755247e-07, "loss": 0.1547, "step": 146 }, { "epoch": 0.06687898089171974, "grad_norm": 1.0198030200938744, "learning_rate": 9.995646013879146e-07, "loss": 0.0591, "step": 147 }, { "epoch": 0.06733393994540492, "grad_norm": 2.5309104773358615, "learning_rate": 9.995586174781413e-07, "loss": 0.0702, "step": 148 }, { "epoch": 0.06778889899909009, "grad_norm": 1.2810295777297256, "learning_rate": 9.995525927466935e-07, "loss": 0.0764, "step": 149 }, { "epoch": 0.06824385805277525, "grad_norm": 1.4301504427281813, "learning_rate": 9.995465271940642e-07, "loss": 0.0643, "step": 150 }, { "epoch": 0.06869881710646042, "grad_norm": 1.8711805426782187, "learning_rate": 9.995404208207482e-07, "loss": 0.0826, "step": 151 }, { "epoch": 0.06915377616014559, "grad_norm": 1.875735096442477, "learning_rate": 9.99534273627245e-07, "loss": 0.0949, "step": 152 }, { "epoch": 0.06960873521383075, "grad_norm": 3.96430833554185, "learning_rate": 9.99528085614057e-07, "loss": 0.1094, "step": 153 }, { "epoch": 0.07006369426751592, "grad_norm": 1.4251935695546751, "learning_rate": 9.995218567816899e-07, "loss": 0.0564, "step": 154 }, { "epoch": 0.0705186533212011, "grad_norm": 1.3176900708215529, "learning_rate": 9.99515587130652e-07, "loss": 0.0631, "step": 155 }, { "epoch": 0.07097361237488627, "grad_norm": 1.7624159071995484, "learning_rate": 9.995092766614566e-07, "loss": 0.0572, "step": 156 }, { "epoch": 0.07142857142857142, "grad_norm": 1.3925157957876113, "learning_rate": 9.995029253746186e-07, "loss": 0.0805, "step": 157 }, { "epoch": 0.0718835304822566, "grad_norm": 1.4376441428641062, "learning_rate": 9.994965332706572e-07, "loss": 0.0548, "step": 158 }, { "epoch": 0.07233848953594177, "grad_norm": 1.7813293579714622, "learning_rate": 9.99490100350095e-07, "loss": 0.0842, "step": 159 }, { "epoch": 0.07279344858962693, "grad_norm": 1.2919649617338766, "learning_rate": 9.994836266134575e-07, "loss": 0.0707, "step": 160 }, { "epoch": 0.0732484076433121, "grad_norm": 1.464068687152245, "learning_rate": 9.994771120612734e-07, "loss": 0.0802, "step": 161 }, { "epoch": 0.07370336669699727, "grad_norm": 1.4409455272274696, "learning_rate": 9.994705566940756e-07, "loss": 0.0735, "step": 162 }, { "epoch": 0.07415832575068244, "grad_norm": 1.6004712311017868, "learning_rate": 9.994639605123993e-07, "loss": 0.0751, "step": 163 }, { "epoch": 0.0746132848043676, "grad_norm": 1.4178445048602115, "learning_rate": 9.994573235167837e-07, "loss": 0.0677, "step": 164 }, { "epoch": 0.07506824385805277, "grad_norm": 1.9143467099814084, "learning_rate": 9.994506457077713e-07, "loss": 0.0803, "step": 165 }, { "epoch": 0.07552320291173795, "grad_norm": 2.308950856245503, "learning_rate": 9.994439270859077e-07, "loss": 0.1114, "step": 166 }, { "epoch": 0.07597816196542312, "grad_norm": 1.4976553220218047, "learning_rate": 9.994371676517416e-07, "loss": 0.0648, "step": 167 }, { "epoch": 0.07643312101910828, "grad_norm": 1.3873054469654502, "learning_rate": 9.994303674058258e-07, "loss": 0.0656, "step": 168 }, { "epoch": 0.07688808007279345, "grad_norm": 1.4242003105924865, "learning_rate": 9.994235263487158e-07, "loss": 0.0696, "step": 169 }, { "epoch": 0.07734303912647862, "grad_norm": 1.5170054860626223, "learning_rate": 9.994166444809704e-07, "loss": 0.06, "step": 170 }, { "epoch": 0.07779799818016378, "grad_norm": 1.3198324267816783, "learning_rate": 9.994097218031523e-07, "loss": 0.0572, "step": 171 }, { "epoch": 0.07825295723384895, "grad_norm": 1.2355032396516417, "learning_rate": 9.99402758315827e-07, "loss": 0.0714, "step": 172 }, { "epoch": 0.07870791628753412, "grad_norm": 2.2363524490578994, "learning_rate": 9.993957540195637e-07, "loss": 0.069, "step": 173 }, { "epoch": 0.0791628753412193, "grad_norm": 1.1537045057849564, "learning_rate": 9.993887089149345e-07, "loss": 0.0781, "step": 174 }, { "epoch": 0.07961783439490445, "grad_norm": 1.1483158852396702, "learning_rate": 9.993816230025151e-07, "loss": 0.0586, "step": 175 }, { "epoch": 0.08007279344858963, "grad_norm": 2.060757875095848, "learning_rate": 9.99374496282885e-07, "loss": 0.084, "step": 176 }, { "epoch": 0.0805277525022748, "grad_norm": 2.745784074986501, "learning_rate": 9.99367328756626e-07, "loss": 0.1264, "step": 177 }, { "epoch": 0.08098271155595996, "grad_norm": 0.8765067657246923, "learning_rate": 9.99360120424324e-07, "loss": 0.0442, "step": 178 }, { "epoch": 0.08143767060964513, "grad_norm": 0.8649614951279925, "learning_rate": 9.99352871286568e-07, "loss": 0.0614, "step": 179 }, { "epoch": 0.0818926296633303, "grad_norm": 1.891800612550326, "learning_rate": 9.993455813439506e-07, "loss": 0.0612, "step": 180 }, { "epoch": 0.08234758871701547, "grad_norm": 1.2201801550256919, "learning_rate": 9.993382505970671e-07, "loss": 0.0491, "step": 181 }, { "epoch": 0.08280254777070063, "grad_norm": 1.3933852940059142, "learning_rate": 9.99330879046517e-07, "loss": 0.0621, "step": 182 }, { "epoch": 0.0832575068243858, "grad_norm": 1.0007322979585427, "learning_rate": 9.993234666929023e-07, "loss": 0.0512, "step": 183 }, { "epoch": 0.08371246587807098, "grad_norm": 1.5648385532765003, "learning_rate": 9.993160135368288e-07, "loss": 0.0641, "step": 184 }, { "epoch": 0.08416742493175614, "grad_norm": 1.1000106659426367, "learning_rate": 9.993085195789055e-07, "loss": 0.088, "step": 185 }, { "epoch": 0.08462238398544131, "grad_norm": 1.3353211888071153, "learning_rate": 9.99300984819745e-07, "loss": 0.0618, "step": 186 }, { "epoch": 0.08507734303912648, "grad_norm": 1.166219455861875, "learning_rate": 9.992934092599627e-07, "loss": 0.0615, "step": 187 }, { "epoch": 0.08553230209281165, "grad_norm": 1.2821420783611344, "learning_rate": 9.99285792900178e-07, "loss": 0.0902, "step": 188 }, { "epoch": 0.08598726114649681, "grad_norm": 1.4699402496364335, "learning_rate": 9.99278135741013e-07, "loss": 0.0965, "step": 189 }, { "epoch": 0.08644222020018198, "grad_norm": 1.2092187178175942, "learning_rate": 9.992704377830933e-07, "loss": 0.0495, "step": 190 }, { "epoch": 0.08689717925386715, "grad_norm": 2.4242211713295094, "learning_rate": 9.99262699027048e-07, "loss": 0.1044, "step": 191 }, { "epoch": 0.08735213830755233, "grad_norm": 1.4912711552359401, "learning_rate": 9.9925491947351e-07, "loss": 0.0803, "step": 192 }, { "epoch": 0.08780709736123748, "grad_norm": 0.8264184113688682, "learning_rate": 9.992470991231143e-07, "loss": 0.044, "step": 193 }, { "epoch": 0.08826205641492266, "grad_norm": 1.5856229366100132, "learning_rate": 9.992392379765003e-07, "loss": 0.0846, "step": 194 }, { "epoch": 0.08871701546860783, "grad_norm": 1.711195707943432, "learning_rate": 9.992313360343104e-07, "loss": 0.0953, "step": 195 }, { "epoch": 0.08917197452229299, "grad_norm": 1.4375057634941018, "learning_rate": 9.992233932971901e-07, "loss": 0.0598, "step": 196 }, { "epoch": 0.08962693357597816, "grad_norm": 1.9214247088463914, "learning_rate": 9.992154097657887e-07, "loss": 0.0844, "step": 197 }, { "epoch": 0.09008189262966333, "grad_norm": 1.5260802768094281, "learning_rate": 9.992073854407584e-07, "loss": 0.0821, "step": 198 }, { "epoch": 0.0905368516833485, "grad_norm": 4.544205646306275, "learning_rate": 9.991993203227549e-07, "loss": 0.0774, "step": 199 }, { "epoch": 0.09099181073703366, "grad_norm": 0.9784468610737326, "learning_rate": 9.991912144124373e-07, "loss": 0.0665, "step": 200 }, { "epoch": 0.09144676979071883, "grad_norm": 1.533362131047682, "learning_rate": 9.991830677104681e-07, "loss": 0.094, "step": 201 }, { "epoch": 0.09190172884440401, "grad_norm": 1.4442948304988266, "learning_rate": 9.991748802175128e-07, "loss": 0.0679, "step": 202 }, { "epoch": 0.09235668789808917, "grad_norm": 1.778353868301846, "learning_rate": 9.991666519342406e-07, "loss": 0.0802, "step": 203 }, { "epoch": 0.09281164695177434, "grad_norm": 1.4080886395933547, "learning_rate": 9.991583828613238e-07, "loss": 0.0714, "step": 204 }, { "epoch": 0.09326660600545951, "grad_norm": 1.7407833171061422, "learning_rate": 9.991500729994381e-07, "loss": 0.0708, "step": 205 }, { "epoch": 0.09372156505914468, "grad_norm": 1.309202587334328, "learning_rate": 9.991417223492626e-07, "loss": 0.061, "step": 206 }, { "epoch": 0.09417652411282984, "grad_norm": 1.700325090710129, "learning_rate": 9.991333309114798e-07, "loss": 0.0765, "step": 207 }, { "epoch": 0.09463148316651501, "grad_norm": 1.279994579409623, "learning_rate": 9.991248986867752e-07, "loss": 0.0764, "step": 208 }, { "epoch": 0.09508644222020018, "grad_norm": 2.1446370224541944, "learning_rate": 9.991164256758377e-07, "loss": 0.0849, "step": 209 }, { "epoch": 0.09554140127388536, "grad_norm": 1.9861669945110423, "learning_rate": 9.991079118793599e-07, "loss": 0.1085, "step": 210 }, { "epoch": 0.09599636032757052, "grad_norm": 1.2727945113244215, "learning_rate": 9.990993572980376e-07, "loss": 0.0604, "step": 211 }, { "epoch": 0.09645131938125569, "grad_norm": 1.5167661565264423, "learning_rate": 9.990907619325699e-07, "loss": 0.0764, "step": 212 }, { "epoch": 0.09690627843494086, "grad_norm": 1.8297145638902679, "learning_rate": 9.990821257836587e-07, "loss": 0.0645, "step": 213 }, { "epoch": 0.09736123748862602, "grad_norm": 1.5890455937330983, "learning_rate": 9.990734488520102e-07, "loss": 0.0921, "step": 214 }, { "epoch": 0.09781619654231119, "grad_norm": 1.1160881756443122, "learning_rate": 9.990647311383334e-07, "loss": 0.0438, "step": 215 }, { "epoch": 0.09827115559599636, "grad_norm": 2.4418827554895692, "learning_rate": 9.990559726433402e-07, "loss": 0.0815, "step": 216 }, { "epoch": 0.09872611464968153, "grad_norm": 1.1378394657905113, "learning_rate": 9.990471733677468e-07, "loss": 0.0533, "step": 217 }, { "epoch": 0.09918107370336669, "grad_norm": 1.0552640743236452, "learning_rate": 9.99038333312272e-07, "loss": 0.059, "step": 218 }, { "epoch": 0.09963603275705187, "grad_norm": 0.9353344080834061, "learning_rate": 9.990294524776383e-07, "loss": 0.0453, "step": 219 }, { "epoch": 0.10009099181073704, "grad_norm": 1.157580669698293, "learning_rate": 9.990205308645714e-07, "loss": 0.0656, "step": 220 }, { "epoch": 0.1005459508644222, "grad_norm": 0.97129693549583, "learning_rate": 9.990115684738003e-07, "loss": 0.0409, "step": 221 }, { "epoch": 0.10100090991810737, "grad_norm": 1.114729616865196, "learning_rate": 9.990025653060571e-07, "loss": 0.0678, "step": 222 }, { "epoch": 0.10145586897179254, "grad_norm": 1.6357787684705223, "learning_rate": 9.98993521362078e-07, "loss": 0.0982, "step": 223 }, { "epoch": 0.10191082802547771, "grad_norm": 1.7357276362378158, "learning_rate": 9.989844366426017e-07, "loss": 0.1056, "step": 224 }, { "epoch": 0.10236578707916287, "grad_norm": 0.8966549628660352, "learning_rate": 9.989753111483705e-07, "loss": 0.0379, "step": 225 }, { "epoch": 0.10282074613284804, "grad_norm": 1.475329265238904, "learning_rate": 9.989661448801302e-07, "loss": 0.0826, "step": 226 }, { "epoch": 0.10327570518653321, "grad_norm": 1.078938369819215, "learning_rate": 9.989569378386302e-07, "loss": 0.0632, "step": 227 }, { "epoch": 0.10373066424021839, "grad_norm": 0.8607431700980628, "learning_rate": 9.989476900246223e-07, "loss": 0.0481, "step": 228 }, { "epoch": 0.10418562329390355, "grad_norm": 1.5366919549219145, "learning_rate": 9.989384014388623e-07, "loss": 0.0824, "step": 229 }, { "epoch": 0.10464058234758872, "grad_norm": 0.9856687411956576, "learning_rate": 9.989290720821093e-07, "loss": 0.0602, "step": 230 }, { "epoch": 0.10509554140127389, "grad_norm": 1.15791640911629, "learning_rate": 9.989197019551259e-07, "loss": 0.0636, "step": 231 }, { "epoch": 0.10555050045495905, "grad_norm": 1.109842184462552, "learning_rate": 9.989102910586774e-07, "loss": 0.0538, "step": 232 }, { "epoch": 0.10600545950864422, "grad_norm": 1.2643069571576309, "learning_rate": 9.98900839393533e-07, "loss": 0.0667, "step": 233 }, { "epoch": 0.10646041856232939, "grad_norm": 1.884418925771264, "learning_rate": 9.988913469604649e-07, "loss": 0.0643, "step": 234 }, { "epoch": 0.10691537761601456, "grad_norm": 0.8160882482716078, "learning_rate": 9.988818137602492e-07, "loss": 0.056, "step": 235 }, { "epoch": 0.10737033666969972, "grad_norm": 0.9647512675410035, "learning_rate": 9.988722397936644e-07, "loss": 0.0644, "step": 236 }, { "epoch": 0.1078252957233849, "grad_norm": 1.4831200436529093, "learning_rate": 9.98862625061493e-07, "loss": 0.0704, "step": 237 }, { "epoch": 0.10828025477707007, "grad_norm": 1.1363581545946497, "learning_rate": 9.988529695645208e-07, "loss": 0.0594, "step": 238 }, { "epoch": 0.10873521383075523, "grad_norm": 1.0825887558294587, "learning_rate": 9.988432733035368e-07, "loss": 0.0425, "step": 239 }, { "epoch": 0.1091901728844404, "grad_norm": 1.0742908192322842, "learning_rate": 9.988335362793332e-07, "loss": 0.0612, "step": 240 }, { "epoch": 0.10964513193812557, "grad_norm": 1.4490316756613832, "learning_rate": 9.988237584927058e-07, "loss": 0.0852, "step": 241 }, { "epoch": 0.11010009099181074, "grad_norm": 1.7627374272466951, "learning_rate": 9.988139399444533e-07, "loss": 0.1052, "step": 242 }, { "epoch": 0.1105550500454959, "grad_norm": 1.2720322330503708, "learning_rate": 9.988040806353785e-07, "loss": 0.0749, "step": 243 }, { "epoch": 0.11101000909918107, "grad_norm": 1.3864180916313802, "learning_rate": 9.987941805662868e-07, "loss": 0.0875, "step": 244 }, { "epoch": 0.11146496815286625, "grad_norm": 1.7655726368367308, "learning_rate": 9.98784239737987e-07, "loss": 0.0517, "step": 245 }, { "epoch": 0.11191992720655142, "grad_norm": 1.1316272357044634, "learning_rate": 9.987742581512916e-07, "loss": 0.0507, "step": 246 }, { "epoch": 0.11237488626023658, "grad_norm": 1.002032807340167, "learning_rate": 9.987642358070166e-07, "loss": 0.0474, "step": 247 }, { "epoch": 0.11282984531392175, "grad_norm": 1.2909143562499068, "learning_rate": 9.987541727059802e-07, "loss": 0.0797, "step": 248 }, { "epoch": 0.11328480436760692, "grad_norm": 1.261509337485699, "learning_rate": 9.987440688490057e-07, "loss": 0.0581, "step": 249 }, { "epoch": 0.11373976342129208, "grad_norm": 1.4433807092243254, "learning_rate": 9.987339242369178e-07, "loss": 0.0648, "step": 250 }, { "epoch": 0.11419472247497725, "grad_norm": 1.1641001365141075, "learning_rate": 9.98723738870546e-07, "loss": 0.0536, "step": 251 }, { "epoch": 0.11464968152866242, "grad_norm": 1.463192110680986, "learning_rate": 9.987135127507225e-07, "loss": 0.0687, "step": 252 }, { "epoch": 0.1151046405823476, "grad_norm": 1.5589009929123119, "learning_rate": 9.987032458782826e-07, "loss": 0.0809, "step": 253 }, { "epoch": 0.11555959963603275, "grad_norm": 0.8981801314196295, "learning_rate": 9.986929382540661e-07, "loss": 0.0377, "step": 254 }, { "epoch": 0.11601455868971793, "grad_norm": 1.3046134929926776, "learning_rate": 9.986825898789143e-07, "loss": 0.0584, "step": 255 }, { "epoch": 0.1164695177434031, "grad_norm": 1.1968271902775738, "learning_rate": 9.986722007536736e-07, "loss": 0.0604, "step": 256 }, { "epoch": 0.11692447679708826, "grad_norm": 0.9256739589207105, "learning_rate": 9.986617708791926e-07, "loss": 0.0563, "step": 257 }, { "epoch": 0.11737943585077343, "grad_norm": 1.0562236765098105, "learning_rate": 9.986513002563234e-07, "loss": 0.0441, "step": 258 }, { "epoch": 0.1178343949044586, "grad_norm": 1.1396797838942825, "learning_rate": 9.986407888859221e-07, "loss": 0.0779, "step": 259 }, { "epoch": 0.11828935395814377, "grad_norm": 0.9511338510504711, "learning_rate": 9.986302367688472e-07, "loss": 0.0479, "step": 260 }, { "epoch": 0.11874431301182893, "grad_norm": 1.2626792340501498, "learning_rate": 9.986196439059613e-07, "loss": 0.054, "step": 261 }, { "epoch": 0.1191992720655141, "grad_norm": 1.2414582106094108, "learning_rate": 9.986090102981297e-07, "loss": 0.0637, "step": 262 }, { "epoch": 0.11965423111919928, "grad_norm": 1.3386637457925286, "learning_rate": 9.985983359462214e-07, "loss": 0.0559, "step": 263 }, { "epoch": 0.12010919017288443, "grad_norm": 1.5528857942460832, "learning_rate": 9.98587620851109e-07, "loss": 0.1111, "step": 264 }, { "epoch": 0.1205641492265696, "grad_norm": 1.0084557380928558, "learning_rate": 9.985768650136676e-07, "loss": 0.0528, "step": 265 }, { "epoch": 0.12101910828025478, "grad_norm": 0.9836579322808242, "learning_rate": 9.985660684347765e-07, "loss": 0.0723, "step": 266 }, { "epoch": 0.12147406733393995, "grad_norm": 1.651136541287689, "learning_rate": 9.985552311153176e-07, "loss": 0.0901, "step": 267 }, { "epoch": 0.12192902638762511, "grad_norm": 0.99646358827593, "learning_rate": 9.985443530561768e-07, "loss": 0.077, "step": 268 }, { "epoch": 0.12238398544131028, "grad_norm": 1.6243418259246538, "learning_rate": 9.98533434258243e-07, "loss": 0.0919, "step": 269 }, { "epoch": 0.12283894449499545, "grad_norm": 1.0467582706654044, "learning_rate": 9.985224747224082e-07, "loss": 0.0484, "step": 270 }, { "epoch": 0.12329390354868063, "grad_norm": 1.736810556376589, "learning_rate": 9.98511474449568e-07, "loss": 0.0753, "step": 271 }, { "epoch": 0.12374886260236578, "grad_norm": 1.0004729483670864, "learning_rate": 9.985004334406213e-07, "loss": 0.0596, "step": 272 }, { "epoch": 0.12420382165605096, "grad_norm": 1.2151561323315139, "learning_rate": 9.984893516964706e-07, "loss": 0.0582, "step": 273 }, { "epoch": 0.12465878070973613, "grad_norm": 1.828699579113966, "learning_rate": 9.98478229218021e-07, "loss": 0.1102, "step": 274 }, { "epoch": 0.1251137397634213, "grad_norm": 0.9077132204761329, "learning_rate": 9.984670660061819e-07, "loss": 0.0481, "step": 275 }, { "epoch": 0.12556869881710647, "grad_norm": 0.9423342802512885, "learning_rate": 9.98455862061865e-07, "loss": 0.0767, "step": 276 }, { "epoch": 0.12602365787079162, "grad_norm": 1.6287766199301192, "learning_rate": 9.984446173859861e-07, "loss": 0.1155, "step": 277 }, { "epoch": 0.1264786169244768, "grad_norm": 1.0561750039623734, "learning_rate": 9.98433331979464e-07, "loss": 0.0595, "step": 278 }, { "epoch": 0.12693357597816196, "grad_norm": 0.972781180486093, "learning_rate": 9.98422005843221e-07, "loss": 0.0484, "step": 279 }, { "epoch": 0.12738853503184713, "grad_norm": 1.1854391526869976, "learning_rate": 9.984106389781825e-07, "loss": 0.0482, "step": 280 }, { "epoch": 0.1278434940855323, "grad_norm": 1.3197396977134914, "learning_rate": 9.983992313852773e-07, "loss": 0.0623, "step": 281 }, { "epoch": 0.12829845313921748, "grad_norm": 1.6362190624757336, "learning_rate": 9.983877830654378e-07, "loss": 0.0715, "step": 282 }, { "epoch": 0.12875341219290265, "grad_norm": 0.9077240765741056, "learning_rate": 9.983762940195995e-07, "loss": 0.0631, "step": 283 }, { "epoch": 0.1292083712465878, "grad_norm": 1.1421219589647587, "learning_rate": 9.983647642487009e-07, "loss": 0.0583, "step": 284 }, { "epoch": 0.12966333030027297, "grad_norm": 1.8376550060739605, "learning_rate": 9.983531937536844e-07, "loss": 0.0801, "step": 285 }, { "epoch": 0.13011828935395814, "grad_norm": 1.2064252363743322, "learning_rate": 9.983415825354954e-07, "loss": 0.0479, "step": 286 }, { "epoch": 0.1305732484076433, "grad_norm": 1.3522899893414668, "learning_rate": 9.983299305950828e-07, "loss": 0.0533, "step": 287 }, { "epoch": 0.13102820746132848, "grad_norm": 0.8486243193478907, "learning_rate": 9.983182379333988e-07, "loss": 0.0466, "step": 288 }, { "epoch": 0.13148316651501366, "grad_norm": 1.1699982302380614, "learning_rate": 9.983065045513985e-07, "loss": 0.0623, "step": 289 }, { "epoch": 0.13193812556869883, "grad_norm": 0.8620933594779188, "learning_rate": 9.982947304500413e-07, "loss": 0.0448, "step": 290 }, { "epoch": 0.13239308462238397, "grad_norm": 0.9334314458594886, "learning_rate": 9.982829156302889e-07, "loss": 0.0525, "step": 291 }, { "epoch": 0.13284804367606914, "grad_norm": 2.1870474292658284, "learning_rate": 9.982710600931068e-07, "loss": 0.0999, "step": 292 }, { "epoch": 0.13330300272975432, "grad_norm": 1.206782826393217, "learning_rate": 9.982591638394639e-07, "loss": 0.0581, "step": 293 }, { "epoch": 0.1337579617834395, "grad_norm": 0.9252727575790715, "learning_rate": 9.98247226870332e-07, "loss": 0.0421, "step": 294 }, { "epoch": 0.13421292083712466, "grad_norm": 1.1920898160478353, "learning_rate": 9.982352491866872e-07, "loss": 0.0568, "step": 295 }, { "epoch": 0.13466787989080983, "grad_norm": 0.9875857130572099, "learning_rate": 9.982232307895076e-07, "loss": 0.0472, "step": 296 }, { "epoch": 0.135122838944495, "grad_norm": 2.2221045162643143, "learning_rate": 9.982111716797757e-07, "loss": 0.0822, "step": 297 }, { "epoch": 0.13557779799818018, "grad_norm": 1.4938459925674554, "learning_rate": 9.981990718584767e-07, "loss": 0.0852, "step": 298 }, { "epoch": 0.13603275705186532, "grad_norm": 1.1066809951614336, "learning_rate": 9.981869313265993e-07, "loss": 0.0703, "step": 299 }, { "epoch": 0.1364877161055505, "grad_norm": 1.6923355030264176, "learning_rate": 9.981747500851356e-07, "loss": 0.0553, "step": 300 }, { "epoch": 0.13694267515923567, "grad_norm": 1.171335220808776, "learning_rate": 9.981625281350813e-07, "loss": 0.0586, "step": 301 }, { "epoch": 0.13739763421292084, "grad_norm": 1.070896005930655, "learning_rate": 9.981502654774347e-07, "loss": 0.0444, "step": 302 }, { "epoch": 0.137852593266606, "grad_norm": 1.1728237717715497, "learning_rate": 9.98137962113198e-07, "loss": 0.0618, "step": 303 }, { "epoch": 0.13830755232029118, "grad_norm": 1.1905967346525261, "learning_rate": 9.981256180433768e-07, "loss": 0.0675, "step": 304 }, { "epoch": 0.13876251137397635, "grad_norm": 0.8684976727617932, "learning_rate": 9.981132332689794e-07, "loss": 0.0466, "step": 305 }, { "epoch": 0.1392174704276615, "grad_norm": 1.036935632301667, "learning_rate": 9.981008077910183e-07, "loss": 0.0579, "step": 306 }, { "epoch": 0.13967242948134667, "grad_norm": 1.5316413035476537, "learning_rate": 9.980883416105083e-07, "loss": 0.0588, "step": 307 }, { "epoch": 0.14012738853503184, "grad_norm": 0.831351740285057, "learning_rate": 9.980758347284685e-07, "loss": 0.0477, "step": 308 }, { "epoch": 0.14058234758871702, "grad_norm": 1.1114213029102946, "learning_rate": 9.980632871459208e-07, "loss": 0.056, "step": 309 }, { "epoch": 0.1410373066424022, "grad_norm": 0.9174179487762331, "learning_rate": 9.980506988638905e-07, "loss": 0.0689, "step": 310 }, { "epoch": 0.14149226569608736, "grad_norm": 0.8932952501778931, "learning_rate": 9.980380698834064e-07, "loss": 0.0594, "step": 311 }, { "epoch": 0.14194722474977253, "grad_norm": 1.2719258167198786, "learning_rate": 9.980254002055e-07, "loss": 0.0588, "step": 312 }, { "epoch": 0.14240218380345768, "grad_norm": 0.9786921183743694, "learning_rate": 9.980126898312072e-07, "loss": 0.0457, "step": 313 }, { "epoch": 0.14285714285714285, "grad_norm": 1.2371622871220518, "learning_rate": 9.979999387615663e-07, "loss": 0.0671, "step": 314 }, { "epoch": 0.14331210191082802, "grad_norm": 1.470791435721143, "learning_rate": 9.979871469976195e-07, "loss": 0.0698, "step": 315 }, { "epoch": 0.1437670609645132, "grad_norm": 1.7240976919181443, "learning_rate": 9.97974314540412e-07, "loss": 0.0601, "step": 316 }, { "epoch": 0.14422202001819837, "grad_norm": 1.977436972779476, "learning_rate": 9.979614413909921e-07, "loss": 0.0926, "step": 317 }, { "epoch": 0.14467697907188354, "grad_norm": 0.6968374388220965, "learning_rate": 9.979485275504121e-07, "loss": 0.0407, "step": 318 }, { "epoch": 0.1451319381255687, "grad_norm": 1.973909641193579, "learning_rate": 9.97935573019727e-07, "loss": 0.0893, "step": 319 }, { "epoch": 0.14558689717925385, "grad_norm": 0.9393128304458204, "learning_rate": 9.979225777999956e-07, "loss": 0.037, "step": 320 }, { "epoch": 0.14604185623293903, "grad_norm": 1.3609794979063135, "learning_rate": 9.979095418922797e-07, "loss": 0.0582, "step": 321 }, { "epoch": 0.1464968152866242, "grad_norm": 0.8197293804224054, "learning_rate": 9.978964652976446e-07, "loss": 0.0426, "step": 322 }, { "epoch": 0.14695177434030937, "grad_norm": 1.158997352707507, "learning_rate": 9.978833480171591e-07, "loss": 0.0645, "step": 323 }, { "epoch": 0.14740673339399454, "grad_norm": 1.3440812732849405, "learning_rate": 9.978701900518946e-07, "loss": 0.0516, "step": 324 }, { "epoch": 0.14786169244767972, "grad_norm": 1.237216388178485, "learning_rate": 9.978569914029265e-07, "loss": 0.0744, "step": 325 }, { "epoch": 0.1483166515013649, "grad_norm": 1.2020225514578025, "learning_rate": 9.978437520713334e-07, "loss": 0.0648, "step": 326 }, { "epoch": 0.14877161055505003, "grad_norm": 1.707463401339971, "learning_rate": 9.97830472058197e-07, "loss": 0.0659, "step": 327 }, { "epoch": 0.1492265696087352, "grad_norm": 1.5089520418583864, "learning_rate": 9.97817151364603e-07, "loss": 0.0873, "step": 328 }, { "epoch": 0.14968152866242038, "grad_norm": 1.1521285274825894, "learning_rate": 9.978037899916391e-07, "loss": 0.0675, "step": 329 }, { "epoch": 0.15013648771610555, "grad_norm": 1.2580161193848187, "learning_rate": 9.977903879403978e-07, "loss": 0.041, "step": 330 }, { "epoch": 0.15059144676979072, "grad_norm": 1.6179915778926504, "learning_rate": 9.97776945211974e-07, "loss": 0.0642, "step": 331 }, { "epoch": 0.1510464058234759, "grad_norm": 2.0103740445789726, "learning_rate": 9.97763461807466e-07, "loss": 0.0653, "step": 332 }, { "epoch": 0.15150136487716107, "grad_norm": 1.7810437636409457, "learning_rate": 9.977499377279759e-07, "loss": 0.0782, "step": 333 }, { "epoch": 0.15195632393084624, "grad_norm": 1.2715761230366713, "learning_rate": 9.977363729746086e-07, "loss": 0.0638, "step": 334 }, { "epoch": 0.15241128298453138, "grad_norm": 1.0301107381047787, "learning_rate": 9.977227675484728e-07, "loss": 0.0489, "step": 335 }, { "epoch": 0.15286624203821655, "grad_norm": 1.559853805666478, "learning_rate": 9.977091214506801e-07, "loss": 0.0685, "step": 336 }, { "epoch": 0.15332120109190173, "grad_norm": 1.9357378638541771, "learning_rate": 9.976954346823455e-07, "loss": 0.0706, "step": 337 }, { "epoch": 0.1537761601455869, "grad_norm": 0.9386653307796838, "learning_rate": 9.976817072445876e-07, "loss": 0.0384, "step": 338 }, { "epoch": 0.15423111919927207, "grad_norm": 1.1849304842409263, "learning_rate": 9.976679391385283e-07, "loss": 0.0544, "step": 339 }, { "epoch": 0.15468607825295724, "grad_norm": 0.9794514859133302, "learning_rate": 9.976541303652921e-07, "loss": 0.0395, "step": 340 }, { "epoch": 0.15514103730664242, "grad_norm": 1.522980464711304, "learning_rate": 9.97640280926008e-07, "loss": 0.0553, "step": 341 }, { "epoch": 0.15559599636032756, "grad_norm": 1.114897917471998, "learning_rate": 9.976263908218075e-07, "loss": 0.0562, "step": 342 }, { "epoch": 0.15605095541401273, "grad_norm": 1.4106813797280966, "learning_rate": 9.976124600538254e-07, "loss": 0.0842, "step": 343 }, { "epoch": 0.1565059144676979, "grad_norm": 0.9765199742846444, "learning_rate": 9.975984886232005e-07, "loss": 0.0604, "step": 344 }, { "epoch": 0.15696087352138308, "grad_norm": 1.7134665109938934, "learning_rate": 9.975844765310742e-07, "loss": 0.065, "step": 345 }, { "epoch": 0.15741583257506825, "grad_norm": 1.6486501390150328, "learning_rate": 9.975704237785914e-07, "loss": 0.0981, "step": 346 }, { "epoch": 0.15787079162875342, "grad_norm": 1.3598544586513228, "learning_rate": 9.975563303669005e-07, "loss": 0.074, "step": 347 }, { "epoch": 0.1583257506824386, "grad_norm": 1.3859738189047066, "learning_rate": 9.975421962971535e-07, "loss": 0.0771, "step": 348 }, { "epoch": 0.15878070973612374, "grad_norm": 1.3273068228762428, "learning_rate": 9.97528021570505e-07, "loss": 0.0891, "step": 349 }, { "epoch": 0.1592356687898089, "grad_norm": 1.008603335566121, "learning_rate": 9.975138061881134e-07, "loss": 0.0501, "step": 350 }, { "epoch": 0.15969062784349408, "grad_norm": 1.2683219501430032, "learning_rate": 9.974995501511404e-07, "loss": 0.0436, "step": 351 }, { "epoch": 0.16014558689717925, "grad_norm": 1.7861764054006601, "learning_rate": 9.974852534607505e-07, "loss": 0.0792, "step": 352 }, { "epoch": 0.16060054595086443, "grad_norm": 1.1324807753830382, "learning_rate": 9.974709161181125e-07, "loss": 0.067, "step": 353 }, { "epoch": 0.1610555050045496, "grad_norm": 1.4672040252585383, "learning_rate": 9.97456538124398e-07, "loss": 0.0708, "step": 354 }, { "epoch": 0.16151046405823477, "grad_norm": 0.9934827912617749, "learning_rate": 9.974421194807814e-07, "loss": 0.0564, "step": 355 }, { "epoch": 0.16196542311191992, "grad_norm": 1.414051763184749, "learning_rate": 9.974276601884415e-07, "loss": 0.0583, "step": 356 }, { "epoch": 0.1624203821656051, "grad_norm": 1.0382802063632235, "learning_rate": 9.974131602485593e-07, "loss": 0.0487, "step": 357 }, { "epoch": 0.16287534121929026, "grad_norm": 1.2586454862238812, "learning_rate": 9.9739861966232e-07, "loss": 0.0673, "step": 358 }, { "epoch": 0.16333030027297543, "grad_norm": 1.6838254742760412, "learning_rate": 9.97384038430912e-07, "loss": 0.0581, "step": 359 }, { "epoch": 0.1637852593266606, "grad_norm": 0.8399174068109083, "learning_rate": 9.973694165555263e-07, "loss": 0.0466, "step": 360 }, { "epoch": 0.16424021838034578, "grad_norm": 1.69300928648134, "learning_rate": 9.973547540373581e-07, "loss": 0.0665, "step": 361 }, { "epoch": 0.16469517743403095, "grad_norm": 1.3262644115821105, "learning_rate": 9.973400508776053e-07, "loss": 0.0782, "step": 362 }, { "epoch": 0.1651501364877161, "grad_norm": 0.8615488987713292, "learning_rate": 9.973253070774696e-07, "loss": 0.0457, "step": 363 }, { "epoch": 0.16560509554140126, "grad_norm": 1.4429396631470555, "learning_rate": 9.973105226381557e-07, "loss": 0.0573, "step": 364 }, { "epoch": 0.16606005459508644, "grad_norm": 1.2208505509003302, "learning_rate": 9.972956975608717e-07, "loss": 0.0775, "step": 365 }, { "epoch": 0.1665150136487716, "grad_norm": 1.5660572502770571, "learning_rate": 9.97280831846829e-07, "loss": 0.0716, "step": 366 }, { "epoch": 0.16696997270245678, "grad_norm": 1.1700776292958537, "learning_rate": 9.972659254972426e-07, "loss": 0.0452, "step": 367 }, { "epoch": 0.16742493175614195, "grad_norm": 1.5529551923189542, "learning_rate": 9.972509785133304e-07, "loss": 0.0757, "step": 368 }, { "epoch": 0.16787989080982713, "grad_norm": 1.0967911703334838, "learning_rate": 9.972359908963134e-07, "loss": 0.0479, "step": 369 }, { "epoch": 0.16833484986351227, "grad_norm": 1.121195555833821, "learning_rate": 9.972209626474171e-07, "loss": 0.059, "step": 370 }, { "epoch": 0.16878980891719744, "grad_norm": 1.0602851512538802, "learning_rate": 9.97205893767869e-07, "loss": 0.0472, "step": 371 }, { "epoch": 0.16924476797088261, "grad_norm": 1.6258552400713955, "learning_rate": 9.97190784258901e-07, "loss": 0.0836, "step": 372 }, { "epoch": 0.1696997270245678, "grad_norm": 1.1061834571332927, "learning_rate": 9.97175634121747e-07, "loss": 0.0409, "step": 373 }, { "epoch": 0.17015468607825296, "grad_norm": 1.2773941061251708, "learning_rate": 9.971604433576454e-07, "loss": 0.0481, "step": 374 }, { "epoch": 0.17060964513193813, "grad_norm": 1.2094234552253622, "learning_rate": 9.971452119678378e-07, "loss": 0.0617, "step": 375 }, { "epoch": 0.1710646041856233, "grad_norm": 1.3606176451795315, "learning_rate": 9.971299399535683e-07, "loss": 0.04, "step": 376 }, { "epoch": 0.17151956323930848, "grad_norm": 0.961589378037891, "learning_rate": 9.971146273160853e-07, "loss": 0.0518, "step": 377 }, { "epoch": 0.17197452229299362, "grad_norm": 0.9551141213830807, "learning_rate": 9.970992740566397e-07, "loss": 0.0435, "step": 378 }, { "epoch": 0.1724294813466788, "grad_norm": 1.1061327916265118, "learning_rate": 9.970838801764864e-07, "loss": 0.0684, "step": 379 }, { "epoch": 0.17288444040036396, "grad_norm": 1.7551193923627544, "learning_rate": 9.970684456768834e-07, "loss": 0.0923, "step": 380 }, { "epoch": 0.17333939945404914, "grad_norm": 61.55739208115707, "learning_rate": 9.970529705590917e-07, "loss": 0.7199, "step": 381 }, { "epoch": 0.1737943585077343, "grad_norm": 1.062292372101319, "learning_rate": 9.97037454824376e-07, "loss": 0.0599, "step": 382 }, { "epoch": 0.17424931756141948, "grad_norm": 2.182495657030285, "learning_rate": 9.970218984740038e-07, "loss": 0.0867, "step": 383 }, { "epoch": 0.17470427661510465, "grad_norm": 1.2018828615521753, "learning_rate": 9.970063015092466e-07, "loss": 0.0575, "step": 384 }, { "epoch": 0.1751592356687898, "grad_norm": 2.090124221916724, "learning_rate": 9.96990663931379e-07, "loss": 0.0891, "step": 385 }, { "epoch": 0.17561419472247497, "grad_norm": 1.5328987798385059, "learning_rate": 9.969749857416787e-07, "loss": 0.0568, "step": 386 }, { "epoch": 0.17606915377616014, "grad_norm": 1.6062714136358844, "learning_rate": 9.969592669414271e-07, "loss": 0.0754, "step": 387 }, { "epoch": 0.17652411282984531, "grad_norm": 61.447015835286706, "learning_rate": 9.969435075319082e-07, "loss": 0.3728, "step": 388 }, { "epoch": 0.1769790718835305, "grad_norm": 1.196371461247331, "learning_rate": 9.969277075144103e-07, "loss": 0.0339, "step": 389 }, { "epoch": 0.17743403093721566, "grad_norm": 1.0547191705603283, "learning_rate": 9.96911866890224e-07, "loss": 0.052, "step": 390 }, { "epoch": 0.17788898999090083, "grad_norm": 1.636068604244968, "learning_rate": 9.96895985660644e-07, "loss": 0.0476, "step": 391 }, { "epoch": 0.17834394904458598, "grad_norm": 1.1179771422167613, "learning_rate": 9.968800638269681e-07, "loss": 0.0485, "step": 392 }, { "epoch": 0.17879890809827115, "grad_norm": 1.8062730281108257, "learning_rate": 9.968641013904973e-07, "loss": 0.101, "step": 393 }, { "epoch": 0.17925386715195632, "grad_norm": 1.3090872450284912, "learning_rate": 9.968480983525359e-07, "loss": 0.0726, "step": 394 }, { "epoch": 0.1797088262056415, "grad_norm": 1.0383883885799372, "learning_rate": 9.968320547143916e-07, "loss": 0.0506, "step": 395 }, { "epoch": 0.18016378525932666, "grad_norm": 1.2807828034028284, "learning_rate": 9.968159704773755e-07, "loss": 0.0743, "step": 396 }, { "epoch": 0.18061874431301184, "grad_norm": 0.9611825955304264, "learning_rate": 9.96799845642802e-07, "loss": 0.0482, "step": 397 }, { "epoch": 0.181073703366697, "grad_norm": 1.0411952429933702, "learning_rate": 9.967836802119884e-07, "loss": 0.0498, "step": 398 }, { "epoch": 0.18152866242038215, "grad_norm": 1.222101450416741, "learning_rate": 9.96767474186256e-07, "loss": 0.0789, "step": 399 }, { "epoch": 0.18198362147406733, "grad_norm": 1.4322600827316334, "learning_rate": 9.967512275669292e-07, "loss": 0.106, "step": 400 }, { "epoch": 0.1824385805277525, "grad_norm": 1.2829081018513966, "learning_rate": 9.967349403553352e-07, "loss": 0.063, "step": 401 }, { "epoch": 0.18289353958143767, "grad_norm": 1.200933243638394, "learning_rate": 9.967186125528051e-07, "loss": 0.0743, "step": 402 }, { "epoch": 0.18334849863512284, "grad_norm": 1.8295483448302041, "learning_rate": 9.967022441606733e-07, "loss": 0.0874, "step": 403 }, { "epoch": 0.18380345768880801, "grad_norm": 1.0306589085176303, "learning_rate": 9.96685835180277e-07, "loss": 0.0577, "step": 404 }, { "epoch": 0.1842584167424932, "grad_norm": 0.9874463007020853, "learning_rate": 9.966693856129574e-07, "loss": 0.047, "step": 405 }, { "epoch": 0.18471337579617833, "grad_norm": 1.1366418912620115, "learning_rate": 9.966528954600586e-07, "loss": 0.0622, "step": 406 }, { "epoch": 0.1851683348498635, "grad_norm": 0.8680772630500574, "learning_rate": 9.96636364722928e-07, "loss": 0.0548, "step": 407 }, { "epoch": 0.18562329390354868, "grad_norm": 1.3804602960080863, "learning_rate": 9.966197934029165e-07, "loss": 0.0683, "step": 408 }, { "epoch": 0.18607825295723385, "grad_norm": 1.59662534548154, "learning_rate": 9.96603181501378e-07, "loss": 0.0675, "step": 409 }, { "epoch": 0.18653321201091902, "grad_norm": 1.3299660068409016, "learning_rate": 9.965865290196703e-07, "loss": 0.0663, "step": 410 }, { "epoch": 0.1869881710646042, "grad_norm": 2.671540218875107, "learning_rate": 9.96569835959154e-07, "loss": 0.0553, "step": 411 }, { "epoch": 0.18744313011828936, "grad_norm": 1.0571773866876972, "learning_rate": 9.96553102321193e-07, "loss": 0.0578, "step": 412 }, { "epoch": 0.18789808917197454, "grad_norm": 1.0398857538940627, "learning_rate": 9.965363281071551e-07, "loss": 0.0668, "step": 413 }, { "epoch": 0.18835304822565968, "grad_norm": 1.4312019555605475, "learning_rate": 9.965195133184108e-07, "loss": 0.0369, "step": 414 }, { "epoch": 0.18880800727934485, "grad_norm": 1.5897636214571498, "learning_rate": 9.96502657956334e-07, "loss": 0.0818, "step": 415 }, { "epoch": 0.18926296633303002, "grad_norm": 1.1532024813728672, "learning_rate": 9.964857620223023e-07, "loss": 0.0596, "step": 416 }, { "epoch": 0.1897179253867152, "grad_norm": 0.9518278091196013, "learning_rate": 9.964688255176962e-07, "loss": 0.05, "step": 417 }, { "epoch": 0.19017288444040037, "grad_norm": 1.5471894337999064, "learning_rate": 9.964518484438998e-07, "loss": 0.0563, "step": 418 }, { "epoch": 0.19062784349408554, "grad_norm": 1.3042951137857424, "learning_rate": 9.964348308023001e-07, "loss": 0.0458, "step": 419 }, { "epoch": 0.1910828025477707, "grad_norm": 1.089779462541599, "learning_rate": 9.96417772594288e-07, "loss": 0.0614, "step": 420 }, { "epoch": 0.19153776160145586, "grad_norm": 1.4054118320593119, "learning_rate": 9.964006738212574e-07, "loss": 0.0679, "step": 421 }, { "epoch": 0.19199272065514103, "grad_norm": 1.2982954130784632, "learning_rate": 9.963835344846054e-07, "loss": 0.0381, "step": 422 }, { "epoch": 0.1924476797088262, "grad_norm": 2.01173969090651, "learning_rate": 9.963663545857326e-07, "loss": 0.0528, "step": 423 }, { "epoch": 0.19290263876251137, "grad_norm": 1.9618141972515633, "learning_rate": 9.96349134126043e-07, "loss": 0.0838, "step": 424 }, { "epoch": 0.19335759781619655, "grad_norm": 1.0042217144229835, "learning_rate": 9.963318731069436e-07, "loss": 0.0733, "step": 425 }, { "epoch": 0.19381255686988172, "grad_norm": 2.4100555778342794, "learning_rate": 9.963145715298449e-07, "loss": 0.0739, "step": 426 }, { "epoch": 0.1942675159235669, "grad_norm": 1.500503281681586, "learning_rate": 9.962972293961606e-07, "loss": 0.0446, "step": 427 }, { "epoch": 0.19472247497725204, "grad_norm": 3.5903262452802367, "learning_rate": 9.962798467073081e-07, "loss": 0.0646, "step": 428 }, { "epoch": 0.1951774340309372, "grad_norm": 1.1886352602369927, "learning_rate": 9.96262423464708e-07, "loss": 0.0678, "step": 429 }, { "epoch": 0.19563239308462238, "grad_norm": 1.0944066722324761, "learning_rate": 9.962449596697833e-07, "loss": 0.0438, "step": 430 }, { "epoch": 0.19608735213830755, "grad_norm": 1.6641191926729932, "learning_rate": 9.962274553239618e-07, "loss": 0.091, "step": 431 }, { "epoch": 0.19654231119199272, "grad_norm": 0.8676639680944157, "learning_rate": 9.962099104286735e-07, "loss": 0.0514, "step": 432 }, { "epoch": 0.1969972702456779, "grad_norm": 1.517836992908138, "learning_rate": 9.961923249853521e-07, "loss": 0.085, "step": 433 }, { "epoch": 0.19745222929936307, "grad_norm": 1.7718658971906733, "learning_rate": 9.961746989954348e-07, "loss": 0.0776, "step": 434 }, { "epoch": 0.1979071883530482, "grad_norm": 1.0729427219458574, "learning_rate": 9.961570324603619e-07, "loss": 0.0421, "step": 435 }, { "epoch": 0.19836214740673339, "grad_norm": 1.2745743932943103, "learning_rate": 9.961393253815766e-07, "loss": 0.0655, "step": 436 }, { "epoch": 0.19881710646041856, "grad_norm": 1.7408841931848638, "learning_rate": 9.961215777605264e-07, "loss": 0.068, "step": 437 }, { "epoch": 0.19927206551410373, "grad_norm": 1.0885959204768643, "learning_rate": 9.961037895986615e-07, "loss": 0.0638, "step": 438 }, { "epoch": 0.1997270245677889, "grad_norm": 1.3587225368396716, "learning_rate": 9.96085960897435e-07, "loss": 0.0586, "step": 439 }, { "epoch": 0.20018198362147407, "grad_norm": 1.6064555114395305, "learning_rate": 9.960680916583041e-07, "loss": 0.0453, "step": 440 }, { "epoch": 0.20063694267515925, "grad_norm": 1.0240837682499038, "learning_rate": 9.960501818827291e-07, "loss": 0.0631, "step": 441 }, { "epoch": 0.2010919017288444, "grad_norm": 1.344540696401774, "learning_rate": 9.960322315721735e-07, "loss": 0.0493, "step": 442 }, { "epoch": 0.20154686078252956, "grad_norm": 1.7156213755343936, "learning_rate": 9.960142407281039e-07, "loss": 0.0669, "step": 443 }, { "epoch": 0.20200181983621474, "grad_norm": 0.6644642115399774, "learning_rate": 9.959962093519902e-07, "loss": 0.0361, "step": 444 }, { "epoch": 0.2024567788898999, "grad_norm": 1.049223043207897, "learning_rate": 9.959781374453065e-07, "loss": 0.0568, "step": 445 }, { "epoch": 0.20291173794358508, "grad_norm": 1.3083536330510794, "learning_rate": 9.959600250095293e-07, "loss": 0.0511, "step": 446 }, { "epoch": 0.20336669699727025, "grad_norm": 1.6091097914590038, "learning_rate": 9.959418720461382e-07, "loss": 0.0648, "step": 447 }, { "epoch": 0.20382165605095542, "grad_norm": 1.1847392762582725, "learning_rate": 9.959236785566173e-07, "loss": 0.0405, "step": 448 }, { "epoch": 0.20427661510464057, "grad_norm": 0.8757076055677958, "learning_rate": 9.959054445424532e-07, "loss": 0.0524, "step": 449 }, { "epoch": 0.20473157415832574, "grad_norm": 0.9429373676892292, "learning_rate": 9.95887170005135e-07, "loss": 0.0441, "step": 450 }, { "epoch": 0.2051865332120109, "grad_norm": 1.0894783560908516, "learning_rate": 9.958688549461571e-07, "loss": 0.0555, "step": 451 }, { "epoch": 0.20564149226569609, "grad_norm": 1.3373909572267781, "learning_rate": 9.958504993670157e-07, "loss": 0.0732, "step": 452 }, { "epoch": 0.20609645131938126, "grad_norm": 1.4816850476226722, "learning_rate": 9.958321032692107e-07, "loss": 0.1016, "step": 453 }, { "epoch": 0.20655141037306643, "grad_norm": 53.81813108365593, "learning_rate": 9.958136666542454e-07, "loss": 0.898, "step": 454 }, { "epoch": 0.2070063694267516, "grad_norm": 1.3654333042202123, "learning_rate": 9.957951895236261e-07, "loss": 0.0608, "step": 455 }, { "epoch": 0.20746132848043677, "grad_norm": 1.0155389279636473, "learning_rate": 9.957766718788632e-07, "loss": 0.0787, "step": 456 }, { "epoch": 0.20791628753412192, "grad_norm": 1.2290510013718745, "learning_rate": 9.957581137214693e-07, "loss": 0.0553, "step": 457 }, { "epoch": 0.2083712465878071, "grad_norm": 1.2111687813458196, "learning_rate": 9.957395150529613e-07, "loss": 0.0796, "step": 458 }, { "epoch": 0.20882620564149226, "grad_norm": 0.8268484664908767, "learning_rate": 9.95720875874859e-07, "loss": 0.0467, "step": 459 }, { "epoch": 0.20928116469517744, "grad_norm": 1.0724121457636888, "learning_rate": 9.957021961886852e-07, "loss": 0.0472, "step": 460 }, { "epoch": 0.2097361237488626, "grad_norm": 1.36288005491486, "learning_rate": 9.956834759959667e-07, "loss": 0.069, "step": 461 }, { "epoch": 0.21019108280254778, "grad_norm": 1.0671698426170464, "learning_rate": 9.956647152982327e-07, "loss": 0.0483, "step": 462 }, { "epoch": 0.21064604185623295, "grad_norm": 1.0082396911079197, "learning_rate": 9.95645914097017e-07, "loss": 0.0693, "step": 463 }, { "epoch": 0.2111010009099181, "grad_norm": 0.9047703234477522, "learning_rate": 9.95627072393855e-07, "loss": 0.0527, "step": 464 }, { "epoch": 0.21155595996360327, "grad_norm": 1.0304355765633446, "learning_rate": 9.956081901902873e-07, "loss": 0.0561, "step": 465 }, { "epoch": 0.21201091901728844, "grad_norm": 2.1359343716316777, "learning_rate": 9.955892674878562e-07, "loss": 0.1373, "step": 466 }, { "epoch": 0.2124658780709736, "grad_norm": 1.095854007962646, "learning_rate": 9.955703042881084e-07, "loss": 0.0708, "step": 467 }, { "epoch": 0.21292083712465878, "grad_norm": 1.1290628737529602, "learning_rate": 9.955513005925934e-07, "loss": 0.065, "step": 468 }, { "epoch": 0.21337579617834396, "grad_norm": 0.8400043936670604, "learning_rate": 9.955322564028639e-07, "loss": 0.0363, "step": 469 }, { "epoch": 0.21383075523202913, "grad_norm": 0.7393598237168953, "learning_rate": 9.955131717204761e-07, "loss": 0.0485, "step": 470 }, { "epoch": 0.21428571428571427, "grad_norm": 1.0910463695302328, "learning_rate": 9.954940465469896e-07, "loss": 0.0545, "step": 471 }, { "epoch": 0.21474067333939945, "grad_norm": 1.0009365863779738, "learning_rate": 9.954748808839674e-07, "loss": 0.0543, "step": 472 }, { "epoch": 0.21519563239308462, "grad_norm": 1.126683118313642, "learning_rate": 9.954556747329753e-07, "loss": 0.054, "step": 473 }, { "epoch": 0.2156505914467698, "grad_norm": 2.642066847598599, "learning_rate": 9.95436428095583e-07, "loss": 0.1234, "step": 474 }, { "epoch": 0.21610555050045496, "grad_norm": 0.9976397272874167, "learning_rate": 9.954171409733632e-07, "loss": 0.0385, "step": 475 }, { "epoch": 0.21656050955414013, "grad_norm": 0.823622963218879, "learning_rate": 9.95397813367892e-07, "loss": 0.0504, "step": 476 }, { "epoch": 0.2170154686078253, "grad_norm": 1.3652338974635019, "learning_rate": 9.953784452807485e-07, "loss": 0.0736, "step": 477 }, { "epoch": 0.21747042766151045, "grad_norm": 0.7847256244107973, "learning_rate": 9.953590367135157e-07, "loss": 0.041, "step": 478 }, { "epoch": 0.21792538671519562, "grad_norm": 1.129093580464179, "learning_rate": 9.953395876677795e-07, "loss": 0.041, "step": 479 }, { "epoch": 0.2183803457688808, "grad_norm": 0.7402537923820297, "learning_rate": 9.95320098145129e-07, "loss": 0.0369, "step": 480 }, { "epoch": 0.21883530482256597, "grad_norm": 1.268108716381297, "learning_rate": 9.95300568147157e-07, "loss": 0.0988, "step": 481 }, { "epoch": 0.21929026387625114, "grad_norm": 0.9397260025959921, "learning_rate": 9.952809976754592e-07, "loss": 0.0368, "step": 482 }, { "epoch": 0.2197452229299363, "grad_norm": 1.3811632120582926, "learning_rate": 9.95261386731635e-07, "loss": 0.0748, "step": 483 }, { "epoch": 0.22020018198362148, "grad_norm": 1.1016438359971403, "learning_rate": 9.95241735317287e-07, "loss": 0.0802, "step": 484 }, { "epoch": 0.22065514103730663, "grad_norm": 1.3205496202774076, "learning_rate": 9.952220434340208e-07, "loss": 0.0517, "step": 485 }, { "epoch": 0.2211101000909918, "grad_norm": 1.0603265513262499, "learning_rate": 9.952023110834455e-07, "loss": 0.0508, "step": 486 }, { "epoch": 0.22156505914467697, "grad_norm": 0.8451250385457886, "learning_rate": 9.951825382671737e-07, "loss": 0.0527, "step": 487 }, { "epoch": 0.22202001819836215, "grad_norm": 1.0171819004806768, "learning_rate": 9.951627249868212e-07, "loss": 0.0399, "step": 488 }, { "epoch": 0.22247497725204732, "grad_norm": 1.0453017379544192, "learning_rate": 9.951428712440069e-07, "loss": 0.0468, "step": 489 }, { "epoch": 0.2229299363057325, "grad_norm": 0.8995715474035276, "learning_rate": 9.95122977040353e-07, "loss": 0.0283, "step": 490 }, { "epoch": 0.22338489535941766, "grad_norm": 0.9039851559662124, "learning_rate": 9.951030423774858e-07, "loss": 0.0556, "step": 491 }, { "epoch": 0.22383985441310283, "grad_norm": 1.6175216247717694, "learning_rate": 9.950830672570335e-07, "loss": 0.0591, "step": 492 }, { "epoch": 0.22429481346678798, "grad_norm": 1.7511997475900873, "learning_rate": 9.950630516806288e-07, "loss": 0.1114, "step": 493 }, { "epoch": 0.22474977252047315, "grad_norm": 1.1373427610842786, "learning_rate": 9.950429956499072e-07, "loss": 0.0518, "step": 494 }, { "epoch": 0.22520473157415832, "grad_norm": 0.8358050895339546, "learning_rate": 9.950228991665078e-07, "loss": 0.0531, "step": 495 }, { "epoch": 0.2256596906278435, "grad_norm": 1.3531918761874655, "learning_rate": 9.950027622320723e-07, "loss": 0.0573, "step": 496 }, { "epoch": 0.22611464968152867, "grad_norm": 1.2768329156974692, "learning_rate": 9.949825848482465e-07, "loss": 0.0597, "step": 497 }, { "epoch": 0.22656960873521384, "grad_norm": 1.0159602004460029, "learning_rate": 9.949623670166793e-07, "loss": 0.0584, "step": 498 }, { "epoch": 0.227024567788899, "grad_norm": 4.869001899537454, "learning_rate": 9.949421087390225e-07, "loss": 0.1167, "step": 499 }, { "epoch": 0.22747952684258416, "grad_norm": 1.5083888104631782, "learning_rate": 9.94921810016932e-07, "loss": 0.0954, "step": 500 }, { "epoch": 0.22793448589626933, "grad_norm": 1.3342128194101248, "learning_rate": 9.949014708520663e-07, "loss": 0.0754, "step": 501 }, { "epoch": 0.2283894449499545, "grad_norm": 1.2498802034306913, "learning_rate": 9.94881091246087e-07, "loss": 0.0675, "step": 502 }, { "epoch": 0.22884440400363967, "grad_norm": 1.523623590025437, "learning_rate": 9.9486067120066e-07, "loss": 0.0547, "step": 503 }, { "epoch": 0.22929936305732485, "grad_norm": 1.0293101719236053, "learning_rate": 9.948402107174536e-07, "loss": 0.055, "step": 504 }, { "epoch": 0.22975432211101002, "grad_norm": 0.9895457631928973, "learning_rate": 9.9481970979814e-07, "loss": 0.0429, "step": 505 }, { "epoch": 0.2302092811646952, "grad_norm": 1.8146151677736246, "learning_rate": 9.94799168444394e-07, "loss": 0.063, "step": 506 }, { "epoch": 0.23066424021838033, "grad_norm": 1.5501254904610202, "learning_rate": 9.94778586657895e-07, "loss": 0.0722, "step": 507 }, { "epoch": 0.2311191992720655, "grad_norm": 0.9718145426138437, "learning_rate": 9.94757964440324e-07, "loss": 0.0537, "step": 508 }, { "epoch": 0.23157415832575068, "grad_norm": 1.6338186673908792, "learning_rate": 9.947373017933663e-07, "loss": 0.0378, "step": 509 }, { "epoch": 0.23202911737943585, "grad_norm": 0.9556135092500326, "learning_rate": 9.947165987187107e-07, "loss": 0.0447, "step": 510 }, { "epoch": 0.23248407643312102, "grad_norm": 0.8078513633293719, "learning_rate": 9.946958552180486e-07, "loss": 0.0328, "step": 511 }, { "epoch": 0.2329390354868062, "grad_norm": 1.022761761604989, "learning_rate": 9.946750712930754e-07, "loss": 0.0359, "step": 512 }, { "epoch": 0.23339399454049137, "grad_norm": 1.1366788360995828, "learning_rate": 9.946542469454893e-07, "loss": 0.0718, "step": 513 }, { "epoch": 0.2338489535941765, "grad_norm": 0.9810382870543687, "learning_rate": 9.94633382176992e-07, "loss": 0.0534, "step": 514 }, { "epoch": 0.23430391264786168, "grad_norm": 1.581853556545306, "learning_rate": 9.946124769892882e-07, "loss": 0.0773, "step": 515 }, { "epoch": 0.23475887170154686, "grad_norm": 0.7644818434101259, "learning_rate": 9.945915313840867e-07, "loss": 0.0423, "step": 516 }, { "epoch": 0.23521383075523203, "grad_norm": 1.1026489788865934, "learning_rate": 9.945705453630988e-07, "loss": 0.0701, "step": 517 }, { "epoch": 0.2356687898089172, "grad_norm": 1.30242239343761, "learning_rate": 9.945495189280394e-07, "loss": 0.0775, "step": 518 }, { "epoch": 0.23612374886260237, "grad_norm": 1.3643812174363148, "learning_rate": 9.945284520806266e-07, "loss": 0.0592, "step": 519 }, { "epoch": 0.23657870791628755, "grad_norm": 0.7728712449894798, "learning_rate": 9.94507344822582e-07, "loss": 0.045, "step": 520 }, { "epoch": 0.2370336669699727, "grad_norm": 0.9753489954613876, "learning_rate": 9.944861971556302e-07, "loss": 0.0401, "step": 521 }, { "epoch": 0.23748862602365786, "grad_norm": 1.116447882604098, "learning_rate": 9.944650090814996e-07, "loss": 0.0506, "step": 522 }, { "epoch": 0.23794358507734303, "grad_norm": 1.7959469233171275, "learning_rate": 9.944437806019215e-07, "loss": 0.0531, "step": 523 }, { "epoch": 0.2383985441310282, "grad_norm": 0.9179182439207999, "learning_rate": 9.944225117186304e-07, "loss": 0.0561, "step": 524 }, { "epoch": 0.23885350318471338, "grad_norm": 0.753951520122613, "learning_rate": 9.944012024333646e-07, "loss": 0.0376, "step": 525 }, { "epoch": 0.23930846223839855, "grad_norm": 0.8101275452789138, "learning_rate": 9.94379852747865e-07, "loss": 0.0431, "step": 526 }, { "epoch": 0.23976342129208372, "grad_norm": 1.3152440421129992, "learning_rate": 9.943584626638767e-07, "loss": 0.0705, "step": 527 }, { "epoch": 0.24021838034576887, "grad_norm": 0.9322535268543007, "learning_rate": 9.943370321831472e-07, "loss": 0.0493, "step": 528 }, { "epoch": 0.24067333939945404, "grad_norm": 2.3492333838000303, "learning_rate": 9.943155613074278e-07, "loss": 0.0422, "step": 529 }, { "epoch": 0.2411282984531392, "grad_norm": 0.974498951251244, "learning_rate": 9.94294050038473e-07, "loss": 0.0641, "step": 530 }, { "epoch": 0.24158325750682438, "grad_norm": 37.95804074486314, "learning_rate": 9.942724983780409e-07, "loss": 0.3584, "step": 531 }, { "epoch": 0.24203821656050956, "grad_norm": 1.637859871500906, "learning_rate": 9.942509063278921e-07, "loss": 0.0868, "step": 532 }, { "epoch": 0.24249317561419473, "grad_norm": 1.1722515185911306, "learning_rate": 9.942292738897913e-07, "loss": 0.0593, "step": 533 }, { "epoch": 0.2429481346678799, "grad_norm": 1.018141151759373, "learning_rate": 9.942076010655062e-07, "loss": 0.0663, "step": 534 }, { "epoch": 0.24340309372156507, "grad_norm": 1.5807982542236025, "learning_rate": 9.941858878568076e-07, "loss": 0.0846, "step": 535 }, { "epoch": 0.24385805277525022, "grad_norm": 1.4544459365608695, "learning_rate": 9.941641342654701e-07, "loss": 0.0795, "step": 536 }, { "epoch": 0.2443130118289354, "grad_norm": 0.7991375127435578, "learning_rate": 9.941423402932712e-07, "loss": 0.041, "step": 537 }, { "epoch": 0.24476797088262056, "grad_norm": 1.3935629119093738, "learning_rate": 9.941205059419918e-07, "loss": 0.0807, "step": 538 }, { "epoch": 0.24522292993630573, "grad_norm": 1.0563722753433893, "learning_rate": 9.94098631213416e-07, "loss": 0.0525, "step": 539 }, { "epoch": 0.2456778889899909, "grad_norm": 1.1009271567670207, "learning_rate": 9.940767161093316e-07, "loss": 0.041, "step": 540 }, { "epoch": 0.24613284804367608, "grad_norm": 1.7826575028692246, "learning_rate": 9.940547606315289e-07, "loss": 0.0689, "step": 541 }, { "epoch": 0.24658780709736125, "grad_norm": 1.0336207217621083, "learning_rate": 9.940327647818025e-07, "loss": 0.0535, "step": 542 }, { "epoch": 0.2470427661510464, "grad_norm": 2.45401048234137, "learning_rate": 9.940107285619495e-07, "loss": 0.0512, "step": 543 }, { "epoch": 0.24749772520473157, "grad_norm": 1.19308205945342, "learning_rate": 9.939886519737706e-07, "loss": 0.0556, "step": 544 }, { "epoch": 0.24795268425841674, "grad_norm": 1.5225580058372714, "learning_rate": 9.9396653501907e-07, "loss": 0.0688, "step": 545 }, { "epoch": 0.2484076433121019, "grad_norm": 1.3121205826406792, "learning_rate": 9.939443776996548e-07, "loss": 0.0761, "step": 546 }, { "epoch": 0.24886260236578708, "grad_norm": 1.186650953021527, "learning_rate": 9.93922180017336e-07, "loss": 0.0487, "step": 547 }, { "epoch": 0.24931756141947226, "grad_norm": 1.2365716348507871, "learning_rate": 9.93899941973927e-07, "loss": 0.0926, "step": 548 }, { "epoch": 0.24977252047315743, "grad_norm": 1.159458828975085, "learning_rate": 9.938776635712448e-07, "loss": 0.0633, "step": 549 }, { "epoch": 0.2502274795268426, "grad_norm": 1.3562046587474255, "learning_rate": 9.938553448111106e-07, "loss": 0.0832, "step": 550 }, { "epoch": 0.25068243858052774, "grad_norm": 1.2395861301175688, "learning_rate": 9.93832985695348e-07, "loss": 0.0643, "step": 551 }, { "epoch": 0.25113739763421294, "grad_norm": 1.0153499872507543, "learning_rate": 9.938105862257838e-07, "loss": 0.0545, "step": 552 }, { "epoch": 0.2515923566878981, "grad_norm": 0.9451041008867292, "learning_rate": 9.937881464042485e-07, "loss": 0.0536, "step": 553 }, { "epoch": 0.25204731574158323, "grad_norm": 1.1282651296840591, "learning_rate": 9.937656662325756e-07, "loss": 0.0799, "step": 554 }, { "epoch": 0.25250227479526843, "grad_norm": 2.314570695596751, "learning_rate": 9.937431457126027e-07, "loss": 0.0623, "step": 555 }, { "epoch": 0.2529572338489536, "grad_norm": 1.1155809644780679, "learning_rate": 9.937205848461694e-07, "loss": 0.0408, "step": 556 }, { "epoch": 0.2534121929026388, "grad_norm": 1.913572776922951, "learning_rate": 9.936979836351196e-07, "loss": 0.0818, "step": 557 }, { "epoch": 0.2538671519563239, "grad_norm": 1.9351926020367605, "learning_rate": 9.936753420813002e-07, "loss": 0.0611, "step": 558 }, { "epoch": 0.2543221110100091, "grad_norm": 1.0209977170376683, "learning_rate": 9.93652660186561e-07, "loss": 0.0417, "step": 559 }, { "epoch": 0.25477707006369427, "grad_norm": 0.8415189328102437, "learning_rate": 9.93629937952756e-07, "loss": 0.0425, "step": 560 }, { "epoch": 0.2552320291173794, "grad_norm": 1.4220280707925794, "learning_rate": 9.936071753817414e-07, "loss": 0.0607, "step": 561 }, { "epoch": 0.2556869881710646, "grad_norm": 1.6380914201367045, "learning_rate": 9.935843724753777e-07, "loss": 0.0721, "step": 562 }, { "epoch": 0.25614194722474976, "grad_norm": 1.7964943264969302, "learning_rate": 9.93561529235528e-07, "loss": 0.0732, "step": 563 }, { "epoch": 0.25659690627843496, "grad_norm": 1.1805655425665382, "learning_rate": 9.935386456640592e-07, "loss": 0.0745, "step": 564 }, { "epoch": 0.2570518653321201, "grad_norm": 1.5472221893134444, "learning_rate": 9.93515721762841e-07, "loss": 0.0675, "step": 565 }, { "epoch": 0.2575068243858053, "grad_norm": 1.1765594893575706, "learning_rate": 9.934927575337468e-07, "loss": 0.0586, "step": 566 }, { "epoch": 0.25796178343949044, "grad_norm": 0.8136913608920742, "learning_rate": 9.934697529786528e-07, "loss": 0.0362, "step": 567 }, { "epoch": 0.2584167424931756, "grad_norm": 1.173127495097662, "learning_rate": 9.934467080994392e-07, "loss": 0.0519, "step": 568 }, { "epoch": 0.2588717015468608, "grad_norm": 1.306348062382054, "learning_rate": 9.93423622897989e-07, "loss": 0.0497, "step": 569 }, { "epoch": 0.25932666060054593, "grad_norm": 1.0310256575612966, "learning_rate": 9.934004973761886e-07, "loss": 0.0417, "step": 570 }, { "epoch": 0.25978161965423113, "grad_norm": 1.6792423520622828, "learning_rate": 9.93377331535928e-07, "loss": 0.0595, "step": 571 }, { "epoch": 0.2602365787079163, "grad_norm": 1.760431319421607, "learning_rate": 9.933541253790996e-07, "loss": 0.0757, "step": 572 }, { "epoch": 0.2606915377616015, "grad_norm": 1.4599462837304698, "learning_rate": 9.933308789076003e-07, "loss": 0.0627, "step": 573 }, { "epoch": 0.2611464968152866, "grad_norm": 0.8994305397672283, "learning_rate": 9.933075921233292e-07, "loss": 0.04, "step": 574 }, { "epoch": 0.26160145586897177, "grad_norm": 1.7615092995470198, "learning_rate": 9.932842650281895e-07, "loss": 0.0574, "step": 575 }, { "epoch": 0.26205641492265697, "grad_norm": 1.334379412673449, "learning_rate": 9.932608976240873e-07, "loss": 0.0904, "step": 576 }, { "epoch": 0.2625113739763421, "grad_norm": 1.0626711710307233, "learning_rate": 9.932374899129321e-07, "loss": 0.0558, "step": 577 }, { "epoch": 0.2629663330300273, "grad_norm": 0.9062531481794093, "learning_rate": 9.932140418966368e-07, "loss": 0.0566, "step": 578 }, { "epoch": 0.26342129208371245, "grad_norm": 1.3481619325857905, "learning_rate": 9.931905535771172e-07, "loss": 0.0503, "step": 579 }, { "epoch": 0.26387625113739765, "grad_norm": 1.3149521834856652, "learning_rate": 9.931670249562929e-07, "loss": 0.0719, "step": 580 }, { "epoch": 0.2643312101910828, "grad_norm": 0.7473353765923089, "learning_rate": 9.931434560360863e-07, "loss": 0.0477, "step": 581 }, { "epoch": 0.26478616924476794, "grad_norm": 1.167928923614994, "learning_rate": 9.931198468184236e-07, "loss": 0.0471, "step": 582 }, { "epoch": 0.26524112829845314, "grad_norm": 1.310612754883531, "learning_rate": 9.930961973052338e-07, "loss": 0.0709, "step": 583 }, { "epoch": 0.2656960873521383, "grad_norm": 1.3235978757573221, "learning_rate": 9.930725074984496e-07, "loss": 0.0815, "step": 584 }, { "epoch": 0.2661510464058235, "grad_norm": 0.9628039769906691, "learning_rate": 9.93048777400007e-07, "loss": 0.0474, "step": 585 }, { "epoch": 0.26660600545950863, "grad_norm": 0.6788073817585508, "learning_rate": 9.930250070118447e-07, "loss": 0.0492, "step": 586 }, { "epoch": 0.26706096451319383, "grad_norm": 1.0492568851031845, "learning_rate": 9.930011963359053e-07, "loss": 0.0521, "step": 587 }, { "epoch": 0.267515923566879, "grad_norm": 1.615172529828041, "learning_rate": 9.929773453741344e-07, "loss": 0.0809, "step": 588 }, { "epoch": 0.2679708826205642, "grad_norm": 1.0253341550071549, "learning_rate": 9.929534541284813e-07, "loss": 0.0426, "step": 589 }, { "epoch": 0.2684258416742493, "grad_norm": 1.1696618244725745, "learning_rate": 9.92929522600898e-07, "loss": 0.0641, "step": 590 }, { "epoch": 0.26888080072793447, "grad_norm": 0.9827272283069487, "learning_rate": 9.929055507933402e-07, "loss": 0.0452, "step": 591 }, { "epoch": 0.26933575978161967, "grad_norm": 0.8430198960753034, "learning_rate": 9.928815387077668e-07, "loss": 0.0508, "step": 592 }, { "epoch": 0.2697907188353048, "grad_norm": 1.382038033621589, "learning_rate": 9.928574863461398e-07, "loss": 0.0681, "step": 593 }, { "epoch": 0.27024567788899, "grad_norm": 1.1940213065451255, "learning_rate": 9.928333937104247e-07, "loss": 0.082, "step": 594 }, { "epoch": 0.27070063694267515, "grad_norm": 0.8404894697073862, "learning_rate": 9.928092608025904e-07, "loss": 0.0425, "step": 595 }, { "epoch": 0.27115559599636035, "grad_norm": 0.8707359898643742, "learning_rate": 9.927850876246087e-07, "loss": 0.0385, "step": 596 }, { "epoch": 0.2716105550500455, "grad_norm": 1.004084628043732, "learning_rate": 9.92760874178455e-07, "loss": 0.0551, "step": 597 }, { "epoch": 0.27206551410373064, "grad_norm": 1.1449385144827775, "learning_rate": 9.927366204661082e-07, "loss": 0.0713, "step": 598 }, { "epoch": 0.27252047315741584, "grad_norm": 0.7989715539153988, "learning_rate": 9.927123264895495e-07, "loss": 0.0416, "step": 599 }, { "epoch": 0.272975432211101, "grad_norm": 0.8363124711263994, "learning_rate": 9.926879922507648e-07, "loss": 0.0383, "step": 600 }, { "epoch": 0.2734303912647862, "grad_norm": 29.55084555102274, "learning_rate": 9.926636177517425e-07, "loss": 0.3416, "step": 601 }, { "epoch": 0.27388535031847133, "grad_norm": 0.9107899157031711, "learning_rate": 9.926392029944742e-07, "loss": 0.0445, "step": 602 }, { "epoch": 0.27434030937215653, "grad_norm": 0.9183702196862396, "learning_rate": 9.926147479809548e-07, "loss": 0.0498, "step": 603 }, { "epoch": 0.2747952684258417, "grad_norm": 0.8029128869183492, "learning_rate": 9.92590252713183e-07, "loss": 0.0384, "step": 604 }, { "epoch": 0.2752502274795268, "grad_norm": 1.3745017591581123, "learning_rate": 9.9256571719316e-07, "loss": 0.0611, "step": 605 }, { "epoch": 0.275705186533212, "grad_norm": 1.041782284515593, "learning_rate": 9.925411414228912e-07, "loss": 0.0564, "step": 606 }, { "epoch": 0.27616014558689717, "grad_norm": 0.9822583220595585, "learning_rate": 9.925165254043845e-07, "loss": 0.0552, "step": 607 }, { "epoch": 0.27661510464058237, "grad_norm": 1.2217611256637895, "learning_rate": 9.924918691396515e-07, "loss": 0.0636, "step": 608 }, { "epoch": 0.2770700636942675, "grad_norm": 1.078364301909632, "learning_rate": 9.924671726307072e-07, "loss": 0.0467, "step": 609 }, { "epoch": 0.2775250227479527, "grad_norm": 0.7795034924194706, "learning_rate": 9.924424358795693e-07, "loss": 0.0513, "step": 610 }, { "epoch": 0.27797998180163785, "grad_norm": 1.040749803384716, "learning_rate": 9.924176588882596e-07, "loss": 0.0422, "step": 611 }, { "epoch": 0.278434940855323, "grad_norm": 0.9955043548788979, "learning_rate": 9.923928416588025e-07, "loss": 0.0601, "step": 612 }, { "epoch": 0.2788898999090082, "grad_norm": 1.2788050875618786, "learning_rate": 9.92367984193226e-07, "loss": 0.0656, "step": 613 }, { "epoch": 0.27934485896269334, "grad_norm": 1.0520208261165467, "learning_rate": 9.923430864935613e-07, "loss": 0.044, "step": 614 }, { "epoch": 0.27979981801637854, "grad_norm": 0.8046608450839557, "learning_rate": 9.92318148561843e-07, "loss": 0.0389, "step": 615 }, { "epoch": 0.2802547770700637, "grad_norm": 0.9446459942758076, "learning_rate": 9.92293170400109e-07, "loss": 0.0574, "step": 616 }, { "epoch": 0.2807097361237489, "grad_norm": 1.286165064577384, "learning_rate": 9.922681520104e-07, "loss": 0.0516, "step": 617 }, { "epoch": 0.28116469517743403, "grad_norm": 0.9757980714878888, "learning_rate": 9.92243093394761e-07, "loss": 0.0427, "step": 618 }, { "epoch": 0.2816196542311192, "grad_norm": 1.8875683874381963, "learning_rate": 9.92217994555239e-07, "loss": 0.1038, "step": 619 }, { "epoch": 0.2820746132848044, "grad_norm": 0.9667936513074024, "learning_rate": 9.921928554938857e-07, "loss": 0.0418, "step": 620 }, { "epoch": 0.2825295723384895, "grad_norm": 1.487921281718676, "learning_rate": 9.921676762127548e-07, "loss": 0.0633, "step": 621 }, { "epoch": 0.2829845313921747, "grad_norm": 1.4119788602644432, "learning_rate": 9.92142456713904e-07, "loss": 0.0832, "step": 622 }, { "epoch": 0.28343949044585987, "grad_norm": 1.0587495456834994, "learning_rate": 9.921171969993942e-07, "loss": 0.0574, "step": 623 }, { "epoch": 0.28389444949954507, "grad_norm": 1.1310281344404047, "learning_rate": 9.920918970712892e-07, "loss": 0.0739, "step": 624 }, { "epoch": 0.2843494085532302, "grad_norm": 1.0487491154495072, "learning_rate": 9.92066556931657e-07, "loss": 0.0567, "step": 625 }, { "epoch": 0.28480436760691535, "grad_norm": 1.3209589226801457, "learning_rate": 9.920411765825677e-07, "loss": 0.0748, "step": 626 }, { "epoch": 0.28525932666060055, "grad_norm": 1.5711306033130534, "learning_rate": 9.920157560260956e-07, "loss": 0.1072, "step": 627 }, { "epoch": 0.2857142857142857, "grad_norm": 1.3376971168224838, "learning_rate": 9.919902952643177e-07, "loss": 0.0785, "step": 628 }, { "epoch": 0.2861692447679709, "grad_norm": 0.7269760473078776, "learning_rate": 9.919647942993147e-07, "loss": 0.0497, "step": 629 }, { "epoch": 0.28662420382165604, "grad_norm": 3.5110121965250496, "learning_rate": 9.919392531331705e-07, "loss": 0.1684, "step": 630 }, { "epoch": 0.28707916287534124, "grad_norm": 0.9707752564014238, "learning_rate": 9.91913671767972e-07, "loss": 0.0313, "step": 631 }, { "epoch": 0.2875341219290264, "grad_norm": 0.921399392571264, "learning_rate": 9.9188805020581e-07, "loss": 0.0427, "step": 632 }, { "epoch": 0.28798908098271153, "grad_norm": 1.3165423282341409, "learning_rate": 9.918623884487777e-07, "loss": 0.0627, "step": 633 }, { "epoch": 0.28844404003639673, "grad_norm": 0.9265710654563936, "learning_rate": 9.91836686498972e-07, "loss": 0.0501, "step": 634 }, { "epoch": 0.2888989990900819, "grad_norm": 1.3367985068100987, "learning_rate": 9.918109443584936e-07, "loss": 0.0706, "step": 635 }, { "epoch": 0.2893539581437671, "grad_norm": 0.9642948759548694, "learning_rate": 9.91785162029446e-07, "loss": 0.0509, "step": 636 }, { "epoch": 0.2898089171974522, "grad_norm": 1.0086097519700952, "learning_rate": 9.917593395139357e-07, "loss": 0.0537, "step": 637 }, { "epoch": 0.2902638762511374, "grad_norm": 1.0234605225183588, "learning_rate": 9.917334768140728e-07, "loss": 0.0425, "step": 638 }, { "epoch": 0.29071883530482256, "grad_norm": 0.9468371907337818, "learning_rate": 9.917075739319709e-07, "loss": 0.0666, "step": 639 }, { "epoch": 0.2911737943585077, "grad_norm": 1.47578738530635, "learning_rate": 9.916816308697466e-07, "loss": 0.0417, "step": 640 }, { "epoch": 0.2916287534121929, "grad_norm": 0.984760350266392, "learning_rate": 9.916556476295199e-07, "loss": 0.0684, "step": 641 }, { "epoch": 0.29208371246587805, "grad_norm": 1.5253857730731182, "learning_rate": 9.91629624213414e-07, "loss": 0.0917, "step": 642 }, { "epoch": 0.29253867151956325, "grad_norm": 1.0045846173194668, "learning_rate": 9.916035606235553e-07, "loss": 0.0687, "step": 643 }, { "epoch": 0.2929936305732484, "grad_norm": 0.812312399494177, "learning_rate": 9.915774568620736e-07, "loss": 0.0619, "step": 644 }, { "epoch": 0.2934485896269336, "grad_norm": 0.9815748164520712, "learning_rate": 9.915513129311024e-07, "loss": 0.0595, "step": 645 }, { "epoch": 0.29390354868061874, "grad_norm": 1.0317601999881185, "learning_rate": 9.915251288327774e-07, "loss": 0.0642, "step": 646 }, { "epoch": 0.2943585077343039, "grad_norm": 1.0962961459419371, "learning_rate": 9.914989045692388e-07, "loss": 0.0569, "step": 647 }, { "epoch": 0.2948134667879891, "grad_norm": 1.131097965549591, "learning_rate": 9.914726401426291e-07, "loss": 0.0923, "step": 648 }, { "epoch": 0.29526842584167423, "grad_norm": 0.9408642324293485, "learning_rate": 9.914463355550948e-07, "loss": 0.0492, "step": 649 }, { "epoch": 0.29572338489535943, "grad_norm": 1.3443658070138855, "learning_rate": 9.914199908087854e-07, "loss": 0.0929, "step": 650 }, { "epoch": 0.2961783439490446, "grad_norm": 0.9125575946870041, "learning_rate": 9.913936059058536e-07, "loss": 0.066, "step": 651 }, { "epoch": 0.2966333030027298, "grad_norm": 1.1337882030874984, "learning_rate": 9.913671808484554e-07, "loss": 0.0699, "step": 652 }, { "epoch": 0.2970882620564149, "grad_norm": 0.9961557585139583, "learning_rate": 9.9134071563875e-07, "loss": 0.0373, "step": 653 }, { "epoch": 0.29754322111010006, "grad_norm": 1.2117410924714211, "learning_rate": 9.913142102789004e-07, "loss": 0.0568, "step": 654 }, { "epoch": 0.29799818016378526, "grad_norm": 1.2977995246627883, "learning_rate": 9.912876647710722e-07, "loss": 0.0589, "step": 655 }, { "epoch": 0.2984531392174704, "grad_norm": 1.2934012457350892, "learning_rate": 9.912610791174347e-07, "loss": 0.08, "step": 656 }, { "epoch": 0.2989080982711556, "grad_norm": 0.9503715983298989, "learning_rate": 9.912344533201603e-07, "loss": 0.0447, "step": 657 }, { "epoch": 0.29936305732484075, "grad_norm": 1.046065350087953, "learning_rate": 9.912077873814248e-07, "loss": 0.0415, "step": 658 }, { "epoch": 0.29981801637852595, "grad_norm": 1.0154368058219054, "learning_rate": 9.911810813034072e-07, "loss": 0.0461, "step": 659 }, { "epoch": 0.3002729754322111, "grad_norm": 1.2461745651172187, "learning_rate": 9.911543350882898e-07, "loss": 0.063, "step": 660 }, { "epoch": 0.30072793448589624, "grad_norm": 1.6534923979652207, "learning_rate": 9.911275487382581e-07, "loss": 0.0663, "step": 661 }, { "epoch": 0.30118289353958144, "grad_norm": 1.5624536280466172, "learning_rate": 9.91100722255501e-07, "loss": 0.0633, "step": 662 }, { "epoch": 0.3016378525932666, "grad_norm": 1.5116955150090208, "learning_rate": 9.910738556422107e-07, "loss": 0.0532, "step": 663 }, { "epoch": 0.3020928116469518, "grad_norm": 1.3985316435724882, "learning_rate": 9.910469489005828e-07, "loss": 0.062, "step": 664 }, { "epoch": 0.30254777070063693, "grad_norm": 1.382268882589595, "learning_rate": 9.910200020328156e-07, "loss": 0.0756, "step": 665 }, { "epoch": 0.30300272975432213, "grad_norm": 1.0085694446210933, "learning_rate": 9.909930150411113e-07, "loss": 0.0398, "step": 666 }, { "epoch": 0.3034576888080073, "grad_norm": 7.296856441995926, "learning_rate": 9.90965987927675e-07, "loss": 0.1161, "step": 667 }, { "epoch": 0.3039126478616925, "grad_norm": 1.3617370295918843, "learning_rate": 9.909389206947153e-07, "loss": 0.0457, "step": 668 }, { "epoch": 0.3043676069153776, "grad_norm": 1.307439764979237, "learning_rate": 9.909118133444443e-07, "loss": 0.0713, "step": 669 }, { "epoch": 0.30482256596906276, "grad_norm": 1.356296800293178, "learning_rate": 9.90884665879077e-07, "loss": 0.0687, "step": 670 }, { "epoch": 0.30527752502274796, "grad_norm": 1.3327603075024197, "learning_rate": 9.908574783008312e-07, "loss": 0.0791, "step": 671 }, { "epoch": 0.3057324840764331, "grad_norm": 1.4572157288705185, "learning_rate": 9.90830250611929e-07, "loss": 0.0697, "step": 672 }, { "epoch": 0.3061874431301183, "grad_norm": 1.6223697913631583, "learning_rate": 9.908029828145955e-07, "loss": 0.0612, "step": 673 }, { "epoch": 0.30664240218380345, "grad_norm": 1.3167422297872369, "learning_rate": 9.907756749110586e-07, "loss": 0.059, "step": 674 }, { "epoch": 0.30709736123748865, "grad_norm": 2.027374931602704, "learning_rate": 9.9074832690355e-07, "loss": 0.0785, "step": 675 }, { "epoch": 0.3075523202911738, "grad_norm": 1.2579435861289403, "learning_rate": 9.907209387943041e-07, "loss": 0.0618, "step": 676 }, { "epoch": 0.30800727934485894, "grad_norm": 1.1857901457653914, "learning_rate": 9.906935105855594e-07, "loss": 0.0347, "step": 677 }, { "epoch": 0.30846223839854414, "grad_norm": 1.452352433489291, "learning_rate": 9.906660422795568e-07, "loss": 0.0718, "step": 678 }, { "epoch": 0.3089171974522293, "grad_norm": 1.628136341272367, "learning_rate": 9.90638533878541e-07, "loss": 0.0727, "step": 679 }, { "epoch": 0.3093721565059145, "grad_norm": 1.1403865777244844, "learning_rate": 9.9061098538476e-07, "loss": 0.0653, "step": 680 }, { "epoch": 0.30982711555959963, "grad_norm": 1.6648555380096433, "learning_rate": 9.905833968004648e-07, "loss": 0.0637, "step": 681 }, { "epoch": 0.31028207461328483, "grad_norm": 1.2274938856516977, "learning_rate": 9.9055576812791e-07, "loss": 0.0669, "step": 682 }, { "epoch": 0.31073703366697, "grad_norm": 0.9713092147216414, "learning_rate": 9.90528099369353e-07, "loss": 0.0643, "step": 683 }, { "epoch": 0.3111919927206551, "grad_norm": 0.9733868728185259, "learning_rate": 9.90500390527055e-07, "loss": 0.0527, "step": 684 }, { "epoch": 0.3116469517743403, "grad_norm": 1.3613496793071267, "learning_rate": 9.904726416032802e-07, "loss": 0.0874, "step": 685 }, { "epoch": 0.31210191082802546, "grad_norm": 1.1620852948117812, "learning_rate": 9.904448526002962e-07, "loss": 0.0383, "step": 686 }, { "epoch": 0.31255686988171066, "grad_norm": 0.6853926531471232, "learning_rate": 9.904170235203735e-07, "loss": 0.0363, "step": 687 }, { "epoch": 0.3130118289353958, "grad_norm": 1.356455301183509, "learning_rate": 9.903891543657864e-07, "loss": 0.0954, "step": 688 }, { "epoch": 0.313466787989081, "grad_norm": 0.7458360864136233, "learning_rate": 9.903612451388122e-07, "loss": 0.0521, "step": 689 }, { "epoch": 0.31392174704276615, "grad_norm": 1.1952028578338365, "learning_rate": 9.903332958417314e-07, "loss": 0.0617, "step": 690 }, { "epoch": 0.3143767060964513, "grad_norm": 0.8713662211918018, "learning_rate": 9.903053064768281e-07, "loss": 0.0456, "step": 691 }, { "epoch": 0.3148316651501365, "grad_norm": 1.2100908677831657, "learning_rate": 9.902772770463892e-07, "loss": 0.0642, "step": 692 }, { "epoch": 0.31528662420382164, "grad_norm": 0.8108654037269365, "learning_rate": 9.902492075527055e-07, "loss": 0.0421, "step": 693 }, { "epoch": 0.31574158325750684, "grad_norm": 1.1045281519748291, "learning_rate": 9.902210979980704e-07, "loss": 0.0441, "step": 694 }, { "epoch": 0.316196542311192, "grad_norm": 1.5162256953140276, "learning_rate": 9.90192948384781e-07, "loss": 0.0642, "step": 695 }, { "epoch": 0.3166515013648772, "grad_norm": 0.9647200531573402, "learning_rate": 9.901647587151376e-07, "loss": 0.0491, "step": 696 }, { "epoch": 0.31710646041856233, "grad_norm": 1.6320391467680888, "learning_rate": 9.901365289914436e-07, "loss": 0.0682, "step": 697 }, { "epoch": 0.3175614194722475, "grad_norm": 1.845846057566512, "learning_rate": 9.901082592160057e-07, "loss": 0.0839, "step": 698 }, { "epoch": 0.3180163785259327, "grad_norm": 1.0981917286301393, "learning_rate": 9.900799493911345e-07, "loss": 0.0647, "step": 699 }, { "epoch": 0.3184713375796178, "grad_norm": 1.2821281964925575, "learning_rate": 9.90051599519143e-07, "loss": 0.0509, "step": 700 }, { "epoch": 0.318926296633303, "grad_norm": 1.181897564667462, "learning_rate": 9.900232096023476e-07, "loss": 0.0589, "step": 701 }, { "epoch": 0.31938125568698816, "grad_norm": 0.7349346727926805, "learning_rate": 9.899947796430685e-07, "loss": 0.0284, "step": 702 }, { "epoch": 0.31983621474067336, "grad_norm": 1.1338027982133403, "learning_rate": 9.899663096436291e-07, "loss": 0.0576, "step": 703 }, { "epoch": 0.3202911737943585, "grad_norm": 1.5551411118621452, "learning_rate": 9.899377996063551e-07, "loss": 0.0654, "step": 704 }, { "epoch": 0.32074613284804365, "grad_norm": 0.9208251963138269, "learning_rate": 9.89909249533577e-07, "loss": 0.0442, "step": 705 }, { "epoch": 0.32120109190172885, "grad_norm": 1.2671528203960218, "learning_rate": 9.898806594276272e-07, "loss": 0.0521, "step": 706 }, { "epoch": 0.321656050955414, "grad_norm": 0.9845964761216766, "learning_rate": 9.898520292908423e-07, "loss": 0.0626, "step": 707 }, { "epoch": 0.3221110100090992, "grad_norm": 0.9201857291564361, "learning_rate": 9.898233591255618e-07, "loss": 0.0414, "step": 708 }, { "epoch": 0.32256596906278434, "grad_norm": 1.3802427541635225, "learning_rate": 9.897946489341284e-07, "loss": 0.0631, "step": 709 }, { "epoch": 0.32302092811646954, "grad_norm": 0.9169255922193607, "learning_rate": 9.89765898718888e-07, "loss": 0.0463, "step": 710 }, { "epoch": 0.3234758871701547, "grad_norm": 0.8489596136214773, "learning_rate": 9.897371084821904e-07, "loss": 0.0279, "step": 711 }, { "epoch": 0.32393084622383983, "grad_norm": 0.9947187495866323, "learning_rate": 9.897082782263878e-07, "loss": 0.0491, "step": 712 }, { "epoch": 0.32438580527752503, "grad_norm": 0.8636931822601663, "learning_rate": 9.896794079538362e-07, "loss": 0.0288, "step": 713 }, { "epoch": 0.3248407643312102, "grad_norm": 0.7038785213990234, "learning_rate": 9.896504976668946e-07, "loss": 0.0299, "step": 714 }, { "epoch": 0.3252957233848954, "grad_norm": 0.7919316226101341, "learning_rate": 9.896215473679256e-07, "loss": 0.0483, "step": 715 }, { "epoch": 0.3257506824385805, "grad_norm": 1.1431942474622694, "learning_rate": 9.89592557059295e-07, "loss": 0.0674, "step": 716 }, { "epoch": 0.3262056414922657, "grad_norm": 1.4320499560835513, "learning_rate": 9.895635267433716e-07, "loss": 0.0633, "step": 717 }, { "epoch": 0.32666060054595086, "grad_norm": 1.8599515421854902, "learning_rate": 9.895344564225276e-07, "loss": 0.1073, "step": 718 }, { "epoch": 0.327115559599636, "grad_norm": 1.1060164210833159, "learning_rate": 9.895053460991388e-07, "loss": 0.0627, "step": 719 }, { "epoch": 0.3275705186533212, "grad_norm": 1.2370280349125007, "learning_rate": 9.894761957755833e-07, "loss": 0.0661, "step": 720 }, { "epoch": 0.32802547770700635, "grad_norm": 1.1794898817174824, "learning_rate": 9.894470054542438e-07, "loss": 0.0891, "step": 721 }, { "epoch": 0.32848043676069155, "grad_norm": 1.0219540363977015, "learning_rate": 9.894177751375052e-07, "loss": 0.0357, "step": 722 }, { "epoch": 0.3289353958143767, "grad_norm": 0.9261416205683722, "learning_rate": 9.893885048277562e-07, "loss": 0.0437, "step": 723 }, { "epoch": 0.3293903548680619, "grad_norm": 1.1764770329895786, "learning_rate": 9.893591945273888e-07, "loss": 0.0554, "step": 724 }, { "epoch": 0.32984531392174704, "grad_norm": 1.1630835283432235, "learning_rate": 9.89329844238798e-07, "loss": 0.0776, "step": 725 }, { "epoch": 0.3303002729754322, "grad_norm": 1.1291233347560112, "learning_rate": 9.893004539643818e-07, "loss": 0.055, "step": 726 }, { "epoch": 0.3307552320291174, "grad_norm": 1.6265957853112565, "learning_rate": 9.892710237065422e-07, "loss": 0.137, "step": 727 }, { "epoch": 0.33121019108280253, "grad_norm": 0.9413931036817039, "learning_rate": 9.892415534676844e-07, "loss": 0.0382, "step": 728 }, { "epoch": 0.33166515013648773, "grad_norm": 0.8283127220515004, "learning_rate": 9.89212043250216e-07, "loss": 0.045, "step": 729 }, { "epoch": 0.3321201091901729, "grad_norm": 0.9191157693998399, "learning_rate": 9.891824930565487e-07, "loss": 0.0379, "step": 730 }, { "epoch": 0.3325750682438581, "grad_norm": 1.1703508124126667, "learning_rate": 9.891529028890973e-07, "loss": 0.0855, "step": 731 }, { "epoch": 0.3330300272975432, "grad_norm": 1.4346132724934677, "learning_rate": 9.891232727502795e-07, "loss": 0.0603, "step": 732 }, { "epoch": 0.33348498635122836, "grad_norm": 1.5012362126787608, "learning_rate": 9.89093602642517e-07, "loss": 0.0653, "step": 733 }, { "epoch": 0.33393994540491356, "grad_norm": 1.0859634329673766, "learning_rate": 9.890638925682338e-07, "loss": 0.0648, "step": 734 }, { "epoch": 0.3343949044585987, "grad_norm": 1.232006806795307, "learning_rate": 9.890341425298578e-07, "loss": 0.0605, "step": 735 }, { "epoch": 0.3348498635122839, "grad_norm": 1.1483754645505195, "learning_rate": 9.890043525298203e-07, "loss": 0.038, "step": 736 }, { "epoch": 0.33530482256596905, "grad_norm": 0.8905630939875826, "learning_rate": 9.889745225705553e-07, "loss": 0.054, "step": 737 }, { "epoch": 0.33575978161965425, "grad_norm": 1.0578430284981697, "learning_rate": 9.889446526545006e-07, "loss": 0.0625, "step": 738 }, { "epoch": 0.3362147406733394, "grad_norm": 0.9285828621379187, "learning_rate": 9.88914742784097e-07, "loss": 0.0635, "step": 739 }, { "epoch": 0.33666969972702454, "grad_norm": 3.815140638262396, "learning_rate": 9.888847929617884e-07, "loss": 0.1459, "step": 740 }, { "epoch": 0.33712465878070974, "grad_norm": 1.3928949636535293, "learning_rate": 9.888548031900225e-07, "loss": 0.0578, "step": 741 }, { "epoch": 0.3375796178343949, "grad_norm": 1.1754372827695299, "learning_rate": 9.888247734712497e-07, "loss": 0.0442, "step": 742 }, { "epoch": 0.3380345768880801, "grad_norm": 1.2998350175949678, "learning_rate": 9.887947038079237e-07, "loss": 0.0817, "step": 743 }, { "epoch": 0.33848953594176523, "grad_norm": 0.9950492241365165, "learning_rate": 9.88764594202502e-07, "loss": 0.0411, "step": 744 }, { "epoch": 0.33894449499545043, "grad_norm": 0.7917809313616403, "learning_rate": 9.887344446574451e-07, "loss": 0.0361, "step": 745 }, { "epoch": 0.3393994540491356, "grad_norm": 1.4356540391991606, "learning_rate": 9.887042551752162e-07, "loss": 0.0878, "step": 746 }, { "epoch": 0.3398544131028208, "grad_norm": 0.8243574844840938, "learning_rate": 9.886740257582826e-07, "loss": 0.0415, "step": 747 }, { "epoch": 0.3403093721565059, "grad_norm": 0.8439777930122507, "learning_rate": 9.886437564091146e-07, "loss": 0.0546, "step": 748 }, { "epoch": 0.34076433121019106, "grad_norm": 1.2114529277008583, "learning_rate": 9.886134471301853e-07, "loss": 0.0677, "step": 749 }, { "epoch": 0.34121929026387626, "grad_norm": 1.0226089397658138, "learning_rate": 9.885830979239718e-07, "loss": 0.0563, "step": 750 }, { "epoch": 0.3416742493175614, "grad_norm": 1.0110974492493314, "learning_rate": 9.885527087929538e-07, "loss": 0.0626, "step": 751 }, { "epoch": 0.3421292083712466, "grad_norm": 1.0903660150853904, "learning_rate": 9.88522279739615e-07, "loss": 0.0607, "step": 752 }, { "epoch": 0.34258416742493175, "grad_norm": 1.072611948369762, "learning_rate": 9.884918107664414e-07, "loss": 0.0605, "step": 753 }, { "epoch": 0.34303912647861695, "grad_norm": 1.128742393993082, "learning_rate": 9.884613018759232e-07, "loss": 0.0522, "step": 754 }, { "epoch": 0.3434940855323021, "grad_norm": 0.8658277171169773, "learning_rate": 9.884307530705534e-07, "loss": 0.0563, "step": 755 }, { "epoch": 0.34394904458598724, "grad_norm": 0.8376780829571386, "learning_rate": 9.884001643528279e-07, "loss": 0.0396, "step": 756 }, { "epoch": 0.34440400363967244, "grad_norm": 0.8906241749534454, "learning_rate": 9.883695357252466e-07, "loss": 0.0461, "step": 757 }, { "epoch": 0.3448589626933576, "grad_norm": 0.8276199164583825, "learning_rate": 9.883388671903124e-07, "loss": 0.0557, "step": 758 }, { "epoch": 0.3453139217470428, "grad_norm": 1.2722172400064715, "learning_rate": 9.883081587505314e-07, "loss": 0.056, "step": 759 }, { "epoch": 0.34576888080072793, "grad_norm": 1.3214696417785712, "learning_rate": 9.882774104084125e-07, "loss": 0.0834, "step": 760 }, { "epoch": 0.34622383985441313, "grad_norm": 0.7934488982241936, "learning_rate": 9.882466221664691e-07, "loss": 0.0284, "step": 761 }, { "epoch": 0.3466787989080983, "grad_norm": 1.2629253079562457, "learning_rate": 9.882157940272163e-07, "loss": 0.0628, "step": 762 }, { "epoch": 0.3471337579617834, "grad_norm": 1.198862665136291, "learning_rate": 9.881849259931736e-07, "loss": 0.0761, "step": 763 }, { "epoch": 0.3475887170154686, "grad_norm": 0.9396769668127813, "learning_rate": 9.881540180668637e-07, "loss": 0.0428, "step": 764 }, { "epoch": 0.34804367606915376, "grad_norm": 0.9483287291011322, "learning_rate": 9.881230702508117e-07, "loss": 0.0455, "step": 765 }, { "epoch": 0.34849863512283896, "grad_norm": 0.6948356587243723, "learning_rate": 9.880920825475466e-07, "loss": 0.049, "step": 766 }, { "epoch": 0.3489535941765241, "grad_norm": 0.8764664593855477, "learning_rate": 9.88061054959601e-07, "loss": 0.0468, "step": 767 }, { "epoch": 0.3494085532302093, "grad_norm": 0.8791097681895492, "learning_rate": 9.880299874895098e-07, "loss": 0.0541, "step": 768 }, { "epoch": 0.34986351228389445, "grad_norm": 0.7806928475405208, "learning_rate": 9.87998880139812e-07, "loss": 0.0405, "step": 769 }, { "epoch": 0.3503184713375796, "grad_norm": 0.9986905512274477, "learning_rate": 9.879677329130496e-07, "loss": 0.0526, "step": 770 }, { "epoch": 0.3507734303912648, "grad_norm": 0.9287061179408991, "learning_rate": 9.879365458117676e-07, "loss": 0.0442, "step": 771 }, { "epoch": 0.35122838944494994, "grad_norm": 0.899071118753459, "learning_rate": 9.879053188385147e-07, "loss": 0.0455, "step": 772 }, { "epoch": 0.35168334849863514, "grad_norm": 0.9936912456097078, "learning_rate": 9.878740519958423e-07, "loss": 0.0405, "step": 773 }, { "epoch": 0.3521383075523203, "grad_norm": 1.2568803464111604, "learning_rate": 9.878427452863059e-07, "loss": 0.0679, "step": 774 }, { "epoch": 0.3525932666060055, "grad_norm": 1.2788336416132267, "learning_rate": 9.878113987124633e-07, "loss": 0.0902, "step": 775 }, { "epoch": 0.35304822565969063, "grad_norm": 1.0760193845277177, "learning_rate": 9.87780012276876e-07, "loss": 0.0479, "step": 776 }, { "epoch": 0.3535031847133758, "grad_norm": 0.9300743052647006, "learning_rate": 9.877485859821091e-07, "loss": 0.0457, "step": 777 }, { "epoch": 0.353958143767061, "grad_norm": 0.5849902243659433, "learning_rate": 9.877171198307304e-07, "loss": 0.0342, "step": 778 }, { "epoch": 0.3544131028207461, "grad_norm": 1.0764544843910653, "learning_rate": 9.876856138253109e-07, "loss": 0.0675, "step": 779 }, { "epoch": 0.3548680618744313, "grad_norm": 1.2734310995258185, "learning_rate": 9.876540679684255e-07, "loss": 0.0618, "step": 780 }, { "epoch": 0.35532302092811646, "grad_norm": 1.3149458641547824, "learning_rate": 9.876224822626521e-07, "loss": 0.0669, "step": 781 }, { "epoch": 0.35577797998180166, "grad_norm": 0.8536263040340912, "learning_rate": 9.875908567105714e-07, "loss": 0.042, "step": 782 }, { "epoch": 0.3562329390354868, "grad_norm": 1.5543008749481344, "learning_rate": 9.875591913147678e-07, "loss": 0.074, "step": 783 }, { "epoch": 0.35668789808917195, "grad_norm": 1.9615355852870393, "learning_rate": 9.87527486077829e-07, "loss": 0.0773, "step": 784 }, { "epoch": 0.35714285714285715, "grad_norm": 1.0896405339670319, "learning_rate": 9.874957410023456e-07, "loss": 0.0342, "step": 785 }, { "epoch": 0.3575978161965423, "grad_norm": 1.1339386420559001, "learning_rate": 9.874639560909118e-07, "loss": 0.0583, "step": 786 }, { "epoch": 0.3580527752502275, "grad_norm": 0.9828226607515503, "learning_rate": 9.874321313461248e-07, "loss": 0.0587, "step": 787 }, { "epoch": 0.35850773430391264, "grad_norm": 1.016094661638095, "learning_rate": 9.874002667705852e-07, "loss": 0.0485, "step": 788 }, { "epoch": 0.35896269335759784, "grad_norm": 0.830233747808448, "learning_rate": 9.87368362366897e-07, "loss": 0.0472, "step": 789 }, { "epoch": 0.359417652411283, "grad_norm": 1.3608309134532897, "learning_rate": 9.873364181376672e-07, "loss": 0.1011, "step": 790 }, { "epoch": 0.35987261146496813, "grad_norm": 1.0475930652878, "learning_rate": 9.873044340855061e-07, "loss": 0.0395, "step": 791 }, { "epoch": 0.36032757051865333, "grad_norm": 1.2283691711013998, "learning_rate": 9.872724102130272e-07, "loss": 0.0498, "step": 792 }, { "epoch": 0.3607825295723385, "grad_norm": 1.1411326217070377, "learning_rate": 9.872403465228475e-07, "loss": 0.0477, "step": 793 }, { "epoch": 0.3612374886260237, "grad_norm": 1.0679176790784732, "learning_rate": 9.87208243017587e-07, "loss": 0.0536, "step": 794 }, { "epoch": 0.3616924476797088, "grad_norm": 1.3204965657880923, "learning_rate": 9.87176099699869e-07, "loss": 0.0528, "step": 795 }, { "epoch": 0.362147406733394, "grad_norm": 1.5656645053169096, "learning_rate": 9.871439165723206e-07, "loss": 0.0712, "step": 796 }, { "epoch": 0.36260236578707916, "grad_norm": 1.1255325878442837, "learning_rate": 9.871116936375708e-07, "loss": 0.0631, "step": 797 }, { "epoch": 0.3630573248407643, "grad_norm": 1.4032360412457041, "learning_rate": 9.870794308982534e-07, "loss": 0.0743, "step": 798 }, { "epoch": 0.3635122838944495, "grad_norm": 1.4594947304255608, "learning_rate": 9.870471283570046e-07, "loss": 0.0863, "step": 799 }, { "epoch": 0.36396724294813465, "grad_norm": 1.305071451218416, "learning_rate": 9.870147860164637e-07, "loss": 0.0537, "step": 800 }, { "epoch": 0.36442220200181985, "grad_norm": 0.9906903716091005, "learning_rate": 9.86982403879274e-07, "loss": 0.0424, "step": 801 }, { "epoch": 0.364877161055505, "grad_norm": 1.1331645032808157, "learning_rate": 9.869499819480814e-07, "loss": 0.0654, "step": 802 }, { "epoch": 0.3653321201091902, "grad_norm": 0.9177042441340905, "learning_rate": 9.869175202255353e-07, "loss": 0.0514, "step": 803 }, { "epoch": 0.36578707916287534, "grad_norm": 0.9391822731873478, "learning_rate": 9.868850187142884e-07, "loss": 0.0493, "step": 804 }, { "epoch": 0.3662420382165605, "grad_norm": 0.933416930574812, "learning_rate": 9.868524774169966e-07, "loss": 0.0535, "step": 805 }, { "epoch": 0.3666969972702457, "grad_norm": 1.1959668329347215, "learning_rate": 9.868198963363189e-07, "loss": 0.0403, "step": 806 }, { "epoch": 0.36715195632393083, "grad_norm": 1.0646453928329296, "learning_rate": 9.867872754749175e-07, "loss": 0.0595, "step": 807 }, { "epoch": 0.36760691537761603, "grad_norm": 0.7727709312250975, "learning_rate": 9.867546148354586e-07, "loss": 0.0448, "step": 808 }, { "epoch": 0.3680618744313012, "grad_norm": 0.8739430655383877, "learning_rate": 9.867219144206104e-07, "loss": 0.047, "step": 809 }, { "epoch": 0.3685168334849864, "grad_norm": 1.3030308232877215, "learning_rate": 9.866891742330456e-07, "loss": 0.0891, "step": 810 }, { "epoch": 0.3689717925386715, "grad_norm": 1.0474886851090488, "learning_rate": 9.866563942754392e-07, "loss": 0.0474, "step": 811 }, { "epoch": 0.36942675159235666, "grad_norm": 0.9685688813982076, "learning_rate": 9.866235745504704e-07, "loss": 0.0578, "step": 812 }, { "epoch": 0.36988171064604186, "grad_norm": 1.584661593329521, "learning_rate": 9.865907150608202e-07, "loss": 0.0657, "step": 813 }, { "epoch": 0.370336669699727, "grad_norm": 0.9731669609765669, "learning_rate": 9.865578158091744e-07, "loss": 0.0615, "step": 814 }, { "epoch": 0.3707916287534122, "grad_norm": 1.1723616539004342, "learning_rate": 9.86524876798221e-07, "loss": 0.0807, "step": 815 }, { "epoch": 0.37124658780709735, "grad_norm": 1.1720495378697444, "learning_rate": 9.86491898030652e-07, "loss": 0.0632, "step": 816 }, { "epoch": 0.37170154686078255, "grad_norm": 0.8596653857982595, "learning_rate": 9.86458879509162e-07, "loss": 0.049, "step": 817 }, { "epoch": 0.3721565059144677, "grad_norm": 1.7199551853498367, "learning_rate": 9.864258212364492e-07, "loss": 0.0696, "step": 818 }, { "epoch": 0.37261146496815284, "grad_norm": 1.1553459158623995, "learning_rate": 9.86392723215215e-07, "loss": 0.0468, "step": 819 }, { "epoch": 0.37306642402183804, "grad_norm": 0.6901843796083772, "learning_rate": 9.863595854481639e-07, "loss": 0.0333, "step": 820 }, { "epoch": 0.3735213830755232, "grad_norm": 1.016196881810049, "learning_rate": 9.863264079380038e-07, "loss": 0.0482, "step": 821 }, { "epoch": 0.3739763421292084, "grad_norm": 1.304769551119967, "learning_rate": 9.86293190687446e-07, "loss": 0.0796, "step": 822 }, { "epoch": 0.37443130118289353, "grad_norm": 0.8271645108887761, "learning_rate": 9.862599336992047e-07, "loss": 0.0339, "step": 823 }, { "epoch": 0.37488626023657873, "grad_norm": 1.0237868444660387, "learning_rate": 9.862266369759976e-07, "loss": 0.049, "step": 824 }, { "epoch": 0.37534121929026387, "grad_norm": 0.7481112813525742, "learning_rate": 9.861933005205454e-07, "loss": 0.037, "step": 825 }, { "epoch": 0.37579617834394907, "grad_norm": 1.671586808500231, "learning_rate": 9.861599243355723e-07, "loss": 0.0792, "step": 826 }, { "epoch": 0.3762511373976342, "grad_norm": 1.0553514466652605, "learning_rate": 9.861265084238057e-07, "loss": 0.0527, "step": 827 }, { "epoch": 0.37670609645131936, "grad_norm": 0.7864060727472882, "learning_rate": 9.860930527879763e-07, "loss": 0.0345, "step": 828 }, { "epoch": 0.37716105550500456, "grad_norm": 1.327471759559032, "learning_rate": 9.860595574308178e-07, "loss": 0.0681, "step": 829 }, { "epoch": 0.3776160145586897, "grad_norm": 0.9733040996995092, "learning_rate": 9.86026022355067e-07, "loss": 0.0465, "step": 830 }, { "epoch": 0.3780709736123749, "grad_norm": 0.837260952665494, "learning_rate": 9.859924475634647e-07, "loss": 0.0348, "step": 831 }, { "epoch": 0.37852593266606005, "grad_norm": 1.5856875991758357, "learning_rate": 9.859588330587545e-07, "loss": 0.0781, "step": 832 }, { "epoch": 0.37898089171974525, "grad_norm": 0.8791110293961706, "learning_rate": 9.859251788436827e-07, "loss": 0.0527, "step": 833 }, { "epoch": 0.3794358507734304, "grad_norm": 1.4738255348098261, "learning_rate": 9.858914849209999e-07, "loss": 0.0703, "step": 834 }, { "epoch": 0.37989080982711554, "grad_norm": 1.4462945721985379, "learning_rate": 9.858577512934591e-07, "loss": 0.082, "step": 835 }, { "epoch": 0.38034576888080074, "grad_norm": 1.0943129317604596, "learning_rate": 9.858239779638172e-07, "loss": 0.0493, "step": 836 }, { "epoch": 0.3808007279344859, "grad_norm": 1.929443473369058, "learning_rate": 9.857901649348337e-07, "loss": 0.095, "step": 837 }, { "epoch": 0.3812556869881711, "grad_norm": 1.1280848978709037, "learning_rate": 9.857563122092716e-07, "loss": 0.0541, "step": 838 }, { "epoch": 0.3817106460418562, "grad_norm": 1.7523363454270473, "learning_rate": 9.857224197898972e-07, "loss": 0.0526, "step": 839 }, { "epoch": 0.3821656050955414, "grad_norm": 1.033542049273595, "learning_rate": 9.856884876794805e-07, "loss": 0.0512, "step": 840 }, { "epoch": 0.38262056414922657, "grad_norm": 0.7514059855481252, "learning_rate": 9.856545158807938e-07, "loss": 0.0323, "step": 841 }, { "epoch": 0.3830755232029117, "grad_norm": 1.1345607174029522, "learning_rate": 9.856205043966132e-07, "loss": 0.0727, "step": 842 }, { "epoch": 0.3835304822565969, "grad_norm": 1.4421407689979044, "learning_rate": 9.855864532297181e-07, "loss": 0.0891, "step": 843 }, { "epoch": 0.38398544131028206, "grad_norm": 1.142563752338784, "learning_rate": 9.855523623828908e-07, "loss": 0.0634, "step": 844 }, { "epoch": 0.38444040036396726, "grad_norm": 0.8784660971028496, "learning_rate": 9.855182318589173e-07, "loss": 0.0493, "step": 845 }, { "epoch": 0.3848953594176524, "grad_norm": 1.7179525054736222, "learning_rate": 9.854840616605864e-07, "loss": 0.0625, "step": 846 }, { "epoch": 0.3853503184713376, "grad_norm": 0.9279745059596685, "learning_rate": 9.854498517906907e-07, "loss": 0.0541, "step": 847 }, { "epoch": 0.38580527752502275, "grad_norm": 1.0991592526395082, "learning_rate": 9.85415602252025e-07, "loss": 0.0528, "step": 848 }, { "epoch": 0.3862602365787079, "grad_norm": 1.5791838958653412, "learning_rate": 9.853813130473885e-07, "loss": 0.0783, "step": 849 }, { "epoch": 0.3867151956323931, "grad_norm": 1.0391443427002267, "learning_rate": 9.853469841795831e-07, "loss": 0.0549, "step": 850 }, { "epoch": 0.38717015468607824, "grad_norm": 0.802198180768101, "learning_rate": 9.85312615651414e-07, "loss": 0.0498, "step": 851 }, { "epoch": 0.38762511373976344, "grad_norm": 0.6796015264215622, "learning_rate": 9.852782074656896e-07, "loss": 0.051, "step": 852 }, { "epoch": 0.3880800727934486, "grad_norm": 1.0655680895323487, "learning_rate": 9.852437596252214e-07, "loss": 0.0508, "step": 853 }, { "epoch": 0.3885350318471338, "grad_norm": 1.3019162109093398, "learning_rate": 9.852092721328248e-07, "loss": 0.054, "step": 854 }, { "epoch": 0.3889899909008189, "grad_norm": 1.2636805181197226, "learning_rate": 9.851747449913176e-07, "loss": 0.0651, "step": 855 }, { "epoch": 0.38944494995450407, "grad_norm": 1.2565548515380993, "learning_rate": 9.851401782035212e-07, "loss": 0.0954, "step": 856 }, { "epoch": 0.38989990900818927, "grad_norm": 1.069899437624589, "learning_rate": 9.851055717722601e-07, "loss": 0.0488, "step": 857 }, { "epoch": 0.3903548680618744, "grad_norm": 1.3711934448611085, "learning_rate": 9.850709257003626e-07, "loss": 0.0601, "step": 858 }, { "epoch": 0.3908098271155596, "grad_norm": 1.58883184217963, "learning_rate": 9.850362399906596e-07, "loss": 0.0673, "step": 859 }, { "epoch": 0.39126478616924476, "grad_norm": 1.190237033447639, "learning_rate": 9.850015146459855e-07, "loss": 0.0559, "step": 860 }, { "epoch": 0.39171974522292996, "grad_norm": 1.2368342232988891, "learning_rate": 9.84966749669178e-07, "loss": 0.0721, "step": 861 }, { "epoch": 0.3921747042766151, "grad_norm": 0.9443321106366053, "learning_rate": 9.849319450630775e-07, "loss": 0.0511, "step": 862 }, { "epoch": 0.39262966333030025, "grad_norm": 0.741718681720957, "learning_rate": 9.848971008305287e-07, "loss": 0.0414, "step": 863 }, { "epoch": 0.39308462238398545, "grad_norm": 1.045766650305341, "learning_rate": 9.848622169743782e-07, "loss": 0.0568, "step": 864 }, { "epoch": 0.3935395814376706, "grad_norm": 3.3519053133043433, "learning_rate": 9.848272934974774e-07, "loss": 0.0718, "step": 865 }, { "epoch": 0.3939945404913558, "grad_norm": 1.0273195709937508, "learning_rate": 9.847923304026793e-07, "loss": 0.0437, "step": 866 }, { "epoch": 0.39444949954504094, "grad_norm": 0.8906770110298587, "learning_rate": 9.847573276928414e-07, "loss": 0.0614, "step": 867 }, { "epoch": 0.39490445859872614, "grad_norm": 0.8856125965003636, "learning_rate": 9.847222853708237e-07, "loss": 0.0487, "step": 868 }, { "epoch": 0.3953594176524113, "grad_norm": 0.950536880586547, "learning_rate": 9.846872034394902e-07, "loss": 0.0417, "step": 869 }, { "epoch": 0.3958143767060964, "grad_norm": 0.82426928891462, "learning_rate": 9.846520819017068e-07, "loss": 0.0432, "step": 870 }, { "epoch": 0.3962693357597816, "grad_norm": 1.372052410922468, "learning_rate": 9.846169207603442e-07, "loss": 0.0887, "step": 871 }, { "epoch": 0.39672429481346677, "grad_norm": 1.1445151174227057, "learning_rate": 9.845817200182754e-07, "loss": 0.0451, "step": 872 }, { "epoch": 0.39717925386715197, "grad_norm": 1.139914599890856, "learning_rate": 9.845464796783766e-07, "loss": 0.0962, "step": 873 }, { "epoch": 0.3976342129208371, "grad_norm": 0.824861427712107, "learning_rate": 9.845111997435278e-07, "loss": 0.0538, "step": 874 }, { "epoch": 0.3980891719745223, "grad_norm": 1.0548345460566464, "learning_rate": 9.844758802166115e-07, "loss": 0.0437, "step": 875 }, { "epoch": 0.39854413102820746, "grad_norm": 1.098016279723092, "learning_rate": 9.844405211005144e-07, "loss": 0.0473, "step": 876 }, { "epoch": 0.3989990900818926, "grad_norm": 0.8203953789844892, "learning_rate": 9.844051223981256e-07, "loss": 0.0429, "step": 877 }, { "epoch": 0.3994540491355778, "grad_norm": 1.0270219165790127, "learning_rate": 9.84369684112338e-07, "loss": 0.0487, "step": 878 }, { "epoch": 0.39990900818926295, "grad_norm": 0.8436040675180059, "learning_rate": 9.843342062460468e-07, "loss": 0.0439, "step": 879 }, { "epoch": 0.40036396724294815, "grad_norm": 1.0936050028553777, "learning_rate": 9.842986888021516e-07, "loss": 0.0705, "step": 880 }, { "epoch": 0.4008189262966333, "grad_norm": 0.7670096174396747, "learning_rate": 9.842631317835548e-07, "loss": 0.0432, "step": 881 }, { "epoch": 0.4012738853503185, "grad_norm": 1.0135001138826303, "learning_rate": 9.842275351931617e-07, "loss": 0.0441, "step": 882 }, { "epoch": 0.40172884440400364, "grad_norm": 0.6633020938555181, "learning_rate": 9.84191899033881e-07, "loss": 0.0431, "step": 883 }, { "epoch": 0.4021838034576888, "grad_norm": 1.313992069238165, "learning_rate": 9.84156223308625e-07, "loss": 0.0712, "step": 884 }, { "epoch": 0.402638762511374, "grad_norm": 1.1802928750540758, "learning_rate": 9.84120508020309e-07, "loss": 0.0529, "step": 885 }, { "epoch": 0.4030937215650591, "grad_norm": 1.1180772785241204, "learning_rate": 9.840847531718514e-07, "loss": 0.0519, "step": 886 }, { "epoch": 0.4035486806187443, "grad_norm": 0.9756760945890554, "learning_rate": 9.840489587661737e-07, "loss": 0.0478, "step": 887 }, { "epoch": 0.40400363967242947, "grad_norm": 1.0238413939758209, "learning_rate": 9.84013124806201e-07, "loss": 0.0573, "step": 888 }, { "epoch": 0.40445859872611467, "grad_norm": 0.7924995782364864, "learning_rate": 9.839772512948617e-07, "loss": 0.0473, "step": 889 }, { "epoch": 0.4049135577797998, "grad_norm": 1.2386419618995468, "learning_rate": 9.83941338235087e-07, "loss": 0.0594, "step": 890 }, { "epoch": 0.40536851683348496, "grad_norm": 1.2549710433176888, "learning_rate": 9.839053856298113e-07, "loss": 0.0747, "step": 891 }, { "epoch": 0.40582347588717016, "grad_norm": 0.9189822000022692, "learning_rate": 9.838693934819732e-07, "loss": 0.0555, "step": 892 }, { "epoch": 0.4062784349408553, "grad_norm": 1.0395803161306463, "learning_rate": 9.838333617945133e-07, "loss": 0.0713, "step": 893 }, { "epoch": 0.4067333939945405, "grad_norm": 1.4426328973245872, "learning_rate": 9.83797290570376e-07, "loss": 0.0566, "step": 894 }, { "epoch": 0.40718835304822565, "grad_norm": 0.8603913038692429, "learning_rate": 9.837611798125088e-07, "loss": 0.0327, "step": 895 }, { "epoch": 0.40764331210191085, "grad_norm": 0.5635998649576651, "learning_rate": 9.837250295238628e-07, "loss": 0.0289, "step": 896 }, { "epoch": 0.408098271155596, "grad_norm": 1.1984552003557507, "learning_rate": 9.836888397073918e-07, "loss": 0.0821, "step": 897 }, { "epoch": 0.40855323020928114, "grad_norm": 1.187456056630682, "learning_rate": 9.836526103660532e-07, "loss": 0.0761, "step": 898 }, { "epoch": 0.40900818926296634, "grad_norm": 1.7286099188479387, "learning_rate": 9.836163415028073e-07, "loss": 0.0514, "step": 899 }, { "epoch": 0.4094631483166515, "grad_norm": 1.3897338156914127, "learning_rate": 9.83580033120618e-07, "loss": 0.0651, "step": 900 }, { "epoch": 0.4099181073703367, "grad_norm": 1.165464556068957, "learning_rate": 9.835436852224524e-07, "loss": 0.0591, "step": 901 }, { "epoch": 0.4103730664240218, "grad_norm": 1.199639020048541, "learning_rate": 9.835072978112802e-07, "loss": 0.0496, "step": 902 }, { "epoch": 0.410828025477707, "grad_norm": 1.2534961999205232, "learning_rate": 9.834708708900753e-07, "loss": 0.0476, "step": 903 }, { "epoch": 0.41128298453139217, "grad_norm": 0.927614598023347, "learning_rate": 9.834344044618143e-07, "loss": 0.0391, "step": 904 }, { "epoch": 0.41173794358507737, "grad_norm": 0.8589639897120175, "learning_rate": 9.833978985294769e-07, "loss": 0.0358, "step": 905 }, { "epoch": 0.4121929026387625, "grad_norm": 0.7942007758965253, "learning_rate": 9.83361353096046e-07, "loss": 0.0413, "step": 906 }, { "epoch": 0.41264786169244766, "grad_norm": 1.2834313529857662, "learning_rate": 9.833247681645082e-07, "loss": 0.1026, "step": 907 }, { "epoch": 0.41310282074613286, "grad_norm": 1.3287516231216665, "learning_rate": 9.832881437378532e-07, "loss": 0.0565, "step": 908 }, { "epoch": 0.413557779799818, "grad_norm": 0.648324329334497, "learning_rate": 9.832514798190737e-07, "loss": 0.0306, "step": 909 }, { "epoch": 0.4140127388535032, "grad_norm": 0.6098894352961793, "learning_rate": 9.832147764111655e-07, "loss": 0.0299, "step": 910 }, { "epoch": 0.41446769790718835, "grad_norm": 1.353367347362825, "learning_rate": 9.831780335171278e-07, "loss": 0.0621, "step": 911 }, { "epoch": 0.41492265696087355, "grad_norm": 0.9815179863684923, "learning_rate": 9.831412511399633e-07, "loss": 0.0625, "step": 912 }, { "epoch": 0.4153776160145587, "grad_norm": 1.2817649685411325, "learning_rate": 9.831044292826776e-07, "loss": 0.0838, "step": 913 }, { "epoch": 0.41583257506824384, "grad_norm": 0.9767364821189666, "learning_rate": 9.830675679482796e-07, "loss": 0.0539, "step": 914 }, { "epoch": 0.41628753412192904, "grad_norm": 1.4965019727546385, "learning_rate": 9.830306671397815e-07, "loss": 0.0733, "step": 915 }, { "epoch": 0.4167424931756142, "grad_norm": 1.2495593098984477, "learning_rate": 9.829937268601987e-07, "loss": 0.0812, "step": 916 }, { "epoch": 0.4171974522292994, "grad_norm": 0.7819192350186043, "learning_rate": 9.829567471125496e-07, "loss": 0.0489, "step": 917 }, { "epoch": 0.4176524112829845, "grad_norm": 0.8321452241120129, "learning_rate": 9.82919727899856e-07, "loss": 0.0504, "step": 918 }, { "epoch": 0.4181073703366697, "grad_norm": 0.8565914847996501, "learning_rate": 9.828826692251433e-07, "loss": 0.0504, "step": 919 }, { "epoch": 0.41856232939035487, "grad_norm": 0.9314762402609892, "learning_rate": 9.828455710914397e-07, "loss": 0.0392, "step": 920 }, { "epoch": 0.41901728844404, "grad_norm": 1.0898568664935744, "learning_rate": 9.828084335017763e-07, "loss": 0.0484, "step": 921 }, { "epoch": 0.4194722474977252, "grad_norm": 0.8848782682518258, "learning_rate": 9.82771256459188e-07, "loss": 0.0378, "step": 922 }, { "epoch": 0.41992720655141036, "grad_norm": 1.0161076224445202, "learning_rate": 9.82734039966713e-07, "loss": 0.0409, "step": 923 }, { "epoch": 0.42038216560509556, "grad_norm": 1.3606451167571607, "learning_rate": 9.82696784027392e-07, "loss": 0.043, "step": 924 }, { "epoch": 0.4208371246587807, "grad_norm": 1.3709123135581351, "learning_rate": 9.8265948864427e-07, "loss": 0.0548, "step": 925 }, { "epoch": 0.4212920837124659, "grad_norm": 0.9198603388302915, "learning_rate": 9.82622153820394e-07, "loss": 0.0511, "step": 926 }, { "epoch": 0.42174704276615105, "grad_norm": 1.0593981577901854, "learning_rate": 9.825847795588152e-07, "loss": 0.0503, "step": 927 }, { "epoch": 0.4222020018198362, "grad_norm": 1.0257348564038227, "learning_rate": 9.825473658625875e-07, "loss": 0.0624, "step": 928 }, { "epoch": 0.4226569608735214, "grad_norm": 1.0142424821591325, "learning_rate": 9.825099127347683e-07, "loss": 0.0594, "step": 929 }, { "epoch": 0.42311191992720654, "grad_norm": 1.159116040175061, "learning_rate": 9.82472420178418e-07, "loss": 0.0833, "step": 930 }, { "epoch": 0.42356687898089174, "grad_norm": 0.8827860011310269, "learning_rate": 9.824348881966002e-07, "loss": 0.0464, "step": 931 }, { "epoch": 0.4240218380345769, "grad_norm": 1.2835202383988082, "learning_rate": 9.823973167923823e-07, "loss": 0.0666, "step": 932 }, { "epoch": 0.4244767970882621, "grad_norm": 1.2737835333143062, "learning_rate": 9.82359705968834e-07, "loss": 0.065, "step": 933 }, { "epoch": 0.4249317561419472, "grad_norm": 0.9090780325024205, "learning_rate": 9.823220557290287e-07, "loss": 0.0514, "step": 934 }, { "epoch": 0.42538671519563237, "grad_norm": 0.9038507665138411, "learning_rate": 9.822843660760432e-07, "loss": 0.0312, "step": 935 }, { "epoch": 0.42584167424931757, "grad_norm": 1.2589497024600078, "learning_rate": 9.822466370129575e-07, "loss": 0.0742, "step": 936 }, { "epoch": 0.4262966333030027, "grad_norm": 1.299945464807195, "learning_rate": 9.822088685428542e-07, "loss": 0.051, "step": 937 }, { "epoch": 0.4267515923566879, "grad_norm": 0.9114166392179066, "learning_rate": 9.8217106066882e-07, "loss": 0.0404, "step": 938 }, { "epoch": 0.42720655141037306, "grad_norm": 0.9536305957394986, "learning_rate": 9.82133213393944e-07, "loss": 0.0434, "step": 939 }, { "epoch": 0.42766151046405826, "grad_norm": 1.3223760169593808, "learning_rate": 9.820953267213191e-07, "loss": 0.0418, "step": 940 }, { "epoch": 0.4281164695177434, "grad_norm": 1.3159077198062077, "learning_rate": 9.820574006540415e-07, "loss": 0.0365, "step": 941 }, { "epoch": 0.42857142857142855, "grad_norm": 0.8538535162017564, "learning_rate": 9.820194351952097e-07, "loss": 0.0436, "step": 942 }, { "epoch": 0.42902638762511375, "grad_norm": 0.8832804648886509, "learning_rate": 9.819814303479267e-07, "loss": 0.0415, "step": 943 }, { "epoch": 0.4294813466787989, "grad_norm": 1.6560942227663815, "learning_rate": 9.819433861152976e-07, "loss": 0.0851, "step": 944 }, { "epoch": 0.4299363057324841, "grad_norm": 0.9642762227140381, "learning_rate": 9.819053025004316e-07, "loss": 0.0695, "step": 945 }, { "epoch": 0.43039126478616924, "grad_norm": 1.1149732970537978, "learning_rate": 9.818671795064403e-07, "loss": 0.0604, "step": 946 }, { "epoch": 0.43084622383985444, "grad_norm": 0.9985696761415442, "learning_rate": 9.818290171364394e-07, "loss": 0.064, "step": 947 }, { "epoch": 0.4313011828935396, "grad_norm": 1.19602347930505, "learning_rate": 9.817908153935473e-07, "loss": 0.0422, "step": 948 }, { "epoch": 0.4317561419472247, "grad_norm": 1.1639522083754654, "learning_rate": 9.817525742808852e-07, "loss": 0.0954, "step": 949 }, { "epoch": 0.4322111010009099, "grad_norm": 1.0196471273879122, "learning_rate": 9.817142938015785e-07, "loss": 0.0518, "step": 950 }, { "epoch": 0.43266606005459507, "grad_norm": 1.899781616643873, "learning_rate": 9.81675973958755e-07, "loss": 0.0458, "step": 951 }, { "epoch": 0.43312101910828027, "grad_norm": 0.9403413572170655, "learning_rate": 9.816376147555463e-07, "loss": 0.0453, "step": 952 }, { "epoch": 0.4335759781619654, "grad_norm": 1.3393667012235977, "learning_rate": 9.815992161950865e-07, "loss": 0.0995, "step": 953 }, { "epoch": 0.4340309372156506, "grad_norm": 0.8253383055124498, "learning_rate": 9.815607782805139e-07, "loss": 0.0429, "step": 954 }, { "epoch": 0.43448589626933576, "grad_norm": 1.4804848000003228, "learning_rate": 9.815223010149691e-07, "loss": 0.0609, "step": 955 }, { "epoch": 0.4349408553230209, "grad_norm": 1.3048284591571215, "learning_rate": 9.814837844015964e-07, "loss": 0.0637, "step": 956 }, { "epoch": 0.4353958143767061, "grad_norm": 1.0127450932166644, "learning_rate": 9.81445228443543e-07, "loss": 0.0418, "step": 957 }, { "epoch": 0.43585077343039125, "grad_norm": 1.0521176750295256, "learning_rate": 9.8140663314396e-07, "loss": 0.053, "step": 958 }, { "epoch": 0.43630573248407645, "grad_norm": 1.0741290803191978, "learning_rate": 9.81367998506001e-07, "loss": 0.0662, "step": 959 }, { "epoch": 0.4367606915377616, "grad_norm": 1.0570170180743712, "learning_rate": 9.81329324532823e-07, "loss": 0.0608, "step": 960 }, { "epoch": 0.4372156505914468, "grad_norm": 1.043789615398663, "learning_rate": 9.81290611227586e-07, "loss": 0.0376, "step": 961 }, { "epoch": 0.43767060964513194, "grad_norm": 1.6102611480581357, "learning_rate": 9.81251858593454e-07, "loss": 0.0956, "step": 962 }, { "epoch": 0.4381255686988171, "grad_norm": 1.3437963757532758, "learning_rate": 9.812130666335931e-07, "loss": 0.0657, "step": 963 }, { "epoch": 0.4385805277525023, "grad_norm": 1.0607440727548485, "learning_rate": 9.811742353511738e-07, "loss": 0.0733, "step": 964 }, { "epoch": 0.4390354868061874, "grad_norm": 0.9095244532763905, "learning_rate": 9.81135364749369e-07, "loss": 0.0532, "step": 965 }, { "epoch": 0.4394904458598726, "grad_norm": 0.6738085340590958, "learning_rate": 9.810964548313547e-07, "loss": 0.0384, "step": 966 }, { "epoch": 0.43994540491355777, "grad_norm": 1.130077415136585, "learning_rate": 9.81057505600311e-07, "loss": 0.0473, "step": 967 }, { "epoch": 0.44040036396724297, "grad_norm": 0.8767821219378822, "learning_rate": 9.810185170594204e-07, "loss": 0.0524, "step": 968 }, { "epoch": 0.4408553230209281, "grad_norm": 1.2011645349880367, "learning_rate": 9.809794892118686e-07, "loss": 0.059, "step": 969 }, { "epoch": 0.44131028207461326, "grad_norm": 0.9783184451795246, "learning_rate": 9.80940422060845e-07, "loss": 0.0431, "step": 970 }, { "epoch": 0.44176524112829846, "grad_norm": 1.135564117297465, "learning_rate": 9.809013156095423e-07, "loss": 0.0412, "step": 971 }, { "epoch": 0.4422202001819836, "grad_norm": 1.2733568897021437, "learning_rate": 9.808621698611556e-07, "loss": 0.0546, "step": 972 }, { "epoch": 0.4426751592356688, "grad_norm": 0.887227787175639, "learning_rate": 9.80822984818884e-07, "loss": 0.0417, "step": 973 }, { "epoch": 0.44313011828935395, "grad_norm": 0.9955330389532967, "learning_rate": 9.807837604859294e-07, "loss": 0.057, "step": 974 }, { "epoch": 0.44358507734303915, "grad_norm": 0.7348704026221938, "learning_rate": 9.807444968654972e-07, "loss": 0.038, "step": 975 }, { "epoch": 0.4440400363967243, "grad_norm": 1.2924164665527813, "learning_rate": 9.807051939607957e-07, "loss": 0.0778, "step": 976 }, { "epoch": 0.44449499545040944, "grad_norm": 1.0108551442415985, "learning_rate": 9.806658517750367e-07, "loss": 0.0466, "step": 977 }, { "epoch": 0.44494995450409464, "grad_norm": 0.9502035931604473, "learning_rate": 9.806264703114348e-07, "loss": 0.0574, "step": 978 }, { "epoch": 0.4454049135577798, "grad_norm": 1.0572284939752399, "learning_rate": 9.805870495732083e-07, "loss": 0.0462, "step": 979 }, { "epoch": 0.445859872611465, "grad_norm": 1.3042646141268028, "learning_rate": 9.805475895635786e-07, "loss": 0.0517, "step": 980 }, { "epoch": 0.4463148316651501, "grad_norm": 1.019826185217419, "learning_rate": 9.805080902857697e-07, "loss": 0.0608, "step": 981 }, { "epoch": 0.4467697907188353, "grad_norm": 1.046442166704059, "learning_rate": 9.804685517430098e-07, "loss": 0.0434, "step": 982 }, { "epoch": 0.44722474977252047, "grad_norm": 1.5014734744524338, "learning_rate": 9.804289739385296e-07, "loss": 0.083, "step": 983 }, { "epoch": 0.44767970882620567, "grad_norm": 0.7390261403887619, "learning_rate": 9.803893568755632e-07, "loss": 0.0577, "step": 984 }, { "epoch": 0.4481346678798908, "grad_norm": 0.7443056555180294, "learning_rate": 9.80349700557348e-07, "loss": 0.0475, "step": 985 }, { "epoch": 0.44858962693357596, "grad_norm": 0.8068638298412407, "learning_rate": 9.803100049871244e-07, "loss": 0.054, "step": 986 }, { "epoch": 0.44904458598726116, "grad_norm": 0.6890299122051128, "learning_rate": 9.802702701681364e-07, "loss": 0.0432, "step": 987 }, { "epoch": 0.4494995450409463, "grad_norm": 1.159749571194803, "learning_rate": 9.80230496103631e-07, "loss": 0.0398, "step": 988 }, { "epoch": 0.4499545040946315, "grad_norm": 0.7351217592794722, "learning_rate": 9.801906827968577e-07, "loss": 0.0425, "step": 989 }, { "epoch": 0.45040946314831665, "grad_norm": 1.324447326217995, "learning_rate": 9.801508302510706e-07, "loss": 0.097, "step": 990 }, { "epoch": 0.45086442220200185, "grad_norm": 0.8908007592805687, "learning_rate": 9.80110938469526e-07, "loss": 0.0417, "step": 991 }, { "epoch": 0.451319381255687, "grad_norm": 1.0429516789807913, "learning_rate": 9.800710074554836e-07, "loss": 0.0624, "step": 992 }, { "epoch": 0.45177434030937214, "grad_norm": 0.964007179484218, "learning_rate": 9.800310372122065e-07, "loss": 0.0481, "step": 993 }, { "epoch": 0.45222929936305734, "grad_norm": 1.5993545653581558, "learning_rate": 9.799910277429609e-07, "loss": 0.0444, "step": 994 }, { "epoch": 0.4526842584167425, "grad_norm": 0.8789912164867874, "learning_rate": 9.799509790510158e-07, "loss": 0.0443, "step": 995 }, { "epoch": 0.4531392174704277, "grad_norm": 1.132579986027016, "learning_rate": 9.799108911396445e-07, "loss": 0.058, "step": 996 }, { "epoch": 0.4535941765241128, "grad_norm": 0.9421551240630235, "learning_rate": 9.798707640121223e-07, "loss": 0.0434, "step": 997 }, { "epoch": 0.454049135577798, "grad_norm": 1.2599492665066678, "learning_rate": 9.798305976717283e-07, "loss": 0.0774, "step": 998 }, { "epoch": 0.45450409463148317, "grad_norm": 0.8095181551992111, "learning_rate": 9.79790392121745e-07, "loss": 0.0436, "step": 999 }, { "epoch": 0.4549590536851683, "grad_norm": 1.0925614426508448, "learning_rate": 9.797501473654573e-07, "loss": 0.0626, "step": 1000 }, { "epoch": 0.4554140127388535, "grad_norm": 1.866450866140287, "learning_rate": 9.797098634061542e-07, "loss": 0.0637, "step": 1001 }, { "epoch": 0.45586897179253866, "grad_norm": 1.136977904386429, "learning_rate": 9.796695402471273e-07, "loss": 0.062, "step": 1002 }, { "epoch": 0.45632393084622386, "grad_norm": 1.160254261312887, "learning_rate": 9.79629177891672e-07, "loss": 0.0711, "step": 1003 }, { "epoch": 0.456778889899909, "grad_norm": 1.0972761617835352, "learning_rate": 9.795887763430859e-07, "loss": 0.0597, "step": 1004 }, { "epoch": 0.4572338489535942, "grad_norm": 1.48108282092481, "learning_rate": 9.79548335604671e-07, "loss": 0.0649, "step": 1005 }, { "epoch": 0.45768880800727935, "grad_norm": 1.070893790350091, "learning_rate": 9.795078556797318e-07, "loss": 0.0435, "step": 1006 }, { "epoch": 0.4581437670609645, "grad_norm": 1.2263881558353562, "learning_rate": 9.79467336571576e-07, "loss": 0.0637, "step": 1007 }, { "epoch": 0.4585987261146497, "grad_norm": 0.9241417909544685, "learning_rate": 9.794267782835147e-07, "loss": 0.0554, "step": 1008 }, { "epoch": 0.45905368516833484, "grad_norm": 1.2054716884074999, "learning_rate": 9.79386180818862e-07, "loss": 0.046, "step": 1009 }, { "epoch": 0.45950864422202004, "grad_norm": 1.424213934952081, "learning_rate": 9.793455441809357e-07, "loss": 0.0666, "step": 1010 }, { "epoch": 0.4599636032757052, "grad_norm": 0.6931104553091149, "learning_rate": 9.793048683730564e-07, "loss": 0.0362, "step": 1011 }, { "epoch": 0.4604185623293904, "grad_norm": 1.1589470890505629, "learning_rate": 9.792641533985472e-07, "loss": 0.079, "step": 1012 }, { "epoch": 0.4608735213830755, "grad_norm": 0.7229466174773208, "learning_rate": 9.792233992607362e-07, "loss": 0.0475, "step": 1013 }, { "epoch": 0.46132848043676067, "grad_norm": 1.1682237634463724, "learning_rate": 9.791826059629532e-07, "loss": 0.0514, "step": 1014 }, { "epoch": 0.46178343949044587, "grad_norm": 1.2251167178478946, "learning_rate": 9.791417735085314e-07, "loss": 0.067, "step": 1015 }, { "epoch": 0.462238398544131, "grad_norm": 1.054543646753414, "learning_rate": 9.791009019008075e-07, "loss": 0.0579, "step": 1016 }, { "epoch": 0.4626933575978162, "grad_norm": 0.9458283317798665, "learning_rate": 9.790599911431218e-07, "loss": 0.0591, "step": 1017 }, { "epoch": 0.46314831665150136, "grad_norm": 1.1527898157513183, "learning_rate": 9.790190412388171e-07, "loss": 0.0678, "step": 1018 }, { "epoch": 0.46360327570518656, "grad_norm": 1.136178230349509, "learning_rate": 9.789780521912394e-07, "loss": 0.0595, "step": 1019 }, { "epoch": 0.4640582347588717, "grad_norm": 1.1508642406552656, "learning_rate": 9.789370240037383e-07, "loss": 0.058, "step": 1020 }, { "epoch": 0.46451319381255685, "grad_norm": 0.8996475929285287, "learning_rate": 9.788959566796667e-07, "loss": 0.0538, "step": 1021 }, { "epoch": 0.46496815286624205, "grad_norm": 1.1208725149864451, "learning_rate": 9.7885485022238e-07, "loss": 0.0644, "step": 1022 }, { "epoch": 0.4654231119199272, "grad_norm": 5.792648526072557, "learning_rate": 9.788137046352372e-07, "loss": 0.1674, "step": 1023 }, { "epoch": 0.4658780709736124, "grad_norm": 0.945256522894609, "learning_rate": 9.78772519921601e-07, "loss": 0.0554, "step": 1024 }, { "epoch": 0.46633303002729753, "grad_norm": 0.6667171688196832, "learning_rate": 9.787312960848366e-07, "loss": 0.029, "step": 1025 }, { "epoch": 0.46678798908098273, "grad_norm": 1.0532098931525624, "learning_rate": 9.786900331283126e-07, "loss": 0.0515, "step": 1026 }, { "epoch": 0.4672429481346679, "grad_norm": 0.7943259984453677, "learning_rate": 9.78648731055401e-07, "loss": 0.0483, "step": 1027 }, { "epoch": 0.467697907188353, "grad_norm": 0.978696986547437, "learning_rate": 9.786073898694764e-07, "loss": 0.0517, "step": 1028 }, { "epoch": 0.4681528662420382, "grad_norm": 0.8681434748702896, "learning_rate": 9.785660095739174e-07, "loss": 0.0479, "step": 1029 }, { "epoch": 0.46860782529572337, "grad_norm": 1.2394248470683742, "learning_rate": 9.785245901721053e-07, "loss": 0.0428, "step": 1030 }, { "epoch": 0.46906278434940857, "grad_norm": 0.9076906046041907, "learning_rate": 9.784831316674246e-07, "loss": 0.0525, "step": 1031 }, { "epoch": 0.4695177434030937, "grad_norm": 0.9316086626853852, "learning_rate": 9.784416340632633e-07, "loss": 0.0459, "step": 1032 }, { "epoch": 0.4699727024567789, "grad_norm": 0.6963474019277235, "learning_rate": 9.784000973630121e-07, "loss": 0.0362, "step": 1033 }, { "epoch": 0.47042766151046406, "grad_norm": 0.8693961923175989, "learning_rate": 9.783585215700657e-07, "loss": 0.0505, "step": 1034 }, { "epoch": 0.4708826205641492, "grad_norm": 1.6279925048731907, "learning_rate": 9.783169066878208e-07, "loss": 0.0772, "step": 1035 }, { "epoch": 0.4713375796178344, "grad_norm": 0.9261194988626575, "learning_rate": 9.782752527196784e-07, "loss": 0.0526, "step": 1036 }, { "epoch": 0.47179253867151955, "grad_norm": 1.0819950999537684, "learning_rate": 9.782335596690424e-07, "loss": 0.0506, "step": 1037 }, { "epoch": 0.47224749772520475, "grad_norm": 1.6187783952358246, "learning_rate": 9.781918275393195e-07, "loss": 0.0808, "step": 1038 }, { "epoch": 0.4727024567788899, "grad_norm": 1.077624914044847, "learning_rate": 9.7815005633392e-07, "loss": 0.0512, "step": 1039 }, { "epoch": 0.4731574158325751, "grad_norm": 0.8675486033170192, "learning_rate": 9.781082460562572e-07, "loss": 0.0377, "step": 1040 }, { "epoch": 0.47361237488626023, "grad_norm": 0.9963042507500924, "learning_rate": 9.780663967097476e-07, "loss": 0.0519, "step": 1041 }, { "epoch": 0.4740673339399454, "grad_norm": 1.3571695488631474, "learning_rate": 9.78024508297811e-07, "loss": 0.0678, "step": 1042 }, { "epoch": 0.4745222929936306, "grad_norm": 0.9289339069521403, "learning_rate": 9.779825808238703e-07, "loss": 0.0594, "step": 1043 }, { "epoch": 0.4749772520473157, "grad_norm": 1.2845761401988007, "learning_rate": 9.779406142913517e-07, "loss": 0.0598, "step": 1044 }, { "epoch": 0.4754322111010009, "grad_norm": 0.785622670713627, "learning_rate": 9.778986087036845e-07, "loss": 0.0453, "step": 1045 }, { "epoch": 0.47588717015468607, "grad_norm": 0.7582808336992353, "learning_rate": 9.77856564064301e-07, "loss": 0.0317, "step": 1046 }, { "epoch": 0.47634212920837127, "grad_norm": 1.2341462990548693, "learning_rate": 9.778144803766372e-07, "loss": 0.0591, "step": 1047 }, { "epoch": 0.4767970882620564, "grad_norm": 0.7884103081988637, "learning_rate": 9.77772357644132e-07, "loss": 0.0451, "step": 1048 }, { "epoch": 0.47725204731574156, "grad_norm": 1.1566208474481872, "learning_rate": 9.777301958702272e-07, "loss": 0.0571, "step": 1049 }, { "epoch": 0.47770700636942676, "grad_norm": 1.3232336961035833, "learning_rate": 9.776879950583681e-07, "loss": 0.0856, "step": 1050 }, { "epoch": 0.4781619654231119, "grad_norm": 0.8429412892983671, "learning_rate": 9.776457552120033e-07, "loss": 0.0541, "step": 1051 }, { "epoch": 0.4786169244767971, "grad_norm": 1.2195545478343996, "learning_rate": 9.776034763345844e-07, "loss": 0.0569, "step": 1052 }, { "epoch": 0.47907188353048225, "grad_norm": 0.8614062827162355, "learning_rate": 9.775611584295663e-07, "loss": 0.0485, "step": 1053 }, { "epoch": 0.47952684258416745, "grad_norm": 1.650731912815663, "learning_rate": 9.77518801500407e-07, "loss": 0.0612, "step": 1054 }, { "epoch": 0.4799818016378526, "grad_norm": 1.2752214057449334, "learning_rate": 9.774764055505676e-07, "loss": 0.0668, "step": 1055 }, { "epoch": 0.48043676069153773, "grad_norm": 1.003141212433932, "learning_rate": 9.774339705835127e-07, "loss": 0.0572, "step": 1056 }, { "epoch": 0.48089171974522293, "grad_norm": 1.0762439479749761, "learning_rate": 9.773914966027097e-07, "loss": 0.0705, "step": 1057 }, { "epoch": 0.4813466787989081, "grad_norm": 1.4172864833888303, "learning_rate": 9.773489836116296e-07, "loss": 0.0827, "step": 1058 }, { "epoch": 0.4818016378525933, "grad_norm": 1.0186327622827214, "learning_rate": 9.773064316137462e-07, "loss": 0.0459, "step": 1059 }, { "epoch": 0.4822565969062784, "grad_norm": 1.074804237713067, "learning_rate": 9.772638406125366e-07, "loss": 0.0451, "step": 1060 }, { "epoch": 0.4827115559599636, "grad_norm": 1.1638107814187253, "learning_rate": 9.772212106114813e-07, "loss": 0.0545, "step": 1061 }, { "epoch": 0.48316651501364877, "grad_norm": 1.1954170062377587, "learning_rate": 9.77178541614064e-07, "loss": 0.0725, "step": 1062 }, { "epoch": 0.48362147406733397, "grad_norm": 0.9350669595951756, "learning_rate": 9.77135833623771e-07, "loss": 0.0639, "step": 1063 }, { "epoch": 0.4840764331210191, "grad_norm": 1.7339475206961852, "learning_rate": 9.770930866440926e-07, "loss": 0.0754, "step": 1064 }, { "epoch": 0.48453139217470426, "grad_norm": 0.9566617627121812, "learning_rate": 9.770503006785213e-07, "loss": 0.0487, "step": 1065 }, { "epoch": 0.48498635122838946, "grad_norm": 1.287693274855434, "learning_rate": 9.77007475730554e-07, "loss": 0.0768, "step": 1066 }, { "epoch": 0.4854413102820746, "grad_norm": 0.9657792241032565, "learning_rate": 9.7696461180369e-07, "loss": 0.0431, "step": 1067 }, { "epoch": 0.4858962693357598, "grad_norm": 1.0440790618440403, "learning_rate": 9.769217089014318e-07, "loss": 0.0414, "step": 1068 }, { "epoch": 0.48635122838944495, "grad_norm": 0.7222195506323853, "learning_rate": 9.768787670272853e-07, "loss": 0.0454, "step": 1069 }, { "epoch": 0.48680618744313015, "grad_norm": 1.0668202166945229, "learning_rate": 9.768357861847596e-07, "loss": 0.049, "step": 1070 }, { "epoch": 0.4872611464968153, "grad_norm": 1.2655304325798473, "learning_rate": 9.767927663773667e-07, "loss": 0.0558, "step": 1071 }, { "epoch": 0.48771610555050043, "grad_norm": 1.1538925000557327, "learning_rate": 9.76749707608622e-07, "loss": 0.0539, "step": 1072 }, { "epoch": 0.48817106460418563, "grad_norm": 0.9157630861579026, "learning_rate": 9.767066098820445e-07, "loss": 0.0461, "step": 1073 }, { "epoch": 0.4886260236578708, "grad_norm": 1.3427056590735855, "learning_rate": 9.766634732011556e-07, "loss": 0.0653, "step": 1074 }, { "epoch": 0.489080982711556, "grad_norm": 1.2721417282214194, "learning_rate": 9.766202975694799e-07, "loss": 0.0616, "step": 1075 }, { "epoch": 0.4895359417652411, "grad_norm": 1.0672220340486525, "learning_rate": 9.765770829905461e-07, "loss": 0.052, "step": 1076 }, { "epoch": 0.4899909008189263, "grad_norm": 0.7259192603123809, "learning_rate": 9.765338294678854e-07, "loss": 0.0324, "step": 1077 }, { "epoch": 0.49044585987261147, "grad_norm": 0.8758885825731395, "learning_rate": 9.76490537005032e-07, "loss": 0.0376, "step": 1078 }, { "epoch": 0.4909008189262966, "grad_norm": 1.1904182046379264, "learning_rate": 9.76447205605524e-07, "loss": 0.0555, "step": 1079 }, { "epoch": 0.4913557779799818, "grad_norm": 0.9879755175600146, "learning_rate": 9.764038352729017e-07, "loss": 0.0559, "step": 1080 }, { "epoch": 0.49181073703366696, "grad_norm": 1.2086322889424477, "learning_rate": 9.763604260107094e-07, "loss": 0.0707, "step": 1081 }, { "epoch": 0.49226569608735216, "grad_norm": 1.0891170249854678, "learning_rate": 9.763169778224946e-07, "loss": 0.0498, "step": 1082 }, { "epoch": 0.4927206551410373, "grad_norm": 0.9746260178729034, "learning_rate": 9.76273490711807e-07, "loss": 0.0533, "step": 1083 }, { "epoch": 0.4931756141947225, "grad_norm": 0.9150343031531886, "learning_rate": 9.762299646822007e-07, "loss": 0.0327, "step": 1084 }, { "epoch": 0.49363057324840764, "grad_norm": 0.8158201586437064, "learning_rate": 9.761863997372325e-07, "loss": 0.0363, "step": 1085 }, { "epoch": 0.4940855323020928, "grad_norm": 1.3282332247950932, "learning_rate": 9.76142795880462e-07, "loss": 0.0843, "step": 1086 }, { "epoch": 0.494540491355778, "grad_norm": 2.1107554355128393, "learning_rate": 9.760991531154523e-07, "loss": 0.0842, "step": 1087 }, { "epoch": 0.49499545040946313, "grad_norm": 1.0031757741267602, "learning_rate": 9.760554714457701e-07, "loss": 0.0447, "step": 1088 }, { "epoch": 0.49545040946314833, "grad_norm": 0.6829151260348073, "learning_rate": 9.760117508749845e-07, "loss": 0.0474, "step": 1089 }, { "epoch": 0.4959053685168335, "grad_norm": 1.1924820574731907, "learning_rate": 9.759679914066684e-07, "loss": 0.0622, "step": 1090 }, { "epoch": 0.4963603275705187, "grad_norm": 0.9443341037777827, "learning_rate": 9.759241930443975e-07, "loss": 0.0494, "step": 1091 }, { "epoch": 0.4968152866242038, "grad_norm": 0.81742619980956, "learning_rate": 9.758803557917508e-07, "loss": 0.0382, "step": 1092 }, { "epoch": 0.49727024567788897, "grad_norm": 0.74626712847151, "learning_rate": 9.758364796523104e-07, "loss": 0.0292, "step": 1093 }, { "epoch": 0.49772520473157417, "grad_norm": 0.6810394296739468, "learning_rate": 9.757925646296616e-07, "loss": 0.0383, "step": 1094 }, { "epoch": 0.4981801637852593, "grad_norm": 1.2267886068341995, "learning_rate": 9.757486107273933e-07, "loss": 0.0662, "step": 1095 }, { "epoch": 0.4986351228389445, "grad_norm": 1.0020208071063283, "learning_rate": 9.757046179490968e-07, "loss": 0.0685, "step": 1096 }, { "epoch": 0.49909008189262966, "grad_norm": 0.625163416945547, "learning_rate": 9.756605862983674e-07, "loss": 0.0359, "step": 1097 }, { "epoch": 0.49954504094631486, "grad_norm": 0.7053481302726589, "learning_rate": 9.756165157788029e-07, "loss": 0.0287, "step": 1098 }, { "epoch": 0.5, "grad_norm": 1.0863147755303157, "learning_rate": 9.755724063940045e-07, "loss": 0.0575, "step": 1099 }, { "epoch": 0.5004549590536852, "grad_norm": 1.2533642860522964, "learning_rate": 9.755282581475767e-07, "loss": 0.0578, "step": 1100 }, { "epoch": 0.5009099181073703, "grad_norm": 1.0781474389599952, "learning_rate": 9.754840710431273e-07, "loss": 0.0518, "step": 1101 }, { "epoch": 0.5013648771610555, "grad_norm": 1.2463941019251055, "learning_rate": 9.754398450842667e-07, "loss": 0.0757, "step": 1102 }, { "epoch": 0.5018198362147407, "grad_norm": 1.3422279914613613, "learning_rate": 9.75395580274609e-07, "loss": 0.0965, "step": 1103 }, { "epoch": 0.5022747952684259, "grad_norm": 1.1532769405549157, "learning_rate": 9.753512766177715e-07, "loss": 0.0685, "step": 1104 }, { "epoch": 0.502729754322111, "grad_norm": 1.209251596568967, "learning_rate": 9.753069341173744e-07, "loss": 0.0452, "step": 1105 }, { "epoch": 0.5031847133757962, "grad_norm": 0.7797444472389766, "learning_rate": 9.752625527770408e-07, "loss": 0.047, "step": 1106 }, { "epoch": 0.5036396724294814, "grad_norm": 0.9331803105881898, "learning_rate": 9.752181326003978e-07, "loss": 0.0565, "step": 1107 }, { "epoch": 0.5040946314831665, "grad_norm": 1.201213474585957, "learning_rate": 9.751736735910752e-07, "loss": 0.0668, "step": 1108 }, { "epoch": 0.5045495905368517, "grad_norm": 1.3736880131164142, "learning_rate": 9.751291757527057e-07, "loss": 0.0732, "step": 1109 }, { "epoch": 0.5050045495905369, "grad_norm": 1.1566787005548642, "learning_rate": 9.750846390889258e-07, "loss": 0.0508, "step": 1110 }, { "epoch": 0.5054595086442221, "grad_norm": 1.1066713060588476, "learning_rate": 9.750400636033743e-07, "loss": 0.0599, "step": 1111 }, { "epoch": 0.5059144676979072, "grad_norm": 1.0305977865198839, "learning_rate": 9.749954492996945e-07, "loss": 0.064, "step": 1112 }, { "epoch": 0.5063694267515924, "grad_norm": 1.1011541484629546, "learning_rate": 9.749507961815314e-07, "loss": 0.0767, "step": 1113 }, { "epoch": 0.5068243858052776, "grad_norm": 0.8640641012407045, "learning_rate": 9.74906104252534e-07, "loss": 0.0419, "step": 1114 }, { "epoch": 0.5072793448589626, "grad_norm": 0.8955757785353544, "learning_rate": 9.748613735163548e-07, "loss": 0.0512, "step": 1115 }, { "epoch": 0.5077343039126478, "grad_norm": 0.9046112462073723, "learning_rate": 9.748166039766484e-07, "loss": 0.0322, "step": 1116 }, { "epoch": 0.508189262966333, "grad_norm": 1.1375357694260264, "learning_rate": 9.747717956370734e-07, "loss": 0.0717, "step": 1117 }, { "epoch": 0.5086442220200182, "grad_norm": 1.4731673275852457, "learning_rate": 9.747269485012911e-07, "loss": 0.0879, "step": 1118 }, { "epoch": 0.5090991810737033, "grad_norm": 1.0339482291055033, "learning_rate": 9.746820625729667e-07, "loss": 0.0481, "step": 1119 }, { "epoch": 0.5095541401273885, "grad_norm": 0.8698363530329196, "learning_rate": 9.746371378557676e-07, "loss": 0.0373, "step": 1120 }, { "epoch": 0.5100090991810737, "grad_norm": 1.360232150616582, "learning_rate": 9.745921743533651e-07, "loss": 0.0889, "step": 1121 }, { "epoch": 0.5104640582347588, "grad_norm": 0.8646685338663944, "learning_rate": 9.745471720694334e-07, "loss": 0.0672, "step": 1122 }, { "epoch": 0.510919017288444, "grad_norm": 1.097864569909349, "learning_rate": 9.745021310076497e-07, "loss": 0.0453, "step": 1123 }, { "epoch": 0.5113739763421292, "grad_norm": 0.8492737378880987, "learning_rate": 9.74457051171695e-07, "loss": 0.0573, "step": 1124 }, { "epoch": 0.5118289353958144, "grad_norm": 1.1456340016011937, "learning_rate": 9.744119325652524e-07, "loss": 0.0662, "step": 1125 }, { "epoch": 0.5122838944494995, "grad_norm": 0.9090895763998762, "learning_rate": 9.743667751920093e-07, "loss": 0.0472, "step": 1126 }, { "epoch": 0.5127388535031847, "grad_norm": 0.9413014101188506, "learning_rate": 9.743215790556553e-07, "loss": 0.0635, "step": 1127 }, { "epoch": 0.5131938125568699, "grad_norm": 0.844330322140278, "learning_rate": 9.742763441598841e-07, "loss": 0.0402, "step": 1128 }, { "epoch": 0.513648771610555, "grad_norm": 1.0998041827146376, "learning_rate": 9.742310705083917e-07, "loss": 0.056, "step": 1129 }, { "epoch": 0.5141037306642402, "grad_norm": 0.9500662399276046, "learning_rate": 9.741857581048779e-07, "loss": 0.0573, "step": 1130 }, { "epoch": 0.5145586897179254, "grad_norm": 0.6201204645400248, "learning_rate": 9.741404069530454e-07, "loss": 0.0332, "step": 1131 }, { "epoch": 0.5150136487716106, "grad_norm": 0.9410345680337132, "learning_rate": 9.740950170566e-07, "loss": 0.0559, "step": 1132 }, { "epoch": 0.5154686078252957, "grad_norm": 0.9840746274075391, "learning_rate": 9.740495884192508e-07, "loss": 0.0516, "step": 1133 }, { "epoch": 0.5159235668789809, "grad_norm": 0.82501021569373, "learning_rate": 9.740041210447101e-07, "loss": 0.0364, "step": 1134 }, { "epoch": 0.5163785259326661, "grad_norm": 0.9302654323576818, "learning_rate": 9.73958614936693e-07, "loss": 0.0456, "step": 1135 }, { "epoch": 0.5168334849863512, "grad_norm": 1.4136627478886525, "learning_rate": 9.739130700989185e-07, "loss": 0.0759, "step": 1136 }, { "epoch": 0.5172884440400364, "grad_norm": 2.514313121400006, "learning_rate": 9.738674865351079e-07, "loss": 0.1096, "step": 1137 }, { "epoch": 0.5177434030937216, "grad_norm": 0.7704028041051092, "learning_rate": 9.738218642489864e-07, "loss": 0.0516, "step": 1138 }, { "epoch": 0.5181983621474068, "grad_norm": 0.9995040503442165, "learning_rate": 9.737762032442818e-07, "loss": 0.056, "step": 1139 }, { "epoch": 0.5186533212010919, "grad_norm": 1.0170965536550989, "learning_rate": 9.737305035247256e-07, "loss": 0.0397, "step": 1140 }, { "epoch": 0.5191082802547771, "grad_norm": 0.8140203306358152, "learning_rate": 9.736847650940518e-07, "loss": 0.0435, "step": 1141 }, { "epoch": 0.5195632393084623, "grad_norm": 0.8543937772075509, "learning_rate": 9.736389879559984e-07, "loss": 0.0308, "step": 1142 }, { "epoch": 0.5200181983621474, "grad_norm": 0.815515743939858, "learning_rate": 9.735931721143055e-07, "loss": 0.0471, "step": 1143 }, { "epoch": 0.5204731574158326, "grad_norm": 1.3333374378355076, "learning_rate": 9.735473175727176e-07, "loss": 0.07, "step": 1144 }, { "epoch": 0.5209281164695178, "grad_norm": 1.4489392246960282, "learning_rate": 9.735014243349814e-07, "loss": 0.0771, "step": 1145 }, { "epoch": 0.521383075523203, "grad_norm": 1.1887250270437455, "learning_rate": 9.73455492404847e-07, "loss": 0.054, "step": 1146 }, { "epoch": 0.521838034576888, "grad_norm": 1.1993838165834463, "learning_rate": 9.734095217860677e-07, "loss": 0.0802, "step": 1147 }, { "epoch": 0.5222929936305732, "grad_norm": 1.1327880949455273, "learning_rate": 9.733635124824004e-07, "loss": 0.0975, "step": 1148 }, { "epoch": 0.5227479526842584, "grad_norm": 0.8746161772661623, "learning_rate": 9.733174644976046e-07, "loss": 0.0636, "step": 1149 }, { "epoch": 0.5232029117379435, "grad_norm": 1.6652392917054692, "learning_rate": 9.73271377835443e-07, "loss": 0.0887, "step": 1150 }, { "epoch": 0.5236578707916287, "grad_norm": 1.180756572528785, "learning_rate": 9.732252524996816e-07, "loss": 0.0684, "step": 1151 }, { "epoch": 0.5241128298453139, "grad_norm": 1.7472171018340186, "learning_rate": 9.731790884940898e-07, "loss": 0.0746, "step": 1152 }, { "epoch": 0.5245677888989991, "grad_norm": 0.7179433370045839, "learning_rate": 9.731328858224398e-07, "loss": 0.0402, "step": 1153 }, { "epoch": 0.5250227479526842, "grad_norm": 1.2994566371078677, "learning_rate": 9.730866444885067e-07, "loss": 0.0594, "step": 1154 }, { "epoch": 0.5254777070063694, "grad_norm": 1.2198370493987598, "learning_rate": 9.730403644960696e-07, "loss": 0.064, "step": 1155 }, { "epoch": 0.5259326660600546, "grad_norm": 0.7906925171293432, "learning_rate": 9.729940458489103e-07, "loss": 0.0357, "step": 1156 }, { "epoch": 0.5263876251137397, "grad_norm": 0.7123072744290468, "learning_rate": 9.729476885508135e-07, "loss": 0.037, "step": 1157 }, { "epoch": 0.5268425841674249, "grad_norm": 0.8478347985286624, "learning_rate": 9.729012926055672e-07, "loss": 0.0451, "step": 1158 }, { "epoch": 0.5272975432211101, "grad_norm": 1.2904030356507508, "learning_rate": 9.72854858016963e-07, "loss": 0.0713, "step": 1159 }, { "epoch": 0.5277525022747953, "grad_norm": 1.1495637377877812, "learning_rate": 9.728083847887954e-07, "loss": 0.0515, "step": 1160 }, { "epoch": 0.5282074613284804, "grad_norm": 1.1051423154727984, "learning_rate": 9.727618729248617e-07, "loss": 0.0485, "step": 1161 }, { "epoch": 0.5286624203821656, "grad_norm": 0.8959362577870547, "learning_rate": 9.727153224289626e-07, "loss": 0.058, "step": 1162 }, { "epoch": 0.5291173794358508, "grad_norm": 1.2736348105423334, "learning_rate": 9.726687333049022e-07, "loss": 0.0635, "step": 1163 }, { "epoch": 0.5295723384895359, "grad_norm": 1.1517296646491162, "learning_rate": 9.726221055564874e-07, "loss": 0.0497, "step": 1164 }, { "epoch": 0.5300272975432211, "grad_norm": 1.0042734308191221, "learning_rate": 9.725754391875284e-07, "loss": 0.0505, "step": 1165 }, { "epoch": 0.5304822565969063, "grad_norm": 1.380044375399426, "learning_rate": 9.725287342018387e-07, "loss": 0.0549, "step": 1166 }, { "epoch": 0.5309372156505915, "grad_norm": 0.6826718085711656, "learning_rate": 9.72481990603235e-07, "loss": 0.0553, "step": 1167 }, { "epoch": 0.5313921747042766, "grad_norm": 1.4194509510180409, "learning_rate": 9.724352083955365e-07, "loss": 0.0785, "step": 1168 }, { "epoch": 0.5318471337579618, "grad_norm": 0.9268727611277036, "learning_rate": 9.723883875825663e-07, "loss": 0.0602, "step": 1169 }, { "epoch": 0.532302092811647, "grad_norm": 0.8275631679366757, "learning_rate": 9.723415281681503e-07, "loss": 0.0404, "step": 1170 }, { "epoch": 0.5327570518653321, "grad_norm": 0.9105594389508459, "learning_rate": 9.722946301561177e-07, "loss": 0.0537, "step": 1171 }, { "epoch": 0.5332120109190173, "grad_norm": 1.100931337451936, "learning_rate": 9.722476935503009e-07, "loss": 0.068, "step": 1172 }, { "epoch": 0.5336669699727025, "grad_norm": 0.8064156001937169, "learning_rate": 9.722007183545352e-07, "loss": 0.0442, "step": 1173 }, { "epoch": 0.5341219290263877, "grad_norm": 1.0318297444473845, "learning_rate": 9.721537045726593e-07, "loss": 0.0411, "step": 1174 }, { "epoch": 0.5345768880800728, "grad_norm": 1.054739977118117, "learning_rate": 9.721066522085147e-07, "loss": 0.0527, "step": 1175 }, { "epoch": 0.535031847133758, "grad_norm": 0.9018779043134231, "learning_rate": 9.720595612659467e-07, "loss": 0.073, "step": 1176 }, { "epoch": 0.5354868061874432, "grad_norm": 1.0443056594752111, "learning_rate": 9.72012431748803e-07, "loss": 0.048, "step": 1177 }, { "epoch": 0.5359417652411284, "grad_norm": 0.9653954537008506, "learning_rate": 9.71965263660935e-07, "loss": 0.0594, "step": 1178 }, { "epoch": 0.5363967242948134, "grad_norm": 0.6827358160614889, "learning_rate": 9.71918057006197e-07, "loss": 0.0506, "step": 1179 }, { "epoch": 0.5368516833484986, "grad_norm": 0.9604437351628684, "learning_rate": 9.718708117884467e-07, "loss": 0.0599, "step": 1180 }, { "epoch": 0.5373066424021838, "grad_norm": 0.9281377765752405, "learning_rate": 9.718235280115444e-07, "loss": 0.0564, "step": 1181 }, { "epoch": 0.5377616014558689, "grad_norm": 1.3539170396929319, "learning_rate": 9.717762056793544e-07, "loss": 0.0796, "step": 1182 }, { "epoch": 0.5382165605095541, "grad_norm": 1.0893717452517335, "learning_rate": 9.717288447957432e-07, "loss": 0.0536, "step": 1183 }, { "epoch": 0.5386715195632393, "grad_norm": 1.0424356769768954, "learning_rate": 9.71681445364581e-07, "loss": 0.0664, "step": 1184 }, { "epoch": 0.5391264786169245, "grad_norm": 0.8230553564584211, "learning_rate": 9.716340073897414e-07, "loss": 0.0485, "step": 1185 }, { "epoch": 0.5395814376706096, "grad_norm": 1.3030278525380978, "learning_rate": 9.715865308751006e-07, "loss": 0.0386, "step": 1186 }, { "epoch": 0.5400363967242948, "grad_norm": 1.6592622728734634, "learning_rate": 9.71539015824538e-07, "loss": 0.1015, "step": 1187 }, { "epoch": 0.54049135577798, "grad_norm": 0.8779802868532464, "learning_rate": 9.714914622419366e-07, "loss": 0.0535, "step": 1188 }, { "epoch": 0.5409463148316651, "grad_norm": 0.861626332354384, "learning_rate": 9.71443870131182e-07, "loss": 0.0506, "step": 1189 }, { "epoch": 0.5414012738853503, "grad_norm": 0.861194990409651, "learning_rate": 9.713962394961635e-07, "loss": 0.0516, "step": 1190 }, { "epoch": 0.5418562329390355, "grad_norm": 0.7632297847635168, "learning_rate": 9.713485703407731e-07, "loss": 0.0398, "step": 1191 }, { "epoch": 0.5423111919927207, "grad_norm": 1.1231903823237734, "learning_rate": 9.71300862668906e-07, "loss": 0.0778, "step": 1192 }, { "epoch": 0.5427661510464058, "grad_norm": 1.0117774195583789, "learning_rate": 9.71253116484461e-07, "loss": 0.0489, "step": 1193 }, { "epoch": 0.543221110100091, "grad_norm": 0.7420448212650324, "learning_rate": 9.712053317913393e-07, "loss": 0.0357, "step": 1194 }, { "epoch": 0.5436760691537762, "grad_norm": 0.7223016402690831, "learning_rate": 9.711575085934458e-07, "loss": 0.0563, "step": 1195 }, { "epoch": 0.5441310282074613, "grad_norm": 0.9462111628320398, "learning_rate": 9.711096468946887e-07, "loss": 0.0721, "step": 1196 }, { "epoch": 0.5445859872611465, "grad_norm": 1.315722078039905, "learning_rate": 9.710617466989786e-07, "loss": 0.0661, "step": 1197 }, { "epoch": 0.5450409463148317, "grad_norm": 0.8141786858321084, "learning_rate": 9.710138080102296e-07, "loss": 0.0462, "step": 1198 }, { "epoch": 0.5454959053685169, "grad_norm": 1.1781823774718467, "learning_rate": 9.709658308323596e-07, "loss": 0.0697, "step": 1199 }, { "epoch": 0.545950864422202, "grad_norm": 0.913969668760858, "learning_rate": 9.709178151692887e-07, "loss": 0.0592, "step": 1200 }, { "epoch": 0.5464058234758872, "grad_norm": 0.8483650780204732, "learning_rate": 9.708697610249407e-07, "loss": 0.0543, "step": 1201 }, { "epoch": 0.5468607825295724, "grad_norm": 0.9528105477642708, "learning_rate": 9.70821668403242e-07, "loss": 0.0441, "step": 1202 }, { "epoch": 0.5473157415832575, "grad_norm": 1.3419142464619187, "learning_rate": 9.70773537308123e-07, "loss": 0.0906, "step": 1203 }, { "epoch": 0.5477707006369427, "grad_norm": 0.7158313119459663, "learning_rate": 9.707253677435163e-07, "loss": 0.034, "step": 1204 }, { "epoch": 0.5482256596906279, "grad_norm": 0.9533459159623864, "learning_rate": 9.706771597133586e-07, "loss": 0.0402, "step": 1205 }, { "epoch": 0.5486806187443131, "grad_norm": 1.0424966871610146, "learning_rate": 9.706289132215887e-07, "loss": 0.0441, "step": 1206 }, { "epoch": 0.5491355777979982, "grad_norm": 1.1807999409571042, "learning_rate": 9.705806282721497e-07, "loss": 0.0654, "step": 1207 }, { "epoch": 0.5495905368516834, "grad_norm": 0.7487942685710397, "learning_rate": 9.705323048689865e-07, "loss": 0.0348, "step": 1208 }, { "epoch": 0.5500454959053686, "grad_norm": 1.0562299039522207, "learning_rate": 9.704839430160485e-07, "loss": 0.0606, "step": 1209 }, { "epoch": 0.5505004549590536, "grad_norm": 0.8226877139255735, "learning_rate": 9.704355427172872e-07, "loss": 0.0472, "step": 1210 }, { "epoch": 0.5509554140127388, "grad_norm": 0.6854122320293553, "learning_rate": 9.70387103976658e-07, "loss": 0.0407, "step": 1211 }, { "epoch": 0.551410373066424, "grad_norm": 0.8893656407469974, "learning_rate": 9.703386267981187e-07, "loss": 0.0396, "step": 1212 }, { "epoch": 0.5518653321201092, "grad_norm": 0.7569166672807338, "learning_rate": 9.70290111185631e-07, "loss": 0.034, "step": 1213 }, { "epoch": 0.5523202911737943, "grad_norm": 1.3998788243065783, "learning_rate": 9.702415571431593e-07, "loss": 0.1004, "step": 1214 }, { "epoch": 0.5527752502274795, "grad_norm": 0.90804624908086, "learning_rate": 9.70192964674671e-07, "loss": 0.073, "step": 1215 }, { "epoch": 0.5532302092811647, "grad_norm": 0.9160165529538629, "learning_rate": 9.701443337841368e-07, "loss": 0.0448, "step": 1216 }, { "epoch": 0.5536851683348498, "grad_norm": 1.0423504345138865, "learning_rate": 9.70095664475531e-07, "loss": 0.0568, "step": 1217 }, { "epoch": 0.554140127388535, "grad_norm": 1.2824477453570433, "learning_rate": 9.700469567528307e-07, "loss": 0.0819, "step": 1218 }, { "epoch": 0.5545950864422202, "grad_norm": 1.0068051926940271, "learning_rate": 9.699982106200155e-07, "loss": 0.0565, "step": 1219 }, { "epoch": 0.5550500454959054, "grad_norm": 1.2972138317594593, "learning_rate": 9.69949426081069e-07, "loss": 0.0566, "step": 1220 }, { "epoch": 0.5555050045495905, "grad_norm": 1.3463624636546208, "learning_rate": 9.699006031399777e-07, "loss": 0.0531, "step": 1221 }, { "epoch": 0.5559599636032757, "grad_norm": 1.0594025664912645, "learning_rate": 9.698517418007314e-07, "loss": 0.0576, "step": 1222 }, { "epoch": 0.5564149226569609, "grad_norm": 1.2804284498240521, "learning_rate": 9.698028420673223e-07, "loss": 0.0738, "step": 1223 }, { "epoch": 0.556869881710646, "grad_norm": 1.0731306494981614, "learning_rate": 9.697539039437467e-07, "loss": 0.0705, "step": 1224 }, { "epoch": 0.5573248407643312, "grad_norm": 1.025391983458191, "learning_rate": 9.697049274340034e-07, "loss": 0.044, "step": 1225 }, { "epoch": 0.5577797998180164, "grad_norm": 1.752599226406024, "learning_rate": 9.696559125420947e-07, "loss": 0.0791, "step": 1226 }, { "epoch": 0.5582347588717016, "grad_norm": 0.762476841297899, "learning_rate": 9.696068592720257e-07, "loss": 0.0395, "step": 1227 }, { "epoch": 0.5586897179253867, "grad_norm": 0.7806056978177014, "learning_rate": 9.695577676278047e-07, "loss": 0.0493, "step": 1228 }, { "epoch": 0.5591446769790719, "grad_norm": 0.9361753516024877, "learning_rate": 9.695086376134438e-07, "loss": 0.0396, "step": 1229 }, { "epoch": 0.5595996360327571, "grad_norm": 1.1591495687183044, "learning_rate": 9.694594692329572e-07, "loss": 0.0577, "step": 1230 }, { "epoch": 0.5600545950864422, "grad_norm": 0.9601679074681683, "learning_rate": 9.694102624903626e-07, "loss": 0.0576, "step": 1231 }, { "epoch": 0.5605095541401274, "grad_norm": 1.3856063186624548, "learning_rate": 9.693610173896812e-07, "loss": 0.0732, "step": 1232 }, { "epoch": 0.5609645131938126, "grad_norm": 0.9549302171545597, "learning_rate": 9.693117339349375e-07, "loss": 0.0447, "step": 1233 }, { "epoch": 0.5614194722474978, "grad_norm": 1.1823392374506065, "learning_rate": 9.69262412130158e-07, "loss": 0.0499, "step": 1234 }, { "epoch": 0.5618744313011829, "grad_norm": 1.2799163049034223, "learning_rate": 9.692130519793734e-07, "loss": 0.0669, "step": 1235 }, { "epoch": 0.5623293903548681, "grad_norm": 1.1603721991903995, "learning_rate": 9.69163653486617e-07, "loss": 0.069, "step": 1236 }, { "epoch": 0.5627843494085533, "grad_norm": 0.9323396563518179, "learning_rate": 9.691142166559257e-07, "loss": 0.0513, "step": 1237 }, { "epoch": 0.5632393084622384, "grad_norm": 0.7395981551174077, "learning_rate": 9.69064741491339e-07, "loss": 0.0557, "step": 1238 }, { "epoch": 0.5636942675159236, "grad_norm": 0.9266296051993105, "learning_rate": 9.690152279969002e-07, "loss": 0.0473, "step": 1239 }, { "epoch": 0.5641492265696088, "grad_norm": 1.1836738802611, "learning_rate": 9.689656761766546e-07, "loss": 0.0665, "step": 1240 }, { "epoch": 0.564604185623294, "grad_norm": 0.8273366224786877, "learning_rate": 9.689160860346522e-07, "loss": 0.0531, "step": 1241 }, { "epoch": 0.565059144676979, "grad_norm": 0.6148599768334516, "learning_rate": 9.688664575749445e-07, "loss": 0.0313, "step": 1242 }, { "epoch": 0.5655141037306642, "grad_norm": 0.8018866768213117, "learning_rate": 9.688167908015876e-07, "loss": 0.047, "step": 1243 }, { "epoch": 0.5659690627843494, "grad_norm": 1.331006214253376, "learning_rate": 9.687670857186393e-07, "loss": 0.0831, "step": 1244 }, { "epoch": 0.5664240218380345, "grad_norm": 1.1595512699904893, "learning_rate": 9.68717342330162e-07, "loss": 0.0674, "step": 1245 }, { "epoch": 0.5668789808917197, "grad_norm": 1.1008084358104073, "learning_rate": 9.686675606402201e-07, "loss": 0.0755, "step": 1246 }, { "epoch": 0.5673339399454049, "grad_norm": 0.9325974726083311, "learning_rate": 9.686177406528817e-07, "loss": 0.0607, "step": 1247 }, { "epoch": 0.5677888989990901, "grad_norm": 1.035892988721836, "learning_rate": 9.685678823722177e-07, "loss": 0.0529, "step": 1248 }, { "epoch": 0.5682438580527752, "grad_norm": 0.9088777213997143, "learning_rate": 9.685179858023025e-07, "loss": 0.0515, "step": 1249 }, { "epoch": 0.5686988171064604, "grad_norm": 0.9653891907289217, "learning_rate": 9.68468050947213e-07, "loss": 0.0588, "step": 1250 }, { "epoch": 0.5691537761601456, "grad_norm": 0.8844749666873807, "learning_rate": 9.684180778110304e-07, "loss": 0.0412, "step": 1251 }, { "epoch": 0.5696087352138307, "grad_norm": 0.8266627391749207, "learning_rate": 9.683680663978377e-07, "loss": 0.0469, "step": 1252 }, { "epoch": 0.5700636942675159, "grad_norm": 0.8768625859882312, "learning_rate": 9.683180167117216e-07, "loss": 0.0506, "step": 1253 }, { "epoch": 0.5705186533212011, "grad_norm": 0.8921051565517838, "learning_rate": 9.68267928756772e-07, "loss": 0.0496, "step": 1254 }, { "epoch": 0.5709736123748863, "grad_norm": 1.1623134419057843, "learning_rate": 9.682178025370823e-07, "loss": 0.0764, "step": 1255 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9348424347921139, "learning_rate": 9.68167638056748e-07, "loss": 0.0578, "step": 1256 }, { "epoch": 0.5718835304822566, "grad_norm": 0.6587249344692087, "learning_rate": 9.681174353198686e-07, "loss": 0.0294, "step": 1257 }, { "epoch": 0.5723384895359418, "grad_norm": 0.8977507321835083, "learning_rate": 9.680671943305462e-07, "loss": 0.0452, "step": 1258 }, { "epoch": 0.5727934485896269, "grad_norm": 0.9242910415516392, "learning_rate": 9.680169150928868e-07, "loss": 0.0557, "step": 1259 }, { "epoch": 0.5732484076433121, "grad_norm": 0.8786792203566551, "learning_rate": 9.679665976109983e-07, "loss": 0.0536, "step": 1260 }, { "epoch": 0.5737033666969973, "grad_norm": 0.8492840747614591, "learning_rate": 9.67916241888993e-07, "loss": 0.0403, "step": 1261 }, { "epoch": 0.5741583257506825, "grad_norm": 1.0401337658789427, "learning_rate": 9.678658479309853e-07, "loss": 0.0588, "step": 1262 }, { "epoch": 0.5746132848043676, "grad_norm": 1.2643948456210201, "learning_rate": 9.678154157410937e-07, "loss": 0.0438, "step": 1263 }, { "epoch": 0.5750682438580528, "grad_norm": 1.2192472441676274, "learning_rate": 9.677649453234388e-07, "loss": 0.0737, "step": 1264 }, { "epoch": 0.575523202911738, "grad_norm": 1.1669848336429034, "learning_rate": 9.677144366821448e-07, "loss": 0.0571, "step": 1265 }, { "epoch": 0.5759781619654231, "grad_norm": 1.0542652349099781, "learning_rate": 9.676638898213393e-07, "loss": 0.0575, "step": 1266 }, { "epoch": 0.5764331210191083, "grad_norm": 0.7707484712593001, "learning_rate": 9.676133047451527e-07, "loss": 0.0368, "step": 1267 }, { "epoch": 0.5768880800727935, "grad_norm": 0.9390810081151801, "learning_rate": 9.675626814577187e-07, "loss": 0.0619, "step": 1268 }, { "epoch": 0.5773430391264787, "grad_norm": 0.817896689959064, "learning_rate": 9.675120199631736e-07, "loss": 0.0438, "step": 1269 }, { "epoch": 0.5777979981801638, "grad_norm": 0.8854282508390303, "learning_rate": 9.674613202656576e-07, "loss": 0.058, "step": 1270 }, { "epoch": 0.578252957233849, "grad_norm": 0.7775598538453966, "learning_rate": 9.674105823693137e-07, "loss": 0.0509, "step": 1271 }, { "epoch": 0.5787079162875342, "grad_norm": 1.062098474797389, "learning_rate": 9.673598062782877e-07, "loss": 0.0674, "step": 1272 }, { "epoch": 0.5791628753412192, "grad_norm": 0.49926186763652963, "learning_rate": 9.673089919967288e-07, "loss": 0.0256, "step": 1273 }, { "epoch": 0.5796178343949044, "grad_norm": 1.35078161644512, "learning_rate": 9.672581395287895e-07, "loss": 0.078, "step": 1274 }, { "epoch": 0.5800727934485896, "grad_norm": 1.426452489767498, "learning_rate": 9.672072488786252e-07, "loss": 0.0792, "step": 1275 }, { "epoch": 0.5805277525022748, "grad_norm": 1.0799654487238644, "learning_rate": 9.671563200503944e-07, "loss": 0.0661, "step": 1276 }, { "epoch": 0.5809827115559599, "grad_norm": 0.7322120884988809, "learning_rate": 9.67105353048259e-07, "loss": 0.0396, "step": 1277 }, { "epoch": 0.5814376706096451, "grad_norm": 1.077582094198137, "learning_rate": 9.670543478763834e-07, "loss": 0.0601, "step": 1278 }, { "epoch": 0.5818926296633303, "grad_norm": 0.6421679192465619, "learning_rate": 9.670033045389354e-07, "loss": 0.0416, "step": 1279 }, { "epoch": 0.5823475887170154, "grad_norm": 1.2016753070465214, "learning_rate": 9.669522230400867e-07, "loss": 0.1013, "step": 1280 }, { "epoch": 0.5828025477707006, "grad_norm": 1.0097519299373778, "learning_rate": 9.66901103384011e-07, "loss": 0.0688, "step": 1281 }, { "epoch": 0.5832575068243858, "grad_norm": 1.3294263407910267, "learning_rate": 9.668499455748855e-07, "loss": 0.0658, "step": 1282 }, { "epoch": 0.583712465878071, "grad_norm": 0.8637036921294736, "learning_rate": 9.667987496168907e-07, "loss": 0.0423, "step": 1283 }, { "epoch": 0.5841674249317561, "grad_norm": 0.7259234366870886, "learning_rate": 9.667475155142103e-07, "loss": 0.0378, "step": 1284 }, { "epoch": 0.5846223839854413, "grad_norm": 1.0873774525717022, "learning_rate": 9.666962432710307e-07, "loss": 0.0478, "step": 1285 }, { "epoch": 0.5850773430391265, "grad_norm": 1.5324659995984513, "learning_rate": 9.666449328915415e-07, "loss": 0.0821, "step": 1286 }, { "epoch": 0.5855323020928116, "grad_norm": 1.3715611029917771, "learning_rate": 9.66593584379936e-07, "loss": 0.064, "step": 1287 }, { "epoch": 0.5859872611464968, "grad_norm": 0.8948327459851355, "learning_rate": 9.665421977404098e-07, "loss": 0.046, "step": 1288 }, { "epoch": 0.586442220200182, "grad_norm": 1.0352787523649565, "learning_rate": 9.664907729771622e-07, "loss": 0.0654, "step": 1289 }, { "epoch": 0.5868971792538672, "grad_norm": 0.8602953183598696, "learning_rate": 9.66439310094395e-07, "loss": 0.0498, "step": 1290 }, { "epoch": 0.5873521383075523, "grad_norm": 0.9687265831207073, "learning_rate": 9.663878090963141e-07, "loss": 0.0628, "step": 1291 }, { "epoch": 0.5878070973612375, "grad_norm": 0.8373492984852973, "learning_rate": 9.663362699871275e-07, "loss": 0.0484, "step": 1292 }, { "epoch": 0.5882620564149227, "grad_norm": 0.858829186596976, "learning_rate": 9.662846927710468e-07, "loss": 0.0431, "step": 1293 }, { "epoch": 0.5887170154686078, "grad_norm": 0.7955928740751219, "learning_rate": 9.662330774522868e-07, "loss": 0.0434, "step": 1294 }, { "epoch": 0.589171974522293, "grad_norm": 0.7798606434181914, "learning_rate": 9.661814240350652e-07, "loss": 0.0394, "step": 1295 }, { "epoch": 0.5896269335759782, "grad_norm": 1.83872867344148, "learning_rate": 9.66129732523603e-07, "loss": 0.1035, "step": 1296 }, { "epoch": 0.5900818926296634, "grad_norm": 0.6081034748297524, "learning_rate": 9.660780029221239e-07, "loss": 0.0337, "step": 1297 }, { "epoch": 0.5905368516833485, "grad_norm": 1.0712423515968665, "learning_rate": 9.66026235234855e-07, "loss": 0.048, "step": 1298 }, { "epoch": 0.5909918107370337, "grad_norm": 1.225350234539244, "learning_rate": 9.659744294660272e-07, "loss": 0.0645, "step": 1299 }, { "epoch": 0.5914467697907189, "grad_norm": 0.8788447049360754, "learning_rate": 9.659225856198731e-07, "loss": 0.0523, "step": 1300 }, { "epoch": 0.591901728844404, "grad_norm": 0.6492880437236374, "learning_rate": 9.658707037006293e-07, "loss": 0.0398, "step": 1301 }, { "epoch": 0.5923566878980892, "grad_norm": 0.7914996236538643, "learning_rate": 9.658187837125355e-07, "loss": 0.044, "step": 1302 }, { "epoch": 0.5928116469517744, "grad_norm": 1.2953642836710044, "learning_rate": 9.657668256598344e-07, "loss": 0.0774, "step": 1303 }, { "epoch": 0.5932666060054596, "grad_norm": 1.260439443000597, "learning_rate": 9.657148295467717e-07, "loss": 0.0819, "step": 1304 }, { "epoch": 0.5937215650591446, "grad_norm": 0.8647657322585391, "learning_rate": 9.656627953775962e-07, "loss": 0.0429, "step": 1305 }, { "epoch": 0.5941765241128298, "grad_norm": 0.9799399918380723, "learning_rate": 9.6561072315656e-07, "loss": 0.0609, "step": 1306 }, { "epoch": 0.594631483166515, "grad_norm": 1.13284493211371, "learning_rate": 9.655586128879183e-07, "loss": 0.048, "step": 1307 }, { "epoch": 0.5950864422202001, "grad_norm": 0.735655189106713, "learning_rate": 9.655064645759289e-07, "loss": 0.0386, "step": 1308 }, { "epoch": 0.5955414012738853, "grad_norm": 0.5398593474564078, "learning_rate": 9.654542782248539e-07, "loss": 0.0171, "step": 1309 }, { "epoch": 0.5959963603275705, "grad_norm": 0.7251173486923949, "learning_rate": 9.65402053838957e-07, "loss": 0.0356, "step": 1310 }, { "epoch": 0.5964513193812557, "grad_norm": 1.1322995595063907, "learning_rate": 9.653497914225057e-07, "loss": 0.0545, "step": 1311 }, { "epoch": 0.5969062784349408, "grad_norm": 1.5006177009326187, "learning_rate": 9.652974909797714e-07, "loss": 0.0896, "step": 1312 }, { "epoch": 0.597361237488626, "grad_norm": 0.6954423078281272, "learning_rate": 9.652451525150271e-07, "loss": 0.0454, "step": 1313 }, { "epoch": 0.5978161965423112, "grad_norm": 0.8147553075796471, "learning_rate": 9.651927760325501e-07, "loss": 0.0485, "step": 1314 }, { "epoch": 0.5982711555959963, "grad_norm": 1.4138813868676818, "learning_rate": 9.651403615366202e-07, "loss": 0.0587, "step": 1315 }, { "epoch": 0.5987261146496815, "grad_norm": 0.9604572000390108, "learning_rate": 9.650879090315206e-07, "loss": 0.0511, "step": 1316 }, { "epoch": 0.5991810737033667, "grad_norm": 1.0486842210720533, "learning_rate": 9.650354185215373e-07, "loss": 0.0655, "step": 1317 }, { "epoch": 0.5996360327570519, "grad_norm": 0.8814562571770873, "learning_rate": 9.649828900109598e-07, "loss": 0.0497, "step": 1318 }, { "epoch": 0.600090991810737, "grad_norm": 0.7084799113489009, "learning_rate": 9.649303235040802e-07, "loss": 0.0313, "step": 1319 }, { "epoch": 0.6005459508644222, "grad_norm": 0.8791004910567216, "learning_rate": 9.648777190051942e-07, "loss": 0.0643, "step": 1320 }, { "epoch": 0.6010009099181074, "grad_norm": 0.8830318665040463, "learning_rate": 9.648250765186006e-07, "loss": 0.0426, "step": 1321 }, { "epoch": 0.6014558689717925, "grad_norm": 0.8900818578151345, "learning_rate": 9.647723960486005e-07, "loss": 0.0672, "step": 1322 }, { "epoch": 0.6019108280254777, "grad_norm": 1.2159765611243998, "learning_rate": 9.647196775994994e-07, "loss": 0.0556, "step": 1323 }, { "epoch": 0.6023657870791629, "grad_norm": 0.9673125625457091, "learning_rate": 9.646669211756046e-07, "loss": 0.0416, "step": 1324 }, { "epoch": 0.6028207461328481, "grad_norm": 0.9685475046693527, "learning_rate": 9.646141267812278e-07, "loss": 0.0477, "step": 1325 }, { "epoch": 0.6032757051865332, "grad_norm": 1.0856225128238883, "learning_rate": 9.645612944206826e-07, "loss": 0.056, "step": 1326 }, { "epoch": 0.6037306642402184, "grad_norm": 0.8301745188048383, "learning_rate": 9.645084240982862e-07, "loss": 0.0523, "step": 1327 }, { "epoch": 0.6041856232939036, "grad_norm": 0.5746549934273473, "learning_rate": 9.644555158183591e-07, "loss": 0.0489, "step": 1328 }, { "epoch": 0.6046405823475887, "grad_norm": 0.7755548033892664, "learning_rate": 9.64402569585225e-07, "loss": 0.0456, "step": 1329 }, { "epoch": 0.6050955414012739, "grad_norm": 1.24217407331199, "learning_rate": 9.643495854032097e-07, "loss": 0.0319, "step": 1330 }, { "epoch": 0.6055505004549591, "grad_norm": 1.2037483478808235, "learning_rate": 9.642965632766434e-07, "loss": 0.0657, "step": 1331 }, { "epoch": 0.6060054595086443, "grad_norm": 0.7260705083043161, "learning_rate": 9.64243503209859e-07, "loss": 0.0396, "step": 1332 }, { "epoch": 0.6064604185623294, "grad_norm": 0.9941856070306597, "learning_rate": 9.64190405207192e-07, "loss": 0.0595, "step": 1333 }, { "epoch": 0.6069153776160146, "grad_norm": 0.9049167979295172, "learning_rate": 9.64137269272981e-07, "loss": 0.0492, "step": 1334 }, { "epoch": 0.6073703366696998, "grad_norm": 1.129611157113714, "learning_rate": 9.640840954115685e-07, "loss": 0.0554, "step": 1335 }, { "epoch": 0.607825295723385, "grad_norm": 1.0571833415023526, "learning_rate": 9.640308836272996e-07, "loss": 0.0623, "step": 1336 }, { "epoch": 0.60828025477707, "grad_norm": 1.1597809963448742, "learning_rate": 9.639776339245224e-07, "loss": 0.0642, "step": 1337 }, { "epoch": 0.6087352138307552, "grad_norm": 2.113314939232621, "learning_rate": 9.639243463075882e-07, "loss": 0.0711, "step": 1338 }, { "epoch": 0.6091901728844404, "grad_norm": 0.8758357082176966, "learning_rate": 9.638710207808516e-07, "loss": 0.0442, "step": 1339 }, { "epoch": 0.6096451319381255, "grad_norm": 1.0417617530764565, "learning_rate": 9.638176573486699e-07, "loss": 0.0461, "step": 1340 }, { "epoch": 0.6101000909918107, "grad_norm": 0.8163829654028246, "learning_rate": 9.637642560154039e-07, "loss": 0.0361, "step": 1341 }, { "epoch": 0.6105550500454959, "grad_norm": 0.9708163414401448, "learning_rate": 9.63710816785417e-07, "loss": 0.0523, "step": 1342 }, { "epoch": 0.6110100090991811, "grad_norm": 1.2637266423474305, "learning_rate": 9.636573396630766e-07, "loss": 0.0495, "step": 1343 }, { "epoch": 0.6114649681528662, "grad_norm": 1.02202340805819, "learning_rate": 9.63603824652752e-07, "loss": 0.0718, "step": 1344 }, { "epoch": 0.6119199272065514, "grad_norm": 0.796136257114391, "learning_rate": 9.635502717588167e-07, "loss": 0.0405, "step": 1345 }, { "epoch": 0.6123748862602366, "grad_norm": 0.9279182632876206, "learning_rate": 9.634966809856465e-07, "loss": 0.052, "step": 1346 }, { "epoch": 0.6128298453139217, "grad_norm": 1.3131867430737394, "learning_rate": 9.634430523376206e-07, "loss": 0.0799, "step": 1347 }, { "epoch": 0.6132848043676069, "grad_norm": 0.9854947671856729, "learning_rate": 9.633893858191214e-07, "loss": 0.0424, "step": 1348 }, { "epoch": 0.6137397634212921, "grad_norm": 1.0880433656687452, "learning_rate": 9.633356814345341e-07, "loss": 0.0626, "step": 1349 }, { "epoch": 0.6141947224749773, "grad_norm": 0.971646609630187, "learning_rate": 9.632819391882473e-07, "loss": 0.0505, "step": 1350 }, { "epoch": 0.6146496815286624, "grad_norm": 0.955287744171009, "learning_rate": 9.63228159084653e-07, "loss": 0.0499, "step": 1351 }, { "epoch": 0.6151046405823476, "grad_norm": 1.3353763545820454, "learning_rate": 9.63174341128145e-07, "loss": 0.0742, "step": 1352 }, { "epoch": 0.6155595996360328, "grad_norm": 1.2375861106136756, "learning_rate": 9.631204853231217e-07, "loss": 0.0583, "step": 1353 }, { "epoch": 0.6160145586897179, "grad_norm": 0.9654453108243133, "learning_rate": 9.630665916739837e-07, "loss": 0.0553, "step": 1354 }, { "epoch": 0.6164695177434031, "grad_norm": 0.9819536752232059, "learning_rate": 9.630126601851353e-07, "loss": 0.0445, "step": 1355 }, { "epoch": 0.6169244767970883, "grad_norm": 0.8721058240357414, "learning_rate": 9.62958690860983e-07, "loss": 0.0539, "step": 1356 }, { "epoch": 0.6173794358507735, "grad_norm": 0.8831619531004216, "learning_rate": 9.629046837059372e-07, "loss": 0.0623, "step": 1357 }, { "epoch": 0.6178343949044586, "grad_norm": 0.9334285229581504, "learning_rate": 9.628506387244112e-07, "loss": 0.0462, "step": 1358 }, { "epoch": 0.6182893539581438, "grad_norm": 1.0156816682896814, "learning_rate": 9.627965559208212e-07, "loss": 0.0407, "step": 1359 }, { "epoch": 0.618744313011829, "grad_norm": 0.962153168873835, "learning_rate": 9.627424352995864e-07, "loss": 0.0566, "step": 1360 }, { "epoch": 0.6191992720655141, "grad_norm": 1.1495638017975578, "learning_rate": 9.626882768651297e-07, "loss": 0.061, "step": 1361 }, { "epoch": 0.6196542311191993, "grad_norm": 0.8626025361498736, "learning_rate": 9.626340806218764e-07, "loss": 0.0336, "step": 1362 }, { "epoch": 0.6201091901728845, "grad_norm": 0.8494364585203383, "learning_rate": 9.625798465742554e-07, "loss": 0.0326, "step": 1363 }, { "epoch": 0.6205641492265697, "grad_norm": 0.8035151819955791, "learning_rate": 9.625255747266983e-07, "loss": 0.0443, "step": 1364 }, { "epoch": 0.6210191082802548, "grad_norm": 1.1194298826659628, "learning_rate": 9.6247126508364e-07, "loss": 0.078, "step": 1365 }, { "epoch": 0.62147406733394, "grad_norm": 1.314372181815368, "learning_rate": 9.624169176495184e-07, "loss": 0.076, "step": 1366 }, { "epoch": 0.6219290263876252, "grad_norm": 1.204846228544712, "learning_rate": 9.623625324287746e-07, "loss": 0.0675, "step": 1367 }, { "epoch": 0.6223839854413102, "grad_norm": 1.0629469691418807, "learning_rate": 9.623081094258526e-07, "loss": 0.0914, "step": 1368 }, { "epoch": 0.6228389444949954, "grad_norm": 1.3796866222163617, "learning_rate": 9.622536486451995e-07, "loss": 0.0756, "step": 1369 }, { "epoch": 0.6232939035486806, "grad_norm": 0.9234152382742571, "learning_rate": 9.621991500912659e-07, "loss": 0.0404, "step": 1370 }, { "epoch": 0.6237488626023658, "grad_norm": 1.4305438262310661, "learning_rate": 9.621446137685052e-07, "loss": 0.0732, "step": 1371 }, { "epoch": 0.6242038216560509, "grad_norm": 1.0906198049978535, "learning_rate": 9.620900396813735e-07, "loss": 0.0765, "step": 1372 }, { "epoch": 0.6246587807097361, "grad_norm": 1.4087778344200577, "learning_rate": 9.620354278343304e-07, "loss": 0.0689, "step": 1373 }, { "epoch": 0.6251137397634213, "grad_norm": 0.6306780871757539, "learning_rate": 9.619807782318389e-07, "loss": 0.0283, "step": 1374 }, { "epoch": 0.6255686988171064, "grad_norm": 1.4699572165770995, "learning_rate": 9.619260908783643e-07, "loss": 0.0733, "step": 1375 }, { "epoch": 0.6260236578707916, "grad_norm": 1.172619459794558, "learning_rate": 9.61871365778376e-07, "loss": 0.0806, "step": 1376 }, { "epoch": 0.6264786169244768, "grad_norm": 1.0419516621519074, "learning_rate": 9.61816602936345e-07, "loss": 0.0642, "step": 1377 }, { "epoch": 0.626933575978162, "grad_norm": 1.1544219468885668, "learning_rate": 9.61761802356747e-07, "loss": 0.0761, "step": 1378 }, { "epoch": 0.6273885350318471, "grad_norm": 0.9449449839073086, "learning_rate": 9.617069640440598e-07, "loss": 0.0476, "step": 1379 }, { "epoch": 0.6278434940855323, "grad_norm": 1.2425814008450113, "learning_rate": 9.616520880027644e-07, "loss": 0.084, "step": 1380 }, { "epoch": 0.6282984531392175, "grad_norm": 0.7838116497003709, "learning_rate": 9.615971742373451e-07, "loss": 0.047, "step": 1381 }, { "epoch": 0.6287534121929026, "grad_norm": 1.5489045739714193, "learning_rate": 9.615422227522896e-07, "loss": 0.0695, "step": 1382 }, { "epoch": 0.6292083712465878, "grad_norm": 1.2424691611901795, "learning_rate": 9.614872335520878e-07, "loss": 0.0555, "step": 1383 }, { "epoch": 0.629663330300273, "grad_norm": 0.9470890009275095, "learning_rate": 9.614322066412335e-07, "loss": 0.0543, "step": 1384 }, { "epoch": 0.6301182893539582, "grad_norm": 1.1556863254569283, "learning_rate": 9.61377142024223e-07, "loss": 0.0635, "step": 1385 }, { "epoch": 0.6305732484076433, "grad_norm": 0.956499002110131, "learning_rate": 9.613220397055556e-07, "loss": 0.045, "step": 1386 }, { "epoch": 0.6310282074613285, "grad_norm": 1.1285158499995285, "learning_rate": 9.61266899689735e-07, "loss": 0.0574, "step": 1387 }, { "epoch": 0.6314831665150137, "grad_norm": 1.0665371501215408, "learning_rate": 9.612117219812661e-07, "loss": 0.054, "step": 1388 }, { "epoch": 0.6319381255686988, "grad_norm": 1.5724055365493932, "learning_rate": 9.611565065846581e-07, "loss": 0.0799, "step": 1389 }, { "epoch": 0.632393084622384, "grad_norm": 1.041459908091188, "learning_rate": 9.611012535044231e-07, "loss": 0.0763, "step": 1390 }, { "epoch": 0.6328480436760692, "grad_norm": 1.3889834700522472, "learning_rate": 9.61045962745076e-07, "loss": 0.0715, "step": 1391 }, { "epoch": 0.6333030027297544, "grad_norm": 0.642679596869822, "learning_rate": 9.609906343111348e-07, "loss": 0.041, "step": 1392 }, { "epoch": 0.6337579617834395, "grad_norm": 1.3445917287604259, "learning_rate": 9.609352682071207e-07, "loss": 0.0511, "step": 1393 }, { "epoch": 0.6342129208371247, "grad_norm": 1.5982408939519754, "learning_rate": 9.608798644375582e-07, "loss": 0.09, "step": 1394 }, { "epoch": 0.6346678798908099, "grad_norm": 1.3631579558885258, "learning_rate": 9.608244230069744e-07, "loss": 0.0804, "step": 1395 }, { "epoch": 0.635122838944495, "grad_norm": 1.049789643646565, "learning_rate": 9.607689439198999e-07, "loss": 0.0626, "step": 1396 }, { "epoch": 0.6355777979981801, "grad_norm": 1.1492839350522404, "learning_rate": 9.607134271808679e-07, "loss": 0.0627, "step": 1397 }, { "epoch": 0.6360327570518653, "grad_norm": 1.2504658670606188, "learning_rate": 9.606578727944155e-07, "loss": 0.0764, "step": 1398 }, { "epoch": 0.6364877161055505, "grad_norm": 0.957077259519955, "learning_rate": 9.606022807650818e-07, "loss": 0.0456, "step": 1399 }, { "epoch": 0.6369426751592356, "grad_norm": 0.8961648827555176, "learning_rate": 9.6054665109741e-07, "loss": 0.0486, "step": 1400 }, { "epoch": 0.6373976342129208, "grad_norm": 1.0113420145574747, "learning_rate": 9.604909837959454e-07, "loss": 0.0518, "step": 1401 }, { "epoch": 0.637852593266606, "grad_norm": 1.4677121895972032, "learning_rate": 9.604352788652374e-07, "loss": 0.0899, "step": 1402 }, { "epoch": 0.6383075523202911, "grad_norm": 1.0401408986934393, "learning_rate": 9.603795363098375e-07, "loss": 0.0384, "step": 1403 }, { "epoch": 0.6387625113739763, "grad_norm": 1.1572925538329017, "learning_rate": 9.603237561343012e-07, "loss": 0.0668, "step": 1404 }, { "epoch": 0.6392174704276615, "grad_norm": 0.886318106337005, "learning_rate": 9.602679383431863e-07, "loss": 0.05, "step": 1405 }, { "epoch": 0.6396724294813467, "grad_norm": 1.0458031409412327, "learning_rate": 9.602120829410537e-07, "loss": 0.0373, "step": 1406 }, { "epoch": 0.6401273885350318, "grad_norm": 0.9313999338595971, "learning_rate": 9.601561899324684e-07, "loss": 0.0556, "step": 1407 }, { "epoch": 0.640582347588717, "grad_norm": 0.9719608919708435, "learning_rate": 9.601002593219972e-07, "loss": 0.0441, "step": 1408 }, { "epoch": 0.6410373066424022, "grad_norm": 1.1405171765166928, "learning_rate": 9.600442911142105e-07, "loss": 0.0637, "step": 1409 }, { "epoch": 0.6414922656960873, "grad_norm": 0.8188515533336069, "learning_rate": 9.59988285313682e-07, "loss": 0.0449, "step": 1410 }, { "epoch": 0.6419472247497725, "grad_norm": 1.0133319801734209, "learning_rate": 9.59932241924988e-07, "loss": 0.066, "step": 1411 }, { "epoch": 0.6424021838034577, "grad_norm": 1.0622778306308414, "learning_rate": 9.598761609527084e-07, "loss": 0.0553, "step": 1412 }, { "epoch": 0.6428571428571429, "grad_norm": 1.3968867899223683, "learning_rate": 9.598200424014254e-07, "loss": 0.0577, "step": 1413 }, { "epoch": 0.643312101910828, "grad_norm": 1.028130382622468, "learning_rate": 9.597638862757253e-07, "loss": 0.0447, "step": 1414 }, { "epoch": 0.6437670609645132, "grad_norm": 0.6532456496861823, "learning_rate": 9.597076925801967e-07, "loss": 0.0384, "step": 1415 }, { "epoch": 0.6442220200181984, "grad_norm": 0.879422884831013, "learning_rate": 9.596514613194313e-07, "loss": 0.0405, "step": 1416 }, { "epoch": 0.6446769790718835, "grad_norm": 0.9713924784454125, "learning_rate": 9.595951924980243e-07, "loss": 0.0538, "step": 1417 }, { "epoch": 0.6451319381255687, "grad_norm": 0.7071305448311551, "learning_rate": 9.595388861205738e-07, "loss": 0.0268, "step": 1418 }, { "epoch": 0.6455868971792539, "grad_norm": 1.251402082931037, "learning_rate": 9.594825421916809e-07, "loss": 0.0717, "step": 1419 }, { "epoch": 0.6460418562329391, "grad_norm": 1.0703455325706082, "learning_rate": 9.594261607159492e-07, "loss": 0.0535, "step": 1420 }, { "epoch": 0.6464968152866242, "grad_norm": 1.3172241003679301, "learning_rate": 9.593697416979868e-07, "loss": 0.0731, "step": 1421 }, { "epoch": 0.6469517743403094, "grad_norm": 1.043847584102769, "learning_rate": 9.593132851424035e-07, "loss": 0.0729, "step": 1422 }, { "epoch": 0.6474067333939946, "grad_norm": 1.0004868915734777, "learning_rate": 9.592567910538128e-07, "loss": 0.0774, "step": 1423 }, { "epoch": 0.6478616924476797, "grad_norm": 1.1707101755423484, "learning_rate": 9.59200259436831e-07, "loss": 0.0793, "step": 1424 }, { "epoch": 0.6483166515013649, "grad_norm": 1.2269222161166873, "learning_rate": 9.59143690296078e-07, "loss": 0.0785, "step": 1425 }, { "epoch": 0.6487716105550501, "grad_norm": 0.7816791028722773, "learning_rate": 9.590870836361756e-07, "loss": 0.0527, "step": 1426 }, { "epoch": 0.6492265696087353, "grad_norm": 1.0818851439423862, "learning_rate": 9.590304394617505e-07, "loss": 0.0581, "step": 1427 }, { "epoch": 0.6496815286624203, "grad_norm": 0.6524658522467383, "learning_rate": 9.589737577774306e-07, "loss": 0.0286, "step": 1428 }, { "epoch": 0.6501364877161055, "grad_norm": 1.0689428140494595, "learning_rate": 9.589170385878479e-07, "loss": 0.0702, "step": 1429 }, { "epoch": 0.6505914467697907, "grad_norm": 0.8701504830472667, "learning_rate": 9.588602818976373e-07, "loss": 0.0401, "step": 1430 }, { "epoch": 0.6510464058234758, "grad_norm": 0.9935036606662533, "learning_rate": 9.588034877114366e-07, "loss": 0.0525, "step": 1431 }, { "epoch": 0.651501364877161, "grad_norm": 0.9262351156349617, "learning_rate": 9.58746656033887e-07, "loss": 0.0642, "step": 1432 }, { "epoch": 0.6519563239308462, "grad_norm": 1.7669913785975562, "learning_rate": 9.586897868696322e-07, "loss": 0.0867, "step": 1433 }, { "epoch": 0.6524112829845314, "grad_norm": 1.1064629387402922, "learning_rate": 9.586328802233195e-07, "loss": 0.0422, "step": 1434 }, { "epoch": 0.6528662420382165, "grad_norm": 0.825025138964913, "learning_rate": 9.585759360995988e-07, "loss": 0.0442, "step": 1435 }, { "epoch": 0.6533212010919017, "grad_norm": 1.1547482861944165, "learning_rate": 9.585189545031238e-07, "loss": 0.0553, "step": 1436 }, { "epoch": 0.6537761601455869, "grad_norm": 1.5001221403381506, "learning_rate": 9.584619354385504e-07, "loss": 0.0797, "step": 1437 }, { "epoch": 0.654231119199272, "grad_norm": 1.0919566323761012, "learning_rate": 9.584048789105379e-07, "loss": 0.0569, "step": 1438 }, { "epoch": 0.6546860782529572, "grad_norm": 0.8122590210063091, "learning_rate": 9.583477849237488e-07, "loss": 0.0501, "step": 1439 }, { "epoch": 0.6551410373066424, "grad_norm": 0.9966885346910843, "learning_rate": 9.582906534828489e-07, "loss": 0.0411, "step": 1440 }, { "epoch": 0.6555959963603276, "grad_norm": 0.9959023677705958, "learning_rate": 9.582334845925062e-07, "loss": 0.051, "step": 1441 }, { "epoch": 0.6560509554140127, "grad_norm": 1.2885597755131308, "learning_rate": 9.581762782573925e-07, "loss": 0.0667, "step": 1442 }, { "epoch": 0.6565059144676979, "grad_norm": 0.9908484604867948, "learning_rate": 9.581190344821826e-07, "loss": 0.0537, "step": 1443 }, { "epoch": 0.6569608735213831, "grad_norm": 1.1716019591007731, "learning_rate": 9.58061753271554e-07, "loss": 0.0649, "step": 1444 }, { "epoch": 0.6574158325750682, "grad_norm": 0.8366475630908029, "learning_rate": 9.580044346301874e-07, "loss": 0.0525, "step": 1445 }, { "epoch": 0.6578707916287534, "grad_norm": 0.7575406807513989, "learning_rate": 9.579470785627669e-07, "loss": 0.0365, "step": 1446 }, { "epoch": 0.6583257506824386, "grad_norm": 0.9832679012959014, "learning_rate": 9.578896850739791e-07, "loss": 0.0547, "step": 1447 }, { "epoch": 0.6587807097361238, "grad_norm": 0.9760014596562895, "learning_rate": 9.578322541685139e-07, "loss": 0.0505, "step": 1448 }, { "epoch": 0.6592356687898089, "grad_norm": 1.2256040204745375, "learning_rate": 9.577747858510647e-07, "loss": 0.065, "step": 1449 }, { "epoch": 0.6596906278434941, "grad_norm": 1.2241253736968625, "learning_rate": 9.57717280126327e-07, "loss": 0.082, "step": 1450 }, { "epoch": 0.6601455868971793, "grad_norm": 0.8117267870621638, "learning_rate": 9.576597369990004e-07, "loss": 0.0527, "step": 1451 }, { "epoch": 0.6606005459508644, "grad_norm": 0.7467740676105091, "learning_rate": 9.57602156473787e-07, "loss": 0.0337, "step": 1452 }, { "epoch": 0.6610555050045496, "grad_norm": 1.0345414809313083, "learning_rate": 9.575445385553916e-07, "loss": 0.0685, "step": 1453 }, { "epoch": 0.6615104640582348, "grad_norm": 1.1691795142918866, "learning_rate": 9.57486883248523e-07, "loss": 0.0733, "step": 1454 }, { "epoch": 0.66196542311192, "grad_norm": 1.7355365408172576, "learning_rate": 9.574291905578923e-07, "loss": 0.097, "step": 1455 }, { "epoch": 0.6624203821656051, "grad_norm": 0.9915278303254428, "learning_rate": 9.573714604882136e-07, "loss": 0.0376, "step": 1456 }, { "epoch": 0.6628753412192903, "grad_norm": 1.1568924245996273, "learning_rate": 9.573136930442049e-07, "loss": 0.0703, "step": 1457 }, { "epoch": 0.6633303002729755, "grad_norm": 1.096791491175382, "learning_rate": 9.572558882305863e-07, "loss": 0.0636, "step": 1458 }, { "epoch": 0.6637852593266605, "grad_norm": 1.3916740709110964, "learning_rate": 9.571980460520813e-07, "loss": 0.0895, "step": 1459 }, { "epoch": 0.6642402183803457, "grad_norm": 1.5307449268982747, "learning_rate": 9.571401665134169e-07, "loss": 0.0662, "step": 1460 }, { "epoch": 0.664695177434031, "grad_norm": 0.7828165232209624, "learning_rate": 9.570822496193225e-07, "loss": 0.0362, "step": 1461 }, { "epoch": 0.6651501364877161, "grad_norm": 0.7592010519931505, "learning_rate": 9.570242953745307e-07, "loss": 0.0566, "step": 1462 }, { "epoch": 0.6656050955414012, "grad_norm": 1.0505573175858869, "learning_rate": 9.569663037837775e-07, "loss": 0.0726, "step": 1463 }, { "epoch": 0.6660600545950864, "grad_norm": 0.8513824421975892, "learning_rate": 9.569082748518016e-07, "loss": 0.0493, "step": 1464 }, { "epoch": 0.6665150136487716, "grad_norm": 1.005243783723884, "learning_rate": 9.568502085833447e-07, "loss": 0.0555, "step": 1465 }, { "epoch": 0.6669699727024567, "grad_norm": 1.016800668341396, "learning_rate": 9.56792104983152e-07, "loss": 0.0616, "step": 1466 }, { "epoch": 0.6674249317561419, "grad_norm": 0.9285177508384169, "learning_rate": 9.567339640559714e-07, "loss": 0.0482, "step": 1467 }, { "epoch": 0.6678798908098271, "grad_norm": 0.8339104331509083, "learning_rate": 9.566757858065537e-07, "loss": 0.0464, "step": 1468 }, { "epoch": 0.6683348498635123, "grad_norm": 1.0367907649509682, "learning_rate": 9.566175702396532e-07, "loss": 0.0578, "step": 1469 }, { "epoch": 0.6687898089171974, "grad_norm": 1.2326076414281266, "learning_rate": 9.565593173600271e-07, "loss": 0.0668, "step": 1470 }, { "epoch": 0.6692447679708826, "grad_norm": 0.8924109722236269, "learning_rate": 9.56501027172435e-07, "loss": 0.0574, "step": 1471 }, { "epoch": 0.6696997270245678, "grad_norm": 1.0323230994671857, "learning_rate": 9.564426996816407e-07, "loss": 0.07, "step": 1472 }, { "epoch": 0.6701546860782529, "grad_norm": 1.0162610989040703, "learning_rate": 9.563843348924104e-07, "loss": 0.0426, "step": 1473 }, { "epoch": 0.6706096451319381, "grad_norm": 1.5414295920409435, "learning_rate": 9.56325932809513e-07, "loss": 0.0615, "step": 1474 }, { "epoch": 0.6710646041856233, "grad_norm": 0.910204824084491, "learning_rate": 9.562674934377212e-07, "loss": 0.0593, "step": 1475 }, { "epoch": 0.6715195632393085, "grad_norm": 1.0892680072882466, "learning_rate": 9.562090167818105e-07, "loss": 0.0667, "step": 1476 }, { "epoch": 0.6719745222929936, "grad_norm": 1.1263448855254186, "learning_rate": 9.56150502846559e-07, "loss": 0.0493, "step": 1477 }, { "epoch": 0.6724294813466788, "grad_norm": 0.8159372790533973, "learning_rate": 9.560919516367485e-07, "loss": 0.0467, "step": 1478 }, { "epoch": 0.672884440400364, "grad_norm": 0.803695342569013, "learning_rate": 9.560333631571634e-07, "loss": 0.0508, "step": 1479 }, { "epoch": 0.6733393994540491, "grad_norm": 0.9598073085454722, "learning_rate": 9.55974737412591e-07, "loss": 0.0687, "step": 1480 }, { "epoch": 0.6737943585077343, "grad_norm": 0.9763237033636989, "learning_rate": 9.559160744078225e-07, "loss": 0.0398, "step": 1481 }, { "epoch": 0.6742493175614195, "grad_norm": 1.1897628869086339, "learning_rate": 9.558573741476512e-07, "loss": 0.0584, "step": 1482 }, { "epoch": 0.6747042766151047, "grad_norm": 1.0741521921979502, "learning_rate": 9.55798636636874e-07, "loss": 0.0473, "step": 1483 }, { "epoch": 0.6751592356687898, "grad_norm": 1.3221194041907163, "learning_rate": 9.557398618802905e-07, "loss": 0.0684, "step": 1484 }, { "epoch": 0.675614194722475, "grad_norm": 0.9066999926559742, "learning_rate": 9.556810498827036e-07, "loss": 0.056, "step": 1485 }, { "epoch": 0.6760691537761602, "grad_norm": 0.9378443102469896, "learning_rate": 9.556222006489192e-07, "loss": 0.0605, "step": 1486 }, { "epoch": 0.6765241128298453, "grad_norm": 0.7546973063862114, "learning_rate": 9.55563314183746e-07, "loss": 0.0445, "step": 1487 }, { "epoch": 0.6769790718835305, "grad_norm": 0.8820386643693041, "learning_rate": 9.555043904919962e-07, "loss": 0.0526, "step": 1488 }, { "epoch": 0.6774340309372157, "grad_norm": 1.1229142009071518, "learning_rate": 9.554454295784847e-07, "loss": 0.0617, "step": 1489 }, { "epoch": 0.6778889899909009, "grad_norm": 0.9621337636885208, "learning_rate": 9.553864314480294e-07, "loss": 0.0544, "step": 1490 }, { "epoch": 0.678343949044586, "grad_norm": 2.057216107685726, "learning_rate": 9.553273961054514e-07, "loss": 0.0608, "step": 1491 }, { "epoch": 0.6787989080982711, "grad_norm": 1.3574520363299087, "learning_rate": 9.552683235555748e-07, "loss": 0.0615, "step": 1492 }, { "epoch": 0.6792538671519563, "grad_norm": 0.8857312875238341, "learning_rate": 9.552092138032268e-07, "loss": 0.0529, "step": 1493 }, { "epoch": 0.6797088262056415, "grad_norm": 1.2577057153594355, "learning_rate": 9.551500668532376e-07, "loss": 0.075, "step": 1494 }, { "epoch": 0.6801637852593266, "grad_norm": 0.7749521967632427, "learning_rate": 9.550908827104403e-07, "loss": 0.0421, "step": 1495 }, { "epoch": 0.6806187443130118, "grad_norm": 2.948558327893664, "learning_rate": 9.550316613796714e-07, "loss": 0.0921, "step": 1496 }, { "epoch": 0.681073703366697, "grad_norm": 0.9645231035028902, "learning_rate": 9.549724028657698e-07, "loss": 0.05, "step": 1497 }, { "epoch": 0.6815286624203821, "grad_norm": 0.8703562056816192, "learning_rate": 9.549131071735782e-07, "loss": 0.0534, "step": 1498 }, { "epoch": 0.6819836214740673, "grad_norm": 1.2820799583640452, "learning_rate": 9.54853774307942e-07, "loss": 0.0648, "step": 1499 }, { "epoch": 0.6824385805277525, "grad_norm": 1.1632959309154356, "learning_rate": 9.547944042737091e-07, "loss": 0.069, "step": 1500 }, { "epoch": 0.6828935395814377, "grad_norm": 1.2322730932080577, "learning_rate": 9.547349970757317e-07, "loss": 0.1109, "step": 1501 }, { "epoch": 0.6833484986351228, "grad_norm": 0.9530618052354195, "learning_rate": 9.546755527188637e-07, "loss": 0.0395, "step": 1502 }, { "epoch": 0.683803457688808, "grad_norm": 0.9573314398427334, "learning_rate": 9.546160712079628e-07, "loss": 0.0412, "step": 1503 }, { "epoch": 0.6842584167424932, "grad_norm": 1.0409838521186003, "learning_rate": 9.545565525478896e-07, "loss": 0.0532, "step": 1504 }, { "epoch": 0.6847133757961783, "grad_norm": 0.9378349323705331, "learning_rate": 9.544969967435078e-07, "loss": 0.054, "step": 1505 }, { "epoch": 0.6851683348498635, "grad_norm": 1.2271367975377327, "learning_rate": 9.544374037996838e-07, "loss": 0.0521, "step": 1506 }, { "epoch": 0.6856232939035487, "grad_norm": 1.4136812130964629, "learning_rate": 9.543777737212874e-07, "loss": 0.079, "step": 1507 }, { "epoch": 0.6860782529572339, "grad_norm": 0.8279301916689641, "learning_rate": 9.543181065131914e-07, "loss": 0.0313, "step": 1508 }, { "epoch": 0.686533212010919, "grad_norm": 1.1158131778374005, "learning_rate": 9.542584021802713e-07, "loss": 0.0472, "step": 1509 }, { "epoch": 0.6869881710646042, "grad_norm": 1.309709682070072, "learning_rate": 9.54198660727406e-07, "loss": 0.059, "step": 1510 }, { "epoch": 0.6874431301182894, "grad_norm": 0.8147714118458105, "learning_rate": 9.541388821594774e-07, "loss": 0.0496, "step": 1511 }, { "epoch": 0.6878980891719745, "grad_norm": 1.2645223926025457, "learning_rate": 9.5407906648137e-07, "loss": 0.0527, "step": 1512 }, { "epoch": 0.6883530482256597, "grad_norm": 1.0041510288859172, "learning_rate": 9.540192136979721e-07, "loss": 0.0573, "step": 1513 }, { "epoch": 0.6888080072793449, "grad_norm": 0.8021302288363799, "learning_rate": 9.539593238141742e-07, "loss": 0.052, "step": 1514 }, { "epoch": 0.6892629663330301, "grad_norm": 0.7215684743513331, "learning_rate": 9.538993968348706e-07, "loss": 0.0415, "step": 1515 }, { "epoch": 0.6897179253867152, "grad_norm": 1.1672097183922474, "learning_rate": 9.53839432764958e-07, "loss": 0.0495, "step": 1516 }, { "epoch": 0.6901728844404004, "grad_norm": 1.1043485378580087, "learning_rate": 9.537794316093366e-07, "loss": 0.0562, "step": 1517 }, { "epoch": 0.6906278434940856, "grad_norm": 0.8606458963398059, "learning_rate": 9.53719393372909e-07, "loss": 0.0416, "step": 1518 }, { "epoch": 0.6910828025477707, "grad_norm": 1.5257831048064958, "learning_rate": 9.536593180605819e-07, "loss": 0.0998, "step": 1519 }, { "epoch": 0.6915377616014559, "grad_norm": 1.2806517133370892, "learning_rate": 9.535992056772638e-07, "loss": 0.0788, "step": 1520 }, { "epoch": 0.6919927206551411, "grad_norm": 1.5061227516422715, "learning_rate": 9.535390562278671e-07, "loss": 0.057, "step": 1521 }, { "epoch": 0.6924476797088263, "grad_norm": 1.2226473482934994, "learning_rate": 9.53478869717307e-07, "loss": 0.0548, "step": 1522 }, { "epoch": 0.6929026387625113, "grad_norm": 1.3050139099343194, "learning_rate": 9.534186461505014e-07, "loss": 0.0792, "step": 1523 }, { "epoch": 0.6933575978161965, "grad_norm": 0.9725783602989069, "learning_rate": 9.533583855323717e-07, "loss": 0.0648, "step": 1524 }, { "epoch": 0.6938125568698817, "grad_norm": 1.4697272129227426, "learning_rate": 9.532980878678422e-07, "loss": 0.0589, "step": 1525 }, { "epoch": 0.6942675159235668, "grad_norm": 1.4222239205263287, "learning_rate": 9.532377531618398e-07, "loss": 0.0527, "step": 1526 }, { "epoch": 0.694722474977252, "grad_norm": 1.0935755548693302, "learning_rate": 9.531773814192952e-07, "loss": 0.0568, "step": 1527 }, { "epoch": 0.6951774340309372, "grad_norm": 1.2984989365288144, "learning_rate": 9.531169726451416e-07, "loss": 0.0853, "step": 1528 }, { "epoch": 0.6956323930846224, "grad_norm": 1.1353349667594435, "learning_rate": 9.530565268443151e-07, "loss": 0.0722, "step": 1529 }, { "epoch": 0.6960873521383075, "grad_norm": 1.3395343595439133, "learning_rate": 9.529960440217553e-07, "loss": 0.0688, "step": 1530 }, { "epoch": 0.6965423111919927, "grad_norm": 1.4217237670654193, "learning_rate": 9.529355241824044e-07, "loss": 0.0825, "step": 1531 }, { "epoch": 0.6969972702456779, "grad_norm": 0.8925333602531222, "learning_rate": 9.528749673312079e-07, "loss": 0.0529, "step": 1532 }, { "epoch": 0.697452229299363, "grad_norm": 0.8111216308483729, "learning_rate": 9.528143734731142e-07, "loss": 0.0626, "step": 1533 }, { "epoch": 0.6979071883530482, "grad_norm": 0.9978978567775276, "learning_rate": 9.52753742613075e-07, "loss": 0.0512, "step": 1534 }, { "epoch": 0.6983621474067334, "grad_norm": 1.5494080922306182, "learning_rate": 9.526930747560444e-07, "loss": 0.0713, "step": 1535 }, { "epoch": 0.6988171064604186, "grad_norm": 1.1158760853117153, "learning_rate": 9.526323699069801e-07, "loss": 0.0518, "step": 1536 }, { "epoch": 0.6992720655141037, "grad_norm": 0.7430562065121343, "learning_rate": 9.525716280708426e-07, "loss": 0.0456, "step": 1537 }, { "epoch": 0.6997270245677889, "grad_norm": 0.9298844811963138, "learning_rate": 9.525108492525956e-07, "loss": 0.0598, "step": 1538 }, { "epoch": 0.7001819836214741, "grad_norm": 0.6175936735837297, "learning_rate": 9.524500334572053e-07, "loss": 0.0259, "step": 1539 }, { "epoch": 0.7006369426751592, "grad_norm": 1.152214832908206, "learning_rate": 9.523891806896415e-07, "loss": 0.0746, "step": 1540 }, { "epoch": 0.7010919017288444, "grad_norm": 0.6760971193433334, "learning_rate": 9.523282909548771e-07, "loss": 0.0447, "step": 1541 }, { "epoch": 0.7015468607825296, "grad_norm": 1.1649570725023224, "learning_rate": 9.522673642578872e-07, "loss": 0.073, "step": 1542 }, { "epoch": 0.7020018198362148, "grad_norm": 0.7016305510187092, "learning_rate": 9.522064006036507e-07, "loss": 0.0472, "step": 1543 }, { "epoch": 0.7024567788898999, "grad_norm": 0.8972613088855633, "learning_rate": 9.521453999971495e-07, "loss": 0.0353, "step": 1544 }, { "epoch": 0.7029117379435851, "grad_norm": 1.0216685243900925, "learning_rate": 9.520843624433679e-07, "loss": 0.0484, "step": 1545 }, { "epoch": 0.7033666969972703, "grad_norm": 1.3111654892330646, "learning_rate": 9.52023287947294e-07, "loss": 0.0854, "step": 1546 }, { "epoch": 0.7038216560509554, "grad_norm": 1.5343905151376545, "learning_rate": 9.51962176513918e-07, "loss": 0.0891, "step": 1547 }, { "epoch": 0.7042766151046406, "grad_norm": 0.5514746116297383, "learning_rate": 9.519010281482342e-07, "loss": 0.0304, "step": 1548 }, { "epoch": 0.7047315741583258, "grad_norm": 1.5404953666349488, "learning_rate": 9.518398428552393e-07, "loss": 0.0884, "step": 1549 }, { "epoch": 0.705186533212011, "grad_norm": 0.7715311748200352, "learning_rate": 9.517786206399327e-07, "loss": 0.0521, "step": 1550 }, { "epoch": 0.7056414922656961, "grad_norm": 1.00867860420815, "learning_rate": 9.517173615073175e-07, "loss": 0.0546, "step": 1551 }, { "epoch": 0.7060964513193813, "grad_norm": 0.623174643693698, "learning_rate": 9.516560654623995e-07, "loss": 0.0336, "step": 1552 }, { "epoch": 0.7065514103730665, "grad_norm": 1.2060464210757715, "learning_rate": 9.515947325101874e-07, "loss": 0.0606, "step": 1553 }, { "epoch": 0.7070063694267515, "grad_norm": 0.7928237380493136, "learning_rate": 9.515333626556932e-07, "loss": 0.0446, "step": 1554 }, { "epoch": 0.7074613284804367, "grad_norm": 1.0193036635032333, "learning_rate": 9.514719559039317e-07, "loss": 0.0642, "step": 1555 }, { "epoch": 0.707916287534122, "grad_norm": 1.1393141476843611, "learning_rate": 9.514105122599208e-07, "loss": 0.0868, "step": 1556 }, { "epoch": 0.7083712465878071, "grad_norm": 1.3113417326135421, "learning_rate": 9.513490317286813e-07, "loss": 0.0843, "step": 1557 }, { "epoch": 0.7088262056414922, "grad_norm": 1.0484604363306664, "learning_rate": 9.512875143152372e-07, "loss": 0.0645, "step": 1558 }, { "epoch": 0.7092811646951774, "grad_norm": 0.5547791593290061, "learning_rate": 9.512259600246156e-07, "loss": 0.0227, "step": 1559 }, { "epoch": 0.7097361237488626, "grad_norm": 0.8625906565868993, "learning_rate": 9.511643688618462e-07, "loss": 0.0663, "step": 1560 }, { "epoch": 0.7101910828025477, "grad_norm": 1.0987191083831986, "learning_rate": 9.511027408319619e-07, "loss": 0.0628, "step": 1561 }, { "epoch": 0.7106460418562329, "grad_norm": 1.0427741774956505, "learning_rate": 9.51041075939999e-07, "loss": 0.0571, "step": 1562 }, { "epoch": 0.7111010009099181, "grad_norm": 0.9229135460412132, "learning_rate": 9.50979374190996e-07, "loss": 0.0558, "step": 1563 }, { "epoch": 0.7115559599636033, "grad_norm": 1.1620256944434195, "learning_rate": 9.509176355899953e-07, "loss": 0.0595, "step": 1564 }, { "epoch": 0.7120109190172884, "grad_norm": 0.9625550767974997, "learning_rate": 9.508558601420417e-07, "loss": 0.0454, "step": 1565 }, { "epoch": 0.7124658780709736, "grad_norm": 1.0757617669214439, "learning_rate": 9.507940478521832e-07, "loss": 0.079, "step": 1566 }, { "epoch": 0.7129208371246588, "grad_norm": 0.8157411763121906, "learning_rate": 9.507321987254711e-07, "loss": 0.0355, "step": 1567 }, { "epoch": 0.7133757961783439, "grad_norm": 1.4312669337917656, "learning_rate": 9.506703127669588e-07, "loss": 0.0745, "step": 1568 }, { "epoch": 0.7138307552320291, "grad_norm": 1.3832674539690235, "learning_rate": 9.506083899817041e-07, "loss": 0.0544, "step": 1569 }, { "epoch": 0.7142857142857143, "grad_norm": 1.3256870477303846, "learning_rate": 9.505464303747667e-07, "loss": 0.0683, "step": 1570 }, { "epoch": 0.7147406733393995, "grad_norm": 0.8458850476594002, "learning_rate": 9.504844339512094e-07, "loss": 0.0568, "step": 1571 }, { "epoch": 0.7151956323930846, "grad_norm": 0.7609487280945063, "learning_rate": 9.504224007160988e-07, "loss": 0.0395, "step": 1572 }, { "epoch": 0.7156505914467698, "grad_norm": 1.0149855941085815, "learning_rate": 9.503603306745036e-07, "loss": 0.0431, "step": 1573 }, { "epoch": 0.716105550500455, "grad_norm": 1.0323702805877413, "learning_rate": 9.502982238314961e-07, "loss": 0.0565, "step": 1574 }, { "epoch": 0.7165605095541401, "grad_norm": 0.6545677701780207, "learning_rate": 9.50236080192151e-07, "loss": 0.0386, "step": 1575 }, { "epoch": 0.7170154686078253, "grad_norm": 1.1918691782274384, "learning_rate": 9.501738997615469e-07, "loss": 0.0645, "step": 1576 }, { "epoch": 0.7174704276615105, "grad_norm": 1.0872351557499234, "learning_rate": 9.501116825447646e-07, "loss": 0.0466, "step": 1577 }, { "epoch": 0.7179253867151957, "grad_norm": 0.6960629571971281, "learning_rate": 9.500494285468883e-07, "loss": 0.0343, "step": 1578 }, { "epoch": 0.7183803457688808, "grad_norm": 1.091955568868288, "learning_rate": 9.499871377730051e-07, "loss": 0.0777, "step": 1579 }, { "epoch": 0.718835304822566, "grad_norm": 0.6211628672072906, "learning_rate": 9.499248102282052e-07, "loss": 0.0294, "step": 1580 }, { "epoch": 0.7192902638762512, "grad_norm": 0.6862211642245908, "learning_rate": 9.498624459175815e-07, "loss": 0.0439, "step": 1581 }, { "epoch": 0.7197452229299363, "grad_norm": 1.132931437210576, "learning_rate": 9.498000448462304e-07, "loss": 0.0711, "step": 1582 }, { "epoch": 0.7202001819836215, "grad_norm": 0.8280219653942137, "learning_rate": 9.497376070192509e-07, "loss": 0.0519, "step": 1583 }, { "epoch": 0.7206551410373067, "grad_norm": 0.8018959944411896, "learning_rate": 9.496751324417451e-07, "loss": 0.0328, "step": 1584 }, { "epoch": 0.7211101000909919, "grad_norm": 1.2263130429602858, "learning_rate": 9.496126211188182e-07, "loss": 0.0805, "step": 1585 }, { "epoch": 0.721565059144677, "grad_norm": 0.9913004233307587, "learning_rate": 9.495500730555783e-07, "loss": 0.0663, "step": 1586 }, { "epoch": 0.7220200181983621, "grad_norm": 0.9712781825840319, "learning_rate": 9.494874882571367e-07, "loss": 0.0285, "step": 1587 }, { "epoch": 0.7224749772520473, "grad_norm": 1.0669266342043542, "learning_rate": 9.494248667286073e-07, "loss": 0.0662, "step": 1588 }, { "epoch": 0.7229299363057324, "grad_norm": 1.0113741816269124, "learning_rate": 9.493622084751075e-07, "loss": 0.0563, "step": 1589 }, { "epoch": 0.7233848953594176, "grad_norm": 0.8827493788551256, "learning_rate": 9.492995135017573e-07, "loss": 0.0483, "step": 1590 }, { "epoch": 0.7238398544131028, "grad_norm": 0.8640941377578134, "learning_rate": 9.492367818136798e-07, "loss": 0.0616, "step": 1591 }, { "epoch": 0.724294813466788, "grad_norm": 1.1122349769969266, "learning_rate": 9.491740134160013e-07, "loss": 0.053, "step": 1592 }, { "epoch": 0.7247497725204731, "grad_norm": 1.6701201748885373, "learning_rate": 9.491112083138508e-07, "loss": 0.0896, "step": 1593 }, { "epoch": 0.7252047315741583, "grad_norm": 1.0527306705691355, "learning_rate": 9.490483665123605e-07, "loss": 0.0399, "step": 1594 }, { "epoch": 0.7256596906278435, "grad_norm": 0.7495648365576982, "learning_rate": 9.489854880166657e-07, "loss": 0.0471, "step": 1595 }, { "epoch": 0.7261146496815286, "grad_norm": 0.7983459994122855, "learning_rate": 9.489225728319044e-07, "loss": 0.0449, "step": 1596 }, { "epoch": 0.7265696087352138, "grad_norm": 1.1635685734026922, "learning_rate": 9.488596209632177e-07, "loss": 0.0753, "step": 1597 }, { "epoch": 0.727024567788899, "grad_norm": 0.8962410545977373, "learning_rate": 9.487966324157498e-07, "loss": 0.0387, "step": 1598 }, { "epoch": 0.7274795268425842, "grad_norm": 1.6300554763231074, "learning_rate": 9.487336071946479e-07, "loss": 0.1107, "step": 1599 }, { "epoch": 0.7279344858962693, "grad_norm": 1.0608066705162214, "learning_rate": 9.486705453050621e-07, "loss": 0.0559, "step": 1600 }, { "epoch": 0.7283894449499545, "grad_norm": 0.7888961668281369, "learning_rate": 9.486074467521455e-07, "loss": 0.0441, "step": 1601 }, { "epoch": 0.7288444040036397, "grad_norm": 1.1564237677410028, "learning_rate": 9.485443115410541e-07, "loss": 0.0459, "step": 1602 }, { "epoch": 0.7292993630573248, "grad_norm": 0.8454642605214944, "learning_rate": 9.484811396769473e-07, "loss": 0.0663, "step": 1603 }, { "epoch": 0.72975432211101, "grad_norm": 1.1651767523165664, "learning_rate": 9.484179311649871e-07, "loss": 0.0696, "step": 1604 }, { "epoch": 0.7302092811646952, "grad_norm": 0.634992051258353, "learning_rate": 9.483546860103386e-07, "loss": 0.0337, "step": 1605 }, { "epoch": 0.7306642402183804, "grad_norm": 0.9998720316028941, "learning_rate": 9.482914042181699e-07, "loss": 0.0543, "step": 1606 }, { "epoch": 0.7311191992720655, "grad_norm": 1.3354768080452477, "learning_rate": 9.482280857936521e-07, "loss": 0.0746, "step": 1607 }, { "epoch": 0.7315741583257507, "grad_norm": 0.5104976427621315, "learning_rate": 9.481647307419594e-07, "loss": 0.0296, "step": 1608 }, { "epoch": 0.7320291173794359, "grad_norm": 0.76964512607771, "learning_rate": 9.481013390682686e-07, "loss": 0.0469, "step": 1609 }, { "epoch": 0.732484076433121, "grad_norm": 1.17884905427705, "learning_rate": 9.480379107777601e-07, "loss": 0.0809, "step": 1610 }, { "epoch": 0.7329390354868062, "grad_norm": 0.865753319408929, "learning_rate": 9.479744458756169e-07, "loss": 0.0477, "step": 1611 }, { "epoch": 0.7333939945404914, "grad_norm": 1.0744559695873264, "learning_rate": 9.479109443670249e-07, "loss": 0.0454, "step": 1612 }, { "epoch": 0.7338489535941766, "grad_norm": 1.1969867584557983, "learning_rate": 9.478474062571733e-07, "loss": 0.0779, "step": 1613 }, { "epoch": 0.7343039126478617, "grad_norm": 1.187213311437901, "learning_rate": 9.477838315512543e-07, "loss": 0.0647, "step": 1614 }, { "epoch": 0.7347588717015469, "grad_norm": 0.8979107869524696, "learning_rate": 9.477202202544625e-07, "loss": 0.0572, "step": 1615 }, { "epoch": 0.7352138307552321, "grad_norm": 0.910357870222474, "learning_rate": 9.476565723719964e-07, "loss": 0.0487, "step": 1616 }, { "epoch": 0.7356687898089171, "grad_norm": 1.0483367257192846, "learning_rate": 9.475928879090567e-07, "loss": 0.0572, "step": 1617 }, { "epoch": 0.7361237488626023, "grad_norm": 0.9009319798356943, "learning_rate": 9.475291668708474e-07, "loss": 0.0455, "step": 1618 }, { "epoch": 0.7365787079162875, "grad_norm": 1.0063686612423808, "learning_rate": 9.474654092625758e-07, "loss": 0.0377, "step": 1619 }, { "epoch": 0.7370336669699727, "grad_norm": 1.021902616963658, "learning_rate": 9.474016150894517e-07, "loss": 0.0475, "step": 1620 }, { "epoch": 0.7374886260236578, "grad_norm": 0.9818359153240885, "learning_rate": 9.473377843566879e-07, "loss": 0.0587, "step": 1621 }, { "epoch": 0.737943585077343, "grad_norm": 0.9846308273245427, "learning_rate": 9.472739170695006e-07, "loss": 0.0603, "step": 1622 }, { "epoch": 0.7383985441310282, "grad_norm": 1.4563767871770792, "learning_rate": 9.472100132331087e-07, "loss": 0.061, "step": 1623 }, { "epoch": 0.7388535031847133, "grad_norm": 1.252191091111057, "learning_rate": 9.471460728527341e-07, "loss": 0.0725, "step": 1624 }, { "epoch": 0.7393084622383985, "grad_norm": 0.8982284986509258, "learning_rate": 9.470820959336017e-07, "loss": 0.0542, "step": 1625 }, { "epoch": 0.7397634212920837, "grad_norm": 0.9901495084520407, "learning_rate": 9.470180824809393e-07, "loss": 0.0649, "step": 1626 }, { "epoch": 0.7402183803457689, "grad_norm": 0.9465415516229594, "learning_rate": 9.46954032499978e-07, "loss": 0.0419, "step": 1627 }, { "epoch": 0.740673339399454, "grad_norm": 0.9070059843174324, "learning_rate": 9.468899459959518e-07, "loss": 0.0368, "step": 1628 }, { "epoch": 0.7411282984531392, "grad_norm": 1.169629084864992, "learning_rate": 9.46825822974097e-07, "loss": 0.0596, "step": 1629 }, { "epoch": 0.7415832575068244, "grad_norm": 0.7954083171402722, "learning_rate": 9.46761663439654e-07, "loss": 0.0319, "step": 1630 }, { "epoch": 0.7420382165605095, "grad_norm": 0.8601801083818359, "learning_rate": 9.466974673978653e-07, "loss": 0.047, "step": 1631 }, { "epoch": 0.7424931756141947, "grad_norm": 1.157434973261911, "learning_rate": 9.466332348539772e-07, "loss": 0.064, "step": 1632 }, { "epoch": 0.7429481346678799, "grad_norm": 1.1981691266234098, "learning_rate": 9.465689658132378e-07, "loss": 0.0645, "step": 1633 }, { "epoch": 0.7434030937215651, "grad_norm": 1.4386290722235262, "learning_rate": 9.465046602808992e-07, "loss": 0.0621, "step": 1634 }, { "epoch": 0.7438580527752502, "grad_norm": 1.2263101353593575, "learning_rate": 9.464403182622163e-07, "loss": 0.0748, "step": 1635 }, { "epoch": 0.7443130118289354, "grad_norm": 1.3462370355699171, "learning_rate": 9.463759397624465e-07, "loss": 0.0747, "step": 1636 }, { "epoch": 0.7447679708826206, "grad_norm": 0.7671014872074868, "learning_rate": 9.463115247868509e-07, "loss": 0.0379, "step": 1637 }, { "epoch": 0.7452229299363057, "grad_norm": 1.7677215021436725, "learning_rate": 9.462470733406929e-07, "loss": 0.0877, "step": 1638 }, { "epoch": 0.7456778889899909, "grad_norm": 1.636409165661253, "learning_rate": 9.461825854292393e-07, "loss": 0.0763, "step": 1639 }, { "epoch": 0.7461328480436761, "grad_norm": 0.9189032789803171, "learning_rate": 9.461180610577598e-07, "loss": 0.054, "step": 1640 }, { "epoch": 0.7465878070973613, "grad_norm": 1.1954973063028607, "learning_rate": 9.460535002315271e-07, "loss": 0.0528, "step": 1641 }, { "epoch": 0.7470427661510464, "grad_norm": 1.2324820562907415, "learning_rate": 9.459889029558165e-07, "loss": 0.0539, "step": 1642 }, { "epoch": 0.7474977252047316, "grad_norm": 0.8798556249970283, "learning_rate": 9.459242692359071e-07, "loss": 0.0496, "step": 1643 }, { "epoch": 0.7479526842584168, "grad_norm": 0.9250224465459985, "learning_rate": 9.458595990770799e-07, "loss": 0.0391, "step": 1644 }, { "epoch": 0.7484076433121019, "grad_norm": 0.8187637102760383, "learning_rate": 9.4579489248462e-07, "loss": 0.033, "step": 1645 }, { "epoch": 0.7488626023657871, "grad_norm": 2.119612440254283, "learning_rate": 9.457301494638146e-07, "loss": 0.0718, "step": 1646 }, { "epoch": 0.7493175614194723, "grad_norm": 1.5657085128763972, "learning_rate": 9.456653700199541e-07, "loss": 0.0681, "step": 1647 }, { "epoch": 0.7497725204731575, "grad_norm": 0.8209647344422758, "learning_rate": 9.456005541583325e-07, "loss": 0.0553, "step": 1648 }, { "epoch": 0.7502274795268425, "grad_norm": 0.8975249756258198, "learning_rate": 9.455357018842457e-07, "loss": 0.0484, "step": 1649 }, { "epoch": 0.7506824385805277, "grad_norm": 0.610402401871437, "learning_rate": 9.454708132029933e-07, "loss": 0.0315, "step": 1650 }, { "epoch": 0.7511373976342129, "grad_norm": 1.255586673826788, "learning_rate": 9.45405888119878e-07, "loss": 0.0499, "step": 1651 }, { "epoch": 0.7515923566878981, "grad_norm": 0.8387462153335264, "learning_rate": 9.453409266402049e-07, "loss": 0.0455, "step": 1652 }, { "epoch": 0.7520473157415832, "grad_norm": 1.3468337682263247, "learning_rate": 9.452759287692823e-07, "loss": 0.0638, "step": 1653 }, { "epoch": 0.7525022747952684, "grad_norm": 1.3943264412350056, "learning_rate": 9.452108945124218e-07, "loss": 0.0677, "step": 1654 }, { "epoch": 0.7529572338489536, "grad_norm": 1.4423867126872867, "learning_rate": 9.451458238749373e-07, "loss": 0.0361, "step": 1655 }, { "epoch": 0.7534121929026387, "grad_norm": 0.8239918464236422, "learning_rate": 9.450807168621467e-07, "loss": 0.0351, "step": 1656 }, { "epoch": 0.7538671519563239, "grad_norm": 1.1656066933904463, "learning_rate": 9.450155734793695e-07, "loss": 0.041, "step": 1657 }, { "epoch": 0.7543221110100091, "grad_norm": 0.904749888404318, "learning_rate": 9.449503937319296e-07, "loss": 0.07, "step": 1658 }, { "epoch": 0.7547770700636943, "grad_norm": 1.211441167380313, "learning_rate": 9.448851776251527e-07, "loss": 0.0655, "step": 1659 }, { "epoch": 0.7552320291173794, "grad_norm": 1.0223359197945585, "learning_rate": 9.448199251643682e-07, "loss": 0.0518, "step": 1660 }, { "epoch": 0.7556869881710646, "grad_norm": 1.1165011785652879, "learning_rate": 9.447546363549084e-07, "loss": 0.0611, "step": 1661 }, { "epoch": 0.7561419472247498, "grad_norm": 1.187876262729752, "learning_rate": 9.446893112021081e-07, "loss": 0.0569, "step": 1662 }, { "epoch": 0.7565969062784349, "grad_norm": 0.7736235536604416, "learning_rate": 9.446239497113054e-07, "loss": 0.0588, "step": 1663 }, { "epoch": 0.7570518653321201, "grad_norm": 1.125570613283056, "learning_rate": 9.445585518878416e-07, "loss": 0.0655, "step": 1664 }, { "epoch": 0.7575068243858053, "grad_norm": 1.2870512696875822, "learning_rate": 9.444931177370604e-07, "loss": 0.0897, "step": 1665 }, { "epoch": 0.7579617834394905, "grad_norm": 0.7216204190158066, "learning_rate": 9.444276472643089e-07, "loss": 0.0333, "step": 1666 }, { "epoch": 0.7584167424931756, "grad_norm": 1.065865543424559, "learning_rate": 9.443621404749372e-07, "loss": 0.0449, "step": 1667 }, { "epoch": 0.7588717015468608, "grad_norm": 0.8446708090262295, "learning_rate": 9.442965973742981e-07, "loss": 0.0428, "step": 1668 }, { "epoch": 0.759326660600546, "grad_norm": 1.7978710967014038, "learning_rate": 9.442310179677475e-07, "loss": 0.061, "step": 1669 }, { "epoch": 0.7597816196542311, "grad_norm": 0.9417749685345189, "learning_rate": 9.441654022606443e-07, "loss": 0.0465, "step": 1670 }, { "epoch": 0.7602365787079163, "grad_norm": 1.1082188593230238, "learning_rate": 9.440997502583501e-07, "loss": 0.0661, "step": 1671 }, { "epoch": 0.7606915377616015, "grad_norm": 1.38395787534029, "learning_rate": 9.440340619662299e-07, "loss": 0.0958, "step": 1672 }, { "epoch": 0.7611464968152867, "grad_norm": 1.2099658157876856, "learning_rate": 9.439683373896515e-07, "loss": 0.0657, "step": 1673 }, { "epoch": 0.7616014558689718, "grad_norm": 1.3392185771148815, "learning_rate": 9.439025765339852e-07, "loss": 0.0799, "step": 1674 }, { "epoch": 0.762056414922657, "grad_norm": 0.8545669627599219, "learning_rate": 9.438367794046053e-07, "loss": 0.0396, "step": 1675 }, { "epoch": 0.7625113739763422, "grad_norm": 1.0258478192810774, "learning_rate": 9.43770946006888e-07, "loss": 0.0456, "step": 1676 }, { "epoch": 0.7629663330300273, "grad_norm": 0.8661866766015353, "learning_rate": 9.437050763462131e-07, "loss": 0.0483, "step": 1677 }, { "epoch": 0.7634212920837125, "grad_norm": 1.374889653184246, "learning_rate": 9.436391704279631e-07, "loss": 0.0828, "step": 1678 }, { "epoch": 0.7638762511373977, "grad_norm": 1.7726162082835275, "learning_rate": 9.435732282575235e-07, "loss": 0.1057, "step": 1679 }, { "epoch": 0.7643312101910829, "grad_norm": 1.0758612355599462, "learning_rate": 9.435072498402829e-07, "loss": 0.0599, "step": 1680 }, { "epoch": 0.7647861692447679, "grad_norm": 0.7578331598161245, "learning_rate": 9.434412351816328e-07, "loss": 0.0441, "step": 1681 }, { "epoch": 0.7652411282984531, "grad_norm": 1.2981661318091267, "learning_rate": 9.433751842869674e-07, "loss": 0.0677, "step": 1682 }, { "epoch": 0.7656960873521383, "grad_norm": 1.3070838821724735, "learning_rate": 9.433090971616842e-07, "loss": 0.0642, "step": 1683 }, { "epoch": 0.7661510464058234, "grad_norm": 1.2256714856269175, "learning_rate": 9.432429738111836e-07, "loss": 0.0611, "step": 1684 }, { "epoch": 0.7666060054595086, "grad_norm": 1.0374078346904088, "learning_rate": 9.431768142408687e-07, "loss": 0.0551, "step": 1685 }, { "epoch": 0.7670609645131938, "grad_norm": 0.87290848309223, "learning_rate": 9.431106184561462e-07, "loss": 0.0567, "step": 1686 }, { "epoch": 0.767515923566879, "grad_norm": 1.1014216318430343, "learning_rate": 9.430443864624248e-07, "loss": 0.0643, "step": 1687 }, { "epoch": 0.7679708826205641, "grad_norm": 1.5582842583289689, "learning_rate": 9.42978118265117e-07, "loss": 0.0804, "step": 1688 }, { "epoch": 0.7684258416742493, "grad_norm": 0.8191912918388858, "learning_rate": 9.429118138696377e-07, "loss": 0.0424, "step": 1689 }, { "epoch": 0.7688808007279345, "grad_norm": 1.1710383443949361, "learning_rate": 9.428454732814053e-07, "loss": 0.0566, "step": 1690 }, { "epoch": 0.7693357597816196, "grad_norm": 0.9384113578203046, "learning_rate": 9.427790965058406e-07, "loss": 0.0468, "step": 1691 }, { "epoch": 0.7697907188353048, "grad_norm": 1.0801136169134784, "learning_rate": 9.427126835483679e-07, "loss": 0.0556, "step": 1692 }, { "epoch": 0.77024567788899, "grad_norm": 1.2086015141903894, "learning_rate": 9.426462344144137e-07, "loss": 0.0566, "step": 1693 }, { "epoch": 0.7707006369426752, "grad_norm": 1.062202709046726, "learning_rate": 9.425797491094084e-07, "loss": 0.0541, "step": 1694 }, { "epoch": 0.7711555959963603, "grad_norm": 1.2483272255459357, "learning_rate": 9.425132276387846e-07, "loss": 0.0761, "step": 1695 }, { "epoch": 0.7716105550500455, "grad_norm": 0.7956593056190194, "learning_rate": 9.424466700079783e-07, "loss": 0.0431, "step": 1696 }, { "epoch": 0.7720655141037307, "grad_norm": 0.7529112834435677, "learning_rate": 9.423800762224281e-07, "loss": 0.03, "step": 1697 }, { "epoch": 0.7725204731574158, "grad_norm": 0.703116523077997, "learning_rate": 9.423134462875758e-07, "loss": 0.0357, "step": 1698 }, { "epoch": 0.772975432211101, "grad_norm": 0.7610529129906067, "learning_rate": 9.422467802088663e-07, "loss": 0.0546, "step": 1699 }, { "epoch": 0.7734303912647862, "grad_norm": 0.898947966509435, "learning_rate": 9.421800779917469e-07, "loss": 0.0572, "step": 1700 }, { "epoch": 0.7738853503184714, "grad_norm": 1.4191776517652666, "learning_rate": 9.421133396416685e-07, "loss": 0.0896, "step": 1701 }, { "epoch": 0.7743403093721565, "grad_norm": 0.8954001056582984, "learning_rate": 9.420465651640846e-07, "loss": 0.0536, "step": 1702 }, { "epoch": 0.7747952684258417, "grad_norm": 0.6546139758124209, "learning_rate": 9.419797545644514e-07, "loss": 0.0314, "step": 1703 }, { "epoch": 0.7752502274795269, "grad_norm": 0.7845685794529155, "learning_rate": 9.419129078482288e-07, "loss": 0.0443, "step": 1704 }, { "epoch": 0.775705186533212, "grad_norm": 0.77873153901093, "learning_rate": 9.418460250208791e-07, "loss": 0.0493, "step": 1705 }, { "epoch": 0.7761601455868972, "grad_norm": 0.7777834735237951, "learning_rate": 9.417791060878676e-07, "loss": 0.0317, "step": 1706 }, { "epoch": 0.7766151046405824, "grad_norm": 0.7796134172440536, "learning_rate": 9.417121510546625e-07, "loss": 0.0295, "step": 1707 }, { "epoch": 0.7770700636942676, "grad_norm": 1.5561109957696264, "learning_rate": 9.416451599267352e-07, "loss": 0.0874, "step": 1708 }, { "epoch": 0.7775250227479527, "grad_norm": 0.7700389850183532, "learning_rate": 9.4157813270956e-07, "loss": 0.0574, "step": 1709 }, { "epoch": 0.7779799818016379, "grad_norm": 0.8630715417604102, "learning_rate": 9.415110694086138e-07, "loss": 0.0622, "step": 1710 }, { "epoch": 0.778434940855323, "grad_norm": 1.9932648548393246, "learning_rate": 9.414439700293768e-07, "loss": 0.0734, "step": 1711 }, { "epoch": 0.7788898999090081, "grad_norm": 0.9108071580383189, "learning_rate": 9.413768345773323e-07, "loss": 0.0638, "step": 1712 }, { "epoch": 0.7793448589626933, "grad_norm": 0.7091607099299985, "learning_rate": 9.41309663057966e-07, "loss": 0.0486, "step": 1713 }, { "epoch": 0.7797998180163785, "grad_norm": 0.7629590923767112, "learning_rate": 9.41242455476767e-07, "loss": 0.0409, "step": 1714 }, { "epoch": 0.7802547770700637, "grad_norm": 1.1005006607662584, "learning_rate": 9.411752118392271e-07, "loss": 0.0668, "step": 1715 }, { "epoch": 0.7807097361237488, "grad_norm": 0.9368099195947924, "learning_rate": 9.411079321508414e-07, "loss": 0.0648, "step": 1716 }, { "epoch": 0.781164695177434, "grad_norm": 0.9950621198003168, "learning_rate": 9.410406164171074e-07, "loss": 0.0508, "step": 1717 }, { "epoch": 0.7816196542311192, "grad_norm": 1.4233715677906844, "learning_rate": 9.40973264643526e-07, "loss": 0.0746, "step": 1718 }, { "epoch": 0.7820746132848043, "grad_norm": 1.4052062617699765, "learning_rate": 9.409058768356006e-07, "loss": 0.0485, "step": 1719 }, { "epoch": 0.7825295723384895, "grad_norm": 0.6994078647785952, "learning_rate": 9.408384529988384e-07, "loss": 0.0342, "step": 1720 }, { "epoch": 0.7829845313921747, "grad_norm": 0.6180981810000639, "learning_rate": 9.407709931387484e-07, "loss": 0.0318, "step": 1721 }, { "epoch": 0.7834394904458599, "grad_norm": 1.1265776256056161, "learning_rate": 9.407034972608434e-07, "loss": 0.0622, "step": 1722 }, { "epoch": 0.783894449499545, "grad_norm": 0.8194162799814318, "learning_rate": 9.406359653706388e-07, "loss": 0.0492, "step": 1723 }, { "epoch": 0.7843494085532302, "grad_norm": 1.2526092679504774, "learning_rate": 9.405683974736531e-07, "loss": 0.0538, "step": 1724 }, { "epoch": 0.7848043676069154, "grad_norm": 0.7889705061713871, "learning_rate": 9.405007935754074e-07, "loss": 0.0345, "step": 1725 }, { "epoch": 0.7852593266606005, "grad_norm": 1.4769455228907302, "learning_rate": 9.404331536814263e-07, "loss": 0.0872, "step": 1726 }, { "epoch": 0.7857142857142857, "grad_norm": 0.8612981121762904, "learning_rate": 9.403654777972368e-07, "loss": 0.0557, "step": 1727 }, { "epoch": 0.7861692447679709, "grad_norm": 1.001332837179155, "learning_rate": 9.402977659283689e-07, "loss": 0.0469, "step": 1728 }, { "epoch": 0.7866242038216561, "grad_norm": 0.8608892509025668, "learning_rate": 9.402300180803562e-07, "loss": 0.0523, "step": 1729 }, { "epoch": 0.7870791628753412, "grad_norm": 0.8463247772422802, "learning_rate": 9.401622342587344e-07, "loss": 0.0468, "step": 1730 }, { "epoch": 0.7875341219290264, "grad_norm": 0.9416550849903463, "learning_rate": 9.400944144690426e-07, "loss": 0.0684, "step": 1731 }, { "epoch": 0.7879890809827116, "grad_norm": 0.9571108074022935, "learning_rate": 9.400265587168226e-07, "loss": 0.053, "step": 1732 }, { "epoch": 0.7884440400363967, "grad_norm": 0.9346581332656714, "learning_rate": 9.399586670076195e-07, "loss": 0.0511, "step": 1733 }, { "epoch": 0.7888989990900819, "grad_norm": 0.854974112590178, "learning_rate": 9.398907393469809e-07, "loss": 0.0472, "step": 1734 }, { "epoch": 0.7893539581437671, "grad_norm": 1.3316346078702088, "learning_rate": 9.398227757404576e-07, "loss": 0.0804, "step": 1735 }, { "epoch": 0.7898089171974523, "grad_norm": 0.6848089631272882, "learning_rate": 9.397547761936033e-07, "loss": 0.0271, "step": 1736 }, { "epoch": 0.7902638762511374, "grad_norm": 0.6358290790705619, "learning_rate": 9.396867407119747e-07, "loss": 0.0288, "step": 1737 }, { "epoch": 0.7907188353048226, "grad_norm": 0.5892961160922258, "learning_rate": 9.39618669301131e-07, "loss": 0.0297, "step": 1738 }, { "epoch": 0.7911737943585078, "grad_norm": 1.172169975683426, "learning_rate": 9.395505619666353e-07, "loss": 0.0592, "step": 1739 }, { "epoch": 0.7916287534121929, "grad_norm": 1.330789204906966, "learning_rate": 9.394824187140524e-07, "loss": 0.0646, "step": 1740 }, { "epoch": 0.792083712465878, "grad_norm": 0.8355148679798231, "learning_rate": 9.394142395489511e-07, "loss": 0.0417, "step": 1741 }, { "epoch": 0.7925386715195633, "grad_norm": 1.0199456321293288, "learning_rate": 9.393460244769022e-07, "loss": 0.0459, "step": 1742 }, { "epoch": 0.7929936305732485, "grad_norm": 0.885803266966633, "learning_rate": 9.392777735034806e-07, "loss": 0.0488, "step": 1743 }, { "epoch": 0.7934485896269335, "grad_norm": 1.06804939574801, "learning_rate": 9.39209486634263e-07, "loss": 0.0433, "step": 1744 }, { "epoch": 0.7939035486806187, "grad_norm": 1.0644507118709123, "learning_rate": 9.391411638748297e-07, "loss": 0.0466, "step": 1745 }, { "epoch": 0.7943585077343039, "grad_norm": 0.7107193043582056, "learning_rate": 9.390728052307635e-07, "loss": 0.044, "step": 1746 }, { "epoch": 0.794813466787989, "grad_norm": 0.9371361103894694, "learning_rate": 9.390044107076504e-07, "loss": 0.0457, "step": 1747 }, { "epoch": 0.7952684258416742, "grad_norm": 1.4410998887076174, "learning_rate": 9.389359803110794e-07, "loss": 0.0785, "step": 1748 }, { "epoch": 0.7957233848953594, "grad_norm": 7.883436010292361, "learning_rate": 9.388675140466425e-07, "loss": 0.2112, "step": 1749 }, { "epoch": 0.7961783439490446, "grad_norm": 0.8420796228127656, "learning_rate": 9.387990119199342e-07, "loss": 0.0489, "step": 1750 }, { "epoch": 0.7966333030027297, "grad_norm": 1.1088874207768145, "learning_rate": 9.387304739365523e-07, "loss": 0.0728, "step": 1751 }, { "epoch": 0.7970882620564149, "grad_norm": 0.9517755732413012, "learning_rate": 9.386619001020972e-07, "loss": 0.0366, "step": 1752 }, { "epoch": 0.7975432211101001, "grad_norm": 1.0119587166820345, "learning_rate": 9.385932904221728e-07, "loss": 0.0336, "step": 1753 }, { "epoch": 0.7979981801637852, "grad_norm": 0.9859694783093257, "learning_rate": 9.385246449023851e-07, "loss": 0.0825, "step": 1754 }, { "epoch": 0.7984531392174704, "grad_norm": 1.3439243976637008, "learning_rate": 9.38455963548344e-07, "loss": 0.0573, "step": 1755 }, { "epoch": 0.7989080982711556, "grad_norm": 1.157796396991948, "learning_rate": 9.383872463656615e-07, "loss": 0.0579, "step": 1756 }, { "epoch": 0.7993630573248408, "grad_norm": 1.0948322207477093, "learning_rate": 9.38318493359953e-07, "loss": 0.0678, "step": 1757 }, { "epoch": 0.7998180163785259, "grad_norm": 0.8760631028132075, "learning_rate": 9.382497045368367e-07, "loss": 0.0481, "step": 1758 }, { "epoch": 0.8002729754322111, "grad_norm": 0.7004104254054347, "learning_rate": 9.381808799019336e-07, "loss": 0.0367, "step": 1759 }, { "epoch": 0.8007279344858963, "grad_norm": 1.3673816864496526, "learning_rate": 9.381120194608678e-07, "loss": 0.069, "step": 1760 }, { "epoch": 0.8011828935395814, "grad_norm": 1.3000981262149816, "learning_rate": 9.380431232192662e-07, "loss": 0.071, "step": 1761 }, { "epoch": 0.8016378525932666, "grad_norm": 1.500314228121765, "learning_rate": 9.37974191182759e-07, "loss": 0.0625, "step": 1762 }, { "epoch": 0.8020928116469518, "grad_norm": 0.6792548364968968, "learning_rate": 9.379052233569786e-07, "loss": 0.0352, "step": 1763 }, { "epoch": 0.802547770700637, "grad_norm": 1.0387727205935837, "learning_rate": 9.378362197475609e-07, "loss": 0.05, "step": 1764 }, { "epoch": 0.8030027297543221, "grad_norm": 1.0880942100266682, "learning_rate": 9.377671803601445e-07, "loss": 0.0366, "step": 1765 }, { "epoch": 0.8034576888080073, "grad_norm": 0.8821870116314942, "learning_rate": 9.376981052003712e-07, "loss": 0.0352, "step": 1766 }, { "epoch": 0.8039126478616925, "grad_norm": 0.9927356160239248, "learning_rate": 9.376289942738853e-07, "loss": 0.0527, "step": 1767 }, { "epoch": 0.8043676069153776, "grad_norm": 0.9448861929372048, "learning_rate": 9.375598475863345e-07, "loss": 0.0455, "step": 1768 }, { "epoch": 0.8048225659690628, "grad_norm": 1.321774322528315, "learning_rate": 9.374906651433687e-07, "loss": 0.0972, "step": 1769 }, { "epoch": 0.805277525022748, "grad_norm": 1.302235616927363, "learning_rate": 9.374214469506415e-07, "loss": 0.0616, "step": 1770 }, { "epoch": 0.8057324840764332, "grad_norm": 1.1907089111948943, "learning_rate": 9.373521930138091e-07, "loss": 0.0605, "step": 1771 }, { "epoch": 0.8061874431301183, "grad_norm": 0.9108149609815163, "learning_rate": 9.372829033385305e-07, "loss": 0.0635, "step": 1772 }, { "epoch": 0.8066424021838035, "grad_norm": 1.9173463525207384, "learning_rate": 9.372135779304677e-07, "loss": 0.0666, "step": 1773 }, { "epoch": 0.8070973612374887, "grad_norm": 1.4860205259777834, "learning_rate": 9.371442167952859e-07, "loss": 0.0902, "step": 1774 }, { "epoch": 0.8075523202911737, "grad_norm": 1.1223603794963917, "learning_rate": 9.370748199386528e-07, "loss": 0.0602, "step": 1775 }, { "epoch": 0.8080072793448589, "grad_norm": 0.8044846677829923, "learning_rate": 9.370053873662391e-07, "loss": 0.0436, "step": 1776 }, { "epoch": 0.8084622383985441, "grad_norm": 0.9902004314238286, "learning_rate": 9.36935919083719e-07, "loss": 0.0428, "step": 1777 }, { "epoch": 0.8089171974522293, "grad_norm": 1.2413881386551218, "learning_rate": 9.368664150967684e-07, "loss": 0.0544, "step": 1778 }, { "epoch": 0.8093721565059144, "grad_norm": 1.1092771114421527, "learning_rate": 9.367968754110674e-07, "loss": 0.0732, "step": 1779 }, { "epoch": 0.8098271155595996, "grad_norm": 0.7486143078802923, "learning_rate": 9.367273000322982e-07, "loss": 0.0482, "step": 1780 }, { "epoch": 0.8102820746132848, "grad_norm": 1.5378174202592885, "learning_rate": 9.366576889661464e-07, "loss": 0.0913, "step": 1781 }, { "epoch": 0.8107370336669699, "grad_norm": 1.0074897497658482, "learning_rate": 9.365880422183001e-07, "loss": 0.0443, "step": 1782 }, { "epoch": 0.8111919927206551, "grad_norm": 0.9364114311199989, "learning_rate": 9.365183597944506e-07, "loss": 0.0493, "step": 1783 }, { "epoch": 0.8116469517743403, "grad_norm": 0.8996320423095179, "learning_rate": 9.364486417002921e-07, "loss": 0.0506, "step": 1784 }, { "epoch": 0.8121019108280255, "grad_norm": 0.7311364383410565, "learning_rate": 9.363788879415215e-07, "loss": 0.0328, "step": 1785 }, { "epoch": 0.8125568698817106, "grad_norm": 0.8699368103972209, "learning_rate": 9.363090985238389e-07, "loss": 0.0463, "step": 1786 }, { "epoch": 0.8130118289353958, "grad_norm": 0.944435074348195, "learning_rate": 9.362392734529471e-07, "loss": 0.0483, "step": 1787 }, { "epoch": 0.813466787989081, "grad_norm": 0.927593151205702, "learning_rate": 9.361694127345521e-07, "loss": 0.0572, "step": 1788 }, { "epoch": 0.8139217470427661, "grad_norm": 0.8920230191788332, "learning_rate": 9.360995163743622e-07, "loss": 0.0543, "step": 1789 }, { "epoch": 0.8143767060964513, "grad_norm": 1.480878065821143, "learning_rate": 9.360295843780892e-07, "loss": 0.0587, "step": 1790 }, { "epoch": 0.8148316651501365, "grad_norm": 0.8009883969020426, "learning_rate": 9.359596167514479e-07, "loss": 0.0394, "step": 1791 }, { "epoch": 0.8152866242038217, "grad_norm": 0.9699382066081428, "learning_rate": 9.358896135001553e-07, "loss": 0.0447, "step": 1792 }, { "epoch": 0.8157415832575068, "grad_norm": 22.724961683221967, "learning_rate": 9.358195746299319e-07, "loss": 0.2678, "step": 1793 }, { "epoch": 0.816196542311192, "grad_norm": 1.2365744345854286, "learning_rate": 9.35749500146501e-07, "loss": 0.0705, "step": 1794 }, { "epoch": 0.8166515013648772, "grad_norm": 1.048620644581375, "learning_rate": 9.35679390055589e-07, "loss": 0.0517, "step": 1795 }, { "epoch": 0.8171064604185623, "grad_norm": 1.015334029545976, "learning_rate": 9.356092443629246e-07, "loss": 0.0567, "step": 1796 }, { "epoch": 0.8175614194722475, "grad_norm": 1.0144730970387636, "learning_rate": 9.355390630742399e-07, "loss": 0.0804, "step": 1797 }, { "epoch": 0.8180163785259327, "grad_norm": 1.068203687359315, "learning_rate": 9.354688461952699e-07, "loss": 0.0614, "step": 1798 }, { "epoch": 0.8184713375796179, "grad_norm": 0.7262634871017531, "learning_rate": 9.353985937317523e-07, "loss": 0.0515, "step": 1799 }, { "epoch": 0.818926296633303, "grad_norm": 0.8353683226126879, "learning_rate": 9.353283056894279e-07, "loss": 0.0411, "step": 1800 }, { "epoch": 0.8193812556869882, "grad_norm": 1.0114506413437392, "learning_rate": 9.352579820740404e-07, "loss": 0.0391, "step": 1801 }, { "epoch": 0.8198362147406734, "grad_norm": 1.569889906661366, "learning_rate": 9.351876228913361e-07, "loss": 0.0819, "step": 1802 }, { "epoch": 0.8202911737943585, "grad_norm": 0.811737412077671, "learning_rate": 9.351172281470645e-07, "loss": 0.0503, "step": 1803 }, { "epoch": 0.8207461328480437, "grad_norm": 0.966123970812319, "learning_rate": 9.350467978469781e-07, "loss": 0.0548, "step": 1804 }, { "epoch": 0.8212010919017289, "grad_norm": 0.8295678268716212, "learning_rate": 9.34976331996832e-07, "loss": 0.0432, "step": 1805 }, { "epoch": 0.821656050955414, "grad_norm": 1.1579061622896587, "learning_rate": 9.349058306023842e-07, "loss": 0.0716, "step": 1806 }, { "epoch": 0.8221110100090991, "grad_norm": 1.1279696297947739, "learning_rate": 9.348352936693962e-07, "loss": 0.0557, "step": 1807 }, { "epoch": 0.8225659690627843, "grad_norm": 0.8597452737601717, "learning_rate": 9.347647212036316e-07, "loss": 0.0456, "step": 1808 }, { "epoch": 0.8230209281164695, "grad_norm": 1.1122148983327282, "learning_rate": 9.346941132108575e-07, "loss": 0.0585, "step": 1809 }, { "epoch": 0.8234758871701547, "grad_norm": 1.1537374377287266, "learning_rate": 9.346234696968433e-07, "loss": 0.057, "step": 1810 }, { "epoch": 0.8239308462238398, "grad_norm": 0.746810192633866, "learning_rate": 9.34552790667362e-07, "loss": 0.0437, "step": 1811 }, { "epoch": 0.824385805277525, "grad_norm": 1.0375194079387995, "learning_rate": 9.344820761281891e-07, "loss": 0.0505, "step": 1812 }, { "epoch": 0.8248407643312102, "grad_norm": 0.9397209949005714, "learning_rate": 9.34411326085103e-07, "loss": 0.0474, "step": 1813 }, { "epoch": 0.8252957233848953, "grad_norm": 0.7046063978717224, "learning_rate": 9.343405405438851e-07, "loss": 0.0464, "step": 1814 }, { "epoch": 0.8257506824385805, "grad_norm": 1.048365205768173, "learning_rate": 9.342697195103197e-07, "loss": 0.0537, "step": 1815 }, { "epoch": 0.8262056414922657, "grad_norm": 0.9224949030742537, "learning_rate": 9.34198862990194e-07, "loss": 0.0522, "step": 1816 }, { "epoch": 0.8266606005459509, "grad_norm": 0.7253683784340074, "learning_rate": 9.34127970989298e-07, "loss": 0.0302, "step": 1817 }, { "epoch": 0.827115559599636, "grad_norm": 0.8087765475254761, "learning_rate": 9.340570435134247e-07, "loss": 0.0276, "step": 1818 }, { "epoch": 0.8275705186533212, "grad_norm": 1.0192085822982717, "learning_rate": 9.339860805683701e-07, "loss": 0.0638, "step": 1819 }, { "epoch": 0.8280254777070064, "grad_norm": 1.202609597488887, "learning_rate": 9.339150821599329e-07, "loss": 0.0503, "step": 1820 }, { "epoch": 0.8284804367606915, "grad_norm": 0.7634039479073679, "learning_rate": 9.338440482939145e-07, "loss": 0.054, "step": 1821 }, { "epoch": 0.8289353958143767, "grad_norm": 0.8586705169819839, "learning_rate": 9.337729789761198e-07, "loss": 0.0497, "step": 1822 }, { "epoch": 0.8293903548680619, "grad_norm": 10.130924362270788, "learning_rate": 9.337018742123561e-07, "loss": 0.2142, "step": 1823 }, { "epoch": 0.8298453139217471, "grad_norm": 1.2498144166631469, "learning_rate": 9.33630734008434e-07, "loss": 0.065, "step": 1824 }, { "epoch": 0.8303002729754322, "grad_norm": 0.908992264706484, "learning_rate": 9.335595583701665e-07, "loss": 0.0455, "step": 1825 }, { "epoch": 0.8307552320291174, "grad_norm": 0.8838215163230482, "learning_rate": 9.334883473033697e-07, "loss": 0.0462, "step": 1826 }, { "epoch": 0.8312101910828026, "grad_norm": 1.083730890081073, "learning_rate": 9.334171008138629e-07, "loss": 0.0581, "step": 1827 }, { "epoch": 0.8316651501364877, "grad_norm": 1.2279040510627228, "learning_rate": 9.333458189074679e-07, "loss": 0.0797, "step": 1828 }, { "epoch": 0.8321201091901729, "grad_norm": 31.949341660203583, "learning_rate": 9.332745015900095e-07, "loss": 0.2637, "step": 1829 }, { "epoch": 0.8325750682438581, "grad_norm": 0.8031017798793587, "learning_rate": 9.332031488673156e-07, "loss": 0.0389, "step": 1830 }, { "epoch": 0.8330300272975433, "grad_norm": 1.0133696140725323, "learning_rate": 9.331317607452166e-07, "loss": 0.0467, "step": 1831 }, { "epoch": 0.8334849863512284, "grad_norm": 0.7095033716598639, "learning_rate": 9.330603372295462e-07, "loss": 0.0369, "step": 1832 }, { "epoch": 0.8339399454049136, "grad_norm": 0.7264928445665955, "learning_rate": 9.329888783261406e-07, "loss": 0.0398, "step": 1833 }, { "epoch": 0.8343949044585988, "grad_norm": 0.9321573797912573, "learning_rate": 9.329173840408393e-07, "loss": 0.0552, "step": 1834 }, { "epoch": 0.8348498635122839, "grad_norm": 0.9935804286416898, "learning_rate": 9.328458543794844e-07, "loss": 0.0376, "step": 1835 }, { "epoch": 0.835304822565969, "grad_norm": 0.8345855728495803, "learning_rate": 9.32774289347921e-07, "loss": 0.0609, "step": 1836 }, { "epoch": 0.8357597816196543, "grad_norm": 0.9806073883807114, "learning_rate": 9.327026889519972e-07, "loss": 0.0467, "step": 1837 }, { "epoch": 0.8362147406733395, "grad_norm": 1.2593440949880186, "learning_rate": 9.326310531975634e-07, "loss": 0.0708, "step": 1838 }, { "epoch": 0.8366696997270245, "grad_norm": 0.8318271686252489, "learning_rate": 9.325593820904739e-07, "loss": 0.0409, "step": 1839 }, { "epoch": 0.8371246587807097, "grad_norm": 1.4010779207662638, "learning_rate": 9.324876756365851e-07, "loss": 0.077, "step": 1840 }, { "epoch": 0.8375796178343949, "grad_norm": 1.0222162671798671, "learning_rate": 9.324159338417565e-07, "loss": 0.049, "step": 1841 }, { "epoch": 0.83803457688808, "grad_norm": 1.2395898426858405, "learning_rate": 9.323441567118507e-07, "loss": 0.0713, "step": 1842 }, { "epoch": 0.8384895359417652, "grad_norm": 0.8307022755481545, "learning_rate": 9.322723442527327e-07, "loss": 0.0489, "step": 1843 }, { "epoch": 0.8389444949954504, "grad_norm": 0.8899262260547925, "learning_rate": 9.322004964702708e-07, "loss": 0.0537, "step": 1844 }, { "epoch": 0.8393994540491356, "grad_norm": 0.9603440166233256, "learning_rate": 9.321286133703363e-07, "loss": 0.044, "step": 1845 }, { "epoch": 0.8398544131028207, "grad_norm": 0.8026758910743704, "learning_rate": 9.320566949588029e-07, "loss": 0.0498, "step": 1846 }, { "epoch": 0.8403093721565059, "grad_norm": 0.8575982052527746, "learning_rate": 9.319847412415476e-07, "loss": 0.045, "step": 1847 }, { "epoch": 0.8407643312101911, "grad_norm": 1.0006732050410936, "learning_rate": 9.3191275222445e-07, "loss": 0.0518, "step": 1848 }, { "epoch": 0.8412192902638762, "grad_norm": 0.5793766944002295, "learning_rate": 9.318407279133928e-07, "loss": 0.0279, "step": 1849 }, { "epoch": 0.8416742493175614, "grad_norm": 1.0306435217102934, "learning_rate": 9.317686683142616e-07, "loss": 0.0569, "step": 1850 }, { "epoch": 0.8421292083712466, "grad_norm": 0.8266853469710619, "learning_rate": 9.316965734329444e-07, "loss": 0.0356, "step": 1851 }, { "epoch": 0.8425841674249318, "grad_norm": 0.959576816770563, "learning_rate": 9.316244432753331e-07, "loss": 0.0683, "step": 1852 }, { "epoch": 0.8430391264786169, "grad_norm": 1.4053059228731632, "learning_rate": 9.315522778473213e-07, "loss": 0.0853, "step": 1853 }, { "epoch": 0.8434940855323021, "grad_norm": 2.1097687771671856, "learning_rate": 9.314800771548062e-07, "loss": 0.0941, "step": 1854 }, { "epoch": 0.8439490445859873, "grad_norm": 0.9467703445837146, "learning_rate": 9.314078412036879e-07, "loss": 0.0684, "step": 1855 }, { "epoch": 0.8444040036396724, "grad_norm": 0.8315683371844956, "learning_rate": 9.313355699998689e-07, "loss": 0.0389, "step": 1856 }, { "epoch": 0.8448589626933576, "grad_norm": 0.7888352744197424, "learning_rate": 9.31263263549255e-07, "loss": 0.0263, "step": 1857 }, { "epoch": 0.8453139217470428, "grad_norm": 0.8628388520010605, "learning_rate": 9.311909218577549e-07, "loss": 0.05, "step": 1858 }, { "epoch": 0.845768880800728, "grad_norm": 1.1036251715120253, "learning_rate": 9.311185449312797e-07, "loss": 0.0581, "step": 1859 }, { "epoch": 0.8462238398544131, "grad_norm": 0.712327763383876, "learning_rate": 9.31046132775744e-07, "loss": 0.0382, "step": 1860 }, { "epoch": 0.8466787989080983, "grad_norm": 0.798974592787987, "learning_rate": 9.309736853970651e-07, "loss": 0.0427, "step": 1861 }, { "epoch": 0.8471337579617835, "grad_norm": 1.5107472159842041, "learning_rate": 9.309012028011627e-07, "loss": 0.0848, "step": 1862 }, { "epoch": 0.8475887170154686, "grad_norm": 1.3049630609523242, "learning_rate": 9.308286849939599e-07, "loss": 0.0757, "step": 1863 }, { "epoch": 0.8480436760691538, "grad_norm": 0.7444740366597293, "learning_rate": 9.307561319813827e-07, "loss": 0.0487, "step": 1864 }, { "epoch": 0.848498635122839, "grad_norm": 1.102585196313778, "learning_rate": 9.306835437693597e-07, "loss": 0.0577, "step": 1865 }, { "epoch": 0.8489535941765242, "grad_norm": 1.3978133662235908, "learning_rate": 9.306109203638224e-07, "loss": 0.0626, "step": 1866 }, { "epoch": 0.8494085532302093, "grad_norm": 4.829521509709941, "learning_rate": 9.305382617707051e-07, "loss": 0.1045, "step": 1867 }, { "epoch": 0.8498635122838945, "grad_norm": 1.0370745708422457, "learning_rate": 9.304655679959456e-07, "loss": 0.0552, "step": 1868 }, { "epoch": 0.8503184713375797, "grad_norm": 0.9763721087402688, "learning_rate": 9.303928390454839e-07, "loss": 0.0367, "step": 1869 }, { "epoch": 0.8507734303912647, "grad_norm": 0.871893769233498, "learning_rate": 9.303200749252629e-07, "loss": 0.0587, "step": 1870 }, { "epoch": 0.8512283894449499, "grad_norm": 0.8389491400467552, "learning_rate": 9.302472756412287e-07, "loss": 0.0474, "step": 1871 }, { "epoch": 0.8516833484986351, "grad_norm": 1.333653931336288, "learning_rate": 9.301744411993301e-07, "loss": 0.0804, "step": 1872 }, { "epoch": 0.8521383075523203, "grad_norm": 1.1161816361537744, "learning_rate": 9.301015716055188e-07, "loss": 0.0484, "step": 1873 }, { "epoch": 0.8525932666060054, "grad_norm": 0.8132876155956268, "learning_rate": 9.300286668657494e-07, "loss": 0.0445, "step": 1874 }, { "epoch": 0.8530482256596906, "grad_norm": 1.1085703602980612, "learning_rate": 9.299557269859794e-07, "loss": 0.0492, "step": 1875 }, { "epoch": 0.8535031847133758, "grad_norm": 0.8472885063489958, "learning_rate": 9.29882751972169e-07, "loss": 0.0382, "step": 1876 }, { "epoch": 0.8539581437670609, "grad_norm": 1.0022500087163217, "learning_rate": 9.298097418302814e-07, "loss": 0.0684, "step": 1877 }, { "epoch": 0.8544131028207461, "grad_norm": 0.7912580205838655, "learning_rate": 9.297366965662827e-07, "loss": 0.0486, "step": 1878 }, { "epoch": 0.8548680618744313, "grad_norm": 1.2982937314170122, "learning_rate": 9.29663616186142e-07, "loss": 0.0808, "step": 1879 }, { "epoch": 0.8553230209281165, "grad_norm": 3.5568505976991562, "learning_rate": 9.295905006958308e-07, "loss": 0.1399, "step": 1880 }, { "epoch": 0.8557779799818016, "grad_norm": 1.1251380731414167, "learning_rate": 9.295173501013238e-07, "loss": 0.0721, "step": 1881 }, { "epoch": 0.8562329390354868, "grad_norm": 1.060192732328411, "learning_rate": 9.294441644085988e-07, "loss": 0.06, "step": 1882 }, { "epoch": 0.856687898089172, "grad_norm": 1.1551612589888198, "learning_rate": 9.293709436236359e-07, "loss": 0.0491, "step": 1883 }, { "epoch": 0.8571428571428571, "grad_norm": 0.8965134252714515, "learning_rate": 9.292976877524186e-07, "loss": 0.0618, "step": 1884 }, { "epoch": 0.8575978161965423, "grad_norm": 1.1516925627960652, "learning_rate": 9.29224396800933e-07, "loss": 0.0722, "step": 1885 }, { "epoch": 0.8580527752502275, "grad_norm": 1.733207172790817, "learning_rate": 9.291510707751679e-07, "loss": 0.0848, "step": 1886 }, { "epoch": 0.8585077343039127, "grad_norm": 1.180692422989153, "learning_rate": 9.290777096811156e-07, "loss": 0.0643, "step": 1887 }, { "epoch": 0.8589626933575978, "grad_norm": 1.2984484146041984, "learning_rate": 9.290043135247703e-07, "loss": 0.06, "step": 1888 }, { "epoch": 0.859417652411283, "grad_norm": 0.8883377302490483, "learning_rate": 9.2893088231213e-07, "loss": 0.052, "step": 1889 }, { "epoch": 0.8598726114649682, "grad_norm": 1.2683971001938434, "learning_rate": 9.288574160491948e-07, "loss": 0.0528, "step": 1890 }, { "epoch": 0.8603275705186533, "grad_norm": 1.1557263177790513, "learning_rate": 9.287839147419685e-07, "loss": 0.0647, "step": 1891 }, { "epoch": 0.8607825295723385, "grad_norm": 1.1532094934804349, "learning_rate": 9.28710378396457e-07, "loss": 0.0794, "step": 1892 }, { "epoch": 0.8612374886260237, "grad_norm": 0.8026636455517018, "learning_rate": 9.286368070186694e-07, "loss": 0.0387, "step": 1893 }, { "epoch": 0.8616924476797089, "grad_norm": 1.1844155311199769, "learning_rate": 9.285632006146177e-07, "loss": 0.0581, "step": 1894 }, { "epoch": 0.862147406733394, "grad_norm": 0.9330838825719893, "learning_rate": 9.284895591903166e-07, "loss": 0.0612, "step": 1895 }, { "epoch": 0.8626023657870792, "grad_norm": 0.809821854576786, "learning_rate": 9.284158827517838e-07, "loss": 0.0419, "step": 1896 }, { "epoch": 0.8630573248407644, "grad_norm": 0.8919123006268792, "learning_rate": 9.283421713050397e-07, "loss": 0.048, "step": 1897 }, { "epoch": 0.8635122838944495, "grad_norm": 1.0516783509925776, "learning_rate": 9.282684248561077e-07, "loss": 0.0745, "step": 1898 }, { "epoch": 0.8639672429481347, "grad_norm": 0.6960985296548401, "learning_rate": 9.281946434110141e-07, "loss": 0.0404, "step": 1899 }, { "epoch": 0.8644222020018199, "grad_norm": 1.0264330075068924, "learning_rate": 9.281208269757879e-07, "loss": 0.0664, "step": 1900 }, { "epoch": 0.864877161055505, "grad_norm": 0.6753929377654491, "learning_rate": 9.280469755564611e-07, "loss": 0.0402, "step": 1901 }, { "epoch": 0.8653321201091901, "grad_norm": 1.0528205590844575, "learning_rate": 9.279730891590687e-07, "loss": 0.0424, "step": 1902 }, { "epoch": 0.8657870791628753, "grad_norm": 0.8167173946963349, "learning_rate": 9.278991677896479e-07, "loss": 0.0443, "step": 1903 }, { "epoch": 0.8662420382165605, "grad_norm": 1.0487529415084276, "learning_rate": 9.278252114542396e-07, "loss": 0.0697, "step": 1904 }, { "epoch": 0.8666969972702456, "grad_norm": 1.4086780678825157, "learning_rate": 9.277512201588871e-07, "loss": 0.1003, "step": 1905 }, { "epoch": 0.8671519563239308, "grad_norm": 0.9635423571551115, "learning_rate": 9.276771939096366e-07, "loss": 0.0716, "step": 1906 }, { "epoch": 0.867606915377616, "grad_norm": 0.9058543017248273, "learning_rate": 9.276031327125371e-07, "loss": 0.0526, "step": 1907 }, { "epoch": 0.8680618744313012, "grad_norm": 1.025674586849764, "learning_rate": 9.275290365736407e-07, "loss": 0.0576, "step": 1908 }, { "epoch": 0.8685168334849863, "grad_norm": 0.9159333866678363, "learning_rate": 9.274549054990021e-07, "loss": 0.0413, "step": 1909 }, { "epoch": 0.8689717925386715, "grad_norm": 0.8632597318928529, "learning_rate": 9.27380739494679e-07, "loss": 0.0491, "step": 1910 }, { "epoch": 0.8694267515923567, "grad_norm": 0.7783928289595495, "learning_rate": 9.27306538566732e-07, "loss": 0.0522, "step": 1911 }, { "epoch": 0.8698817106460418, "grad_norm": 0.7745366680266066, "learning_rate": 9.272323027212242e-07, "loss": 0.0614, "step": 1912 }, { "epoch": 0.870336669699727, "grad_norm": 1.0837005686193038, "learning_rate": 9.271580319642221e-07, "loss": 0.0725, "step": 1913 }, { "epoch": 0.8707916287534122, "grad_norm": 0.9523866075134934, "learning_rate": 9.270837263017946e-07, "loss": 0.0527, "step": 1914 }, { "epoch": 0.8712465878070974, "grad_norm": 0.7550805536802333, "learning_rate": 9.270093857400136e-07, "loss": 0.0497, "step": 1915 }, { "epoch": 0.8717015468607825, "grad_norm": 0.917040985094328, "learning_rate": 9.269350102849542e-07, "loss": 0.0503, "step": 1916 }, { "epoch": 0.8721565059144677, "grad_norm": 0.8538391848094934, "learning_rate": 9.268605999426935e-07, "loss": 0.0393, "step": 1917 }, { "epoch": 0.8726114649681529, "grad_norm": 1.8164547080889952, "learning_rate": 9.267861547193126e-07, "loss": 0.1048, "step": 1918 }, { "epoch": 0.873066424021838, "grad_norm": 1.2700944546017825, "learning_rate": 9.267116746208942e-07, "loss": 0.0737, "step": 1919 }, { "epoch": 0.8735213830755232, "grad_norm": 0.9613205018987895, "learning_rate": 9.266371596535247e-07, "loss": 0.0683, "step": 1920 }, { "epoch": 0.8739763421292084, "grad_norm": 1.1814772760838073, "learning_rate": 9.265626098232934e-07, "loss": 0.0661, "step": 1921 }, { "epoch": 0.8744313011828936, "grad_norm": 1.0236401834149416, "learning_rate": 9.264880251362919e-07, "loss": 0.0572, "step": 1922 }, { "epoch": 0.8748862602365787, "grad_norm": 1.2419238157955896, "learning_rate": 9.26413405598615e-07, "loss": 0.0579, "step": 1923 }, { "epoch": 0.8753412192902639, "grad_norm": 1.2937525451589709, "learning_rate": 9.263387512163603e-07, "loss": 0.0648, "step": 1924 }, { "epoch": 0.8757961783439491, "grad_norm": 0.8782851525564298, "learning_rate": 9.262640619956281e-07, "loss": 0.0637, "step": 1925 }, { "epoch": 0.8762511373976342, "grad_norm": 1.1304485906208028, "learning_rate": 9.261893379425217e-07, "loss": 0.0662, "step": 1926 }, { "epoch": 0.8767060964513194, "grad_norm": 0.9114586749249854, "learning_rate": 9.261145790631473e-07, "loss": 0.0523, "step": 1927 }, { "epoch": 0.8771610555050046, "grad_norm": 0.6704686165985292, "learning_rate": 9.260397853636139e-07, "loss": 0.0405, "step": 1928 }, { "epoch": 0.8776160145586898, "grad_norm": 0.6839657953746731, "learning_rate": 9.259649568500332e-07, "loss": 0.0383, "step": 1929 }, { "epoch": 0.8780709736123748, "grad_norm": 0.7850477191527149, "learning_rate": 9.258900935285197e-07, "loss": 0.0387, "step": 1930 }, { "epoch": 0.87852593266606, "grad_norm": 0.9494467095880083, "learning_rate": 9.258151954051913e-07, "loss": 0.063, "step": 1931 }, { "epoch": 0.8789808917197452, "grad_norm": 0.9140608084661813, "learning_rate": 9.257402624861679e-07, "loss": 0.0611, "step": 1932 }, { "epoch": 0.8794358507734303, "grad_norm": 0.8151642046221819, "learning_rate": 9.256652947775729e-07, "loss": 0.0333, "step": 1933 }, { "epoch": 0.8798908098271155, "grad_norm": 0.7222014434621276, "learning_rate": 9.255902922855324e-07, "loss": 0.029, "step": 1934 }, { "epoch": 0.8803457688808007, "grad_norm": 0.8425280693567242, "learning_rate": 9.255152550161751e-07, "loss": 0.0397, "step": 1935 }, { "epoch": 0.8808007279344859, "grad_norm": 0.7895030655921811, "learning_rate": 9.254401829756329e-07, "loss": 0.0478, "step": 1936 }, { "epoch": 0.881255686988171, "grad_norm": 1.2373053848876134, "learning_rate": 9.2536507617004e-07, "loss": 0.0824, "step": 1937 }, { "epoch": 0.8817106460418562, "grad_norm": 0.4979392054073163, "learning_rate": 9.252899346055342e-07, "loss": 0.0274, "step": 1938 }, { "epoch": 0.8821656050955414, "grad_norm": 0.9045787460581309, "learning_rate": 9.252147582882555e-07, "loss": 0.0524, "step": 1939 }, { "epoch": 0.8826205641492265, "grad_norm": 1.0685734515092518, "learning_rate": 9.251395472243469e-07, "loss": 0.0711, "step": 1940 }, { "epoch": 0.8830755232029117, "grad_norm": 1.5369096406872502, "learning_rate": 9.250643014199545e-07, "loss": 0.083, "step": 1941 }, { "epoch": 0.8835304822565969, "grad_norm": 0.9172150341451163, "learning_rate": 9.249890208812268e-07, "loss": 0.0495, "step": 1942 }, { "epoch": 0.8839854413102821, "grad_norm": 0.9603462172245303, "learning_rate": 9.249137056143158e-07, "loss": 0.0427, "step": 1943 }, { "epoch": 0.8844404003639672, "grad_norm": 0.8039093132479822, "learning_rate": 9.248383556253756e-07, "loss": 0.0497, "step": 1944 }, { "epoch": 0.8848953594176524, "grad_norm": 1.1416716008457375, "learning_rate": 9.247629709205634e-07, "loss": 0.0724, "step": 1945 }, { "epoch": 0.8853503184713376, "grad_norm": 1.0732218817330206, "learning_rate": 9.246875515060395e-07, "loss": 0.0533, "step": 1946 }, { "epoch": 0.8858052775250227, "grad_norm": 0.7918368032473713, "learning_rate": 9.246120973879668e-07, "loss": 0.0557, "step": 1947 }, { "epoch": 0.8862602365787079, "grad_norm": 1.1284042515662727, "learning_rate": 9.245366085725111e-07, "loss": 0.0605, "step": 1948 }, { "epoch": 0.8867151956323931, "grad_norm": 0.7829134466940569, "learning_rate": 9.244610850658408e-07, "loss": 0.0457, "step": 1949 }, { "epoch": 0.8871701546860783, "grad_norm": 0.6225560769842485, "learning_rate": 9.243855268741274e-07, "loss": 0.036, "step": 1950 }, { "epoch": 0.8876251137397634, "grad_norm": 0.8635763170988161, "learning_rate": 9.243099340035454e-07, "loss": 0.0449, "step": 1951 }, { "epoch": 0.8880800727934486, "grad_norm": 0.8547858412493784, "learning_rate": 9.242343064602718e-07, "loss": 0.0502, "step": 1952 }, { "epoch": 0.8885350318471338, "grad_norm": 1.1869947345786445, "learning_rate": 9.241586442504864e-07, "loss": 0.0743, "step": 1953 }, { "epoch": 0.8889899909008189, "grad_norm": 0.944356595588029, "learning_rate": 9.240829473803722e-07, "loss": 0.0539, "step": 1954 }, { "epoch": 0.8894449499545041, "grad_norm": 0.962653872823283, "learning_rate": 9.240072158561145e-07, "loss": 0.0588, "step": 1955 }, { "epoch": 0.8898999090081893, "grad_norm": 1.221390713350519, "learning_rate": 9.23931449683902e-07, "loss": 0.0732, "step": 1956 }, { "epoch": 0.8903548680618745, "grad_norm": 0.7387505642019456, "learning_rate": 9.238556488699259e-07, "loss": 0.0417, "step": 1957 }, { "epoch": 0.8908098271155596, "grad_norm": 0.9878328809395385, "learning_rate": 9.237798134203802e-07, "loss": 0.051, "step": 1958 }, { "epoch": 0.8912647861692448, "grad_norm": 0.975942703487734, "learning_rate": 9.237039433414622e-07, "loss": 0.0634, "step": 1959 }, { "epoch": 0.89171974522293, "grad_norm": 0.8508841379710823, "learning_rate": 9.236280386393711e-07, "loss": 0.0564, "step": 1960 }, { "epoch": 0.892174704276615, "grad_norm": 0.8983112132535972, "learning_rate": 9.235520993203099e-07, "loss": 0.0565, "step": 1961 }, { "epoch": 0.8926296633303002, "grad_norm": 0.7224674021289562, "learning_rate": 9.234761253904838e-07, "loss": 0.0409, "step": 1962 }, { "epoch": 0.8930846223839854, "grad_norm": 1.123639192496533, "learning_rate": 9.234001168561012e-07, "loss": 0.0453, "step": 1963 }, { "epoch": 0.8935395814376706, "grad_norm": 1.1732396517232442, "learning_rate": 9.233240737233732e-07, "loss": 0.0711, "step": 1964 }, { "epoch": 0.8939945404913557, "grad_norm": 1.692585348143773, "learning_rate": 9.232479959985134e-07, "loss": 0.0889, "step": 1965 }, { "epoch": 0.8944494995450409, "grad_norm": 0.9006528116424255, "learning_rate": 9.23171883687739e-07, "loss": 0.045, "step": 1966 }, { "epoch": 0.8949044585987261, "grad_norm": 0.6164204698922958, "learning_rate": 9.230957367972689e-07, "loss": 0.0376, "step": 1967 }, { "epoch": 0.8953594176524113, "grad_norm": 1.4139029076583685, "learning_rate": 9.230195553333262e-07, "loss": 0.1014, "step": 1968 }, { "epoch": 0.8958143767060964, "grad_norm": 2.614083053338007, "learning_rate": 9.229433393021358e-07, "loss": 0.1402, "step": 1969 }, { "epoch": 0.8962693357597816, "grad_norm": 0.939868575765368, "learning_rate": 9.228670887099254e-07, "loss": 0.0491, "step": 1970 }, { "epoch": 0.8967242948134668, "grad_norm": 0.6836018999673201, "learning_rate": 9.227908035629264e-07, "loss": 0.0333, "step": 1971 }, { "epoch": 0.8971792538671519, "grad_norm": 0.7836319999004981, "learning_rate": 9.227144838673722e-07, "loss": 0.042, "step": 1972 }, { "epoch": 0.8976342129208371, "grad_norm": 1.2972212100735558, "learning_rate": 9.226381296294994e-07, "loss": 0.0718, "step": 1973 }, { "epoch": 0.8980891719745223, "grad_norm": 1.0887841907525193, "learning_rate": 9.225617408555471e-07, "loss": 0.0616, "step": 1974 }, { "epoch": 0.8985441310282075, "grad_norm": 1.3589886435291056, "learning_rate": 9.224853175517577e-07, "loss": 0.0963, "step": 1975 }, { "epoch": 0.8989990900818926, "grad_norm": 0.8355999542931333, "learning_rate": 9.224088597243761e-07, "loss": 0.0458, "step": 1976 }, { "epoch": 0.8994540491355778, "grad_norm": 1.0508378453084755, "learning_rate": 9.223323673796502e-07, "loss": 0.0597, "step": 1977 }, { "epoch": 0.899909008189263, "grad_norm": 1.137279283638017, "learning_rate": 9.222558405238302e-07, "loss": 0.0564, "step": 1978 }, { "epoch": 0.9003639672429481, "grad_norm": 1.1069498638908082, "learning_rate": 9.221792791631699e-07, "loss": 0.0697, "step": 1979 }, { "epoch": 0.9008189262966333, "grad_norm": 1.1039852140894104, "learning_rate": 9.221026833039255e-07, "loss": 0.0582, "step": 1980 }, { "epoch": 0.9012738853503185, "grad_norm": 0.9370764833650652, "learning_rate": 9.220260529523561e-07, "loss": 0.0511, "step": 1981 }, { "epoch": 0.9017288444040037, "grad_norm": 1.1725854107622533, "learning_rate": 9.219493881147233e-07, "loss": 0.052, "step": 1982 }, { "epoch": 0.9021838034576888, "grad_norm": 1.0190663956911505, "learning_rate": 9.21872688797292e-07, "loss": 0.0527, "step": 1983 }, { "epoch": 0.902638762511374, "grad_norm": 0.778142456084107, "learning_rate": 9.217959550063299e-07, "loss": 0.0404, "step": 1984 }, { "epoch": 0.9030937215650592, "grad_norm": 1.4865320405784064, "learning_rate": 9.217191867481072e-07, "loss": 0.0955, "step": 1985 }, { "epoch": 0.9035486806187443, "grad_norm": 1.2689251863162416, "learning_rate": 9.21642384028897e-07, "loss": 0.1006, "step": 1986 }, { "epoch": 0.9040036396724295, "grad_norm": 1.2640908636346724, "learning_rate": 9.215655468549751e-07, "loss": 0.0648, "step": 1987 }, { "epoch": 0.9044585987261147, "grad_norm": 0.7540623195847336, "learning_rate": 9.214886752326206e-07, "loss": 0.0307, "step": 1988 }, { "epoch": 0.9049135577797999, "grad_norm": 1.0053264652186125, "learning_rate": 9.214117691681151e-07, "loss": 0.0579, "step": 1989 }, { "epoch": 0.905368516833485, "grad_norm": 0.6983272477723668, "learning_rate": 9.213348286677427e-07, "loss": 0.0354, "step": 1990 }, { "epoch": 0.9058234758871702, "grad_norm": 0.8869667572379786, "learning_rate": 9.21257853737791e-07, "loss": 0.0426, "step": 1991 }, { "epoch": 0.9062784349408554, "grad_norm": 1.0920075774477838, "learning_rate": 9.211808443845498e-07, "loss": 0.0599, "step": 1992 }, { "epoch": 0.9067333939945404, "grad_norm": 0.921144650929535, "learning_rate": 9.21103800614312e-07, "loss": 0.0454, "step": 1993 }, { "epoch": 0.9071883530482256, "grad_norm": 0.8719426008542652, "learning_rate": 9.210267224333734e-07, "loss": 0.0497, "step": 1994 }, { "epoch": 0.9076433121019108, "grad_norm": 1.1655090035562825, "learning_rate": 9.209496098480322e-07, "loss": 0.0658, "step": 1995 }, { "epoch": 0.908098271155596, "grad_norm": 0.8489993977556466, "learning_rate": 9.2087246286459e-07, "loss": 0.047, "step": 1996 }, { "epoch": 0.9085532302092811, "grad_norm": 0.8735917279543147, "learning_rate": 9.207952814893509e-07, "loss": 0.0461, "step": 1997 }, { "epoch": 0.9090081892629663, "grad_norm": 0.8571159102637165, "learning_rate": 9.207180657286216e-07, "loss": 0.0392, "step": 1998 }, { "epoch": 0.9094631483166515, "grad_norm": 0.879545265373811, "learning_rate": 9.206408155887119e-07, "loss": 0.0414, "step": 1999 }, { "epoch": 0.9099181073703366, "grad_norm": 0.8242468608159004, "learning_rate": 9.205635310759344e-07, "loss": 0.0586, "step": 2000 }, { "epoch": 0.9103730664240218, "grad_norm": 1.012058662069608, "learning_rate": 9.204862121966043e-07, "loss": 0.0589, "step": 2001 }, { "epoch": 0.910828025477707, "grad_norm": 0.9782244718514459, "learning_rate": 9.204088589570398e-07, "loss": 0.0636, "step": 2002 }, { "epoch": 0.9112829845313922, "grad_norm": 0.8104500635580972, "learning_rate": 9.203314713635619e-07, "loss": 0.0341, "step": 2003 }, { "epoch": 0.9117379435850773, "grad_norm": 1.0476135343693151, "learning_rate": 9.202540494224945e-07, "loss": 0.0801, "step": 2004 }, { "epoch": 0.9121929026387625, "grad_norm": 1.2100511343226552, "learning_rate": 9.201765931401639e-07, "loss": 0.0494, "step": 2005 }, { "epoch": 0.9126478616924477, "grad_norm": 0.6444769708562996, "learning_rate": 9.200991025228997e-07, "loss": 0.0412, "step": 2006 }, { "epoch": 0.9131028207461328, "grad_norm": 1.3136792556048043, "learning_rate": 9.200215775770339e-07, "loss": 0.0866, "step": 2007 }, { "epoch": 0.913557779799818, "grad_norm": 0.9365845466613806, "learning_rate": 9.199440183089018e-07, "loss": 0.0536, "step": 2008 }, { "epoch": 0.9140127388535032, "grad_norm": 1.0276546815034164, "learning_rate": 9.198664247248407e-07, "loss": 0.0689, "step": 2009 }, { "epoch": 0.9144676979071884, "grad_norm": 1.0108050469451855, "learning_rate": 9.197887968311916e-07, "loss": 0.0564, "step": 2010 }, { "epoch": 0.9149226569608735, "grad_norm": 0.8381285072165795, "learning_rate": 9.197111346342978e-07, "loss": 0.0348, "step": 2011 }, { "epoch": 0.9153776160145587, "grad_norm": 0.6970614668295937, "learning_rate": 9.196334381405053e-07, "loss": 0.0377, "step": 2012 }, { "epoch": 0.9158325750682439, "grad_norm": 1.1972134505300938, "learning_rate": 9.195557073561635e-07, "loss": 0.0506, "step": 2013 }, { "epoch": 0.916287534121929, "grad_norm": 1.2583869193923143, "learning_rate": 9.19477942287624e-07, "loss": 0.0483, "step": 2014 }, { "epoch": 0.9167424931756142, "grad_norm": 1.3120769397339376, "learning_rate": 9.194001429412412e-07, "loss": 0.0386, "step": 2015 }, { "epoch": 0.9171974522292994, "grad_norm": 1.5862141621252062, "learning_rate": 9.193223093233729e-07, "loss": 0.0864, "step": 2016 }, { "epoch": 0.9176524112829846, "grad_norm": 1.8461174809466454, "learning_rate": 9.192444414403791e-07, "loss": 0.0561, "step": 2017 }, { "epoch": 0.9181073703366697, "grad_norm": 2.3311643990361026, "learning_rate": 9.191665392986227e-07, "loss": 0.1032, "step": 2018 }, { "epoch": 0.9185623293903549, "grad_norm": 1.1729750714322675, "learning_rate": 9.190886029044698e-07, "loss": 0.0535, "step": 2019 }, { "epoch": 0.9190172884440401, "grad_norm": 1.403306445184113, "learning_rate": 9.190106322642887e-07, "loss": 0.0533, "step": 2020 }, { "epoch": 0.9194722474977252, "grad_norm": 0.9671832640252803, "learning_rate": 9.189326273844511e-07, "loss": 0.0516, "step": 2021 }, { "epoch": 0.9199272065514104, "grad_norm": 0.8192289450672798, "learning_rate": 9.18854588271331e-07, "loss": 0.0435, "step": 2022 }, { "epoch": 0.9203821656050956, "grad_norm": 1.2355213914338194, "learning_rate": 9.187765149313055e-07, "loss": 0.0751, "step": 2023 }, { "epoch": 0.9208371246587808, "grad_norm": 1.1334113443695233, "learning_rate": 9.186984073707544e-07, "loss": 0.0508, "step": 2024 }, { "epoch": 0.9212920837124658, "grad_norm": 1.920086740466786, "learning_rate": 9.186202655960602e-07, "loss": 0.0504, "step": 2025 }, { "epoch": 0.921747042766151, "grad_norm": 1.05335405395226, "learning_rate": 9.185420896136084e-07, "loss": 0.0579, "step": 2026 }, { "epoch": 0.9222020018198362, "grad_norm": 0.8984447527884369, "learning_rate": 9.184638794297872e-07, "loss": 0.0472, "step": 2027 }, { "epoch": 0.9226569608735213, "grad_norm": 0.9498634838971349, "learning_rate": 9.183856350509875e-07, "loss": 0.0624, "step": 2028 }, { "epoch": 0.9231119199272065, "grad_norm": 1.7785274740456138, "learning_rate": 9.183073564836033e-07, "loss": 0.0649, "step": 2029 }, { "epoch": 0.9235668789808917, "grad_norm": 1.4038050837646774, "learning_rate": 9.182290437340308e-07, "loss": 0.0377, "step": 2030 }, { "epoch": 0.9240218380345769, "grad_norm": 1.0921929951915006, "learning_rate": 9.181506968086695e-07, "loss": 0.052, "step": 2031 }, { "epoch": 0.924476797088262, "grad_norm": 1.5087734228181873, "learning_rate": 9.180723157139216e-07, "loss": 0.0793, "step": 2032 }, { "epoch": 0.9249317561419472, "grad_norm": 0.9082820889719045, "learning_rate": 9.179939004561923e-07, "loss": 0.046, "step": 2033 }, { "epoch": 0.9253867151956324, "grad_norm": 0.7125283619138187, "learning_rate": 9.179154510418889e-07, "loss": 0.0484, "step": 2034 }, { "epoch": 0.9258416742493175, "grad_norm": 1.0047725724059955, "learning_rate": 9.178369674774223e-07, "loss": 0.0548, "step": 2035 }, { "epoch": 0.9262966333030027, "grad_norm": 0.8512648584287806, "learning_rate": 9.177584497692055e-07, "loss": 0.0502, "step": 2036 }, { "epoch": 0.9267515923566879, "grad_norm": 1.2128581482116723, "learning_rate": 9.176798979236546e-07, "loss": 0.0622, "step": 2037 }, { "epoch": 0.9272065514103731, "grad_norm": 0.735505729188697, "learning_rate": 9.176013119471889e-07, "loss": 0.048, "step": 2038 }, { "epoch": 0.9276615104640582, "grad_norm": 0.8642815981144529, "learning_rate": 9.175226918462298e-07, "loss": 0.0479, "step": 2039 }, { "epoch": 0.9281164695177434, "grad_norm": 1.1963028282736674, "learning_rate": 9.174440376272019e-07, "loss": 0.0631, "step": 2040 }, { "epoch": 0.9285714285714286, "grad_norm": 1.1989506183699274, "learning_rate": 9.173653492965323e-07, "loss": 0.0689, "step": 2041 }, { "epoch": 0.9290263876251137, "grad_norm": 0.8444739898981314, "learning_rate": 9.172866268606513e-07, "loss": 0.0512, "step": 2042 }, { "epoch": 0.9294813466787989, "grad_norm": 1.2170574527872122, "learning_rate": 9.172078703259916e-07, "loss": 0.0507, "step": 2043 }, { "epoch": 0.9299363057324841, "grad_norm": 0.643523711354568, "learning_rate": 9.171290796989886e-07, "loss": 0.0297, "step": 2044 }, { "epoch": 0.9303912647861693, "grad_norm": 0.8937571766381986, "learning_rate": 9.170502549860813e-07, "loss": 0.0437, "step": 2045 }, { "epoch": 0.9308462238398544, "grad_norm": 1.2791878684400173, "learning_rate": 9.169713961937103e-07, "loss": 0.071, "step": 2046 }, { "epoch": 0.9313011828935396, "grad_norm": 0.9359856122259343, "learning_rate": 9.168925033283198e-07, "loss": 0.0574, "step": 2047 }, { "epoch": 0.9317561419472248, "grad_norm": 0.9243793500259689, "learning_rate": 9.168135763963567e-07, "loss": 0.0415, "step": 2048 }, { "epoch": 0.9322111010009099, "grad_norm": 0.8441985917967354, "learning_rate": 9.167346154042703e-07, "loss": 0.0292, "step": 2049 }, { "epoch": 0.9326660600545951, "grad_norm": 0.9517985117286746, "learning_rate": 9.166556203585133e-07, "loss": 0.0551, "step": 2050 }, { "epoch": 0.9331210191082803, "grad_norm": 1.2835342196660695, "learning_rate": 9.165765912655405e-07, "loss": 0.0571, "step": 2051 }, { "epoch": 0.9335759781619655, "grad_norm": 0.9470729996183466, "learning_rate": 9.164975281318099e-07, "loss": 0.0515, "step": 2052 }, { "epoch": 0.9340309372156506, "grad_norm": 0.7785824245891043, "learning_rate": 9.164184309637822e-07, "loss": 0.0503, "step": 2053 }, { "epoch": 0.9344858962693358, "grad_norm": 0.9511013403071947, "learning_rate": 9.163392997679209e-07, "loss": 0.0556, "step": 2054 }, { "epoch": 0.934940855323021, "grad_norm": 0.8703962777205587, "learning_rate": 9.162601345506922e-07, "loss": 0.0529, "step": 2055 }, { "epoch": 0.935395814376706, "grad_norm": 0.775877389293392, "learning_rate": 9.16180935318565e-07, "loss": 0.0376, "step": 2056 }, { "epoch": 0.9358507734303912, "grad_norm": 0.9521273120772348, "learning_rate": 9.161017020780113e-07, "loss": 0.0343, "step": 2057 }, { "epoch": 0.9363057324840764, "grad_norm": 1.1287957992490114, "learning_rate": 9.160224348355056e-07, "loss": 0.0778, "step": 2058 }, { "epoch": 0.9367606915377616, "grad_norm": 0.8631454061327333, "learning_rate": 9.159431335975253e-07, "loss": 0.0417, "step": 2059 }, { "epoch": 0.9372156505914467, "grad_norm": 0.8918406409972216, "learning_rate": 9.158637983705504e-07, "loss": 0.0643, "step": 2060 }, { "epoch": 0.9376706096451319, "grad_norm": 0.9350632881606027, "learning_rate": 9.15784429161064e-07, "loss": 0.0525, "step": 2061 }, { "epoch": 0.9381255686988171, "grad_norm": 1.3889340051916939, "learning_rate": 9.157050259755518e-07, "loss": 0.067, "step": 2062 }, { "epoch": 0.9385805277525022, "grad_norm": 0.9157982857814395, "learning_rate": 9.15625588820502e-07, "loss": 0.0616, "step": 2063 }, { "epoch": 0.9390354868061874, "grad_norm": 1.0238483631144177, "learning_rate": 9.155461177024062e-07, "loss": 0.0595, "step": 2064 }, { "epoch": 0.9394904458598726, "grad_norm": 1.2756489707490353, "learning_rate": 9.154666126277581e-07, "loss": 0.063, "step": 2065 }, { "epoch": 0.9399454049135578, "grad_norm": 0.7826732540204179, "learning_rate": 9.153870736030547e-07, "loss": 0.0433, "step": 2066 }, { "epoch": 0.9404003639672429, "grad_norm": 1.2018386192795854, "learning_rate": 9.153075006347955e-07, "loss": 0.0986, "step": 2067 }, { "epoch": 0.9408553230209281, "grad_norm": 1.1890263313524296, "learning_rate": 9.15227893729483e-07, "loss": 0.0531, "step": 2068 }, { "epoch": 0.9413102820746133, "grad_norm": 1.0095643160089312, "learning_rate": 9.15148252893622e-07, "loss": 0.0352, "step": 2069 }, { "epoch": 0.9417652411282984, "grad_norm": 1.0194374316717485, "learning_rate": 9.150685781337206e-07, "loss": 0.056, "step": 2070 }, { "epoch": 0.9422202001819836, "grad_norm": 1.4905201595047883, "learning_rate": 9.149888694562895e-07, "loss": 0.0882, "step": 2071 }, { "epoch": 0.9426751592356688, "grad_norm": 1.0759562504062816, "learning_rate": 9.149091268678422e-07, "loss": 0.0441, "step": 2072 }, { "epoch": 0.943130118289354, "grad_norm": 1.057407518234346, "learning_rate": 9.148293503748945e-07, "loss": 0.0603, "step": 2073 }, { "epoch": 0.9435850773430391, "grad_norm": 0.9829275291680709, "learning_rate": 9.147495399839658e-07, "loss": 0.0519, "step": 2074 }, { "epoch": 0.9440400363967243, "grad_norm": 0.9642214517122738, "learning_rate": 9.146696957015775e-07, "loss": 0.0425, "step": 2075 }, { "epoch": 0.9444949954504095, "grad_norm": 1.4377029154256058, "learning_rate": 9.145898175342545e-07, "loss": 0.102, "step": 2076 }, { "epoch": 0.9449499545040946, "grad_norm": 0.9689521970500141, "learning_rate": 9.145099054885237e-07, "loss": 0.0507, "step": 2077 }, { "epoch": 0.9454049135577798, "grad_norm": 1.0199560185198782, "learning_rate": 9.144299595709155e-07, "loss": 0.0602, "step": 2078 }, { "epoch": 0.945859872611465, "grad_norm": 1.3595548238848658, "learning_rate": 9.143499797879625e-07, "loss": 0.0434, "step": 2079 }, { "epoch": 0.9463148316651502, "grad_norm": 0.7717239935229712, "learning_rate": 9.142699661462003e-07, "loss": 0.0362, "step": 2080 }, { "epoch": 0.9467697907188353, "grad_norm": 0.8257462440905602, "learning_rate": 9.141899186521673e-07, "loss": 0.0367, "step": 2081 }, { "epoch": 0.9472247497725205, "grad_norm": 1.3741372375852208, "learning_rate": 9.141098373124047e-07, "loss": 0.0499, "step": 2082 }, { "epoch": 0.9476797088262057, "grad_norm": 1.4088294589214712, "learning_rate": 9.140297221334562e-07, "loss": 0.0692, "step": 2083 }, { "epoch": 0.9481346678798908, "grad_norm": 1.7452460528470837, "learning_rate": 9.139495731218684e-07, "loss": 0.0737, "step": 2084 }, { "epoch": 0.948589626933576, "grad_norm": 1.4532747242895865, "learning_rate": 9.138693902841912e-07, "loss": 0.0783, "step": 2085 }, { "epoch": 0.9490445859872612, "grad_norm": 1.040344959885373, "learning_rate": 9.137891736269763e-07, "loss": 0.0619, "step": 2086 }, { "epoch": 0.9494995450409464, "grad_norm": 1.2144971070890684, "learning_rate": 9.137089231567787e-07, "loss": 0.0689, "step": 2087 }, { "epoch": 0.9499545040946314, "grad_norm": 0.8628931978442566, "learning_rate": 9.136286388801564e-07, "loss": 0.0423, "step": 2088 }, { "epoch": 0.9504094631483166, "grad_norm": 1.9590676079789158, "learning_rate": 9.135483208036694e-07, "loss": 0.0506, "step": 2089 }, { "epoch": 0.9508644222020018, "grad_norm": 1.5759342637401192, "learning_rate": 9.134679689338813e-07, "loss": 0.0679, "step": 2090 }, { "epoch": 0.9513193812556869, "grad_norm": 0.6897443808039588, "learning_rate": 9.13387583277358e-07, "loss": 0.0508, "step": 2091 }, { "epoch": 0.9517743403093721, "grad_norm": 0.8157378879865193, "learning_rate": 9.133071638406683e-07, "loss": 0.049, "step": 2092 }, { "epoch": 0.9522292993630573, "grad_norm": 1.062437312361308, "learning_rate": 9.132267106303836e-07, "loss": 0.0743, "step": 2093 }, { "epoch": 0.9526842584167425, "grad_norm": 1.212079077352486, "learning_rate": 9.131462236530782e-07, "loss": 0.0603, "step": 2094 }, { "epoch": 0.9531392174704276, "grad_norm": 1.36827655886766, "learning_rate": 9.130657029153293e-07, "loss": 0.0614, "step": 2095 }, { "epoch": 0.9535941765241128, "grad_norm": 0.5382214989923869, "learning_rate": 9.129851484237164e-07, "loss": 0.0309, "step": 2096 }, { "epoch": 0.954049135577798, "grad_norm": 1.3475390208245206, "learning_rate": 9.129045601848221e-07, "loss": 0.0796, "step": 2097 }, { "epoch": 0.9545040946314831, "grad_norm": 0.8987650341594184, "learning_rate": 9.128239382052318e-07, "loss": 0.0426, "step": 2098 }, { "epoch": 0.9549590536851683, "grad_norm": 1.1835950226908383, "learning_rate": 9.127432824915338e-07, "loss": 0.0601, "step": 2099 }, { "epoch": 0.9554140127388535, "grad_norm": 0.7827783551271729, "learning_rate": 9.126625930503185e-07, "loss": 0.0361, "step": 2100 }, { "epoch": 0.9558689717925387, "grad_norm": 1.0220585229270296, "learning_rate": 9.125818698881797e-07, "loss": 0.0527, "step": 2101 }, { "epoch": 0.9563239308462238, "grad_norm": 1.2650039365782404, "learning_rate": 9.125011130117137e-07, "loss": 0.0456, "step": 2102 }, { "epoch": 0.956778889899909, "grad_norm": 0.9980431748540327, "learning_rate": 9.124203224275197e-07, "loss": 0.0549, "step": 2103 }, { "epoch": 0.9572338489535942, "grad_norm": 0.769047674334955, "learning_rate": 9.123394981421993e-07, "loss": 0.0437, "step": 2104 }, { "epoch": 0.9576888080072793, "grad_norm": 1.243278547773053, "learning_rate": 9.122586401623573e-07, "loss": 0.0536, "step": 2105 }, { "epoch": 0.9581437670609645, "grad_norm": 1.4485506866680222, "learning_rate": 9.12177748494601e-07, "loss": 0.075, "step": 2106 }, { "epoch": 0.9585987261146497, "grad_norm": 1.1660754366932242, "learning_rate": 9.120968231455404e-07, "loss": 0.0695, "step": 2107 }, { "epoch": 0.9590536851683349, "grad_norm": 1.0369797810706198, "learning_rate": 9.120158641217884e-07, "loss": 0.0443, "step": 2108 }, { "epoch": 0.95950864422202, "grad_norm": 1.0485023842780912, "learning_rate": 9.119348714299606e-07, "loss": 0.0499, "step": 2109 }, { "epoch": 0.9599636032757052, "grad_norm": 1.159510817665223, "learning_rate": 9.118538450766754e-07, "loss": 0.0507, "step": 2110 }, { "epoch": 0.9604185623293904, "grad_norm": 0.9875084233063389, "learning_rate": 9.117727850685539e-07, "loss": 0.0435, "step": 2111 }, { "epoch": 0.9608735213830755, "grad_norm": 0.7933931702721846, "learning_rate": 9.1169169141222e-07, "loss": 0.0414, "step": 2112 }, { "epoch": 0.9613284804367607, "grad_norm": 0.9415131625972293, "learning_rate": 9.116105641143004e-07, "loss": 0.0474, "step": 2113 }, { "epoch": 0.9617834394904459, "grad_norm": 1.2755742073209326, "learning_rate": 9.11529403181424e-07, "loss": 0.0701, "step": 2114 }, { "epoch": 0.9622383985441311, "grad_norm": 0.8337954249235803, "learning_rate": 9.114482086202234e-07, "loss": 0.0492, "step": 2115 }, { "epoch": 0.9626933575978162, "grad_norm": 1.1747804445792143, "learning_rate": 9.113669804373333e-07, "loss": 0.0716, "step": 2116 }, { "epoch": 0.9631483166515014, "grad_norm": 0.9674517173374639, "learning_rate": 9.112857186393912e-07, "loss": 0.0446, "step": 2117 }, { "epoch": 0.9636032757051866, "grad_norm": 1.4448673508504575, "learning_rate": 9.112044232330376e-07, "loss": 0.1041, "step": 2118 }, { "epoch": 0.9640582347588716, "grad_norm": 1.559230766133167, "learning_rate": 9.111230942249155e-07, "loss": 0.0732, "step": 2119 }, { "epoch": 0.9645131938125568, "grad_norm": 0.9700263304412974, "learning_rate": 9.110417316216707e-07, "loss": 0.0361, "step": 2120 }, { "epoch": 0.964968152866242, "grad_norm": 1.803390007090429, "learning_rate": 9.109603354299519e-07, "loss": 0.0435, "step": 2121 }, { "epoch": 0.9654231119199272, "grad_norm": 0.9357258177560913, "learning_rate": 9.108789056564104e-07, "loss": 0.0705, "step": 2122 }, { "epoch": 0.9658780709736123, "grad_norm": 0.7807496728264608, "learning_rate": 9.107974423077e-07, "loss": 0.0335, "step": 2123 }, { "epoch": 0.9663330300272975, "grad_norm": 2.6745734350927157, "learning_rate": 9.10715945390478e-07, "loss": 0.0637, "step": 2124 }, { "epoch": 0.9667879890809827, "grad_norm": 1.1239087098329166, "learning_rate": 9.106344149114038e-07, "loss": 0.049, "step": 2125 }, { "epoch": 0.9672429481346679, "grad_norm": 0.8265756001523038, "learning_rate": 9.105528508771394e-07, "loss": 0.0553, "step": 2126 }, { "epoch": 0.967697907188353, "grad_norm": 1.0749657881140233, "learning_rate": 9.104712532943501e-07, "loss": 0.0457, "step": 2127 }, { "epoch": 0.9681528662420382, "grad_norm": 1.0324221025078966, "learning_rate": 9.103896221697038e-07, "loss": 0.058, "step": 2128 }, { "epoch": 0.9686078252957234, "grad_norm": 1.4550456232914004, "learning_rate": 9.103079575098708e-07, "loss": 0.0713, "step": 2129 }, { "epoch": 0.9690627843494085, "grad_norm": 0.8204315419279635, "learning_rate": 9.102262593215244e-07, "loss": 0.0348, "step": 2130 }, { "epoch": 0.9695177434030937, "grad_norm": 1.303970632164576, "learning_rate": 9.101445276113407e-07, "loss": 0.0431, "step": 2131 }, { "epoch": 0.9699727024567789, "grad_norm": 1.0556756365241537, "learning_rate": 9.100627623859984e-07, "loss": 0.0452, "step": 2132 }, { "epoch": 0.9704276615104641, "grad_norm": 1.0350851781416792, "learning_rate": 9.099809636521789e-07, "loss": 0.0558, "step": 2133 }, { "epoch": 0.9708826205641492, "grad_norm": 1.2125873172980899, "learning_rate": 9.098991314165666e-07, "loss": 0.0746, "step": 2134 }, { "epoch": 0.9713375796178344, "grad_norm": 1.0706097889383221, "learning_rate": 9.098172656858484e-07, "loss": 0.0412, "step": 2135 }, { "epoch": 0.9717925386715196, "grad_norm": 1.2143149711842174, "learning_rate": 9.097353664667138e-07, "loss": 0.0553, "step": 2136 }, { "epoch": 0.9722474977252047, "grad_norm": 1.01766342421967, "learning_rate": 9.096534337658555e-07, "loss": 0.0449, "step": 2137 }, { "epoch": 0.9727024567788899, "grad_norm": 1.0193312430492827, "learning_rate": 9.095714675899686e-07, "loss": 0.0511, "step": 2138 }, { "epoch": 0.9731574158325751, "grad_norm": 0.8278681019499655, "learning_rate": 9.09489467945751e-07, "loss": 0.0396, "step": 2139 }, { "epoch": 0.9736123748862603, "grad_norm": 0.7520174949101124, "learning_rate": 9.094074348399034e-07, "loss": 0.0392, "step": 2140 }, { "epoch": 0.9740673339399454, "grad_norm": 1.3732724817844, "learning_rate": 9.093253682791288e-07, "loss": 0.0736, "step": 2141 }, { "epoch": 0.9745222929936306, "grad_norm": 1.366213067608846, "learning_rate": 9.092432682701339e-07, "loss": 0.054, "step": 2142 }, { "epoch": 0.9749772520473158, "grad_norm": 1.2306983464945604, "learning_rate": 9.09161134819627e-07, "loss": 0.0601, "step": 2143 }, { "epoch": 0.9754322111010009, "grad_norm": 1.0510983388666453, "learning_rate": 9.090789679343201e-07, "loss": 0.0551, "step": 2144 }, { "epoch": 0.9758871701546861, "grad_norm": 1.283144561220179, "learning_rate": 9.089967676209273e-07, "loss": 0.0577, "step": 2145 }, { "epoch": 0.9763421292083713, "grad_norm": 1.0160797271570203, "learning_rate": 9.089145338861655e-07, "loss": 0.0734, "step": 2146 }, { "epoch": 0.9767970882620565, "grad_norm": 1.4001616648106356, "learning_rate": 9.088322667367548e-07, "loss": 0.0857, "step": 2147 }, { "epoch": 0.9772520473157416, "grad_norm": 1.0176709223122151, "learning_rate": 9.087499661794175e-07, "loss": 0.0683, "step": 2148 }, { "epoch": 0.9777070063694268, "grad_norm": 1.2453451044041897, "learning_rate": 9.086676322208789e-07, "loss": 0.0664, "step": 2149 }, { "epoch": 0.978161965423112, "grad_norm": 0.8591849240473343, "learning_rate": 9.085852648678669e-07, "loss": 0.0512, "step": 2150 }, { "epoch": 0.978616924476797, "grad_norm": 1.1751443941929876, "learning_rate": 9.085028641271122e-07, "loss": 0.0731, "step": 2151 }, { "epoch": 0.9790718835304822, "grad_norm": 1.4198059816673938, "learning_rate": 9.084204300053482e-07, "loss": 0.0773, "step": 2152 }, { "epoch": 0.9795268425841674, "grad_norm": 1.0684746837566197, "learning_rate": 9.083379625093111e-07, "loss": 0.0702, "step": 2153 }, { "epoch": 0.9799818016378526, "grad_norm": 1.1776613189503011, "learning_rate": 9.082554616457397e-07, "loss": 0.0689, "step": 2154 }, { "epoch": 0.9804367606915377, "grad_norm": 0.841307548267332, "learning_rate": 9.081729274213756e-07, "loss": 0.0491, "step": 2155 }, { "epoch": 0.9808917197452229, "grad_norm": 1.1927116526876629, "learning_rate": 9.080903598429633e-07, "loss": 0.0405, "step": 2156 }, { "epoch": 0.9813466787989081, "grad_norm": 0.8556350639101715, "learning_rate": 9.080077589172496e-07, "loss": 0.0386, "step": 2157 }, { "epoch": 0.9818016378525932, "grad_norm": 0.9466503885362708, "learning_rate": 9.079251246509845e-07, "loss": 0.0356, "step": 2158 }, { "epoch": 0.9822565969062784, "grad_norm": 1.246972924531473, "learning_rate": 9.078424570509202e-07, "loss": 0.0819, "step": 2159 }, { "epoch": 0.9827115559599636, "grad_norm": 0.9522816797677247, "learning_rate": 9.077597561238122e-07, "loss": 0.0452, "step": 2160 }, { "epoch": 0.9831665150136488, "grad_norm": 0.998808667138825, "learning_rate": 9.076770218764185e-07, "loss": 0.0629, "step": 2161 }, { "epoch": 0.9836214740673339, "grad_norm": 0.9023771854991036, "learning_rate": 9.075942543154995e-07, "loss": 0.0589, "step": 2162 }, { "epoch": 0.9840764331210191, "grad_norm": 0.7491943423054259, "learning_rate": 9.075114534478185e-07, "loss": 0.0466, "step": 2163 }, { "epoch": 0.9845313921747043, "grad_norm": 0.796508435627229, "learning_rate": 9.074286192801421e-07, "loss": 0.0378, "step": 2164 }, { "epoch": 0.9849863512283894, "grad_norm": 1.342770289878589, "learning_rate": 9.073457518192389e-07, "loss": 0.0548, "step": 2165 }, { "epoch": 0.9854413102820746, "grad_norm": 1.2262791351416316, "learning_rate": 9.072628510718803e-07, "loss": 0.0422, "step": 2166 }, { "epoch": 0.9858962693357598, "grad_norm": 1.0037529776989549, "learning_rate": 9.071799170448407e-07, "loss": 0.0542, "step": 2167 }, { "epoch": 0.986351228389445, "grad_norm": 0.9314990936928467, "learning_rate": 9.070969497448971e-07, "loss": 0.0408, "step": 2168 }, { "epoch": 0.9868061874431301, "grad_norm": 0.8896614598717153, "learning_rate": 9.070139491788294e-07, "loss": 0.0487, "step": 2169 }, { "epoch": 0.9872611464968153, "grad_norm": 1.0634049951440678, "learning_rate": 9.069309153534195e-07, "loss": 0.0535, "step": 2170 }, { "epoch": 0.9877161055505005, "grad_norm": 1.6220032832919737, "learning_rate": 9.068478482754532e-07, "loss": 0.0968, "step": 2171 }, { "epoch": 0.9881710646041856, "grad_norm": 0.9890593400070957, "learning_rate": 9.067647479517177e-07, "loss": 0.0785, "step": 2172 }, { "epoch": 0.9886260236578708, "grad_norm": 1.1329639186918616, "learning_rate": 9.066816143890042e-07, "loss": 0.0351, "step": 2173 }, { "epoch": 0.989080982711556, "grad_norm": 0.9232638107047763, "learning_rate": 9.065984475941055e-07, "loss": 0.0499, "step": 2174 }, { "epoch": 0.9895359417652412, "grad_norm": 1.1410357973376561, "learning_rate": 9.06515247573818e-07, "loss": 0.0348, "step": 2175 }, { "epoch": 0.9899909008189263, "grad_norm": 1.2458432987213264, "learning_rate": 9.064320143349403e-07, "loss": 0.0338, "step": 2176 }, { "epoch": 0.9904458598726115, "grad_norm": 1.0919708527544933, "learning_rate": 9.063487478842737e-07, "loss": 0.0536, "step": 2177 }, { "epoch": 0.9909008189262967, "grad_norm": 1.1652337624547673, "learning_rate": 9.062654482286226e-07, "loss": 0.0347, "step": 2178 }, { "epoch": 0.9913557779799818, "grad_norm": 1.282513256450786, "learning_rate": 9.061821153747937e-07, "loss": 0.0463, "step": 2179 }, { "epoch": 0.991810737033667, "grad_norm": 1.0107467837266504, "learning_rate": 9.060987493295966e-07, "loss": 0.0385, "step": 2180 }, { "epoch": 0.9922656960873522, "grad_norm": 1.1394513601265601, "learning_rate": 9.060153500998438e-07, "loss": 0.0641, "step": 2181 }, { "epoch": 0.9927206551410374, "grad_norm": 1.1066752541604623, "learning_rate": 9.059319176923499e-07, "loss": 0.0611, "step": 2182 }, { "epoch": 0.9931756141947224, "grad_norm": 0.6478005639299917, "learning_rate": 9.05848452113933e-07, "loss": 0.0356, "step": 2183 }, { "epoch": 0.9936305732484076, "grad_norm": 1.5818642034394523, "learning_rate": 9.057649533714134e-07, "loss": 0.0861, "step": 2184 }, { "epoch": 0.9940855323020928, "grad_norm": 1.1183103097765958, "learning_rate": 9.056814214716142e-07, "loss": 0.0598, "step": 2185 }, { "epoch": 0.9945404913557779, "grad_norm": 0.926790846538659, "learning_rate": 9.055978564213612e-07, "loss": 0.0464, "step": 2186 }, { "epoch": 0.9949954504094631, "grad_norm": 1.5874416684859385, "learning_rate": 9.055142582274831e-07, "loss": 0.0682, "step": 2187 }, { "epoch": 0.9954504094631483, "grad_norm": 1.0882090223348664, "learning_rate": 9.05430626896811e-07, "loss": 0.0598, "step": 2188 }, { "epoch": 0.9959053685168335, "grad_norm": 0.9461842477272115, "learning_rate": 9.053469624361791e-07, "loss": 0.0455, "step": 2189 }, { "epoch": 0.9963603275705186, "grad_norm": 0.8515719887484394, "learning_rate": 9.052632648524241e-07, "loss": 0.0631, "step": 2190 }, { "epoch": 0.9968152866242038, "grad_norm": 0.9603058490820368, "learning_rate": 9.05179534152385e-07, "loss": 0.0413, "step": 2191 }, { "epoch": 0.997270245677889, "grad_norm": 1.153973203958824, "learning_rate": 9.050957703429043e-07, "loss": 0.0611, "step": 2192 }, { "epoch": 0.9977252047315741, "grad_norm": 1.0139953461346294, "learning_rate": 9.050119734308266e-07, "loss": 0.0547, "step": 2193 }, { "epoch": 0.9981801637852593, "grad_norm": 0.854196375686895, "learning_rate": 9.049281434229994e-07, "loss": 0.0561, "step": 2194 }, { "epoch": 0.9986351228389445, "grad_norm": 0.904919602551994, "learning_rate": 9.04844280326273e-07, "loss": 0.0485, "step": 2195 }, { "epoch": 0.9990900818926297, "grad_norm": 0.800324417790531, "learning_rate": 9.047603841475002e-07, "loss": 0.0492, "step": 2196 }, { "epoch": 0.9995450409463148, "grad_norm": 0.8933838287516636, "learning_rate": 9.046764548935367e-07, "loss": 0.0529, "step": 2197 }, { "epoch": 1.0, "grad_norm": 0.6217683290004481, "learning_rate": 9.045924925712409e-07, "loss": 0.0281, "step": 2198 }, { "epoch": 1.000454959053685, "grad_norm": 0.7719383851619033, "learning_rate": 9.045084971874737e-07, "loss": 0.0237, "step": 2199 }, { "epoch": 1.0009099181073704, "grad_norm": 0.7466975837592527, "learning_rate": 9.044244687490988e-07, "loss": 0.0311, "step": 2200 }, { "epoch": 1.0013648771610555, "grad_norm": 0.8136123927032112, "learning_rate": 9.043404072629828e-07, "loss": 0.0373, "step": 2201 }, { "epoch": 1.0018198362147406, "grad_norm": 0.5357637620018818, "learning_rate": 9.042563127359945e-07, "loss": 0.0185, "step": 2202 }, { "epoch": 1.0022747952684259, "grad_norm": 0.609229508872201, "learning_rate": 9.041721851750062e-07, "loss": 0.0208, "step": 2203 }, { "epoch": 1.002729754322111, "grad_norm": 0.618733343568908, "learning_rate": 9.040880245868919e-07, "loss": 0.0323, "step": 2204 }, { "epoch": 1.0031847133757963, "grad_norm": 0.6292208411229613, "learning_rate": 9.040038309785292e-07, "loss": 0.0331, "step": 2205 }, { "epoch": 1.0036396724294814, "grad_norm": 1.88004951811103, "learning_rate": 9.039196043567978e-07, "loss": 0.0758, "step": 2206 }, { "epoch": 1.0040946314831665, "grad_norm": 0.7271858041938704, "learning_rate": 9.038353447285806e-07, "loss": 0.044, "step": 2207 }, { "epoch": 1.0045495905368518, "grad_norm": 0.9318817510811654, "learning_rate": 9.037510521007625e-07, "loss": 0.0362, "step": 2208 }, { "epoch": 1.0050045495905369, "grad_norm": 0.6074452310852222, "learning_rate": 9.03666726480232e-07, "loss": 0.0326, "step": 2209 }, { "epoch": 1.005459508644222, "grad_norm": 0.6733938758551979, "learning_rate": 9.035823678738795e-07, "loss": 0.0336, "step": 2210 }, { "epoch": 1.0059144676979073, "grad_norm": 0.9175545639271417, "learning_rate": 9.034979762885984e-07, "loss": 0.0345, "step": 2211 }, { "epoch": 1.0063694267515924, "grad_norm": 0.6796891816368287, "learning_rate": 9.034135517312848e-07, "loss": 0.0231, "step": 2212 }, { "epoch": 1.0068243858052774, "grad_norm": 0.6465998925293506, "learning_rate": 9.033290942088375e-07, "loss": 0.0243, "step": 2213 }, { "epoch": 1.0072793448589628, "grad_norm": 0.537505587119838, "learning_rate": 9.032446037281581e-07, "loss": 0.0231, "step": 2214 }, { "epoch": 1.0077343039126478, "grad_norm": 2.5064739779242227, "learning_rate": 9.031600802961508e-07, "loss": 0.0401, "step": 2215 }, { "epoch": 1.008189262966333, "grad_norm": 0.6341878351647718, "learning_rate": 9.030755239197223e-07, "loss": 0.0353, "step": 2216 }, { "epoch": 1.0086442220200182, "grad_norm": 0.7579632957178596, "learning_rate": 9.029909346057824e-07, "loss": 0.0291, "step": 2217 }, { "epoch": 1.0090991810737033, "grad_norm": 0.6501778872353086, "learning_rate": 9.02906312361243e-07, "loss": 0.0291, "step": 2218 }, { "epoch": 1.0095541401273886, "grad_norm": 0.9096938881927145, "learning_rate": 9.028216571930195e-07, "loss": 0.0372, "step": 2219 }, { "epoch": 1.0100090991810737, "grad_norm": 0.6211871708379059, "learning_rate": 9.027369691080291e-07, "loss": 0.0276, "step": 2220 }, { "epoch": 1.0104640582347588, "grad_norm": 0.7827606651799178, "learning_rate": 9.026522481131925e-07, "loss": 0.0423, "step": 2221 }, { "epoch": 1.0109190172884441, "grad_norm": 0.5462695994091984, "learning_rate": 9.025674942154324e-07, "loss": 0.0185, "step": 2222 }, { "epoch": 1.0113739763421292, "grad_norm": 0.7893426724229007, "learning_rate": 9.024827074216748e-07, "loss": 0.0363, "step": 2223 }, { "epoch": 1.0118289353958143, "grad_norm": 0.6635414008146848, "learning_rate": 9.023978877388479e-07, "loss": 0.0208, "step": 2224 }, { "epoch": 1.0122838944494996, "grad_norm": 0.8083613235261458, "learning_rate": 9.023130351738828e-07, "loss": 0.0254, "step": 2225 }, { "epoch": 1.0127388535031847, "grad_norm": 0.7806418718108659, "learning_rate": 9.022281497337132e-07, "loss": 0.0425, "step": 2226 }, { "epoch": 1.0131938125568698, "grad_norm": 0.5575845914599133, "learning_rate": 9.021432314252757e-07, "loss": 0.029, "step": 2227 }, { "epoch": 1.013648771610555, "grad_norm": 0.705241129372005, "learning_rate": 9.020582802555095e-07, "loss": 0.0403, "step": 2228 }, { "epoch": 1.0141037306642402, "grad_norm": 1.0530587288668456, "learning_rate": 9.019732962313562e-07, "loss": 0.0235, "step": 2229 }, { "epoch": 1.0145586897179253, "grad_norm": 0.7117179151907187, "learning_rate": 9.018882793597605e-07, "loss": 0.0248, "step": 2230 }, { "epoch": 1.0150136487716106, "grad_norm": 2.428940500679297, "learning_rate": 9.018032296476694e-07, "loss": 0.0287, "step": 2231 }, { "epoch": 1.0154686078252957, "grad_norm": 0.6194271173700975, "learning_rate": 9.01718147102033e-07, "loss": 0.0329, "step": 2232 }, { "epoch": 1.015923566878981, "grad_norm": 1.2156050167016974, "learning_rate": 9.016330317298037e-07, "loss": 0.0235, "step": 2233 }, { "epoch": 1.016378525932666, "grad_norm": 0.6199278189273147, "learning_rate": 9.015478835379369e-07, "loss": 0.0236, "step": 2234 }, { "epoch": 1.0168334849863512, "grad_norm": 1.3129793840167467, "learning_rate": 9.014627025333904e-07, "loss": 0.0266, "step": 2235 }, { "epoch": 1.0172884440400365, "grad_norm": 0.48724438428115363, "learning_rate": 9.013774887231248e-07, "loss": 0.0171, "step": 2236 }, { "epoch": 1.0177434030937216, "grad_norm": 0.8259257797209306, "learning_rate": 9.012922421141035e-07, "loss": 0.0358, "step": 2237 }, { "epoch": 1.0181983621474067, "grad_norm": 0.8940838290075597, "learning_rate": 9.012069627132924e-07, "loss": 0.0538, "step": 2238 }, { "epoch": 1.018653321201092, "grad_norm": 1.2406678996587794, "learning_rate": 9.011216505276599e-07, "loss": 0.0451, "step": 2239 }, { "epoch": 1.019108280254777, "grad_norm": 0.8896260745108636, "learning_rate": 9.010363055641779e-07, "loss": 0.0455, "step": 2240 }, { "epoch": 1.0195632393084622, "grad_norm": 0.596128830213427, "learning_rate": 9.0095092782982e-07, "loss": 0.0291, "step": 2241 }, { "epoch": 1.0200181983621475, "grad_norm": 0.7262020731774727, "learning_rate": 9.008655173315628e-07, "loss": 0.036, "step": 2242 }, { "epoch": 1.0204731574158326, "grad_norm": 0.8816942009549891, "learning_rate": 9.007800740763859e-07, "loss": 0.0347, "step": 2243 }, { "epoch": 1.0209281164695176, "grad_norm": 0.7134852393828885, "learning_rate": 9.006945980712713e-07, "loss": 0.0368, "step": 2244 }, { "epoch": 1.021383075523203, "grad_norm": 0.5934622054687926, "learning_rate": 9.006090893232036e-07, "loss": 0.024, "step": 2245 }, { "epoch": 1.021838034576888, "grad_norm": 0.6448882521984775, "learning_rate": 9.005235478391703e-07, "loss": 0.0338, "step": 2246 }, { "epoch": 1.0222929936305734, "grad_norm": 0.6653283030862152, "learning_rate": 9.004379736261614e-07, "loss": 0.0247, "step": 2247 }, { "epoch": 1.0227479526842584, "grad_norm": 0.9544965204907059, "learning_rate": 9.003523666911697e-07, "loss": 0.0386, "step": 2248 }, { "epoch": 1.0232029117379435, "grad_norm": 0.6728412097776658, "learning_rate": 9.002667270411905e-07, "loss": 0.0218, "step": 2249 }, { "epoch": 1.0236578707916288, "grad_norm": 0.5852534424416875, "learning_rate": 9.001810546832218e-07, "loss": 0.0284, "step": 2250 }, { "epoch": 1.024112829845314, "grad_norm": 0.6158965280597645, "learning_rate": 9.000953496242647e-07, "loss": 0.0327, "step": 2251 }, { "epoch": 1.024567788898999, "grad_norm": 0.7922187155456415, "learning_rate": 9.000096118713226e-07, "loss": 0.0517, "step": 2252 }, { "epoch": 1.0250227479526843, "grad_norm": 0.5650250362118168, "learning_rate": 8.999238414314012e-07, "loss": 0.0229, "step": 2253 }, { "epoch": 1.0254777070063694, "grad_norm": 0.697878865497645, "learning_rate": 8.998380383115096e-07, "loss": 0.0396, "step": 2254 }, { "epoch": 1.0259326660600545, "grad_norm": 0.9399847184108263, "learning_rate": 8.997522025186591e-07, "loss": 0.0425, "step": 2255 }, { "epoch": 1.0263876251137398, "grad_norm": 0.8544212729150434, "learning_rate": 8.99666334059864e-07, "loss": 0.0534, "step": 2256 }, { "epoch": 1.026842584167425, "grad_norm": 0.6022073290971853, "learning_rate": 8.995804329421408e-07, "loss": 0.0224, "step": 2257 }, { "epoch": 1.02729754322111, "grad_norm": 0.9971073260949552, "learning_rate": 8.994944991725094e-07, "loss": 0.0473, "step": 2258 }, { "epoch": 1.0277525022747953, "grad_norm": 0.5807682131146813, "learning_rate": 8.994085327579913e-07, "loss": 0.0252, "step": 2259 }, { "epoch": 1.0282074613284804, "grad_norm": 0.8489347092121144, "learning_rate": 8.993225337056117e-07, "loss": 0.0422, "step": 2260 }, { "epoch": 1.0286624203821657, "grad_norm": 0.9350808293061635, "learning_rate": 8.992365020223981e-07, "loss": 0.0496, "step": 2261 }, { "epoch": 1.0291173794358508, "grad_norm": 0.6983461989473276, "learning_rate": 8.991504377153803e-07, "loss": 0.0364, "step": 2262 }, { "epoch": 1.0295723384895359, "grad_norm": 0.9194119496101955, "learning_rate": 8.990643407915914e-07, "loss": 0.0343, "step": 2263 }, { "epoch": 1.0300272975432212, "grad_norm": 0.9138867323529014, "learning_rate": 8.989782112580668e-07, "loss": 0.0543, "step": 2264 }, { "epoch": 1.0304822565969063, "grad_norm": 0.7115711233330113, "learning_rate": 8.988920491218445e-07, "loss": 0.0324, "step": 2265 }, { "epoch": 1.0309372156505914, "grad_norm": 0.771435972099578, "learning_rate": 8.988058543899652e-07, "loss": 0.0285, "step": 2266 }, { "epoch": 1.0313921747042767, "grad_norm": 0.6724016468108822, "learning_rate": 8.987196270694726e-07, "loss": 0.0316, "step": 2267 }, { "epoch": 1.0318471337579618, "grad_norm": 0.8777376661939447, "learning_rate": 8.986333671674127e-07, "loss": 0.0208, "step": 2268 }, { "epoch": 1.0323020928116469, "grad_norm": 1.0750310321688274, "learning_rate": 8.985470746908342e-07, "loss": 0.0458, "step": 2269 }, { "epoch": 1.0327570518653322, "grad_norm": 0.8432186458544342, "learning_rate": 8.984607496467884e-07, "loss": 0.0294, "step": 2270 }, { "epoch": 1.0332120109190173, "grad_norm": 0.9080895793973706, "learning_rate": 8.983743920423297e-07, "loss": 0.0449, "step": 2271 }, { "epoch": 1.0336669699727024, "grad_norm": 0.6465239261990675, "learning_rate": 8.982880018845149e-07, "loss": 0.0256, "step": 2272 }, { "epoch": 1.0341219290263877, "grad_norm": 0.8271660096416348, "learning_rate": 8.982015791804029e-07, "loss": 0.0352, "step": 2273 }, { "epoch": 1.0345768880800728, "grad_norm": 0.800664336252902, "learning_rate": 8.981151239370565e-07, "loss": 0.0348, "step": 2274 }, { "epoch": 1.035031847133758, "grad_norm": 0.9341869718084582, "learning_rate": 8.980286361615399e-07, "loss": 0.0267, "step": 2275 }, { "epoch": 1.0354868061874432, "grad_norm": 0.9817730236433213, "learning_rate": 8.979421158609205e-07, "loss": 0.0372, "step": 2276 }, { "epoch": 1.0359417652411282, "grad_norm": 0.9927896973193208, "learning_rate": 8.978555630422686e-07, "loss": 0.0292, "step": 2277 }, { "epoch": 1.0363967242948136, "grad_norm": 1.1904824644989265, "learning_rate": 8.977689777126567e-07, "loss": 0.0676, "step": 2278 }, { "epoch": 1.0368516833484986, "grad_norm": 0.7472700334395056, "learning_rate": 8.976823598791603e-07, "loss": 0.0267, "step": 2279 }, { "epoch": 1.0373066424021837, "grad_norm": 1.0530356935360714, "learning_rate": 8.975957095488573e-07, "loss": 0.0378, "step": 2280 }, { "epoch": 1.037761601455869, "grad_norm": 1.0254459864657846, "learning_rate": 8.975090267288286e-07, "loss": 0.0382, "step": 2281 }, { "epoch": 1.0382165605095541, "grad_norm": 0.9128239530740545, "learning_rate": 8.974223114261573e-07, "loss": 0.0478, "step": 2282 }, { "epoch": 1.0386715195632392, "grad_norm": 1.2163925219456002, "learning_rate": 8.973355636479294e-07, "loss": 0.0642, "step": 2283 }, { "epoch": 1.0391264786169245, "grad_norm": 0.7002191478902219, "learning_rate": 8.972487834012337e-07, "loss": 0.0193, "step": 2284 }, { "epoch": 1.0395814376706096, "grad_norm": 1.3802143568851817, "learning_rate": 8.971619706931612e-07, "loss": 0.027, "step": 2285 }, { "epoch": 1.0400363967242947, "grad_norm": 1.1460887631856784, "learning_rate": 8.970751255308063e-07, "loss": 0.0505, "step": 2286 }, { "epoch": 1.04049135577798, "grad_norm": 0.9552138326200104, "learning_rate": 8.969882479212652e-07, "loss": 0.0281, "step": 2287 }, { "epoch": 1.040946314831665, "grad_norm": 0.9072720074402363, "learning_rate": 8.969013378716371e-07, "loss": 0.0308, "step": 2288 }, { "epoch": 1.0414012738853504, "grad_norm": 0.7187228322750128, "learning_rate": 8.968143953890242e-07, "loss": 0.0241, "step": 2289 }, { "epoch": 1.0418562329390355, "grad_norm": 0.6859943439466548, "learning_rate": 8.967274204805309e-07, "loss": 0.0402, "step": 2290 }, { "epoch": 1.0423111919927206, "grad_norm": 0.6867306420221936, "learning_rate": 8.966404131532643e-07, "loss": 0.0293, "step": 2291 }, { "epoch": 1.042766151046406, "grad_norm": 0.5676994380500459, "learning_rate": 8.965533734143345e-07, "loss": 0.0188, "step": 2292 }, { "epoch": 1.043221110100091, "grad_norm": 0.9565281877583584, "learning_rate": 8.964663012708538e-07, "loss": 0.0405, "step": 2293 }, { "epoch": 1.043676069153776, "grad_norm": 0.7826506000045265, "learning_rate": 8.963791967299373e-07, "loss": 0.0168, "step": 2294 }, { "epoch": 1.0441310282074614, "grad_norm": 0.7310068839618513, "learning_rate": 8.962920597987029e-07, "loss": 0.0278, "step": 2295 }, { "epoch": 1.0445859872611465, "grad_norm": 0.9136290396133182, "learning_rate": 8.962048904842711e-07, "loss": 0.0474, "step": 2296 }, { "epoch": 1.0450409463148316, "grad_norm": 0.8981005186611655, "learning_rate": 8.961176887937648e-07, "loss": 0.0231, "step": 2297 }, { "epoch": 1.0454959053685169, "grad_norm": 0.6691346139173115, "learning_rate": 8.960304547343099e-07, "loss": 0.0272, "step": 2298 }, { "epoch": 1.045950864422202, "grad_norm": 0.9330016430100659, "learning_rate": 8.959431883130346e-07, "loss": 0.0362, "step": 2299 }, { "epoch": 1.046405823475887, "grad_norm": 0.5724598155217767, "learning_rate": 8.958558895370703e-07, "loss": 0.0159, "step": 2300 }, { "epoch": 1.0468607825295724, "grad_norm": 0.875906255963188, "learning_rate": 8.9576855841355e-07, "loss": 0.0409, "step": 2301 }, { "epoch": 1.0473157415832575, "grad_norm": 1.0811495330683363, "learning_rate": 8.956811949496106e-07, "loss": 0.0344, "step": 2302 }, { "epoch": 1.0477707006369428, "grad_norm": 0.89927103579881, "learning_rate": 8.955937991523908e-07, "loss": 0.0341, "step": 2303 }, { "epoch": 1.0482256596906279, "grad_norm": 0.8818650722630335, "learning_rate": 8.955063710290321e-07, "loss": 0.0273, "step": 2304 }, { "epoch": 1.048680618744313, "grad_norm": 0.7774341690540894, "learning_rate": 8.95418910586679e-07, "loss": 0.0297, "step": 2305 }, { "epoch": 1.0491355777979983, "grad_norm": 0.6010550581632089, "learning_rate": 8.953314178324781e-07, "loss": 0.0323, "step": 2306 }, { "epoch": 1.0495905368516834, "grad_norm": 0.8249440318999435, "learning_rate": 8.952438927735791e-07, "loss": 0.0408, "step": 2307 }, { "epoch": 1.0500454959053684, "grad_norm": 0.9828686818877147, "learning_rate": 8.951563354171341e-07, "loss": 0.0289, "step": 2308 }, { "epoch": 1.0505004549590538, "grad_norm": 0.8541882048016893, "learning_rate": 8.950687457702979e-07, "loss": 0.03, "step": 2309 }, { "epoch": 1.0509554140127388, "grad_norm": 0.7850273383650231, "learning_rate": 8.949811238402278e-07, "loss": 0.0276, "step": 2310 }, { "epoch": 1.051410373066424, "grad_norm": 0.955132355713523, "learning_rate": 8.948934696340842e-07, "loss": 0.0463, "step": 2311 }, { "epoch": 1.0518653321201092, "grad_norm": 0.9370667734247461, "learning_rate": 8.948057831590295e-07, "loss": 0.0438, "step": 2312 }, { "epoch": 1.0523202911737943, "grad_norm": 0.778599678826226, "learning_rate": 8.94718064422229e-07, "loss": 0.0361, "step": 2313 }, { "epoch": 1.0527752502274794, "grad_norm": 0.8230462355663495, "learning_rate": 8.946303134308509e-07, "loss": 0.0387, "step": 2314 }, { "epoch": 1.0532302092811647, "grad_norm": 0.6879456908429652, "learning_rate": 8.945425301920657e-07, "loss": 0.0276, "step": 2315 }, { "epoch": 1.0536851683348498, "grad_norm": 0.8311720118191518, "learning_rate": 8.944547147130465e-07, "loss": 0.0535, "step": 2316 }, { "epoch": 1.0541401273885351, "grad_norm": 1.1074310784405148, "learning_rate": 8.943668670009696e-07, "loss": 0.0451, "step": 2317 }, { "epoch": 1.0545950864422202, "grad_norm": 0.9370004991335201, "learning_rate": 8.942789870630132e-07, "loss": 0.0218, "step": 2318 }, { "epoch": 1.0550500454959053, "grad_norm": 0.909276419266083, "learning_rate": 8.941910749063586e-07, "loss": 0.0308, "step": 2319 }, { "epoch": 1.0555050045495906, "grad_norm": 0.7600866731454429, "learning_rate": 8.941031305381893e-07, "loss": 0.0241, "step": 2320 }, { "epoch": 1.0559599636032757, "grad_norm": 0.7718742838892803, "learning_rate": 8.940151539656922e-07, "loss": 0.0321, "step": 2321 }, { "epoch": 1.0564149226569608, "grad_norm": 1.4405239000372414, "learning_rate": 8.939271451960559e-07, "loss": 0.0447, "step": 2322 }, { "epoch": 1.056869881710646, "grad_norm": 0.7928566773383434, "learning_rate": 8.938391042364721e-07, "loss": 0.033, "step": 2323 }, { "epoch": 1.0573248407643312, "grad_norm": 0.8124126353063322, "learning_rate": 8.937510310941356e-07, "loss": 0.0339, "step": 2324 }, { "epoch": 1.0577797998180163, "grad_norm": 0.8305360869282954, "learning_rate": 8.936629257762428e-07, "loss": 0.0286, "step": 2325 }, { "epoch": 1.0582347588717016, "grad_norm": 0.8005686363153205, "learning_rate": 8.935747882899935e-07, "loss": 0.0347, "step": 2326 }, { "epoch": 1.0586897179253867, "grad_norm": 0.8516782543827766, "learning_rate": 8.9348661864259e-07, "loss": 0.031, "step": 2327 }, { "epoch": 1.0591446769790718, "grad_norm": 1.1295581714533367, "learning_rate": 8.933984168412369e-07, "loss": 0.046, "step": 2328 }, { "epoch": 1.059599636032757, "grad_norm": 0.6585391850666715, "learning_rate": 8.933101828931417e-07, "loss": 0.0244, "step": 2329 }, { "epoch": 1.0600545950864422, "grad_norm": 0.9548259833211814, "learning_rate": 8.932219168055145e-07, "loss": 0.044, "step": 2330 }, { "epoch": 1.0605095541401275, "grad_norm": 0.9591435686407737, "learning_rate": 8.931336185855681e-07, "loss": 0.0339, "step": 2331 }, { "epoch": 1.0609645131938126, "grad_norm": 0.9058531108695391, "learning_rate": 8.930452882405177e-07, "loss": 0.0273, "step": 2332 }, { "epoch": 1.0614194722474977, "grad_norm": 0.853185946903912, "learning_rate": 8.929569257775814e-07, "loss": 0.0276, "step": 2333 }, { "epoch": 1.061874431301183, "grad_norm": 0.7526173108923806, "learning_rate": 8.928685312039798e-07, "loss": 0.0225, "step": 2334 }, { "epoch": 1.062329390354868, "grad_norm": 0.8968868930250231, "learning_rate": 8.927801045269359e-07, "loss": 0.0393, "step": 2335 }, { "epoch": 1.0627843494085532, "grad_norm": 0.9452791617640827, "learning_rate": 8.926916457536755e-07, "loss": 0.0287, "step": 2336 }, { "epoch": 1.0632393084622385, "grad_norm": 0.6492845755203341, "learning_rate": 8.926031548914274e-07, "loss": 0.0229, "step": 2337 }, { "epoch": 1.0636942675159236, "grad_norm": 1.2733514091283322, "learning_rate": 8.925146319474224e-07, "loss": 0.0494, "step": 2338 }, { "epoch": 1.0641492265696086, "grad_norm": 0.6799008869571052, "learning_rate": 8.924260769288943e-07, "loss": 0.0244, "step": 2339 }, { "epoch": 1.064604185623294, "grad_norm": 0.9512816864029636, "learning_rate": 8.923374898430793e-07, "loss": 0.032, "step": 2340 }, { "epoch": 1.065059144676979, "grad_norm": 0.9114866884782238, "learning_rate": 8.922488706972164e-07, "loss": 0.0439, "step": 2341 }, { "epoch": 1.0655141037306644, "grad_norm": 0.8568254857590972, "learning_rate": 8.921602194985471e-07, "loss": 0.0238, "step": 2342 }, { "epoch": 1.0659690627843494, "grad_norm": 0.6273797422411886, "learning_rate": 8.920715362543158e-07, "loss": 0.0164, "step": 2343 }, { "epoch": 1.0664240218380345, "grad_norm": 0.7921885310641688, "learning_rate": 8.919828209717691e-07, "loss": 0.0411, "step": 2344 }, { "epoch": 1.0668789808917198, "grad_norm": 0.9978892611252268, "learning_rate": 8.918940736581564e-07, "loss": 0.0303, "step": 2345 }, { "epoch": 1.067333939945405, "grad_norm": 0.80456271624151, "learning_rate": 8.918052943207297e-07, "loss": 0.0254, "step": 2346 }, { "epoch": 1.06778889899909, "grad_norm": 0.9315565966087159, "learning_rate": 8.917164829667438e-07, "loss": 0.0297, "step": 2347 }, { "epoch": 1.0682438580527753, "grad_norm": 0.8772106263581991, "learning_rate": 8.91627639603456e-07, "loss": 0.0377, "step": 2348 }, { "epoch": 1.0686988171064604, "grad_norm": 0.9447839729460547, "learning_rate": 8.91538764238126e-07, "loss": 0.0363, "step": 2349 }, { "epoch": 1.0691537761601455, "grad_norm": 1.1219837207811905, "learning_rate": 8.914498568780162e-07, "loss": 0.0336, "step": 2350 }, { "epoch": 1.0696087352138308, "grad_norm": 0.8620404049283086, "learning_rate": 8.913609175303921e-07, "loss": 0.0364, "step": 2351 }, { "epoch": 1.070063694267516, "grad_norm": 0.8605418858463079, "learning_rate": 8.912719462025211e-07, "loss": 0.0322, "step": 2352 }, { "epoch": 1.070518653321201, "grad_norm": 0.9464624598558937, "learning_rate": 8.911829429016737e-07, "loss": 0.0226, "step": 2353 }, { "epoch": 1.0709736123748863, "grad_norm": 0.6655807890325081, "learning_rate": 8.910939076351226e-07, "loss": 0.0213, "step": 2354 }, { "epoch": 1.0714285714285714, "grad_norm": 0.9253371713117516, "learning_rate": 8.910048404101435e-07, "loss": 0.0325, "step": 2355 }, { "epoch": 1.0718835304822565, "grad_norm": 0.9256403019831281, "learning_rate": 8.909157412340149e-07, "loss": 0.0269, "step": 2356 }, { "epoch": 1.0723384895359418, "grad_norm": 0.9128822984388533, "learning_rate": 8.908266101140172e-07, "loss": 0.0318, "step": 2357 }, { "epoch": 1.0727934485896269, "grad_norm": 0.8541785013757026, "learning_rate": 8.907374470574337e-07, "loss": 0.0329, "step": 2358 }, { "epoch": 1.0732484076433122, "grad_norm": 0.700583961503913, "learning_rate": 8.906482520715506e-07, "loss": 0.0287, "step": 2359 }, { "epoch": 1.0737033666969973, "grad_norm": 1.0412423627176521, "learning_rate": 8.905590251636565e-07, "loss": 0.0364, "step": 2360 }, { "epoch": 1.0741583257506824, "grad_norm": 1.1368290067057727, "learning_rate": 8.904697663410428e-07, "loss": 0.0508, "step": 2361 }, { "epoch": 1.0746132848043677, "grad_norm": 0.6477061664818674, "learning_rate": 8.90380475611003e-07, "loss": 0.0254, "step": 2362 }, { "epoch": 1.0750682438580528, "grad_norm": 0.7169852034857356, "learning_rate": 8.902911529808337e-07, "loss": 0.0305, "step": 2363 }, { "epoch": 1.0755232029117379, "grad_norm": 1.128314932035694, "learning_rate": 8.902017984578339e-07, "loss": 0.0237, "step": 2364 }, { "epoch": 1.0759781619654232, "grad_norm": 0.584404501714208, "learning_rate": 8.901124120493053e-07, "loss": 0.0213, "step": 2365 }, { "epoch": 1.0764331210191083, "grad_norm": 0.8307011815858157, "learning_rate": 8.90022993762552e-07, "loss": 0.0298, "step": 2366 }, { "epoch": 1.0768880800727934, "grad_norm": 0.8461806137466344, "learning_rate": 8.899335436048811e-07, "loss": 0.0322, "step": 2367 }, { "epoch": 1.0773430391264787, "grad_norm": 1.303345411661501, "learning_rate": 8.89844061583602e-07, "loss": 0.051, "step": 2368 }, { "epoch": 1.0777979981801638, "grad_norm": 0.8003216331566971, "learning_rate": 8.897545477060266e-07, "loss": 0.0148, "step": 2369 }, { "epoch": 1.078252957233849, "grad_norm": 0.8540198124214369, "learning_rate": 8.896650019794699e-07, "loss": 0.0275, "step": 2370 }, { "epoch": 1.0787079162875342, "grad_norm": 0.8954014528474152, "learning_rate": 8.895754244112486e-07, "loss": 0.0299, "step": 2371 }, { "epoch": 1.0791628753412192, "grad_norm": 1.0050975454724953, "learning_rate": 8.894858150086831e-07, "loss": 0.0334, "step": 2372 }, { "epoch": 1.0796178343949046, "grad_norm": 0.9247658344942917, "learning_rate": 8.893961737790956e-07, "loss": 0.0302, "step": 2373 }, { "epoch": 1.0800727934485896, "grad_norm": 1.046782159087304, "learning_rate": 8.893065007298115e-07, "loss": 0.0395, "step": 2374 }, { "epoch": 1.0805277525022747, "grad_norm": 0.8891076728781091, "learning_rate": 8.89216795868158e-07, "loss": 0.0364, "step": 2375 }, { "epoch": 1.08098271155596, "grad_norm": 0.9924840878351873, "learning_rate": 8.891270592014657e-07, "loss": 0.0422, "step": 2376 }, { "epoch": 1.0814376706096451, "grad_norm": 0.8642876515173423, "learning_rate": 8.890372907370676e-07, "loss": 0.0333, "step": 2377 }, { "epoch": 1.0818926296633302, "grad_norm": 1.1582919085779018, "learning_rate": 8.889474904822986e-07, "loss": 0.0311, "step": 2378 }, { "epoch": 1.0823475887170155, "grad_norm": 1.0138485467922973, "learning_rate": 8.888576584444975e-07, "loss": 0.0286, "step": 2379 }, { "epoch": 1.0828025477707006, "grad_norm": 0.8180299053545974, "learning_rate": 8.887677946310045e-07, "loss": 0.0273, "step": 2380 }, { "epoch": 1.0832575068243857, "grad_norm": 1.052855271593513, "learning_rate": 8.88677899049163e-07, "loss": 0.0393, "step": 2381 }, { "epoch": 1.083712465878071, "grad_norm": 0.6752847434195942, "learning_rate": 8.885879717063187e-07, "loss": 0.033, "step": 2382 }, { "epoch": 1.084167424931756, "grad_norm": 0.8324974258334753, "learning_rate": 8.884980126098203e-07, "loss": 0.0298, "step": 2383 }, { "epoch": 1.0846223839854412, "grad_norm": 0.9701503925745943, "learning_rate": 8.88408021767019e-07, "loss": 0.0238, "step": 2384 }, { "epoch": 1.0850773430391265, "grad_norm": 0.9881634502061726, "learning_rate": 8.883179991852678e-07, "loss": 0.031, "step": 2385 }, { "epoch": 1.0855323020928116, "grad_norm": 0.7926118267942757, "learning_rate": 8.882279448719234e-07, "loss": 0.0235, "step": 2386 }, { "epoch": 1.085987261146497, "grad_norm": 0.7438227378828037, "learning_rate": 8.881378588343447e-07, "loss": 0.0311, "step": 2387 }, { "epoch": 1.086442220200182, "grad_norm": 0.8155838630847136, "learning_rate": 8.88047741079893e-07, "loss": 0.0265, "step": 2388 }, { "epoch": 1.086897179253867, "grad_norm": 1.0470759439754946, "learning_rate": 8.879575916159323e-07, "loss": 0.0225, "step": 2389 }, { "epoch": 1.0873521383075524, "grad_norm": 0.7952871641910291, "learning_rate": 8.878674104498292e-07, "loss": 0.0165, "step": 2390 }, { "epoch": 1.0878070973612375, "grad_norm": 0.8188737298043667, "learning_rate": 8.877771975889528e-07, "loss": 0.0273, "step": 2391 }, { "epoch": 1.0882620564149226, "grad_norm": 2.6954949381383866, "learning_rate": 8.876869530406753e-07, "loss": 0.1138, "step": 2392 }, { "epoch": 1.0887170154686079, "grad_norm": 0.8559904819640873, "learning_rate": 8.875966768123703e-07, "loss": 0.0351, "step": 2393 }, { "epoch": 1.089171974522293, "grad_norm": 1.3567825594425384, "learning_rate": 8.875063689114155e-07, "loss": 0.0412, "step": 2394 }, { "epoch": 1.089626933575978, "grad_norm": 0.9712063773644144, "learning_rate": 8.874160293451903e-07, "loss": 0.0286, "step": 2395 }, { "epoch": 1.0900818926296634, "grad_norm": 0.7615827205864779, "learning_rate": 8.873256581210767e-07, "loss": 0.0316, "step": 2396 }, { "epoch": 1.0905368516833485, "grad_norm": 0.7911671074539933, "learning_rate": 8.872352552464592e-07, "loss": 0.0321, "step": 2397 }, { "epoch": 1.0909918107370338, "grad_norm": 0.7807781057384573, "learning_rate": 8.871448207287257e-07, "loss": 0.0354, "step": 2398 }, { "epoch": 1.0914467697907189, "grad_norm": 0.659492960727526, "learning_rate": 8.870543545752657e-07, "loss": 0.0317, "step": 2399 }, { "epoch": 1.091901728844404, "grad_norm": 0.9947720669307031, "learning_rate": 8.869638567934717e-07, "loss": 0.0483, "step": 2400 }, { "epoch": 1.0923566878980893, "grad_norm": 0.8285139117042748, "learning_rate": 8.868733273907388e-07, "loss": 0.0228, "step": 2401 }, { "epoch": 1.0928116469517744, "grad_norm": 0.9270092203933784, "learning_rate": 8.867827663744648e-07, "loss": 0.0183, "step": 2402 }, { "epoch": 1.0932666060054594, "grad_norm": 0.9482427198814914, "learning_rate": 8.866921737520499e-07, "loss": 0.0622, "step": 2403 }, { "epoch": 1.0937215650591448, "grad_norm": 0.9863603159631976, "learning_rate": 8.866015495308967e-07, "loss": 0.0303, "step": 2404 }, { "epoch": 1.0941765241128298, "grad_norm": 0.8553034881896592, "learning_rate": 8.865108937184107e-07, "loss": 0.0381, "step": 2405 }, { "epoch": 1.094631483166515, "grad_norm": 0.903577779407689, "learning_rate": 8.864202063220001e-07, "loss": 0.0299, "step": 2406 }, { "epoch": 1.0950864422202002, "grad_norm": 0.7635241285707882, "learning_rate": 8.863294873490751e-07, "loss": 0.0201, "step": 2407 }, { "epoch": 1.0955414012738853, "grad_norm": 0.8433093119217772, "learning_rate": 8.862387368070492e-07, "loss": 0.0238, "step": 2408 }, { "epoch": 1.0959963603275704, "grad_norm": 0.7429644949418723, "learning_rate": 8.861479547033379e-07, "loss": 0.0303, "step": 2409 }, { "epoch": 1.0964513193812557, "grad_norm": 0.5952115865136361, "learning_rate": 8.860571410453597e-07, "loss": 0.0103, "step": 2410 }, { "epoch": 1.0969062784349408, "grad_norm": 1.0834360332917525, "learning_rate": 8.859662958405351e-07, "loss": 0.0356, "step": 2411 }, { "epoch": 1.097361237488626, "grad_norm": 0.9316742589038043, "learning_rate": 8.85875419096288e-07, "loss": 0.0337, "step": 2412 }, { "epoch": 1.0978161965423112, "grad_norm": 0.889146726132034, "learning_rate": 8.857845108200441e-07, "loss": 0.0529, "step": 2413 }, { "epoch": 1.0982711555959963, "grad_norm": 0.6499318987098758, "learning_rate": 8.856935710192323e-07, "loss": 0.0246, "step": 2414 }, { "epoch": 1.0987261146496816, "grad_norm": 1.139018202606324, "learning_rate": 8.856025997012837e-07, "loss": 0.0242, "step": 2415 }, { "epoch": 1.0991810737033667, "grad_norm": 1.1340009213469804, "learning_rate": 8.85511596873632e-07, "loss": 0.0438, "step": 2416 }, { "epoch": 1.0996360327570518, "grad_norm": 1.075200393668298, "learning_rate": 8.854205625437135e-07, "loss": 0.03, "step": 2417 }, { "epoch": 1.100090991810737, "grad_norm": 0.805745978440032, "learning_rate": 8.85329496718967e-07, "loss": 0.0312, "step": 2418 }, { "epoch": 1.1005459508644222, "grad_norm": 0.9166446855827484, "learning_rate": 8.852383994068344e-07, "loss": 0.0561, "step": 2419 }, { "epoch": 1.1010009099181073, "grad_norm": 0.6303507148970261, "learning_rate": 8.851472706147594e-07, "loss": 0.0224, "step": 2420 }, { "epoch": 1.1014558689717926, "grad_norm": 0.8336499143830036, "learning_rate": 8.850561103501889e-07, "loss": 0.0259, "step": 2421 }, { "epoch": 1.1019108280254777, "grad_norm": 0.5590698674246094, "learning_rate": 8.849649186205719e-07, "loss": 0.0254, "step": 2422 }, { "epoch": 1.1023657870791628, "grad_norm": 0.963842898128466, "learning_rate": 8.848736954333602e-07, "loss": 0.0399, "step": 2423 }, { "epoch": 1.102820746132848, "grad_norm": 0.8446521526462915, "learning_rate": 8.847824407960081e-07, "loss": 0.0369, "step": 2424 }, { "epoch": 1.1032757051865332, "grad_norm": 0.8404842394370811, "learning_rate": 8.846911547159728e-07, "loss": 0.0218, "step": 2425 }, { "epoch": 1.1037306642402185, "grad_norm": 0.8258679090506431, "learning_rate": 8.845998372007134e-07, "loss": 0.0241, "step": 2426 }, { "epoch": 1.1041856232939036, "grad_norm": 0.837784932987909, "learning_rate": 8.845084882576923e-07, "loss": 0.0348, "step": 2427 }, { "epoch": 1.1046405823475887, "grad_norm": 0.9018327511482758, "learning_rate": 8.844171078943739e-07, "loss": 0.0345, "step": 2428 }, { "epoch": 1.105095541401274, "grad_norm": 0.8408314637448464, "learning_rate": 8.843256961182253e-07, "loss": 0.021, "step": 2429 }, { "epoch": 1.105550500454959, "grad_norm": 1.2824479563277507, "learning_rate": 8.842342529367166e-07, "loss": 0.0429, "step": 2430 }, { "epoch": 1.1060054595086442, "grad_norm": 1.324765373218032, "learning_rate": 8.841427783573199e-07, "loss": 0.0477, "step": 2431 }, { "epoch": 1.1064604185623295, "grad_norm": 0.7840176115600862, "learning_rate": 8.8405127238751e-07, "loss": 0.0234, "step": 2432 }, { "epoch": 1.1069153776160146, "grad_norm": 0.641650992086639, "learning_rate": 8.839597350347647e-07, "loss": 0.0232, "step": 2433 }, { "epoch": 1.1073703366696996, "grad_norm": 0.9642976707973323, "learning_rate": 8.838681663065637e-07, "loss": 0.0411, "step": 2434 }, { "epoch": 1.107825295723385, "grad_norm": 0.9113990805831763, "learning_rate": 8.837765662103897e-07, "loss": 0.0308, "step": 2435 }, { "epoch": 1.10828025477707, "grad_norm": 0.7730173685427556, "learning_rate": 8.836849347537277e-07, "loss": 0.0279, "step": 2436 }, { "epoch": 1.1087352138307551, "grad_norm": 0.8360717094273613, "learning_rate": 8.835932719440657e-07, "loss": 0.0171, "step": 2437 }, { "epoch": 1.1091901728844404, "grad_norm": 0.7653377373889798, "learning_rate": 8.835015777888937e-07, "loss": 0.0283, "step": 2438 }, { "epoch": 1.1096451319381255, "grad_norm": 1.0231782805505814, "learning_rate": 8.834098522957047e-07, "loss": 0.0454, "step": 2439 }, { "epoch": 1.1101000909918108, "grad_norm": 1.0126637525489022, "learning_rate": 8.83318095471994e-07, "loss": 0.0317, "step": 2440 }, { "epoch": 1.110555050045496, "grad_norm": 0.7651871323514453, "learning_rate": 8.832263073252597e-07, "loss": 0.035, "step": 2441 }, { "epoch": 1.111010009099181, "grad_norm": 0.8937780578997531, "learning_rate": 8.831344878630021e-07, "loss": 0.0417, "step": 2442 }, { "epoch": 1.1114649681528663, "grad_norm": 0.7943947476225677, "learning_rate": 8.830426370927244e-07, "loss": 0.0325, "step": 2443 }, { "epoch": 1.1119199272065514, "grad_norm": 0.897392046989779, "learning_rate": 8.829507550219322e-07, "loss": 0.0184, "step": 2444 }, { "epoch": 1.1123748862602365, "grad_norm": 0.7132302527527316, "learning_rate": 8.828588416581337e-07, "loss": 0.0263, "step": 2445 }, { "epoch": 1.1128298453139218, "grad_norm": 0.7450437954808692, "learning_rate": 8.827668970088396e-07, "loss": 0.0345, "step": 2446 }, { "epoch": 1.113284804367607, "grad_norm": 0.8843618979501796, "learning_rate": 8.826749210815631e-07, "loss": 0.021, "step": 2447 }, { "epoch": 1.113739763421292, "grad_norm": 0.9210487303833903, "learning_rate": 8.825829138838204e-07, "loss": 0.0224, "step": 2448 }, { "epoch": 1.1141947224749773, "grad_norm": 0.8697042182070017, "learning_rate": 8.824908754231298e-07, "loss": 0.0419, "step": 2449 }, { "epoch": 1.1146496815286624, "grad_norm": 0.9584077570510247, "learning_rate": 8.82398805707012e-07, "loss": 0.0343, "step": 2450 }, { "epoch": 1.1151046405823477, "grad_norm": 1.0470358763490941, "learning_rate": 8.823067047429906e-07, "loss": 0.031, "step": 2451 }, { "epoch": 1.1155595996360328, "grad_norm": 1.2853589270231134, "learning_rate": 8.822145725385919e-07, "loss": 0.0453, "step": 2452 }, { "epoch": 1.1160145586897179, "grad_norm": 0.9110892605007435, "learning_rate": 8.821224091013444e-07, "loss": 0.0417, "step": 2453 }, { "epoch": 1.1164695177434032, "grad_norm": 0.7711718194268029, "learning_rate": 8.820302144387793e-07, "loss": 0.0343, "step": 2454 }, { "epoch": 1.1169244767970883, "grad_norm": 0.8165860345340062, "learning_rate": 8.819379885584302e-07, "loss": 0.036, "step": 2455 }, { "epoch": 1.1173794358507734, "grad_norm": 1.1313682315600067, "learning_rate": 8.818457314678336e-07, "loss": 0.0413, "step": 2456 }, { "epoch": 1.1178343949044587, "grad_norm": 0.9167565746863611, "learning_rate": 8.817534431745281e-07, "loss": 0.0239, "step": 2457 }, { "epoch": 1.1182893539581438, "grad_norm": 0.8310921695231491, "learning_rate": 8.816611236860554e-07, "loss": 0.0257, "step": 2458 }, { "epoch": 1.1187443130118289, "grad_norm": 1.1140937078958777, "learning_rate": 8.815687730099593e-07, "loss": 0.0298, "step": 2459 }, { "epoch": 1.1191992720655142, "grad_norm": 0.7655187148759364, "learning_rate": 8.814763911537859e-07, "loss": 0.0239, "step": 2460 }, { "epoch": 1.1196542311191993, "grad_norm": 0.7482826850271679, "learning_rate": 8.813839781250847e-07, "loss": 0.0259, "step": 2461 }, { "epoch": 1.1201091901728844, "grad_norm": 0.8090316851696208, "learning_rate": 8.812915339314071e-07, "loss": 0.0398, "step": 2462 }, { "epoch": 1.1205641492265697, "grad_norm": 1.265615335396071, "learning_rate": 8.811990585803072e-07, "loss": 0.0462, "step": 2463 }, { "epoch": 1.1210191082802548, "grad_norm": 0.9736257113751081, "learning_rate": 8.81106552079342e-07, "loss": 0.043, "step": 2464 }, { "epoch": 1.1214740673339398, "grad_norm": 0.765871247938917, "learning_rate": 8.810140144360701e-07, "loss": 0.0296, "step": 2465 }, { "epoch": 1.1219290263876252, "grad_norm": 0.9685967262044362, "learning_rate": 8.809214456580537e-07, "loss": 0.0332, "step": 2466 }, { "epoch": 1.1223839854413102, "grad_norm": 0.7941215608538682, "learning_rate": 8.80828845752857e-07, "loss": 0.0211, "step": 2467 }, { "epoch": 1.1228389444949956, "grad_norm": 1.0095216441577834, "learning_rate": 8.807362147280468e-07, "loss": 0.0412, "step": 2468 }, { "epoch": 1.1232939035486806, "grad_norm": 0.8746472360953905, "learning_rate": 8.806435525911926e-07, "loss": 0.0286, "step": 2469 }, { "epoch": 1.1237488626023657, "grad_norm": 0.8372750605530158, "learning_rate": 8.80550859349866e-07, "loss": 0.0478, "step": 2470 }, { "epoch": 1.124203821656051, "grad_norm": 0.8002936156152544, "learning_rate": 8.80458135011642e-07, "loss": 0.031, "step": 2471 }, { "epoch": 1.1246587807097361, "grad_norm": 1.1011920893693101, "learning_rate": 8.803653795840973e-07, "loss": 0.0463, "step": 2472 }, { "epoch": 1.1251137397634212, "grad_norm": 1.705565193165256, "learning_rate": 8.802725930748114e-07, "loss": 0.0373, "step": 2473 }, { "epoch": 1.1255686988171065, "grad_norm": 0.8972633159149157, "learning_rate": 8.801797754913665e-07, "loss": 0.0315, "step": 2474 }, { "epoch": 1.1260236578707916, "grad_norm": 0.6859325739372538, "learning_rate": 8.800869268413473e-07, "loss": 0.0261, "step": 2475 }, { "epoch": 1.1264786169244767, "grad_norm": 0.8897847775591781, "learning_rate": 8.799940471323408e-07, "loss": 0.0229, "step": 2476 }, { "epoch": 1.126933575978162, "grad_norm": 1.0327610487006351, "learning_rate": 8.799011363719369e-07, "loss": 0.0376, "step": 2477 }, { "epoch": 1.127388535031847, "grad_norm": 0.60867833811083, "learning_rate": 8.798081945677279e-07, "loss": 0.0201, "step": 2478 }, { "epoch": 1.1278434940855324, "grad_norm": 0.8272025314190995, "learning_rate": 8.797152217273081e-07, "loss": 0.0231, "step": 2479 }, { "epoch": 1.1282984531392175, "grad_norm": 0.9696037662670335, "learning_rate": 8.796222178582754e-07, "loss": 0.0304, "step": 2480 }, { "epoch": 1.1287534121929026, "grad_norm": 0.8945425205730094, "learning_rate": 8.795291829682292e-07, "loss": 0.0257, "step": 2481 }, { "epoch": 1.129208371246588, "grad_norm": 0.8471843623838259, "learning_rate": 8.794361170647722e-07, "loss": 0.0229, "step": 2482 }, { "epoch": 1.129663330300273, "grad_norm": 0.79791363424229, "learning_rate": 8.793430201555094e-07, "loss": 0.0113, "step": 2483 }, { "epoch": 1.130118289353958, "grad_norm": 1.1280512751710647, "learning_rate": 8.792498922480479e-07, "loss": 0.0273, "step": 2484 }, { "epoch": 1.1305732484076434, "grad_norm": 0.8634474344309044, "learning_rate": 8.79156733349998e-07, "loss": 0.0356, "step": 2485 }, { "epoch": 1.1310282074613285, "grad_norm": 0.9103903747412124, "learning_rate": 8.79063543468972e-07, "loss": 0.0294, "step": 2486 }, { "epoch": 1.1314831665150136, "grad_norm": 1.5284283975429618, "learning_rate": 8.789703226125852e-07, "loss": 0.036, "step": 2487 }, { "epoch": 1.1319381255686989, "grad_norm": 0.9175611716349003, "learning_rate": 8.78877070788455e-07, "loss": 0.0246, "step": 2488 }, { "epoch": 1.132393084622384, "grad_norm": 0.8666908195926003, "learning_rate": 8.787837880042015e-07, "loss": 0.023, "step": 2489 }, { "epoch": 1.132848043676069, "grad_norm": 0.9378988615581378, "learning_rate": 8.786904742674475e-07, "loss": 0.0277, "step": 2490 }, { "epoch": 1.1333030027297544, "grad_norm": 0.8283113138484444, "learning_rate": 8.785971295858178e-07, "loss": 0.027, "step": 2491 }, { "epoch": 1.1337579617834395, "grad_norm": 0.9397779888894104, "learning_rate": 8.785037539669408e-07, "loss": 0.0273, "step": 2492 }, { "epoch": 1.1342129208371245, "grad_norm": 1.0351376313630762, "learning_rate": 8.784103474184461e-07, "loss": 0.0356, "step": 2493 }, { "epoch": 1.1346678798908099, "grad_norm": 0.9607817406092596, "learning_rate": 8.783169099479668e-07, "loss": 0.029, "step": 2494 }, { "epoch": 1.135122838944495, "grad_norm": 1.0938584505128677, "learning_rate": 8.782234415631381e-07, "loss": 0.0309, "step": 2495 }, { "epoch": 1.1355777979981803, "grad_norm": 0.8458037500091956, "learning_rate": 8.781299422715977e-07, "loss": 0.0173, "step": 2496 }, { "epoch": 1.1360327570518653, "grad_norm": 1.16935951129933, "learning_rate": 8.780364120809863e-07, "loss": 0.0349, "step": 2497 }, { "epoch": 1.1364877161055504, "grad_norm": 0.8077227363908266, "learning_rate": 8.779428509989462e-07, "loss": 0.0363, "step": 2498 }, { "epoch": 1.1369426751592357, "grad_norm": 0.911955986160999, "learning_rate": 8.778492590331233e-07, "loss": 0.0253, "step": 2499 }, { "epoch": 1.1373976342129208, "grad_norm": 0.6864722829508173, "learning_rate": 8.777556361911651e-07, "loss": 0.0352, "step": 2500 }, { "epoch": 1.137852593266606, "grad_norm": 0.9826494800534459, "learning_rate": 8.776619824807224e-07, "loss": 0.0236, "step": 2501 }, { "epoch": 1.1383075523202912, "grad_norm": 1.3203417049393482, "learning_rate": 8.775682979094479e-07, "loss": 0.0556, "step": 2502 }, { "epoch": 1.1387625113739763, "grad_norm": 1.0011776544168194, "learning_rate": 8.774745824849972e-07, "loss": 0.0303, "step": 2503 }, { "epoch": 1.1392174704276614, "grad_norm": 1.2388585492511763, "learning_rate": 8.773808362150282e-07, "loss": 0.0744, "step": 2504 }, { "epoch": 1.1396724294813467, "grad_norm": 0.789904511179679, "learning_rate": 8.772870591072016e-07, "loss": 0.0294, "step": 2505 }, { "epoch": 1.1401273885350318, "grad_norm": 0.8749208840183204, "learning_rate": 8.771932511691803e-07, "loss": 0.0298, "step": 2506 }, { "epoch": 1.1405823475887171, "grad_norm": 0.7865344617564735, "learning_rate": 8.7709941240863e-07, "loss": 0.0349, "step": 2507 }, { "epoch": 1.1410373066424022, "grad_norm": 0.8777258286393839, "learning_rate": 8.770055428332186e-07, "loss": 0.0372, "step": 2508 }, { "epoch": 1.1414922656960873, "grad_norm": 0.8710776698348903, "learning_rate": 8.769116424506168e-07, "loss": 0.0293, "step": 2509 }, { "epoch": 1.1419472247497726, "grad_norm": 0.989733226667955, "learning_rate": 8.768177112684976e-07, "loss": 0.0251, "step": 2510 }, { "epoch": 1.1424021838034577, "grad_norm": 0.8608246065643106, "learning_rate": 8.767237492945369e-07, "loss": 0.0221, "step": 2511 }, { "epoch": 1.1428571428571428, "grad_norm": 1.3056475852372622, "learning_rate": 8.766297565364126e-07, "loss": 0.0323, "step": 2512 }, { "epoch": 1.143312101910828, "grad_norm": 0.8945649708959262, "learning_rate": 8.765357330018055e-07, "loss": 0.0472, "step": 2513 }, { "epoch": 1.1437670609645132, "grad_norm": 0.8892906769685106, "learning_rate": 8.764416786983985e-07, "loss": 0.0409, "step": 2514 }, { "epoch": 1.1442220200181983, "grad_norm": 1.7282951087173073, "learning_rate": 8.763475936338778e-07, "loss": 0.0491, "step": 2515 }, { "epoch": 1.1446769790718836, "grad_norm": 1.1035449473994428, "learning_rate": 8.762534778159312e-07, "loss": 0.0299, "step": 2516 }, { "epoch": 1.1451319381255687, "grad_norm": 0.7404783386311238, "learning_rate": 8.761593312522495e-07, "loss": 0.0299, "step": 2517 }, { "epoch": 1.1455868971792538, "grad_norm": 1.0336073704960465, "learning_rate": 8.76065153950526e-07, "loss": 0.0407, "step": 2518 }, { "epoch": 1.146041856232939, "grad_norm": 1.177711152204267, "learning_rate": 8.759709459184564e-07, "loss": 0.0358, "step": 2519 }, { "epoch": 1.1464968152866242, "grad_norm": 1.0133414559011744, "learning_rate": 8.758767071637389e-07, "loss": 0.0388, "step": 2520 }, { "epoch": 1.1469517743403093, "grad_norm": 1.0937987748780054, "learning_rate": 8.757824376940745e-07, "loss": 0.0566, "step": 2521 }, { "epoch": 1.1474067333939946, "grad_norm": 0.8600644056884543, "learning_rate": 8.756881375171663e-07, "loss": 0.03, "step": 2522 }, { "epoch": 1.1478616924476797, "grad_norm": 1.041073321424695, "learning_rate": 8.7559380664072e-07, "loss": 0.0333, "step": 2523 }, { "epoch": 1.148316651501365, "grad_norm": 0.9262349051438561, "learning_rate": 8.75499445072444e-07, "loss": 0.0313, "step": 2524 }, { "epoch": 1.14877161055505, "grad_norm": 0.9473361085287598, "learning_rate": 8.754050528200492e-07, "loss": 0.0402, "step": 2525 }, { "epoch": 1.1492265696087351, "grad_norm": 0.9079695151632244, "learning_rate": 8.753106298912487e-07, "loss": 0.0254, "step": 2526 }, { "epoch": 1.1496815286624205, "grad_norm": 0.7481764262665361, "learning_rate": 8.752161762937585e-07, "loss": 0.0163, "step": 2527 }, { "epoch": 1.1501364877161055, "grad_norm": 1.0548565841708264, "learning_rate": 8.751216920352966e-07, "loss": 0.0276, "step": 2528 }, { "epoch": 1.1505914467697906, "grad_norm": 1.2294341743389245, "learning_rate": 8.750271771235842e-07, "loss": 0.029, "step": 2529 }, { "epoch": 1.151046405823476, "grad_norm": 0.64874358738655, "learning_rate": 8.749326315663445e-07, "loss": 0.0233, "step": 2530 }, { "epoch": 1.151501364877161, "grad_norm": 0.8099035740788556, "learning_rate": 8.748380553713033e-07, "loss": 0.0226, "step": 2531 }, { "epoch": 1.1519563239308463, "grad_norm": 0.9658366495777941, "learning_rate": 8.747434485461891e-07, "loss": 0.0242, "step": 2532 }, { "epoch": 1.1524112829845314, "grad_norm": 0.7675832658244434, "learning_rate": 8.746488110987325e-07, "loss": 0.0203, "step": 2533 }, { "epoch": 1.1528662420382165, "grad_norm": 1.368259791175065, "learning_rate": 8.745541430366669e-07, "loss": 0.0393, "step": 2534 }, { "epoch": 1.1533212010919018, "grad_norm": 2.145810269778326, "learning_rate": 8.744594443677282e-07, "loss": 0.0869, "step": 2535 }, { "epoch": 1.153776160145587, "grad_norm": 0.6567118725563217, "learning_rate": 8.74364715099655e-07, "loss": 0.0285, "step": 2536 }, { "epoch": 1.154231119199272, "grad_norm": 0.9004796223292545, "learning_rate": 8.742699552401876e-07, "loss": 0.0553, "step": 2537 }, { "epoch": 1.1546860782529573, "grad_norm": 0.8183304987123935, "learning_rate": 8.741751647970699e-07, "loss": 0.0282, "step": 2538 }, { "epoch": 1.1551410373066424, "grad_norm": 0.802146850650793, "learning_rate": 8.740803437780473e-07, "loss": 0.0228, "step": 2539 }, { "epoch": 1.1555959963603275, "grad_norm": 0.8852726447649146, "learning_rate": 8.739854921908684e-07, "loss": 0.0307, "step": 2540 }, { "epoch": 1.1560509554140128, "grad_norm": 0.7195373629857734, "learning_rate": 8.73890610043284e-07, "loss": 0.0294, "step": 2541 }, { "epoch": 1.156505914467698, "grad_norm": 0.7571261205274333, "learning_rate": 8.737956973430473e-07, "loss": 0.0484, "step": 2542 }, { "epoch": 1.156960873521383, "grad_norm": 0.8512636255791792, "learning_rate": 8.737007540979144e-07, "loss": 0.0395, "step": 2543 }, { "epoch": 1.1574158325750683, "grad_norm": 0.7542676910072469, "learning_rate": 8.736057803156434e-07, "loss": 0.023, "step": 2544 }, { "epoch": 1.1578707916287534, "grad_norm": 0.9991166315038315, "learning_rate": 8.735107760039953e-07, "loss": 0.0346, "step": 2545 }, { "epoch": 1.1583257506824385, "grad_norm": 0.891863620760079, "learning_rate": 8.734157411707334e-07, "loss": 0.0252, "step": 2546 }, { "epoch": 1.1587807097361238, "grad_norm": 1.1374843503948968, "learning_rate": 8.733206758236234e-07, "loss": 0.0531, "step": 2547 }, { "epoch": 1.1592356687898089, "grad_norm": 1.450707316594819, "learning_rate": 8.732255799704336e-07, "loss": 0.0378, "step": 2548 }, { "epoch": 1.159690627843494, "grad_norm": 0.936315447693317, "learning_rate": 8.731304536189349e-07, "loss": 0.029, "step": 2549 }, { "epoch": 1.1601455868971793, "grad_norm": 0.9493443213624092, "learning_rate": 8.730352967769006e-07, "loss": 0.0305, "step": 2550 }, { "epoch": 1.1606005459508644, "grad_norm": 0.9107810077410304, "learning_rate": 8.729401094521065e-07, "loss": 0.029, "step": 2551 }, { "epoch": 1.1610555050045497, "grad_norm": 0.8672910660089426, "learning_rate": 8.728448916523308e-07, "loss": 0.0368, "step": 2552 }, { "epoch": 1.1615104640582348, "grad_norm": 0.9909232540252985, "learning_rate": 8.727496433853543e-07, "loss": 0.0398, "step": 2553 }, { "epoch": 1.1619654231119199, "grad_norm": 0.9232353216894028, "learning_rate": 8.726543646589604e-07, "loss": 0.0293, "step": 2554 }, { "epoch": 1.1624203821656052, "grad_norm": 1.2906793790015074, "learning_rate": 8.725590554809345e-07, "loss": 0.0344, "step": 2555 }, { "epoch": 1.1628753412192903, "grad_norm": 0.8259165751738979, "learning_rate": 8.72463715859065e-07, "loss": 0.0187, "step": 2556 }, { "epoch": 1.1633303002729753, "grad_norm": 0.9598735663502731, "learning_rate": 8.72368345801143e-07, "loss": 0.0468, "step": 2557 }, { "epoch": 1.1637852593266607, "grad_norm": 0.8884099092516711, "learning_rate": 8.72272945314961e-07, "loss": 0.0176, "step": 2558 }, { "epoch": 1.1642402183803457, "grad_norm": 0.6853660238556566, "learning_rate": 8.721775144083155e-07, "loss": 0.0342, "step": 2559 }, { "epoch": 1.164695177434031, "grad_norm": 1.063445505056267, "learning_rate": 8.720820530890039e-07, "loss": 0.0406, "step": 2560 }, { "epoch": 1.1651501364877161, "grad_norm": 0.8583497483968962, "learning_rate": 8.719865613648274e-07, "loss": 0.024, "step": 2561 }, { "epoch": 1.1656050955414012, "grad_norm": 0.8825740374933925, "learning_rate": 8.71891039243589e-07, "loss": 0.0224, "step": 2562 }, { "epoch": 1.1660600545950865, "grad_norm": 0.6670044659478578, "learning_rate": 8.717954867330941e-07, "loss": 0.0157, "step": 2563 }, { "epoch": 1.1665150136487716, "grad_norm": 1.1997132685797292, "learning_rate": 8.716999038411512e-07, "loss": 0.0458, "step": 2564 }, { "epoch": 1.1669699727024567, "grad_norm": 0.6845676140550871, "learning_rate": 8.716042905755707e-07, "loss": 0.0286, "step": 2565 }, { "epoch": 1.167424931756142, "grad_norm": 0.868097670153605, "learning_rate": 8.715086469441658e-07, "loss": 0.0225, "step": 2566 }, { "epoch": 1.1678798908098271, "grad_norm": 0.9728284251967005, "learning_rate": 8.71412972954752e-07, "loss": 0.0213, "step": 2567 }, { "epoch": 1.1683348498635122, "grad_norm": 0.7794963975903837, "learning_rate": 8.713172686151473e-07, "loss": 0.0283, "step": 2568 }, { "epoch": 1.1687898089171975, "grad_norm": 0.9223588810493101, "learning_rate": 8.712215339331724e-07, "loss": 0.0407, "step": 2569 }, { "epoch": 1.1692447679708826, "grad_norm": 0.9319921027454858, "learning_rate": 8.711257689166498e-07, "loss": 0.0317, "step": 2570 }, { "epoch": 1.1696997270245677, "grad_norm": 0.7366234272895821, "learning_rate": 8.710299735734057e-07, "loss": 0.0213, "step": 2571 }, { "epoch": 1.170154686078253, "grad_norm": 0.8350251805304932, "learning_rate": 8.709341479112675e-07, "loss": 0.0232, "step": 2572 }, { "epoch": 1.170609645131938, "grad_norm": 0.9453278580950487, "learning_rate": 8.708382919380659e-07, "loss": 0.0369, "step": 2573 }, { "epoch": 1.1710646041856232, "grad_norm": 0.7726951911941323, "learning_rate": 8.707424056616339e-07, "loss": 0.0284, "step": 2574 }, { "epoch": 1.1715195632393085, "grad_norm": 0.9849642738527536, "learning_rate": 8.706464890898066e-07, "loss": 0.0253, "step": 2575 }, { "epoch": 1.1719745222929936, "grad_norm": 0.8873698781376558, "learning_rate": 8.705505422304222e-07, "loss": 0.0233, "step": 2576 }, { "epoch": 1.1724294813466787, "grad_norm": 1.0136651822010045, "learning_rate": 8.704545650913209e-07, "loss": 0.0232, "step": 2577 }, { "epoch": 1.172884440400364, "grad_norm": 0.8302827938681083, "learning_rate": 8.703585576803454e-07, "loss": 0.0453, "step": 2578 }, { "epoch": 1.173339399454049, "grad_norm": 1.0915012029031168, "learning_rate": 8.70262520005341e-07, "loss": 0.0435, "step": 2579 }, { "epoch": 1.1737943585077344, "grad_norm": 1.2321361184273745, "learning_rate": 8.701664520741556e-07, "loss": 0.033, "step": 2580 }, { "epoch": 1.1742493175614195, "grad_norm": 0.8876410819215693, "learning_rate": 8.700703538946394e-07, "loss": 0.0286, "step": 2581 }, { "epoch": 1.1747042766151046, "grad_norm": 0.877687332346365, "learning_rate": 8.699742254746452e-07, "loss": 0.0326, "step": 2582 }, { "epoch": 1.1751592356687899, "grad_norm": 0.9986136899148278, "learning_rate": 8.69878066822028e-07, "loss": 0.0439, "step": 2583 }, { "epoch": 1.175614194722475, "grad_norm": 0.9733151490674231, "learning_rate": 8.697818779446455e-07, "loss": 0.0358, "step": 2584 }, { "epoch": 1.17606915377616, "grad_norm": 0.9685094911214658, "learning_rate": 8.696856588503581e-07, "loss": 0.041, "step": 2585 }, { "epoch": 1.1765241128298454, "grad_norm": 0.894275335096385, "learning_rate": 8.69589409547028e-07, "loss": 0.039, "step": 2586 }, { "epoch": 1.1769790718835305, "grad_norm": 0.9640833284597093, "learning_rate": 8.694931300425203e-07, "loss": 0.0349, "step": 2587 }, { "epoch": 1.1774340309372158, "grad_norm": 1.1903454254601318, "learning_rate": 8.693968203447026e-07, "loss": 0.035, "step": 2588 }, { "epoch": 1.1778889899909009, "grad_norm": 0.9305421341632887, "learning_rate": 8.693004804614449e-07, "loss": 0.0263, "step": 2589 }, { "epoch": 1.178343949044586, "grad_norm": 0.6938399743560543, "learning_rate": 8.692041104006201e-07, "loss": 0.02, "step": 2590 }, { "epoch": 1.1787989080982713, "grad_norm": 1.167790573749163, "learning_rate": 8.691077101701023e-07, "loss": 0.0462, "step": 2591 }, { "epoch": 1.1792538671519563, "grad_norm": 0.7286916115432596, "learning_rate": 8.690112797777694e-07, "loss": 0.0274, "step": 2592 }, { "epoch": 1.1797088262056414, "grad_norm": 0.8555294165094003, "learning_rate": 8.689148192315011e-07, "loss": 0.0233, "step": 2593 }, { "epoch": 1.1801637852593267, "grad_norm": 0.8433503494794348, "learning_rate": 8.688183285391799e-07, "loss": 0.0272, "step": 2594 }, { "epoch": 1.1806187443130118, "grad_norm": 0.7648671741382471, "learning_rate": 8.687218077086904e-07, "loss": 0.0362, "step": 2595 }, { "epoch": 1.181073703366697, "grad_norm": 0.5757456866678153, "learning_rate": 8.686252567479199e-07, "loss": 0.0218, "step": 2596 }, { "epoch": 1.1815286624203822, "grad_norm": 1.0696190001634927, "learning_rate": 8.685286756647581e-07, "loss": 0.0397, "step": 2597 }, { "epoch": 1.1819836214740673, "grad_norm": 1.0004608870835447, "learning_rate": 8.684320644670972e-07, "loss": 0.0299, "step": 2598 }, { "epoch": 1.1824385805277524, "grad_norm": 1.0979397166634188, "learning_rate": 8.683354231628319e-07, "loss": 0.0283, "step": 2599 }, { "epoch": 1.1828935395814377, "grad_norm": 1.0629013290111025, "learning_rate": 8.68238751759859e-07, "loss": 0.0433, "step": 2600 }, { "epoch": 1.1833484986351228, "grad_norm": 0.8058094364911609, "learning_rate": 8.681420502660784e-07, "loss": 0.0361, "step": 2601 }, { "epoch": 1.183803457688808, "grad_norm": 1.1838635925066003, "learning_rate": 8.68045318689392e-07, "loss": 0.0405, "step": 2602 }, { "epoch": 1.1842584167424932, "grad_norm": 1.2483639760736494, "learning_rate": 8.679485570377042e-07, "loss": 0.0262, "step": 2603 }, { "epoch": 1.1847133757961783, "grad_norm": 0.9426256934212006, "learning_rate": 8.67851765318922e-07, "loss": 0.0299, "step": 2604 }, { "epoch": 1.1851683348498634, "grad_norm": 1.110042856980382, "learning_rate": 8.677549435409546e-07, "loss": 0.0293, "step": 2605 }, { "epoch": 1.1856232939035487, "grad_norm": 1.259474363908622, "learning_rate": 8.676580917117142e-07, "loss": 0.0667, "step": 2606 }, { "epoch": 1.1860782529572338, "grad_norm": 0.7936448979251531, "learning_rate": 8.675612098391147e-07, "loss": 0.0306, "step": 2607 }, { "epoch": 1.186533212010919, "grad_norm": 1.2389157144897776, "learning_rate": 8.674642979310732e-07, "loss": 0.0495, "step": 2608 }, { "epoch": 1.1869881710646042, "grad_norm": 1.0413866516583, "learning_rate": 8.673673559955085e-07, "loss": 0.0342, "step": 2609 }, { "epoch": 1.1874431301182893, "grad_norm": 0.7857159335822655, "learning_rate": 8.672703840403426e-07, "loss": 0.035, "step": 2610 }, { "epoch": 1.1878980891719746, "grad_norm": 0.8026032017287262, "learning_rate": 8.671733820734994e-07, "loss": 0.0262, "step": 2611 }, { "epoch": 1.1883530482256597, "grad_norm": 1.0756693018506136, "learning_rate": 8.670763501029057e-07, "loss": 0.0325, "step": 2612 }, { "epoch": 1.1888080072793448, "grad_norm": 0.8035108297851373, "learning_rate": 8.669792881364903e-07, "loss": 0.0352, "step": 2613 }, { "epoch": 1.18926296633303, "grad_norm": 0.7664342036540458, "learning_rate": 8.668821961821847e-07, "loss": 0.0423, "step": 2614 }, { "epoch": 1.1897179253867152, "grad_norm": 0.7898992830559549, "learning_rate": 8.667850742479229e-07, "loss": 0.0358, "step": 2615 }, { "epoch": 1.1901728844404005, "grad_norm": 0.8661065328677364, "learning_rate": 8.666879223416412e-07, "loss": 0.0369, "step": 2616 }, { "epoch": 1.1906278434940856, "grad_norm": 0.9817370545590277, "learning_rate": 8.665907404712784e-07, "loss": 0.0544, "step": 2617 }, { "epoch": 1.1910828025477707, "grad_norm": 0.8871072886939882, "learning_rate": 8.664935286447759e-07, "loss": 0.0312, "step": 2618 }, { "epoch": 1.191537761601456, "grad_norm": 0.7299476776130976, "learning_rate": 8.663962868700773e-07, "loss": 0.0242, "step": 2619 }, { "epoch": 1.191992720655141, "grad_norm": 0.9477495336785253, "learning_rate": 8.662990151551287e-07, "loss": 0.0597, "step": 2620 }, { "epoch": 1.1924476797088261, "grad_norm": 0.7180147600994478, "learning_rate": 8.662017135078789e-07, "loss": 0.0348, "step": 2621 }, { "epoch": 1.1929026387625115, "grad_norm": 1.3921428044109598, "learning_rate": 8.661043819362786e-07, "loss": 0.0474, "step": 2622 }, { "epoch": 1.1933575978161965, "grad_norm": 0.7607857568963972, "learning_rate": 8.660070204482817e-07, "loss": 0.0335, "step": 2623 }, { "epoch": 1.1938125568698816, "grad_norm": 0.7104960542635574, "learning_rate": 8.65909629051844e-07, "loss": 0.0263, "step": 2624 }, { "epoch": 1.194267515923567, "grad_norm": 0.7775544932741703, "learning_rate": 8.658122077549237e-07, "loss": 0.0334, "step": 2625 }, { "epoch": 1.194722474977252, "grad_norm": 0.9232802696169148, "learning_rate": 8.657147565654818e-07, "loss": 0.0461, "step": 2626 }, { "epoch": 1.1951774340309371, "grad_norm": 0.8050714150822946, "learning_rate": 8.656172754914817e-07, "loss": 0.0249, "step": 2627 }, { "epoch": 1.1956323930846224, "grad_norm": 0.8488983158184574, "learning_rate": 8.655197645408887e-07, "loss": 0.0329, "step": 2628 }, { "epoch": 1.1960873521383075, "grad_norm": 1.0167492573594907, "learning_rate": 8.654222237216713e-07, "loss": 0.0546, "step": 2629 }, { "epoch": 1.1965423111919926, "grad_norm": 1.1324323628167134, "learning_rate": 8.653246530418001e-07, "loss": 0.0353, "step": 2630 }, { "epoch": 1.196997270245678, "grad_norm": 0.8307449060749313, "learning_rate": 8.65227052509248e-07, "loss": 0.0223, "step": 2631 }, { "epoch": 1.197452229299363, "grad_norm": 1.3106456475122843, "learning_rate": 8.651294221319906e-07, "loss": 0.0646, "step": 2632 }, { "epoch": 1.197907188353048, "grad_norm": 0.8562557057241499, "learning_rate": 8.650317619180056e-07, "loss": 0.0228, "step": 2633 }, { "epoch": 1.1983621474067334, "grad_norm": 0.6968188230185423, "learning_rate": 8.649340718752736e-07, "loss": 0.0285, "step": 2634 }, { "epoch": 1.1988171064604185, "grad_norm": 0.864887208328244, "learning_rate": 8.648363520117772e-07, "loss": 0.0236, "step": 2635 }, { "epoch": 1.1992720655141038, "grad_norm": 0.5392982186137747, "learning_rate": 8.647386023355017e-07, "loss": 0.0183, "step": 2636 }, { "epoch": 1.199727024567789, "grad_norm": 1.3899681884070914, "learning_rate": 8.646408228544349e-07, "loss": 0.0543, "step": 2637 }, { "epoch": 1.200181983621474, "grad_norm": 0.9156702720631225, "learning_rate": 8.645430135765665e-07, "loss": 0.0295, "step": 2638 }, { "epoch": 1.2006369426751593, "grad_norm": 0.7395808560052927, "learning_rate": 8.644451745098895e-07, "loss": 0.0338, "step": 2639 }, { "epoch": 1.2010919017288444, "grad_norm": 1.087492965517917, "learning_rate": 8.643473056623985e-07, "loss": 0.0444, "step": 2640 }, { "epoch": 1.2015468607825295, "grad_norm": 1.1575706493574034, "learning_rate": 8.642494070420912e-07, "loss": 0.0431, "step": 2641 }, { "epoch": 1.2020018198362148, "grad_norm": 0.8398383422339285, "learning_rate": 8.641514786569672e-07, "loss": 0.0316, "step": 2642 }, { "epoch": 1.2024567788898999, "grad_norm": 1.0382231237286084, "learning_rate": 8.64053520515029e-07, "loss": 0.0331, "step": 2643 }, { "epoch": 1.2029117379435852, "grad_norm": 0.857165202666949, "learning_rate": 8.639555326242811e-07, "loss": 0.0238, "step": 2644 }, { "epoch": 1.2033666969972703, "grad_norm": 1.2479475325617886, "learning_rate": 8.638575149927305e-07, "loss": 0.0602, "step": 2645 }, { "epoch": 1.2038216560509554, "grad_norm": 1.2817271798241152, "learning_rate": 8.637594676283871e-07, "loss": 0.0324, "step": 2646 }, { "epoch": 1.2042766151046407, "grad_norm": 0.8891971430025116, "learning_rate": 8.636613905392627e-07, "loss": 0.0456, "step": 2647 }, { "epoch": 1.2047315741583258, "grad_norm": 0.6523519225820711, "learning_rate": 8.635632837333717e-07, "loss": 0.0166, "step": 2648 }, { "epoch": 1.2051865332120109, "grad_norm": 0.934430043000884, "learning_rate": 8.634651472187311e-07, "loss": 0.0311, "step": 2649 }, { "epoch": 1.2056414922656962, "grad_norm": 0.826266957806778, "learning_rate": 8.6336698100336e-07, "loss": 0.0332, "step": 2650 }, { "epoch": 1.2060964513193813, "grad_norm": 0.8937800172101543, "learning_rate": 8.632687850952802e-07, "loss": 0.0268, "step": 2651 }, { "epoch": 1.2065514103730663, "grad_norm": 0.7687682713920272, "learning_rate": 8.631705595025158e-07, "loss": 0.0268, "step": 2652 }, { "epoch": 1.2070063694267517, "grad_norm": 0.7805644263840028, "learning_rate": 8.630723042330933e-07, "loss": 0.0255, "step": 2653 }, { "epoch": 1.2074613284804367, "grad_norm": 0.7653074812349708, "learning_rate": 8.629740192950416e-07, "loss": 0.0222, "step": 2654 }, { "epoch": 1.2079162875341218, "grad_norm": 0.5691669131780308, "learning_rate": 8.628757046963924e-07, "loss": 0.0127, "step": 2655 }, { "epoch": 1.2083712465878071, "grad_norm": 0.6992600118833391, "learning_rate": 8.627773604451794e-07, "loss": 0.0181, "step": 2656 }, { "epoch": 1.2088262056414922, "grad_norm": 0.8668403088262662, "learning_rate": 8.626789865494386e-07, "loss": 0.0281, "step": 2657 }, { "epoch": 1.2092811646951773, "grad_norm": 0.7510126626187421, "learning_rate": 8.62580583017209e-07, "loss": 0.0309, "step": 2658 }, { "epoch": 1.2097361237488626, "grad_norm": 0.8222469993118944, "learning_rate": 8.624821498565314e-07, "loss": 0.0361, "step": 2659 }, { "epoch": 1.2101910828025477, "grad_norm": 0.9724049751320967, "learning_rate": 8.623836870754496e-07, "loss": 0.0384, "step": 2660 }, { "epoch": 1.210646041856233, "grad_norm": 0.8637456071363656, "learning_rate": 8.622851946820093e-07, "loss": 0.0477, "step": 2661 }, { "epoch": 1.2111010009099181, "grad_norm": 0.7428090907499658, "learning_rate": 8.621866726842591e-07, "loss": 0.0169, "step": 2662 }, { "epoch": 1.2115559599636032, "grad_norm": 1.1771749836415322, "learning_rate": 8.620881210902495e-07, "loss": 0.0476, "step": 2663 }, { "epoch": 1.2120109190172885, "grad_norm": 0.9536442285257221, "learning_rate": 8.61989539908034e-07, "loss": 0.0302, "step": 2664 }, { "epoch": 1.2124658780709736, "grad_norm": 1.1598632335316936, "learning_rate": 8.618909291456677e-07, "loss": 0.0705, "step": 2665 }, { "epoch": 1.2129208371246587, "grad_norm": 0.8630302494894735, "learning_rate": 8.617922888112092e-07, "loss": 0.0248, "step": 2666 }, { "epoch": 1.213375796178344, "grad_norm": 0.8916394376538732, "learning_rate": 8.616936189127187e-07, "loss": 0.0462, "step": 2667 }, { "epoch": 1.213830755232029, "grad_norm": 1.0317250762634762, "learning_rate": 8.61594919458259e-07, "loss": 0.0338, "step": 2668 }, { "epoch": 1.2142857142857142, "grad_norm": 1.000865944607014, "learning_rate": 8.614961904558954e-07, "loss": 0.0294, "step": 2669 }, { "epoch": 1.2147406733393995, "grad_norm": 0.6900526874711016, "learning_rate": 8.613974319136957e-07, "loss": 0.0199, "step": 2670 }, { "epoch": 1.2151956323930846, "grad_norm": 0.9571179965680353, "learning_rate": 8.612986438397298e-07, "loss": 0.0285, "step": 2671 }, { "epoch": 1.21565059144677, "grad_norm": 1.0336592406986884, "learning_rate": 8.611998262420706e-07, "loss": 0.0376, "step": 2672 }, { "epoch": 1.216105550500455, "grad_norm": 1.0040098954973333, "learning_rate": 8.611009791287926e-07, "loss": 0.0245, "step": 2673 }, { "epoch": 1.21656050955414, "grad_norm": 0.8439812463021409, "learning_rate": 8.610021025079733e-07, "loss": 0.0308, "step": 2674 }, { "epoch": 1.2170154686078254, "grad_norm": 0.9993024213039532, "learning_rate": 8.609031963876923e-07, "loss": 0.0259, "step": 2675 }, { "epoch": 1.2174704276615105, "grad_norm": 0.9446644976457677, "learning_rate": 8.608042607760321e-07, "loss": 0.0408, "step": 2676 }, { "epoch": 1.2179253867151956, "grad_norm": 0.8975686580424432, "learning_rate": 8.60705295681077e-07, "loss": 0.0339, "step": 2677 }, { "epoch": 1.2183803457688809, "grad_norm": 0.9347994398407182, "learning_rate": 8.606063011109142e-07, "loss": 0.0395, "step": 2678 }, { "epoch": 1.218835304822566, "grad_norm": 1.5918023666366123, "learning_rate": 8.605072770736328e-07, "loss": 0.035, "step": 2679 }, { "epoch": 1.219290263876251, "grad_norm": 0.7431878990666864, "learning_rate": 8.604082235773248e-07, "loss": 0.0233, "step": 2680 }, { "epoch": 1.2197452229299364, "grad_norm": 0.8457775406241679, "learning_rate": 8.603091406300844e-07, "loss": 0.0302, "step": 2681 }, { "epoch": 1.2202001819836215, "grad_norm": 0.8538855738139434, "learning_rate": 8.60210028240008e-07, "loss": 0.0284, "step": 2682 }, { "epoch": 1.2206551410373065, "grad_norm": 0.666206336540722, "learning_rate": 8.60110886415195e-07, "loss": 0.0191, "step": 2683 }, { "epoch": 1.2211101000909919, "grad_norm": 1.1246739511973611, "learning_rate": 8.600117151637465e-07, "loss": 0.0547, "step": 2684 }, { "epoch": 1.221565059144677, "grad_norm": 1.0840994302157372, "learning_rate": 8.599125144937665e-07, "loss": 0.0433, "step": 2685 }, { "epoch": 1.222020018198362, "grad_norm": 0.8449188422414068, "learning_rate": 8.598132844133612e-07, "loss": 0.0168, "step": 2686 }, { "epoch": 1.2224749772520473, "grad_norm": 2.534608582394185, "learning_rate": 8.597140249306392e-07, "loss": 0.0968, "step": 2687 }, { "epoch": 1.2229299363057324, "grad_norm": 0.7146285298298123, "learning_rate": 8.596147360537114e-07, "loss": 0.0162, "step": 2688 }, { "epoch": 1.2233848953594177, "grad_norm": 0.8317178848976031, "learning_rate": 8.595154177906915e-07, "loss": 0.0192, "step": 2689 }, { "epoch": 1.2238398544131028, "grad_norm": 0.800745233829314, "learning_rate": 8.594160701496951e-07, "loss": 0.0338, "step": 2690 }, { "epoch": 1.224294813466788, "grad_norm": 0.6059465657721909, "learning_rate": 8.593166931388407e-07, "loss": 0.0145, "step": 2691 }, { "epoch": 1.2247497725204732, "grad_norm": 0.8490689428452892, "learning_rate": 8.592172867662487e-07, "loss": 0.0365, "step": 2692 }, { "epoch": 1.2252047315741583, "grad_norm": 0.8708233937747618, "learning_rate": 8.591178510400423e-07, "loss": 0.0316, "step": 2693 }, { "epoch": 1.2256596906278434, "grad_norm": 1.353810373724661, "learning_rate": 8.590183859683467e-07, "loss": 0.0433, "step": 2694 }, { "epoch": 1.2261146496815287, "grad_norm": 1.0121976924638745, "learning_rate": 8.589188915592902e-07, "loss": 0.0548, "step": 2695 }, { "epoch": 1.2265696087352138, "grad_norm": 0.6614726257237998, "learning_rate": 8.588193678210026e-07, "loss": 0.0171, "step": 2696 }, { "epoch": 1.2270245677888991, "grad_norm": 0.559098903100871, "learning_rate": 8.587198147616166e-07, "loss": 0.0199, "step": 2697 }, { "epoch": 1.2274795268425842, "grad_norm": 0.7721115309689291, "learning_rate": 8.586202323892673e-07, "loss": 0.0221, "step": 2698 }, { "epoch": 1.2279344858962693, "grad_norm": 0.7566641686569185, "learning_rate": 8.585206207120923e-07, "loss": 0.0156, "step": 2699 }, { "epoch": 1.2283894449499546, "grad_norm": 0.7666791224278717, "learning_rate": 8.584209797382311e-07, "loss": 0.0323, "step": 2700 }, { "epoch": 1.2288444040036397, "grad_norm": 0.9568229421857039, "learning_rate": 8.583213094758261e-07, "loss": 0.0183, "step": 2701 }, { "epoch": 1.2292993630573248, "grad_norm": 0.8040414791111078, "learning_rate": 8.582216099330218e-07, "loss": 0.0341, "step": 2702 }, { "epoch": 1.22975432211101, "grad_norm": 1.0692788302267295, "learning_rate": 8.581218811179654e-07, "loss": 0.022, "step": 2703 }, { "epoch": 1.2302092811646952, "grad_norm": 0.798923009236112, "learning_rate": 8.580221230388058e-07, "loss": 0.0457, "step": 2704 }, { "epoch": 1.2306642402183803, "grad_norm": 0.9642692023281294, "learning_rate": 8.579223357036954e-07, "loss": 0.0353, "step": 2705 }, { "epoch": 1.2311191992720656, "grad_norm": 0.953804249008264, "learning_rate": 8.578225191207881e-07, "loss": 0.0366, "step": 2706 }, { "epoch": 1.2315741583257507, "grad_norm": 1.5650753213793487, "learning_rate": 8.577226732982403e-07, "loss": 0.0342, "step": 2707 }, { "epoch": 1.2320291173794358, "grad_norm": 1.1724310872204888, "learning_rate": 8.576227982442112e-07, "loss": 0.0411, "step": 2708 }, { "epoch": 1.232484076433121, "grad_norm": 0.841771290597977, "learning_rate": 8.575228939668621e-07, "loss": 0.0431, "step": 2709 }, { "epoch": 1.2329390354868062, "grad_norm": 1.2212258430686158, "learning_rate": 8.574229604743566e-07, "loss": 0.0336, "step": 2710 }, { "epoch": 1.2333939945404913, "grad_norm": 0.9272951632620287, "learning_rate": 8.573229977748608e-07, "loss": 0.0456, "step": 2711 }, { "epoch": 1.2338489535941766, "grad_norm": 0.8235520628833786, "learning_rate": 8.572230058765433e-07, "loss": 0.0352, "step": 2712 }, { "epoch": 1.2343039126478617, "grad_norm": 1.1763607546762411, "learning_rate": 8.57122984787575e-07, "loss": 0.0551, "step": 2713 }, { "epoch": 1.2347588717015467, "grad_norm": 0.7627847999344662, "learning_rate": 8.570229345161292e-07, "loss": 0.0264, "step": 2714 }, { "epoch": 1.235213830755232, "grad_norm": 0.9326508777750133, "learning_rate": 8.569228550703815e-07, "loss": 0.029, "step": 2715 }, { "epoch": 1.2356687898089171, "grad_norm": 0.8640614143388445, "learning_rate": 8.568227464585099e-07, "loss": 0.0302, "step": 2716 }, { "epoch": 1.2361237488626025, "grad_norm": 0.8331017167343165, "learning_rate": 8.567226086886947e-07, "loss": 0.0301, "step": 2717 }, { "epoch": 1.2365787079162875, "grad_norm": 0.6279308427553189, "learning_rate": 8.56622441769119e-07, "loss": 0.0167, "step": 2718 }, { "epoch": 1.2370336669699726, "grad_norm": 1.0694892915459606, "learning_rate": 8.565222457079678e-07, "loss": 0.0568, "step": 2719 }, { "epoch": 1.237488626023658, "grad_norm": 0.7518007206474223, "learning_rate": 8.564220205134288e-07, "loss": 0.0176, "step": 2720 }, { "epoch": 1.237943585077343, "grad_norm": 0.7565125781875415, "learning_rate": 8.563217661936919e-07, "loss": 0.0275, "step": 2721 }, { "epoch": 1.2383985441310281, "grad_norm": 0.6808288829324031, "learning_rate": 8.562214827569492e-07, "loss": 0.0187, "step": 2722 }, { "epoch": 1.2388535031847134, "grad_norm": 0.6281794381980826, "learning_rate": 8.561211702113959e-07, "loss": 0.0227, "step": 2723 }, { "epoch": 1.2393084622383985, "grad_norm": 0.9234896897980736, "learning_rate": 8.560208285652286e-07, "loss": 0.0329, "step": 2724 }, { "epoch": 1.2397634212920838, "grad_norm": 1.3462043154250611, "learning_rate": 8.55920457826647e-07, "loss": 0.0368, "step": 2725 }, { "epoch": 1.240218380345769, "grad_norm": 0.8005127553089734, "learning_rate": 8.558200580038529e-07, "loss": 0.0265, "step": 2726 }, { "epoch": 1.240673339399454, "grad_norm": 0.9474079132766127, "learning_rate": 8.557196291050506e-07, "loss": 0.0565, "step": 2727 }, { "epoch": 1.2411282984531393, "grad_norm": 0.8443127859428777, "learning_rate": 8.556191711384465e-07, "loss": 0.0406, "step": 2728 }, { "epoch": 1.2415832575068244, "grad_norm": 0.8713493044436513, "learning_rate": 8.555186841122497e-07, "loss": 0.0229, "step": 2729 }, { "epoch": 1.2420382165605095, "grad_norm": 1.8328249419664342, "learning_rate": 8.554181680346717e-07, "loss": 0.0555, "step": 2730 }, { "epoch": 1.2424931756141948, "grad_norm": 0.9424124441043785, "learning_rate": 8.553176229139261e-07, "loss": 0.0462, "step": 2731 }, { "epoch": 1.24294813466788, "grad_norm": 1.032635491558901, "learning_rate": 8.552170487582286e-07, "loss": 0.0345, "step": 2732 }, { "epoch": 1.243403093721565, "grad_norm": 0.6818150442890318, "learning_rate": 8.551164455757984e-07, "loss": 0.025, "step": 2733 }, { "epoch": 1.2438580527752503, "grad_norm": 0.6172359181882457, "learning_rate": 8.550158133748557e-07, "loss": 0.0192, "step": 2734 }, { "epoch": 1.2443130118289354, "grad_norm": 0.8256790956566126, "learning_rate": 8.549151521636242e-07, "loss": 0.0397, "step": 2735 }, { "epoch": 1.2447679708826205, "grad_norm": 0.8820740812061374, "learning_rate": 8.54814461950329e-07, "loss": 0.0302, "step": 2736 }, { "epoch": 1.2452229299363058, "grad_norm": 1.2458211377621542, "learning_rate": 8.547137427431985e-07, "loss": 0.0603, "step": 2737 }, { "epoch": 1.2456778889899909, "grad_norm": 1.0371278745265713, "learning_rate": 8.546129945504627e-07, "loss": 0.0532, "step": 2738 }, { "epoch": 1.246132848043676, "grad_norm": 0.9015054915456527, "learning_rate": 8.545122173803545e-07, "loss": 0.0292, "step": 2739 }, { "epoch": 1.2465878070973613, "grad_norm": 1.0727823773013052, "learning_rate": 8.544114112411087e-07, "loss": 0.0278, "step": 2740 }, { "epoch": 1.2470427661510464, "grad_norm": 0.7807361291657409, "learning_rate": 8.543105761409631e-07, "loss": 0.0339, "step": 2741 }, { "epoch": 1.2474977252047315, "grad_norm": 0.7503947406257944, "learning_rate": 8.542097120881571e-07, "loss": 0.04, "step": 2742 }, { "epoch": 1.2479526842584168, "grad_norm": 0.9624138384420282, "learning_rate": 8.541088190909331e-07, "loss": 0.0672, "step": 2743 }, { "epoch": 1.2484076433121019, "grad_norm": 0.8716528252480334, "learning_rate": 8.540078971575355e-07, "loss": 0.0283, "step": 2744 }, { "epoch": 1.2488626023657872, "grad_norm": 1.1678730580823093, "learning_rate": 8.539069462962113e-07, "loss": 0.0336, "step": 2745 }, { "epoch": 1.2493175614194723, "grad_norm": 0.8055204983484375, "learning_rate": 8.538059665152095e-07, "loss": 0.0288, "step": 2746 }, { "epoch": 1.2497725204731573, "grad_norm": 1.2373580815735874, "learning_rate": 8.537049578227821e-07, "loss": 0.0357, "step": 2747 }, { "epoch": 1.2502274795268427, "grad_norm": 0.8112876004206742, "learning_rate": 8.536039202271827e-07, "loss": 0.0375, "step": 2748 }, { "epoch": 1.2506824385805277, "grad_norm": 0.8078477050483465, "learning_rate": 8.53502853736668e-07, "loss": 0.022, "step": 2749 }, { "epoch": 1.251137397634213, "grad_norm": 0.9717190387408735, "learning_rate": 8.534017583594963e-07, "loss": 0.0295, "step": 2750 }, { "epoch": 1.2515923566878981, "grad_norm": 1.0759283319891644, "learning_rate": 8.533006341039291e-07, "loss": 0.0401, "step": 2751 }, { "epoch": 1.2520473157415832, "grad_norm": 1.2784905177974237, "learning_rate": 8.531994809782293e-07, "loss": 0.0477, "step": 2752 }, { "epoch": 1.2525022747952685, "grad_norm": 0.8731531986287857, "learning_rate": 8.530982989906631e-07, "loss": 0.0224, "step": 2753 }, { "epoch": 1.2529572338489536, "grad_norm": 1.0542070194752762, "learning_rate": 8.529970881494984e-07, "loss": 0.0567, "step": 2754 }, { "epoch": 1.2534121929026387, "grad_norm": 0.6495809477768786, "learning_rate": 8.528958484630059e-07, "loss": 0.026, "step": 2755 }, { "epoch": 1.253867151956324, "grad_norm": 0.6999029934363982, "learning_rate": 8.527945799394583e-07, "loss": 0.0341, "step": 2756 }, { "epoch": 1.2543221110100091, "grad_norm": 0.7595106654391769, "learning_rate": 8.526932825871307e-07, "loss": 0.033, "step": 2757 }, { "epoch": 1.2547770700636942, "grad_norm": 0.7387515662966558, "learning_rate": 8.525919564143009e-07, "loss": 0.0179, "step": 2758 }, { "epoch": 1.2552320291173795, "grad_norm": 1.1383461169234612, "learning_rate": 8.524906014292487e-07, "loss": 0.0491, "step": 2759 }, { "epoch": 1.2556869881710646, "grad_norm": 0.5776752036828022, "learning_rate": 8.523892176402563e-07, "loss": 0.0222, "step": 2760 }, { "epoch": 1.2561419472247497, "grad_norm": 0.762808332360433, "learning_rate": 8.522878050556086e-07, "loss": 0.0352, "step": 2761 }, { "epoch": 1.256596906278435, "grad_norm": 0.898431303608664, "learning_rate": 8.521863636835922e-07, "loss": 0.0268, "step": 2762 }, { "epoch": 1.25705186533212, "grad_norm": 0.6201388816560626, "learning_rate": 8.520848935324967e-07, "loss": 0.0203, "step": 2763 }, { "epoch": 1.2575068243858052, "grad_norm": 0.7607771077726979, "learning_rate": 8.519833946106137e-07, "loss": 0.0386, "step": 2764 }, { "epoch": 1.2579617834394905, "grad_norm": 0.9311134290411396, "learning_rate": 8.518818669262372e-07, "loss": 0.034, "step": 2765 }, { "epoch": 1.2584167424931756, "grad_norm": 0.7119855108181743, "learning_rate": 8.517803104876637e-07, "loss": 0.0237, "step": 2766 }, { "epoch": 1.2588717015468607, "grad_norm": 0.9920595848289004, "learning_rate": 8.516787253031919e-07, "loss": 0.0245, "step": 2767 }, { "epoch": 1.259326660600546, "grad_norm": 0.7586088087362506, "learning_rate": 8.515771113811226e-07, "loss": 0.0204, "step": 2768 }, { "epoch": 1.259781619654231, "grad_norm": 29.571837805482133, "learning_rate": 8.514754687297596e-07, "loss": 0.2798, "step": 2769 }, { "epoch": 1.2602365787079162, "grad_norm": 1.538300985585836, "learning_rate": 8.513737973574087e-07, "loss": 0.0628, "step": 2770 }, { "epoch": 1.2606915377616015, "grad_norm": 0.898568981374736, "learning_rate": 8.512720972723777e-07, "loss": 0.0275, "step": 2771 }, { "epoch": 1.2611464968152866, "grad_norm": 0.8650788511831783, "learning_rate": 8.511703684829772e-07, "loss": 0.0257, "step": 2772 }, { "epoch": 1.2616014558689717, "grad_norm": 1.4749542879970348, "learning_rate": 8.5106861099752e-07, "loss": 0.0556, "step": 2773 }, { "epoch": 1.262056414922657, "grad_norm": 0.8155185760231549, "learning_rate": 8.509668248243216e-07, "loss": 0.0182, "step": 2774 }, { "epoch": 1.262511373976342, "grad_norm": 0.793029365277259, "learning_rate": 8.508650099716989e-07, "loss": 0.0341, "step": 2775 }, { "epoch": 1.2629663330300274, "grad_norm": 0.5903331546786572, "learning_rate": 8.507631664479724e-07, "loss": 0.0147, "step": 2776 }, { "epoch": 1.2634212920837125, "grad_norm": 0.6903913073470873, "learning_rate": 8.506612942614637e-07, "loss": 0.027, "step": 2777 }, { "epoch": 1.2638762511373978, "grad_norm": 0.9561521582646012, "learning_rate": 8.505593934204978e-07, "loss": 0.036, "step": 2778 }, { "epoch": 1.2643312101910829, "grad_norm": 0.895283833887027, "learning_rate": 8.504574639334012e-07, "loss": 0.03, "step": 2779 }, { "epoch": 1.264786169244768, "grad_norm": 0.930684795682613, "learning_rate": 8.503555058085034e-07, "loss": 0.0324, "step": 2780 }, { "epoch": 1.2652411282984533, "grad_norm": 0.7687317982623642, "learning_rate": 8.50253519054136e-07, "loss": 0.0305, "step": 2781 }, { "epoch": 1.2656960873521383, "grad_norm": 1.6251050979666584, "learning_rate": 8.501515036786325e-07, "loss": 0.0453, "step": 2782 }, { "epoch": 1.2661510464058234, "grad_norm": 0.7519335570198988, "learning_rate": 8.500494596903296e-07, "loss": 0.0275, "step": 2783 }, { "epoch": 1.2666060054595087, "grad_norm": 0.8752544700554789, "learning_rate": 8.499473870975656e-07, "loss": 0.0292, "step": 2784 }, { "epoch": 1.2670609645131938, "grad_norm": 0.8856380690991379, "learning_rate": 8.498452859086814e-07, "loss": 0.0374, "step": 2785 }, { "epoch": 1.267515923566879, "grad_norm": 0.877737884751427, "learning_rate": 8.497431561320203e-07, "loss": 0.0376, "step": 2786 }, { "epoch": 1.2679708826205642, "grad_norm": 0.898013853971068, "learning_rate": 8.49640997775928e-07, "loss": 0.0239, "step": 2787 }, { "epoch": 1.2684258416742493, "grad_norm": 0.7934733976626871, "learning_rate": 8.495388108487524e-07, "loss": 0.0311, "step": 2788 }, { "epoch": 1.2688808007279344, "grad_norm": 0.8778715169240167, "learning_rate": 8.494365953588434e-07, "loss": 0.0308, "step": 2789 }, { "epoch": 1.2693357597816197, "grad_norm": 1.054069559535226, "learning_rate": 8.493343513145541e-07, "loss": 0.0333, "step": 2790 }, { "epoch": 1.2697907188353048, "grad_norm": 0.9978644973476216, "learning_rate": 8.492320787242393e-07, "loss": 0.0401, "step": 2791 }, { "epoch": 1.27024567788899, "grad_norm": 1.0178033338278905, "learning_rate": 8.491297775962559e-07, "loss": 0.0325, "step": 2792 }, { "epoch": 1.2707006369426752, "grad_norm": 0.8099115097985713, "learning_rate": 8.490274479389638e-07, "loss": 0.0267, "step": 2793 }, { "epoch": 1.2711555959963603, "grad_norm": 0.9554995061414735, "learning_rate": 8.489250897607248e-07, "loss": 0.0343, "step": 2794 }, { "epoch": 1.2716105550500454, "grad_norm": 0.9536409557720859, "learning_rate": 8.488227030699033e-07, "loss": 0.0448, "step": 2795 }, { "epoch": 1.2720655141037307, "grad_norm": 0.876765264289229, "learning_rate": 8.487202878748658e-07, "loss": 0.0298, "step": 2796 }, { "epoch": 1.2725204731574158, "grad_norm": 0.7360892475056073, "learning_rate": 8.486178441839811e-07, "loss": 0.0264, "step": 2797 }, { "epoch": 1.2729754322111009, "grad_norm": 0.7155934577478165, "learning_rate": 8.485153720056205e-07, "loss": 0.02, "step": 2798 }, { "epoch": 1.2734303912647862, "grad_norm": 0.9931079405828536, "learning_rate": 8.484128713481576e-07, "loss": 0.0282, "step": 2799 }, { "epoch": 1.2738853503184713, "grad_norm": 0.8509933471165853, "learning_rate": 8.483103422199682e-07, "loss": 0.0358, "step": 2800 }, { "epoch": 1.2743403093721566, "grad_norm": 5.464419503672341, "learning_rate": 8.482077846294308e-07, "loss": 0.1259, "step": 2801 }, { "epoch": 1.2747952684258417, "grad_norm": 1.0758352739317145, "learning_rate": 8.481051985849257e-07, "loss": 0.0395, "step": 2802 }, { "epoch": 1.2752502274795268, "grad_norm": 1.004303302699195, "learning_rate": 8.480025840948356e-07, "loss": 0.0332, "step": 2803 }, { "epoch": 1.275705186533212, "grad_norm": 0.9211519394120329, "learning_rate": 8.47899941167546e-07, "loss": 0.0243, "step": 2804 }, { "epoch": 1.2761601455868972, "grad_norm": 0.9048559233204291, "learning_rate": 8.477972698114444e-07, "loss": 0.0329, "step": 2805 }, { "epoch": 1.2766151046405825, "grad_norm": 1.0380182120483836, "learning_rate": 8.476945700349205e-07, "loss": 0.0389, "step": 2806 }, { "epoch": 1.2770700636942676, "grad_norm": 0.9725556922927764, "learning_rate": 8.475918418463663e-07, "loss": 0.0442, "step": 2807 }, { "epoch": 1.2775250227479527, "grad_norm": 1.4181765305273526, "learning_rate": 8.474890852541767e-07, "loss": 0.0507, "step": 2808 }, { "epoch": 1.277979981801638, "grad_norm": 1.0370985553909255, "learning_rate": 8.473863002667482e-07, "loss": 0.0588, "step": 2809 }, { "epoch": 1.278434940855323, "grad_norm": 1.0077374720045922, "learning_rate": 8.472834868924802e-07, "loss": 0.0446, "step": 2810 }, { "epoch": 1.2788898999090081, "grad_norm": 0.9967531478767317, "learning_rate": 8.471806451397738e-07, "loss": 0.0416, "step": 2811 }, { "epoch": 1.2793448589626935, "grad_norm": 0.888924918804667, "learning_rate": 8.47077775017033e-07, "loss": 0.0399, "step": 2812 }, { "epoch": 1.2797998180163785, "grad_norm": 0.8317232277118605, "learning_rate": 8.469748765326639e-07, "loss": 0.0233, "step": 2813 }, { "epoch": 1.2802547770700636, "grad_norm": 0.6372730616109161, "learning_rate": 8.468719496950747e-07, "loss": 0.0139, "step": 2814 }, { "epoch": 1.280709736123749, "grad_norm": 0.822238392999572, "learning_rate": 8.467689945126764e-07, "loss": 0.036, "step": 2815 }, { "epoch": 1.281164695177434, "grad_norm": 0.7465263345704704, "learning_rate": 8.466660109938817e-07, "loss": 0.0422, "step": 2816 }, { "epoch": 1.2816196542311191, "grad_norm": 0.7625128178147612, "learning_rate": 8.465629991471059e-07, "loss": 0.0242, "step": 2817 }, { "epoch": 1.2820746132848044, "grad_norm": 0.7070322517597929, "learning_rate": 8.464599589807673e-07, "loss": 0.026, "step": 2818 }, { "epoch": 1.2825295723384895, "grad_norm": 0.8645165460779396, "learning_rate": 8.463568905032852e-07, "loss": 0.0286, "step": 2819 }, { "epoch": 1.2829845313921746, "grad_norm": 1.0598296355280954, "learning_rate": 8.462537937230822e-07, "loss": 0.0474, "step": 2820 }, { "epoch": 1.28343949044586, "grad_norm": 0.7153155757409426, "learning_rate": 8.461506686485829e-07, "loss": 0.0245, "step": 2821 }, { "epoch": 1.283894449499545, "grad_norm": 0.8779042590660113, "learning_rate": 8.460475152882141e-07, "loss": 0.0329, "step": 2822 }, { "epoch": 1.28434940855323, "grad_norm": 0.7527172367412839, "learning_rate": 8.459443336504051e-07, "loss": 0.0349, "step": 2823 }, { "epoch": 1.2848043676069154, "grad_norm": 0.9017759581977272, "learning_rate": 8.458411237435874e-07, "loss": 0.0342, "step": 2824 }, { "epoch": 1.2852593266606005, "grad_norm": 1.0696006646180038, "learning_rate": 8.457378855761948e-07, "loss": 0.0231, "step": 2825 }, { "epoch": 1.2857142857142856, "grad_norm": 0.7174463964489417, "learning_rate": 8.456346191566636e-07, "loss": 0.0254, "step": 2826 }, { "epoch": 1.286169244767971, "grad_norm": 0.9351342843802792, "learning_rate": 8.455313244934324e-07, "loss": 0.0313, "step": 2827 }, { "epoch": 1.286624203821656, "grad_norm": 0.766971778252138, "learning_rate": 8.454280015949416e-07, "loss": 0.0239, "step": 2828 }, { "epoch": 1.2870791628753413, "grad_norm": 1.0100937597181174, "learning_rate": 8.453246504696343e-07, "loss": 0.0309, "step": 2829 }, { "epoch": 1.2875341219290264, "grad_norm": 0.8060429240038308, "learning_rate": 8.452212711259562e-07, "loss": 0.0264, "step": 2830 }, { "epoch": 1.2879890809827115, "grad_norm": 1.0364219140906665, "learning_rate": 8.451178635723548e-07, "loss": 0.0437, "step": 2831 }, { "epoch": 1.2884440400363968, "grad_norm": 0.7367547889086948, "learning_rate": 8.450144278172802e-07, "loss": 0.0315, "step": 2832 }, { "epoch": 1.2888989990900819, "grad_norm": 0.9343874406387315, "learning_rate": 8.449109638691845e-07, "loss": 0.0408, "step": 2833 }, { "epoch": 1.2893539581437672, "grad_norm": 0.7694003408033643, "learning_rate": 8.448074717365225e-07, "loss": 0.0249, "step": 2834 }, { "epoch": 1.2898089171974523, "grad_norm": 0.7467586972236325, "learning_rate": 8.44703951427751e-07, "loss": 0.0227, "step": 2835 }, { "epoch": 1.2902638762511374, "grad_norm": 0.9460776888879788, "learning_rate": 8.446004029513293e-07, "loss": 0.0206, "step": 2836 }, { "epoch": 1.2907188353048227, "grad_norm": 0.999505768984756, "learning_rate": 8.444968263157189e-07, "loss": 0.0285, "step": 2837 }, { "epoch": 1.2911737943585078, "grad_norm": 0.9700635430740446, "learning_rate": 8.443932215293835e-07, "loss": 0.0256, "step": 2838 }, { "epoch": 1.2916287534121929, "grad_norm": 0.8345140666608866, "learning_rate": 8.442895886007894e-07, "loss": 0.029, "step": 2839 }, { "epoch": 1.2920837124658782, "grad_norm": 0.7661805662221952, "learning_rate": 8.441859275384049e-07, "loss": 0.0369, "step": 2840 }, { "epoch": 1.2925386715195633, "grad_norm": 0.917244632313924, "learning_rate": 8.440822383507007e-07, "loss": 0.0326, "step": 2841 }, { "epoch": 1.2929936305732483, "grad_norm": 0.8277950578070447, "learning_rate": 8.439785210461498e-07, "loss": 0.0208, "step": 2842 }, { "epoch": 1.2934485896269337, "grad_norm": 0.8936634371107289, "learning_rate": 8.438747756332276e-07, "loss": 0.039, "step": 2843 }, { "epoch": 1.2939035486806187, "grad_norm": 1.0434837856207118, "learning_rate": 8.437710021204118e-07, "loss": 0.0418, "step": 2844 }, { "epoch": 1.2943585077343038, "grad_norm": 0.7273374902678118, "learning_rate": 8.436672005161819e-07, "loss": 0.025, "step": 2845 }, { "epoch": 1.2948134667879891, "grad_norm": 1.61730930841652, "learning_rate": 8.435633708290205e-07, "loss": 0.0706, "step": 2846 }, { "epoch": 1.2952684258416742, "grad_norm": 1.0593901045514347, "learning_rate": 8.43459513067412e-07, "loss": 0.0263, "step": 2847 }, { "epoch": 1.2957233848953593, "grad_norm": 1.107476273188476, "learning_rate": 8.43355627239843e-07, "loss": 0.0507, "step": 2848 }, { "epoch": 1.2961783439490446, "grad_norm": 0.9119030421096349, "learning_rate": 8.432517133548028e-07, "loss": 0.0358, "step": 2849 }, { "epoch": 1.2966333030027297, "grad_norm": 0.8055090320228976, "learning_rate": 8.431477714207829e-07, "loss": 0.0345, "step": 2850 }, { "epoch": 1.2970882620564148, "grad_norm": 0.5966390851987241, "learning_rate": 8.430438014462764e-07, "loss": 0.0244, "step": 2851 }, { "epoch": 1.2975432211101001, "grad_norm": 1.0821019163091679, "learning_rate": 8.429398034397798e-07, "loss": 0.0353, "step": 2852 }, { "epoch": 1.2979981801637852, "grad_norm": 0.8332730131579991, "learning_rate": 8.428357774097911e-07, "loss": 0.0451, "step": 2853 }, { "epoch": 1.2984531392174703, "grad_norm": 0.8643792349501233, "learning_rate": 8.42731723364811e-07, "loss": 0.0282, "step": 2854 }, { "epoch": 1.2989080982711556, "grad_norm": 0.9575864830335852, "learning_rate": 8.426276413133421e-07, "loss": 0.0464, "step": 2855 }, { "epoch": 1.2993630573248407, "grad_norm": 0.6697608740189507, "learning_rate": 8.425235312638898e-07, "loss": 0.0234, "step": 2856 }, { "epoch": 1.299818016378526, "grad_norm": 0.8702278345443554, "learning_rate": 8.424193932249613e-07, "loss": 0.0411, "step": 2857 }, { "epoch": 1.300272975432211, "grad_norm": 1.3750189637082497, "learning_rate": 8.423152272050663e-07, "loss": 0.0623, "step": 2858 }, { "epoch": 1.3007279344858962, "grad_norm": 0.916160849072238, "learning_rate": 8.422110332127168e-07, "loss": 0.0288, "step": 2859 }, { "epoch": 1.3011828935395815, "grad_norm": 0.8771988969979594, "learning_rate": 8.421068112564271e-07, "loss": 0.0234, "step": 2860 }, { "epoch": 1.3016378525932666, "grad_norm": 0.7079947173840426, "learning_rate": 8.420025613447138e-07, "loss": 0.0306, "step": 2861 }, { "epoch": 1.302092811646952, "grad_norm": 0.7788046446943802, "learning_rate": 8.418982834860956e-07, "loss": 0.035, "step": 2862 }, { "epoch": 1.302547770700637, "grad_norm": 1.1354329010641477, "learning_rate": 8.417939776890937e-07, "loss": 0.0421, "step": 2863 }, { "epoch": 1.303002729754322, "grad_norm": 1.2033815048476584, "learning_rate": 8.416896439622314e-07, "loss": 0.05, "step": 2864 }, { "epoch": 1.3034576888080074, "grad_norm": 1.1164537954938887, "learning_rate": 8.415852823140344e-07, "loss": 0.0392, "step": 2865 }, { "epoch": 1.3039126478616925, "grad_norm": 0.6607771111624197, "learning_rate": 8.414808927530308e-07, "loss": 0.0347, "step": 2866 }, { "epoch": 1.3043676069153776, "grad_norm": 1.0914032794723207, "learning_rate": 8.413764752877508e-07, "loss": 0.0466, "step": 2867 }, { "epoch": 1.3048225659690629, "grad_norm": 1.131568956913686, "learning_rate": 8.412720299267269e-07, "loss": 0.037, "step": 2868 }, { "epoch": 1.305277525022748, "grad_norm": 1.0495096571925588, "learning_rate": 8.411675566784938e-07, "loss": 0.0447, "step": 2869 }, { "epoch": 1.305732484076433, "grad_norm": 0.8997610924982157, "learning_rate": 8.410630555515885e-07, "loss": 0.0401, "step": 2870 }, { "epoch": 1.3061874431301184, "grad_norm": 1.0005870875608573, "learning_rate": 8.409585265545508e-07, "loss": 0.0477, "step": 2871 }, { "epoch": 1.3066424021838035, "grad_norm": 0.9870199434610379, "learning_rate": 8.40853969695922e-07, "loss": 0.0245, "step": 2872 }, { "epoch": 1.3070973612374885, "grad_norm": 0.9947090239101678, "learning_rate": 8.407493849842461e-07, "loss": 0.026, "step": 2873 }, { "epoch": 1.3075523202911739, "grad_norm": 0.7375537699000126, "learning_rate": 8.406447724280692e-07, "loss": 0.0314, "step": 2874 }, { "epoch": 1.308007279344859, "grad_norm": 1.0002681184565534, "learning_rate": 8.405401320359399e-07, "loss": 0.0403, "step": 2875 }, { "epoch": 1.308462238398544, "grad_norm": 0.9302288541565867, "learning_rate": 8.404354638164088e-07, "loss": 0.0409, "step": 2876 }, { "epoch": 1.3089171974522293, "grad_norm": 1.0328844689886274, "learning_rate": 8.40330767778029e-07, "loss": 0.0309, "step": 2877 }, { "epoch": 1.3093721565059144, "grad_norm": 0.909911768671675, "learning_rate": 8.402260439293558e-07, "loss": 0.0297, "step": 2878 }, { "epoch": 1.3098271155595995, "grad_norm": 0.9008305006970341, "learning_rate": 8.401212922789469e-07, "loss": 0.0217, "step": 2879 }, { "epoch": 1.3102820746132848, "grad_norm": 0.9354967155953791, "learning_rate": 8.400165128353617e-07, "loss": 0.0384, "step": 2880 }, { "epoch": 1.31073703366697, "grad_norm": 1.0964186283241741, "learning_rate": 8.399117056071627e-07, "loss": 0.0294, "step": 2881 }, { "epoch": 1.311191992720655, "grad_norm": 0.9407340837041227, "learning_rate": 8.398068706029143e-07, "loss": 0.0363, "step": 2882 }, { "epoch": 1.3116469517743403, "grad_norm": 1.6154118502538568, "learning_rate": 8.397020078311827e-07, "loss": 0.0869, "step": 2883 }, { "epoch": 1.3121019108280254, "grad_norm": 0.951658200428573, "learning_rate": 8.395971173005371e-07, "loss": 0.0301, "step": 2884 }, { "epoch": 1.3125568698817107, "grad_norm": 1.234566818121295, "learning_rate": 8.394921990195489e-07, "loss": 0.0621, "step": 2885 }, { "epoch": 1.3130118289353958, "grad_norm": 1.2713209150141227, "learning_rate": 8.393872529967912e-07, "loss": 0.0539, "step": 2886 }, { "epoch": 1.3134667879890811, "grad_norm": 0.6475084071331467, "learning_rate": 8.392822792408398e-07, "loss": 0.0255, "step": 2887 }, { "epoch": 1.3139217470427662, "grad_norm": 0.9436059640339335, "learning_rate": 8.391772777602728e-07, "loss": 0.0306, "step": 2888 }, { "epoch": 1.3143767060964513, "grad_norm": 0.8979778534711511, "learning_rate": 8.390722485636705e-07, "loss": 0.0341, "step": 2889 }, { "epoch": 1.3148316651501366, "grad_norm": 1.0191106187884549, "learning_rate": 8.389671916596152e-07, "loss": 0.0678, "step": 2890 }, { "epoch": 1.3152866242038217, "grad_norm": 1.1610452985467945, "learning_rate": 8.388621070566917e-07, "loss": 0.0302, "step": 2891 }, { "epoch": 1.3157415832575068, "grad_norm": 0.7757938985335905, "learning_rate": 8.387569947634872e-07, "loss": 0.0204, "step": 2892 }, { "epoch": 1.316196542311192, "grad_norm": 1.2149352709818675, "learning_rate": 8.386518547885907e-07, "loss": 0.0514, "step": 2893 }, { "epoch": 1.3166515013648772, "grad_norm": 0.7580404023924686, "learning_rate": 8.385466871405941e-07, "loss": 0.0275, "step": 2894 }, { "epoch": 1.3171064604185623, "grad_norm": 0.9404658641589325, "learning_rate": 8.384414918280911e-07, "loss": 0.0221, "step": 2895 }, { "epoch": 1.3175614194722476, "grad_norm": 0.9878433901937488, "learning_rate": 8.383362688596778e-07, "loss": 0.0349, "step": 2896 }, { "epoch": 1.3180163785259327, "grad_norm": 0.9075523972884683, "learning_rate": 8.382310182439525e-07, "loss": 0.0318, "step": 2897 }, { "epoch": 1.3184713375796178, "grad_norm": 0.8275424952051217, "learning_rate": 8.381257399895156e-07, "loss": 0.0496, "step": 2898 }, { "epoch": 1.318926296633303, "grad_norm": 0.7101830152752693, "learning_rate": 8.380204341049704e-07, "loss": 0.0374, "step": 2899 }, { "epoch": 1.3193812556869882, "grad_norm": 0.7759242964393012, "learning_rate": 8.37915100598922e-07, "loss": 0.0266, "step": 2900 }, { "epoch": 1.3198362147406733, "grad_norm": 0.8208327361578048, "learning_rate": 8.378097394799773e-07, "loss": 0.0315, "step": 2901 }, { "epoch": 1.3202911737943586, "grad_norm": 0.8148611820407744, "learning_rate": 8.377043507567463e-07, "loss": 0.0251, "step": 2902 }, { "epoch": 1.3207461328480437, "grad_norm": 0.9634778976822488, "learning_rate": 8.375989344378409e-07, "loss": 0.0404, "step": 2903 }, { "epoch": 1.3212010919017287, "grad_norm": 0.9127355551120976, "learning_rate": 8.374934905318751e-07, "loss": 0.0486, "step": 2904 }, { "epoch": 1.321656050955414, "grad_norm": 0.9430716146386843, "learning_rate": 8.373880190474653e-07, "loss": 0.0312, "step": 2905 }, { "epoch": 1.3221110100090991, "grad_norm": 0.7896419226076595, "learning_rate": 8.372825199932303e-07, "loss": 0.049, "step": 2906 }, { "epoch": 1.3225659690627842, "grad_norm": 0.7280015669063542, "learning_rate": 8.371769933777908e-07, "loss": 0.0237, "step": 2907 }, { "epoch": 1.3230209281164695, "grad_norm": 0.7985155290157837, "learning_rate": 8.370714392097702e-07, "loss": 0.0281, "step": 2908 }, { "epoch": 1.3234758871701546, "grad_norm": 0.8352762530367205, "learning_rate": 8.369658574977938e-07, "loss": 0.0349, "step": 2909 }, { "epoch": 1.3239308462238397, "grad_norm": 0.7818971012558334, "learning_rate": 8.368602482504891e-07, "loss": 0.0252, "step": 2910 }, { "epoch": 1.324385805277525, "grad_norm": 0.647133797793045, "learning_rate": 8.367546114764863e-07, "loss": 0.0237, "step": 2911 }, { "epoch": 1.3248407643312101, "grad_norm": 0.9207164248400135, "learning_rate": 8.366489471844173e-07, "loss": 0.0393, "step": 2912 }, { "epoch": 1.3252957233848954, "grad_norm": 0.8994060041183249, "learning_rate": 8.365432553829169e-07, "loss": 0.0333, "step": 2913 }, { "epoch": 1.3257506824385805, "grad_norm": 0.8946645860651039, "learning_rate": 8.364375360806213e-07, "loss": 0.0233, "step": 2914 }, { "epoch": 1.3262056414922658, "grad_norm": 0.6654464027520092, "learning_rate": 8.363317892861694e-07, "loss": 0.0332, "step": 2915 }, { "epoch": 1.326660600545951, "grad_norm": 0.6654832133753799, "learning_rate": 8.362260150082027e-07, "loss": 0.0242, "step": 2916 }, { "epoch": 1.327115559599636, "grad_norm": 0.949057074360122, "learning_rate": 8.361202132553646e-07, "loss": 0.0346, "step": 2917 }, { "epoch": 1.3275705186533213, "grad_norm": 0.8603425864135886, "learning_rate": 8.360143840363006e-07, "loss": 0.0268, "step": 2918 }, { "epoch": 1.3280254777070064, "grad_norm": 0.5494559528503289, "learning_rate": 8.359085273596583e-07, "loss": 0.0204, "step": 2919 }, { "epoch": 1.3284804367606915, "grad_norm": 0.7456614546502615, "learning_rate": 8.358026432340882e-07, "loss": 0.0196, "step": 2920 }, { "epoch": 1.3289353958143768, "grad_norm": 1.0250071002509307, "learning_rate": 8.356967316682426e-07, "loss": 0.0432, "step": 2921 }, { "epoch": 1.329390354868062, "grad_norm": 1.2349640285068422, "learning_rate": 8.355907926707759e-07, "loss": 0.049, "step": 2922 }, { "epoch": 1.329845313921747, "grad_norm": 0.9981234476872347, "learning_rate": 8.354848262503453e-07, "loss": 0.0442, "step": 2923 }, { "epoch": 1.3303002729754323, "grad_norm": 0.8184225472196581, "learning_rate": 8.353788324156099e-07, "loss": 0.0265, "step": 2924 }, { "epoch": 1.3307552320291174, "grad_norm": 0.5903508924335396, "learning_rate": 8.352728111752307e-07, "loss": 0.0171, "step": 2925 }, { "epoch": 1.3312101910828025, "grad_norm": 0.9430020554912252, "learning_rate": 8.351667625378713e-07, "loss": 0.0235, "step": 2926 }, { "epoch": 1.3316651501364878, "grad_norm": 0.7811389070681245, "learning_rate": 8.350606865121978e-07, "loss": 0.0281, "step": 2927 }, { "epoch": 1.3321201091901729, "grad_norm": 0.9932638321155588, "learning_rate": 8.349545831068782e-07, "loss": 0.0303, "step": 2928 }, { "epoch": 1.332575068243858, "grad_norm": 1.2165009076938014, "learning_rate": 8.348484523305827e-07, "loss": 0.0489, "step": 2929 }, { "epoch": 1.3330300272975433, "grad_norm": 0.613791812767916, "learning_rate": 8.347422941919838e-07, "loss": 0.0239, "step": 2930 }, { "epoch": 1.3334849863512284, "grad_norm": 1.0488825690539434, "learning_rate": 8.346361086997562e-07, "loss": 0.0444, "step": 2931 }, { "epoch": 1.3339399454049135, "grad_norm": 0.6962272059738152, "learning_rate": 8.345298958625772e-07, "loss": 0.0184, "step": 2932 }, { "epoch": 1.3343949044585988, "grad_norm": 1.0030629295451605, "learning_rate": 8.344236556891256e-07, "loss": 0.0281, "step": 2933 }, { "epoch": 1.3348498635122839, "grad_norm": 1.0781161773394115, "learning_rate": 8.343173881880833e-07, "loss": 0.0605, "step": 2934 }, { "epoch": 1.335304822565969, "grad_norm": 0.941945837567141, "learning_rate": 8.342110933681336e-07, "loss": 0.0375, "step": 2935 }, { "epoch": 1.3357597816196543, "grad_norm": 1.4982672162436177, "learning_rate": 8.341047712379627e-07, "loss": 0.0608, "step": 2936 }, { "epoch": 1.3362147406733393, "grad_norm": 0.9576110276499252, "learning_rate": 8.339984218062589e-07, "loss": 0.0335, "step": 2937 }, { "epoch": 1.3366696997270244, "grad_norm": 0.6355717462727449, "learning_rate": 8.338920450817122e-07, "loss": 0.0314, "step": 2938 }, { "epoch": 1.3371246587807097, "grad_norm": 1.1826104236396544, "learning_rate": 8.337856410730155e-07, "loss": 0.0481, "step": 2939 }, { "epoch": 1.3375796178343948, "grad_norm": 0.9684387950175698, "learning_rate": 8.336792097888635e-07, "loss": 0.033, "step": 2940 }, { "epoch": 1.3380345768880801, "grad_norm": 0.6768966649516572, "learning_rate": 8.335727512379534e-07, "loss": 0.028, "step": 2941 }, { "epoch": 1.3384895359417652, "grad_norm": 0.9020831048025556, "learning_rate": 8.334662654289846e-07, "loss": 0.0377, "step": 2942 }, { "epoch": 1.3389444949954505, "grad_norm": 1.2311883194114712, "learning_rate": 8.333597523706583e-07, "loss": 0.0528, "step": 2943 }, { "epoch": 1.3393994540491356, "grad_norm": 0.9613271056986702, "learning_rate": 8.332532120716785e-07, "loss": 0.0489, "step": 2944 }, { "epoch": 1.3398544131028207, "grad_norm": 0.8760250351034302, "learning_rate": 8.331466445407513e-07, "loss": 0.0539, "step": 2945 }, { "epoch": 1.340309372156506, "grad_norm": 1.1930418140458419, "learning_rate": 8.330400497865846e-07, "loss": 0.0421, "step": 2946 }, { "epoch": 1.3407643312101911, "grad_norm": 1.1104579924792486, "learning_rate": 8.329334278178892e-07, "loss": 0.0408, "step": 2947 }, { "epoch": 1.3412192902638762, "grad_norm": 0.7118455088294344, "learning_rate": 8.328267786433776e-07, "loss": 0.0256, "step": 2948 }, { "epoch": 1.3416742493175615, "grad_norm": 0.7542750225463263, "learning_rate": 8.327201022717644e-07, "loss": 0.0328, "step": 2949 }, { "epoch": 1.3421292083712466, "grad_norm": 0.804683631547178, "learning_rate": 8.326133987117673e-07, "loss": 0.0322, "step": 2950 }, { "epoch": 1.3425841674249317, "grad_norm": 0.9126589790205857, "learning_rate": 8.325066679721052e-07, "loss": 0.0418, "step": 2951 }, { "epoch": 1.343039126478617, "grad_norm": 1.439425596478824, "learning_rate": 8.323999100614998e-07, "loss": 0.0451, "step": 2952 }, { "epoch": 1.343494085532302, "grad_norm": 0.9084440152069617, "learning_rate": 8.322931249886749e-07, "loss": 0.0299, "step": 2953 }, { "epoch": 1.3439490445859872, "grad_norm": 0.7550091275084542, "learning_rate": 8.321863127623564e-07, "loss": 0.0313, "step": 2954 }, { "epoch": 1.3444040036396725, "grad_norm": 0.8314037752258797, "learning_rate": 8.320794733912726e-07, "loss": 0.0358, "step": 2955 }, { "epoch": 1.3448589626933576, "grad_norm": 1.4304484130796087, "learning_rate": 8.319726068841539e-07, "loss": 0.0289, "step": 2956 }, { "epoch": 1.3453139217470427, "grad_norm": 0.9252994348664043, "learning_rate": 8.318657132497329e-07, "loss": 0.0333, "step": 2957 }, { "epoch": 1.345768880800728, "grad_norm": 0.9356698745323252, "learning_rate": 8.317587924967444e-07, "loss": 0.0373, "step": 2958 }, { "epoch": 1.346223839854413, "grad_norm": 0.7793542603736853, "learning_rate": 8.316518446339258e-07, "loss": 0.0224, "step": 2959 }, { "epoch": 1.3466787989080982, "grad_norm": 0.7572184808058666, "learning_rate": 8.31544869670016e-07, "loss": 0.0277, "step": 2960 }, { "epoch": 1.3471337579617835, "grad_norm": 0.9437753796212603, "learning_rate": 8.314378676137568e-07, "loss": 0.0364, "step": 2961 }, { "epoch": 1.3475887170154686, "grad_norm": 1.0612544786288867, "learning_rate": 8.313308384738918e-07, "loss": 0.0341, "step": 2962 }, { "epoch": 1.3480436760691537, "grad_norm": 0.8822671306969826, "learning_rate": 8.31223782259167e-07, "loss": 0.0201, "step": 2963 }, { "epoch": 1.348498635122839, "grad_norm": 0.8523043416253127, "learning_rate": 8.311166989783302e-07, "loss": 0.0481, "step": 2964 }, { "epoch": 1.348953594176524, "grad_norm": 0.911965594113161, "learning_rate": 8.310095886401325e-07, "loss": 0.026, "step": 2965 }, { "epoch": 1.3494085532302094, "grad_norm": 1.0029096695163675, "learning_rate": 8.309024512533258e-07, "loss": 0.0361, "step": 2966 }, { "epoch": 1.3498635122838945, "grad_norm": 0.8251839422029651, "learning_rate": 8.307952868266652e-07, "loss": 0.0267, "step": 2967 }, { "epoch": 1.3503184713375795, "grad_norm": 0.7808147558070904, "learning_rate": 8.306880953689077e-07, "loss": 0.0181, "step": 2968 }, { "epoch": 1.3507734303912649, "grad_norm": 1.0184009654260666, "learning_rate": 8.305808768888123e-07, "loss": 0.0318, "step": 2969 }, { "epoch": 1.35122838944495, "grad_norm": 1.4427410375289322, "learning_rate": 8.304736313951406e-07, "loss": 0.0699, "step": 2970 }, { "epoch": 1.3516833484986353, "grad_norm": 0.9800665071707466, "learning_rate": 8.303663588966561e-07, "loss": 0.0453, "step": 2971 }, { "epoch": 1.3521383075523203, "grad_norm": 0.6250148918523828, "learning_rate": 8.302590594021246e-07, "loss": 0.0281, "step": 2972 }, { "epoch": 1.3525932666060054, "grad_norm": 1.0268211168602766, "learning_rate": 8.301517329203144e-07, "loss": 0.0628, "step": 2973 }, { "epoch": 1.3530482256596907, "grad_norm": 0.875291827668612, "learning_rate": 8.300443794599953e-07, "loss": 0.046, "step": 2974 }, { "epoch": 1.3535031847133758, "grad_norm": 0.9513902266572225, "learning_rate": 8.299369990299401e-07, "loss": 0.0402, "step": 2975 }, { "epoch": 1.353958143767061, "grad_norm": 0.7911932845851036, "learning_rate": 8.298295916389233e-07, "loss": 0.0231, "step": 2976 }, { "epoch": 1.3544131028207462, "grad_norm": 0.8153074453592719, "learning_rate": 8.297221572957218e-07, "loss": 0.0264, "step": 2977 }, { "epoch": 1.3548680618744313, "grad_norm": 0.7003497255737554, "learning_rate": 8.296146960091146e-07, "loss": 0.0141, "step": 2978 }, { "epoch": 1.3553230209281164, "grad_norm": 0.780461079359214, "learning_rate": 8.295072077878831e-07, "loss": 0.0222, "step": 2979 }, { "epoch": 1.3557779799818017, "grad_norm": 0.742442769864521, "learning_rate": 8.293996926408105e-07, "loss": 0.0249, "step": 2980 }, { "epoch": 1.3562329390354868, "grad_norm": 1.3345080043619642, "learning_rate": 8.292921505766824e-07, "loss": 0.0323, "step": 2981 }, { "epoch": 1.356687898089172, "grad_norm": 1.1599159669385952, "learning_rate": 8.291845816042871e-07, "loss": 0.042, "step": 2982 }, { "epoch": 1.3571428571428572, "grad_norm": 0.880807136706355, "learning_rate": 8.290769857324142e-07, "loss": 0.0342, "step": 2983 }, { "epoch": 1.3575978161965423, "grad_norm": 0.9218198996096685, "learning_rate": 8.289693629698563e-07, "loss": 0.034, "step": 2984 }, { "epoch": 1.3580527752502274, "grad_norm": 1.2077275582271114, "learning_rate": 8.288617133254074e-07, "loss": 0.0326, "step": 2985 }, { "epoch": 1.3585077343039127, "grad_norm": 1.272577511348183, "learning_rate": 8.287540368078647e-07, "loss": 0.0415, "step": 2986 }, { "epoch": 1.3589626933575978, "grad_norm": 0.9652058980073396, "learning_rate": 8.286463334260267e-07, "loss": 0.0354, "step": 2987 }, { "epoch": 1.3594176524112829, "grad_norm": 0.9073786705820537, "learning_rate": 8.285386031886943e-07, "loss": 0.0245, "step": 2988 }, { "epoch": 1.3598726114649682, "grad_norm": 1.1507813328947831, "learning_rate": 8.284308461046712e-07, "loss": 0.0516, "step": 2989 }, { "epoch": 1.3603275705186533, "grad_norm": 1.1198066713738621, "learning_rate": 8.283230621827624e-07, "loss": 0.0423, "step": 2990 }, { "epoch": 1.3607825295723384, "grad_norm": 0.935500007743045, "learning_rate": 8.282152514317755e-07, "loss": 0.0284, "step": 2991 }, { "epoch": 1.3612374886260237, "grad_norm": 0.9368734965289852, "learning_rate": 8.281074138605206e-07, "loss": 0.0427, "step": 2992 }, { "epoch": 1.3616924476797088, "grad_norm": 0.6335760787149742, "learning_rate": 8.279995494778096e-07, "loss": 0.0168, "step": 2993 }, { "epoch": 1.362147406733394, "grad_norm": 0.9328150043396523, "learning_rate": 8.278916582924565e-07, "loss": 0.043, "step": 2994 }, { "epoch": 1.3626023657870792, "grad_norm": 1.0113075146710357, "learning_rate": 8.277837403132779e-07, "loss": 0.0518, "step": 2995 }, { "epoch": 1.3630573248407643, "grad_norm": 0.6833604736284572, "learning_rate": 8.276757955490923e-07, "loss": 0.0284, "step": 2996 }, { "epoch": 1.3635122838944496, "grad_norm": 0.925479851882391, "learning_rate": 8.275678240087205e-07, "loss": 0.0459, "step": 2997 }, { "epoch": 1.3639672429481347, "grad_norm": 1.1156969179958318, "learning_rate": 8.274598257009855e-07, "loss": 0.0284, "step": 2998 }, { "epoch": 1.36442220200182, "grad_norm": 0.796950813695915, "learning_rate": 8.273518006347121e-07, "loss": 0.0221, "step": 2999 }, { "epoch": 1.364877161055505, "grad_norm": 0.9826986041410676, "learning_rate": 8.27243748818728e-07, "loss": 0.0337, "step": 3000 }, { "epoch": 1.3653321201091901, "grad_norm": 0.9367997670390044, "learning_rate": 8.271356702618626e-07, "loss": 0.0526, "step": 3001 }, { "epoch": 1.3657870791628755, "grad_norm": 1.0910812286825293, "learning_rate": 8.270275649729475e-07, "loss": 0.0359, "step": 3002 }, { "epoch": 1.3662420382165605, "grad_norm": 0.5889289683964172, "learning_rate": 8.269194329608166e-07, "loss": 0.0213, "step": 3003 }, { "epoch": 1.3666969972702456, "grad_norm": 0.7465682854231652, "learning_rate": 8.26811274234306e-07, "loss": 0.0209, "step": 3004 }, { "epoch": 1.367151956323931, "grad_norm": 0.9723365178327287, "learning_rate": 8.267030888022542e-07, "loss": 0.0262, "step": 3005 }, { "epoch": 1.367606915377616, "grad_norm": 0.69813663929556, "learning_rate": 8.265948766735009e-07, "loss": 0.0201, "step": 3006 }, { "epoch": 1.3680618744313011, "grad_norm": 0.6518997331087706, "learning_rate": 8.264866378568895e-07, "loss": 0.0247, "step": 3007 }, { "epoch": 1.3685168334849864, "grad_norm": 1.1005448988788136, "learning_rate": 8.263783723612644e-07, "loss": 0.026, "step": 3008 }, { "epoch": 1.3689717925386715, "grad_norm": 0.6184408781310367, "learning_rate": 8.262700801954725e-07, "loss": 0.0285, "step": 3009 }, { "epoch": 1.3694267515923566, "grad_norm": 1.0639748019706792, "learning_rate": 8.261617613683631e-07, "loss": 0.0346, "step": 3010 }, { "epoch": 1.369881710646042, "grad_norm": 1.5638090566896807, "learning_rate": 8.260534158887877e-07, "loss": 0.0709, "step": 3011 }, { "epoch": 1.370336669699727, "grad_norm": 0.7415216082068842, "learning_rate": 8.259450437655993e-07, "loss": 0.0341, "step": 3012 }, { "epoch": 1.370791628753412, "grad_norm": 1.0399503325473134, "learning_rate": 8.25836645007654e-07, "loss": 0.0479, "step": 3013 }, { "epoch": 1.3712465878070974, "grad_norm": 0.907319826971085, "learning_rate": 8.257282196238096e-07, "loss": 0.0338, "step": 3014 }, { "epoch": 1.3717015468607825, "grad_norm": 0.9350688460191624, "learning_rate": 8.25619767622926e-07, "loss": 0.0357, "step": 3015 }, { "epoch": 1.3721565059144676, "grad_norm": 0.9679057610014759, "learning_rate": 8.255112890138656e-07, "loss": 0.0354, "step": 3016 }, { "epoch": 1.372611464968153, "grad_norm": 0.560744813128689, "learning_rate": 8.254027838054924e-07, "loss": 0.023, "step": 3017 }, { "epoch": 1.373066424021838, "grad_norm": 0.6406416362290188, "learning_rate": 8.252942520066734e-07, "loss": 0.0207, "step": 3018 }, { "epoch": 1.373521383075523, "grad_norm": 0.7884287814994955, "learning_rate": 8.251856936262772e-07, "loss": 0.0454, "step": 3019 }, { "epoch": 1.3739763421292084, "grad_norm": 1.0395306693881303, "learning_rate": 8.250771086731745e-07, "loss": 0.0365, "step": 3020 }, { "epoch": 1.3744313011828935, "grad_norm": 0.8451630084403603, "learning_rate": 8.249684971562386e-07, "loss": 0.0225, "step": 3021 }, { "epoch": 1.3748862602365788, "grad_norm": 0.7923985055958195, "learning_rate": 8.248598590843446e-07, "loss": 0.0354, "step": 3022 }, { "epoch": 1.3753412192902639, "grad_norm": 1.0564046847222452, "learning_rate": 8.247511944663701e-07, "loss": 0.0356, "step": 3023 }, { "epoch": 1.3757961783439492, "grad_norm": 2.9466434854042527, "learning_rate": 8.246425033111943e-07, "loss": 0.0555, "step": 3024 }, { "epoch": 1.3762511373976343, "grad_norm": 1.2530193566372603, "learning_rate": 8.245337856276994e-07, "loss": 0.038, "step": 3025 }, { "epoch": 1.3767060964513194, "grad_norm": 1.1631007389646635, "learning_rate": 8.244250414247691e-07, "loss": 0.0732, "step": 3026 }, { "epoch": 1.3771610555050047, "grad_norm": 0.9814482279968092, "learning_rate": 8.243162707112894e-07, "loss": 0.0232, "step": 3027 }, { "epoch": 1.3776160145586898, "grad_norm": 0.6912811801021992, "learning_rate": 8.242074734961488e-07, "loss": 0.0272, "step": 3028 }, { "epoch": 1.3780709736123748, "grad_norm": 0.8636819376877516, "learning_rate": 8.240986497882375e-07, "loss": 0.0288, "step": 3029 }, { "epoch": 1.3785259326660602, "grad_norm": 0.7534166175819964, "learning_rate": 8.239897995964482e-07, "loss": 0.0248, "step": 3030 }, { "epoch": 1.3789808917197452, "grad_norm": 0.9084886506493514, "learning_rate": 8.238809229296755e-07, "loss": 0.0422, "step": 3031 }, { "epoch": 1.3794358507734303, "grad_norm": 1.0849463614667496, "learning_rate": 8.237720197968166e-07, "loss": 0.0489, "step": 3032 }, { "epoch": 1.3798908098271156, "grad_norm": 0.8229668137939848, "learning_rate": 8.236630902067702e-07, "loss": 0.0258, "step": 3033 }, { "epoch": 1.3803457688808007, "grad_norm": 0.9667653731279434, "learning_rate": 8.235541341684377e-07, "loss": 0.0353, "step": 3034 }, { "epoch": 1.3808007279344858, "grad_norm": 0.7785592100346905, "learning_rate": 8.234451516907226e-07, "loss": 0.0161, "step": 3035 }, { "epoch": 1.3812556869881711, "grad_norm": 0.9632067660917896, "learning_rate": 8.233361427825304e-07, "loss": 0.0385, "step": 3036 }, { "epoch": 1.3817106460418562, "grad_norm": 1.4787544255001095, "learning_rate": 8.232271074527687e-07, "loss": 0.0468, "step": 3037 }, { "epoch": 1.3821656050955413, "grad_norm": 0.6190773463166759, "learning_rate": 8.231180457103475e-07, "loss": 0.021, "step": 3038 }, { "epoch": 1.3826205641492266, "grad_norm": 0.6051032668235322, "learning_rate": 8.230089575641788e-07, "loss": 0.0256, "step": 3039 }, { "epoch": 1.3830755232029117, "grad_norm": 0.8667238371833399, "learning_rate": 8.228998430231768e-07, "loss": 0.0346, "step": 3040 }, { "epoch": 1.3835304822565968, "grad_norm": 0.6451080240871802, "learning_rate": 8.227907020962578e-07, "loss": 0.025, "step": 3041 }, { "epoch": 1.3839854413102821, "grad_norm": 0.9661681338659736, "learning_rate": 8.226815347923404e-07, "loss": 0.0406, "step": 3042 }, { "epoch": 1.3844404003639672, "grad_norm": 0.7850905240766595, "learning_rate": 8.225723411203451e-07, "loss": 0.0284, "step": 3043 }, { "epoch": 1.3848953594176523, "grad_norm": 1.005714999502436, "learning_rate": 8.224631210891948e-07, "loss": 0.0329, "step": 3044 }, { "epoch": 1.3853503184713376, "grad_norm": 0.7836021836071382, "learning_rate": 8.223538747078144e-07, "loss": 0.0459, "step": 3045 }, { "epoch": 1.3858052775250227, "grad_norm": 1.0055547014688397, "learning_rate": 8.222446019851313e-07, "loss": 0.0405, "step": 3046 }, { "epoch": 1.3862602365787078, "grad_norm": 1.3222992225220955, "learning_rate": 8.221353029300746e-07, "loss": 0.0534, "step": 3047 }, { "epoch": 1.386715195632393, "grad_norm": 0.711562189000451, "learning_rate": 8.220259775515755e-07, "loss": 0.0246, "step": 3048 }, { "epoch": 1.3871701546860782, "grad_norm": 0.9778754146965014, "learning_rate": 8.219166258585679e-07, "loss": 0.0447, "step": 3049 }, { "epoch": 1.3876251137397635, "grad_norm": 0.8469809212392932, "learning_rate": 8.218072478599873e-07, "loss": 0.0273, "step": 3050 }, { "epoch": 1.3880800727934486, "grad_norm": 0.8883483638792253, "learning_rate": 8.216978435647717e-07, "loss": 0.0379, "step": 3051 }, { "epoch": 1.388535031847134, "grad_norm": 0.8748345673504766, "learning_rate": 8.215884129818611e-07, "loss": 0.0244, "step": 3052 }, { "epoch": 1.388989990900819, "grad_norm": 0.7962413864961139, "learning_rate": 8.214789561201978e-07, "loss": 0.0292, "step": 3053 }, { "epoch": 1.389444949954504, "grad_norm": 1.5493628295707353, "learning_rate": 8.21369472988726e-07, "loss": 0.0456, "step": 3054 }, { "epoch": 1.3898999090081894, "grad_norm": 1.367723695541294, "learning_rate": 8.21259963596392e-07, "loss": 0.0327, "step": 3055 }, { "epoch": 1.3903548680618745, "grad_norm": 1.1264835778466813, "learning_rate": 8.211504279521445e-07, "loss": 0.0371, "step": 3056 }, { "epoch": 1.3908098271155596, "grad_norm": 0.9033084909974064, "learning_rate": 8.210408660649345e-07, "loss": 0.0376, "step": 3057 }, { "epoch": 1.3912647861692449, "grad_norm": 0.7494223177377431, "learning_rate": 8.209312779437147e-07, "loss": 0.0238, "step": 3058 }, { "epoch": 1.39171974522293, "grad_norm": 1.0872707110761046, "learning_rate": 8.208216635974401e-07, "loss": 0.0384, "step": 3059 }, { "epoch": 1.392174704276615, "grad_norm": 1.1313847313359517, "learning_rate": 8.207120230350681e-07, "loss": 0.0669, "step": 3060 }, { "epoch": 1.3926296633303004, "grad_norm": 0.5322755505111992, "learning_rate": 8.206023562655577e-07, "loss": 0.0205, "step": 3061 }, { "epoch": 1.3930846223839854, "grad_norm": 0.7847045180613638, "learning_rate": 8.204926632978708e-07, "loss": 0.0272, "step": 3062 }, { "epoch": 1.3935395814376705, "grad_norm": 1.0944319513998002, "learning_rate": 8.203829441409706e-07, "loss": 0.0453, "step": 3063 }, { "epoch": 1.3939945404913558, "grad_norm": 1.2453806492513526, "learning_rate": 8.202731988038231e-07, "loss": 0.0441, "step": 3064 }, { "epoch": 1.394449499545041, "grad_norm": 1.4196598596139172, "learning_rate": 8.201634272953963e-07, "loss": 0.0745, "step": 3065 }, { "epoch": 1.394904458598726, "grad_norm": 0.8745740814755255, "learning_rate": 8.200536296246598e-07, "loss": 0.0248, "step": 3066 }, { "epoch": 1.3953594176524113, "grad_norm": 0.8187287577923656, "learning_rate": 8.199438058005863e-07, "loss": 0.0273, "step": 3067 }, { "epoch": 1.3958143767060964, "grad_norm": 1.0845589564938265, "learning_rate": 8.198339558321496e-07, "loss": 0.036, "step": 3068 }, { "epoch": 1.3962693357597815, "grad_norm": 0.749700539896346, "learning_rate": 8.197240797283264e-07, "loss": 0.0222, "step": 3069 }, { "epoch": 1.3967242948134668, "grad_norm": 0.7156011770453841, "learning_rate": 8.196141774980955e-07, "loss": 0.0332, "step": 3070 }, { "epoch": 1.397179253867152, "grad_norm": 0.769239581926111, "learning_rate": 8.195042491504373e-07, "loss": 0.0189, "step": 3071 }, { "epoch": 1.397634212920837, "grad_norm": 0.6855695033973541, "learning_rate": 8.193942946943346e-07, "loss": 0.0143, "step": 3072 }, { "epoch": 1.3980891719745223, "grad_norm": 1.006967908776679, "learning_rate": 8.192843141387727e-07, "loss": 0.0337, "step": 3073 }, { "epoch": 1.3985441310282074, "grad_norm": 1.2810289940454445, "learning_rate": 8.191743074927384e-07, "loss": 0.0419, "step": 3074 }, { "epoch": 1.3989990900818925, "grad_norm": 0.8352757347897298, "learning_rate": 8.19064274765221e-07, "loss": 0.0269, "step": 3075 }, { "epoch": 1.3994540491355778, "grad_norm": 0.8102872139964511, "learning_rate": 8.189542159652121e-07, "loss": 0.03, "step": 3076 }, { "epoch": 1.399909008189263, "grad_norm": 1.009984916186407, "learning_rate": 8.188441311017049e-07, "loss": 0.0429, "step": 3077 }, { "epoch": 1.4003639672429482, "grad_norm": 0.931942290789495, "learning_rate": 8.187340201836953e-07, "loss": 0.0348, "step": 3078 }, { "epoch": 1.4008189262966333, "grad_norm": 0.894355487674316, "learning_rate": 8.186238832201808e-07, "loss": 0.031, "step": 3079 }, { "epoch": 1.4012738853503186, "grad_norm": 0.7429891604790393, "learning_rate": 8.185137202201617e-07, "loss": 0.0262, "step": 3080 }, { "epoch": 1.4017288444040037, "grad_norm": 0.8206984220624675, "learning_rate": 8.184035311926396e-07, "loss": 0.0182, "step": 3081 }, { "epoch": 1.4021838034576888, "grad_norm": 1.1458952187825449, "learning_rate": 8.18293316146619e-07, "loss": 0.0411, "step": 3082 }, { "epoch": 1.402638762511374, "grad_norm": 0.943936324620314, "learning_rate": 8.181830750911059e-07, "loss": 0.033, "step": 3083 }, { "epoch": 1.4030937215650592, "grad_norm": 0.9710715746759322, "learning_rate": 8.180728080351088e-07, "loss": 0.0292, "step": 3084 }, { "epoch": 1.4035486806187443, "grad_norm": 0.9958686806396023, "learning_rate": 8.179625149876382e-07, "loss": 0.0472, "step": 3085 }, { "epoch": 1.4040036396724296, "grad_norm": 1.3617185248220405, "learning_rate": 8.178521959577069e-07, "loss": 0.0535, "step": 3086 }, { "epoch": 1.4044585987261147, "grad_norm": 0.9132499387021406, "learning_rate": 8.177418509543295e-07, "loss": 0.0342, "step": 3087 }, { "epoch": 1.4049135577797998, "grad_norm": 0.8404339954669395, "learning_rate": 8.176314799865228e-07, "loss": 0.0279, "step": 3088 }, { "epoch": 1.405368516833485, "grad_norm": 0.7950731157225339, "learning_rate": 8.175210830633061e-07, "loss": 0.0331, "step": 3089 }, { "epoch": 1.4058234758871702, "grad_norm": 0.7690449708946481, "learning_rate": 8.174106601937005e-07, "loss": 0.0214, "step": 3090 }, { "epoch": 1.4062784349408552, "grad_norm": 0.9129829406183961, "learning_rate": 8.17300211386729e-07, "loss": 0.0268, "step": 3091 }, { "epoch": 1.4067333939945406, "grad_norm": 0.9936638047229787, "learning_rate": 8.171897366514172e-07, "loss": 0.0352, "step": 3092 }, { "epoch": 1.4071883530482256, "grad_norm": 1.1074910253793595, "learning_rate": 8.170792359967925e-07, "loss": 0.0315, "step": 3093 }, { "epoch": 1.4076433121019107, "grad_norm": 1.2738558119959127, "learning_rate": 8.169687094318847e-07, "loss": 0.0536, "step": 3094 }, { "epoch": 1.408098271155596, "grad_norm": 0.8240096450440958, "learning_rate": 8.168581569657251e-07, "loss": 0.0338, "step": 3095 }, { "epoch": 1.4085532302092811, "grad_norm": 0.7264344512016448, "learning_rate": 8.167475786073481e-07, "loss": 0.0168, "step": 3096 }, { "epoch": 1.4090081892629662, "grad_norm": 0.8202711015248981, "learning_rate": 8.166369743657893e-07, "loss": 0.0305, "step": 3097 }, { "epoch": 1.4094631483166515, "grad_norm": 1.0646011250704286, "learning_rate": 8.165263442500868e-07, "loss": 0.0259, "step": 3098 }, { "epoch": 1.4099181073703366, "grad_norm": 0.910251283986272, "learning_rate": 8.16415688269281e-07, "loss": 0.028, "step": 3099 }, { "epoch": 1.4103730664240217, "grad_norm": 0.8847663146457964, "learning_rate": 8.16305006432414e-07, "loss": 0.0311, "step": 3100 }, { "epoch": 1.410828025477707, "grad_norm": 0.8605463484213328, "learning_rate": 8.161942987485303e-07, "loss": 0.031, "step": 3101 }, { "epoch": 1.4112829845313921, "grad_norm": 2.9149085695397456, "learning_rate": 8.160835652266764e-07, "loss": 0.0699, "step": 3102 }, { "epoch": 1.4117379435850774, "grad_norm": 0.9149709077885386, "learning_rate": 8.159728058759011e-07, "loss": 0.0501, "step": 3103 }, { "epoch": 1.4121929026387625, "grad_norm": 0.9048418687866036, "learning_rate": 8.158620207052548e-07, "loss": 0.041, "step": 3104 }, { "epoch": 1.4126478616924476, "grad_norm": 1.0724928136798195, "learning_rate": 8.157512097237908e-07, "loss": 0.0371, "step": 3105 }, { "epoch": 1.413102820746133, "grad_norm": 0.9283309860098834, "learning_rate": 8.156403729405639e-07, "loss": 0.0303, "step": 3106 }, { "epoch": 1.413557779799818, "grad_norm": 0.7811893661571627, "learning_rate": 8.155295103646309e-07, "loss": 0.0246, "step": 3107 }, { "epoch": 1.4140127388535033, "grad_norm": 0.7437684607143144, "learning_rate": 8.154186220050514e-07, "loss": 0.0447, "step": 3108 }, { "epoch": 1.4144676979071884, "grad_norm": 0.9107642774582784, "learning_rate": 8.153077078708865e-07, "loss": 0.0266, "step": 3109 }, { "epoch": 1.4149226569608735, "grad_norm": 1.0775311401538987, "learning_rate": 8.151967679711996e-07, "loss": 0.0308, "step": 3110 }, { "epoch": 1.4153776160145588, "grad_norm": 0.733168099991657, "learning_rate": 8.150858023150562e-07, "loss": 0.0421, "step": 3111 }, { "epoch": 1.415832575068244, "grad_norm": 0.5634078188990812, "learning_rate": 8.14974810911524e-07, "loss": 0.0163, "step": 3112 }, { "epoch": 1.416287534121929, "grad_norm": 1.0610325365125284, "learning_rate": 8.148637937696728e-07, "loss": 0.0458, "step": 3113 }, { "epoch": 1.4167424931756143, "grad_norm": 1.1949572200541079, "learning_rate": 8.147527508985741e-07, "loss": 0.0511, "step": 3114 }, { "epoch": 1.4171974522292994, "grad_norm": 0.8329436784834815, "learning_rate": 8.14641682307302e-07, "loss": 0.0341, "step": 3115 }, { "epoch": 1.4176524112829845, "grad_norm": 0.9792781389733544, "learning_rate": 8.145305880049327e-07, "loss": 0.036, "step": 3116 }, { "epoch": 1.4181073703366698, "grad_norm": 0.8083443249169077, "learning_rate": 8.14419468000544e-07, "loss": 0.0352, "step": 3117 }, { "epoch": 1.4185623293903549, "grad_norm": 3.556156952389884, "learning_rate": 8.143083223032163e-07, "loss": 0.0766, "step": 3118 }, { "epoch": 1.41901728844404, "grad_norm": 1.2211653844021464, "learning_rate": 8.14197150922032e-07, "loss": 0.0496, "step": 3119 }, { "epoch": 1.4194722474977253, "grad_norm": 0.9474327383632177, "learning_rate": 8.140859538660753e-07, "loss": 0.0497, "step": 3120 }, { "epoch": 1.4199272065514104, "grad_norm": 0.9409949020349314, "learning_rate": 8.13974731144433e-07, "loss": 0.0477, "step": 3121 }, { "epoch": 1.4203821656050954, "grad_norm": 0.6559437409154592, "learning_rate": 8.138634827661934e-07, "loss": 0.0241, "step": 3122 }, { "epoch": 1.4208371246587808, "grad_norm": 0.7778074790609326, "learning_rate": 8.137522087404474e-07, "loss": 0.0268, "step": 3123 }, { "epoch": 1.4212920837124658, "grad_norm": 0.8085172610749318, "learning_rate": 8.136409090762879e-07, "loss": 0.0258, "step": 3124 }, { "epoch": 1.421747042766151, "grad_norm": 0.6880953593911606, "learning_rate": 8.135295837828095e-07, "loss": 0.014, "step": 3125 }, { "epoch": 1.4222020018198362, "grad_norm": 0.7202993249773466, "learning_rate": 8.134182328691097e-07, "loss": 0.0285, "step": 3126 }, { "epoch": 1.4226569608735213, "grad_norm": 0.7788933117657996, "learning_rate": 8.133068563442872e-07, "loss": 0.0227, "step": 3127 }, { "epoch": 1.4231119199272064, "grad_norm": 0.9034857866728329, "learning_rate": 8.131954542174431e-07, "loss": 0.0583, "step": 3128 }, { "epoch": 1.4235668789808917, "grad_norm": 0.904085851719301, "learning_rate": 8.130840264976811e-07, "loss": 0.0343, "step": 3129 }, { "epoch": 1.4240218380345768, "grad_norm": 1.127522284115175, "learning_rate": 8.129725731941062e-07, "loss": 0.0459, "step": 3130 }, { "epoch": 1.4244767970882621, "grad_norm": 1.0335842576782048, "learning_rate": 8.128610943158261e-07, "loss": 0.0356, "step": 3131 }, { "epoch": 1.4249317561419472, "grad_norm": 0.9769307907592587, "learning_rate": 8.127495898719501e-07, "loss": 0.0426, "step": 3132 }, { "epoch": 1.4253867151956323, "grad_norm": 0.823194503464388, "learning_rate": 8.126380598715901e-07, "loss": 0.0334, "step": 3133 }, { "epoch": 1.4258416742493176, "grad_norm": 0.8916793487708854, "learning_rate": 8.125265043238596e-07, "loss": 0.0439, "step": 3134 }, { "epoch": 1.4262966333030027, "grad_norm": 0.7802718336870106, "learning_rate": 8.124149232378746e-07, "loss": 0.028, "step": 3135 }, { "epoch": 1.426751592356688, "grad_norm": 0.7946520113179752, "learning_rate": 8.123033166227529e-07, "loss": 0.029, "step": 3136 }, { "epoch": 1.4272065514103731, "grad_norm": 0.7419868344750444, "learning_rate": 8.121916844876145e-07, "loss": 0.0351, "step": 3137 }, { "epoch": 1.4276615104640582, "grad_norm": 1.0240712666803469, "learning_rate": 8.120800268415815e-07, "loss": 0.0305, "step": 3138 }, { "epoch": 1.4281164695177435, "grad_norm": 0.6615739308328378, "learning_rate": 8.119683436937779e-07, "loss": 0.0285, "step": 3139 }, { "epoch": 1.4285714285714286, "grad_norm": 0.9742298305041059, "learning_rate": 8.118566350533302e-07, "loss": 0.0352, "step": 3140 }, { "epoch": 1.4290263876251137, "grad_norm": 0.9054626428894935, "learning_rate": 8.117449009293668e-07, "loss": 0.0293, "step": 3141 }, { "epoch": 1.429481346678799, "grad_norm": 0.7896026870419177, "learning_rate": 8.116331413310177e-07, "loss": 0.0289, "step": 3142 }, { "epoch": 1.429936305732484, "grad_norm": 0.8857421410441557, "learning_rate": 8.115213562674157e-07, "loss": 0.0255, "step": 3143 }, { "epoch": 1.4303912647861692, "grad_norm": 1.1428896837013023, "learning_rate": 8.114095457476952e-07, "loss": 0.0432, "step": 3144 }, { "epoch": 1.4308462238398545, "grad_norm": 1.9473409025430548, "learning_rate": 8.112977097809931e-07, "loss": 0.0782, "step": 3145 }, { "epoch": 1.4313011828935396, "grad_norm": 0.6588485217847181, "learning_rate": 8.111858483764478e-07, "loss": 0.0298, "step": 3146 }, { "epoch": 1.4317561419472247, "grad_norm": 1.0720820860599272, "learning_rate": 8.110739615432004e-07, "loss": 0.0268, "step": 3147 }, { "epoch": 1.43221110100091, "grad_norm": 0.8837663421288952, "learning_rate": 8.109620492903937e-07, "loss": 0.0565, "step": 3148 }, { "epoch": 1.432666060054595, "grad_norm": 0.7287827842481205, "learning_rate": 8.108501116271724e-07, "loss": 0.0251, "step": 3149 }, { "epoch": 1.4331210191082802, "grad_norm": 0.7508548508426927, "learning_rate": 8.107381485626839e-07, "loss": 0.0227, "step": 3150 }, { "epoch": 1.4335759781619655, "grad_norm": 0.7582056019441109, "learning_rate": 8.106261601060772e-07, "loss": 0.0264, "step": 3151 }, { "epoch": 1.4340309372156506, "grad_norm": 0.9302280377905917, "learning_rate": 8.105141462665035e-07, "loss": 0.0336, "step": 3152 }, { "epoch": 1.4344858962693356, "grad_norm": 0.7644301348401399, "learning_rate": 8.10402107053116e-07, "loss": 0.0311, "step": 3153 }, { "epoch": 1.434940855323021, "grad_norm": 1.162404494824062, "learning_rate": 8.102900424750701e-07, "loss": 0.0438, "step": 3154 }, { "epoch": 1.435395814376706, "grad_norm": 0.9596864630477949, "learning_rate": 8.101779525415231e-07, "loss": 0.036, "step": 3155 }, { "epoch": 1.4358507734303911, "grad_norm": 0.8923037547367764, "learning_rate": 8.100658372616344e-07, "loss": 0.035, "step": 3156 }, { "epoch": 1.4363057324840764, "grad_norm": 0.9306016717255235, "learning_rate": 8.099536966445661e-07, "loss": 0.0316, "step": 3157 }, { "epoch": 1.4367606915377615, "grad_norm": 1.1311957993871173, "learning_rate": 8.098415306994812e-07, "loss": 0.0661, "step": 3158 }, { "epoch": 1.4372156505914468, "grad_norm": 1.2501413951555167, "learning_rate": 8.097293394355458e-07, "loss": 0.0443, "step": 3159 }, { "epoch": 1.437670609645132, "grad_norm": 0.8757611139266945, "learning_rate": 8.096171228619274e-07, "loss": 0.0391, "step": 3160 }, { "epoch": 1.438125568698817, "grad_norm": 1.0842953501302823, "learning_rate": 8.09504880987796e-07, "loss": 0.0313, "step": 3161 }, { "epoch": 1.4385805277525023, "grad_norm": 0.8635562354243116, "learning_rate": 8.093926138223233e-07, "loss": 0.0404, "step": 3162 }, { "epoch": 1.4390354868061874, "grad_norm": 0.8703733497836597, "learning_rate": 8.092803213746837e-07, "loss": 0.0409, "step": 3163 }, { "epoch": 1.4394904458598727, "grad_norm": 0.8531090529322496, "learning_rate": 8.091680036540527e-07, "loss": 0.0369, "step": 3164 }, { "epoch": 1.4399454049135578, "grad_norm": 0.7913720252460319, "learning_rate": 8.090556606696087e-07, "loss": 0.034, "step": 3165 }, { "epoch": 1.440400363967243, "grad_norm": 1.1558166332662825, "learning_rate": 8.089432924305317e-07, "loss": 0.0208, "step": 3166 }, { "epoch": 1.4408553230209282, "grad_norm": 0.6310021537316332, "learning_rate": 8.088308989460039e-07, "loss": 0.0167, "step": 3167 }, { "epoch": 1.4413102820746133, "grad_norm": 0.7363777637344272, "learning_rate": 8.087184802252101e-07, "loss": 0.0268, "step": 3168 }, { "epoch": 1.4417652411282984, "grad_norm": 1.0505804840413648, "learning_rate": 8.08606036277336e-07, "loss": 0.0498, "step": 3169 }, { "epoch": 1.4422202001819837, "grad_norm": 0.920370497432542, "learning_rate": 8.084935671115704e-07, "loss": 0.0275, "step": 3170 }, { "epoch": 1.4426751592356688, "grad_norm": 0.8689513230046892, "learning_rate": 8.083810727371035e-07, "loss": 0.0377, "step": 3171 }, { "epoch": 1.443130118289354, "grad_norm": 0.794903431349294, "learning_rate": 8.082685531631282e-07, "loss": 0.0358, "step": 3172 }, { "epoch": 1.4435850773430392, "grad_norm": 1.2320040248175288, "learning_rate": 8.081560083988386e-07, "loss": 0.0564, "step": 3173 }, { "epoch": 1.4440400363967243, "grad_norm": 1.0900796471737446, "learning_rate": 8.080434384534316e-07, "loss": 0.038, "step": 3174 }, { "epoch": 1.4444949954504094, "grad_norm": 0.9170147703058286, "learning_rate": 8.079308433361061e-07, "loss": 0.0458, "step": 3175 }, { "epoch": 1.4449499545040947, "grad_norm": 0.9360635576950569, "learning_rate": 8.078182230560626e-07, "loss": 0.0402, "step": 3176 }, { "epoch": 1.4454049135577798, "grad_norm": 1.1022905024895124, "learning_rate": 8.07705577622504e-07, "loss": 0.0503, "step": 3177 }, { "epoch": 1.4458598726114649, "grad_norm": 0.9989304524962554, "learning_rate": 8.075929070446353e-07, "loss": 0.0373, "step": 3178 }, { "epoch": 1.4463148316651502, "grad_norm": 0.7391704563989013, "learning_rate": 8.074802113316633e-07, "loss": 0.0346, "step": 3179 }, { "epoch": 1.4467697907188353, "grad_norm": 0.7238398643148026, "learning_rate": 8.073674904927968e-07, "loss": 0.0149, "step": 3180 }, { "epoch": 1.4472247497725204, "grad_norm": 0.9578607865585056, "learning_rate": 8.072547445372471e-07, "loss": 0.0491, "step": 3181 }, { "epoch": 1.4476797088262057, "grad_norm": 0.8213215483693614, "learning_rate": 8.071419734742272e-07, "loss": 0.0253, "step": 3182 }, { "epoch": 1.4481346678798908, "grad_norm": 0.8502950802689085, "learning_rate": 8.070291773129525e-07, "loss": 0.0364, "step": 3183 }, { "epoch": 1.4485896269335758, "grad_norm": 0.9612359736628147, "learning_rate": 8.069163560626398e-07, "loss": 0.0491, "step": 3184 }, { "epoch": 1.4490445859872612, "grad_norm": 0.7648604980745364, "learning_rate": 8.068035097325086e-07, "loss": 0.0262, "step": 3185 }, { "epoch": 1.4494995450409462, "grad_norm": 0.8130083211944765, "learning_rate": 8.0669063833178e-07, "loss": 0.0435, "step": 3186 }, { "epoch": 1.4499545040946316, "grad_norm": 0.9556834062207352, "learning_rate": 8.065777418696774e-07, "loss": 0.0327, "step": 3187 }, { "epoch": 1.4504094631483166, "grad_norm": 1.1198403453926826, "learning_rate": 8.064648203554262e-07, "loss": 0.0378, "step": 3188 }, { "epoch": 1.450864422202002, "grad_norm": 1.1214374521240786, "learning_rate": 8.063518737982539e-07, "loss": 0.0291, "step": 3189 }, { "epoch": 1.451319381255687, "grad_norm": 0.7353520144237775, "learning_rate": 8.062389022073901e-07, "loss": 0.0184, "step": 3190 }, { "epoch": 1.4517743403093721, "grad_norm": 0.8755881766186225, "learning_rate": 8.061259055920659e-07, "loss": 0.0305, "step": 3191 }, { "epoch": 1.4522292993630574, "grad_norm": 0.9226264536751165, "learning_rate": 8.060128839615154e-07, "loss": 0.0325, "step": 3192 }, { "epoch": 1.4526842584167425, "grad_norm": 0.7601486942296244, "learning_rate": 8.05899837324974e-07, "loss": 0.0313, "step": 3193 }, { "epoch": 1.4531392174704276, "grad_norm": 0.8343117760558895, "learning_rate": 8.057867656916792e-07, "loss": 0.0333, "step": 3194 }, { "epoch": 1.453594176524113, "grad_norm": 0.6436525491303797, "learning_rate": 8.056736690708708e-07, "loss": 0.0352, "step": 3195 }, { "epoch": 1.454049135577798, "grad_norm": 0.9542161984464499, "learning_rate": 8.055605474717906e-07, "loss": 0.0283, "step": 3196 }, { "epoch": 1.4545040946314831, "grad_norm": 0.8550133548250926, "learning_rate": 8.054474009036825e-07, "loss": 0.023, "step": 3197 }, { "epoch": 1.4549590536851684, "grad_norm": 0.6105582618801296, "learning_rate": 8.053342293757919e-07, "loss": 0.0216, "step": 3198 }, { "epoch": 1.4554140127388535, "grad_norm": 1.0478772046830258, "learning_rate": 8.052210328973672e-07, "loss": 0.0442, "step": 3199 }, { "epoch": 1.4558689717925386, "grad_norm": 0.8841954919906386, "learning_rate": 8.05107811477658e-07, "loss": 0.0154, "step": 3200 }, { "epoch": 1.456323930846224, "grad_norm": 0.7256250155802184, "learning_rate": 8.049945651259162e-07, "loss": 0.0224, "step": 3201 }, { "epoch": 1.456778889899909, "grad_norm": 1.0273243303476565, "learning_rate": 8.048812938513958e-07, "loss": 0.0368, "step": 3202 }, { "epoch": 1.457233848953594, "grad_norm": 0.6757041477574101, "learning_rate": 8.047679976633532e-07, "loss": 0.0215, "step": 3203 }, { "epoch": 1.4576888080072794, "grad_norm": 1.0863774858841084, "learning_rate": 8.046546765710459e-07, "loss": 0.0584, "step": 3204 }, { "epoch": 1.4581437670609645, "grad_norm": 0.9454266522392238, "learning_rate": 8.045413305837343e-07, "loss": 0.0341, "step": 3205 }, { "epoch": 1.4585987261146496, "grad_norm": 0.6658528464175599, "learning_rate": 8.044279597106805e-07, "loss": 0.0278, "step": 3206 }, { "epoch": 1.459053685168335, "grad_norm": 1.2172970241782213, "learning_rate": 8.043145639611486e-07, "loss": 0.0378, "step": 3207 }, { "epoch": 1.45950864422202, "grad_norm": 0.8027106380728501, "learning_rate": 8.04201143344405e-07, "loss": 0.0271, "step": 3208 }, { "epoch": 1.459963603275705, "grad_norm": 0.5859548647524387, "learning_rate": 8.040876978697173e-07, "loss": 0.0173, "step": 3209 }, { "epoch": 1.4604185623293904, "grad_norm": 0.7051800085262965, "learning_rate": 8.039742275463565e-07, "loss": 0.0289, "step": 3210 }, { "epoch": 1.4608735213830755, "grad_norm": 0.7081633906705537, "learning_rate": 8.038607323835945e-07, "loss": 0.0154, "step": 3211 }, { "epoch": 1.4613284804367606, "grad_norm": 0.9774309332858271, "learning_rate": 8.037472123907057e-07, "loss": 0.027, "step": 3212 }, { "epoch": 1.4617834394904459, "grad_norm": 0.9923652024451268, "learning_rate": 8.036336675769664e-07, "loss": 0.0309, "step": 3213 }, { "epoch": 1.462238398544131, "grad_norm": 0.8498015722697095, "learning_rate": 8.035200979516548e-07, "loss": 0.0355, "step": 3214 }, { "epoch": 1.4626933575978163, "grad_norm": 0.7855401607862456, "learning_rate": 8.034065035240519e-07, "loss": 0.0281, "step": 3215 }, { "epoch": 1.4631483166515014, "grad_norm": 0.8560864963947679, "learning_rate": 8.032928843034392e-07, "loss": 0.0262, "step": 3216 }, { "epoch": 1.4636032757051867, "grad_norm": 0.8985527470388319, "learning_rate": 8.03179240299102e-07, "loss": 0.0389, "step": 3217 }, { "epoch": 1.4640582347588718, "grad_norm": 1.1459152822616927, "learning_rate": 8.030655715203262e-07, "loss": 0.0378, "step": 3218 }, { "epoch": 1.4645131938125568, "grad_norm": 1.0560473478220707, "learning_rate": 8.029518779764006e-07, "loss": 0.0543, "step": 3219 }, { "epoch": 1.4649681528662422, "grad_norm": 0.9339669416472581, "learning_rate": 8.028381596766158e-07, "loss": 0.0369, "step": 3220 }, { "epoch": 1.4654231119199272, "grad_norm": 0.6963631455995973, "learning_rate": 8.02724416630264e-07, "loss": 0.0119, "step": 3221 }, { "epoch": 1.4658780709736123, "grad_norm": 0.9129446432296754, "learning_rate": 8.026106488466402e-07, "loss": 0.0405, "step": 3222 }, { "epoch": 1.4663330300272976, "grad_norm": 0.5985884056953106, "learning_rate": 8.024968563350405e-07, "loss": 0.0176, "step": 3223 }, { "epoch": 1.4667879890809827, "grad_norm": 1.0419197241632172, "learning_rate": 8.023830391047639e-07, "loss": 0.0293, "step": 3224 }, { "epoch": 1.4672429481346678, "grad_norm": 1.1051722430618356, "learning_rate": 8.022691971651109e-07, "loss": 0.0543, "step": 3225 }, { "epoch": 1.4676979071883531, "grad_norm": 1.0769559062571428, "learning_rate": 8.021553305253841e-07, "loss": 0.0282, "step": 3226 }, { "epoch": 1.4681528662420382, "grad_norm": 0.6630049614403577, "learning_rate": 8.020414391948882e-07, "loss": 0.0194, "step": 3227 }, { "epoch": 1.4686078252957233, "grad_norm": 0.9343309026708425, "learning_rate": 8.019275231829299e-07, "loss": 0.0305, "step": 3228 }, { "epoch": 1.4690627843494086, "grad_norm": 0.7759155464674062, "learning_rate": 8.018135824988179e-07, "loss": 0.0304, "step": 3229 }, { "epoch": 1.4695177434030937, "grad_norm": 0.9462649350620728, "learning_rate": 8.016996171518628e-07, "loss": 0.0413, "step": 3230 }, { "epoch": 1.4699727024567788, "grad_norm": 0.8209494274215177, "learning_rate": 8.015856271513776e-07, "loss": 0.0284, "step": 3231 }, { "epoch": 1.4704276615104641, "grad_norm": 0.9321787983075599, "learning_rate": 8.014716125066769e-07, "loss": 0.0374, "step": 3232 }, { "epoch": 1.4708826205641492, "grad_norm": 1.0334125650443575, "learning_rate": 8.013575732270775e-07, "loss": 0.0598, "step": 3233 }, { "epoch": 1.4713375796178343, "grad_norm": 0.5800843097682805, "learning_rate": 8.012435093218981e-07, "loss": 0.0278, "step": 3234 }, { "epoch": 1.4717925386715196, "grad_norm": 0.9627975577342618, "learning_rate": 8.011294208004595e-07, "loss": 0.0374, "step": 3235 }, { "epoch": 1.4722474977252047, "grad_norm": 0.6872575332017902, "learning_rate": 8.010153076720846e-07, "loss": 0.0305, "step": 3236 }, { "epoch": 1.4727024567788898, "grad_norm": 0.7676140957449403, "learning_rate": 8.009011699460979e-07, "loss": 0.0385, "step": 3237 }, { "epoch": 1.473157415832575, "grad_norm": 0.9316221162545121, "learning_rate": 8.007870076318266e-07, "loss": 0.0248, "step": 3238 }, { "epoch": 1.4736123748862602, "grad_norm": 1.269025862404898, "learning_rate": 8.006728207385994e-07, "loss": 0.0394, "step": 3239 }, { "epoch": 1.4740673339399453, "grad_norm": 0.6527947869656604, "learning_rate": 8.00558609275747e-07, "loss": 0.0166, "step": 3240 }, { "epoch": 1.4745222929936306, "grad_norm": 0.9583574145827783, "learning_rate": 8.004443732526025e-07, "loss": 0.0405, "step": 3241 }, { "epoch": 1.4749772520473157, "grad_norm": 0.9343353191660405, "learning_rate": 8.003301126785007e-07, "loss": 0.0353, "step": 3242 }, { "epoch": 1.475432211101001, "grad_norm": 0.8272309134247835, "learning_rate": 8.002158275627781e-07, "loss": 0.0308, "step": 3243 }, { "epoch": 1.475887170154686, "grad_norm": 0.7649351532669482, "learning_rate": 8.00101517914774e-07, "loss": 0.0223, "step": 3244 }, { "epoch": 1.4763421292083714, "grad_norm": 1.2279551102308726, "learning_rate": 7.99987183743829e-07, "loss": 0.0476, "step": 3245 }, { "epoch": 1.4767970882620565, "grad_norm": 0.8664757704590031, "learning_rate": 7.998728250592863e-07, "loss": 0.0389, "step": 3246 }, { "epoch": 1.4772520473157416, "grad_norm": 0.7776392705911533, "learning_rate": 7.997584418704903e-07, "loss": 0.0269, "step": 3247 }, { "epoch": 1.4777070063694269, "grad_norm": 1.05759734354772, "learning_rate": 7.996440341867882e-07, "loss": 0.0431, "step": 3248 }, { "epoch": 1.478161965423112, "grad_norm": 0.7167024218562743, "learning_rate": 7.995296020175289e-07, "loss": 0.0215, "step": 3249 }, { "epoch": 1.478616924476797, "grad_norm": 0.6813655575086383, "learning_rate": 7.994151453720632e-07, "loss": 0.0182, "step": 3250 }, { "epoch": 1.4790718835304824, "grad_norm": 0.8636864102175807, "learning_rate": 7.993006642597437e-07, "loss": 0.0472, "step": 3251 }, { "epoch": 1.4795268425841674, "grad_norm": 1.112670509451075, "learning_rate": 7.991861586899258e-07, "loss": 0.0558, "step": 3252 }, { "epoch": 1.4799818016378525, "grad_norm": 0.9702861366120873, "learning_rate": 7.990716286719661e-07, "loss": 0.0488, "step": 3253 }, { "epoch": 1.4804367606915378, "grad_norm": 0.6948877269253102, "learning_rate": 7.989570742152234e-07, "loss": 0.0265, "step": 3254 }, { "epoch": 1.480891719745223, "grad_norm": 0.6234069321116141, "learning_rate": 7.988424953290587e-07, "loss": 0.0256, "step": 3255 }, { "epoch": 1.481346678798908, "grad_norm": 0.9462872851628017, "learning_rate": 7.987278920228349e-07, "loss": 0.047, "step": 3256 }, { "epoch": 1.4818016378525933, "grad_norm": 0.8095264309806285, "learning_rate": 7.986132643059168e-07, "loss": 0.0327, "step": 3257 }, { "epoch": 1.4822565969062784, "grad_norm": 1.1076935418843352, "learning_rate": 7.984986121876713e-07, "loss": 0.0522, "step": 3258 }, { "epoch": 1.4827115559599635, "grad_norm": 0.9310123799137633, "learning_rate": 7.983839356774671e-07, "loss": 0.0471, "step": 3259 }, { "epoch": 1.4831665150136488, "grad_norm": 1.2702600377235018, "learning_rate": 7.982692347846754e-07, "loss": 0.0376, "step": 3260 }, { "epoch": 1.483621474067334, "grad_norm": 0.9176385753740307, "learning_rate": 7.981545095186683e-07, "loss": 0.0256, "step": 3261 }, { "epoch": 1.484076433121019, "grad_norm": 1.4523828655704794, "learning_rate": 7.980397598888216e-07, "loss": 0.0744, "step": 3262 }, { "epoch": 1.4845313921747043, "grad_norm": 1.0565877841281541, "learning_rate": 7.979249859045117e-07, "loss": 0.0372, "step": 3263 }, { "epoch": 1.4849863512283894, "grad_norm": 0.6927965559077345, "learning_rate": 7.978101875751171e-07, "loss": 0.0279, "step": 3264 }, { "epoch": 1.4854413102820745, "grad_norm": 0.7847774444727177, "learning_rate": 7.976953649100189e-07, "loss": 0.0272, "step": 3265 }, { "epoch": 1.4858962693357598, "grad_norm": 0.8478725017410365, "learning_rate": 7.975805179186e-07, "loss": 0.025, "step": 3266 }, { "epoch": 1.486351228389445, "grad_norm": 1.512831238140782, "learning_rate": 7.97465646610245e-07, "loss": 0.048, "step": 3267 }, { "epoch": 1.4868061874431302, "grad_norm": 1.0134144630573254, "learning_rate": 7.973507509943405e-07, "loss": 0.0228, "step": 3268 }, { "epoch": 1.4872611464968153, "grad_norm": 0.9651706253352867, "learning_rate": 7.972358310802757e-07, "loss": 0.0423, "step": 3269 }, { "epoch": 1.4877161055505004, "grad_norm": 0.9323642812085203, "learning_rate": 7.97120886877441e-07, "loss": 0.0562, "step": 3270 }, { "epoch": 1.4881710646041857, "grad_norm": 4.818906625284994, "learning_rate": 7.970059183952293e-07, "loss": 0.1191, "step": 3271 }, { "epoch": 1.4886260236578708, "grad_norm": 20.97145329868001, "learning_rate": 7.968909256430351e-07, "loss": 0.2141, "step": 3272 }, { "epoch": 1.489080982711556, "grad_norm": 1.0577562963812712, "learning_rate": 7.967759086302552e-07, "loss": 0.0321, "step": 3273 }, { "epoch": 1.4895359417652412, "grad_norm": 0.9128274226011595, "learning_rate": 7.966608673662883e-07, "loss": 0.0271, "step": 3274 }, { "epoch": 1.4899909008189263, "grad_norm": 1.03320841952837, "learning_rate": 7.965458018605351e-07, "loss": 0.0437, "step": 3275 }, { "epoch": 1.4904458598726116, "grad_norm": 0.7740627490447266, "learning_rate": 7.964307121223981e-07, "loss": 0.038, "step": 3276 }, { "epoch": 1.4909008189262967, "grad_norm": 0.9752904554373271, "learning_rate": 7.963155981612819e-07, "loss": 0.0291, "step": 3277 }, { "epoch": 1.4913557779799818, "grad_norm": 1.2258206949601456, "learning_rate": 7.962004599865933e-07, "loss": 0.0595, "step": 3278 }, { "epoch": 1.491810737033667, "grad_norm": 0.7357905101128016, "learning_rate": 7.960852976077405e-07, "loss": 0.0225, "step": 3279 }, { "epoch": 1.4922656960873522, "grad_norm": 1.0566565535745212, "learning_rate": 7.959701110341346e-07, "loss": 0.0399, "step": 3280 }, { "epoch": 1.4927206551410372, "grad_norm": 0.9038085535638639, "learning_rate": 7.958549002751877e-07, "loss": 0.0311, "step": 3281 }, { "epoch": 1.4931756141947226, "grad_norm": 1.1817755503192022, "learning_rate": 7.957396653403145e-07, "loss": 0.0632, "step": 3282 }, { "epoch": 1.4936305732484076, "grad_norm": 1.2356096610878124, "learning_rate": 7.956244062389312e-07, "loss": 0.037, "step": 3283 }, { "epoch": 1.4940855323020927, "grad_norm": 0.9087925116864173, "learning_rate": 7.955091229804567e-07, "loss": 0.0296, "step": 3284 }, { "epoch": 1.494540491355778, "grad_norm": 0.753963717752698, "learning_rate": 7.953938155743109e-07, "loss": 0.042, "step": 3285 }, { "epoch": 1.4949954504094631, "grad_norm": 0.8417486647495676, "learning_rate": 7.952784840299166e-07, "loss": 0.0322, "step": 3286 }, { "epoch": 1.4954504094631482, "grad_norm": 0.6005432998830017, "learning_rate": 7.951631283566981e-07, "loss": 0.0163, "step": 3287 }, { "epoch": 1.4959053685168335, "grad_norm": 0.8494714356391339, "learning_rate": 7.950477485640816e-07, "loss": 0.045, "step": 3288 }, { "epoch": 1.4963603275705186, "grad_norm": 0.825547813108051, "learning_rate": 7.949323446614956e-07, "loss": 0.0287, "step": 3289 }, { "epoch": 1.4968152866242037, "grad_norm": 1.242714534813807, "learning_rate": 7.948169166583703e-07, "loss": 0.0519, "step": 3290 }, { "epoch": 1.497270245677889, "grad_norm": 1.0487374432320522, "learning_rate": 7.947014645641379e-07, "loss": 0.0312, "step": 3291 }, { "epoch": 1.4977252047315741, "grad_norm": 0.7276481574546845, "learning_rate": 7.945859883882327e-07, "loss": 0.0334, "step": 3292 }, { "epoch": 1.4981801637852592, "grad_norm": 0.6997160085909326, "learning_rate": 7.944704881400908e-07, "loss": 0.0231, "step": 3293 }, { "epoch": 1.4986351228389445, "grad_norm": 1.0176115201872729, "learning_rate": 7.943549638291506e-07, "loss": 0.0251, "step": 3294 }, { "epoch": 1.4990900818926296, "grad_norm": 0.9622193385937949, "learning_rate": 7.94239415464852e-07, "loss": 0.0236, "step": 3295 }, { "epoch": 1.499545040946315, "grad_norm": 1.042525317436163, "learning_rate": 7.941238430566369e-07, "loss": 0.0434, "step": 3296 }, { "epoch": 1.5, "grad_norm": 0.7319307321583792, "learning_rate": 7.940082466139498e-07, "loss": 0.0256, "step": 3297 }, { "epoch": 1.5004549590536853, "grad_norm": 0.8128057476510752, "learning_rate": 7.938926261462365e-07, "loss": 0.0301, "step": 3298 }, { "epoch": 1.5009099181073702, "grad_norm": 1.0334728125833705, "learning_rate": 7.93776981662945e-07, "loss": 0.0399, "step": 3299 }, { "epoch": 1.5013648771610555, "grad_norm": 0.77809739564112, "learning_rate": 7.936613131735253e-07, "loss": 0.0358, "step": 3300 }, { "epoch": 1.5018198362147408, "grad_norm": 1.0288104815154324, "learning_rate": 7.935456206874292e-07, "loss": 0.0335, "step": 3301 }, { "epoch": 1.5022747952684259, "grad_norm": 1.254236586809635, "learning_rate": 7.934299042141106e-07, "loss": 0.0413, "step": 3302 }, { "epoch": 1.502729754322111, "grad_norm": 0.8091461877734261, "learning_rate": 7.933141637630252e-07, "loss": 0.0299, "step": 3303 }, { "epoch": 1.5031847133757963, "grad_norm": 1.022010835863893, "learning_rate": 7.931983993436311e-07, "loss": 0.0215, "step": 3304 }, { "epoch": 1.5036396724294814, "grad_norm": 0.8997735306405509, "learning_rate": 7.93082610965388e-07, "loss": 0.0345, "step": 3305 }, { "epoch": 1.5040946314831665, "grad_norm": 1.0590344868029435, "learning_rate": 7.929667986377573e-07, "loss": 0.0334, "step": 3306 }, { "epoch": 1.5045495905368518, "grad_norm": 1.2043444818640674, "learning_rate": 7.928509623702029e-07, "loss": 0.046, "step": 3307 }, { "epoch": 1.5050045495905369, "grad_norm": 1.1450764519292251, "learning_rate": 7.927351021721903e-07, "loss": 0.0671, "step": 3308 }, { "epoch": 1.505459508644222, "grad_norm": 0.7593857791041106, "learning_rate": 7.926192180531872e-07, "loss": 0.0369, "step": 3309 }, { "epoch": 1.5059144676979073, "grad_norm": 0.9569442090991107, "learning_rate": 7.92503310022663e-07, "loss": 0.0296, "step": 3310 }, { "epoch": 1.5063694267515924, "grad_norm": 1.0982053075529303, "learning_rate": 7.923873780900892e-07, "loss": 0.0543, "step": 3311 }, { "epoch": 1.5068243858052774, "grad_norm": 1.0505675369549061, "learning_rate": 7.922714222649394e-07, "loss": 0.057, "step": 3312 }, { "epoch": 1.5072793448589628, "grad_norm": 1.0600618551728627, "learning_rate": 7.921554425566889e-07, "loss": 0.0285, "step": 3313 }, { "epoch": 1.5077343039126478, "grad_norm": 0.8769564029801037, "learning_rate": 7.920394389748149e-07, "loss": 0.0339, "step": 3314 }, { "epoch": 1.508189262966333, "grad_norm": 1.0067098477638619, "learning_rate": 7.919234115287967e-07, "loss": 0.0255, "step": 3315 }, { "epoch": 1.5086442220200182, "grad_norm": 0.7464267716201479, "learning_rate": 7.918073602281157e-07, "loss": 0.0313, "step": 3316 }, { "epoch": 1.5090991810737033, "grad_norm": 0.6227574875311856, "learning_rate": 7.916912850822551e-07, "loss": 0.0251, "step": 3317 }, { "epoch": 1.5095541401273884, "grad_norm": 0.7275775661892568, "learning_rate": 7.915751861006999e-07, "loss": 0.0287, "step": 3318 }, { "epoch": 1.5100090991810737, "grad_norm": 1.0095820725375695, "learning_rate": 7.914590632929371e-07, "loss": 0.024, "step": 3319 }, { "epoch": 1.5104640582347588, "grad_norm": 0.7369170491209848, "learning_rate": 7.913429166684559e-07, "loss": 0.0337, "step": 3320 }, { "epoch": 1.510919017288444, "grad_norm": 0.774558883737765, "learning_rate": 7.912267462367473e-07, "loss": 0.0278, "step": 3321 }, { "epoch": 1.5113739763421292, "grad_norm": 0.8578576866076262, "learning_rate": 7.911105520073043e-07, "loss": 0.0355, "step": 3322 }, { "epoch": 1.5118289353958145, "grad_norm": 1.2036265038557175, "learning_rate": 7.909943339896214e-07, "loss": 0.0618, "step": 3323 }, { "epoch": 1.5122838944494994, "grad_norm": 0.8709200349386419, "learning_rate": 7.908780921931957e-07, "loss": 0.0241, "step": 3324 }, { "epoch": 1.5127388535031847, "grad_norm": 1.1846187990609256, "learning_rate": 7.90761826627526e-07, "loss": 0.0535, "step": 3325 }, { "epoch": 1.51319381255687, "grad_norm": 0.755685382456857, "learning_rate": 7.906455373021128e-07, "loss": 0.0296, "step": 3326 }, { "epoch": 1.5136487716105549, "grad_norm": 0.7592545469877436, "learning_rate": 7.905292242264589e-07, "loss": 0.0208, "step": 3327 }, { "epoch": 1.5141037306642402, "grad_norm": 1.0542916476473074, "learning_rate": 7.904128874100688e-07, "loss": 0.0386, "step": 3328 }, { "epoch": 1.5145586897179255, "grad_norm": 0.7320234515894736, "learning_rate": 7.90296526862449e-07, "loss": 0.0227, "step": 3329 }, { "epoch": 1.5150136487716106, "grad_norm": 0.9308335205801176, "learning_rate": 7.901801425931082e-07, "loss": 0.0281, "step": 3330 }, { "epoch": 1.5154686078252957, "grad_norm": 0.9652346189905595, "learning_rate": 7.900637346115563e-07, "loss": 0.0329, "step": 3331 }, { "epoch": 1.515923566878981, "grad_norm": 1.0583016021055232, "learning_rate": 7.899473029273061e-07, "loss": 0.0381, "step": 3332 }, { "epoch": 1.516378525932666, "grad_norm": 1.048700829003982, "learning_rate": 7.898308475498716e-07, "loss": 0.04, "step": 3333 }, { "epoch": 1.5168334849863512, "grad_norm": 1.0008406222504842, "learning_rate": 7.897143684887692e-07, "loss": 0.0388, "step": 3334 }, { "epoch": 1.5172884440400365, "grad_norm": 0.8376106266867795, "learning_rate": 7.895978657535168e-07, "loss": 0.0261, "step": 3335 }, { "epoch": 1.5177434030937216, "grad_norm": 1.2355039676846002, "learning_rate": 7.894813393536348e-07, "loss": 0.0502, "step": 3336 }, { "epoch": 1.5181983621474067, "grad_norm": 1.0557463807903376, "learning_rate": 7.893647892986448e-07, "loss": 0.0371, "step": 3337 }, { "epoch": 1.518653321201092, "grad_norm": 0.6569460876508135, "learning_rate": 7.892482155980711e-07, "loss": 0.0266, "step": 3338 }, { "epoch": 1.519108280254777, "grad_norm": 1.2377403703410068, "learning_rate": 7.891316182614396e-07, "loss": 0.0391, "step": 3339 }, { "epoch": 1.5195632393084622, "grad_norm": 0.9911189953053333, "learning_rate": 7.890149972982779e-07, "loss": 0.0681, "step": 3340 }, { "epoch": 1.5200181983621475, "grad_norm": 0.8954263660192692, "learning_rate": 7.888983527181156e-07, "loss": 0.0294, "step": 3341 }, { "epoch": 1.5204731574158326, "grad_norm": 0.8429573073151924, "learning_rate": 7.887816845304847e-07, "loss": 0.0321, "step": 3342 }, { "epoch": 1.5209281164695176, "grad_norm": 0.8924531443524343, "learning_rate": 7.886649927449187e-07, "loss": 0.0352, "step": 3343 }, { "epoch": 1.521383075523203, "grad_norm": 0.9865759769570734, "learning_rate": 7.88548277370953e-07, "loss": 0.0457, "step": 3344 }, { "epoch": 1.521838034576888, "grad_norm": 0.8849646107666755, "learning_rate": 7.884315384181253e-07, "loss": 0.0278, "step": 3345 }, { "epoch": 1.5222929936305731, "grad_norm": 0.8314670323404226, "learning_rate": 7.883147758959747e-07, "loss": 0.0224, "step": 3346 }, { "epoch": 1.5227479526842584, "grad_norm": 1.8019996504077007, "learning_rate": 7.881979898140427e-07, "loss": 0.041, "step": 3347 }, { "epoch": 1.5232029117379435, "grad_norm": 0.9159892306160613, "learning_rate": 7.880811801818723e-07, "loss": 0.0395, "step": 3348 }, { "epoch": 1.5236578707916286, "grad_norm": 1.1223642217205434, "learning_rate": 7.87964347009009e-07, "loss": 0.0218, "step": 3349 }, { "epoch": 1.524112829845314, "grad_norm": 1.108640059765098, "learning_rate": 7.878474903049996e-07, "loss": 0.0529, "step": 3350 }, { "epoch": 1.5245677888989992, "grad_norm": 1.2894117521014026, "learning_rate": 7.877306100793933e-07, "loss": 0.0554, "step": 3351 }, { "epoch": 1.525022747952684, "grad_norm": 1.1241812261880764, "learning_rate": 7.87613706341741e-07, "loss": 0.0335, "step": 3352 }, { "epoch": 1.5254777070063694, "grad_norm": 0.7271493726997594, "learning_rate": 7.874967791015953e-07, "loss": 0.0225, "step": 3353 }, { "epoch": 1.5259326660600547, "grad_norm": 1.4101958001165233, "learning_rate": 7.873798283685112e-07, "loss": 0.0416, "step": 3354 }, { "epoch": 1.5263876251137396, "grad_norm": 0.8272817551849218, "learning_rate": 7.872628541520452e-07, "loss": 0.0215, "step": 3355 }, { "epoch": 1.526842584167425, "grad_norm": 0.8589736841727866, "learning_rate": 7.871458564617562e-07, "loss": 0.0342, "step": 3356 }, { "epoch": 1.5272975432211102, "grad_norm": 0.9276695635417507, "learning_rate": 7.870288353072044e-07, "loss": 0.0419, "step": 3357 }, { "epoch": 1.5277525022747953, "grad_norm": 0.7794519988127786, "learning_rate": 7.869117906979525e-07, "loss": 0.0335, "step": 3358 }, { "epoch": 1.5282074613284804, "grad_norm": 0.8095236685140611, "learning_rate": 7.867947226435647e-07, "loss": 0.0235, "step": 3359 }, { "epoch": 1.5286624203821657, "grad_norm": 0.9923428242065411, "learning_rate": 7.866776311536074e-07, "loss": 0.0375, "step": 3360 }, { "epoch": 1.5291173794358508, "grad_norm": 0.9647384785695188, "learning_rate": 7.865605162376485e-07, "loss": 0.023, "step": 3361 }, { "epoch": 1.5295723384895359, "grad_norm": 1.0628136352479411, "learning_rate": 7.864433779052585e-07, "loss": 0.047, "step": 3362 }, { "epoch": 1.5300272975432212, "grad_norm": 1.0928998996516173, "learning_rate": 7.863262161660092e-07, "loss": 0.0338, "step": 3363 }, { "epoch": 1.5304822565969063, "grad_norm": 1.0520986093664133, "learning_rate": 7.862090310294746e-07, "loss": 0.0483, "step": 3364 }, { "epoch": 1.5309372156505914, "grad_norm": 0.7852463887025447, "learning_rate": 7.860918225052304e-07, "loss": 0.0283, "step": 3365 }, { "epoch": 1.5313921747042767, "grad_norm": 1.0389555839159565, "learning_rate": 7.859745906028543e-07, "loss": 0.0353, "step": 3366 }, { "epoch": 1.5318471337579618, "grad_norm": 0.7575943395299229, "learning_rate": 7.858573353319264e-07, "loss": 0.0297, "step": 3367 }, { "epoch": 1.5323020928116469, "grad_norm": 0.7715524600558358, "learning_rate": 7.857400567020278e-07, "loss": 0.02, "step": 3368 }, { "epoch": 1.5327570518653322, "grad_norm": 1.0225290441120574, "learning_rate": 7.856227547227421e-07, "loss": 0.0571, "step": 3369 }, { "epoch": 1.5332120109190173, "grad_norm": 0.8087086128207897, "learning_rate": 7.85505429403655e-07, "loss": 0.0379, "step": 3370 }, { "epoch": 1.5336669699727024, "grad_norm": 0.7978989304863047, "learning_rate": 7.853880807543533e-07, "loss": 0.0368, "step": 3371 }, { "epoch": 1.5341219290263877, "grad_norm": 0.9986812258215143, "learning_rate": 7.852707087844266e-07, "loss": 0.045, "step": 3372 }, { "epoch": 1.5345768880800728, "grad_norm": 0.8445825322223783, "learning_rate": 7.851533135034657e-07, "loss": 0.0245, "step": 3373 }, { "epoch": 1.5350318471337578, "grad_norm": 0.935428845187389, "learning_rate": 7.850358949210638e-07, "loss": 0.045, "step": 3374 }, { "epoch": 1.5354868061874432, "grad_norm": 1.0693480531641053, "learning_rate": 7.849184530468158e-07, "loss": 0.0359, "step": 3375 }, { "epoch": 1.5359417652411285, "grad_norm": 0.7508182442478859, "learning_rate": 7.848009878903185e-07, "loss": 0.0221, "step": 3376 }, { "epoch": 1.5363967242948133, "grad_norm": 1.151022291992799, "learning_rate": 7.846834994611706e-07, "loss": 0.0572, "step": 3377 }, { "epoch": 1.5368516833484986, "grad_norm": 0.9866141216424829, "learning_rate": 7.845659877689729e-07, "loss": 0.0258, "step": 3378 }, { "epoch": 1.537306642402184, "grad_norm": 0.8342644206933332, "learning_rate": 7.844484528233277e-07, "loss": 0.0254, "step": 3379 }, { "epoch": 1.5377616014558688, "grad_norm": 1.0894711434333582, "learning_rate": 7.843308946338394e-07, "loss": 0.06, "step": 3380 }, { "epoch": 1.5382165605095541, "grad_norm": 0.9982967197579787, "learning_rate": 7.842133132101144e-07, "loss": 0.0384, "step": 3381 }, { "epoch": 1.5386715195632394, "grad_norm": 1.0409208063016457, "learning_rate": 7.84095708561761e-07, "loss": 0.0469, "step": 3382 }, { "epoch": 1.5391264786169245, "grad_norm": 0.9731848448530557, "learning_rate": 7.839780806983892e-07, "loss": 0.0207, "step": 3383 }, { "epoch": 1.5395814376706096, "grad_norm": 0.9306863465323401, "learning_rate": 7.838604296296112e-07, "loss": 0.0231, "step": 3384 }, { "epoch": 1.540036396724295, "grad_norm": 0.8496491998120467, "learning_rate": 7.837427553650408e-07, "loss": 0.0323, "step": 3385 }, { "epoch": 1.54049135577798, "grad_norm": 0.9904542426531011, "learning_rate": 7.836250579142937e-07, "loss": 0.0348, "step": 3386 }, { "epoch": 1.540946314831665, "grad_norm": 0.8589797699066688, "learning_rate": 7.835073372869877e-07, "loss": 0.0305, "step": 3387 }, { "epoch": 1.5414012738853504, "grad_norm": 0.9503117600863021, "learning_rate": 7.833895934927425e-07, "loss": 0.036, "step": 3388 }, { "epoch": 1.5418562329390355, "grad_norm": 0.771385250480453, "learning_rate": 7.832718265411794e-07, "loss": 0.0206, "step": 3389 }, { "epoch": 1.5423111919927206, "grad_norm": 0.9455396923495009, "learning_rate": 7.831540364419218e-07, "loss": 0.0454, "step": 3390 }, { "epoch": 1.542766151046406, "grad_norm": 1.0194627923173374, "learning_rate": 7.830362232045953e-07, "loss": 0.0445, "step": 3391 }, { "epoch": 1.543221110100091, "grad_norm": 0.9339250221519635, "learning_rate": 7.829183868388267e-07, "loss": 0.0251, "step": 3392 }, { "epoch": 1.543676069153776, "grad_norm": 1.066736550655114, "learning_rate": 7.828005273542451e-07, "loss": 0.0509, "step": 3393 }, { "epoch": 1.5441310282074614, "grad_norm": 0.8242936868347746, "learning_rate": 7.826826447604815e-07, "loss": 0.0218, "step": 3394 }, { "epoch": 1.5445859872611465, "grad_norm": 1.3311190183573294, "learning_rate": 7.82564739067169e-07, "loss": 0.0355, "step": 3395 }, { "epoch": 1.5450409463148316, "grad_norm": 1.2967878682417087, "learning_rate": 7.824468102839418e-07, "loss": 0.0501, "step": 3396 }, { "epoch": 1.5454959053685169, "grad_norm": 1.0271844942976427, "learning_rate": 7.82328858420437e-07, "loss": 0.0274, "step": 3397 }, { "epoch": 1.545950864422202, "grad_norm": 0.9049434013882345, "learning_rate": 7.822108834862928e-07, "loss": 0.0219, "step": 3398 }, { "epoch": 1.546405823475887, "grad_norm": 0.7632332642214978, "learning_rate": 7.820928854911496e-07, "loss": 0.0396, "step": 3399 }, { "epoch": 1.5468607825295724, "grad_norm": 1.155588017500432, "learning_rate": 7.819748644446498e-07, "loss": 0.0495, "step": 3400 }, { "epoch": 1.5473157415832575, "grad_norm": 0.9548463086453426, "learning_rate": 7.818568203564373e-07, "loss": 0.0377, "step": 3401 }, { "epoch": 1.5477707006369426, "grad_norm": 0.6426764193298727, "learning_rate": 7.817387532361583e-07, "loss": 0.0236, "step": 3402 }, { "epoch": 1.5482256596906279, "grad_norm": 0.9340439486358106, "learning_rate": 7.816206630934609e-07, "loss": 0.0311, "step": 3403 }, { "epoch": 1.5486806187443132, "grad_norm": 1.2258288050656976, "learning_rate": 7.815025499379945e-07, "loss": 0.0447, "step": 3404 }, { "epoch": 1.549135577797998, "grad_norm": 0.7905185043579759, "learning_rate": 7.813844137794112e-07, "loss": 0.0446, "step": 3405 }, { "epoch": 1.5495905368516834, "grad_norm": 0.7534837670439547, "learning_rate": 7.812662546273643e-07, "loss": 0.0185, "step": 3406 }, { "epoch": 1.5500454959053687, "grad_norm": 0.7978172250272243, "learning_rate": 7.811480724915092e-07, "loss": 0.0354, "step": 3407 }, { "epoch": 1.5505004549590535, "grad_norm": 0.775367875164434, "learning_rate": 7.810298673815032e-07, "loss": 0.0351, "step": 3408 }, { "epoch": 1.5509554140127388, "grad_norm": 1.2424251350225277, "learning_rate": 7.809116393070056e-07, "loss": 0.0665, "step": 3409 }, { "epoch": 1.5514103730664242, "grad_norm": 0.8453381094756598, "learning_rate": 7.807933882776774e-07, "loss": 0.0221, "step": 3410 }, { "epoch": 1.5518653321201092, "grad_norm": 1.3884243411996389, "learning_rate": 7.806751143031816e-07, "loss": 0.0499, "step": 3411 }, { "epoch": 1.5523202911737943, "grad_norm": 0.9156128214356491, "learning_rate": 7.805568173931829e-07, "loss": 0.0298, "step": 3412 }, { "epoch": 1.5527752502274796, "grad_norm": 0.8678452918764848, "learning_rate": 7.804384975573482e-07, "loss": 0.0282, "step": 3413 }, { "epoch": 1.5532302092811647, "grad_norm": 0.9826804154870664, "learning_rate": 7.803201548053458e-07, "loss": 0.0399, "step": 3414 }, { "epoch": 1.5536851683348498, "grad_norm": 0.9255475126240257, "learning_rate": 7.802017891468463e-07, "loss": 0.0314, "step": 3415 }, { "epoch": 1.5541401273885351, "grad_norm": 1.209918695103766, "learning_rate": 7.80083400591522e-07, "loss": 0.0429, "step": 3416 }, { "epoch": 1.5545950864422202, "grad_norm": 1.5756798866431252, "learning_rate": 7.799649891490471e-07, "loss": 0.0466, "step": 3417 }, { "epoch": 1.5550500454959053, "grad_norm": 0.822055068322049, "learning_rate": 7.798465548290975e-07, "loss": 0.0236, "step": 3418 }, { "epoch": 1.5555050045495906, "grad_norm": 0.9392246028227832, "learning_rate": 7.797280976413512e-07, "loss": 0.0355, "step": 3419 }, { "epoch": 1.5559599636032757, "grad_norm": 0.9151742771679718, "learning_rate": 7.796096175954881e-07, "loss": 0.0392, "step": 3420 }, { "epoch": 1.5564149226569608, "grad_norm": 0.8784013202361138, "learning_rate": 7.794911147011898e-07, "loss": 0.0317, "step": 3421 }, { "epoch": 1.556869881710646, "grad_norm": 1.0511942496761377, "learning_rate": 7.793725889681395e-07, "loss": 0.0379, "step": 3422 }, { "epoch": 1.5573248407643312, "grad_norm": 0.9813449917150971, "learning_rate": 7.792540404060231e-07, "loss": 0.0345, "step": 3423 }, { "epoch": 1.5577797998180163, "grad_norm": 1.191897061486849, "learning_rate": 7.791354690245275e-07, "loss": 0.028, "step": 3424 }, { "epoch": 1.5582347588717016, "grad_norm": 1.0909845580913877, "learning_rate": 7.790168748333421e-07, "loss": 0.0531, "step": 3425 }, { "epoch": 1.5586897179253867, "grad_norm": 0.6833997947181513, "learning_rate": 7.788982578421575e-07, "loss": 0.0214, "step": 3426 }, { "epoch": 1.5591446769790718, "grad_norm": 1.121647339616982, "learning_rate": 7.78779618060667e-07, "loss": 0.0312, "step": 3427 }, { "epoch": 1.559599636032757, "grad_norm": 1.205984993111627, "learning_rate": 7.786609554985651e-07, "loss": 0.0406, "step": 3428 }, { "epoch": 1.5600545950864422, "grad_norm": 1.1439135090910495, "learning_rate": 7.78542270165548e-07, "loss": 0.0319, "step": 3429 }, { "epoch": 1.5605095541401273, "grad_norm": 0.813517534259093, "learning_rate": 7.784235620713146e-07, "loss": 0.0256, "step": 3430 }, { "epoch": 1.5609645131938126, "grad_norm": 2.5111421641162557, "learning_rate": 7.783048312255653e-07, "loss": 0.0771, "step": 3431 }, { "epoch": 1.5614194722474979, "grad_norm": 0.8230643246131385, "learning_rate": 7.781860776380018e-07, "loss": 0.0275, "step": 3432 }, { "epoch": 1.5618744313011828, "grad_norm": 1.1490305694531542, "learning_rate": 7.780673013183283e-07, "loss": 0.0521, "step": 3433 }, { "epoch": 1.562329390354868, "grad_norm": 1.4091427171259558, "learning_rate": 7.779485022762507e-07, "loss": 0.0337, "step": 3434 }, { "epoch": 1.5627843494085534, "grad_norm": 1.0794895827266153, "learning_rate": 7.778296805214768e-07, "loss": 0.0448, "step": 3435 }, { "epoch": 1.5632393084622382, "grad_norm": 1.019818693426563, "learning_rate": 7.777108360637159e-07, "loss": 0.0277, "step": 3436 }, { "epoch": 1.5636942675159236, "grad_norm": 0.8036576817027009, "learning_rate": 7.775919689126798e-07, "loss": 0.0287, "step": 3437 }, { "epoch": 1.5641492265696089, "grad_norm": 1.2376346645225047, "learning_rate": 7.774730790780813e-07, "loss": 0.0502, "step": 3438 }, { "epoch": 1.564604185623294, "grad_norm": 1.171017661855251, "learning_rate": 7.773541665696361e-07, "loss": 0.0285, "step": 3439 }, { "epoch": 1.565059144676979, "grad_norm": 0.9476557683702831, "learning_rate": 7.772352313970608e-07, "loss": 0.0342, "step": 3440 }, { "epoch": 1.5655141037306644, "grad_norm": 0.8157580955059407, "learning_rate": 7.771162735700744e-07, "loss": 0.0216, "step": 3441 }, { "epoch": 1.5659690627843494, "grad_norm": 1.0058053394658433, "learning_rate": 7.769972930983977e-07, "loss": 0.0341, "step": 3442 }, { "epoch": 1.5664240218380345, "grad_norm": 0.976620353798322, "learning_rate": 7.76878289991753e-07, "loss": 0.0382, "step": 3443 }, { "epoch": 1.5668789808917198, "grad_norm": 1.0184683569669775, "learning_rate": 7.767592642598648e-07, "loss": 0.0434, "step": 3444 }, { "epoch": 1.567333939945405, "grad_norm": 0.950448167199121, "learning_rate": 7.766402159124595e-07, "loss": 0.0361, "step": 3445 }, { "epoch": 1.56778889899909, "grad_norm": 1.1478666797962007, "learning_rate": 7.765211449592647e-07, "loss": 0.0404, "step": 3446 }, { "epoch": 1.5682438580527753, "grad_norm": 0.8983681973886607, "learning_rate": 7.76402051410011e-07, "loss": 0.0379, "step": 3447 }, { "epoch": 1.5686988171064604, "grad_norm": 1.0961364014647224, "learning_rate": 7.762829352744299e-07, "loss": 0.031, "step": 3448 }, { "epoch": 1.5691537761601455, "grad_norm": 0.9127785993489056, "learning_rate": 7.761637965622549e-07, "loss": 0.0388, "step": 3449 }, { "epoch": 1.5696087352138308, "grad_norm": 2.258929925800965, "learning_rate": 7.760446352832217e-07, "loss": 0.0653, "step": 3450 }, { "epoch": 1.570063694267516, "grad_norm": 0.762164238986714, "learning_rate": 7.759254514470673e-07, "loss": 0.0255, "step": 3451 }, { "epoch": 1.570518653321201, "grad_norm": 0.9940379509944665, "learning_rate": 7.758062450635312e-07, "loss": 0.0402, "step": 3452 }, { "epoch": 1.5709736123748863, "grad_norm": 0.9535717204948514, "learning_rate": 7.756870161423543e-07, "loss": 0.0226, "step": 3453 }, { "epoch": 1.5714285714285714, "grad_norm": 0.797574950090285, "learning_rate": 7.755677646932794e-07, "loss": 0.0181, "step": 3454 }, { "epoch": 1.5718835304822565, "grad_norm": 0.8065439705034954, "learning_rate": 7.754484907260512e-07, "loss": 0.0246, "step": 3455 }, { "epoch": 1.5723384895359418, "grad_norm": 1.0976475349786978, "learning_rate": 7.753291942504163e-07, "loss": 0.0378, "step": 3456 }, { "epoch": 1.5727934485896269, "grad_norm": 0.7151546653889521, "learning_rate": 7.75209875276123e-07, "loss": 0.0367, "step": 3457 }, { "epoch": 1.573248407643312, "grad_norm": 1.443443624649499, "learning_rate": 7.750905338129216e-07, "loss": 0.076, "step": 3458 }, { "epoch": 1.5737033666969973, "grad_norm": 0.7482080191139597, "learning_rate": 7.749711698705641e-07, "loss": 0.028, "step": 3459 }, { "epoch": 1.5741583257506826, "grad_norm": 1.3156783983333435, "learning_rate": 7.748517834588041e-07, "loss": 0.0491, "step": 3460 }, { "epoch": 1.5746132848043675, "grad_norm": 1.0685951748847078, "learning_rate": 7.747323745873977e-07, "loss": 0.0387, "step": 3461 }, { "epoch": 1.5750682438580528, "grad_norm": 0.8683074062973614, "learning_rate": 7.746129432661025e-07, "loss": 0.0237, "step": 3462 }, { "epoch": 1.575523202911738, "grad_norm": 0.9389384436704687, "learning_rate": 7.744934895046775e-07, "loss": 0.0356, "step": 3463 }, { "epoch": 1.575978161965423, "grad_norm": 1.094639479294198, "learning_rate": 7.743740133128842e-07, "loss": 0.0448, "step": 3464 }, { "epoch": 1.5764331210191083, "grad_norm": 0.9306677059396596, "learning_rate": 7.742545147004858e-07, "loss": 0.0335, "step": 3465 }, { "epoch": 1.5768880800727936, "grad_norm": 0.8323739662915765, "learning_rate": 7.741349936772468e-07, "loss": 0.0258, "step": 3466 }, { "epoch": 1.5773430391264787, "grad_norm": 0.6270200235365221, "learning_rate": 7.740154502529339e-07, "loss": 0.018, "step": 3467 }, { "epoch": 1.5777979981801638, "grad_norm": 0.9835371223896436, "learning_rate": 7.738958844373162e-07, "loss": 0.0229, "step": 3468 }, { "epoch": 1.578252957233849, "grad_norm": 0.7704626322203195, "learning_rate": 7.737762962401636e-07, "loss": 0.0211, "step": 3469 }, { "epoch": 1.5787079162875342, "grad_norm": 1.0531087679511983, "learning_rate": 7.736566856712484e-07, "loss": 0.035, "step": 3470 }, { "epoch": 1.5791628753412192, "grad_norm": 0.9189928122654404, "learning_rate": 7.735370527403446e-07, "loss": 0.0274, "step": 3471 }, { "epoch": 1.5796178343949046, "grad_norm": 0.7619438795284988, "learning_rate": 7.734173974572283e-07, "loss": 0.0278, "step": 3472 }, { "epoch": 1.5800727934485896, "grad_norm": 0.8864036861195365, "learning_rate": 7.73297719831677e-07, "loss": 0.0384, "step": 3473 }, { "epoch": 1.5805277525022747, "grad_norm": 0.9582710810436442, "learning_rate": 7.731780198734701e-07, "loss": 0.0237, "step": 3474 }, { "epoch": 1.58098271155596, "grad_norm": 1.027276906080928, "learning_rate": 7.730582975923892e-07, "loss": 0.0244, "step": 3475 }, { "epoch": 1.5814376706096451, "grad_norm": 1.1472425491041482, "learning_rate": 7.729385529982173e-07, "loss": 0.0523, "step": 3476 }, { "epoch": 1.5818926296633302, "grad_norm": 1.0744650191010205, "learning_rate": 7.728187861007393e-07, "loss": 0.0511, "step": 3477 }, { "epoch": 1.5823475887170155, "grad_norm": 0.8142993926242905, "learning_rate": 7.726989969097424e-07, "loss": 0.0349, "step": 3478 }, { "epoch": 1.5828025477707006, "grad_norm": 1.0545524462571638, "learning_rate": 7.725791854350147e-07, "loss": 0.0616, "step": 3479 }, { "epoch": 1.5832575068243857, "grad_norm": 0.8320760921078437, "learning_rate": 7.724593516863471e-07, "loss": 0.0433, "step": 3480 }, { "epoch": 1.583712465878071, "grad_norm": 0.8308577151408484, "learning_rate": 7.723394956735316e-07, "loss": 0.0224, "step": 3481 }, { "epoch": 1.584167424931756, "grad_norm": 1.3842831887917046, "learning_rate": 7.722196174063625e-07, "loss": 0.039, "step": 3482 }, { "epoch": 1.5846223839854412, "grad_norm": 1.2837432492380012, "learning_rate": 7.720997168946355e-07, "loss": 0.0373, "step": 3483 }, { "epoch": 1.5850773430391265, "grad_norm": 0.834366134740409, "learning_rate": 7.719797941481486e-07, "loss": 0.0278, "step": 3484 }, { "epoch": 1.5855323020928116, "grad_norm": 1.2970139844344246, "learning_rate": 7.718598491767009e-07, "loss": 0.0631, "step": 3485 }, { "epoch": 1.5859872611464967, "grad_norm": 0.8529453976173942, "learning_rate": 7.717398819900942e-07, "loss": 0.0338, "step": 3486 }, { "epoch": 1.586442220200182, "grad_norm": 0.8963685528056743, "learning_rate": 7.716198925981316e-07, "loss": 0.0191, "step": 3487 }, { "epoch": 1.5868971792538673, "grad_norm": 0.8675730916622489, "learning_rate": 7.714998810106177e-07, "loss": 0.0486, "step": 3488 }, { "epoch": 1.5873521383075522, "grad_norm": 0.8759979473124747, "learning_rate": 7.713798472373598e-07, "loss": 0.0416, "step": 3489 }, { "epoch": 1.5878070973612375, "grad_norm": 1.2168178884139778, "learning_rate": 7.712597912881664e-07, "loss": 0.0426, "step": 3490 }, { "epoch": 1.5882620564149228, "grad_norm": 0.9905687301841587, "learning_rate": 7.711397131728477e-07, "loss": 0.0317, "step": 3491 }, { "epoch": 1.5887170154686077, "grad_norm": 0.8620478122362184, "learning_rate": 7.710196129012162e-07, "loss": 0.0216, "step": 3492 }, { "epoch": 1.589171974522293, "grad_norm": 0.7264178255894146, "learning_rate": 7.708994904830859e-07, "loss": 0.0169, "step": 3493 }, { "epoch": 1.5896269335759783, "grad_norm": 0.8951910398906391, "learning_rate": 7.707793459282725e-07, "loss": 0.0412, "step": 3494 }, { "epoch": 1.5900818926296634, "grad_norm": 1.0136023380753398, "learning_rate": 7.706591792465938e-07, "loss": 0.0468, "step": 3495 }, { "epoch": 1.5905368516833485, "grad_norm": 1.30920963087674, "learning_rate": 7.705389904478692e-07, "loss": 0.0536, "step": 3496 }, { "epoch": 1.5909918107370338, "grad_norm": 1.2142070313169429, "learning_rate": 7.704187795419201e-07, "loss": 0.0232, "step": 3497 }, { "epoch": 1.5914467697907189, "grad_norm": 0.8348041778674216, "learning_rate": 7.702985465385697e-07, "loss": 0.0292, "step": 3498 }, { "epoch": 1.591901728844404, "grad_norm": 1.088025084685648, "learning_rate": 7.701782914476425e-07, "loss": 0.0435, "step": 3499 }, { "epoch": 1.5923566878980893, "grad_norm": 0.7276564282208162, "learning_rate": 7.700580142789654e-07, "loss": 0.0295, "step": 3500 }, { "epoch": 1.5928116469517744, "grad_norm": 1.0386040928346931, "learning_rate": 7.699377150423672e-07, "loss": 0.0406, "step": 3501 }, { "epoch": 1.5932666060054594, "grad_norm": 1.3211010497394668, "learning_rate": 7.698173937476777e-07, "loss": 0.0386, "step": 3502 }, { "epoch": 1.5937215650591448, "grad_norm": 0.9927627415109123, "learning_rate": 7.696970504047293e-07, "loss": 0.0376, "step": 3503 }, { "epoch": 1.5941765241128298, "grad_norm": 1.1889632592653876, "learning_rate": 7.69576685023356e-07, "loss": 0.0464, "step": 3504 }, { "epoch": 1.594631483166515, "grad_norm": 0.8595972571168323, "learning_rate": 7.694562976133934e-07, "loss": 0.0454, "step": 3505 }, { "epoch": 1.5950864422202002, "grad_norm": 1.3010769106655897, "learning_rate": 7.693358881846788e-07, "loss": 0.0594, "step": 3506 }, { "epoch": 1.5955414012738853, "grad_norm": 1.060453911326754, "learning_rate": 7.692154567470522e-07, "loss": 0.0234, "step": 3507 }, { "epoch": 1.5959963603275704, "grad_norm": 0.9207058520336195, "learning_rate": 7.690950033103538e-07, "loss": 0.0245, "step": 3508 }, { "epoch": 1.5964513193812557, "grad_norm": 1.3581298921595077, "learning_rate": 7.689745278844271e-07, "loss": 0.0258, "step": 3509 }, { "epoch": 1.5969062784349408, "grad_norm": 3.2786731812265577, "learning_rate": 7.688540304791165e-07, "loss": 0.0343, "step": 3510 }, { "epoch": 1.597361237488626, "grad_norm": 1.2628976708710409, "learning_rate": 7.68733511104269e-07, "loss": 0.0434, "step": 3511 }, { "epoch": 1.5978161965423112, "grad_norm": 1.3653013663633025, "learning_rate": 7.686129697697324e-07, "loss": 0.0276, "step": 3512 }, { "epoch": 1.5982711555959963, "grad_norm": 0.9335413887554855, "learning_rate": 7.684924064853568e-07, "loss": 0.0308, "step": 3513 }, { "epoch": 1.5987261146496814, "grad_norm": 0.6945168904570539, "learning_rate": 7.683718212609944e-07, "loss": 0.0275, "step": 3514 }, { "epoch": 1.5991810737033667, "grad_norm": 1.2514028163754867, "learning_rate": 7.682512141064987e-07, "loss": 0.0637, "step": 3515 }, { "epoch": 1.599636032757052, "grad_norm": 1.488557335755261, "learning_rate": 7.681305850317251e-07, "loss": 0.0528, "step": 3516 }, { "epoch": 1.6000909918107369, "grad_norm": 0.9987640925472243, "learning_rate": 7.68009934046531e-07, "loss": 0.0492, "step": 3517 }, { "epoch": 1.6005459508644222, "grad_norm": 0.9940096210498419, "learning_rate": 7.678892611607755e-07, "loss": 0.0358, "step": 3518 }, { "epoch": 1.6010009099181075, "grad_norm": 1.0224141260627144, "learning_rate": 7.677685663843194e-07, "loss": 0.0263, "step": 3519 }, { "epoch": 1.6014558689717924, "grad_norm": 0.8319885100099907, "learning_rate": 7.676478497270251e-07, "loss": 0.0372, "step": 3520 }, { "epoch": 1.6019108280254777, "grad_norm": 0.8786558990123748, "learning_rate": 7.675271111987574e-07, "loss": 0.036, "step": 3521 }, { "epoch": 1.602365787079163, "grad_norm": 0.8817434162161227, "learning_rate": 7.674063508093822e-07, "loss": 0.0373, "step": 3522 }, { "epoch": 1.602820746132848, "grad_norm": 0.9103308527847908, "learning_rate": 7.672855685687676e-07, "loss": 0.0423, "step": 3523 }, { "epoch": 1.6032757051865332, "grad_norm": 1.1005895956725296, "learning_rate": 7.671647644867835e-07, "loss": 0.0485, "step": 3524 }, { "epoch": 1.6037306642402185, "grad_norm": 1.414202341084974, "learning_rate": 7.670439385733012e-07, "loss": 0.0256, "step": 3525 }, { "epoch": 1.6041856232939036, "grad_norm": 0.7404248340650502, "learning_rate": 7.669230908381944e-07, "loss": 0.0345, "step": 3526 }, { "epoch": 1.6046405823475887, "grad_norm": 1.0434437614005505, "learning_rate": 7.668022212913377e-07, "loss": 0.0468, "step": 3527 }, { "epoch": 1.605095541401274, "grad_norm": 1.1412565249379123, "learning_rate": 7.666813299426087e-07, "loss": 0.053, "step": 3528 }, { "epoch": 1.605550500454959, "grad_norm": 0.9086802399273984, "learning_rate": 7.665604168018855e-07, "loss": 0.0387, "step": 3529 }, { "epoch": 1.6060054595086442, "grad_norm": 0.9957313339362209, "learning_rate": 7.664394818790489e-07, "loss": 0.0235, "step": 3530 }, { "epoch": 1.6064604185623295, "grad_norm": 0.9125092337795346, "learning_rate": 7.663185251839812e-07, "loss": 0.0262, "step": 3531 }, { "epoch": 1.6069153776160146, "grad_norm": 0.5823783618457317, "learning_rate": 7.66197546726566e-07, "loss": 0.0258, "step": 3532 }, { "epoch": 1.6073703366696996, "grad_norm": 0.8899236873362361, "learning_rate": 7.660765465166898e-07, "loss": 0.0288, "step": 3533 }, { "epoch": 1.607825295723385, "grad_norm": 0.7514685367293514, "learning_rate": 7.659555245642395e-07, "loss": 0.0254, "step": 3534 }, { "epoch": 1.60828025477707, "grad_norm": 0.8311231623741377, "learning_rate": 7.658344808791049e-07, "loss": 0.0314, "step": 3535 }, { "epoch": 1.6087352138307551, "grad_norm": 0.7903307878329843, "learning_rate": 7.65713415471177e-07, "loss": 0.0295, "step": 3536 }, { "epoch": 1.6091901728844404, "grad_norm": 0.8274819805866896, "learning_rate": 7.655923283503488e-07, "loss": 0.0293, "step": 3537 }, { "epoch": 1.6096451319381255, "grad_norm": 0.7571997887150376, "learning_rate": 7.654712195265147e-07, "loss": 0.03, "step": 3538 }, { "epoch": 1.6101000909918106, "grad_norm": 0.6508787597662078, "learning_rate": 7.653500890095717e-07, "loss": 0.018, "step": 3539 }, { "epoch": 1.610555050045496, "grad_norm": 0.8570616581840088, "learning_rate": 7.652289368094176e-07, "loss": 0.0236, "step": 3540 }, { "epoch": 1.6110100090991812, "grad_norm": 1.2732404035352707, "learning_rate": 7.651077629359525e-07, "loss": 0.0322, "step": 3541 }, { "epoch": 1.611464968152866, "grad_norm": 0.8284667098543362, "learning_rate": 7.649865673990783e-07, "loss": 0.0295, "step": 3542 }, { "epoch": 1.6119199272065514, "grad_norm": 1.0209770193459848, "learning_rate": 7.648653502086986e-07, "loss": 0.0628, "step": 3543 }, { "epoch": 1.6123748862602367, "grad_norm": 0.9954814599001092, "learning_rate": 7.647441113747183e-07, "loss": 0.031, "step": 3544 }, { "epoch": 1.6128298453139216, "grad_norm": 0.8517035677026198, "learning_rate": 7.646228509070451e-07, "loss": 0.0394, "step": 3545 }, { "epoch": 1.613284804367607, "grad_norm": 0.6232546206535559, "learning_rate": 7.645015688155874e-07, "loss": 0.0235, "step": 3546 }, { "epoch": 1.6137397634212922, "grad_norm": 0.8869527004634391, "learning_rate": 7.64380265110256e-07, "loss": 0.0282, "step": 3547 }, { "epoch": 1.6141947224749773, "grad_norm": 0.8973084006853762, "learning_rate": 7.642589398009631e-07, "loss": 0.0445, "step": 3548 }, { "epoch": 1.6146496815286624, "grad_norm": 0.6925732229148356, "learning_rate": 7.641375928976233e-07, "loss": 0.0247, "step": 3549 }, { "epoch": 1.6151046405823477, "grad_norm": 0.797595121781999, "learning_rate": 7.64016224410152e-07, "loss": 0.039, "step": 3550 }, { "epoch": 1.6155595996360328, "grad_norm": 1.0029904585015519, "learning_rate": 7.638948343484672e-07, "loss": 0.0288, "step": 3551 }, { "epoch": 1.6160145586897179, "grad_norm": 0.8445887006370201, "learning_rate": 7.637734227224883e-07, "loss": 0.0268, "step": 3552 }, { "epoch": 1.6164695177434032, "grad_norm": 0.6640149145894326, "learning_rate": 7.636519895421365e-07, "loss": 0.0279, "step": 3553 }, { "epoch": 1.6169244767970883, "grad_norm": 0.9779382112093249, "learning_rate": 7.635305348173349e-07, "loss": 0.0356, "step": 3554 }, { "epoch": 1.6173794358507734, "grad_norm": 1.1071687380236381, "learning_rate": 7.634090585580079e-07, "loss": 0.052, "step": 3555 }, { "epoch": 1.6178343949044587, "grad_norm": 0.6525676404296362, "learning_rate": 7.632875607740823e-07, "loss": 0.0308, "step": 3556 }, { "epoch": 1.6182893539581438, "grad_norm": 1.0119445150784003, "learning_rate": 7.631660414754861e-07, "loss": 0.0324, "step": 3557 }, { "epoch": 1.6187443130118289, "grad_norm": 0.938273042871981, "learning_rate": 7.630445006721495e-07, "loss": 0.0392, "step": 3558 }, { "epoch": 1.6191992720655142, "grad_norm": 0.6436806101457173, "learning_rate": 7.629229383740041e-07, "loss": 0.0196, "step": 3559 }, { "epoch": 1.6196542311191993, "grad_norm": 0.8264572460205196, "learning_rate": 7.628013545909837e-07, "loss": 0.0394, "step": 3560 }, { "epoch": 1.6201091901728844, "grad_norm": 0.9782680153248389, "learning_rate": 7.626797493330233e-07, "loss": 0.0385, "step": 3561 }, { "epoch": 1.6205641492265697, "grad_norm": 0.8575056951552265, "learning_rate": 7.625581226100602e-07, "loss": 0.0407, "step": 3562 }, { "epoch": 1.6210191082802548, "grad_norm": 1.0250815505385125, "learning_rate": 7.624364744320328e-07, "loss": 0.019, "step": 3563 }, { "epoch": 1.6214740673339398, "grad_norm": 0.9644415095313444, "learning_rate": 7.62314804808882e-07, "loss": 0.0333, "step": 3564 }, { "epoch": 1.6219290263876252, "grad_norm": 1.0627187036670571, "learning_rate": 7.6219311375055e-07, "loss": 0.0399, "step": 3565 }, { "epoch": 1.6223839854413102, "grad_norm": 0.7391006082249724, "learning_rate": 7.620714012669805e-07, "loss": 0.0301, "step": 3566 }, { "epoch": 1.6228389444949953, "grad_norm": 1.4671231298237277, "learning_rate": 7.619496673681199e-07, "loss": 0.0846, "step": 3567 }, { "epoch": 1.6232939035486806, "grad_norm": 0.9682370573278636, "learning_rate": 7.618279120639153e-07, "loss": 0.0339, "step": 3568 }, { "epoch": 1.623748862602366, "grad_norm": 0.8482941242034705, "learning_rate": 7.617061353643161e-07, "loss": 0.0236, "step": 3569 }, { "epoch": 1.6242038216560508, "grad_norm": 0.8724643366429032, "learning_rate": 7.615843372792734e-07, "loss": 0.0225, "step": 3570 }, { "epoch": 1.6246587807097361, "grad_norm": 0.6993547813220327, "learning_rate": 7.614625178187402e-07, "loss": 0.0325, "step": 3571 }, { "epoch": 1.6251137397634214, "grad_norm": 0.758990603155747, "learning_rate": 7.613406769926705e-07, "loss": 0.0296, "step": 3572 }, { "epoch": 1.6255686988171063, "grad_norm": 1.3862359432363267, "learning_rate": 7.61218814811021e-07, "loss": 0.0426, "step": 3573 }, { "epoch": 1.6260236578707916, "grad_norm": 0.6717078978841888, "learning_rate": 7.610969312837497e-07, "loss": 0.0197, "step": 3574 }, { "epoch": 1.626478616924477, "grad_norm": 0.7876095496933203, "learning_rate": 7.609750264208161e-07, "loss": 0.0372, "step": 3575 }, { "epoch": 1.626933575978162, "grad_norm": 1.1053161587691172, "learning_rate": 7.608531002321819e-07, "loss": 0.038, "step": 3576 }, { "epoch": 1.627388535031847, "grad_norm": 0.8233976139562303, "learning_rate": 7.607311527278104e-07, "loss": 0.0276, "step": 3577 }, { "epoch": 1.6278434940855324, "grad_norm": 0.7855584491673503, "learning_rate": 7.606091839176665e-07, "loss": 0.0316, "step": 3578 }, { "epoch": 1.6282984531392175, "grad_norm": 0.7174814765883675, "learning_rate": 7.60487193811717e-07, "loss": 0.0222, "step": 3579 }, { "epoch": 1.6287534121929026, "grad_norm": 0.5607595517721546, "learning_rate": 7.603651824199304e-07, "loss": 0.0259, "step": 3580 }, { "epoch": 1.629208371246588, "grad_norm": 0.7221029853810141, "learning_rate": 7.60243149752277e-07, "loss": 0.0252, "step": 3581 }, { "epoch": 1.629663330300273, "grad_norm": 2.3119129120382462, "learning_rate": 7.601210958187285e-07, "loss": 0.0441, "step": 3582 }, { "epoch": 1.630118289353958, "grad_norm": 1.1669309540531276, "learning_rate": 7.599990206292588e-07, "loss": 0.0425, "step": 3583 }, { "epoch": 1.6305732484076434, "grad_norm": 0.7039489872410455, "learning_rate": 7.598769241938434e-07, "loss": 0.0219, "step": 3584 }, { "epoch": 1.6310282074613285, "grad_norm": 1.2534893889844134, "learning_rate": 7.597548065224593e-07, "loss": 0.0473, "step": 3585 }, { "epoch": 1.6314831665150136, "grad_norm": 0.5199957482084405, "learning_rate": 7.596326676250852e-07, "loss": 0.0156, "step": 3586 }, { "epoch": 1.6319381255686989, "grad_norm": 0.975277495935297, "learning_rate": 7.595105075117022e-07, "loss": 0.0316, "step": 3587 }, { "epoch": 1.632393084622384, "grad_norm": 0.7306323865898771, "learning_rate": 7.593883261922926e-07, "loss": 0.0273, "step": 3588 }, { "epoch": 1.632848043676069, "grad_norm": 0.8594122121774007, "learning_rate": 7.592661236768401e-07, "loss": 0.0269, "step": 3589 }, { "epoch": 1.6333030027297544, "grad_norm": 0.7855087292415127, "learning_rate": 7.591438999753309e-07, "loss": 0.032, "step": 3590 }, { "epoch": 1.6337579617834395, "grad_norm": 1.1187420484546196, "learning_rate": 7.590216550977526e-07, "loss": 0.0333, "step": 3591 }, { "epoch": 1.6342129208371245, "grad_norm": 0.7327803739201604, "learning_rate": 7.588993890540942e-07, "loss": 0.0276, "step": 3592 }, { "epoch": 1.6346678798908099, "grad_norm": 1.1314795314174948, "learning_rate": 7.58777101854347e-07, "loss": 0.0441, "step": 3593 }, { "epoch": 1.635122838944495, "grad_norm": 0.7753973951812566, "learning_rate": 7.586547935085037e-07, "loss": 0.0282, "step": 3594 }, { "epoch": 1.63557779799818, "grad_norm": 1.0927008539809873, "learning_rate": 7.585324640265587e-07, "loss": 0.0375, "step": 3595 }, { "epoch": 1.6360327570518653, "grad_norm": 0.54361001006905, "learning_rate": 7.584101134185084e-07, "loss": 0.0249, "step": 3596 }, { "epoch": 1.6364877161055507, "grad_norm": 0.9436557005450976, "learning_rate": 7.582877416943503e-07, "loss": 0.0413, "step": 3597 }, { "epoch": 1.6369426751592355, "grad_norm": 0.9727389170919065, "learning_rate": 7.581653488640844e-07, "loss": 0.0382, "step": 3598 }, { "epoch": 1.6373976342129208, "grad_norm": 0.9319940175020445, "learning_rate": 7.580429349377123e-07, "loss": 0.0315, "step": 3599 }, { "epoch": 1.6378525932666061, "grad_norm": 1.0113470926495671, "learning_rate": 7.579204999252367e-07, "loss": 0.0365, "step": 3600 }, { "epoch": 1.638307552320291, "grad_norm": 0.9607859297727236, "learning_rate": 7.577980438366627e-07, "loss": 0.0375, "step": 3601 }, { "epoch": 1.6387625113739763, "grad_norm": 1.074007543768012, "learning_rate": 7.576755666819967e-07, "loss": 0.0347, "step": 3602 }, { "epoch": 1.6392174704276616, "grad_norm": 0.8269011571941337, "learning_rate": 7.575530684712473e-07, "loss": 0.0226, "step": 3603 }, { "epoch": 1.6396724294813467, "grad_norm": 0.8805734425714167, "learning_rate": 7.574305492144238e-07, "loss": 0.0451, "step": 3604 }, { "epoch": 1.6401273885350318, "grad_norm": 1.0099425178865788, "learning_rate": 7.573080089215386e-07, "loss": 0.0284, "step": 3605 }, { "epoch": 1.6405823475887171, "grad_norm": 0.9543855608341975, "learning_rate": 7.571854476026048e-07, "loss": 0.0403, "step": 3606 }, { "epoch": 1.6410373066424022, "grad_norm": 0.782197460187853, "learning_rate": 7.570628652676377e-07, "loss": 0.0394, "step": 3607 }, { "epoch": 1.6414922656960873, "grad_norm": 0.8609868654397161, "learning_rate": 7.569402619266543e-07, "loss": 0.0251, "step": 3608 }, { "epoch": 1.6419472247497726, "grad_norm": 1.0232835281305295, "learning_rate": 7.568176375896729e-07, "loss": 0.0399, "step": 3609 }, { "epoch": 1.6424021838034577, "grad_norm": 1.0669603889711676, "learning_rate": 7.56694992266714e-07, "loss": 0.0277, "step": 3610 }, { "epoch": 1.6428571428571428, "grad_norm": 0.7432997139513513, "learning_rate": 7.565723259677993e-07, "loss": 0.0366, "step": 3611 }, { "epoch": 1.643312101910828, "grad_norm": 1.4292142645863377, "learning_rate": 7.564496387029531e-07, "loss": 0.0454, "step": 3612 }, { "epoch": 1.6437670609645132, "grad_norm": 0.8250137493071127, "learning_rate": 7.563269304822004e-07, "loss": 0.0231, "step": 3613 }, { "epoch": 1.6442220200181983, "grad_norm": 0.8314363262707181, "learning_rate": 7.562042013155685e-07, "loss": 0.0301, "step": 3614 }, { "epoch": 1.6446769790718836, "grad_norm": 1.1748085679080542, "learning_rate": 7.560814512130863e-07, "loss": 0.0522, "step": 3615 }, { "epoch": 1.6451319381255687, "grad_norm": 1.0051638438744506, "learning_rate": 7.559586801847843e-07, "loss": 0.032, "step": 3616 }, { "epoch": 1.6455868971792538, "grad_norm": 0.8598651679271702, "learning_rate": 7.55835888240695e-07, "loss": 0.0426, "step": 3617 }, { "epoch": 1.646041856232939, "grad_norm": 1.0433832063320903, "learning_rate": 7.557130753908521e-07, "loss": 0.0342, "step": 3618 }, { "epoch": 1.6464968152866242, "grad_norm": 1.1301232818703155, "learning_rate": 7.555902416452916e-07, "loss": 0.0347, "step": 3619 }, { "epoch": 1.6469517743403093, "grad_norm": 0.7049368428784307, "learning_rate": 7.554673870140509e-07, "loss": 0.0269, "step": 3620 }, { "epoch": 1.6474067333939946, "grad_norm": 1.1725012604316964, "learning_rate": 7.553445115071686e-07, "loss": 0.0448, "step": 3621 }, { "epoch": 1.6478616924476797, "grad_norm": 0.9161194691889688, "learning_rate": 7.552216151346863e-07, "loss": 0.0262, "step": 3622 }, { "epoch": 1.6483166515013647, "grad_norm": 1.158081718702307, "learning_rate": 7.550986979066461e-07, "loss": 0.0385, "step": 3623 }, { "epoch": 1.64877161055505, "grad_norm": 0.8431405947338398, "learning_rate": 7.549757598330924e-07, "loss": 0.038, "step": 3624 }, { "epoch": 1.6492265696087354, "grad_norm": 1.1927950892480967, "learning_rate": 7.548528009240709e-07, "loss": 0.0336, "step": 3625 }, { "epoch": 1.6496815286624202, "grad_norm": 0.9864868152248273, "learning_rate": 7.547298211896293e-07, "loss": 0.0267, "step": 3626 }, { "epoch": 1.6501364877161055, "grad_norm": 0.8652823611685224, "learning_rate": 7.546068206398174e-07, "loss": 0.038, "step": 3627 }, { "epoch": 1.6505914467697909, "grad_norm": 1.3802959791170302, "learning_rate": 7.544837992846855e-07, "loss": 0.0398, "step": 3628 }, { "epoch": 1.6510464058234757, "grad_norm": 1.4026212991342617, "learning_rate": 7.543607571342871e-07, "loss": 0.0291, "step": 3629 }, { "epoch": 1.651501364877161, "grad_norm": 0.8856041598334492, "learning_rate": 7.542376941986763e-07, "loss": 0.0255, "step": 3630 }, { "epoch": 1.6519563239308463, "grad_norm": 0.8431905298221543, "learning_rate": 7.541146104879093e-07, "loss": 0.0264, "step": 3631 }, { "epoch": 1.6524112829845314, "grad_norm": 0.6198548941023233, "learning_rate": 7.539915060120436e-07, "loss": 0.0204, "step": 3632 }, { "epoch": 1.6528662420382165, "grad_norm": 0.9101277626382972, "learning_rate": 7.538683807811392e-07, "loss": 0.0426, "step": 3633 }, { "epoch": 1.6533212010919018, "grad_norm": 0.7318521621665052, "learning_rate": 7.537452348052573e-07, "loss": 0.0237, "step": 3634 }, { "epoch": 1.653776160145587, "grad_norm": 1.1695633261573253, "learning_rate": 7.536220680944607e-07, "loss": 0.0297, "step": 3635 }, { "epoch": 1.654231119199272, "grad_norm": 0.7565366542889806, "learning_rate": 7.534988806588139e-07, "loss": 0.0268, "step": 3636 }, { "epoch": 1.6546860782529573, "grad_norm": 0.9170961143367562, "learning_rate": 7.533756725083835e-07, "loss": 0.0289, "step": 3637 }, { "epoch": 1.6551410373066424, "grad_norm": 0.8524382293604967, "learning_rate": 7.532524436532372e-07, "loss": 0.0285, "step": 3638 }, { "epoch": 1.6555959963603275, "grad_norm": 1.0230172711488392, "learning_rate": 7.53129194103445e-07, "loss": 0.0451, "step": 3639 }, { "epoch": 1.6560509554140128, "grad_norm": 0.9605659269432669, "learning_rate": 7.530059238690783e-07, "loss": 0.0366, "step": 3640 }, { "epoch": 1.656505914467698, "grad_norm": 1.2154334875283395, "learning_rate": 7.528826329602098e-07, "loss": 0.0448, "step": 3641 }, { "epoch": 1.656960873521383, "grad_norm": 0.8352790406492103, "learning_rate": 7.527593213869147e-07, "loss": 0.0212, "step": 3642 }, { "epoch": 1.6574158325750683, "grad_norm": 1.235782482361598, "learning_rate": 7.526359891592693e-07, "loss": 0.0363, "step": 3643 }, { "epoch": 1.6578707916287534, "grad_norm": 1.3135543910410858, "learning_rate": 7.525126362873518e-07, "loss": 0.0326, "step": 3644 }, { "epoch": 1.6583257506824385, "grad_norm": 0.7253702911857627, "learning_rate": 7.523892627812418e-07, "loss": 0.028, "step": 3645 }, { "epoch": 1.6587807097361238, "grad_norm": 1.0011084142371685, "learning_rate": 7.522658686510213e-07, "loss": 0.0419, "step": 3646 }, { "epoch": 1.6592356687898089, "grad_norm": 0.9591191358510034, "learning_rate": 7.521424539067731e-07, "loss": 0.0352, "step": 3647 }, { "epoch": 1.659690627843494, "grad_norm": 1.270826536885851, "learning_rate": 7.520190185585821e-07, "loss": 0.0384, "step": 3648 }, { "epoch": 1.6601455868971793, "grad_norm": 0.9831105189475593, "learning_rate": 7.518955626165353e-07, "loss": 0.0499, "step": 3649 }, { "epoch": 1.6606005459508644, "grad_norm": 0.8244923671563136, "learning_rate": 7.517720860907204e-07, "loss": 0.0445, "step": 3650 }, { "epoch": 1.6610555050045495, "grad_norm": 0.9126790876819876, "learning_rate": 7.516485889912278e-07, "loss": 0.0273, "step": 3651 }, { "epoch": 1.6615104640582348, "grad_norm": 0.882303846987347, "learning_rate": 7.51525071328149e-07, "loss": 0.0294, "step": 3652 }, { "epoch": 1.66196542311192, "grad_norm": 0.7904612226334157, "learning_rate": 7.514015331115771e-07, "loss": 0.0325, "step": 3653 }, { "epoch": 1.662420382165605, "grad_norm": 0.8960416786580818, "learning_rate": 7.512779743516072e-07, "loss": 0.0439, "step": 3654 }, { "epoch": 1.6628753412192903, "grad_norm": 0.5969827385463875, "learning_rate": 7.511543950583361e-07, "loss": 0.0183, "step": 3655 }, { "epoch": 1.6633303002729756, "grad_norm": 0.887341836638581, "learning_rate": 7.51030795241862e-07, "loss": 0.0279, "step": 3656 }, { "epoch": 1.6637852593266604, "grad_norm": 0.8650994775052941, "learning_rate": 7.509071749122848e-07, "loss": 0.0281, "step": 3657 }, { "epoch": 1.6642402183803457, "grad_norm": 1.04526534320909, "learning_rate": 7.507835340797066e-07, "loss": 0.0363, "step": 3658 }, { "epoch": 1.664695177434031, "grad_norm": 0.8086175323495584, "learning_rate": 7.506598727542304e-07, "loss": 0.0317, "step": 3659 }, { "epoch": 1.6651501364877161, "grad_norm": 4.554662744971681, "learning_rate": 7.505361909459614e-07, "loss": 0.0938, "step": 3660 }, { "epoch": 1.6656050955414012, "grad_norm": 0.8236876089297953, "learning_rate": 7.504124886650062e-07, "loss": 0.0393, "step": 3661 }, { "epoch": 1.6660600545950865, "grad_norm": 0.8676162176279067, "learning_rate": 7.502887659214735e-07, "loss": 0.0227, "step": 3662 }, { "epoch": 1.6665150136487716, "grad_norm": 1.1081527804613882, "learning_rate": 7.50165022725473e-07, "loss": 0.0458, "step": 3663 }, { "epoch": 1.6669699727024567, "grad_norm": 1.0278146251539704, "learning_rate": 7.500412590871166e-07, "loss": 0.0398, "step": 3664 }, { "epoch": 1.667424931756142, "grad_norm": 0.8965626461737278, "learning_rate": 7.499174750165177e-07, "loss": 0.0284, "step": 3665 }, { "epoch": 1.6678798908098271, "grad_norm": 0.9520236384149731, "learning_rate": 7.497936705237914e-07, "loss": 0.0362, "step": 3666 }, { "epoch": 1.6683348498635122, "grad_norm": 1.2192101235050825, "learning_rate": 7.496698456190542e-07, "loss": 0.0502, "step": 3667 }, { "epoch": 1.6687898089171975, "grad_norm": 0.8958567931078214, "learning_rate": 7.495460003124249e-07, "loss": 0.0259, "step": 3668 }, { "epoch": 1.6692447679708826, "grad_norm": 1.9219153150441108, "learning_rate": 7.494221346140234e-07, "loss": 0.0703, "step": 3669 }, { "epoch": 1.6696997270245677, "grad_norm": 0.8348513537444509, "learning_rate": 7.492982485339712e-07, "loss": 0.0273, "step": 3670 }, { "epoch": 1.670154686078253, "grad_norm": 0.863842633246758, "learning_rate": 7.491743420823922e-07, "loss": 0.0301, "step": 3671 }, { "epoch": 1.670609645131938, "grad_norm": 0.9316042530586504, "learning_rate": 7.490504152694113e-07, "loss": 0.0243, "step": 3672 }, { "epoch": 1.6710646041856232, "grad_norm": 1.1337656740346356, "learning_rate": 7.48926468105155e-07, "loss": 0.0488, "step": 3673 }, { "epoch": 1.6715195632393085, "grad_norm": 0.7595834957656284, "learning_rate": 7.488025005997518e-07, "loss": 0.0324, "step": 3674 }, { "epoch": 1.6719745222929936, "grad_norm": 1.1570896267456845, "learning_rate": 7.486785127633319e-07, "loss": 0.0238, "step": 3675 }, { "epoch": 1.6724294813466787, "grad_norm": 0.8927737854789384, "learning_rate": 7.485545046060271e-07, "loss": 0.0346, "step": 3676 }, { "epoch": 1.672884440400364, "grad_norm": 0.9553064766995415, "learning_rate": 7.484304761379705e-07, "loss": 0.037, "step": 3677 }, { "epoch": 1.673339399454049, "grad_norm": 0.9618442170053835, "learning_rate": 7.483064273692974e-07, "loss": 0.0359, "step": 3678 }, { "epoch": 1.6737943585077342, "grad_norm": 0.8718376417454753, "learning_rate": 7.481823583101443e-07, "loss": 0.0234, "step": 3679 }, { "epoch": 1.6742493175614195, "grad_norm": 1.052505940421394, "learning_rate": 7.480582689706497e-07, "loss": 0.035, "step": 3680 }, { "epoch": 1.6747042766151048, "grad_norm": 1.0884630808126208, "learning_rate": 7.479341593609533e-07, "loss": 0.0602, "step": 3681 }, { "epoch": 1.6751592356687897, "grad_norm": 1.0840801470210744, "learning_rate": 7.478100294911975e-07, "loss": 0.0433, "step": 3682 }, { "epoch": 1.675614194722475, "grad_norm": 1.200406286672445, "learning_rate": 7.47685879371525e-07, "loss": 0.036, "step": 3683 }, { "epoch": 1.6760691537761603, "grad_norm": 0.9764248753732958, "learning_rate": 7.47561709012081e-07, "loss": 0.0387, "step": 3684 }, { "epoch": 1.6765241128298451, "grad_norm": 0.7950823516132646, "learning_rate": 7.474375184230121e-07, "loss": 0.0261, "step": 3685 }, { "epoch": 1.6769790718835305, "grad_norm": 0.8303835123666623, "learning_rate": 7.473133076144666e-07, "loss": 0.0183, "step": 3686 }, { "epoch": 1.6774340309372158, "grad_norm": 0.7261004188029214, "learning_rate": 7.471890765965946e-07, "loss": 0.0296, "step": 3687 }, { "epoch": 1.6778889899909009, "grad_norm": 1.2408317146285244, "learning_rate": 7.470648253795474e-07, "loss": 0.0575, "step": 3688 }, { "epoch": 1.678343949044586, "grad_norm": 1.0272711801909127, "learning_rate": 7.469405539734785e-07, "loss": 0.0218, "step": 3689 }, { "epoch": 1.6787989080982713, "grad_norm": 0.7899434399693597, "learning_rate": 7.468162623885427e-07, "loss": 0.0295, "step": 3690 }, { "epoch": 1.6792538671519563, "grad_norm": 1.2154996288437139, "learning_rate": 7.466919506348963e-07, "loss": 0.0331, "step": 3691 }, { "epoch": 1.6797088262056414, "grad_norm": 0.8753502964488683, "learning_rate": 7.465676187226981e-07, "loss": 0.0364, "step": 3692 }, { "epoch": 1.6801637852593267, "grad_norm": 1.0613681944682294, "learning_rate": 7.464432666621073e-07, "loss": 0.052, "step": 3693 }, { "epoch": 1.6806187443130118, "grad_norm": 0.8706172108818153, "learning_rate": 7.463188944632859e-07, "loss": 0.0268, "step": 3694 }, { "epoch": 1.681073703366697, "grad_norm": 0.9180918998641694, "learning_rate": 7.461945021363967e-07, "loss": 0.0455, "step": 3695 }, { "epoch": 1.6815286624203822, "grad_norm": 0.7611821979736584, "learning_rate": 7.460700896916046e-07, "loss": 0.0297, "step": 3696 }, { "epoch": 1.6819836214740673, "grad_norm": 1.0916740537094967, "learning_rate": 7.459456571390761e-07, "loss": 0.0319, "step": 3697 }, { "epoch": 1.6824385805277524, "grad_norm": 0.705654472158198, "learning_rate": 7.458212044889788e-07, "loss": 0.0242, "step": 3698 }, { "epoch": 1.6828935395814377, "grad_norm": 0.8874749508975458, "learning_rate": 7.456967317514833e-07, "loss": 0.0207, "step": 3699 }, { "epoch": 1.6833484986351228, "grad_norm": 0.9806802836288482, "learning_rate": 7.455722389367602e-07, "loss": 0.0286, "step": 3700 }, { "epoch": 1.683803457688808, "grad_norm": 0.7532816730354085, "learning_rate": 7.454477260549827e-07, "loss": 0.0243, "step": 3701 }, { "epoch": 1.6842584167424932, "grad_norm": 1.1462607704239556, "learning_rate": 7.453231931163255e-07, "loss": 0.0512, "step": 3702 }, { "epoch": 1.6847133757961783, "grad_norm": 40.66371856571688, "learning_rate": 7.451986401309649e-07, "loss": 0.645, "step": 3703 }, { "epoch": 1.6851683348498634, "grad_norm": 1.1121575645010235, "learning_rate": 7.450740671090787e-07, "loss": 0.0423, "step": 3704 }, { "epoch": 1.6856232939035487, "grad_norm": 0.8505253727798028, "learning_rate": 7.449494740608464e-07, "loss": 0.029, "step": 3705 }, { "epoch": 1.686078252957234, "grad_norm": 0.9345417646211698, "learning_rate": 7.448248609964494e-07, "loss": 0.0271, "step": 3706 }, { "epoch": 1.6865332120109189, "grad_norm": 0.9346485711465193, "learning_rate": 7.447002279260704e-07, "loss": 0.0429, "step": 3707 }, { "epoch": 1.6869881710646042, "grad_norm": 0.9248979759979246, "learning_rate": 7.445755748598936e-07, "loss": 0.0461, "step": 3708 }, { "epoch": 1.6874431301182895, "grad_norm": 0.8958565470412361, "learning_rate": 7.444509018081053e-07, "loss": 0.0235, "step": 3709 }, { "epoch": 1.6878980891719744, "grad_norm": 0.7208865384958679, "learning_rate": 7.443262087808934e-07, "loss": 0.03, "step": 3710 }, { "epoch": 1.6883530482256597, "grad_norm": 0.7660731408274384, "learning_rate": 7.442014957884472e-07, "loss": 0.0274, "step": 3711 }, { "epoch": 1.688808007279345, "grad_norm": 1.1473081609312457, "learning_rate": 7.440767628409575e-07, "loss": 0.0297, "step": 3712 }, { "epoch": 1.68926296633303, "grad_norm": 0.7256622526441203, "learning_rate": 7.439520099486167e-07, "loss": 0.039, "step": 3713 }, { "epoch": 1.6897179253867152, "grad_norm": 1.2266936277710727, "learning_rate": 7.438272371216196e-07, "loss": 0.0478, "step": 3714 }, { "epoch": 1.6901728844404005, "grad_norm": 0.7178531461562944, "learning_rate": 7.437024443701618e-07, "loss": 0.0272, "step": 3715 }, { "epoch": 1.6906278434940856, "grad_norm": 0.6019757596437331, "learning_rate": 7.435776317044407e-07, "loss": 0.0228, "step": 3716 }, { "epoch": 1.6910828025477707, "grad_norm": 1.1350857398191934, "learning_rate": 7.434527991346555e-07, "loss": 0.0446, "step": 3717 }, { "epoch": 1.691537761601456, "grad_norm": 1.4111303386014855, "learning_rate": 7.433279466710071e-07, "loss": 0.0296, "step": 3718 }, { "epoch": 1.691992720655141, "grad_norm": 1.4284955284416092, "learning_rate": 7.432030743236976e-07, "loss": 0.049, "step": 3719 }, { "epoch": 1.6924476797088261, "grad_norm": 1.0529889193654127, "learning_rate": 7.430781821029313e-07, "loss": 0.0306, "step": 3720 }, { "epoch": 1.6929026387625115, "grad_norm": 0.9560984591880182, "learning_rate": 7.429532700189137e-07, "loss": 0.0282, "step": 3721 }, { "epoch": 1.6933575978161965, "grad_norm": 0.9945946986153701, "learning_rate": 7.42828338081852e-07, "loss": 0.0503, "step": 3722 }, { "epoch": 1.6938125568698816, "grad_norm": 1.0148833867485094, "learning_rate": 7.42703386301955e-07, "loss": 0.037, "step": 3723 }, { "epoch": 1.694267515923567, "grad_norm": 1.0436897406518646, "learning_rate": 7.425784146894335e-07, "loss": 0.0478, "step": 3724 }, { "epoch": 1.694722474977252, "grad_norm": 1.175336289222832, "learning_rate": 7.424534232544992e-07, "loss": 0.0208, "step": 3725 }, { "epoch": 1.6951774340309371, "grad_norm": 78.68436289367251, "learning_rate": 7.423284120073663e-07, "loss": 0.0351, "step": 3726 }, { "epoch": 1.6956323930846224, "grad_norm": 7.406878776961253, "learning_rate": 7.422033809582498e-07, "loss": 0.0304, "step": 3727 }, { "epoch": 1.6960873521383075, "grad_norm": 1.0749149579680604, "learning_rate": 7.420783301173668e-07, "loss": 0.0477, "step": 3728 }, { "epoch": 1.6965423111919926, "grad_norm": 3.786803667806749, "learning_rate": 7.419532594949358e-07, "loss": 0.0411, "step": 3729 }, { "epoch": 1.696997270245678, "grad_norm": 1.1537602219972758, "learning_rate": 7.418281691011772e-07, "loss": 0.0331, "step": 3730 }, { "epoch": 1.697452229299363, "grad_norm": 2.7147704285370287, "learning_rate": 7.417030589463127e-07, "loss": 0.0318, "step": 3731 }, { "epoch": 1.697907188353048, "grad_norm": 11.266052947029381, "learning_rate": 7.415779290405657e-07, "loss": 0.0341, "step": 3732 }, { "epoch": 1.6983621474067334, "grad_norm": 1.432507590928785, "learning_rate": 7.414527793941613e-07, "loss": 0.0292, "step": 3733 }, { "epoch": 1.6988171064604187, "grad_norm": 1.1431089996570076, "learning_rate": 7.413276100173262e-07, "loss": 0.0208, "step": 3734 }, { "epoch": 1.6992720655141036, "grad_norm": 1.3513740040535898, "learning_rate": 7.412024209202886e-07, "loss": 0.0376, "step": 3735 }, { "epoch": 1.699727024567789, "grad_norm": 0.6788508152193727, "learning_rate": 7.410772121132785e-07, "loss": 0.0297, "step": 3736 }, { "epoch": 1.7001819836214742, "grad_norm": 1.6159940575943985, "learning_rate": 7.409519836065271e-07, "loss": 0.0556, "step": 3737 }, { "epoch": 1.700636942675159, "grad_norm": 1.0076422755360128, "learning_rate": 7.40826735410268e-07, "loss": 0.052, "step": 3738 }, { "epoch": 1.7010919017288444, "grad_norm": 0.9164748283137955, "learning_rate": 7.407014675347355e-07, "loss": 0.0208, "step": 3739 }, { "epoch": 1.7015468607825297, "grad_norm": 1.1280887848017158, "learning_rate": 7.405761799901661e-07, "loss": 0.0513, "step": 3740 }, { "epoch": 1.7020018198362148, "grad_norm": 0.9919321276651568, "learning_rate": 7.404508727867977e-07, "loss": 0.0349, "step": 3741 }, { "epoch": 1.7024567788898999, "grad_norm": 3.622725772674169, "learning_rate": 7.403255459348699e-07, "loss": 0.0755, "step": 3742 }, { "epoch": 1.7029117379435852, "grad_norm": 0.7688278344587078, "learning_rate": 7.402001994446237e-07, "loss": 0.0295, "step": 3743 }, { "epoch": 1.7033666969972703, "grad_norm": 0.9832377218860502, "learning_rate": 7.400748333263019e-07, "loss": 0.0365, "step": 3744 }, { "epoch": 1.7038216560509554, "grad_norm": 1.0631844623754603, "learning_rate": 7.39949447590149e-07, "loss": 0.0369, "step": 3745 }, { "epoch": 1.7042766151046407, "grad_norm": 1.4076733921937195, "learning_rate": 7.398240422464108e-07, "loss": 0.0499, "step": 3746 }, { "epoch": 1.7047315741583258, "grad_norm": 1.0346164875501553, "learning_rate": 7.396986173053348e-07, "loss": 0.0539, "step": 3747 }, { "epoch": 1.7051865332120109, "grad_norm": 0.7206067317306014, "learning_rate": 7.395731727771705e-07, "loss": 0.0259, "step": 3748 }, { "epoch": 1.7056414922656962, "grad_norm": 1.192089379444426, "learning_rate": 7.394477086721682e-07, "loss": 0.033, "step": 3749 }, { "epoch": 1.7060964513193813, "grad_norm": 1.024298981624118, "learning_rate": 7.393222250005806e-07, "loss": 0.0379, "step": 3750 }, { "epoch": 1.7065514103730663, "grad_norm": 0.9764971468490846, "learning_rate": 7.391967217726615e-07, "loss": 0.0256, "step": 3751 }, { "epoch": 1.7070063694267517, "grad_norm": 0.9683370276294976, "learning_rate": 7.390711989986665e-07, "loss": 0.025, "step": 3752 }, { "epoch": 1.7074613284804367, "grad_norm": 0.7035699633736512, "learning_rate": 7.389456566888528e-07, "loss": 0.0262, "step": 3753 }, { "epoch": 1.7079162875341218, "grad_norm": 1.0529463272636899, "learning_rate": 7.38820094853479e-07, "loss": 0.043, "step": 3754 }, { "epoch": 1.7083712465878071, "grad_norm": 0.9874631410840193, "learning_rate": 7.386945135028058e-07, "loss": 0.0301, "step": 3755 }, { "epoch": 1.7088262056414922, "grad_norm": 0.6449660833935894, "learning_rate": 7.385689126470946e-07, "loss": 0.0217, "step": 3756 }, { "epoch": 1.7092811646951773, "grad_norm": 0.9368294192004077, "learning_rate": 7.384432922966093e-07, "loss": 0.0175, "step": 3757 }, { "epoch": 1.7097361237488626, "grad_norm": 0.6815690905609464, "learning_rate": 7.38317652461615e-07, "loss": 0.0187, "step": 3758 }, { "epoch": 1.7101910828025477, "grad_norm": 1.8955651127162172, "learning_rate": 7.381919931523784e-07, "loss": 0.0792, "step": 3759 }, { "epoch": 1.7106460418562328, "grad_norm": 1.0194984897583137, "learning_rate": 7.380663143791679e-07, "loss": 0.0248, "step": 3760 }, { "epoch": 1.7111010009099181, "grad_norm": 0.6187051880533316, "learning_rate": 7.37940616152253e-07, "loss": 0.0182, "step": 3761 }, { "epoch": 1.7115559599636034, "grad_norm": 0.9337740738490615, "learning_rate": 7.378148984819057e-07, "loss": 0.0466, "step": 3762 }, { "epoch": 1.7120109190172883, "grad_norm": 1.110578891182551, "learning_rate": 7.376891613783986e-07, "loss": 0.0538, "step": 3763 }, { "epoch": 1.7124658780709736, "grad_norm": 1.211477770468016, "learning_rate": 7.375634048520069e-07, "loss": 0.0454, "step": 3764 }, { "epoch": 1.712920837124659, "grad_norm": 1.1901230804786793, "learning_rate": 7.374376289130066e-07, "loss": 0.0351, "step": 3765 }, { "epoch": 1.7133757961783438, "grad_norm": 0.7051683250449241, "learning_rate": 7.373118335716754e-07, "loss": 0.0243, "step": 3766 }, { "epoch": 1.713830755232029, "grad_norm": 1.072311999348232, "learning_rate": 7.37186018838293e-07, "loss": 0.0414, "step": 3767 }, { "epoch": 1.7142857142857144, "grad_norm": 1.0177376393706181, "learning_rate": 7.370601847231399e-07, "loss": 0.0573, "step": 3768 }, { "epoch": 1.7147406733393995, "grad_norm": 0.9126047875277017, "learning_rate": 7.369343312364993e-07, "loss": 0.0287, "step": 3769 }, { "epoch": 1.7151956323930846, "grad_norm": 0.8427095815722319, "learning_rate": 7.368084583886551e-07, "loss": 0.0242, "step": 3770 }, { "epoch": 1.71565059144677, "grad_norm": 1.1037462097778181, "learning_rate": 7.366825661898931e-07, "loss": 0.0586, "step": 3771 }, { "epoch": 1.716105550500455, "grad_norm": 0.6379990216929221, "learning_rate": 7.365566546505007e-07, "loss": 0.0208, "step": 3772 }, { "epoch": 1.71656050955414, "grad_norm": 0.5997286806411553, "learning_rate": 7.364307237807667e-07, "loss": 0.0212, "step": 3773 }, { "epoch": 1.7170154686078254, "grad_norm": 0.7954767329384013, "learning_rate": 7.363047735909818e-07, "loss": 0.0167, "step": 3774 }, { "epoch": 1.7174704276615105, "grad_norm": 0.8131282623785382, "learning_rate": 7.361788040914379e-07, "loss": 0.0225, "step": 3775 }, { "epoch": 1.7179253867151956, "grad_norm": 1.0264538324271766, "learning_rate": 7.360528152924285e-07, "loss": 0.0408, "step": 3776 }, { "epoch": 1.7183803457688809, "grad_norm": 0.8178441740898147, "learning_rate": 7.359268072042492e-07, "loss": 0.0385, "step": 3777 }, { "epoch": 1.718835304822566, "grad_norm": 0.9453598849658956, "learning_rate": 7.358007798371965e-07, "loss": 0.0288, "step": 3778 }, { "epoch": 1.719290263876251, "grad_norm": 0.8470324928918932, "learning_rate": 7.356747332015691e-07, "loss": 0.0281, "step": 3779 }, { "epoch": 1.7197452229299364, "grad_norm": 0.8744776193125646, "learning_rate": 7.355486673076669e-07, "loss": 0.0322, "step": 3780 }, { "epoch": 1.7202001819836215, "grad_norm": 0.7181493211313377, "learning_rate": 7.354225821657912e-07, "loss": 0.028, "step": 3781 }, { "epoch": 1.7206551410373065, "grad_norm": 0.7519670409672398, "learning_rate": 7.352964777862451e-07, "loss": 0.0203, "step": 3782 }, { "epoch": 1.7211101000909919, "grad_norm": 0.646174041118885, "learning_rate": 7.351703541793337e-07, "loss": 0.0348, "step": 3783 }, { "epoch": 1.721565059144677, "grad_norm": 1.1671627729644836, "learning_rate": 7.350442113553629e-07, "loss": 0.032, "step": 3784 }, { "epoch": 1.722020018198362, "grad_norm": 0.7185912588428264, "learning_rate": 7.349180493246404e-07, "loss": 0.0188, "step": 3785 }, { "epoch": 1.7224749772520473, "grad_norm": 0.6180995023310318, "learning_rate": 7.34791868097476e-07, "loss": 0.0233, "step": 3786 }, { "epoch": 1.7229299363057324, "grad_norm": 0.7555463794013543, "learning_rate": 7.346656676841804e-07, "loss": 0.0428, "step": 3787 }, { "epoch": 1.7233848953594175, "grad_norm": 0.7824145279611009, "learning_rate": 7.345394480950662e-07, "loss": 0.0255, "step": 3788 }, { "epoch": 1.7238398544131028, "grad_norm": 0.8484107480216518, "learning_rate": 7.344132093404474e-07, "loss": 0.0263, "step": 3789 }, { "epoch": 1.7242948134667881, "grad_norm": 0.8996948753053458, "learning_rate": 7.342869514306398e-07, "loss": 0.0265, "step": 3790 }, { "epoch": 1.724749772520473, "grad_norm": 0.6280842416504939, "learning_rate": 7.341606743759605e-07, "loss": 0.0312, "step": 3791 }, { "epoch": 1.7252047315741583, "grad_norm": 1.016173219997532, "learning_rate": 7.340343781867284e-07, "loss": 0.0228, "step": 3792 }, { "epoch": 1.7256596906278436, "grad_norm": 0.8999578458135713, "learning_rate": 7.339080628732636e-07, "loss": 0.0284, "step": 3793 }, { "epoch": 1.7261146496815285, "grad_norm": 0.834927621192341, "learning_rate": 7.337817284458886e-07, "loss": 0.0202, "step": 3794 }, { "epoch": 1.7265696087352138, "grad_norm": 0.804173606250411, "learning_rate": 7.336553749149262e-07, "loss": 0.0261, "step": 3795 }, { "epoch": 1.7270245677888991, "grad_norm": 0.8473927850132064, "learning_rate": 7.335290022907019e-07, "loss": 0.0259, "step": 3796 }, { "epoch": 1.7274795268425842, "grad_norm": 0.7272151610869207, "learning_rate": 7.33402610583542e-07, "loss": 0.0281, "step": 3797 }, { "epoch": 1.7279344858962693, "grad_norm": 1.0075066910175543, "learning_rate": 7.33276199803775e-07, "loss": 0.0312, "step": 3798 }, { "epoch": 1.7283894449499546, "grad_norm": 0.7834054563223253, "learning_rate": 7.331497699617302e-07, "loss": 0.0219, "step": 3799 }, { "epoch": 1.7288444040036397, "grad_norm": 0.8707944380006337, "learning_rate": 7.330233210677392e-07, "loss": 0.0203, "step": 3800 }, { "epoch": 1.7292993630573248, "grad_norm": 0.9741368887121999, "learning_rate": 7.328968531321349e-07, "loss": 0.0406, "step": 3801 }, { "epoch": 1.72975432211101, "grad_norm": 0.8623771400547839, "learning_rate": 7.327703661652512e-07, "loss": 0.037, "step": 3802 }, { "epoch": 1.7302092811646952, "grad_norm": 0.8423061005334296, "learning_rate": 7.326438601774246e-07, "loss": 0.033, "step": 3803 }, { "epoch": 1.7306642402183803, "grad_norm": 0.6466246650612856, "learning_rate": 7.325173351789922e-07, "loss": 0.0327, "step": 3804 }, { "epoch": 1.7311191992720656, "grad_norm": 0.8999293870973674, "learning_rate": 7.323907911802933e-07, "loss": 0.0324, "step": 3805 }, { "epoch": 1.7315741583257507, "grad_norm": 0.9511218213955839, "learning_rate": 7.322642281916683e-07, "loss": 0.0327, "step": 3806 }, { "epoch": 1.7320291173794358, "grad_norm": 0.831727761141349, "learning_rate": 7.321376462234595e-07, "loss": 0.0464, "step": 3807 }, { "epoch": 1.732484076433121, "grad_norm": 0.980103521169993, "learning_rate": 7.320110452860107e-07, "loss": 0.0372, "step": 3808 }, { "epoch": 1.7329390354868062, "grad_norm": 1.6047723414941832, "learning_rate": 7.31884425389667e-07, "loss": 0.0424, "step": 3809 }, { "epoch": 1.7333939945404913, "grad_norm": 1.0662038367836977, "learning_rate": 7.317577865447752e-07, "loss": 0.0519, "step": 3810 }, { "epoch": 1.7338489535941766, "grad_norm": 0.8461404819513842, "learning_rate": 7.316311287616836e-07, "loss": 0.0188, "step": 3811 }, { "epoch": 1.7343039126478617, "grad_norm": 1.0695805707265904, "learning_rate": 7.315044520507423e-07, "loss": 0.027, "step": 3812 }, { "epoch": 1.7347588717015467, "grad_norm": 0.8145859211512444, "learning_rate": 7.313777564223026e-07, "loss": 0.0303, "step": 3813 }, { "epoch": 1.735213830755232, "grad_norm": 0.9007311975721172, "learning_rate": 7.312510418867175e-07, "loss": 0.0305, "step": 3814 }, { "epoch": 1.7356687898089171, "grad_norm": 0.6918520191969464, "learning_rate": 7.311243084543417e-07, "loss": 0.0209, "step": 3815 }, { "epoch": 1.7361237488626022, "grad_norm": 0.8375113211167834, "learning_rate": 7.309975561355311e-07, "loss": 0.0426, "step": 3816 }, { "epoch": 1.7365787079162875, "grad_norm": 0.9873213990183141, "learning_rate": 7.308707849406434e-07, "loss": 0.0256, "step": 3817 }, { "epoch": 1.7370336669699729, "grad_norm": 0.712297481221133, "learning_rate": 7.307439948800378e-07, "loss": 0.0216, "step": 3818 }, { "epoch": 1.7374886260236577, "grad_norm": 1.2325971753044904, "learning_rate": 7.306171859640748e-07, "loss": 0.0574, "step": 3819 }, { "epoch": 1.737943585077343, "grad_norm": 0.9558651791714843, "learning_rate": 7.30490358203117e-07, "loss": 0.0255, "step": 3820 }, { "epoch": 1.7383985441310283, "grad_norm": 17.575786802882966, "learning_rate": 7.303635116075278e-07, "loss": 0.1601, "step": 3821 }, { "epoch": 1.7388535031847132, "grad_norm": 1.018035987435318, "learning_rate": 7.30236646187673e-07, "loss": 0.0373, "step": 3822 }, { "epoch": 1.7393084622383985, "grad_norm": 0.9556811834553849, "learning_rate": 7.301097619539191e-07, "loss": 0.0363, "step": 3823 }, { "epoch": 1.7397634212920838, "grad_norm": 1.241572496870294, "learning_rate": 7.299828589166346e-07, "loss": 0.06, "step": 3824 }, { "epoch": 1.740218380345769, "grad_norm": 0.8166222988277537, "learning_rate": 7.298559370861896e-07, "loss": 0.0252, "step": 3825 }, { "epoch": 1.740673339399454, "grad_norm": 1.1173234821462195, "learning_rate": 7.297289964729553e-07, "loss": 0.0299, "step": 3826 }, { "epoch": 1.7411282984531393, "grad_norm": 0.7798504746551336, "learning_rate": 7.296020370873048e-07, "loss": 0.0311, "step": 3827 }, { "epoch": 1.7415832575068244, "grad_norm": 0.8383688181428924, "learning_rate": 7.294750589396128e-07, "loss": 0.0413, "step": 3828 }, { "epoch": 1.7420382165605095, "grad_norm": 1.082885463357385, "learning_rate": 7.293480620402553e-07, "loss": 0.0463, "step": 3829 }, { "epoch": 1.7424931756141948, "grad_norm": 0.9973656997795323, "learning_rate": 7.292210463996099e-07, "loss": 0.0261, "step": 3830 }, { "epoch": 1.74294813466788, "grad_norm": 1.0052928013965352, "learning_rate": 7.290940120280556e-07, "loss": 0.0396, "step": 3831 }, { "epoch": 1.743403093721565, "grad_norm": 1.651311353857485, "learning_rate": 7.289669589359733e-07, "loss": 0.0523, "step": 3832 }, { "epoch": 1.7438580527752503, "grad_norm": 1.278989496859344, "learning_rate": 7.288398871337452e-07, "loss": 0.0525, "step": 3833 }, { "epoch": 1.7443130118289354, "grad_norm": 1.0268438234063935, "learning_rate": 7.287127966317549e-07, "loss": 0.0307, "step": 3834 }, { "epoch": 1.7447679708826205, "grad_norm": 0.7648794662975461, "learning_rate": 7.285856874403877e-07, "loss": 0.0222, "step": 3835 }, { "epoch": 1.7452229299363058, "grad_norm": 1.006851529013741, "learning_rate": 7.284585595700304e-07, "loss": 0.0566, "step": 3836 }, { "epoch": 1.7456778889899909, "grad_norm": 0.8149971442586608, "learning_rate": 7.283314130310715e-07, "loss": 0.0249, "step": 3837 }, { "epoch": 1.746132848043676, "grad_norm": 1.2170544246581032, "learning_rate": 7.282042478339004e-07, "loss": 0.045, "step": 3838 }, { "epoch": 1.7465878070973613, "grad_norm": 0.8514671750384716, "learning_rate": 7.280770639889089e-07, "loss": 0.0308, "step": 3839 }, { "epoch": 1.7470427661510464, "grad_norm": 0.8902366717267536, "learning_rate": 7.279498615064897e-07, "loss": 0.0362, "step": 3840 }, { "epoch": 1.7474977252047315, "grad_norm": 0.9782133727858022, "learning_rate": 7.27822640397037e-07, "loss": 0.0332, "step": 3841 }, { "epoch": 1.7479526842584168, "grad_norm": 0.8449555960048527, "learning_rate": 7.276954006709473e-07, "loss": 0.0267, "step": 3842 }, { "epoch": 1.7484076433121019, "grad_norm": 0.8475698753866293, "learning_rate": 7.275681423386175e-07, "loss": 0.0321, "step": 3843 }, { "epoch": 1.748862602365787, "grad_norm": 0.7378465304113939, "learning_rate": 7.274408654104469e-07, "loss": 0.0372, "step": 3844 }, { "epoch": 1.7493175614194723, "grad_norm": 1.2011041317412636, "learning_rate": 7.273135698968358e-07, "loss": 0.0657, "step": 3845 }, { "epoch": 1.7497725204731576, "grad_norm": 0.8030367436382724, "learning_rate": 7.271862558081863e-07, "loss": 0.0371, "step": 3846 }, { "epoch": 1.7502274795268424, "grad_norm": 0.8402346016854695, "learning_rate": 7.270589231549022e-07, "loss": 0.047, "step": 3847 }, { "epoch": 1.7506824385805277, "grad_norm": 0.9418665716548049, "learning_rate": 7.269315719473879e-07, "loss": 0.0398, "step": 3848 }, { "epoch": 1.751137397634213, "grad_norm": 0.929558541408714, "learning_rate": 7.268042021960506e-07, "loss": 0.0319, "step": 3849 }, { "epoch": 1.7515923566878981, "grad_norm": 0.9878620391238714, "learning_rate": 7.266768139112981e-07, "loss": 0.0623, "step": 3850 }, { "epoch": 1.7520473157415832, "grad_norm": 0.819522609931165, "learning_rate": 7.265494071035401e-07, "loss": 0.0439, "step": 3851 }, { "epoch": 1.7525022747952685, "grad_norm": 0.8481327697214706, "learning_rate": 7.264219817831875e-07, "loss": 0.05, "step": 3852 }, { "epoch": 1.7529572338489536, "grad_norm": 1.1093846943070194, "learning_rate": 7.262945379606531e-07, "loss": 0.0349, "step": 3853 }, { "epoch": 1.7534121929026387, "grad_norm": 0.808443300156524, "learning_rate": 7.261670756463511e-07, "loss": 0.0406, "step": 3854 }, { "epoch": 1.753867151956324, "grad_norm": 1.0484335853088693, "learning_rate": 7.260395948506968e-07, "loss": 0.0318, "step": 3855 }, { "epoch": 1.7543221110100091, "grad_norm": 1.1102847273842749, "learning_rate": 7.259120955841078e-07, "loss": 0.0266, "step": 3856 }, { "epoch": 1.7547770700636942, "grad_norm": 0.8315781079884639, "learning_rate": 7.257845778570024e-07, "loss": 0.0309, "step": 3857 }, { "epoch": 1.7552320291173795, "grad_norm": 1.0277337463926397, "learning_rate": 7.256570416798011e-07, "loss": 0.0544, "step": 3858 }, { "epoch": 1.7556869881710646, "grad_norm": 1.293453633869193, "learning_rate": 7.255294870629253e-07, "loss": 0.0352, "step": 3859 }, { "epoch": 1.7561419472247497, "grad_norm": 1.003178401462958, "learning_rate": 7.254019140167984e-07, "loss": 0.045, "step": 3860 }, { "epoch": 1.756596906278435, "grad_norm": 1.2045753545038513, "learning_rate": 7.252743225518449e-07, "loss": 0.0314, "step": 3861 }, { "epoch": 1.75705186533212, "grad_norm": 0.6745246843030414, "learning_rate": 7.251467126784912e-07, "loss": 0.0227, "step": 3862 }, { "epoch": 1.7575068243858052, "grad_norm": 1.233440290085374, "learning_rate": 7.250190844071648e-07, "loss": 0.0534, "step": 3863 }, { "epoch": 1.7579617834394905, "grad_norm": 1.1536957815152185, "learning_rate": 7.248914377482951e-07, "loss": 0.0409, "step": 3864 }, { "epoch": 1.7584167424931756, "grad_norm": 0.7042076943226453, "learning_rate": 7.247637727123126e-07, "loss": 0.0327, "step": 3865 }, { "epoch": 1.7588717015468607, "grad_norm": 1.0203170051864963, "learning_rate": 7.246360893096496e-07, "loss": 0.0307, "step": 3866 }, { "epoch": 1.759326660600546, "grad_norm": 1.0464284649863076, "learning_rate": 7.245083875507398e-07, "loss": 0.0383, "step": 3867 }, { "epoch": 1.759781619654231, "grad_norm": 0.8216104888076887, "learning_rate": 7.243806674460187e-07, "loss": 0.0347, "step": 3868 }, { "epoch": 1.7602365787079162, "grad_norm": 0.9540286122089443, "learning_rate": 7.242529290059225e-07, "loss": 0.0456, "step": 3869 }, { "epoch": 1.7606915377616015, "grad_norm": 0.7905796180269096, "learning_rate": 7.241251722408896e-07, "loss": 0.0251, "step": 3870 }, { "epoch": 1.7611464968152868, "grad_norm": 0.9789421989306953, "learning_rate": 7.239973971613599e-07, "loss": 0.0296, "step": 3871 }, { "epoch": 1.7616014558689717, "grad_norm": 0.6485552861057302, "learning_rate": 7.238696037777745e-07, "loss": 0.0162, "step": 3872 }, { "epoch": 1.762056414922657, "grad_norm": 0.8484775975562947, "learning_rate": 7.237417921005761e-07, "loss": 0.0269, "step": 3873 }, { "epoch": 1.7625113739763423, "grad_norm": 1.4659337832515194, "learning_rate": 7.236139621402086e-07, "loss": 0.0601, "step": 3874 }, { "epoch": 1.7629663330300271, "grad_norm": 1.5496307963385796, "learning_rate": 7.234861139071184e-07, "loss": 0.0574, "step": 3875 }, { "epoch": 1.7634212920837125, "grad_norm": 1.0288921857925006, "learning_rate": 7.233582474117519e-07, "loss": 0.0458, "step": 3876 }, { "epoch": 1.7638762511373978, "grad_norm": 1.0262148496841836, "learning_rate": 7.232303626645581e-07, "loss": 0.0411, "step": 3877 }, { "epoch": 1.7643312101910829, "grad_norm": 1.0096817894485346, "learning_rate": 7.231024596759873e-07, "loss": 0.0351, "step": 3878 }, { "epoch": 1.764786169244768, "grad_norm": 0.7731764384264912, "learning_rate": 7.229745384564908e-07, "loss": 0.0279, "step": 3879 }, { "epoch": 1.7652411282984533, "grad_norm": 0.9858571724231138, "learning_rate": 7.228465990165222e-07, "loss": 0.0505, "step": 3880 }, { "epoch": 1.7656960873521383, "grad_norm": 1.0833483270617126, "learning_rate": 7.227186413665358e-07, "loss": 0.0317, "step": 3881 }, { "epoch": 1.7661510464058234, "grad_norm": 0.8530795515741634, "learning_rate": 7.225906655169878e-07, "loss": 0.0305, "step": 3882 }, { "epoch": 1.7666060054595087, "grad_norm": 0.9495719424366784, "learning_rate": 7.224626714783357e-07, "loss": 0.0222, "step": 3883 }, { "epoch": 1.7670609645131938, "grad_norm": 0.8774401256019617, "learning_rate": 7.223346592610388e-07, "loss": 0.0336, "step": 3884 }, { "epoch": 1.767515923566879, "grad_norm": 0.8589687224546587, "learning_rate": 7.222066288755577e-07, "loss": 0.032, "step": 3885 }, { "epoch": 1.7679708826205642, "grad_norm": 1.1390085989956455, "learning_rate": 7.220785803323544e-07, "loss": 0.0231, "step": 3886 }, { "epoch": 1.7684258416742493, "grad_norm": 0.9656258028085632, "learning_rate": 7.219505136418924e-07, "loss": 0.0254, "step": 3887 }, { "epoch": 1.7688808007279344, "grad_norm": 0.8383407669677485, "learning_rate": 7.218224288146366e-07, "loss": 0.0397, "step": 3888 }, { "epoch": 1.7693357597816197, "grad_norm": 0.8348905586943535, "learning_rate": 7.216943258610537e-07, "loss": 0.028, "step": 3889 }, { "epoch": 1.7697907188353048, "grad_norm": 0.7891050435761247, "learning_rate": 7.215662047916117e-07, "loss": 0.0245, "step": 3890 }, { "epoch": 1.77024567788899, "grad_norm": 0.935485671349422, "learning_rate": 7.2143806561678e-07, "loss": 0.0343, "step": 3891 }, { "epoch": 1.7707006369426752, "grad_norm": 0.7562622590627616, "learning_rate": 7.213099083470295e-07, "loss": 0.03, "step": 3892 }, { "epoch": 1.7711555959963603, "grad_norm": 0.5801226328071186, "learning_rate": 7.211817329928329e-07, "loss": 0.0245, "step": 3893 }, { "epoch": 1.7716105550500454, "grad_norm": 1.0097850087648284, "learning_rate": 7.210535395646637e-07, "loss": 0.0335, "step": 3894 }, { "epoch": 1.7720655141037307, "grad_norm": 0.7294354630469511, "learning_rate": 7.209253280729979e-07, "loss": 0.0304, "step": 3895 }, { "epoch": 1.7725204731574158, "grad_norm": 0.8616866266446068, "learning_rate": 7.207970985283116e-07, "loss": 0.0263, "step": 3896 }, { "epoch": 1.7729754322111009, "grad_norm": 0.8479061266824421, "learning_rate": 7.206688509410837e-07, "loss": 0.0337, "step": 3897 }, { "epoch": 1.7734303912647862, "grad_norm": 1.148577492004868, "learning_rate": 7.205405853217937e-07, "loss": 0.0237, "step": 3898 }, { "epoch": 1.7738853503184715, "grad_norm": 0.8938910354250396, "learning_rate": 7.204123016809232e-07, "loss": 0.0331, "step": 3899 }, { "epoch": 1.7743403093721564, "grad_norm": 1.0866159591477482, "learning_rate": 7.202840000289548e-07, "loss": 0.0603, "step": 3900 }, { "epoch": 1.7747952684258417, "grad_norm": 1.530542727770905, "learning_rate": 7.201556803763724e-07, "loss": 0.0567, "step": 3901 }, { "epoch": 1.775250227479527, "grad_norm": 1.0605714244138433, "learning_rate": 7.200273427336623e-07, "loss": 0.0243, "step": 3902 }, { "epoch": 1.7757051865332119, "grad_norm": 1.064735349926052, "learning_rate": 7.198989871113113e-07, "loss": 0.0381, "step": 3903 }, { "epoch": 1.7761601455868972, "grad_norm": 1.1267313481343704, "learning_rate": 7.19770613519808e-07, "loss": 0.0286, "step": 3904 }, { "epoch": 1.7766151046405825, "grad_norm": 0.9329539327053867, "learning_rate": 7.196422219696429e-07, "loss": 0.0298, "step": 3905 }, { "epoch": 1.7770700636942676, "grad_norm": 0.7863683136074263, "learning_rate": 7.195138124713072e-07, "loss": 0.0345, "step": 3906 }, { "epoch": 1.7775250227479527, "grad_norm": 0.8916790469776092, "learning_rate": 7.19385385035294e-07, "loss": 0.0385, "step": 3907 }, { "epoch": 1.777979981801638, "grad_norm": 0.9937441432912799, "learning_rate": 7.192569396720978e-07, "loss": 0.0416, "step": 3908 }, { "epoch": 1.778434940855323, "grad_norm": 0.7151946721791683, "learning_rate": 7.191284763922149e-07, "loss": 0.0201, "step": 3909 }, { "epoch": 1.7788898999090081, "grad_norm": 0.9836818976929858, "learning_rate": 7.189999952061423e-07, "loss": 0.0243, "step": 3910 }, { "epoch": 1.7793448589626935, "grad_norm": 0.7657851716238115, "learning_rate": 7.188714961243791e-07, "loss": 0.0362, "step": 3911 }, { "epoch": 1.7797998180163785, "grad_norm": 0.8842901245531253, "learning_rate": 7.187429791574258e-07, "loss": 0.0185, "step": 3912 }, { "epoch": 1.7802547770700636, "grad_norm": 0.9495044451924771, "learning_rate": 7.186144443157839e-07, "loss": 0.0249, "step": 3913 }, { "epoch": 1.780709736123749, "grad_norm": 0.9684677482413906, "learning_rate": 7.184858916099569e-07, "loss": 0.0296, "step": 3914 }, { "epoch": 1.781164695177434, "grad_norm": 1.0724726574627688, "learning_rate": 7.183573210504494e-07, "loss": 0.0383, "step": 3915 }, { "epoch": 1.7816196542311191, "grad_norm": 1.270596980149381, "learning_rate": 7.182287326477679e-07, "loss": 0.0508, "step": 3916 }, { "epoch": 1.7820746132848044, "grad_norm": 0.7606507154650863, "learning_rate": 7.1810012641242e-07, "loss": 0.0248, "step": 3917 }, { "epoch": 1.7825295723384895, "grad_norm": 1.0649326860640616, "learning_rate": 7.179715023549144e-07, "loss": 0.036, "step": 3918 }, { "epoch": 1.7829845313921746, "grad_norm": 0.7544711117052222, "learning_rate": 7.178428604857621e-07, "loss": 0.0263, "step": 3919 }, { "epoch": 1.78343949044586, "grad_norm": 1.0419229926277427, "learning_rate": 7.17714200815475e-07, "loss": 0.0271, "step": 3920 }, { "epoch": 1.783894449499545, "grad_norm": 0.9393084076746055, "learning_rate": 7.175855233545667e-07, "loss": 0.0356, "step": 3921 }, { "epoch": 1.78434940855323, "grad_norm": 1.115723666108372, "learning_rate": 7.17456828113552e-07, "loss": 0.0292, "step": 3922 }, { "epoch": 1.7848043676069154, "grad_norm": 0.6680133604968634, "learning_rate": 7.173281151029472e-07, "loss": 0.0205, "step": 3923 }, { "epoch": 1.7852593266606005, "grad_norm": 0.865901551421732, "learning_rate": 7.171993843332704e-07, "loss": 0.0289, "step": 3924 }, { "epoch": 1.7857142857142856, "grad_norm": 1.1129983308164886, "learning_rate": 7.170706358150407e-07, "loss": 0.047, "step": 3925 }, { "epoch": 1.786169244767971, "grad_norm": 0.9400298717706935, "learning_rate": 7.16941869558779e-07, "loss": 0.0453, "step": 3926 }, { "epoch": 1.7866242038216562, "grad_norm": 0.6842026503710418, "learning_rate": 7.168130855750075e-07, "loss": 0.0232, "step": 3927 }, { "epoch": 1.787079162875341, "grad_norm": 0.9217662334646701, "learning_rate": 7.166842838742496e-07, "loss": 0.0245, "step": 3928 }, { "epoch": 1.7875341219290264, "grad_norm": 1.1030599308857272, "learning_rate": 7.165554644670306e-07, "loss": 0.0336, "step": 3929 }, { "epoch": 1.7879890809827117, "grad_norm": 1.1026604526334314, "learning_rate": 7.164266273638771e-07, "loss": 0.0378, "step": 3930 }, { "epoch": 1.7884440400363966, "grad_norm": 1.5887926623553705, "learning_rate": 7.162977725753168e-07, "loss": 0.0327, "step": 3931 }, { "epoch": 1.7888989990900819, "grad_norm": 0.9837958895808053, "learning_rate": 7.161689001118794e-07, "loss": 0.0386, "step": 3932 }, { "epoch": 1.7893539581437672, "grad_norm": 1.2315395333769672, "learning_rate": 7.160400099840958e-07, "loss": 0.0531, "step": 3933 }, { "epoch": 1.7898089171974523, "grad_norm": 1.2059638779102386, "learning_rate": 7.159111022024982e-07, "loss": 0.0327, "step": 3934 }, { "epoch": 1.7902638762511374, "grad_norm": 0.7270502037679625, "learning_rate": 7.157821767776202e-07, "loss": 0.0314, "step": 3935 }, { "epoch": 1.7907188353048227, "grad_norm": 0.777193159253887, "learning_rate": 7.156532337199972e-07, "loss": 0.0377, "step": 3936 }, { "epoch": 1.7911737943585078, "grad_norm": 1.0925081526619975, "learning_rate": 7.155242730401659e-07, "loss": 0.0237, "step": 3937 }, { "epoch": 1.7916287534121929, "grad_norm": 1.108169048735122, "learning_rate": 7.153952947486644e-07, "loss": 0.0575, "step": 3938 }, { "epoch": 1.7920837124658782, "grad_norm": 0.9238470014526249, "learning_rate": 7.15266298856032e-07, "loss": 0.0412, "step": 3939 }, { "epoch": 1.7925386715195633, "grad_norm": 0.8663122304383759, "learning_rate": 7.151372853728098e-07, "loss": 0.0394, "step": 3940 }, { "epoch": 1.7929936305732483, "grad_norm": 1.0591365523820153, "learning_rate": 7.150082543095403e-07, "loss": 0.028, "step": 3941 }, { "epoch": 1.7934485896269337, "grad_norm": 1.252544391193583, "learning_rate": 7.14879205676767e-07, "loss": 0.0515, "step": 3942 }, { "epoch": 1.7939035486806187, "grad_norm": 0.6981838194601431, "learning_rate": 7.147501394850356e-07, "loss": 0.0323, "step": 3943 }, { "epoch": 1.7943585077343038, "grad_norm": 0.845348932882649, "learning_rate": 7.146210557448925e-07, "loss": 0.0271, "step": 3944 }, { "epoch": 1.7948134667879891, "grad_norm": 0.784612004135099, "learning_rate": 7.144919544668862e-07, "loss": 0.0311, "step": 3945 }, { "epoch": 1.7952684258416742, "grad_norm": 0.9114331177344513, "learning_rate": 7.143628356615656e-07, "loss": 0.0301, "step": 3946 }, { "epoch": 1.7957233848953593, "grad_norm": 1.4688243871543698, "learning_rate": 7.142336993394824e-07, "loss": 0.0465, "step": 3947 }, { "epoch": 1.7961783439490446, "grad_norm": 1.0814349159961008, "learning_rate": 7.141045455111887e-07, "loss": 0.0376, "step": 3948 }, { "epoch": 1.7966333030027297, "grad_norm": 1.086395509576243, "learning_rate": 7.139753741872384e-07, "loss": 0.0318, "step": 3949 }, { "epoch": 1.7970882620564148, "grad_norm": 1.688019338245213, "learning_rate": 7.138461853781869e-07, "loss": 0.0487, "step": 3950 }, { "epoch": 1.7975432211101001, "grad_norm": 0.9303763264777298, "learning_rate": 7.137169790945907e-07, "loss": 0.0272, "step": 3951 }, { "epoch": 1.7979981801637852, "grad_norm": 0.854833042699942, "learning_rate": 7.135877553470082e-07, "loss": 0.0268, "step": 3952 }, { "epoch": 1.7984531392174703, "grad_norm": 1.1796506667330882, "learning_rate": 7.13458514145999e-07, "loss": 0.0331, "step": 3953 }, { "epoch": 1.7989080982711556, "grad_norm": 0.5769053836837927, "learning_rate": 7.133292555021237e-07, "loss": 0.0229, "step": 3954 }, { "epoch": 1.799363057324841, "grad_norm": 0.9059913995392809, "learning_rate": 7.131999794259454e-07, "loss": 0.0314, "step": 3955 }, { "epoch": 1.7998180163785258, "grad_norm": 0.8826386749933668, "learning_rate": 7.130706859280274e-07, "loss": 0.0247, "step": 3956 }, { "epoch": 1.800272975432211, "grad_norm": 0.9264143057669723, "learning_rate": 7.12941375018935e-07, "loss": 0.0417, "step": 3957 }, { "epoch": 1.8007279344858964, "grad_norm": 1.1022132185982216, "learning_rate": 7.128120467092353e-07, "loss": 0.0352, "step": 3958 }, { "epoch": 1.8011828935395813, "grad_norm": 1.0024497482792716, "learning_rate": 7.126827010094961e-07, "loss": 0.0274, "step": 3959 }, { "epoch": 1.8016378525932666, "grad_norm": 1.1305330734783876, "learning_rate": 7.125533379302871e-07, "loss": 0.0467, "step": 3960 }, { "epoch": 1.802092811646952, "grad_norm": 0.7450996721431363, "learning_rate": 7.124239574821791e-07, "loss": 0.0329, "step": 3961 }, { "epoch": 1.802547770700637, "grad_norm": 1.5206869080270693, "learning_rate": 7.122945596757448e-07, "loss": 0.0529, "step": 3962 }, { "epoch": 1.803002729754322, "grad_norm": 1.2617839265250215, "learning_rate": 7.121651445215576e-07, "loss": 0.0502, "step": 3963 }, { "epoch": 1.8034576888080074, "grad_norm": 0.7810563092094688, "learning_rate": 7.12035712030193e-07, "loss": 0.0318, "step": 3964 }, { "epoch": 1.8039126478616925, "grad_norm": 0.9372763027400182, "learning_rate": 7.119062622122276e-07, "loss": 0.0286, "step": 3965 }, { "epoch": 1.8043676069153776, "grad_norm": 0.967124554611715, "learning_rate": 7.117767950782393e-07, "loss": 0.0231, "step": 3966 }, { "epoch": 1.8048225659690629, "grad_norm": 1.1213518699278096, "learning_rate": 7.116473106388077e-07, "loss": 0.0294, "step": 3967 }, { "epoch": 1.805277525022748, "grad_norm": 1.339363445829072, "learning_rate": 7.115178089045137e-07, "loss": 0.032, "step": 3968 }, { "epoch": 1.805732484076433, "grad_norm": 0.9863059860527984, "learning_rate": 7.113882898859396e-07, "loss": 0.0357, "step": 3969 }, { "epoch": 1.8061874431301184, "grad_norm": 1.3171834751553098, "learning_rate": 7.11258753593669e-07, "loss": 0.0566, "step": 3970 }, { "epoch": 1.8066424021838035, "grad_norm": 0.7693548190907238, "learning_rate": 7.11129200038287e-07, "loss": 0.0444, "step": 3971 }, { "epoch": 1.8070973612374885, "grad_norm": 0.8243569668156009, "learning_rate": 7.109996292303804e-07, "loss": 0.022, "step": 3972 }, { "epoch": 1.8075523202911739, "grad_norm": 0.9254044247305087, "learning_rate": 7.108700411805369e-07, "loss": 0.0433, "step": 3973 }, { "epoch": 1.808007279344859, "grad_norm": 1.1041022494015365, "learning_rate": 7.10740435899346e-07, "loss": 0.0343, "step": 3974 }, { "epoch": 1.808462238398544, "grad_norm": 1.1786472930058083, "learning_rate": 7.106108133973983e-07, "loss": 0.0339, "step": 3975 }, { "epoch": 1.8089171974522293, "grad_norm": 1.3079894061724133, "learning_rate": 7.104811736852861e-07, "loss": 0.0316, "step": 3976 }, { "epoch": 1.8093721565059144, "grad_norm": 1.40479221043801, "learning_rate": 7.103515167736028e-07, "loss": 0.0298, "step": 3977 }, { "epoch": 1.8098271155595995, "grad_norm": 1.4735199677436455, "learning_rate": 7.102218426729434e-07, "loss": 0.0355, "step": 3978 }, { "epoch": 1.8102820746132848, "grad_norm": 0.9205589723170797, "learning_rate": 7.100921513939046e-07, "loss": 0.0379, "step": 3979 }, { "epoch": 1.81073703366697, "grad_norm": 0.7124814848324251, "learning_rate": 7.099624429470838e-07, "loss": 0.02, "step": 3980 }, { "epoch": 1.811191992720655, "grad_norm": 0.8901256449074896, "learning_rate": 7.098327173430805e-07, "loss": 0.0292, "step": 3981 }, { "epoch": 1.8116469517743403, "grad_norm": 1.2490602925155847, "learning_rate": 7.09702974592495e-07, "loss": 0.0628, "step": 3982 }, { "epoch": 1.8121019108280256, "grad_norm": 0.7497868880913693, "learning_rate": 7.095732147059294e-07, "loss": 0.0228, "step": 3983 }, { "epoch": 1.8125568698817105, "grad_norm": 0.6849902109622913, "learning_rate": 7.094434376939873e-07, "loss": 0.0235, "step": 3984 }, { "epoch": 1.8130118289353958, "grad_norm": 1.132189306079228, "learning_rate": 7.093136435672731e-07, "loss": 0.0385, "step": 3985 }, { "epoch": 1.8134667879890811, "grad_norm": 2.6611045010836074, "learning_rate": 7.091838323363934e-07, "loss": 0.0408, "step": 3986 }, { "epoch": 1.813921747042766, "grad_norm": 1.7592588189239606, "learning_rate": 7.090540040119555e-07, "loss": 0.0355, "step": 3987 }, { "epoch": 1.8143767060964513, "grad_norm": 1.0969796186112766, "learning_rate": 7.089241586045684e-07, "loss": 0.0305, "step": 3988 }, { "epoch": 1.8148316651501366, "grad_norm": 0.9133700955505974, "learning_rate": 7.087942961248427e-07, "loss": 0.0276, "step": 3989 }, { "epoch": 1.8152866242038217, "grad_norm": 1.1185585515149101, "learning_rate": 7.086644165833898e-07, "loss": 0.0541, "step": 3990 }, { "epoch": 1.8157415832575068, "grad_norm": 1.1193475844105445, "learning_rate": 7.085345199908234e-07, "loss": 0.0242, "step": 3991 }, { "epoch": 1.816196542311192, "grad_norm": 0.9103111936445267, "learning_rate": 7.084046063577575e-07, "loss": 0.0406, "step": 3992 }, { "epoch": 1.8166515013648772, "grad_norm": 0.9086730774544863, "learning_rate": 7.082746756948084e-07, "loss": 0.0353, "step": 3993 }, { "epoch": 1.8171064604185623, "grad_norm": 0.9660201887437788, "learning_rate": 7.081447280125934e-07, "loss": 0.0388, "step": 3994 }, { "epoch": 1.8175614194722476, "grad_norm": 1.1434364961877401, "learning_rate": 7.080147633217311e-07, "loss": 0.0265, "step": 3995 }, { "epoch": 1.8180163785259327, "grad_norm": 1.0385475046126111, "learning_rate": 7.078847816328418e-07, "loss": 0.0241, "step": 3996 }, { "epoch": 1.8184713375796178, "grad_norm": 0.9981597405010151, "learning_rate": 7.07754782956547e-07, "loss": 0.0355, "step": 3997 }, { "epoch": 1.818926296633303, "grad_norm": 1.0740060967896283, "learning_rate": 7.076247673034695e-07, "loss": 0.0441, "step": 3998 }, { "epoch": 1.8193812556869882, "grad_norm": 1.0238904235995776, "learning_rate": 7.074947346842336e-07, "loss": 0.0336, "step": 3999 }, { "epoch": 1.8198362147406733, "grad_norm": 0.6264762920496362, "learning_rate": 7.07364685109465e-07, "loss": 0.0202, "step": 4000 }, { "epoch": 1.8202911737943586, "grad_norm": 0.7826814467899753, "learning_rate": 7.072346185897909e-07, "loss": 0.0407, "step": 4001 }, { "epoch": 1.8207461328480437, "grad_norm": 0.9669527993014366, "learning_rate": 7.071045351358395e-07, "loss": 0.0315, "step": 4002 }, { "epoch": 1.8212010919017287, "grad_norm": 0.9454766462634271, "learning_rate": 7.069744347582409e-07, "loss": 0.0243, "step": 4003 }, { "epoch": 1.821656050955414, "grad_norm": 0.8653380647757337, "learning_rate": 7.068443174676261e-07, "loss": 0.0423, "step": 4004 }, { "epoch": 1.8221110100090991, "grad_norm": 0.859006069736462, "learning_rate": 7.067141832746278e-07, "loss": 0.0296, "step": 4005 }, { "epoch": 1.8225659690627842, "grad_norm": 1.222386420328136, "learning_rate": 7.065840321898799e-07, "loss": 0.0414, "step": 4006 }, { "epoch": 1.8230209281164695, "grad_norm": 0.7476407482123641, "learning_rate": 7.064538642240178e-07, "loss": 0.0195, "step": 4007 }, { "epoch": 1.8234758871701549, "grad_norm": 0.765852437859921, "learning_rate": 7.063236793876784e-07, "loss": 0.0288, "step": 4008 }, { "epoch": 1.8239308462238397, "grad_norm": 0.8576842990985998, "learning_rate": 7.061934776914997e-07, "loss": 0.0388, "step": 4009 }, { "epoch": 1.824385805277525, "grad_norm": 0.7009482505986668, "learning_rate": 7.060632591461209e-07, "loss": 0.0276, "step": 4010 }, { "epoch": 1.8248407643312103, "grad_norm": 0.8183935498134707, "learning_rate": 7.059330237621835e-07, "loss": 0.0223, "step": 4011 }, { "epoch": 1.8252957233848952, "grad_norm": 1.1711934386726914, "learning_rate": 7.05802771550329e-07, "loss": 0.0412, "step": 4012 }, { "epoch": 1.8257506824385805, "grad_norm": 0.9042799826431464, "learning_rate": 7.056725025212016e-07, "loss": 0.0419, "step": 4013 }, { "epoch": 1.8262056414922658, "grad_norm": 0.5730340920743366, "learning_rate": 7.05542216685446e-07, "loss": 0.0176, "step": 4014 }, { "epoch": 1.826660600545951, "grad_norm": 0.6203767088485237, "learning_rate": 7.05411914053709e-07, "loss": 0.0319, "step": 4015 }, { "epoch": 1.827115559599636, "grad_norm": 1.548563085421601, "learning_rate": 7.052815946366376e-07, "loss": 0.0327, "step": 4016 }, { "epoch": 1.8275705186533213, "grad_norm": 0.9822548912523246, "learning_rate": 7.051512584448814e-07, "loss": 0.0461, "step": 4017 }, { "epoch": 1.8280254777070064, "grad_norm": 1.228943025874863, "learning_rate": 7.05020905489091e-07, "loss": 0.0328, "step": 4018 }, { "epoch": 1.8284804367606915, "grad_norm": 1.108180260628315, "learning_rate": 7.04890535779918e-07, "loss": 0.0408, "step": 4019 }, { "epoch": 1.8289353958143768, "grad_norm": 0.9467203671334619, "learning_rate": 7.047601493280156e-07, "loss": 0.0323, "step": 4020 }, { "epoch": 1.829390354868062, "grad_norm": 0.6802668617801688, "learning_rate": 7.046297461440386e-07, "loss": 0.0214, "step": 4021 }, { "epoch": 1.829845313921747, "grad_norm": 1.187117131556638, "learning_rate": 7.044993262386428e-07, "loss": 0.0657, "step": 4022 }, { "epoch": 1.8303002729754323, "grad_norm": 0.9400432167871355, "learning_rate": 7.043688896224856e-07, "loss": 0.0256, "step": 4023 }, { "epoch": 1.8307552320291174, "grad_norm": 0.9988414437512534, "learning_rate": 7.042384363062255e-07, "loss": 0.0413, "step": 4024 }, { "epoch": 1.8312101910828025, "grad_norm": 0.7244247011007051, "learning_rate": 7.04107966300523e-07, "loss": 0.0312, "step": 4025 }, { "epoch": 1.8316651501364878, "grad_norm": 0.8758252253221269, "learning_rate": 7.03977479616039e-07, "loss": 0.0154, "step": 4026 }, { "epoch": 1.8321201091901729, "grad_norm": 0.7247552344465393, "learning_rate": 7.038469762634367e-07, "loss": 0.0175, "step": 4027 }, { "epoch": 1.832575068243858, "grad_norm": 0.8570788830281759, "learning_rate": 7.0371645625338e-07, "loss": 0.0271, "step": 4028 }, { "epoch": 1.8330300272975433, "grad_norm": 0.9075758669289613, "learning_rate": 7.035859195965344e-07, "loss": 0.0405, "step": 4029 }, { "epoch": 1.8334849863512284, "grad_norm": 0.8274213911121379, "learning_rate": 7.034553663035669e-07, "loss": 0.0331, "step": 4030 }, { "epoch": 1.8339399454049135, "grad_norm": 1.0131703619575876, "learning_rate": 7.033247963851456e-07, "loss": 0.0373, "step": 4031 }, { "epoch": 1.8343949044585988, "grad_norm": 0.8134944535188923, "learning_rate": 7.031942098519402e-07, "loss": 0.0324, "step": 4032 }, { "epoch": 1.8348498635122839, "grad_norm": 0.7390389132974826, "learning_rate": 7.030636067146216e-07, "loss": 0.0235, "step": 4033 }, { "epoch": 1.835304822565969, "grad_norm": 0.9241022058876349, "learning_rate": 7.02932986983862e-07, "loss": 0.033, "step": 4034 }, { "epoch": 1.8357597816196543, "grad_norm": 0.9208179643511066, "learning_rate": 7.028023506703353e-07, "loss": 0.0276, "step": 4035 }, { "epoch": 1.8362147406733396, "grad_norm": 0.8449904897428405, "learning_rate": 7.026716977847162e-07, "loss": 0.0313, "step": 4036 }, { "epoch": 1.8366696997270244, "grad_norm": 1.02630260433871, "learning_rate": 7.025410283376812e-07, "loss": 0.0507, "step": 4037 }, { "epoch": 1.8371246587807097, "grad_norm": 1.0441786074298933, "learning_rate": 7.024103423399082e-07, "loss": 0.0419, "step": 4038 }, { "epoch": 1.837579617834395, "grad_norm": 0.8791805869312558, "learning_rate": 7.02279639802076e-07, "loss": 0.0317, "step": 4039 }, { "epoch": 1.83803457688808, "grad_norm": 1.1214575356176135, "learning_rate": 7.021489207348651e-07, "loss": 0.0363, "step": 4040 }, { "epoch": 1.8384895359417652, "grad_norm": 0.6488297738082485, "learning_rate": 7.020181851489573e-07, "loss": 0.0394, "step": 4041 }, { "epoch": 1.8389444949954505, "grad_norm": 0.8903446294709281, "learning_rate": 7.018874330550358e-07, "loss": 0.0429, "step": 4042 }, { "epoch": 1.8393994540491356, "grad_norm": 1.1293127068720645, "learning_rate": 7.017566644637849e-07, "loss": 0.027, "step": 4043 }, { "epoch": 1.8398544131028207, "grad_norm": 0.7841840136003166, "learning_rate": 7.016258793858905e-07, "loss": 0.0224, "step": 4044 }, { "epoch": 1.840309372156506, "grad_norm": 1.0292650466784743, "learning_rate": 7.014950778320398e-07, "loss": 0.0358, "step": 4045 }, { "epoch": 1.8407643312101911, "grad_norm": 1.1221941825820771, "learning_rate": 7.013642598129213e-07, "loss": 0.0399, "step": 4046 }, { "epoch": 1.8412192902638762, "grad_norm": 1.0996823354807468, "learning_rate": 7.012334253392249e-07, "loss": 0.0383, "step": 4047 }, { "epoch": 1.8416742493175615, "grad_norm": 1.0481732602998695, "learning_rate": 7.011025744216415e-07, "loss": 0.0252, "step": 4048 }, { "epoch": 1.8421292083712466, "grad_norm": 1.1672531458453945, "learning_rate": 7.009717070708642e-07, "loss": 0.0269, "step": 4049 }, { "epoch": 1.8425841674249317, "grad_norm": 0.7862060682535311, "learning_rate": 7.008408232975864e-07, "loss": 0.041, "step": 4050 }, { "epoch": 1.843039126478617, "grad_norm": 20.205286091323693, "learning_rate": 7.007099231125036e-07, "loss": 0.1783, "step": 4051 }, { "epoch": 1.843494085532302, "grad_norm": 1.35895227941203, "learning_rate": 7.005790065263122e-07, "loss": 0.0477, "step": 4052 }, { "epoch": 1.8439490445859872, "grad_norm": 1.3750451279450835, "learning_rate": 7.004480735497101e-07, "loss": 0.0358, "step": 4053 }, { "epoch": 1.8444040036396725, "grad_norm": 0.7863513229093967, "learning_rate": 7.00317124193397e-07, "loss": 0.0414, "step": 4054 }, { "epoch": 1.8448589626933576, "grad_norm": 1.1929160105694212, "learning_rate": 7.001861584680726e-07, "loss": 0.0542, "step": 4055 }, { "epoch": 1.8453139217470427, "grad_norm": 0.7791838476359682, "learning_rate": 7.000551763844398e-07, "loss": 0.0269, "step": 4056 }, { "epoch": 1.845768880800728, "grad_norm": 1.0733714552656635, "learning_rate": 6.999241779532012e-07, "loss": 0.0461, "step": 4057 }, { "epoch": 1.846223839854413, "grad_norm": 0.6894483485734731, "learning_rate": 6.997931631850618e-07, "loss": 0.0178, "step": 4058 }, { "epoch": 1.8466787989080982, "grad_norm": 0.8129264821685671, "learning_rate": 6.996621320907272e-07, "loss": 0.0296, "step": 4059 }, { "epoch": 1.8471337579617835, "grad_norm": 0.7856221217019855, "learning_rate": 6.995310846809049e-07, "loss": 0.0436, "step": 4060 }, { "epoch": 1.8475887170154686, "grad_norm": 0.7154900051993326, "learning_rate": 6.994000209663035e-07, "loss": 0.021, "step": 4061 }, { "epoch": 1.8480436760691537, "grad_norm": 0.7320952247655172, "learning_rate": 6.992689409576329e-07, "loss": 0.0305, "step": 4062 }, { "epoch": 1.848498635122839, "grad_norm": 1.0659585703014094, "learning_rate": 6.991378446656042e-07, "loss": 0.0409, "step": 4063 }, { "epoch": 1.8489535941765243, "grad_norm": 0.9473440621035901, "learning_rate": 6.990067321009302e-07, "loss": 0.0362, "step": 4064 }, { "epoch": 1.8494085532302091, "grad_norm": 1.1223996707149757, "learning_rate": 6.988756032743246e-07, "loss": 0.048, "step": 4065 }, { "epoch": 1.8498635122838945, "grad_norm": 0.6788777097433921, "learning_rate": 6.987444581965031e-07, "loss": 0.0168, "step": 4066 }, { "epoch": 1.8503184713375798, "grad_norm": 0.9657163281756479, "learning_rate": 6.986132968781818e-07, "loss": 0.0242, "step": 4067 }, { "epoch": 1.8507734303912646, "grad_norm": 0.8169432426898959, "learning_rate": 6.984821193300789e-07, "loss": 0.0306, "step": 4068 }, { "epoch": 1.85122838944495, "grad_norm": 1.386990121795223, "learning_rate": 6.983509255629136e-07, "loss": 0.0446, "step": 4069 }, { "epoch": 1.8516833484986353, "grad_norm": 1.0024480748992748, "learning_rate": 6.982197155874061e-07, "loss": 0.029, "step": 4070 }, { "epoch": 1.8521383075523203, "grad_norm": 0.9668511892799548, "learning_rate": 6.980884894142789e-07, "loss": 0.0246, "step": 4071 }, { "epoch": 1.8525932666060054, "grad_norm": 1.4729638131920695, "learning_rate": 6.979572470542547e-07, "loss": 0.0462, "step": 4072 }, { "epoch": 1.8530482256596907, "grad_norm": 0.9710603272051386, "learning_rate": 6.978259885180584e-07, "loss": 0.0257, "step": 4073 }, { "epoch": 1.8535031847133758, "grad_norm": 1.3473958924094593, "learning_rate": 6.976947138164157e-07, "loss": 0.0366, "step": 4074 }, { "epoch": 1.853958143767061, "grad_norm": 0.7765129452956805, "learning_rate": 6.975634229600538e-07, "loss": 0.0366, "step": 4075 }, { "epoch": 1.8544131028207462, "grad_norm": 1.3216271214792206, "learning_rate": 6.974321159597009e-07, "loss": 0.0414, "step": 4076 }, { "epoch": 1.8548680618744313, "grad_norm": 1.151590363428501, "learning_rate": 6.973007928260873e-07, "loss": 0.0369, "step": 4077 }, { "epoch": 1.8553230209281164, "grad_norm": 0.6233348490535299, "learning_rate": 6.971694535699439e-07, "loss": 0.015, "step": 4078 }, { "epoch": 1.8557779799818017, "grad_norm": 0.9328396998838188, "learning_rate": 6.970380982020033e-07, "loss": 0.042, "step": 4079 }, { "epoch": 1.8562329390354868, "grad_norm": 1.0584122453517535, "learning_rate": 6.969067267329988e-07, "loss": 0.037, "step": 4080 }, { "epoch": 1.856687898089172, "grad_norm": 0.9994370943732479, "learning_rate": 6.967753391736661e-07, "loss": 0.0323, "step": 4081 }, { "epoch": 1.8571428571428572, "grad_norm": 1.2605327621822269, "learning_rate": 6.966439355347411e-07, "loss": 0.0366, "step": 4082 }, { "epoch": 1.8575978161965423, "grad_norm": 1.0400919135447442, "learning_rate": 6.965125158269618e-07, "loss": 0.0343, "step": 4083 }, { "epoch": 1.8580527752502274, "grad_norm": 1.272103505768358, "learning_rate": 6.963810800610672e-07, "loss": 0.0392, "step": 4084 }, { "epoch": 1.8585077343039127, "grad_norm": 0.7653471221667356, "learning_rate": 6.962496282477975e-07, "loss": 0.0344, "step": 4085 }, { "epoch": 1.8589626933575978, "grad_norm": 0.8936137099590746, "learning_rate": 6.961181603978945e-07, "loss": 0.0353, "step": 4086 }, { "epoch": 1.8594176524112829, "grad_norm": 0.8958969272471706, "learning_rate": 6.959866765221011e-07, "loss": 0.0169, "step": 4087 }, { "epoch": 1.8598726114649682, "grad_norm": 1.0401500310695713, "learning_rate": 6.958551766311615e-07, "loss": 0.026, "step": 4088 }, { "epoch": 1.8603275705186533, "grad_norm": 1.1243186302571482, "learning_rate": 6.957236607358215e-07, "loss": 0.041, "step": 4089 }, { "epoch": 1.8607825295723384, "grad_norm": 1.2112967299770292, "learning_rate": 6.955921288468276e-07, "loss": 0.036, "step": 4090 }, { "epoch": 1.8612374886260237, "grad_norm": 0.8735144540547889, "learning_rate": 6.954605809749284e-07, "loss": 0.0246, "step": 4091 }, { "epoch": 1.861692447679709, "grad_norm": 1.1125282366530937, "learning_rate": 6.953290171308731e-07, "loss": 0.0416, "step": 4092 }, { "epoch": 1.8621474067333939, "grad_norm": 1.332513714647686, "learning_rate": 6.951974373254125e-07, "loss": 0.0314, "step": 4093 }, { "epoch": 1.8626023657870792, "grad_norm": 0.9476721323618569, "learning_rate": 6.950658415692992e-07, "loss": 0.0283, "step": 4094 }, { "epoch": 1.8630573248407645, "grad_norm": 1.0189090151755704, "learning_rate": 6.94934229873286e-07, "loss": 0.0421, "step": 4095 }, { "epoch": 1.8635122838944493, "grad_norm": 0.9539485875710965, "learning_rate": 6.948026022481278e-07, "loss": 0.0363, "step": 4096 }, { "epoch": 1.8639672429481347, "grad_norm": 0.9817815382079151, "learning_rate": 6.946709587045807e-07, "loss": 0.0299, "step": 4097 }, { "epoch": 1.86442220200182, "grad_norm": 1.2295292370968602, "learning_rate": 6.94539299253402e-07, "loss": 0.0532, "step": 4098 }, { "epoch": 1.864877161055505, "grad_norm": 0.8600777234698181, "learning_rate": 6.944076239053503e-07, "loss": 0.028, "step": 4099 }, { "epoch": 1.8653321201091901, "grad_norm": 1.0237423712047447, "learning_rate": 6.942759326711855e-07, "loss": 0.0262, "step": 4100 }, { "epoch": 1.8657870791628755, "grad_norm": 1.0267940561327042, "learning_rate": 6.94144225561669e-07, "loss": 0.0347, "step": 4101 }, { "epoch": 1.8662420382165605, "grad_norm": 0.9690978056184496, "learning_rate": 6.940125025875629e-07, "loss": 0.0516, "step": 4102 }, { "epoch": 1.8666969972702456, "grad_norm": 1.0749287193543502, "learning_rate": 6.938807637596314e-07, "loss": 0.0363, "step": 4103 }, { "epoch": 1.867151956323931, "grad_norm": 1.2389726205268012, "learning_rate": 6.937490090886393e-07, "loss": 0.045, "step": 4104 }, { "epoch": 1.867606915377616, "grad_norm": 0.8088431469634747, "learning_rate": 6.936172385853532e-07, "loss": 0.0188, "step": 4105 }, { "epoch": 1.8680618744313011, "grad_norm": 0.6428331187170039, "learning_rate": 6.934854522605407e-07, "loss": 0.0185, "step": 4106 }, { "epoch": 1.8685168334849864, "grad_norm": 0.8392712888901571, "learning_rate": 6.933536501249708e-07, "loss": 0.039, "step": 4107 }, { "epoch": 1.8689717925386715, "grad_norm": 0.6830182292943082, "learning_rate": 6.932218321894139e-07, "loss": 0.0251, "step": 4108 }, { "epoch": 1.8694267515923566, "grad_norm": 1.20558136086588, "learning_rate": 6.930899984646414e-07, "loss": 0.0441, "step": 4109 }, { "epoch": 1.869881710646042, "grad_norm": 1.0185700731941658, "learning_rate": 6.929581489614262e-07, "loss": 0.05, "step": 4110 }, { "epoch": 1.870336669699727, "grad_norm": 0.9024604732312785, "learning_rate": 6.928262836905426e-07, "loss": 0.0299, "step": 4111 }, { "epoch": 1.870791628753412, "grad_norm": 0.680486791777421, "learning_rate": 6.926944026627657e-07, "loss": 0.0162, "step": 4112 }, { "epoch": 1.8712465878070974, "grad_norm": 1.1253668885952135, "learning_rate": 6.925625058888724e-07, "loss": 0.0381, "step": 4113 }, { "epoch": 1.8717015468607825, "grad_norm": 0.8241855714017988, "learning_rate": 6.924305933796408e-07, "loss": 0.0266, "step": 4114 }, { "epoch": 1.8721565059144676, "grad_norm": 0.7362539863624471, "learning_rate": 6.922986651458502e-07, "loss": 0.0278, "step": 4115 }, { "epoch": 1.872611464968153, "grad_norm": 0.8358694825476314, "learning_rate": 6.92166721198281e-07, "loss": 0.019, "step": 4116 }, { "epoch": 1.873066424021838, "grad_norm": 0.6813166826516351, "learning_rate": 6.920347615477152e-07, "loss": 0.0257, "step": 4117 }, { "epoch": 1.873521383075523, "grad_norm": 1.3362999230106634, "learning_rate": 6.919027862049359e-07, "loss": 0.0346, "step": 4118 }, { "epoch": 1.8739763421292084, "grad_norm": 1.0773861255309745, "learning_rate": 6.917707951807274e-07, "loss": 0.0402, "step": 4119 }, { "epoch": 1.8744313011828937, "grad_norm": 1.5090391068887976, "learning_rate": 6.916387884858757e-07, "loss": 0.0622, "step": 4120 }, { "epoch": 1.8748862602365786, "grad_norm": 1.0814332871678836, "learning_rate": 6.915067661311675e-07, "loss": 0.0372, "step": 4121 }, { "epoch": 1.8753412192902639, "grad_norm": 1.1080463636790143, "learning_rate": 6.913747281273915e-07, "loss": 0.039, "step": 4122 }, { "epoch": 1.8757961783439492, "grad_norm": 1.044868613837893, "learning_rate": 6.912426744853367e-07, "loss": 0.0332, "step": 4123 }, { "epoch": 1.876251137397634, "grad_norm": 0.8052447563458709, "learning_rate": 6.911106052157942e-07, "loss": 0.0218, "step": 4124 }, { "epoch": 1.8767060964513194, "grad_norm": 1.0542696681922439, "learning_rate": 6.909785203295562e-07, "loss": 0.0433, "step": 4125 }, { "epoch": 1.8771610555050047, "grad_norm": 1.3523572921052853, "learning_rate": 6.90846419837416e-07, "loss": 0.0565, "step": 4126 }, { "epoch": 1.8776160145586898, "grad_norm": 1.3197648533681108, "learning_rate": 6.907143037501681e-07, "loss": 0.0364, "step": 4127 }, { "epoch": 1.8780709736123748, "grad_norm": 1.0268854946890793, "learning_rate": 6.905821720786085e-07, "loss": 0.0261, "step": 4128 }, { "epoch": 1.8785259326660602, "grad_norm": 0.9618260791031727, "learning_rate": 6.904500248335346e-07, "loss": 0.0405, "step": 4129 }, { "epoch": 1.8789808917197452, "grad_norm": 0.6814303705698566, "learning_rate": 6.903178620257447e-07, "loss": 0.0257, "step": 4130 }, { "epoch": 1.8794358507734303, "grad_norm": 0.7651945434414092, "learning_rate": 6.901856836660385e-07, "loss": 0.0265, "step": 4131 }, { "epoch": 1.8798908098271156, "grad_norm": 1.1541918100913406, "learning_rate": 6.900534897652173e-07, "loss": 0.0393, "step": 4132 }, { "epoch": 1.8803457688808007, "grad_norm": 1.0211480100056702, "learning_rate": 6.89921280334083e-07, "loss": 0.0283, "step": 4133 }, { "epoch": 1.8808007279344858, "grad_norm": 0.9214011740193361, "learning_rate": 6.897890553834396e-07, "loss": 0.0247, "step": 4134 }, { "epoch": 1.8812556869881711, "grad_norm": 0.951384229988795, "learning_rate": 6.896568149240913e-07, "loss": 0.0412, "step": 4135 }, { "epoch": 1.8817106460418562, "grad_norm": 1.0565450486297854, "learning_rate": 6.895245589668448e-07, "loss": 0.0374, "step": 4136 }, { "epoch": 1.8821656050955413, "grad_norm": 1.243517633366771, "learning_rate": 6.893922875225071e-07, "loss": 0.0493, "step": 4137 }, { "epoch": 1.8826205641492266, "grad_norm": 1.2103037491169613, "learning_rate": 6.892600006018871e-07, "loss": 0.0546, "step": 4138 }, { "epoch": 1.8830755232029117, "grad_norm": 0.7863197402639055, "learning_rate": 6.891276982157946e-07, "loss": 0.0301, "step": 4139 }, { "epoch": 1.8835304822565968, "grad_norm": 0.9721812346183104, "learning_rate": 6.889953803750404e-07, "loss": 0.0463, "step": 4140 }, { "epoch": 1.8839854413102821, "grad_norm": 1.1252277535617454, "learning_rate": 6.888630470904375e-07, "loss": 0.0607, "step": 4141 }, { "epoch": 1.8844404003639672, "grad_norm": 1.312259976323399, "learning_rate": 6.88730698372799e-07, "loss": 0.0426, "step": 4142 }, { "epoch": 1.8848953594176523, "grad_norm": 0.9538521478964815, "learning_rate": 6.885983342329405e-07, "loss": 0.0314, "step": 4143 }, { "epoch": 1.8853503184713376, "grad_norm": 1.1426790715402737, "learning_rate": 6.884659546816775e-07, "loss": 0.0484, "step": 4144 }, { "epoch": 1.8858052775250227, "grad_norm": 0.9277579362346501, "learning_rate": 6.883335597298278e-07, "loss": 0.0349, "step": 4145 }, { "epoch": 1.8862602365787078, "grad_norm": 0.9096699117917342, "learning_rate": 6.882011493882104e-07, "loss": 0.0298, "step": 4146 }, { "epoch": 1.886715195632393, "grad_norm": 1.0214959288831358, "learning_rate": 6.880687236676448e-07, "loss": 0.0435, "step": 4147 }, { "epoch": 1.8871701546860784, "grad_norm": 1.0602938212968538, "learning_rate": 6.879362825789524e-07, "loss": 0.0287, "step": 4148 }, { "epoch": 1.8876251137397633, "grad_norm": 0.6469747925956356, "learning_rate": 6.878038261329556e-07, "loss": 0.0244, "step": 4149 }, { "epoch": 1.8880800727934486, "grad_norm": 1.343916847652637, "learning_rate": 6.876713543404784e-07, "loss": 0.0451, "step": 4150 }, { "epoch": 1.888535031847134, "grad_norm": 0.7898420132713252, "learning_rate": 6.875388672123458e-07, "loss": 0.028, "step": 4151 }, { "epoch": 1.8889899909008188, "grad_norm": 0.8669093538055873, "learning_rate": 6.874063647593835e-07, "loss": 0.033, "step": 4152 }, { "epoch": 1.889444949954504, "grad_norm": 1.0095854897481877, "learning_rate": 6.872738469924197e-07, "loss": 0.0446, "step": 4153 }, { "epoch": 1.8898999090081894, "grad_norm": 0.6535309281108523, "learning_rate": 6.871413139222826e-07, "loss": 0.0203, "step": 4154 }, { "epoch": 1.8903548680618745, "grad_norm": 1.4434859746995625, "learning_rate": 6.870087655598027e-07, "loss": 0.0581, "step": 4155 }, { "epoch": 1.8908098271155596, "grad_norm": 0.8965064499462224, "learning_rate": 6.868762019158109e-07, "loss": 0.0239, "step": 4156 }, { "epoch": 1.8912647861692449, "grad_norm": 0.837987670619362, "learning_rate": 6.867436230011397e-07, "loss": 0.0235, "step": 4157 }, { "epoch": 1.89171974522293, "grad_norm": 1.0495165626682539, "learning_rate": 6.866110288266232e-07, "loss": 0.0296, "step": 4158 }, { "epoch": 1.892174704276615, "grad_norm": 0.8307244457567708, "learning_rate": 6.864784194030956e-07, "loss": 0.0237, "step": 4159 }, { "epoch": 1.8926296633303004, "grad_norm": 1.000292855797175, "learning_rate": 6.863457947413943e-07, "loss": 0.0515, "step": 4160 }, { "epoch": 1.8930846223839854, "grad_norm": 1.1731473608716216, "learning_rate": 6.86213154852356e-07, "loss": 0.0436, "step": 4161 }, { "epoch": 1.8935395814376705, "grad_norm": 0.9375626694954665, "learning_rate": 6.860804997468196e-07, "loss": 0.032, "step": 4162 }, { "epoch": 1.8939945404913558, "grad_norm": 0.8306425586483974, "learning_rate": 6.859478294356252e-07, "loss": 0.0311, "step": 4163 }, { "epoch": 1.894449499545041, "grad_norm": 0.7708481468621479, "learning_rate": 6.858151439296136e-07, "loss": 0.0277, "step": 4164 }, { "epoch": 1.894904458598726, "grad_norm": 1.1030913869588976, "learning_rate": 6.856824432396278e-07, "loss": 0.0455, "step": 4165 }, { "epoch": 1.8953594176524113, "grad_norm": 0.9565243518964397, "learning_rate": 6.855497273765111e-07, "loss": 0.0291, "step": 4166 }, { "epoch": 1.8958143767060964, "grad_norm": 1.0758527035481869, "learning_rate": 6.85416996351109e-07, "loss": 0.0335, "step": 4167 }, { "epoch": 1.8962693357597815, "grad_norm": 0.9778559875310973, "learning_rate": 6.85284250174267e-07, "loss": 0.0402, "step": 4168 }, { "epoch": 1.8967242948134668, "grad_norm": 0.7361200252859433, "learning_rate": 6.851514888568328e-07, "loss": 0.0275, "step": 4169 }, { "epoch": 1.897179253867152, "grad_norm": 0.9261798798922614, "learning_rate": 6.850187124096551e-07, "loss": 0.0368, "step": 4170 }, { "epoch": 1.897634212920837, "grad_norm": 39.578026599209274, "learning_rate": 6.848859208435838e-07, "loss": 0.1393, "step": 4171 }, { "epoch": 1.8980891719745223, "grad_norm": 1.0552610028337523, "learning_rate": 6.8475311416947e-07, "loss": 0.0438, "step": 4172 }, { "epoch": 1.8985441310282076, "grad_norm": 0.9605690778475581, "learning_rate": 6.846202923981659e-07, "loss": 0.0373, "step": 4173 }, { "epoch": 1.8989990900818925, "grad_norm": 0.9575208514449565, "learning_rate": 6.844874555405255e-07, "loss": 0.0254, "step": 4174 }, { "epoch": 1.8994540491355778, "grad_norm": 0.7052470871219335, "learning_rate": 6.843546036074032e-07, "loss": 0.0178, "step": 4175 }, { "epoch": 1.8999090081892631, "grad_norm": 0.7164487974039468, "learning_rate": 6.842217366096552e-07, "loss": 0.0238, "step": 4176 }, { "epoch": 1.900363967242948, "grad_norm": 0.7671566595960942, "learning_rate": 6.840888545581389e-07, "loss": 0.0209, "step": 4177 }, { "epoch": 1.9008189262966333, "grad_norm": 1.0390197170790723, "learning_rate": 6.839559574637127e-07, "loss": 0.039, "step": 4178 }, { "epoch": 1.9012738853503186, "grad_norm": 0.6414439793666813, "learning_rate": 6.838230453372364e-07, "loss": 0.0192, "step": 4179 }, { "epoch": 1.9017288444040037, "grad_norm": 1.2458520848777779, "learning_rate": 6.83690118189571e-07, "loss": 0.0347, "step": 4180 }, { "epoch": 1.9021838034576888, "grad_norm": 1.156746262496976, "learning_rate": 6.835571760315786e-07, "loss": 0.0503, "step": 4181 }, { "epoch": 1.902638762511374, "grad_norm": 0.9082178255610965, "learning_rate": 6.834242188741229e-07, "loss": 0.0428, "step": 4182 }, { "epoch": 1.9030937215650592, "grad_norm": 1.6214537309541373, "learning_rate": 6.832912467280682e-07, "loss": 0.0322, "step": 4183 }, { "epoch": 1.9035486806187443, "grad_norm": 0.7691913329408557, "learning_rate": 6.831582596042807e-07, "loss": 0.0348, "step": 4184 }, { "epoch": 1.9040036396724296, "grad_norm": 1.0983076523054522, "learning_rate": 6.830252575136271e-07, "loss": 0.0313, "step": 4185 }, { "epoch": 1.9044585987261147, "grad_norm": 0.8547412053410237, "learning_rate": 6.828922404669763e-07, "loss": 0.0316, "step": 4186 }, { "epoch": 1.9049135577797998, "grad_norm": 0.7393819616240198, "learning_rate": 6.827592084751974e-07, "loss": 0.0403, "step": 4187 }, { "epoch": 1.905368516833485, "grad_norm": 0.8788736852128407, "learning_rate": 6.826261615491613e-07, "loss": 0.0384, "step": 4188 }, { "epoch": 1.9058234758871702, "grad_norm": 1.4148380630150457, "learning_rate": 6.8249309969974e-07, "loss": 0.0267, "step": 4189 }, { "epoch": 1.9062784349408552, "grad_norm": 0.9765548193347571, "learning_rate": 6.823600229378068e-07, "loss": 0.0383, "step": 4190 }, { "epoch": 1.9067333939945406, "grad_norm": 0.7882855935626946, "learning_rate": 6.822269312742359e-07, "loss": 0.0208, "step": 4191 }, { "epoch": 1.9071883530482256, "grad_norm": 1.0617831184371767, "learning_rate": 6.820938247199034e-07, "loss": 0.0445, "step": 4192 }, { "epoch": 1.9076433121019107, "grad_norm": 0.9668992207101721, "learning_rate": 6.819607032856855e-07, "loss": 0.0324, "step": 4193 }, { "epoch": 1.908098271155596, "grad_norm": 0.8196939977986368, "learning_rate": 6.818275669824609e-07, "loss": 0.0282, "step": 4194 }, { "epoch": 1.9085532302092811, "grad_norm": 1.1066187396115699, "learning_rate": 6.816944158211087e-07, "loss": 0.0381, "step": 4195 }, { "epoch": 1.9090081892629662, "grad_norm": 0.9384662290697264, "learning_rate": 6.815612498125092e-07, "loss": 0.0301, "step": 4196 }, { "epoch": 1.9094631483166515, "grad_norm": 1.1954384282006965, "learning_rate": 6.814280689675444e-07, "loss": 0.0595, "step": 4197 }, { "epoch": 1.9099181073703366, "grad_norm": 1.0241768825027684, "learning_rate": 6.81294873297097e-07, "loss": 0.029, "step": 4198 }, { "epoch": 1.9103730664240217, "grad_norm": 1.150601707094974, "learning_rate": 6.811616628120513e-07, "loss": 0.0609, "step": 4199 }, { "epoch": 1.910828025477707, "grad_norm": 0.647504011463273, "learning_rate": 6.810284375232928e-07, "loss": 0.0236, "step": 4200 }, { "epoch": 1.9112829845313923, "grad_norm": 1.5578478761525039, "learning_rate": 6.808951974417076e-07, "loss": 0.0281, "step": 4201 }, { "epoch": 1.9117379435850772, "grad_norm": 1.016244291104494, "learning_rate": 6.807619425781841e-07, "loss": 0.0485, "step": 4202 }, { "epoch": 1.9121929026387625, "grad_norm": 1.108720230682581, "learning_rate": 6.806286729436108e-07, "loss": 0.048, "step": 4203 }, { "epoch": 1.9126478616924478, "grad_norm": 0.8602376999414505, "learning_rate": 6.804953885488782e-07, "loss": 0.0274, "step": 4204 }, { "epoch": 1.9131028207461327, "grad_norm": 1.2094292706508534, "learning_rate": 6.803620894048773e-07, "loss": 0.0554, "step": 4205 }, { "epoch": 1.913557779799818, "grad_norm": 0.8386867712047067, "learning_rate": 6.802287755225011e-07, "loss": 0.0417, "step": 4206 }, { "epoch": 1.9140127388535033, "grad_norm": 0.9571175444121305, "learning_rate": 6.800954469126433e-07, "loss": 0.0336, "step": 4207 }, { "epoch": 1.9144676979071884, "grad_norm": 0.7056142275962773, "learning_rate": 6.799621035861989e-07, "loss": 0.0184, "step": 4208 }, { "epoch": 1.9149226569608735, "grad_norm": 0.7009913622658354, "learning_rate": 6.798287455540641e-07, "loss": 0.0282, "step": 4209 }, { "epoch": 1.9153776160145588, "grad_norm": 0.8765909795137835, "learning_rate": 6.796953728271362e-07, "loss": 0.0344, "step": 4210 }, { "epoch": 1.915832575068244, "grad_norm": 1.0911556690058781, "learning_rate": 6.795619854163141e-07, "loss": 0.0558, "step": 4211 }, { "epoch": 1.916287534121929, "grad_norm": 1.0871553320879448, "learning_rate": 6.794285833324972e-07, "loss": 0.0438, "step": 4212 }, { "epoch": 1.9167424931756143, "grad_norm": 0.8988929107533729, "learning_rate": 6.79295166586587e-07, "loss": 0.028, "step": 4213 }, { "epoch": 1.9171974522292994, "grad_norm": 1.306080652277042, "learning_rate": 6.791617351894854e-07, "loss": 0.0513, "step": 4214 }, { "epoch": 1.9176524112829845, "grad_norm": 0.9161282741667484, "learning_rate": 6.790282891520958e-07, "loss": 0.0274, "step": 4215 }, { "epoch": 1.9181073703366698, "grad_norm": 0.9586844584218064, "learning_rate": 6.78894828485323e-07, "loss": 0.0384, "step": 4216 }, { "epoch": 1.9185623293903549, "grad_norm": 0.8197150908588826, "learning_rate": 6.787613532000726e-07, "loss": 0.0276, "step": 4217 }, { "epoch": 1.91901728844404, "grad_norm": 0.8856313745264044, "learning_rate": 6.78627863307252e-07, "loss": 0.036, "step": 4218 }, { "epoch": 1.9194722474977253, "grad_norm": 0.6971728914920851, "learning_rate": 6.784943588177686e-07, "loss": 0.019, "step": 4219 }, { "epoch": 1.9199272065514104, "grad_norm": 0.9979041577053263, "learning_rate": 6.783608397425327e-07, "loss": 0.0447, "step": 4220 }, { "epoch": 1.9203821656050954, "grad_norm": 0.7346112649519935, "learning_rate": 6.782273060924543e-07, "loss": 0.0262, "step": 4221 }, { "epoch": 1.9208371246587808, "grad_norm": 1.1807035473102818, "learning_rate": 6.780937578784451e-07, "loss": 0.0469, "step": 4222 }, { "epoch": 1.9212920837124658, "grad_norm": 0.6702210617940425, "learning_rate": 6.779601951114185e-07, "loss": 0.0195, "step": 4223 }, { "epoch": 1.921747042766151, "grad_norm": 1.1533346596077907, "learning_rate": 6.778266178022883e-07, "loss": 0.0439, "step": 4224 }, { "epoch": 1.9222020018198362, "grad_norm": 0.6764064201493162, "learning_rate": 6.776930259619702e-07, "loss": 0.0336, "step": 4225 }, { "epoch": 1.9226569608735213, "grad_norm": 1.1386165149325191, "learning_rate": 6.775594196013802e-07, "loss": 0.0382, "step": 4226 }, { "epoch": 1.9231119199272064, "grad_norm": 3.24332714251142, "learning_rate": 6.774257987314363e-07, "loss": 0.0784, "step": 4227 }, { "epoch": 1.9235668789808917, "grad_norm": 1.0277744762488468, "learning_rate": 6.772921633630576e-07, "loss": 0.0311, "step": 4228 }, { "epoch": 1.924021838034577, "grad_norm": 0.726576771671021, "learning_rate": 6.771585135071636e-07, "loss": 0.0418, "step": 4229 }, { "epoch": 1.924476797088262, "grad_norm": 1.1770692279372243, "learning_rate": 6.770248491746763e-07, "loss": 0.0529, "step": 4230 }, { "epoch": 1.9249317561419472, "grad_norm": 1.6266775150178152, "learning_rate": 6.768911703765175e-07, "loss": 0.0372, "step": 4231 }, { "epoch": 1.9253867151956325, "grad_norm": 1.0797461161322193, "learning_rate": 6.767574771236113e-07, "loss": 0.0451, "step": 4232 }, { "epoch": 1.9258416742493174, "grad_norm": 0.9639394113285896, "learning_rate": 6.766237694268821e-07, "loss": 0.0275, "step": 4233 }, { "epoch": 1.9262966333030027, "grad_norm": 0.9072007126937582, "learning_rate": 6.764900472972562e-07, "loss": 0.0181, "step": 4234 }, { "epoch": 1.926751592356688, "grad_norm": 1.5040872011071607, "learning_rate": 6.763563107456606e-07, "loss": 0.0319, "step": 4235 }, { "epoch": 1.9272065514103731, "grad_norm": 1.04158968468468, "learning_rate": 6.762225597830236e-07, "loss": 0.0489, "step": 4236 }, { "epoch": 1.9276615104640582, "grad_norm": 0.9178343220498092, "learning_rate": 6.76088794420275e-07, "loss": 0.0241, "step": 4237 }, { "epoch": 1.9281164695177435, "grad_norm": 0.9334286406979412, "learning_rate": 6.759550146683453e-07, "loss": 0.0283, "step": 4238 }, { "epoch": 1.9285714285714286, "grad_norm": 0.9753964660497414, "learning_rate": 6.758212205381664e-07, "loss": 0.0438, "step": 4239 }, { "epoch": 1.9290263876251137, "grad_norm": 0.9099121474349217, "learning_rate": 6.756874120406714e-07, "loss": 0.04, "step": 4240 }, { "epoch": 1.929481346678799, "grad_norm": 0.8742928508453046, "learning_rate": 6.755535891867943e-07, "loss": 0.0212, "step": 4241 }, { "epoch": 1.929936305732484, "grad_norm": 0.9785274022199002, "learning_rate": 6.754197519874709e-07, "loss": 0.03, "step": 4242 }, { "epoch": 1.9303912647861692, "grad_norm": 0.9504940246900319, "learning_rate": 6.752859004536375e-07, "loss": 0.0308, "step": 4243 }, { "epoch": 1.9308462238398545, "grad_norm": 1.1601963296008369, "learning_rate": 6.751520345962319e-07, "loss": 0.0302, "step": 4244 }, { "epoch": 1.9313011828935396, "grad_norm": 0.7580665032053262, "learning_rate": 6.75018154426193e-07, "loss": 0.0225, "step": 4245 }, { "epoch": 1.9317561419472247, "grad_norm": 0.8210897607572772, "learning_rate": 6.748842599544608e-07, "loss": 0.024, "step": 4246 }, { "epoch": 1.93221110100091, "grad_norm": 1.059526032954988, "learning_rate": 6.747503511919767e-07, "loss": 0.0313, "step": 4247 }, { "epoch": 1.932666060054595, "grad_norm": 0.9415811744372881, "learning_rate": 6.746164281496832e-07, "loss": 0.0538, "step": 4248 }, { "epoch": 1.9331210191082802, "grad_norm": 1.0728625172493527, "learning_rate": 6.744824908385236e-07, "loss": 0.0495, "step": 4249 }, { "epoch": 1.9335759781619655, "grad_norm": 0.7927446446879307, "learning_rate": 6.743485392694428e-07, "loss": 0.0191, "step": 4250 }, { "epoch": 1.9340309372156506, "grad_norm": 0.8711957579775603, "learning_rate": 6.742145734533867e-07, "loss": 0.0316, "step": 4251 }, { "epoch": 1.9344858962693356, "grad_norm": 1.1100489184081845, "learning_rate": 6.740805934013027e-07, "loss": 0.0353, "step": 4252 }, { "epoch": 1.934940855323021, "grad_norm": 0.8778266355560188, "learning_rate": 6.739465991241385e-07, "loss": 0.0409, "step": 4253 }, { "epoch": 1.935395814376706, "grad_norm": 0.9566012365661203, "learning_rate": 6.738125906328437e-07, "loss": 0.0376, "step": 4254 }, { "epoch": 1.9358507734303911, "grad_norm": 0.7736510480006613, "learning_rate": 6.73678567938369e-07, "loss": 0.0322, "step": 4255 }, { "epoch": 1.9363057324840764, "grad_norm": 1.1057311427670853, "learning_rate": 6.735445310516661e-07, "loss": 0.0304, "step": 4256 }, { "epoch": 1.9367606915377618, "grad_norm": 0.5612673841835504, "learning_rate": 6.734104799836877e-07, "loss": 0.0156, "step": 4257 }, { "epoch": 1.9372156505914466, "grad_norm": 0.6609573038132939, "learning_rate": 6.732764147453881e-07, "loss": 0.0216, "step": 4258 }, { "epoch": 1.937670609645132, "grad_norm": 0.8014615928359863, "learning_rate": 6.731423353477224e-07, "loss": 0.0385, "step": 4259 }, { "epoch": 1.9381255686988172, "grad_norm": 1.121794534017643, "learning_rate": 6.730082418016469e-07, "loss": 0.0427, "step": 4260 }, { "epoch": 1.9385805277525021, "grad_norm": 1.7411607216755065, "learning_rate": 6.728741341181192e-07, "loss": 0.0343, "step": 4261 }, { "epoch": 1.9390354868061874, "grad_norm": 1.0742341457409461, "learning_rate": 6.727400123080981e-07, "loss": 0.037, "step": 4262 }, { "epoch": 1.9394904458598727, "grad_norm": 0.9244935983654368, "learning_rate": 6.726058763825431e-07, "loss": 0.0244, "step": 4263 }, { "epoch": 1.9399454049135578, "grad_norm": 0.8493894899700476, "learning_rate": 6.724717263524153e-07, "loss": 0.0209, "step": 4264 }, { "epoch": 1.940400363967243, "grad_norm": 0.8667628156368868, "learning_rate": 6.723375622286771e-07, "loss": 0.0454, "step": 4265 }, { "epoch": 1.9408553230209282, "grad_norm": 1.0986619625032086, "learning_rate": 6.722033840222916e-07, "loss": 0.0446, "step": 4266 }, { "epoch": 1.9413102820746133, "grad_norm": 0.8763434252594664, "learning_rate": 6.720691917442231e-07, "loss": 0.0244, "step": 4267 }, { "epoch": 1.9417652411282984, "grad_norm": 0.8954070951517947, "learning_rate": 6.719349854054372e-07, "loss": 0.0273, "step": 4268 }, { "epoch": 1.9422202001819837, "grad_norm": 2.3725385825166856, "learning_rate": 6.718007650169009e-07, "loss": 0.09, "step": 4269 }, { "epoch": 1.9426751592356688, "grad_norm": 1.1031721048191896, "learning_rate": 6.71666530589582e-07, "loss": 0.0468, "step": 4270 }, { "epoch": 1.943130118289354, "grad_norm": 0.9171484070630177, "learning_rate": 6.715322821344494e-07, "loss": 0.0396, "step": 4271 }, { "epoch": 1.9435850773430392, "grad_norm": 0.824285177476433, "learning_rate": 6.713980196624731e-07, "loss": 0.0284, "step": 4272 }, { "epoch": 1.9440400363967243, "grad_norm": 1.0799648993225142, "learning_rate": 6.712637431846249e-07, "loss": 0.0469, "step": 4273 }, { "epoch": 1.9444949954504094, "grad_norm": 1.2697084536558423, "learning_rate": 6.711294527118771e-07, "loss": 0.0523, "step": 4274 }, { "epoch": 1.9449499545040947, "grad_norm": 0.8334747100501294, "learning_rate": 6.709951482552031e-07, "loss": 0.0298, "step": 4275 }, { "epoch": 1.9454049135577798, "grad_norm": 120.30918605651311, "learning_rate": 6.708608298255778e-07, "loss": 0.1555, "step": 4276 }, { "epoch": 1.9458598726114649, "grad_norm": 1.0402696395612625, "learning_rate": 6.707264974339771e-07, "loss": 0.0356, "step": 4277 }, { "epoch": 1.9463148316651502, "grad_norm": 0.8762351820133611, "learning_rate": 6.70592151091378e-07, "loss": 0.04, "step": 4278 }, { "epoch": 1.9467697907188353, "grad_norm": 0.8800803492982241, "learning_rate": 6.704577908087589e-07, "loss": 0.0485, "step": 4279 }, { "epoch": 1.9472247497725204, "grad_norm": 0.9597542388688403, "learning_rate": 6.703234165970987e-07, "loss": 0.0301, "step": 4280 }, { "epoch": 1.9476797088262057, "grad_norm": 0.9345440364567065, "learning_rate": 6.701890284673781e-07, "loss": 0.0327, "step": 4281 }, { "epoch": 1.9481346678798908, "grad_norm": 1.12267590155542, "learning_rate": 6.700546264305786e-07, "loss": 0.028, "step": 4282 }, { "epoch": 1.9485896269335758, "grad_norm": 0.8304030902178564, "learning_rate": 6.699202104976831e-07, "loss": 0.0221, "step": 4283 }, { "epoch": 1.9490445859872612, "grad_norm": 0.999987474202185, "learning_rate": 6.697857806796752e-07, "loss": 0.0315, "step": 4284 }, { "epoch": 1.9494995450409465, "grad_norm": 1.0347485906598042, "learning_rate": 6.696513369875402e-07, "loss": 0.0478, "step": 4285 }, { "epoch": 1.9499545040946313, "grad_norm": 0.8015612152596213, "learning_rate": 6.695168794322641e-07, "loss": 0.0213, "step": 4286 }, { "epoch": 1.9504094631483166, "grad_norm": 1.1867111560350325, "learning_rate": 6.693824080248341e-07, "loss": 0.0374, "step": 4287 }, { "epoch": 1.950864422202002, "grad_norm": 0.9568022292132446, "learning_rate": 6.692479227762386e-07, "loss": 0.0398, "step": 4288 }, { "epoch": 1.9513193812556868, "grad_norm": 0.773511498356015, "learning_rate": 6.691134236974671e-07, "loss": 0.0203, "step": 4289 }, { "epoch": 1.9517743403093721, "grad_norm": 2.024978726827532, "learning_rate": 6.689789107995105e-07, "loss": 0.1021, "step": 4290 }, { "epoch": 1.9522292993630574, "grad_norm": 0.9602525331452725, "learning_rate": 6.688443840933604e-07, "loss": 0.0427, "step": 4291 }, { "epoch": 1.9526842584167425, "grad_norm": 0.7875203461312885, "learning_rate": 6.687098435900096e-07, "loss": 0.037, "step": 4292 }, { "epoch": 1.9531392174704276, "grad_norm": 0.7793210136524269, "learning_rate": 6.685752893004524e-07, "loss": 0.0262, "step": 4293 }, { "epoch": 1.953594176524113, "grad_norm": 1.2227423882366673, "learning_rate": 6.684407212356838e-07, "loss": 0.0389, "step": 4294 }, { "epoch": 1.954049135577798, "grad_norm": 1.3129321676759127, "learning_rate": 6.683061394067001e-07, "loss": 0.0522, "step": 4295 }, { "epoch": 1.9545040946314831, "grad_norm": 1.420144119877498, "learning_rate": 6.681715438244987e-07, "loss": 0.0501, "step": 4296 }, { "epoch": 1.9549590536851684, "grad_norm": 0.919602193915338, "learning_rate": 6.680369345000782e-07, "loss": 0.0331, "step": 4297 }, { "epoch": 1.9554140127388535, "grad_norm": 0.9068669236556334, "learning_rate": 6.679023114444384e-07, "loss": 0.0342, "step": 4298 }, { "epoch": 1.9558689717925386, "grad_norm": 1.080003176710164, "learning_rate": 6.677676746685797e-07, "loss": 0.036, "step": 4299 }, { "epoch": 1.956323930846224, "grad_norm": 1.131985505563795, "learning_rate": 6.676330241835044e-07, "loss": 0.0359, "step": 4300 }, { "epoch": 1.956778889899909, "grad_norm": 0.9914540084407584, "learning_rate": 6.674983600002154e-07, "loss": 0.0247, "step": 4301 }, { "epoch": 1.957233848953594, "grad_norm": 0.9172173864744413, "learning_rate": 6.67363682129717e-07, "loss": 0.0333, "step": 4302 }, { "epoch": 1.9576888080072794, "grad_norm": 0.7845669028144359, "learning_rate": 6.672289905830141e-07, "loss": 0.02, "step": 4303 }, { "epoch": 1.9581437670609645, "grad_norm": 0.8111804803118821, "learning_rate": 6.670942853711133e-07, "loss": 0.0261, "step": 4304 }, { "epoch": 1.9585987261146496, "grad_norm": 0.8498357245157323, "learning_rate": 6.669595665050221e-07, "loss": 0.04, "step": 4305 }, { "epoch": 1.959053685168335, "grad_norm": 0.866880953943408, "learning_rate": 6.668248339957491e-07, "loss": 0.0348, "step": 4306 }, { "epoch": 1.95950864422202, "grad_norm": 0.9900283959301989, "learning_rate": 6.66690087854304e-07, "loss": 0.027, "step": 4307 }, { "epoch": 1.959963603275705, "grad_norm": 0.6696632999792232, "learning_rate": 6.665553280916978e-07, "loss": 0.0217, "step": 4308 }, { "epoch": 1.9604185623293904, "grad_norm": 1.1876488528760454, "learning_rate": 6.664205547189423e-07, "loss": 0.0622, "step": 4309 }, { "epoch": 1.9608735213830755, "grad_norm": 0.6671086286598974, "learning_rate": 6.662857677470507e-07, "loss": 0.0191, "step": 4310 }, { "epoch": 1.9613284804367606, "grad_norm": 1.0459318542521274, "learning_rate": 6.661509671870369e-07, "loss": 0.037, "step": 4311 }, { "epoch": 1.9617834394904459, "grad_norm": 0.9862476420042158, "learning_rate": 6.660161530499166e-07, "loss": 0.0389, "step": 4312 }, { "epoch": 1.9622383985441312, "grad_norm": 1.211068179970699, "learning_rate": 6.658813253467059e-07, "loss": 0.0445, "step": 4313 }, { "epoch": 1.962693357597816, "grad_norm": 1.0369441453049026, "learning_rate": 6.657464840884224e-07, "loss": 0.0389, "step": 4314 }, { "epoch": 1.9631483166515014, "grad_norm": 0.776932075299223, "learning_rate": 6.656116292860848e-07, "loss": 0.0402, "step": 4315 }, { "epoch": 1.9636032757051867, "grad_norm": 1.9474167673337466, "learning_rate": 6.654767609507126e-07, "loss": 0.069, "step": 4316 }, { "epoch": 1.9640582347588715, "grad_norm": 0.734785190613041, "learning_rate": 6.653418790933268e-07, "loss": 0.0253, "step": 4317 }, { "epoch": 1.9645131938125568, "grad_norm": 1.116565629054357, "learning_rate": 6.652069837249495e-07, "loss": 0.0338, "step": 4318 }, { "epoch": 1.9649681528662422, "grad_norm": 1.126319903852273, "learning_rate": 6.650720748566035e-07, "loss": 0.0368, "step": 4319 }, { "epoch": 1.9654231119199272, "grad_norm": 0.8982135966633862, "learning_rate": 6.649371524993129e-07, "loss": 0.0256, "step": 4320 }, { "epoch": 1.9658780709736123, "grad_norm": 7.752143126337965, "learning_rate": 6.64802216664103e-07, "loss": 0.0838, "step": 4321 }, { "epoch": 1.9663330300272976, "grad_norm": 1.1280343904093422, "learning_rate": 6.646672673620004e-07, "loss": 0.0633, "step": 4322 }, { "epoch": 1.9667879890809827, "grad_norm": 1.1473276346049353, "learning_rate": 6.645323046040322e-07, "loss": 0.0445, "step": 4323 }, { "epoch": 1.9672429481346678, "grad_norm": 0.8266324376958577, "learning_rate": 6.643973284012271e-07, "loss": 0.0283, "step": 4324 }, { "epoch": 1.9676979071883531, "grad_norm": 0.5553424774202205, "learning_rate": 6.642623387646146e-07, "loss": 0.0256, "step": 4325 }, { "epoch": 1.9681528662420382, "grad_norm": 1.0139053293427727, "learning_rate": 6.641273357052259e-07, "loss": 0.0432, "step": 4326 }, { "epoch": 1.9686078252957233, "grad_norm": 1.0422102996256444, "learning_rate": 6.639923192340923e-07, "loss": 0.0257, "step": 4327 }, { "epoch": 1.9690627843494086, "grad_norm": 0.816611965197192, "learning_rate": 6.63857289362247e-07, "loss": 0.0378, "step": 4328 }, { "epoch": 1.9695177434030937, "grad_norm": 0.779790171101918, "learning_rate": 6.63722246100724e-07, "loss": 0.029, "step": 4329 }, { "epoch": 1.9699727024567788, "grad_norm": 0.7430433364747858, "learning_rate": 6.635871894605584e-07, "loss": 0.0344, "step": 4330 }, { "epoch": 1.9704276615104641, "grad_norm": 0.819102104229802, "learning_rate": 6.634521194527865e-07, "loss": 0.0207, "step": 4331 }, { "epoch": 1.9708826205641492, "grad_norm": 0.8404003015194281, "learning_rate": 6.633170360884455e-07, "loss": 0.0284, "step": 4332 }, { "epoch": 1.9713375796178343, "grad_norm": 0.967942942216688, "learning_rate": 6.631819393785737e-07, "loss": 0.046, "step": 4333 }, { "epoch": 1.9717925386715196, "grad_norm": 0.873626353646314, "learning_rate": 6.630468293342108e-07, "loss": 0.036, "step": 4334 }, { "epoch": 1.9722474977252047, "grad_norm": 1.3915115282604795, "learning_rate": 6.629117059663973e-07, "loss": 0.0408, "step": 4335 }, { "epoch": 1.9727024567788898, "grad_norm": 1.1844020284192815, "learning_rate": 6.627765692861751e-07, "loss": 0.06, "step": 4336 }, { "epoch": 1.973157415832575, "grad_norm": 0.8599784582213558, "learning_rate": 6.626414193045866e-07, "loss": 0.0381, "step": 4337 }, { "epoch": 1.9736123748862604, "grad_norm": 1.108192356244821, "learning_rate": 6.625062560326757e-07, "loss": 0.0533, "step": 4338 }, { "epoch": 1.9740673339399453, "grad_norm": 0.8972751821578755, "learning_rate": 6.623710794814877e-07, "loss": 0.0417, "step": 4339 }, { "epoch": 1.9745222929936306, "grad_norm": 0.6998975747243716, "learning_rate": 6.622358896620681e-07, "loss": 0.0264, "step": 4340 }, { "epoch": 1.974977252047316, "grad_norm": 0.630585696221952, "learning_rate": 6.621006865854643e-07, "loss": 0.0177, "step": 4341 }, { "epoch": 1.9754322111010008, "grad_norm": 0.9201476331511071, "learning_rate": 6.619654702627245e-07, "loss": 0.0361, "step": 4342 }, { "epoch": 1.975887170154686, "grad_norm": 0.8240047949129307, "learning_rate": 6.618302407048979e-07, "loss": 0.0383, "step": 4343 }, { "epoch": 1.9763421292083714, "grad_norm": 1.190852970262995, "learning_rate": 6.616949979230349e-07, "loss": 0.0743, "step": 4344 }, { "epoch": 1.9767970882620565, "grad_norm": 0.7574116574221624, "learning_rate": 6.615597419281867e-07, "loss": 0.0266, "step": 4345 }, { "epoch": 1.9772520473157416, "grad_norm": 1.2290408148376082, "learning_rate": 6.614244727314063e-07, "loss": 0.0619, "step": 4346 }, { "epoch": 1.9777070063694269, "grad_norm": 1.05033209920814, "learning_rate": 6.612891903437465e-07, "loss": 0.0349, "step": 4347 }, { "epoch": 1.978161965423112, "grad_norm": 0.8394140477279904, "learning_rate": 6.611538947762627e-07, "loss": 0.0332, "step": 4348 }, { "epoch": 1.978616924476797, "grad_norm": 0.8048107702683261, "learning_rate": 6.610185860400105e-07, "loss": 0.0364, "step": 4349 }, { "epoch": 1.9790718835304824, "grad_norm": 0.9273269780773339, "learning_rate": 6.608832641460464e-07, "loss": 0.0396, "step": 4350 }, { "epoch": 1.9795268425841674, "grad_norm": 0.8675788444475835, "learning_rate": 6.607479291054288e-07, "loss": 0.0281, "step": 4351 }, { "epoch": 1.9799818016378525, "grad_norm": 0.7175827993317309, "learning_rate": 6.606125809292159e-07, "loss": 0.028, "step": 4352 }, { "epoch": 1.9804367606915378, "grad_norm": 0.7949880398266319, "learning_rate": 6.604772196284685e-07, "loss": 0.0215, "step": 4353 }, { "epoch": 1.980891719745223, "grad_norm": 0.8144908661435571, "learning_rate": 6.603418452142474e-07, "loss": 0.0307, "step": 4354 }, { "epoch": 1.981346678798908, "grad_norm": 0.8506589678303814, "learning_rate": 6.602064576976147e-07, "loss": 0.0327, "step": 4355 }, { "epoch": 1.9818016378525933, "grad_norm": 1.0377224889893417, "learning_rate": 6.60071057089634e-07, "loss": 0.0317, "step": 4356 }, { "epoch": 1.9822565969062784, "grad_norm": 1.1157864676195768, "learning_rate": 6.59935643401369e-07, "loss": 0.0288, "step": 4357 }, { "epoch": 1.9827115559599635, "grad_norm": 0.5958493267636483, "learning_rate": 6.598002166438858e-07, "loss": 0.0252, "step": 4358 }, { "epoch": 1.9831665150136488, "grad_norm": 0.9815993837730315, "learning_rate": 6.596647768282503e-07, "loss": 0.0426, "step": 4359 }, { "epoch": 1.983621474067334, "grad_norm": 1.2101609492505656, "learning_rate": 6.595293239655306e-07, "loss": 0.0513, "step": 4360 }, { "epoch": 1.984076433121019, "grad_norm": 1.0084548452935493, "learning_rate": 6.593938580667948e-07, "loss": 0.0315, "step": 4361 }, { "epoch": 1.9845313921747043, "grad_norm": 1.1255054483839113, "learning_rate": 6.592583791431128e-07, "loss": 0.0697, "step": 4362 }, { "epoch": 1.9849863512283894, "grad_norm": 0.9924381702679023, "learning_rate": 6.591228872055552e-07, "loss": 0.0358, "step": 4363 }, { "epoch": 1.9854413102820745, "grad_norm": 1.1261269592328853, "learning_rate": 6.589873822651939e-07, "loss": 0.0494, "step": 4364 }, { "epoch": 1.9858962693357598, "grad_norm": 0.870334853937679, "learning_rate": 6.588518643331017e-07, "loss": 0.0286, "step": 4365 }, { "epoch": 1.9863512283894451, "grad_norm": 0.9179158224288847, "learning_rate": 6.587163334203525e-07, "loss": 0.0279, "step": 4366 }, { "epoch": 1.98680618744313, "grad_norm": 0.8783129472163259, "learning_rate": 6.58580789538021e-07, "loss": 0.0232, "step": 4367 }, { "epoch": 1.9872611464968153, "grad_norm": 0.9761804264879057, "learning_rate": 6.58445232697184e-07, "loss": 0.0336, "step": 4368 }, { "epoch": 1.9877161055505006, "grad_norm": 1.0439453683587698, "learning_rate": 6.583096629089177e-07, "loss": 0.0426, "step": 4369 }, { "epoch": 1.9881710646041855, "grad_norm": 0.7467102992985231, "learning_rate": 6.58174080184301e-07, "loss": 0.0301, "step": 4370 }, { "epoch": 1.9886260236578708, "grad_norm": 0.8545970395055941, "learning_rate": 6.580384845344127e-07, "loss": 0.0217, "step": 4371 }, { "epoch": 1.989080982711556, "grad_norm": 0.7647111552462396, "learning_rate": 6.579028759703331e-07, "loss": 0.0268, "step": 4372 }, { "epoch": 1.9895359417652412, "grad_norm": 1.0566277422840866, "learning_rate": 6.577672545031435e-07, "loss": 0.0293, "step": 4373 }, { "epoch": 1.9899909008189263, "grad_norm": 0.6346101278872446, "learning_rate": 6.576316201439263e-07, "loss": 0.027, "step": 4374 }, { "epoch": 1.9904458598726116, "grad_norm": 1.0059602839586592, "learning_rate": 6.574959729037653e-07, "loss": 0.0538, "step": 4375 }, { "epoch": 1.9909008189262967, "grad_norm": 0.8440252440661395, "learning_rate": 6.573603127937442e-07, "loss": 0.042, "step": 4376 }, { "epoch": 1.9913557779799818, "grad_norm": 0.8459589959647473, "learning_rate": 6.572246398249492e-07, "loss": 0.0213, "step": 4377 }, { "epoch": 1.991810737033667, "grad_norm": 1.2174289467528407, "learning_rate": 6.570889540084665e-07, "loss": 0.0323, "step": 4378 }, { "epoch": 1.9922656960873522, "grad_norm": 0.9362913295123454, "learning_rate": 6.569532553553841e-07, "loss": 0.0391, "step": 4379 }, { "epoch": 1.9927206551410372, "grad_norm": 0.8574705547000226, "learning_rate": 6.568175438767904e-07, "loss": 0.0435, "step": 4380 }, { "epoch": 1.9931756141947226, "grad_norm": 1.1763738862520596, "learning_rate": 6.56681819583775e-07, "loss": 0.0383, "step": 4381 }, { "epoch": 1.9936305732484076, "grad_norm": 0.8157524201708595, "learning_rate": 6.565460824874292e-07, "loss": 0.0286, "step": 4382 }, { "epoch": 1.9940855323020927, "grad_norm": 0.952177652593245, "learning_rate": 6.564103325988441e-07, "loss": 0.0311, "step": 4383 }, { "epoch": 1.994540491355778, "grad_norm": 1.4087049729044423, "learning_rate": 6.562745699291132e-07, "loss": 0.04, "step": 4384 }, { "epoch": 1.9949954504094631, "grad_norm": 0.9473575339729072, "learning_rate": 6.561387944893303e-07, "loss": 0.0415, "step": 4385 }, { "epoch": 1.9954504094631482, "grad_norm": 0.9023414344676377, "learning_rate": 6.560030062905901e-07, "loss": 0.0286, "step": 4386 }, { "epoch": 1.9959053685168335, "grad_norm": 1.1384515605372418, "learning_rate": 6.558672053439886e-07, "loss": 0.0462, "step": 4387 }, { "epoch": 1.9963603275705186, "grad_norm": 0.9825377248807233, "learning_rate": 6.557313916606231e-07, "loss": 0.0346, "step": 4388 }, { "epoch": 1.9968152866242037, "grad_norm": 0.6045858335780908, "learning_rate": 6.555955652515917e-07, "loss": 0.0162, "step": 4389 }, { "epoch": 1.997270245677889, "grad_norm": 0.9551897346370329, "learning_rate": 6.554597261279931e-07, "loss": 0.0396, "step": 4390 }, { "epoch": 1.9977252047315741, "grad_norm": 0.8989749895357633, "learning_rate": 6.553238743009277e-07, "loss": 0.0333, "step": 4391 }, { "epoch": 1.9981801637852592, "grad_norm": 0.9196997322144652, "learning_rate": 6.551880097814971e-07, "loss": 0.0301, "step": 4392 }, { "epoch": 1.9986351228389445, "grad_norm": 1.1412048765264406, "learning_rate": 6.550521325808029e-07, "loss": 0.03, "step": 4393 }, { "epoch": 1.9990900818926298, "grad_norm": 0.7693057858862417, "learning_rate": 6.549162427099486e-07, "loss": 0.0341, "step": 4394 }, { "epoch": 1.9995450409463147, "grad_norm": 1.0242043893358321, "learning_rate": 6.547803401800384e-07, "loss": 0.0447, "step": 4395 }, { "epoch": 2.0, "grad_norm": 1.9228742314304224, "learning_rate": 6.546444250021782e-07, "loss": 0.0212, "step": 4396 }, { "epoch": 2.0004549590536853, "grad_norm": 0.7813141694326514, "learning_rate": 6.545084971874736e-07, "loss": 0.016, "step": 4397 }, { "epoch": 2.00090991810737, "grad_norm": 0.8118350975277375, "learning_rate": 6.543725567470327e-07, "loss": 0.0213, "step": 4398 }, { "epoch": 2.0013648771610555, "grad_norm": 0.6670315995048789, "learning_rate": 6.542366036919634e-07, "loss": 0.0133, "step": 4399 }, { "epoch": 2.001819836214741, "grad_norm": 0.399217469569325, "learning_rate": 6.541006380333753e-07, "loss": 0.0091, "step": 4400 }, { "epoch": 2.0022747952684257, "grad_norm": 0.6766181922160304, "learning_rate": 6.539646597823791e-07, "loss": 0.0269, "step": 4401 }, { "epoch": 2.002729754322111, "grad_norm": 0.6261969333983392, "learning_rate": 6.538286689500862e-07, "loss": 0.0262, "step": 4402 }, { "epoch": 2.0031847133757963, "grad_norm": 0.7121888161372554, "learning_rate": 6.536926655476091e-07, "loss": 0.0294, "step": 4403 }, { "epoch": 2.003639672429481, "grad_norm": 0.7136460493211033, "learning_rate": 6.535566495860614e-07, "loss": 0.0282, "step": 4404 }, { "epoch": 2.0040946314831665, "grad_norm": 0.5906403883993867, "learning_rate": 6.534206210765578e-07, "loss": 0.0261, "step": 4405 }, { "epoch": 2.0045495905368518, "grad_norm": 0.7182736129503909, "learning_rate": 6.532845800302139e-07, "loss": 0.0244, "step": 4406 }, { "epoch": 2.0050045495905366, "grad_norm": 0.7163141687623594, "learning_rate": 6.531485264581464e-07, "loss": 0.028, "step": 4407 }, { "epoch": 2.005459508644222, "grad_norm": 0.6753579724331115, "learning_rate": 6.530124603714729e-07, "loss": 0.0206, "step": 4408 }, { "epoch": 2.0059144676979073, "grad_norm": 0.4975425378438014, "learning_rate": 6.528763817813121e-07, "loss": 0.021, "step": 4409 }, { "epoch": 2.0063694267515926, "grad_norm": 0.5460393774642116, "learning_rate": 6.527402906987838e-07, "loss": 0.0146, "step": 4410 }, { "epoch": 2.0068243858052774, "grad_norm": 0.6370742195863449, "learning_rate": 6.526041871350085e-07, "loss": 0.0108, "step": 4411 }, { "epoch": 2.0072793448589628, "grad_norm": 1.1096658962537143, "learning_rate": 6.524680711011083e-07, "loss": 0.0246, "step": 4412 }, { "epoch": 2.007734303912648, "grad_norm": 0.6784502805979299, "learning_rate": 6.523319426082061e-07, "loss": 0.0184, "step": 4413 }, { "epoch": 2.008189262966333, "grad_norm": 0.6951583606567633, "learning_rate": 6.521958016674253e-07, "loss": 0.0267, "step": 4414 }, { "epoch": 2.0086442220200182, "grad_norm": 0.6495418399227011, "learning_rate": 6.520596482898909e-07, "loss": 0.0139, "step": 4415 }, { "epoch": 2.0090991810737036, "grad_norm": 0.6062801803387541, "learning_rate": 6.519234824867288e-07, "loss": 0.019, "step": 4416 }, { "epoch": 2.0095541401273884, "grad_norm": 0.7008901039826495, "learning_rate": 6.517873042690657e-07, "loss": 0.0224, "step": 4417 }, { "epoch": 2.0100090991810737, "grad_norm": 0.6545865519835079, "learning_rate": 6.516511136480296e-07, "loss": 0.0128, "step": 4418 }, { "epoch": 2.010464058234759, "grad_norm": 0.5607429766537309, "learning_rate": 6.515149106347494e-07, "loss": 0.0131, "step": 4419 }, { "epoch": 2.010919017288444, "grad_norm": 0.5099002437721561, "learning_rate": 6.513786952403548e-07, "loss": 0.0059, "step": 4420 }, { "epoch": 2.011373976342129, "grad_norm": 0.6824532925638841, "learning_rate": 6.512424674759771e-07, "loss": 0.0186, "step": 4421 }, { "epoch": 2.0118289353958145, "grad_norm": 0.5854248893923849, "learning_rate": 6.511062273527476e-07, "loss": 0.0248, "step": 4422 }, { "epoch": 2.0122838944494994, "grad_norm": 0.5967123748070123, "learning_rate": 6.509699748817999e-07, "loss": 0.0106, "step": 4423 }, { "epoch": 2.0127388535031847, "grad_norm": 0.8059983581764116, "learning_rate": 6.508337100742675e-07, "loss": 0.0261, "step": 4424 }, { "epoch": 2.01319381255687, "grad_norm": 0.37994859033479883, "learning_rate": 6.506974329412855e-07, "loss": 0.0067, "step": 4425 }, { "epoch": 2.013648771610555, "grad_norm": 0.6717993231840184, "learning_rate": 6.505611434939898e-07, "loss": 0.0254, "step": 4426 }, { "epoch": 2.01410373066424, "grad_norm": 0.756192830874317, "learning_rate": 6.504248417435173e-07, "loss": 0.0275, "step": 4427 }, { "epoch": 2.0145586897179255, "grad_norm": 0.5222342135865676, "learning_rate": 6.502885277010062e-07, "loss": 0.0128, "step": 4428 }, { "epoch": 2.0150136487716104, "grad_norm": 0.5730745983282698, "learning_rate": 6.501522013775951e-07, "loss": 0.0183, "step": 4429 }, { "epoch": 2.0154686078252957, "grad_norm": 0.6265201183773735, "learning_rate": 6.500158627844245e-07, "loss": 0.0189, "step": 4430 }, { "epoch": 2.015923566878981, "grad_norm": 0.4742427169205748, "learning_rate": 6.498795119326348e-07, "loss": 0.0177, "step": 4431 }, { "epoch": 2.016378525932666, "grad_norm": 0.7523795070889133, "learning_rate": 6.497431488333682e-07, "loss": 0.0154, "step": 4432 }, { "epoch": 2.016833484986351, "grad_norm": 0.7632284317843815, "learning_rate": 6.496067734977679e-07, "loss": 0.0169, "step": 4433 }, { "epoch": 2.0172884440400365, "grad_norm": 0.5376801944799755, "learning_rate": 6.494703859369777e-07, "loss": 0.0122, "step": 4434 }, { "epoch": 2.0177434030937214, "grad_norm": 1.2486507816588794, "learning_rate": 6.493339861621426e-07, "loss": 0.0133, "step": 4435 }, { "epoch": 2.0181983621474067, "grad_norm": 0.6978975633009956, "learning_rate": 6.491975741844082e-07, "loss": 0.0112, "step": 4436 }, { "epoch": 2.018653321201092, "grad_norm": 0.8886802067214047, "learning_rate": 6.490611500149221e-07, "loss": 0.0202, "step": 4437 }, { "epoch": 2.0191082802547773, "grad_norm": 0.8299371880874941, "learning_rate": 6.48924713664832e-07, "loss": 0.0142, "step": 4438 }, { "epoch": 2.019563239308462, "grad_norm": 0.9199831869414211, "learning_rate": 6.487882651452866e-07, "loss": 0.0134, "step": 4439 }, { "epoch": 2.0200181983621475, "grad_norm": 0.895244222038531, "learning_rate": 6.486518044674363e-07, "loss": 0.0239, "step": 4440 }, { "epoch": 2.0204731574158328, "grad_norm": 58.64487265203109, "learning_rate": 6.485153316424318e-07, "loss": 0.1477, "step": 4441 }, { "epoch": 2.0209281164695176, "grad_norm": 0.6839028284608688, "learning_rate": 6.48378846681425e-07, "loss": 0.0209, "step": 4442 }, { "epoch": 2.021383075523203, "grad_norm": 0.6981884285598347, "learning_rate": 6.482423495955691e-07, "loss": 0.023, "step": 4443 }, { "epoch": 2.0218380345768883, "grad_norm": 0.9669034897220085, "learning_rate": 6.481058403960177e-07, "loss": 0.0152, "step": 4444 }, { "epoch": 2.022292993630573, "grad_norm": 0.7610402700194951, "learning_rate": 6.47969319093926e-07, "loss": 0.0144, "step": 4445 }, { "epoch": 2.0227479526842584, "grad_norm": 0.9826857703630462, "learning_rate": 6.478327857004495e-07, "loss": 0.0251, "step": 4446 }, { "epoch": 2.0232029117379438, "grad_norm": 0.6985374248671841, "learning_rate": 6.476962402267456e-07, "loss": 0.0172, "step": 4447 }, { "epoch": 2.0236578707916286, "grad_norm": 0.5762533177585605, "learning_rate": 6.475596826839717e-07, "loss": 0.0148, "step": 4448 }, { "epoch": 2.024112829845314, "grad_norm": 0.6618553419771457, "learning_rate": 6.474231130832872e-07, "loss": 0.0173, "step": 4449 }, { "epoch": 2.0245677888989992, "grad_norm": 0.8437446234922692, "learning_rate": 6.472865314358516e-07, "loss": 0.0202, "step": 4450 }, { "epoch": 2.025022747952684, "grad_norm": 0.5530091006044189, "learning_rate": 6.471499377528256e-07, "loss": 0.0121, "step": 4451 }, { "epoch": 2.0254777070063694, "grad_norm": 0.7358206073234724, "learning_rate": 6.470133320453714e-07, "loss": 0.0136, "step": 4452 }, { "epoch": 2.0259326660600547, "grad_norm": 1.224661421350944, "learning_rate": 6.468767143246514e-07, "loss": 0.0249, "step": 4453 }, { "epoch": 2.0263876251137396, "grad_norm": 0.7048435650714743, "learning_rate": 6.467400846018298e-07, "loss": 0.0198, "step": 4454 }, { "epoch": 2.026842584167425, "grad_norm": 0.9164289887712118, "learning_rate": 6.466034428880712e-07, "loss": 0.0368, "step": 4455 }, { "epoch": 2.02729754322111, "grad_norm": 0.901560667938297, "learning_rate": 6.464667891945412e-07, "loss": 0.0307, "step": 4456 }, { "epoch": 2.027752502274795, "grad_norm": 0.5790408899937679, "learning_rate": 6.463301235324065e-07, "loss": 0.0175, "step": 4457 }, { "epoch": 2.0282074613284804, "grad_norm": 1.1169540979926347, "learning_rate": 6.461934459128351e-07, "loss": 0.0182, "step": 4458 }, { "epoch": 2.0286624203821657, "grad_norm": 0.727303194989778, "learning_rate": 6.460567563469955e-07, "loss": 0.0255, "step": 4459 }, { "epoch": 2.0291173794358506, "grad_norm": 1.6094952777611613, "learning_rate": 6.459200548460573e-07, "loss": 0.0472, "step": 4460 }, { "epoch": 2.029572338489536, "grad_norm": 0.7495919293441294, "learning_rate": 6.457833414211912e-07, "loss": 0.0124, "step": 4461 }, { "epoch": 2.030027297543221, "grad_norm": 0.7741515084929894, "learning_rate": 6.456466160835688e-07, "loss": 0.0158, "step": 4462 }, { "epoch": 2.030482256596906, "grad_norm": 0.7131364870390242, "learning_rate": 6.455098788443627e-07, "loss": 0.0195, "step": 4463 }, { "epoch": 2.0309372156505914, "grad_norm": 0.643758717991703, "learning_rate": 6.453731297147463e-07, "loss": 0.0105, "step": 4464 }, { "epoch": 2.0313921747042767, "grad_norm": 0.723726157771784, "learning_rate": 6.452363687058943e-07, "loss": 0.0123, "step": 4465 }, { "epoch": 2.031847133757962, "grad_norm": 0.8696883882932523, "learning_rate": 6.450995958289822e-07, "loss": 0.0221, "step": 4466 }, { "epoch": 2.032302092811647, "grad_norm": 0.6243043454649845, "learning_rate": 6.449628110951863e-07, "loss": 0.0122, "step": 4467 }, { "epoch": 2.032757051865332, "grad_norm": 0.6982255431980969, "learning_rate": 6.448260145156841e-07, "loss": 0.0163, "step": 4468 }, { "epoch": 2.0332120109190175, "grad_norm": 0.6437666877920513, "learning_rate": 6.446892061016542e-07, "loss": 0.0244, "step": 4469 }, { "epoch": 2.0336669699727024, "grad_norm": 0.5796559414812748, "learning_rate": 6.445523858642756e-07, "loss": 0.0121, "step": 4470 }, { "epoch": 2.0341219290263877, "grad_norm": 0.9036855762582203, "learning_rate": 6.444155538147289e-07, "loss": 0.013, "step": 4471 }, { "epoch": 2.034576888080073, "grad_norm": 0.717415151028589, "learning_rate": 6.442787099641954e-07, "loss": 0.0108, "step": 4472 }, { "epoch": 2.035031847133758, "grad_norm": 0.9697905721758339, "learning_rate": 6.441418543238571e-07, "loss": 0.0144, "step": 4473 }, { "epoch": 2.035486806187443, "grad_norm": 0.8567754024652214, "learning_rate": 6.440049869048975e-07, "loss": 0.0213, "step": 4474 }, { "epoch": 2.0359417652411285, "grad_norm": 1.2367908066786866, "learning_rate": 6.438681077185006e-07, "loss": 0.02, "step": 4475 }, { "epoch": 2.0363967242948133, "grad_norm": 0.7465228046713851, "learning_rate": 6.437312167758518e-07, "loss": 0.0123, "step": 4476 }, { "epoch": 2.0368516833484986, "grad_norm": 0.9612998651027195, "learning_rate": 6.43594314088137e-07, "loss": 0.0071, "step": 4477 }, { "epoch": 2.037306642402184, "grad_norm": 0.5765395005280004, "learning_rate": 6.434573996665433e-07, "loss": 0.0138, "step": 4478 }, { "epoch": 2.037761601455869, "grad_norm": 1.2019375883183312, "learning_rate": 6.433204735222587e-07, "loss": 0.0147, "step": 4479 }, { "epoch": 2.038216560509554, "grad_norm": 1.274332791715232, "learning_rate": 6.431835356664723e-07, "loss": 0.018, "step": 4480 }, { "epoch": 2.0386715195632394, "grad_norm": 0.660890513304956, "learning_rate": 6.430465861103738e-07, "loss": 0.0098, "step": 4481 }, { "epoch": 2.0391264786169243, "grad_norm": 0.7804984025596933, "learning_rate": 6.429096248651545e-07, "loss": 0.0194, "step": 4482 }, { "epoch": 2.0395814376706096, "grad_norm": 0.6223372487273443, "learning_rate": 6.42772651942006e-07, "loss": 0.0108, "step": 4483 }, { "epoch": 2.040036396724295, "grad_norm": 0.6929699955588061, "learning_rate": 6.42635667352121e-07, "loss": 0.0158, "step": 4484 }, { "epoch": 2.04049135577798, "grad_norm": 0.8164341156777911, "learning_rate": 6.424986711066936e-07, "loss": 0.0144, "step": 4485 }, { "epoch": 2.040946314831665, "grad_norm": 1.3102859046460387, "learning_rate": 6.423616632169182e-07, "loss": 0.0157, "step": 4486 }, { "epoch": 2.0414012738853504, "grad_norm": 0.7247706221017011, "learning_rate": 6.422246436939905e-07, "loss": 0.0186, "step": 4487 }, { "epoch": 2.0418562329390353, "grad_norm": 0.7409904649302119, "learning_rate": 6.420876125491072e-07, "loss": 0.0205, "step": 4488 }, { "epoch": 2.0423111919927206, "grad_norm": 1.1088731783938752, "learning_rate": 6.41950569793466e-07, "loss": 0.013, "step": 4489 }, { "epoch": 2.042766151046406, "grad_norm": 0.5464062385517823, "learning_rate": 6.418135154382654e-07, "loss": 0.0184, "step": 4490 }, { "epoch": 2.0432211101000908, "grad_norm": 0.9584811940884803, "learning_rate": 6.416764494947046e-07, "loss": 0.0126, "step": 4491 }, { "epoch": 2.043676069153776, "grad_norm": 0.60336676655176, "learning_rate": 6.415393719739839e-07, "loss": 0.0171, "step": 4492 }, { "epoch": 2.0441310282074614, "grad_norm": 1.065052368554227, "learning_rate": 6.414022828873053e-07, "loss": 0.0083, "step": 4493 }, { "epoch": 2.0445859872611467, "grad_norm": 1.4501890579033512, "learning_rate": 6.412651822458705e-07, "loss": 0.022, "step": 4494 }, { "epoch": 2.0450409463148316, "grad_norm": 0.6014339822440502, "learning_rate": 6.41128070060883e-07, "loss": 0.0082, "step": 4495 }, { "epoch": 2.045495905368517, "grad_norm": 0.65952986723753, "learning_rate": 6.40990946343547e-07, "loss": 0.011, "step": 4496 }, { "epoch": 2.045950864422202, "grad_norm": 1.0672229168967264, "learning_rate": 6.408538111050674e-07, "loss": 0.0298, "step": 4497 }, { "epoch": 2.046405823475887, "grad_norm": 0.6197009832230931, "learning_rate": 6.407166643566506e-07, "loss": 0.0077, "step": 4498 }, { "epoch": 2.0468607825295724, "grad_norm": 0.41406513686815977, "learning_rate": 6.405795061095034e-07, "loss": 0.0046, "step": 4499 }, { "epoch": 2.0473157415832577, "grad_norm": 0.4248238219663552, "learning_rate": 6.404423363748339e-07, "loss": 0.0058, "step": 4500 }, { "epoch": 2.0477707006369426, "grad_norm": 0.8784735153088363, "learning_rate": 6.403051551638508e-07, "loss": 0.0143, "step": 4501 }, { "epoch": 2.048225659690628, "grad_norm": 0.7164436981610202, "learning_rate": 6.401679624877641e-07, "loss": 0.0189, "step": 4502 }, { "epoch": 2.048680618744313, "grad_norm": 0.9493197661481363, "learning_rate": 6.400307583577844e-07, "loss": 0.0296, "step": 4503 }, { "epoch": 2.049135577797998, "grad_norm": 0.8144764481747759, "learning_rate": 6.398935427851235e-07, "loss": 0.0185, "step": 4504 }, { "epoch": 2.0495905368516834, "grad_norm": 0.7876068508993274, "learning_rate": 6.397563157809943e-07, "loss": 0.0144, "step": 4505 }, { "epoch": 2.0500454959053687, "grad_norm": 0.7536100282672631, "learning_rate": 6.396190773566097e-07, "loss": 0.0187, "step": 4506 }, { "epoch": 2.0505004549590535, "grad_norm": 0.7602717255477899, "learning_rate": 6.39481827523185e-07, "loss": 0.0131, "step": 4507 }, { "epoch": 2.050955414012739, "grad_norm": 0.8587312450949188, "learning_rate": 6.393445662919351e-07, "loss": 0.0126, "step": 4508 }, { "epoch": 2.051410373066424, "grad_norm": 0.9782398243822256, "learning_rate": 6.392072936740764e-07, "loss": 0.033, "step": 4509 }, { "epoch": 2.051865332120109, "grad_norm": 0.6236367878682966, "learning_rate": 6.390700096808264e-07, "loss": 0.0087, "step": 4510 }, { "epoch": 2.0523202911737943, "grad_norm": 0.9675674985375494, "learning_rate": 6.389327143234032e-07, "loss": 0.012, "step": 4511 }, { "epoch": 2.0527752502274796, "grad_norm": 0.7949060995408678, "learning_rate": 6.387954076130262e-07, "loss": 0.0104, "step": 4512 }, { "epoch": 2.0532302092811645, "grad_norm": 0.6213729498522748, "learning_rate": 6.386580895609151e-07, "loss": 0.0095, "step": 4513 }, { "epoch": 2.05368516833485, "grad_norm": 0.537862264713602, "learning_rate": 6.385207601782912e-07, "loss": 0.0089, "step": 4514 }, { "epoch": 2.054140127388535, "grad_norm": 0.7224337674742479, "learning_rate": 6.383834194763762e-07, "loss": 0.0156, "step": 4515 }, { "epoch": 2.05459508644222, "grad_norm": 0.881811920722785, "learning_rate": 6.382460674663931e-07, "loss": 0.0217, "step": 4516 }, { "epoch": 2.0550500454959053, "grad_norm": 0.9567199160741744, "learning_rate": 6.381087041595658e-07, "loss": 0.0173, "step": 4517 }, { "epoch": 2.0555050045495906, "grad_norm": 0.6959605956185696, "learning_rate": 6.379713295671188e-07, "loss": 0.0111, "step": 4518 }, { "epoch": 2.055959963603276, "grad_norm": 1.0169059453431863, "learning_rate": 6.378339437002778e-07, "loss": 0.0199, "step": 4519 }, { "epoch": 2.056414922656961, "grad_norm": 0.9214028778501555, "learning_rate": 6.376965465702695e-07, "loss": 0.0138, "step": 4520 }, { "epoch": 2.056869881710646, "grad_norm": 0.7887737624634472, "learning_rate": 6.375591381883212e-07, "loss": 0.0277, "step": 4521 }, { "epoch": 2.0573248407643314, "grad_norm": 0.5417866441602963, "learning_rate": 6.374217185656613e-07, "loss": 0.0163, "step": 4522 }, { "epoch": 2.0577797998180163, "grad_norm": 0.8800197992992301, "learning_rate": 6.37284287713519e-07, "loss": 0.0216, "step": 4523 }, { "epoch": 2.0582347588717016, "grad_norm": 0.7511506190817817, "learning_rate": 6.371468456431249e-07, "loss": 0.0195, "step": 4524 }, { "epoch": 2.058689717925387, "grad_norm": 0.7549663564175787, "learning_rate": 6.370093923657098e-07, "loss": 0.0209, "step": 4525 }, { "epoch": 2.0591446769790718, "grad_norm": 1.0088176206234392, "learning_rate": 6.36871927892506e-07, "loss": 0.0298, "step": 4526 }, { "epoch": 2.059599636032757, "grad_norm": 0.6535674525931227, "learning_rate": 6.367344522347464e-07, "loss": 0.0095, "step": 4527 }, { "epoch": 2.0600545950864424, "grad_norm": 0.638850429018032, "learning_rate": 6.365969654036647e-07, "loss": 0.0178, "step": 4528 }, { "epoch": 2.0605095541401273, "grad_norm": 0.5323528996184588, "learning_rate": 6.36459467410496e-07, "loss": 0.0151, "step": 4529 }, { "epoch": 2.0609645131938126, "grad_norm": 0.9589344059477838, "learning_rate": 6.363219582664757e-07, "loss": 0.0258, "step": 4530 }, { "epoch": 2.061419472247498, "grad_norm": 0.9899814561515887, "learning_rate": 6.361844379828407e-07, "loss": 0.0204, "step": 4531 }, { "epoch": 2.0618744313011828, "grad_norm": 0.8585074324622055, "learning_rate": 6.360469065708285e-07, "loss": 0.0114, "step": 4532 }, { "epoch": 2.062329390354868, "grad_norm": 0.7938102116900329, "learning_rate": 6.359093640416772e-07, "loss": 0.0188, "step": 4533 }, { "epoch": 2.0627843494085534, "grad_norm": 0.5674760258083946, "learning_rate": 6.357718104066267e-07, "loss": 0.0155, "step": 4534 }, { "epoch": 2.0632393084622382, "grad_norm": 1.5412272192415433, "learning_rate": 6.356342456769168e-07, "loss": 0.0119, "step": 4535 }, { "epoch": 2.0636942675159236, "grad_norm": 0.6597090726061628, "learning_rate": 6.354966698637891e-07, "loss": 0.0152, "step": 4536 }, { "epoch": 2.064149226569609, "grad_norm": 0.7414589317470369, "learning_rate": 6.353590829784853e-07, "loss": 0.0135, "step": 4537 }, { "epoch": 2.0646041856232937, "grad_norm": 0.5353866028625213, "learning_rate": 6.352214850322484e-07, "loss": 0.0082, "step": 4538 }, { "epoch": 2.065059144676979, "grad_norm": 0.9089804771486426, "learning_rate": 6.350838760363226e-07, "loss": 0.0175, "step": 4539 }, { "epoch": 2.0655141037306644, "grad_norm": 0.5213913118387343, "learning_rate": 6.349462560019523e-07, "loss": 0.0064, "step": 4540 }, { "epoch": 2.065969062784349, "grad_norm": 0.7425758939833903, "learning_rate": 6.348086249403836e-07, "loss": 0.0226, "step": 4541 }, { "epoch": 2.0664240218380345, "grad_norm": 0.7478879552484847, "learning_rate": 6.346709828628627e-07, "loss": 0.017, "step": 4542 }, { "epoch": 2.06687898089172, "grad_norm": 0.5639202406420404, "learning_rate": 6.345333297806373e-07, "loss": 0.0156, "step": 4543 }, { "epoch": 2.0673339399454047, "grad_norm": 0.8570104965594436, "learning_rate": 6.343956657049557e-07, "loss": 0.015, "step": 4544 }, { "epoch": 2.06778889899909, "grad_norm": 0.8315261839882233, "learning_rate": 6.342579906470672e-07, "loss": 0.0249, "step": 4545 }, { "epoch": 2.0682438580527753, "grad_norm": 0.6357032980268331, "learning_rate": 6.341203046182222e-07, "loss": 0.0157, "step": 4546 }, { "epoch": 2.06869881710646, "grad_norm": 0.989615317001109, "learning_rate": 6.339826076296714e-07, "loss": 0.0328, "step": 4547 }, { "epoch": 2.0691537761601455, "grad_norm": 0.6666458472369631, "learning_rate": 6.33844899692667e-07, "loss": 0.0145, "step": 4548 }, { "epoch": 2.069608735213831, "grad_norm": 0.9595721144992488, "learning_rate": 6.337071808184619e-07, "loss": 0.0317, "step": 4549 }, { "epoch": 2.070063694267516, "grad_norm": 0.521651307217076, "learning_rate": 6.335694510183097e-07, "loss": 0.0087, "step": 4550 }, { "epoch": 2.070518653321201, "grad_norm": 0.9232876081465896, "learning_rate": 6.334317103034652e-07, "loss": 0.0187, "step": 4551 }, { "epoch": 2.0709736123748863, "grad_norm": 0.6812816283109556, "learning_rate": 6.332939586851837e-07, "loss": 0.0273, "step": 4552 }, { "epoch": 2.0714285714285716, "grad_norm": 0.8548281432647622, "learning_rate": 6.331561961747223e-07, "loss": 0.0348, "step": 4553 }, { "epoch": 2.0718835304822565, "grad_norm": 0.7828945752824359, "learning_rate": 6.330184227833375e-07, "loss": 0.0182, "step": 4554 }, { "epoch": 2.072338489535942, "grad_norm": 0.6724525457487688, "learning_rate": 6.32880638522288e-07, "loss": 0.0181, "step": 4555 }, { "epoch": 2.072793448589627, "grad_norm": 0.7743289300887392, "learning_rate": 6.32742843402833e-07, "loss": 0.0133, "step": 4556 }, { "epoch": 2.073248407643312, "grad_norm": 1.3123298924710216, "learning_rate": 6.326050374362321e-07, "loss": 0.0205, "step": 4557 }, { "epoch": 2.0737033666969973, "grad_norm": 1.1750628659147841, "learning_rate": 6.324672206337463e-07, "loss": 0.0157, "step": 4558 }, { "epoch": 2.0741583257506826, "grad_norm": 0.9014478042341002, "learning_rate": 6.323293930066375e-07, "loss": 0.0109, "step": 4559 }, { "epoch": 2.0746132848043675, "grad_norm": 0.9312799552803257, "learning_rate": 6.321915545661685e-07, "loss": 0.0165, "step": 4560 }, { "epoch": 2.0750682438580528, "grad_norm": 0.8306750398625569, "learning_rate": 6.320537053236023e-07, "loss": 0.0207, "step": 4561 }, { "epoch": 2.075523202911738, "grad_norm": 0.7029225150874292, "learning_rate": 6.319158452902039e-07, "loss": 0.017, "step": 4562 }, { "epoch": 2.075978161965423, "grad_norm": 1.095051970871557, "learning_rate": 6.317779744772383e-07, "loss": 0.0172, "step": 4563 }, { "epoch": 2.0764331210191083, "grad_norm": 0.6836883052537155, "learning_rate": 6.316400928959717e-07, "loss": 0.0203, "step": 4564 }, { "epoch": 2.0768880800727936, "grad_norm": 0.5384916409518699, "learning_rate": 6.315022005576712e-07, "loss": 0.0093, "step": 4565 }, { "epoch": 2.0773430391264784, "grad_norm": 0.6545481634901328, "learning_rate": 6.313642974736049e-07, "loss": 0.0158, "step": 4566 }, { "epoch": 2.0777979981801638, "grad_norm": 0.7536770589065671, "learning_rate": 6.312263836550413e-07, "loss": 0.0137, "step": 4567 }, { "epoch": 2.078252957233849, "grad_norm": 0.8781906951022205, "learning_rate": 6.3108845911325e-07, "loss": 0.0145, "step": 4568 }, { "epoch": 2.078707916287534, "grad_norm": 1.134761632923604, "learning_rate": 6.309505238595021e-07, "loss": 0.0294, "step": 4569 }, { "epoch": 2.0791628753412192, "grad_norm": 0.7111520230704034, "learning_rate": 6.308125779050687e-07, "loss": 0.0166, "step": 4570 }, { "epoch": 2.0796178343949046, "grad_norm": 0.9116501420679854, "learning_rate": 6.306746212612222e-07, "loss": 0.0179, "step": 4571 }, { "epoch": 2.0800727934485894, "grad_norm": 0.42851856427211177, "learning_rate": 6.305366539392357e-07, "loss": 0.009, "step": 4572 }, { "epoch": 2.0805277525022747, "grad_norm": 0.913895415839106, "learning_rate": 6.303986759503835e-07, "loss": 0.0197, "step": 4573 }, { "epoch": 2.08098271155596, "grad_norm": 0.7270154559101547, "learning_rate": 6.302606873059402e-07, "loss": 0.0193, "step": 4574 }, { "epoch": 2.0814376706096454, "grad_norm": 0.8896398390856367, "learning_rate": 6.301226880171817e-07, "loss": 0.0249, "step": 4575 }, { "epoch": 2.08189262966333, "grad_norm": 0.7117390801142298, "learning_rate": 6.299846780953849e-07, "loss": 0.0134, "step": 4576 }, { "epoch": 2.0823475887170155, "grad_norm": 1.3390938376781152, "learning_rate": 6.298466575518273e-07, "loss": 0.0271, "step": 4577 }, { "epoch": 2.082802547770701, "grad_norm": 1.802582488789965, "learning_rate": 6.297086263977871e-07, "loss": 0.0529, "step": 4578 }, { "epoch": 2.0832575068243857, "grad_norm": 1.4393397095123135, "learning_rate": 6.295705846445438e-07, "loss": 0.0326, "step": 4579 }, { "epoch": 2.083712465878071, "grad_norm": 0.6608198299151415, "learning_rate": 6.294325323033775e-07, "loss": 0.0111, "step": 4580 }, { "epoch": 2.0841674249317563, "grad_norm": 0.7209703743068951, "learning_rate": 6.29294469385569e-07, "loss": 0.0196, "step": 4581 }, { "epoch": 2.084622383985441, "grad_norm": 0.6472043463473451, "learning_rate": 6.291563959024005e-07, "loss": 0.0159, "step": 4582 }, { "epoch": 2.0850773430391265, "grad_norm": 0.9252714319435577, "learning_rate": 6.290183118651545e-07, "loss": 0.0155, "step": 4583 }, { "epoch": 2.085532302092812, "grad_norm": 0.7698636782169304, "learning_rate": 6.288802172851146e-07, "loss": 0.0085, "step": 4584 }, { "epoch": 2.0859872611464967, "grad_norm": 0.7301719951332571, "learning_rate": 6.287421121735656e-07, "loss": 0.0203, "step": 4585 }, { "epoch": 2.086442220200182, "grad_norm": 0.6901013341860192, "learning_rate": 6.286039965417924e-07, "loss": 0.0165, "step": 4586 }, { "epoch": 2.0868971792538673, "grad_norm": 0.7096466645051028, "learning_rate": 6.284658704010815e-07, "loss": 0.0094, "step": 4587 }, { "epoch": 2.087352138307552, "grad_norm": 0.7204150004460801, "learning_rate": 6.283277337627197e-07, "loss": 0.0159, "step": 4588 }, { "epoch": 2.0878070973612375, "grad_norm": 0.7218868544441922, "learning_rate": 6.281895866379951e-07, "loss": 0.0217, "step": 4589 }, { "epoch": 2.088262056414923, "grad_norm": 0.6515700770089146, "learning_rate": 6.280514290381964e-07, "loss": 0.0156, "step": 4590 }, { "epoch": 2.0887170154686077, "grad_norm": 0.8860205631455776, "learning_rate": 6.279132609746131e-07, "loss": 0.0208, "step": 4591 }, { "epoch": 2.089171974522293, "grad_norm": 0.681536099739604, "learning_rate": 6.27775082458536e-07, "loss": 0.0167, "step": 4592 }, { "epoch": 2.0896269335759783, "grad_norm": 0.7539201112504922, "learning_rate": 6.276368935012558e-07, "loss": 0.0233, "step": 4593 }, { "epoch": 2.090081892629663, "grad_norm": 0.9613484516787081, "learning_rate": 6.274986941140653e-07, "loss": 0.0258, "step": 4594 }, { "epoch": 2.0905368516833485, "grad_norm": 0.7964425440346834, "learning_rate": 6.273604843082572e-07, "loss": 0.0253, "step": 4595 }, { "epoch": 2.0909918107370338, "grad_norm": 0.5999258744809202, "learning_rate": 6.272222640951256e-07, "loss": 0.0105, "step": 4596 }, { "epoch": 2.0914467697907186, "grad_norm": 0.7420756745003197, "learning_rate": 6.27084033485965e-07, "loss": 0.0186, "step": 4597 }, { "epoch": 2.091901728844404, "grad_norm": 1.0704532557832827, "learning_rate": 6.269457924920713e-07, "loss": 0.0261, "step": 4598 }, { "epoch": 2.0923566878980893, "grad_norm": 0.8125180774266194, "learning_rate": 6.268075411247405e-07, "loss": 0.0132, "step": 4599 }, { "epoch": 2.092811646951774, "grad_norm": 0.5967786900261133, "learning_rate": 6.266692793952702e-07, "loss": 0.0101, "step": 4600 }, { "epoch": 2.0932666060054594, "grad_norm": 0.8687051919721802, "learning_rate": 6.265310073149584e-07, "loss": 0.0333, "step": 4601 }, { "epoch": 2.0937215650591448, "grad_norm": 0.5784269873883905, "learning_rate": 6.263927248951041e-07, "loss": 0.0085, "step": 4602 }, { "epoch": 2.0941765241128296, "grad_norm": 0.7449091050322533, "learning_rate": 6.262544321470069e-07, "loss": 0.0213, "step": 4603 }, { "epoch": 2.094631483166515, "grad_norm": 0.7610122767989921, "learning_rate": 6.26116129081968e-07, "loss": 0.017, "step": 4604 }, { "epoch": 2.0950864422202002, "grad_norm": 0.948075164123386, "learning_rate": 6.259778157112884e-07, "loss": 0.0204, "step": 4605 }, { "epoch": 2.0955414012738856, "grad_norm": 0.8607162166511593, "learning_rate": 6.258394920462707e-07, "loss": 0.0155, "step": 4606 }, { "epoch": 2.0959963603275704, "grad_norm": 0.8944165636262186, "learning_rate": 6.257011580982179e-07, "loss": 0.0187, "step": 4607 }, { "epoch": 2.0964513193812557, "grad_norm": 0.7014452985826342, "learning_rate": 6.25562813878434e-07, "loss": 0.0156, "step": 4608 }, { "epoch": 2.096906278434941, "grad_norm": 1.1109854309199774, "learning_rate": 6.254244593982243e-07, "loss": 0.0159, "step": 4609 }, { "epoch": 2.097361237488626, "grad_norm": 0.5518244498590968, "learning_rate": 6.252860946688938e-07, "loss": 0.0096, "step": 4610 }, { "epoch": 2.097816196542311, "grad_norm": 1.2966984824164958, "learning_rate": 6.251477197017497e-07, "loss": 0.0229, "step": 4611 }, { "epoch": 2.0982711555959965, "grad_norm": 1.3558258988631657, "learning_rate": 6.250093345080991e-07, "loss": 0.0293, "step": 4612 }, { "epoch": 2.0987261146496814, "grad_norm": 0.980968336178441, "learning_rate": 6.248709390992502e-07, "loss": 0.0161, "step": 4613 }, { "epoch": 2.0991810737033667, "grad_norm": 0.682358690381231, "learning_rate": 6.24732533486512e-07, "loss": 0.0176, "step": 4614 }, { "epoch": 2.099636032757052, "grad_norm": 0.9596484924129705, "learning_rate": 6.245941176811946e-07, "loss": 0.0302, "step": 4615 }, { "epoch": 2.100090991810737, "grad_norm": 0.6695621420560547, "learning_rate": 6.244556916946084e-07, "loss": 0.0191, "step": 4616 }, { "epoch": 2.100545950864422, "grad_norm": 0.7330250366973601, "learning_rate": 6.24317255538065e-07, "loss": 0.0184, "step": 4617 }, { "epoch": 2.1010009099181075, "grad_norm": 0.9493617357964297, "learning_rate": 6.241788092228772e-07, "loss": 0.0262, "step": 4618 }, { "epoch": 2.1014558689717924, "grad_norm": 0.7353095428415172, "learning_rate": 6.240403527603578e-07, "loss": 0.0181, "step": 4619 }, { "epoch": 2.1019108280254777, "grad_norm": 0.850056456376145, "learning_rate": 6.239018861618209e-07, "loss": 0.0258, "step": 4620 }, { "epoch": 2.102365787079163, "grad_norm": 0.8619042375280144, "learning_rate": 6.237634094385813e-07, "loss": 0.025, "step": 4621 }, { "epoch": 2.102820746132848, "grad_norm": 0.651339142991056, "learning_rate": 6.236249226019549e-07, "loss": 0.0269, "step": 4622 }, { "epoch": 2.103275705186533, "grad_norm": 0.8910579238613185, "learning_rate": 6.234864256632582e-07, "loss": 0.0188, "step": 4623 }, { "epoch": 2.1037306642402185, "grad_norm": 0.9582980198943826, "learning_rate": 6.233479186338083e-07, "loss": 0.0267, "step": 4624 }, { "epoch": 2.1041856232939034, "grad_norm": 0.8308918609104718, "learning_rate": 6.232094015249235e-07, "loss": 0.0168, "step": 4625 }, { "epoch": 2.1046405823475887, "grad_norm": 0.8477983351346988, "learning_rate": 6.23070874347923e-07, "loss": 0.024, "step": 4626 }, { "epoch": 2.105095541401274, "grad_norm": 1.3021236534879757, "learning_rate": 6.229323371141262e-07, "loss": 0.0093, "step": 4627 }, { "epoch": 2.105550500454959, "grad_norm": 0.6979227091829557, "learning_rate": 6.227937898348541e-07, "loss": 0.0135, "step": 4628 }, { "epoch": 2.106005459508644, "grad_norm": 1.1940396127661776, "learning_rate": 6.226552325214281e-07, "loss": 0.0245, "step": 4629 }, { "epoch": 2.1064604185623295, "grad_norm": 0.5782931152754134, "learning_rate": 6.225166651851704e-07, "loss": 0.0073, "step": 4630 }, { "epoch": 2.1069153776160148, "grad_norm": 0.691837666904739, "learning_rate": 6.223780878374039e-07, "loss": 0.0167, "step": 4631 }, { "epoch": 2.1073703366696996, "grad_norm": 1.310006649485917, "learning_rate": 6.222395004894529e-07, "loss": 0.0303, "step": 4632 }, { "epoch": 2.107825295723385, "grad_norm": 0.8091662114648889, "learning_rate": 6.221009031526418e-07, "loss": 0.0261, "step": 4633 }, { "epoch": 2.1082802547770703, "grad_norm": 0.7129327947990063, "learning_rate": 6.219622958382964e-07, "loss": 0.0204, "step": 4634 }, { "epoch": 2.108735213830755, "grad_norm": 0.5476096541708526, "learning_rate": 6.21823678557743e-07, "loss": 0.01, "step": 4635 }, { "epoch": 2.1091901728844404, "grad_norm": 0.9969243153037943, "learning_rate": 6.216850513223086e-07, "loss": 0.0228, "step": 4636 }, { "epoch": 2.1096451319381258, "grad_norm": 0.7233935276975791, "learning_rate": 6.215464141433215e-07, "loss": 0.0174, "step": 4637 }, { "epoch": 2.1101000909918106, "grad_norm": 0.515396473560051, "learning_rate": 6.214077670321102e-07, "loss": 0.0128, "step": 4638 }, { "epoch": 2.110555050045496, "grad_norm": 0.7964442119379156, "learning_rate": 6.212691100000045e-07, "loss": 0.007, "step": 4639 }, { "epoch": 2.1110100090991812, "grad_norm": 1.0363540198862855, "learning_rate": 6.211304430583348e-07, "loss": 0.0321, "step": 4640 }, { "epoch": 2.111464968152866, "grad_norm": 0.7098154177896476, "learning_rate": 6.209917662184323e-07, "loss": 0.0128, "step": 4641 }, { "epoch": 2.1119199272065514, "grad_norm": 0.7082101617534051, "learning_rate": 6.20853079491629e-07, "loss": 0.01, "step": 4642 }, { "epoch": 2.1123748862602367, "grad_norm": 0.6995927820903105, "learning_rate": 6.207143828892579e-07, "loss": 0.0159, "step": 4643 }, { "epoch": 2.1128298453139216, "grad_norm": 0.8545262299063983, "learning_rate": 6.205756764226525e-07, "loss": 0.0172, "step": 4644 }, { "epoch": 2.113284804367607, "grad_norm": 1.211697868642885, "learning_rate": 6.204369601031474e-07, "loss": 0.0212, "step": 4645 }, { "epoch": 2.113739763421292, "grad_norm": 0.8813999149027799, "learning_rate": 6.202982339420777e-07, "loss": 0.018, "step": 4646 }, { "epoch": 2.114194722474977, "grad_norm": 1.0209411493936489, "learning_rate": 6.201594979507797e-07, "loss": 0.0273, "step": 4647 }, { "epoch": 2.1146496815286624, "grad_norm": 0.6145553835048279, "learning_rate": 6.200207521405901e-07, "loss": 0.0105, "step": 4648 }, { "epoch": 2.1151046405823477, "grad_norm": 0.7316084338957204, "learning_rate": 6.198819965228467e-07, "loss": 0.0247, "step": 4649 }, { "epoch": 2.1155595996360326, "grad_norm": 0.8125183165034252, "learning_rate": 6.19743231108888e-07, "loss": 0.0148, "step": 4650 }, { "epoch": 2.116014558689718, "grad_norm": 0.5693073569796486, "learning_rate": 6.196044559100531e-07, "loss": 0.0141, "step": 4651 }, { "epoch": 2.116469517743403, "grad_norm": 0.5400814221158362, "learning_rate": 6.194656709376821e-07, "loss": 0.0113, "step": 4652 }, { "epoch": 2.116924476797088, "grad_norm": 0.74085715434927, "learning_rate": 6.193268762031161e-07, "loss": 0.0143, "step": 4653 }, { "epoch": 2.1173794358507734, "grad_norm": 1.0235967231630043, "learning_rate": 6.191880717176966e-07, "loss": 0.0279, "step": 4654 }, { "epoch": 2.1178343949044587, "grad_norm": 0.6540952485505663, "learning_rate": 6.190492574927663e-07, "loss": 0.0138, "step": 4655 }, { "epoch": 2.1182893539581436, "grad_norm": 0.49215669583489563, "learning_rate": 6.189104335396679e-07, "loss": 0.0106, "step": 4656 }, { "epoch": 2.118744313011829, "grad_norm": 0.7349254675459281, "learning_rate": 6.187715998697462e-07, "loss": 0.0128, "step": 4657 }, { "epoch": 2.119199272065514, "grad_norm": 0.8157256569437605, "learning_rate": 6.186327564943457e-07, "loss": 0.0315, "step": 4658 }, { "epoch": 2.1196542311191995, "grad_norm": 0.7804316958722168, "learning_rate": 6.18493903424812e-07, "loss": 0.0202, "step": 4659 }, { "epoch": 2.1201091901728844, "grad_norm": 0.49600795994100266, "learning_rate": 6.183550406724917e-07, "loss": 0.0101, "step": 4660 }, { "epoch": 2.1205641492265697, "grad_norm": 0.8745153852297667, "learning_rate": 6.182161682487319e-07, "loss": 0.0211, "step": 4661 }, { "epoch": 2.121019108280255, "grad_norm": 0.987355692795381, "learning_rate": 6.180772861648807e-07, "loss": 0.0253, "step": 4662 }, { "epoch": 2.12147406733394, "grad_norm": 0.6764695491877274, "learning_rate": 6.179383944322867e-07, "loss": 0.02, "step": 4663 }, { "epoch": 2.121929026387625, "grad_norm": 0.6716700355745543, "learning_rate": 6.177994930623e-07, "loss": 0.0211, "step": 4664 }, { "epoch": 2.1223839854413105, "grad_norm": 0.7852436134418646, "learning_rate": 6.176605820662707e-07, "loss": 0.0148, "step": 4665 }, { "epoch": 2.1228389444949953, "grad_norm": 0.6763639598185853, "learning_rate": 6.175216614555498e-07, "loss": 0.0112, "step": 4666 }, { "epoch": 2.1232939035486806, "grad_norm": 0.9779698603253222, "learning_rate": 6.173827312414897e-07, "loss": 0.0254, "step": 4667 }, { "epoch": 2.123748862602366, "grad_norm": 0.6405433816227906, "learning_rate": 6.172437914354427e-07, "loss": 0.0183, "step": 4668 }, { "epoch": 2.124203821656051, "grad_norm": 0.61511289244104, "learning_rate": 6.171048420487627e-07, "loss": 0.0129, "step": 4669 }, { "epoch": 2.124658780709736, "grad_norm": 0.6152369268707297, "learning_rate": 6.169658830928036e-07, "loss": 0.0081, "step": 4670 }, { "epoch": 2.1251137397634214, "grad_norm": 0.7425224961830091, "learning_rate": 6.16826914578921e-07, "loss": 0.0175, "step": 4671 }, { "epoch": 2.1255686988171063, "grad_norm": 0.5856630431559786, "learning_rate": 6.166879365184704e-07, "loss": 0.0067, "step": 4672 }, { "epoch": 2.1260236578707916, "grad_norm": 0.9022946787364073, "learning_rate": 6.165489489228086e-07, "loss": 0.0322, "step": 4673 }, { "epoch": 2.126478616924477, "grad_norm": 0.5304668914169137, "learning_rate": 6.16409951803293e-07, "loss": 0.0102, "step": 4674 }, { "epoch": 2.126933575978162, "grad_norm": 0.9053401598515516, "learning_rate": 6.162709451712819e-07, "loss": 0.0123, "step": 4675 }, { "epoch": 2.127388535031847, "grad_norm": 0.8579150567369462, "learning_rate": 6.161319290381342e-07, "loss": 0.0105, "step": 4676 }, { "epoch": 2.1278434940855324, "grad_norm": 0.6979019736948572, "learning_rate": 6.159929034152097e-07, "loss": 0.0113, "step": 4677 }, { "epoch": 2.1282984531392173, "grad_norm": 0.6340440003745595, "learning_rate": 6.158538683138689e-07, "loss": 0.0173, "step": 4678 }, { "epoch": 2.1287534121929026, "grad_norm": 1.1900938744547793, "learning_rate": 6.157148237454734e-07, "loss": 0.0231, "step": 4679 }, { "epoch": 2.129208371246588, "grad_norm": 1.148633242706952, "learning_rate": 6.155757697213847e-07, "loss": 0.0271, "step": 4680 }, { "epoch": 2.1296633303002728, "grad_norm": 0.7189153017300144, "learning_rate": 6.154367062529663e-07, "loss": 0.0083, "step": 4681 }, { "epoch": 2.130118289353958, "grad_norm": 0.7283095147938174, "learning_rate": 6.152976333515815e-07, "loss": 0.0194, "step": 4682 }, { "epoch": 2.1305732484076434, "grad_norm": 0.9836178308944405, "learning_rate": 6.151585510285948e-07, "loss": 0.0113, "step": 4683 }, { "epoch": 2.1310282074613287, "grad_norm": 0.4841399844958555, "learning_rate": 6.150194592953714e-07, "loss": 0.0084, "step": 4684 }, { "epoch": 2.1314831665150136, "grad_norm": 0.6794461104827545, "learning_rate": 6.14880358163277e-07, "loss": 0.02, "step": 4685 }, { "epoch": 2.131938125568699, "grad_norm": 0.7532284957903631, "learning_rate": 6.147412476436789e-07, "loss": 0.015, "step": 4686 }, { "epoch": 2.132393084622384, "grad_norm": 1.0833388120201588, "learning_rate": 6.146021277479437e-07, "loss": 0.0224, "step": 4687 }, { "epoch": 2.132848043676069, "grad_norm": 0.7408131779166217, "learning_rate": 6.144629984874405e-07, "loss": 0.0136, "step": 4688 }, { "epoch": 2.1333030027297544, "grad_norm": 0.6889622370782262, "learning_rate": 6.143238598735381e-07, "loss": 0.021, "step": 4689 }, { "epoch": 2.1337579617834397, "grad_norm": 0.602137942324356, "learning_rate": 6.141847119176059e-07, "loss": 0.0136, "step": 4690 }, { "epoch": 2.1342129208371245, "grad_norm": 1.1126778742567274, "learning_rate": 6.140455546310148e-07, "loss": 0.029, "step": 4691 }, { "epoch": 2.13466787989081, "grad_norm": 0.7792952748746345, "learning_rate": 6.13906388025136e-07, "loss": 0.0202, "step": 4692 }, { "epoch": 2.135122838944495, "grad_norm": 0.67943357163911, "learning_rate": 6.137672121113415e-07, "loss": 0.0216, "step": 4693 }, { "epoch": 2.13557779799818, "grad_norm": 0.9018676989977321, "learning_rate": 6.136280269010043e-07, "loss": 0.0133, "step": 4694 }, { "epoch": 2.1360327570518653, "grad_norm": 0.7617782307106232, "learning_rate": 6.134888324054977e-07, "loss": 0.0203, "step": 4695 }, { "epoch": 2.1364877161055507, "grad_norm": 0.6151417910713064, "learning_rate": 6.133496286361964e-07, "loss": 0.0233, "step": 4696 }, { "epoch": 2.1369426751592355, "grad_norm": 0.6460014886195153, "learning_rate": 6.132104156044753e-07, "loss": 0.0137, "step": 4697 }, { "epoch": 2.137397634212921, "grad_norm": 0.5052964163000568, "learning_rate": 6.130711933217102e-07, "loss": 0.008, "step": 4698 }, { "epoch": 2.137852593266606, "grad_norm": 0.878002041826771, "learning_rate": 6.129319617992779e-07, "loss": 0.0093, "step": 4699 }, { "epoch": 2.138307552320291, "grad_norm": 0.6228716206510992, "learning_rate": 6.127927210485557e-07, "loss": 0.0104, "step": 4700 }, { "epoch": 2.1387625113739763, "grad_norm": 0.9602317338487865, "learning_rate": 6.126534710809216e-07, "loss": 0.0305, "step": 4701 }, { "epoch": 2.1392174704276616, "grad_norm": 0.5843796850219989, "learning_rate": 6.125142119077547e-07, "loss": 0.0137, "step": 4702 }, { "epoch": 2.1396724294813465, "grad_norm": 2.908694455603158, "learning_rate": 6.123749435404343e-07, "loss": 0.0584, "step": 4703 }, { "epoch": 2.140127388535032, "grad_norm": 0.48368118046006653, "learning_rate": 6.122356659903412e-07, "loss": 0.0058, "step": 4704 }, { "epoch": 2.140582347588717, "grad_norm": 0.5883866469020966, "learning_rate": 6.120963792688562e-07, "loss": 0.0123, "step": 4705 }, { "epoch": 2.141037306642402, "grad_norm": 0.5528427858118194, "learning_rate": 6.119570833873615e-07, "loss": 0.008, "step": 4706 }, { "epoch": 2.1414922656960873, "grad_norm": 0.8093564063699227, "learning_rate": 6.118177783572393e-07, "loss": 0.0204, "step": 4707 }, { "epoch": 2.1419472247497726, "grad_norm": 0.4553194692082459, "learning_rate": 6.116784641898734e-07, "loss": 0.0125, "step": 4708 }, { "epoch": 2.1424021838034575, "grad_norm": 0.7556115722499556, "learning_rate": 6.115391408966477e-07, "loss": 0.0122, "step": 4709 }, { "epoch": 2.142857142857143, "grad_norm": 0.5188602246105327, "learning_rate": 6.113998084889471e-07, "loss": 0.0052, "step": 4710 }, { "epoch": 2.143312101910828, "grad_norm": 0.6432105598970344, "learning_rate": 6.112604669781572e-07, "loss": 0.0147, "step": 4711 }, { "epoch": 2.143767060964513, "grad_norm": 0.9973768844168672, "learning_rate": 6.111211163756643e-07, "loss": 0.0319, "step": 4712 }, { "epoch": 2.1442220200181983, "grad_norm": 0.7726106667923719, "learning_rate": 6.109817566928559e-07, "loss": 0.0147, "step": 4713 }, { "epoch": 2.1446769790718836, "grad_norm": 0.8470856575008106, "learning_rate": 6.108423879411192e-07, "loss": 0.0147, "step": 4714 }, { "epoch": 2.145131938125569, "grad_norm": 0.8360078268964758, "learning_rate": 6.107030101318433e-07, "loss": 0.0111, "step": 4715 }, { "epoch": 2.1455868971792538, "grad_norm": 0.6008607553383948, "learning_rate": 6.105636232764172e-07, "loss": 0.0121, "step": 4716 }, { "epoch": 2.146041856232939, "grad_norm": 0.8962147688972367, "learning_rate": 6.104242273862313e-07, "loss": 0.0124, "step": 4717 }, { "epoch": 2.1464968152866244, "grad_norm": 0.9406619988019391, "learning_rate": 6.102848224726759e-07, "loss": 0.0213, "step": 4718 }, { "epoch": 2.1469517743403093, "grad_norm": 0.48915899326783674, "learning_rate": 6.101454085471431e-07, "loss": 0.0067, "step": 4719 }, { "epoch": 2.1474067333939946, "grad_norm": 0.8693830820002793, "learning_rate": 6.10005985621025e-07, "loss": 0.0211, "step": 4720 }, { "epoch": 2.14786169244768, "grad_norm": 1.0107079846522762, "learning_rate": 6.098665537057144e-07, "loss": 0.0178, "step": 4721 }, { "epoch": 2.1483166515013647, "grad_norm": 1.0140924604304205, "learning_rate": 6.097271128126051e-07, "loss": 0.0241, "step": 4722 }, { "epoch": 2.14877161055505, "grad_norm": 0.7485886595469591, "learning_rate": 6.095876629530917e-07, "loss": 0.0213, "step": 4723 }, { "epoch": 2.1492265696087354, "grad_norm": 0.6644186232438488, "learning_rate": 6.094482041385696e-07, "loss": 0.0104, "step": 4724 }, { "epoch": 2.1496815286624202, "grad_norm": 0.8770613492712995, "learning_rate": 6.093087363804344e-07, "loss": 0.0216, "step": 4725 }, { "epoch": 2.1501364877161055, "grad_norm": 0.8667942328970484, "learning_rate": 6.091692596900827e-07, "loss": 0.0179, "step": 4726 }, { "epoch": 2.150591446769791, "grad_norm": 0.598211339995204, "learning_rate": 6.090297740789124e-07, "loss": 0.0124, "step": 4727 }, { "epoch": 2.1510464058234757, "grad_norm": 0.7910141600196928, "learning_rate": 6.08890279558321e-07, "loss": 0.0177, "step": 4728 }, { "epoch": 2.151501364877161, "grad_norm": 0.6590392595908229, "learning_rate": 6.08750776139708e-07, "loss": 0.0143, "step": 4729 }, { "epoch": 2.1519563239308463, "grad_norm": 73.50998650457646, "learning_rate": 6.086112638344726e-07, "loss": 0.1079, "step": 4730 }, { "epoch": 2.152411282984531, "grad_norm": 0.6766658920282804, "learning_rate": 6.084717426540152e-07, "loss": 0.0109, "step": 4731 }, { "epoch": 2.1528662420382165, "grad_norm": 0.5496973894294638, "learning_rate": 6.083322126097369e-07, "loss": 0.0079, "step": 4732 }, { "epoch": 2.153321201091902, "grad_norm": 0.7351521668218293, "learning_rate": 6.081926737130392e-07, "loss": 0.0139, "step": 4733 }, { "epoch": 2.1537761601455867, "grad_norm": 0.8168976095792456, "learning_rate": 6.08053125975325e-07, "loss": 0.0103, "step": 4734 }, { "epoch": 2.154231119199272, "grad_norm": 1.224678649439618, "learning_rate": 6.079135694079972e-07, "loss": 0.0133, "step": 4735 }, { "epoch": 2.1546860782529573, "grad_norm": 0.8030945303761632, "learning_rate": 6.077740040224599e-07, "loss": 0.0249, "step": 4736 }, { "epoch": 2.1551410373066426, "grad_norm": 0.7218021944752535, "learning_rate": 6.076344298301177e-07, "loss": 0.0121, "step": 4737 }, { "epoch": 2.1555959963603275, "grad_norm": 0.84867955897945, "learning_rate": 6.074948468423759e-07, "loss": 0.0372, "step": 4738 }, { "epoch": 2.156050955414013, "grad_norm": 0.5939738845671995, "learning_rate": 6.073552550706407e-07, "loss": 0.0131, "step": 4739 }, { "epoch": 2.156505914467698, "grad_norm": 0.6154602184442761, "learning_rate": 6.072156545263189e-07, "loss": 0.0107, "step": 4740 }, { "epoch": 2.156960873521383, "grad_norm": 1.0091375835643532, "learning_rate": 6.07076045220818e-07, "loss": 0.0225, "step": 4741 }, { "epoch": 2.1574158325750683, "grad_norm": 0.8395701667838452, "learning_rate": 6.069364271655463e-07, "loss": 0.0199, "step": 4742 }, { "epoch": 2.1578707916287536, "grad_norm": 0.7220994235392331, "learning_rate": 6.067968003719124e-07, "loss": 0.0174, "step": 4743 }, { "epoch": 2.1583257506824385, "grad_norm": 1.088956556454313, "learning_rate": 6.066571648513265e-07, "loss": 0.0262, "step": 4744 }, { "epoch": 2.158780709736124, "grad_norm": 0.6282884294610442, "learning_rate": 6.065175206151988e-07, "loss": 0.0178, "step": 4745 }, { "epoch": 2.159235668789809, "grad_norm": 1.3098463429742226, "learning_rate": 6.063778676749403e-07, "loss": 0.0294, "step": 4746 }, { "epoch": 2.159690627843494, "grad_norm": 1.0310356990550216, "learning_rate": 6.062382060419628e-07, "loss": 0.0247, "step": 4747 }, { "epoch": 2.1601455868971793, "grad_norm": 0.933497183468962, "learning_rate": 6.060985357276788e-07, "loss": 0.0166, "step": 4748 }, { "epoch": 2.1606005459508646, "grad_norm": 0.8113542697763614, "learning_rate": 6.059588567435017e-07, "loss": 0.0117, "step": 4749 }, { "epoch": 2.1610555050045495, "grad_norm": 0.9611156289206068, "learning_rate": 6.058191691008452e-07, "loss": 0.0265, "step": 4750 }, { "epoch": 2.1615104640582348, "grad_norm": 1.4895107739827003, "learning_rate": 6.056794728111243e-07, "loss": 0.0404, "step": 4751 }, { "epoch": 2.16196542311192, "grad_norm": 0.5973851536205976, "learning_rate": 6.055397678857539e-07, "loss": 0.0082, "step": 4752 }, { "epoch": 2.162420382165605, "grad_norm": 0.8835672620880651, "learning_rate": 6.054000543361505e-07, "loss": 0.0281, "step": 4753 }, { "epoch": 2.1628753412192903, "grad_norm": 0.9552137057837596, "learning_rate": 6.052603321737305e-07, "loss": 0.031, "step": 4754 }, { "epoch": 2.1633303002729756, "grad_norm": 0.7413813395803579, "learning_rate": 6.051206014099116e-07, "loss": 0.0112, "step": 4755 }, { "epoch": 2.1637852593266604, "grad_norm": 0.6910299644623139, "learning_rate": 6.049808620561118e-07, "loss": 0.0124, "step": 4756 }, { "epoch": 2.1642402183803457, "grad_norm": 0.8562384987999563, "learning_rate": 6.048411141237499e-07, "loss": 0.0232, "step": 4757 }, { "epoch": 2.164695177434031, "grad_norm": 0.4338970259652208, "learning_rate": 6.047013576242458e-07, "loss": 0.0044, "step": 4758 }, { "epoch": 2.165150136487716, "grad_norm": 0.9396325236183273, "learning_rate": 6.045615925690195e-07, "loss": 0.0265, "step": 4759 }, { "epoch": 2.1656050955414012, "grad_norm": 0.617702642701776, "learning_rate": 6.044218189694921e-07, "loss": 0.0108, "step": 4760 }, { "epoch": 2.1660600545950865, "grad_norm": 1.174084014978862, "learning_rate": 6.042820368370853e-07, "loss": 0.0313, "step": 4761 }, { "epoch": 2.1665150136487714, "grad_norm": 0.7347039835147465, "learning_rate": 6.041422461832213e-07, "loss": 0.0188, "step": 4762 }, { "epoch": 2.1669699727024567, "grad_norm": 0.9757170001691282, "learning_rate": 6.040024470193232e-07, "loss": 0.0382, "step": 4763 }, { "epoch": 2.167424931756142, "grad_norm": 0.7173700647433809, "learning_rate": 6.038626393568149e-07, "loss": 0.0142, "step": 4764 }, { "epoch": 2.167879890809827, "grad_norm": 1.0161472601083883, "learning_rate": 6.037228232071207e-07, "loss": 0.018, "step": 4765 }, { "epoch": 2.168334849863512, "grad_norm": 0.8619170646411755, "learning_rate": 6.035829985816659e-07, "loss": 0.012, "step": 4766 }, { "epoch": 2.1687898089171975, "grad_norm": 0.7831827744375013, "learning_rate": 6.034431654918761e-07, "loss": 0.0153, "step": 4767 }, { "epoch": 2.1692447679708824, "grad_norm": 0.8202863100545227, "learning_rate": 6.033033239491778e-07, "loss": 0.0128, "step": 4768 }, { "epoch": 2.1696997270245677, "grad_norm": 1.1770296653097425, "learning_rate": 6.031634739649985e-07, "loss": 0.0277, "step": 4769 }, { "epoch": 2.170154686078253, "grad_norm": 0.9800588034339672, "learning_rate": 6.030236155507662e-07, "loss": 0.0202, "step": 4770 }, { "epoch": 2.1706096451319383, "grad_norm": 0.7146635022328155, "learning_rate": 6.028837487179091e-07, "loss": 0.0205, "step": 4771 }, { "epoch": 2.171064604185623, "grad_norm": 0.7200877467027755, "learning_rate": 6.027438734778567e-07, "loss": 0.0202, "step": 4772 }, { "epoch": 2.1715195632393085, "grad_norm": 1.8210649655145088, "learning_rate": 6.026039898420391e-07, "loss": 0.0388, "step": 4773 }, { "epoch": 2.171974522292994, "grad_norm": 1.212000850159266, "learning_rate": 6.024640978218866e-07, "loss": 0.0208, "step": 4774 }, { "epoch": 2.1724294813466787, "grad_norm": 0.6431084014555236, "learning_rate": 6.023241974288308e-07, "loss": 0.0271, "step": 4775 }, { "epoch": 2.172884440400364, "grad_norm": 0.8349816395736454, "learning_rate": 6.021842886743036e-07, "loss": 0.0189, "step": 4776 }, { "epoch": 2.1733393994540493, "grad_norm": 1.7290539429082525, "learning_rate": 6.02044371569738e-07, "loss": 0.0205, "step": 4777 }, { "epoch": 2.173794358507734, "grad_norm": 0.7336713564369388, "learning_rate": 6.019044461265671e-07, "loss": 0.0237, "step": 4778 }, { "epoch": 2.1742493175614195, "grad_norm": 0.8488761194528179, "learning_rate": 6.017645123562249e-07, "loss": 0.015, "step": 4779 }, { "epoch": 2.174704276615105, "grad_norm": 0.5491982623646381, "learning_rate": 6.016245702701466e-07, "loss": 0.0092, "step": 4780 }, { "epoch": 2.1751592356687897, "grad_norm": 0.6921510019971605, "learning_rate": 6.014846198797672e-07, "loss": 0.0132, "step": 4781 }, { "epoch": 2.175614194722475, "grad_norm": 0.851928163998067, "learning_rate": 6.013446611965228e-07, "loss": 0.0174, "step": 4782 }, { "epoch": 2.1760691537761603, "grad_norm": 0.4962984009260156, "learning_rate": 6.012046942318506e-07, "loss": 0.0099, "step": 4783 }, { "epoch": 2.176524112829845, "grad_norm": 1.066194239213503, "learning_rate": 6.010647189971878e-07, "loss": 0.0228, "step": 4784 }, { "epoch": 2.1769790718835305, "grad_norm": 0.857605114051159, "learning_rate": 6.009247355039724e-07, "loss": 0.0289, "step": 4785 }, { "epoch": 2.1774340309372158, "grad_norm": 0.7274608688905312, "learning_rate": 6.007847437636436e-07, "loss": 0.0071, "step": 4786 }, { "epoch": 2.1778889899909006, "grad_norm": 0.6584947094303737, "learning_rate": 6.006447437876405e-07, "loss": 0.0191, "step": 4787 }, { "epoch": 2.178343949044586, "grad_norm": 0.8429275821780892, "learning_rate": 6.005047355874035e-07, "loss": 0.0177, "step": 4788 }, { "epoch": 2.1787989080982713, "grad_norm": 1.6559753658484029, "learning_rate": 6.003647191743734e-07, "loss": 0.0155, "step": 4789 }, { "epoch": 2.179253867151956, "grad_norm": 0.9765234066683443, "learning_rate": 6.002246945599917e-07, "loss": 0.0243, "step": 4790 }, { "epoch": 2.1797088262056414, "grad_norm": 0.5865357922987434, "learning_rate": 6.000846617557006e-07, "loss": 0.0123, "step": 4791 }, { "epoch": 2.1801637852593267, "grad_norm": 0.8712672814024967, "learning_rate": 5.999446207729428e-07, "loss": 0.0161, "step": 4792 }, { "epoch": 2.180618744313012, "grad_norm": 0.9202551072562114, "learning_rate": 5.998045716231621e-07, "loss": 0.0087, "step": 4793 }, { "epoch": 2.181073703366697, "grad_norm": 0.5919710659541628, "learning_rate": 5.996645143178024e-07, "loss": 0.0076, "step": 4794 }, { "epoch": 2.1815286624203822, "grad_norm": 0.7306688662271053, "learning_rate": 5.995244488683088e-07, "loss": 0.0161, "step": 4795 }, { "epoch": 2.1819836214740675, "grad_norm": 0.7584266819304563, "learning_rate": 5.993843752861265e-07, "loss": 0.0098, "step": 4796 }, { "epoch": 2.1824385805277524, "grad_norm": 0.7919665758798171, "learning_rate": 5.99244293582702e-07, "loss": 0.0143, "step": 4797 }, { "epoch": 2.1828935395814377, "grad_norm": 1.0101897992541895, "learning_rate": 5.991042037694819e-07, "loss": 0.0071, "step": 4798 }, { "epoch": 2.183348498635123, "grad_norm": 0.8008042988662859, "learning_rate": 5.989641058579139e-07, "loss": 0.0175, "step": 4799 }, { "epoch": 2.183803457688808, "grad_norm": 0.9702446282648516, "learning_rate": 5.988239998594462e-07, "loss": 0.0242, "step": 4800 }, { "epoch": 2.184258416742493, "grad_norm": 1.2626845079354148, "learning_rate": 5.986838857855273e-07, "loss": 0.0144, "step": 4801 }, { "epoch": 2.1847133757961785, "grad_norm": 0.6622530152425987, "learning_rate": 5.985437636476072e-07, "loss": 0.014, "step": 4802 }, { "epoch": 2.1851683348498634, "grad_norm": 1.338435062556089, "learning_rate": 5.984036334571353e-07, "loss": 0.0518, "step": 4803 }, { "epoch": 2.1856232939035487, "grad_norm": 1.0239303189079134, "learning_rate": 5.982634952255632e-07, "loss": 0.0399, "step": 4804 }, { "epoch": 2.186078252957234, "grad_norm": 0.6846856421738714, "learning_rate": 5.98123348964342e-07, "loss": 0.0197, "step": 4805 }, { "epoch": 2.186533212010919, "grad_norm": 0.6338089093743701, "learning_rate": 5.979831946849236e-07, "loss": 0.0117, "step": 4806 }, { "epoch": 2.186988171064604, "grad_norm": 0.8648121220376743, "learning_rate": 5.978430323987614e-07, "loss": 0.02, "step": 4807 }, { "epoch": 2.1874431301182895, "grad_norm": 0.6591708817213561, "learning_rate": 5.977028621173082e-07, "loss": 0.0181, "step": 4808 }, { "epoch": 2.1878980891719744, "grad_norm": 0.6431989771675228, "learning_rate": 5.975626838520184e-07, "loss": 0.0129, "step": 4809 }, { "epoch": 2.1883530482256597, "grad_norm": 0.7878738959105084, "learning_rate": 5.974224976143466e-07, "loss": 0.0269, "step": 4810 }, { "epoch": 2.188808007279345, "grad_norm": 1.2666910215667888, "learning_rate": 5.972823034157484e-07, "loss": 0.0265, "step": 4811 }, { "epoch": 2.18926296633303, "grad_norm": 0.6543424557000688, "learning_rate": 5.971421012676795e-07, "loss": 0.0153, "step": 4812 }, { "epoch": 2.189717925386715, "grad_norm": 0.6063528752139031, "learning_rate": 5.970018911815968e-07, "loss": 0.0168, "step": 4813 }, { "epoch": 2.1901728844404005, "grad_norm": 0.737878072933376, "learning_rate": 5.968616731689577e-07, "loss": 0.0104, "step": 4814 }, { "epoch": 2.1906278434940853, "grad_norm": 0.8774988680399733, "learning_rate": 5.967214472412201e-07, "loss": 0.028, "step": 4815 }, { "epoch": 2.1910828025477707, "grad_norm": 0.6897032334181322, "learning_rate": 5.965812134098427e-07, "loss": 0.015, "step": 4816 }, { "epoch": 2.191537761601456, "grad_norm": 0.9723187008251271, "learning_rate": 5.964409716862845e-07, "loss": 0.0213, "step": 4817 }, { "epoch": 2.191992720655141, "grad_norm": 1.000658602360521, "learning_rate": 5.963007220820057e-07, "loss": 0.0173, "step": 4818 }, { "epoch": 2.192447679708826, "grad_norm": 0.9701596149579195, "learning_rate": 5.961604646084668e-07, "loss": 0.0163, "step": 4819 }, { "epoch": 2.1929026387625115, "grad_norm": 1.0014600893229917, "learning_rate": 5.960201992771288e-07, "loss": 0.0152, "step": 4820 }, { "epoch": 2.1933575978161963, "grad_norm": 0.40988985977716924, "learning_rate": 5.958799260994541e-07, "loss": 0.0096, "step": 4821 }, { "epoch": 2.1938125568698816, "grad_norm": 0.5088097748645461, "learning_rate": 5.957396450869045e-07, "loss": 0.0111, "step": 4822 }, { "epoch": 2.194267515923567, "grad_norm": 0.540119219496253, "learning_rate": 5.955993562509437e-07, "loss": 0.0092, "step": 4823 }, { "epoch": 2.194722474977252, "grad_norm": 0.6413643147934721, "learning_rate": 5.954590596030351e-07, "loss": 0.0097, "step": 4824 }, { "epoch": 2.195177434030937, "grad_norm": 0.8021180659604746, "learning_rate": 5.953187551546432e-07, "loss": 0.024, "step": 4825 }, { "epoch": 2.1956323930846224, "grad_norm": 0.7137060588823009, "learning_rate": 5.951784429172333e-07, "loss": 0.0217, "step": 4826 }, { "epoch": 2.1960873521383077, "grad_norm": 2.0512936331817686, "learning_rate": 5.950381229022705e-07, "loss": 0.0198, "step": 4827 }, { "epoch": 2.1965423111919926, "grad_norm": 0.867845373421944, "learning_rate": 5.948977951212218e-07, "loss": 0.0339, "step": 4828 }, { "epoch": 2.196997270245678, "grad_norm": 0.5865987442996835, "learning_rate": 5.947574595855538e-07, "loss": 0.0146, "step": 4829 }, { "epoch": 2.1974522292993632, "grad_norm": 0.5791913731742199, "learning_rate": 5.946171163067341e-07, "loss": 0.0145, "step": 4830 }, { "epoch": 2.197907188353048, "grad_norm": 0.8151170819659437, "learning_rate": 5.944767652962308e-07, "loss": 0.0189, "step": 4831 }, { "epoch": 2.1983621474067334, "grad_norm": 0.7724955493889337, "learning_rate": 5.94336406565513e-07, "loss": 0.0227, "step": 4832 }, { "epoch": 2.1988171064604187, "grad_norm": 0.9071089771498623, "learning_rate": 5.941960401260501e-07, "loss": 0.0362, "step": 4833 }, { "epoch": 2.1992720655141036, "grad_norm": 0.5466558692202488, "learning_rate": 5.940556659893122e-07, "loss": 0.0113, "step": 4834 }, { "epoch": 2.199727024567789, "grad_norm": 0.6456829449561466, "learning_rate": 5.9391528416677e-07, "loss": 0.0093, "step": 4835 }, { "epoch": 2.200181983621474, "grad_norm": 0.5461309128933698, "learning_rate": 5.93774894669895e-07, "loss": 0.0054, "step": 4836 }, { "epoch": 2.200636942675159, "grad_norm": 0.7405052662149527, "learning_rate": 5.936344975101588e-07, "loss": 0.0148, "step": 4837 }, { "epoch": 2.2010919017288444, "grad_norm": 0.7186409307924999, "learning_rate": 5.934940926990345e-07, "loss": 0.0209, "step": 4838 }, { "epoch": 2.2015468607825297, "grad_norm": 1.1340037699877035, "learning_rate": 5.933536802479952e-07, "loss": 0.0262, "step": 4839 }, { "epoch": 2.2020018198362146, "grad_norm": 0.6531178188509822, "learning_rate": 5.932132601685147e-07, "loss": 0.0154, "step": 4840 }, { "epoch": 2.2024567788899, "grad_norm": 0.8535805927314135, "learning_rate": 5.930728324720676e-07, "loss": 0.0126, "step": 4841 }, { "epoch": 2.202911737943585, "grad_norm": 0.8054148450659417, "learning_rate": 5.929323971701286e-07, "loss": 0.0172, "step": 4842 }, { "epoch": 2.20336669699727, "grad_norm": 0.7893419077788153, "learning_rate": 5.927919542741742e-07, "loss": 0.0187, "step": 4843 }, { "epoch": 2.2038216560509554, "grad_norm": 0.3593931114616576, "learning_rate": 5.926515037956801e-07, "loss": 0.0028, "step": 4844 }, { "epoch": 2.2042766151046407, "grad_norm": 0.830483887661117, "learning_rate": 5.925110457461236e-07, "loss": 0.0226, "step": 4845 }, { "epoch": 2.2047315741583255, "grad_norm": 0.9028647457553944, "learning_rate": 5.923705801369821e-07, "loss": 0.0124, "step": 4846 }, { "epoch": 2.205186533212011, "grad_norm": 0.6088222127773136, "learning_rate": 5.922301069797342e-07, "loss": 0.0096, "step": 4847 }, { "epoch": 2.205641492265696, "grad_norm": 0.6396753207276695, "learning_rate": 5.920896262858582e-07, "loss": 0.0065, "step": 4848 }, { "epoch": 2.2060964513193815, "grad_norm": 0.508758753079919, "learning_rate": 5.91949138066834e-07, "loss": 0.007, "step": 4849 }, { "epoch": 2.2065514103730663, "grad_norm": 1.024959133318094, "learning_rate": 5.918086423341414e-07, "loss": 0.0207, "step": 4850 }, { "epoch": 2.2070063694267517, "grad_norm": 0.7434029981709787, "learning_rate": 5.916681390992612e-07, "loss": 0.011, "step": 4851 }, { "epoch": 2.207461328480437, "grad_norm": 0.7403445876267135, "learning_rate": 5.915276283736745e-07, "loss": 0.0118, "step": 4852 }, { "epoch": 2.207916287534122, "grad_norm": 0.7748084808275602, "learning_rate": 5.913871101688635e-07, "loss": 0.0141, "step": 4853 }, { "epoch": 2.208371246587807, "grad_norm": 0.7510305801933072, "learning_rate": 5.912465844963105e-07, "loss": 0.0196, "step": 4854 }, { "epoch": 2.2088262056414925, "grad_norm": 0.9689887642686953, "learning_rate": 5.911060513674986e-07, "loss": 0.0192, "step": 4855 }, { "epoch": 2.2092811646951773, "grad_norm": 0.6691431011668686, "learning_rate": 5.909655107939117e-07, "loss": 0.0134, "step": 4856 }, { "epoch": 2.2097361237488626, "grad_norm": 1.086819149355037, "learning_rate": 5.90824962787034e-07, "loss": 0.0319, "step": 4857 }, { "epoch": 2.210191082802548, "grad_norm": 0.6111099649458206, "learning_rate": 5.906844073583505e-07, "loss": 0.015, "step": 4858 }, { "epoch": 2.210646041856233, "grad_norm": 1.0625781411516528, "learning_rate": 5.905438445193469e-07, "loss": 0.0257, "step": 4859 }, { "epoch": 2.211101000909918, "grad_norm": 1.3921052274540957, "learning_rate": 5.904032742815092e-07, "loss": 0.0281, "step": 4860 }, { "epoch": 2.2115559599636034, "grad_norm": 0.7887257888325875, "learning_rate": 5.90262696656324e-07, "loss": 0.0124, "step": 4861 }, { "epoch": 2.2120109190172883, "grad_norm": 0.6148405782319505, "learning_rate": 5.90122111655279e-07, "loss": 0.0116, "step": 4862 }, { "epoch": 2.2124658780709736, "grad_norm": 0.8441322253036698, "learning_rate": 5.899815192898619e-07, "loss": 0.0218, "step": 4863 }, { "epoch": 2.212920837124659, "grad_norm": 0.7751554235336439, "learning_rate": 5.898409195715616e-07, "loss": 0.0197, "step": 4864 }, { "epoch": 2.213375796178344, "grad_norm": 0.5610010203562027, "learning_rate": 5.897003125118669e-07, "loss": 0.0103, "step": 4865 }, { "epoch": 2.213830755232029, "grad_norm": 0.5860602577245694, "learning_rate": 5.895596981222678e-07, "loss": 0.0136, "step": 4866 }, { "epoch": 2.2142857142857144, "grad_norm": 0.8629707559796616, "learning_rate": 5.894190764142547e-07, "loss": 0.0219, "step": 4867 }, { "epoch": 2.2147406733393993, "grad_norm": 0.713306091404184, "learning_rate": 5.892784473993183e-07, "loss": 0.0185, "step": 4868 }, { "epoch": 2.2151956323930846, "grad_norm": 0.7155743486773385, "learning_rate": 5.891378110889505e-07, "loss": 0.0223, "step": 4869 }, { "epoch": 2.21565059144677, "grad_norm": 0.8729832988669289, "learning_rate": 5.889971674946433e-07, "loss": 0.0207, "step": 4870 }, { "epoch": 2.2161055505004548, "grad_norm": 0.6700363169596776, "learning_rate": 5.888565166278893e-07, "loss": 0.0169, "step": 4871 }, { "epoch": 2.21656050955414, "grad_norm": 0.8476415895959791, "learning_rate": 5.887158585001824e-07, "loss": 0.0274, "step": 4872 }, { "epoch": 2.2170154686078254, "grad_norm": 0.6445115797606633, "learning_rate": 5.885751931230158e-07, "loss": 0.0101, "step": 4873 }, { "epoch": 2.2174704276615103, "grad_norm": 0.6101641330243461, "learning_rate": 5.884345205078847e-07, "loss": 0.013, "step": 4874 }, { "epoch": 2.2179253867151956, "grad_norm": 0.8279643148121588, "learning_rate": 5.882938406662839e-07, "loss": 0.0231, "step": 4875 }, { "epoch": 2.218380345768881, "grad_norm": 0.7797864535790622, "learning_rate": 5.88153153609709e-07, "loss": 0.0144, "step": 4876 }, { "epoch": 2.2188353048225657, "grad_norm": 0.8383352795675124, "learning_rate": 5.880124593496567e-07, "loss": 0.023, "step": 4877 }, { "epoch": 2.219290263876251, "grad_norm": 0.6240917288486707, "learning_rate": 5.878717578976235e-07, "loss": 0.0126, "step": 4878 }, { "epoch": 2.2197452229299364, "grad_norm": 0.5990016505811259, "learning_rate": 5.877310492651072e-07, "loss": 0.0148, "step": 4879 }, { "epoch": 2.2202001819836217, "grad_norm": 0.6030503375494294, "learning_rate": 5.875903334636056e-07, "loss": 0.0242, "step": 4880 }, { "epoch": 2.2206551410373065, "grad_norm": 0.921557087472156, "learning_rate": 5.874496105046176e-07, "loss": 0.0228, "step": 4881 }, { "epoch": 2.221110100090992, "grad_norm": 0.6658777913313362, "learning_rate": 5.873088803996423e-07, "loss": 0.0121, "step": 4882 }, { "epoch": 2.221565059144677, "grad_norm": 0.6962685835993243, "learning_rate": 5.871681431601796e-07, "loss": 0.0121, "step": 4883 }, { "epoch": 2.222020018198362, "grad_norm": 0.6452971350120558, "learning_rate": 5.8702739879773e-07, "loss": 0.0264, "step": 4884 }, { "epoch": 2.2224749772520473, "grad_norm": 0.5955785752593303, "learning_rate": 5.868866473237943e-07, "loss": 0.0192, "step": 4885 }, { "epoch": 2.2229299363057327, "grad_norm": 1.0819298252369032, "learning_rate": 5.867458887498742e-07, "loss": 0.0326, "step": 4886 }, { "epoch": 2.2233848953594175, "grad_norm": 0.5795969093083166, "learning_rate": 5.866051230874718e-07, "loss": 0.0095, "step": 4887 }, { "epoch": 2.223839854413103, "grad_norm": 0.9118341237363049, "learning_rate": 5.864643503480898e-07, "loss": 0.0149, "step": 4888 }, { "epoch": 2.224294813466788, "grad_norm": 0.8253747637844426, "learning_rate": 5.863235705432315e-07, "loss": 0.0294, "step": 4889 }, { "epoch": 2.224749772520473, "grad_norm": 0.44981424385315727, "learning_rate": 5.861827836844008e-07, "loss": 0.0106, "step": 4890 }, { "epoch": 2.2252047315741583, "grad_norm": 0.7366801512103174, "learning_rate": 5.860419897831024e-07, "loss": 0.0196, "step": 4891 }, { "epoch": 2.2256596906278436, "grad_norm": 0.9562843121854071, "learning_rate": 5.859011888508411e-07, "loss": 0.0204, "step": 4892 }, { "epoch": 2.2261146496815285, "grad_norm": 0.8853435169520595, "learning_rate": 5.857603808991227e-07, "loss": 0.016, "step": 4893 }, { "epoch": 2.226569608735214, "grad_norm": 0.5481757595243599, "learning_rate": 5.85619565939453e-07, "loss": 0.015, "step": 4894 }, { "epoch": 2.227024567788899, "grad_norm": 0.995203582872904, "learning_rate": 5.854787439833392e-07, "loss": 0.0153, "step": 4895 }, { "epoch": 2.227479526842584, "grad_norm": 0.8591323601896483, "learning_rate": 5.853379150422885e-07, "loss": 0.024, "step": 4896 }, { "epoch": 2.2279344858962693, "grad_norm": 1.5270189647786876, "learning_rate": 5.851970791278085e-07, "loss": 0.0159, "step": 4897 }, { "epoch": 2.2283894449499546, "grad_norm": 0.5988834337847273, "learning_rate": 5.850562362514082e-07, "loss": 0.013, "step": 4898 }, { "epoch": 2.2288444040036395, "grad_norm": 0.7266897978286119, "learning_rate": 5.849153864245962e-07, "loss": 0.0144, "step": 4899 }, { "epoch": 2.229299363057325, "grad_norm": 0.5242280528845007, "learning_rate": 5.847745296588826e-07, "loss": 0.0112, "step": 4900 }, { "epoch": 2.22975432211101, "grad_norm": 0.7846529475158048, "learning_rate": 5.84633665965777e-07, "loss": 0.0191, "step": 4901 }, { "epoch": 2.2302092811646954, "grad_norm": 0.7708149446189171, "learning_rate": 5.844927953567905e-07, "loss": 0.0158, "step": 4902 }, { "epoch": 2.2306642402183803, "grad_norm": 0.9661816320273522, "learning_rate": 5.843519178434344e-07, "loss": 0.0309, "step": 4903 }, { "epoch": 2.2311191992720656, "grad_norm": 1.0382188857038626, "learning_rate": 5.842110334372203e-07, "loss": 0.0239, "step": 4904 }, { "epoch": 2.231574158325751, "grad_norm": 0.5937707914288138, "learning_rate": 5.84070142149661e-07, "loss": 0.0111, "step": 4905 }, { "epoch": 2.2320291173794358, "grad_norm": 0.49243343775459997, "learning_rate": 5.839292439922694e-07, "loss": 0.0114, "step": 4906 }, { "epoch": 2.232484076433121, "grad_norm": 0.6069723001558519, "learning_rate": 5.837883389765589e-07, "loss": 0.019, "step": 4907 }, { "epoch": 2.2329390354868064, "grad_norm": 0.7247819029867834, "learning_rate": 5.836474271140437e-07, "loss": 0.015, "step": 4908 }, { "epoch": 2.2333939945404913, "grad_norm": 0.5415288385626627, "learning_rate": 5.835065084162384e-07, "loss": 0.0076, "step": 4909 }, { "epoch": 2.2338489535941766, "grad_norm": 0.9597282037122461, "learning_rate": 5.833655828946586e-07, "loss": 0.0098, "step": 4910 }, { "epoch": 2.234303912647862, "grad_norm": 0.8986042771442763, "learning_rate": 5.832246505608197e-07, "loss": 0.0237, "step": 4911 }, { "epoch": 2.2347588717015467, "grad_norm": 1.0503513335196353, "learning_rate": 5.830837114262383e-07, "loss": 0.0339, "step": 4912 }, { "epoch": 2.235213830755232, "grad_norm": 0.7851954328159694, "learning_rate": 5.829427655024311e-07, "loss": 0.0145, "step": 4913 }, { "epoch": 2.2356687898089174, "grad_norm": 0.6677742135033028, "learning_rate": 5.828018128009156e-07, "loss": 0.0181, "step": 4914 }, { "epoch": 2.2361237488626022, "grad_norm": 1.4344856970511959, "learning_rate": 5.8266085333321e-07, "loss": 0.0205, "step": 4915 }, { "epoch": 2.2365787079162875, "grad_norm": 0.6801968714137967, "learning_rate": 5.825198871108327e-07, "loss": 0.0098, "step": 4916 }, { "epoch": 2.237033666969973, "grad_norm": 0.8194164522982973, "learning_rate": 5.823789141453031e-07, "loss": 0.0193, "step": 4917 }, { "epoch": 2.2374886260236577, "grad_norm": 0.8795209698025059, "learning_rate": 5.822379344481404e-07, "loss": 0.0198, "step": 4918 }, { "epoch": 2.237943585077343, "grad_norm": 1.870978481950137, "learning_rate": 5.820969480308652e-07, "loss": 0.0221, "step": 4919 }, { "epoch": 2.2383985441310283, "grad_norm": 0.7287706416193043, "learning_rate": 5.819559549049982e-07, "loss": 0.0136, "step": 4920 }, { "epoch": 2.238853503184713, "grad_norm": 0.7871926555409184, "learning_rate": 5.818149550820604e-07, "loss": 0.0216, "step": 4921 }, { "epoch": 2.2393084622383985, "grad_norm": 0.6894026887824042, "learning_rate": 5.816739485735742e-07, "loss": 0.0123, "step": 4922 }, { "epoch": 2.239763421292084, "grad_norm": 0.7286711975233446, "learning_rate": 5.815329353910618e-07, "loss": 0.0224, "step": 4923 }, { "epoch": 2.2402183803457687, "grad_norm": 0.745989563334821, "learning_rate": 5.81391915546046e-07, "loss": 0.0093, "step": 4924 }, { "epoch": 2.240673339399454, "grad_norm": 0.7973838662472517, "learning_rate": 5.812508890500502e-07, "loss": 0.0165, "step": 4925 }, { "epoch": 2.2411282984531393, "grad_norm": 0.7057199987596996, "learning_rate": 5.811098559145991e-07, "loss": 0.0143, "step": 4926 }, { "epoch": 2.241583257506824, "grad_norm": 2.7207152403157253, "learning_rate": 5.809688161512166e-07, "loss": 0.0526, "step": 4927 }, { "epoch": 2.2420382165605095, "grad_norm": 0.7541966763891808, "learning_rate": 5.808277697714282e-07, "loss": 0.0175, "step": 4928 }, { "epoch": 2.242493175614195, "grad_norm": 1.4649538529219372, "learning_rate": 5.806867167867595e-07, "loss": 0.015, "step": 4929 }, { "epoch": 2.2429481346678797, "grad_norm": 0.7898981459720023, "learning_rate": 5.805456572087366e-07, "loss": 0.0099, "step": 4930 }, { "epoch": 2.243403093721565, "grad_norm": 54.00418051847381, "learning_rate": 5.804045910488863e-07, "loss": 0.1248, "step": 4931 }, { "epoch": 2.2438580527752503, "grad_norm": 0.7040557861576308, "learning_rate": 5.802635183187358e-07, "loss": 0.0141, "step": 4932 }, { "epoch": 2.244313011828935, "grad_norm": 0.9284026364547717, "learning_rate": 5.801224390298134e-07, "loss": 0.0261, "step": 4933 }, { "epoch": 2.2447679708826205, "grad_norm": 0.6497254705743745, "learning_rate": 5.799813531936469e-07, "loss": 0.0229, "step": 4934 }, { "epoch": 2.245222929936306, "grad_norm": 0.8064494513362062, "learning_rate": 5.798402608217654e-07, "loss": 0.0165, "step": 4935 }, { "epoch": 2.245677888989991, "grad_norm": 0.8272544400445351, "learning_rate": 5.796991619256984e-07, "loss": 0.0208, "step": 4936 }, { "epoch": 2.246132848043676, "grad_norm": 0.7790679305518146, "learning_rate": 5.795580565169759e-07, "loss": 0.0266, "step": 4937 }, { "epoch": 2.2465878070973613, "grad_norm": 0.9068151758928057, "learning_rate": 5.794169446071282e-07, "loss": 0.0351, "step": 4938 }, { "epoch": 2.2470427661510466, "grad_norm": 1.7946363397826328, "learning_rate": 5.792758262076863e-07, "loss": 0.0554, "step": 4939 }, { "epoch": 2.2474977252047315, "grad_norm": 0.9436005196269532, "learning_rate": 5.791347013301822e-07, "loss": 0.0239, "step": 4940 }, { "epoch": 2.2479526842584168, "grad_norm": 1.035361093221878, "learning_rate": 5.789935699861475e-07, "loss": 0.0291, "step": 4941 }, { "epoch": 2.248407643312102, "grad_norm": 0.4812448034034859, "learning_rate": 5.788524321871149e-07, "loss": 0.0105, "step": 4942 }, { "epoch": 2.248862602365787, "grad_norm": 0.6570838956480312, "learning_rate": 5.787112879446176e-07, "loss": 0.0177, "step": 4943 }, { "epoch": 2.2493175614194723, "grad_norm": 1.0245102938115112, "learning_rate": 5.785701372701896e-07, "loss": 0.0192, "step": 4944 }, { "epoch": 2.2497725204731576, "grad_norm": 1.0150629543400376, "learning_rate": 5.784289801753645e-07, "loss": 0.0157, "step": 4945 }, { "epoch": 2.2502274795268424, "grad_norm": 1.0314871514179322, "learning_rate": 5.782878166716774e-07, "loss": 0.0462, "step": 4946 }, { "epoch": 2.2506824385805277, "grad_norm": 0.9667530214461841, "learning_rate": 5.781466467706636e-07, "loss": 0.017, "step": 4947 }, { "epoch": 2.251137397634213, "grad_norm": 1.9169447320410191, "learning_rate": 5.780054704838586e-07, "loss": 0.0458, "step": 4948 }, { "epoch": 2.251592356687898, "grad_norm": 0.7646616137253311, "learning_rate": 5.77864287822799e-07, "loss": 0.0211, "step": 4949 }, { "epoch": 2.2520473157415832, "grad_norm": 0.6292366154805403, "learning_rate": 5.777230987990211e-07, "loss": 0.0154, "step": 4950 }, { "epoch": 2.2525022747952685, "grad_norm": 1.3857062289979358, "learning_rate": 5.775819034240629e-07, "loss": 0.051, "step": 4951 }, { "epoch": 2.2529572338489534, "grad_norm": 0.7694082229643554, "learning_rate": 5.774407017094618e-07, "loss": 0.0171, "step": 4952 }, { "epoch": 2.2534121929026387, "grad_norm": 1.0654391619595096, "learning_rate": 5.772994936667561e-07, "loss": 0.0212, "step": 4953 }, { "epoch": 2.253867151956324, "grad_norm": 0.7710289559170167, "learning_rate": 5.771582793074852e-07, "loss": 0.0178, "step": 4954 }, { "epoch": 2.2543221110100093, "grad_norm": 0.655952128699421, "learning_rate": 5.77017058643188e-07, "loss": 0.0086, "step": 4955 }, { "epoch": 2.254777070063694, "grad_norm": 0.8800632466935198, "learning_rate": 5.768758316854045e-07, "loss": 0.0169, "step": 4956 }, { "epoch": 2.2552320291173795, "grad_norm": 0.8348513322382314, "learning_rate": 5.76734598445675e-07, "loss": 0.0179, "step": 4957 }, { "epoch": 2.255686988171065, "grad_norm": 0.6654279025707501, "learning_rate": 5.765933589355411e-07, "loss": 0.0182, "step": 4958 }, { "epoch": 2.2561419472247497, "grad_norm": 0.7917254030766661, "learning_rate": 5.764521131665437e-07, "loss": 0.0187, "step": 4959 }, { "epoch": 2.256596906278435, "grad_norm": 0.7872133828490148, "learning_rate": 5.763108611502246e-07, "loss": 0.0202, "step": 4960 }, { "epoch": 2.2570518653321203, "grad_norm": 0.8803062442999351, "learning_rate": 5.761696028981268e-07, "loss": 0.0264, "step": 4961 }, { "epoch": 2.257506824385805, "grad_norm": 0.4426389576315433, "learning_rate": 5.760283384217927e-07, "loss": 0.0062, "step": 4962 }, { "epoch": 2.2579617834394905, "grad_norm": 1.3186520923459215, "learning_rate": 5.758870677327664e-07, "loss": 0.0118, "step": 4963 }, { "epoch": 2.258416742493176, "grad_norm": 0.701604409031915, "learning_rate": 5.757457908425917e-07, "loss": 0.0101, "step": 4964 }, { "epoch": 2.2588717015468607, "grad_norm": 0.45802231040517305, "learning_rate": 5.756045077628129e-07, "loss": 0.0096, "step": 4965 }, { "epoch": 2.259326660600546, "grad_norm": 0.6500647781963753, "learning_rate": 5.754632185049752e-07, "loss": 0.0124, "step": 4966 }, { "epoch": 2.2597816196542313, "grad_norm": 0.6738101524410706, "learning_rate": 5.753219230806239e-07, "loss": 0.024, "step": 4967 }, { "epoch": 2.260236578707916, "grad_norm": 0.5574343196951463, "learning_rate": 5.751806215013055e-07, "loss": 0.0122, "step": 4968 }, { "epoch": 2.2606915377616015, "grad_norm": 0.4052246437210759, "learning_rate": 5.75039313778566e-07, "loss": 0.0074, "step": 4969 }, { "epoch": 2.261146496815287, "grad_norm": 0.7263856575457681, "learning_rate": 5.748979999239527e-07, "loss": 0.0185, "step": 4970 }, { "epoch": 2.2616014558689717, "grad_norm": 1.055372669036212, "learning_rate": 5.747566799490131e-07, "loss": 0.0265, "step": 4971 }, { "epoch": 2.262056414922657, "grad_norm": 0.5805002808649353, "learning_rate": 5.746153538652953e-07, "loss": 0.0097, "step": 4972 }, { "epoch": 2.2625113739763423, "grad_norm": 0.892689573961863, "learning_rate": 5.744740216843476e-07, "loss": 0.0255, "step": 4973 }, { "epoch": 2.262966333030027, "grad_norm": 0.6799270466390707, "learning_rate": 5.743326834177191e-07, "loss": 0.0157, "step": 4974 }, { "epoch": 2.2634212920837125, "grad_norm": 0.8772083534298115, "learning_rate": 5.741913390769596e-07, "loss": 0.0253, "step": 4975 }, { "epoch": 2.2638762511373978, "grad_norm": 0.8905648385752131, "learning_rate": 5.740499886736189e-07, "loss": 0.0264, "step": 4976 }, { "epoch": 2.2643312101910826, "grad_norm": 0.7494369332596739, "learning_rate": 5.739086322192474e-07, "loss": 0.011, "step": 4977 }, { "epoch": 2.264786169244768, "grad_norm": 0.868064608259066, "learning_rate": 5.737672697253963e-07, "loss": 0.0103, "step": 4978 }, { "epoch": 2.2652411282984533, "grad_norm": 0.7430927283689349, "learning_rate": 5.736259012036171e-07, "loss": 0.0147, "step": 4979 }, { "epoch": 2.265696087352138, "grad_norm": 0.6496721822841924, "learning_rate": 5.734845266654618e-07, "loss": 0.0103, "step": 4980 }, { "epoch": 2.2661510464058234, "grad_norm": 0.5020724165345207, "learning_rate": 5.733431461224827e-07, "loss": 0.0092, "step": 4981 }, { "epoch": 2.2666060054595087, "grad_norm": 0.8188113785245391, "learning_rate": 5.732017595862329e-07, "loss": 0.0151, "step": 4982 }, { "epoch": 2.2670609645131936, "grad_norm": 0.8703832487955965, "learning_rate": 5.73060367068266e-07, "loss": 0.0233, "step": 4983 }, { "epoch": 2.267515923566879, "grad_norm": 0.908828264070016, "learning_rate": 5.729189685801357e-07, "loss": 0.0289, "step": 4984 }, { "epoch": 2.2679708826205642, "grad_norm": 0.6332493422092287, "learning_rate": 5.727775641333968e-07, "loss": 0.0201, "step": 4985 }, { "epoch": 2.268425841674249, "grad_norm": 0.9107969862692183, "learning_rate": 5.726361537396038e-07, "loss": 0.0185, "step": 4986 }, { "epoch": 2.2688808007279344, "grad_norm": 1.0495476599012794, "learning_rate": 5.724947374103124e-07, "loss": 0.0284, "step": 4987 }, { "epoch": 2.2693357597816197, "grad_norm": 0.90163067897191, "learning_rate": 5.723533151570784e-07, "loss": 0.0151, "step": 4988 }, { "epoch": 2.2697907188353046, "grad_norm": 0.7538128667120102, "learning_rate": 5.722118869914582e-07, "loss": 0.0109, "step": 4989 }, { "epoch": 2.27024567788899, "grad_norm": 0.9927827994697428, "learning_rate": 5.72070452925009e-07, "loss": 0.0191, "step": 4990 }, { "epoch": 2.270700636942675, "grad_norm": 0.8991970262945453, "learning_rate": 5.719290129692875e-07, "loss": 0.0213, "step": 4991 }, { "epoch": 2.2711555959963605, "grad_norm": 0.9336662208449088, "learning_rate": 5.717875671358521e-07, "loss": 0.0188, "step": 4992 }, { "epoch": 2.2716105550500454, "grad_norm": 0.9707757312071759, "learning_rate": 5.716461154362609e-07, "loss": 0.0165, "step": 4993 }, { "epoch": 2.2720655141037307, "grad_norm": 0.6101936313730473, "learning_rate": 5.715046578820726e-07, "loss": 0.0074, "step": 4994 }, { "epoch": 2.272520473157416, "grad_norm": 0.9258578332812912, "learning_rate": 5.713631944848466e-07, "loss": 0.037, "step": 4995 }, { "epoch": 2.272975432211101, "grad_norm": 1.1155417782674513, "learning_rate": 5.712217252561425e-07, "loss": 0.0239, "step": 4996 }, { "epoch": 2.273430391264786, "grad_norm": 0.7012803632491743, "learning_rate": 5.710802502075209e-07, "loss": 0.0135, "step": 4997 }, { "epoch": 2.2738853503184715, "grad_norm": 0.6960008180721399, "learning_rate": 5.709387693505421e-07, "loss": 0.0204, "step": 4998 }, { "epoch": 2.2743403093721564, "grad_norm": 0.6631216633336702, "learning_rate": 5.707972826967674e-07, "loss": 0.0265, "step": 4999 }, { "epoch": 2.2747952684258417, "grad_norm": 0.6139359537786317, "learning_rate": 5.706557902577587e-07, "loss": 0.0148, "step": 5000 }, { "epoch": 2.275250227479527, "grad_norm": 0.6839772495258755, "learning_rate": 5.705142920450777e-07, "loss": 0.0175, "step": 5001 }, { "epoch": 2.275705186533212, "grad_norm": 0.5682668412827179, "learning_rate": 5.70372788070287e-07, "loss": 0.0195, "step": 5002 }, { "epoch": 2.276160145586897, "grad_norm": 0.7889396098758772, "learning_rate": 5.702312783449501e-07, "loss": 0.012, "step": 5003 }, { "epoch": 2.2766151046405825, "grad_norm": 0.6529316139140708, "learning_rate": 5.700897628806304e-07, "loss": 0.0175, "step": 5004 }, { "epoch": 2.2770700636942673, "grad_norm": 1.5118331778030896, "learning_rate": 5.699482416888917e-07, "loss": 0.0265, "step": 5005 }, { "epoch": 2.2775250227479527, "grad_norm": 0.609528281393288, "learning_rate": 5.698067147812985e-07, "loss": 0.0069, "step": 5006 }, { "epoch": 2.277979981801638, "grad_norm": 0.7906002260373225, "learning_rate": 5.696651821694158e-07, "loss": 0.0174, "step": 5007 }, { "epoch": 2.278434940855323, "grad_norm": 0.8839638523474493, "learning_rate": 5.695236438648091e-07, "loss": 0.0199, "step": 5008 }, { "epoch": 2.278889899909008, "grad_norm": 0.7068526234168603, "learning_rate": 5.693820998790442e-07, "loss": 0.0096, "step": 5009 }, { "epoch": 2.2793448589626935, "grad_norm": 1.1046701551459148, "learning_rate": 5.692405502236873e-07, "loss": 0.0303, "step": 5010 }, { "epoch": 2.2797998180163788, "grad_norm": 0.9688092539044401, "learning_rate": 5.690989949103056e-07, "loss": 0.0144, "step": 5011 }, { "epoch": 2.2802547770700636, "grad_norm": 0.9843289383710048, "learning_rate": 5.689574339504658e-07, "loss": 0.021, "step": 5012 }, { "epoch": 2.280709736123749, "grad_norm": 1.0253318077394469, "learning_rate": 5.68815867355736e-07, "loss": 0.0274, "step": 5013 }, { "epoch": 2.2811646951774343, "grad_norm": 0.6424562635320842, "learning_rate": 5.686742951376842e-07, "loss": 0.0107, "step": 5014 }, { "epoch": 2.281619654231119, "grad_norm": 0.849606697820788, "learning_rate": 5.685327173078794e-07, "loss": 0.0189, "step": 5015 }, { "epoch": 2.2820746132848044, "grad_norm": 0.6510899817619528, "learning_rate": 5.683911338778902e-07, "loss": 0.0143, "step": 5016 }, { "epoch": 2.2825295723384897, "grad_norm": 0.6710720949879404, "learning_rate": 5.682495448592864e-07, "loss": 0.0232, "step": 5017 }, { "epoch": 2.2829845313921746, "grad_norm": 1.1161718590281833, "learning_rate": 5.681079502636381e-07, "loss": 0.0218, "step": 5018 }, { "epoch": 2.28343949044586, "grad_norm": 0.7953440681278277, "learning_rate": 5.679663501025156e-07, "loss": 0.0136, "step": 5019 }, { "epoch": 2.2838944494995452, "grad_norm": 0.7539821677594462, "learning_rate": 5.678247443874898e-07, "loss": 0.0085, "step": 5020 }, { "epoch": 2.28434940855323, "grad_norm": 0.6622063575833372, "learning_rate": 5.676831331301325e-07, "loss": 0.0135, "step": 5021 }, { "epoch": 2.2848043676069154, "grad_norm": 0.6542821442039131, "learning_rate": 5.67541516342015e-07, "loss": 0.0096, "step": 5022 }, { "epoch": 2.2852593266606007, "grad_norm": 1.0420257256707592, "learning_rate": 5.673998940347098e-07, "loss": 0.0338, "step": 5023 }, { "epoch": 2.2857142857142856, "grad_norm": 0.8160531369538669, "learning_rate": 5.672582662197897e-07, "loss": 0.0136, "step": 5024 }, { "epoch": 2.286169244767971, "grad_norm": 0.9652936319218607, "learning_rate": 5.671166329088277e-07, "loss": 0.0169, "step": 5025 }, { "epoch": 2.286624203821656, "grad_norm": 1.2123850663482196, "learning_rate": 5.669749941133977e-07, "loss": 0.0239, "step": 5026 }, { "epoch": 2.287079162875341, "grad_norm": 0.5874986692397451, "learning_rate": 5.668333498450735e-07, "loss": 0.0129, "step": 5027 }, { "epoch": 2.2875341219290264, "grad_norm": 1.0393174200103643, "learning_rate": 5.666917001154299e-07, "loss": 0.025, "step": 5028 }, { "epoch": 2.2879890809827117, "grad_norm": 0.6629284784629844, "learning_rate": 5.665500449360418e-07, "loss": 0.0242, "step": 5029 }, { "epoch": 2.2884440400363966, "grad_norm": 0.5879308862874115, "learning_rate": 5.664083843184843e-07, "loss": 0.0123, "step": 5030 }, { "epoch": 2.288898999090082, "grad_norm": 0.7557337131204274, "learning_rate": 5.662667182743337e-07, "loss": 0.0166, "step": 5031 }, { "epoch": 2.289353958143767, "grad_norm": 0.5608007580620517, "learning_rate": 5.661250468151662e-07, "loss": 0.0143, "step": 5032 }, { "epoch": 2.289808917197452, "grad_norm": 0.6620057623043502, "learning_rate": 5.659833699525583e-07, "loss": 0.0131, "step": 5033 }, { "epoch": 2.2902638762511374, "grad_norm": 0.9127764182917381, "learning_rate": 5.658416876980876e-07, "loss": 0.0145, "step": 5034 }, { "epoch": 2.2907188353048227, "grad_norm": 1.1642096421354493, "learning_rate": 5.657000000633314e-07, "loss": 0.0276, "step": 5035 }, { "epoch": 2.2911737943585075, "grad_norm": 0.6052316716029791, "learning_rate": 5.65558307059868e-07, "loss": 0.0079, "step": 5036 }, { "epoch": 2.291628753412193, "grad_norm": 0.7837660817017816, "learning_rate": 5.654166086992756e-07, "loss": 0.026, "step": 5037 }, { "epoch": 2.292083712465878, "grad_norm": 0.9705936053843761, "learning_rate": 5.652749049931336e-07, "loss": 0.0336, "step": 5038 }, { "epoch": 2.292538671519563, "grad_norm": 1.2141561895187158, "learning_rate": 5.651331959530209e-07, "loss": 0.0151, "step": 5039 }, { "epoch": 2.2929936305732483, "grad_norm": 0.8858241493261849, "learning_rate": 5.649914815905177e-07, "loss": 0.0165, "step": 5040 }, { "epoch": 2.2934485896269337, "grad_norm": 0.8234800611963576, "learning_rate": 5.648497619172041e-07, "loss": 0.027, "step": 5041 }, { "epoch": 2.2939035486806185, "grad_norm": 0.902762944330827, "learning_rate": 5.647080369446608e-07, "loss": 0.0264, "step": 5042 }, { "epoch": 2.294358507734304, "grad_norm": 0.7237221174040874, "learning_rate": 5.645663066844691e-07, "loss": 0.0093, "step": 5043 }, { "epoch": 2.294813466787989, "grad_norm": 1.1413759706350146, "learning_rate": 5.6442457114821e-07, "loss": 0.0144, "step": 5044 }, { "epoch": 2.295268425841674, "grad_norm": 0.8667881891153927, "learning_rate": 5.642828303474664e-07, "loss": 0.0234, "step": 5045 }, { "epoch": 2.2957233848953593, "grad_norm": 0.6993438394060832, "learning_rate": 5.641410842938199e-07, "loss": 0.0122, "step": 5046 }, { "epoch": 2.2961783439490446, "grad_norm": 0.7588297408454538, "learning_rate": 5.639993329988536e-07, "loss": 0.014, "step": 5047 }, { "epoch": 2.29663330300273, "grad_norm": 0.9887605567535113, "learning_rate": 5.63857576474151e-07, "loss": 0.0137, "step": 5048 }, { "epoch": 2.297088262056415, "grad_norm": 0.5304014667089415, "learning_rate": 5.637158147312955e-07, "loss": 0.0165, "step": 5049 }, { "epoch": 2.2975432211101, "grad_norm": 0.6417646615818263, "learning_rate": 5.635740477818715e-07, "loss": 0.0107, "step": 5050 }, { "epoch": 2.2979981801637854, "grad_norm": 0.702719149830086, "learning_rate": 5.634322756374633e-07, "loss": 0.0178, "step": 5051 }, { "epoch": 2.2984531392174703, "grad_norm": 0.5776549323746466, "learning_rate": 5.63290498309656e-07, "loss": 0.0107, "step": 5052 }, { "epoch": 2.2989080982711556, "grad_norm": 0.7776455222116221, "learning_rate": 5.631487158100351e-07, "loss": 0.0156, "step": 5053 }, { "epoch": 2.299363057324841, "grad_norm": 0.4823319919959378, "learning_rate": 5.630069281501861e-07, "loss": 0.0065, "step": 5054 }, { "epoch": 2.299818016378526, "grad_norm": 1.0993074493956598, "learning_rate": 5.628651353416956e-07, "loss": 0.0283, "step": 5055 }, { "epoch": 2.300272975432211, "grad_norm": 0.7586098035253616, "learning_rate": 5.627233373961502e-07, "loss": 0.0125, "step": 5056 }, { "epoch": 2.3007279344858964, "grad_norm": 0.6012699199144347, "learning_rate": 5.625815343251369e-07, "loss": 0.0039, "step": 5057 }, { "epoch": 2.3011828935395813, "grad_norm": 1.1395222757773065, "learning_rate": 5.624397261402432e-07, "loss": 0.0141, "step": 5058 }, { "epoch": 2.3016378525932666, "grad_norm": 0.7303167771489728, "learning_rate": 5.62297912853057e-07, "loss": 0.0167, "step": 5059 }, { "epoch": 2.302092811646952, "grad_norm": 0.995346362876302, "learning_rate": 5.621560944751668e-07, "loss": 0.0271, "step": 5060 }, { "epoch": 2.3025477707006368, "grad_norm": 0.4092117789677838, "learning_rate": 5.62014271018161e-07, "loss": 0.0054, "step": 5061 }, { "epoch": 2.303002729754322, "grad_norm": 0.832948893137905, "learning_rate": 5.618724424936294e-07, "loss": 0.0256, "step": 5062 }, { "epoch": 2.3034576888080074, "grad_norm": 0.9727316454644795, "learning_rate": 5.617306089131609e-07, "loss": 0.0204, "step": 5063 }, { "epoch": 2.3039126478616927, "grad_norm": 0.5875446302515243, "learning_rate": 5.615887702883461e-07, "loss": 0.0085, "step": 5064 }, { "epoch": 2.3043676069153776, "grad_norm": 0.9996354422417412, "learning_rate": 5.61446926630775e-07, "loss": 0.0159, "step": 5065 }, { "epoch": 2.304822565969063, "grad_norm": 0.9910486543158845, "learning_rate": 5.613050779520384e-07, "loss": 0.0184, "step": 5066 }, { "epoch": 2.305277525022748, "grad_norm": 0.9699839007870722, "learning_rate": 5.611632242637279e-07, "loss": 0.0189, "step": 5067 }, { "epoch": 2.305732484076433, "grad_norm": 0.6009763572046609, "learning_rate": 5.610213655774349e-07, "loss": 0.0123, "step": 5068 }, { "epoch": 2.3061874431301184, "grad_norm": 1.1507519533069688, "learning_rate": 5.608795019047513e-07, "loss": 0.0137, "step": 5069 }, { "epoch": 2.3066424021838037, "grad_norm": 0.7938802623164662, "learning_rate": 5.607376332572699e-07, "loss": 0.0082, "step": 5070 }, { "epoch": 2.3070973612374885, "grad_norm": 0.9964781908998848, "learning_rate": 5.605957596465834e-07, "loss": 0.0265, "step": 5071 }, { "epoch": 2.307552320291174, "grad_norm": 0.9039124947555498, "learning_rate": 5.604538810842849e-07, "loss": 0.0151, "step": 5072 }, { "epoch": 2.308007279344859, "grad_norm": 0.885837052329662, "learning_rate": 5.603119975819683e-07, "loss": 0.0223, "step": 5073 }, { "epoch": 2.308462238398544, "grad_norm": 0.7006077101649587, "learning_rate": 5.601701091512277e-07, "loss": 0.0227, "step": 5074 }, { "epoch": 2.3089171974522293, "grad_norm": 0.7253519700484046, "learning_rate": 5.600282158036575e-07, "loss": 0.0105, "step": 5075 }, { "epoch": 2.3093721565059147, "grad_norm": 0.7872452115106694, "learning_rate": 5.598863175508526e-07, "loss": 0.0127, "step": 5076 }, { "epoch": 2.3098271155595995, "grad_norm": 0.7224111904148189, "learning_rate": 5.597444144044082e-07, "loss": 0.02, "step": 5077 }, { "epoch": 2.310282074613285, "grad_norm": 0.8537163692156919, "learning_rate": 5.596025063759201e-07, "loss": 0.0228, "step": 5078 }, { "epoch": 2.31073703366697, "grad_norm": 0.4894428683658614, "learning_rate": 5.594605934769844e-07, "loss": 0.0107, "step": 5079 }, { "epoch": 2.311191992720655, "grad_norm": 0.6893609281665073, "learning_rate": 5.593186757191973e-07, "loss": 0.017, "step": 5080 }, { "epoch": 2.3116469517743403, "grad_norm": 0.8907226311730718, "learning_rate": 5.591767531141563e-07, "loss": 0.0259, "step": 5081 }, { "epoch": 2.3121019108280256, "grad_norm": 0.5540396097752245, "learning_rate": 5.59034825673458e-07, "loss": 0.0104, "step": 5082 }, { "epoch": 2.3125568698817105, "grad_norm": 0.6417734555375377, "learning_rate": 5.588928934087003e-07, "loss": 0.0173, "step": 5083 }, { "epoch": 2.313011828935396, "grad_norm": 1.1828277823245261, "learning_rate": 5.587509563314814e-07, "loss": 0.0421, "step": 5084 }, { "epoch": 2.313466787989081, "grad_norm": 1.4200863438416622, "learning_rate": 5.586090144533997e-07, "loss": 0.035, "step": 5085 }, { "epoch": 2.313921747042766, "grad_norm": 0.6498635092973292, "learning_rate": 5.584670677860539e-07, "loss": 0.0173, "step": 5086 }, { "epoch": 2.3143767060964513, "grad_norm": 0.6233277317589706, "learning_rate": 5.583251163410435e-07, "loss": 0.0155, "step": 5087 }, { "epoch": 2.3148316651501366, "grad_norm": 1.0174143789267263, "learning_rate": 5.581831601299679e-07, "loss": 0.0305, "step": 5088 }, { "epoch": 2.3152866242038215, "grad_norm": 0.8442698298552654, "learning_rate": 5.580411991644271e-07, "loss": 0.0308, "step": 5089 }, { "epoch": 2.315741583257507, "grad_norm": 1.0385777162393337, "learning_rate": 5.578992334560218e-07, "loss": 0.0223, "step": 5090 }, { "epoch": 2.316196542311192, "grad_norm": 0.8769718138999812, "learning_rate": 5.577572630163527e-07, "loss": 0.0329, "step": 5091 }, { "epoch": 2.316651501364877, "grad_norm": 0.5947384316146073, "learning_rate": 5.576152878570207e-07, "loss": 0.0087, "step": 5092 }, { "epoch": 2.3171064604185623, "grad_norm": 0.7234057942717506, "learning_rate": 5.574733079896276e-07, "loss": 0.0143, "step": 5093 }, { "epoch": 2.3175614194722476, "grad_norm": 0.8226976506419632, "learning_rate": 5.573313234257754e-07, "loss": 0.0125, "step": 5094 }, { "epoch": 2.3180163785259325, "grad_norm": 0.8192363751828994, "learning_rate": 5.571893341770663e-07, "loss": 0.0183, "step": 5095 }, { "epoch": 2.3184713375796178, "grad_norm": 0.8897985175908183, "learning_rate": 5.570473402551029e-07, "loss": 0.0152, "step": 5096 }, { "epoch": 2.318926296633303, "grad_norm": 0.6975123274079408, "learning_rate": 5.569053416714887e-07, "loss": 0.0218, "step": 5097 }, { "epoch": 2.319381255686988, "grad_norm": 0.6235374848486601, "learning_rate": 5.56763338437827e-07, "loss": 0.0111, "step": 5098 }, { "epoch": 2.3198362147406733, "grad_norm": 0.6167746551743648, "learning_rate": 5.566213305657214e-07, "loss": 0.0169, "step": 5099 }, { "epoch": 2.3202911737943586, "grad_norm": 0.8049405713113327, "learning_rate": 5.564793180667765e-07, "loss": 0.02, "step": 5100 }, { "epoch": 2.3207461328480434, "grad_norm": 6.069288733627321, "learning_rate": 5.563373009525969e-07, "loss": 0.0703, "step": 5101 }, { "epoch": 2.3212010919017287, "grad_norm": 0.5721841803748595, "learning_rate": 5.561952792347873e-07, "loss": 0.0075, "step": 5102 }, { "epoch": 2.321656050955414, "grad_norm": 0.7851565942901152, "learning_rate": 5.560532529249533e-07, "loss": 0.0126, "step": 5103 }, { "epoch": 2.3221110100090994, "grad_norm": 1.166533887252137, "learning_rate": 5.559112220347006e-07, "loss": 0.0391, "step": 5104 }, { "epoch": 2.3225659690627842, "grad_norm": 0.4889487808994945, "learning_rate": 5.557691865756354e-07, "loss": 0.0078, "step": 5105 }, { "epoch": 2.3230209281164695, "grad_norm": 1.0187022908671273, "learning_rate": 5.556271465593642e-07, "loss": 0.0301, "step": 5106 }, { "epoch": 2.323475887170155, "grad_norm": 0.5357948778903981, "learning_rate": 5.554851019974934e-07, "loss": 0.0082, "step": 5107 }, { "epoch": 2.3239308462238397, "grad_norm": 0.5595508301137976, "learning_rate": 5.553430529016311e-07, "loss": 0.0049, "step": 5108 }, { "epoch": 2.324385805277525, "grad_norm": 0.6565807056187065, "learning_rate": 5.552009992833841e-07, "loss": 0.024, "step": 5109 }, { "epoch": 2.3248407643312103, "grad_norm": 0.6973964513419205, "learning_rate": 5.550589411543609e-07, "loss": 0.0138, "step": 5110 }, { "epoch": 2.325295723384895, "grad_norm": 0.8447369083258685, "learning_rate": 5.549168785261698e-07, "loss": 0.0288, "step": 5111 }, { "epoch": 2.3257506824385805, "grad_norm": 0.69269031363124, "learning_rate": 5.547748114104192e-07, "loss": 0.0185, "step": 5112 }, { "epoch": 2.326205641492266, "grad_norm": 1.1043221084773094, "learning_rate": 5.546327398187184e-07, "loss": 0.0122, "step": 5113 }, { "epoch": 2.3266606005459507, "grad_norm": 1.2359056583527142, "learning_rate": 5.544906637626767e-07, "loss": 0.0277, "step": 5114 }, { "epoch": 2.327115559599636, "grad_norm": 0.6880006247321905, "learning_rate": 5.543485832539042e-07, "loss": 0.0178, "step": 5115 }, { "epoch": 2.3275705186533213, "grad_norm": 0.7743251548174348, "learning_rate": 5.542064983040109e-07, "loss": 0.0176, "step": 5116 }, { "epoch": 2.328025477707006, "grad_norm": 0.6448900142088683, "learning_rate": 5.540644089246073e-07, "loss": 0.0169, "step": 5117 }, { "epoch": 2.3284804367606915, "grad_norm": 0.7829545167140088, "learning_rate": 5.539223151273045e-07, "loss": 0.0282, "step": 5118 }, { "epoch": 2.328935395814377, "grad_norm": 0.5335496033054933, "learning_rate": 5.537802169237134e-07, "loss": 0.0081, "step": 5119 }, { "epoch": 2.329390354868062, "grad_norm": 0.8625860928710545, "learning_rate": 5.53638114325446e-07, "loss": 0.0296, "step": 5120 }, { "epoch": 2.329845313921747, "grad_norm": 0.7801745464844398, "learning_rate": 5.53496007344114e-07, "loss": 0.0159, "step": 5121 }, { "epoch": 2.3303002729754323, "grad_norm": 0.6332510684975918, "learning_rate": 5.5335389599133e-07, "loss": 0.0151, "step": 5122 }, { "epoch": 2.3307552320291176, "grad_norm": 0.5935189998404626, "learning_rate": 5.532117802787066e-07, "loss": 0.0175, "step": 5123 }, { "epoch": 2.3312101910828025, "grad_norm": 0.6183028324890034, "learning_rate": 5.530696602178566e-07, "loss": 0.0148, "step": 5124 }, { "epoch": 2.331665150136488, "grad_norm": 0.6991587593643613, "learning_rate": 5.529275358203937e-07, "loss": 0.0274, "step": 5125 }, { "epoch": 2.332120109190173, "grad_norm": 0.7655865893358149, "learning_rate": 5.527854070979317e-07, "loss": 0.0066, "step": 5126 }, { "epoch": 2.332575068243858, "grad_norm": 0.8651756064699934, "learning_rate": 5.526432740620845e-07, "loss": 0.0264, "step": 5127 }, { "epoch": 2.3330300272975433, "grad_norm": 0.9234222114030433, "learning_rate": 5.525011367244667e-07, "loss": 0.0192, "step": 5128 }, { "epoch": 2.3334849863512286, "grad_norm": 0.7171536439645889, "learning_rate": 5.523589950966932e-07, "loss": 0.0208, "step": 5129 }, { "epoch": 2.3339399454049135, "grad_norm": 0.7891384975002655, "learning_rate": 5.52216849190379e-07, "loss": 0.0139, "step": 5130 }, { "epoch": 2.3343949044585988, "grad_norm": 0.7653515739415587, "learning_rate": 5.520746990171395e-07, "loss": 0.0248, "step": 5131 }, { "epoch": 2.334849863512284, "grad_norm": 0.7281917398831701, "learning_rate": 5.519325445885911e-07, "loss": 0.0098, "step": 5132 }, { "epoch": 2.335304822565969, "grad_norm": 0.984934913508253, "learning_rate": 5.517903859163495e-07, "loss": 0.0333, "step": 5133 }, { "epoch": 2.3357597816196543, "grad_norm": 0.7584535275582939, "learning_rate": 5.516482230120315e-07, "loss": 0.0159, "step": 5134 }, { "epoch": 2.3362147406733396, "grad_norm": 1.1155860985372898, "learning_rate": 5.51506055887254e-07, "loss": 0.0298, "step": 5135 }, { "epoch": 2.3366696997270244, "grad_norm": 0.6583824135785151, "learning_rate": 5.513638845536341e-07, "loss": 0.0101, "step": 5136 }, { "epoch": 2.3371246587807097, "grad_norm": 0.7823923242903263, "learning_rate": 5.512217090227895e-07, "loss": 0.0177, "step": 5137 }, { "epoch": 2.337579617834395, "grad_norm": 0.7571246310782139, "learning_rate": 5.510795293063383e-07, "loss": 0.0224, "step": 5138 }, { "epoch": 2.33803457688808, "grad_norm": 0.8029727459343843, "learning_rate": 5.509373454158985e-07, "loss": 0.0148, "step": 5139 }, { "epoch": 2.3384895359417652, "grad_norm": 0.6347693409605627, "learning_rate": 5.50795157363089e-07, "loss": 0.0184, "step": 5140 }, { "epoch": 2.3389444949954505, "grad_norm": 0.7584344529957944, "learning_rate": 5.506529651595285e-07, "loss": 0.0185, "step": 5141 }, { "epoch": 2.3393994540491354, "grad_norm": 0.8549337444484989, "learning_rate": 5.505107688168365e-07, "loss": 0.0272, "step": 5142 }, { "epoch": 2.3398544131028207, "grad_norm": 0.7777306836515157, "learning_rate": 5.503685683466325e-07, "loss": 0.022, "step": 5143 }, { "epoch": 2.340309372156506, "grad_norm": 1.0446157093599504, "learning_rate": 5.502263637605367e-07, "loss": 0.0212, "step": 5144 }, { "epoch": 2.340764331210191, "grad_norm": 1.1585417838633612, "learning_rate": 5.500841550701691e-07, "loss": 0.0241, "step": 5145 }, { "epoch": 2.341219290263876, "grad_norm": 0.8899841213744076, "learning_rate": 5.499419422871505e-07, "loss": 0.0171, "step": 5146 }, { "epoch": 2.3416742493175615, "grad_norm": 1.0235030565728076, "learning_rate": 5.497997254231021e-07, "loss": 0.0145, "step": 5147 }, { "epoch": 2.3421292083712464, "grad_norm": 0.7471631275706889, "learning_rate": 5.49657504489645e-07, "loss": 0.0163, "step": 5148 }, { "epoch": 2.3425841674249317, "grad_norm": 0.608803520302791, "learning_rate": 5.495152794984008e-07, "loss": 0.0099, "step": 5149 }, { "epoch": 2.343039126478617, "grad_norm": 0.9957128372431708, "learning_rate": 5.493730504609916e-07, "loss": 0.0172, "step": 5150 }, { "epoch": 2.343494085532302, "grad_norm": 0.7048568812224236, "learning_rate": 5.492308173890397e-07, "loss": 0.0218, "step": 5151 }, { "epoch": 2.343949044585987, "grad_norm": 0.8819856484089702, "learning_rate": 5.490885802941677e-07, "loss": 0.0164, "step": 5152 }, { "epoch": 2.3444040036396725, "grad_norm": 0.67104771233554, "learning_rate": 5.489463391879985e-07, "loss": 0.0119, "step": 5153 }, { "epoch": 2.3448589626933574, "grad_norm": 0.9389207073080326, "learning_rate": 5.488040940821558e-07, "loss": 0.0239, "step": 5154 }, { "epoch": 2.3453139217470427, "grad_norm": 0.7448672928960192, "learning_rate": 5.486618449882628e-07, "loss": 0.0297, "step": 5155 }, { "epoch": 2.345768880800728, "grad_norm": 0.5476524733795697, "learning_rate": 5.485195919179434e-07, "loss": 0.0055, "step": 5156 }, { "epoch": 2.3462238398544133, "grad_norm": 0.972122416090467, "learning_rate": 5.483773348828224e-07, "loss": 0.0253, "step": 5157 }, { "epoch": 2.346678798908098, "grad_norm": 0.8205127163486184, "learning_rate": 5.482350738945237e-07, "loss": 0.0173, "step": 5158 }, { "epoch": 2.3471337579617835, "grad_norm": 0.7352247188813735, "learning_rate": 5.480928089646726e-07, "loss": 0.0205, "step": 5159 }, { "epoch": 2.347588717015469, "grad_norm": 0.7281992058076758, "learning_rate": 5.479505401048946e-07, "loss": 0.0173, "step": 5160 }, { "epoch": 2.3480436760691537, "grad_norm": 0.7691809979166492, "learning_rate": 5.47808267326815e-07, "loss": 0.0154, "step": 5161 }, { "epoch": 2.348498635122839, "grad_norm": 0.8069502738119763, "learning_rate": 5.476659906420595e-07, "loss": 0.0177, "step": 5162 }, { "epoch": 2.3489535941765243, "grad_norm": 1.1060645976416208, "learning_rate": 5.475237100622545e-07, "loss": 0.0068, "step": 5163 }, { "epoch": 2.349408553230209, "grad_norm": 0.7972012495677001, "learning_rate": 5.473814255990268e-07, "loss": 0.026, "step": 5164 }, { "epoch": 2.3498635122838945, "grad_norm": 0.7209120841734524, "learning_rate": 5.472391372640027e-07, "loss": 0.0278, "step": 5165 }, { "epoch": 2.3503184713375798, "grad_norm": 0.7188896540528145, "learning_rate": 5.470968450688097e-07, "loss": 0.0119, "step": 5166 }, { "epoch": 2.3507734303912646, "grad_norm": 2.2269125241551198, "learning_rate": 5.469545490250752e-07, "loss": 0.043, "step": 5167 }, { "epoch": 2.35122838944495, "grad_norm": 0.8429851881063549, "learning_rate": 5.46812249144427e-07, "loss": 0.0205, "step": 5168 }, { "epoch": 2.3516833484986353, "grad_norm": 0.692370131490052, "learning_rate": 5.466699454384933e-07, "loss": 0.0211, "step": 5169 }, { "epoch": 2.35213830755232, "grad_norm": 0.7825065184711679, "learning_rate": 5.465276379189023e-07, "loss": 0.024, "step": 5170 }, { "epoch": 2.3525932666060054, "grad_norm": 0.7029154621451156, "learning_rate": 5.46385326597283e-07, "loss": 0.0169, "step": 5171 }, { "epoch": 2.3530482256596907, "grad_norm": 0.947577233036616, "learning_rate": 5.462430114852641e-07, "loss": 0.0258, "step": 5172 }, { "epoch": 2.3535031847133756, "grad_norm": 0.4142017699891913, "learning_rate": 5.461006925944753e-07, "loss": 0.0097, "step": 5173 }, { "epoch": 2.353958143767061, "grad_norm": 0.9513727310208583, "learning_rate": 5.45958369936546e-07, "loss": 0.0181, "step": 5174 }, { "epoch": 2.3544131028207462, "grad_norm": 0.5787692328576577, "learning_rate": 5.458160435231062e-07, "loss": 0.0117, "step": 5175 }, { "epoch": 2.3548680618744315, "grad_norm": 0.6999396193605177, "learning_rate": 5.456737133657864e-07, "loss": 0.0132, "step": 5176 }, { "epoch": 2.3553230209281164, "grad_norm": 1.3922222758983431, "learning_rate": 5.455313794762167e-07, "loss": 0.0154, "step": 5177 }, { "epoch": 2.3557779799818017, "grad_norm": 0.9688587756542076, "learning_rate": 5.453890418660286e-07, "loss": 0.0206, "step": 5178 }, { "epoch": 2.356232939035487, "grad_norm": 0.7936900627899831, "learning_rate": 5.452467005468527e-07, "loss": 0.0187, "step": 5179 }, { "epoch": 2.356687898089172, "grad_norm": 0.9791665110317825, "learning_rate": 5.45104355530321e-07, "loss": 0.0196, "step": 5180 }, { "epoch": 2.357142857142857, "grad_norm": 0.8118873281195423, "learning_rate": 5.449620068280649e-07, "loss": 0.0215, "step": 5181 }, { "epoch": 2.3575978161965425, "grad_norm": 1.0102096344378666, "learning_rate": 5.448196544517167e-07, "loss": 0.0149, "step": 5182 }, { "epoch": 2.3580527752502274, "grad_norm": 0.8909256875227861, "learning_rate": 5.446772984129088e-07, "loss": 0.0365, "step": 5183 }, { "epoch": 2.3585077343039127, "grad_norm": 0.776134034555858, "learning_rate": 5.445349387232737e-07, "loss": 0.0152, "step": 5184 }, { "epoch": 2.358962693357598, "grad_norm": 1.2317313619473176, "learning_rate": 5.443925753944447e-07, "loss": 0.0204, "step": 5185 }, { "epoch": 2.359417652411283, "grad_norm": 0.6446094507530061, "learning_rate": 5.442502084380548e-07, "loss": 0.0119, "step": 5186 }, { "epoch": 2.359872611464968, "grad_norm": 0.8698645455000993, "learning_rate": 5.441078378657378e-07, "loss": 0.0102, "step": 5187 }, { "epoch": 2.3603275705186535, "grad_norm": 0.7136585331763571, "learning_rate": 5.439654636891275e-07, "loss": 0.0238, "step": 5188 }, { "epoch": 2.3607825295723384, "grad_norm": 1.1145349201473604, "learning_rate": 5.438230859198579e-07, "loss": 0.018, "step": 5189 }, { "epoch": 2.3612374886260237, "grad_norm": 0.5746764089683954, "learning_rate": 5.436807045695638e-07, "loss": 0.0156, "step": 5190 }, { "epoch": 2.361692447679709, "grad_norm": 0.8365552692128699, "learning_rate": 5.435383196498794e-07, "loss": 0.0209, "step": 5191 }, { "epoch": 2.362147406733394, "grad_norm": 0.7731189758458016, "learning_rate": 5.433959311724406e-07, "loss": 0.0224, "step": 5192 }, { "epoch": 2.362602365787079, "grad_norm": 0.9384088077667632, "learning_rate": 5.432535391488821e-07, "loss": 0.0296, "step": 5193 }, { "epoch": 2.3630573248407645, "grad_norm": 0.6886921055048633, "learning_rate": 5.431111435908396e-07, "loss": 0.0141, "step": 5194 }, { "epoch": 2.3635122838944493, "grad_norm": 0.6549648206409677, "learning_rate": 5.429687445099492e-07, "loss": 0.0117, "step": 5195 }, { "epoch": 2.3639672429481347, "grad_norm": 0.6329756491119277, "learning_rate": 5.428263419178471e-07, "loss": 0.0192, "step": 5196 }, { "epoch": 2.36442220200182, "grad_norm": 0.6976224016670598, "learning_rate": 5.426839358261698e-07, "loss": 0.0237, "step": 5197 }, { "epoch": 2.364877161055505, "grad_norm": 0.537542646487351, "learning_rate": 5.425415262465538e-07, "loss": 0.011, "step": 5198 }, { "epoch": 2.36533212010919, "grad_norm": 0.675791750902084, "learning_rate": 5.423991131906366e-07, "loss": 0.0202, "step": 5199 }, { "epoch": 2.3657870791628755, "grad_norm": 0.8187715329083872, "learning_rate": 5.422566966700553e-07, "loss": 0.0174, "step": 5200 }, { "epoch": 2.3662420382165603, "grad_norm": 0.9846514685311066, "learning_rate": 5.421142766964474e-07, "loss": 0.0196, "step": 5201 }, { "epoch": 2.3666969972702456, "grad_norm": 0.6641076941642617, "learning_rate": 5.419718532814512e-07, "loss": 0.0137, "step": 5202 }, { "epoch": 2.367151956323931, "grad_norm": 0.704289546685685, "learning_rate": 5.418294264367046e-07, "loss": 0.0195, "step": 5203 }, { "epoch": 2.367606915377616, "grad_norm": 0.802165507523898, "learning_rate": 5.416869961738463e-07, "loss": 0.021, "step": 5204 }, { "epoch": 2.368061874431301, "grad_norm": 0.9119237078267505, "learning_rate": 5.415445625045147e-07, "loss": 0.0304, "step": 5205 }, { "epoch": 2.3685168334849864, "grad_norm": 0.7183461718920918, "learning_rate": 5.414021254403492e-07, "loss": 0.0205, "step": 5206 }, { "epoch": 2.3689717925386713, "grad_norm": 0.5611172560074457, "learning_rate": 5.412596849929891e-07, "loss": 0.0128, "step": 5207 }, { "epoch": 2.3694267515923566, "grad_norm": 0.6222604254154234, "learning_rate": 5.411172411740736e-07, "loss": 0.0198, "step": 5208 }, { "epoch": 2.369881710646042, "grad_norm": 0.7832379984067773, "learning_rate": 5.409747939952432e-07, "loss": 0.0139, "step": 5209 }, { "epoch": 2.370336669699727, "grad_norm": 0.932157072772348, "learning_rate": 5.408323434681374e-07, "loss": 0.0232, "step": 5210 }, { "epoch": 2.370791628753412, "grad_norm": 0.922169491348641, "learning_rate": 5.40689889604397e-07, "loss": 0.0202, "step": 5211 }, { "epoch": 2.3712465878070974, "grad_norm": 0.4037923288346381, "learning_rate": 5.405474324156624e-07, "loss": 0.0094, "step": 5212 }, { "epoch": 2.3717015468607827, "grad_norm": 0.908792193582552, "learning_rate": 5.404049719135748e-07, "loss": 0.0151, "step": 5213 }, { "epoch": 2.3721565059144676, "grad_norm": 0.9255018609926726, "learning_rate": 5.402625081097756e-07, "loss": 0.0201, "step": 5214 }, { "epoch": 2.372611464968153, "grad_norm": 0.7198098722160322, "learning_rate": 5.401200410159058e-07, "loss": 0.0199, "step": 5215 }, { "epoch": 2.373066424021838, "grad_norm": 0.7226848492731477, "learning_rate": 5.399775706436075e-07, "loss": 0.0145, "step": 5216 }, { "epoch": 2.373521383075523, "grad_norm": 0.8198375074656717, "learning_rate": 5.398350970045228e-07, "loss": 0.0131, "step": 5217 }, { "epoch": 2.3739763421292084, "grad_norm": 0.7654077882166983, "learning_rate": 5.396926201102935e-07, "loss": 0.0174, "step": 5218 }, { "epoch": 2.3744313011828937, "grad_norm": 0.7184145243768898, "learning_rate": 5.39550139972563e-07, "loss": 0.0249, "step": 5219 }, { "epoch": 2.3748862602365786, "grad_norm": 1.1541024516761216, "learning_rate": 5.394076566029732e-07, "loss": 0.0363, "step": 5220 }, { "epoch": 2.375341219290264, "grad_norm": 0.5123482884418783, "learning_rate": 5.392651700131681e-07, "loss": 0.0112, "step": 5221 }, { "epoch": 2.375796178343949, "grad_norm": 0.6469789417862009, "learning_rate": 5.391226802147904e-07, "loss": 0.0064, "step": 5222 }, { "epoch": 2.376251137397634, "grad_norm": 0.6279957127755206, "learning_rate": 5.389801872194839e-07, "loss": 0.0112, "step": 5223 }, { "epoch": 2.3767060964513194, "grad_norm": 0.5881685111977454, "learning_rate": 5.388376910388928e-07, "loss": 0.0147, "step": 5224 }, { "epoch": 2.3771610555050047, "grad_norm": 0.8913675848473358, "learning_rate": 5.386951916846607e-07, "loss": 0.0182, "step": 5225 }, { "epoch": 2.3776160145586895, "grad_norm": 0.7726015344461268, "learning_rate": 5.385526891684323e-07, "loss": 0.0158, "step": 5226 }, { "epoch": 2.378070973612375, "grad_norm": 0.43333725155206554, "learning_rate": 5.384101835018524e-07, "loss": 0.0046, "step": 5227 }, { "epoch": 2.37852593266606, "grad_norm": 0.7457366754685264, "learning_rate": 5.382676746965657e-07, "loss": 0.0132, "step": 5228 }, { "epoch": 2.3789808917197455, "grad_norm": 0.7730807843925512, "learning_rate": 5.381251627642174e-07, "loss": 0.0245, "step": 5229 }, { "epoch": 2.3794358507734303, "grad_norm": 0.7591473585052955, "learning_rate": 5.37982647716453e-07, "loss": 0.0124, "step": 5230 }, { "epoch": 2.3798908098271156, "grad_norm": 1.1950276225152185, "learning_rate": 5.378401295649182e-07, "loss": 0.0356, "step": 5231 }, { "epoch": 2.380345768880801, "grad_norm": 0.47006308778056327, "learning_rate": 5.376976083212588e-07, "loss": 0.0081, "step": 5232 }, { "epoch": 2.380800727934486, "grad_norm": 0.6800711353829005, "learning_rate": 5.375550839971211e-07, "loss": 0.0125, "step": 5233 }, { "epoch": 2.381255686988171, "grad_norm": 0.9941188224126964, "learning_rate": 5.374125566041516e-07, "loss": 0.0132, "step": 5234 }, { "epoch": 2.3817106460418564, "grad_norm": 0.7831660124897516, "learning_rate": 5.37270026153997e-07, "loss": 0.0119, "step": 5235 }, { "epoch": 2.3821656050955413, "grad_norm": 0.6622169032449349, "learning_rate": 5.37127492658304e-07, "loss": 0.0111, "step": 5236 }, { "epoch": 2.3826205641492266, "grad_norm": 0.7347831030861335, "learning_rate": 5.3698495612872e-07, "loss": 0.0166, "step": 5237 }, { "epoch": 2.383075523202912, "grad_norm": 1.183432055717985, "learning_rate": 5.368424165768924e-07, "loss": 0.0352, "step": 5238 }, { "epoch": 2.383530482256597, "grad_norm": 0.9668633021463724, "learning_rate": 5.366998740144691e-07, "loss": 0.0124, "step": 5239 }, { "epoch": 2.383985441310282, "grad_norm": 1.1618178541560396, "learning_rate": 5.365573284530975e-07, "loss": 0.0172, "step": 5240 }, { "epoch": 2.3844404003639674, "grad_norm": 1.276030935651846, "learning_rate": 5.364147799044264e-07, "loss": 0.0192, "step": 5241 }, { "epoch": 2.3848953594176523, "grad_norm": 0.6700217809148525, "learning_rate": 5.362722283801038e-07, "loss": 0.0123, "step": 5242 }, { "epoch": 2.3853503184713376, "grad_norm": 0.5308694699134624, "learning_rate": 5.361296738917784e-07, "loss": 0.0088, "step": 5243 }, { "epoch": 2.385805277525023, "grad_norm": 1.3495276029510468, "learning_rate": 5.359871164510994e-07, "loss": 0.0279, "step": 5244 }, { "epoch": 2.386260236578708, "grad_norm": 1.0868102248633584, "learning_rate": 5.358445560697157e-07, "loss": 0.0247, "step": 5245 }, { "epoch": 2.386715195632393, "grad_norm": 0.8590014769078842, "learning_rate": 5.357019927592769e-07, "loss": 0.0134, "step": 5246 }, { "epoch": 2.3871701546860784, "grad_norm": 0.7961845799486928, "learning_rate": 5.355594265314321e-07, "loss": 0.0217, "step": 5247 }, { "epoch": 2.3876251137397633, "grad_norm": 1.1689961007459637, "learning_rate": 5.354168573978318e-07, "loss": 0.0132, "step": 5248 }, { "epoch": 2.3880800727934486, "grad_norm": 0.6962030539249529, "learning_rate": 5.352742853701259e-07, "loss": 0.009, "step": 5249 }, { "epoch": 2.388535031847134, "grad_norm": 1.2796850564946227, "learning_rate": 5.351317104599645e-07, "loss": 0.0148, "step": 5250 }, { "epoch": 2.3889899909008188, "grad_norm": 0.8194310002477011, "learning_rate": 5.349891326789986e-07, "loss": 0.0288, "step": 5251 }, { "epoch": 2.389444949954504, "grad_norm": 1.164561908000258, "learning_rate": 5.348465520388786e-07, "loss": 0.0185, "step": 5252 }, { "epoch": 2.3898999090081894, "grad_norm": 0.9954111862320181, "learning_rate": 5.347039685512559e-07, "loss": 0.0265, "step": 5253 }, { "epoch": 2.3903548680618742, "grad_norm": 0.8695146281759338, "learning_rate": 5.345613822277814e-07, "loss": 0.0249, "step": 5254 }, { "epoch": 2.3908098271155596, "grad_norm": 0.8564449362268802, "learning_rate": 5.344187930801071e-07, "loss": 0.0321, "step": 5255 }, { "epoch": 2.391264786169245, "grad_norm": 0.6000399719395467, "learning_rate": 5.342762011198843e-07, "loss": 0.0105, "step": 5256 }, { "epoch": 2.3917197452229297, "grad_norm": 1.0647054529199107, "learning_rate": 5.341336063587651e-07, "loss": 0.0409, "step": 5257 }, { "epoch": 2.392174704276615, "grad_norm": 1.2247775932609706, "learning_rate": 5.339910088084018e-07, "loss": 0.0184, "step": 5258 }, { "epoch": 2.3926296633303004, "grad_norm": 1.0624550525871925, "learning_rate": 5.338484084804466e-07, "loss": 0.0176, "step": 5259 }, { "epoch": 2.3930846223839852, "grad_norm": 0.9438767827223448, "learning_rate": 5.337058053865527e-07, "loss": 0.0415, "step": 5260 }, { "epoch": 2.3935395814376705, "grad_norm": 0.8580445520748305, "learning_rate": 5.335631995383721e-07, "loss": 0.0123, "step": 5261 }, { "epoch": 2.393994540491356, "grad_norm": 1.0899350881087158, "learning_rate": 5.334205909475588e-07, "loss": 0.0232, "step": 5262 }, { "epoch": 2.3944494995450407, "grad_norm": 0.5744267003876907, "learning_rate": 5.332779796257656e-07, "loss": 0.0091, "step": 5263 }, { "epoch": 2.394904458598726, "grad_norm": 0.6761972091760919, "learning_rate": 5.331353655846462e-07, "loss": 0.0134, "step": 5264 }, { "epoch": 2.3953594176524113, "grad_norm": 1.1704007698348646, "learning_rate": 5.329927488358543e-07, "loss": 0.024, "step": 5265 }, { "epoch": 2.395814376706096, "grad_norm": 1.2216688915164087, "learning_rate": 5.328501293910439e-07, "loss": 0.0137, "step": 5266 }, { "epoch": 2.3962693357597815, "grad_norm": 0.881656522012382, "learning_rate": 5.327075072618696e-07, "loss": 0.0193, "step": 5267 }, { "epoch": 2.396724294813467, "grad_norm": 1.0290542396017228, "learning_rate": 5.325648824599852e-07, "loss": 0.0191, "step": 5268 }, { "epoch": 2.397179253867152, "grad_norm": 0.7067208958894959, "learning_rate": 5.324222549970458e-07, "loss": 0.0177, "step": 5269 }, { "epoch": 2.397634212920837, "grad_norm": 0.7493633324963801, "learning_rate": 5.322796248847061e-07, "loss": 0.0114, "step": 5270 }, { "epoch": 2.3980891719745223, "grad_norm": 0.665641879159369, "learning_rate": 5.321369921346211e-07, "loss": 0.0169, "step": 5271 }, { "epoch": 2.3985441310282076, "grad_norm": 1.0285002457127295, "learning_rate": 5.319943567584464e-07, "loss": 0.0132, "step": 5272 }, { "epoch": 2.3989990900818925, "grad_norm": 0.7829797768170973, "learning_rate": 5.318517187678373e-07, "loss": 0.0229, "step": 5273 }, { "epoch": 2.399454049135578, "grad_norm": 0.8959869336842406, "learning_rate": 5.317090781744496e-07, "loss": 0.0269, "step": 5274 }, { "epoch": 2.399909008189263, "grad_norm": 0.8213263834851672, "learning_rate": 5.315664349899393e-07, "loss": 0.0202, "step": 5275 }, { "epoch": 2.400363967242948, "grad_norm": 0.8294710940719623, "learning_rate": 5.314237892259624e-07, "loss": 0.0206, "step": 5276 }, { "epoch": 2.4008189262966333, "grad_norm": 0.6790461741082483, "learning_rate": 5.312811408941753e-07, "loss": 0.0079, "step": 5277 }, { "epoch": 2.4012738853503186, "grad_norm": 0.7656100147213915, "learning_rate": 5.311384900062345e-07, "loss": 0.0152, "step": 5278 }, { "epoch": 2.4017288444040035, "grad_norm": 1.0415960870814716, "learning_rate": 5.309958365737972e-07, "loss": 0.0167, "step": 5279 }, { "epoch": 2.402183803457689, "grad_norm": 1.056093808327353, "learning_rate": 5.308531806085202e-07, "loss": 0.0346, "step": 5280 }, { "epoch": 2.402638762511374, "grad_norm": 0.647677402255278, "learning_rate": 5.307105221220603e-07, "loss": 0.0144, "step": 5281 }, { "epoch": 2.403093721565059, "grad_norm": 0.7966598877694495, "learning_rate": 5.305678611260753e-07, "loss": 0.0182, "step": 5282 }, { "epoch": 2.4035486806187443, "grad_norm": 0.9977768312667369, "learning_rate": 5.304251976322228e-07, "loss": 0.0099, "step": 5283 }, { "epoch": 2.4040036396724296, "grad_norm": 0.8617696816622984, "learning_rate": 5.302825316521606e-07, "loss": 0.022, "step": 5284 }, { "epoch": 2.404458598726115, "grad_norm": 0.9447751075240197, "learning_rate": 5.301398631975466e-07, "loss": 0.0176, "step": 5285 }, { "epoch": 2.4049135577797998, "grad_norm": 1.445143076278695, "learning_rate": 5.29997192280039e-07, "loss": 0.0192, "step": 5286 }, { "epoch": 2.405368516833485, "grad_norm": 0.7964679785835838, "learning_rate": 5.298545189112965e-07, "loss": 0.02, "step": 5287 }, { "epoch": 2.4058234758871704, "grad_norm": 1.2163004159014568, "learning_rate": 5.297118431029775e-07, "loss": 0.0329, "step": 5288 }, { "epoch": 2.4062784349408552, "grad_norm": 0.7356845837494645, "learning_rate": 5.295691648667407e-07, "loss": 0.0138, "step": 5289 }, { "epoch": 2.4067333939945406, "grad_norm": 0.9673565662321004, "learning_rate": 5.294264842142453e-07, "loss": 0.021, "step": 5290 }, { "epoch": 2.407188353048226, "grad_norm": 0.8382717004196916, "learning_rate": 5.292838011571506e-07, "loss": 0.0112, "step": 5291 }, { "epoch": 2.4076433121019107, "grad_norm": 0.9181005832850434, "learning_rate": 5.291411157071159e-07, "loss": 0.0187, "step": 5292 }, { "epoch": 2.408098271155596, "grad_norm": 0.6804983769884528, "learning_rate": 5.289984278758009e-07, "loss": 0.0186, "step": 5293 }, { "epoch": 2.4085532302092814, "grad_norm": 0.6572940932526081, "learning_rate": 5.288557376748652e-07, "loss": 0.0132, "step": 5294 }, { "epoch": 2.4090081892629662, "grad_norm": 0.7892567251936357, "learning_rate": 5.287130451159689e-07, "loss": 0.0157, "step": 5295 }, { "epoch": 2.4094631483166515, "grad_norm": 0.5294195470568747, "learning_rate": 5.285703502107723e-07, "loss": 0.01, "step": 5296 }, { "epoch": 2.409918107370337, "grad_norm": 0.658075702173374, "learning_rate": 5.284276529709357e-07, "loss": 0.0135, "step": 5297 }, { "epoch": 2.4103730664240217, "grad_norm": 0.6753001639488709, "learning_rate": 5.282849534081198e-07, "loss": 0.015, "step": 5298 }, { "epoch": 2.410828025477707, "grad_norm": 0.8919221505403159, "learning_rate": 5.28142251533985e-07, "loss": 0.0148, "step": 5299 }, { "epoch": 2.4112829845313923, "grad_norm": 1.1618589182998234, "learning_rate": 5.279995473601926e-07, "loss": 0.0144, "step": 5300 }, { "epoch": 2.411737943585077, "grad_norm": 0.7480389631583544, "learning_rate": 5.278568408984037e-07, "loss": 0.0259, "step": 5301 }, { "epoch": 2.4121929026387625, "grad_norm": 1.1407441505855613, "learning_rate": 5.277141321602795e-07, "loss": 0.0278, "step": 5302 }, { "epoch": 2.412647861692448, "grad_norm": 0.9910744705709, "learning_rate": 5.275714211574816e-07, "loss": 0.0137, "step": 5303 }, { "epoch": 2.4131028207461327, "grad_norm": 0.8126375741722237, "learning_rate": 5.274287079016716e-07, "loss": 0.0223, "step": 5304 }, { "epoch": 2.413557779799818, "grad_norm": 0.5799982860107258, "learning_rate": 5.272859924045115e-07, "loss": 0.0225, "step": 5305 }, { "epoch": 2.4140127388535033, "grad_norm": 0.5521097387583427, "learning_rate": 5.271432746776633e-07, "loss": 0.0098, "step": 5306 }, { "epoch": 2.414467697907188, "grad_norm": 0.8103915675117351, "learning_rate": 5.270005547327893e-07, "loss": 0.0198, "step": 5307 }, { "epoch": 2.4149226569608735, "grad_norm": 1.1257571136662745, "learning_rate": 5.26857832581552e-07, "loss": 0.0272, "step": 5308 }, { "epoch": 2.415377616014559, "grad_norm": 0.8371415216315957, "learning_rate": 5.267151082356137e-07, "loss": 0.0115, "step": 5309 }, { "epoch": 2.4158325750682437, "grad_norm": 0.6858320184805021, "learning_rate": 5.265723817066375e-07, "loss": 0.0089, "step": 5310 }, { "epoch": 2.416287534121929, "grad_norm": 0.52512501483336, "learning_rate": 5.264296530062864e-07, "loss": 0.0079, "step": 5311 }, { "epoch": 2.4167424931756143, "grad_norm": 0.7434805274155908, "learning_rate": 5.262869221462232e-07, "loss": 0.0099, "step": 5312 }, { "epoch": 2.417197452229299, "grad_norm": 0.6530109278352823, "learning_rate": 5.261441891381115e-07, "loss": 0.0119, "step": 5313 }, { "epoch": 2.4176524112829845, "grad_norm": 1.1811327556107378, "learning_rate": 5.260014539936148e-07, "loss": 0.0256, "step": 5314 }, { "epoch": 2.41810737033667, "grad_norm": 1.1475309203753064, "learning_rate": 5.258587167243967e-07, "loss": 0.023, "step": 5315 }, { "epoch": 2.4185623293903546, "grad_norm": 0.6701657099301735, "learning_rate": 5.25715977342121e-07, "loss": 0.0113, "step": 5316 }, { "epoch": 2.41901728844404, "grad_norm": 0.7368852250465391, "learning_rate": 5.255732358584517e-07, "loss": 0.0114, "step": 5317 }, { "epoch": 2.4194722474977253, "grad_norm": 0.6691204599358106, "learning_rate": 5.254304922850532e-07, "loss": 0.0133, "step": 5318 }, { "epoch": 2.41992720655141, "grad_norm": 0.9008154186075427, "learning_rate": 5.252877466335896e-07, "loss": 0.0252, "step": 5319 }, { "epoch": 2.4203821656050954, "grad_norm": 0.9946711300885157, "learning_rate": 5.251449989157257e-07, "loss": 0.0159, "step": 5320 }, { "epoch": 2.4208371246587808, "grad_norm": 0.8004688431527411, "learning_rate": 5.250022491431259e-07, "loss": 0.027, "step": 5321 }, { "epoch": 2.421292083712466, "grad_norm": 0.7400357644644958, "learning_rate": 5.248594973274552e-07, "loss": 0.0113, "step": 5322 }, { "epoch": 2.421747042766151, "grad_norm": 0.9795729237779581, "learning_rate": 5.247167434803786e-07, "loss": 0.0234, "step": 5323 }, { "epoch": 2.4222020018198362, "grad_norm": 0.758030640631799, "learning_rate": 5.245739876135614e-07, "loss": 0.0154, "step": 5324 }, { "epoch": 2.4226569608735216, "grad_norm": 1.1108827303310023, "learning_rate": 5.244312297386691e-07, "loss": 0.0445, "step": 5325 }, { "epoch": 2.4231119199272064, "grad_norm": 0.6658894266585782, "learning_rate": 5.242884698673667e-07, "loss": 0.0166, "step": 5326 }, { "epoch": 2.4235668789808917, "grad_norm": 0.6922565260641231, "learning_rate": 5.241457080113204e-07, "loss": 0.0163, "step": 5327 }, { "epoch": 2.424021838034577, "grad_norm": 1.0745630224641072, "learning_rate": 5.240029441821959e-07, "loss": 0.0292, "step": 5328 }, { "epoch": 2.424476797088262, "grad_norm": 0.8374015020895897, "learning_rate": 5.238601783916592e-07, "loss": 0.0136, "step": 5329 }, { "epoch": 2.4249317561419472, "grad_norm": 0.8029339039974415, "learning_rate": 5.237174106513763e-07, "loss": 0.0167, "step": 5330 }, { "epoch": 2.4253867151956325, "grad_norm": 0.8606671173554737, "learning_rate": 5.235746409730138e-07, "loss": 0.0201, "step": 5331 }, { "epoch": 2.4258416742493174, "grad_norm": 0.8463368608375055, "learning_rate": 5.234318693682384e-07, "loss": 0.0294, "step": 5332 }, { "epoch": 2.4262966333030027, "grad_norm": 0.6662090542760197, "learning_rate": 5.232890958487161e-07, "loss": 0.0064, "step": 5333 }, { "epoch": 2.426751592356688, "grad_norm": 0.674582214447184, "learning_rate": 5.23146320426114e-07, "loss": 0.0177, "step": 5334 }, { "epoch": 2.427206551410373, "grad_norm": 0.8944412659123187, "learning_rate": 5.230035431120995e-07, "loss": 0.0218, "step": 5335 }, { "epoch": 2.427661510464058, "grad_norm": 0.981670458826332, "learning_rate": 5.228607639183391e-07, "loss": 0.0247, "step": 5336 }, { "epoch": 2.4281164695177435, "grad_norm": 1.1443658801717993, "learning_rate": 5.227179828565002e-07, "loss": 0.0292, "step": 5337 }, { "epoch": 2.4285714285714284, "grad_norm": 0.8978564747968155, "learning_rate": 5.225751999382506e-07, "loss": 0.0139, "step": 5338 }, { "epoch": 2.4290263876251137, "grad_norm": 0.545768275029812, "learning_rate": 5.224324151752575e-07, "loss": 0.0151, "step": 5339 }, { "epoch": 2.429481346678799, "grad_norm": 0.7900077047617381, "learning_rate": 5.222896285791888e-07, "loss": 0.017, "step": 5340 }, { "epoch": 2.4299363057324843, "grad_norm": 0.531418442996808, "learning_rate": 5.221468401617121e-07, "loss": 0.0104, "step": 5341 }, { "epoch": 2.430391264786169, "grad_norm": 0.74740069666814, "learning_rate": 5.220040499344958e-07, "loss": 0.0157, "step": 5342 }, { "epoch": 2.4308462238398545, "grad_norm": 0.8161983902320259, "learning_rate": 5.218612579092079e-07, "loss": 0.0105, "step": 5343 }, { "epoch": 2.43130118289354, "grad_norm": 0.6863907001218266, "learning_rate": 5.217184640975167e-07, "loss": 0.0108, "step": 5344 }, { "epoch": 2.4317561419472247, "grad_norm": 0.7818742213087054, "learning_rate": 5.215756685110907e-07, "loss": 0.02, "step": 5345 }, { "epoch": 2.43221110100091, "grad_norm": 0.6973597499170344, "learning_rate": 5.214328711615984e-07, "loss": 0.0147, "step": 5346 }, { "epoch": 2.4326660600545953, "grad_norm": 0.7090748661831369, "learning_rate": 5.212900720607088e-07, "loss": 0.0117, "step": 5347 }, { "epoch": 2.43312101910828, "grad_norm": 1.1004144622465826, "learning_rate": 5.211472712200905e-07, "loss": 0.0341, "step": 5348 }, { "epoch": 2.4335759781619655, "grad_norm": 0.7749913006486842, "learning_rate": 5.210044686514129e-07, "loss": 0.0193, "step": 5349 }, { "epoch": 2.434030937215651, "grad_norm": 0.6601997363390881, "learning_rate": 5.208616643663449e-07, "loss": 0.0078, "step": 5350 }, { "epoch": 2.4344858962693356, "grad_norm": 1.008141325474126, "learning_rate": 5.207188583765559e-07, "loss": 0.0235, "step": 5351 }, { "epoch": 2.434940855323021, "grad_norm": 1.1992417944783769, "learning_rate": 5.205760506937153e-07, "loss": 0.0228, "step": 5352 }, { "epoch": 2.4353958143767063, "grad_norm": 1.065890946877384, "learning_rate": 5.204332413294929e-07, "loss": 0.0203, "step": 5353 }, { "epoch": 2.435850773430391, "grad_norm": 0.7975335819943639, "learning_rate": 5.202904302955582e-07, "loss": 0.0119, "step": 5354 }, { "epoch": 2.4363057324840764, "grad_norm": 0.9283611647841963, "learning_rate": 5.201476176035812e-07, "loss": 0.0205, "step": 5355 }, { "epoch": 2.4367606915377618, "grad_norm": 0.6343684613792223, "learning_rate": 5.200048032652318e-07, "loss": 0.0074, "step": 5356 }, { "epoch": 2.4372156505914466, "grad_norm": 0.6712390451341768, "learning_rate": 5.198619872921804e-07, "loss": 0.0056, "step": 5357 }, { "epoch": 2.437670609645132, "grad_norm": 1.2291451276857768, "learning_rate": 5.197191696960967e-07, "loss": 0.0132, "step": 5358 }, { "epoch": 2.4381255686988172, "grad_norm": 1.3999843859926708, "learning_rate": 5.195763504886518e-07, "loss": 0.0297, "step": 5359 }, { "epoch": 2.438580527752502, "grad_norm": 0.7754618422107653, "learning_rate": 5.194335296815159e-07, "loss": 0.018, "step": 5360 }, { "epoch": 2.4390354868061874, "grad_norm": 0.5550043720517348, "learning_rate": 5.192907072863598e-07, "loss": 0.01, "step": 5361 }, { "epoch": 2.4394904458598727, "grad_norm": 0.7830488437837521, "learning_rate": 5.191478833148542e-07, "loss": 0.0282, "step": 5362 }, { "epoch": 2.4399454049135576, "grad_norm": 0.7428724585928976, "learning_rate": 5.190050577786699e-07, "loss": 0.011, "step": 5363 }, { "epoch": 2.440400363967243, "grad_norm": 0.9107518120981692, "learning_rate": 5.188622306894782e-07, "loss": 0.025, "step": 5364 }, { "epoch": 2.4408553230209282, "grad_norm": 1.0685160837805021, "learning_rate": 5.187194020589501e-07, "loss": 0.0124, "step": 5365 }, { "epoch": 2.441310282074613, "grad_norm": 0.5171061979757959, "learning_rate": 5.18576571898757e-07, "loss": 0.0078, "step": 5366 }, { "epoch": 2.4417652411282984, "grad_norm": 0.8219768962045525, "learning_rate": 5.184337402205704e-07, "loss": 0.0145, "step": 5367 }, { "epoch": 2.4422202001819837, "grad_norm": 0.4992533549092486, "learning_rate": 5.182909070360619e-07, "loss": 0.0083, "step": 5368 }, { "epoch": 2.4426751592356686, "grad_norm": 0.5527377356119058, "learning_rate": 5.181480723569029e-07, "loss": 0.0076, "step": 5369 }, { "epoch": 2.443130118289354, "grad_norm": 0.5984479509125199, "learning_rate": 5.180052361947656e-07, "loss": 0.016, "step": 5370 }, { "epoch": 2.443585077343039, "grad_norm": 0.9414218643525814, "learning_rate": 5.178623985613216e-07, "loss": 0.013, "step": 5371 }, { "epoch": 2.444040036396724, "grad_norm": 0.6344592350376572, "learning_rate": 5.177195594682429e-07, "loss": 0.0191, "step": 5372 }, { "epoch": 2.4444949954504094, "grad_norm": 0.6248369710260351, "learning_rate": 5.17576718927202e-07, "loss": 0.0123, "step": 5373 }, { "epoch": 2.4449499545040947, "grad_norm": 0.6630177003678707, "learning_rate": 5.174338769498711e-07, "loss": 0.0123, "step": 5374 }, { "epoch": 2.4454049135577796, "grad_norm": 0.9628989317751433, "learning_rate": 5.172910335479222e-07, "loss": 0.0331, "step": 5375 }, { "epoch": 2.445859872611465, "grad_norm": 1.007024188237931, "learning_rate": 5.171481887330282e-07, "loss": 0.0257, "step": 5376 }, { "epoch": 2.44631483166515, "grad_norm": 0.784129571814709, "learning_rate": 5.170053425168618e-07, "loss": 0.0178, "step": 5377 }, { "epoch": 2.4467697907188355, "grad_norm": 0.6447956180937221, "learning_rate": 5.168624949110956e-07, "loss": 0.0124, "step": 5378 }, { "epoch": 2.4472247497725204, "grad_norm": 1.5654253457841592, "learning_rate": 5.167196459274024e-07, "loss": 0.0379, "step": 5379 }, { "epoch": 2.4476797088262057, "grad_norm": 0.6765770323912685, "learning_rate": 5.165767955774552e-07, "loss": 0.0113, "step": 5380 }, { "epoch": 2.448134667879891, "grad_norm": 0.7817632308785677, "learning_rate": 5.164339438729273e-07, "loss": 0.018, "step": 5381 }, { "epoch": 2.448589626933576, "grad_norm": 0.7349979533953329, "learning_rate": 5.162910908254916e-07, "loss": 0.0241, "step": 5382 }, { "epoch": 2.449044585987261, "grad_norm": 1.0240531453882928, "learning_rate": 5.161482364468215e-07, "loss": 0.0215, "step": 5383 }, { "epoch": 2.4494995450409465, "grad_norm": 0.6378786287508392, "learning_rate": 5.160053807485903e-07, "loss": 0.0094, "step": 5384 }, { "epoch": 2.4499545040946313, "grad_norm": 0.7890335666089454, "learning_rate": 5.158625237424719e-07, "loss": 0.0207, "step": 5385 }, { "epoch": 2.4504094631483166, "grad_norm": 0.8091238383892219, "learning_rate": 5.157196654401397e-07, "loss": 0.0295, "step": 5386 }, { "epoch": 2.450864422202002, "grad_norm": 0.8147665119121361, "learning_rate": 5.155768058532673e-07, "loss": 0.0178, "step": 5387 }, { "epoch": 2.451319381255687, "grad_norm": 1.1907928741261022, "learning_rate": 5.154339449935288e-07, "loss": 0.0169, "step": 5388 }, { "epoch": 2.451774340309372, "grad_norm": 0.5595195443238529, "learning_rate": 5.152910828725979e-07, "loss": 0.0047, "step": 5389 }, { "epoch": 2.4522292993630574, "grad_norm": 0.8128577661913918, "learning_rate": 5.151482195021488e-07, "loss": 0.0184, "step": 5390 }, { "epoch": 2.4526842584167423, "grad_norm": 1.1522079326339254, "learning_rate": 5.150053548938556e-07, "loss": 0.0355, "step": 5391 }, { "epoch": 2.4531392174704276, "grad_norm": 0.7436810268194742, "learning_rate": 5.148624890593927e-07, "loss": 0.0169, "step": 5392 }, { "epoch": 2.453594176524113, "grad_norm": 0.8831158295826851, "learning_rate": 5.14719622010434e-07, "loss": 0.0132, "step": 5393 }, { "epoch": 2.4540491355777982, "grad_norm": 0.731272719967538, "learning_rate": 5.145767537586546e-07, "loss": 0.0075, "step": 5394 }, { "epoch": 2.454504094631483, "grad_norm": 0.7044015055306737, "learning_rate": 5.144338843157286e-07, "loss": 0.018, "step": 5395 }, { "epoch": 2.4549590536851684, "grad_norm": 0.93539359891532, "learning_rate": 5.142910136933306e-07, "loss": 0.0304, "step": 5396 }, { "epoch": 2.4554140127388537, "grad_norm": 1.2118316240345126, "learning_rate": 5.141481419031356e-07, "loss": 0.0154, "step": 5397 }, { "epoch": 2.4558689717925386, "grad_norm": 0.8648876264946395, "learning_rate": 5.140052689568185e-07, "loss": 0.0322, "step": 5398 }, { "epoch": 2.456323930846224, "grad_norm": 0.8284556443171381, "learning_rate": 5.138623948660538e-07, "loss": 0.0051, "step": 5399 }, { "epoch": 2.4567788898999092, "grad_norm": 0.8239331247768673, "learning_rate": 5.137195196425169e-07, "loss": 0.0194, "step": 5400 }, { "epoch": 2.457233848953594, "grad_norm": 0.5545495226430878, "learning_rate": 5.135766432978829e-07, "loss": 0.011, "step": 5401 }, { "epoch": 2.4576888080072794, "grad_norm": 0.65635873755248, "learning_rate": 5.134337658438269e-07, "loss": 0.0126, "step": 5402 }, { "epoch": 2.4581437670609647, "grad_norm": 0.8196775335026496, "learning_rate": 5.13290887292024e-07, "loss": 0.0124, "step": 5403 }, { "epoch": 2.4585987261146496, "grad_norm": 0.8857123922878034, "learning_rate": 5.131480076541501e-07, "loss": 0.0164, "step": 5404 }, { "epoch": 2.459053685168335, "grad_norm": 0.820752974033294, "learning_rate": 5.130051269418804e-07, "loss": 0.0206, "step": 5405 }, { "epoch": 2.45950864422202, "grad_norm": 0.6394006140247637, "learning_rate": 5.128622451668902e-07, "loss": 0.0122, "step": 5406 }, { "epoch": 2.459963603275705, "grad_norm": 0.782244081892027, "learning_rate": 5.127193623408556e-07, "loss": 0.0138, "step": 5407 }, { "epoch": 2.4604185623293904, "grad_norm": 1.0854430221786568, "learning_rate": 5.125764784754521e-07, "loss": 0.0105, "step": 5408 }, { "epoch": 2.4608735213830757, "grad_norm": 2.374362414107628, "learning_rate": 5.124335935823555e-07, "loss": 0.0646, "step": 5409 }, { "epoch": 2.4613284804367606, "grad_norm": 0.8043102311843925, "learning_rate": 5.122907076732419e-07, "loss": 0.0146, "step": 5410 }, { "epoch": 2.461783439490446, "grad_norm": 1.1551918194663162, "learning_rate": 5.12147820759787e-07, "loss": 0.0425, "step": 5411 }, { "epoch": 2.462238398544131, "grad_norm": 0.5890606220284833, "learning_rate": 5.120049328536673e-07, "loss": 0.016, "step": 5412 }, { "epoch": 2.462693357597816, "grad_norm": 0.8738304898808675, "learning_rate": 5.118620439665585e-07, "loss": 0.0314, "step": 5413 }, { "epoch": 2.4631483166515014, "grad_norm": 0.6975433720910416, "learning_rate": 5.117191541101371e-07, "loss": 0.0212, "step": 5414 }, { "epoch": 2.4636032757051867, "grad_norm": 0.874131459690079, "learning_rate": 5.115762632960794e-07, "loss": 0.0246, "step": 5415 }, { "epoch": 2.4640582347588715, "grad_norm": 1.1645675026678204, "learning_rate": 5.114333715360617e-07, "loss": 0.0293, "step": 5416 }, { "epoch": 2.464513193812557, "grad_norm": 0.8246800885222796, "learning_rate": 5.112904788417605e-07, "loss": 0.0165, "step": 5417 }, { "epoch": 2.464968152866242, "grad_norm": 0.832076663461905, "learning_rate": 5.111475852248522e-07, "loss": 0.0222, "step": 5418 }, { "epoch": 2.465423111919927, "grad_norm": 0.6874342828921397, "learning_rate": 5.110046906970139e-07, "loss": 0.0115, "step": 5419 }, { "epoch": 2.4658780709736123, "grad_norm": 0.731097876739095, "learning_rate": 5.108617952699218e-07, "loss": 0.007, "step": 5420 }, { "epoch": 2.4663330300272976, "grad_norm": 0.6994438973327185, "learning_rate": 5.107188989552528e-07, "loss": 0.0232, "step": 5421 }, { "epoch": 2.4667879890809825, "grad_norm": 1.0085143537990908, "learning_rate": 5.105760017646839e-07, "loss": 0.0154, "step": 5422 }, { "epoch": 2.467242948134668, "grad_norm": 0.9682403007275499, "learning_rate": 5.104331037098918e-07, "loss": 0.0343, "step": 5423 }, { "epoch": 2.467697907188353, "grad_norm": 0.9746915483077847, "learning_rate": 5.102902048025537e-07, "loss": 0.0138, "step": 5424 }, { "epoch": 2.468152866242038, "grad_norm": 1.7601679447457999, "learning_rate": 5.101473050543463e-07, "loss": 0.0277, "step": 5425 }, { "epoch": 2.4686078252957233, "grad_norm": 1.1167481692719115, "learning_rate": 5.100044044769472e-07, "loss": 0.0265, "step": 5426 }, { "epoch": 2.4690627843494086, "grad_norm": 0.8789120441276701, "learning_rate": 5.098615030820332e-07, "loss": 0.0106, "step": 5427 }, { "epoch": 2.4695177434030935, "grad_norm": 0.7840741334986231, "learning_rate": 5.097186008812818e-07, "loss": 0.0178, "step": 5428 }, { "epoch": 2.469972702456779, "grad_norm": 0.5070690484164013, "learning_rate": 5.095756978863701e-07, "loss": 0.0077, "step": 5429 }, { "epoch": 2.470427661510464, "grad_norm": 0.9225825819326506, "learning_rate": 5.094327941089757e-07, "loss": 0.0188, "step": 5430 }, { "epoch": 2.470882620564149, "grad_norm": 0.8155244886920129, "learning_rate": 5.09289889560776e-07, "loss": 0.0209, "step": 5431 }, { "epoch": 2.4713375796178343, "grad_norm": 0.6979154184622003, "learning_rate": 5.091469842534483e-07, "loss": 0.0101, "step": 5432 }, { "epoch": 2.4717925386715196, "grad_norm": 33.89384098162808, "learning_rate": 5.090040781986705e-07, "loss": 0.1164, "step": 5433 }, { "epoch": 2.472247497725205, "grad_norm": 0.8016649266752705, "learning_rate": 5.088611714081203e-07, "loss": 0.0169, "step": 5434 }, { "epoch": 2.47270245677889, "grad_norm": 0.9449484326166766, "learning_rate": 5.087182638934747e-07, "loss": 0.0265, "step": 5435 }, { "epoch": 2.473157415832575, "grad_norm": 0.7686868517185393, "learning_rate": 5.085753556664123e-07, "loss": 0.024, "step": 5436 }, { "epoch": 2.4736123748862604, "grad_norm": 0.6283672102837068, "learning_rate": 5.084324467386105e-07, "loss": 0.0107, "step": 5437 }, { "epoch": 2.4740673339399453, "grad_norm": 1.4426038936040466, "learning_rate": 5.082895371217472e-07, "loss": 0.0108, "step": 5438 }, { "epoch": 2.4745222929936306, "grad_norm": 0.7887820345258075, "learning_rate": 5.081466268275005e-07, "loss": 0.0127, "step": 5439 }, { "epoch": 2.474977252047316, "grad_norm": 0.6606685193788583, "learning_rate": 5.08003715867548e-07, "loss": 0.0223, "step": 5440 }, { "epoch": 2.4754322111010008, "grad_norm": 0.8610311098764438, "learning_rate": 5.078608042535682e-07, "loss": 0.01, "step": 5441 }, { "epoch": 2.475887170154686, "grad_norm": 1.0241632083202843, "learning_rate": 5.077178919972387e-07, "loss": 0.0214, "step": 5442 }, { "epoch": 2.4763421292083714, "grad_norm": 0.9262966343332544, "learning_rate": 5.075749791102382e-07, "loss": 0.014, "step": 5443 }, { "epoch": 2.4767970882620562, "grad_norm": 0.9453779573570069, "learning_rate": 5.074320656042446e-07, "loss": 0.0315, "step": 5444 }, { "epoch": 2.4772520473157416, "grad_norm": 0.9390057474336235, "learning_rate": 5.07289151490936e-07, "loss": 0.0203, "step": 5445 }, { "epoch": 2.477707006369427, "grad_norm": 0.8286647547936518, "learning_rate": 5.071462367819909e-07, "loss": 0.012, "step": 5446 }, { "epoch": 2.4781619654231117, "grad_norm": 0.6827138659720707, "learning_rate": 5.070033214890876e-07, "loss": 0.0101, "step": 5447 }, { "epoch": 2.478616924476797, "grad_norm": 0.7347947871256166, "learning_rate": 5.068604056239046e-07, "loss": 0.0193, "step": 5448 }, { "epoch": 2.4790718835304824, "grad_norm": 0.9253276727100334, "learning_rate": 5.0671748919812e-07, "loss": 0.0178, "step": 5449 }, { "epoch": 2.4795268425841677, "grad_norm": 1.0530542173679671, "learning_rate": 5.065745722234127e-07, "loss": 0.013, "step": 5450 }, { "epoch": 2.4799818016378525, "grad_norm": 0.8446995538026595, "learning_rate": 5.064316547114611e-07, "loss": 0.0084, "step": 5451 }, { "epoch": 2.480436760691538, "grad_norm": 0.6181801049546072, "learning_rate": 5.062887366739436e-07, "loss": 0.0137, "step": 5452 }, { "epoch": 2.480891719745223, "grad_norm": 0.702636023674383, "learning_rate": 5.06145818122539e-07, "loss": 0.0139, "step": 5453 }, { "epoch": 2.481346678798908, "grad_norm": 0.6750484535609671, "learning_rate": 5.060028990689258e-07, "loss": 0.0243, "step": 5454 }, { "epoch": 2.4818016378525933, "grad_norm": 0.6946472269625676, "learning_rate": 5.05859979524783e-07, "loss": 0.0194, "step": 5455 }, { "epoch": 2.4822565969062786, "grad_norm": 0.725320322160415, "learning_rate": 5.05717059501789e-07, "loss": 0.0203, "step": 5456 }, { "epoch": 2.4827115559599635, "grad_norm": 0.9468945655386326, "learning_rate": 5.055741390116226e-07, "loss": 0.0206, "step": 5457 }, { "epoch": 2.483166515013649, "grad_norm": 0.936186476285768, "learning_rate": 5.05431218065963e-07, "loss": 0.0237, "step": 5458 }, { "epoch": 2.483621474067334, "grad_norm": 0.8609362550690427, "learning_rate": 5.052882966764886e-07, "loss": 0.0205, "step": 5459 }, { "epoch": 2.484076433121019, "grad_norm": 1.0588417822813634, "learning_rate": 5.051453748548785e-07, "loss": 0.019, "step": 5460 }, { "epoch": 2.4845313921747043, "grad_norm": 0.5967330863717595, "learning_rate": 5.050024526128118e-07, "loss": 0.0201, "step": 5461 }, { "epoch": 2.4849863512283896, "grad_norm": 0.8581766555497515, "learning_rate": 5.04859529961967e-07, "loss": 0.016, "step": 5462 }, { "epoch": 2.4854413102820745, "grad_norm": 1.1402198728846453, "learning_rate": 5.047166069140234e-07, "loss": 0.0223, "step": 5463 }, { "epoch": 2.48589626933576, "grad_norm": 0.6770663673917243, "learning_rate": 5.0457368348066e-07, "loss": 0.0175, "step": 5464 }, { "epoch": 2.486351228389445, "grad_norm": 0.739420027199911, "learning_rate": 5.04430759673556e-07, "loss": 0.0122, "step": 5465 }, { "epoch": 2.48680618744313, "grad_norm": 0.7803011889833491, "learning_rate": 5.042878355043901e-07, "loss": 0.0174, "step": 5466 }, { "epoch": 2.4872611464968153, "grad_norm": 1.0141424550915445, "learning_rate": 5.041449109848416e-07, "loss": 0.015, "step": 5467 }, { "epoch": 2.4877161055505006, "grad_norm": 0.7108759841641992, "learning_rate": 5.040019861265901e-07, "loss": 0.0231, "step": 5468 }, { "epoch": 2.4881710646041855, "grad_norm": 1.851651223488676, "learning_rate": 5.038590609413141e-07, "loss": 0.0113, "step": 5469 }, { "epoch": 2.488626023657871, "grad_norm": 0.7207469999039241, "learning_rate": 5.03716135440693e-07, "loss": 0.0167, "step": 5470 }, { "epoch": 2.489080982711556, "grad_norm": 1.243000931556887, "learning_rate": 5.035732096364061e-07, "loss": 0.029, "step": 5471 }, { "epoch": 2.489535941765241, "grad_norm": 0.7389263243136356, "learning_rate": 5.034302835401328e-07, "loss": 0.0125, "step": 5472 }, { "epoch": 2.4899909008189263, "grad_norm": 1.208380646469847, "learning_rate": 5.032873571635521e-07, "loss": 0.0315, "step": 5473 }, { "epoch": 2.4904458598726116, "grad_norm": 1.2974348276679775, "learning_rate": 5.031444305183434e-07, "loss": 0.0217, "step": 5474 }, { "epoch": 2.4909008189262964, "grad_norm": 31.158928854709693, "learning_rate": 5.030015036161862e-07, "loss": 0.144, "step": 5475 }, { "epoch": 2.4913557779799818, "grad_norm": 0.5538760562935569, "learning_rate": 5.028585764687595e-07, "loss": 0.0093, "step": 5476 }, { "epoch": 2.491810737033667, "grad_norm": 0.7011080578616601, "learning_rate": 5.027156490877429e-07, "loss": 0.0214, "step": 5477 }, { "epoch": 2.492265696087352, "grad_norm": 1.7587737386472906, "learning_rate": 5.025727214848156e-07, "loss": 0.0162, "step": 5478 }, { "epoch": 2.4927206551410372, "grad_norm": 0.8007998208833864, "learning_rate": 5.024297936716573e-07, "loss": 0.0228, "step": 5479 }, { "epoch": 2.4931756141947226, "grad_norm": 0.6064642419335707, "learning_rate": 5.022868656599473e-07, "loss": 0.0129, "step": 5480 }, { "epoch": 2.4936305732484074, "grad_norm": 0.8171154196488857, "learning_rate": 5.021439374613647e-07, "loss": 0.012, "step": 5481 }, { "epoch": 2.4940855323020927, "grad_norm": 1.182207620297786, "learning_rate": 5.020010090875894e-07, "loss": 0.0287, "step": 5482 }, { "epoch": 2.494540491355778, "grad_norm": 0.7502322034655249, "learning_rate": 5.018580805503007e-07, "loss": 0.0185, "step": 5483 }, { "epoch": 2.494995450409463, "grad_norm": 0.9020287406275797, "learning_rate": 5.01715151861178e-07, "loss": 0.0261, "step": 5484 }, { "epoch": 2.4954504094631482, "grad_norm": 0.6277388120441005, "learning_rate": 5.015722230319009e-07, "loss": 0.0152, "step": 5485 }, { "epoch": 2.4959053685168335, "grad_norm": 0.6344413781093295, "learning_rate": 5.014292940741487e-07, "loss": 0.0107, "step": 5486 }, { "epoch": 2.496360327570519, "grad_norm": 0.9671643730543046, "learning_rate": 5.012863649996012e-07, "loss": 0.0183, "step": 5487 }, { "epoch": 2.4968152866242037, "grad_norm": 0.8625141279850337, "learning_rate": 5.011434358199375e-07, "loss": 0.0268, "step": 5488 }, { "epoch": 2.497270245677889, "grad_norm": 0.8738389728302492, "learning_rate": 5.010005065468376e-07, "loss": 0.0247, "step": 5489 }, { "epoch": 2.4977252047315743, "grad_norm": 0.7455992553490784, "learning_rate": 5.008575771919808e-07, "loss": 0.0161, "step": 5490 }, { "epoch": 2.498180163785259, "grad_norm": 1.1300152905748342, "learning_rate": 5.007146477670466e-07, "loss": 0.0322, "step": 5491 }, { "epoch": 2.4986351228389445, "grad_norm": 1.0730163385838425, "learning_rate": 5.005717182837147e-07, "loss": 0.0114, "step": 5492 }, { "epoch": 2.49909008189263, "grad_norm": 1.0392069887185615, "learning_rate": 5.004287887536644e-07, "loss": 0.0198, "step": 5493 }, { "epoch": 2.4995450409463147, "grad_norm": 0.6192852075296452, "learning_rate": 5.002858591885755e-07, "loss": 0.0079, "step": 5494 }, { "epoch": 2.5, "grad_norm": 0.9351613273607984, "learning_rate": 5.001429296001275e-07, "loss": 0.0235, "step": 5495 }, { "epoch": 2.5004549590536853, "grad_norm": 0.7113632138387808, "learning_rate": 5e-07, "loss": 0.0095, "step": 5496 }, { "epoch": 2.50090991810737, "grad_norm": 0.4386391696838351, "learning_rate": 4.998570703998725e-07, "loss": 0.0117, "step": 5497 }, { "epoch": 2.5013648771610555, "grad_norm": 0.733962365347063, "learning_rate": 4.997141408114245e-07, "loss": 0.0112, "step": 5498 }, { "epoch": 2.501819836214741, "grad_norm": 0.6827425984968476, "learning_rate": 4.995712112463357e-07, "loss": 0.0116, "step": 5499 }, { "epoch": 2.502274795268426, "grad_norm": 1.2867973110213642, "learning_rate": 4.994282817162854e-07, "loss": 0.0209, "step": 5500 }, { "epoch": 2.502729754322111, "grad_norm": 1.018938007961314, "learning_rate": 4.992853522329534e-07, "loss": 0.0281, "step": 5501 }, { "epoch": 2.5031847133757963, "grad_norm": 0.802752449001824, "learning_rate": 4.991424228080193e-07, "loss": 0.0145, "step": 5502 }, { "epoch": 2.5036396724294816, "grad_norm": 0.9371578802034312, "learning_rate": 4.989994934531625e-07, "loss": 0.0114, "step": 5503 }, { "epoch": 2.5040946314831665, "grad_norm": 0.6453240149640247, "learning_rate": 4.988565641800626e-07, "loss": 0.0129, "step": 5504 }, { "epoch": 2.5045495905368518, "grad_norm": 0.690123540445508, "learning_rate": 4.98713635000399e-07, "loss": 0.0147, "step": 5505 }, { "epoch": 2.505004549590537, "grad_norm": 0.8661651651663205, "learning_rate": 4.985707059258514e-07, "loss": 0.0225, "step": 5506 }, { "epoch": 2.505459508644222, "grad_norm": 0.714353783320937, "learning_rate": 4.984277769680992e-07, "loss": 0.0139, "step": 5507 }, { "epoch": 2.5059144676979073, "grad_norm": 0.7261145106696598, "learning_rate": 4.98284848138822e-07, "loss": 0.0107, "step": 5508 }, { "epoch": 2.5063694267515926, "grad_norm": 0.8714048374725956, "learning_rate": 4.981419194496993e-07, "loss": 0.0105, "step": 5509 }, { "epoch": 2.5068243858052774, "grad_norm": 0.6528613780392076, "learning_rate": 4.979989909124105e-07, "loss": 0.0144, "step": 5510 }, { "epoch": 2.5072793448589628, "grad_norm": 0.984783824951924, "learning_rate": 4.978560625386352e-07, "loss": 0.026, "step": 5511 }, { "epoch": 2.507734303912648, "grad_norm": 1.107887142907565, "learning_rate": 4.977131343400527e-07, "loss": 0.0141, "step": 5512 }, { "epoch": 2.508189262966333, "grad_norm": 0.34997224096823665, "learning_rate": 4.975702063283427e-07, "loss": 0.0033, "step": 5513 }, { "epoch": 2.5086442220200182, "grad_norm": 0.676123942297627, "learning_rate": 4.974272785151842e-07, "loss": 0.0116, "step": 5514 }, { "epoch": 2.5090991810737036, "grad_norm": 0.7201439878689567, "learning_rate": 4.97284350912257e-07, "loss": 0.019, "step": 5515 }, { "epoch": 2.5095541401273884, "grad_norm": 0.7951071999510243, "learning_rate": 4.971414235312405e-07, "loss": 0.0094, "step": 5516 }, { "epoch": 2.5100090991810737, "grad_norm": 0.7178654469725763, "learning_rate": 4.969984963838138e-07, "loss": 0.0129, "step": 5517 }, { "epoch": 2.510464058234759, "grad_norm": 0.7095440700632617, "learning_rate": 4.968555694816566e-07, "loss": 0.0196, "step": 5518 }, { "epoch": 2.510919017288444, "grad_norm": 0.6874625379585483, "learning_rate": 4.96712642836448e-07, "loss": 0.016, "step": 5519 }, { "epoch": 2.511373976342129, "grad_norm": 0.6615126757444245, "learning_rate": 4.965697164598673e-07, "loss": 0.0159, "step": 5520 }, { "epoch": 2.5118289353958145, "grad_norm": 0.7339242543714902, "learning_rate": 4.964267903635938e-07, "loss": 0.0262, "step": 5521 }, { "epoch": 2.5122838944494994, "grad_norm": 0.7095161942298147, "learning_rate": 4.962838645593069e-07, "loss": 0.0109, "step": 5522 }, { "epoch": 2.5127388535031847, "grad_norm": 0.7895347305627046, "learning_rate": 4.96140939058686e-07, "loss": 0.0114, "step": 5523 }, { "epoch": 2.51319381255687, "grad_norm": 0.5611305001760071, "learning_rate": 4.9599801387341e-07, "loss": 0.0069, "step": 5524 }, { "epoch": 2.513648771610555, "grad_norm": 0.7917169705205278, "learning_rate": 4.958550890151583e-07, "loss": 0.0133, "step": 5525 }, { "epoch": 2.51410373066424, "grad_norm": 0.8317567607338723, "learning_rate": 4.957121644956099e-07, "loss": 0.0203, "step": 5526 }, { "epoch": 2.5145586897179255, "grad_norm": 1.5428388077271804, "learning_rate": 4.955692403264443e-07, "loss": 0.0314, "step": 5527 }, { "epoch": 2.5150136487716104, "grad_norm": 0.8397386492411383, "learning_rate": 4.9542631651934e-07, "loss": 0.0096, "step": 5528 }, { "epoch": 2.5154686078252957, "grad_norm": 1.06442033068855, "learning_rate": 4.952833930859765e-07, "loss": 0.0208, "step": 5529 }, { "epoch": 2.515923566878981, "grad_norm": 0.9571123066904406, "learning_rate": 4.95140470038033e-07, "loss": 0.016, "step": 5530 }, { "epoch": 2.516378525932666, "grad_norm": 0.660697935556028, "learning_rate": 4.949975473871883e-07, "loss": 0.0176, "step": 5531 }, { "epoch": 2.516833484986351, "grad_norm": 1.0746782840676812, "learning_rate": 4.948546251451215e-07, "loss": 0.0173, "step": 5532 }, { "epoch": 2.5172884440400365, "grad_norm": 0.8343385650902172, "learning_rate": 4.947117033235115e-07, "loss": 0.0218, "step": 5533 }, { "epoch": 2.5177434030937214, "grad_norm": 0.613232085276289, "learning_rate": 4.945687819340371e-07, "loss": 0.0118, "step": 5534 }, { "epoch": 2.5181983621474067, "grad_norm": 0.8455036660119137, "learning_rate": 4.944258609883772e-07, "loss": 0.0138, "step": 5535 }, { "epoch": 2.518653321201092, "grad_norm": 0.8248552329114761, "learning_rate": 4.94282940498211e-07, "loss": 0.0163, "step": 5536 }, { "epoch": 2.519108280254777, "grad_norm": 1.0432020892225, "learning_rate": 4.94140020475217e-07, "loss": 0.0166, "step": 5537 }, { "epoch": 2.519563239308462, "grad_norm": 0.6645542479273776, "learning_rate": 4.939971009310742e-07, "loss": 0.012, "step": 5538 }, { "epoch": 2.5200181983621475, "grad_norm": 0.9162676872995642, "learning_rate": 4.93854181877461e-07, "loss": 0.0194, "step": 5539 }, { "epoch": 2.5204731574158323, "grad_norm": 1.028661192211043, "learning_rate": 4.937112633260565e-07, "loss": 0.0219, "step": 5540 }, { "epoch": 2.5209281164695176, "grad_norm": 0.6536771823652633, "learning_rate": 4.93568345288539e-07, "loss": 0.0105, "step": 5541 }, { "epoch": 2.521383075523203, "grad_norm": 0.9323059884196169, "learning_rate": 4.934254277765871e-07, "loss": 0.0208, "step": 5542 }, { "epoch": 2.521838034576888, "grad_norm": 0.8205173533066885, "learning_rate": 4.932825108018799e-07, "loss": 0.0145, "step": 5543 }, { "epoch": 2.522292993630573, "grad_norm": 0.7778010942702988, "learning_rate": 4.931395943760955e-07, "loss": 0.0128, "step": 5544 }, { "epoch": 2.5227479526842584, "grad_norm": 46.23026230033487, "learning_rate": 4.929966785109125e-07, "loss": 0.4324, "step": 5545 }, { "epoch": 2.5232029117379433, "grad_norm": 0.9377829322317542, "learning_rate": 4.928537632180092e-07, "loss": 0.0267, "step": 5546 }, { "epoch": 2.5236578707916286, "grad_norm": 1.6630000027723801, "learning_rate": 4.927108485090642e-07, "loss": 0.0163, "step": 5547 }, { "epoch": 2.524112829845314, "grad_norm": 0.8948664880883349, "learning_rate": 4.925679343957556e-07, "loss": 0.027, "step": 5548 }, { "epoch": 2.5245677888989992, "grad_norm": 0.8084061316897617, "learning_rate": 4.924250208897618e-07, "loss": 0.0194, "step": 5549 }, { "epoch": 2.525022747952684, "grad_norm": 0.4438845733035999, "learning_rate": 4.922821080027613e-07, "loss": 0.0126, "step": 5550 }, { "epoch": 2.5254777070063694, "grad_norm": 0.7336446866804854, "learning_rate": 4.921391957464319e-07, "loss": 0.0204, "step": 5551 }, { "epoch": 2.5259326660600547, "grad_norm": 0.9872617580313214, "learning_rate": 4.91996284132452e-07, "loss": 0.0336, "step": 5552 }, { "epoch": 2.5263876251137396, "grad_norm": 0.706333519166307, "learning_rate": 4.918533731724996e-07, "loss": 0.0216, "step": 5553 }, { "epoch": 2.526842584167425, "grad_norm": 0.8246981102821593, "learning_rate": 4.917104628782528e-07, "loss": 0.0179, "step": 5554 }, { "epoch": 2.52729754322111, "grad_norm": 0.8060044190323076, "learning_rate": 4.915675532613895e-07, "loss": 0.01, "step": 5555 }, { "epoch": 2.5277525022747955, "grad_norm": 0.4845592034023306, "learning_rate": 4.914246443335875e-07, "loss": 0.0068, "step": 5556 }, { "epoch": 2.5282074613284804, "grad_norm": 0.8311471397631951, "learning_rate": 4.912817361065252e-07, "loss": 0.0214, "step": 5557 }, { "epoch": 2.5286624203821657, "grad_norm": 0.9414146749972638, "learning_rate": 4.911388285918798e-07, "loss": 0.022, "step": 5558 }, { "epoch": 2.529117379435851, "grad_norm": 0.7961065553876732, "learning_rate": 4.909959218013294e-07, "loss": 0.0269, "step": 5559 }, { "epoch": 2.529572338489536, "grad_norm": 0.7469815637941537, "learning_rate": 4.908530157465516e-07, "loss": 0.0164, "step": 5560 }, { "epoch": 2.530027297543221, "grad_norm": 0.7756673800328543, "learning_rate": 4.907101104392241e-07, "loss": 0.0184, "step": 5561 }, { "epoch": 2.5304822565969065, "grad_norm": 1.368783741341754, "learning_rate": 4.905672058910243e-07, "loss": 0.031, "step": 5562 }, { "epoch": 2.5309372156505914, "grad_norm": 0.724915539526077, "learning_rate": 4.904243021136297e-07, "loss": 0.0137, "step": 5563 }, { "epoch": 2.5313921747042767, "grad_norm": 1.0419498796247029, "learning_rate": 4.902813991187182e-07, "loss": 0.0165, "step": 5564 }, { "epoch": 2.531847133757962, "grad_norm": 0.6548414153243403, "learning_rate": 4.901384969179667e-07, "loss": 0.0123, "step": 5565 }, { "epoch": 2.532302092811647, "grad_norm": 0.8922398245097592, "learning_rate": 4.899955955230529e-07, "loss": 0.0362, "step": 5566 }, { "epoch": 2.532757051865332, "grad_norm": 0.8534608427530975, "learning_rate": 4.898526949456536e-07, "loss": 0.0224, "step": 5567 }, { "epoch": 2.5332120109190175, "grad_norm": 0.6109015006171259, "learning_rate": 4.897097951974465e-07, "loss": 0.0106, "step": 5568 }, { "epoch": 2.5336669699727024, "grad_norm": 0.7815545316379318, "learning_rate": 4.895668962901083e-07, "loss": 0.0122, "step": 5569 }, { "epoch": 2.5341219290263877, "grad_norm": 0.9567717670207346, "learning_rate": 4.894239982353162e-07, "loss": 0.027, "step": 5570 }, { "epoch": 2.534576888080073, "grad_norm": 0.8129862082524694, "learning_rate": 4.892811010447471e-07, "loss": 0.0151, "step": 5571 }, { "epoch": 2.535031847133758, "grad_norm": 0.6279837203695529, "learning_rate": 4.891382047300782e-07, "loss": 0.0107, "step": 5572 }, { "epoch": 2.535486806187443, "grad_norm": 0.6554960658762113, "learning_rate": 4.889953093029861e-07, "loss": 0.0105, "step": 5573 }, { "epoch": 2.5359417652411285, "grad_norm": 0.9255527088802104, "learning_rate": 4.888524147751478e-07, "loss": 0.0236, "step": 5574 }, { "epoch": 2.5363967242948133, "grad_norm": 0.6039349277488775, "learning_rate": 4.887095211582396e-07, "loss": 0.0063, "step": 5575 }, { "epoch": 2.5368516833484986, "grad_norm": 0.7532291323219898, "learning_rate": 4.885666284639385e-07, "loss": 0.0266, "step": 5576 }, { "epoch": 2.537306642402184, "grad_norm": 0.8337023263633747, "learning_rate": 4.884237367039207e-07, "loss": 0.0112, "step": 5577 }, { "epoch": 2.537761601455869, "grad_norm": 0.9079218418521545, "learning_rate": 4.882808458898629e-07, "loss": 0.0166, "step": 5578 }, { "epoch": 2.538216560509554, "grad_norm": 0.673940467062053, "learning_rate": 4.881379560334416e-07, "loss": 0.0102, "step": 5579 }, { "epoch": 2.5386715195632394, "grad_norm": 0.9569887229117852, "learning_rate": 4.879950671463328e-07, "loss": 0.018, "step": 5580 }, { "epoch": 2.5391264786169243, "grad_norm": 1.240302213603426, "learning_rate": 4.87852179240213e-07, "loss": 0.0294, "step": 5581 }, { "epoch": 2.5395814376706096, "grad_norm": 0.7374643120576625, "learning_rate": 4.877092923267581e-07, "loss": 0.0225, "step": 5582 }, { "epoch": 2.540036396724295, "grad_norm": 0.894837557347414, "learning_rate": 4.875664064176446e-07, "loss": 0.0226, "step": 5583 }, { "epoch": 2.54049135577798, "grad_norm": 0.9099214569503777, "learning_rate": 4.874235215245479e-07, "loss": 0.0146, "step": 5584 }, { "epoch": 2.540946314831665, "grad_norm": 0.9454403612199443, "learning_rate": 4.872806376591444e-07, "loss": 0.0248, "step": 5585 }, { "epoch": 2.5414012738853504, "grad_norm": 0.9313773202896909, "learning_rate": 4.871377548331099e-07, "loss": 0.0153, "step": 5586 }, { "epoch": 2.5418562329390353, "grad_norm": 0.5995845051435521, "learning_rate": 4.869948730581197e-07, "loss": 0.0153, "step": 5587 }, { "epoch": 2.5423111919927206, "grad_norm": 0.6050590916233983, "learning_rate": 4.8685199234585e-07, "loss": 0.0218, "step": 5588 }, { "epoch": 2.542766151046406, "grad_norm": 0.7247147645533233, "learning_rate": 4.867091127079759e-07, "loss": 0.0204, "step": 5589 }, { "epoch": 2.5432211101000908, "grad_norm": 0.7189058100186897, "learning_rate": 4.865662341561732e-07, "loss": 0.0103, "step": 5590 }, { "epoch": 2.543676069153776, "grad_norm": 0.8184579015353936, "learning_rate": 4.864233567021171e-07, "loss": 0.0134, "step": 5591 }, { "epoch": 2.5441310282074614, "grad_norm": 0.7024154722110879, "learning_rate": 4.86280480357483e-07, "loss": 0.0188, "step": 5592 }, { "epoch": 2.5445859872611463, "grad_norm": 0.805003389435192, "learning_rate": 4.861376051339461e-07, "loss": 0.0148, "step": 5593 }, { "epoch": 2.5450409463148316, "grad_norm": 0.8183590052816502, "learning_rate": 4.859947310431816e-07, "loss": 0.011, "step": 5594 }, { "epoch": 2.545495905368517, "grad_norm": 0.7671492294370266, "learning_rate": 4.858518580968644e-07, "loss": 0.0266, "step": 5595 }, { "epoch": 2.5459508644222018, "grad_norm": 0.8770260560210277, "learning_rate": 4.857089863066694e-07, "loss": 0.0154, "step": 5596 }, { "epoch": 2.546405823475887, "grad_norm": 0.8151298409163193, "learning_rate": 4.855661156842716e-07, "loss": 0.0188, "step": 5597 }, { "epoch": 2.5468607825295724, "grad_norm": 1.2897951840921287, "learning_rate": 4.854232462413454e-07, "loss": 0.0427, "step": 5598 }, { "epoch": 2.5473157415832572, "grad_norm": 0.5267047424873867, "learning_rate": 4.852803779895657e-07, "loss": 0.0076, "step": 5599 }, { "epoch": 2.5477707006369426, "grad_norm": 0.7999003905406524, "learning_rate": 4.851375109406074e-07, "loss": 0.0262, "step": 5600 }, { "epoch": 2.548225659690628, "grad_norm": 0.5731508686305054, "learning_rate": 4.849946451061443e-07, "loss": 0.0069, "step": 5601 }, { "epoch": 2.548680618744313, "grad_norm": 0.6529411311471098, "learning_rate": 4.848517804978512e-07, "loss": 0.0197, "step": 5602 }, { "epoch": 2.549135577797998, "grad_norm": 0.8673046249852544, "learning_rate": 4.847089171274022e-07, "loss": 0.0204, "step": 5603 }, { "epoch": 2.5495905368516834, "grad_norm": 0.8792769528814084, "learning_rate": 4.845660550064713e-07, "loss": 0.0273, "step": 5604 }, { "epoch": 2.5500454959053687, "grad_norm": 0.6572228414187363, "learning_rate": 4.844231941467326e-07, "loss": 0.0177, "step": 5605 }, { "epoch": 2.5505004549590535, "grad_norm": 0.5724070322988349, "learning_rate": 4.842803345598604e-07, "loss": 0.0104, "step": 5606 }, { "epoch": 2.550955414012739, "grad_norm": 0.6196581931328422, "learning_rate": 4.84137476257528e-07, "loss": 0.0125, "step": 5607 }, { "epoch": 2.551410373066424, "grad_norm": 0.8352955693932369, "learning_rate": 4.839946192514097e-07, "loss": 0.0091, "step": 5608 }, { "epoch": 2.5518653321201095, "grad_norm": 1.1937534456296264, "learning_rate": 4.838517635531786e-07, "loss": 0.0182, "step": 5609 }, { "epoch": 2.5523202911737943, "grad_norm": 0.8484087201222879, "learning_rate": 4.837089091745085e-07, "loss": 0.0167, "step": 5610 }, { "epoch": 2.5527752502274796, "grad_norm": 0.6522968085878885, "learning_rate": 4.835660561270729e-07, "loss": 0.0125, "step": 5611 }, { "epoch": 2.553230209281165, "grad_norm": 0.9908591231368201, "learning_rate": 4.834232044225447e-07, "loss": 0.0186, "step": 5612 }, { "epoch": 2.55368516833485, "grad_norm": 0.6849216145801014, "learning_rate": 4.832803540725976e-07, "loss": 0.0192, "step": 5613 }, { "epoch": 2.554140127388535, "grad_norm": 0.6700504595025387, "learning_rate": 4.831375050889043e-07, "loss": 0.0206, "step": 5614 }, { "epoch": 2.5545950864422204, "grad_norm": 0.8752215466618717, "learning_rate": 4.829946574831382e-07, "loss": 0.0208, "step": 5615 }, { "epoch": 2.5550500454959053, "grad_norm": 1.0097968831540276, "learning_rate": 4.828518112669717e-07, "loss": 0.0242, "step": 5616 }, { "epoch": 2.5555050045495906, "grad_norm": 0.8622509581735388, "learning_rate": 4.827089664520779e-07, "loss": 0.0168, "step": 5617 }, { "epoch": 2.555959963603276, "grad_norm": 1.0224724189816206, "learning_rate": 4.825661230501291e-07, "loss": 0.0218, "step": 5618 }, { "epoch": 2.556414922656961, "grad_norm": 0.6133954122124438, "learning_rate": 4.824232810727979e-07, "loss": 0.0064, "step": 5619 }, { "epoch": 2.556869881710646, "grad_norm": 0.6930541634502038, "learning_rate": 4.82280440531757e-07, "loss": 0.0178, "step": 5620 }, { "epoch": 2.5573248407643314, "grad_norm": 1.1731639552382351, "learning_rate": 4.821376014386784e-07, "loss": 0.0187, "step": 5621 }, { "epoch": 2.5577797998180163, "grad_norm": 0.9020244481161063, "learning_rate": 4.819947638052345e-07, "loss": 0.0164, "step": 5622 }, { "epoch": 2.5582347588717016, "grad_norm": 0.9184322664266467, "learning_rate": 4.81851927643097e-07, "loss": 0.0223, "step": 5623 }, { "epoch": 2.558689717925387, "grad_norm": 0.6906434119609381, "learning_rate": 4.817090929639382e-07, "loss": 0.0168, "step": 5624 }, { "epoch": 2.5591446769790718, "grad_norm": 0.8470560781461682, "learning_rate": 4.815662597794295e-07, "loss": 0.0152, "step": 5625 }, { "epoch": 2.559599636032757, "grad_norm": 0.7147669520549016, "learning_rate": 4.814234281012428e-07, "loss": 0.0151, "step": 5626 }, { "epoch": 2.5600545950864424, "grad_norm": 0.6573871929990084, "learning_rate": 4.812805979410499e-07, "loss": 0.0207, "step": 5627 }, { "epoch": 2.5605095541401273, "grad_norm": 0.5894759178046042, "learning_rate": 4.811377693105218e-07, "loss": 0.0135, "step": 5628 }, { "epoch": 2.5609645131938126, "grad_norm": 0.8794005839546253, "learning_rate": 4.809949422213302e-07, "loss": 0.0254, "step": 5629 }, { "epoch": 2.561419472247498, "grad_norm": 1.0223146955925266, "learning_rate": 4.80852116685146e-07, "loss": 0.023, "step": 5630 }, { "epoch": 2.5618744313011828, "grad_norm": 0.8301418627718078, "learning_rate": 4.807092927136403e-07, "loss": 0.0159, "step": 5631 }, { "epoch": 2.562329390354868, "grad_norm": 0.8214718476501645, "learning_rate": 4.805664703184841e-07, "loss": 0.0153, "step": 5632 }, { "epoch": 2.5627843494085534, "grad_norm": 0.8147808910628849, "learning_rate": 4.804236495113481e-07, "loss": 0.0173, "step": 5633 }, { "epoch": 2.5632393084622382, "grad_norm": 0.8346074341504741, "learning_rate": 4.802808303039032e-07, "loss": 0.0279, "step": 5634 }, { "epoch": 2.5636942675159236, "grad_norm": 0.5643908770337699, "learning_rate": 4.801380127078197e-07, "loss": 0.0063, "step": 5635 }, { "epoch": 2.564149226569609, "grad_norm": 1.0265095235017305, "learning_rate": 4.799951967347683e-07, "loss": 0.0135, "step": 5636 }, { "epoch": 2.5646041856232937, "grad_norm": 0.8955910586227975, "learning_rate": 4.798523823964188e-07, "loss": 0.0184, "step": 5637 }, { "epoch": 2.565059144676979, "grad_norm": 0.7198915202376398, "learning_rate": 4.79709569704442e-07, "loss": 0.0164, "step": 5638 }, { "epoch": 2.5655141037306644, "grad_norm": 0.6815004091990604, "learning_rate": 4.795667586705073e-07, "loss": 0.0093, "step": 5639 }, { "epoch": 2.565969062784349, "grad_norm": 0.6960071908680533, "learning_rate": 4.794239493062845e-07, "loss": 0.016, "step": 5640 }, { "epoch": 2.5664240218380345, "grad_norm": 1.0852203046219964, "learning_rate": 4.792811416234441e-07, "loss": 0.0201, "step": 5641 }, { "epoch": 2.56687898089172, "grad_norm": 0.6975050848687177, "learning_rate": 4.791383356336551e-07, "loss": 0.0238, "step": 5642 }, { "epoch": 2.5673339399454047, "grad_norm": 1.0266668055938868, "learning_rate": 4.789955313485871e-07, "loss": 0.0208, "step": 5643 }, { "epoch": 2.56778889899909, "grad_norm": 0.7494057861770829, "learning_rate": 4.788527287799094e-07, "loss": 0.0063, "step": 5644 }, { "epoch": 2.5682438580527753, "grad_norm": 0.8990310714586617, "learning_rate": 4.787099279392912e-07, "loss": 0.0198, "step": 5645 }, { "epoch": 2.56869881710646, "grad_norm": 0.5760195407276884, "learning_rate": 4.785671288384016e-07, "loss": 0.0076, "step": 5646 }, { "epoch": 2.5691537761601455, "grad_norm": 0.9318528777665828, "learning_rate": 4.784243314889093e-07, "loss": 0.029, "step": 5647 }, { "epoch": 2.569608735213831, "grad_norm": 0.72320303844037, "learning_rate": 4.782815359024833e-07, "loss": 0.0159, "step": 5648 }, { "epoch": 2.5700636942675157, "grad_norm": 1.0634384621287316, "learning_rate": 4.781387420907921e-07, "loss": 0.0271, "step": 5649 }, { "epoch": 2.570518653321201, "grad_norm": 1.4500402458160397, "learning_rate": 4.779959500655042e-07, "loss": 0.0259, "step": 5650 }, { "epoch": 2.5709736123748863, "grad_norm": 0.7345981457146888, "learning_rate": 4.778531598382879e-07, "loss": 0.0238, "step": 5651 }, { "epoch": 2.571428571428571, "grad_norm": 0.7376264672343151, "learning_rate": 4.777103714208114e-07, "loss": 0.0128, "step": 5652 }, { "epoch": 2.5718835304822565, "grad_norm": 1.004446231228695, "learning_rate": 4.775675848247427e-07, "loss": 0.0162, "step": 5653 }, { "epoch": 2.572338489535942, "grad_norm": 0.9282832826335129, "learning_rate": 4.774248000617494e-07, "loss": 0.0202, "step": 5654 }, { "epoch": 2.5727934485896267, "grad_norm": 0.8060562299315555, "learning_rate": 4.772820171434996e-07, "loss": 0.0266, "step": 5655 }, { "epoch": 2.573248407643312, "grad_norm": 0.9899545529687854, "learning_rate": 4.771392360816609e-07, "loss": 0.0129, "step": 5656 }, { "epoch": 2.5737033666969973, "grad_norm": 0.5586910972218517, "learning_rate": 4.769964568879005e-07, "loss": 0.0176, "step": 5657 }, { "epoch": 2.5741583257506826, "grad_norm": 1.0870959051217002, "learning_rate": 4.768536795738859e-07, "loss": 0.0148, "step": 5658 }, { "epoch": 2.5746132848043675, "grad_norm": 0.8742465271222445, "learning_rate": 4.7671090415128393e-07, "loss": 0.0241, "step": 5659 }, { "epoch": 2.5750682438580528, "grad_norm": 0.528477749021489, "learning_rate": 4.7656813063176183e-07, "loss": 0.0111, "step": 5660 }, { "epoch": 2.575523202911738, "grad_norm": 0.7009723701954562, "learning_rate": 4.7642535902698603e-07, "loss": 0.0161, "step": 5661 }, { "epoch": 2.575978161965423, "grad_norm": 0.9112136703700382, "learning_rate": 4.762825893486235e-07, "loss": 0.0189, "step": 5662 }, { "epoch": 2.5764331210191083, "grad_norm": 1.005215678463243, "learning_rate": 4.761398216083409e-07, "loss": 0.0332, "step": 5663 }, { "epoch": 2.5768880800727936, "grad_norm": 0.9633824549921779, "learning_rate": 4.759970558178041e-07, "loss": 0.028, "step": 5664 }, { "epoch": 2.577343039126479, "grad_norm": 0.8231410498476633, "learning_rate": 4.7585429198867964e-07, "loss": 0.0167, "step": 5665 }, { "epoch": 2.5777979981801638, "grad_norm": 0.6813165596493591, "learning_rate": 4.757115301326333e-07, "loss": 0.0194, "step": 5666 }, { "epoch": 2.578252957233849, "grad_norm": 1.110163003513502, "learning_rate": 4.755687702613311e-07, "loss": 0.0275, "step": 5667 }, { "epoch": 2.5787079162875344, "grad_norm": 0.7574879634519114, "learning_rate": 4.7542601238643856e-07, "loss": 0.0118, "step": 5668 }, { "epoch": 2.5791628753412192, "grad_norm": 0.6314946624044618, "learning_rate": 4.752832565196213e-07, "loss": 0.0169, "step": 5669 }, { "epoch": 2.5796178343949046, "grad_norm": 1.0085334866501316, "learning_rate": 4.751405026725449e-07, "loss": 0.025, "step": 5670 }, { "epoch": 2.58007279344859, "grad_norm": 0.5727923268318157, "learning_rate": 4.7499775085687414e-07, "loss": 0.0122, "step": 5671 }, { "epoch": 2.5805277525022747, "grad_norm": 0.8219640352674025, "learning_rate": 4.7485500108427446e-07, "loss": 0.0136, "step": 5672 }, { "epoch": 2.58098271155596, "grad_norm": 0.9272663157144779, "learning_rate": 4.747122533664104e-07, "loss": 0.0245, "step": 5673 }, { "epoch": 2.5814376706096454, "grad_norm": 0.8046318750489906, "learning_rate": 4.745695077149469e-07, "loss": 0.0073, "step": 5674 }, { "epoch": 2.58189262966333, "grad_norm": 0.72119791888723, "learning_rate": 4.744267641415482e-07, "loss": 0.0239, "step": 5675 }, { "epoch": 2.5823475887170155, "grad_norm": 0.6079713193615599, "learning_rate": 4.742840226578789e-07, "loss": 0.0155, "step": 5676 }, { "epoch": 2.582802547770701, "grad_norm": 0.6520650798445282, "learning_rate": 4.741412832756033e-07, "loss": 0.0124, "step": 5677 }, { "epoch": 2.5832575068243857, "grad_norm": 0.78607941786688, "learning_rate": 4.739985460063852e-07, "loss": 0.0184, "step": 5678 }, { "epoch": 2.583712465878071, "grad_norm": 0.8501743733689305, "learning_rate": 4.7385581086188845e-07, "loss": 0.0289, "step": 5679 }, { "epoch": 2.5841674249317563, "grad_norm": 0.7900418485791123, "learning_rate": 4.7371307785377683e-07, "loss": 0.0205, "step": 5680 }, { "epoch": 2.584622383985441, "grad_norm": 0.8531603042131535, "learning_rate": 4.7357034699371376e-07, "loss": 0.0133, "step": 5681 }, { "epoch": 2.5850773430391265, "grad_norm": 1.1773395685096482, "learning_rate": 4.7342761829336236e-07, "loss": 0.0243, "step": 5682 }, { "epoch": 2.585532302092812, "grad_norm": 0.855736254281119, "learning_rate": 4.732848917643862e-07, "loss": 0.0128, "step": 5683 }, { "epoch": 2.5859872611464967, "grad_norm": 5.872501167373827, "learning_rate": 4.7314216741844803e-07, "loss": 0.0577, "step": 5684 }, { "epoch": 2.586442220200182, "grad_norm": 0.9508548006014507, "learning_rate": 4.7299944526721074e-07, "loss": 0.0376, "step": 5685 }, { "epoch": 2.5868971792538673, "grad_norm": 0.6717706726439454, "learning_rate": 4.728567253223367e-07, "loss": 0.012, "step": 5686 }, { "epoch": 2.587352138307552, "grad_norm": 0.9155128443901427, "learning_rate": 4.7271400759548865e-07, "loss": 0.0231, "step": 5687 }, { "epoch": 2.5878070973612375, "grad_norm": 0.9234587000598684, "learning_rate": 4.725712920983285e-07, "loss": 0.0226, "step": 5688 }, { "epoch": 2.588262056414923, "grad_norm": 0.8609248487502706, "learning_rate": 4.724285788425184e-07, "loss": 0.0122, "step": 5689 }, { "epoch": 2.5887170154686077, "grad_norm": 0.6632028318792135, "learning_rate": 4.7228586783972056e-07, "loss": 0.018, "step": 5690 }, { "epoch": 2.589171974522293, "grad_norm": 0.9045223868804189, "learning_rate": 4.721431591015963e-07, "loss": 0.0116, "step": 5691 }, { "epoch": 2.5896269335759783, "grad_norm": 0.7003984963699937, "learning_rate": 4.7200045263980744e-07, "loss": 0.0197, "step": 5692 }, { "epoch": 2.590081892629663, "grad_norm": 1.2100359146292352, "learning_rate": 4.71857748466015e-07, "loss": 0.0435, "step": 5693 }, { "epoch": 2.5905368516833485, "grad_norm": 0.9284355625886451, "learning_rate": 4.7171504659188044e-07, "loss": 0.0252, "step": 5694 }, { "epoch": 2.5909918107370338, "grad_norm": 1.0984426559376046, "learning_rate": 4.715723470290643e-07, "loss": 0.0177, "step": 5695 }, { "epoch": 2.5914467697907186, "grad_norm": 0.30970474430900885, "learning_rate": 4.7142964978922764e-07, "loss": 0.0037, "step": 5696 }, { "epoch": 2.591901728844404, "grad_norm": 0.6934643202001963, "learning_rate": 4.712869548840311e-07, "loss": 0.0171, "step": 5697 }, { "epoch": 2.5923566878980893, "grad_norm": 1.0179243471893773, "learning_rate": 4.7114426232513484e-07, "loss": 0.0178, "step": 5698 }, { "epoch": 2.592811646951774, "grad_norm": 0.7812267501891604, "learning_rate": 4.7100157212419923e-07, "loss": 0.0169, "step": 5699 }, { "epoch": 2.5932666060054594, "grad_norm": 0.9756797392272853, "learning_rate": 4.7085888429288414e-07, "loss": 0.0207, "step": 5700 }, { "epoch": 2.5937215650591448, "grad_norm": 1.108804502127688, "learning_rate": 4.7071619884284945e-07, "loss": 0.0232, "step": 5701 }, { "epoch": 2.5941765241128296, "grad_norm": 0.5571435677840592, "learning_rate": 4.705735157857547e-07, "loss": 0.0104, "step": 5702 }, { "epoch": 2.594631483166515, "grad_norm": 0.7313693901865022, "learning_rate": 4.7043083513325927e-07, "loss": 0.0126, "step": 5703 }, { "epoch": 2.5950864422202002, "grad_norm": 0.9325964421843673, "learning_rate": 4.702881568970226e-07, "loss": 0.0224, "step": 5704 }, { "epoch": 2.595541401273885, "grad_norm": 0.988593207526867, "learning_rate": 4.701454810887035e-07, "loss": 0.0309, "step": 5705 }, { "epoch": 2.5959963603275704, "grad_norm": 0.7702180447416548, "learning_rate": 4.70002807719961e-07, "loss": 0.0131, "step": 5706 }, { "epoch": 2.5964513193812557, "grad_norm": 1.1691574176400799, "learning_rate": 4.6986013680245346e-07, "loss": 0.0229, "step": 5707 }, { "epoch": 2.5969062784349406, "grad_norm": 1.252871941683342, "learning_rate": 4.6971746834783956e-07, "loss": 0.0237, "step": 5708 }, { "epoch": 2.597361237488626, "grad_norm": 1.165357741026103, "learning_rate": 4.6957480236777725e-07, "loss": 0.0096, "step": 5709 }, { "epoch": 2.597816196542311, "grad_norm": 0.8274156192567905, "learning_rate": 4.6943213887392456e-07, "loss": 0.0182, "step": 5710 }, { "epoch": 2.598271155595996, "grad_norm": 0.7789998726687359, "learning_rate": 4.692894778779397e-07, "loss": 0.0104, "step": 5711 }, { "epoch": 2.5987261146496814, "grad_norm": 0.9932822459308809, "learning_rate": 4.691468193914799e-07, "loss": 0.0202, "step": 5712 }, { "epoch": 2.5991810737033667, "grad_norm": 0.7374055816384486, "learning_rate": 4.690041634262027e-07, "loss": 0.0246, "step": 5713 }, { "epoch": 2.599636032757052, "grad_norm": 0.8051055366433898, "learning_rate": 4.6886150999376547e-07, "loss": 0.0194, "step": 5714 }, { "epoch": 2.600090991810737, "grad_norm": 0.47650518051791996, "learning_rate": 4.687188591058248e-07, "loss": 0.0051, "step": 5715 }, { "epoch": 2.600545950864422, "grad_norm": 0.974173493405615, "learning_rate": 4.6857621077403787e-07, "loss": 0.0193, "step": 5716 }, { "epoch": 2.6010009099181075, "grad_norm": 1.541284010691708, "learning_rate": 4.6843356501006083e-07, "loss": 0.0176, "step": 5717 }, { "epoch": 2.6014558689717924, "grad_norm": 0.6922724901429589, "learning_rate": 4.682909218255504e-07, "loss": 0.0092, "step": 5718 }, { "epoch": 2.6019108280254777, "grad_norm": 0.8825965813793325, "learning_rate": 4.6814828123216276e-07, "loss": 0.0217, "step": 5719 }, { "epoch": 2.602365787079163, "grad_norm": 0.6679462717898595, "learning_rate": 4.680056432415536e-07, "loss": 0.0142, "step": 5720 }, { "epoch": 2.6028207461328483, "grad_norm": 0.6840554065436956, "learning_rate": 4.6786300786537895e-07, "loss": 0.0068, "step": 5721 }, { "epoch": 2.603275705186533, "grad_norm": 1.7728388469093161, "learning_rate": 4.6772037511529404e-07, "loss": 0.0225, "step": 5722 }, { "epoch": 2.6037306642402185, "grad_norm": 0.6726235325025798, "learning_rate": 4.6757774500295443e-07, "loss": 0.012, "step": 5723 }, { "epoch": 2.604185623293904, "grad_norm": 0.855097041615118, "learning_rate": 4.674351175400149e-07, "loss": 0.0194, "step": 5724 }, { "epoch": 2.6046405823475887, "grad_norm": 0.5892762130564574, "learning_rate": 4.672924927381305e-07, "loss": 0.0102, "step": 5725 }, { "epoch": 2.605095541401274, "grad_norm": 0.8833823837668904, "learning_rate": 4.671498706089561e-07, "loss": 0.0168, "step": 5726 }, { "epoch": 2.6055505004549593, "grad_norm": 1.5823758237199443, "learning_rate": 4.6700725116414575e-07, "loss": 0.0313, "step": 5727 }, { "epoch": 2.606005459508644, "grad_norm": 1.3637703884524808, "learning_rate": 4.6686463441535395e-07, "loss": 0.0241, "step": 5728 }, { "epoch": 2.6064604185623295, "grad_norm": 0.8650958211048303, "learning_rate": 4.6672202037423446e-07, "loss": 0.0166, "step": 5729 }, { "epoch": 2.6069153776160148, "grad_norm": 1.4620318033167987, "learning_rate": 4.6657940905244136e-07, "loss": 0.0203, "step": 5730 }, { "epoch": 2.6073703366696996, "grad_norm": 1.5779774714091757, "learning_rate": 4.6643680046162774e-07, "loss": 0.0205, "step": 5731 }, { "epoch": 2.607825295723385, "grad_norm": 1.0935290218389502, "learning_rate": 4.6629419461344735e-07, "loss": 0.0243, "step": 5732 }, { "epoch": 2.6082802547770703, "grad_norm": 0.948796825108346, "learning_rate": 4.6615159151955336e-07, "loss": 0.0334, "step": 5733 }, { "epoch": 2.608735213830755, "grad_norm": 1.031406658629183, "learning_rate": 4.6600899119159824e-07, "loss": 0.0172, "step": 5734 }, { "epoch": 2.6091901728844404, "grad_norm": 0.8534893582574565, "learning_rate": 4.65866393641235e-07, "loss": 0.0214, "step": 5735 }, { "epoch": 2.6096451319381258, "grad_norm": 0.6130737282051778, "learning_rate": 4.657237988801158e-07, "loss": 0.0153, "step": 5736 }, { "epoch": 2.6101000909918106, "grad_norm": 0.6333837786086722, "learning_rate": 4.6558120691989313e-07, "loss": 0.0157, "step": 5737 }, { "epoch": 2.610555050045496, "grad_norm": 0.6550094953472325, "learning_rate": 4.654386177722185e-07, "loss": 0.0109, "step": 5738 }, { "epoch": 2.6110100090991812, "grad_norm": 0.5137022558178973, "learning_rate": 4.652960314487441e-07, "loss": 0.0096, "step": 5739 }, { "epoch": 2.611464968152866, "grad_norm": 0.9313614250184487, "learning_rate": 4.6515344796112137e-07, "loss": 0.0309, "step": 5740 }, { "epoch": 2.6119199272065514, "grad_norm": 0.7817399028003837, "learning_rate": 4.6501086732100143e-07, "loss": 0.0147, "step": 5741 }, { "epoch": 2.6123748862602367, "grad_norm": 0.7751536204638493, "learning_rate": 4.648682895400355e-07, "loss": 0.0158, "step": 5742 }, { "epoch": 2.6128298453139216, "grad_norm": 0.7428450485154856, "learning_rate": 4.647257146298742e-07, "loss": 0.0173, "step": 5743 }, { "epoch": 2.613284804367607, "grad_norm": 1.8996622218734374, "learning_rate": 4.645831426021683e-07, "loss": 0.0584, "step": 5744 }, { "epoch": 2.613739763421292, "grad_norm": 1.0762068000530194, "learning_rate": 4.644405734685678e-07, "loss": 0.0278, "step": 5745 }, { "epoch": 2.614194722474977, "grad_norm": 1.2651970055905044, "learning_rate": 4.6429800724072323e-07, "loss": 0.0259, "step": 5746 }, { "epoch": 2.6146496815286624, "grad_norm": 0.9109019538579749, "learning_rate": 4.6415544393028425e-07, "loss": 0.0155, "step": 5747 }, { "epoch": 2.6151046405823477, "grad_norm": 0.7661745829692606, "learning_rate": 4.6401288354890066e-07, "loss": 0.0215, "step": 5748 }, { "epoch": 2.6155595996360326, "grad_norm": 0.8684617946237729, "learning_rate": 4.638703261082216e-07, "loss": 0.0228, "step": 5749 }, { "epoch": 2.616014558689718, "grad_norm": 0.611654974661684, "learning_rate": 4.6372777161989635e-07, "loss": 0.0117, "step": 5750 }, { "epoch": 2.616469517743403, "grad_norm": 0.8250460451090782, "learning_rate": 4.635852200955737e-07, "loss": 0.0112, "step": 5751 }, { "epoch": 2.616924476797088, "grad_norm": 2.3625725048926385, "learning_rate": 4.6344267154690236e-07, "loss": 0.0182, "step": 5752 }, { "epoch": 2.6173794358507734, "grad_norm": 0.5864471236847816, "learning_rate": 4.6330012598553104e-07, "loss": 0.0071, "step": 5753 }, { "epoch": 2.6178343949044587, "grad_norm": 0.6029243501161677, "learning_rate": 4.631575834231075e-07, "loss": 0.0162, "step": 5754 }, { "epoch": 2.6182893539581436, "grad_norm": 0.8283198550387613, "learning_rate": 4.6301504387128005e-07, "loss": 0.0242, "step": 5755 }, { "epoch": 2.618744313011829, "grad_norm": 0.6450705911104295, "learning_rate": 4.62872507341696e-07, "loss": 0.0218, "step": 5756 }, { "epoch": 2.619199272065514, "grad_norm": 0.8056293143418414, "learning_rate": 4.627299738460032e-07, "loss": 0.0144, "step": 5757 }, { "epoch": 2.619654231119199, "grad_norm": 0.6311831711217202, "learning_rate": 4.6258744339584846e-07, "loss": 0.0158, "step": 5758 }, { "epoch": 2.6201091901728844, "grad_norm": 0.634437980253563, "learning_rate": 4.624449160028788e-07, "loss": 0.02, "step": 5759 }, { "epoch": 2.6205641492265697, "grad_norm": 0.4646946534298653, "learning_rate": 4.623023916787412e-07, "loss": 0.0063, "step": 5760 }, { "epoch": 2.6210191082802545, "grad_norm": 0.7474241204563751, "learning_rate": 4.621598704350818e-07, "loss": 0.0118, "step": 5761 }, { "epoch": 2.62147406733394, "grad_norm": 0.8878879334402332, "learning_rate": 4.6201735228354707e-07, "loss": 0.0318, "step": 5762 }, { "epoch": 2.621929026387625, "grad_norm": 0.6327685647984093, "learning_rate": 4.6187483723578265e-07, "loss": 0.009, "step": 5763 }, { "epoch": 2.62238398544131, "grad_norm": 0.902920749187125, "learning_rate": 4.617323253034344e-07, "loss": 0.022, "step": 5764 }, { "epoch": 2.6228389444949953, "grad_norm": 0.9613283965368543, "learning_rate": 4.6158981649814766e-07, "loss": 0.0184, "step": 5765 }, { "epoch": 2.6232939035486806, "grad_norm": 0.99258822051554, "learning_rate": 4.614473108315676e-07, "loss": 0.0247, "step": 5766 }, { "epoch": 2.623748862602366, "grad_norm": 0.7737830472866187, "learning_rate": 4.613048083153393e-07, "loss": 0.0281, "step": 5767 }, { "epoch": 2.624203821656051, "grad_norm": 0.6073522593524685, "learning_rate": 4.6116230896110726e-07, "loss": 0.0101, "step": 5768 }, { "epoch": 2.624658780709736, "grad_norm": 0.6840939564709326, "learning_rate": 4.6101981278051607e-07, "loss": 0.0166, "step": 5769 }, { "epoch": 2.6251137397634214, "grad_norm": 0.6413204604141409, "learning_rate": 4.608773197852096e-07, "loss": 0.0112, "step": 5770 }, { "epoch": 2.6255686988171063, "grad_norm": 1.5560477616014063, "learning_rate": 4.6073482998683205e-07, "loss": 0.0142, "step": 5771 }, { "epoch": 2.6260236578707916, "grad_norm": 0.9411709228768057, "learning_rate": 4.605923433970267e-07, "loss": 0.023, "step": 5772 }, { "epoch": 2.626478616924477, "grad_norm": 0.7656054428442008, "learning_rate": 4.6044986002743703e-07, "loss": 0.0181, "step": 5773 }, { "epoch": 2.6269335759781622, "grad_norm": 0.5975353625846819, "learning_rate": 4.6030737988970635e-07, "loss": 0.01, "step": 5774 }, { "epoch": 2.627388535031847, "grad_norm": 0.8925929441774604, "learning_rate": 4.601649029954773e-07, "loss": 0.02, "step": 5775 }, { "epoch": 2.6278434940855324, "grad_norm": 0.7078425583175544, "learning_rate": 4.6002242935639254e-07, "loss": 0.0243, "step": 5776 }, { "epoch": 2.6282984531392177, "grad_norm": 0.8749689903750962, "learning_rate": 4.598799589840942e-07, "loss": 0.0177, "step": 5777 }, { "epoch": 2.6287534121929026, "grad_norm": 0.6253769351170928, "learning_rate": 4.597374918902246e-07, "loss": 0.014, "step": 5778 }, { "epoch": 2.629208371246588, "grad_norm": 0.8908638361366829, "learning_rate": 4.5959502808642513e-07, "loss": 0.0289, "step": 5779 }, { "epoch": 2.629663330300273, "grad_norm": 1.0857228446405067, "learning_rate": 4.594525675843375e-07, "loss": 0.0225, "step": 5780 }, { "epoch": 2.630118289353958, "grad_norm": 0.6209291079894165, "learning_rate": 4.5931011039560306e-07, "loss": 0.0135, "step": 5781 }, { "epoch": 2.6305732484076434, "grad_norm": 0.6282774465924835, "learning_rate": 4.5916765653186254e-07, "loss": 0.0135, "step": 5782 }, { "epoch": 2.6310282074613287, "grad_norm": 0.5995863959481829, "learning_rate": 4.590252060047569e-07, "loss": 0.0115, "step": 5783 }, { "epoch": 2.6314831665150136, "grad_norm": 1.1220324338450895, "learning_rate": 4.5888275882592636e-07, "loss": 0.03, "step": 5784 }, { "epoch": 2.631938125568699, "grad_norm": 0.8897379823732585, "learning_rate": 4.5874031500701097e-07, "loss": 0.0134, "step": 5785 }, { "epoch": 2.632393084622384, "grad_norm": 0.8829886216827669, "learning_rate": 4.585978745596509e-07, "loss": 0.0254, "step": 5786 }, { "epoch": 2.632848043676069, "grad_norm": 1.0807796112345083, "learning_rate": 4.584554374954852e-07, "loss": 0.0222, "step": 5787 }, { "epoch": 2.6333030027297544, "grad_norm": 0.6758953261732099, "learning_rate": 4.583130038261537e-07, "loss": 0.0233, "step": 5788 }, { "epoch": 2.6337579617834397, "grad_norm": 0.8789626654344367, "learning_rate": 4.5817057356329543e-07, "loss": 0.0193, "step": 5789 }, { "epoch": 2.6342129208371245, "grad_norm": 0.6144331458207094, "learning_rate": 4.5802814671854877e-07, "loss": 0.0151, "step": 5790 }, { "epoch": 2.63466787989081, "grad_norm": 0.6219665155374587, "learning_rate": 4.578857233035526e-07, "loss": 0.009, "step": 5791 }, { "epoch": 2.635122838944495, "grad_norm": 0.5521652643662276, "learning_rate": 4.5774330332994484e-07, "loss": 0.0126, "step": 5792 }, { "epoch": 2.63557779799818, "grad_norm": 0.7975627805702227, "learning_rate": 4.576008868093636e-07, "loss": 0.0165, "step": 5793 }, { "epoch": 2.6360327570518653, "grad_norm": 0.7318172339001513, "learning_rate": 4.5745847375344617e-07, "loss": 0.0167, "step": 5794 }, { "epoch": 2.6364877161055507, "grad_norm": 0.9080512092142611, "learning_rate": 4.573160641738303e-07, "loss": 0.0123, "step": 5795 }, { "epoch": 2.6369426751592355, "grad_norm": 0.654906484540582, "learning_rate": 4.57173658082153e-07, "loss": 0.0212, "step": 5796 }, { "epoch": 2.637397634212921, "grad_norm": 1.0095015796301097, "learning_rate": 4.5703125549005076e-07, "loss": 0.0221, "step": 5797 }, { "epoch": 2.637852593266606, "grad_norm": 0.5929623029600651, "learning_rate": 4.568888564091605e-07, "loss": 0.0209, "step": 5798 }, { "epoch": 2.638307552320291, "grad_norm": 1.0814217723864206, "learning_rate": 4.5674646085111806e-07, "loss": 0.0165, "step": 5799 }, { "epoch": 2.6387625113739763, "grad_norm": 1.3531839745000382, "learning_rate": 4.5660406882755965e-07, "loss": 0.0216, "step": 5800 }, { "epoch": 2.6392174704276616, "grad_norm": 0.6775824084127721, "learning_rate": 4.5646168035012044e-07, "loss": 0.0165, "step": 5801 }, { "epoch": 2.6396724294813465, "grad_norm": 0.4508007381777027, "learning_rate": 4.563192954304363e-07, "loss": 0.0113, "step": 5802 }, { "epoch": 2.640127388535032, "grad_norm": 0.9640026923707415, "learning_rate": 4.561769140801422e-07, "loss": 0.0284, "step": 5803 }, { "epoch": 2.640582347588717, "grad_norm": 1.1452789792944535, "learning_rate": 4.560345363108726e-07, "loss": 0.0329, "step": 5804 }, { "epoch": 2.641037306642402, "grad_norm": 0.8077178500892105, "learning_rate": 4.558921621342623e-07, "loss": 0.02, "step": 5805 }, { "epoch": 2.6414922656960873, "grad_norm": 0.9230253145961156, "learning_rate": 4.557497915619452e-07, "loss": 0.0119, "step": 5806 }, { "epoch": 2.6419472247497726, "grad_norm": 0.7563975398357169, "learning_rate": 4.556074246055554e-07, "loss": 0.0099, "step": 5807 }, { "epoch": 2.6424021838034575, "grad_norm": 0.5708661366494066, "learning_rate": 4.5546506127672616e-07, "loss": 0.0113, "step": 5808 }, { "epoch": 2.642857142857143, "grad_norm": 0.4764514429095443, "learning_rate": 4.553227015870911e-07, "loss": 0.0088, "step": 5809 }, { "epoch": 2.643312101910828, "grad_norm": 0.770157218775655, "learning_rate": 4.5518034554828327e-07, "loss": 0.0213, "step": 5810 }, { "epoch": 2.643767060964513, "grad_norm": 0.7436842728220171, "learning_rate": 4.5503799317193504e-07, "loss": 0.0228, "step": 5811 }, { "epoch": 2.6442220200181983, "grad_norm": 0.8726476100961948, "learning_rate": 4.548956444696791e-07, "loss": 0.0143, "step": 5812 }, { "epoch": 2.6446769790718836, "grad_norm": 0.7407865548635287, "learning_rate": 4.547532994531473e-07, "loss": 0.0112, "step": 5813 }, { "epoch": 2.6451319381255685, "grad_norm": 0.9607010457003754, "learning_rate": 4.546109581339716e-07, "loss": 0.0292, "step": 5814 }, { "epoch": 2.6455868971792538, "grad_norm": 1.1594623843755911, "learning_rate": 4.544686205237832e-07, "loss": 0.0134, "step": 5815 }, { "epoch": 2.646041856232939, "grad_norm": 0.8833250121994172, "learning_rate": 4.543262866342137e-07, "loss": 0.0329, "step": 5816 }, { "epoch": 2.646496815286624, "grad_norm": 0.7089886302903666, "learning_rate": 4.5418395647689385e-07, "loss": 0.0204, "step": 5817 }, { "epoch": 2.6469517743403093, "grad_norm": 1.435721753176248, "learning_rate": 4.540416300634541e-07, "loss": 0.0189, "step": 5818 }, { "epoch": 2.6474067333939946, "grad_norm": 1.081719842454097, "learning_rate": 4.5389930740552484e-07, "loss": 0.0275, "step": 5819 }, { "epoch": 2.6478616924476794, "grad_norm": 0.5170914484417846, "learning_rate": 4.5375698851473603e-07, "loss": 0.0089, "step": 5820 }, { "epoch": 2.6483166515013647, "grad_norm": 1.0898475605566929, "learning_rate": 4.536146734027172e-07, "loss": 0.0197, "step": 5821 }, { "epoch": 2.64877161055505, "grad_norm": 0.6246870419895297, "learning_rate": 4.534723620810976e-07, "loss": 0.0178, "step": 5822 }, { "epoch": 2.6492265696087354, "grad_norm": 0.8573397422218126, "learning_rate": 4.5333005456150675e-07, "loss": 0.019, "step": 5823 }, { "epoch": 2.6496815286624202, "grad_norm": 1.197738932447692, "learning_rate": 4.5318775085557293e-07, "loss": 0.0113, "step": 5824 }, { "epoch": 2.6501364877161055, "grad_norm": 0.7722692116475826, "learning_rate": 4.5304545097492487e-07, "loss": 0.0258, "step": 5825 }, { "epoch": 2.650591446769791, "grad_norm": 0.6119890659303509, "learning_rate": 4.529031549311904e-07, "loss": 0.012, "step": 5826 }, { "epoch": 2.6510464058234757, "grad_norm": 0.7500102560896698, "learning_rate": 4.5276086273599745e-07, "loss": 0.0286, "step": 5827 }, { "epoch": 2.651501364877161, "grad_norm": 0.8672914758354598, "learning_rate": 4.526185744009734e-07, "loss": 0.0249, "step": 5828 }, { "epoch": 2.6519563239308463, "grad_norm": 0.7480975852594409, "learning_rate": 4.5247628993774533e-07, "loss": 0.015, "step": 5829 }, { "epoch": 2.6524112829845317, "grad_norm": 0.8599217704915689, "learning_rate": 4.523340093579405e-07, "loss": 0.029, "step": 5830 }, { "epoch": 2.6528662420382165, "grad_norm": 1.3644508362985992, "learning_rate": 4.5219173267318506e-07, "loss": 0.0185, "step": 5831 }, { "epoch": 2.653321201091902, "grad_norm": 0.5193128365917108, "learning_rate": 4.520494598951055e-07, "loss": 0.0125, "step": 5832 }, { "epoch": 2.653776160145587, "grad_norm": 0.8922058931115135, "learning_rate": 4.5190719103532726e-07, "loss": 0.0126, "step": 5833 }, { "epoch": 2.654231119199272, "grad_norm": 0.7758198673443419, "learning_rate": 4.517649261054764e-07, "loss": 0.0191, "step": 5834 }, { "epoch": 2.6546860782529573, "grad_norm": 0.7636260637959255, "learning_rate": 4.5162266511717785e-07, "loss": 0.0233, "step": 5835 }, { "epoch": 2.6551410373066426, "grad_norm": 0.738463080811571, "learning_rate": 4.5148040808205644e-07, "loss": 0.0221, "step": 5836 }, { "epoch": 2.6555959963603275, "grad_norm": 0.9625683918110185, "learning_rate": 4.513381550117373e-07, "loss": 0.0192, "step": 5837 }, { "epoch": 2.656050955414013, "grad_norm": 0.6597012732687209, "learning_rate": 4.511959059178442e-07, "loss": 0.0156, "step": 5838 }, { "epoch": 2.656505914467698, "grad_norm": 0.7904034534895265, "learning_rate": 4.510536608120014e-07, "loss": 0.0217, "step": 5839 }, { "epoch": 2.656960873521383, "grad_norm": 0.767343601685618, "learning_rate": 4.509114197058323e-07, "loss": 0.0211, "step": 5840 }, { "epoch": 2.6574158325750683, "grad_norm": 0.7597052487415956, "learning_rate": 4.5076918261096037e-07, "loss": 0.0115, "step": 5841 }, { "epoch": 2.6578707916287536, "grad_norm": 0.5824670830837653, "learning_rate": 4.506269495390084e-07, "loss": 0.011, "step": 5842 }, { "epoch": 2.6583257506824385, "grad_norm": 0.7076622802295155, "learning_rate": 4.5048472050159906e-07, "loss": 0.014, "step": 5843 }, { "epoch": 2.658780709736124, "grad_norm": 0.7527939246634165, "learning_rate": 4.50342495510355e-07, "loss": 0.0177, "step": 5844 }, { "epoch": 2.659235668789809, "grad_norm": 0.7240199169746351, "learning_rate": 4.5020027457689787e-07, "loss": 0.0256, "step": 5845 }, { "epoch": 2.659690627843494, "grad_norm": 0.6563234115573644, "learning_rate": 4.500580577128494e-07, "loss": 0.0157, "step": 5846 }, { "epoch": 2.6601455868971793, "grad_norm": 0.736564682878924, "learning_rate": 4.499158449298309e-07, "loss": 0.016, "step": 5847 }, { "epoch": 2.6606005459508646, "grad_norm": 0.9261499922506324, "learning_rate": 4.497736362394635e-07, "loss": 0.0162, "step": 5848 }, { "epoch": 2.6610555050045495, "grad_norm": 0.9900369041852004, "learning_rate": 4.496314316533676e-07, "loss": 0.0249, "step": 5849 }, { "epoch": 2.6615104640582348, "grad_norm": 2.3399655085957742, "learning_rate": 4.494892311831635e-07, "loss": 0.0223, "step": 5850 }, { "epoch": 2.66196542311192, "grad_norm": 0.7853107317634216, "learning_rate": 4.4934703484047154e-07, "loss": 0.0283, "step": 5851 }, { "epoch": 2.662420382165605, "grad_norm": 0.790484340375656, "learning_rate": 4.4920484263691103e-07, "loss": 0.0231, "step": 5852 }, { "epoch": 2.6628753412192903, "grad_norm": 0.7082774291257431, "learning_rate": 4.4906265458410155e-07, "loss": 0.0159, "step": 5853 }, { "epoch": 2.6633303002729756, "grad_norm": 0.7902574360760118, "learning_rate": 4.4892047069366173e-07, "loss": 0.009, "step": 5854 }, { "epoch": 2.6637852593266604, "grad_norm": 0.7344094701943281, "learning_rate": 4.4877829097721055e-07, "loss": 0.017, "step": 5855 }, { "epoch": 2.6642402183803457, "grad_norm": 0.645947808870827, "learning_rate": 4.486361154463661e-07, "loss": 0.0123, "step": 5856 }, { "epoch": 2.664695177434031, "grad_norm": 0.8158538499752196, "learning_rate": 4.484939441127461e-07, "loss": 0.0231, "step": 5857 }, { "epoch": 2.665150136487716, "grad_norm": 0.5648770230205923, "learning_rate": 4.483517769879685e-07, "loss": 0.0093, "step": 5858 }, { "epoch": 2.6656050955414012, "grad_norm": 0.9315239473536451, "learning_rate": 4.4820961408365053e-07, "loss": 0.0211, "step": 5859 }, { "epoch": 2.6660600545950865, "grad_norm": 0.9261475177004292, "learning_rate": 4.480674554114089e-07, "loss": 0.0234, "step": 5860 }, { "epoch": 2.6665150136487714, "grad_norm": 0.8311282211230085, "learning_rate": 4.4792530098286046e-07, "loss": 0.0202, "step": 5861 }, { "epoch": 2.6669699727024567, "grad_norm": 0.8550272364601884, "learning_rate": 4.4778315080962114e-07, "loss": 0.0129, "step": 5862 }, { "epoch": 2.667424931756142, "grad_norm": 0.9337327837224859, "learning_rate": 4.47641004903307e-07, "loss": 0.0207, "step": 5863 }, { "epoch": 2.667879890809827, "grad_norm": 1.341194376060801, "learning_rate": 4.4749886327553326e-07, "loss": 0.0149, "step": 5864 }, { "epoch": 2.668334849863512, "grad_norm": 0.8252563305193752, "learning_rate": 4.4735672593791544e-07, "loss": 0.0169, "step": 5865 }, { "epoch": 2.6687898089171975, "grad_norm": 0.8577965541356992, "learning_rate": 4.4721459290206836e-07, "loss": 0.0253, "step": 5866 }, { "epoch": 2.6692447679708824, "grad_norm": 0.7531228033817016, "learning_rate": 4.4707246417960633e-07, "loss": 0.0188, "step": 5867 }, { "epoch": 2.6696997270245677, "grad_norm": 0.9814876908219119, "learning_rate": 4.469303397821435e-07, "loss": 0.036, "step": 5868 }, { "epoch": 2.670154686078253, "grad_norm": 0.926355675637972, "learning_rate": 4.467882197212936e-07, "loss": 0.0248, "step": 5869 }, { "epoch": 2.670609645131938, "grad_norm": 0.7440978117349105, "learning_rate": 4.466461040086702e-07, "loss": 0.0151, "step": 5870 }, { "epoch": 2.671064604185623, "grad_norm": 1.1703512825665225, "learning_rate": 4.4650399265588594e-07, "loss": 0.0167, "step": 5871 }, { "epoch": 2.6715195632393085, "grad_norm": 0.7757210535715979, "learning_rate": 4.46361885674554e-07, "loss": 0.0109, "step": 5872 }, { "epoch": 2.6719745222929934, "grad_norm": 0.37825500065687934, "learning_rate": 4.462197830762866e-07, "loss": 0.0083, "step": 5873 }, { "epoch": 2.6724294813466787, "grad_norm": 1.2907620431424505, "learning_rate": 4.460776848726956e-07, "loss": 0.0361, "step": 5874 }, { "epoch": 2.672884440400364, "grad_norm": 0.8841293042402575, "learning_rate": 4.4593559107539274e-07, "loss": 0.0136, "step": 5875 }, { "epoch": 2.673339399454049, "grad_norm": 0.9234670984534333, "learning_rate": 4.4579350169598915e-07, "loss": 0.0142, "step": 5876 }, { "epoch": 2.673794358507734, "grad_norm": 0.744649317258839, "learning_rate": 4.456514167460959e-07, "loss": 0.0183, "step": 5877 }, { "epoch": 2.6742493175614195, "grad_norm": 0.7388414315786686, "learning_rate": 4.455093362373232e-07, "loss": 0.0164, "step": 5878 }, { "epoch": 2.674704276615105, "grad_norm": 0.5594305680740872, "learning_rate": 4.453672601812816e-07, "loss": 0.0151, "step": 5879 }, { "epoch": 2.6751592356687897, "grad_norm": 0.6353346299966872, "learning_rate": 4.452251885895809e-07, "loss": 0.018, "step": 5880 }, { "epoch": 2.675614194722475, "grad_norm": 0.984580751267218, "learning_rate": 4.450831214738303e-07, "loss": 0.016, "step": 5881 }, { "epoch": 2.6760691537761603, "grad_norm": 0.977918642406745, "learning_rate": 4.449410588456391e-07, "loss": 0.0262, "step": 5882 }, { "epoch": 2.676524112829845, "grad_norm": 0.7137074774684236, "learning_rate": 4.4479900071661585e-07, "loss": 0.018, "step": 5883 }, { "epoch": 2.6769790718835305, "grad_norm": 0.9768365321842841, "learning_rate": 4.446569470983691e-07, "loss": 0.025, "step": 5884 }, { "epoch": 2.6774340309372158, "grad_norm": 0.6086708442510752, "learning_rate": 4.445148980025064e-07, "loss": 0.0148, "step": 5885 }, { "epoch": 2.677888989990901, "grad_norm": 0.4495992273242621, "learning_rate": 4.4437285344063583e-07, "loss": 0.0058, "step": 5886 }, { "epoch": 2.678343949044586, "grad_norm": 0.8309690351861077, "learning_rate": 4.4423081342436466e-07, "loss": 0.0182, "step": 5887 }, { "epoch": 2.6787989080982713, "grad_norm": 0.6952224681739085, "learning_rate": 4.4408877796529935e-07, "loss": 0.0139, "step": 5888 }, { "epoch": 2.6792538671519566, "grad_norm": 0.6135938993847223, "learning_rate": 4.4394674707504676e-07, "loss": 0.0181, "step": 5889 }, { "epoch": 2.6797088262056414, "grad_norm": 1.1206551069371966, "learning_rate": 4.4380472076521286e-07, "loss": 0.0284, "step": 5890 }, { "epoch": 2.6801637852593267, "grad_norm": 0.7421031137963281, "learning_rate": 4.436626990474031e-07, "loss": 0.0219, "step": 5891 }, { "epoch": 2.680618744313012, "grad_norm": 0.8618467335915889, "learning_rate": 4.435206819332234e-07, "loss": 0.03, "step": 5892 }, { "epoch": 2.681073703366697, "grad_norm": 0.6353226669997998, "learning_rate": 4.433786694342786e-07, "loss": 0.0096, "step": 5893 }, { "epoch": 2.6815286624203822, "grad_norm": 1.4003832137190193, "learning_rate": 4.432366615621731e-07, "loss": 0.0447, "step": 5894 }, { "epoch": 2.6819836214740675, "grad_norm": 0.6936065883152062, "learning_rate": 4.4309465832851134e-07, "loss": 0.012, "step": 5895 }, { "epoch": 2.6824385805277524, "grad_norm": 0.7597689200472766, "learning_rate": 4.42952659744897e-07, "loss": 0.0249, "step": 5896 }, { "epoch": 2.6828935395814377, "grad_norm": 1.0228295725638785, "learning_rate": 4.428106658229339e-07, "loss": 0.0086, "step": 5897 }, { "epoch": 2.683348498635123, "grad_norm": 0.7255684895387678, "learning_rate": 4.4266867657422465e-07, "loss": 0.0241, "step": 5898 }, { "epoch": 2.683803457688808, "grad_norm": 1.0046734420861207, "learning_rate": 4.4252669201037234e-07, "loss": 0.0157, "step": 5899 }, { "epoch": 2.684258416742493, "grad_norm": 0.5714529216367379, "learning_rate": 4.4238471214297933e-07, "loss": 0.0131, "step": 5900 }, { "epoch": 2.6847133757961785, "grad_norm": 0.6649413446230127, "learning_rate": 4.4224273698364734e-07, "loss": 0.0157, "step": 5901 }, { "epoch": 2.6851683348498634, "grad_norm": 0.6741319773802624, "learning_rate": 4.421007665439782e-07, "loss": 0.015, "step": 5902 }, { "epoch": 2.6856232939035487, "grad_norm": 0.6397839395950378, "learning_rate": 4.419588008355728e-07, "loss": 0.009, "step": 5903 }, { "epoch": 2.686078252957234, "grad_norm": 0.7246057579880602, "learning_rate": 4.418168398700322e-07, "loss": 0.0154, "step": 5904 }, { "epoch": 2.686533212010919, "grad_norm": 0.925407939916406, "learning_rate": 4.416748836589565e-07, "loss": 0.0194, "step": 5905 }, { "epoch": 2.686988171064604, "grad_norm": 0.6922043221198736, "learning_rate": 4.41532932213946e-07, "loss": 0.0101, "step": 5906 }, { "epoch": 2.6874431301182895, "grad_norm": 1.4370028969921331, "learning_rate": 4.413909855466003e-07, "loss": 0.0163, "step": 5907 }, { "epoch": 2.6878980891719744, "grad_norm": 0.861538292189046, "learning_rate": 4.4124904366851856e-07, "loss": 0.0318, "step": 5908 }, { "epoch": 2.6883530482256597, "grad_norm": 0.7204489563615951, "learning_rate": 4.411071065912997e-07, "loss": 0.0163, "step": 5909 }, { "epoch": 2.688808007279345, "grad_norm": 0.603917967955805, "learning_rate": 4.4096517432654213e-07, "loss": 0.0089, "step": 5910 }, { "epoch": 2.68926296633303, "grad_norm": 0.8253749362961054, "learning_rate": 4.408232468858439e-07, "loss": 0.0283, "step": 5911 }, { "epoch": 2.689717925386715, "grad_norm": 0.7290234024656845, "learning_rate": 4.406813242808025e-07, "loss": 0.0124, "step": 5912 }, { "epoch": 2.6901728844404005, "grad_norm": 0.9618514981678683, "learning_rate": 4.405394065230156e-07, "loss": 0.03, "step": 5913 }, { "epoch": 2.6906278434940853, "grad_norm": 1.0350805015335294, "learning_rate": 4.403974936240799e-07, "loss": 0.0226, "step": 5914 }, { "epoch": 2.6910828025477707, "grad_norm": 0.644223161291037, "learning_rate": 4.4025558559559177e-07, "loss": 0.0091, "step": 5915 }, { "epoch": 2.691537761601456, "grad_norm": 0.8869043085221809, "learning_rate": 4.401136824491475e-07, "loss": 0.0322, "step": 5916 }, { "epoch": 2.691992720655141, "grad_norm": 0.5527385603605773, "learning_rate": 4.399717841963425e-07, "loss": 0.0133, "step": 5917 }, { "epoch": 2.692447679708826, "grad_norm": 0.8729982868901436, "learning_rate": 4.3982989084877234e-07, "loss": 0.0337, "step": 5918 }, { "epoch": 2.6929026387625115, "grad_norm": 1.0311114640300392, "learning_rate": 4.396880024180316e-07, "loss": 0.0222, "step": 5919 }, { "epoch": 2.6933575978161963, "grad_norm": 1.0983171667911906, "learning_rate": 4.39546118915715e-07, "loss": 0.0177, "step": 5920 }, { "epoch": 2.6938125568698816, "grad_norm": 0.8982767427024921, "learning_rate": 4.394042403534167e-07, "loss": 0.0091, "step": 5921 }, { "epoch": 2.694267515923567, "grad_norm": 0.9264182784941586, "learning_rate": 4.392623667427301e-07, "loss": 0.0314, "step": 5922 }, { "epoch": 2.694722474977252, "grad_norm": 1.1516589070014576, "learning_rate": 4.3912049809524873e-07, "loss": 0.013, "step": 5923 }, { "epoch": 2.695177434030937, "grad_norm": 0.6891203285993182, "learning_rate": 4.389786344225652e-07, "loss": 0.0143, "step": 5924 }, { "epoch": 2.6956323930846224, "grad_norm": 0.8876548722581543, "learning_rate": 4.388367757362722e-07, "loss": 0.0259, "step": 5925 }, { "epoch": 2.6960873521383073, "grad_norm": 0.8752454355577217, "learning_rate": 4.386949220479614e-07, "loss": 0.0153, "step": 5926 }, { "epoch": 2.6965423111919926, "grad_norm": 0.806724222611192, "learning_rate": 4.38553073369225e-07, "loss": 0.0202, "step": 5927 }, { "epoch": 2.696997270245678, "grad_norm": 0.5834564683069716, "learning_rate": 4.3841122971165387e-07, "loss": 0.0116, "step": 5928 }, { "epoch": 2.697452229299363, "grad_norm": 0.8332420974248967, "learning_rate": 4.38269391086839e-07, "loss": 0.0265, "step": 5929 }, { "epoch": 2.697907188353048, "grad_norm": 0.9531782590048881, "learning_rate": 4.3812755750637064e-07, "loss": 0.0245, "step": 5930 }, { "epoch": 2.6983621474067334, "grad_norm": 0.8919673316947738, "learning_rate": 4.3798572898183896e-07, "loss": 0.0318, "step": 5931 }, { "epoch": 2.6988171064604187, "grad_norm": 0.9260695000953123, "learning_rate": 4.378439055248333e-07, "loss": 0.021, "step": 5932 }, { "epoch": 2.6992720655141036, "grad_norm": 1.1419477781656502, "learning_rate": 4.377020871469429e-07, "loss": 0.028, "step": 5933 }, { "epoch": 2.699727024567789, "grad_norm": 0.800318484381603, "learning_rate": 4.3756027385975686e-07, "loss": 0.0127, "step": 5934 }, { "epoch": 2.700181983621474, "grad_norm": 0.8384682323158069, "learning_rate": 4.3741846567486307e-07, "loss": 0.0212, "step": 5935 }, { "epoch": 2.700636942675159, "grad_norm": 0.7007807672225901, "learning_rate": 4.372766626038498e-07, "loss": 0.0133, "step": 5936 }, { "epoch": 2.7010919017288444, "grad_norm": 0.8706793662139275, "learning_rate": 4.3713486465830433e-07, "loss": 0.0302, "step": 5937 }, { "epoch": 2.7015468607825297, "grad_norm": 0.6909623077100043, "learning_rate": 4.3699307184981394e-07, "loss": 0.0078, "step": 5938 }, { "epoch": 2.702001819836215, "grad_norm": 0.4449395247873451, "learning_rate": 4.3685128418996503e-07, "loss": 0.0055, "step": 5939 }, { "epoch": 2.7024567788899, "grad_norm": 0.9482967746197887, "learning_rate": 4.367095016903439e-07, "loss": 0.0125, "step": 5940 }, { "epoch": 2.702911737943585, "grad_norm": 0.6293517979281915, "learning_rate": 4.365677243625367e-07, "loss": 0.0163, "step": 5941 }, { "epoch": 2.7033666969972705, "grad_norm": 0.5480035981589818, "learning_rate": 4.364259522181285e-07, "loss": 0.0103, "step": 5942 }, { "epoch": 2.7038216560509554, "grad_norm": 0.6039778812558965, "learning_rate": 4.362841852687045e-07, "loss": 0.0128, "step": 5943 }, { "epoch": 2.7042766151046407, "grad_norm": 0.5581367358337688, "learning_rate": 4.36142423525849e-07, "loss": 0.0154, "step": 5944 }, { "epoch": 2.704731574158326, "grad_norm": 0.5924957450352203, "learning_rate": 4.3600066700114643e-07, "loss": 0.0118, "step": 5945 }, { "epoch": 2.705186533212011, "grad_norm": 0.7442962772053596, "learning_rate": 4.358589157061802e-07, "loss": 0.022, "step": 5946 }, { "epoch": 2.705641492265696, "grad_norm": 0.5810472472114041, "learning_rate": 4.357171696525336e-07, "loss": 0.01, "step": 5947 }, { "epoch": 2.7060964513193815, "grad_norm": 1.0646580323077037, "learning_rate": 4.3557542885178975e-07, "loss": 0.0171, "step": 5948 }, { "epoch": 2.7065514103730663, "grad_norm": 0.6939917864517325, "learning_rate": 4.354336933155309e-07, "loss": 0.0231, "step": 5949 }, { "epoch": 2.7070063694267517, "grad_norm": 1.2197728646155066, "learning_rate": 4.3529196305533915e-07, "loss": 0.0194, "step": 5950 }, { "epoch": 2.707461328480437, "grad_norm": 0.7618187260197782, "learning_rate": 4.351502380827958e-07, "loss": 0.0096, "step": 5951 }, { "epoch": 2.707916287534122, "grad_norm": 0.8431877430947808, "learning_rate": 4.350085184094823e-07, "loss": 0.0126, "step": 5952 }, { "epoch": 2.708371246587807, "grad_norm": 0.7824093168630901, "learning_rate": 4.3486680404697907e-07, "loss": 0.0149, "step": 5953 }, { "epoch": 2.7088262056414925, "grad_norm": 0.4935878040422845, "learning_rate": 4.347250950068664e-07, "loss": 0.0092, "step": 5954 }, { "epoch": 2.7092811646951773, "grad_norm": 0.8562238525190613, "learning_rate": 4.345833913007243e-07, "loss": 0.0247, "step": 5955 }, { "epoch": 2.7097361237488626, "grad_norm": 0.9679388512074037, "learning_rate": 4.34441692940132e-07, "loss": 0.0264, "step": 5956 }, { "epoch": 2.710191082802548, "grad_norm": 0.827026971043981, "learning_rate": 4.342999999366686e-07, "loss": 0.0269, "step": 5957 }, { "epoch": 2.710646041856233, "grad_norm": 0.9280726655958063, "learning_rate": 4.3415831230191244e-07, "loss": 0.0204, "step": 5958 }, { "epoch": 2.711101000909918, "grad_norm": 1.0834683813891153, "learning_rate": 4.340166300474417e-07, "loss": 0.0294, "step": 5959 }, { "epoch": 2.7115559599636034, "grad_norm": 0.5110556870790438, "learning_rate": 4.3387495318483384e-07, "loss": 0.0076, "step": 5960 }, { "epoch": 2.7120109190172883, "grad_norm": 1.3351145104381061, "learning_rate": 4.3373328172566615e-07, "loss": 0.0233, "step": 5961 }, { "epoch": 2.7124658780709736, "grad_norm": 1.3787095680160717, "learning_rate": 4.335916156815156e-07, "loss": 0.0195, "step": 5962 }, { "epoch": 2.712920837124659, "grad_norm": 0.6096539198582709, "learning_rate": 4.334499550639582e-07, "loss": 0.0107, "step": 5963 }, { "epoch": 2.713375796178344, "grad_norm": 0.8956749843924722, "learning_rate": 4.333082998845701e-07, "loss": 0.0231, "step": 5964 }, { "epoch": 2.713830755232029, "grad_norm": 0.8132124245607087, "learning_rate": 4.3316665015492654e-07, "loss": 0.0253, "step": 5965 }, { "epoch": 2.7142857142857144, "grad_norm": 1.5130668705534314, "learning_rate": 4.330250058866024e-07, "loss": 0.0318, "step": 5966 }, { "epoch": 2.7147406733393993, "grad_norm": 1.0132678314514625, "learning_rate": 4.328833670911724e-07, "loss": 0.0205, "step": 5967 }, { "epoch": 2.7151956323930846, "grad_norm": 0.6604062630753073, "learning_rate": 4.3274173378021034e-07, "loss": 0.0106, "step": 5968 }, { "epoch": 2.71565059144677, "grad_norm": 0.7129240792491904, "learning_rate": 4.326001059652902e-07, "loss": 0.0179, "step": 5969 }, { "epoch": 2.7161055505004548, "grad_norm": 0.8181223961595813, "learning_rate": 4.324584836579851e-07, "loss": 0.0095, "step": 5970 }, { "epoch": 2.71656050955414, "grad_norm": 0.862418912706931, "learning_rate": 4.323168668698676e-07, "loss": 0.0302, "step": 5971 }, { "epoch": 2.7170154686078254, "grad_norm": 0.7274516476862067, "learning_rate": 4.3217525561251016e-07, "loss": 0.0208, "step": 5972 }, { "epoch": 2.7174704276615103, "grad_norm": 0.902833875874707, "learning_rate": 4.320336498974845e-07, "loss": 0.0123, "step": 5973 }, { "epoch": 2.7179253867151956, "grad_norm": 0.8356912005638896, "learning_rate": 4.3189204973636206e-07, "loss": 0.0081, "step": 5974 }, { "epoch": 2.718380345768881, "grad_norm": 0.8286886612567862, "learning_rate": 4.3175045514071355e-07, "loss": 0.0168, "step": 5975 }, { "epoch": 2.7188353048225657, "grad_norm": 0.7266582827816958, "learning_rate": 4.316088661221098e-07, "loss": 0.0192, "step": 5976 }, { "epoch": 2.719290263876251, "grad_norm": 4.288031135353832, "learning_rate": 4.3146728269212076e-07, "loss": 0.0836, "step": 5977 }, { "epoch": 2.7197452229299364, "grad_norm": 1.0153396414284162, "learning_rate": 4.313257048623157e-07, "loss": 0.0134, "step": 5978 }, { "epoch": 2.7202001819836212, "grad_norm": 0.811569547423804, "learning_rate": 4.3118413264426415e-07, "loss": 0.0067, "step": 5979 }, { "epoch": 2.7206551410373065, "grad_norm": 0.7889727922026939, "learning_rate": 4.3104256604953424e-07, "loss": 0.0223, "step": 5980 }, { "epoch": 2.721110100090992, "grad_norm": 0.7359721887170986, "learning_rate": 4.309010050896946e-07, "loss": 0.0254, "step": 5981 }, { "epoch": 2.7215650591446767, "grad_norm": 0.7969680867666725, "learning_rate": 4.307594497763126e-07, "loss": 0.0219, "step": 5982 }, { "epoch": 2.722020018198362, "grad_norm": 0.7657026206062334, "learning_rate": 4.3061790012095576e-07, "loss": 0.0246, "step": 5983 }, { "epoch": 2.7224749772520473, "grad_norm": 0.8649116846921936, "learning_rate": 4.304763561351909e-07, "loss": 0.0148, "step": 5984 }, { "epoch": 2.722929936305732, "grad_norm": 0.6747256604421175, "learning_rate": 4.303348178305841e-07, "loss": 0.0152, "step": 5985 }, { "epoch": 2.7233848953594175, "grad_norm": 0.7099788022378745, "learning_rate": 4.301932852187016e-07, "loss": 0.013, "step": 5986 }, { "epoch": 2.723839854413103, "grad_norm": 1.0337810683408597, "learning_rate": 4.300517583111084e-07, "loss": 0.0243, "step": 5987 }, { "epoch": 2.724294813466788, "grad_norm": 0.7785870612387739, "learning_rate": 4.2991023711936974e-07, "loss": 0.016, "step": 5988 }, { "epoch": 2.724749772520473, "grad_norm": 0.7065940917831606, "learning_rate": 4.2976872165504974e-07, "loss": 0.0148, "step": 5989 }, { "epoch": 2.7252047315741583, "grad_norm": 0.8670044627619677, "learning_rate": 4.296272119297128e-07, "loss": 0.019, "step": 5990 }, { "epoch": 2.7256596906278436, "grad_norm": 0.8741495603228514, "learning_rate": 4.294857079549224e-07, "loss": 0.0146, "step": 5991 }, { "epoch": 2.7261146496815285, "grad_norm": 0.7894621309198404, "learning_rate": 4.293442097422414e-07, "loss": 0.0146, "step": 5992 }, { "epoch": 2.726569608735214, "grad_norm": 0.7174362465685148, "learning_rate": 4.292027173032326e-07, "loss": 0.0147, "step": 5993 }, { "epoch": 2.727024567788899, "grad_norm": 1.4654307371961268, "learning_rate": 4.290612306494579e-07, "loss": 0.0438, "step": 5994 }, { "epoch": 2.7274795268425844, "grad_norm": 0.7529497879993526, "learning_rate": 4.289197497924792e-07, "loss": 0.0286, "step": 5995 }, { "epoch": 2.7279344858962693, "grad_norm": 0.978274584601538, "learning_rate": 4.287782747438573e-07, "loss": 0.0379, "step": 5996 }, { "epoch": 2.7283894449499546, "grad_norm": 0.8146278445082874, "learning_rate": 4.286368055151534e-07, "loss": 0.0221, "step": 5997 }, { "epoch": 2.72884440400364, "grad_norm": 0.9544502492040587, "learning_rate": 4.2849534211792736e-07, "loss": 0.0092, "step": 5998 }, { "epoch": 2.729299363057325, "grad_norm": 0.8245416401511356, "learning_rate": 4.2835388456373913e-07, "loss": 0.0163, "step": 5999 }, { "epoch": 2.72975432211101, "grad_norm": 0.9624823954947755, "learning_rate": 4.282124328641479e-07, "loss": 0.0173, "step": 6000 }, { "epoch": 2.7302092811646954, "grad_norm": 0.8762848171259323, "learning_rate": 4.2807098703071254e-07, "loss": 0.0223, "step": 6001 }, { "epoch": 2.7306642402183803, "grad_norm": 0.8997924668919084, "learning_rate": 4.279295470749912e-07, "loss": 0.0114, "step": 6002 }, { "epoch": 2.7311191992720656, "grad_norm": 0.8308916977848857, "learning_rate": 4.277881130085416e-07, "loss": 0.0087, "step": 6003 }, { "epoch": 2.731574158325751, "grad_norm": 0.6990203218085058, "learning_rate": 4.2764668484292154e-07, "loss": 0.0094, "step": 6004 }, { "epoch": 2.7320291173794358, "grad_norm": 0.7764634713165659, "learning_rate": 4.275052625896876e-07, "loss": 0.026, "step": 6005 }, { "epoch": 2.732484076433121, "grad_norm": 0.7946186272784095, "learning_rate": 4.273638462603963e-07, "loss": 0.0177, "step": 6006 }, { "epoch": 2.7329390354868064, "grad_norm": 0.8636935599949715, "learning_rate": 4.2722243586660335e-07, "loss": 0.0069, "step": 6007 }, { "epoch": 2.7333939945404913, "grad_norm": 0.6573592285437179, "learning_rate": 4.270810314198644e-07, "loss": 0.0119, "step": 6008 }, { "epoch": 2.7338489535941766, "grad_norm": 0.8322081424874042, "learning_rate": 4.2693963293173415e-07, "loss": 0.0184, "step": 6009 }, { "epoch": 2.734303912647862, "grad_norm": 1.2749167311828593, "learning_rate": 4.2679824041376697e-07, "loss": 0.0232, "step": 6010 }, { "epoch": 2.7347588717015467, "grad_norm": 0.8971456986405165, "learning_rate": 4.266568538775174e-07, "loss": 0.0186, "step": 6011 }, { "epoch": 2.735213830755232, "grad_norm": 1.1848454358691587, "learning_rate": 4.2651547333453823e-07, "loss": 0.0098, "step": 6012 }, { "epoch": 2.7356687898089174, "grad_norm": 1.0510105468920348, "learning_rate": 4.2637409879638293e-07, "loss": 0.0166, "step": 6013 }, { "epoch": 2.7361237488626022, "grad_norm": 0.708904154301934, "learning_rate": 4.2623273027460366e-07, "loss": 0.0161, "step": 6014 }, { "epoch": 2.7365787079162875, "grad_norm": 1.008916990674995, "learning_rate": 4.260913677807527e-07, "loss": 0.0158, "step": 6015 }, { "epoch": 2.737033666969973, "grad_norm": 0.795763591866139, "learning_rate": 4.259500113263812e-07, "loss": 0.0156, "step": 6016 }, { "epoch": 2.7374886260236577, "grad_norm": 0.9954511665707747, "learning_rate": 4.2580866092304027e-07, "loss": 0.0194, "step": 6017 }, { "epoch": 2.737943585077343, "grad_norm": 0.6402092097402942, "learning_rate": 4.2566731658228076e-07, "loss": 0.0088, "step": 6018 }, { "epoch": 2.7383985441310283, "grad_norm": 0.9670965465764559, "learning_rate": 4.2552597831565233e-07, "loss": 0.0134, "step": 6019 }, { "epoch": 2.738853503184713, "grad_norm": 1.130038139444353, "learning_rate": 4.253846461347048e-07, "loss": 0.0288, "step": 6020 }, { "epoch": 2.7393084622383985, "grad_norm": 0.6513710626610757, "learning_rate": 4.252433200509868e-07, "loss": 0.0184, "step": 6021 }, { "epoch": 2.739763421292084, "grad_norm": 0.7594465988511604, "learning_rate": 4.251020000760474e-07, "loss": 0.0155, "step": 6022 }, { "epoch": 2.7402183803457687, "grad_norm": 0.6359473577973949, "learning_rate": 4.24960686221434e-07, "loss": 0.0111, "step": 6023 }, { "epoch": 2.740673339399454, "grad_norm": 1.427873230328534, "learning_rate": 4.2481937849869444e-07, "loss": 0.0368, "step": 6024 }, { "epoch": 2.7411282984531393, "grad_norm": 0.8038801926276932, "learning_rate": 4.2467807691937594e-07, "loss": 0.0168, "step": 6025 }, { "epoch": 2.741583257506824, "grad_norm": 2.5293607486260647, "learning_rate": 4.2453678149502483e-07, "loss": 0.0646, "step": 6026 }, { "epoch": 2.7420382165605095, "grad_norm": 1.418122928250183, "learning_rate": 4.2439549223718714e-07, "loss": 0.0137, "step": 6027 }, { "epoch": 2.742493175614195, "grad_norm": 0.9862049575492858, "learning_rate": 4.242542091574083e-07, "loss": 0.0231, "step": 6028 }, { "epoch": 2.7429481346678797, "grad_norm": 0.7421984643691355, "learning_rate": 4.2411293226723355e-07, "loss": 0.0176, "step": 6029 }, { "epoch": 2.743403093721565, "grad_norm": 0.6607461134238887, "learning_rate": 4.239716615782072e-07, "loss": 0.0118, "step": 6030 }, { "epoch": 2.7438580527752503, "grad_norm": 0.6949282088724084, "learning_rate": 4.238303971018732e-07, "loss": 0.0154, "step": 6031 }, { "epoch": 2.744313011828935, "grad_norm": 0.7268362760719965, "learning_rate": 4.236891388497754e-07, "loss": 0.012, "step": 6032 }, { "epoch": 2.7447679708826205, "grad_norm": 1.576912903668802, "learning_rate": 4.235478868334564e-07, "loss": 0.0233, "step": 6033 }, { "epoch": 2.745222929936306, "grad_norm": 0.7825783818316306, "learning_rate": 4.2340664106445894e-07, "loss": 0.0121, "step": 6034 }, { "epoch": 2.7456778889899907, "grad_norm": 0.5812574905528954, "learning_rate": 4.232654015543249e-07, "loss": 0.01, "step": 6035 }, { "epoch": 2.746132848043676, "grad_norm": 0.9393049391713764, "learning_rate": 4.2312416831459563e-07, "loss": 0.0251, "step": 6036 }, { "epoch": 2.7465878070973613, "grad_norm": 1.5572115317213067, "learning_rate": 4.229829413568122e-07, "loss": 0.0182, "step": 6037 }, { "epoch": 2.747042766151046, "grad_norm": 0.8328464483708349, "learning_rate": 4.2284172069251485e-07, "loss": 0.0157, "step": 6038 }, { "epoch": 2.7474977252047315, "grad_norm": 1.119149306659957, "learning_rate": 4.227005063332437e-07, "loss": 0.0426, "step": 6039 }, { "epoch": 2.7479526842584168, "grad_norm": 1.0440248500923561, "learning_rate": 4.2255929829053827e-07, "loss": 0.0233, "step": 6040 }, { "epoch": 2.7484076433121016, "grad_norm": 0.9814877038570765, "learning_rate": 4.2241809657593706e-07, "loss": 0.0308, "step": 6041 }, { "epoch": 2.748862602365787, "grad_norm": 0.815423626850715, "learning_rate": 4.222769012009788e-07, "loss": 0.0164, "step": 6042 }, { "epoch": 2.7493175614194723, "grad_norm": 0.6603123466963441, "learning_rate": 4.221357121772011e-07, "loss": 0.0127, "step": 6043 }, { "epoch": 2.7497725204731576, "grad_norm": 0.9731040754754687, "learning_rate": 4.2199452951614143e-07, "loss": 0.0265, "step": 6044 }, { "epoch": 2.7502274795268424, "grad_norm": 0.835657722949404, "learning_rate": 4.2185335322933636e-07, "loss": 0.0121, "step": 6045 }, { "epoch": 2.7506824385805277, "grad_norm": 1.0022525857875313, "learning_rate": 4.217121833283225e-07, "loss": 0.0223, "step": 6046 }, { "epoch": 2.751137397634213, "grad_norm": 0.827054246158979, "learning_rate": 4.215710198246355e-07, "loss": 0.0227, "step": 6047 }, { "epoch": 2.7515923566878984, "grad_norm": 0.9547508192581982, "learning_rate": 4.214298627298105e-07, "loss": 0.0165, "step": 6048 }, { "epoch": 2.7520473157415832, "grad_norm": 0.8262849111985905, "learning_rate": 4.2128871205538237e-07, "loss": 0.016, "step": 6049 }, { "epoch": 2.7525022747952685, "grad_norm": 0.6808625107703994, "learning_rate": 4.211475678128852e-07, "loss": 0.0185, "step": 6050 }, { "epoch": 2.752957233848954, "grad_norm": 1.0496638387713797, "learning_rate": 4.210064300138527e-07, "loss": 0.0251, "step": 6051 }, { "epoch": 2.7534121929026387, "grad_norm": 0.8795173252619963, "learning_rate": 4.208652986698179e-07, "loss": 0.0121, "step": 6052 }, { "epoch": 2.753867151956324, "grad_norm": 0.9921950878259996, "learning_rate": 4.207241737923136e-07, "loss": 0.0117, "step": 6053 }, { "epoch": 2.7543221110100093, "grad_norm": 0.36163307785453497, "learning_rate": 4.2058305539287187e-07, "loss": 0.0071, "step": 6054 }, { "epoch": 2.754777070063694, "grad_norm": 0.5102678618670893, "learning_rate": 4.204419434830241e-07, "loss": 0.0108, "step": 6055 }, { "epoch": 2.7552320291173795, "grad_norm": 0.9474990045936286, "learning_rate": 4.203008380743016e-07, "loss": 0.0334, "step": 6056 }, { "epoch": 2.755686988171065, "grad_norm": 1.0250623242401744, "learning_rate": 4.2015973917823456e-07, "loss": 0.0199, "step": 6057 }, { "epoch": 2.7561419472247497, "grad_norm": 0.7674386200973485, "learning_rate": 4.2001864680635317e-07, "loss": 0.0164, "step": 6058 }, { "epoch": 2.756596906278435, "grad_norm": 0.9198646006340169, "learning_rate": 4.198775609701866e-07, "loss": 0.0178, "step": 6059 }, { "epoch": 2.7570518653321203, "grad_norm": 0.48144760424537775, "learning_rate": 4.1973648168126396e-07, "loss": 0.0074, "step": 6060 }, { "epoch": 2.757506824385805, "grad_norm": 0.724080684947188, "learning_rate": 4.1959540895111375e-07, "loss": 0.021, "step": 6061 }, { "epoch": 2.7579617834394905, "grad_norm": 0.7407423162885999, "learning_rate": 4.1945434279126344e-07, "loss": 0.0121, "step": 6062 }, { "epoch": 2.758416742493176, "grad_norm": 3.723703047721801, "learning_rate": 4.1931328321324066e-07, "loss": 0.0749, "step": 6063 }, { "epoch": 2.7588717015468607, "grad_norm": 0.7140580856638739, "learning_rate": 4.1917223022857184e-07, "loss": 0.0141, "step": 6064 }, { "epoch": 2.759326660600546, "grad_norm": 0.6523503309815952, "learning_rate": 4.190311838487835e-07, "loss": 0.0159, "step": 6065 }, { "epoch": 2.7597816196542313, "grad_norm": 0.7184140855467785, "learning_rate": 4.188901440854009e-07, "loss": 0.0161, "step": 6066 }, { "epoch": 2.760236578707916, "grad_norm": 0.8756105474987347, "learning_rate": 4.187491109499496e-07, "loss": 0.0284, "step": 6067 }, { "epoch": 2.7606915377616015, "grad_norm": 0.5358357954470473, "learning_rate": 4.186080844539541e-07, "loss": 0.009, "step": 6068 }, { "epoch": 2.761146496815287, "grad_norm": 1.175064291754555, "learning_rate": 4.184670646089383e-07, "loss": 0.0276, "step": 6069 }, { "epoch": 2.7616014558689717, "grad_norm": 0.7505247905446183, "learning_rate": 4.1832605142642586e-07, "loss": 0.0134, "step": 6070 }, { "epoch": 2.762056414922657, "grad_norm": 0.7764293531626258, "learning_rate": 4.1818504491793965e-07, "loss": 0.0156, "step": 6071 }, { "epoch": 2.7625113739763423, "grad_norm": 0.5912265938864777, "learning_rate": 4.18044045095002e-07, "loss": 0.0107, "step": 6072 }, { "epoch": 2.762966333030027, "grad_norm": 0.5539834520528519, "learning_rate": 4.1790305196913476e-07, "loss": 0.0097, "step": 6073 }, { "epoch": 2.7634212920837125, "grad_norm": 0.8378330433105612, "learning_rate": 4.177620655518596e-07, "loss": 0.0147, "step": 6074 }, { "epoch": 2.7638762511373978, "grad_norm": 0.8979697781971654, "learning_rate": 4.17621085854697e-07, "loss": 0.0165, "step": 6075 }, { "epoch": 2.7643312101910826, "grad_norm": 1.162176690541826, "learning_rate": 4.174801128891673e-07, "loss": 0.0219, "step": 6076 }, { "epoch": 2.764786169244768, "grad_norm": 0.678280368266539, "learning_rate": 4.1733914666679e-07, "loss": 0.0263, "step": 6077 }, { "epoch": 2.7652411282984533, "grad_norm": 0.8324045486620494, "learning_rate": 4.171981871990844e-07, "loss": 0.0166, "step": 6078 }, { "epoch": 2.765696087352138, "grad_norm": 0.7471418260503979, "learning_rate": 4.1705723449756905e-07, "loss": 0.017, "step": 6079 }, { "epoch": 2.7661510464058234, "grad_norm": 0.729367525293249, "learning_rate": 4.1691628857376173e-07, "loss": 0.0146, "step": 6080 }, { "epoch": 2.7666060054595087, "grad_norm": 0.8917182424642869, "learning_rate": 4.1677534943918025e-07, "loss": 0.0134, "step": 6081 }, { "epoch": 2.7670609645131936, "grad_norm": 0.5917812108843689, "learning_rate": 4.166344171053414e-07, "loss": 0.0179, "step": 6082 }, { "epoch": 2.767515923566879, "grad_norm": 0.813275595892062, "learning_rate": 4.164934915837615e-07, "loss": 0.0172, "step": 6083 }, { "epoch": 2.7679708826205642, "grad_norm": 0.6177217755687727, "learning_rate": 4.1635257288595634e-07, "loss": 0.0144, "step": 6084 }, { "epoch": 2.768425841674249, "grad_norm": 1.4742921009810046, "learning_rate": 4.1621166102344125e-07, "loss": 0.0243, "step": 6085 }, { "epoch": 2.7688808007279344, "grad_norm": 1.0708324578104742, "learning_rate": 4.1607075600773075e-07, "loss": 0.019, "step": 6086 }, { "epoch": 2.7693357597816197, "grad_norm": 0.9988356217346185, "learning_rate": 4.159298578503389e-07, "loss": 0.0267, "step": 6087 }, { "epoch": 2.7697907188353046, "grad_norm": 0.5388519860676899, "learning_rate": 4.157889665627796e-07, "loss": 0.0101, "step": 6088 }, { "epoch": 2.77024567788899, "grad_norm": 0.49947325813912513, "learning_rate": 4.1564808215656565e-07, "loss": 0.0114, "step": 6089 }, { "epoch": 2.770700636942675, "grad_norm": 0.8433318221783965, "learning_rate": 4.155072046432095e-07, "loss": 0.0273, "step": 6090 }, { "epoch": 2.77115559599636, "grad_norm": 0.9102215914813515, "learning_rate": 4.1536633403422297e-07, "loss": 0.0134, "step": 6091 }, { "epoch": 2.7716105550500454, "grad_norm": 0.9913767047336763, "learning_rate": 4.1522547034111756e-07, "loss": 0.0243, "step": 6092 }, { "epoch": 2.7720655141037307, "grad_norm": 1.0522314370404253, "learning_rate": 4.150846135754037e-07, "loss": 0.0171, "step": 6093 }, { "epoch": 2.7725204731574156, "grad_norm": 1.6499355474781225, "learning_rate": 4.149437637485917e-07, "loss": 0.0088, "step": 6094 }, { "epoch": 2.772975432211101, "grad_norm": 0.9020371753360936, "learning_rate": 4.148029208721914e-07, "loss": 0.0137, "step": 6095 }, { "epoch": 2.773430391264786, "grad_norm": 0.6461616972961939, "learning_rate": 4.1466208495771154e-07, "loss": 0.0113, "step": 6096 }, { "epoch": 2.7738853503184715, "grad_norm": 1.1497359971115062, "learning_rate": 4.145212560166608e-07, "loss": 0.0217, "step": 6097 }, { "epoch": 2.7743403093721564, "grad_norm": 0.9109086089876007, "learning_rate": 4.1438043406054694e-07, "loss": 0.0129, "step": 6098 }, { "epoch": 2.7747952684258417, "grad_norm": 0.9876330694731078, "learning_rate": 4.1423961910087747e-07, "loss": 0.0137, "step": 6099 }, { "epoch": 2.775250227479527, "grad_norm": 1.0655927087083374, "learning_rate": 4.140988111491589e-07, "loss": 0.0285, "step": 6100 }, { "epoch": 2.775705186533212, "grad_norm": 0.7994555821665706, "learning_rate": 4.139580102168974e-07, "loss": 0.0201, "step": 6101 }, { "epoch": 2.776160145586897, "grad_norm": 0.7307176742194652, "learning_rate": 4.13817216315599e-07, "loss": 0.0202, "step": 6102 }, { "epoch": 2.7766151046405825, "grad_norm": 1.005332970198124, "learning_rate": 4.136764294567684e-07, "loss": 0.0256, "step": 6103 }, { "epoch": 2.777070063694268, "grad_norm": 0.9662644666653913, "learning_rate": 4.1353564965191025e-07, "loss": 0.027, "step": 6104 }, { "epoch": 2.7775250227479527, "grad_norm": 0.7990040908887478, "learning_rate": 4.1339487691252826e-07, "loss": 0.0243, "step": 6105 }, { "epoch": 2.777979981801638, "grad_norm": 0.9084129179387722, "learning_rate": 4.1325411125012587e-07, "loss": 0.0266, "step": 6106 }, { "epoch": 2.7784349408553233, "grad_norm": 1.226573919087918, "learning_rate": 4.1311335267620584e-07, "loss": 0.0213, "step": 6107 }, { "epoch": 2.778889899909008, "grad_norm": 0.6135322965156279, "learning_rate": 4.1297260120226986e-07, "loss": 0.0152, "step": 6108 }, { "epoch": 2.7793448589626935, "grad_norm": 0.9494388185852084, "learning_rate": 4.1283185683982023e-07, "loss": 0.0191, "step": 6109 }, { "epoch": 2.7797998180163788, "grad_norm": 0.7308214461395155, "learning_rate": 4.1269111960035764e-07, "loss": 0.0077, "step": 6110 }, { "epoch": 2.7802547770700636, "grad_norm": 1.7380903756970871, "learning_rate": 4.1255038949538236e-07, "loss": 0.0387, "step": 6111 }, { "epoch": 2.780709736123749, "grad_norm": 0.6459755984155481, "learning_rate": 4.1240966653639444e-07, "loss": 0.018, "step": 6112 }, { "epoch": 2.7811646951774343, "grad_norm": 0.7526197103613147, "learning_rate": 4.1226895073489285e-07, "loss": 0.0208, "step": 6113 }, { "epoch": 2.781619654231119, "grad_norm": 0.6012727626458111, "learning_rate": 4.1212824210237664e-07, "loss": 0.0136, "step": 6114 }, { "epoch": 2.7820746132848044, "grad_norm": 0.7415083948908003, "learning_rate": 4.1198754065034337e-07, "loss": 0.0102, "step": 6115 }, { "epoch": 2.7825295723384897, "grad_norm": 0.8828783423686993, "learning_rate": 4.1184684639029094e-07, "loss": 0.0145, "step": 6116 }, { "epoch": 2.7829845313921746, "grad_norm": 0.8031708647026076, "learning_rate": 4.117061593337162e-07, "loss": 0.0145, "step": 6117 }, { "epoch": 2.78343949044586, "grad_norm": 0.6550166977167122, "learning_rate": 4.115654794921153e-07, "loss": 0.008, "step": 6118 }, { "epoch": 2.7838944494995452, "grad_norm": 0.5582015770240302, "learning_rate": 4.114248068769842e-07, "loss": 0.018, "step": 6119 }, { "epoch": 2.78434940855323, "grad_norm": 0.554033630507376, "learning_rate": 4.1128414149981773e-07, "loss": 0.0112, "step": 6120 }, { "epoch": 2.7848043676069154, "grad_norm": 0.6783827926145986, "learning_rate": 4.111434833721107e-07, "loss": 0.0139, "step": 6121 }, { "epoch": 2.7852593266606007, "grad_norm": 0.5772196816155861, "learning_rate": 4.110028325053567e-07, "loss": 0.0122, "step": 6122 }, { "epoch": 2.7857142857142856, "grad_norm": 0.922081533874196, "learning_rate": 4.1086218891104943e-07, "loss": 0.013, "step": 6123 }, { "epoch": 2.786169244767971, "grad_norm": 1.0551241735528216, "learning_rate": 4.107215526006817e-07, "loss": 0.034, "step": 6124 }, { "epoch": 2.786624203821656, "grad_norm": 0.8187750498388184, "learning_rate": 4.105809235857454e-07, "loss": 0.0105, "step": 6125 }, { "epoch": 2.787079162875341, "grad_norm": 0.677959797579283, "learning_rate": 4.104403018777323e-07, "loss": 0.0094, "step": 6126 }, { "epoch": 2.7875341219290264, "grad_norm": 0.741354547014316, "learning_rate": 4.102996874881331e-07, "loss": 0.0122, "step": 6127 }, { "epoch": 2.7879890809827117, "grad_norm": 0.8732393629973759, "learning_rate": 4.1015908042843854e-07, "loss": 0.0241, "step": 6128 }, { "epoch": 2.7884440400363966, "grad_norm": 0.9055786322744693, "learning_rate": 4.1001848071013803e-07, "loss": 0.0136, "step": 6129 }, { "epoch": 2.788898999090082, "grad_norm": 0.8304568466262051, "learning_rate": 4.0987788834472095e-07, "loss": 0.0208, "step": 6130 }, { "epoch": 2.789353958143767, "grad_norm": 0.5827818710785857, "learning_rate": 4.0973730334367597e-07, "loss": 0.0089, "step": 6131 }, { "epoch": 2.789808917197452, "grad_norm": 0.7683588266129979, "learning_rate": 4.095967257184908e-07, "loss": 0.0229, "step": 6132 }, { "epoch": 2.7902638762511374, "grad_norm": 0.9224297348207268, "learning_rate": 4.094561554806532e-07, "loss": 0.0212, "step": 6133 }, { "epoch": 2.7907188353048227, "grad_norm": 4.543658214072113, "learning_rate": 4.093155926416494e-07, "loss": 0.0813, "step": 6134 }, { "epoch": 2.7911737943585075, "grad_norm": 0.8944860309972668, "learning_rate": 4.09175037212966e-07, "loss": 0.0138, "step": 6135 }, { "epoch": 2.791628753412193, "grad_norm": 1.1701885283722704, "learning_rate": 4.0903448920608823e-07, "loss": 0.0239, "step": 6136 }, { "epoch": 2.792083712465878, "grad_norm": 1.0052730329956236, "learning_rate": 4.0889394863250124e-07, "loss": 0.0158, "step": 6137 }, { "epoch": 2.792538671519563, "grad_norm": 0.731823692241152, "learning_rate": 4.087534155036895e-07, "loss": 0.0133, "step": 6138 }, { "epoch": 2.7929936305732483, "grad_norm": 0.7957605362186514, "learning_rate": 4.086128898311365e-07, "loss": 0.021, "step": 6139 }, { "epoch": 2.7934485896269337, "grad_norm": 0.6925788002738767, "learning_rate": 4.084723716263255e-07, "loss": 0.014, "step": 6140 }, { "epoch": 2.7939035486806185, "grad_norm": 0.5734307777557607, "learning_rate": 4.083318609007389e-07, "loss": 0.0137, "step": 6141 }, { "epoch": 2.794358507734304, "grad_norm": 1.0908774157755357, "learning_rate": 4.081913576658587e-07, "loss": 0.0183, "step": 6142 }, { "epoch": 2.794813466787989, "grad_norm": 0.7844980932861246, "learning_rate": 4.0805086193316593e-07, "loss": 0.0252, "step": 6143 }, { "epoch": 2.795268425841674, "grad_norm": 0.7289251541724276, "learning_rate": 4.0791037371414166e-07, "loss": 0.0216, "step": 6144 }, { "epoch": 2.7957233848953593, "grad_norm": 0.744653480460369, "learning_rate": 4.0776989302026583e-07, "loss": 0.0174, "step": 6145 }, { "epoch": 2.7961783439490446, "grad_norm": 0.8740211911364146, "learning_rate": 4.0762941986301785e-07, "loss": 0.0148, "step": 6146 }, { "epoch": 2.7966333030027295, "grad_norm": 0.6456995719908187, "learning_rate": 4.074889542538764e-07, "loss": 0.0162, "step": 6147 }, { "epoch": 2.797088262056415, "grad_norm": 1.215931151255361, "learning_rate": 4.0734849620431997e-07, "loss": 0.0225, "step": 6148 }, { "epoch": 2.7975432211101, "grad_norm": 0.6627101499876911, "learning_rate": 4.07208045725826e-07, "loss": 0.0159, "step": 6149 }, { "epoch": 2.797998180163785, "grad_norm": 0.5207798883795399, "learning_rate": 4.0706760282987125e-07, "loss": 0.0123, "step": 6150 }, { "epoch": 2.7984531392174703, "grad_norm": 0.6264350490199069, "learning_rate": 4.0692716752793255e-07, "loss": 0.0077, "step": 6151 }, { "epoch": 2.7989080982711556, "grad_norm": 0.9045794930163236, "learning_rate": 4.067867398314853e-07, "loss": 0.0378, "step": 6152 }, { "epoch": 2.799363057324841, "grad_norm": 0.7308230909375445, "learning_rate": 4.066463197520049e-07, "loss": 0.0168, "step": 6153 }, { "epoch": 2.799818016378526, "grad_norm": 0.408341339899514, "learning_rate": 4.065059073009655e-07, "loss": 0.0064, "step": 6154 }, { "epoch": 2.800272975432211, "grad_norm": 0.5416734079339307, "learning_rate": 4.0636550248984127e-07, "loss": 0.0066, "step": 6155 }, { "epoch": 2.8007279344858964, "grad_norm": 0.78864000336426, "learning_rate": 4.0622510533010524e-07, "loss": 0.0198, "step": 6156 }, { "epoch": 2.8011828935395813, "grad_norm": 0.7781852643486489, "learning_rate": 4.0608471583322996e-07, "loss": 0.0136, "step": 6157 }, { "epoch": 2.8016378525932666, "grad_norm": 0.8230885443070974, "learning_rate": 4.0594433401068785e-07, "loss": 0.0144, "step": 6158 }, { "epoch": 2.802092811646952, "grad_norm": 0.7550904300965503, "learning_rate": 4.058039598739498e-07, "loss": 0.021, "step": 6159 }, { "epoch": 2.802547770700637, "grad_norm": 0.8629831381009629, "learning_rate": 4.05663593434487e-07, "loss": 0.0183, "step": 6160 }, { "epoch": 2.803002729754322, "grad_norm": 0.49088361184081214, "learning_rate": 4.0552323470376916e-07, "loss": 0.018, "step": 6161 }, { "epoch": 2.8034576888080074, "grad_norm": 0.7946614439675487, "learning_rate": 4.05382883693266e-07, "loss": 0.0126, "step": 6162 }, { "epoch": 2.8039126478616927, "grad_norm": 0.7555157097511878, "learning_rate": 4.052425404144463e-07, "loss": 0.016, "step": 6163 }, { "epoch": 2.8043676069153776, "grad_norm": 0.9816258978985288, "learning_rate": 4.0510220487877804e-07, "loss": 0.0194, "step": 6164 }, { "epoch": 2.804822565969063, "grad_norm": 1.146667409502149, "learning_rate": 4.049618770977293e-07, "loss": 0.0231, "step": 6165 }, { "epoch": 2.805277525022748, "grad_norm": 0.5859765469085193, "learning_rate": 4.0482155708276674e-07, "loss": 0.0077, "step": 6166 }, { "epoch": 2.805732484076433, "grad_norm": 1.0118867076271176, "learning_rate": 4.046812448453568e-07, "loss": 0.0194, "step": 6167 }, { "epoch": 2.8061874431301184, "grad_norm": 1.3123239261674606, "learning_rate": 4.045409403969649e-07, "loss": 0.0149, "step": 6168 }, { "epoch": 2.8066424021838037, "grad_norm": 0.6292849178199333, "learning_rate": 4.0440064374905643e-07, "loss": 0.0121, "step": 6169 }, { "epoch": 2.8070973612374885, "grad_norm": 0.9398865071112946, "learning_rate": 4.0426035491309544e-07, "loss": 0.0327, "step": 6170 }, { "epoch": 2.807552320291174, "grad_norm": 0.6450021657703245, "learning_rate": 4.041200739005459e-07, "loss": 0.0078, "step": 6171 }, { "epoch": 2.808007279344859, "grad_norm": 0.675238206688403, "learning_rate": 4.03979800722871e-07, "loss": 0.01, "step": 6172 }, { "epoch": 2.808462238398544, "grad_norm": 0.8268679789264717, "learning_rate": 4.0383953539153315e-07, "loss": 0.0256, "step": 6173 }, { "epoch": 2.8089171974522293, "grad_norm": 0.722785286222672, "learning_rate": 4.0369927791799437e-07, "loss": 0.0204, "step": 6174 }, { "epoch": 2.8093721565059147, "grad_norm": 0.502927905253051, "learning_rate": 4.035590283137155e-07, "loss": 0.0084, "step": 6175 }, { "epoch": 2.8098271155595995, "grad_norm": 0.7771445240957574, "learning_rate": 4.034187865901575e-07, "loss": 0.0119, "step": 6176 }, { "epoch": 2.810282074613285, "grad_norm": 1.2541160513715353, "learning_rate": 4.0327855275878004e-07, "loss": 0.018, "step": 6177 }, { "epoch": 2.81073703366697, "grad_norm": 0.7314648998603132, "learning_rate": 4.0313832683104214e-07, "loss": 0.0179, "step": 6178 }, { "epoch": 2.811191992720655, "grad_norm": 1.0035617771256546, "learning_rate": 4.029981088184031e-07, "loss": 0.0154, "step": 6179 }, { "epoch": 2.8116469517743403, "grad_norm": 1.1264809977478325, "learning_rate": 4.028578987323205e-07, "loss": 0.0185, "step": 6180 }, { "epoch": 2.8121019108280256, "grad_norm": 0.7964157016051943, "learning_rate": 4.027176965842517e-07, "loss": 0.0154, "step": 6181 }, { "epoch": 2.8125568698817105, "grad_norm": 0.7391237420416684, "learning_rate": 4.0257750238565345e-07, "loss": 0.0239, "step": 6182 }, { "epoch": 2.813011828935396, "grad_norm": 0.8368573116012692, "learning_rate": 4.0243731614798163e-07, "loss": 0.0212, "step": 6183 }, { "epoch": 2.813466787989081, "grad_norm": 1.115635371707263, "learning_rate": 4.022971378826919e-07, "loss": 0.0124, "step": 6184 }, { "epoch": 2.813921747042766, "grad_norm": 0.5554434769563509, "learning_rate": 4.021569676012386e-07, "loss": 0.0106, "step": 6185 }, { "epoch": 2.8143767060964513, "grad_norm": 1.3039764387366155, "learning_rate": 4.020168053150762e-07, "loss": 0.018, "step": 6186 }, { "epoch": 2.8148316651501366, "grad_norm": 1.1554131716952585, "learning_rate": 4.018766510356581e-07, "loss": 0.0228, "step": 6187 }, { "epoch": 2.8152866242038215, "grad_norm": 0.8096364695696807, "learning_rate": 4.017365047744368e-07, "loss": 0.0161, "step": 6188 }, { "epoch": 2.815741583257507, "grad_norm": 0.9909268029637308, "learning_rate": 4.015963665428647e-07, "loss": 0.0154, "step": 6189 }, { "epoch": 2.816196542311192, "grad_norm": 0.8006895815071298, "learning_rate": 4.01456236352393e-07, "loss": 0.0295, "step": 6190 }, { "epoch": 2.816651501364877, "grad_norm": 0.7724505635329335, "learning_rate": 4.013161142144728e-07, "loss": 0.0239, "step": 6191 }, { "epoch": 2.8171064604185623, "grad_norm": 0.7913918697637956, "learning_rate": 4.011760001405539e-07, "loss": 0.012, "step": 6192 }, { "epoch": 2.8175614194722476, "grad_norm": 0.7495642755682572, "learning_rate": 4.01035894142086e-07, "loss": 0.0141, "step": 6193 }, { "epoch": 2.8180163785259325, "grad_norm": 0.6552235007349881, "learning_rate": 4.0089579623051805e-07, "loss": 0.0132, "step": 6194 }, { "epoch": 2.8184713375796178, "grad_norm": 0.7557226379878355, "learning_rate": 4.00755706417298e-07, "loss": 0.0235, "step": 6195 }, { "epoch": 2.818926296633303, "grad_norm": 1.2505307891303572, "learning_rate": 4.0061562471387355e-07, "loss": 0.0153, "step": 6196 }, { "epoch": 2.819381255686988, "grad_norm": 0.8052574694302723, "learning_rate": 4.004755511316913e-07, "loss": 0.0234, "step": 6197 }, { "epoch": 2.8198362147406733, "grad_norm": 0.6587260396381147, "learning_rate": 4.003354856821977e-07, "loss": 0.0147, "step": 6198 }, { "epoch": 2.8202911737943586, "grad_norm": 0.5173251066210995, "learning_rate": 4.001954283768379e-07, "loss": 0.0129, "step": 6199 }, { "epoch": 2.8207461328480434, "grad_norm": 0.9745909347528467, "learning_rate": 4.000553792270571e-07, "loss": 0.0219, "step": 6200 }, { "epoch": 2.8212010919017287, "grad_norm": 0.8438480934965993, "learning_rate": 3.9991533824429947e-07, "loss": 0.0173, "step": 6201 }, { "epoch": 2.821656050955414, "grad_norm": 1.3276392553064427, "learning_rate": 3.9977530544000827e-07, "loss": 0.0206, "step": 6202 }, { "epoch": 2.822111010009099, "grad_norm": 0.7181798518867347, "learning_rate": 3.9963528082562665e-07, "loss": 0.0088, "step": 6203 }, { "epoch": 2.8225659690627842, "grad_norm": 0.744480093178374, "learning_rate": 3.994952644125965e-07, "loss": 0.0172, "step": 6204 }, { "epoch": 2.8230209281164695, "grad_norm": 0.6764896126855247, "learning_rate": 3.993552562123596e-07, "loss": 0.0131, "step": 6205 }, { "epoch": 2.823475887170155, "grad_norm": 0.7901382143677593, "learning_rate": 3.9921525623635644e-07, "loss": 0.0139, "step": 6206 }, { "epoch": 2.8239308462238397, "grad_norm": 0.6276550461850291, "learning_rate": 3.9907526449602744e-07, "loss": 0.016, "step": 6207 }, { "epoch": 2.824385805277525, "grad_norm": 1.4594821532175843, "learning_rate": 3.989352810028123e-07, "loss": 0.0353, "step": 6208 }, { "epoch": 2.8248407643312103, "grad_norm": 0.5550647099506362, "learning_rate": 3.9879530576814936e-07, "loss": 0.015, "step": 6209 }, { "epoch": 2.825295723384895, "grad_norm": 1.1358354287431716, "learning_rate": 3.9865533880347716e-07, "loss": 0.0115, "step": 6210 }, { "epoch": 2.8257506824385805, "grad_norm": 0.9490389899957304, "learning_rate": 3.985153801202329e-07, "loss": 0.0247, "step": 6211 }, { "epoch": 2.826205641492266, "grad_norm": 0.8946237280727013, "learning_rate": 3.9837542972985355e-07, "loss": 0.0151, "step": 6212 }, { "epoch": 2.826660600545951, "grad_norm": 0.5814306512254432, "learning_rate": 3.9823548764377494e-07, "loss": 0.0137, "step": 6213 }, { "epoch": 2.827115559599636, "grad_norm": 0.7131018927554129, "learning_rate": 3.9809555387343286e-07, "loss": 0.0195, "step": 6214 }, { "epoch": 2.8275705186533213, "grad_norm": 0.7099578764961963, "learning_rate": 3.97955628430262e-07, "loss": 0.0219, "step": 6215 }, { "epoch": 2.8280254777070066, "grad_norm": 0.24759091235981734, "learning_rate": 3.9781571132569635e-07, "loss": 0.0014, "step": 6216 }, { "epoch": 2.8284804367606915, "grad_norm": 0.7319756744357938, "learning_rate": 3.976758025711692e-07, "loss": 0.0164, "step": 6217 }, { "epoch": 2.828935395814377, "grad_norm": 0.5928219843667742, "learning_rate": 3.975359021781135e-07, "loss": 0.0149, "step": 6218 }, { "epoch": 2.829390354868062, "grad_norm": 0.7274073805611578, "learning_rate": 3.9739601015796103e-07, "loss": 0.0178, "step": 6219 }, { "epoch": 2.829845313921747, "grad_norm": 0.9236003802727716, "learning_rate": 3.9725612652214316e-07, "loss": 0.0155, "step": 6220 }, { "epoch": 2.8303002729754323, "grad_norm": 1.3384003193175293, "learning_rate": 3.971162512820909e-07, "loss": 0.0176, "step": 6221 }, { "epoch": 2.8307552320291176, "grad_norm": 0.7112337038067276, "learning_rate": 3.969763844492338e-07, "loss": 0.0156, "step": 6222 }, { "epoch": 2.8312101910828025, "grad_norm": 0.7287601392028438, "learning_rate": 3.968365260350014e-07, "loss": 0.0211, "step": 6223 }, { "epoch": 2.831665150136488, "grad_norm": 0.7472178180394241, "learning_rate": 3.9669667605082215e-07, "loss": 0.0265, "step": 6224 }, { "epoch": 2.832120109190173, "grad_norm": 1.2358723543078451, "learning_rate": 3.965568345081241e-07, "loss": 0.0156, "step": 6225 }, { "epoch": 2.832575068243858, "grad_norm": 0.8117115782575539, "learning_rate": 3.964170014183343e-07, "loss": 0.0214, "step": 6226 }, { "epoch": 2.8330300272975433, "grad_norm": 0.6167790047994386, "learning_rate": 3.9627717679287924e-07, "loss": 0.0088, "step": 6227 }, { "epoch": 2.8334849863512286, "grad_norm": 0.5380746910890171, "learning_rate": 3.9613736064318516e-07, "loss": 0.0095, "step": 6228 }, { "epoch": 2.8339399454049135, "grad_norm": 0.5162781508065787, "learning_rate": 3.959975529806767e-07, "loss": 0.0114, "step": 6229 }, { "epoch": 2.8343949044585988, "grad_norm": 0.8789215895504555, "learning_rate": 3.958577538167788e-07, "loss": 0.0171, "step": 6230 }, { "epoch": 2.834849863512284, "grad_norm": 0.6829993490247251, "learning_rate": 3.9571796316291474e-07, "loss": 0.0171, "step": 6231 }, { "epoch": 2.835304822565969, "grad_norm": 0.8202536721935779, "learning_rate": 3.9557818103050787e-07, "loss": 0.0208, "step": 6232 }, { "epoch": 2.8357597816196543, "grad_norm": 0.6855656032688393, "learning_rate": 3.9543840743098047e-07, "loss": 0.0112, "step": 6233 }, { "epoch": 2.8362147406733396, "grad_norm": 0.9030174339872246, "learning_rate": 3.9529864237575403e-07, "loss": 0.0222, "step": 6234 }, { "epoch": 2.8366696997270244, "grad_norm": 1.797076171344948, "learning_rate": 3.9515888587625e-07, "loss": 0.0259, "step": 6235 }, { "epoch": 2.8371246587807097, "grad_norm": 0.8981190229502053, "learning_rate": 3.9501913794388825e-07, "loss": 0.0176, "step": 6236 }, { "epoch": 2.837579617834395, "grad_norm": 0.5298782066742141, "learning_rate": 3.948793985900885e-07, "loss": 0.0093, "step": 6237 }, { "epoch": 2.83803457688808, "grad_norm": 0.6551769663417474, "learning_rate": 3.9473966782626953e-07, "loss": 0.0233, "step": 6238 }, { "epoch": 2.8384895359417652, "grad_norm": 0.7058697479153647, "learning_rate": 3.9459994566384964e-07, "loss": 0.021, "step": 6239 }, { "epoch": 2.8389444949954505, "grad_norm": 0.8343136835127896, "learning_rate": 3.9446023211424605e-07, "loss": 0.0284, "step": 6240 }, { "epoch": 2.8393994540491354, "grad_norm": 1.0394862457503613, "learning_rate": 3.9432052718887565e-07, "loss": 0.0176, "step": 6241 }, { "epoch": 2.8398544131028207, "grad_norm": 0.8179622252152435, "learning_rate": 3.941808308991547e-07, "loss": 0.0208, "step": 6242 }, { "epoch": 2.840309372156506, "grad_norm": 0.9444975100599384, "learning_rate": 3.9404114325649827e-07, "loss": 0.0157, "step": 6243 }, { "epoch": 2.840764331210191, "grad_norm": 0.7097880518858299, "learning_rate": 3.939014642723212e-07, "loss": 0.0144, "step": 6244 }, { "epoch": 2.841219290263876, "grad_norm": 0.8063516460670189, "learning_rate": 3.9376179395803724e-07, "loss": 0.0121, "step": 6245 }, { "epoch": 2.8416742493175615, "grad_norm": 0.6317525613867199, "learning_rate": 3.9362213232505986e-07, "loss": 0.0125, "step": 6246 }, { "epoch": 2.8421292083712464, "grad_norm": 0.8099936008745146, "learning_rate": 3.934824793848013e-07, "loss": 0.0241, "step": 6247 }, { "epoch": 2.8425841674249317, "grad_norm": 1.313881550579524, "learning_rate": 3.9334283514867335e-07, "loss": 0.0217, "step": 6248 }, { "epoch": 2.843039126478617, "grad_norm": 0.9422621896963203, "learning_rate": 3.9320319962808744e-07, "loss": 0.0244, "step": 6249 }, { "epoch": 2.843494085532302, "grad_norm": 0.8769302771736551, "learning_rate": 3.9306357283445375e-07, "loss": 0.0157, "step": 6250 }, { "epoch": 2.843949044585987, "grad_norm": 0.8779445261861997, "learning_rate": 3.92923954779182e-07, "loss": 0.0098, "step": 6251 }, { "epoch": 2.8444040036396725, "grad_norm": 0.6383148941888903, "learning_rate": 3.927843454736812e-07, "loss": 0.0115, "step": 6252 }, { "epoch": 2.8448589626933574, "grad_norm": 0.8082660490825607, "learning_rate": 3.9264474492935926e-07, "loss": 0.0314, "step": 6253 }, { "epoch": 2.8453139217470427, "grad_norm": 0.6854981552047047, "learning_rate": 3.9250515315762415e-07, "loss": 0.0208, "step": 6254 }, { "epoch": 2.845768880800728, "grad_norm": 0.9235880334322779, "learning_rate": 3.9236557016988223e-07, "loss": 0.0227, "step": 6255 }, { "epoch": 2.846223839854413, "grad_norm": 0.7462097498801297, "learning_rate": 3.9222599597754e-07, "loss": 0.021, "step": 6256 }, { "epoch": 2.846678798908098, "grad_norm": 0.7347545275423888, "learning_rate": 3.920864305920028e-07, "loss": 0.0235, "step": 6257 }, { "epoch": 2.8471337579617835, "grad_norm": 0.5159827061408094, "learning_rate": 3.91946874024675e-07, "loss": 0.0095, "step": 6258 }, { "epoch": 2.8475887170154683, "grad_norm": 0.6541639147115367, "learning_rate": 3.9180732628696084e-07, "loss": 0.0079, "step": 6259 }, { "epoch": 2.8480436760691537, "grad_norm": 0.755198285588628, "learning_rate": 3.9166778739026326e-07, "loss": 0.0219, "step": 6260 }, { "epoch": 2.848498635122839, "grad_norm": 0.8409593807527084, "learning_rate": 3.9152825734598494e-07, "loss": 0.026, "step": 6261 }, { "epoch": 2.8489535941765243, "grad_norm": 0.6048955626911681, "learning_rate": 3.9138873616552737e-07, "loss": 0.0142, "step": 6262 }, { "epoch": 2.849408553230209, "grad_norm": 0.6235933872370516, "learning_rate": 3.912492238602919e-07, "loss": 0.0087, "step": 6263 }, { "epoch": 2.8498635122838945, "grad_norm": 1.386745869485917, "learning_rate": 3.911097204416789e-07, "loss": 0.0563, "step": 6264 }, { "epoch": 2.8503184713375798, "grad_norm": 0.7118089539916755, "learning_rate": 3.9097022592108764e-07, "loss": 0.0189, "step": 6265 }, { "epoch": 2.8507734303912646, "grad_norm": 0.7527418282236723, "learning_rate": 3.908307403099173e-07, "loss": 0.0109, "step": 6266 }, { "epoch": 2.85122838944495, "grad_norm": 0.6669985389899314, "learning_rate": 3.9069126361956573e-07, "loss": 0.0211, "step": 6267 }, { "epoch": 2.8516833484986353, "grad_norm": 0.8622830157248027, "learning_rate": 3.9055179586143056e-07, "loss": 0.0252, "step": 6268 }, { "epoch": 2.8521383075523206, "grad_norm": 0.9490395537479442, "learning_rate": 3.904123370469081e-07, "loss": 0.0182, "step": 6269 }, { "epoch": 2.8525932666060054, "grad_norm": 0.8000706641620633, "learning_rate": 3.9027288718739477e-07, "loss": 0.0151, "step": 6270 }, { "epoch": 2.8530482256596907, "grad_norm": 0.8936584337011958, "learning_rate": 3.9013344629428563e-07, "loss": 0.0247, "step": 6271 }, { "epoch": 2.853503184713376, "grad_norm": 0.9545200168517549, "learning_rate": 3.8999401437897505e-07, "loss": 0.0275, "step": 6272 }, { "epoch": 2.853958143767061, "grad_norm": 0.734968676268427, "learning_rate": 3.898545914528569e-07, "loss": 0.0108, "step": 6273 }, { "epoch": 2.8544131028207462, "grad_norm": 0.6598642071833455, "learning_rate": 3.8971517752732396e-07, "loss": 0.0178, "step": 6274 }, { "epoch": 2.8548680618744315, "grad_norm": 0.8270211656952016, "learning_rate": 3.8957577261376885e-07, "loss": 0.0282, "step": 6275 }, { "epoch": 2.8553230209281164, "grad_norm": 0.7333683677894628, "learning_rate": 3.894363767235827e-07, "loss": 0.0248, "step": 6276 }, { "epoch": 2.8557779799818017, "grad_norm": 0.9194006447283196, "learning_rate": 3.892969898681567e-07, "loss": 0.0158, "step": 6277 }, { "epoch": 2.856232939035487, "grad_norm": 1.1507014272286562, "learning_rate": 3.8915761205888075e-07, "loss": 0.028, "step": 6278 }, { "epoch": 2.856687898089172, "grad_norm": 0.5788666482365726, "learning_rate": 3.8901824330714416e-07, "loss": 0.0126, "step": 6279 }, { "epoch": 2.857142857142857, "grad_norm": 0.9080263041804181, "learning_rate": 3.8887888362433565e-07, "loss": 0.0278, "step": 6280 }, { "epoch": 2.8575978161965425, "grad_norm": 0.8393524180203786, "learning_rate": 3.8873953302184283e-07, "loss": 0.0277, "step": 6281 }, { "epoch": 2.8580527752502274, "grad_norm": 0.9292336719678133, "learning_rate": 3.88600191511053e-07, "loss": 0.035, "step": 6282 }, { "epoch": 2.8585077343039127, "grad_norm": 3.6550598206909504, "learning_rate": 3.884608591033522e-07, "loss": 0.0312, "step": 6283 }, { "epoch": 2.858962693357598, "grad_norm": 0.6801928951073362, "learning_rate": 3.883215358101265e-07, "loss": 0.0095, "step": 6284 }, { "epoch": 2.859417652411283, "grad_norm": 1.359579745911512, "learning_rate": 3.881822216427606e-07, "loss": 0.0193, "step": 6285 }, { "epoch": 2.859872611464968, "grad_norm": 0.7207321642074836, "learning_rate": 3.880429166126385e-07, "loss": 0.0117, "step": 6286 }, { "epoch": 2.8603275705186535, "grad_norm": 1.6743156539283472, "learning_rate": 3.8790362073114373e-07, "loss": 0.0298, "step": 6287 }, { "epoch": 2.8607825295723384, "grad_norm": 0.7217267875620001, "learning_rate": 3.877643340096589e-07, "loss": 0.0224, "step": 6288 }, { "epoch": 2.8612374886260237, "grad_norm": 0.7210729911083525, "learning_rate": 3.876250564595657e-07, "loss": 0.0109, "step": 6289 }, { "epoch": 2.861692447679709, "grad_norm": 0.41673740430986583, "learning_rate": 3.874857880922453e-07, "loss": 0.0061, "step": 6290 }, { "epoch": 2.862147406733394, "grad_norm": 0.7263261114213082, "learning_rate": 3.8734652891907844e-07, "loss": 0.0198, "step": 6291 }, { "epoch": 2.862602365787079, "grad_norm": 0.7827745500515234, "learning_rate": 3.8720727895144433e-07, "loss": 0.0184, "step": 6292 }, { "epoch": 2.8630573248407645, "grad_norm": 0.7038830960521468, "learning_rate": 3.870680382007222e-07, "loss": 0.0169, "step": 6293 }, { "epoch": 2.8635122838944493, "grad_norm": 0.9018252065044596, "learning_rate": 3.869288066782898e-07, "loss": 0.0106, "step": 6294 }, { "epoch": 2.8639672429481347, "grad_norm": 0.8507835190066197, "learning_rate": 3.8678958439552485e-07, "loss": 0.0222, "step": 6295 }, { "epoch": 2.86442220200182, "grad_norm": 0.5711883975159788, "learning_rate": 3.866503713638037e-07, "loss": 0.0119, "step": 6296 }, { "epoch": 2.864877161055505, "grad_norm": 1.0148664816170188, "learning_rate": 3.865111675945022e-07, "loss": 0.0358, "step": 6297 }, { "epoch": 2.86533212010919, "grad_norm": 0.9460938958203483, "learning_rate": 3.8637197309899575e-07, "loss": 0.0255, "step": 6298 }, { "epoch": 2.8657870791628755, "grad_norm": 0.8313951304192545, "learning_rate": 3.8623278788865846e-07, "loss": 0.0115, "step": 6299 }, { "epoch": 2.8662420382165603, "grad_norm": 0.9117405107537293, "learning_rate": 3.860936119748641e-07, "loss": 0.0147, "step": 6300 }, { "epoch": 2.8666969972702456, "grad_norm": 0.8059157215602422, "learning_rate": 3.8595444536898525e-07, "loss": 0.0173, "step": 6301 }, { "epoch": 2.867151956323931, "grad_norm": 0.7213556183860221, "learning_rate": 3.8581528808239423e-07, "loss": 0.0213, "step": 6302 }, { "epoch": 2.867606915377616, "grad_norm": 1.0797203118515304, "learning_rate": 3.8567614012646204e-07, "loss": 0.0256, "step": 6303 }, { "epoch": 2.868061874431301, "grad_norm": 0.7577819931962838, "learning_rate": 3.855370015125593e-07, "loss": 0.019, "step": 6304 }, { "epoch": 2.8685168334849864, "grad_norm": 1.0197684999681609, "learning_rate": 3.8539787225205615e-07, "loss": 0.0307, "step": 6305 }, { "epoch": 2.8689717925386713, "grad_norm": 0.8188283443701853, "learning_rate": 3.852587523563212e-07, "loss": 0.0192, "step": 6306 }, { "epoch": 2.8694267515923566, "grad_norm": 0.6667434542740999, "learning_rate": 3.85119641836723e-07, "loss": 0.0264, "step": 6307 }, { "epoch": 2.869881710646042, "grad_norm": 0.6650742459568053, "learning_rate": 3.849805407046287e-07, "loss": 0.0186, "step": 6308 }, { "epoch": 2.870336669699727, "grad_norm": 0.8200282709702775, "learning_rate": 3.848414489714053e-07, "loss": 0.0161, "step": 6309 }, { "epoch": 2.870791628753412, "grad_norm": 0.6806619387672953, "learning_rate": 3.847023666484186e-07, "loss": 0.009, "step": 6310 }, { "epoch": 2.8712465878070974, "grad_norm": 0.9266151758109601, "learning_rate": 3.8456329374703365e-07, "loss": 0.0222, "step": 6311 }, { "epoch": 2.8717015468607823, "grad_norm": 0.8522751969600024, "learning_rate": 3.844242302786153e-07, "loss": 0.0308, "step": 6312 }, { "epoch": 2.8721565059144676, "grad_norm": 0.6857413838744517, "learning_rate": 3.842851762545267e-07, "loss": 0.0269, "step": 6313 }, { "epoch": 2.872611464968153, "grad_norm": 1.0052319662152212, "learning_rate": 3.841461316861311e-07, "loss": 0.0283, "step": 6314 }, { "epoch": 2.8730664240218378, "grad_norm": 0.6035655545858868, "learning_rate": 3.8400709658479034e-07, "loss": 0.0086, "step": 6315 }, { "epoch": 2.873521383075523, "grad_norm": 1.021594146268933, "learning_rate": 3.838680709618659e-07, "loss": 0.0503, "step": 6316 }, { "epoch": 2.8739763421292084, "grad_norm": 0.7978649021798191, "learning_rate": 3.837290548287182e-07, "loss": 0.0348, "step": 6317 }, { "epoch": 2.8744313011828937, "grad_norm": 0.9900186836047826, "learning_rate": 3.835900481967069e-07, "loss": 0.0252, "step": 6318 }, { "epoch": 2.8748862602365786, "grad_norm": 1.3127645581145693, "learning_rate": 3.834510510771914e-07, "loss": 0.013, "step": 6319 }, { "epoch": 2.875341219290264, "grad_norm": 1.2168701491479517, "learning_rate": 3.833120634815296e-07, "loss": 0.0306, "step": 6320 }, { "epoch": 2.875796178343949, "grad_norm": 0.6009300092587038, "learning_rate": 3.8317308542107906e-07, "loss": 0.0103, "step": 6321 }, { "epoch": 2.876251137397634, "grad_norm": 1.0249313698555214, "learning_rate": 3.8303411690719644e-07, "loss": 0.018, "step": 6322 }, { "epoch": 2.8767060964513194, "grad_norm": 1.0478315210465234, "learning_rate": 3.828951579512374e-07, "loss": 0.0176, "step": 6323 }, { "epoch": 2.8771610555050047, "grad_norm": 0.6298906718340674, "learning_rate": 3.827562085645574e-07, "loss": 0.0096, "step": 6324 }, { "epoch": 2.87761601455869, "grad_norm": 1.0559711879115254, "learning_rate": 3.826172687585103e-07, "loss": 0.0238, "step": 6325 }, { "epoch": 2.878070973612375, "grad_norm": 0.7317290278227535, "learning_rate": 3.8247833854445004e-07, "loss": 0.0121, "step": 6326 }, { "epoch": 2.87852593266606, "grad_norm": 0.7008688755015636, "learning_rate": 3.8233941793372933e-07, "loss": 0.0149, "step": 6327 }, { "epoch": 2.8789808917197455, "grad_norm": 0.9518107445721634, "learning_rate": 3.822005069376999e-07, "loss": 0.0161, "step": 6328 }, { "epoch": 2.8794358507734303, "grad_norm": 0.7704296766802081, "learning_rate": 3.820616055677132e-07, "loss": 0.0157, "step": 6329 }, { "epoch": 2.8798908098271156, "grad_norm": 0.7413682560193718, "learning_rate": 3.819227138351194e-07, "loss": 0.0157, "step": 6330 }, { "epoch": 2.880345768880801, "grad_norm": 0.9593668246575182, "learning_rate": 3.8178383175126824e-07, "loss": 0.023, "step": 6331 }, { "epoch": 2.880800727934486, "grad_norm": 0.9669122230365854, "learning_rate": 3.816449593275083e-07, "loss": 0.0278, "step": 6332 }, { "epoch": 2.881255686988171, "grad_norm": 0.9725078078541304, "learning_rate": 3.8150609657518795e-07, "loss": 0.0205, "step": 6333 }, { "epoch": 2.8817106460418564, "grad_norm": 0.7574519387582489, "learning_rate": 3.8136724350565435e-07, "loss": 0.0148, "step": 6334 }, { "epoch": 2.8821656050955413, "grad_norm": 0.7956782179113986, "learning_rate": 3.8122840013025376e-07, "loss": 0.0245, "step": 6335 }, { "epoch": 2.8826205641492266, "grad_norm": 1.131394260485898, "learning_rate": 3.8108956646033207e-07, "loss": 0.016, "step": 6336 }, { "epoch": 2.883075523202912, "grad_norm": 0.7070279588984281, "learning_rate": 3.8095074250723385e-07, "loss": 0.0203, "step": 6337 }, { "epoch": 2.883530482256597, "grad_norm": 0.6242706536983027, "learning_rate": 3.8081192828230347e-07, "loss": 0.0172, "step": 6338 }, { "epoch": 2.883985441310282, "grad_norm": 1.2139924741569827, "learning_rate": 3.8067312379688386e-07, "loss": 0.0316, "step": 6339 }, { "epoch": 2.8844404003639674, "grad_norm": 0.6108789443287819, "learning_rate": 3.805343290623178e-07, "loss": 0.017, "step": 6340 }, { "epoch": 2.8848953594176523, "grad_norm": 0.7482961949463262, "learning_rate": 3.80395544089947e-07, "loss": 0.0139, "step": 6341 }, { "epoch": 2.8853503184713376, "grad_norm": 0.6844407968498609, "learning_rate": 3.802567688911121e-07, "loss": 0.0102, "step": 6342 }, { "epoch": 2.885805277525023, "grad_norm": 0.6926989606457623, "learning_rate": 3.8011800347715335e-07, "loss": 0.0156, "step": 6343 }, { "epoch": 2.886260236578708, "grad_norm": 0.9062036913112398, "learning_rate": 3.7997924785940986e-07, "loss": 0.0254, "step": 6344 }, { "epoch": 2.886715195632393, "grad_norm": 1.0558914935268628, "learning_rate": 3.7984050204922036e-07, "loss": 0.0238, "step": 6345 }, { "epoch": 2.8871701546860784, "grad_norm": 0.7619253321871883, "learning_rate": 3.797017660579222e-07, "loss": 0.0149, "step": 6346 }, { "epoch": 2.8876251137397633, "grad_norm": 0.6704456084741063, "learning_rate": 3.7956303989685257e-07, "loss": 0.0199, "step": 6347 }, { "epoch": 2.8880800727934486, "grad_norm": 0.7670357205929623, "learning_rate": 3.794243235773475e-07, "loss": 0.0257, "step": 6348 }, { "epoch": 2.888535031847134, "grad_norm": 0.42546838905994827, "learning_rate": 3.792856171107421e-07, "loss": 0.0083, "step": 6349 }, { "epoch": 2.8889899909008188, "grad_norm": 1.0755423274309426, "learning_rate": 3.7914692050837107e-07, "loss": 0.0189, "step": 6350 }, { "epoch": 2.889444949954504, "grad_norm": 1.0304681213086306, "learning_rate": 3.790082337815678e-07, "loss": 0.0305, "step": 6351 }, { "epoch": 2.8898999090081894, "grad_norm": 0.8381223294188839, "learning_rate": 3.7886955694166536e-07, "loss": 0.0176, "step": 6352 }, { "epoch": 2.8903548680618742, "grad_norm": 0.8229015136466893, "learning_rate": 3.7873088999999545e-07, "loss": 0.0218, "step": 6353 }, { "epoch": 2.8908098271155596, "grad_norm": 0.710180632452183, "learning_rate": 3.7859223296788973e-07, "loss": 0.0132, "step": 6354 }, { "epoch": 2.891264786169245, "grad_norm": 1.541064327596645, "learning_rate": 3.784535858566785e-07, "loss": 0.0291, "step": 6355 }, { "epoch": 2.8917197452229297, "grad_norm": 0.769061283160984, "learning_rate": 3.783149486776913e-07, "loss": 0.0186, "step": 6356 }, { "epoch": 2.892174704276615, "grad_norm": 0.7500674081717927, "learning_rate": 3.781763214422571e-07, "loss": 0.0182, "step": 6357 }, { "epoch": 2.8926296633303004, "grad_norm": 0.5733740205212292, "learning_rate": 3.7803770416170367e-07, "loss": 0.007, "step": 6358 }, { "epoch": 2.8930846223839852, "grad_norm": 0.5922695293743792, "learning_rate": 3.7789909684735823e-07, "loss": 0.0119, "step": 6359 }, { "epoch": 2.8935395814376705, "grad_norm": 1.2011658210812863, "learning_rate": 3.777604995105471e-07, "loss": 0.0243, "step": 6360 }, { "epoch": 2.893994540491356, "grad_norm": 0.925054995267702, "learning_rate": 3.776219121625961e-07, "loss": 0.0147, "step": 6361 }, { "epoch": 2.8944494995450407, "grad_norm": 0.5638069033929687, "learning_rate": 3.774833348148297e-07, "loss": 0.0089, "step": 6362 }, { "epoch": 2.894904458598726, "grad_norm": 0.6785834184491117, "learning_rate": 3.7734476747857197e-07, "loss": 0.0189, "step": 6363 }, { "epoch": 2.8953594176524113, "grad_norm": 0.6492254083006027, "learning_rate": 3.7720621016514585e-07, "loss": 0.0106, "step": 6364 }, { "epoch": 2.895814376706096, "grad_norm": 0.9629651644972256, "learning_rate": 3.770676628858738e-07, "loss": 0.032, "step": 6365 }, { "epoch": 2.8962693357597815, "grad_norm": 0.7033560755174528, "learning_rate": 3.769291256520771e-07, "loss": 0.0157, "step": 6366 }, { "epoch": 2.896724294813467, "grad_norm": 1.1057029967207697, "learning_rate": 3.767905984750764e-07, "loss": 0.0331, "step": 6367 }, { "epoch": 2.8971792538671517, "grad_norm": 0.7965251646262664, "learning_rate": 3.7665208136619167e-07, "loss": 0.0054, "step": 6368 }, { "epoch": 2.897634212920837, "grad_norm": 0.9203154893882206, "learning_rate": 3.765135743367418e-07, "loss": 0.026, "step": 6369 }, { "epoch": 2.8980891719745223, "grad_norm": 0.4908300471484833, "learning_rate": 3.763750773980451e-07, "loss": 0.0044, "step": 6370 }, { "epoch": 2.8985441310282076, "grad_norm": 0.7465761848116771, "learning_rate": 3.7623659056141863e-07, "loss": 0.0216, "step": 6371 }, { "epoch": 2.8989990900818925, "grad_norm": 0.7885771095582838, "learning_rate": 3.7609811383817925e-07, "loss": 0.0144, "step": 6372 }, { "epoch": 2.899454049135578, "grad_norm": 0.6169964312587278, "learning_rate": 3.7595964723964227e-07, "loss": 0.0114, "step": 6373 }, { "epoch": 2.899909008189263, "grad_norm": 0.7117903817609319, "learning_rate": 3.7582119077712273e-07, "loss": 0.0173, "step": 6374 }, { "epoch": 2.900363967242948, "grad_norm": 1.4734397202209408, "learning_rate": 3.756827444619348e-07, "loss": 0.0318, "step": 6375 }, { "epoch": 2.9008189262966333, "grad_norm": 0.6124177473455937, "learning_rate": 3.755443083053916e-07, "loss": 0.0122, "step": 6376 }, { "epoch": 2.9012738853503186, "grad_norm": 0.48421997001126516, "learning_rate": 3.7540588231880556e-07, "loss": 0.0085, "step": 6377 }, { "epoch": 2.901728844404004, "grad_norm": 0.8211470982521405, "learning_rate": 3.7526746651348796e-07, "loss": 0.0235, "step": 6378 }, { "epoch": 2.902183803457689, "grad_norm": 1.1590200571147384, "learning_rate": 3.751290609007499e-07, "loss": 0.0224, "step": 6379 }, { "epoch": 2.902638762511374, "grad_norm": 0.7439505827026538, "learning_rate": 3.7499066549190097e-07, "loss": 0.0226, "step": 6380 }, { "epoch": 2.9030937215650594, "grad_norm": 0.5699711855478212, "learning_rate": 3.7485228029825016e-07, "loss": 0.0115, "step": 6381 }, { "epoch": 2.9035486806187443, "grad_norm": 0.826121813890601, "learning_rate": 3.747139053311061e-07, "loss": 0.013, "step": 6382 }, { "epoch": 2.9040036396724296, "grad_norm": 0.7740457878678206, "learning_rate": 3.7457554060177575e-07, "loss": 0.0243, "step": 6383 }, { "epoch": 2.904458598726115, "grad_norm": 0.6736627216417496, "learning_rate": 3.744371861215659e-07, "loss": 0.0169, "step": 6384 }, { "epoch": 2.9049135577797998, "grad_norm": 0.7239049272531175, "learning_rate": 3.742988419017822e-07, "loss": 0.0229, "step": 6385 }, { "epoch": 2.905368516833485, "grad_norm": 0.6804027825460468, "learning_rate": 3.7416050795372945e-07, "loss": 0.0126, "step": 6386 }, { "epoch": 2.9058234758871704, "grad_norm": 0.5960827012056602, "learning_rate": 3.740221842887117e-07, "loss": 0.0128, "step": 6387 }, { "epoch": 2.9062784349408552, "grad_norm": 1.065783643640004, "learning_rate": 3.7388387091803196e-07, "loss": 0.0241, "step": 6388 }, { "epoch": 2.9067333939945406, "grad_norm": 0.8959563213535333, "learning_rate": 3.737455678529929e-07, "loss": 0.0216, "step": 6389 }, { "epoch": 2.907188353048226, "grad_norm": 0.663763832909355, "learning_rate": 3.736072751048959e-07, "loss": 0.0099, "step": 6390 }, { "epoch": 2.9076433121019107, "grad_norm": 0.6723885206721304, "learning_rate": 3.7346899268504167e-07, "loss": 0.0127, "step": 6391 }, { "epoch": 2.908098271155596, "grad_norm": 0.803392001069506, "learning_rate": 3.733307206047298e-07, "loss": 0.0153, "step": 6392 }, { "epoch": 2.9085532302092814, "grad_norm": 0.7595460749110501, "learning_rate": 3.731924588752595e-07, "loss": 0.0164, "step": 6393 }, { "epoch": 2.9090081892629662, "grad_norm": 0.947911084600543, "learning_rate": 3.7305420750792895e-07, "loss": 0.0238, "step": 6394 }, { "epoch": 2.9094631483166515, "grad_norm": 0.863737831187388, "learning_rate": 3.7291596651403477e-07, "loss": 0.0243, "step": 6395 }, { "epoch": 2.909918107370337, "grad_norm": 0.7167970314457005, "learning_rate": 3.727777359048743e-07, "loss": 0.024, "step": 6396 }, { "epoch": 2.9103730664240217, "grad_norm": 1.2131795625736428, "learning_rate": 3.726395156917427e-07, "loss": 0.0186, "step": 6397 }, { "epoch": 2.910828025477707, "grad_norm": 0.617314240109532, "learning_rate": 3.7250130588593463e-07, "loss": 0.0089, "step": 6398 }, { "epoch": 2.9112829845313923, "grad_norm": 0.5974683271408129, "learning_rate": 3.7236310649874425e-07, "loss": 0.0087, "step": 6399 }, { "epoch": 2.911737943585077, "grad_norm": 1.3339325040773935, "learning_rate": 3.7222491754146425e-07, "loss": 0.0201, "step": 6400 }, { "epoch": 2.9121929026387625, "grad_norm": 0.827461476808047, "learning_rate": 3.7208673902538703e-07, "loss": 0.0169, "step": 6401 }, { "epoch": 2.912647861692448, "grad_norm": 0.8773401191810173, "learning_rate": 3.719485709618036e-07, "loss": 0.019, "step": 6402 }, { "epoch": 2.9131028207461327, "grad_norm": 0.5604320157099767, "learning_rate": 3.718104133620048e-07, "loss": 0.0108, "step": 6403 }, { "epoch": 2.913557779799818, "grad_norm": 1.003127774030755, "learning_rate": 3.716722662372803e-07, "loss": 0.0154, "step": 6404 }, { "epoch": 2.9140127388535033, "grad_norm": 1.0069353383569721, "learning_rate": 3.715341295989185e-07, "loss": 0.0258, "step": 6405 }, { "epoch": 2.914467697907188, "grad_norm": 0.8870688212178341, "learning_rate": 3.7139600345820767e-07, "loss": 0.0192, "step": 6406 }, { "epoch": 2.9149226569608735, "grad_norm": 0.8996618026298765, "learning_rate": 3.7125788782643444e-07, "loss": 0.0247, "step": 6407 }, { "epoch": 2.915377616014559, "grad_norm": 0.8016438054240819, "learning_rate": 3.7111978271488544e-07, "loss": 0.0177, "step": 6408 }, { "epoch": 2.9158325750682437, "grad_norm": 0.8337089367452087, "learning_rate": 3.709816881348455e-07, "loss": 0.0102, "step": 6409 }, { "epoch": 2.916287534121929, "grad_norm": 0.9317962279022891, "learning_rate": 3.7084360409759953e-07, "loss": 0.0184, "step": 6410 }, { "epoch": 2.9167424931756143, "grad_norm": 0.8195414727463849, "learning_rate": 3.70705530614431e-07, "loss": 0.0184, "step": 6411 }, { "epoch": 2.917197452229299, "grad_norm": 0.969841062671001, "learning_rate": 3.7056746769662254e-07, "loss": 0.0123, "step": 6412 }, { "epoch": 2.9176524112829845, "grad_norm": 0.8249155876387954, "learning_rate": 3.704294153554562e-07, "loss": 0.0276, "step": 6413 }, { "epoch": 2.91810737033667, "grad_norm": 0.6622410163455185, "learning_rate": 3.7029137360221286e-07, "loss": 0.0193, "step": 6414 }, { "epoch": 2.9185623293903546, "grad_norm": 0.5929941869477152, "learning_rate": 3.7015334244817277e-07, "loss": 0.0099, "step": 6415 }, { "epoch": 2.91901728844404, "grad_norm": 0.6766419825702461, "learning_rate": 3.7001532190461493e-07, "loss": 0.0086, "step": 6416 }, { "epoch": 2.9194722474977253, "grad_norm": 0.5949503164692103, "learning_rate": 3.698773119828181e-07, "loss": 0.0167, "step": 6417 }, { "epoch": 2.91992720655141, "grad_norm": 0.6317258889235889, "learning_rate": 3.697393126940599e-07, "loss": 0.0118, "step": 6418 }, { "epoch": 2.9203821656050954, "grad_norm": 0.9818177451229028, "learning_rate": 3.6960132404961656e-07, "loss": 0.0149, "step": 6419 }, { "epoch": 2.9208371246587808, "grad_norm": 0.8306350355357541, "learning_rate": 3.6946334606076433e-07, "loss": 0.0157, "step": 6420 }, { "epoch": 2.9212920837124656, "grad_norm": 0.8795011093322078, "learning_rate": 3.6932537873877785e-07, "loss": 0.0189, "step": 6421 }, { "epoch": 2.921747042766151, "grad_norm": 0.705053734883748, "learning_rate": 3.691874220949314e-07, "loss": 0.0182, "step": 6422 }, { "epoch": 2.9222020018198362, "grad_norm": 0.8707890413125124, "learning_rate": 3.690494761404979e-07, "loss": 0.0164, "step": 6423 }, { "epoch": 2.922656960873521, "grad_norm": 1.3415753197830569, "learning_rate": 3.689115408867498e-07, "loss": 0.0237, "step": 6424 }, { "epoch": 2.9231119199272064, "grad_norm": 1.049523178897669, "learning_rate": 3.6877361634495885e-07, "loss": 0.0193, "step": 6425 }, { "epoch": 2.9235668789808917, "grad_norm": 0.6842206261770903, "learning_rate": 3.686357025263952e-07, "loss": 0.0202, "step": 6426 }, { "epoch": 2.924021838034577, "grad_norm": 0.6527264067168717, "learning_rate": 3.6849779944232877e-07, "loss": 0.0197, "step": 6427 }, { "epoch": 2.924476797088262, "grad_norm": 0.8454812145181981, "learning_rate": 3.683599071040283e-07, "loss": 0.0165, "step": 6428 }, { "epoch": 2.9249317561419472, "grad_norm": 0.7716958645036347, "learning_rate": 3.682220255227617e-07, "loss": 0.013, "step": 6429 }, { "epoch": 2.9253867151956325, "grad_norm": 1.2284215301645955, "learning_rate": 3.6808415470979596e-07, "loss": 0.0152, "step": 6430 }, { "epoch": 2.9258416742493174, "grad_norm": 0.7443001973791024, "learning_rate": 3.679462946763975e-07, "loss": 0.0109, "step": 6431 }, { "epoch": 2.9262966333030027, "grad_norm": 0.6324604402788511, "learning_rate": 3.6780844543383157e-07, "loss": 0.0104, "step": 6432 }, { "epoch": 2.926751592356688, "grad_norm": 0.815726379691611, "learning_rate": 3.676706069933625e-07, "loss": 0.0298, "step": 6433 }, { "epoch": 2.9272065514103733, "grad_norm": 0.7112912720702469, "learning_rate": 3.675327793662537e-07, "loss": 0.0116, "step": 6434 }, { "epoch": 2.927661510464058, "grad_norm": 1.0814560900578563, "learning_rate": 3.673949625637681e-07, "loss": 0.0117, "step": 6435 }, { "epoch": 2.9281164695177435, "grad_norm": 0.8535817673284755, "learning_rate": 3.6725715659716715e-07, "loss": 0.017, "step": 6436 }, { "epoch": 2.928571428571429, "grad_norm": 0.817462335555597, "learning_rate": 3.6711936147771184e-07, "loss": 0.0228, "step": 6437 }, { "epoch": 2.9290263876251137, "grad_norm": 0.8466133691150582, "learning_rate": 3.669815772166625e-07, "loss": 0.0215, "step": 6438 }, { "epoch": 2.929481346678799, "grad_norm": 0.6353496930419168, "learning_rate": 3.668438038252778e-07, "loss": 0.0094, "step": 6439 }, { "epoch": 2.9299363057324843, "grad_norm": 0.9522810363997031, "learning_rate": 3.6670604131481617e-07, "loss": 0.0243, "step": 6440 }, { "epoch": 2.930391264786169, "grad_norm": 0.8042574346609977, "learning_rate": 3.6656828969653485e-07, "loss": 0.0128, "step": 6441 }, { "epoch": 2.9308462238398545, "grad_norm": 1.6923236259749446, "learning_rate": 3.6643054898169044e-07, "loss": 0.0227, "step": 6442 }, { "epoch": 2.93130118289354, "grad_norm": 0.9497135942124821, "learning_rate": 3.6629281918153826e-07, "loss": 0.0135, "step": 6443 }, { "epoch": 2.9317561419472247, "grad_norm": 0.7880655238301595, "learning_rate": 3.66155100307333e-07, "loss": 0.0136, "step": 6444 }, { "epoch": 2.93221110100091, "grad_norm": 1.0780217738982671, "learning_rate": 3.6601739237032863e-07, "loss": 0.0311, "step": 6445 }, { "epoch": 2.9326660600545953, "grad_norm": 0.7626316706059537, "learning_rate": 3.6587969538177786e-07, "loss": 0.0217, "step": 6446 }, { "epoch": 2.93312101910828, "grad_norm": 1.054353833969882, "learning_rate": 3.6574200935293277e-07, "loss": 0.0296, "step": 6447 }, { "epoch": 2.9335759781619655, "grad_norm": 0.49409547475462606, "learning_rate": 3.656043342950443e-07, "loss": 0.0126, "step": 6448 }, { "epoch": 2.934030937215651, "grad_norm": 0.7240405384929773, "learning_rate": 3.654666702193628e-07, "loss": 0.017, "step": 6449 }, { "epoch": 2.9344858962693356, "grad_norm": 0.6788136036717803, "learning_rate": 3.6532901713713737e-07, "loss": 0.0136, "step": 6450 }, { "epoch": 2.934940855323021, "grad_norm": 0.946861233209408, "learning_rate": 3.651913750596163e-07, "loss": 0.0103, "step": 6451 }, { "epoch": 2.9353958143767063, "grad_norm": 0.9483693092037008, "learning_rate": 3.6505374399804757e-07, "loss": 0.021, "step": 6452 }, { "epoch": 2.935850773430391, "grad_norm": 0.7895151462044142, "learning_rate": 3.649161239636773e-07, "loss": 0.0244, "step": 6453 }, { "epoch": 2.9363057324840764, "grad_norm": 0.8684146297864589, "learning_rate": 3.647785149677516e-07, "loss": 0.0104, "step": 6454 }, { "epoch": 2.9367606915377618, "grad_norm": 0.990635020222022, "learning_rate": 3.646409170215148e-07, "loss": 0.0137, "step": 6455 }, { "epoch": 2.9372156505914466, "grad_norm": 0.8304671595080066, "learning_rate": 3.64503330136211e-07, "loss": 0.0165, "step": 6456 }, { "epoch": 2.937670609645132, "grad_norm": 0.8320550804339975, "learning_rate": 3.643657543230831e-07, "loss": 0.0102, "step": 6457 }, { "epoch": 2.9381255686988172, "grad_norm": 1.3404528677592868, "learning_rate": 3.642281895933732e-07, "loss": 0.0246, "step": 6458 }, { "epoch": 2.938580527752502, "grad_norm": 0.8477838673498351, "learning_rate": 3.6409063595832275e-07, "loss": 0.0164, "step": 6459 }, { "epoch": 2.9390354868061874, "grad_norm": 0.8930034279676965, "learning_rate": 3.6395309342917155e-07, "loss": 0.0146, "step": 6460 }, { "epoch": 2.9394904458598727, "grad_norm": 0.9056655089521906, "learning_rate": 3.6381556201715933e-07, "loss": 0.026, "step": 6461 }, { "epoch": 2.9399454049135576, "grad_norm": 1.1431906892736803, "learning_rate": 3.636780417335243e-07, "loss": 0.034, "step": 6462 }, { "epoch": 2.940400363967243, "grad_norm": 0.8749513315156507, "learning_rate": 3.6354053258950413e-07, "loss": 0.019, "step": 6463 }, { "epoch": 2.9408553230209282, "grad_norm": 0.6752993501588361, "learning_rate": 3.634030345963355e-07, "loss": 0.0152, "step": 6464 }, { "epoch": 2.941310282074613, "grad_norm": 0.7511671044126542, "learning_rate": 3.6326554776525355e-07, "loss": 0.0186, "step": 6465 }, { "epoch": 2.9417652411282984, "grad_norm": 1.1583158102805364, "learning_rate": 3.63128072107494e-07, "loss": 0.0171, "step": 6466 }, { "epoch": 2.9422202001819837, "grad_norm": 1.2757784777885126, "learning_rate": 3.629906076342901e-07, "loss": 0.0196, "step": 6467 }, { "epoch": 2.9426751592356686, "grad_norm": 0.9091175050872612, "learning_rate": 3.628531543568751e-07, "loss": 0.0185, "step": 6468 }, { "epoch": 2.943130118289354, "grad_norm": 0.7838884875586256, "learning_rate": 3.62715712286481e-07, "loss": 0.0118, "step": 6469 }, { "epoch": 2.943585077343039, "grad_norm": 0.6862825775337094, "learning_rate": 3.625782814343388e-07, "loss": 0.013, "step": 6470 }, { "epoch": 2.944040036396724, "grad_norm": 0.7396753055674987, "learning_rate": 3.6244086181167896e-07, "loss": 0.011, "step": 6471 }, { "epoch": 2.9444949954504094, "grad_norm": 0.8372333459628598, "learning_rate": 3.6230345342973056e-07, "loss": 0.0215, "step": 6472 }, { "epoch": 2.9449499545040947, "grad_norm": 0.8102721145809617, "learning_rate": 3.6216605629972214e-07, "loss": 0.0233, "step": 6473 }, { "epoch": 2.9454049135577796, "grad_norm": 0.6840723462672609, "learning_rate": 3.6202867043288123e-07, "loss": 0.0104, "step": 6474 }, { "epoch": 2.945859872611465, "grad_norm": 0.7553805498330066, "learning_rate": 3.618912958404342e-07, "loss": 0.0225, "step": 6475 }, { "epoch": 2.94631483166515, "grad_norm": 0.6651973170534892, "learning_rate": 3.61753932533607e-07, "loss": 0.0177, "step": 6476 }, { "epoch": 2.946769790718835, "grad_norm": 1.567748949838703, "learning_rate": 3.6161658052362387e-07, "loss": 0.041, "step": 6477 }, { "epoch": 2.9472247497725204, "grad_norm": 1.4123107398900858, "learning_rate": 3.61479239821709e-07, "loss": 0.0202, "step": 6478 }, { "epoch": 2.9476797088262057, "grad_norm": 1.2970516986493164, "learning_rate": 3.613419104390849e-07, "loss": 0.0222, "step": 6479 }, { "epoch": 2.9481346678798905, "grad_norm": 0.8050064385262121, "learning_rate": 3.612045923869738e-07, "loss": 0.0199, "step": 6480 }, { "epoch": 2.948589626933576, "grad_norm": 0.9096416595353252, "learning_rate": 3.610672856765967e-07, "loss": 0.0195, "step": 6481 }, { "epoch": 2.949044585987261, "grad_norm": 0.4684142003709393, "learning_rate": 3.609299903191736e-07, "loss": 0.0109, "step": 6482 }, { "epoch": 2.9494995450409465, "grad_norm": 0.9832385520148114, "learning_rate": 3.607927063259236e-07, "loss": 0.0301, "step": 6483 }, { "epoch": 2.9499545040946313, "grad_norm": 0.6834129107883281, "learning_rate": 3.60655433708065e-07, "loss": 0.0177, "step": 6484 }, { "epoch": 2.9504094631483166, "grad_norm": 0.6046221455514822, "learning_rate": 3.6051817247681516e-07, "loss": 0.0054, "step": 6485 }, { "epoch": 2.950864422202002, "grad_norm": 0.8861574050626135, "learning_rate": 3.6038092264339014e-07, "loss": 0.0125, "step": 6486 }, { "epoch": 2.951319381255687, "grad_norm": 0.49057925084386816, "learning_rate": 3.6024368421900574e-07, "loss": 0.0105, "step": 6487 }, { "epoch": 2.951774340309372, "grad_norm": 1.6909778092491983, "learning_rate": 3.601064572148764e-07, "loss": 0.0431, "step": 6488 }, { "epoch": 2.9522292993630574, "grad_norm": 0.44415865079375677, "learning_rate": 3.599692416422155e-07, "loss": 0.006, "step": 6489 }, { "epoch": 2.9526842584167428, "grad_norm": 0.9556917026858623, "learning_rate": 3.59832037512236e-07, "loss": 0.0153, "step": 6490 }, { "epoch": 2.9531392174704276, "grad_norm": 1.054722524283714, "learning_rate": 3.596948448361492e-07, "loss": 0.0157, "step": 6491 }, { "epoch": 2.953594176524113, "grad_norm": 0.7490039630316165, "learning_rate": 3.5955766362516624e-07, "loss": 0.0145, "step": 6492 }, { "epoch": 2.9540491355777982, "grad_norm": 0.6737513240431025, "learning_rate": 3.5942049389049655e-07, "loss": 0.0142, "step": 6493 }, { "epoch": 2.954504094631483, "grad_norm": 1.4094414276172227, "learning_rate": 3.592833356433493e-07, "loss": 0.0263, "step": 6494 }, { "epoch": 2.9549590536851684, "grad_norm": 1.388337117129183, "learning_rate": 3.5914618889493253e-07, "loss": 0.0152, "step": 6495 }, { "epoch": 2.9554140127388537, "grad_norm": 0.7265588909801421, "learning_rate": 3.5900905365645306e-07, "loss": 0.0227, "step": 6496 }, { "epoch": 2.9558689717925386, "grad_norm": 0.7670393306158486, "learning_rate": 3.5887192993911707e-07, "loss": 0.0242, "step": 6497 }, { "epoch": 2.956323930846224, "grad_norm": 1.0957842182645583, "learning_rate": 3.587348177541295e-07, "loss": 0.0299, "step": 6498 }, { "epoch": 2.9567788898999092, "grad_norm": 1.0801600297480098, "learning_rate": 3.585977171126948e-07, "loss": 0.0414, "step": 6499 }, { "epoch": 2.957233848953594, "grad_norm": 0.5893244721669152, "learning_rate": 3.58460628026016e-07, "loss": 0.0181, "step": 6500 }, { "epoch": 2.9576888080072794, "grad_norm": 1.1075506844332212, "learning_rate": 3.5832355050529545e-07, "loss": 0.0357, "step": 6501 }, { "epoch": 2.9581437670609647, "grad_norm": 0.6263503491161391, "learning_rate": 3.5818648456173474e-07, "loss": 0.0134, "step": 6502 }, { "epoch": 2.9585987261146496, "grad_norm": 0.7401323545774567, "learning_rate": 3.58049430206534e-07, "loss": 0.0161, "step": 6503 }, { "epoch": 2.959053685168335, "grad_norm": 1.0118425042683072, "learning_rate": 3.579123874508927e-07, "loss": 0.026, "step": 6504 }, { "epoch": 2.95950864422202, "grad_norm": 0.7473648183475007, "learning_rate": 3.5777535630600956e-07, "loss": 0.0211, "step": 6505 }, { "epoch": 2.959963603275705, "grad_norm": 0.795193049128125, "learning_rate": 3.57638336783082e-07, "loss": 0.0109, "step": 6506 }, { "epoch": 2.9604185623293904, "grad_norm": 0.8666346353265371, "learning_rate": 3.5750132889330643e-07, "loss": 0.0316, "step": 6507 }, { "epoch": 2.9608735213830757, "grad_norm": 0.7979292865401081, "learning_rate": 3.5736433264787894e-07, "loss": 0.0183, "step": 6508 }, { "epoch": 2.9613284804367606, "grad_norm": 1.1234252433477843, "learning_rate": 3.5722734805799406e-07, "loss": 0.025, "step": 6509 }, { "epoch": 2.961783439490446, "grad_norm": 0.6292976180584844, "learning_rate": 3.5709037513484554e-07, "loss": 0.0215, "step": 6510 }, { "epoch": 2.962238398544131, "grad_norm": 1.2202338448716203, "learning_rate": 3.5695341388962613e-07, "loss": 0.0232, "step": 6511 }, { "epoch": 2.962693357597816, "grad_norm": 0.9252168885569209, "learning_rate": 3.568164643335279e-07, "loss": 0.0257, "step": 6512 }, { "epoch": 2.9631483166515014, "grad_norm": 1.468032130824581, "learning_rate": 3.5667952647774137e-07, "loss": 0.0138, "step": 6513 }, { "epoch": 2.9636032757051867, "grad_norm": 0.7893789537500827, "learning_rate": 3.565426003334567e-07, "loss": 0.0166, "step": 6514 }, { "epoch": 2.9640582347588715, "grad_norm": 0.7798432548410273, "learning_rate": 3.5640568591186307e-07, "loss": 0.029, "step": 6515 }, { "epoch": 2.964513193812557, "grad_norm": 0.6814761717312907, "learning_rate": 3.5626878322414824e-07, "loss": 0.0212, "step": 6516 }, { "epoch": 2.964968152866242, "grad_norm": 0.8676057669013371, "learning_rate": 3.561318922814994e-07, "loss": 0.0169, "step": 6517 }, { "epoch": 2.965423111919927, "grad_norm": 0.7186043760347279, "learning_rate": 3.559950130951025e-07, "loss": 0.0107, "step": 6518 }, { "epoch": 2.9658780709736123, "grad_norm": 0.8116350622757644, "learning_rate": 3.5585814567614305e-07, "loss": 0.0216, "step": 6519 }, { "epoch": 2.9663330300272976, "grad_norm": 1.2869172425149047, "learning_rate": 3.5572129003580475e-07, "loss": 0.0332, "step": 6520 }, { "epoch": 2.9667879890809825, "grad_norm": 0.7355391027110417, "learning_rate": 3.5558444618527103e-07, "loss": 0.0133, "step": 6521 }, { "epoch": 2.967242948134668, "grad_norm": 0.6956669633338275, "learning_rate": 3.554476141357244e-07, "loss": 0.018, "step": 6522 }, { "epoch": 2.967697907188353, "grad_norm": 0.6273925153638129, "learning_rate": 3.553107938983458e-07, "loss": 0.0141, "step": 6523 }, { "epoch": 2.968152866242038, "grad_norm": 0.6323326865161704, "learning_rate": 3.551739854843159e-07, "loss": 0.0166, "step": 6524 }, { "epoch": 2.9686078252957233, "grad_norm": 0.6071053198177077, "learning_rate": 3.550371889048137e-07, "loss": 0.0162, "step": 6525 }, { "epoch": 2.9690627843494086, "grad_norm": 0.8363237715271785, "learning_rate": 3.5490040417101793e-07, "loss": 0.0132, "step": 6526 }, { "epoch": 2.9695177434030935, "grad_norm": 0.8236424797782745, "learning_rate": 3.547636312941057e-07, "loss": 0.0221, "step": 6527 }, { "epoch": 2.969972702456779, "grad_norm": 0.7703283289995418, "learning_rate": 3.546268702852536e-07, "loss": 0.0206, "step": 6528 }, { "epoch": 2.970427661510464, "grad_norm": 0.9934122535823519, "learning_rate": 3.544901211556374e-07, "loss": 0.0277, "step": 6529 }, { "epoch": 2.970882620564149, "grad_norm": 0.8044267568006738, "learning_rate": 3.5435338391643113e-07, "loss": 0.0137, "step": 6530 }, { "epoch": 2.9713375796178343, "grad_norm": 0.5976403967882574, "learning_rate": 3.5421665857880885e-07, "loss": 0.007, "step": 6531 }, { "epoch": 2.9717925386715196, "grad_norm": 0.9706632608498935, "learning_rate": 3.540799451539427e-07, "loss": 0.0323, "step": 6532 }, { "epoch": 2.9722474977252045, "grad_norm": 1.0988481230258806, "learning_rate": 3.5394324365300456e-07, "loss": 0.022, "step": 6533 }, { "epoch": 2.97270245677889, "grad_norm": 0.8419227452824612, "learning_rate": 3.538065540871649e-07, "loss": 0.0119, "step": 6534 }, { "epoch": 2.973157415832575, "grad_norm": 0.8385780380547128, "learning_rate": 3.5366987646759325e-07, "loss": 0.0152, "step": 6535 }, { "epoch": 2.9736123748862604, "grad_norm": 0.9975617580036925, "learning_rate": 3.5353321080545884e-07, "loss": 0.0331, "step": 6536 }, { "epoch": 2.9740673339399453, "grad_norm": 0.8604874473498202, "learning_rate": 3.533965571119287e-07, "loss": 0.013, "step": 6537 }, { "epoch": 2.9745222929936306, "grad_norm": 0.7932309485648432, "learning_rate": 3.532599153981701e-07, "loss": 0.0234, "step": 6538 }, { "epoch": 2.974977252047316, "grad_norm": 0.7710010591662109, "learning_rate": 3.5312328567534856e-07, "loss": 0.0261, "step": 6539 }, { "epoch": 2.9754322111010008, "grad_norm": 0.497035190124446, "learning_rate": 3.529866679546286e-07, "loss": 0.0075, "step": 6540 }, { "epoch": 2.975887170154686, "grad_norm": 0.709167447838399, "learning_rate": 3.5285006224717443e-07, "loss": 0.0087, "step": 6541 }, { "epoch": 2.9763421292083714, "grad_norm": 0.9198481582903101, "learning_rate": 3.527134685641484e-07, "loss": 0.0189, "step": 6542 }, { "epoch": 2.9767970882620567, "grad_norm": 0.6071930166780877, "learning_rate": 3.525768869167127e-07, "loss": 0.015, "step": 6543 }, { "epoch": 2.9772520473157416, "grad_norm": 0.7140881679427636, "learning_rate": 3.524403173160282e-07, "loss": 0.0143, "step": 6544 }, { "epoch": 2.977707006369427, "grad_norm": 0.8428137308517692, "learning_rate": 3.5230375977325445e-07, "loss": 0.0121, "step": 6545 }, { "epoch": 2.978161965423112, "grad_norm": 1.0907382758130852, "learning_rate": 3.521672142995505e-07, "loss": 0.0272, "step": 6546 }, { "epoch": 2.978616924476797, "grad_norm": 0.641140307610947, "learning_rate": 3.5203068090607415e-07, "loss": 0.0133, "step": 6547 }, { "epoch": 2.9790718835304824, "grad_norm": 1.0807304340042718, "learning_rate": 3.5189415960398246e-07, "loss": 0.0188, "step": 6548 }, { "epoch": 2.9795268425841677, "grad_norm": 0.962660284512762, "learning_rate": 3.5175765040443094e-07, "loss": 0.0226, "step": 6549 }, { "epoch": 2.9799818016378525, "grad_norm": 0.795566553909288, "learning_rate": 3.516211533185749e-07, "loss": 0.0163, "step": 6550 }, { "epoch": 2.980436760691538, "grad_norm": 0.7810875426550207, "learning_rate": 3.5148466835756825e-07, "loss": 0.0128, "step": 6551 }, { "epoch": 2.980891719745223, "grad_norm": 0.7597078328852642, "learning_rate": 3.5134819553256365e-07, "loss": 0.0172, "step": 6552 }, { "epoch": 2.981346678798908, "grad_norm": 0.7368469898908139, "learning_rate": 3.512117348547134e-07, "loss": 0.0172, "step": 6553 }, { "epoch": 2.9818016378525933, "grad_norm": 0.9305931916359345, "learning_rate": 3.510752863351681e-07, "loss": 0.0206, "step": 6554 }, { "epoch": 2.9822565969062786, "grad_norm": 1.138596724290123, "learning_rate": 3.5093884998507793e-07, "loss": 0.0254, "step": 6555 }, { "epoch": 2.9827115559599635, "grad_norm": 0.5053579652816207, "learning_rate": 3.508024258155917e-07, "loss": 0.0086, "step": 6556 }, { "epoch": 2.983166515013649, "grad_norm": 0.6635318132339223, "learning_rate": 3.5066601383785746e-07, "loss": 0.0179, "step": 6557 }, { "epoch": 2.983621474067334, "grad_norm": 0.7235833136473574, "learning_rate": 3.5052961406302235e-07, "loss": 0.0139, "step": 6558 }, { "epoch": 2.984076433121019, "grad_norm": 0.7970707444883729, "learning_rate": 3.5039322650223205e-07, "loss": 0.0253, "step": 6559 }, { "epoch": 2.9845313921747043, "grad_norm": 0.6828607442426381, "learning_rate": 3.5025685116663174e-07, "loss": 0.0249, "step": 6560 }, { "epoch": 2.9849863512283896, "grad_norm": 0.953780097880935, "learning_rate": 3.501204880673652e-07, "loss": 0.0301, "step": 6561 }, { "epoch": 2.9854413102820745, "grad_norm": 0.5102237132467475, "learning_rate": 3.499841372155757e-07, "loss": 0.0057, "step": 6562 }, { "epoch": 2.98589626933576, "grad_norm": 0.8547802049627937, "learning_rate": 3.4984779862240476e-07, "loss": 0.012, "step": 6563 }, { "epoch": 2.986351228389445, "grad_norm": 0.7007981455130955, "learning_rate": 3.4971147229899376e-07, "loss": 0.0138, "step": 6564 }, { "epoch": 2.98680618744313, "grad_norm": 0.8377935450700574, "learning_rate": 3.4957515825648263e-07, "loss": 0.02, "step": 6565 }, { "epoch": 2.9872611464968153, "grad_norm": 0.6330300931368811, "learning_rate": 3.4943885650601025e-07, "loss": 0.0182, "step": 6566 }, { "epoch": 2.9877161055505006, "grad_norm": 1.1056038833364514, "learning_rate": 3.4930256705871466e-07, "loss": 0.0319, "step": 6567 }, { "epoch": 2.9881710646041855, "grad_norm": 1.2658811656479012, "learning_rate": 3.491662899257326e-07, "loss": 0.0168, "step": 6568 }, { "epoch": 2.988626023657871, "grad_norm": 0.5908718672593835, "learning_rate": 3.4903002511820025e-07, "loss": 0.0101, "step": 6569 }, { "epoch": 2.989080982711556, "grad_norm": 1.3496000872299396, "learning_rate": 3.488937726472523e-07, "loss": 0.013, "step": 6570 }, { "epoch": 2.989535941765241, "grad_norm": 0.747690059351562, "learning_rate": 3.4875753252402297e-07, "loss": 0.0201, "step": 6571 }, { "epoch": 2.9899909008189263, "grad_norm": 0.9579687353402817, "learning_rate": 3.486213047596451e-07, "loss": 0.0328, "step": 6572 }, { "epoch": 2.9904458598726116, "grad_norm": 0.9017500966892662, "learning_rate": 3.4848508936525055e-07, "loss": 0.0229, "step": 6573 }, { "epoch": 2.9909008189262964, "grad_norm": 0.6903524734050406, "learning_rate": 3.483488863519704e-07, "loss": 0.0158, "step": 6574 }, { "epoch": 2.9913557779799818, "grad_norm": 0.8734303480487606, "learning_rate": 3.482126957309344e-07, "loss": 0.0322, "step": 6575 }, { "epoch": 2.991810737033667, "grad_norm": 0.8212269326276901, "learning_rate": 3.4807651751327126e-07, "loss": 0.019, "step": 6576 }, { "epoch": 2.992265696087352, "grad_norm": 0.7854712083587281, "learning_rate": 3.4794035171010905e-07, "loss": 0.0077, "step": 6577 }, { "epoch": 2.9927206551410372, "grad_norm": 0.5927759520160327, "learning_rate": 3.478041983325747e-07, "loss": 0.0139, "step": 6578 }, { "epoch": 2.9931756141947226, "grad_norm": 0.6913420634256854, "learning_rate": 3.476680573917939e-07, "loss": 0.0135, "step": 6579 }, { "epoch": 2.9936305732484074, "grad_norm": 0.8681964946014692, "learning_rate": 3.475319288988916e-07, "loss": 0.026, "step": 6580 }, { "epoch": 2.9940855323020927, "grad_norm": 0.6922792195471824, "learning_rate": 3.473958128649914e-07, "loss": 0.0136, "step": 6581 }, { "epoch": 2.994540491355778, "grad_norm": 1.0999813541720806, "learning_rate": 3.472597093012164e-07, "loss": 0.0109, "step": 6582 }, { "epoch": 2.994995450409463, "grad_norm": 25.7360565249769, "learning_rate": 3.471236182186881e-07, "loss": 0.0893, "step": 6583 }, { "epoch": 2.9954504094631482, "grad_norm": 0.7080798886343006, "learning_rate": 3.469875396285271e-07, "loss": 0.0103, "step": 6584 }, { "epoch": 2.9959053685168335, "grad_norm": 0.6047743204418057, "learning_rate": 3.468514735418536e-07, "loss": 0.0077, "step": 6585 }, { "epoch": 2.9963603275705184, "grad_norm": 0.7254944109484118, "learning_rate": 3.4671541996978605e-07, "loss": 0.0122, "step": 6586 }, { "epoch": 2.9968152866242037, "grad_norm": 0.6891230910099873, "learning_rate": 3.4657937892344225e-07, "loss": 0.0134, "step": 6587 }, { "epoch": 2.997270245677889, "grad_norm": 0.8345461546582199, "learning_rate": 3.4644335041393865e-07, "loss": 0.0169, "step": 6588 }, { "epoch": 2.997725204731574, "grad_norm": 0.7836469551758943, "learning_rate": 3.4630733445239103e-07, "loss": 0.0198, "step": 6589 }, { "epoch": 2.998180163785259, "grad_norm": 1.3774181986517609, "learning_rate": 3.461713310499139e-07, "loss": 0.0274, "step": 6590 }, { "epoch": 2.9986351228389445, "grad_norm": 0.4299622966057322, "learning_rate": 3.460353402176208e-07, "loss": 0.0079, "step": 6591 }, { "epoch": 2.99909008189263, "grad_norm": 0.714981879076951, "learning_rate": 3.4589936196662474e-07, "loss": 0.0107, "step": 6592 }, { "epoch": 2.9995450409463147, "grad_norm": 0.824948617789103, "learning_rate": 3.4576339630803664e-07, "loss": 0.0128, "step": 6593 }, { "epoch": 3.0, "grad_norm": 0.7989222482875739, "learning_rate": 3.4562744325296744e-07, "loss": 0.018, "step": 6594 }, { "epoch": 3.0004549590536853, "grad_norm": 0.42374202116483733, "learning_rate": 3.454915028125263e-07, "loss": 0.0056, "step": 6595 }, { "epoch": 3.00090991810737, "grad_norm": 0.5643594446182912, "learning_rate": 3.453555749978219e-07, "loss": 0.006, "step": 6596 }, { "epoch": 3.0013648771610555, "grad_norm": 0.44663835703109106, "learning_rate": 3.452196598199615e-07, "loss": 0.0049, "step": 6597 }, { "epoch": 3.001819836214741, "grad_norm": 0.5484803631764071, "learning_rate": 3.4508375729005133e-07, "loss": 0.012, "step": 6598 }, { "epoch": 3.0022747952684257, "grad_norm": 0.5881773488154484, "learning_rate": 3.4494786741919716e-07, "loss": 0.0128, "step": 6599 }, { "epoch": 3.002729754322111, "grad_norm": 0.34468770881006333, "learning_rate": 3.44811990218503e-07, "loss": 0.0043, "step": 6600 }, { "epoch": 3.0031847133757963, "grad_norm": 0.8400300690706076, "learning_rate": 3.4467612569907224e-07, "loss": 0.0093, "step": 6601 }, { "epoch": 3.003639672429481, "grad_norm": 0.4461206917529981, "learning_rate": 3.445402738720069e-07, "loss": 0.0125, "step": 6602 }, { "epoch": 3.0040946314831665, "grad_norm": 0.5286585954015977, "learning_rate": 3.4440443474840854e-07, "loss": 0.0102, "step": 6603 }, { "epoch": 3.0045495905368518, "grad_norm": 0.4037140709874133, "learning_rate": 3.442686083393769e-07, "loss": 0.0133, "step": 6604 }, { "epoch": 3.0050045495905366, "grad_norm": 1.3864184387098697, "learning_rate": 3.441327946560113e-07, "loss": 0.0285, "step": 6605 }, { "epoch": 3.005459508644222, "grad_norm": 0.670514951967158, "learning_rate": 3.4399699370940995e-07, "loss": 0.0165, "step": 6606 }, { "epoch": 3.0059144676979073, "grad_norm": 0.5357336645104309, "learning_rate": 3.438612055106697e-07, "loss": 0.0094, "step": 6607 }, { "epoch": 3.0063694267515926, "grad_norm": 0.38503221630569967, "learning_rate": 3.437254300708867e-07, "loss": 0.0121, "step": 6608 }, { "epoch": 3.0068243858052774, "grad_norm": 0.5173626735097266, "learning_rate": 3.435896674011559e-07, "loss": 0.0076, "step": 6609 }, { "epoch": 3.0072793448589628, "grad_norm": 0.3479864895018133, "learning_rate": 3.43453917512571e-07, "loss": 0.0043, "step": 6610 }, { "epoch": 3.007734303912648, "grad_norm": 1.0537740723555085, "learning_rate": 3.4331818041622505e-07, "loss": 0.0083, "step": 6611 }, { "epoch": 3.008189262966333, "grad_norm": 0.5142238742991692, "learning_rate": 3.431824561232097e-07, "loss": 0.0065, "step": 6612 }, { "epoch": 3.0086442220200182, "grad_norm": 0.3833398849382643, "learning_rate": 3.4304674464461593e-07, "loss": 0.0052, "step": 6613 }, { "epoch": 3.0090991810737036, "grad_norm": 0.504963925257379, "learning_rate": 3.429110459915335e-07, "loss": 0.0126, "step": 6614 }, { "epoch": 3.0095541401273884, "grad_norm": 0.5982049439217519, "learning_rate": 3.4277536017505085e-07, "loss": 0.0135, "step": 6615 }, { "epoch": 3.0100090991810737, "grad_norm": 0.2705289679884184, "learning_rate": 3.426396872062559e-07, "loss": 0.0038, "step": 6616 }, { "epoch": 3.010464058234759, "grad_norm": 0.36663858104773617, "learning_rate": 3.4250402709623496e-07, "loss": 0.0057, "step": 6617 }, { "epoch": 3.010919017288444, "grad_norm": 0.6911633774746331, "learning_rate": 3.4236837985607376e-07, "loss": 0.0147, "step": 6618 }, { "epoch": 3.011373976342129, "grad_norm": 0.5644200105746582, "learning_rate": 3.4223274549685647e-07, "loss": 0.0118, "step": 6619 }, { "epoch": 3.0118289353958145, "grad_norm": 0.653550899243251, "learning_rate": 3.420971240296669e-07, "loss": 0.0082, "step": 6620 }, { "epoch": 3.0122838944494994, "grad_norm": 0.8394881540141547, "learning_rate": 3.419615154655874e-07, "loss": 0.0157, "step": 6621 }, { "epoch": 3.0127388535031847, "grad_norm": 0.593169022224394, "learning_rate": 3.41825919815699e-07, "loss": 0.015, "step": 6622 }, { "epoch": 3.01319381255687, "grad_norm": 0.5685416714327711, "learning_rate": 3.416903370910822e-07, "loss": 0.0097, "step": 6623 }, { "epoch": 3.013648771610555, "grad_norm": 0.5769962648773609, "learning_rate": 3.415547673028161e-07, "loss": 0.0117, "step": 6624 }, { "epoch": 3.01410373066424, "grad_norm": 0.3547908807378016, "learning_rate": 3.4141921046197896e-07, "loss": 0.004, "step": 6625 }, { "epoch": 3.0145586897179255, "grad_norm": 1.1745572953347923, "learning_rate": 3.412836665796476e-07, "loss": 0.0095, "step": 6626 }, { "epoch": 3.0150136487716104, "grad_norm": 0.3878683779348784, "learning_rate": 3.4114813566689833e-07, "loss": 0.0066, "step": 6627 }, { "epoch": 3.0154686078252957, "grad_norm": 0.45275487078439425, "learning_rate": 3.410126177348062e-07, "loss": 0.0072, "step": 6628 }, { "epoch": 3.015923566878981, "grad_norm": 0.27415076361664426, "learning_rate": 3.4087711279444477e-07, "loss": 0.0028, "step": 6629 }, { "epoch": 3.016378525932666, "grad_norm": 0.5344032264388721, "learning_rate": 3.407416208568873e-07, "loss": 0.0091, "step": 6630 }, { "epoch": 3.016833484986351, "grad_norm": 0.39445350845582255, "learning_rate": 3.406061419332052e-07, "loss": 0.0058, "step": 6631 }, { "epoch": 3.0172884440400365, "grad_norm": 0.41450978538032174, "learning_rate": 3.404706760344694e-07, "loss": 0.0088, "step": 6632 }, { "epoch": 3.0177434030937214, "grad_norm": 0.7517785765960744, "learning_rate": 3.4033522317174945e-07, "loss": 0.0159, "step": 6633 }, { "epoch": 3.0181983621474067, "grad_norm": 0.4886410049551606, "learning_rate": 3.401997833561141e-07, "loss": 0.0066, "step": 6634 }, { "epoch": 3.018653321201092, "grad_norm": 0.39241632675429833, "learning_rate": 3.4006435659863086e-07, "loss": 0.0035, "step": 6635 }, { "epoch": 3.0191082802547773, "grad_norm": 0.562564383812904, "learning_rate": 3.399289429103661e-07, "loss": 0.0099, "step": 6636 }, { "epoch": 3.019563239308462, "grad_norm": 0.43910173407570535, "learning_rate": 3.3979354230238535e-07, "loss": 0.0055, "step": 6637 }, { "epoch": 3.0200181983621475, "grad_norm": 0.4918966363331513, "learning_rate": 3.3965815478575264e-07, "loss": 0.0035, "step": 6638 }, { "epoch": 3.0204731574158328, "grad_norm": 0.40695025492495107, "learning_rate": 3.395227803715316e-07, "loss": 0.0046, "step": 6639 }, { "epoch": 3.0209281164695176, "grad_norm": 0.524799748896578, "learning_rate": 3.39387419070784e-07, "loss": 0.0109, "step": 6640 }, { "epoch": 3.021383075523203, "grad_norm": 0.8098478480195893, "learning_rate": 3.392520708945713e-07, "loss": 0.0218, "step": 6641 }, { "epoch": 3.0218380345768883, "grad_norm": 0.7209230670448167, "learning_rate": 3.391167358539536e-07, "loss": 0.0224, "step": 6642 }, { "epoch": 3.022292993630573, "grad_norm": 0.2982484409651384, "learning_rate": 3.3898141395998953e-07, "loss": 0.0022, "step": 6643 }, { "epoch": 3.0227479526842584, "grad_norm": 0.8261260812574088, "learning_rate": 3.3884610522373724e-07, "loss": 0.02, "step": 6644 }, { "epoch": 3.0232029117379438, "grad_norm": 0.7742553020210611, "learning_rate": 3.3871080965625356e-07, "loss": 0.0106, "step": 6645 }, { "epoch": 3.0236578707916286, "grad_norm": 0.38215169517656056, "learning_rate": 3.3857552726859397e-07, "loss": 0.0059, "step": 6646 }, { "epoch": 3.024112829845314, "grad_norm": 0.47080541563773115, "learning_rate": 3.384402580718132e-07, "loss": 0.0103, "step": 6647 }, { "epoch": 3.0245677888989992, "grad_norm": 0.4326949183326038, "learning_rate": 3.3830500207696516e-07, "loss": 0.0126, "step": 6648 }, { "epoch": 3.025022747952684, "grad_norm": 0.6939377981970374, "learning_rate": 3.3816975929510203e-07, "loss": 0.0061, "step": 6649 }, { "epoch": 3.0254777070063694, "grad_norm": 0.4874207046836561, "learning_rate": 3.380345297372755e-07, "loss": 0.0108, "step": 6650 }, { "epoch": 3.0259326660600547, "grad_norm": 0.4542551080347085, "learning_rate": 3.3789931341453557e-07, "loss": 0.0039, "step": 6651 }, { "epoch": 3.0263876251137396, "grad_norm": 0.625766993907575, "learning_rate": 3.3776411033793196e-07, "loss": 0.0135, "step": 6652 }, { "epoch": 3.026842584167425, "grad_norm": 0.9139022395158278, "learning_rate": 3.376289205185124e-07, "loss": 0.0123, "step": 6653 }, { "epoch": 3.02729754322111, "grad_norm": 0.5080052927733057, "learning_rate": 3.3749374396732413e-07, "loss": 0.0076, "step": 6654 }, { "epoch": 3.027752502274795, "grad_norm": 0.6016665680254912, "learning_rate": 3.373585806954134e-07, "loss": 0.0147, "step": 6655 }, { "epoch": 3.0282074613284804, "grad_norm": 0.5988329412473324, "learning_rate": 3.3722343071382484e-07, "loss": 0.0094, "step": 6656 }, { "epoch": 3.0286624203821657, "grad_norm": 0.48797793561270847, "learning_rate": 3.370882940336026e-07, "loss": 0.0069, "step": 6657 }, { "epoch": 3.0291173794358506, "grad_norm": 1.0359427456644994, "learning_rate": 3.3695317066578915e-07, "loss": 0.0155, "step": 6658 }, { "epoch": 3.029572338489536, "grad_norm": 0.6970575820273368, "learning_rate": 3.368180606214264e-07, "loss": 0.0155, "step": 6659 }, { "epoch": 3.030027297543221, "grad_norm": 0.5364047029744029, "learning_rate": 3.366829639115547e-07, "loss": 0.0119, "step": 6660 }, { "epoch": 3.030482256596906, "grad_norm": 0.5795152896630739, "learning_rate": 3.365478805472135e-07, "loss": 0.0165, "step": 6661 }, { "epoch": 3.0309372156505914, "grad_norm": 0.5715816406386546, "learning_rate": 3.3641281053944157e-07, "loss": 0.0107, "step": 6662 }, { "epoch": 3.0313921747042767, "grad_norm": 0.5312819920944789, "learning_rate": 3.3627775389927597e-07, "loss": 0.0122, "step": 6663 }, { "epoch": 3.031847133757962, "grad_norm": 0.7212398001322251, "learning_rate": 3.36142710637753e-07, "loss": 0.0165, "step": 6664 }, { "epoch": 3.032302092811647, "grad_norm": 0.4915934607842643, "learning_rate": 3.360076807659077e-07, "loss": 0.0104, "step": 6665 }, { "epoch": 3.032757051865332, "grad_norm": 0.5306809966529901, "learning_rate": 3.358726642947742e-07, "loss": 0.0107, "step": 6666 }, { "epoch": 3.0332120109190175, "grad_norm": 0.6808232926864404, "learning_rate": 3.357376612353853e-07, "loss": 0.0078, "step": 6667 }, { "epoch": 3.0336669699727024, "grad_norm": 0.6145276889170098, "learning_rate": 3.3560267159877285e-07, "loss": 0.0133, "step": 6668 }, { "epoch": 3.0341219290263877, "grad_norm": 0.5999561463738833, "learning_rate": 3.3546769539596787e-07, "loss": 0.0112, "step": 6669 }, { "epoch": 3.034576888080073, "grad_norm": 0.5449688332672441, "learning_rate": 3.3533273263799965e-07, "loss": 0.0074, "step": 6670 }, { "epoch": 3.035031847133758, "grad_norm": 0.6186617542367082, "learning_rate": 3.35197783335897e-07, "loss": 0.0146, "step": 6671 }, { "epoch": 3.035486806187443, "grad_norm": 0.6920579637646879, "learning_rate": 3.3506284750068715e-07, "loss": 0.019, "step": 6672 }, { "epoch": 3.0359417652411285, "grad_norm": 0.30321078902533194, "learning_rate": 3.349279251433967e-07, "loss": 0.0025, "step": 6673 }, { "epoch": 3.0363967242948133, "grad_norm": 0.6283274983197713, "learning_rate": 3.347930162750505e-07, "loss": 0.0103, "step": 6674 }, { "epoch": 3.0368516833484986, "grad_norm": 0.9085521970559205, "learning_rate": 3.34658120906673e-07, "loss": 0.0091, "step": 6675 }, { "epoch": 3.037306642402184, "grad_norm": 0.5065439594259585, "learning_rate": 3.3452323904928737e-07, "loss": 0.0061, "step": 6676 }, { "epoch": 3.037761601455869, "grad_norm": 0.9052797673897295, "learning_rate": 3.3438837071391524e-07, "loss": 0.0098, "step": 6677 }, { "epoch": 3.038216560509554, "grad_norm": 0.9908599114701945, "learning_rate": 3.342535159115776e-07, "loss": 0.0208, "step": 6678 }, { "epoch": 3.0386715195632394, "grad_norm": 0.9013917372393708, "learning_rate": 3.3411867465329416e-07, "loss": 0.0152, "step": 6679 }, { "epoch": 3.0391264786169243, "grad_norm": 0.5598225492879491, "learning_rate": 3.339838469500835e-07, "loss": 0.0133, "step": 6680 }, { "epoch": 3.0395814376706096, "grad_norm": 0.5031604515634037, "learning_rate": 3.338490328129629e-07, "loss": 0.0137, "step": 6681 }, { "epoch": 3.040036396724295, "grad_norm": 0.8203900721555899, "learning_rate": 3.337142322529493e-07, "loss": 0.0062, "step": 6682 }, { "epoch": 3.04049135577798, "grad_norm": 0.6513632812637099, "learning_rate": 3.3357944528105765e-07, "loss": 0.0149, "step": 6683 }, { "epoch": 3.040946314831665, "grad_norm": 0.9281973617686845, "learning_rate": 3.334446719083022e-07, "loss": 0.0173, "step": 6684 }, { "epoch": 3.0414012738853504, "grad_norm": 0.4031413317891459, "learning_rate": 3.333099121456959e-07, "loss": 0.0069, "step": 6685 }, { "epoch": 3.0418562329390353, "grad_norm": 0.6661940125625572, "learning_rate": 3.33175166004251e-07, "loss": 0.0131, "step": 6686 }, { "epoch": 3.0423111919927206, "grad_norm": 0.5598187875931081, "learning_rate": 3.3304043349497795e-07, "loss": 0.0084, "step": 6687 }, { "epoch": 3.042766151046406, "grad_norm": 0.4034957549711382, "learning_rate": 3.329057146288866e-07, "loss": 0.0071, "step": 6688 }, { "epoch": 3.0432211101000908, "grad_norm": 0.5265564481574747, "learning_rate": 3.3277100941698594e-07, "loss": 0.0084, "step": 6689 }, { "epoch": 3.043676069153776, "grad_norm": 0.5863813469171462, "learning_rate": 3.32636317870283e-07, "loss": 0.0094, "step": 6690 }, { "epoch": 3.0441310282074614, "grad_norm": 0.5090576745779628, "learning_rate": 3.3250163999978455e-07, "loss": 0.0055, "step": 6691 }, { "epoch": 3.0445859872611467, "grad_norm": 1.00380567267612, "learning_rate": 3.3236697581649555e-07, "loss": 0.0131, "step": 6692 }, { "epoch": 3.0450409463148316, "grad_norm": 0.5832337273600335, "learning_rate": 3.322323253314203e-07, "loss": 0.011, "step": 6693 }, { "epoch": 3.045495905368517, "grad_norm": 0.49826067477680236, "learning_rate": 3.3209768855556176e-07, "loss": 0.0051, "step": 6694 }, { "epoch": 3.045950864422202, "grad_norm": 0.5469117241703197, "learning_rate": 3.319630654999217e-07, "loss": 0.0153, "step": 6695 }, { "epoch": 3.046405823475887, "grad_norm": 50.1787273996158, "learning_rate": 3.318284561755013e-07, "loss": 0.0731, "step": 6696 }, { "epoch": 3.0468607825295724, "grad_norm": 0.562372071141693, "learning_rate": 3.3169386059329994e-07, "loss": 0.0054, "step": 6697 }, { "epoch": 3.0473157415832577, "grad_norm": 0.5350267534955565, "learning_rate": 3.3155927876431633e-07, "loss": 0.0115, "step": 6698 }, { "epoch": 3.0477707006369426, "grad_norm": 0.4649874116625519, "learning_rate": 3.3142471069954765e-07, "loss": 0.0044, "step": 6699 }, { "epoch": 3.048225659690628, "grad_norm": 0.5229389388349703, "learning_rate": 3.3129015640999045e-07, "loss": 0.01, "step": 6700 }, { "epoch": 3.048680618744313, "grad_norm": 0.8051730319049246, "learning_rate": 3.3115561590663965e-07, "loss": 0.0145, "step": 6701 }, { "epoch": 3.049135577797998, "grad_norm": 0.6451559980030516, "learning_rate": 3.310210892004893e-07, "loss": 0.011, "step": 6702 }, { "epoch": 3.0495905368516834, "grad_norm": 0.530270744109496, "learning_rate": 3.3088657630253273e-07, "loss": 0.0055, "step": 6703 }, { "epoch": 3.0500454959053687, "grad_norm": 0.7402609427736223, "learning_rate": 3.307520772237613e-07, "loss": 0.0125, "step": 6704 }, { "epoch": 3.0505004549590535, "grad_norm": 0.7416004188818148, "learning_rate": 3.306175919751659e-07, "loss": 0.013, "step": 6705 }, { "epoch": 3.050955414012739, "grad_norm": 0.39320196246548883, "learning_rate": 3.3048312056773587e-07, "loss": 0.0091, "step": 6706 }, { "epoch": 3.051410373066424, "grad_norm": 0.9429416893621407, "learning_rate": 3.3034866301245976e-07, "loss": 0.0074, "step": 6707 }, { "epoch": 3.051865332120109, "grad_norm": 0.6172497656280636, "learning_rate": 3.302142193203247e-07, "loss": 0.0125, "step": 6708 }, { "epoch": 3.0523202911737943, "grad_norm": 0.6220893687750039, "learning_rate": 3.3007978950231677e-07, "loss": 0.0155, "step": 6709 }, { "epoch": 3.0527752502274796, "grad_norm": 0.43129709841661945, "learning_rate": 3.299453735694213e-07, "loss": 0.0049, "step": 6710 }, { "epoch": 3.0532302092811645, "grad_norm": 0.6652372919300106, "learning_rate": 3.298109715326219e-07, "loss": 0.0075, "step": 6711 }, { "epoch": 3.05368516833485, "grad_norm": 0.5754501122355281, "learning_rate": 3.296765834029014e-07, "loss": 0.0066, "step": 6712 }, { "epoch": 3.054140127388535, "grad_norm": 0.4028256458270475, "learning_rate": 3.295422091912412e-07, "loss": 0.007, "step": 6713 }, { "epoch": 3.05459508644222, "grad_norm": 0.8903143257461451, "learning_rate": 3.2940784890862194e-07, "loss": 0.0092, "step": 6714 }, { "epoch": 3.0550500454959053, "grad_norm": 0.573020278745889, "learning_rate": 3.2927350256602287e-07, "loss": 0.0101, "step": 6715 }, { "epoch": 3.0555050045495906, "grad_norm": 0.5550160471338459, "learning_rate": 3.2913917017442203e-07, "loss": 0.007, "step": 6716 }, { "epoch": 3.055959963603276, "grad_norm": 0.7608884562020368, "learning_rate": 3.290048517447969e-07, "loss": 0.0238, "step": 6717 }, { "epoch": 3.056414922656961, "grad_norm": 0.39050902093303125, "learning_rate": 3.288705472881228e-07, "loss": 0.0041, "step": 6718 }, { "epoch": 3.056869881710646, "grad_norm": 0.37091567957053845, "learning_rate": 3.28736256815375e-07, "loss": 0.0054, "step": 6719 }, { "epoch": 3.0573248407643314, "grad_norm": 0.5296358063044514, "learning_rate": 3.2860198033752685e-07, "loss": 0.0041, "step": 6720 }, { "epoch": 3.0577797998180163, "grad_norm": 1.1006986416256654, "learning_rate": 3.2846771786555073e-07, "loss": 0.0162, "step": 6721 }, { "epoch": 3.0582347588717016, "grad_norm": 0.4644747770600145, "learning_rate": 3.283334694104182e-07, "loss": 0.0039, "step": 6722 }, { "epoch": 3.058689717925387, "grad_norm": 0.6787005097262938, "learning_rate": 3.2819923498309896e-07, "loss": 0.0075, "step": 6723 }, { "epoch": 3.0591446769790718, "grad_norm": 0.3729054567060059, "learning_rate": 3.2806501459456267e-07, "loss": 0.004, "step": 6724 }, { "epoch": 3.059599636032757, "grad_norm": 0.4249667249272563, "learning_rate": 3.2793080825577697e-07, "loss": 0.006, "step": 6725 }, { "epoch": 3.0600545950864424, "grad_norm": 0.6732052627820803, "learning_rate": 3.277966159777085e-07, "loss": 0.0156, "step": 6726 }, { "epoch": 3.0605095541401273, "grad_norm": 0.5723554118565466, "learning_rate": 3.27662437771323e-07, "loss": 0.0067, "step": 6727 }, { "epoch": 3.0609645131938126, "grad_norm": 0.7656777594393143, "learning_rate": 3.2752827364758464e-07, "loss": 0.004, "step": 6728 }, { "epoch": 3.061419472247498, "grad_norm": 0.7505438282243289, "learning_rate": 3.27394123617457e-07, "loss": 0.0105, "step": 6729 }, { "epoch": 3.0618744313011828, "grad_norm": 0.5224791902366941, "learning_rate": 3.272599876919019e-07, "loss": 0.0073, "step": 6730 }, { "epoch": 3.062329390354868, "grad_norm": 0.72214074646181, "learning_rate": 3.271258658818807e-07, "loss": 0.005, "step": 6731 }, { "epoch": 3.0627843494085534, "grad_norm": 0.535513445620707, "learning_rate": 3.2699175819835306e-07, "loss": 0.0047, "step": 6732 }, { "epoch": 3.0632393084622382, "grad_norm": 0.9389496146576154, "learning_rate": 3.2685766465227757e-07, "loss": 0.0117, "step": 6733 }, { "epoch": 3.0636942675159236, "grad_norm": 0.5623552778554418, "learning_rate": 3.2672358525461195e-07, "loss": 0.0128, "step": 6734 }, { "epoch": 3.064149226569609, "grad_norm": 0.6157277375208229, "learning_rate": 3.265895200163123e-07, "loss": 0.01, "step": 6735 }, { "epoch": 3.0646041856232937, "grad_norm": 0.9444410167527825, "learning_rate": 3.264554689483341e-07, "loss": 0.0075, "step": 6736 }, { "epoch": 3.065059144676979, "grad_norm": 0.540265114482255, "learning_rate": 3.26321432061631e-07, "loss": 0.0075, "step": 6737 }, { "epoch": 3.0655141037306644, "grad_norm": 0.7469729373398118, "learning_rate": 3.261874093671563e-07, "loss": 0.0118, "step": 6738 }, { "epoch": 3.065969062784349, "grad_norm": 0.473146242482063, "learning_rate": 3.2605340087586164e-07, "loss": 0.0075, "step": 6739 }, { "epoch": 3.0664240218380345, "grad_norm": 0.662694246576719, "learning_rate": 3.2591940659869746e-07, "loss": 0.01, "step": 6740 }, { "epoch": 3.06687898089172, "grad_norm": 0.619648162944712, "learning_rate": 3.257854265466132e-07, "loss": 0.0129, "step": 6741 }, { "epoch": 3.0673339399454047, "grad_norm": 1.5049652270253266, "learning_rate": 3.2565146073055714e-07, "loss": 0.0223, "step": 6742 }, { "epoch": 3.06778889899909, "grad_norm": 0.787441406593484, "learning_rate": 3.2551750916147654e-07, "loss": 0.0219, "step": 6743 }, { "epoch": 3.0682438580527753, "grad_norm": 0.8956591659380425, "learning_rate": 3.2538357185031684e-07, "loss": 0.0122, "step": 6744 }, { "epoch": 3.06869881710646, "grad_norm": 0.52093681662737, "learning_rate": 3.252496488080232e-07, "loss": 0.0084, "step": 6745 }, { "epoch": 3.0691537761601455, "grad_norm": 0.38320387310051757, "learning_rate": 3.251157400455392e-07, "loss": 0.003, "step": 6746 }, { "epoch": 3.069608735213831, "grad_norm": 0.6436695288639919, "learning_rate": 3.24981845573807e-07, "loss": 0.012, "step": 6747 }, { "epoch": 3.070063694267516, "grad_norm": 0.39549914456688345, "learning_rate": 3.248479654037682e-07, "loss": 0.0037, "step": 6748 }, { "epoch": 3.070518653321201, "grad_norm": 0.6266168167853018, "learning_rate": 3.2471409954636256e-07, "loss": 0.0111, "step": 6749 }, { "epoch": 3.0709736123748863, "grad_norm": 0.6262381430612551, "learning_rate": 3.2458024801252916e-07, "loss": 0.0124, "step": 6750 }, { "epoch": 3.0714285714285716, "grad_norm": 0.7321867669131162, "learning_rate": 3.2444641081320555e-07, "loss": 0.0078, "step": 6751 }, { "epoch": 3.0718835304822565, "grad_norm": 0.6704984208113155, "learning_rate": 3.243125879593286e-07, "loss": 0.0109, "step": 6752 }, { "epoch": 3.072338489535942, "grad_norm": 0.786940119062094, "learning_rate": 3.241787794618336e-07, "loss": 0.0096, "step": 6753 }, { "epoch": 3.072793448589627, "grad_norm": 0.6788810343907328, "learning_rate": 3.2404498533165476e-07, "loss": 0.0049, "step": 6754 }, { "epoch": 3.073248407643312, "grad_norm": 0.38823012464590817, "learning_rate": 3.2391120557972494e-07, "loss": 0.0041, "step": 6755 }, { "epoch": 3.0737033666969973, "grad_norm": 0.47351290746742913, "learning_rate": 3.237774402169764e-07, "loss": 0.0051, "step": 6756 }, { "epoch": 3.0741583257506826, "grad_norm": 0.623111708736343, "learning_rate": 3.236436892543395e-07, "loss": 0.0124, "step": 6757 }, { "epoch": 3.0746132848043675, "grad_norm": 0.627261852814593, "learning_rate": 3.235099527027438e-07, "loss": 0.0118, "step": 6758 }, { "epoch": 3.0750682438580528, "grad_norm": 0.8828751614886426, "learning_rate": 3.233762305731179e-07, "loss": 0.0199, "step": 6759 }, { "epoch": 3.075523202911738, "grad_norm": 0.6784923731463897, "learning_rate": 3.2324252287638874e-07, "loss": 0.0069, "step": 6760 }, { "epoch": 3.075978161965423, "grad_norm": 0.4956647357306148, "learning_rate": 3.231088296234825e-07, "loss": 0.0086, "step": 6761 }, { "epoch": 3.0764331210191083, "grad_norm": 0.4903438159492423, "learning_rate": 3.229751508253238e-07, "loss": 0.0091, "step": 6762 }, { "epoch": 3.0768880800727936, "grad_norm": 0.8056422493257048, "learning_rate": 3.2284148649283637e-07, "loss": 0.012, "step": 6763 }, { "epoch": 3.0773430391264784, "grad_norm": 0.8852364428930187, "learning_rate": 3.227078366369425e-07, "loss": 0.0132, "step": 6764 }, { "epoch": 3.0777979981801638, "grad_norm": 0.5947225793938115, "learning_rate": 3.2257420126856355e-07, "loss": 0.0114, "step": 6765 }, { "epoch": 3.078252957233849, "grad_norm": 0.6343107661796383, "learning_rate": 3.2244058039861975e-07, "loss": 0.0169, "step": 6766 }, { "epoch": 3.078707916287534, "grad_norm": 0.5040717302942254, "learning_rate": 3.223069740380299e-07, "loss": 0.0037, "step": 6767 }, { "epoch": 3.0791628753412192, "grad_norm": 0.5088643218190166, "learning_rate": 3.2217338219771164e-07, "loss": 0.0104, "step": 6768 }, { "epoch": 3.0796178343949046, "grad_norm": 0.7861232630746053, "learning_rate": 3.220398048885815e-07, "loss": 0.0086, "step": 6769 }, { "epoch": 3.0800727934485894, "grad_norm": 0.5116954686289514, "learning_rate": 3.219062421215549e-07, "loss": 0.0131, "step": 6770 }, { "epoch": 3.0805277525022747, "grad_norm": 0.6645506790839965, "learning_rate": 3.2177269390754586e-07, "loss": 0.0112, "step": 6771 }, { "epoch": 3.08098271155596, "grad_norm": 0.651259632596792, "learning_rate": 3.216391602574673e-07, "loss": 0.0081, "step": 6772 }, { "epoch": 3.0814376706096454, "grad_norm": 0.5148827190555807, "learning_rate": 3.215056411822313e-07, "loss": 0.0066, "step": 6773 }, { "epoch": 3.08189262966333, "grad_norm": 0.8779238392076609, "learning_rate": 3.213721366927481e-07, "loss": 0.0129, "step": 6774 }, { "epoch": 3.0823475887170155, "grad_norm": 0.6709712911224796, "learning_rate": 3.2123864679992727e-07, "loss": 0.0084, "step": 6775 }, { "epoch": 3.082802547770701, "grad_norm": 0.7005811814255861, "learning_rate": 3.2110517151467697e-07, "loss": 0.0147, "step": 6776 }, { "epoch": 3.0832575068243857, "grad_norm": 0.593617565149211, "learning_rate": 3.209717108479042e-07, "loss": 0.0071, "step": 6777 }, { "epoch": 3.083712465878071, "grad_norm": 0.4733119455107835, "learning_rate": 3.2083826481051467e-07, "loss": 0.0068, "step": 6778 }, { "epoch": 3.0841674249317563, "grad_norm": 0.7669367506107335, "learning_rate": 3.207048334134129e-07, "loss": 0.0059, "step": 6779 }, { "epoch": 3.084622383985441, "grad_norm": 0.7047289470470138, "learning_rate": 3.2057141666750264e-07, "loss": 0.0211, "step": 6780 }, { "epoch": 3.0850773430391265, "grad_norm": 1.1859142811396415, "learning_rate": 3.2043801458368593e-07, "loss": 0.0149, "step": 6781 }, { "epoch": 3.085532302092812, "grad_norm": 0.5979697483076442, "learning_rate": 3.203046271728638e-07, "loss": 0.0051, "step": 6782 }, { "epoch": 3.0859872611464967, "grad_norm": 0.7051047263760248, "learning_rate": 3.201712544459359e-07, "loss": 0.013, "step": 6783 }, { "epoch": 3.086442220200182, "grad_norm": 0.750243665222291, "learning_rate": 3.2003789641380115e-07, "loss": 0.0063, "step": 6784 }, { "epoch": 3.0868971792538673, "grad_norm": 0.5510423917003686, "learning_rate": 3.199045530873566e-07, "loss": 0.0046, "step": 6785 }, { "epoch": 3.087352138307552, "grad_norm": 0.8919387522284473, "learning_rate": 3.1977122447749873e-07, "loss": 0.012, "step": 6786 }, { "epoch": 3.0878070973612375, "grad_norm": 0.7440066144782832, "learning_rate": 3.1963791059512255e-07, "loss": 0.0144, "step": 6787 }, { "epoch": 3.088262056414923, "grad_norm": 0.7125610001429161, "learning_rate": 3.195046114511218e-07, "loss": 0.008, "step": 6788 }, { "epoch": 3.0887170154686077, "grad_norm": 0.7301656154223086, "learning_rate": 3.1937132705638913e-07, "loss": 0.0091, "step": 6789 }, { "epoch": 3.089171974522293, "grad_norm": 0.4911882472396973, "learning_rate": 3.19238057421816e-07, "loss": 0.0082, "step": 6790 }, { "epoch": 3.0896269335759783, "grad_norm": 0.4278181605752241, "learning_rate": 3.1910480255829233e-07, "loss": 0.0069, "step": 6791 }, { "epoch": 3.090081892629663, "grad_norm": 0.7291382173095039, "learning_rate": 3.1897156247670734e-07, "loss": 0.0081, "step": 6792 }, { "epoch": 3.0905368516833485, "grad_norm": 0.8336552923395493, "learning_rate": 3.1883833718794863e-07, "loss": 0.0109, "step": 6793 }, { "epoch": 3.0909918107370338, "grad_norm": 0.5217352154753857, "learning_rate": 3.1870512670290296e-07, "loss": 0.0087, "step": 6794 }, { "epoch": 3.0914467697907186, "grad_norm": 0.7181465554973746, "learning_rate": 3.1857193103245565e-07, "loss": 0.0238, "step": 6795 }, { "epoch": 3.091901728844404, "grad_norm": 0.4038621057477677, "learning_rate": 3.184387501874908e-07, "loss": 0.0051, "step": 6796 }, { "epoch": 3.0923566878980893, "grad_norm": 0.39998423460899685, "learning_rate": 3.183055841788914e-07, "loss": 0.0048, "step": 6797 }, { "epoch": 3.092811646951774, "grad_norm": 0.3354707591818286, "learning_rate": 3.181724330175391e-07, "loss": 0.0044, "step": 6798 }, { "epoch": 3.0932666060054594, "grad_norm": 0.7941661402901072, "learning_rate": 3.180392967143145e-07, "loss": 0.019, "step": 6799 }, { "epoch": 3.0937215650591448, "grad_norm": 0.5418451179147028, "learning_rate": 3.1790617528009666e-07, "loss": 0.0063, "step": 6800 }, { "epoch": 3.0941765241128296, "grad_norm": 0.9622289556875069, "learning_rate": 3.177730687257639e-07, "loss": 0.0098, "step": 6801 }, { "epoch": 3.094631483166515, "grad_norm": 4.003385852080918, "learning_rate": 3.1763997706219324e-07, "loss": 0.0167, "step": 6802 }, { "epoch": 3.0950864422202002, "grad_norm": 0.6709454120521843, "learning_rate": 3.1750690030025996e-07, "loss": 0.0133, "step": 6803 }, { "epoch": 3.0955414012738856, "grad_norm": 0.694874438977747, "learning_rate": 3.173738384508388e-07, "loss": 0.0177, "step": 6804 }, { "epoch": 3.0959963603275704, "grad_norm": 0.5627923297079523, "learning_rate": 3.1724079152480267e-07, "loss": 0.0101, "step": 6805 }, { "epoch": 3.0964513193812557, "grad_norm": 0.3936803292730665, "learning_rate": 3.1710775953302384e-07, "loss": 0.0059, "step": 6806 }, { "epoch": 3.096906278434941, "grad_norm": 0.7351928905174511, "learning_rate": 3.1697474248637273e-07, "loss": 0.0102, "step": 6807 }, { "epoch": 3.097361237488626, "grad_norm": 0.5726724027632721, "learning_rate": 3.1684174039571927e-07, "loss": 0.0034, "step": 6808 }, { "epoch": 3.097816196542311, "grad_norm": 0.7571871371563089, "learning_rate": 3.1670875327193173e-07, "loss": 0.0072, "step": 6809 }, { "epoch": 3.0982711555959965, "grad_norm": 0.4000610249220219, "learning_rate": 3.165757811258771e-07, "loss": 0.0039, "step": 6810 }, { "epoch": 3.0987261146496814, "grad_norm": 0.7397841145395466, "learning_rate": 3.164428239684213e-07, "loss": 0.0194, "step": 6811 }, { "epoch": 3.0991810737033667, "grad_norm": 0.6098709758908476, "learning_rate": 3.1630988181042894e-07, "loss": 0.0106, "step": 6812 }, { "epoch": 3.099636032757052, "grad_norm": 0.41424221818888235, "learning_rate": 3.161769546627636e-07, "loss": 0.0044, "step": 6813 }, { "epoch": 3.100090991810737, "grad_norm": 0.658900989792767, "learning_rate": 3.160440425362872e-07, "loss": 0.0146, "step": 6814 }, { "epoch": 3.100545950864422, "grad_norm": 1.4784759861440957, "learning_rate": 3.1591114544186104e-07, "loss": 0.0167, "step": 6815 }, { "epoch": 3.1010009099181075, "grad_norm": 0.5713979146306908, "learning_rate": 3.1577826339034474e-07, "loss": 0.008, "step": 6816 }, { "epoch": 3.1014558689717924, "grad_norm": 0.5240412943094832, "learning_rate": 3.156453963925968e-07, "loss": 0.0087, "step": 6817 }, { "epoch": 3.1019108280254777, "grad_norm": 1.0089285989306354, "learning_rate": 3.155125444594746e-07, "loss": 0.0161, "step": 6818 }, { "epoch": 3.102365787079163, "grad_norm": 0.5602027189665995, "learning_rate": 3.15379707601834e-07, "loss": 0.0087, "step": 6819 }, { "epoch": 3.102820746132848, "grad_norm": 0.8611367599415184, "learning_rate": 3.152468858305301e-07, "loss": 0.0224, "step": 6820 }, { "epoch": 3.103275705186533, "grad_norm": 0.803336133699869, "learning_rate": 3.1511407915641617e-07, "loss": 0.0145, "step": 6821 }, { "epoch": 3.1037306642402185, "grad_norm": 1.152995864262738, "learning_rate": 3.149812875903448e-07, "loss": 0.0107, "step": 6822 }, { "epoch": 3.1041856232939034, "grad_norm": 0.7614073076874751, "learning_rate": 3.148485111431672e-07, "loss": 0.0064, "step": 6823 }, { "epoch": 3.1046405823475887, "grad_norm": 1.1282087775946414, "learning_rate": 3.14715749825733e-07, "loss": 0.0194, "step": 6824 }, { "epoch": 3.105095541401274, "grad_norm": 0.5159951780968302, "learning_rate": 3.145830036488911e-07, "loss": 0.0074, "step": 6825 }, { "epoch": 3.105550500454959, "grad_norm": 0.5570274280048253, "learning_rate": 3.144502726234889e-07, "loss": 0.0088, "step": 6826 }, { "epoch": 3.106005459508644, "grad_norm": 0.27837827521679975, "learning_rate": 3.1431755676037227e-07, "loss": 0.0013, "step": 6827 }, { "epoch": 3.1064604185623295, "grad_norm": 0.8227434426214503, "learning_rate": 3.141848560703862e-07, "loss": 0.0235, "step": 6828 }, { "epoch": 3.1069153776160148, "grad_norm": 0.648652567428713, "learning_rate": 3.1405217056437495e-07, "loss": 0.0127, "step": 6829 }, { "epoch": 3.1073703366696996, "grad_norm": 1.007468727809265, "learning_rate": 3.1391950025318035e-07, "loss": 0.0135, "step": 6830 }, { "epoch": 3.107825295723385, "grad_norm": 0.5276639401416432, "learning_rate": 3.137868451476441e-07, "loss": 0.0045, "step": 6831 }, { "epoch": 3.1082802547770703, "grad_norm": 0.39509657947362553, "learning_rate": 3.136542052586057e-07, "loss": 0.005, "step": 6832 }, { "epoch": 3.108735213830755, "grad_norm": 0.5169560631787699, "learning_rate": 3.1352158059690426e-07, "loss": 0.0086, "step": 6833 }, { "epoch": 3.1091901728844404, "grad_norm": 0.9629075204215112, "learning_rate": 3.1338897117337706e-07, "loss": 0.0093, "step": 6834 }, { "epoch": 3.1096451319381258, "grad_norm": 0.8668788550272866, "learning_rate": 3.132563769988602e-07, "loss": 0.0091, "step": 6835 }, { "epoch": 3.1101000909918106, "grad_norm": 0.6992101284235932, "learning_rate": 3.131237980841892e-07, "loss": 0.0154, "step": 6836 }, { "epoch": 3.110555050045496, "grad_norm": 0.7912868747286784, "learning_rate": 3.1299123444019733e-07, "loss": 0.019, "step": 6837 }, { "epoch": 3.1110100090991812, "grad_norm": 0.658573322738286, "learning_rate": 3.1285868607771735e-07, "loss": 0.0083, "step": 6838 }, { "epoch": 3.111464968152866, "grad_norm": 0.7071006722439873, "learning_rate": 3.1272615300758035e-07, "loss": 0.0244, "step": 6839 }, { "epoch": 3.1119199272065514, "grad_norm": 0.5780326761684019, "learning_rate": 3.1259363524061657e-07, "loss": 0.0091, "step": 6840 }, { "epoch": 3.1123748862602367, "grad_norm": 0.4538867684026786, "learning_rate": 3.124611327876544e-07, "loss": 0.0063, "step": 6841 }, { "epoch": 3.1128298453139216, "grad_norm": 0.38523813286960135, "learning_rate": 3.1232864565952145e-07, "loss": 0.0082, "step": 6842 }, { "epoch": 3.113284804367607, "grad_norm": 0.4119011752036484, "learning_rate": 3.121961738670443e-07, "loss": 0.0038, "step": 6843 }, { "epoch": 3.113739763421292, "grad_norm": 0.654581983211711, "learning_rate": 3.120637174210475e-07, "loss": 0.0073, "step": 6844 }, { "epoch": 3.114194722474977, "grad_norm": 1.0527982187921685, "learning_rate": 3.119312763323553e-07, "loss": 0.0203, "step": 6845 }, { "epoch": 3.1146496815286624, "grad_norm": 0.5152458354907089, "learning_rate": 3.1179885061178965e-07, "loss": 0.0077, "step": 6846 }, { "epoch": 3.1151046405823477, "grad_norm": 0.5977398341536653, "learning_rate": 3.116664402701721e-07, "loss": 0.0112, "step": 6847 }, { "epoch": 3.1155595996360326, "grad_norm": 0.9963799657940597, "learning_rate": 3.115340453183225e-07, "loss": 0.0081, "step": 6848 }, { "epoch": 3.116014558689718, "grad_norm": 0.7403084447099846, "learning_rate": 3.114016657670595e-07, "loss": 0.017, "step": 6849 }, { "epoch": 3.116469517743403, "grad_norm": 0.8867986618284401, "learning_rate": 3.112693016272009e-07, "loss": 0.0175, "step": 6850 }, { "epoch": 3.116924476797088, "grad_norm": 0.49815278867463547, "learning_rate": 3.1113695290956253e-07, "loss": 0.0066, "step": 6851 }, { "epoch": 3.1173794358507734, "grad_norm": 0.4731320483765953, "learning_rate": 3.110046196249596e-07, "loss": 0.0061, "step": 6852 }, { "epoch": 3.1178343949044587, "grad_norm": 0.4847748418366025, "learning_rate": 3.1087230178420556e-07, "loss": 0.0069, "step": 6853 }, { "epoch": 3.1182893539581436, "grad_norm": 0.5112699476078019, "learning_rate": 3.10739999398113e-07, "loss": 0.0038, "step": 6854 }, { "epoch": 3.118744313011829, "grad_norm": 0.49052101814351345, "learning_rate": 3.1060771247749287e-07, "loss": 0.0085, "step": 6855 }, { "epoch": 3.119199272065514, "grad_norm": 0.6221176266831538, "learning_rate": 3.104754410331551e-07, "loss": 0.0087, "step": 6856 }, { "epoch": 3.1196542311191995, "grad_norm": 0.7260632075217174, "learning_rate": 3.1034318507590863e-07, "loss": 0.0256, "step": 6857 }, { "epoch": 3.1201091901728844, "grad_norm": 1.1355024877566302, "learning_rate": 3.102109446165605e-07, "loss": 0.0167, "step": 6858 }, { "epoch": 3.1205641492265697, "grad_norm": 0.4611155053872938, "learning_rate": 3.1007871966591697e-07, "loss": 0.0034, "step": 6859 }, { "epoch": 3.121019108280255, "grad_norm": 1.0633994980917842, "learning_rate": 3.099465102347827e-07, "loss": 0.0054, "step": 6860 }, { "epoch": 3.12147406733394, "grad_norm": 0.6792107682076106, "learning_rate": 3.098143163339615e-07, "loss": 0.0226, "step": 6861 }, { "epoch": 3.121929026387625, "grad_norm": 0.7379760308402212, "learning_rate": 3.096821379742554e-07, "loss": 0.0189, "step": 6862 }, { "epoch": 3.1223839854413105, "grad_norm": 0.43508046251646004, "learning_rate": 3.095499751664653e-07, "loss": 0.0078, "step": 6863 }, { "epoch": 3.1228389444949953, "grad_norm": 1.1379133486580457, "learning_rate": 3.0941782792139136e-07, "loss": 0.0197, "step": 6864 }, { "epoch": 3.1232939035486806, "grad_norm": 0.4038126046330777, "learning_rate": 3.09285696249832e-07, "loss": 0.002, "step": 6865 }, { "epoch": 3.123748862602366, "grad_norm": 0.7115549190912024, "learning_rate": 3.0915358016258406e-07, "loss": 0.0115, "step": 6866 }, { "epoch": 3.124203821656051, "grad_norm": 0.546708914201207, "learning_rate": 3.090214796704439e-07, "loss": 0.006, "step": 6867 }, { "epoch": 3.124658780709736, "grad_norm": 0.5329643221637764, "learning_rate": 3.088893947842058e-07, "loss": 0.0094, "step": 6868 }, { "epoch": 3.1251137397634214, "grad_norm": 0.2894801284869644, "learning_rate": 3.0875732551466336e-07, "loss": 0.0025, "step": 6869 }, { "epoch": 3.1255686988171063, "grad_norm": 0.4986431471601534, "learning_rate": 3.0862527187260855e-07, "loss": 0.0084, "step": 6870 }, { "epoch": 3.1260236578707916, "grad_norm": 0.5798939981544861, "learning_rate": 3.084932338688323e-07, "loss": 0.0078, "step": 6871 }, { "epoch": 3.126478616924477, "grad_norm": 0.6523092838426897, "learning_rate": 3.083612115141243e-07, "loss": 0.0166, "step": 6872 }, { "epoch": 3.126933575978162, "grad_norm": 0.6994398817790285, "learning_rate": 3.0822920481927253e-07, "loss": 0.0189, "step": 6873 }, { "epoch": 3.127388535031847, "grad_norm": 0.632778698178046, "learning_rate": 3.0809721379506426e-07, "loss": 0.011, "step": 6874 }, { "epoch": 3.1278434940855324, "grad_norm": 0.6119040357400813, "learning_rate": 3.0796523845228484e-07, "loss": 0.0128, "step": 6875 }, { "epoch": 3.1282984531392173, "grad_norm": 0.6579893025088016, "learning_rate": 3.078332788017191e-07, "loss": 0.0174, "step": 6876 }, { "epoch": 3.1287534121929026, "grad_norm": 0.8287220679666086, "learning_rate": 3.077013348541498e-07, "loss": 0.0062, "step": 6877 }, { "epoch": 3.129208371246588, "grad_norm": 0.6068497682173851, "learning_rate": 3.0756940662035905e-07, "loss": 0.0136, "step": 6878 }, { "epoch": 3.1296633303002728, "grad_norm": 1.3733680967821582, "learning_rate": 3.074374941111275e-07, "loss": 0.0135, "step": 6879 }, { "epoch": 3.130118289353958, "grad_norm": 0.9085639970640971, "learning_rate": 3.073055973372343e-07, "loss": 0.0117, "step": 6880 }, { "epoch": 3.1305732484076434, "grad_norm": 0.7534221246084873, "learning_rate": 3.0717371630945756e-07, "loss": 0.0114, "step": 6881 }, { "epoch": 3.1310282074613287, "grad_norm": 0.43296802626466274, "learning_rate": 3.070418510385738e-07, "loss": 0.0138, "step": 6882 }, { "epoch": 3.1314831665150136, "grad_norm": 0.5368095628001898, "learning_rate": 3.0691000153535864e-07, "loss": 0.0075, "step": 6883 }, { "epoch": 3.131938125568699, "grad_norm": 0.957400093414906, "learning_rate": 3.0677816781058604e-07, "loss": 0.0222, "step": 6884 }, { "epoch": 3.132393084622384, "grad_norm": 0.46164250508445637, "learning_rate": 3.0664634987502903e-07, "loss": 0.0032, "step": 6885 }, { "epoch": 3.132848043676069, "grad_norm": 0.6696183595113716, "learning_rate": 3.065145477394592e-07, "loss": 0.0073, "step": 6886 }, { "epoch": 3.1333030027297544, "grad_norm": 0.5207751580334334, "learning_rate": 3.0638276141464675e-07, "loss": 0.0065, "step": 6887 }, { "epoch": 3.1337579617834397, "grad_norm": 0.5456452953762729, "learning_rate": 3.0625099091136073e-07, "loss": 0.0065, "step": 6888 }, { "epoch": 3.1342129208371245, "grad_norm": 1.0656593436547857, "learning_rate": 3.0611923624036865e-07, "loss": 0.0119, "step": 6889 }, { "epoch": 3.13466787989081, "grad_norm": 0.9573478663187959, "learning_rate": 3.059874974124371e-07, "loss": 0.0184, "step": 6890 }, { "epoch": 3.135122838944495, "grad_norm": 0.5804662118931646, "learning_rate": 3.05855774438331e-07, "loss": 0.0112, "step": 6891 }, { "epoch": 3.13557779799818, "grad_norm": 5.541634096203593, "learning_rate": 3.057240673288143e-07, "loss": 0.0382, "step": 6892 }, { "epoch": 3.1360327570518653, "grad_norm": 0.2881172841326112, "learning_rate": 3.055923760946496e-07, "loss": 0.0021, "step": 6893 }, { "epoch": 3.1364877161055507, "grad_norm": 1.9461201895847065, "learning_rate": 3.054607007465979e-07, "loss": 0.0132, "step": 6894 }, { "epoch": 3.1369426751592355, "grad_norm": 0.5548578972067522, "learning_rate": 3.0532904129541927e-07, "loss": 0.0135, "step": 6895 }, { "epoch": 3.137397634212921, "grad_norm": 0.6836213186488107, "learning_rate": 3.0519739775187233e-07, "loss": 0.018, "step": 6896 }, { "epoch": 3.137852593266606, "grad_norm": 0.5087442466445914, "learning_rate": 3.0506577012671414e-07, "loss": 0.0072, "step": 6897 }, { "epoch": 3.138307552320291, "grad_norm": 0.43183453819708034, "learning_rate": 3.049341584307008e-07, "loss": 0.0041, "step": 6898 }, { "epoch": 3.1387625113739763, "grad_norm": 0.6531833291841179, "learning_rate": 3.0480256267458737e-07, "loss": 0.0103, "step": 6899 }, { "epoch": 3.1392174704276616, "grad_norm": 0.7985455526737666, "learning_rate": 3.046709828691269e-07, "loss": 0.0185, "step": 6900 }, { "epoch": 3.1396724294813465, "grad_norm": 0.8422581432635537, "learning_rate": 3.0453941902507176e-07, "loss": 0.0163, "step": 6901 }, { "epoch": 3.140127388535032, "grad_norm": 0.6394897686265315, "learning_rate": 3.044078711531724e-07, "loss": 0.0058, "step": 6902 }, { "epoch": 3.140582347588717, "grad_norm": 0.538738031380634, "learning_rate": 3.0427633926417875e-07, "loss": 0.0096, "step": 6903 }, { "epoch": 3.141037306642402, "grad_norm": 0.8085156588818975, "learning_rate": 3.0414482336883853e-07, "loss": 0.0177, "step": 6904 }, { "epoch": 3.1414922656960873, "grad_norm": 0.41275780274134755, "learning_rate": 3.0401332347789887e-07, "loss": 0.0048, "step": 6905 }, { "epoch": 3.1419472247497726, "grad_norm": 0.4957260360782058, "learning_rate": 3.0388183960210553e-07, "loss": 0.0123, "step": 6906 }, { "epoch": 3.1424021838034575, "grad_norm": 0.7977673084149164, "learning_rate": 3.037503717522024e-07, "loss": 0.0154, "step": 6907 }, { "epoch": 3.142857142857143, "grad_norm": 0.9012589167329692, "learning_rate": 3.0361891993893286e-07, "loss": 0.0154, "step": 6908 }, { "epoch": 3.143312101910828, "grad_norm": 0.6702252111375034, "learning_rate": 3.034874841730382e-07, "loss": 0.0145, "step": 6909 }, { "epoch": 3.143767060964513, "grad_norm": 1.9470627596598173, "learning_rate": 3.0335606446525895e-07, "loss": 0.0211, "step": 6910 }, { "epoch": 3.1442220200181983, "grad_norm": 0.3728763570455775, "learning_rate": 3.03224660826334e-07, "loss": 0.0051, "step": 6911 }, { "epoch": 3.1446769790718836, "grad_norm": 0.41942440809618287, "learning_rate": 3.0309327326700105e-07, "loss": 0.002, "step": 6912 }, { "epoch": 3.145131938125569, "grad_norm": 1.1852408451522631, "learning_rate": 3.0296190179799685e-07, "loss": 0.0168, "step": 6913 }, { "epoch": 3.1455868971792538, "grad_norm": 0.46959356262015683, "learning_rate": 3.0283054643005603e-07, "loss": 0.0075, "step": 6914 }, { "epoch": 3.146041856232939, "grad_norm": 0.5338522295498084, "learning_rate": 3.0269920717391266e-07, "loss": 0.0062, "step": 6915 }, { "epoch": 3.1464968152866244, "grad_norm": 0.4602823923019775, "learning_rate": 3.0256788404029896e-07, "loss": 0.0041, "step": 6916 }, { "epoch": 3.1469517743403093, "grad_norm": 0.7495993513911848, "learning_rate": 3.0243657703994636e-07, "loss": 0.016, "step": 6917 }, { "epoch": 3.1474067333939946, "grad_norm": 0.6608852378746654, "learning_rate": 3.023052861835843e-07, "loss": 0.0052, "step": 6918 }, { "epoch": 3.14786169244768, "grad_norm": 0.7052852322887178, "learning_rate": 3.021740114819414e-07, "loss": 0.0053, "step": 6919 }, { "epoch": 3.1483166515013647, "grad_norm": 0.6180484551972766, "learning_rate": 3.020427529457452e-07, "loss": 0.0085, "step": 6920 }, { "epoch": 3.14877161055505, "grad_norm": 0.7500081212522827, "learning_rate": 3.0191151058572107e-07, "loss": 0.01, "step": 6921 }, { "epoch": 3.1492265696087354, "grad_norm": 0.6683748513060295, "learning_rate": 3.0178028441259384e-07, "loss": 0.0125, "step": 6922 }, { "epoch": 3.1496815286624202, "grad_norm": 0.4468794769033468, "learning_rate": 3.0164907443708656e-07, "loss": 0.0095, "step": 6923 }, { "epoch": 3.1501364877161055, "grad_norm": 0.7090968652684269, "learning_rate": 3.015178806699212e-07, "loss": 0.0116, "step": 6924 }, { "epoch": 3.150591446769791, "grad_norm": 0.5889477065850045, "learning_rate": 3.0138670312181825e-07, "loss": 0.0068, "step": 6925 }, { "epoch": 3.1510464058234757, "grad_norm": 0.5918714702868326, "learning_rate": 3.012555418034969e-07, "loss": 0.0035, "step": 6926 }, { "epoch": 3.151501364877161, "grad_norm": 0.6600451207974737, "learning_rate": 3.0112439672567525e-07, "loss": 0.0169, "step": 6927 }, { "epoch": 3.1519563239308463, "grad_norm": 0.5958470034158946, "learning_rate": 3.0099326789906977e-07, "loss": 0.0093, "step": 6928 }, { "epoch": 3.152411282984531, "grad_norm": 0.978680125705459, "learning_rate": 3.0086215533439584e-07, "loss": 0.0086, "step": 6929 }, { "epoch": 3.1528662420382165, "grad_norm": 0.39262359433914173, "learning_rate": 3.0073105904236716e-07, "loss": 0.0042, "step": 6930 }, { "epoch": 3.153321201091902, "grad_norm": 41.75388476137673, "learning_rate": 3.0059997903369657e-07, "loss": 0.1005, "step": 6931 }, { "epoch": 3.1537761601455867, "grad_norm": 0.7387253835365555, "learning_rate": 3.004689153190951e-07, "loss": 0.0146, "step": 6932 }, { "epoch": 3.154231119199272, "grad_norm": 0.6190983551344623, "learning_rate": 3.003378679092726e-07, "loss": 0.0137, "step": 6933 }, { "epoch": 3.1546860782529573, "grad_norm": 0.555167768284947, "learning_rate": 3.002068368149382e-07, "loss": 0.0069, "step": 6934 }, { "epoch": 3.1551410373066426, "grad_norm": 0.8682345760518599, "learning_rate": 3.000758220467988e-07, "loss": 0.0132, "step": 6935 }, { "epoch": 3.1555959963603275, "grad_norm": 0.5863912842657882, "learning_rate": 2.9994482361556023e-07, "loss": 0.0069, "step": 6936 }, { "epoch": 3.156050955414013, "grad_norm": 0.61846720687382, "learning_rate": 2.9981384153192737e-07, "loss": 0.0099, "step": 6937 }, { "epoch": 3.156505914467698, "grad_norm": 1.1401490573496313, "learning_rate": 2.996828758066032e-07, "loss": 0.0113, "step": 6938 }, { "epoch": 3.156960873521383, "grad_norm": 0.7097708722668408, "learning_rate": 2.995519264502899e-07, "loss": 0.0139, "step": 6939 }, { "epoch": 3.1574158325750683, "grad_norm": 0.8297543277120073, "learning_rate": 2.994209934736879e-07, "loss": 0.0072, "step": 6940 }, { "epoch": 3.1578707916287536, "grad_norm": 0.6359478727474985, "learning_rate": 2.992900768874964e-07, "loss": 0.0072, "step": 6941 }, { "epoch": 3.1583257506824385, "grad_norm": 0.9972453977319509, "learning_rate": 2.9915917670241363e-07, "loss": 0.0076, "step": 6942 }, { "epoch": 3.158780709736124, "grad_norm": 0.44140032136522395, "learning_rate": 2.9902829292913585e-07, "loss": 0.0045, "step": 6943 }, { "epoch": 3.159235668789809, "grad_norm": 0.8842041034180442, "learning_rate": 2.9889742557835853e-07, "loss": 0.0152, "step": 6944 }, { "epoch": 3.159690627843494, "grad_norm": 0.5664010873680372, "learning_rate": 2.987665746607752e-07, "loss": 0.0097, "step": 6945 }, { "epoch": 3.1601455868971793, "grad_norm": 0.692600632284697, "learning_rate": 2.986357401870788e-07, "loss": 0.014, "step": 6946 }, { "epoch": 3.1606005459508646, "grad_norm": 0.905335637586323, "learning_rate": 2.985049221679601e-07, "loss": 0.0212, "step": 6947 }, { "epoch": 3.1610555050045495, "grad_norm": 0.7891783320244719, "learning_rate": 2.983741206141094e-07, "loss": 0.0147, "step": 6948 }, { "epoch": 3.1615104640582348, "grad_norm": 0.7202436934097389, "learning_rate": 2.9824333553621514e-07, "loss": 0.01, "step": 6949 }, { "epoch": 3.16196542311192, "grad_norm": 0.41082479917094566, "learning_rate": 2.981125669449642e-07, "loss": 0.0045, "step": 6950 }, { "epoch": 3.162420382165605, "grad_norm": 0.6161755634980176, "learning_rate": 2.979818148510427e-07, "loss": 0.0122, "step": 6951 }, { "epoch": 3.1628753412192903, "grad_norm": 0.5979472515979359, "learning_rate": 2.9785107926513485e-07, "loss": 0.0083, "step": 6952 }, { "epoch": 3.1633303002729756, "grad_norm": 1.0258236035377115, "learning_rate": 2.977203601979241e-07, "loss": 0.0073, "step": 6953 }, { "epoch": 3.1637852593266604, "grad_norm": 0.6236156857935559, "learning_rate": 2.975896576600918e-07, "loss": 0.0047, "step": 6954 }, { "epoch": 3.1642402183803457, "grad_norm": 0.6710874374285318, "learning_rate": 2.9745897166231866e-07, "loss": 0.0088, "step": 6955 }, { "epoch": 3.164695177434031, "grad_norm": 0.4124686826897309, "learning_rate": 2.973283022152838e-07, "loss": 0.0027, "step": 6956 }, { "epoch": 3.165150136487716, "grad_norm": 0.6850075234757611, "learning_rate": 2.9719764932966473e-07, "loss": 0.0116, "step": 6957 }, { "epoch": 3.1656050955414012, "grad_norm": 0.9605377048957099, "learning_rate": 2.97067013016138e-07, "loss": 0.0111, "step": 6958 }, { "epoch": 3.1660600545950865, "grad_norm": 0.3548753769035255, "learning_rate": 2.969363932853784e-07, "loss": 0.0033, "step": 6959 }, { "epoch": 3.1665150136487714, "grad_norm": 0.7140222162001357, "learning_rate": 2.968057901480598e-07, "loss": 0.0147, "step": 6960 }, { "epoch": 3.1669699727024567, "grad_norm": 0.4609645324332707, "learning_rate": 2.9667520361485433e-07, "loss": 0.0065, "step": 6961 }, { "epoch": 3.167424931756142, "grad_norm": 0.9017258094668167, "learning_rate": 2.9654463369643304e-07, "loss": 0.0185, "step": 6962 }, { "epoch": 3.167879890809827, "grad_norm": 0.7085655737058197, "learning_rate": 2.964140804034656e-07, "loss": 0.0234, "step": 6963 }, { "epoch": 3.168334849863512, "grad_norm": 0.5281863232190027, "learning_rate": 2.9628354374662e-07, "loss": 0.0052, "step": 6964 }, { "epoch": 3.1687898089171975, "grad_norm": 1.0485321854122607, "learning_rate": 2.9615302373656336e-07, "loss": 0.0105, "step": 6965 }, { "epoch": 3.1692447679708824, "grad_norm": 0.8125100275368128, "learning_rate": 2.960225203839609e-07, "loss": 0.0117, "step": 6966 }, { "epoch": 3.1696997270245677, "grad_norm": 0.41687562390340055, "learning_rate": 2.958920336994771e-07, "loss": 0.0077, "step": 6967 }, { "epoch": 3.170154686078253, "grad_norm": 0.7246190914044239, "learning_rate": 2.9576156369377435e-07, "loss": 0.0178, "step": 6968 }, { "epoch": 3.1706096451319383, "grad_norm": 1.339780656365326, "learning_rate": 2.9563111037751434e-07, "loss": 0.0146, "step": 6969 }, { "epoch": 3.171064604185623, "grad_norm": 0.6859318485486272, "learning_rate": 2.955006737613572e-07, "loss": 0.0114, "step": 6970 }, { "epoch": 3.1715195632393085, "grad_norm": 0.5911426980220161, "learning_rate": 2.9537025385596146e-07, "loss": 0.01, "step": 6971 }, { "epoch": 3.171974522292994, "grad_norm": 0.25205642238454856, "learning_rate": 2.9523985067198435e-07, "loss": 0.0018, "step": 6972 }, { "epoch": 3.1724294813466787, "grad_norm": 0.4482720786350044, "learning_rate": 2.951094642200821e-07, "loss": 0.0102, "step": 6973 }, { "epoch": 3.172884440400364, "grad_norm": 0.848290851600301, "learning_rate": 2.9497909451090905e-07, "loss": 0.0095, "step": 6974 }, { "epoch": 3.1733393994540493, "grad_norm": 0.7250752337008111, "learning_rate": 2.9484874155511846e-07, "loss": 0.0194, "step": 6975 }, { "epoch": 3.173794358507734, "grad_norm": 0.524132912140072, "learning_rate": 2.9471840536336244e-07, "loss": 0.0038, "step": 6976 }, { "epoch": 3.1742493175614195, "grad_norm": 1.019150609602373, "learning_rate": 2.9458808594629116e-07, "loss": 0.0214, "step": 6977 }, { "epoch": 3.174704276615105, "grad_norm": 0.8046878403438714, "learning_rate": 2.9445778331455395e-07, "loss": 0.0115, "step": 6978 }, { "epoch": 3.1751592356687897, "grad_norm": 1.047443883112004, "learning_rate": 2.9432749747879844e-07, "loss": 0.0096, "step": 6979 }, { "epoch": 3.175614194722475, "grad_norm": 0.7471757377414372, "learning_rate": 2.941972284496711e-07, "loss": 0.0075, "step": 6980 }, { "epoch": 3.1760691537761603, "grad_norm": 0.5120841523336457, "learning_rate": 2.9406697623781676e-07, "loss": 0.0147, "step": 6981 }, { "epoch": 3.176524112829845, "grad_norm": 0.7967008437556853, "learning_rate": 2.9393674085387896e-07, "loss": 0.0101, "step": 6982 }, { "epoch": 3.1769790718835305, "grad_norm": 0.5769523588414878, "learning_rate": 2.9380652230850035e-07, "loss": 0.0068, "step": 6983 }, { "epoch": 3.1774340309372158, "grad_norm": 0.5930405194006083, "learning_rate": 2.936763206123215e-07, "loss": 0.0145, "step": 6984 }, { "epoch": 3.1778889899909006, "grad_norm": 0.8581862635772907, "learning_rate": 2.9354613577598206e-07, "loss": 0.0022, "step": 6985 }, { "epoch": 3.178343949044586, "grad_norm": 0.867509887910242, "learning_rate": 2.9341596781012e-07, "loss": 0.0121, "step": 6986 }, { "epoch": 3.1787989080982713, "grad_norm": 0.5027323191667913, "learning_rate": 2.9328581672537225e-07, "loss": 0.0054, "step": 6987 }, { "epoch": 3.179253867151956, "grad_norm": 0.7138525413628588, "learning_rate": 2.931556825323739e-07, "loss": 0.0077, "step": 6988 }, { "epoch": 3.1797088262056414, "grad_norm": 0.5294499861163717, "learning_rate": 2.93025565241759e-07, "loss": 0.009, "step": 6989 }, { "epoch": 3.1801637852593267, "grad_norm": 0.5637626848110558, "learning_rate": 2.9289546486416037e-07, "loss": 0.0083, "step": 6990 }, { "epoch": 3.180618744313012, "grad_norm": 0.5787039325621606, "learning_rate": 2.92765381410209e-07, "loss": 0.0101, "step": 6991 }, { "epoch": 3.181073703366697, "grad_norm": 0.7225616081288804, "learning_rate": 2.9263531489053497e-07, "loss": 0.0185, "step": 6992 }, { "epoch": 3.1815286624203822, "grad_norm": 0.565293896714827, "learning_rate": 2.9250526531576636e-07, "loss": 0.0209, "step": 6993 }, { "epoch": 3.1819836214740675, "grad_norm": 0.8123842262644864, "learning_rate": 2.923752326965306e-07, "loss": 0.0065, "step": 6994 }, { "epoch": 3.1824385805277524, "grad_norm": 0.6105744650708106, "learning_rate": 2.9224521704345306e-07, "loss": 0.0089, "step": 6995 }, { "epoch": 3.1828935395814377, "grad_norm": 1.1877183889304368, "learning_rate": 2.9211521836715804e-07, "loss": 0.016, "step": 6996 }, { "epoch": 3.183348498635123, "grad_norm": 0.4546533285447663, "learning_rate": 2.919852366782688e-07, "loss": 0.0096, "step": 6997 }, { "epoch": 3.183803457688808, "grad_norm": 0.6232423299385483, "learning_rate": 2.9185527198740665e-07, "loss": 0.0134, "step": 6998 }, { "epoch": 3.184258416742493, "grad_norm": 0.7289621774179096, "learning_rate": 2.917253243051915e-07, "loss": 0.0101, "step": 6999 }, { "epoch": 3.1847133757961785, "grad_norm": 0.631438071403099, "learning_rate": 2.915953936422425e-07, "loss": 0.0077, "step": 7000 }, { "epoch": 3.1851683348498634, "grad_norm": 0.6296003231803972, "learning_rate": 2.9146548000917677e-07, "loss": 0.0141, "step": 7001 }, { "epoch": 3.1856232939035487, "grad_norm": 1.123593204376574, "learning_rate": 2.913355834166102e-07, "loss": 0.0169, "step": 7002 }, { "epoch": 3.186078252957234, "grad_norm": 0.781993738661398, "learning_rate": 2.9120570387515735e-07, "loss": 0.0253, "step": 7003 }, { "epoch": 3.186533212010919, "grad_norm": 0.5022983561860366, "learning_rate": 2.9107584139543144e-07, "loss": 0.0075, "step": 7004 }, { "epoch": 3.186988171064604, "grad_norm": 0.6616261606176709, "learning_rate": 2.909459959880445e-07, "loss": 0.0115, "step": 7005 }, { "epoch": 3.1874431301182895, "grad_norm": 0.6078809361765763, "learning_rate": 2.908161676636066e-07, "loss": 0.0145, "step": 7006 }, { "epoch": 3.1878980891719744, "grad_norm": 0.5384371022128519, "learning_rate": 2.906863564327269e-07, "loss": 0.0111, "step": 7007 }, { "epoch": 3.1883530482256597, "grad_norm": 0.5905581615792942, "learning_rate": 2.905565623060129e-07, "loss": 0.0046, "step": 7008 }, { "epoch": 3.188808007279345, "grad_norm": 0.7191458530989125, "learning_rate": 2.904267852940705e-07, "loss": 0.0142, "step": 7009 }, { "epoch": 3.18926296633303, "grad_norm": 1.0034015242884005, "learning_rate": 2.902970254075049e-07, "loss": 0.007, "step": 7010 }, { "epoch": 3.189717925386715, "grad_norm": 1.1282869529891872, "learning_rate": 2.9016728265691947e-07, "loss": 0.0092, "step": 7011 }, { "epoch": 3.1901728844404005, "grad_norm": 0.5926164905149551, "learning_rate": 2.9003755705291616e-07, "loss": 0.009, "step": 7012 }, { "epoch": 3.1906278434940853, "grad_norm": 1.0506685471987784, "learning_rate": 2.899078486060955e-07, "loss": 0.02, "step": 7013 }, { "epoch": 3.1910828025477707, "grad_norm": 0.6568179921249279, "learning_rate": 2.8977815732705644e-07, "loss": 0.0139, "step": 7014 }, { "epoch": 3.191537761601456, "grad_norm": 0.3388725277882291, "learning_rate": 2.8964848322639725e-07, "loss": 0.0019, "step": 7015 }, { "epoch": 3.191992720655141, "grad_norm": 0.9103404053637792, "learning_rate": 2.895188263147141e-07, "loss": 0.0054, "step": 7016 }, { "epoch": 3.192447679708826, "grad_norm": 0.7120190900587068, "learning_rate": 2.893891866026017e-07, "loss": 0.0099, "step": 7017 }, { "epoch": 3.1929026387625115, "grad_norm": 0.8633824063742191, "learning_rate": 2.8925956410065413e-07, "loss": 0.0094, "step": 7018 }, { "epoch": 3.1933575978161963, "grad_norm": 0.5579094602852307, "learning_rate": 2.89129958819463e-07, "loss": 0.0103, "step": 7019 }, { "epoch": 3.1938125568698816, "grad_norm": 0.4621880936370116, "learning_rate": 2.8900037076961956e-07, "loss": 0.0079, "step": 7020 }, { "epoch": 3.194267515923567, "grad_norm": 0.5432407006643083, "learning_rate": 2.8887079996171295e-07, "loss": 0.009, "step": 7021 }, { "epoch": 3.194722474977252, "grad_norm": 1.7843623925860026, "learning_rate": 2.887412464063311e-07, "loss": 0.029, "step": 7022 }, { "epoch": 3.195177434030937, "grad_norm": 1.6088466983312664, "learning_rate": 2.886117101140605e-07, "loss": 0.0327, "step": 7023 }, { "epoch": 3.1956323930846224, "grad_norm": 0.9314590254624394, "learning_rate": 2.884821910954862e-07, "loss": 0.0269, "step": 7024 }, { "epoch": 3.1960873521383077, "grad_norm": 0.5741866517749178, "learning_rate": 2.883526893611923e-07, "loss": 0.0078, "step": 7025 }, { "epoch": 3.1965423111919926, "grad_norm": 0.7760034344191445, "learning_rate": 2.882232049217608e-07, "loss": 0.012, "step": 7026 }, { "epoch": 3.196997270245678, "grad_norm": 0.9883461123166765, "learning_rate": 2.8809373778777255e-07, "loss": 0.0125, "step": 7027 }, { "epoch": 3.1974522292993632, "grad_norm": 0.30821163822480163, "learning_rate": 2.8796428796980696e-07, "loss": 0.0026, "step": 7028 }, { "epoch": 3.197907188353048, "grad_norm": 0.5737921947687522, "learning_rate": 2.878348554784424e-07, "loss": 0.0068, "step": 7029 }, { "epoch": 3.1983621474067334, "grad_norm": 0.34203907939322387, "learning_rate": 2.877054403242554e-07, "loss": 0.0045, "step": 7030 }, { "epoch": 3.1988171064604187, "grad_norm": 0.3933458886127644, "learning_rate": 2.8757604251782075e-07, "loss": 0.0046, "step": 7031 }, { "epoch": 3.1992720655141036, "grad_norm": 0.3819289439165039, "learning_rate": 2.874466620697129e-07, "loss": 0.005, "step": 7032 }, { "epoch": 3.199727024567789, "grad_norm": 0.5749189872991979, "learning_rate": 2.873172989905037e-07, "loss": 0.0077, "step": 7033 }, { "epoch": 3.200181983621474, "grad_norm": 0.4619779816450111, "learning_rate": 2.871879532907646e-07, "loss": 0.0056, "step": 7034 }, { "epoch": 3.200636942675159, "grad_norm": 0.764088758579783, "learning_rate": 2.870586249810649e-07, "loss": 0.0185, "step": 7035 }, { "epoch": 3.2010919017288444, "grad_norm": 0.6418338016507192, "learning_rate": 2.8692931407197275e-07, "loss": 0.0135, "step": 7036 }, { "epoch": 3.2015468607825297, "grad_norm": 0.5925576047965659, "learning_rate": 2.8680002057405463e-07, "loss": 0.0085, "step": 7037 }, { "epoch": 3.2020018198362146, "grad_norm": 0.5113638171744681, "learning_rate": 2.8667074449787597e-07, "loss": 0.0086, "step": 7038 }, { "epoch": 3.2024567788899, "grad_norm": 0.7119678240766164, "learning_rate": 2.8654148585400097e-07, "loss": 0.012, "step": 7039 }, { "epoch": 3.202911737943585, "grad_norm": 0.5187329219527674, "learning_rate": 2.8641224465299175e-07, "loss": 0.004, "step": 7040 }, { "epoch": 3.20336669699727, "grad_norm": 0.8593253777829386, "learning_rate": 2.8628302090540934e-07, "loss": 0.0066, "step": 7041 }, { "epoch": 3.2038216560509554, "grad_norm": 1.0370200515628418, "learning_rate": 2.8615381462181326e-07, "loss": 0.0155, "step": 7042 }, { "epoch": 3.2042766151046407, "grad_norm": 0.6265087405685263, "learning_rate": 2.860246258127616e-07, "loss": 0.0037, "step": 7043 }, { "epoch": 3.2047315741583255, "grad_norm": 0.6251319732639563, "learning_rate": 2.8589545448881134e-07, "loss": 0.0072, "step": 7044 }, { "epoch": 3.205186533212011, "grad_norm": 0.6735681041518333, "learning_rate": 2.8576630066051753e-07, "loss": 0.0093, "step": 7045 }, { "epoch": 3.205641492265696, "grad_norm": 0.5045679385869635, "learning_rate": 2.8563716433843433e-07, "loss": 0.0071, "step": 7046 }, { "epoch": 3.2060964513193815, "grad_norm": 0.5916935692216051, "learning_rate": 2.8550804553311404e-07, "loss": 0.013, "step": 7047 }, { "epoch": 3.2065514103730663, "grad_norm": 0.9012763270006251, "learning_rate": 2.853789442551074e-07, "loss": 0.0117, "step": 7048 }, { "epoch": 3.2070063694267517, "grad_norm": 0.6432889684375712, "learning_rate": 2.8524986051496433e-07, "loss": 0.0091, "step": 7049 }, { "epoch": 3.207461328480437, "grad_norm": 0.8920224720859603, "learning_rate": 2.85120794323233e-07, "loss": 0.0158, "step": 7050 }, { "epoch": 3.207916287534122, "grad_norm": 0.7170457377898541, "learning_rate": 2.849917456904599e-07, "loss": 0.011, "step": 7051 }, { "epoch": 3.208371246587807, "grad_norm": 0.6807504911410209, "learning_rate": 2.848627146271902e-07, "loss": 0.0142, "step": 7052 }, { "epoch": 3.2088262056414925, "grad_norm": 0.7311330920823996, "learning_rate": 2.847337011439679e-07, "loss": 0.0092, "step": 7053 }, { "epoch": 3.2092811646951773, "grad_norm": 0.41927054329947083, "learning_rate": 2.846047052513356e-07, "loss": 0.0051, "step": 7054 }, { "epoch": 3.2097361237488626, "grad_norm": 0.6803272028243141, "learning_rate": 2.844757269598341e-07, "loss": 0.0113, "step": 7055 }, { "epoch": 3.210191082802548, "grad_norm": 0.80011045767563, "learning_rate": 2.843467662800029e-07, "loss": 0.0122, "step": 7056 }, { "epoch": 3.210646041856233, "grad_norm": 0.6770937902544324, "learning_rate": 2.842178232223798e-07, "loss": 0.0096, "step": 7057 }, { "epoch": 3.211101000909918, "grad_norm": 0.8377469614301952, "learning_rate": 2.8408889779750203e-07, "loss": 0.0241, "step": 7058 }, { "epoch": 3.2115559599636034, "grad_norm": 0.5258828611057097, "learning_rate": 2.8395999001590413e-07, "loss": 0.0056, "step": 7059 }, { "epoch": 3.2120109190172883, "grad_norm": 0.6213137943286348, "learning_rate": 2.8383109988812054e-07, "loss": 0.0102, "step": 7060 }, { "epoch": 3.2124658780709736, "grad_norm": 0.3968008910847791, "learning_rate": 2.837022274246832e-07, "loss": 0.0035, "step": 7061 }, { "epoch": 3.212920837124659, "grad_norm": 0.9248173665991971, "learning_rate": 2.835733726361229e-07, "loss": 0.0157, "step": 7062 }, { "epoch": 3.213375796178344, "grad_norm": 0.5264650971480169, "learning_rate": 2.8344453553296937e-07, "loss": 0.0072, "step": 7063 }, { "epoch": 3.213830755232029, "grad_norm": 0.7681986655742449, "learning_rate": 2.8331571612575045e-07, "loss": 0.0192, "step": 7064 }, { "epoch": 3.2142857142857144, "grad_norm": 0.3237313130686364, "learning_rate": 2.831869144249927e-07, "loss": 0.0038, "step": 7065 }, { "epoch": 3.2147406733393993, "grad_norm": 0.29627555120000904, "learning_rate": 2.8305813044122093e-07, "loss": 0.0037, "step": 7066 }, { "epoch": 3.2151956323930846, "grad_norm": 0.6304292408783815, "learning_rate": 2.829293641849591e-07, "loss": 0.0073, "step": 7067 }, { "epoch": 3.21565059144677, "grad_norm": 0.5759986231901109, "learning_rate": 2.828006156667295e-07, "loss": 0.0074, "step": 7068 }, { "epoch": 3.2161055505004548, "grad_norm": 0.5336305474725509, "learning_rate": 2.826718848970527e-07, "loss": 0.0063, "step": 7069 }, { "epoch": 3.21656050955414, "grad_norm": 0.5226485974142171, "learning_rate": 2.8254317188644815e-07, "loss": 0.0094, "step": 7070 }, { "epoch": 3.2170154686078254, "grad_norm": 0.40763375062477936, "learning_rate": 2.8241447664543327e-07, "loss": 0.0045, "step": 7071 }, { "epoch": 3.2174704276615103, "grad_norm": 0.6504991670730239, "learning_rate": 2.8228579918452494e-07, "loss": 0.0061, "step": 7072 }, { "epoch": 3.2179253867151956, "grad_norm": 0.6928509186764849, "learning_rate": 2.821571395142377e-07, "loss": 0.0111, "step": 7073 }, { "epoch": 3.218380345768881, "grad_norm": 0.5939079169693996, "learning_rate": 2.820284976450855e-07, "loss": 0.0169, "step": 7074 }, { "epoch": 3.2188353048225657, "grad_norm": 0.9069208373047899, "learning_rate": 2.8189987358758016e-07, "loss": 0.0146, "step": 7075 }, { "epoch": 3.219290263876251, "grad_norm": 0.8477030834908138, "learning_rate": 2.817712673522319e-07, "loss": 0.0147, "step": 7076 }, { "epoch": 3.2197452229299364, "grad_norm": 0.7243546111816951, "learning_rate": 2.816426789495504e-07, "loss": 0.0223, "step": 7077 }, { "epoch": 3.2202001819836217, "grad_norm": 0.33396501266707235, "learning_rate": 2.8151410839004317e-07, "loss": 0.0078, "step": 7078 }, { "epoch": 3.2206551410373065, "grad_norm": 0.5268882796544065, "learning_rate": 2.8138555568421625e-07, "loss": 0.0051, "step": 7079 }, { "epoch": 3.221110100090992, "grad_norm": 0.5354637823802656, "learning_rate": 2.812570208425743e-07, "loss": 0.0063, "step": 7080 }, { "epoch": 3.221565059144677, "grad_norm": 0.44273124851433865, "learning_rate": 2.8112850387562094e-07, "loss": 0.0103, "step": 7081 }, { "epoch": 3.222020018198362, "grad_norm": 0.5069107020600638, "learning_rate": 2.8100000479385765e-07, "loss": 0.0096, "step": 7082 }, { "epoch": 3.2224749772520473, "grad_norm": 0.5438740022904973, "learning_rate": 2.808715236077851e-07, "loss": 0.0101, "step": 7083 }, { "epoch": 3.2229299363057327, "grad_norm": 0.6424322278822633, "learning_rate": 2.8074306032790217e-07, "loss": 0.0083, "step": 7084 }, { "epoch": 3.2233848953594175, "grad_norm": 0.5001008743239447, "learning_rate": 2.806146149647062e-07, "loss": 0.0063, "step": 7085 }, { "epoch": 3.223839854413103, "grad_norm": 2.312265839430574, "learning_rate": 2.8048618752869285e-07, "loss": 0.0452, "step": 7086 }, { "epoch": 3.224294813466788, "grad_norm": 1.074554476947678, "learning_rate": 2.8035777803035696e-07, "loss": 0.0213, "step": 7087 }, { "epoch": 3.224749772520473, "grad_norm": 0.44573134867044806, "learning_rate": 2.8022938648019183e-07, "loss": 0.0089, "step": 7088 }, { "epoch": 3.2252047315741583, "grad_norm": 0.53806048353083, "learning_rate": 2.8010101288868875e-07, "loss": 0.0105, "step": 7089 }, { "epoch": 3.2256596906278436, "grad_norm": 0.7555328129057309, "learning_rate": 2.799726572663378e-07, "loss": 0.0159, "step": 7090 }, { "epoch": 3.2261146496815285, "grad_norm": 0.611543861914462, "learning_rate": 2.798443196236275e-07, "loss": 0.0096, "step": 7091 }, { "epoch": 3.226569608735214, "grad_norm": 0.5783692363491635, "learning_rate": 2.7971599997104536e-07, "loss": 0.0134, "step": 7092 }, { "epoch": 3.227024567788899, "grad_norm": 0.8525706391977254, "learning_rate": 2.7958769831907693e-07, "loss": 0.0066, "step": 7093 }, { "epoch": 3.227479526842584, "grad_norm": 0.8049086038378716, "learning_rate": 2.7945941467820624e-07, "loss": 0.0107, "step": 7094 }, { "epoch": 3.2279344858962693, "grad_norm": 0.5223246773935305, "learning_rate": 2.7933114905891637e-07, "loss": 0.0067, "step": 7095 }, { "epoch": 3.2283894449499546, "grad_norm": 1.2730712430107907, "learning_rate": 2.792029014716883e-07, "loss": 0.005, "step": 7096 }, { "epoch": 3.2288444040036395, "grad_norm": 0.6443631846717222, "learning_rate": 2.790746719270022e-07, "loss": 0.0073, "step": 7097 }, { "epoch": 3.229299363057325, "grad_norm": 0.7504536390843124, "learning_rate": 2.789464604353362e-07, "loss": 0.0154, "step": 7098 }, { "epoch": 3.22975432211101, "grad_norm": 0.46599752298530694, "learning_rate": 2.788182670071672e-07, "loss": 0.0081, "step": 7099 }, { "epoch": 3.2302092811646954, "grad_norm": 0.6085193409157936, "learning_rate": 2.786900916529704e-07, "loss": 0.0093, "step": 7100 }, { "epoch": 3.2306642402183803, "grad_norm": 0.9651390147584843, "learning_rate": 2.7856193438321984e-07, "loss": 0.0074, "step": 7101 }, { "epoch": 3.2311191992720656, "grad_norm": 0.46893390188347644, "learning_rate": 2.784337952083883e-07, "loss": 0.0065, "step": 7102 }, { "epoch": 3.231574158325751, "grad_norm": 1.7009231827016424, "learning_rate": 2.7830567413894637e-07, "loss": 0.0192, "step": 7103 }, { "epoch": 3.2320291173794358, "grad_norm": 0.4157935742727241, "learning_rate": 2.781775711853635e-07, "loss": 0.0068, "step": 7104 }, { "epoch": 3.232484076433121, "grad_norm": 0.6129600796667312, "learning_rate": 2.7804948635810766e-07, "loss": 0.0109, "step": 7105 }, { "epoch": 3.2329390354868064, "grad_norm": 0.4143673597912127, "learning_rate": 2.7792141966764566e-07, "loss": 0.0058, "step": 7106 }, { "epoch": 3.2333939945404913, "grad_norm": 0.5251345600330377, "learning_rate": 2.777933711244423e-07, "loss": 0.0095, "step": 7107 }, { "epoch": 3.2338489535941766, "grad_norm": 0.6276452237510961, "learning_rate": 2.776653407389611e-07, "loss": 0.0026, "step": 7108 }, { "epoch": 3.234303912647862, "grad_norm": 0.6781279706985008, "learning_rate": 2.775373285216642e-07, "loss": 0.0081, "step": 7109 }, { "epoch": 3.2347588717015467, "grad_norm": 0.5606998780371939, "learning_rate": 2.774093344830122e-07, "loss": 0.0055, "step": 7110 }, { "epoch": 3.235213830755232, "grad_norm": 1.0397626075178565, "learning_rate": 2.7728135863346424e-07, "loss": 0.0153, "step": 7111 }, { "epoch": 3.2356687898089174, "grad_norm": 0.5077762664539219, "learning_rate": 2.771534009834779e-07, "loss": 0.0066, "step": 7112 }, { "epoch": 3.2361237488626022, "grad_norm": 1.2533256768948533, "learning_rate": 2.7702546154350926e-07, "loss": 0.0492, "step": 7113 }, { "epoch": 3.2365787079162875, "grad_norm": 0.6862684497071293, "learning_rate": 2.76897540324013e-07, "loss": 0.0067, "step": 7114 }, { "epoch": 3.237033666969973, "grad_norm": 0.644178421251708, "learning_rate": 2.7676963733544195e-07, "loss": 0.0092, "step": 7115 }, { "epoch": 3.2374886260236577, "grad_norm": 0.6016646788039145, "learning_rate": 2.7664175258824805e-07, "loss": 0.0073, "step": 7116 }, { "epoch": 3.237943585077343, "grad_norm": 1.0760240567530757, "learning_rate": 2.765138860928817e-07, "loss": 0.0305, "step": 7117 }, { "epoch": 3.2383985441310283, "grad_norm": 0.8321637191869327, "learning_rate": 2.763860378597913e-07, "loss": 0.0088, "step": 7118 }, { "epoch": 3.238853503184713, "grad_norm": 0.7829709087152067, "learning_rate": 2.7625820789942407e-07, "loss": 0.0103, "step": 7119 }, { "epoch": 3.2393084622383985, "grad_norm": 0.9321105214383465, "learning_rate": 2.761303962222254e-07, "loss": 0.0056, "step": 7120 }, { "epoch": 3.239763421292084, "grad_norm": 0.47535637731270874, "learning_rate": 2.7600260283864007e-07, "loss": 0.0053, "step": 7121 }, { "epoch": 3.2402183803457687, "grad_norm": 0.8387712765195235, "learning_rate": 2.758748277591102e-07, "loss": 0.0073, "step": 7122 }, { "epoch": 3.240673339399454, "grad_norm": 0.7783533643812723, "learning_rate": 2.757470709940776e-07, "loss": 0.013, "step": 7123 }, { "epoch": 3.2411282984531393, "grad_norm": 0.5351070449064447, "learning_rate": 2.756193325539815e-07, "loss": 0.0044, "step": 7124 }, { "epoch": 3.241583257506824, "grad_norm": 0.48834582391651177, "learning_rate": 2.7549161244926004e-07, "loss": 0.0062, "step": 7125 }, { "epoch": 3.2420382165605095, "grad_norm": 0.42048051544487336, "learning_rate": 2.7536391069035043e-07, "loss": 0.008, "step": 7126 }, { "epoch": 3.242493175614195, "grad_norm": 1.845923730654777, "learning_rate": 2.7523622728768753e-07, "loss": 0.0316, "step": 7127 }, { "epoch": 3.2429481346678797, "grad_norm": 0.49119218601836656, "learning_rate": 2.751085622517051e-07, "loss": 0.008, "step": 7128 }, { "epoch": 3.243403093721565, "grad_norm": 0.7708219732530408, "learning_rate": 2.749809155928352e-07, "loss": 0.0099, "step": 7129 }, { "epoch": 3.2438580527752503, "grad_norm": 0.535181686900276, "learning_rate": 2.748532873215087e-07, "loss": 0.0058, "step": 7130 }, { "epoch": 3.244313011828935, "grad_norm": 0.4375365237965638, "learning_rate": 2.74725677448155e-07, "loss": 0.0054, "step": 7131 }, { "epoch": 3.2447679708826205, "grad_norm": 0.9085383059318396, "learning_rate": 2.745980859832016e-07, "loss": 0.0074, "step": 7132 }, { "epoch": 3.245222929936306, "grad_norm": 0.573373447026254, "learning_rate": 2.744705129370747e-07, "loss": 0.0066, "step": 7133 }, { "epoch": 3.245677888989991, "grad_norm": 0.5326753096823624, "learning_rate": 2.743429583201988e-07, "loss": 0.0081, "step": 7134 }, { "epoch": 3.246132848043676, "grad_norm": 0.8122253378268605, "learning_rate": 2.742154221429975e-07, "loss": 0.0178, "step": 7135 }, { "epoch": 3.2465878070973613, "grad_norm": 1.140755763164502, "learning_rate": 2.740879044158921e-07, "loss": 0.0187, "step": 7136 }, { "epoch": 3.2470427661510466, "grad_norm": 0.6903602994871945, "learning_rate": 2.7396040514930315e-07, "loss": 0.011, "step": 7137 }, { "epoch": 3.2474977252047315, "grad_norm": 0.8062065383943455, "learning_rate": 2.7383292435364904e-07, "loss": 0.0074, "step": 7138 }, { "epoch": 3.2479526842584168, "grad_norm": 0.533164985982459, "learning_rate": 2.7370546203934683e-07, "loss": 0.0069, "step": 7139 }, { "epoch": 3.248407643312102, "grad_norm": 0.6987439253373782, "learning_rate": 2.7357801821681254e-07, "loss": 0.0086, "step": 7140 }, { "epoch": 3.248862602365787, "grad_norm": 0.5580204265291397, "learning_rate": 2.7345059289646005e-07, "loss": 0.0059, "step": 7141 }, { "epoch": 3.2493175614194723, "grad_norm": 0.9430899217093408, "learning_rate": 2.73323186088702e-07, "loss": 0.0188, "step": 7142 }, { "epoch": 3.2497725204731576, "grad_norm": 0.6245868783670482, "learning_rate": 2.731957978039494e-07, "loss": 0.0048, "step": 7143 }, { "epoch": 3.2502274795268424, "grad_norm": 0.6503443243962317, "learning_rate": 2.730684280526119e-07, "loss": 0.0126, "step": 7144 }, { "epoch": 3.2506824385805277, "grad_norm": 0.3935856498957789, "learning_rate": 2.729410768450979e-07, "loss": 0.0036, "step": 7145 }, { "epoch": 3.251137397634213, "grad_norm": 0.65217313126876, "learning_rate": 2.728137441918136e-07, "loss": 0.0068, "step": 7146 }, { "epoch": 3.251592356687898, "grad_norm": 0.6350484786182593, "learning_rate": 2.726864301031643e-07, "loss": 0.0068, "step": 7147 }, { "epoch": 3.2520473157415832, "grad_norm": 1.2413889184836364, "learning_rate": 2.725591345895533e-07, "loss": 0.0081, "step": 7148 }, { "epoch": 3.2525022747952685, "grad_norm": 0.8348983580585755, "learning_rate": 2.7243185766138257e-07, "loss": 0.0211, "step": 7149 }, { "epoch": 3.2529572338489534, "grad_norm": 0.5166174259131971, "learning_rate": 2.723045993290527e-07, "loss": 0.0065, "step": 7150 }, { "epoch": 3.2534121929026387, "grad_norm": 0.5095504310870103, "learning_rate": 2.721773596029629e-07, "loss": 0.0071, "step": 7151 }, { "epoch": 3.253867151956324, "grad_norm": 0.6170355909919671, "learning_rate": 2.7205013849351043e-07, "loss": 0.0082, "step": 7152 }, { "epoch": 3.2543221110100093, "grad_norm": 0.45587063786978727, "learning_rate": 2.719229360110913e-07, "loss": 0.0098, "step": 7153 }, { "epoch": 3.254777070063694, "grad_norm": 0.9648241937708992, "learning_rate": 2.7179575216609954e-07, "loss": 0.0061, "step": 7154 }, { "epoch": 3.2552320291173795, "grad_norm": 0.7218316483815663, "learning_rate": 2.716685869689286e-07, "loss": 0.0155, "step": 7155 }, { "epoch": 3.255686988171065, "grad_norm": 0.37947051269680593, "learning_rate": 2.7154144042996964e-07, "loss": 0.0041, "step": 7156 }, { "epoch": 3.2561419472247497, "grad_norm": 0.6067926512326908, "learning_rate": 2.714143125596122e-07, "loss": 0.0089, "step": 7157 }, { "epoch": 3.256596906278435, "grad_norm": 0.49761158244269094, "learning_rate": 2.712872033682452e-07, "loss": 0.0101, "step": 7158 }, { "epoch": 3.2570518653321203, "grad_norm": 0.5443387900768771, "learning_rate": 2.7116011286625476e-07, "loss": 0.0065, "step": 7159 }, { "epoch": 3.257506824385805, "grad_norm": 0.43882291994604883, "learning_rate": 2.710330410640266e-07, "loss": 0.0091, "step": 7160 }, { "epoch": 3.2579617834394905, "grad_norm": 0.746795194614381, "learning_rate": 2.709059879719444e-07, "loss": 0.0184, "step": 7161 }, { "epoch": 3.258416742493176, "grad_norm": 0.43059793663250295, "learning_rate": 2.7077895360039025e-07, "loss": 0.0039, "step": 7162 }, { "epoch": 3.2588717015468607, "grad_norm": 0.6422814554111433, "learning_rate": 2.706519379597447e-07, "loss": 0.0086, "step": 7163 }, { "epoch": 3.259326660600546, "grad_norm": 0.586898011870795, "learning_rate": 2.7052494106038704e-07, "loss": 0.0086, "step": 7164 }, { "epoch": 3.2597816196542313, "grad_norm": 0.6131939799339864, "learning_rate": 2.703979629126951e-07, "loss": 0.0122, "step": 7165 }, { "epoch": 3.260236578707916, "grad_norm": 0.5269786942470606, "learning_rate": 2.702710035270448e-07, "loss": 0.0081, "step": 7166 }, { "epoch": 3.2606915377616015, "grad_norm": 0.6821878363480471, "learning_rate": 2.701440629138105e-07, "loss": 0.0044, "step": 7167 }, { "epoch": 3.261146496815287, "grad_norm": 0.7208433278597133, "learning_rate": 2.7001714108336534e-07, "loss": 0.0068, "step": 7168 }, { "epoch": 3.2616014558689717, "grad_norm": 0.8154665912131477, "learning_rate": 2.6989023804608093e-07, "loss": 0.0157, "step": 7169 }, { "epoch": 3.262056414922657, "grad_norm": 0.6440645399355257, "learning_rate": 2.697633538123271e-07, "loss": 0.0059, "step": 7170 }, { "epoch": 3.2625113739763423, "grad_norm": 0.7722536256956588, "learning_rate": 2.6963648839247207e-07, "loss": 0.0041, "step": 7171 }, { "epoch": 3.262966333030027, "grad_norm": 1.1716704018308537, "learning_rate": 2.695096417968831e-07, "loss": 0.0082, "step": 7172 }, { "epoch": 3.2634212920837125, "grad_norm": 0.536575105129538, "learning_rate": 2.6938281403592503e-07, "loss": 0.0039, "step": 7173 }, { "epoch": 3.2638762511373978, "grad_norm": 0.7195459336235547, "learning_rate": 2.692560051199623e-07, "loss": 0.0037, "step": 7174 }, { "epoch": 3.2643312101910826, "grad_norm": 1.0305331369338382, "learning_rate": 2.6912921505935667e-07, "loss": 0.0154, "step": 7175 }, { "epoch": 3.264786169244768, "grad_norm": 0.56437330684396, "learning_rate": 2.69002443864469e-07, "loss": 0.0057, "step": 7176 }, { "epoch": 3.2652411282984533, "grad_norm": 0.5945661848590464, "learning_rate": 2.6887569154565825e-07, "loss": 0.0086, "step": 7177 }, { "epoch": 3.265696087352138, "grad_norm": 0.5635321468952285, "learning_rate": 2.6874895811328224e-07, "loss": 0.0091, "step": 7178 }, { "epoch": 3.2661510464058234, "grad_norm": 0.6404298288167998, "learning_rate": 2.686222435776973e-07, "loss": 0.0123, "step": 7179 }, { "epoch": 3.2666060054595087, "grad_norm": 0.7054105323391612, "learning_rate": 2.6849554794925764e-07, "loss": 0.0099, "step": 7180 }, { "epoch": 3.2670609645131936, "grad_norm": 1.4106399761783412, "learning_rate": 2.683688712383164e-07, "loss": 0.0149, "step": 7181 }, { "epoch": 3.267515923566879, "grad_norm": 0.6800749429367929, "learning_rate": 2.682422134552248e-07, "loss": 0.0072, "step": 7182 }, { "epoch": 3.2679708826205642, "grad_norm": 0.5058476357297575, "learning_rate": 2.6811557461033306e-07, "loss": 0.007, "step": 7183 }, { "epoch": 3.268425841674249, "grad_norm": 0.5185112909587306, "learning_rate": 2.679889547139894e-07, "loss": 0.0045, "step": 7184 }, { "epoch": 3.2688808007279344, "grad_norm": 0.6333671867267279, "learning_rate": 2.6786235377654035e-07, "loss": 0.0114, "step": 7185 }, { "epoch": 3.2693357597816197, "grad_norm": 0.9524560570652902, "learning_rate": 2.677357718083317e-07, "loss": 0.0112, "step": 7186 }, { "epoch": 3.2697907188353046, "grad_norm": 0.7864936384053813, "learning_rate": 2.6760920881970685e-07, "loss": 0.004, "step": 7187 }, { "epoch": 3.27024567788899, "grad_norm": 0.9384489852410557, "learning_rate": 2.674826648210078e-07, "loss": 0.0123, "step": 7188 }, { "epoch": 3.270700636942675, "grad_norm": 0.90509291868233, "learning_rate": 2.673561398225755e-07, "loss": 0.0068, "step": 7189 }, { "epoch": 3.2711555959963605, "grad_norm": 0.6329832588970966, "learning_rate": 2.6722963383474884e-07, "loss": 0.0074, "step": 7190 }, { "epoch": 3.2716105550500454, "grad_norm": 0.6242843235683759, "learning_rate": 2.671031468678654e-07, "loss": 0.0151, "step": 7191 }, { "epoch": 3.2720655141037307, "grad_norm": 0.3921548554329177, "learning_rate": 2.669766789322607e-07, "loss": 0.0041, "step": 7192 }, { "epoch": 3.272520473157416, "grad_norm": 0.35782954186954014, "learning_rate": 2.668502300382696e-07, "loss": 0.0048, "step": 7193 }, { "epoch": 3.272975432211101, "grad_norm": 19.948343013448735, "learning_rate": 2.66723800196225e-07, "loss": 0.0347, "step": 7194 }, { "epoch": 3.273430391264786, "grad_norm": 0.5457432303518996, "learning_rate": 2.665973894164579e-07, "loss": 0.0093, "step": 7195 }, { "epoch": 3.2738853503184715, "grad_norm": 0.856901423521414, "learning_rate": 2.664709977092982e-07, "loss": 0.0097, "step": 7196 }, { "epoch": 3.2743403093721564, "grad_norm": 0.6742218519433386, "learning_rate": 2.663446250850737e-07, "loss": 0.0144, "step": 7197 }, { "epoch": 3.2747952684258417, "grad_norm": 0.5406220145413514, "learning_rate": 2.6621827155411147e-07, "loss": 0.009, "step": 7198 }, { "epoch": 3.275250227479527, "grad_norm": 0.6682707132532634, "learning_rate": 2.6609193712673615e-07, "loss": 0.0098, "step": 7199 }, { "epoch": 3.275705186533212, "grad_norm": 0.929697959378997, "learning_rate": 2.6596562181327163e-07, "loss": 0.0189, "step": 7200 }, { "epoch": 3.276160145586897, "grad_norm": 0.496629714296033, "learning_rate": 2.6583932562403955e-07, "loss": 0.0069, "step": 7201 }, { "epoch": 3.2766151046405825, "grad_norm": 0.7696013983499682, "learning_rate": 2.6571304856936015e-07, "loss": 0.0172, "step": 7202 }, { "epoch": 3.2770700636942673, "grad_norm": 0.9953019621912838, "learning_rate": 2.6558679065955257e-07, "loss": 0.0076, "step": 7203 }, { "epoch": 3.2775250227479527, "grad_norm": 0.22197213867828847, "learning_rate": 2.6546055190493393e-07, "loss": 0.0011, "step": 7204 }, { "epoch": 3.277979981801638, "grad_norm": 0.7533801622366176, "learning_rate": 2.6533433231581973e-07, "loss": 0.0122, "step": 7205 }, { "epoch": 3.278434940855323, "grad_norm": 0.783166769562038, "learning_rate": 2.65208131902524e-07, "loss": 0.0178, "step": 7206 }, { "epoch": 3.278889899909008, "grad_norm": 0.5522428441547137, "learning_rate": 2.6508195067535945e-07, "loss": 0.0052, "step": 7207 }, { "epoch": 3.2793448589626935, "grad_norm": 0.43844357406822587, "learning_rate": 2.6495578864463716e-07, "loss": 0.0023, "step": 7208 }, { "epoch": 3.2797998180163788, "grad_norm": 0.7233333091692035, "learning_rate": 2.6482964582066635e-07, "loss": 0.0078, "step": 7209 }, { "epoch": 3.2802547770700636, "grad_norm": 0.7541411811556891, "learning_rate": 2.647035222137549e-07, "loss": 0.0142, "step": 7210 }, { "epoch": 3.280709736123749, "grad_norm": 0.7083073698836371, "learning_rate": 2.645774178342088e-07, "loss": 0.0067, "step": 7211 }, { "epoch": 3.2811646951774343, "grad_norm": 0.7586720083893961, "learning_rate": 2.644513326923332e-07, "loss": 0.0082, "step": 7212 }, { "epoch": 3.281619654231119, "grad_norm": 0.7513920993300143, "learning_rate": 2.643252667984307e-07, "loss": 0.0093, "step": 7213 }, { "epoch": 3.2820746132848044, "grad_norm": 0.3977175734955582, "learning_rate": 2.6419922016280336e-07, "loss": 0.0064, "step": 7214 }, { "epoch": 3.2825295723384897, "grad_norm": 0.5831843736164252, "learning_rate": 2.6407319279575087e-07, "loss": 0.0131, "step": 7215 }, { "epoch": 3.2829845313921746, "grad_norm": 1.3016929718347516, "learning_rate": 2.639471847075714e-07, "loss": 0.0073, "step": 7216 }, { "epoch": 3.28343949044586, "grad_norm": 0.47153114263721696, "learning_rate": 2.638211959085622e-07, "loss": 0.0081, "step": 7217 }, { "epoch": 3.2838944494995452, "grad_norm": 0.8349521041269284, "learning_rate": 2.636952264090183e-07, "loss": 0.0104, "step": 7218 }, { "epoch": 3.28434940855323, "grad_norm": 0.8636194980291496, "learning_rate": 2.635692762192334e-07, "loss": 0.018, "step": 7219 }, { "epoch": 3.2848043676069154, "grad_norm": 0.8859434029273049, "learning_rate": 2.634433453494993e-07, "loss": 0.0206, "step": 7220 }, { "epoch": 3.2852593266606007, "grad_norm": 0.5889283150458854, "learning_rate": 2.633174338101067e-07, "loss": 0.0106, "step": 7221 }, { "epoch": 3.2857142857142856, "grad_norm": 0.5240108105237302, "learning_rate": 2.631915416113448e-07, "loss": 0.0113, "step": 7222 }, { "epoch": 3.286169244767971, "grad_norm": 0.5821666943068419, "learning_rate": 2.6306566876350067e-07, "loss": 0.0031, "step": 7223 }, { "epoch": 3.286624203821656, "grad_norm": 0.7037300348523509, "learning_rate": 2.6293981527686014e-07, "loss": 0.0081, "step": 7224 }, { "epoch": 3.287079162875341, "grad_norm": 0.5410882558816821, "learning_rate": 2.628139811617073e-07, "loss": 0.0086, "step": 7225 }, { "epoch": 3.2875341219290264, "grad_norm": 0.9485021783937182, "learning_rate": 2.626881664283247e-07, "loss": 0.0176, "step": 7226 }, { "epoch": 3.2879890809827117, "grad_norm": 0.3295328759266623, "learning_rate": 2.6256237108699337e-07, "loss": 0.0031, "step": 7227 }, { "epoch": 3.2884440400363966, "grad_norm": 0.9721664551555327, "learning_rate": 2.62436595147993e-07, "loss": 0.0072, "step": 7228 }, { "epoch": 3.288898999090082, "grad_norm": 1.0166034529954182, "learning_rate": 2.623108386216013e-07, "loss": 0.0072, "step": 7229 }, { "epoch": 3.289353958143767, "grad_norm": 0.6672659732266388, "learning_rate": 2.621851015180945e-07, "loss": 0.0103, "step": 7230 }, { "epoch": 3.289808917197452, "grad_norm": 0.818607265972044, "learning_rate": 2.6205938384774693e-07, "loss": 0.013, "step": 7231 }, { "epoch": 3.2902638762511374, "grad_norm": 0.553622537983118, "learning_rate": 2.6193368562083226e-07, "loss": 0.0095, "step": 7232 }, { "epoch": 3.2907188353048227, "grad_norm": 0.7615646361782206, "learning_rate": 2.6180800684762164e-07, "loss": 0.0176, "step": 7233 }, { "epoch": 3.2911737943585075, "grad_norm": 0.5315739176362425, "learning_rate": 2.6168234753838493e-07, "loss": 0.0096, "step": 7234 }, { "epoch": 3.291628753412193, "grad_norm": 0.616052193769159, "learning_rate": 2.6155670770339065e-07, "loss": 0.0072, "step": 7235 }, { "epoch": 3.292083712465878, "grad_norm": 0.45051493723796937, "learning_rate": 2.614310873529053e-07, "loss": 0.007, "step": 7236 }, { "epoch": 3.292538671519563, "grad_norm": 1.1842865109140446, "learning_rate": 2.613054864971943e-07, "loss": 0.0096, "step": 7237 }, { "epoch": 3.2929936305732483, "grad_norm": 0.5644479624599139, "learning_rate": 2.61179905146521e-07, "loss": 0.0113, "step": 7238 }, { "epoch": 3.2934485896269337, "grad_norm": 1.114571286953364, "learning_rate": 2.610543433111473e-07, "loss": 0.0152, "step": 7239 }, { "epoch": 3.2939035486806185, "grad_norm": 0.5593634517159358, "learning_rate": 2.609288010013335e-07, "loss": 0.0067, "step": 7240 }, { "epoch": 3.294358507734304, "grad_norm": 0.4147770910363209, "learning_rate": 2.6080327822733833e-07, "loss": 0.0025, "step": 7241 }, { "epoch": 3.294813466787989, "grad_norm": 0.5640652756718161, "learning_rate": 2.6067777499941936e-07, "loss": 0.0091, "step": 7242 }, { "epoch": 3.295268425841674, "grad_norm": 0.489414590449063, "learning_rate": 2.6055229132783175e-07, "loss": 0.0101, "step": 7243 }, { "epoch": 3.2957233848953593, "grad_norm": 0.6280348362796668, "learning_rate": 2.604268272228296e-07, "loss": 0.0085, "step": 7244 }, { "epoch": 3.2961783439490446, "grad_norm": 0.4495921057180775, "learning_rate": 2.6030138269466507e-07, "loss": 0.0051, "step": 7245 }, { "epoch": 3.29663330300273, "grad_norm": 0.7898738083555311, "learning_rate": 2.6017595775358927e-07, "loss": 0.0165, "step": 7246 }, { "epoch": 3.297088262056415, "grad_norm": 0.4002810178071852, "learning_rate": 2.6005055240985107e-07, "loss": 0.0071, "step": 7247 }, { "epoch": 3.2975432211101, "grad_norm": 0.5626273281875452, "learning_rate": 2.59925166673698e-07, "loss": 0.0075, "step": 7248 }, { "epoch": 3.2979981801637854, "grad_norm": 0.5889465201089298, "learning_rate": 2.5979980055537637e-07, "loss": 0.0058, "step": 7249 }, { "epoch": 3.2984531392174703, "grad_norm": 0.5874839506432217, "learning_rate": 2.596744540651301e-07, "loss": 0.0126, "step": 7250 }, { "epoch": 3.2989080982711556, "grad_norm": 0.5638691188534524, "learning_rate": 2.5954912721320235e-07, "loss": 0.0077, "step": 7251 }, { "epoch": 3.299363057324841, "grad_norm": 0.5401044249833572, "learning_rate": 2.5942382000983396e-07, "loss": 0.0083, "step": 7252 }, { "epoch": 3.299818016378526, "grad_norm": 0.6104059759907012, "learning_rate": 2.5929853246526463e-07, "loss": 0.0104, "step": 7253 }, { "epoch": 3.300272975432211, "grad_norm": 0.5117370737951263, "learning_rate": 2.5917326458973223e-07, "loss": 0.0063, "step": 7254 }, { "epoch": 3.3007279344858964, "grad_norm": 0.6183522736194857, "learning_rate": 2.590480163934727e-07, "loss": 0.013, "step": 7255 }, { "epoch": 3.3011828935395813, "grad_norm": 0.6025566582900109, "learning_rate": 2.589227878867215e-07, "loss": 0.0106, "step": 7256 }, { "epoch": 3.3016378525932666, "grad_norm": 0.7588565754561899, "learning_rate": 2.587975790797114e-07, "loss": 0.0152, "step": 7257 }, { "epoch": 3.302092811646952, "grad_norm": 1.5681975986883219, "learning_rate": 2.586723899826738e-07, "loss": 0.0226, "step": 7258 }, { "epoch": 3.3025477707006368, "grad_norm": 0.7177090723708218, "learning_rate": 2.585472206058388e-07, "loss": 0.0052, "step": 7259 }, { "epoch": 3.303002729754322, "grad_norm": 0.4924750372240527, "learning_rate": 2.5842207095943427e-07, "loss": 0.0137, "step": 7260 }, { "epoch": 3.3034576888080074, "grad_norm": 0.9867227286456741, "learning_rate": 2.582969410536874e-07, "loss": 0.0155, "step": 7261 }, { "epoch": 3.3039126478616927, "grad_norm": 0.7941223128382909, "learning_rate": 2.581718308988227e-07, "loss": 0.0073, "step": 7262 }, { "epoch": 3.3043676069153776, "grad_norm": 0.751906843431216, "learning_rate": 2.5804674050506414e-07, "loss": 0.0108, "step": 7263 }, { "epoch": 3.304822565969063, "grad_norm": 0.49371431634903107, "learning_rate": 2.579216698826333e-07, "loss": 0.006, "step": 7264 }, { "epoch": 3.305277525022748, "grad_norm": 0.580215172331562, "learning_rate": 2.577966190417502e-07, "loss": 0.0101, "step": 7265 }, { "epoch": 3.305732484076433, "grad_norm": 0.5015545939453705, "learning_rate": 2.5767158799263377e-07, "loss": 0.0077, "step": 7266 }, { "epoch": 3.3061874431301184, "grad_norm": 0.9180561097105284, "learning_rate": 2.5754657674550085e-07, "loss": 0.0067, "step": 7267 }, { "epoch": 3.3066424021838037, "grad_norm": 0.8333532664908659, "learning_rate": 2.5742158531056666e-07, "loss": 0.0113, "step": 7268 }, { "epoch": 3.3070973612374885, "grad_norm": 0.526241393875604, "learning_rate": 2.57296613698045e-07, "loss": 0.011, "step": 7269 }, { "epoch": 3.307552320291174, "grad_norm": 0.823397796386973, "learning_rate": 2.57171661918148e-07, "loss": 0.0091, "step": 7270 }, { "epoch": 3.308007279344859, "grad_norm": 0.7375999650884276, "learning_rate": 2.570467299810863e-07, "loss": 0.0162, "step": 7271 }, { "epoch": 3.308462238398544, "grad_norm": 0.4778414082479292, "learning_rate": 2.5692181789706875e-07, "loss": 0.0025, "step": 7272 }, { "epoch": 3.3089171974522293, "grad_norm": 0.9130765590100544, "learning_rate": 2.5679692567630243e-07, "loss": 0.0139, "step": 7273 }, { "epoch": 3.3093721565059147, "grad_norm": 0.7094997665138065, "learning_rate": 2.566720533289929e-07, "loss": 0.0189, "step": 7274 }, { "epoch": 3.3098271155595995, "grad_norm": 0.7969195971178473, "learning_rate": 2.5654720086534454e-07, "loss": 0.0085, "step": 7275 }, { "epoch": 3.310282074613285, "grad_norm": 0.945916831109105, "learning_rate": 2.5642236829555926e-07, "loss": 0.0109, "step": 7276 }, { "epoch": 3.31073703366697, "grad_norm": 0.8964427221111905, "learning_rate": 2.5629755562983825e-07, "loss": 0.0045, "step": 7277 }, { "epoch": 3.311191992720655, "grad_norm": 0.6425887489080222, "learning_rate": 2.5617276287838043e-07, "loss": 0.0075, "step": 7278 }, { "epoch": 3.3116469517743403, "grad_norm": 0.456286909320814, "learning_rate": 2.5604799005138317e-07, "loss": 0.0041, "step": 7279 }, { "epoch": 3.3121019108280256, "grad_norm": 0.6477180148252119, "learning_rate": 2.5592323715904263e-07, "loss": 0.0149, "step": 7280 }, { "epoch": 3.3125568698817105, "grad_norm": 0.5730774235832401, "learning_rate": 2.557985042115529e-07, "loss": 0.0088, "step": 7281 }, { "epoch": 3.313011828935396, "grad_norm": 0.4115616394039636, "learning_rate": 2.556737912191067e-07, "loss": 0.0069, "step": 7282 }, { "epoch": 3.313466787989081, "grad_norm": 0.8819569203309863, "learning_rate": 2.555490981918946e-07, "loss": 0.0117, "step": 7283 }, { "epoch": 3.313921747042766, "grad_norm": 0.38360002590739484, "learning_rate": 2.554244251401063e-07, "loss": 0.0033, "step": 7284 }, { "epoch": 3.3143767060964513, "grad_norm": 1.5407533296448883, "learning_rate": 2.552997720739297e-07, "loss": 0.009, "step": 7285 }, { "epoch": 3.3148316651501366, "grad_norm": 0.719220218694426, "learning_rate": 2.551751390035507e-07, "loss": 0.0188, "step": 7286 }, { "epoch": 3.3152866242038215, "grad_norm": 0.6299119557257052, "learning_rate": 2.5505052593915365e-07, "loss": 0.0197, "step": 7287 }, { "epoch": 3.315741583257507, "grad_norm": 0.7339717323231688, "learning_rate": 2.5492593289092136e-07, "loss": 0.0153, "step": 7288 }, { "epoch": 3.316196542311192, "grad_norm": 0.5150059592502234, "learning_rate": 2.548013598690352e-07, "loss": 0.0082, "step": 7289 }, { "epoch": 3.316651501364877, "grad_norm": 0.7013783888689855, "learning_rate": 2.5467680688367433e-07, "loss": 0.012, "step": 7290 }, { "epoch": 3.3171064604185623, "grad_norm": 0.7889144362302427, "learning_rate": 2.545522739450172e-07, "loss": 0.0126, "step": 7291 }, { "epoch": 3.3175614194722476, "grad_norm": 0.7231259021129378, "learning_rate": 2.544277610632398e-07, "loss": 0.0164, "step": 7292 }, { "epoch": 3.3180163785259325, "grad_norm": 0.5350363834713862, "learning_rate": 2.5430326824851686e-07, "loss": 0.0082, "step": 7293 }, { "epoch": 3.3184713375796178, "grad_norm": 0.491087073728545, "learning_rate": 2.54178795511021e-07, "loss": 0.0086, "step": 7294 }, { "epoch": 3.318926296633303, "grad_norm": 0.6307874784585287, "learning_rate": 2.540543428609241e-07, "loss": 0.0088, "step": 7295 }, { "epoch": 3.319381255686988, "grad_norm": 0.5695447138996446, "learning_rate": 2.539299103083956e-07, "loss": 0.01, "step": 7296 }, { "epoch": 3.3198362147406733, "grad_norm": 0.8898373213392573, "learning_rate": 2.5380549786360334e-07, "loss": 0.008, "step": 7297 }, { "epoch": 3.3202911737943586, "grad_norm": 0.9425704568106357, "learning_rate": 2.5368110553671424e-07, "loss": 0.0309, "step": 7298 }, { "epoch": 3.3207461328480434, "grad_norm": 0.24571626330901325, "learning_rate": 2.535567333378926e-07, "loss": 0.002, "step": 7299 }, { "epoch": 3.3212010919017287, "grad_norm": 0.5453646006054049, "learning_rate": 2.53432381277302e-07, "loss": 0.0059, "step": 7300 }, { "epoch": 3.321656050955414, "grad_norm": 0.4856157975641636, "learning_rate": 2.533080493651037e-07, "loss": 0.0062, "step": 7301 }, { "epoch": 3.3221110100090994, "grad_norm": 0.646292032372105, "learning_rate": 2.5318373761145757e-07, "loss": 0.0109, "step": 7302 }, { "epoch": 3.3225659690627842, "grad_norm": 0.6544192676715991, "learning_rate": 2.5305944602652163e-07, "loss": 0.009, "step": 7303 }, { "epoch": 3.3230209281164695, "grad_norm": 0.7966961049594998, "learning_rate": 2.529351746204525e-07, "loss": 0.0148, "step": 7304 }, { "epoch": 3.323475887170155, "grad_norm": 0.3557440202137253, "learning_rate": 2.528109234034054e-07, "loss": 0.0036, "step": 7305 }, { "epoch": 3.3239308462238397, "grad_norm": 0.6795064869830059, "learning_rate": 2.526866923855334e-07, "loss": 0.0084, "step": 7306 }, { "epoch": 3.324385805277525, "grad_norm": 0.7360257632029814, "learning_rate": 2.5256248157698797e-07, "loss": 0.013, "step": 7307 }, { "epoch": 3.3248407643312103, "grad_norm": 0.4071184921144767, "learning_rate": 2.524382909879189e-07, "loss": 0.0042, "step": 7308 }, { "epoch": 3.325295723384895, "grad_norm": 0.8710402232325434, "learning_rate": 2.52314120628475e-07, "loss": 0.0119, "step": 7309 }, { "epoch": 3.3257506824385805, "grad_norm": 0.4840726200406763, "learning_rate": 2.5218997050880256e-07, "loss": 0.0062, "step": 7310 }, { "epoch": 3.326205641492266, "grad_norm": 0.6403648850956888, "learning_rate": 2.5206584063904645e-07, "loss": 0.0058, "step": 7311 }, { "epoch": 3.3266606005459507, "grad_norm": 0.5577671207890009, "learning_rate": 2.519417310293504e-07, "loss": 0.0055, "step": 7312 }, { "epoch": 3.327115559599636, "grad_norm": 0.9107368090405862, "learning_rate": 2.5181764168985567e-07, "loss": 0.0156, "step": 7313 }, { "epoch": 3.3275705186533213, "grad_norm": 0.5794328280787514, "learning_rate": 2.516935726307027e-07, "loss": 0.0037, "step": 7314 }, { "epoch": 3.328025477707006, "grad_norm": 0.7841061455320979, "learning_rate": 2.5156952386202955e-07, "loss": 0.0157, "step": 7315 }, { "epoch": 3.3284804367606915, "grad_norm": 0.6716620670588681, "learning_rate": 2.514454953939731e-07, "loss": 0.0147, "step": 7316 }, { "epoch": 3.328935395814377, "grad_norm": 0.282846325856455, "learning_rate": 2.51321487236668e-07, "loss": 0.0024, "step": 7317 }, { "epoch": 3.329390354868062, "grad_norm": 1.8898317448643527, "learning_rate": 2.5119749940024806e-07, "loss": 0.03, "step": 7318 }, { "epoch": 3.329845313921747, "grad_norm": 0.7479988554820369, "learning_rate": 2.51073531894845e-07, "loss": 0.0156, "step": 7319 }, { "epoch": 3.3303002729754323, "grad_norm": 0.8107049250927004, "learning_rate": 2.509495847305888e-07, "loss": 0.0057, "step": 7320 }, { "epoch": 3.3307552320291176, "grad_norm": 0.5954259146600371, "learning_rate": 2.508256579176078e-07, "loss": 0.0078, "step": 7321 }, { "epoch": 3.3312101910828025, "grad_norm": 0.767488105132299, "learning_rate": 2.507017514660286e-07, "loss": 0.0096, "step": 7322 }, { "epoch": 3.331665150136488, "grad_norm": 0.4952930550777659, "learning_rate": 2.505778653859767e-07, "loss": 0.0059, "step": 7323 }, { "epoch": 3.332120109190173, "grad_norm": 0.5341278276404793, "learning_rate": 2.504539996875752e-07, "loss": 0.0086, "step": 7324 }, { "epoch": 3.332575068243858, "grad_norm": 0.706837802842943, "learning_rate": 2.5033015438094573e-07, "loss": 0.0106, "step": 7325 }, { "epoch": 3.3330300272975433, "grad_norm": 0.7431926831762371, "learning_rate": 2.502063294762087e-07, "loss": 0.0104, "step": 7326 }, { "epoch": 3.3334849863512286, "grad_norm": 0.36484445434877455, "learning_rate": 2.5008252498348225e-07, "loss": 0.0031, "step": 7327 }, { "epoch": 3.3339399454049135, "grad_norm": 0.5087006677791034, "learning_rate": 2.499587409128834e-07, "loss": 0.0097, "step": 7328 }, { "epoch": 3.3343949044585988, "grad_norm": 0.5404411728911523, "learning_rate": 2.4983497727452704e-07, "loss": 0.009, "step": 7329 }, { "epoch": 3.334849863512284, "grad_norm": 0.680150943140733, "learning_rate": 2.4971123407852663e-07, "loss": 0.0116, "step": 7330 }, { "epoch": 3.335304822565969, "grad_norm": 1.1813367974561382, "learning_rate": 2.495875113349938e-07, "loss": 0.0123, "step": 7331 }, { "epoch": 3.3357597816196543, "grad_norm": 0.5440048536115794, "learning_rate": 2.4946380905403865e-07, "loss": 0.0099, "step": 7332 }, { "epoch": 3.3362147406733396, "grad_norm": 0.6509333251034201, "learning_rate": 2.493401272457695e-07, "loss": 0.0083, "step": 7333 }, { "epoch": 3.3366696997270244, "grad_norm": 0.6354417075014562, "learning_rate": 2.492164659202934e-07, "loss": 0.0069, "step": 7334 }, { "epoch": 3.3371246587807097, "grad_norm": 1.1802670952316094, "learning_rate": 2.490928250877152e-07, "loss": 0.0102, "step": 7335 }, { "epoch": 3.337579617834395, "grad_norm": 0.9218360903407478, "learning_rate": 2.489692047581382e-07, "loss": 0.0139, "step": 7336 }, { "epoch": 3.33803457688808, "grad_norm": 1.3630049748315, "learning_rate": 2.4884560494166394e-07, "loss": 0.0225, "step": 7337 }, { "epoch": 3.3384895359417652, "grad_norm": 2.9798347897334327, "learning_rate": 2.487220256483929e-07, "loss": 0.0169, "step": 7338 }, { "epoch": 3.3389444949954505, "grad_norm": 0.5753959877641142, "learning_rate": 2.4859846688842293e-07, "loss": 0.0071, "step": 7339 }, { "epoch": 3.3393994540491354, "grad_norm": 0.7520598204280028, "learning_rate": 2.484749286718511e-07, "loss": 0.0057, "step": 7340 }, { "epoch": 3.3398544131028207, "grad_norm": 0.708475386466571, "learning_rate": 2.4835141100877225e-07, "loss": 0.0097, "step": 7341 }, { "epoch": 3.340309372156506, "grad_norm": 0.6085924072193974, "learning_rate": 2.482279139092795e-07, "loss": 0.0093, "step": 7342 }, { "epoch": 3.340764331210191, "grad_norm": 0.4889996655362197, "learning_rate": 2.4810443738346474e-07, "loss": 0.0079, "step": 7343 }, { "epoch": 3.341219290263876, "grad_norm": 0.6460880121787248, "learning_rate": 2.4798098144141787e-07, "loss": 0.0109, "step": 7344 }, { "epoch": 3.3416742493175615, "grad_norm": 0.970565655803768, "learning_rate": 2.4785754609322714e-07, "loss": 0.0145, "step": 7345 }, { "epoch": 3.3421292083712464, "grad_norm": 0.7067938495094007, "learning_rate": 2.477341313489788e-07, "loss": 0.0074, "step": 7346 }, { "epoch": 3.3425841674249317, "grad_norm": 4.195537013027917, "learning_rate": 2.47610737218758e-07, "loss": 0.0408, "step": 7347 }, { "epoch": 3.343039126478617, "grad_norm": 0.5206353879060767, "learning_rate": 2.474873637126482e-07, "loss": 0.0036, "step": 7348 }, { "epoch": 3.343494085532302, "grad_norm": 0.6519074500214684, "learning_rate": 2.473640108407307e-07, "loss": 0.0116, "step": 7349 }, { "epoch": 3.343949044585987, "grad_norm": 1.4768249081424127, "learning_rate": 2.472406786130854e-07, "loss": 0.0046, "step": 7350 }, { "epoch": 3.3444040036396725, "grad_norm": 0.7027793628686074, "learning_rate": 2.4711736703979015e-07, "loss": 0.006, "step": 7351 }, { "epoch": 3.3448589626933574, "grad_norm": 0.6680224227937994, "learning_rate": 2.4699407613092184e-07, "loss": 0.0086, "step": 7352 }, { "epoch": 3.3453139217470427, "grad_norm": 2.0760182521409076, "learning_rate": 2.468708058965549e-07, "loss": 0.043, "step": 7353 }, { "epoch": 3.345768880800728, "grad_norm": 0.7417872980277641, "learning_rate": 2.467475563467628e-07, "loss": 0.0136, "step": 7354 }, { "epoch": 3.3462238398544133, "grad_norm": 0.8097435499568405, "learning_rate": 2.466243274916166e-07, "loss": 0.0073, "step": 7355 }, { "epoch": 3.346678798908098, "grad_norm": 0.7185284825206529, "learning_rate": 2.46501119341186e-07, "loss": 0.0118, "step": 7356 }, { "epoch": 3.3471337579617835, "grad_norm": 0.5032960810919701, "learning_rate": 2.4637793190553934e-07, "loss": 0.0055, "step": 7357 }, { "epoch": 3.347588717015469, "grad_norm": 1.1054447530139382, "learning_rate": 2.4625476519474277e-07, "loss": 0.0035, "step": 7358 }, { "epoch": 3.3480436760691537, "grad_norm": 1.1869709265578514, "learning_rate": 2.461316192188608e-07, "loss": 0.0104, "step": 7359 }, { "epoch": 3.348498635122839, "grad_norm": 0.6809367789622653, "learning_rate": 2.460084939879563e-07, "loss": 0.0196, "step": 7360 }, { "epoch": 3.3489535941765243, "grad_norm": 0.8217022736307079, "learning_rate": 2.458853895120907e-07, "loss": 0.0103, "step": 7361 }, { "epoch": 3.349408553230209, "grad_norm": 0.5940304459372934, "learning_rate": 2.457623058013236e-07, "loss": 0.0033, "step": 7362 }, { "epoch": 3.3498635122838945, "grad_norm": 0.6210159157299061, "learning_rate": 2.456392428657128e-07, "loss": 0.0161, "step": 7363 }, { "epoch": 3.3503184713375798, "grad_norm": 0.8632724527117369, "learning_rate": 2.4551620071531437e-07, "loss": 0.0176, "step": 7364 }, { "epoch": 3.3507734303912646, "grad_norm": 0.6832010241016535, "learning_rate": 2.453931793601828e-07, "loss": 0.0156, "step": 7365 }, { "epoch": 3.35122838944495, "grad_norm": 0.588747047583665, "learning_rate": 2.4527017881037064e-07, "loss": 0.0081, "step": 7366 }, { "epoch": 3.3516833484986353, "grad_norm": 0.6483751188102674, "learning_rate": 2.451471990759291e-07, "loss": 0.0111, "step": 7367 }, { "epoch": 3.35213830755232, "grad_norm": 0.4887494521141409, "learning_rate": 2.450242401669077e-07, "loss": 0.0085, "step": 7368 }, { "epoch": 3.3525932666060054, "grad_norm": 1.0895210026920963, "learning_rate": 2.44901302093354e-07, "loss": 0.0078, "step": 7369 }, { "epoch": 3.3530482256596907, "grad_norm": 0.5212244956045353, "learning_rate": 2.4477838486531384e-07, "loss": 0.0057, "step": 7370 }, { "epoch": 3.3535031847133756, "grad_norm": 0.5412381230433384, "learning_rate": 2.446554884928313e-07, "loss": 0.0066, "step": 7371 }, { "epoch": 3.353958143767061, "grad_norm": 0.40149890209238337, "learning_rate": 2.445326129859493e-07, "loss": 0.0033, "step": 7372 }, { "epoch": 3.3544131028207462, "grad_norm": 0.6580589999492656, "learning_rate": 2.444097583547085e-07, "loss": 0.0139, "step": 7373 }, { "epoch": 3.3548680618744315, "grad_norm": 0.817798127839812, "learning_rate": 2.442869246091478e-07, "loss": 0.0264, "step": 7374 }, { "epoch": 3.3553230209281164, "grad_norm": 0.9456398852722456, "learning_rate": 2.441641117593051e-07, "loss": 0.0061, "step": 7375 }, { "epoch": 3.3557779799818017, "grad_norm": 0.5207763741865256, "learning_rate": 2.440413198152156e-07, "loss": 0.0055, "step": 7376 }, { "epoch": 3.356232939035487, "grad_norm": 0.7691991670546492, "learning_rate": 2.4391854878691373e-07, "loss": 0.0096, "step": 7377 }, { "epoch": 3.356687898089172, "grad_norm": 0.6983007070588965, "learning_rate": 2.437957986844316e-07, "loss": 0.0068, "step": 7378 }, { "epoch": 3.357142857142857, "grad_norm": 0.4002530316558947, "learning_rate": 2.4367306951779974e-07, "loss": 0.0033, "step": 7379 }, { "epoch": 3.3575978161965425, "grad_norm": 0.8540056232729016, "learning_rate": 2.4355036129704696e-07, "loss": 0.0125, "step": 7380 }, { "epoch": 3.3580527752502274, "grad_norm": 0.5235156493083911, "learning_rate": 2.4342767403220047e-07, "loss": 0.0107, "step": 7381 }, { "epoch": 3.3585077343039127, "grad_norm": 0.5023047817889835, "learning_rate": 2.43305007733286e-07, "loss": 0.0112, "step": 7382 }, { "epoch": 3.358962693357598, "grad_norm": 0.382634491125303, "learning_rate": 2.431823624103272e-07, "loss": 0.0025, "step": 7383 }, { "epoch": 3.359417652411283, "grad_norm": 1.0049934312448936, "learning_rate": 2.4305973807334586e-07, "loss": 0.0091, "step": 7384 }, { "epoch": 3.359872611464968, "grad_norm": 0.44751067464347544, "learning_rate": 2.4293713473236217e-07, "loss": 0.003, "step": 7385 }, { "epoch": 3.3603275705186535, "grad_norm": 0.6250454097503118, "learning_rate": 2.428145523973952e-07, "loss": 0.0127, "step": 7386 }, { "epoch": 3.3607825295723384, "grad_norm": 0.9113081543926952, "learning_rate": 2.426919910784615e-07, "loss": 0.0127, "step": 7387 }, { "epoch": 3.3612374886260237, "grad_norm": 0.5931548982922938, "learning_rate": 2.4256945078557615e-07, "loss": 0.0046, "step": 7388 }, { "epoch": 3.361692447679709, "grad_norm": 0.4224839565773249, "learning_rate": 2.4244693152875295e-07, "loss": 0.0071, "step": 7389 }, { "epoch": 3.362147406733394, "grad_norm": 0.5060914653404391, "learning_rate": 2.4232443331800317e-07, "loss": 0.0056, "step": 7390 }, { "epoch": 3.362602365787079, "grad_norm": 0.6964839578174619, "learning_rate": 2.4220195616333727e-07, "loss": 0.0168, "step": 7391 }, { "epoch": 3.3630573248407645, "grad_norm": 0.3971166087083739, "learning_rate": 2.420795000747633e-07, "loss": 0.0048, "step": 7392 }, { "epoch": 3.3635122838944493, "grad_norm": 0.5622354204937895, "learning_rate": 2.419570650622878e-07, "loss": 0.0052, "step": 7393 }, { "epoch": 3.3639672429481347, "grad_norm": 0.8287185071908062, "learning_rate": 2.4183465113591546e-07, "loss": 0.0142, "step": 7394 }, { "epoch": 3.36442220200182, "grad_norm": 0.3414872572512527, "learning_rate": 2.4171225830564956e-07, "loss": 0.0071, "step": 7395 }, { "epoch": 3.364877161055505, "grad_norm": 0.800585177426409, "learning_rate": 2.415898865814917e-07, "loss": 0.0138, "step": 7396 }, { "epoch": 3.36533212010919, "grad_norm": 0.7209481390550859, "learning_rate": 2.4146753597344135e-07, "loss": 0.009, "step": 7397 }, { "epoch": 3.3657870791628755, "grad_norm": 0.7245036038717649, "learning_rate": 2.4134520649149643e-07, "loss": 0.0248, "step": 7398 }, { "epoch": 3.3662420382165603, "grad_norm": 1.1549349790069061, "learning_rate": 2.412228981456531e-07, "loss": 0.0124, "step": 7399 }, { "epoch": 3.3666969972702456, "grad_norm": 0.8820357036052274, "learning_rate": 2.411006109459058e-07, "loss": 0.008, "step": 7400 }, { "epoch": 3.367151956323931, "grad_norm": 0.7580593790916985, "learning_rate": 2.409783449022475e-07, "loss": 0.0173, "step": 7401 }, { "epoch": 3.367606915377616, "grad_norm": 1.3641115201857883, "learning_rate": 2.40856100024669e-07, "loss": 0.0126, "step": 7402 }, { "epoch": 3.368061874431301, "grad_norm": 0.41523893434602777, "learning_rate": 2.407338763231599e-07, "loss": 0.0108, "step": 7403 }, { "epoch": 3.3685168334849864, "grad_norm": 0.869959557079163, "learning_rate": 2.406116738077076e-07, "loss": 0.0074, "step": 7404 }, { "epoch": 3.3689717925386713, "grad_norm": 0.43425117822119164, "learning_rate": 2.404894924882977e-07, "loss": 0.009, "step": 7405 }, { "epoch": 3.3694267515923566, "grad_norm": 0.890453258005921, "learning_rate": 2.4036733237491477e-07, "loss": 0.0076, "step": 7406 }, { "epoch": 3.369881710646042, "grad_norm": 0.6124874378205764, "learning_rate": 2.402451934775409e-07, "loss": 0.0116, "step": 7407 }, { "epoch": 3.370336669699727, "grad_norm": 0.5832933034464303, "learning_rate": 2.401230758061568e-07, "loss": 0.0121, "step": 7408 }, { "epoch": 3.370791628753412, "grad_norm": 0.9258430740573173, "learning_rate": 2.4000097937074117e-07, "loss": 0.0159, "step": 7409 }, { "epoch": 3.3712465878070974, "grad_norm": 0.8009177196378137, "learning_rate": 2.3987890418127135e-07, "loss": 0.0075, "step": 7410 }, { "epoch": 3.3717015468607827, "grad_norm": 0.5594787632322152, "learning_rate": 2.3975685024772297e-07, "loss": 0.0063, "step": 7411 }, { "epoch": 3.3721565059144676, "grad_norm": 0.6257863172736765, "learning_rate": 2.3963481758006954e-07, "loss": 0.0115, "step": 7412 }, { "epoch": 3.372611464968153, "grad_norm": 1.096440882048234, "learning_rate": 2.39512806188283e-07, "loss": 0.0124, "step": 7413 }, { "epoch": 3.373066424021838, "grad_norm": 0.4452014027848706, "learning_rate": 2.393908160823335e-07, "loss": 0.0067, "step": 7414 }, { "epoch": 3.373521383075523, "grad_norm": 0.5494456290458688, "learning_rate": 2.392688472721897e-07, "loss": 0.0079, "step": 7415 }, { "epoch": 3.3739763421292084, "grad_norm": 0.7270544451688247, "learning_rate": 2.3914689976781805e-07, "loss": 0.006, "step": 7416 }, { "epoch": 3.3744313011828937, "grad_norm": 0.5060225797750432, "learning_rate": 2.39024973579184e-07, "loss": 0.009, "step": 7417 }, { "epoch": 3.3748862602365786, "grad_norm": 1.2010582121172573, "learning_rate": 2.3890306871625054e-07, "loss": 0.0076, "step": 7418 }, { "epoch": 3.375341219290264, "grad_norm": 0.7670254322747356, "learning_rate": 2.38781185188979e-07, "loss": 0.0088, "step": 7419 }, { "epoch": 3.375796178343949, "grad_norm": 0.3924845835036628, "learning_rate": 2.386593230073295e-07, "loss": 0.0038, "step": 7420 }, { "epoch": 3.376251137397634, "grad_norm": 0.6935380619711268, "learning_rate": 2.3853748218125996e-07, "loss": 0.0143, "step": 7421 }, { "epoch": 3.3767060964513194, "grad_norm": 0.4484988497443523, "learning_rate": 2.3841566272072665e-07, "loss": 0.0021, "step": 7422 }, { "epoch": 3.3771610555050047, "grad_norm": 0.6827489810417703, "learning_rate": 2.3829386463568384e-07, "loss": 0.0118, "step": 7423 }, { "epoch": 3.3776160145586895, "grad_norm": 0.5094665707224598, "learning_rate": 2.381720879360846e-07, "loss": 0.0089, "step": 7424 }, { "epoch": 3.378070973612375, "grad_norm": 0.4590457952223666, "learning_rate": 2.3805033263188007e-07, "loss": 0.0107, "step": 7425 }, { "epoch": 3.37852593266606, "grad_norm": 0.40199338722951483, "learning_rate": 2.3792859873301946e-07, "loss": 0.0029, "step": 7426 }, { "epoch": 3.3789808917197455, "grad_norm": 0.7453923941263165, "learning_rate": 2.3780688624945021e-07, "loss": 0.0114, "step": 7427 }, { "epoch": 3.3794358507734303, "grad_norm": 0.602771972595214, "learning_rate": 2.3768519519111802e-07, "loss": 0.0118, "step": 7428 }, { "epoch": 3.3798908098271156, "grad_norm": 0.8120774735931454, "learning_rate": 2.3756352556796722e-07, "loss": 0.0094, "step": 7429 }, { "epoch": 3.380345768880801, "grad_norm": 1.113839272236432, "learning_rate": 2.3744187738993976e-07, "loss": 0.0124, "step": 7430 }, { "epoch": 3.380800727934486, "grad_norm": 0.2857745464093686, "learning_rate": 2.3732025066697663e-07, "loss": 0.0028, "step": 7431 }, { "epoch": 3.381255686988171, "grad_norm": 1.3617750710712604, "learning_rate": 2.371986454090163e-07, "loss": 0.0171, "step": 7432 }, { "epoch": 3.3817106460418564, "grad_norm": 0.6191314958470372, "learning_rate": 2.370770616259957e-07, "loss": 0.0076, "step": 7433 }, { "epoch": 3.3821656050955413, "grad_norm": 0.4085053283182143, "learning_rate": 2.3695549932785048e-07, "loss": 0.0051, "step": 7434 }, { "epoch": 3.3826205641492266, "grad_norm": 0.4570328524175535, "learning_rate": 2.3683395852451392e-07, "loss": 0.0035, "step": 7435 }, { "epoch": 3.383075523202912, "grad_norm": 0.35493745715147446, "learning_rate": 2.367124392259179e-07, "loss": 0.0026, "step": 7436 }, { "epoch": 3.383530482256597, "grad_norm": 0.8936806902185248, "learning_rate": 2.365909414419921e-07, "loss": 0.0079, "step": 7437 }, { "epoch": 3.383985441310282, "grad_norm": 0.6085064294182592, "learning_rate": 2.364694651826652e-07, "loss": 0.0148, "step": 7438 }, { "epoch": 3.3844404003639674, "grad_norm": 0.5089397105198706, "learning_rate": 2.3634801045786335e-07, "loss": 0.0095, "step": 7439 }, { "epoch": 3.3848953594176523, "grad_norm": 0.7295889644042363, "learning_rate": 2.3622657727751162e-07, "loss": 0.0076, "step": 7440 }, { "epoch": 3.3853503184713376, "grad_norm": 0.6547651526017074, "learning_rate": 2.3610516565153277e-07, "loss": 0.0096, "step": 7441 }, { "epoch": 3.385805277525023, "grad_norm": 0.5963777011888991, "learning_rate": 2.359837755898481e-07, "loss": 0.0085, "step": 7442 }, { "epoch": 3.386260236578708, "grad_norm": 1.1841390643067025, "learning_rate": 2.3586240710237681e-07, "loss": 0.011, "step": 7443 }, { "epoch": 3.386715195632393, "grad_norm": 0.42342174589369513, "learning_rate": 2.3574106019903673e-07, "loss": 0.0046, "step": 7444 }, { "epoch": 3.3871701546860784, "grad_norm": 0.9309844499210139, "learning_rate": 2.3561973488974408e-07, "loss": 0.0123, "step": 7445 }, { "epoch": 3.3876251137397633, "grad_norm": 1.6949280180425246, "learning_rate": 2.3549843118441272e-07, "loss": 0.0118, "step": 7446 }, { "epoch": 3.3880800727934486, "grad_norm": 0.6111943394354437, "learning_rate": 2.3537714909295508e-07, "loss": 0.008, "step": 7447 }, { "epoch": 3.388535031847134, "grad_norm": 0.5535911282475018, "learning_rate": 2.3525588862528162e-07, "loss": 0.0128, "step": 7448 }, { "epoch": 3.3889899909008188, "grad_norm": 0.4938023377345491, "learning_rate": 2.3513464979130153e-07, "loss": 0.0048, "step": 7449 }, { "epoch": 3.389444949954504, "grad_norm": 0.7760104171144285, "learning_rate": 2.3501343260092177e-07, "loss": 0.0137, "step": 7450 }, { "epoch": 3.3898999090081894, "grad_norm": 0.9515347835463359, "learning_rate": 2.348922370640475e-07, "loss": 0.0163, "step": 7451 }, { "epoch": 3.3903548680618742, "grad_norm": 0.7064739824993066, "learning_rate": 2.347710631905825e-07, "loss": 0.0127, "step": 7452 }, { "epoch": 3.3908098271155596, "grad_norm": 0.5933389620484537, "learning_rate": 2.3464991099042826e-07, "loss": 0.0156, "step": 7453 }, { "epoch": 3.391264786169245, "grad_norm": 0.40951247329414464, "learning_rate": 2.345287804734852e-07, "loss": 0.005, "step": 7454 }, { "epoch": 3.3917197452229297, "grad_norm": 0.49263405096283713, "learning_rate": 2.3440767164965136e-07, "loss": 0.012, "step": 7455 }, { "epoch": 3.392174704276615, "grad_norm": 0.7957070630856387, "learning_rate": 2.3428658452882317e-07, "loss": 0.0203, "step": 7456 }, { "epoch": 3.3926296633303004, "grad_norm": 0.851957378821564, "learning_rate": 2.3416551912089512e-07, "loss": 0.0114, "step": 7457 }, { "epoch": 3.3930846223839852, "grad_norm": 0.3750430668045164, "learning_rate": 2.340444754357604e-07, "loss": 0.0047, "step": 7458 }, { "epoch": 3.3935395814376705, "grad_norm": 0.47125562504378776, "learning_rate": 2.3392345348331027e-07, "loss": 0.0036, "step": 7459 }, { "epoch": 3.393994540491356, "grad_norm": 0.44777319201329663, "learning_rate": 2.3380245327343396e-07, "loss": 0.0089, "step": 7460 }, { "epoch": 3.3944494995450407, "grad_norm": 0.6719821618619813, "learning_rate": 2.3368147481601895e-07, "loss": 0.007, "step": 7461 }, { "epoch": 3.394904458598726, "grad_norm": 1.0030285277516429, "learning_rate": 2.3356051812095102e-07, "loss": 0.002, "step": 7462 }, { "epoch": 3.3953594176524113, "grad_norm": 0.3600257063576819, "learning_rate": 2.3343958319811447e-07, "loss": 0.0035, "step": 7463 }, { "epoch": 3.395814376706096, "grad_norm": 0.46841009192895977, "learning_rate": 2.3331867005739124e-07, "loss": 0.0041, "step": 7464 }, { "epoch": 3.3962693357597815, "grad_norm": 0.43388644520083974, "learning_rate": 2.3319777870866213e-07, "loss": 0.0049, "step": 7465 }, { "epoch": 3.396724294813467, "grad_norm": 0.5719351046701568, "learning_rate": 2.3307690916180572e-07, "loss": 0.0109, "step": 7466 }, { "epoch": 3.397179253867152, "grad_norm": 0.4626007950306062, "learning_rate": 2.3295606142669867e-07, "loss": 0.0122, "step": 7467 }, { "epoch": 3.397634212920837, "grad_norm": 0.8695339482663919, "learning_rate": 2.3283523551321648e-07, "loss": 0.0122, "step": 7468 }, { "epoch": 3.3980891719745223, "grad_norm": 0.5132192406627508, "learning_rate": 2.3271443143123237e-07, "loss": 0.0121, "step": 7469 }, { "epoch": 3.3985441310282076, "grad_norm": 0.7536521550132815, "learning_rate": 2.325936491906179e-07, "loss": 0.0113, "step": 7470 }, { "epoch": 3.3989990900818925, "grad_norm": 0.5660645421625073, "learning_rate": 2.324728888012426e-07, "loss": 0.0054, "step": 7471 }, { "epoch": 3.399454049135578, "grad_norm": 0.5126951250262372, "learning_rate": 2.3235215027297466e-07, "loss": 0.0072, "step": 7472 }, { "epoch": 3.399909008189263, "grad_norm": 0.5962096621940148, "learning_rate": 2.3223143361568053e-07, "loss": 0.015, "step": 7473 }, { "epoch": 3.400363967242948, "grad_norm": 0.8405959421273194, "learning_rate": 2.3211073883922445e-07, "loss": 0.0123, "step": 7474 }, { "epoch": 3.4008189262966333, "grad_norm": 0.5255448557374748, "learning_rate": 2.31990065953469e-07, "loss": 0.0063, "step": 7475 }, { "epoch": 3.4012738853503186, "grad_norm": 0.568124745971554, "learning_rate": 2.3186941496827496e-07, "loss": 0.0079, "step": 7476 }, { "epoch": 3.4017288444040035, "grad_norm": 0.5325843608224488, "learning_rate": 2.3174878589350134e-07, "loss": 0.0077, "step": 7477 }, { "epoch": 3.402183803457689, "grad_norm": 0.736555574949994, "learning_rate": 2.3162817873900554e-07, "loss": 0.0072, "step": 7478 }, { "epoch": 3.402638762511374, "grad_norm": 2.327216877512693, "learning_rate": 2.3150759351464315e-07, "loss": 0.0105, "step": 7479 }, { "epoch": 3.403093721565059, "grad_norm": 0.5651677485649113, "learning_rate": 2.3138703023026774e-07, "loss": 0.0057, "step": 7480 }, { "epoch": 3.4035486806187443, "grad_norm": 0.6384331333209629, "learning_rate": 2.312664888957312e-07, "loss": 0.0061, "step": 7481 }, { "epoch": 3.4040036396724296, "grad_norm": 0.6448376237739271, "learning_rate": 2.3114596952088338e-07, "loss": 0.0182, "step": 7482 }, { "epoch": 3.404458598726115, "grad_norm": 1.689840637607514, "learning_rate": 2.31025472115573e-07, "loss": 0.0419, "step": 7483 }, { "epoch": 3.4049135577797998, "grad_norm": 0.7033742736212908, "learning_rate": 2.3090499668964636e-07, "loss": 0.0159, "step": 7484 }, { "epoch": 3.405368516833485, "grad_norm": 46.75649560550298, "learning_rate": 2.3078454325294794e-07, "loss": 0.2591, "step": 7485 }, { "epoch": 3.4058234758871704, "grad_norm": 0.886884246058611, "learning_rate": 2.3066411181532108e-07, "loss": 0.0273, "step": 7486 }, { "epoch": 3.4062784349408552, "grad_norm": 0.7796928158213248, "learning_rate": 2.3054370238660654e-07, "loss": 0.0072, "step": 7487 }, { "epoch": 3.4067333939945406, "grad_norm": 3.4230938233180437, "learning_rate": 2.3042331497664397e-07, "loss": 0.0129, "step": 7488 }, { "epoch": 3.407188353048226, "grad_norm": 0.9573730918819465, "learning_rate": 2.303029495952707e-07, "loss": 0.0146, "step": 7489 }, { "epoch": 3.4076433121019107, "grad_norm": 1.4361235081546517, "learning_rate": 2.3018260625232245e-07, "loss": 0.0225, "step": 7490 }, { "epoch": 3.408098271155596, "grad_norm": 0.5374580532234141, "learning_rate": 2.300622849576329e-07, "loss": 0.0049, "step": 7491 }, { "epoch": 3.4085532302092814, "grad_norm": 1.308598042957269, "learning_rate": 2.2994198572103445e-07, "loss": 0.0096, "step": 7492 }, { "epoch": 3.4090081892629662, "grad_norm": 0.9274318936887566, "learning_rate": 2.2982170855235755e-07, "loss": 0.0088, "step": 7493 }, { "epoch": 3.4094631483166515, "grad_norm": 0.5699497789513946, "learning_rate": 2.2970145346143044e-07, "loss": 0.0086, "step": 7494 }, { "epoch": 3.409918107370337, "grad_norm": 0.5210996700463056, "learning_rate": 2.2958122045807997e-07, "loss": 0.0049, "step": 7495 }, { "epoch": 3.4103730664240217, "grad_norm": 0.6370578985818718, "learning_rate": 2.2946100955213078e-07, "loss": 0.0097, "step": 7496 }, { "epoch": 3.410828025477707, "grad_norm": 0.8275845688038269, "learning_rate": 2.2934082075340626e-07, "loss": 0.0105, "step": 7497 }, { "epoch": 3.4112829845313923, "grad_norm": 0.6165735170182686, "learning_rate": 2.2922065407172762e-07, "loss": 0.0033, "step": 7498 }, { "epoch": 3.411737943585077, "grad_norm": 0.6461118913218021, "learning_rate": 2.291005095169141e-07, "loss": 0.0104, "step": 7499 }, { "epoch": 3.4121929026387625, "grad_norm": 0.38948669288708987, "learning_rate": 2.2898038709878382e-07, "loss": 0.0042, "step": 7500 }, { "epoch": 3.412647861692448, "grad_norm": 0.26215135318793603, "learning_rate": 2.2886028682715214e-07, "loss": 0.0021, "step": 7501 }, { "epoch": 3.4131028207461327, "grad_norm": 1.6258020670710926, "learning_rate": 2.2874020871183358e-07, "loss": 0.0156, "step": 7502 }, { "epoch": 3.413557779799818, "grad_norm": 0.7513413707300415, "learning_rate": 2.2862015276264013e-07, "loss": 0.0197, "step": 7503 }, { "epoch": 3.4140127388535033, "grad_norm": 0.31321117394977965, "learning_rate": 2.285001189893823e-07, "loss": 0.0023, "step": 7504 }, { "epoch": 3.414467697907188, "grad_norm": 1.2234301010121689, "learning_rate": 2.2838010740186848e-07, "loss": 0.0138, "step": 7505 }, { "epoch": 3.4149226569608735, "grad_norm": 0.7313656360995175, "learning_rate": 2.2826011800990564e-07, "loss": 0.0185, "step": 7506 }, { "epoch": 3.415377616014559, "grad_norm": 0.7815707816994927, "learning_rate": 2.2814015082329897e-07, "loss": 0.0218, "step": 7507 }, { "epoch": 3.4158325750682437, "grad_norm": 0.5739763420078552, "learning_rate": 2.2802020585185146e-07, "loss": 0.0065, "step": 7508 }, { "epoch": 3.416287534121929, "grad_norm": 1.0861595354843416, "learning_rate": 2.2790028310536446e-07, "loss": 0.0146, "step": 7509 }, { "epoch": 3.4167424931756143, "grad_norm": 0.4628797954392701, "learning_rate": 2.277803825936376e-07, "loss": 0.0045, "step": 7510 }, { "epoch": 3.417197452229299, "grad_norm": 0.652397918794953, "learning_rate": 2.2766050432646833e-07, "loss": 0.0122, "step": 7511 }, { "epoch": 3.4176524112829845, "grad_norm": 1.0415406613236953, "learning_rate": 2.2754064831365294e-07, "loss": 0.0087, "step": 7512 }, { "epoch": 3.41810737033667, "grad_norm": 0.6242588045732779, "learning_rate": 2.2742081456498513e-07, "loss": 0.0113, "step": 7513 }, { "epoch": 3.4185623293903546, "grad_norm": 0.5385867085623881, "learning_rate": 2.2730100309025762e-07, "loss": 0.0105, "step": 7514 }, { "epoch": 3.41901728844404, "grad_norm": 0.6264591146182956, "learning_rate": 2.2718121389926071e-07, "loss": 0.0051, "step": 7515 }, { "epoch": 3.4194722474977253, "grad_norm": 0.895152740226296, "learning_rate": 2.270614470017827e-07, "loss": 0.0087, "step": 7516 }, { "epoch": 3.41992720655141, "grad_norm": 0.4547763341779787, "learning_rate": 2.2694170240761084e-07, "loss": 0.0053, "step": 7517 }, { "epoch": 3.4203821656050954, "grad_norm": 0.4894299892454428, "learning_rate": 2.2682198012652996e-07, "loss": 0.0073, "step": 7518 }, { "epoch": 3.4208371246587808, "grad_norm": 0.655828013673317, "learning_rate": 2.267022801683232e-07, "loss": 0.0118, "step": 7519 }, { "epoch": 3.421292083712466, "grad_norm": 0.5319280513474768, "learning_rate": 2.2658260254277172e-07, "loss": 0.0135, "step": 7520 }, { "epoch": 3.421747042766151, "grad_norm": 0.5380374741370839, "learning_rate": 2.264629472596552e-07, "loss": 0.0047, "step": 7521 }, { "epoch": 3.4222020018198362, "grad_norm": 0.38853422586749603, "learning_rate": 2.263433143287516e-07, "loss": 0.0047, "step": 7522 }, { "epoch": 3.4226569608735216, "grad_norm": 0.5779805068145907, "learning_rate": 2.2622370375983645e-07, "loss": 0.0125, "step": 7523 }, { "epoch": 3.4231119199272064, "grad_norm": 0.8074780307808598, "learning_rate": 2.2610411556268387e-07, "loss": 0.0251, "step": 7524 }, { "epoch": 3.4235668789808917, "grad_norm": 0.5328656530575538, "learning_rate": 2.2598454974706593e-07, "loss": 0.0102, "step": 7525 }, { "epoch": 3.424021838034577, "grad_norm": 0.7566670669966932, "learning_rate": 2.258650063227533e-07, "loss": 0.0153, "step": 7526 }, { "epoch": 3.424476797088262, "grad_norm": 0.6056635289124147, "learning_rate": 2.257454852995142e-07, "loss": 0.0161, "step": 7527 }, { "epoch": 3.4249317561419472, "grad_norm": 1.0269838385013066, "learning_rate": 2.2562598668711568e-07, "loss": 0.0103, "step": 7528 }, { "epoch": 3.4253867151956325, "grad_norm": 0.5121034140857071, "learning_rate": 2.2550651049532248e-07, "loss": 0.0074, "step": 7529 }, { "epoch": 3.4258416742493174, "grad_norm": 0.38187183681678155, "learning_rate": 2.2538705673389745e-07, "loss": 0.0033, "step": 7530 }, { "epoch": 3.4262966333030027, "grad_norm": 0.5365268490972959, "learning_rate": 2.2526762541260218e-07, "loss": 0.0043, "step": 7531 }, { "epoch": 3.426751592356688, "grad_norm": 0.2868115962757468, "learning_rate": 2.2514821654119586e-07, "loss": 0.0026, "step": 7532 }, { "epoch": 3.427206551410373, "grad_norm": 0.40712464300903367, "learning_rate": 2.2502883012943613e-07, "loss": 0.0059, "step": 7533 }, { "epoch": 3.427661510464058, "grad_norm": 0.7589745759289658, "learning_rate": 2.249094661870784e-07, "loss": 0.0215, "step": 7534 }, { "epoch": 3.4281164695177435, "grad_norm": 0.6633018306420365, "learning_rate": 2.247901247238768e-07, "loss": 0.0055, "step": 7535 }, { "epoch": 3.4285714285714284, "grad_norm": 0.6249628630358469, "learning_rate": 2.2467080574958363e-07, "loss": 0.0066, "step": 7536 }, { "epoch": 3.4290263876251137, "grad_norm": 0.8362849456422414, "learning_rate": 2.2455150927394878e-07, "loss": 0.0129, "step": 7537 }, { "epoch": 3.429481346678799, "grad_norm": 0.48475013330070343, "learning_rate": 2.2443223530672068e-07, "loss": 0.0097, "step": 7538 }, { "epoch": 3.4299363057324843, "grad_norm": 0.5638312560436298, "learning_rate": 2.2431298385764563e-07, "loss": 0.0089, "step": 7539 }, { "epoch": 3.430391264786169, "grad_norm": 0.5198897709312454, "learning_rate": 2.241937549364688e-07, "loss": 0.0111, "step": 7540 }, { "epoch": 3.4308462238398545, "grad_norm": 0.5861668484707628, "learning_rate": 2.2407454855293257e-07, "loss": 0.0073, "step": 7541 }, { "epoch": 3.43130118289354, "grad_norm": 0.2644985179394257, "learning_rate": 2.2395536471677833e-07, "loss": 0.0018, "step": 7542 }, { "epoch": 3.4317561419472247, "grad_norm": 0.5669204055058009, "learning_rate": 2.238362034377451e-07, "loss": 0.0065, "step": 7543 }, { "epoch": 3.43221110100091, "grad_norm": 0.3621012390142855, "learning_rate": 2.2371706472557023e-07, "loss": 0.0033, "step": 7544 }, { "epoch": 3.4326660600545953, "grad_norm": 0.5892012342794751, "learning_rate": 2.2359794858998893e-07, "loss": 0.0076, "step": 7545 }, { "epoch": 3.43312101910828, "grad_norm": 0.7392002526812408, "learning_rate": 2.234788550407352e-07, "loss": 0.0063, "step": 7546 }, { "epoch": 3.4335759781619655, "grad_norm": 0.8486728379902249, "learning_rate": 2.233597840875407e-07, "loss": 0.0158, "step": 7547 }, { "epoch": 3.434030937215651, "grad_norm": 0.5200826954053561, "learning_rate": 2.232407357401352e-07, "loss": 0.0131, "step": 7548 }, { "epoch": 3.4344858962693356, "grad_norm": 0.9016848670313095, "learning_rate": 2.231217100082471e-07, "loss": 0.0149, "step": 7549 }, { "epoch": 3.434940855323021, "grad_norm": 0.5544495671622997, "learning_rate": 2.2300270690160227e-07, "loss": 0.0113, "step": 7550 }, { "epoch": 3.4353958143767063, "grad_norm": 0.49997525747562, "learning_rate": 2.2288372642992553e-07, "loss": 0.011, "step": 7551 }, { "epoch": 3.435850773430391, "grad_norm": 0.7755200489654073, "learning_rate": 2.227647686029392e-07, "loss": 0.0176, "step": 7552 }, { "epoch": 3.4363057324840764, "grad_norm": 0.49070665226505616, "learning_rate": 2.2264583343036402e-07, "loss": 0.0032, "step": 7553 }, { "epoch": 3.4367606915377618, "grad_norm": 0.5800872658785952, "learning_rate": 2.225269209219186e-07, "loss": 0.0133, "step": 7554 }, { "epoch": 3.4372156505914466, "grad_norm": 0.4140069381699699, "learning_rate": 2.224080310873202e-07, "loss": 0.0058, "step": 7555 }, { "epoch": 3.437670609645132, "grad_norm": 0.644612881055785, "learning_rate": 2.2228916393628406e-07, "loss": 0.0075, "step": 7556 }, { "epoch": 3.4381255686988172, "grad_norm": 0.7891572972950626, "learning_rate": 2.2217031947852333e-07, "loss": 0.0152, "step": 7557 }, { "epoch": 3.438580527752502, "grad_norm": 1.5889349140762166, "learning_rate": 2.2205149772374938e-07, "loss": 0.0082, "step": 7558 }, { "epoch": 3.4390354868061874, "grad_norm": 1.275375854417817, "learning_rate": 2.219326986816717e-07, "loss": 0.0187, "step": 7559 }, { "epoch": 3.4394904458598727, "grad_norm": 0.5168819648231492, "learning_rate": 2.2181392236199826e-07, "loss": 0.0109, "step": 7560 }, { "epoch": 3.4399454049135576, "grad_norm": 0.7635363853278249, "learning_rate": 2.2169516877443484e-07, "loss": 0.0101, "step": 7561 }, { "epoch": 3.440400363967243, "grad_norm": 0.6289728280173681, "learning_rate": 2.2157643792868525e-07, "loss": 0.0135, "step": 7562 }, { "epoch": 3.4408553230209282, "grad_norm": 0.6671148581525372, "learning_rate": 2.2145772983445199e-07, "loss": 0.0101, "step": 7563 }, { "epoch": 3.441310282074613, "grad_norm": 0.5682540572839337, "learning_rate": 2.2133904450143498e-07, "loss": 0.005, "step": 7564 }, { "epoch": 3.4417652411282984, "grad_norm": 0.6034420909077974, "learning_rate": 2.2122038193933296e-07, "loss": 0.0111, "step": 7565 }, { "epoch": 3.4422202001819837, "grad_norm": 0.7162544051023749, "learning_rate": 2.2110174215784245e-07, "loss": 0.008, "step": 7566 }, { "epoch": 3.4426751592356686, "grad_norm": 0.8430166326379459, "learning_rate": 2.20983125166658e-07, "loss": 0.0205, "step": 7567 }, { "epoch": 3.443130118289354, "grad_norm": 0.6003819125863686, "learning_rate": 2.2086453097547242e-07, "loss": 0.0061, "step": 7568 }, { "epoch": 3.443585077343039, "grad_norm": 0.7263845259098474, "learning_rate": 2.2074595959397674e-07, "loss": 0.0098, "step": 7569 }, { "epoch": 3.444040036396724, "grad_norm": 0.7611783589360511, "learning_rate": 2.2062741103186034e-07, "loss": 0.0133, "step": 7570 }, { "epoch": 3.4444949954504094, "grad_norm": 1.0157978884220757, "learning_rate": 2.205088852988103e-07, "loss": 0.0116, "step": 7571 }, { "epoch": 3.4449499545040947, "grad_norm": 1.000797732402794, "learning_rate": 2.2039038240451195e-07, "loss": 0.0143, "step": 7572 }, { "epoch": 3.4454049135577796, "grad_norm": 0.5101190905790332, "learning_rate": 2.2027190235864872e-07, "loss": 0.007, "step": 7573 }, { "epoch": 3.445859872611465, "grad_norm": 0.5460378632162021, "learning_rate": 2.201534451709025e-07, "loss": 0.0116, "step": 7574 }, { "epoch": 3.44631483166515, "grad_norm": 0.7102637511980564, "learning_rate": 2.20035010850953e-07, "loss": 0.0049, "step": 7575 }, { "epoch": 3.4467697907188355, "grad_norm": 0.8058381077418446, "learning_rate": 2.1991659940847796e-07, "loss": 0.0181, "step": 7576 }, { "epoch": 3.4472247497725204, "grad_norm": 0.6489711764852132, "learning_rate": 2.197982108531537e-07, "loss": 0.0079, "step": 7577 }, { "epoch": 3.4476797088262057, "grad_norm": 0.5149936744440072, "learning_rate": 2.196798451946541e-07, "loss": 0.0049, "step": 7578 }, { "epoch": 3.448134667879891, "grad_norm": 0.7217192575852976, "learning_rate": 2.195615024426518e-07, "loss": 0.0071, "step": 7579 }, { "epoch": 3.448589626933576, "grad_norm": 0.2887655025992217, "learning_rate": 2.194431826068171e-07, "loss": 0.0019, "step": 7580 }, { "epoch": 3.449044585987261, "grad_norm": 0.39158428552723445, "learning_rate": 2.193248856968185e-07, "loss": 0.0045, "step": 7581 }, { "epoch": 3.4494995450409465, "grad_norm": 0.7994783212394398, "learning_rate": 2.1920661172232275e-07, "loss": 0.0104, "step": 7582 }, { "epoch": 3.4499545040946313, "grad_norm": 0.6800396869861896, "learning_rate": 2.1908836069299447e-07, "loss": 0.0026, "step": 7583 }, { "epoch": 3.4504094631483166, "grad_norm": 1.116507345246118, "learning_rate": 2.1897013261849672e-07, "loss": 0.0164, "step": 7584 }, { "epoch": 3.450864422202002, "grad_norm": 0.5813394196304884, "learning_rate": 2.1885192750849085e-07, "loss": 0.0105, "step": 7585 }, { "epoch": 3.451319381255687, "grad_norm": 0.8281415739011928, "learning_rate": 2.187337453726358e-07, "loss": 0.0094, "step": 7586 }, { "epoch": 3.451774340309372, "grad_norm": 0.671633239658104, "learning_rate": 2.1861558622058883e-07, "loss": 0.0097, "step": 7587 }, { "epoch": 3.4522292993630574, "grad_norm": 0.5621927117497056, "learning_rate": 2.1849745006200532e-07, "loss": 0.0121, "step": 7588 }, { "epoch": 3.4526842584167423, "grad_norm": 0.6799519417384229, "learning_rate": 2.183793369065391e-07, "loss": 0.0074, "step": 7589 }, { "epoch": 3.4531392174704276, "grad_norm": 0.5112867513118133, "learning_rate": 2.182612467638415e-07, "loss": 0.005, "step": 7590 }, { "epoch": 3.453594176524113, "grad_norm": 0.681031542256464, "learning_rate": 2.1814317964356265e-07, "loss": 0.0118, "step": 7591 }, { "epoch": 3.4540491355777982, "grad_norm": 0.6604153048492248, "learning_rate": 2.1802513555535036e-07, "loss": 0.0078, "step": 7592 }, { "epoch": 3.454504094631483, "grad_norm": 0.6690586211708845, "learning_rate": 2.1790711450885034e-07, "loss": 0.0096, "step": 7593 }, { "epoch": 3.4549590536851684, "grad_norm": 0.6884501408217268, "learning_rate": 2.1778911651370723e-07, "loss": 0.014, "step": 7594 }, { "epoch": 3.4554140127388537, "grad_norm": 0.9120960764762162, "learning_rate": 2.1767114157956308e-07, "loss": 0.0166, "step": 7595 }, { "epoch": 3.4558689717925386, "grad_norm": 0.9455775148887481, "learning_rate": 2.1755318971605824e-07, "loss": 0.0161, "step": 7596 }, { "epoch": 3.456323930846224, "grad_norm": 0.6720224472168312, "learning_rate": 2.1743526093283104e-07, "loss": 0.0148, "step": 7597 }, { "epoch": 3.4567788898999092, "grad_norm": 0.5729473087888397, "learning_rate": 2.1731735523951826e-07, "loss": 0.0073, "step": 7598 }, { "epoch": 3.457233848953594, "grad_norm": 0.6264362684280178, "learning_rate": 2.1719947264575484e-07, "loss": 0.0115, "step": 7599 }, { "epoch": 3.4576888080072794, "grad_norm": 0.5943640755370861, "learning_rate": 2.1708161316117336e-07, "loss": 0.0108, "step": 7600 }, { "epoch": 3.4581437670609647, "grad_norm": 0.6641819489880169, "learning_rate": 2.169637767954048e-07, "loss": 0.0086, "step": 7601 }, { "epoch": 3.4585987261146496, "grad_norm": 0.41548166916271917, "learning_rate": 2.1684596355807804e-07, "loss": 0.0027, "step": 7602 }, { "epoch": 3.459053685168335, "grad_norm": 0.908001727749631, "learning_rate": 2.1672817345882067e-07, "loss": 0.0137, "step": 7603 }, { "epoch": 3.45950864422202, "grad_norm": 0.8355378555535569, "learning_rate": 2.1661040650725748e-07, "loss": 0.0202, "step": 7604 }, { "epoch": 3.459963603275705, "grad_norm": 0.7332112615835004, "learning_rate": 2.1649266271301224e-07, "loss": 0.017, "step": 7605 }, { "epoch": 3.4604185623293904, "grad_norm": 1.687427653381106, "learning_rate": 2.163749420857064e-07, "loss": 0.0046, "step": 7606 }, { "epoch": 3.4608735213830757, "grad_norm": 0.5394614996131555, "learning_rate": 2.162572446349592e-07, "loss": 0.0091, "step": 7607 }, { "epoch": 3.4613284804367606, "grad_norm": 0.6498067378876912, "learning_rate": 2.1613957037038877e-07, "loss": 0.0105, "step": 7608 }, { "epoch": 3.461783439490446, "grad_norm": 0.7415088811114238, "learning_rate": 2.160219193016108e-07, "loss": 0.0223, "step": 7609 }, { "epoch": 3.462238398544131, "grad_norm": 0.7781803565600616, "learning_rate": 2.1590429143823907e-07, "loss": 0.0081, "step": 7610 }, { "epoch": 3.462693357597816, "grad_norm": 0.5356588855979416, "learning_rate": 2.1578668678988554e-07, "loss": 0.0067, "step": 7611 }, { "epoch": 3.4631483166515014, "grad_norm": 0.7685966868611525, "learning_rate": 2.1566910536616052e-07, "loss": 0.0202, "step": 7612 }, { "epoch": 3.4636032757051867, "grad_norm": 0.7363219627935135, "learning_rate": 2.155515471766723e-07, "loss": 0.0164, "step": 7613 }, { "epoch": 3.4640582347588715, "grad_norm": 0.5601362958253191, "learning_rate": 2.1543401223102708e-07, "loss": 0.0103, "step": 7614 }, { "epoch": 3.464513193812557, "grad_norm": 0.5837031460393957, "learning_rate": 2.1531650053882932e-07, "loss": 0.0084, "step": 7615 }, { "epoch": 3.464968152866242, "grad_norm": 1.04729412812304, "learning_rate": 2.1519901210968157e-07, "loss": 0.0119, "step": 7616 }, { "epoch": 3.465423111919927, "grad_norm": 0.5529126665069, "learning_rate": 2.1508154695318414e-07, "loss": 0.0052, "step": 7617 }, { "epoch": 3.4658780709736123, "grad_norm": 0.4453104664597786, "learning_rate": 2.1496410507893603e-07, "loss": 0.0036, "step": 7618 }, { "epoch": 3.4663330300272976, "grad_norm": 0.3481088355406558, "learning_rate": 2.1484668649653425e-07, "loss": 0.0047, "step": 7619 }, { "epoch": 3.4667879890809825, "grad_norm": 0.6030059481249149, "learning_rate": 2.1472929121557347e-07, "loss": 0.0062, "step": 7620 }, { "epoch": 3.467242948134668, "grad_norm": 0.6380192540538873, "learning_rate": 2.1461191924564677e-07, "loss": 0.0037, "step": 7621 }, { "epoch": 3.467697907188353, "grad_norm": 0.49907535980994305, "learning_rate": 2.1449457059634501e-07, "loss": 0.0082, "step": 7622 }, { "epoch": 3.468152866242038, "grad_norm": 0.6550284222107265, "learning_rate": 2.1437724527725782e-07, "loss": 0.0096, "step": 7623 }, { "epoch": 3.4686078252957233, "grad_norm": 1.0905189137977125, "learning_rate": 2.1425994329797231e-07, "loss": 0.0079, "step": 7624 }, { "epoch": 3.4690627843494086, "grad_norm": 0.7106752310644547, "learning_rate": 2.141426646680736e-07, "loss": 0.0116, "step": 7625 }, { "epoch": 3.4695177434030935, "grad_norm": 0.7073747425365754, "learning_rate": 2.1402540939714565e-07, "loss": 0.0018, "step": 7626 }, { "epoch": 3.469972702456779, "grad_norm": 0.6677290282369831, "learning_rate": 2.139081774947696e-07, "loss": 0.0078, "step": 7627 }, { "epoch": 3.470427661510464, "grad_norm": 0.3837690707762307, "learning_rate": 2.1379096897052546e-07, "loss": 0.0053, "step": 7628 }, { "epoch": 3.470882620564149, "grad_norm": 0.40389257896267156, "learning_rate": 2.1367378383399088e-07, "loss": 0.002, "step": 7629 }, { "epoch": 3.4713375796178343, "grad_norm": 0.8078155287891192, "learning_rate": 2.1355662209474158e-07, "loss": 0.0211, "step": 7630 }, { "epoch": 3.4717925386715196, "grad_norm": 0.6201295061074923, "learning_rate": 2.1343948376235143e-07, "loss": 0.0088, "step": 7631 }, { "epoch": 3.472247497725205, "grad_norm": 0.48045182985088053, "learning_rate": 2.1332236884639253e-07, "loss": 0.006, "step": 7632 }, { "epoch": 3.47270245677889, "grad_norm": 0.569467317591628, "learning_rate": 2.1320527735643523e-07, "loss": 0.0065, "step": 7633 }, { "epoch": 3.473157415832575, "grad_norm": 0.6446083972028901, "learning_rate": 2.1308820930204753e-07, "loss": 0.0114, "step": 7634 }, { "epoch": 3.4736123748862604, "grad_norm": 0.493614990967363, "learning_rate": 2.1297116469279563e-07, "loss": 0.0039, "step": 7635 }, { "epoch": 3.4740673339399453, "grad_norm": 0.26896246181119954, "learning_rate": 2.1285414353824377e-07, "loss": 0.0031, "step": 7636 }, { "epoch": 3.4745222929936306, "grad_norm": 0.48622779756996115, "learning_rate": 2.1273714584795476e-07, "loss": 0.0094, "step": 7637 }, { "epoch": 3.474977252047316, "grad_norm": 0.554618883896482, "learning_rate": 2.1262017163148894e-07, "loss": 0.006, "step": 7638 }, { "epoch": 3.4754322111010008, "grad_norm": 0.42914489074786255, "learning_rate": 2.125032208984047e-07, "loss": 0.0035, "step": 7639 }, { "epoch": 3.475887170154686, "grad_norm": 0.9028114982358357, "learning_rate": 2.1238629365825912e-07, "loss": 0.01, "step": 7640 }, { "epoch": 3.4763421292083714, "grad_norm": 0.5420901136305284, "learning_rate": 2.1226938992060656e-07, "loss": 0.0094, "step": 7641 }, { "epoch": 3.4767970882620562, "grad_norm": 0.8491126610127406, "learning_rate": 2.1215250969500025e-07, "loss": 0.0139, "step": 7642 }, { "epoch": 3.4772520473157416, "grad_norm": 0.7624482992371646, "learning_rate": 2.1203565299099098e-07, "loss": 0.0123, "step": 7643 }, { "epoch": 3.477707006369427, "grad_norm": 0.6915073249239886, "learning_rate": 2.119188198181277e-07, "loss": 0.0121, "step": 7644 }, { "epoch": 3.4781619654231117, "grad_norm": 0.7813940702431491, "learning_rate": 2.1180201018595729e-07, "loss": 0.0026, "step": 7645 }, { "epoch": 3.478616924476797, "grad_norm": 1.0490110085819626, "learning_rate": 2.1168522410402517e-07, "loss": 0.0038, "step": 7646 }, { "epoch": 3.4790718835304824, "grad_norm": 0.904409408871114, "learning_rate": 2.1156846158187464e-07, "loss": 0.0144, "step": 7647 }, { "epoch": 3.4795268425841677, "grad_norm": 1.0745488585750436, "learning_rate": 2.1145172262904693e-07, "loss": 0.0055, "step": 7648 }, { "epoch": 3.4799818016378525, "grad_norm": 0.5128023139334983, "learning_rate": 2.1133500725508136e-07, "loss": 0.0079, "step": 7649 }, { "epoch": 3.480436760691538, "grad_norm": 0.6440726224052151, "learning_rate": 2.112183154695152e-07, "loss": 0.0063, "step": 7650 }, { "epoch": 3.480891719745223, "grad_norm": 0.7560308197511303, "learning_rate": 2.111016472818844e-07, "loss": 0.0129, "step": 7651 }, { "epoch": 3.481346678798908, "grad_norm": 0.8344803519288101, "learning_rate": 2.1098500270172226e-07, "loss": 0.0159, "step": 7652 }, { "epoch": 3.4818016378525933, "grad_norm": 0.8055188630708792, "learning_rate": 2.1086838173856036e-07, "loss": 0.0124, "step": 7653 }, { "epoch": 3.4822565969062786, "grad_norm": 0.7006367570168196, "learning_rate": 2.1075178440192882e-07, "loss": 0.0137, "step": 7654 }, { "epoch": 3.4827115559599635, "grad_norm": 0.6541203856456191, "learning_rate": 2.106352107013552e-07, "loss": 0.0079, "step": 7655 }, { "epoch": 3.483166515013649, "grad_norm": 0.8372901445877414, "learning_rate": 2.1051866064636525e-07, "loss": 0.0126, "step": 7656 }, { "epoch": 3.483621474067334, "grad_norm": 1.0728267074452174, "learning_rate": 2.1040213424648316e-07, "loss": 0.0281, "step": 7657 }, { "epoch": 3.484076433121019, "grad_norm": 0.8108888422336789, "learning_rate": 2.1028563151123096e-07, "loss": 0.0265, "step": 7658 }, { "epoch": 3.4845313921747043, "grad_norm": 0.7123351778178488, "learning_rate": 2.1016915245012856e-07, "loss": 0.0115, "step": 7659 }, { "epoch": 3.4849863512283896, "grad_norm": 0.37375698481826003, "learning_rate": 2.1005269707269395e-07, "loss": 0.0022, "step": 7660 }, { "epoch": 3.4854413102820745, "grad_norm": 0.6896494400348131, "learning_rate": 2.0993626538844356e-07, "loss": 0.0137, "step": 7661 }, { "epoch": 3.48589626933576, "grad_norm": 0.7051446710838073, "learning_rate": 2.0981985740689183e-07, "loss": 0.0109, "step": 7662 }, { "epoch": 3.486351228389445, "grad_norm": 0.37264588320572184, "learning_rate": 2.0970347313755093e-07, "loss": 0.0023, "step": 7663 }, { "epoch": 3.48680618744313, "grad_norm": 0.3941266947668728, "learning_rate": 2.0958711258993123e-07, "loss": 0.0052, "step": 7664 }, { "epoch": 3.4872611464968153, "grad_norm": 0.580360331039643, "learning_rate": 2.09470775773541e-07, "loss": 0.0103, "step": 7665 }, { "epoch": 3.4877161055505006, "grad_norm": 0.7883691305640139, "learning_rate": 2.0935446269788714e-07, "loss": 0.008, "step": 7666 }, { "epoch": 3.4881710646041855, "grad_norm": 0.7505643514802921, "learning_rate": 2.092381733724739e-07, "loss": 0.0097, "step": 7667 }, { "epoch": 3.488626023657871, "grad_norm": 0.48014758129104096, "learning_rate": 2.0912190780680422e-07, "loss": 0.0065, "step": 7668 }, { "epoch": 3.489080982711556, "grad_norm": 0.9388969962849222, "learning_rate": 2.090056660103786e-07, "loss": 0.0153, "step": 7669 }, { "epoch": 3.489535941765241, "grad_norm": 43.38023149058364, "learning_rate": 2.0888944799269569e-07, "loss": 0.0396, "step": 7670 }, { "epoch": 3.4899909008189263, "grad_norm": 0.43041194339811484, "learning_rate": 2.0877325376325266e-07, "loss": 0.0036, "step": 7671 }, { "epoch": 3.4904458598726116, "grad_norm": 0.47632077127369343, "learning_rate": 2.0865708333154412e-07, "loss": 0.0071, "step": 7672 }, { "epoch": 3.4909008189262964, "grad_norm": 0.6495768811851671, "learning_rate": 2.0854093670706308e-07, "loss": 0.0111, "step": 7673 }, { "epoch": 3.4913557779799818, "grad_norm": 0.9058951470426233, "learning_rate": 2.0842481389930023e-07, "loss": 0.0099, "step": 7674 }, { "epoch": 3.491810737033667, "grad_norm": 0.69736944332932, "learning_rate": 2.0830871491774488e-07, "loss": 0.0114, "step": 7675 }, { "epoch": 3.492265696087352, "grad_norm": 0.7555446124580851, "learning_rate": 2.081926397718843e-07, "loss": 0.0141, "step": 7676 }, { "epoch": 3.4927206551410372, "grad_norm": 0.6493060330822783, "learning_rate": 2.0807658847120335e-07, "loss": 0.0096, "step": 7677 }, { "epoch": 3.4931756141947226, "grad_norm": 0.6489355510660627, "learning_rate": 2.0796056102518527e-07, "loss": 0.0077, "step": 7678 }, { "epoch": 3.4936305732484074, "grad_norm": 0.6073136073113496, "learning_rate": 2.0784455744331113e-07, "loss": 0.0113, "step": 7679 }, { "epoch": 3.4940855323020927, "grad_norm": 0.8547507273679713, "learning_rate": 2.0772857773506059e-07, "loss": 0.0127, "step": 7680 }, { "epoch": 3.494540491355778, "grad_norm": 0.4590570904105181, "learning_rate": 2.0761262190991063e-07, "loss": 0.0051, "step": 7681 }, { "epoch": 3.494995450409463, "grad_norm": 0.6158049464649052, "learning_rate": 2.0749668997733693e-07, "loss": 0.008, "step": 7682 }, { "epoch": 3.4954504094631482, "grad_norm": 0.5160242536641908, "learning_rate": 2.0738078194681287e-07, "loss": 0.0043, "step": 7683 }, { "epoch": 3.4959053685168335, "grad_norm": 0.5982014875233602, "learning_rate": 2.0726489782780958e-07, "loss": 0.0153, "step": 7684 }, { "epoch": 3.496360327570519, "grad_norm": 0.7643520807873011, "learning_rate": 2.0714903762979713e-07, "loss": 0.0182, "step": 7685 }, { "epoch": 3.4968152866242037, "grad_norm": 0.3845904594353635, "learning_rate": 2.0703320136224273e-07, "loss": 0.0033, "step": 7686 }, { "epoch": 3.497270245677889, "grad_norm": 0.4718249042990343, "learning_rate": 2.0691738903461216e-07, "loss": 0.004, "step": 7687 }, { "epoch": 3.4977252047315743, "grad_norm": 0.5456902235158938, "learning_rate": 2.068016006563688e-07, "loss": 0.0103, "step": 7688 }, { "epoch": 3.498180163785259, "grad_norm": 0.7755826215837694, "learning_rate": 2.0668583623697471e-07, "loss": 0.0133, "step": 7689 }, { "epoch": 3.4986351228389445, "grad_norm": 1.1687275850732166, "learning_rate": 2.0657009578588935e-07, "loss": 0.0115, "step": 7690 }, { "epoch": 3.49909008189263, "grad_norm": 0.798774958438141, "learning_rate": 2.064543793125708e-07, "loss": 0.0101, "step": 7691 }, { "epoch": 3.4995450409463147, "grad_norm": 0.7602591882310743, "learning_rate": 2.0633868682647477e-07, "loss": 0.0207, "step": 7692 }, { "epoch": 3.5, "grad_norm": 1.4484996517826465, "learning_rate": 2.0622301833705507e-07, "loss": 0.0184, "step": 7693 }, { "epoch": 3.5004549590536853, "grad_norm": 0.6744940308835764, "learning_rate": 2.0610737385376348e-07, "loss": 0.006, "step": 7694 }, { "epoch": 3.50090991810737, "grad_norm": 0.8107194943013568, "learning_rate": 2.0599175338605002e-07, "loss": 0.0094, "step": 7695 }, { "epoch": 3.5013648771610555, "grad_norm": 0.5195172079090502, "learning_rate": 2.0587615694336295e-07, "loss": 0.0108, "step": 7696 }, { "epoch": 3.501819836214741, "grad_norm": 0.5670460147020906, "learning_rate": 2.057605845351481e-07, "loss": 0.0134, "step": 7697 }, { "epoch": 3.502274795268426, "grad_norm": 0.5544651096289986, "learning_rate": 2.056450361708495e-07, "loss": 0.0088, "step": 7698 }, { "epoch": 3.502729754322111, "grad_norm": 0.5004436724786839, "learning_rate": 2.0552951185990908e-07, "loss": 0.0085, "step": 7699 }, { "epoch": 3.5031847133757963, "grad_norm": 0.6129339906638587, "learning_rate": 2.054140116117673e-07, "loss": 0.0092, "step": 7700 }, { "epoch": 3.5036396724294816, "grad_norm": 0.9922869802614129, "learning_rate": 2.0529853543586216e-07, "loss": 0.0131, "step": 7701 }, { "epoch": 3.5040946314831665, "grad_norm": 0.8871424702700379, "learning_rate": 2.0518308334162964e-07, "loss": 0.0075, "step": 7702 }, { "epoch": 3.5045495905368518, "grad_norm": 0.6711449455184268, "learning_rate": 2.050676553385044e-07, "loss": 0.0121, "step": 7703 }, { "epoch": 3.505004549590537, "grad_norm": 41.64022471180399, "learning_rate": 2.0495225143591826e-07, "loss": 0.0882, "step": 7704 }, { "epoch": 3.505459508644222, "grad_norm": 0.979358049164127, "learning_rate": 2.048368716433019e-07, "loss": 0.0146, "step": 7705 }, { "epoch": 3.5059144676979073, "grad_norm": 0.5991568792602532, "learning_rate": 2.0472151597008342e-07, "loss": 0.0097, "step": 7706 }, { "epoch": 3.5063694267515926, "grad_norm": 0.5704110435603712, "learning_rate": 2.0460618442568917e-07, "loss": 0.0075, "step": 7707 }, { "epoch": 3.5068243858052774, "grad_norm": 0.5924225766421233, "learning_rate": 2.0449087701954338e-07, "loss": 0.0131, "step": 7708 }, { "epoch": 3.5072793448589628, "grad_norm": 0.4783108926360455, "learning_rate": 2.0437559376106861e-07, "loss": 0.0064, "step": 7709 }, { "epoch": 3.507734303912648, "grad_norm": 0.7191359213295054, "learning_rate": 2.042603346596855e-07, "loss": 0.0073, "step": 7710 }, { "epoch": 3.508189262966333, "grad_norm": 0.3899862300809221, "learning_rate": 2.0414509972481225e-07, "loss": 0.0043, "step": 7711 }, { "epoch": 3.5086442220200182, "grad_norm": 0.3734622015700309, "learning_rate": 2.0402988896586542e-07, "loss": 0.0059, "step": 7712 }, { "epoch": 3.5090991810737036, "grad_norm": 0.4208739947952024, "learning_rate": 2.0391470239225928e-07, "loss": 0.0053, "step": 7713 }, { "epoch": 3.5095541401273884, "grad_norm": 0.3562512101792738, "learning_rate": 2.0379954001340672e-07, "loss": 0.0031, "step": 7714 }, { "epoch": 3.5100090991810737, "grad_norm": 0.63217623887472, "learning_rate": 2.036844018387181e-07, "loss": 0.0099, "step": 7715 }, { "epoch": 3.510464058234759, "grad_norm": 0.5085029512976544, "learning_rate": 2.0356928787760186e-07, "loss": 0.0058, "step": 7716 }, { "epoch": 3.510919017288444, "grad_norm": 0.7902785873029788, "learning_rate": 2.0345419813946495e-07, "loss": 0.0038, "step": 7717 }, { "epoch": 3.511373976342129, "grad_norm": 0.5404510486927532, "learning_rate": 2.0333913263371156e-07, "loss": 0.0119, "step": 7718 }, { "epoch": 3.5118289353958145, "grad_norm": 0.8093015684809592, "learning_rate": 2.0322409136974476e-07, "loss": 0.0109, "step": 7719 }, { "epoch": 3.5122838944494994, "grad_norm": 0.49683347296273317, "learning_rate": 2.0310907435696496e-07, "loss": 0.004, "step": 7720 }, { "epoch": 3.5127388535031847, "grad_norm": 0.36943639435872616, "learning_rate": 2.0299408160477083e-07, "loss": 0.0071, "step": 7721 }, { "epoch": 3.51319381255687, "grad_norm": 0.45186676440357504, "learning_rate": 2.0287911312255912e-07, "loss": 0.0083, "step": 7722 }, { "epoch": 3.513648771610555, "grad_norm": 0.6138471070800074, "learning_rate": 2.0276416891972415e-07, "loss": 0.0103, "step": 7723 }, { "epoch": 3.51410373066424, "grad_norm": 0.5217375510819893, "learning_rate": 2.0264924900565933e-07, "loss": 0.004, "step": 7724 }, { "epoch": 3.5145586897179255, "grad_norm": 0.7869856406558752, "learning_rate": 2.0253435338975505e-07, "loss": 0.0043, "step": 7725 }, { "epoch": 3.5150136487716104, "grad_norm": 0.5824282800257232, "learning_rate": 2.024194820814001e-07, "loss": 0.0093, "step": 7726 }, { "epoch": 3.5154686078252957, "grad_norm": 0.7084778596287391, "learning_rate": 2.023046350899812e-07, "loss": 0.0057, "step": 7727 }, { "epoch": 3.515923566878981, "grad_norm": 0.6405141977530951, "learning_rate": 2.0218981242488292e-07, "loss": 0.0073, "step": 7728 }, { "epoch": 3.516378525932666, "grad_norm": 0.7700896333202202, "learning_rate": 2.020750140954885e-07, "loss": 0.0167, "step": 7729 }, { "epoch": 3.516833484986351, "grad_norm": 0.7264856157666546, "learning_rate": 2.019602401111783e-07, "loss": 0.009, "step": 7730 }, { "epoch": 3.5172884440400365, "grad_norm": 0.582382439565773, "learning_rate": 2.0184549048133155e-07, "loss": 0.0105, "step": 7731 }, { "epoch": 3.5177434030937214, "grad_norm": 0.7124877952786189, "learning_rate": 2.0173076521532484e-07, "loss": 0.0049, "step": 7732 }, { "epoch": 3.5181983621474067, "grad_norm": 0.6631238868556282, "learning_rate": 2.0161606432253285e-07, "loss": 0.0133, "step": 7733 }, { "epoch": 3.518653321201092, "grad_norm": 0.7776713694288407, "learning_rate": 2.015013878123288e-07, "loss": 0.0186, "step": 7734 }, { "epoch": 3.519108280254777, "grad_norm": 0.7273232382525459, "learning_rate": 2.0138673569408328e-07, "loss": 0.0079, "step": 7735 }, { "epoch": 3.519563239308462, "grad_norm": 0.37074489840333824, "learning_rate": 2.012721079771652e-07, "loss": 0.0031, "step": 7736 }, { "epoch": 3.5200181983621475, "grad_norm": 0.6946534570517507, "learning_rate": 2.011575046709413e-07, "loss": 0.0086, "step": 7737 }, { "epoch": 3.5204731574158323, "grad_norm": 0.5226655847142668, "learning_rate": 2.010429257847765e-07, "loss": 0.0168, "step": 7738 }, { "epoch": 3.5209281164695176, "grad_norm": 1.1736482109494262, "learning_rate": 2.0092837132803392e-07, "loss": 0.0059, "step": 7739 }, { "epoch": 3.521383075523203, "grad_norm": 0.6242711829534894, "learning_rate": 2.0081384131007422e-07, "loss": 0.0117, "step": 7740 }, { "epoch": 3.521838034576888, "grad_norm": 1.0477551517213335, "learning_rate": 2.0069933574025633e-07, "loss": 0.0057, "step": 7741 }, { "epoch": 3.522292993630573, "grad_norm": 0.5577290911715018, "learning_rate": 2.0058485462793688e-07, "loss": 0.01, "step": 7742 }, { "epoch": 3.5227479526842584, "grad_norm": 0.582916291595667, "learning_rate": 2.0047039798247116e-07, "loss": 0.0095, "step": 7743 }, { "epoch": 3.5232029117379433, "grad_norm": 1.1452712337524593, "learning_rate": 2.003559658132117e-07, "loss": 0.0209, "step": 7744 }, { "epoch": 3.5236578707916286, "grad_norm": 0.3830778965135432, "learning_rate": 2.0024155812950966e-07, "loss": 0.009, "step": 7745 }, { "epoch": 3.524112829845314, "grad_norm": 0.5751641601478813, "learning_rate": 2.0012717494071378e-07, "loss": 0.0086, "step": 7746 }, { "epoch": 3.5245677888989992, "grad_norm": 1.4793629161369592, "learning_rate": 2.0001281625617085e-07, "loss": 0.0134, "step": 7747 }, { "epoch": 3.525022747952684, "grad_norm": 0.6227354236950853, "learning_rate": 1.9989848208522598e-07, "loss": 0.01, "step": 7748 }, { "epoch": 3.5254777070063694, "grad_norm": 0.49004622634169753, "learning_rate": 1.997841724372219e-07, "loss": 0.0104, "step": 7749 }, { "epoch": 3.5259326660600547, "grad_norm": 0.7116586475774364, "learning_rate": 1.9966988732149948e-07, "loss": 0.0089, "step": 7750 }, { "epoch": 3.5263876251137396, "grad_norm": 0.9993948284607208, "learning_rate": 1.9955562674739745e-07, "loss": 0.0202, "step": 7751 }, { "epoch": 3.526842584167425, "grad_norm": 0.7898517802776058, "learning_rate": 1.9944139072425275e-07, "loss": 0.0096, "step": 7752 }, { "epoch": 3.52729754322111, "grad_norm": 1.5151449117475093, "learning_rate": 1.9932717926140052e-07, "loss": 0.0053, "step": 7753 }, { "epoch": 3.5277525022747955, "grad_norm": 0.8826151246444269, "learning_rate": 1.9921299236817335e-07, "loss": 0.0111, "step": 7754 }, { "epoch": 3.5282074613284804, "grad_norm": 0.6629473639126524, "learning_rate": 1.990988300539021e-07, "loss": 0.0123, "step": 7755 }, { "epoch": 3.5286624203821657, "grad_norm": 1.0828995283089597, "learning_rate": 1.9898469232791544e-07, "loss": 0.0108, "step": 7756 }, { "epoch": 3.529117379435851, "grad_norm": 0.6688933147482798, "learning_rate": 1.9887057919954054e-07, "loss": 0.016, "step": 7757 }, { "epoch": 3.529572338489536, "grad_norm": 0.8358270402181139, "learning_rate": 1.987564906781018e-07, "loss": 0.0066, "step": 7758 }, { "epoch": 3.530027297543221, "grad_norm": 0.9282451845958891, "learning_rate": 1.9864242677292243e-07, "loss": 0.0185, "step": 7759 }, { "epoch": 3.5304822565969065, "grad_norm": 0.7358812602843569, "learning_rate": 1.9852838749332302e-07, "loss": 0.014, "step": 7760 }, { "epoch": 3.5309372156505914, "grad_norm": 0.8327492545627139, "learning_rate": 1.984143728486224e-07, "loss": 0.0178, "step": 7761 }, { "epoch": 3.5313921747042767, "grad_norm": 0.6862857641432073, "learning_rate": 1.9830038284813706e-07, "loss": 0.0043, "step": 7762 }, { "epoch": 3.531847133757962, "grad_norm": 0.5616765266845133, "learning_rate": 1.9818641750118214e-07, "loss": 0.0139, "step": 7763 }, { "epoch": 3.532302092811647, "grad_norm": 0.4446036514112624, "learning_rate": 1.9807247681707017e-07, "loss": 0.0047, "step": 7764 }, { "epoch": 3.532757051865332, "grad_norm": 0.45666441188913615, "learning_rate": 1.979585608051118e-07, "loss": 0.006, "step": 7765 }, { "epoch": 3.5332120109190175, "grad_norm": 1.1615984923886853, "learning_rate": 1.9784466947461599e-07, "loss": 0.0105, "step": 7766 }, { "epoch": 3.5336669699727024, "grad_norm": 0.5991975723228553, "learning_rate": 1.9773080283488907e-07, "loss": 0.0041, "step": 7767 }, { "epoch": 3.5341219290263877, "grad_norm": 0.741737774804608, "learning_rate": 1.976169608952361e-07, "loss": 0.0089, "step": 7768 }, { "epoch": 3.534576888080073, "grad_norm": 0.8313880350045523, "learning_rate": 1.9750314366495952e-07, "loss": 0.0133, "step": 7769 }, { "epoch": 3.535031847133758, "grad_norm": 0.9097306227100388, "learning_rate": 1.9738935115336002e-07, "loss": 0.0191, "step": 7770 }, { "epoch": 3.535486806187443, "grad_norm": 0.5321007777116347, "learning_rate": 1.9727558336973594e-07, "loss": 0.0052, "step": 7771 }, { "epoch": 3.5359417652411285, "grad_norm": 0.5446221069975231, "learning_rate": 1.9716184032338413e-07, "loss": 0.007, "step": 7772 }, { "epoch": 3.5363967242948133, "grad_norm": 0.5566798371765417, "learning_rate": 1.9704812202359926e-07, "loss": 0.0093, "step": 7773 }, { "epoch": 3.5368516833484986, "grad_norm": 0.9292640321405934, "learning_rate": 1.9693442847967378e-07, "loss": 0.0077, "step": 7774 }, { "epoch": 3.537306642402184, "grad_norm": 1.579492218114512, "learning_rate": 1.9682075970089812e-07, "loss": 0.0273, "step": 7775 }, { "epoch": 3.537761601455869, "grad_norm": 0.8781038360827347, "learning_rate": 1.9670711569656073e-07, "loss": 0.028, "step": 7776 }, { "epoch": 3.538216560509554, "grad_norm": 1.4731974910800838, "learning_rate": 1.965934964759483e-07, "loss": 0.016, "step": 7777 }, { "epoch": 3.5386715195632394, "grad_norm": 0.6567480929012334, "learning_rate": 1.9647990204834518e-07, "loss": 0.0094, "step": 7778 }, { "epoch": 3.5391264786169243, "grad_norm": 0.6660616736613298, "learning_rate": 1.9636633242303362e-07, "loss": 0.0106, "step": 7779 }, { "epoch": 3.5395814376706096, "grad_norm": 0.6865852214341369, "learning_rate": 1.9625278760929436e-07, "loss": 0.0197, "step": 7780 }, { "epoch": 3.540036396724295, "grad_norm": 0.6471235704689107, "learning_rate": 1.9613926761640543e-07, "loss": 0.0169, "step": 7781 }, { "epoch": 3.54049135577798, "grad_norm": 0.9197482550017754, "learning_rate": 1.9602577245364343e-07, "loss": 0.0094, "step": 7782 }, { "epoch": 3.540946314831665, "grad_norm": 0.9222773308116102, "learning_rate": 1.9591230213028264e-07, "loss": 0.0077, "step": 7783 }, { "epoch": 3.5414012738853504, "grad_norm": 0.8012022657293248, "learning_rate": 1.9579885665559527e-07, "loss": 0.014, "step": 7784 }, { "epoch": 3.5418562329390353, "grad_norm": 0.5330560736252816, "learning_rate": 1.9568543603885135e-07, "loss": 0.0065, "step": 7785 }, { "epoch": 3.5423111919927206, "grad_norm": 0.6517268816333068, "learning_rate": 1.9557204028931934e-07, "loss": 0.0083, "step": 7786 }, { "epoch": 3.542766151046406, "grad_norm": 0.48520935312522384, "learning_rate": 1.954586694162656e-07, "loss": 0.0074, "step": 7787 }, { "epoch": 3.5432211101000908, "grad_norm": 0.5560735711339501, "learning_rate": 1.953453234289541e-07, "loss": 0.0138, "step": 7788 }, { "epoch": 3.543676069153776, "grad_norm": 0.5771634901466152, "learning_rate": 1.9523200233664694e-07, "loss": 0.0107, "step": 7789 }, { "epoch": 3.5441310282074614, "grad_norm": 0.6679126319567597, "learning_rate": 1.9511870614860405e-07, "loss": 0.0157, "step": 7790 }, { "epoch": 3.5445859872611463, "grad_norm": 0.7276316735732392, "learning_rate": 1.9500543487408388e-07, "loss": 0.0058, "step": 7791 }, { "epoch": 3.5450409463148316, "grad_norm": 0.5893747568656978, "learning_rate": 1.9489218852234218e-07, "loss": 0.007, "step": 7792 }, { "epoch": 3.545495905368517, "grad_norm": 0.7265389352796151, "learning_rate": 1.947789671026328e-07, "loss": 0.0089, "step": 7793 }, { "epoch": 3.5459508644222018, "grad_norm": 0.6437522948602018, "learning_rate": 1.9466577062420808e-07, "loss": 0.0089, "step": 7794 }, { "epoch": 3.546405823475887, "grad_norm": 0.8202489697930979, "learning_rate": 1.9455259909631756e-07, "loss": 0.0151, "step": 7795 }, { "epoch": 3.5468607825295724, "grad_norm": 0.7354373322741758, "learning_rate": 1.9443945252820936e-07, "loss": 0.0223, "step": 7796 }, { "epoch": 3.5473157415832572, "grad_norm": 0.6343968448559626, "learning_rate": 1.943263309291292e-07, "loss": 0.0112, "step": 7797 }, { "epoch": 3.5477707006369426, "grad_norm": 0.8031244111965546, "learning_rate": 1.9421323430832094e-07, "loss": 0.0078, "step": 7798 }, { "epoch": 3.548225659690628, "grad_norm": 1.380294858993297, "learning_rate": 1.9410016267502615e-07, "loss": 0.0193, "step": 7799 }, { "epoch": 3.548680618744313, "grad_norm": 0.660035319934087, "learning_rate": 1.9398711603848454e-07, "loss": 0.0051, "step": 7800 }, { "epoch": 3.549135577797998, "grad_norm": 0.8893037176097816, "learning_rate": 1.9387409440793385e-07, "loss": 0.0095, "step": 7801 }, { "epoch": 3.5495905368516834, "grad_norm": 0.583790934733208, "learning_rate": 1.9376109779260985e-07, "loss": 0.0057, "step": 7802 }, { "epoch": 3.5500454959053687, "grad_norm": 0.7185393689463534, "learning_rate": 1.9364812620174604e-07, "loss": 0.0137, "step": 7803 }, { "epoch": 3.5505004549590535, "grad_norm": 0.6144877605011533, "learning_rate": 1.9353517964457384e-07, "loss": 0.0131, "step": 7804 }, { "epoch": 3.550955414012739, "grad_norm": 0.6059224696420816, "learning_rate": 1.934222581303226e-07, "loss": 0.0122, "step": 7805 }, { "epoch": 3.551410373066424, "grad_norm": 0.4992877257932138, "learning_rate": 1.933093616682201e-07, "loss": 0.0052, "step": 7806 }, { "epoch": 3.5518653321201095, "grad_norm": 0.3646856240143395, "learning_rate": 1.931964902674914e-07, "loss": 0.0033, "step": 7807 }, { "epoch": 3.5523202911737943, "grad_norm": 0.4129090252742288, "learning_rate": 1.930836439373602e-07, "loss": 0.0037, "step": 7808 }, { "epoch": 3.5527752502274796, "grad_norm": 0.787219623265633, "learning_rate": 1.9297082268704757e-07, "loss": 0.0098, "step": 7809 }, { "epoch": 3.553230209281165, "grad_norm": 0.6550082688195541, "learning_rate": 1.928580265257726e-07, "loss": 0.0054, "step": 7810 }, { "epoch": 3.55368516833485, "grad_norm": 0.5999458926253766, "learning_rate": 1.9274525546275283e-07, "loss": 0.0082, "step": 7811 }, { "epoch": 3.554140127388535, "grad_norm": 0.6887798128865817, "learning_rate": 1.9263250950720327e-07, "loss": 0.0153, "step": 7812 }, { "epoch": 3.5545950864422204, "grad_norm": 0.5764543533954326, "learning_rate": 1.9251978866833695e-07, "loss": 0.0049, "step": 7813 }, { "epoch": 3.5550500454959053, "grad_norm": 0.6728467311213493, "learning_rate": 1.9240709295536478e-07, "loss": 0.0053, "step": 7814 }, { "epoch": 3.5555050045495906, "grad_norm": 0.5682614280854209, "learning_rate": 1.9229442237749589e-07, "loss": 0.0104, "step": 7815 }, { "epoch": 3.555959963603276, "grad_norm": 0.7696664399468236, "learning_rate": 1.9218177694393733e-07, "loss": 0.006, "step": 7816 }, { "epoch": 3.556414922656961, "grad_norm": 0.8712785883248028, "learning_rate": 1.9206915666389395e-07, "loss": 0.0135, "step": 7817 }, { "epoch": 3.556869881710646, "grad_norm": 0.7296771294174788, "learning_rate": 1.9195656154656842e-07, "loss": 0.0108, "step": 7818 }, { "epoch": 3.5573248407643314, "grad_norm": 3.8846989768921287, "learning_rate": 1.9184399160116144e-07, "loss": 0.053, "step": 7819 }, { "epoch": 3.5577797998180163, "grad_norm": 0.39365653197600836, "learning_rate": 1.9173144683687198e-07, "loss": 0.0041, "step": 7820 }, { "epoch": 3.5582347588717016, "grad_norm": 0.6995739368722992, "learning_rate": 1.916189272628964e-07, "loss": 0.0081, "step": 7821 }, { "epoch": 3.558689717925387, "grad_norm": 0.8424657596904735, "learning_rate": 1.915064328884296e-07, "loss": 0.0087, "step": 7822 }, { "epoch": 3.5591446769790718, "grad_norm": 0.8848002491738235, "learning_rate": 1.9139396372266402e-07, "loss": 0.0206, "step": 7823 }, { "epoch": 3.559599636032757, "grad_norm": 9.981231471946034, "learning_rate": 1.9128151977478986e-07, "loss": 0.0511, "step": 7824 }, { "epoch": 3.5600545950864424, "grad_norm": 0.7916755929460304, "learning_rate": 1.9116910105399592e-07, "loss": 0.0057, "step": 7825 }, { "epoch": 3.5605095541401273, "grad_norm": 0.6368199267170069, "learning_rate": 1.9105670756946833e-07, "loss": 0.009, "step": 7826 }, { "epoch": 3.5609645131938126, "grad_norm": 0.5839106918575393, "learning_rate": 1.909443393303915e-07, "loss": 0.0085, "step": 7827 }, { "epoch": 3.561419472247498, "grad_norm": 0.776030074207624, "learning_rate": 1.9083199634594738e-07, "loss": 0.0191, "step": 7828 }, { "epoch": 3.5618744313011828, "grad_norm": 0.4017710158736845, "learning_rate": 1.907196786253163e-07, "loss": 0.004, "step": 7829 }, { "epoch": 3.562329390354868, "grad_norm": 0.7573424079414106, "learning_rate": 1.9060738617767657e-07, "loss": 0.021, "step": 7830 }, { "epoch": 3.5627843494085534, "grad_norm": 0.49880476893311976, "learning_rate": 1.9049511901220405e-07, "loss": 0.0032, "step": 7831 }, { "epoch": 3.5632393084622382, "grad_norm": 0.7125614500128543, "learning_rate": 1.9038287713807267e-07, "loss": 0.0078, "step": 7832 }, { "epoch": 3.5636942675159236, "grad_norm": 0.681920885775602, "learning_rate": 1.9027066056445435e-07, "loss": 0.0054, "step": 7833 }, { "epoch": 3.564149226569609, "grad_norm": 0.9234117355917877, "learning_rate": 1.9015846930051877e-07, "loss": 0.0096, "step": 7834 }, { "epoch": 3.5646041856232937, "grad_norm": 0.5418829512806898, "learning_rate": 1.9004630335543387e-07, "loss": 0.0044, "step": 7835 }, { "epoch": 3.565059144676979, "grad_norm": 0.40203583567162793, "learning_rate": 1.8993416273836544e-07, "loss": 0.0047, "step": 7836 }, { "epoch": 3.5655141037306644, "grad_norm": 0.8054062733423555, "learning_rate": 1.8982204745847702e-07, "loss": 0.01, "step": 7837 }, { "epoch": 3.565969062784349, "grad_norm": 0.644744113888807, "learning_rate": 1.8970995752493014e-07, "loss": 0.008, "step": 7838 }, { "epoch": 3.5664240218380345, "grad_norm": 0.5989876212822, "learning_rate": 1.8959789294688406e-07, "loss": 0.0072, "step": 7839 }, { "epoch": 3.56687898089172, "grad_norm": 0.3725614921082798, "learning_rate": 1.8948585373349663e-07, "loss": 0.0028, "step": 7840 }, { "epoch": 3.5673339399454047, "grad_norm": 0.31559280588887206, "learning_rate": 1.8937383989392293e-07, "loss": 0.0016, "step": 7841 }, { "epoch": 3.56778889899909, "grad_norm": 0.7345414944129933, "learning_rate": 1.8926185143731605e-07, "loss": 0.0101, "step": 7842 }, { "epoch": 3.5682438580527753, "grad_norm": 1.4237455113603854, "learning_rate": 1.8914988837282763e-07, "loss": 0.0262, "step": 7843 }, { "epoch": 3.56869881710646, "grad_norm": 0.6802708339705464, "learning_rate": 1.8903795070960633e-07, "loss": 0.0078, "step": 7844 }, { "epoch": 3.5691537761601455, "grad_norm": 0.7959629796640272, "learning_rate": 1.8892603845679961e-07, "loss": 0.0098, "step": 7845 }, { "epoch": 3.569608735213831, "grad_norm": 0.43846045198543515, "learning_rate": 1.8881415162355218e-07, "loss": 0.0083, "step": 7846 }, { "epoch": 3.5700636942675157, "grad_norm": 0.531670331775544, "learning_rate": 1.8870229021900702e-07, "loss": 0.0136, "step": 7847 }, { "epoch": 3.570518653321201, "grad_norm": 0.6131674124319763, "learning_rate": 1.8859045425230475e-07, "loss": 0.0071, "step": 7848 }, { "epoch": 3.5709736123748863, "grad_norm": 0.5945123426296216, "learning_rate": 1.8847864373258416e-07, "loss": 0.0079, "step": 7849 }, { "epoch": 3.571428571428571, "grad_norm": 0.6768229634215076, "learning_rate": 1.8836685866898222e-07, "loss": 0.0068, "step": 7850 }, { "epoch": 3.5718835304822565, "grad_norm": 0.5177213143451171, "learning_rate": 1.8825509907063326e-07, "loss": 0.0117, "step": 7851 }, { "epoch": 3.572338489535942, "grad_norm": 0.8326071427491966, "learning_rate": 1.8814336494666977e-07, "loss": 0.012, "step": 7852 }, { "epoch": 3.5727934485896267, "grad_norm": 0.6137906953502368, "learning_rate": 1.8803165630622197e-07, "loss": 0.0101, "step": 7853 }, { "epoch": 3.573248407643312, "grad_norm": 0.8945516109672005, "learning_rate": 1.8791997315841863e-07, "loss": 0.0152, "step": 7854 }, { "epoch": 3.5737033666969973, "grad_norm": 0.6337762012684623, "learning_rate": 1.8780831551238562e-07, "loss": 0.0146, "step": 7855 }, { "epoch": 3.5741583257506826, "grad_norm": 0.6029859563750084, "learning_rate": 1.8769668337724714e-07, "loss": 0.0079, "step": 7856 }, { "epoch": 3.5746132848043675, "grad_norm": 0.9443621589868063, "learning_rate": 1.8758507676212547e-07, "loss": 0.0121, "step": 7857 }, { "epoch": 3.5750682438580528, "grad_norm": 0.7123831288672474, "learning_rate": 1.8747349567614034e-07, "loss": 0.0119, "step": 7858 }, { "epoch": 3.575523202911738, "grad_norm": 0.7509582942229913, "learning_rate": 1.8736194012840996e-07, "loss": 0.0203, "step": 7859 }, { "epoch": 3.575978161965423, "grad_norm": 0.4724892854532428, "learning_rate": 1.8725041012804993e-07, "loss": 0.0029, "step": 7860 }, { "epoch": 3.5764331210191083, "grad_norm": 0.5638008597961734, "learning_rate": 1.8713890568417407e-07, "loss": 0.0137, "step": 7861 }, { "epoch": 3.5768880800727936, "grad_norm": 0.42560898849397377, "learning_rate": 1.8702742680589378e-07, "loss": 0.005, "step": 7862 }, { "epoch": 3.577343039126479, "grad_norm": 1.2267150125179695, "learning_rate": 1.8691597350231874e-07, "loss": 0.0157, "step": 7863 }, { "epoch": 3.5777979981801638, "grad_norm": 0.9931040142172171, "learning_rate": 1.8680454578255672e-07, "loss": 0.0133, "step": 7864 }, { "epoch": 3.578252957233849, "grad_norm": 0.4378191094298598, "learning_rate": 1.8669314365571283e-07, "loss": 0.0034, "step": 7865 }, { "epoch": 3.5787079162875344, "grad_norm": 0.7488968768355677, "learning_rate": 1.8658176713089036e-07, "loss": 0.0109, "step": 7866 }, { "epoch": 3.5791628753412192, "grad_norm": 0.8895281907053633, "learning_rate": 1.8647041621719045e-07, "loss": 0.0095, "step": 7867 }, { "epoch": 3.5796178343949046, "grad_norm": 0.521779123971729, "learning_rate": 1.863590909237121e-07, "loss": 0.0067, "step": 7868 }, { "epoch": 3.58007279344859, "grad_norm": 0.5353063618003269, "learning_rate": 1.862477912595526e-07, "loss": 0.0038, "step": 7869 }, { "epoch": 3.5805277525022747, "grad_norm": 0.3668960476300501, "learning_rate": 1.8613651723380657e-07, "loss": 0.0023, "step": 7870 }, { "epoch": 3.58098271155596, "grad_norm": 0.6777037466314345, "learning_rate": 1.8602526885556714e-07, "loss": 0.0123, "step": 7871 }, { "epoch": 3.5814376706096454, "grad_norm": 1.007345830638154, "learning_rate": 1.8591404613392476e-07, "loss": 0.0175, "step": 7872 }, { "epoch": 3.58189262966333, "grad_norm": 0.4357810778724562, "learning_rate": 1.85802849077968e-07, "loss": 0.0073, "step": 7873 }, { "epoch": 3.5823475887170155, "grad_norm": 0.6092319427754339, "learning_rate": 1.8569167769678374e-07, "loss": 0.008, "step": 7874 }, { "epoch": 3.582802547770701, "grad_norm": 1.14949496563264, "learning_rate": 1.8558053199945612e-07, "loss": 0.0115, "step": 7875 }, { "epoch": 3.5832575068243857, "grad_norm": 0.9251208372693599, "learning_rate": 1.854694119950675e-07, "loss": 0.017, "step": 7876 }, { "epoch": 3.583712465878071, "grad_norm": 0.8091725396341435, "learning_rate": 1.85358317692698e-07, "loss": 0.0072, "step": 7877 }, { "epoch": 3.5841674249317563, "grad_norm": 0.6036004315631195, "learning_rate": 1.8524724910142586e-07, "loss": 0.0138, "step": 7878 }, { "epoch": 3.584622383985441, "grad_norm": 1.2729696077028514, "learning_rate": 1.8513620623032727e-07, "loss": 0.0274, "step": 7879 }, { "epoch": 3.5850773430391265, "grad_norm": 0.3650921634555704, "learning_rate": 1.8502518908847598e-07, "loss": 0.004, "step": 7880 }, { "epoch": 3.585532302092812, "grad_norm": 0.7318615231647003, "learning_rate": 1.8491419768494389e-07, "loss": 0.01, "step": 7881 }, { "epoch": 3.5859872611464967, "grad_norm": 0.6137283456896471, "learning_rate": 1.8480323202880038e-07, "loss": 0.0159, "step": 7882 }, { "epoch": 3.586442220200182, "grad_norm": 0.36763224723253646, "learning_rate": 1.8469229212911357e-07, "loss": 0.0048, "step": 7883 }, { "epoch": 3.5868971792538673, "grad_norm": 0.5635657056501874, "learning_rate": 1.8458137799494856e-07, "loss": 0.006, "step": 7884 }, { "epoch": 3.587352138307552, "grad_norm": 0.4243387519571532, "learning_rate": 1.8447048963536903e-07, "loss": 0.0052, "step": 7885 }, { "epoch": 3.5878070973612375, "grad_norm": 0.7704754541673645, "learning_rate": 1.8435962705943626e-07, "loss": 0.0097, "step": 7886 }, { "epoch": 3.588262056414923, "grad_norm": 0.649099050118693, "learning_rate": 1.842487902762091e-07, "loss": 0.0145, "step": 7887 }, { "epoch": 3.5887170154686077, "grad_norm": 0.8452279936339993, "learning_rate": 1.841379792947451e-07, "loss": 0.0109, "step": 7888 }, { "epoch": 3.589171974522293, "grad_norm": 0.8067303807630184, "learning_rate": 1.84027194124099e-07, "loss": 0.0136, "step": 7889 }, { "epoch": 3.5896269335759783, "grad_norm": 0.8512670327271114, "learning_rate": 1.8391643477332364e-07, "loss": 0.0181, "step": 7890 }, { "epoch": 3.590081892629663, "grad_norm": 0.6643129010654808, "learning_rate": 1.8380570125146965e-07, "loss": 0.0116, "step": 7891 }, { "epoch": 3.5905368516833485, "grad_norm": 0.6469559790106295, "learning_rate": 1.8369499356758588e-07, "loss": 0.0082, "step": 7892 }, { "epoch": 3.5909918107370338, "grad_norm": 0.355163402856396, "learning_rate": 1.8358431173071897e-07, "loss": 0.0027, "step": 7893 }, { "epoch": 3.5914467697907186, "grad_norm": 2.4625905254965303, "learning_rate": 1.8347365574991314e-07, "loss": 0.0464, "step": 7894 }, { "epoch": 3.591901728844404, "grad_norm": 0.3473123401220791, "learning_rate": 1.833630256342108e-07, "loss": 0.003, "step": 7895 }, { "epoch": 3.5923566878980893, "grad_norm": 0.6106924920126928, "learning_rate": 1.8325242139265191e-07, "loss": 0.0077, "step": 7896 }, { "epoch": 3.592811646951774, "grad_norm": 0.549573586203816, "learning_rate": 1.8314184303427482e-07, "loss": 0.0077, "step": 7897 }, { "epoch": 3.5932666060054594, "grad_norm": 0.5547954880886693, "learning_rate": 1.8303129056811528e-07, "loss": 0.0088, "step": 7898 }, { "epoch": 3.5937215650591448, "grad_norm": 0.533573789572966, "learning_rate": 1.8292076400320744e-07, "loss": 0.0038, "step": 7899 }, { "epoch": 3.5941765241128296, "grad_norm": 0.5600686705557125, "learning_rate": 1.8281026334858284e-07, "loss": 0.009, "step": 7900 }, { "epoch": 3.594631483166515, "grad_norm": 0.7227562819716343, "learning_rate": 1.8269978861327095e-07, "loss": 0.0117, "step": 7901 }, { "epoch": 3.5950864422202002, "grad_norm": 0.7150370737414706, "learning_rate": 1.8258933980629954e-07, "loss": 0.0063, "step": 7902 }, { "epoch": 3.595541401273885, "grad_norm": 0.7210663648996667, "learning_rate": 1.8247891693669391e-07, "loss": 0.02, "step": 7903 }, { "epoch": 3.5959963603275704, "grad_norm": 0.46701242861104264, "learning_rate": 1.8236852001347724e-07, "loss": 0.0045, "step": 7904 }, { "epoch": 3.5964513193812557, "grad_norm": 2.7322788762595875, "learning_rate": 1.8225814904567056e-07, "loss": 0.0126, "step": 7905 }, { "epoch": 3.5969062784349406, "grad_norm": 0.6482380865752293, "learning_rate": 1.8214780404229318e-07, "loss": 0.0105, "step": 7906 }, { "epoch": 3.597361237488626, "grad_norm": 0.6317562393229065, "learning_rate": 1.8203748501236172e-07, "loss": 0.0147, "step": 7907 }, { "epoch": 3.597816196542311, "grad_norm": 0.7488023267044198, "learning_rate": 1.819271919648912e-07, "loss": 0.0197, "step": 7908 }, { "epoch": 3.598271155595996, "grad_norm": 0.5805688511134737, "learning_rate": 1.8181692490889415e-07, "loss": 0.0086, "step": 7909 }, { "epoch": 3.5987261146496814, "grad_norm": 0.35358492809626596, "learning_rate": 1.817066838533811e-07, "loss": 0.0049, "step": 7910 }, { "epoch": 3.5991810737033667, "grad_norm": 0.4766331897200839, "learning_rate": 1.8159646880736034e-07, "loss": 0.006, "step": 7911 }, { "epoch": 3.599636032757052, "grad_norm": 0.4647794349016244, "learning_rate": 1.8148627977983816e-07, "loss": 0.0068, "step": 7912 }, { "epoch": 3.600090991810737, "grad_norm": 0.7683584494403825, "learning_rate": 1.8137611677981902e-07, "loss": 0.018, "step": 7913 }, { "epoch": 3.600545950864422, "grad_norm": 0.6606852062420137, "learning_rate": 1.8126597981630472e-07, "loss": 0.0065, "step": 7914 }, { "epoch": 3.6010009099181075, "grad_norm": 0.736745661998197, "learning_rate": 1.8115586889829515e-07, "loss": 0.0065, "step": 7915 }, { "epoch": 3.6014558689717924, "grad_norm": 0.5599016378637072, "learning_rate": 1.8104578403478794e-07, "loss": 0.0109, "step": 7916 }, { "epoch": 3.6019108280254777, "grad_norm": 0.6489062034379478, "learning_rate": 1.80935725234779e-07, "loss": 0.01, "step": 7917 }, { "epoch": 3.602365787079163, "grad_norm": 0.985708033794782, "learning_rate": 1.8082569250726175e-07, "loss": 0.0094, "step": 7918 }, { "epoch": 3.6028207461328483, "grad_norm": 0.6428004682719552, "learning_rate": 1.8071568586122732e-07, "loss": 0.0078, "step": 7919 }, { "epoch": 3.603275705186533, "grad_norm": 0.7457730442774165, "learning_rate": 1.8060570530566538e-07, "loss": 0.0131, "step": 7920 }, { "epoch": 3.6037306642402185, "grad_norm": 1.0605007778593851, "learning_rate": 1.8049575084956263e-07, "loss": 0.012, "step": 7921 }, { "epoch": 3.604185623293904, "grad_norm": 1.1211733056156374, "learning_rate": 1.8038582250190442e-07, "loss": 0.012, "step": 7922 }, { "epoch": 3.6046405823475887, "grad_norm": 0.8627805791842259, "learning_rate": 1.8027592027167348e-07, "loss": 0.0113, "step": 7923 }, { "epoch": 3.605095541401274, "grad_norm": 0.9429132444688817, "learning_rate": 1.801660441678504e-07, "loss": 0.0131, "step": 7924 }, { "epoch": 3.6055505004549593, "grad_norm": 0.44591907425459604, "learning_rate": 1.800561941994137e-07, "loss": 0.0049, "step": 7925 }, { "epoch": 3.606005459508644, "grad_norm": 0.747739995918312, "learning_rate": 1.7994637037534e-07, "loss": 0.0176, "step": 7926 }, { "epoch": 3.6064604185623295, "grad_norm": 0.433383421775959, "learning_rate": 1.7983657270460368e-07, "loss": 0.009, "step": 7927 }, { "epoch": 3.6069153776160148, "grad_norm": 0.964167719783542, "learning_rate": 1.7972680119617677e-07, "loss": 0.0115, "step": 7928 }, { "epoch": 3.6073703366696996, "grad_norm": 1.382741811850466, "learning_rate": 1.7961705585902942e-07, "loss": 0.0185, "step": 7929 }, { "epoch": 3.607825295723385, "grad_norm": 0.8564460801200346, "learning_rate": 1.795073367021292e-07, "loss": 0.0122, "step": 7930 }, { "epoch": 3.6082802547770703, "grad_norm": 0.5354523337759727, "learning_rate": 1.793976437344422e-07, "loss": 0.0064, "step": 7931 }, { "epoch": 3.608735213830755, "grad_norm": 1.092908717300996, "learning_rate": 1.7928797696493202e-07, "loss": 0.0208, "step": 7932 }, { "epoch": 3.6091901728844404, "grad_norm": 0.5948385546381642, "learning_rate": 1.7917833640255987e-07, "loss": 0.0045, "step": 7933 }, { "epoch": 3.6096451319381258, "grad_norm": 0.7556929133579945, "learning_rate": 1.7906872205628536e-07, "loss": 0.0091, "step": 7934 }, { "epoch": 3.6101000909918106, "grad_norm": 0.4665826391681589, "learning_rate": 1.7895913393506547e-07, "loss": 0.006, "step": 7935 }, { "epoch": 3.610555050045496, "grad_norm": 0.9783242219958285, "learning_rate": 1.7884957204785546e-07, "loss": 0.0095, "step": 7936 }, { "epoch": 3.6110100090991812, "grad_norm": 0.6374230549675531, "learning_rate": 1.7874003640360812e-07, "loss": 0.0129, "step": 7937 }, { "epoch": 3.611464968152866, "grad_norm": 0.8370975221933229, "learning_rate": 1.7863052701127423e-07, "loss": 0.0125, "step": 7938 }, { "epoch": 3.6119199272065514, "grad_norm": 0.7271419598270958, "learning_rate": 1.785210438798024e-07, "loss": 0.0073, "step": 7939 }, { "epoch": 3.6123748862602367, "grad_norm": 0.5285235495995095, "learning_rate": 1.784115870181387e-07, "loss": 0.0047, "step": 7940 }, { "epoch": 3.6128298453139216, "grad_norm": 1.0473144627019717, "learning_rate": 1.7830215643522817e-07, "loss": 0.0187, "step": 7941 }, { "epoch": 3.613284804367607, "grad_norm": 0.42759799958289146, "learning_rate": 1.7819275214001263e-07, "loss": 0.0055, "step": 7942 }, { "epoch": 3.613739763421292, "grad_norm": 0.702544030685759, "learning_rate": 1.7808337414143216e-07, "loss": 0.0063, "step": 7943 }, { "epoch": 3.614194722474977, "grad_norm": 0.8440360644081051, "learning_rate": 1.7797402244842457e-07, "loss": 0.0184, "step": 7944 }, { "epoch": 3.6146496815286624, "grad_norm": 0.42334638947283754, "learning_rate": 1.778646970699254e-07, "loss": 0.0053, "step": 7945 }, { "epoch": 3.6151046405823477, "grad_norm": 0.7006134735778677, "learning_rate": 1.7775539801486866e-07, "loss": 0.0179, "step": 7946 }, { "epoch": 3.6155595996360326, "grad_norm": 0.8139327359674772, "learning_rate": 1.7764612529218537e-07, "loss": 0.0215, "step": 7947 }, { "epoch": 3.616014558689718, "grad_norm": 0.5937308732613404, "learning_rate": 1.7753687891080515e-07, "loss": 0.0109, "step": 7948 }, { "epoch": 3.616469517743403, "grad_norm": 2.0414557190183005, "learning_rate": 1.77427658879655e-07, "loss": 0.0112, "step": 7949 }, { "epoch": 3.616924476797088, "grad_norm": 0.6159199853860781, "learning_rate": 1.773184652076596e-07, "loss": 0.0081, "step": 7950 }, { "epoch": 3.6173794358507734, "grad_norm": 0.7487827479559398, "learning_rate": 1.7720929790374223e-07, "loss": 0.0058, "step": 7951 }, { "epoch": 3.6178343949044587, "grad_norm": 0.494650831486557, "learning_rate": 1.7710015697682328e-07, "loss": 0.0069, "step": 7952 }, { "epoch": 3.6182893539581436, "grad_norm": 0.7917474749524457, "learning_rate": 1.7699104243582131e-07, "loss": 0.0138, "step": 7953 }, { "epoch": 3.618744313011829, "grad_norm": 0.4833800261736221, "learning_rate": 1.7688195428965247e-07, "loss": 0.0059, "step": 7954 }, { "epoch": 3.619199272065514, "grad_norm": 1.0157831795066095, "learning_rate": 1.7677289254723122e-07, "loss": 0.0158, "step": 7955 }, { "epoch": 3.619654231119199, "grad_norm": 0.8168469794298758, "learning_rate": 1.7666385721746958e-07, "loss": 0.0149, "step": 7956 }, { "epoch": 3.6201091901728844, "grad_norm": 0.7028739443641462, "learning_rate": 1.7655484830927742e-07, "loss": 0.0141, "step": 7957 }, { "epoch": 3.6205641492265697, "grad_norm": 0.7231695128680977, "learning_rate": 1.7644586583156234e-07, "loss": 0.0164, "step": 7958 }, { "epoch": 3.6210191082802545, "grad_norm": 0.47655534161389873, "learning_rate": 1.7633690979322985e-07, "loss": 0.005, "step": 7959 }, { "epoch": 3.62147406733394, "grad_norm": 0.6886641743733539, "learning_rate": 1.7622798020318353e-07, "loss": 0.0098, "step": 7960 }, { "epoch": 3.621929026387625, "grad_norm": 0.4881589252505408, "learning_rate": 1.761190770703244e-07, "loss": 0.0047, "step": 7961 }, { "epoch": 3.62238398544131, "grad_norm": 0.5956892764437361, "learning_rate": 1.760102004035518e-07, "loss": 0.01, "step": 7962 }, { "epoch": 3.6228389444949953, "grad_norm": 0.6675717501154477, "learning_rate": 1.7590135021176256e-07, "loss": 0.0087, "step": 7963 }, { "epoch": 3.6232939035486806, "grad_norm": 0.6231792062529405, "learning_rate": 1.7579252650385114e-07, "loss": 0.004, "step": 7964 }, { "epoch": 3.623748862602366, "grad_norm": 0.9085250230300949, "learning_rate": 1.7568372928871051e-07, "loss": 0.0097, "step": 7965 }, { "epoch": 3.624203821656051, "grad_norm": 0.5619264194338985, "learning_rate": 1.7557495857523097e-07, "loss": 0.0066, "step": 7966 }, { "epoch": 3.624658780709736, "grad_norm": 0.5591787121795929, "learning_rate": 1.7546621437230068e-07, "loss": 0.0081, "step": 7967 }, { "epoch": 3.6251137397634214, "grad_norm": 0.616741631302135, "learning_rate": 1.7535749668880563e-07, "loss": 0.0128, "step": 7968 }, { "epoch": 3.6255686988171063, "grad_norm": 0.9755978572620371, "learning_rate": 1.7524880553362987e-07, "loss": 0.0188, "step": 7969 }, { "epoch": 3.6260236578707916, "grad_norm": 0.4415888610085137, "learning_rate": 1.7514014091565532e-07, "loss": 0.0047, "step": 7970 }, { "epoch": 3.626478616924477, "grad_norm": 0.9411497551578728, "learning_rate": 1.7503150284376138e-07, "loss": 0.0128, "step": 7971 }, { "epoch": 3.6269335759781622, "grad_norm": 0.555654586370021, "learning_rate": 1.7492289132682553e-07, "loss": 0.0148, "step": 7972 }, { "epoch": 3.627388535031847, "grad_norm": 0.935687718825943, "learning_rate": 1.7481430637372297e-07, "loss": 0.0137, "step": 7973 }, { "epoch": 3.6278434940855324, "grad_norm": 0.5024896227594986, "learning_rate": 1.7470574799332653e-07, "loss": 0.0074, "step": 7974 }, { "epoch": 3.6282984531392177, "grad_norm": 0.9783745851159453, "learning_rate": 1.745972161945074e-07, "loss": 0.0128, "step": 7975 }, { "epoch": 3.6287534121929026, "grad_norm": 0.6974673022441852, "learning_rate": 1.7448871098613445e-07, "loss": 0.0047, "step": 7976 }, { "epoch": 3.629208371246588, "grad_norm": 0.735887549890809, "learning_rate": 1.74380232377074e-07, "loss": 0.0171, "step": 7977 }, { "epoch": 3.629663330300273, "grad_norm": 0.7026052365268379, "learning_rate": 1.7427178037619045e-07, "loss": 0.0104, "step": 7978 }, { "epoch": 3.630118289353958, "grad_norm": 0.5993468641183821, "learning_rate": 1.741633549923459e-07, "loss": 0.005, "step": 7979 }, { "epoch": 3.6305732484076434, "grad_norm": 0.7150661037334742, "learning_rate": 1.740549562344007e-07, "loss": 0.0169, "step": 7980 }, { "epoch": 3.6310282074613287, "grad_norm": 0.5586931095729906, "learning_rate": 1.739465841112125e-07, "loss": 0.0116, "step": 7981 }, { "epoch": 3.6314831665150136, "grad_norm": 1.3755474742273275, "learning_rate": 1.7383823863163682e-07, "loss": 0.0248, "step": 7982 }, { "epoch": 3.631938125568699, "grad_norm": 0.6812454967185382, "learning_rate": 1.7372991980452752e-07, "loss": 0.0072, "step": 7983 }, { "epoch": 3.632393084622384, "grad_norm": 0.6866971354398688, "learning_rate": 1.7362162763873557e-07, "loss": 0.0098, "step": 7984 }, { "epoch": 3.632848043676069, "grad_norm": 0.730200611201292, "learning_rate": 1.7351336214311052e-07, "loss": 0.0087, "step": 7985 }, { "epoch": 3.6333030027297544, "grad_norm": 0.6498222254039752, "learning_rate": 1.7340512332649905e-07, "loss": 0.0075, "step": 7986 }, { "epoch": 3.6337579617834397, "grad_norm": 0.7373077953337623, "learning_rate": 1.7329691119774602e-07, "loss": 0.0084, "step": 7987 }, { "epoch": 3.6342129208371245, "grad_norm": 0.6206091952620154, "learning_rate": 1.7318872576569392e-07, "loss": 0.0038, "step": 7988 }, { "epoch": 3.63466787989081, "grad_norm": 0.6047137739325499, "learning_rate": 1.7308056703918322e-07, "loss": 0.0038, "step": 7989 }, { "epoch": 3.635122838944495, "grad_norm": 0.6732124680265776, "learning_rate": 1.7297243502705244e-07, "loss": 0.0065, "step": 7990 }, { "epoch": 3.63557779799818, "grad_norm": 0.6511319509728833, "learning_rate": 1.7286432973813742e-07, "loss": 0.007, "step": 7991 }, { "epoch": 3.6360327570518653, "grad_norm": 1.9421085914216845, "learning_rate": 1.72756251181272e-07, "loss": 0.0088, "step": 7992 }, { "epoch": 3.6364877161055507, "grad_norm": 0.9382398308013732, "learning_rate": 1.7264819936528778e-07, "loss": 0.0157, "step": 7993 }, { "epoch": 3.6369426751592355, "grad_norm": 0.7105329173908382, "learning_rate": 1.7254017429901457e-07, "loss": 0.015, "step": 7994 }, { "epoch": 3.637397634212921, "grad_norm": 0.5401754069682586, "learning_rate": 1.7243217599127953e-07, "loss": 0.009, "step": 7995 }, { "epoch": 3.637852593266606, "grad_norm": 0.9429728248070433, "learning_rate": 1.7232420445090761e-07, "loss": 0.0075, "step": 7996 }, { "epoch": 3.638307552320291, "grad_norm": 0.5138485422034239, "learning_rate": 1.722162596867221e-07, "loss": 0.0087, "step": 7997 }, { "epoch": 3.6387625113739763, "grad_norm": 0.5922682646200774, "learning_rate": 1.721083417075434e-07, "loss": 0.011, "step": 7998 }, { "epoch": 3.6392174704276616, "grad_norm": 0.7873267881241011, "learning_rate": 1.720004505221904e-07, "loss": 0.0094, "step": 7999 }, { "epoch": 3.6396724294813465, "grad_norm": 0.6765450629208603, "learning_rate": 1.7189258613947943e-07, "loss": 0.0139, "step": 8000 }, { "epoch": 3.640127388535032, "grad_norm": 0.8073707064835404, "learning_rate": 1.7178474856822455e-07, "loss": 0.0166, "step": 8001 }, { "epoch": 3.640582347588717, "grad_norm": 0.5030410843981094, "learning_rate": 1.7167693781723768e-07, "loss": 0.0111, "step": 8002 }, { "epoch": 3.641037306642402, "grad_norm": 0.946900766041143, "learning_rate": 1.7156915389532877e-07, "loss": 0.0075, "step": 8003 }, { "epoch": 3.6414922656960873, "grad_norm": 0.7273641721541345, "learning_rate": 1.7146139681130556e-07, "loss": 0.0143, "step": 8004 }, { "epoch": 3.6419472247497726, "grad_norm": 1.0196969536394374, "learning_rate": 1.7135366657397331e-07, "loss": 0.0123, "step": 8005 }, { "epoch": 3.6424021838034575, "grad_norm": 0.9948884710752443, "learning_rate": 1.712459631921353e-07, "loss": 0.0134, "step": 8006 }, { "epoch": 3.642857142857143, "grad_norm": 0.7973196654306182, "learning_rate": 1.711382866745924e-07, "loss": 0.0246, "step": 8007 }, { "epoch": 3.643312101910828, "grad_norm": 0.6401808258456475, "learning_rate": 1.710306370301437e-07, "loss": 0.0103, "step": 8008 }, { "epoch": 3.643767060964513, "grad_norm": 0.8312006932061361, "learning_rate": 1.7092301426758575e-07, "loss": 0.0051, "step": 8009 }, { "epoch": 3.6442220200181983, "grad_norm": 0.5225883267737952, "learning_rate": 1.7081541839571285e-07, "loss": 0.0114, "step": 8010 }, { "epoch": 3.6446769790718836, "grad_norm": 1.1191888987039342, "learning_rate": 1.707078494233175e-07, "loss": 0.0179, "step": 8011 }, { "epoch": 3.6451319381255685, "grad_norm": 0.6854180197947004, "learning_rate": 1.7060030735918962e-07, "loss": 0.0079, "step": 8012 }, { "epoch": 3.6455868971792538, "grad_norm": 0.5910819596925793, "learning_rate": 1.7049279221211693e-07, "loss": 0.0095, "step": 8013 }, { "epoch": 3.646041856232939, "grad_norm": 1.353718553980273, "learning_rate": 1.7038530399088536e-07, "loss": 0.0114, "step": 8014 }, { "epoch": 3.646496815286624, "grad_norm": 0.4553352362795296, "learning_rate": 1.702778427042782e-07, "loss": 0.0081, "step": 8015 }, { "epoch": 3.6469517743403093, "grad_norm": 0.39585418468494116, "learning_rate": 1.7017040836107677e-07, "loss": 0.0028, "step": 8016 }, { "epoch": 3.6474067333939946, "grad_norm": 0.8460236432558875, "learning_rate": 1.7006300097005988e-07, "loss": 0.0109, "step": 8017 }, { "epoch": 3.6478616924476794, "grad_norm": 1.0441564703299844, "learning_rate": 1.6995562054000456e-07, "loss": 0.016, "step": 8018 }, { "epoch": 3.6483166515013647, "grad_norm": 0.8726971086857631, "learning_rate": 1.6984826707968565e-07, "loss": 0.0117, "step": 8019 }, { "epoch": 3.64877161055505, "grad_norm": 3.3503192785147258, "learning_rate": 1.697409405978754e-07, "loss": 0.0401, "step": 8020 }, { "epoch": 3.6492265696087354, "grad_norm": 0.5634882456448022, "learning_rate": 1.6963364110334404e-07, "loss": 0.0068, "step": 8021 }, { "epoch": 3.6496815286624202, "grad_norm": 0.3767387253728018, "learning_rate": 1.6952636860485942e-07, "loss": 0.0063, "step": 8022 }, { "epoch": 3.6501364877161055, "grad_norm": 0.6001246242631723, "learning_rate": 1.6941912311118778e-07, "loss": 0.0123, "step": 8023 }, { "epoch": 3.650591446769791, "grad_norm": 0.6811586140753461, "learning_rate": 1.693119046310923e-07, "loss": 0.0215, "step": 8024 }, { "epoch": 3.6510464058234757, "grad_norm": 0.5653139662140445, "learning_rate": 1.6920471317333473e-07, "loss": 0.0107, "step": 8025 }, { "epoch": 3.651501364877161, "grad_norm": 0.9518721821985818, "learning_rate": 1.690975487466742e-07, "loss": 0.0169, "step": 8026 }, { "epoch": 3.6519563239308463, "grad_norm": 0.45112807295186147, "learning_rate": 1.6899041135986747e-07, "loss": 0.0055, "step": 8027 }, { "epoch": 3.6524112829845317, "grad_norm": 0.9120608588141037, "learning_rate": 1.6888330102166964e-07, "loss": 0.0167, "step": 8028 }, { "epoch": 3.6528662420382165, "grad_norm": 1.009062496378989, "learning_rate": 1.6877621774083317e-07, "loss": 0.011, "step": 8029 }, { "epoch": 3.653321201091902, "grad_norm": 0.9100683224818771, "learning_rate": 1.6866916152610832e-07, "loss": 0.0132, "step": 8030 }, { "epoch": 3.653776160145587, "grad_norm": 1.0996503251048821, "learning_rate": 1.6856213238624324e-07, "loss": 0.0147, "step": 8031 }, { "epoch": 3.654231119199272, "grad_norm": 0.7485896147914093, "learning_rate": 1.6845513032998387e-07, "loss": 0.0179, "step": 8032 }, { "epoch": 3.6546860782529573, "grad_norm": 0.6469339929991664, "learning_rate": 1.6834815536607422e-07, "loss": 0.0059, "step": 8033 }, { "epoch": 3.6551410373066426, "grad_norm": 0.6396290973850493, "learning_rate": 1.682412075032556e-07, "loss": 0.0081, "step": 8034 }, { "epoch": 3.6555959963603275, "grad_norm": 0.94359851924553, "learning_rate": 1.6813428675026726e-07, "loss": 0.0178, "step": 8035 }, { "epoch": 3.656050955414013, "grad_norm": 0.5500248708234142, "learning_rate": 1.6802739311584612e-07, "loss": 0.0099, "step": 8036 }, { "epoch": 3.656505914467698, "grad_norm": 0.5646171506938185, "learning_rate": 1.6792052660872747e-07, "loss": 0.007, "step": 8037 }, { "epoch": 3.656960873521383, "grad_norm": 0.5391995097906385, "learning_rate": 1.678136872376435e-07, "loss": 0.0063, "step": 8038 }, { "epoch": 3.6574158325750683, "grad_norm": 0.6477630948976262, "learning_rate": 1.6770687501132507e-07, "loss": 0.0087, "step": 8039 }, { "epoch": 3.6578707916287536, "grad_norm": 0.6004794953067465, "learning_rate": 1.6760008993850022e-07, "loss": 0.0125, "step": 8040 }, { "epoch": 3.6583257506824385, "grad_norm": 0.8118559950146447, "learning_rate": 1.6749333202789472e-07, "loss": 0.0173, "step": 8041 }, { "epoch": 3.658780709736124, "grad_norm": 0.5710049157749548, "learning_rate": 1.6738660128823268e-07, "loss": 0.0046, "step": 8042 }, { "epoch": 3.659235668789809, "grad_norm": 0.5737420164415374, "learning_rate": 1.6727989772823554e-07, "loss": 0.011, "step": 8043 }, { "epoch": 3.659690627843494, "grad_norm": 1.7900765140770458, "learning_rate": 1.671732213566226e-07, "loss": 0.0508, "step": 8044 }, { "epoch": 3.6601455868971793, "grad_norm": 0.43116116599808824, "learning_rate": 1.6706657218211085e-07, "loss": 0.0057, "step": 8045 }, { "epoch": 3.6606005459508646, "grad_norm": 1.0312880706136387, "learning_rate": 1.6695995021341526e-07, "loss": 0.0194, "step": 8046 }, { "epoch": 3.6610555050045495, "grad_norm": 0.5532468680797358, "learning_rate": 1.6685335545924873e-07, "loss": 0.0094, "step": 8047 }, { "epoch": 3.6615104640582348, "grad_norm": 0.480106376850555, "learning_rate": 1.6674678792832148e-07, "loss": 0.005, "step": 8048 }, { "epoch": 3.66196542311192, "grad_norm": 0.4585261671272529, "learning_rate": 1.666402476293418e-07, "loss": 0.0063, "step": 8049 }, { "epoch": 3.662420382165605, "grad_norm": 0.6728832697768613, "learning_rate": 1.665337345710156e-07, "loss": 0.0143, "step": 8050 }, { "epoch": 3.6628753412192903, "grad_norm": 0.32234998423748895, "learning_rate": 1.6642724876204657e-07, "loss": 0.0032, "step": 8051 }, { "epoch": 3.6633303002729756, "grad_norm": 0.6062585896445438, "learning_rate": 1.6632079021113639e-07, "loss": 0.0105, "step": 8052 }, { "epoch": 3.6637852593266604, "grad_norm": 0.48452787030552785, "learning_rate": 1.6621435892698448e-07, "loss": 0.0064, "step": 8053 }, { "epoch": 3.6642402183803457, "grad_norm": 0.4905332675052506, "learning_rate": 1.6610795491828778e-07, "loss": 0.006, "step": 8054 }, { "epoch": 3.664695177434031, "grad_norm": 0.42520572446313126, "learning_rate": 1.6600157819374117e-07, "loss": 0.0035, "step": 8055 }, { "epoch": 3.665150136487716, "grad_norm": 0.4736870687888423, "learning_rate": 1.6589522876203716e-07, "loss": 0.0064, "step": 8056 }, { "epoch": 3.6656050955414012, "grad_norm": 0.5575736980342748, "learning_rate": 1.6578890663186634e-07, "loss": 0.0097, "step": 8057 }, { "epoch": 3.6660600545950865, "grad_norm": 0.5668572378764499, "learning_rate": 1.6568261181191685e-07, "loss": 0.0058, "step": 8058 }, { "epoch": 3.6665150136487714, "grad_norm": 0.7107741246722784, "learning_rate": 1.655763443108743e-07, "loss": 0.0068, "step": 8059 }, { "epoch": 3.6669699727024567, "grad_norm": 0.7047145536289433, "learning_rate": 1.654701041374229e-07, "loss": 0.0173, "step": 8060 }, { "epoch": 3.667424931756142, "grad_norm": 0.40432890512615954, "learning_rate": 1.653638913002437e-07, "loss": 0.0035, "step": 8061 }, { "epoch": 3.667879890809827, "grad_norm": 1.090127969738009, "learning_rate": 1.6525770580801623e-07, "loss": 0.0171, "step": 8062 }, { "epoch": 3.668334849863512, "grad_norm": 0.7925770328769776, "learning_rate": 1.6515154766941736e-07, "loss": 0.0116, "step": 8063 }, { "epoch": 3.6687898089171975, "grad_norm": 0.4031161366025356, "learning_rate": 1.6504541689312184e-07, "loss": 0.0063, "step": 8064 }, { "epoch": 3.6692447679708824, "grad_norm": 0.7498046677637622, "learning_rate": 1.649393134878021e-07, "loss": 0.0087, "step": 8065 }, { "epoch": 3.6696997270245677, "grad_norm": 0.44105038907395944, "learning_rate": 1.6483323746212851e-07, "loss": 0.012, "step": 8066 }, { "epoch": 3.670154686078253, "grad_norm": 0.8110339193155363, "learning_rate": 1.6472718882476933e-07, "loss": 0.0074, "step": 8067 }, { "epoch": 3.670609645131938, "grad_norm": 0.6380695516092812, "learning_rate": 1.6462116758439016e-07, "loss": 0.0071, "step": 8068 }, { "epoch": 3.671064604185623, "grad_norm": 1.108953860035866, "learning_rate": 1.6451517374965463e-07, "loss": 0.0126, "step": 8069 }, { "epoch": 3.6715195632393085, "grad_norm": 0.7369426069744043, "learning_rate": 1.6440920732922393e-07, "loss": 0.0177, "step": 8070 }, { "epoch": 3.6719745222929934, "grad_norm": 0.6452219015972733, "learning_rate": 1.6430326833175745e-07, "loss": 0.0082, "step": 8071 }, { "epoch": 3.6724294813466787, "grad_norm": 0.5291668007647801, "learning_rate": 1.641973567659119e-07, "loss": 0.0042, "step": 8072 }, { "epoch": 3.672884440400364, "grad_norm": 0.475443827483504, "learning_rate": 1.640914726403417e-07, "loss": 0.0114, "step": 8073 }, { "epoch": 3.673339399454049, "grad_norm": 0.43387418925443144, "learning_rate": 1.6398561596369954e-07, "loss": 0.0054, "step": 8074 }, { "epoch": 3.673794358507734, "grad_norm": 0.7841825424177195, "learning_rate": 1.6387978674463527e-07, "loss": 0.0051, "step": 8075 }, { "epoch": 3.6742493175614195, "grad_norm": 0.8928551416437542, "learning_rate": 1.6377398499179711e-07, "loss": 0.0113, "step": 8076 }, { "epoch": 3.674704276615105, "grad_norm": 0.6547214214076037, "learning_rate": 1.6366821071383053e-07, "loss": 0.0064, "step": 8077 }, { "epoch": 3.6751592356687897, "grad_norm": 0.5518538117468933, "learning_rate": 1.6356246391937883e-07, "loss": 0.0053, "step": 8078 }, { "epoch": 3.675614194722475, "grad_norm": 0.8397737276698771, "learning_rate": 1.6345674461708315e-07, "loss": 0.0129, "step": 8079 }, { "epoch": 3.6760691537761603, "grad_norm": 0.7886950280346162, "learning_rate": 1.6335105281558248e-07, "loss": 0.017, "step": 8080 }, { "epoch": 3.676524112829845, "grad_norm": 0.5480002104515687, "learning_rate": 1.6324538852351362e-07, "loss": 0.0093, "step": 8081 }, { "epoch": 3.6769790718835305, "grad_norm": 1.2870030262685281, "learning_rate": 1.6313975174951083e-07, "loss": 0.011, "step": 8082 }, { "epoch": 3.6774340309372158, "grad_norm": 0.4296315962171077, "learning_rate": 1.6303414250220633e-07, "loss": 0.0061, "step": 8083 }, { "epoch": 3.677888989990901, "grad_norm": 0.7724384642423092, "learning_rate": 1.6292856079022992e-07, "loss": 0.0092, "step": 8084 }, { "epoch": 3.678343949044586, "grad_norm": 0.7088340433874419, "learning_rate": 1.6282300662220917e-07, "loss": 0.0101, "step": 8085 }, { "epoch": 3.6787989080982713, "grad_norm": 0.7381639660414672, "learning_rate": 1.627174800067698e-07, "loss": 0.0106, "step": 8086 }, { "epoch": 3.6792538671519566, "grad_norm": 0.568692773422987, "learning_rate": 1.626119809525347e-07, "loss": 0.0099, "step": 8087 }, { "epoch": 3.6797088262056414, "grad_norm": 0.6517763410899009, "learning_rate": 1.62506509468125e-07, "loss": 0.0077, "step": 8088 }, { "epoch": 3.6801637852593267, "grad_norm": 0.6299492788535351, "learning_rate": 1.6240106556215927e-07, "loss": 0.0074, "step": 8089 }, { "epoch": 3.680618744313012, "grad_norm": 0.48269800232283805, "learning_rate": 1.6229564924325368e-07, "loss": 0.0056, "step": 8090 }, { "epoch": 3.681073703366697, "grad_norm": 0.5496153827095187, "learning_rate": 1.6219026052002276e-07, "loss": 0.0047, "step": 8091 }, { "epoch": 3.6815286624203822, "grad_norm": 1.2362460616787596, "learning_rate": 1.6208489940107822e-07, "loss": 0.0469, "step": 8092 }, { "epoch": 3.6819836214740675, "grad_norm": 0.45566444923182353, "learning_rate": 1.6197956589502965e-07, "loss": 0.0051, "step": 8093 }, { "epoch": 3.6824385805277524, "grad_norm": 0.5702791134998019, "learning_rate": 1.618742600104843e-07, "loss": 0.0067, "step": 8094 }, { "epoch": 3.6828935395814377, "grad_norm": 1.0572828442456708, "learning_rate": 1.6176898175604753e-07, "loss": 0.0068, "step": 8095 }, { "epoch": 3.683348498635123, "grad_norm": 0.6167922212358754, "learning_rate": 1.6166373114032227e-07, "loss": 0.0071, "step": 8096 }, { "epoch": 3.683803457688808, "grad_norm": 0.5182385657703837, "learning_rate": 1.6155850817190898e-07, "loss": 0.0085, "step": 8097 }, { "epoch": 3.684258416742493, "grad_norm": 0.8705523427074527, "learning_rate": 1.61453312859406e-07, "loss": 0.017, "step": 8098 }, { "epoch": 3.6847133757961785, "grad_norm": 0.7660116797517583, "learning_rate": 1.6134814521140928e-07, "loss": 0.0113, "step": 8099 }, { "epoch": 3.6851683348498634, "grad_norm": 0.7408388815648802, "learning_rate": 1.6124300523651296e-07, "loss": 0.0108, "step": 8100 }, { "epoch": 3.6856232939035487, "grad_norm": 0.7357857874386857, "learning_rate": 1.6113789294330825e-07, "loss": 0.0109, "step": 8101 }, { "epoch": 3.686078252957234, "grad_norm": 0.9635132743919292, "learning_rate": 1.6103280834038485e-07, "loss": 0.0137, "step": 8102 }, { "epoch": 3.686533212010919, "grad_norm": 0.4774125883136833, "learning_rate": 1.609277514363296e-07, "loss": 0.0055, "step": 8103 }, { "epoch": 3.686988171064604, "grad_norm": 0.6732961484930099, "learning_rate": 1.6082272223972703e-07, "loss": 0.008, "step": 8104 }, { "epoch": 3.6874431301182895, "grad_norm": 0.4052954083468488, "learning_rate": 1.6071772075916012e-07, "loss": 0.005, "step": 8105 }, { "epoch": 3.6878980891719744, "grad_norm": 0.6867641982747811, "learning_rate": 1.6061274700320882e-07, "loss": 0.0112, "step": 8106 }, { "epoch": 3.6883530482256597, "grad_norm": 0.7135569999396112, "learning_rate": 1.6050780098045124e-07, "loss": 0.0074, "step": 8107 }, { "epoch": 3.688808007279345, "grad_norm": 0.7470501305247325, "learning_rate": 1.6040288269946285e-07, "loss": 0.0086, "step": 8108 }, { "epoch": 3.68926296633303, "grad_norm": 0.5390614053380092, "learning_rate": 1.6029799216881723e-07, "loss": 0.0088, "step": 8109 }, { "epoch": 3.689717925386715, "grad_norm": 1.1055608427751515, "learning_rate": 1.6019312939708584e-07, "loss": 0.0112, "step": 8110 }, { "epoch": 3.6901728844404005, "grad_norm": 0.47132819129413867, "learning_rate": 1.6008829439283734e-07, "loss": 0.0053, "step": 8111 }, { "epoch": 3.6906278434940853, "grad_norm": 0.39079201643952344, "learning_rate": 1.5998348716463833e-07, "loss": 0.0037, "step": 8112 }, { "epoch": 3.6910828025477707, "grad_norm": 0.4977751439268561, "learning_rate": 1.5987870772105318e-07, "loss": 0.0102, "step": 8113 }, { "epoch": 3.691537761601456, "grad_norm": 0.6479978170412406, "learning_rate": 1.5977395607064415e-07, "loss": 0.0074, "step": 8114 }, { "epoch": 3.691992720655141, "grad_norm": 0.714283759278265, "learning_rate": 1.5966923222197088e-07, "loss": 0.0114, "step": 8115 }, { "epoch": 3.692447679708826, "grad_norm": 1.0062968113578, "learning_rate": 1.5956453618359116e-07, "loss": 0.012, "step": 8116 }, { "epoch": 3.6929026387625115, "grad_norm": 0.5771995946774401, "learning_rate": 1.5945986796406012e-07, "loss": 0.0099, "step": 8117 }, { "epoch": 3.6933575978161963, "grad_norm": 0.752177016843637, "learning_rate": 1.5935522757193088e-07, "loss": 0.0168, "step": 8118 }, { "epoch": 3.6938125568698816, "grad_norm": 0.5590735721369194, "learning_rate": 1.5925061501575392e-07, "loss": 0.0092, "step": 8119 }, { "epoch": 3.694267515923567, "grad_norm": 0.5594378482615978, "learning_rate": 1.5914603030407802e-07, "loss": 0.0063, "step": 8120 }, { "epoch": 3.694722474977252, "grad_norm": 0.78217014885285, "learning_rate": 1.5904147344544928e-07, "loss": 0.0037, "step": 8121 }, { "epoch": 3.695177434030937, "grad_norm": 0.8761642440560914, "learning_rate": 1.5893694444841138e-07, "loss": 0.0228, "step": 8122 }, { "epoch": 3.6956323930846224, "grad_norm": 0.7669578754803927, "learning_rate": 1.588324433215063e-07, "loss": 0.0153, "step": 8123 }, { "epoch": 3.6960873521383073, "grad_norm": 0.5775367200664124, "learning_rate": 1.5872797007327315e-07, "loss": 0.0056, "step": 8124 }, { "epoch": 3.6965423111919926, "grad_norm": 0.6865952643384075, "learning_rate": 1.5862352471224923e-07, "loss": 0.0121, "step": 8125 }, { "epoch": 3.696997270245678, "grad_norm": 0.4935832367111184, "learning_rate": 1.5851910724696927e-07, "loss": 0.0077, "step": 8126 }, { "epoch": 3.697452229299363, "grad_norm": 0.7823939417215212, "learning_rate": 1.584147176859657e-07, "loss": 0.0126, "step": 8127 }, { "epoch": 3.697907188353048, "grad_norm": 0.6406598335802375, "learning_rate": 1.5831035603776866e-07, "loss": 0.0117, "step": 8128 }, { "epoch": 3.6983621474067334, "grad_norm": 0.4708838629624632, "learning_rate": 1.5820602231090628e-07, "loss": 0.0052, "step": 8129 }, { "epoch": 3.6988171064604187, "grad_norm": 0.5077382365791285, "learning_rate": 1.5810171651390442e-07, "loss": 0.0103, "step": 8130 }, { "epoch": 3.6992720655141036, "grad_norm": 0.7642593883526252, "learning_rate": 1.5799743865528624e-07, "loss": 0.009, "step": 8131 }, { "epoch": 3.699727024567789, "grad_norm": 0.5611122597571744, "learning_rate": 1.5789318874357295e-07, "loss": 0.0043, "step": 8132 }, { "epoch": 3.700181983621474, "grad_norm": 0.8400909684367256, "learning_rate": 1.5778896678728316e-07, "loss": 0.0092, "step": 8133 }, { "epoch": 3.700636942675159, "grad_norm": 0.28160298496758707, "learning_rate": 1.576847727949337e-07, "loss": 0.002, "step": 8134 }, { "epoch": 3.7010919017288444, "grad_norm": 0.6024445669008336, "learning_rate": 1.5758060677503876e-07, "loss": 0.0108, "step": 8135 }, { "epoch": 3.7015468607825297, "grad_norm": 0.517119221458705, "learning_rate": 1.5747646873611015e-07, "loss": 0.003, "step": 8136 }, { "epoch": 3.702001819836215, "grad_norm": 0.5856925091469901, "learning_rate": 1.5737235868665784e-07, "loss": 0.0183, "step": 8137 }, { "epoch": 3.7024567788899, "grad_norm": 1.0625655125819995, "learning_rate": 1.5726827663518895e-07, "loss": 0.009, "step": 8138 }, { "epoch": 3.702911737943585, "grad_norm": 0.4807253536364837, "learning_rate": 1.5716422259020883e-07, "loss": 0.0071, "step": 8139 }, { "epoch": 3.7033666969972705, "grad_norm": 0.9616486384329034, "learning_rate": 1.5706019656022023e-07, "loss": 0.0216, "step": 8140 }, { "epoch": 3.7038216560509554, "grad_norm": 0.6612056939986498, "learning_rate": 1.5695619855372367e-07, "loss": 0.0113, "step": 8141 }, { "epoch": 3.7042766151046407, "grad_norm": 0.7843512817833026, "learning_rate": 1.568522285792172e-07, "loss": 0.0129, "step": 8142 }, { "epoch": 3.704731574158326, "grad_norm": 0.6216896787769163, "learning_rate": 1.5674828664519701e-07, "loss": 0.0067, "step": 8143 }, { "epoch": 3.705186533212011, "grad_norm": 0.7262663316053447, "learning_rate": 1.5664437276015692e-07, "loss": 0.0031, "step": 8144 }, { "epoch": 3.705641492265696, "grad_norm": 0.7470649474884201, "learning_rate": 1.5654048693258803e-07, "loss": 0.0108, "step": 8145 }, { "epoch": 3.7060964513193815, "grad_norm": 1.4665460828885533, "learning_rate": 1.5643662917097955e-07, "loss": 0.0114, "step": 8146 }, { "epoch": 3.7065514103730663, "grad_norm": 0.7982212923765266, "learning_rate": 1.56332799483818e-07, "loss": 0.0124, "step": 8147 }, { "epoch": 3.7070063694267517, "grad_norm": 0.8807032418697435, "learning_rate": 1.5622899787958832e-07, "loss": 0.0071, "step": 8148 }, { "epoch": 3.707461328480437, "grad_norm": 1.0139206466028037, "learning_rate": 1.5612522436677243e-07, "loss": 0.0125, "step": 8149 }, { "epoch": 3.707916287534122, "grad_norm": 0.41514961824253815, "learning_rate": 1.5602147895385016e-07, "loss": 0.0061, "step": 8150 }, { "epoch": 3.708371246587807, "grad_norm": 0.746492240634513, "learning_rate": 1.559177616492993e-07, "loss": 0.0224, "step": 8151 }, { "epoch": 3.7088262056414925, "grad_norm": 0.4760909664122976, "learning_rate": 1.5581407246159507e-07, "loss": 0.005, "step": 8152 }, { "epoch": 3.7092811646951773, "grad_norm": 0.5339200592629875, "learning_rate": 1.557104113992106e-07, "loss": 0.0068, "step": 8153 }, { "epoch": 3.7097361237488626, "grad_norm": 0.7316769789341797, "learning_rate": 1.556067784706165e-07, "loss": 0.0132, "step": 8154 }, { "epoch": 3.710191082802548, "grad_norm": 0.6747964357503466, "learning_rate": 1.5550317368428124e-07, "loss": 0.0099, "step": 8155 }, { "epoch": 3.710646041856233, "grad_norm": 0.6036225521888057, "learning_rate": 1.5539959704867083e-07, "loss": 0.0099, "step": 8156 }, { "epoch": 3.711101000909918, "grad_norm": 0.7330959175470851, "learning_rate": 1.5529604857224903e-07, "loss": 0.0065, "step": 8157 }, { "epoch": 3.7115559599636034, "grad_norm": 0.7036875667186115, "learning_rate": 1.5519252826347745e-07, "loss": 0.0111, "step": 8158 }, { "epoch": 3.7120109190172883, "grad_norm": 0.4642667407852875, "learning_rate": 1.5508903613081552e-07, "loss": 0.0036, "step": 8159 }, { "epoch": 3.7124658780709736, "grad_norm": 0.47405409835350487, "learning_rate": 1.549855721827199e-07, "loss": 0.0071, "step": 8160 }, { "epoch": 3.712920837124659, "grad_norm": 0.7588776830264338, "learning_rate": 1.548821364276453e-07, "loss": 0.0144, "step": 8161 }, { "epoch": 3.713375796178344, "grad_norm": 0.5911970756759952, "learning_rate": 1.547787288740438e-07, "loss": 0.0138, "step": 8162 }, { "epoch": 3.713830755232029, "grad_norm": 0.5569320711767968, "learning_rate": 1.546753495303657e-07, "loss": 0.0052, "step": 8163 }, { "epoch": 3.7142857142857144, "grad_norm": 1.7696735006158757, "learning_rate": 1.5457199840505847e-07, "loss": 0.0127, "step": 8164 }, { "epoch": 3.7147406733393993, "grad_norm": 0.8523668135078739, "learning_rate": 1.5446867550656767e-07, "loss": 0.009, "step": 8165 }, { "epoch": 3.7151956323930846, "grad_norm": 0.598161546866106, "learning_rate": 1.5436538084333633e-07, "loss": 0.0075, "step": 8166 }, { "epoch": 3.71565059144677, "grad_norm": 0.721229924985878, "learning_rate": 1.542621144238051e-07, "loss": 0.0139, "step": 8167 }, { "epoch": 3.7161055505004548, "grad_norm": 1.9284237688268522, "learning_rate": 1.541588762564126e-07, "loss": 0.0142, "step": 8168 }, { "epoch": 3.71656050955414, "grad_norm": 1.1421029916608298, "learning_rate": 1.54055666349595e-07, "loss": 0.0146, "step": 8169 }, { "epoch": 3.7170154686078254, "grad_norm": 0.8654869477829499, "learning_rate": 1.5395248471178607e-07, "loss": 0.0131, "step": 8170 }, { "epoch": 3.7174704276615103, "grad_norm": 0.5193771317139192, "learning_rate": 1.5384933135141713e-07, "loss": 0.0078, "step": 8171 }, { "epoch": 3.7179253867151956, "grad_norm": 0.8216005570977775, "learning_rate": 1.537462062769177e-07, "loss": 0.0086, "step": 8172 }, { "epoch": 3.718380345768881, "grad_norm": 0.5980379816050457, "learning_rate": 1.5364310949671478e-07, "loss": 0.01, "step": 8173 }, { "epoch": 3.7188353048225657, "grad_norm": 0.7570328070039846, "learning_rate": 1.5354004101923278e-07, "loss": 0.01, "step": 8174 }, { "epoch": 3.719290263876251, "grad_norm": 0.8106445210998001, "learning_rate": 1.5343700085289402e-07, "loss": 0.0086, "step": 8175 }, { "epoch": 3.7197452229299364, "grad_norm": 2.1747808568698725, "learning_rate": 1.5333398900611839e-07, "loss": 0.0156, "step": 8176 }, { "epoch": 3.7202001819836212, "grad_norm": 0.746174615739898, "learning_rate": 1.5323100548732375e-07, "loss": 0.0207, "step": 8177 }, { "epoch": 3.7206551410373065, "grad_norm": 1.0565503590944498, "learning_rate": 1.531280503049252e-07, "loss": 0.0144, "step": 8178 }, { "epoch": 3.721110100090992, "grad_norm": 0.5232938633676375, "learning_rate": 1.530251234673361e-07, "loss": 0.0054, "step": 8179 }, { "epoch": 3.7215650591446767, "grad_norm": 0.6855272010823804, "learning_rate": 1.5292222498296698e-07, "loss": 0.009, "step": 8180 }, { "epoch": 3.722020018198362, "grad_norm": 0.4436846873226734, "learning_rate": 1.5281935486022607e-07, "loss": 0.0032, "step": 8181 }, { "epoch": 3.7224749772520473, "grad_norm": 0.9007786629316126, "learning_rate": 1.5271651310751975e-07, "loss": 0.0144, "step": 8182 }, { "epoch": 3.722929936305732, "grad_norm": 0.8262268370017988, "learning_rate": 1.526136997332517e-07, "loss": 0.0078, "step": 8183 }, { "epoch": 3.7233848953594175, "grad_norm": 0.5779189128517023, "learning_rate": 1.5251091474582335e-07, "loss": 0.0066, "step": 8184 }, { "epoch": 3.723839854413103, "grad_norm": 0.6666752430508338, "learning_rate": 1.524081581536336e-07, "loss": 0.0101, "step": 8185 }, { "epoch": 3.724294813466788, "grad_norm": 0.8574483958011917, "learning_rate": 1.523054299650795e-07, "loss": 0.0095, "step": 8186 }, { "epoch": 3.724749772520473, "grad_norm": 0.7641421477106415, "learning_rate": 1.5220273018855563e-07, "loss": 0.0071, "step": 8187 }, { "epoch": 3.7252047315741583, "grad_norm": 0.5246630246881385, "learning_rate": 1.5210005883245397e-07, "loss": 0.0042, "step": 8188 }, { "epoch": 3.7256596906278436, "grad_norm": 0.44188492578552413, "learning_rate": 1.5199741590516445e-07, "loss": 0.0037, "step": 8189 }, { "epoch": 3.7261146496815285, "grad_norm": 0.6242586002642719, "learning_rate": 1.5189480141507448e-07, "loss": 0.007, "step": 8190 }, { "epoch": 3.726569608735214, "grad_norm": 1.4689006456262386, "learning_rate": 1.5179221537056918e-07, "loss": 0.0153, "step": 8191 }, { "epoch": 3.727024567788899, "grad_norm": 1.7597557543394515, "learning_rate": 1.5168965778003156e-07, "loss": 0.0164, "step": 8192 }, { "epoch": 3.7274795268425844, "grad_norm": 0.9764472168220967, "learning_rate": 1.515871286518423e-07, "loss": 0.0081, "step": 8193 }, { "epoch": 3.7279344858962693, "grad_norm": 0.514952962015491, "learning_rate": 1.514846279943795e-07, "loss": 0.0069, "step": 8194 }, { "epoch": 3.7283894449499546, "grad_norm": 1.0266199539982057, "learning_rate": 1.5138215581601897e-07, "loss": 0.0077, "step": 8195 }, { "epoch": 3.72884440400364, "grad_norm": 0.7555121329370067, "learning_rate": 1.512797121251342e-07, "loss": 0.0097, "step": 8196 }, { "epoch": 3.729299363057325, "grad_norm": 0.37529756154090715, "learning_rate": 1.5117729693009667e-07, "loss": 0.0066, "step": 8197 }, { "epoch": 3.72975432211101, "grad_norm": 1.0243858357606175, "learning_rate": 1.510749102392752e-07, "loss": 0.0109, "step": 8198 }, { "epoch": 3.7302092811646954, "grad_norm": 0.6142769401113515, "learning_rate": 1.5097255206103616e-07, "loss": 0.0079, "step": 8199 }, { "epoch": 3.7306642402183803, "grad_norm": 0.5676139008158384, "learning_rate": 1.5087022240374414e-07, "loss": 0.0102, "step": 8200 }, { "epoch": 3.7311191992720656, "grad_norm": 0.573752242822105, "learning_rate": 1.507679212757607e-07, "loss": 0.0052, "step": 8201 }, { "epoch": 3.731574158325751, "grad_norm": 0.46429004924833445, "learning_rate": 1.5066564868544585e-07, "loss": 0.0046, "step": 8202 }, { "epoch": 3.7320291173794358, "grad_norm": 0.6984770964652254, "learning_rate": 1.505634046411565e-07, "loss": 0.0109, "step": 8203 }, { "epoch": 3.732484076433121, "grad_norm": 0.544049823043084, "learning_rate": 1.5046118915124778e-07, "loss": 0.0092, "step": 8204 }, { "epoch": 3.7329390354868064, "grad_norm": 1.2994056370716947, "learning_rate": 1.5035900222407194e-07, "loss": 0.0137, "step": 8205 }, { "epoch": 3.7333939945404913, "grad_norm": 0.6439812806096807, "learning_rate": 1.5025684386797955e-07, "loss": 0.0084, "step": 8206 }, { "epoch": 3.7338489535941766, "grad_norm": 0.9890613752259477, "learning_rate": 1.5015471409131857e-07, "loss": 0.0111, "step": 8207 }, { "epoch": 3.734303912647862, "grad_norm": 0.453092407994916, "learning_rate": 1.5005261290243443e-07, "loss": 0.0023, "step": 8208 }, { "epoch": 3.7347588717015467, "grad_norm": 0.8327324917731622, "learning_rate": 1.4995054030967047e-07, "loss": 0.0085, "step": 8209 }, { "epoch": 3.735213830755232, "grad_norm": 0.7418939573397915, "learning_rate": 1.4984849632136738e-07, "loss": 0.0149, "step": 8210 }, { "epoch": 3.7356687898089174, "grad_norm": 0.6555008872422066, "learning_rate": 1.4974648094586405e-07, "loss": 0.0056, "step": 8211 }, { "epoch": 3.7361237488626022, "grad_norm": 0.7609013053563085, "learning_rate": 1.4964449419149655e-07, "loss": 0.0135, "step": 8212 }, { "epoch": 3.7365787079162875, "grad_norm": 0.5592209046597559, "learning_rate": 1.4954253606659867e-07, "loss": 0.0095, "step": 8213 }, { "epoch": 3.737033666969973, "grad_norm": 0.7406251956130773, "learning_rate": 1.4944060657950224e-07, "loss": 0.0158, "step": 8214 }, { "epoch": 3.7374886260236577, "grad_norm": 0.7930537862647565, "learning_rate": 1.4933870573853613e-07, "loss": 0.0208, "step": 8215 }, { "epoch": 3.737943585077343, "grad_norm": 0.5492471853509031, "learning_rate": 1.492368335520276e-07, "loss": 0.0066, "step": 8216 }, { "epoch": 3.7383985441310283, "grad_norm": 0.4662493351377216, "learning_rate": 1.4913499002830105e-07, "loss": 0.0025, "step": 8217 }, { "epoch": 3.738853503184713, "grad_norm": 0.6939836046939479, "learning_rate": 1.4903317517567853e-07, "loss": 0.0085, "step": 8218 }, { "epoch": 3.7393084622383985, "grad_norm": 1.5045177906588019, "learning_rate": 1.4893138900247988e-07, "loss": 0.0434, "step": 8219 }, { "epoch": 3.739763421292084, "grad_norm": 2.1606823747595056, "learning_rate": 1.4882963151702272e-07, "loss": 0.006, "step": 8220 }, { "epoch": 3.7402183803457687, "grad_norm": 0.7652994924640267, "learning_rate": 1.4872790272762231e-07, "loss": 0.0131, "step": 8221 }, { "epoch": 3.740673339399454, "grad_norm": 0.37603614440070676, "learning_rate": 1.486262026425914e-07, "loss": 0.0017, "step": 8222 }, { "epoch": 3.7411282984531393, "grad_norm": 0.9021795920287394, "learning_rate": 1.485245312702404e-07, "loss": 0.0153, "step": 8223 }, { "epoch": 3.741583257506824, "grad_norm": 0.7336615948686028, "learning_rate": 1.484228886188773e-07, "loss": 0.0123, "step": 8224 }, { "epoch": 3.7420382165605095, "grad_norm": 0.4068968212939118, "learning_rate": 1.4832127469680822e-07, "loss": 0.0044, "step": 8225 }, { "epoch": 3.742493175614195, "grad_norm": 0.4635431839632003, "learning_rate": 1.4821968951233637e-07, "loss": 0.0024, "step": 8226 }, { "epoch": 3.7429481346678797, "grad_norm": 0.6335779537694518, "learning_rate": 1.481181330737627e-07, "loss": 0.0078, "step": 8227 }, { "epoch": 3.743403093721565, "grad_norm": 0.6080117703604248, "learning_rate": 1.480166053893863e-07, "loss": 0.0066, "step": 8228 }, { "epoch": 3.7438580527752503, "grad_norm": 1.1085322261824693, "learning_rate": 1.4791510646750337e-07, "loss": 0.0103, "step": 8229 }, { "epoch": 3.744313011828935, "grad_norm": 0.8267205961133489, "learning_rate": 1.4781363631640776e-07, "loss": 0.0101, "step": 8230 }, { "epoch": 3.7447679708826205, "grad_norm": 0.48203607229600415, "learning_rate": 1.4771219494439148e-07, "loss": 0.01, "step": 8231 }, { "epoch": 3.745222929936306, "grad_norm": 0.8301859105894113, "learning_rate": 1.4761078235974372e-07, "loss": 0.025, "step": 8232 }, { "epoch": 3.7456778889899907, "grad_norm": 0.3269253878832081, "learning_rate": 1.4750939857075146e-07, "loss": 0.0037, "step": 8233 }, { "epoch": 3.746132848043676, "grad_norm": 0.7770905219622002, "learning_rate": 1.4740804358569913e-07, "loss": 0.0161, "step": 8234 }, { "epoch": 3.7465878070973613, "grad_norm": 0.8019931559715067, "learning_rate": 1.473067174128692e-07, "loss": 0.0268, "step": 8235 }, { "epoch": 3.747042766151046, "grad_norm": 1.0245322913162709, "learning_rate": 1.4720542006054177e-07, "loss": 0.007, "step": 8236 }, { "epoch": 3.7474977252047315, "grad_norm": 0.83476168201301, "learning_rate": 1.4710415153699418e-07, "loss": 0.01, "step": 8237 }, { "epoch": 3.7479526842584168, "grad_norm": 0.47154110074604216, "learning_rate": 1.4700291185050163e-07, "loss": 0.0068, "step": 8238 }, { "epoch": 3.7484076433121016, "grad_norm": 0.7144312731337455, "learning_rate": 1.469017010093369e-07, "loss": 0.0075, "step": 8239 }, { "epoch": 3.748862602365787, "grad_norm": 0.7299812915680339, "learning_rate": 1.468005190217707e-07, "loss": 0.0164, "step": 8240 }, { "epoch": 3.7493175614194723, "grad_norm": 0.6670123078852904, "learning_rate": 1.466993658960709e-07, "loss": 0.0148, "step": 8241 }, { "epoch": 3.7497725204731576, "grad_norm": 0.8734126612550891, "learning_rate": 1.465982416405036e-07, "loss": 0.0144, "step": 8242 }, { "epoch": 3.7502274795268424, "grad_norm": 0.4526428532805336, "learning_rate": 1.4649714626333203e-07, "loss": 0.0058, "step": 8243 }, { "epoch": 3.7506824385805277, "grad_norm": 0.6708398354551024, "learning_rate": 1.4639607977281714e-07, "loss": 0.0149, "step": 8244 }, { "epoch": 3.751137397634213, "grad_norm": 0.3338658287270343, "learning_rate": 1.462950421772179e-07, "loss": 0.0026, "step": 8245 }, { "epoch": 3.7515923566878984, "grad_norm": 0.5131422261516511, "learning_rate": 1.4619403348479042e-07, "loss": 0.0052, "step": 8246 }, { "epoch": 3.7520473157415832, "grad_norm": 0.5656493418419812, "learning_rate": 1.4609305370378865e-07, "loss": 0.0115, "step": 8247 }, { "epoch": 3.7525022747952685, "grad_norm": 0.7903098195496321, "learning_rate": 1.459921028424645e-07, "loss": 0.0187, "step": 8248 }, { "epoch": 3.752957233848954, "grad_norm": 0.4627894341272578, "learning_rate": 1.4589118090906682e-07, "loss": 0.0043, "step": 8249 }, { "epoch": 3.7534121929026387, "grad_norm": 0.6221542786907271, "learning_rate": 1.4579028791184285e-07, "loss": 0.0071, "step": 8250 }, { "epoch": 3.753867151956324, "grad_norm": 0.7476117224426787, "learning_rate": 1.4568942385903694e-07, "loss": 0.0112, "step": 8251 }, { "epoch": 3.7543221110100093, "grad_norm": 0.5061426264142888, "learning_rate": 1.455885887588913e-07, "loss": 0.0071, "step": 8252 }, { "epoch": 3.754777070063694, "grad_norm": 0.6728781314599096, "learning_rate": 1.454877826196455e-07, "loss": 0.0092, "step": 8253 }, { "epoch": 3.7552320291173795, "grad_norm": 0.7536720284958476, "learning_rate": 1.4538700544953714e-07, "loss": 0.0042, "step": 8254 }, { "epoch": 3.755686988171065, "grad_norm": 0.8700506253799086, "learning_rate": 1.4528625725680144e-07, "loss": 0.0093, "step": 8255 }, { "epoch": 3.7561419472247497, "grad_norm": 0.9908748330897141, "learning_rate": 1.4518553804967092e-07, "loss": 0.0107, "step": 8256 }, { "epoch": 3.756596906278435, "grad_norm": 0.4871975147065399, "learning_rate": 1.4508484783637588e-07, "loss": 0.0056, "step": 8257 }, { "epoch": 3.7570518653321203, "grad_norm": 1.4654761117023507, "learning_rate": 1.4498418662514418e-07, "loss": 0.0128, "step": 8258 }, { "epoch": 3.757506824385805, "grad_norm": 0.7169008317506808, "learning_rate": 1.4488355442420163e-07, "loss": 0.0063, "step": 8259 }, { "epoch": 3.7579617834394905, "grad_norm": 0.5516238445787786, "learning_rate": 1.447829512417713e-07, "loss": 0.0135, "step": 8260 }, { "epoch": 3.758416742493176, "grad_norm": 0.7234082044667427, "learning_rate": 1.4468237708607394e-07, "loss": 0.0168, "step": 8261 }, { "epoch": 3.7588717015468607, "grad_norm": 0.6722373473148902, "learning_rate": 1.445818319653283e-07, "loss": 0.0074, "step": 8262 }, { "epoch": 3.759326660600546, "grad_norm": 0.8977140903760009, "learning_rate": 1.4448131588775026e-07, "loss": 0.0074, "step": 8263 }, { "epoch": 3.7597816196542313, "grad_norm": 0.5332975931992513, "learning_rate": 1.4438082886155346e-07, "loss": 0.007, "step": 8264 }, { "epoch": 3.760236578707916, "grad_norm": 0.8020563785444736, "learning_rate": 1.4428037089494943e-07, "loss": 0.0112, "step": 8265 }, { "epoch": 3.7606915377616015, "grad_norm": 0.7685407983213531, "learning_rate": 1.4417994199614712e-07, "loss": 0.0162, "step": 8266 }, { "epoch": 3.761146496815287, "grad_norm": 0.6847603868231997, "learning_rate": 1.440795421733531e-07, "loss": 0.01, "step": 8267 }, { "epoch": 3.7616014558689717, "grad_norm": 0.7567815102267442, "learning_rate": 1.4397917143477146e-07, "loss": 0.0075, "step": 8268 }, { "epoch": 3.762056414922657, "grad_norm": 0.7196611900130648, "learning_rate": 1.4387882978860411e-07, "loss": 0.0056, "step": 8269 }, { "epoch": 3.7625113739763423, "grad_norm": 1.3319179565004662, "learning_rate": 1.4377851724305068e-07, "loss": 0.0108, "step": 8270 }, { "epoch": 3.762966333030027, "grad_norm": 0.760274776745291, "learning_rate": 1.4367823380630818e-07, "loss": 0.0075, "step": 8271 }, { "epoch": 3.7634212920837125, "grad_norm": 1.239981490623871, "learning_rate": 1.4357797948657124e-07, "loss": 0.0111, "step": 8272 }, { "epoch": 3.7638762511373978, "grad_norm": 0.7790970457458554, "learning_rate": 1.4347775429203212e-07, "loss": 0.0156, "step": 8273 }, { "epoch": 3.7643312101910826, "grad_norm": 0.4990068850183673, "learning_rate": 1.43377558230881e-07, "loss": 0.0069, "step": 8274 }, { "epoch": 3.764786169244768, "grad_norm": 0.8704178057599635, "learning_rate": 1.4327739131130518e-07, "loss": 0.0103, "step": 8275 }, { "epoch": 3.7652411282984533, "grad_norm": 1.033116278132461, "learning_rate": 1.431772535414902e-07, "loss": 0.0217, "step": 8276 }, { "epoch": 3.765696087352138, "grad_norm": 0.5799727245700309, "learning_rate": 1.4307714492961858e-07, "loss": 0.0065, "step": 8277 }, { "epoch": 3.7661510464058234, "grad_norm": 0.8258107628816915, "learning_rate": 1.4297706548387074e-07, "loss": 0.0069, "step": 8278 }, { "epoch": 3.7666060054595087, "grad_norm": 0.8186974631550499, "learning_rate": 1.428770152124249e-07, "loss": 0.0183, "step": 8279 }, { "epoch": 3.7670609645131936, "grad_norm": 0.8767915558458814, "learning_rate": 1.4277699412345668e-07, "loss": 0.0115, "step": 8280 }, { "epoch": 3.767515923566879, "grad_norm": 0.45011303410976755, "learning_rate": 1.4267700222513927e-07, "loss": 0.0025, "step": 8281 }, { "epoch": 3.7679708826205642, "grad_norm": 0.9206636741422429, "learning_rate": 1.4257703952564342e-07, "loss": 0.0097, "step": 8282 }, { "epoch": 3.768425841674249, "grad_norm": 1.4968545972053524, "learning_rate": 1.4247710603313783e-07, "loss": 0.0096, "step": 8283 }, { "epoch": 3.7688808007279344, "grad_norm": 0.8142060481122254, "learning_rate": 1.4237720175578872e-07, "loss": 0.0099, "step": 8284 }, { "epoch": 3.7693357597816197, "grad_norm": 0.3827372945107534, "learning_rate": 1.4227732670175964e-07, "loss": 0.0089, "step": 8285 }, { "epoch": 3.7697907188353046, "grad_norm": 0.6886401386455074, "learning_rate": 1.42177480879212e-07, "loss": 0.0132, "step": 8286 }, { "epoch": 3.77024567788899, "grad_norm": 0.18957254007117558, "learning_rate": 1.4207766429630452e-07, "loss": 0.0007, "step": 8287 }, { "epoch": 3.770700636942675, "grad_norm": 0.4554903769445469, "learning_rate": 1.4197787696119412e-07, "loss": 0.0039, "step": 8288 }, { "epoch": 3.77115559599636, "grad_norm": 0.5899429269663569, "learning_rate": 1.4187811888203465e-07, "loss": 0.0034, "step": 8289 }, { "epoch": 3.7716105550500454, "grad_norm": 0.9635752922604464, "learning_rate": 1.4177839006697818e-07, "loss": 0.0131, "step": 8290 }, { "epoch": 3.7720655141037307, "grad_norm": 0.7255292779750303, "learning_rate": 1.4167869052417397e-07, "loss": 0.0092, "step": 8291 }, { "epoch": 3.7725204731574156, "grad_norm": 0.3724293324170391, "learning_rate": 1.4157902026176888e-07, "loss": 0.0043, "step": 8292 }, { "epoch": 3.772975432211101, "grad_norm": 0.7503474646245781, "learning_rate": 1.4147937928790776e-07, "loss": 0.0171, "step": 8293 }, { "epoch": 3.773430391264786, "grad_norm": 0.7246411542933455, "learning_rate": 1.4137976761073266e-07, "loss": 0.0056, "step": 8294 }, { "epoch": 3.7738853503184715, "grad_norm": 0.6014810256686793, "learning_rate": 1.4128018523838354e-07, "loss": 0.008, "step": 8295 }, { "epoch": 3.7743403093721564, "grad_norm": 0.5921197790888523, "learning_rate": 1.4118063217899745e-07, "loss": 0.0053, "step": 8296 }, { "epoch": 3.7747952684258417, "grad_norm": 0.6051914339341945, "learning_rate": 1.4108110844070975e-07, "loss": 0.0103, "step": 8297 }, { "epoch": 3.775250227479527, "grad_norm": 0.4271021662780045, "learning_rate": 1.4098161403165316e-07, "loss": 0.0028, "step": 8298 }, { "epoch": 3.775705186533212, "grad_norm": 0.5085457957033226, "learning_rate": 1.4088214895995775e-07, "loss": 0.0085, "step": 8299 }, { "epoch": 3.776160145586897, "grad_norm": 0.9506599078918156, "learning_rate": 1.4078271323375136e-07, "loss": 0.0105, "step": 8300 }, { "epoch": 3.7766151046405825, "grad_norm": 0.8026648385120967, "learning_rate": 1.4068330686115943e-07, "loss": 0.008, "step": 8301 }, { "epoch": 3.777070063694268, "grad_norm": 0.858187600301211, "learning_rate": 1.4058392985030487e-07, "loss": 0.0134, "step": 8302 }, { "epoch": 3.7775250227479527, "grad_norm": 0.7648841062764326, "learning_rate": 1.4048458220930843e-07, "loss": 0.0136, "step": 8303 }, { "epoch": 3.777979981801638, "grad_norm": 0.6376091905811694, "learning_rate": 1.4038526394628853e-07, "loss": 0.0131, "step": 8304 }, { "epoch": 3.7784349408553233, "grad_norm": 0.8900035381743228, "learning_rate": 1.4028597506936086e-07, "loss": 0.023, "step": 8305 }, { "epoch": 3.778889899909008, "grad_norm": 0.496955739335869, "learning_rate": 1.4018671558663885e-07, "loss": 0.0051, "step": 8306 }, { "epoch": 3.7793448589626935, "grad_norm": 0.6845465414570177, "learning_rate": 1.4008748550623338e-07, "loss": 0.0104, "step": 8307 }, { "epoch": 3.7797998180163788, "grad_norm": 0.6173756629028252, "learning_rate": 1.3998828483625341e-07, "loss": 0.0067, "step": 8308 }, { "epoch": 3.7802547770700636, "grad_norm": 0.7144017814389306, "learning_rate": 1.3988911358480505e-07, "loss": 0.0179, "step": 8309 }, { "epoch": 3.780709736123749, "grad_norm": 0.7501471885222457, "learning_rate": 1.3978997175999185e-07, "loss": 0.0118, "step": 8310 }, { "epoch": 3.7811646951774343, "grad_norm": 0.5607687977116463, "learning_rate": 1.3969085936991564e-07, "loss": 0.0092, "step": 8311 }, { "epoch": 3.781619654231119, "grad_norm": 0.5132036596725411, "learning_rate": 1.395917764226751e-07, "loss": 0.0065, "step": 8312 }, { "epoch": 3.7820746132848044, "grad_norm": 0.9894318294701199, "learning_rate": 1.394927229263672e-07, "loss": 0.0085, "step": 8313 }, { "epoch": 3.7825295723384897, "grad_norm": 0.5275294675991042, "learning_rate": 1.393936988890859e-07, "loss": 0.0067, "step": 8314 }, { "epoch": 3.7829845313921746, "grad_norm": 0.5696828087249275, "learning_rate": 1.392947043189231e-07, "loss": 0.0068, "step": 8315 }, { "epoch": 3.78343949044586, "grad_norm": 0.6351075320650468, "learning_rate": 1.3919573922396793e-07, "loss": 0.0097, "step": 8316 }, { "epoch": 3.7838944494995452, "grad_norm": 0.6607412534243677, "learning_rate": 1.390968036123076e-07, "loss": 0.0119, "step": 8317 }, { "epoch": 3.78434940855323, "grad_norm": 0.735491205821277, "learning_rate": 1.3899789749202673e-07, "loss": 0.0116, "step": 8318 }, { "epoch": 3.7848043676069154, "grad_norm": 0.47512632186871556, "learning_rate": 1.3889902087120748e-07, "loss": 0.0078, "step": 8319 }, { "epoch": 3.7852593266606007, "grad_norm": 0.5736033863848898, "learning_rate": 1.3880017375792952e-07, "loss": 0.012, "step": 8320 }, { "epoch": 3.7857142857142856, "grad_norm": 0.6067326846755087, "learning_rate": 1.3870135616027e-07, "loss": 0.0101, "step": 8321 }, { "epoch": 3.786169244767971, "grad_norm": 0.9247281282222776, "learning_rate": 1.3860256808630427e-07, "loss": 0.0082, "step": 8322 }, { "epoch": 3.786624203821656, "grad_norm": 0.46984844700998596, "learning_rate": 1.3850380954410456e-07, "loss": 0.0062, "step": 8323 }, { "epoch": 3.787079162875341, "grad_norm": 0.5050114276648372, "learning_rate": 1.3840508054174094e-07, "loss": 0.0055, "step": 8324 }, { "epoch": 3.7875341219290264, "grad_norm": 0.48644073316417596, "learning_rate": 1.3830638108728127e-07, "loss": 0.0087, "step": 8325 }, { "epoch": 3.7879890809827117, "grad_norm": 0.45677440751088966, "learning_rate": 1.3820771118879065e-07, "loss": 0.0055, "step": 8326 }, { "epoch": 3.7884440400363966, "grad_norm": 0.49333101764177756, "learning_rate": 1.3810907085433215e-07, "loss": 0.0071, "step": 8327 }, { "epoch": 3.788898999090082, "grad_norm": 1.1327851297733773, "learning_rate": 1.380104600919661e-07, "loss": 0.0121, "step": 8328 }, { "epoch": 3.789353958143767, "grad_norm": 0.7005489325788898, "learning_rate": 1.3791187890975053e-07, "loss": 0.0101, "step": 8329 }, { "epoch": 3.789808917197452, "grad_norm": 0.6907092469337418, "learning_rate": 1.3781332731574086e-07, "loss": 0.0149, "step": 8330 }, { "epoch": 3.7902638762511374, "grad_norm": 0.9418088963411256, "learning_rate": 1.377148053179905e-07, "loss": 0.0126, "step": 8331 }, { "epoch": 3.7907188353048227, "grad_norm": 0.8075936532256192, "learning_rate": 1.3761631292455033e-07, "loss": 0.0111, "step": 8332 }, { "epoch": 3.7911737943585075, "grad_norm": 0.7025715121405179, "learning_rate": 1.375178501434685e-07, "loss": 0.0189, "step": 8333 }, { "epoch": 3.791628753412193, "grad_norm": 0.4639369573455222, "learning_rate": 1.3741941698279107e-07, "loss": 0.0068, "step": 8334 }, { "epoch": 3.792083712465878, "grad_norm": 0.9481712616547706, "learning_rate": 1.3732101345056145e-07, "loss": 0.0086, "step": 8335 }, { "epoch": 3.792538671519563, "grad_norm": 0.5614447731558037, "learning_rate": 1.3722263955482066e-07, "loss": 0.0059, "step": 8336 }, { "epoch": 3.7929936305732483, "grad_norm": 0.5856735098233252, "learning_rate": 1.371242953036076e-07, "loss": 0.0156, "step": 8337 }, { "epoch": 3.7934485896269337, "grad_norm": 0.4427338086301418, "learning_rate": 1.3702598070495824e-07, "loss": 0.0025, "step": 8338 }, { "epoch": 3.7939035486806185, "grad_norm": 0.5723507484710637, "learning_rate": 1.3692769576690673e-07, "loss": 0.0073, "step": 8339 }, { "epoch": 3.794358507734304, "grad_norm": 0.640342653968437, "learning_rate": 1.3682944049748423e-07, "loss": 0.0095, "step": 8340 }, { "epoch": 3.794813466787989, "grad_norm": 0.8580227809859986, "learning_rate": 1.3673121490471972e-07, "loss": 0.0138, "step": 8341 }, { "epoch": 3.795268425841674, "grad_norm": 0.6383619615679231, "learning_rate": 1.3663301899663992e-07, "loss": 0.0101, "step": 8342 }, { "epoch": 3.7957233848953593, "grad_norm": 0.4644457626307925, "learning_rate": 1.3653485278126893e-07, "loss": 0.003, "step": 8343 }, { "epoch": 3.7961783439490446, "grad_norm": 0.8612788331417015, "learning_rate": 1.3643671626662828e-07, "loss": 0.0095, "step": 8344 }, { "epoch": 3.7966333030027295, "grad_norm": 0.8558297083377424, "learning_rate": 1.3633860946073727e-07, "loss": 0.0103, "step": 8345 }, { "epoch": 3.797088262056415, "grad_norm": 0.6232655047352884, "learning_rate": 1.3624053237161276e-07, "loss": 0.0056, "step": 8346 }, { "epoch": 3.7975432211101, "grad_norm": 0.8315870969707471, "learning_rate": 1.3614248500726938e-07, "loss": 0.0126, "step": 8347 }, { "epoch": 3.797998180163785, "grad_norm": 0.6626809565213052, "learning_rate": 1.36044467375719e-07, "loss": 0.0138, "step": 8348 }, { "epoch": 3.7984531392174703, "grad_norm": 0.6372346665104832, "learning_rate": 1.3594647948497113e-07, "loss": 0.0095, "step": 8349 }, { "epoch": 3.7989080982711556, "grad_norm": 0.7281973218695594, "learning_rate": 1.358485213430327e-07, "loss": 0.0124, "step": 8350 }, { "epoch": 3.799363057324841, "grad_norm": 0.7104247293547993, "learning_rate": 1.357505929579088e-07, "loss": 0.0126, "step": 8351 }, { "epoch": 3.799818016378526, "grad_norm": 1.3030575295536366, "learning_rate": 1.3565269433760134e-07, "loss": 0.022, "step": 8352 }, { "epoch": 3.800272975432211, "grad_norm": 0.5223422813585213, "learning_rate": 1.355548254901105e-07, "loss": 0.0063, "step": 8353 }, { "epoch": 3.8007279344858964, "grad_norm": 0.6469280153414433, "learning_rate": 1.354569864234335e-07, "loss": 0.0038, "step": 8354 }, { "epoch": 3.8011828935395813, "grad_norm": 2.484201691052607, "learning_rate": 1.353591771455651e-07, "loss": 0.0116, "step": 8355 }, { "epoch": 3.8016378525932666, "grad_norm": 0.9897692798268044, "learning_rate": 1.352613976644983e-07, "loss": 0.0068, "step": 8356 }, { "epoch": 3.802092811646952, "grad_norm": 0.6689666930127808, "learning_rate": 1.3516364798822283e-07, "loss": 0.0113, "step": 8357 }, { "epoch": 3.802547770700637, "grad_norm": 0.8135245722635508, "learning_rate": 1.350659281247265e-07, "loss": 0.0113, "step": 8358 }, { "epoch": 3.803002729754322, "grad_norm": 0.5875067208753522, "learning_rate": 1.3496823808199436e-07, "loss": 0.0071, "step": 8359 }, { "epoch": 3.8034576888080074, "grad_norm": 0.5270054621272549, "learning_rate": 1.348705778680093e-07, "loss": 0.0104, "step": 8360 }, { "epoch": 3.8039126478616927, "grad_norm": 0.9624682742550149, "learning_rate": 1.3477294749075192e-07, "loss": 0.0117, "step": 8361 }, { "epoch": 3.8043676069153776, "grad_norm": 0.9036797148254098, "learning_rate": 1.3467534695819987e-07, "loss": 0.0088, "step": 8362 }, { "epoch": 3.804822565969063, "grad_norm": 0.663175967353095, "learning_rate": 1.3457777627832866e-07, "loss": 0.02, "step": 8363 }, { "epoch": 3.805277525022748, "grad_norm": 0.5154113030009957, "learning_rate": 1.3448023545911124e-07, "loss": 0.0048, "step": 8364 }, { "epoch": 3.805732484076433, "grad_norm": 0.5355606469004093, "learning_rate": 1.3438272450851845e-07, "loss": 0.0042, "step": 8365 }, { "epoch": 3.8061874431301184, "grad_norm": 0.5264613784201647, "learning_rate": 1.3428524343451808e-07, "loss": 0.005, "step": 8366 }, { "epoch": 3.8066424021838037, "grad_norm": 0.7702704359543887, "learning_rate": 1.3418779224507633e-07, "loss": 0.0154, "step": 8367 }, { "epoch": 3.8070973612374885, "grad_norm": 0.4617968954330232, "learning_rate": 1.340903709481561e-07, "loss": 0.0048, "step": 8368 }, { "epoch": 3.807552320291174, "grad_norm": 0.7403258824915862, "learning_rate": 1.3399297955171822e-07, "loss": 0.017, "step": 8369 }, { "epoch": 3.808007279344859, "grad_norm": 0.581730425888665, "learning_rate": 1.338956180637213e-07, "loss": 0.0167, "step": 8370 }, { "epoch": 3.808462238398544, "grad_norm": 0.601492885098236, "learning_rate": 1.3379828649212122e-07, "loss": 0.0158, "step": 8371 }, { "epoch": 3.8089171974522293, "grad_norm": 1.0078563077143134, "learning_rate": 1.3370098484487135e-07, "loss": 0.0153, "step": 8372 }, { "epoch": 3.8093721565059147, "grad_norm": 0.4922231225346698, "learning_rate": 1.3360371312992268e-07, "loss": 0.0079, "step": 8373 }, { "epoch": 3.8098271155595995, "grad_norm": 1.8160216170576964, "learning_rate": 1.335064713552241e-07, "loss": 0.0216, "step": 8374 }, { "epoch": 3.810282074613285, "grad_norm": 0.772295200129282, "learning_rate": 1.3340925952872145e-07, "loss": 0.0143, "step": 8375 }, { "epoch": 3.81073703366697, "grad_norm": 0.5341314342856012, "learning_rate": 1.3331207765835872e-07, "loss": 0.0143, "step": 8376 }, { "epoch": 3.811191992720655, "grad_norm": 0.39645731424292635, "learning_rate": 1.332149257520771e-07, "loss": 0.0054, "step": 8377 }, { "epoch": 3.8116469517743403, "grad_norm": 0.9090662423911219, "learning_rate": 1.3311780381781535e-07, "loss": 0.0146, "step": 8378 }, { "epoch": 3.8121019108280256, "grad_norm": 0.5054967396786882, "learning_rate": 1.3302071186350972e-07, "loss": 0.0051, "step": 8379 }, { "epoch": 3.8125568698817105, "grad_norm": 0.6049342611710791, "learning_rate": 1.329236498970942e-07, "loss": 0.008, "step": 8380 }, { "epoch": 3.813011828935396, "grad_norm": 0.7828563965894388, "learning_rate": 1.328266179265005e-07, "loss": 0.0105, "step": 8381 }, { "epoch": 3.813466787989081, "grad_norm": 0.9695483601403131, "learning_rate": 1.3272961595965742e-07, "loss": 0.01, "step": 8382 }, { "epoch": 3.813921747042766, "grad_norm": 1.35942793683411, "learning_rate": 1.3263264400449158e-07, "loss": 0.0091, "step": 8383 }, { "epoch": 3.8143767060964513, "grad_norm": 0.8438980997097981, "learning_rate": 1.325357020689269e-07, "loss": 0.014, "step": 8384 }, { "epoch": 3.8148316651501366, "grad_norm": 1.002665046981652, "learning_rate": 1.3243879016088532e-07, "loss": 0.0105, "step": 8385 }, { "epoch": 3.8152866242038215, "grad_norm": 0.7699099890462439, "learning_rate": 1.323419082882859e-07, "loss": 0.0145, "step": 8386 }, { "epoch": 3.815741583257507, "grad_norm": 0.5833780890409155, "learning_rate": 1.3224505645904532e-07, "loss": 0.0066, "step": 8387 }, { "epoch": 3.816196542311192, "grad_norm": 1.1415216809593411, "learning_rate": 1.3214823468107805e-07, "loss": 0.007, "step": 8388 }, { "epoch": 3.816651501364877, "grad_norm": 0.5949816503918903, "learning_rate": 1.320514429622957e-07, "loss": 0.0097, "step": 8389 }, { "epoch": 3.8171064604185623, "grad_norm": 0.4320878828622485, "learning_rate": 1.3195468131060793e-07, "loss": 0.0052, "step": 8390 }, { "epoch": 3.8175614194722476, "grad_norm": 1.6503662915872621, "learning_rate": 1.3185794973392156e-07, "loss": 0.0144, "step": 8391 }, { "epoch": 3.8180163785259325, "grad_norm": 0.527262833518259, "learning_rate": 1.31761248240141e-07, "loss": 0.0087, "step": 8392 }, { "epoch": 3.8184713375796178, "grad_norm": 0.7951417925800048, "learning_rate": 1.3166457683716814e-07, "loss": 0.0153, "step": 8393 }, { "epoch": 3.818926296633303, "grad_norm": 0.8460854636489423, "learning_rate": 1.3156793553290269e-07, "loss": 0.0255, "step": 8394 }, { "epoch": 3.819381255686988, "grad_norm": 0.6817563045535701, "learning_rate": 1.3147132433524183e-07, "loss": 0.0064, "step": 8395 }, { "epoch": 3.8198362147406733, "grad_norm": 0.5638161610739818, "learning_rate": 1.313747432520801e-07, "loss": 0.0063, "step": 8396 }, { "epoch": 3.8202911737943586, "grad_norm": 0.8331017937512462, "learning_rate": 1.3127819229130964e-07, "loss": 0.0076, "step": 8397 }, { "epoch": 3.8207461328480434, "grad_norm": 0.8587229649766303, "learning_rate": 1.3118167146082003e-07, "loss": 0.0112, "step": 8398 }, { "epoch": 3.8212010919017287, "grad_norm": 0.3319135767387228, "learning_rate": 1.3108518076849884e-07, "loss": 0.0026, "step": 8399 }, { "epoch": 3.821656050955414, "grad_norm": 0.806362637156139, "learning_rate": 1.3098872022223069e-07, "loss": 0.0122, "step": 8400 }, { "epoch": 3.822111010009099, "grad_norm": 0.5777388953411373, "learning_rate": 1.308922898298977e-07, "loss": 0.0044, "step": 8401 }, { "epoch": 3.8225659690627842, "grad_norm": 0.6158330587931917, "learning_rate": 1.3079588959938004e-07, "loss": 0.0123, "step": 8402 }, { "epoch": 3.8230209281164695, "grad_norm": 0.7429549317790863, "learning_rate": 1.3069951953855485e-07, "loss": 0.0077, "step": 8403 }, { "epoch": 3.823475887170155, "grad_norm": 0.654108254240789, "learning_rate": 1.3060317965529733e-07, "loss": 0.011, "step": 8404 }, { "epoch": 3.8239308462238397, "grad_norm": 0.8630089614373715, "learning_rate": 1.305068699574798e-07, "loss": 0.0182, "step": 8405 }, { "epoch": 3.824385805277525, "grad_norm": 0.2712471382045205, "learning_rate": 1.3041059045297213e-07, "loss": 0.0021, "step": 8406 }, { "epoch": 3.8248407643312103, "grad_norm": 0.42628862278501284, "learning_rate": 1.3031434114964206e-07, "loss": 0.0021, "step": 8407 }, { "epoch": 3.825295723384895, "grad_norm": 0.6641157811399379, "learning_rate": 1.3021812205535442e-07, "loss": 0.0081, "step": 8408 }, { "epoch": 3.8257506824385805, "grad_norm": 0.5413321490108421, "learning_rate": 1.3012193317797187e-07, "loss": 0.0093, "step": 8409 }, { "epoch": 3.826205641492266, "grad_norm": 0.8843586726077454, "learning_rate": 1.3002577452535475e-07, "loss": 0.0114, "step": 8410 }, { "epoch": 3.826660600545951, "grad_norm": 0.8100927301126952, "learning_rate": 1.2992964610536056e-07, "loss": 0.0172, "step": 8411 }, { "epoch": 3.827115559599636, "grad_norm": 0.7945440523410215, "learning_rate": 1.2983354792584443e-07, "loss": 0.0164, "step": 8412 }, { "epoch": 3.8275705186533213, "grad_norm": 0.7554120939224466, "learning_rate": 1.29737479994659e-07, "loss": 0.0147, "step": 8413 }, { "epoch": 3.8280254777070066, "grad_norm": 0.6238758166666348, "learning_rate": 1.2964144231965473e-07, "loss": 0.0148, "step": 8414 }, { "epoch": 3.8284804367606915, "grad_norm": 0.4863788579696869, "learning_rate": 1.2954543490867913e-07, "loss": 0.0041, "step": 8415 }, { "epoch": 3.828935395814377, "grad_norm": 0.7728204054355972, "learning_rate": 1.2944945776957777e-07, "loss": 0.0107, "step": 8416 }, { "epoch": 3.829390354868062, "grad_norm": 1.13867908828044, "learning_rate": 1.2935351091019337e-07, "loss": 0.0135, "step": 8417 }, { "epoch": 3.829845313921747, "grad_norm": 0.7790870309781882, "learning_rate": 1.2925759433836604e-07, "loss": 0.0042, "step": 8418 }, { "epoch": 3.8303002729754323, "grad_norm": 1.2991768936769401, "learning_rate": 1.29161708061934e-07, "loss": 0.016, "step": 8419 }, { "epoch": 3.8307552320291176, "grad_norm": 0.7399664427761887, "learning_rate": 1.290658520887325e-07, "loss": 0.0104, "step": 8420 }, { "epoch": 3.8312101910828025, "grad_norm": 0.899376321911177, "learning_rate": 1.2897002642659444e-07, "loss": 0.0094, "step": 8421 }, { "epoch": 3.831665150136488, "grad_norm": 0.3979882085801281, "learning_rate": 1.2887423108335012e-07, "loss": 0.0044, "step": 8422 }, { "epoch": 3.832120109190173, "grad_norm": 0.49931227145098633, "learning_rate": 1.2877846606682763e-07, "loss": 0.0051, "step": 8423 }, { "epoch": 3.832575068243858, "grad_norm": 0.6794918090073038, "learning_rate": 1.2868273138485264e-07, "loss": 0.0059, "step": 8424 }, { "epoch": 3.8330300272975433, "grad_norm": 0.5435771261285975, "learning_rate": 1.2858702704524797e-07, "loss": 0.0062, "step": 8425 }, { "epoch": 3.8334849863512286, "grad_norm": 0.5514384808683901, "learning_rate": 1.284913530558342e-07, "loss": 0.0104, "step": 8426 }, { "epoch": 3.8339399454049135, "grad_norm": 0.9404221574169485, "learning_rate": 1.283957094244292e-07, "loss": 0.009, "step": 8427 }, { "epoch": 3.8343949044585988, "grad_norm": 0.9749800149438536, "learning_rate": 1.2830009615884873e-07, "loss": 0.0062, "step": 8428 }, { "epoch": 3.834849863512284, "grad_norm": 0.7368578855691912, "learning_rate": 1.2820451326690573e-07, "loss": 0.0122, "step": 8429 }, { "epoch": 3.835304822565969, "grad_norm": 0.5802578118638289, "learning_rate": 1.2810896075641102e-07, "loss": 0.0103, "step": 8430 }, { "epoch": 3.8357597816196543, "grad_norm": 0.9911055340346104, "learning_rate": 1.2801343863517267e-07, "loss": 0.016, "step": 8431 }, { "epoch": 3.8362147406733396, "grad_norm": 0.6627907889466113, "learning_rate": 1.27917946910996e-07, "loss": 0.0052, "step": 8432 }, { "epoch": 3.8366696997270244, "grad_norm": 0.9983125421109295, "learning_rate": 1.2782248559168457e-07, "loss": 0.0118, "step": 8433 }, { "epoch": 3.8371246587807097, "grad_norm": 1.2675177162076943, "learning_rate": 1.277270546850389e-07, "loss": 0.012, "step": 8434 }, { "epoch": 3.837579617834395, "grad_norm": 0.5045523954129478, "learning_rate": 1.2763165419885713e-07, "loss": 0.0034, "step": 8435 }, { "epoch": 3.83803457688808, "grad_norm": 1.2830680023136585, "learning_rate": 1.2753628414093486e-07, "loss": 0.0183, "step": 8436 }, { "epoch": 3.8384895359417652, "grad_norm": 0.5956013850796307, "learning_rate": 1.274409445190654e-07, "loss": 0.0141, "step": 8437 }, { "epoch": 3.8389444949954505, "grad_norm": 0.4341248555954432, "learning_rate": 1.2734563534103964e-07, "loss": 0.0099, "step": 8438 }, { "epoch": 3.8393994540491354, "grad_norm": 0.9578844895977755, "learning_rate": 1.2725035661464567e-07, "loss": 0.0075, "step": 8439 }, { "epoch": 3.8398544131028207, "grad_norm": 0.6574223151367614, "learning_rate": 1.2715510834766925e-07, "loss": 0.016, "step": 8440 }, { "epoch": 3.840309372156506, "grad_norm": 0.7575672975452237, "learning_rate": 1.2705989054789357e-07, "loss": 0.0141, "step": 8441 }, { "epoch": 3.840764331210191, "grad_norm": 0.5676557914283631, "learning_rate": 1.2696470322309938e-07, "loss": 0.0137, "step": 8442 }, { "epoch": 3.841219290263876, "grad_norm": 0.5706201131451903, "learning_rate": 1.2686954638106495e-07, "loss": 0.0118, "step": 8443 }, { "epoch": 3.8416742493175615, "grad_norm": 1.1536437954954302, "learning_rate": 1.2677442002956633e-07, "loss": 0.008, "step": 8444 }, { "epoch": 3.8421292083712464, "grad_norm": 1.0627469393439521, "learning_rate": 1.2667932417637667e-07, "loss": 0.0119, "step": 8445 }, { "epoch": 3.8425841674249317, "grad_norm": 0.7236885100121877, "learning_rate": 1.2658425882926672e-07, "loss": 0.0083, "step": 8446 }, { "epoch": 3.843039126478617, "grad_norm": 0.734586878245761, "learning_rate": 1.2648922399600466e-07, "loss": 0.0205, "step": 8447 }, { "epoch": 3.843494085532302, "grad_norm": 0.8767409972579934, "learning_rate": 1.2639421968435655e-07, "loss": 0.0109, "step": 8448 }, { "epoch": 3.843949044585987, "grad_norm": 0.5305570918900996, "learning_rate": 1.262992459020857e-07, "loss": 0.0072, "step": 8449 }, { "epoch": 3.8444040036396725, "grad_norm": 0.7612654519923754, "learning_rate": 1.2620430265695263e-07, "loss": 0.026, "step": 8450 }, { "epoch": 3.8448589626933574, "grad_norm": 0.7017660108899051, "learning_rate": 1.2610938995671605e-07, "loss": 0.0128, "step": 8451 }, { "epoch": 3.8453139217470427, "grad_norm": 0.539430867360907, "learning_rate": 1.2601450780913158e-07, "loss": 0.0084, "step": 8452 }, { "epoch": 3.845768880800728, "grad_norm": 0.6477524199303842, "learning_rate": 1.2591965622195273e-07, "loss": 0.0061, "step": 8453 }, { "epoch": 3.846223839854413, "grad_norm": 0.5525841308491705, "learning_rate": 1.2582483520293023e-07, "loss": 0.0137, "step": 8454 }, { "epoch": 3.846678798908098, "grad_norm": 0.6900758419239493, "learning_rate": 1.257300447598124e-07, "loss": 0.0103, "step": 8455 }, { "epoch": 3.8471337579617835, "grad_norm": 0.36300869829563803, "learning_rate": 1.256352849003451e-07, "loss": 0.0023, "step": 8456 }, { "epoch": 3.8475887170154683, "grad_norm": 0.6626569410745196, "learning_rate": 1.255405556322716e-07, "loss": 0.0139, "step": 8457 }, { "epoch": 3.8480436760691537, "grad_norm": 0.753919783513821, "learning_rate": 1.2544585696333304e-07, "loss": 0.0098, "step": 8458 }, { "epoch": 3.848498635122839, "grad_norm": 0.692991645059256, "learning_rate": 1.2535118890126756e-07, "loss": 0.01, "step": 8459 }, { "epoch": 3.8489535941765243, "grad_norm": 0.5314668493778936, "learning_rate": 1.2525655145381104e-07, "loss": 0.0109, "step": 8460 }, { "epoch": 3.849408553230209, "grad_norm": 1.354580279647028, "learning_rate": 1.251619446286966e-07, "loss": 0.0096, "step": 8461 }, { "epoch": 3.8498635122838945, "grad_norm": 1.0761322474211554, "learning_rate": 1.2506736843365551e-07, "loss": 0.0095, "step": 8462 }, { "epoch": 3.8503184713375798, "grad_norm": 0.9782001546412251, "learning_rate": 1.2497282287641586e-07, "loss": 0.0188, "step": 8463 }, { "epoch": 3.8507734303912646, "grad_norm": 0.4524782011843308, "learning_rate": 1.248783079647034e-07, "loss": 0.0076, "step": 8464 }, { "epoch": 3.85122838944495, "grad_norm": 0.6497903099423971, "learning_rate": 1.2478382370624168e-07, "loss": 0.0147, "step": 8465 }, { "epoch": 3.8516833484986353, "grad_norm": 0.6179845403750456, "learning_rate": 1.246893701087513e-07, "loss": 0.0095, "step": 8466 }, { "epoch": 3.8521383075523206, "grad_norm": 0.569079903510825, "learning_rate": 1.2459494717995083e-07, "loss": 0.01, "step": 8467 }, { "epoch": 3.8525932666060054, "grad_norm": 0.7429194382520179, "learning_rate": 1.24500554927556e-07, "loss": 0.0128, "step": 8468 }, { "epoch": 3.8530482256596907, "grad_norm": 0.5849737663458905, "learning_rate": 1.244061933592801e-07, "loss": 0.0113, "step": 8469 }, { "epoch": 3.853503184713376, "grad_norm": 0.6549740396887332, "learning_rate": 1.243118624828337e-07, "loss": 0.0079, "step": 8470 }, { "epoch": 3.853958143767061, "grad_norm": 0.578545327635183, "learning_rate": 1.2421756230592533e-07, "loss": 0.0081, "step": 8471 }, { "epoch": 3.8544131028207462, "grad_norm": 0.9999848371884954, "learning_rate": 1.2412329283626094e-07, "loss": 0.0137, "step": 8472 }, { "epoch": 3.8548680618744315, "grad_norm": 0.4464146582382739, "learning_rate": 1.2402905408154356e-07, "loss": 0.0072, "step": 8473 }, { "epoch": 3.8553230209281164, "grad_norm": 0.6696739627235725, "learning_rate": 1.23934846049474e-07, "loss": 0.0082, "step": 8474 }, { "epoch": 3.8557779799818017, "grad_norm": 0.6495524357993501, "learning_rate": 1.2384066874775046e-07, "loss": 0.0108, "step": 8475 }, { "epoch": 3.856232939035487, "grad_norm": 0.3716834421799543, "learning_rate": 1.2374652218406883e-07, "loss": 0.0019, "step": 8476 }, { "epoch": 3.856687898089172, "grad_norm": 0.8165146123127897, "learning_rate": 1.2365240636612228e-07, "loss": 0.0046, "step": 8477 }, { "epoch": 3.857142857142857, "grad_norm": 0.7653341808429295, "learning_rate": 1.2355832130160133e-07, "loss": 0.0068, "step": 8478 }, { "epoch": 3.8575978161965425, "grad_norm": 0.5423975248880296, "learning_rate": 1.2346426699819456e-07, "loss": 0.0053, "step": 8479 }, { "epoch": 3.8580527752502274, "grad_norm": 0.5927152494945386, "learning_rate": 1.2337024346358744e-07, "loss": 0.0159, "step": 8480 }, { "epoch": 3.8585077343039127, "grad_norm": 0.6644192539728225, "learning_rate": 1.2327625070546304e-07, "loss": 0.0056, "step": 8481 }, { "epoch": 3.858962693357598, "grad_norm": 0.5535429890127997, "learning_rate": 1.2318228873150232e-07, "loss": 0.0103, "step": 8482 }, { "epoch": 3.859417652411283, "grad_norm": 0.6389934169635302, "learning_rate": 1.230883575493833e-07, "loss": 0.0059, "step": 8483 }, { "epoch": 3.859872611464968, "grad_norm": 1.2535157510459374, "learning_rate": 1.2299445716678154e-07, "loss": 0.0156, "step": 8484 }, { "epoch": 3.8603275705186535, "grad_norm": 0.6707166429044971, "learning_rate": 1.2290058759137006e-07, "loss": 0.0034, "step": 8485 }, { "epoch": 3.8607825295723384, "grad_norm": 0.592713745680885, "learning_rate": 1.228067488308196e-07, "loss": 0.0047, "step": 8486 }, { "epoch": 3.8612374886260237, "grad_norm": 0.7516332040081983, "learning_rate": 1.2271294089279836e-07, "loss": 0.0104, "step": 8487 }, { "epoch": 3.861692447679709, "grad_norm": 0.9105973036433831, "learning_rate": 1.2261916378497183e-07, "loss": 0.0104, "step": 8488 }, { "epoch": 3.862147406733394, "grad_norm": 0.792967571474245, "learning_rate": 1.2252541751500294e-07, "loss": 0.0093, "step": 8489 }, { "epoch": 3.862602365787079, "grad_norm": 0.5412115025798007, "learning_rate": 1.2243170209055214e-07, "loss": 0.0083, "step": 8490 }, { "epoch": 3.8630573248407645, "grad_norm": 1.061893873168164, "learning_rate": 1.2233801751927776e-07, "loss": 0.0248, "step": 8491 }, { "epoch": 3.8635122838944493, "grad_norm": 0.8698987146472726, "learning_rate": 1.222443638088349e-07, "loss": 0.0194, "step": 8492 }, { "epoch": 3.8639672429481347, "grad_norm": 1.766465463548462, "learning_rate": 1.2215074096687684e-07, "loss": 0.0313, "step": 8493 }, { "epoch": 3.86442220200182, "grad_norm": 0.6101256492070709, "learning_rate": 1.2205714900105384e-07, "loss": 0.0084, "step": 8494 }, { "epoch": 3.864877161055505, "grad_norm": 0.4910277609857243, "learning_rate": 1.2196358791901378e-07, "loss": 0.0053, "step": 8495 }, { "epoch": 3.86533212010919, "grad_norm": 0.8087353887788462, "learning_rate": 1.2187005772840218e-07, "loss": 0.0113, "step": 8496 }, { "epoch": 3.8657870791628755, "grad_norm": 0.8702390085467097, "learning_rate": 1.217765584368619e-07, "loss": 0.0209, "step": 8497 }, { "epoch": 3.8662420382165603, "grad_norm": 0.8368254310298501, "learning_rate": 1.216830900520332e-07, "loss": 0.0071, "step": 8498 }, { "epoch": 3.8666969972702456, "grad_norm": 0.8419343682143438, "learning_rate": 1.215896525815538e-07, "loss": 0.0063, "step": 8499 }, { "epoch": 3.867151956323931, "grad_norm": 0.8513693518915235, "learning_rate": 1.2149624603305907e-07, "loss": 0.0112, "step": 8500 }, { "epoch": 3.867606915377616, "grad_norm": 0.9177036448779844, "learning_rate": 1.21402870414182e-07, "loss": 0.0118, "step": 8501 }, { "epoch": 3.868061874431301, "grad_norm": 0.5769938853161448, "learning_rate": 1.213095257325526e-07, "loss": 0.0057, "step": 8502 }, { "epoch": 3.8685168334849864, "grad_norm": 0.7891387651583581, "learning_rate": 1.2121621199579858e-07, "loss": 0.0164, "step": 8503 }, { "epoch": 3.8689717925386713, "grad_norm": 0.6434461765225155, "learning_rate": 1.2112292921154505e-07, "loss": 0.0104, "step": 8504 }, { "epoch": 3.8694267515923566, "grad_norm": 0.7907524515810479, "learning_rate": 1.2102967738741488e-07, "loss": 0.0239, "step": 8505 }, { "epoch": 3.869881710646042, "grad_norm": 0.9189944015911764, "learning_rate": 1.2093645653102786e-07, "loss": 0.0146, "step": 8506 }, { "epoch": 3.870336669699727, "grad_norm": 0.5743759056753619, "learning_rate": 1.20843266650002e-07, "loss": 0.0078, "step": 8507 }, { "epoch": 3.870791628753412, "grad_norm": 0.629622808695983, "learning_rate": 1.2075010775195205e-07, "loss": 0.0145, "step": 8508 }, { "epoch": 3.8712465878070974, "grad_norm": 1.1034296238629262, "learning_rate": 1.2065697984449052e-07, "loss": 0.0085, "step": 8509 }, { "epoch": 3.8717015468607823, "grad_norm": 0.6356267688531929, "learning_rate": 1.2056388293522767e-07, "loss": 0.0075, "step": 8510 }, { "epoch": 3.8721565059144676, "grad_norm": 0.9263676113573582, "learning_rate": 1.2047081703177074e-07, "loss": 0.0226, "step": 8511 }, { "epoch": 3.872611464968153, "grad_norm": 0.5340246047029109, "learning_rate": 1.2037778214172473e-07, "loss": 0.0058, "step": 8512 }, { "epoch": 3.8730664240218378, "grad_norm": 0.7589414099054134, "learning_rate": 1.2028477827269184e-07, "loss": 0.0082, "step": 8513 }, { "epoch": 3.873521383075523, "grad_norm": 0.5504336630055666, "learning_rate": 1.2019180543227215e-07, "loss": 0.0092, "step": 8514 }, { "epoch": 3.8739763421292084, "grad_norm": 0.7478935861325975, "learning_rate": 1.20098863628063e-07, "loss": 0.0176, "step": 8515 }, { "epoch": 3.8744313011828937, "grad_norm": 0.47817253838734125, "learning_rate": 1.2000595286765913e-07, "loss": 0.0028, "step": 8516 }, { "epoch": 3.8748862602365786, "grad_norm": 0.6908293610952052, "learning_rate": 1.1991307315865274e-07, "loss": 0.0157, "step": 8517 }, { "epoch": 3.875341219290264, "grad_norm": 0.6316802768629227, "learning_rate": 1.1982022450863356e-07, "loss": 0.0137, "step": 8518 }, { "epoch": 3.875796178343949, "grad_norm": 50.132029151102664, "learning_rate": 1.1972740692518856e-07, "loss": 0.0894, "step": 8519 }, { "epoch": 3.876251137397634, "grad_norm": 0.7210041768524409, "learning_rate": 1.196346204159026e-07, "loss": 0.0042, "step": 8520 }, { "epoch": 3.8767060964513194, "grad_norm": 1.0323385401711511, "learning_rate": 1.1954186498835794e-07, "loss": 0.0049, "step": 8521 }, { "epoch": 3.8771610555050047, "grad_norm": 3.3346828728792928, "learning_rate": 1.194491406501339e-07, "loss": 0.0144, "step": 8522 }, { "epoch": 3.87761601455869, "grad_norm": 0.378654134433277, "learning_rate": 1.1935644740880757e-07, "loss": 0.003, "step": 8523 }, { "epoch": 3.878070973612375, "grad_norm": 0.6756395365663339, "learning_rate": 1.1926378527195318e-07, "loss": 0.0142, "step": 8524 }, { "epoch": 3.87852593266606, "grad_norm": 0.5489063223210275, "learning_rate": 1.1917115424714303e-07, "loss": 0.0038, "step": 8525 }, { "epoch": 3.8789808917197455, "grad_norm": 0.7697791205005462, "learning_rate": 1.1907855434194635e-07, "loss": 0.0049, "step": 8526 }, { "epoch": 3.8794358507734303, "grad_norm": 0.7070446501982883, "learning_rate": 1.1898598556392986e-07, "loss": 0.0078, "step": 8527 }, { "epoch": 3.8798908098271156, "grad_norm": 0.6693012823431275, "learning_rate": 1.1889344792065813e-07, "loss": 0.0111, "step": 8528 }, { "epoch": 3.880345768880801, "grad_norm": 1.0240272619273132, "learning_rate": 1.188009414196926e-07, "loss": 0.007, "step": 8529 }, { "epoch": 3.880800727934486, "grad_norm": 0.5783837675926145, "learning_rate": 1.1870846606859286e-07, "loss": 0.0085, "step": 8530 }, { "epoch": 3.881255686988171, "grad_norm": 0.6390591326962564, "learning_rate": 1.1861602187491532e-07, "loss": 0.0087, "step": 8531 }, { "epoch": 3.8817106460418564, "grad_norm": 0.6246978318580182, "learning_rate": 1.1852360884621415e-07, "loss": 0.016, "step": 8532 }, { "epoch": 3.8821656050955413, "grad_norm": 0.5420878645811056, "learning_rate": 1.1843122699004082e-07, "loss": 0.009, "step": 8533 }, { "epoch": 3.8826205641492266, "grad_norm": 0.600036022775182, "learning_rate": 1.1833887631394446e-07, "loss": 0.0095, "step": 8534 }, { "epoch": 3.883075523202912, "grad_norm": 0.5152570733168643, "learning_rate": 1.1824655682547174e-07, "loss": 0.0072, "step": 8535 }, { "epoch": 3.883530482256597, "grad_norm": 0.8481314959351133, "learning_rate": 1.1815426853216636e-07, "loss": 0.0059, "step": 8536 }, { "epoch": 3.883985441310282, "grad_norm": 1.176381986839215, "learning_rate": 1.1806201144156979e-07, "loss": 0.0191, "step": 8537 }, { "epoch": 3.8844404003639674, "grad_norm": 0.8202381923689072, "learning_rate": 1.1796978556122068e-07, "loss": 0.0259, "step": 8538 }, { "epoch": 3.8848953594176523, "grad_norm": 0.6385906423915313, "learning_rate": 1.1787759089865557e-07, "loss": 0.0104, "step": 8539 }, { "epoch": 3.8853503184713376, "grad_norm": 0.962195612498455, "learning_rate": 1.1778542746140813e-07, "loss": 0.0186, "step": 8540 }, { "epoch": 3.885805277525023, "grad_norm": 0.5187355685480922, "learning_rate": 1.1769329525700932e-07, "loss": 0.0071, "step": 8541 }, { "epoch": 3.886260236578708, "grad_norm": 1.3285913583155449, "learning_rate": 1.176011942929881e-07, "loss": 0.0178, "step": 8542 }, { "epoch": 3.886715195632393, "grad_norm": 1.1054405673709906, "learning_rate": 1.1750912457687024e-07, "loss": 0.0136, "step": 8543 }, { "epoch": 3.8871701546860784, "grad_norm": 0.5111409756328967, "learning_rate": 1.1741708611617951e-07, "loss": 0.0069, "step": 8544 }, { "epoch": 3.8876251137397633, "grad_norm": 0.6630518728683619, "learning_rate": 1.173250789184368e-07, "loss": 0.0075, "step": 8545 }, { "epoch": 3.8880800727934486, "grad_norm": 0.8075997860848939, "learning_rate": 1.1723310299116051e-07, "loss": 0.0167, "step": 8546 }, { "epoch": 3.888535031847134, "grad_norm": 0.5926180200356403, "learning_rate": 1.1714115834186644e-07, "loss": 0.0145, "step": 8547 }, { "epoch": 3.8889899909008188, "grad_norm": 0.4703206466153774, "learning_rate": 1.1704924497806773e-07, "loss": 0.0113, "step": 8548 }, { "epoch": 3.889444949954504, "grad_norm": 0.4284421738388875, "learning_rate": 1.1695736290727554e-07, "loss": 0.0034, "step": 8549 }, { "epoch": 3.8898999090081894, "grad_norm": 0.6433218143233748, "learning_rate": 1.1686551213699785e-07, "loss": 0.0089, "step": 8550 }, { "epoch": 3.8903548680618742, "grad_norm": 1.0203832376043924, "learning_rate": 1.1677369267474035e-07, "loss": 0.0046, "step": 8551 }, { "epoch": 3.8908098271155596, "grad_norm": 0.6123041530911524, "learning_rate": 1.1668190452800603e-07, "loss": 0.008, "step": 8552 }, { "epoch": 3.891264786169245, "grad_norm": 0.48012076522304137, "learning_rate": 1.1659014770429526e-07, "loss": 0.011, "step": 8553 }, { "epoch": 3.8917197452229297, "grad_norm": 0.8222532028539079, "learning_rate": 1.1649842221110628e-07, "loss": 0.0154, "step": 8554 }, { "epoch": 3.892174704276615, "grad_norm": 0.7260393334951357, "learning_rate": 1.1640672805593422e-07, "loss": 0.0112, "step": 8555 }, { "epoch": 3.8926296633303004, "grad_norm": 0.622386007785115, "learning_rate": 1.1631506524627221e-07, "loss": 0.0056, "step": 8556 }, { "epoch": 3.8930846223839852, "grad_norm": 0.5834092768113225, "learning_rate": 1.1622343378961036e-07, "loss": 0.0152, "step": 8557 }, { "epoch": 3.8935395814376705, "grad_norm": 0.6169722900566552, "learning_rate": 1.1613183369343627e-07, "loss": 0.0086, "step": 8558 }, { "epoch": 3.893994540491356, "grad_norm": 0.5962762381166093, "learning_rate": 1.1604026496523534e-07, "loss": 0.0068, "step": 8559 }, { "epoch": 3.8944494995450407, "grad_norm": 0.9121227848808199, "learning_rate": 1.1594872761248998e-07, "loss": 0.0068, "step": 8560 }, { "epoch": 3.894904458598726, "grad_norm": 1.1466926276614724, "learning_rate": 1.158572216426802e-07, "loss": 0.0092, "step": 8561 }, { "epoch": 3.8953594176524113, "grad_norm": 1.0468060860952928, "learning_rate": 1.1576574706328341e-07, "loss": 0.0039, "step": 8562 }, { "epoch": 3.895814376706096, "grad_norm": 0.578511263039794, "learning_rate": 1.1567430388177457e-07, "loss": 0.0107, "step": 8563 }, { "epoch": 3.8962693357597815, "grad_norm": 0.5449405213533962, "learning_rate": 1.1558289210562616e-07, "loss": 0.0122, "step": 8564 }, { "epoch": 3.896724294813467, "grad_norm": 0.7731303803723304, "learning_rate": 1.1549151174230775e-07, "loss": 0.0162, "step": 8565 }, { "epoch": 3.8971792538671517, "grad_norm": 0.5318662384933809, "learning_rate": 1.1540016279928666e-07, "loss": 0.007, "step": 8566 }, { "epoch": 3.897634212920837, "grad_norm": 0.7973699932092244, "learning_rate": 1.1530884528402723e-07, "loss": 0.0185, "step": 8567 }, { "epoch": 3.8980891719745223, "grad_norm": 0.7881447914462326, "learning_rate": 1.1521755920399189e-07, "loss": 0.0133, "step": 8568 }, { "epoch": 3.8985441310282076, "grad_norm": 0.385548067039919, "learning_rate": 1.1512630456663975e-07, "loss": 0.0034, "step": 8569 }, { "epoch": 3.8989990900818925, "grad_norm": 0.9760107467251884, "learning_rate": 1.1503508137942813e-07, "loss": 0.0074, "step": 8570 }, { "epoch": 3.899454049135578, "grad_norm": 1.4030866828309405, "learning_rate": 1.1494388964981117e-07, "loss": 0.0068, "step": 8571 }, { "epoch": 3.899909008189263, "grad_norm": 0.41037264877760626, "learning_rate": 1.1485272938524043e-07, "loss": 0.004, "step": 8572 }, { "epoch": 3.900363967242948, "grad_norm": 1.0677727708375928, "learning_rate": 1.1476160059316559e-07, "loss": 0.0148, "step": 8573 }, { "epoch": 3.9008189262966333, "grad_norm": 0.7117656127784545, "learning_rate": 1.1467050328103295e-07, "loss": 0.0053, "step": 8574 }, { "epoch": 3.9012738853503186, "grad_norm": 0.785550859910128, "learning_rate": 1.1457943745628668e-07, "loss": 0.0222, "step": 8575 }, { "epoch": 3.901728844404004, "grad_norm": 0.5392750608137112, "learning_rate": 1.144884031263681e-07, "loss": 0.0108, "step": 8576 }, { "epoch": 3.902183803457689, "grad_norm": 0.488506803832639, "learning_rate": 1.143974002987162e-07, "loss": 0.0098, "step": 8577 }, { "epoch": 3.902638762511374, "grad_norm": 0.6071617743371884, "learning_rate": 1.1430642898076759e-07, "loss": 0.0041, "step": 8578 }, { "epoch": 3.9030937215650594, "grad_norm": 0.3677283196232011, "learning_rate": 1.1421548917995582e-07, "loss": 0.0026, "step": 8579 }, { "epoch": 3.9035486806187443, "grad_norm": 0.5328045875523724, "learning_rate": 1.1412458090371207e-07, "loss": 0.0058, "step": 8580 }, { "epoch": 3.9040036396724296, "grad_norm": 0.552289652213895, "learning_rate": 1.1403370415946484e-07, "loss": 0.0097, "step": 8581 }, { "epoch": 3.904458598726115, "grad_norm": 1.1354694185753385, "learning_rate": 1.139428589546404e-07, "loss": 0.0129, "step": 8582 }, { "epoch": 3.9049135577797998, "grad_norm": 0.8007946723928308, "learning_rate": 1.1385204529666204e-07, "loss": 0.0134, "step": 8583 }, { "epoch": 3.905368516833485, "grad_norm": 1.8630005377472685, "learning_rate": 1.1376126319295077e-07, "loss": 0.0439, "step": 8584 }, { "epoch": 3.9058234758871704, "grad_norm": 0.5696801232239791, "learning_rate": 1.1367051265092486e-07, "loss": 0.0064, "step": 8585 }, { "epoch": 3.9062784349408552, "grad_norm": 0.8088263743538542, "learning_rate": 1.1357979367800002e-07, "loss": 0.0206, "step": 8586 }, { "epoch": 3.9067333939945406, "grad_norm": 0.9569227249937047, "learning_rate": 1.1348910628158925e-07, "loss": 0.013, "step": 8587 }, { "epoch": 3.907188353048226, "grad_norm": 0.9686028533290356, "learning_rate": 1.133984504691034e-07, "loss": 0.0153, "step": 8588 }, { "epoch": 3.9076433121019107, "grad_norm": 0.7308334649590408, "learning_rate": 1.1330782624795026e-07, "loss": 0.0105, "step": 8589 }, { "epoch": 3.908098271155596, "grad_norm": 0.7564545992155586, "learning_rate": 1.1321723362553514e-07, "loss": 0.0153, "step": 8590 }, { "epoch": 3.9085532302092814, "grad_norm": 0.4744691767707516, "learning_rate": 1.1312667260926117e-07, "loss": 0.0089, "step": 8591 }, { "epoch": 3.9090081892629662, "grad_norm": 0.866970393032781, "learning_rate": 1.1303614320652827e-07, "loss": 0.0192, "step": 8592 }, { "epoch": 3.9094631483166515, "grad_norm": 0.9212861639963761, "learning_rate": 1.1294564542473434e-07, "loss": 0.0145, "step": 8593 }, { "epoch": 3.909918107370337, "grad_norm": 0.7768956918112158, "learning_rate": 1.1285517927127436e-07, "loss": 0.0102, "step": 8594 }, { "epoch": 3.9103730664240217, "grad_norm": 0.9225521887335039, "learning_rate": 1.1276474475354075e-07, "loss": 0.0105, "step": 8595 }, { "epoch": 3.910828025477707, "grad_norm": 0.6794259713531394, "learning_rate": 1.1267434187892338e-07, "loss": 0.0127, "step": 8596 }, { "epoch": 3.9112829845313923, "grad_norm": 0.5381586221251305, "learning_rate": 1.1258397065480963e-07, "loss": 0.008, "step": 8597 }, { "epoch": 3.911737943585077, "grad_norm": 0.7379235050480398, "learning_rate": 1.124936310885844e-07, "loss": 0.0073, "step": 8598 }, { "epoch": 3.9121929026387625, "grad_norm": 1.2579238490729605, "learning_rate": 1.1240332318762963e-07, "loss": 0.0101, "step": 8599 }, { "epoch": 3.912647861692448, "grad_norm": 0.4124310088954016, "learning_rate": 1.1231304695932492e-07, "loss": 0.0075, "step": 8600 }, { "epoch": 3.9131028207461327, "grad_norm": 0.6284684178237712, "learning_rate": 1.1222280241104715e-07, "loss": 0.0079, "step": 8601 }, { "epoch": 3.913557779799818, "grad_norm": 0.5501483354888838, "learning_rate": 1.1213258955017086e-07, "loss": 0.0067, "step": 8602 }, { "epoch": 3.9140127388535033, "grad_norm": 0.6600557283801004, "learning_rate": 1.1204240838406781e-07, "loss": 0.0224, "step": 8603 }, { "epoch": 3.914467697907188, "grad_norm": 1.2906285941800866, "learning_rate": 1.1195225892010696e-07, "loss": 0.0166, "step": 8604 }, { "epoch": 3.9149226569608735, "grad_norm": 0.8292586172374238, "learning_rate": 1.1186214116565529e-07, "loss": 0.0098, "step": 8605 }, { "epoch": 3.915377616014559, "grad_norm": 2.0854683526605045, "learning_rate": 1.1177205512807642e-07, "loss": 0.0054, "step": 8606 }, { "epoch": 3.9158325750682437, "grad_norm": 0.567616236607331, "learning_rate": 1.1168200081473217e-07, "loss": 0.0061, "step": 8607 }, { "epoch": 3.916287534121929, "grad_norm": 0.7388714462570067, "learning_rate": 1.1159197823298116e-07, "loss": 0.0131, "step": 8608 }, { "epoch": 3.9167424931756143, "grad_norm": 1.2367842781465568, "learning_rate": 1.1150198739017969e-07, "loss": 0.0119, "step": 8609 }, { "epoch": 3.917197452229299, "grad_norm": 0.8597713089190286, "learning_rate": 1.1141202829368124e-07, "loss": 0.0145, "step": 8610 }, { "epoch": 3.9176524112829845, "grad_norm": 0.5288649460482994, "learning_rate": 1.1132210095083694e-07, "loss": 0.006, "step": 8611 }, { "epoch": 3.91810737033667, "grad_norm": 0.6961627181081346, "learning_rate": 1.1123220536899547e-07, "loss": 0.0089, "step": 8612 }, { "epoch": 3.9185623293903546, "grad_norm": 0.3927613959561521, "learning_rate": 1.1114234155550251e-07, "loss": 0.0062, "step": 8613 }, { "epoch": 3.91901728844404, "grad_norm": 0.8364403530210055, "learning_rate": 1.1105250951770128e-07, "loss": 0.011, "step": 8614 }, { "epoch": 3.9194722474977253, "grad_norm": 0.6570995230707234, "learning_rate": 1.1096270926293244e-07, "loss": 0.011, "step": 8615 }, { "epoch": 3.91992720655141, "grad_norm": 0.6715543988086399, "learning_rate": 1.1087294079853421e-07, "loss": 0.0114, "step": 8616 }, { "epoch": 3.9203821656050954, "grad_norm": 0.6330101965457471, "learning_rate": 1.1078320413184201e-07, "loss": 0.0082, "step": 8617 }, { "epoch": 3.9208371246587808, "grad_norm": 0.5215040623018272, "learning_rate": 1.1069349927018856e-07, "loss": 0.0099, "step": 8618 }, { "epoch": 3.9212920837124656, "grad_norm": 0.8417743462340642, "learning_rate": 1.1060382622090436e-07, "loss": 0.0111, "step": 8619 }, { "epoch": 3.921747042766151, "grad_norm": 0.7859500857131145, "learning_rate": 1.1051418499131682e-07, "loss": 0.0111, "step": 8620 }, { "epoch": 3.9222020018198362, "grad_norm": 0.6632450613629585, "learning_rate": 1.1042457558875135e-07, "loss": 0.0164, "step": 8621 }, { "epoch": 3.922656960873521, "grad_norm": 0.5092367846273677, "learning_rate": 1.1033499802053026e-07, "loss": 0.0112, "step": 8622 }, { "epoch": 3.9231119199272064, "grad_norm": 0.6119648929248173, "learning_rate": 1.1024545229397342e-07, "loss": 0.0091, "step": 8623 }, { "epoch": 3.9235668789808917, "grad_norm": 0.8816349173411854, "learning_rate": 1.1015593841639809e-07, "loss": 0.0151, "step": 8624 }, { "epoch": 3.924021838034577, "grad_norm": 1.0877080080108166, "learning_rate": 1.100664563951188e-07, "loss": 0.0123, "step": 8625 }, { "epoch": 3.924476797088262, "grad_norm": 0.6185969531932369, "learning_rate": 1.0997700623744782e-07, "loss": 0.0149, "step": 8626 }, { "epoch": 3.9249317561419472, "grad_norm": 0.2013618218318165, "learning_rate": 1.0988758795069463e-07, "loss": 0.0013, "step": 8627 }, { "epoch": 3.9253867151956325, "grad_norm": 0.37953513244694126, "learning_rate": 1.0979820154216607e-07, "loss": 0.0032, "step": 8628 }, { "epoch": 3.9258416742493174, "grad_norm": 0.5767503518283007, "learning_rate": 1.0970884701916632e-07, "loss": 0.0083, "step": 8629 }, { "epoch": 3.9262966333030027, "grad_norm": 0.6023912063703166, "learning_rate": 1.0961952438899696e-07, "loss": 0.0087, "step": 8630 }, { "epoch": 3.926751592356688, "grad_norm": 0.6548427849358992, "learning_rate": 1.095302336589572e-07, "loss": 0.0115, "step": 8631 }, { "epoch": 3.9272065514103733, "grad_norm": 0.9921181313654042, "learning_rate": 1.0944097483634329e-07, "loss": 0.0146, "step": 8632 }, { "epoch": 3.927661510464058, "grad_norm": 0.787220305790091, "learning_rate": 1.0935174792844931e-07, "loss": 0.0147, "step": 8633 }, { "epoch": 3.9281164695177435, "grad_norm": 0.5848049107728375, "learning_rate": 1.0926255294256637e-07, "loss": 0.0065, "step": 8634 }, { "epoch": 3.928571428571429, "grad_norm": 0.5535790999723047, "learning_rate": 1.0917338988598285e-07, "loss": 0.0055, "step": 8635 }, { "epoch": 3.9290263876251137, "grad_norm": 0.5306612941575879, "learning_rate": 1.090842587659851e-07, "loss": 0.007, "step": 8636 }, { "epoch": 3.929481346678799, "grad_norm": 0.7316652236322583, "learning_rate": 1.0899515958985639e-07, "loss": 0.012, "step": 8637 }, { "epoch": 3.9299363057324843, "grad_norm": 0.6714627651668873, "learning_rate": 1.0890609236487747e-07, "loss": 0.012, "step": 8638 }, { "epoch": 3.930391264786169, "grad_norm": 0.4284283084084919, "learning_rate": 1.0881705709832639e-07, "loss": 0.0024, "step": 8639 }, { "epoch": 3.9308462238398545, "grad_norm": 0.5781345367939654, "learning_rate": 1.0872805379747879e-07, "loss": 0.0083, "step": 8640 }, { "epoch": 3.93130118289354, "grad_norm": 0.6670800763697257, "learning_rate": 1.0863908246960784e-07, "loss": 0.0114, "step": 8641 }, { "epoch": 3.9317561419472247, "grad_norm": 2.1182076800012672, "learning_rate": 1.0855014312198368e-07, "loss": 0.0135, "step": 8642 }, { "epoch": 3.93221110100091, "grad_norm": 0.5846475287689277, "learning_rate": 1.0846123576187411e-07, "loss": 0.0083, "step": 8643 }, { "epoch": 3.9326660600545953, "grad_norm": 0.6385698289818327, "learning_rate": 1.0837236039654396e-07, "loss": 0.014, "step": 8644 }, { "epoch": 3.93312101910828, "grad_norm": 0.8445717128201865, "learning_rate": 1.0828351703325612e-07, "loss": 0.009, "step": 8645 }, { "epoch": 3.9335759781619655, "grad_norm": 0.752411056037435, "learning_rate": 1.081947056792702e-07, "loss": 0.0129, "step": 8646 }, { "epoch": 3.934030937215651, "grad_norm": 0.4088493038479333, "learning_rate": 1.0810592634184362e-07, "loss": 0.0044, "step": 8647 }, { "epoch": 3.9344858962693356, "grad_norm": 0.4919973223279204, "learning_rate": 1.0801717902823098e-07, "loss": 0.0138, "step": 8648 }, { "epoch": 3.934940855323021, "grad_norm": 0.3186395770900219, "learning_rate": 1.0792846374568416e-07, "loss": 0.0027, "step": 8649 }, { "epoch": 3.9353958143767063, "grad_norm": 0.6164520482874368, "learning_rate": 1.0783978050145288e-07, "loss": 0.0078, "step": 8650 }, { "epoch": 3.935850773430391, "grad_norm": 0.8646823004872666, "learning_rate": 1.0775112930278368e-07, "loss": 0.0144, "step": 8651 }, { "epoch": 3.9363057324840764, "grad_norm": 1.1904222849486472, "learning_rate": 1.0766251015692085e-07, "loss": 0.0193, "step": 8652 }, { "epoch": 3.9367606915377618, "grad_norm": 1.7498214518981623, "learning_rate": 1.075739230711058e-07, "loss": 0.0207, "step": 8653 }, { "epoch": 3.9372156505914466, "grad_norm": 0.6538560612719513, "learning_rate": 1.0748536805257752e-07, "loss": 0.0089, "step": 8654 }, { "epoch": 3.937670609645132, "grad_norm": 0.5724238871693562, "learning_rate": 1.0739684510857255e-07, "loss": 0.0078, "step": 8655 }, { "epoch": 3.9381255686988172, "grad_norm": 1.0024032752328234, "learning_rate": 1.0730835424632444e-07, "loss": 0.0082, "step": 8656 }, { "epoch": 3.938580527752502, "grad_norm": 0.9319089333304592, "learning_rate": 1.072198954730642e-07, "loss": 0.0121, "step": 8657 }, { "epoch": 3.9390354868061874, "grad_norm": 0.673860512572436, "learning_rate": 1.0713146879602036e-07, "loss": 0.0139, "step": 8658 }, { "epoch": 3.9394904458598727, "grad_norm": 0.739038722460154, "learning_rate": 1.0704307422241854e-07, "loss": 0.0072, "step": 8659 }, { "epoch": 3.9399454049135576, "grad_norm": 0.4285573244112637, "learning_rate": 1.0695471175948212e-07, "loss": 0.0067, "step": 8660 }, { "epoch": 3.940400363967243, "grad_norm": 0.6557278931728411, "learning_rate": 1.0686638141443182e-07, "loss": 0.0129, "step": 8661 }, { "epoch": 3.9408553230209282, "grad_norm": 0.6453253251337351, "learning_rate": 1.067780831944855e-07, "loss": 0.0116, "step": 8662 }, { "epoch": 3.941310282074613, "grad_norm": 0.7667418161956268, "learning_rate": 1.0668981710685843e-07, "loss": 0.0145, "step": 8663 }, { "epoch": 3.9417652411282984, "grad_norm": 0.3791920157669289, "learning_rate": 1.0660158315876317e-07, "loss": 0.0055, "step": 8664 }, { "epoch": 3.9422202001819837, "grad_norm": 0.782476628101818, "learning_rate": 1.0651338135741005e-07, "loss": 0.0111, "step": 8665 }, { "epoch": 3.9426751592356686, "grad_norm": 1.0861247948108184, "learning_rate": 1.0642521171000651e-07, "loss": 0.0147, "step": 8666 }, { "epoch": 3.943130118289354, "grad_norm": 0.9151505152445368, "learning_rate": 1.0633707422375715e-07, "loss": 0.0202, "step": 8667 }, { "epoch": 3.943585077343039, "grad_norm": 0.3448127573054543, "learning_rate": 1.0624896890586448e-07, "loss": 0.0037, "step": 8668 }, { "epoch": 3.944040036396724, "grad_norm": 1.169544176020226, "learning_rate": 1.0616089576352771e-07, "loss": 0.01, "step": 8669 }, { "epoch": 3.9444949954504094, "grad_norm": 0.8939563137103134, "learning_rate": 1.0607285480394418e-07, "loss": 0.0055, "step": 8670 }, { "epoch": 3.9449499545040947, "grad_norm": 0.707589462710664, "learning_rate": 1.0598484603430796e-07, "loss": 0.015, "step": 8671 }, { "epoch": 3.9454049135577796, "grad_norm": 0.535587288538872, "learning_rate": 1.0589686946181075e-07, "loss": 0.0032, "step": 8672 }, { "epoch": 3.945859872611465, "grad_norm": 0.8356947578626405, "learning_rate": 1.0580892509364147e-07, "loss": 0.0156, "step": 8673 }, { "epoch": 3.94631483166515, "grad_norm": 1.0574535539793575, "learning_rate": 1.0572101293698671e-07, "loss": 0.011, "step": 8674 }, { "epoch": 3.946769790718835, "grad_norm": 0.6878258934777283, "learning_rate": 1.0563313299903037e-07, "loss": 0.0074, "step": 8675 }, { "epoch": 3.9472247497725204, "grad_norm": 0.7569028248295371, "learning_rate": 1.0554528528695344e-07, "loss": 0.0187, "step": 8676 }, { "epoch": 3.9476797088262057, "grad_norm": 0.6702946939783441, "learning_rate": 1.0545746980793446e-07, "loss": 0.0107, "step": 8677 }, { "epoch": 3.9481346678798905, "grad_norm": 0.5417946715157065, "learning_rate": 1.0536968656914913e-07, "loss": 0.0083, "step": 8678 }, { "epoch": 3.948589626933576, "grad_norm": 0.6826368484167549, "learning_rate": 1.0528193557777109e-07, "loss": 0.005, "step": 8679 }, { "epoch": 3.949044585987261, "grad_norm": 0.7720115964517752, "learning_rate": 1.0519421684097068e-07, "loss": 0.0144, "step": 8680 }, { "epoch": 3.9494995450409465, "grad_norm": 0.511299651417319, "learning_rate": 1.0510653036591583e-07, "loss": 0.0073, "step": 8681 }, { "epoch": 3.9499545040946313, "grad_norm": 0.6990664656282842, "learning_rate": 1.0501887615977211e-07, "loss": 0.0042, "step": 8682 }, { "epoch": 3.9504094631483166, "grad_norm": 0.8134942565950375, "learning_rate": 1.04931254229702e-07, "loss": 0.0153, "step": 8683 }, { "epoch": 3.950864422202002, "grad_norm": 0.6593679997234868, "learning_rate": 1.0484366458286586e-07, "loss": 0.0117, "step": 8684 }, { "epoch": 3.951319381255687, "grad_norm": 0.977717102601891, "learning_rate": 1.0475610722642086e-07, "loss": 0.015, "step": 8685 }, { "epoch": 3.951774340309372, "grad_norm": 1.0090069988842159, "learning_rate": 1.0466858216752194e-07, "loss": 0.0155, "step": 8686 }, { "epoch": 3.9522292993630574, "grad_norm": 0.6682453180250725, "learning_rate": 1.04581089413321e-07, "loss": 0.015, "step": 8687 }, { "epoch": 3.9526842584167428, "grad_norm": 0.8466744057632597, "learning_rate": 1.0449362897096775e-07, "loss": 0.019, "step": 8688 }, { "epoch": 3.9531392174704276, "grad_norm": 0.6271287678574774, "learning_rate": 1.0440620084760921e-07, "loss": 0.0115, "step": 8689 }, { "epoch": 3.953594176524113, "grad_norm": 0.6785514179459011, "learning_rate": 1.0431880505038942e-07, "loss": 0.0061, "step": 8690 }, { "epoch": 3.9540491355777982, "grad_norm": 1.1587790129712559, "learning_rate": 1.0423144158644997e-07, "loss": 0.0092, "step": 8691 }, { "epoch": 3.954504094631483, "grad_norm": 0.37594689448502033, "learning_rate": 1.0414411046292992e-07, "loss": 0.0082, "step": 8692 }, { "epoch": 3.9549590536851684, "grad_norm": 0.6423559349531752, "learning_rate": 1.0405681168696529e-07, "loss": 0.0045, "step": 8693 }, { "epoch": 3.9554140127388537, "grad_norm": 0.8115852698683436, "learning_rate": 1.0396954526569012e-07, "loss": 0.0079, "step": 8694 }, { "epoch": 3.9558689717925386, "grad_norm": 0.7155328541333671, "learning_rate": 1.0388231120623509e-07, "loss": 0.006, "step": 8695 }, { "epoch": 3.956323930846224, "grad_norm": 0.5667203614723506, "learning_rate": 1.037951095157289e-07, "loss": 0.0139, "step": 8696 }, { "epoch": 3.9567788898999092, "grad_norm": 0.7767492570910604, "learning_rate": 1.0370794020129709e-07, "loss": 0.0107, "step": 8697 }, { "epoch": 3.957233848953594, "grad_norm": 0.8233109504975986, "learning_rate": 1.0362080327006262e-07, "loss": 0.0058, "step": 8698 }, { "epoch": 3.9576888080072794, "grad_norm": 0.7996720475854577, "learning_rate": 1.0353369872914625e-07, "loss": 0.0069, "step": 8699 }, { "epoch": 3.9581437670609647, "grad_norm": 0.7367330068880047, "learning_rate": 1.0344662658566561e-07, "loss": 0.0151, "step": 8700 }, { "epoch": 3.9585987261146496, "grad_norm": 0.5703468011366513, "learning_rate": 1.0335958684673573e-07, "loss": 0.0104, "step": 8701 }, { "epoch": 3.959053685168335, "grad_norm": 0.624472549903746, "learning_rate": 1.0327257951946916e-07, "loss": 0.0079, "step": 8702 }, { "epoch": 3.95950864422202, "grad_norm": 0.7908063256768636, "learning_rate": 1.0318560461097575e-07, "loss": 0.0149, "step": 8703 }, { "epoch": 3.959963603275705, "grad_norm": 0.6265362222892875, "learning_rate": 1.0309866212836287e-07, "loss": 0.0149, "step": 8704 }, { "epoch": 3.9604185623293904, "grad_norm": 0.6098700375972513, "learning_rate": 1.0301175207873491e-07, "loss": 0.0042, "step": 8705 }, { "epoch": 3.9608735213830757, "grad_norm": 1.0071458965140707, "learning_rate": 1.0292487446919385e-07, "loss": 0.0102, "step": 8706 }, { "epoch": 3.9613284804367606, "grad_norm": 0.7523112754392863, "learning_rate": 1.0283802930683866e-07, "loss": 0.0079, "step": 8707 }, { "epoch": 3.961783439490446, "grad_norm": 0.768813593372236, "learning_rate": 1.0275121659876634e-07, "loss": 0.0151, "step": 8708 }, { "epoch": 3.962238398544131, "grad_norm": 0.4310720028922905, "learning_rate": 1.0266443635207051e-07, "loss": 0.0037, "step": 8709 }, { "epoch": 3.962693357597816, "grad_norm": 0.9829217110663344, "learning_rate": 1.0257768857384269e-07, "loss": 0.0108, "step": 8710 }, { "epoch": 3.9631483166515014, "grad_norm": 1.5466930441693836, "learning_rate": 1.024909732711714e-07, "loss": 0.0195, "step": 8711 }, { "epoch": 3.9636032757051867, "grad_norm": 0.4372235991868267, "learning_rate": 1.0240429045114257e-07, "loss": 0.0038, "step": 8712 }, { "epoch": 3.9640582347588715, "grad_norm": 0.4922675216335801, "learning_rate": 1.0231764012083966e-07, "loss": 0.0066, "step": 8713 }, { "epoch": 3.964513193812557, "grad_norm": 0.8487998811394941, "learning_rate": 1.0223102228734331e-07, "loss": 0.0158, "step": 8714 }, { "epoch": 3.964968152866242, "grad_norm": 0.6839580598456438, "learning_rate": 1.0214443695773151e-07, "loss": 0.0127, "step": 8715 }, { "epoch": 3.965423111919927, "grad_norm": 0.3448843586380723, "learning_rate": 1.020578841390795e-07, "loss": 0.005, "step": 8716 }, { "epoch": 3.9658780709736123, "grad_norm": 0.5769271853287143, "learning_rate": 1.0197136383846011e-07, "loss": 0.0062, "step": 8717 }, { "epoch": 3.9663330300272976, "grad_norm": 0.8671256192091781, "learning_rate": 1.018848760629435e-07, "loss": 0.0092, "step": 8718 }, { "epoch": 3.9667879890809825, "grad_norm": 0.8404389448258782, "learning_rate": 1.0179842081959694e-07, "loss": 0.0106, "step": 8719 }, { "epoch": 3.967242948134668, "grad_norm": 0.7970531478920281, "learning_rate": 1.017119981154852e-07, "loss": 0.0115, "step": 8720 }, { "epoch": 3.967697907188353, "grad_norm": 0.628188229651761, "learning_rate": 1.0162560795767017e-07, "loss": 0.0058, "step": 8721 }, { "epoch": 3.968152866242038, "grad_norm": 0.5177437414768856, "learning_rate": 1.0153925035321154e-07, "loss": 0.0082, "step": 8722 }, { "epoch": 3.9686078252957233, "grad_norm": 0.5112739910465252, "learning_rate": 1.0145292530916583e-07, "loss": 0.0089, "step": 8723 }, { "epoch": 3.9690627843494086, "grad_norm": 0.40540894990767007, "learning_rate": 1.0136663283258734e-07, "loss": 0.0038, "step": 8724 }, { "epoch": 3.9695177434030935, "grad_norm": 0.5375763125013957, "learning_rate": 1.0128037293052744e-07, "loss": 0.0094, "step": 8725 }, { "epoch": 3.969972702456779, "grad_norm": 0.8404933307393482, "learning_rate": 1.011941456100347e-07, "loss": 0.0164, "step": 8726 }, { "epoch": 3.970427661510464, "grad_norm": 0.8133190606084778, "learning_rate": 1.0110795087815554e-07, "loss": 0.0229, "step": 8727 }, { "epoch": 3.970882620564149, "grad_norm": 0.9209495002756888, "learning_rate": 1.0102178874193323e-07, "loss": 0.0106, "step": 8728 }, { "epoch": 3.9713375796178343, "grad_norm": 0.7593525836922294, "learning_rate": 1.0093565920840862e-07, "loss": 0.0081, "step": 8729 }, { "epoch": 3.9717925386715196, "grad_norm": 0.6535973987837698, "learning_rate": 1.008495622846196e-07, "loss": 0.0089, "step": 8730 }, { "epoch": 3.9722474977252045, "grad_norm": 0.7680697366771582, "learning_rate": 1.0076349797760198e-07, "loss": 0.0118, "step": 8731 }, { "epoch": 3.97270245677889, "grad_norm": 0.5310494479427964, "learning_rate": 1.0067746629438817e-07, "loss": 0.0068, "step": 8732 }, { "epoch": 3.973157415832575, "grad_norm": 0.5382994925470888, "learning_rate": 1.0059146724200867e-07, "loss": 0.0102, "step": 8733 }, { "epoch": 3.9736123748862604, "grad_norm": 0.5727016394644007, "learning_rate": 1.0050550082749076e-07, "loss": 0.0176, "step": 8734 }, { "epoch": 3.9740673339399453, "grad_norm": 0.9935656405303261, "learning_rate": 1.004195670578592e-07, "loss": 0.0065, "step": 8735 }, { "epoch": 3.9745222929936306, "grad_norm": 0.6359075927889619, "learning_rate": 1.0033366594013604e-07, "loss": 0.0101, "step": 8736 }, { "epoch": 3.974977252047316, "grad_norm": 0.714456263090367, "learning_rate": 1.0024779748134077e-07, "loss": 0.003, "step": 8737 }, { "epoch": 3.9754322111010008, "grad_norm": 0.35238682757727025, "learning_rate": 1.0016196168849039e-07, "loss": 0.0026, "step": 8738 }, { "epoch": 3.975887170154686, "grad_norm": 0.7245965867261163, "learning_rate": 1.000761585685988e-07, "loss": 0.0112, "step": 8739 }, { "epoch": 3.9763421292083714, "grad_norm": 0.7741975119431838, "learning_rate": 9.999038812867755e-08, "loss": 0.0106, "step": 8740 }, { "epoch": 3.9767970882620567, "grad_norm": 0.4616239439937123, "learning_rate": 9.99046503757352e-08, "loss": 0.0042, "step": 8741 }, { "epoch": 3.9772520473157416, "grad_norm": 0.30745147042840076, "learning_rate": 9.98189453167781e-08, "loss": 0.0046, "step": 8742 }, { "epoch": 3.977707006369427, "grad_norm": 0.9327317725894885, "learning_rate": 9.973327295880962e-08, "loss": 0.0149, "step": 8743 }, { "epoch": 3.978161965423112, "grad_norm": 0.4768162761010355, "learning_rate": 9.964763330883036e-08, "loss": 0.0057, "step": 8744 }, { "epoch": 3.978616924476797, "grad_norm": 0.7017630481234045, "learning_rate": 9.956202637383871e-08, "loss": 0.0202, "step": 8745 }, { "epoch": 3.9790718835304824, "grad_norm": 4.438741883915154, "learning_rate": 9.947645216082967e-08, "loss": 0.0552, "step": 8746 }, { "epoch": 3.9795268425841677, "grad_norm": 0.7968452890046585, "learning_rate": 9.93909106767964e-08, "loss": 0.007, "step": 8747 }, { "epoch": 3.9799818016378525, "grad_norm": 1.1266281110469727, "learning_rate": 9.930540192872877e-08, "loss": 0.0058, "step": 8748 }, { "epoch": 3.980436760691538, "grad_norm": 0.5173380169866928, "learning_rate": 9.921992592361417e-08, "loss": 0.0137, "step": 8749 }, { "epoch": 3.980891719745223, "grad_norm": 0.6784910701095603, "learning_rate": 9.913448266843721e-08, "loss": 0.0159, "step": 8750 }, { "epoch": 3.981346678798908, "grad_norm": 1.0193433462162211, "learning_rate": 9.904907217018e-08, "loss": 0.0094, "step": 8751 }, { "epoch": 3.9818016378525933, "grad_norm": 0.5816727136810181, "learning_rate": 9.896369443582209e-08, "loss": 0.0138, "step": 8752 }, { "epoch": 3.9822565969062786, "grad_norm": 1.7991436373096172, "learning_rate": 9.887834947233997e-08, "loss": 0.0126, "step": 8753 }, { "epoch": 3.9827115559599635, "grad_norm": 0.34681846843472297, "learning_rate": 9.879303728670769e-08, "loss": 0.002, "step": 8754 }, { "epoch": 3.983166515013649, "grad_norm": 0.5604217963628993, "learning_rate": 9.870775788589646e-08, "loss": 0.0088, "step": 8755 }, { "epoch": 3.983621474067334, "grad_norm": 0.5994471994216449, "learning_rate": 9.862251127687515e-08, "loss": 0.0102, "step": 8756 }, { "epoch": 3.984076433121019, "grad_norm": 0.8030741393084921, "learning_rate": 9.853729746660965e-08, "loss": 0.0094, "step": 8757 }, { "epoch": 3.9845313921747043, "grad_norm": 1.169671955027396, "learning_rate": 9.845211646206303e-08, "loss": 0.0055, "step": 8758 }, { "epoch": 3.9849863512283896, "grad_norm": 0.9349042406060727, "learning_rate": 9.836696827019624e-08, "loss": 0.0128, "step": 8759 }, { "epoch": 3.9854413102820745, "grad_norm": 0.8299629807564936, "learning_rate": 9.828185289796692e-08, "loss": 0.0063, "step": 8760 }, { "epoch": 3.98589626933576, "grad_norm": 0.6413410881015639, "learning_rate": 9.819677035233054e-08, "loss": 0.0099, "step": 8761 }, { "epoch": 3.986351228389445, "grad_norm": 0.8996680334608695, "learning_rate": 9.81117206402396e-08, "loss": 0.0127, "step": 8762 }, { "epoch": 3.98680618744313, "grad_norm": 0.31074430262363817, "learning_rate": 9.802670376864386e-08, "loss": 0.0021, "step": 8763 }, { "epoch": 3.9872611464968153, "grad_norm": 0.6283744849021394, "learning_rate": 9.794171974449067e-08, "loss": 0.0026, "step": 8764 }, { "epoch": 3.9877161055505006, "grad_norm": 0.6393678050720111, "learning_rate": 9.78567685747242e-08, "loss": 0.0123, "step": 8765 }, { "epoch": 3.9881710646041855, "grad_norm": 0.7037648813048581, "learning_rate": 9.777185026628676e-08, "loss": 0.0042, "step": 8766 }, { "epoch": 3.988626023657871, "grad_norm": 0.7824281458929396, "learning_rate": 9.768696482611726e-08, "loss": 0.0036, "step": 8767 }, { "epoch": 3.989080982711556, "grad_norm": 0.35320695020794024, "learning_rate": 9.760211226115222e-08, "loss": 0.0022, "step": 8768 }, { "epoch": 3.989535941765241, "grad_norm": 0.7840267593382769, "learning_rate": 9.751729257832531e-08, "loss": 0.0198, "step": 8769 }, { "epoch": 3.9899909008189263, "grad_norm": 0.44321648040006645, "learning_rate": 9.743250578456752e-08, "loss": 0.0059, "step": 8770 }, { "epoch": 3.9904458598726116, "grad_norm": 0.6576849921759724, "learning_rate": 9.734775188680755e-08, "loss": 0.0078, "step": 8771 }, { "epoch": 3.9909008189262964, "grad_norm": 0.6467639505370472, "learning_rate": 9.72630308919708e-08, "loss": 0.0093, "step": 8772 }, { "epoch": 3.9913557779799818, "grad_norm": 0.4938629563758258, "learning_rate": 9.717834280698051e-08, "loss": 0.0067, "step": 8773 }, { "epoch": 3.991810737033667, "grad_norm": 0.767946749283883, "learning_rate": 9.709368763875691e-08, "loss": 0.0163, "step": 8774 }, { "epoch": 3.992265696087352, "grad_norm": 0.6137918111341205, "learning_rate": 9.700906539421755e-08, "loss": 0.0068, "step": 8775 }, { "epoch": 3.9927206551410372, "grad_norm": 0.6558696039222114, "learning_rate": 9.692447608027765e-08, "loss": 0.0093, "step": 8776 }, { "epoch": 3.9931756141947226, "grad_norm": 1.2383313139269507, "learning_rate": 9.683991970384925e-08, "loss": 0.0174, "step": 8777 }, { "epoch": 3.9936305732484074, "grad_norm": 0.5865371124290912, "learning_rate": 9.675539627184193e-08, "loss": 0.0102, "step": 8778 }, { "epoch": 3.9940855323020927, "grad_norm": 1.2564508306363895, "learning_rate": 9.667090579116249e-08, "loss": 0.0193, "step": 8779 }, { "epoch": 3.994540491355778, "grad_norm": 0.7266454412153822, "learning_rate": 9.65864482687152e-08, "loss": 0.0151, "step": 8780 }, { "epoch": 3.994995450409463, "grad_norm": 0.44116285642351993, "learning_rate": 9.650202371140171e-08, "loss": 0.0045, "step": 8781 }, { "epoch": 3.9954504094631482, "grad_norm": 0.6832051899533633, "learning_rate": 9.641763212612064e-08, "loss": 0.0056, "step": 8782 }, { "epoch": 3.9959053685168335, "grad_norm": 0.4561171649036809, "learning_rate": 9.633327351976811e-08, "loss": 0.0046, "step": 8783 }, { "epoch": 3.9963603275705184, "grad_norm": 0.716398143942355, "learning_rate": 9.624894789923737e-08, "loss": 0.0059, "step": 8784 }, { "epoch": 3.9968152866242037, "grad_norm": 0.5402179373073889, "learning_rate": 9.616465527141942e-08, "loss": 0.0074, "step": 8785 }, { "epoch": 3.997270245677889, "grad_norm": 0.6382916885634363, "learning_rate": 9.608039564320208e-08, "loss": 0.011, "step": 8786 }, { "epoch": 3.997725204731574, "grad_norm": 0.657585876860311, "learning_rate": 9.599616902147079e-08, "loss": 0.0116, "step": 8787 }, { "epoch": 3.998180163785259, "grad_norm": 0.7376246448270997, "learning_rate": 9.591197541310814e-08, "loss": 0.0223, "step": 8788 }, { "epoch": 3.9986351228389445, "grad_norm": 0.5791738656690063, "learning_rate": 9.58278148249938e-08, "loss": 0.0059, "step": 8789 }, { "epoch": 3.99909008189263, "grad_norm": 0.9354329128787754, "learning_rate": 9.574368726400544e-08, "loss": 0.0098, "step": 8790 }, { "epoch": 3.9995450409463147, "grad_norm": 0.6668883165143173, "learning_rate": 9.565959273701729e-08, "loss": 0.0112, "step": 8791 }, { "epoch": 4.0, "grad_norm": 0.5698472030334093, "learning_rate": 9.557553125090123e-08, "loss": 0.0037, "step": 8792 }, { "epoch": 4.000454959053685, "grad_norm": 0.4244551761299895, "learning_rate": 9.549150281252632e-08, "loss": 0.0045, "step": 8793 }, { "epoch": 4.000909918107371, "grad_norm": 0.6514330222619343, "learning_rate": 9.540750742875903e-08, "loss": 0.0058, "step": 8794 }, { "epoch": 4.0013648771610555, "grad_norm": 0.2990472199490715, "learning_rate": 9.532354510646323e-08, "loss": 0.003, "step": 8795 }, { "epoch": 4.00181983621474, "grad_norm": 0.4696184618994566, "learning_rate": 9.52396158524998e-08, "loss": 0.0039, "step": 8796 }, { "epoch": 4.002274795268426, "grad_norm": 0.4171783285401014, "learning_rate": 9.51557196737271e-08, "loss": 0.0091, "step": 8797 }, { "epoch": 4.002729754322111, "grad_norm": 0.47494435101171306, "learning_rate": 9.507185657700062e-08, "loss": 0.0057, "step": 8798 }, { "epoch": 4.003184713375796, "grad_norm": 0.5040929863267044, "learning_rate": 9.498802656917348e-08, "loss": 0.006, "step": 8799 }, { "epoch": 4.003639672429482, "grad_norm": 0.44677449580902723, "learning_rate": 9.490422965709565e-08, "loss": 0.0049, "step": 8800 }, { "epoch": 4.0040946314831665, "grad_norm": 0.5083896495541083, "learning_rate": 9.482046584761494e-08, "loss": 0.0086, "step": 8801 }, { "epoch": 4.004549590536851, "grad_norm": 0.4493255051073451, "learning_rate": 9.473673514757596e-08, "loss": 0.0111, "step": 8802 }, { "epoch": 4.005004549590537, "grad_norm": 0.4732344636825034, "learning_rate": 9.465303756382087e-08, "loss": 0.0101, "step": 8803 }, { "epoch": 4.005459508644222, "grad_norm": 0.4764208851693609, "learning_rate": 9.456937310318886e-08, "loss": 0.0056, "step": 8804 }, { "epoch": 4.005914467697907, "grad_norm": 1.0204938377128354, "learning_rate": 9.44857417725169e-08, "loss": 0.0107, "step": 8805 }, { "epoch": 4.006369426751593, "grad_norm": 0.33083610894101095, "learning_rate": 9.440214357863885e-08, "loss": 0.0041, "step": 8806 }, { "epoch": 4.0068243858052774, "grad_norm": 0.7247250117195397, "learning_rate": 9.431857852838581e-08, "loss": 0.0096, "step": 8807 }, { "epoch": 4.007279344858962, "grad_norm": 0.7116364022986817, "learning_rate": 9.423504662858666e-08, "loss": 0.0118, "step": 8808 }, { "epoch": 4.007734303912648, "grad_norm": 0.37515765624013353, "learning_rate": 9.415154788606694e-08, "loss": 0.0038, "step": 8809 }, { "epoch": 4.008189262966333, "grad_norm": 0.5448134056468023, "learning_rate": 9.406808230765001e-08, "loss": 0.0032, "step": 8810 }, { "epoch": 4.008644222020018, "grad_norm": 0.43153357972471035, "learning_rate": 9.398464990015631e-08, "loss": 0.0041, "step": 8811 }, { "epoch": 4.0090991810737036, "grad_norm": 0.42272111358050113, "learning_rate": 9.390125067040339e-08, "loss": 0.0082, "step": 8812 }, { "epoch": 4.009554140127388, "grad_norm": 0.5392970512385028, "learning_rate": 9.381788462520623e-08, "loss": 0.0032, "step": 8813 }, { "epoch": 4.010009099181073, "grad_norm": 0.367119030707117, "learning_rate": 9.373455177137729e-08, "loss": 0.0039, "step": 8814 }, { "epoch": 4.010464058234759, "grad_norm": 0.5111724498641563, "learning_rate": 9.365125211572616e-08, "loss": 0.0049, "step": 8815 }, { "epoch": 4.010919017288444, "grad_norm": 0.6621865679443364, "learning_rate": 9.356798566505969e-08, "loss": 0.0054, "step": 8816 }, { "epoch": 4.011373976342129, "grad_norm": 0.6082048437935667, "learning_rate": 9.348475242618197e-08, "loss": 0.0117, "step": 8817 }, { "epoch": 4.0118289353958145, "grad_norm": 0.45251939575078054, "learning_rate": 9.340155240589437e-08, "loss": 0.0028, "step": 8818 }, { "epoch": 4.012283894449499, "grad_norm": 0.33205732872142496, "learning_rate": 9.331838561099587e-08, "loss": 0.0012, "step": 8819 }, { "epoch": 4.012738853503185, "grad_norm": 0.5518979914632872, "learning_rate": 9.323525204828231e-08, "loss": 0.0045, "step": 8820 }, { "epoch": 4.01319381255687, "grad_norm": 0.48093036056627375, "learning_rate": 9.315215172454688e-08, "loss": 0.0045, "step": 8821 }, { "epoch": 4.013648771610555, "grad_norm": 0.9177610973605586, "learning_rate": 9.306908464658048e-08, "loss": 0.0065, "step": 8822 }, { "epoch": 4.014103730664241, "grad_norm": 0.6367837053822749, "learning_rate": 9.298605082117062e-08, "loss": 0.0062, "step": 8823 }, { "epoch": 4.0145586897179255, "grad_norm": 0.44701018563311873, "learning_rate": 9.290305025510281e-08, "loss": 0.0052, "step": 8824 }, { "epoch": 4.01501364877161, "grad_norm": 0.48695680782284884, "learning_rate": 9.282008295515926e-08, "loss": 0.0037, "step": 8825 }, { "epoch": 4.015468607825296, "grad_norm": 0.4609436349496672, "learning_rate": 9.273714892811974e-08, "loss": 0.0078, "step": 8826 }, { "epoch": 4.015923566878981, "grad_norm": 0.9409400764691782, "learning_rate": 9.265424818076107e-08, "loss": 0.0166, "step": 8827 }, { "epoch": 4.016378525932666, "grad_norm": 0.5250039089281501, "learning_rate": 9.25713807198577e-08, "loss": 0.0072, "step": 8828 }, { "epoch": 4.016833484986352, "grad_norm": 0.33995658874219603, "learning_rate": 9.24885465521813e-08, "loss": 0.005, "step": 8829 }, { "epoch": 4.0172884440400365, "grad_norm": 0.29757627943497994, "learning_rate": 9.240574568450055e-08, "loss": 0.002, "step": 8830 }, { "epoch": 4.017743403093721, "grad_norm": 0.652021012523394, "learning_rate": 9.232297812358164e-08, "loss": 0.0139, "step": 8831 }, { "epoch": 4.018198362147407, "grad_norm": 0.5637437724928756, "learning_rate": 9.224024387618773e-08, "loss": 0.0061, "step": 8832 }, { "epoch": 4.018653321201092, "grad_norm": 0.1457687781031791, "learning_rate": 9.215754294907979e-08, "loss": 0.001, "step": 8833 }, { "epoch": 4.019108280254777, "grad_norm": 0.3105952990415875, "learning_rate": 9.207487534901564e-08, "loss": 0.0023, "step": 8834 }, { "epoch": 4.019563239308463, "grad_norm": 0.33752100205206836, "learning_rate": 9.199224108275039e-08, "loss": 0.0048, "step": 8835 }, { "epoch": 4.0200181983621475, "grad_norm": 0.4676876630262886, "learning_rate": 9.190964015703678e-08, "loss": 0.0053, "step": 8836 }, { "epoch": 4.020473157415832, "grad_norm": 0.5713963294447455, "learning_rate": 9.182707257862443e-08, "loss": 0.0145, "step": 8837 }, { "epoch": 4.020928116469518, "grad_norm": 0.37488873192571764, "learning_rate": 9.174453835426033e-08, "loss": 0.0052, "step": 8838 }, { "epoch": 4.021383075523203, "grad_norm": 0.4523852947051741, "learning_rate": 9.166203749068896e-08, "loss": 0.0042, "step": 8839 }, { "epoch": 4.021838034576888, "grad_norm": 0.38467303363227545, "learning_rate": 9.157956999465188e-08, "loss": 0.0043, "step": 8840 }, { "epoch": 4.022292993630574, "grad_norm": 0.3525892697378404, "learning_rate": 9.149713587288793e-08, "loss": 0.0017, "step": 8841 }, { "epoch": 4.022747952684258, "grad_norm": 0.47335248164341187, "learning_rate": 9.141473513213316e-08, "loss": 0.0085, "step": 8842 }, { "epoch": 4.023202911737943, "grad_norm": 0.5676807456823151, "learning_rate": 9.133236777912107e-08, "loss": 0.0108, "step": 8843 }, { "epoch": 4.023657870791629, "grad_norm": 0.29737238315382963, "learning_rate": 9.125003382058244e-08, "loss": 0.0024, "step": 8844 }, { "epoch": 4.024112829845314, "grad_norm": 0.5836868482283706, "learning_rate": 9.116773326324517e-08, "loss": 0.0033, "step": 8845 }, { "epoch": 4.024567788898999, "grad_norm": 0.7663164077021085, "learning_rate": 9.10854661138345e-08, "loss": 0.0143, "step": 8846 }, { "epoch": 4.0250227479526846, "grad_norm": 0.28989563390851164, "learning_rate": 9.10032323790727e-08, "loss": 0.0039, "step": 8847 }, { "epoch": 4.025477707006369, "grad_norm": 0.43194576991678835, "learning_rate": 9.092103206567991e-08, "loss": 0.0052, "step": 8848 }, { "epoch": 4.025932666060054, "grad_norm": 0.4180209368231124, "learning_rate": 9.083886518037287e-08, "loss": 0.0015, "step": 8849 }, { "epoch": 4.02638762511374, "grad_norm": 0.634499666465118, "learning_rate": 9.075673172986615e-08, "loss": 0.0124, "step": 8850 }, { "epoch": 4.026842584167425, "grad_norm": 0.2966099281299469, "learning_rate": 9.067463172087114e-08, "loss": 0.0032, "step": 8851 }, { "epoch": 4.02729754322111, "grad_norm": 0.6236533114453934, "learning_rate": 9.059256516009662e-08, "loss": 0.0092, "step": 8852 }, { "epoch": 4.0277525022747955, "grad_norm": 0.36524498963466634, "learning_rate": 9.051053205424897e-08, "loss": 0.0037, "step": 8853 }, { "epoch": 4.02820746132848, "grad_norm": 0.731158305433035, "learning_rate": 9.042853241003134e-08, "loss": 0.0113, "step": 8854 }, { "epoch": 4.028662420382165, "grad_norm": 0.33347886954855066, "learning_rate": 9.034656623414449e-08, "loss": 0.0024, "step": 8855 }, { "epoch": 4.029117379435851, "grad_norm": 0.5662275113793954, "learning_rate": 9.026463353328612e-08, "loss": 0.0079, "step": 8856 }, { "epoch": 4.029572338489536, "grad_norm": 0.2968710396707196, "learning_rate": 9.018273431415157e-08, "loss": 0.0036, "step": 8857 }, { "epoch": 4.030027297543221, "grad_norm": 0.4583505039470318, "learning_rate": 9.010086858343335e-08, "loss": 0.0049, "step": 8858 }, { "epoch": 4.0304822565969065, "grad_norm": 1.0158798890237448, "learning_rate": 9.00190363478211e-08, "loss": 0.006, "step": 8859 }, { "epoch": 4.030937215650591, "grad_norm": 0.4095927119297428, "learning_rate": 8.993723761400168e-08, "loss": 0.0074, "step": 8860 }, { "epoch": 4.031392174704276, "grad_norm": 0.35224879841575174, "learning_rate": 8.985547238865932e-08, "loss": 0.0023, "step": 8861 }, { "epoch": 4.031847133757962, "grad_norm": 0.6087578960636463, "learning_rate": 8.977374067847566e-08, "loss": 0.0095, "step": 8862 }, { "epoch": 4.032302092811647, "grad_norm": 0.45308490008517055, "learning_rate": 8.969204249012918e-08, "loss": 0.0072, "step": 8863 }, { "epoch": 4.032757051865332, "grad_norm": 1.1814543063658036, "learning_rate": 8.961037783029618e-08, "loss": 0.0256, "step": 8864 }, { "epoch": 4.0332120109190175, "grad_norm": 0.44673744637222773, "learning_rate": 8.952874670564986e-08, "loss": 0.0068, "step": 8865 }, { "epoch": 4.033666969972702, "grad_norm": 0.48174884847392213, "learning_rate": 8.944714912286049e-08, "loss": 0.0064, "step": 8866 }, { "epoch": 4.034121929026387, "grad_norm": 0.4394840264003743, "learning_rate": 8.936558508859627e-08, "loss": 0.0035, "step": 8867 }, { "epoch": 4.034576888080073, "grad_norm": 0.19345867856534024, "learning_rate": 8.928405460952199e-08, "loss": 0.0011, "step": 8868 }, { "epoch": 4.035031847133758, "grad_norm": 0.3986527053930723, "learning_rate": 8.920255769229995e-08, "loss": 0.0048, "step": 8869 }, { "epoch": 4.035486806187443, "grad_norm": 0.4106609637864481, "learning_rate": 8.912109434358967e-08, "loss": 0.0033, "step": 8870 }, { "epoch": 4.0359417652411285, "grad_norm": 0.4438894216885987, "learning_rate": 8.903966457004802e-08, "loss": 0.0079, "step": 8871 }, { "epoch": 4.036396724294813, "grad_norm": 0.36429938173491483, "learning_rate": 8.895826837832927e-08, "loss": 0.0041, "step": 8872 }, { "epoch": 4.036851683348498, "grad_norm": 0.5453385584578645, "learning_rate": 8.88769057750845e-08, "loss": 0.0057, "step": 8873 }, { "epoch": 4.037306642402184, "grad_norm": 0.44934972715896765, "learning_rate": 8.879557676696243e-08, "loss": 0.0053, "step": 8874 }, { "epoch": 4.037761601455869, "grad_norm": 0.5677247169213188, "learning_rate": 8.871428136060883e-08, "loss": 0.0052, "step": 8875 }, { "epoch": 4.038216560509555, "grad_norm": 0.6416932892018257, "learning_rate": 8.863301956266673e-08, "loss": 0.0099, "step": 8876 }, { "epoch": 4.038671519563239, "grad_norm": 0.5654201310993424, "learning_rate": 8.855179137977648e-08, "loss": 0.0046, "step": 8877 }, { "epoch": 4.039126478616924, "grad_norm": 0.3843953968014701, "learning_rate": 8.847059681857594e-08, "loss": 0.0037, "step": 8878 }, { "epoch": 4.03958143767061, "grad_norm": 0.6230174294233408, "learning_rate": 8.838943588569975e-08, "loss": 0.0137, "step": 8879 }, { "epoch": 4.040036396724295, "grad_norm": 0.3301490057008659, "learning_rate": 8.830830858777999e-08, "loss": 0.0038, "step": 8880 }, { "epoch": 4.04049135577798, "grad_norm": 0.34918892006605246, "learning_rate": 8.822721493144602e-08, "loss": 0.0022, "step": 8881 }, { "epoch": 4.0409463148316656, "grad_norm": 0.5805825940870816, "learning_rate": 8.814615492332461e-08, "loss": 0.0068, "step": 8882 }, { "epoch": 4.04140127388535, "grad_norm": 0.7752546595782669, "learning_rate": 8.80651285700395e-08, "loss": 0.0095, "step": 8883 }, { "epoch": 4.041856232939035, "grad_norm": 0.6110308210608789, "learning_rate": 8.798413587821162e-08, "loss": 0.0106, "step": 8884 }, { "epoch": 4.042311191992721, "grad_norm": 0.4902328365228221, "learning_rate": 8.79031768544597e-08, "loss": 0.0096, "step": 8885 }, { "epoch": 4.042766151046406, "grad_norm": 0.5702715588542698, "learning_rate": 8.782225150539902e-08, "loss": 0.0102, "step": 8886 }, { "epoch": 4.043221110100091, "grad_norm": 0.6315130541863503, "learning_rate": 8.77413598376427e-08, "loss": 0.0059, "step": 8887 }, { "epoch": 4.0436760691537765, "grad_norm": 0.37849373868250225, "learning_rate": 8.766050185780066e-08, "loss": 0.0045, "step": 8888 }, { "epoch": 4.044131028207461, "grad_norm": 0.5347285644781848, "learning_rate": 8.757967757248036e-08, "loss": 0.0037, "step": 8889 }, { "epoch": 4.044585987261146, "grad_norm": 0.3458355983808249, "learning_rate": 8.749888698828616e-08, "loss": 0.0043, "step": 8890 }, { "epoch": 4.045040946314832, "grad_norm": 0.7040434674089914, "learning_rate": 8.741813011182015e-08, "loss": 0.0079, "step": 8891 }, { "epoch": 4.045495905368517, "grad_norm": 0.5780454889227109, "learning_rate": 8.733740694968139e-08, "loss": 0.0102, "step": 8892 }, { "epoch": 4.045950864422202, "grad_norm": 0.40418077233784155, "learning_rate": 8.725671750846619e-08, "loss": 0.0074, "step": 8893 }, { "epoch": 4.0464058234758875, "grad_norm": 0.5497631657890363, "learning_rate": 8.717606179476811e-08, "loss": 0.0053, "step": 8894 }, { "epoch": 4.046860782529572, "grad_norm": 0.34486927660524513, "learning_rate": 8.709543981517787e-08, "loss": 0.0033, "step": 8895 }, { "epoch": 4.047315741583257, "grad_norm": 0.7350460037858636, "learning_rate": 8.701485157628369e-08, "loss": 0.0184, "step": 8896 }, { "epoch": 4.047770700636943, "grad_norm": 0.534250275388981, "learning_rate": 8.693429708467088e-08, "loss": 0.0079, "step": 8897 }, { "epoch": 4.048225659690628, "grad_norm": 0.3731035769845565, "learning_rate": 8.685377634692176e-08, "loss": 0.0022, "step": 8898 }, { "epoch": 4.048680618744313, "grad_norm": 0.42319265689346924, "learning_rate": 8.677328936961641e-08, "loss": 0.003, "step": 8899 }, { "epoch": 4.0491355777979985, "grad_norm": 0.5711152257998708, "learning_rate": 8.669283615933159e-08, "loss": 0.0029, "step": 8900 }, { "epoch": 4.049590536851683, "grad_norm": 0.3884700077627492, "learning_rate": 8.66124167226419e-08, "loss": 0.0013, "step": 8901 }, { "epoch": 4.050045495905368, "grad_norm": 0.5013369746224542, "learning_rate": 8.653203106611867e-08, "loss": 0.0111, "step": 8902 }, { "epoch": 4.050500454959054, "grad_norm": 0.9416587587289139, "learning_rate": 8.645167919633062e-08, "loss": 0.014, "step": 8903 }, { "epoch": 4.050955414012739, "grad_norm": 0.48706553118102835, "learning_rate": 8.637136111984367e-08, "loss": 0.006, "step": 8904 }, { "epoch": 4.051410373066424, "grad_norm": 0.62490427709643, "learning_rate": 8.629107684322113e-08, "loss": 0.0175, "step": 8905 }, { "epoch": 4.0518653321201095, "grad_norm": 0.3791314419427227, "learning_rate": 8.621082637302368e-08, "loss": 0.0029, "step": 8906 }, { "epoch": 4.052320291173794, "grad_norm": 0.5589493690391825, "learning_rate": 8.613060971580877e-08, "loss": 0.0081, "step": 8907 }, { "epoch": 4.052775250227479, "grad_norm": 0.46578915565724627, "learning_rate": 8.605042687813146e-08, "loss": 0.0059, "step": 8908 }, { "epoch": 4.053230209281165, "grad_norm": 0.5537649177014999, "learning_rate": 8.597027786654387e-08, "loss": 0.0066, "step": 8909 }, { "epoch": 4.05368516833485, "grad_norm": 0.5124866717974282, "learning_rate": 8.589016268759536e-08, "loss": 0.0043, "step": 8910 }, { "epoch": 4.054140127388535, "grad_norm": 0.4790801149992505, "learning_rate": 8.581008134783274e-08, "loss": 0.0067, "step": 8911 }, { "epoch": 4.05459508644222, "grad_norm": 0.5583764431070996, "learning_rate": 8.573003385379968e-08, "loss": 0.0049, "step": 8912 }, { "epoch": 4.055050045495905, "grad_norm": 0.4924821351811271, "learning_rate": 8.565002021203754e-08, "loss": 0.0069, "step": 8913 }, { "epoch": 4.05550500454959, "grad_norm": 0.5653405421452806, "learning_rate": 8.557004042908455e-08, "loss": 0.0038, "step": 8914 }, { "epoch": 4.055959963603276, "grad_norm": 0.6262468471128794, "learning_rate": 8.54900945114762e-08, "loss": 0.0052, "step": 8915 }, { "epoch": 4.056414922656961, "grad_norm": 0.7327541313204637, "learning_rate": 8.541018246574555e-08, "loss": 0.0141, "step": 8916 }, { "epoch": 4.056869881710646, "grad_norm": 0.741701641345004, "learning_rate": 8.533030429842253e-08, "loss": 0.0068, "step": 8917 }, { "epoch": 4.057324840764331, "grad_norm": 0.32065923194371854, "learning_rate": 8.525046001603436e-08, "loss": 0.0071, "step": 8918 }, { "epoch": 4.057779799818016, "grad_norm": 0.6252636318229133, "learning_rate": 8.517064962510551e-08, "loss": 0.0071, "step": 8919 }, { "epoch": 4.058234758871701, "grad_norm": 0.3302967021214778, "learning_rate": 8.509087313215785e-08, "loss": 0.0028, "step": 8920 }, { "epoch": 4.058689717925387, "grad_norm": 0.39864597409929736, "learning_rate": 8.50111305437104e-08, "loss": 0.0045, "step": 8921 }, { "epoch": 4.059144676979072, "grad_norm": 0.285040984539163, "learning_rate": 8.493142186627934e-08, "loss": 0.0029, "step": 8922 }, { "epoch": 4.059599636032757, "grad_norm": 0.4552209541990534, "learning_rate": 8.485174710637799e-08, "loss": 0.0088, "step": 8923 }, { "epoch": 4.060054595086442, "grad_norm": 0.5227671137099362, "learning_rate": 8.4772106270517e-08, "loss": 0.0074, "step": 8924 }, { "epoch": 4.060509554140127, "grad_norm": 0.49337643350184374, "learning_rate": 8.469249936520445e-08, "loss": 0.0134, "step": 8925 }, { "epoch": 4.060964513193812, "grad_norm": 0.4438968364771477, "learning_rate": 8.461292639694517e-08, "loss": 0.0034, "step": 8926 }, { "epoch": 4.061419472247498, "grad_norm": 0.39551590721751545, "learning_rate": 8.453338737224186e-08, "loss": 0.0039, "step": 8927 }, { "epoch": 4.061874431301183, "grad_norm": 0.5541504996799945, "learning_rate": 8.445388229759387e-08, "loss": 0.011, "step": 8928 }, { "epoch": 4.0623293903548685, "grad_norm": 0.4581860598131042, "learning_rate": 8.43744111794979e-08, "loss": 0.0056, "step": 8929 }, { "epoch": 4.062784349408553, "grad_norm": 0.5938342436028076, "learning_rate": 8.429497402444824e-08, "loss": 0.0058, "step": 8930 }, { "epoch": 4.063239308462238, "grad_norm": 0.3307256684719887, "learning_rate": 8.4215570838936e-08, "loss": 0.003, "step": 8931 }, { "epoch": 4.063694267515924, "grad_norm": 0.6076674574032728, "learning_rate": 8.413620162944962e-08, "loss": 0.0094, "step": 8932 }, { "epoch": 4.064149226569609, "grad_norm": 0.5148502181368257, "learning_rate": 8.405686640247472e-08, "loss": 0.0115, "step": 8933 }, { "epoch": 4.064604185623294, "grad_norm": 0.6767826037710474, "learning_rate": 8.397756516449428e-08, "loss": 0.0066, "step": 8934 }, { "epoch": 4.0650591446769795, "grad_norm": 0.47187435800881156, "learning_rate": 8.389829792198866e-08, "loss": 0.0051, "step": 8935 }, { "epoch": 4.065514103730664, "grad_norm": 0.8315371406577065, "learning_rate": 8.381906468143496e-08, "loss": 0.0073, "step": 8936 }, { "epoch": 4.065969062784349, "grad_norm": 0.6257634678647042, "learning_rate": 8.373986544930789e-08, "loss": 0.0161, "step": 8937 }, { "epoch": 4.066424021838035, "grad_norm": 0.5103172894713726, "learning_rate": 8.366070023207905e-08, "loss": 0.0045, "step": 8938 }, { "epoch": 4.06687898089172, "grad_norm": 0.42493760807644543, "learning_rate": 8.358156903621776e-08, "loss": 0.0052, "step": 8939 }, { "epoch": 4.067333939945405, "grad_norm": 0.3807046976604801, "learning_rate": 8.350247186818998e-08, "loss": 0.0049, "step": 8940 }, { "epoch": 4.0677888989990905, "grad_norm": 0.42859530015838465, "learning_rate": 8.342340873445947e-08, "loss": 0.0077, "step": 8941 }, { "epoch": 4.068243858052775, "grad_norm": 0.4728657468653079, "learning_rate": 8.334437964148671e-08, "loss": 0.0081, "step": 8942 }, { "epoch": 4.06869881710646, "grad_norm": 0.5434895691532059, "learning_rate": 8.326538459572951e-08, "loss": 0.0106, "step": 8943 }, { "epoch": 4.069153776160146, "grad_norm": 0.3733438176085992, "learning_rate": 8.318642360364331e-08, "loss": 0.0056, "step": 8944 }, { "epoch": 4.069608735213831, "grad_norm": 0.5282451849881212, "learning_rate": 8.310749667168022e-08, "loss": 0.0021, "step": 8945 }, { "epoch": 4.070063694267516, "grad_norm": 0.6139000774223063, "learning_rate": 8.302860380628984e-08, "loss": 0.0038, "step": 8946 }, { "epoch": 4.070518653321201, "grad_norm": 0.6451301770100016, "learning_rate": 8.294974501391883e-08, "loss": 0.0109, "step": 8947 }, { "epoch": 4.070973612374886, "grad_norm": 0.6158082103407974, "learning_rate": 8.287092030101133e-08, "loss": 0.0026, "step": 8948 }, { "epoch": 4.071428571428571, "grad_norm": 0.44842477525798813, "learning_rate": 8.279212967400845e-08, "loss": 0.0041, "step": 8949 }, { "epoch": 4.071883530482257, "grad_norm": 0.33466960763240133, "learning_rate": 8.271337313934867e-08, "loss": 0.0031, "step": 8950 }, { "epoch": 4.072338489535942, "grad_norm": 0.5658554270142341, "learning_rate": 8.263465070346765e-08, "loss": 0.0062, "step": 8951 }, { "epoch": 4.072793448589627, "grad_norm": 0.6042572954620323, "learning_rate": 8.255596237279816e-08, "loss": 0.008, "step": 8952 }, { "epoch": 4.073248407643312, "grad_norm": 0.8116395242032914, "learning_rate": 8.247730815377013e-08, "loss": 0.0085, "step": 8953 }, { "epoch": 4.073703366696997, "grad_norm": 0.42690070286482545, "learning_rate": 8.239868805281097e-08, "loss": 0.0021, "step": 8954 }, { "epoch": 4.074158325750682, "grad_norm": 0.5233036738367217, "learning_rate": 8.232010207634526e-08, "loss": 0.0051, "step": 8955 }, { "epoch": 4.074613284804368, "grad_norm": 0.5886449112103362, "learning_rate": 8.22415502307946e-08, "loss": 0.0036, "step": 8956 }, { "epoch": 4.075068243858053, "grad_norm": 0.7130746108241274, "learning_rate": 8.216303252257789e-08, "loss": 0.0055, "step": 8957 }, { "epoch": 4.075523202911738, "grad_norm": 0.46009528740907907, "learning_rate": 8.208454895811106e-08, "loss": 0.0035, "step": 8958 }, { "epoch": 4.075978161965423, "grad_norm": 0.4787102984100384, "learning_rate": 8.200609954380777e-08, "loss": 0.0088, "step": 8959 }, { "epoch": 4.076433121019108, "grad_norm": 0.48501588605998586, "learning_rate": 8.192768428607839e-08, "loss": 0.0045, "step": 8960 }, { "epoch": 4.076888080072793, "grad_norm": 0.5526726298512198, "learning_rate": 8.184930319133048e-08, "loss": 0.0084, "step": 8961 }, { "epoch": 4.077343039126479, "grad_norm": 0.46242914691744114, "learning_rate": 8.177095626596931e-08, "loss": 0.0052, "step": 8962 }, { "epoch": 4.077797998180164, "grad_norm": 0.6927437140843602, "learning_rate": 8.169264351639671e-08, "loss": 0.02, "step": 8963 }, { "epoch": 4.078252957233849, "grad_norm": 0.38665319750928573, "learning_rate": 8.161436494901242e-08, "loss": 0.0054, "step": 8964 }, { "epoch": 4.078707916287534, "grad_norm": 0.5759115645389066, "learning_rate": 8.153612057021275e-08, "loss": 0.0052, "step": 8965 }, { "epoch": 4.079162875341219, "grad_norm": 0.21803010483103819, "learning_rate": 8.145791038639161e-08, "loss": 0.0013, "step": 8966 }, { "epoch": 4.079617834394904, "grad_norm": 0.5478298229732833, "learning_rate": 8.137973440393975e-08, "loss": 0.0076, "step": 8967 }, { "epoch": 4.08007279344859, "grad_norm": 0.5615907896225079, "learning_rate": 8.130159262924551e-08, "loss": 0.0076, "step": 8968 }, { "epoch": 4.080527752502275, "grad_norm": 0.5468509243294342, "learning_rate": 8.122348506869448e-08, "loss": 0.0045, "step": 8969 }, { "epoch": 4.08098271155596, "grad_norm": 0.5746094890912014, "learning_rate": 8.114541172866901e-08, "loss": 0.0076, "step": 8970 }, { "epoch": 4.081437670609645, "grad_norm": 0.48400536943728173, "learning_rate": 8.106737261554897e-08, "loss": 0.0022, "step": 8971 }, { "epoch": 4.08189262966333, "grad_norm": 0.4238303265227045, "learning_rate": 8.098936773571125e-08, "loss": 0.0037, "step": 8972 }, { "epoch": 4.082347588717015, "grad_norm": 0.6582862635183928, "learning_rate": 8.09113970955303e-08, "loss": 0.0121, "step": 8973 }, { "epoch": 4.082802547770701, "grad_norm": 0.644551560659728, "learning_rate": 8.083346070137737e-08, "loss": 0.0072, "step": 8974 }, { "epoch": 4.083257506824386, "grad_norm": 0.6150060241112704, "learning_rate": 8.075555855962097e-08, "loss": 0.0053, "step": 8975 }, { "epoch": 4.083712465878071, "grad_norm": 0.6737965158181375, "learning_rate": 8.067769067662717e-08, "loss": 0.0076, "step": 8976 }, { "epoch": 4.084167424931756, "grad_norm": 0.39287092976517235, "learning_rate": 8.059985705875872e-08, "loss": 0.0051, "step": 8977 }, { "epoch": 4.084622383985441, "grad_norm": 0.4168090627382752, "learning_rate": 8.052205771237602e-08, "loss": 0.0042, "step": 8978 }, { "epoch": 4.085077343039126, "grad_norm": 0.20747838832130147, "learning_rate": 8.044429264383651e-08, "loss": 0.0013, "step": 8979 }, { "epoch": 4.085532302092812, "grad_norm": 0.6680463791817772, "learning_rate": 8.036656185949464e-08, "loss": 0.0061, "step": 8980 }, { "epoch": 4.085987261146497, "grad_norm": 0.8010002470229728, "learning_rate": 8.028886536570234e-08, "loss": 0.0091, "step": 8981 }, { "epoch": 4.0864422202001816, "grad_norm": 0.4820768337889379, "learning_rate": 8.021120316880842e-08, "loss": 0.0031, "step": 8982 }, { "epoch": 4.086897179253867, "grad_norm": 0.4116832890885963, "learning_rate": 8.013357527515918e-08, "loss": 0.0029, "step": 8983 }, { "epoch": 4.087352138307552, "grad_norm": 0.5152479879382725, "learning_rate": 8.005598169109828e-08, "loss": 0.0089, "step": 8984 }, { "epoch": 4.087807097361237, "grad_norm": 0.5524680383914397, "learning_rate": 7.997842242296605e-08, "loss": 0.0089, "step": 8985 }, { "epoch": 4.088262056414923, "grad_norm": 0.5012320008774274, "learning_rate": 7.990089747710033e-08, "loss": 0.0068, "step": 8986 }, { "epoch": 4.088717015468608, "grad_norm": 0.6723176557020085, "learning_rate": 7.982340685983601e-08, "loss": 0.0113, "step": 8987 }, { "epoch": 4.089171974522293, "grad_norm": 0.42002321769731055, "learning_rate": 7.97459505775055e-08, "loss": 0.0062, "step": 8988 }, { "epoch": 4.089626933575978, "grad_norm": 0.4187687278577262, "learning_rate": 7.966852863643797e-08, "loss": 0.0055, "step": 8989 }, { "epoch": 4.090081892629663, "grad_norm": 0.4051874047301264, "learning_rate": 7.959114104296016e-08, "loss": 0.0052, "step": 8990 }, { "epoch": 4.090536851683349, "grad_norm": 0.44027094780933484, "learning_rate": 7.95137878033958e-08, "loss": 0.0045, "step": 8991 }, { "epoch": 4.090991810737034, "grad_norm": 0.48062049958760655, "learning_rate": 7.943646892406563e-08, "loss": 0.0061, "step": 8992 }, { "epoch": 4.091446769790719, "grad_norm": 0.5507454694876179, "learning_rate": 7.935918441128808e-08, "loss": 0.0091, "step": 8993 }, { "epoch": 4.091901728844404, "grad_norm": 0.6033143667233948, "learning_rate": 7.928193427137847e-08, "loss": 0.0082, "step": 8994 }, { "epoch": 4.092356687898089, "grad_norm": 0.5118933717658172, "learning_rate": 7.920471851064913e-08, "loss": 0.0088, "step": 8995 }, { "epoch": 4.092811646951774, "grad_norm": 0.3580852516026764, "learning_rate": 7.912753713540987e-08, "loss": 0.0013, "step": 8996 }, { "epoch": 4.09326660600546, "grad_norm": 0.5492222697683757, "learning_rate": 7.905039015196763e-08, "loss": 0.0052, "step": 8997 }, { "epoch": 4.093721565059145, "grad_norm": 0.7709215635210004, "learning_rate": 7.897327756662658e-08, "loss": 0.0123, "step": 8998 }, { "epoch": 4.09417652411283, "grad_norm": 0.7798072154917504, "learning_rate": 7.889619938568798e-08, "loss": 0.0166, "step": 8999 }, { "epoch": 4.094631483166515, "grad_norm": 0.4775966235680321, "learning_rate": 7.881915561545027e-08, "loss": 0.0034, "step": 9000 }, { "epoch": 4.0950864422202, "grad_norm": 0.7528733110132376, "learning_rate": 7.874214626220898e-08, "loss": 0.0086, "step": 9001 }, { "epoch": 4.095541401273885, "grad_norm": 0.5875765247762368, "learning_rate": 7.866517133225726e-08, "loss": 0.0056, "step": 9002 }, { "epoch": 4.095996360327571, "grad_norm": 0.43570559009201687, "learning_rate": 7.858823083188493e-08, "loss": 0.0055, "step": 9003 }, { "epoch": 4.096451319381256, "grad_norm": 0.7098018306632684, "learning_rate": 7.851132476737938e-08, "loss": 0.0049, "step": 9004 }, { "epoch": 4.096906278434941, "grad_norm": 0.421526148739396, "learning_rate": 7.843445314502489e-08, "loss": 0.0059, "step": 9005 }, { "epoch": 4.097361237488626, "grad_norm": 0.4462421295581641, "learning_rate": 7.835761597110308e-08, "loss": 0.005, "step": 9006 }, { "epoch": 4.097816196542311, "grad_norm": 0.44890207427659956, "learning_rate": 7.828081325189284e-08, "loss": 0.0031, "step": 9007 }, { "epoch": 4.098271155595996, "grad_norm": 0.5889584767492592, "learning_rate": 7.820404499367012e-08, "loss": 0.007, "step": 9008 }, { "epoch": 4.098726114649682, "grad_norm": 0.31738455198079424, "learning_rate": 7.812731120270799e-08, "loss": 0.0019, "step": 9009 }, { "epoch": 4.099181073703367, "grad_norm": 0.6720899509612405, "learning_rate": 7.805061188527673e-08, "loss": 0.007, "step": 9010 }, { "epoch": 4.099636032757052, "grad_norm": 0.6054309531474107, "learning_rate": 7.797394704764392e-08, "loss": 0.011, "step": 9011 }, { "epoch": 4.100090991810737, "grad_norm": 0.8742914208056248, "learning_rate": 7.789731669607446e-08, "loss": 0.0117, "step": 9012 }, { "epoch": 4.100545950864422, "grad_norm": 0.5425198866798454, "learning_rate": 7.782072083683011e-08, "loss": 0.0017, "step": 9013 }, { "epoch": 4.101000909918107, "grad_norm": 0.41459384431558854, "learning_rate": 7.774415947616986e-08, "loss": 0.0061, "step": 9014 }, { "epoch": 4.101455868971793, "grad_norm": 0.7659341868179248, "learning_rate": 7.766763262035004e-08, "loss": 0.0053, "step": 9015 }, { "epoch": 4.101910828025478, "grad_norm": 0.6249223444116285, "learning_rate": 7.759114027562386e-08, "loss": 0.0025, "step": 9016 }, { "epoch": 4.1023657870791626, "grad_norm": 0.6743935427195442, "learning_rate": 7.751468244824216e-08, "loss": 0.0153, "step": 9017 }, { "epoch": 4.102820746132848, "grad_norm": 0.4803188868123222, "learning_rate": 7.743825914445285e-08, "loss": 0.0045, "step": 9018 }, { "epoch": 4.103275705186533, "grad_norm": 0.5377627913553741, "learning_rate": 7.736187037050069e-08, "loss": 0.0024, "step": 9019 }, { "epoch": 4.103730664240218, "grad_norm": 0.5636481366794368, "learning_rate": 7.728551613262785e-08, "loss": 0.0031, "step": 9020 }, { "epoch": 4.104185623293904, "grad_norm": 0.4841633198533176, "learning_rate": 7.720919643707358e-08, "loss": 0.0094, "step": 9021 }, { "epoch": 4.104640582347589, "grad_norm": 0.44601443154263765, "learning_rate": 7.713291129007455e-08, "loss": 0.0068, "step": 9022 }, { "epoch": 4.1050955414012735, "grad_norm": 0.380449224009512, "learning_rate": 7.705666069786437e-08, "loss": 0.0035, "step": 9023 }, { "epoch": 4.105550500454959, "grad_norm": 0.6170488811676121, "learning_rate": 7.69804446666738e-08, "loss": 0.0031, "step": 9024 }, { "epoch": 4.106005459508644, "grad_norm": 0.6311395062197727, "learning_rate": 7.690426320273102e-08, "loss": 0.0103, "step": 9025 }, { "epoch": 4.106460418562329, "grad_norm": 0.5433917951968573, "learning_rate": 7.682811631226111e-08, "loss": 0.0103, "step": 9026 }, { "epoch": 4.106915377616015, "grad_norm": 0.6675226926402328, "learning_rate": 7.675200400148657e-08, "loss": 0.0081, "step": 9027 }, { "epoch": 4.1073703366697, "grad_norm": 1.0275644540432627, "learning_rate": 7.667592627662689e-08, "loss": 0.0066, "step": 9028 }, { "epoch": 4.1078252957233845, "grad_norm": 0.6729498974655868, "learning_rate": 7.659988314389886e-08, "loss": 0.0125, "step": 9029 }, { "epoch": 4.10828025477707, "grad_norm": 0.7402345323016937, "learning_rate": 7.652387460951614e-08, "loss": 0.0183, "step": 9030 }, { "epoch": 4.108735213830755, "grad_norm": 0.47938138467935215, "learning_rate": 7.644790067969004e-08, "loss": 0.0048, "step": 9031 }, { "epoch": 4.10919017288444, "grad_norm": 0.611394595647708, "learning_rate": 7.637196136062885e-08, "loss": 0.0045, "step": 9032 }, { "epoch": 4.109645131938126, "grad_norm": 0.4027979680177033, "learning_rate": 7.629605665853789e-08, "loss": 0.0029, "step": 9033 }, { "epoch": 4.110100090991811, "grad_norm": 0.5864167498835922, "learning_rate": 7.62201865796197e-08, "loss": 0.006, "step": 9034 }, { "epoch": 4.1105550500454955, "grad_norm": 0.7069528711149985, "learning_rate": 7.614435113007406e-08, "loss": 0.0152, "step": 9035 }, { "epoch": 4.111010009099181, "grad_norm": 0.6137647734235206, "learning_rate": 7.606855031609799e-08, "loss": 0.0085, "step": 9036 }, { "epoch": 4.111464968152866, "grad_norm": 0.4097397171828531, "learning_rate": 7.599278414388544e-08, "loss": 0.0041, "step": 9037 }, { "epoch": 4.111919927206552, "grad_norm": 0.6475002645392067, "learning_rate": 7.591705261962784e-08, "loss": 0.0169, "step": 9038 }, { "epoch": 4.112374886260237, "grad_norm": 0.6025362694955504, "learning_rate": 7.584135574951361e-08, "loss": 0.0041, "step": 9039 }, { "epoch": 4.112829845313922, "grad_norm": 0.7543029050777724, "learning_rate": 7.576569353972817e-08, "loss": 0.0165, "step": 9040 }, { "epoch": 4.113284804367607, "grad_norm": 0.4404136081037151, "learning_rate": 7.569006599645456e-08, "loss": 0.0022, "step": 9041 }, { "epoch": 4.113739763421292, "grad_norm": 0.6126544888379387, "learning_rate": 7.561447312587255e-08, "loss": 0.01, "step": 9042 }, { "epoch": 4.114194722474977, "grad_norm": 0.7072453298467516, "learning_rate": 7.553891493415932e-08, "loss": 0.0042, "step": 9043 }, { "epoch": 4.114649681528663, "grad_norm": 1.3593612716353587, "learning_rate": 7.546339142748897e-08, "loss": 0.0167, "step": 9044 }, { "epoch": 4.115104640582348, "grad_norm": 0.6611011336970904, "learning_rate": 7.53879026120331e-08, "loss": 0.0046, "step": 9045 }, { "epoch": 4.115559599636033, "grad_norm": 0.31215561511692147, "learning_rate": 7.53124484939604e-08, "loss": 0.0035, "step": 9046 }, { "epoch": 4.116014558689718, "grad_norm": 0.40891817999300395, "learning_rate": 7.523702907943658e-08, "loss": 0.0033, "step": 9047 }, { "epoch": 4.116469517743403, "grad_norm": 0.4909742906389428, "learning_rate": 7.516164437462453e-08, "loss": 0.0053, "step": 9048 }, { "epoch": 4.116924476797088, "grad_norm": 0.330769494470262, "learning_rate": 7.508629438568414e-08, "loss": 0.0024, "step": 9049 }, { "epoch": 4.117379435850774, "grad_norm": 0.6229118377431575, "learning_rate": 7.501097911877308e-08, "loss": 0.0057, "step": 9050 }, { "epoch": 4.117834394904459, "grad_norm": 0.48083984788328954, "learning_rate": 7.493569858004544e-08, "loss": 0.0081, "step": 9051 }, { "epoch": 4.1182893539581436, "grad_norm": 0.35155267746730623, "learning_rate": 7.486045277565307e-08, "loss": 0.0029, "step": 9052 }, { "epoch": 4.118744313011829, "grad_norm": 0.512111180880758, "learning_rate": 7.478524171174455e-08, "loss": 0.014, "step": 9053 }, { "epoch": 4.119199272065514, "grad_norm": 0.6038687799262934, "learning_rate": 7.471006539446583e-08, "loss": 0.0073, "step": 9054 }, { "epoch": 4.119654231119199, "grad_norm": 0.7039918573850416, "learning_rate": 7.463492382995989e-08, "loss": 0.0108, "step": 9055 }, { "epoch": 4.120109190172885, "grad_norm": 0.41008844662907107, "learning_rate": 7.455981702436714e-08, "loss": 0.0037, "step": 9056 }, { "epoch": 4.12056414922657, "grad_norm": 0.4391033710714802, "learning_rate": 7.44847449838249e-08, "loss": 0.0035, "step": 9057 }, { "epoch": 4.1210191082802545, "grad_norm": 0.4295868719178911, "learning_rate": 7.440970771446752e-08, "loss": 0.0064, "step": 9058 }, { "epoch": 4.12147406733394, "grad_norm": 0.5199419598911481, "learning_rate": 7.433470522242701e-08, "loss": 0.0111, "step": 9059 }, { "epoch": 4.121929026387625, "grad_norm": 0.48057689574287993, "learning_rate": 7.425973751383203e-08, "loss": 0.0062, "step": 9060 }, { "epoch": 4.12238398544131, "grad_norm": 0.5920336822671712, "learning_rate": 7.418480459480869e-08, "loss": 0.0044, "step": 9061 }, { "epoch": 4.122838944494996, "grad_norm": 0.5576716412319324, "learning_rate": 7.410990647148024e-08, "loss": 0.0121, "step": 9062 }, { "epoch": 4.123293903548681, "grad_norm": 0.8363501752236008, "learning_rate": 7.403504314996689e-08, "loss": 0.0057, "step": 9063 }, { "epoch": 4.1237488626023655, "grad_norm": 0.3756766370029143, "learning_rate": 7.396021463638607e-08, "loss": 0.002, "step": 9064 }, { "epoch": 4.124203821656051, "grad_norm": 0.6390094922364916, "learning_rate": 7.388542093685258e-08, "loss": 0.0113, "step": 9065 }, { "epoch": 4.124658780709736, "grad_norm": 0.5994160402895315, "learning_rate": 7.381066205747822e-08, "loss": 0.014, "step": 9066 }, { "epoch": 4.125113739763421, "grad_norm": 0.3043022284903256, "learning_rate": 7.373593800437194e-08, "loss": 0.0027, "step": 9067 }, { "epoch": 4.125568698817107, "grad_norm": 0.4613737961578401, "learning_rate": 7.366124878363982e-08, "loss": 0.0033, "step": 9068 }, { "epoch": 4.126023657870792, "grad_norm": 0.5234543232139194, "learning_rate": 7.358659440138498e-08, "loss": 0.0149, "step": 9069 }, { "epoch": 4.1264786169244765, "grad_norm": 0.5865057955841114, "learning_rate": 7.351197486370808e-08, "loss": 0.0023, "step": 9070 }, { "epoch": 4.126933575978162, "grad_norm": 0.9439542549402332, "learning_rate": 7.343739017670663e-08, "loss": 0.0061, "step": 9071 }, { "epoch": 4.127388535031847, "grad_norm": 0.7203941299659987, "learning_rate": 7.336284034647517e-08, "loss": 0.007, "step": 9072 }, { "epoch": 4.127843494085532, "grad_norm": 0.6628040744174141, "learning_rate": 7.328832537910584e-08, "loss": 0.0123, "step": 9073 }, { "epoch": 4.128298453139218, "grad_norm": 0.5805578242731112, "learning_rate": 7.321384528068747e-08, "loss": 0.0033, "step": 9074 }, { "epoch": 4.128753412192903, "grad_norm": 0.6554281114360814, "learning_rate": 7.31394000573064e-08, "loss": 0.0053, "step": 9075 }, { "epoch": 4.1292083712465875, "grad_norm": 0.19826865282246492, "learning_rate": 7.306498971504588e-08, "loss": 0.0008, "step": 9076 }, { "epoch": 4.129663330300273, "grad_norm": 0.47339133598292654, "learning_rate": 7.299061425998637e-08, "loss": 0.0052, "step": 9077 }, { "epoch": 4.130118289353958, "grad_norm": 0.5973107146155526, "learning_rate": 7.291627369820541e-08, "loss": 0.006, "step": 9078 }, { "epoch": 4.130573248407643, "grad_norm": 0.39098925592284983, "learning_rate": 7.284196803577785e-08, "loss": 0.0028, "step": 9079 }, { "epoch": 4.131028207461329, "grad_norm": 0.6399664247409599, "learning_rate": 7.276769727877574e-08, "loss": 0.0091, "step": 9080 }, { "epoch": 4.131483166515014, "grad_norm": 0.46533844796913443, "learning_rate": 7.269346143326804e-08, "loss": 0.0039, "step": 9081 }, { "epoch": 4.131938125568698, "grad_norm": 0.8086556921902142, "learning_rate": 7.261926050532102e-08, "loss": 0.0027, "step": 9082 }, { "epoch": 4.132393084622384, "grad_norm": 0.5890788561918963, "learning_rate": 7.254509450099782e-08, "loss": 0.0146, "step": 9083 }, { "epoch": 4.132848043676069, "grad_norm": 0.4841612094652644, "learning_rate": 7.247096342635927e-08, "loss": 0.0034, "step": 9084 }, { "epoch": 4.133303002729754, "grad_norm": 0.6686946273028823, "learning_rate": 7.239686728746291e-08, "loss": 0.006, "step": 9085 }, { "epoch": 4.13375796178344, "grad_norm": 0.47905212087943166, "learning_rate": 7.23228060903634e-08, "loss": 0.0056, "step": 9086 }, { "epoch": 4.1342129208371245, "grad_norm": 0.6309401654737001, "learning_rate": 7.224877984111288e-08, "loss": 0.0104, "step": 9087 }, { "epoch": 4.134667879890809, "grad_norm": 0.819657652676849, "learning_rate": 7.217478854576026e-08, "loss": 0.008, "step": 9088 }, { "epoch": 4.135122838944495, "grad_norm": 0.5205905353645909, "learning_rate": 7.210083221035201e-08, "loss": 0.0066, "step": 9089 }, { "epoch": 4.13557779799818, "grad_norm": 0.7297074746723039, "learning_rate": 7.202691084093138e-08, "loss": 0.0103, "step": 9090 }, { "epoch": 4.136032757051865, "grad_norm": 0.4710192171987028, "learning_rate": 7.195302444353884e-08, "loss": 0.0069, "step": 9091 }, { "epoch": 4.136487716105551, "grad_norm": 0.4109722552736249, "learning_rate": 7.187917302421214e-08, "loss": 0.0017, "step": 9092 }, { "epoch": 4.1369426751592355, "grad_norm": 0.28657108978143864, "learning_rate": 7.180535658898596e-08, "loss": 0.0015, "step": 9093 }, { "epoch": 4.13739763421292, "grad_norm": 0.1926528907209848, "learning_rate": 7.173157514389228e-08, "loss": 0.0006, "step": 9094 }, { "epoch": 4.137852593266606, "grad_norm": 0.6965196769413466, "learning_rate": 7.165782869496035e-08, "loss": 0.0152, "step": 9095 }, { "epoch": 4.138307552320291, "grad_norm": 0.47292207538922065, "learning_rate": 7.158411724821628e-08, "loss": 0.007, "step": 9096 }, { "epoch": 4.138762511373977, "grad_norm": 0.9249222607255045, "learning_rate": 7.151044080968343e-08, "loss": 0.0143, "step": 9097 }, { "epoch": 4.139217470427662, "grad_norm": 0.35597668473381255, "learning_rate": 7.143679938538227e-08, "loss": 0.0034, "step": 9098 }, { "epoch": 4.1396724294813465, "grad_norm": 0.4064280630635832, "learning_rate": 7.136319298133053e-08, "loss": 0.0057, "step": 9099 }, { "epoch": 4.140127388535032, "grad_norm": 0.776793305665938, "learning_rate": 7.128962160354291e-08, "loss": 0.0064, "step": 9100 }, { "epoch": 4.140582347588717, "grad_norm": 0.8868506562416912, "learning_rate": 7.12160852580314e-08, "loss": 0.0097, "step": 9101 }, { "epoch": 4.141037306642402, "grad_norm": 0.48434852331932127, "learning_rate": 7.114258395080508e-08, "loss": 0.0064, "step": 9102 }, { "epoch": 4.141492265696088, "grad_norm": 0.966754902883372, "learning_rate": 7.106911768786999e-08, "loss": 0.0053, "step": 9103 }, { "epoch": 4.141947224749773, "grad_norm": 0.6570092842012837, "learning_rate": 7.099568647522968e-08, "loss": 0.0127, "step": 9104 }, { "epoch": 4.1424021838034575, "grad_norm": 0.3850227204141224, "learning_rate": 7.092229031888446e-08, "loss": 0.0035, "step": 9105 }, { "epoch": 4.142857142857143, "grad_norm": 0.49328171561964834, "learning_rate": 7.084892922483204e-08, "loss": 0.0098, "step": 9106 }, { "epoch": 4.143312101910828, "grad_norm": 0.35495514050849025, "learning_rate": 7.077560319906694e-08, "loss": 0.0012, "step": 9107 }, { "epoch": 4.143767060964513, "grad_norm": 0.37878148469450446, "learning_rate": 7.070231224758122e-08, "loss": 0.0025, "step": 9108 }, { "epoch": 4.144222020018199, "grad_norm": 0.5596211703638053, "learning_rate": 7.062905637636396e-08, "loss": 0.0049, "step": 9109 }, { "epoch": 4.144676979071884, "grad_norm": 0.4379999791280165, "learning_rate": 7.055583559140115e-08, "loss": 0.0028, "step": 9110 }, { "epoch": 4.1451319381255685, "grad_norm": 0.46781262685760666, "learning_rate": 7.048264989867614e-08, "loss": 0.0029, "step": 9111 }, { "epoch": 4.145586897179254, "grad_norm": 0.6062285610337689, "learning_rate": 7.040949930416918e-08, "loss": 0.0073, "step": 9112 }, { "epoch": 4.146041856232939, "grad_norm": 1.024548439601093, "learning_rate": 7.033638381385803e-08, "loss": 0.0135, "step": 9113 }, { "epoch": 4.146496815286624, "grad_norm": 1.0127796815971148, "learning_rate": 7.026330343371712e-08, "loss": 0.0094, "step": 9114 }, { "epoch": 4.14695177434031, "grad_norm": 0.5426646274137572, "learning_rate": 7.019025816971851e-08, "loss": 0.0058, "step": 9115 }, { "epoch": 4.147406733393995, "grad_norm": 0.30713209567431515, "learning_rate": 7.011724802783103e-08, "loss": 0.0024, "step": 9116 }, { "epoch": 4.147861692447679, "grad_norm": 0.4349451147327164, "learning_rate": 7.004427301402055e-08, "loss": 0.0053, "step": 9117 }, { "epoch": 4.148316651501365, "grad_norm": 0.6991176339023728, "learning_rate": 6.997133313425058e-08, "loss": 0.0036, "step": 9118 }, { "epoch": 4.14877161055505, "grad_norm": 0.5537030388757426, "learning_rate": 6.989842839448123e-08, "loss": 0.006, "step": 9119 }, { "epoch": 4.149226569608735, "grad_norm": 0.6223855594690768, "learning_rate": 6.982555880066998e-08, "loss": 0.003, "step": 9120 }, { "epoch": 4.149681528662421, "grad_norm": 0.5399004292014686, "learning_rate": 6.975272435877133e-08, "loss": 0.0048, "step": 9121 }, { "epoch": 4.1501364877161055, "grad_norm": 0.3332039011963109, "learning_rate": 6.967992507473702e-08, "loss": 0.0039, "step": 9122 }, { "epoch": 4.15059144676979, "grad_norm": 0.7651320134149671, "learning_rate": 6.960716095451608e-08, "loss": 0.0114, "step": 9123 }, { "epoch": 4.151046405823476, "grad_norm": 0.5347638352624313, "learning_rate": 6.95344320040543e-08, "loss": 0.0046, "step": 9124 }, { "epoch": 4.151501364877161, "grad_norm": 0.48400487298841527, "learning_rate": 6.94617382292948e-08, "loss": 0.007, "step": 9125 }, { "epoch": 4.151956323930846, "grad_norm": 0.5587323344392633, "learning_rate": 6.938907963617774e-08, "loss": 0.0056, "step": 9126 }, { "epoch": 4.152411282984532, "grad_norm": 0.6054470470842175, "learning_rate": 6.93164562306403e-08, "loss": 0.0033, "step": 9127 }, { "epoch": 4.1528662420382165, "grad_norm": 0.5575523287993699, "learning_rate": 6.924386801861721e-08, "loss": 0.0099, "step": 9128 }, { "epoch": 4.153321201091901, "grad_norm": 0.8119110260299146, "learning_rate": 6.917131500603995e-08, "loss": 0.0036, "step": 9129 }, { "epoch": 4.153776160145587, "grad_norm": 0.34105648080472195, "learning_rate": 6.909879719883733e-08, "loss": 0.0042, "step": 9130 }, { "epoch": 4.154231119199272, "grad_norm": 0.6789796828296677, "learning_rate": 6.9026314602935e-08, "loss": 0.0028, "step": 9131 }, { "epoch": 4.154686078252957, "grad_norm": 1.3882306473620538, "learning_rate": 6.89538672242559e-08, "loss": 0.0184, "step": 9132 }, { "epoch": 4.155141037306643, "grad_norm": 0.613160311760209, "learning_rate": 6.888145506872029e-08, "loss": 0.0049, "step": 9133 }, { "epoch": 4.1555959963603275, "grad_norm": 0.35082163585162324, "learning_rate": 6.880907814224523e-08, "loss": 0.0021, "step": 9134 }, { "epoch": 4.156050955414012, "grad_norm": 0.6402979528860412, "learning_rate": 6.873673645074495e-08, "loss": 0.0149, "step": 9135 }, { "epoch": 4.156505914467698, "grad_norm": 0.5297538216315626, "learning_rate": 6.866443000013117e-08, "loss": 0.0079, "step": 9136 }, { "epoch": 4.156960873521383, "grad_norm": 0.5101108439722521, "learning_rate": 6.859215879631214e-08, "loss": 0.0099, "step": 9137 }, { "epoch": 4.157415832575068, "grad_norm": 0.7372542860236007, "learning_rate": 6.851992284519375e-08, "loss": 0.0031, "step": 9138 }, { "epoch": 4.157870791628754, "grad_norm": 0.7173234399178836, "learning_rate": 6.844772215267875e-08, "loss": 0.005, "step": 9139 }, { "epoch": 4.1583257506824385, "grad_norm": 0.5348786191137713, "learning_rate": 6.837555672466699e-08, "loss": 0.0036, "step": 9140 }, { "epoch": 4.158780709736123, "grad_norm": 0.7269877766401087, "learning_rate": 6.830342656705545e-08, "loss": 0.011, "step": 9141 }, { "epoch": 4.159235668789809, "grad_norm": 0.6933710768967669, "learning_rate": 6.823133168573836e-08, "loss": 0.0115, "step": 9142 }, { "epoch": 4.159690627843494, "grad_norm": 0.5802821133833084, "learning_rate": 6.81592720866071e-08, "loss": 0.0132, "step": 9143 }, { "epoch": 4.160145586897179, "grad_norm": 0.1547919731262612, "learning_rate": 6.808724777554997e-08, "loss": 0.0007, "step": 9144 }, { "epoch": 4.160600545950865, "grad_norm": 0.4899086987765263, "learning_rate": 6.801525875845243e-08, "loss": 0.0083, "step": 9145 }, { "epoch": 4.1610555050045495, "grad_norm": 0.7480161967469746, "learning_rate": 6.794330504119706e-08, "loss": 0.0097, "step": 9146 }, { "epoch": 4.161510464058235, "grad_norm": 0.8101812783588723, "learning_rate": 6.787138662966368e-08, "loss": 0.0064, "step": 9147 }, { "epoch": 4.16196542311192, "grad_norm": 0.7557769925489247, "learning_rate": 6.779950352972919e-08, "loss": 0.0102, "step": 9148 }, { "epoch": 4.162420382165605, "grad_norm": 0.7816671881440801, "learning_rate": 6.77276557472673e-08, "loss": 0.0116, "step": 9149 }, { "epoch": 4.162875341219291, "grad_norm": 0.7848121930769125, "learning_rate": 6.765584328814938e-08, "loss": 0.0188, "step": 9150 }, { "epoch": 4.163330300272976, "grad_norm": 0.4695347194562915, "learning_rate": 6.758406615824342e-08, "loss": 0.0036, "step": 9151 }, { "epoch": 4.16378525932666, "grad_norm": 1.04424918896818, "learning_rate": 6.751232436341487e-08, "loss": 0.0125, "step": 9152 }, { "epoch": 4.164240218380346, "grad_norm": 0.31344916867803974, "learning_rate": 6.74406179095261e-08, "loss": 0.0013, "step": 9153 }, { "epoch": 4.164695177434031, "grad_norm": 0.4441403960974415, "learning_rate": 6.73689468024366e-08, "loss": 0.0046, "step": 9154 }, { "epoch": 4.165150136487716, "grad_norm": 0.3733970974755575, "learning_rate": 6.72973110480029e-08, "loss": 0.0032, "step": 9155 }, { "epoch": 4.165605095541402, "grad_norm": 0.6030718286068278, "learning_rate": 6.722571065207888e-08, "loss": 0.005, "step": 9156 }, { "epoch": 4.1660600545950865, "grad_norm": 0.3481252466147254, "learning_rate": 6.715414562051552e-08, "loss": 0.0057, "step": 9157 }, { "epoch": 4.166515013648771, "grad_norm": 0.6544709198031885, "learning_rate": 6.70826159591607e-08, "loss": 0.003, "step": 9158 }, { "epoch": 4.166969972702457, "grad_norm": 0.5461400303161917, "learning_rate": 6.701112167385942e-08, "loss": 0.006, "step": 9159 }, { "epoch": 4.167424931756142, "grad_norm": 0.36689295761831087, "learning_rate": 6.693966277045394e-08, "loss": 0.0037, "step": 9160 }, { "epoch": 4.167879890809827, "grad_norm": 0.9114058147646281, "learning_rate": 6.686823925478335e-08, "loss": 0.0092, "step": 9161 }, { "epoch": 4.168334849863513, "grad_norm": 0.681545774666772, "learning_rate": 6.679685113268446e-08, "loss": 0.006, "step": 9162 }, { "epoch": 4.1687898089171975, "grad_norm": 0.6589138840176062, "learning_rate": 6.672549840999037e-08, "loss": 0.0056, "step": 9163 }, { "epoch": 4.169244767970882, "grad_norm": 0.7428701075063785, "learning_rate": 6.665418109253207e-08, "loss": 0.0052, "step": 9164 }, { "epoch": 4.169699727024568, "grad_norm": 0.4930528489637366, "learning_rate": 6.658289918613708e-08, "loss": 0.0061, "step": 9165 }, { "epoch": 4.170154686078253, "grad_norm": 0.957390833644832, "learning_rate": 6.651165269663017e-08, "loss": 0.0065, "step": 9166 }, { "epoch": 4.170609645131938, "grad_norm": 0.7100288560039918, "learning_rate": 6.644044162983353e-08, "loss": 0.0064, "step": 9167 }, { "epoch": 4.171064604185624, "grad_norm": 0.46648684342158336, "learning_rate": 6.6369265991566e-08, "loss": 0.0082, "step": 9168 }, { "epoch": 4.1715195632393085, "grad_norm": 0.4566169764508392, "learning_rate": 6.629812578764387e-08, "loss": 0.0037, "step": 9169 }, { "epoch": 4.171974522292993, "grad_norm": 0.38020565823967345, "learning_rate": 6.622702102388017e-08, "loss": 0.0028, "step": 9170 }, { "epoch": 4.172429481346679, "grad_norm": 0.6418597804806314, "learning_rate": 6.61559517060854e-08, "loss": 0.0127, "step": 9171 }, { "epoch": 4.172884440400364, "grad_norm": 0.6203202056556261, "learning_rate": 6.608491784006715e-08, "loss": 0.008, "step": 9172 }, { "epoch": 4.173339399454049, "grad_norm": 0.48520235236132586, "learning_rate": 6.601391943162987e-08, "loss": 0.0022, "step": 9173 }, { "epoch": 4.173794358507735, "grad_norm": 0.8373354609739322, "learning_rate": 6.594295648657528e-08, "loss": 0.0067, "step": 9174 }, { "epoch": 4.1742493175614195, "grad_norm": 0.49136583911697324, "learning_rate": 6.587202901070194e-08, "loss": 0.0036, "step": 9175 }, { "epoch": 4.174704276615104, "grad_norm": 0.5431844982658833, "learning_rate": 6.5801137009806e-08, "loss": 0.0041, "step": 9176 }, { "epoch": 4.17515923566879, "grad_norm": 0.43236758100164385, "learning_rate": 6.57302804896802e-08, "loss": 0.0021, "step": 9177 }, { "epoch": 4.175614194722475, "grad_norm": 0.7611562417729844, "learning_rate": 6.565945945611484e-08, "loss": 0.0153, "step": 9178 }, { "epoch": 4.17606915377616, "grad_norm": 0.525994466003869, "learning_rate": 6.558867391489703e-08, "loss": 0.0084, "step": 9179 }, { "epoch": 4.176524112829846, "grad_norm": 0.5615978174073254, "learning_rate": 6.551792387181087e-08, "loss": 0.0089, "step": 9180 }, { "epoch": 4.1769790718835305, "grad_norm": 0.6200209288682624, "learning_rate": 6.544720933263797e-08, "loss": 0.0177, "step": 9181 }, { "epoch": 4.177434030937215, "grad_norm": 0.6281797615024413, "learning_rate": 6.537653030315671e-08, "loss": 0.0102, "step": 9182 }, { "epoch": 4.177888989990901, "grad_norm": 0.3665832782546625, "learning_rate": 6.530588678914261e-08, "loss": 0.0029, "step": 9183 }, { "epoch": 4.178343949044586, "grad_norm": 0.41609049420373284, "learning_rate": 6.523527879636836e-08, "loss": 0.0044, "step": 9184 }, { "epoch": 4.178798908098271, "grad_norm": 0.3008566424404263, "learning_rate": 6.516470633060366e-08, "loss": 0.0019, "step": 9185 }, { "epoch": 4.179253867151957, "grad_norm": 0.7595408387148092, "learning_rate": 6.509416939761564e-08, "loss": 0.0085, "step": 9186 }, { "epoch": 4.179708826205641, "grad_norm": 0.5760309322072932, "learning_rate": 6.5023668003168e-08, "loss": 0.0147, "step": 9187 }, { "epoch": 4.180163785259326, "grad_norm": 0.6165125286073235, "learning_rate": 6.495320215302192e-08, "loss": 0.0074, "step": 9188 }, { "epoch": 4.180618744313012, "grad_norm": 0.29594355288190366, "learning_rate": 6.488277185293539e-08, "loss": 0.0016, "step": 9189 }, { "epoch": 4.181073703366697, "grad_norm": 0.6382409846816868, "learning_rate": 6.481237710866389e-08, "loss": 0.0062, "step": 9190 }, { "epoch": 4.181528662420382, "grad_norm": 0.6679903562879854, "learning_rate": 6.474201792595956e-08, "loss": 0.0092, "step": 9191 }, { "epoch": 4.1819836214740675, "grad_norm": 0.6585442320348704, "learning_rate": 6.467169431057202e-08, "loss": 0.0113, "step": 9192 }, { "epoch": 4.182438580527752, "grad_norm": 0.19972191885199655, "learning_rate": 6.460140626824762e-08, "loss": 0.0008, "step": 9193 }, { "epoch": 4.182893539581437, "grad_norm": 0.4573604123247014, "learning_rate": 6.453115380473001e-08, "loss": 0.0087, "step": 9194 }, { "epoch": 4.183348498635123, "grad_norm": 0.7321638906213631, "learning_rate": 6.446093692576004e-08, "loss": 0.011, "step": 9195 }, { "epoch": 4.183803457688808, "grad_norm": 0.542046694684335, "learning_rate": 6.439075563707547e-08, "loss": 0.0093, "step": 9196 }, { "epoch": 4.184258416742493, "grad_norm": 0.6036511521764452, "learning_rate": 6.432060994441113e-08, "loss": 0.0068, "step": 9197 }, { "epoch": 4.1847133757961785, "grad_norm": 0.5291691307236139, "learning_rate": 6.42504998534989e-08, "loss": 0.0034, "step": 9198 }, { "epoch": 4.185168334849863, "grad_norm": 0.5372662334712673, "learning_rate": 6.418042537006813e-08, "loss": 0.006, "step": 9199 }, { "epoch": 4.185623293903548, "grad_norm": 0.4760278918182915, "learning_rate": 6.411038649984474e-08, "loss": 0.0052, "step": 9200 }, { "epoch": 4.186078252957234, "grad_norm": 0.4571625590398452, "learning_rate": 6.40403832485522e-08, "loss": 0.0049, "step": 9201 }, { "epoch": 4.186533212010919, "grad_norm": 0.2776053049062435, "learning_rate": 6.39704156219108e-08, "loss": 0.0007, "step": 9202 }, { "epoch": 4.186988171064604, "grad_norm": 0.7212229526638557, "learning_rate": 6.390048362563789e-08, "loss": 0.0099, "step": 9203 }, { "epoch": 4.1874431301182895, "grad_norm": 1.4200904380060524, "learning_rate": 6.383058726544798e-08, "loss": 0.0032, "step": 9204 }, { "epoch": 4.187898089171974, "grad_norm": 0.5252152186837753, "learning_rate": 6.376072654705272e-08, "loss": 0.0069, "step": 9205 }, { "epoch": 4.188353048225659, "grad_norm": 0.8918350915225158, "learning_rate": 6.369090147616102e-08, "loss": 0.0063, "step": 9206 }, { "epoch": 4.188808007279345, "grad_norm": 0.5411172849900869, "learning_rate": 6.362111205847842e-08, "loss": 0.0076, "step": 9207 }, { "epoch": 4.18926296633303, "grad_norm": 0.6235700721214581, "learning_rate": 6.355135829970793e-08, "loss": 0.0066, "step": 9208 }, { "epoch": 4.189717925386716, "grad_norm": 1.0793879758239648, "learning_rate": 6.348164020554936e-08, "loss": 0.0145, "step": 9209 }, { "epoch": 4.1901728844404005, "grad_norm": 0.8717048228997795, "learning_rate": 6.341195778169988e-08, "loss": 0.01, "step": 9210 }, { "epoch": 4.190627843494085, "grad_norm": 0.5248988828740992, "learning_rate": 6.334231103385369e-08, "loss": 0.0055, "step": 9211 }, { "epoch": 4.191082802547771, "grad_norm": 0.45602102845621045, "learning_rate": 6.327269996770173e-08, "loss": 0.0081, "step": 9212 }, { "epoch": 4.191537761601456, "grad_norm": 0.5425889665322589, "learning_rate": 6.320312458893262e-08, "loss": 0.007, "step": 9213 }, { "epoch": 4.191992720655141, "grad_norm": 0.2789633370048948, "learning_rate": 6.31335849032315e-08, "loss": 0.0019, "step": 9214 }, { "epoch": 4.192447679708827, "grad_norm": 0.42253541408845124, "learning_rate": 6.306408091628107e-08, "loss": 0.0038, "step": 9215 }, { "epoch": 4.1929026387625115, "grad_norm": 0.5090399320864861, "learning_rate": 6.299461263376077e-08, "loss": 0.0087, "step": 9216 }, { "epoch": 4.193357597816196, "grad_norm": 0.5112994320603375, "learning_rate": 6.292518006134723e-08, "loss": 0.0064, "step": 9217 }, { "epoch": 4.193812556869882, "grad_norm": 0.5412904724851559, "learning_rate": 6.285578320471402e-08, "loss": 0.0054, "step": 9218 }, { "epoch": 4.194267515923567, "grad_norm": 0.5270082646385227, "learning_rate": 6.278642206953211e-08, "loss": 0.0069, "step": 9219 }, { "epoch": 4.194722474977252, "grad_norm": 0.48117944954741754, "learning_rate": 6.271709666146946e-08, "loss": 0.0109, "step": 9220 }, { "epoch": 4.195177434030938, "grad_norm": 0.2293543699641825, "learning_rate": 6.264780698619094e-08, "loss": 0.0015, "step": 9221 }, { "epoch": 4.195632393084622, "grad_norm": 0.4562799041182865, "learning_rate": 6.257855304935849e-08, "loss": 0.0051, "step": 9222 }, { "epoch": 4.196087352138307, "grad_norm": 0.5886106446178003, "learning_rate": 6.250933485663123e-08, "loss": 0.014, "step": 9223 }, { "epoch": 4.196542311191993, "grad_norm": 0.5295196669448965, "learning_rate": 6.244015241366556e-08, "loss": 0.0088, "step": 9224 }, { "epoch": 4.196997270245678, "grad_norm": 0.581575029310107, "learning_rate": 6.237100572611464e-08, "loss": 0.0149, "step": 9225 }, { "epoch": 4.197452229299363, "grad_norm": 0.5745744532536167, "learning_rate": 6.230189479962872e-08, "loss": 0.0057, "step": 9226 }, { "epoch": 4.1979071883530485, "grad_norm": 0.35439763113104583, "learning_rate": 6.223281963985539e-08, "loss": 0.0026, "step": 9227 }, { "epoch": 4.198362147406733, "grad_norm": 0.44718816969342273, "learning_rate": 6.216378025243902e-08, "loss": 0.0059, "step": 9228 }, { "epoch": 4.198817106460418, "grad_norm": 0.6116500675298241, "learning_rate": 6.209477664302138e-08, "loss": 0.0064, "step": 9229 }, { "epoch": 4.199272065514104, "grad_norm": 0.6359615222882241, "learning_rate": 6.202580881724106e-08, "loss": 0.0118, "step": 9230 }, { "epoch": 4.199727024567789, "grad_norm": 0.51442549651575, "learning_rate": 6.195687678073375e-08, "loss": 0.0032, "step": 9231 }, { "epoch": 4.200181983621474, "grad_norm": 0.7600188130719604, "learning_rate": 6.188798053913225e-08, "loss": 0.0075, "step": 9232 }, { "epoch": 4.2006369426751595, "grad_norm": 0.7260458933631289, "learning_rate": 6.181912009806628e-08, "loss": 0.0117, "step": 9233 }, { "epoch": 4.201091901728844, "grad_norm": 0.583033709163653, "learning_rate": 6.175029546316323e-08, "loss": 0.009, "step": 9234 }, { "epoch": 4.201546860782529, "grad_norm": 0.6135663139130656, "learning_rate": 6.168150664004695e-08, "loss": 0.0089, "step": 9235 }, { "epoch": 4.202001819836215, "grad_norm": 0.5423025078315111, "learning_rate": 6.161275363433849e-08, "loss": 0.003, "step": 9236 }, { "epoch": 4.2024567788899, "grad_norm": 0.5322157390923123, "learning_rate": 6.154403645165607e-08, "loss": 0.0044, "step": 9237 }, { "epoch": 4.202911737943585, "grad_norm": 0.7284239314977838, "learning_rate": 6.147535509761487e-08, "loss": 0.0045, "step": 9238 }, { "epoch": 4.2033666969972705, "grad_norm": 0.9698883955434968, "learning_rate": 6.140670957782734e-08, "loss": 0.0062, "step": 9239 }, { "epoch": 4.203821656050955, "grad_norm": 0.5152838439634974, "learning_rate": 6.133809989790273e-08, "loss": 0.0039, "step": 9240 }, { "epoch": 4.20427661510464, "grad_norm": 0.5069880887739758, "learning_rate": 6.126952606344776e-08, "loss": 0.005, "step": 9241 }, { "epoch": 4.204731574158326, "grad_norm": 0.5096227007739345, "learning_rate": 6.120098808006579e-08, "loss": 0.0077, "step": 9242 }, { "epoch": 4.205186533212011, "grad_norm": 0.7089696621604609, "learning_rate": 6.11324859533574e-08, "loss": 0.0108, "step": 9243 }, { "epoch": 4.205641492265696, "grad_norm": 0.721925430377814, "learning_rate": 6.106401968892044e-08, "loss": 0.016, "step": 9244 }, { "epoch": 4.2060964513193815, "grad_norm": 0.9495693249862123, "learning_rate": 6.09955892923496e-08, "loss": 0.0119, "step": 9245 }, { "epoch": 4.206551410373066, "grad_norm": 0.514982479647311, "learning_rate": 6.092719476923664e-08, "loss": 0.0079, "step": 9246 }, { "epoch": 4.207006369426751, "grad_norm": 0.3713532522262336, "learning_rate": 6.08588361251704e-08, "loss": 0.0024, "step": 9247 }, { "epoch": 4.207461328480437, "grad_norm": 0.42382090385071824, "learning_rate": 6.079051336573693e-08, "loss": 0.0031, "step": 9248 }, { "epoch": 4.207916287534122, "grad_norm": 0.71559110706801, "learning_rate": 6.072222649651936e-08, "loss": 0.0117, "step": 9249 }, { "epoch": 4.208371246587807, "grad_norm": 0.7500588822894471, "learning_rate": 6.065397552309765e-08, "loss": 0.0133, "step": 9250 }, { "epoch": 4.2088262056414925, "grad_norm": 0.5718545788373137, "learning_rate": 6.058576045104902e-08, "loss": 0.0081, "step": 9251 }, { "epoch": 4.209281164695177, "grad_norm": 1.0337966424081493, "learning_rate": 6.051758128594758e-08, "loss": 0.0049, "step": 9252 }, { "epoch": 4.209736123748862, "grad_norm": 0.8845416887095867, "learning_rate": 6.044943803336477e-08, "loss": 0.0071, "step": 9253 }, { "epoch": 4.210191082802548, "grad_norm": 0.43091977067692916, "learning_rate": 6.038133069886885e-08, "loss": 0.005, "step": 9254 }, { "epoch": 4.210646041856233, "grad_norm": 0.46970447895725165, "learning_rate": 6.031325928802533e-08, "loss": 0.0029, "step": 9255 }, { "epoch": 4.211101000909918, "grad_norm": 0.6981080847405878, "learning_rate": 6.024522380639668e-08, "loss": 0.0032, "step": 9256 }, { "epoch": 4.211555959963603, "grad_norm": 0.6048202291244242, "learning_rate": 6.01772242595423e-08, "loss": 0.0135, "step": 9257 }, { "epoch": 4.212010919017288, "grad_norm": 0.7870581787484018, "learning_rate": 6.010926065301907e-08, "loss": 0.0067, "step": 9258 }, { "epoch": 4.212465878070974, "grad_norm": 0.4301581352881331, "learning_rate": 6.004133299238051e-08, "loss": 0.0028, "step": 9259 }, { "epoch": 4.212920837124659, "grad_norm": 0.3420982949874867, "learning_rate": 5.997344128317738e-08, "loss": 0.0026, "step": 9260 }, { "epoch": 4.213375796178344, "grad_norm": 0.48674462705952204, "learning_rate": 5.990558553095743e-08, "loss": 0.0053, "step": 9261 }, { "epoch": 4.2138307552320295, "grad_norm": 0.5775396498527131, "learning_rate": 5.983776574126554e-08, "loss": 0.0117, "step": 9262 }, { "epoch": 4.214285714285714, "grad_norm": 0.6018689581034664, "learning_rate": 5.976998191964377e-08, "loss": 0.0078, "step": 9263 }, { "epoch": 4.214740673339399, "grad_norm": 0.425347946051962, "learning_rate": 5.9702234071631e-08, "loss": 0.005, "step": 9264 }, { "epoch": 4.215195632393085, "grad_norm": 0.4678564409638841, "learning_rate": 5.963452220276332e-08, "loss": 0.004, "step": 9265 }, { "epoch": 4.21565059144677, "grad_norm": 1.0589234545584085, "learning_rate": 5.956684631857384e-08, "loss": 0.0066, "step": 9266 }, { "epoch": 4.216105550500455, "grad_norm": 0.4772445845547828, "learning_rate": 5.949920642459255e-08, "loss": 0.0057, "step": 9267 }, { "epoch": 4.2165605095541405, "grad_norm": 0.31425483699582524, "learning_rate": 5.943160252634688e-08, "loss": 0.0011, "step": 9268 }, { "epoch": 4.217015468607825, "grad_norm": 0.6452583061913122, "learning_rate": 5.936403462936113e-08, "loss": 0.0095, "step": 9269 }, { "epoch": 4.21747042766151, "grad_norm": 0.8410581669445155, "learning_rate": 5.9296502739156596e-08, "loss": 0.0151, "step": 9270 }, { "epoch": 4.217925386715196, "grad_norm": 0.4508873429319743, "learning_rate": 5.922900686125165e-08, "loss": 0.0044, "step": 9271 }, { "epoch": 4.218380345768881, "grad_norm": 0.5582828364991763, "learning_rate": 5.91615470011616e-08, "loss": 0.0027, "step": 9272 }, { "epoch": 4.218835304822566, "grad_norm": 0.5620389360764227, "learning_rate": 5.9094123164399324e-08, "loss": 0.0075, "step": 9273 }, { "epoch": 4.2192902638762515, "grad_norm": 0.5844748166165546, "learning_rate": 5.902673535647412e-08, "loss": 0.0096, "step": 9274 }, { "epoch": 4.219745222929936, "grad_norm": 1.1511646789299455, "learning_rate": 5.8959383582892597e-08, "loss": 0.0049, "step": 9275 }, { "epoch": 4.220200181983621, "grad_norm": 0.5512207986946391, "learning_rate": 5.889206784915862e-08, "loss": 0.0025, "step": 9276 }, { "epoch": 4.220655141037307, "grad_norm": 2.184351054941728, "learning_rate": 5.882478816077274e-08, "loss": 0.0138, "step": 9277 }, { "epoch": 4.221110100090992, "grad_norm": 0.8400990496194704, "learning_rate": 5.875754452323295e-08, "loss": 0.0088, "step": 9278 }, { "epoch": 4.221565059144677, "grad_norm": 0.615087082296079, "learning_rate": 5.869033694203401e-08, "loss": 0.0067, "step": 9279 }, { "epoch": 4.2220200181983625, "grad_norm": 0.456056301779648, "learning_rate": 5.8623165422667764e-08, "loss": 0.0061, "step": 9280 }, { "epoch": 4.222474977252047, "grad_norm": 0.6392581306899648, "learning_rate": 5.8556029970623086e-08, "loss": 0.0111, "step": 9281 }, { "epoch": 4.222929936305732, "grad_norm": 0.5185075810103459, "learning_rate": 5.8488930591386154e-08, "loss": 0.008, "step": 9282 }, { "epoch": 4.223384895359418, "grad_norm": 1.1791557706820335, "learning_rate": 5.842186729044002e-08, "loss": 0.0073, "step": 9283 }, { "epoch": 4.223839854413103, "grad_norm": 0.46753356583294253, "learning_rate": 5.835484007326474e-08, "loss": 0.0062, "step": 9284 }, { "epoch": 4.224294813466788, "grad_norm": 0.7321903380070759, "learning_rate": 5.8287848945337503e-08, "loss": 0.0056, "step": 9285 }, { "epoch": 4.2247497725204735, "grad_norm": 0.36361699272876113, "learning_rate": 5.8220893912132364e-08, "loss": 0.0031, "step": 9286 }, { "epoch": 4.225204731574158, "grad_norm": 0.5766660835663937, "learning_rate": 5.815397497912084e-08, "loss": 0.0103, "step": 9287 }, { "epoch": 4.225659690627843, "grad_norm": 0.4094461373072103, "learning_rate": 5.80870921517711e-08, "loss": 0.0028, "step": 9288 }, { "epoch": 4.226114649681529, "grad_norm": 0.6533655539524674, "learning_rate": 5.802024543554845e-08, "loss": 0.017, "step": 9289 }, { "epoch": 4.226569608735214, "grad_norm": 0.4022319988765195, "learning_rate": 5.795343483591547e-08, "loss": 0.005, "step": 9290 }, { "epoch": 4.227024567788899, "grad_norm": 0.7392066312680196, "learning_rate": 5.788666035833145e-08, "loss": 0.0079, "step": 9291 }, { "epoch": 4.227479526842584, "grad_norm": 0.46132355022778104, "learning_rate": 5.7819922008253085e-08, "loss": 0.003, "step": 9292 }, { "epoch": 4.227934485896269, "grad_norm": 0.6868643483007406, "learning_rate": 5.775321979113379e-08, "loss": 0.011, "step": 9293 }, { "epoch": 4.228389444949954, "grad_norm": 0.23049988848299813, "learning_rate": 5.7686553712424204e-08, "loss": 0.0012, "step": 9294 }, { "epoch": 4.22884440400364, "grad_norm": 0.553004135263348, "learning_rate": 5.761992377757191e-08, "loss": 0.0052, "step": 9295 }, { "epoch": 4.229299363057325, "grad_norm": 1.4239661007336748, "learning_rate": 5.755332999202167e-08, "loss": 0.0224, "step": 9296 }, { "epoch": 4.22975432211101, "grad_norm": 0.2666523130024936, "learning_rate": 5.74867723612153e-08, "loss": 0.0022, "step": 9297 }, { "epoch": 4.230209281164695, "grad_norm": 0.5061508053134761, "learning_rate": 5.742025089059155e-08, "loss": 0.0033, "step": 9298 }, { "epoch": 4.23066424021838, "grad_norm": 0.4904629777904135, "learning_rate": 5.735376558558624e-08, "loss": 0.0066, "step": 9299 }, { "epoch": 4.231119199272065, "grad_norm": 0.6949741348423402, "learning_rate": 5.7287316451632095e-08, "loss": 0.0119, "step": 9300 }, { "epoch": 4.231574158325751, "grad_norm": 0.9906225673990221, "learning_rate": 5.722090349415931e-08, "loss": 0.0064, "step": 9301 }, { "epoch": 4.232029117379436, "grad_norm": 1.096600959661856, "learning_rate": 5.715452671859467e-08, "loss": 0.0137, "step": 9302 }, { "epoch": 4.232484076433121, "grad_norm": 0.6054345312313161, "learning_rate": 5.70881861303622e-08, "loss": 0.0134, "step": 9303 }, { "epoch": 4.232939035486806, "grad_norm": 0.608874700075734, "learning_rate": 5.7021881734883037e-08, "loss": 0.0063, "step": 9304 }, { "epoch": 4.233393994540491, "grad_norm": 0.48822860330470474, "learning_rate": 5.695561353757522e-08, "loss": 0.0068, "step": 9305 }, { "epoch": 4.233848953594176, "grad_norm": 0.6961702503294926, "learning_rate": 5.688938154385381e-08, "loss": 0.0138, "step": 9306 }, { "epoch": 4.234303912647862, "grad_norm": 0.5766425264991671, "learning_rate": 5.68231857591312e-08, "loss": 0.0047, "step": 9307 }, { "epoch": 4.234758871701547, "grad_norm": 0.6268161971856331, "learning_rate": 5.675702618881645e-08, "loss": 0.0098, "step": 9308 }, { "epoch": 4.235213830755232, "grad_norm": 1.1940000433976445, "learning_rate": 5.669090283831584e-08, "loss": 0.0089, "step": 9309 }, { "epoch": 4.235668789808917, "grad_norm": 0.3931700307624959, "learning_rate": 5.662481571303262e-08, "loss": 0.0025, "step": 9310 }, { "epoch": 4.236123748862602, "grad_norm": 0.5429762651440988, "learning_rate": 5.655876481836719e-08, "loss": 0.0086, "step": 9311 }, { "epoch": 4.236578707916287, "grad_norm": 0.6942431281338296, "learning_rate": 5.649275015971705e-08, "loss": 0.0078, "step": 9312 }, { "epoch": 4.237033666969973, "grad_norm": 0.5492972642776742, "learning_rate": 5.6426771742476454e-08, "loss": 0.0083, "step": 9313 }, { "epoch": 4.237488626023658, "grad_norm": 0.5508347856175015, "learning_rate": 5.636082957203697e-08, "loss": 0.005, "step": 9314 }, { "epoch": 4.237943585077343, "grad_norm": 1.214051765204512, "learning_rate": 5.629492365378691e-08, "loss": 0.0138, "step": 9315 }, { "epoch": 4.238398544131028, "grad_norm": 0.35917376243994714, "learning_rate": 5.6229053993112005e-08, "loss": 0.005, "step": 9316 }, { "epoch": 4.238853503184713, "grad_norm": 0.49553747879114124, "learning_rate": 5.616322059539469e-08, "loss": 0.0038, "step": 9317 }, { "epoch": 4.239308462238399, "grad_norm": 0.6615368494853329, "learning_rate": 5.6097423466014706e-08, "loss": 0.0093, "step": 9318 }, { "epoch": 4.239763421292084, "grad_norm": 0.5043582092704711, "learning_rate": 5.603166261034864e-08, "loss": 0.0047, "step": 9319 }, { "epoch": 4.240218380345769, "grad_norm": 0.6193435841413588, "learning_rate": 5.5965938033770085e-08, "loss": 0.0058, "step": 9320 }, { "epoch": 4.2406733393994545, "grad_norm": 0.5554476057600283, "learning_rate": 5.590024974164992e-08, "loss": 0.0183, "step": 9321 }, { "epoch": 4.241128298453139, "grad_norm": 0.6280560696861675, "learning_rate": 5.583459773935584e-08, "loss": 0.0055, "step": 9322 }, { "epoch": 4.241583257506824, "grad_norm": 0.8528239873067354, "learning_rate": 5.5768982032252555e-08, "loss": 0.0133, "step": 9323 }, { "epoch": 4.24203821656051, "grad_norm": 0.6841491312693043, "learning_rate": 5.570340262570184e-08, "loss": 0.0062, "step": 9324 }, { "epoch": 4.242493175614195, "grad_norm": 1.5305295243992338, "learning_rate": 5.563785952506267e-08, "loss": 0.0414, "step": 9325 }, { "epoch": 4.24294813466788, "grad_norm": 0.666696030852692, "learning_rate": 5.557235273569094e-08, "loss": 0.0073, "step": 9326 }, { "epoch": 4.243403093721565, "grad_norm": 0.6512158155708195, "learning_rate": 5.550688226293959e-08, "loss": 0.0106, "step": 9327 }, { "epoch": 4.24385805277525, "grad_norm": 0.6568165019892495, "learning_rate": 5.544144811215845e-08, "loss": 0.004, "step": 9328 }, { "epoch": 4.244313011828935, "grad_norm": 0.5557893153682193, "learning_rate": 5.537605028869452e-08, "loss": 0.0055, "step": 9329 }, { "epoch": 4.244767970882621, "grad_norm": 1.3425285758196215, "learning_rate": 5.531068879789191e-08, "loss": 0.005, "step": 9330 }, { "epoch": 4.245222929936306, "grad_norm": 0.6140239130683341, "learning_rate": 5.524536364509153e-08, "loss": 0.005, "step": 9331 }, { "epoch": 4.245677888989991, "grad_norm": 0.7526479592107905, "learning_rate": 5.518007483563164e-08, "loss": 0.008, "step": 9332 }, { "epoch": 4.246132848043676, "grad_norm": 0.5647406294605181, "learning_rate": 5.5114822374847215e-08, "loss": 0.0093, "step": 9333 }, { "epoch": 4.246587807097361, "grad_norm": 0.6627590293759416, "learning_rate": 5.504960626807037e-08, "loss": 0.0129, "step": 9334 }, { "epoch": 4.247042766151046, "grad_norm": 1.7841281476934812, "learning_rate": 5.49844265206304e-08, "loss": 0.0033, "step": 9335 }, { "epoch": 4.247497725204732, "grad_norm": 0.21317884755621, "learning_rate": 5.491928313785343e-08, "loss": 0.0014, "step": 9336 }, { "epoch": 4.247952684258417, "grad_norm": 0.557995583944161, "learning_rate": 5.4854176125062654e-08, "loss": 0.0042, "step": 9337 }, { "epoch": 4.248407643312102, "grad_norm": 0.6403609785125607, "learning_rate": 5.4789105487578256e-08, "loss": 0.012, "step": 9338 }, { "epoch": 4.248862602365787, "grad_norm": 0.5123852879883948, "learning_rate": 5.4724071230717594e-08, "loss": 0.0092, "step": 9339 }, { "epoch": 4.249317561419472, "grad_norm": 0.48008309829829915, "learning_rate": 5.465907335979514e-08, "loss": 0.0056, "step": 9340 }, { "epoch": 4.249772520473157, "grad_norm": 0.518571930349569, "learning_rate": 5.4594111880121976e-08, "loss": 0.0066, "step": 9341 }, { "epoch": 4.250227479526843, "grad_norm": 0.304710797773669, "learning_rate": 5.452918679700663e-08, "loss": 0.0031, "step": 9342 }, { "epoch": 4.250682438580528, "grad_norm": 0.7894692227809192, "learning_rate": 5.446429811575437e-08, "loss": 0.0152, "step": 9343 }, { "epoch": 4.251137397634213, "grad_norm": 0.31694777945072916, "learning_rate": 5.439944584166756e-08, "loss": 0.0022, "step": 9344 }, { "epoch": 4.251592356687898, "grad_norm": 0.5487788566745527, "learning_rate": 5.4334629980045734e-08, "loss": 0.0114, "step": 9345 }, { "epoch": 4.252047315741583, "grad_norm": 0.4085596649702885, "learning_rate": 5.4269850536185435e-08, "loss": 0.0018, "step": 9346 }, { "epoch": 4.252502274795268, "grad_norm": 56.7921256493716, "learning_rate": 5.420510751538004e-08, "loss": 0.1063, "step": 9347 }, { "epoch": 4.252957233848954, "grad_norm": 0.8913079825086652, "learning_rate": 5.414040092292005e-08, "loss": 0.0099, "step": 9348 }, { "epoch": 4.253412192902639, "grad_norm": 0.7409017449993437, "learning_rate": 5.4075730764092944e-08, "loss": 0.0079, "step": 9349 }, { "epoch": 4.253867151956324, "grad_norm": 0.7085128084675596, "learning_rate": 5.4011097044183383e-08, "loss": 0.005, "step": 9350 }, { "epoch": 4.254322111010009, "grad_norm": 0.609103265763282, "learning_rate": 5.3946499768472986e-08, "loss": 0.0055, "step": 9351 }, { "epoch": 4.254777070063694, "grad_norm": 0.24052946067905184, "learning_rate": 5.388193894224014e-08, "loss": 0.0036, "step": 9352 }, { "epoch": 4.255232029117379, "grad_norm": 0.501528822889359, "learning_rate": 5.381741457076067e-08, "loss": 0.0072, "step": 9353 }, { "epoch": 4.255686988171065, "grad_norm": 0.5205109176638023, "learning_rate": 5.375292665930703e-08, "loss": 0.0067, "step": 9354 }, { "epoch": 4.25614194722475, "grad_norm": 0.28234905617522843, "learning_rate": 5.368847521314912e-08, "loss": 0.0028, "step": 9355 }, { "epoch": 4.256596906278435, "grad_norm": 0.5566368826869889, "learning_rate": 5.3624060237553504e-08, "loss": 0.0061, "step": 9356 }, { "epoch": 4.25705186533212, "grad_norm": 0.5969890520146921, "learning_rate": 5.3559681737783854e-08, "loss": 0.0111, "step": 9357 }, { "epoch": 4.257506824385805, "grad_norm": 0.5714139245129622, "learning_rate": 5.3495339719100806e-08, "loss": 0.0072, "step": 9358 }, { "epoch": 4.25796178343949, "grad_norm": 0.6945062100089437, "learning_rate": 5.343103418676215e-08, "loss": 0.0157, "step": 9359 }, { "epoch": 4.258416742493176, "grad_norm": 0.5800809098061875, "learning_rate": 5.336676514602284e-08, "loss": 0.0131, "step": 9360 }, { "epoch": 4.258871701546861, "grad_norm": 0.6704867994047269, "learning_rate": 5.330253260213452e-08, "loss": 0.0082, "step": 9361 }, { "epoch": 4.2593266606005455, "grad_norm": 0.6715189322456898, "learning_rate": 5.323833656034593e-08, "loss": 0.0044, "step": 9362 }, { "epoch": 4.259781619654231, "grad_norm": 0.5441602899962157, "learning_rate": 5.317417702590282e-08, "loss": 0.0034, "step": 9363 }, { "epoch": 4.260236578707916, "grad_norm": 0.6553905901806301, "learning_rate": 5.3110054004048276e-08, "loss": 0.0122, "step": 9364 }, { "epoch": 4.260691537761602, "grad_norm": 0.5586773174667661, "learning_rate": 5.3045967500021935e-08, "loss": 0.0062, "step": 9365 }, { "epoch": 4.261146496815287, "grad_norm": 0.51495601709953, "learning_rate": 5.298191751906056e-08, "loss": 0.0026, "step": 9366 }, { "epoch": 4.261601455868972, "grad_norm": 0.4968693440913105, "learning_rate": 5.2917904066398344e-08, "loss": 0.003, "step": 9367 }, { "epoch": 4.262056414922657, "grad_norm": 0.8320646147410402, "learning_rate": 5.285392714726589e-08, "loss": 0.0061, "step": 9368 }, { "epoch": 4.262511373976342, "grad_norm": 0.5853569880032601, "learning_rate": 5.278998676689128e-08, "loss": 0.0041, "step": 9369 }, { "epoch": 4.262966333030027, "grad_norm": 0.6853844015762338, "learning_rate": 5.2726082930499407e-08, "loss": 0.0123, "step": 9370 }, { "epoch": 4.263421292083713, "grad_norm": 0.6317758863467045, "learning_rate": 5.266221564331214e-08, "loss": 0.0073, "step": 9371 }, { "epoch": 4.263876251137398, "grad_norm": 0.5249396115576442, "learning_rate": 5.259838491054836e-08, "loss": 0.0057, "step": 9372 }, { "epoch": 4.264331210191083, "grad_norm": 0.6077428182245569, "learning_rate": 5.253459073742411e-08, "loss": 0.0114, "step": 9373 }, { "epoch": 4.264786169244768, "grad_norm": 0.4386405781132177, "learning_rate": 5.247083312915246e-08, "loss": 0.0038, "step": 9374 }, { "epoch": 4.265241128298453, "grad_norm": 0.5366596782105959, "learning_rate": 5.240711209094334e-08, "loss": 0.0044, "step": 9375 }, { "epoch": 4.265696087352138, "grad_norm": 0.6891223243511643, "learning_rate": 5.234342762800364e-08, "loss": 0.0118, "step": 9376 }, { "epoch": 4.266151046405824, "grad_norm": 1.0957056868450794, "learning_rate": 5.227977974553749e-08, "loss": 0.0068, "step": 9377 }, { "epoch": 4.266606005459509, "grad_norm": 0.7981298243973242, "learning_rate": 5.221616844874577e-08, "loss": 0.0092, "step": 9378 }, { "epoch": 4.267060964513194, "grad_norm": 0.6596690146236318, "learning_rate": 5.2152593742826656e-08, "loss": 0.004, "step": 9379 }, { "epoch": 4.267515923566879, "grad_norm": 0.7853721315479987, "learning_rate": 5.2089055632975e-08, "loss": 0.0034, "step": 9380 }, { "epoch": 4.267970882620564, "grad_norm": 1.0484433546027765, "learning_rate": 5.2025554124383085e-08, "loss": 0.0101, "step": 9381 }, { "epoch": 4.268425841674249, "grad_norm": 1.2851399002991055, "learning_rate": 5.196208922223988e-08, "loss": 0.009, "step": 9382 }, { "epoch": 4.268880800727935, "grad_norm": 0.546889673566768, "learning_rate": 5.189866093173134e-08, "loss": 0.0095, "step": 9383 }, { "epoch": 4.26933575978162, "grad_norm": 0.6386480610130634, "learning_rate": 5.183526925804066e-08, "loss": 0.0085, "step": 9384 }, { "epoch": 4.269790718835305, "grad_norm": 0.5998613016882519, "learning_rate": 5.1771914206347914e-08, "loss": 0.0063, "step": 9385 }, { "epoch": 4.27024567788899, "grad_norm": 0.7748221757927256, "learning_rate": 5.170859578183018e-08, "loss": 0.0109, "step": 9386 }, { "epoch": 4.270700636942675, "grad_norm": 0.48156441429202484, "learning_rate": 5.164531398966138e-08, "loss": 0.0062, "step": 9387 }, { "epoch": 4.27115559599636, "grad_norm": 0.7852115129591168, "learning_rate": 5.1582068835012815e-08, "loss": 0.0119, "step": 9388 }, { "epoch": 4.271610555050046, "grad_norm": 0.5209289000451856, "learning_rate": 5.151886032305264e-08, "loss": 0.007, "step": 9389 }, { "epoch": 4.272065514103731, "grad_norm": 0.30405472443063014, "learning_rate": 5.1455688458945824e-08, "loss": 0.0031, "step": 9390 }, { "epoch": 4.272520473157416, "grad_norm": 0.45770288734185066, "learning_rate": 5.1392553247854574e-08, "loss": 0.0051, "step": 9391 }, { "epoch": 4.272975432211101, "grad_norm": 0.3004049366721968, "learning_rate": 5.132945469493788e-08, "loss": 0.0043, "step": 9392 }, { "epoch": 4.273430391264786, "grad_norm": 0.8274950994875618, "learning_rate": 5.126639280535211e-08, "loss": 0.0179, "step": 9393 }, { "epoch": 4.273885350318471, "grad_norm": 0.43959735672748934, "learning_rate": 5.120336758425009e-08, "loss": 0.0032, "step": 9394 }, { "epoch": 4.274340309372157, "grad_norm": 0.6300438128802501, "learning_rate": 5.114037903678225e-08, "loss": 0.0143, "step": 9395 }, { "epoch": 4.274795268425842, "grad_norm": 0.6580077814977938, "learning_rate": 5.1077427168095656e-08, "loss": 0.0105, "step": 9396 }, { "epoch": 4.2752502274795265, "grad_norm": 0.6899090763422313, "learning_rate": 5.101451198333423e-08, "loss": 0.0166, "step": 9397 }, { "epoch": 4.275705186533212, "grad_norm": 0.31985755721499964, "learning_rate": 5.095163348763942e-08, "loss": 0.0026, "step": 9398 }, { "epoch": 4.276160145586897, "grad_norm": 0.47773986596168927, "learning_rate": 5.088879168614918e-08, "loss": 0.0115, "step": 9399 }, { "epoch": 4.276615104640582, "grad_norm": 0.5244052163406036, "learning_rate": 5.082598658399878e-08, "loss": 0.0072, "step": 9400 }, { "epoch": 4.277070063694268, "grad_norm": 0.4895620196690139, "learning_rate": 5.0763218186320176e-08, "loss": 0.0059, "step": 9401 }, { "epoch": 4.277525022747953, "grad_norm": 0.47468377362140335, "learning_rate": 5.0700486498242653e-08, "loss": 0.0104, "step": 9402 }, { "epoch": 4.2779799818016375, "grad_norm": 0.6104161746535918, "learning_rate": 5.063779152489245e-08, "loss": 0.0112, "step": 9403 }, { "epoch": 4.278434940855323, "grad_norm": 0.3185695160802086, "learning_rate": 5.0575133271392626e-08, "loss": 0.0023, "step": 9404 }, { "epoch": 4.278889899909008, "grad_norm": 0.41697320417716466, "learning_rate": 5.0512511742863304e-08, "loss": 0.0034, "step": 9405 }, { "epoch": 4.279344858962693, "grad_norm": 0.5617623833862839, "learning_rate": 5.0449926944421574e-08, "loss": 0.0073, "step": 9406 }, { "epoch": 4.279799818016379, "grad_norm": 0.5324591647247603, "learning_rate": 5.0387378881181777e-08, "loss": 0.0071, "step": 9407 }, { "epoch": 4.280254777070064, "grad_norm": 0.5868759087288522, "learning_rate": 5.032486755825482e-08, "loss": 0.0059, "step": 9408 }, { "epoch": 4.2807097361237485, "grad_norm": 0.2295283617101413, "learning_rate": 5.026239298074908e-08, "loss": 0.0012, "step": 9409 }, { "epoch": 4.281164695177434, "grad_norm": 0.48177132198739814, "learning_rate": 5.019995515376962e-08, "loss": 0.0056, "step": 9410 }, { "epoch": 4.281619654231119, "grad_norm": 0.36090687623742557, "learning_rate": 5.013755408241849e-08, "loss": 0.0055, "step": 9411 }, { "epoch": 4.282074613284804, "grad_norm": 0.5247834487310132, "learning_rate": 5.007518977179481e-08, "loss": 0.0093, "step": 9412 }, { "epoch": 4.28252957233849, "grad_norm": 0.5506404206914612, "learning_rate": 5.001286222699491e-08, "loss": 0.0033, "step": 9413 }, { "epoch": 4.282984531392175, "grad_norm": 0.8466084982208111, "learning_rate": 4.995057145311171e-08, "loss": 0.0088, "step": 9414 }, { "epoch": 4.2834394904458595, "grad_norm": 0.6140412283412857, "learning_rate": 4.9888317455235364e-08, "loss": 0.0087, "step": 9415 }, { "epoch": 4.283894449499545, "grad_norm": 0.7812788723445957, "learning_rate": 4.9826100238453126e-08, "loss": 0.0083, "step": 9416 }, { "epoch": 4.28434940855323, "grad_norm": 0.6934676041415352, "learning_rate": 4.9763919807848884e-08, "loss": 0.0064, "step": 9417 }, { "epoch": 4.284804367606915, "grad_norm": 0.4362050102349615, "learning_rate": 4.970177616850396e-08, "loss": 0.0085, "step": 9418 }, { "epoch": 4.285259326660601, "grad_norm": 0.48004085432767535, "learning_rate": 4.96396693254964e-08, "loss": 0.0039, "step": 9419 }, { "epoch": 4.285714285714286, "grad_norm": 3.5081187950589166, "learning_rate": 4.957759928390121e-08, "loss": 0.0241, "step": 9420 }, { "epoch": 4.2861692447679705, "grad_norm": 0.30221392874343606, "learning_rate": 4.951556604879048e-08, "loss": 0.0024, "step": 9421 }, { "epoch": 4.286624203821656, "grad_norm": 0.46389151938644124, "learning_rate": 4.945356962523328e-08, "loss": 0.003, "step": 9422 }, { "epoch": 4.287079162875341, "grad_norm": 0.6087511891036472, "learning_rate": 4.9391610018295784e-08, "loss": 0.0132, "step": 9423 }, { "epoch": 4.287534121929026, "grad_norm": 0.49489897827927865, "learning_rate": 4.932968723304104e-08, "loss": 0.0061, "step": 9424 }, { "epoch": 4.287989080982712, "grad_norm": 0.3934064815532094, "learning_rate": 4.926780127452901e-08, "loss": 0.0042, "step": 9425 }, { "epoch": 4.288444040036397, "grad_norm": 0.41363480497713595, "learning_rate": 4.92059521478167e-08, "loss": 0.0034, "step": 9426 }, { "epoch": 4.288898999090081, "grad_norm": 0.5676302545113459, "learning_rate": 4.914413985795829e-08, "loss": 0.0047, "step": 9427 }, { "epoch": 4.289353958143767, "grad_norm": 0.45820075886626976, "learning_rate": 4.908236441000474e-08, "loss": 0.0033, "step": 9428 }, { "epoch": 4.289808917197452, "grad_norm": 0.48423140853084573, "learning_rate": 4.9020625809003956e-08, "loss": 0.0034, "step": 9429 }, { "epoch": 4.290263876251138, "grad_norm": 0.5610181916896234, "learning_rate": 4.8958924060001125e-08, "loss": 0.0064, "step": 9430 }, { "epoch": 4.290718835304823, "grad_norm": 0.37802837294024544, "learning_rate": 4.8897259168037994e-08, "loss": 0.0023, "step": 9431 }, { "epoch": 4.2911737943585075, "grad_norm": 0.675219744082812, "learning_rate": 4.8835631138153866e-08, "loss": 0.0046, "step": 9432 }, { "epoch": 4.291628753412193, "grad_norm": 0.6036804427104768, "learning_rate": 4.877403997538443e-08, "loss": 0.0095, "step": 9433 }, { "epoch": 4.292083712465878, "grad_norm": 0.686375678763148, "learning_rate": 4.8712485684762775e-08, "loss": 0.0025, "step": 9434 }, { "epoch": 4.292538671519563, "grad_norm": 0.3093932665227299, "learning_rate": 4.86509682713187e-08, "loss": 0.0017, "step": 9435 }, { "epoch": 4.292993630573249, "grad_norm": 0.5342305175525136, "learning_rate": 4.858948774007921e-08, "loss": 0.0082, "step": 9436 }, { "epoch": 4.293448589626934, "grad_norm": 0.6928338786860598, "learning_rate": 4.852804409606831e-08, "loss": 0.0081, "step": 9437 }, { "epoch": 4.2939035486806185, "grad_norm": 0.3789780653684574, "learning_rate": 4.8466637344306836e-08, "loss": 0.0034, "step": 9438 }, { "epoch": 4.294358507734304, "grad_norm": 0.6893620233327711, "learning_rate": 4.840526748981266e-08, "loss": 0.0078, "step": 9439 }, { "epoch": 4.294813466787989, "grad_norm": 0.6394626046776511, "learning_rate": 4.8343934537600494e-08, "loss": 0.0167, "step": 9440 }, { "epoch": 4.295268425841674, "grad_norm": 0.6355218048543105, "learning_rate": 4.828263849268249e-08, "loss": 0.0084, "step": 9441 }, { "epoch": 4.29572338489536, "grad_norm": 0.24976905516713319, "learning_rate": 4.822137936006732e-08, "loss": 0.0013, "step": 9442 }, { "epoch": 4.296178343949045, "grad_norm": 0.6184727829396881, "learning_rate": 4.816015714476074e-08, "loss": 0.005, "step": 9443 }, { "epoch": 4.2966333030027295, "grad_norm": 0.5375678379179414, "learning_rate": 4.8098971851765645e-08, "loss": 0.0111, "step": 9444 }, { "epoch": 4.297088262056415, "grad_norm": 1.046757191796702, "learning_rate": 4.8037823486081764e-08, "loss": 0.0042, "step": 9445 }, { "epoch": 4.2975432211101, "grad_norm": 0.6457401677557607, "learning_rate": 4.797671205270604e-08, "loss": 0.0082, "step": 9446 }, { "epoch": 4.297998180163785, "grad_norm": 1.7710960930493718, "learning_rate": 4.7915637556632026e-08, "loss": 0.0344, "step": 9447 }, { "epoch": 4.298453139217471, "grad_norm": 0.5047469845745622, "learning_rate": 4.785460000285052e-08, "loss": 0.0096, "step": 9448 }, { "epoch": 4.298908098271156, "grad_norm": 0.27876829173829093, "learning_rate": 4.779359939634925e-08, "loss": 0.0019, "step": 9449 }, { "epoch": 4.2993630573248405, "grad_norm": 0.7640168096361929, "learning_rate": 4.7732635742112783e-08, "loss": 0.0058, "step": 9450 }, { "epoch": 4.299818016378526, "grad_norm": 0.4249954733129061, "learning_rate": 4.767170904512291e-08, "loss": 0.0067, "step": 9451 }, { "epoch": 4.300272975432211, "grad_norm": 3.036716367683079, "learning_rate": 4.761081931035837e-08, "loss": 0.0364, "step": 9452 }, { "epoch": 4.300727934485896, "grad_norm": 0.8518852583749447, "learning_rate": 4.7549966542794695e-08, "loss": 0.0104, "step": 9453 }, { "epoch": 4.301182893539582, "grad_norm": 0.6166975247076144, "learning_rate": 4.7489150747404506e-08, "loss": 0.0115, "step": 9454 }, { "epoch": 4.301637852593267, "grad_norm": 0.5850364837315332, "learning_rate": 4.7428371929157326e-08, "loss": 0.0079, "step": 9455 }, { "epoch": 4.3020928116469515, "grad_norm": 0.30095481161978355, "learning_rate": 4.736763009301986e-08, "loss": 0.0022, "step": 9456 }, { "epoch": 4.302547770700637, "grad_norm": 0.9098226931827124, "learning_rate": 4.730692524395552e-08, "loss": 0.0123, "step": 9457 }, { "epoch": 4.303002729754322, "grad_norm": 0.6393235730565366, "learning_rate": 4.7246257386925005e-08, "loss": 0.0068, "step": 9458 }, { "epoch": 4.303457688808007, "grad_norm": 0.3275263683694067, "learning_rate": 4.718562652688574e-08, "loss": 0.0032, "step": 9459 }, { "epoch": 4.303912647861693, "grad_norm": 0.7328821909491114, "learning_rate": 4.712503266879203e-08, "loss": 0.0046, "step": 9460 }, { "epoch": 4.304367606915378, "grad_norm": 0.647144545980114, "learning_rate": 4.7064475817595594e-08, "loss": 0.0059, "step": 9461 }, { "epoch": 4.304822565969062, "grad_norm": 1.5410212941912944, "learning_rate": 4.70039559782448e-08, "loss": 0.0047, "step": 9462 }, { "epoch": 4.305277525022748, "grad_norm": 0.5863665365999738, "learning_rate": 4.6943473155684974e-08, "loss": 0.0062, "step": 9463 }, { "epoch": 4.305732484076433, "grad_norm": 0.4969475426333545, "learning_rate": 4.688302735485844e-08, "loss": 0.0081, "step": 9464 }, { "epoch": 4.306187443130118, "grad_norm": 0.6461085853429347, "learning_rate": 4.6822618580704685e-08, "loss": 0.0075, "step": 9465 }, { "epoch": 4.306642402183804, "grad_norm": 0.7505149163556487, "learning_rate": 4.6762246838160043e-08, "loss": 0.007, "step": 9466 }, { "epoch": 4.3070973612374885, "grad_norm": 0.44528760064818684, "learning_rate": 4.670191213215785e-08, "loss": 0.0055, "step": 9467 }, { "epoch": 4.307552320291173, "grad_norm": 0.5947629212496768, "learning_rate": 4.6641614467628266e-08, "loss": 0.018, "step": 9468 }, { "epoch": 4.308007279344859, "grad_norm": 0.6134743784809633, "learning_rate": 4.658135384949857e-08, "loss": 0.0078, "step": 9469 }, { "epoch": 4.308462238398544, "grad_norm": 0.8070884357856586, "learning_rate": 4.6521130282693056e-08, "loss": 0.0101, "step": 9470 }, { "epoch": 4.308917197452229, "grad_norm": 0.7077028127856976, "learning_rate": 4.6460943772132835e-08, "loss": 0.0065, "step": 9471 }, { "epoch": 4.309372156505915, "grad_norm": 0.49976250097288827, "learning_rate": 4.6400794322736136e-08, "loss": 0.0087, "step": 9472 }, { "epoch": 4.3098271155595995, "grad_norm": 0.6080865700806688, "learning_rate": 4.634068193941815e-08, "loss": 0.0125, "step": 9473 }, { "epoch": 4.310282074613285, "grad_norm": 0.6977917656834381, "learning_rate": 4.6280606627090826e-08, "loss": 0.0142, "step": 9474 }, { "epoch": 4.31073703366697, "grad_norm": 0.3883892859288384, "learning_rate": 4.622056839066346e-08, "loss": 0.0059, "step": 9475 }, { "epoch": 4.311191992720655, "grad_norm": 0.6310364401460001, "learning_rate": 4.616056723504197e-08, "loss": 0.0082, "step": 9476 }, { "epoch": 4.311646951774341, "grad_norm": 0.9567251219605828, "learning_rate": 4.6100603165129423e-08, "loss": 0.0091, "step": 9477 }, { "epoch": 4.312101910828026, "grad_norm": 0.6954597342619169, "learning_rate": 4.6040676185825686e-08, "loss": 0.0092, "step": 9478 }, { "epoch": 4.3125568698817105, "grad_norm": 0.9466164433448381, "learning_rate": 4.598078630202784e-08, "loss": 0.0059, "step": 9479 }, { "epoch": 4.313011828935396, "grad_norm": 0.5098703898778995, "learning_rate": 4.592093351862991e-08, "loss": 0.0038, "step": 9480 }, { "epoch": 4.313466787989081, "grad_norm": 0.6272983686007726, "learning_rate": 4.5861117840522656e-08, "loss": 0.0122, "step": 9481 }, { "epoch": 4.313921747042766, "grad_norm": 0.4404038786702789, "learning_rate": 4.5801339272593996e-08, "loss": 0.0052, "step": 9482 }, { "epoch": 4.314376706096452, "grad_norm": 0.49679076595840566, "learning_rate": 4.574159781972875e-08, "loss": 0.0048, "step": 9483 }, { "epoch": 4.314831665150137, "grad_norm": 0.6105888474532419, "learning_rate": 4.568189348680862e-08, "loss": 0.0071, "step": 9484 }, { "epoch": 4.3152866242038215, "grad_norm": 0.7347820405566654, "learning_rate": 4.562222627871248e-08, "loss": 0.0269, "step": 9485 }, { "epoch": 4.315741583257507, "grad_norm": 0.6049564290430856, "learning_rate": 4.5562596200316164e-08, "loss": 0.0043, "step": 9486 }, { "epoch": 4.316196542311192, "grad_norm": 71.26615622431576, "learning_rate": 4.5503003256492255e-08, "loss": 0.0535, "step": 9487 }, { "epoch": 4.316651501364877, "grad_norm": 0.6953200559090456, "learning_rate": 4.544344745211037e-08, "loss": 0.0019, "step": 9488 }, { "epoch": 4.317106460418563, "grad_norm": 0.5638571889809451, "learning_rate": 4.538392879203717e-08, "loss": 0.009, "step": 9489 }, { "epoch": 4.317561419472248, "grad_norm": 0.4679081967158131, "learning_rate": 4.532444728113638e-08, "loss": 0.0096, "step": 9490 }, { "epoch": 4.3180163785259325, "grad_norm": 0.6916565404998958, "learning_rate": 4.526500292426844e-08, "loss": 0.0082, "step": 9491 }, { "epoch": 4.318471337579618, "grad_norm": 0.4498176984274732, "learning_rate": 4.520559572629079e-08, "loss": 0.0025, "step": 9492 }, { "epoch": 4.318926296633303, "grad_norm": 0.5147345544708735, "learning_rate": 4.514622569205817e-08, "loss": 0.0055, "step": 9493 }, { "epoch": 4.319381255686988, "grad_norm": 0.47310746638364826, "learning_rate": 4.5086892826421754e-08, "loss": 0.0053, "step": 9494 }, { "epoch": 4.319836214740674, "grad_norm": 0.5123882748264634, "learning_rate": 4.502759713423016e-08, "loss": 0.0066, "step": 9495 }, { "epoch": 4.320291173794359, "grad_norm": 0.4693211306856563, "learning_rate": 4.4968338620328726e-08, "loss": 0.0062, "step": 9496 }, { "epoch": 4.320746132848043, "grad_norm": 0.4813850625664888, "learning_rate": 4.4909117289559705e-08, "loss": 0.0023, "step": 9497 }, { "epoch": 4.321201091901729, "grad_norm": 0.6450724559261357, "learning_rate": 4.4849933146762384e-08, "loss": 0.004, "step": 9498 }, { "epoch": 4.321656050955414, "grad_norm": 0.37292844868213365, "learning_rate": 4.4790786196773124e-08, "loss": 0.0034, "step": 9499 }, { "epoch": 4.322111010009099, "grad_norm": 0.5553687101075336, "learning_rate": 4.473167644442516e-08, "loss": 0.0034, "step": 9500 }, { "epoch": 4.322565969062785, "grad_norm": 0.4558905005394018, "learning_rate": 4.467260389454863e-08, "loss": 0.0053, "step": 9501 }, { "epoch": 4.3230209281164695, "grad_norm": 0.5245964866137642, "learning_rate": 4.461356855197068e-08, "loss": 0.006, "step": 9502 }, { "epoch": 4.323475887170154, "grad_norm": 0.5697253531475515, "learning_rate": 4.455457042151528e-08, "loss": 0.007, "step": 9503 }, { "epoch": 4.32393084622384, "grad_norm": 0.546813051221274, "learning_rate": 4.4495609508003737e-08, "loss": 0.0116, "step": 9504 }, { "epoch": 4.324385805277525, "grad_norm": 0.6488995201030462, "learning_rate": 4.4436685816253926e-08, "loss": 0.0058, "step": 9505 }, { "epoch": 4.32484076433121, "grad_norm": 0.6462035717232145, "learning_rate": 4.437779935108077e-08, "loss": 0.0093, "step": 9506 }, { "epoch": 4.325295723384896, "grad_norm": 0.6733738194095197, "learning_rate": 4.431895011729636e-08, "loss": 0.0096, "step": 9507 }, { "epoch": 4.3257506824385805, "grad_norm": 0.5051060214989886, "learning_rate": 4.426013811970941e-08, "loss": 0.0075, "step": 9508 }, { "epoch": 4.326205641492265, "grad_norm": 0.7202185004172375, "learning_rate": 4.420136336312596e-08, "loss": 0.0078, "step": 9509 }, { "epoch": 4.326660600545951, "grad_norm": 1.255494938228605, "learning_rate": 4.414262585234874e-08, "loss": 0.0139, "step": 9510 }, { "epoch": 4.327115559599636, "grad_norm": 0.48134815500177136, "learning_rate": 4.4083925592177494e-08, "loss": 0.0089, "step": 9511 }, { "epoch": 4.327570518653321, "grad_norm": 0.6618931266814925, "learning_rate": 4.402526258740885e-08, "loss": 0.0086, "step": 9512 }, { "epoch": 4.328025477707007, "grad_norm": 0.34657375535831686, "learning_rate": 4.3966636842836636e-08, "loss": 0.0061, "step": 9513 }, { "epoch": 4.3284804367606915, "grad_norm": 0.5946749457212951, "learning_rate": 4.390804836325146e-08, "loss": 0.0109, "step": 9514 }, { "epoch": 4.328935395814376, "grad_norm": 0.44827810383342703, "learning_rate": 4.3849497153440886e-08, "loss": 0.004, "step": 9515 }, { "epoch": 4.329390354868062, "grad_norm": 0.5928411202306525, "learning_rate": 4.379098321818947e-08, "loss": 0.0091, "step": 9516 }, { "epoch": 4.329845313921747, "grad_norm": 0.6536606578436334, "learning_rate": 4.3732506562278615e-08, "loss": 0.0089, "step": 9517 }, { "epoch": 4.330300272975432, "grad_norm": 0.7843996058079381, "learning_rate": 4.3674067190486886e-08, "loss": 0.0235, "step": 9518 }, { "epoch": 4.330755232029118, "grad_norm": 0.7511324114195692, "learning_rate": 4.361566510758963e-08, "loss": 0.0121, "step": 9519 }, { "epoch": 4.3312101910828025, "grad_norm": 0.6425332496247874, "learning_rate": 4.3557300318359134e-08, "loss": 0.013, "step": 9520 }, { "epoch": 4.331665150136487, "grad_norm": 0.3550275019892308, "learning_rate": 4.349897282756487e-08, "loss": 0.0032, "step": 9521 }, { "epoch": 4.332120109190173, "grad_norm": 0.7661551993052474, "learning_rate": 4.3440682639973026e-08, "loss": 0.0052, "step": 9522 }, { "epoch": 4.332575068243858, "grad_norm": 0.6951473031416241, "learning_rate": 4.3382429760346674e-08, "loss": 0.0109, "step": 9523 }, { "epoch": 4.333030027297543, "grad_norm": 0.5236030751446282, "learning_rate": 4.332421419344623e-08, "loss": 0.0051, "step": 9524 }, { "epoch": 4.333484986351229, "grad_norm": 0.4865435065673877, "learning_rate": 4.326603594402861e-08, "loss": 0.0056, "step": 9525 }, { "epoch": 4.3339399454049135, "grad_norm": 0.5518970939355096, "learning_rate": 4.3207895016847964e-08, "loss": 0.008, "step": 9526 }, { "epoch": 4.334394904458598, "grad_norm": 0.599356484773862, "learning_rate": 4.3149791416655204e-08, "loss": 0.003, "step": 9527 }, { "epoch": 4.334849863512284, "grad_norm": 0.38619055669198105, "learning_rate": 4.309172514819837e-08, "loss": 0.0034, "step": 9528 }, { "epoch": 4.335304822565969, "grad_norm": 1.7824091866907847, "learning_rate": 4.303369621622244e-08, "loss": 0.0243, "step": 9529 }, { "epoch": 4.335759781619654, "grad_norm": 0.5641225486836753, "learning_rate": 4.297570462546923e-08, "loss": 0.0068, "step": 9530 }, { "epoch": 4.33621474067334, "grad_norm": 0.6477134608781113, "learning_rate": 4.291775038067758e-08, "loss": 0.0047, "step": 9531 }, { "epoch": 4.336669699727024, "grad_norm": 1.4113158794570337, "learning_rate": 4.2859833486583064e-08, "loss": 0.0045, "step": 9532 }, { "epoch": 4.337124658780709, "grad_norm": 0.5500723983869753, "learning_rate": 4.280195394791863e-08, "loss": 0.0059, "step": 9533 }, { "epoch": 4.337579617834395, "grad_norm": 0.38991805106499744, "learning_rate": 4.274411176941373e-08, "loss": 0.003, "step": 9534 }, { "epoch": 4.33803457688808, "grad_norm": 0.42053309395592997, "learning_rate": 4.268630695579517e-08, "loss": 0.0029, "step": 9535 }, { "epoch": 4.338489535941765, "grad_norm": 0.9027892699065883, "learning_rate": 4.262853951178641e-08, "loss": 0.0051, "step": 9536 }, { "epoch": 4.3389444949954505, "grad_norm": 0.7044999223493388, "learning_rate": 4.257080944210778e-08, "loss": 0.0138, "step": 9537 }, { "epoch": 4.339399454049135, "grad_norm": 0.5527320289129368, "learning_rate": 4.2513116751477005e-08, "loss": 0.006, "step": 9538 }, { "epoch": 4.339854413102821, "grad_norm": 0.7031993625557783, "learning_rate": 4.245546144460838e-08, "loss": 0.0121, "step": 9539 }, { "epoch": 4.340309372156506, "grad_norm": 0.3472911487293751, "learning_rate": 4.239784352621312e-08, "loss": 0.0045, "step": 9540 }, { "epoch": 4.340764331210191, "grad_norm": 0.6812423897915032, "learning_rate": 4.234026300099952e-08, "loss": 0.0062, "step": 9541 }, { "epoch": 4.341219290263877, "grad_norm": 0.7837272162627477, "learning_rate": 4.2282719873672824e-08, "loss": 0.0051, "step": 9542 }, { "epoch": 4.3416742493175615, "grad_norm": 0.27143590941676726, "learning_rate": 4.2225214148935376e-08, "loss": 0.0019, "step": 9543 }, { "epoch": 4.342129208371246, "grad_norm": 1.0124491322268285, "learning_rate": 4.216774583148608e-08, "loss": 0.0088, "step": 9544 }, { "epoch": 4.342584167424932, "grad_norm": 0.6906086439093372, "learning_rate": 4.211031492602102e-08, "loss": 0.0053, "step": 9545 }, { "epoch": 4.343039126478617, "grad_norm": 0.39150649144104044, "learning_rate": 4.205292143723321e-08, "loss": 0.0018, "step": 9546 }, { "epoch": 4.343494085532302, "grad_norm": 0.8657258983000162, "learning_rate": 4.199556536981264e-08, "loss": 0.0168, "step": 9547 }, { "epoch": 4.343949044585988, "grad_norm": 37.890585952205484, "learning_rate": 4.193824672844598e-08, "loss": 0.0688, "step": 9548 }, { "epoch": 4.3444040036396725, "grad_norm": 0.6412177810342237, "learning_rate": 4.188096551781739e-08, "loss": 0.0103, "step": 9549 }, { "epoch": 4.344858962693357, "grad_norm": 0.5869573153049711, "learning_rate": 4.1823721742607395e-08, "loss": 0.0039, "step": 9550 }, { "epoch": 4.345313921747043, "grad_norm": 0.7199233591217675, "learning_rate": 4.176651540749371e-08, "loss": 0.0031, "step": 9551 }, { "epoch": 4.345768880800728, "grad_norm": 1.009731219285124, "learning_rate": 4.170934651715108e-08, "loss": 0.0024, "step": 9552 }, { "epoch": 4.346223839854413, "grad_norm": 0.3596308553023083, "learning_rate": 4.165221507625105e-08, "loss": 0.0074, "step": 9553 }, { "epoch": 4.346678798908099, "grad_norm": 0.5636040462536931, "learning_rate": 4.1595121089462116e-08, "loss": 0.0104, "step": 9554 }, { "epoch": 4.3471337579617835, "grad_norm": 0.32749528755692175, "learning_rate": 4.1538064561449646e-08, "loss": 0.0031, "step": 9555 }, { "epoch": 4.347588717015468, "grad_norm": 0.4249978651243963, "learning_rate": 4.1481045496876254e-08, "loss": 0.0085, "step": 9556 }, { "epoch": 4.348043676069154, "grad_norm": 0.7381996205625211, "learning_rate": 4.1424063900401043e-08, "loss": 0.0116, "step": 9557 }, { "epoch": 4.348498635122839, "grad_norm": 0.5150475003980518, "learning_rate": 4.136711977668056e-08, "loss": 0.0057, "step": 9558 }, { "epoch": 4.348953594176524, "grad_norm": 0.6719183995448132, "learning_rate": 4.131021313036787e-08, "loss": 0.0133, "step": 9559 }, { "epoch": 4.34940855323021, "grad_norm": 0.45676694426744513, "learning_rate": 4.125334396611313e-08, "loss": 0.0071, "step": 9560 }, { "epoch": 4.3498635122838945, "grad_norm": 0.7954022691439845, "learning_rate": 4.11965122885633e-08, "loss": 0.0224, "step": 9561 }, { "epoch": 4.350318471337579, "grad_norm": 0.3543891192055466, "learning_rate": 4.113971810236261e-08, "loss": 0.0037, "step": 9562 }, { "epoch": 4.350773430391265, "grad_norm": 0.6808699133108247, "learning_rate": 4.108296141215206e-08, "loss": 0.011, "step": 9563 }, { "epoch": 4.35122838944495, "grad_norm": 0.5007663363136916, "learning_rate": 4.10262422225694e-08, "loss": 0.0103, "step": 9564 }, { "epoch": 4.351683348498635, "grad_norm": 0.5862792279162385, "learning_rate": 4.096956053824957e-08, "loss": 0.0072, "step": 9565 }, { "epoch": 4.352138307552321, "grad_norm": 0.7403024373810166, "learning_rate": 4.091291636382416e-08, "loss": 0.0039, "step": 9566 }, { "epoch": 4.352593266606005, "grad_norm": 0.9265443329564561, "learning_rate": 4.085630970392212e-08, "loss": 0.007, "step": 9567 }, { "epoch": 4.35304822565969, "grad_norm": 0.5222874856055326, "learning_rate": 4.0799740563168924e-08, "loss": 0.0065, "step": 9568 }, { "epoch": 4.353503184713376, "grad_norm": 0.44595528328887835, "learning_rate": 4.0743208946187157e-08, "loss": 0.0094, "step": 9569 }, { "epoch": 4.353958143767061, "grad_norm": 0.601921293564657, "learning_rate": 4.068671485759651e-08, "loss": 0.0122, "step": 9570 }, { "epoch": 4.354413102820746, "grad_norm": 0.571903143029782, "learning_rate": 4.063025830201311e-08, "loss": 0.0098, "step": 9571 }, { "epoch": 4.3548680618744315, "grad_norm": 0.6440265991961914, "learning_rate": 4.0573839284050614e-08, "loss": 0.0083, "step": 9572 }, { "epoch": 4.355323020928116, "grad_norm": 0.4327098985676528, "learning_rate": 4.051745780831922e-08, "loss": 0.0048, "step": 9573 }, { "epoch": 4.355777979981801, "grad_norm": 0.6114023415060491, "learning_rate": 4.046111387942619e-08, "loss": 0.0107, "step": 9574 }, { "epoch": 4.356232939035487, "grad_norm": 1.1412700550446218, "learning_rate": 4.040480750197561e-08, "loss": 0.0071, "step": 9575 }, { "epoch": 4.356687898089172, "grad_norm": 0.43425319485930935, "learning_rate": 4.034853868056859e-08, "loss": 0.0046, "step": 9576 }, { "epoch": 4.357142857142857, "grad_norm": 0.5839371245614863, "learning_rate": 4.029230741980333e-08, "loss": 0.0063, "step": 9577 }, { "epoch": 4.3575978161965425, "grad_norm": 0.6657507998204758, "learning_rate": 4.023611372427471e-08, "loss": 0.0037, "step": 9578 }, { "epoch": 4.358052775250227, "grad_norm": 0.3908936564149074, "learning_rate": 4.017995759857456e-08, "loss": 0.0061, "step": 9579 }, { "epoch": 4.358507734303912, "grad_norm": 0.42097405015377415, "learning_rate": 4.0123839047291663e-08, "loss": 0.0047, "step": 9580 }, { "epoch": 4.358962693357598, "grad_norm": 0.7209746376377367, "learning_rate": 4.0067758075012e-08, "loss": 0.0093, "step": 9581 }, { "epoch": 4.359417652411283, "grad_norm": 0.737466244609962, "learning_rate": 4.001171468631809e-08, "loss": 0.0063, "step": 9582 }, { "epoch": 4.359872611464968, "grad_norm": 0.7917143446682676, "learning_rate": 3.995570888578942e-08, "loss": 0.0072, "step": 9583 }, { "epoch": 4.3603275705186535, "grad_norm": 0.6996229266116617, "learning_rate": 3.9899740678002834e-08, "loss": 0.0083, "step": 9584 }, { "epoch": 4.360782529572338, "grad_norm": 0.7913401054467278, "learning_rate": 3.9843810067531517e-08, "loss": 0.0055, "step": 9585 }, { "epoch": 4.361237488626024, "grad_norm": 0.7209116537251699, "learning_rate": 3.978791705894608e-08, "loss": 0.0062, "step": 9586 }, { "epoch": 4.361692447679709, "grad_norm": 0.9930267024106689, "learning_rate": 3.973206165681381e-08, "loss": 0.0117, "step": 9587 }, { "epoch": 4.362147406733394, "grad_norm": 0.6772275401757507, "learning_rate": 3.9676243865698846e-08, "loss": 0.0183, "step": 9588 }, { "epoch": 4.36260236578708, "grad_norm": 0.5740968194271118, "learning_rate": 3.962046369016248e-08, "loss": 0.0039, "step": 9589 }, { "epoch": 4.3630573248407645, "grad_norm": 0.6321877273219701, "learning_rate": 3.9564721134762556e-08, "loss": 0.0039, "step": 9590 }, { "epoch": 4.363512283894449, "grad_norm": 1.0075979820746273, "learning_rate": 3.9509016204054504e-08, "loss": 0.0056, "step": 9591 }, { "epoch": 4.363967242948135, "grad_norm": 0.6601886314726387, "learning_rate": 3.945334890259011e-08, "loss": 0.0095, "step": 9592 }, { "epoch": 4.36442220200182, "grad_norm": 0.5484102627976424, "learning_rate": 3.9397719234918195e-08, "loss": 0.0075, "step": 9593 }, { "epoch": 4.364877161055505, "grad_norm": 0.7383542103449078, "learning_rate": 3.934212720558461e-08, "loss": 0.0105, "step": 9594 }, { "epoch": 4.365332120109191, "grad_norm": 0.5399335225905275, "learning_rate": 3.9286572819132e-08, "loss": 0.0044, "step": 9595 }, { "epoch": 4.3657870791628755, "grad_norm": 0.454951787647349, "learning_rate": 3.923105608010019e-08, "loss": 0.0028, "step": 9596 }, { "epoch": 4.36624203821656, "grad_norm": 0.8581160961749679, "learning_rate": 3.91755769930256e-08, "loss": 0.006, "step": 9597 }, { "epoch": 4.366696997270246, "grad_norm": 0.5014702283714108, "learning_rate": 3.912013556244182e-08, "loss": 0.0075, "step": 9598 }, { "epoch": 4.367151956323931, "grad_norm": 0.9473334898086126, "learning_rate": 3.906473179287928e-08, "loss": 0.0094, "step": 9599 }, { "epoch": 4.367606915377616, "grad_norm": 0.42995691930370095, "learning_rate": 3.9009365688865204e-08, "loss": 0.0056, "step": 9600 }, { "epoch": 4.368061874431302, "grad_norm": 0.5441556457231288, "learning_rate": 3.895403725492402e-08, "loss": 0.0036, "step": 9601 }, { "epoch": 4.368516833484986, "grad_norm": 0.7789436361355743, "learning_rate": 3.8898746495576897e-08, "loss": 0.0122, "step": 9602 }, { "epoch": 4.368971792538671, "grad_norm": 0.5626951054777521, "learning_rate": 3.884349341534182e-08, "loss": 0.0073, "step": 9603 }, { "epoch": 4.369426751592357, "grad_norm": 0.5683218278689521, "learning_rate": 3.878827801873385e-08, "loss": 0.0088, "step": 9604 }, { "epoch": 4.369881710646042, "grad_norm": 0.580407457389991, "learning_rate": 3.8733100310265e-08, "loss": 0.0095, "step": 9605 }, { "epoch": 4.370336669699727, "grad_norm": 0.4976617270427146, "learning_rate": 3.86779602944442e-08, "loss": 0.0103, "step": 9606 }, { "epoch": 4.3707916287534125, "grad_norm": 0.5817093041717176, "learning_rate": 3.862285797577719e-08, "loss": 0.0034, "step": 9607 }, { "epoch": 4.371246587807097, "grad_norm": 0.6246029065508516, "learning_rate": 3.856779335876664e-08, "loss": 0.005, "step": 9608 }, { "epoch": 4.371701546860782, "grad_norm": 0.5197217542680472, "learning_rate": 3.851276644791213e-08, "loss": 0.0067, "step": 9609 }, { "epoch": 4.372156505914468, "grad_norm": 0.5602235456752982, "learning_rate": 3.8457777247710376e-08, "loss": 0.0084, "step": 9610 }, { "epoch": 4.372611464968153, "grad_norm": 0.6610454948292718, "learning_rate": 3.840282576265463e-08, "loss": 0.0069, "step": 9611 }, { "epoch": 4.373066424021838, "grad_norm": 0.5764360036334442, "learning_rate": 3.834791199723558e-08, "loss": 0.0037, "step": 9612 }, { "epoch": 4.3735213830755235, "grad_norm": 0.9678299333251696, "learning_rate": 3.82930359559403e-08, "loss": 0.0121, "step": 9613 }, { "epoch": 4.373976342129208, "grad_norm": 0.5622612358049782, "learning_rate": 3.823819764325298e-08, "loss": 0.0091, "step": 9614 }, { "epoch": 4.374431301182893, "grad_norm": 0.4659767542604344, "learning_rate": 3.818339706365498e-08, "loss": 0.0038, "step": 9615 }, { "epoch": 4.374886260236579, "grad_norm": 0.5652090473736554, "learning_rate": 3.812863422162421e-08, "loss": 0.0047, "step": 9616 }, { "epoch": 4.375341219290264, "grad_norm": 0.8789752002706217, "learning_rate": 3.807390912163561e-08, "loss": 0.0053, "step": 9617 }, { "epoch": 4.375796178343949, "grad_norm": 0.3208280771272636, "learning_rate": 3.801922176816108e-08, "loss": 0.0046, "step": 9618 }, { "epoch": 4.3762511373976345, "grad_norm": 0.43677754627435983, "learning_rate": 3.796457216566945e-08, "loss": 0.0043, "step": 9619 }, { "epoch": 4.376706096451319, "grad_norm": 0.45549629181268164, "learning_rate": 3.790996031862653e-08, "loss": 0.005, "step": 9620 }, { "epoch": 4.377161055505004, "grad_norm": 1.212510919569976, "learning_rate": 3.7855386231494926e-08, "loss": 0.0044, "step": 9621 }, { "epoch": 4.37761601455869, "grad_norm": 0.7213397098517935, "learning_rate": 3.7800849908734053e-08, "loss": 0.0051, "step": 9622 }, { "epoch": 4.378070973612375, "grad_norm": 0.8446952245908317, "learning_rate": 3.774635135480042e-08, "loss": 0.0135, "step": 9623 }, { "epoch": 4.37852593266606, "grad_norm": 0.4723622880798376, "learning_rate": 3.769189057414751e-08, "loss": 0.0074, "step": 9624 }, { "epoch": 4.3789808917197455, "grad_norm": 0.37662075344103263, "learning_rate": 3.7637467571225435e-08, "loss": 0.0043, "step": 9625 }, { "epoch": 4.37943585077343, "grad_norm": 0.5787792471362766, "learning_rate": 3.7583082350481573e-08, "loss": 0.0071, "step": 9626 }, { "epoch": 4.379890809827115, "grad_norm": 0.3006657113456819, "learning_rate": 3.752873491635999e-08, "loss": 0.0006, "step": 9627 }, { "epoch": 4.380345768880801, "grad_norm": 0.5281081293552313, "learning_rate": 3.7474425273301694e-08, "loss": 0.0117, "step": 9628 }, { "epoch": 4.380800727934486, "grad_norm": 0.8520434123377502, "learning_rate": 3.742015342574451e-08, "loss": 0.007, "step": 9629 }, { "epoch": 4.381255686988171, "grad_norm": 0.5870057631498616, "learning_rate": 3.73659193781235e-08, "loss": 0.0114, "step": 9630 }, { "epoch": 4.3817106460418564, "grad_norm": 0.6604606287100603, "learning_rate": 3.73117231348703e-08, "loss": 0.0056, "step": 9631 }, { "epoch": 4.382165605095541, "grad_norm": 0.3822106508783586, "learning_rate": 3.7257564700413525e-08, "loss": 0.0037, "step": 9632 }, { "epoch": 4.382620564149226, "grad_norm": 0.5909569497176838, "learning_rate": 3.720344407917897e-08, "loss": 0.0131, "step": 9633 }, { "epoch": 4.383075523202912, "grad_norm": 0.6439585715815324, "learning_rate": 3.7149361275588817e-08, "loss": 0.0067, "step": 9634 }, { "epoch": 4.383530482256597, "grad_norm": 0.5078452387462895, "learning_rate": 3.709531629406282e-08, "loss": 0.0099, "step": 9635 }, { "epoch": 4.383985441310282, "grad_norm": 0.5187735530814783, "learning_rate": 3.704130913901704e-08, "loss": 0.0034, "step": 9636 }, { "epoch": 4.384440400363967, "grad_norm": 0.5731208819536481, "learning_rate": 3.698733981486485e-08, "loss": 0.0096, "step": 9637 }, { "epoch": 4.384895359417652, "grad_norm": 1.0421283824008312, "learning_rate": 3.693340832601616e-08, "loss": 0.009, "step": 9638 }, { "epoch": 4.385350318471337, "grad_norm": 0.7049150228952815, "learning_rate": 3.6879514676878164e-08, "loss": 0.0109, "step": 9639 }, { "epoch": 4.385805277525023, "grad_norm": 0.629815499684263, "learning_rate": 3.68256588718549e-08, "loss": 0.0047, "step": 9640 }, { "epoch": 4.386260236578708, "grad_norm": 0.8302459382598544, "learning_rate": 3.6771840915347076e-08, "loss": 0.0046, "step": 9641 }, { "epoch": 4.386715195632393, "grad_norm": 0.2822343849488484, "learning_rate": 3.671806081175255e-08, "loss": 0.0017, "step": 9642 }, { "epoch": 4.387170154686078, "grad_norm": 0.5894224783603114, "learning_rate": 3.6664318565465814e-08, "loss": 0.0086, "step": 9643 }, { "epoch": 4.387625113739763, "grad_norm": 0.6771875410171063, "learning_rate": 3.661061418087863e-08, "loss": 0.0077, "step": 9644 }, { "epoch": 4.388080072793448, "grad_norm": 0.6148682541073537, "learning_rate": 3.6556947662379435e-08, "loss": 0.0108, "step": 9645 }, { "epoch": 4.388535031847134, "grad_norm": 0.5353102723943938, "learning_rate": 3.6503319014353495e-08, "loss": 0.0042, "step": 9646 }, { "epoch": 4.388989990900819, "grad_norm": 0.572831687730139, "learning_rate": 3.644972824118325e-08, "loss": 0.0111, "step": 9647 }, { "epoch": 4.389444949954504, "grad_norm": 0.7269969367650362, "learning_rate": 3.639617534724782e-08, "loss": 0.0155, "step": 9648 }, { "epoch": 4.389899909008189, "grad_norm": 0.9524124627551597, "learning_rate": 3.634266033692335e-08, "loss": 0.0112, "step": 9649 }, { "epoch": 4.390354868061874, "grad_norm": 0.646289480547547, "learning_rate": 3.628918321458285e-08, "loss": 0.0143, "step": 9650 }, { "epoch": 4.39080982711556, "grad_norm": 0.590968378145495, "learning_rate": 3.623574398459617e-08, "loss": 0.0085, "step": 9651 }, { "epoch": 4.391264786169245, "grad_norm": 0.3886345251387379, "learning_rate": 3.618234265133008e-08, "loss": 0.0034, "step": 9652 }, { "epoch": 4.39171974522293, "grad_norm": 0.6027230333898559, "learning_rate": 3.612897921914837e-08, "loss": 0.0039, "step": 9653 }, { "epoch": 4.3921747042766155, "grad_norm": 1.1824899104418587, "learning_rate": 3.6075653692411725e-08, "loss": 0.0127, "step": 9654 }, { "epoch": 4.3926296633303, "grad_norm": 0.3591321816515237, "learning_rate": 3.60223660754776e-08, "loss": 0.0033, "step": 9655 }, { "epoch": 4.393084622383985, "grad_norm": 0.7481477755359465, "learning_rate": 3.5969116372700446e-08, "loss": 0.0094, "step": 9656 }, { "epoch": 4.393539581437671, "grad_norm": 0.51136493857776, "learning_rate": 3.591590458843141e-08, "loss": 0.0046, "step": 9657 }, { "epoch": 4.393994540491356, "grad_norm": 1.1018484198720724, "learning_rate": 3.586273072701901e-08, "loss": 0.0066, "step": 9658 }, { "epoch": 4.394449499545041, "grad_norm": 0.4989670459410634, "learning_rate": 3.5809594792808205e-08, "loss": 0.0069, "step": 9659 }, { "epoch": 4.3949044585987265, "grad_norm": 0.5219750482534486, "learning_rate": 3.575649679014098e-08, "loss": 0.007, "step": 9660 }, { "epoch": 4.395359417652411, "grad_norm": 0.5386486861953512, "learning_rate": 3.570343672335641e-08, "loss": 0.0094, "step": 9661 }, { "epoch": 4.395814376706096, "grad_norm": 0.7701473388083484, "learning_rate": 3.565041459679014e-08, "loss": 0.0121, "step": 9662 }, { "epoch": 4.396269335759782, "grad_norm": 1.0051679065693786, "learning_rate": 3.559743041477509e-08, "loss": 0.0027, "step": 9663 }, { "epoch": 4.396724294813467, "grad_norm": 0.6495716504506457, "learning_rate": 3.55444841816408e-08, "loss": 0.0103, "step": 9664 }, { "epoch": 4.397179253867152, "grad_norm": 0.5512900031880198, "learning_rate": 3.549157590171381e-08, "loss": 0.0111, "step": 9665 }, { "epoch": 4.3976342129208374, "grad_norm": 0.6237271623690125, "learning_rate": 3.543870557931755e-08, "loss": 0.0079, "step": 9666 }, { "epoch": 4.398089171974522, "grad_norm": 0.45583154460431097, "learning_rate": 3.538587321877223e-08, "loss": 0.0033, "step": 9667 }, { "epoch": 4.398544131028207, "grad_norm": 0.6856266856957685, "learning_rate": 3.533307882439523e-08, "loss": 0.0145, "step": 9668 }, { "epoch": 4.398999090081893, "grad_norm": 0.7545205937227414, "learning_rate": 3.528032240050061e-08, "loss": 0.0087, "step": 9669 }, { "epoch": 4.399454049135578, "grad_norm": 0.8151527612298165, "learning_rate": 3.5227603951399455e-08, "loss": 0.0047, "step": 9670 }, { "epoch": 4.399909008189263, "grad_norm": 0.67296452123682, "learning_rate": 3.5174923481399556e-08, "loss": 0.0086, "step": 9671 }, { "epoch": 4.400363967242948, "grad_norm": 0.3144012846024441, "learning_rate": 3.5122280994805745e-08, "loss": 0.0015, "step": 9672 }, { "epoch": 4.400818926296633, "grad_norm": 0.30212379264308115, "learning_rate": 3.5069676495919796e-08, "loss": 0.0019, "step": 9673 }, { "epoch": 4.401273885350318, "grad_norm": 0.3926592614652846, "learning_rate": 3.5017109989040225e-08, "loss": 0.0022, "step": 9674 }, { "epoch": 4.401728844404004, "grad_norm": 0.5202435271615472, "learning_rate": 3.496458147846265e-08, "loss": 0.0032, "step": 9675 }, { "epoch": 4.402183803457689, "grad_norm": 0.5721873451123738, "learning_rate": 3.491209096847941e-08, "loss": 0.0065, "step": 9676 }, { "epoch": 4.402638762511374, "grad_norm": 0.426086335315713, "learning_rate": 3.485963846337969e-08, "loss": 0.0026, "step": 9677 }, { "epoch": 4.403093721565059, "grad_norm": 0.47627731190290873, "learning_rate": 3.480722396744984e-08, "loss": 0.0059, "step": 9678 }, { "epoch": 4.403548680618744, "grad_norm": 0.46581650806847263, "learning_rate": 3.475484748497287e-08, "loss": 0.0056, "step": 9679 }, { "epoch": 4.404003639672429, "grad_norm": 0.6308982417503004, "learning_rate": 3.47025090202287e-08, "loss": 0.0038, "step": 9680 }, { "epoch": 4.404458598726115, "grad_norm": 0.19084404367718205, "learning_rate": 3.4650208577494175e-08, "loss": 0.001, "step": 9681 }, { "epoch": 4.4049135577798, "grad_norm": 0.40657459886058567, "learning_rate": 3.459794616104306e-08, "loss": 0.0019, "step": 9682 }, { "epoch": 4.405368516833485, "grad_norm": 0.6719776190285567, "learning_rate": 3.45457217751462e-08, "loss": 0.0147, "step": 9683 }, { "epoch": 4.40582347588717, "grad_norm": 0.4499514981184255, "learning_rate": 3.449353542407091e-08, "loss": 0.0058, "step": 9684 }, { "epoch": 4.406278434940855, "grad_norm": 0.41558116650259164, "learning_rate": 3.444138711208172e-08, "loss": 0.0028, "step": 9685 }, { "epoch": 4.40673339399454, "grad_norm": 1.1967082906504751, "learning_rate": 3.43892768434399e-08, "loss": 0.006, "step": 9686 }, { "epoch": 4.407188353048226, "grad_norm": 0.3856656045407659, "learning_rate": 3.433720462240375e-08, "loss": 0.0038, "step": 9687 }, { "epoch": 4.407643312101911, "grad_norm": 0.717773492467244, "learning_rate": 3.4285170453228206e-08, "loss": 0.0156, "step": 9688 }, { "epoch": 4.408098271155596, "grad_norm": 0.33232861834874067, "learning_rate": 3.423317434016548e-08, "loss": 0.004, "step": 9689 }, { "epoch": 4.408553230209281, "grad_norm": 0.717059679327204, "learning_rate": 3.41812162874644e-08, "loss": 0.0082, "step": 9690 }, { "epoch": 4.409008189262966, "grad_norm": 0.7272780281273832, "learning_rate": 3.4129296299370616e-08, "loss": 0.0045, "step": 9691 }, { "epoch": 4.409463148316651, "grad_norm": 1.2259427320832421, "learning_rate": 3.40774143801269e-08, "loss": 0.0104, "step": 9692 }, { "epoch": 4.409918107370337, "grad_norm": 1.0043978973111298, "learning_rate": 3.402557053397287e-08, "loss": 0.0035, "step": 9693 }, { "epoch": 4.410373066424022, "grad_norm": 0.40295981499733985, "learning_rate": 3.3973764765144854e-08, "loss": 0.0061, "step": 9694 }, { "epoch": 4.4108280254777075, "grad_norm": 0.4602455414444204, "learning_rate": 3.392199707787618e-08, "loss": 0.01, "step": 9695 }, { "epoch": 4.411282984531392, "grad_norm": 0.4362905611334533, "learning_rate": 3.38702674763971e-08, "loss": 0.0039, "step": 9696 }, { "epoch": 4.411737943585077, "grad_norm": 0.4880200016637552, "learning_rate": 3.381857596493476e-08, "loss": 0.0046, "step": 9697 }, { "epoch": 4.412192902638763, "grad_norm": 0.538425376492538, "learning_rate": 3.376692254771324e-08, "loss": 0.0118, "step": 9698 }, { "epoch": 4.412647861692448, "grad_norm": 0.7169139317965131, "learning_rate": 3.3715307228953214e-08, "loss": 0.0078, "step": 9699 }, { "epoch": 4.413102820746133, "grad_norm": 0.7517716296869843, "learning_rate": 3.366373001287265e-08, "loss": 0.0092, "step": 9700 }, { "epoch": 4.4135577797998184, "grad_norm": 0.3909021378695554, "learning_rate": 3.3612190903686e-08, "loss": 0.0027, "step": 9701 }, { "epoch": 4.414012738853503, "grad_norm": 0.5833648211278878, "learning_rate": 3.35606899056049e-08, "loss": 0.0044, "step": 9702 }, { "epoch": 4.414467697907188, "grad_norm": 0.7633527283072831, "learning_rate": 3.350922702283787e-08, "loss": 0.0096, "step": 9703 }, { "epoch": 4.414922656960874, "grad_norm": 0.5742491707830192, "learning_rate": 3.3457802259590165e-08, "loss": 0.0065, "step": 9704 }, { "epoch": 4.415377616014559, "grad_norm": 10.511848059165766, "learning_rate": 3.340641562006402e-08, "loss": 0.0403, "step": 9705 }, { "epoch": 4.415832575068244, "grad_norm": 0.5287514686448741, "learning_rate": 3.335506710845837e-08, "loss": 0.0145, "step": 9706 }, { "epoch": 4.416287534121929, "grad_norm": 0.7381216092899162, "learning_rate": 3.3303756728969346e-08, "loss": 0.0109, "step": 9707 }, { "epoch": 4.416742493175614, "grad_norm": 0.6627760248184287, "learning_rate": 3.325248448578971e-08, "loss": 0.0146, "step": 9708 }, { "epoch": 4.417197452229299, "grad_norm": 0.5642863644771658, "learning_rate": 3.3201250383109224e-08, "loss": 0.0104, "step": 9709 }, { "epoch": 4.417652411282985, "grad_norm": 0.4577711271095217, "learning_rate": 3.315005442511454e-08, "loss": 0.0054, "step": 9710 }, { "epoch": 4.41810737033667, "grad_norm": 0.46389942960031294, "learning_rate": 3.309889661598908e-08, "loss": 0.0056, "step": 9711 }, { "epoch": 4.418562329390355, "grad_norm": 0.5392486226416194, "learning_rate": 3.304777695991334e-08, "loss": 0.0075, "step": 9712 }, { "epoch": 4.41901728844404, "grad_norm": 0.8164663533480085, "learning_rate": 3.299669546106454e-08, "loss": 0.0146, "step": 9713 }, { "epoch": 4.419472247497725, "grad_norm": 0.3562467696347916, "learning_rate": 3.2945652123616826e-08, "loss": 0.0046, "step": 9714 }, { "epoch": 4.41992720655141, "grad_norm": 0.4952667064124895, "learning_rate": 3.28946469517411e-08, "loss": 0.0114, "step": 9715 }, { "epoch": 4.420382165605096, "grad_norm": 0.45813385019163355, "learning_rate": 3.284367994960546e-08, "loss": 0.006, "step": 9716 }, { "epoch": 4.420837124658781, "grad_norm": 0.5462411653854211, "learning_rate": 3.27927511213747e-08, "loss": 0.0059, "step": 9717 }, { "epoch": 4.421292083712466, "grad_norm": 0.5182047955006376, "learning_rate": 3.2741860471210356e-08, "loss": 0.0114, "step": 9718 }, { "epoch": 4.421747042766151, "grad_norm": 0.609154320728631, "learning_rate": 3.269100800327112e-08, "loss": 0.0105, "step": 9719 }, { "epoch": 4.422202001819836, "grad_norm": 0.8151443420800697, "learning_rate": 3.2640193721712284e-08, "loss": 0.003, "step": 9720 }, { "epoch": 4.422656960873521, "grad_norm": 0.7760668752597505, "learning_rate": 3.2589417630686345e-08, "loss": 0.005, "step": 9721 }, { "epoch": 4.423111919927207, "grad_norm": 0.5747169007302749, "learning_rate": 3.2538679734342325e-08, "loss": 0.0053, "step": 9722 }, { "epoch": 4.423566878980892, "grad_norm": 0.6751330093239016, "learning_rate": 3.248798003682629e-08, "loss": 0.0059, "step": 9723 }, { "epoch": 4.424021838034577, "grad_norm": 0.5987755968586685, "learning_rate": 3.2437318542281375e-08, "loss": 0.0047, "step": 9724 }, { "epoch": 4.424476797088262, "grad_norm": 0.7613149627057739, "learning_rate": 3.2386695254847216e-08, "loss": 0.0067, "step": 9725 }, { "epoch": 4.424931756141947, "grad_norm": 0.5914586733836915, "learning_rate": 3.233611017866067e-08, "loss": 0.0062, "step": 9726 }, { "epoch": 4.425386715195632, "grad_norm": 0.7138755554771403, "learning_rate": 3.22855633178552e-08, "loss": 0.0043, "step": 9727 }, { "epoch": 4.425841674249318, "grad_norm": 0.6588373640427115, "learning_rate": 3.223505467656135e-08, "loss": 0.0088, "step": 9728 }, { "epoch": 4.426296633303003, "grad_norm": 0.45594503546368575, "learning_rate": 3.2184584258906354e-08, "loss": 0.0023, "step": 9729 }, { "epoch": 4.426751592356688, "grad_norm": 1.867302767317927, "learning_rate": 3.213415206901449e-08, "loss": 0.0371, "step": 9730 }, { "epoch": 4.427206551410373, "grad_norm": 0.8752462105040505, "learning_rate": 3.2083758111006944e-08, "loss": 0.0082, "step": 9731 }, { "epoch": 4.427661510464058, "grad_norm": 0.7309216141693928, "learning_rate": 3.20334023890016e-08, "loss": 0.0101, "step": 9732 }, { "epoch": 4.428116469517743, "grad_norm": 0.44147296605536807, "learning_rate": 3.198308490711327e-08, "loss": 0.0042, "step": 9733 }, { "epoch": 4.428571428571429, "grad_norm": 0.5180351232375702, "learning_rate": 3.193280566945372e-08, "loss": 0.0037, "step": 9734 }, { "epoch": 4.429026387625114, "grad_norm": 0.4547150232103333, "learning_rate": 3.188256468013139e-08, "loss": 0.0062, "step": 9735 }, { "epoch": 4.429481346678799, "grad_norm": 0.5411117329489066, "learning_rate": 3.183236194325201e-08, "loss": 0.0101, "step": 9736 }, { "epoch": 4.429936305732484, "grad_norm": 0.27701984593333717, "learning_rate": 3.178219746291766e-08, "loss": 0.0021, "step": 9737 }, { "epoch": 4.430391264786169, "grad_norm": 0.7613390324301497, "learning_rate": 3.1732071243227796e-08, "loss": 0.0081, "step": 9738 }, { "epoch": 4.430846223839854, "grad_norm": 0.36274758506415256, "learning_rate": 3.168198328827837e-08, "loss": 0.0032, "step": 9739 }, { "epoch": 4.43130118289354, "grad_norm": 0.7159947156916096, "learning_rate": 3.1631933602162317e-08, "loss": 0.0109, "step": 9740 }, { "epoch": 4.431756141947225, "grad_norm": 0.698201753714352, "learning_rate": 3.15819221889696e-08, "loss": 0.0042, "step": 9741 }, { "epoch": 4.4322111010009095, "grad_norm": 0.750214787282555, "learning_rate": 3.1531949052786777e-08, "loss": 0.0113, "step": 9742 }, { "epoch": 4.432666060054595, "grad_norm": 0.23523874139307943, "learning_rate": 3.1482014197697584e-08, "loss": 0.0015, "step": 9743 }, { "epoch": 4.43312101910828, "grad_norm": 0.4808921982917478, "learning_rate": 3.143211762778225e-08, "loss": 0.0081, "step": 9744 }, { "epoch": 4.433575978161965, "grad_norm": 0.6500003508089608, "learning_rate": 3.138225934711819e-08, "loss": 0.005, "step": 9745 }, { "epoch": 4.434030937215651, "grad_norm": 0.5897440815308227, "learning_rate": 3.133243935977981e-08, "loss": 0.0105, "step": 9746 }, { "epoch": 4.434485896269336, "grad_norm": 0.35780522105945, "learning_rate": 3.128265766983795e-08, "loss": 0.003, "step": 9747 }, { "epoch": 4.4349408553230205, "grad_norm": 0.6437236639367502, "learning_rate": 3.12329142813606e-08, "loss": 0.0119, "step": 9748 }, { "epoch": 4.435395814376706, "grad_norm": 0.3127145646357245, "learning_rate": 3.1183209198412444e-08, "loss": 0.0029, "step": 9749 }, { "epoch": 4.435850773430391, "grad_norm": 0.5118467222436103, "learning_rate": 3.113354242505539e-08, "loss": 0.0145, "step": 9750 }, { "epoch": 4.436305732484076, "grad_norm": 0.909943023397291, "learning_rate": 3.108391396534782e-08, "loss": 0.0086, "step": 9751 }, { "epoch": 4.436760691537762, "grad_norm": 0.40147975555800325, "learning_rate": 3.103432382334525e-08, "loss": 0.0053, "step": 9752 }, { "epoch": 4.437215650591447, "grad_norm": 0.6924258110511523, "learning_rate": 3.09847720030999e-08, "loss": 0.0041, "step": 9753 }, { "epoch": 4.4376706096451315, "grad_norm": 0.6691703190255839, "learning_rate": 3.09352585086608e-08, "loss": 0.0051, "step": 9754 }, { "epoch": 4.438125568698817, "grad_norm": 0.587988507265791, "learning_rate": 3.088578334407427e-08, "loss": 0.0016, "step": 9755 }, { "epoch": 4.438580527752502, "grad_norm": 0.30651182841242985, "learning_rate": 3.083634651338296e-08, "loss": 0.0039, "step": 9756 }, { "epoch": 4.439035486806187, "grad_norm": 0.6814273471863169, "learning_rate": 3.07869480206267e-08, "loss": 0.0067, "step": 9757 }, { "epoch": 4.439490445859873, "grad_norm": 0.6034672346183036, "learning_rate": 3.073758786984204e-08, "loss": 0.0039, "step": 9758 }, { "epoch": 4.439945404913558, "grad_norm": 1.0282639697486073, "learning_rate": 3.068826606506253e-08, "loss": 0.0104, "step": 9759 }, { "epoch": 4.440400363967243, "grad_norm": 0.6800883065177002, "learning_rate": 3.0638982610318554e-08, "loss": 0.004, "step": 9760 }, { "epoch": 4.440855323020928, "grad_norm": 0.39551911529623646, "learning_rate": 3.0589737509637334e-08, "loss": 0.0037, "step": 9761 }, { "epoch": 4.441310282074613, "grad_norm": 0.47358858934337794, "learning_rate": 3.054053076704294e-08, "loss": 0.0048, "step": 9762 }, { "epoch": 4.441765241128299, "grad_norm": 0.39807772531331626, "learning_rate": 3.049136238655625e-08, "loss": 0.0027, "step": 9763 }, { "epoch": 4.442220200181984, "grad_norm": 0.8004955551114676, "learning_rate": 3.0442232372195174e-08, "loss": 0.008, "step": 9764 }, { "epoch": 4.442675159235669, "grad_norm": 0.3972303887540066, "learning_rate": 3.039314072797433e-08, "loss": 0.0033, "step": 9765 }, { "epoch": 4.443130118289354, "grad_norm": 0.5433482699053926, "learning_rate": 3.0344087457905344e-08, "loss": 0.0097, "step": 9766 }, { "epoch": 4.443585077343039, "grad_norm": 0.6753171528754535, "learning_rate": 3.029507256599662e-08, "loss": 0.006, "step": 9767 }, { "epoch": 4.444040036396724, "grad_norm": 0.5451998630202027, "learning_rate": 3.024609605625328e-08, "loss": 0.0107, "step": 9768 }, { "epoch": 4.44449499545041, "grad_norm": 1.5119336863703383, "learning_rate": 3.019715793267769e-08, "loss": 0.0136, "step": 9769 }, { "epoch": 4.444949954504095, "grad_norm": 0.7653753598871738, "learning_rate": 3.014825819926869e-08, "loss": 0.0086, "step": 9770 }, { "epoch": 4.44540491355778, "grad_norm": 0.5604088615979548, "learning_rate": 3.0099396860022274e-08, "loss": 0.0091, "step": 9771 }, { "epoch": 4.445859872611465, "grad_norm": 0.7863085563491936, "learning_rate": 3.005057391893095e-08, "loss": 0.0119, "step": 9772 }, { "epoch": 4.44631483166515, "grad_norm": 0.44607915245311713, "learning_rate": 3.000178937998454e-08, "loss": 0.0044, "step": 9773 }, { "epoch": 4.446769790718835, "grad_norm": 0.4293227425174771, "learning_rate": 2.995304324716935e-08, "loss": 0.0065, "step": 9774 }, { "epoch": 4.447224749772521, "grad_norm": 1.091648460897179, "learning_rate": 2.990433552446886e-08, "loss": 0.0073, "step": 9775 }, { "epoch": 4.447679708826206, "grad_norm": 0.5218625811606148, "learning_rate": 2.985566621586311e-08, "loss": 0.0066, "step": 9776 }, { "epoch": 4.4481346678798905, "grad_norm": 0.5027407983227871, "learning_rate": 2.9807035325329134e-08, "loss": 0.0086, "step": 9777 }, { "epoch": 4.448589626933576, "grad_norm": 0.31341162469848205, "learning_rate": 2.9758442856840814e-08, "loss": 0.0037, "step": 9778 }, { "epoch": 4.449044585987261, "grad_norm": 0.9066697955643562, "learning_rate": 2.970988881436892e-08, "loss": 0.0069, "step": 9779 }, { "epoch": 4.449499545040946, "grad_norm": 0.7257904341496284, "learning_rate": 2.9661373201881224e-08, "loss": 0.0044, "step": 9780 }, { "epoch": 4.449954504094632, "grad_norm": 0.5292278420505875, "learning_rate": 2.961289602334205e-08, "loss": 0.0038, "step": 9781 }, { "epoch": 4.450409463148317, "grad_norm": 0.5250716610040097, "learning_rate": 2.9564457282712783e-08, "loss": 0.0043, "step": 9782 }, { "epoch": 4.4508644222020015, "grad_norm": 0.5178043929057295, "learning_rate": 2.9516056983951488e-08, "loss": 0.0062, "step": 9783 }, { "epoch": 4.451319381255687, "grad_norm": 0.5242398570893938, "learning_rate": 2.9467695131013436e-08, "loss": 0.0046, "step": 9784 }, { "epoch": 4.451774340309372, "grad_norm": 0.8386598192503755, "learning_rate": 2.9419371727850418e-08, "loss": 0.0108, "step": 9785 }, { "epoch": 4.452229299363057, "grad_norm": 0.9235442369519195, "learning_rate": 2.937108677841116e-08, "loss": 0.0045, "step": 9786 }, { "epoch": 4.452684258416743, "grad_norm": 0.7087560573794023, "learning_rate": 2.9322840286641448e-08, "loss": 0.0071, "step": 9787 }, { "epoch": 4.453139217470428, "grad_norm": 0.4986085957855692, "learning_rate": 2.9274632256483523e-08, "loss": 0.0047, "step": 9788 }, { "epoch": 4.4535941765241125, "grad_norm": 0.6408956765286971, "learning_rate": 2.9226462691877007e-08, "loss": 0.0068, "step": 9789 }, { "epoch": 4.454049135577798, "grad_norm": 1.6152356793635616, "learning_rate": 2.917833159675792e-08, "loss": 0.0072, "step": 9790 }, { "epoch": 4.454504094631483, "grad_norm": 0.4839081649242538, "learning_rate": 2.9130238975059396e-08, "loss": 0.0026, "step": 9791 }, { "epoch": 4.454959053685168, "grad_norm": 0.381745413988445, "learning_rate": 2.9082184830711232e-08, "loss": 0.0037, "step": 9792 }, { "epoch": 4.455414012738854, "grad_norm": 0.7922495007801975, "learning_rate": 2.903416916764029e-08, "loss": 0.0094, "step": 9793 }, { "epoch": 4.455868971792539, "grad_norm": 0.4131757290097048, "learning_rate": 2.8986191989770258e-08, "loss": 0.0019, "step": 9794 }, { "epoch": 4.4563239308462235, "grad_norm": 0.6170822319376147, "learning_rate": 2.893825330102151e-08, "loss": 0.006, "step": 9795 }, { "epoch": 4.456778889899909, "grad_norm": 0.5729561954129014, "learning_rate": 2.8890353105311404e-08, "loss": 0.0144, "step": 9796 }, { "epoch": 4.457233848953594, "grad_norm": 0.5639677824437043, "learning_rate": 2.8842491406554092e-08, "loss": 0.0034, "step": 9797 }, { "epoch": 4.457688808007279, "grad_norm": 1.332452715183351, "learning_rate": 2.8794668208660668e-08, "loss": 0.0097, "step": 9798 }, { "epoch": 4.458143767060965, "grad_norm": 0.38590382459985084, "learning_rate": 2.8746883515539055e-08, "loss": 0.0035, "step": 9799 }, { "epoch": 4.45859872611465, "grad_norm": 0.4480069533936494, "learning_rate": 2.869913733109386e-08, "loss": 0.0021, "step": 9800 }, { "epoch": 4.4590536851683344, "grad_norm": 0.8101031507432261, "learning_rate": 2.86514296592269e-08, "loss": 0.0033, "step": 9801 }, { "epoch": 4.45950864422202, "grad_norm": 1.1682002985194309, "learning_rate": 2.8603760503836448e-08, "loss": 0.0105, "step": 9802 }, { "epoch": 4.459963603275705, "grad_norm": 0.8164534100740987, "learning_rate": 2.855612986881789e-08, "loss": 0.0097, "step": 9803 }, { "epoch": 4.460418562329391, "grad_norm": 0.6065000223336433, "learning_rate": 2.8508537758063444e-08, "loss": 0.0073, "step": 9804 }, { "epoch": 4.460873521383076, "grad_norm": 1.0249641422598146, "learning_rate": 2.8460984175462055e-08, "loss": 0.0109, "step": 9805 }, { "epoch": 4.461328480436761, "grad_norm": 0.4706483836245795, "learning_rate": 2.841346912489956e-08, "loss": 0.0026, "step": 9806 }, { "epoch": 4.461783439490446, "grad_norm": 0.4477740180018138, "learning_rate": 2.8365992610258517e-08, "loss": 0.0065, "step": 9807 }, { "epoch": 4.462238398544131, "grad_norm": 0.5414366138621477, "learning_rate": 2.831855463541888e-08, "loss": 0.0036, "step": 9808 }, { "epoch": 4.462693357597816, "grad_norm": 1.4488836420502123, "learning_rate": 2.8271155204256826e-08, "loss": 0.019, "step": 9809 }, { "epoch": 4.463148316651502, "grad_norm": 0.529089900864664, "learning_rate": 2.82237943206457e-08, "loss": 0.0065, "step": 9810 }, { "epoch": 4.463603275705187, "grad_norm": 0.870117073156538, "learning_rate": 2.8176471988455575e-08, "loss": 0.0044, "step": 9811 }, { "epoch": 4.4640582347588715, "grad_norm": 0.46125123004511864, "learning_rate": 2.8129188211553358e-08, "loss": 0.0062, "step": 9812 }, { "epoch": 4.464513193812557, "grad_norm": 0.40673441585867126, "learning_rate": 2.8081942993802953e-08, "loss": 0.0043, "step": 9813 }, { "epoch": 4.464968152866242, "grad_norm": 0.5538518730400293, "learning_rate": 2.8034736339064947e-08, "loss": 0.0038, "step": 9814 }, { "epoch": 4.465423111919927, "grad_norm": 0.3276998654364444, "learning_rate": 2.7987568251197024e-08, "loss": 0.0018, "step": 9815 }, { "epoch": 4.465878070973613, "grad_norm": 0.4424332870440619, "learning_rate": 2.7940438734053385e-08, "loss": 0.0068, "step": 9816 }, { "epoch": 4.466333030027298, "grad_norm": 0.5402768996087958, "learning_rate": 2.7893347791485276e-08, "loss": 0.005, "step": 9817 }, { "epoch": 4.4667879890809825, "grad_norm": 0.6158926091895408, "learning_rate": 2.7846295427340792e-08, "loss": 0.0087, "step": 9818 }, { "epoch": 4.467242948134668, "grad_norm": 0.5832194438374385, "learning_rate": 2.77992816454648e-08, "loss": 0.0049, "step": 9819 }, { "epoch": 4.467697907188353, "grad_norm": 0.5045086645054491, "learning_rate": 2.775230644969906e-08, "loss": 0.0058, "step": 9820 }, { "epoch": 4.468152866242038, "grad_norm": 0.7951644043820002, "learning_rate": 2.770536984388222e-08, "loss": 0.0046, "step": 9821 }, { "epoch": 4.468607825295724, "grad_norm": 0.8363902507280396, "learning_rate": 2.765847183184966e-08, "loss": 0.0122, "step": 9822 }, { "epoch": 4.469062784349409, "grad_norm": 0.35834714979263627, "learning_rate": 2.7611612417433706e-08, "loss": 0.0032, "step": 9823 }, { "epoch": 4.4695177434030935, "grad_norm": 0.4065053310159229, "learning_rate": 2.7564791604463568e-08, "loss": 0.0039, "step": 9824 }, { "epoch": 4.469972702456779, "grad_norm": 1.2926467561968216, "learning_rate": 2.751800939676513e-08, "loss": 0.0133, "step": 9825 }, { "epoch": 4.470427661510464, "grad_norm": 0.5806090399702154, "learning_rate": 2.7471265798161168e-08, "loss": 0.0062, "step": 9826 }, { "epoch": 4.470882620564149, "grad_norm": 0.6592065083424448, "learning_rate": 2.7424560812471463e-08, "loss": 0.0079, "step": 9827 }, { "epoch": 4.471337579617835, "grad_norm": 1.6846124064807075, "learning_rate": 2.7377894443512627e-08, "loss": 0.0115, "step": 9828 }, { "epoch": 4.47179253867152, "grad_norm": 0.4250370203752944, "learning_rate": 2.733126669509789e-08, "loss": 0.0033, "step": 9829 }, { "epoch": 4.4722474977252045, "grad_norm": 1.4618151389194116, "learning_rate": 2.7284677571037474e-08, "loss": 0.0077, "step": 9830 }, { "epoch": 4.47270245677889, "grad_norm": 0.6918293227123787, "learning_rate": 2.723812707513834e-08, "loss": 0.0062, "step": 9831 }, { "epoch": 4.473157415832575, "grad_norm": 0.6566141862379631, "learning_rate": 2.719161521120461e-08, "loss": 0.0048, "step": 9832 }, { "epoch": 4.47361237488626, "grad_norm": 0.44620037112564004, "learning_rate": 2.7145141983036856e-08, "loss": 0.0054, "step": 9833 }, { "epoch": 4.474067333939946, "grad_norm": 0.7017613670190467, "learning_rate": 2.709870739443265e-08, "loss": 0.0105, "step": 9834 }, { "epoch": 4.474522292993631, "grad_norm": 0.900182671316605, "learning_rate": 2.705231144918657e-08, "loss": 0.0037, "step": 9835 }, { "epoch": 4.4749772520473154, "grad_norm": 0.56045861720922, "learning_rate": 2.7005954151089693e-08, "loss": 0.0113, "step": 9836 }, { "epoch": 4.475432211101001, "grad_norm": 0.24381219697588125, "learning_rate": 2.695963550393032e-08, "loss": 0.0009, "step": 9837 }, { "epoch": 4.475887170154686, "grad_norm": 1.4782013301501133, "learning_rate": 2.6913355511493264e-08, "loss": 0.0051, "step": 9838 }, { "epoch": 4.476342129208371, "grad_norm": 0.9928103900334109, "learning_rate": 2.6867114177560378e-08, "loss": 0.0123, "step": 9839 }, { "epoch": 4.476797088262057, "grad_norm": 0.6396150306739176, "learning_rate": 2.682091150591026e-08, "loss": 0.0172, "step": 9840 }, { "epoch": 4.477252047315742, "grad_norm": 0.6225903923992453, "learning_rate": 2.6774747500318217e-08, "loss": 0.0108, "step": 9841 }, { "epoch": 4.477707006369426, "grad_norm": 1.2122311481191685, "learning_rate": 2.6728622164556958e-08, "loss": 0.0082, "step": 9842 }, { "epoch": 4.478161965423112, "grad_norm": 0.6636109383169375, "learning_rate": 2.6682535502395354e-08, "loss": 0.0187, "step": 9843 }, { "epoch": 4.478616924476797, "grad_norm": 0.6868369313319344, "learning_rate": 2.6636487517599504e-08, "loss": 0.0138, "step": 9844 }, { "epoch": 4.479071883530482, "grad_norm": 0.713026007864059, "learning_rate": 2.659047821393223e-08, "loss": 0.0074, "step": 9845 }, { "epoch": 4.479526842584168, "grad_norm": 0.6247351791744379, "learning_rate": 2.654450759515303e-08, "loss": 0.0077, "step": 9846 }, { "epoch": 4.4799818016378525, "grad_norm": 0.8908376987348209, "learning_rate": 2.649857566501873e-08, "loss": 0.0035, "step": 9847 }, { "epoch": 4.480436760691537, "grad_norm": 0.462341542099677, "learning_rate": 2.645268242728238e-08, "loss": 0.0062, "step": 9848 }, { "epoch": 4.480891719745223, "grad_norm": 0.43694505236251613, "learning_rate": 2.640682788569437e-08, "loss": 0.0045, "step": 9849 }, { "epoch": 4.481346678798908, "grad_norm": 0.5585017479317337, "learning_rate": 2.6361012044001652e-08, "loss": 0.0101, "step": 9850 }, { "epoch": 4.481801637852593, "grad_norm": 0.524437733785625, "learning_rate": 2.631523490594806e-08, "loss": 0.0091, "step": 9851 }, { "epoch": 4.482256596906279, "grad_norm": 0.4004185264618325, "learning_rate": 2.6269496475274377e-08, "loss": 0.0025, "step": 9852 }, { "epoch": 4.4827115559599635, "grad_norm": 0.17261973880156106, "learning_rate": 2.622379675571812e-08, "loss": 0.0009, "step": 9853 }, { "epoch": 4.483166515013648, "grad_norm": 0.2662768358984416, "learning_rate": 2.6178135751013575e-08, "loss": 0.0022, "step": 9854 }, { "epoch": 4.483621474067334, "grad_norm": 0.5108067854617407, "learning_rate": 2.6132513464892038e-08, "loss": 0.0061, "step": 9855 }, { "epoch": 4.484076433121019, "grad_norm": 0.6212992196003643, "learning_rate": 2.6086929901081466e-08, "loss": 0.0127, "step": 9856 }, { "epoch": 4.484531392174704, "grad_norm": 0.507373880728708, "learning_rate": 2.6041385063306887e-08, "loss": 0.0076, "step": 9857 }, { "epoch": 4.48498635122839, "grad_norm": 0.40295090397917943, "learning_rate": 2.599587895528993e-08, "loss": 0.0032, "step": 9858 }, { "epoch": 4.4854413102820745, "grad_norm": 0.34858167835582227, "learning_rate": 2.595041158074923e-08, "loss": 0.0025, "step": 9859 }, { "epoch": 4.485896269335759, "grad_norm": 0.6298624345316199, "learning_rate": 2.590498294339999e-08, "loss": 0.0154, "step": 9860 }, { "epoch": 4.486351228389445, "grad_norm": 0.48082628556606943, "learning_rate": 2.585959304695462e-08, "loss": 0.0074, "step": 9861 }, { "epoch": 4.48680618744313, "grad_norm": 0.6403640786160388, "learning_rate": 2.5814241895122048e-08, "loss": 0.0039, "step": 9862 }, { "epoch": 4.487261146496815, "grad_norm": 0.8011264988773946, "learning_rate": 2.576892949160825e-08, "loss": 0.017, "step": 9863 }, { "epoch": 4.487716105550501, "grad_norm": 0.4057520844006818, "learning_rate": 2.572365584011599e-08, "loss": 0.0045, "step": 9864 }, { "epoch": 4.4881710646041855, "grad_norm": 0.5307066182429567, "learning_rate": 2.5678420944344637e-08, "loss": 0.0051, "step": 9865 }, { "epoch": 4.48862602365787, "grad_norm": 0.6068936454570855, "learning_rate": 2.5633224807990794e-08, "loss": 0.0032, "step": 9866 }, { "epoch": 4.489080982711556, "grad_norm": 0.42982579514405045, "learning_rate": 2.5588067434747616e-08, "loss": 0.005, "step": 9867 }, { "epoch": 4.489535941765241, "grad_norm": 0.4967157836177433, "learning_rate": 2.55429488283051e-08, "loss": 0.0057, "step": 9868 }, { "epoch": 4.489990900818927, "grad_norm": 1.0615929350571376, "learning_rate": 2.549786899235018e-08, "loss": 0.0023, "step": 9869 }, { "epoch": 4.490445859872612, "grad_norm": 0.7396845234540385, "learning_rate": 2.5452827930566523e-08, "loss": 0.0136, "step": 9870 }, { "epoch": 4.4909008189262964, "grad_norm": 0.8046376236716299, "learning_rate": 2.5407825646634794e-08, "loss": 0.0096, "step": 9871 }, { "epoch": 4.491355777979982, "grad_norm": 0.5874480030504854, "learning_rate": 2.5362862144232333e-08, "loss": 0.004, "step": 9872 }, { "epoch": 4.491810737033667, "grad_norm": 7.102707226726423, "learning_rate": 2.531793742703331e-08, "loss": 0.019, "step": 9873 }, { "epoch": 4.492265696087352, "grad_norm": 0.4135212842883121, "learning_rate": 2.5273051498708787e-08, "loss": 0.0031, "step": 9874 }, { "epoch": 4.492720655141038, "grad_norm": 0.6737884525173656, "learning_rate": 2.5228204362926663e-08, "loss": 0.0068, "step": 9875 }, { "epoch": 4.4931756141947226, "grad_norm": 0.5075548836228554, "learning_rate": 2.5183396023351565e-08, "loss": 0.0025, "step": 9876 }, { "epoch": 4.493630573248407, "grad_norm": 0.7111284767436471, "learning_rate": 2.5138626483645176e-08, "loss": 0.0038, "step": 9877 }, { "epoch": 4.494085532302093, "grad_norm": 0.7737078412297653, "learning_rate": 2.509389574746579e-08, "loss": 0.0147, "step": 9878 }, { "epoch": 4.494540491355778, "grad_norm": 0.6028246497124277, "learning_rate": 2.5049203818468535e-08, "loss": 0.0105, "step": 9879 }, { "epoch": 4.494995450409463, "grad_norm": 0.5157159929806957, "learning_rate": 2.5004550700305437e-08, "loss": 0.0071, "step": 9880 }, { "epoch": 4.495450409463149, "grad_norm": 0.608697980781103, "learning_rate": 2.495993639662547e-08, "loss": 0.0077, "step": 9881 }, { "epoch": 4.4959053685168335, "grad_norm": 0.30648892888605817, "learning_rate": 2.4915360911074267e-08, "loss": 0.0051, "step": 9882 }, { "epoch": 4.496360327570518, "grad_norm": 0.5009709145203952, "learning_rate": 2.48708242472942e-08, "loss": 0.0056, "step": 9883 }, { "epoch": 4.496815286624204, "grad_norm": 0.7937509010315897, "learning_rate": 2.482632640892479e-08, "loss": 0.0042, "step": 9884 }, { "epoch": 4.497270245677889, "grad_norm": 0.5319844244539399, "learning_rate": 2.478186739960203e-08, "loss": 0.0087, "step": 9885 }, { "epoch": 4.497725204731574, "grad_norm": 0.5642354504975257, "learning_rate": 2.4737447222959118e-08, "loss": 0.0101, "step": 9886 }, { "epoch": 4.49818016378526, "grad_norm": 0.49453212972577615, "learning_rate": 2.4693065882625707e-08, "loss": 0.0051, "step": 9887 }, { "epoch": 4.4986351228389445, "grad_norm": 0.7012554746101831, "learning_rate": 2.464872338222851e-08, "loss": 0.0061, "step": 9888 }, { "epoch": 4.499090081892629, "grad_norm": 0.39676080149003307, "learning_rate": 2.46044197253909e-08, "loss": 0.0035, "step": 9889 }, { "epoch": 4.499545040946315, "grad_norm": 0.4277121205697935, "learning_rate": 2.4560154915733267e-08, "loss": 0.0047, "step": 9890 }, { "epoch": 4.5, "grad_norm": 0.34903668111172487, "learning_rate": 2.4515928956872712e-08, "loss": 0.0018, "step": 9891 }, { "epoch": 4.500454959053685, "grad_norm": 0.8722454908478626, "learning_rate": 2.4471741852423233e-08, "loss": 0.0089, "step": 9892 }, { "epoch": 4.500909918107371, "grad_norm": 0.9924069652186952, "learning_rate": 2.44275936059955e-08, "loss": 0.0109, "step": 9893 }, { "epoch": 4.5013648771610555, "grad_norm": 0.7242070615673829, "learning_rate": 2.4383484221197125e-08, "loss": 0.0077, "step": 9894 }, { "epoch": 4.50181983621474, "grad_norm": 0.9528965968908777, "learning_rate": 2.4339413701632615e-08, "loss": 0.0046, "step": 9895 }, { "epoch": 4.502274795268426, "grad_norm": 1.421591403514525, "learning_rate": 2.4295382050903146e-08, "loss": 0.0125, "step": 9896 }, { "epoch": 4.502729754322111, "grad_norm": 0.6862767609958665, "learning_rate": 2.425138927260667e-08, "loss": 0.0186, "step": 9897 }, { "epoch": 4.503184713375796, "grad_norm": 0.7705323581991024, "learning_rate": 2.4207435370338368e-08, "loss": 0.0058, "step": 9898 }, { "epoch": 4.503639672429482, "grad_norm": 0.7423438781880114, "learning_rate": 2.4163520347689647e-08, "loss": 0.0081, "step": 9899 }, { "epoch": 4.5040946314831665, "grad_norm": 0.8926200650269248, "learning_rate": 2.4119644208249247e-08, "loss": 0.0089, "step": 9900 }, { "epoch": 4.504549590536851, "grad_norm": 0.9065010018211108, "learning_rate": 2.4075806955602518e-08, "loss": 0.0081, "step": 9901 }, { "epoch": 4.505004549590537, "grad_norm": 0.5678447900729378, "learning_rate": 2.4032008593331544e-08, "loss": 0.0049, "step": 9902 }, { "epoch": 4.505459508644222, "grad_norm": 0.6749425353294174, "learning_rate": 2.3988249125015346e-08, "loss": 0.0054, "step": 9903 }, { "epoch": 4.505914467697907, "grad_norm": 0.8342744374245709, "learning_rate": 2.3944528554229793e-08, "loss": 0.0061, "step": 9904 }, { "epoch": 4.506369426751593, "grad_norm": 0.5870198901481012, "learning_rate": 2.390084688454752e-08, "loss": 0.0059, "step": 9905 }, { "epoch": 4.5068243858052774, "grad_norm": 0.9374215116739136, "learning_rate": 2.385720411953801e-08, "loss": 0.0164, "step": 9906 }, { "epoch": 4.507279344858962, "grad_norm": 0.794103994919594, "learning_rate": 2.3813600262767576e-08, "loss": 0.011, "step": 9907 }, { "epoch": 4.507734303912648, "grad_norm": 0.6365810394605567, "learning_rate": 2.3770035317799196e-08, "loss": 0.0064, "step": 9908 }, { "epoch": 4.508189262966333, "grad_norm": 0.5287702183251018, "learning_rate": 2.3726509288192975e-08, "loss": 0.0087, "step": 9909 }, { "epoch": 4.508644222020019, "grad_norm": 0.5429660047652883, "learning_rate": 2.3683022177505563e-08, "loss": 0.0029, "step": 9910 }, { "epoch": 4.5090991810737036, "grad_norm": 0.4105083978885969, "learning_rate": 2.3639573989290508e-08, "loss": 0.005, "step": 9911 }, { "epoch": 4.509554140127388, "grad_norm": 0.5822375602920646, "learning_rate": 2.35961647270983e-08, "loss": 0.0033, "step": 9912 }, { "epoch": 4.510009099181074, "grad_norm": 0.6822166218188481, "learning_rate": 2.355279439447605e-08, "loss": 0.0049, "step": 9913 }, { "epoch": 4.510464058234759, "grad_norm": 0.6669641601764383, "learning_rate": 2.3509462994967864e-08, "loss": 0.0066, "step": 9914 }, { "epoch": 4.510919017288444, "grad_norm": 1.747568265463525, "learning_rate": 2.3466170532114526e-08, "loss": 0.0074, "step": 9915 }, { "epoch": 4.51137397634213, "grad_norm": 0.4263861915668347, "learning_rate": 2.342291700945376e-08, "loss": 0.005, "step": 9916 }, { "epoch": 4.5118289353958145, "grad_norm": 0.7810033888356654, "learning_rate": 2.337970243052001e-08, "loss": 0.0124, "step": 9917 }, { "epoch": 4.512283894449499, "grad_norm": 0.4969612720326647, "learning_rate": 2.3336526798844515e-08, "loss": 0.0049, "step": 9918 }, { "epoch": 4.512738853503185, "grad_norm": 0.46301878532542295, "learning_rate": 2.329339011795539e-08, "loss": 0.0023, "step": 9919 }, { "epoch": 4.51319381255687, "grad_norm": 0.43944993481173894, "learning_rate": 2.3250292391377767e-08, "loss": 0.002, "step": 9920 }, { "epoch": 4.513648771610555, "grad_norm": 0.5170103361675482, "learning_rate": 2.3207233622633272e-08, "loss": 0.0052, "step": 9921 }, { "epoch": 4.514103730664241, "grad_norm": 0.49538528126977593, "learning_rate": 2.3164213815240475e-08, "loss": 0.0073, "step": 9922 }, { "epoch": 4.5145586897179255, "grad_norm": 0.31420065904293176, "learning_rate": 2.3121232972714677e-08, "loss": 0.0013, "step": 9923 }, { "epoch": 4.51501364877161, "grad_norm": 0.6749654953374163, "learning_rate": 2.3078291098568182e-08, "loss": 0.0038, "step": 9924 }, { "epoch": 4.515468607825296, "grad_norm": 0.5926278218853713, "learning_rate": 2.3035388196309956e-08, "loss": 0.0085, "step": 9925 }, { "epoch": 4.515923566878981, "grad_norm": 0.44602008999740367, "learning_rate": 2.299252426944587e-08, "loss": 0.0069, "step": 9926 }, { "epoch": 4.516378525932666, "grad_norm": 0.7255702003794869, "learning_rate": 2.294969932147861e-08, "loss": 0.0136, "step": 9927 }, { "epoch": 4.516833484986352, "grad_norm": 0.36936891855571125, "learning_rate": 2.2906913355907498e-08, "loss": 0.0035, "step": 9928 }, { "epoch": 4.5172884440400365, "grad_norm": 0.47524600790740323, "learning_rate": 2.2864166376228956e-08, "loss": 0.0055, "step": 9929 }, { "epoch": 4.517743403093721, "grad_norm": 0.6449275863764974, "learning_rate": 2.282145838593602e-08, "loss": 0.008, "step": 9930 }, { "epoch": 4.518198362147407, "grad_norm": 0.5537625301746115, "learning_rate": 2.2778789388518572e-08, "loss": 0.0057, "step": 9931 }, { "epoch": 4.518653321201092, "grad_norm": 0.5309388153125864, "learning_rate": 2.273615938746326e-08, "loss": 0.0056, "step": 9932 }, { "epoch": 4.519108280254777, "grad_norm": 0.36294326731489424, "learning_rate": 2.2693568386253746e-08, "loss": 0.0038, "step": 9933 }, { "epoch": 4.519563239308463, "grad_norm": 0.3062047429740168, "learning_rate": 2.2651016388370357e-08, "loss": 0.002, "step": 9934 }, { "epoch": 4.5200181983621475, "grad_norm": 0.7809491945817094, "learning_rate": 2.26085033972902e-08, "loss": 0.0061, "step": 9935 }, { "epoch": 4.520473157415832, "grad_norm": 0.28825954524430825, "learning_rate": 2.2566029416487332e-08, "loss": 0.0027, "step": 9936 }, { "epoch": 4.520928116469518, "grad_norm": 0.6445223262991536, "learning_rate": 2.252359444943236e-08, "loss": 0.0062, "step": 9937 }, { "epoch": 4.521383075523203, "grad_norm": 0.50192260910149, "learning_rate": 2.2481198499593067e-08, "loss": 0.0041, "step": 9938 }, { "epoch": 4.521838034576888, "grad_norm": 0.7460875011806962, "learning_rate": 2.2438841570433685e-08, "loss": 0.0074, "step": 9939 }, { "epoch": 4.522292993630574, "grad_norm": 0.866805934934162, "learning_rate": 2.2396523665415612e-08, "loss": 0.007, "step": 9940 }, { "epoch": 4.522747952684258, "grad_norm": 0.6439765737316268, "learning_rate": 2.2354244787996744e-08, "loss": 0.0111, "step": 9941 }, { "epoch": 4.523202911737943, "grad_norm": 0.6607430142424785, "learning_rate": 2.2312004941631935e-08, "loss": 0.0056, "step": 9942 }, { "epoch": 4.523657870791629, "grad_norm": 0.7975309907113765, "learning_rate": 2.226980412977292e-08, "loss": 0.0034, "step": 9943 }, { "epoch": 4.524112829845314, "grad_norm": 0.3467356379319141, "learning_rate": 2.2227642355868105e-08, "loss": 0.0063, "step": 9944 }, { "epoch": 4.524567788898999, "grad_norm": 1.4222260024486766, "learning_rate": 2.2185519623362737e-08, "loss": 0.0112, "step": 9945 }, { "epoch": 4.5250227479526846, "grad_norm": 0.3935271306290487, "learning_rate": 2.2143435935698895e-08, "loss": 0.0056, "step": 9946 }, { "epoch": 4.525477707006369, "grad_norm": 0.48576043876550595, "learning_rate": 2.210139129631544e-08, "loss": 0.01, "step": 9947 }, { "epoch": 4.525932666060054, "grad_norm": 0.6733926615543592, "learning_rate": 2.205938570864818e-08, "loss": 0.0038, "step": 9948 }, { "epoch": 4.52638762511374, "grad_norm": 0.4235542571141185, "learning_rate": 2.201741917612959e-08, "loss": 0.0063, "step": 9949 }, { "epoch": 4.526842584167425, "grad_norm": 0.6800068422832263, "learning_rate": 2.1975491702188987e-08, "loss": 0.0074, "step": 9950 }, { "epoch": 4.52729754322111, "grad_norm": 0.5898766358905241, "learning_rate": 2.1933603290252402e-08, "loss": 0.0089, "step": 9951 }, { "epoch": 4.5277525022747955, "grad_norm": 0.3915660285755576, "learning_rate": 2.1891753943742764e-08, "loss": 0.0053, "step": 9952 }, { "epoch": 4.52820746132848, "grad_norm": 0.41669415836641666, "learning_rate": 2.1849943666079895e-08, "loss": 0.0033, "step": 9953 }, { "epoch": 4.528662420382165, "grad_norm": 0.8320694527216056, "learning_rate": 2.1808172460680396e-08, "loss": 0.0115, "step": 9954 }, { "epoch": 4.529117379435851, "grad_norm": 0.9150482968771522, "learning_rate": 2.1766440330957536e-08, "loss": 0.005, "step": 9955 }, { "epoch": 4.529572338489536, "grad_norm": 0.5699150256361648, "learning_rate": 2.1724747280321477e-08, "loss": 0.01, "step": 9956 }, { "epoch": 4.530027297543221, "grad_norm": 0.7164559472843673, "learning_rate": 2.1683093312179157e-08, "loss": 0.0064, "step": 9957 }, { "epoch": 4.5304822565969065, "grad_norm": 0.5380014803844538, "learning_rate": 2.164147842993441e-08, "loss": 0.0056, "step": 9958 }, { "epoch": 4.530937215650591, "grad_norm": 0.524518102057639, "learning_rate": 2.1599902636987854e-08, "loss": 0.0051, "step": 9959 }, { "epoch": 4.531392174704276, "grad_norm": 0.2308426975395599, "learning_rate": 2.1558365936736712e-08, "loss": 0.0007, "step": 9960 }, { "epoch": 4.531847133757962, "grad_norm": 0.7501838325574499, "learning_rate": 2.1516868332575378e-08, "loss": 0.0057, "step": 9961 }, { "epoch": 4.532302092811647, "grad_norm": 0.7817360161607243, "learning_rate": 2.1475409827894698e-08, "loss": 0.0055, "step": 9962 }, { "epoch": 4.532757051865332, "grad_norm": 0.6929941087948313, "learning_rate": 2.1433990426082572e-08, "loss": 0.0082, "step": 9963 }, { "epoch": 4.5332120109190175, "grad_norm": 0.9322362129438213, "learning_rate": 2.1392610130523568e-08, "loss": 0.0045, "step": 9964 }, { "epoch": 4.533666969972702, "grad_norm": 0.5066152560871506, "learning_rate": 2.13512689445991e-08, "loss": 0.0074, "step": 9965 }, { "epoch": 4.534121929026387, "grad_norm": 0.6465724817550283, "learning_rate": 2.130996687168729e-08, "loss": 0.011, "step": 9966 }, { "epoch": 4.534576888080073, "grad_norm": 1.188930480023469, "learning_rate": 2.1268703915163223e-08, "loss": 0.0082, "step": 9967 }, { "epoch": 4.535031847133758, "grad_norm": 0.7667958186017533, "learning_rate": 2.122748007839886e-08, "loss": 0.0064, "step": 9968 }, { "epoch": 4.535486806187443, "grad_norm": 0.4318827009089821, "learning_rate": 2.1186295364762685e-08, "loss": 0.0064, "step": 9969 }, { "epoch": 4.5359417652411285, "grad_norm": 0.6397397950013772, "learning_rate": 2.114514977762011e-08, "loss": 0.0165, "step": 9970 }, { "epoch": 4.536396724294813, "grad_norm": 0.5046346103246676, "learning_rate": 2.1104043320333387e-08, "loss": 0.0091, "step": 9971 }, { "epoch": 4.536851683348498, "grad_norm": 0.5770564989672669, "learning_rate": 2.106297599626161e-08, "loss": 0.0073, "step": 9972 }, { "epoch": 4.537306642402184, "grad_norm": 1.0857877297980039, "learning_rate": 2.1021947808760598e-08, "loss": 0.0112, "step": 9973 }, { "epoch": 4.537761601455869, "grad_norm": 0.73555766393218, "learning_rate": 2.0980958761182887e-08, "loss": 0.0203, "step": 9974 }, { "epoch": 4.538216560509554, "grad_norm": 0.5094993572347777, "learning_rate": 2.0940008856878078e-08, "loss": 0.0084, "step": 9975 }, { "epoch": 4.538671519563239, "grad_norm": 0.9039451866835201, "learning_rate": 2.089909809919227e-08, "loss": 0.0092, "step": 9976 }, { "epoch": 4.539126478616924, "grad_norm": 0.23640207591032128, "learning_rate": 2.0858226491468567e-08, "loss": 0.0012, "step": 9977 }, { "epoch": 4.539581437670609, "grad_norm": 0.2805346575879051, "learning_rate": 2.0817394037046852e-08, "loss": 0.0021, "step": 9978 }, { "epoch": 4.540036396724295, "grad_norm": 0.3848667801811919, "learning_rate": 2.0776600739263737e-08, "loss": 0.0041, "step": 9979 }, { "epoch": 4.54049135577798, "grad_norm": 0.4900334981822392, "learning_rate": 2.073584660145261e-08, "loss": 0.0039, "step": 9980 }, { "epoch": 4.540946314831665, "grad_norm": 0.45229784500776277, "learning_rate": 2.0695131626943694e-08, "loss": 0.0048, "step": 9981 }, { "epoch": 4.54140127388535, "grad_norm": 0.384219231265401, "learning_rate": 2.065445581906422e-08, "loss": 0.0024, "step": 9982 }, { "epoch": 4.541856232939035, "grad_norm": 0.5221381553470127, "learning_rate": 2.0613819181137913e-08, "loss": 0.0036, "step": 9983 }, { "epoch": 4.542311191992721, "grad_norm": 0.9183401032749902, "learning_rate": 2.05732217164854e-08, "loss": 0.0069, "step": 9984 }, { "epoch": 4.542766151046406, "grad_norm": 0.669025043847742, "learning_rate": 2.0532663428424134e-08, "loss": 0.0052, "step": 9985 }, { "epoch": 4.543221110100091, "grad_norm": 0.769665401492441, "learning_rate": 2.0492144320268245e-08, "loss": 0.0081, "step": 9986 }, { "epoch": 4.5436760691537765, "grad_norm": 0.726511937753534, "learning_rate": 2.0451664395329026e-08, "loss": 0.0042, "step": 9987 }, { "epoch": 4.544131028207461, "grad_norm": 0.4732007403420125, "learning_rate": 2.0411223656914055e-08, "loss": 0.0029, "step": 9988 }, { "epoch": 4.544585987261146, "grad_norm": 0.6879115455426232, "learning_rate": 2.037082210832819e-08, "loss": 0.0086, "step": 9989 }, { "epoch": 4.545040946314832, "grad_norm": 0.6516012944637154, "learning_rate": 2.033045975287273e-08, "loss": 0.014, "step": 9990 }, { "epoch": 4.545495905368517, "grad_norm": 0.6203399663835725, "learning_rate": 2.0290136593845818e-08, "loss": 0.0108, "step": 9991 }, { "epoch": 4.545950864422202, "grad_norm": 0.2180503771387872, "learning_rate": 2.0249852634542707e-08, "loss": 0.0019, "step": 9992 }, { "epoch": 4.5464058234758875, "grad_norm": 0.6058832248539266, "learning_rate": 2.020960787825515e-08, "loss": 0.0072, "step": 9993 }, { "epoch": 4.546860782529572, "grad_norm": 0.39154261110054916, "learning_rate": 2.0169402328271635e-08, "loss": 0.0032, "step": 9994 }, { "epoch": 4.547315741583257, "grad_norm": 0.4533302895086245, "learning_rate": 2.012923598787769e-08, "loss": 0.0074, "step": 9995 }, { "epoch": 4.547770700636943, "grad_norm": 0.5999198343671889, "learning_rate": 2.008910886035542e-08, "loss": 0.0033, "step": 9996 }, { "epoch": 4.548225659690628, "grad_norm": 0.7431487034171155, "learning_rate": 2.004902094898403e-08, "loss": 0.0056, "step": 9997 }, { "epoch": 4.548680618744313, "grad_norm": 0.7630921145256772, "learning_rate": 2.000897225703918e-08, "loss": 0.0128, "step": 9998 }, { "epoch": 4.5491355777979985, "grad_norm": 0.3471695965393274, "learning_rate": 1.996896278779353e-08, "loss": 0.0027, "step": 9999 }, { "epoch": 4.549590536851683, "grad_norm": 1.1883263885863007, "learning_rate": 1.9928992544516354e-08, "loss": 0.0029, "step": 10000 }, { "epoch": 4.550045495905368, "grad_norm": 0.890305445047881, "learning_rate": 1.9889061530473984e-08, "loss": 0.0219, "step": 10001 }, { "epoch": 4.550500454959054, "grad_norm": 0.3737943939384319, "learning_rate": 1.9849169748929306e-08, "loss": 0.0019, "step": 10002 }, { "epoch": 4.550955414012739, "grad_norm": 0.7940753793552366, "learning_rate": 1.9809317203142162e-08, "loss": 0.009, "step": 10003 }, { "epoch": 4.551410373066424, "grad_norm": 0.5934051404723111, "learning_rate": 1.9769503896369167e-08, "loss": 0.0098, "step": 10004 }, { "epoch": 4.5518653321201095, "grad_norm": 0.7959673210215582, "learning_rate": 1.9729729831863495e-08, "loss": 0.0186, "step": 10005 }, { "epoch": 4.552320291173794, "grad_norm": 0.5685592799366722, "learning_rate": 1.9689995012875438e-08, "loss": 0.0052, "step": 10006 }, { "epoch": 4.552775250227479, "grad_norm": 0.7358775156875339, "learning_rate": 1.9650299442652007e-08, "loss": 0.0032, "step": 10007 }, { "epoch": 4.553230209281165, "grad_norm": 0.33851740211828807, "learning_rate": 1.961064312443683e-08, "loss": 0.0015, "step": 10008 }, { "epoch": 4.55368516833485, "grad_norm": 0.7252381259116459, "learning_rate": 1.957102606147043e-08, "loss": 0.0085, "step": 10009 }, { "epoch": 4.554140127388535, "grad_norm": 0.7773807958995618, "learning_rate": 1.953144825699016e-08, "loss": 0.0091, "step": 10010 }, { "epoch": 4.55459508644222, "grad_norm": 0.4481280415909541, "learning_rate": 1.9491909714230203e-08, "loss": 0.0028, "step": 10011 }, { "epoch": 4.555050045495905, "grad_norm": 0.9458204483774687, "learning_rate": 1.9452410436421483e-08, "loss": 0.0097, "step": 10012 }, { "epoch": 4.55550500454959, "grad_norm": 0.6120547945365373, "learning_rate": 1.941295042679164e-08, "loss": 0.0048, "step": 10013 }, { "epoch": 4.555959963603276, "grad_norm": 17.757540179068805, "learning_rate": 1.9373529688565094e-08, "loss": 0.0132, "step": 10014 }, { "epoch": 4.556414922656961, "grad_norm": 0.6411849722182907, "learning_rate": 1.9334148224963266e-08, "loss": 0.0095, "step": 10015 }, { "epoch": 4.556869881710646, "grad_norm": 0.8199584426312179, "learning_rate": 1.9294806039204135e-08, "loss": 0.0058, "step": 10016 }, { "epoch": 4.557324840764331, "grad_norm": 0.5611703457810838, "learning_rate": 1.925550313450264e-08, "loss": 0.008, "step": 10017 }, { "epoch": 4.557779799818016, "grad_norm": 0.5317080791850162, "learning_rate": 1.9216239514070422e-08, "loss": 0.0043, "step": 10018 }, { "epoch": 4.558234758871702, "grad_norm": 0.55680399513161, "learning_rate": 1.9177015181115862e-08, "loss": 0.0119, "step": 10019 }, { "epoch": 4.558689717925387, "grad_norm": 0.38156944241180735, "learning_rate": 1.9137830138844292e-08, "loss": 0.0013, "step": 10020 }, { "epoch": 4.559144676979072, "grad_norm": 0.6591117494825597, "learning_rate": 1.9098684390457697e-08, "loss": 0.0176, "step": 10021 }, { "epoch": 4.5595996360327575, "grad_norm": 0.6024542808518344, "learning_rate": 1.9059577939154915e-08, "loss": 0.0041, "step": 10022 }, { "epoch": 4.560054595086442, "grad_norm": 0.808797499695358, "learning_rate": 1.9020510788131383e-08, "loss": 0.0077, "step": 10023 }, { "epoch": 4.560509554140127, "grad_norm": 0.42870335755885647, "learning_rate": 1.8981482940579717e-08, "loss": 0.0064, "step": 10024 }, { "epoch": 4.560964513193813, "grad_norm": 0.8419869213860227, "learning_rate": 1.894249439968898e-08, "loss": 0.0119, "step": 10025 }, { "epoch": 4.561419472247498, "grad_norm": 0.42774404594257837, "learning_rate": 1.8903545168645175e-08, "loss": 0.0023, "step": 10026 }, { "epoch": 4.561874431301183, "grad_norm": 0.5498592473450019, "learning_rate": 1.886463525063109e-08, "loss": 0.0063, "step": 10027 }, { "epoch": 4.5623293903548685, "grad_norm": 0.6292548086530966, "learning_rate": 1.8825764648826182e-08, "loss": 0.0104, "step": 10028 }, { "epoch": 4.562784349408553, "grad_norm": 0.5795853912204904, "learning_rate": 1.87869333664068e-08, "loss": 0.0073, "step": 10029 }, { "epoch": 4.563239308462238, "grad_norm": 0.44267258627952555, "learning_rate": 1.8748141406546068e-08, "loss": 0.0039, "step": 10030 }, { "epoch": 4.563694267515924, "grad_norm": 0.34107287857307256, "learning_rate": 1.8709388772413957e-08, "loss": 0.0026, "step": 10031 }, { "epoch": 4.564149226569609, "grad_norm": 0.5671034069126905, "learning_rate": 1.86706754671771e-08, "loss": 0.0064, "step": 10032 }, { "epoch": 4.564604185623294, "grad_norm": 0.4166896438835863, "learning_rate": 1.8632001493999016e-08, "loss": 0.0046, "step": 10033 }, { "epoch": 4.5650591446769795, "grad_norm": 0.5798266068601919, "learning_rate": 1.859336685603985e-08, "loss": 0.0091, "step": 10034 }, { "epoch": 4.565514103730664, "grad_norm": 0.4294071258787175, "learning_rate": 1.8554771556456795e-08, "loss": 0.0019, "step": 10035 }, { "epoch": 4.565969062784349, "grad_norm": 0.6495752603686512, "learning_rate": 1.8516215598403605e-08, "loss": 0.0099, "step": 10036 }, { "epoch": 4.566424021838035, "grad_norm": 1.654655320022245, "learning_rate": 1.8477698985030875e-08, "loss": 0.0094, "step": 10037 }, { "epoch": 4.56687898089172, "grad_norm": 0.6026876658906706, "learning_rate": 1.8439221719486086e-08, "loss": 0.0078, "step": 10038 }, { "epoch": 4.567333939945405, "grad_norm": 0.3013823658454553, "learning_rate": 1.8400783804913335e-08, "loss": 0.0035, "step": 10039 }, { "epoch": 4.5677888989990905, "grad_norm": 0.6833149389035966, "learning_rate": 1.8362385244453715e-08, "loss": 0.0042, "step": 10040 }, { "epoch": 4.568243858052775, "grad_norm": 0.6129199473985459, "learning_rate": 1.8324026041244943e-08, "loss": 0.0075, "step": 10041 }, { "epoch": 4.56869881710646, "grad_norm": 0.4236811111829872, "learning_rate": 1.8285706198421512e-08, "loss": 0.0025, "step": 10042 }, { "epoch": 4.569153776160146, "grad_norm": 0.7432839149475531, "learning_rate": 1.8247425719114694e-08, "loss": 0.0091, "step": 10043 }, { "epoch": 4.569608735213831, "grad_norm": 0.5823806922741878, "learning_rate": 1.820918460645271e-08, "loss": 0.008, "step": 10044 }, { "epoch": 4.570063694267516, "grad_norm": 0.44950138065108314, "learning_rate": 1.8170982863560448e-08, "loss": 0.0077, "step": 10045 }, { "epoch": 4.570518653321201, "grad_norm": 0.4637822052403866, "learning_rate": 1.813282049355952e-08, "loss": 0.0064, "step": 10046 }, { "epoch": 4.570973612374886, "grad_norm": 0.7564522431748812, "learning_rate": 1.8094697499568435e-08, "loss": 0.0135, "step": 10047 }, { "epoch": 4.571428571428571, "grad_norm": 0.758276828727623, "learning_rate": 1.805661388470231e-08, "loss": 0.0051, "step": 10048 }, { "epoch": 4.571883530482257, "grad_norm": 0.42761619384911453, "learning_rate": 1.8018569652073378e-08, "loss": 0.0021, "step": 10049 }, { "epoch": 4.572338489535942, "grad_norm": 0.6641379567382012, "learning_rate": 1.7980564804790267e-08, "loss": 0.0103, "step": 10050 }, { "epoch": 4.572793448589627, "grad_norm": 0.4004512240462523, "learning_rate": 1.79425993459586e-08, "loss": 0.0052, "step": 10051 }, { "epoch": 4.573248407643312, "grad_norm": 0.5438310641049443, "learning_rate": 1.7904673278680836e-08, "loss": 0.006, "step": 10052 }, { "epoch": 4.573703366696997, "grad_norm": 1.0690955191187197, "learning_rate": 1.786678660605595e-08, "loss": 0.0132, "step": 10053 }, { "epoch": 4.574158325750682, "grad_norm": 0.6594906375857356, "learning_rate": 1.782893933118007e-08, "loss": 0.0152, "step": 10054 }, { "epoch": 4.574613284804368, "grad_norm": 0.5930582581965528, "learning_rate": 1.7791131457145725e-08, "loss": 0.0084, "step": 10055 }, { "epoch": 4.575068243858053, "grad_norm": 0.4739528733451673, "learning_rate": 1.7753362987042553e-08, "loss": 0.0074, "step": 10056 }, { "epoch": 4.575523202911738, "grad_norm": 0.44923119243152176, "learning_rate": 1.771563392395675e-08, "loss": 0.002, "step": 10057 }, { "epoch": 4.575978161965423, "grad_norm": 0.47293883956662014, "learning_rate": 1.767794427097119e-08, "loss": 0.0049, "step": 10058 }, { "epoch": 4.576433121019108, "grad_norm": 0.49200569842897246, "learning_rate": 1.764029403116607e-08, "loss": 0.0036, "step": 10059 }, { "epoch": 4.576888080072793, "grad_norm": 0.3942082323641272, "learning_rate": 1.7602683207617765e-08, "loss": 0.0027, "step": 10060 }, { "epoch": 4.577343039126479, "grad_norm": 0.5480672770855255, "learning_rate": 1.75651118033997e-08, "loss": 0.0073, "step": 10061 }, { "epoch": 4.577797998180164, "grad_norm": 0.25883266669189897, "learning_rate": 1.752757982158204e-08, "loss": 0.0026, "step": 10062 }, { "epoch": 4.578252957233849, "grad_norm": 0.2046255223414519, "learning_rate": 1.7490087265231656e-08, "loss": 0.0015, "step": 10063 }, { "epoch": 4.578707916287534, "grad_norm": 0.42587649981924713, "learning_rate": 1.7452634137412438e-08, "loss": 0.0059, "step": 10064 }, { "epoch": 4.579162875341219, "grad_norm": 0.36474917451783795, "learning_rate": 1.741522044118471e-08, "loss": 0.0039, "step": 10065 }, { "epoch": 4.579617834394904, "grad_norm": 0.2997447694401485, "learning_rate": 1.7377846179605914e-08, "loss": 0.0016, "step": 10066 }, { "epoch": 4.58007279344859, "grad_norm": 0.5888309815156823, "learning_rate": 1.7340511355730004e-08, "loss": 0.0064, "step": 10067 }, { "epoch": 4.580527752502275, "grad_norm": 0.5153825044115009, "learning_rate": 1.730321597260781e-08, "loss": 0.0068, "step": 10068 }, { "epoch": 4.58098271155596, "grad_norm": 0.45223172100239767, "learning_rate": 1.726596003328701e-08, "loss": 0.0072, "step": 10069 }, { "epoch": 4.581437670609645, "grad_norm": 0.5242583299751505, "learning_rate": 1.722874354081194e-08, "loss": 0.0075, "step": 10070 }, { "epoch": 4.58189262966333, "grad_norm": 0.5624909318336155, "learning_rate": 1.7191566498223787e-08, "loss": 0.0027, "step": 10071 }, { "epoch": 4.582347588717015, "grad_norm": 0.9711957000803544, "learning_rate": 1.715442890856039e-08, "loss": 0.007, "step": 10072 }, { "epoch": 4.582802547770701, "grad_norm": 0.6570976920839259, "learning_rate": 1.7117330774856555e-08, "loss": 0.0109, "step": 10073 }, { "epoch": 4.583257506824386, "grad_norm": 0.64122762739969, "learning_rate": 1.7080272100143844e-08, "loss": 0.0045, "step": 10074 }, { "epoch": 4.583712465878071, "grad_norm": 0.6264685471029169, "learning_rate": 1.7043252887450455e-08, "loss": 0.0081, "step": 10075 }, { "epoch": 4.584167424931756, "grad_norm": 0.5395126983626023, "learning_rate": 1.700627313980141e-08, "loss": 0.0086, "step": 10076 }, { "epoch": 4.584622383985441, "grad_norm": 1.009655731982824, "learning_rate": 1.6969332860218512e-08, "loss": 0.0059, "step": 10077 }, { "epoch": 4.585077343039126, "grad_norm": 1.2053623391313333, "learning_rate": 1.6932432051720402e-08, "loss": 0.0128, "step": 10078 }, { "epoch": 4.585532302092812, "grad_norm": 0.3017779531177534, "learning_rate": 1.6895570717322395e-08, "loss": 0.0073, "step": 10079 }, { "epoch": 4.585987261146497, "grad_norm": 0.28575080438873857, "learning_rate": 1.685874886003674e-08, "loss": 0.0019, "step": 10080 }, { "epoch": 4.5864422202001816, "grad_norm": 0.4299458792985961, "learning_rate": 1.6821966482872263e-08, "loss": 0.007, "step": 10081 }, { "epoch": 4.586897179253867, "grad_norm": 0.6193695400416042, "learning_rate": 1.6785223588834606e-08, "loss": 0.0159, "step": 10082 }, { "epoch": 4.587352138307552, "grad_norm": 0.5780193117804713, "learning_rate": 1.6748520180926372e-08, "loss": 0.0025, "step": 10083 }, { "epoch": 4.587807097361237, "grad_norm": 0.6590501931088497, "learning_rate": 1.6711856262146716e-08, "loss": 0.0052, "step": 10084 }, { "epoch": 4.588262056414923, "grad_norm": 0.6630522327582955, "learning_rate": 1.6675231835491686e-08, "loss": 0.0074, "step": 10085 }, { "epoch": 4.588717015468608, "grad_norm": 0.9506554916622475, "learning_rate": 1.6638646903953946e-08, "loss": 0.0049, "step": 10086 }, { "epoch": 4.5891719745222925, "grad_norm": 0.5394287027247345, "learning_rate": 1.6602101470523154e-08, "loss": 0.0091, "step": 10087 }, { "epoch": 4.589626933575978, "grad_norm": 0.5239716642394004, "learning_rate": 1.6565595538185707e-08, "loss": 0.0025, "step": 10088 }, { "epoch": 4.590081892629663, "grad_norm": 0.3728684723959347, "learning_rate": 1.6529129109924545e-08, "loss": 0.002, "step": 10089 }, { "epoch": 4.590536851683348, "grad_norm": 1.645942965414542, "learning_rate": 1.6492702188719677e-08, "loss": 0.0029, "step": 10090 }, { "epoch": 4.590991810737034, "grad_norm": 0.4887016446851462, "learning_rate": 1.6456314777547608e-08, "loss": 0.0054, "step": 10091 }, { "epoch": 4.591446769790719, "grad_norm": 0.6357750231345788, "learning_rate": 1.6419966879381906e-08, "loss": 0.0096, "step": 10092 }, { "epoch": 4.5919017288444035, "grad_norm": 0.44357110782851505, "learning_rate": 1.6383658497192643e-08, "loss": 0.0031, "step": 10093 }, { "epoch": 4.592356687898089, "grad_norm": 0.6363537535274272, "learning_rate": 1.634738963394683e-08, "loss": 0.0063, "step": 10094 }, { "epoch": 4.592811646951774, "grad_norm": 0.5909367701296319, "learning_rate": 1.6311160292608204e-08, "loss": 0.0114, "step": 10095 }, { "epoch": 4.59326660600546, "grad_norm": 0.7392114091810374, "learning_rate": 1.627497047613724e-08, "loss": 0.0066, "step": 10096 }, { "epoch": 4.593721565059145, "grad_norm": 0.6006945764387409, "learning_rate": 1.623882018749112e-08, "loss": 0.0107, "step": 10097 }, { "epoch": 4.59417652411283, "grad_norm": 0.9123736938033765, "learning_rate": 1.620270942962404e-08, "loss": 0.0089, "step": 10098 }, { "epoch": 4.594631483166515, "grad_norm": 0.6859100903671541, "learning_rate": 1.6166638205486747e-08, "loss": 0.0078, "step": 10099 }, { "epoch": 4.5950864422202, "grad_norm": 0.7861256458283147, "learning_rate": 1.6130606518026723e-08, "loss": 0.0089, "step": 10100 }, { "epoch": 4.595541401273885, "grad_norm": 0.4078820490801727, "learning_rate": 1.6094614370188496e-08, "loss": 0.0053, "step": 10101 }, { "epoch": 4.595996360327571, "grad_norm": 0.6853416258480756, "learning_rate": 1.6058661764912997e-08, "loss": 0.013, "step": 10102 }, { "epoch": 4.596451319381256, "grad_norm": 0.8673588069344188, "learning_rate": 1.6022748705138312e-08, "loss": 0.0124, "step": 10103 }, { "epoch": 4.596906278434941, "grad_norm": 0.8520146774764678, "learning_rate": 1.598687519379893e-08, "loss": 0.0103, "step": 10104 }, { "epoch": 4.597361237488626, "grad_norm": 0.6884213446695705, "learning_rate": 1.5951041233826345e-08, "loss": 0.0139, "step": 10105 }, { "epoch": 4.597816196542311, "grad_norm": 0.29562809877897234, "learning_rate": 1.5915246828148655e-08, "loss": 0.0068, "step": 10106 }, { "epoch": 4.598271155595996, "grad_norm": 0.20820626423418478, "learning_rate": 1.587949197969085e-08, "loss": 0.0011, "step": 10107 }, { "epoch": 4.598726114649682, "grad_norm": 0.31444944979273326, "learning_rate": 1.5843776691374822e-08, "loss": 0.0019, "step": 10108 }, { "epoch": 4.599181073703367, "grad_norm": 0.749901097125218, "learning_rate": 1.5808100966118842e-08, "loss": 0.0021, "step": 10109 }, { "epoch": 4.599636032757052, "grad_norm": 0.534224350963736, "learning_rate": 1.5772464806838358e-08, "loss": 0.0057, "step": 10110 }, { "epoch": 4.600090991810737, "grad_norm": 0.9466034444671886, "learning_rate": 1.5736868216445153e-08, "loss": 0.0037, "step": 10111 }, { "epoch": 4.600545950864422, "grad_norm": 0.4380373681105855, "learning_rate": 1.5701311197848288e-08, "loss": 0.0052, "step": 10112 }, { "epoch": 4.601000909918107, "grad_norm": 0.47284828926605726, "learning_rate": 1.566579375395316e-08, "loss": 0.005, "step": 10113 }, { "epoch": 4.601455868971793, "grad_norm": 0.5899322070597083, "learning_rate": 1.5630315887662116e-08, "loss": 0.0034, "step": 10114 }, { "epoch": 4.601910828025478, "grad_norm": 0.7012520160699359, "learning_rate": 1.5594877601874278e-08, "loss": 0.0066, "step": 10115 }, { "epoch": 4.6023657870791626, "grad_norm": 0.7578884052774766, "learning_rate": 1.5559478899485447e-08, "loss": 0.0069, "step": 10116 }, { "epoch": 4.602820746132848, "grad_norm": 0.5831685532168887, "learning_rate": 1.5524119783388356e-08, "loss": 0.005, "step": 10117 }, { "epoch": 4.603275705186533, "grad_norm": 0.5328059478698787, "learning_rate": 1.5488800256472312e-08, "loss": 0.0071, "step": 10118 }, { "epoch": 4.603730664240218, "grad_norm": 0.6611759527683956, "learning_rate": 1.545352032162345e-08, "loss": 0.0095, "step": 10119 }, { "epoch": 4.604185623293904, "grad_norm": 0.6336084385012762, "learning_rate": 1.541827998172468e-08, "loss": 0.002, "step": 10120 }, { "epoch": 4.604640582347589, "grad_norm": 1.1449736823779462, "learning_rate": 1.538307923965576e-08, "loss": 0.0042, "step": 10121 }, { "epoch": 4.6050955414012735, "grad_norm": 0.8860664431366099, "learning_rate": 1.534791809829311e-08, "loss": 0.0063, "step": 10122 }, { "epoch": 4.605550500454959, "grad_norm": 0.8694825687551732, "learning_rate": 1.5312796560509932e-08, "loss": 0.0058, "step": 10123 }, { "epoch": 4.606005459508644, "grad_norm": 0.5338776411240432, "learning_rate": 1.5277714629176154e-08, "loss": 0.0036, "step": 10124 }, { "epoch": 4.606460418562329, "grad_norm": 0.40480627421595833, "learning_rate": 1.5242672307158533e-08, "loss": 0.0035, "step": 10125 }, { "epoch": 4.606915377616015, "grad_norm": 0.40193454398129735, "learning_rate": 1.5207669597320617e-08, "loss": 0.0024, "step": 10126 }, { "epoch": 4.6073703366697, "grad_norm": 0.4827220015299876, "learning_rate": 1.517270650252267e-08, "loss": 0.0054, "step": 10127 }, { "epoch": 4.607825295723385, "grad_norm": 0.4618890785324132, "learning_rate": 1.513778302562163e-08, "loss": 0.0062, "step": 10128 }, { "epoch": 4.60828025477707, "grad_norm": 0.6002709540045922, "learning_rate": 1.5102899169471385e-08, "loss": 0.0076, "step": 10129 }, { "epoch": 4.608735213830755, "grad_norm": 0.49379108601553406, "learning_rate": 1.5068054936922427e-08, "loss": 0.0031, "step": 10130 }, { "epoch": 4.609190172884441, "grad_norm": 0.8338503190290271, "learning_rate": 1.5033250330822035e-08, "loss": 0.0199, "step": 10131 }, { "epoch": 4.609645131938126, "grad_norm": 0.6324809662619004, "learning_rate": 1.4998485354014435e-08, "loss": 0.0136, "step": 10132 }, { "epoch": 4.610100090991811, "grad_norm": 0.4198136137424188, "learning_rate": 1.49637600093403e-08, "loss": 0.0082, "step": 10133 }, { "epoch": 4.610555050045496, "grad_norm": 0.4315813928136515, "learning_rate": 1.4929074299637357e-08, "loss": 0.0038, "step": 10134 }, { "epoch": 4.611010009099181, "grad_norm": 1.4635330468240955, "learning_rate": 1.4894428227739786e-08, "loss": 0.0077, "step": 10135 }, { "epoch": 4.611464968152866, "grad_norm": 0.5220164917652551, "learning_rate": 1.4859821796478877e-08, "loss": 0.0067, "step": 10136 }, { "epoch": 4.611919927206552, "grad_norm": 0.4200533611098053, "learning_rate": 1.482525500868248e-08, "loss": 0.0034, "step": 10137 }, { "epoch": 4.612374886260237, "grad_norm": 0.6483404760715551, "learning_rate": 1.4790727867175223e-08, "loss": 0.011, "step": 10138 }, { "epoch": 4.612829845313922, "grad_norm": 0.42156788551590957, "learning_rate": 1.4756240374778461e-08, "loss": 0.0038, "step": 10139 }, { "epoch": 4.613284804367607, "grad_norm": 0.4423388402760811, "learning_rate": 1.4721792534310384e-08, "loss": 0.0074, "step": 10140 }, { "epoch": 4.613739763421292, "grad_norm": 0.5803633386083056, "learning_rate": 1.4687384348585963e-08, "loss": 0.0071, "step": 10141 }, { "epoch": 4.614194722474977, "grad_norm": 0.6561715573364258, "learning_rate": 1.4653015820416836e-08, "loss": 0.0052, "step": 10142 }, { "epoch": 4.614649681528663, "grad_norm": 0.678347464297239, "learning_rate": 1.4618686952611426e-08, "loss": 0.0034, "step": 10143 }, { "epoch": 4.615104640582348, "grad_norm": 0.603077910699982, "learning_rate": 1.4584397747974986e-08, "loss": 0.0068, "step": 10144 }, { "epoch": 4.615559599636033, "grad_norm": 0.5520847141470715, "learning_rate": 1.4550148209309387e-08, "loss": 0.0099, "step": 10145 }, { "epoch": 4.616014558689718, "grad_norm": 0.5738958229593409, "learning_rate": 1.4515938339413503e-08, "loss": 0.0054, "step": 10146 }, { "epoch": 4.616469517743403, "grad_norm": 0.48015174838481006, "learning_rate": 1.448176814108265e-08, "loss": 0.004, "step": 10147 }, { "epoch": 4.616924476797088, "grad_norm": 0.4402613017132023, "learning_rate": 1.4447637617109154e-08, "loss": 0.005, "step": 10148 }, { "epoch": 4.617379435850774, "grad_norm": 0.43648544908263276, "learning_rate": 1.4413546770281892e-08, "loss": 0.0065, "step": 10149 }, { "epoch": 4.617834394904459, "grad_norm": 0.36537149084198073, "learning_rate": 1.4379495603386749e-08, "loss": 0.0038, "step": 10150 }, { "epoch": 4.6182893539581436, "grad_norm": 0.557495170062743, "learning_rate": 1.434548411920622e-08, "loss": 0.0039, "step": 10151 }, { "epoch": 4.618744313011829, "grad_norm": 0.5171710621288604, "learning_rate": 1.4311512320519525e-08, "loss": 0.0084, "step": 10152 }, { "epoch": 4.619199272065514, "grad_norm": 0.47807272830689096, "learning_rate": 1.4277580210102669e-08, "loss": 0.0034, "step": 10153 }, { "epoch": 4.619654231119199, "grad_norm": 0.6732114842476845, "learning_rate": 1.4243687790728432e-08, "loss": 0.0044, "step": 10154 }, { "epoch": 4.620109190172885, "grad_norm": 0.5186771691418333, "learning_rate": 1.4209835065166432e-08, "loss": 0.0048, "step": 10155 }, { "epoch": 4.62056414922657, "grad_norm": 0.4642673302637921, "learning_rate": 1.4176022036182789e-08, "loss": 0.0031, "step": 10156 }, { "epoch": 4.6210191082802545, "grad_norm": 0.4771922998668724, "learning_rate": 1.4142248706540793e-08, "loss": 0.0034, "step": 10157 }, { "epoch": 4.62147406733394, "grad_norm": 0.37323671305677814, "learning_rate": 1.4108515079000071e-08, "loss": 0.0031, "step": 10158 }, { "epoch": 4.621929026387625, "grad_norm": 0.5512410205740637, "learning_rate": 1.4074821156317195e-08, "loss": 0.0101, "step": 10159 }, { "epoch": 4.62238398544131, "grad_norm": 0.6377124635543503, "learning_rate": 1.4041166941245575e-08, "loss": 0.0092, "step": 10160 }, { "epoch": 4.622838944494996, "grad_norm": 0.642169537894405, "learning_rate": 1.4007552436535175e-08, "loss": 0.0122, "step": 10161 }, { "epoch": 4.623293903548681, "grad_norm": 0.45247539421185706, "learning_rate": 1.397397764493291e-08, "loss": 0.0022, "step": 10162 }, { "epoch": 4.6237488626023655, "grad_norm": 0.6657317232236053, "learning_rate": 1.3940442569182253e-08, "loss": 0.0067, "step": 10163 }, { "epoch": 4.624203821656051, "grad_norm": 0.8845129147708184, "learning_rate": 1.3906947212023623e-08, "loss": 0.0047, "step": 10164 }, { "epoch": 4.624658780709736, "grad_norm": 1.0731149444399262, "learning_rate": 1.3873491576194163e-08, "loss": 0.0185, "step": 10165 }, { "epoch": 4.625113739763421, "grad_norm": 0.509746447396925, "learning_rate": 1.3840075664427575e-08, "loss": 0.0106, "step": 10166 }, { "epoch": 4.625568698817107, "grad_norm": 0.6597170392237346, "learning_rate": 1.3806699479454565e-08, "loss": 0.0084, "step": 10167 }, { "epoch": 4.626023657870792, "grad_norm": 0.3372271855898515, "learning_rate": 1.3773363024002449e-08, "loss": 0.0033, "step": 10168 }, { "epoch": 4.6264786169244765, "grad_norm": 0.7352231659605166, "learning_rate": 1.3740066300795272e-08, "loss": 0.012, "step": 10169 }, { "epoch": 4.626933575978162, "grad_norm": 0.6679597529058052, "learning_rate": 1.3706809312553914e-08, "loss": 0.0075, "step": 10170 }, { "epoch": 4.627388535031847, "grad_norm": 0.6404370249999011, "learning_rate": 1.3673592061996086e-08, "loss": 0.0072, "step": 10171 }, { "epoch": 4.627843494085532, "grad_norm": 0.3372363871916897, "learning_rate": 1.3640414551836122e-08, "loss": 0.002, "step": 10172 }, { "epoch": 4.628298453139218, "grad_norm": 0.6045149969078594, "learning_rate": 1.3607276784785071e-08, "loss": 0.0035, "step": 10173 }, { "epoch": 4.628753412192903, "grad_norm": 0.4786595566005278, "learning_rate": 1.357417876355077e-08, "loss": 0.0105, "step": 10174 }, { "epoch": 4.6292083712465875, "grad_norm": 0.8748152121805601, "learning_rate": 1.3541120490837943e-08, "loss": 0.0198, "step": 10175 }, { "epoch": 4.629663330300273, "grad_norm": 0.6306484575207307, "learning_rate": 1.3508101969347984e-08, "loss": 0.0155, "step": 10176 }, { "epoch": 4.630118289353958, "grad_norm": 0.5957175726576235, "learning_rate": 1.3475123201778848e-08, "loss": 0.0118, "step": 10177 }, { "epoch": 4.630573248407643, "grad_norm": 0.7311852629224338, "learning_rate": 1.3442184190825544e-08, "loss": 0.0055, "step": 10178 }, { "epoch": 4.631028207461329, "grad_norm": 0.5780813626672577, "learning_rate": 1.3409284939179699e-08, "loss": 0.0039, "step": 10179 }, { "epoch": 4.631483166515014, "grad_norm": 0.37889163350613475, "learning_rate": 1.337642544952966e-08, "loss": 0.0028, "step": 10180 }, { "epoch": 4.631938125568698, "grad_norm": 0.30964881033338065, "learning_rate": 1.3343605724560558e-08, "loss": 0.0062, "step": 10181 }, { "epoch": 4.632393084622384, "grad_norm": 0.1831749094133982, "learning_rate": 1.3310825766954303e-08, "loss": 0.0037, "step": 10182 }, { "epoch": 4.632848043676069, "grad_norm": 0.5360090008493094, "learning_rate": 1.327808557938942e-08, "loss": 0.005, "step": 10183 }, { "epoch": 4.633303002729754, "grad_norm": 0.7013725758723331, "learning_rate": 1.3245385164541379e-08, "loss": 0.0144, "step": 10184 }, { "epoch": 4.63375796178344, "grad_norm": 0.7485969449293726, "learning_rate": 1.3212724525082375e-08, "loss": 0.0102, "step": 10185 }, { "epoch": 4.6342129208371245, "grad_norm": 0.6084742170044107, "learning_rate": 1.3180103663681164e-08, "loss": 0.005, "step": 10186 }, { "epoch": 4.634667879890809, "grad_norm": 0.6047782542624053, "learning_rate": 1.3147522583003445e-08, "loss": 0.002, "step": 10187 }, { "epoch": 4.635122838944495, "grad_norm": 0.6693316735066183, "learning_rate": 1.3114981285711535e-08, "loss": 0.0077, "step": 10188 }, { "epoch": 4.63557779799818, "grad_norm": 0.48692057414194, "learning_rate": 1.3082479774464638e-08, "loss": 0.0053, "step": 10189 }, { "epoch": 4.636032757051865, "grad_norm": 0.9123344475380216, "learning_rate": 1.3050018051918577e-08, "loss": 0.0049, "step": 10190 }, { "epoch": 4.636487716105551, "grad_norm": 0.7485065152647584, "learning_rate": 1.3017596120725948e-08, "loss": 0.0073, "step": 10191 }, { "epoch": 4.6369426751592355, "grad_norm": 0.6080657798743656, "learning_rate": 1.2985213983536248e-08, "loss": 0.0046, "step": 10192 }, { "epoch": 4.63739763421292, "grad_norm": 0.4419829892542802, "learning_rate": 1.2952871642995466e-08, "loss": 0.0036, "step": 10193 }, { "epoch": 4.637852593266606, "grad_norm": 0.779445217189928, "learning_rate": 1.2920569101746548e-08, "loss": 0.0189, "step": 10194 }, { "epoch": 4.638307552320291, "grad_norm": 0.39659129935680293, "learning_rate": 1.2888306362429158e-08, "loss": 0.0035, "step": 10195 }, { "epoch": 4.638762511373976, "grad_norm": 0.840279785702362, "learning_rate": 1.2856083427679521e-08, "loss": 0.0065, "step": 10196 }, { "epoch": 4.639217470427662, "grad_norm": 0.40707626858510765, "learning_rate": 1.2823900300130808e-08, "loss": 0.0061, "step": 10197 }, { "epoch": 4.6396724294813465, "grad_norm": 0.8534548026910305, "learning_rate": 1.2791756982412916e-08, "loss": 0.0207, "step": 10198 }, { "epoch": 4.640127388535031, "grad_norm": 0.8338411462240639, "learning_rate": 1.275965347715241e-08, "loss": 0.0156, "step": 10199 }, { "epoch": 4.640582347588717, "grad_norm": 0.45372511393913717, "learning_rate": 1.2727589786972747e-08, "loss": 0.0038, "step": 10200 }, { "epoch": 4.641037306642402, "grad_norm": 0.49241099640046987, "learning_rate": 1.2695565914493889e-08, "loss": 0.0083, "step": 10201 }, { "epoch": 4.641492265696087, "grad_norm": 0.8378477857253865, "learning_rate": 1.266358186233274e-08, "loss": 0.0319, "step": 10202 }, { "epoch": 4.641947224749773, "grad_norm": 0.7654548952609668, "learning_rate": 1.2631637633102877e-08, "loss": 0.0051, "step": 10203 }, { "epoch": 4.6424021838034575, "grad_norm": 0.595328534140233, "learning_rate": 1.2599733229414655e-08, "loss": 0.0066, "step": 10204 }, { "epoch": 4.642857142857143, "grad_norm": 1.4021583836788054, "learning_rate": 1.2567868653875158e-08, "loss": 0.0105, "step": 10205 }, { "epoch": 4.643312101910828, "grad_norm": 0.3515120947380551, "learning_rate": 1.253604390908819e-08, "loss": 0.0065, "step": 10206 }, { "epoch": 4.643767060964513, "grad_norm": 0.6670695469387529, "learning_rate": 1.2504258997654393e-08, "loss": 0.0155, "step": 10207 }, { "epoch": 4.644222020018199, "grad_norm": 0.5403562901499382, "learning_rate": 1.2472513922170969e-08, "loss": 0.0045, "step": 10208 }, { "epoch": 4.644676979071884, "grad_norm": 0.8543055693205357, "learning_rate": 1.2440808685232118e-08, "loss": 0.0204, "step": 10209 }, { "epoch": 4.6451319381255685, "grad_norm": 0.6951034815541811, "learning_rate": 1.2409143289428603e-08, "loss": 0.0082, "step": 10210 }, { "epoch": 4.645586897179254, "grad_norm": 0.41987182671531037, "learning_rate": 1.237751773734791e-08, "loss": 0.004, "step": 10211 }, { "epoch": 4.646041856232939, "grad_norm": 0.2799706436280332, "learning_rate": 1.2345932031574357e-08, "loss": 0.0014, "step": 10212 }, { "epoch": 4.646496815286624, "grad_norm": 0.6809621100589652, "learning_rate": 1.231438617468905e-08, "loss": 0.0034, "step": 10213 }, { "epoch": 4.64695177434031, "grad_norm": 0.545434207697499, "learning_rate": 1.2282880169269706e-08, "loss": 0.0066, "step": 10214 }, { "epoch": 4.647406733393995, "grad_norm": 0.7341442620792351, "learning_rate": 1.2251414017890927e-08, "loss": 0.0157, "step": 10215 }, { "epoch": 4.647861692447679, "grad_norm": 0.47740382211495297, "learning_rate": 1.2219987723123937e-08, "loss": 0.0034, "step": 10216 }, { "epoch": 4.648316651501365, "grad_norm": 0.5648798685416307, "learning_rate": 1.2188601287536737e-08, "loss": 0.0118, "step": 10217 }, { "epoch": 4.64877161055505, "grad_norm": 0.7258881529744411, "learning_rate": 1.215725471369411e-08, "loss": 0.0078, "step": 10218 }, { "epoch": 4.649226569608735, "grad_norm": 0.586323969448989, "learning_rate": 1.2125948004157505e-08, "loss": 0.0144, "step": 10219 }, { "epoch": 4.649681528662421, "grad_norm": 0.2774591268300428, "learning_rate": 1.2094681161485266e-08, "loss": 0.0023, "step": 10220 }, { "epoch": 4.6501364877161055, "grad_norm": 0.5544597632743341, "learning_rate": 1.2063454188232348e-08, "loss": 0.0102, "step": 10221 }, { "epoch": 4.65059144676979, "grad_norm": 0.45647303076389417, "learning_rate": 1.2032267086950376e-08, "loss": 0.0045, "step": 10222 }, { "epoch": 4.651046405823476, "grad_norm": 0.4393212894777797, "learning_rate": 1.2001119860187924e-08, "loss": 0.0035, "step": 10223 }, { "epoch": 4.651501364877161, "grad_norm": 0.483007247114926, "learning_rate": 1.197001251049018e-08, "loss": 0.0066, "step": 10224 }, { "epoch": 4.651956323930846, "grad_norm": 0.6170507252644023, "learning_rate": 1.1938945040399107e-08, "loss": 0.0101, "step": 10225 }, { "epoch": 4.652411282984532, "grad_norm": 0.28966088615081587, "learning_rate": 1.1907917452453342e-08, "loss": 0.0013, "step": 10226 }, { "epoch": 4.6528662420382165, "grad_norm": 0.7364518429921187, "learning_rate": 1.1876929749188358e-08, "loss": 0.0165, "step": 10227 }, { "epoch": 4.653321201091901, "grad_norm": 0.5007897820833913, "learning_rate": 1.184598193313635e-08, "loss": 0.004, "step": 10228 }, { "epoch": 4.653776160145587, "grad_norm": 0.5819437011707546, "learning_rate": 1.1815074006826243e-08, "loss": 0.0035, "step": 10229 }, { "epoch": 4.654231119199272, "grad_norm": 0.33932164300929785, "learning_rate": 1.1784205972783678e-08, "loss": 0.0019, "step": 10230 }, { "epoch": 4.654686078252957, "grad_norm": 0.9709848429425727, "learning_rate": 1.175337783353092e-08, "loss": 0.0141, "step": 10231 }, { "epoch": 4.655141037306643, "grad_norm": 0.5515617645485345, "learning_rate": 1.172258959158734e-08, "loss": 0.0052, "step": 10232 }, { "epoch": 4.6555959963603275, "grad_norm": 0.46646436504842087, "learning_rate": 1.1691841249468649e-08, "loss": 0.0081, "step": 10233 }, { "epoch": 4.656050955414012, "grad_norm": 0.5874087154629689, "learning_rate": 1.1661132809687501e-08, "loss": 0.0085, "step": 10234 }, { "epoch": 4.656505914467698, "grad_norm": 0.35492765257390396, "learning_rate": 1.1630464274753283e-08, "loss": 0.0034, "step": 10235 }, { "epoch": 4.656960873521383, "grad_norm": 0.3469109796248528, "learning_rate": 1.1599835647172041e-08, "loss": 0.002, "step": 10236 }, { "epoch": 4.657415832575068, "grad_norm": 0.6766453544254456, "learning_rate": 1.1569246929446664e-08, "loss": 0.0107, "step": 10237 }, { "epoch": 4.657870791628754, "grad_norm": 0.6527985210182251, "learning_rate": 1.1538698124076706e-08, "loss": 0.0117, "step": 10238 }, { "epoch": 4.6583257506824385, "grad_norm": 0.8345919729713092, "learning_rate": 1.1508189233558451e-08, "loss": 0.0121, "step": 10239 }, { "epoch": 4.658780709736124, "grad_norm": 0.5819277806327037, "learning_rate": 1.147772026038496e-08, "loss": 0.0086, "step": 10240 }, { "epoch": 4.659235668789809, "grad_norm": 0.5536771568662121, "learning_rate": 1.1447291207046017e-08, "loss": 0.0062, "step": 10241 }, { "epoch": 4.659690627843494, "grad_norm": 0.34039004281869195, "learning_rate": 1.1416902076028135e-08, "loss": 0.0047, "step": 10242 }, { "epoch": 4.66014558689718, "grad_norm": 0.826590921540042, "learning_rate": 1.1386552869814602e-08, "loss": 0.0085, "step": 10243 }, { "epoch": 4.660600545950865, "grad_norm": 0.5310245188388057, "learning_rate": 1.135624359088544e-08, "loss": 0.0054, "step": 10244 }, { "epoch": 4.6610555050045495, "grad_norm": 0.42119302356542054, "learning_rate": 1.1325974241717383e-08, "loss": 0.0051, "step": 10245 }, { "epoch": 4.661510464058235, "grad_norm": 0.7164613845076795, "learning_rate": 1.1295744824783792e-08, "loss": 0.0175, "step": 10246 }, { "epoch": 4.66196542311192, "grad_norm": 0.520831554721927, "learning_rate": 1.1265555342554967e-08, "loss": 0.0063, "step": 10247 }, { "epoch": 4.662420382165605, "grad_norm": 0.35234050824084306, "learning_rate": 1.1235405797497932e-08, "loss": 0.0058, "step": 10248 }, { "epoch": 4.662875341219291, "grad_norm": 0.3523286341569258, "learning_rate": 1.1205296192076275e-08, "loss": 0.0048, "step": 10249 }, { "epoch": 4.663330300272976, "grad_norm": 0.48131221348856745, "learning_rate": 1.1175226528750414e-08, "loss": 0.0032, "step": 10250 }, { "epoch": 4.66378525932666, "grad_norm": 0.3623304059907239, "learning_rate": 1.1145196809977497e-08, "loss": 0.0031, "step": 10251 }, { "epoch": 4.664240218380346, "grad_norm": 0.5292745900612521, "learning_rate": 1.1115207038211505e-08, "loss": 0.0056, "step": 10252 }, { "epoch": 4.664695177434031, "grad_norm": 0.49986727909739487, "learning_rate": 1.1085257215902976e-08, "loss": 0.0042, "step": 10253 }, { "epoch": 4.665150136487716, "grad_norm": 0.34653664697887265, "learning_rate": 1.1055347345499288e-08, "loss": 0.0025, "step": 10254 }, { "epoch": 4.665605095541402, "grad_norm": 0.7063017334218181, "learning_rate": 1.1025477429444597e-08, "loss": 0.0039, "step": 10255 }, { "epoch": 4.6660600545950865, "grad_norm": 0.4727312080953959, "learning_rate": 1.099564747017967e-08, "loss": 0.002, "step": 10256 }, { "epoch": 4.666515013648771, "grad_norm": 0.7192341951211927, "learning_rate": 1.0965857470142171e-08, "loss": 0.0094, "step": 10257 }, { "epoch": 4.666969972702457, "grad_norm": 0.6634406717494901, "learning_rate": 1.0936107431766317e-08, "loss": 0.0049, "step": 10258 }, { "epoch": 4.667424931756142, "grad_norm": 0.6701366868486258, "learning_rate": 1.0906397357483166e-08, "loss": 0.0142, "step": 10259 }, { "epoch": 4.667879890809827, "grad_norm": 0.9504057850986763, "learning_rate": 1.0876727249720441e-08, "loss": 0.0065, "step": 10260 }, { "epoch": 4.668334849863513, "grad_norm": 0.5922341799995354, "learning_rate": 1.0847097110902703e-08, "loss": 0.003, "step": 10261 }, { "epoch": 4.6687898089171975, "grad_norm": 0.6408441788178042, "learning_rate": 1.0817506943451238e-08, "loss": 0.0116, "step": 10262 }, { "epoch": 4.669244767970882, "grad_norm": 0.5522140754910121, "learning_rate": 1.0787956749783999e-08, "loss": 0.0085, "step": 10263 }, { "epoch": 4.669699727024568, "grad_norm": 0.6091000808597958, "learning_rate": 1.0758446532315668e-08, "loss": 0.0121, "step": 10264 }, { "epoch": 4.670154686078253, "grad_norm": 0.45166430627288034, "learning_rate": 1.0728976293457649e-08, "loss": 0.0024, "step": 10265 }, { "epoch": 4.670609645131938, "grad_norm": 0.8086249442698157, "learning_rate": 1.069954603561818e-08, "loss": 0.0047, "step": 10266 }, { "epoch": 4.671064604185624, "grad_norm": 0.5773330964062074, "learning_rate": 1.0670155761202171e-08, "loss": 0.0097, "step": 10267 }, { "epoch": 4.6715195632393085, "grad_norm": 0.48150087349431187, "learning_rate": 1.0640805472611203e-08, "loss": 0.0032, "step": 10268 }, { "epoch": 4.671974522292993, "grad_norm": 0.3965437008812115, "learning_rate": 1.061149517224369e-08, "loss": 0.0026, "step": 10269 }, { "epoch": 4.672429481346679, "grad_norm": 0.4684233798734329, "learning_rate": 1.0582224862494715e-08, "loss": 0.008, "step": 10270 }, { "epoch": 4.672884440400364, "grad_norm": 0.43635958677168835, "learning_rate": 1.05529945457562e-08, "loss": 0.0094, "step": 10271 }, { "epoch": 4.673339399454049, "grad_norm": 0.6124145666700633, "learning_rate": 1.052380422441662e-08, "loss": 0.0027, "step": 10272 }, { "epoch": 4.673794358507735, "grad_norm": 0.761868825954131, "learning_rate": 1.0494653900861295e-08, "loss": 0.0102, "step": 10273 }, { "epoch": 4.6742493175614195, "grad_norm": 0.33487516644859705, "learning_rate": 1.0465543577472313e-08, "loss": 0.0013, "step": 10274 }, { "epoch": 4.674704276615104, "grad_norm": 0.6052885191861928, "learning_rate": 1.0436473256628275e-08, "loss": 0.0071, "step": 10275 }, { "epoch": 4.67515923566879, "grad_norm": 0.476473066333621, "learning_rate": 1.0407442940704836e-08, "loss": 0.0051, "step": 10276 }, { "epoch": 4.675614194722475, "grad_norm": 0.47693575370952895, "learning_rate": 1.0378452632074209e-08, "loss": 0.0058, "step": 10277 }, { "epoch": 4.67606915377616, "grad_norm": 0.7418203185579164, "learning_rate": 1.0349502333105332e-08, "loss": 0.0095, "step": 10278 }, { "epoch": 4.676524112829846, "grad_norm": 0.5951260427728313, "learning_rate": 1.0320592046163924e-08, "loss": 0.0055, "step": 10279 }, { "epoch": 4.6769790718835305, "grad_norm": 0.3666537878300986, "learning_rate": 1.0291721773612261e-08, "loss": 0.0054, "step": 10280 }, { "epoch": 4.677434030937215, "grad_norm": 0.8372768419277189, "learning_rate": 1.0262891517809625e-08, "loss": 0.013, "step": 10281 }, { "epoch": 4.677888989990901, "grad_norm": 0.4673726081337037, "learning_rate": 1.023410128111185e-08, "loss": 0.0066, "step": 10282 }, { "epoch": 4.678343949044586, "grad_norm": 0.6452151709168649, "learning_rate": 1.0205351065871615e-08, "loss": 0.007, "step": 10283 }, { "epoch": 4.678798908098271, "grad_norm": 0.4955205626097612, "learning_rate": 1.0176640874438147e-08, "loss": 0.0063, "step": 10284 }, { "epoch": 4.679253867151957, "grad_norm": 0.6468998896551802, "learning_rate": 1.0147970709157573e-08, "loss": 0.0094, "step": 10285 }, { "epoch": 4.679708826205641, "grad_norm": 0.9433298218547772, "learning_rate": 1.0119340572372681e-08, "loss": 0.0047, "step": 10286 }, { "epoch": 4.680163785259326, "grad_norm": 0.4741446522884783, "learning_rate": 1.0090750466422993e-08, "loss": 0.0097, "step": 10287 }, { "epoch": 4.680618744313012, "grad_norm": 0.5103865915707199, "learning_rate": 1.0062200393644805e-08, "loss": 0.0086, "step": 10288 }, { "epoch": 4.681073703366697, "grad_norm": 0.468250999527374, "learning_rate": 1.0033690356370972e-08, "loss": 0.007, "step": 10289 }, { "epoch": 4.681528662420382, "grad_norm": 0.3077908572315175, "learning_rate": 1.0005220356931354e-08, "loss": 0.004, "step": 10290 }, { "epoch": 4.6819836214740675, "grad_norm": 0.7570944287540846, "learning_rate": 9.976790397652312e-09, "loss": 0.0147, "step": 10291 }, { "epoch": 4.682438580527752, "grad_norm": 0.8200858644681627, "learning_rate": 9.9484004808571e-09, "loss": 0.0127, "step": 10292 }, { "epoch": 4.682893539581437, "grad_norm": 0.2313711054501986, "learning_rate": 9.92005060886547e-09, "loss": 0.0023, "step": 10293 }, { "epoch": 4.683348498635123, "grad_norm": 0.3709784072621579, "learning_rate": 9.89174078399413e-09, "loss": 0.0033, "step": 10294 }, { "epoch": 4.683803457688808, "grad_norm": 0.4758139094035307, "learning_rate": 9.863471008556446e-09, "loss": 0.0045, "step": 10295 }, { "epoch": 4.684258416742493, "grad_norm": 0.6118461921669036, "learning_rate": 9.835241284862462e-09, "loss": 0.0039, "step": 10296 }, { "epoch": 4.6847133757961785, "grad_norm": 0.5839486631170264, "learning_rate": 9.807051615218998e-09, "loss": 0.0091, "step": 10297 }, { "epoch": 4.685168334849863, "grad_norm": 0.745902995082155, "learning_rate": 9.7789020019296e-09, "loss": 0.0047, "step": 10298 }, { "epoch": 4.685623293903548, "grad_norm": 0.6217668107881767, "learning_rate": 9.750792447294487e-09, "loss": 0.0137, "step": 10299 }, { "epoch": 4.686078252957234, "grad_norm": 0.5349770587494711, "learning_rate": 9.722722953610707e-09, "loss": 0.0078, "step": 10300 }, { "epoch": 4.686533212010919, "grad_norm": 0.45637624765574375, "learning_rate": 9.694693523171926e-09, "loss": 0.0049, "step": 10301 }, { "epoch": 4.686988171064604, "grad_norm": 0.8619001915874746, "learning_rate": 9.66670415826859e-09, "loss": 0.023, "step": 10302 }, { "epoch": 4.6874431301182895, "grad_norm": 0.36129805061507453, "learning_rate": 9.638754861187815e-09, "loss": 0.0035, "step": 10303 }, { "epoch": 4.687898089171974, "grad_norm": 0.6370648667745945, "learning_rate": 9.61084563421355e-09, "loss": 0.0097, "step": 10304 }, { "epoch": 4.688353048225659, "grad_norm": 0.9811377751290004, "learning_rate": 9.58297647962647e-09, "loss": 0.0045, "step": 10305 }, { "epoch": 4.688808007279345, "grad_norm": 41.02168043607429, "learning_rate": 9.55514739970381e-09, "loss": 0.1631, "step": 10306 }, { "epoch": 4.68926296633303, "grad_norm": 0.6308970799545276, "learning_rate": 9.527358396719698e-09, "loss": 0.0044, "step": 10307 }, { "epoch": 4.689717925386715, "grad_norm": 2.8183283263933565, "learning_rate": 9.49960947294487e-09, "loss": 0.0034, "step": 10308 }, { "epoch": 4.6901728844404005, "grad_norm": 0.6776628975738354, "learning_rate": 9.471900630646845e-09, "loss": 0.0073, "step": 10309 }, { "epoch": 4.690627843494085, "grad_norm": 0.5375743185746771, "learning_rate": 9.444231872089925e-09, "loss": 0.0067, "step": 10310 }, { "epoch": 4.69108280254777, "grad_norm": 0.7056639128558797, "learning_rate": 9.416603199535078e-09, "loss": 0.0107, "step": 10311 }, { "epoch": 4.691537761601456, "grad_norm": 0.5111659886155773, "learning_rate": 9.389014615239943e-09, "loss": 0.0084, "step": 10312 }, { "epoch": 4.691992720655141, "grad_norm": 0.5550947014330323, "learning_rate": 9.361466121458938e-09, "loss": 0.0029, "step": 10313 }, { "epoch": 4.692447679708827, "grad_norm": 0.7744591402345553, "learning_rate": 9.333957720443209e-09, "loss": 0.0087, "step": 10314 }, { "epoch": 4.6929026387625115, "grad_norm": 1.2347489098130917, "learning_rate": 9.306489414440677e-09, "loss": 0.0113, "step": 10315 }, { "epoch": 4.693357597816196, "grad_norm": 0.43598500968053605, "learning_rate": 9.279061205695826e-09, "loss": 0.0032, "step": 10316 }, { "epoch": 4.693812556869882, "grad_norm": 0.5406961642962067, "learning_rate": 9.25167309645003e-09, "loss": 0.0084, "step": 10317 }, { "epoch": 4.694267515923567, "grad_norm": 0.5272339122956274, "learning_rate": 9.224325088941331e-09, "loss": 0.0037, "step": 10318 }, { "epoch": 4.694722474977252, "grad_norm": 0.5068957162806424, "learning_rate": 9.197017185404443e-09, "loss": 0.0075, "step": 10319 }, { "epoch": 4.695177434030938, "grad_norm": 0.7831231625292944, "learning_rate": 9.169749388070858e-09, "loss": 0.0086, "step": 10320 }, { "epoch": 4.695632393084622, "grad_norm": 0.6515771605136015, "learning_rate": 9.142521699168792e-09, "loss": 0.0076, "step": 10321 }, { "epoch": 4.696087352138307, "grad_norm": 0.3906065633541709, "learning_rate": 9.115334120923191e-09, "loss": 0.0023, "step": 10322 }, { "epoch": 4.696542311191993, "grad_norm": 1.064434917825696, "learning_rate": 9.088186655555607e-09, "loss": 0.0081, "step": 10323 }, { "epoch": 4.696997270245678, "grad_norm": 0.9483858635110443, "learning_rate": 9.061079305284491e-09, "loss": 0.0101, "step": 10324 }, { "epoch": 4.697452229299363, "grad_norm": 0.3789355148527927, "learning_rate": 9.03401207232496e-09, "loss": 0.003, "step": 10325 }, { "epoch": 4.6979071883530485, "grad_norm": 0.7740266292711346, "learning_rate": 9.00698495888874e-09, "loss": 0.008, "step": 10326 }, { "epoch": 4.698362147406733, "grad_norm": 0.7614429481254189, "learning_rate": 8.97999796718446e-09, "loss": 0.0122, "step": 10327 }, { "epoch": 4.698817106460418, "grad_norm": 0.6036548333659988, "learning_rate": 8.953051099417242e-09, "loss": 0.0084, "step": 10328 }, { "epoch": 4.699272065514104, "grad_norm": 0.6407473586665541, "learning_rate": 8.926144357789157e-09, "loss": 0.009, "step": 10329 }, { "epoch": 4.699727024567789, "grad_norm": 0.6386849672169786, "learning_rate": 8.899277744498889e-09, "loss": 0.0068, "step": 10330 }, { "epoch": 4.700181983621474, "grad_norm": 0.757868156887069, "learning_rate": 8.872451261741853e-09, "loss": 0.0081, "step": 10331 }, { "epoch": 4.7006369426751595, "grad_norm": 0.6744424877066221, "learning_rate": 8.845664911710238e-09, "loss": 0.009, "step": 10332 }, { "epoch": 4.701091901728844, "grad_norm": 0.3063006776520512, "learning_rate": 8.818918696592737e-09, "loss": 0.0017, "step": 10333 }, { "epoch": 4.701546860782529, "grad_norm": 0.815815567358131, "learning_rate": 8.792212618575156e-09, "loss": 0.016, "step": 10334 }, { "epoch": 4.702001819836215, "grad_norm": 0.6422433543216717, "learning_rate": 8.765546679839641e-09, "loss": 0.0118, "step": 10335 }, { "epoch": 4.7024567788899, "grad_norm": 0.24387753463615053, "learning_rate": 8.738920882565282e-09, "loss": 0.0013, "step": 10336 }, { "epoch": 4.702911737943585, "grad_norm": 0.3181261310290097, "learning_rate": 8.712335228927781e-09, "loss": 0.0026, "step": 10337 }, { "epoch": 4.7033666969972705, "grad_norm": 0.4635692001343105, "learning_rate": 8.685789721099568e-09, "loss": 0.0049, "step": 10338 }, { "epoch": 4.703821656050955, "grad_norm": 0.7232400038049073, "learning_rate": 8.659284361249908e-09, "loss": 0.0058, "step": 10339 }, { "epoch": 4.70427661510464, "grad_norm": 0.5775202627282051, "learning_rate": 8.632819151544678e-09, "loss": 0.007, "step": 10340 }, { "epoch": 4.704731574158326, "grad_norm": 0.5439572230182292, "learning_rate": 8.60639409414643e-09, "loss": 0.0061, "step": 10341 }, { "epoch": 4.705186533212011, "grad_norm": 0.7044007480210689, "learning_rate": 8.580009191214544e-09, "loss": 0.0024, "step": 10342 }, { "epoch": 4.705641492265696, "grad_norm": 0.44007199582227574, "learning_rate": 8.553664444905073e-09, "loss": 0.0044, "step": 10343 }, { "epoch": 4.7060964513193815, "grad_norm": 0.7786979758991297, "learning_rate": 8.527359857370797e-09, "loss": 0.0066, "step": 10344 }, { "epoch": 4.706551410373066, "grad_norm": 0.8365900170362165, "learning_rate": 8.501095430761218e-09, "loss": 0.015, "step": 10345 }, { "epoch": 4.707006369426751, "grad_norm": 0.4121786084533199, "learning_rate": 8.474871167222508e-09, "loss": 0.0053, "step": 10346 }, { "epoch": 4.707461328480437, "grad_norm": 0.48114827690115464, "learning_rate": 8.448687068897676e-09, "loss": 0.005, "step": 10347 }, { "epoch": 4.707916287534122, "grad_norm": 0.44956515529794233, "learning_rate": 8.42254313792623e-09, "loss": 0.0021, "step": 10348 }, { "epoch": 4.708371246587808, "grad_norm": 0.65937376281205, "learning_rate": 8.39643937644463e-09, "loss": 0.0054, "step": 10349 }, { "epoch": 4.7088262056414925, "grad_norm": 0.38678735709215395, "learning_rate": 8.370375786586003e-09, "loss": 0.0046, "step": 10350 }, { "epoch": 4.709281164695177, "grad_norm": 0.5891904844859702, "learning_rate": 8.344352370480035e-09, "loss": 0.006, "step": 10351 }, { "epoch": 4.709736123748863, "grad_norm": 0.885136054525014, "learning_rate": 8.3183691302533e-09, "loss": 0.0217, "step": 10352 }, { "epoch": 4.710191082802548, "grad_norm": 0.5656649475356584, "learning_rate": 8.29242606802899e-09, "loss": 0.0089, "step": 10353 }, { "epoch": 4.710646041856233, "grad_norm": 0.7691914225883566, "learning_rate": 8.266523185927132e-09, "loss": 0.0091, "step": 10354 }, { "epoch": 4.711101000909919, "grad_norm": 0.76456878573681, "learning_rate": 8.240660486064366e-09, "loss": 0.027, "step": 10355 }, { "epoch": 4.711555959963603, "grad_norm": 0.5503876770807153, "learning_rate": 8.214837970554057e-09, "loss": 0.0053, "step": 10356 }, { "epoch": 4.712010919017288, "grad_norm": 0.15376555338069872, "learning_rate": 8.189055641506293e-09, "loss": 0.0008, "step": 10357 }, { "epoch": 4.712465878070974, "grad_norm": 0.6689033003578246, "learning_rate": 8.16331350102789e-09, "loss": 0.0195, "step": 10358 }, { "epoch": 4.712920837124659, "grad_norm": 0.30452188895299553, "learning_rate": 8.13761155122239e-09, "loss": 0.0009, "step": 10359 }, { "epoch": 4.713375796178344, "grad_norm": 0.5864266539194357, "learning_rate": 8.111949794190054e-09, "loss": 0.0025, "step": 10360 }, { "epoch": 4.7138307552320295, "grad_norm": 0.8406074759470012, "learning_rate": 8.086328232027873e-09, "loss": 0.0071, "step": 10361 }, { "epoch": 4.714285714285714, "grad_norm": 0.47058860356554316, "learning_rate": 8.060746866829393e-09, "loss": 0.0052, "step": 10362 }, { "epoch": 4.714740673339399, "grad_norm": 0.7195172595530315, "learning_rate": 8.035205700685165e-09, "loss": 0.0091, "step": 10363 }, { "epoch": 4.715195632393085, "grad_norm": 0.5086833615080365, "learning_rate": 8.009704735682243e-09, "loss": 0.0038, "step": 10364 }, { "epoch": 4.71565059144677, "grad_norm": 0.6826326362604073, "learning_rate": 7.984243973904459e-09, "loss": 0.0055, "step": 10365 }, { "epoch": 4.716105550500455, "grad_norm": 0.5511036915954404, "learning_rate": 7.95882341743226e-09, "loss": 0.0075, "step": 10366 }, { "epoch": 4.7165605095541405, "grad_norm": 0.7215564889961903, "learning_rate": 7.933443068342982e-09, "loss": 0.0182, "step": 10367 }, { "epoch": 4.717015468607825, "grad_norm": 0.6272885313831793, "learning_rate": 7.908102928710637e-09, "loss": 0.0183, "step": 10368 }, { "epoch": 4.71747042766151, "grad_norm": 0.4590029992621699, "learning_rate": 7.882803000605842e-09, "loss": 0.0074, "step": 10369 }, { "epoch": 4.717925386715196, "grad_norm": 0.41021562100273185, "learning_rate": 7.857543286096002e-09, "loss": 0.0037, "step": 10370 }, { "epoch": 4.718380345768881, "grad_norm": 1.4007382646391104, "learning_rate": 7.832323787245187e-09, "loss": 0.0338, "step": 10371 }, { "epoch": 4.718835304822566, "grad_norm": 0.639811624361501, "learning_rate": 7.807144506114305e-09, "loss": 0.0116, "step": 10372 }, { "epoch": 4.7192902638762515, "grad_norm": 0.4227394498010489, "learning_rate": 7.782005444760819e-09, "loss": 0.0045, "step": 10373 }, { "epoch": 4.719745222929936, "grad_norm": 0.8109313607785451, "learning_rate": 7.756906605239089e-09, "loss": 0.0155, "step": 10374 }, { "epoch": 4.720200181983621, "grad_norm": 0.7931424833859521, "learning_rate": 7.731847989599915e-09, "loss": 0.0069, "step": 10375 }, { "epoch": 4.720655141037307, "grad_norm": 0.404218044216291, "learning_rate": 7.706829599891108e-09, "loss": 0.0036, "step": 10376 }, { "epoch": 4.721110100090992, "grad_norm": 0.5340417167003945, "learning_rate": 7.681851438156973e-09, "loss": 0.013, "step": 10377 }, { "epoch": 4.721565059144677, "grad_norm": 0.9631624963596079, "learning_rate": 7.656913506438712e-09, "loss": 0.0042, "step": 10378 }, { "epoch": 4.7220200181983625, "grad_norm": 0.7563192941841547, "learning_rate": 7.632015806774029e-09, "loss": 0.012, "step": 10379 }, { "epoch": 4.722474977252047, "grad_norm": 0.8446820619496425, "learning_rate": 7.607158341197461e-09, "loss": 0.0235, "step": 10380 }, { "epoch": 4.722929936305732, "grad_norm": 0.43828472570484156, "learning_rate": 7.58234111174033e-09, "loss": 0.0035, "step": 10381 }, { "epoch": 4.723384895359418, "grad_norm": 0.6140432615027424, "learning_rate": 7.557564120430571e-09, "loss": 0.0074, "step": 10382 }, { "epoch": 4.723839854413103, "grad_norm": 0.2950514146474311, "learning_rate": 7.532827369292782e-09, "loss": 0.0019, "step": 10383 }, { "epoch": 4.724294813466788, "grad_norm": 0.6284431202507317, "learning_rate": 7.508130860348405e-09, "loss": 0.0067, "step": 10384 }, { "epoch": 4.7247497725204735, "grad_norm": 0.6259918010458667, "learning_rate": 7.483474595615492e-09, "loss": 0.0067, "step": 10385 }, { "epoch": 4.725204731574158, "grad_norm": 0.25259093929802645, "learning_rate": 7.458858577108818e-09, "loss": 0.0022, "step": 10386 }, { "epoch": 4.725659690627843, "grad_norm": 0.7698790242264756, "learning_rate": 7.434282806839942e-09, "loss": 0.0078, "step": 10387 }, { "epoch": 4.726114649681529, "grad_norm": 0.47876804447869825, "learning_rate": 7.409747286817091e-09, "loss": 0.0031, "step": 10388 }, { "epoch": 4.726569608735214, "grad_norm": 0.8586699780281701, "learning_rate": 7.3852520190451625e-09, "loss": 0.0164, "step": 10389 }, { "epoch": 4.727024567788899, "grad_norm": 0.15412219225849458, "learning_rate": 7.3607970055258315e-09, "loss": 0.0005, "step": 10390 }, { "epoch": 4.727479526842584, "grad_norm": 0.8110517367621118, "learning_rate": 7.336382248257389e-09, "loss": 0.0051, "step": 10391 }, { "epoch": 4.727934485896269, "grad_norm": 0.6302641463561162, "learning_rate": 7.312007749234961e-09, "loss": 0.008, "step": 10392 }, { "epoch": 4.728389444949954, "grad_norm": 0.48787743400777844, "learning_rate": 7.287673510450343e-09, "loss": 0.0036, "step": 10393 }, { "epoch": 4.72884440400364, "grad_norm": 0.6868225760721695, "learning_rate": 7.263379533891889e-09, "loss": 0.0089, "step": 10394 }, { "epoch": 4.729299363057325, "grad_norm": 0.7471669188319842, "learning_rate": 7.2391258215449554e-09, "loss": 0.0179, "step": 10395 }, { "epoch": 4.72975432211101, "grad_norm": 0.57433987437366, "learning_rate": 7.214912375391291e-09, "loss": 0.0037, "step": 10396 }, { "epoch": 4.730209281164695, "grad_norm": 0.6751142287739248, "learning_rate": 7.190739197409645e-09, "loss": 0.0083, "step": 10397 }, { "epoch": 4.73066424021838, "grad_norm": 0.960469401054423, "learning_rate": 7.166606289575272e-09, "loss": 0.0069, "step": 10398 }, { "epoch": 4.731119199272065, "grad_norm": 0.7064583858477512, "learning_rate": 7.14251365386026e-09, "loss": 0.0087, "step": 10399 }, { "epoch": 4.731574158325751, "grad_norm": 0.7969378559165439, "learning_rate": 7.118461292233257e-09, "loss": 0.0105, "step": 10400 }, { "epoch": 4.732029117379436, "grad_norm": 0.7110899261160112, "learning_rate": 7.0944492066597475e-09, "loss": 0.0081, "step": 10401 }, { "epoch": 4.732484076433121, "grad_norm": 0.898606298959133, "learning_rate": 7.070477399101937e-09, "loss": 0.0032, "step": 10402 }, { "epoch": 4.732939035486806, "grad_norm": 0.33803842952902863, "learning_rate": 7.04654587151865e-09, "loss": 0.005, "step": 10403 }, { "epoch": 4.733393994540491, "grad_norm": 0.4962070017004507, "learning_rate": 7.0226546258655425e-09, "loss": 0.0056, "step": 10404 }, { "epoch": 4.733848953594176, "grad_norm": 0.41048292611463927, "learning_rate": 6.9988036640947215e-09, "loss": 0.0021, "step": 10405 }, { "epoch": 4.734303912647862, "grad_norm": 0.4536170583338753, "learning_rate": 6.974992988155404e-09, "loss": 0.0046, "step": 10406 }, { "epoch": 4.734758871701547, "grad_norm": 1.1471229333097925, "learning_rate": 6.951222599993089e-09, "loss": 0.0056, "step": 10407 }, { "epoch": 4.735213830755232, "grad_norm": 0.30553169962655563, "learning_rate": 6.927492501550281e-09, "loss": 0.0025, "step": 10408 }, { "epoch": 4.735668789808917, "grad_norm": 1.044119355959611, "learning_rate": 6.903802694766148e-09, "loss": 0.003, "step": 10409 }, { "epoch": 4.736123748862602, "grad_norm": 0.5173159579231191, "learning_rate": 6.88015318157642e-09, "loss": 0.0112, "step": 10410 }, { "epoch": 4.736578707916287, "grad_norm": 0.6550121964821447, "learning_rate": 6.8565439639136635e-09, "loss": 0.0068, "step": 10411 }, { "epoch": 4.737033666969973, "grad_norm": 0.7796816135762472, "learning_rate": 6.8329750437071674e-09, "loss": 0.0149, "step": 10412 }, { "epoch": 4.737488626023658, "grad_norm": 0.7441967965502219, "learning_rate": 6.809446422882781e-09, "loss": 0.0111, "step": 10413 }, { "epoch": 4.737943585077343, "grad_norm": 0.49105596057384115, "learning_rate": 6.785958103363243e-09, "loss": 0.0081, "step": 10414 }, { "epoch": 4.738398544131028, "grad_norm": 0.5195948062552787, "learning_rate": 6.762510087067741e-09, "loss": 0.0057, "step": 10415 }, { "epoch": 4.738853503184713, "grad_norm": 0.5873283150754964, "learning_rate": 6.739102375912575e-09, "loss": 0.0049, "step": 10416 }, { "epoch": 4.739308462238398, "grad_norm": 0.47619798669341573, "learning_rate": 6.7157349718104386e-09, "loss": 0.0049, "step": 10417 }, { "epoch": 4.739763421292084, "grad_norm": 0.508077800245102, "learning_rate": 6.6924078766708025e-09, "loss": 0.0038, "step": 10418 }, { "epoch": 4.740218380345769, "grad_norm": 0.42468775868270237, "learning_rate": 6.66912109239981e-09, "loss": 0.0029, "step": 10419 }, { "epoch": 4.740673339399454, "grad_norm": 0.7244410135265784, "learning_rate": 6.645874620900327e-09, "loss": 0.0042, "step": 10420 }, { "epoch": 4.741128298453139, "grad_norm": 1.2024872191009837, "learning_rate": 6.622668464072056e-09, "loss": 0.0075, "step": 10421 }, { "epoch": 4.741583257506824, "grad_norm": 0.36339328657693243, "learning_rate": 6.5995026238112035e-09, "loss": 0.0018, "step": 10422 }, { "epoch": 4.742038216560509, "grad_norm": 0.7484757437240657, "learning_rate": 6.576377102010866e-09, "loss": 0.0156, "step": 10423 }, { "epoch": 4.742493175614195, "grad_norm": 0.6997128177145208, "learning_rate": 6.553291900560698e-09, "loss": 0.0051, "step": 10424 }, { "epoch": 4.74294813466788, "grad_norm": 0.5154727400316943, "learning_rate": 6.5302470213470796e-09, "loss": 0.0017, "step": 10425 }, { "epoch": 4.743403093721565, "grad_norm": 0.8190170541234394, "learning_rate": 6.507242466253282e-09, "loss": 0.0085, "step": 10426 }, { "epoch": 4.74385805277525, "grad_norm": 0.7368295919922379, "learning_rate": 6.484278237158969e-09, "loss": 0.0146, "step": 10427 }, { "epoch": 4.744313011828935, "grad_norm": 0.5127445362112418, "learning_rate": 6.461354335940805e-09, "loss": 0.0027, "step": 10428 }, { "epoch": 4.744767970882621, "grad_norm": 0.5819477466579129, "learning_rate": 6.438470764471848e-09, "loss": 0.0042, "step": 10429 }, { "epoch": 4.745222929936306, "grad_norm": 0.8453474077983189, "learning_rate": 6.4156275246222135e-09, "loss": 0.0124, "step": 10430 }, { "epoch": 4.745677888989991, "grad_norm": 0.506673836637756, "learning_rate": 6.392824618258519e-09, "loss": 0.0071, "step": 10431 }, { "epoch": 4.746132848043676, "grad_norm": 0.5823016660396074, "learning_rate": 6.370062047244051e-09, "loss": 0.0054, "step": 10432 }, { "epoch": 4.746587807097361, "grad_norm": 0.5316810822460947, "learning_rate": 6.347339813438934e-09, "loss": 0.0067, "step": 10433 }, { "epoch": 4.747042766151046, "grad_norm": 0.6688090651008716, "learning_rate": 6.324657918699849e-09, "loss": 0.006, "step": 10434 }, { "epoch": 4.747497725204732, "grad_norm": 0.6447603335998947, "learning_rate": 6.302016364880369e-09, "loss": 0.0103, "step": 10435 }, { "epoch": 4.747952684258417, "grad_norm": 0.49700411657555543, "learning_rate": 6.279415153830514e-09, "loss": 0.0066, "step": 10436 }, { "epoch": 4.748407643312102, "grad_norm": 0.6600452020952593, "learning_rate": 6.256854287397251e-09, "loss": 0.008, "step": 10437 }, { "epoch": 4.748862602365787, "grad_norm": 0.360203468230306, "learning_rate": 6.23433376742416e-09, "loss": 0.0029, "step": 10438 }, { "epoch": 4.749317561419472, "grad_norm": 0.20792308618749586, "learning_rate": 6.211853595751493e-09, "loss": 0.0008, "step": 10439 }, { "epoch": 4.749772520473157, "grad_norm": 0.5428874568515234, "learning_rate": 6.189413774216168e-09, "loss": 0.0047, "step": 10440 }, { "epoch": 4.750227479526843, "grad_norm": 0.5112648977534553, "learning_rate": 6.167014304651996e-09, "loss": 0.0103, "step": 10441 }, { "epoch": 4.750682438580528, "grad_norm": 0.6246408105131716, "learning_rate": 6.144655188889236e-09, "loss": 0.0032, "step": 10442 }, { "epoch": 4.751137397634213, "grad_norm": 0.64931343181716, "learning_rate": 6.122336428755037e-09, "loss": 0.0077, "step": 10443 }, { "epoch": 4.751592356687898, "grad_norm": 0.9328863937715289, "learning_rate": 6.100058026073107e-09, "loss": 0.008, "step": 10444 }, { "epoch": 4.752047315741583, "grad_norm": 0.4501393171535572, "learning_rate": 6.0778199826641005e-09, "loss": 0.0061, "step": 10445 }, { "epoch": 4.752502274795268, "grad_norm": 0.7938481200901649, "learning_rate": 6.055622300345064e-09, "loss": 0.0113, "step": 10446 }, { "epoch": 4.752957233848954, "grad_norm": 0.5583076670576965, "learning_rate": 6.033464980929992e-09, "loss": 0.0059, "step": 10447 }, { "epoch": 4.753412192902639, "grad_norm": 0.7944967769740707, "learning_rate": 6.011348026229323e-09, "loss": 0.0035, "step": 10448 }, { "epoch": 4.753867151956324, "grad_norm": 1.0332770949027503, "learning_rate": 5.989271438050558e-09, "loss": 0.0112, "step": 10449 }, { "epoch": 4.754322111010009, "grad_norm": 0.4338289640819529, "learning_rate": 5.96723521819753e-09, "loss": 0.0056, "step": 10450 }, { "epoch": 4.754777070063694, "grad_norm": 0.9046798804253445, "learning_rate": 5.945239368471078e-09, "loss": 0.0045, "step": 10451 }, { "epoch": 4.755232029117379, "grad_norm": 0.6380247741362725, "learning_rate": 5.923283890668484e-09, "loss": 0.0048, "step": 10452 }, { "epoch": 4.755686988171065, "grad_norm": 0.5098626282808184, "learning_rate": 5.9013687865839265e-09, "loss": 0.0079, "step": 10453 }, { "epoch": 4.75614194722475, "grad_norm": 0.5387091849588442, "learning_rate": 5.8794940580081386e-09, "loss": 0.0081, "step": 10454 }, { "epoch": 4.756596906278435, "grad_norm": 0.43433811528645816, "learning_rate": 5.85765970672869e-09, "loss": 0.0039, "step": 10455 }, { "epoch": 4.75705186533212, "grad_norm": 0.8521293053745072, "learning_rate": 5.835865734529821e-09, "loss": 0.0069, "step": 10456 }, { "epoch": 4.757506824385805, "grad_norm": 0.5499418990833075, "learning_rate": 5.814112143192273e-09, "loss": 0.0036, "step": 10457 }, { "epoch": 4.757961783439491, "grad_norm": 0.514918858501837, "learning_rate": 5.792398934493847e-09, "loss": 0.0059, "step": 10458 }, { "epoch": 4.758416742493176, "grad_norm": 0.590377021412508, "learning_rate": 5.770726110208679e-09, "loss": 0.0056, "step": 10459 }, { "epoch": 4.758871701546861, "grad_norm": 0.5213616292746899, "learning_rate": 5.749093672107907e-09, "loss": 0.0058, "step": 10460 }, { "epoch": 4.759326660600546, "grad_norm": 0.4872649378776043, "learning_rate": 5.7275016219591745e-09, "loss": 0.0055, "step": 10461 }, { "epoch": 4.759781619654231, "grad_norm": 0.4418567919887484, "learning_rate": 5.705949961526901e-09, "loss": 0.007, "step": 10462 }, { "epoch": 4.760236578707916, "grad_norm": 0.3615543406231569, "learning_rate": 5.68443869257218e-09, "loss": 0.0017, "step": 10463 }, { "epoch": 4.760691537761602, "grad_norm": 0.5240492585482395, "learning_rate": 5.662967816852771e-09, "loss": 0.004, "step": 10464 }, { "epoch": 4.761146496815287, "grad_norm": 0.5924260416673713, "learning_rate": 5.64153733612327e-09, "loss": 0.003, "step": 10465 }, { "epoch": 4.761601455868972, "grad_norm": 0.4393975111683296, "learning_rate": 5.6201472521348885e-09, "loss": 0.0057, "step": 10466 }, { "epoch": 4.762056414922657, "grad_norm": 0.8156086712529271, "learning_rate": 5.598797566635394e-09, "loss": 0.0093, "step": 10467 }, { "epoch": 4.762511373976342, "grad_norm": 0.4685407362671836, "learning_rate": 5.577488281369502e-09, "loss": 0.0049, "step": 10468 }, { "epoch": 4.762966333030027, "grad_norm": 0.5900804282951355, "learning_rate": 5.5562193980784874e-09, "loss": 0.0096, "step": 10469 }, { "epoch": 4.763421292083713, "grad_norm": 0.4924084503478246, "learning_rate": 5.534990918500293e-09, "loss": 0.0035, "step": 10470 }, { "epoch": 4.763876251137398, "grad_norm": 0.445873179134492, "learning_rate": 5.513802844369642e-09, "loss": 0.0056, "step": 10471 }, { "epoch": 4.764331210191083, "grad_norm": 1.8897236569656592, "learning_rate": 5.492655177418037e-09, "loss": 0.0073, "step": 10472 }, { "epoch": 4.764786169244768, "grad_norm": 0.34876602472666485, "learning_rate": 5.471547919373376e-09, "loss": 0.0037, "step": 10473 }, { "epoch": 4.765241128298453, "grad_norm": 0.48823562515070157, "learning_rate": 5.450481071960611e-09, "loss": 0.0049, "step": 10474 }, { "epoch": 4.765696087352138, "grad_norm": 0.8046686805664002, "learning_rate": 5.429454636901143e-09, "loss": 0.0085, "step": 10475 }, { "epoch": 4.766151046405824, "grad_norm": 0.3688185857914118, "learning_rate": 5.408468615913209e-09, "loss": 0.0038, "step": 10476 }, { "epoch": 4.766606005459509, "grad_norm": 0.5306008752379477, "learning_rate": 5.387523010711603e-09, "loss": 0.0057, "step": 10477 }, { "epoch": 4.767060964513194, "grad_norm": 0.5852483477227836, "learning_rate": 5.3666178230079575e-09, "loss": 0.0086, "step": 10478 }, { "epoch": 4.767515923566879, "grad_norm": 0.47412949825840156, "learning_rate": 5.345753054510627e-09, "loss": 0.0037, "step": 10479 }, { "epoch": 4.767970882620564, "grad_norm": 0.48272753260682083, "learning_rate": 5.324928706924525e-09, "loss": 0.0042, "step": 10480 }, { "epoch": 4.768425841674249, "grad_norm": 1.6672398762695209, "learning_rate": 5.304144781951292e-09, "loss": 0.0181, "step": 10481 }, { "epoch": 4.768880800727935, "grad_norm": 0.5917818906149135, "learning_rate": 5.28340128128929e-09, "loss": 0.0048, "step": 10482 }, { "epoch": 4.76933575978162, "grad_norm": 0.7791435700233358, "learning_rate": 5.262698206633609e-09, "loss": 0.0057, "step": 10483 }, { "epoch": 4.769790718835305, "grad_norm": 0.5041794796843777, "learning_rate": 5.2420355596760615e-09, "loss": 0.0039, "step": 10484 }, { "epoch": 4.77024567788899, "grad_norm": 0.6998230374431356, "learning_rate": 5.221413342105019e-09, "loss": 0.0137, "step": 10485 }, { "epoch": 4.770700636942675, "grad_norm": 0.7878161371414856, "learning_rate": 5.200831555605745e-09, "loss": 0.0076, "step": 10486 }, { "epoch": 4.77115559599636, "grad_norm": 0.627593208560225, "learning_rate": 5.180290201859949e-09, "loss": 0.0116, "step": 10487 }, { "epoch": 4.771610555050046, "grad_norm": 0.5885169497417331, "learning_rate": 5.15978928254629e-09, "loss": 0.0079, "step": 10488 }, { "epoch": 4.772065514103731, "grad_norm": 0.5297525587974209, "learning_rate": 5.139328799339981e-09, "loss": 0.0047, "step": 10489 }, { "epoch": 4.772520473157416, "grad_norm": 0.5301069118813665, "learning_rate": 5.118908753912965e-09, "loss": 0.0044, "step": 10490 }, { "epoch": 4.772975432211101, "grad_norm": 0.741836933181208, "learning_rate": 5.0985291479338496e-09, "loss": 0.0151, "step": 10491 }, { "epoch": 4.773430391264786, "grad_norm": 0.4686077565506677, "learning_rate": 5.078189983067915e-09, "loss": 0.0061, "step": 10492 }, { "epoch": 4.773885350318471, "grad_norm": 0.7119599623653252, "learning_rate": 5.057891260977276e-09, "loss": 0.0104, "step": 10493 }, { "epoch": 4.774340309372157, "grad_norm": 0.7149158319588449, "learning_rate": 5.037632983320661e-09, "loss": 0.006, "step": 10494 }, { "epoch": 4.774795268425842, "grad_norm": 0.45115742946824966, "learning_rate": 5.017415151753412e-09, "loss": 0.0044, "step": 10495 }, { "epoch": 4.7752502274795265, "grad_norm": 0.43394855639644103, "learning_rate": 4.997237767927709e-09, "loss": 0.0017, "step": 10496 }, { "epoch": 4.775705186533212, "grad_norm": 0.5087563691679025, "learning_rate": 4.977100833492287e-09, "loss": 0.0099, "step": 10497 }, { "epoch": 4.776160145586897, "grad_norm": 0.6064778668163726, "learning_rate": 4.957004350092719e-09, "loss": 0.0122, "step": 10498 }, { "epoch": 4.776615104640582, "grad_norm": 0.6134677455514248, "learning_rate": 4.936948319371137e-09, "loss": 0.0055, "step": 10499 }, { "epoch": 4.777070063694268, "grad_norm": 0.6483638556030055, "learning_rate": 4.9169327429664505e-09, "loss": 0.013, "step": 10500 }, { "epoch": 4.777525022747953, "grad_norm": 0.9699122314353043, "learning_rate": 4.8969576225142975e-09, "loss": 0.0047, "step": 10501 }, { "epoch": 4.7779799818016375, "grad_norm": 0.6129849771024098, "learning_rate": 4.877022959646815e-09, "loss": 0.0113, "step": 10502 }, { "epoch": 4.778434940855323, "grad_norm": 0.4739898175403287, "learning_rate": 4.8571287559930896e-09, "loss": 0.0036, "step": 10503 }, { "epoch": 4.778889899909008, "grad_norm": 0.716885238752244, "learning_rate": 4.837275013178821e-09, "loss": 0.006, "step": 10504 }, { "epoch": 4.779344858962693, "grad_norm": 0.8103624787744323, "learning_rate": 4.8174617328262665e-09, "loss": 0.0073, "step": 10505 }, { "epoch": 4.779799818016379, "grad_norm": 0.4774224973378116, "learning_rate": 4.797688916554465e-09, "loss": 0.0035, "step": 10506 }, { "epoch": 4.780254777070064, "grad_norm": 0.4882582975845527, "learning_rate": 4.777956565979235e-09, "loss": 0.0056, "step": 10507 }, { "epoch": 4.7807097361237485, "grad_norm": 0.6123276421617033, "learning_rate": 4.758264682713009e-09, "loss": 0.0082, "step": 10508 }, { "epoch": 4.781164695177434, "grad_norm": 0.32097269834844705, "learning_rate": 4.738613268364888e-09, "loss": 0.0048, "step": 10509 }, { "epoch": 4.781619654231119, "grad_norm": 0.6602284084612496, "learning_rate": 4.719002324540755e-09, "loss": 0.0048, "step": 10510 }, { "epoch": 4.782074613284804, "grad_norm": 0.6889508707219377, "learning_rate": 4.699431852842994e-09, "loss": 0.0092, "step": 10511 }, { "epoch": 4.78252957233849, "grad_norm": 0.35480728633241887, "learning_rate": 4.679901854870993e-09, "loss": 0.0024, "step": 10512 }, { "epoch": 4.782984531392175, "grad_norm": 0.6840662933235896, "learning_rate": 4.660412332220476e-09, "loss": 0.0149, "step": 10513 }, { "epoch": 4.7834394904458595, "grad_norm": 0.7256959253606535, "learning_rate": 4.640963286484223e-09, "loss": 0.0184, "step": 10514 }, { "epoch": 4.783894449499545, "grad_norm": 0.5308410615711039, "learning_rate": 4.621554719251408e-09, "loss": 0.0118, "step": 10515 }, { "epoch": 4.78434940855323, "grad_norm": 0.5468217496537418, "learning_rate": 4.602186632107985e-09, "loss": 0.0019, "step": 10516 }, { "epoch": 4.784804367606915, "grad_norm": 0.4591411685092322, "learning_rate": 4.582859026636687e-09, "loss": 0.0038, "step": 10517 }, { "epoch": 4.785259326660601, "grad_norm": 0.6812552413859404, "learning_rate": 4.563571904416918e-09, "loss": 0.0106, "step": 10518 }, { "epoch": 4.785714285714286, "grad_norm": 0.5327263531263847, "learning_rate": 4.5443252670246404e-09, "loss": 0.0045, "step": 10519 }, { "epoch": 4.7861692447679705, "grad_norm": 0.5173395244563501, "learning_rate": 4.5251191160326495e-09, "loss": 0.0058, "step": 10520 }, { "epoch": 4.786624203821656, "grad_norm": 1.0846367278021831, "learning_rate": 4.505953453010358e-09, "loss": 0.004, "step": 10521 }, { "epoch": 4.787079162875341, "grad_norm": 0.5957347903754597, "learning_rate": 4.486828279523902e-09, "loss": 0.0079, "step": 10522 }, { "epoch": 4.787534121929026, "grad_norm": 0.5559773279565002, "learning_rate": 4.467743597136197e-09, "loss": 0.0072, "step": 10523 }, { "epoch": 4.787989080982712, "grad_norm": 0.4126539456301807, "learning_rate": 4.4486994074066645e-09, "loss": 0.0018, "step": 10524 }, { "epoch": 4.788444040036397, "grad_norm": 0.5687022471278109, "learning_rate": 4.429695711891501e-09, "loss": 0.0146, "step": 10525 }, { "epoch": 4.788898999090081, "grad_norm": 0.49498401041490964, "learning_rate": 4.410732512143633e-09, "loss": 0.0068, "step": 10526 }, { "epoch": 4.789353958143767, "grad_norm": 0.5735796974417597, "learning_rate": 4.391809809712599e-09, "loss": 0.0131, "step": 10527 }, { "epoch": 4.789808917197452, "grad_norm": 0.814300193772623, "learning_rate": 4.372927606144771e-09, "loss": 0.0078, "step": 10528 }, { "epoch": 4.790263876251137, "grad_norm": 0.5776633821849256, "learning_rate": 4.354085902983084e-09, "loss": 0.0063, "step": 10529 }, { "epoch": 4.790718835304823, "grad_norm": 1.0264505723351702, "learning_rate": 4.335284701767139e-09, "loss": 0.0077, "step": 10530 }, { "epoch": 4.7911737943585075, "grad_norm": 0.5235753352380842, "learning_rate": 4.316524004033317e-09, "loss": 0.0055, "step": 10531 }, { "epoch": 4.791628753412192, "grad_norm": 0.7325640340414263, "learning_rate": 4.297803811314727e-09, "loss": 0.0093, "step": 10532 }, { "epoch": 4.792083712465878, "grad_norm": 0.41552843493668684, "learning_rate": 4.279124125140976e-09, "loss": 0.0027, "step": 10533 }, { "epoch": 4.792538671519563, "grad_norm": 0.58887568654597, "learning_rate": 4.260484947038567e-09, "loss": 0.0065, "step": 10534 }, { "epoch": 4.792993630573249, "grad_norm": 0.998827812933085, "learning_rate": 4.241886278530615e-09, "loss": 0.0075, "step": 10535 }, { "epoch": 4.793448589626934, "grad_norm": 0.45293006840840183, "learning_rate": 4.223328121136849e-09, "loss": 0.0036, "step": 10536 }, { "epoch": 4.7939035486806185, "grad_norm": 0.5303145772160968, "learning_rate": 4.204810476373833e-09, "loss": 0.0036, "step": 10537 }, { "epoch": 4.794358507734304, "grad_norm": 0.5372630387959583, "learning_rate": 4.186333345754689e-09, "loss": 0.004, "step": 10538 }, { "epoch": 4.794813466787989, "grad_norm": 0.6180578415435019, "learning_rate": 4.167896730789322e-09, "loss": 0.0073, "step": 10539 }, { "epoch": 4.795268425841674, "grad_norm": 0.5030931077416264, "learning_rate": 4.149500632984304e-09, "loss": 0.0047, "step": 10540 }, { "epoch": 4.79572338489536, "grad_norm": 0.19823949337835844, "learning_rate": 4.131145053842766e-09, "loss": 0.0006, "step": 10541 }, { "epoch": 4.796178343949045, "grad_norm": 0.5748107187066708, "learning_rate": 4.112829994864842e-09, "loss": 0.0044, "step": 10542 }, { "epoch": 4.7966333030027295, "grad_norm": 0.5464888107354088, "learning_rate": 4.094555457547e-09, "loss": 0.0043, "step": 10543 }, { "epoch": 4.797088262056415, "grad_norm": 0.45412076347949887, "learning_rate": 4.076321443382602e-09, "loss": 0.0022, "step": 10544 }, { "epoch": 4.7975432211101, "grad_norm": 0.7526746019119704, "learning_rate": 4.058127953861568e-09, "loss": 0.0049, "step": 10545 }, { "epoch": 4.797998180163785, "grad_norm": 0.29787735514603614, "learning_rate": 4.039974990470762e-09, "loss": 0.0027, "step": 10546 }, { "epoch": 4.798453139217471, "grad_norm": 0.6129810822388362, "learning_rate": 4.021862554693445e-09, "loss": 0.0095, "step": 10547 }, { "epoch": 4.798908098271156, "grad_norm": 0.476244288422151, "learning_rate": 4.003790648009653e-09, "loss": 0.0095, "step": 10548 }, { "epoch": 4.7993630573248405, "grad_norm": 0.5957616459904725, "learning_rate": 3.985759271896261e-09, "loss": 0.01, "step": 10549 }, { "epoch": 4.799818016378526, "grad_norm": 0.6752096701811061, "learning_rate": 3.96776842782659e-09, "loss": 0.0116, "step": 10550 }, { "epoch": 4.800272975432211, "grad_norm": 0.2712746354992372, "learning_rate": 3.949818117270798e-09, "loss": 0.0017, "step": 10551 }, { "epoch": 4.800727934485896, "grad_norm": 0.6740361010677578, "learning_rate": 3.9319083416957665e-09, "loss": 0.011, "step": 10552 }, { "epoch": 4.801182893539582, "grad_norm": 0.5985330547058618, "learning_rate": 3.914039102564992e-09, "loss": 0.0113, "step": 10553 }, { "epoch": 4.801637852593267, "grad_norm": 0.46068054861207713, "learning_rate": 3.896210401338584e-09, "loss": 0.0037, "step": 10554 }, { "epoch": 4.8020928116469515, "grad_norm": 0.6689227019983588, "learning_rate": 3.878422239473489e-09, "loss": 0.0032, "step": 10555 }, { "epoch": 4.802547770700637, "grad_norm": 0.3674692278464905, "learning_rate": 3.8606746184232655e-09, "loss": 0.0029, "step": 10556 }, { "epoch": 4.803002729754322, "grad_norm": 0.5006304938767986, "learning_rate": 3.842967539638198e-09, "loss": 0.0069, "step": 10557 }, { "epoch": 4.803457688808007, "grad_norm": 0.6033919989057219, "learning_rate": 3.8253010045651845e-09, "loss": 0.0065, "step": 10558 }, { "epoch": 4.803912647861693, "grad_norm": 0.4260170292877747, "learning_rate": 3.807675014647849e-09, "loss": 0.0104, "step": 10559 }, { "epoch": 4.804367606915378, "grad_norm": 0.7506003638479329, "learning_rate": 3.790089571326538e-09, "loss": 0.0084, "step": 10560 }, { "epoch": 4.804822565969062, "grad_norm": 0.89572408506451, "learning_rate": 3.772544676038214e-09, "loss": 0.0043, "step": 10561 }, { "epoch": 4.805277525022748, "grad_norm": 0.4633031741162467, "learning_rate": 3.755040330216619e-09, "loss": 0.0042, "step": 10562 }, { "epoch": 4.805732484076433, "grad_norm": 0.4876344480308314, "learning_rate": 3.737576535292108e-09, "loss": 0.006, "step": 10563 }, { "epoch": 4.806187443130118, "grad_norm": 0.2875857065785412, "learning_rate": 3.7201532926917634e-09, "loss": 0.0018, "step": 10564 }, { "epoch": 4.806642402183804, "grad_norm": 0.6414016418812682, "learning_rate": 3.702770603839278e-09, "loss": 0.0084, "step": 10565 }, { "epoch": 4.8070973612374885, "grad_norm": 0.45612996804384026, "learning_rate": 3.6854284701551276e-09, "loss": 0.005, "step": 10566 }, { "epoch": 4.807552320291173, "grad_norm": 0.49178571370743585, "learning_rate": 3.6681268930564558e-09, "loss": 0.0124, "step": 10567 }, { "epoch": 4.808007279344859, "grad_norm": 0.6065517361244969, "learning_rate": 3.650865873957021e-09, "loss": 0.0115, "step": 10568 }, { "epoch": 4.808462238398544, "grad_norm": 0.38240681485382483, "learning_rate": 3.6336454142673054e-09, "loss": 0.0013, "step": 10569 }, { "epoch": 4.80891719745223, "grad_norm": 0.46245201988633033, "learning_rate": 3.616465515394518e-09, "loss": 0.0074, "step": 10570 }, { "epoch": 4.809372156505915, "grad_norm": 0.7272018936043848, "learning_rate": 3.5993261787425343e-09, "loss": 0.0119, "step": 10571 }, { "epoch": 4.8098271155595995, "grad_norm": 0.2773818059332094, "learning_rate": 3.5822274057119017e-09, "loss": 0.0018, "step": 10572 }, { "epoch": 4.810282074613285, "grad_norm": 0.6293190038305414, "learning_rate": 3.5651691976998355e-09, "loss": 0.0082, "step": 10573 }, { "epoch": 4.81073703366697, "grad_norm": 0.5706644838387509, "learning_rate": 3.5481515561002764e-09, "loss": 0.0086, "step": 10574 }, { "epoch": 4.811191992720655, "grad_norm": 0.6528863659636005, "learning_rate": 3.5311744823037783e-09, "loss": 0.0067, "step": 10575 }, { "epoch": 4.811646951774341, "grad_norm": 0.4626619666115997, "learning_rate": 3.514237977697676e-09, "loss": 0.0057, "step": 10576 }, { "epoch": 4.812101910828026, "grad_norm": 0.7861061873130694, "learning_rate": 3.497342043665974e-09, "loss": 0.0053, "step": 10577 }, { "epoch": 4.8125568698817105, "grad_norm": 0.703804940033147, "learning_rate": 3.4804866815892898e-09, "loss": 0.0114, "step": 10578 }, { "epoch": 4.813011828935396, "grad_norm": 0.6926532891669845, "learning_rate": 3.4636718928449105e-09, "loss": 0.011, "step": 10579 }, { "epoch": 4.813466787989081, "grad_norm": 0.6310270262417765, "learning_rate": 3.4468976788069592e-09, "loss": 0.0109, "step": 10580 }, { "epoch": 4.813921747042766, "grad_norm": 0.5639257192941802, "learning_rate": 3.430164040846173e-09, "loss": 0.0088, "step": 10581 }, { "epoch": 4.814376706096452, "grad_norm": 0.2915039355732343, "learning_rate": 3.4134709803298466e-09, "loss": 0.0029, "step": 10582 }, { "epoch": 4.814831665150137, "grad_norm": 0.523773834483009, "learning_rate": 3.396818498622056e-09, "loss": 0.0031, "step": 10583 }, { "epoch": 4.8152866242038215, "grad_norm": 1.0713696846917191, "learning_rate": 3.380206597083657e-09, "loss": 0.0142, "step": 10584 }, { "epoch": 4.815741583257507, "grad_norm": 0.6119166443565629, "learning_rate": 3.3636352770720634e-09, "loss": 0.0061, "step": 10585 }, { "epoch": 4.816196542311192, "grad_norm": 0.43235782529062255, "learning_rate": 3.3471045399414142e-09, "loss": 0.0059, "step": 10586 }, { "epoch": 4.816651501364877, "grad_norm": 0.9494656652032167, "learning_rate": 3.330614387042574e-09, "loss": 0.0082, "step": 10587 }, { "epoch": 4.817106460418563, "grad_norm": 0.4557188412224058, "learning_rate": 3.314164819722909e-09, "loss": 0.008, "step": 10588 }, { "epoch": 4.817561419472248, "grad_norm": 0.360068306008142, "learning_rate": 3.2977558393266767e-09, "loss": 0.0024, "step": 10589 }, { "epoch": 4.8180163785259325, "grad_norm": 0.4577619744627913, "learning_rate": 3.28138744719475e-09, "loss": 0.0072, "step": 10590 }, { "epoch": 4.818471337579618, "grad_norm": 0.39451015789079313, "learning_rate": 3.265059644664725e-09, "loss": 0.0036, "step": 10591 }, { "epoch": 4.818926296633303, "grad_norm": 0.6868034814503875, "learning_rate": 3.2487724330707567e-09, "loss": 0.0139, "step": 10592 }, { "epoch": 4.819381255686988, "grad_norm": 0.6209942402264509, "learning_rate": 3.2325258137437805e-09, "loss": 0.0064, "step": 10593 }, { "epoch": 4.819836214740674, "grad_norm": 0.7106564743856439, "learning_rate": 3.2163197880114566e-09, "loss": 0.0114, "step": 10594 }, { "epoch": 4.820291173794359, "grad_norm": 0.5869459924493536, "learning_rate": 3.2001543571980038e-09, "loss": 0.0133, "step": 10595 }, { "epoch": 4.820746132848043, "grad_norm": 0.6140365076703708, "learning_rate": 3.184029522624421e-09, "loss": 0.004, "step": 10596 }, { "epoch": 4.821201091901729, "grad_norm": 0.7081922324484646, "learning_rate": 3.1679452856083198e-09, "loss": 0.007, "step": 10597 }, { "epoch": 4.821656050955414, "grad_norm": 0.6098565078683899, "learning_rate": 3.1519016474640946e-09, "loss": 0.0045, "step": 10598 }, { "epoch": 4.822111010009099, "grad_norm": 0.5390427260121216, "learning_rate": 3.1358986095026962e-09, "loss": 0.0116, "step": 10599 }, { "epoch": 4.822565969062785, "grad_norm": 0.6587873231545894, "learning_rate": 3.1199361730318564e-09, "loss": 0.0082, "step": 10600 }, { "epoch": 4.8230209281164695, "grad_norm": 0.4124893619525656, "learning_rate": 3.1040143393559206e-09, "loss": 0.0027, "step": 10601 }, { "epoch": 4.823475887170154, "grad_norm": 1.3662937518192686, "learning_rate": 3.0881331097759587e-09, "loss": 0.0061, "step": 10602 }, { "epoch": 4.82393084622384, "grad_norm": 0.514161288537379, "learning_rate": 3.072292485589767e-09, "loss": 0.0063, "step": 10603 }, { "epoch": 4.824385805277525, "grad_norm": 0.6662303277754468, "learning_rate": 3.056492468091698e-09, "loss": 0.0033, "step": 10604 }, { "epoch": 4.82484076433121, "grad_norm": 0.7599182235405119, "learning_rate": 3.0407330585728865e-09, "loss": 0.0132, "step": 10605 }, { "epoch": 4.825295723384896, "grad_norm": 0.6923512179459151, "learning_rate": 3.025014258321135e-09, "loss": 0.0072, "step": 10606 }, { "epoch": 4.8257506824385805, "grad_norm": 1.0977530038827916, "learning_rate": 3.0093360686209157e-09, "loss": 0.0081, "step": 10607 }, { "epoch": 4.826205641492265, "grad_norm": 1.3950845861731918, "learning_rate": 2.9936984907533157e-09, "loss": 0.0098, "step": 10608 }, { "epoch": 4.826660600545951, "grad_norm": 0.6036005155192509, "learning_rate": 2.978101525996257e-09, "loss": 0.0112, "step": 10609 }, { "epoch": 4.827115559599636, "grad_norm": 0.2291294112698119, "learning_rate": 2.962545175624165e-09, "loss": 0.0018, "step": 10610 }, { "epoch": 4.827570518653321, "grad_norm": 0.42834488076575, "learning_rate": 2.9470294409082996e-09, "loss": 0.0058, "step": 10611 }, { "epoch": 4.828025477707007, "grad_norm": 0.47051304962225227, "learning_rate": 2.9315543231165362e-09, "loss": 0.0036, "step": 10612 }, { "epoch": 4.8284804367606915, "grad_norm": 0.5083103944035803, "learning_rate": 2.9161198235134187e-09, "loss": 0.0042, "step": 10613 }, { "epoch": 4.828935395814376, "grad_norm": 0.21654870165378556, "learning_rate": 2.9007259433601606e-09, "loss": 0.002, "step": 10614 }, { "epoch": 4.829390354868062, "grad_norm": 0.8890044014824109, "learning_rate": 2.8853726839146995e-09, "loss": 0.0079, "step": 10615 }, { "epoch": 4.829845313921747, "grad_norm": 0.9750953559956923, "learning_rate": 2.8700600464316437e-09, "loss": 0.0143, "step": 10616 }, { "epoch": 4.830300272975432, "grad_norm": 0.5049644892819207, "learning_rate": 2.854788032162214e-09, "loss": 0.004, "step": 10617 }, { "epoch": 4.830755232029118, "grad_norm": 0.5419379178899109, "learning_rate": 2.8395566423544683e-09, "loss": 0.0051, "step": 10618 }, { "epoch": 4.8312101910828025, "grad_norm": 0.5835209052022392, "learning_rate": 2.8243658782529657e-09, "loss": 0.0092, "step": 10619 }, { "epoch": 4.831665150136487, "grad_norm": 0.564944882215275, "learning_rate": 2.8092157410991026e-09, "loss": 0.0039, "step": 10620 }, { "epoch": 4.832120109190173, "grad_norm": 0.3661417990109383, "learning_rate": 2.794106232130833e-09, "loss": 0.0035, "step": 10621 }, { "epoch": 4.832575068243858, "grad_norm": 0.7593800925084059, "learning_rate": 2.7790373525827804e-09, "loss": 0.012, "step": 10622 }, { "epoch": 4.833030027297543, "grad_norm": 0.49597628554598344, "learning_rate": 2.7640091036864044e-09, "loss": 0.0075, "step": 10623 }, { "epoch": 4.833484986351229, "grad_norm": 0.4176311697027282, "learning_rate": 2.7490214866697224e-09, "loss": 0.0067, "step": 10624 }, { "epoch": 4.8339399454049135, "grad_norm": 0.4805906318652338, "learning_rate": 2.734074502757422e-09, "loss": 0.0078, "step": 10625 }, { "epoch": 4.834394904458598, "grad_norm": 0.5574624656097003, "learning_rate": 2.7191681531709142e-09, "loss": 0.0042, "step": 10626 }, { "epoch": 4.834849863512284, "grad_norm": 0.46776434338873085, "learning_rate": 2.704302439128281e-09, "loss": 0.0022, "step": 10627 }, { "epoch": 4.835304822565969, "grad_norm": 0.4345051833689017, "learning_rate": 2.6894773618442723e-09, "loss": 0.0059, "step": 10628 }, { "epoch": 4.835759781619654, "grad_norm": 0.6977047249284501, "learning_rate": 2.674692922530364e-09, "loss": 0.0107, "step": 10629 }, { "epoch": 4.83621474067334, "grad_norm": 1.2771851369293792, "learning_rate": 2.6599491223946448e-09, "loss": 0.0077, "step": 10630 }, { "epoch": 4.836669699727024, "grad_norm": 0.4833689391666877, "learning_rate": 2.6452459626419288e-09, "loss": 0.0019, "step": 10631 }, { "epoch": 4.837124658780709, "grad_norm": 0.7066074673829048, "learning_rate": 2.630583444473644e-09, "loss": 0.009, "step": 10632 }, { "epoch": 4.837579617834395, "grad_norm": 0.7659131270431422, "learning_rate": 2.615961569087999e-09, "loss": 0.0089, "step": 10633 }, { "epoch": 4.83803457688808, "grad_norm": 0.6015936386949814, "learning_rate": 2.601380337679815e-09, "loss": 0.0059, "step": 10634 }, { "epoch": 4.838489535941765, "grad_norm": 0.4119792660676874, "learning_rate": 2.586839751440584e-09, "loss": 0.0026, "step": 10635 }, { "epoch": 4.8389444949954505, "grad_norm": 0.27629204171607075, "learning_rate": 2.572339811558577e-09, "loss": 0.0011, "step": 10636 }, { "epoch": 4.839399454049135, "grad_norm": 0.9458791099223549, "learning_rate": 2.5578805192185138e-09, "loss": 0.0072, "step": 10637 }, { "epoch": 4.83985441310282, "grad_norm": 0.5170738957084788, "learning_rate": 2.5434618756020043e-09, "loss": 0.0084, "step": 10638 }, { "epoch": 4.840309372156506, "grad_norm": 0.6585515503278715, "learning_rate": 2.529083881887384e-09, "loss": 0.0126, "step": 10639 }, { "epoch": 4.840764331210191, "grad_norm": 0.4021585182267562, "learning_rate": 2.514746539249435e-09, "loss": 0.0033, "step": 10640 }, { "epoch": 4.841219290263876, "grad_norm": 0.4826736071254221, "learning_rate": 2.5004498488597757e-09, "loss": 0.0054, "step": 10641 }, { "epoch": 4.8416742493175615, "grad_norm": 0.46434699058729256, "learning_rate": 2.486193811886639e-09, "loss": 0.0041, "step": 10642 }, { "epoch": 4.842129208371246, "grad_norm": 0.728333592388545, "learning_rate": 2.471978429494981e-09, "loss": 0.0064, "step": 10643 }, { "epoch": 4.842584167424932, "grad_norm": 0.5612676679896665, "learning_rate": 2.457803702846484e-09, "loss": 0.0134, "step": 10644 }, { "epoch": 4.843039126478617, "grad_norm": 0.2481925033898101, "learning_rate": 2.4436696330993323e-09, "loss": 0.0015, "step": 10645 }, { "epoch": 4.843494085532302, "grad_norm": 0.5791788214605984, "learning_rate": 2.4295762214086025e-09, "loss": 0.0074, "step": 10646 }, { "epoch": 4.843949044585988, "grad_norm": 0.6202791709803228, "learning_rate": 2.415523468925873e-09, "loss": 0.0071, "step": 10647 }, { "epoch": 4.8444040036396725, "grad_norm": 0.6235395185212882, "learning_rate": 2.4015113767995032e-09, "loss": 0.0048, "step": 10648 }, { "epoch": 4.844858962693357, "grad_norm": 0.6827629884265114, "learning_rate": 2.3875399461745215e-09, "loss": 0.01, "step": 10649 }, { "epoch": 4.845313921747043, "grad_norm": 0.4119779357262995, "learning_rate": 2.373609178192515e-09, "loss": 0.0056, "step": 10650 }, { "epoch": 4.845768880800728, "grad_norm": 1.8126479439572758, "learning_rate": 2.359719073991906e-09, "loss": 0.0088, "step": 10651 }, { "epoch": 4.846223839854413, "grad_norm": 0.3835284662360587, "learning_rate": 2.3458696347077865e-09, "loss": 0.0033, "step": 10652 }, { "epoch": 4.846678798908099, "grad_norm": 0.5980378044683499, "learning_rate": 2.332060861471752e-09, "loss": 0.0034, "step": 10653 }, { "epoch": 4.8471337579617835, "grad_norm": 0.8201724882759847, "learning_rate": 2.3182927554122877e-09, "loss": 0.0081, "step": 10654 }, { "epoch": 4.847588717015468, "grad_norm": 0.8090448848174305, "learning_rate": 2.3045653176544387e-09, "loss": 0.0078, "step": 10655 }, { "epoch": 4.848043676069154, "grad_norm": 0.2616313835229306, "learning_rate": 2.2908785493199188e-09, "loss": 0.0025, "step": 10656 }, { "epoch": 4.848498635122839, "grad_norm": 0.46841632026204805, "learning_rate": 2.2772324515272223e-09, "loss": 0.0034, "step": 10657 }, { "epoch": 4.848953594176524, "grad_norm": 0.56239663044108, "learning_rate": 2.263627025391346e-09, "loss": 0.0022, "step": 10658 }, { "epoch": 4.84940855323021, "grad_norm": 0.507363574158883, "learning_rate": 2.250062272024067e-09, "loss": 0.0058, "step": 10659 }, { "epoch": 4.8498635122838945, "grad_norm": 4.644590595057865, "learning_rate": 2.236538192533943e-09, "loss": 0.0547, "step": 10660 }, { "epoch": 4.850318471337579, "grad_norm": 0.6100239809462993, "learning_rate": 2.2230547880260353e-09, "loss": 0.0149, "step": 10661 }, { "epoch": 4.850773430391265, "grad_norm": 0.5685429072511002, "learning_rate": 2.2096120596021285e-09, "loss": 0.003, "step": 10662 }, { "epoch": 4.85122838944495, "grad_norm": 0.44149904136859935, "learning_rate": 2.196210008360733e-09, "loss": 0.0047, "step": 10663 }, { "epoch": 4.851683348498635, "grad_norm": 0.7151973459887058, "learning_rate": 2.1828486353970277e-09, "loss": 0.0079, "step": 10664 }, { "epoch": 4.852138307552321, "grad_norm": 0.620245384807896, "learning_rate": 2.169527941802751e-09, "loss": 0.0056, "step": 10665 }, { "epoch": 4.852593266606005, "grad_norm": 0.3824674497141757, "learning_rate": 2.156247928666477e-09, "loss": 0.0034, "step": 10666 }, { "epoch": 4.85304822565969, "grad_norm": 0.4657962436931095, "learning_rate": 2.1430085970733924e-09, "loss": 0.0072, "step": 10667 }, { "epoch": 4.853503184713376, "grad_norm": 0.8964763171984479, "learning_rate": 2.1298099481053545e-09, "loss": 0.0195, "step": 10668 }, { "epoch": 4.853958143767061, "grad_norm": 0.554365109850628, "learning_rate": 2.1166519828408892e-09, "loss": 0.0046, "step": 10669 }, { "epoch": 4.854413102820746, "grad_norm": 0.5808687783611847, "learning_rate": 2.103534702355192e-09, "loss": 0.0083, "step": 10670 }, { "epoch": 4.8548680618744315, "grad_norm": 0.41737886112961536, "learning_rate": 2.0904581077201276e-09, "loss": 0.0093, "step": 10671 }, { "epoch": 4.855323020928116, "grad_norm": 0.45106564181079156, "learning_rate": 2.077422200004342e-09, "loss": 0.0036, "step": 10672 }, { "epoch": 4.855777979981801, "grad_norm": 2.6755734515117275, "learning_rate": 2.064426980272982e-09, "loss": 0.0228, "step": 10673 }, { "epoch": 4.856232939035487, "grad_norm": 0.355633469146808, "learning_rate": 2.0514724495879765e-09, "loss": 0.0028, "step": 10674 }, { "epoch": 4.856687898089172, "grad_norm": 0.6041823485712094, "learning_rate": 2.0385586090079787e-09, "loss": 0.0043, "step": 10675 }, { "epoch": 4.857142857142857, "grad_norm": 0.383698461993388, "learning_rate": 2.0256854595881446e-09, "loss": 0.0037, "step": 10676 }, { "epoch": 4.8575978161965425, "grad_norm": 0.49506484171938403, "learning_rate": 2.0128530023804656e-09, "loss": 0.0057, "step": 10677 }, { "epoch": 4.858052775250227, "grad_norm": 0.9091209379939603, "learning_rate": 2.0000612384336036e-09, "loss": 0.0044, "step": 10678 }, { "epoch": 4.858507734303913, "grad_norm": 0.6251242547479949, "learning_rate": 1.9873101687927774e-09, "loss": 0.0113, "step": 10679 }, { "epoch": 4.858962693357598, "grad_norm": 0.6840261830259968, "learning_rate": 1.974599794499876e-09, "loss": 0.0062, "step": 10680 }, { "epoch": 4.859417652411283, "grad_norm": 0.6184455095114602, "learning_rate": 1.9619301165936795e-09, "loss": 0.0043, "step": 10681 }, { "epoch": 4.859872611464969, "grad_norm": 0.4094042046357908, "learning_rate": 1.9493011361094714e-09, "loss": 0.0042, "step": 10682 }, { "epoch": 4.8603275705186535, "grad_norm": 0.36156453971068664, "learning_rate": 1.936712854079148e-09, "loss": 0.0032, "step": 10683 }, { "epoch": 4.860782529572338, "grad_norm": 0.6772778699396511, "learning_rate": 1.9241652715314416e-09, "loss": 0.0072, "step": 10684 }, { "epoch": 4.861237488626024, "grad_norm": 0.454270336683913, "learning_rate": 1.9116583894915884e-09, "loss": 0.0067, "step": 10685 }, { "epoch": 4.861692447679709, "grad_norm": 0.6006250451302145, "learning_rate": 1.89919220898177e-09, "loss": 0.0096, "step": 10686 }, { "epoch": 4.862147406733394, "grad_norm": 0.4946202755172669, "learning_rate": 1.8867667310204503e-09, "loss": 0.0093, "step": 10687 }, { "epoch": 4.86260236578708, "grad_norm": 0.44912828957927575, "learning_rate": 1.8743819566232055e-09, "loss": 0.0067, "step": 10688 }, { "epoch": 4.8630573248407645, "grad_norm": 0.6974308298491466, "learning_rate": 1.8620378868018927e-09, "loss": 0.0074, "step": 10689 }, { "epoch": 4.863512283894449, "grad_norm": 0.24080876879047888, "learning_rate": 1.8497345225652604e-09, "loss": 0.002, "step": 10690 }, { "epoch": 4.863967242948135, "grad_norm": 0.740697369049907, "learning_rate": 1.837471864918727e-09, "loss": 0.0084, "step": 10691 }, { "epoch": 4.86442220200182, "grad_norm": 0.8532616249360094, "learning_rate": 1.8252499148642686e-09, "loss": 0.0036, "step": 10692 }, { "epoch": 4.864877161055505, "grad_norm": 0.5534315571690128, "learning_rate": 1.8130686734006972e-09, "loss": 0.0122, "step": 10693 }, { "epoch": 4.865332120109191, "grad_norm": 0.692159899247698, "learning_rate": 1.8009281415233278e-09, "loss": 0.0089, "step": 10694 }, { "epoch": 4.8657870791628755, "grad_norm": 0.2714738075621584, "learning_rate": 1.7888283202243116e-09, "loss": 0.0017, "step": 10695 }, { "epoch": 4.86624203821656, "grad_norm": 0.38708679716941835, "learning_rate": 1.7767692104923016e-09, "loss": 0.006, "step": 10696 }, { "epoch": 4.866696997270246, "grad_norm": 0.6664256031259425, "learning_rate": 1.764750813312732e-09, "loss": 0.0068, "step": 10697 }, { "epoch": 4.867151956323931, "grad_norm": 0.42610233215703375, "learning_rate": 1.7527731296677617e-09, "loss": 0.0055, "step": 10698 }, { "epoch": 4.867606915377616, "grad_norm": 0.4336743302046774, "learning_rate": 1.7408361605361078e-09, "loss": 0.0056, "step": 10699 }, { "epoch": 4.868061874431302, "grad_norm": 0.41293277723271476, "learning_rate": 1.728939906893212e-09, "loss": 0.0023, "step": 10700 }, { "epoch": 4.868516833484986, "grad_norm": 0.48379195383137286, "learning_rate": 1.7170843697111304e-09, "loss": 0.0035, "step": 10701 }, { "epoch": 4.868971792538671, "grad_norm": 0.4698588130524145, "learning_rate": 1.7052695499586988e-09, "loss": 0.0056, "step": 10702 }, { "epoch": 4.869426751592357, "grad_norm": 0.4532061895411687, "learning_rate": 1.6934954486013674e-09, "loss": 0.0079, "step": 10703 }, { "epoch": 4.869881710646042, "grad_norm": 0.5487344360809481, "learning_rate": 1.6817620666012556e-09, "loss": 0.0047, "step": 10704 }, { "epoch": 4.870336669699727, "grad_norm": 0.47776082100741807, "learning_rate": 1.670069404917207e-09, "loss": 0.0088, "step": 10705 }, { "epoch": 4.8707916287534125, "grad_norm": 0.698932607706504, "learning_rate": 1.658417464504569e-09, "loss": 0.0149, "step": 10706 }, { "epoch": 4.871246587807097, "grad_norm": 0.4421521795132051, "learning_rate": 1.646806246315635e-09, "loss": 0.0024, "step": 10707 }, { "epoch": 4.871701546860782, "grad_norm": 0.5778202960308882, "learning_rate": 1.635235751299091e-09, "loss": 0.0104, "step": 10708 }, { "epoch": 4.872156505914468, "grad_norm": 0.4060380712692299, "learning_rate": 1.6237059804005137e-09, "loss": 0.0057, "step": 10709 }, { "epoch": 4.872611464968153, "grad_norm": 0.8444288067052296, "learning_rate": 1.6122169345620384e-09, "loss": 0.0074, "step": 10710 }, { "epoch": 4.873066424021838, "grad_norm": 0.38653372904621297, "learning_rate": 1.6007686147225251e-09, "loss": 0.0041, "step": 10711 }, { "epoch": 4.8735213830755235, "grad_norm": 0.795195451715857, "learning_rate": 1.5893610218173925e-09, "loss": 0.0074, "step": 10712 }, { "epoch": 4.873976342129208, "grad_norm": 0.5421766317643901, "learning_rate": 1.5779941567789502e-09, "loss": 0.0057, "step": 10713 }, { "epoch": 4.874431301182893, "grad_norm": 0.6757780344045788, "learning_rate": 1.5666680205358995e-09, "loss": 0.0066, "step": 10714 }, { "epoch": 4.874886260236579, "grad_norm": 0.4243831210989044, "learning_rate": 1.5553826140138339e-09, "loss": 0.0028, "step": 10715 }, { "epoch": 4.875341219290264, "grad_norm": 0.5500858564812107, "learning_rate": 1.5441379381349595e-09, "loss": 0.0046, "step": 10716 }, { "epoch": 4.875796178343949, "grad_norm": 0.5743014342797624, "learning_rate": 1.532933993818153e-09, "loss": 0.0088, "step": 10717 }, { "epoch": 4.8762511373976345, "grad_norm": 0.4255630384339553, "learning_rate": 1.5217707819789039e-09, "loss": 0.0041, "step": 10718 }, { "epoch": 4.876706096451319, "grad_norm": 0.1886128253842411, "learning_rate": 1.510648303529427e-09, "loss": 0.0008, "step": 10719 }, { "epoch": 4.877161055505004, "grad_norm": 0.24886804305716095, "learning_rate": 1.4995665593786623e-09, "loss": 0.0025, "step": 10720 }, { "epoch": 4.87761601455869, "grad_norm": 0.6101677572153525, "learning_rate": 1.4885255504320516e-09, "loss": 0.0033, "step": 10721 }, { "epoch": 4.878070973612375, "grad_norm": 0.5958505361948463, "learning_rate": 1.4775252775918734e-09, "loss": 0.0102, "step": 10722 }, { "epoch": 4.87852593266606, "grad_norm": 0.8215294809926676, "learning_rate": 1.4665657417570754e-09, "loss": 0.0149, "step": 10723 }, { "epoch": 4.8789808917197455, "grad_norm": 0.6565068536866185, "learning_rate": 1.4556469438231079e-09, "loss": 0.0155, "step": 10724 }, { "epoch": 4.87943585077343, "grad_norm": 0.3895979223630302, "learning_rate": 1.4447688846823125e-09, "loss": 0.0038, "step": 10725 }, { "epoch": 4.879890809827115, "grad_norm": 0.6832379566026235, "learning_rate": 1.433931565223534e-09, "loss": 0.0043, "step": 10726 }, { "epoch": 4.880345768880801, "grad_norm": 0.7415596819047479, "learning_rate": 1.423134986332397e-09, "loss": 0.0062, "step": 10727 }, { "epoch": 4.880800727934486, "grad_norm": 0.518310664676903, "learning_rate": 1.4123791488910297e-09, "loss": 0.002, "step": 10728 }, { "epoch": 4.881255686988171, "grad_norm": 0.6798220037829125, "learning_rate": 1.4016640537785062e-09, "loss": 0.0123, "step": 10729 }, { "epoch": 4.8817106460418564, "grad_norm": 0.8332908831974488, "learning_rate": 1.390989701870293e-09, "loss": 0.0091, "step": 10730 }, { "epoch": 4.882165605095541, "grad_norm": 0.3568771518752399, "learning_rate": 1.3803560940387481e-09, "loss": 0.0018, "step": 10731 }, { "epoch": 4.882620564149226, "grad_norm": 0.4734726351500124, "learning_rate": 1.3697632311527874e-09, "loss": 0.006, "step": 10732 }, { "epoch": 4.883075523202912, "grad_norm": 0.6765365856495201, "learning_rate": 1.3592111140778851e-09, "loss": 0.01, "step": 10733 }, { "epoch": 4.883530482256597, "grad_norm": 0.5865753836629187, "learning_rate": 1.348699743676518e-09, "loss": 0.0072, "step": 10734 }, { "epoch": 4.883985441310282, "grad_norm": 0.41540905113099647, "learning_rate": 1.3382291208074437e-09, "loss": 0.0026, "step": 10735 }, { "epoch": 4.884440400363967, "grad_norm": 0.93668986286618, "learning_rate": 1.3277992463263665e-09, "loss": 0.0039, "step": 10736 }, { "epoch": 4.884895359417652, "grad_norm": 0.7250838108113946, "learning_rate": 1.3174101210855493e-09, "loss": 0.0143, "step": 10737 }, { "epoch": 4.885350318471337, "grad_norm": 0.4515727688498362, "learning_rate": 1.307061745933924e-09, "loss": 0.0054, "step": 10738 }, { "epoch": 4.885805277525023, "grad_norm": 0.5965178976260512, "learning_rate": 1.2967541217172028e-09, "loss": 0.0024, "step": 10739 }, { "epoch": 4.886260236578708, "grad_norm": 0.6257815582335707, "learning_rate": 1.2864872492775457e-09, "loss": 0.0085, "step": 10740 }, { "epoch": 4.886715195632393, "grad_norm": 0.4351607218532064, "learning_rate": 1.2762611294540037e-09, "loss": 0.0055, "step": 10741 }, { "epoch": 4.887170154686078, "grad_norm": 0.6604406211304328, "learning_rate": 1.2660757630821862e-09, "loss": 0.0059, "step": 10742 }, { "epoch": 4.887625113739763, "grad_norm": 0.3456199041424559, "learning_rate": 1.2559311509943716e-09, "loss": 0.0014, "step": 10743 }, { "epoch": 4.888080072793448, "grad_norm": 0.5877036060879721, "learning_rate": 1.245827294019619e-09, "loss": 0.0065, "step": 10744 }, { "epoch": 4.888535031847134, "grad_norm": 0.3472822905732674, "learning_rate": 1.235764192983435e-09, "loss": 0.0024, "step": 10745 }, { "epoch": 4.888989990900819, "grad_norm": 0.5536124672002201, "learning_rate": 1.2257418487082726e-09, "loss": 0.0047, "step": 10746 }, { "epoch": 4.889444949954504, "grad_norm": 0.5556080495590487, "learning_rate": 1.215760262012977e-09, "loss": 0.0062, "step": 10747 }, { "epoch": 4.889899909008189, "grad_norm": 0.4142064589887706, "learning_rate": 1.2058194337132843e-09, "loss": 0.0026, "step": 10748 }, { "epoch": 4.890354868061874, "grad_norm": 0.6697445411596161, "learning_rate": 1.1959193646214893e-09, "loss": 0.011, "step": 10749 }, { "epoch": 4.890809827115559, "grad_norm": 0.44316793724270787, "learning_rate": 1.1860600555465562e-09, "loss": 0.0078, "step": 10750 }, { "epoch": 4.891264786169245, "grad_norm": 2.0580039305357962, "learning_rate": 1.176241507294229e-09, "loss": 0.0259, "step": 10751 }, { "epoch": 4.89171974522293, "grad_norm": 0.8067214272165604, "learning_rate": 1.1664637206667549e-09, "loss": 0.012, "step": 10752 }, { "epoch": 4.892174704276615, "grad_norm": 0.8478425892792404, "learning_rate": 1.1567266964631616e-09, "loss": 0.0124, "step": 10753 }, { "epoch": 4.8926296633303, "grad_norm": 0.6412096020539942, "learning_rate": 1.1470304354790905e-09, "loss": 0.0146, "step": 10754 }, { "epoch": 4.893084622383985, "grad_norm": 0.49262348011474894, "learning_rate": 1.1373749385069076e-09, "loss": 0.0041, "step": 10755 }, { "epoch": 4.893539581437671, "grad_norm": 0.41036026590364666, "learning_rate": 1.127760206335593e-09, "loss": 0.0042, "step": 10756 }, { "epoch": 4.893994540491356, "grad_norm": 0.6066929776713368, "learning_rate": 1.1181862397507958e-09, "loss": 0.0047, "step": 10757 }, { "epoch": 4.894449499545041, "grad_norm": 0.5421445890875027, "learning_rate": 1.1086530395349458e-09, "loss": 0.0048, "step": 10758 }, { "epoch": 4.8949044585987265, "grad_norm": 0.4074044748813261, "learning_rate": 1.0991606064669755e-09, "loss": 0.0024, "step": 10759 }, { "epoch": 4.895359417652411, "grad_norm": 0.6451265310187658, "learning_rate": 1.0897089413225425e-09, "loss": 0.0067, "step": 10760 }, { "epoch": 4.895814376706096, "grad_norm": 0.6205997255006956, "learning_rate": 1.0802980448740838e-09, "loss": 0.0055, "step": 10761 }, { "epoch": 4.896269335759782, "grad_norm": 0.5731788010226537, "learning_rate": 1.0709279178905405e-09, "loss": 0.004, "step": 10762 }, { "epoch": 4.896724294813467, "grad_norm": 0.7348126995180025, "learning_rate": 1.0615985611376888e-09, "loss": 0.0137, "step": 10763 }, { "epoch": 4.897179253867152, "grad_norm": 0.4338256448475118, "learning_rate": 1.052309975377752e-09, "loss": 0.0046, "step": 10764 }, { "epoch": 4.8976342129208374, "grad_norm": 0.42679792610674, "learning_rate": 1.0430621613697898e-09, "loss": 0.0033, "step": 10765 }, { "epoch": 4.898089171974522, "grad_norm": 0.6590113331219116, "learning_rate": 1.0338551198695867e-09, "loss": 0.0084, "step": 10766 }, { "epoch": 4.898544131028207, "grad_norm": 0.7287310197726007, "learning_rate": 1.024688851629374e-09, "loss": 0.0139, "step": 10767 }, { "epoch": 4.898999090081893, "grad_norm": 0.6953521404423153, "learning_rate": 1.0155633573982746e-09, "loss": 0.011, "step": 10768 }, { "epoch": 4.899454049135578, "grad_norm": 0.4541543758926211, "learning_rate": 1.0064786379219703e-09, "loss": 0.0091, "step": 10769 }, { "epoch": 4.899909008189263, "grad_norm": 0.7193616123045228, "learning_rate": 9.974346939427557e-10, "loss": 0.0019, "step": 10770 }, { "epoch": 4.900363967242948, "grad_norm": 0.5149302766927956, "learning_rate": 9.884315261997067e-10, "loss": 0.011, "step": 10771 }, { "epoch": 4.900818926296633, "grad_norm": 0.4449733059929503, "learning_rate": 9.794691354285123e-10, "loss": 0.0056, "step": 10772 }, { "epoch": 4.901273885350318, "grad_norm": 0.4915086124644911, "learning_rate": 9.705475223615312e-10, "loss": 0.0102, "step": 10773 }, { "epoch": 4.901728844404004, "grad_norm": 0.37564044112736583, "learning_rate": 9.616666877278467e-10, "loss": 0.0021, "step": 10774 }, { "epoch": 4.902183803457689, "grad_norm": 0.7597290949358203, "learning_rate": 9.528266322531008e-10, "loss": 0.002, "step": 10775 }, { "epoch": 4.902638762511374, "grad_norm": 0.5790236042392625, "learning_rate": 9.440273566597156e-10, "loss": 0.0073, "step": 10776 }, { "epoch": 4.903093721565059, "grad_norm": 0.36136149592818123, "learning_rate": 9.35268861666616e-10, "loss": 0.0013, "step": 10777 }, { "epoch": 4.903548680618744, "grad_norm": 0.2728363380007821, "learning_rate": 9.265511479896737e-10, "loss": 0.0011, "step": 10778 }, { "epoch": 4.904003639672429, "grad_norm": 0.4473988036749572, "learning_rate": 9.178742163411524e-10, "loss": 0.0033, "step": 10779 }, { "epoch": 4.904458598726115, "grad_norm": 0.6646178681477112, "learning_rate": 9.092380674300959e-10, "loss": 0.008, "step": 10780 }, { "epoch": 4.9049135577798, "grad_norm": 0.5662800546837327, "learning_rate": 9.006427019622176e-10, "loss": 0.0105, "step": 10781 }, { "epoch": 4.905368516833485, "grad_norm": 0.7755797072926629, "learning_rate": 8.920881206399555e-10, "loss": 0.0051, "step": 10782 }, { "epoch": 4.90582347588717, "grad_norm": 0.7684326551540268, "learning_rate": 8.835743241622506e-10, "loss": 0.0085, "step": 10783 }, { "epoch": 4.906278434940855, "grad_norm": 0.6911136084914583, "learning_rate": 8.751013132249352e-10, "loss": 0.0093, "step": 10784 }, { "epoch": 4.90673339399454, "grad_norm": 0.4621880350891757, "learning_rate": 8.666690885202333e-10, "loss": 0.0044, "step": 10785 }, { "epoch": 4.907188353048226, "grad_norm": 0.6360590854884562, "learning_rate": 8.582776507373157e-10, "loss": 0.0103, "step": 10786 }, { "epoch": 4.907643312101911, "grad_norm": 0.32410617601102754, "learning_rate": 8.499270005618009e-10, "loss": 0.0047, "step": 10787 }, { "epoch": 4.9080982711555965, "grad_norm": 0.737232135422596, "learning_rate": 8.416171386761428e-10, "loss": 0.0136, "step": 10788 }, { "epoch": 4.908553230209281, "grad_norm": 0.6735185854602812, "learning_rate": 8.333480657593539e-10, "loss": 0.0126, "step": 10789 }, { "epoch": 4.909008189262966, "grad_norm": 0.5188768205651552, "learning_rate": 8.251197824871159e-10, "loss": 0.0072, "step": 10790 }, { "epoch": 4.909463148316652, "grad_norm": 0.4018837977375292, "learning_rate": 8.169322895318354e-10, "loss": 0.0059, "step": 10791 }, { "epoch": 4.909918107370337, "grad_norm": 0.6514769663276299, "learning_rate": 8.087855875625882e-10, "loss": 0.0145, "step": 10792 }, { "epoch": 4.910373066424022, "grad_norm": 0.5023071332206993, "learning_rate": 8.006796772450641e-10, "loss": 0.0057, "step": 10793 }, { "epoch": 4.9108280254777075, "grad_norm": 0.42008130795597476, "learning_rate": 7.926145592416223e-10, "loss": 0.0032, "step": 10794 }, { "epoch": 4.911282984531392, "grad_norm": 0.6936283563448391, "learning_rate": 7.845902342112909e-10, "loss": 0.0023, "step": 10795 }, { "epoch": 4.911737943585077, "grad_norm": 0.7276180481867962, "learning_rate": 7.766067028098233e-10, "loss": 0.0048, "step": 10796 }, { "epoch": 4.912192902638763, "grad_norm": 0.33129101643365627, "learning_rate": 7.686639656895866e-10, "loss": 0.002, "step": 10797 }, { "epoch": 4.912647861692448, "grad_norm": 0.5596325203855034, "learning_rate": 7.607620234996171e-10, "loss": 0.0093, "step": 10798 }, { "epoch": 4.913102820746133, "grad_norm": 0.5844577810946708, "learning_rate": 7.529008768856205e-10, "loss": 0.0103, "step": 10799 }, { "epoch": 4.9135577797998184, "grad_norm": 0.5205442113681366, "learning_rate": 7.450805264900273e-10, "loss": 0.0107, "step": 10800 }, { "epoch": 4.914012738853503, "grad_norm": 0.6629229799251952, "learning_rate": 7.37300972951771e-10, "loss": 0.0033, "step": 10801 }, { "epoch": 4.914467697907188, "grad_norm": 0.5059018922494513, "learning_rate": 7.295622169066762e-10, "loss": 0.0032, "step": 10802 }, { "epoch": 4.914922656960874, "grad_norm": 0.670633436707917, "learning_rate": 7.218642589870705e-10, "loss": 0.0038, "step": 10803 }, { "epoch": 4.915377616014559, "grad_norm": 0.5517505777644675, "learning_rate": 7.142070998220062e-10, "loss": 0.0051, "step": 10804 }, { "epoch": 4.915832575068244, "grad_norm": 0.9046251396486003, "learning_rate": 7.065907400371495e-10, "loss": 0.0102, "step": 10805 }, { "epoch": 4.916287534121929, "grad_norm": 0.6416914829211234, "learning_rate": 6.990151802549471e-10, "loss": 0.0114, "step": 10806 }, { "epoch": 4.916742493175614, "grad_norm": 0.14081180762903706, "learning_rate": 6.914804210943481e-10, "loss": 0.0008, "step": 10807 }, { "epoch": 4.917197452229299, "grad_norm": 0.4686985351984233, "learning_rate": 6.839864631711378e-10, "loss": 0.0057, "step": 10808 }, { "epoch": 4.917652411282985, "grad_norm": 0.4224643782809796, "learning_rate": 6.765333070976598e-10, "loss": 0.0017, "step": 10809 }, { "epoch": 4.91810737033667, "grad_norm": 0.5754095163641789, "learning_rate": 6.691209534829823e-10, "loss": 0.0086, "step": 10810 }, { "epoch": 4.918562329390355, "grad_norm": 0.5134260183108805, "learning_rate": 6.617494029327875e-10, "loss": 0.0043, "step": 10811 }, { "epoch": 4.91901728844404, "grad_norm": 0.8366369591987096, "learning_rate": 6.544186560493714e-10, "loss": 0.0137, "step": 10812 }, { "epoch": 4.919472247497725, "grad_norm": 0.44685374266211525, "learning_rate": 6.471287134319214e-10, "loss": 0.004, "step": 10813 }, { "epoch": 4.91992720655141, "grad_norm": 0.47896177692096187, "learning_rate": 6.398795756760167e-10, "loss": 0.0068, "step": 10814 }, { "epoch": 4.920382165605096, "grad_norm": 0.7129130711087798, "learning_rate": 6.326712433740167e-10, "loss": 0.0078, "step": 10815 }, { "epoch": 4.920837124658781, "grad_norm": 0.7311095558518663, "learning_rate": 6.255037171150612e-10, "loss": 0.0096, "step": 10816 }, { "epoch": 4.921292083712466, "grad_norm": 0.6381386032736319, "learning_rate": 6.18376997484793e-10, "loss": 0.007, "step": 10817 }, { "epoch": 4.921747042766151, "grad_norm": 1.0182757592648668, "learning_rate": 6.112910850655239e-10, "loss": 0.0047, "step": 10818 }, { "epoch": 4.922202001819836, "grad_norm": 0.42451462320893024, "learning_rate": 6.042459804363465e-10, "loss": 0.0038, "step": 10819 }, { "epoch": 4.922656960873521, "grad_norm": 0.6541650502609014, "learning_rate": 5.972416841729666e-10, "loss": 0.0075, "step": 10820 }, { "epoch": 4.923111919927207, "grad_norm": 0.24964899912653477, "learning_rate": 5.902781968476489e-10, "loss": 0.0009, "step": 10821 }, { "epoch": 4.923566878980892, "grad_norm": 0.5219280473523086, "learning_rate": 5.833555190295492e-10, "loss": 0.0082, "step": 10822 }, { "epoch": 4.924021838034577, "grad_norm": 1.200897928423323, "learning_rate": 5.76473651284215e-10, "loss": 0.0047, "step": 10823 }, { "epoch": 4.924476797088262, "grad_norm": 0.7356778705882547, "learning_rate": 5.696325941741409e-10, "loss": 0.0042, "step": 10824 }, { "epoch": 4.924931756141947, "grad_norm": 0.6948720805343682, "learning_rate": 5.628323482582687e-10, "loss": 0.0056, "step": 10825 }, { "epoch": 4.925386715195632, "grad_norm": 0.4847615739098852, "learning_rate": 5.560729140923204e-10, "loss": 0.0043, "step": 10826 }, { "epoch": 4.925841674249318, "grad_norm": 0.8142602159305974, "learning_rate": 5.493542922285765e-10, "loss": 0.009, "step": 10827 }, { "epoch": 4.926296633303003, "grad_norm": 0.6971389449973897, "learning_rate": 5.426764832160979e-10, "loss": 0.0166, "step": 10828 }, { "epoch": 4.926751592356688, "grad_norm": 0.6261243634346428, "learning_rate": 5.360394876006147e-10, "loss": 0.0107, "step": 10829 }, { "epoch": 4.927206551410373, "grad_norm": 0.8125179949919439, "learning_rate": 5.294433059244152e-10, "loss": 0.0095, "step": 10830 }, { "epoch": 4.927661510464058, "grad_norm": 0.6462137096861268, "learning_rate": 5.228879387265128e-10, "loss": 0.0125, "step": 10831 }, { "epoch": 4.928116469517743, "grad_norm": 0.7070230179615409, "learning_rate": 5.163733865425901e-10, "loss": 0.0102, "step": 10832 }, { "epoch": 4.928571428571429, "grad_norm": 0.6064317613121332, "learning_rate": 5.09899649904999e-10, "loss": 0.0079, "step": 10833 }, { "epoch": 4.929026387625114, "grad_norm": 0.7596283796770223, "learning_rate": 5.034667293427053e-10, "loss": 0.0059, "step": 10834 }, { "epoch": 4.929481346678799, "grad_norm": 0.9280553290857447, "learning_rate": 4.970746253813996e-10, "loss": 0.0064, "step": 10835 }, { "epoch": 4.929936305732484, "grad_norm": 0.44863393649018163, "learning_rate": 4.907233385434417e-10, "loss": 0.0048, "step": 10836 }, { "epoch": 4.930391264786169, "grad_norm": 0.3718848954458007, "learning_rate": 4.8441286934775e-10, "loss": 0.0032, "step": 10837 }, { "epoch": 4.930846223839854, "grad_norm": 0.5871442848756296, "learning_rate": 4.781432183101342e-10, "loss": 0.0055, "step": 10838 }, { "epoch": 4.93130118289354, "grad_norm": 2.363613828946477, "learning_rate": 4.719143859427954e-10, "loss": 0.0111, "step": 10839 }, { "epoch": 4.931756141947225, "grad_norm": 0.3885252833600034, "learning_rate": 4.657263727547711e-10, "loss": 0.0059, "step": 10840 }, { "epoch": 4.9322111010009095, "grad_norm": 0.9269054728641991, "learning_rate": 4.595791792516568e-10, "loss": 0.0078, "step": 10841 }, { "epoch": 4.932666060054595, "grad_norm": 0.5456473886624938, "learning_rate": 4.534728059358839e-10, "loss": 0.0085, "step": 10842 }, { "epoch": 4.93312101910828, "grad_norm": 0.7551417714134783, "learning_rate": 4.474072533063866e-10, "loss": 0.0165, "step": 10843 }, { "epoch": 4.933575978161965, "grad_norm": 0.8270888228070771, "learning_rate": 4.4138252185876855e-10, "loss": 0.0162, "step": 10844 }, { "epoch": 4.934030937215651, "grad_norm": 0.5169522981185909, "learning_rate": 4.353986120854136e-10, "loss": 0.0048, "step": 10845 }, { "epoch": 4.934485896269336, "grad_norm": 0.6073269161822947, "learning_rate": 4.29455524475264e-10, "loss": 0.0052, "step": 10846 }, { "epoch": 4.9349408553230205, "grad_norm": 0.4129618817433863, "learning_rate": 4.2355325951398677e-10, "loss": 0.0044, "step": 10847 }, { "epoch": 4.935395814376706, "grad_norm": 0.7174050889874525, "learning_rate": 4.176918176838629e-10, "loss": 0.0124, "step": 10848 }, { "epoch": 4.935850773430391, "grad_norm": 0.7567158235205034, "learning_rate": 4.1187119946384243e-10, "loss": 0.011, "step": 10849 }, { "epoch": 4.936305732484076, "grad_norm": 0.454108887270079, "learning_rate": 4.060914053296005e-10, "loss": 0.011, "step": 10850 }, { "epoch": 4.936760691537762, "grad_norm": 0.8730358382609744, "learning_rate": 4.0035243575342604e-10, "loss": 0.0103, "step": 10851 }, { "epoch": 4.937215650591447, "grad_norm": 0.5695504671429416, "learning_rate": 3.9465429120427717e-10, "loss": 0.0067, "step": 10852 }, { "epoch": 4.9376706096451315, "grad_norm": 0.6607173094197044, "learning_rate": 3.88996972147837e-10, "loss": 0.0069, "step": 10853 }, { "epoch": 4.938125568698817, "grad_norm": 0.6859059370178331, "learning_rate": 3.8338047904623583e-10, "loss": 0.009, "step": 10854 }, { "epoch": 4.938580527752502, "grad_norm": 0.5696752997901859, "learning_rate": 3.778048123586064e-10, "loss": 0.0065, "step": 10855 }, { "epoch": 4.939035486806187, "grad_norm": 0.4278206719650412, "learning_rate": 3.722699725404732e-10, "loss": 0.0055, "step": 10856 }, { "epoch": 4.939490445859873, "grad_norm": 0.5809435111316815, "learning_rate": 3.6677596004414114e-10, "loss": 0.0076, "step": 10857 }, { "epoch": 4.939945404913558, "grad_norm": 0.5405284660410116, "learning_rate": 3.6132277531852885e-10, "loss": 0.0064, "step": 10858 }, { "epoch": 4.9404003639672425, "grad_norm": 0.8127751576529574, "learning_rate": 3.5591041880933536e-10, "loss": 0.0061, "step": 10859 }, { "epoch": 4.940855323020928, "grad_norm": 0.6162406061299031, "learning_rate": 3.50538890958707e-10, "loss": 0.008, "step": 10860 }, { "epoch": 4.941310282074613, "grad_norm": 0.7315822123410854, "learning_rate": 3.4520819220568154e-10, "loss": 0.018, "step": 10861 }, { "epoch": 4.941765241128298, "grad_norm": 0.7278468081016687, "learning_rate": 3.3991832298579937e-10, "loss": 0.0062, "step": 10862 }, { "epoch": 4.942220200181984, "grad_norm": 0.35129673448541837, "learning_rate": 3.3466928373132586e-10, "loss": 0.0028, "step": 10863 }, { "epoch": 4.942675159235669, "grad_norm": 0.7158138976493837, "learning_rate": 3.294610748712512e-10, "loss": 0.0147, "step": 10864 }, { "epoch": 4.943130118289354, "grad_norm": 0.6837318692442774, "learning_rate": 3.242936968311238e-10, "loss": 0.0118, "step": 10865 }, { "epoch": 4.943585077343039, "grad_norm": 0.4361518995430752, "learning_rate": 3.1916715003316164e-10, "loss": 0.0045, "step": 10866 }, { "epoch": 4.944040036396724, "grad_norm": 0.45344726294662413, "learning_rate": 3.1408143489636275e-10, "loss": 0.0052, "step": 10867 }, { "epoch": 4.94449499545041, "grad_norm": 0.6743199144125543, "learning_rate": 3.0903655183622814e-10, "loss": 0.0069, "step": 10868 }, { "epoch": 4.944949954504095, "grad_norm": 0.4005440009983241, "learning_rate": 3.040325012650391e-10, "loss": 0.0037, "step": 10869 }, { "epoch": 4.94540491355778, "grad_norm": 0.5104856673193006, "learning_rate": 2.990692835917463e-10, "loss": 0.0027, "step": 10870 }, { "epoch": 4.945859872611465, "grad_norm": 0.59805391823228, "learning_rate": 2.9414689922185875e-10, "loss": 0.0044, "step": 10871 }, { "epoch": 4.94631483166515, "grad_norm": 0.6304751563416021, "learning_rate": 2.8926534855766573e-10, "loss": 0.0079, "step": 10872 }, { "epoch": 4.946769790718835, "grad_norm": 0.3319463489938069, "learning_rate": 2.844246319980148e-10, "loss": 0.0044, "step": 10873 }, { "epoch": 4.947224749772521, "grad_norm": 0.44098101177701526, "learning_rate": 2.796247499384785e-10, "loss": 0.0017, "step": 10874 }, { "epoch": 4.947679708826206, "grad_norm": 0.6857701949276331, "learning_rate": 2.7486570277129863e-10, "loss": 0.0059, "step": 10875 }, { "epoch": 4.9481346678798905, "grad_norm": 0.669304417380285, "learning_rate": 2.701474908853307e-10, "loss": 0.006, "step": 10876 }, { "epoch": 4.948589626933576, "grad_norm": 0.43226195748015805, "learning_rate": 2.654701146662108e-10, "loss": 0.0113, "step": 10877 }, { "epoch": 4.949044585987261, "grad_norm": 0.6755759408201963, "learning_rate": 2.608335744960222e-10, "loss": 0.0096, "step": 10878 }, { "epoch": 4.949499545040946, "grad_norm": 0.5032200439333012, "learning_rate": 2.562378707537394e-10, "loss": 0.007, "step": 10879 }, { "epoch": 4.949954504094632, "grad_norm": 0.6955100733558593, "learning_rate": 2.516830038148954e-10, "loss": 0.0142, "step": 10880 }, { "epoch": 4.950409463148317, "grad_norm": 0.4596167144264365, "learning_rate": 2.4716897405163693e-10, "loss": 0.0056, "step": 10881 }, { "epoch": 4.9508644222020015, "grad_norm": 0.5124358296051028, "learning_rate": 2.4269578183289117e-10, "loss": 0.0046, "step": 10882 }, { "epoch": 4.951319381255687, "grad_norm": 0.4968242255020857, "learning_rate": 2.382634275241435e-10, "loss": 0.0037, "step": 10883 }, { "epoch": 4.951774340309372, "grad_norm": 0.7759295686932595, "learning_rate": 2.3387191148765974e-10, "loss": 0.012, "step": 10884 }, { "epoch": 4.952229299363057, "grad_norm": 0.3683939401647621, "learning_rate": 2.2952123408215284e-10, "loss": 0.0028, "step": 10885 }, { "epoch": 4.952684258416743, "grad_norm": 0.6922461995527178, "learning_rate": 2.2521139566328284e-10, "loss": 0.016, "step": 10886 }, { "epoch": 4.953139217470428, "grad_norm": 1.3928994617124577, "learning_rate": 2.2094239658315693e-10, "loss": 0.0197, "step": 10887 }, { "epoch": 4.9535941765241125, "grad_norm": 0.3911693130832513, "learning_rate": 2.1671423719066273e-10, "loss": 0.0061, "step": 10888 }, { "epoch": 4.954049135577798, "grad_norm": 4.561552466806049, "learning_rate": 2.125269178312461e-10, "loss": 0.0221, "step": 10889 }, { "epoch": 4.954504094631483, "grad_norm": 0.49035028722252455, "learning_rate": 2.083804388471333e-10, "loss": 0.0109, "step": 10890 }, { "epoch": 4.954959053685168, "grad_norm": 0.41881755598922926, "learning_rate": 2.0427480057710888e-10, "loss": 0.0036, "step": 10891 }, { "epoch": 4.955414012738854, "grad_norm": 0.7543793740126901, "learning_rate": 2.0021000335673778e-10, "loss": 0.0057, "step": 10892 }, { "epoch": 4.955868971792539, "grad_norm": 0.6582735820440556, "learning_rate": 1.9618604751808764e-10, "loss": 0.006, "step": 10893 }, { "epoch": 4.9563239308462235, "grad_norm": 0.6372839765518894, "learning_rate": 1.9220293339000658e-10, "loss": 0.0108, "step": 10894 }, { "epoch": 4.956778889899909, "grad_norm": 0.5547023867932366, "learning_rate": 1.88260661298012e-10, "loss": 0.0052, "step": 10895 }, { "epoch": 4.957233848953594, "grad_norm": 0.4622400794704903, "learning_rate": 1.8435923156423504e-10, "loss": 0.004, "step": 10896 }, { "epoch": 4.957688808007279, "grad_norm": 0.46434283534076143, "learning_rate": 1.804986445074208e-10, "loss": 0.0084, "step": 10897 }, { "epoch": 4.958143767060965, "grad_norm": 0.35735011022668284, "learning_rate": 1.7667890044315014e-10, "loss": 0.005, "step": 10898 }, { "epoch": 4.95859872611465, "grad_norm": 0.44474060014176237, "learning_rate": 1.7289999968350676e-10, "loss": 0.0029, "step": 10899 }, { "epoch": 4.959053685168335, "grad_norm": 0.6094146275037043, "learning_rate": 1.6916194253724368e-10, "loss": 0.0121, "step": 10900 }, { "epoch": 4.95950864422202, "grad_norm": 0.4573786098735274, "learning_rate": 1.654647293098388e-10, "loss": 0.0032, "step": 10901 }, { "epoch": 4.959963603275705, "grad_norm": 0.9371302599118722, "learning_rate": 1.6180836030343926e-10, "loss": 0.0041, "step": 10902 }, { "epoch": 4.960418562329391, "grad_norm": 0.5081594056574731, "learning_rate": 1.581928358168061e-10, "loss": 0.0089, "step": 10903 }, { "epoch": 4.960873521383076, "grad_norm": 0.4165142082126769, "learning_rate": 1.5461815614542518e-10, "loss": 0.0055, "step": 10904 }, { "epoch": 4.961328480436761, "grad_norm": 0.33796195227982384, "learning_rate": 1.5108432158134066e-10, "loss": 0.0015, "step": 10905 }, { "epoch": 4.961783439490446, "grad_norm": 0.4861409184941939, "learning_rate": 1.4759133241332155e-10, "loss": 0.0079, "step": 10906 }, { "epoch": 4.962238398544131, "grad_norm": 0.6531165823380607, "learning_rate": 1.4413918892686172e-10, "loss": 0.0066, "step": 10907 }, { "epoch": 4.962693357597816, "grad_norm": 0.7118193199659779, "learning_rate": 1.407278914040133e-10, "loss": 0.01, "step": 10908 }, { "epoch": 4.963148316651502, "grad_norm": 0.3678270897987698, "learning_rate": 1.373574401234978e-10, "loss": 0.0028, "step": 10909 }, { "epoch": 4.963603275705187, "grad_norm": 0.5539193647949295, "learning_rate": 1.3402783536081709e-10, "loss": 0.0059, "step": 10910 }, { "epoch": 4.9640582347588715, "grad_norm": 0.3873138669316014, "learning_rate": 1.3073907738797574e-10, "loss": 0.0021, "step": 10911 }, { "epoch": 4.964513193812557, "grad_norm": 0.6695814091010318, "learning_rate": 1.2749116647375878e-10, "loss": 0.004, "step": 10912 }, { "epoch": 4.964968152866242, "grad_norm": 0.4272890563259492, "learning_rate": 1.2428410288356505e-10, "loss": 0.0042, "step": 10913 }, { "epoch": 4.965423111919927, "grad_norm": 0.36391421334731744, "learning_rate": 1.2111788687946267e-10, "loss": 0.0057, "step": 10914 }, { "epoch": 4.965878070973613, "grad_norm": 0.7007538206741104, "learning_rate": 1.179925187201891e-10, "loss": 0.0136, "step": 10915 }, { "epoch": 4.966333030027298, "grad_norm": 0.7656147022225661, "learning_rate": 1.1490799866109568e-10, "loss": 0.0243, "step": 10916 }, { "epoch": 4.9667879890809825, "grad_norm": 0.93160298099921, "learning_rate": 1.1186432695425851e-10, "loss": 0.0081, "step": 10917 }, { "epoch": 4.967242948134668, "grad_norm": 0.732943462909258, "learning_rate": 1.0886150384836756e-10, "loss": 0.0128, "step": 10918 }, { "epoch": 4.967697907188353, "grad_norm": 0.46193533528820135, "learning_rate": 1.0589952958883763e-10, "loss": 0.0039, "step": 10919 }, { "epoch": 4.968152866242038, "grad_norm": 0.7261658315653726, "learning_rate": 1.0297840441775285e-10, "loss": 0.0073, "step": 10920 }, { "epoch": 4.968607825295724, "grad_norm": 0.7695073530494868, "learning_rate": 1.0009812857370015e-10, "loss": 0.007, "step": 10921 }, { "epoch": 4.969062784349409, "grad_norm": 0.6764615079627672, "learning_rate": 9.725870229210231e-11, "loss": 0.0109, "step": 10922 }, { "epoch": 4.9695177434030935, "grad_norm": 0.6245562776670596, "learning_rate": 9.446012580499596e-11, "loss": 0.0115, "step": 10923 }, { "epoch": 4.969972702456779, "grad_norm": 0.7853008776315, "learning_rate": 9.170239934108703e-11, "loss": 0.007, "step": 10924 }, { "epoch": 4.970427661510464, "grad_norm": 0.5095623931097317, "learning_rate": 8.898552312563978e-11, "loss": 0.0028, "step": 10925 }, { "epoch": 4.970882620564149, "grad_norm": 0.9701095294540815, "learning_rate": 8.630949738075432e-11, "loss": 0.0153, "step": 10926 }, { "epoch": 4.971337579617835, "grad_norm": 0.27917544584435444, "learning_rate": 8.367432232503357e-11, "loss": 0.0032, "step": 10927 }, { "epoch": 4.97179253867152, "grad_norm": 0.6077493068053453, "learning_rate": 8.10799981738608e-11, "loss": 0.0124, "step": 10928 }, { "epoch": 4.9722474977252045, "grad_norm": 0.4949394572259519, "learning_rate": 7.852652513923309e-11, "loss": 0.0042, "step": 10929 }, { "epoch": 4.97270245677889, "grad_norm": 0.6910842818604677, "learning_rate": 7.601390342976133e-11, "loss": 0.0075, "step": 10930 }, { "epoch": 4.973157415832575, "grad_norm": 0.5501543347310734, "learning_rate": 7.35421332508368e-11, "loss": 0.005, "step": 10931 }, { "epoch": 4.97361237488626, "grad_norm": 0.7430195820262775, "learning_rate": 7.111121480435356e-11, "loss": 0.0061, "step": 10932 }, { "epoch": 4.974067333939946, "grad_norm": 0.42064612226473663, "learning_rate": 6.872114828904151e-11, "loss": 0.0058, "step": 10933 }, { "epoch": 4.974522292993631, "grad_norm": 1.819136831486825, "learning_rate": 6.63719339001334e-11, "loss": 0.0429, "step": 10934 }, { "epoch": 4.9749772520473154, "grad_norm": 0.8859172862200253, "learning_rate": 6.40635718296978e-11, "loss": 0.0063, "step": 10935 }, { "epoch": 4.975432211101001, "grad_norm": 0.49446273732206636, "learning_rate": 6.179606226625055e-11, "loss": 0.0074, "step": 10936 }, { "epoch": 4.975887170154686, "grad_norm": 0.6786294563950138, "learning_rate": 5.956940539508792e-11, "loss": 0.0107, "step": 10937 }, { "epoch": 4.976342129208371, "grad_norm": 0.6635873675518563, "learning_rate": 5.738360139823095e-11, "loss": 0.0097, "step": 10938 }, { "epoch": 4.976797088262057, "grad_norm": 0.7309903558281022, "learning_rate": 5.5238650454314525e-11, "loss": 0.0061, "step": 10939 }, { "epoch": 4.977252047315742, "grad_norm": 0.4752486758114802, "learning_rate": 5.3134552738531846e-11, "loss": 0.0065, "step": 10940 }, { "epoch": 4.977707006369426, "grad_norm": 0.5346210085674483, "learning_rate": 5.107130842280094e-11, "loss": 0.0107, "step": 10941 }, { "epoch": 4.978161965423112, "grad_norm": 0.558283044999984, "learning_rate": 4.90489176758202e-11, "loss": 0.0065, "step": 10942 }, { "epoch": 4.978616924476797, "grad_norm": 0.6251031239396716, "learning_rate": 4.7067380662846325e-11, "loss": 0.0063, "step": 10943 }, { "epoch": 4.979071883530482, "grad_norm": 1.1383566498155329, "learning_rate": 4.5126697545694317e-11, "loss": 0.0087, "step": 10944 }, { "epoch": 4.979526842584168, "grad_norm": 0.4609148598030098, "learning_rate": 4.322686848301504e-11, "loss": 0.0024, "step": 10945 }, { "epoch": 4.9799818016378525, "grad_norm": 0.49146803716142456, "learning_rate": 4.1367893630073205e-11, "loss": 0.0102, "step": 10946 }, { "epoch": 4.980436760691537, "grad_norm": 0.8185368241417812, "learning_rate": 3.95497731387473e-11, "loss": 0.0025, "step": 10947 }, { "epoch": 4.980891719745223, "grad_norm": 0.9109839708968143, "learning_rate": 3.777250715764069e-11, "loss": 0.0104, "step": 10948 }, { "epoch": 4.981346678798908, "grad_norm": 0.25346582536101253, "learning_rate": 3.6036095831915026e-11, "loss": 0.0032, "step": 10949 }, { "epoch": 4.981801637852593, "grad_norm": 35.81460458340688, "learning_rate": 3.434053930351233e-11, "loss": 0.0327, "step": 10950 }, { "epoch": 4.982256596906279, "grad_norm": 0.7758319055245936, "learning_rate": 3.2685837710988426e-11, "loss": 0.0112, "step": 10951 }, { "epoch": 4.9827115559599635, "grad_norm": 0.451227769973383, "learning_rate": 3.107199118951298e-11, "loss": 0.0096, "step": 10952 }, { "epoch": 4.983166515013648, "grad_norm": 0.7728130506993733, "learning_rate": 2.949899987103599e-11, "loss": 0.0081, "step": 10953 }, { "epoch": 4.983621474067334, "grad_norm": 0.9059583246643468, "learning_rate": 2.796686388401026e-11, "loss": 0.0134, "step": 10954 }, { "epoch": 4.984076433121019, "grad_norm": 0.5899588989426727, "learning_rate": 2.647558335372446e-11, "loss": 0.0131, "step": 10955 }, { "epoch": 4.984531392174704, "grad_norm": 0.8115640785031968, "learning_rate": 2.502515840197006e-11, "loss": 0.0063, "step": 10956 }, { "epoch": 4.98498635122839, "grad_norm": 34.82191436278875, "learning_rate": 2.3615589147318872e-11, "loss": 0.0276, "step": 10957 }, { "epoch": 4.9854413102820745, "grad_norm": 0.8902865409433887, "learning_rate": 2.2246875704901026e-11, "loss": 0.0144, "step": 10958 }, { "epoch": 4.985896269335759, "grad_norm": 2.386090781710058, "learning_rate": 2.091901818657149e-11, "loss": 0.021, "step": 10959 }, { "epoch": 4.986351228389445, "grad_norm": 0.7690494127970008, "learning_rate": 1.963201670091008e-11, "loss": 0.0086, "step": 10960 }, { "epoch": 4.98680618744313, "grad_norm": 0.5911164374986629, "learning_rate": 1.8385871352999406e-11, "loss": 0.0107, "step": 10961 }, { "epoch": 4.987261146496815, "grad_norm": 0.6677427849722005, "learning_rate": 1.7180582244646914e-11, "loss": 0.0055, "step": 10962 }, { "epoch": 4.987716105550501, "grad_norm": 0.6571713715409135, "learning_rate": 1.601614947444041e-11, "loss": 0.0133, "step": 10963 }, { "epoch": 4.9881710646041855, "grad_norm": 0.6065806585302176, "learning_rate": 1.489257313752601e-11, "loss": 0.0059, "step": 10964 }, { "epoch": 4.98862602365787, "grad_norm": 0.8299982443709759, "learning_rate": 1.3809853325608133e-11, "loss": 0.0066, "step": 10965 }, { "epoch": 4.989080982711556, "grad_norm": 0.9881207120101995, "learning_rate": 1.276799012728258e-11, "loss": 0.009, "step": 10966 }, { "epoch": 4.989535941765241, "grad_norm": 0.47820590804618457, "learning_rate": 1.176698362759243e-11, "loss": 0.0014, "step": 10967 }, { "epoch": 4.989990900818926, "grad_norm": 0.6325346402617874, "learning_rate": 1.0806833908416634e-11, "loss": 0.0138, "step": 10968 }, { "epoch": 4.990445859872612, "grad_norm": 0.70750163288351, "learning_rate": 9.887541048136939e-12, "loss": 0.0091, "step": 10969 }, { "epoch": 4.9909008189262964, "grad_norm": 0.31704115773518077, "learning_rate": 9.00910512197095e-12, "loss": 0.0023, "step": 10970 }, { "epoch": 4.991355777979981, "grad_norm": 0.6365085880461119, "learning_rate": 8.171526201583568e-12, "loss": 0.0121, "step": 10971 }, { "epoch": 4.991810737033667, "grad_norm": 0.5530100391864079, "learning_rate": 7.37480435547555e-12, "loss": 0.0078, "step": 10972 }, { "epoch": 4.992265696087352, "grad_norm": 1.0235331447185707, "learning_rate": 6.61893964881699e-12, "loss": 0.0097, "step": 10973 }, { "epoch": 4.992720655141038, "grad_norm": 1.096886332973395, "learning_rate": 5.903932143225265e-12, "loss": 0.0121, "step": 10974 }, { "epoch": 4.9931756141947226, "grad_norm": 0.9796807141889246, "learning_rate": 5.22978189726464e-12, "loss": 0.0091, "step": 10975 }, { "epoch": 4.993630573248407, "grad_norm": 0.7156520027047488, "learning_rate": 4.5964889659466655e-12, "loss": 0.0164, "step": 10976 }, { "epoch": 4.994085532302093, "grad_norm": 0.4052963267204114, "learning_rate": 4.004053401063246e-12, "loss": 0.0035, "step": 10977 }, { "epoch": 4.994540491355778, "grad_norm": 0.23214163015687223, "learning_rate": 3.4524752509645927e-12, "loss": 0.0018, "step": 10978 }, { "epoch": 4.994995450409463, "grad_norm": 0.29979924536331237, "learning_rate": 2.9417545608367844e-12, "loss": 0.0024, "step": 10979 }, { "epoch": 4.995450409463149, "grad_norm": 0.5227839157313641, "learning_rate": 2.471891372313184e-12, "loss": 0.0043, "step": 10980 }, { "epoch": 4.9959053685168335, "grad_norm": 0.5847414349659543, "learning_rate": 2.042885723751997e-12, "loss": 0.0091, "step": 10981 }, { "epoch": 4.996360327570518, "grad_norm": 0.7089990851014353, "learning_rate": 1.6547376503472932e-12, "loss": 0.0083, "step": 10982 }, { "epoch": 4.996815286624204, "grad_norm": 0.5737833430083067, "learning_rate": 1.3074471837404288e-12, "loss": 0.0102, "step": 10983 }, { "epoch": 4.997270245677889, "grad_norm": 0.3377155477163815, "learning_rate": 1.0010143522976023e-12, "loss": 0.0023, "step": 10984 }, { "epoch": 4.997725204731574, "grad_norm": 0.812554011440206, "learning_rate": 7.354391810543426e-13, "loss": 0.0038, "step": 10985 }, { "epoch": 4.99818016378526, "grad_norm": 0.8746269876533118, "learning_rate": 5.107216917710211e-13, "loss": 0.0084, "step": 10986 }, { "epoch": 4.9986351228389445, "grad_norm": 0.8152054601113159, "learning_rate": 3.2686190276631777e-13, "loss": 0.0138, "step": 10987 }, { "epoch": 4.999090081892629, "grad_norm": 0.3559071975411582, "learning_rate": 1.8385982908375452e-13, "loss": 0.0023, "step": 10988 }, { "epoch": 4.999545040946315, "grad_norm": 0.5708973805111688, "learning_rate": 8.171548238067316e-14, "loss": 0.0052, "step": 10989 }, { "epoch": 5.0, "grad_norm": 0.39753238381467954, "learning_rate": 2.042887098374635e-14, "loss": 0.0034, "step": 10990 }, { "epoch": 5.0, "step": 10990, "total_flos": 72288269254656.0, "train_loss": 0.026851859047976715, "train_runtime": 13824.9753, "train_samples_per_second": 3.179, "train_steps_per_second": 0.795 } ], "logging_steps": 1, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 555, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 72288269254656.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }