{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045495905368516835, "grad_norm": 9.461428161462043, "learning_rate": 1e-05, "loss": 0.1263, "step": 1 }, { "epoch": 0.0009099181073703367, "grad_norm": 5.190780450250769, "learning_rate": 9.99999979571129e-06, "loss": 0.1723, "step": 2 }, { "epoch": 0.001364877161055505, "grad_norm": 7.521926017130347, "learning_rate": 9.999999182845177e-06, "loss": 0.1327, "step": 3 }, { "epoch": 0.0018198362147406734, "grad_norm": 2.5665810200307217, "learning_rate": 9.99999816140171e-06, "loss": 0.1095, "step": 4 }, { "epoch": 0.0022747952684258415, "grad_norm": 2.738508706395883, "learning_rate": 9.999996731380973e-06, "loss": 0.1151, "step": 5 }, { "epoch": 0.00272975432211101, "grad_norm": 2.67941899677245, "learning_rate": 9.999994892783083e-06, "loss": 0.0821, "step": 6 }, { "epoch": 0.0031847133757961785, "grad_norm": 2.137586234420784, "learning_rate": 9.99999264560819e-06, "loss": 0.0729, "step": 7 }, { "epoch": 0.003639672429481347, "grad_norm": 2.8221590420989164, "learning_rate": 9.999989989856477e-06, "loss": 0.0929, "step": 8 }, { "epoch": 0.004094631483166515, "grad_norm": 1.6167314639784554, "learning_rate": 9.999986925528164e-06, "loss": 0.0466, "step": 9 }, { "epoch": 0.004549590536851683, "grad_norm": 2.1773262431631313, "learning_rate": 9.999983452623498e-06, "loss": 0.0709, "step": 10 }, { "epoch": 0.005004549590536852, "grad_norm": 7.6444390817806465, "learning_rate": 9.999979571142765e-06, "loss": 0.0809, "step": 11 }, { "epoch": 0.00545950864422202, "grad_norm": 2.034523884241798, "learning_rate": 9.999975281086278e-06, "loss": 0.0839, "step": 12 }, { "epoch": 0.005914467697907188, "grad_norm": 3.576108282005355, "learning_rate": 9.999970582454392e-06, "loss": 0.0728, "step": 13 }, { "epoch": 0.006369426751592357, "grad_norm": 2.623641566468802, "learning_rate": 9.999965475247491e-06, "loss": 0.1052, "step": 14 }, { "epoch": 0.006824385805277525, "grad_norm": 2.1413574998269085, "learning_rate": 9.99995995946599e-06, "loss": 0.0885, "step": 15 }, { "epoch": 0.007279344858962694, "grad_norm": 1.4859066724415246, "learning_rate": 9.999954035110342e-06, "loss": 0.0644, "step": 16 }, { "epoch": 0.0077343039126478615, "grad_norm": 2.851793157608408, "learning_rate": 9.999947702181027e-06, "loss": 0.1057, "step": 17 }, { "epoch": 0.00818926296633303, "grad_norm": 4.693829546662477, "learning_rate": 9.999940960678568e-06, "loss": 0.0867, "step": 18 }, { "epoch": 0.008644222020018199, "grad_norm": 2.2728033563417362, "learning_rate": 9.999933810603513e-06, "loss": 0.0789, "step": 19 }, { "epoch": 0.009099181073703366, "grad_norm": 1.6705986173507794, "learning_rate": 9.999926251956447e-06, "loss": 0.0683, "step": 20 }, { "epoch": 0.009554140127388535, "grad_norm": 2.187579869114393, "learning_rate": 9.999918284737986e-06, "loss": 0.0984, "step": 21 }, { "epoch": 0.010009099181073703, "grad_norm": 2.328040268012338, "learning_rate": 9.999909908948782e-06, "loss": 0.0699, "step": 22 }, { "epoch": 0.010464058234758872, "grad_norm": 5.572389775693198, "learning_rate": 9.999901124589519e-06, "loss": 0.0912, "step": 23 }, { "epoch": 0.01091901728844404, "grad_norm": 1.84796719674859, "learning_rate": 9.999891931660916e-06, "loss": 0.1015, "step": 24 }, { "epoch": 0.011373976342129208, "grad_norm": 1.7501762990792236, "learning_rate": 9.999882330163725e-06, "loss": 0.0909, "step": 25 }, { "epoch": 0.011828935395814377, "grad_norm": 0.9922115950592263, "learning_rate": 9.999872320098729e-06, "loss": 0.0656, "step": 26 }, { "epoch": 0.012283894449499545, "grad_norm": 1.5612370560987539, "learning_rate": 9.999861901466746e-06, "loss": 0.0974, "step": 27 }, { "epoch": 0.012738853503184714, "grad_norm": 1.4617271794930395, "learning_rate": 9.999851074268625e-06, "loss": 0.0853, "step": 28 }, { "epoch": 0.013193812556869881, "grad_norm": 1.8127085104491556, "learning_rate": 9.999839838505257e-06, "loss": 0.1081, "step": 29 }, { "epoch": 0.01364877161055505, "grad_norm": 1.4710105512612208, "learning_rate": 9.999828194177555e-06, "loss": 0.0868, "step": 30 }, { "epoch": 0.014103730664240218, "grad_norm": 1.3474487189311888, "learning_rate": 9.999816141286472e-06, "loss": 0.0817, "step": 31 }, { "epoch": 0.014558689717925387, "grad_norm": 1.0967596652549403, "learning_rate": 9.99980367983299e-06, "loss": 0.0637, "step": 32 }, { "epoch": 0.015013648771610554, "grad_norm": 3.179425671823194, "learning_rate": 9.999790809818134e-06, "loss": 0.069, "step": 33 }, { "epoch": 0.015468607825295723, "grad_norm": 4.482257681577152, "learning_rate": 9.999777531242951e-06, "loss": 0.0915, "step": 34 }, { "epoch": 0.01592356687898089, "grad_norm": 3.953299040475791, "learning_rate": 9.999763844108528e-06, "loss": 0.0562, "step": 35 }, { "epoch": 0.01637852593266606, "grad_norm": 1.1127201050382067, "learning_rate": 9.999749748415982e-06, "loss": 0.0556, "step": 36 }, { "epoch": 0.01683348498635123, "grad_norm": 79.45756094624792, "learning_rate": 9.999735244166464e-06, "loss": 0.1223, "step": 37 }, { "epoch": 0.017288444040036398, "grad_norm": 2777.9092912017113, "learning_rate": 9.99972033136116e-06, "loss": 0.3211, "step": 38 }, { "epoch": 0.017743403093721567, "grad_norm": 2.5204693177238466, "learning_rate": 9.999705010001291e-06, "loss": 0.0723, "step": 39 }, { "epoch": 0.018198362147406732, "grad_norm": 2.2975907071135655, "learning_rate": 9.999689280088105e-06, "loss": 0.0696, "step": 40 }, { "epoch": 0.0186533212010919, "grad_norm": 2.998434349074003, "learning_rate": 9.99967314162289e-06, "loss": 0.083, "step": 41 }, { "epoch": 0.01910828025477707, "grad_norm": 3.882239448575704, "learning_rate": 9.999656594606966e-06, "loss": 0.1015, "step": 42 }, { "epoch": 0.019563239308462238, "grad_norm": 3.5286596480512493, "learning_rate": 9.999639639041681e-06, "loss": 0.0817, "step": 43 }, { "epoch": 0.020018198362147407, "grad_norm": 1.6933989447443707, "learning_rate": 9.999622274928424e-06, "loss": 0.1003, "step": 44 }, { "epoch": 0.020473157415832575, "grad_norm": 1.2483160046323276, "learning_rate": 9.999604502268614e-06, "loss": 0.0952, "step": 45 }, { "epoch": 0.020928116469517744, "grad_norm": 0.9417906124383243, "learning_rate": 9.9995863210637e-06, "loss": 0.0731, "step": 46 }, { "epoch": 0.021383075523202913, "grad_norm": 2.8195414757816897, "learning_rate": 9.99956773131517e-06, "loss": 0.1845, "step": 47 }, { "epoch": 0.02183803457688808, "grad_norm": 2.74390379471345, "learning_rate": 9.999548733024545e-06, "loss": 0.1826, "step": 48 }, { "epoch": 0.022292993630573247, "grad_norm": 1.5138494619527987, "learning_rate": 9.999529326193373e-06, "loss": 0.0857, "step": 49 }, { "epoch": 0.022747952684258416, "grad_norm": 1.215379974181271, "learning_rate": 9.999509510823242e-06, "loss": 0.0686, "step": 50 }, { "epoch": 0.023202911737943584, "grad_norm": 1.292187967807859, "learning_rate": 9.999489286915773e-06, "loss": 0.0707, "step": 51 }, { "epoch": 0.023657870791628753, "grad_norm": 1.7888013203563982, "learning_rate": 9.999468654472614e-06, "loss": 0.0682, "step": 52 }, { "epoch": 0.024112829845313922, "grad_norm": 0.8979425621703144, "learning_rate": 9.999447613495457e-06, "loss": 0.0508, "step": 53 }, { "epoch": 0.02456778889899909, "grad_norm": 1.9123835444775663, "learning_rate": 9.99942616398602e-06, "loss": 0.0689, "step": 54 }, { "epoch": 0.02502274795268426, "grad_norm": 0.9393581994096443, "learning_rate": 9.99940430594605e-06, "loss": 0.0496, "step": 55 }, { "epoch": 0.025477707006369428, "grad_norm": 1.0234476513644222, "learning_rate": 9.999382039377339e-06, "loss": 0.0601, "step": 56 }, { "epoch": 0.025932666060054597, "grad_norm": 0.9291387208138827, "learning_rate": 9.999359364281704e-06, "loss": 0.0377, "step": 57 }, { "epoch": 0.026387625113739762, "grad_norm": 1.8209170803663992, "learning_rate": 9.999336280660999e-06, "loss": 0.1144, "step": 58 }, { "epoch": 0.02684258416742493, "grad_norm": 1.1214625046464874, "learning_rate": 9.99931278851711e-06, "loss": 0.0622, "step": 59 }, { "epoch": 0.0272975432211101, "grad_norm": 1.0331723997917317, "learning_rate": 9.999288887851956e-06, "loss": 0.0667, "step": 60 }, { "epoch": 0.027752502274795268, "grad_norm": 1.0412381501406744, "learning_rate": 9.999264578667493e-06, "loss": 0.0566, "step": 61 }, { "epoch": 0.028207461328480437, "grad_norm": 1.4510603110658047, "learning_rate": 9.999239860965703e-06, "loss": 0.0845, "step": 62 }, { "epoch": 0.028662420382165606, "grad_norm": 1.301162540669183, "learning_rate": 9.999214734748609e-06, "loss": 0.0759, "step": 63 }, { "epoch": 0.029117379435850774, "grad_norm": 0.9977688847603402, "learning_rate": 9.999189200018263e-06, "loss": 0.0528, "step": 64 }, { "epoch": 0.029572338489535943, "grad_norm": 1.2894688842348854, "learning_rate": 9.99916325677675e-06, "loss": 0.0899, "step": 65 }, { "epoch": 0.03002729754322111, "grad_norm": 1.4627871680702638, "learning_rate": 9.999136905026194e-06, "loss": 0.1456, "step": 66 }, { "epoch": 0.030482256596906277, "grad_norm": 1.2304385710214434, "learning_rate": 9.999110144768745e-06, "loss": 0.079, "step": 67 }, { "epoch": 0.030937215650591446, "grad_norm": 1.085016380732753, "learning_rate": 9.99908297600659e-06, "loss": 0.0696, "step": 68 }, { "epoch": 0.03139217470427662, "grad_norm": 0.989450558642297, "learning_rate": 9.99905539874195e-06, "loss": 0.069, "step": 69 }, { "epoch": 0.03184713375796178, "grad_norm": 1.0510491151133208, "learning_rate": 9.99902741297708e-06, "loss": 0.0555, "step": 70 }, { "epoch": 0.03230209281164695, "grad_norm": 0.8938033562648371, "learning_rate": 9.998999018714264e-06, "loss": 0.0783, "step": 71 }, { "epoch": 0.03275705186533212, "grad_norm": 2.902512108322722, "learning_rate": 9.998970215955824e-06, "loss": 0.0702, "step": 72 }, { "epoch": 0.033212010919017286, "grad_norm": 0.7661831894133686, "learning_rate": 9.998941004704113e-06, "loss": 0.0519, "step": 73 }, { "epoch": 0.03366696997270246, "grad_norm": 1.1047249497744047, "learning_rate": 9.998911384961518e-06, "loss": 0.0773, "step": 74 }, { "epoch": 0.034121929026387623, "grad_norm": 0.7750047299312716, "learning_rate": 9.998881356730458e-06, "loss": 0.0598, "step": 75 }, { "epoch": 0.034576888080072796, "grad_norm": 0.9815801555720315, "learning_rate": 9.99885092001339e-06, "loss": 0.0661, "step": 76 }, { "epoch": 0.03503184713375796, "grad_norm": 1.3090963451351905, "learning_rate": 9.998820074812799e-06, "loss": 0.0713, "step": 77 }, { "epoch": 0.03548680618744313, "grad_norm": 1.1489338732270693, "learning_rate": 9.998788821131207e-06, "loss": 0.0946, "step": 78 }, { "epoch": 0.0359417652411283, "grad_norm": 0.9040381990998293, "learning_rate": 9.998757158971164e-06, "loss": 0.067, "step": 79 }, { "epoch": 0.036396724294813464, "grad_norm": 1.1019926198229115, "learning_rate": 9.998725088335263e-06, "loss": 0.0874, "step": 80 }, { "epoch": 0.036851683348498636, "grad_norm": 0.5779852750462403, "learning_rate": 9.99869260922612e-06, "loss": 0.0492, "step": 81 }, { "epoch": 0.0373066424021838, "grad_norm": 1.2769852710418472, "learning_rate": 9.998659721646393e-06, "loss": 0.0781, "step": 82 }, { "epoch": 0.03776160145586897, "grad_norm": 0.9020624084974485, "learning_rate": 9.998626425598766e-06, "loss": 0.0734, "step": 83 }, { "epoch": 0.03821656050955414, "grad_norm": 0.9626764462141776, "learning_rate": 9.99859272108596e-06, "loss": 0.0719, "step": 84 }, { "epoch": 0.03867151956323931, "grad_norm": 0.9435885887029873, "learning_rate": 9.998558608110733e-06, "loss": 0.0835, "step": 85 }, { "epoch": 0.039126478616924476, "grad_norm": 1.0578725525123687, "learning_rate": 9.998524086675867e-06, "loss": 0.0746, "step": 86 }, { "epoch": 0.03958143767060965, "grad_norm": 1.0366588534208079, "learning_rate": 9.998489156784188e-06, "loss": 0.0933, "step": 87 }, { "epoch": 0.040036396724294813, "grad_norm": 1.0595948680723846, "learning_rate": 9.998453818438547e-06, "loss": 0.0846, "step": 88 }, { "epoch": 0.04049135577797998, "grad_norm": 0.8807515753016749, "learning_rate": 9.998418071641833e-06, "loss": 0.0649, "step": 89 }, { "epoch": 0.04094631483166515, "grad_norm": 0.9034225145874141, "learning_rate": 9.998381916396967e-06, "loss": 0.0621, "step": 90 }, { "epoch": 0.041401273885350316, "grad_norm": 0.6732889821553815, "learning_rate": 9.998345352706901e-06, "loss": 0.0367, "step": 91 }, { "epoch": 0.04185623293903549, "grad_norm": 0.7136967603743426, "learning_rate": 9.998308380574628e-06, "loss": 0.0569, "step": 92 }, { "epoch": 0.042311191992720654, "grad_norm": 1.1459385364035048, "learning_rate": 9.998271000003166e-06, "loss": 0.1184, "step": 93 }, { "epoch": 0.042766151046405826, "grad_norm": 0.8224906129097734, "learning_rate": 9.998233210995569e-06, "loss": 0.0682, "step": 94 }, { "epoch": 0.04322111010009099, "grad_norm": 1.5182946932236698, "learning_rate": 9.998195013554926e-06, "loss": 0.0875, "step": 95 }, { "epoch": 0.04367606915377616, "grad_norm": 0.9355855711018981, "learning_rate": 9.998156407684359e-06, "loss": 0.0939, "step": 96 }, { "epoch": 0.04413102820746133, "grad_norm": 0.7329840867165283, "learning_rate": 9.998117393387022e-06, "loss": 0.0466, "step": 97 }, { "epoch": 0.044585987261146494, "grad_norm": 0.8701001036058451, "learning_rate": 9.9980779706661e-06, "loss": 0.0729, "step": 98 }, { "epoch": 0.045040946314831666, "grad_norm": 1.0218896298663185, "learning_rate": 9.99803813952482e-06, "loss": 0.0828, "step": 99 }, { "epoch": 0.04549590536851683, "grad_norm": 0.9044995357273884, "learning_rate": 9.997997899966433e-06, "loss": 0.0709, "step": 100 }, { "epoch": 0.045950864422202004, "grad_norm": 0.9877796099816964, "learning_rate": 9.99795725199423e-06, "loss": 0.0903, "step": 101 }, { "epoch": 0.04640582347588717, "grad_norm": 1.0061501994463906, "learning_rate": 9.99791619561153e-06, "loss": 0.0831, "step": 102 }, { "epoch": 0.04686078252957234, "grad_norm": 0.8789173954818107, "learning_rate": 9.997874730821689e-06, "loss": 0.0714, "step": 103 }, { "epoch": 0.047315741583257506, "grad_norm": 15.480920098194954, "learning_rate": 9.997832857628093e-06, "loss": 0.2603, "step": 104 }, { "epoch": 0.04777070063694268, "grad_norm": 1.3806761301603454, "learning_rate": 9.99779057603417e-06, "loss": 0.1227, "step": 105 }, { "epoch": 0.048225659690627844, "grad_norm": 0.8462176607269959, "learning_rate": 9.997747886043368e-06, "loss": 0.0605, "step": 106 }, { "epoch": 0.04868061874431301, "grad_norm": 0.7467169847716549, "learning_rate": 9.997704787659179e-06, "loss": 0.0618, "step": 107 }, { "epoch": 0.04913557779799818, "grad_norm": 1.5653334818977065, "learning_rate": 9.997661280885125e-06, "loss": 0.1253, "step": 108 }, { "epoch": 0.049590536851683346, "grad_norm": 0.871706038604149, "learning_rate": 9.99761736572476e-06, "loss": 0.0716, "step": 109 }, { "epoch": 0.05004549590536852, "grad_norm": 1.1398296008355844, "learning_rate": 9.997573042181672e-06, "loss": 0.0698, "step": 110 }, { "epoch": 0.050500454959053684, "grad_norm": 1.0487992691419916, "learning_rate": 9.997528310259485e-06, "loss": 0.1102, "step": 111 }, { "epoch": 0.050955414012738856, "grad_norm": 0.9112684449646818, "learning_rate": 9.997483169961852e-06, "loss": 0.1032, "step": 112 }, { "epoch": 0.05141037306642402, "grad_norm": 0.9418790141923585, "learning_rate": 9.997437621292463e-06, "loss": 0.0771, "step": 113 }, { "epoch": 0.051865332120109194, "grad_norm": 0.7796140692842074, "learning_rate": 9.99739166425504e-06, "loss": 0.0627, "step": 114 }, { "epoch": 0.05232029117379436, "grad_norm": 1.5434421216734795, "learning_rate": 9.997345298853339e-06, "loss": 0.1495, "step": 115 }, { "epoch": 0.052775250227479524, "grad_norm": 0.8898179660551836, "learning_rate": 9.997298525091148e-06, "loss": 0.0735, "step": 116 }, { "epoch": 0.053230209281164696, "grad_norm": 0.8585916871524272, "learning_rate": 9.997251342972288e-06, "loss": 0.068, "step": 117 }, { "epoch": 0.05368516833484986, "grad_norm": 0.812806800238708, "learning_rate": 9.997203752500616e-06, "loss": 0.0689, "step": 118 }, { "epoch": 0.054140127388535034, "grad_norm": 0.9677722064277628, "learning_rate": 9.997155753680021e-06, "loss": 0.0795, "step": 119 }, { "epoch": 0.0545950864422202, "grad_norm": 1.621934591654054, "learning_rate": 9.997107346514425e-06, "loss": 0.0707, "step": 120 }, { "epoch": 0.05505004549590537, "grad_norm": 0.6750452750311531, "learning_rate": 9.997058531007782e-06, "loss": 0.0588, "step": 121 }, { "epoch": 0.055505004549590536, "grad_norm": 0.9583870506818666, "learning_rate": 9.997009307164083e-06, "loss": 0.0859, "step": 122 }, { "epoch": 0.05595996360327571, "grad_norm": 1.247483970027119, "learning_rate": 9.99695967498735e-06, "loss": 0.0952, "step": 123 }, { "epoch": 0.056414922656960874, "grad_norm": 0.7937903902273558, "learning_rate": 9.996909634481639e-06, "loss": 0.0614, "step": 124 }, { "epoch": 0.05686988171064604, "grad_norm": 4.855426128828546, "learning_rate": 9.996859185651038e-06, "loss": 0.1629, "step": 125 }, { "epoch": 0.05732484076433121, "grad_norm": 1.0499970639607177, "learning_rate": 9.99680832849967e-06, "loss": 0.1031, "step": 126 }, { "epoch": 0.05777979981801638, "grad_norm": 0.8730447821488512, "learning_rate": 9.99675706303169e-06, "loss": 0.0606, "step": 127 }, { "epoch": 0.05823475887170155, "grad_norm": 1.2779985416162813, "learning_rate": 9.99670538925129e-06, "loss": 0.074, "step": 128 }, { "epoch": 0.058689717925386714, "grad_norm": 0.8606157718419157, "learning_rate": 9.996653307162687e-06, "loss": 0.0703, "step": 129 }, { "epoch": 0.059144676979071886, "grad_norm": 0.8920761218762643, "learning_rate": 9.996600816770144e-06, "loss": 0.0818, "step": 130 }, { "epoch": 0.05959963603275705, "grad_norm": 1.1603462045917847, "learning_rate": 9.996547918077944e-06, "loss": 0.1148, "step": 131 }, { "epoch": 0.06005459508644222, "grad_norm": 0.9108713801214797, "learning_rate": 9.996494611090414e-06, "loss": 0.0884, "step": 132 }, { "epoch": 0.06050955414012739, "grad_norm": 0.6523725468628359, "learning_rate": 9.996440895811907e-06, "loss": 0.0535, "step": 133 }, { "epoch": 0.060964513193812554, "grad_norm": 0.8812777694752004, "learning_rate": 9.996386772246816e-06, "loss": 0.087, "step": 134 }, { "epoch": 0.061419472247497726, "grad_norm": 1.0622191207422995, "learning_rate": 9.99633224039956e-06, "loss": 0.0982, "step": 135 }, { "epoch": 0.06187443130118289, "grad_norm": 3.7961077321923025, "learning_rate": 9.996277300274596e-06, "loss": 0.1526, "step": 136 }, { "epoch": 0.062329390354868064, "grad_norm": 0.9444433559435487, "learning_rate": 9.996221951876415e-06, "loss": 0.0996, "step": 137 }, { "epoch": 0.06278434940855324, "grad_norm": 1.444871481552235, "learning_rate": 9.996166195209539e-06, "loss": 0.1075, "step": 138 }, { "epoch": 0.0632393084622384, "grad_norm": 0.7446446480732116, "learning_rate": 9.996110030278522e-06, "loss": 0.0561, "step": 139 }, { "epoch": 0.06369426751592357, "grad_norm": 0.8913010543094952, "learning_rate": 9.996053457087958e-06, "loss": 0.0715, "step": 140 }, { "epoch": 0.06414922656960874, "grad_norm": 0.7815821404043856, "learning_rate": 9.995996475642466e-06, "loss": 0.0796, "step": 141 }, { "epoch": 0.0646041856232939, "grad_norm": 0.74337588448595, "learning_rate": 9.995939085946704e-06, "loss": 0.0661, "step": 142 }, { "epoch": 0.06505914467697907, "grad_norm": 0.9974255688753435, "learning_rate": 9.995881288005363e-06, "loss": 0.0869, "step": 143 }, { "epoch": 0.06551410373066424, "grad_norm": 1.2260290141946268, "learning_rate": 9.995823081823162e-06, "loss": 0.0766, "step": 144 }, { "epoch": 0.06596906278434941, "grad_norm": 0.9751795993584637, "learning_rate": 9.99576446740486e-06, "loss": 0.091, "step": 145 }, { "epoch": 0.06642402183803457, "grad_norm": 1.6175476325168967, "learning_rate": 9.995705444755249e-06, "loss": 0.1208, "step": 146 }, { "epoch": 0.06687898089171974, "grad_norm": 0.7580083688127299, "learning_rate": 9.995646013879147e-06, "loss": 0.0622, "step": 147 }, { "epoch": 0.06733393994540492, "grad_norm": 1.0194887039793072, "learning_rate": 9.995586174781413e-06, "loss": 0.0753, "step": 148 }, { "epoch": 0.06778889899909009, "grad_norm": 0.9065646408503975, "learning_rate": 9.995525927466936e-06, "loss": 0.0848, "step": 149 }, { "epoch": 0.06824385805277525, "grad_norm": 0.8871078738477127, "learning_rate": 9.995465271940641e-06, "loss": 0.0607, "step": 150 }, { "epoch": 0.06869881710646042, "grad_norm": 1.1486707652049646, "learning_rate": 9.995404208207485e-06, "loss": 0.0809, "step": 151 }, { "epoch": 0.06915377616014559, "grad_norm": 1.1473150526096232, "learning_rate": 9.995342736272453e-06, "loss": 0.1035, "step": 152 }, { "epoch": 0.06960873521383075, "grad_norm": 1.3025683052462544, "learning_rate": 9.995280856140572e-06, "loss": 0.1197, "step": 153 }, { "epoch": 0.07006369426751592, "grad_norm": 0.8069596755970996, "learning_rate": 9.9952185678169e-06, "loss": 0.0526, "step": 154 }, { "epoch": 0.0705186533212011, "grad_norm": 0.8153700064848134, "learning_rate": 9.995155871306524e-06, "loss": 0.0613, "step": 155 }, { "epoch": 0.07097361237488627, "grad_norm": 0.7319023745966868, "learning_rate": 9.995092766614567e-06, "loss": 0.0512, "step": 156 }, { "epoch": 0.07142857142857142, "grad_norm": 1.0146656175738817, "learning_rate": 9.995029253746186e-06, "loss": 0.0846, "step": 157 }, { "epoch": 0.0718835304822566, "grad_norm": 0.8015254985373994, "learning_rate": 9.994965332706574e-06, "loss": 0.0619, "step": 158 }, { "epoch": 0.07233848953594177, "grad_norm": 1.0630207312416284, "learning_rate": 9.994901003500952e-06, "loss": 0.0796, "step": 159 }, { "epoch": 0.07279344858962693, "grad_norm": 0.9431304991088505, "learning_rate": 9.994836266134575e-06, "loss": 0.0743, "step": 160 }, { "epoch": 0.0732484076433121, "grad_norm": 1.023738915097686, "learning_rate": 9.994771120612737e-06, "loss": 0.0888, "step": 161 }, { "epoch": 0.07370336669699727, "grad_norm": 0.9272637744585672, "learning_rate": 9.994705566940757e-06, "loss": 0.084, "step": 162 }, { "epoch": 0.07415832575068244, "grad_norm": 1.122378326253592, "learning_rate": 9.994639605123994e-06, "loss": 0.0961, "step": 163 }, { "epoch": 0.0746132848043676, "grad_norm": 0.753531768411978, "learning_rate": 9.994573235167839e-06, "loss": 0.0736, "step": 164 }, { "epoch": 0.07506824385805277, "grad_norm": 0.9314766958597749, "learning_rate": 9.994506457077715e-06, "loss": 0.0838, "step": 165 }, { "epoch": 0.07552320291173795, "grad_norm": 0.996008388557059, "learning_rate": 9.994439270859077e-06, "loss": 0.1076, "step": 166 }, { "epoch": 0.07597816196542312, "grad_norm": 0.9199332464612126, "learning_rate": 9.994371676517418e-06, "loss": 0.0724, "step": 167 }, { "epoch": 0.07643312101910828, "grad_norm": 0.8652292283168678, "learning_rate": 9.994303674058259e-06, "loss": 0.0628, "step": 168 }, { "epoch": 0.07688808007279345, "grad_norm": 0.8176262426438138, "learning_rate": 9.994235263487158e-06, "loss": 0.0743, "step": 169 }, { "epoch": 0.07734303912647862, "grad_norm": 0.8147855247941459, "learning_rate": 9.994166444809705e-06, "loss": 0.0559, "step": 170 }, { "epoch": 0.07779799818016378, "grad_norm": 0.7853019575635352, "learning_rate": 9.994097218031524e-06, "loss": 0.0681, "step": 171 }, { "epoch": 0.07825295723384895, "grad_norm": 0.8445610480134321, "learning_rate": 9.994027583158272e-06, "loss": 0.0785, "step": 172 }, { "epoch": 0.07870791628753412, "grad_norm": 0.8555498692388026, "learning_rate": 9.993957540195638e-06, "loss": 0.077, "step": 173 }, { "epoch": 0.0791628753412193, "grad_norm": 0.8281270493499452, "learning_rate": 9.993887089149346e-06, "loss": 0.0848, "step": 174 }, { "epoch": 0.07961783439490445, "grad_norm": 0.7180425978661062, "learning_rate": 9.993816230025152e-06, "loss": 0.0588, "step": 175 }, { "epoch": 0.08007279344858963, "grad_norm": 0.9287545326980071, "learning_rate": 9.99374496282885e-06, "loss": 0.0874, "step": 176 }, { "epoch": 0.0805277525022748, "grad_norm": 1.5950603980195528, "learning_rate": 9.993673287566261e-06, "loss": 0.1301, "step": 177 }, { "epoch": 0.08098271155595996, "grad_norm": 0.505966633973175, "learning_rate": 9.99360120424324e-06, "loss": 0.0459, "step": 178 }, { "epoch": 0.08143767060964513, "grad_norm": 0.6170796905443107, "learning_rate": 9.993528712865681e-06, "loss": 0.0666, "step": 179 }, { "epoch": 0.0818926296633303, "grad_norm": 0.8965600572228928, "learning_rate": 9.993455813439507e-06, "loss": 0.0648, "step": 180 }, { "epoch": 0.08234758871701547, "grad_norm": 0.7555745664692847, "learning_rate": 9.993382505970673e-06, "loss": 0.0479, "step": 181 }, { "epoch": 0.08280254777070063, "grad_norm": 0.7885826993774436, "learning_rate": 9.99330879046517e-06, "loss": 0.0605, "step": 182 }, { "epoch": 0.0832575068243858, "grad_norm": 0.6970911126559147, "learning_rate": 9.993234666929024e-06, "loss": 0.0545, "step": 183 }, { "epoch": 0.08371246587807098, "grad_norm": 0.8281240642020996, "learning_rate": 9.99316013536829e-06, "loss": 0.0651, "step": 184 }, { "epoch": 0.08416742493175614, "grad_norm": 0.8497823551734951, "learning_rate": 9.993085195789057e-06, "loss": 0.098, "step": 185 }, { "epoch": 0.08462238398544131, "grad_norm": 0.8425278224044996, "learning_rate": 9.993009848197452e-06, "loss": 0.0861, "step": 186 }, { "epoch": 0.08507734303912648, "grad_norm": 0.729342450692031, "learning_rate": 9.992934092599629e-06, "loss": 0.0651, "step": 187 }, { "epoch": 0.08553230209281165, "grad_norm": 0.8810253378927329, "learning_rate": 9.99285792900178e-06, "loss": 0.0995, "step": 188 }, { "epoch": 0.08598726114649681, "grad_norm": 1.0402457083445067, "learning_rate": 9.992781357410131e-06, "loss": 0.1061, "step": 189 }, { "epoch": 0.08644222020018198, "grad_norm": 0.7397036090930822, "learning_rate": 9.992704377830934e-06, "loss": 0.0571, "step": 190 }, { "epoch": 0.08689717925386715, "grad_norm": 1.4783630598693296, "learning_rate": 9.992626990270484e-06, "loss": 0.1154, "step": 191 }, { "epoch": 0.08735213830755233, "grad_norm": 1.1100322283473036, "learning_rate": 9.992549194735101e-06, "loss": 0.1179, "step": 192 }, { "epoch": 0.08780709736123748, "grad_norm": 0.5797984556503705, "learning_rate": 9.992470991231144e-06, "loss": 0.0466, "step": 193 }, { "epoch": 0.08826205641492266, "grad_norm": 1.059908713900853, "learning_rate": 9.992392379765005e-06, "loss": 0.0994, "step": 194 }, { "epoch": 0.08871701546860783, "grad_norm": 1.1187885391430794, "learning_rate": 9.992313360343104e-06, "loss": 0.0986, "step": 195 }, { "epoch": 0.08917197452229299, "grad_norm": 0.7509441330173129, "learning_rate": 9.992233932971901e-06, "loss": 0.0634, "step": 196 }, { "epoch": 0.08962693357597816, "grad_norm": 0.9426276516690344, "learning_rate": 9.992154097657888e-06, "loss": 0.0857, "step": 197 }, { "epoch": 0.09008189262966333, "grad_norm": 0.8754039034503873, "learning_rate": 9.992073854407585e-06, "loss": 0.0881, "step": 198 }, { "epoch": 0.0905368516833485, "grad_norm": 2.8697219156120712, "learning_rate": 9.99199320322755e-06, "loss": 0.0851, "step": 199 }, { "epoch": 0.09099181073703366, "grad_norm": 0.7429242681646778, "learning_rate": 9.991912144124375e-06, "loss": 0.0729, "step": 200 }, { "epoch": 0.09144676979071883, "grad_norm": 1.0552979449251756, "learning_rate": 9.991830677104682e-06, "loss": 0.1066, "step": 201 }, { "epoch": 0.09190172884440401, "grad_norm": 0.8812651371324355, "learning_rate": 9.99174880217513e-06, "loss": 0.0732, "step": 202 }, { "epoch": 0.09235668789808917, "grad_norm": 1.0755107845413352, "learning_rate": 9.991666519342407e-06, "loss": 0.0977, "step": 203 }, { "epoch": 0.09281164695177434, "grad_norm": 0.8925063431256136, "learning_rate": 9.99158382861324e-06, "loss": 0.0904, "step": 204 }, { "epoch": 0.09326660600545951, "grad_norm": 0.8190206986922173, "learning_rate": 9.991500729994384e-06, "loss": 0.0729, "step": 205 }, { "epoch": 0.09372156505914468, "grad_norm": 0.6635798147425112, "learning_rate": 9.991417223492629e-06, "loss": 0.0631, "step": 206 }, { "epoch": 0.09417652411282984, "grad_norm": 1.0314655306023923, "learning_rate": 9.991333309114798e-06, "loss": 0.0852, "step": 207 }, { "epoch": 0.09463148316651501, "grad_norm": 0.8533496857694978, "learning_rate": 9.991248986867753e-06, "loss": 0.0868, "step": 208 }, { "epoch": 0.09508644222020018, "grad_norm": 1.039085255997433, "learning_rate": 9.991164256758378e-06, "loss": 0.095, "step": 209 }, { "epoch": 0.09554140127388536, "grad_norm": 1.1484522866350177, "learning_rate": 9.9910791187936e-06, "loss": 0.1333, "step": 210 }, { "epoch": 0.09599636032757052, "grad_norm": 0.8277820800102422, "learning_rate": 9.99099357298038e-06, "loss": 0.0664, "step": 211 }, { "epoch": 0.09645131938125569, "grad_norm": 0.821796111319934, "learning_rate": 9.9909076193257e-06, "loss": 0.083, "step": 212 }, { "epoch": 0.09690627843494086, "grad_norm": 0.9448800546720313, "learning_rate": 9.990821257836589e-06, "loss": 0.0873, "step": 213 }, { "epoch": 0.09736123748862602, "grad_norm": 0.9002810379340489, "learning_rate": 9.990734488520103e-06, "loss": 0.099, "step": 214 }, { "epoch": 0.09781619654231119, "grad_norm": 0.6145149717344348, "learning_rate": 9.990647311383334e-06, "loss": 0.0425, "step": 215 }, { "epoch": 0.09827115559599636, "grad_norm": 1.1377497370761045, "learning_rate": 9.990559726433404e-06, "loss": 0.0903, "step": 216 }, { "epoch": 0.09872611464968153, "grad_norm": 0.8401357673155365, "learning_rate": 9.99047173367747e-06, "loss": 0.0812, "step": 217 }, { "epoch": 0.09918107370336669, "grad_norm": 0.6977882365614015, "learning_rate": 9.990383333122722e-06, "loss": 0.0613, "step": 218 }, { "epoch": 0.09963603275705187, "grad_norm": 0.6751056796776193, "learning_rate": 9.990294524776384e-06, "loss": 0.0636, "step": 219 }, { "epoch": 0.10009099181073704, "grad_norm": 0.7973250315161167, "learning_rate": 9.990205308645716e-06, "loss": 0.0655, "step": 220 }, { "epoch": 0.1005459508644222, "grad_norm": 0.6494979859380491, "learning_rate": 9.990115684738005e-06, "loss": 0.0461, "step": 221 }, { "epoch": 0.10100090991810737, "grad_norm": 0.7863907355652456, "learning_rate": 9.990025653060574e-06, "loss": 0.0881, "step": 222 }, { "epoch": 0.10145586897179254, "grad_norm": 1.2756737972223395, "learning_rate": 9.98993521362078e-06, "loss": 0.1102, "step": 223 }, { "epoch": 0.10191082802547771, "grad_norm": 1.1992554133605928, "learning_rate": 9.989844366426018e-06, "loss": 0.1147, "step": 224 }, { "epoch": 0.10236578707916287, "grad_norm": 0.5034605400337953, "learning_rate": 9.989753111483707e-06, "loss": 0.0462, "step": 225 }, { "epoch": 0.10282074613284804, "grad_norm": 0.9881921480518578, "learning_rate": 9.989661448801305e-06, "loss": 0.0848, "step": 226 }, { "epoch": 0.10327570518653321, "grad_norm": 0.7581777568438945, "learning_rate": 9.989569378386303e-06, "loss": 0.079, "step": 227 }, { "epoch": 0.10373066424021839, "grad_norm": 0.6464731162067388, "learning_rate": 9.989476900246223e-06, "loss": 0.0617, "step": 228 }, { "epoch": 0.10418562329390355, "grad_norm": 0.8780639185859085, "learning_rate": 9.989384014388624e-06, "loss": 0.086, "step": 229 }, { "epoch": 0.10464058234758872, "grad_norm": 0.6623808171307163, "learning_rate": 9.989290720821095e-06, "loss": 0.0694, "step": 230 }, { "epoch": 0.10509554140127389, "grad_norm": 0.721054554263859, "learning_rate": 9.98919701955126e-06, "loss": 0.0735, "step": 231 }, { "epoch": 0.10555050045495905, "grad_norm": 0.7868134014829404, "learning_rate": 9.989102910586776e-06, "loss": 0.0546, "step": 232 }, { "epoch": 0.10600545950864422, "grad_norm": 0.9137158371163484, "learning_rate": 9.989008393935331e-06, "loss": 0.0771, "step": 233 }, { "epoch": 0.10646041856232939, "grad_norm": 0.8326009579593463, "learning_rate": 9.98891346960465e-06, "loss": 0.0667, "step": 234 }, { "epoch": 0.10691537761601456, "grad_norm": 0.6462724580348628, "learning_rate": 9.988818137602494e-06, "loss": 0.0717, "step": 235 }, { "epoch": 0.10737033666969972, "grad_norm": 0.7513725247558808, "learning_rate": 9.988722397936646e-06, "loss": 0.0733, "step": 236 }, { "epoch": 0.1078252957233849, "grad_norm": 1.094509848236789, "learning_rate": 9.988626250614932e-06, "loss": 0.1009, "step": 237 }, { "epoch": 0.10828025477707007, "grad_norm": 0.8200579138639758, "learning_rate": 9.98852969564521e-06, "loss": 0.0844, "step": 238 }, { "epoch": 0.10873521383075523, "grad_norm": 0.7417763562196316, "learning_rate": 9.988432733035369e-06, "loss": 0.0611, "step": 239 }, { "epoch": 0.1091901728844404, "grad_norm": 0.8476475869820355, "learning_rate": 9.988335362793333e-06, "loss": 0.0863, "step": 240 }, { "epoch": 0.10964513193812557, "grad_norm": 0.9998642783878469, "learning_rate": 9.988237584927058e-06, "loss": 0.0909, "step": 241 }, { "epoch": 0.11010009099181074, "grad_norm": 1.1689324698997519, "learning_rate": 9.988139399444534e-06, "loss": 0.124, "step": 242 }, { "epoch": 0.1105550500454959, "grad_norm": 0.790901332269412, "learning_rate": 9.988040806353786e-06, "loss": 0.0855, "step": 243 }, { "epoch": 0.11101000909918107, "grad_norm": 0.8931785977847209, "learning_rate": 9.987941805662869e-06, "loss": 0.1023, "step": 244 }, { "epoch": 0.11146496815286625, "grad_norm": 0.7352781929773609, "learning_rate": 9.98784239737987e-06, "loss": 0.0563, "step": 245 }, { "epoch": 0.11191992720655142, "grad_norm": 0.7169092611535308, "learning_rate": 9.987742581512919e-06, "loss": 0.0683, "step": 246 }, { "epoch": 0.11237488626023658, "grad_norm": 0.6767560569792272, "learning_rate": 9.987642358070167e-06, "loss": 0.0669, "step": 247 }, { "epoch": 0.11282984531392175, "grad_norm": 0.8442319805699996, "learning_rate": 9.987541727059805e-06, "loss": 0.0768, "step": 248 }, { "epoch": 0.11328480436760692, "grad_norm": 0.7700876798522618, "learning_rate": 9.987440688490058e-06, "loss": 0.0643, "step": 249 }, { "epoch": 0.11373976342129208, "grad_norm": 0.7286087978317647, "learning_rate": 9.98733924236918e-06, "loss": 0.0698, "step": 250 }, { "epoch": 0.11419472247497725, "grad_norm": 0.7917355018437868, "learning_rate": 9.98723738870546e-06, "loss": 0.0791, "step": 251 }, { "epoch": 0.11464968152866242, "grad_norm": 1.0469499693242315, "learning_rate": 9.987135127507226e-06, "loss": 0.0761, "step": 252 }, { "epoch": 0.1151046405823476, "grad_norm": 0.8361714930383379, "learning_rate": 9.987032458782828e-06, "loss": 0.0789, "step": 253 }, { "epoch": 0.11555959963603275, "grad_norm": 0.5902853873046482, "learning_rate": 9.986929382540662e-06, "loss": 0.0479, "step": 254 }, { "epoch": 0.11601455868971793, "grad_norm": 0.7349436304465384, "learning_rate": 9.986825898789145e-06, "loss": 0.0668, "step": 255 }, { "epoch": 0.1164695177434031, "grad_norm": 0.7657107039148755, "learning_rate": 9.986722007536737e-06, "loss": 0.0617, "step": 256 }, { "epoch": 0.11692447679708826, "grad_norm": 0.6450631027744769, "learning_rate": 9.986617708791926e-06, "loss": 0.0679, "step": 257 }, { "epoch": 0.11737943585077343, "grad_norm": 0.6292930010016882, "learning_rate": 9.986513002563236e-06, "loss": 0.0482, "step": 258 }, { "epoch": 0.1178343949044586, "grad_norm": 0.8758541343517451, "learning_rate": 9.986407888859221e-06, "loss": 0.0994, "step": 259 }, { "epoch": 0.11828935395814377, "grad_norm": 0.6537445862223847, "learning_rate": 9.986302367688473e-06, "loss": 0.07, "step": 260 }, { "epoch": 0.11874431301182893, "grad_norm": 0.8029660816844667, "learning_rate": 9.986196439059613e-06, "loss": 0.0623, "step": 261 }, { "epoch": 0.1191992720655141, "grad_norm": 0.7339528606524214, "learning_rate": 9.986090102981297e-06, "loss": 0.0791, "step": 262 }, { "epoch": 0.11965423111919928, "grad_norm": 0.7934112522002073, "learning_rate": 9.985983359462215e-06, "loss": 0.0672, "step": 263 }, { "epoch": 0.12010919017288443, "grad_norm": 1.0186962263060808, "learning_rate": 9.98587620851109e-06, "loss": 0.1213, "step": 264 }, { "epoch": 0.1205641492265696, "grad_norm": 0.6769843647605545, "learning_rate": 9.985768650136679e-06, "loss": 0.0685, "step": 265 }, { "epoch": 0.12101910828025478, "grad_norm": 0.7543020935976431, "learning_rate": 9.985660684347765e-06, "loss": 0.0861, "step": 266 }, { "epoch": 0.12147406733393995, "grad_norm": 0.9552124731299731, "learning_rate": 9.985552311153178e-06, "loss": 0.0922, "step": 267 }, { "epoch": 0.12192902638762511, "grad_norm": 0.7436699167226903, "learning_rate": 9.985443530561769e-06, "loss": 0.0885, "step": 268 }, { "epoch": 0.12238398544131028, "grad_norm": 1.329058937551934, "learning_rate": 9.98533434258243e-06, "loss": 0.1115, "step": 269 }, { "epoch": 0.12283894449499545, "grad_norm": 0.6835909813818813, "learning_rate": 9.985224747224083e-06, "loss": 0.0586, "step": 270 }, { "epoch": 0.12329390354868063, "grad_norm": 1.0733107060854794, "learning_rate": 9.98511474449568e-06, "loss": 0.0811, "step": 271 }, { "epoch": 0.12374886260236578, "grad_norm": 0.5916007278667166, "learning_rate": 9.985004334406215e-06, "loss": 0.0696, "step": 272 }, { "epoch": 0.12420382165605096, "grad_norm": 0.9149357508392912, "learning_rate": 9.984893516964707e-06, "loss": 0.0704, "step": 273 }, { "epoch": 0.12465878070973613, "grad_norm": 1.1634742377762608, "learning_rate": 9.984782292180212e-06, "loss": 0.1178, "step": 274 }, { "epoch": 0.1251137397634213, "grad_norm": 0.603957454908005, "learning_rate": 9.98467066006182e-06, "loss": 0.0585, "step": 275 }, { "epoch": 0.12556869881710647, "grad_norm": 0.7735087790025026, "learning_rate": 9.984558620618651e-06, "loss": 0.0953, "step": 276 }, { "epoch": 0.12602365787079162, "grad_norm": 1.2570182633873541, "learning_rate": 9.984446173859863e-06, "loss": 0.1353, "step": 277 }, { "epoch": 0.1264786169244768, "grad_norm": 0.7275895818672663, "learning_rate": 9.984333319794642e-06, "loss": 0.0774, "step": 278 }, { "epoch": 0.12693357597816196, "grad_norm": 0.6395006056363333, "learning_rate": 9.984220058432212e-06, "loss": 0.0591, "step": 279 }, { "epoch": 0.12738853503184713, "grad_norm": 0.6563921850032347, "learning_rate": 9.984106389781828e-06, "loss": 0.0573, "step": 280 }, { "epoch": 0.1278434940855323, "grad_norm": 0.9399157526953884, "learning_rate": 9.983992313852776e-06, "loss": 0.0793, "step": 281 }, { "epoch": 0.12829845313921748, "grad_norm": 0.93528061821534, "learning_rate": 9.983877830654381e-06, "loss": 0.0807, "step": 282 }, { "epoch": 0.12875341219290265, "grad_norm": 0.7192448233352142, "learning_rate": 9.983762940195996e-06, "loss": 0.0773, "step": 283 }, { "epoch": 0.1292083712465878, "grad_norm": 0.7097381072031733, "learning_rate": 9.98364764248701e-06, "loss": 0.0698, "step": 284 }, { "epoch": 0.12966333030027297, "grad_norm": 1.1635566012920768, "learning_rate": 9.983531937536844e-06, "loss": 0.0893, "step": 285 }, { "epoch": 0.13011828935395814, "grad_norm": 0.8456555685011555, "learning_rate": 9.983415825354954e-06, "loss": 0.0628, "step": 286 }, { "epoch": 0.1305732484076433, "grad_norm": 0.7151838393189083, "learning_rate": 9.983299305950828e-06, "loss": 0.0557, "step": 287 }, { "epoch": 0.13102820746132848, "grad_norm": 0.7095193783870621, "learning_rate": 9.983182379333989e-06, "loss": 0.0604, "step": 288 }, { "epoch": 0.13148316651501366, "grad_norm": 0.8581434444337498, "learning_rate": 9.983065045513986e-06, "loss": 0.0781, "step": 289 }, { "epoch": 0.13193812556869883, "grad_norm": 0.5600994934804626, "learning_rate": 9.982947304500414e-06, "loss": 0.0498, "step": 290 }, { "epoch": 0.13239308462238397, "grad_norm": 0.7355720212694087, "learning_rate": 9.98282915630289e-06, "loss": 0.0692, "step": 291 }, { "epoch": 0.13284804367606914, "grad_norm": 1.6846985851500909, "learning_rate": 9.98271060093107e-06, "loss": 0.1687, "step": 292 }, { "epoch": 0.13330300272975432, "grad_norm": 0.7959406174268434, "learning_rate": 9.98259163839464e-06, "loss": 0.0718, "step": 293 }, { "epoch": 0.1337579617834395, "grad_norm": 0.6005858848115938, "learning_rate": 9.982472268703323e-06, "loss": 0.0465, "step": 294 }, { "epoch": 0.13421292083712466, "grad_norm": 0.7865103977061746, "learning_rate": 9.982352491866874e-06, "loss": 0.071, "step": 295 }, { "epoch": 0.13466787989080983, "grad_norm": 0.7167219429964851, "learning_rate": 9.982232307895077e-06, "loss": 0.0658, "step": 296 }, { "epoch": 0.135122838944495, "grad_norm": 1.206398567596641, "learning_rate": 9.982111716797758e-06, "loss": 0.101, "step": 297 }, { "epoch": 0.13557779799818018, "grad_norm": 1.0085912508470862, "learning_rate": 9.981990718584768e-06, "loss": 0.0959, "step": 298 }, { "epoch": 0.13603275705186532, "grad_norm": 0.8594135430057543, "learning_rate": 9.981869313265995e-06, "loss": 0.0912, "step": 299 }, { "epoch": 0.1364877161055505, "grad_norm": 0.9903339586980618, "learning_rate": 9.981747500851357e-06, "loss": 0.0692, "step": 300 }, { "epoch": 0.13694267515923567, "grad_norm": 0.7623380548666351, "learning_rate": 9.981625281350812e-06, "loss": 0.0699, "step": 301 }, { "epoch": 0.13739763421292084, "grad_norm": 0.6267143484055344, "learning_rate": 9.981502654774349e-06, "loss": 0.0499, "step": 302 }, { "epoch": 0.137852593266606, "grad_norm": 0.8234150836820757, "learning_rate": 9.98137962113198e-06, "loss": 0.0788, "step": 303 }, { "epoch": 0.13830755232029118, "grad_norm": 0.8158733102806115, "learning_rate": 9.98125618043377e-06, "loss": 0.089, "step": 304 }, { "epoch": 0.13876251137397635, "grad_norm": 0.6372656549463032, "learning_rate": 9.981132332689796e-06, "loss": 0.0517, "step": 305 }, { "epoch": 0.1392174704276615, "grad_norm": 0.7713863813548327, "learning_rate": 9.981008077910184e-06, "loss": 0.0769, "step": 306 }, { "epoch": 0.13967242948134667, "grad_norm": 0.8883775702857831, "learning_rate": 9.980883416105084e-06, "loss": 0.0828, "step": 307 }, { "epoch": 0.14012738853503184, "grad_norm": 0.6490936355626988, "learning_rate": 9.980758347284687e-06, "loss": 0.0618, "step": 308 }, { "epoch": 0.14058234758871702, "grad_norm": 0.8359554084586713, "learning_rate": 9.980632871459209e-06, "loss": 0.0714, "step": 309 }, { "epoch": 0.1410373066424022, "grad_norm": 0.7373523328454649, "learning_rate": 9.980506988638906e-06, "loss": 0.0836, "step": 310 }, { "epoch": 0.14149226569608736, "grad_norm": 0.6644370731485183, "learning_rate": 9.980380698834064e-06, "loss": 0.0777, "step": 311 }, { "epoch": 0.14194722474977253, "grad_norm": 0.870883965477211, "learning_rate": 9.980254002055003e-06, "loss": 0.0847, "step": 312 }, { "epoch": 0.14240218380345768, "grad_norm": 0.6021065409531002, "learning_rate": 9.980126898312074e-06, "loss": 0.0583, "step": 313 }, { "epoch": 0.14285714285714285, "grad_norm": 0.8705461588189498, "learning_rate": 9.979999387615665e-06, "loss": 0.0895, "step": 314 }, { "epoch": 0.14331210191082802, "grad_norm": 0.9639410731114018, "learning_rate": 9.979871469976197e-06, "loss": 0.0901, "step": 315 }, { "epoch": 0.1437670609645132, "grad_norm": 0.7554126383153169, "learning_rate": 9.97974314540412e-06, "loss": 0.0699, "step": 316 }, { "epoch": 0.14422202001819837, "grad_norm": 1.1039648440512544, "learning_rate": 9.979614413909922e-06, "loss": 0.1013, "step": 317 }, { "epoch": 0.14467697907188354, "grad_norm": 0.5258831871743486, "learning_rate": 9.979485275504121e-06, "loss": 0.0544, "step": 318 }, { "epoch": 0.1451319381255687, "grad_norm": 1.3025897394440575, "learning_rate": 9.979355730197271e-06, "loss": 0.1067, "step": 319 }, { "epoch": 0.14558689717925385, "grad_norm": 0.5206132423310033, "learning_rate": 9.979225777999956e-06, "loss": 0.0497, "step": 320 }, { "epoch": 0.14604185623293903, "grad_norm": 0.7202189397663867, "learning_rate": 9.9790954189228e-06, "loss": 0.0807, "step": 321 }, { "epoch": 0.1464968152866242, "grad_norm": 0.5738667169449175, "learning_rate": 9.97896465297645e-06, "loss": 0.0614, "step": 322 }, { "epoch": 0.14695177434030937, "grad_norm": 0.7972440737628133, "learning_rate": 9.978833480171592e-06, "loss": 0.0906, "step": 323 }, { "epoch": 0.14740673339399454, "grad_norm": 0.7697423454053598, "learning_rate": 9.978701900518947e-06, "loss": 0.0632, "step": 324 }, { "epoch": 0.14786169244767972, "grad_norm": 0.8259885564233931, "learning_rate": 9.978569914029267e-06, "loss": 0.0944, "step": 325 }, { "epoch": 0.1483166515013649, "grad_norm": 0.8450006655868962, "learning_rate": 9.978437520713335e-06, "loss": 0.0862, "step": 326 }, { "epoch": 0.14877161055505003, "grad_norm": 0.7746078278616594, "learning_rate": 9.978304720581973e-06, "loss": 0.088, "step": 327 }, { "epoch": 0.1492265696087352, "grad_norm": 0.9977734940815816, "learning_rate": 9.97817151364603e-06, "loss": 0.1036, "step": 328 }, { "epoch": 0.14968152866242038, "grad_norm": 0.7800752301510507, "learning_rate": 9.978037899916393e-06, "loss": 0.0778, "step": 329 }, { "epoch": 0.15013648771610555, "grad_norm": 0.7521153273438224, "learning_rate": 9.97790387940398e-06, "loss": 0.0532, "step": 330 }, { "epoch": 0.15059144676979072, "grad_norm": 0.8046420256419254, "learning_rate": 9.977769452119741e-06, "loss": 0.0708, "step": 331 }, { "epoch": 0.1510464058234759, "grad_norm": 0.9071770528791517, "learning_rate": 9.97763461807466e-06, "loss": 0.1006, "step": 332 }, { "epoch": 0.15150136487716107, "grad_norm": 0.8824570234268595, "learning_rate": 9.97749937727976e-06, "loss": 0.0855, "step": 333 }, { "epoch": 0.15195632393084624, "grad_norm": 0.8286075823730068, "learning_rate": 9.977363729746088e-06, "loss": 0.077, "step": 334 }, { "epoch": 0.15241128298453138, "grad_norm": 0.6791233851472963, "learning_rate": 9.977227675484729e-06, "loss": 0.0698, "step": 335 }, { "epoch": 0.15286624203821655, "grad_norm": 0.9813875260679181, "learning_rate": 9.977091214506803e-06, "loss": 0.0838, "step": 336 }, { "epoch": 0.15332120109190173, "grad_norm": 0.9986284190120469, "learning_rate": 9.976954346823456e-06, "loss": 0.0789, "step": 337 }, { "epoch": 0.1537761601455869, "grad_norm": 0.6456071732838817, "learning_rate": 9.976817072445878e-06, "loss": 0.0566, "step": 338 }, { "epoch": 0.15423111919927207, "grad_norm": 0.7707362352402762, "learning_rate": 9.976679391385283e-06, "loss": 0.0677, "step": 339 }, { "epoch": 0.15468607825295724, "grad_norm": 0.5804713825378958, "learning_rate": 9.976541303652923e-06, "loss": 0.0547, "step": 340 }, { "epoch": 0.15514103730664242, "grad_norm": 0.7705377953828665, "learning_rate": 9.976402809260083e-06, "loss": 0.0673, "step": 341 }, { "epoch": 0.15559599636032756, "grad_norm": 0.651002355082985, "learning_rate": 9.976263908218076e-06, "loss": 0.066, "step": 342 }, { "epoch": 0.15605095541401273, "grad_norm": 1.0075230687249708, "learning_rate": 9.976124600538257e-06, "loss": 0.1151, "step": 343 }, { "epoch": 0.1565059144676979, "grad_norm": 0.7110146200064966, "learning_rate": 9.975984886232006e-06, "loss": 0.0693, "step": 344 }, { "epoch": 0.15696087352138308, "grad_norm": 0.782615076662302, "learning_rate": 9.975844765310743e-06, "loss": 0.071, "step": 345 }, { "epoch": 0.15741583257506825, "grad_norm": 1.091513822496144, "learning_rate": 9.975704237785915e-06, "loss": 0.1277, "step": 346 }, { "epoch": 0.15787079162875342, "grad_norm": 0.8244942271322709, "learning_rate": 9.975563303669006e-06, "loss": 0.092, "step": 347 }, { "epoch": 0.1583257506824386, "grad_norm": 1.0997264747524325, "learning_rate": 9.975421962971536e-06, "loss": 0.102, "step": 348 }, { "epoch": 0.15878070973612374, "grad_norm": 1.0471722358260585, "learning_rate": 9.97528021570505e-06, "loss": 0.1112, "step": 349 }, { "epoch": 0.1592356687898089, "grad_norm": 0.6366013160292697, "learning_rate": 9.975138061881135e-06, "loss": 0.0629, "step": 350 }, { "epoch": 0.15969062784349408, "grad_norm": 0.7145502784859615, "learning_rate": 9.974995501511404e-06, "loss": 0.0567, "step": 351 }, { "epoch": 0.16014558689717925, "grad_norm": 1.0825694007542435, "learning_rate": 9.974852534607506e-06, "loss": 0.0897, "step": 352 }, { "epoch": 0.16060054595086443, "grad_norm": 0.8874195306329471, "learning_rate": 9.974709161181126e-06, "loss": 0.0879, "step": 353 }, { "epoch": 0.1610555050045496, "grad_norm": 0.8193025449594961, "learning_rate": 9.974565381243982e-06, "loss": 0.0969, "step": 354 }, { "epoch": 0.16151046405823477, "grad_norm": 0.76528422131405, "learning_rate": 9.974421194807815e-06, "loss": 0.0786, "step": 355 }, { "epoch": 0.16196542311191992, "grad_norm": 0.8836543328533641, "learning_rate": 9.974276601884416e-06, "loss": 0.0744, "step": 356 }, { "epoch": 0.1624203821656051, "grad_norm": 0.7482952108426273, "learning_rate": 9.974131602485596e-06, "loss": 0.0772, "step": 357 }, { "epoch": 0.16287534121929026, "grad_norm": 0.9122723647083647, "learning_rate": 9.973986196623203e-06, "loss": 0.0851, "step": 358 }, { "epoch": 0.16333030027297543, "grad_norm": 0.8373653902978805, "learning_rate": 9.973840384309121e-06, "loss": 0.0865, "step": 359 }, { "epoch": 0.1637852593266606, "grad_norm": 0.6360069343077157, "learning_rate": 9.973694165555264e-06, "loss": 0.0618, "step": 360 }, { "epoch": 0.16424021838034578, "grad_norm": 0.7967304456611868, "learning_rate": 9.973547540373582e-06, "loss": 0.0865, "step": 361 }, { "epoch": 0.16469517743403095, "grad_norm": 1.1699452577832765, "learning_rate": 9.973400508776054e-06, "loss": 0.1144, "step": 362 }, { "epoch": 0.1651501364877161, "grad_norm": 0.6282867599706373, "learning_rate": 9.973253070774698e-06, "loss": 0.0633, "step": 363 }, { "epoch": 0.16560509554140126, "grad_norm": 0.79942272506218, "learning_rate": 9.973105226381559e-06, "loss": 0.069, "step": 364 }, { "epoch": 0.16606005459508644, "grad_norm": 0.9348674828410355, "learning_rate": 9.972956975608719e-06, "loss": 0.1019, "step": 365 }, { "epoch": 0.1665150136487716, "grad_norm": 1.0942665884463076, "learning_rate": 9.972808318468292e-06, "loss": 0.0859, "step": 366 }, { "epoch": 0.16696997270245678, "grad_norm": 0.6283579225277517, "learning_rate": 9.972659254972426e-06, "loss": 0.0589, "step": 367 }, { "epoch": 0.16742493175614195, "grad_norm": 1.0989677054167046, "learning_rate": 9.972509785133304e-06, "loss": 0.1081, "step": 368 }, { "epoch": 0.16787989080982713, "grad_norm": 0.7310198219540203, "learning_rate": 9.972359908963137e-06, "loss": 0.0675, "step": 369 }, { "epoch": 0.16833484986351227, "grad_norm": 0.757671629194488, "learning_rate": 9.972209626474172e-06, "loss": 0.0734, "step": 370 }, { "epoch": 0.16878980891719744, "grad_norm": 0.7966175159886519, "learning_rate": 9.972058937678692e-06, "loss": 0.075, "step": 371 }, { "epoch": 0.16924476797088261, "grad_norm": 0.9805514159267839, "learning_rate": 9.97190784258901e-06, "loss": 0.1071, "step": 372 }, { "epoch": 0.1696997270245678, "grad_norm": 0.7000612574442994, "learning_rate": 9.971756341217471e-06, "loss": 0.0526, "step": 373 }, { "epoch": 0.17015468607825296, "grad_norm": 0.7917466702374949, "learning_rate": 9.971604433576456e-06, "loss": 0.0698, "step": 374 }, { "epoch": 0.17060964513193813, "grad_norm": 0.8412692631182211, "learning_rate": 9.97145211967838e-06, "loss": 0.0783, "step": 375 }, { "epoch": 0.1710646041856233, "grad_norm": 0.5615038895232536, "learning_rate": 9.971299399535685e-06, "loss": 0.053, "step": 376 }, { "epoch": 0.17151956323930848, "grad_norm": 0.6849745369298482, "learning_rate": 9.971146273160854e-06, "loss": 0.0774, "step": 377 }, { "epoch": 0.17197452229299362, "grad_norm": 0.6466596777060115, "learning_rate": 9.9709927405664e-06, "loss": 0.0606, "step": 378 }, { "epoch": 0.1724294813466788, "grad_norm": 0.7169884074840761, "learning_rate": 9.970838801764866e-06, "loss": 0.0839, "step": 379 }, { "epoch": 0.17288444040036396, "grad_norm": 0.9393396355410675, "learning_rate": 9.970684456768836e-06, "loss": 0.1132, "step": 380 }, { "epoch": 0.17333939945404914, "grad_norm": 12.197098173453568, "learning_rate": 9.970529705590918e-06, "loss": 0.4858, "step": 381 }, { "epoch": 0.1737943585077343, "grad_norm": 0.7355841274771772, "learning_rate": 9.97037454824376e-06, "loss": 0.0714, "step": 382 }, { "epoch": 0.17424931756141948, "grad_norm": 1.050385265783733, "learning_rate": 9.97021898474004e-06, "loss": 0.1024, "step": 383 }, { "epoch": 0.17470427661510465, "grad_norm": 0.8612087678995594, "learning_rate": 9.970063015092469e-06, "loss": 0.085, "step": 384 }, { "epoch": 0.1751592356687898, "grad_norm": 1.3886472100476919, "learning_rate": 9.969906639313793e-06, "loss": 0.1212, "step": 385 }, { "epoch": 0.17561419472247497, "grad_norm": 0.8238176964814595, "learning_rate": 9.96974985741679e-06, "loss": 0.0721, "step": 386 }, { "epoch": 0.17606915377616014, "grad_norm": 0.8718897735731601, "learning_rate": 9.969592669414272e-06, "loss": 0.0959, "step": 387 }, { "epoch": 0.17652411282984531, "grad_norm": 6.796752422837202, "learning_rate": 9.969435075319083e-06, "loss": 0.115, "step": 388 }, { "epoch": 0.1769790718835305, "grad_norm": 0.58176536820322, "learning_rate": 9.969277075144104e-06, "loss": 0.0459, "step": 389 }, { "epoch": 0.17743403093721566, "grad_norm": 0.7267253435076165, "learning_rate": 9.969118668902242e-06, "loss": 0.07, "step": 390 }, { "epoch": 0.17788898999090083, "grad_norm": 0.7682389367523258, "learning_rate": 9.968959856606442e-06, "loss": 0.0542, "step": 391 }, { "epoch": 0.17834394904458598, "grad_norm": 0.7873348185837048, "learning_rate": 9.968800638269682e-06, "loss": 0.0598, "step": 392 }, { "epoch": 0.17879890809827115, "grad_norm": 1.287713292390112, "learning_rate": 9.968641013904974e-06, "loss": 0.1442, "step": 393 }, { "epoch": 0.17925386715195632, "grad_norm": 1.085650814952146, "learning_rate": 9.968480983525359e-06, "loss": 0.0926, "step": 394 }, { "epoch": 0.1797088262056415, "grad_norm": 0.6716676596759695, "learning_rate": 9.968320547143918e-06, "loss": 0.0767, "step": 395 }, { "epoch": 0.18016378525932666, "grad_norm": 0.8467396807693714, "learning_rate": 9.968159704773757e-06, "loss": 0.0977, "step": 396 }, { "epoch": 0.18061874431301184, "grad_norm": 0.6438855833782786, "learning_rate": 9.967998456428021e-06, "loss": 0.0586, "step": 397 }, { "epoch": 0.181073703366697, "grad_norm": 0.7254140122399564, "learning_rate": 9.967836802119886e-06, "loss": 0.06, "step": 398 }, { "epoch": 0.18152866242038215, "grad_norm": 0.87517545358881, "learning_rate": 9.967674741862563e-06, "loss": 0.1016, "step": 399 }, { "epoch": 0.18198362147406733, "grad_norm": 1.0624206936058178, "learning_rate": 9.967512275669294e-06, "loss": 0.1296, "step": 400 }, { "epoch": 0.1824385805277525, "grad_norm": 1.0284720738314184, "learning_rate": 9.967349403553353e-06, "loss": 0.0862, "step": 401 }, { "epoch": 0.18289353958143767, "grad_norm": 0.8342932737384292, "learning_rate": 9.967186125528053e-06, "loss": 0.0873, "step": 402 }, { "epoch": 0.18334849863512284, "grad_norm": 1.543095569701571, "learning_rate": 9.967022441606734e-06, "loss": 0.1209, "step": 403 }, { "epoch": 0.18380345768880801, "grad_norm": 0.70731586616612, "learning_rate": 9.966858351802773e-06, "loss": 0.0726, "step": 404 }, { "epoch": 0.1842584167424932, "grad_norm": 0.6660531988680356, "learning_rate": 9.966693856129576e-06, "loss": 0.0562, "step": 405 }, { "epoch": 0.18471337579617833, "grad_norm": 0.8503640969928286, "learning_rate": 9.966528954600587e-06, "loss": 0.0838, "step": 406 }, { "epoch": 0.1851683348498635, "grad_norm": 0.6021534124846688, "learning_rate": 9.96636364722928e-06, "loss": 0.0673, "step": 407 }, { "epoch": 0.18562329390354868, "grad_norm": 0.8782816795828058, "learning_rate": 9.966197934029165e-06, "loss": 0.0845, "step": 408 }, { "epoch": 0.18607825295723385, "grad_norm": 0.9030990654346936, "learning_rate": 9.966031815013781e-06, "loss": 0.0839, "step": 409 }, { "epoch": 0.18653321201091902, "grad_norm": 0.8567507299712805, "learning_rate": 9.965865290196703e-06, "loss": 0.0935, "step": 410 }, { "epoch": 0.1869881710646042, "grad_norm": 0.8099856489670021, "learning_rate": 9.96569835959154e-06, "loss": 0.0747, "step": 411 }, { "epoch": 0.18744313011828936, "grad_norm": 0.8938878675243255, "learning_rate": 9.965531023211931e-06, "loss": 0.0854, "step": 412 }, { "epoch": 0.18789808917197454, "grad_norm": 0.735313860104022, "learning_rate": 9.965363281071551e-06, "loss": 0.0865, "step": 413 }, { "epoch": 0.18835304822565968, "grad_norm": 0.5495229598132649, "learning_rate": 9.965195133184108e-06, "loss": 0.0403, "step": 414 }, { "epoch": 0.18880800727934485, "grad_norm": 1.0700416713113117, "learning_rate": 9.965026579563342e-06, "loss": 0.1086, "step": 415 }, { "epoch": 0.18926296633303002, "grad_norm": 0.7118653717355078, "learning_rate": 9.964857620223024e-06, "loss": 0.0691, "step": 416 }, { "epoch": 0.1897179253867152, "grad_norm": 0.6871481686027417, "learning_rate": 9.964688255176963e-06, "loss": 0.0667, "step": 417 }, { "epoch": 0.19017288444040037, "grad_norm": 0.9848841869658392, "learning_rate": 9.964518484438998e-06, "loss": 0.0813, "step": 418 }, { "epoch": 0.19062784349408554, "grad_norm": 0.6311750922074311, "learning_rate": 9.964348308023001e-06, "loss": 0.0592, "step": 419 }, { "epoch": 0.1910828025477707, "grad_norm": 0.7813168734245782, "learning_rate": 9.964177725942881e-06, "loss": 0.0826, "step": 420 }, { "epoch": 0.19153776160145586, "grad_norm": 0.8572110622332836, "learning_rate": 9.964006738212574e-06, "loss": 0.0853, "step": 421 }, { "epoch": 0.19199272065514103, "grad_norm": 0.5304433423014596, "learning_rate": 9.963835344846056e-06, "loss": 0.048, "step": 422 }, { "epoch": 0.1924476797088262, "grad_norm": 0.7598521228122416, "learning_rate": 9.963663545857328e-06, "loss": 0.0757, "step": 423 }, { "epoch": 0.19290263876251137, "grad_norm": 1.1542546683489703, "learning_rate": 9.963491341260432e-06, "loss": 0.104, "step": 424 }, { "epoch": 0.19335759781619655, "grad_norm": 0.7766563582253432, "learning_rate": 9.963318731069437e-06, "loss": 0.0952, "step": 425 }, { "epoch": 0.19381255686988172, "grad_norm": 1.1319194983916299, "learning_rate": 9.96314571529845e-06, "loss": 0.1005, "step": 426 }, { "epoch": 0.1942675159235669, "grad_norm": 0.7230559135257585, "learning_rate": 9.962972293961608e-06, "loss": 0.0647, "step": 427 }, { "epoch": 0.19472247497725204, "grad_norm": 0.9863934566369588, "learning_rate": 9.962798467073083e-06, "loss": 0.0763, "step": 428 }, { "epoch": 0.1951774340309372, "grad_norm": 0.8259784410005646, "learning_rate": 9.96262423464708e-06, "loss": 0.087, "step": 429 }, { "epoch": 0.19563239308462238, "grad_norm": 0.7987139095182185, "learning_rate": 9.962449596697834e-06, "loss": 0.0671, "step": 430 }, { "epoch": 0.19608735213830755, "grad_norm": 1.130208173229934, "learning_rate": 9.962274553239619e-06, "loss": 0.119, "step": 431 }, { "epoch": 0.19654231119199272, "grad_norm": 0.7399696243677417, "learning_rate": 9.962099104286735e-06, "loss": 0.064, "step": 432 }, { "epoch": 0.1969972702456779, "grad_norm": 1.156015767405528, "learning_rate": 9.961923249853523e-06, "loss": 0.1102, "step": 433 }, { "epoch": 0.19745222929936307, "grad_norm": 0.972422739757894, "learning_rate": 9.961746989954349e-06, "loss": 0.1093, "step": 434 }, { "epoch": 0.1979071883530482, "grad_norm": 0.7766700420403171, "learning_rate": 9.96157032460362e-06, "loss": 0.0655, "step": 435 }, { "epoch": 0.19836214740673339, "grad_norm": 0.7460679115751414, "learning_rate": 9.961393253815767e-06, "loss": 0.0751, "step": 436 }, { "epoch": 0.19881710646041856, "grad_norm": 1.0684214450487566, "learning_rate": 9.961215777605266e-06, "loss": 0.0789, "step": 437 }, { "epoch": 0.19927206551410373, "grad_norm": 0.7683994291392229, "learning_rate": 9.961037895986615e-06, "loss": 0.0849, "step": 438 }, { "epoch": 0.1997270245677889, "grad_norm": 0.7270368453251704, "learning_rate": 9.960859608974352e-06, "loss": 0.0779, "step": 439 }, { "epoch": 0.20018198362147407, "grad_norm": 0.701460207303568, "learning_rate": 9.960680916583042e-06, "loss": 0.0639, "step": 440 }, { "epoch": 0.20063694267515925, "grad_norm": 0.6784619280926262, "learning_rate": 9.960501818827292e-06, "loss": 0.077, "step": 441 }, { "epoch": 0.2010919017288444, "grad_norm": 0.8064075868568972, "learning_rate": 9.960322315721735e-06, "loss": 0.0827, "step": 442 }, { "epoch": 0.20154686078252956, "grad_norm": 0.9155026735417204, "learning_rate": 9.960142407281039e-06, "loss": 0.0841, "step": 443 }, { "epoch": 0.20200181983621474, "grad_norm": 0.6167749294869733, "learning_rate": 9.959962093519904e-06, "loss": 0.054, "step": 444 }, { "epoch": 0.2024567788898999, "grad_norm": 0.8127781985331358, "learning_rate": 9.959781374453066e-06, "loss": 0.0751, "step": 445 }, { "epoch": 0.20291173794358508, "grad_norm": 0.98306444688532, "learning_rate": 9.959600250095294e-06, "loss": 0.075, "step": 446 }, { "epoch": 0.20336669699727025, "grad_norm": 0.7982130269360888, "learning_rate": 9.959418720461384e-06, "loss": 0.0834, "step": 447 }, { "epoch": 0.20382165605095542, "grad_norm": 0.7862225023823932, "learning_rate": 9.959236785566175e-06, "loss": 0.0704, "step": 448 }, { "epoch": 0.20427661510464057, "grad_norm": 0.562107514296544, "learning_rate": 9.959054445424532e-06, "loss": 0.0644, "step": 449 }, { "epoch": 0.20473157415832574, "grad_norm": 0.6089607791855781, "learning_rate": 9.958871700051353e-06, "loss": 0.0512, "step": 450 }, { "epoch": 0.2051865332120109, "grad_norm": 0.6962095067981563, "learning_rate": 9.958688549461573e-06, "loss": 0.0712, "step": 451 }, { "epoch": 0.20564149226569609, "grad_norm": 1.155217046291275, "learning_rate": 9.958504993670158e-06, "loss": 0.1049, "step": 452 }, { "epoch": 0.20609645131938126, "grad_norm": 1.0913314226134752, "learning_rate": 9.958321032692107e-06, "loss": 0.1226, "step": 453 }, { "epoch": 0.20655141037306643, "grad_norm": 22.735025633907238, "learning_rate": 9.958136666542455e-06, "loss": 0.8419, "step": 454 }, { "epoch": 0.2070063694267516, "grad_norm": 1.184019553325164, "learning_rate": 9.957951895236262e-06, "loss": 0.1113, "step": 455 }, { "epoch": 0.20746132848043677, "grad_norm": 0.7664792046331882, "learning_rate": 9.957766718788632e-06, "loss": 0.104, "step": 456 }, { "epoch": 0.20791628753412192, "grad_norm": 0.8672883026786035, "learning_rate": 9.957581137214695e-06, "loss": 0.074, "step": 457 }, { "epoch": 0.2083712465878071, "grad_norm": 0.8772220264781722, "learning_rate": 9.957395150529615e-06, "loss": 0.0986, "step": 458 }, { "epoch": 0.20882620564149226, "grad_norm": 0.7016331971826193, "learning_rate": 9.95720875874859e-06, "loss": 0.0752, "step": 459 }, { "epoch": 0.20928116469517744, "grad_norm": 0.6308822051977305, "learning_rate": 9.957021961886855e-06, "loss": 0.0608, "step": 460 }, { "epoch": 0.2097361237488626, "grad_norm": 0.9803601042372939, "learning_rate": 9.956834759959669e-06, "loss": 0.0908, "step": 461 }, { "epoch": 0.21019108280254778, "grad_norm": 0.7674462109758159, "learning_rate": 9.95664715298233e-06, "loss": 0.074, "step": 462 }, { "epoch": 0.21064604185623295, "grad_norm": 0.7450186566335193, "learning_rate": 9.95645914097017e-06, "loss": 0.0817, "step": 463 }, { "epoch": 0.2111010009099181, "grad_norm": 0.7225723661612439, "learning_rate": 9.956270723938553e-06, "loss": 0.0849, "step": 464 }, { "epoch": 0.21155595996360327, "grad_norm": 0.7190355211871646, "learning_rate": 9.956081901902875e-06, "loss": 0.0748, "step": 465 }, { "epoch": 0.21201091901728844, "grad_norm": 1.210684562087392, "learning_rate": 9.955892674878565e-06, "loss": 0.1272, "step": 466 }, { "epoch": 0.2124658780709736, "grad_norm": 0.834170476650907, "learning_rate": 9.955703042881087e-06, "loss": 0.0992, "step": 467 }, { "epoch": 0.21292083712465878, "grad_norm": 0.874478173291907, "learning_rate": 9.955513005925934e-06, "loss": 0.0858, "step": 468 }, { "epoch": 0.21337579617834396, "grad_norm": 0.5510320150423565, "learning_rate": 9.95532256402864e-06, "loss": 0.0574, "step": 469 }, { "epoch": 0.21383075523202913, "grad_norm": 0.5657171871822584, "learning_rate": 9.955131717204762e-06, "loss": 0.0671, "step": 470 }, { "epoch": 0.21428571428571427, "grad_norm": 0.7564664653864259, "learning_rate": 9.954940465469898e-06, "loss": 0.085, "step": 471 }, { "epoch": 0.21474067333939945, "grad_norm": 0.7594501005901694, "learning_rate": 9.954748808839675e-06, "loss": 0.0733, "step": 472 }, { "epoch": 0.21519563239308462, "grad_norm": 0.6748092428366178, "learning_rate": 9.954556747329754e-06, "loss": 0.0707, "step": 473 }, { "epoch": 0.2156505914467698, "grad_norm": 1.715089789819449, "learning_rate": 9.954364280955832e-06, "loss": 0.1045, "step": 474 }, { "epoch": 0.21610555050045496, "grad_norm": 0.6668751648778155, "learning_rate": 9.954171409733634e-06, "loss": 0.0573, "step": 475 }, { "epoch": 0.21656050955414013, "grad_norm": 0.5963716475430643, "learning_rate": 9.95397813367892e-06, "loss": 0.0752, "step": 476 }, { "epoch": 0.2170154686078253, "grad_norm": 0.9917190233932158, "learning_rate": 9.953784452807487e-06, "loss": 0.1049, "step": 477 }, { "epoch": 0.21747042766151045, "grad_norm": 0.5638529401686616, "learning_rate": 9.953590367135159e-06, "loss": 0.0547, "step": 478 }, { "epoch": 0.21792538671519562, "grad_norm": 0.6477110515460727, "learning_rate": 9.953395876677796e-06, "loss": 0.0564, "step": 479 }, { "epoch": 0.2183803457688808, "grad_norm": 0.5492055118574499, "learning_rate": 9.95320098145129e-06, "loss": 0.0505, "step": 480 }, { "epoch": 0.21883530482256597, "grad_norm": 0.8954528378372288, "learning_rate": 9.95300568147157e-06, "loss": 0.126, "step": 481 }, { "epoch": 0.21929026387625114, "grad_norm": 0.6155736143826033, "learning_rate": 9.952809976754593e-06, "loss": 0.0518, "step": 482 }, { "epoch": 0.2197452229299363, "grad_norm": 1.1486004986445648, "learning_rate": 9.952613867316351e-06, "loss": 0.1142, "step": 483 }, { "epoch": 0.22020018198362148, "grad_norm": 0.8236924325360948, "learning_rate": 9.95241735317287e-06, "loss": 0.1047, "step": 484 }, { "epoch": 0.22065514103730663, "grad_norm": 0.832372102653505, "learning_rate": 9.952220434340209e-06, "loss": 0.0729, "step": 485 }, { "epoch": 0.2211101000909918, "grad_norm": 0.7288716722109786, "learning_rate": 9.952023110834456e-06, "loss": 0.068, "step": 486 }, { "epoch": 0.22156505914467697, "grad_norm": 0.5327254294033283, "learning_rate": 9.951825382671739e-06, "loss": 0.0614, "step": 487 }, { "epoch": 0.22202001819836215, "grad_norm": 0.7204991379763186, "learning_rate": 9.951627249868213e-06, "loss": 0.0666, "step": 488 }, { "epoch": 0.22247497725204732, "grad_norm": 0.7485835393026234, "learning_rate": 9.95142871244007e-06, "loss": 0.068, "step": 489 }, { "epoch": 0.2229299363057325, "grad_norm": 0.45602532896445397, "learning_rate": 9.951229770403531e-06, "loss": 0.0414, "step": 490 }, { "epoch": 0.22338489535941766, "grad_norm": 0.7240661348572547, "learning_rate": 9.951030423774858e-06, "loss": 0.0798, "step": 491 }, { "epoch": 0.22383985441310283, "grad_norm": 0.7716352477687572, "learning_rate": 9.950830672570337e-06, "loss": 0.071, "step": 492 }, { "epoch": 0.22429481346678798, "grad_norm": 1.22677184750836, "learning_rate": 9.95063051680629e-06, "loss": 0.1373, "step": 493 }, { "epoch": 0.22474977252047315, "grad_norm": 0.7365431233953595, "learning_rate": 9.950429956499074e-06, "loss": 0.0699, "step": 494 }, { "epoch": 0.22520473157415832, "grad_norm": 0.705654951368504, "learning_rate": 9.950228991665078e-06, "loss": 0.0741, "step": 495 }, { "epoch": 0.2256596906278435, "grad_norm": 0.8261497906057415, "learning_rate": 9.950027622320724e-06, "loss": 0.0764, "step": 496 }, { "epoch": 0.22611464968152867, "grad_norm": 0.9965395262255518, "learning_rate": 9.949825848482465e-06, "loss": 0.0852, "step": 497 }, { "epoch": 0.22656960873521384, "grad_norm": 0.6807161957389707, "learning_rate": 9.949623670166794e-06, "loss": 0.074, "step": 498 }, { "epoch": 0.227024567788899, "grad_norm": 1.1216390709095547, "learning_rate": 9.949421087390228e-06, "loss": 0.0931, "step": 499 }, { "epoch": 0.22747952684258416, "grad_norm": 1.1278655216416786, "learning_rate": 9.949218100169322e-06, "loss": 0.1177, "step": 500 }, { "epoch": 0.22793448589626933, "grad_norm": 0.9160591457448575, "learning_rate": 9.949014708520664e-06, "loss": 0.1015, "step": 501 }, { "epoch": 0.2283894449499545, "grad_norm": 0.9377363057118697, "learning_rate": 9.948810912460872e-06, "loss": 0.1059, "step": 502 }, { "epoch": 0.22884440400363967, "grad_norm": 0.8760932101779023, "learning_rate": 9.948606712006601e-06, "loss": 0.0812, "step": 503 }, { "epoch": 0.22929936305732485, "grad_norm": 0.6962605051289937, "learning_rate": 9.948402107174537e-06, "loss": 0.0735, "step": 504 }, { "epoch": 0.22975432211101002, "grad_norm": 0.6501265713488487, "learning_rate": 9.948197097981401e-06, "loss": 0.0551, "step": 505 }, { "epoch": 0.2302092811646952, "grad_norm": 1.2156011775652311, "learning_rate": 9.947991684443942e-06, "loss": 0.1066, "step": 506 }, { "epoch": 0.23066424021838033, "grad_norm": 0.9679794435610901, "learning_rate": 9.947785866578951e-06, "loss": 0.0981, "step": 507 }, { "epoch": 0.2311191992720655, "grad_norm": 0.7195724631231237, "learning_rate": 9.94757964440324e-06, "loss": 0.0777, "step": 508 }, { "epoch": 0.23157415832575068, "grad_norm": 0.549427502610929, "learning_rate": 9.947373017933665e-06, "loss": 0.0516, "step": 509 }, { "epoch": 0.23202911737943585, "grad_norm": 0.5667212336170355, "learning_rate": 9.947165987187108e-06, "loss": 0.0583, "step": 510 }, { "epoch": 0.23248407643312102, "grad_norm": 0.6638127935874616, "learning_rate": 9.946958552180489e-06, "loss": 0.0723, "step": 511 }, { "epoch": 0.2329390354868062, "grad_norm": 0.5226768129517959, "learning_rate": 9.946750712930756e-06, "loss": 0.0482, "step": 512 }, { "epoch": 0.23339399454049137, "grad_norm": 0.8358986518129136, "learning_rate": 9.946542469454894e-06, "loss": 0.1037, "step": 513 }, { "epoch": 0.2338489535941765, "grad_norm": 0.6695809647699968, "learning_rate": 9.94633382176992e-06, "loss": 0.0728, "step": 514 }, { "epoch": 0.23430391264786168, "grad_norm": 1.0608546974350634, "learning_rate": 9.946124769892884e-06, "loss": 0.1192, "step": 515 }, { "epoch": 0.23475887170154686, "grad_norm": 0.5090717025630993, "learning_rate": 9.945915313840869e-06, "loss": 0.0612, "step": 516 }, { "epoch": 0.23521383075523203, "grad_norm": 0.8105130307542814, "learning_rate": 9.94570545363099e-06, "loss": 0.0838, "step": 517 }, { "epoch": 0.2356687898089172, "grad_norm": 0.7752986876049957, "learning_rate": 9.945495189280394e-06, "loss": 0.092, "step": 518 }, { "epoch": 0.23612374886260237, "grad_norm": 0.869801315379322, "learning_rate": 9.945284520806267e-06, "loss": 0.077, "step": 519 }, { "epoch": 0.23657870791628755, "grad_norm": 0.5427153243822386, "learning_rate": 9.94507344822582e-06, "loss": 0.0592, "step": 520 }, { "epoch": 0.2370336669699727, "grad_norm": 0.7368670007832758, "learning_rate": 9.944861971556305e-06, "loss": 0.0608, "step": 521 }, { "epoch": 0.23748862602365786, "grad_norm": 0.8141430793460733, "learning_rate": 9.944650090814998e-06, "loss": 0.0616, "step": 522 }, { "epoch": 0.23794358507734303, "grad_norm": 2.1096588720516425, "learning_rate": 9.944437806019216e-06, "loss": 0.0938, "step": 523 }, { "epoch": 0.2383985441310282, "grad_norm": 0.7014907085161215, "learning_rate": 9.944225117186306e-06, "loss": 0.0812, "step": 524 }, { "epoch": 0.23885350318471338, "grad_norm": 0.5078467158211916, "learning_rate": 9.944012024333647e-06, "loss": 0.0561, "step": 525 }, { "epoch": 0.23930846223839855, "grad_norm": 0.6379031604907951, "learning_rate": 9.943798527478652e-06, "loss": 0.0678, "step": 526 }, { "epoch": 0.23976342129208372, "grad_norm": 0.799876019099874, "learning_rate": 9.943584626638768e-06, "loss": 0.0914, "step": 527 }, { "epoch": 0.24021838034576887, "grad_norm": 0.6550229607349646, "learning_rate": 9.943370321831474e-06, "loss": 0.0668, "step": 528 }, { "epoch": 0.24067333939945404, "grad_norm": 0.767534839542607, "learning_rate": 9.943155613074279e-06, "loss": 0.0711, "step": 529 }, { "epoch": 0.2411282984531392, "grad_norm": 0.7571838990000624, "learning_rate": 9.942940500384733e-06, "loss": 0.0893, "step": 530 }, { "epoch": 0.24158325750682438, "grad_norm": 17.807000846945513, "learning_rate": 9.942724983780409e-06, "loss": 0.3419, "step": 531 }, { "epoch": 0.24203821656050956, "grad_norm": 1.2088422410181228, "learning_rate": 9.942509063278922e-06, "loss": 0.1173, "step": 532 }, { "epoch": 0.24249317561419473, "grad_norm": 0.8811842157145667, "learning_rate": 9.942292738897914e-06, "loss": 0.1006, "step": 533 }, { "epoch": 0.2429481346678799, "grad_norm": 0.7726281786442553, "learning_rate": 9.942076010655063e-06, "loss": 0.0909, "step": 534 }, { "epoch": 0.24340309372156507, "grad_norm": 0.9942256398778268, "learning_rate": 9.941858878568078e-06, "loss": 0.134, "step": 535 }, { "epoch": 0.24385805277525022, "grad_norm": 1.001596627292525, "learning_rate": 9.941641342654702e-06, "loss": 0.0977, "step": 536 }, { "epoch": 0.2443130118289354, "grad_norm": 0.5064863363900076, "learning_rate": 9.941423402932713e-06, "loss": 0.0559, "step": 537 }, { "epoch": 0.24476797088262056, "grad_norm": 0.8589680374278897, "learning_rate": 9.94120505941992e-06, "loss": 0.0992, "step": 538 }, { "epoch": 0.24522292993630573, "grad_norm": 0.7830880681851201, "learning_rate": 9.940986312134162e-06, "loss": 0.0825, "step": 539 }, { "epoch": 0.2456778889899909, "grad_norm": 0.5778344550660577, "learning_rate": 9.940767161093316e-06, "loss": 0.0637, "step": 540 }, { "epoch": 0.24613284804367608, "grad_norm": 0.8661775200374767, "learning_rate": 9.94054760631529e-06, "loss": 0.0958, "step": 541 }, { "epoch": 0.24658780709736125, "grad_norm": 0.6976226834296251, "learning_rate": 9.940327647818026e-06, "loss": 0.0752, "step": 542 }, { "epoch": 0.2470427661510464, "grad_norm": 0.7530160135685138, "learning_rate": 9.940107285619495e-06, "loss": 0.077, "step": 543 }, { "epoch": 0.24749772520473157, "grad_norm": 0.7997106896354084, "learning_rate": 9.939886519737707e-06, "loss": 0.0958, "step": 544 }, { "epoch": 0.24795268425841674, "grad_norm": 0.8918061918047896, "learning_rate": 9.939665350190702e-06, "loss": 0.0822, "step": 545 }, { "epoch": 0.2484076433121019, "grad_norm": 0.804115756264787, "learning_rate": 9.93944377699655e-06, "loss": 0.0915, "step": 546 }, { "epoch": 0.24886260236578708, "grad_norm": 0.6234057941022288, "learning_rate": 9.93922180017336e-06, "loss": 0.0672, "step": 547 }, { "epoch": 0.24931756141947226, "grad_norm": 0.8269450754551354, "learning_rate": 9.93899941973927e-06, "loss": 0.1102, "step": 548 }, { "epoch": 0.24977252047315743, "grad_norm": 0.9233841316663005, "learning_rate": 9.93877663571245e-06, "loss": 0.0963, "step": 549 }, { "epoch": 0.2502274795268426, "grad_norm": 0.9944861568923805, "learning_rate": 9.938553448111108e-06, "loss": 0.1127, "step": 550 }, { "epoch": 0.25068243858052774, "grad_norm": 0.8423641298780182, "learning_rate": 9.938329856953482e-06, "loss": 0.0788, "step": 551 }, { "epoch": 0.25113739763421294, "grad_norm": 0.8124861649110975, "learning_rate": 9.938105862257839e-06, "loss": 0.0831, "step": 552 }, { "epoch": 0.2515923566878981, "grad_norm": 0.6612222253979325, "learning_rate": 9.937881464042485e-06, "loss": 0.0703, "step": 553 }, { "epoch": 0.25204731574158323, "grad_norm": 0.854447666921162, "learning_rate": 9.937656662325759e-06, "loss": 0.1074, "step": 554 }, { "epoch": 0.25250227479526843, "grad_norm": 0.74521770368624, "learning_rate": 9.937431457126028e-06, "loss": 0.0777, "step": 555 }, { "epoch": 0.2529572338489536, "grad_norm": 0.5044600553216889, "learning_rate": 9.937205848461694e-06, "loss": 0.0482, "step": 556 }, { "epoch": 0.2534121929026388, "grad_norm": 1.0949051966397356, "learning_rate": 9.936979836351197e-06, "loss": 0.0945, "step": 557 }, { "epoch": 0.2538671519563239, "grad_norm": 1.0332199252594778, "learning_rate": 9.936753420813003e-06, "loss": 0.092, "step": 558 }, { "epoch": 0.2543221110100091, "grad_norm": 0.7029577630748303, "learning_rate": 9.936526601865612e-06, "loss": 0.0612, "step": 559 }, { "epoch": 0.25477707006369427, "grad_norm": 0.5251640812064944, "learning_rate": 9.936299379527561e-06, "loss": 0.0569, "step": 560 }, { "epoch": 0.2552320291173794, "grad_norm": 0.6689496924283664, "learning_rate": 9.936071753817416e-06, "loss": 0.0831, "step": 561 }, { "epoch": 0.2556869881710646, "grad_norm": 0.8094390650978945, "learning_rate": 9.935843724753778e-06, "loss": 0.0897, "step": 562 }, { "epoch": 0.25614194722474976, "grad_norm": 0.9168849457874456, "learning_rate": 9.935615292355283e-06, "loss": 0.1002, "step": 563 }, { "epoch": 0.25659690627843496, "grad_norm": 0.8829987760246157, "learning_rate": 9.935386456640593e-06, "loss": 0.0997, "step": 564 }, { "epoch": 0.2570518653321201, "grad_norm": 0.9381858557170412, "learning_rate": 9.93515721762841e-06, "loss": 0.0926, "step": 565 }, { "epoch": 0.2575068243858053, "grad_norm": 0.6555630906162114, "learning_rate": 9.934927575337469e-06, "loss": 0.0805, "step": 566 }, { "epoch": 0.25796178343949044, "grad_norm": 0.49897284031908906, "learning_rate": 9.93469752978653e-06, "loss": 0.0545, "step": 567 }, { "epoch": 0.2584167424931756, "grad_norm": 0.8528689809178094, "learning_rate": 9.934467080994394e-06, "loss": 0.071, "step": 568 }, { "epoch": 0.2588717015468608, "grad_norm": 0.7999188284583189, "learning_rate": 9.934236228979893e-06, "loss": 0.0675, "step": 569 }, { "epoch": 0.25932666060054593, "grad_norm": 0.6603615540899209, "learning_rate": 9.934004973761888e-06, "loss": 0.0584, "step": 570 }, { "epoch": 0.25978161965423113, "grad_norm": 0.907545218090885, "learning_rate": 9.933773315359281e-06, "loss": 0.0912, "step": 571 }, { "epoch": 0.2602365787079163, "grad_norm": 1.2225854103436529, "learning_rate": 9.933541253790998e-06, "loss": 0.0996, "step": 572 }, { "epoch": 0.2606915377616015, "grad_norm": 0.821182112953313, "learning_rate": 9.933308789076004e-06, "loss": 0.0886, "step": 573 }, { "epoch": 0.2611464968152866, "grad_norm": 0.5608593716975471, "learning_rate": 9.933075921233292e-06, "loss": 0.0597, "step": 574 }, { "epoch": 0.26160145586897177, "grad_norm": 0.977094581221023, "learning_rate": 9.932842650281897e-06, "loss": 0.0796, "step": 575 }, { "epoch": 0.26205641492265697, "grad_norm": 1.0086738407073246, "learning_rate": 9.932608976240875e-06, "loss": 0.1245, "step": 576 }, { "epoch": 0.2625113739763421, "grad_norm": 0.7841605184531412, "learning_rate": 9.932374899129323e-06, "loss": 0.0798, "step": 577 }, { "epoch": 0.2629663330300273, "grad_norm": 0.6360279282536222, "learning_rate": 9.932140418966369e-06, "loss": 0.0714, "step": 578 }, { "epoch": 0.26342129208371245, "grad_norm": 0.8673569892639119, "learning_rate": 9.931905535771174e-06, "loss": 0.0805, "step": 579 }, { "epoch": 0.26387625113739765, "grad_norm": 1.0489822111787226, "learning_rate": 9.93167024956293e-06, "loss": 0.1046, "step": 580 }, { "epoch": 0.2643312101910828, "grad_norm": 0.5670611684906575, "learning_rate": 9.931434560360864e-06, "loss": 0.0662, "step": 581 }, { "epoch": 0.26478616924476794, "grad_norm": 0.6786486717931198, "learning_rate": 9.931198468184236e-06, "loss": 0.0705, "step": 582 }, { "epoch": 0.26524112829845314, "grad_norm": 0.7580601459978998, "learning_rate": 9.93096197305234e-06, "loss": 0.0852, "step": 583 }, { "epoch": 0.2656960873521383, "grad_norm": 0.8802141056853473, "learning_rate": 9.930725074984498e-06, "loss": 0.0989, "step": 584 }, { "epoch": 0.2661510464058235, "grad_norm": 0.6365186853726369, "learning_rate": 9.930487774000071e-06, "loss": 0.0639, "step": 585 }, { "epoch": 0.26660600545950863, "grad_norm": 0.5301331320559389, "learning_rate": 9.930250070118448e-06, "loss": 0.0628, "step": 586 }, { "epoch": 0.26706096451319383, "grad_norm": 0.6982626314754508, "learning_rate": 9.930011963359055e-06, "loss": 0.071, "step": 587 }, { "epoch": 0.267515923566879, "grad_norm": 1.0151988128038116, "learning_rate": 9.929773453741346e-06, "loss": 0.1074, "step": 588 }, { "epoch": 0.2679708826205642, "grad_norm": 0.809050548171497, "learning_rate": 9.929534541284814e-06, "loss": 0.0715, "step": 589 }, { "epoch": 0.2684258416742493, "grad_norm": 0.8254901916718546, "learning_rate": 9.929295226008981e-06, "loss": 0.0867, "step": 590 }, { "epoch": 0.26888080072793447, "grad_norm": 0.695875393623419, "learning_rate": 9.929055507933403e-06, "loss": 0.0667, "step": 591 }, { "epoch": 0.26933575978161967, "grad_norm": 0.6569370607259161, "learning_rate": 9.928815387077668e-06, "loss": 0.0667, "step": 592 }, { "epoch": 0.2697907188353048, "grad_norm": 0.8509989554819866, "learning_rate": 9.9285748634614e-06, "loss": 0.0964, "step": 593 }, { "epoch": 0.27024567788899, "grad_norm": 0.7743154017799978, "learning_rate": 9.928333937104249e-06, "loss": 0.1008, "step": 594 }, { "epoch": 0.27070063694267515, "grad_norm": 0.6810806452813069, "learning_rate": 9.928092608025905e-06, "loss": 0.0623, "step": 595 }, { "epoch": 0.27115559599636035, "grad_norm": 0.6757764847225584, "learning_rate": 9.927850876246087e-06, "loss": 0.0621, "step": 596 }, { "epoch": 0.2716105550500455, "grad_norm": 0.7561897396028232, "learning_rate": 9.927608741784551e-06, "loss": 0.0769, "step": 597 }, { "epoch": 0.27206551410373064, "grad_norm": 0.9087608421567758, "learning_rate": 9.927366204661081e-06, "loss": 0.1064, "step": 598 }, { "epoch": 0.27252047315741584, "grad_norm": 0.6090969825991095, "learning_rate": 9.927123264895497e-06, "loss": 0.0596, "step": 599 }, { "epoch": 0.272975432211101, "grad_norm": 0.5838273869575724, "learning_rate": 9.926879922507651e-06, "loss": 0.0581, "step": 600 }, { "epoch": 0.2734303912647862, "grad_norm": 41.16319851924577, "learning_rate": 9.926636177517427e-06, "loss": 0.7305, "step": 601 }, { "epoch": 0.27388535031847133, "grad_norm": 0.7159907538362364, "learning_rate": 9.926392029944743e-06, "loss": 0.0655, "step": 602 }, { "epoch": 0.27434030937215653, "grad_norm": 0.6649118967721417, "learning_rate": 9.92614747980955e-06, "loss": 0.0676, "step": 603 }, { "epoch": 0.2747952684258417, "grad_norm": 0.6955588874689645, "learning_rate": 9.92590252713183e-06, "loss": 0.0691, "step": 604 }, { "epoch": 0.2752502274795268, "grad_norm": 1.0093833512385355, "learning_rate": 9.925657171931603e-06, "loss": 0.0788, "step": 605 }, { "epoch": 0.275705186533212, "grad_norm": 0.7222760734094591, "learning_rate": 9.925411414228913e-06, "loss": 0.0765, "step": 606 }, { "epoch": 0.27616014558689717, "grad_norm": 0.7901083190949632, "learning_rate": 9.925165254043846e-06, "loss": 0.0899, "step": 607 }, { "epoch": 0.27661510464058237, "grad_norm": 0.9417411536264935, "learning_rate": 9.924918691396516e-06, "loss": 0.105, "step": 608 }, { "epoch": 0.2770700636942675, "grad_norm": 0.8531576003982281, "learning_rate": 9.924671726307073e-06, "loss": 0.0943, "step": 609 }, { "epoch": 0.2775250227479527, "grad_norm": 0.5771833327707789, "learning_rate": 9.924424358795694e-06, "loss": 0.0649, "step": 610 }, { "epoch": 0.27797998180163785, "grad_norm": 0.6804808150530418, "learning_rate": 9.924176588882597e-06, "loss": 0.0591, "step": 611 }, { "epoch": 0.278434940855323, "grad_norm": 0.6916110773643345, "learning_rate": 9.923928416588027e-06, "loss": 0.082, "step": 612 }, { "epoch": 0.2788898999090082, "grad_norm": 0.7302341341594485, "learning_rate": 9.923679841932261e-06, "loss": 0.0858, "step": 613 }, { "epoch": 0.27934485896269334, "grad_norm": 0.7190514572276734, "learning_rate": 9.923430864935615e-06, "loss": 0.0658, "step": 614 }, { "epoch": 0.27979981801637854, "grad_norm": 0.6872892360375661, "learning_rate": 9.923181485618432e-06, "loss": 0.0639, "step": 615 }, { "epoch": 0.2802547770700637, "grad_norm": 0.6937876338258171, "learning_rate": 9.92293170400109e-06, "loss": 0.0759, "step": 616 }, { "epoch": 0.2807097361237489, "grad_norm": 0.8498928251372749, "learning_rate": 9.922681520104002e-06, "loss": 0.0777, "step": 617 }, { "epoch": 0.28116469517743403, "grad_norm": 0.7409609990217324, "learning_rate": 9.922430933947612e-06, "loss": 0.0665, "step": 618 }, { "epoch": 0.2816196542311192, "grad_norm": 1.2216942184143182, "learning_rate": 9.922179945552393e-06, "loss": 0.1405, "step": 619 }, { "epoch": 0.2820746132848044, "grad_norm": 0.6637234254274302, "learning_rate": 9.921928554938857e-06, "loss": 0.062, "step": 620 }, { "epoch": 0.2825295723384895, "grad_norm": 0.9463087936758936, "learning_rate": 9.921676762127548e-06, "loss": 0.0767, "step": 621 }, { "epoch": 0.2829845313921747, "grad_norm": 1.089309305809361, "learning_rate": 9.921424567139042e-06, "loss": 0.1171, "step": 622 }, { "epoch": 0.28343949044585987, "grad_norm": 0.8752119302288704, "learning_rate": 9.921171969993942e-06, "loss": 0.0813, "step": 623 }, { "epoch": 0.28389444949954507, "grad_norm": 0.7870883299373892, "learning_rate": 9.920918970712894e-06, "loss": 0.0993, "step": 624 }, { "epoch": 0.2843494085532302, "grad_norm": 0.6504873266789636, "learning_rate": 9.92066556931657e-06, "loss": 0.073, "step": 625 }, { "epoch": 0.28480436760691535, "grad_norm": 1.1098031698420505, "learning_rate": 9.920411765825679e-06, "loss": 0.1218, "step": 626 }, { "epoch": 0.28525932666060055, "grad_norm": 1.217844501512982, "learning_rate": 9.920157560260957e-06, "loss": 0.1549, "step": 627 }, { "epoch": 0.2857142857142857, "grad_norm": 0.9728161223416268, "learning_rate": 9.919902952643179e-06, "loss": 0.0984, "step": 628 }, { "epoch": 0.2861692447679709, "grad_norm": 0.5217007184455262, "learning_rate": 9.91964794299315e-06, "loss": 0.0636, "step": 629 }, { "epoch": 0.28662420382165604, "grad_norm": 1.7394407973312302, "learning_rate": 9.919392531331706e-06, "loss": 0.1686, "step": 630 }, { "epoch": 0.28707916287534124, "grad_norm": 0.5702940927618096, "learning_rate": 9.919136717679723e-06, "loss": 0.0465, "step": 631 }, { "epoch": 0.2875341219290264, "grad_norm": 0.5990973378462472, "learning_rate": 9.9188805020581e-06, "loss": 0.0678, "step": 632 }, { "epoch": 0.28798908098271153, "grad_norm": 0.9343816967111115, "learning_rate": 9.918623884487777e-06, "loss": 0.1068, "step": 633 }, { "epoch": 0.28844404003639673, "grad_norm": 0.5997939637509836, "learning_rate": 9.91836686498972e-06, "loss": 0.0629, "step": 634 }, { "epoch": 0.2888989990900819, "grad_norm": 0.8063617612610782, "learning_rate": 9.918109443584938e-06, "loss": 0.0904, "step": 635 }, { "epoch": 0.2893539581437671, "grad_norm": 0.6625405697250593, "learning_rate": 9.917851620294461e-06, "loss": 0.0638, "step": 636 }, { "epoch": 0.2898089171974522, "grad_norm": 0.7423789779714624, "learning_rate": 9.917593395139358e-06, "loss": 0.0714, "step": 637 }, { "epoch": 0.2902638762511374, "grad_norm": 0.6102576569607258, "learning_rate": 9.91733476814073e-06, "loss": 0.0563, "step": 638 }, { "epoch": 0.29071883530482256, "grad_norm": 0.8342620452233175, "learning_rate": 9.91707573931971e-06, "loss": 0.0934, "step": 639 }, { "epoch": 0.2911737943585077, "grad_norm": 0.6397583044633867, "learning_rate": 9.916816308697468e-06, "loss": 0.0608, "step": 640 }, { "epoch": 0.2916287534121929, "grad_norm": 0.7837909798874247, "learning_rate": 9.9165564762952e-06, "loss": 0.0936, "step": 641 }, { "epoch": 0.29208371246587805, "grad_norm": 0.9915309549496408, "learning_rate": 9.916296242134142e-06, "loss": 0.1364, "step": 642 }, { "epoch": 0.29253867151956325, "grad_norm": 0.7722166587924495, "learning_rate": 9.916035606235555e-06, "loss": 0.1022, "step": 643 }, { "epoch": 0.2929936305732484, "grad_norm": 0.6446192951972597, "learning_rate": 9.915774568620739e-06, "loss": 0.0794, "step": 644 }, { "epoch": 0.2934485896269336, "grad_norm": 0.7655996282008942, "learning_rate": 9.915513129311025e-06, "loss": 0.083, "step": 645 }, { "epoch": 0.29390354868061874, "grad_norm": 0.7358761993420325, "learning_rate": 9.915251288327776e-06, "loss": 0.0927, "step": 646 }, { "epoch": 0.2943585077343039, "grad_norm": 0.8417441236168001, "learning_rate": 9.914989045692388e-06, "loss": 0.0791, "step": 647 }, { "epoch": 0.2948134667879891, "grad_norm": 0.8847229450668922, "learning_rate": 9.914726401426293e-06, "loss": 0.1114, "step": 648 }, { "epoch": 0.29526842584167423, "grad_norm": 0.6805089048669102, "learning_rate": 9.91446335555095e-06, "loss": 0.0645, "step": 649 }, { "epoch": 0.29572338489535943, "grad_norm": 0.9967907781154212, "learning_rate": 9.914199908087856e-06, "loss": 0.1125, "step": 650 }, { "epoch": 0.2961783439490446, "grad_norm": 0.7069764233646496, "learning_rate": 9.913936059058537e-06, "loss": 0.0961, "step": 651 }, { "epoch": 0.2966333030027298, "grad_norm": 0.8237259808163154, "learning_rate": 9.913671808484554e-06, "loss": 0.0863, "step": 652 }, { "epoch": 0.2970882620564149, "grad_norm": 0.5595221349609915, "learning_rate": 9.913407156387503e-06, "loss": 0.0477, "step": 653 }, { "epoch": 0.29754322111010006, "grad_norm": 0.8322598543263076, "learning_rate": 9.913142102789005e-06, "loss": 0.0785, "step": 654 }, { "epoch": 0.29799818016378526, "grad_norm": 0.9426946452527044, "learning_rate": 9.912876647710723e-06, "loss": 0.0993, "step": 655 }, { "epoch": 0.2984531392174704, "grad_norm": 0.8902481236790349, "learning_rate": 9.912610791174348e-06, "loss": 0.0981, "step": 656 }, { "epoch": 0.2989080982711556, "grad_norm": 0.6714333609160019, "learning_rate": 9.912344533201604e-06, "loss": 0.0716, "step": 657 }, { "epoch": 0.29936305732484075, "grad_norm": 0.6721636461789662, "learning_rate": 9.91207787381425e-06, "loss": 0.0675, "step": 658 }, { "epoch": 0.29981801637852595, "grad_norm": 0.628744075340254, "learning_rate": 9.911810813034073e-06, "loss": 0.0583, "step": 659 }, { "epoch": 0.3002729754322111, "grad_norm": 0.9172548581720068, "learning_rate": 9.9115433508829e-06, "loss": 0.0972, "step": 660 }, { "epoch": 0.30072793448589624, "grad_norm": 0.914462327674233, "learning_rate": 9.911275487382583e-06, "loss": 0.089, "step": 661 }, { "epoch": 0.30118289353958144, "grad_norm": 0.7410939383575923, "learning_rate": 9.911007222555011e-06, "loss": 0.0744, "step": 662 }, { "epoch": 0.3016378525932666, "grad_norm": 0.6952942958219819, "learning_rate": 9.91073855642211e-06, "loss": 0.0627, "step": 663 }, { "epoch": 0.3020928116469518, "grad_norm": 0.8802064643150562, "learning_rate": 9.910469489005828e-06, "loss": 0.0836, "step": 664 }, { "epoch": 0.30254777070063693, "grad_norm": 0.9015922573736656, "learning_rate": 9.910200020328158e-06, "loss": 0.0934, "step": 665 }, { "epoch": 0.30300272975432213, "grad_norm": 0.6635682732023674, "learning_rate": 9.909930150411113e-06, "loss": 0.0623, "step": 666 }, { "epoch": 0.3034576888080073, "grad_norm": 1.928152977107998, "learning_rate": 9.909659879276751e-06, "loss": 0.1457, "step": 667 }, { "epoch": 0.3039126478616925, "grad_norm": 0.7754006092902415, "learning_rate": 9.909389206947156e-06, "loss": 0.0621, "step": 668 }, { "epoch": 0.3043676069153776, "grad_norm": 1.0461982822616211, "learning_rate": 9.909118133444444e-06, "loss": 0.1087, "step": 669 }, { "epoch": 0.30482256596906276, "grad_norm": 0.7981897376851527, "learning_rate": 9.90884665879077e-06, "loss": 0.0921, "step": 670 }, { "epoch": 0.30527752502274796, "grad_norm": 0.8941901965354629, "learning_rate": 9.908574783008313e-06, "loss": 0.1055, "step": 671 }, { "epoch": 0.3057324840764331, "grad_norm": 1.0219508428898654, "learning_rate": 9.908302506119291e-06, "loss": 0.1152, "step": 672 }, { "epoch": 0.3061874431301183, "grad_norm": 0.7623168423299865, "learning_rate": 9.908029828145956e-06, "loss": 0.0837, "step": 673 }, { "epoch": 0.30664240218380345, "grad_norm": 0.7026665400337327, "learning_rate": 9.907756749110587e-06, "loss": 0.0785, "step": 674 }, { "epoch": 0.30709736123748865, "grad_norm": 1.0861630797383492, "learning_rate": 9.9074832690355e-06, "loss": 0.1121, "step": 675 }, { "epoch": 0.3075523202911738, "grad_norm": 0.8171913655631801, "learning_rate": 9.907209387943042e-06, "loss": 0.0759, "step": 676 }, { "epoch": 0.30800727934485894, "grad_norm": 0.695009650682766, "learning_rate": 9.906935105855595e-06, "loss": 0.0508, "step": 677 }, { "epoch": 0.30846223839854414, "grad_norm": 1.1629680848047237, "learning_rate": 9.906660422795569e-06, "loss": 0.1123, "step": 678 }, { "epoch": 0.3089171974522293, "grad_norm": 1.1028006392582481, "learning_rate": 9.906385338785411e-06, "loss": 0.1048, "step": 679 }, { "epoch": 0.3093721565059145, "grad_norm": 0.8590661780887954, "learning_rate": 9.906109853847601e-06, "loss": 0.0947, "step": 680 }, { "epoch": 0.30982711555959963, "grad_norm": 0.9160314729851723, "learning_rate": 9.90583396800465e-06, "loss": 0.0928, "step": 681 }, { "epoch": 0.31028207461328483, "grad_norm": 0.8935511298088069, "learning_rate": 9.9055576812791e-06, "loss": 0.0996, "step": 682 }, { "epoch": 0.31073703366697, "grad_norm": 0.7005723015579258, "learning_rate": 9.905280993693533e-06, "loss": 0.0863, "step": 683 }, { "epoch": 0.3111919927206551, "grad_norm": 0.6441434987399284, "learning_rate": 9.905003905270553e-06, "loss": 0.0682, "step": 684 }, { "epoch": 0.3116469517743403, "grad_norm": 0.9609160991558658, "learning_rate": 9.904726416032803e-06, "loss": 0.1095, "step": 685 }, { "epoch": 0.31210191082802546, "grad_norm": 0.723787688745946, "learning_rate": 9.904448526002963e-06, "loss": 0.0637, "step": 686 }, { "epoch": 0.31255686988171066, "grad_norm": 0.5250433090776031, "learning_rate": 9.904170235203737e-06, "loss": 0.0587, "step": 687 }, { "epoch": 0.3130118289353958, "grad_norm": 0.8819438583914972, "learning_rate": 9.903891543657866e-06, "loss": 0.1112, "step": 688 }, { "epoch": 0.313466787989081, "grad_norm": 0.5413774773467063, "learning_rate": 9.903612451388122e-06, "loss": 0.0722, "step": 689 }, { "epoch": 0.31392174704276615, "grad_norm": 0.8913097595158456, "learning_rate": 9.903332958417315e-06, "loss": 0.0893, "step": 690 }, { "epoch": 0.3143767060964513, "grad_norm": 0.6466979890354269, "learning_rate": 9.903053064768283e-06, "loss": 0.0709, "step": 691 }, { "epoch": 0.3148316651501365, "grad_norm": 0.8428101951038133, "learning_rate": 9.902772770463892e-06, "loss": 0.0814, "step": 692 }, { "epoch": 0.31528662420382164, "grad_norm": 0.5832299371816577, "learning_rate": 9.902492075527057e-06, "loss": 0.0597, "step": 693 }, { "epoch": 0.31574158325750684, "grad_norm": 0.7856263020740725, "learning_rate": 9.902210979980705e-06, "loss": 0.074, "step": 694 }, { "epoch": 0.316196542311192, "grad_norm": 0.8507681095680276, "learning_rate": 9.90192948384781e-06, "loss": 0.0941, "step": 695 }, { "epoch": 0.3166515013648772, "grad_norm": 0.7777857824270489, "learning_rate": 9.901647587151376e-06, "loss": 0.0708, "step": 696 }, { "epoch": 0.31710646041856233, "grad_norm": 1.068022521735614, "learning_rate": 9.901365289914437e-06, "loss": 0.108, "step": 697 }, { "epoch": 0.3175614194722475, "grad_norm": 1.1320770025873614, "learning_rate": 9.901082592160059e-06, "loss": 0.108, "step": 698 }, { "epoch": 0.3180163785259327, "grad_norm": 0.803518334023751, "learning_rate": 9.900799493911346e-06, "loss": 0.0871, "step": 699 }, { "epoch": 0.3184713375796178, "grad_norm": 0.8188444942805464, "learning_rate": 9.900515995191431e-06, "loss": 0.0808, "step": 700 }, { "epoch": 0.318926296633303, "grad_norm": 0.8993527964087475, "learning_rate": 9.900232096023478e-06, "loss": 0.0821, "step": 701 }, { "epoch": 0.31938125568698816, "grad_norm": 0.5600271316880729, "learning_rate": 9.899947796430687e-06, "loss": 0.0478, "step": 702 }, { "epoch": 0.31983621474067336, "grad_norm": 0.8369718087747545, "learning_rate": 9.899663096436292e-06, "loss": 0.0871, "step": 703 }, { "epoch": 0.3202911737943585, "grad_norm": 0.8993771893247359, "learning_rate": 9.899377996063554e-06, "loss": 0.0858, "step": 704 }, { "epoch": 0.32074613284804365, "grad_norm": 0.6615773523414142, "learning_rate": 9.899092495335772e-06, "loss": 0.0601, "step": 705 }, { "epoch": 0.32120109190172885, "grad_norm": 0.8278593900178107, "learning_rate": 9.898806594276273e-06, "loss": 0.0769, "step": 706 }, { "epoch": 0.321656050955414, "grad_norm": 0.7866286577186284, "learning_rate": 9.898520292908425e-06, "loss": 0.0894, "step": 707 }, { "epoch": 0.3221110100090992, "grad_norm": 0.8050313615570786, "learning_rate": 9.89823359125562e-06, "loss": 0.0732, "step": 708 }, { "epoch": 0.32256596906278434, "grad_norm": 1.0243914254387991, "learning_rate": 9.897946489341286e-06, "loss": 0.0901, "step": 709 }, { "epoch": 0.32302092811646954, "grad_norm": 0.7036337195424629, "learning_rate": 9.897658987188882e-06, "loss": 0.0686, "step": 710 }, { "epoch": 0.3234758871701547, "grad_norm": 0.5593772745397846, "learning_rate": 9.897371084821905e-06, "loss": 0.045, "step": 711 }, { "epoch": 0.32393084622383983, "grad_norm": 0.608867956874154, "learning_rate": 9.897082782263878e-06, "loss": 0.0692, "step": 712 }, { "epoch": 0.32438580527752503, "grad_norm": 0.6488333561840038, "learning_rate": 9.896794079538362e-06, "loss": 0.0513, "step": 713 }, { "epoch": 0.3248407643312102, "grad_norm": 0.5593745607285364, "learning_rate": 9.896504976668948e-06, "loss": 0.0437, "step": 714 }, { "epoch": 0.3252957233848954, "grad_norm": 0.5072427035814352, "learning_rate": 9.896215473679259e-06, "loss": 0.0566, "step": 715 }, { "epoch": 0.3257506824385805, "grad_norm": 0.7088539736923404, "learning_rate": 9.895925570592952e-06, "loss": 0.0878, "step": 716 }, { "epoch": 0.3262056414922657, "grad_norm": 0.9653520712469312, "learning_rate": 9.895635267433719e-06, "loss": 0.101, "step": 717 }, { "epoch": 0.32666060054595086, "grad_norm": 1.2323140645024868, "learning_rate": 9.895344564225277e-06, "loss": 0.1359, "step": 718 }, { "epoch": 0.327115559599636, "grad_norm": 0.6826807669546061, "learning_rate": 9.895053460991389e-06, "loss": 0.0799, "step": 719 }, { "epoch": 0.3275705186533212, "grad_norm": 0.9496304010026827, "learning_rate": 9.894761957755834e-06, "loss": 0.0928, "step": 720 }, { "epoch": 0.32802547770700635, "grad_norm": 0.8578622125964999, "learning_rate": 9.894470054542438e-06, "loss": 0.1149, "step": 721 }, { "epoch": 0.32848043676069155, "grad_norm": 0.5483719717114235, "learning_rate": 9.894177751375053e-06, "loss": 0.0621, "step": 722 }, { "epoch": 0.3289353958143767, "grad_norm": 0.6341198897869947, "learning_rate": 9.893885048277564e-06, "loss": 0.0568, "step": 723 }, { "epoch": 0.3293903548680619, "grad_norm": 0.7169738278552924, "learning_rate": 9.893591945273888e-06, "loss": 0.0752, "step": 724 }, { "epoch": 0.32984531392174704, "grad_norm": 0.9839905963719277, "learning_rate": 9.89329844238798e-06, "loss": 0.1167, "step": 725 }, { "epoch": 0.3303002729754322, "grad_norm": 0.6825969142747964, "learning_rate": 9.89300453964382e-06, "loss": 0.0693, "step": 726 }, { "epoch": 0.3307552320291174, "grad_norm": 1.0420794853330364, "learning_rate": 9.892710237065423e-06, "loss": 0.1561, "step": 727 }, { "epoch": 0.33121019108280253, "grad_norm": 1.0109988913697336, "learning_rate": 9.892415534676844e-06, "loss": 0.0813, "step": 728 }, { "epoch": 0.33166515013648773, "grad_norm": 0.6237179977245606, "learning_rate": 9.892120432502161e-06, "loss": 0.063, "step": 729 }, { "epoch": 0.3321201091901729, "grad_norm": 0.7047649578988654, "learning_rate": 9.891824930565488e-06, "loss": 0.0757, "step": 730 }, { "epoch": 0.3325750682438581, "grad_norm": 0.8381336709785119, "learning_rate": 9.891529028890974e-06, "loss": 0.1137, "step": 731 }, { "epoch": 0.3330300272975432, "grad_norm": 1.108812928457643, "learning_rate": 9.891232727502797e-06, "loss": 0.0971, "step": 732 }, { "epoch": 0.33348498635122836, "grad_norm": 0.8911550238765422, "learning_rate": 9.89093602642517e-06, "loss": 0.0869, "step": 733 }, { "epoch": 0.33393994540491356, "grad_norm": 0.7527062298816352, "learning_rate": 9.890638925682339e-06, "loss": 0.085, "step": 734 }, { "epoch": 0.3343949044585987, "grad_norm": 0.8028637093759472, "learning_rate": 9.89034142529858e-06, "loss": 0.0866, "step": 735 }, { "epoch": 0.3348498635122839, "grad_norm": 0.6620365400447171, "learning_rate": 9.890043525298203e-06, "loss": 0.053, "step": 736 }, { "epoch": 0.33530482256596905, "grad_norm": 0.6606838089782118, "learning_rate": 9.889745225705555e-06, "loss": 0.0783, "step": 737 }, { "epoch": 0.33575978161965425, "grad_norm": 0.6719238881234298, "learning_rate": 9.889446526545007e-06, "loss": 0.079, "step": 738 }, { "epoch": 0.3362147406733394, "grad_norm": 0.7379881342173255, "learning_rate": 9.88914742784097e-06, "loss": 0.0848, "step": 739 }, { "epoch": 0.33666969972702454, "grad_norm": 1.9725398231448836, "learning_rate": 9.888847929617887e-06, "loss": 0.1666, "step": 740 }, { "epoch": 0.33712465878070974, "grad_norm": 0.7800667095330575, "learning_rate": 9.888548031900226e-06, "loss": 0.0779, "step": 741 }, { "epoch": 0.3375796178343949, "grad_norm": 0.9725198572426639, "learning_rate": 9.888247734712497e-06, "loss": 0.0719, "step": 742 }, { "epoch": 0.3380345768880801, "grad_norm": 0.9547104503470986, "learning_rate": 9.887947038079238e-06, "loss": 0.1119, "step": 743 }, { "epoch": 0.33848953594176523, "grad_norm": 0.5879353672489683, "learning_rate": 9.887645942025022e-06, "loss": 0.0553, "step": 744 }, { "epoch": 0.33894449499545043, "grad_norm": 0.5485885922626542, "learning_rate": 9.887344446574452e-06, "loss": 0.0494, "step": 745 }, { "epoch": 0.3393994540491356, "grad_norm": 0.9640668269863656, "learning_rate": 9.887042551752163e-06, "loss": 0.1104, "step": 746 }, { "epoch": 0.3398544131028208, "grad_norm": 0.8639463935480832, "learning_rate": 9.886740257582827e-06, "loss": 0.0655, "step": 747 }, { "epoch": 0.3403093721565059, "grad_norm": 0.6489702107287116, "learning_rate": 9.886437564091148e-06, "loss": 0.0777, "step": 748 }, { "epoch": 0.34076433121019106, "grad_norm": 0.8236523684362178, "learning_rate": 9.886134471301854e-06, "loss": 0.0916, "step": 749 }, { "epoch": 0.34121929026387626, "grad_norm": 0.8459143900125461, "learning_rate": 9.885830979239718e-06, "loss": 0.1017, "step": 750 }, { "epoch": 0.3416742493175614, "grad_norm": 0.7496065352262437, "learning_rate": 9.885527087929541e-06, "loss": 0.0861, "step": 751 }, { "epoch": 0.3421292083712466, "grad_norm": 0.849292513666517, "learning_rate": 9.88522279739615e-06, "loss": 0.0839, "step": 752 }, { "epoch": 0.34258416742493175, "grad_norm": 0.7756671663835698, "learning_rate": 9.884918107664417e-06, "loss": 0.0809, "step": 753 }, { "epoch": 0.34303912647861695, "grad_norm": 0.7338987681003677, "learning_rate": 9.884613018759234e-06, "loss": 0.0721, "step": 754 }, { "epoch": 0.3434940855323021, "grad_norm": 0.6003946948163056, "learning_rate": 9.884307530705534e-06, "loss": 0.0782, "step": 755 }, { "epoch": 0.34394904458598724, "grad_norm": 0.5309561440373582, "learning_rate": 9.88400164352828e-06, "loss": 0.0563, "step": 756 }, { "epoch": 0.34440400363967244, "grad_norm": 0.6551261739802692, "learning_rate": 9.883695357252467e-06, "loss": 0.061, "step": 757 }, { "epoch": 0.3448589626933576, "grad_norm": 0.6598139820416582, "learning_rate": 9.883388671903125e-06, "loss": 0.084, "step": 758 }, { "epoch": 0.3453139217470428, "grad_norm": 0.8678451615084499, "learning_rate": 9.883081587505315e-06, "loss": 0.0893, "step": 759 }, { "epoch": 0.34576888080072793, "grad_norm": 0.8849976199871086, "learning_rate": 9.882774104084127e-06, "loss": 0.0938, "step": 760 }, { "epoch": 0.34622383985441313, "grad_norm": 0.6157555054475868, "learning_rate": 9.882466221664691e-06, "loss": 0.0535, "step": 761 }, { "epoch": 0.3466787989080983, "grad_norm": 0.9555128068667961, "learning_rate": 9.882157940272165e-06, "loss": 0.0984, "step": 762 }, { "epoch": 0.3471337579617834, "grad_norm": 0.8431106213501941, "learning_rate": 9.881849259931738e-06, "loss": 0.1062, "step": 763 }, { "epoch": 0.3475887170154686, "grad_norm": 0.6608166650909644, "learning_rate": 9.881540180668637e-06, "loss": 0.0589, "step": 764 }, { "epoch": 0.34804367606915376, "grad_norm": 0.7177237690901401, "learning_rate": 9.881230702508118e-06, "loss": 0.0721, "step": 765 }, { "epoch": 0.34849863512283896, "grad_norm": 0.49396541889218665, "learning_rate": 9.880920825475468e-06, "loss": 0.0582, "step": 766 }, { "epoch": 0.3489535941765241, "grad_norm": 0.7008727540015932, "learning_rate": 9.88061054959601e-06, "loss": 0.0689, "step": 767 }, { "epoch": 0.3494085532302093, "grad_norm": 0.6417543130209264, "learning_rate": 9.880299874895098e-06, "loss": 0.0859, "step": 768 }, { "epoch": 0.34986351228389445, "grad_norm": 0.5325758158155319, "learning_rate": 9.879988801398121e-06, "loss": 0.0508, "step": 769 }, { "epoch": 0.3503184713375796, "grad_norm": 0.653129374155715, "learning_rate": 9.879677329130496e-06, "loss": 0.0822, "step": 770 }, { "epoch": 0.3507734303912648, "grad_norm": 0.6044703796770591, "learning_rate": 9.879365458117678e-06, "loss": 0.0662, "step": 771 }, { "epoch": 0.35122838944494994, "grad_norm": 0.6417796330386928, "learning_rate": 9.879053188385148e-06, "loss": 0.0649, "step": 772 }, { "epoch": 0.35168334849863514, "grad_norm": 0.6127493684308597, "learning_rate": 9.878740519958425e-06, "loss": 0.0601, "step": 773 }, { "epoch": 0.3521383075523203, "grad_norm": 0.9092296350808027, "learning_rate": 9.878427452863059e-06, "loss": 0.1138, "step": 774 }, { "epoch": 0.3525932666060055, "grad_norm": 0.8850379239223551, "learning_rate": 9.878113987124633e-06, "loss": 0.1135, "step": 775 }, { "epoch": 0.35304822565969063, "grad_norm": 0.8106864823035035, "learning_rate": 9.877800122768761e-06, "loss": 0.084, "step": 776 }, { "epoch": 0.3535031847133758, "grad_norm": 0.6717791100158048, "learning_rate": 9.877485859821092e-06, "loss": 0.0764, "step": 777 }, { "epoch": 0.353958143767061, "grad_norm": 0.4266356830653338, "learning_rate": 9.877171198307304e-06, "loss": 0.0496, "step": 778 }, { "epoch": 0.3544131028207461, "grad_norm": 0.7839112755574695, "learning_rate": 9.87685613825311e-06, "loss": 0.0864, "step": 779 }, { "epoch": 0.3548680618744313, "grad_norm": 0.8928629316475961, "learning_rate": 9.876540679684257e-06, "loss": 0.0802, "step": 780 }, { "epoch": 0.35532302092811646, "grad_norm": 0.7427060191976654, "learning_rate": 9.876224822626522e-06, "loss": 0.0809, "step": 781 }, { "epoch": 0.35577797998180166, "grad_norm": 0.6618589317208607, "learning_rate": 9.875908567105716e-06, "loss": 0.0633, "step": 782 }, { "epoch": 0.3562329390354868, "grad_norm": 0.9168643329932029, "learning_rate": 9.87559191314768e-06, "loss": 0.0977, "step": 783 }, { "epoch": 0.35668789808917195, "grad_norm": 1.010661772545197, "learning_rate": 9.87527486077829e-06, "loss": 0.112, "step": 784 }, { "epoch": 0.35714285714285715, "grad_norm": 0.7355960177801563, "learning_rate": 9.874957410023458e-06, "loss": 0.0578, "step": 785 }, { "epoch": 0.3575978161965423, "grad_norm": 0.7012046376593928, "learning_rate": 9.874639560909118e-06, "loss": 0.0856, "step": 786 }, { "epoch": 0.3580527752502275, "grad_norm": 0.629856671324697, "learning_rate": 9.87432131346125e-06, "loss": 0.079, "step": 787 }, { "epoch": 0.35850773430391264, "grad_norm": 0.6605442679933491, "learning_rate": 9.874002667705855e-06, "loss": 0.0713, "step": 788 }, { "epoch": 0.35896269335759784, "grad_norm": 0.6036439966816435, "learning_rate": 9.873683623668972e-06, "loss": 0.0734, "step": 789 }, { "epoch": 0.359417652411283, "grad_norm": 0.9098464282834562, "learning_rate": 9.873364181376674e-06, "loss": 0.1273, "step": 790 }, { "epoch": 0.35987261146496813, "grad_norm": 0.725232432410699, "learning_rate": 9.873044340855062e-06, "loss": 0.0704, "step": 791 }, { "epoch": 0.36032757051865333, "grad_norm": 0.8275864687946802, "learning_rate": 9.872724102130273e-06, "loss": 0.0722, "step": 792 }, { "epoch": 0.3607825295723385, "grad_norm": 0.6908762665090429, "learning_rate": 9.872403465228476e-06, "loss": 0.068, "step": 793 }, { "epoch": 0.3612374886260237, "grad_norm": 0.8007479624540592, "learning_rate": 9.872082430175871e-06, "loss": 0.0792, "step": 794 }, { "epoch": 0.3616924476797088, "grad_norm": 0.7580697654486878, "learning_rate": 9.871760996998692e-06, "loss": 0.0662, "step": 795 }, { "epoch": 0.362147406733394, "grad_norm": 1.0378802589927232, "learning_rate": 9.871439165723207e-06, "loss": 0.0905, "step": 796 }, { "epoch": 0.36260236578707916, "grad_norm": 0.9366156924362913, "learning_rate": 9.87111693637571e-06, "loss": 0.0966, "step": 797 }, { "epoch": 0.3630573248407643, "grad_norm": 0.9568919919938076, "learning_rate": 9.870794308982536e-06, "loss": 0.1092, "step": 798 }, { "epoch": 0.3635122838944495, "grad_norm": 1.0303944561108107, "learning_rate": 9.870471283570046e-06, "loss": 0.1214, "step": 799 }, { "epoch": 0.36396724294813465, "grad_norm": 0.7123988620535131, "learning_rate": 9.870147860164639e-06, "loss": 0.0952, "step": 800 }, { "epoch": 0.36442220200181985, "grad_norm": 0.6461145025804255, "learning_rate": 9.86982403879274e-06, "loss": 0.0653, "step": 801 }, { "epoch": 0.364877161055505, "grad_norm": 0.761176238728339, "learning_rate": 9.869499819480815e-06, "loss": 0.0911, "step": 802 }, { "epoch": 0.3653321201091902, "grad_norm": 0.6778284620896282, "learning_rate": 9.869175202255354e-06, "loss": 0.0726, "step": 803 }, { "epoch": 0.36578707916287534, "grad_norm": 0.6378934869683002, "learning_rate": 9.868850187142885e-06, "loss": 0.0721, "step": 804 }, { "epoch": 0.3662420382165605, "grad_norm": 0.725078464245391, "learning_rate": 9.868524774169968e-06, "loss": 0.0774, "step": 805 }, { "epoch": 0.3666969972702457, "grad_norm": 0.7707907185217752, "learning_rate": 9.86819896336319e-06, "loss": 0.067, "step": 806 }, { "epoch": 0.36715195632393083, "grad_norm": 0.8162851407409059, "learning_rate": 9.867872754749178e-06, "loss": 0.0908, "step": 807 }, { "epoch": 0.36760691537761603, "grad_norm": 0.5330499489332517, "learning_rate": 9.867546148354586e-06, "loss": 0.066, "step": 808 }, { "epoch": 0.3680618744313012, "grad_norm": 0.6649993383235931, "learning_rate": 9.867219144206105e-06, "loss": 0.0672, "step": 809 }, { "epoch": 0.3685168334849864, "grad_norm": 0.9824606570699352, "learning_rate": 9.866891742330458e-06, "loss": 0.11, "step": 810 }, { "epoch": 0.3689717925386715, "grad_norm": 0.6507791006697302, "learning_rate": 9.866563942754394e-06, "loss": 0.0622, "step": 811 }, { "epoch": 0.36942675159235666, "grad_norm": 0.7455907568930894, "learning_rate": 9.866235745504705e-06, "loss": 0.0833, "step": 812 }, { "epoch": 0.36988171064604186, "grad_norm": 0.9927293122267482, "learning_rate": 9.865907150608203e-06, "loss": 0.0978, "step": 813 }, { "epoch": 0.370336669699727, "grad_norm": 0.817279180213694, "learning_rate": 9.865578158091746e-06, "loss": 0.1036, "step": 814 }, { "epoch": 0.3707916287534122, "grad_norm": 0.9966504261459711, "learning_rate": 9.865248767982211e-06, "loss": 0.1027, "step": 815 }, { "epoch": 0.37124658780709735, "grad_norm": 0.9561727776097537, "learning_rate": 9.864918980306521e-06, "loss": 0.1136, "step": 816 }, { "epoch": 0.37170154686078255, "grad_norm": 0.6718095123705313, "learning_rate": 9.86458879509162e-06, "loss": 0.0762, "step": 817 }, { "epoch": 0.3721565059144677, "grad_norm": 0.9803345299998187, "learning_rate": 9.864258212364492e-06, "loss": 0.0791, "step": 818 }, { "epoch": 0.37261146496815284, "grad_norm": 0.8058679812037255, "learning_rate": 9.86392723215215e-06, "loss": 0.069, "step": 819 }, { "epoch": 0.37306642402183804, "grad_norm": 0.5836160590759203, "learning_rate": 9.86359585448164e-06, "loss": 0.0621, "step": 820 }, { "epoch": 0.3735213830755232, "grad_norm": 0.6511599091669776, "learning_rate": 9.863264079380039e-06, "loss": 0.0745, "step": 821 }, { "epoch": 0.3739763421292084, "grad_norm": 0.9308266206126162, "learning_rate": 9.862931906874461e-06, "loss": 0.1132, "step": 822 }, { "epoch": 0.37443130118289353, "grad_norm": 0.613775373571284, "learning_rate": 9.862599336992048e-06, "loss": 0.0545, "step": 823 }, { "epoch": 0.37488626023657873, "grad_norm": 0.6991388893487894, "learning_rate": 9.862266369759976e-06, "loss": 0.0754, "step": 824 }, { "epoch": 0.37534121929026387, "grad_norm": 0.6352968005261165, "learning_rate": 9.861933005205454e-06, "loss": 0.0576, "step": 825 }, { "epoch": 0.37579617834394907, "grad_norm": 1.109194467922723, "learning_rate": 9.861599243355725e-06, "loss": 0.1281, "step": 826 }, { "epoch": 0.3762511373976342, "grad_norm": 0.9742134289860664, "learning_rate": 9.86126508423806e-06, "loss": 0.1067, "step": 827 }, { "epoch": 0.37670609645131936, "grad_norm": 0.6015820455914206, "learning_rate": 9.860930527879763e-06, "loss": 0.055, "step": 828 }, { "epoch": 0.37716105550500456, "grad_norm": 1.0894948091440197, "learning_rate": 9.860595574308179e-06, "loss": 0.1147, "step": 829 }, { "epoch": 0.3776160145586897, "grad_norm": 0.7023892750192133, "learning_rate": 9.860260223550672e-06, "loss": 0.0815, "step": 830 }, { "epoch": 0.3780709736123749, "grad_norm": 0.4943868719085533, "learning_rate": 9.859924475634649e-06, "loss": 0.0476, "step": 831 }, { "epoch": 0.37852593266606005, "grad_norm": 0.9974648765413693, "learning_rate": 9.859588330587545e-06, "loss": 0.1068, "step": 832 }, { "epoch": 0.37898089171974525, "grad_norm": 0.5960289391531881, "learning_rate": 9.859251788436829e-06, "loss": 0.0715, "step": 833 }, { "epoch": 0.3794358507734304, "grad_norm": 0.907079582974149, "learning_rate": 9.85891484921e-06, "loss": 0.0905, "step": 834 }, { "epoch": 0.37989080982711554, "grad_norm": 0.8133034306250352, "learning_rate": 9.858577512934592e-06, "loss": 0.1012, "step": 835 }, { "epoch": 0.38034576888080074, "grad_norm": 0.7828785203637737, "learning_rate": 9.858239779638173e-06, "loss": 0.0726, "step": 836 }, { "epoch": 0.3808007279344859, "grad_norm": 1.3138864597148558, "learning_rate": 9.857901649348338e-06, "loss": 0.1307, "step": 837 }, { "epoch": 0.3812556869881711, "grad_norm": 0.7000750227265026, "learning_rate": 9.857563122092717e-06, "loss": 0.0777, "step": 838 }, { "epoch": 0.3817106460418562, "grad_norm": 0.757283984575844, "learning_rate": 9.857224197898975e-06, "loss": 0.083, "step": 839 }, { "epoch": 0.3821656050955414, "grad_norm": 0.7113754486134378, "learning_rate": 9.856884876794805e-06, "loss": 0.0795, "step": 840 }, { "epoch": 0.38262056414922657, "grad_norm": 0.6891370217065743, "learning_rate": 9.856545158807938e-06, "loss": 0.0576, "step": 841 }, { "epoch": 0.3830755232029117, "grad_norm": 0.7230826558764609, "learning_rate": 9.856205043966134e-06, "loss": 0.0973, "step": 842 }, { "epoch": 0.3835304822565969, "grad_norm": 0.9951638416419379, "learning_rate": 9.855864532297181e-06, "loss": 0.1225, "step": 843 }, { "epoch": 0.38398544131028206, "grad_norm": 0.8272776971451865, "learning_rate": 9.85552362382891e-06, "loss": 0.0928, "step": 844 }, { "epoch": 0.38444040036396726, "grad_norm": 0.662562460388915, "learning_rate": 9.855182318589174e-06, "loss": 0.0711, "step": 845 }, { "epoch": 0.3848953594176524, "grad_norm": 1.185659176011977, "learning_rate": 9.854840616605866e-06, "loss": 0.0922, "step": 846 }, { "epoch": 0.3853503184713376, "grad_norm": 0.7002426118833048, "learning_rate": 9.854498517906908e-06, "loss": 0.0828, "step": 847 }, { "epoch": 0.38580527752502275, "grad_norm": 0.8957633348930525, "learning_rate": 9.854156022520252e-06, "loss": 0.0809, "step": 848 }, { "epoch": 0.3862602365787079, "grad_norm": 1.0593251614278854, "learning_rate": 9.853813130473887e-06, "loss": 0.1109, "step": 849 }, { "epoch": 0.3867151956323931, "grad_norm": 0.7751748709357449, "learning_rate": 9.853469841795832e-06, "loss": 0.0823, "step": 850 }, { "epoch": 0.38717015468607824, "grad_norm": 0.5943868690351954, "learning_rate": 9.853126156514142e-06, "loss": 0.0758, "step": 851 }, { "epoch": 0.38762511373976344, "grad_norm": 0.4901349757557767, "learning_rate": 9.852782074656897e-06, "loss": 0.064, "step": 852 }, { "epoch": 0.3880800727934486, "grad_norm": 0.7531191508768753, "learning_rate": 9.852437596252216e-06, "loss": 0.0824, "step": 853 }, { "epoch": 0.3885350318471338, "grad_norm": 0.7684236261792305, "learning_rate": 9.852092721328248e-06, "loss": 0.0674, "step": 854 }, { "epoch": 0.3889899909008189, "grad_norm": 0.8624513661560378, "learning_rate": 9.851747449913176e-06, "loss": 0.09, "step": 855 }, { "epoch": 0.38944494995450407, "grad_norm": 0.9125725996183891, "learning_rate": 9.851401782035213e-06, "loss": 0.129, "step": 856 }, { "epoch": 0.38989990900818927, "grad_norm": 0.7630714638300728, "learning_rate": 9.851055717722604e-06, "loss": 0.068, "step": 857 }, { "epoch": 0.3903548680618744, "grad_norm": 0.834756070401477, "learning_rate": 9.850709257003628e-06, "loss": 0.0831, "step": 858 }, { "epoch": 0.3908098271155596, "grad_norm": 0.9864776662717517, "learning_rate": 9.850362399906598e-06, "loss": 0.0904, "step": 859 }, { "epoch": 0.39126478616924476, "grad_norm": 0.6242730295284743, "learning_rate": 9.850015146459857e-06, "loss": 0.0754, "step": 860 }, { "epoch": 0.39171974522292996, "grad_norm": 0.838271649072902, "learning_rate": 9.84966749669178e-06, "loss": 0.0899, "step": 861 }, { "epoch": 0.3921747042766151, "grad_norm": 0.6826448278617049, "learning_rate": 9.849319450630777e-06, "loss": 0.0698, "step": 862 }, { "epoch": 0.39262966333030025, "grad_norm": 0.5533993282250775, "learning_rate": 9.848971008305288e-06, "loss": 0.0688, "step": 863 }, { "epoch": 0.39308462238398545, "grad_norm": 0.838673412156409, "learning_rate": 9.848622169743784e-06, "loss": 0.0815, "step": 864 }, { "epoch": 0.3935395814376706, "grad_norm": 0.9783580500729582, "learning_rate": 9.848272934974774e-06, "loss": 0.0745, "step": 865 }, { "epoch": 0.3939945404913558, "grad_norm": 0.5976030953641746, "learning_rate": 9.847923304026793e-06, "loss": 0.0664, "step": 866 }, { "epoch": 0.39444949954504094, "grad_norm": 0.6999143793652887, "learning_rate": 9.847573276928415e-06, "loss": 0.0804, "step": 867 }, { "epoch": 0.39490445859872614, "grad_norm": 0.6338725165728231, "learning_rate": 9.847222853708239e-06, "loss": 0.0655, "step": 868 }, { "epoch": 0.3953594176524113, "grad_norm": 0.7010627446349382, "learning_rate": 9.846872034394902e-06, "loss": 0.0667, "step": 869 }, { "epoch": 0.3958143767060964, "grad_norm": 0.6173227181881447, "learning_rate": 9.84652081901707e-06, "loss": 0.0674, "step": 870 }, { "epoch": 0.3962693357597816, "grad_norm": 0.9673042020268607, "learning_rate": 9.846169207603443e-06, "loss": 0.1267, "step": 871 }, { "epoch": 0.39672429481346677, "grad_norm": 0.6294912489479282, "learning_rate": 9.845817200182755e-06, "loss": 0.0588, "step": 872 }, { "epoch": 0.39717925386715197, "grad_norm": 0.8477152807126976, "learning_rate": 9.845464796783767e-06, "loss": 0.1219, "step": 873 }, { "epoch": 0.3976342129208371, "grad_norm": 0.5887483684825674, "learning_rate": 9.845111997435279e-06, "loss": 0.0731, "step": 874 }, { "epoch": 0.3980891719745223, "grad_norm": 0.5630369277247907, "learning_rate": 9.844758802166116e-06, "loss": 0.0579, "step": 875 }, { "epoch": 0.39854413102820746, "grad_norm": 0.6717541815357567, "learning_rate": 9.844405211005145e-06, "loss": 0.0711, "step": 876 }, { "epoch": 0.3989990900818926, "grad_norm": 0.6571828619535791, "learning_rate": 9.844051223981258e-06, "loss": 0.0638, "step": 877 }, { "epoch": 0.3994540491355778, "grad_norm": 0.6723710552364174, "learning_rate": 9.84369684112338e-06, "loss": 0.0676, "step": 878 }, { "epoch": 0.39990900818926295, "grad_norm": 0.7014173744195523, "learning_rate": 9.84334206246047e-06, "loss": 0.0751, "step": 879 }, { "epoch": 0.40036396724294815, "grad_norm": 0.7999660318519703, "learning_rate": 9.842986888021518e-06, "loss": 0.0895, "step": 880 }, { "epoch": 0.4008189262966333, "grad_norm": 0.5578605501955606, "learning_rate": 9.842631317835548e-06, "loss": 0.0637, "step": 881 }, { "epoch": 0.4012738853503185, "grad_norm": 0.6615256090849237, "learning_rate": 9.842275351931617e-06, "loss": 0.0664, "step": 882 }, { "epoch": 0.40172884440400364, "grad_norm": 0.5263094198672195, "learning_rate": 9.841918990338812e-06, "loss": 0.0611, "step": 883 }, { "epoch": 0.4021838034576888, "grad_norm": 0.8080883575450535, "learning_rate": 9.841562233086252e-06, "loss": 0.0912, "step": 884 }, { "epoch": 0.402638762511374, "grad_norm": 0.6655757939327012, "learning_rate": 9.841205080203092e-06, "loss": 0.0601, "step": 885 }, { "epoch": 0.4030937215650591, "grad_norm": 0.8701903481119097, "learning_rate": 9.840847531718515e-06, "loss": 0.0914, "step": 886 }, { "epoch": 0.4035486806187443, "grad_norm": 0.7730206436987713, "learning_rate": 9.840489587661738e-06, "loss": 0.0747, "step": 887 }, { "epoch": 0.40400363967242947, "grad_norm": 0.7410839527981146, "learning_rate": 9.840131248062012e-06, "loss": 0.079, "step": 888 }, { "epoch": 0.40445859872611467, "grad_norm": 0.627620281196765, "learning_rate": 9.839772512948618e-06, "loss": 0.0715, "step": 889 }, { "epoch": 0.4049135577797998, "grad_norm": 0.8746014124114054, "learning_rate": 9.83941338235087e-06, "loss": 0.0824, "step": 890 }, { "epoch": 0.40536851683348496, "grad_norm": 1.0112737589697485, "learning_rate": 9.839053856298116e-06, "loss": 0.1251, "step": 891 }, { "epoch": 0.40582347588717016, "grad_norm": 0.72216805525771, "learning_rate": 9.838693934819734e-06, "loss": 0.0893, "step": 892 }, { "epoch": 0.4062784349408553, "grad_norm": 0.7544949830136005, "learning_rate": 9.838333617945134e-06, "loss": 0.0968, "step": 893 }, { "epoch": 0.4067333939945405, "grad_norm": 0.9543024355165705, "learning_rate": 9.837972905703762e-06, "loss": 0.102, "step": 894 }, { "epoch": 0.40718835304822565, "grad_norm": 1.02061795078975, "learning_rate": 9.83761179812509e-06, "loss": 0.0649, "step": 895 }, { "epoch": 0.40764331210191085, "grad_norm": 0.39738812842187227, "learning_rate": 9.837250295238629e-06, "loss": 0.0428, "step": 896 }, { "epoch": 0.408098271155596, "grad_norm": 0.8873895570319217, "learning_rate": 9.836888397073919e-06, "loss": 0.1068, "step": 897 }, { "epoch": 0.40855323020928114, "grad_norm": 0.7492126364897504, "learning_rate": 9.836526103660533e-06, "loss": 0.0953, "step": 898 }, { "epoch": 0.40900818926296634, "grad_norm": 0.821575499525911, "learning_rate": 9.836163415028075e-06, "loss": 0.0712, "step": 899 }, { "epoch": 0.4094631483166515, "grad_norm": 1.0052579979241618, "learning_rate": 9.835800331206183e-06, "loss": 0.1138, "step": 900 }, { "epoch": 0.4099181073703367, "grad_norm": 0.7848465428804848, "learning_rate": 9.835436852224525e-06, "loss": 0.0978, "step": 901 }, { "epoch": 0.4103730664240218, "grad_norm": 0.9719856735481065, "learning_rate": 9.835072978112804e-06, "loss": 0.0846, "step": 902 }, { "epoch": 0.410828025477707, "grad_norm": 0.6607308818506346, "learning_rate": 9.834708708900755e-06, "loss": 0.0654, "step": 903 }, { "epoch": 0.41128298453139217, "grad_norm": 0.5191597312034261, "learning_rate": 9.834344044618144e-06, "loss": 0.0518, "step": 904 }, { "epoch": 0.41173794358507737, "grad_norm": 0.5336391872354229, "learning_rate": 9.83397898529477e-06, "loss": 0.0535, "step": 905 }, { "epoch": 0.4121929026387625, "grad_norm": 0.5687342550017563, "learning_rate": 9.833613530960462e-06, "loss": 0.0578, "step": 906 }, { "epoch": 0.41264786169244766, "grad_norm": 0.8793783198642894, "learning_rate": 9.833247681645083e-06, "loss": 0.1286, "step": 907 }, { "epoch": 0.41310282074613286, "grad_norm": 0.8073005899800644, "learning_rate": 9.832881437378534e-06, "loss": 0.0853, "step": 908 }, { "epoch": 0.413557779799818, "grad_norm": 0.511699500000588, "learning_rate": 9.832514798190738e-06, "loss": 0.0504, "step": 909 }, { "epoch": 0.4140127388535032, "grad_norm": 0.5082793074725768, "learning_rate": 9.832147764111655e-06, "loss": 0.056, "step": 910 }, { "epoch": 0.41446769790718835, "grad_norm": 0.9876041013395295, "learning_rate": 9.83178033517128e-06, "loss": 0.0984, "step": 911 }, { "epoch": 0.41492265696087355, "grad_norm": 0.7511273129930924, "learning_rate": 9.831412511399633e-06, "loss": 0.0969, "step": 912 }, { "epoch": 0.4153776160145587, "grad_norm": 1.0144870263760433, "learning_rate": 9.831044292826778e-06, "loss": 0.1482, "step": 913 }, { "epoch": 0.41583257506824384, "grad_norm": 0.70444400073401, "learning_rate": 9.830675679482797e-06, "loss": 0.0802, "step": 914 }, { "epoch": 0.41628753412192904, "grad_norm": 1.0357251397748677, "learning_rate": 9.830306671397816e-06, "loss": 0.1061, "step": 915 }, { "epoch": 0.4167424931756142, "grad_norm": 0.895894802940119, "learning_rate": 9.829937268601988e-06, "loss": 0.1005, "step": 916 }, { "epoch": 0.4171974522292994, "grad_norm": 0.6004589977630954, "learning_rate": 9.829567471125497e-06, "loss": 0.0664, "step": 917 }, { "epoch": 0.4176524112829845, "grad_norm": 0.6058859475834909, "learning_rate": 9.829197278998562e-06, "loss": 0.0728, "step": 918 }, { "epoch": 0.4181073703366697, "grad_norm": 0.5886912548442098, "learning_rate": 9.828826692251435e-06, "loss": 0.074, "step": 919 }, { "epoch": 0.41856232939035487, "grad_norm": 0.5982473215332103, "learning_rate": 9.828455710914398e-06, "loss": 0.0653, "step": 920 }, { "epoch": 0.41901728844404, "grad_norm": 0.8647804622811079, "learning_rate": 9.828084335017763e-06, "loss": 0.0741, "step": 921 }, { "epoch": 0.4194722474977252, "grad_norm": 0.653767178815679, "learning_rate": 9.827712564591883e-06, "loss": 0.0604, "step": 922 }, { "epoch": 0.41992720655141036, "grad_norm": 0.7812500085225947, "learning_rate": 9.827340399667132e-06, "loss": 0.0708, "step": 923 }, { "epoch": 0.42038216560509556, "grad_norm": 0.7314008563711142, "learning_rate": 9.826967840273921e-06, "loss": 0.0721, "step": 924 }, { "epoch": 0.4208371246587807, "grad_norm": 0.8727413076803472, "learning_rate": 9.8265948864427e-06, "loss": 0.0892, "step": 925 }, { "epoch": 0.4212920837124659, "grad_norm": 0.6051379056710864, "learning_rate": 9.826221538203942e-06, "loss": 0.0685, "step": 926 }, { "epoch": 0.42174704276615105, "grad_norm": 0.7279887191787228, "learning_rate": 9.825847795588154e-06, "loss": 0.0766, "step": 927 }, { "epoch": 0.4222020018198362, "grad_norm": 0.7126811268305303, "learning_rate": 9.825473658625876e-06, "loss": 0.0821, "step": 928 }, { "epoch": 0.4226569608735214, "grad_norm": 0.8812960827967533, "learning_rate": 9.825099127347684e-06, "loss": 0.0982, "step": 929 }, { "epoch": 0.42311191992720654, "grad_norm": 0.7462955906438729, "learning_rate": 9.824724201784182e-06, "loss": 0.1073, "step": 930 }, { "epoch": 0.42356687898089174, "grad_norm": 0.5448066050338419, "learning_rate": 9.824348881966004e-06, "loss": 0.0637, "step": 931 }, { "epoch": 0.4240218380345769, "grad_norm": 0.7750150802923693, "learning_rate": 9.823973167923823e-06, "loss": 0.09, "step": 932 }, { "epoch": 0.4244767970882621, "grad_norm": 0.8695175796556455, "learning_rate": 9.82359705968834e-06, "loss": 0.0857, "step": 933 }, { "epoch": 0.4249317561419472, "grad_norm": 0.653112477618241, "learning_rate": 9.823220557290289e-06, "loss": 0.0722, "step": 934 }, { "epoch": 0.42538671519563237, "grad_norm": 0.7764742726938813, "learning_rate": 9.822843660760434e-06, "loss": 0.0582, "step": 935 }, { "epoch": 0.42584167424931757, "grad_norm": 0.8338160462571067, "learning_rate": 9.822466370129576e-06, "loss": 0.0993, "step": 936 }, { "epoch": 0.4262966333030027, "grad_norm": 0.7416650975880095, "learning_rate": 9.822088685428543e-06, "loss": 0.0782, "step": 937 }, { "epoch": 0.4267515923566879, "grad_norm": 0.5969422348364739, "learning_rate": 9.821710606688199e-06, "loss": 0.0546, "step": 938 }, { "epoch": 0.42720655141037306, "grad_norm": 0.6235404067325917, "learning_rate": 9.82133213393944e-06, "loss": 0.0638, "step": 939 }, { "epoch": 0.42766151046405826, "grad_norm": 0.7910461101358781, "learning_rate": 9.820953267213194e-06, "loss": 0.0775, "step": 940 }, { "epoch": 0.4281164695177434, "grad_norm": 0.692978452923811, "learning_rate": 9.820574006540415e-06, "loss": 0.053, "step": 941 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7310389759017597, "learning_rate": 9.820194351952098e-06, "loss": 0.0716, "step": 942 }, { "epoch": 0.42902638762511375, "grad_norm": 0.6553331509390902, "learning_rate": 9.819814303479268e-06, "loss": 0.0612, "step": 943 }, { "epoch": 0.4294813466787989, "grad_norm": 1.1310076957610966, "learning_rate": 9.819433861152978e-06, "loss": 0.1116, "step": 944 }, { "epoch": 0.4299363057324841, "grad_norm": 0.6933766894953944, "learning_rate": 9.819053025004316e-06, "loss": 0.0932, "step": 945 }, { "epoch": 0.43039126478616924, "grad_norm": 0.7823571557493696, "learning_rate": 9.818671795064405e-06, "loss": 0.0847, "step": 946 }, { "epoch": 0.43084622383985444, "grad_norm": 0.8000794358590197, "learning_rate": 9.818290171364396e-06, "loss": 0.0916, "step": 947 }, { "epoch": 0.4313011828935396, "grad_norm": 0.6207042654318157, "learning_rate": 9.817908153935473e-06, "loss": 0.0568, "step": 948 }, { "epoch": 0.4317561419472247, "grad_norm": 0.7957970680354334, "learning_rate": 9.817525742808854e-06, "loss": 0.1203, "step": 949 }, { "epoch": 0.4322111010009099, "grad_norm": 0.6607960765057979, "learning_rate": 9.817142938015786e-06, "loss": 0.069, "step": 950 }, { "epoch": 0.43266606005459507, "grad_norm": 0.8132102265727185, "learning_rate": 9.816759739587552e-06, "loss": 0.0821, "step": 951 }, { "epoch": 0.43312101910828027, "grad_norm": 0.6410149691778323, "learning_rate": 9.816376147555464e-06, "loss": 0.0612, "step": 952 }, { "epoch": 0.4335759781619654, "grad_norm": 1.0196998859089288, "learning_rate": 9.815992161950867e-06, "loss": 0.1183, "step": 953 }, { "epoch": 0.4340309372156506, "grad_norm": 0.5899375116434804, "learning_rate": 9.81560778280514e-06, "loss": 0.0604, "step": 954 }, { "epoch": 0.43448589626933576, "grad_norm": 1.0046158107797931, "learning_rate": 9.815223010149693e-06, "loss": 0.0876, "step": 955 }, { "epoch": 0.4349408553230209, "grad_norm": 0.7980339738331416, "learning_rate": 9.814837844015966e-06, "loss": 0.0894, "step": 956 }, { "epoch": 0.4353958143767061, "grad_norm": 0.6974524248281853, "learning_rate": 9.814452284435433e-06, "loss": 0.0741, "step": 957 }, { "epoch": 0.43585077343039125, "grad_norm": 0.7679692797858835, "learning_rate": 9.814066331439603e-06, "loss": 0.0796, "step": 958 }, { "epoch": 0.43630573248407645, "grad_norm": 0.8183774417740679, "learning_rate": 9.813679985060012e-06, "loss": 0.0963, "step": 959 }, { "epoch": 0.4367606915377616, "grad_norm": 0.7950656053104391, "learning_rate": 9.81329324532823e-06, "loss": 0.0837, "step": 960 }, { "epoch": 0.4372156505914468, "grad_norm": 0.6056809369995887, "learning_rate": 9.812906112275862e-06, "loss": 0.0465, "step": 961 }, { "epoch": 0.43767060964513194, "grad_norm": 1.0980359635620318, "learning_rate": 9.81251858593454e-06, "loss": 0.1206, "step": 962 }, { "epoch": 0.4381255686988171, "grad_norm": 0.6123483237764059, "learning_rate": 9.812130666335933e-06, "loss": 0.08, "step": 963 }, { "epoch": 0.4385805277525023, "grad_norm": 0.8151730014839008, "learning_rate": 9.81174235351174e-06, "loss": 0.0983, "step": 964 }, { "epoch": 0.4390354868061874, "grad_norm": 0.7143828681073273, "learning_rate": 9.811353647493691e-06, "loss": 0.0809, "step": 965 }, { "epoch": 0.4394904458598726, "grad_norm": 0.5647036962239634, "learning_rate": 9.810964548313549e-06, "loss": 0.0581, "step": 966 }, { "epoch": 0.43994540491355777, "grad_norm": 0.7594400506736699, "learning_rate": 9.81057505600311e-06, "loss": 0.078, "step": 967 }, { "epoch": 0.44040036396724297, "grad_norm": 0.6515426202345832, "learning_rate": 9.810185170594205e-06, "loss": 0.0688, "step": 968 }, { "epoch": 0.4408553230209281, "grad_norm": 0.8798906332352223, "learning_rate": 9.809794892118687e-06, "loss": 0.0915, "step": 969 }, { "epoch": 0.44131028207461326, "grad_norm": 0.7350866900672135, "learning_rate": 9.809404220608451e-06, "loss": 0.0671, "step": 970 }, { "epoch": 0.44176524112829846, "grad_norm": 0.7216847217866104, "learning_rate": 9.809013156095424e-06, "loss": 0.0726, "step": 971 }, { "epoch": 0.4422202001819836, "grad_norm": 0.8179702740752783, "learning_rate": 9.808621698611557e-06, "loss": 0.0758, "step": 972 }, { "epoch": 0.4426751592356688, "grad_norm": 0.5533105745807706, "learning_rate": 9.808229848188842e-06, "loss": 0.0528, "step": 973 }, { "epoch": 0.44313011828935395, "grad_norm": 0.7503486538749657, "learning_rate": 9.807837604859296e-06, "loss": 0.0878, "step": 974 }, { "epoch": 0.44358507734303915, "grad_norm": 0.40510949005498975, "learning_rate": 9.807444968654975e-06, "loss": 0.0424, "step": 975 }, { "epoch": 0.4440400363967243, "grad_norm": 0.8540666353042626, "learning_rate": 9.807051939607959e-06, "loss": 0.1108, "step": 976 }, { "epoch": 0.44449499545040944, "grad_norm": 0.7543284179304937, "learning_rate": 9.806658517750369e-06, "loss": 0.0719, "step": 977 }, { "epoch": 0.44494995450409464, "grad_norm": 0.6982493359241757, "learning_rate": 9.80626470311435e-06, "loss": 0.0777, "step": 978 }, { "epoch": 0.4454049135577798, "grad_norm": 0.7275511253894157, "learning_rate": 9.805870495732085e-06, "loss": 0.0693, "step": 979 }, { "epoch": 0.445859872611465, "grad_norm": 0.8647890459895436, "learning_rate": 9.805475895635787e-06, "loss": 0.0882, "step": 980 }, { "epoch": 0.4463148316651501, "grad_norm": 0.757804762973183, "learning_rate": 9.8050809028577e-06, "loss": 0.0724, "step": 981 }, { "epoch": 0.4467697907188353, "grad_norm": 0.7515219153063712, "learning_rate": 9.8046855174301e-06, "loss": 0.0659, "step": 982 }, { "epoch": 0.44722474977252047, "grad_norm": 1.0502681583017184, "learning_rate": 9.804289739385297e-06, "loss": 0.1207, "step": 983 }, { "epoch": 0.44767970882620567, "grad_norm": 0.5780062486364612, "learning_rate": 9.803893568755633e-06, "loss": 0.0772, "step": 984 }, { "epoch": 0.4481346678798908, "grad_norm": 0.5515644567052078, "learning_rate": 9.80349700557348e-06, "loss": 0.0628, "step": 985 }, { "epoch": 0.44858962693357596, "grad_norm": 0.6432677095504179, "learning_rate": 9.803100049871246e-06, "loss": 0.0817, "step": 986 }, { "epoch": 0.44904458598726116, "grad_norm": 0.5424958391196154, "learning_rate": 9.802702701681366e-06, "loss": 0.0649, "step": 987 }, { "epoch": 0.4494995450409463, "grad_norm": 0.6556126282036931, "learning_rate": 9.80230496103631e-06, "loss": 0.0579, "step": 988 }, { "epoch": 0.4499545040946315, "grad_norm": 0.5632646083130022, "learning_rate": 9.801906827968578e-06, "loss": 0.0591, "step": 989 }, { "epoch": 0.45040946314831665, "grad_norm": 1.0464719217252296, "learning_rate": 9.801508302510707e-06, "loss": 0.124, "step": 990 }, { "epoch": 0.45086442220200185, "grad_norm": 0.7231067459050019, "learning_rate": 9.801109384695261e-06, "loss": 0.0631, "step": 991 }, { "epoch": 0.451319381255687, "grad_norm": 0.775594128230074, "learning_rate": 9.800710074554837e-06, "loss": 0.0924, "step": 992 }, { "epoch": 0.45177434030937214, "grad_norm": 0.6340180385643369, "learning_rate": 9.800310372122066e-06, "loss": 0.068, "step": 993 }, { "epoch": 0.45222929936305734, "grad_norm": 0.9703750136380557, "learning_rate": 9.799910277429609e-06, "loss": 0.0902, "step": 994 }, { "epoch": 0.4526842584167425, "grad_norm": 0.5881925827197537, "learning_rate": 9.79950979051016e-06, "loss": 0.0662, "step": 995 }, { "epoch": 0.4531392174704277, "grad_norm": 0.7583235380843109, "learning_rate": 9.799108911396446e-06, "loss": 0.0755, "step": 996 }, { "epoch": 0.4535941765241128, "grad_norm": 0.6585135755735663, "learning_rate": 9.798707640121224e-06, "loss": 0.0669, "step": 997 }, { "epoch": 0.454049135577798, "grad_norm": 0.9344579240939844, "learning_rate": 9.798305976717286e-06, "loss": 0.1028, "step": 998 }, { "epoch": 0.45450409463148317, "grad_norm": 0.6238360425747993, "learning_rate": 9.79790392121745e-06, "loss": 0.0608, "step": 999 }, { "epoch": 0.4549590536851683, "grad_norm": 0.715680092291253, "learning_rate": 9.797501473654573e-06, "loss": 0.0792, "step": 1000 }, { "epoch": 0.4554140127388535, "grad_norm": 0.8167758856821831, "learning_rate": 9.797098634061543e-06, "loss": 0.0948, "step": 1001 }, { "epoch": 0.45586897179253866, "grad_norm": 0.8318764431867516, "learning_rate": 9.796695402471275e-06, "loss": 0.0967, "step": 1002 }, { "epoch": 0.45632393084622386, "grad_norm": 0.9700547030363569, "learning_rate": 9.79629177891672e-06, "loss": 0.1138, "step": 1003 }, { "epoch": 0.456778889899909, "grad_norm": 0.7702596501705347, "learning_rate": 9.79588776343086e-06, "loss": 0.0826, "step": 1004 }, { "epoch": 0.4572338489535942, "grad_norm": 0.833778163717652, "learning_rate": 9.795483356046711e-06, "loss": 0.0927, "step": 1005 }, { "epoch": 0.45768880800727935, "grad_norm": 0.7006737675801851, "learning_rate": 9.795078556797318e-06, "loss": 0.0747, "step": 1006 }, { "epoch": 0.4581437670609645, "grad_norm": 0.8810114143185821, "learning_rate": 9.794673365715761e-06, "loss": 0.0921, "step": 1007 }, { "epoch": 0.4585987261146497, "grad_norm": 0.7286145380478113, "learning_rate": 9.794267782835148e-06, "loss": 0.0832, "step": 1008 }, { "epoch": 0.45905368516833484, "grad_norm": 0.8181887559127218, "learning_rate": 9.793861808188622e-06, "loss": 0.0729, "step": 1009 }, { "epoch": 0.45950864422202004, "grad_norm": 1.0821839097582124, "learning_rate": 9.793455441809359e-06, "loss": 0.1025, "step": 1010 }, { "epoch": 0.4599636032757052, "grad_norm": 0.515896949523265, "learning_rate": 9.793048683730564e-06, "loss": 0.0512, "step": 1011 }, { "epoch": 0.4604185623293904, "grad_norm": 0.7800604571516774, "learning_rate": 9.792641533985474e-06, "loss": 0.1065, "step": 1012 }, { "epoch": 0.4608735213830755, "grad_norm": 0.48365424866268936, "learning_rate": 9.792233992607365e-06, "loss": 0.0622, "step": 1013 }, { "epoch": 0.46132848043676067, "grad_norm": 0.8472876133123602, "learning_rate": 9.791826059629532e-06, "loss": 0.0713, "step": 1014 }, { "epoch": 0.46178343949044587, "grad_norm": 0.935522534168844, "learning_rate": 9.791417735085316e-06, "loss": 0.0853, "step": 1015 }, { "epoch": 0.462238398544131, "grad_norm": 0.8028819334602026, "learning_rate": 9.791009019008078e-06, "loss": 0.0795, "step": 1016 }, { "epoch": 0.4626933575978162, "grad_norm": 0.6458928385673616, "learning_rate": 9.79059991143122e-06, "loss": 0.0836, "step": 1017 }, { "epoch": 0.46314831665150136, "grad_norm": 0.8309912415690437, "learning_rate": 9.790190412388173e-06, "loss": 0.0895, "step": 1018 }, { "epoch": 0.46360327570518656, "grad_norm": 0.6953691809158898, "learning_rate": 9.789780521912396e-06, "loss": 0.0686, "step": 1019 }, { "epoch": 0.4640582347588717, "grad_norm": 0.7563151979586233, "learning_rate": 9.789370240037385e-06, "loss": 0.0879, "step": 1020 }, { "epoch": 0.46451319381255685, "grad_norm": 0.6646619102460968, "learning_rate": 9.788959566796667e-06, "loss": 0.0761, "step": 1021 }, { "epoch": 0.46496815286624205, "grad_norm": 0.8092527562913561, "learning_rate": 9.788548502223801e-06, "loss": 0.0863, "step": 1022 }, { "epoch": 0.4654231119199272, "grad_norm": 2.0284506817542396, "learning_rate": 9.788137046352374e-06, "loss": 0.2011, "step": 1023 }, { "epoch": 0.4658780709736124, "grad_norm": 0.6524644993097855, "learning_rate": 9.787725199216011e-06, "loss": 0.0765, "step": 1024 }, { "epoch": 0.46633303002729753, "grad_norm": 0.48134373932870766, "learning_rate": 9.787312960848368e-06, "loss": 0.0505, "step": 1025 }, { "epoch": 0.46678798908098273, "grad_norm": 0.6646547386252114, "learning_rate": 9.786900331283128e-06, "loss": 0.0825, "step": 1026 }, { "epoch": 0.4672429481346679, "grad_norm": 0.5655812014606527, "learning_rate": 9.78648731055401e-06, "loss": 0.0659, "step": 1027 }, { "epoch": 0.467697907188353, "grad_norm": 0.680196435092224, "learning_rate": 9.786073898694766e-06, "loss": 0.0734, "step": 1028 }, { "epoch": 0.4681528662420382, "grad_norm": 0.6198434008496165, "learning_rate": 9.785660095739176e-06, "loss": 0.0687, "step": 1029 }, { "epoch": 0.46860782529572337, "grad_norm": 0.5967309034966486, "learning_rate": 9.785245901721054e-06, "loss": 0.0443, "step": 1030 }, { "epoch": 0.46906278434940857, "grad_norm": 0.588565790719301, "learning_rate": 9.784831316674246e-06, "loss": 0.0741, "step": 1031 }, { "epoch": 0.4695177434030937, "grad_norm": 0.6384508627867143, "learning_rate": 9.784416340632634e-06, "loss": 0.0639, "step": 1032 }, { "epoch": 0.4699727024567789, "grad_norm": 0.528980291125106, "learning_rate": 9.784000973630124e-06, "loss": 0.0506, "step": 1033 }, { "epoch": 0.47042766151046406, "grad_norm": 0.6297922247581061, "learning_rate": 9.783585215700656e-06, "loss": 0.0704, "step": 1034 }, { "epoch": 0.4708826205641492, "grad_norm": 1.1014615381108162, "learning_rate": 9.783169066878208e-06, "loss": 0.1063, "step": 1035 }, { "epoch": 0.4713375796178344, "grad_norm": 0.7370811970547196, "learning_rate": 9.782752527196785e-06, "loss": 0.0888, "step": 1036 }, { "epoch": 0.47179253867151955, "grad_norm": 0.6272964856361817, "learning_rate": 9.782335596690425e-06, "loss": 0.0683, "step": 1037 }, { "epoch": 0.47224749772520475, "grad_norm": 0.9675945822898259, "learning_rate": 9.781918275393196e-06, "loss": 0.1031, "step": 1038 }, { "epoch": 0.4727024567788899, "grad_norm": 0.8448129794628584, "learning_rate": 9.781500563339202e-06, "loss": 0.0818, "step": 1039 }, { "epoch": 0.4731574158325751, "grad_norm": 0.5148120993988892, "learning_rate": 9.781082460562574e-06, "loss": 0.0525, "step": 1040 }, { "epoch": 0.47361237488626023, "grad_norm": 0.7767251927940846, "learning_rate": 9.780663967097477e-06, "loss": 0.0869, "step": 1041 }, { "epoch": 0.4740673339399454, "grad_norm": 0.9661754574144388, "learning_rate": 9.780245082978112e-06, "loss": 0.0923, "step": 1042 }, { "epoch": 0.4745222929936306, "grad_norm": 0.780061387882855, "learning_rate": 9.779825808238705e-06, "loss": 0.095, "step": 1043 }, { "epoch": 0.4749772520473157, "grad_norm": 0.8513172657519864, "learning_rate": 9.77940614291352e-06, "loss": 0.0772, "step": 1044 }, { "epoch": 0.4754322111010009, "grad_norm": 0.6199453465731616, "learning_rate": 9.778986087036846e-06, "loss": 0.0701, "step": 1045 }, { "epoch": 0.47588717015468607, "grad_norm": 0.5327629714743946, "learning_rate": 9.778565640643011e-06, "loss": 0.0447, "step": 1046 }, { "epoch": 0.47634212920837127, "grad_norm": 0.8882337205809296, "learning_rate": 9.778144803766375e-06, "loss": 0.0788, "step": 1047 }, { "epoch": 0.4767970882620564, "grad_norm": 0.6023343672839219, "learning_rate": 9.77772357644132e-06, "loss": 0.0693, "step": 1048 }, { "epoch": 0.47725204731574156, "grad_norm": 0.8031515985448552, "learning_rate": 9.777301958702273e-06, "loss": 0.0911, "step": 1049 }, { "epoch": 0.47770700636942676, "grad_norm": 0.8695877166802147, "learning_rate": 9.776879950583683e-06, "loss": 0.12, "step": 1050 }, { "epoch": 0.4781619654231119, "grad_norm": 0.6077253389668626, "learning_rate": 9.776457552120034e-06, "loss": 0.0722, "step": 1051 }, { "epoch": 0.4786169244767971, "grad_norm": 0.7976020915977983, "learning_rate": 9.776034763345845e-06, "loss": 0.0783, "step": 1052 }, { "epoch": 0.47907188353048225, "grad_norm": 0.7091049596783572, "learning_rate": 9.775611584295663e-06, "loss": 0.0739, "step": 1053 }, { "epoch": 0.47952684258416745, "grad_norm": 0.7919907245184465, "learning_rate": 9.775188015004072e-06, "loss": 0.0728, "step": 1054 }, { "epoch": 0.4799818016378526, "grad_norm": 0.9227645018819045, "learning_rate": 9.774764055505676e-06, "loss": 0.0905, "step": 1055 }, { "epoch": 0.48043676069153773, "grad_norm": 0.7130315690029604, "learning_rate": 9.774339705835127e-06, "loss": 0.09, "step": 1056 }, { "epoch": 0.48089171974522293, "grad_norm": 0.7993270676292756, "learning_rate": 9.773914966027098e-06, "loss": 0.1011, "step": 1057 }, { "epoch": 0.4813466787989081, "grad_norm": 0.8955668988276211, "learning_rate": 9.773489836116297e-06, "loss": 0.0963, "step": 1058 }, { "epoch": 0.4818016378525933, "grad_norm": 0.7582155580680914, "learning_rate": 9.773064316137464e-06, "loss": 0.0766, "step": 1059 }, { "epoch": 0.4822565969062784, "grad_norm": 0.6939955066308027, "learning_rate": 9.772638406125367e-06, "loss": 0.0687, "step": 1060 }, { "epoch": 0.4827115559599636, "grad_norm": 0.8091635860789653, "learning_rate": 9.772212106114816e-06, "loss": 0.0754, "step": 1061 }, { "epoch": 0.48316651501364877, "grad_norm": 0.8236012040739623, "learning_rate": 9.77178541614064e-06, "loss": 0.0951, "step": 1062 }, { "epoch": 0.48362147406733397, "grad_norm": 0.6622501946117725, "learning_rate": 9.77135833623771e-06, "loss": 0.083, "step": 1063 }, { "epoch": 0.4840764331210191, "grad_norm": 0.8689743387052602, "learning_rate": 9.770930866440927e-06, "loss": 0.1074, "step": 1064 }, { "epoch": 0.48453139217470426, "grad_norm": 0.6733750246744147, "learning_rate": 9.770503006785214e-06, "loss": 0.0639, "step": 1065 }, { "epoch": 0.48498635122838946, "grad_norm": 0.9485233745498586, "learning_rate": 9.770074757305541e-06, "loss": 0.1106, "step": 1066 }, { "epoch": 0.4854413102820746, "grad_norm": 0.8288392949652397, "learning_rate": 9.769646118036902e-06, "loss": 0.0661, "step": 1067 }, { "epoch": 0.4858962693357598, "grad_norm": 0.7475423805914638, "learning_rate": 9.76921708901432e-06, "loss": 0.0686, "step": 1068 }, { "epoch": 0.48635122838944495, "grad_norm": 0.54120364671088, "learning_rate": 9.768787670272855e-06, "loss": 0.0629, "step": 1069 }, { "epoch": 0.48680618744313015, "grad_norm": 0.7281619635509152, "learning_rate": 9.768357861847598e-06, "loss": 0.0723, "step": 1070 }, { "epoch": 0.4872611464968153, "grad_norm": 0.8883321717067604, "learning_rate": 9.767927663773668e-06, "loss": 0.0832, "step": 1071 }, { "epoch": 0.48771610555050043, "grad_norm": 0.7681469789077073, "learning_rate": 9.767497076086223e-06, "loss": 0.0786, "step": 1072 }, { "epoch": 0.48817106460418563, "grad_norm": 0.6590861395931087, "learning_rate": 9.767066098820446e-06, "loss": 0.0704, "step": 1073 }, { "epoch": 0.4886260236578708, "grad_norm": 0.7944203702948146, "learning_rate": 9.766634732011557e-06, "loss": 0.0867, "step": 1074 }, { "epoch": 0.489080982711556, "grad_norm": 0.7832480468570255, "learning_rate": 9.766202975694801e-06, "loss": 0.0873, "step": 1075 }, { "epoch": 0.4895359417652411, "grad_norm": 0.7232266679451883, "learning_rate": 9.765770829905464e-06, "loss": 0.0785, "step": 1076 }, { "epoch": 0.4899909008189263, "grad_norm": 0.5406798309730716, "learning_rate": 9.765338294678856e-06, "loss": 0.0469, "step": 1077 }, { "epoch": 0.49044585987261147, "grad_norm": 0.5866548164219128, "learning_rate": 9.764905370050321e-06, "loss": 0.0524, "step": 1078 }, { "epoch": 0.4909008189262966, "grad_norm": 0.9915720236606885, "learning_rate": 9.76447205605524e-06, "loss": 0.1019, "step": 1079 }, { "epoch": 0.4913557779799818, "grad_norm": 0.6838845303274752, "learning_rate": 9.764038352729018e-06, "loss": 0.0891, "step": 1080 }, { "epoch": 0.49181073703366696, "grad_norm": 0.9385660559352969, "learning_rate": 9.763604260107096e-06, "loss": 0.1058, "step": 1081 }, { "epoch": 0.49226569608735216, "grad_norm": 0.6710872617569944, "learning_rate": 9.763169778224946e-06, "loss": 0.0665, "step": 1082 }, { "epoch": 0.4927206551410373, "grad_norm": 0.7878885609137168, "learning_rate": 9.762734907118072e-06, "loss": 0.0876, "step": 1083 }, { "epoch": 0.4931756141947225, "grad_norm": 0.6302166766090778, "learning_rate": 9.76229964682201e-06, "loss": 0.0507, "step": 1084 }, { "epoch": 0.49363057324840764, "grad_norm": 0.5833462678864086, "learning_rate": 9.761863997372325e-06, "loss": 0.0612, "step": 1085 }, { "epoch": 0.4940855323020928, "grad_norm": 1.036522158484448, "learning_rate": 9.761427958804621e-06, "loss": 0.1395, "step": 1086 }, { "epoch": 0.494540491355778, "grad_norm": 1.1502320115946314, "learning_rate": 9.760991531154526e-06, "loss": 0.1149, "step": 1087 }, { "epoch": 0.49499545040946313, "grad_norm": 0.7616054217825209, "learning_rate": 9.760554714457704e-06, "loss": 0.0684, "step": 1088 }, { "epoch": 0.49545040946314833, "grad_norm": 0.5129309167340426, "learning_rate": 9.760117508749846e-06, "loss": 0.0614, "step": 1089 }, { "epoch": 0.4959053685168335, "grad_norm": 0.7147170789642256, "learning_rate": 9.759679914066686e-06, "loss": 0.0842, "step": 1090 }, { "epoch": 0.4963603275705187, "grad_norm": 0.7513123367978354, "learning_rate": 9.759241930443975e-06, "loss": 0.0749, "step": 1091 }, { "epoch": 0.4968152866242038, "grad_norm": 0.5462870672862663, "learning_rate": 9.75880355791751e-06, "loss": 0.0588, "step": 1092 }, { "epoch": 0.49727024567788897, "grad_norm": 0.6158644897786469, "learning_rate": 9.758364796523105e-06, "loss": 0.0578, "step": 1093 }, { "epoch": 0.49772520473157417, "grad_norm": 0.5248367448810554, "learning_rate": 9.757925646296617e-06, "loss": 0.0504, "step": 1094 }, { "epoch": 0.4981801637852593, "grad_norm": 0.7801307646100064, "learning_rate": 9.757486107273935e-06, "loss": 0.0819, "step": 1095 }, { "epoch": 0.4986351228389445, "grad_norm": 0.6822936325355138, "learning_rate": 9.75704617949097e-06, "loss": 0.0828, "step": 1096 }, { "epoch": 0.49909008189262966, "grad_norm": 0.49379397863131413, "learning_rate": 9.756605862983675e-06, "loss": 0.0606, "step": 1097 }, { "epoch": 0.49954504094631486, "grad_norm": 0.5236513133369656, "learning_rate": 9.756165157788029e-06, "loss": 0.0493, "step": 1098 }, { "epoch": 0.5, "grad_norm": 0.7323812225903658, "learning_rate": 9.755724063940047e-06, "loss": 0.0794, "step": 1099 }, { "epoch": 0.5004549590536852, "grad_norm": 0.853156508842135, "learning_rate": 9.755282581475769e-06, "loss": 0.08, "step": 1100 }, { "epoch": 0.5009099181073703, "grad_norm": 0.7117091061791435, "learning_rate": 9.754840710431274e-06, "loss": 0.0773, "step": 1101 }, { "epoch": 0.5013648771610555, "grad_norm": 0.9350752111669145, "learning_rate": 9.754398450842668e-06, "loss": 0.1046, "step": 1102 }, { "epoch": 0.5018198362147407, "grad_norm": 0.8834833642233855, "learning_rate": 9.753955802746091e-06, "loss": 0.1284, "step": 1103 }, { "epoch": 0.5022747952684259, "grad_norm": 0.9022387216275947, "learning_rate": 9.753512766177717e-06, "loss": 0.0898, "step": 1104 }, { "epoch": 0.502729754322111, "grad_norm": 0.551248880180483, "learning_rate": 9.753069341173745e-06, "loss": 0.0596, "step": 1105 }, { "epoch": 0.5031847133757962, "grad_norm": 0.5970423480352659, "learning_rate": 9.752625527770409e-06, "loss": 0.0723, "step": 1106 }, { "epoch": 0.5036396724294814, "grad_norm": 0.7620108531589319, "learning_rate": 9.75218132600398e-06, "loss": 0.0856, "step": 1107 }, { "epoch": 0.5040946314831665, "grad_norm": 0.7720887684681512, "learning_rate": 9.751736735910753e-06, "loss": 0.0904, "step": 1108 }, { "epoch": 0.5045495905368517, "grad_norm": 0.8672659681858957, "learning_rate": 9.75129175752706e-06, "loss": 0.1043, "step": 1109 }, { "epoch": 0.5050045495905369, "grad_norm": 0.7511079874116621, "learning_rate": 9.75084639088926e-06, "loss": 0.0719, "step": 1110 }, { "epoch": 0.5054595086442221, "grad_norm": 0.7442062138473109, "learning_rate": 9.750400636033746e-06, "loss": 0.0805, "step": 1111 }, { "epoch": 0.5059144676979072, "grad_norm": 0.716157443156474, "learning_rate": 9.749954492996947e-06, "loss": 0.0902, "step": 1112 }, { "epoch": 0.5063694267515924, "grad_norm": 0.7655895172099163, "learning_rate": 9.749507961815317e-06, "loss": 0.0973, "step": 1113 }, { "epoch": 0.5068243858052776, "grad_norm": 0.6288294239038802, "learning_rate": 9.749061042525343e-06, "loss": 0.0646, "step": 1114 }, { "epoch": 0.5072793448589626, "grad_norm": 0.6709452216437115, "learning_rate": 9.74861373516355e-06, "loss": 0.0717, "step": 1115 }, { "epoch": 0.5077343039126478, "grad_norm": 0.6522838269502338, "learning_rate": 9.748166039766484e-06, "loss": 0.0475, "step": 1116 }, { "epoch": 0.508189262966333, "grad_norm": 0.7999784990978867, "learning_rate": 9.747717956370735e-06, "loss": 0.0925, "step": 1117 }, { "epoch": 0.5086442220200182, "grad_norm": 1.0917998243863505, "learning_rate": 9.747269485012913e-06, "loss": 0.1293, "step": 1118 }, { "epoch": 0.5090991810737033, "grad_norm": 0.7636715530766439, "learning_rate": 9.746820625729667e-06, "loss": 0.0774, "step": 1119 }, { "epoch": 0.5095541401273885, "grad_norm": 0.6701230428761437, "learning_rate": 9.746371378557677e-06, "loss": 0.0623, "step": 1120 }, { "epoch": 0.5100090991810737, "grad_norm": 0.972334707766994, "learning_rate": 9.745921743533653e-06, "loss": 0.113, "step": 1121 }, { "epoch": 0.5104640582347588, "grad_norm": 0.6630727679984025, "learning_rate": 9.745471720694335e-06, "loss": 0.0828, "step": 1122 }, { "epoch": 0.510919017288444, "grad_norm": 0.8798279960192045, "learning_rate": 9.745021310076498e-06, "loss": 0.0772, "step": 1123 }, { "epoch": 0.5113739763421292, "grad_norm": 0.6337737332675445, "learning_rate": 9.744570511716952e-06, "loss": 0.0805, "step": 1124 }, { "epoch": 0.5118289353958144, "grad_norm": 0.9171053674032225, "learning_rate": 9.744119325652526e-06, "loss": 0.0901, "step": 1125 }, { "epoch": 0.5122838944494995, "grad_norm": 0.7437420002919692, "learning_rate": 9.743667751920093e-06, "loss": 0.0789, "step": 1126 }, { "epoch": 0.5127388535031847, "grad_norm": 0.692440215965907, "learning_rate": 9.743215790556556e-06, "loss": 0.0885, "step": 1127 }, { "epoch": 0.5131938125568699, "grad_norm": 0.5830998661595514, "learning_rate": 9.742763441598841e-06, "loss": 0.0571, "step": 1128 }, { "epoch": 0.513648771610555, "grad_norm": 0.7409283851806759, "learning_rate": 9.742310705083919e-06, "loss": 0.0819, "step": 1129 }, { "epoch": 0.5141037306642402, "grad_norm": 0.6329559817029019, "learning_rate": 9.74185758104878e-06, "loss": 0.0732, "step": 1130 }, { "epoch": 0.5145586897179254, "grad_norm": 0.47102788261692413, "learning_rate": 9.741404069530455e-06, "loss": 0.0496, "step": 1131 }, { "epoch": 0.5150136487716106, "grad_norm": 0.7193278988032876, "learning_rate": 9.740950170566002e-06, "loss": 0.0797, "step": 1132 }, { "epoch": 0.5154686078252957, "grad_norm": 0.7827454423152818, "learning_rate": 9.740495884192509e-06, "loss": 0.0863, "step": 1133 }, { "epoch": 0.5159235668789809, "grad_norm": 0.5187125000260286, "learning_rate": 9.740041210447101e-06, "loss": 0.048, "step": 1134 }, { "epoch": 0.5163785259326661, "grad_norm": 0.7621657915309645, "learning_rate": 9.739586149366932e-06, "loss": 0.076, "step": 1135 }, { "epoch": 0.5168334849863512, "grad_norm": 1.0691498364952807, "learning_rate": 9.739130700989185e-06, "loss": 0.1085, "step": 1136 }, { "epoch": 0.5172884440400364, "grad_norm": 1.126943089011516, "learning_rate": 9.738674865351081e-06, "loss": 0.1197, "step": 1137 }, { "epoch": 0.5177434030937216, "grad_norm": 0.5967935472543325, "learning_rate": 9.738218642489864e-06, "loss": 0.0715, "step": 1138 }, { "epoch": 0.5181983621474068, "grad_norm": 0.6520369417533736, "learning_rate": 9.73776203244282e-06, "loss": 0.0812, "step": 1139 }, { "epoch": 0.5186533212010919, "grad_norm": 0.6923655317783546, "learning_rate": 9.737305035247258e-06, "loss": 0.0607, "step": 1140 }, { "epoch": 0.5191082802547771, "grad_norm": 0.5971267035932937, "learning_rate": 9.73684765094052e-06, "loss": 0.0597, "step": 1141 }, { "epoch": 0.5195632393084623, "grad_norm": 0.6102979031011873, "learning_rate": 9.736389879559984e-06, "loss": 0.0464, "step": 1142 }, { "epoch": 0.5200181983621474, "grad_norm": 0.5971210330968472, "learning_rate": 9.735931721143058e-06, "loss": 0.0674, "step": 1143 }, { "epoch": 0.5204731574158326, "grad_norm": 0.9014574419537533, "learning_rate": 9.735473175727178e-06, "loss": 0.1071, "step": 1144 }, { "epoch": 0.5209281164695178, "grad_norm": 1.024240239778721, "learning_rate": 9.735014243349814e-06, "loss": 0.1058, "step": 1145 }, { "epoch": 0.521383075523203, "grad_norm": 0.740240244958144, "learning_rate": 9.73455492404847e-06, "loss": 0.0716, "step": 1146 }, { "epoch": 0.521838034576888, "grad_norm": 0.8552793125149327, "learning_rate": 9.734095217860679e-06, "loss": 0.1116, "step": 1147 }, { "epoch": 0.5222929936305732, "grad_norm": 0.8388846880500271, "learning_rate": 9.733635124824007e-06, "loss": 0.1195, "step": 1148 }, { "epoch": 0.5227479526842584, "grad_norm": 0.7476616795889469, "learning_rate": 9.733174644976047e-06, "loss": 0.0982, "step": 1149 }, { "epoch": 0.5232029117379435, "grad_norm": 1.247104578949049, "learning_rate": 9.732713778354431e-06, "loss": 0.1339, "step": 1150 }, { "epoch": 0.5236578707916287, "grad_norm": 0.8127429979477634, "learning_rate": 9.732252524996818e-06, "loss": 0.0994, "step": 1151 }, { "epoch": 0.5241128298453139, "grad_norm": 1.1678300434583342, "learning_rate": 9.731790884940899e-06, "loss": 0.1152, "step": 1152 }, { "epoch": 0.5245677888989991, "grad_norm": 0.5209287069427062, "learning_rate": 9.731328858224398e-06, "loss": 0.0546, "step": 1153 }, { "epoch": 0.5250227479526842, "grad_norm": 0.8363023252623251, "learning_rate": 9.730866444885069e-06, "loss": 0.0894, "step": 1154 }, { "epoch": 0.5254777070063694, "grad_norm": 0.8202924553152645, "learning_rate": 9.730403644960697e-06, "loss": 0.0914, "step": 1155 }, { "epoch": 0.5259326660600546, "grad_norm": 0.4900409376406188, "learning_rate": 9.729940458489105e-06, "loss": 0.0454, "step": 1156 }, { "epoch": 0.5263876251137397, "grad_norm": 0.5631225499534328, "learning_rate": 9.729476885508136e-06, "loss": 0.0542, "step": 1157 }, { "epoch": 0.5268425841674249, "grad_norm": 0.566596895824316, "learning_rate": 9.729012926055674e-06, "loss": 0.0625, "step": 1158 }, { "epoch": 0.5272975432211101, "grad_norm": 0.9035766920121469, "learning_rate": 9.728548580169632e-06, "loss": 0.1013, "step": 1159 }, { "epoch": 0.5277525022747953, "grad_norm": 0.8241016260766749, "learning_rate": 9.728083847887955e-06, "loss": 0.078, "step": 1160 }, { "epoch": 0.5282074613284804, "grad_norm": 0.7435557294319748, "learning_rate": 9.727618729248617e-06, "loss": 0.0864, "step": 1161 }, { "epoch": 0.5286624203821656, "grad_norm": 0.6611375262646607, "learning_rate": 9.727153224289627e-06, "loss": 0.0769, "step": 1162 }, { "epoch": 0.5291173794358508, "grad_norm": 0.8275931946782299, "learning_rate": 9.726687333049024e-06, "loss": 0.0889, "step": 1163 }, { "epoch": 0.5295723384895359, "grad_norm": 1.057751919756087, "learning_rate": 9.726221055564874e-06, "loss": 0.0851, "step": 1164 }, { "epoch": 0.5300272975432211, "grad_norm": 0.7884543920060787, "learning_rate": 9.725754391875287e-06, "loss": 0.0746, "step": 1165 }, { "epoch": 0.5304822565969063, "grad_norm": 0.8593529313000522, "learning_rate": 9.72528734201839e-06, "loss": 0.0828, "step": 1166 }, { "epoch": 0.5309372156505915, "grad_norm": 0.5225417485901063, "learning_rate": 9.72481990603235e-06, "loss": 0.0794, "step": 1167 }, { "epoch": 0.5313921747042766, "grad_norm": 0.8820660720540598, "learning_rate": 9.724352083955366e-06, "loss": 0.1059, "step": 1168 }, { "epoch": 0.5318471337579618, "grad_norm": 0.6775105748188827, "learning_rate": 9.723883875825664e-06, "loss": 0.079, "step": 1169 }, { "epoch": 0.532302092811647, "grad_norm": 0.5969175177573056, "learning_rate": 9.723415281681505e-06, "loss": 0.061, "step": 1170 }, { "epoch": 0.5327570518653321, "grad_norm": 0.7165111743049339, "learning_rate": 9.722946301561179e-06, "loss": 0.0824, "step": 1171 }, { "epoch": 0.5332120109190173, "grad_norm": 0.7771351455478163, "learning_rate": 9.722476935503011e-06, "loss": 0.0936, "step": 1172 }, { "epoch": 0.5336669699727025, "grad_norm": 0.5612071801020553, "learning_rate": 9.722007183545353e-06, "loss": 0.0584, "step": 1173 }, { "epoch": 0.5341219290263877, "grad_norm": 0.7630759308283642, "learning_rate": 9.721537045726594e-06, "loss": 0.0711, "step": 1174 }, { "epoch": 0.5345768880800728, "grad_norm": 0.7415951616336062, "learning_rate": 9.721066522085148e-06, "loss": 0.0786, "step": 1175 }, { "epoch": 0.535031847133758, "grad_norm": 0.6697058559185771, "learning_rate": 9.720595612659467e-06, "loss": 0.0943, "step": 1176 }, { "epoch": 0.5354868061874432, "grad_norm": 0.8294561042543531, "learning_rate": 9.720124317488031e-06, "loss": 0.0766, "step": 1177 }, { "epoch": 0.5359417652411284, "grad_norm": 0.8069252663248169, "learning_rate": 9.719652636609351e-06, "loss": 0.1036, "step": 1178 }, { "epoch": 0.5363967242948134, "grad_norm": 0.5216393236723873, "learning_rate": 9.719180570061973e-06, "loss": 0.0681, "step": 1179 }, { "epoch": 0.5368516833484986, "grad_norm": 0.7561882785891234, "learning_rate": 9.718708117884468e-06, "loss": 0.0888, "step": 1180 }, { "epoch": 0.5373066424021838, "grad_norm": 0.7101886443887773, "learning_rate": 9.718235280115446e-06, "loss": 0.0841, "step": 1181 }, { "epoch": 0.5377616014558689, "grad_norm": 0.93883085852681, "learning_rate": 9.717762056793545e-06, "loss": 0.1116, "step": 1182 }, { "epoch": 0.5382165605095541, "grad_norm": 0.8029318164759022, "learning_rate": 9.717288447957433e-06, "loss": 0.0817, "step": 1183 }, { "epoch": 0.5386715195632393, "grad_norm": 0.7189629467174897, "learning_rate": 9.716814453645811e-06, "loss": 0.0913, "step": 1184 }, { "epoch": 0.5391264786169245, "grad_norm": 0.6194922793353296, "learning_rate": 9.716340073897414e-06, "loss": 0.073, "step": 1185 }, { "epoch": 0.5395814376706096, "grad_norm": 0.5862599296496694, "learning_rate": 9.715865308751006e-06, "loss": 0.0599, "step": 1186 }, { "epoch": 0.5400363967242948, "grad_norm": 1.0638863826866105, "learning_rate": 9.715390158245381e-06, "loss": 0.1412, "step": 1187 }, { "epoch": 0.54049135577798, "grad_norm": 0.6031416289368001, "learning_rate": 9.714914622419367e-06, "loss": 0.0694, "step": 1188 }, { "epoch": 0.5409463148316651, "grad_norm": 0.5762096954254395, "learning_rate": 9.714438701311822e-06, "loss": 0.0627, "step": 1189 }, { "epoch": 0.5414012738853503, "grad_norm": 0.6077021479661606, "learning_rate": 9.713962394961636e-06, "loss": 0.067, "step": 1190 }, { "epoch": 0.5418562329390355, "grad_norm": 0.5381873559759192, "learning_rate": 9.713485703407732e-06, "loss": 0.0595, "step": 1191 }, { "epoch": 0.5423111919927207, "grad_norm": 0.7866618609648011, "learning_rate": 9.713008626689063e-06, "loss": 0.1064, "step": 1192 }, { "epoch": 0.5427661510464058, "grad_norm": 0.7100862231154079, "learning_rate": 9.712531164844611e-06, "loss": 0.07, "step": 1193 }, { "epoch": 0.543221110100091, "grad_norm": 0.5579932774059501, "learning_rate": 9.712053317913394e-06, "loss": 0.0525, "step": 1194 }, { "epoch": 0.5436760691537762, "grad_norm": 0.5454543895601387, "learning_rate": 9.711575085934459e-06, "loss": 0.0741, "step": 1195 }, { "epoch": 0.5441310282074613, "grad_norm": 0.6754854519258514, "learning_rate": 9.711096468946888e-06, "loss": 0.101, "step": 1196 }, { "epoch": 0.5445859872611465, "grad_norm": 0.8125002765504534, "learning_rate": 9.710617466989787e-06, "loss": 0.0937, "step": 1197 }, { "epoch": 0.5450409463148317, "grad_norm": 0.5893498973936582, "learning_rate": 9.710138080102298e-06, "loss": 0.0658, "step": 1198 }, { "epoch": 0.5454959053685169, "grad_norm": 0.8107633297228217, "learning_rate": 9.709658308323597e-06, "loss": 0.0955, "step": 1199 }, { "epoch": 0.545950864422202, "grad_norm": 0.6726060122769176, "learning_rate": 9.70917815169289e-06, "loss": 0.084, "step": 1200 }, { "epoch": 0.5464058234758872, "grad_norm": 0.6077011277694447, "learning_rate": 9.708697610249407e-06, "loss": 0.0756, "step": 1201 }, { "epoch": 0.5468607825295724, "grad_norm": 0.7073007110523803, "learning_rate": 9.70821668403242e-06, "loss": 0.0818, "step": 1202 }, { "epoch": 0.5473157415832575, "grad_norm": 0.9420816064988972, "learning_rate": 9.707735373081231e-06, "loss": 0.1197, "step": 1203 }, { "epoch": 0.5477707006369427, "grad_norm": 0.552138579735494, "learning_rate": 9.707253677435165e-06, "loss": 0.0594, "step": 1204 }, { "epoch": 0.5482256596906279, "grad_norm": 0.6375758502862188, "learning_rate": 9.706771597133587e-06, "loss": 0.0572, "step": 1205 }, { "epoch": 0.5486806187443131, "grad_norm": 0.6581691945271008, "learning_rate": 9.706289132215889e-06, "loss": 0.0707, "step": 1206 }, { "epoch": 0.5491355777979982, "grad_norm": 0.820106985355047, "learning_rate": 9.705806282721498e-06, "loss": 0.0865, "step": 1207 }, { "epoch": 0.5495905368516834, "grad_norm": 0.5258555939105785, "learning_rate": 9.705323048689866e-06, "loss": 0.0462, "step": 1208 }, { "epoch": 0.5500454959053686, "grad_norm": 0.7818892498713288, "learning_rate": 9.704839430160487e-06, "loss": 0.1005, "step": 1209 }, { "epoch": 0.5505004549590536, "grad_norm": 0.6371281646305975, "learning_rate": 9.704355427172874e-06, "loss": 0.0712, "step": 1210 }, { "epoch": 0.5509554140127388, "grad_norm": 0.5981165031558572, "learning_rate": 9.70387103976658e-06, "loss": 0.0669, "step": 1211 }, { "epoch": 0.551410373066424, "grad_norm": 0.640233382171881, "learning_rate": 9.703386267981188e-06, "loss": 0.0629, "step": 1212 }, { "epoch": 0.5518653321201092, "grad_norm": 0.5436666812285462, "learning_rate": 9.70290111185631e-06, "loss": 0.0527, "step": 1213 }, { "epoch": 0.5523202911737943, "grad_norm": 0.9264418893677014, "learning_rate": 9.702415571431594e-06, "loss": 0.1392, "step": 1214 }, { "epoch": 0.5527752502274795, "grad_norm": 0.6659444469982292, "learning_rate": 9.70192964674671e-06, "loss": 0.0948, "step": 1215 }, { "epoch": 0.5532302092811647, "grad_norm": 0.5526163080676849, "learning_rate": 9.70144333784137e-06, "loss": 0.0661, "step": 1216 }, { "epoch": 0.5536851683348498, "grad_norm": 0.7994476768514381, "learning_rate": 9.700956644755313e-06, "loss": 0.0966, "step": 1217 }, { "epoch": 0.554140127388535, "grad_norm": 0.7919884013199107, "learning_rate": 9.700469567528307e-06, "loss": 0.1082, "step": 1218 }, { "epoch": 0.5545950864422202, "grad_norm": 0.7366932972024113, "learning_rate": 9.699982106200155e-06, "loss": 0.0841, "step": 1219 }, { "epoch": 0.5550500454959054, "grad_norm": 0.8558659635343526, "learning_rate": 9.699494260810692e-06, "loss": 0.0866, "step": 1220 }, { "epoch": 0.5555050045495905, "grad_norm": 0.8060928626360002, "learning_rate": 9.699006031399779e-06, "loss": 0.0777, "step": 1221 }, { "epoch": 0.5559599636032757, "grad_norm": 0.6914626835020681, "learning_rate": 9.698517418007314e-06, "loss": 0.0775, "step": 1222 }, { "epoch": 0.5564149226569609, "grad_norm": 0.8706739684427142, "learning_rate": 9.698028420673224e-06, "loss": 0.0984, "step": 1223 }, { "epoch": 0.556869881710646, "grad_norm": 0.7863016327992207, "learning_rate": 9.697539039437468e-06, "loss": 0.1118, "step": 1224 }, { "epoch": 0.5573248407643312, "grad_norm": 0.7719453440565228, "learning_rate": 9.697049274340036e-06, "loss": 0.0824, "step": 1225 }, { "epoch": 0.5577797998180164, "grad_norm": 1.1509899845731206, "learning_rate": 9.696559125420949e-06, "loss": 0.1254, "step": 1226 }, { "epoch": 0.5582347588717016, "grad_norm": 0.5202193771917482, "learning_rate": 9.696068592720257e-06, "loss": 0.0538, "step": 1227 }, { "epoch": 0.5586897179253867, "grad_norm": 0.5880633286090164, "learning_rate": 9.69557767627805e-06, "loss": 0.0711, "step": 1228 }, { "epoch": 0.5591446769790719, "grad_norm": 0.6342846572654288, "learning_rate": 9.695086376134438e-06, "loss": 0.0671, "step": 1229 }, { "epoch": 0.5595996360327571, "grad_norm": 0.7541651906429654, "learning_rate": 9.694594692329571e-06, "loss": 0.0813, "step": 1230 }, { "epoch": 0.5600545950864422, "grad_norm": 0.6416731945433944, "learning_rate": 9.694102624903627e-06, "loss": 0.0733, "step": 1231 }, { "epoch": 0.5605095541401274, "grad_norm": 1.0012992796464886, "learning_rate": 9.693610173896815e-06, "loss": 0.096, "step": 1232 }, { "epoch": 0.5609645131938126, "grad_norm": 0.725396699259508, "learning_rate": 9.693117339349376e-06, "loss": 0.0665, "step": 1233 }, { "epoch": 0.5614194722474978, "grad_norm": 0.7481457641805567, "learning_rate": 9.692624121301581e-06, "loss": 0.0715, "step": 1234 }, { "epoch": 0.5618744313011829, "grad_norm": 0.969766282604155, "learning_rate": 9.692130519793734e-06, "loss": 0.0991, "step": 1235 }, { "epoch": 0.5623293903548681, "grad_norm": 0.8522169509206354, "learning_rate": 9.691636534866172e-06, "loss": 0.1025, "step": 1236 }, { "epoch": 0.5627843494085533, "grad_norm": 0.7682304561659135, "learning_rate": 9.691142166559259e-06, "loss": 0.0846, "step": 1237 }, { "epoch": 0.5632393084622384, "grad_norm": 0.5495617218791536, "learning_rate": 9.690647414913392e-06, "loss": 0.0766, "step": 1238 }, { "epoch": 0.5636942675159236, "grad_norm": 0.6826816911759014, "learning_rate": 9.690152279969003e-06, "loss": 0.0729, "step": 1239 }, { "epoch": 0.5641492265696088, "grad_norm": 0.8352406959674302, "learning_rate": 9.689656761766548e-06, "loss": 0.0896, "step": 1240 }, { "epoch": 0.564604185623294, "grad_norm": 0.5908696548320724, "learning_rate": 9.689160860346522e-06, "loss": 0.0753, "step": 1241 }, { "epoch": 0.565059144676979, "grad_norm": 0.4283914528398344, "learning_rate": 9.688664575749447e-06, "loss": 0.0414, "step": 1242 }, { "epoch": 0.5655141037306642, "grad_norm": 0.6584468440229382, "learning_rate": 9.688167908015877e-06, "loss": 0.0733, "step": 1243 }, { "epoch": 0.5659690627843494, "grad_norm": 0.9211218848648471, "learning_rate": 9.687670857186396e-06, "loss": 0.1171, "step": 1244 }, { "epoch": 0.5664240218380345, "grad_norm": 0.9250852893692096, "learning_rate": 9.68717342330162e-06, "loss": 0.1061, "step": 1245 }, { "epoch": 0.5668789808917197, "grad_norm": 0.8688266055790496, "learning_rate": 9.686675606402203e-06, "loss": 0.1213, "step": 1246 }, { "epoch": 0.5673339399454049, "grad_norm": 0.7110325678190088, "learning_rate": 9.686177406528819e-06, "loss": 0.0836, "step": 1247 }, { "epoch": 0.5677888989990901, "grad_norm": 0.8260984800022192, "learning_rate": 9.685678823722178e-06, "loss": 0.0907, "step": 1248 }, { "epoch": 0.5682438580527752, "grad_norm": 0.6625042460625208, "learning_rate": 9.685179858023026e-06, "loss": 0.0777, "step": 1249 }, { "epoch": 0.5686988171064604, "grad_norm": 0.711324638729454, "learning_rate": 9.684680509472133e-06, "loss": 0.0815, "step": 1250 }, { "epoch": 0.5691537761601456, "grad_norm": 0.6863010294874783, "learning_rate": 9.684180778110306e-06, "loss": 0.0642, "step": 1251 }, { "epoch": 0.5696087352138307, "grad_norm": 0.5978880624303593, "learning_rate": 9.683680663978377e-06, "loss": 0.065, "step": 1252 }, { "epoch": 0.5700636942675159, "grad_norm": 0.6322068932784428, "learning_rate": 9.683180167117216e-06, "loss": 0.0681, "step": 1253 }, { "epoch": 0.5705186533212011, "grad_norm": 0.7826720403434554, "learning_rate": 9.682679287567722e-06, "loss": 0.0881, "step": 1254 }, { "epoch": 0.5709736123748863, "grad_norm": 0.794807695787425, "learning_rate": 9.682178025370824e-06, "loss": 0.1118, "step": 1255 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7050268620804678, "learning_rate": 9.681676380567482e-06, "loss": 0.0839, "step": 1256 }, { "epoch": 0.5718835304822566, "grad_norm": 0.5581694578677082, "learning_rate": 9.681174353198687e-06, "loss": 0.0482, "step": 1257 }, { "epoch": 0.5723384895359418, "grad_norm": 0.6766600070725707, "learning_rate": 9.680671943305465e-06, "loss": 0.0679, "step": 1258 }, { "epoch": 0.5727934485896269, "grad_norm": 0.6995276308642288, "learning_rate": 9.680169150928868e-06, "loss": 0.0823, "step": 1259 }, { "epoch": 0.5732484076433121, "grad_norm": 0.6008334474427011, "learning_rate": 9.679665976109985e-06, "loss": 0.0669, "step": 1260 }, { "epoch": 0.5737033666969973, "grad_norm": 0.6951316344905618, "learning_rate": 9.679162418889932e-06, "loss": 0.0644, "step": 1261 }, { "epoch": 0.5741583257506825, "grad_norm": 0.7661270676130627, "learning_rate": 9.678658479309854e-06, "loss": 0.0837, "step": 1262 }, { "epoch": 0.5746132848043676, "grad_norm": 0.7593531327031607, "learning_rate": 9.678154157410937e-06, "loss": 0.0646, "step": 1263 }, { "epoch": 0.5750682438580528, "grad_norm": 0.7824619403016152, "learning_rate": 9.677649453234388e-06, "loss": 0.0907, "step": 1264 }, { "epoch": 0.575523202911738, "grad_norm": 0.8187746029529864, "learning_rate": 9.67714436682145e-06, "loss": 0.0906, "step": 1265 }, { "epoch": 0.5759781619654231, "grad_norm": 0.7676559233650921, "learning_rate": 9.676638898213394e-06, "loss": 0.0839, "step": 1266 }, { "epoch": 0.5764331210191083, "grad_norm": 0.5944493207466681, "learning_rate": 9.676133047451528e-06, "loss": 0.0588, "step": 1267 }, { "epoch": 0.5768880800727935, "grad_norm": 0.6734586229257056, "learning_rate": 9.675626814577188e-06, "loss": 0.0804, "step": 1268 }, { "epoch": 0.5773430391264787, "grad_norm": 0.6315388478681175, "learning_rate": 9.675120199631738e-06, "loss": 0.0636, "step": 1269 }, { "epoch": 0.5777979981801638, "grad_norm": 0.7252277920198784, "learning_rate": 9.674613202656577e-06, "loss": 0.0842, "step": 1270 }, { "epoch": 0.578252957233849, "grad_norm": 0.58556718084403, "learning_rate": 9.674105823693139e-06, "loss": 0.0764, "step": 1271 }, { "epoch": 0.5787079162875342, "grad_norm": 0.7635901125586164, "learning_rate": 9.673598062782878e-06, "loss": 0.0907, "step": 1272 }, { "epoch": 0.5791628753412192, "grad_norm": 0.33852379656119563, "learning_rate": 9.67308991996729e-06, "loss": 0.0387, "step": 1273 }, { "epoch": 0.5796178343949044, "grad_norm": 0.8984557509320932, "learning_rate": 9.672581395287897e-06, "loss": 0.0969, "step": 1274 }, { "epoch": 0.5800727934485896, "grad_norm": 0.881696210059407, "learning_rate": 9.672072488786254e-06, "loss": 0.115, "step": 1275 }, { "epoch": 0.5805277525022748, "grad_norm": 0.805394208652388, "learning_rate": 9.671563200503947e-06, "loss": 0.0916, "step": 1276 }, { "epoch": 0.5809827115559599, "grad_norm": 0.5947193670178038, "learning_rate": 9.67105353048259e-06, "loss": 0.0645, "step": 1277 }, { "epoch": 0.5814376706096451, "grad_norm": 0.9345719582841384, "learning_rate": 9.670543478763834e-06, "loss": 0.0853, "step": 1278 }, { "epoch": 0.5818926296633303, "grad_norm": 0.46822310121822047, "learning_rate": 9.670033045389356e-06, "loss": 0.06, "step": 1279 }, { "epoch": 0.5823475887170154, "grad_norm": 0.882335352298928, "learning_rate": 9.669522230400868e-06, "loss": 0.1288, "step": 1280 }, { "epoch": 0.5828025477707006, "grad_norm": 0.7155876804587362, "learning_rate": 9.66901103384011e-06, "loss": 0.0923, "step": 1281 }, { "epoch": 0.5832575068243858, "grad_norm": 0.758339057709363, "learning_rate": 9.668499455748857e-06, "loss": 0.0866, "step": 1282 }, { "epoch": 0.583712465878071, "grad_norm": 0.5929990208040478, "learning_rate": 9.66798749616891e-06, "loss": 0.0571, "step": 1283 }, { "epoch": 0.5841674249317561, "grad_norm": 0.5486564328594907, "learning_rate": 9.667475155142104e-06, "loss": 0.0551, "step": 1284 }, { "epoch": 0.5846223839854413, "grad_norm": 0.6958253493282612, "learning_rate": 9.666962432710307e-06, "loss": 0.0731, "step": 1285 }, { "epoch": 0.5850773430391265, "grad_norm": 1.1984701204529857, "learning_rate": 9.666449328915418e-06, "loss": 0.1248, "step": 1286 }, { "epoch": 0.5855323020928116, "grad_norm": 1.07466414021835, "learning_rate": 9.66593584379936e-06, "loss": 0.0969, "step": 1287 }, { "epoch": 0.5859872611464968, "grad_norm": 0.7365065558485686, "learning_rate": 9.6654219774041e-06, "loss": 0.0768, "step": 1288 }, { "epoch": 0.586442220200182, "grad_norm": 0.7278778525375763, "learning_rate": 9.664907729771622e-06, "loss": 0.0931, "step": 1289 }, { "epoch": 0.5868971792538672, "grad_norm": 0.6940342908894654, "learning_rate": 9.664393100943951e-06, "loss": 0.0716, "step": 1290 }, { "epoch": 0.5873521383075523, "grad_norm": 0.7046475563496115, "learning_rate": 9.663878090963142e-06, "loss": 0.0833, "step": 1291 }, { "epoch": 0.5878070973612375, "grad_norm": 0.6554863862272154, "learning_rate": 9.663362699871275e-06, "loss": 0.0705, "step": 1292 }, { "epoch": 0.5882620564149227, "grad_norm": 0.610296786595235, "learning_rate": 9.66284692771047e-06, "loss": 0.0592, "step": 1293 }, { "epoch": 0.5887170154686078, "grad_norm": 0.6866815075031769, "learning_rate": 9.662330774522869e-06, "loss": 0.0748, "step": 1294 }, { "epoch": 0.589171974522293, "grad_norm": 0.5654106713312388, "learning_rate": 9.661814240350653e-06, "loss": 0.0546, "step": 1295 }, { "epoch": 0.5896269335759782, "grad_norm": 1.271034489401823, "learning_rate": 9.66129732523603e-06, "loss": 0.1473, "step": 1296 }, { "epoch": 0.5900818926296634, "grad_norm": 0.45734781465896296, "learning_rate": 9.66078002922124e-06, "loss": 0.0452, "step": 1297 }, { "epoch": 0.5905368516833485, "grad_norm": 0.8001910391102482, "learning_rate": 9.660262352348553e-06, "loss": 0.0801, "step": 1298 }, { "epoch": 0.5909918107370337, "grad_norm": 0.8095822615697389, "learning_rate": 9.659744294660272e-06, "loss": 0.0851, "step": 1299 }, { "epoch": 0.5914467697907189, "grad_norm": 0.6222175915293906, "learning_rate": 9.659225856198732e-06, "loss": 0.0725, "step": 1300 }, { "epoch": 0.591901728844404, "grad_norm": 0.5098172411498206, "learning_rate": 9.658707037006294e-06, "loss": 0.0586, "step": 1301 }, { "epoch": 0.5923566878980892, "grad_norm": 0.5056342525545805, "learning_rate": 9.658187837125357e-06, "loss": 0.0552, "step": 1302 }, { "epoch": 0.5928116469517744, "grad_norm": 0.8298114087640572, "learning_rate": 9.657668256598347e-06, "loss": 0.0976, "step": 1303 }, { "epoch": 0.5932666060054596, "grad_norm": 0.9354418819253106, "learning_rate": 9.657148295467719e-06, "loss": 0.1128, "step": 1304 }, { "epoch": 0.5937215650591446, "grad_norm": 0.732222390896743, "learning_rate": 9.656627953775964e-06, "loss": 0.0719, "step": 1305 }, { "epoch": 0.5941765241128298, "grad_norm": 0.817074061431315, "learning_rate": 9.6561072315656e-06, "loss": 0.097, "step": 1306 }, { "epoch": 0.594631483166515, "grad_norm": 0.6993010225350191, "learning_rate": 9.655586128879185e-06, "loss": 0.0866, "step": 1307 }, { "epoch": 0.5950864422202001, "grad_norm": 0.6036033167422408, "learning_rate": 9.655064645759291e-06, "loss": 0.0615, "step": 1308 }, { "epoch": 0.5955414012738853, "grad_norm": 0.4333029170805267, "learning_rate": 9.654542782248539e-06, "loss": 0.0333, "step": 1309 }, { "epoch": 0.5959963603275705, "grad_norm": 0.5158856954901245, "learning_rate": 9.65402053838957e-06, "loss": 0.0534, "step": 1310 }, { "epoch": 0.5964513193812557, "grad_norm": 0.8439407413306237, "learning_rate": 9.653497914225059e-06, "loss": 0.0886, "step": 1311 }, { "epoch": 0.5969062784349408, "grad_norm": 1.097335021441692, "learning_rate": 9.652974909797714e-06, "loss": 0.1184, "step": 1312 }, { "epoch": 0.597361237488626, "grad_norm": 0.6552117042192046, "learning_rate": 9.652451525150272e-06, "loss": 0.0719, "step": 1313 }, { "epoch": 0.5978161965423112, "grad_norm": 0.6353863518066384, "learning_rate": 9.651927760325504e-06, "loss": 0.0696, "step": 1314 }, { "epoch": 0.5982711555959963, "grad_norm": 0.9048456403488727, "learning_rate": 9.651403615366204e-06, "loss": 0.0859, "step": 1315 }, { "epoch": 0.5987261146496815, "grad_norm": 0.7176841695337582, "learning_rate": 9.650879090315207e-06, "loss": 0.0821, "step": 1316 }, { "epoch": 0.5991810737033667, "grad_norm": 0.696539124420045, "learning_rate": 9.650354185215374e-06, "loss": 0.0875, "step": 1317 }, { "epoch": 0.5996360327570519, "grad_norm": 0.5924500205612657, "learning_rate": 9.649828900109599e-06, "loss": 0.0646, "step": 1318 }, { "epoch": 0.600090991810737, "grad_norm": 0.5430407542910594, "learning_rate": 9.649303235040803e-06, "loss": 0.0486, "step": 1319 }, { "epoch": 0.6005459508644222, "grad_norm": 0.6459813862779727, "learning_rate": 9.648777190051944e-06, "loss": 0.0903, "step": 1320 }, { "epoch": 0.6010009099181074, "grad_norm": 0.6531397749427512, "learning_rate": 9.648250765186006e-06, "loss": 0.0638, "step": 1321 }, { "epoch": 0.6014558689717925, "grad_norm": 0.6616813941465042, "learning_rate": 9.647723960486006e-06, "loss": 0.0861, "step": 1322 }, { "epoch": 0.6019108280254777, "grad_norm": 0.8426003399558685, "learning_rate": 9.647196775994995e-06, "loss": 0.0928, "step": 1323 }, { "epoch": 0.6023657870791629, "grad_norm": 0.6908471872127779, "learning_rate": 9.646669211756049e-06, "loss": 0.064, "step": 1324 }, { "epoch": 0.6028207461328481, "grad_norm": 0.6969433310817453, "learning_rate": 9.64614126781228e-06, "loss": 0.0683, "step": 1325 }, { "epoch": 0.6032757051865332, "grad_norm": 0.7506047981065134, "learning_rate": 9.645612944206826e-06, "loss": 0.0849, "step": 1326 }, { "epoch": 0.6037306642402184, "grad_norm": 0.5624997977779479, "learning_rate": 9.645084240982862e-06, "loss": 0.064, "step": 1327 }, { "epoch": 0.6041856232939036, "grad_norm": 0.43671100502349636, "learning_rate": 9.644555158183592e-06, "loss": 0.0615, "step": 1328 }, { "epoch": 0.6046405823475887, "grad_norm": 0.553762280713577, "learning_rate": 9.64402569585225e-06, "loss": 0.0596, "step": 1329 }, { "epoch": 0.6050955414012739, "grad_norm": 0.6580653378362663, "learning_rate": 9.643495854032099e-06, "loss": 0.0558, "step": 1330 }, { "epoch": 0.6055505004549591, "grad_norm": 0.7656128172437318, "learning_rate": 9.642965632766437e-06, "loss": 0.0915, "step": 1331 }, { "epoch": 0.6060054595086443, "grad_norm": 0.49008300515141723, "learning_rate": 9.642435032098591e-06, "loss": 0.0553, "step": 1332 }, { "epoch": 0.6064604185623294, "grad_norm": 0.6058179105933948, "learning_rate": 9.64190405207192e-06, "loss": 0.0709, "step": 1333 }, { "epoch": 0.6069153776160146, "grad_norm": 0.6707142568108124, "learning_rate": 9.641372692729811e-06, "loss": 0.0715, "step": 1334 }, { "epoch": 0.6073703366696998, "grad_norm": 0.8710319334113071, "learning_rate": 9.640840954115686e-06, "loss": 0.091, "step": 1335 }, { "epoch": 0.607825295723385, "grad_norm": 0.7496993600003082, "learning_rate": 9.640308836272996e-06, "loss": 0.0932, "step": 1336 }, { "epoch": 0.60828025477707, "grad_norm": 0.9684583450547241, "learning_rate": 9.639776339245225e-06, "loss": 0.087, "step": 1337 }, { "epoch": 0.6087352138307552, "grad_norm": 0.7857186962980957, "learning_rate": 9.639243463075884e-06, "loss": 0.1084, "step": 1338 }, { "epoch": 0.6091901728844404, "grad_norm": 1.1677743182021476, "learning_rate": 9.638710207808518e-06, "loss": 0.0712, "step": 1339 }, { "epoch": 0.6096451319381255, "grad_norm": 0.725604064535932, "learning_rate": 9.6381765734867e-06, "loss": 0.077, "step": 1340 }, { "epoch": 0.6101000909918107, "grad_norm": 0.5923782964843433, "learning_rate": 9.63764256015404e-06, "loss": 0.0641, "step": 1341 }, { "epoch": 0.6105550500454959, "grad_norm": 0.7069177546563966, "learning_rate": 9.637108167854173e-06, "loss": 0.0747, "step": 1342 }, { "epoch": 0.6110100090991811, "grad_norm": 0.780384533965345, "learning_rate": 9.636573396630767e-06, "loss": 0.0709, "step": 1343 }, { "epoch": 0.6114649681528662, "grad_norm": 0.7305821703239879, "learning_rate": 9.636038246527523e-06, "loss": 0.0955, "step": 1344 }, { "epoch": 0.6119199272065514, "grad_norm": 0.6274215993935015, "learning_rate": 9.635502717588168e-06, "loss": 0.0656, "step": 1345 }, { "epoch": 0.6123748862602366, "grad_norm": 0.6018866737558257, "learning_rate": 9.634966809856465e-06, "loss": 0.0729, "step": 1346 }, { "epoch": 0.6128298453139217, "grad_norm": 0.9406786913650838, "learning_rate": 9.634430523376207e-06, "loss": 0.1105, "step": 1347 }, { "epoch": 0.6132848043676069, "grad_norm": 0.6910930219074588, "learning_rate": 9.633893858191214e-06, "loss": 0.0652, "step": 1348 }, { "epoch": 0.6137397634212921, "grad_norm": 0.6641071332456526, "learning_rate": 9.633356814345342e-06, "loss": 0.0896, "step": 1349 }, { "epoch": 0.6141947224749773, "grad_norm": 0.6463461735454817, "learning_rate": 9.632819391882475e-06, "loss": 0.0691, "step": 1350 }, { "epoch": 0.6146496815286624, "grad_norm": 0.6570738741447356, "learning_rate": 9.63228159084653e-06, "loss": 0.0726, "step": 1351 }, { "epoch": 0.6151046405823476, "grad_norm": 0.9251372605740943, "learning_rate": 9.631743411281451e-06, "loss": 0.1089, "step": 1352 }, { "epoch": 0.6155595996360328, "grad_norm": 1.0354136522724409, "learning_rate": 9.631204853231219e-06, "loss": 0.1065, "step": 1353 }, { "epoch": 0.6160145586897179, "grad_norm": 0.7577345531084587, "learning_rate": 9.630665916739839e-06, "loss": 0.083, "step": 1354 }, { "epoch": 0.6164695177434031, "grad_norm": 0.6775679844485006, "learning_rate": 9.630126601851353e-06, "loss": 0.065, "step": 1355 }, { "epoch": 0.6169244767970883, "grad_norm": 0.6510409015870585, "learning_rate": 9.62958690860983e-06, "loss": 0.0842, "step": 1356 }, { "epoch": 0.6173794358507735, "grad_norm": 0.6541401291987898, "learning_rate": 9.629046837059373e-06, "loss": 0.0809, "step": 1357 }, { "epoch": 0.6178343949044586, "grad_norm": 0.6773644747284383, "learning_rate": 9.628506387244111e-06, "loss": 0.08, "step": 1358 }, { "epoch": 0.6182893539581438, "grad_norm": 0.7401243921784199, "learning_rate": 9.627965559208212e-06, "loss": 0.0632, "step": 1359 }, { "epoch": 0.618744313011829, "grad_norm": 0.6255731586329286, "learning_rate": 9.627424352995866e-06, "loss": 0.0836, "step": 1360 }, { "epoch": 0.6191992720655141, "grad_norm": 0.8684189032240879, "learning_rate": 9.626882768651298e-06, "loss": 0.0918, "step": 1361 }, { "epoch": 0.6196542311191993, "grad_norm": 0.5565014005760545, "learning_rate": 9.626340806218765e-06, "loss": 0.0508, "step": 1362 }, { "epoch": 0.6201091901728845, "grad_norm": 0.580066419485805, "learning_rate": 9.625798465742555e-06, "loss": 0.0691, "step": 1363 }, { "epoch": 0.6205641492265697, "grad_norm": 0.5980127746625918, "learning_rate": 9.625255747266984e-06, "loss": 0.0674, "step": 1364 }, { "epoch": 0.6210191082802548, "grad_norm": 0.8518146992949526, "learning_rate": 9.6247126508364e-06, "loss": 0.1112, "step": 1365 }, { "epoch": 0.62147406733394, "grad_norm": 0.8485700961520207, "learning_rate": 9.624169176495185e-06, "loss": 0.0966, "step": 1366 }, { "epoch": 0.6219290263876252, "grad_norm": 0.9962639418238284, "learning_rate": 9.623625324287747e-06, "loss": 0.1047, "step": 1367 }, { "epoch": 0.6223839854413102, "grad_norm": 0.7706385402975253, "learning_rate": 9.623081094258527e-06, "loss": 0.1229, "step": 1368 }, { "epoch": 0.6228389444949954, "grad_norm": 0.9185957443221413, "learning_rate": 9.622536486451997e-06, "loss": 0.0981, "step": 1369 }, { "epoch": 0.6232939035486806, "grad_norm": 0.5737112203779396, "learning_rate": 9.621991500912662e-06, "loss": 0.0615, "step": 1370 }, { "epoch": 0.6237488626023658, "grad_norm": 0.8225187377418599, "learning_rate": 9.621446137685051e-06, "loss": 0.1032, "step": 1371 }, { "epoch": 0.6242038216560509, "grad_norm": 0.911993563924521, "learning_rate": 9.620900396813734e-06, "loss": 0.1052, "step": 1372 }, { "epoch": 0.6246587807097361, "grad_norm": 1.1969877300226637, "learning_rate": 9.620354278343306e-06, "loss": 0.1323, "step": 1373 }, { "epoch": 0.6251137397634213, "grad_norm": 0.49674299728731663, "learning_rate": 9.61980778231839e-06, "loss": 0.0469, "step": 1374 }, { "epoch": 0.6255686988171064, "grad_norm": 0.9419790098064809, "learning_rate": 9.619260908783645e-06, "loss": 0.0829, "step": 1375 }, { "epoch": 0.6260236578707916, "grad_norm": 0.8648992102518269, "learning_rate": 9.61871365778376e-06, "loss": 0.1227, "step": 1376 }, { "epoch": 0.6264786169244768, "grad_norm": 0.6855921150752273, "learning_rate": 9.618166029363452e-06, "loss": 0.0893, "step": 1377 }, { "epoch": 0.626933575978162, "grad_norm": 0.7460350385490577, "learning_rate": 9.61761802356747e-06, "loss": 0.1029, "step": 1378 }, { "epoch": 0.6273885350318471, "grad_norm": 0.6238948896650269, "learning_rate": 9.617069640440598e-06, "loss": 0.0671, "step": 1379 }, { "epoch": 0.6278434940855323, "grad_norm": 0.8484782740935036, "learning_rate": 9.616520880027645e-06, "loss": 0.1094, "step": 1380 }, { "epoch": 0.6282984531392175, "grad_norm": 0.4929008515621752, "learning_rate": 9.615971742373453e-06, "loss": 0.0621, "step": 1381 }, { "epoch": 0.6287534121929026, "grad_norm": 0.8230508842215047, "learning_rate": 9.615422227522897e-06, "loss": 0.0873, "step": 1382 }, { "epoch": 0.6292083712465878, "grad_norm": 0.8269677617343545, "learning_rate": 9.614872335520879e-06, "loss": 0.0996, "step": 1383 }, { "epoch": 0.629663330300273, "grad_norm": 0.7039938726965704, "learning_rate": 9.614322066412335e-06, "loss": 0.084, "step": 1384 }, { "epoch": 0.6301182893539582, "grad_norm": 0.7376546247757936, "learning_rate": 9.613771420242229e-06, "loss": 0.0857, "step": 1385 }, { "epoch": 0.6305732484076433, "grad_norm": 0.6736142636267153, "learning_rate": 9.613220397055558e-06, "loss": 0.0732, "step": 1386 }, { "epoch": 0.6310282074613285, "grad_norm": 0.7476942520500481, "learning_rate": 9.612668996897351e-06, "loss": 0.0713, "step": 1387 }, { "epoch": 0.6314831665150137, "grad_norm": 0.7359465201312233, "learning_rate": 9.612117219812662e-06, "loss": 0.0847, "step": 1388 }, { "epoch": 0.6319381255686988, "grad_norm": 0.9663363466846744, "learning_rate": 9.611565065846583e-06, "loss": 0.1015, "step": 1389 }, { "epoch": 0.632393084622384, "grad_norm": 0.7893446645403931, "learning_rate": 9.611012535044232e-06, "loss": 0.0983, "step": 1390 }, { "epoch": 0.6328480436760692, "grad_norm": 1.024989133088754, "learning_rate": 9.61045962745076e-06, "loss": 0.1102, "step": 1391 }, { "epoch": 0.6333030027297544, "grad_norm": 0.4979683651622851, "learning_rate": 9.609906343111348e-06, "loss": 0.0586, "step": 1392 }, { "epoch": 0.6337579617834395, "grad_norm": 1.1009002383858189, "learning_rate": 9.609352682071209e-06, "loss": 0.0963, "step": 1393 }, { "epoch": 0.6342129208371247, "grad_norm": 1.0522149389130615, "learning_rate": 9.608798644375583e-06, "loss": 0.1189, "step": 1394 }, { "epoch": 0.6346678798908099, "grad_norm": 0.9812979427333788, "learning_rate": 9.608244230069745e-06, "loss": 0.1216, "step": 1395 }, { "epoch": 0.635122838944495, "grad_norm": 0.7352050689297358, "learning_rate": 9.607689439199e-06, "loss": 0.0875, "step": 1396 }, { "epoch": 0.6355777979981801, "grad_norm": 0.8346962373874338, "learning_rate": 9.60713427180868e-06, "loss": 0.0872, "step": 1397 }, { "epoch": 0.6360327570518653, "grad_norm": 0.9100484302304894, "learning_rate": 9.606578727944156e-06, "loss": 0.1014, "step": 1398 }, { "epoch": 0.6364877161055505, "grad_norm": 0.6397054531308819, "learning_rate": 9.606022807650819e-06, "loss": 0.0661, "step": 1399 }, { "epoch": 0.6369426751592356, "grad_norm": 0.7013671405977515, "learning_rate": 9.6054665109741e-06, "loss": 0.0788, "step": 1400 }, { "epoch": 0.6373976342129208, "grad_norm": 0.7177935827049716, "learning_rate": 9.604909837959456e-06, "loss": 0.0739, "step": 1401 }, { "epoch": 0.637852593266606, "grad_norm": 1.0034339624615456, "learning_rate": 9.604352788652375e-06, "loss": 0.125, "step": 1402 }, { "epoch": 0.6383075523202911, "grad_norm": 0.7908500695821505, "learning_rate": 9.603795363098377e-06, "loss": 0.0626, "step": 1403 }, { "epoch": 0.6387625113739763, "grad_norm": 0.7396845097003291, "learning_rate": 9.603237561343013e-06, "loss": 0.0845, "step": 1404 }, { "epoch": 0.6392174704276615, "grad_norm": 0.6132031146325181, "learning_rate": 9.602679383431864e-06, "loss": 0.0832, "step": 1405 }, { "epoch": 0.6396724294813467, "grad_norm": 0.5848815265706712, "learning_rate": 9.602120829410539e-06, "loss": 0.0609, "step": 1406 }, { "epoch": 0.6401273885350318, "grad_norm": 1.1396916096380878, "learning_rate": 9.601561899324685e-06, "loss": 0.089, "step": 1407 }, { "epoch": 0.640582347588717, "grad_norm": 0.6243784477376835, "learning_rate": 9.601002593219972e-06, "loss": 0.0629, "step": 1408 }, { "epoch": 0.6410373066424022, "grad_norm": 0.7693306930944409, "learning_rate": 9.600442911142107e-06, "loss": 0.0975, "step": 1409 }, { "epoch": 0.6414922656960873, "grad_norm": 0.5824222441008058, "learning_rate": 9.599882853136821e-06, "loss": 0.0668, "step": 1410 }, { "epoch": 0.6419472247497725, "grad_norm": 0.7486427214965261, "learning_rate": 9.59932241924988e-06, "loss": 0.0885, "step": 1411 }, { "epoch": 0.6424021838034577, "grad_norm": 0.7403442425812181, "learning_rate": 9.598761609527084e-06, "loss": 0.0764, "step": 1412 }, { "epoch": 0.6428571428571429, "grad_norm": 0.8444168000337251, "learning_rate": 9.598200424014255e-06, "loss": 0.0901, "step": 1413 }, { "epoch": 0.643312101910828, "grad_norm": 0.6214870203253012, "learning_rate": 9.597638862757255e-06, "loss": 0.0641, "step": 1414 }, { "epoch": 0.6437670609645132, "grad_norm": 0.45639812216740483, "learning_rate": 9.597076925801967e-06, "loss": 0.0525, "step": 1415 }, { "epoch": 0.6442220200181984, "grad_norm": 0.5879645013041995, "learning_rate": 9.596514613194313e-06, "loss": 0.0664, "step": 1416 }, { "epoch": 0.6446769790718835, "grad_norm": 0.723485890557837, "learning_rate": 9.595951924980245e-06, "loss": 0.0878, "step": 1417 }, { "epoch": 0.6451319381255687, "grad_norm": 0.49190939142236517, "learning_rate": 9.595388861205738e-06, "loss": 0.0446, "step": 1418 }, { "epoch": 0.6455868971792539, "grad_norm": 0.8244975390610266, "learning_rate": 9.59482542191681e-06, "loss": 0.0927, "step": 1419 }, { "epoch": 0.6460418562329391, "grad_norm": 0.8365340393723969, "learning_rate": 9.594261607159494e-06, "loss": 0.0944, "step": 1420 }, { "epoch": 0.6464968152866242, "grad_norm": 0.9246231982112141, "learning_rate": 9.59369741697987e-06, "loss": 0.1132, "step": 1421 }, { "epoch": 0.6469517743403094, "grad_norm": 0.7576903487594321, "learning_rate": 9.593132851424036e-06, "loss": 0.0968, "step": 1422 }, { "epoch": 0.6474067333939946, "grad_norm": 0.7385455319846311, "learning_rate": 9.59256791053813e-06, "loss": 0.1045, "step": 1423 }, { "epoch": 0.6478616924476797, "grad_norm": 0.8466333605064674, "learning_rate": 9.592002594368312e-06, "loss": 0.1058, "step": 1424 }, { "epoch": 0.6483166515013649, "grad_norm": 0.9463191649116842, "learning_rate": 9.59143690296078e-06, "loss": 0.1179, "step": 1425 }, { "epoch": 0.6487716105550501, "grad_norm": 0.49506567565602905, "learning_rate": 9.590870836361758e-06, "loss": 0.0679, "step": 1426 }, { "epoch": 0.6492265696087353, "grad_norm": 0.9070193484568203, "learning_rate": 9.590304394617506e-06, "loss": 0.0889, "step": 1427 }, { "epoch": 0.6496815286624203, "grad_norm": 0.4746970963167155, "learning_rate": 9.589737577774308e-06, "loss": 0.0474, "step": 1428 }, { "epoch": 0.6501364877161055, "grad_norm": 0.7625565873276676, "learning_rate": 9.58917038587848e-06, "loss": 0.1052, "step": 1429 }, { "epoch": 0.6505914467697907, "grad_norm": 0.5544350713091404, "learning_rate": 9.588602818976374e-06, "loss": 0.0602, "step": 1430 }, { "epoch": 0.6510464058234758, "grad_norm": 0.8043877114109435, "learning_rate": 9.588034877114367e-06, "loss": 0.0714, "step": 1431 }, { "epoch": 0.651501364877161, "grad_norm": 0.6177719048805246, "learning_rate": 9.58746656033887e-06, "loss": 0.0822, "step": 1432 }, { "epoch": 0.6519563239308462, "grad_norm": 1.070732220715245, "learning_rate": 9.586897868696323e-06, "loss": 0.1203, "step": 1433 }, { "epoch": 0.6524112829845314, "grad_norm": 1.183590915899486, "learning_rate": 9.586328802233195e-06, "loss": 0.0935, "step": 1434 }, { "epoch": 0.6528662420382165, "grad_norm": 0.581772493938091, "learning_rate": 9.58575936099599e-06, "loss": 0.0682, "step": 1435 }, { "epoch": 0.6533212010919017, "grad_norm": 0.7377901301818582, "learning_rate": 9.58518954503124e-06, "loss": 0.0824, "step": 1436 }, { "epoch": 0.6537761601455869, "grad_norm": 0.9292214040800371, "learning_rate": 9.584619354385505e-06, "loss": 0.1138, "step": 1437 }, { "epoch": 0.654231119199272, "grad_norm": 0.7573270642921373, "learning_rate": 9.58404878910538e-06, "loss": 0.074, "step": 1438 }, { "epoch": 0.6546860782529572, "grad_norm": 0.5838864743945036, "learning_rate": 9.58347784923749e-06, "loss": 0.067, "step": 1439 }, { "epoch": 0.6551410373066424, "grad_norm": 0.6730458126896756, "learning_rate": 9.58290653482849e-06, "loss": 0.0632, "step": 1440 }, { "epoch": 0.6555959963603276, "grad_norm": 0.7216545389315259, "learning_rate": 9.582334845925063e-06, "loss": 0.0757, "step": 1441 }, { "epoch": 0.6560509554140127, "grad_norm": 0.929819001740202, "learning_rate": 9.581762782573926e-06, "loss": 0.0973, "step": 1442 }, { "epoch": 0.6565059144676979, "grad_norm": 0.7680577896195074, "learning_rate": 9.581190344821827e-06, "loss": 0.086, "step": 1443 }, { "epoch": 0.6569608735213831, "grad_norm": 0.8746535076926352, "learning_rate": 9.58061753271554e-06, "loss": 0.1085, "step": 1444 }, { "epoch": 0.6574158325750682, "grad_norm": 0.6364512825611769, "learning_rate": 9.580044346301875e-06, "loss": 0.0764, "step": 1445 }, { "epoch": 0.6578707916287534, "grad_norm": 0.47118649986170347, "learning_rate": 9.57947078562767e-06, "loss": 0.0506, "step": 1446 }, { "epoch": 0.6583257506824386, "grad_norm": 0.6564703457147261, "learning_rate": 9.578896850739792e-06, "loss": 0.0702, "step": 1447 }, { "epoch": 0.6587807097361238, "grad_norm": 0.6786314185300042, "learning_rate": 9.578322541685142e-06, "loss": 0.0778, "step": 1448 }, { "epoch": 0.6592356687898089, "grad_norm": 0.7866249519519628, "learning_rate": 9.577747858510647e-06, "loss": 0.1066, "step": 1449 }, { "epoch": 0.6596906278434941, "grad_norm": 0.8352652198110325, "learning_rate": 9.577172801263272e-06, "loss": 0.0973, "step": 1450 }, { "epoch": 0.6601455868971793, "grad_norm": 0.6694090591857538, "learning_rate": 9.576597369990006e-06, "loss": 0.077, "step": 1451 }, { "epoch": 0.6606005459508644, "grad_norm": 0.6613042389515336, "learning_rate": 9.576021564737871e-06, "loss": 0.0608, "step": 1452 }, { "epoch": 0.6610555050045496, "grad_norm": 0.7515982683897205, "learning_rate": 9.575445385553917e-06, "loss": 0.1003, "step": 1453 }, { "epoch": 0.6615104640582348, "grad_norm": 0.9769815693335377, "learning_rate": 9.57486883248523e-06, "loss": 0.0946, "step": 1454 }, { "epoch": 0.66196542311192, "grad_norm": 1.1665424395125852, "learning_rate": 9.574291905578922e-06, "loss": 0.1317, "step": 1455 }, { "epoch": 0.6624203821656051, "grad_norm": 0.6942177292436024, "learning_rate": 9.573714604882138e-06, "loss": 0.0615, "step": 1456 }, { "epoch": 0.6628753412192903, "grad_norm": 0.9194225981756011, "learning_rate": 9.57313693044205e-06, "loss": 0.0975, "step": 1457 }, { "epoch": 0.6633303002729755, "grad_norm": 0.7117926275391128, "learning_rate": 9.572558882305863e-06, "loss": 0.0847, "step": 1458 }, { "epoch": 0.6637852593266605, "grad_norm": 0.9546376743105418, "learning_rate": 9.571980460520815e-06, "loss": 0.1196, "step": 1459 }, { "epoch": 0.6642402183803457, "grad_norm": 0.8937437496424256, "learning_rate": 9.57140166513417e-06, "loss": 0.096, "step": 1460 }, { "epoch": 0.664695177434031, "grad_norm": 0.5937947199850856, "learning_rate": 9.570822496193225e-06, "loss": 0.058, "step": 1461 }, { "epoch": 0.6651501364877161, "grad_norm": 0.5756039867728808, "learning_rate": 9.570242953745307e-06, "loss": 0.082, "step": 1462 }, { "epoch": 0.6656050955414012, "grad_norm": 0.7416722804778516, "learning_rate": 9.569663037837776e-06, "loss": 0.098, "step": 1463 }, { "epoch": 0.6660600545950864, "grad_norm": 0.6377485683281849, "learning_rate": 9.569082748518017e-06, "loss": 0.0723, "step": 1464 }, { "epoch": 0.6665150136487716, "grad_norm": 0.7884664768500067, "learning_rate": 9.568502085833449e-06, "loss": 0.0884, "step": 1465 }, { "epoch": 0.6669699727024567, "grad_norm": 0.7723350087530905, "learning_rate": 9.567921049831522e-06, "loss": 0.0967, "step": 1466 }, { "epoch": 0.6674249317561419, "grad_norm": 0.7260885892233983, "learning_rate": 9.567339640559716e-06, "loss": 0.0812, "step": 1467 }, { "epoch": 0.6678798908098271, "grad_norm": 0.5596294621225263, "learning_rate": 9.566757858065538e-06, "loss": 0.0631, "step": 1468 }, { "epoch": 0.6683348498635123, "grad_norm": 0.7286352648100037, "learning_rate": 9.566175702396534e-06, "loss": 0.0823, "step": 1469 }, { "epoch": 0.6687898089171974, "grad_norm": 0.9301493673689373, "learning_rate": 9.565593173600271e-06, "loss": 0.0987, "step": 1470 }, { "epoch": 0.6692447679708826, "grad_norm": 0.6817718703338496, "learning_rate": 9.565010271724353e-06, "loss": 0.0755, "step": 1471 }, { "epoch": 0.6696997270245678, "grad_norm": 0.7526239018301766, "learning_rate": 9.56442699681641e-06, "loss": 0.0876, "step": 1472 }, { "epoch": 0.6701546860782529, "grad_norm": 0.7279647211742274, "learning_rate": 9.563843348924105e-06, "loss": 0.0681, "step": 1473 }, { "epoch": 0.6706096451319381, "grad_norm": 0.8487044021854026, "learning_rate": 9.563259328095132e-06, "loss": 0.0903, "step": 1474 }, { "epoch": 0.6710646041856233, "grad_norm": 0.609495225783116, "learning_rate": 9.562674934377214e-06, "loss": 0.0801, "step": 1475 }, { "epoch": 0.6715195632393085, "grad_norm": 0.7638645194963899, "learning_rate": 9.562090167818107e-06, "loss": 0.0874, "step": 1476 }, { "epoch": 0.6719745222929936, "grad_norm": 1.4076317151154771, "learning_rate": 9.561505028465593e-06, "loss": 0.0874, "step": 1477 }, { "epoch": 0.6724294813466788, "grad_norm": 0.6311161675673277, "learning_rate": 9.560919516367486e-06, "loss": 0.0738, "step": 1478 }, { "epoch": 0.672884440400364, "grad_norm": 0.638266808298586, "learning_rate": 9.560333631571634e-06, "loss": 0.0682, "step": 1479 }, { "epoch": 0.6733393994540491, "grad_norm": 0.7097356519617585, "learning_rate": 9.559747374125911e-06, "loss": 0.0987, "step": 1480 }, { "epoch": 0.6737943585077343, "grad_norm": 0.6502346745698145, "learning_rate": 9.559160744078226e-06, "loss": 0.0644, "step": 1481 }, { "epoch": 0.6742493175614195, "grad_norm": 1.056681303492363, "learning_rate": 9.558573741476513e-06, "loss": 0.0939, "step": 1482 }, { "epoch": 0.6747042766151047, "grad_norm": 0.7992268675141662, "learning_rate": 9.557986366368742e-06, "loss": 0.0733, "step": 1483 }, { "epoch": 0.6751592356687898, "grad_norm": 1.0832399406974047, "learning_rate": 9.557398618802907e-06, "loss": 0.1123, "step": 1484 }, { "epoch": 0.675614194722475, "grad_norm": 0.6543008513198456, "learning_rate": 9.556810498827039e-06, "loss": 0.0794, "step": 1485 }, { "epoch": 0.6760691537761602, "grad_norm": 0.6306597614421026, "learning_rate": 9.556222006489193e-06, "loss": 0.0786, "step": 1486 }, { "epoch": 0.6765241128298453, "grad_norm": 0.5618899284499352, "learning_rate": 9.555633141837462e-06, "loss": 0.0618, "step": 1487 }, { "epoch": 0.6769790718835305, "grad_norm": 0.6434016854657288, "learning_rate": 9.555043904919963e-06, "loss": 0.0796, "step": 1488 }, { "epoch": 0.6774340309372157, "grad_norm": 0.7512094182824542, "learning_rate": 9.554454295784848e-06, "loss": 0.0745, "step": 1489 }, { "epoch": 0.6778889899909009, "grad_norm": 0.662429978970196, "learning_rate": 9.553864314480294e-06, "loss": 0.0788, "step": 1490 }, { "epoch": 0.678343949044586, "grad_norm": 0.7125824073483379, "learning_rate": 9.553273961054514e-06, "loss": 0.072, "step": 1491 }, { "epoch": 0.6787989080982711, "grad_norm": 0.8599367957772613, "learning_rate": 9.552683235555749e-06, "loss": 0.0765, "step": 1492 }, { "epoch": 0.6792538671519563, "grad_norm": 0.7900843446637873, "learning_rate": 9.55209213803227e-06, "loss": 0.0861, "step": 1493 }, { "epoch": 0.6797088262056415, "grad_norm": 0.9492542185178791, "learning_rate": 9.551500668532377e-06, "loss": 0.1036, "step": 1494 }, { "epoch": 0.6801637852593266, "grad_norm": 0.5324340095596853, "learning_rate": 9.550908827104404e-06, "loss": 0.0509, "step": 1495 }, { "epoch": 0.6806187443130118, "grad_norm": 1.4654919772375794, "learning_rate": 9.550316613796716e-06, "loss": 0.0891, "step": 1496 }, { "epoch": 0.681073703366697, "grad_norm": 0.6964909028346599, "learning_rate": 9.549724028657698e-06, "loss": 0.0814, "step": 1497 }, { "epoch": 0.6815286624203821, "grad_norm": 0.7118346157191014, "learning_rate": 9.549131071735784e-06, "loss": 0.0711, "step": 1498 }, { "epoch": 0.6819836214740673, "grad_norm": 0.9814989838911676, "learning_rate": 9.54853774307942e-06, "loss": 0.0981, "step": 1499 }, { "epoch": 0.6824385805277525, "grad_norm": 0.8030617514029292, "learning_rate": 9.547944042737092e-06, "loss": 0.0944, "step": 1500 }, { "epoch": 0.6828935395814377, "grad_norm": 0.9091821467413523, "learning_rate": 9.547349970757317e-06, "loss": 0.1419, "step": 1501 }, { "epoch": 0.6833484986351228, "grad_norm": 0.7604842345576438, "learning_rate": 9.546755527188638e-06, "loss": 0.0616, "step": 1502 }, { "epoch": 0.683803457688808, "grad_norm": 0.7795635296832277, "learning_rate": 9.546160712079629e-06, "loss": 0.0819, "step": 1503 }, { "epoch": 0.6842584167424932, "grad_norm": 0.6155010796235886, "learning_rate": 9.545565525478896e-06, "loss": 0.0737, "step": 1504 }, { "epoch": 0.6847133757961783, "grad_norm": 0.6981564617213015, "learning_rate": 9.544969967435079e-06, "loss": 0.0786, "step": 1505 }, { "epoch": 0.6851683348498635, "grad_norm": 0.8590705218017948, "learning_rate": 9.54437403799684e-06, "loss": 0.0835, "step": 1506 }, { "epoch": 0.6856232939035487, "grad_norm": 0.8783591706447448, "learning_rate": 9.543777737212876e-06, "loss": 0.118, "step": 1507 }, { "epoch": 0.6860782529572339, "grad_norm": 0.5312480753344904, "learning_rate": 9.543181065131914e-06, "loss": 0.0535, "step": 1508 }, { "epoch": 0.686533212010919, "grad_norm": 0.6911478055364548, "learning_rate": 9.542584021802715e-06, "loss": 0.0651, "step": 1509 }, { "epoch": 0.6869881710646042, "grad_norm": 0.910176403224045, "learning_rate": 9.54198660727406e-06, "loss": 0.0916, "step": 1510 }, { "epoch": 0.6874431301182894, "grad_norm": 0.5369469100452242, "learning_rate": 9.541388821594774e-06, "loss": 0.064, "step": 1511 }, { "epoch": 0.6878980891719745, "grad_norm": 0.7242695685667516, "learning_rate": 9.540790664813702e-06, "loss": 0.0725, "step": 1512 }, { "epoch": 0.6883530482256597, "grad_norm": 0.7527422721071317, "learning_rate": 9.540192136979722e-06, "loss": 0.0863, "step": 1513 }, { "epoch": 0.6888080072793449, "grad_norm": 0.5409793571909967, "learning_rate": 9.539593238141745e-06, "loss": 0.0678, "step": 1514 }, { "epoch": 0.6892629663330301, "grad_norm": 0.5059270742296627, "learning_rate": 9.538993968348706e-06, "loss": 0.0613, "step": 1515 }, { "epoch": 0.6897179253867152, "grad_norm": 0.8092866682697022, "learning_rate": 9.538394327649581e-06, "loss": 0.0816, "step": 1516 }, { "epoch": 0.6901728844404004, "grad_norm": 0.7416822411067572, "learning_rate": 9.537794316093366e-06, "loss": 0.0736, "step": 1517 }, { "epoch": 0.6906278434940856, "grad_norm": 0.6013123530792879, "learning_rate": 9.537193933729092e-06, "loss": 0.0637, "step": 1518 }, { "epoch": 0.6910828025477707, "grad_norm": 1.0953662823641266, "learning_rate": 9.53659318060582e-06, "loss": 0.1381, "step": 1519 }, { "epoch": 0.6915377616014559, "grad_norm": 0.7906081758139587, "learning_rate": 9.535992056772639e-06, "loss": 0.088, "step": 1520 }, { "epoch": 0.6919927206551411, "grad_norm": 0.9984370937403453, "learning_rate": 9.535390562278673e-06, "loss": 0.086, "step": 1521 }, { "epoch": 0.6924476797088263, "grad_norm": 0.7438661675719108, "learning_rate": 9.53478869717307e-06, "loss": 0.0771, "step": 1522 }, { "epoch": 0.6929026387625113, "grad_norm": 0.85189844123529, "learning_rate": 9.534186461505015e-06, "loss": 0.1109, "step": 1523 }, { "epoch": 0.6933575978161965, "grad_norm": 0.7215256903381998, "learning_rate": 9.533583855323717e-06, "loss": 0.0947, "step": 1524 }, { "epoch": 0.6938125568698817, "grad_norm": 0.8936614524747819, "learning_rate": 9.532980878678422e-06, "loss": 0.0731, "step": 1525 }, { "epoch": 0.6942675159235668, "grad_norm": 0.7734700292932609, "learning_rate": 9.5323775316184e-06, "loss": 0.0844, "step": 1526 }, { "epoch": 0.694722474977252, "grad_norm": 0.7521845435610183, "learning_rate": 9.531773814192953e-06, "loss": 0.0878, "step": 1527 }, { "epoch": 0.6951774340309372, "grad_norm": 0.890089227377408, "learning_rate": 9.531169726451417e-06, "loss": 0.1128, "step": 1528 }, { "epoch": 0.6956323930846224, "grad_norm": 0.7682866565773229, "learning_rate": 9.530565268443153e-06, "loss": 0.0956, "step": 1529 }, { "epoch": 0.6960873521383075, "grad_norm": 0.9617852359873308, "learning_rate": 9.529960440217554e-06, "loss": 0.1088, "step": 1530 }, { "epoch": 0.6965423111919927, "grad_norm": 0.9775947633570551, "learning_rate": 9.529355241824045e-06, "loss": 0.107, "step": 1531 }, { "epoch": 0.6969972702456779, "grad_norm": 0.6007455012792351, "learning_rate": 9.528749673312082e-06, "loss": 0.0743, "step": 1532 }, { "epoch": 0.697452229299363, "grad_norm": 0.5419764603212612, "learning_rate": 9.528143734731143e-06, "loss": 0.0822, "step": 1533 }, { "epoch": 0.6979071883530482, "grad_norm": 0.8185575482665152, "learning_rate": 9.52753742613075e-06, "loss": 0.0832, "step": 1534 }, { "epoch": 0.6983621474067334, "grad_norm": 0.9643638751029543, "learning_rate": 9.526930747560446e-06, "loss": 0.1026, "step": 1535 }, { "epoch": 0.6988171064604186, "grad_norm": 0.8502651132594353, "learning_rate": 9.526323699069803e-06, "loss": 0.0902, "step": 1536 }, { "epoch": 0.6992720655141037, "grad_norm": 0.5376181329235236, "learning_rate": 9.525716280708428e-06, "loss": 0.068, "step": 1537 }, { "epoch": 0.6997270245677889, "grad_norm": 0.7166675033334694, "learning_rate": 9.525108492525957e-06, "loss": 0.0752, "step": 1538 }, { "epoch": 0.7001819836214741, "grad_norm": 0.43432195935007917, "learning_rate": 9.524500334572054e-06, "loss": 0.0417, "step": 1539 }, { "epoch": 0.7006369426751592, "grad_norm": 0.8369054167821826, "learning_rate": 9.523891806896417e-06, "loss": 0.1098, "step": 1540 }, { "epoch": 0.7010919017288444, "grad_norm": 0.49781336551041033, "learning_rate": 9.523282909548773e-06, "loss": 0.0618, "step": 1541 }, { "epoch": 0.7015468607825296, "grad_norm": 0.9187882410427298, "learning_rate": 9.522673642578873e-06, "loss": 0.1247, "step": 1542 }, { "epoch": 0.7020018198362148, "grad_norm": 0.5007920591193696, "learning_rate": 9.522064006036509e-06, "loss": 0.0601, "step": 1543 }, { "epoch": 0.7024567788898999, "grad_norm": 0.582945252861272, "learning_rate": 9.521453999971497e-06, "loss": 0.0585, "step": 1544 }, { "epoch": 0.7029117379435851, "grad_norm": 0.5749885951853907, "learning_rate": 9.520843624433681e-06, "loss": 0.0664, "step": 1545 }, { "epoch": 0.7033666969972703, "grad_norm": 0.9724598324631707, "learning_rate": 9.520232879472942e-06, "loss": 0.1199, "step": 1546 }, { "epoch": 0.7038216560509554, "grad_norm": 1.0592052108390146, "learning_rate": 9.519621765139181e-06, "loss": 0.1278, "step": 1547 }, { "epoch": 0.7042766151046406, "grad_norm": 0.42374402440173636, "learning_rate": 9.519010281482344e-06, "loss": 0.0446, "step": 1548 }, { "epoch": 0.7047315741583258, "grad_norm": 1.102301602930716, "learning_rate": 9.518398428552393e-06, "loss": 0.1226, "step": 1549 }, { "epoch": 0.705186533212011, "grad_norm": 0.6842519583257138, "learning_rate": 9.51778620639933e-06, "loss": 0.0905, "step": 1550 }, { "epoch": 0.7056414922656961, "grad_norm": 0.7530573117253311, "learning_rate": 9.517173615073177e-06, "loss": 0.0766, "step": 1551 }, { "epoch": 0.7060964513193813, "grad_norm": 0.43285639961604566, "learning_rate": 9.516560654623996e-06, "loss": 0.0475, "step": 1552 }, { "epoch": 0.7065514103730665, "grad_norm": 0.9094561094681402, "learning_rate": 9.515947325101875e-06, "loss": 0.0896, "step": 1553 }, { "epoch": 0.7070063694267515, "grad_norm": 0.6097385256206468, "learning_rate": 9.515333626556933e-06, "loss": 0.0653, "step": 1554 }, { "epoch": 0.7074613284804367, "grad_norm": 0.7304393114645329, "learning_rate": 9.514719559039318e-06, "loss": 0.0896, "step": 1555 }, { "epoch": 0.707916287534122, "grad_norm": 0.8799769831067698, "learning_rate": 9.514105122599208e-06, "loss": 0.1176, "step": 1556 }, { "epoch": 0.7083712465878071, "grad_norm": 1.0962688093811397, "learning_rate": 9.513490317286815e-06, "loss": 0.1174, "step": 1557 }, { "epoch": 0.7088262056414922, "grad_norm": 0.8022559500547495, "learning_rate": 9.512875143152373e-06, "loss": 0.0969, "step": 1558 }, { "epoch": 0.7092811646951774, "grad_norm": 0.37133918747574174, "learning_rate": 9.512259600246156e-06, "loss": 0.031, "step": 1559 }, { "epoch": 0.7097361237488626, "grad_norm": 0.6214125216955318, "learning_rate": 9.511643688618463e-06, "loss": 0.0943, "step": 1560 }, { "epoch": 0.7101910828025477, "grad_norm": 0.7097270108607417, "learning_rate": 9.51102740831962e-06, "loss": 0.0847, "step": 1561 }, { "epoch": 0.7106460418562329, "grad_norm": 0.8290870913254417, "learning_rate": 9.510410759399991e-06, "loss": 0.0867, "step": 1562 }, { "epoch": 0.7111010009099181, "grad_norm": 0.7141101307254801, "learning_rate": 9.50979374190996e-06, "loss": 0.0838, "step": 1563 }, { "epoch": 0.7115559599636033, "grad_norm": 0.8532705780985276, "learning_rate": 9.509176355899954e-06, "loss": 0.09, "step": 1564 }, { "epoch": 0.7120109190172884, "grad_norm": 0.6858037908830302, "learning_rate": 9.508558601420417e-06, "loss": 0.0637, "step": 1565 }, { "epoch": 0.7124658780709736, "grad_norm": 0.7489578082911201, "learning_rate": 9.507940478521833e-06, "loss": 0.1059, "step": 1566 }, { "epoch": 0.7129208371246588, "grad_norm": 0.5241685648277268, "learning_rate": 9.507321987254712e-06, "loss": 0.0474, "step": 1567 }, { "epoch": 0.7133757961783439, "grad_norm": 0.9862924439076355, "learning_rate": 9.50670312766959e-06, "loss": 0.1047, "step": 1568 }, { "epoch": 0.7138307552320291, "grad_norm": 0.8286292773017996, "learning_rate": 9.506083899817043e-06, "loss": 0.0808, "step": 1569 }, { "epoch": 0.7142857142857143, "grad_norm": 0.8166629192761119, "learning_rate": 9.505464303747667e-06, "loss": 0.079, "step": 1570 }, { "epoch": 0.7147406733393995, "grad_norm": 0.6651663578468047, "learning_rate": 9.504844339512096e-06, "loss": 0.0879, "step": 1571 }, { "epoch": 0.7151956323930846, "grad_norm": 0.5230779536546156, "learning_rate": 9.50422400716099e-06, "loss": 0.0585, "step": 1572 }, { "epoch": 0.7156505914467698, "grad_norm": 0.6543543054934573, "learning_rate": 9.503603306745036e-06, "loss": 0.0564, "step": 1573 }, { "epoch": 0.716105550500455, "grad_norm": 0.7812592861176204, "learning_rate": 9.502982238314962e-06, "loss": 0.0874, "step": 1574 }, { "epoch": 0.7165605095541401, "grad_norm": 0.5040232473993467, "learning_rate": 9.502360801921512e-06, "loss": 0.0532, "step": 1575 }, { "epoch": 0.7170154686078253, "grad_norm": 0.8631279038726943, "learning_rate": 9.501738997615471e-06, "loss": 0.1045, "step": 1576 }, { "epoch": 0.7174704276615105, "grad_norm": 0.7716014465645913, "learning_rate": 9.501116825447648e-06, "loss": 0.068, "step": 1577 }, { "epoch": 0.7179253867151957, "grad_norm": 0.5327432187838176, "learning_rate": 9.500494285468884e-06, "loss": 0.053, "step": 1578 }, { "epoch": 0.7183803457688808, "grad_norm": 0.8209926537375553, "learning_rate": 9.499871377730053e-06, "loss": 0.1164, "step": 1579 }, { "epoch": 0.718835304822566, "grad_norm": 0.5454374508074649, "learning_rate": 9.499248102282052e-06, "loss": 0.0579, "step": 1580 }, { "epoch": 0.7192902638762512, "grad_norm": 0.4944315103743207, "learning_rate": 9.498624459175815e-06, "loss": 0.0542, "step": 1581 }, { "epoch": 0.7197452229299363, "grad_norm": 0.8372013648456964, "learning_rate": 9.498000448462305e-06, "loss": 0.0948, "step": 1582 }, { "epoch": 0.7202001819836215, "grad_norm": 0.6792072434969908, "learning_rate": 9.49737607019251e-06, "loss": 0.0683, "step": 1583 }, { "epoch": 0.7206551410373067, "grad_norm": 0.6679228302277659, "learning_rate": 9.496751324417452e-06, "loss": 0.0526, "step": 1584 }, { "epoch": 0.7211101000909919, "grad_norm": 0.830168268257237, "learning_rate": 9.496126211188184e-06, "loss": 0.1049, "step": 1585 }, { "epoch": 0.721565059144677, "grad_norm": 0.7614112606151382, "learning_rate": 9.495500730555784e-06, "loss": 0.0966, "step": 1586 }, { "epoch": 0.7220200181983621, "grad_norm": 0.7574732623314945, "learning_rate": 9.494874882571368e-06, "loss": 0.0648, "step": 1587 }, { "epoch": 0.7224749772520473, "grad_norm": 0.7541681951930181, "learning_rate": 9.494248667286075e-06, "loss": 0.0905, "step": 1588 }, { "epoch": 0.7229299363057324, "grad_norm": 0.776748715422375, "learning_rate": 9.493622084751076e-06, "loss": 0.0841, "step": 1589 }, { "epoch": 0.7233848953594176, "grad_norm": 0.6440945504942991, "learning_rate": 9.492995135017574e-06, "loss": 0.0779, "step": 1590 }, { "epoch": 0.7238398544131028, "grad_norm": 0.658893968607762, "learning_rate": 9.4923678181368e-06, "loss": 0.0862, "step": 1591 }, { "epoch": 0.724294813466788, "grad_norm": 0.764304310956247, "learning_rate": 9.491740134160014e-06, "loss": 0.0834, "step": 1592 }, { "epoch": 0.7247497725204731, "grad_norm": 1.246667162089055, "learning_rate": 9.491112083138509e-06, "loss": 0.141, "step": 1593 }, { "epoch": 0.7252047315741583, "grad_norm": 0.7827390484343668, "learning_rate": 9.490483665123606e-06, "loss": 0.0687, "step": 1594 }, { "epoch": 0.7256596906278435, "grad_norm": 0.6055248563993239, "learning_rate": 9.489854880166658e-06, "loss": 0.0716, "step": 1595 }, { "epoch": 0.7261146496815286, "grad_norm": 0.7067865427149594, "learning_rate": 9.489225728319044e-06, "loss": 0.0756, "step": 1596 }, { "epoch": 0.7265696087352138, "grad_norm": 0.85395818798431, "learning_rate": 9.488596209632179e-06, "loss": 0.1099, "step": 1597 }, { "epoch": 0.727024567788899, "grad_norm": 0.6870669290352402, "learning_rate": 9.4879663241575e-06, "loss": 0.0703, "step": 1598 }, { "epoch": 0.7274795268425842, "grad_norm": 1.2809048497988667, "learning_rate": 9.48733607194648e-06, "loss": 0.1663, "step": 1599 }, { "epoch": 0.7279344858962693, "grad_norm": 0.7180890087653823, "learning_rate": 9.486705453050622e-06, "loss": 0.0738, "step": 1600 }, { "epoch": 0.7283894449499545, "grad_norm": 0.5662460892211576, "learning_rate": 9.486074467521456e-06, "loss": 0.0627, "step": 1601 }, { "epoch": 0.7288444040036397, "grad_norm": 0.7172800606287587, "learning_rate": 9.485443115410541e-06, "loss": 0.0715, "step": 1602 }, { "epoch": 0.7292993630573248, "grad_norm": 0.6146064647413995, "learning_rate": 9.484811396769475e-06, "loss": 0.0828, "step": 1603 }, { "epoch": 0.72975432211101, "grad_norm": 0.8606888467276742, "learning_rate": 9.484179311649873e-06, "loss": 0.0962, "step": 1604 }, { "epoch": 0.7302092811646952, "grad_norm": 0.46814164753859155, "learning_rate": 9.483546860103388e-06, "loss": 0.0477, "step": 1605 }, { "epoch": 0.7306642402183804, "grad_norm": 0.7370090010007736, "learning_rate": 9.4829140421817e-06, "loss": 0.081, "step": 1606 }, { "epoch": 0.7311191992720655, "grad_norm": 1.0689466216112777, "learning_rate": 9.482280857936522e-06, "loss": 0.109, "step": 1607 }, { "epoch": 0.7315741583257507, "grad_norm": 0.4147348220425697, "learning_rate": 9.481647307419594e-06, "loss": 0.0479, "step": 1608 }, { "epoch": 0.7320291173794359, "grad_norm": 0.4998747516198886, "learning_rate": 9.481013390682687e-06, "loss": 0.0634, "step": 1609 }, { "epoch": 0.732484076433121, "grad_norm": 0.8673371359679307, "learning_rate": 9.480379107777601e-06, "loss": 0.1108, "step": 1610 }, { "epoch": 0.7329390354868062, "grad_norm": 0.6369274329058493, "learning_rate": 9.47974445875617e-06, "loss": 0.0698, "step": 1611 }, { "epoch": 0.7333939945404914, "grad_norm": 0.6434647227835387, "learning_rate": 9.47910944367025e-06, "loss": 0.0618, "step": 1612 }, { "epoch": 0.7338489535941766, "grad_norm": 0.8035955314379585, "learning_rate": 9.478474062571735e-06, "loss": 0.0997, "step": 1613 }, { "epoch": 0.7343039126478617, "grad_norm": 0.7996949463502321, "learning_rate": 9.477838315512544e-06, "loss": 0.0873, "step": 1614 }, { "epoch": 0.7347588717015469, "grad_norm": 0.6484970204244012, "learning_rate": 9.477202202544626e-06, "loss": 0.0925, "step": 1615 }, { "epoch": 0.7352138307552321, "grad_norm": 0.6478821974846899, "learning_rate": 9.476565723719966e-06, "loss": 0.0693, "step": 1616 }, { "epoch": 0.7356687898089171, "grad_norm": 0.6896940284490023, "learning_rate": 9.475928879090568e-06, "loss": 0.0763, "step": 1617 }, { "epoch": 0.7361237488626023, "grad_norm": 0.6758264439259065, "learning_rate": 9.475291668708476e-06, "loss": 0.0717, "step": 1618 }, { "epoch": 0.7365787079162875, "grad_norm": 0.6285383601705616, "learning_rate": 9.474654092625758e-06, "loss": 0.0561, "step": 1619 }, { "epoch": 0.7370336669699727, "grad_norm": 0.7488998942485512, "learning_rate": 9.474016150894518e-06, "loss": 0.0765, "step": 1620 }, { "epoch": 0.7374886260236578, "grad_norm": 0.7511340475878087, "learning_rate": 9.47337784356688e-06, "loss": 0.0865, "step": 1621 }, { "epoch": 0.737943585077343, "grad_norm": 0.6908706816034008, "learning_rate": 9.472739170695006e-06, "loss": 0.0879, "step": 1622 }, { "epoch": 0.7383985441310282, "grad_norm": 0.9159671053782389, "learning_rate": 9.472100132331089e-06, "loss": 0.0862, "step": 1623 }, { "epoch": 0.7388535031847133, "grad_norm": 0.8367180794291794, "learning_rate": 9.471460728527342e-06, "loss": 0.0988, "step": 1624 }, { "epoch": 0.7393084622383985, "grad_norm": 0.6396536181540736, "learning_rate": 9.470820959336018e-06, "loss": 0.0742, "step": 1625 }, { "epoch": 0.7397634212920837, "grad_norm": 0.7212059639642758, "learning_rate": 9.470180824809394e-06, "loss": 0.0887, "step": 1626 }, { "epoch": 0.7402183803457689, "grad_norm": 0.6570480817818456, "learning_rate": 9.469540324999782e-06, "loss": 0.0654, "step": 1627 }, { "epoch": 0.740673339399454, "grad_norm": 0.6780217435395393, "learning_rate": 9.468899459959518e-06, "loss": 0.0613, "step": 1628 }, { "epoch": 0.7411282984531392, "grad_norm": 0.8367065537687267, "learning_rate": 9.468258229740972e-06, "loss": 0.087, "step": 1629 }, { "epoch": 0.7415832575068244, "grad_norm": 0.6724757485261361, "learning_rate": 9.467616634396542e-06, "loss": 0.0513, "step": 1630 }, { "epoch": 0.7420382165605095, "grad_norm": 0.5923362651506067, "learning_rate": 9.466974673978654e-06, "loss": 0.0668, "step": 1631 }, { "epoch": 0.7424931756141947, "grad_norm": 0.8046255156703264, "learning_rate": 9.466332348539772e-06, "loss": 0.0888, "step": 1632 }, { "epoch": 0.7429481346678799, "grad_norm": 0.7456071657218726, "learning_rate": 9.465689658132379e-06, "loss": 0.0872, "step": 1633 }, { "epoch": 0.7434030937215651, "grad_norm": 0.8751254537474247, "learning_rate": 9.465046602808994e-06, "loss": 0.0901, "step": 1634 }, { "epoch": 0.7438580527752502, "grad_norm": 0.9953711560207276, "learning_rate": 9.464403182622164e-06, "loss": 0.1175, "step": 1635 }, { "epoch": 0.7443130118289354, "grad_norm": 0.738323897945569, "learning_rate": 9.463759397624466e-06, "loss": 0.1016, "step": 1636 }, { "epoch": 0.7447679708826206, "grad_norm": 0.620705920516562, "learning_rate": 9.46311524786851e-06, "loss": 0.0654, "step": 1637 }, { "epoch": 0.7452229299363057, "grad_norm": 1.2433273775382216, "learning_rate": 9.462470733406929e-06, "loss": 0.1403, "step": 1638 }, { "epoch": 0.7456778889899909, "grad_norm": 1.0268174749706445, "learning_rate": 9.461825854292394e-06, "loss": 0.1065, "step": 1639 }, { "epoch": 0.7461328480436761, "grad_norm": 0.6942991337802967, "learning_rate": 9.4611806105776e-06, "loss": 0.0736, "step": 1640 }, { "epoch": 0.7465878070973613, "grad_norm": 0.8367822612372433, "learning_rate": 9.460535002315272e-06, "loss": 0.089, "step": 1641 }, { "epoch": 0.7470427661510464, "grad_norm": 0.5929887457730553, "learning_rate": 9.459889029558167e-06, "loss": 0.0665, "step": 1642 }, { "epoch": 0.7474977252047316, "grad_norm": 0.5692342733265978, "learning_rate": 9.459242692359072e-06, "loss": 0.0708, "step": 1643 }, { "epoch": 0.7479526842584168, "grad_norm": 0.6049162715481944, "learning_rate": 9.4585959907708e-06, "loss": 0.0716, "step": 1644 }, { "epoch": 0.7484076433121019, "grad_norm": 0.5865800556894495, "learning_rate": 9.457948924846201e-06, "loss": 0.0562, "step": 1645 }, { "epoch": 0.7488626023657871, "grad_norm": 1.018263961729041, "learning_rate": 9.457301494638147e-06, "loss": 0.1129, "step": 1646 }, { "epoch": 0.7493175614194723, "grad_norm": 0.8420303347709615, "learning_rate": 9.456653700199542e-06, "loss": 0.0982, "step": 1647 }, { "epoch": 0.7497725204731575, "grad_norm": 0.6178217269864875, "learning_rate": 9.456005541583326e-06, "loss": 0.0777, "step": 1648 }, { "epoch": 0.7502274795268425, "grad_norm": 0.6159701780113571, "learning_rate": 9.455357018842458e-06, "loss": 0.075, "step": 1649 }, { "epoch": 0.7506824385805277, "grad_norm": 0.5563337669331565, "learning_rate": 9.454708132029936e-06, "loss": 0.0594, "step": 1650 }, { "epoch": 0.7511373976342129, "grad_norm": 0.7796132603413727, "learning_rate": 9.454058881198782e-06, "loss": 0.0842, "step": 1651 }, { "epoch": 0.7515923566878981, "grad_norm": 0.5977999349867541, "learning_rate": 9.45340926640205e-06, "loss": 0.0623, "step": 1652 }, { "epoch": 0.7520473157415832, "grad_norm": 0.7762091660359064, "learning_rate": 9.452759287692824e-06, "loss": 0.0923, "step": 1653 }, { "epoch": 0.7525022747952684, "grad_norm": 1.029286283612893, "learning_rate": 9.452108945124218e-06, "loss": 0.1114, "step": 1654 }, { "epoch": 0.7529572338489536, "grad_norm": 0.5046695202197234, "learning_rate": 9.451458238749375e-06, "loss": 0.058, "step": 1655 }, { "epoch": 0.7534121929026387, "grad_norm": 0.6262659207860063, "learning_rate": 9.450807168621468e-06, "loss": 0.0607, "step": 1656 }, { "epoch": 0.7538671519563239, "grad_norm": 0.7451490801568118, "learning_rate": 9.450155734793697e-06, "loss": 0.0716, "step": 1657 }, { "epoch": 0.7543221110100091, "grad_norm": 0.6504007368655154, "learning_rate": 9.449503937319297e-06, "loss": 0.0913, "step": 1658 }, { "epoch": 0.7547770700636943, "grad_norm": 0.8923820492879996, "learning_rate": 9.448851776251528e-06, "loss": 0.0984, "step": 1659 }, { "epoch": 0.7552320291173794, "grad_norm": 0.7256175088606572, "learning_rate": 9.448199251643684e-06, "loss": 0.0834, "step": 1660 }, { "epoch": 0.7556869881710646, "grad_norm": 0.7778885787730276, "learning_rate": 9.447546363549085e-06, "loss": 0.0878, "step": 1661 }, { "epoch": 0.7561419472247498, "grad_norm": 0.8265030986085233, "learning_rate": 9.446893112021083e-06, "loss": 0.0827, "step": 1662 }, { "epoch": 0.7565969062784349, "grad_norm": 0.5801162274559535, "learning_rate": 9.446239497113055e-06, "loss": 0.0797, "step": 1663 }, { "epoch": 0.7570518653321201, "grad_norm": 0.8974914764997551, "learning_rate": 9.445585518878418e-06, "loss": 0.1088, "step": 1664 }, { "epoch": 0.7575068243858053, "grad_norm": 0.8878060872125964, "learning_rate": 9.444931177370605e-06, "loss": 0.1235, "step": 1665 }, { "epoch": 0.7579617834394905, "grad_norm": 0.5088737676913533, "learning_rate": 9.44427647264309e-06, "loss": 0.0478, "step": 1666 }, { "epoch": 0.7584167424931756, "grad_norm": 0.7484910765250183, "learning_rate": 9.443621404749374e-06, "loss": 0.0686, "step": 1667 }, { "epoch": 0.7588717015468608, "grad_norm": 0.6292123912530658, "learning_rate": 9.442965973742983e-06, "loss": 0.0652, "step": 1668 }, { "epoch": 0.759326660600546, "grad_norm": 1.037223955207567, "learning_rate": 9.442310179677476e-06, "loss": 0.0827, "step": 1669 }, { "epoch": 0.7597816196542311, "grad_norm": 0.6769034013570638, "learning_rate": 9.441654022606444e-06, "loss": 0.0771, "step": 1670 }, { "epoch": 0.7602365787079163, "grad_norm": 0.8310244395490821, "learning_rate": 9.440997502583503e-06, "loss": 0.091, "step": 1671 }, { "epoch": 0.7606915377616015, "grad_norm": 1.0039785109365194, "learning_rate": 9.4403406196623e-06, "loss": 0.1251, "step": 1672 }, { "epoch": 0.7611464968152867, "grad_norm": 0.7908056524331212, "learning_rate": 9.439683373896515e-06, "loss": 0.0876, "step": 1673 }, { "epoch": 0.7616014558689718, "grad_norm": 1.0809832712577787, "learning_rate": 9.439025765339852e-06, "loss": 0.1256, "step": 1674 }, { "epoch": 0.762056414922657, "grad_norm": 0.5964161616065347, "learning_rate": 9.438367794046053e-06, "loss": 0.0585, "step": 1675 }, { "epoch": 0.7625113739763422, "grad_norm": 0.8617975528364193, "learning_rate": 9.437709460068882e-06, "loss": 0.0783, "step": 1676 }, { "epoch": 0.7629663330300273, "grad_norm": 0.6361215357389327, "learning_rate": 9.437050763462132e-06, "loss": 0.0692, "step": 1677 }, { "epoch": 0.7634212920837125, "grad_norm": 0.9790069893643866, "learning_rate": 9.436391704279632e-06, "loss": 0.1173, "step": 1678 }, { "epoch": 0.7638762511373977, "grad_norm": 1.1287905857392149, "learning_rate": 9.435732282575235e-06, "loss": 0.1505, "step": 1679 }, { "epoch": 0.7643312101910829, "grad_norm": 0.8195744592905398, "learning_rate": 9.435072498402832e-06, "loss": 0.0877, "step": 1680 }, { "epoch": 0.7647861692447679, "grad_norm": 0.5293612997987346, "learning_rate": 9.434412351816329e-06, "loss": 0.0609, "step": 1681 }, { "epoch": 0.7652411282984531, "grad_norm": 0.7565664140640663, "learning_rate": 9.433751842869676e-06, "loss": 0.0895, "step": 1682 }, { "epoch": 0.7656960873521383, "grad_norm": 0.8390610329820178, "learning_rate": 9.433090971616842e-06, "loss": 0.0823, "step": 1683 }, { "epoch": 0.7661510464058234, "grad_norm": 0.7979326314286513, "learning_rate": 9.432429738111836e-06, "loss": 0.0893, "step": 1684 }, { "epoch": 0.7666060054595086, "grad_norm": 0.7985876042778349, "learning_rate": 9.431768142408687e-06, "loss": 0.0965, "step": 1685 }, { "epoch": 0.7670609645131938, "grad_norm": 0.7008114448081032, "learning_rate": 9.431106184561462e-06, "loss": 0.0894, "step": 1686 }, { "epoch": 0.767515923566879, "grad_norm": 0.8506122352220377, "learning_rate": 9.430443864624249e-06, "loss": 0.0949, "step": 1687 }, { "epoch": 0.7679708826205641, "grad_norm": 1.0900644244466022, "learning_rate": 9.429781182651171e-06, "loss": 0.1211, "step": 1688 }, { "epoch": 0.7684258416742493, "grad_norm": 0.585079487316927, "learning_rate": 9.429118138696378e-06, "loss": 0.0642, "step": 1689 }, { "epoch": 0.7688808007279345, "grad_norm": 0.8727981223997378, "learning_rate": 9.428454732814055e-06, "loss": 0.0987, "step": 1690 }, { "epoch": 0.7693357597816196, "grad_norm": 0.7032463083497149, "learning_rate": 9.427790965058407e-06, "loss": 0.0685, "step": 1691 }, { "epoch": 0.7697907188353048, "grad_norm": 0.6784390616651746, "learning_rate": 9.42712683548368e-06, "loss": 0.079, "step": 1692 }, { "epoch": 0.77024567788899, "grad_norm": 0.774501448184362, "learning_rate": 9.426462344144138e-06, "loss": 0.0784, "step": 1693 }, { "epoch": 0.7707006369426752, "grad_norm": 0.7793988116138444, "learning_rate": 9.425797491094086e-06, "loss": 0.0801, "step": 1694 }, { "epoch": 0.7711555959963603, "grad_norm": 0.7642360389143683, "learning_rate": 9.425132276387847e-06, "loss": 0.1009, "step": 1695 }, { "epoch": 0.7716105550500455, "grad_norm": 0.6080046843370063, "learning_rate": 9.424466700079785e-06, "loss": 0.0688, "step": 1696 }, { "epoch": 0.7720655141037307, "grad_norm": 0.6270167280264678, "learning_rate": 9.423800762224283e-06, "loss": 0.0626, "step": 1697 }, { "epoch": 0.7725204731574158, "grad_norm": 0.5357586110049548, "learning_rate": 9.42313446287576e-06, "loss": 0.0626, "step": 1698 }, { "epoch": 0.772975432211101, "grad_norm": 0.6233095813256608, "learning_rate": 9.422467802088664e-06, "loss": 0.0804, "step": 1699 }, { "epoch": 0.7734303912647862, "grad_norm": 0.7158265191654914, "learning_rate": 9.42180077991747e-06, "loss": 0.0887, "step": 1700 }, { "epoch": 0.7738853503184714, "grad_norm": 1.0305735114746193, "learning_rate": 9.421133396416687e-06, "loss": 0.1441, "step": 1701 }, { "epoch": 0.7743403093721565, "grad_norm": 0.6965845039033058, "learning_rate": 9.420465651640847e-06, "loss": 0.079, "step": 1702 }, { "epoch": 0.7747952684258417, "grad_norm": 0.4529773063241175, "learning_rate": 9.419797545644516e-06, "loss": 0.0443, "step": 1703 }, { "epoch": 0.7752502274795269, "grad_norm": 0.5407082720421394, "learning_rate": 9.41912907848229e-06, "loss": 0.0625, "step": 1704 }, { "epoch": 0.775705186533212, "grad_norm": 0.5625290405803486, "learning_rate": 9.418460250208791e-06, "loss": 0.0695, "step": 1705 }, { "epoch": 0.7761601455868972, "grad_norm": 0.5288549658523206, "learning_rate": 9.417791060878677e-06, "loss": 0.0546, "step": 1706 }, { "epoch": 0.7766151046405824, "grad_norm": 0.6390336517076213, "learning_rate": 9.417121510546626e-06, "loss": 0.0474, "step": 1707 }, { "epoch": 0.7770700636942676, "grad_norm": 1.1628554226147039, "learning_rate": 9.416451599267353e-06, "loss": 0.1427, "step": 1708 }, { "epoch": 0.7775250227479527, "grad_norm": 0.5775794942631142, "learning_rate": 9.415781327095601e-06, "loss": 0.0722, "step": 1709 }, { "epoch": 0.7779799818016379, "grad_norm": 0.6702327788675698, "learning_rate": 9.415110694086139e-06, "loss": 0.0863, "step": 1710 }, { "epoch": 0.778434940855323, "grad_norm": 1.0756620214218862, "learning_rate": 9.41443970029377e-06, "loss": 0.0916, "step": 1711 }, { "epoch": 0.7788898999090081, "grad_norm": 0.6873597883249742, "learning_rate": 9.413768345773324e-06, "loss": 0.0928, "step": 1712 }, { "epoch": 0.7793448589626933, "grad_norm": 0.546687059556293, "learning_rate": 9.413096630579661e-06, "loss": 0.0681, "step": 1713 }, { "epoch": 0.7797998180163785, "grad_norm": 0.5882776722743176, "learning_rate": 9.412424554767672e-06, "loss": 0.0666, "step": 1714 }, { "epoch": 0.7802547770700637, "grad_norm": 0.7757931395434748, "learning_rate": 9.411752118392272e-06, "loss": 0.0961, "step": 1715 }, { "epoch": 0.7807097361237488, "grad_norm": 0.7533384044089068, "learning_rate": 9.411079321508416e-06, "loss": 0.0915, "step": 1716 }, { "epoch": 0.781164695177434, "grad_norm": 0.6690633163427073, "learning_rate": 9.410406164171076e-06, "loss": 0.0757, "step": 1717 }, { "epoch": 0.7816196542311192, "grad_norm": 0.9875033482174213, "learning_rate": 9.40973264643526e-06, "loss": 0.1016, "step": 1718 }, { "epoch": 0.7820746132848043, "grad_norm": 0.7285855686862363, "learning_rate": 9.409058768356007e-06, "loss": 0.0777, "step": 1719 }, { "epoch": 0.7825295723384895, "grad_norm": 0.5412833929378409, "learning_rate": 9.408384529988385e-06, "loss": 0.0596, "step": 1720 }, { "epoch": 0.7829845313921747, "grad_norm": 0.48748390975323075, "learning_rate": 9.407709931387486e-06, "loss": 0.0451, "step": 1721 }, { "epoch": 0.7834394904458599, "grad_norm": 0.8626755233369133, "learning_rate": 9.407034972608436e-06, "loss": 0.1093, "step": 1722 }, { "epoch": 0.783894449499545, "grad_norm": 0.5986423081381415, "learning_rate": 9.40635965370639e-06, "loss": 0.0737, "step": 1723 }, { "epoch": 0.7843494085532302, "grad_norm": 0.8697508747552452, "learning_rate": 9.40568397473653e-06, "loss": 0.0748, "step": 1724 }, { "epoch": 0.7848043676069154, "grad_norm": 0.6651587535516658, "learning_rate": 9.405007935754076e-06, "loss": 0.0553, "step": 1725 }, { "epoch": 0.7852593266606005, "grad_norm": 1.1307670638395897, "learning_rate": 9.404331536814265e-06, "loss": 0.1451, "step": 1726 }, { "epoch": 0.7857142857142857, "grad_norm": 0.6724877006657928, "learning_rate": 9.40365477797237e-06, "loss": 0.0803, "step": 1727 }, { "epoch": 0.7861692447679709, "grad_norm": 0.739524107451132, "learning_rate": 9.40297765928369e-06, "loss": 0.0713, "step": 1728 }, { "epoch": 0.7866242038216561, "grad_norm": 0.6341880042511068, "learning_rate": 9.402300180803563e-06, "loss": 0.0739, "step": 1729 }, { "epoch": 0.7870791628753412, "grad_norm": 0.5809522499341311, "learning_rate": 9.401622342587346e-06, "loss": 0.067, "step": 1730 }, { "epoch": 0.7875341219290264, "grad_norm": 0.6208756444695567, "learning_rate": 9.400944144690428e-06, "loss": 0.0865, "step": 1731 }, { "epoch": 0.7879890809827116, "grad_norm": 0.7358085271263743, "learning_rate": 9.400265587168226e-06, "loss": 0.0827, "step": 1732 }, { "epoch": 0.7884440400363967, "grad_norm": 0.6985098389174249, "learning_rate": 9.399586670076196e-06, "loss": 0.0784, "step": 1733 }, { "epoch": 0.7888989990900819, "grad_norm": 0.6524277365731544, "learning_rate": 9.39890739346981e-06, "loss": 0.0759, "step": 1734 }, { "epoch": 0.7893539581437671, "grad_norm": 0.8500489687124628, "learning_rate": 9.398227757404576e-06, "loss": 0.1139, "step": 1735 }, { "epoch": 0.7898089171974523, "grad_norm": 0.49161558761743546, "learning_rate": 9.397547761936034e-06, "loss": 0.0445, "step": 1736 }, { "epoch": 0.7902638762511374, "grad_norm": 0.3886581827401007, "learning_rate": 9.396867407119748e-06, "loss": 0.0387, "step": 1737 }, { "epoch": 0.7907188353048226, "grad_norm": 0.43315626329206963, "learning_rate": 9.396186693011312e-06, "loss": 0.0484, "step": 1738 }, { "epoch": 0.7911737943585078, "grad_norm": 0.7578063731873546, "learning_rate": 9.395505619666353e-06, "loss": 0.0872, "step": 1739 }, { "epoch": 0.7916287534121929, "grad_norm": 0.9087897001540515, "learning_rate": 9.394824187140526e-06, "loss": 0.0914, "step": 1740 }, { "epoch": 0.792083712465878, "grad_norm": 0.5994634977370948, "learning_rate": 9.394142395489512e-06, "loss": 0.061, "step": 1741 }, { "epoch": 0.7925386715195633, "grad_norm": 0.6263578026813904, "learning_rate": 9.393460244769023e-06, "loss": 0.0608, "step": 1742 }, { "epoch": 0.7929936305732485, "grad_norm": 0.5753033056961346, "learning_rate": 9.392777735034807e-06, "loss": 0.0721, "step": 1743 }, { "epoch": 0.7934485896269335, "grad_norm": 0.6561198773299641, "learning_rate": 9.392094866342632e-06, "loss": 0.0599, "step": 1744 }, { "epoch": 0.7939035486806187, "grad_norm": 0.7317990056550264, "learning_rate": 9.391411638748297e-06, "loss": 0.0742, "step": 1745 }, { "epoch": 0.7943585077343039, "grad_norm": 0.5011723772780661, "learning_rate": 9.390728052307637e-06, "loss": 0.0647, "step": 1746 }, { "epoch": 0.794813466787989, "grad_norm": 0.6867846904523061, "learning_rate": 9.390044107076506e-06, "loss": 0.0779, "step": 1747 }, { "epoch": 0.7952684258416742, "grad_norm": 0.9267872196876082, "learning_rate": 9.389359803110796e-06, "loss": 0.1001, "step": 1748 }, { "epoch": 0.7957233848953594, "grad_norm": 3.487580179742763, "learning_rate": 9.388675140466427e-06, "loss": 0.1841, "step": 1749 }, { "epoch": 0.7961783439490446, "grad_norm": 0.6520959532750612, "learning_rate": 9.387990119199343e-06, "loss": 0.0714, "step": 1750 }, { "epoch": 0.7966333030027297, "grad_norm": 0.8129917876989495, "learning_rate": 9.387304739365524e-06, "loss": 0.0949, "step": 1751 }, { "epoch": 0.7970882620564149, "grad_norm": 0.6276053555905522, "learning_rate": 9.386619001020974e-06, "loss": 0.0552, "step": 1752 }, { "epoch": 0.7975432211101001, "grad_norm": 0.7632340875896291, "learning_rate": 9.385932904221729e-06, "loss": 0.0655, "step": 1753 }, { "epoch": 0.7979981801637852, "grad_norm": 0.7239218776412117, "learning_rate": 9.385246449023853e-06, "loss": 0.1113, "step": 1754 }, { "epoch": 0.7984531392174704, "grad_norm": 1.0468381569335767, "learning_rate": 9.38455963548344e-06, "loss": 0.1042, "step": 1755 }, { "epoch": 0.7989080982711556, "grad_norm": 0.8019558864262506, "learning_rate": 9.383872463656616e-06, "loss": 0.0868, "step": 1756 }, { "epoch": 0.7993630573248408, "grad_norm": 0.7449121488820226, "learning_rate": 9.383184933599531e-06, "loss": 0.0945, "step": 1757 }, { "epoch": 0.7998180163785259, "grad_norm": 0.5905383438931077, "learning_rate": 9.382497045368368e-06, "loss": 0.0672, "step": 1758 }, { "epoch": 0.8002729754322111, "grad_norm": 0.5337189472762474, "learning_rate": 9.381808799019336e-06, "loss": 0.0509, "step": 1759 }, { "epoch": 0.8007279344858963, "grad_norm": 1.0483707789224317, "learning_rate": 9.38112019460868e-06, "loss": 0.1069, "step": 1760 }, { "epoch": 0.8011828935395814, "grad_norm": 0.8974041640796228, "learning_rate": 9.380431232192663e-06, "loss": 0.1061, "step": 1761 }, { "epoch": 0.8016378525932666, "grad_norm": 0.774987790741639, "learning_rate": 9.379741911827591e-06, "loss": 0.0971, "step": 1762 }, { "epoch": 0.8020928116469518, "grad_norm": 0.5037991292329869, "learning_rate": 9.379052233569788e-06, "loss": 0.0545, "step": 1763 }, { "epoch": 0.802547770700637, "grad_norm": 0.7571282390818425, "learning_rate": 9.37836219747561e-06, "loss": 0.0774, "step": 1764 }, { "epoch": 0.8030027297543221, "grad_norm": 0.47374252215612206, "learning_rate": 9.377671803601447e-06, "loss": 0.0479, "step": 1765 }, { "epoch": 0.8034576888080073, "grad_norm": 0.563871853603133, "learning_rate": 9.376981052003713e-06, "loss": 0.0583, "step": 1766 }, { "epoch": 0.8039126478616925, "grad_norm": 0.7260639419055305, "learning_rate": 9.376289942738855e-06, "loss": 0.0739, "step": 1767 }, { "epoch": 0.8043676069153776, "grad_norm": 0.7704639306429572, "learning_rate": 9.375598475863345e-06, "loss": 0.08, "step": 1768 }, { "epoch": 0.8048225659690628, "grad_norm": 0.8052864772012752, "learning_rate": 9.374906651433689e-06, "loss": 0.1155, "step": 1769 }, { "epoch": 0.805277525022748, "grad_norm": 0.945940660466259, "learning_rate": 9.374214469506416e-06, "loss": 0.0942, "step": 1770 }, { "epoch": 0.8057324840764332, "grad_norm": 0.8382092898318407, "learning_rate": 9.373521930138092e-06, "loss": 0.0831, "step": 1771 }, { "epoch": 0.8061874431301183, "grad_norm": 0.5910933141386769, "learning_rate": 9.372829033385306e-06, "loss": 0.0825, "step": 1772 }, { "epoch": 0.8066424021838035, "grad_norm": 0.7616883112365667, "learning_rate": 9.37213577930468e-06, "loss": 0.0907, "step": 1773 }, { "epoch": 0.8070973612374887, "grad_norm": 0.9571485234330176, "learning_rate": 9.37144216795286e-06, "loss": 0.1322, "step": 1774 }, { "epoch": 0.8075523202911737, "grad_norm": 0.770430324420924, "learning_rate": 9.370748199386529e-06, "loss": 0.0821, "step": 1775 }, { "epoch": 0.8080072793448589, "grad_norm": 0.6303205378749905, "learning_rate": 9.370053873662393e-06, "loss": 0.0694, "step": 1776 }, { "epoch": 0.8084622383985441, "grad_norm": 0.6777135846807264, "learning_rate": 9.36935919083719e-06, "loss": 0.0685, "step": 1777 }, { "epoch": 0.8089171974522293, "grad_norm": 0.7319936383805717, "learning_rate": 9.368664150967686e-06, "loss": 0.0679, "step": 1778 }, { "epoch": 0.8093721565059144, "grad_norm": 0.7990830113911501, "learning_rate": 9.367968754110675e-06, "loss": 0.1023, "step": 1779 }, { "epoch": 0.8098271155595996, "grad_norm": 0.5223284241529513, "learning_rate": 9.367273000322983e-06, "loss": 0.063, "step": 1780 }, { "epoch": 0.8102820746132848, "grad_norm": 1.040419010652034, "learning_rate": 9.366576889661465e-06, "loss": 0.1236, "step": 1781 }, { "epoch": 0.8107370336669699, "grad_norm": 0.6404250074887077, "learning_rate": 9.365880422183003e-06, "loss": 0.0656, "step": 1782 }, { "epoch": 0.8111919927206551, "grad_norm": 0.7564675990794105, "learning_rate": 9.365183597944506e-06, "loss": 0.0725, "step": 1783 }, { "epoch": 0.8116469517743403, "grad_norm": 0.5955963027805166, "learning_rate": 9.364486417002922e-06, "loss": 0.07, "step": 1784 }, { "epoch": 0.8121019108280255, "grad_norm": 0.6658882483856376, "learning_rate": 9.363788879415217e-06, "loss": 0.0616, "step": 1785 }, { "epoch": 0.8125568698817106, "grad_norm": 0.6032274064354748, "learning_rate": 9.36309098523839e-06, "loss": 0.0688, "step": 1786 }, { "epoch": 0.8130118289353958, "grad_norm": 0.7627355718580127, "learning_rate": 9.362392734529472e-06, "loss": 0.0841, "step": 1787 }, { "epoch": 0.813466787989081, "grad_norm": 0.6581922552034235, "learning_rate": 9.361694127345523e-06, "loss": 0.0773, "step": 1788 }, { "epoch": 0.8139217470427661, "grad_norm": 0.5723109702485146, "learning_rate": 9.360995163743622e-06, "loss": 0.0755, "step": 1789 }, { "epoch": 0.8143767060964513, "grad_norm": 0.8492692664232014, "learning_rate": 9.360295843780893e-06, "loss": 0.084, "step": 1790 }, { "epoch": 0.8148316651501365, "grad_norm": 0.7138327780528116, "learning_rate": 9.35959616751448e-06, "loss": 0.0754, "step": 1791 }, { "epoch": 0.8152866242038217, "grad_norm": 0.7513269368015193, "learning_rate": 9.358896135001555e-06, "loss": 0.075, "step": 1792 }, { "epoch": 0.8157415832575068, "grad_norm": 6.226904157676098, "learning_rate": 9.35819574629932e-06, "loss": 0.2447, "step": 1793 }, { "epoch": 0.816196542311192, "grad_norm": 0.9632842432595244, "learning_rate": 9.35749500146501e-06, "loss": 0.0968, "step": 1794 }, { "epoch": 0.8166515013648772, "grad_norm": 0.6910899092527569, "learning_rate": 9.356793900555891e-06, "loss": 0.0736, "step": 1795 }, { "epoch": 0.8171064604185623, "grad_norm": 0.8430341812657529, "learning_rate": 9.356092443629247e-06, "loss": 0.0929, "step": 1796 }, { "epoch": 0.8175614194722475, "grad_norm": 0.7425545237339678, "learning_rate": 9.355390630742401e-06, "loss": 0.1005, "step": 1797 }, { "epoch": 0.8180163785259327, "grad_norm": 0.7004618898733044, "learning_rate": 9.3546884619527e-06, "loss": 0.0789, "step": 1798 }, { "epoch": 0.8184713375796179, "grad_norm": 0.5461552026045962, "learning_rate": 9.353985937317525e-06, "loss": 0.0763, "step": 1799 }, { "epoch": 0.818926296633303, "grad_norm": 0.6222175380121098, "learning_rate": 9.35328305689428e-06, "loss": 0.0754, "step": 1800 }, { "epoch": 0.8193812556869882, "grad_norm": 0.7386705168753549, "learning_rate": 9.352579820740404e-06, "loss": 0.0641, "step": 1801 }, { "epoch": 0.8198362147406734, "grad_norm": 1.2544587029581489, "learning_rate": 9.351876228913363e-06, "loss": 0.107, "step": 1802 }, { "epoch": 0.8202911737943585, "grad_norm": 0.6546855629883478, "learning_rate": 9.351172281470645e-06, "loss": 0.0781, "step": 1803 }, { "epoch": 0.8207461328480437, "grad_norm": 0.7485647273392206, "learning_rate": 9.350467978469782e-06, "loss": 0.0898, "step": 1804 }, { "epoch": 0.8212010919017289, "grad_norm": 0.5530668925780788, "learning_rate": 9.34976331996832e-06, "loss": 0.057, "step": 1805 }, { "epoch": 0.821656050955414, "grad_norm": 0.870085999603916, "learning_rate": 9.349058306023844e-06, "loss": 0.1077, "step": 1806 }, { "epoch": 0.8221110100090991, "grad_norm": 0.891036381079533, "learning_rate": 9.348352936693964e-06, "loss": 0.1082, "step": 1807 }, { "epoch": 0.8225659690627843, "grad_norm": 0.5641275258385202, "learning_rate": 9.347647212036316e-06, "loss": 0.0613, "step": 1808 }, { "epoch": 0.8230209281164695, "grad_norm": 0.7163257638587112, "learning_rate": 9.346941132108575e-06, "loss": 0.0842, "step": 1809 }, { "epoch": 0.8234758871701547, "grad_norm": 0.7333770270884309, "learning_rate": 9.346234696968435e-06, "loss": 0.0782, "step": 1810 }, { "epoch": 0.8239308462238398, "grad_norm": 0.5399164747367127, "learning_rate": 9.345527906673622e-06, "loss": 0.0676, "step": 1811 }, { "epoch": 0.824385805277525, "grad_norm": 1.0476291790994476, "learning_rate": 9.344820761281892e-06, "loss": 0.0984, "step": 1812 }, { "epoch": 0.8248407643312102, "grad_norm": 0.639304845804496, "learning_rate": 9.344113260851031e-06, "loss": 0.0764, "step": 1813 }, { "epoch": 0.8252957233848953, "grad_norm": 0.6071291165528282, "learning_rate": 9.343405405438852e-06, "loss": 0.0707, "step": 1814 }, { "epoch": 0.8257506824385805, "grad_norm": 0.6973111552871604, "learning_rate": 9.342697195103199e-06, "loss": 0.0917, "step": 1815 }, { "epoch": 0.8262056414922657, "grad_norm": 0.6486872321285189, "learning_rate": 9.341988629901942e-06, "loss": 0.0725, "step": 1816 }, { "epoch": 0.8266606005459509, "grad_norm": 0.5216883119977757, "learning_rate": 9.341279709892981e-06, "loss": 0.0572, "step": 1817 }, { "epoch": 0.827115559599636, "grad_norm": 0.4472530755665983, "learning_rate": 9.340570435134248e-06, "loss": 0.0412, "step": 1818 }, { "epoch": 0.8275705186533212, "grad_norm": 0.786165560489741, "learning_rate": 9.339860805683703e-06, "loss": 0.0905, "step": 1819 }, { "epoch": 0.8280254777070064, "grad_norm": 0.8504390923669081, "learning_rate": 9.33915082159933e-06, "loss": 0.0761, "step": 1820 }, { "epoch": 0.8284804367606915, "grad_norm": 0.5303034158640553, "learning_rate": 9.338440482939146e-06, "loss": 0.0735, "step": 1821 }, { "epoch": 0.8289353958143767, "grad_norm": 0.6407993820931909, "learning_rate": 9.337729789761199e-06, "loss": 0.0829, "step": 1822 }, { "epoch": 0.8293903548680619, "grad_norm": 2.670877671269915, "learning_rate": 9.337018742123563e-06, "loss": 0.1871, "step": 1823 }, { "epoch": 0.8298453139217471, "grad_norm": 1.0355313595445745, "learning_rate": 9.336307340084341e-06, "loss": 0.0955, "step": 1824 }, { "epoch": 0.8303002729754322, "grad_norm": 0.6127983226216669, "learning_rate": 9.335595583701667e-06, "loss": 0.0639, "step": 1825 }, { "epoch": 0.8307552320291174, "grad_norm": 0.6196615465194765, "learning_rate": 9.334883473033699e-06, "loss": 0.0706, "step": 1826 }, { "epoch": 0.8312101910828026, "grad_norm": 0.7243682512181147, "learning_rate": 9.33417100813863e-06, "loss": 0.0869, "step": 1827 }, { "epoch": 0.8316651501364877, "grad_norm": 0.94108166831404, "learning_rate": 9.33345818907468e-06, "loss": 0.1349, "step": 1828 }, { "epoch": 0.8321201091901729, "grad_norm": 4.6896190497823955, "learning_rate": 9.332745015900097e-06, "loss": 0.1125, "step": 1829 }, { "epoch": 0.8325750682438581, "grad_norm": 0.7268733027831774, "learning_rate": 9.332031488673156e-06, "loss": 0.0651, "step": 1830 }, { "epoch": 0.8330300272975433, "grad_norm": 0.5169699897246913, "learning_rate": 9.331317607452166e-06, "loss": 0.0683, "step": 1831 }, { "epoch": 0.8334849863512284, "grad_norm": 0.5056561715785393, "learning_rate": 9.330603372295463e-06, "loss": 0.0568, "step": 1832 }, { "epoch": 0.8339399454049136, "grad_norm": 0.5749009883761049, "learning_rate": 9.329888783261408e-06, "loss": 0.0594, "step": 1833 }, { "epoch": 0.8343949044585988, "grad_norm": 0.6696966952437984, "learning_rate": 9.329173840408394e-06, "loss": 0.0764, "step": 1834 }, { "epoch": 0.8348498635122839, "grad_norm": 0.7329039198928983, "learning_rate": 9.328458543794844e-06, "loss": 0.0729, "step": 1835 }, { "epoch": 0.835304822565969, "grad_norm": 0.5892831520257552, "learning_rate": 9.327742893479212e-06, "loss": 0.0838, "step": 1836 }, { "epoch": 0.8357597816196543, "grad_norm": 0.848350653615326, "learning_rate": 9.327026889519973e-06, "loss": 0.0778, "step": 1837 }, { "epoch": 0.8362147406733395, "grad_norm": 0.939837339633871, "learning_rate": 9.326310531975636e-06, "loss": 0.1005, "step": 1838 }, { "epoch": 0.8366696997270245, "grad_norm": 0.6312875650471034, "learning_rate": 9.32559382090474e-06, "loss": 0.0626, "step": 1839 }, { "epoch": 0.8371246587807097, "grad_norm": 0.9586580739045799, "learning_rate": 9.324876756365853e-06, "loss": 0.1154, "step": 1840 }, { "epoch": 0.8375796178343949, "grad_norm": 0.6108920091747637, "learning_rate": 9.324159338417566e-06, "loss": 0.0674, "step": 1841 }, { "epoch": 0.83803457688808, "grad_norm": 0.9247779620401613, "learning_rate": 9.323441567118508e-06, "loss": 0.11, "step": 1842 }, { "epoch": 0.8384895359417652, "grad_norm": 0.6152452902665, "learning_rate": 9.322723442527328e-06, "loss": 0.0657, "step": 1843 }, { "epoch": 0.8389444949954504, "grad_norm": 0.6579130646316164, "learning_rate": 9.32200496470271e-06, "loss": 0.0721, "step": 1844 }, { "epoch": 0.8393994540491356, "grad_norm": 0.6812573423845587, "learning_rate": 9.321286133703365e-06, "loss": 0.0627, "step": 1845 }, { "epoch": 0.8398544131028207, "grad_norm": 0.5946100319565307, "learning_rate": 9.320566949588031e-06, "loss": 0.0708, "step": 1846 }, { "epoch": 0.8403093721565059, "grad_norm": 0.6319246275087805, "learning_rate": 9.319847412415477e-06, "loss": 0.0651, "step": 1847 }, { "epoch": 0.8407643312101911, "grad_norm": 0.6789460664352271, "learning_rate": 9.3191275222445e-06, "loss": 0.0707, "step": 1848 }, { "epoch": 0.8412192902638762, "grad_norm": 0.4396253526793688, "learning_rate": 9.31840727913393e-06, "loss": 0.0431, "step": 1849 }, { "epoch": 0.8416742493175614, "grad_norm": 0.6745617928769184, "learning_rate": 9.317686683142616e-06, "loss": 0.0747, "step": 1850 }, { "epoch": 0.8421292083712466, "grad_norm": 0.6924165554321049, "learning_rate": 9.316965734329447e-06, "loss": 0.0575, "step": 1851 }, { "epoch": 0.8425841674249318, "grad_norm": 0.7219679526943963, "learning_rate": 9.316244432753332e-06, "loss": 0.0935, "step": 1852 }, { "epoch": 0.8430391264786169, "grad_norm": 1.0205930330831676, "learning_rate": 9.315522778473214e-06, "loss": 0.1213, "step": 1853 }, { "epoch": 0.8434940855323021, "grad_norm": 1.009181015179975, "learning_rate": 9.314800771548064e-06, "loss": 0.1049, "step": 1854 }, { "epoch": 0.8439490445859873, "grad_norm": 0.7263916504334191, "learning_rate": 9.31407841203688e-06, "loss": 0.1025, "step": 1855 }, { "epoch": 0.8444040036396724, "grad_norm": 0.6276487176726284, "learning_rate": 9.31335569999869e-06, "loss": 0.0587, "step": 1856 }, { "epoch": 0.8448589626933576, "grad_norm": 0.6171084743549562, "learning_rate": 9.31263263549255e-06, "loss": 0.0495, "step": 1857 }, { "epoch": 0.8453139217470428, "grad_norm": 0.6730791565382994, "learning_rate": 9.31190921857755e-06, "loss": 0.0789, "step": 1858 }, { "epoch": 0.845768880800728, "grad_norm": 0.7874386993734893, "learning_rate": 9.311185449312798e-06, "loss": 0.088, "step": 1859 }, { "epoch": 0.8462238398544131, "grad_norm": 0.5073783803158326, "learning_rate": 9.310461327757442e-06, "loss": 0.0561, "step": 1860 }, { "epoch": 0.8466787989080983, "grad_norm": 0.6051266904327832, "learning_rate": 9.309736853970652e-06, "loss": 0.0688, "step": 1861 }, { "epoch": 0.8471337579617835, "grad_norm": 1.0483500354699085, "learning_rate": 9.309012028011628e-06, "loss": 0.1346, "step": 1862 }, { "epoch": 0.8475887170154686, "grad_norm": 0.9049471090474998, "learning_rate": 9.3082868499396e-06, "loss": 0.0986, "step": 1863 }, { "epoch": 0.8480436760691538, "grad_norm": 0.47381125867485346, "learning_rate": 9.307561319813829e-06, "loss": 0.058, "step": 1864 }, { "epoch": 0.848498635122839, "grad_norm": 0.7964538075850383, "learning_rate": 9.306835437693597e-06, "loss": 0.0829, "step": 1865 }, { "epoch": 0.8489535941765242, "grad_norm": 0.9919343521297046, "learning_rate": 9.306109203638225e-06, "loss": 0.0885, "step": 1866 }, { "epoch": 0.8494085532302093, "grad_norm": 1.4502514405100166, "learning_rate": 9.305382617707052e-06, "loss": 0.1023, "step": 1867 }, { "epoch": 0.8498635122838945, "grad_norm": 0.7238180713867792, "learning_rate": 9.304655679959459e-06, "loss": 0.0813, "step": 1868 }, { "epoch": 0.8503184713375797, "grad_norm": 0.7360849022013412, "learning_rate": 9.303928390454839e-06, "loss": 0.0671, "step": 1869 }, { "epoch": 0.8507734303912647, "grad_norm": 0.5803360108595549, "learning_rate": 9.30320074925263e-06, "loss": 0.075, "step": 1870 }, { "epoch": 0.8512283894449499, "grad_norm": 0.6838093346854254, "learning_rate": 9.302472756412288e-06, "loss": 0.0812, "step": 1871 }, { "epoch": 0.8516833484986351, "grad_norm": 0.8850924783689049, "learning_rate": 9.301744411993302e-06, "loss": 0.0991, "step": 1872 }, { "epoch": 0.8521383075523203, "grad_norm": 0.8273381019086633, "learning_rate": 9.30101571605519e-06, "loss": 0.0803, "step": 1873 }, { "epoch": 0.8525932666060054, "grad_norm": 0.6554434764444423, "learning_rate": 9.300286668657495e-06, "loss": 0.0737, "step": 1874 }, { "epoch": 0.8530482256596906, "grad_norm": 0.8230660869280486, "learning_rate": 9.299557269859795e-06, "loss": 0.0748, "step": 1875 }, { "epoch": 0.8535031847133758, "grad_norm": 0.609738768294497, "learning_rate": 9.298827519721692e-06, "loss": 0.0608, "step": 1876 }, { "epoch": 0.8539581437670609, "grad_norm": 0.7433208516076715, "learning_rate": 9.298097418302817e-06, "loss": 0.0992, "step": 1877 }, { "epoch": 0.8544131028207461, "grad_norm": 0.5414027711398505, "learning_rate": 9.29736696566283e-06, "loss": 0.0642, "step": 1878 }, { "epoch": 0.8548680618744313, "grad_norm": 0.8950820233319129, "learning_rate": 9.296636161861422e-06, "loss": 0.1121, "step": 1879 }, { "epoch": 0.8553230209281165, "grad_norm": 2.0225500877401617, "learning_rate": 9.295905006958308e-06, "loss": 0.1409, "step": 1880 }, { "epoch": 0.8557779799818016, "grad_norm": 0.7783660516278756, "learning_rate": 9.295173501013239e-06, "loss": 0.0974, "step": 1881 }, { "epoch": 0.8562329390354868, "grad_norm": 0.7064043776078144, "learning_rate": 9.29444164408599e-06, "loss": 0.0954, "step": 1882 }, { "epoch": 0.856687898089172, "grad_norm": 0.6658976396134992, "learning_rate": 9.29370943623636e-06, "loss": 0.0636, "step": 1883 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6825106501213147, "learning_rate": 9.292976877524189e-06, "loss": 0.0908, "step": 1884 }, { "epoch": 0.8575978161965423, "grad_norm": 0.8132731569130554, "learning_rate": 9.292243968009332e-06, "loss": 0.0952, "step": 1885 }, { "epoch": 0.8580527752502275, "grad_norm": 1.283740720887758, "learning_rate": 9.29151070775168e-06, "loss": 0.1407, "step": 1886 }, { "epoch": 0.8585077343039127, "grad_norm": 0.8987444265022443, "learning_rate": 9.290777096811156e-06, "loss": 0.1008, "step": 1887 }, { "epoch": 0.8589626933575978, "grad_norm": 0.9027753674161602, "learning_rate": 9.290043135247704e-06, "loss": 0.0917, "step": 1888 }, { "epoch": 0.859417652411283, "grad_norm": 0.7721264653335534, "learning_rate": 9.289308823121302e-06, "loss": 0.0876, "step": 1889 }, { "epoch": 0.8598726114649682, "grad_norm": 0.8645055674602313, "learning_rate": 9.28857416049195e-06, "loss": 0.0775, "step": 1890 }, { "epoch": 0.8603275705186533, "grad_norm": 0.7828026058785104, "learning_rate": 9.287839147419685e-06, "loss": 0.0953, "step": 1891 }, { "epoch": 0.8607825295723385, "grad_norm": 0.7581321197025821, "learning_rate": 9.287103783964571e-06, "loss": 0.1004, "step": 1892 }, { "epoch": 0.8612374886260237, "grad_norm": 0.5836098633522236, "learning_rate": 9.286368070186696e-06, "loss": 0.0586, "step": 1893 }, { "epoch": 0.8616924476797089, "grad_norm": 0.8102404855384281, "learning_rate": 9.285632006146178e-06, "loss": 0.0809, "step": 1894 }, { "epoch": 0.862147406733394, "grad_norm": 0.5684276012396848, "learning_rate": 9.284895591903167e-06, "loss": 0.0736, "step": 1895 }, { "epoch": 0.8626023657870792, "grad_norm": 0.629014301705328, "learning_rate": 9.284158827517838e-06, "loss": 0.0707, "step": 1896 }, { "epoch": 0.8630573248407644, "grad_norm": 0.6150335967135018, "learning_rate": 9.283421713050398e-06, "loss": 0.0665, "step": 1897 }, { "epoch": 0.8635122838944495, "grad_norm": 0.7977181385850289, "learning_rate": 9.282684248561078e-06, "loss": 0.1077, "step": 1898 }, { "epoch": 0.8639672429481347, "grad_norm": 0.5184482645002529, "learning_rate": 9.281946434110141e-06, "loss": 0.0594, "step": 1899 }, { "epoch": 0.8644222020018199, "grad_norm": 0.7148270230091635, "learning_rate": 9.28120826975788e-06, "loss": 0.1005, "step": 1900 }, { "epoch": 0.864877161055505, "grad_norm": 0.6020497479816633, "learning_rate": 9.280469755564613e-06, "loss": 0.0595, "step": 1901 }, { "epoch": 0.8653321201091901, "grad_norm": 0.7725143836000526, "learning_rate": 9.279730891590688e-06, "loss": 0.063, "step": 1902 }, { "epoch": 0.8657870791628753, "grad_norm": 0.5341160118168524, "learning_rate": 9.27899167789648e-06, "loss": 0.0649, "step": 1903 }, { "epoch": 0.8662420382165605, "grad_norm": 0.78025783272878, "learning_rate": 9.278252114542398e-06, "loss": 0.0987, "step": 1904 }, { "epoch": 0.8666969972702456, "grad_norm": 1.0383225939834173, "learning_rate": 9.277512201588871e-06, "loss": 0.1532, "step": 1905 }, { "epoch": 0.8671519563239308, "grad_norm": 0.742851971816876, "learning_rate": 9.276771939096367e-06, "loss": 0.1083, "step": 1906 }, { "epoch": 0.867606915377616, "grad_norm": 0.6246586544484709, "learning_rate": 9.276031327125371e-06, "loss": 0.0798, "step": 1907 }, { "epoch": 0.8680618744313012, "grad_norm": 0.6937230711216974, "learning_rate": 9.275290365736408e-06, "loss": 0.0764, "step": 1908 }, { "epoch": 0.8685168334849863, "grad_norm": 0.6405216327010745, "learning_rate": 9.274549054990022e-06, "loss": 0.0553, "step": 1909 }, { "epoch": 0.8689717925386715, "grad_norm": 0.6118088958703919, "learning_rate": 9.273807394946791e-06, "loss": 0.0719, "step": 1910 }, { "epoch": 0.8694267515923567, "grad_norm": 0.5929451056907732, "learning_rate": 9.27306538566732e-06, "loss": 0.0736, "step": 1911 }, { "epoch": 0.8698817106460418, "grad_norm": 0.551189089448713, "learning_rate": 9.272323027212244e-06, "loss": 0.0802, "step": 1912 }, { "epoch": 0.870336669699727, "grad_norm": 0.6964950682522272, "learning_rate": 9.271580319642221e-06, "loss": 0.0956, "step": 1913 }, { "epoch": 0.8707916287534122, "grad_norm": 0.656523844824833, "learning_rate": 9.270837263017947e-06, "loss": 0.0716, "step": 1914 }, { "epoch": 0.8712465878070974, "grad_norm": 0.5516956702822526, "learning_rate": 9.270093857400138e-06, "loss": 0.0756, "step": 1915 }, { "epoch": 0.8717015468607825, "grad_norm": 0.6458984664434074, "learning_rate": 9.269350102849542e-06, "loss": 0.0762, "step": 1916 }, { "epoch": 0.8721565059144677, "grad_norm": 0.6244797606471136, "learning_rate": 9.268605999426936e-06, "loss": 0.066, "step": 1917 }, { "epoch": 0.8726114649681529, "grad_norm": 1.3051429800547985, "learning_rate": 9.267861547193126e-06, "loss": 0.1487, "step": 1918 }, { "epoch": 0.873066424021838, "grad_norm": 0.9503536634109886, "learning_rate": 9.267116746208944e-06, "loss": 0.1088, "step": 1919 }, { "epoch": 0.8735213830755232, "grad_norm": 0.6872044557187451, "learning_rate": 9.26637159653525e-06, "loss": 0.0952, "step": 1920 }, { "epoch": 0.8739763421292084, "grad_norm": 0.8261797174841458, "learning_rate": 9.265626098232934e-06, "loss": 0.0917, "step": 1921 }, { "epoch": 0.8744313011828936, "grad_norm": 0.6285868744907084, "learning_rate": 9.26488025136292e-06, "loss": 0.0736, "step": 1922 }, { "epoch": 0.8748862602365787, "grad_norm": 0.95408072866655, "learning_rate": 9.264134055986152e-06, "loss": 0.09, "step": 1923 }, { "epoch": 0.8753412192902639, "grad_norm": 0.8126928412084633, "learning_rate": 9.263387512163604e-06, "loss": 0.0861, "step": 1924 }, { "epoch": 0.8757961783439491, "grad_norm": 0.628340619476289, "learning_rate": 9.262640619956282e-06, "loss": 0.0853, "step": 1925 }, { "epoch": 0.8762511373976342, "grad_norm": 0.822645279842771, "learning_rate": 9.261893379425218e-06, "loss": 0.0921, "step": 1926 }, { "epoch": 0.8767060964513194, "grad_norm": 0.664699910134531, "learning_rate": 9.261145790631475e-06, "loss": 0.0661, "step": 1927 }, { "epoch": 0.8771610555050046, "grad_norm": 0.46120202232971963, "learning_rate": 9.26039785363614e-06, "loss": 0.0548, "step": 1928 }, { "epoch": 0.8776160145586898, "grad_norm": 0.47348608915538554, "learning_rate": 9.259649568500333e-06, "loss": 0.0579, "step": 1929 }, { "epoch": 0.8780709736123748, "grad_norm": 0.5421377090850338, "learning_rate": 9.258900935285199e-06, "loss": 0.0591, "step": 1930 }, { "epoch": 0.87852593266606, "grad_norm": 0.5523212054660892, "learning_rate": 9.258151954051914e-06, "loss": 0.0757, "step": 1931 }, { "epoch": 0.8789808917197452, "grad_norm": 0.733320680764707, "learning_rate": 9.25740262486168e-06, "loss": 0.0999, "step": 1932 }, { "epoch": 0.8794358507734303, "grad_norm": 0.5636961368288687, "learning_rate": 9.25665294777573e-06, "loss": 0.0525, "step": 1933 }, { "epoch": 0.8798908098271155, "grad_norm": 0.5613709035035684, "learning_rate": 9.255902922855326e-06, "loss": 0.0512, "step": 1934 }, { "epoch": 0.8803457688808007, "grad_norm": 0.6266000159117329, "learning_rate": 9.255152550161753e-06, "loss": 0.0714, "step": 1935 }, { "epoch": 0.8808007279344859, "grad_norm": 0.5624931761265524, "learning_rate": 9.25440182975633e-06, "loss": 0.0667, "step": 1936 }, { "epoch": 0.881255686988171, "grad_norm": 0.8855653361345076, "learning_rate": 9.253650761700401e-06, "loss": 0.1104, "step": 1937 }, { "epoch": 0.8817106460418562, "grad_norm": 0.4051324158485566, "learning_rate": 9.252899346055343e-06, "loss": 0.0447, "step": 1938 }, { "epoch": 0.8821656050955414, "grad_norm": 0.6705030425420828, "learning_rate": 9.252147582882556e-06, "loss": 0.08, "step": 1939 }, { "epoch": 0.8826205641492265, "grad_norm": 0.745395756906896, "learning_rate": 9.25139547224347e-06, "loss": 0.0892, "step": 1940 }, { "epoch": 0.8830755232029117, "grad_norm": 0.9577657000178205, "learning_rate": 9.250643014199547e-06, "loss": 0.1144, "step": 1941 }, { "epoch": 0.8835304822565969, "grad_norm": 0.6774410545148242, "learning_rate": 9.24989020881227e-06, "loss": 0.0753, "step": 1942 }, { "epoch": 0.8839854413102821, "grad_norm": 0.7409774305157982, "learning_rate": 9.249137056143159e-06, "loss": 0.0722, "step": 1943 }, { "epoch": 0.8844404003639672, "grad_norm": 0.6042335346844097, "learning_rate": 9.248383556253758e-06, "loss": 0.0775, "step": 1944 }, { "epoch": 0.8848953594176524, "grad_norm": 0.8396643903072698, "learning_rate": 9.247629709205635e-06, "loss": 0.1051, "step": 1945 }, { "epoch": 0.8853503184713376, "grad_norm": 0.6590167845553623, "learning_rate": 9.246875515060396e-06, "loss": 0.0774, "step": 1946 }, { "epoch": 0.8858052775250227, "grad_norm": 0.5876827286169646, "learning_rate": 9.24612097387967e-06, "loss": 0.0768, "step": 1947 }, { "epoch": 0.8862602365787079, "grad_norm": 0.8894868784932225, "learning_rate": 9.245366085725111e-06, "loss": 0.0983, "step": 1948 }, { "epoch": 0.8867151956323931, "grad_norm": 0.5389319757607208, "learning_rate": 9.24461085065841e-06, "loss": 0.0571, "step": 1949 }, { "epoch": 0.8871701546860783, "grad_norm": 0.4677621224916707, "learning_rate": 9.243855268741275e-06, "loss": 0.0534, "step": 1950 }, { "epoch": 0.8876251137397634, "grad_norm": 0.6166575793819061, "learning_rate": 9.243099340035454e-06, "loss": 0.0679, "step": 1951 }, { "epoch": 0.8880800727934486, "grad_norm": 0.684219803564928, "learning_rate": 9.242343064602719e-06, "loss": 0.0797, "step": 1952 }, { "epoch": 0.8885350318471338, "grad_norm": 0.6543060915410528, "learning_rate": 9.241586442504865e-06, "loss": 0.0876, "step": 1953 }, { "epoch": 0.8889899909008189, "grad_norm": 0.6916358607655352, "learning_rate": 9.240829473803723e-06, "loss": 0.0816, "step": 1954 }, { "epoch": 0.8894449499545041, "grad_norm": 0.6650683160408256, "learning_rate": 9.240072158561146e-06, "loss": 0.0851, "step": 1955 }, { "epoch": 0.8898999090081893, "grad_norm": 0.8336397769475173, "learning_rate": 9.239314496839022e-06, "loss": 0.1075, "step": 1956 }, { "epoch": 0.8903548680618745, "grad_norm": 0.6498784190415388, "learning_rate": 9.23855648869926e-06, "loss": 0.0748, "step": 1957 }, { "epoch": 0.8908098271155596, "grad_norm": 0.7894795440995916, "learning_rate": 9.237798134203803e-06, "loss": 0.1045, "step": 1958 }, { "epoch": 0.8912647861692448, "grad_norm": 0.5980997509859944, "learning_rate": 9.237039433414623e-06, "loss": 0.079, "step": 1959 }, { "epoch": 0.89171974522293, "grad_norm": 0.8222326498301533, "learning_rate": 9.236280386393712e-06, "loss": 0.082, "step": 1960 }, { "epoch": 0.892174704276615, "grad_norm": 0.6293204676003961, "learning_rate": 9.2355209932031e-06, "loss": 0.0741, "step": 1961 }, { "epoch": 0.8926296633303002, "grad_norm": 0.47863668175134233, "learning_rate": 9.23476125390484e-06, "loss": 0.0524, "step": 1962 }, { "epoch": 0.8930846223839854, "grad_norm": 0.7798093326874596, "learning_rate": 9.234001168561013e-06, "loss": 0.0691, "step": 1963 }, { "epoch": 0.8935395814376706, "grad_norm": 0.7301612531501247, "learning_rate": 9.233240737233733e-06, "loss": 0.0965, "step": 1964 }, { "epoch": 0.8939945404913557, "grad_norm": 1.0452984923884894, "learning_rate": 9.232479959985136e-06, "loss": 0.1293, "step": 1965 }, { "epoch": 0.8944494995450409, "grad_norm": 0.6963389022030017, "learning_rate": 9.23171883687739e-06, "loss": 0.0767, "step": 1966 }, { "epoch": 0.8949044585987261, "grad_norm": 0.45171069390219404, "learning_rate": 9.23095736797269e-06, "loss": 0.0522, "step": 1967 }, { "epoch": 0.8953594176524113, "grad_norm": 1.0061313103020273, "learning_rate": 9.230195553333263e-06, "loss": 0.1277, "step": 1968 }, { "epoch": 0.8958143767060964, "grad_norm": 1.5986138982364897, "learning_rate": 9.229433393021358e-06, "loss": 0.1405, "step": 1969 }, { "epoch": 0.8962693357597816, "grad_norm": 0.6908357505139043, "learning_rate": 9.228670887099256e-06, "loss": 0.0739, "step": 1970 }, { "epoch": 0.8967242948134668, "grad_norm": 0.5277345258701365, "learning_rate": 9.227908035629266e-06, "loss": 0.0526, "step": 1971 }, { "epoch": 0.8971792538671519, "grad_norm": 0.6285224648148875, "learning_rate": 9.227144838673724e-06, "loss": 0.0706, "step": 1972 }, { "epoch": 0.8976342129208371, "grad_norm": 0.949308919855668, "learning_rate": 9.226381296294995e-06, "loss": 0.1045, "step": 1973 }, { "epoch": 0.8980891719745223, "grad_norm": 0.752138900094858, "learning_rate": 9.225617408555471e-06, "loss": 0.0907, "step": 1974 }, { "epoch": 0.8985441310282075, "grad_norm": 0.9650799951574368, "learning_rate": 9.224853175517578e-06, "loss": 0.1261, "step": 1975 }, { "epoch": 0.8989990900818926, "grad_norm": 0.6368811817284902, "learning_rate": 9.224088597243762e-06, "loss": 0.0759, "step": 1976 }, { "epoch": 0.8994540491355778, "grad_norm": 0.7403608884362824, "learning_rate": 9.223323673796503e-06, "loss": 0.081, "step": 1977 }, { "epoch": 0.899909008189263, "grad_norm": 0.8033696439311833, "learning_rate": 9.222558405238303e-06, "loss": 0.0968, "step": 1978 }, { "epoch": 0.9003639672429481, "grad_norm": 0.7306511821068437, "learning_rate": 9.2217927916317e-06, "loss": 0.0916, "step": 1979 }, { "epoch": 0.9008189262966333, "grad_norm": 0.8380967239417318, "learning_rate": 9.221026833039256e-06, "loss": 0.0945, "step": 1980 }, { "epoch": 0.9012738853503185, "grad_norm": 0.7718744506924977, "learning_rate": 9.220260529523561e-06, "loss": 0.0918, "step": 1981 }, { "epoch": 0.9017288444040037, "grad_norm": 0.7393925382776323, "learning_rate": 9.219493881147234e-06, "loss": 0.0816, "step": 1982 }, { "epoch": 0.9021838034576888, "grad_norm": 0.7687427983757074, "learning_rate": 9.218726887972923e-06, "loss": 0.0835, "step": 1983 }, { "epoch": 0.902638762511374, "grad_norm": 0.6785077320109779, "learning_rate": 9.2179595500633e-06, "loss": 0.0799, "step": 1984 }, { "epoch": 0.9030937215650592, "grad_norm": 0.9172539926736025, "learning_rate": 9.217191867481072e-06, "loss": 0.1147, "step": 1985 }, { "epoch": 0.9035486806187443, "grad_norm": 0.9222679238503178, "learning_rate": 9.21642384028897e-06, "loss": 0.127, "step": 1986 }, { "epoch": 0.9040036396724295, "grad_norm": 0.8844523810912496, "learning_rate": 9.215655468549752e-06, "loss": 0.1013, "step": 1987 }, { "epoch": 0.9044585987261147, "grad_norm": 0.5874811797706115, "learning_rate": 9.214886752326208e-06, "loss": 0.0528, "step": 1988 }, { "epoch": 0.9049135577797999, "grad_norm": 0.6774186522730414, "learning_rate": 9.214117691681152e-06, "loss": 0.0749, "step": 1989 }, { "epoch": 0.905368516833485, "grad_norm": 0.46678264083336873, "learning_rate": 9.213348286677429e-06, "loss": 0.0502, "step": 1990 }, { "epoch": 0.9058234758871702, "grad_norm": 0.6369505909634797, "learning_rate": 9.21257853737791e-06, "loss": 0.0597, "step": 1991 }, { "epoch": 0.9062784349408554, "grad_norm": 0.7872482528902512, "learning_rate": 9.211808443845499e-06, "loss": 0.0842, "step": 1992 }, { "epoch": 0.9067333939945404, "grad_norm": 0.6991340678786092, "learning_rate": 9.211038006143121e-06, "loss": 0.0714, "step": 1993 }, { "epoch": 0.9071883530482256, "grad_norm": 0.5842126029431552, "learning_rate": 9.210267224333735e-06, "loss": 0.0686, "step": 1994 }, { "epoch": 0.9076433121019108, "grad_norm": 0.6405241386542652, "learning_rate": 9.209496098480324e-06, "loss": 0.0843, "step": 1995 }, { "epoch": 0.908098271155596, "grad_norm": 0.6431855863004138, "learning_rate": 9.208724628645901e-06, "loss": 0.0781, "step": 1996 }, { "epoch": 0.9085532302092811, "grad_norm": 0.6571372788631167, "learning_rate": 9.207952814893511e-06, "loss": 0.0746, "step": 1997 }, { "epoch": 0.9090081892629663, "grad_norm": 0.6228847041781231, "learning_rate": 9.207180657286216e-06, "loss": 0.0563, "step": 1998 }, { "epoch": 0.9094631483166515, "grad_norm": 0.6649592874484661, "learning_rate": 9.20640815588712e-06, "loss": 0.0737, "step": 1999 }, { "epoch": 0.9099181073703366, "grad_norm": 0.6395827893566276, "learning_rate": 9.205635310759344e-06, "loss": 0.0864, "step": 2000 }, { "epoch": 0.9103730664240218, "grad_norm": 0.6470816609318947, "learning_rate": 9.204862121966044e-06, "loss": 0.0819, "step": 2001 }, { "epoch": 0.910828025477707, "grad_norm": 0.6954176357821441, "learning_rate": 9.2040885895704e-06, "loss": 0.0935, "step": 2002 }, { "epoch": 0.9112829845313922, "grad_norm": 0.5250024400720148, "learning_rate": 9.203314713635621e-06, "loss": 0.0521, "step": 2003 }, { "epoch": 0.9117379435850773, "grad_norm": 0.6765818316745539, "learning_rate": 9.202540494224946e-06, "loss": 0.1078, "step": 2004 }, { "epoch": 0.9121929026387625, "grad_norm": 0.7602463030942905, "learning_rate": 9.20176593140164e-06, "loss": 0.068, "step": 2005 }, { "epoch": 0.9126478616924477, "grad_norm": 0.4564764883431911, "learning_rate": 9.200991025228998e-06, "loss": 0.0576, "step": 2006 }, { "epoch": 0.9131028207461328, "grad_norm": 0.87338946860691, "learning_rate": 9.20021577577034e-06, "loss": 0.1155, "step": 2007 }, { "epoch": 0.913557779799818, "grad_norm": 0.67443699378812, "learning_rate": 9.199440183089019e-06, "loss": 0.0803, "step": 2008 }, { "epoch": 0.9140127388535032, "grad_norm": 0.697779741574365, "learning_rate": 9.198664247248408e-06, "loss": 0.0886, "step": 2009 }, { "epoch": 0.9144676979071884, "grad_norm": 0.6888292123310293, "learning_rate": 9.197887968311917e-06, "loss": 0.088, "step": 2010 }, { "epoch": 0.9149226569608735, "grad_norm": 0.593887211300783, "learning_rate": 9.197111346342979e-06, "loss": 0.0597, "step": 2011 }, { "epoch": 0.9153776160145587, "grad_norm": 0.5222048906208826, "learning_rate": 9.196334381405055e-06, "loss": 0.055, "step": 2012 }, { "epoch": 0.9158325750682439, "grad_norm": 0.7406902681131339, "learning_rate": 9.195557073561636e-06, "loss": 0.0725, "step": 2013 }, { "epoch": 0.916287534121929, "grad_norm": 0.7369752030698005, "learning_rate": 9.194779422876242e-06, "loss": 0.0725, "step": 2014 }, { "epoch": 0.9167424931756142, "grad_norm": 0.5674786098045346, "learning_rate": 9.194001429412414e-06, "loss": 0.0528, "step": 2015 }, { "epoch": 0.9171974522292994, "grad_norm": 0.9561188233992612, "learning_rate": 9.19322309323373e-06, "loss": 0.1213, "step": 2016 }, { "epoch": 0.9176524112829846, "grad_norm": 0.7666480467189352, "learning_rate": 9.192444414403792e-06, "loss": 0.0788, "step": 2017 }, { "epoch": 0.9181073703366697, "grad_norm": 1.0242939804657472, "learning_rate": 9.19166539298623e-06, "loss": 0.1341, "step": 2018 }, { "epoch": 0.9185623293903549, "grad_norm": 0.6407407288510717, "learning_rate": 9.1908860290447e-06, "loss": 0.0702, "step": 2019 }, { "epoch": 0.9190172884440401, "grad_norm": 0.9262978099585683, "learning_rate": 9.190106322642888e-06, "loss": 0.0962, "step": 2020 }, { "epoch": 0.9194722474977252, "grad_norm": 0.6371294810639554, "learning_rate": 9.189326273844512e-06, "loss": 0.0716, "step": 2021 }, { "epoch": 0.9199272065514104, "grad_norm": 0.616042736799084, "learning_rate": 9.18854588271331e-06, "loss": 0.0697, "step": 2022 }, { "epoch": 0.9203821656050956, "grad_norm": 0.8652881040430276, "learning_rate": 9.187765149313057e-06, "loss": 0.0949, "step": 2023 }, { "epoch": 0.9208371246587808, "grad_norm": 0.7171212404467417, "learning_rate": 9.186984073707545e-06, "loss": 0.0685, "step": 2024 }, { "epoch": 0.9212920837124658, "grad_norm": 0.6434040420425213, "learning_rate": 9.186202655960603e-06, "loss": 0.0774, "step": 2025 }, { "epoch": 0.921747042766151, "grad_norm": 0.6537324523008204, "learning_rate": 9.185420896136086e-06, "loss": 0.0786, "step": 2026 }, { "epoch": 0.9222020018198362, "grad_norm": 0.6271186642997567, "learning_rate": 9.184638794297873e-06, "loss": 0.0636, "step": 2027 }, { "epoch": 0.9226569608735213, "grad_norm": 0.7041069370791754, "learning_rate": 9.183856350509877e-06, "loss": 0.0809, "step": 2028 }, { "epoch": 0.9231119199272065, "grad_norm": 0.8781019574614535, "learning_rate": 9.183073564836033e-06, "loss": 0.1051, "step": 2029 }, { "epoch": 0.9235668789808917, "grad_norm": 0.48818413319632054, "learning_rate": 9.182290437340308e-06, "loss": 0.0474, "step": 2030 }, { "epoch": 0.9240218380345769, "grad_norm": 0.8775797840737246, "learning_rate": 9.181506968086696e-06, "loss": 0.0949, "step": 2031 }, { "epoch": 0.924476797088262, "grad_norm": 0.958612912496998, "learning_rate": 9.180723157139218e-06, "loss": 0.121, "step": 2032 }, { "epoch": 0.9249317561419472, "grad_norm": 0.6245762602830833, "learning_rate": 9.179939004561925e-06, "loss": 0.0655, "step": 2033 }, { "epoch": 0.9253867151956324, "grad_norm": 0.5017046465493271, "learning_rate": 9.17915451041889e-06, "loss": 0.0661, "step": 2034 }, { "epoch": 0.9258416742493175, "grad_norm": 0.710064858137144, "learning_rate": 9.178369674774224e-06, "loss": 0.0791, "step": 2035 }, { "epoch": 0.9262966333030027, "grad_norm": 0.587851189554333, "learning_rate": 9.177584497692056e-06, "loss": 0.0637, "step": 2036 }, { "epoch": 0.9267515923566879, "grad_norm": 1.3023478543600886, "learning_rate": 9.176798979236548e-06, "loss": 0.1095, "step": 2037 }, { "epoch": 0.9272065514103731, "grad_norm": 0.540716658575828, "learning_rate": 9.17601311947189e-06, "loss": 0.0693, "step": 2038 }, { "epoch": 0.9276615104640582, "grad_norm": 0.6208372361565256, "learning_rate": 9.175226918462298e-06, "loss": 0.0718, "step": 2039 }, { "epoch": 0.9281164695177434, "grad_norm": 0.7701609774864682, "learning_rate": 9.174440376272021e-06, "loss": 0.0976, "step": 2040 }, { "epoch": 0.9285714285714286, "grad_norm": 0.7010494768516853, "learning_rate": 9.173653492965325e-06, "loss": 0.0993, "step": 2041 }, { "epoch": 0.9290263876251137, "grad_norm": 0.6373763175184742, "learning_rate": 9.172866268606514e-06, "loss": 0.0724, "step": 2042 }, { "epoch": 0.9294813466787989, "grad_norm": 0.701200286737339, "learning_rate": 9.172078703259917e-06, "loss": 0.0825, "step": 2043 }, { "epoch": 0.9299363057324841, "grad_norm": 0.4368340952860916, "learning_rate": 9.171290796989887e-06, "loss": 0.0477, "step": 2044 }, { "epoch": 0.9303912647861693, "grad_norm": 0.6370651977402901, "learning_rate": 9.170502549860813e-06, "loss": 0.0796, "step": 2045 }, { "epoch": 0.9308462238398544, "grad_norm": 1.1692149001382897, "learning_rate": 9.169713961937104e-06, "loss": 0.122, "step": 2046 }, { "epoch": 0.9313011828935396, "grad_norm": 0.694595823352437, "learning_rate": 9.168925033283199e-06, "loss": 0.0935, "step": 2047 }, { "epoch": 0.9317561419472248, "grad_norm": 0.672175800896758, "learning_rate": 9.168135763963567e-06, "loss": 0.0763, "step": 2048 }, { "epoch": 0.9322111010009099, "grad_norm": 0.5254037194744254, "learning_rate": 9.167346154042705e-06, "loss": 0.0535, "step": 2049 }, { "epoch": 0.9326660600545951, "grad_norm": 0.6788074343357934, "learning_rate": 9.166556203585134e-06, "loss": 0.0804, "step": 2050 }, { "epoch": 0.9331210191082803, "grad_norm": 0.6950456412782345, "learning_rate": 9.165765912655407e-06, "loss": 0.0727, "step": 2051 }, { "epoch": 0.9335759781619655, "grad_norm": 0.8037111447772672, "learning_rate": 9.1649752813181e-06, "loss": 0.0811, "step": 2052 }, { "epoch": 0.9340309372156506, "grad_norm": 0.6043473581913603, "learning_rate": 9.164184309637824e-06, "loss": 0.0773, "step": 2053 }, { "epoch": 0.9344858962693358, "grad_norm": 0.6914300193057683, "learning_rate": 9.16339299767921e-06, "loss": 0.0888, "step": 2054 }, { "epoch": 0.934940855323021, "grad_norm": 0.5973299516809696, "learning_rate": 9.162601345506923e-06, "loss": 0.0771, "step": 2055 }, { "epoch": 0.935395814376706, "grad_norm": 0.5667027927032561, "learning_rate": 9.161809353185651e-06, "loss": 0.0589, "step": 2056 }, { "epoch": 0.9358507734303912, "grad_norm": 0.5892355686848351, "learning_rate": 9.161017020780114e-06, "loss": 0.0562, "step": 2057 }, { "epoch": 0.9363057324840764, "grad_norm": 0.8503563061945567, "learning_rate": 9.160224348355057e-06, "loss": 0.1075, "step": 2058 }, { "epoch": 0.9367606915377616, "grad_norm": 0.8030569297687169, "learning_rate": 9.159431335975255e-06, "loss": 0.0651, "step": 2059 }, { "epoch": 0.9372156505914467, "grad_norm": 0.6182029602806504, "learning_rate": 9.158637983705505e-06, "loss": 0.0908, "step": 2060 }, { "epoch": 0.9376706096451319, "grad_norm": 0.6167088007283392, "learning_rate": 9.157844291610641e-06, "loss": 0.0719, "step": 2061 }, { "epoch": 0.9381255686988171, "grad_norm": 1.0378949375185438, "learning_rate": 9.157050259755519e-06, "loss": 0.0925, "step": 2062 }, { "epoch": 0.9385805277525022, "grad_norm": 0.6009053311569907, "learning_rate": 9.156255888205021e-06, "loss": 0.0868, "step": 2063 }, { "epoch": 0.9390354868061874, "grad_norm": 0.6730461926983252, "learning_rate": 9.155461177024062e-06, "loss": 0.0791, "step": 2064 }, { "epoch": 0.9394904458598726, "grad_norm": 0.8310142050561945, "learning_rate": 9.154666126277582e-06, "loss": 0.0882, "step": 2065 }, { "epoch": 0.9399454049135578, "grad_norm": 0.5455153208822874, "learning_rate": 9.153870736030549e-06, "loss": 0.0651, "step": 2066 }, { "epoch": 0.9404003639672429, "grad_norm": 0.8245922923142007, "learning_rate": 9.153075006347957e-06, "loss": 0.1357, "step": 2067 }, { "epoch": 0.9408553230209281, "grad_norm": 0.7891736693746195, "learning_rate": 9.15227893729483e-06, "loss": 0.0879, "step": 2068 }, { "epoch": 0.9413102820746133, "grad_norm": 0.6032022964433661, "learning_rate": 9.151482528936222e-06, "loss": 0.0594, "step": 2069 }, { "epoch": 0.9417652411282984, "grad_norm": 0.8087071917107507, "learning_rate": 9.150685781337207e-06, "loss": 0.0872, "step": 2070 }, { "epoch": 0.9422202001819836, "grad_norm": 1.1875700013397057, "learning_rate": 9.149888694562896e-06, "loss": 0.1447, "step": 2071 }, { "epoch": 0.9426751592356688, "grad_norm": 0.7351727785498874, "learning_rate": 9.149091268678423e-06, "loss": 0.0708, "step": 2072 }, { "epoch": 0.943130118289354, "grad_norm": 0.6792286796417435, "learning_rate": 9.148293503748947e-06, "loss": 0.0876, "step": 2073 }, { "epoch": 0.9435850773430391, "grad_norm": 0.7417762096300724, "learning_rate": 9.14749539983966e-06, "loss": 0.0852, "step": 2074 }, { "epoch": 0.9440400363967243, "grad_norm": 0.5155173170030183, "learning_rate": 9.146696957015777e-06, "loss": 0.0606, "step": 2075 }, { "epoch": 0.9444949954504095, "grad_norm": 1.1023064832096257, "learning_rate": 9.145898175342545e-06, "loss": 0.1488, "step": 2076 }, { "epoch": 0.9449499545040946, "grad_norm": 0.6914694719967308, "learning_rate": 9.145099054885238e-06, "loss": 0.0816, "step": 2077 }, { "epoch": 0.9454049135577798, "grad_norm": 0.6905933706764309, "learning_rate": 9.144299595709156e-06, "loss": 0.0876, "step": 2078 }, { "epoch": 0.945859872611465, "grad_norm": 0.5233906895741112, "learning_rate": 9.143499797879626e-06, "loss": 0.0562, "step": 2079 }, { "epoch": 0.9463148316651502, "grad_norm": 0.5101515836442003, "learning_rate": 9.142699661462005e-06, "loss": 0.0559, "step": 2080 }, { "epoch": 0.9467697907188353, "grad_norm": 0.48017157157527135, "learning_rate": 9.141899186521675e-06, "loss": 0.0503, "step": 2081 }, { "epoch": 0.9472247497725205, "grad_norm": 0.6592673728640894, "learning_rate": 9.141098373124048e-06, "loss": 0.0797, "step": 2082 }, { "epoch": 0.9476797088262057, "grad_norm": 0.86432014477488, "learning_rate": 9.140297221334562e-06, "loss": 0.0858, "step": 2083 }, { "epoch": 0.9481346678798908, "grad_norm": 1.0397141319559977, "learning_rate": 9.139495731218685e-06, "loss": 0.1198, "step": 2084 }, { "epoch": 0.948589626933576, "grad_norm": 0.862052866017664, "learning_rate": 9.138693902841914e-06, "loss": 0.1056, "step": 2085 }, { "epoch": 0.9490445859872612, "grad_norm": 0.7709077621401632, "learning_rate": 9.137891736269764e-06, "loss": 0.0918, "step": 2086 }, { "epoch": 0.9494995450409464, "grad_norm": 0.8691294728765458, "learning_rate": 9.137089231567789e-06, "loss": 0.0925, "step": 2087 }, { "epoch": 0.9499545040946314, "grad_norm": 0.6098999809715144, "learning_rate": 9.136286388801564e-06, "loss": 0.0673, "step": 2088 }, { "epoch": 0.9504094631483166, "grad_norm": 0.7157788293123913, "learning_rate": 9.135483208036695e-06, "loss": 0.0802, "step": 2089 }, { "epoch": 0.9508644222020018, "grad_norm": 0.9397853662008804, "learning_rate": 9.134679689338814e-06, "loss": 0.1021, "step": 2090 }, { "epoch": 0.9513193812556869, "grad_norm": 0.5449934450219076, "learning_rate": 9.133875832773582e-06, "loss": 0.0698, "step": 2091 }, { "epoch": 0.9517743403093721, "grad_norm": 0.5678662789014983, "learning_rate": 9.133071638406684e-06, "loss": 0.0726, "step": 2092 }, { "epoch": 0.9522292993630573, "grad_norm": 0.704718355722168, "learning_rate": 9.132267106303836e-06, "loss": 0.0949, "step": 2093 }, { "epoch": 0.9526842584167425, "grad_norm": 0.7119333629649424, "learning_rate": 9.131462236530784e-06, "loss": 0.0815, "step": 2094 }, { "epoch": 0.9531392174704276, "grad_norm": 0.9543831010874976, "learning_rate": 9.130657029153293e-06, "loss": 0.1037, "step": 2095 }, { "epoch": 0.9535941765241128, "grad_norm": 0.4141088945678519, "learning_rate": 9.129851484237165e-06, "loss": 0.0438, "step": 2096 }, { "epoch": 0.954049135577798, "grad_norm": 0.880955172212152, "learning_rate": 9.129045601848222e-06, "loss": 0.1139, "step": 2097 }, { "epoch": 0.9545040946314831, "grad_norm": 0.5340666725025275, "learning_rate": 9.12823938205232e-06, "loss": 0.0662, "step": 2098 }, { "epoch": 0.9549590536851683, "grad_norm": 0.7598809630255295, "learning_rate": 9.127432824915339e-06, "loss": 0.086, "step": 2099 }, { "epoch": 0.9554140127388535, "grad_norm": 0.5889551801250265, "learning_rate": 9.126625930503187e-06, "loss": 0.0618, "step": 2100 }, { "epoch": 0.9558689717925387, "grad_norm": 0.7452095277301981, "learning_rate": 9.125818698881798e-06, "loss": 0.0846, "step": 2101 }, { "epoch": 0.9563239308462238, "grad_norm": 0.874570701264544, "learning_rate": 9.125011130117139e-06, "loss": 0.0711, "step": 2102 }, { "epoch": 0.956778889899909, "grad_norm": 0.6700889468480424, "learning_rate": 9.124203224275198e-06, "loss": 0.0771, "step": 2103 }, { "epoch": 0.9572338489535942, "grad_norm": 0.5713697589917575, "learning_rate": 9.123394981421995e-06, "loss": 0.0647, "step": 2104 }, { "epoch": 0.9576888080072793, "grad_norm": 0.7416406361243658, "learning_rate": 9.122586401623574e-06, "loss": 0.0797, "step": 2105 }, { "epoch": 0.9581437670609645, "grad_norm": 0.8792771411195691, "learning_rate": 9.12177748494601e-06, "loss": 0.1043, "step": 2106 }, { "epoch": 0.9585987261146497, "grad_norm": 0.8409261244287831, "learning_rate": 9.120968231455406e-06, "loss": 0.0968, "step": 2107 }, { "epoch": 0.9590536851683349, "grad_norm": 0.588499824544961, "learning_rate": 9.120158641217885e-06, "loss": 0.0675, "step": 2108 }, { "epoch": 0.95950864422202, "grad_norm": 0.5664840104040384, "learning_rate": 9.119348714299607e-06, "loss": 0.0721, "step": 2109 }, { "epoch": 0.9599636032757052, "grad_norm": 0.7544363313105896, "learning_rate": 9.118538450766755e-06, "loss": 0.0723, "step": 2110 }, { "epoch": 0.9604185623293904, "grad_norm": 0.6699256182505398, "learning_rate": 9.117727850685541e-06, "loss": 0.0669, "step": 2111 }, { "epoch": 0.9608735213830755, "grad_norm": 0.5711605071447146, "learning_rate": 9.116916914122202e-06, "loss": 0.0637, "step": 2112 }, { "epoch": 0.9613284804367607, "grad_norm": 0.6965803730129388, "learning_rate": 9.116105641143005e-06, "loss": 0.0744, "step": 2113 }, { "epoch": 0.9617834394904459, "grad_norm": 0.8598026014818454, "learning_rate": 9.115294031814242e-06, "loss": 0.0937, "step": 2114 }, { "epoch": 0.9622383985441311, "grad_norm": 0.5794082624701737, "learning_rate": 9.114482086202236e-06, "loss": 0.0675, "step": 2115 }, { "epoch": 0.9626933575978162, "grad_norm": 0.7600807206599288, "learning_rate": 9.113669804373335e-06, "loss": 0.1047, "step": 2116 }, { "epoch": 0.9631483166515014, "grad_norm": 0.6377342056356247, "learning_rate": 9.112857186393913e-06, "loss": 0.0676, "step": 2117 }, { "epoch": 0.9636032757051866, "grad_norm": 1.1042469320816768, "learning_rate": 9.112044232330377e-06, "loss": 0.1508, "step": 2118 }, { "epoch": 0.9640582347588716, "grad_norm": 0.817690744261235, "learning_rate": 9.111230942249156e-06, "loss": 0.0904, "step": 2119 }, { "epoch": 0.9645131938125568, "grad_norm": 0.7037231293816442, "learning_rate": 9.110417316216708e-06, "loss": 0.0636, "step": 2120 }, { "epoch": 0.964968152866242, "grad_norm": 0.6588945759110881, "learning_rate": 9.10960335429952e-06, "loss": 0.0684, "step": 2121 }, { "epoch": 0.9654231119199272, "grad_norm": 0.6220308381200076, "learning_rate": 9.108789056564105e-06, "loss": 0.0877, "step": 2122 }, { "epoch": 0.9658780709736123, "grad_norm": 0.6262721502493606, "learning_rate": 9.107974423077001e-06, "loss": 0.0642, "step": 2123 }, { "epoch": 0.9663330300272975, "grad_norm": 0.9510165739511419, "learning_rate": 9.107159453904781e-06, "loss": 0.0994, "step": 2124 }, { "epoch": 0.9667879890809827, "grad_norm": 0.7410601791583596, "learning_rate": 9.10634414911404e-06, "loss": 0.0751, "step": 2125 }, { "epoch": 0.9672429481346679, "grad_norm": 0.592927363864185, "learning_rate": 9.105528508771395e-06, "loss": 0.0785, "step": 2126 }, { "epoch": 0.967697907188353, "grad_norm": 0.704125884709214, "learning_rate": 9.104712532943502e-06, "loss": 0.0672, "step": 2127 }, { "epoch": 0.9681528662420382, "grad_norm": 0.6763649668606744, "learning_rate": 9.10389622169704e-06, "loss": 0.0813, "step": 2128 }, { "epoch": 0.9686078252957234, "grad_norm": 1.0481681916194059, "learning_rate": 9.103079575098708e-06, "loss": 0.1165, "step": 2129 }, { "epoch": 0.9690627843494085, "grad_norm": 0.6244343397167454, "learning_rate": 9.102262593215246e-06, "loss": 0.0548, "step": 2130 }, { "epoch": 0.9695177434030937, "grad_norm": 0.6662772517701377, "learning_rate": 9.101445276113407e-06, "loss": 0.0672, "step": 2131 }, { "epoch": 0.9699727024567789, "grad_norm": 0.7302079833291476, "learning_rate": 9.100627623859985e-06, "loss": 0.0747, "step": 2132 }, { "epoch": 0.9704276615104641, "grad_norm": 0.7003598456468986, "learning_rate": 9.09980963652179e-06, "loss": 0.0763, "step": 2133 }, { "epoch": 0.9708826205641492, "grad_norm": 0.8675523177046712, "learning_rate": 9.098991314165668e-06, "loss": 0.1123, "step": 2134 }, { "epoch": 0.9713375796178344, "grad_norm": 0.6531391716615499, "learning_rate": 9.098172656858484e-06, "loss": 0.0626, "step": 2135 }, { "epoch": 0.9717925386715196, "grad_norm": 0.8230462520119928, "learning_rate": 9.097353664667138e-06, "loss": 0.0873, "step": 2136 }, { "epoch": 0.9722474977252047, "grad_norm": 0.6524897158303723, "learning_rate": 9.096534337658558e-06, "loss": 0.0658, "step": 2137 }, { "epoch": 0.9727024567788899, "grad_norm": 0.7421742040769631, "learning_rate": 9.095714675899688e-06, "loss": 0.0782, "step": 2138 }, { "epoch": 0.9731574158325751, "grad_norm": 0.6400011673563383, "learning_rate": 9.094894679457511e-06, "loss": 0.0605, "step": 2139 }, { "epoch": 0.9736123748862603, "grad_norm": 0.5825220963314399, "learning_rate": 9.094074348399034e-06, "loss": 0.0711, "step": 2140 }, { "epoch": 0.9740673339399454, "grad_norm": 0.9652267063711952, "learning_rate": 9.09325368279129e-06, "loss": 0.0996, "step": 2141 }, { "epoch": 0.9745222929936306, "grad_norm": 0.9291202899333796, "learning_rate": 9.09243268270134e-06, "loss": 0.0818, "step": 2142 }, { "epoch": 0.9749772520473158, "grad_norm": 0.8799622533298002, "learning_rate": 9.091611348196272e-06, "loss": 0.0904, "step": 2143 }, { "epoch": 0.9754322111010009, "grad_norm": 0.8326816067428606, "learning_rate": 9.090789679343201e-06, "loss": 0.0931, "step": 2144 }, { "epoch": 0.9758871701546861, "grad_norm": 0.783000579713321, "learning_rate": 9.089967676209274e-06, "loss": 0.0879, "step": 2145 }, { "epoch": 0.9763421292083713, "grad_norm": 0.7001846964382422, "learning_rate": 9.089145338861657e-06, "loss": 0.0916, "step": 2146 }, { "epoch": 0.9767970882620565, "grad_norm": 0.953946241556791, "learning_rate": 9.08832266736755e-06, "loss": 0.1205, "step": 2147 }, { "epoch": 0.9772520473157416, "grad_norm": 0.7358151559070641, "learning_rate": 9.087499661794177e-06, "loss": 0.0915, "step": 2148 }, { "epoch": 0.9777070063694268, "grad_norm": 0.8142291270830226, "learning_rate": 9.08667632220879e-06, "loss": 0.0995, "step": 2149 }, { "epoch": 0.978161965423112, "grad_norm": 0.7106034630801776, "learning_rate": 9.08585264867867e-06, "loss": 0.0783, "step": 2150 }, { "epoch": 0.978616924476797, "grad_norm": 0.826812478555379, "learning_rate": 9.085028641271123e-06, "loss": 0.1058, "step": 2151 }, { "epoch": 0.9790718835304822, "grad_norm": 0.8960647231942128, "learning_rate": 9.084204300053483e-06, "loss": 0.108, "step": 2152 }, { "epoch": 0.9795268425841674, "grad_norm": 0.7308955491972883, "learning_rate": 9.083379625093111e-06, "loss": 0.0963, "step": 2153 }, { "epoch": 0.9799818016378526, "grad_norm": 0.854998609995297, "learning_rate": 9.082554616457397e-06, "loss": 0.1031, "step": 2154 }, { "epoch": 0.9804367606915377, "grad_norm": 0.6134903423880519, "learning_rate": 9.081729274213758e-06, "loss": 0.0728, "step": 2155 }, { "epoch": 0.9808917197452229, "grad_norm": 0.7494461465991118, "learning_rate": 9.080903598429634e-06, "loss": 0.0612, "step": 2156 }, { "epoch": 0.9813466787989081, "grad_norm": 0.6477350071161301, "learning_rate": 9.080077589172496e-06, "loss": 0.0725, "step": 2157 }, { "epoch": 0.9818016378525932, "grad_norm": 0.5949372826775987, "learning_rate": 9.079251246509846e-06, "loss": 0.0618, "step": 2158 }, { "epoch": 0.9822565969062784, "grad_norm": 1.0457437129682037, "learning_rate": 9.078424570509202e-06, "loss": 0.134, "step": 2159 }, { "epoch": 0.9827115559599636, "grad_norm": 0.7562918714504535, "learning_rate": 9.077597561238123e-06, "loss": 0.0746, "step": 2160 }, { "epoch": 0.9831665150136488, "grad_norm": 0.705691881874251, "learning_rate": 9.076770218764186e-06, "loss": 0.0903, "step": 2161 }, { "epoch": 0.9836214740673339, "grad_norm": 0.700571619924188, "learning_rate": 9.075942543154996e-06, "loss": 0.0905, "step": 2162 }, { "epoch": 0.9840764331210191, "grad_norm": 0.5178609664739039, "learning_rate": 9.075114534478187e-06, "loss": 0.0623, "step": 2163 }, { "epoch": 0.9845313921747043, "grad_norm": 0.5564063525132696, "learning_rate": 9.074286192801423e-06, "loss": 0.0622, "step": 2164 }, { "epoch": 0.9849863512283894, "grad_norm": 0.8390150599738658, "learning_rate": 9.07345751819239e-06, "loss": 0.0894, "step": 2165 }, { "epoch": 0.9854413102820746, "grad_norm": 0.6899304429749638, "learning_rate": 9.072628510718804e-06, "loss": 0.0715, "step": 2166 }, { "epoch": 0.9858962693357598, "grad_norm": 0.7215157855324703, "learning_rate": 9.071799170448409e-06, "loss": 0.0767, "step": 2167 }, { "epoch": 0.986351228389445, "grad_norm": 0.5513970488289187, "learning_rate": 9.070969497448972e-06, "loss": 0.0586, "step": 2168 }, { "epoch": 0.9868061874431301, "grad_norm": 0.5126138943457034, "learning_rate": 9.070139491788295e-06, "loss": 0.0686, "step": 2169 }, { "epoch": 0.9872611464968153, "grad_norm": 0.7021455623884609, "learning_rate": 9.069309153534196e-06, "loss": 0.0853, "step": 2170 }, { "epoch": 0.9877161055505005, "grad_norm": 0.8937932838828458, "learning_rate": 9.068478482754532e-06, "loss": 0.1229, "step": 2171 }, { "epoch": 0.9881710646041856, "grad_norm": 0.7580326063736847, "learning_rate": 9.067647479517179e-06, "loss": 0.1176, "step": 2172 }, { "epoch": 0.9886260236578708, "grad_norm": 0.854693695415459, "learning_rate": 9.066816143890042e-06, "loss": 0.0624, "step": 2173 }, { "epoch": 0.989080982711556, "grad_norm": 0.691622087221906, "learning_rate": 9.065984475941056e-06, "loss": 0.0821, "step": 2174 }, { "epoch": 0.9895359417652412, "grad_norm": 0.5701976798754824, "learning_rate": 9.065152475738182e-06, "loss": 0.0525, "step": 2175 }, { "epoch": 0.9899909008189263, "grad_norm": 0.5280985607821013, "learning_rate": 9.064320143349405e-06, "loss": 0.0532, "step": 2176 }, { "epoch": 0.9904458598726115, "grad_norm": 0.7270073505569681, "learning_rate": 9.063487478842738e-06, "loss": 0.0729, "step": 2177 }, { "epoch": 0.9909008189262967, "grad_norm": 0.5397573476737881, "learning_rate": 9.062654482286228e-06, "loss": 0.0546, "step": 2178 }, { "epoch": 0.9913557779799818, "grad_norm": 0.8280519656078903, "learning_rate": 9.061821153747938e-06, "loss": 0.0794, "step": 2179 }, { "epoch": 0.991810737033667, "grad_norm": 0.6367661759018886, "learning_rate": 9.060987493295967e-06, "loss": 0.0679, "step": 2180 }, { "epoch": 0.9922656960873522, "grad_norm": 0.7859239736098618, "learning_rate": 9.060153500998438e-06, "loss": 0.0958, "step": 2181 }, { "epoch": 0.9927206551410374, "grad_norm": 0.8770748630020422, "learning_rate": 9.0593191769235e-06, "loss": 0.1037, "step": 2182 }, { "epoch": 0.9931756141947224, "grad_norm": 0.5493767625809909, "learning_rate": 9.05848452113933e-06, "loss": 0.0535, "step": 2183 }, { "epoch": 0.9936305732484076, "grad_norm": 1.0509546431486094, "learning_rate": 9.057649533714134e-06, "loss": 0.1136, "step": 2184 }, { "epoch": 0.9940855323020928, "grad_norm": 0.8067366260983323, "learning_rate": 9.056814214716143e-06, "loss": 0.0911, "step": 2185 }, { "epoch": 0.9945404913557779, "grad_norm": 0.6708197750921108, "learning_rate": 9.055978564213614e-06, "loss": 0.0737, "step": 2186 }, { "epoch": 0.9949954504094631, "grad_norm": 1.0620824544949425, "learning_rate": 9.055142582274831e-06, "loss": 0.1035, "step": 2187 }, { "epoch": 0.9954504094631483, "grad_norm": 0.7809645088567875, "learning_rate": 9.054306268968111e-06, "loss": 0.0964, "step": 2188 }, { "epoch": 0.9959053685168335, "grad_norm": 0.6922882332723763, "learning_rate": 9.053469624361793e-06, "loss": 0.0769, "step": 2189 }, { "epoch": 0.9963603275705186, "grad_norm": 0.6135634693459231, "learning_rate": 9.052632648524242e-06, "loss": 0.0857, "step": 2190 }, { "epoch": 0.9968152866242038, "grad_norm": 0.7230383107997012, "learning_rate": 9.051795341523852e-06, "loss": 0.0666, "step": 2191 }, { "epoch": 0.997270245677889, "grad_norm": 0.7702877397526973, "learning_rate": 9.050957703429044e-06, "loss": 0.0861, "step": 2192 }, { "epoch": 0.9977252047315741, "grad_norm": 0.79537510756259, "learning_rate": 9.050119734308266e-06, "loss": 0.0906, "step": 2193 }, { "epoch": 0.9981801637852593, "grad_norm": 0.6318589660625535, "learning_rate": 9.049281434229995e-06, "loss": 0.0821, "step": 2194 }, { "epoch": 0.9986351228389445, "grad_norm": 0.6618836956269952, "learning_rate": 9.048442803262731e-06, "loss": 0.0748, "step": 2195 }, { "epoch": 0.9990900818926297, "grad_norm": 0.5469592163366095, "learning_rate": 9.047603841475003e-06, "loss": 0.066, "step": 2196 }, { "epoch": 0.9995450409463148, "grad_norm": 0.6279887796401853, "learning_rate": 9.046764548935368e-06, "loss": 0.0743, "step": 2197 }, { "epoch": 1.0, "grad_norm": 0.40519899960847033, "learning_rate": 9.045924925712411e-06, "loss": 0.0327, "step": 2198 }, { "epoch": 1.000454959053685, "grad_norm": 0.41468311147935694, "learning_rate": 9.045084971874738e-06, "loss": 0.0243, "step": 2199 }, { "epoch": 1.0009099181073704, "grad_norm": 0.5188055788021196, "learning_rate": 9.04424468749099e-06, "loss": 0.0375, "step": 2200 }, { "epoch": 1.0013648771610555, "grad_norm": 0.4764585088866917, "learning_rate": 9.04340407262983e-06, "loss": 0.0395, "step": 2201 }, { "epoch": 1.0018198362147406, "grad_norm": 0.28928828491344616, "learning_rate": 9.042563127359946e-06, "loss": 0.0208, "step": 2202 }, { "epoch": 1.0022747952684259, "grad_norm": 0.5179468693343099, "learning_rate": 9.041721851750063e-06, "loss": 0.0322, "step": 2203 }, { "epoch": 1.002729754322111, "grad_norm": 0.4198208723720039, "learning_rate": 9.04088024586892e-06, "loss": 0.0366, "step": 2204 }, { "epoch": 1.0031847133757963, "grad_norm": 0.4784473138415427, "learning_rate": 9.040038309785293e-06, "loss": 0.0422, "step": 2205 }, { "epoch": 1.0036396724294814, "grad_norm": 0.576332931747316, "learning_rate": 9.039196043567979e-06, "loss": 0.0387, "step": 2206 }, { "epoch": 1.0040946314831665, "grad_norm": 0.5205582439898824, "learning_rate": 9.038353447285807e-06, "loss": 0.0551, "step": 2207 }, { "epoch": 1.0045495905368518, "grad_norm": 0.7737994932982504, "learning_rate": 9.037510521007626e-06, "loss": 0.042, "step": 2208 }, { "epoch": 1.0050045495905369, "grad_norm": 0.4056433108647087, "learning_rate": 9.03666726480232e-06, "loss": 0.0309, "step": 2209 }, { "epoch": 1.005459508644222, "grad_norm": 0.31259616668647877, "learning_rate": 9.035823678738795e-06, "loss": 0.0247, "step": 2210 }, { "epoch": 1.0059144676979073, "grad_norm": 0.545747512672262, "learning_rate": 9.034979762885985e-06, "loss": 0.0379, "step": 2211 }, { "epoch": 1.0063694267515924, "grad_norm": 0.3531093457798414, "learning_rate": 9.034135517312848e-06, "loss": 0.0198, "step": 2212 }, { "epoch": 1.0068243858052774, "grad_norm": 0.3471778421349368, "learning_rate": 9.033290942088377e-06, "loss": 0.0191, "step": 2213 }, { "epoch": 1.0072793448589628, "grad_norm": 0.45123302926671505, "learning_rate": 9.032446037281582e-06, "loss": 0.0233, "step": 2214 }, { "epoch": 1.0077343039126478, "grad_norm": 0.40498118740009004, "learning_rate": 9.031600802961508e-06, "loss": 0.028, "step": 2215 }, { "epoch": 1.008189262966333, "grad_norm": 0.44404852807953515, "learning_rate": 9.030755239197224e-06, "loss": 0.0343, "step": 2216 }, { "epoch": 1.0086442220200182, "grad_norm": 0.41886201143517243, "learning_rate": 9.029909346057826e-06, "loss": 0.0276, "step": 2217 }, { "epoch": 1.0090991810737033, "grad_norm": 0.2879285343911946, "learning_rate": 9.029063123612431e-06, "loss": 0.02, "step": 2218 }, { "epoch": 1.0095541401273886, "grad_norm": 0.5781677724909076, "learning_rate": 9.028216571930197e-06, "loss": 0.0339, "step": 2219 }, { "epoch": 1.0100090991810737, "grad_norm": 0.42128445628125777, "learning_rate": 9.027369691080292e-06, "loss": 0.0329, "step": 2220 }, { "epoch": 1.0104640582347588, "grad_norm": 0.4867304814601137, "learning_rate": 9.026522481131925e-06, "loss": 0.0451, "step": 2221 }, { "epoch": 1.0109190172884441, "grad_norm": 0.35647532367363194, "learning_rate": 9.025674942154325e-06, "loss": 0.0202, "step": 2222 }, { "epoch": 1.0113739763421292, "grad_norm": 0.6154778638320356, "learning_rate": 9.024827074216748e-06, "loss": 0.0619, "step": 2223 }, { "epoch": 1.0118289353958143, "grad_norm": 0.46447780693049373, "learning_rate": 9.023978877388479e-06, "loss": 0.0265, "step": 2224 }, { "epoch": 1.0122838944494996, "grad_norm": 0.4551756875246183, "learning_rate": 9.02313035173883e-06, "loss": 0.0167, "step": 2225 }, { "epoch": 1.0127388535031847, "grad_norm": 0.4341660568896861, "learning_rate": 9.022281497337133e-06, "loss": 0.0257, "step": 2226 }, { "epoch": 1.0131938125568698, "grad_norm": 0.37807969634776667, "learning_rate": 9.021432314252758e-06, "loss": 0.0235, "step": 2227 }, { "epoch": 1.013648771610555, "grad_norm": 0.43791115876653813, "learning_rate": 9.020582802555095e-06, "loss": 0.0285, "step": 2228 }, { "epoch": 1.0141037306642402, "grad_norm": 0.7541669794368306, "learning_rate": 9.019732962313562e-06, "loss": 0.0412, "step": 2229 }, { "epoch": 1.0145586897179253, "grad_norm": 0.41591203424935613, "learning_rate": 9.018882793597605e-06, "loss": 0.0217, "step": 2230 }, { "epoch": 1.0150136487716106, "grad_norm": 0.531675738557164, "learning_rate": 9.018032296476695e-06, "loss": 0.0259, "step": 2231 }, { "epoch": 1.0154686078252957, "grad_norm": 0.4525534861298487, "learning_rate": 9.017181471020331e-06, "loss": 0.032, "step": 2232 }, { "epoch": 1.015923566878981, "grad_norm": 0.5572932855598556, "learning_rate": 9.016330317298038e-06, "loss": 0.0321, "step": 2233 }, { "epoch": 1.016378525932666, "grad_norm": 0.4880772464783955, "learning_rate": 9.01547883537937e-06, "loss": 0.0242, "step": 2234 }, { "epoch": 1.0168334849863512, "grad_norm": 0.5290436879010799, "learning_rate": 9.014627025333906e-06, "loss": 0.0268, "step": 2235 }, { "epoch": 1.0172884440400365, "grad_norm": 0.3469524553449946, "learning_rate": 9.01377488723125e-06, "loss": 0.0189, "step": 2236 }, { "epoch": 1.0177434030937216, "grad_norm": 0.5381328202645719, "learning_rate": 9.012922421141036e-06, "loss": 0.0282, "step": 2237 }, { "epoch": 1.0181983621474067, "grad_norm": 0.5437416204093511, "learning_rate": 9.012069627132925e-06, "loss": 0.0365, "step": 2238 }, { "epoch": 1.018653321201092, "grad_norm": 0.5151432843211493, "learning_rate": 9.011216505276601e-06, "loss": 0.0327, "step": 2239 }, { "epoch": 1.019108280254777, "grad_norm": 0.7194165832171175, "learning_rate": 9.01036305564178e-06, "loss": 0.0447, "step": 2240 }, { "epoch": 1.0195632393084622, "grad_norm": 0.4895196525190099, "learning_rate": 9.009509278298201e-06, "loss": 0.0226, "step": 2241 }, { "epoch": 1.0200181983621475, "grad_norm": 0.36403402277658775, "learning_rate": 9.008655173315629e-06, "loss": 0.0172, "step": 2242 }, { "epoch": 1.0204731574158326, "grad_norm": 0.5192307375895406, "learning_rate": 9.00780074076386e-06, "loss": 0.0281, "step": 2243 }, { "epoch": 1.0209281164695176, "grad_norm": 0.5855074570295021, "learning_rate": 9.006945980712713e-06, "loss": 0.039, "step": 2244 }, { "epoch": 1.021383075523203, "grad_norm": 0.3530576777441414, "learning_rate": 9.006090893232036e-06, "loss": 0.0165, "step": 2245 }, { "epoch": 1.021838034576888, "grad_norm": 0.46560015374930225, "learning_rate": 9.005235478391704e-06, "loss": 0.031, "step": 2246 }, { "epoch": 1.0222929936305734, "grad_norm": 0.4320906337363968, "learning_rate": 9.004379736261614e-06, "loss": 0.0229, "step": 2247 }, { "epoch": 1.0227479526842584, "grad_norm": 0.5843690219708401, "learning_rate": 9.003523666911698e-06, "loss": 0.0398, "step": 2248 }, { "epoch": 1.0232029117379435, "grad_norm": 0.4876049343109499, "learning_rate": 9.002667270411905e-06, "loss": 0.0209, "step": 2249 }, { "epoch": 1.0236578707916288, "grad_norm": 0.4996309287294051, "learning_rate": 9.001810546832219e-06, "loss": 0.0339, "step": 2250 }, { "epoch": 1.024112829845314, "grad_norm": 0.44615485337683974, "learning_rate": 9.000953496242648e-06, "loss": 0.0367, "step": 2251 }, { "epoch": 1.024567788898999, "grad_norm": 0.4816248261028461, "learning_rate": 9.000096118713226e-06, "loss": 0.0302, "step": 2252 }, { "epoch": 1.0250227479526843, "grad_norm": 0.3202895454501902, "learning_rate": 8.999238414314014e-06, "loss": 0.018, "step": 2253 }, { "epoch": 1.0254777070063694, "grad_norm": 0.39394390771447657, "learning_rate": 8.998380383115098e-06, "loss": 0.0203, "step": 2254 }, { "epoch": 1.0259326660600545, "grad_norm": 0.6774965098079401, "learning_rate": 8.997522025186592e-06, "loss": 0.0444, "step": 2255 }, { "epoch": 1.0263876251137398, "grad_norm": 0.6156285698131154, "learning_rate": 8.996663340598642e-06, "loss": 0.033, "step": 2256 }, { "epoch": 1.026842584167425, "grad_norm": 0.6636465470342775, "learning_rate": 8.995804329421408e-06, "loss": 0.0282, "step": 2257 }, { "epoch": 1.02729754322111, "grad_norm": 0.7643329557559453, "learning_rate": 8.994944991725094e-06, "loss": 0.0413, "step": 2258 }, { "epoch": 1.0277525022747953, "grad_norm": 0.4484887858566329, "learning_rate": 8.994085327579914e-06, "loss": 0.0244, "step": 2259 }, { "epoch": 1.0282074613284804, "grad_norm": 0.6046158805682427, "learning_rate": 8.993225337056118e-06, "loss": 0.0372, "step": 2260 }, { "epoch": 1.0286624203821657, "grad_norm": 0.5297868937946675, "learning_rate": 8.992365020223982e-06, "loss": 0.0407, "step": 2261 }, { "epoch": 1.0291173794358508, "grad_norm": 0.4805793953554321, "learning_rate": 8.991504377153805e-06, "loss": 0.0297, "step": 2262 }, { "epoch": 1.0295723384895359, "grad_norm": 0.6196673347815759, "learning_rate": 8.990643407915915e-06, "loss": 0.0397, "step": 2263 }, { "epoch": 1.0300272975432212, "grad_norm": 0.6223272220447811, "learning_rate": 8.98978211258067e-06, "loss": 0.0409, "step": 2264 }, { "epoch": 1.0304822565969063, "grad_norm": 0.49952273986223505, "learning_rate": 8.988920491218446e-06, "loss": 0.0272, "step": 2265 }, { "epoch": 1.0309372156505914, "grad_norm": 0.6292771186739616, "learning_rate": 8.988058543899654e-06, "loss": 0.0384, "step": 2266 }, { "epoch": 1.0313921747042767, "grad_norm": 0.38772458827936923, "learning_rate": 8.987196270694727e-06, "loss": 0.024, "step": 2267 }, { "epoch": 1.0318471337579618, "grad_norm": 0.8799833129039605, "learning_rate": 8.986333671674128e-06, "loss": 0.0341, "step": 2268 }, { "epoch": 1.0323020928116469, "grad_norm": 0.6271731268799836, "learning_rate": 8.985470746908342e-06, "loss": 0.033, "step": 2269 }, { "epoch": 1.0327570518653322, "grad_norm": 0.38786047905872434, "learning_rate": 8.984607496467885e-06, "loss": 0.021, "step": 2270 }, { "epoch": 1.0332120109190173, "grad_norm": 0.6280644096851069, "learning_rate": 8.9837439204233e-06, "loss": 0.0491, "step": 2271 }, { "epoch": 1.0336669699727024, "grad_norm": 0.5847841334225715, "learning_rate": 8.98288001884515e-06, "loss": 0.0337, "step": 2272 }, { "epoch": 1.0341219290263877, "grad_norm": 0.36088677101245703, "learning_rate": 8.982015791804032e-06, "loss": 0.0156, "step": 2273 }, { "epoch": 1.0345768880800728, "grad_norm": 0.4537884974426005, "learning_rate": 8.981151239370566e-06, "loss": 0.027, "step": 2274 }, { "epoch": 1.035031847133758, "grad_norm": 0.6090066061447076, "learning_rate": 8.9802863616154e-06, "loss": 0.0378, "step": 2275 }, { "epoch": 1.0354868061874432, "grad_norm": 0.7101749544755233, "learning_rate": 8.979421158609206e-06, "loss": 0.0439, "step": 2276 }, { "epoch": 1.0359417652411282, "grad_norm": 0.5742339125588956, "learning_rate": 8.978555630422686e-06, "loss": 0.0328, "step": 2277 }, { "epoch": 1.0363967242948136, "grad_norm": 0.632873074474985, "learning_rate": 8.977689777126568e-06, "loss": 0.0472, "step": 2278 }, { "epoch": 1.0368516833484986, "grad_norm": 0.8069979527700195, "learning_rate": 8.976823598791604e-06, "loss": 0.0319, "step": 2279 }, { "epoch": 1.0373066424021837, "grad_norm": 0.4015240288673539, "learning_rate": 8.975957095488575e-06, "loss": 0.0269, "step": 2280 }, { "epoch": 1.037761601455869, "grad_norm": 0.5786381841993868, "learning_rate": 8.975090267288286e-06, "loss": 0.0296, "step": 2281 }, { "epoch": 1.0382165605095541, "grad_norm": 0.5451914455456522, "learning_rate": 8.974223114261574e-06, "loss": 0.0343, "step": 2282 }, { "epoch": 1.0386715195632392, "grad_norm": 0.6945170105788371, "learning_rate": 8.973355636479294e-06, "loss": 0.0476, "step": 2283 }, { "epoch": 1.0391264786169245, "grad_norm": 0.5171663408691534, "learning_rate": 8.972487834012338e-06, "loss": 0.0301, "step": 2284 }, { "epoch": 1.0395814376706096, "grad_norm": 0.494166229450044, "learning_rate": 8.971619706931613e-06, "loss": 0.0226, "step": 2285 }, { "epoch": 1.0400363967242947, "grad_norm": 0.7676778552323048, "learning_rate": 8.970751255308063e-06, "loss": 0.045, "step": 2286 }, { "epoch": 1.04049135577798, "grad_norm": 0.44323443611073776, "learning_rate": 8.969882479212652e-06, "loss": 0.0196, "step": 2287 }, { "epoch": 1.040946314831665, "grad_norm": 0.41146000373164554, "learning_rate": 8.969013378716371e-06, "loss": 0.0196, "step": 2288 }, { "epoch": 1.0414012738853504, "grad_norm": 0.3888711487160539, "learning_rate": 8.968143953890242e-06, "loss": 0.0228, "step": 2289 }, { "epoch": 1.0418562329390355, "grad_norm": 0.49379959221935377, "learning_rate": 8.96727420480531e-06, "loss": 0.0306, "step": 2290 }, { "epoch": 1.0423111919927206, "grad_norm": 0.48325360654642197, "learning_rate": 8.966404131532645e-06, "loss": 0.0265, "step": 2291 }, { "epoch": 1.042766151046406, "grad_norm": 0.47493208719115093, "learning_rate": 8.965533734143347e-06, "loss": 0.0239, "step": 2292 }, { "epoch": 1.043221110100091, "grad_norm": 0.556271091368108, "learning_rate": 8.964663012708538e-06, "loss": 0.0365, "step": 2293 }, { "epoch": 1.043676069153776, "grad_norm": 0.8512257992210553, "learning_rate": 8.963791967299375e-06, "loss": 0.0332, "step": 2294 }, { "epoch": 1.0441310282074614, "grad_norm": 0.4600946915818348, "learning_rate": 8.96292059798703e-06, "loss": 0.0254, "step": 2295 }, { "epoch": 1.0445859872611465, "grad_norm": 0.5926927797370501, "learning_rate": 8.962048904842713e-06, "loss": 0.034, "step": 2296 }, { "epoch": 1.0450409463148316, "grad_norm": 0.5174352508068348, "learning_rate": 8.96117688793765e-06, "loss": 0.0334, "step": 2297 }, { "epoch": 1.0454959053685169, "grad_norm": 0.4726564762945724, "learning_rate": 8.960304547343101e-06, "loss": 0.0271, "step": 2298 }, { "epoch": 1.045950864422202, "grad_norm": 0.49021838747059965, "learning_rate": 8.959431883130348e-06, "loss": 0.0272, "step": 2299 }, { "epoch": 1.046405823475887, "grad_norm": 0.33392762330146264, "learning_rate": 8.958558895370703e-06, "loss": 0.0184, "step": 2300 }, { "epoch": 1.0468607825295724, "grad_norm": 0.43970090494512293, "learning_rate": 8.9576855841355e-06, "loss": 0.0247, "step": 2301 }, { "epoch": 1.0473157415832575, "grad_norm": 0.34961568768416074, "learning_rate": 8.956811949496108e-06, "loss": 0.0207, "step": 2302 }, { "epoch": 1.0477707006369428, "grad_norm": 0.5047819086466443, "learning_rate": 8.955937991523908e-06, "loss": 0.0358, "step": 2303 }, { "epoch": 1.0482256596906279, "grad_norm": 0.5502957295717672, "learning_rate": 8.955063710290322e-06, "loss": 0.0396, "step": 2304 }, { "epoch": 1.048680618744313, "grad_norm": 0.4007555279082937, "learning_rate": 8.95418910586679e-06, "loss": 0.0205, "step": 2305 }, { "epoch": 1.0491355777979983, "grad_norm": 0.37932885662916804, "learning_rate": 8.953314178324782e-06, "loss": 0.0261, "step": 2306 }, { "epoch": 1.0495905368516834, "grad_norm": 0.6331059696275105, "learning_rate": 8.952438927735793e-06, "loss": 0.0397, "step": 2307 }, { "epoch": 1.0500454959053684, "grad_norm": 0.5533999405103901, "learning_rate": 8.951563354171343e-06, "loss": 0.0216, "step": 2308 }, { "epoch": 1.0505004549590538, "grad_norm": 0.5064049753801714, "learning_rate": 8.950687457702981e-06, "loss": 0.0253, "step": 2309 }, { "epoch": 1.0509554140127388, "grad_norm": 0.7762514931128638, "learning_rate": 8.94981123840228e-06, "loss": 0.0257, "step": 2310 }, { "epoch": 1.051410373066424, "grad_norm": 0.5258772784610919, "learning_rate": 8.948934696340842e-06, "loss": 0.0402, "step": 2311 }, { "epoch": 1.0518653321201092, "grad_norm": 0.5179164003875761, "learning_rate": 8.948057831590296e-06, "loss": 0.0392, "step": 2312 }, { "epoch": 1.0523202911737943, "grad_norm": 0.4873683404674824, "learning_rate": 8.94718064422229e-06, "loss": 0.0225, "step": 2313 }, { "epoch": 1.0527752502274794, "grad_norm": 0.42294954664238593, "learning_rate": 8.94630313430851e-06, "loss": 0.0239, "step": 2314 }, { "epoch": 1.0532302092811647, "grad_norm": 0.5120965619588207, "learning_rate": 8.945425301920656e-06, "loss": 0.0239, "step": 2315 }, { "epoch": 1.0536851683348498, "grad_norm": 0.5274581767565953, "learning_rate": 8.944547147130467e-06, "loss": 0.0395, "step": 2316 }, { "epoch": 1.0541401273885351, "grad_norm": 0.6240914390797723, "learning_rate": 8.943668670009698e-06, "loss": 0.04, "step": 2317 }, { "epoch": 1.0545950864422202, "grad_norm": 0.588480807715609, "learning_rate": 8.942789870630133e-06, "loss": 0.0379, "step": 2318 }, { "epoch": 1.0550500454959053, "grad_norm": 0.5328051168509789, "learning_rate": 8.941910749063587e-06, "loss": 0.0256, "step": 2319 }, { "epoch": 1.0555050045495906, "grad_norm": 0.5662136884367794, "learning_rate": 8.941031305381894e-06, "loss": 0.0349, "step": 2320 }, { "epoch": 1.0559599636032757, "grad_norm": 0.4080289916939306, "learning_rate": 8.940151539656922e-06, "loss": 0.0203, "step": 2321 }, { "epoch": 1.0564149226569608, "grad_norm": 0.6644738842779135, "learning_rate": 8.93927145196056e-06, "loss": 0.0295, "step": 2322 }, { "epoch": 1.056869881710646, "grad_norm": 0.43989425636246393, "learning_rate": 8.938391042364723e-06, "loss": 0.0257, "step": 2323 }, { "epoch": 1.0573248407643312, "grad_norm": 0.5431541428763835, "learning_rate": 8.937510310941358e-06, "loss": 0.03, "step": 2324 }, { "epoch": 1.0577797998180163, "grad_norm": 0.5122724279785533, "learning_rate": 8.936629257762429e-06, "loss": 0.0273, "step": 2325 }, { "epoch": 1.0582347588717016, "grad_norm": 0.41195858239961775, "learning_rate": 8.935747882899937e-06, "loss": 0.0216, "step": 2326 }, { "epoch": 1.0586897179253867, "grad_norm": 0.5171757707727286, "learning_rate": 8.9348661864259e-06, "loss": 0.0299, "step": 2327 }, { "epoch": 1.0591446769790718, "grad_norm": 0.6216382380161013, "learning_rate": 8.93398416841237e-06, "loss": 0.0525, "step": 2328 }, { "epoch": 1.059599636032757, "grad_norm": 0.47615445722593264, "learning_rate": 8.933101828931418e-06, "loss": 0.0229, "step": 2329 }, { "epoch": 1.0600545950864422, "grad_norm": 0.5543921495737715, "learning_rate": 8.932219168055146e-06, "loss": 0.0353, "step": 2330 }, { "epoch": 1.0605095541401275, "grad_norm": 0.4807073495602966, "learning_rate": 8.931336185855682e-06, "loss": 0.029, "step": 2331 }, { "epoch": 1.0609645131938126, "grad_norm": 0.7132043951444881, "learning_rate": 8.930452882405178e-06, "loss": 0.0573, "step": 2332 }, { "epoch": 1.0614194722474977, "grad_norm": 0.7323908092635573, "learning_rate": 8.929569257775816e-06, "loss": 0.031, "step": 2333 }, { "epoch": 1.061874431301183, "grad_norm": 0.7282498524373471, "learning_rate": 8.9286853120398e-06, "loss": 0.0212, "step": 2334 }, { "epoch": 1.062329390354868, "grad_norm": 0.5041730540211715, "learning_rate": 8.92780104526936e-06, "loss": 0.0219, "step": 2335 }, { "epoch": 1.0627843494085532, "grad_norm": 0.5694707546108049, "learning_rate": 8.926916457536755e-06, "loss": 0.0277, "step": 2336 }, { "epoch": 1.0632393084622385, "grad_norm": 0.4942987205465501, "learning_rate": 8.926031548914274e-06, "loss": 0.0283, "step": 2337 }, { "epoch": 1.0636942675159236, "grad_norm": 0.7094719472889628, "learning_rate": 8.925146319474225e-06, "loss": 0.0484, "step": 2338 }, { "epoch": 1.0641492265696086, "grad_norm": 0.5401572696577567, "learning_rate": 8.924260769288944e-06, "loss": 0.032, "step": 2339 }, { "epoch": 1.064604185623294, "grad_norm": 0.6271229371930636, "learning_rate": 8.923374898430794e-06, "loss": 0.0417, "step": 2340 }, { "epoch": 1.065059144676979, "grad_norm": 0.5384710947557135, "learning_rate": 8.922488706972165e-06, "loss": 0.028, "step": 2341 }, { "epoch": 1.0655141037306644, "grad_norm": 0.5738095562796759, "learning_rate": 8.921602194985473e-06, "loss": 0.0251, "step": 2342 }, { "epoch": 1.0659690627843494, "grad_norm": 0.4114388383836, "learning_rate": 8.920715362543158e-06, "loss": 0.0257, "step": 2343 }, { "epoch": 1.0664240218380345, "grad_norm": 0.4407026853756295, "learning_rate": 8.919828209717691e-06, "loss": 0.0318, "step": 2344 }, { "epoch": 1.0668789808917198, "grad_norm": 0.5795706484311789, "learning_rate": 8.918940736581565e-06, "loss": 0.0384, "step": 2345 }, { "epoch": 1.067333939945405, "grad_norm": 0.4997138165488597, "learning_rate": 8.918052943207298e-06, "loss": 0.0339, "step": 2346 }, { "epoch": 1.06778889899909, "grad_norm": 0.6466785074736559, "learning_rate": 8.91716482966744e-06, "loss": 0.0412, "step": 2347 }, { "epoch": 1.0682438580527753, "grad_norm": 0.6101860514996267, "learning_rate": 8.916276396034561e-06, "loss": 0.0349, "step": 2348 }, { "epoch": 1.0686988171064604, "grad_norm": 0.6648890763063255, "learning_rate": 8.915387642381261e-06, "loss": 0.0374, "step": 2349 }, { "epoch": 1.0691537761601455, "grad_norm": 0.6435783427790035, "learning_rate": 8.914498568780163e-06, "loss": 0.0425, "step": 2350 }, { "epoch": 1.0696087352138308, "grad_norm": 0.4168529921191238, "learning_rate": 8.913609175303923e-06, "loss": 0.0222, "step": 2351 }, { "epoch": 1.070063694267516, "grad_norm": 0.370333742802149, "learning_rate": 8.912719462025213e-06, "loss": 0.018, "step": 2352 }, { "epoch": 1.070518653321201, "grad_norm": 0.3929772094003772, "learning_rate": 8.911829429016737e-06, "loss": 0.0184, "step": 2353 }, { "epoch": 1.0709736123748863, "grad_norm": 0.36777976145335695, "learning_rate": 8.910939076351228e-06, "loss": 0.0199, "step": 2354 }, { "epoch": 1.0714285714285714, "grad_norm": 0.5445905742319043, "learning_rate": 8.910048404101437e-06, "loss": 0.0297, "step": 2355 }, { "epoch": 1.0718835304822565, "grad_norm": 0.517651494476337, "learning_rate": 8.90915741234015e-06, "loss": 0.0244, "step": 2356 }, { "epoch": 1.0723384895359418, "grad_norm": 0.6079868190664829, "learning_rate": 8.908266101140173e-06, "loss": 0.0327, "step": 2357 }, { "epoch": 1.0727934485896269, "grad_norm": 0.5005614750938115, "learning_rate": 8.907374470574339e-06, "loss": 0.0288, "step": 2358 }, { "epoch": 1.0732484076433122, "grad_norm": 0.41084278869296126, "learning_rate": 8.906482520715508e-06, "loss": 0.0196, "step": 2359 }, { "epoch": 1.0737033666969973, "grad_norm": 0.42883961230062595, "learning_rate": 8.905590251636566e-06, "loss": 0.0201, "step": 2360 }, { "epoch": 1.0741583257506824, "grad_norm": 0.7507509176249603, "learning_rate": 8.904697663410429e-06, "loss": 0.0519, "step": 2361 }, { "epoch": 1.0746132848043677, "grad_norm": 0.35684834441788627, "learning_rate": 8.90380475611003e-06, "loss": 0.0193, "step": 2362 }, { "epoch": 1.0750682438580528, "grad_norm": 0.359991301638448, "learning_rate": 8.902911529808338e-06, "loss": 0.02, "step": 2363 }, { "epoch": 1.0755232029117379, "grad_norm": 0.6485293447004715, "learning_rate": 8.90201798457834e-06, "loss": 0.05, "step": 2364 }, { "epoch": 1.0759781619654232, "grad_norm": 0.35596882973823685, "learning_rate": 8.901124120493055e-06, "loss": 0.0201, "step": 2365 }, { "epoch": 1.0764331210191083, "grad_norm": 0.5195485453283638, "learning_rate": 8.900229937625522e-06, "loss": 0.0267, "step": 2366 }, { "epoch": 1.0768880800727934, "grad_norm": 0.5121436407601963, "learning_rate": 8.899335436048813e-06, "loss": 0.0293, "step": 2367 }, { "epoch": 1.0773430391264787, "grad_norm": 0.574083355691705, "learning_rate": 8.898440615836021e-06, "loss": 0.0314, "step": 2368 }, { "epoch": 1.0777979981801638, "grad_norm": 0.36323016195490376, "learning_rate": 8.897545477060268e-06, "loss": 0.0164, "step": 2369 }, { "epoch": 1.078252957233849, "grad_norm": 0.44874033315946665, "learning_rate": 8.8966500197947e-06, "loss": 0.0255, "step": 2370 }, { "epoch": 1.0787079162875342, "grad_norm": 0.4549169634711705, "learning_rate": 8.895754244112486e-06, "loss": 0.0252, "step": 2371 }, { "epoch": 1.0791628753412192, "grad_norm": 0.5188300138751303, "learning_rate": 8.894858150086832e-06, "loss": 0.022, "step": 2372 }, { "epoch": 1.0796178343949046, "grad_norm": 0.5077854205250166, "learning_rate": 8.893961737790957e-06, "loss": 0.027, "step": 2373 }, { "epoch": 1.0800727934485896, "grad_norm": 0.5080695970336, "learning_rate": 8.893065007298116e-06, "loss": 0.0293, "step": 2374 }, { "epoch": 1.0805277525022747, "grad_norm": 0.49124016807194615, "learning_rate": 8.89216795868158e-06, "loss": 0.0253, "step": 2375 }, { "epoch": 1.08098271155596, "grad_norm": 0.746420330430573, "learning_rate": 8.891270592014658e-06, "loss": 0.0393, "step": 2376 }, { "epoch": 1.0814376706096451, "grad_norm": 0.5899621371906842, "learning_rate": 8.890372907370677e-06, "loss": 0.0325, "step": 2377 }, { "epoch": 1.0818926296633302, "grad_norm": 0.538668781912988, "learning_rate": 8.889474904822987e-06, "loss": 0.0254, "step": 2378 }, { "epoch": 1.0823475887170155, "grad_norm": 0.48796027217616167, "learning_rate": 8.888576584444976e-06, "loss": 0.0284, "step": 2379 }, { "epoch": 1.0828025477707006, "grad_norm": 0.4607384499708701, "learning_rate": 8.887677946310045e-06, "loss": 0.0293, "step": 2380 }, { "epoch": 1.0832575068243857, "grad_norm": 0.6691227522534325, "learning_rate": 8.886778990491632e-06, "loss": 0.0479, "step": 2381 }, { "epoch": 1.083712465878071, "grad_norm": 0.4131339751828579, "learning_rate": 8.885879717063189e-06, "loss": 0.0232, "step": 2382 }, { "epoch": 1.084167424931756, "grad_norm": 0.49834287436563, "learning_rate": 8.884980126098206e-06, "loss": 0.0261, "step": 2383 }, { "epoch": 1.0846223839854412, "grad_norm": 0.49133678192638947, "learning_rate": 8.88408021767019e-06, "loss": 0.0217, "step": 2384 }, { "epoch": 1.0850773430391265, "grad_norm": 0.4897177991752284, "learning_rate": 8.88317999185268e-06, "loss": 0.0304, "step": 2385 }, { "epoch": 1.0855323020928116, "grad_norm": 0.5332982190122252, "learning_rate": 8.882279448719235e-06, "loss": 0.024, "step": 2386 }, { "epoch": 1.085987261146497, "grad_norm": 0.39337001966991797, "learning_rate": 8.881378588343448e-06, "loss": 0.0195, "step": 2387 }, { "epoch": 1.086442220200182, "grad_norm": 0.5648723431118464, "learning_rate": 8.88047741079893e-06, "loss": 0.0277, "step": 2388 }, { "epoch": 1.086897179253867, "grad_norm": 0.38358401084782046, "learning_rate": 8.879575916159323e-06, "loss": 0.0234, "step": 2389 }, { "epoch": 1.0873521383075524, "grad_norm": 0.4916039064871815, "learning_rate": 8.878674104498293e-06, "loss": 0.0196, "step": 2390 }, { "epoch": 1.0878070973612375, "grad_norm": 0.4574406020630443, "learning_rate": 8.877771975889529e-06, "loss": 0.0266, "step": 2391 }, { "epoch": 1.0882620564149226, "grad_norm": 1.2527103886930033, "learning_rate": 8.876869530406753e-06, "loss": 0.085, "step": 2392 }, { "epoch": 1.0887170154686079, "grad_norm": 0.6740099441800771, "learning_rate": 8.875966768123705e-06, "loss": 0.0491, "step": 2393 }, { "epoch": 1.089171974522293, "grad_norm": 0.8127319301316774, "learning_rate": 8.875063689114157e-06, "loss": 0.0351, "step": 2394 }, { "epoch": 1.089626933575978, "grad_norm": 0.6883882884250196, "learning_rate": 8.874160293451903e-06, "loss": 0.0351, "step": 2395 }, { "epoch": 1.0900818926296634, "grad_norm": 0.472050537765526, "learning_rate": 8.873256581210767e-06, "loss": 0.0281, "step": 2396 }, { "epoch": 1.0905368516833485, "grad_norm": 0.43429585005126187, "learning_rate": 8.872352552464594e-06, "loss": 0.0217, "step": 2397 }, { "epoch": 1.0909918107370338, "grad_norm": 0.7559591015285818, "learning_rate": 8.871448207287259e-06, "loss": 0.0234, "step": 2398 }, { "epoch": 1.0914467697907189, "grad_norm": 1.295843093263791, "learning_rate": 8.870543545752657e-06, "loss": 0.0378, "step": 2399 }, { "epoch": 1.091901728844404, "grad_norm": 0.687703240327456, "learning_rate": 8.869638567934718e-06, "loss": 0.0428, "step": 2400 }, { "epoch": 1.0923566878980893, "grad_norm": 0.5316380088515792, "learning_rate": 8.86873327390739e-06, "loss": 0.0207, "step": 2401 }, { "epoch": 1.0928116469517744, "grad_norm": 0.37080940024955544, "learning_rate": 8.867827663744649e-06, "loss": 0.014, "step": 2402 }, { "epoch": 1.0932666060054594, "grad_norm": 0.551372034105751, "learning_rate": 8.8669217375205e-06, "loss": 0.0407, "step": 2403 }, { "epoch": 1.0937215650591448, "grad_norm": 0.550827427093742, "learning_rate": 8.866015495308967e-06, "loss": 0.0295, "step": 2404 }, { "epoch": 1.0941765241128298, "grad_norm": 0.5312346261037174, "learning_rate": 8.865108937184108e-06, "loss": 0.0329, "step": 2405 }, { "epoch": 1.094631483166515, "grad_norm": 0.606116027973049, "learning_rate": 8.864202063220003e-06, "loss": 0.036, "step": 2406 }, { "epoch": 1.0950864422202002, "grad_norm": 0.5039409256044083, "learning_rate": 8.863294873490752e-06, "loss": 0.0237, "step": 2407 }, { "epoch": 1.0955414012738853, "grad_norm": 0.7088932326845141, "learning_rate": 8.862387368070493e-06, "loss": 0.0502, "step": 2408 }, { "epoch": 1.0959963603275704, "grad_norm": 0.42457685669799344, "learning_rate": 8.86147954703338e-06, "loss": 0.0232, "step": 2409 }, { "epoch": 1.0964513193812557, "grad_norm": 0.400049727629285, "learning_rate": 8.860571410453598e-06, "loss": 0.0137, "step": 2410 }, { "epoch": 1.0969062784349408, "grad_norm": 0.5528326412344238, "learning_rate": 8.859662958405352e-06, "loss": 0.0259, "step": 2411 }, { "epoch": 1.097361237488626, "grad_norm": 0.3740020218354164, "learning_rate": 8.858754190962881e-06, "loss": 0.0207, "step": 2412 }, { "epoch": 1.0978161965423112, "grad_norm": 0.43380267454252947, "learning_rate": 8.857845108200443e-06, "loss": 0.03, "step": 2413 }, { "epoch": 1.0982711555959963, "grad_norm": 0.41117776188244837, "learning_rate": 8.856935710192326e-06, "loss": 0.0217, "step": 2414 }, { "epoch": 1.0987261146496816, "grad_norm": 0.7295481072089418, "learning_rate": 8.856025997012837e-06, "loss": 0.0355, "step": 2415 }, { "epoch": 1.0991810737033667, "grad_norm": 0.6100308273835641, "learning_rate": 8.85511596873632e-06, "loss": 0.0369, "step": 2416 }, { "epoch": 1.0996360327570518, "grad_norm": 0.41413261117443184, "learning_rate": 8.854205625437135e-06, "loss": 0.0198, "step": 2417 }, { "epoch": 1.100090991810737, "grad_norm": 0.45865368499615844, "learning_rate": 8.853294967189672e-06, "loss": 0.0274, "step": 2418 }, { "epoch": 1.1005459508644222, "grad_norm": 0.49503724640291885, "learning_rate": 8.852383994068345e-06, "loss": 0.039, "step": 2419 }, { "epoch": 1.1010009099181073, "grad_norm": 0.3278139965958097, "learning_rate": 8.851472706147595e-06, "loss": 0.02, "step": 2420 }, { "epoch": 1.1014558689717926, "grad_norm": 0.7072991481662654, "learning_rate": 8.85056110350189e-06, "loss": 0.0478, "step": 2421 }, { "epoch": 1.1019108280254777, "grad_norm": 0.3754428113606483, "learning_rate": 8.84964918620572e-06, "loss": 0.0204, "step": 2422 }, { "epoch": 1.1023657870791628, "grad_norm": 0.7096758634544409, "learning_rate": 8.848736954333603e-06, "loss": 0.0335, "step": 2423 }, { "epoch": 1.102820746132848, "grad_norm": 0.5727995354405594, "learning_rate": 8.847824407960083e-06, "loss": 0.0323, "step": 2424 }, { "epoch": 1.1032757051865332, "grad_norm": 0.6229568548114003, "learning_rate": 8.84691154715973e-06, "loss": 0.0309, "step": 2425 }, { "epoch": 1.1037306642402185, "grad_norm": 0.5010513455704715, "learning_rate": 8.845998372007136e-06, "loss": 0.0286, "step": 2426 }, { "epoch": 1.1041856232939036, "grad_norm": 0.34862957832393143, "learning_rate": 8.845084882576924e-06, "loss": 0.0165, "step": 2427 }, { "epoch": 1.1046405823475887, "grad_norm": 0.5610710585811625, "learning_rate": 8.84417107894374e-06, "loss": 0.0381, "step": 2428 }, { "epoch": 1.105095541401274, "grad_norm": 0.3998367702132408, "learning_rate": 8.843256961182255e-06, "loss": 0.0186, "step": 2429 }, { "epoch": 1.105550500454959, "grad_norm": 0.6787215229828617, "learning_rate": 8.842342529367167e-06, "loss": 0.0487, "step": 2430 }, { "epoch": 1.1060054595086442, "grad_norm": 0.6483563929183911, "learning_rate": 8.8414277835732e-06, "loss": 0.0409, "step": 2431 }, { "epoch": 1.1064604185623295, "grad_norm": 0.6351823340870137, "learning_rate": 8.840512723875103e-06, "loss": 0.0497, "step": 2432 }, { "epoch": 1.1069153776160146, "grad_norm": 0.3467791981865341, "learning_rate": 8.839597350347648e-06, "loss": 0.0172, "step": 2433 }, { "epoch": 1.1073703366696996, "grad_norm": 0.4877926867999841, "learning_rate": 8.838681663065638e-06, "loss": 0.0268, "step": 2434 }, { "epoch": 1.107825295723385, "grad_norm": 0.561052741145843, "learning_rate": 8.837765662103898e-06, "loss": 0.0351, "step": 2435 }, { "epoch": 1.10828025477707, "grad_norm": 0.5339886977527083, "learning_rate": 8.836849347537278e-06, "loss": 0.0286, "step": 2436 }, { "epoch": 1.1087352138307551, "grad_norm": 0.41940315295115715, "learning_rate": 8.835932719440658e-06, "loss": 0.016, "step": 2437 }, { "epoch": 1.1091901728844404, "grad_norm": 0.500811248377599, "learning_rate": 8.835015777888938e-06, "loss": 0.0277, "step": 2438 }, { "epoch": 1.1096451319381255, "grad_norm": 0.6905252242552301, "learning_rate": 8.83409852295705e-06, "loss": 0.0451, "step": 2439 }, { "epoch": 1.1101000909918108, "grad_norm": 0.4932334291437054, "learning_rate": 8.833180954719941e-06, "loss": 0.023, "step": 2440 }, { "epoch": 1.110555050045496, "grad_norm": 0.32570391119462067, "learning_rate": 8.832263073252597e-06, "loss": 0.0223, "step": 2441 }, { "epoch": 1.111010009099181, "grad_norm": 0.5189620509513116, "learning_rate": 8.831344878630022e-06, "loss": 0.0345, "step": 2442 }, { "epoch": 1.1114649681528663, "grad_norm": 0.35471915929013836, "learning_rate": 8.830426370927246e-06, "loss": 0.0178, "step": 2443 }, { "epoch": 1.1119199272065514, "grad_norm": 0.4071867204646678, "learning_rate": 8.829507550219323e-06, "loss": 0.0187, "step": 2444 }, { "epoch": 1.1123748862602365, "grad_norm": 0.5327053422443435, "learning_rate": 8.828588416581338e-06, "loss": 0.0321, "step": 2445 }, { "epoch": 1.1128298453139218, "grad_norm": 0.4727447057361278, "learning_rate": 8.827668970088397e-06, "loss": 0.0256, "step": 2446 }, { "epoch": 1.113284804367607, "grad_norm": 0.44344698021715867, "learning_rate": 8.826749210815634e-06, "loss": 0.0212, "step": 2447 }, { "epoch": 1.113739763421292, "grad_norm": 0.48653354586078956, "learning_rate": 8.825829138838206e-06, "loss": 0.0252, "step": 2448 }, { "epoch": 1.1141947224749773, "grad_norm": 0.4904789767614279, "learning_rate": 8.824908754231299e-06, "loss": 0.0219, "step": 2449 }, { "epoch": 1.1146496815286624, "grad_norm": 0.5096306577344566, "learning_rate": 8.823988057070122e-06, "loss": 0.0269, "step": 2450 }, { "epoch": 1.1151046405823477, "grad_norm": 0.4524604770972165, "learning_rate": 8.823067047429908e-06, "loss": 0.0197, "step": 2451 }, { "epoch": 1.1155595996360328, "grad_norm": 0.6661762941224277, "learning_rate": 8.82214572538592e-06, "loss": 0.0432, "step": 2452 }, { "epoch": 1.1160145586897179, "grad_norm": 0.45413808918893234, "learning_rate": 8.821224091013445e-06, "loss": 0.0252, "step": 2453 }, { "epoch": 1.1164695177434032, "grad_norm": 0.4564359066247584, "learning_rate": 8.820302144387794e-06, "loss": 0.0305, "step": 2454 }, { "epoch": 1.1169244767970883, "grad_norm": 0.5331752474098931, "learning_rate": 8.819379885584303e-06, "loss": 0.0285, "step": 2455 }, { "epoch": 1.1173794358507734, "grad_norm": 0.8314482044455632, "learning_rate": 8.818457314678336e-06, "loss": 0.0474, "step": 2456 }, { "epoch": 1.1178343949044587, "grad_norm": 0.5831509752587852, "learning_rate": 8.817534431745283e-06, "loss": 0.0204, "step": 2457 }, { "epoch": 1.1182893539581438, "grad_norm": 0.42113991056064237, "learning_rate": 8.816611236860554e-06, "loss": 0.0207, "step": 2458 }, { "epoch": 1.1187443130118289, "grad_norm": 0.5492674131587796, "learning_rate": 8.815687730099594e-06, "loss": 0.023, "step": 2459 }, { "epoch": 1.1191992720655142, "grad_norm": 0.5627677712218775, "learning_rate": 8.81476391153786e-06, "loss": 0.0238, "step": 2460 }, { "epoch": 1.1196542311191993, "grad_norm": 0.306412099822185, "learning_rate": 8.813839781250848e-06, "loss": 0.0136, "step": 2461 }, { "epoch": 1.1201091901728844, "grad_norm": 0.4884139369729457, "learning_rate": 8.812915339314073e-06, "loss": 0.0325, "step": 2462 }, { "epoch": 1.1205641492265697, "grad_norm": 0.6440331779678226, "learning_rate": 8.811990585803074e-06, "loss": 0.0462, "step": 2463 }, { "epoch": 1.1210191082802548, "grad_norm": 0.6354635395644428, "learning_rate": 8.81106552079342e-06, "loss": 0.0326, "step": 2464 }, { "epoch": 1.1214740673339398, "grad_norm": 0.4841057095746355, "learning_rate": 8.810140144360701e-06, "loss": 0.0288, "step": 2465 }, { "epoch": 1.1219290263876252, "grad_norm": 0.7578064954916388, "learning_rate": 8.809214456580539e-06, "loss": 0.0444, "step": 2466 }, { "epoch": 1.1223839854413102, "grad_norm": 0.36333027437030824, "learning_rate": 8.80828845752857e-06, "loss": 0.0166, "step": 2467 }, { "epoch": 1.1228389444949956, "grad_norm": 1.0828419984965674, "learning_rate": 8.80736214728047e-06, "loss": 0.0509, "step": 2468 }, { "epoch": 1.1232939035486806, "grad_norm": 0.41035853061268457, "learning_rate": 8.806435525911927e-06, "loss": 0.0152, "step": 2469 }, { "epoch": 1.1237488626023657, "grad_norm": 0.48117366130842515, "learning_rate": 8.805508593498662e-06, "loss": 0.0358, "step": 2470 }, { "epoch": 1.124203821656051, "grad_norm": 0.48865070302034325, "learning_rate": 8.804581350116422e-06, "loss": 0.0248, "step": 2471 }, { "epoch": 1.1246587807097361, "grad_norm": 0.6166160347574816, "learning_rate": 8.803653795840974e-06, "loss": 0.0372, "step": 2472 }, { "epoch": 1.1251137397634212, "grad_norm": 0.4235666133907878, "learning_rate": 8.802725930748115e-06, "loss": 0.0224, "step": 2473 }, { "epoch": 1.1255686988171065, "grad_norm": 0.49371023402555386, "learning_rate": 8.801797754913667e-06, "loss": 0.0253, "step": 2474 }, { "epoch": 1.1260236578707916, "grad_norm": 0.5375981231946215, "learning_rate": 8.800869268413475e-06, "loss": 0.0303, "step": 2475 }, { "epoch": 1.1264786169244767, "grad_norm": 0.6200342528643785, "learning_rate": 8.79994047132341e-06, "loss": 0.0301, "step": 2476 }, { "epoch": 1.126933575978162, "grad_norm": 0.7763567599332302, "learning_rate": 8.79901136371937e-06, "loss": 0.0367, "step": 2477 }, { "epoch": 1.127388535031847, "grad_norm": 0.4168679527566863, "learning_rate": 8.798081945677279e-06, "loss": 0.0193, "step": 2478 }, { "epoch": 1.1278434940855324, "grad_norm": 0.5499515478297102, "learning_rate": 8.797152217273082e-06, "loss": 0.0232, "step": 2479 }, { "epoch": 1.1282984531392175, "grad_norm": 0.3629031290073349, "learning_rate": 8.796222178582756e-06, "loss": 0.0217, "step": 2480 }, { "epoch": 1.1287534121929026, "grad_norm": 0.539897737827513, "learning_rate": 8.795291829682293e-06, "loss": 0.0272, "step": 2481 }, { "epoch": 1.129208371246588, "grad_norm": 0.5636939303591514, "learning_rate": 8.794361170647723e-06, "loss": 0.0322, "step": 2482 }, { "epoch": 1.129663330300273, "grad_norm": 0.6219815104303015, "learning_rate": 8.793430201555095e-06, "loss": 0.0274, "step": 2483 }, { "epoch": 1.130118289353958, "grad_norm": 0.6542904517198702, "learning_rate": 8.79249892248048e-06, "loss": 0.0358, "step": 2484 }, { "epoch": 1.1305732484076434, "grad_norm": 0.46666017679304383, "learning_rate": 8.79156733349998e-06, "loss": 0.0308, "step": 2485 }, { "epoch": 1.1310282074613285, "grad_norm": 0.643787908195578, "learning_rate": 8.790635434689722e-06, "loss": 0.0325, "step": 2486 }, { "epoch": 1.1314831665150136, "grad_norm": 0.6798497056398047, "learning_rate": 8.789703226125853e-06, "loss": 0.0388, "step": 2487 }, { "epoch": 1.1319381255686989, "grad_norm": 0.45682700520723596, "learning_rate": 8.78877070788455e-06, "loss": 0.0248, "step": 2488 }, { "epoch": 1.132393084622384, "grad_norm": 0.520494224107322, "learning_rate": 8.787837880042016e-06, "loss": 0.0251, "step": 2489 }, { "epoch": 1.132848043676069, "grad_norm": 0.5608809735379154, "learning_rate": 8.786904742674476e-06, "loss": 0.0354, "step": 2490 }, { "epoch": 1.1333030027297544, "grad_norm": 0.5383912877252518, "learning_rate": 8.78597129585818e-06, "loss": 0.0252, "step": 2491 }, { "epoch": 1.1337579617834395, "grad_norm": 0.3952421850434973, "learning_rate": 8.78503753966941e-06, "loss": 0.0191, "step": 2492 }, { "epoch": 1.1342129208371245, "grad_norm": 0.7660377240440205, "learning_rate": 8.784103474184463e-06, "loss": 0.0372, "step": 2493 }, { "epoch": 1.1346678798908099, "grad_norm": 0.45419840808136375, "learning_rate": 8.783169099479669e-06, "loss": 0.0237, "step": 2494 }, { "epoch": 1.135122838944495, "grad_norm": 0.6963944475868004, "learning_rate": 8.782234415631381e-06, "loss": 0.0402, "step": 2495 }, { "epoch": 1.1355777979981803, "grad_norm": 0.43802475738162483, "learning_rate": 8.781299422715979e-06, "loss": 0.0238, "step": 2496 }, { "epoch": 1.1360327570518653, "grad_norm": 0.6062845259672841, "learning_rate": 8.780364120809863e-06, "loss": 0.0299, "step": 2497 }, { "epoch": 1.1364877161055504, "grad_norm": 0.44459971712814256, "learning_rate": 8.779428509989463e-06, "loss": 0.0205, "step": 2498 }, { "epoch": 1.1369426751592357, "grad_norm": 0.8182256630287221, "learning_rate": 8.778492590331234e-06, "loss": 0.0358, "step": 2499 }, { "epoch": 1.1373976342129208, "grad_norm": 0.35292113524041313, "learning_rate": 8.777556361911652e-06, "loss": 0.0188, "step": 2500 }, { "epoch": 1.137852593266606, "grad_norm": 0.5495898839385301, "learning_rate": 8.776619824807225e-06, "loss": 0.0403, "step": 2501 }, { "epoch": 1.1383075523202912, "grad_norm": 0.47715012261917683, "learning_rate": 8.77568297909448e-06, "loss": 0.0308, "step": 2502 }, { "epoch": 1.1387625113739763, "grad_norm": 0.5057002315147829, "learning_rate": 8.774745824849973e-06, "loss": 0.0255, "step": 2503 }, { "epoch": 1.1392174704276614, "grad_norm": 0.637445487028803, "learning_rate": 8.773808362150284e-06, "loss": 0.0441, "step": 2504 }, { "epoch": 1.1396724294813467, "grad_norm": 0.46970000948757085, "learning_rate": 8.772870591072016e-06, "loss": 0.0203, "step": 2505 }, { "epoch": 1.1401273885350318, "grad_norm": 0.48405940158780947, "learning_rate": 8.771932511691805e-06, "loss": 0.0248, "step": 2506 }, { "epoch": 1.1405823475887171, "grad_norm": 0.5007699680851107, "learning_rate": 8.7709941240863e-06, "loss": 0.0299, "step": 2507 }, { "epoch": 1.1410373066424022, "grad_norm": 0.47412512472759577, "learning_rate": 8.770055428332187e-06, "loss": 0.0289, "step": 2508 }, { "epoch": 1.1414922656960873, "grad_norm": 0.6167640062421629, "learning_rate": 8.769116424506168e-06, "loss": 0.0308, "step": 2509 }, { "epoch": 1.1419472247497726, "grad_norm": 0.39237316345479106, "learning_rate": 8.768177112684976e-06, "loss": 0.023, "step": 2510 }, { "epoch": 1.1424021838034577, "grad_norm": 0.5186908295343413, "learning_rate": 8.767237492945372e-06, "loss": 0.0253, "step": 2511 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5056070603356543, "learning_rate": 8.766297565364127e-06, "loss": 0.0269, "step": 2512 }, { "epoch": 1.143312101910828, "grad_norm": 0.572114404769031, "learning_rate": 8.765357330018056e-06, "loss": 0.04, "step": 2513 }, { "epoch": 1.1437670609645132, "grad_norm": 0.5742667251635876, "learning_rate": 8.764416786983987e-06, "loss": 0.0341, "step": 2514 }, { "epoch": 1.1442220200181983, "grad_norm": 0.7921946978016261, "learning_rate": 8.763475936338778e-06, "loss": 0.0297, "step": 2515 }, { "epoch": 1.1446769790718836, "grad_norm": 0.5932003547457203, "learning_rate": 8.762534778159313e-06, "loss": 0.0329, "step": 2516 }, { "epoch": 1.1451319381255687, "grad_norm": 0.4383972484081299, "learning_rate": 8.761593312522496e-06, "loss": 0.026, "step": 2517 }, { "epoch": 1.1455868971792538, "grad_norm": 0.494406013971066, "learning_rate": 8.76065153950526e-06, "loss": 0.0252, "step": 2518 }, { "epoch": 1.146041856232939, "grad_norm": 0.41600285124838154, "learning_rate": 8.759709459184565e-06, "loss": 0.03, "step": 2519 }, { "epoch": 1.1464968152866242, "grad_norm": 0.7103449624996373, "learning_rate": 8.758767071637391e-06, "loss": 0.0293, "step": 2520 }, { "epoch": 1.1469517743403093, "grad_norm": 0.7247596682387525, "learning_rate": 8.757824376940748e-06, "loss": 0.0534, "step": 2521 }, { "epoch": 1.1474067333939946, "grad_norm": 0.5429066180348485, "learning_rate": 8.756881375171664e-06, "loss": 0.0366, "step": 2522 }, { "epoch": 1.1478616924476797, "grad_norm": 0.5884373670939516, "learning_rate": 8.755938066407201e-06, "loss": 0.0335, "step": 2523 }, { "epoch": 1.148316651501365, "grad_norm": 0.6156045708560577, "learning_rate": 8.754994450724441e-06, "loss": 0.0345, "step": 2524 }, { "epoch": 1.14877161055505, "grad_norm": 0.5614699649040673, "learning_rate": 8.754050528200493e-06, "loss": 0.0329, "step": 2525 }, { "epoch": 1.1492265696087351, "grad_norm": 0.6406021126928062, "learning_rate": 8.753106298912488e-06, "loss": 0.0306, "step": 2526 }, { "epoch": 1.1496815286624205, "grad_norm": 0.5000438600163287, "learning_rate": 8.752161762937586e-06, "loss": 0.0223, "step": 2527 }, { "epoch": 1.1501364877161055, "grad_norm": 0.3997197285041498, "learning_rate": 8.751216920352967e-06, "loss": 0.0221, "step": 2528 }, { "epoch": 1.1505914467697906, "grad_norm": 0.5040179214810742, "learning_rate": 8.750271771235844e-06, "loss": 0.0196, "step": 2529 }, { "epoch": 1.151046405823476, "grad_norm": 0.40549609696673644, "learning_rate": 8.749326315663447e-06, "loss": 0.0231, "step": 2530 }, { "epoch": 1.151501364877161, "grad_norm": 0.406160230893779, "learning_rate": 8.748380553713033e-06, "loss": 0.0208, "step": 2531 }, { "epoch": 1.1519563239308463, "grad_norm": 0.5844194685702613, "learning_rate": 8.747434485461892e-06, "loss": 0.0241, "step": 2532 }, { "epoch": 1.1524112829845314, "grad_norm": 0.36029638509152084, "learning_rate": 8.746488110987326e-06, "loss": 0.015, "step": 2533 }, { "epoch": 1.1528662420382165, "grad_norm": 0.7276197204807093, "learning_rate": 8.745541430366671e-06, "loss": 0.0418, "step": 2534 }, { "epoch": 1.1533212010919018, "grad_norm": 1.5020467500828025, "learning_rate": 8.744594443677284e-06, "loss": 0.0582, "step": 2535 }, { "epoch": 1.153776160145587, "grad_norm": 0.4311974728697227, "learning_rate": 8.743647150996551e-06, "loss": 0.0258, "step": 2536 }, { "epoch": 1.154231119199272, "grad_norm": 0.6248463720530537, "learning_rate": 8.742699552401878e-06, "loss": 0.0398, "step": 2537 }, { "epoch": 1.1546860782529573, "grad_norm": 0.5339944254155865, "learning_rate": 8.7417516479707e-06, "loss": 0.0252, "step": 2538 }, { "epoch": 1.1551410373066424, "grad_norm": 0.3465118720450813, "learning_rate": 8.740803437780474e-06, "loss": 0.0183, "step": 2539 }, { "epoch": 1.1555959963603275, "grad_norm": 0.6096918552154363, "learning_rate": 8.739854921908684e-06, "loss": 0.0318, "step": 2540 }, { "epoch": 1.1560509554140128, "grad_norm": 0.42626286323793855, "learning_rate": 8.73890610043284e-06, "loss": 0.0292, "step": 2541 }, { "epoch": 1.156505914467698, "grad_norm": 0.47325164391197866, "learning_rate": 8.737956973430475e-06, "loss": 0.0337, "step": 2542 }, { "epoch": 1.156960873521383, "grad_norm": 0.6214186683671308, "learning_rate": 8.737007540979146e-06, "loss": 0.0235, "step": 2543 }, { "epoch": 1.1574158325750683, "grad_norm": 0.4958886649213906, "learning_rate": 8.736057803156436e-06, "loss": 0.0255, "step": 2544 }, { "epoch": 1.1578707916287534, "grad_norm": 0.3732620529932146, "learning_rate": 8.735107760039954e-06, "loss": 0.0197, "step": 2545 }, { "epoch": 1.1583257506824385, "grad_norm": 0.5778213004705967, "learning_rate": 8.734157411707334e-06, "loss": 0.0277, "step": 2546 }, { "epoch": 1.1587807097361238, "grad_norm": 0.4850677867721973, "learning_rate": 8.733206758236235e-06, "loss": 0.0235, "step": 2547 }, { "epoch": 1.1592356687898089, "grad_norm": 0.5687049775983313, "learning_rate": 8.732255799704337e-06, "loss": 0.0335, "step": 2548 }, { "epoch": 1.159690627843494, "grad_norm": 0.5063906062734673, "learning_rate": 8.73130453618935e-06, "loss": 0.0224, "step": 2549 }, { "epoch": 1.1601455868971793, "grad_norm": 0.4830706957588217, "learning_rate": 8.730352967769007e-06, "loss": 0.026, "step": 2550 }, { "epoch": 1.1606005459508644, "grad_norm": 0.4565903397736301, "learning_rate": 8.729401094521066e-06, "loss": 0.0171, "step": 2551 }, { "epoch": 1.1610555050045497, "grad_norm": 0.5299141705331825, "learning_rate": 8.728448916523309e-06, "loss": 0.0283, "step": 2552 }, { "epoch": 1.1615104640582348, "grad_norm": 0.5618467862878425, "learning_rate": 8.727496433853543e-06, "loss": 0.0289, "step": 2553 }, { "epoch": 1.1619654231119199, "grad_norm": 0.464342731748468, "learning_rate": 8.726543646589605e-06, "loss": 0.0202, "step": 2554 }, { "epoch": 1.1624203821656052, "grad_norm": 0.5984943035378484, "learning_rate": 8.725590554809346e-06, "loss": 0.0387, "step": 2555 }, { "epoch": 1.1628753412192903, "grad_norm": 0.3103247899143151, "learning_rate": 8.724637158590652e-06, "loss": 0.0172, "step": 2556 }, { "epoch": 1.1633303002729753, "grad_norm": 0.5719001232225214, "learning_rate": 8.72368345801143e-06, "loss": 0.0328, "step": 2557 }, { "epoch": 1.1637852593266607, "grad_norm": 0.7184689253863656, "learning_rate": 8.722729453149613e-06, "loss": 0.0256, "step": 2558 }, { "epoch": 1.1642402183803457, "grad_norm": 0.4264869300929295, "learning_rate": 8.721775144083155e-06, "loss": 0.0273, "step": 2559 }, { "epoch": 1.164695177434031, "grad_norm": 0.6992959245688258, "learning_rate": 8.72082053089004e-06, "loss": 0.0391, "step": 2560 }, { "epoch": 1.1651501364877161, "grad_norm": 0.5598830058244858, "learning_rate": 8.719865613648276e-06, "loss": 0.0348, "step": 2561 }, { "epoch": 1.1656050955414012, "grad_norm": 0.4490293057873329, "learning_rate": 8.718910392435892e-06, "loss": 0.0185, "step": 2562 }, { "epoch": 1.1660600545950865, "grad_norm": 0.3188239247752473, "learning_rate": 8.717954867330943e-06, "loss": 0.0118, "step": 2563 }, { "epoch": 1.1665150136487716, "grad_norm": 0.529002754756549, "learning_rate": 8.716999038411513e-06, "loss": 0.0422, "step": 2564 }, { "epoch": 1.1669699727024567, "grad_norm": 0.6102751055626958, "learning_rate": 8.716042905755708e-06, "loss": 0.0321, "step": 2565 }, { "epoch": 1.167424931756142, "grad_norm": 0.4958464600211268, "learning_rate": 8.715086469441659e-06, "loss": 0.027, "step": 2566 }, { "epoch": 1.1678798908098271, "grad_norm": 0.6925927485590572, "learning_rate": 8.714129729547522e-06, "loss": 0.0528, "step": 2567 }, { "epoch": 1.1683348498635122, "grad_norm": 0.48346645004557054, "learning_rate": 8.713172686151475e-06, "loss": 0.0241, "step": 2568 }, { "epoch": 1.1687898089171975, "grad_norm": 0.6160868757033329, "learning_rate": 8.712215339331724e-06, "loss": 0.0364, "step": 2569 }, { "epoch": 1.1692447679708826, "grad_norm": 0.5521736841094272, "learning_rate": 8.711257689166499e-06, "loss": 0.0384, "step": 2570 }, { "epoch": 1.1696997270245677, "grad_norm": 0.4358123533199606, "learning_rate": 8.710299735734057e-06, "loss": 0.0218, "step": 2571 }, { "epoch": 1.170154686078253, "grad_norm": 0.49989161769199447, "learning_rate": 8.709341479112676e-06, "loss": 0.019, "step": 2572 }, { "epoch": 1.170609645131938, "grad_norm": 0.6461070187412289, "learning_rate": 8.70838291938066e-06, "loss": 0.05, "step": 2573 }, { "epoch": 1.1710646041856232, "grad_norm": 0.5015730644729591, "learning_rate": 8.70742405661634e-06, "loss": 0.0262, "step": 2574 }, { "epoch": 1.1715195632393085, "grad_norm": 0.6731652049317264, "learning_rate": 8.706464890898068e-06, "loss": 0.0417, "step": 2575 }, { "epoch": 1.1719745222929936, "grad_norm": 0.5953498514866105, "learning_rate": 8.705505422304224e-06, "loss": 0.0251, "step": 2576 }, { "epoch": 1.1724294813466787, "grad_norm": 0.49337464142227694, "learning_rate": 8.70454565091321e-06, "loss": 0.0283, "step": 2577 }, { "epoch": 1.172884440400364, "grad_norm": 0.40746621618427764, "learning_rate": 8.703585576803455e-06, "loss": 0.0235, "step": 2578 }, { "epoch": 1.173339399454049, "grad_norm": 0.574388099759434, "learning_rate": 8.702625200053412e-06, "loss": 0.0357, "step": 2579 }, { "epoch": 1.1737943585077344, "grad_norm": 0.49209063287204186, "learning_rate": 8.701664520741558e-06, "loss": 0.0271, "step": 2580 }, { "epoch": 1.1742493175614195, "grad_norm": 0.49658769644628054, "learning_rate": 8.700703538946396e-06, "loss": 0.0312, "step": 2581 }, { "epoch": 1.1747042766151046, "grad_norm": 0.48898735666034404, "learning_rate": 8.699742254746452e-06, "loss": 0.0308, "step": 2582 }, { "epoch": 1.1751592356687899, "grad_norm": 0.6965571111870493, "learning_rate": 8.698780668220281e-06, "loss": 0.0587, "step": 2583 }, { "epoch": 1.175614194722475, "grad_norm": 0.4680913844344663, "learning_rate": 8.697818779446456e-06, "loss": 0.0268, "step": 2584 }, { "epoch": 1.17606915377616, "grad_norm": 0.5966094635320064, "learning_rate": 8.696856588503582e-06, "loss": 0.0441, "step": 2585 }, { "epoch": 1.1765241128298454, "grad_norm": 0.41029105691286216, "learning_rate": 8.69589409547028e-06, "loss": 0.0238, "step": 2586 }, { "epoch": 1.1769790718835305, "grad_norm": 0.4919555962191467, "learning_rate": 8.694931300425204e-06, "loss": 0.022, "step": 2587 }, { "epoch": 1.1774340309372158, "grad_norm": 0.4941665993905159, "learning_rate": 8.693968203447027e-06, "loss": 0.0318, "step": 2588 }, { "epoch": 1.1778889899909009, "grad_norm": 0.4471241857833498, "learning_rate": 8.693004804614451e-06, "loss": 0.0298, "step": 2589 }, { "epoch": 1.178343949044586, "grad_norm": 0.42475689565329255, "learning_rate": 8.692041104006201e-06, "loss": 0.0245, "step": 2590 }, { "epoch": 1.1787989080982713, "grad_norm": 0.7037247909228679, "learning_rate": 8.691077101701024e-06, "loss": 0.0422, "step": 2591 }, { "epoch": 1.1792538671519563, "grad_norm": 0.4727292395507324, "learning_rate": 8.690112797777695e-06, "loss": 0.0286, "step": 2592 }, { "epoch": 1.1797088262056414, "grad_norm": 0.4886187172760372, "learning_rate": 8.689148192315013e-06, "loss": 0.0253, "step": 2593 }, { "epoch": 1.1801637852593267, "grad_norm": 0.4878895092851417, "learning_rate": 8.6881832853918e-06, "loss": 0.0294, "step": 2594 }, { "epoch": 1.1806187443130118, "grad_norm": 0.3785632403936228, "learning_rate": 8.687218077086905e-06, "loss": 0.0262, "step": 2595 }, { "epoch": 1.181073703366697, "grad_norm": 0.3032359273578328, "learning_rate": 8.6862525674792e-06, "loss": 0.0207, "step": 2596 }, { "epoch": 1.1815286624203822, "grad_norm": 0.5805982565364416, "learning_rate": 8.685286756647582e-06, "loss": 0.0299, "step": 2597 }, { "epoch": 1.1819836214740673, "grad_norm": 0.5312395563049912, "learning_rate": 8.684320644670975e-06, "loss": 0.0391, "step": 2598 }, { "epoch": 1.1824385805277524, "grad_norm": 0.6427828501421616, "learning_rate": 8.68335423162832e-06, "loss": 0.0366, "step": 2599 }, { "epoch": 1.1828935395814377, "grad_norm": 0.6549023820063344, "learning_rate": 8.682387517598591e-06, "loss": 0.0466, "step": 2600 }, { "epoch": 1.1833484986351228, "grad_norm": 0.4191743788408071, "learning_rate": 8.681420502660785e-06, "loss": 0.0233, "step": 2601 }, { "epoch": 1.183803457688808, "grad_norm": 0.4871715984486466, "learning_rate": 8.68045318689392e-06, "loss": 0.0271, "step": 2602 }, { "epoch": 1.1842584167424932, "grad_norm": 0.6701976394432037, "learning_rate": 8.679485570377043e-06, "loss": 0.0306, "step": 2603 }, { "epoch": 1.1847133757961783, "grad_norm": 0.6441120205935942, "learning_rate": 8.678517653189222e-06, "loss": 0.0394, "step": 2604 }, { "epoch": 1.1851683348498634, "grad_norm": 0.5060858425158437, "learning_rate": 8.677549435409548e-06, "loss": 0.0217, "step": 2605 }, { "epoch": 1.1856232939035487, "grad_norm": 0.6752485468046396, "learning_rate": 8.676580917117144e-06, "loss": 0.039, "step": 2606 }, { "epoch": 1.1860782529572338, "grad_norm": 0.3957815075118571, "learning_rate": 8.675612098391149e-06, "loss": 0.0188, "step": 2607 }, { "epoch": 1.186533212010919, "grad_norm": 0.5187116630942156, "learning_rate": 8.674642979310732e-06, "loss": 0.026, "step": 2608 }, { "epoch": 1.1869881710646042, "grad_norm": 0.5769983660492354, "learning_rate": 8.673673559955086e-06, "loss": 0.0343, "step": 2609 }, { "epoch": 1.1874431301182893, "grad_norm": 0.4743399882711679, "learning_rate": 8.672703840403428e-06, "loss": 0.0293, "step": 2610 }, { "epoch": 1.1878980891719746, "grad_norm": 0.3693698002797069, "learning_rate": 8.671733820734996e-06, "loss": 0.0162, "step": 2611 }, { "epoch": 1.1883530482256597, "grad_norm": 0.7143210340908582, "learning_rate": 8.670763501029059e-06, "loss": 0.0424, "step": 2612 }, { "epoch": 1.1888080072793448, "grad_norm": 0.44099669973790273, "learning_rate": 8.669792881364905e-06, "loss": 0.0288, "step": 2613 }, { "epoch": 1.18926296633303, "grad_norm": 0.47880134181841405, "learning_rate": 8.668821961821848e-06, "loss": 0.0356, "step": 2614 }, { "epoch": 1.1897179253867152, "grad_norm": 0.49961852236193943, "learning_rate": 8.66785074247923e-06, "loss": 0.0264, "step": 2615 }, { "epoch": 1.1901728844404005, "grad_norm": 0.6606861173434392, "learning_rate": 8.666879223416413e-06, "loss": 0.0402, "step": 2616 }, { "epoch": 1.1906278434940856, "grad_norm": 0.5832250365729773, "learning_rate": 8.665907404712786e-06, "loss": 0.0349, "step": 2617 }, { "epoch": 1.1910828025477707, "grad_norm": 0.47607736173413934, "learning_rate": 8.66493528644776e-06, "loss": 0.0275, "step": 2618 }, { "epoch": 1.191537761601456, "grad_norm": 0.4323045066773957, "learning_rate": 8.663962868700773e-06, "loss": 0.0215, "step": 2619 }, { "epoch": 1.191992720655141, "grad_norm": 0.6823901111258103, "learning_rate": 8.662990151551288e-06, "loss": 0.0367, "step": 2620 }, { "epoch": 1.1924476797088261, "grad_norm": 0.568395741941641, "learning_rate": 8.66201713507879e-06, "loss": 0.0327, "step": 2621 }, { "epoch": 1.1929026387625115, "grad_norm": 0.8032308375903047, "learning_rate": 8.661043819362788e-06, "loss": 0.0396, "step": 2622 }, { "epoch": 1.1933575978161965, "grad_norm": 0.5352047847553939, "learning_rate": 8.660070204482818e-06, "loss": 0.0384, "step": 2623 }, { "epoch": 1.1938125568698816, "grad_norm": 0.43266491785940075, "learning_rate": 8.65909629051844e-06, "loss": 0.0235, "step": 2624 }, { "epoch": 1.194267515923567, "grad_norm": 0.5039359947320041, "learning_rate": 8.658122077549239e-06, "loss": 0.0332, "step": 2625 }, { "epoch": 1.194722474977252, "grad_norm": 0.46282675009108876, "learning_rate": 8.65714756565482e-06, "loss": 0.028, "step": 2626 }, { "epoch": 1.1951774340309371, "grad_norm": 0.42685254155176316, "learning_rate": 8.656172754914818e-06, "loss": 0.0193, "step": 2627 }, { "epoch": 1.1956323930846224, "grad_norm": 0.5644652302861507, "learning_rate": 8.655197645408889e-06, "loss": 0.0327, "step": 2628 }, { "epoch": 1.1960873521383075, "grad_norm": 0.6017102850762671, "learning_rate": 8.654222237216714e-06, "loss": 0.0395, "step": 2629 }, { "epoch": 1.1965423111919926, "grad_norm": 0.4828717952370834, "learning_rate": 8.653246530418003e-06, "loss": 0.0296, "step": 2630 }, { "epoch": 1.196997270245678, "grad_norm": 0.4718632798920294, "learning_rate": 8.652270525092481e-06, "loss": 0.0175, "step": 2631 }, { "epoch": 1.197452229299363, "grad_norm": 0.9210566120370747, "learning_rate": 8.651294221319907e-06, "loss": 0.0532, "step": 2632 }, { "epoch": 1.197907188353048, "grad_norm": 0.5973832244257986, "learning_rate": 8.650317619180057e-06, "loss": 0.0356, "step": 2633 }, { "epoch": 1.1983621474067334, "grad_norm": 0.4056353546459655, "learning_rate": 8.649340718752736e-06, "loss": 0.0233, "step": 2634 }, { "epoch": 1.1988171064604185, "grad_norm": 0.6383917144915527, "learning_rate": 8.648363520117773e-06, "loss": 0.0282, "step": 2635 }, { "epoch": 1.1992720655141038, "grad_norm": 0.30187722032440356, "learning_rate": 8.647386023355017e-06, "loss": 0.015, "step": 2636 }, { "epoch": 1.199727024567789, "grad_norm": 0.7620089776567717, "learning_rate": 8.646408228544349e-06, "loss": 0.0449, "step": 2637 }, { "epoch": 1.200181983621474, "grad_norm": 0.7042927681153068, "learning_rate": 8.645430135765667e-06, "loss": 0.04, "step": 2638 }, { "epoch": 1.2006369426751593, "grad_norm": 0.5117403840739881, "learning_rate": 8.644451745098896e-06, "loss": 0.0297, "step": 2639 }, { "epoch": 1.2010919017288444, "grad_norm": 0.7659399394915278, "learning_rate": 8.643473056623987e-06, "loss": 0.0592, "step": 2640 }, { "epoch": 1.2015468607825295, "grad_norm": 0.5678495394727697, "learning_rate": 8.642494070420912e-06, "loss": 0.032, "step": 2641 }, { "epoch": 1.2020018198362148, "grad_norm": 0.4587046178873542, "learning_rate": 8.641514786569674e-06, "loss": 0.0273, "step": 2642 }, { "epoch": 1.2024567788898999, "grad_norm": 0.5810971871142143, "learning_rate": 8.640535205150291e-06, "loss": 0.0436, "step": 2643 }, { "epoch": 1.2029117379435852, "grad_norm": 0.49553783255896267, "learning_rate": 8.639555326242812e-06, "loss": 0.0375, "step": 2644 }, { "epoch": 1.2033666969972703, "grad_norm": 0.700954373813157, "learning_rate": 8.638575149927306e-06, "loss": 0.0416, "step": 2645 }, { "epoch": 1.2038216560509554, "grad_norm": 0.51916075076626, "learning_rate": 8.637594676283872e-06, "loss": 0.0301, "step": 2646 }, { "epoch": 1.2042766151046407, "grad_norm": 0.5616014526557234, "learning_rate": 8.636613905392628e-06, "loss": 0.0333, "step": 2647 }, { "epoch": 1.2047315741583258, "grad_norm": 0.3996003632999196, "learning_rate": 8.635632837333719e-06, "loss": 0.0203, "step": 2648 }, { "epoch": 1.2051865332120109, "grad_norm": 0.5908400254903149, "learning_rate": 8.634651472187312e-06, "loss": 0.0355, "step": 2649 }, { "epoch": 1.2056414922656962, "grad_norm": 0.5521857176836706, "learning_rate": 8.633669810033601e-06, "loss": 0.0302, "step": 2650 }, { "epoch": 1.2060964513193813, "grad_norm": 0.47154629646415547, "learning_rate": 8.632687850952803e-06, "loss": 0.0254, "step": 2651 }, { "epoch": 1.2065514103730663, "grad_norm": 0.5084600548265098, "learning_rate": 8.63170559502516e-06, "loss": 0.0263, "step": 2652 }, { "epoch": 1.2070063694267517, "grad_norm": 0.41669809700741084, "learning_rate": 8.630723042330934e-06, "loss": 0.0235, "step": 2653 }, { "epoch": 1.2074613284804367, "grad_norm": 0.4239984269262903, "learning_rate": 8.629740192950418e-06, "loss": 0.0258, "step": 2654 }, { "epoch": 1.2079162875341218, "grad_norm": 0.5493755020180808, "learning_rate": 8.628757046963925e-06, "loss": 0.0312, "step": 2655 }, { "epoch": 1.2083712465878071, "grad_norm": 0.44940260929025, "learning_rate": 8.627773604451795e-06, "loss": 0.0253, "step": 2656 }, { "epoch": 1.2088262056414922, "grad_norm": 0.49748760446391493, "learning_rate": 8.626789865494388e-06, "loss": 0.029, "step": 2657 }, { "epoch": 1.2092811646951773, "grad_norm": 0.4473696250717918, "learning_rate": 8.62580583017209e-06, "loss": 0.0265, "step": 2658 }, { "epoch": 1.2097361237488626, "grad_norm": 0.634783340896908, "learning_rate": 8.624821498565316e-06, "loss": 0.0375, "step": 2659 }, { "epoch": 1.2101910828025477, "grad_norm": 0.5688906906342468, "learning_rate": 8.623836870754497e-06, "loss": 0.0291, "step": 2660 }, { "epoch": 1.210646041856233, "grad_norm": 0.524163167377845, "learning_rate": 8.622851946820094e-06, "loss": 0.0343, "step": 2661 }, { "epoch": 1.2111010009099181, "grad_norm": 0.4184285347511745, "learning_rate": 8.621866726842592e-06, "loss": 0.0245, "step": 2662 }, { "epoch": 1.2115559599636032, "grad_norm": 0.5452023193304021, "learning_rate": 8.620881210902497e-06, "loss": 0.0361, "step": 2663 }, { "epoch": 1.2120109190172885, "grad_norm": 0.8825681885181793, "learning_rate": 8.61989539908034e-06, "loss": 0.0551, "step": 2664 }, { "epoch": 1.2124658780709736, "grad_norm": 0.6606796283358398, "learning_rate": 8.61890929145668e-06, "loss": 0.0501, "step": 2665 }, { "epoch": 1.2129208371246587, "grad_norm": 0.5383057502775304, "learning_rate": 8.617922888112093e-06, "loss": 0.0327, "step": 2666 }, { "epoch": 1.213375796178344, "grad_norm": 0.456267646438963, "learning_rate": 8.616936189127189e-06, "loss": 0.0271, "step": 2667 }, { "epoch": 1.213830755232029, "grad_norm": 0.6876820645690198, "learning_rate": 8.615949194582591e-06, "loss": 0.0522, "step": 2668 }, { "epoch": 1.2142857142857142, "grad_norm": 0.4235510337955621, "learning_rate": 8.614961904558956e-06, "loss": 0.0178, "step": 2669 }, { "epoch": 1.2147406733393995, "grad_norm": 0.31389612581359266, "learning_rate": 8.613974319136959e-06, "loss": 0.0142, "step": 2670 }, { "epoch": 1.2151956323930846, "grad_norm": 0.5466534592913287, "learning_rate": 8.6129864383973e-06, "loss": 0.0325, "step": 2671 }, { "epoch": 1.21565059144677, "grad_norm": 0.6256801141600264, "learning_rate": 8.611998262420707e-06, "loss": 0.031, "step": 2672 }, { "epoch": 1.216105550500455, "grad_norm": 0.5060382153635896, "learning_rate": 8.611009791287926e-06, "loss": 0.0262, "step": 2673 }, { "epoch": 1.21656050955414, "grad_norm": 0.5027235560302646, "learning_rate": 8.610021025079734e-06, "loss": 0.0254, "step": 2674 }, { "epoch": 1.2170154686078254, "grad_norm": 0.5543017523957823, "learning_rate": 8.609031963876924e-06, "loss": 0.0308, "step": 2675 }, { "epoch": 1.2174704276615105, "grad_norm": 0.4737161111249352, "learning_rate": 8.608042607760322e-06, "loss": 0.0326, "step": 2676 }, { "epoch": 1.2179253867151956, "grad_norm": 0.4843464243684333, "learning_rate": 8.607052956810772e-06, "loss": 0.0258, "step": 2677 }, { "epoch": 1.2183803457688809, "grad_norm": 0.5194322149503382, "learning_rate": 8.606063011109143e-06, "loss": 0.0358, "step": 2678 }, { "epoch": 1.218835304822566, "grad_norm": 0.5930513493210321, "learning_rate": 8.60507277073633e-06, "loss": 0.0362, "step": 2679 }, { "epoch": 1.219290263876251, "grad_norm": 0.32996053031100914, "learning_rate": 8.604082235773249e-06, "loss": 0.0131, "step": 2680 }, { "epoch": 1.2197452229299364, "grad_norm": 0.4531032973363827, "learning_rate": 8.603091406300845e-06, "loss": 0.0264, "step": 2681 }, { "epoch": 1.2202001819836215, "grad_norm": 0.4752447004618926, "learning_rate": 8.602100282400082e-06, "loss": 0.0222, "step": 2682 }, { "epoch": 1.2206551410373065, "grad_norm": 0.48294135837077795, "learning_rate": 8.60110886415195e-06, "loss": 0.0286, "step": 2683 }, { "epoch": 1.2211101000909919, "grad_norm": 0.8146460808068521, "learning_rate": 8.600117151637465e-06, "loss": 0.0553, "step": 2684 }, { "epoch": 1.221565059144677, "grad_norm": 0.5348405988590901, "learning_rate": 8.599125144937666e-06, "loss": 0.0341, "step": 2685 }, { "epoch": 1.222020018198362, "grad_norm": 0.5209228039836593, "learning_rate": 8.598132844133614e-06, "loss": 0.0285, "step": 2686 }, { "epoch": 1.2224749772520473, "grad_norm": 0.8667405302686297, "learning_rate": 8.597140249306393e-06, "loss": 0.0554, "step": 2687 }, { "epoch": 1.2229299363057324, "grad_norm": 0.3662245233762516, "learning_rate": 8.596147360537115e-06, "loss": 0.0186, "step": 2688 }, { "epoch": 1.2233848953594177, "grad_norm": 0.5675330701823686, "learning_rate": 8.595154177906915e-06, "loss": 0.0252, "step": 2689 }, { "epoch": 1.2238398544131028, "grad_norm": 0.5055412550341041, "learning_rate": 8.594160701496951e-06, "loss": 0.0359, "step": 2690 }, { "epoch": 1.224294813466788, "grad_norm": 0.4636507359192646, "learning_rate": 8.593166931388408e-06, "loss": 0.0235, "step": 2691 }, { "epoch": 1.2247497725204732, "grad_norm": 0.5789114485670152, "learning_rate": 8.592172867662488e-06, "loss": 0.0309, "step": 2692 }, { "epoch": 1.2252047315741583, "grad_norm": 0.5362511549256743, "learning_rate": 8.591178510400424e-06, "loss": 0.0288, "step": 2693 }, { "epoch": 1.2256596906278434, "grad_norm": 0.665176698679116, "learning_rate": 8.590183859683469e-06, "loss": 0.0381, "step": 2694 }, { "epoch": 1.2261146496815287, "grad_norm": 0.5319510120853973, "learning_rate": 8.589188915592903e-06, "loss": 0.0359, "step": 2695 }, { "epoch": 1.2265696087352138, "grad_norm": 0.4177494615666587, "learning_rate": 8.588193678210026e-06, "loss": 0.0194, "step": 2696 }, { "epoch": 1.2270245677888991, "grad_norm": 0.34563423472616117, "learning_rate": 8.587198147616166e-06, "loss": 0.0188, "step": 2697 }, { "epoch": 1.2274795268425842, "grad_norm": 0.5420023688259344, "learning_rate": 8.586202323892675e-06, "loss": 0.0322, "step": 2698 }, { "epoch": 1.2279344858962693, "grad_norm": 0.5715046852040315, "learning_rate": 8.585206207120925e-06, "loss": 0.0248, "step": 2699 }, { "epoch": 1.2283894449499546, "grad_norm": 0.6150293588585071, "learning_rate": 8.584209797382313e-06, "loss": 0.0349, "step": 2700 }, { "epoch": 1.2288444040036397, "grad_norm": 0.7538546206140824, "learning_rate": 8.583213094758262e-06, "loss": 0.0415, "step": 2701 }, { "epoch": 1.2292993630573248, "grad_norm": 0.41258699232239693, "learning_rate": 8.582216099330218e-06, "loss": 0.0252, "step": 2702 }, { "epoch": 1.22975432211101, "grad_norm": 0.5992053934366026, "learning_rate": 8.581218811179655e-06, "loss": 0.0231, "step": 2703 }, { "epoch": 1.2302092811646952, "grad_norm": 0.4911038111295034, "learning_rate": 8.58022123038806e-06, "loss": 0.0367, "step": 2704 }, { "epoch": 1.2306642402183803, "grad_norm": 0.5415583441174247, "learning_rate": 8.579223357036956e-06, "loss": 0.0356, "step": 2705 }, { "epoch": 1.2311191992720656, "grad_norm": 0.648050207407017, "learning_rate": 8.578225191207881e-06, "loss": 0.0322, "step": 2706 }, { "epoch": 1.2315741583257507, "grad_norm": 0.6515223387873779, "learning_rate": 8.577226732982405e-06, "loss": 0.0424, "step": 2707 }, { "epoch": 1.2320291173794358, "grad_norm": 0.7662318426027166, "learning_rate": 8.576227982442114e-06, "loss": 0.037, "step": 2708 }, { "epoch": 1.232484076433121, "grad_norm": 0.4709920734770032, "learning_rate": 8.575228939668623e-06, "loss": 0.0284, "step": 2709 }, { "epoch": 1.2329390354868062, "grad_norm": 0.7144313144730997, "learning_rate": 8.574229604743566e-06, "loss": 0.0316, "step": 2710 }, { "epoch": 1.2333939945404913, "grad_norm": 0.4992331855484428, "learning_rate": 8.573229977748609e-06, "loss": 0.0345, "step": 2711 }, { "epoch": 1.2338489535941766, "grad_norm": 0.6112686451914704, "learning_rate": 8.572230058765434e-06, "loss": 0.0358, "step": 2712 }, { "epoch": 1.2343039126478617, "grad_norm": 0.8262726736467544, "learning_rate": 8.571229847875751e-06, "loss": 0.0641, "step": 2713 }, { "epoch": 1.2347588717015467, "grad_norm": 0.4953827805427677, "learning_rate": 8.570229345161293e-06, "loss": 0.0247, "step": 2714 }, { "epoch": 1.235213830755232, "grad_norm": 0.3801656553630412, "learning_rate": 8.569228550703815e-06, "loss": 0.0249, "step": 2715 }, { "epoch": 1.2356687898089171, "grad_norm": 0.49612613452863535, "learning_rate": 8.568227464585099e-06, "loss": 0.0277, "step": 2716 }, { "epoch": 1.2361237488626025, "grad_norm": 0.4582666835548743, "learning_rate": 8.567226086886948e-06, "loss": 0.0262, "step": 2717 }, { "epoch": 1.2365787079162875, "grad_norm": 0.6697552955443566, "learning_rate": 8.566224417691191e-06, "loss": 0.0338, "step": 2718 }, { "epoch": 1.2370336669699726, "grad_norm": 0.8001154357445661, "learning_rate": 8.565222457079679e-06, "loss": 0.0685, "step": 2719 }, { "epoch": 1.237488626023658, "grad_norm": 0.4454996360487464, "learning_rate": 8.56422020513429e-06, "loss": 0.0233, "step": 2720 }, { "epoch": 1.237943585077343, "grad_norm": 0.42231887554095254, "learning_rate": 8.56321766193692e-06, "loss": 0.0247, "step": 2721 }, { "epoch": 1.2383985441310281, "grad_norm": 0.49520892835841024, "learning_rate": 8.562214827569495e-06, "loss": 0.0198, "step": 2722 }, { "epoch": 1.2388535031847134, "grad_norm": 0.3119762559086726, "learning_rate": 8.56121170211396e-06, "loss": 0.0183, "step": 2723 }, { "epoch": 1.2393084622383985, "grad_norm": 0.48127588980662994, "learning_rate": 8.560208285652287e-06, "loss": 0.0348, "step": 2724 }, { "epoch": 1.2397634212920838, "grad_norm": 0.975980592939099, "learning_rate": 8.559204578266471e-06, "loss": 0.0712, "step": 2725 }, { "epoch": 1.240218380345769, "grad_norm": 0.4739910877413602, "learning_rate": 8.55820058003853e-06, "loss": 0.027, "step": 2726 }, { "epoch": 1.240673339399454, "grad_norm": 0.5358172750361924, "learning_rate": 8.557196291050506e-06, "loss": 0.0403, "step": 2727 }, { "epoch": 1.2411282984531393, "grad_norm": 0.49464890318884047, "learning_rate": 8.556191711384466e-06, "loss": 0.0336, "step": 2728 }, { "epoch": 1.2415832575068244, "grad_norm": 0.4046597291390638, "learning_rate": 8.555186841122498e-06, "loss": 0.024, "step": 2729 }, { "epoch": 1.2420382165605095, "grad_norm": 0.656706108193742, "learning_rate": 8.554181680346717e-06, "loss": 0.0348, "step": 2730 }, { "epoch": 1.2424931756141948, "grad_norm": 0.49134341156698247, "learning_rate": 8.553176229139262e-06, "loss": 0.033, "step": 2731 }, { "epoch": 1.24294813466788, "grad_norm": 0.3673616941332998, "learning_rate": 8.552170487582287e-06, "loss": 0.0233, "step": 2732 }, { "epoch": 1.243403093721565, "grad_norm": 0.3845834813421107, "learning_rate": 8.551164455757985e-06, "loss": 0.021, "step": 2733 }, { "epoch": 1.2438580527752503, "grad_norm": 0.4219248857316413, "learning_rate": 8.550158133748559e-06, "loss": 0.0232, "step": 2734 }, { "epoch": 1.2443130118289354, "grad_norm": 0.5359384657995739, "learning_rate": 8.549151521636244e-06, "loss": 0.0426, "step": 2735 }, { "epoch": 1.2447679708826205, "grad_norm": 0.6147117803498731, "learning_rate": 8.548144619503291e-06, "loss": 0.0372, "step": 2736 }, { "epoch": 1.2452229299363058, "grad_norm": 0.7816013628144164, "learning_rate": 8.547137427431986e-06, "loss": 0.0509, "step": 2737 }, { "epoch": 1.2456778889899909, "grad_norm": 0.5732293106945054, "learning_rate": 8.546129945504629e-06, "loss": 0.0404, "step": 2738 }, { "epoch": 1.246132848043676, "grad_norm": 0.5878496377747829, "learning_rate": 8.545122173803547e-06, "loss": 0.0349, "step": 2739 }, { "epoch": 1.2465878070973613, "grad_norm": 0.5178543900697522, "learning_rate": 8.544114112411088e-06, "loss": 0.0317, "step": 2740 }, { "epoch": 1.2470427661510464, "grad_norm": 0.44475184485600816, "learning_rate": 8.54310576140963e-06, "loss": 0.0246, "step": 2741 }, { "epoch": 1.2474977252047315, "grad_norm": 0.41811991583751146, "learning_rate": 8.542097120881572e-06, "loss": 0.0264, "step": 2742 }, { "epoch": 1.2479526842584168, "grad_norm": 0.504603909447871, "learning_rate": 8.541088190909333e-06, "loss": 0.037, "step": 2743 }, { "epoch": 1.2484076433121019, "grad_norm": 0.5546565546187008, "learning_rate": 8.540078971575355e-06, "loss": 0.0321, "step": 2744 }, { "epoch": 1.2488626023657872, "grad_norm": 0.5988533107048205, "learning_rate": 8.539069462962115e-06, "loss": 0.0356, "step": 2745 }, { "epoch": 1.2493175614194723, "grad_norm": 0.5355497681868633, "learning_rate": 8.538059665152097e-06, "loss": 0.0219, "step": 2746 }, { "epoch": 1.2497725204731573, "grad_norm": 0.5560216189929246, "learning_rate": 8.537049578227823e-06, "loss": 0.0318, "step": 2747 }, { "epoch": 1.2502274795268427, "grad_norm": 0.41791265535852423, "learning_rate": 8.536039202271828e-06, "loss": 0.0296, "step": 2748 }, { "epoch": 1.2506824385805277, "grad_norm": 0.6230283621476296, "learning_rate": 8.53502853736668e-06, "loss": 0.0229, "step": 2749 }, { "epoch": 1.251137397634213, "grad_norm": 0.5883015192363978, "learning_rate": 8.534017583594965e-06, "loss": 0.0454, "step": 2750 }, { "epoch": 1.2515923566878981, "grad_norm": 0.5657093936113446, "learning_rate": 8.53300634103929e-06, "loss": 0.0328, "step": 2751 }, { "epoch": 1.2520473157415832, "grad_norm": 0.9286848475357391, "learning_rate": 8.531994809782294e-06, "loss": 0.0651, "step": 2752 }, { "epoch": 1.2525022747952685, "grad_norm": 0.5306254596544426, "learning_rate": 8.530982989906632e-06, "loss": 0.0264, "step": 2753 }, { "epoch": 1.2529572338489536, "grad_norm": 0.599793814100533, "learning_rate": 8.529970881494985e-06, "loss": 0.038, "step": 2754 }, { "epoch": 1.2534121929026387, "grad_norm": 0.4592924108716034, "learning_rate": 8.52895848463006e-06, "loss": 0.0253, "step": 2755 }, { "epoch": 1.253867151956324, "grad_norm": 0.5025180855718538, "learning_rate": 8.527945799394584e-06, "loss": 0.0269, "step": 2756 }, { "epoch": 1.2543221110100091, "grad_norm": 0.3690223518853051, "learning_rate": 8.526932825871308e-06, "loss": 0.0214, "step": 2757 }, { "epoch": 1.2547770700636942, "grad_norm": 0.38161446652737785, "learning_rate": 8.52591956414301e-06, "loss": 0.0166, "step": 2758 }, { "epoch": 1.2552320291173795, "grad_norm": 0.611622699149414, "learning_rate": 8.524906014292488e-06, "loss": 0.0412, "step": 2759 }, { "epoch": 1.2556869881710646, "grad_norm": 0.4022077081421061, "learning_rate": 8.523892176402565e-06, "loss": 0.0234, "step": 2760 }, { "epoch": 1.2561419472247497, "grad_norm": 0.4085009912666225, "learning_rate": 8.522878050556087e-06, "loss": 0.0271, "step": 2761 }, { "epoch": 1.256596906278435, "grad_norm": 0.591494783456256, "learning_rate": 8.521863636835924e-06, "loss": 0.0288, "step": 2762 }, { "epoch": 1.25705186533212, "grad_norm": 0.4315940956441906, "learning_rate": 8.520848935324968e-06, "loss": 0.0257, "step": 2763 }, { "epoch": 1.2575068243858052, "grad_norm": 0.4623767141710468, "learning_rate": 8.519833946106139e-06, "loss": 0.0293, "step": 2764 }, { "epoch": 1.2579617834394905, "grad_norm": 0.5965051882391731, "learning_rate": 8.518818669262373e-06, "loss": 0.0367, "step": 2765 }, { "epoch": 1.2584167424931756, "grad_norm": 0.5441954958905808, "learning_rate": 8.517803104876638e-06, "loss": 0.0314, "step": 2766 }, { "epoch": 1.2588717015468607, "grad_norm": 0.5077782576820083, "learning_rate": 8.51678725303192e-06, "loss": 0.0261, "step": 2767 }, { "epoch": 1.259326660600546, "grad_norm": 0.6376855259836618, "learning_rate": 8.515771113811226e-06, "loss": 0.0409, "step": 2768 }, { "epoch": 1.259781619654231, "grad_norm": 6.915760462322178, "learning_rate": 8.514754687297598e-06, "loss": 0.1986, "step": 2769 }, { "epoch": 1.2602365787079162, "grad_norm": 0.5889806379105973, "learning_rate": 8.513737973574088e-06, "loss": 0.0336, "step": 2770 }, { "epoch": 1.2606915377616015, "grad_norm": 0.5275667357404193, "learning_rate": 8.512720972723779e-06, "loss": 0.0289, "step": 2771 }, { "epoch": 1.2611464968152866, "grad_norm": 0.3147286633021264, "learning_rate": 8.511703684829773e-06, "loss": 0.0163, "step": 2772 }, { "epoch": 1.2616014558689717, "grad_norm": 0.8013976464224811, "learning_rate": 8.510686109975202e-06, "loss": 0.0468, "step": 2773 }, { "epoch": 1.262056414922657, "grad_norm": 0.4994061588441834, "learning_rate": 8.509668248243217e-06, "loss": 0.02, "step": 2774 }, { "epoch": 1.262511373976342, "grad_norm": 0.5749302842677763, "learning_rate": 8.508650099716991e-06, "loss": 0.0362, "step": 2775 }, { "epoch": 1.2629663330300274, "grad_norm": 0.40312955247530624, "learning_rate": 8.507631664479725e-06, "loss": 0.0229, "step": 2776 }, { "epoch": 1.2634212920837125, "grad_norm": 0.4196449929909409, "learning_rate": 8.506612942614639e-06, "loss": 0.0195, "step": 2777 }, { "epoch": 1.2638762511373978, "grad_norm": 0.4933487936946509, "learning_rate": 8.505593934204978e-06, "loss": 0.0325, "step": 2778 }, { "epoch": 1.2643312101910829, "grad_norm": 0.5074169183177683, "learning_rate": 8.504574639334013e-06, "loss": 0.0302, "step": 2779 }, { "epoch": 1.264786169244768, "grad_norm": 0.5508963628460979, "learning_rate": 8.503555058085035e-06, "loss": 0.0215, "step": 2780 }, { "epoch": 1.2652411282984533, "grad_norm": 0.462497042914889, "learning_rate": 8.502535190541362e-06, "loss": 0.0249, "step": 2781 }, { "epoch": 1.2656960873521383, "grad_norm": 0.8409491409991778, "learning_rate": 8.501515036786327e-06, "loss": 0.0464, "step": 2782 }, { "epoch": 1.2661510464058234, "grad_norm": 0.5198784420945559, "learning_rate": 8.500494596903298e-06, "loss": 0.0337, "step": 2783 }, { "epoch": 1.2666060054595087, "grad_norm": 0.45309440991920896, "learning_rate": 8.499473870975657e-06, "loss": 0.0253, "step": 2784 }, { "epoch": 1.2670609645131938, "grad_norm": 0.49216033758494676, "learning_rate": 8.498452859086816e-06, "loss": 0.0287, "step": 2785 }, { "epoch": 1.267515923566879, "grad_norm": 0.6649300772430892, "learning_rate": 8.497431561320204e-06, "loss": 0.0493, "step": 2786 }, { "epoch": 1.2679708826205642, "grad_norm": 0.6238882043590698, "learning_rate": 8.496409977759281e-06, "loss": 0.0358, "step": 2787 }, { "epoch": 1.2684258416742493, "grad_norm": 0.3865373802086272, "learning_rate": 8.495388108487525e-06, "loss": 0.0197, "step": 2788 }, { "epoch": 1.2688808007279344, "grad_norm": 0.4201281559699742, "learning_rate": 8.494365953588435e-06, "loss": 0.0222, "step": 2789 }, { "epoch": 1.2693357597816197, "grad_norm": 0.6045328676429113, "learning_rate": 8.493343513145543e-06, "loss": 0.0402, "step": 2790 }, { "epoch": 1.2697907188353048, "grad_norm": 0.5549299619854191, "learning_rate": 8.492320787242394e-06, "loss": 0.0323, "step": 2791 }, { "epoch": 1.27024567788899, "grad_norm": 0.4475795587111154, "learning_rate": 8.491297775962561e-06, "loss": 0.0235, "step": 2792 }, { "epoch": 1.2707006369426752, "grad_norm": 0.7036026073161589, "learning_rate": 8.49027447938964e-06, "loss": 0.0433, "step": 2793 }, { "epoch": 1.2711555959963603, "grad_norm": 0.6587509536956878, "learning_rate": 8.48925089760725e-06, "loss": 0.0381, "step": 2794 }, { "epoch": 1.2716105550500454, "grad_norm": 0.5314238167851097, "learning_rate": 8.488227030699034e-06, "loss": 0.0347, "step": 2795 }, { "epoch": 1.2720655141037307, "grad_norm": 0.40385254147988475, "learning_rate": 8.487202878748659e-06, "loss": 0.0153, "step": 2796 }, { "epoch": 1.2725204731574158, "grad_norm": 0.4187276708435959, "learning_rate": 8.486178441839812e-06, "loss": 0.028, "step": 2797 }, { "epoch": 1.2729754322111009, "grad_norm": 0.5320879394729393, "learning_rate": 8.485153720056206e-06, "loss": 0.0282, "step": 2798 }, { "epoch": 1.2734303912647862, "grad_norm": 0.5626707275839432, "learning_rate": 8.484128713481578e-06, "loss": 0.0265, "step": 2799 }, { "epoch": 1.2738853503184713, "grad_norm": 0.49217175534274415, "learning_rate": 8.483103422199683e-06, "loss": 0.024, "step": 2800 }, { "epoch": 1.2743403093721566, "grad_norm": 1.7488430942036035, "learning_rate": 8.48207784629431e-06, "loss": 0.0627, "step": 2801 }, { "epoch": 1.2747952684258417, "grad_norm": 0.5648200964805751, "learning_rate": 8.481051985849259e-06, "loss": 0.0276, "step": 2802 }, { "epoch": 1.2752502274795268, "grad_norm": 0.5556231021278181, "learning_rate": 8.480025840948357e-06, "loss": 0.034, "step": 2803 }, { "epoch": 1.275705186533212, "grad_norm": 0.6476049581350152, "learning_rate": 8.478999411675461e-06, "loss": 0.0375, "step": 2804 }, { "epoch": 1.2761601455868972, "grad_norm": 0.5124346469861034, "learning_rate": 8.477972698114446e-06, "loss": 0.0272, "step": 2805 }, { "epoch": 1.2766151046405825, "grad_norm": 0.6717078615438046, "learning_rate": 8.476945700349206e-06, "loss": 0.0375, "step": 2806 }, { "epoch": 1.2770700636942676, "grad_norm": 0.6030848555640039, "learning_rate": 8.475918418463665e-06, "loss": 0.0358, "step": 2807 }, { "epoch": 1.2775250227479527, "grad_norm": 0.7825586786731197, "learning_rate": 8.474890852541768e-06, "loss": 0.0522, "step": 2808 }, { "epoch": 1.277979981801638, "grad_norm": 0.6153861535254155, "learning_rate": 8.473863002667484e-06, "loss": 0.0445, "step": 2809 }, { "epoch": 1.278434940855323, "grad_norm": 0.5282220213793622, "learning_rate": 8.472834868924803e-06, "loss": 0.0481, "step": 2810 }, { "epoch": 1.2788898999090081, "grad_norm": 0.5421116083773543, "learning_rate": 8.47180645139774e-06, "loss": 0.0344, "step": 2811 }, { "epoch": 1.2793448589626935, "grad_norm": 0.6791388111543428, "learning_rate": 8.470777750170331e-06, "loss": 0.0435, "step": 2812 }, { "epoch": 1.2797998180163785, "grad_norm": 0.5207371822148394, "learning_rate": 8.469748765326639e-06, "loss": 0.0322, "step": 2813 }, { "epoch": 1.2802547770700636, "grad_norm": 0.45374031536978576, "learning_rate": 8.468719496950748e-06, "loss": 0.0202, "step": 2814 }, { "epoch": 1.280709736123749, "grad_norm": 0.6220210270574482, "learning_rate": 8.467689945126764e-06, "loss": 0.0389, "step": 2815 }, { "epoch": 1.281164695177434, "grad_norm": 0.44694663225442227, "learning_rate": 8.466660109938817e-06, "loss": 0.0277, "step": 2816 }, { "epoch": 1.2816196542311191, "grad_norm": 0.45764799454231664, "learning_rate": 8.46562999147106e-06, "loss": 0.0214, "step": 2817 }, { "epoch": 1.2820746132848044, "grad_norm": 0.46665783231694, "learning_rate": 8.464599589807673e-06, "loss": 0.0328, "step": 2818 }, { "epoch": 1.2825295723384895, "grad_norm": 0.5791262823156262, "learning_rate": 8.463568905032853e-06, "loss": 0.0315, "step": 2819 }, { "epoch": 1.2829845313921746, "grad_norm": 0.5688402724299256, "learning_rate": 8.462537937230823e-06, "loss": 0.0341, "step": 2820 }, { "epoch": 1.28343949044586, "grad_norm": 0.509413677148141, "learning_rate": 8.46150668648583e-06, "loss": 0.0269, "step": 2821 }, { "epoch": 1.283894449499545, "grad_norm": 0.4653446182519699, "learning_rate": 8.460475152882142e-06, "loss": 0.0283, "step": 2822 }, { "epoch": 1.28434940855323, "grad_norm": 0.3972245449089319, "learning_rate": 8.459443336504052e-06, "loss": 0.025, "step": 2823 }, { "epoch": 1.2848043676069154, "grad_norm": 0.7353018545774426, "learning_rate": 8.458411237435875e-06, "loss": 0.0404, "step": 2824 }, { "epoch": 1.2852593266606005, "grad_norm": 0.6368251484118869, "learning_rate": 8.45737885576195e-06, "loss": 0.0304, "step": 2825 }, { "epoch": 1.2857142857142856, "grad_norm": 0.41997329370972175, "learning_rate": 8.456346191566638e-06, "loss": 0.0198, "step": 2826 }, { "epoch": 1.286169244767971, "grad_norm": 0.6490280661806181, "learning_rate": 8.455313244934324e-06, "loss": 0.0421, "step": 2827 }, { "epoch": 1.286624203821656, "grad_norm": 0.5889007709220472, "learning_rate": 8.454280015949417e-06, "loss": 0.0284, "step": 2828 }, { "epoch": 1.2870791628753413, "grad_norm": 0.5451280417854998, "learning_rate": 8.453246504696345e-06, "loss": 0.0226, "step": 2829 }, { "epoch": 1.2875341219290264, "grad_norm": 0.4320016505130762, "learning_rate": 8.452212711259562e-06, "loss": 0.0248, "step": 2830 }, { "epoch": 1.2879890809827115, "grad_norm": 0.639460311213598, "learning_rate": 8.45117863572355e-06, "loss": 0.0358, "step": 2831 }, { "epoch": 1.2884440400363968, "grad_norm": 0.4572192713269325, "learning_rate": 8.450144278172802e-06, "loss": 0.0219, "step": 2832 }, { "epoch": 1.2888989990900819, "grad_norm": 0.6056405312996962, "learning_rate": 8.449109638691846e-06, "loss": 0.0404, "step": 2833 }, { "epoch": 1.2893539581437672, "grad_norm": 0.6311739331027462, "learning_rate": 8.448074717365227e-06, "loss": 0.0462, "step": 2834 }, { "epoch": 1.2898089171974523, "grad_norm": 0.3815545778311228, "learning_rate": 8.447039514277511e-06, "loss": 0.0218, "step": 2835 }, { "epoch": 1.2902638762511374, "grad_norm": 0.3978060829542109, "learning_rate": 8.446004029513294e-06, "loss": 0.0178, "step": 2836 }, { "epoch": 1.2907188353048227, "grad_norm": 0.5275681469990177, "learning_rate": 8.44496826315719e-06, "loss": 0.0328, "step": 2837 }, { "epoch": 1.2911737943585078, "grad_norm": 0.39047958443652486, "learning_rate": 8.443932215293837e-06, "loss": 0.0185, "step": 2838 }, { "epoch": 1.2916287534121929, "grad_norm": 0.3956886222652548, "learning_rate": 8.442895886007894e-06, "loss": 0.0198, "step": 2839 }, { "epoch": 1.2920837124658782, "grad_norm": 0.46209991899892167, "learning_rate": 8.441859275384051e-06, "loss": 0.0312, "step": 2840 }, { "epoch": 1.2925386715195633, "grad_norm": 0.46877419670667825, "learning_rate": 8.440822383507009e-06, "loss": 0.0291, "step": 2841 }, { "epoch": 1.2929936305732483, "grad_norm": 0.45005958214248154, "learning_rate": 8.4397852104615e-06, "loss": 0.0237, "step": 2842 }, { "epoch": 1.2934485896269337, "grad_norm": 0.45818206860330574, "learning_rate": 8.438747756332278e-06, "loss": 0.031, "step": 2843 }, { "epoch": 1.2939035486806187, "grad_norm": 0.6372108995312181, "learning_rate": 8.43771002120412e-06, "loss": 0.043, "step": 2844 }, { "epoch": 1.2943585077343038, "grad_norm": 0.7913133192237457, "learning_rate": 8.43667200516182e-06, "loss": 0.0322, "step": 2845 }, { "epoch": 1.2948134667879891, "grad_norm": 0.7449168452669471, "learning_rate": 8.435633708290205e-06, "loss": 0.0668, "step": 2846 }, { "epoch": 1.2952684258416742, "grad_norm": 0.5449759524419574, "learning_rate": 8.434595130674121e-06, "loss": 0.0287, "step": 2847 }, { "epoch": 1.2957233848953593, "grad_norm": 0.6854051716023769, "learning_rate": 8.433556272398431e-06, "loss": 0.04, "step": 2848 }, { "epoch": 1.2961783439490446, "grad_norm": 0.5167363871048377, "learning_rate": 8.43251713354803e-06, "loss": 0.0306, "step": 2849 }, { "epoch": 1.2966333030027297, "grad_norm": 0.5049406724574115, "learning_rate": 8.43147771420783e-06, "loss": 0.03, "step": 2850 }, { "epoch": 1.2970882620564148, "grad_norm": 0.47114278348228805, "learning_rate": 8.430438014462764e-06, "loss": 0.0295, "step": 2851 }, { "epoch": 1.2975432211101001, "grad_norm": 0.7186330165104124, "learning_rate": 8.429398034397798e-06, "loss": 0.0387, "step": 2852 }, { "epoch": 1.2979981801637852, "grad_norm": 0.599939271911953, "learning_rate": 8.428357774097913e-06, "loss": 0.0311, "step": 2853 }, { "epoch": 1.2984531392174703, "grad_norm": 0.5994585274722956, "learning_rate": 8.42731723364811e-06, "loss": 0.0391, "step": 2854 }, { "epoch": 1.2989080982711556, "grad_norm": 0.5657660047246607, "learning_rate": 8.426276413133422e-06, "loss": 0.0301, "step": 2855 }, { "epoch": 1.2993630573248407, "grad_norm": 0.4190238792112018, "learning_rate": 8.4252353126389e-06, "loss": 0.0245, "step": 2856 }, { "epoch": 1.299818016378526, "grad_norm": 0.6420578682403528, "learning_rate": 8.424193932249614e-06, "loss": 0.0377, "step": 2857 }, { "epoch": 1.300272975432211, "grad_norm": 0.5700032729383738, "learning_rate": 8.423152272050665e-06, "loss": 0.0338, "step": 2858 }, { "epoch": 1.3007279344858962, "grad_norm": 0.5125782380711011, "learning_rate": 8.42211033212717e-06, "loss": 0.0279, "step": 2859 }, { "epoch": 1.3011828935395815, "grad_norm": 0.3613001497946594, "learning_rate": 8.421068112564272e-06, "loss": 0.0142, "step": 2860 }, { "epoch": 1.3016378525932666, "grad_norm": 0.49734554941531095, "learning_rate": 8.42002561344714e-06, "loss": 0.027, "step": 2861 }, { "epoch": 1.302092811646952, "grad_norm": 0.5045918830965134, "learning_rate": 8.418982834860958e-06, "loss": 0.0371, "step": 2862 }, { "epoch": 1.302547770700637, "grad_norm": 0.7937593563379391, "learning_rate": 8.417939776890938e-06, "loss": 0.0477, "step": 2863 }, { "epoch": 1.303002729754322, "grad_norm": 0.770292717700032, "learning_rate": 8.416896439622315e-06, "loss": 0.0364, "step": 2864 }, { "epoch": 1.3034576888080074, "grad_norm": 0.40217667555726205, "learning_rate": 8.415852823140344e-06, "loss": 0.0215, "step": 2865 }, { "epoch": 1.3039126478616925, "grad_norm": 0.39410556190775164, "learning_rate": 8.41480892753031e-06, "loss": 0.0258, "step": 2866 }, { "epoch": 1.3043676069153776, "grad_norm": 0.5772374070482569, "learning_rate": 8.413764752877509e-06, "loss": 0.0383, "step": 2867 }, { "epoch": 1.3048225659690629, "grad_norm": 0.528432161004818, "learning_rate": 8.41272029926727e-06, "loss": 0.0309, "step": 2868 }, { "epoch": 1.305277525022748, "grad_norm": 0.5477254790589614, "learning_rate": 8.411675566784939e-06, "loss": 0.0306, "step": 2869 }, { "epoch": 1.305732484076433, "grad_norm": 0.5287075963479533, "learning_rate": 8.410630555515887e-06, "loss": 0.0379, "step": 2870 }, { "epoch": 1.3061874431301184, "grad_norm": 0.6485432136738517, "learning_rate": 8.409585265545509e-06, "loss": 0.0507, "step": 2871 }, { "epoch": 1.3066424021838035, "grad_norm": 0.4441608183133053, "learning_rate": 8.408539696959222e-06, "loss": 0.0181, "step": 2872 }, { "epoch": 1.3070973612374885, "grad_norm": 0.581370080812639, "learning_rate": 8.407493849842462e-06, "loss": 0.0298, "step": 2873 }, { "epoch": 1.3075523202911739, "grad_norm": 0.41095630327527305, "learning_rate": 8.406447724280694e-06, "loss": 0.0172, "step": 2874 }, { "epoch": 1.308007279344859, "grad_norm": 0.5932824581914778, "learning_rate": 8.4054013203594e-06, "loss": 0.0396, "step": 2875 }, { "epoch": 1.308462238398544, "grad_norm": 0.5059022446307072, "learning_rate": 8.40435463816409e-06, "loss": 0.0295, "step": 2876 }, { "epoch": 1.3089171974522293, "grad_norm": 0.6107245381532688, "learning_rate": 8.403307677780291e-06, "loss": 0.0456, "step": 2877 }, { "epoch": 1.3093721565059144, "grad_norm": 0.4990493825414314, "learning_rate": 8.40226043929356e-06, "loss": 0.0251, "step": 2878 }, { "epoch": 1.3098271155595995, "grad_norm": 0.4714729010419501, "learning_rate": 8.40121292278947e-06, "loss": 0.0162, "step": 2879 }, { "epoch": 1.3102820746132848, "grad_norm": 0.8705842073062648, "learning_rate": 8.400165128353619e-06, "loss": 0.0628, "step": 2880 }, { "epoch": 1.31073703366697, "grad_norm": 0.5559037822076499, "learning_rate": 8.399117056071628e-06, "loss": 0.0301, "step": 2881 }, { "epoch": 1.311191992720655, "grad_norm": 0.7209850941894482, "learning_rate": 8.398068706029144e-06, "loss": 0.0562, "step": 2882 }, { "epoch": 1.3116469517743403, "grad_norm": 0.6600248319736058, "learning_rate": 8.397020078311829e-06, "loss": 0.0479, "step": 2883 }, { "epoch": 1.3121019108280254, "grad_norm": 0.5745977024793432, "learning_rate": 8.395971173005373e-06, "loss": 0.0307, "step": 2884 }, { "epoch": 1.3125568698817107, "grad_norm": 0.7612725338466626, "learning_rate": 8.39492199019549e-06, "loss": 0.045, "step": 2885 }, { "epoch": 1.3130118289353958, "grad_norm": 0.5825437442447712, "learning_rate": 8.393872529967913e-06, "loss": 0.0373, "step": 2886 }, { "epoch": 1.3134667879890811, "grad_norm": 0.4805516575623738, "learning_rate": 8.3928227924084e-06, "loss": 0.0257, "step": 2887 }, { "epoch": 1.3139217470427662, "grad_norm": 0.5620132619283021, "learning_rate": 8.391772777602729e-06, "loss": 0.0392, "step": 2888 }, { "epoch": 1.3143767060964513, "grad_norm": 0.4909960245970496, "learning_rate": 8.390722485636707e-06, "loss": 0.0329, "step": 2889 }, { "epoch": 1.3148316651501366, "grad_norm": 0.7269269853225212, "learning_rate": 8.389671916596152e-06, "loss": 0.052, "step": 2890 }, { "epoch": 1.3152866242038217, "grad_norm": 0.5201558195230915, "learning_rate": 8.388621070566918e-06, "loss": 0.0248, "step": 2891 }, { "epoch": 1.3157415832575068, "grad_norm": 0.394106763274554, "learning_rate": 8.387569947634872e-06, "loss": 0.018, "step": 2892 }, { "epoch": 1.316196542311192, "grad_norm": 0.6404544864402316, "learning_rate": 8.386518547885907e-06, "loss": 0.0476, "step": 2893 }, { "epoch": 1.3166515013648772, "grad_norm": 0.5191807643177463, "learning_rate": 8.385466871405942e-06, "loss": 0.0275, "step": 2894 }, { "epoch": 1.3171064604185623, "grad_norm": 0.5938811996011489, "learning_rate": 8.384414918280912e-06, "loss": 0.0381, "step": 2895 }, { "epoch": 1.3175614194722476, "grad_norm": 0.5756131532663223, "learning_rate": 8.383362688596779e-06, "loss": 0.0338, "step": 2896 }, { "epoch": 1.3180163785259327, "grad_norm": 0.5811779212201078, "learning_rate": 8.382310182439526e-06, "loss": 0.0376, "step": 2897 }, { "epoch": 1.3184713375796178, "grad_norm": 0.448051940064308, "learning_rate": 8.381257399895157e-06, "loss": 0.0311, "step": 2898 }, { "epoch": 1.318926296633303, "grad_norm": 0.5275685839417847, "learning_rate": 8.380204341049706e-06, "loss": 0.0301, "step": 2899 }, { "epoch": 1.3193812556869882, "grad_norm": 0.5862324399838035, "learning_rate": 8.37915100598922e-06, "loss": 0.0326, "step": 2900 }, { "epoch": 1.3198362147406733, "grad_norm": 0.4657130433511499, "learning_rate": 8.378097394799774e-06, "loss": 0.0292, "step": 2901 }, { "epoch": 1.3202911737943586, "grad_norm": 0.43527864616078554, "learning_rate": 8.377043507567464e-06, "loss": 0.0198, "step": 2902 }, { "epoch": 1.3207461328480437, "grad_norm": 0.5682424315090409, "learning_rate": 8.37598934437841e-06, "loss": 0.0337, "step": 2903 }, { "epoch": 1.3212010919017287, "grad_norm": 0.5138915407015388, "learning_rate": 8.374934905318753e-06, "loss": 0.0361, "step": 2904 }, { "epoch": 1.321656050955414, "grad_norm": 0.6053557583572503, "learning_rate": 8.373880190474653e-06, "loss": 0.0357, "step": 2905 }, { "epoch": 1.3221110100090991, "grad_norm": 0.39758173287328924, "learning_rate": 8.372825199932304e-06, "loss": 0.0291, "step": 2906 }, { "epoch": 1.3225659690627842, "grad_norm": 0.3641881015036813, "learning_rate": 8.371769933777908e-06, "loss": 0.0171, "step": 2907 }, { "epoch": 1.3230209281164695, "grad_norm": 0.3905844015965015, "learning_rate": 8.370714392097703e-06, "loss": 0.0185, "step": 2908 }, { "epoch": 1.3234758871701546, "grad_norm": 0.7229548107308235, "learning_rate": 8.369658574977939e-06, "loss": 0.0308, "step": 2909 }, { "epoch": 1.3239308462238397, "grad_norm": 0.44397421085587824, "learning_rate": 8.368602482504894e-06, "loss": 0.0257, "step": 2910 }, { "epoch": 1.324385805277525, "grad_norm": 0.3765050238419547, "learning_rate": 8.367546114764863e-06, "loss": 0.0188, "step": 2911 }, { "epoch": 1.3248407643312101, "grad_norm": 0.6236537427570221, "learning_rate": 8.366489471844174e-06, "loss": 0.0341, "step": 2912 }, { "epoch": 1.3252957233848954, "grad_norm": 0.5416006396662083, "learning_rate": 8.36543255382917e-06, "loss": 0.0275, "step": 2913 }, { "epoch": 1.3257506824385805, "grad_norm": 0.4937949308281839, "learning_rate": 8.364375360806214e-06, "loss": 0.022, "step": 2914 }, { "epoch": 1.3262056414922658, "grad_norm": 0.537820582130104, "learning_rate": 8.363317892861695e-06, "loss": 0.0319, "step": 2915 }, { "epoch": 1.326660600545951, "grad_norm": 0.3658940967114081, "learning_rate": 8.36226015008203e-06, "loss": 0.0163, "step": 2916 }, { "epoch": 1.327115559599636, "grad_norm": 0.42988876957689603, "learning_rate": 8.361202132553647e-06, "loss": 0.0248, "step": 2917 }, { "epoch": 1.3275705186533213, "grad_norm": 0.4763202386564734, "learning_rate": 8.360143840363006e-06, "loss": 0.0267, "step": 2918 }, { "epoch": 1.3280254777070064, "grad_norm": 0.5304487014834793, "learning_rate": 8.359085273596583e-06, "loss": 0.0232, "step": 2919 }, { "epoch": 1.3284804367606915, "grad_norm": 0.5132885954286545, "learning_rate": 8.358026432340883e-06, "loss": 0.0296, "step": 2920 }, { "epoch": 1.3289353958143768, "grad_norm": 0.5441398032146411, "learning_rate": 8.356967316682427e-06, "loss": 0.025, "step": 2921 }, { "epoch": 1.329390354868062, "grad_norm": 0.8637700710210932, "learning_rate": 8.35590792670776e-06, "loss": 0.0546, "step": 2922 }, { "epoch": 1.329845313921747, "grad_norm": 0.6961731263031892, "learning_rate": 8.354848262503455e-06, "loss": 0.0455, "step": 2923 }, { "epoch": 1.3303002729754323, "grad_norm": 0.4361786339244665, "learning_rate": 8.3537883241561e-06, "loss": 0.0247, "step": 2924 }, { "epoch": 1.3307552320291174, "grad_norm": 0.43356790518587945, "learning_rate": 8.352728111752308e-06, "loss": 0.0211, "step": 2925 }, { "epoch": 1.3312101910828025, "grad_norm": 0.6497845011963704, "learning_rate": 8.351667625378714e-06, "loss": 0.0339, "step": 2926 }, { "epoch": 1.3316651501364878, "grad_norm": 0.42248975914582426, "learning_rate": 8.35060686512198e-06, "loss": 0.0241, "step": 2927 }, { "epoch": 1.3321201091901729, "grad_norm": 1.0254154838868987, "learning_rate": 8.349545831068783e-06, "loss": 0.0527, "step": 2928 }, { "epoch": 1.332575068243858, "grad_norm": 0.5465891301505484, "learning_rate": 8.348484523305828e-06, "loss": 0.0321, "step": 2929 }, { "epoch": 1.3330300272975433, "grad_norm": 0.4438432419153223, "learning_rate": 8.347422941919839e-06, "loss": 0.0246, "step": 2930 }, { "epoch": 1.3334849863512284, "grad_norm": 0.612887112386914, "learning_rate": 8.346361086997563e-06, "loss": 0.0378, "step": 2931 }, { "epoch": 1.3339399454049135, "grad_norm": 0.5224132598314238, "learning_rate": 8.345298958625773e-06, "loss": 0.0238, "step": 2932 }, { "epoch": 1.3343949044585988, "grad_norm": 0.4483766382516411, "learning_rate": 8.344236556891258e-06, "loss": 0.0229, "step": 2933 }, { "epoch": 1.3348498635122839, "grad_norm": 0.6468172061802943, "learning_rate": 8.343173881880834e-06, "loss": 0.0478, "step": 2934 }, { "epoch": 1.335304822565969, "grad_norm": 0.5929266648467246, "learning_rate": 8.342110933681338e-06, "loss": 0.0327, "step": 2935 }, { "epoch": 1.3357597816196543, "grad_norm": 0.9253130324099915, "learning_rate": 8.341047712379629e-06, "loss": 0.0561, "step": 2936 }, { "epoch": 1.3362147406733393, "grad_norm": 0.42363571916667725, "learning_rate": 8.33998421806259e-06, "loss": 0.0201, "step": 2937 }, { "epoch": 1.3366696997270244, "grad_norm": 0.46068482279133083, "learning_rate": 8.338920450817124e-06, "loss": 0.0328, "step": 2938 }, { "epoch": 1.3371246587807097, "grad_norm": 0.5809098017908542, "learning_rate": 8.337856410730157e-06, "loss": 0.0325, "step": 2939 }, { "epoch": 1.3375796178343948, "grad_norm": 0.4339828167803655, "learning_rate": 8.336792097888636e-06, "loss": 0.025, "step": 2940 }, { "epoch": 1.3380345768880801, "grad_norm": 0.39427353503978124, "learning_rate": 8.335727512379535e-06, "loss": 0.0235, "step": 2941 }, { "epoch": 1.3384895359417652, "grad_norm": 0.46862803888718435, "learning_rate": 8.334662654289847e-06, "loss": 0.0262, "step": 2942 }, { "epoch": 1.3389444949954505, "grad_norm": 0.5771037970727136, "learning_rate": 8.333597523706583e-06, "loss": 0.0364, "step": 2943 }, { "epoch": 1.3393994540491356, "grad_norm": 0.5439670424125458, "learning_rate": 8.332532120716787e-06, "loss": 0.0317, "step": 2944 }, { "epoch": 1.3398544131028207, "grad_norm": 0.5794567514020939, "learning_rate": 8.331466445407513e-06, "loss": 0.0352, "step": 2945 }, { "epoch": 1.340309372156506, "grad_norm": 0.6921033406520039, "learning_rate": 8.330400497865847e-06, "loss": 0.0372, "step": 2946 }, { "epoch": 1.3407643312101911, "grad_norm": 0.547284959859392, "learning_rate": 8.329334278178893e-06, "loss": 0.0407, "step": 2947 }, { "epoch": 1.3412192902638762, "grad_norm": 0.3600178778635221, "learning_rate": 8.328267786433777e-06, "loss": 0.0207, "step": 2948 }, { "epoch": 1.3416742493175615, "grad_norm": 0.49898975870202994, "learning_rate": 8.327201022717645e-06, "loss": 0.0364, "step": 2949 }, { "epoch": 1.3421292083712466, "grad_norm": 0.4818424249267445, "learning_rate": 8.326133987117674e-06, "loss": 0.0461, "step": 2950 }, { "epoch": 1.3425841674249317, "grad_norm": 0.5346060862589315, "learning_rate": 8.325066679721053e-06, "loss": 0.0322, "step": 2951 }, { "epoch": 1.343039126478617, "grad_norm": 0.4768637215194256, "learning_rate": 8.323999100615e-06, "loss": 0.0228, "step": 2952 }, { "epoch": 1.343494085532302, "grad_norm": 0.6121311741443156, "learning_rate": 8.32293124988675e-06, "loss": 0.0389, "step": 2953 }, { "epoch": 1.3439490445859872, "grad_norm": 0.41050069959015945, "learning_rate": 8.321863127623565e-06, "loss": 0.0186, "step": 2954 }, { "epoch": 1.3444040036396725, "grad_norm": 0.51587593937067, "learning_rate": 8.320794733912727e-06, "loss": 0.033, "step": 2955 }, { "epoch": 1.3448589626933576, "grad_norm": 0.4429693043138898, "learning_rate": 8.319726068841541e-06, "loss": 0.0218, "step": 2956 }, { "epoch": 1.3453139217470427, "grad_norm": 0.4419902925455928, "learning_rate": 8.31865713249733e-06, "loss": 0.026, "step": 2957 }, { "epoch": 1.345768880800728, "grad_norm": 1.2645808402043965, "learning_rate": 8.317587924967445e-06, "loss": 0.0557, "step": 2958 }, { "epoch": 1.346223839854413, "grad_norm": 0.5397408570521678, "learning_rate": 8.31651844633926e-06, "loss": 0.0278, "step": 2959 }, { "epoch": 1.3466787989080982, "grad_norm": 0.5231183718719437, "learning_rate": 8.31544869670016e-06, "loss": 0.028, "step": 2960 }, { "epoch": 1.3471337579617835, "grad_norm": 0.6229462228070504, "learning_rate": 8.31437867613757e-06, "loss": 0.038, "step": 2961 }, { "epoch": 1.3475887170154686, "grad_norm": 0.7194000029115405, "learning_rate": 8.313308384738918e-06, "loss": 0.0443, "step": 2962 }, { "epoch": 1.3480436760691537, "grad_norm": 0.3970249499512815, "learning_rate": 8.31223782259167e-06, "loss": 0.0148, "step": 2963 }, { "epoch": 1.348498635122839, "grad_norm": 0.5285088103191983, "learning_rate": 8.311166989783303e-06, "loss": 0.0364, "step": 2964 }, { "epoch": 1.348953594176524, "grad_norm": 0.39954065377503273, "learning_rate": 8.310095886401326e-06, "loss": 0.0231, "step": 2965 }, { "epoch": 1.3494085532302094, "grad_norm": 0.5024343953698133, "learning_rate": 8.309024512533258e-06, "loss": 0.0241, "step": 2966 }, { "epoch": 1.3498635122838945, "grad_norm": 0.4085029836051291, "learning_rate": 8.307952868266653e-06, "loss": 0.0189, "step": 2967 }, { "epoch": 1.3503184713375795, "grad_norm": 0.5144685273285099, "learning_rate": 8.306880953689078e-06, "loss": 0.0267, "step": 2968 }, { "epoch": 1.3507734303912649, "grad_norm": 0.6709812500236393, "learning_rate": 8.305808768888123e-06, "loss": 0.0476, "step": 2969 }, { "epoch": 1.35122838944495, "grad_norm": 0.7208231702150235, "learning_rate": 8.304736313951407e-06, "loss": 0.0504, "step": 2970 }, { "epoch": 1.3516833484986353, "grad_norm": 0.4863482746478878, "learning_rate": 8.303663588966562e-06, "loss": 0.0343, "step": 2971 }, { "epoch": 1.3521383075523203, "grad_norm": 0.3674961944159868, "learning_rate": 8.302590594021246e-06, "loss": 0.0211, "step": 2972 }, { "epoch": 1.3525932666060054, "grad_norm": 0.7230295041278698, "learning_rate": 8.301517329203144e-06, "loss": 0.054, "step": 2973 }, { "epoch": 1.3530482256596907, "grad_norm": 0.45096016710067965, "learning_rate": 8.300443794599953e-06, "loss": 0.0347, "step": 2974 }, { "epoch": 1.3535031847133758, "grad_norm": 0.5073916445193536, "learning_rate": 8.299369990299401e-06, "loss": 0.0247, "step": 2975 }, { "epoch": 1.353958143767061, "grad_norm": 0.48751506850799814, "learning_rate": 8.298295916389234e-06, "loss": 0.0253, "step": 2976 }, { "epoch": 1.3544131028207462, "grad_norm": 0.4580906342189067, "learning_rate": 8.297221572957219e-06, "loss": 0.0285, "step": 2977 }, { "epoch": 1.3548680618744313, "grad_norm": 0.5613436435308797, "learning_rate": 8.296146960091147e-06, "loss": 0.0293, "step": 2978 }, { "epoch": 1.3553230209281164, "grad_norm": 0.40557778558942736, "learning_rate": 8.295072077878831e-06, "loss": 0.026, "step": 2979 }, { "epoch": 1.3557779799818017, "grad_norm": 0.5936885393390249, "learning_rate": 8.293996926408106e-06, "loss": 0.0277, "step": 2980 }, { "epoch": 1.3562329390354868, "grad_norm": 0.6190172575588174, "learning_rate": 8.292921505766826e-06, "loss": 0.0439, "step": 2981 }, { "epoch": 1.356687898089172, "grad_norm": 0.5580565035724842, "learning_rate": 8.291845816042872e-06, "loss": 0.0293, "step": 2982 }, { "epoch": 1.3571428571428572, "grad_norm": 0.5141691931429101, "learning_rate": 8.290769857324144e-06, "loss": 0.0293, "step": 2983 }, { "epoch": 1.3575978161965423, "grad_norm": 0.4512712109432825, "learning_rate": 8.289693629698564e-06, "loss": 0.0246, "step": 2984 }, { "epoch": 1.3580527752502274, "grad_norm": 0.6040999257078576, "learning_rate": 8.288617133254075e-06, "loss": 0.0372, "step": 2985 }, { "epoch": 1.3585077343039127, "grad_norm": 0.6915500026797864, "learning_rate": 8.287540368078648e-06, "loss": 0.0425, "step": 2986 }, { "epoch": 1.3589626933575978, "grad_norm": 0.5125738426632732, "learning_rate": 8.286463334260268e-06, "loss": 0.0359, "step": 2987 }, { "epoch": 1.3594176524112829, "grad_norm": 0.556319160536806, "learning_rate": 8.285386031886944e-06, "loss": 0.0241, "step": 2988 }, { "epoch": 1.3598726114649682, "grad_norm": 0.6102320846565004, "learning_rate": 8.284308461046713e-06, "loss": 0.0333, "step": 2989 }, { "epoch": 1.3603275705186533, "grad_norm": 0.5361302245507936, "learning_rate": 8.283230621827625e-06, "loss": 0.0278, "step": 2990 }, { "epoch": 1.3607825295723384, "grad_norm": 0.4699596538592812, "learning_rate": 8.282152514317756e-06, "loss": 0.0265, "step": 2991 }, { "epoch": 1.3612374886260237, "grad_norm": 0.6895404803749348, "learning_rate": 8.281074138605207e-06, "loss": 0.0394, "step": 2992 }, { "epoch": 1.3616924476797088, "grad_norm": 0.4695263884849979, "learning_rate": 8.279995494778097e-06, "loss": 0.0206, "step": 2993 }, { "epoch": 1.362147406733394, "grad_norm": 0.429106119981996, "learning_rate": 8.278916582924566e-06, "loss": 0.0262, "step": 2994 }, { "epoch": 1.3626023657870792, "grad_norm": 0.676343482596927, "learning_rate": 8.27783740313278e-06, "loss": 0.0552, "step": 2995 }, { "epoch": 1.3630573248407643, "grad_norm": 0.3406816842285639, "learning_rate": 8.276757955490924e-06, "loss": 0.0194, "step": 2996 }, { "epoch": 1.3635122838944496, "grad_norm": 0.5611480329253273, "learning_rate": 8.275678240087206e-06, "loss": 0.0329, "step": 2997 }, { "epoch": 1.3639672429481347, "grad_norm": 0.479385138705802, "learning_rate": 8.274598257009856e-06, "loss": 0.0322, "step": 2998 }, { "epoch": 1.36442220200182, "grad_norm": 0.45227339102007225, "learning_rate": 8.273518006347122e-06, "loss": 0.0203, "step": 2999 }, { "epoch": 1.364877161055505, "grad_norm": 0.44241532318072835, "learning_rate": 8.272437488187282e-06, "loss": 0.0227, "step": 3000 }, { "epoch": 1.3653321201091901, "grad_norm": 0.5114034038920364, "learning_rate": 8.271356702618627e-06, "loss": 0.0403, "step": 3001 }, { "epoch": 1.3657870791628755, "grad_norm": 0.6162969431059591, "learning_rate": 8.270275649729476e-06, "loss": 0.0325, "step": 3002 }, { "epoch": 1.3662420382165605, "grad_norm": 0.527092132342831, "learning_rate": 8.269194329608168e-06, "loss": 0.0284, "step": 3003 }, { "epoch": 1.3666969972702456, "grad_norm": 0.46778673954946487, "learning_rate": 8.268112742343062e-06, "loss": 0.0245, "step": 3004 }, { "epoch": 1.367151956323931, "grad_norm": 0.4147528449331875, "learning_rate": 8.267030888022543e-06, "loss": 0.0139, "step": 3005 }, { "epoch": 1.367606915377616, "grad_norm": 0.5241363451016124, "learning_rate": 8.26594876673501e-06, "loss": 0.0247, "step": 3006 }, { "epoch": 1.3680618744313011, "grad_norm": 0.5201590911331215, "learning_rate": 8.264866378568897e-06, "loss": 0.0177, "step": 3007 }, { "epoch": 1.3685168334849864, "grad_norm": 0.5378167093969959, "learning_rate": 8.263783723612644e-06, "loss": 0.0261, "step": 3008 }, { "epoch": 1.3689717925386715, "grad_norm": 0.3726813097721879, "learning_rate": 8.262700801954726e-06, "loss": 0.02, "step": 3009 }, { "epoch": 1.3694267515923566, "grad_norm": 0.6126857162538056, "learning_rate": 8.261617613683633e-06, "loss": 0.0433, "step": 3010 }, { "epoch": 1.369881710646042, "grad_norm": 1.0211897698494863, "learning_rate": 8.260534158887878e-06, "loss": 0.0696, "step": 3011 }, { "epoch": 1.370336669699727, "grad_norm": 0.4760258275057717, "learning_rate": 8.259450437655994e-06, "loss": 0.0349, "step": 3012 }, { "epoch": 1.370791628753412, "grad_norm": 0.5989088817313054, "learning_rate": 8.258366450076541e-06, "loss": 0.0491, "step": 3013 }, { "epoch": 1.3712465878070974, "grad_norm": 0.6467128053552115, "learning_rate": 8.257282196238097e-06, "loss": 0.0367, "step": 3014 }, { "epoch": 1.3717015468607825, "grad_norm": 0.6142108685785274, "learning_rate": 8.256197676229262e-06, "loss": 0.04, "step": 3015 }, { "epoch": 1.3721565059144676, "grad_norm": 0.5514322815123416, "learning_rate": 8.255112890138657e-06, "loss": 0.0268, "step": 3016 }, { "epoch": 1.372611464968153, "grad_norm": 0.45640389193979836, "learning_rate": 8.254027838054925e-06, "loss": 0.0273, "step": 3017 }, { "epoch": 1.373066424021838, "grad_norm": 0.39439780413094305, "learning_rate": 8.252942520066735e-06, "loss": 0.0177, "step": 3018 }, { "epoch": 1.373521383075523, "grad_norm": 0.4504055956288238, "learning_rate": 8.251856936262774e-06, "loss": 0.028, "step": 3019 }, { "epoch": 1.3739763421292084, "grad_norm": 0.48489134818246765, "learning_rate": 8.250771086731745e-06, "loss": 0.0313, "step": 3020 }, { "epoch": 1.3744313011828935, "grad_norm": 0.36762870096085365, "learning_rate": 8.249684971562387e-06, "loss": 0.0134, "step": 3021 }, { "epoch": 1.3748862602365788, "grad_norm": 0.5182427734749195, "learning_rate": 8.248598590843447e-06, "loss": 0.0273, "step": 3022 }, { "epoch": 1.3753412192902639, "grad_norm": 0.3485416699729316, "learning_rate": 8.247511944663701e-06, "loss": 0.0193, "step": 3023 }, { "epoch": 1.3757961783439492, "grad_norm": 0.5920956861150252, "learning_rate": 8.246425033111944e-06, "loss": 0.0445, "step": 3024 }, { "epoch": 1.3762511373976343, "grad_norm": 0.5525125917448483, "learning_rate": 8.245337856276996e-06, "loss": 0.0357, "step": 3025 }, { "epoch": 1.3767060964513194, "grad_norm": 0.629783233447734, "learning_rate": 8.244250414247692e-06, "loss": 0.0413, "step": 3026 }, { "epoch": 1.3771610555050047, "grad_norm": 0.35243668108000564, "learning_rate": 8.243162707112895e-06, "loss": 0.0163, "step": 3027 }, { "epoch": 1.3776160145586898, "grad_norm": 0.32017923480563804, "learning_rate": 8.242074734961489e-06, "loss": 0.0217, "step": 3028 }, { "epoch": 1.3780709736123748, "grad_norm": 0.5302331839428659, "learning_rate": 8.240986497882376e-06, "loss": 0.0246, "step": 3029 }, { "epoch": 1.3785259326660602, "grad_norm": 0.513223225130444, "learning_rate": 8.239897995964483e-06, "loss": 0.0245, "step": 3030 }, { "epoch": 1.3789808917197452, "grad_norm": 0.5941014343712512, "learning_rate": 8.238809229296756e-06, "loss": 0.0327, "step": 3031 }, { "epoch": 1.3794358507734303, "grad_norm": 0.8390821773783546, "learning_rate": 8.237720197968167e-06, "loss": 0.0722, "step": 3032 }, { "epoch": 1.3798908098271156, "grad_norm": 0.49003643455117596, "learning_rate": 8.236630902067702e-06, "loss": 0.0242, "step": 3033 }, { "epoch": 1.3803457688808007, "grad_norm": 0.6184622682847121, "learning_rate": 8.235541341684378e-06, "loss": 0.0396, "step": 3034 }, { "epoch": 1.3808007279344858, "grad_norm": 0.4436355063234884, "learning_rate": 8.234451516907228e-06, "loss": 0.0197, "step": 3035 }, { "epoch": 1.3812556869881711, "grad_norm": 0.45830037526579676, "learning_rate": 8.233361427825305e-06, "loss": 0.0257, "step": 3036 }, { "epoch": 1.3817106460418562, "grad_norm": 0.6393980461408286, "learning_rate": 8.232271074527688e-06, "loss": 0.048, "step": 3037 }, { "epoch": 1.3821656050955413, "grad_norm": 0.40754186598011133, "learning_rate": 8.231180457103477e-06, "loss": 0.0212, "step": 3038 }, { "epoch": 1.3826205641492266, "grad_norm": 0.5370935736948618, "learning_rate": 8.23008957564179e-06, "loss": 0.0309, "step": 3039 }, { "epoch": 1.3830755232029117, "grad_norm": 0.5067459376653409, "learning_rate": 8.22899843023177e-06, "loss": 0.0372, "step": 3040 }, { "epoch": 1.3835304822565968, "grad_norm": 0.4526706797906497, "learning_rate": 8.227907020962578e-06, "loss": 0.0253, "step": 3041 }, { "epoch": 1.3839854413102821, "grad_norm": 0.536389038504338, "learning_rate": 8.226815347923404e-06, "loss": 0.0298, "step": 3042 }, { "epoch": 1.3844404003639672, "grad_norm": 0.4802523068929896, "learning_rate": 8.225723411203452e-06, "loss": 0.0275, "step": 3043 }, { "epoch": 1.3848953594176523, "grad_norm": 0.5706003382968008, "learning_rate": 8.22463121089195e-06, "loss": 0.0378, "step": 3044 }, { "epoch": 1.3853503184713376, "grad_norm": 0.4968696815862661, "learning_rate": 8.223538747078146e-06, "loss": 0.0337, "step": 3045 }, { "epoch": 1.3858052775250227, "grad_norm": 0.5978285446033724, "learning_rate": 8.222446019851315e-06, "loss": 0.0297, "step": 3046 }, { "epoch": 1.3862602365787078, "grad_norm": 0.7562718290242261, "learning_rate": 8.221353029300747e-06, "loss": 0.046, "step": 3047 }, { "epoch": 1.386715195632393, "grad_norm": 0.40943266637554676, "learning_rate": 8.220259775515756e-06, "loss": 0.0208, "step": 3048 }, { "epoch": 1.3871701546860782, "grad_norm": 0.49567626101539297, "learning_rate": 8.21916625858568e-06, "loss": 0.0308, "step": 3049 }, { "epoch": 1.3876251137397635, "grad_norm": 0.5102658380968296, "learning_rate": 8.218072478599875e-06, "loss": 0.027, "step": 3050 }, { "epoch": 1.3880800727934486, "grad_norm": 0.5640260630506516, "learning_rate": 8.216978435647718e-06, "loss": 0.052, "step": 3051 }, { "epoch": 1.388535031847134, "grad_norm": 0.47339074035185624, "learning_rate": 8.215884129818612e-06, "loss": 0.0216, "step": 3052 }, { "epoch": 1.388989990900819, "grad_norm": 0.45861030037606987, "learning_rate": 8.214789561201979e-06, "loss": 0.0228, "step": 3053 }, { "epoch": 1.389444949954504, "grad_norm": 0.5617691155764737, "learning_rate": 8.21369472988726e-06, "loss": 0.0376, "step": 3054 }, { "epoch": 1.3898999090081894, "grad_norm": 0.7593333756738143, "learning_rate": 8.21259963596392e-06, "loss": 0.0443, "step": 3055 }, { "epoch": 1.3903548680618745, "grad_norm": 0.5508343748423888, "learning_rate": 8.211504279521445e-06, "loss": 0.0348, "step": 3056 }, { "epoch": 1.3908098271155596, "grad_norm": 0.4297088508988446, "learning_rate": 8.210408660649346e-06, "loss": 0.0236, "step": 3057 }, { "epoch": 1.3912647861692449, "grad_norm": 0.4652687964406028, "learning_rate": 8.209312779437147e-06, "loss": 0.0195, "step": 3058 }, { "epoch": 1.39171974522293, "grad_norm": 0.5833898595618995, "learning_rate": 8.208216635974401e-06, "loss": 0.0409, "step": 3059 }, { "epoch": 1.392174704276615, "grad_norm": 0.59822842016674, "learning_rate": 8.207120230350682e-06, "loss": 0.0496, "step": 3060 }, { "epoch": 1.3926296633303004, "grad_norm": 0.2680216564923973, "learning_rate": 8.206023562655578e-06, "loss": 0.0192, "step": 3061 }, { "epoch": 1.3930846223839854, "grad_norm": 0.4548172021576498, "learning_rate": 8.204926632978708e-06, "loss": 0.0263, "step": 3062 }, { "epoch": 1.3935395814376705, "grad_norm": 0.5200897444884447, "learning_rate": 8.203829441409708e-06, "loss": 0.0357, "step": 3063 }, { "epoch": 1.3939945404913558, "grad_norm": 0.5810832537093169, "learning_rate": 8.202731988038232e-06, "loss": 0.0362, "step": 3064 }, { "epoch": 1.394449499545041, "grad_norm": 0.6998441041100203, "learning_rate": 8.201634272953963e-06, "loss": 0.0482, "step": 3065 }, { "epoch": 1.394904458598726, "grad_norm": 0.5366640695091471, "learning_rate": 8.2005362962466e-06, "loss": 0.0242, "step": 3066 }, { "epoch": 1.3953594176524113, "grad_norm": 0.43762864600569806, "learning_rate": 8.199438058005864e-06, "loss": 0.0275, "step": 3067 }, { "epoch": 1.3958143767060964, "grad_norm": 0.7172645055229739, "learning_rate": 8.198339558321497e-06, "loss": 0.044, "step": 3068 }, { "epoch": 1.3962693357597815, "grad_norm": 0.4922329511547334, "learning_rate": 8.197240797283266e-06, "loss": 0.0238, "step": 3069 }, { "epoch": 1.3967242948134668, "grad_norm": 0.43305804975583, "learning_rate": 8.196141774980957e-06, "loss": 0.0253, "step": 3070 }, { "epoch": 1.397179253867152, "grad_norm": 0.4830253862903282, "learning_rate": 8.195042491504373e-06, "loss": 0.027, "step": 3071 }, { "epoch": 1.397634212920837, "grad_norm": 0.5708344998642687, "learning_rate": 8.193942946943348e-06, "loss": 0.0363, "step": 3072 }, { "epoch": 1.3980891719745223, "grad_norm": 0.5596831343818472, "learning_rate": 8.192843141387727e-06, "loss": 0.0334, "step": 3073 }, { "epoch": 1.3985441310282074, "grad_norm": 0.7790965200858694, "learning_rate": 8.191743074927385e-06, "loss": 0.0402, "step": 3074 }, { "epoch": 1.3989990900818925, "grad_norm": 0.3861231357505535, "learning_rate": 8.19064274765221e-06, "loss": 0.0266, "step": 3075 }, { "epoch": 1.3994540491355778, "grad_norm": 0.45166903229483174, "learning_rate": 8.189542159652122e-06, "loss": 0.0211, "step": 3076 }, { "epoch": 1.399909008189263, "grad_norm": 0.6031804236142754, "learning_rate": 8.18844131101705e-06, "loss": 0.035, "step": 3077 }, { "epoch": 1.4003639672429482, "grad_norm": 1.3846733915903273, "learning_rate": 8.187340201836955e-06, "loss": 0.037, "step": 3078 }, { "epoch": 1.4008189262966333, "grad_norm": 0.4992604754422713, "learning_rate": 8.186238832201809e-06, "loss": 0.0382, "step": 3079 }, { "epoch": 1.4012738853503186, "grad_norm": 0.5098672644565495, "learning_rate": 8.185137202201618e-06, "loss": 0.0314, "step": 3080 }, { "epoch": 1.4017288444040037, "grad_norm": 0.4607415273832947, "learning_rate": 8.184035311926397e-06, "loss": 0.0232, "step": 3081 }, { "epoch": 1.4021838034576888, "grad_norm": 0.5536418146465356, "learning_rate": 8.18293316146619e-06, "loss": 0.0415, "step": 3082 }, { "epoch": 1.402638762511374, "grad_norm": 0.5646125069514234, "learning_rate": 8.18183075091106e-06, "loss": 0.0274, "step": 3083 }, { "epoch": 1.4030937215650592, "grad_norm": 0.4731283804868446, "learning_rate": 8.18072808035109e-06, "loss": 0.0213, "step": 3084 }, { "epoch": 1.4035486806187443, "grad_norm": 0.6175450171343488, "learning_rate": 8.179625149876384e-06, "loss": 0.0425, "step": 3085 }, { "epoch": 1.4040036396724296, "grad_norm": 0.7990078666071156, "learning_rate": 8.178521959577069e-06, "loss": 0.0504, "step": 3086 }, { "epoch": 1.4044585987261147, "grad_norm": 0.4939494179351219, "learning_rate": 8.177418509543296e-06, "loss": 0.0348, "step": 3087 }, { "epoch": 1.4049135577797998, "grad_norm": 0.5234647003423882, "learning_rate": 8.17631479986523e-06, "loss": 0.0242, "step": 3088 }, { "epoch": 1.405368516833485, "grad_norm": 0.4543086003689767, "learning_rate": 8.175210830633063e-06, "loss": 0.0365, "step": 3089 }, { "epoch": 1.4058234758871702, "grad_norm": 0.4270056688908246, "learning_rate": 8.174106601937005e-06, "loss": 0.0197, "step": 3090 }, { "epoch": 1.4062784349408552, "grad_norm": 0.3545709323637256, "learning_rate": 8.173002113867291e-06, "loss": 0.0211, "step": 3091 }, { "epoch": 1.4067333939945406, "grad_norm": 0.6194564321624038, "learning_rate": 8.171897366514174e-06, "loss": 0.0386, "step": 3092 }, { "epoch": 1.4071883530482256, "grad_norm": 10.733523014095153, "learning_rate": 8.170792359967926e-06, "loss": 0.0351, "step": 3093 }, { "epoch": 1.4076433121019107, "grad_norm": 2.954613313740392, "learning_rate": 8.169687094318848e-06, "loss": 0.0484, "step": 3094 }, { "epoch": 1.408098271155596, "grad_norm": 0.47504988483410193, "learning_rate": 8.168581569657253e-06, "loss": 0.0293, "step": 3095 }, { "epoch": 1.4085532302092811, "grad_norm": 0.3293476754938369, "learning_rate": 8.167475786073483e-06, "loss": 0.0153, "step": 3096 }, { "epoch": 1.4090081892629662, "grad_norm": 0.4467649777544998, "learning_rate": 8.166369743657894e-06, "loss": 0.0246, "step": 3097 }, { "epoch": 1.4094631483166515, "grad_norm": 0.4910843124231285, "learning_rate": 8.165263442500869e-06, "loss": 0.0307, "step": 3098 }, { "epoch": 1.4099181073703366, "grad_norm": 0.45810467957956863, "learning_rate": 8.164156882692811e-06, "loss": 0.0234, "step": 3099 }, { "epoch": 1.4103730664240217, "grad_norm": 0.5117781690908434, "learning_rate": 8.16305006432414e-06, "loss": 0.0262, "step": 3100 }, { "epoch": 1.410828025477707, "grad_norm": 0.5143420044566899, "learning_rate": 8.161942987485303e-06, "loss": 0.0241, "step": 3101 }, { "epoch": 1.4112829845313921, "grad_norm": 0.8775557486692402, "learning_rate": 8.160835652266765e-06, "loss": 0.036, "step": 3102 }, { "epoch": 1.4117379435850774, "grad_norm": 0.591279163847277, "learning_rate": 8.159728058759012e-06, "loss": 0.0552, "step": 3103 }, { "epoch": 1.4121929026387625, "grad_norm": 0.43025003609442686, "learning_rate": 8.15862020705255e-06, "loss": 0.0265, "step": 3104 }, { "epoch": 1.4126478616924476, "grad_norm": 0.5960364379763011, "learning_rate": 8.157512097237909e-06, "loss": 0.029, "step": 3105 }, { "epoch": 1.413102820746133, "grad_norm": 0.4699210144648856, "learning_rate": 8.15640372940564e-06, "loss": 0.0261, "step": 3106 }, { "epoch": 1.413557779799818, "grad_norm": 0.40195772164768756, "learning_rate": 8.15529510364631e-06, "loss": 0.0251, "step": 3107 }, { "epoch": 1.4140127388535033, "grad_norm": 0.38209950272348364, "learning_rate": 8.154186220050516e-06, "loss": 0.0342, "step": 3108 }, { "epoch": 1.4144676979071884, "grad_norm": 0.6918635638402106, "learning_rate": 8.153077078708867e-06, "loss": 0.0384, "step": 3109 }, { "epoch": 1.4149226569608735, "grad_norm": 0.564268707731014, "learning_rate": 8.151967679711997e-06, "loss": 0.04, "step": 3110 }, { "epoch": 1.4153776160145588, "grad_norm": 0.4717867339000607, "learning_rate": 8.150858023150563e-06, "loss": 0.0366, "step": 3111 }, { "epoch": 1.415832575068244, "grad_norm": 0.36126517477798215, "learning_rate": 8.14974810911524e-06, "loss": 0.0154, "step": 3112 }, { "epoch": 1.416287534121929, "grad_norm": 0.48852954945790295, "learning_rate": 8.148637937696728e-06, "loss": 0.0318, "step": 3113 }, { "epoch": 1.4167424931756143, "grad_norm": 0.698433091689678, "learning_rate": 8.147527508985742e-06, "loss": 0.0538, "step": 3114 }, { "epoch": 1.4171974522292994, "grad_norm": 0.5209634235366688, "learning_rate": 8.14641682307302e-06, "loss": 0.0317, "step": 3115 }, { "epoch": 1.4176524112829845, "grad_norm": 0.4461508909116904, "learning_rate": 8.145305880049328e-06, "loss": 0.0254, "step": 3116 }, { "epoch": 1.4181073703366698, "grad_norm": 0.4294589539133548, "learning_rate": 8.14419468000544e-06, "loss": 0.0252, "step": 3117 }, { "epoch": 1.4185623293903549, "grad_norm": 0.5390763643611722, "learning_rate": 8.143083223032164e-06, "loss": 0.0198, "step": 3118 }, { "epoch": 1.41901728844404, "grad_norm": 0.6769964702086475, "learning_rate": 8.141971509220321e-06, "loss": 0.0417, "step": 3119 }, { "epoch": 1.4194722474977253, "grad_norm": 0.5507224112797532, "learning_rate": 8.140859538660755e-06, "loss": 0.0341, "step": 3120 }, { "epoch": 1.4199272065514104, "grad_norm": 0.5263510786416742, "learning_rate": 8.139747311444331e-06, "loss": 0.0263, "step": 3121 }, { "epoch": 1.4203821656050954, "grad_norm": 0.44633836091071555, "learning_rate": 8.138634827661936e-06, "loss": 0.0206, "step": 3122 }, { "epoch": 1.4208371246587808, "grad_norm": 0.4409238528743987, "learning_rate": 8.137522087404474e-06, "loss": 0.0235, "step": 3123 }, { "epoch": 1.4212920837124658, "grad_norm": 0.53650644597874, "learning_rate": 8.13640909076288e-06, "loss": 0.0249, "step": 3124 }, { "epoch": 1.421747042766151, "grad_norm": 0.43913353713243797, "learning_rate": 8.135295837828097e-06, "loss": 0.022, "step": 3125 }, { "epoch": 1.4222020018198362, "grad_norm": 0.4411508266847209, "learning_rate": 8.134182328691098e-06, "loss": 0.021, "step": 3126 }, { "epoch": 1.4226569608735213, "grad_norm": 0.5580383577936809, "learning_rate": 8.133068563442873e-06, "loss": 0.0253, "step": 3127 }, { "epoch": 1.4231119199272064, "grad_norm": 0.6159942959143841, "learning_rate": 8.131954542174433e-06, "loss": 0.0369, "step": 3128 }, { "epoch": 1.4235668789808917, "grad_norm": 0.5518845413386557, "learning_rate": 8.130840264976812e-06, "loss": 0.0318, "step": 3129 }, { "epoch": 1.4240218380345768, "grad_norm": 0.5260026586781827, "learning_rate": 8.129725731941063e-06, "loss": 0.0276, "step": 3130 }, { "epoch": 1.4244767970882621, "grad_norm": 0.5177138558113535, "learning_rate": 8.128610943158262e-06, "loss": 0.0236, "step": 3131 }, { "epoch": 1.4249317561419472, "grad_norm": 0.552369709761377, "learning_rate": 8.127495898719502e-06, "loss": 0.0341, "step": 3132 }, { "epoch": 1.4253867151956323, "grad_norm": 0.5264921326858812, "learning_rate": 8.126380598715902e-06, "loss": 0.0342, "step": 3133 }, { "epoch": 1.4258416742493176, "grad_norm": 0.9269763934758377, "learning_rate": 8.125265043238597e-06, "loss": 0.0531, "step": 3134 }, { "epoch": 1.4262966333030027, "grad_norm": 0.9603254771666904, "learning_rate": 8.124149232378747e-06, "loss": 0.0417, "step": 3135 }, { "epoch": 1.426751592356688, "grad_norm": 0.6470486278536731, "learning_rate": 8.12303316622753e-06, "loss": 0.0405, "step": 3136 }, { "epoch": 1.4272065514103731, "grad_norm": 0.48201293664894446, "learning_rate": 8.121916844876145e-06, "loss": 0.0321, "step": 3137 }, { "epoch": 1.4276615104640582, "grad_norm": 0.5872475622012117, "learning_rate": 8.120800268415815e-06, "loss": 0.0326, "step": 3138 }, { "epoch": 1.4281164695177435, "grad_norm": 0.5576014328083249, "learning_rate": 8.11968343693778e-06, "loss": 0.0245, "step": 3139 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6302514250098855, "learning_rate": 8.118566350533304e-06, "loss": 0.0478, "step": 3140 }, { "epoch": 1.4290263876251137, "grad_norm": 0.6141352095304792, "learning_rate": 8.117449009293668e-06, "loss": 0.0341, "step": 3141 }, { "epoch": 1.429481346678799, "grad_norm": 0.5457299413412613, "learning_rate": 8.116331413310178e-06, "loss": 0.0282, "step": 3142 }, { "epoch": 1.429936305732484, "grad_norm": 0.5791172434046514, "learning_rate": 8.115213562674158e-06, "loss": 0.0345, "step": 3143 }, { "epoch": 1.4303912647861692, "grad_norm": 0.5478313762128336, "learning_rate": 8.114095457476954e-06, "loss": 0.0316, "step": 3144 }, { "epoch": 1.4308462238398545, "grad_norm": 0.9292226056717331, "learning_rate": 8.112977097809932e-06, "loss": 0.0642, "step": 3145 }, { "epoch": 1.4313011828935396, "grad_norm": 0.6165602818367223, "learning_rate": 8.111858483764478e-06, "loss": 0.0406, "step": 3146 }, { "epoch": 1.4317561419472247, "grad_norm": 0.5589066346288158, "learning_rate": 8.110739615432005e-06, "loss": 0.0309, "step": 3147 }, { "epoch": 1.43221110100091, "grad_norm": 0.5221994826214766, "learning_rate": 8.109620492903938e-06, "loss": 0.0426, "step": 3148 }, { "epoch": 1.432666060054595, "grad_norm": 0.6643254697112511, "learning_rate": 8.108501116271725e-06, "loss": 0.0449, "step": 3149 }, { "epoch": 1.4331210191082802, "grad_norm": 0.40499081748743093, "learning_rate": 8.10738148562684e-06, "loss": 0.0259, "step": 3150 }, { "epoch": 1.4335759781619655, "grad_norm": 0.5319408589709967, "learning_rate": 8.106261601060773e-06, "loss": 0.0235, "step": 3151 }, { "epoch": 1.4340309372156506, "grad_norm": 0.502676216768999, "learning_rate": 8.105141462665036e-06, "loss": 0.029, "step": 3152 }, { "epoch": 1.4344858962693356, "grad_norm": 0.4708835121259622, "learning_rate": 8.104021070531161e-06, "loss": 0.0291, "step": 3153 }, { "epoch": 1.434940855323021, "grad_norm": 0.6763569542098118, "learning_rate": 8.102900424750702e-06, "loss": 0.0435, "step": 3154 }, { "epoch": 1.435395814376706, "grad_norm": 0.599568518154828, "learning_rate": 8.101779525415232e-06, "loss": 0.0366, "step": 3155 }, { "epoch": 1.4358507734303911, "grad_norm": 0.4968849199855825, "learning_rate": 8.100658372616346e-06, "loss": 0.0324, "step": 3156 }, { "epoch": 1.4363057324840764, "grad_norm": 0.6290729255135113, "learning_rate": 8.099536966445661e-06, "loss": 0.0323, "step": 3157 }, { "epoch": 1.4367606915377615, "grad_norm": 0.6666747225975457, "learning_rate": 8.098415306994813e-06, "loss": 0.0443, "step": 3158 }, { "epoch": 1.4372156505914468, "grad_norm": 0.7245033011103253, "learning_rate": 8.097293394355459e-06, "loss": 0.0516, "step": 3159 }, { "epoch": 1.437670609645132, "grad_norm": 0.42971274750201555, "learning_rate": 8.096171228619276e-06, "loss": 0.0247, "step": 3160 }, { "epoch": 1.438125568698817, "grad_norm": 0.5833567897257396, "learning_rate": 8.095048809877961e-06, "loss": 0.0294, "step": 3161 }, { "epoch": 1.4385805277525023, "grad_norm": 0.6728067449458445, "learning_rate": 8.093926138223234e-06, "loss": 0.0495, "step": 3162 }, { "epoch": 1.4390354868061874, "grad_norm": 0.6121920243772246, "learning_rate": 8.092803213746838e-06, "loss": 0.0418, "step": 3163 }, { "epoch": 1.4394904458598727, "grad_norm": 0.42743590888197097, "learning_rate": 8.091680036540528e-06, "loss": 0.0279, "step": 3164 }, { "epoch": 1.4399454049135578, "grad_norm": 0.4126243156879265, "learning_rate": 8.090556606696088e-06, "loss": 0.0223, "step": 3165 }, { "epoch": 1.440400363967243, "grad_norm": 0.5217770481149632, "learning_rate": 8.089432924305319e-06, "loss": 0.0265, "step": 3166 }, { "epoch": 1.4408553230209282, "grad_norm": 0.3749847244592466, "learning_rate": 8.08830898946004e-06, "loss": 0.0246, "step": 3167 }, { "epoch": 1.4413102820746133, "grad_norm": 0.5165056627108671, "learning_rate": 8.087184802252102e-06, "loss": 0.0314, "step": 3168 }, { "epoch": 1.4417652411282984, "grad_norm": 0.523636258278207, "learning_rate": 8.086060362773362e-06, "loss": 0.0325, "step": 3169 }, { "epoch": 1.4422202001819837, "grad_norm": 0.6072272924301817, "learning_rate": 8.084935671115705e-06, "loss": 0.0219, "step": 3170 }, { "epoch": 1.4426751592356688, "grad_norm": 0.525973990821791, "learning_rate": 8.083810727371037e-06, "loss": 0.0318, "step": 3171 }, { "epoch": 1.443130118289354, "grad_norm": 0.4683061240213502, "learning_rate": 8.082685531631282e-06, "loss": 0.0319, "step": 3172 }, { "epoch": 1.4435850773430392, "grad_norm": 0.8031318133945788, "learning_rate": 8.081560083988387e-06, "loss": 0.0347, "step": 3173 }, { "epoch": 1.4440400363967243, "grad_norm": 0.5449225628318263, "learning_rate": 8.080434384534318e-06, "loss": 0.0396, "step": 3174 }, { "epoch": 1.4444949954504094, "grad_norm": 0.42918723160148686, "learning_rate": 8.07930843336106e-06, "loss": 0.0221, "step": 3175 }, { "epoch": 1.4449499545040947, "grad_norm": 0.6193328526079885, "learning_rate": 8.078182230560628e-06, "loss": 0.0367, "step": 3176 }, { "epoch": 1.4454049135577798, "grad_norm": 0.5347432504833302, "learning_rate": 8.077055776225041e-06, "loss": 0.0329, "step": 3177 }, { "epoch": 1.4458598726114649, "grad_norm": 0.46998578040146427, "learning_rate": 8.075929070446354e-06, "loss": 0.021, "step": 3178 }, { "epoch": 1.4463148316651502, "grad_norm": 0.4749547726682001, "learning_rate": 8.074802113316633e-06, "loss": 0.032, "step": 3179 }, { "epoch": 1.4467697907188353, "grad_norm": 0.4196630072378444, "learning_rate": 8.07367490492797e-06, "loss": 0.0181, "step": 3180 }, { "epoch": 1.4472247497725204, "grad_norm": 0.5737177118299412, "learning_rate": 8.072547445372471e-06, "loss": 0.0391, "step": 3181 }, { "epoch": 1.4476797088262057, "grad_norm": 0.6430451121317332, "learning_rate": 8.071419734742275e-06, "loss": 0.0395, "step": 3182 }, { "epoch": 1.4481346678798908, "grad_norm": 0.4956926457176075, "learning_rate": 8.070291773129526e-06, "loss": 0.0307, "step": 3183 }, { "epoch": 1.4485896269335758, "grad_norm": 0.6279572303428038, "learning_rate": 8.0691635606264e-06, "loss": 0.0347, "step": 3184 }, { "epoch": 1.4490445859872612, "grad_norm": 0.5041410724529021, "learning_rate": 8.068035097325087e-06, "loss": 0.0308, "step": 3185 }, { "epoch": 1.4494995450409462, "grad_norm": 0.598159207447712, "learning_rate": 8.066906383317801e-06, "loss": 0.0365, "step": 3186 }, { "epoch": 1.4499545040946316, "grad_norm": 0.7495960095780199, "learning_rate": 8.065777418696775e-06, "loss": 0.0397, "step": 3187 }, { "epoch": 1.4504094631483166, "grad_norm": 0.6239207793483897, "learning_rate": 8.064648203554264e-06, "loss": 0.0333, "step": 3188 }, { "epoch": 1.450864422202002, "grad_norm": 0.6099069881067322, "learning_rate": 8.06351873798254e-06, "loss": 0.0447, "step": 3189 }, { "epoch": 1.451319381255687, "grad_norm": 0.3679602053637268, "learning_rate": 8.062389022073901e-06, "loss": 0.0227, "step": 3190 }, { "epoch": 1.4517743403093721, "grad_norm": 0.5180394703575767, "learning_rate": 8.061259055920661e-06, "loss": 0.0265, "step": 3191 }, { "epoch": 1.4522292993630574, "grad_norm": 0.5464173471525077, "learning_rate": 8.060128839615155e-06, "loss": 0.0358, "step": 3192 }, { "epoch": 1.4526842584167425, "grad_norm": 0.5864419383217762, "learning_rate": 8.05899837324974e-06, "loss": 0.0329, "step": 3193 }, { "epoch": 1.4531392174704276, "grad_norm": 0.5545494776633784, "learning_rate": 8.057867656916793e-06, "loss": 0.0324, "step": 3194 }, { "epoch": 1.453594176524113, "grad_norm": 0.44836014586822326, "learning_rate": 8.05673669070871e-06, "loss": 0.0308, "step": 3195 }, { "epoch": 1.454049135577798, "grad_norm": 0.6284948960555835, "learning_rate": 8.055605474717908e-06, "loss": 0.0432, "step": 3196 }, { "epoch": 1.4545040946314831, "grad_norm": 0.3707138957956115, "learning_rate": 8.054474009036826e-06, "loss": 0.0145, "step": 3197 }, { "epoch": 1.4549590536851684, "grad_norm": 0.3239258824372849, "learning_rate": 8.05334229375792e-06, "loss": 0.0199, "step": 3198 }, { "epoch": 1.4554140127388535, "grad_norm": 0.5079580495736739, "learning_rate": 8.052210328973673e-06, "loss": 0.0318, "step": 3199 }, { "epoch": 1.4558689717925386, "grad_norm": 0.5104623998753726, "learning_rate": 8.051078114776581e-06, "loss": 0.0246, "step": 3200 }, { "epoch": 1.456323930846224, "grad_norm": 0.49461842129465483, "learning_rate": 8.049945651259163e-06, "loss": 0.0199, "step": 3201 }, { "epoch": 1.456778889899909, "grad_norm": 0.6851413999422369, "learning_rate": 8.048812938513958e-06, "loss": 0.0437, "step": 3202 }, { "epoch": 1.457233848953594, "grad_norm": 0.4560870320885985, "learning_rate": 8.047679976633532e-06, "loss": 0.0241, "step": 3203 }, { "epoch": 1.4576888080072794, "grad_norm": 0.5352288092695029, "learning_rate": 8.04654676571046e-06, "loss": 0.0426, "step": 3204 }, { "epoch": 1.4581437670609645, "grad_norm": 0.6643419689254415, "learning_rate": 8.045413305837344e-06, "loss": 0.033, "step": 3205 }, { "epoch": 1.4585987261146496, "grad_norm": 0.43853553587407185, "learning_rate": 8.044279597106807e-06, "loss": 0.0361, "step": 3206 }, { "epoch": 1.459053685168335, "grad_norm": 0.4961349795414473, "learning_rate": 8.043145639611488e-06, "loss": 0.03, "step": 3207 }, { "epoch": 1.45950864422202, "grad_norm": 0.4711729889421299, "learning_rate": 8.04201143344405e-06, "loss": 0.022, "step": 3208 }, { "epoch": 1.459963603275705, "grad_norm": 0.363199677939404, "learning_rate": 8.040876978697174e-06, "loss": 0.0236, "step": 3209 }, { "epoch": 1.4604185623293904, "grad_norm": 0.38604446366020234, "learning_rate": 8.039742275463566e-06, "loss": 0.0226, "step": 3210 }, { "epoch": 1.4608735213830755, "grad_norm": 0.45765271075247205, "learning_rate": 8.038607323835946e-06, "loss": 0.025, "step": 3211 }, { "epoch": 1.4613284804367606, "grad_norm": 0.4395685757788466, "learning_rate": 8.037472123907058e-06, "loss": 0.0206, "step": 3212 }, { "epoch": 1.4617834394904459, "grad_norm": 0.6738624693889801, "learning_rate": 8.036336675769665e-06, "loss": 0.042, "step": 3213 }, { "epoch": 1.462238398544131, "grad_norm": 0.5686179308511357, "learning_rate": 8.03520097951655e-06, "loss": 0.0381, "step": 3214 }, { "epoch": 1.4626933575978163, "grad_norm": 0.4915965693261835, "learning_rate": 8.034065035240519e-06, "loss": 0.0332, "step": 3215 }, { "epoch": 1.4631483166515014, "grad_norm": 0.4051092749550177, "learning_rate": 8.032928843034393e-06, "loss": 0.0197, "step": 3216 }, { "epoch": 1.4636032757051867, "grad_norm": 0.6425143148265581, "learning_rate": 8.031792402991022e-06, "loss": 0.0466, "step": 3217 }, { "epoch": 1.4640582347588718, "grad_norm": 0.5423117548064644, "learning_rate": 8.030655715203265e-06, "loss": 0.0309, "step": 3218 }, { "epoch": 1.4645131938125568, "grad_norm": 0.5826622186708784, "learning_rate": 8.029518779764007e-06, "loss": 0.0438, "step": 3219 }, { "epoch": 1.4649681528662422, "grad_norm": 0.5576538577521253, "learning_rate": 8.028381596766159e-06, "loss": 0.0373, "step": 3220 }, { "epoch": 1.4654231119199272, "grad_norm": 0.6507764614874492, "learning_rate": 8.027244166302641e-06, "loss": 0.0328, "step": 3221 }, { "epoch": 1.4658780709736123, "grad_norm": 0.46590511948446056, "learning_rate": 8.026106488466403e-06, "loss": 0.037, "step": 3222 }, { "epoch": 1.4663330300272976, "grad_norm": 0.3698047411401823, "learning_rate": 8.024968563350406e-06, "loss": 0.0159, "step": 3223 }, { "epoch": 1.4667879890809827, "grad_norm": 0.5475583937083369, "learning_rate": 8.02383039104764e-06, "loss": 0.0343, "step": 3224 }, { "epoch": 1.4672429481346678, "grad_norm": 0.967666097714969, "learning_rate": 8.02269197165111e-06, "loss": 0.0493, "step": 3225 }, { "epoch": 1.4676979071883531, "grad_norm": 0.530611866566285, "learning_rate": 8.021553305253841e-06, "loss": 0.0342, "step": 3226 }, { "epoch": 1.4681528662420382, "grad_norm": 0.36365102789295684, "learning_rate": 8.020414391948882e-06, "loss": 0.0197, "step": 3227 }, { "epoch": 1.4686078252957233, "grad_norm": 0.786177107653995, "learning_rate": 8.0192752318293e-06, "loss": 0.0462, "step": 3228 }, { "epoch": 1.4690627843494086, "grad_norm": 0.4832882680186476, "learning_rate": 8.01813582498818e-06, "loss": 0.0322, "step": 3229 }, { "epoch": 1.4695177434030937, "grad_norm": 0.5440743945002253, "learning_rate": 8.01699617151863e-06, "loss": 0.041, "step": 3230 }, { "epoch": 1.4699727024567788, "grad_norm": 0.45934929798428287, "learning_rate": 8.015856271513777e-06, "loss": 0.0244, "step": 3231 }, { "epoch": 1.4704276615104641, "grad_norm": 0.5331795218085696, "learning_rate": 8.014716125066771e-06, "loss": 0.0342, "step": 3232 }, { "epoch": 1.4708826205641492, "grad_norm": 0.6946319306233284, "learning_rate": 8.013575732270775e-06, "loss": 0.0543, "step": 3233 }, { "epoch": 1.4713375796178343, "grad_norm": 0.41286436665712734, "learning_rate": 8.012435093218982e-06, "loss": 0.0238, "step": 3234 }, { "epoch": 1.4717925386715196, "grad_norm": 0.5015834867051956, "learning_rate": 8.011294208004596e-06, "loss": 0.0303, "step": 3235 }, { "epoch": 1.4722474977252047, "grad_norm": 0.346012187808125, "learning_rate": 8.010153076720848e-06, "loss": 0.0269, "step": 3236 }, { "epoch": 1.4727024567788898, "grad_norm": 0.44295027382591184, "learning_rate": 8.00901169946098e-06, "loss": 0.034, "step": 3237 }, { "epoch": 1.473157415832575, "grad_norm": 0.514028689971606, "learning_rate": 8.007870076318268e-06, "loss": 0.0337, "step": 3238 }, { "epoch": 1.4736123748862602, "grad_norm": 0.614308759846028, "learning_rate": 8.006728207385996e-06, "loss": 0.0412, "step": 3239 }, { "epoch": 1.4740673339399453, "grad_norm": 0.30523726833683307, "learning_rate": 8.005586092757472e-06, "loss": 0.0162, "step": 3240 }, { "epoch": 1.4745222929936306, "grad_norm": 0.638750228351473, "learning_rate": 8.004443732526026e-06, "loss": 0.0364, "step": 3241 }, { "epoch": 1.4749772520473157, "grad_norm": 0.6238842384536734, "learning_rate": 8.003301126785007e-06, "loss": 0.039, "step": 3242 }, { "epoch": 1.475432211101001, "grad_norm": 0.4763894780054722, "learning_rate": 8.002158275627783e-06, "loss": 0.0303, "step": 3243 }, { "epoch": 1.475887170154686, "grad_norm": 0.5018596408732007, "learning_rate": 8.00101517914774e-06, "loss": 0.0288, "step": 3244 }, { "epoch": 1.4763421292083714, "grad_norm": 0.7564418877944006, "learning_rate": 7.999871837438292e-06, "loss": 0.0561, "step": 3245 }, { "epoch": 1.4767970882620565, "grad_norm": 0.49218467944499245, "learning_rate": 7.998728250592865e-06, "loss": 0.0259, "step": 3246 }, { "epoch": 1.4772520473157416, "grad_norm": 0.5418004463760446, "learning_rate": 7.997584418704905e-06, "loss": 0.0325, "step": 3247 }, { "epoch": 1.4777070063694269, "grad_norm": 0.5379198820020985, "learning_rate": 7.996440341867884e-06, "loss": 0.0342, "step": 3248 }, { "epoch": 1.478161965423112, "grad_norm": 0.3534734720936311, "learning_rate": 7.99529602017529e-06, "loss": 0.0165, "step": 3249 }, { "epoch": 1.478616924476797, "grad_norm": 0.5380771099198766, "learning_rate": 7.994151453720632e-06, "loss": 0.0285, "step": 3250 }, { "epoch": 1.4790718835304824, "grad_norm": 0.5333004127188998, "learning_rate": 7.993006642597438e-06, "loss": 0.0331, "step": 3251 }, { "epoch": 1.4795268425841674, "grad_norm": 0.6692517171982048, "learning_rate": 7.991861586899258e-06, "loss": 0.0549, "step": 3252 }, { "epoch": 1.4799818016378525, "grad_norm": 0.6481381791067767, "learning_rate": 7.990716286719662e-06, "loss": 0.0299, "step": 3253 }, { "epoch": 1.4804367606915378, "grad_norm": 0.4516323080619454, "learning_rate": 7.989570742152235e-06, "loss": 0.0189, "step": 3254 }, { "epoch": 1.480891719745223, "grad_norm": 0.40638032731762097, "learning_rate": 7.988424953290588e-06, "loss": 0.0296, "step": 3255 }, { "epoch": 1.481346678798908, "grad_norm": 0.597331728437238, "learning_rate": 7.98727892022835e-06, "loss": 0.041, "step": 3256 }, { "epoch": 1.4818016378525933, "grad_norm": 0.4627299367009861, "learning_rate": 7.986132643059169e-06, "loss": 0.0274, "step": 3257 }, { "epoch": 1.4822565969062784, "grad_norm": 0.6031029641496045, "learning_rate": 7.984986121876714e-06, "loss": 0.0401, "step": 3258 }, { "epoch": 1.4827115559599635, "grad_norm": 0.547521201895844, "learning_rate": 7.983839356774671e-06, "loss": 0.0396, "step": 3259 }, { "epoch": 1.4831665150136488, "grad_norm": 0.866003940880254, "learning_rate": 7.982692347846755e-06, "loss": 0.0359, "step": 3260 }, { "epoch": 1.483621474067334, "grad_norm": 0.4726902171500864, "learning_rate": 7.981545095186684e-06, "loss": 0.0204, "step": 3261 }, { "epoch": 1.484076433121019, "grad_norm": 0.8705680413739901, "learning_rate": 7.980397598888217e-06, "loss": 0.0633, "step": 3262 }, { "epoch": 1.4845313921747043, "grad_norm": 0.570357211847994, "learning_rate": 7.979249859045117e-06, "loss": 0.0315, "step": 3263 }, { "epoch": 1.4849863512283894, "grad_norm": 0.40815032752344726, "learning_rate": 7.978101875751173e-06, "loss": 0.0212, "step": 3264 }, { "epoch": 1.4854413102820745, "grad_norm": 0.4310123367818552, "learning_rate": 7.97695364910019e-06, "loss": 0.0303, "step": 3265 }, { "epoch": 1.4858962693357598, "grad_norm": 0.8088215943900415, "learning_rate": 7.975805179186001e-06, "loss": 0.0515, "step": 3266 }, { "epoch": 1.486351228389445, "grad_norm": 0.7314527739715035, "learning_rate": 7.97465646610245e-06, "loss": 0.0422, "step": 3267 }, { "epoch": 1.4868061874431302, "grad_norm": 0.7721966723010093, "learning_rate": 7.973507509943406e-06, "loss": 0.0479, "step": 3268 }, { "epoch": 1.4872611464968153, "grad_norm": 0.47220931052176524, "learning_rate": 7.972358310802758e-06, "loss": 0.0286, "step": 3269 }, { "epoch": 1.4877161055505004, "grad_norm": 0.5764503158465304, "learning_rate": 7.971208868774412e-06, "loss": 0.0465, "step": 3270 }, { "epoch": 1.4881710646041857, "grad_norm": 1.0302779784470453, "learning_rate": 7.970059183952295e-06, "loss": 0.0427, "step": 3271 }, { "epoch": 1.4886260236578708, "grad_norm": 5.82527427597862, "learning_rate": 7.968909256430352e-06, "loss": 0.0953, "step": 3272 }, { "epoch": 1.489080982711556, "grad_norm": 0.6281307778669729, "learning_rate": 7.967759086302554e-06, "loss": 0.0432, "step": 3273 }, { "epoch": 1.4895359417652412, "grad_norm": 0.4654685533789216, "learning_rate": 7.966608673662885e-06, "loss": 0.0189, "step": 3274 }, { "epoch": 1.4899909008189263, "grad_norm": 0.5419348533302625, "learning_rate": 7.965458018605352e-06, "loss": 0.0366, "step": 3275 }, { "epoch": 1.4904458598726116, "grad_norm": 0.6193685619288958, "learning_rate": 7.964307121223983e-06, "loss": 0.0419, "step": 3276 }, { "epoch": 1.4909008189262967, "grad_norm": 0.4517996667707405, "learning_rate": 7.96315598161282e-06, "loss": 0.0257, "step": 3277 }, { "epoch": 1.4913557779799818, "grad_norm": 0.9132409180267741, "learning_rate": 7.962004599865935e-06, "loss": 0.0601, "step": 3278 }, { "epoch": 1.491810737033667, "grad_norm": 0.41324571059066195, "learning_rate": 7.960852976077406e-06, "loss": 0.0259, "step": 3279 }, { "epoch": 1.4922656960873522, "grad_norm": 0.6733326146756219, "learning_rate": 7.959701110341346e-06, "loss": 0.0419, "step": 3280 }, { "epoch": 1.4927206551410372, "grad_norm": 0.5701229927572675, "learning_rate": 7.958549002751879e-06, "loss": 0.0418, "step": 3281 }, { "epoch": 1.4931756141947226, "grad_norm": 0.7607287312568358, "learning_rate": 7.957396653403145e-06, "loss": 0.0552, "step": 3282 }, { "epoch": 1.4936305732484076, "grad_norm": 0.7579360335222567, "learning_rate": 7.956244062389313e-06, "loss": 0.0418, "step": 3283 }, { "epoch": 1.4940855323020927, "grad_norm": 0.6259733792239423, "learning_rate": 7.955091229804568e-06, "loss": 0.0347, "step": 3284 }, { "epoch": 1.494540491355778, "grad_norm": 0.44634359243758226, "learning_rate": 7.95393815574311e-06, "loss": 0.0247, "step": 3285 }, { "epoch": 1.4949954504094631, "grad_norm": 0.6518710394549171, "learning_rate": 7.952784840299166e-06, "loss": 0.0385, "step": 3286 }, { "epoch": 1.4954504094631482, "grad_norm": 0.42025419748462395, "learning_rate": 7.951631283566981e-06, "loss": 0.0145, "step": 3287 }, { "epoch": 1.4959053685168335, "grad_norm": 0.5219400243782572, "learning_rate": 7.950477485640818e-06, "loss": 0.0369, "step": 3288 }, { "epoch": 1.4963603275705186, "grad_norm": 0.5014319884220756, "learning_rate": 7.949323446614957e-06, "loss": 0.0267, "step": 3289 }, { "epoch": 1.4968152866242037, "grad_norm": 0.7694796958332192, "learning_rate": 7.948169166583703e-06, "loss": 0.049, "step": 3290 }, { "epoch": 1.497270245677889, "grad_norm": 0.5816908738213662, "learning_rate": 7.94701464564138e-06, "loss": 0.0275, "step": 3291 }, { "epoch": 1.4977252047315741, "grad_norm": 0.5076527248188425, "learning_rate": 7.945859883882327e-06, "loss": 0.0274, "step": 3292 }, { "epoch": 1.4981801637852592, "grad_norm": 0.6039997527684581, "learning_rate": 7.94470488140091e-06, "loss": 0.0378, "step": 3293 }, { "epoch": 1.4986351228389445, "grad_norm": 0.592557887847226, "learning_rate": 7.943549638291507e-06, "loss": 0.0301, "step": 3294 }, { "epoch": 1.4990900818926296, "grad_norm": 0.5313149152774042, "learning_rate": 7.94239415464852e-06, "loss": 0.0373, "step": 3295 }, { "epoch": 1.499545040946315, "grad_norm": 0.5031451885132354, "learning_rate": 7.94123843056637e-06, "loss": 0.036, "step": 3296 }, { "epoch": 1.5, "grad_norm": 0.5804403409340608, "learning_rate": 7.9400824661395e-06, "loss": 0.0379, "step": 3297 }, { "epoch": 1.5004549590536853, "grad_norm": 0.45207731359234127, "learning_rate": 7.938926261462366e-06, "loss": 0.0253, "step": 3298 }, { "epoch": 1.5009099181073702, "grad_norm": 0.7319323709083051, "learning_rate": 7.93776981662945e-06, "loss": 0.0427, "step": 3299 }, { "epoch": 1.5013648771610555, "grad_norm": 0.5950172451800795, "learning_rate": 7.936613131735253e-06, "loss": 0.026, "step": 3300 }, { "epoch": 1.5018198362147408, "grad_norm": 0.5184269203338593, "learning_rate": 7.935456206874292e-06, "loss": 0.0236, "step": 3301 }, { "epoch": 1.5022747952684259, "grad_norm": 0.45332160585967457, "learning_rate": 7.934299042141107e-06, "loss": 0.0289, "step": 3302 }, { "epoch": 1.502729754322111, "grad_norm": 0.4539427619968088, "learning_rate": 7.933141637630252e-06, "loss": 0.0228, "step": 3303 }, { "epoch": 1.5031847133757963, "grad_norm": 0.6349022485227811, "learning_rate": 7.931983993436312e-06, "loss": 0.0441, "step": 3304 }, { "epoch": 1.5036396724294814, "grad_norm": 0.5901676639230142, "learning_rate": 7.930826109653882e-06, "loss": 0.0315, "step": 3305 }, { "epoch": 1.5040946314831665, "grad_norm": 0.44019191896156384, "learning_rate": 7.929667986377574e-06, "loss": 0.0241, "step": 3306 }, { "epoch": 1.5045495905368518, "grad_norm": 0.849223492428955, "learning_rate": 7.92850962370203e-06, "loss": 0.0672, "step": 3307 }, { "epoch": 1.5050045495905369, "grad_norm": 0.6138944480286226, "learning_rate": 7.927351021721905e-06, "loss": 0.0475, "step": 3308 }, { "epoch": 1.505459508644222, "grad_norm": 0.5308981954331117, "learning_rate": 7.926192180531873e-06, "loss": 0.0328, "step": 3309 }, { "epoch": 1.5059144676979073, "grad_norm": 0.45288398749279407, "learning_rate": 7.925033100226632e-06, "loss": 0.0258, "step": 3310 }, { "epoch": 1.5063694267515924, "grad_norm": 0.6804917839354758, "learning_rate": 7.923873780900894e-06, "loss": 0.0519, "step": 3311 }, { "epoch": 1.5068243858052774, "grad_norm": 0.6877383357614162, "learning_rate": 7.922714222649394e-06, "loss": 0.0461, "step": 3312 }, { "epoch": 1.5072793448589628, "grad_norm": 0.37307848806453825, "learning_rate": 7.92155442556689e-06, "loss": 0.0169, "step": 3313 }, { "epoch": 1.5077343039126478, "grad_norm": 0.4649246667832013, "learning_rate": 7.92039438974815e-06, "loss": 0.0246, "step": 3314 }, { "epoch": 1.508189262966333, "grad_norm": 0.5797951905427798, "learning_rate": 7.919234115287969e-06, "loss": 0.0264, "step": 3315 }, { "epoch": 1.5086442220200182, "grad_norm": 0.5999502093130843, "learning_rate": 7.918073602281158e-06, "loss": 0.0408, "step": 3316 }, { "epoch": 1.5090991810737033, "grad_norm": 0.37444817465605024, "learning_rate": 7.91691285082255e-06, "loss": 0.0199, "step": 3317 }, { "epoch": 1.5095541401273884, "grad_norm": 0.43571431795450083, "learning_rate": 7.915751861007e-06, "loss": 0.0311, "step": 3318 }, { "epoch": 1.5100090991810737, "grad_norm": 0.5718039363551488, "learning_rate": 7.914590632929372e-06, "loss": 0.0398, "step": 3319 }, { "epoch": 1.5104640582347588, "grad_norm": 0.5094342970442877, "learning_rate": 7.913429166684561e-06, "loss": 0.0379, "step": 3320 }, { "epoch": 1.510919017288444, "grad_norm": 0.5242594744129395, "learning_rate": 7.912267462367473e-06, "loss": 0.026, "step": 3321 }, { "epoch": 1.5113739763421292, "grad_norm": 0.48884902053211027, "learning_rate": 7.911105520073044e-06, "loss": 0.0306, "step": 3322 }, { "epoch": 1.5118289353958145, "grad_norm": 0.7394168313167301, "learning_rate": 7.909943339896215e-06, "loss": 0.0479, "step": 3323 }, { "epoch": 1.5122838944494994, "grad_norm": 0.6469513917525913, "learning_rate": 7.908780921931957e-06, "loss": 0.0371, "step": 3324 }, { "epoch": 1.5127388535031847, "grad_norm": 0.8592861455834359, "learning_rate": 7.90761826627526e-06, "loss": 0.0484, "step": 3325 }, { "epoch": 1.51319381255687, "grad_norm": 0.42257868990927616, "learning_rate": 7.90645537302113e-06, "loss": 0.0232, "step": 3326 }, { "epoch": 1.5136487716105549, "grad_norm": 0.45779799028489876, "learning_rate": 7.905292242264591e-06, "loss": 0.0278, "step": 3327 }, { "epoch": 1.5141037306642402, "grad_norm": 0.5433890115321734, "learning_rate": 7.904128874100689e-06, "loss": 0.0365, "step": 3328 }, { "epoch": 1.5145586897179255, "grad_norm": 0.5494344731912977, "learning_rate": 7.902965268624491e-06, "loss": 0.0282, "step": 3329 }, { "epoch": 1.5150136487716106, "grad_norm": 0.6130934963882723, "learning_rate": 7.901801425931082e-06, "loss": 0.0424, "step": 3330 }, { "epoch": 1.5154686078252957, "grad_norm": 0.589889212368702, "learning_rate": 7.900637346115563e-06, "loss": 0.0311, "step": 3331 }, { "epoch": 1.515923566878981, "grad_norm": 0.5616386558187996, "learning_rate": 7.899473029273061e-06, "loss": 0.0399, "step": 3332 }, { "epoch": 1.516378525932666, "grad_norm": 0.5559424467039881, "learning_rate": 7.898308475498717e-06, "loss": 0.0368, "step": 3333 }, { "epoch": 1.5168334849863512, "grad_norm": 0.4805464009233816, "learning_rate": 7.897143684887692e-06, "loss": 0.0257, "step": 3334 }, { "epoch": 1.5172884440400365, "grad_norm": 0.4773831435489601, "learning_rate": 7.89597865753517e-06, "loss": 0.0256, "step": 3335 }, { "epoch": 1.5177434030937216, "grad_norm": 0.6381236025058032, "learning_rate": 7.894813393536349e-06, "loss": 0.0439, "step": 3336 }, { "epoch": 1.5181983621474067, "grad_norm": 0.911328191968218, "learning_rate": 7.893647892986448e-06, "loss": 0.0674, "step": 3337 }, { "epoch": 1.518653321201092, "grad_norm": 0.4575031300837551, "learning_rate": 7.892482155980713e-06, "loss": 0.028, "step": 3338 }, { "epoch": 1.519108280254777, "grad_norm": 0.43138761248175833, "learning_rate": 7.891316182614397e-06, "loss": 0.0211, "step": 3339 }, { "epoch": 1.5195632393084622, "grad_norm": 0.6073801276576956, "learning_rate": 7.890149972982779e-06, "loss": 0.0346, "step": 3340 }, { "epoch": 1.5200181983621475, "grad_norm": 0.495487486191647, "learning_rate": 7.888983527181157e-06, "loss": 0.0252, "step": 3341 }, { "epoch": 1.5204731574158326, "grad_norm": 0.40225955239465766, "learning_rate": 7.887816845304847e-06, "loss": 0.0197, "step": 3342 }, { "epoch": 1.5209281164695176, "grad_norm": 0.5565854453398432, "learning_rate": 7.886649927449189e-06, "loss": 0.0291, "step": 3343 }, { "epoch": 1.521383075523203, "grad_norm": 0.6454632437150152, "learning_rate": 7.885482773709532e-06, "loss": 0.0368, "step": 3344 }, { "epoch": 1.521838034576888, "grad_norm": 0.4211140971021706, "learning_rate": 7.884315384181254e-06, "loss": 0.0213, "step": 3345 }, { "epoch": 1.5222929936305731, "grad_norm": 0.46955263253373974, "learning_rate": 7.883147758959748e-06, "loss": 0.0298, "step": 3346 }, { "epoch": 1.5227479526842584, "grad_norm": 0.562841156860276, "learning_rate": 7.881979898140428e-06, "loss": 0.0315, "step": 3347 }, { "epoch": 1.5232029117379435, "grad_norm": 0.653622108778222, "learning_rate": 7.880811801818724e-06, "loss": 0.029, "step": 3348 }, { "epoch": 1.5236578707916286, "grad_norm": 0.4989556791862373, "learning_rate": 7.879643470090092e-06, "loss": 0.0287, "step": 3349 }, { "epoch": 1.524112829845314, "grad_norm": 0.6493806438408574, "learning_rate": 7.878474903049997e-06, "loss": 0.0379, "step": 3350 }, { "epoch": 1.5245677888989992, "grad_norm": 0.9494130739652255, "learning_rate": 7.877306100793934e-06, "loss": 0.0732, "step": 3351 }, { "epoch": 1.525022747952684, "grad_norm": 0.5297290866341118, "learning_rate": 7.876137063417411e-06, "loss": 0.0308, "step": 3352 }, { "epoch": 1.5254777070063694, "grad_norm": 0.6319762265400439, "learning_rate": 7.874967791015954e-06, "loss": 0.0242, "step": 3353 }, { "epoch": 1.5259326660600547, "grad_norm": 0.5767239975955621, "learning_rate": 7.873798283685112e-06, "loss": 0.0406, "step": 3354 }, { "epoch": 1.5263876251137396, "grad_norm": 0.5537544990681309, "learning_rate": 7.872628541520453e-06, "loss": 0.0274, "step": 3355 }, { "epoch": 1.526842584167425, "grad_norm": 0.45762117910575895, "learning_rate": 7.871458564617562e-06, "loss": 0.0294, "step": 3356 }, { "epoch": 1.5272975432211102, "grad_norm": 0.6149649052094309, "learning_rate": 7.870288353072046e-06, "loss": 0.0318, "step": 3357 }, { "epoch": 1.5277525022747953, "grad_norm": 0.5433678607043896, "learning_rate": 7.869117906979526e-06, "loss": 0.0351, "step": 3358 }, { "epoch": 1.5282074613284804, "grad_norm": 0.893607675733148, "learning_rate": 7.867947226435649e-06, "loss": 0.0361, "step": 3359 }, { "epoch": 1.5286624203821657, "grad_norm": 0.72028067546292, "learning_rate": 7.866776311536075e-06, "loss": 0.0266, "step": 3360 }, { "epoch": 1.5291173794358508, "grad_norm": 0.4457333347562546, "learning_rate": 7.865605162376485e-06, "loss": 0.025, "step": 3361 }, { "epoch": 1.5295723384895359, "grad_norm": 0.5843708022673314, "learning_rate": 7.864433779052586e-06, "loss": 0.0402, "step": 3362 }, { "epoch": 1.5300272975432212, "grad_norm": 0.47900762284025383, "learning_rate": 7.863262161660093e-06, "loss": 0.0264, "step": 3363 }, { "epoch": 1.5304822565969063, "grad_norm": 0.6320035641166531, "learning_rate": 7.862090310294747e-06, "loss": 0.0418, "step": 3364 }, { "epoch": 1.5309372156505914, "grad_norm": 0.4362921274047482, "learning_rate": 7.860918225052306e-06, "loss": 0.0206, "step": 3365 }, { "epoch": 1.5313921747042767, "grad_norm": 0.47511830394385196, "learning_rate": 7.859745906028545e-06, "loss": 0.0302, "step": 3366 }, { "epoch": 1.5318471337579618, "grad_norm": 0.44734571803407175, "learning_rate": 7.858573353319264e-06, "loss": 0.0269, "step": 3367 }, { "epoch": 1.5323020928116469, "grad_norm": 0.4240090335241271, "learning_rate": 7.85740056702028e-06, "loss": 0.0232, "step": 3368 }, { "epoch": 1.5327570518653322, "grad_norm": 0.47124639738210494, "learning_rate": 7.856227547227421e-06, "loss": 0.0347, "step": 3369 }, { "epoch": 1.5332120109190173, "grad_norm": 0.5925515695295828, "learning_rate": 7.85505429403655e-06, "loss": 0.0352, "step": 3370 }, { "epoch": 1.5336669699727024, "grad_norm": 0.42740971855552534, "learning_rate": 7.853880807543534e-06, "loss": 0.0373, "step": 3371 }, { "epoch": 1.5341219290263877, "grad_norm": 0.6808939567186573, "learning_rate": 7.852707087844267e-06, "loss": 0.0459, "step": 3372 }, { "epoch": 1.5345768880800728, "grad_norm": 0.5327072494298575, "learning_rate": 7.851533135034658e-06, "loss": 0.034, "step": 3373 }, { "epoch": 1.5350318471337578, "grad_norm": 0.49051246039059626, "learning_rate": 7.850358949210639e-06, "loss": 0.0307, "step": 3374 }, { "epoch": 1.5354868061874432, "grad_norm": 0.4879153893556511, "learning_rate": 7.84918453046816e-06, "loss": 0.0253, "step": 3375 }, { "epoch": 1.5359417652411285, "grad_norm": 0.4606644347025298, "learning_rate": 7.848009878903187e-06, "loss": 0.0261, "step": 3376 }, { "epoch": 1.5363967242948133, "grad_norm": 0.7246412722247344, "learning_rate": 7.846834994611707e-06, "loss": 0.0508, "step": 3377 }, { "epoch": 1.5368516833484986, "grad_norm": 0.5437097771161801, "learning_rate": 7.845659877689729e-06, "loss": 0.0256, "step": 3378 }, { "epoch": 1.537306642402184, "grad_norm": 0.4513777273034115, "learning_rate": 7.844484528233279e-06, "loss": 0.0268, "step": 3379 }, { "epoch": 1.5377616014558688, "grad_norm": 0.5767478416216448, "learning_rate": 7.843308946338396e-06, "loss": 0.0459, "step": 3380 }, { "epoch": 1.5382165605095541, "grad_norm": 0.5278410996002848, "learning_rate": 7.842133132101145e-06, "loss": 0.0291, "step": 3381 }, { "epoch": 1.5386715195632394, "grad_norm": 0.5863010112909317, "learning_rate": 7.840957085617612e-06, "loss": 0.0351, "step": 3382 }, { "epoch": 1.5391264786169245, "grad_norm": 0.5692494495242041, "learning_rate": 7.839780806983894e-06, "loss": 0.0362, "step": 3383 }, { "epoch": 1.5395814376706096, "grad_norm": 0.5765842865723461, "learning_rate": 7.838604296296114e-06, "loss": 0.0281, "step": 3384 }, { "epoch": 1.540036396724295, "grad_norm": 0.5611416523170261, "learning_rate": 7.837427553650409e-06, "loss": 0.0301, "step": 3385 }, { "epoch": 1.54049135577798, "grad_norm": 0.5227145172283483, "learning_rate": 7.836250579142938e-06, "loss": 0.0256, "step": 3386 }, { "epoch": 1.540946314831665, "grad_norm": 0.5053608536514115, "learning_rate": 7.835073372869878e-06, "loss": 0.032, "step": 3387 }, { "epoch": 1.5414012738853504, "grad_norm": 0.5523481686663522, "learning_rate": 7.833895934927426e-06, "loss": 0.0351, "step": 3388 }, { "epoch": 1.5418562329390355, "grad_norm": 0.46085951723099483, "learning_rate": 7.832718265411795e-06, "loss": 0.0238, "step": 3389 }, { "epoch": 1.5423111919927206, "grad_norm": 0.6406733101128618, "learning_rate": 7.83154036441922e-06, "loss": 0.0597, "step": 3390 }, { "epoch": 1.542766151046406, "grad_norm": 0.6182774088480784, "learning_rate": 7.830362232045953e-06, "loss": 0.0457, "step": 3391 }, { "epoch": 1.543221110100091, "grad_norm": 0.419525367767363, "learning_rate": 7.829183868388269e-06, "loss": 0.0284, "step": 3392 }, { "epoch": 1.543676069153776, "grad_norm": 0.687986567801542, "learning_rate": 7.828005273542452e-06, "loss": 0.0352, "step": 3393 }, { "epoch": 1.5441310282074614, "grad_norm": 0.6030172854686707, "learning_rate": 7.826826447604815e-06, "loss": 0.0284, "step": 3394 }, { "epoch": 1.5445859872611465, "grad_norm": 0.4267704901425831, "learning_rate": 7.82564739067169e-06, "loss": 0.0283, "step": 3395 }, { "epoch": 1.5450409463148316, "grad_norm": 0.7090173708752295, "learning_rate": 7.82446810283942e-06, "loss": 0.0492, "step": 3396 }, { "epoch": 1.5454959053685169, "grad_norm": 0.5782679542081406, "learning_rate": 7.82328858420437e-06, "loss": 0.0315, "step": 3397 }, { "epoch": 1.545950864422202, "grad_norm": 0.41616530573356497, "learning_rate": 7.82210883486293e-06, "loss": 0.0235, "step": 3398 }, { "epoch": 1.546405823475887, "grad_norm": 0.5309283516266711, "learning_rate": 7.820928854911497e-06, "loss": 0.0379, "step": 3399 }, { "epoch": 1.5468607825295724, "grad_norm": 0.699558728154947, "learning_rate": 7.819748644446499e-06, "loss": 0.0482, "step": 3400 }, { "epoch": 1.5473157415832575, "grad_norm": 0.471820338463683, "learning_rate": 7.818568203564375e-06, "loss": 0.023, "step": 3401 }, { "epoch": 1.5477707006369426, "grad_norm": 0.42929414000407534, "learning_rate": 7.817387532361585e-06, "loss": 0.0244, "step": 3402 }, { "epoch": 1.5482256596906279, "grad_norm": 0.41613692792574736, "learning_rate": 7.816206630934611e-06, "loss": 0.0217, "step": 3403 }, { "epoch": 1.5486806187443132, "grad_norm": 0.5150802413289814, "learning_rate": 7.815025499379947e-06, "loss": 0.0277, "step": 3404 }, { "epoch": 1.549135577797998, "grad_norm": 0.4967119725409103, "learning_rate": 7.813844137794114e-06, "loss": 0.0406, "step": 3405 }, { "epoch": 1.5495905368516834, "grad_norm": 0.3871111988995279, "learning_rate": 7.812662546273643e-06, "loss": 0.0179, "step": 3406 }, { "epoch": 1.5500454959053687, "grad_norm": 0.549646943524987, "learning_rate": 7.811480724915093e-06, "loss": 0.0327, "step": 3407 }, { "epoch": 1.5505004549590535, "grad_norm": 0.3543829777210757, "learning_rate": 7.810298673815031e-06, "loss": 0.017, "step": 3408 }, { "epoch": 1.5509554140127388, "grad_norm": 0.6113532245733246, "learning_rate": 7.809116393070057e-06, "loss": 0.0397, "step": 3409 }, { "epoch": 1.5514103730664242, "grad_norm": 0.8106140035945865, "learning_rate": 7.807933882776774e-06, "loss": 0.0474, "step": 3410 }, { "epoch": 1.5518653321201092, "grad_norm": 0.6896922151206373, "learning_rate": 7.806751143031817e-06, "loss": 0.0404, "step": 3411 }, { "epoch": 1.5523202911737943, "grad_norm": 0.5736907390195001, "learning_rate": 7.80556817393183e-06, "loss": 0.0244, "step": 3412 }, { "epoch": 1.5527752502274796, "grad_norm": 0.5185448241286031, "learning_rate": 7.804384975573482e-06, "loss": 0.025, "step": 3413 }, { "epoch": 1.5532302092811647, "grad_norm": 0.6057910883771316, "learning_rate": 7.803201548053459e-06, "loss": 0.0301, "step": 3414 }, { "epoch": 1.5536851683348498, "grad_norm": 0.5693286233096472, "learning_rate": 7.802017891468464e-06, "loss": 0.0277, "step": 3415 }, { "epoch": 1.5541401273885351, "grad_norm": 0.679306287123086, "learning_rate": 7.80083400591522e-06, "loss": 0.0427, "step": 3416 }, { "epoch": 1.5545950864422202, "grad_norm": 0.6700442491171918, "learning_rate": 7.799649891490472e-06, "loss": 0.041, "step": 3417 }, { "epoch": 1.5550500454959053, "grad_norm": 0.45486416732531165, "learning_rate": 7.798465548290975e-06, "loss": 0.0311, "step": 3418 }, { "epoch": 1.5555050045495906, "grad_norm": 0.4474510317671961, "learning_rate": 7.797280976413512e-06, "loss": 0.0276, "step": 3419 }, { "epoch": 1.5559599636032757, "grad_norm": 0.5430321466215652, "learning_rate": 7.796096175954881e-06, "loss": 0.0369, "step": 3420 }, { "epoch": 1.5564149226569608, "grad_norm": 0.5723323718862299, "learning_rate": 7.7949111470119e-06, "loss": 0.0308, "step": 3421 }, { "epoch": 1.556869881710646, "grad_norm": 0.5885577249765643, "learning_rate": 7.793725889681396e-06, "loss": 0.027, "step": 3422 }, { "epoch": 1.5573248407643312, "grad_norm": 0.639570428654892, "learning_rate": 7.792540404060232e-06, "loss": 0.0602, "step": 3423 }, { "epoch": 1.5577797998180163, "grad_norm": 0.4521381633452731, "learning_rate": 7.791354690245276e-06, "loss": 0.023, "step": 3424 }, { "epoch": 1.5582347588717016, "grad_norm": 0.5227633621304679, "learning_rate": 7.790168748333422e-06, "loss": 0.0306, "step": 3425 }, { "epoch": 1.5586897179253867, "grad_norm": 0.49798978950391587, "learning_rate": 7.788982578421576e-06, "loss": 0.0276, "step": 3426 }, { "epoch": 1.5591446769790718, "grad_norm": 0.5793757794831385, "learning_rate": 7.78779618060667e-06, "loss": 0.0372, "step": 3427 }, { "epoch": 1.559599636032757, "grad_norm": 0.534549847977574, "learning_rate": 7.78660955498565e-06, "loss": 0.0376, "step": 3428 }, { "epoch": 1.5600545950864422, "grad_norm": 0.7215365048435313, "learning_rate": 7.78542270165548e-06, "loss": 0.0425, "step": 3429 }, { "epoch": 1.5605095541401273, "grad_norm": 0.4278147394667788, "learning_rate": 7.784235620713148e-06, "loss": 0.0263, "step": 3430 }, { "epoch": 1.5609645131938126, "grad_norm": 1.1898282213354985, "learning_rate": 7.783048312255653e-06, "loss": 0.0444, "step": 3431 }, { "epoch": 1.5614194722474979, "grad_norm": 0.45023984506018094, "learning_rate": 7.781860776380018e-06, "loss": 0.0224, "step": 3432 }, { "epoch": 1.5618744313011828, "grad_norm": 0.6704837664332035, "learning_rate": 7.780673013183285e-06, "loss": 0.0469, "step": 3433 }, { "epoch": 1.562329390354868, "grad_norm": 0.6013246016752672, "learning_rate": 7.779485022762507e-06, "loss": 0.0347, "step": 3434 }, { "epoch": 1.5627843494085534, "grad_norm": 0.8945038921473537, "learning_rate": 7.778296805214768e-06, "loss": 0.0474, "step": 3435 }, { "epoch": 1.5632393084622382, "grad_norm": 0.5510688494568616, "learning_rate": 7.77710836063716e-06, "loss": 0.0383, "step": 3436 }, { "epoch": 1.5636942675159236, "grad_norm": 0.6583580556515867, "learning_rate": 7.775919689126798e-06, "loss": 0.0336, "step": 3437 }, { "epoch": 1.5641492265696089, "grad_norm": 0.7415363558720585, "learning_rate": 7.774730790780814e-06, "loss": 0.0331, "step": 3438 }, { "epoch": 1.564604185623294, "grad_norm": 0.7507039745490994, "learning_rate": 7.773541665696363e-06, "loss": 0.0491, "step": 3439 }, { "epoch": 1.565059144676979, "grad_norm": 0.5977637793388121, "learning_rate": 7.77235231397061e-06, "loss": 0.0356, "step": 3440 }, { "epoch": 1.5655141037306644, "grad_norm": 0.4500431567893906, "learning_rate": 7.771162735700746e-06, "loss": 0.0282, "step": 3441 }, { "epoch": 1.5659690627843494, "grad_norm": 0.4701512669605663, "learning_rate": 7.769972930983977e-06, "loss": 0.0274, "step": 3442 }, { "epoch": 1.5664240218380345, "grad_norm": 0.595132942648599, "learning_rate": 7.76878289991753e-06, "loss": 0.0377, "step": 3443 }, { "epoch": 1.5668789808917198, "grad_norm": 0.7296510448397537, "learning_rate": 7.76759264259865e-06, "loss": 0.0437, "step": 3444 }, { "epoch": 1.567333939945405, "grad_norm": 0.5102548675863424, "learning_rate": 7.766402159124595e-06, "loss": 0.0327, "step": 3445 }, { "epoch": 1.56778889899909, "grad_norm": 0.538408356737662, "learning_rate": 7.765211449592649e-06, "loss": 0.0337, "step": 3446 }, { "epoch": 1.5682438580527753, "grad_norm": 0.47460946861088327, "learning_rate": 7.764020514100112e-06, "loss": 0.0271, "step": 3447 }, { "epoch": 1.5686988171064604, "grad_norm": 0.561507703759553, "learning_rate": 7.7628293527443e-06, "loss": 0.0318, "step": 3448 }, { "epoch": 1.5691537761601455, "grad_norm": 0.4792840867094543, "learning_rate": 7.76163796562255e-06, "loss": 0.0331, "step": 3449 }, { "epoch": 1.5696087352138308, "grad_norm": 0.8607482079019099, "learning_rate": 7.760446352832217e-06, "loss": 0.0388, "step": 3450 }, { "epoch": 1.570063694267516, "grad_norm": 0.5223043059792263, "learning_rate": 7.759254514470675e-06, "loss": 0.0339, "step": 3451 }, { "epoch": 1.570518653321201, "grad_norm": 5.710898517541713, "learning_rate": 7.758062450635313e-06, "loss": 0.0555, "step": 3452 }, { "epoch": 1.5709736123748863, "grad_norm": 0.5105552971743239, "learning_rate": 7.756870161423544e-06, "loss": 0.0262, "step": 3453 }, { "epoch": 1.5714285714285714, "grad_norm": 0.4968688652691249, "learning_rate": 7.755677646932796e-06, "loss": 0.0224, "step": 3454 }, { "epoch": 1.5718835304822565, "grad_norm": 0.4976523567379475, "learning_rate": 7.754484907260513e-06, "loss": 0.0273, "step": 3455 }, { "epoch": 1.5723384895359418, "grad_norm": 0.5434947844040935, "learning_rate": 7.753291942504165e-06, "loss": 0.0305, "step": 3456 }, { "epoch": 1.5727934485896269, "grad_norm": 0.47435089320259716, "learning_rate": 7.75209875276123e-06, "loss": 0.0295, "step": 3457 }, { "epoch": 1.573248407643312, "grad_norm": 0.7090034377929653, "learning_rate": 7.750905338129218e-06, "loss": 0.0562, "step": 3458 }, { "epoch": 1.5737033666969973, "grad_norm": 0.35682356138133664, "learning_rate": 7.749711698705642e-06, "loss": 0.0215, "step": 3459 }, { "epoch": 1.5741583257506826, "grad_norm": 0.690125413698341, "learning_rate": 7.748517834588041e-06, "loss": 0.0507, "step": 3460 }, { "epoch": 1.5746132848043675, "grad_norm": 0.8065897776167275, "learning_rate": 7.747323745873978e-06, "loss": 0.0619, "step": 3461 }, { "epoch": 1.5750682438580528, "grad_norm": 0.5611610306466592, "learning_rate": 7.746129432661026e-06, "loss": 0.0437, "step": 3462 }, { "epoch": 1.575523202911738, "grad_norm": 0.6156596061850024, "learning_rate": 7.744934895046777e-06, "loss": 0.0389, "step": 3463 }, { "epoch": 1.575978161965423, "grad_norm": 0.6770117433407621, "learning_rate": 7.743740133128844e-06, "loss": 0.0425, "step": 3464 }, { "epoch": 1.5764331210191083, "grad_norm": 0.5646976725890579, "learning_rate": 7.742545147004859e-06, "loss": 0.0354, "step": 3465 }, { "epoch": 1.5768880800727936, "grad_norm": 0.5002058028310358, "learning_rate": 7.741349936772468e-06, "loss": 0.0304, "step": 3466 }, { "epoch": 1.5773430391264787, "grad_norm": 0.42270356073255155, "learning_rate": 7.74015450252934e-06, "loss": 0.0275, "step": 3467 }, { "epoch": 1.5777979981801638, "grad_norm": 0.5112403656545559, "learning_rate": 7.738958844373164e-06, "loss": 0.0284, "step": 3468 }, { "epoch": 1.578252957233849, "grad_norm": 0.5654040077009538, "learning_rate": 7.737762962401637e-06, "loss": 0.0266, "step": 3469 }, { "epoch": 1.5787079162875342, "grad_norm": 0.5043261297906985, "learning_rate": 7.736566856712486e-06, "loss": 0.0454, "step": 3470 }, { "epoch": 1.5791628753412192, "grad_norm": 0.4594033411160087, "learning_rate": 7.735370527403447e-06, "loss": 0.0222, "step": 3471 }, { "epoch": 1.5796178343949046, "grad_norm": 0.5209905943849851, "learning_rate": 7.734173974572283e-06, "loss": 0.0347, "step": 3472 }, { "epoch": 1.5800727934485896, "grad_norm": 0.42941284083933845, "learning_rate": 7.732977198316772e-06, "loss": 0.0298, "step": 3473 }, { "epoch": 1.5805277525022747, "grad_norm": 0.37167293538439883, "learning_rate": 7.731780198734702e-06, "loss": 0.0168, "step": 3474 }, { "epoch": 1.58098271155596, "grad_norm": 0.6728592612271111, "learning_rate": 7.730582975923892e-06, "loss": 0.0444, "step": 3475 }, { "epoch": 1.5814376706096451, "grad_norm": 0.704688894432512, "learning_rate": 7.729385529982174e-06, "loss": 0.0455, "step": 3476 }, { "epoch": 1.5818926296633302, "grad_norm": 0.6254068661918205, "learning_rate": 7.728187861007394e-06, "loss": 0.0314, "step": 3477 }, { "epoch": 1.5823475887170155, "grad_norm": 0.4853841882946067, "learning_rate": 7.726989969097424e-06, "loss": 0.0269, "step": 3478 }, { "epoch": 1.5828025477707006, "grad_norm": 0.5633959034881754, "learning_rate": 7.725791854350148e-06, "loss": 0.0434, "step": 3479 }, { "epoch": 1.5832575068243857, "grad_norm": 0.4754367671510815, "learning_rate": 7.724593516863472e-06, "loss": 0.0323, "step": 3480 }, { "epoch": 1.583712465878071, "grad_norm": 0.5505428790889674, "learning_rate": 7.723394956735316e-06, "loss": 0.0273, "step": 3481 }, { "epoch": 1.584167424931756, "grad_norm": 0.6739680763910496, "learning_rate": 7.722196174063625e-06, "loss": 0.0441, "step": 3482 }, { "epoch": 1.5846223839854412, "grad_norm": 0.5513199772036493, "learning_rate": 7.720997168946355e-06, "loss": 0.0408, "step": 3483 }, { "epoch": 1.5850773430391265, "grad_norm": 0.37523723040612444, "learning_rate": 7.719797941481487e-06, "loss": 0.0223, "step": 3484 }, { "epoch": 1.5855323020928116, "grad_norm": 0.6066395634270424, "learning_rate": 7.71859849176701e-06, "loss": 0.0394, "step": 3485 }, { "epoch": 1.5859872611464967, "grad_norm": 0.5392116492425955, "learning_rate": 7.717398819900943e-06, "loss": 0.0342, "step": 3486 }, { "epoch": 1.586442220200182, "grad_norm": 0.4688172933226109, "learning_rate": 7.716198925981316e-06, "loss": 0.0272, "step": 3487 }, { "epoch": 1.5868971792538673, "grad_norm": 0.4914165590714183, "learning_rate": 7.714998810106178e-06, "loss": 0.0365, "step": 3488 }, { "epoch": 1.5873521383075522, "grad_norm": 0.4851376352933137, "learning_rate": 7.713798472373598e-06, "loss": 0.0318, "step": 3489 }, { "epoch": 1.5878070973612375, "grad_norm": 0.636717106499792, "learning_rate": 7.712597912881664e-06, "loss": 0.0388, "step": 3490 }, { "epoch": 1.5882620564149228, "grad_norm": 0.4759975612349714, "learning_rate": 7.711397131728479e-06, "loss": 0.0343, "step": 3491 }, { "epoch": 1.5887170154686077, "grad_norm": 0.6121544784459866, "learning_rate": 7.710196129012163e-06, "loss": 0.0393, "step": 3492 }, { "epoch": 1.589171974522293, "grad_norm": 0.459290097235983, "learning_rate": 7.70899490483086e-06, "loss": 0.0239, "step": 3493 }, { "epoch": 1.5896269335759783, "grad_norm": 0.5006072426867193, "learning_rate": 7.707793459282726e-06, "loss": 0.0264, "step": 3494 }, { "epoch": 1.5900818926296634, "grad_norm": 0.5392590451459817, "learning_rate": 7.706591792465938e-06, "loss": 0.0415, "step": 3495 }, { "epoch": 1.5905368516833485, "grad_norm": 0.6427781964599646, "learning_rate": 7.705389904478694e-06, "loss": 0.0451, "step": 3496 }, { "epoch": 1.5909918107370338, "grad_norm": 0.5904502422720772, "learning_rate": 7.704187795419202e-06, "loss": 0.0326, "step": 3497 }, { "epoch": 1.5914467697907189, "grad_norm": 0.5254871759750529, "learning_rate": 7.702985465385698e-06, "loss": 0.0271, "step": 3498 }, { "epoch": 1.591901728844404, "grad_norm": 0.8139706995395187, "learning_rate": 7.701782914476425e-06, "loss": 0.0471, "step": 3499 }, { "epoch": 1.5923566878980893, "grad_norm": 0.45527893905535977, "learning_rate": 7.700580142789656e-06, "loss": 0.0275, "step": 3500 }, { "epoch": 1.5928116469517744, "grad_norm": 0.6912461759234002, "learning_rate": 7.699377150423673e-06, "loss": 0.0429, "step": 3501 }, { "epoch": 1.5932666060054594, "grad_norm": 0.6082047878888045, "learning_rate": 7.698173937476779e-06, "loss": 0.0341, "step": 3502 }, { "epoch": 1.5937215650591448, "grad_norm": 0.6073667552436033, "learning_rate": 7.696970504047295e-06, "loss": 0.0275, "step": 3503 }, { "epoch": 1.5941765241128298, "grad_norm": 0.8091516868172267, "learning_rate": 7.695766850233562e-06, "loss": 0.0571, "step": 3504 }, { "epoch": 1.594631483166515, "grad_norm": 0.49571343354550057, "learning_rate": 7.694562976133935e-06, "loss": 0.0456, "step": 3505 }, { "epoch": 1.5950864422202002, "grad_norm": 0.4862397941245227, "learning_rate": 7.693358881846789e-06, "loss": 0.0245, "step": 3506 }, { "epoch": 1.5955414012738853, "grad_norm": 0.8324349327582877, "learning_rate": 7.692154567470522e-06, "loss": 0.0584, "step": 3507 }, { "epoch": 1.5959963603275704, "grad_norm": 0.42161922635608196, "learning_rate": 7.69095003310354e-06, "loss": 0.0224, "step": 3508 }, { "epoch": 1.5964513193812557, "grad_norm": 0.5720006424666347, "learning_rate": 7.689745278844271e-06, "loss": 0.0444, "step": 3509 }, { "epoch": 1.5969062784349408, "grad_norm": 0.4329811248199684, "learning_rate": 7.688540304791166e-06, "loss": 0.028, "step": 3510 }, { "epoch": 1.597361237488626, "grad_norm": 0.6590673444667859, "learning_rate": 7.687335111042691e-06, "loss": 0.0513, "step": 3511 }, { "epoch": 1.5978161965423112, "grad_norm": 0.5940362949551716, "learning_rate": 7.686129697697324e-06, "loss": 0.0324, "step": 3512 }, { "epoch": 1.5982711555959963, "grad_norm": 0.6462196543335987, "learning_rate": 7.684924064853568e-06, "loss": 0.0371, "step": 3513 }, { "epoch": 1.5987261146496814, "grad_norm": 0.31438590420870827, "learning_rate": 7.683718212609945e-06, "loss": 0.0149, "step": 3514 }, { "epoch": 1.5991810737033667, "grad_norm": 0.6635942498961731, "learning_rate": 7.682512141064988e-06, "loss": 0.0482, "step": 3515 }, { "epoch": 1.599636032757052, "grad_norm": 0.8219386674967395, "learning_rate": 7.681305850317252e-06, "loss": 0.0569, "step": 3516 }, { "epoch": 1.6000909918107369, "grad_norm": 0.5324553198705271, "learning_rate": 7.680099340465312e-06, "loss": 0.031, "step": 3517 }, { "epoch": 1.6005459508644222, "grad_norm": 0.5022819826273329, "learning_rate": 7.678892611607756e-06, "loss": 0.0322, "step": 3518 }, { "epoch": 1.6010009099181075, "grad_norm": 0.5200432351073436, "learning_rate": 7.677685663843195e-06, "loss": 0.0244, "step": 3519 }, { "epoch": 1.6014558689717924, "grad_norm": 0.5079063474217674, "learning_rate": 7.676478497270253e-06, "loss": 0.0302, "step": 3520 }, { "epoch": 1.6019108280254777, "grad_norm": 0.45086384597230217, "learning_rate": 7.675271111987574e-06, "loss": 0.0264, "step": 3521 }, { "epoch": 1.602365787079163, "grad_norm": 0.5987447309633367, "learning_rate": 7.674063508093823e-06, "loss": 0.0345, "step": 3522 }, { "epoch": 1.602820746132848, "grad_norm": 0.6694848585180148, "learning_rate": 7.672855685687676e-06, "loss": 0.0468, "step": 3523 }, { "epoch": 1.6032757051865332, "grad_norm": 0.4884693910817342, "learning_rate": 7.671647644867836e-06, "loss": 0.0312, "step": 3524 }, { "epoch": 1.6037306642402185, "grad_norm": 0.5243581522168983, "learning_rate": 7.670439385733012e-06, "loss": 0.0314, "step": 3525 }, { "epoch": 1.6041856232939036, "grad_norm": 0.416015607301664, "learning_rate": 7.669230908381944e-06, "loss": 0.0273, "step": 3526 }, { "epoch": 1.6046405823475887, "grad_norm": 0.5428096951326984, "learning_rate": 7.668022212913378e-06, "loss": 0.0368, "step": 3527 }, { "epoch": 1.605095541401274, "grad_norm": 0.5817012639943427, "learning_rate": 7.666813299426087e-06, "loss": 0.0412, "step": 3528 }, { "epoch": 1.605550500454959, "grad_norm": 0.5570959439337889, "learning_rate": 7.665604168018856e-06, "loss": 0.0472, "step": 3529 }, { "epoch": 1.6060054595086442, "grad_norm": 0.6212569836098534, "learning_rate": 7.66439481879049e-06, "loss": 0.0364, "step": 3530 }, { "epoch": 1.6064604185623295, "grad_norm": 0.44375581247796847, "learning_rate": 7.663185251839813e-06, "loss": 0.0263, "step": 3531 }, { "epoch": 1.6069153776160146, "grad_norm": 0.34915723581136715, "learning_rate": 7.661975467265661e-06, "loss": 0.0195, "step": 3532 }, { "epoch": 1.6073703366696996, "grad_norm": 0.505632210854433, "learning_rate": 7.660765465166898e-06, "loss": 0.0238, "step": 3533 }, { "epoch": 1.607825295723385, "grad_norm": 0.4686118663668764, "learning_rate": 7.659555245642396e-06, "loss": 0.0347, "step": 3534 }, { "epoch": 1.60828025477707, "grad_norm": 0.5197317168350211, "learning_rate": 7.658344808791049e-06, "loss": 0.0256, "step": 3535 }, { "epoch": 1.6087352138307551, "grad_norm": 0.4039821082222526, "learning_rate": 7.65713415471177e-06, "loss": 0.0193, "step": 3536 }, { "epoch": 1.6091901728844404, "grad_norm": 0.42336017856374314, "learning_rate": 7.655923283503488e-06, "loss": 0.0236, "step": 3537 }, { "epoch": 1.6096451319381255, "grad_norm": 0.4159281988410182, "learning_rate": 7.654712195265148e-06, "loss": 0.031, "step": 3538 }, { "epoch": 1.6101000909918106, "grad_norm": 0.46547916774453957, "learning_rate": 7.653500890095718e-06, "loss": 0.0236, "step": 3539 }, { "epoch": 1.610555050045496, "grad_norm": 0.6814753024595361, "learning_rate": 7.652289368094177e-06, "loss": 0.0398, "step": 3540 }, { "epoch": 1.6110100090991812, "grad_norm": 0.7072996547012244, "learning_rate": 7.651077629359526e-06, "loss": 0.0299, "step": 3541 }, { "epoch": 1.611464968152866, "grad_norm": 0.4728878872440036, "learning_rate": 7.649865673990784e-06, "loss": 0.0249, "step": 3542 }, { "epoch": 1.6119199272065514, "grad_norm": 0.5985278217148838, "learning_rate": 7.648653502086987e-06, "loss": 0.0504, "step": 3543 }, { "epoch": 1.6123748862602367, "grad_norm": 0.4864758660970619, "learning_rate": 7.647441113747183e-06, "loss": 0.0243, "step": 3544 }, { "epoch": 1.6128298453139216, "grad_norm": 0.5046056423212918, "learning_rate": 7.646228509070452e-06, "loss": 0.0283, "step": 3545 }, { "epoch": 1.613284804367607, "grad_norm": 0.4166387607087943, "learning_rate": 7.645015688155875e-06, "loss": 0.0232, "step": 3546 }, { "epoch": 1.6137397634212922, "grad_norm": 0.6346006347530964, "learning_rate": 7.643802651102561e-06, "loss": 0.0307, "step": 3547 }, { "epoch": 1.6141947224749773, "grad_norm": 0.614457634488847, "learning_rate": 7.642589398009632e-06, "loss": 0.0337, "step": 3548 }, { "epoch": 1.6146496815286624, "grad_norm": 0.4011091478272312, "learning_rate": 7.641375928976234e-06, "loss": 0.0222, "step": 3549 }, { "epoch": 1.6151046405823477, "grad_norm": 0.44802583977127497, "learning_rate": 7.64016224410152e-06, "loss": 0.0307, "step": 3550 }, { "epoch": 1.6155595996360328, "grad_norm": 0.6512304506397211, "learning_rate": 7.638948343484673e-06, "loss": 0.039, "step": 3551 }, { "epoch": 1.6160145586897179, "grad_norm": 0.5094725225300599, "learning_rate": 7.637734227224885e-06, "loss": 0.0269, "step": 3552 }, { "epoch": 1.6164695177434032, "grad_norm": 0.38774814089402904, "learning_rate": 7.636519895421365e-06, "loss": 0.0227, "step": 3553 }, { "epoch": 1.6169244767970883, "grad_norm": 0.600146705549112, "learning_rate": 7.63530534817335e-06, "loss": 0.031, "step": 3554 }, { "epoch": 1.6173794358507734, "grad_norm": 0.6396250853926813, "learning_rate": 7.63409058558008e-06, "loss": 0.0541, "step": 3555 }, { "epoch": 1.6178343949044587, "grad_norm": 0.3674994725639151, "learning_rate": 7.632875607740825e-06, "loss": 0.0198, "step": 3556 }, { "epoch": 1.6182893539581438, "grad_norm": 0.706365847864232, "learning_rate": 7.631660414754862e-06, "loss": 0.0464, "step": 3557 }, { "epoch": 1.6187443130118289, "grad_norm": 0.5917150010241741, "learning_rate": 7.630445006721497e-06, "loss": 0.0413, "step": 3558 }, { "epoch": 1.6191992720655142, "grad_norm": 0.49637791594441183, "learning_rate": 7.629229383740042e-06, "loss": 0.0216, "step": 3559 }, { "epoch": 1.6196542311191993, "grad_norm": 0.4170834995352204, "learning_rate": 7.628013545909838e-06, "loss": 0.0221, "step": 3560 }, { "epoch": 1.6201091901728844, "grad_norm": 0.45364271750881185, "learning_rate": 7.626797493330235e-06, "loss": 0.0268, "step": 3561 }, { "epoch": 1.6205641492265697, "grad_norm": 0.611490193807076, "learning_rate": 7.625581226100603e-06, "loss": 0.0408, "step": 3562 }, { "epoch": 1.6210191082802548, "grad_norm": 0.4607491745006535, "learning_rate": 7.6243647443203295e-06, "loss": 0.0201, "step": 3563 }, { "epoch": 1.6214740673339398, "grad_norm": 0.5622724933371087, "learning_rate": 7.623148048088821e-06, "loss": 0.0345, "step": 3564 }, { "epoch": 1.6219290263876252, "grad_norm": 0.6433742820185328, "learning_rate": 7.6219311375055e-06, "loss": 0.0303, "step": 3565 }, { "epoch": 1.6223839854413102, "grad_norm": 0.5029378697434539, "learning_rate": 7.620714012669807e-06, "loss": 0.0345, "step": 3566 }, { "epoch": 1.6228389444949953, "grad_norm": 0.7588833946097253, "learning_rate": 7.619496673681201e-06, "loss": 0.0659, "step": 3567 }, { "epoch": 1.6232939035486806, "grad_norm": 0.5425276901336257, "learning_rate": 7.618279120639154e-06, "loss": 0.0303, "step": 3568 }, { "epoch": 1.623748862602366, "grad_norm": 0.4113587117385332, "learning_rate": 7.6170613536431625e-06, "loss": 0.0116, "step": 3569 }, { "epoch": 1.6242038216560508, "grad_norm": 0.5187414197115504, "learning_rate": 7.615843372792735e-06, "loss": 0.0291, "step": 3570 }, { "epoch": 1.6246587807097361, "grad_norm": 0.3522854361880795, "learning_rate": 7.614625178187402e-06, "loss": 0.0216, "step": 3571 }, { "epoch": 1.6251137397634214, "grad_norm": 0.6881608158668938, "learning_rate": 7.613406769926706e-06, "loss": 0.0375, "step": 3572 }, { "epoch": 1.6255686988171063, "grad_norm": 0.73681812623654, "learning_rate": 7.612188148110211e-06, "loss": 0.0449, "step": 3573 }, { "epoch": 1.6260236578707916, "grad_norm": 0.38921683099066845, "learning_rate": 7.610969312837497e-06, "loss": 0.0177, "step": 3574 }, { "epoch": 1.626478616924477, "grad_norm": 0.5386370708102756, "learning_rate": 7.609750264208161e-06, "loss": 0.0374, "step": 3575 }, { "epoch": 1.626933575978162, "grad_norm": 0.6893051198659124, "learning_rate": 7.60853100232182e-06, "loss": 0.0407, "step": 3576 }, { "epoch": 1.627388535031847, "grad_norm": 0.5280242999427653, "learning_rate": 7.6073115272781055e-06, "loss": 0.0305, "step": 3577 }, { "epoch": 1.6278434940855324, "grad_norm": 0.4378730785336437, "learning_rate": 7.606091839176666e-06, "loss": 0.0313, "step": 3578 }, { "epoch": 1.6282984531392175, "grad_norm": 0.4508110009197861, "learning_rate": 7.604871938117171e-06, "loss": 0.0259, "step": 3579 }, { "epoch": 1.6287534121929026, "grad_norm": 0.37231708257580964, "learning_rate": 7.6036518241993055e-06, "loss": 0.0206, "step": 3580 }, { "epoch": 1.629208371246588, "grad_norm": 0.5023008053378739, "learning_rate": 7.602431497522771e-06, "loss": 0.0284, "step": 3581 }, { "epoch": 1.629663330300273, "grad_norm": 1.0431521995815034, "learning_rate": 7.601210958187286e-06, "loss": 0.0628, "step": 3582 }, { "epoch": 1.630118289353958, "grad_norm": 0.6004977507554499, "learning_rate": 7.599990206292589e-06, "loss": 0.038, "step": 3583 }, { "epoch": 1.6305732484076434, "grad_norm": 0.4205149087751285, "learning_rate": 7.598769241938435e-06, "loss": 0.0221, "step": 3584 }, { "epoch": 1.6310282074613285, "grad_norm": 0.6822993931535771, "learning_rate": 7.597548065224594e-06, "loss": 0.04, "step": 3585 }, { "epoch": 1.6314831665150136, "grad_norm": 0.3585095136964591, "learning_rate": 7.596326676250853e-06, "loss": 0.0193, "step": 3586 }, { "epoch": 1.6319381255686989, "grad_norm": 0.4523831063775864, "learning_rate": 7.595105075117023e-06, "loss": 0.0217, "step": 3587 }, { "epoch": 1.632393084622384, "grad_norm": 0.5716166616117068, "learning_rate": 7.593883261922927e-06, "loss": 0.0322, "step": 3588 }, { "epoch": 1.632848043676069, "grad_norm": 0.5367659095679785, "learning_rate": 7.592661236768402e-06, "loss": 0.0423, "step": 3589 }, { "epoch": 1.6333030027297544, "grad_norm": 0.4242349443718962, "learning_rate": 7.59143899975331e-06, "loss": 0.021, "step": 3590 }, { "epoch": 1.6337579617834395, "grad_norm": 0.58571921676948, "learning_rate": 7.5902165509775276e-06, "loss": 0.0237, "step": 3591 }, { "epoch": 1.6342129208371245, "grad_norm": 0.5080071985034912, "learning_rate": 7.588993890540943e-06, "loss": 0.0328, "step": 3592 }, { "epoch": 1.6346678798908099, "grad_norm": 0.6532991035653749, "learning_rate": 7.587771018543471e-06, "loss": 0.0409, "step": 3593 }, { "epoch": 1.635122838944495, "grad_norm": 0.3891041407518566, "learning_rate": 7.586547935085038e-06, "loss": 0.0208, "step": 3594 }, { "epoch": 1.63557779799818, "grad_norm": 0.5728865075835124, "learning_rate": 7.585324640265588e-06, "loss": 0.0352, "step": 3595 }, { "epoch": 1.6360327570518653, "grad_norm": 0.34316777483010713, "learning_rate": 7.584101134185084e-06, "loss": 0.0256, "step": 3596 }, { "epoch": 1.6364877161055507, "grad_norm": 0.67041665141859, "learning_rate": 7.582877416943504e-06, "loss": 0.0482, "step": 3597 }, { "epoch": 1.6369426751592355, "grad_norm": 0.6944862376061023, "learning_rate": 7.581653488640845e-06, "loss": 0.0401, "step": 3598 }, { "epoch": 1.6373976342129208, "grad_norm": 0.4926515756950819, "learning_rate": 7.580429349377123e-06, "loss": 0.036, "step": 3599 }, { "epoch": 1.6378525932666061, "grad_norm": 0.6531006904735797, "learning_rate": 7.579204999252368e-06, "loss": 0.035, "step": 3600 }, { "epoch": 1.638307552320291, "grad_norm": 0.5554109610782453, "learning_rate": 7.577980438366628e-06, "loss": 0.0352, "step": 3601 }, { "epoch": 1.6387625113739763, "grad_norm": 0.42491724437145356, "learning_rate": 7.5767556668199685e-06, "loss": 0.0215, "step": 3602 }, { "epoch": 1.6392174704276616, "grad_norm": 0.4860918427295927, "learning_rate": 7.575530684712473e-06, "loss": 0.0343, "step": 3603 }, { "epoch": 1.6396724294813467, "grad_norm": 0.5124027672501053, "learning_rate": 7.574305492144238e-06, "loss": 0.0286, "step": 3604 }, { "epoch": 1.6401273885350318, "grad_norm": 0.5514675811144069, "learning_rate": 7.5730800892153866e-06, "loss": 0.0305, "step": 3605 }, { "epoch": 1.6405823475887171, "grad_norm": 0.524484642986204, "learning_rate": 7.5718544760260496e-06, "loss": 0.0215, "step": 3606 }, { "epoch": 1.6410373066424022, "grad_norm": 0.4407331404446461, "learning_rate": 7.570628652676378e-06, "loss": 0.0263, "step": 3607 }, { "epoch": 1.6414922656960873, "grad_norm": 0.47395155961080115, "learning_rate": 7.569402619266544e-06, "loss": 0.0237, "step": 3608 }, { "epoch": 1.6419472247497726, "grad_norm": 0.5502832582406681, "learning_rate": 7.568176375896729e-06, "loss": 0.0249, "step": 3609 }, { "epoch": 1.6424021838034577, "grad_norm": 0.6479111217338874, "learning_rate": 7.566949922667141e-06, "loss": 0.0289, "step": 3610 }, { "epoch": 1.6428571428571428, "grad_norm": 0.4577092754363741, "learning_rate": 7.565723259677994e-06, "loss": 0.0264, "step": 3611 }, { "epoch": 1.643312101910828, "grad_norm": 0.7670756095284599, "learning_rate": 7.564496387029532e-06, "loss": 0.0306, "step": 3612 }, { "epoch": 1.6437670609645132, "grad_norm": 0.5175142660634636, "learning_rate": 7.563269304822005e-06, "loss": 0.024, "step": 3613 }, { "epoch": 1.6442220200181983, "grad_norm": 0.4852960351529595, "learning_rate": 7.562042013155686e-06, "loss": 0.0239, "step": 3614 }, { "epoch": 1.6446769790718836, "grad_norm": 0.6874391579402988, "learning_rate": 7.560814512130864e-06, "loss": 0.0509, "step": 3615 }, { "epoch": 1.6451319381255687, "grad_norm": 0.6884415051879906, "learning_rate": 7.559586801847845e-06, "loss": 0.0421, "step": 3616 }, { "epoch": 1.6455868971792538, "grad_norm": 0.5022590822197832, "learning_rate": 7.558358882406951e-06, "loss": 0.0358, "step": 3617 }, { "epoch": 1.646041856232939, "grad_norm": 0.7191160994079118, "learning_rate": 7.5571307539085226e-06, "loss": 0.0362, "step": 3618 }, { "epoch": 1.6464968152866242, "grad_norm": 0.5361701775024936, "learning_rate": 7.555902416452917e-06, "loss": 0.0347, "step": 3619 }, { "epoch": 1.6469517743403093, "grad_norm": 0.40469894871847095, "learning_rate": 7.55467387014051e-06, "loss": 0.0262, "step": 3620 }, { "epoch": 1.6474067333939946, "grad_norm": 0.4729190743690024, "learning_rate": 7.553445115071687e-06, "loss": 0.0298, "step": 3621 }, { "epoch": 1.6478616924476797, "grad_norm": 0.6232910519424011, "learning_rate": 7.5522161513468635e-06, "loss": 0.0419, "step": 3622 }, { "epoch": 1.6483166515013647, "grad_norm": 0.7210062870818522, "learning_rate": 7.550986979066461e-06, "loss": 0.0477, "step": 3623 }, { "epoch": 1.64877161055505, "grad_norm": 0.5437977283564177, "learning_rate": 7.549757598330925e-06, "loss": 0.0315, "step": 3624 }, { "epoch": 1.6492265696087354, "grad_norm": 0.6454727225977187, "learning_rate": 7.54852800924071e-06, "loss": 0.0468, "step": 3625 }, { "epoch": 1.6496815286624202, "grad_norm": 0.4260615747515582, "learning_rate": 7.547298211896295e-06, "loss": 0.0239, "step": 3626 }, { "epoch": 1.6501364877161055, "grad_norm": 0.48016862965087487, "learning_rate": 7.546068206398175e-06, "loss": 0.0342, "step": 3627 }, { "epoch": 1.6505914467697909, "grad_norm": 0.7138927769394839, "learning_rate": 7.544837992846856e-06, "loss": 0.0471, "step": 3628 }, { "epoch": 1.6510464058234757, "grad_norm": 0.7630868706471562, "learning_rate": 7.543607571342873e-06, "loss": 0.0312, "step": 3629 }, { "epoch": 1.651501364877161, "grad_norm": 0.44730595153415403, "learning_rate": 7.542376941986765e-06, "loss": 0.0219, "step": 3630 }, { "epoch": 1.6519563239308463, "grad_norm": 0.5364651255825534, "learning_rate": 7.541146104879093e-06, "loss": 0.0382, "step": 3631 }, { "epoch": 1.6524112829845314, "grad_norm": 0.41349870049114545, "learning_rate": 7.5399150601204375e-06, "loss": 0.021, "step": 3632 }, { "epoch": 1.6528662420382165, "grad_norm": 0.44535310210219453, "learning_rate": 7.538683807811393e-06, "loss": 0.025, "step": 3633 }, { "epoch": 1.6533212010919018, "grad_norm": 0.38034270281211774, "learning_rate": 7.537452348052574e-06, "loss": 0.0232, "step": 3634 }, { "epoch": 1.653776160145587, "grad_norm": 0.41145064601774145, "learning_rate": 7.536220680944608e-06, "loss": 0.0266, "step": 3635 }, { "epoch": 1.654231119199272, "grad_norm": 0.5573614158305944, "learning_rate": 7.534988806588139e-06, "loss": 0.0279, "step": 3636 }, { "epoch": 1.6546860782529573, "grad_norm": 0.6441991234238742, "learning_rate": 7.533756725083836e-06, "loss": 0.0415, "step": 3637 }, { "epoch": 1.6551410373066424, "grad_norm": 0.46683199273540027, "learning_rate": 7.532524436532373e-06, "loss": 0.022, "step": 3638 }, { "epoch": 1.6555959963603275, "grad_norm": 0.7897915034320098, "learning_rate": 7.531291941034451e-06, "loss": 0.0481, "step": 3639 }, { "epoch": 1.6560509554140128, "grad_norm": 0.5629722196073099, "learning_rate": 7.530059238690783e-06, "loss": 0.0401, "step": 3640 }, { "epoch": 1.656505914467698, "grad_norm": 0.7125776182901007, "learning_rate": 7.528826329602099e-06, "loss": 0.0352, "step": 3641 }, { "epoch": 1.656960873521383, "grad_norm": 0.5622661248270417, "learning_rate": 7.5275932138691485e-06, "loss": 0.0332, "step": 3642 }, { "epoch": 1.6574158325750683, "grad_norm": 0.7151312448681683, "learning_rate": 7.5263598915926934e-06, "loss": 0.038, "step": 3643 }, { "epoch": 1.6578707916287534, "grad_norm": 0.5343940111933614, "learning_rate": 7.525126362873519e-06, "loss": 0.0388, "step": 3644 }, { "epoch": 1.6583257506824385, "grad_norm": 0.5572916031792711, "learning_rate": 7.5238926278124195e-06, "loss": 0.0305, "step": 3645 }, { "epoch": 1.6587807097361238, "grad_norm": 0.6268588453699686, "learning_rate": 7.522658686510214e-06, "loss": 0.0367, "step": 3646 }, { "epoch": 1.6592356687898089, "grad_norm": 0.6407086406233048, "learning_rate": 7.521424539067732e-06, "loss": 0.0349, "step": 3647 }, { "epoch": 1.659690627843494, "grad_norm": 0.7598194861893685, "learning_rate": 7.520190185585823e-06, "loss": 0.0473, "step": 3648 }, { "epoch": 1.6601455868971793, "grad_norm": 0.4939114369203625, "learning_rate": 7.518955626165354e-06, "loss": 0.0376, "step": 3649 }, { "epoch": 1.6606005459508644, "grad_norm": 0.5009430676312933, "learning_rate": 7.517720860907205e-06, "loss": 0.0323, "step": 3650 }, { "epoch": 1.6610555050045495, "grad_norm": 0.4835427088563686, "learning_rate": 7.51648588991228e-06, "loss": 0.029, "step": 3651 }, { "epoch": 1.6615104640582348, "grad_norm": 0.5264831926178674, "learning_rate": 7.51525071328149e-06, "loss": 0.0367, "step": 3652 }, { "epoch": 1.66196542311192, "grad_norm": 0.4817533345459221, "learning_rate": 7.514015331115772e-06, "loss": 0.0303, "step": 3653 }, { "epoch": 1.662420382165605, "grad_norm": 0.46866664090793914, "learning_rate": 7.512779743516073e-06, "loss": 0.0381, "step": 3654 }, { "epoch": 1.6628753412192903, "grad_norm": 0.417154929021052, "learning_rate": 7.511543950583362e-06, "loss": 0.0216, "step": 3655 }, { "epoch": 1.6633303002729756, "grad_norm": 0.524657118523136, "learning_rate": 7.5103079524186206e-06, "loss": 0.0297, "step": 3656 }, { "epoch": 1.6637852593266604, "grad_norm": 0.5512423603507336, "learning_rate": 7.509071749122849e-06, "loss": 0.032, "step": 3657 }, { "epoch": 1.6642402183803457, "grad_norm": 0.6354187109786534, "learning_rate": 7.5078353407970675e-06, "loss": 0.0368, "step": 3658 }, { "epoch": 1.664695177434031, "grad_norm": 0.465755148211732, "learning_rate": 7.506598727542305e-06, "loss": 0.026, "step": 3659 }, { "epoch": 1.6651501364877161, "grad_norm": 1.2489245060589367, "learning_rate": 7.5053619094596144e-06, "loss": 0.0695, "step": 3660 }, { "epoch": 1.6656050955414012, "grad_norm": 0.4195788150765403, "learning_rate": 7.504124886650064e-06, "loss": 0.0297, "step": 3661 }, { "epoch": 1.6660600545950865, "grad_norm": 0.4520027445439969, "learning_rate": 7.5028876592147356e-06, "loss": 0.0195, "step": 3662 }, { "epoch": 1.6665150136487716, "grad_norm": 0.6079480787748858, "learning_rate": 7.501650227254731e-06, "loss": 0.0312, "step": 3663 }, { "epoch": 1.6669699727024567, "grad_norm": 1.0717457296352204, "learning_rate": 7.500412590871167e-06, "loss": 0.0504, "step": 3664 }, { "epoch": 1.667424931756142, "grad_norm": 0.43425899943762286, "learning_rate": 7.499174750165178e-06, "loss": 0.0195, "step": 3665 }, { "epoch": 1.6678798908098271, "grad_norm": 0.493687208906982, "learning_rate": 7.497936705237915e-06, "loss": 0.0355, "step": 3666 }, { "epoch": 1.6683348498635122, "grad_norm": 0.5379546390424125, "learning_rate": 7.4966984561905435e-06, "loss": 0.0359, "step": 3667 }, { "epoch": 1.6687898089171975, "grad_norm": 0.49000291706954646, "learning_rate": 7.49546000312425e-06, "loss": 0.029, "step": 3668 }, { "epoch": 1.6692447679708826, "grad_norm": 0.7850925100883417, "learning_rate": 7.494221346140234e-06, "loss": 0.0648, "step": 3669 }, { "epoch": 1.6696997270245677, "grad_norm": 0.3895801016241115, "learning_rate": 7.4929824853397135e-06, "loss": 0.0176, "step": 3670 }, { "epoch": 1.670154686078253, "grad_norm": 0.5834752225965807, "learning_rate": 7.4917434208239235e-06, "loss": 0.0317, "step": 3671 }, { "epoch": 1.670609645131938, "grad_norm": 0.4548464902681102, "learning_rate": 7.490504152694113e-06, "loss": 0.0313, "step": 3672 }, { "epoch": 1.6710646041856232, "grad_norm": 0.6896247279139939, "learning_rate": 7.489264681051551e-06, "loss": 0.0448, "step": 3673 }, { "epoch": 1.6715195632393085, "grad_norm": 0.39980420203978523, "learning_rate": 7.488025005997519e-06, "loss": 0.0267, "step": 3674 }, { "epoch": 1.6719745222929936, "grad_norm": 0.6586268912096953, "learning_rate": 7.486785127633321e-06, "loss": 0.0368, "step": 3675 }, { "epoch": 1.6724294813466787, "grad_norm": 0.43582639609296714, "learning_rate": 7.485545046060272e-06, "loss": 0.0262, "step": 3676 }, { "epoch": 1.672884440400364, "grad_norm": 0.5879108612573464, "learning_rate": 7.484304761379706e-06, "loss": 0.0329, "step": 3677 }, { "epoch": 1.673339399454049, "grad_norm": 0.4684434902017037, "learning_rate": 7.4830642736929745e-06, "loss": 0.0326, "step": 3678 }, { "epoch": 1.6737943585077342, "grad_norm": 0.5530781122820492, "learning_rate": 7.481823583101444e-06, "loss": 0.0238, "step": 3679 }, { "epoch": 1.6742493175614195, "grad_norm": 0.6103374408082517, "learning_rate": 7.4805826897064985e-06, "loss": 0.0371, "step": 3680 }, { "epoch": 1.6747042766151048, "grad_norm": 0.574122694602557, "learning_rate": 7.479341593609535e-06, "loss": 0.0377, "step": 3681 }, { "epoch": 1.6751592356687897, "grad_norm": 0.6200957468248504, "learning_rate": 7.478100294911977e-06, "loss": 0.0433, "step": 3682 }, { "epoch": 1.675614194722475, "grad_norm": 0.6642367843460708, "learning_rate": 7.476858793715252e-06, "loss": 0.0407, "step": 3683 }, { "epoch": 1.6760691537761603, "grad_norm": 0.696515604442919, "learning_rate": 7.475617090120811e-06, "loss": 0.035, "step": 3684 }, { "epoch": 1.6765241128298451, "grad_norm": 0.5790866750331873, "learning_rate": 7.4743751842301225e-06, "loss": 0.0299, "step": 3685 }, { "epoch": 1.6769790718835305, "grad_norm": 0.4894119701250759, "learning_rate": 7.473133076144667e-06, "loss": 0.0307, "step": 3686 }, { "epoch": 1.6774340309372158, "grad_norm": 0.4775329874958323, "learning_rate": 7.471890765965947e-06, "loss": 0.0277, "step": 3687 }, { "epoch": 1.6778889899909009, "grad_norm": 0.8596143001090791, "learning_rate": 7.470648253795475e-06, "loss": 0.0443, "step": 3688 }, { "epoch": 1.678343949044586, "grad_norm": 0.47716517673981357, "learning_rate": 7.469405539734786e-06, "loss": 0.0175, "step": 3689 }, { "epoch": 1.6787989080982713, "grad_norm": 0.5635174716071238, "learning_rate": 7.468162623885428e-06, "loss": 0.0341, "step": 3690 }, { "epoch": 1.6792538671519563, "grad_norm": 0.48265942529925676, "learning_rate": 7.466919506348964e-06, "loss": 0.0263, "step": 3691 }, { "epoch": 1.6797088262056414, "grad_norm": 0.4995704136788404, "learning_rate": 7.465676187226981e-06, "loss": 0.0327, "step": 3692 }, { "epoch": 1.6801637852593267, "grad_norm": 0.539650849512318, "learning_rate": 7.464432666621074e-06, "loss": 0.0336, "step": 3693 }, { "epoch": 1.6806187443130118, "grad_norm": 0.5180565169948113, "learning_rate": 7.4631889446328595e-06, "loss": 0.0316, "step": 3694 }, { "epoch": 1.681073703366697, "grad_norm": 0.6102235388801855, "learning_rate": 7.461945021363968e-06, "loss": 0.0397, "step": 3695 }, { "epoch": 1.6815286624203822, "grad_norm": 0.4209026490907827, "learning_rate": 7.460700896916047e-06, "loss": 0.0211, "step": 3696 }, { "epoch": 1.6819836214740673, "grad_norm": 0.7313054141207679, "learning_rate": 7.459456571390762e-06, "loss": 0.0393, "step": 3697 }, { "epoch": 1.6824385805277524, "grad_norm": 0.3649702874908748, "learning_rate": 7.4582120448897896e-06, "loss": 0.0212, "step": 3698 }, { "epoch": 1.6828935395814377, "grad_norm": 0.6424573468685715, "learning_rate": 7.456967317514834e-06, "loss": 0.0321, "step": 3699 }, { "epoch": 1.6833484986351228, "grad_norm": 0.6293356488965565, "learning_rate": 7.455722389367603e-06, "loss": 0.0273, "step": 3700 }, { "epoch": 1.683803457688808, "grad_norm": 0.7235481703397522, "learning_rate": 7.454477260549828e-06, "loss": 0.0327, "step": 3701 }, { "epoch": 1.6842584167424932, "grad_norm": 0.6999848465561391, "learning_rate": 7.453231931163256e-06, "loss": 0.0624, "step": 3702 }, { "epoch": 1.6847133757961783, "grad_norm": 12.090077564639325, "learning_rate": 7.45198640130965e-06, "loss": 0.4276, "step": 3703 }, { "epoch": 1.6851683348498634, "grad_norm": 0.5455210735996605, "learning_rate": 7.450740671090788e-06, "loss": 0.0357, "step": 3704 }, { "epoch": 1.6856232939035487, "grad_norm": 0.5024928241770569, "learning_rate": 7.449494740608465e-06, "loss": 0.0339, "step": 3705 }, { "epoch": 1.686078252957234, "grad_norm": 0.4273476781631383, "learning_rate": 7.448248609964495e-06, "loss": 0.0252, "step": 3706 }, { "epoch": 1.6865332120109189, "grad_norm": 0.5009529731705445, "learning_rate": 7.447002279260704e-06, "loss": 0.026, "step": 3707 }, { "epoch": 1.6869881710646042, "grad_norm": 0.4953825250807911, "learning_rate": 7.445755748598938e-06, "loss": 0.0299, "step": 3708 }, { "epoch": 1.6874431301182895, "grad_norm": 0.5099196504130769, "learning_rate": 7.444509018081054e-06, "loss": 0.0285, "step": 3709 }, { "epoch": 1.6878980891719744, "grad_norm": 0.44873344215081523, "learning_rate": 7.443262087808936e-06, "loss": 0.0309, "step": 3710 }, { "epoch": 1.6883530482256597, "grad_norm": 0.4804221750169978, "learning_rate": 7.442014957884473e-06, "loss": 0.0321, "step": 3711 }, { "epoch": 1.688808007279345, "grad_norm": 0.5431087843128045, "learning_rate": 7.440767628409575e-06, "loss": 0.0398, "step": 3712 }, { "epoch": 1.68926296633303, "grad_norm": 0.43205749502925345, "learning_rate": 7.439520099486168e-06, "loss": 0.032, "step": 3713 }, { "epoch": 1.6897179253867152, "grad_norm": 0.7020261750553709, "learning_rate": 7.438272371216198e-06, "loss": 0.052, "step": 3714 }, { "epoch": 1.6901728844404005, "grad_norm": 0.40232785104425167, "learning_rate": 7.437024443701619e-06, "loss": 0.0259, "step": 3715 }, { "epoch": 1.6906278434940856, "grad_norm": 0.392818290322344, "learning_rate": 7.435776317044408e-06, "loss": 0.0276, "step": 3716 }, { "epoch": 1.6910828025477707, "grad_norm": 0.4762436487265989, "learning_rate": 7.434527991346556e-06, "loss": 0.0309, "step": 3717 }, { "epoch": 1.691537761601456, "grad_norm": 0.7231433730579884, "learning_rate": 7.433279466710071e-06, "loss": 0.0406, "step": 3718 }, { "epoch": 1.691992720655141, "grad_norm": 0.5651212308899453, "learning_rate": 7.432030743236977e-06, "loss": 0.0403, "step": 3719 }, { "epoch": 1.6924476797088261, "grad_norm": 0.5993536332498526, "learning_rate": 7.430781821029313e-06, "loss": 0.0254, "step": 3720 }, { "epoch": 1.6929026387625115, "grad_norm": 0.6161358572154612, "learning_rate": 7.4295327001891384e-06, "loss": 0.0301, "step": 3721 }, { "epoch": 1.6933575978161965, "grad_norm": 0.5199384098068588, "learning_rate": 7.428283380818521e-06, "loss": 0.0391, "step": 3722 }, { "epoch": 1.6938125568698816, "grad_norm": 0.5206027617785847, "learning_rate": 7.42703386301955e-06, "loss": 0.031, "step": 3723 }, { "epoch": 1.694267515923567, "grad_norm": 0.8161105210637801, "learning_rate": 7.4257841468943355e-06, "loss": 0.0517, "step": 3724 }, { "epoch": 1.694722474977252, "grad_norm": 0.4986248164130022, "learning_rate": 7.424534232544993e-06, "loss": 0.0238, "step": 3725 }, { "epoch": 1.6951774340309371, "grad_norm": 0.6150288798515563, "learning_rate": 7.423284120073664e-06, "loss": 0.0371, "step": 3726 }, { "epoch": 1.6956323930846224, "grad_norm": 0.3854498238738396, "learning_rate": 7.422033809582498e-06, "loss": 0.0229, "step": 3727 }, { "epoch": 1.6960873521383075, "grad_norm": 0.5497475163150745, "learning_rate": 7.4207833011736685e-06, "loss": 0.0333, "step": 3728 }, { "epoch": 1.6965423111919926, "grad_norm": 0.546075976887808, "learning_rate": 7.419532594949359e-06, "loss": 0.0391, "step": 3729 }, { "epoch": 1.696997270245678, "grad_norm": 0.5503279076609637, "learning_rate": 7.4182816910117724e-06, "loss": 0.032, "step": 3730 }, { "epoch": 1.697452229299363, "grad_norm": 0.44577137948939505, "learning_rate": 7.417030589463128e-06, "loss": 0.0229, "step": 3731 }, { "epoch": 1.697907188353048, "grad_norm": 0.4637424150287877, "learning_rate": 7.415779290405658e-06, "loss": 0.0276, "step": 3732 }, { "epoch": 1.6983621474067334, "grad_norm": 0.5116482751824055, "learning_rate": 7.414527793941614e-06, "loss": 0.031, "step": 3733 }, { "epoch": 1.6988171064604187, "grad_norm": 0.42887680446126897, "learning_rate": 7.413276100173262e-06, "loss": 0.0299, "step": 3734 }, { "epoch": 1.6992720655141036, "grad_norm": 0.4103477386094312, "learning_rate": 7.412024209202887e-06, "loss": 0.0241, "step": 3735 }, { "epoch": 1.699727024567789, "grad_norm": 0.34985347095670377, "learning_rate": 7.410772121132785e-06, "loss": 0.0212, "step": 3736 }, { "epoch": 1.7001819836214742, "grad_norm": 0.867876090579481, "learning_rate": 7.409519836065272e-06, "loss": 0.0519, "step": 3737 }, { "epoch": 1.700636942675159, "grad_norm": 0.5730098932055396, "learning_rate": 7.4082673541026805e-06, "loss": 0.0419, "step": 3738 }, { "epoch": 1.7010919017288444, "grad_norm": 0.4719630921045215, "learning_rate": 7.407014675347356e-06, "loss": 0.0255, "step": 3739 }, { "epoch": 1.7015468607825297, "grad_norm": 0.6245364317243467, "learning_rate": 7.405761799901662e-06, "loss": 0.0426, "step": 3740 }, { "epoch": 1.7020018198362148, "grad_norm": 0.5465632899661947, "learning_rate": 7.404508727867978e-06, "loss": 0.0329, "step": 3741 }, { "epoch": 1.7024567788898999, "grad_norm": 1.5070858916095438, "learning_rate": 7.403255459348699e-06, "loss": 0.058, "step": 3742 }, { "epoch": 1.7029117379435852, "grad_norm": 0.5022020296420143, "learning_rate": 7.402001994446237e-06, "loss": 0.0227, "step": 3743 }, { "epoch": 1.7033666969972703, "grad_norm": 0.5805315084933184, "learning_rate": 7.400748333263019e-06, "loss": 0.0343, "step": 3744 }, { "epoch": 1.7038216560509554, "grad_norm": 0.7410975613111707, "learning_rate": 7.399494475901491e-06, "loss": 0.0482, "step": 3745 }, { "epoch": 1.7042766151046407, "grad_norm": 0.6576165214272308, "learning_rate": 7.398240422464109e-06, "loss": 0.0331, "step": 3746 }, { "epoch": 1.7047315741583258, "grad_norm": 0.48532107166258226, "learning_rate": 7.396986173053349e-06, "loss": 0.028, "step": 3747 }, { "epoch": 1.7051865332120109, "grad_norm": 0.5585289141774178, "learning_rate": 7.395731727771705e-06, "loss": 0.0237, "step": 3748 }, { "epoch": 1.7056414922656962, "grad_norm": 0.6547560638585251, "learning_rate": 7.394477086721683e-06, "loss": 0.0366, "step": 3749 }, { "epoch": 1.7060964513193813, "grad_norm": 0.6673639283206183, "learning_rate": 7.393222250005807e-06, "loss": 0.0381, "step": 3750 }, { "epoch": 1.7065514103730663, "grad_norm": 0.6628156221749939, "learning_rate": 7.391967217726616e-06, "loss": 0.0263, "step": 3751 }, { "epoch": 1.7070063694267517, "grad_norm": 0.319113418020921, "learning_rate": 7.390711989986667e-06, "loss": 0.0104, "step": 3752 }, { "epoch": 1.7074613284804367, "grad_norm": 0.4874963972768766, "learning_rate": 7.389456566888529e-06, "loss": 0.0297, "step": 3753 }, { "epoch": 1.7079162875341218, "grad_norm": 0.6623867553871146, "learning_rate": 7.3882009485347915e-06, "loss": 0.0313, "step": 3754 }, { "epoch": 1.7083712465878071, "grad_norm": 0.6948367292138308, "learning_rate": 7.386945135028058e-06, "loss": 0.0353, "step": 3755 }, { "epoch": 1.7088262056414922, "grad_norm": 0.38970023134785947, "learning_rate": 7.385689126470948e-06, "loss": 0.0227, "step": 3756 }, { "epoch": 1.7092811646951773, "grad_norm": 0.5652773203658376, "learning_rate": 7.384432922966094e-06, "loss": 0.0352, "step": 3757 }, { "epoch": 1.7097361237488626, "grad_norm": 0.45850523783482594, "learning_rate": 7.383176524616151e-06, "loss": 0.0212, "step": 3758 }, { "epoch": 1.7101910828025477, "grad_norm": 0.745749844037621, "learning_rate": 7.381919931523786e-06, "loss": 0.0483, "step": 3759 }, { "epoch": 1.7106460418562328, "grad_norm": 0.5571287482700786, "learning_rate": 7.3806631437916795e-06, "loss": 0.0369, "step": 3760 }, { "epoch": 1.7111010009099181, "grad_norm": 0.33863255848977414, "learning_rate": 7.379406161522531e-06, "loss": 0.0145, "step": 3761 }, { "epoch": 1.7115559599636034, "grad_norm": 0.601596026810322, "learning_rate": 7.378148984819058e-06, "loss": 0.0402, "step": 3762 }, { "epoch": 1.7120109190172883, "grad_norm": 0.6041830243958276, "learning_rate": 7.376891613783987e-06, "loss": 0.0382, "step": 3763 }, { "epoch": 1.7124658780709736, "grad_norm": 0.5590057572877777, "learning_rate": 7.37563404852007e-06, "loss": 0.0322, "step": 3764 }, { "epoch": 1.712920837124659, "grad_norm": 0.5783338305509084, "learning_rate": 7.374376289130066e-06, "loss": 0.0308, "step": 3765 }, { "epoch": 1.7133757961783438, "grad_norm": 0.44184246273165895, "learning_rate": 7.373118335716755e-06, "loss": 0.0328, "step": 3766 }, { "epoch": 1.713830755232029, "grad_norm": 0.608815547149246, "learning_rate": 7.37186018838293e-06, "loss": 0.0322, "step": 3767 }, { "epoch": 1.7142857142857144, "grad_norm": 0.7043357325734398, "learning_rate": 7.3706018472314e-06, "loss": 0.0448, "step": 3768 }, { "epoch": 1.7147406733393995, "grad_norm": 0.5870414993849291, "learning_rate": 7.369343312364994e-06, "loss": 0.0275, "step": 3769 }, { "epoch": 1.7151956323930846, "grad_norm": 0.4144434553520093, "learning_rate": 7.3680845838865524e-06, "loss": 0.0226, "step": 3770 }, { "epoch": 1.71565059144677, "grad_norm": 0.7724336484979811, "learning_rate": 7.366825661898932e-06, "loss": 0.0523, "step": 3771 }, { "epoch": 1.716105550500455, "grad_norm": 0.44158179383570634, "learning_rate": 7.3655665465050085e-06, "loss": 0.0219, "step": 3772 }, { "epoch": 1.71656050955414, "grad_norm": 0.46164501993590573, "learning_rate": 7.364307237807669e-06, "loss": 0.0277, "step": 3773 }, { "epoch": 1.7170154686078254, "grad_norm": 0.3878859081616898, "learning_rate": 7.363047735909818e-06, "loss": 0.0174, "step": 3774 }, { "epoch": 1.7174704276615105, "grad_norm": 0.3010933687776966, "learning_rate": 7.361788040914379e-06, "loss": 0.0149, "step": 3775 }, { "epoch": 1.7179253867151956, "grad_norm": 0.5938272876776134, "learning_rate": 7.3605281529242855e-06, "loss": 0.0477, "step": 3776 }, { "epoch": 1.7183803457688809, "grad_norm": 0.5314911050681621, "learning_rate": 7.359268072042493e-06, "loss": 0.0347, "step": 3777 }, { "epoch": 1.718835304822566, "grad_norm": 0.5161094802276209, "learning_rate": 7.358007798371966e-06, "loss": 0.0271, "step": 3778 }, { "epoch": 1.719290263876251, "grad_norm": 0.43329053175996995, "learning_rate": 7.3567473320156925e-06, "loss": 0.0223, "step": 3779 }, { "epoch": 1.7197452229299364, "grad_norm": 0.4944751146860554, "learning_rate": 7.3554866730766696e-06, "loss": 0.0225, "step": 3780 }, { "epoch": 1.7202001819836215, "grad_norm": 0.44064583732303525, "learning_rate": 7.3542258216579136e-06, "loss": 0.03, "step": 3781 }, { "epoch": 1.7206551410373065, "grad_norm": 0.5157957700243019, "learning_rate": 7.3529647778624525e-06, "loss": 0.0212, "step": 3782 }, { "epoch": 1.7211101000909919, "grad_norm": 0.3945957677244866, "learning_rate": 7.351703541793338e-06, "loss": 0.0221, "step": 3783 }, { "epoch": 1.721565059144677, "grad_norm": 0.6180546881752076, "learning_rate": 7.35044211355363e-06, "loss": 0.0321, "step": 3784 }, { "epoch": 1.722020018198362, "grad_norm": 0.35332318030215154, "learning_rate": 7.3491804932464054e-06, "loss": 0.0201, "step": 3785 }, { "epoch": 1.7224749772520473, "grad_norm": 0.43770098659833917, "learning_rate": 7.347918680974761e-06, "loss": 0.0206, "step": 3786 }, { "epoch": 1.7229299363057324, "grad_norm": 0.5533036364637789, "learning_rate": 7.3466566768418045e-06, "loss": 0.0366, "step": 3787 }, { "epoch": 1.7233848953594175, "grad_norm": 0.3318485584811483, "learning_rate": 7.345394480950663e-06, "loss": 0.016, "step": 3788 }, { "epoch": 1.7238398544131028, "grad_norm": 0.526066120808399, "learning_rate": 7.344132093404474e-06, "loss": 0.0306, "step": 3789 }, { "epoch": 1.7242948134667881, "grad_norm": 0.49753343489307694, "learning_rate": 7.342869514306399e-06, "loss": 0.0245, "step": 3790 }, { "epoch": 1.724749772520473, "grad_norm": 0.45513746059510923, "learning_rate": 7.341606743759606e-06, "loss": 0.0276, "step": 3791 }, { "epoch": 1.7252047315741583, "grad_norm": 0.4081732067012268, "learning_rate": 7.340343781867285e-06, "loss": 0.0193, "step": 3792 }, { "epoch": 1.7256596906278436, "grad_norm": 0.5326445316009483, "learning_rate": 7.339080628732638e-06, "loss": 0.0338, "step": 3793 }, { "epoch": 1.7261146496815285, "grad_norm": 0.4806692608797013, "learning_rate": 7.337817284458887e-06, "loss": 0.0221, "step": 3794 }, { "epoch": 1.7265696087352138, "grad_norm": 0.4064847766577533, "learning_rate": 7.336553749149263e-06, "loss": 0.0165, "step": 3795 }, { "epoch": 1.7270245677888991, "grad_norm": 0.5186028514202176, "learning_rate": 7.33529002290702e-06, "loss": 0.031, "step": 3796 }, { "epoch": 1.7274795268425842, "grad_norm": 0.600305480294502, "learning_rate": 7.3340261058354215e-06, "loss": 0.0292, "step": 3797 }, { "epoch": 1.7279344858962693, "grad_norm": 0.45392043889591754, "learning_rate": 7.3327619980377505e-06, "loss": 0.0211, "step": 3798 }, { "epoch": 1.7283894449499546, "grad_norm": 0.5213897904145063, "learning_rate": 7.3314976996173035e-06, "loss": 0.0194, "step": 3799 }, { "epoch": 1.7288444040036397, "grad_norm": 0.4522011044749012, "learning_rate": 7.330233210677393e-06, "loss": 0.0272, "step": 3800 }, { "epoch": 1.7292993630573248, "grad_norm": 0.6462641556015574, "learning_rate": 7.32896853132135e-06, "loss": 0.0285, "step": 3801 }, { "epoch": 1.72975432211101, "grad_norm": 0.5251528408225381, "learning_rate": 7.327703661652513e-06, "loss": 0.0336, "step": 3802 }, { "epoch": 1.7302092811646952, "grad_norm": 0.4981466687834417, "learning_rate": 7.326438601774246e-06, "loss": 0.0249, "step": 3803 }, { "epoch": 1.7306642402183803, "grad_norm": 0.42897325009717824, "learning_rate": 7.325173351789923e-06, "loss": 0.0268, "step": 3804 }, { "epoch": 1.7311191992720656, "grad_norm": 0.45147714794739996, "learning_rate": 7.323907911802935e-06, "loss": 0.0225, "step": 3805 }, { "epoch": 1.7315741583257507, "grad_norm": 0.7370959239958824, "learning_rate": 7.322642281916684e-06, "loss": 0.0467, "step": 3806 }, { "epoch": 1.7320291173794358, "grad_norm": 0.4524420819643831, "learning_rate": 7.321376462234596e-06, "loss": 0.0274, "step": 3807 }, { "epoch": 1.732484076433121, "grad_norm": 0.5254868113533432, "learning_rate": 7.320110452860108e-06, "loss": 0.0329, "step": 3808 }, { "epoch": 1.7329390354868062, "grad_norm": 0.9564816030024412, "learning_rate": 7.318844253896671e-06, "loss": 0.0559, "step": 3809 }, { "epoch": 1.7333939945404913, "grad_norm": 0.5793168368997682, "learning_rate": 7.317577865447752e-06, "loss": 0.0358, "step": 3810 }, { "epoch": 1.7338489535941766, "grad_norm": 0.48460233389571516, "learning_rate": 7.316311287616837e-06, "loss": 0.0254, "step": 3811 }, { "epoch": 1.7343039126478617, "grad_norm": 0.5224913589925347, "learning_rate": 7.3150445205074235e-06, "loss": 0.0248, "step": 3812 }, { "epoch": 1.7347588717015467, "grad_norm": 0.4532280068500263, "learning_rate": 7.313777564223027e-06, "loss": 0.024, "step": 3813 }, { "epoch": 1.735213830755232, "grad_norm": 0.7398017091813359, "learning_rate": 7.3125104188671756e-06, "loss": 0.0511, "step": 3814 }, { "epoch": 1.7356687898089171, "grad_norm": 0.6215664511261365, "learning_rate": 7.311243084543418e-06, "loss": 0.0357, "step": 3815 }, { "epoch": 1.7361237488626022, "grad_norm": 0.5734922269319896, "learning_rate": 7.309975561355312e-06, "loss": 0.0333, "step": 3816 }, { "epoch": 1.7365787079162875, "grad_norm": 0.48964645948557484, "learning_rate": 7.308707849406434e-06, "loss": 0.0256, "step": 3817 }, { "epoch": 1.7370336669699729, "grad_norm": 0.419383294955176, "learning_rate": 7.3074399488003786e-06, "loss": 0.0169, "step": 3818 }, { "epoch": 1.7374886260236577, "grad_norm": 0.7285131334252631, "learning_rate": 7.306171859640749e-06, "loss": 0.0384, "step": 3819 }, { "epoch": 1.737943585077343, "grad_norm": 0.4592919908652582, "learning_rate": 7.304903582031171e-06, "loss": 0.0283, "step": 3820 }, { "epoch": 1.7383985441310283, "grad_norm": 4.09629691031824, "learning_rate": 7.30363511607528e-06, "loss": 0.0879, "step": 3821 }, { "epoch": 1.7388535031847132, "grad_norm": 0.5226168373603701, "learning_rate": 7.302366461876731e-06, "loss": 0.0284, "step": 3822 }, { "epoch": 1.7393084622383985, "grad_norm": 0.5448610135045856, "learning_rate": 7.301097619539193e-06, "loss": 0.0291, "step": 3823 }, { "epoch": 1.7397634212920838, "grad_norm": 0.7777497169855851, "learning_rate": 7.2998285891663465e-06, "loss": 0.0504, "step": 3824 }, { "epoch": 1.740218380345769, "grad_norm": 0.4599172925615985, "learning_rate": 7.298559370861896e-06, "loss": 0.0239, "step": 3825 }, { "epoch": 1.740673339399454, "grad_norm": 0.7806211292664905, "learning_rate": 7.297289964729554e-06, "loss": 0.0581, "step": 3826 }, { "epoch": 1.7411282984531393, "grad_norm": 0.6073930539328547, "learning_rate": 7.29602037087305e-06, "loss": 0.038, "step": 3827 }, { "epoch": 1.7415832575068244, "grad_norm": 0.4942760634250622, "learning_rate": 7.294750589396129e-06, "loss": 0.0278, "step": 3828 }, { "epoch": 1.7420382165605095, "grad_norm": 0.5097614043226851, "learning_rate": 7.293480620402553e-06, "loss": 0.0327, "step": 3829 }, { "epoch": 1.7424931756141948, "grad_norm": 0.677626075495248, "learning_rate": 7.2922104639961e-06, "loss": 0.0428, "step": 3830 }, { "epoch": 1.74294813466788, "grad_norm": 0.42937454159522365, "learning_rate": 7.290940120280557e-06, "loss": 0.0245, "step": 3831 }, { "epoch": 1.743403093721565, "grad_norm": 0.7118640826950644, "learning_rate": 7.2896695893597344e-06, "loss": 0.0443, "step": 3832 }, { "epoch": 1.7438580527752503, "grad_norm": 0.7972677619215067, "learning_rate": 7.288398871337453e-06, "loss": 0.0465, "step": 3833 }, { "epoch": 1.7443130118289354, "grad_norm": 0.6170583394100522, "learning_rate": 7.28712796631755e-06, "loss": 0.0465, "step": 3834 }, { "epoch": 1.7447679708826205, "grad_norm": 0.4855998128585689, "learning_rate": 7.285856874403878e-06, "loss": 0.0291, "step": 3835 }, { "epoch": 1.7452229299363058, "grad_norm": 0.6098175387505606, "learning_rate": 7.284585595700306e-06, "loss": 0.0409, "step": 3836 }, { "epoch": 1.7456778889899909, "grad_norm": 0.5051469778613001, "learning_rate": 7.283314130310716e-06, "loss": 0.029, "step": 3837 }, { "epoch": 1.746132848043676, "grad_norm": 0.7383032973710374, "learning_rate": 7.282042478339005e-06, "loss": 0.0346, "step": 3838 }, { "epoch": 1.7465878070973613, "grad_norm": 0.407879313889853, "learning_rate": 7.2807706398890895e-06, "loss": 0.0202, "step": 3839 }, { "epoch": 1.7470427661510464, "grad_norm": 0.5353921163068872, "learning_rate": 7.279498615064897e-06, "loss": 0.0244, "step": 3840 }, { "epoch": 1.7474977252047315, "grad_norm": 0.5249889217718524, "learning_rate": 7.278226403970371e-06, "loss": 0.0303, "step": 3841 }, { "epoch": 1.7479526842584168, "grad_norm": 0.5250520409728981, "learning_rate": 7.276954006709473e-06, "loss": 0.0195, "step": 3842 }, { "epoch": 1.7484076433121019, "grad_norm": 0.5503025135768406, "learning_rate": 7.275681423386176e-06, "loss": 0.0308, "step": 3843 }, { "epoch": 1.748862602365787, "grad_norm": 0.4540938115527505, "learning_rate": 7.27440865410447e-06, "loss": 0.031, "step": 3844 }, { "epoch": 1.7493175614194723, "grad_norm": 0.4608493774662213, "learning_rate": 7.273135698968359e-06, "loss": 0.033, "step": 3845 }, { "epoch": 1.7497725204731576, "grad_norm": 0.5434375722057718, "learning_rate": 7.271862558081865e-06, "loss": 0.0398, "step": 3846 }, { "epoch": 1.7502274795268424, "grad_norm": 0.5014557314754617, "learning_rate": 7.270589231549022e-06, "loss": 0.0411, "step": 3847 }, { "epoch": 1.7506824385805277, "grad_norm": 0.5605891749344477, "learning_rate": 7.269315719473879e-06, "loss": 0.0378, "step": 3848 }, { "epoch": 1.751137397634213, "grad_norm": 0.6081085150245663, "learning_rate": 7.268042021960508e-06, "loss": 0.0412, "step": 3849 }, { "epoch": 1.7515923566878981, "grad_norm": 0.667465979501546, "learning_rate": 7.266768139112982e-06, "loss": 0.0497, "step": 3850 }, { "epoch": 1.7520473157415832, "grad_norm": 0.40853600862540174, "learning_rate": 7.265494071035401e-06, "loss": 0.0244, "step": 3851 }, { "epoch": 1.7525022747952685, "grad_norm": 0.5287548034772686, "learning_rate": 7.264219817831875e-06, "loss": 0.0385, "step": 3852 }, { "epoch": 1.7529572338489536, "grad_norm": 0.5795926874902304, "learning_rate": 7.262945379606532e-06, "loss": 0.0311, "step": 3853 }, { "epoch": 1.7534121929026387, "grad_norm": 0.5696107705215291, "learning_rate": 7.261670756463511e-06, "loss": 0.0447, "step": 3854 }, { "epoch": 1.753867151956324, "grad_norm": 0.53296867648118, "learning_rate": 7.260395948506969e-06, "loss": 0.0387, "step": 3855 }, { "epoch": 1.7543221110100091, "grad_norm": 0.7261108823709813, "learning_rate": 7.259120955841079e-06, "loss": 0.0374, "step": 3856 }, { "epoch": 1.7547770700636942, "grad_norm": 0.42299923831898156, "learning_rate": 7.257845778570025e-06, "loss": 0.0265, "step": 3857 }, { "epoch": 1.7552320291173795, "grad_norm": 0.5908684889670753, "learning_rate": 7.256570416798012e-06, "loss": 0.0459, "step": 3858 }, { "epoch": 1.7556869881710646, "grad_norm": 0.5168098805178122, "learning_rate": 7.255294870629255e-06, "loss": 0.0311, "step": 3859 }, { "epoch": 1.7561419472247497, "grad_norm": 0.49943533767271087, "learning_rate": 7.254019140167985e-06, "loss": 0.0345, "step": 3860 }, { "epoch": 1.756596906278435, "grad_norm": 0.6434720360032985, "learning_rate": 7.252743225518451e-06, "loss": 0.0347, "step": 3861 }, { "epoch": 1.75705186533212, "grad_norm": 0.3347064769600789, "learning_rate": 7.251467126784913e-06, "loss": 0.0134, "step": 3862 }, { "epoch": 1.7575068243858052, "grad_norm": 0.6959580811097871, "learning_rate": 7.2501908440716495e-06, "loss": 0.0383, "step": 3863 }, { "epoch": 1.7579617834394905, "grad_norm": 0.7211232218619601, "learning_rate": 7.248914377482952e-06, "loss": 0.0521, "step": 3864 }, { "epoch": 1.7584167424931756, "grad_norm": 0.5339237434249615, "learning_rate": 7.247637727123127e-06, "loss": 0.0331, "step": 3865 }, { "epoch": 1.7588717015468607, "grad_norm": 0.5061058168841682, "learning_rate": 7.246360893096497e-06, "loss": 0.0185, "step": 3866 }, { "epoch": 1.759326660600546, "grad_norm": 0.6438152399441944, "learning_rate": 7.245083875507399e-06, "loss": 0.037, "step": 3867 }, { "epoch": 1.759781619654231, "grad_norm": 0.46135233899654055, "learning_rate": 7.243806674460187e-06, "loss": 0.0266, "step": 3868 }, { "epoch": 1.7602365787079162, "grad_norm": 0.5348484519345554, "learning_rate": 7.242529290059226e-06, "loss": 0.0313, "step": 3869 }, { "epoch": 1.7606915377616015, "grad_norm": 0.6355208997437304, "learning_rate": 7.241251722408897e-06, "loss": 0.0418, "step": 3870 }, { "epoch": 1.7611464968152868, "grad_norm": 0.49532919357280714, "learning_rate": 7.239973971613601e-06, "loss": 0.0198, "step": 3871 }, { "epoch": 1.7616014558689717, "grad_norm": 0.5490859209662113, "learning_rate": 7.238696037777746e-06, "loss": 0.0217, "step": 3872 }, { "epoch": 1.762056414922657, "grad_norm": 0.44798778875478795, "learning_rate": 7.237417921005762e-06, "loss": 0.0301, "step": 3873 }, { "epoch": 1.7625113739763423, "grad_norm": 0.5596271551200658, "learning_rate": 7.236139621402087e-06, "loss": 0.0454, "step": 3874 }, { "epoch": 1.7629663330300271, "grad_norm": 0.6046555536937368, "learning_rate": 7.234861139071184e-06, "loss": 0.0415, "step": 3875 }, { "epoch": 1.7634212920837125, "grad_norm": 0.5784244809042435, "learning_rate": 7.23358247411752e-06, "loss": 0.0365, "step": 3876 }, { "epoch": 1.7638762511373978, "grad_norm": 0.6506098894981671, "learning_rate": 7.232303626645582e-06, "loss": 0.0332, "step": 3877 }, { "epoch": 1.7643312101910829, "grad_norm": 0.517598977042059, "learning_rate": 7.231024596759874e-06, "loss": 0.0293, "step": 3878 }, { "epoch": 1.764786169244768, "grad_norm": 0.5031709596106537, "learning_rate": 7.229745384564909e-06, "loss": 0.031, "step": 3879 }, { "epoch": 1.7652411282984533, "grad_norm": 0.5939459609265596, "learning_rate": 7.228465990165222e-06, "loss": 0.0368, "step": 3880 }, { "epoch": 1.7656960873521383, "grad_norm": 0.8032766875075776, "learning_rate": 7.227186413665359e-06, "loss": 0.0459, "step": 3881 }, { "epoch": 1.7661510464058234, "grad_norm": 0.5038566660592529, "learning_rate": 7.225906655169879e-06, "loss": 0.0235, "step": 3882 }, { "epoch": 1.7666060054595087, "grad_norm": 0.4295582387223101, "learning_rate": 7.2246267147833585e-06, "loss": 0.0193, "step": 3883 }, { "epoch": 1.7670609645131938, "grad_norm": 0.5324112724062958, "learning_rate": 7.223346592610389e-06, "loss": 0.0288, "step": 3884 }, { "epoch": 1.767515923566879, "grad_norm": 0.5085275637911066, "learning_rate": 7.222066288755578e-06, "loss": 0.0367, "step": 3885 }, { "epoch": 1.7679708826205642, "grad_norm": 0.6991787121800683, "learning_rate": 7.220785803323544e-06, "loss": 0.0411, "step": 3886 }, { "epoch": 1.7684258416742493, "grad_norm": 0.9547185096653158, "learning_rate": 7.219505136418924e-06, "loss": 0.0654, "step": 3887 }, { "epoch": 1.7688808007279344, "grad_norm": 0.5058121552890327, "learning_rate": 7.218224288146367e-06, "loss": 0.0292, "step": 3888 }, { "epoch": 1.7693357597816197, "grad_norm": 0.45480870790423666, "learning_rate": 7.216943258610538e-06, "loss": 0.0276, "step": 3889 }, { "epoch": 1.7697907188353048, "grad_norm": 0.4458594164591656, "learning_rate": 7.215662047916118e-06, "loss": 0.0249, "step": 3890 }, { "epoch": 1.77024567788899, "grad_norm": 0.5899132330173917, "learning_rate": 7.214380656167801e-06, "loss": 0.025, "step": 3891 }, { "epoch": 1.7707006369426752, "grad_norm": 0.459963286150344, "learning_rate": 7.213099083470296e-06, "loss": 0.0311, "step": 3892 }, { "epoch": 1.7711555959963603, "grad_norm": 0.406938597420023, "learning_rate": 7.21181732992833e-06, "loss": 0.0256, "step": 3893 }, { "epoch": 1.7716105550500454, "grad_norm": 0.574528139737722, "learning_rate": 7.210535395646638e-06, "loss": 0.0366, "step": 3894 }, { "epoch": 1.7720655141037307, "grad_norm": 0.48034518659388153, "learning_rate": 7.2092532807299794e-06, "loss": 0.036, "step": 3895 }, { "epoch": 1.7725204731574158, "grad_norm": 0.5540331865344734, "learning_rate": 7.207970985283117e-06, "loss": 0.0232, "step": 3896 }, { "epoch": 1.7729754322111009, "grad_norm": 0.5929018938747233, "learning_rate": 7.206688509410838e-06, "loss": 0.0308, "step": 3897 }, { "epoch": 1.7734303912647862, "grad_norm": 0.5494749966707317, "learning_rate": 7.205405853217939e-06, "loss": 0.0219, "step": 3898 }, { "epoch": 1.7738853503184715, "grad_norm": 0.4798377241717678, "learning_rate": 7.204123016809232e-06, "loss": 0.029, "step": 3899 }, { "epoch": 1.7743403093721564, "grad_norm": 0.6834984444108951, "learning_rate": 7.202840000289548e-06, "loss": 0.0436, "step": 3900 }, { "epoch": 1.7747952684258417, "grad_norm": 0.6325235486118592, "learning_rate": 7.2015568037637255e-06, "loss": 0.043, "step": 3901 }, { "epoch": 1.775250227479527, "grad_norm": 0.6543473635592312, "learning_rate": 7.200273427336623e-06, "loss": 0.0386, "step": 3902 }, { "epoch": 1.7757051865332119, "grad_norm": 0.5494135195180709, "learning_rate": 7.198989871113113e-06, "loss": 0.0357, "step": 3903 }, { "epoch": 1.7761601455868972, "grad_norm": 0.6036518496810389, "learning_rate": 7.197706135198082e-06, "loss": 0.0294, "step": 3904 }, { "epoch": 1.7766151046405825, "grad_norm": 0.5058727125301408, "learning_rate": 7.196422219696429e-06, "loss": 0.0276, "step": 3905 }, { "epoch": 1.7770700636942676, "grad_norm": 0.5408005843276728, "learning_rate": 7.195138124713073e-06, "loss": 0.0288, "step": 3906 }, { "epoch": 1.7775250227479527, "grad_norm": 0.5570097418642844, "learning_rate": 7.193853850352941e-06, "loss": 0.0318, "step": 3907 }, { "epoch": 1.777979981801638, "grad_norm": 0.7516904407926818, "learning_rate": 7.192569396720978e-06, "loss": 0.0363, "step": 3908 }, { "epoch": 1.778434940855323, "grad_norm": 0.5426361783495182, "learning_rate": 7.1912847639221495e-06, "loss": 0.0281, "step": 3909 }, { "epoch": 1.7788898999090081, "grad_norm": 0.5988355876026382, "learning_rate": 7.189999952061424e-06, "loss": 0.0293, "step": 3910 }, { "epoch": 1.7793448589626935, "grad_norm": 0.4940570073509149, "learning_rate": 7.188714961243792e-06, "loss": 0.0215, "step": 3911 }, { "epoch": 1.7797998180163785, "grad_norm": 0.5460494752292816, "learning_rate": 7.187429791574259e-06, "loss": 0.0324, "step": 3912 }, { "epoch": 1.7802547770700636, "grad_norm": 0.5877174034961691, "learning_rate": 7.18614444315784e-06, "loss": 0.0423, "step": 3913 }, { "epoch": 1.780709736123749, "grad_norm": 0.5616239933360987, "learning_rate": 7.1848589160995705e-06, "loss": 0.0325, "step": 3914 }, { "epoch": 1.781164695177434, "grad_norm": 0.5565765328246807, "learning_rate": 7.1835732105044955e-06, "loss": 0.0274, "step": 3915 }, { "epoch": 1.7816196542311191, "grad_norm": 0.4775935924483279, "learning_rate": 7.182287326477681e-06, "loss": 0.0307, "step": 3916 }, { "epoch": 1.7820746132848044, "grad_norm": 0.4826281170996024, "learning_rate": 7.181001264124201e-06, "loss": 0.0245, "step": 3917 }, { "epoch": 1.7825295723384895, "grad_norm": 0.6381064122504433, "learning_rate": 7.179715023549145e-06, "loss": 0.0336, "step": 3918 }, { "epoch": 1.7829845313921746, "grad_norm": 0.5015086145547383, "learning_rate": 7.178428604857622e-06, "loss": 0.0276, "step": 3919 }, { "epoch": 1.78343949044586, "grad_norm": 0.4836511306822213, "learning_rate": 7.1771420081547514e-06, "loss": 0.0312, "step": 3920 }, { "epoch": 1.783894449499545, "grad_norm": 0.5672466172555591, "learning_rate": 7.175855233545669e-06, "loss": 0.0452, "step": 3921 }, { "epoch": 1.78434940855323, "grad_norm": 0.5529854651973736, "learning_rate": 7.174568281135521e-06, "loss": 0.033, "step": 3922 }, { "epoch": 1.7848043676069154, "grad_norm": 0.4677563731963938, "learning_rate": 7.173281151029473e-06, "loss": 0.0284, "step": 3923 }, { "epoch": 1.7852593266606005, "grad_norm": 0.8341654528612025, "learning_rate": 7.171993843332705e-06, "loss": 0.0448, "step": 3924 }, { "epoch": 1.7857142857142856, "grad_norm": 0.7469351020696116, "learning_rate": 7.170706358150408e-06, "loss": 0.0411, "step": 3925 }, { "epoch": 1.786169244767971, "grad_norm": 0.5434962552420755, "learning_rate": 7.169418695587791e-06, "loss": 0.0438, "step": 3926 }, { "epoch": 1.7866242038216562, "grad_norm": 0.3555412569211614, "learning_rate": 7.1681308557500755e-06, "loss": 0.0214, "step": 3927 }, { "epoch": 1.787079162875341, "grad_norm": 0.5544920715175138, "learning_rate": 7.166842838742497e-06, "loss": 0.0306, "step": 3928 }, { "epoch": 1.7875341219290264, "grad_norm": 0.5786874917103659, "learning_rate": 7.165554644670307e-06, "loss": 0.042, "step": 3929 }, { "epoch": 1.7879890809827117, "grad_norm": 0.6196487285635767, "learning_rate": 7.164266273638771e-06, "loss": 0.044, "step": 3930 }, { "epoch": 1.7884440400363966, "grad_norm": 0.6477828820090044, "learning_rate": 7.162977725753169e-06, "loss": 0.0413, "step": 3931 }, { "epoch": 1.7888989990900819, "grad_norm": 0.5499920986044309, "learning_rate": 7.1616890011187945e-06, "loss": 0.039, "step": 3932 }, { "epoch": 1.7893539581437672, "grad_norm": 0.6737272879159145, "learning_rate": 7.160400099840959e-06, "loss": 0.0481, "step": 3933 }, { "epoch": 1.7898089171974523, "grad_norm": 0.6104575447990194, "learning_rate": 7.1591110220249826e-06, "loss": 0.0362, "step": 3934 }, { "epoch": 1.7902638762511374, "grad_norm": 0.455324541786891, "learning_rate": 7.157821767776203e-06, "loss": 0.0243, "step": 3935 }, { "epoch": 1.7907188353048227, "grad_norm": 0.3953682511963992, "learning_rate": 7.1565323371999725e-06, "loss": 0.0209, "step": 3936 }, { "epoch": 1.7911737943585078, "grad_norm": 0.48993108578342043, "learning_rate": 7.15524273040166e-06, "loss": 0.0243, "step": 3937 }, { "epoch": 1.7916287534121929, "grad_norm": 0.6575529662736596, "learning_rate": 7.153952947486645e-06, "loss": 0.0528, "step": 3938 }, { "epoch": 1.7920837124658782, "grad_norm": 0.615383967163662, "learning_rate": 7.152662988560322e-06, "loss": 0.0374, "step": 3939 }, { "epoch": 1.7925386715195633, "grad_norm": 0.5617255711206577, "learning_rate": 7.151372853728099e-06, "loss": 0.0404, "step": 3940 }, { "epoch": 1.7929936305732483, "grad_norm": 0.7324453621905366, "learning_rate": 7.150082543095403e-06, "loss": 0.0321, "step": 3941 }, { "epoch": 1.7934485896269337, "grad_norm": 0.6300413924268614, "learning_rate": 7.148792056767672e-06, "loss": 0.0419, "step": 3942 }, { "epoch": 1.7939035486806187, "grad_norm": 0.4149778419279514, "learning_rate": 7.147501394850357e-06, "loss": 0.0292, "step": 3943 }, { "epoch": 1.7943585077343038, "grad_norm": 0.5442232866096656, "learning_rate": 7.146210557448926e-06, "loss": 0.0333, "step": 3944 }, { "epoch": 1.7948134667879891, "grad_norm": 0.3725055725075537, "learning_rate": 7.144919544668863e-06, "loss": 0.0156, "step": 3945 }, { "epoch": 1.7952684258416742, "grad_norm": 0.6523123894505088, "learning_rate": 7.143628356615657e-06, "loss": 0.0356, "step": 3946 }, { "epoch": 1.7957233848953593, "grad_norm": 0.5848326338750129, "learning_rate": 7.142336993394825e-06, "loss": 0.0334, "step": 3947 }, { "epoch": 1.7961783439490446, "grad_norm": 0.670071924436715, "learning_rate": 7.141045455111888e-06, "loss": 0.0484, "step": 3948 }, { "epoch": 1.7966333030027297, "grad_norm": 0.5985665676795898, "learning_rate": 7.139753741872385e-06, "loss": 0.0346, "step": 3949 }, { "epoch": 1.7970882620564148, "grad_norm": 0.6010858528292592, "learning_rate": 7.13846185378187e-06, "loss": 0.0336, "step": 3950 }, { "epoch": 1.7975432211101001, "grad_norm": 0.507128365864404, "learning_rate": 7.137169790945908e-06, "loss": 0.0282, "step": 3951 }, { "epoch": 1.7979981801637852, "grad_norm": 0.5443000453471494, "learning_rate": 7.135877553470083e-06, "loss": 0.0294, "step": 3952 }, { "epoch": 1.7984531392174703, "grad_norm": 0.5191857848884796, "learning_rate": 7.134585141459991e-06, "loss": 0.033, "step": 3953 }, { "epoch": 1.7989080982711556, "grad_norm": 0.3401039408774095, "learning_rate": 7.133292555021239e-06, "loss": 0.0181, "step": 3954 }, { "epoch": 1.799363057324841, "grad_norm": 0.5449394868820434, "learning_rate": 7.131999794259454e-06, "loss": 0.0268, "step": 3955 }, { "epoch": 1.7998180163785258, "grad_norm": 0.5380343118714457, "learning_rate": 7.1307068592802745e-06, "loss": 0.0273, "step": 3956 }, { "epoch": 1.800272975432211, "grad_norm": 0.4790260426232836, "learning_rate": 7.129413750189351e-06, "loss": 0.0361, "step": 3957 }, { "epoch": 1.8007279344858964, "grad_norm": 0.7795203644523895, "learning_rate": 7.128120467092354e-06, "loss": 0.0367, "step": 3958 }, { "epoch": 1.8011828935395813, "grad_norm": 0.4963598697991848, "learning_rate": 7.126827010094962e-06, "loss": 0.0266, "step": 3959 }, { "epoch": 1.8016378525932666, "grad_norm": 0.5848640070282111, "learning_rate": 7.125533379302872e-06, "loss": 0.0408, "step": 3960 }, { "epoch": 1.802092811646952, "grad_norm": 0.538672038336608, "learning_rate": 7.1242395748217915e-06, "loss": 0.0413, "step": 3961 }, { "epoch": 1.802547770700637, "grad_norm": 0.6170672028146802, "learning_rate": 7.122945596757449e-06, "loss": 0.0462, "step": 3962 }, { "epoch": 1.803002729754322, "grad_norm": 0.685339047524874, "learning_rate": 7.121651445215577e-06, "loss": 0.0438, "step": 3963 }, { "epoch": 1.8034576888080074, "grad_norm": 0.5758149767926731, "learning_rate": 7.120357120301931e-06, "loss": 0.0418, "step": 3964 }, { "epoch": 1.8039126478616925, "grad_norm": 0.5758706450090999, "learning_rate": 7.119062622122277e-06, "loss": 0.039, "step": 3965 }, { "epoch": 1.8043676069153776, "grad_norm": 0.5164499799409003, "learning_rate": 7.117767950782394e-06, "loss": 0.0345, "step": 3966 }, { "epoch": 1.8048225659690629, "grad_norm": 0.6701744378878628, "learning_rate": 7.1164731063880775e-06, "loss": 0.0357, "step": 3967 }, { "epoch": 1.805277525022748, "grad_norm": 0.5339530812209317, "learning_rate": 7.115178089045137e-06, "loss": 0.0307, "step": 3968 }, { "epoch": 1.805732484076433, "grad_norm": 0.5616425335788408, "learning_rate": 7.1138828988593964e-06, "loss": 0.0377, "step": 3969 }, { "epoch": 1.8061874431301184, "grad_norm": 0.6328052760754227, "learning_rate": 7.112587535936691e-06, "loss": 0.035, "step": 3970 }, { "epoch": 1.8066424021838035, "grad_norm": 0.49133852954676155, "learning_rate": 7.111292000382871e-06, "loss": 0.0344, "step": 3971 }, { "epoch": 1.8070973612374885, "grad_norm": 0.5991588106482821, "learning_rate": 7.1099962923038055e-06, "loss": 0.0239, "step": 3972 }, { "epoch": 1.8075523202911739, "grad_norm": 0.5223998404891242, "learning_rate": 7.10870041180537e-06, "loss": 0.0372, "step": 3973 }, { "epoch": 1.808007279344859, "grad_norm": 0.4946982797043428, "learning_rate": 7.10740435899346e-06, "loss": 0.0314, "step": 3974 }, { "epoch": 1.808462238398544, "grad_norm": 0.6989947393162688, "learning_rate": 7.106108133973983e-06, "loss": 0.0497, "step": 3975 }, { "epoch": 1.8089171974522293, "grad_norm": 0.5312108774836449, "learning_rate": 7.104811736852861e-06, "loss": 0.0325, "step": 3976 }, { "epoch": 1.8093721565059144, "grad_norm": 0.5491635863938112, "learning_rate": 7.10351516773603e-06, "loss": 0.0219, "step": 3977 }, { "epoch": 1.8098271155595995, "grad_norm": 0.6472056450996261, "learning_rate": 7.102218426729434e-06, "loss": 0.0388, "step": 3978 }, { "epoch": 1.8102820746132848, "grad_norm": 0.52454943466676, "learning_rate": 7.1009215139390475e-06, "loss": 0.0344, "step": 3979 }, { "epoch": 1.81073703366697, "grad_norm": 0.4375468836504088, "learning_rate": 7.0996244294708395e-06, "loss": 0.0223, "step": 3980 }, { "epoch": 1.811191992720655, "grad_norm": 0.4454882660335665, "learning_rate": 7.098327173430806e-06, "loss": 0.0319, "step": 3981 }, { "epoch": 1.8116469517743403, "grad_norm": 0.7417080914051789, "learning_rate": 7.097029745924951e-06, "loss": 0.0418, "step": 3982 }, { "epoch": 1.8121019108280256, "grad_norm": 0.40394518683492026, "learning_rate": 7.095732147059295e-06, "loss": 0.018, "step": 3983 }, { "epoch": 1.8125568698817105, "grad_norm": 0.434190114216561, "learning_rate": 7.094434376939874e-06, "loss": 0.0271, "step": 3984 }, { "epoch": 1.8130118289353958, "grad_norm": 0.5649421222343399, "learning_rate": 7.093136435672731e-06, "loss": 0.0352, "step": 3985 }, { "epoch": 1.8134667879890811, "grad_norm": 0.579850191763891, "learning_rate": 7.091838323363935e-06, "loss": 0.0383, "step": 3986 }, { "epoch": 1.813921747042766, "grad_norm": 0.7918429567659951, "learning_rate": 7.090540040119556e-06, "loss": 0.0542, "step": 3987 }, { "epoch": 1.8143767060964513, "grad_norm": 0.6425928510109524, "learning_rate": 7.089241586045684e-06, "loss": 0.0357, "step": 3988 }, { "epoch": 1.8148316651501366, "grad_norm": 0.550612542420563, "learning_rate": 7.087942961248428e-06, "loss": 0.0261, "step": 3989 }, { "epoch": 1.8152866242038217, "grad_norm": 0.6018700809779689, "learning_rate": 7.086644165833899e-06, "loss": 0.044, "step": 3990 }, { "epoch": 1.8157415832575068, "grad_norm": 0.504372754878958, "learning_rate": 7.085345199908234e-06, "loss": 0.0243, "step": 3991 }, { "epoch": 1.816196542311192, "grad_norm": 0.4546000934803039, "learning_rate": 7.084046063577577e-06, "loss": 0.023, "step": 3992 }, { "epoch": 1.8166515013648772, "grad_norm": 0.4740803970183519, "learning_rate": 7.0827467569480846e-06, "loss": 0.0292, "step": 3993 }, { "epoch": 1.8171064604185623, "grad_norm": 0.6679799664830866, "learning_rate": 7.081447280125935e-06, "loss": 0.0403, "step": 3994 }, { "epoch": 1.8175614194722476, "grad_norm": 1.0216658476486054, "learning_rate": 7.080147633217311e-06, "loss": 0.0382, "step": 3995 }, { "epoch": 1.8180163785259327, "grad_norm": 0.8756873391931521, "learning_rate": 7.078847816328419e-06, "loss": 0.0296, "step": 3996 }, { "epoch": 1.8184713375796178, "grad_norm": 0.6994105648415206, "learning_rate": 7.077547829565471e-06, "loss": 0.0449, "step": 3997 }, { "epoch": 1.818926296633303, "grad_norm": 0.5970649996996983, "learning_rate": 7.076247673034696e-06, "loss": 0.0449, "step": 3998 }, { "epoch": 1.8193812556869882, "grad_norm": 0.49363215921320414, "learning_rate": 7.074947346842337e-06, "loss": 0.0301, "step": 3999 }, { "epoch": 1.8198362147406733, "grad_norm": 0.34205539276628477, "learning_rate": 7.073646851094651e-06, "loss": 0.0161, "step": 4000 }, { "epoch": 1.8202911737943586, "grad_norm": 0.4754376726073723, "learning_rate": 7.07234618589791e-06, "loss": 0.035, "step": 4001 }, { "epoch": 1.8207461328480437, "grad_norm": 0.43989711731105835, "learning_rate": 7.071045351358396e-06, "loss": 0.0275, "step": 4002 }, { "epoch": 1.8212010919017287, "grad_norm": 0.6022776161475787, "learning_rate": 7.06974434758241e-06, "loss": 0.0296, "step": 4003 }, { "epoch": 1.821656050955414, "grad_norm": 0.5856739185783606, "learning_rate": 7.068443174676262e-06, "loss": 0.0419, "step": 4004 }, { "epoch": 1.8221110100090991, "grad_norm": 0.5657144514568859, "learning_rate": 7.067141832746279e-06, "loss": 0.0323, "step": 4005 }, { "epoch": 1.8225659690627842, "grad_norm": 0.7015862599769166, "learning_rate": 7.0658403218988004e-06, "loss": 0.0408, "step": 4006 }, { "epoch": 1.8230209281164695, "grad_norm": 0.499501929992003, "learning_rate": 7.06453864224018e-06, "loss": 0.0268, "step": 4007 }, { "epoch": 1.8234758871701549, "grad_norm": 0.5065496773444177, "learning_rate": 7.063236793876785e-06, "loss": 0.0317, "step": 4008 }, { "epoch": 1.8239308462238397, "grad_norm": 0.4125612464785829, "learning_rate": 7.061934776914997e-06, "loss": 0.0289, "step": 4009 }, { "epoch": 1.824385805277525, "grad_norm": 0.38547122342302553, "learning_rate": 7.06063259146121e-06, "loss": 0.0204, "step": 4010 }, { "epoch": 1.8248407643312103, "grad_norm": 0.5094320858269077, "learning_rate": 7.0593302376218355e-06, "loss": 0.0341, "step": 4011 }, { "epoch": 1.8252957233848952, "grad_norm": 0.7171503681661315, "learning_rate": 7.058027715503292e-06, "loss": 0.0387, "step": 4012 }, { "epoch": 1.8257506824385805, "grad_norm": 0.560749436168927, "learning_rate": 7.056725025212017e-06, "loss": 0.0335, "step": 4013 }, { "epoch": 1.8262056414922658, "grad_norm": 0.5660083707755492, "learning_rate": 7.055422166854461e-06, "loss": 0.0303, "step": 4014 }, { "epoch": 1.826660600545951, "grad_norm": 0.45700289605611905, "learning_rate": 7.05411914053709e-06, "loss": 0.0315, "step": 4015 }, { "epoch": 1.827115559599636, "grad_norm": 0.5276769885577551, "learning_rate": 7.052815946366377e-06, "loss": 0.0351, "step": 4016 }, { "epoch": 1.8275705186533213, "grad_norm": 0.5502525643150208, "learning_rate": 7.051512584448815e-06, "loss": 0.0397, "step": 4017 }, { "epoch": 1.8280254777070064, "grad_norm": 0.482778868702987, "learning_rate": 7.050209054890911e-06, "loss": 0.0274, "step": 4018 }, { "epoch": 1.8284804367606915, "grad_norm": 0.5312301615199478, "learning_rate": 7.048905357799181e-06, "loss": 0.04, "step": 4019 }, { "epoch": 1.8289353958143768, "grad_norm": 0.6096548276462953, "learning_rate": 7.047601493280157e-06, "loss": 0.0283, "step": 4020 }, { "epoch": 1.829390354868062, "grad_norm": 0.47444463674377546, "learning_rate": 7.046297461440387e-06, "loss": 0.0363, "step": 4021 }, { "epoch": 1.829845313921747, "grad_norm": 0.6124965706930082, "learning_rate": 7.044993262386429e-06, "loss": 0.0409, "step": 4022 }, { "epoch": 1.8303002729754323, "grad_norm": 0.6896982876579423, "learning_rate": 7.043688896224856e-06, "loss": 0.0381, "step": 4023 }, { "epoch": 1.8307552320291174, "grad_norm": 0.51933427018194, "learning_rate": 7.042384363062257e-06, "loss": 0.0408, "step": 4024 }, { "epoch": 1.8312101910828025, "grad_norm": 0.49264804902028037, "learning_rate": 7.041079663005231e-06, "loss": 0.0296, "step": 4025 }, { "epoch": 1.8316651501364878, "grad_norm": 0.4301249590257083, "learning_rate": 7.039774796160391e-06, "loss": 0.0241, "step": 4026 }, { "epoch": 1.8321201091901729, "grad_norm": 0.3618689267935005, "learning_rate": 7.038469762634368e-06, "loss": 0.0176, "step": 4027 }, { "epoch": 1.832575068243858, "grad_norm": 0.618004674757222, "learning_rate": 7.0371645625338e-06, "loss": 0.0331, "step": 4028 }, { "epoch": 1.8330300272975433, "grad_norm": 0.49202152928312604, "learning_rate": 7.035859195965344e-06, "loss": 0.038, "step": 4029 }, { "epoch": 1.8334849863512284, "grad_norm": 0.4024547889272229, "learning_rate": 7.034553663035669e-06, "loss": 0.0257, "step": 4030 }, { "epoch": 1.8339399454049135, "grad_norm": 0.4873924197201483, "learning_rate": 7.033247963851457e-06, "loss": 0.0196, "step": 4031 }, { "epoch": 1.8343949044585988, "grad_norm": 0.49561205032728406, "learning_rate": 7.031942098519403e-06, "loss": 0.0291, "step": 4032 }, { "epoch": 1.8348498635122839, "grad_norm": 0.49352370428101255, "learning_rate": 7.030636067146217e-06, "loss": 0.0349, "step": 4033 }, { "epoch": 1.835304822565969, "grad_norm": 0.6452194492215327, "learning_rate": 7.0293298698386215e-06, "loss": 0.0385, "step": 4034 }, { "epoch": 1.8357597816196543, "grad_norm": 0.5933562882498339, "learning_rate": 7.028023506703354e-06, "loss": 0.0414, "step": 4035 }, { "epoch": 1.8362147406733396, "grad_norm": 0.4610645783735472, "learning_rate": 7.0267169778471635e-06, "loss": 0.0196, "step": 4036 }, { "epoch": 1.8366696997270244, "grad_norm": 0.6054267023221399, "learning_rate": 7.0254102833768134e-06, "loss": 0.0479, "step": 4037 }, { "epoch": 1.8371246587807097, "grad_norm": 0.5589777226125918, "learning_rate": 7.024103423399083e-06, "loss": 0.0358, "step": 4038 }, { "epoch": 1.837579617834395, "grad_norm": 0.5022138566645574, "learning_rate": 7.022796398020761e-06, "loss": 0.0339, "step": 4039 }, { "epoch": 1.83803457688808, "grad_norm": 0.6392214580188212, "learning_rate": 7.021489207348651e-06, "loss": 0.0415, "step": 4040 }, { "epoch": 1.8384895359417652, "grad_norm": 0.4152816013558989, "learning_rate": 7.020181851489574e-06, "loss": 0.031, "step": 4041 }, { "epoch": 1.8389444949954505, "grad_norm": 0.4997589182807433, "learning_rate": 7.018874330550359e-06, "loss": 0.0354, "step": 4042 }, { "epoch": 1.8393994540491356, "grad_norm": 0.4878815217419077, "learning_rate": 7.01756664463785e-06, "loss": 0.026, "step": 4043 }, { "epoch": 1.8398544131028207, "grad_norm": 0.4882957111581977, "learning_rate": 7.016258793858906e-06, "loss": 0.0279, "step": 4044 }, { "epoch": 1.840309372156506, "grad_norm": 0.5091348211222342, "learning_rate": 7.014950778320399e-06, "loss": 0.0233, "step": 4045 }, { "epoch": 1.8407643312101911, "grad_norm": 0.5165970173238531, "learning_rate": 7.013642598129213e-06, "loss": 0.035, "step": 4046 }, { "epoch": 1.8412192902638762, "grad_norm": 0.45148158952539325, "learning_rate": 7.01233425339225e-06, "loss": 0.0274, "step": 4047 }, { "epoch": 1.8416742493175615, "grad_norm": 0.47591721709121404, "learning_rate": 7.011025744216417e-06, "loss": 0.0277, "step": 4048 }, { "epoch": 1.8421292083712466, "grad_norm": 0.5657580806360729, "learning_rate": 7.0097170707086425e-06, "loss": 0.0367, "step": 4049 }, { "epoch": 1.8425841674249317, "grad_norm": 0.44240401287913805, "learning_rate": 7.008408232975865e-06, "loss": 0.0274, "step": 4050 }, { "epoch": 1.843039126478617, "grad_norm": 3.2583444075327703, "learning_rate": 7.007099231125036e-06, "loss": 0.0524, "step": 4051 }, { "epoch": 1.843494085532302, "grad_norm": 0.6144454233999067, "learning_rate": 7.005790065263123e-06, "loss": 0.0335, "step": 4052 }, { "epoch": 1.8439490445859872, "grad_norm": 0.7055540833310427, "learning_rate": 7.004480735497102e-06, "loss": 0.0316, "step": 4053 }, { "epoch": 1.8444040036396725, "grad_norm": 0.5278918909415452, "learning_rate": 7.00317124193397e-06, "loss": 0.0269, "step": 4054 }, { "epoch": 1.8448589626933576, "grad_norm": 0.6178451990779255, "learning_rate": 7.001861584680727e-06, "loss": 0.0376, "step": 4055 }, { "epoch": 1.8453139217470427, "grad_norm": 0.5258992283471937, "learning_rate": 7.000551763844399e-06, "loss": 0.0322, "step": 4056 }, { "epoch": 1.845768880800728, "grad_norm": 0.5775961051186342, "learning_rate": 6.999241779532014e-06, "loss": 0.0379, "step": 4057 }, { "epoch": 1.846223839854413, "grad_norm": 0.42259640511196034, "learning_rate": 6.997931631850619e-06, "loss": 0.0298, "step": 4058 }, { "epoch": 1.8466787989080982, "grad_norm": 0.4873844670640969, "learning_rate": 6.996621320907273e-06, "loss": 0.0241, "step": 4059 }, { "epoch": 1.8471337579617835, "grad_norm": 0.5151220931944738, "learning_rate": 6.995310846809051e-06, "loss": 0.0362, "step": 4060 }, { "epoch": 1.8475887170154686, "grad_norm": 0.4246894371554255, "learning_rate": 6.994000209663037e-06, "loss": 0.0213, "step": 4061 }, { "epoch": 1.8480436760691537, "grad_norm": 0.4666895538233693, "learning_rate": 6.99268940957633e-06, "loss": 0.0252, "step": 4062 }, { "epoch": 1.848498635122839, "grad_norm": 0.6927898464919332, "learning_rate": 6.991378446656043e-06, "loss": 0.0401, "step": 4063 }, { "epoch": 1.8489535941765243, "grad_norm": 0.6790701349214193, "learning_rate": 6.990067321009303e-06, "loss": 0.0294, "step": 4064 }, { "epoch": 1.8494085532302091, "grad_norm": 0.8008386886618306, "learning_rate": 6.9887560327432465e-06, "loss": 0.0529, "step": 4065 }, { "epoch": 1.8498635122838945, "grad_norm": 0.31982778652087174, "learning_rate": 6.9874445819650315e-06, "loss": 0.013, "step": 4066 }, { "epoch": 1.8503184713375798, "grad_norm": 0.7293561184884446, "learning_rate": 6.986132968781818e-06, "loss": 0.0426, "step": 4067 }, { "epoch": 1.8507734303912646, "grad_norm": 0.44612197517231394, "learning_rate": 6.9848211933007904e-06, "loss": 0.0267, "step": 4068 }, { "epoch": 1.85122838944495, "grad_norm": 0.5644789933631869, "learning_rate": 6.983509255629136e-06, "loss": 0.0256, "step": 4069 }, { "epoch": 1.8516833484986353, "grad_norm": 0.630824695024137, "learning_rate": 6.982197155874062e-06, "loss": 0.0353, "step": 4070 }, { "epoch": 1.8521383075523203, "grad_norm": 0.6309372938638874, "learning_rate": 6.980884894142789e-06, "loss": 0.0385, "step": 4071 }, { "epoch": 1.8525932666060054, "grad_norm": 0.549135184924981, "learning_rate": 6.979572470542549e-06, "loss": 0.0286, "step": 4072 }, { "epoch": 1.8530482256596907, "grad_norm": 0.39639697149656766, "learning_rate": 6.978259885180585e-06, "loss": 0.0288, "step": 4073 }, { "epoch": 1.8535031847133758, "grad_norm": 0.6033987036469487, "learning_rate": 6.976947138164157e-06, "loss": 0.0366, "step": 4074 }, { "epoch": 1.853958143767061, "grad_norm": 0.4554154294708478, "learning_rate": 6.975634229600539e-06, "loss": 0.0361, "step": 4075 }, { "epoch": 1.8544131028207462, "grad_norm": 0.5039912781556998, "learning_rate": 6.9743211595970105e-06, "loss": 0.0298, "step": 4076 }, { "epoch": 1.8548680618744313, "grad_norm": 0.6270034579834832, "learning_rate": 6.973007928260874e-06, "loss": 0.0329, "step": 4077 }, { "epoch": 1.8553230209281164, "grad_norm": 0.4343606029179238, "learning_rate": 6.971694535699441e-06, "loss": 0.0226, "step": 4078 }, { "epoch": 1.8557779799818017, "grad_norm": 0.511619098133748, "learning_rate": 6.970380982020033e-06, "loss": 0.0311, "step": 4079 }, { "epoch": 1.8562329390354868, "grad_norm": 0.6291419962861428, "learning_rate": 6.969067267329989e-06, "loss": 0.0428, "step": 4080 }, { "epoch": 1.856687898089172, "grad_norm": 0.7658450189704773, "learning_rate": 6.967753391736662e-06, "loss": 0.047, "step": 4081 }, { "epoch": 1.8571428571428572, "grad_norm": 0.6066682501864417, "learning_rate": 6.966439355347412e-06, "loss": 0.0375, "step": 4082 }, { "epoch": 1.8575978161965423, "grad_norm": 0.7124765630916889, "learning_rate": 6.965125158269619e-06, "loss": 0.0324, "step": 4083 }, { "epoch": 1.8580527752502274, "grad_norm": 0.5157157965087068, "learning_rate": 6.963810800610672e-06, "loss": 0.0269, "step": 4084 }, { "epoch": 1.8585077343039127, "grad_norm": 0.47744357360890555, "learning_rate": 6.962496282477976e-06, "loss": 0.0279, "step": 4085 }, { "epoch": 1.8589626933575978, "grad_norm": 0.5092031639283099, "learning_rate": 6.961181603978946e-06, "loss": 0.03, "step": 4086 }, { "epoch": 1.8594176524112829, "grad_norm": 0.5650533067317093, "learning_rate": 6.959866765221012e-06, "loss": 0.0278, "step": 4087 }, { "epoch": 1.8598726114649682, "grad_norm": 0.6170436102843387, "learning_rate": 6.958551766311616e-06, "loss": 0.0306, "step": 4088 }, { "epoch": 1.8603275705186533, "grad_norm": 0.7496398968567375, "learning_rate": 6.957236607358216e-06, "loss": 0.0383, "step": 4089 }, { "epoch": 1.8607825295723384, "grad_norm": 0.6582698994869104, "learning_rate": 6.955921288468277e-06, "loss": 0.0395, "step": 4090 }, { "epoch": 1.8612374886260237, "grad_norm": 0.5216081345029945, "learning_rate": 6.954605809749284e-06, "loss": 0.0366, "step": 4091 }, { "epoch": 1.861692447679709, "grad_norm": 0.6946230560815133, "learning_rate": 6.953290171308732e-06, "loss": 0.0394, "step": 4092 }, { "epoch": 1.8621474067333939, "grad_norm": 0.5790626531179943, "learning_rate": 6.951974373254127e-06, "loss": 0.0287, "step": 4093 }, { "epoch": 1.8626023657870792, "grad_norm": 0.4183972019857722, "learning_rate": 6.950658415692992e-06, "loss": 0.0307, "step": 4094 }, { "epoch": 1.8630573248407645, "grad_norm": 0.5568397799801404, "learning_rate": 6.949342298732861e-06, "loss": 0.0378, "step": 4095 }, { "epoch": 1.8635122838944493, "grad_norm": 0.5743063582553911, "learning_rate": 6.948026022481279e-06, "loss": 0.029, "step": 4096 }, { "epoch": 1.8639672429481347, "grad_norm": 0.39922835370133297, "learning_rate": 6.946709587045808e-06, "loss": 0.0216, "step": 4097 }, { "epoch": 1.86442220200182, "grad_norm": 0.652730513196503, "learning_rate": 6.945392992534022e-06, "loss": 0.0427, "step": 4098 }, { "epoch": 1.864877161055505, "grad_norm": 0.3791674162212088, "learning_rate": 6.9440762390535046e-06, "loss": 0.024, "step": 4099 }, { "epoch": 1.8653321201091901, "grad_norm": 0.533439629209361, "learning_rate": 6.9427593267118565e-06, "loss": 0.0317, "step": 4100 }, { "epoch": 1.8657870791628755, "grad_norm": 0.6602195993489012, "learning_rate": 6.941442255616691e-06, "loss": 0.048, "step": 4101 }, { "epoch": 1.8662420382165605, "grad_norm": 0.5800092363666163, "learning_rate": 6.94012502587563e-06, "loss": 0.0448, "step": 4102 }, { "epoch": 1.8666969972702456, "grad_norm": 0.5347550571270917, "learning_rate": 6.938807637596315e-06, "loss": 0.0307, "step": 4103 }, { "epoch": 1.867151956323931, "grad_norm": 0.6828358934580386, "learning_rate": 6.937490090886394e-06, "loss": 0.0415, "step": 4104 }, { "epoch": 1.867606915377616, "grad_norm": 0.49549687933880604, "learning_rate": 6.936172385853534e-06, "loss": 0.0334, "step": 4105 }, { "epoch": 1.8680618744313011, "grad_norm": 0.37121930632346345, "learning_rate": 6.934854522605409e-06, "loss": 0.0148, "step": 4106 }, { "epoch": 1.8685168334849864, "grad_norm": 0.5637249554892388, "learning_rate": 6.9335365012497095e-06, "loss": 0.0346, "step": 4107 }, { "epoch": 1.8689717925386715, "grad_norm": 0.4079255439605901, "learning_rate": 6.93221832189414e-06, "loss": 0.0238, "step": 4108 }, { "epoch": 1.8694267515923566, "grad_norm": 0.51260711597464, "learning_rate": 6.930899984646416e-06, "loss": 0.031, "step": 4109 }, { "epoch": 1.869881710646042, "grad_norm": 0.6268143820434923, "learning_rate": 6.929581489614263e-06, "loss": 0.0494, "step": 4110 }, { "epoch": 1.870336669699727, "grad_norm": 0.622134803563498, "learning_rate": 6.928262836905426e-06, "loss": 0.025, "step": 4111 }, { "epoch": 1.870791628753412, "grad_norm": 0.5332846552633914, "learning_rate": 6.926944026627658e-06, "loss": 0.0276, "step": 4112 }, { "epoch": 1.8712465878070974, "grad_norm": 0.5708974782402968, "learning_rate": 6.925625058888725e-06, "loss": 0.032, "step": 4113 }, { "epoch": 1.8717015468607825, "grad_norm": 0.4806578138038823, "learning_rate": 6.924305933796409e-06, "loss": 0.03, "step": 4114 }, { "epoch": 1.8721565059144676, "grad_norm": 0.39212455930029516, "learning_rate": 6.922986651458503e-06, "loss": 0.0264, "step": 4115 }, { "epoch": 1.872611464968153, "grad_norm": 0.4259990007914431, "learning_rate": 6.921667211982811e-06, "loss": 0.0196, "step": 4116 }, { "epoch": 1.873066424021838, "grad_norm": 0.5043990484033058, "learning_rate": 6.920347615477153e-06, "loss": 0.0272, "step": 4117 }, { "epoch": 1.873521383075523, "grad_norm": 0.6315043827155147, "learning_rate": 6.919027862049359e-06, "loss": 0.0347, "step": 4118 }, { "epoch": 1.8739763421292084, "grad_norm": 0.5086595636715445, "learning_rate": 6.917707951807275e-06, "loss": 0.0294, "step": 4119 }, { "epoch": 1.8744313011828937, "grad_norm": 0.7059881241515326, "learning_rate": 6.9163878848587585e-06, "loss": 0.0375, "step": 4120 }, { "epoch": 1.8748862602365786, "grad_norm": 0.4981489285964966, "learning_rate": 6.915067661311676e-06, "loss": 0.0229, "step": 4121 }, { "epoch": 1.8753412192902639, "grad_norm": 0.7535656971705376, "learning_rate": 6.913747281273916e-06, "loss": 0.0441, "step": 4122 }, { "epoch": 1.8757961783439492, "grad_norm": 0.5606563081185906, "learning_rate": 6.912426744853368e-06, "loss": 0.0182, "step": 4123 }, { "epoch": 1.876251137397634, "grad_norm": 0.4800039645683506, "learning_rate": 6.911106052157943e-06, "loss": 0.0329, "step": 4124 }, { "epoch": 1.8767060964513194, "grad_norm": 0.7422807931262311, "learning_rate": 6.909785203295563e-06, "loss": 0.0483, "step": 4125 }, { "epoch": 1.8771610555050047, "grad_norm": 0.7428282082101595, "learning_rate": 6.908464198374161e-06, "loss": 0.0395, "step": 4126 }, { "epoch": 1.8776160145586898, "grad_norm": 0.5567461218204124, "learning_rate": 6.907143037501681e-06, "loss": 0.0261, "step": 4127 }, { "epoch": 1.8780709736123748, "grad_norm": 0.4535800153644064, "learning_rate": 6.9058217207860856e-06, "loss": 0.0205, "step": 4128 }, { "epoch": 1.8785259326660602, "grad_norm": 0.5380949608863376, "learning_rate": 6.904500248335348e-06, "loss": 0.0284, "step": 4129 }, { "epoch": 1.8789808917197452, "grad_norm": 0.5576353977632108, "learning_rate": 6.903178620257448e-06, "loss": 0.0315, "step": 4130 }, { "epoch": 1.8794358507734303, "grad_norm": 0.4049573082389251, "learning_rate": 6.901856836660386e-06, "loss": 0.0202, "step": 4131 }, { "epoch": 1.8798908098271156, "grad_norm": 0.843414674417525, "learning_rate": 6.900534897652174e-06, "loss": 0.0521, "step": 4132 }, { "epoch": 1.8803457688808007, "grad_norm": 0.5932961835636155, "learning_rate": 6.8992128033408316e-06, "loss": 0.032, "step": 4133 }, { "epoch": 1.8808007279344858, "grad_norm": 0.5235441475273466, "learning_rate": 6.8978905538343965e-06, "loss": 0.0317, "step": 4134 }, { "epoch": 1.8812556869881711, "grad_norm": 0.5931362305529173, "learning_rate": 6.8965681492409145e-06, "loss": 0.0357, "step": 4135 }, { "epoch": 1.8817106460418562, "grad_norm": 0.6866152295154555, "learning_rate": 6.895245589668449e-06, "loss": 0.0322, "step": 4136 }, { "epoch": 1.8821656050955413, "grad_norm": 0.5571567976815832, "learning_rate": 6.893922875225072e-06, "loss": 0.0342, "step": 4137 }, { "epoch": 1.8826205641492266, "grad_norm": 0.6131320785326742, "learning_rate": 6.892600006018871e-06, "loss": 0.0411, "step": 4138 }, { "epoch": 1.8830755232029117, "grad_norm": 0.4805462802325607, "learning_rate": 6.891276982157946e-06, "loss": 0.028, "step": 4139 }, { "epoch": 1.8835304822565968, "grad_norm": 0.611829755478912, "learning_rate": 6.8899538037504055e-06, "loss": 0.0414, "step": 4140 }, { "epoch": 1.8839854413102821, "grad_norm": 0.7171505872571483, "learning_rate": 6.8886304709043764e-06, "loss": 0.04, "step": 4141 }, { "epoch": 1.8844404003639672, "grad_norm": 0.6341903094003845, "learning_rate": 6.8873069837279915e-06, "loss": 0.0534, "step": 4142 }, { "epoch": 1.8848953594176523, "grad_norm": 0.5384311264461159, "learning_rate": 6.885983342329406e-06, "loss": 0.027, "step": 4143 }, { "epoch": 1.8853503184713376, "grad_norm": 0.5989581507822104, "learning_rate": 6.884659546816777e-06, "loss": 0.0359, "step": 4144 }, { "epoch": 1.8858052775250227, "grad_norm": 0.43766012476693866, "learning_rate": 6.883335597298279e-06, "loss": 0.0209, "step": 4145 }, { "epoch": 1.8862602365787078, "grad_norm": 0.5834755472420379, "learning_rate": 6.882011493882105e-06, "loss": 0.0233, "step": 4146 }, { "epoch": 1.886715195632393, "grad_norm": 0.6972609731827578, "learning_rate": 6.880687236676449e-06, "loss": 0.0438, "step": 4147 }, { "epoch": 1.8871701546860784, "grad_norm": 0.4316315646247104, "learning_rate": 6.879362825789525e-06, "loss": 0.0199, "step": 4148 }, { "epoch": 1.8876251137397633, "grad_norm": 0.3699795833373929, "learning_rate": 6.8780382613295575e-06, "loss": 0.0214, "step": 4149 }, { "epoch": 1.8880800727934486, "grad_norm": 0.5280634658210035, "learning_rate": 6.876713543404785e-06, "loss": 0.0294, "step": 4150 }, { "epoch": 1.888535031847134, "grad_norm": 0.42649598275949047, "learning_rate": 6.875388672123458e-06, "loss": 0.0209, "step": 4151 }, { "epoch": 1.8889899909008188, "grad_norm": 0.6006523238443225, "learning_rate": 6.874063647593836e-06, "loss": 0.0399, "step": 4152 }, { "epoch": 1.889444949954504, "grad_norm": 0.7779759284956044, "learning_rate": 6.872738469924198e-06, "loss": 0.0507, "step": 4153 }, { "epoch": 1.8898999090081894, "grad_norm": 0.3897094032440889, "learning_rate": 6.871413139222827e-06, "loss": 0.0186, "step": 4154 }, { "epoch": 1.8903548680618745, "grad_norm": 1.0454124974291628, "learning_rate": 6.870087655598028e-06, "loss": 0.0839, "step": 4155 }, { "epoch": 1.8908098271155596, "grad_norm": 0.39688930772130354, "learning_rate": 6.8687620191581095e-06, "loss": 0.0212, "step": 4156 }, { "epoch": 1.8912647861692449, "grad_norm": 0.41441871720990314, "learning_rate": 6.867436230011397e-06, "loss": 0.0218, "step": 4157 }, { "epoch": 1.89171974522293, "grad_norm": 0.45977519202691797, "learning_rate": 6.866110288266232e-06, "loss": 0.0255, "step": 4158 }, { "epoch": 1.892174704276615, "grad_norm": 0.4251437168707794, "learning_rate": 6.864784194030958e-06, "loss": 0.0254, "step": 4159 }, { "epoch": 1.8926296633303004, "grad_norm": 0.5804153437616366, "learning_rate": 6.863457947413944e-06, "loss": 0.037, "step": 4160 }, { "epoch": 1.8930846223839854, "grad_norm": 0.6495689827219507, "learning_rate": 6.862131548523561e-06, "loss": 0.0358, "step": 4161 }, { "epoch": 1.8935395814376705, "grad_norm": 0.5133413068665638, "learning_rate": 6.8608049974681964e-06, "loss": 0.0393, "step": 4162 }, { "epoch": 1.8939945404913558, "grad_norm": 0.4862566575630052, "learning_rate": 6.859478294356252e-06, "loss": 0.0204, "step": 4163 }, { "epoch": 1.894449499545041, "grad_norm": 0.5755819284310981, "learning_rate": 6.858151439296137e-06, "loss": 0.0349, "step": 4164 }, { "epoch": 1.894904458598726, "grad_norm": 0.6013684152559066, "learning_rate": 6.85682443239628e-06, "loss": 0.0299, "step": 4165 }, { "epoch": 1.8953594176524113, "grad_norm": 0.593056511577885, "learning_rate": 6.855497273765113e-06, "loss": 0.0338, "step": 4166 }, { "epoch": 1.8958143767060964, "grad_norm": 0.5140109171479168, "learning_rate": 6.85416996351109e-06, "loss": 0.0295, "step": 4167 }, { "epoch": 1.8962693357597815, "grad_norm": 0.580481113661126, "learning_rate": 6.8528425017426715e-06, "loss": 0.0385, "step": 4168 }, { "epoch": 1.8967242948134668, "grad_norm": 0.4729425465735144, "learning_rate": 6.851514888568329e-06, "loss": 0.0278, "step": 4169 }, { "epoch": 1.897179253867152, "grad_norm": 0.6053992371768653, "learning_rate": 6.850187124096552e-06, "loss": 0.0415, "step": 4170 }, { "epoch": 1.897634212920837, "grad_norm": 2.476045237305704, "learning_rate": 6.848859208435838e-06, "loss": 0.0426, "step": 4171 }, { "epoch": 1.8980891719745223, "grad_norm": 0.5716074235612479, "learning_rate": 6.847531141694701e-06, "loss": 0.0419, "step": 4172 }, { "epoch": 1.8985441310282076, "grad_norm": 0.7002000989512744, "learning_rate": 6.846202923981661e-06, "loss": 0.0362, "step": 4173 }, { "epoch": 1.8989990900818925, "grad_norm": 0.5410115757258703, "learning_rate": 6.844874555405256e-06, "loss": 0.0307, "step": 4174 }, { "epoch": 1.8994540491355778, "grad_norm": 0.36725599427465627, "learning_rate": 6.8435460360740336e-06, "loss": 0.022, "step": 4175 }, { "epoch": 1.8999090081892631, "grad_norm": 0.3603629432216531, "learning_rate": 6.842217366096553e-06, "loss": 0.0183, "step": 4176 }, { "epoch": 1.900363967242948, "grad_norm": 0.538845244670838, "learning_rate": 6.84088854558139e-06, "loss": 0.031, "step": 4177 }, { "epoch": 1.9008189262966333, "grad_norm": 0.61286557236097, "learning_rate": 6.839559574637128e-06, "loss": 0.0322, "step": 4178 }, { "epoch": 1.9012738853503186, "grad_norm": 0.44454321228530824, "learning_rate": 6.838230453372365e-06, "loss": 0.0244, "step": 4179 }, { "epoch": 1.9017288444040037, "grad_norm": 0.6436314987022439, "learning_rate": 6.836901181895711e-06, "loss": 0.038, "step": 4180 }, { "epoch": 1.9021838034576888, "grad_norm": 0.6354133524673475, "learning_rate": 6.835571760315788e-06, "loss": 0.0442, "step": 4181 }, { "epoch": 1.902638762511374, "grad_norm": 0.6508106654348409, "learning_rate": 6.83424218874123e-06, "loss": 0.0417, "step": 4182 }, { "epoch": 1.9030937215650592, "grad_norm": 0.6031856657210548, "learning_rate": 6.832912467280684e-06, "loss": 0.0326, "step": 4183 }, { "epoch": 1.9035486806187443, "grad_norm": 0.5143443237954904, "learning_rate": 6.831582596042807e-06, "loss": 0.0312, "step": 4184 }, { "epoch": 1.9040036396724296, "grad_norm": 0.625403042295468, "learning_rate": 6.8302525751362724e-06, "loss": 0.04, "step": 4185 }, { "epoch": 1.9044585987261147, "grad_norm": 0.42059937773327577, "learning_rate": 6.8289224046697645e-06, "loss": 0.0251, "step": 4186 }, { "epoch": 1.9049135577797998, "grad_norm": 0.5024413136546955, "learning_rate": 6.827592084751975e-06, "loss": 0.0366, "step": 4187 }, { "epoch": 1.905368516833485, "grad_norm": 0.5498246757421382, "learning_rate": 6.826261615491614e-06, "loss": 0.0288, "step": 4188 }, { "epoch": 1.9058234758871702, "grad_norm": 0.5713051541552272, "learning_rate": 6.824930996997401e-06, "loss": 0.0342, "step": 4189 }, { "epoch": 1.9062784349408552, "grad_norm": 0.47256159060913766, "learning_rate": 6.823600229378069e-06, "loss": 0.0231, "step": 4190 }, { "epoch": 1.9067333939945406, "grad_norm": 0.5070885208390664, "learning_rate": 6.82226931274236e-06, "loss": 0.0217, "step": 4191 }, { "epoch": 1.9071883530482256, "grad_norm": 0.6279779773842173, "learning_rate": 6.820938247199035e-06, "loss": 0.0339, "step": 4192 }, { "epoch": 1.9076433121019107, "grad_norm": 0.6186141625393742, "learning_rate": 6.819607032856857e-06, "loss": 0.0374, "step": 4193 }, { "epoch": 1.908098271155596, "grad_norm": 0.48779739664063554, "learning_rate": 6.81827566982461e-06, "loss": 0.0238, "step": 4194 }, { "epoch": 1.9085532302092811, "grad_norm": 0.6950217579037524, "learning_rate": 6.816944158211088e-06, "loss": 0.0513, "step": 4195 }, { "epoch": 1.9090081892629662, "grad_norm": 0.5355831607004834, "learning_rate": 6.815612498125093e-06, "loss": 0.0311, "step": 4196 }, { "epoch": 1.9094631483166515, "grad_norm": 0.742063347605555, "learning_rate": 6.814280689675444e-06, "loss": 0.0511, "step": 4197 }, { "epoch": 1.9099181073703366, "grad_norm": 0.47161758023745004, "learning_rate": 6.812948732970971e-06, "loss": 0.0228, "step": 4198 }, { "epoch": 1.9103730664240217, "grad_norm": 0.6490262091704564, "learning_rate": 6.811616628120514e-06, "loss": 0.0432, "step": 4199 }, { "epoch": 1.910828025477707, "grad_norm": 0.38283337091143843, "learning_rate": 6.8102843752329286e-06, "loss": 0.0186, "step": 4200 }, { "epoch": 1.9112829845313923, "grad_norm": 0.4468451599677746, "learning_rate": 6.808951974417077e-06, "loss": 0.0312, "step": 4201 }, { "epoch": 1.9117379435850772, "grad_norm": 0.5048947846466737, "learning_rate": 6.807619425781841e-06, "loss": 0.0361, "step": 4202 }, { "epoch": 1.9121929026387625, "grad_norm": 0.8947871093541464, "learning_rate": 6.806286729436109e-06, "loss": 0.0819, "step": 4203 }, { "epoch": 1.9126478616924478, "grad_norm": 0.38672962216968826, "learning_rate": 6.804953885488783e-06, "loss": 0.0221, "step": 4204 }, { "epoch": 1.9131028207461327, "grad_norm": 0.6261240402378399, "learning_rate": 6.803620894048773e-06, "loss": 0.0537, "step": 4205 }, { "epoch": 1.913557779799818, "grad_norm": 0.48593300752539076, "learning_rate": 6.802287755225012e-06, "loss": 0.0384, "step": 4206 }, { "epoch": 1.9140127388535033, "grad_norm": 0.6664418812806434, "learning_rate": 6.800954469126434e-06, "loss": 0.0375, "step": 4207 }, { "epoch": 1.9144676979071884, "grad_norm": 0.4580710122565072, "learning_rate": 6.799621035861989e-06, "loss": 0.0218, "step": 4208 }, { "epoch": 1.9149226569608735, "grad_norm": 0.36316540191417757, "learning_rate": 6.798287455540642e-06, "loss": 0.0185, "step": 4209 }, { "epoch": 1.9153776160145588, "grad_norm": 0.5676289334367329, "learning_rate": 6.7969537282713624e-06, "loss": 0.0279, "step": 4210 }, { "epoch": 1.915832575068244, "grad_norm": 0.5582353676662519, "learning_rate": 6.795619854163143e-06, "loss": 0.0348, "step": 4211 }, { "epoch": 1.916287534121929, "grad_norm": 0.7340230920663307, "learning_rate": 6.794285833324973e-06, "loss": 0.0397, "step": 4212 }, { "epoch": 1.9167424931756143, "grad_norm": 0.497610110970188, "learning_rate": 6.792951665865871e-06, "loss": 0.0297, "step": 4213 }, { "epoch": 1.9171974522292994, "grad_norm": 0.8901064327873458, "learning_rate": 6.791617351894855e-06, "loss": 0.0599, "step": 4214 }, { "epoch": 1.9176524112829845, "grad_norm": 0.6574222134836127, "learning_rate": 6.790282891520958e-06, "loss": 0.0405, "step": 4215 }, { "epoch": 1.9181073703366698, "grad_norm": 0.5366635685640508, "learning_rate": 6.788948284853232e-06, "loss": 0.0306, "step": 4216 }, { "epoch": 1.9185623293903549, "grad_norm": 0.4726708314237089, "learning_rate": 6.787613532000727e-06, "loss": 0.021, "step": 4217 }, { "epoch": 1.91901728844404, "grad_norm": 0.6795314395307376, "learning_rate": 6.786278633072521e-06, "loss": 0.0319, "step": 4218 }, { "epoch": 1.9194722474977253, "grad_norm": 0.45126350652234226, "learning_rate": 6.784943588177687e-06, "loss": 0.0269, "step": 4219 }, { "epoch": 1.9199272065514104, "grad_norm": 0.6942845005528151, "learning_rate": 6.783608397425328e-06, "loss": 0.0405, "step": 4220 }, { "epoch": 1.9203821656050954, "grad_norm": 0.32180086236608474, "learning_rate": 6.782273060924544e-06, "loss": 0.0179, "step": 4221 }, { "epoch": 1.9208371246587808, "grad_norm": 0.7724119213459093, "learning_rate": 6.780937578784452e-06, "loss": 0.0586, "step": 4222 }, { "epoch": 1.9212920837124658, "grad_norm": 0.4289017376812481, "learning_rate": 6.779601951114186e-06, "loss": 0.0217, "step": 4223 }, { "epoch": 1.921747042766151, "grad_norm": 0.5465287260644731, "learning_rate": 6.778266178022884e-06, "loss": 0.0344, "step": 4224 }, { "epoch": 1.9222020018198362, "grad_norm": 0.38990536763795697, "learning_rate": 6.776930259619703e-06, "loss": 0.0282, "step": 4225 }, { "epoch": 1.9226569608735213, "grad_norm": 0.7934359797785354, "learning_rate": 6.775594196013803e-06, "loss": 0.0323, "step": 4226 }, { "epoch": 1.9231119199272064, "grad_norm": 0.678124033784331, "learning_rate": 6.774257987314364e-06, "loss": 0.0362, "step": 4227 }, { "epoch": 1.9235668789808917, "grad_norm": 0.5797665311517408, "learning_rate": 6.772921633630577e-06, "loss": 0.0331, "step": 4228 }, { "epoch": 1.924021838034577, "grad_norm": 0.48896590389820244, "learning_rate": 6.7715851350716375e-06, "loss": 0.0281, "step": 4229 }, { "epoch": 1.924476797088262, "grad_norm": 0.5081972827747478, "learning_rate": 6.7702484917467635e-06, "loss": 0.0324, "step": 4230 }, { "epoch": 1.9249317561419472, "grad_norm": 0.6821635948549074, "learning_rate": 6.768911703765175e-06, "loss": 0.0486, "step": 4231 }, { "epoch": 1.9253867151956325, "grad_norm": 0.5407561356587544, "learning_rate": 6.767574771236114e-06, "loss": 0.0381, "step": 4232 }, { "epoch": 1.9258416742493174, "grad_norm": 0.5749904174805588, "learning_rate": 6.766237694268822e-06, "loss": 0.0414, "step": 4233 }, { "epoch": 1.9262966333030027, "grad_norm": 0.5171904308253861, "learning_rate": 6.764900472972562e-06, "loss": 0.0244, "step": 4234 }, { "epoch": 1.926751592356688, "grad_norm": 0.5536805309306727, "learning_rate": 6.763563107456607e-06, "loss": 0.0303, "step": 4235 }, { "epoch": 1.9272065514103731, "grad_norm": 0.671886081855136, "learning_rate": 6.762225597830236e-06, "loss": 0.0585, "step": 4236 }, { "epoch": 1.9276615104640582, "grad_norm": 0.5428504655942143, "learning_rate": 6.760887944202751e-06, "loss": 0.0165, "step": 4237 }, { "epoch": 1.9281164695177435, "grad_norm": 0.5289601666695274, "learning_rate": 6.759550146683454e-06, "loss": 0.0308, "step": 4238 }, { "epoch": 1.9285714285714286, "grad_norm": 0.500870856273744, "learning_rate": 6.758212205381665e-06, "loss": 0.0303, "step": 4239 }, { "epoch": 1.9290263876251137, "grad_norm": 0.548165583645003, "learning_rate": 6.7568741204067145e-06, "loss": 0.0299, "step": 4240 }, { "epoch": 1.929481346678799, "grad_norm": 0.6690133186126083, "learning_rate": 6.7555358918679435e-06, "loss": 0.0332, "step": 4241 }, { "epoch": 1.929936305732484, "grad_norm": 0.6155617299772874, "learning_rate": 6.75419751987471e-06, "loss": 0.0314, "step": 4242 }, { "epoch": 1.9303912647861692, "grad_norm": 0.5839720790284428, "learning_rate": 6.752859004536376e-06, "loss": 0.035, "step": 4243 }, { "epoch": 1.9308462238398545, "grad_norm": 0.6772350774170958, "learning_rate": 6.751520345962319e-06, "loss": 0.0425, "step": 4244 }, { "epoch": 1.9313011828935396, "grad_norm": 0.44906985575019187, "learning_rate": 6.7501815442619315e-06, "loss": 0.0245, "step": 4245 }, { "epoch": 1.9317561419472247, "grad_norm": 0.559749956318088, "learning_rate": 6.74884259954461e-06, "loss": 0.0309, "step": 4246 }, { "epoch": 1.93221110100091, "grad_norm": 0.48000720862490154, "learning_rate": 6.747503511919768e-06, "loss": 0.0348, "step": 4247 }, { "epoch": 1.932666060054595, "grad_norm": 0.48872163153009507, "learning_rate": 6.746164281496832e-06, "loss": 0.035, "step": 4248 }, { "epoch": 1.9331210191082802, "grad_norm": 0.5850838894642061, "learning_rate": 6.744824908385237e-06, "loss": 0.0407, "step": 4249 }, { "epoch": 1.9335759781619655, "grad_norm": 0.3777275153128007, "learning_rate": 6.743485392694429e-06, "loss": 0.011, "step": 4250 }, { "epoch": 1.9340309372156506, "grad_norm": 0.4899042906423011, "learning_rate": 6.742145734533868e-06, "loss": 0.0268, "step": 4251 }, { "epoch": 1.9344858962693356, "grad_norm": 0.497140311954729, "learning_rate": 6.740805934013027e-06, "loss": 0.0336, "step": 4252 }, { "epoch": 1.934940855323021, "grad_norm": 0.5497428673732642, "learning_rate": 6.739465991241385e-06, "loss": 0.0402, "step": 4253 }, { "epoch": 1.935395814376706, "grad_norm": 0.47312365256676847, "learning_rate": 6.7381259063284375e-06, "loss": 0.0213, "step": 4254 }, { "epoch": 1.9358507734303911, "grad_norm": 0.48376291484120243, "learning_rate": 6.7367856793836905e-06, "loss": 0.0262, "step": 4255 }, { "epoch": 1.9363057324840764, "grad_norm": 0.8142605312434946, "learning_rate": 6.7354453105166615e-06, "loss": 0.0508, "step": 4256 }, { "epoch": 1.9367606915377618, "grad_norm": 0.42547889245737935, "learning_rate": 6.734104799836878e-06, "loss": 0.0166, "step": 4257 }, { "epoch": 1.9372156505914466, "grad_norm": 0.5833429245001462, "learning_rate": 6.7327641474538816e-06, "loss": 0.0193, "step": 4258 }, { "epoch": 1.937670609645132, "grad_norm": 0.5371727299945996, "learning_rate": 6.731423353477224e-06, "loss": 0.0337, "step": 4259 }, { "epoch": 1.9381255686988172, "grad_norm": 0.5590256570552319, "learning_rate": 6.73008241801647e-06, "loss": 0.0384, "step": 4260 }, { "epoch": 1.9385805277525021, "grad_norm": 0.49279712910657897, "learning_rate": 6.7287413411811935e-06, "loss": 0.0207, "step": 4261 }, { "epoch": 1.9390354868061874, "grad_norm": 0.47385824774797464, "learning_rate": 6.727400123080981e-06, "loss": 0.0273, "step": 4262 }, { "epoch": 1.9394904458598727, "grad_norm": 0.5136475532315747, "learning_rate": 6.726058763825431e-06, "loss": 0.0203, "step": 4263 }, { "epoch": 1.9399454049135578, "grad_norm": 0.383378308522441, "learning_rate": 6.724717263524154e-06, "loss": 0.0206, "step": 4264 }, { "epoch": 1.940400363967243, "grad_norm": 0.5024964516130972, "learning_rate": 6.723375622286772e-06, "loss": 0.0345, "step": 4265 }, { "epoch": 1.9408553230209282, "grad_norm": 0.462858719824685, "learning_rate": 6.722033840222917e-06, "loss": 0.0316, "step": 4266 }, { "epoch": 1.9413102820746133, "grad_norm": 0.5835999657969747, "learning_rate": 6.720691917442232e-06, "loss": 0.0216, "step": 4267 }, { "epoch": 1.9417652411282984, "grad_norm": 0.42098842770143113, "learning_rate": 6.7193498540543736e-06, "loss": 0.0274, "step": 4268 }, { "epoch": 1.9422202001819837, "grad_norm": 0.7112824280279982, "learning_rate": 6.7180076501690105e-06, "loss": 0.0428, "step": 4269 }, { "epoch": 1.9426751592356688, "grad_norm": 0.6517418719527546, "learning_rate": 6.716665305895821e-06, "loss": 0.0375, "step": 4270 }, { "epoch": 1.943130118289354, "grad_norm": 0.4606842807963288, "learning_rate": 6.715322821344495e-06, "loss": 0.0283, "step": 4271 }, { "epoch": 1.9435850773430392, "grad_norm": 0.4262009261306818, "learning_rate": 6.713980196624732e-06, "loss": 0.0205, "step": 4272 }, { "epoch": 1.9440400363967243, "grad_norm": 0.8172009202443107, "learning_rate": 6.712637431846251e-06, "loss": 0.0494, "step": 4273 }, { "epoch": 1.9444949954504094, "grad_norm": 1.1652533424561522, "learning_rate": 6.711294527118772e-06, "loss": 0.0738, "step": 4274 }, { "epoch": 1.9449499545040947, "grad_norm": 0.46193035782026587, "learning_rate": 6.709951482552032e-06, "loss": 0.0202, "step": 4275 }, { "epoch": 1.9454049135577798, "grad_norm": 0.432957979144842, "learning_rate": 6.708608298255778e-06, "loss": 0.0179, "step": 4276 }, { "epoch": 1.9458598726114649, "grad_norm": 0.6172675181416378, "learning_rate": 6.707264974339772e-06, "loss": 0.0357, "step": 4277 }, { "epoch": 1.9463148316651502, "grad_norm": 0.5084300297630266, "learning_rate": 6.705921510913781e-06, "loss": 0.0393, "step": 4278 }, { "epoch": 1.9467697907188353, "grad_norm": 0.5684094495685631, "learning_rate": 6.704577908087589e-06, "loss": 0.0434, "step": 4279 }, { "epoch": 1.9472247497725204, "grad_norm": 0.7216629288754447, "learning_rate": 6.7032341659709875e-06, "loss": 0.0408, "step": 4280 }, { "epoch": 1.9476797088262057, "grad_norm": 0.44704604969626477, "learning_rate": 6.701890284673782e-06, "loss": 0.0278, "step": 4281 }, { "epoch": 1.9481346678798908, "grad_norm": 0.6359559201978502, "learning_rate": 6.700546264305787e-06, "loss": 0.0333, "step": 4282 }, { "epoch": 1.9485896269335758, "grad_norm": 0.6380060738199653, "learning_rate": 6.699202104976832e-06, "loss": 0.0341, "step": 4283 }, { "epoch": 1.9490445859872612, "grad_norm": 0.5494115223446472, "learning_rate": 6.697857806796753e-06, "loss": 0.0281, "step": 4284 }, { "epoch": 1.9494995450409465, "grad_norm": 0.596507057397881, "learning_rate": 6.696513369875403e-06, "loss": 0.0419, "step": 4285 }, { "epoch": 1.9499545040946313, "grad_norm": 0.4656177655917427, "learning_rate": 6.695168794322642e-06, "loss": 0.0216, "step": 4286 }, { "epoch": 1.9504094631483166, "grad_norm": 0.5297924925475368, "learning_rate": 6.693824080248341e-06, "loss": 0.0237, "step": 4287 }, { "epoch": 1.950864422202002, "grad_norm": 0.41739211017712247, "learning_rate": 6.692479227762387e-06, "loss": 0.0238, "step": 4288 }, { "epoch": 1.9513193812556868, "grad_norm": 0.5024100245734225, "learning_rate": 6.691134236974673e-06, "loss": 0.0283, "step": 4289 }, { "epoch": 1.9517743403093721, "grad_norm": 0.749115415166653, "learning_rate": 6.6897891079951065e-06, "loss": 0.0457, "step": 4290 }, { "epoch": 1.9522292993630574, "grad_norm": 0.6087252514507477, "learning_rate": 6.688443840933605e-06, "loss": 0.0408, "step": 4291 }, { "epoch": 1.9526842584167425, "grad_norm": 0.7019660276329671, "learning_rate": 6.6870984359000964e-06, "loss": 0.0371, "step": 4292 }, { "epoch": 1.9531392174704276, "grad_norm": 0.44147466434114707, "learning_rate": 6.6857528930045245e-06, "loss": 0.0243, "step": 4293 }, { "epoch": 1.953594176524113, "grad_norm": 0.73232677874317, "learning_rate": 6.684407212356838e-06, "loss": 0.0448, "step": 4294 }, { "epoch": 1.954049135577798, "grad_norm": 0.8646603208146352, "learning_rate": 6.683061394067002e-06, "loss": 0.0471, "step": 4295 }, { "epoch": 1.9545040946314831, "grad_norm": 0.7264235846217042, "learning_rate": 6.6817154382449876e-06, "loss": 0.0522, "step": 4296 }, { "epoch": 1.9549590536851684, "grad_norm": 0.49328455085653516, "learning_rate": 6.680369345000783e-06, "loss": 0.0346, "step": 4297 }, { "epoch": 1.9554140127388535, "grad_norm": 0.7462276090013412, "learning_rate": 6.679023114444385e-06, "loss": 0.0518, "step": 4298 }, { "epoch": 1.9558689717925386, "grad_norm": 0.5443680438196182, "learning_rate": 6.6776767466857974e-06, "loss": 0.0379, "step": 4299 }, { "epoch": 1.956323930846224, "grad_norm": 0.5168252132778363, "learning_rate": 6.676330241835045e-06, "loss": 0.0212, "step": 4300 }, { "epoch": 1.956778889899909, "grad_norm": 0.48305900780596694, "learning_rate": 6.674983600002155e-06, "loss": 0.0219, "step": 4301 }, { "epoch": 1.957233848953594, "grad_norm": 0.5604428637787447, "learning_rate": 6.67363682129717e-06, "loss": 0.0324, "step": 4302 }, { "epoch": 1.9576888080072794, "grad_norm": 0.32607077104499727, "learning_rate": 6.672289905830141e-06, "loss": 0.0165, "step": 4303 }, { "epoch": 1.9581437670609645, "grad_norm": 0.6360494656970666, "learning_rate": 6.6709428537111336e-06, "loss": 0.0375, "step": 4304 }, { "epoch": 1.9585987261146496, "grad_norm": 0.5622414185374158, "learning_rate": 6.669595665050223e-06, "loss": 0.0317, "step": 4305 }, { "epoch": 1.959053685168335, "grad_norm": 0.4737427909708252, "learning_rate": 6.668248339957491e-06, "loss": 0.0301, "step": 4306 }, { "epoch": 1.95950864422202, "grad_norm": 0.49041443717670724, "learning_rate": 6.666900878543041e-06, "loss": 0.0318, "step": 4307 }, { "epoch": 1.959963603275705, "grad_norm": 0.3981810185297644, "learning_rate": 6.6655532809169785e-06, "loss": 0.0304, "step": 4308 }, { "epoch": 1.9604185623293904, "grad_norm": 0.7103157363923976, "learning_rate": 6.664205547189424e-06, "loss": 0.0531, "step": 4309 }, { "epoch": 1.9608735213830755, "grad_norm": 0.2983514558093099, "learning_rate": 6.662857677470508e-06, "loss": 0.017, "step": 4310 }, { "epoch": 1.9613284804367606, "grad_norm": 0.4795393028635018, "learning_rate": 6.66150967187037e-06, "loss": 0.0351, "step": 4311 }, { "epoch": 1.9617834394904459, "grad_norm": 0.48386879103999797, "learning_rate": 6.660161530499168e-06, "loss": 0.0296, "step": 4312 }, { "epoch": 1.9622383985441312, "grad_norm": 0.6086611680310994, "learning_rate": 6.65881325346706e-06, "loss": 0.0331, "step": 4313 }, { "epoch": 1.962693357597816, "grad_norm": 0.7315193859463398, "learning_rate": 6.657464840884225e-06, "loss": 0.0448, "step": 4314 }, { "epoch": 1.9631483166515014, "grad_norm": 0.4931020414003279, "learning_rate": 6.656116292860849e-06, "loss": 0.0383, "step": 4315 }, { "epoch": 1.9636032757051867, "grad_norm": 0.6692604288867324, "learning_rate": 6.654767609507127e-06, "loss": 0.0569, "step": 4316 }, { "epoch": 1.9640582347588715, "grad_norm": 0.6022950831552026, "learning_rate": 6.65341879093327e-06, "loss": 0.0344, "step": 4317 }, { "epoch": 1.9645131938125568, "grad_norm": 0.5355104605232749, "learning_rate": 6.652069837249495e-06, "loss": 0.0261, "step": 4318 }, { "epoch": 1.9649681528662422, "grad_norm": 0.5793528038518898, "learning_rate": 6.650720748566035e-06, "loss": 0.0386, "step": 4319 }, { "epoch": 1.9654231119199272, "grad_norm": 0.686478667336817, "learning_rate": 6.649371524993129e-06, "loss": 0.0356, "step": 4320 }, { "epoch": 1.9658780709736123, "grad_norm": 4.5997608223820885, "learning_rate": 6.64802216664103e-06, "loss": 0.1511, "step": 4321 }, { "epoch": 1.9663330300272976, "grad_norm": 0.6232069446071732, "learning_rate": 6.646672673620005e-06, "loss": 0.0449, "step": 4322 }, { "epoch": 1.9667879890809827, "grad_norm": 0.5406508477628449, "learning_rate": 6.645323046040323e-06, "loss": 0.0309, "step": 4323 }, { "epoch": 1.9672429481346678, "grad_norm": 0.5261295765454279, "learning_rate": 6.643973284012271e-06, "loss": 0.0261, "step": 4324 }, { "epoch": 1.9676979071883531, "grad_norm": 0.751170141815096, "learning_rate": 6.642623387646148e-06, "loss": 0.0216, "step": 4325 }, { "epoch": 1.9681528662420382, "grad_norm": 0.6125631770685741, "learning_rate": 6.64127335705226e-06, "loss": 0.0426, "step": 4326 }, { "epoch": 1.9686078252957233, "grad_norm": 0.46397179555598267, "learning_rate": 6.639923192340923e-06, "loss": 0.0247, "step": 4327 }, { "epoch": 1.9690627843494086, "grad_norm": 0.5013740130400764, "learning_rate": 6.63857289362247e-06, "loss": 0.0297, "step": 4328 }, { "epoch": 1.9695177434030937, "grad_norm": 0.46418829608866635, "learning_rate": 6.637222461007241e-06, "loss": 0.0291, "step": 4329 }, { "epoch": 1.9699727024567788, "grad_norm": 0.44197278228763304, "learning_rate": 6.635871894605585e-06, "loss": 0.0323, "step": 4330 }, { "epoch": 1.9704276615104641, "grad_norm": 0.35879639917254974, "learning_rate": 6.634521194527865e-06, "loss": 0.0201, "step": 4331 }, { "epoch": 1.9708826205641492, "grad_norm": 0.4078620107559211, "learning_rate": 6.633170360884455e-06, "loss": 0.0261, "step": 4332 }, { "epoch": 1.9713375796178343, "grad_norm": 0.4603460338520836, "learning_rate": 6.6318193937857375e-06, "loss": 0.0288, "step": 4333 }, { "epoch": 1.9717925386715196, "grad_norm": 0.5356573066502125, "learning_rate": 6.630468293342109e-06, "loss": 0.0389, "step": 4334 }, { "epoch": 1.9722474977252047, "grad_norm": 0.6230220298169801, "learning_rate": 6.629117059663975e-06, "loss": 0.0405, "step": 4335 }, { "epoch": 1.9727024567788898, "grad_norm": 0.757992411373233, "learning_rate": 6.627765692861752e-06, "loss": 0.0546, "step": 4336 }, { "epoch": 1.973157415832575, "grad_norm": 0.5349849941879612, "learning_rate": 6.626414193045867e-06, "loss": 0.0314, "step": 4337 }, { "epoch": 1.9736123748862604, "grad_norm": 0.6693115397359995, "learning_rate": 6.625062560326758e-06, "loss": 0.0417, "step": 4338 }, { "epoch": 1.9740673339399453, "grad_norm": 0.5222744642383346, "learning_rate": 6.6237107948148785e-06, "loss": 0.0291, "step": 4339 }, { "epoch": 1.9745222929936306, "grad_norm": 0.42003903473507664, "learning_rate": 6.622358896620682e-06, "loss": 0.0216, "step": 4340 }, { "epoch": 1.974977252047316, "grad_norm": 0.3460225532281504, "learning_rate": 6.621006865854645e-06, "loss": 0.017, "step": 4341 }, { "epoch": 1.9754322111010008, "grad_norm": 0.5132175355903507, "learning_rate": 6.619654702627246e-06, "loss": 0.0324, "step": 4342 }, { "epoch": 1.975887170154686, "grad_norm": 0.4372697417829231, "learning_rate": 6.61830240704898e-06, "loss": 0.0278, "step": 4343 }, { "epoch": 1.9763421292083714, "grad_norm": 0.6015750404917026, "learning_rate": 6.616949979230349e-06, "loss": 0.0382, "step": 4344 }, { "epoch": 1.9767970882620565, "grad_norm": 0.5658368711187638, "learning_rate": 6.615597419281867e-06, "loss": 0.0237, "step": 4345 }, { "epoch": 1.9772520473157416, "grad_norm": 0.7546044236392536, "learning_rate": 6.614244727314063e-06, "loss": 0.0439, "step": 4346 }, { "epoch": 1.9777070063694269, "grad_norm": 0.5610657717322657, "learning_rate": 6.612891903437466e-06, "loss": 0.0341, "step": 4347 }, { "epoch": 1.978161965423112, "grad_norm": 0.5483801878678801, "learning_rate": 6.611538947762628e-06, "loss": 0.0321, "step": 4348 }, { "epoch": 1.978616924476797, "grad_norm": 0.4906620796819385, "learning_rate": 6.610185860400106e-06, "loss": 0.0258, "step": 4349 }, { "epoch": 1.9790718835304824, "grad_norm": 0.5491476478691184, "learning_rate": 6.608832641460465e-06, "loss": 0.041, "step": 4350 }, { "epoch": 1.9795268425841674, "grad_norm": 0.5539074191353909, "learning_rate": 6.607479291054288e-06, "loss": 0.0362, "step": 4351 }, { "epoch": 1.9799818016378525, "grad_norm": 0.4806168792141746, "learning_rate": 6.6061258092921595e-06, "loss": 0.0234, "step": 4352 }, { "epoch": 1.9804367606915378, "grad_norm": 0.6160426143841132, "learning_rate": 6.6047721962846854e-06, "loss": 0.0348, "step": 4353 }, { "epoch": 1.980891719745223, "grad_norm": 0.6686540794952565, "learning_rate": 6.603418452142475e-06, "loss": 0.0383, "step": 4354 }, { "epoch": 1.981346678798908, "grad_norm": 0.5334646209689482, "learning_rate": 6.602064576976148e-06, "loss": 0.0264, "step": 4355 }, { "epoch": 1.9818016378525933, "grad_norm": 0.666271260243879, "learning_rate": 6.600710570896341e-06, "loss": 0.0383, "step": 4356 }, { "epoch": 1.9822565969062784, "grad_norm": 0.8503041725684195, "learning_rate": 6.5993564340136915e-06, "loss": 0.0647, "step": 4357 }, { "epoch": 1.9827115559599635, "grad_norm": 0.37484521391292774, "learning_rate": 6.598002166438859e-06, "loss": 0.0168, "step": 4358 }, { "epoch": 1.9831665150136488, "grad_norm": 0.515778668718774, "learning_rate": 6.596647768282505e-06, "loss": 0.0298, "step": 4359 }, { "epoch": 1.983621474067334, "grad_norm": 0.6713659676731957, "learning_rate": 6.595293239655307e-06, "loss": 0.0345, "step": 4360 }, { "epoch": 1.984076433121019, "grad_norm": 0.5925529457710818, "learning_rate": 6.593938580667949e-06, "loss": 0.0376, "step": 4361 }, { "epoch": 1.9845313921747043, "grad_norm": 0.6579256270327525, "learning_rate": 6.592583791431128e-06, "loss": 0.0427, "step": 4362 }, { "epoch": 1.9849863512283894, "grad_norm": 0.5299555195913671, "learning_rate": 6.591228872055553e-06, "loss": 0.0255, "step": 4363 }, { "epoch": 1.9854413102820745, "grad_norm": 0.6607587462806197, "learning_rate": 6.5898738226519396e-06, "loss": 0.0465, "step": 4364 }, { "epoch": 1.9858962693357598, "grad_norm": 0.5091472503350368, "learning_rate": 6.588518643331018e-06, "loss": 0.0249, "step": 4365 }, { "epoch": 1.9863512283894451, "grad_norm": 0.4595426588781745, "learning_rate": 6.5871633342035255e-06, "loss": 0.0303, "step": 4366 }, { "epoch": 1.98680618744313, "grad_norm": 0.5310500534756174, "learning_rate": 6.585807895380212e-06, "loss": 0.0322, "step": 4367 }, { "epoch": 1.9872611464968153, "grad_norm": 0.5651708114452124, "learning_rate": 6.584452326971841e-06, "loss": 0.0421, "step": 4368 }, { "epoch": 1.9877161055505006, "grad_norm": 0.632273854951826, "learning_rate": 6.583096629089178e-06, "loss": 0.037, "step": 4369 }, { "epoch": 1.9881710646041855, "grad_norm": 0.43696218471863923, "learning_rate": 6.5817408018430105e-06, "loss": 0.0216, "step": 4370 }, { "epoch": 1.9886260236578708, "grad_norm": 0.6068747591423251, "learning_rate": 6.580384845344128e-06, "loss": 0.0308, "step": 4371 }, { "epoch": 1.989080982711556, "grad_norm": 0.4235191860692518, "learning_rate": 6.579028759703332e-06, "loss": 0.0248, "step": 4372 }, { "epoch": 1.9895359417652412, "grad_norm": 0.5598435542102982, "learning_rate": 6.577672545031436e-06, "loss": 0.0321, "step": 4373 }, { "epoch": 1.9899909008189263, "grad_norm": 0.37623320043924735, "learning_rate": 6.576316201439264e-06, "loss": 0.0238, "step": 4374 }, { "epoch": 1.9904458598726116, "grad_norm": 0.5140371963530428, "learning_rate": 6.574959729037653e-06, "loss": 0.0301, "step": 4375 }, { "epoch": 1.9909008189262967, "grad_norm": 0.49567280344957254, "learning_rate": 6.573603127937443e-06, "loss": 0.0342, "step": 4376 }, { "epoch": 1.9913557779799818, "grad_norm": 0.504590903021897, "learning_rate": 6.572246398249492e-06, "loss": 0.0287, "step": 4377 }, { "epoch": 1.991810737033667, "grad_norm": 0.5818147798282555, "learning_rate": 6.570889540084666e-06, "loss": 0.0332, "step": 4378 }, { "epoch": 1.9922656960873522, "grad_norm": 0.4938323230947109, "learning_rate": 6.569532553553841e-06, "loss": 0.0297, "step": 4379 }, { "epoch": 1.9927206551410372, "grad_norm": 0.543508024211706, "learning_rate": 6.568175438767904e-06, "loss": 0.0322, "step": 4380 }, { "epoch": 1.9931756141947226, "grad_norm": 0.47867791548831073, "learning_rate": 6.566818195837751e-06, "loss": 0.0281, "step": 4381 }, { "epoch": 1.9936305732484076, "grad_norm": 0.5598346075270535, "learning_rate": 6.5654608248742924e-06, "loss": 0.039, "step": 4382 }, { "epoch": 1.9940855323020927, "grad_norm": 0.5553858519117557, "learning_rate": 6.564103325988442e-06, "loss": 0.0412, "step": 4383 }, { "epoch": 1.994540491355778, "grad_norm": 0.711787912581307, "learning_rate": 6.562745699291133e-06, "loss": 0.0414, "step": 4384 }, { "epoch": 1.9949954504094631, "grad_norm": 0.5244433760038992, "learning_rate": 6.561387944893304e-06, "loss": 0.0267, "step": 4385 }, { "epoch": 1.9954504094631482, "grad_norm": 0.38337108463111075, "learning_rate": 6.560030062905901e-06, "loss": 0.0195, "step": 4386 }, { "epoch": 1.9959053685168335, "grad_norm": 0.5588694456295312, "learning_rate": 6.558672053439888e-06, "loss": 0.0375, "step": 4387 }, { "epoch": 1.9963603275705186, "grad_norm": 0.5692970330829459, "learning_rate": 6.557313916606232e-06, "loss": 0.0373, "step": 4388 }, { "epoch": 1.9968152866242037, "grad_norm": 0.47534629481112917, "learning_rate": 6.555955652515918e-06, "loss": 0.0208, "step": 4389 }, { "epoch": 1.997270245677889, "grad_norm": 0.6826078768399079, "learning_rate": 6.554597261279932e-06, "loss": 0.0479, "step": 4390 }, { "epoch": 1.9977252047315741, "grad_norm": 0.4647646884318749, "learning_rate": 6.553238743009278e-06, "loss": 0.0257, "step": 4391 }, { "epoch": 1.9981801637852592, "grad_norm": 0.5040993261207027, "learning_rate": 6.551880097814971e-06, "loss": 0.0371, "step": 4392 }, { "epoch": 1.9986351228389445, "grad_norm": 0.5257593216912329, "learning_rate": 6.550521325808029e-06, "loss": 0.038, "step": 4393 }, { "epoch": 1.9990900818926298, "grad_norm": 0.5498615738618472, "learning_rate": 6.549162427099487e-06, "loss": 0.0312, "step": 4394 }, { "epoch": 1.9995450409463147, "grad_norm": 0.5898625475436916, "learning_rate": 6.547803401800385e-06, "loss": 0.0316, "step": 4395 }, { "epoch": 2.0, "grad_norm": 0.6291682782694813, "learning_rate": 6.546444250021783e-06, "loss": 0.0359, "step": 4396 }, { "epoch": 2.0004549590536853, "grad_norm": 0.29661485114623315, "learning_rate": 6.545084971874738e-06, "loss": 0.0111, "step": 4397 }, { "epoch": 2.00090991810737, "grad_norm": 0.33133405147036854, "learning_rate": 6.543725567470327e-06, "loss": 0.0163, "step": 4398 }, { "epoch": 2.0013648771610555, "grad_norm": 0.24619716513701012, "learning_rate": 6.542366036919634e-06, "loss": 0.0085, "step": 4399 }, { "epoch": 2.001819836214741, "grad_norm": 0.227326455372897, "learning_rate": 6.541006380333754e-06, "loss": 0.009, "step": 4400 }, { "epoch": 2.0022747952684257, "grad_norm": 0.3750038499488343, "learning_rate": 6.539646597823791e-06, "loss": 0.0174, "step": 4401 }, { "epoch": 2.002729754322111, "grad_norm": 0.2741933372767153, "learning_rate": 6.5382866895008625e-06, "loss": 0.0122, "step": 4402 }, { "epoch": 2.0031847133757963, "grad_norm": 0.3486640836851904, "learning_rate": 6.536926655476092e-06, "loss": 0.0153, "step": 4403 }, { "epoch": 2.003639672429481, "grad_norm": 0.27151976968378777, "learning_rate": 6.535566495860615e-06, "loss": 0.0104, "step": 4404 }, { "epoch": 2.0040946314831665, "grad_norm": 0.22619718539976288, "learning_rate": 6.534206210765579e-06, "loss": 0.0109, "step": 4405 }, { "epoch": 2.0045495905368518, "grad_norm": 0.35042859460732206, "learning_rate": 6.53284580030214e-06, "loss": 0.0165, "step": 4406 }, { "epoch": 2.0050045495905366, "grad_norm": 0.35024054234718227, "learning_rate": 6.531485264581464e-06, "loss": 0.014, "step": 4407 }, { "epoch": 2.005459508644222, "grad_norm": 0.3523410861831411, "learning_rate": 6.530124603714729e-06, "loss": 0.0175, "step": 4408 }, { "epoch": 2.0059144676979073, "grad_norm": 0.21154324422798507, "learning_rate": 6.5287638178131216e-06, "loss": 0.0085, "step": 4409 }, { "epoch": 2.0063694267515926, "grad_norm": 0.2574154363605853, "learning_rate": 6.527402906987838e-06, "loss": 0.0102, "step": 4410 }, { "epoch": 2.0068243858052774, "grad_norm": 0.4202359889540857, "learning_rate": 6.526041871350086e-06, "loss": 0.0147, "step": 4411 }, { "epoch": 2.0072793448589628, "grad_norm": 0.512735944313641, "learning_rate": 6.524680711011085e-06, "loss": 0.0227, "step": 4412 }, { "epoch": 2.007734303912648, "grad_norm": 0.46095481615562883, "learning_rate": 6.523319426082062e-06, "loss": 0.0169, "step": 4413 }, { "epoch": 2.008189262966333, "grad_norm": 0.3436523183514405, "learning_rate": 6.521958016674253e-06, "loss": 0.016, "step": 4414 }, { "epoch": 2.0086442220200182, "grad_norm": 0.34563237176708556, "learning_rate": 6.52059648289891e-06, "loss": 0.0098, "step": 4415 }, { "epoch": 2.0090991810737036, "grad_norm": 0.20620877505979698, "learning_rate": 6.519234824867288e-06, "loss": 0.0086, "step": 4416 }, { "epoch": 2.0095541401273884, "grad_norm": 0.29783321754422687, "learning_rate": 6.517873042690658e-06, "loss": 0.0106, "step": 4417 }, { "epoch": 2.0100090991810737, "grad_norm": 0.3453682593057073, "learning_rate": 6.516511136480297e-06, "loss": 0.0111, "step": 4418 }, { "epoch": 2.010464058234759, "grad_norm": 0.37926577488480717, "learning_rate": 6.515149106347495e-06, "loss": 0.0103, "step": 4419 }, { "epoch": 2.010919017288444, "grad_norm": 0.2699076627022181, "learning_rate": 6.513786952403549e-06, "loss": 0.009, "step": 4420 }, { "epoch": 2.011373976342129, "grad_norm": 0.4573462759532556, "learning_rate": 6.512424674759772e-06, "loss": 0.0174, "step": 4421 }, { "epoch": 2.0118289353958145, "grad_norm": 0.3761340593210398, "learning_rate": 6.511062273527478e-06, "loss": 0.0117, "step": 4422 }, { "epoch": 2.0122838944494994, "grad_norm": 0.2873044747781519, "learning_rate": 6.509699748817999e-06, "loss": 0.0077, "step": 4423 }, { "epoch": 2.0127388535031847, "grad_norm": 0.4471565163894407, "learning_rate": 6.5083371007426754e-06, "loss": 0.0122, "step": 4424 }, { "epoch": 2.01319381255687, "grad_norm": 0.18762503676748718, "learning_rate": 6.506974329412855e-06, "loss": 0.0055, "step": 4425 }, { "epoch": 2.013648771610555, "grad_norm": 0.41747131105768737, "learning_rate": 6.505611434939898e-06, "loss": 0.0117, "step": 4426 }, { "epoch": 2.01410373066424, "grad_norm": 0.28884035180311946, "learning_rate": 6.504248417435174e-06, "loss": 0.0117, "step": 4427 }, { "epoch": 2.0145586897179255, "grad_norm": 0.3634594502134902, "learning_rate": 6.502885277010063e-06, "loss": 0.0177, "step": 4428 }, { "epoch": 2.0150136487716104, "grad_norm": 0.4063657344059047, "learning_rate": 6.501522013775951e-06, "loss": 0.0147, "step": 4429 }, { "epoch": 2.0154686078252957, "grad_norm": 0.4404980270121877, "learning_rate": 6.500158627844245e-06, "loss": 0.0119, "step": 4430 }, { "epoch": 2.015923566878981, "grad_norm": 0.20532966469615765, "learning_rate": 6.498795119326348e-06, "loss": 0.0066, "step": 4431 }, { "epoch": 2.016378525932666, "grad_norm": 0.404535372889522, "learning_rate": 6.497431488333683e-06, "loss": 0.0174, "step": 4432 }, { "epoch": 2.016833484986351, "grad_norm": 0.5848970097518671, "learning_rate": 6.496067734977681e-06, "loss": 0.0205, "step": 4433 }, { "epoch": 2.0172884440400365, "grad_norm": 0.25410677716502017, "learning_rate": 6.494703859369778e-06, "loss": 0.0049, "step": 4434 }, { "epoch": 2.0177434030937214, "grad_norm": 0.47548386572656115, "learning_rate": 6.493339861621426e-06, "loss": 0.0113, "step": 4435 }, { "epoch": 2.0181983621474067, "grad_norm": 0.4468900324730278, "learning_rate": 6.491975741844083e-06, "loss": 0.0129, "step": 4436 }, { "epoch": 2.018653321201092, "grad_norm": 0.5481295616443194, "learning_rate": 6.490611500149222e-06, "loss": 0.0167, "step": 4437 }, { "epoch": 2.0191082802547773, "grad_norm": 0.4155580311539728, "learning_rate": 6.489247136648321e-06, "loss": 0.0115, "step": 4438 }, { "epoch": 2.019563239308462, "grad_norm": 0.27816921441583564, "learning_rate": 6.487882651452867e-06, "loss": 0.0113, "step": 4439 }, { "epoch": 2.0200181983621475, "grad_norm": 0.6028146743323916, "learning_rate": 6.486518044674364e-06, "loss": 0.0143, "step": 4440 }, { "epoch": 2.0204731574158328, "grad_norm": 0.4833297613211801, "learning_rate": 6.4851533164243184e-06, "loss": 0.0136, "step": 4441 }, { "epoch": 2.0209281164695176, "grad_norm": 0.34567853809426624, "learning_rate": 6.483788466814251e-06, "loss": 0.0111, "step": 4442 }, { "epoch": 2.021383075523203, "grad_norm": 0.4456901597501804, "learning_rate": 6.482423495955692e-06, "loss": 0.0116, "step": 4443 }, { "epoch": 2.0218380345768883, "grad_norm": 0.5137637244388034, "learning_rate": 6.4810584039601776e-06, "loss": 0.0229, "step": 4444 }, { "epoch": 2.022292993630573, "grad_norm": 0.5240103241593609, "learning_rate": 6.4796931909392605e-06, "loss": 0.0298, "step": 4445 }, { "epoch": 2.0227479526842584, "grad_norm": 0.6996956460313095, "learning_rate": 6.478327857004496e-06, "loss": 0.0252, "step": 4446 }, { "epoch": 2.0232029117379438, "grad_norm": 0.4864794904953008, "learning_rate": 6.476962402267457e-06, "loss": 0.0223, "step": 4447 }, { "epoch": 2.0236578707916286, "grad_norm": 0.2717614212804149, "learning_rate": 6.475596826839718e-06, "loss": 0.0057, "step": 4448 }, { "epoch": 2.024112829845314, "grad_norm": 0.3421850465764547, "learning_rate": 6.474231130832873e-06, "loss": 0.0149, "step": 4449 }, { "epoch": 2.0245677888989992, "grad_norm": 0.54699171460358, "learning_rate": 6.4728653143585165e-06, "loss": 0.024, "step": 4450 }, { "epoch": 2.025022747952684, "grad_norm": 0.3496883500317143, "learning_rate": 6.4714993775282576e-06, "loss": 0.0084, "step": 4451 }, { "epoch": 2.0254777070063694, "grad_norm": 0.4957179416651556, "learning_rate": 6.470133320453716e-06, "loss": 0.0188, "step": 4452 }, { "epoch": 2.0259326660600547, "grad_norm": 0.4593651031575888, "learning_rate": 6.468767143246515e-06, "loss": 0.0218, "step": 4453 }, { "epoch": 2.0263876251137396, "grad_norm": 0.40386577359827, "learning_rate": 6.467400846018299e-06, "loss": 0.0162, "step": 4454 }, { "epoch": 2.026842584167425, "grad_norm": 0.38025380370000395, "learning_rate": 6.466034428880713e-06, "loss": 0.0161, "step": 4455 }, { "epoch": 2.02729754322111, "grad_norm": 0.5151203534522615, "learning_rate": 6.464667891945413e-06, "loss": 0.017, "step": 4456 }, { "epoch": 2.027752502274795, "grad_norm": 0.34742835371586755, "learning_rate": 6.463301235324066e-06, "loss": 0.0093, "step": 4457 }, { "epoch": 2.0282074613284804, "grad_norm": 0.3439303563428748, "learning_rate": 6.461934459128351e-06, "loss": 0.0124, "step": 4458 }, { "epoch": 2.0286624203821657, "grad_norm": 0.45170347221061447, "learning_rate": 6.460567563469956e-06, "loss": 0.0165, "step": 4459 }, { "epoch": 2.0291173794358506, "grad_norm": 0.48931317367198135, "learning_rate": 6.459200548460574e-06, "loss": 0.0184, "step": 4460 }, { "epoch": 2.029572338489536, "grad_norm": 0.45094252471113627, "learning_rate": 6.457833414211913e-06, "loss": 0.0179, "step": 4461 }, { "epoch": 2.030027297543221, "grad_norm": 0.3155385633049441, "learning_rate": 6.4564661608356895e-06, "loss": 0.0095, "step": 4462 }, { "epoch": 2.030482256596906, "grad_norm": 0.4679983016076288, "learning_rate": 6.455098788443628e-06, "loss": 0.0151, "step": 4463 }, { "epoch": 2.0309372156505914, "grad_norm": 0.3223606777694415, "learning_rate": 6.453731297147464e-06, "loss": 0.0065, "step": 4464 }, { "epoch": 2.0313921747042767, "grad_norm": 0.3518840136193259, "learning_rate": 6.452363687058944e-06, "loss": 0.0142, "step": 4465 }, { "epoch": 2.031847133757962, "grad_norm": 0.4551104474723514, "learning_rate": 6.450995958289823e-06, "loss": 0.0227, "step": 4466 }, { "epoch": 2.032302092811647, "grad_norm": 0.4234721298392046, "learning_rate": 6.449628110951864e-06, "loss": 0.0128, "step": 4467 }, { "epoch": 2.032757051865332, "grad_norm": 0.34735067289965277, "learning_rate": 6.448260145156842e-06, "loss": 0.0102, "step": 4468 }, { "epoch": 2.0332120109190175, "grad_norm": 0.2925105437444775, "learning_rate": 6.446892061016543e-06, "loss": 0.0089, "step": 4469 }, { "epoch": 2.0336669699727024, "grad_norm": 0.28557266020408856, "learning_rate": 6.445523858642757e-06, "loss": 0.0087, "step": 4470 }, { "epoch": 2.0341219290263877, "grad_norm": 0.22950926949060813, "learning_rate": 6.44415553814729e-06, "loss": 0.0089, "step": 4471 }, { "epoch": 2.034576888080073, "grad_norm": 0.3818053666366112, "learning_rate": 6.442787099641954e-06, "loss": 0.0148, "step": 4472 }, { "epoch": 2.035031847133758, "grad_norm": 0.3988495192583969, "learning_rate": 6.441418543238573e-06, "loss": 0.0114, "step": 4473 }, { "epoch": 2.035486806187443, "grad_norm": 0.1933907250862565, "learning_rate": 6.440049869048975e-06, "loss": 0.0045, "step": 4474 }, { "epoch": 2.0359417652411285, "grad_norm": 0.29494059099515324, "learning_rate": 6.438681077185007e-06, "loss": 0.0081, "step": 4475 }, { "epoch": 2.0363967242948133, "grad_norm": 0.34480547718786486, "learning_rate": 6.43731216775852e-06, "loss": 0.0124, "step": 4476 }, { "epoch": 2.0368516833484986, "grad_norm": 0.37007488761569907, "learning_rate": 6.435943140881371e-06, "loss": 0.0213, "step": 4477 }, { "epoch": 2.037306642402184, "grad_norm": 0.2748881269360316, "learning_rate": 6.434573996665433e-06, "loss": 0.0059, "step": 4478 }, { "epoch": 2.037761601455869, "grad_norm": 0.26630809246208575, "learning_rate": 6.433204735222588e-06, "loss": 0.0091, "step": 4479 }, { "epoch": 2.038216560509554, "grad_norm": 0.3096085485212809, "learning_rate": 6.431835356664724e-06, "loss": 0.0106, "step": 4480 }, { "epoch": 2.0386715195632394, "grad_norm": 0.16181335219841153, "learning_rate": 6.43046586110374e-06, "loss": 0.0028, "step": 4481 }, { "epoch": 2.0391264786169243, "grad_norm": 0.2831903057353729, "learning_rate": 6.429096248651545e-06, "loss": 0.0042, "step": 4482 }, { "epoch": 2.0395814376706096, "grad_norm": 0.20289363916753683, "learning_rate": 6.427726519420061e-06, "loss": 0.0051, "step": 4483 }, { "epoch": 2.040036396724295, "grad_norm": 0.3648528177635673, "learning_rate": 6.426356673521211e-06, "loss": 0.0139, "step": 4484 }, { "epoch": 2.04049135577798, "grad_norm": 0.3622896287328724, "learning_rate": 6.424986711066936e-06, "loss": 0.012, "step": 4485 }, { "epoch": 2.040946314831665, "grad_norm": 0.32884386613740535, "learning_rate": 6.423616632169183e-06, "loss": 0.0107, "step": 4486 }, { "epoch": 2.0414012738853504, "grad_norm": 0.37933877465879645, "learning_rate": 6.422246436939906e-06, "loss": 0.0111, "step": 4487 }, { "epoch": 2.0418562329390353, "grad_norm": 0.29523659467992736, "learning_rate": 6.420876125491074e-06, "loss": 0.0093, "step": 4488 }, { "epoch": 2.0423111919927206, "grad_norm": 0.27326868398843224, "learning_rate": 6.41950569793466e-06, "loss": 0.0113, "step": 4489 }, { "epoch": 2.042766151046406, "grad_norm": 0.3979596951225033, "learning_rate": 6.418135154382655e-06, "loss": 0.01, "step": 4490 }, { "epoch": 2.0432211101000908, "grad_norm": 0.45553399667837624, "learning_rate": 6.416764494947047e-06, "loss": 0.0172, "step": 4491 }, { "epoch": 2.043676069153776, "grad_norm": 0.37437585080804636, "learning_rate": 6.4153937197398394e-06, "loss": 0.007, "step": 4492 }, { "epoch": 2.0441310282074614, "grad_norm": 0.40616957286227146, "learning_rate": 6.414022828873053e-06, "loss": 0.011, "step": 4493 }, { "epoch": 2.0445859872611467, "grad_norm": 0.2827652219835658, "learning_rate": 6.412651822458705e-06, "loss": 0.0064, "step": 4494 }, { "epoch": 2.0450409463148316, "grad_norm": 0.4462866764759693, "learning_rate": 6.411280700608831e-06, "loss": 0.0174, "step": 4495 }, { "epoch": 2.045495905368517, "grad_norm": 0.5106540488008284, "learning_rate": 6.409909463435471e-06, "loss": 0.0158, "step": 4496 }, { "epoch": 2.045950864422202, "grad_norm": 0.5023106121662962, "learning_rate": 6.408538111050675e-06, "loss": 0.0187, "step": 4497 }, { "epoch": 2.046405823475887, "grad_norm": 0.2706502978948015, "learning_rate": 6.407166643566507e-06, "loss": 0.0068, "step": 4498 }, { "epoch": 2.0468607825295724, "grad_norm": 0.1342442982265927, "learning_rate": 6.405795061095035e-06, "loss": 0.0026, "step": 4499 }, { "epoch": 2.0473157415832577, "grad_norm": 0.30407723641667117, "learning_rate": 6.40442336374834e-06, "loss": 0.0053, "step": 4500 }, { "epoch": 2.0477707006369426, "grad_norm": 0.36406400308330433, "learning_rate": 6.4030515516385085e-06, "loss": 0.013, "step": 4501 }, { "epoch": 2.048225659690628, "grad_norm": 0.413690505210537, "learning_rate": 6.401679624877641e-06, "loss": 0.0088, "step": 4502 }, { "epoch": 2.048680618744313, "grad_norm": 0.3138346298460819, "learning_rate": 6.400307583577845e-06, "loss": 0.0075, "step": 4503 }, { "epoch": 2.049135577797998, "grad_norm": 0.28421717298116506, "learning_rate": 6.3989354278512365e-06, "loss": 0.0091, "step": 4504 }, { "epoch": 2.0495905368516834, "grad_norm": 0.3902980730132667, "learning_rate": 6.397563157809944e-06, "loss": 0.0133, "step": 4505 }, { "epoch": 2.0500454959053687, "grad_norm": 0.3419413032157435, "learning_rate": 6.396190773566098e-06, "loss": 0.008, "step": 4506 }, { "epoch": 2.0505004549590535, "grad_norm": 0.47943799467739867, "learning_rate": 6.39481827523185e-06, "loss": 0.0224, "step": 4507 }, { "epoch": 2.050955414012739, "grad_norm": 0.5374258981199744, "learning_rate": 6.393445662919352e-06, "loss": 0.0176, "step": 4508 }, { "epoch": 2.051410373066424, "grad_norm": 0.4243266474425536, "learning_rate": 6.3920729367407645e-06, "loss": 0.0138, "step": 4509 }, { "epoch": 2.051865332120109, "grad_norm": 0.18548240194284374, "learning_rate": 6.390700096808266e-06, "loss": 0.0035, "step": 4510 }, { "epoch": 2.0523202911737943, "grad_norm": 0.4831143257226566, "learning_rate": 6.389327143234033e-06, "loss": 0.0155, "step": 4511 }, { "epoch": 2.0527752502274796, "grad_norm": 0.818348652840465, "learning_rate": 6.387954076130263e-06, "loss": 0.0195, "step": 4512 }, { "epoch": 2.0532302092811645, "grad_norm": 0.24451486435498174, "learning_rate": 6.386580895609151e-06, "loss": 0.0077, "step": 4513 }, { "epoch": 2.05368516833485, "grad_norm": 0.6286193236656645, "learning_rate": 6.385207601782912e-06, "loss": 0.0141, "step": 4514 }, { "epoch": 2.054140127388535, "grad_norm": 0.3183855842619917, "learning_rate": 6.383834194763763e-06, "loss": 0.008, "step": 4515 }, { "epoch": 2.05459508644222, "grad_norm": 0.390485711720583, "learning_rate": 6.382460674663932e-06, "loss": 0.0092, "step": 4516 }, { "epoch": 2.0550500454959053, "grad_norm": 0.5014940631434602, "learning_rate": 6.381087041595659e-06, "loss": 0.0193, "step": 4517 }, { "epoch": 2.0555050045495906, "grad_norm": 0.4143102550753054, "learning_rate": 6.379713295671189e-06, "loss": 0.0086, "step": 4518 }, { "epoch": 2.055959963603276, "grad_norm": 0.4132710071888765, "learning_rate": 6.3783394370027785e-06, "loss": 0.0173, "step": 4519 }, { "epoch": 2.056414922656961, "grad_norm": 0.42016048890380997, "learning_rate": 6.376965465702696e-06, "loss": 0.0147, "step": 4520 }, { "epoch": 2.056869881710646, "grad_norm": 0.4992153924894953, "learning_rate": 6.375591381883213e-06, "loss": 0.0165, "step": 4521 }, { "epoch": 2.0573248407643314, "grad_norm": 0.28310436425069296, "learning_rate": 6.374217185656614e-06, "loss": 0.0095, "step": 4522 }, { "epoch": 2.0577797998180163, "grad_norm": 0.5065260337851148, "learning_rate": 6.372842877135191e-06, "loss": 0.0122, "step": 4523 }, { "epoch": 2.0582347588717016, "grad_norm": 0.4043323568079733, "learning_rate": 6.37146845643125e-06, "loss": 0.0116, "step": 4524 }, { "epoch": 2.058689717925387, "grad_norm": 0.523207412624782, "learning_rate": 6.370093923657099e-06, "loss": 0.0111, "step": 4525 }, { "epoch": 2.0591446769790718, "grad_norm": 0.29125802401521844, "learning_rate": 6.36871927892506e-06, "loss": 0.0078, "step": 4526 }, { "epoch": 2.059599636032757, "grad_norm": 0.23957204726726347, "learning_rate": 6.367344522347465e-06, "loss": 0.0045, "step": 4527 }, { "epoch": 2.0600545950864424, "grad_norm": 0.40895703978535547, "learning_rate": 6.365969654036648e-06, "loss": 0.0073, "step": 4528 }, { "epoch": 2.0605095541401273, "grad_norm": 0.4095715269513763, "learning_rate": 6.36459467410496e-06, "loss": 0.0111, "step": 4529 }, { "epoch": 2.0609645131938126, "grad_norm": 0.33090525933488096, "learning_rate": 6.363219582664758e-06, "loss": 0.01, "step": 4530 }, { "epoch": 2.061419472247498, "grad_norm": 0.2919761694239399, "learning_rate": 6.361844379828408e-06, "loss": 0.0093, "step": 4531 }, { "epoch": 2.0618744313011828, "grad_norm": 0.1761863838449364, "learning_rate": 6.360469065708286e-06, "loss": 0.0025, "step": 4532 }, { "epoch": 2.062329390354868, "grad_norm": 0.4755051873680735, "learning_rate": 6.359093640416773e-06, "loss": 0.0145, "step": 4533 }, { "epoch": 2.0627843494085534, "grad_norm": 0.3940944642294024, "learning_rate": 6.357718104066267e-06, "loss": 0.0102, "step": 4534 }, { "epoch": 2.0632393084622382, "grad_norm": 0.32006471418527305, "learning_rate": 6.356342456769169e-06, "loss": 0.0134, "step": 4535 }, { "epoch": 2.0636942675159236, "grad_norm": 0.4070164451436498, "learning_rate": 6.354966698637892e-06, "loss": 0.0086, "step": 4536 }, { "epoch": 2.064149226569609, "grad_norm": 0.3806130387200204, "learning_rate": 6.353590829784853e-06, "loss": 0.0093, "step": 4537 }, { "epoch": 2.0646041856232937, "grad_norm": 0.340282552287716, "learning_rate": 6.352214850322486e-06, "loss": 0.0045, "step": 4538 }, { "epoch": 2.065059144676979, "grad_norm": 0.36706911447568497, "learning_rate": 6.3508387603632275e-06, "loss": 0.0109, "step": 4539 }, { "epoch": 2.0655141037306644, "grad_norm": 0.287314127027193, "learning_rate": 6.349462560019524e-06, "loss": 0.0082, "step": 4540 }, { "epoch": 2.065969062784349, "grad_norm": 0.5856303471113281, "learning_rate": 6.348086249403836e-06, "loss": 0.0302, "step": 4541 }, { "epoch": 2.0664240218380345, "grad_norm": 0.41896863384200134, "learning_rate": 6.3467098286286274e-06, "loss": 0.01, "step": 4542 }, { "epoch": 2.06687898089172, "grad_norm": 0.426921792024738, "learning_rate": 6.3453332978063745e-06, "loss": 0.0051, "step": 4543 }, { "epoch": 2.0673339399454047, "grad_norm": 0.4348306158879752, "learning_rate": 6.343956657049558e-06, "loss": 0.012, "step": 4544 }, { "epoch": 2.06778889899909, "grad_norm": 0.3622185695212827, "learning_rate": 6.342579906470673e-06, "loss": 0.0128, "step": 4545 }, { "epoch": 2.0682438580527753, "grad_norm": 0.3970694799161914, "learning_rate": 6.341203046182223e-06, "loss": 0.0074, "step": 4546 }, { "epoch": 2.06869881710646, "grad_norm": 0.6259473080603545, "learning_rate": 6.339826076296715e-06, "loss": 0.0157, "step": 4547 }, { "epoch": 2.0691537761601455, "grad_norm": 0.30910267507061634, "learning_rate": 6.338448996926671e-06, "loss": 0.0108, "step": 4548 }, { "epoch": 2.069608735213831, "grad_norm": 0.3797281042748299, "learning_rate": 6.337071808184619e-06, "loss": 0.0145, "step": 4549 }, { "epoch": 2.070063694267516, "grad_norm": 0.24930199379338153, "learning_rate": 6.335694510183098e-06, "loss": 0.0072, "step": 4550 }, { "epoch": 2.070518653321201, "grad_norm": 0.4107630216562235, "learning_rate": 6.3343171030346525e-06, "loss": 0.01, "step": 4551 }, { "epoch": 2.0709736123748863, "grad_norm": 0.3737643389016472, "learning_rate": 6.332939586851838e-06, "loss": 0.0139, "step": 4552 }, { "epoch": 2.0714285714285716, "grad_norm": 0.32194853200282086, "learning_rate": 6.331561961747224e-06, "loss": 0.0073, "step": 4553 }, { "epoch": 2.0718835304822565, "grad_norm": 0.5983325080789265, "learning_rate": 6.330184227833376e-06, "loss": 0.0213, "step": 4554 }, { "epoch": 2.072338489535942, "grad_norm": 0.7343643528516064, "learning_rate": 6.328806385222881e-06, "loss": 0.0168, "step": 4555 }, { "epoch": 2.072793448589627, "grad_norm": 0.4020262446848935, "learning_rate": 6.327428434028331e-06, "loss": 0.0108, "step": 4556 }, { "epoch": 2.073248407643312, "grad_norm": 0.4537001584900146, "learning_rate": 6.326050374362322e-06, "loss": 0.0158, "step": 4557 }, { "epoch": 2.0737033666969973, "grad_norm": 0.38177530440217355, "learning_rate": 6.324672206337465e-06, "loss": 0.0083, "step": 4558 }, { "epoch": 2.0741583257506826, "grad_norm": 0.4500572048722902, "learning_rate": 6.323293930066377e-06, "loss": 0.0148, "step": 4559 }, { "epoch": 2.0746132848043675, "grad_norm": 0.46657307952440574, "learning_rate": 6.3219155456616856e-06, "loss": 0.0217, "step": 4560 }, { "epoch": 2.0750682438580528, "grad_norm": 0.25860718016450046, "learning_rate": 6.320537053236024e-06, "loss": 0.007, "step": 4561 }, { "epoch": 2.075523202911738, "grad_norm": 0.2535852497165178, "learning_rate": 6.31915845290204e-06, "loss": 0.0046, "step": 4562 }, { "epoch": 2.075978161965423, "grad_norm": 0.4879797035856903, "learning_rate": 6.317779744772384e-06, "loss": 0.0168, "step": 4563 }, { "epoch": 2.0764331210191083, "grad_norm": 0.5480357468481581, "learning_rate": 6.316400928959718e-06, "loss": 0.0167, "step": 4564 }, { "epoch": 2.0768880800727936, "grad_norm": 0.4225876036094889, "learning_rate": 6.315022005576713e-06, "loss": 0.0105, "step": 4565 }, { "epoch": 2.0773430391264784, "grad_norm": 0.25443563760292537, "learning_rate": 6.31364297473605e-06, "loss": 0.0084, "step": 4566 }, { "epoch": 2.0777979981801638, "grad_norm": 0.3507842449241893, "learning_rate": 6.312263836550413e-06, "loss": 0.0086, "step": 4567 }, { "epoch": 2.078252957233849, "grad_norm": 0.400370306980809, "learning_rate": 6.310884591132501e-06, "loss": 0.0111, "step": 4568 }, { "epoch": 2.078707916287534, "grad_norm": 0.47168660585239236, "learning_rate": 6.309505238595022e-06, "loss": 0.0067, "step": 4569 }, { "epoch": 2.0791628753412192, "grad_norm": 0.3683418005490081, "learning_rate": 6.3081257790506875e-06, "loss": 0.0086, "step": 4570 }, { "epoch": 2.0796178343949046, "grad_norm": 0.43676959298159407, "learning_rate": 6.306746212612222e-06, "loss": 0.0145, "step": 4571 }, { "epoch": 2.0800727934485894, "grad_norm": 0.3902808514258779, "learning_rate": 6.305366539392358e-06, "loss": 0.0096, "step": 4572 }, { "epoch": 2.0805277525022747, "grad_norm": 0.4408050461319164, "learning_rate": 6.303986759503835e-06, "loss": 0.0132, "step": 4573 }, { "epoch": 2.08098271155596, "grad_norm": 0.4850415360136034, "learning_rate": 6.302606873059403e-06, "loss": 0.0082, "step": 4574 }, { "epoch": 2.0814376706096454, "grad_norm": 0.4259018931666596, "learning_rate": 6.301226880171818e-06, "loss": 0.0212, "step": 4575 }, { "epoch": 2.08189262966333, "grad_norm": 0.37655761023061296, "learning_rate": 6.29984678095385e-06, "loss": 0.0105, "step": 4576 }, { "epoch": 2.0823475887170155, "grad_norm": 0.3938914042491262, "learning_rate": 6.2984665755182735e-06, "loss": 0.0086, "step": 4577 }, { "epoch": 2.082802547770701, "grad_norm": 0.6143724316889407, "learning_rate": 6.297086263977872e-06, "loss": 0.0153, "step": 4578 }, { "epoch": 2.0832575068243857, "grad_norm": 0.5858433418813844, "learning_rate": 6.295705846445439e-06, "loss": 0.0334, "step": 4579 }, { "epoch": 2.083712465878071, "grad_norm": 0.6002687806444671, "learning_rate": 6.294325323033775e-06, "loss": 0.0184, "step": 4580 }, { "epoch": 2.0841674249317563, "grad_norm": 0.42666084226846, "learning_rate": 6.29294469385569e-06, "loss": 0.0183, "step": 4581 }, { "epoch": 2.084622383985441, "grad_norm": 0.33115785714098306, "learning_rate": 6.291563959024005e-06, "loss": 0.0103, "step": 4582 }, { "epoch": 2.0850773430391265, "grad_norm": 0.40795705419312395, "learning_rate": 6.290183118651546e-06, "loss": 0.0161, "step": 4583 }, { "epoch": 2.085532302092812, "grad_norm": 0.24850035268721687, "learning_rate": 6.2888021728511475e-06, "loss": 0.0045, "step": 4584 }, { "epoch": 2.0859872611464967, "grad_norm": 0.34080271467213186, "learning_rate": 6.2874211217356574e-06, "loss": 0.0114, "step": 4585 }, { "epoch": 2.086442220200182, "grad_norm": 0.2859645446897584, "learning_rate": 6.286039965417925e-06, "loss": 0.0082, "step": 4586 }, { "epoch": 2.0868971792538673, "grad_norm": 0.41872676985407964, "learning_rate": 6.284658704010815e-06, "loss": 0.0112, "step": 4587 }, { "epoch": 2.087352138307552, "grad_norm": 0.5174279262156914, "learning_rate": 6.283277337627198e-06, "loss": 0.0217, "step": 4588 }, { "epoch": 2.0878070973612375, "grad_norm": 0.4215936972981889, "learning_rate": 6.281895866379951e-06, "loss": 0.0135, "step": 4589 }, { "epoch": 2.088262056414923, "grad_norm": 0.2934422956997878, "learning_rate": 6.280514290381965e-06, "loss": 0.0036, "step": 4590 }, { "epoch": 2.0887170154686077, "grad_norm": 0.4206969344733646, "learning_rate": 6.2791326097461324e-06, "loss": 0.0132, "step": 4591 }, { "epoch": 2.089171974522293, "grad_norm": 0.3891551053085789, "learning_rate": 6.2777508245853605e-06, "loss": 0.0161, "step": 4592 }, { "epoch": 2.0896269335759783, "grad_norm": 0.2868441850896359, "learning_rate": 6.276368935012559e-06, "loss": 0.008, "step": 4593 }, { "epoch": 2.090081892629663, "grad_norm": 0.4325567732140906, "learning_rate": 6.274986941140654e-06, "loss": 0.0128, "step": 4594 }, { "epoch": 2.0905368516833485, "grad_norm": 0.49807938130393564, "learning_rate": 6.273604843082573e-06, "loss": 0.0136, "step": 4595 }, { "epoch": 2.0909918107370338, "grad_norm": 0.5118888559284318, "learning_rate": 6.272222640951257e-06, "loss": 0.0105, "step": 4596 }, { "epoch": 2.0914467697907186, "grad_norm": 0.28166633533659863, "learning_rate": 6.270840334859651e-06, "loss": 0.0113, "step": 4597 }, { "epoch": 2.091901728844404, "grad_norm": 0.3900023284562493, "learning_rate": 6.269457924920713e-06, "loss": 0.011, "step": 4598 }, { "epoch": 2.0923566878980893, "grad_norm": 0.41681279462432963, "learning_rate": 6.2680754112474065e-06, "loss": 0.0139, "step": 4599 }, { "epoch": 2.092811646951774, "grad_norm": 0.4565442396926241, "learning_rate": 6.266692793952702e-06, "loss": 0.0106, "step": 4600 }, { "epoch": 2.0932666060054594, "grad_norm": 0.5744912833702874, "learning_rate": 6.265310073149584e-06, "loss": 0.0263, "step": 4601 }, { "epoch": 2.0937215650591448, "grad_norm": 0.43435981273700197, "learning_rate": 6.263927248951042e-06, "loss": 0.0131, "step": 4602 }, { "epoch": 2.0941765241128296, "grad_norm": 0.25937892046827166, "learning_rate": 6.26254432147007e-06, "loss": 0.0077, "step": 4603 }, { "epoch": 2.094631483166515, "grad_norm": 0.4005011124266399, "learning_rate": 6.261161290819681e-06, "loss": 0.0103, "step": 4604 }, { "epoch": 2.0950864422202002, "grad_norm": 0.5898502910711467, "learning_rate": 6.259778157112885e-06, "loss": 0.0174, "step": 4605 }, { "epoch": 2.0955414012738856, "grad_norm": 0.5079361789692335, "learning_rate": 6.258394920462707e-06, "loss": 0.0208, "step": 4606 }, { "epoch": 2.0959963603275704, "grad_norm": 0.48058981648043236, "learning_rate": 6.257011580982179e-06, "loss": 0.0102, "step": 4607 }, { "epoch": 2.0964513193812557, "grad_norm": 0.4851769092280976, "learning_rate": 6.255628138784341e-06, "loss": 0.0139, "step": 4608 }, { "epoch": 2.096906278434941, "grad_norm": 0.29882629203864847, "learning_rate": 6.254244593982244e-06, "loss": 0.0098, "step": 4609 }, { "epoch": 2.097361237488626, "grad_norm": 0.35145666589052676, "learning_rate": 6.252860946688939e-06, "loss": 0.0078, "step": 4610 }, { "epoch": 2.097816196542311, "grad_norm": 0.4665876717085764, "learning_rate": 6.251477197017498e-06, "loss": 0.0159, "step": 4611 }, { "epoch": 2.0982711555959965, "grad_norm": 0.5760290115746136, "learning_rate": 6.250093345080992e-06, "loss": 0.0283, "step": 4612 }, { "epoch": 2.0987261146496814, "grad_norm": 0.2294038867724398, "learning_rate": 6.248709390992504e-06, "loss": 0.0046, "step": 4613 }, { "epoch": 2.0991810737033667, "grad_norm": 0.4230616771741766, "learning_rate": 6.247325334865121e-06, "loss": 0.0116, "step": 4614 }, { "epoch": 2.099636032757052, "grad_norm": 0.27642211082644613, "learning_rate": 6.245941176811946e-06, "loss": 0.0084, "step": 4615 }, { "epoch": 2.100090991810737, "grad_norm": 0.6578946218287635, "learning_rate": 6.244556916946085e-06, "loss": 0.0179, "step": 4616 }, { "epoch": 2.100545950864422, "grad_norm": 0.4064909225755356, "learning_rate": 6.243172555380651e-06, "loss": 0.0107, "step": 4617 }, { "epoch": 2.1010009099181075, "grad_norm": 0.254458023879425, "learning_rate": 6.241788092228773e-06, "loss": 0.0109, "step": 4618 }, { "epoch": 2.1014558689717924, "grad_norm": 0.3556847926268138, "learning_rate": 6.240403527603579e-06, "loss": 0.0117, "step": 4619 }, { "epoch": 2.1019108280254777, "grad_norm": 0.5112993248015234, "learning_rate": 6.23901886161821e-06, "loss": 0.0118, "step": 4620 }, { "epoch": 2.102365787079163, "grad_norm": 0.3782746457746778, "learning_rate": 6.237634094385814e-06, "loss": 0.0107, "step": 4621 }, { "epoch": 2.102820746132848, "grad_norm": 0.3403380790753291, "learning_rate": 6.23624922601955e-06, "loss": 0.0121, "step": 4622 }, { "epoch": 2.103275705186533, "grad_norm": 0.422132134092372, "learning_rate": 6.234864256632582e-06, "loss": 0.0134, "step": 4623 }, { "epoch": 2.1037306642402185, "grad_norm": 0.6334762245502538, "learning_rate": 6.233479186338084e-06, "loss": 0.0195, "step": 4624 }, { "epoch": 2.1041856232939034, "grad_norm": 0.3369816740149923, "learning_rate": 6.232094015249236e-06, "loss": 0.0095, "step": 4625 }, { "epoch": 2.1046405823475887, "grad_norm": 0.5808306354860736, "learning_rate": 6.230708743479231e-06, "loss": 0.0166, "step": 4626 }, { "epoch": 2.105095541401274, "grad_norm": 0.7841793961162681, "learning_rate": 6.229323371141264e-06, "loss": 0.0166, "step": 4627 }, { "epoch": 2.105550500454959, "grad_norm": 0.330060841421579, "learning_rate": 6.2279378983485415e-06, "loss": 0.0166, "step": 4628 }, { "epoch": 2.106005459508644, "grad_norm": 0.28734963065483526, "learning_rate": 6.226552325214281e-06, "loss": 0.0082, "step": 4629 }, { "epoch": 2.1064604185623295, "grad_norm": 0.35806742065789043, "learning_rate": 6.225166651851704e-06, "loss": 0.0121, "step": 4630 }, { "epoch": 2.1069153776160148, "grad_norm": 0.4378141175008406, "learning_rate": 6.2237808783740395e-06, "loss": 0.0104, "step": 4631 }, { "epoch": 2.1073703366696996, "grad_norm": 0.5425819690818054, "learning_rate": 6.2223950048945295e-06, "loss": 0.0223, "step": 4632 }, { "epoch": 2.107825295723385, "grad_norm": 0.4016986818697845, "learning_rate": 6.22100903152642e-06, "loss": 0.0137, "step": 4633 }, { "epoch": 2.1082802547770703, "grad_norm": 0.36947648991630955, "learning_rate": 6.219622958382965e-06, "loss": 0.0165, "step": 4634 }, { "epoch": 2.108735213830755, "grad_norm": 0.17005186534941416, "learning_rate": 6.218236785577431e-06, "loss": 0.0039, "step": 4635 }, { "epoch": 2.1091901728844404, "grad_norm": 0.6340745866299279, "learning_rate": 6.216850513223087e-06, "loss": 0.0256, "step": 4636 }, { "epoch": 2.1096451319381258, "grad_norm": 0.30127420891385065, "learning_rate": 6.215464141433216e-06, "loss": 0.0069, "step": 4637 }, { "epoch": 2.1101000909918106, "grad_norm": 0.23026028296805445, "learning_rate": 6.214077670321103e-06, "loss": 0.0035, "step": 4638 }, { "epoch": 2.110555050045496, "grad_norm": 0.45991686539056265, "learning_rate": 6.212691100000046e-06, "loss": 0.014, "step": 4639 }, { "epoch": 2.1110100090991812, "grad_norm": 0.36611553659446155, "learning_rate": 6.211304430583349e-06, "loss": 0.0161, "step": 4640 }, { "epoch": 2.111464968152866, "grad_norm": 0.3628732699246142, "learning_rate": 6.209917662184324e-06, "loss": 0.0084, "step": 4641 }, { "epoch": 2.1119199272065514, "grad_norm": 0.32595313351396926, "learning_rate": 6.208530794916291e-06, "loss": 0.007, "step": 4642 }, { "epoch": 2.1123748862602367, "grad_norm": 0.4699770478908922, "learning_rate": 6.20714382889258e-06, "loss": 0.0115, "step": 4643 }, { "epoch": 2.1128298453139216, "grad_norm": 0.4628952236120021, "learning_rate": 6.205756764226526e-06, "loss": 0.0121, "step": 4644 }, { "epoch": 2.113284804367607, "grad_norm": 0.5187753749264317, "learning_rate": 6.2043696010314745e-06, "loss": 0.0284, "step": 4645 }, { "epoch": 2.113739763421292, "grad_norm": 0.3867323333746344, "learning_rate": 6.202982339420778e-06, "loss": 0.014, "step": 4646 }, { "epoch": 2.114194722474977, "grad_norm": 0.3981493529790491, "learning_rate": 6.201594979507798e-06, "loss": 0.0093, "step": 4647 }, { "epoch": 2.1146496815286624, "grad_norm": 0.27754154956912436, "learning_rate": 6.2002075214059024e-06, "loss": 0.0063, "step": 4648 }, { "epoch": 2.1151046405823477, "grad_norm": 0.4835166644113581, "learning_rate": 6.198819965228468e-06, "loss": 0.0183, "step": 4649 }, { "epoch": 2.1155595996360326, "grad_norm": 0.3388831775783709, "learning_rate": 6.19743231108888e-06, "loss": 0.0051, "step": 4650 }, { "epoch": 2.116014558689718, "grad_norm": 0.49436350716636684, "learning_rate": 6.196044559100531e-06, "loss": 0.0175, "step": 4651 }, { "epoch": 2.116469517743403, "grad_norm": 0.3239993158425095, "learning_rate": 6.194656709376822e-06, "loss": 0.0097, "step": 4652 }, { "epoch": 2.116924476797088, "grad_norm": 0.2610206185014245, "learning_rate": 6.193268762031162e-06, "loss": 0.0134, "step": 4653 }, { "epoch": 2.1173794358507734, "grad_norm": 0.4812889361595157, "learning_rate": 6.1918807171769666e-06, "loss": 0.0209, "step": 4654 }, { "epoch": 2.1178343949044587, "grad_norm": 0.31728463901876697, "learning_rate": 6.1904925749276635e-06, "loss": 0.0096, "step": 4655 }, { "epoch": 2.1182893539581436, "grad_norm": 0.28225278847668417, "learning_rate": 6.189104335396681e-06, "loss": 0.0073, "step": 4656 }, { "epoch": 2.118744313011829, "grad_norm": 0.30348622765303085, "learning_rate": 6.187715998697463e-06, "loss": 0.0076, "step": 4657 }, { "epoch": 2.119199272065514, "grad_norm": 0.3446741668542745, "learning_rate": 6.1863275649434575e-06, "loss": 0.0084, "step": 4658 }, { "epoch": 2.1196542311191995, "grad_norm": 0.42149292621983275, "learning_rate": 6.184939034248121e-06, "loss": 0.0127, "step": 4659 }, { "epoch": 2.1201091901728844, "grad_norm": 0.2888655467331999, "learning_rate": 6.183550406724917e-06, "loss": 0.0068, "step": 4660 }, { "epoch": 2.1205641492265697, "grad_norm": 0.27747096934774546, "learning_rate": 6.18216168248732e-06, "loss": 0.0106, "step": 4661 }, { "epoch": 2.121019108280255, "grad_norm": 0.4080394325964134, "learning_rate": 6.1807728616488085e-06, "loss": 0.0122, "step": 4662 }, { "epoch": 2.12147406733394, "grad_norm": 0.30637128462500635, "learning_rate": 6.1793839443228685e-06, "loss": 0.0068, "step": 4663 }, { "epoch": 2.121929026387625, "grad_norm": 0.44786866354627414, "learning_rate": 6.177994930623001e-06, "loss": 0.0162, "step": 4664 }, { "epoch": 2.1223839854413105, "grad_norm": 0.3251280730970267, "learning_rate": 6.176605820662707e-06, "loss": 0.011, "step": 4665 }, { "epoch": 2.1228389444949953, "grad_norm": 0.4069982323725549, "learning_rate": 6.1752166145554996e-06, "loss": 0.0081, "step": 4666 }, { "epoch": 2.1232939035486806, "grad_norm": 0.32279978984882435, "learning_rate": 6.173827312414897e-06, "loss": 0.0114, "step": 4667 }, { "epoch": 2.123748862602366, "grad_norm": 0.4372200402483741, "learning_rate": 6.172437914354428e-06, "loss": 0.011, "step": 4668 }, { "epoch": 2.124203821656051, "grad_norm": 0.26049460733199026, "learning_rate": 6.171048420487627e-06, "loss": 0.0073, "step": 4669 }, { "epoch": 2.124658780709736, "grad_norm": 0.3874247713956048, "learning_rate": 6.169658830928037e-06, "loss": 0.0114, "step": 4670 }, { "epoch": 2.1251137397634214, "grad_norm": 0.36789539929279097, "learning_rate": 6.168269145789211e-06, "loss": 0.0128, "step": 4671 }, { "epoch": 2.1255686988171063, "grad_norm": 0.41423701778830163, "learning_rate": 6.166879365184705e-06, "loss": 0.0125, "step": 4672 }, { "epoch": 2.1260236578707916, "grad_norm": 0.3888237106440935, "learning_rate": 6.165489489228086e-06, "loss": 0.0097, "step": 4673 }, { "epoch": 2.126478616924477, "grad_norm": 0.33622016893128404, "learning_rate": 6.164099518032931e-06, "loss": 0.0096, "step": 4674 }, { "epoch": 2.126933575978162, "grad_norm": 0.5419369950038244, "learning_rate": 6.16270945171282e-06, "loss": 0.0223, "step": 4675 }, { "epoch": 2.127388535031847, "grad_norm": 0.4666748588676364, "learning_rate": 6.161319290381342e-06, "loss": 0.0138, "step": 4676 }, { "epoch": 2.1278434940855324, "grad_norm": 0.20680893492029834, "learning_rate": 6.159929034152098e-06, "loss": 0.0044, "step": 4677 }, { "epoch": 2.1282984531392173, "grad_norm": 0.32100270225017463, "learning_rate": 6.158538683138689e-06, "loss": 0.0079, "step": 4678 }, { "epoch": 2.1287534121929026, "grad_norm": 0.5240960762964716, "learning_rate": 6.157148237454734e-06, "loss": 0.0208, "step": 4679 }, { "epoch": 2.129208371246588, "grad_norm": 0.4750588882235299, "learning_rate": 6.155757697213848e-06, "loss": 0.0317, "step": 4680 }, { "epoch": 2.1296633303002728, "grad_norm": 0.6071814918296271, "learning_rate": 6.154367062529663e-06, "loss": 0.0162, "step": 4681 }, { "epoch": 2.130118289353958, "grad_norm": 0.3483499351536633, "learning_rate": 6.152976333515816e-06, "loss": 0.0103, "step": 4682 }, { "epoch": 2.1305732484076434, "grad_norm": 0.3176517571523393, "learning_rate": 6.151585510285949e-06, "loss": 0.022, "step": 4683 }, { "epoch": 2.1310282074613287, "grad_norm": 0.2834270098676874, "learning_rate": 6.150194592953714e-06, "loss": 0.0078, "step": 4684 }, { "epoch": 2.1314831665150136, "grad_norm": 0.20075819652666196, "learning_rate": 6.1488035816327705e-06, "loss": 0.0052, "step": 4685 }, { "epoch": 2.131938125568699, "grad_norm": 0.35543231749418297, "learning_rate": 6.147412476436789e-06, "loss": 0.0188, "step": 4686 }, { "epoch": 2.132393084622384, "grad_norm": 0.4005658010128812, "learning_rate": 6.146021277479438e-06, "loss": 0.0161, "step": 4687 }, { "epoch": 2.132848043676069, "grad_norm": 0.43084722475538406, "learning_rate": 6.1446299848744064e-06, "loss": 0.0094, "step": 4688 }, { "epoch": 2.1333030027297544, "grad_norm": 0.35225087592272825, "learning_rate": 6.143238598735382e-06, "loss": 0.0104, "step": 4689 }, { "epoch": 2.1337579617834397, "grad_norm": 0.4328677661340792, "learning_rate": 6.14184711917606e-06, "loss": 0.0124, "step": 4690 }, { "epoch": 2.1342129208371245, "grad_norm": 0.512463258271892, "learning_rate": 6.140455546310149e-06, "loss": 0.0218, "step": 4691 }, { "epoch": 2.13466787989081, "grad_norm": 0.309462854786802, "learning_rate": 6.13906388025136e-06, "loss": 0.0079, "step": 4692 }, { "epoch": 2.135122838944495, "grad_norm": 0.46781772579974923, "learning_rate": 6.137672121113416e-06, "loss": 0.0191, "step": 4693 }, { "epoch": 2.13557779799818, "grad_norm": 2.517888895246575, "learning_rate": 6.136280269010043e-06, "loss": 0.0821, "step": 4694 }, { "epoch": 2.1360327570518653, "grad_norm": 0.4572240952536275, "learning_rate": 6.134888324054978e-06, "loss": 0.016, "step": 4695 }, { "epoch": 2.1364877161055507, "grad_norm": 0.24484263967956846, "learning_rate": 6.133496286361965e-06, "loss": 0.0085, "step": 4696 }, { "epoch": 2.1369426751592355, "grad_norm": 0.3074001728531714, "learning_rate": 6.132104156044753e-06, "loss": 0.0082, "step": 4697 }, { "epoch": 2.137397634212921, "grad_norm": 0.6501583812528575, "learning_rate": 6.130711933217103e-06, "loss": 0.0184, "step": 4698 }, { "epoch": 2.137852593266606, "grad_norm": 0.43823203013909906, "learning_rate": 6.12931961799278e-06, "loss": 0.0161, "step": 4699 }, { "epoch": 2.138307552320291, "grad_norm": 0.2380852798065643, "learning_rate": 6.127927210485558e-06, "loss": 0.0052, "step": 4700 }, { "epoch": 2.1387625113739763, "grad_norm": 0.41066885343616005, "learning_rate": 6.126534710809217e-06, "loss": 0.0104, "step": 4701 }, { "epoch": 2.1392174704276616, "grad_norm": 0.3193981628094812, "learning_rate": 6.125142119077548e-06, "loss": 0.009, "step": 4702 }, { "epoch": 2.1396724294813465, "grad_norm": 0.27467948825988836, "learning_rate": 6.123749435404345e-06, "loss": 0.0067, "step": 4703 }, { "epoch": 2.140127388535032, "grad_norm": 0.47760238173553504, "learning_rate": 6.122356659903414e-06, "loss": 0.0081, "step": 4704 }, { "epoch": 2.140582347588717, "grad_norm": 0.33054107501935437, "learning_rate": 6.1209637926885635e-06, "loss": 0.0108, "step": 4705 }, { "epoch": 2.141037306642402, "grad_norm": 0.15623333190508923, "learning_rate": 6.119570833873616e-06, "loss": 0.0035, "step": 4706 }, { "epoch": 2.1414922656960873, "grad_norm": 0.6933431793628463, "learning_rate": 6.118177783572394e-06, "loss": 0.0378, "step": 4707 }, { "epoch": 2.1419472247497726, "grad_norm": 0.24781879479870367, "learning_rate": 6.116784641898734e-06, "loss": 0.0074, "step": 4708 }, { "epoch": 2.1424021838034575, "grad_norm": 0.4716485241111409, "learning_rate": 6.115391408966478e-06, "loss": 0.0097, "step": 4709 }, { "epoch": 2.142857142857143, "grad_norm": 0.3580072322855141, "learning_rate": 6.113998084889472e-06, "loss": 0.0042, "step": 4710 }, { "epoch": 2.143312101910828, "grad_norm": 0.38016300805459524, "learning_rate": 6.112604669781572e-06, "loss": 0.012, "step": 4711 }, { "epoch": 2.143767060964513, "grad_norm": 0.507619153938635, "learning_rate": 6.111211163756644e-06, "loss": 0.0146, "step": 4712 }, { "epoch": 2.1442220200181983, "grad_norm": 0.42767402212513583, "learning_rate": 6.10981756692856e-06, "loss": 0.0115, "step": 4713 }, { "epoch": 2.1446769790718836, "grad_norm": 0.5149753893607478, "learning_rate": 6.108423879411193e-06, "loss": 0.011, "step": 4714 }, { "epoch": 2.145131938125569, "grad_norm": 0.23231944854908404, "learning_rate": 6.107030101318433e-06, "loss": 0.0056, "step": 4715 }, { "epoch": 2.1455868971792538, "grad_norm": 0.35414801991752665, "learning_rate": 6.1056362327641726e-06, "loss": 0.0077, "step": 4716 }, { "epoch": 2.146041856232939, "grad_norm": 0.4995095629811545, "learning_rate": 6.104242273862313e-06, "loss": 0.0187, "step": 4717 }, { "epoch": 2.1464968152866244, "grad_norm": 0.40298438638076695, "learning_rate": 6.102848224726761e-06, "loss": 0.011, "step": 4718 }, { "epoch": 2.1469517743403093, "grad_norm": 0.2879775600435006, "learning_rate": 6.1014540854714324e-06, "loss": 0.0056, "step": 4719 }, { "epoch": 2.1474067333939946, "grad_norm": 0.2463958119740778, "learning_rate": 6.100059856210251e-06, "loss": 0.009, "step": 4720 }, { "epoch": 2.14786169244768, "grad_norm": 0.6843603558724051, "learning_rate": 6.098665537057145e-06, "loss": 0.0192, "step": 4721 }, { "epoch": 2.1483166515013647, "grad_norm": 0.3959988936762539, "learning_rate": 6.097271128126052e-06, "loss": 0.0117, "step": 4722 }, { "epoch": 2.14877161055505, "grad_norm": 0.24838444785399422, "learning_rate": 6.095876629530918e-06, "loss": 0.0068, "step": 4723 }, { "epoch": 2.1492265696087354, "grad_norm": 0.2528726815979166, "learning_rate": 6.094482041385697e-06, "loss": 0.0053, "step": 4724 }, { "epoch": 2.1496815286624202, "grad_norm": 0.4836205323337117, "learning_rate": 6.093087363804345e-06, "loss": 0.0129, "step": 4725 }, { "epoch": 2.1501364877161055, "grad_norm": 0.3494498814055253, "learning_rate": 6.0916925969008275e-06, "loss": 0.0146, "step": 4726 }, { "epoch": 2.150591446769791, "grad_norm": 0.3399153756042083, "learning_rate": 6.090297740789124e-06, "loss": 0.0106, "step": 4727 }, { "epoch": 2.1510464058234757, "grad_norm": 0.5278632665447198, "learning_rate": 6.088902795583211e-06, "loss": 0.016, "step": 4728 }, { "epoch": 2.151501364877161, "grad_norm": 0.2242819039645525, "learning_rate": 6.08750776139708e-06, "loss": 0.0072, "step": 4729 }, { "epoch": 2.1519563239308463, "grad_norm": 0.19418934363495513, "learning_rate": 6.086112638344727e-06, "loss": 0.0053, "step": 4730 }, { "epoch": 2.152411282984531, "grad_norm": 0.27005944566872925, "learning_rate": 6.084717426540152e-06, "loss": 0.0097, "step": 4731 }, { "epoch": 2.1528662420382165, "grad_norm": 0.3667775845948143, "learning_rate": 6.08332212609737e-06, "loss": 0.014, "step": 4732 }, { "epoch": 2.153321201091902, "grad_norm": 0.37190070651539836, "learning_rate": 6.081926737130392e-06, "loss": 0.0106, "step": 4733 }, { "epoch": 2.1537761601455867, "grad_norm": 0.3497323603964637, "learning_rate": 6.080531259753251e-06, "loss": 0.0084, "step": 4734 }, { "epoch": 2.154231119199272, "grad_norm": 0.7224472318461973, "learning_rate": 6.079135694079973e-06, "loss": 0.026, "step": 4735 }, { "epoch": 2.1546860782529573, "grad_norm": 0.4345968958519765, "learning_rate": 6.0777400402246e-06, "loss": 0.0149, "step": 4736 }, { "epoch": 2.1551410373066426, "grad_norm": 0.2859755139669437, "learning_rate": 6.076344298301178e-06, "loss": 0.0084, "step": 4737 }, { "epoch": 2.1555959963603275, "grad_norm": 0.4119900972119734, "learning_rate": 6.0749484684237605e-06, "loss": 0.0109, "step": 4738 }, { "epoch": 2.156050955414013, "grad_norm": 0.3616480465966663, "learning_rate": 6.073552550706408e-06, "loss": 0.011, "step": 4739 }, { "epoch": 2.156505914467698, "grad_norm": 0.30461442568532915, "learning_rate": 6.0721565452631895e-06, "loss": 0.006, "step": 4740 }, { "epoch": 2.156960873521383, "grad_norm": 0.3764717952729475, "learning_rate": 6.070760452208181e-06, "loss": 0.0131, "step": 4741 }, { "epoch": 2.1574158325750683, "grad_norm": 0.20780953253775353, "learning_rate": 6.069364271655463e-06, "loss": 0.0072, "step": 4742 }, { "epoch": 2.1578707916287536, "grad_norm": 0.4013046783652124, "learning_rate": 6.0679680037191245e-06, "loss": 0.0104, "step": 4743 }, { "epoch": 2.1583257506824385, "grad_norm": 0.5323515325728005, "learning_rate": 6.0665716485132665e-06, "loss": 0.0247, "step": 4744 }, { "epoch": 2.158780709736124, "grad_norm": 0.3098061356634163, "learning_rate": 6.065175206151988e-06, "loss": 0.0071, "step": 4745 }, { "epoch": 2.159235668789809, "grad_norm": 0.6711243538714666, "learning_rate": 6.0637786767494035e-06, "loss": 0.0213, "step": 4746 }, { "epoch": 2.159690627843494, "grad_norm": 0.4331859916089256, "learning_rate": 6.062382060419628e-06, "loss": 0.0131, "step": 4747 }, { "epoch": 2.1601455868971793, "grad_norm": 0.43169068625570167, "learning_rate": 6.0609853572767886e-06, "loss": 0.0122, "step": 4748 }, { "epoch": 2.1606005459508646, "grad_norm": 0.40906162019120135, "learning_rate": 6.0595885674350184e-06, "loss": 0.0155, "step": 4749 }, { "epoch": 2.1610555050045495, "grad_norm": 0.3653189390834431, "learning_rate": 6.058191691008453e-06, "loss": 0.0133, "step": 4750 }, { "epoch": 2.1615104640582348, "grad_norm": 0.7215799730558953, "learning_rate": 6.056794728111244e-06, "loss": 0.0294, "step": 4751 }, { "epoch": 2.16196542311192, "grad_norm": 0.28209095792273875, "learning_rate": 6.0553976788575395e-06, "loss": 0.0084, "step": 4752 }, { "epoch": 2.162420382165605, "grad_norm": 0.26556119267985756, "learning_rate": 6.054000543361506e-06, "loss": 0.0067, "step": 4753 }, { "epoch": 2.1628753412192903, "grad_norm": 0.46234399150367417, "learning_rate": 6.052603321737306e-06, "loss": 0.0228, "step": 4754 }, { "epoch": 2.1633303002729756, "grad_norm": 0.2291718371880802, "learning_rate": 6.051206014099116e-06, "loss": 0.0043, "step": 4755 }, { "epoch": 2.1637852593266604, "grad_norm": 0.37626130042703126, "learning_rate": 6.049808620561118e-06, "loss": 0.0108, "step": 4756 }, { "epoch": 2.1642402183803457, "grad_norm": 0.4387336074321349, "learning_rate": 6.0484111412375005e-06, "loss": 0.0157, "step": 4757 }, { "epoch": 2.164695177434031, "grad_norm": 0.33337756700143717, "learning_rate": 6.047013576242459e-06, "loss": 0.0072, "step": 4758 }, { "epoch": 2.165150136487716, "grad_norm": 0.4185471804476035, "learning_rate": 6.045615925690196e-06, "loss": 0.0189, "step": 4759 }, { "epoch": 2.1656050955414012, "grad_norm": 0.2661686896047376, "learning_rate": 6.044218189694922e-06, "loss": 0.0083, "step": 4760 }, { "epoch": 2.1660600545950865, "grad_norm": 0.4272995989189421, "learning_rate": 6.042820368370854e-06, "loss": 0.0168, "step": 4761 }, { "epoch": 2.1665150136487714, "grad_norm": 0.31015151921115824, "learning_rate": 6.041422461832214e-06, "loss": 0.0054, "step": 4762 }, { "epoch": 2.1669699727024567, "grad_norm": 0.5715853686305054, "learning_rate": 6.0400244701932334e-06, "loss": 0.0325, "step": 4763 }, { "epoch": 2.167424931756142, "grad_norm": 0.26231670013399255, "learning_rate": 6.03862639356815e-06, "loss": 0.0052, "step": 4764 }, { "epoch": 2.167879890809827, "grad_norm": 0.4456886325034235, "learning_rate": 6.037228232071207e-06, "loss": 0.0185, "step": 4765 }, { "epoch": 2.168334849863512, "grad_norm": 0.4069779934635853, "learning_rate": 6.035829985816659e-06, "loss": 0.0112, "step": 4766 }, { "epoch": 2.1687898089171975, "grad_norm": 0.33798797647195855, "learning_rate": 6.034431654918761e-06, "loss": 0.0078, "step": 4767 }, { "epoch": 2.1692447679708824, "grad_norm": 0.4244850264652735, "learning_rate": 6.033033239491779e-06, "loss": 0.0127, "step": 4768 }, { "epoch": 2.1696997270245677, "grad_norm": 0.4582293511737658, "learning_rate": 6.031634739649987e-06, "loss": 0.0228, "step": 4769 }, { "epoch": 2.170154686078253, "grad_norm": 0.3495427084098909, "learning_rate": 6.030236155507663e-06, "loss": 0.0135, "step": 4770 }, { "epoch": 2.1706096451319383, "grad_norm": 0.3447277224746257, "learning_rate": 6.028837487179092e-06, "loss": 0.0086, "step": 4771 }, { "epoch": 2.171064604185623, "grad_norm": 0.41684658949692854, "learning_rate": 6.0274387347785675e-06, "loss": 0.0131, "step": 4772 }, { "epoch": 2.1715195632393085, "grad_norm": 2.161786263491943, "learning_rate": 6.026039898420392e-06, "loss": 0.0314, "step": 4773 }, { "epoch": 2.171974522292994, "grad_norm": 0.23402137097262496, "learning_rate": 6.024640978218867e-06, "loss": 0.0077, "step": 4774 }, { "epoch": 2.1724294813466787, "grad_norm": 0.4908623331090018, "learning_rate": 6.023241974288308e-06, "loss": 0.0137, "step": 4775 }, { "epoch": 2.172884440400364, "grad_norm": 0.3139320367383749, "learning_rate": 6.021842886743036e-06, "loss": 0.0094, "step": 4776 }, { "epoch": 2.1733393994540493, "grad_norm": 0.5267319213235393, "learning_rate": 6.02044371569738e-06, "loss": 0.0174, "step": 4777 }, { "epoch": 2.173794358507734, "grad_norm": 0.3176716587564772, "learning_rate": 6.019044461265672e-06, "loss": 0.0077, "step": 4778 }, { "epoch": 2.1742493175614195, "grad_norm": 0.5365896557338491, "learning_rate": 6.01764512356225e-06, "loss": 0.0151, "step": 4779 }, { "epoch": 2.174704276615105, "grad_norm": 0.3069252575826205, "learning_rate": 6.016245702701466e-06, "loss": 0.0073, "step": 4780 }, { "epoch": 2.1751592356687897, "grad_norm": 0.45555655741708767, "learning_rate": 6.014846198797673e-06, "loss": 0.0152, "step": 4781 }, { "epoch": 2.175614194722475, "grad_norm": 0.577079432872511, "learning_rate": 6.013446611965229e-06, "loss": 0.0202, "step": 4782 }, { "epoch": 2.1760691537761603, "grad_norm": 0.2647405580225935, "learning_rate": 6.012046942318507e-06, "loss": 0.0062, "step": 4783 }, { "epoch": 2.176524112829845, "grad_norm": 0.38849884729419404, "learning_rate": 6.0106471899718785e-06, "loss": 0.0143, "step": 4784 }, { "epoch": 2.1769790718835305, "grad_norm": 0.4252941734157271, "learning_rate": 6.009247355039725e-06, "loss": 0.0126, "step": 4785 }, { "epoch": 2.1774340309372158, "grad_norm": 0.2569941751090335, "learning_rate": 6.007847437636436e-06, "loss": 0.0057, "step": 4786 }, { "epoch": 2.1778889899909006, "grad_norm": 0.44171668049013985, "learning_rate": 6.006447437876406e-06, "loss": 0.0107, "step": 4787 }, { "epoch": 2.178343949044586, "grad_norm": 0.39292596246211253, "learning_rate": 6.005047355874036e-06, "loss": 0.0103, "step": 4788 }, { "epoch": 2.1787989080982713, "grad_norm": 0.4829001341608921, "learning_rate": 6.003647191743734e-06, "loss": 0.0139, "step": 4789 }, { "epoch": 2.179253867151956, "grad_norm": 0.3070324978728649, "learning_rate": 6.002246945599918e-06, "loss": 0.0067, "step": 4790 }, { "epoch": 2.1797088262056414, "grad_norm": 0.3008772991904148, "learning_rate": 6.0008466175570066e-06, "loss": 0.0058, "step": 4791 }, { "epoch": 2.1801637852593267, "grad_norm": 0.47506214872913943, "learning_rate": 5.999446207729429e-06, "loss": 0.0192, "step": 4792 }, { "epoch": 2.180618744313012, "grad_norm": 0.4593157770677093, "learning_rate": 5.9980457162316206e-06, "loss": 0.0168, "step": 4793 }, { "epoch": 2.181073703366697, "grad_norm": 0.30301020683938856, "learning_rate": 5.996645143178025e-06, "loss": 0.0075, "step": 4794 }, { "epoch": 2.1815286624203822, "grad_norm": 0.39741882552150204, "learning_rate": 5.995244488683088e-06, "loss": 0.009, "step": 4795 }, { "epoch": 2.1819836214740675, "grad_norm": 0.6789778146399447, "learning_rate": 5.993843752861266e-06, "loss": 0.0265, "step": 4796 }, { "epoch": 2.1824385805277524, "grad_norm": 0.45692103365533293, "learning_rate": 5.992442935827021e-06, "loss": 0.0123, "step": 4797 }, { "epoch": 2.1828935395814377, "grad_norm": 0.2663643592767637, "learning_rate": 5.99104203769482e-06, "loss": 0.0065, "step": 4798 }, { "epoch": 2.183348498635123, "grad_norm": 0.4192556124069004, "learning_rate": 5.98964105857914e-06, "loss": 0.0164, "step": 4799 }, { "epoch": 2.183803457688808, "grad_norm": 0.3294174370540794, "learning_rate": 5.988239998594463e-06, "loss": 0.0111, "step": 4800 }, { "epoch": 2.184258416742493, "grad_norm": 0.256413754235468, "learning_rate": 5.9868388578552736e-06, "loss": 0.0057, "step": 4801 }, { "epoch": 2.1847133757961785, "grad_norm": 0.35476569934440916, "learning_rate": 5.985437636476072e-06, "loss": 0.013, "step": 4802 }, { "epoch": 2.1851683348498634, "grad_norm": 0.5364039288146651, "learning_rate": 5.984036334571354e-06, "loss": 0.0188, "step": 4803 }, { "epoch": 2.1856232939035487, "grad_norm": 0.5035275777717944, "learning_rate": 5.982634952255633e-06, "loss": 0.0111, "step": 4804 }, { "epoch": 2.186078252957234, "grad_norm": 0.3242586079060238, "learning_rate": 5.98123348964342e-06, "loss": 0.0112, "step": 4805 }, { "epoch": 2.186533212010919, "grad_norm": 0.3814719552488175, "learning_rate": 5.979831946849237e-06, "loss": 0.0208, "step": 4806 }, { "epoch": 2.186988171064604, "grad_norm": 0.4091000994356109, "learning_rate": 5.978430323987614e-06, "loss": 0.0129, "step": 4807 }, { "epoch": 2.1874431301182895, "grad_norm": 0.3493239318833215, "learning_rate": 5.977028621173082e-06, "loss": 0.0099, "step": 4808 }, { "epoch": 2.1878980891719744, "grad_norm": 0.4579529662370525, "learning_rate": 5.975626838520185e-06, "loss": 0.0104, "step": 4809 }, { "epoch": 2.1883530482256597, "grad_norm": 0.3544967487096724, "learning_rate": 5.9742249761434665e-06, "loss": 0.0094, "step": 4810 }, { "epoch": 2.188808007279345, "grad_norm": 0.35437826425866875, "learning_rate": 5.972823034157485e-06, "loss": 0.0133, "step": 4811 }, { "epoch": 2.18926296633303, "grad_norm": 0.29840883208459107, "learning_rate": 5.971421012676796e-06, "loss": 0.009, "step": 4812 }, { "epoch": 2.189717925386715, "grad_norm": 0.32160309893785227, "learning_rate": 5.970018911815969e-06, "loss": 0.0075, "step": 4813 }, { "epoch": 2.1901728844404005, "grad_norm": 0.40003893224302556, "learning_rate": 5.9686167316895786e-06, "loss": 0.01, "step": 4814 }, { "epoch": 2.1906278434940853, "grad_norm": 0.5572564727276995, "learning_rate": 5.967214472412202e-06, "loss": 0.0185, "step": 4815 }, { "epoch": 2.1910828025477707, "grad_norm": 0.37148152625028286, "learning_rate": 5.965812134098428e-06, "loss": 0.0131, "step": 4816 }, { "epoch": 2.191537761601456, "grad_norm": 0.3321422926123032, "learning_rate": 5.9644097168628455e-06, "loss": 0.0113, "step": 4817 }, { "epoch": 2.191992720655141, "grad_norm": 0.4280593116204736, "learning_rate": 5.963007220820057e-06, "loss": 0.0105, "step": 4818 }, { "epoch": 2.192447679708826, "grad_norm": 0.48795039502378945, "learning_rate": 5.9616046460846685e-06, "loss": 0.0264, "step": 4819 }, { "epoch": 2.1929026387625115, "grad_norm": 0.4345889767556356, "learning_rate": 5.960201992771289e-06, "loss": 0.0147, "step": 4820 }, { "epoch": 2.1933575978161963, "grad_norm": 0.26992468902773464, "learning_rate": 5.958799260994541e-06, "loss": 0.005, "step": 4821 }, { "epoch": 2.1938125568698816, "grad_norm": 0.3243729973073873, "learning_rate": 5.957396450869046e-06, "loss": 0.0059, "step": 4822 }, { "epoch": 2.194267515923567, "grad_norm": 0.20256304912426243, "learning_rate": 5.955993562509438e-06, "loss": 0.0032, "step": 4823 }, { "epoch": 2.194722474977252, "grad_norm": 0.3111318089177059, "learning_rate": 5.954590596030352e-06, "loss": 0.0134, "step": 4824 }, { "epoch": 2.195177434030937, "grad_norm": 0.27959531047054953, "learning_rate": 5.953187551546433e-06, "loss": 0.0075, "step": 4825 }, { "epoch": 2.1956323930846224, "grad_norm": 0.32796181877560665, "learning_rate": 5.951784429172334e-06, "loss": 0.0097, "step": 4826 }, { "epoch": 2.1960873521383077, "grad_norm": 0.8261618815243253, "learning_rate": 5.950381229022706e-06, "loss": 0.0163, "step": 4827 }, { "epoch": 2.1965423111919926, "grad_norm": 0.47960887226055954, "learning_rate": 5.948977951212219e-06, "loss": 0.0133, "step": 4828 }, { "epoch": 2.196997270245678, "grad_norm": 0.6295428431852292, "learning_rate": 5.947574595855539e-06, "loss": 0.0157, "step": 4829 }, { "epoch": 2.1974522292993632, "grad_norm": 0.5058231133904427, "learning_rate": 5.946171163067341e-06, "loss": 0.0136, "step": 4830 }, { "epoch": 2.197907188353048, "grad_norm": 0.41769543575645507, "learning_rate": 5.944767652962309e-06, "loss": 0.0139, "step": 4831 }, { "epoch": 2.1983621474067334, "grad_norm": 0.4026794930288648, "learning_rate": 5.943364065655131e-06, "loss": 0.0199, "step": 4832 }, { "epoch": 2.1988171064604187, "grad_norm": 0.613796587142224, "learning_rate": 5.941960401260502e-06, "loss": 0.0241, "step": 4833 }, { "epoch": 2.1992720655141036, "grad_norm": 0.44531019560955765, "learning_rate": 5.940556659893123e-06, "loss": 0.0111, "step": 4834 }, { "epoch": 2.199727024567789, "grad_norm": 0.6064428736600246, "learning_rate": 5.9391528416677e-06, "loss": 0.0241, "step": 4835 }, { "epoch": 2.200181983621474, "grad_norm": 0.659961666861241, "learning_rate": 5.93774894669895e-06, "loss": 0.0193, "step": 4836 }, { "epoch": 2.200636942675159, "grad_norm": 0.35896669249236113, "learning_rate": 5.936344975101589e-06, "loss": 0.0162, "step": 4837 }, { "epoch": 2.2010919017288444, "grad_norm": 0.2226024632735598, "learning_rate": 5.934940926990346e-06, "loss": 0.0059, "step": 4838 }, { "epoch": 2.2015468607825297, "grad_norm": 0.6412521074792568, "learning_rate": 5.933536802479952e-06, "loss": 0.0268, "step": 4839 }, { "epoch": 2.2020018198362146, "grad_norm": 0.2311707642063774, "learning_rate": 5.9321326016851475e-06, "loss": 0.0085, "step": 4840 }, { "epoch": 2.2024567788899, "grad_norm": 0.2665377816064787, "learning_rate": 5.930728324720676e-06, "loss": 0.0075, "step": 4841 }, { "epoch": 2.202911737943585, "grad_norm": 0.433629797866015, "learning_rate": 5.929323971701287e-06, "loss": 0.0091, "step": 4842 }, { "epoch": 2.20336669699727, "grad_norm": 0.37633327644039666, "learning_rate": 5.927919542741742e-06, "loss": 0.0149, "step": 4843 }, { "epoch": 2.2038216560509554, "grad_norm": 0.2997005850142501, "learning_rate": 5.926515037956802e-06, "loss": 0.0049, "step": 4844 }, { "epoch": 2.2042766151046407, "grad_norm": 0.4106961641210632, "learning_rate": 5.925110457461236e-06, "loss": 0.0107, "step": 4845 }, { "epoch": 2.2047315741583255, "grad_norm": 0.390639317481699, "learning_rate": 5.923705801369822e-06, "loss": 0.0184, "step": 4846 }, { "epoch": 2.205186533212011, "grad_norm": 0.3714930458538043, "learning_rate": 5.922301069797343e-06, "loss": 0.0079, "step": 4847 }, { "epoch": 2.205641492265696, "grad_norm": 0.37185622687701453, "learning_rate": 5.920896262858583e-06, "loss": 0.0096, "step": 4848 }, { "epoch": 2.2060964513193815, "grad_norm": 0.21431297930910828, "learning_rate": 5.919491380668341e-06, "loss": 0.0027, "step": 4849 }, { "epoch": 2.2065514103730663, "grad_norm": 0.26101335832628725, "learning_rate": 5.918086423341415e-06, "loss": 0.005, "step": 4850 }, { "epoch": 2.2070063694267517, "grad_norm": 0.3174396421177979, "learning_rate": 5.916681390992613e-06, "loss": 0.0083, "step": 4851 }, { "epoch": 2.207461328480437, "grad_norm": 0.37189332383009505, "learning_rate": 5.915276283736746e-06, "loss": 0.0125, "step": 4852 }, { "epoch": 2.207916287534122, "grad_norm": 0.22839634936221256, "learning_rate": 5.9138711016886364e-06, "loss": 0.0082, "step": 4853 }, { "epoch": 2.208371246587807, "grad_norm": 0.39902762102311734, "learning_rate": 5.912465844963106e-06, "loss": 0.0089, "step": 4854 }, { "epoch": 2.2088262056414925, "grad_norm": 0.38011464793935024, "learning_rate": 5.911060513674986e-06, "loss": 0.0132, "step": 4855 }, { "epoch": 2.2092811646951773, "grad_norm": 0.44117284633989806, "learning_rate": 5.9096551079391175e-06, "loss": 0.0098, "step": 4856 }, { "epoch": 2.2097361237488626, "grad_norm": 0.5433595565488459, "learning_rate": 5.908249627870342e-06, "loss": 0.0211, "step": 4857 }, { "epoch": 2.210191082802548, "grad_norm": 0.4540701154032658, "learning_rate": 5.906844073583507e-06, "loss": 0.0101, "step": 4858 }, { "epoch": 2.210646041856233, "grad_norm": 0.6026998190118189, "learning_rate": 5.90543844519347e-06, "loss": 0.0261, "step": 4859 }, { "epoch": 2.211101000909918, "grad_norm": 0.6184229542959041, "learning_rate": 5.904032742815092e-06, "loss": 0.0276, "step": 4860 }, { "epoch": 2.2115559599636034, "grad_norm": 0.32119971093347577, "learning_rate": 5.902626966563241e-06, "loss": 0.0084, "step": 4861 }, { "epoch": 2.2120109190172883, "grad_norm": 0.353281287967236, "learning_rate": 5.901221116552791e-06, "loss": 0.0125, "step": 4862 }, { "epoch": 2.2124658780709736, "grad_norm": 0.535007533174073, "learning_rate": 5.8998151928986205e-06, "loss": 0.0285, "step": 4863 }, { "epoch": 2.212920837124659, "grad_norm": 0.5308484524550808, "learning_rate": 5.898409195715616e-06, "loss": 0.0214, "step": 4864 }, { "epoch": 2.213375796178344, "grad_norm": 0.44553792105412476, "learning_rate": 5.89700312511867e-06, "loss": 0.0107, "step": 4865 }, { "epoch": 2.213830755232029, "grad_norm": 0.494225228414626, "learning_rate": 5.895596981222679e-06, "loss": 0.0131, "step": 4866 }, { "epoch": 2.2142857142857144, "grad_norm": 0.28563660749753433, "learning_rate": 5.894190764142547e-06, "loss": 0.0077, "step": 4867 }, { "epoch": 2.2147406733393993, "grad_norm": 0.36408312845008073, "learning_rate": 5.892784473993184e-06, "loss": 0.0161, "step": 4868 }, { "epoch": 2.2151956323930846, "grad_norm": 0.31879770665449286, "learning_rate": 5.891378110889505e-06, "loss": 0.0124, "step": 4869 }, { "epoch": 2.21565059144677, "grad_norm": 0.3031384330817416, "learning_rate": 5.889971674946434e-06, "loss": 0.0118, "step": 4870 }, { "epoch": 2.2161055505004548, "grad_norm": 0.41800795118456197, "learning_rate": 5.888565166278895e-06, "loss": 0.0128, "step": 4871 }, { "epoch": 2.21656050955414, "grad_norm": 0.5372077913709468, "learning_rate": 5.887158585001825e-06, "loss": 0.0132, "step": 4872 }, { "epoch": 2.2170154686078254, "grad_norm": 0.5567453276448947, "learning_rate": 5.885751931230159e-06, "loss": 0.0153, "step": 4873 }, { "epoch": 2.2174704276615103, "grad_norm": 0.26780172680459685, "learning_rate": 5.884345205078847e-06, "loss": 0.0079, "step": 4874 }, { "epoch": 2.2179253867151956, "grad_norm": 0.2872420903479903, "learning_rate": 5.8829384066628395e-06, "loss": 0.0106, "step": 4875 }, { "epoch": 2.218380345768881, "grad_norm": 0.2720204238574773, "learning_rate": 5.881531536097091e-06, "loss": 0.0085, "step": 4876 }, { "epoch": 2.2188353048225657, "grad_norm": 0.2854866521087616, "learning_rate": 5.8801245934965676e-06, "loss": 0.0116, "step": 4877 }, { "epoch": 2.219290263876251, "grad_norm": 0.2934997083257922, "learning_rate": 5.878717578976236e-06, "loss": 0.0091, "step": 4878 }, { "epoch": 2.2197452229299364, "grad_norm": 0.2950040866228448, "learning_rate": 5.877310492651073e-06, "loss": 0.0091, "step": 4879 }, { "epoch": 2.2202001819836217, "grad_norm": 0.38209288233432137, "learning_rate": 5.875903334636056e-06, "loss": 0.013, "step": 4880 }, { "epoch": 2.2206551410373065, "grad_norm": 0.4351658467566086, "learning_rate": 5.874496105046177e-06, "loss": 0.0133, "step": 4881 }, { "epoch": 2.221110100090992, "grad_norm": 0.452865040336361, "learning_rate": 5.873088803996424e-06, "loss": 0.0123, "step": 4882 }, { "epoch": 2.221565059144677, "grad_norm": 0.3422021101237533, "learning_rate": 5.871681431601797e-06, "loss": 0.0107, "step": 4883 }, { "epoch": 2.222020018198362, "grad_norm": 0.3445596196811098, "learning_rate": 5.870273987977301e-06, "loss": 0.0098, "step": 4884 }, { "epoch": 2.2224749772520473, "grad_norm": 0.2666549337093361, "learning_rate": 5.868866473237944e-06, "loss": 0.0053, "step": 4885 }, { "epoch": 2.2229299363057327, "grad_norm": 0.5124450260383374, "learning_rate": 5.867458887498743e-06, "loss": 0.0163, "step": 4886 }, { "epoch": 2.2233848953594175, "grad_norm": 0.3993052473544218, "learning_rate": 5.866051230874719e-06, "loss": 0.0121, "step": 4887 }, { "epoch": 2.223839854413103, "grad_norm": 0.4550765708618119, "learning_rate": 5.8646435034808975e-06, "loss": 0.0073, "step": 4888 }, { "epoch": 2.224294813466788, "grad_norm": 0.49229903782323486, "learning_rate": 5.863235705432317e-06, "loss": 0.0161, "step": 4889 }, { "epoch": 2.224749772520473, "grad_norm": 0.6784352478909106, "learning_rate": 5.86182783684401e-06, "loss": 0.0182, "step": 4890 }, { "epoch": 2.2252047315741583, "grad_norm": 0.4082367694691016, "learning_rate": 5.860419897831025e-06, "loss": 0.0133, "step": 4891 }, { "epoch": 2.2256596906278436, "grad_norm": 0.45059913620494113, "learning_rate": 5.859011888508412e-06, "loss": 0.0147, "step": 4892 }, { "epoch": 2.2261146496815285, "grad_norm": 0.49608072863593244, "learning_rate": 5.857603808991228e-06, "loss": 0.0213, "step": 4893 }, { "epoch": 2.226569608735214, "grad_norm": 0.2726929458301699, "learning_rate": 5.856195659394531e-06, "loss": 0.0064, "step": 4894 }, { "epoch": 2.227024567788899, "grad_norm": 0.538526382842245, "learning_rate": 5.8547874398333924e-06, "loss": 0.0236, "step": 4895 }, { "epoch": 2.227479526842584, "grad_norm": 0.38738433735005584, "learning_rate": 5.853379150422885e-06, "loss": 0.0104, "step": 4896 }, { "epoch": 2.2279344858962693, "grad_norm": 1.5326763283968696, "learning_rate": 5.851970791278086e-06, "loss": 0.0201, "step": 4897 }, { "epoch": 2.2283894449499546, "grad_norm": 0.2143038442053601, "learning_rate": 5.850562362514083e-06, "loss": 0.0052, "step": 4898 }, { "epoch": 2.2288444040036395, "grad_norm": 0.37303622579677503, "learning_rate": 5.849153864245963e-06, "loss": 0.0071, "step": 4899 }, { "epoch": 2.229299363057325, "grad_norm": 0.39372925777577983, "learning_rate": 5.847745296588827e-06, "loss": 0.0067, "step": 4900 }, { "epoch": 2.22975432211101, "grad_norm": 0.4332279505365411, "learning_rate": 5.8463366596577706e-06, "loss": 0.0136, "step": 4901 }, { "epoch": 2.2302092811646954, "grad_norm": 0.29820073293128874, "learning_rate": 5.844927953567906e-06, "loss": 0.0147, "step": 4902 }, { "epoch": 2.2306642402183803, "grad_norm": 0.4984730828892178, "learning_rate": 5.843519178434345e-06, "loss": 0.0198, "step": 4903 }, { "epoch": 2.2311191992720656, "grad_norm": 0.6088795409231883, "learning_rate": 5.842110334372203e-06, "loss": 0.0159, "step": 4904 }, { "epoch": 2.231574158325751, "grad_norm": 0.40473599951548656, "learning_rate": 5.840701421496611e-06, "loss": 0.0084, "step": 4905 }, { "epoch": 2.2320291173794358, "grad_norm": 0.3123276430739044, "learning_rate": 5.8392924399226945e-06, "loss": 0.005, "step": 4906 }, { "epoch": 2.232484076433121, "grad_norm": 0.26646880519784627, "learning_rate": 5.837883389765589e-06, "loss": 0.0068, "step": 4907 }, { "epoch": 2.2329390354868064, "grad_norm": 0.35407883713038, "learning_rate": 5.8364742711404375e-06, "loss": 0.0086, "step": 4908 }, { "epoch": 2.2333939945404913, "grad_norm": 0.33912525466798915, "learning_rate": 5.835065084162386e-06, "loss": 0.005, "step": 4909 }, { "epoch": 2.2338489535941766, "grad_norm": 0.5903475824416466, "learning_rate": 5.833655828946587e-06, "loss": 0.0139, "step": 4910 }, { "epoch": 2.234303912647862, "grad_norm": 0.48780826821725437, "learning_rate": 5.832246505608198e-06, "loss": 0.0206, "step": 4911 }, { "epoch": 2.2347588717015467, "grad_norm": 0.40209753389439074, "learning_rate": 5.830837114262384e-06, "loss": 0.0174, "step": 4912 }, { "epoch": 2.235213830755232, "grad_norm": 0.4813709058978907, "learning_rate": 5.829427655024312e-06, "loss": 0.0161, "step": 4913 }, { "epoch": 2.2356687898089174, "grad_norm": 0.21913261319584917, "learning_rate": 5.828018128009156e-06, "loss": 0.006, "step": 4914 }, { "epoch": 2.2361237488626022, "grad_norm": 0.6013695489737468, "learning_rate": 5.826608533332101e-06, "loss": 0.0242, "step": 4915 }, { "epoch": 2.2365787079162875, "grad_norm": 0.3203067204799793, "learning_rate": 5.825198871108328e-06, "loss": 0.0125, "step": 4916 }, { "epoch": 2.237033666969973, "grad_norm": 0.5646445515057829, "learning_rate": 5.823789141453031e-06, "loss": 0.0181, "step": 4917 }, { "epoch": 2.2374886260236577, "grad_norm": 0.49975574343761603, "learning_rate": 5.822379344481404e-06, "loss": 0.0162, "step": 4918 }, { "epoch": 2.237943585077343, "grad_norm": 0.4065286004383984, "learning_rate": 5.820969480308652e-06, "loss": 0.0064, "step": 4919 }, { "epoch": 2.2383985441310283, "grad_norm": 0.32281003186740637, "learning_rate": 5.819559549049982e-06, "loss": 0.0064, "step": 4920 }, { "epoch": 2.238853503184713, "grad_norm": 0.47098734384379837, "learning_rate": 5.8181495508206045e-06, "loss": 0.0151, "step": 4921 }, { "epoch": 2.2393084622383985, "grad_norm": 0.3570877991124948, "learning_rate": 5.816739485735743e-06, "loss": 0.0131, "step": 4922 }, { "epoch": 2.239763421292084, "grad_norm": 0.4698502156284249, "learning_rate": 5.815329353910618e-06, "loss": 0.0144, "step": 4923 }, { "epoch": 2.2402183803457687, "grad_norm": 0.5783311808405599, "learning_rate": 5.81391915546046e-06, "loss": 0.0158, "step": 4924 }, { "epoch": 2.240673339399454, "grad_norm": 0.38897801889691475, "learning_rate": 5.812508890500503e-06, "loss": 0.0123, "step": 4925 }, { "epoch": 2.2411282984531393, "grad_norm": 0.40295578720791275, "learning_rate": 5.811098559145991e-06, "loss": 0.0104, "step": 4926 }, { "epoch": 2.241583257506824, "grad_norm": 1.4507216651716257, "learning_rate": 5.809688161512167e-06, "loss": 0.0176, "step": 4927 }, { "epoch": 2.2420382165605095, "grad_norm": 0.5062657433386122, "learning_rate": 5.808277697714283e-06, "loss": 0.0184, "step": 4928 }, { "epoch": 2.242493175614195, "grad_norm": 0.37838836624796074, "learning_rate": 5.806867167867595e-06, "loss": 0.0124, "step": 4929 }, { "epoch": 2.2429481346678797, "grad_norm": 0.3740956013076256, "learning_rate": 5.8054565720873665e-06, "loss": 0.0172, "step": 4930 }, { "epoch": 2.243403093721565, "grad_norm": 2.8682603949734697, "learning_rate": 5.804045910488864e-06, "loss": 0.0392, "step": 4931 }, { "epoch": 2.2438580527752503, "grad_norm": 0.25586205171673104, "learning_rate": 5.80263518318736e-06, "loss": 0.0062, "step": 4932 }, { "epoch": 2.244313011828935, "grad_norm": 0.5138290683522561, "learning_rate": 5.801224390298135e-06, "loss": 0.0223, "step": 4933 }, { "epoch": 2.2447679708826205, "grad_norm": 0.31307652816681897, "learning_rate": 5.79981353193647e-06, "loss": 0.0083, "step": 4934 }, { "epoch": 2.245222929936306, "grad_norm": 0.5007090142655471, "learning_rate": 5.798402608217655e-06, "loss": 0.0125, "step": 4935 }, { "epoch": 2.245677888989991, "grad_norm": 0.3376887505199625, "learning_rate": 5.7969916192569855e-06, "loss": 0.0184, "step": 4936 }, { "epoch": 2.246132848043676, "grad_norm": 0.3470931920015642, "learning_rate": 5.7955805651697595e-06, "loss": 0.0109, "step": 4937 }, { "epoch": 2.2465878070973613, "grad_norm": 0.5101493862951201, "learning_rate": 5.794169446071283e-06, "loss": 0.012, "step": 4938 }, { "epoch": 2.2470427661510466, "grad_norm": 0.4222936678567516, "learning_rate": 5.792758262076864e-06, "loss": 0.0181, "step": 4939 }, { "epoch": 2.2474977252047315, "grad_norm": 0.3154935093336658, "learning_rate": 5.7913470133018225e-06, "loss": 0.0105, "step": 4940 }, { "epoch": 2.2479526842584168, "grad_norm": 0.3089213208483222, "learning_rate": 5.789935699861475e-06, "loss": 0.009, "step": 4941 }, { "epoch": 2.248407643312102, "grad_norm": 0.12832391205359286, "learning_rate": 5.78852432187115e-06, "loss": 0.004, "step": 4942 }, { "epoch": 2.248862602365787, "grad_norm": 0.41319675096337877, "learning_rate": 5.787112879446177e-06, "loss": 0.0095, "step": 4943 }, { "epoch": 2.2493175614194723, "grad_norm": 0.3204711489093298, "learning_rate": 5.785701372701896e-06, "loss": 0.0131, "step": 4944 }, { "epoch": 2.2497725204731576, "grad_norm": 0.6014032291558712, "learning_rate": 5.784289801753646e-06, "loss": 0.0269, "step": 4945 }, { "epoch": 2.2502274795268424, "grad_norm": 0.5374390381709199, "learning_rate": 5.782878166716775e-06, "loss": 0.0217, "step": 4946 }, { "epoch": 2.2506824385805277, "grad_norm": 0.36820229425385403, "learning_rate": 5.7814664677066364e-06, "loss": 0.0126, "step": 4947 }, { "epoch": 2.251137397634213, "grad_norm": 0.46592461070793456, "learning_rate": 5.780054704838587e-06, "loss": 0.0111, "step": 4948 }, { "epoch": 2.251592356687898, "grad_norm": 0.25899334637267063, "learning_rate": 5.77864287822799e-06, "loss": 0.0081, "step": 4949 }, { "epoch": 2.2520473157415832, "grad_norm": 0.28583117999916735, "learning_rate": 5.777230987990212e-06, "loss": 0.0102, "step": 4950 }, { "epoch": 2.2525022747952685, "grad_norm": 0.3209514458106223, "learning_rate": 5.775819034240629e-06, "loss": 0.0096, "step": 4951 }, { "epoch": 2.2529572338489534, "grad_norm": 0.24934042043358404, "learning_rate": 5.774407017094618e-06, "loss": 0.0085, "step": 4952 }, { "epoch": 2.2534121929026387, "grad_norm": 0.47009835945401357, "learning_rate": 5.772994936667562e-06, "loss": 0.0197, "step": 4953 }, { "epoch": 2.253867151956324, "grad_norm": 0.23128927069268837, "learning_rate": 5.771582793074853e-06, "loss": 0.0055, "step": 4954 }, { "epoch": 2.2543221110100093, "grad_norm": 0.5715633849732003, "learning_rate": 5.77017058643188e-06, "loss": 0.0174, "step": 4955 }, { "epoch": 2.254777070063694, "grad_norm": 0.43675819305589864, "learning_rate": 5.768758316854045e-06, "loss": 0.0137, "step": 4956 }, { "epoch": 2.2552320291173795, "grad_norm": 0.2688189862498713, "learning_rate": 5.767345984456751e-06, "loss": 0.0082, "step": 4957 }, { "epoch": 2.255686988171065, "grad_norm": 0.44499266839276047, "learning_rate": 5.7659335893554115e-06, "loss": 0.0168, "step": 4958 }, { "epoch": 2.2561419472247497, "grad_norm": 0.44237827656028933, "learning_rate": 5.764521131665437e-06, "loss": 0.016, "step": 4959 }, { "epoch": 2.256596906278435, "grad_norm": 0.37931892498888303, "learning_rate": 5.7631086115022464e-06, "loss": 0.0076, "step": 4960 }, { "epoch": 2.2570518653321203, "grad_norm": 0.5485154743879561, "learning_rate": 5.761696028981269e-06, "loss": 0.0233, "step": 4961 }, { "epoch": 2.257506824385805, "grad_norm": 0.3893754847059226, "learning_rate": 5.7602833842179285e-06, "loss": 0.0123, "step": 4962 }, { "epoch": 2.2579617834394905, "grad_norm": 0.40854201839403814, "learning_rate": 5.758870677327665e-06, "loss": 0.0103, "step": 4963 }, { "epoch": 2.258416742493176, "grad_norm": 0.49304151830282095, "learning_rate": 5.7574579084259175e-06, "loss": 0.0068, "step": 4964 }, { "epoch": 2.2588717015468607, "grad_norm": 0.3101229238565357, "learning_rate": 5.7560450776281295e-06, "loss": 0.006, "step": 4965 }, { "epoch": 2.259326660600546, "grad_norm": 0.3018619929319456, "learning_rate": 5.754632185049753e-06, "loss": 0.0153, "step": 4966 }, { "epoch": 2.2597816196542313, "grad_norm": 0.40603320270712606, "learning_rate": 5.75321923080624e-06, "loss": 0.0134, "step": 4967 }, { "epoch": 2.260236578707916, "grad_norm": 0.34605688909493, "learning_rate": 5.751806215013055e-06, "loss": 0.0132, "step": 4968 }, { "epoch": 2.2606915377616015, "grad_norm": 0.2875706728087106, "learning_rate": 5.75039313778566e-06, "loss": 0.005, "step": 4969 }, { "epoch": 2.261146496815287, "grad_norm": 0.3932509886436604, "learning_rate": 5.748979999239528e-06, "loss": 0.012, "step": 4970 }, { "epoch": 2.2616014558689717, "grad_norm": 0.567212199212246, "learning_rate": 5.7475667994901316e-06, "loss": 0.0236, "step": 4971 }, { "epoch": 2.262056414922657, "grad_norm": 0.21614892083477413, "learning_rate": 5.746153538652953e-06, "loss": 0.0046, "step": 4972 }, { "epoch": 2.2625113739763423, "grad_norm": 0.370042314029447, "learning_rate": 5.7447402168434775e-06, "loss": 0.0134, "step": 4973 }, { "epoch": 2.262966333030027, "grad_norm": 0.5187504110392387, "learning_rate": 5.743326834177192e-06, "loss": 0.017, "step": 4974 }, { "epoch": 2.2634212920837125, "grad_norm": 0.4883200916567663, "learning_rate": 5.741913390769597e-06, "loss": 0.0205, "step": 4975 }, { "epoch": 2.2638762511373978, "grad_norm": 0.5524738448804214, "learning_rate": 5.74049988673619e-06, "loss": 0.0169, "step": 4976 }, { "epoch": 2.2643312101910826, "grad_norm": 0.47634028211297075, "learning_rate": 5.739086322192474e-06, "loss": 0.0127, "step": 4977 }, { "epoch": 2.264786169244768, "grad_norm": 0.4926572378339637, "learning_rate": 5.737672697253964e-06, "loss": 0.0135, "step": 4978 }, { "epoch": 2.2652411282984533, "grad_norm": 0.3576600424560595, "learning_rate": 5.736259012036171e-06, "loss": 0.0123, "step": 4979 }, { "epoch": 2.265696087352138, "grad_norm": 0.5349258203707818, "learning_rate": 5.734845266654619e-06, "loss": 0.0131, "step": 4980 }, { "epoch": 2.2661510464058234, "grad_norm": 0.4592868406741922, "learning_rate": 5.733431461224828e-06, "loss": 0.0098, "step": 4981 }, { "epoch": 2.2666060054595087, "grad_norm": 0.2552801025598758, "learning_rate": 5.732017595862329e-06, "loss": 0.0055, "step": 4982 }, { "epoch": 2.2670609645131936, "grad_norm": 0.35292700821310596, "learning_rate": 5.730603670682661e-06, "loss": 0.0116, "step": 4983 }, { "epoch": 2.267515923566879, "grad_norm": 0.48567998975999815, "learning_rate": 5.7291896858013574e-06, "loss": 0.0163, "step": 4984 }, { "epoch": 2.2679708826205642, "grad_norm": 0.43355116925628845, "learning_rate": 5.727775641333968e-06, "loss": 0.015, "step": 4985 }, { "epoch": 2.268425841674249, "grad_norm": 0.4044202222636259, "learning_rate": 5.726361537396038e-06, "loss": 0.0157, "step": 4986 }, { "epoch": 2.2688808007279344, "grad_norm": 0.3995526928549569, "learning_rate": 5.724947374103125e-06, "loss": 0.0099, "step": 4987 }, { "epoch": 2.2693357597816197, "grad_norm": 0.35497847490830436, "learning_rate": 5.723533151570785e-06, "loss": 0.0097, "step": 4988 }, { "epoch": 2.2697907188353046, "grad_norm": 0.2612014985047218, "learning_rate": 5.722118869914583e-06, "loss": 0.0061, "step": 4989 }, { "epoch": 2.27024567788899, "grad_norm": 0.36502490742386334, "learning_rate": 5.720704529250091e-06, "loss": 0.0096, "step": 4990 }, { "epoch": 2.270700636942675, "grad_norm": 0.4948103633382935, "learning_rate": 5.719290129692876e-06, "loss": 0.0226, "step": 4991 }, { "epoch": 2.2711555959963605, "grad_norm": 0.6525924451495685, "learning_rate": 5.717875671358521e-06, "loss": 0.0095, "step": 4992 }, { "epoch": 2.2716105550500454, "grad_norm": 0.272285400491665, "learning_rate": 5.7164611543626094e-06, "loss": 0.0084, "step": 4993 }, { "epoch": 2.2720655141037307, "grad_norm": 0.40074776778608673, "learning_rate": 5.715046578820726e-06, "loss": 0.0078, "step": 4994 }, { "epoch": 2.272520473157416, "grad_norm": 0.5321487800525837, "learning_rate": 5.713631944848467e-06, "loss": 0.0137, "step": 4995 }, { "epoch": 2.272975432211101, "grad_norm": 0.5950192216486127, "learning_rate": 5.712217252561426e-06, "loss": 0.0273, "step": 4996 }, { "epoch": 2.273430391264786, "grad_norm": 0.4284115464368724, "learning_rate": 5.71080250207521e-06, "loss": 0.0088, "step": 4997 }, { "epoch": 2.2738853503184715, "grad_norm": 0.4443237885326474, "learning_rate": 5.709387693505421e-06, "loss": 0.0134, "step": 4998 }, { "epoch": 2.2743403093721564, "grad_norm": 0.4168854902978304, "learning_rate": 5.707972826967675e-06, "loss": 0.0133, "step": 4999 }, { "epoch": 2.2747952684258417, "grad_norm": 0.37898531127249696, "learning_rate": 5.706557902577587e-06, "loss": 0.0121, "step": 5000 }, { "epoch": 2.275250227479527, "grad_norm": 0.345508399152065, "learning_rate": 5.705142920450777e-06, "loss": 0.0078, "step": 5001 }, { "epoch": 2.275705186533212, "grad_norm": 0.3722891283110431, "learning_rate": 5.703727880702872e-06, "loss": 0.0095, "step": 5002 }, { "epoch": 2.276160145586897, "grad_norm": 0.5259989621791551, "learning_rate": 5.702312783449502e-06, "loss": 0.0076, "step": 5003 }, { "epoch": 2.2766151046405825, "grad_norm": 0.24442617647958984, "learning_rate": 5.700897628806304e-06, "loss": 0.0071, "step": 5004 }, { "epoch": 2.2770700636942673, "grad_norm": 0.5788326829668876, "learning_rate": 5.699482416888917e-06, "loss": 0.0288, "step": 5005 }, { "epoch": 2.2775250227479527, "grad_norm": 0.24644035360801245, "learning_rate": 5.698067147812986e-06, "loss": 0.0039, "step": 5006 }, { "epoch": 2.277979981801638, "grad_norm": 0.3470487605226571, "learning_rate": 5.696651821694159e-06, "loss": 0.0115, "step": 5007 }, { "epoch": 2.278434940855323, "grad_norm": 0.3709003071958226, "learning_rate": 5.6952364386480915e-06, "loss": 0.0122, "step": 5008 }, { "epoch": 2.278889899909008, "grad_norm": 0.41802688093191903, "learning_rate": 5.693820998790442e-06, "loss": 0.0107, "step": 5009 }, { "epoch": 2.2793448589626935, "grad_norm": 0.42250516619422673, "learning_rate": 5.692405502236874e-06, "loss": 0.0202, "step": 5010 }, { "epoch": 2.2797998180163788, "grad_norm": 0.39930549644985364, "learning_rate": 5.690989949103056e-06, "loss": 0.0135, "step": 5011 }, { "epoch": 2.2802547770700636, "grad_norm": 0.6672791426407672, "learning_rate": 5.689574339504659e-06, "loss": 0.0202, "step": 5012 }, { "epoch": 2.280709736123749, "grad_norm": 0.28877596548959583, "learning_rate": 5.68815867355736e-06, "loss": 0.0086, "step": 5013 }, { "epoch": 2.2811646951774343, "grad_norm": 0.23984631690190455, "learning_rate": 5.686742951376844e-06, "loss": 0.0055, "step": 5014 }, { "epoch": 2.281619654231119, "grad_norm": 0.42296709808869476, "learning_rate": 5.685327173078794e-06, "loss": 0.0146, "step": 5015 }, { "epoch": 2.2820746132848044, "grad_norm": 0.22652783021347725, "learning_rate": 5.683911338778902e-06, "loss": 0.0082, "step": 5016 }, { "epoch": 2.2825295723384897, "grad_norm": 0.3828821098856581, "learning_rate": 5.682495448592865e-06, "loss": 0.0085, "step": 5017 }, { "epoch": 2.2829845313921746, "grad_norm": 0.6481984981851953, "learning_rate": 5.681079502636382e-06, "loss": 0.0196, "step": 5018 }, { "epoch": 2.28343949044586, "grad_norm": 0.3605305080965294, "learning_rate": 5.6796635010251565e-06, "loss": 0.0145, "step": 5019 }, { "epoch": 2.2838944494995452, "grad_norm": 0.45871327603411105, "learning_rate": 5.678247443874899e-06, "loss": 0.0091, "step": 5020 }, { "epoch": 2.28434940855323, "grad_norm": 0.4387078912852865, "learning_rate": 5.676831331301326e-06, "loss": 0.0174, "step": 5021 }, { "epoch": 2.2848043676069154, "grad_norm": 0.7364232085228714, "learning_rate": 5.67541516342015e-06, "loss": 0.0175, "step": 5022 }, { "epoch": 2.2852593266606007, "grad_norm": 0.38051913232432416, "learning_rate": 5.673998940347098e-06, "loss": 0.0105, "step": 5023 }, { "epoch": 2.2857142857142856, "grad_norm": 0.4203835976556839, "learning_rate": 5.672582662197897e-06, "loss": 0.0176, "step": 5024 }, { "epoch": 2.286169244767971, "grad_norm": 0.4657706550731654, "learning_rate": 5.671166329088278e-06, "loss": 0.0177, "step": 5025 }, { "epoch": 2.286624203821656, "grad_norm": 0.6592102460769599, "learning_rate": 5.669749941133978e-06, "loss": 0.0212, "step": 5026 }, { "epoch": 2.287079162875341, "grad_norm": 0.4071193965957663, "learning_rate": 5.668333498450736e-06, "loss": 0.0084, "step": 5027 }, { "epoch": 2.2875341219290264, "grad_norm": 0.4983485656734523, "learning_rate": 5.6669170011543e-06, "loss": 0.0225, "step": 5028 }, { "epoch": 2.2879890809827117, "grad_norm": 0.43354718301208217, "learning_rate": 5.665500449360418e-06, "loss": 0.0107, "step": 5029 }, { "epoch": 2.2884440400363966, "grad_norm": 0.39977433431566556, "learning_rate": 5.664083843184843e-06, "loss": 0.013, "step": 5030 }, { "epoch": 2.288898999090082, "grad_norm": 0.32376461410374807, "learning_rate": 5.662667182743338e-06, "loss": 0.0091, "step": 5031 }, { "epoch": 2.289353958143767, "grad_norm": 0.3657112859204232, "learning_rate": 5.661250468151662e-06, "loss": 0.012, "step": 5032 }, { "epoch": 2.289808917197452, "grad_norm": 0.3009564905495144, "learning_rate": 5.659833699525584e-06, "loss": 0.0072, "step": 5033 }, { "epoch": 2.2902638762511374, "grad_norm": 0.39452593339125275, "learning_rate": 5.6584168769808766e-06, "loss": 0.022, "step": 5034 }, { "epoch": 2.2907188353048227, "grad_norm": 0.360961425345943, "learning_rate": 5.657000000633315e-06, "loss": 0.0099, "step": 5035 }, { "epoch": 2.2911737943585075, "grad_norm": 0.463328619990547, "learning_rate": 5.655583070598681e-06, "loss": 0.0128, "step": 5036 }, { "epoch": 2.291628753412193, "grad_norm": 0.35998487951110436, "learning_rate": 5.6541660869927565e-06, "loss": 0.0125, "step": 5037 }, { "epoch": 2.292083712465878, "grad_norm": 0.4405740492655455, "learning_rate": 5.652749049931336e-06, "loss": 0.0176, "step": 5038 }, { "epoch": 2.292538671519563, "grad_norm": 0.6490718386759887, "learning_rate": 5.65133195953021e-06, "loss": 0.0309, "step": 5039 }, { "epoch": 2.2929936305732483, "grad_norm": 0.3699204140788893, "learning_rate": 5.6499148159051775e-06, "loss": 0.0099, "step": 5040 }, { "epoch": 2.2934485896269337, "grad_norm": 0.3902261568395218, "learning_rate": 5.648497619172042e-06, "loss": 0.0132, "step": 5041 }, { "epoch": 2.2939035486806185, "grad_norm": 0.4404931751252235, "learning_rate": 5.647080369446609e-06, "loss": 0.0162, "step": 5042 }, { "epoch": 2.294358507734304, "grad_norm": 0.2353589954428234, "learning_rate": 5.645663066844692e-06, "loss": 0.0067, "step": 5043 }, { "epoch": 2.294813466787989, "grad_norm": 0.5950281333092574, "learning_rate": 5.644245711482101e-06, "loss": 0.0284, "step": 5044 }, { "epoch": 2.295268425841674, "grad_norm": 0.4707885897948095, "learning_rate": 5.642828303474665e-06, "loss": 0.0163, "step": 5045 }, { "epoch": 2.2957233848953593, "grad_norm": 0.4904893500955452, "learning_rate": 5.6414108429382e-06, "loss": 0.0183, "step": 5046 }, { "epoch": 2.2961783439490446, "grad_norm": 0.29333798950057566, "learning_rate": 5.639993329988537e-06, "loss": 0.0094, "step": 5047 }, { "epoch": 2.29663330300273, "grad_norm": 0.32263195452792615, "learning_rate": 5.638575764741511e-06, "loss": 0.0108, "step": 5048 }, { "epoch": 2.297088262056415, "grad_norm": 0.2795409607827203, "learning_rate": 5.637158147312956e-06, "loss": 0.0065, "step": 5049 }, { "epoch": 2.2975432211101, "grad_norm": 0.3823863568191496, "learning_rate": 5.635740477818716e-06, "loss": 0.0107, "step": 5050 }, { "epoch": 2.2979981801637854, "grad_norm": 0.28033895048134977, "learning_rate": 5.634322756374634e-06, "loss": 0.0071, "step": 5051 }, { "epoch": 2.2984531392174703, "grad_norm": 0.29199495113342167, "learning_rate": 5.632904983096561e-06, "loss": 0.0076, "step": 5052 }, { "epoch": 2.2989080982711556, "grad_norm": 0.4174410155801214, "learning_rate": 5.631487158100352e-06, "loss": 0.0167, "step": 5053 }, { "epoch": 2.299363057324841, "grad_norm": 0.18711042403073394, "learning_rate": 5.630069281501862e-06, "loss": 0.0046, "step": 5054 }, { "epoch": 2.299818016378526, "grad_norm": 0.5876691796827814, "learning_rate": 5.628651353416957e-06, "loss": 0.0187, "step": 5055 }, { "epoch": 2.300272975432211, "grad_norm": 0.7474700362276027, "learning_rate": 5.627233373961503e-06, "loss": 0.0173, "step": 5056 }, { "epoch": 2.3007279344858964, "grad_norm": 0.3849472671771112, "learning_rate": 5.62581534325137e-06, "loss": 0.01, "step": 5057 }, { "epoch": 2.3011828935395813, "grad_norm": 0.4551373080863021, "learning_rate": 5.624397261402432e-06, "loss": 0.011, "step": 5058 }, { "epoch": 2.3016378525932666, "grad_norm": 0.3737734609919106, "learning_rate": 5.62297912853057e-06, "loss": 0.013, "step": 5059 }, { "epoch": 2.302092811646952, "grad_norm": 0.3156504510256659, "learning_rate": 5.621560944751668e-06, "loss": 0.0069, "step": 5060 }, { "epoch": 2.3025477707006368, "grad_norm": 0.3385850814330116, "learning_rate": 5.6201427101816105e-06, "loss": 0.0095, "step": 5061 }, { "epoch": 2.303002729754322, "grad_norm": 0.3968512150117719, "learning_rate": 5.618724424936295e-06, "loss": 0.0119, "step": 5062 }, { "epoch": 2.3034576888080074, "grad_norm": 0.3817144000944613, "learning_rate": 5.61730608913161e-06, "loss": 0.0137, "step": 5063 }, { "epoch": 2.3039126478616927, "grad_norm": 0.44460642647912646, "learning_rate": 5.615887702883462e-06, "loss": 0.0164, "step": 5064 }, { "epoch": 2.3043676069153776, "grad_norm": 0.42919077855817034, "learning_rate": 5.61446926630775e-06, "loss": 0.0108, "step": 5065 }, { "epoch": 2.304822565969063, "grad_norm": 0.5802541888893671, "learning_rate": 5.613050779520385e-06, "loss": 0.0211, "step": 5066 }, { "epoch": 2.305277525022748, "grad_norm": 0.46402502055509914, "learning_rate": 5.611632242637279e-06, "loss": 0.0204, "step": 5067 }, { "epoch": 2.305732484076433, "grad_norm": 0.26674695532163656, "learning_rate": 5.610213655774349e-06, "loss": 0.0057, "step": 5068 }, { "epoch": 2.3061874431301184, "grad_norm": 0.3823871062863482, "learning_rate": 5.608795019047514e-06, "loss": 0.0204, "step": 5069 }, { "epoch": 2.3066424021838037, "grad_norm": 0.3923528299084095, "learning_rate": 5.607376332572699e-06, "loss": 0.0112, "step": 5070 }, { "epoch": 2.3070973612374885, "grad_norm": 0.3614994681977554, "learning_rate": 5.605957596465834e-06, "loss": 0.0145, "step": 5071 }, { "epoch": 2.307552320291174, "grad_norm": 0.45085748129160796, "learning_rate": 5.60453881084285e-06, "loss": 0.0175, "step": 5072 }, { "epoch": 2.308007279344859, "grad_norm": 0.48780435224282037, "learning_rate": 5.603119975819684e-06, "loss": 0.0221, "step": 5073 }, { "epoch": 2.308462238398544, "grad_norm": 0.3344774151298191, "learning_rate": 5.601701091512279e-06, "loss": 0.0095, "step": 5074 }, { "epoch": 2.3089171974522293, "grad_norm": 0.42206035237573375, "learning_rate": 5.600282158036575e-06, "loss": 0.0072, "step": 5075 }, { "epoch": 2.3093721565059147, "grad_norm": 0.5683375002520097, "learning_rate": 5.598863175508526e-06, "loss": 0.0209, "step": 5076 }, { "epoch": 2.3098271155595995, "grad_norm": 0.48459850230370116, "learning_rate": 5.597444144044083e-06, "loss": 0.017, "step": 5077 }, { "epoch": 2.310282074613285, "grad_norm": 0.4332383227088994, "learning_rate": 5.596025063759202e-06, "loss": 0.0171, "step": 5078 }, { "epoch": 2.31073703366697, "grad_norm": 0.2758241308671077, "learning_rate": 5.594605934769845e-06, "loss": 0.0104, "step": 5079 }, { "epoch": 2.311191992720655, "grad_norm": 0.3338536433140087, "learning_rate": 5.593186757191974e-06, "loss": 0.0089, "step": 5080 }, { "epoch": 2.3116469517743403, "grad_norm": 0.2640392126447474, "learning_rate": 5.591767531141563e-06, "loss": 0.0082, "step": 5081 }, { "epoch": 2.3121019108280256, "grad_norm": 0.3953893898741137, "learning_rate": 5.59034825673458e-06, "loss": 0.0083, "step": 5082 }, { "epoch": 2.3125568698817105, "grad_norm": 0.2569213699979878, "learning_rate": 5.588928934087003e-06, "loss": 0.0086, "step": 5083 }, { "epoch": 2.313011828935396, "grad_norm": 0.6313666359843114, "learning_rate": 5.5875095633148146e-06, "loss": 0.0246, "step": 5084 }, { "epoch": 2.313466787989081, "grad_norm": 0.643173164905506, "learning_rate": 5.586090144533998e-06, "loss": 0.0305, "step": 5085 }, { "epoch": 2.313921747042766, "grad_norm": 0.5062189665269613, "learning_rate": 5.58467067786054e-06, "loss": 0.0157, "step": 5086 }, { "epoch": 2.3143767060964513, "grad_norm": 0.5819628723646005, "learning_rate": 5.583251163410436e-06, "loss": 0.0091, "step": 5087 }, { "epoch": 2.3148316651501366, "grad_norm": 0.4972780006772306, "learning_rate": 5.58183160129968e-06, "loss": 0.0162, "step": 5088 }, { "epoch": 2.3152866242038215, "grad_norm": 0.41694264361458766, "learning_rate": 5.580411991644273e-06, "loss": 0.0164, "step": 5089 }, { "epoch": 2.315741583257507, "grad_norm": 0.489632550791834, "learning_rate": 5.578992334560219e-06, "loss": 0.0167, "step": 5090 }, { "epoch": 2.316196542311192, "grad_norm": 0.3288945372054903, "learning_rate": 5.577572630163527e-06, "loss": 0.0086, "step": 5091 }, { "epoch": 2.316651501364877, "grad_norm": 0.38969169771532974, "learning_rate": 5.576152878570208e-06, "loss": 0.014, "step": 5092 }, { "epoch": 2.3171064604185623, "grad_norm": 0.2772452761090754, "learning_rate": 5.5747330798962765e-06, "loss": 0.0076, "step": 5093 }, { "epoch": 2.3175614194722476, "grad_norm": 0.7398501879278008, "learning_rate": 5.573313234257755e-06, "loss": 0.0212, "step": 5094 }, { "epoch": 2.3180163785259325, "grad_norm": 0.40366058849418684, "learning_rate": 5.571893341770663e-06, "loss": 0.0092, "step": 5095 }, { "epoch": 2.3184713375796178, "grad_norm": 0.3973674281915543, "learning_rate": 5.57047340255103e-06, "loss": 0.0097, "step": 5096 }, { "epoch": 2.318926296633303, "grad_norm": 0.46292198909608095, "learning_rate": 5.569053416714887e-06, "loss": 0.019, "step": 5097 }, { "epoch": 2.319381255686988, "grad_norm": 0.4462893128572885, "learning_rate": 5.56763338437827e-06, "loss": 0.0144, "step": 5098 }, { "epoch": 2.3198362147406733, "grad_norm": 0.48797923425342077, "learning_rate": 5.566213305657215e-06, "loss": 0.0129, "step": 5099 }, { "epoch": 2.3202911737943586, "grad_norm": 0.412122799014893, "learning_rate": 5.564793180667766e-06, "loss": 0.0096, "step": 5100 }, { "epoch": 2.3207461328480434, "grad_norm": 3.5850346149075736, "learning_rate": 5.56337300952597e-06, "loss": 0.0639, "step": 5101 }, { "epoch": 2.3212010919017287, "grad_norm": 0.19091056292886274, "learning_rate": 5.561952792347873e-06, "loss": 0.0043, "step": 5102 }, { "epoch": 2.321656050955414, "grad_norm": 0.3746308093194985, "learning_rate": 5.5605325292495335e-06, "loss": 0.0113, "step": 5103 }, { "epoch": 2.3221110100090994, "grad_norm": 0.4288877488182419, "learning_rate": 5.559112220347007e-06, "loss": 0.0193, "step": 5104 }, { "epoch": 2.3225659690627842, "grad_norm": 0.43634766681944853, "learning_rate": 5.557691865756355e-06, "loss": 0.0115, "step": 5105 }, { "epoch": 2.3230209281164695, "grad_norm": 0.4211964374389764, "learning_rate": 5.556271465593642e-06, "loss": 0.0212, "step": 5106 }, { "epoch": 2.323475887170155, "grad_norm": 0.30603182021946, "learning_rate": 5.554851019974935e-06, "loss": 0.0075, "step": 5107 }, { "epoch": 2.3239308462238397, "grad_norm": 0.2871259949309822, "learning_rate": 5.5534305290163115e-06, "loss": 0.0075, "step": 5108 }, { "epoch": 2.324385805277525, "grad_norm": 0.3468060233961574, "learning_rate": 5.552009992833842e-06, "loss": 0.0078, "step": 5109 }, { "epoch": 2.3248407643312103, "grad_norm": 0.3542587721425399, "learning_rate": 5.55058941154361e-06, "loss": 0.0097, "step": 5110 }, { "epoch": 2.325295723384895, "grad_norm": 0.38377233668808036, "learning_rate": 5.549168785261698e-06, "loss": 0.0094, "step": 5111 }, { "epoch": 2.3257506824385805, "grad_norm": 0.7489040250614702, "learning_rate": 5.547748114104192e-06, "loss": 0.0191, "step": 5112 }, { "epoch": 2.326205641492266, "grad_norm": 0.28559796075104366, "learning_rate": 5.546327398187184e-06, "loss": 0.0045, "step": 5113 }, { "epoch": 2.3266606005459507, "grad_norm": 0.49166462721919674, "learning_rate": 5.544906637626768e-06, "loss": 0.0205, "step": 5114 }, { "epoch": 2.327115559599636, "grad_norm": 0.3507371672526975, "learning_rate": 5.543485832539043e-06, "loss": 0.0076, "step": 5115 }, { "epoch": 2.3275705186533213, "grad_norm": 0.16025051105052449, "learning_rate": 5.54206498304011e-06, "loss": 0.0035, "step": 5116 }, { "epoch": 2.328025477707006, "grad_norm": 0.3514133884320373, "learning_rate": 5.540644089246073e-06, "loss": 0.0115, "step": 5117 }, { "epoch": 2.3284804367606915, "grad_norm": 0.32838122382429447, "learning_rate": 5.539223151273045e-06, "loss": 0.0072, "step": 5118 }, { "epoch": 2.328935395814377, "grad_norm": 0.3789507088231374, "learning_rate": 5.537802169237134e-06, "loss": 0.0077, "step": 5119 }, { "epoch": 2.329390354868062, "grad_norm": 0.5026365616612761, "learning_rate": 5.536381143254461e-06, "loss": 0.017, "step": 5120 }, { "epoch": 2.329845313921747, "grad_norm": 0.38103556357332163, "learning_rate": 5.534960073441141e-06, "loss": 0.0165, "step": 5121 }, { "epoch": 2.3303002729754323, "grad_norm": 0.3273533156081994, "learning_rate": 5.533538959913301e-06, "loss": 0.0072, "step": 5122 }, { "epoch": 2.3307552320291176, "grad_norm": 0.31684182975393166, "learning_rate": 5.5321178027870655e-06, "loss": 0.0096, "step": 5123 }, { "epoch": 2.3312101910828025, "grad_norm": 0.2591271732965268, "learning_rate": 5.530696602178566e-06, "loss": 0.008, "step": 5124 }, { "epoch": 2.331665150136488, "grad_norm": 0.3364534699488801, "learning_rate": 5.529275358203938e-06, "loss": 0.0094, "step": 5125 }, { "epoch": 2.332120109190173, "grad_norm": 0.31871194877671855, "learning_rate": 5.527854070979317e-06, "loss": 0.0089, "step": 5126 }, { "epoch": 2.332575068243858, "grad_norm": 0.5134652747659972, "learning_rate": 5.526432740620846e-06, "loss": 0.015, "step": 5127 }, { "epoch": 2.3330300272975433, "grad_norm": 0.4040805564754702, "learning_rate": 5.525011367244668e-06, "loss": 0.0126, "step": 5128 }, { "epoch": 2.3334849863512286, "grad_norm": 0.22849334940135976, "learning_rate": 5.523589950966932e-06, "loss": 0.0065, "step": 5129 }, { "epoch": 2.3339399454049135, "grad_norm": 0.38056950548925816, "learning_rate": 5.522168491903791e-06, "loss": 0.0088, "step": 5130 }, { "epoch": 2.3343949044585988, "grad_norm": 0.4479812046589512, "learning_rate": 5.520746990171396e-06, "loss": 0.0112, "step": 5131 }, { "epoch": 2.334849863512284, "grad_norm": 0.35949104680649646, "learning_rate": 5.5193254458859115e-06, "loss": 0.0117, "step": 5132 }, { "epoch": 2.335304822565969, "grad_norm": 0.3596141100734688, "learning_rate": 5.517903859163496e-06, "loss": 0.0114, "step": 5133 }, { "epoch": 2.3357597816196543, "grad_norm": 0.3321917780898999, "learning_rate": 5.516482230120316e-06, "loss": 0.0104, "step": 5134 }, { "epoch": 2.3362147406733396, "grad_norm": 0.4668618150432097, "learning_rate": 5.515060558872541e-06, "loss": 0.0116, "step": 5135 }, { "epoch": 2.3366696997270244, "grad_norm": 0.21570878006243732, "learning_rate": 5.513638845536341e-06, "loss": 0.004, "step": 5136 }, { "epoch": 2.3371246587807097, "grad_norm": 0.4585477462408327, "learning_rate": 5.512217090227896e-06, "loss": 0.0111, "step": 5137 }, { "epoch": 2.337579617834395, "grad_norm": 0.597536026245206, "learning_rate": 5.510795293063383e-06, "loss": 0.0127, "step": 5138 }, { "epoch": 2.33803457688808, "grad_norm": 0.3536513185606967, "learning_rate": 5.509373454158986e-06, "loss": 0.0067, "step": 5139 }, { "epoch": 2.3384895359417652, "grad_norm": 0.20360766036589872, "learning_rate": 5.5079515736308905e-06, "loss": 0.005, "step": 5140 }, { "epoch": 2.3389444949954505, "grad_norm": 0.2707652544579224, "learning_rate": 5.506529651595286e-06, "loss": 0.0054, "step": 5141 }, { "epoch": 2.3393994540491354, "grad_norm": 0.39362868925558897, "learning_rate": 5.5051076881683656e-06, "loss": 0.0117, "step": 5142 }, { "epoch": 2.3398544131028207, "grad_norm": 0.43413748804385666, "learning_rate": 5.503685683466326e-06, "loss": 0.0147, "step": 5143 }, { "epoch": 2.340309372156506, "grad_norm": 0.4750004627533353, "learning_rate": 5.502263637605368e-06, "loss": 0.0177, "step": 5144 }, { "epoch": 2.340764331210191, "grad_norm": 0.49510868242892087, "learning_rate": 5.500841550701692e-06, "loss": 0.0109, "step": 5145 }, { "epoch": 2.341219290263876, "grad_norm": 0.47646236015820576, "learning_rate": 5.499419422871506e-06, "loss": 0.0181, "step": 5146 }, { "epoch": 2.3416742493175615, "grad_norm": 0.5135583609690554, "learning_rate": 5.4979972542310224e-06, "loss": 0.0207, "step": 5147 }, { "epoch": 2.3421292083712464, "grad_norm": 0.5281542299049714, "learning_rate": 5.49657504489645e-06, "loss": 0.0125, "step": 5148 }, { "epoch": 2.3425841674249317, "grad_norm": 0.3803719712457962, "learning_rate": 5.495152794984009e-06, "loss": 0.012, "step": 5149 }, { "epoch": 2.343039126478617, "grad_norm": 0.3549464506951641, "learning_rate": 5.493730504609916e-06, "loss": 0.0142, "step": 5150 }, { "epoch": 2.343494085532302, "grad_norm": 0.5083323476527921, "learning_rate": 5.492308173890398e-06, "loss": 0.0172, "step": 5151 }, { "epoch": 2.343949044585987, "grad_norm": 0.28678181509420586, "learning_rate": 5.490885802941678e-06, "loss": 0.0075, "step": 5152 }, { "epoch": 2.3444040036396725, "grad_norm": 0.5466667218810686, "learning_rate": 5.489463391879986e-06, "loss": 0.0167, "step": 5153 }, { "epoch": 2.3448589626933574, "grad_norm": 0.393248239534694, "learning_rate": 5.488040940821558e-06, "loss": 0.0135, "step": 5154 }, { "epoch": 2.3453139217470427, "grad_norm": 0.5634806254809632, "learning_rate": 5.486618449882628e-06, "loss": 0.0235, "step": 5155 }, { "epoch": 2.345768880800728, "grad_norm": 0.35741976769377365, "learning_rate": 5.485195919179434e-06, "loss": 0.0096, "step": 5156 }, { "epoch": 2.3462238398544133, "grad_norm": 0.3287227460273248, "learning_rate": 5.483773348828224e-06, "loss": 0.0102, "step": 5157 }, { "epoch": 2.346678798908098, "grad_norm": 0.34566261414427857, "learning_rate": 5.482350738945238e-06, "loss": 0.0186, "step": 5158 }, { "epoch": 2.3471337579617835, "grad_norm": 0.4825164592835336, "learning_rate": 5.4809280896467275e-06, "loss": 0.0222, "step": 5159 }, { "epoch": 2.347588717015469, "grad_norm": 0.41901111819882036, "learning_rate": 5.479505401048947e-06, "loss": 0.0117, "step": 5160 }, { "epoch": 2.3480436760691537, "grad_norm": 0.4078878433271012, "learning_rate": 5.4780826732681506e-06, "loss": 0.0098, "step": 5161 }, { "epoch": 2.348498635122839, "grad_norm": 0.39394380721957667, "learning_rate": 5.476659906420596e-06, "loss": 0.0083, "step": 5162 }, { "epoch": 2.3489535941765243, "grad_norm": 0.5075398429488378, "learning_rate": 5.4752371006225455e-06, "loss": 0.0151, "step": 5163 }, { "epoch": 2.349408553230209, "grad_norm": 0.4971659694533423, "learning_rate": 5.4738142559902685e-06, "loss": 0.0244, "step": 5164 }, { "epoch": 2.3498635122838945, "grad_norm": 0.45965992406623524, "learning_rate": 5.472391372640028e-06, "loss": 0.0125, "step": 5165 }, { "epoch": 2.3503184713375798, "grad_norm": 0.5015944290941781, "learning_rate": 5.470968450688098e-06, "loss": 0.0119, "step": 5166 }, { "epoch": 2.3507734303912646, "grad_norm": 0.7219996034314764, "learning_rate": 5.469545490250753e-06, "loss": 0.0135, "step": 5167 }, { "epoch": 2.35122838944495, "grad_norm": 0.5160262999277548, "learning_rate": 5.468122491444271e-06, "loss": 0.0149, "step": 5168 }, { "epoch": 2.3516833484986353, "grad_norm": 0.30359649291817126, "learning_rate": 5.466699454384934e-06, "loss": 0.0133, "step": 5169 }, { "epoch": 2.35213830755232, "grad_norm": 0.449765608795224, "learning_rate": 5.465276379189024e-06, "loss": 0.0239, "step": 5170 }, { "epoch": 2.3525932666060054, "grad_norm": 0.2868592417951846, "learning_rate": 5.4638532659728306e-06, "loss": 0.0069, "step": 5171 }, { "epoch": 2.3530482256596907, "grad_norm": 0.34360803220041697, "learning_rate": 5.462430114852641e-06, "loss": 0.0103, "step": 5172 }, { "epoch": 2.3535031847133756, "grad_norm": 0.24947631338377824, "learning_rate": 5.461006925944753e-06, "loss": 0.0067, "step": 5173 }, { "epoch": 2.353958143767061, "grad_norm": 0.353015504583996, "learning_rate": 5.4595836993654605e-06, "loss": 0.0103, "step": 5174 }, { "epoch": 2.3544131028207462, "grad_norm": 0.2937630791965515, "learning_rate": 5.458160435231062e-06, "loss": 0.0096, "step": 5175 }, { "epoch": 2.3548680618744315, "grad_norm": 0.28706192712106926, "learning_rate": 5.456737133657865e-06, "loss": 0.0063, "step": 5176 }, { "epoch": 2.3553230209281164, "grad_norm": 0.41859742618093065, "learning_rate": 5.455313794762167e-06, "loss": 0.0145, "step": 5177 }, { "epoch": 2.3557779799818017, "grad_norm": 0.315336713910296, "learning_rate": 5.453890418660286e-06, "loss": 0.01, "step": 5178 }, { "epoch": 2.356232939035487, "grad_norm": 0.3971798519572188, "learning_rate": 5.452467005468528e-06, "loss": 0.0211, "step": 5179 }, { "epoch": 2.356687898089172, "grad_norm": 0.3816503031725337, "learning_rate": 5.45104355530321e-06, "loss": 0.0178, "step": 5180 }, { "epoch": 2.357142857142857, "grad_norm": 0.31547098033189974, "learning_rate": 5.44962006828065e-06, "loss": 0.0102, "step": 5181 }, { "epoch": 2.3575978161965425, "grad_norm": 0.33955988320842717, "learning_rate": 5.448196544517168e-06, "loss": 0.0108, "step": 5182 }, { "epoch": 2.3580527752502274, "grad_norm": 0.39597746365649117, "learning_rate": 5.44677298412909e-06, "loss": 0.0181, "step": 5183 }, { "epoch": 2.3585077343039127, "grad_norm": 0.43323282165822946, "learning_rate": 5.445349387232738e-06, "loss": 0.0094, "step": 5184 }, { "epoch": 2.358962693357598, "grad_norm": 0.5537151721234318, "learning_rate": 5.443925753944448e-06, "loss": 0.0221, "step": 5185 }, { "epoch": 2.359417652411283, "grad_norm": 0.2923319692625086, "learning_rate": 5.4425020843805485e-06, "loss": 0.0081, "step": 5186 }, { "epoch": 2.359872611464968, "grad_norm": 0.560851840102983, "learning_rate": 5.4410783786573785e-06, "loss": 0.0186, "step": 5187 }, { "epoch": 2.3603275705186535, "grad_norm": 0.3300258799514752, "learning_rate": 5.439654636891275e-06, "loss": 0.0121, "step": 5188 }, { "epoch": 2.3607825295723384, "grad_norm": 0.34519215267290093, "learning_rate": 5.4382308591985785e-06, "loss": 0.0138, "step": 5189 }, { "epoch": 2.3612374886260237, "grad_norm": 0.22487529661537098, "learning_rate": 5.436807045695638e-06, "loss": 0.0051, "step": 5190 }, { "epoch": 2.361692447679709, "grad_norm": 0.3575446831077963, "learning_rate": 5.435383196498795e-06, "loss": 0.0096, "step": 5191 }, { "epoch": 2.362147406733394, "grad_norm": 0.3945715478446503, "learning_rate": 5.433959311724406e-06, "loss": 0.0105, "step": 5192 }, { "epoch": 2.362602365787079, "grad_norm": 0.4411210526095032, "learning_rate": 5.432535391488821e-06, "loss": 0.0128, "step": 5193 }, { "epoch": 2.3630573248407645, "grad_norm": 0.4444325898238472, "learning_rate": 5.431111435908396e-06, "loss": 0.0121, "step": 5194 }, { "epoch": 2.3635122838944493, "grad_norm": 0.31383347621890917, "learning_rate": 5.429687445099493e-06, "loss": 0.0147, "step": 5195 }, { "epoch": 2.3639672429481347, "grad_norm": 0.506179144097311, "learning_rate": 5.428263419178471e-06, "loss": 0.0152, "step": 5196 }, { "epoch": 2.36442220200182, "grad_norm": 0.31338140472658715, "learning_rate": 5.4268393582616986e-06, "loss": 0.0091, "step": 5197 }, { "epoch": 2.364877161055505, "grad_norm": 0.6449391980468399, "learning_rate": 5.425415262465539e-06, "loss": 0.0112, "step": 5198 }, { "epoch": 2.36533212010919, "grad_norm": 0.2714619504084544, "learning_rate": 5.423991131906366e-06, "loss": 0.0095, "step": 5199 }, { "epoch": 2.3657870791628755, "grad_norm": 0.44637208265522926, "learning_rate": 5.422566966700553e-06, "loss": 0.0178, "step": 5200 }, { "epoch": 2.3662420382165603, "grad_norm": 0.44473690474266697, "learning_rate": 5.421142766964475e-06, "loss": 0.0128, "step": 5201 }, { "epoch": 2.3666969972702456, "grad_norm": 0.3251591627585468, "learning_rate": 5.419718532814513e-06, "loss": 0.0108, "step": 5202 }, { "epoch": 2.367151956323931, "grad_norm": 0.43871324002278006, "learning_rate": 5.418294264367046e-06, "loss": 0.0184, "step": 5203 }, { "epoch": 2.367606915377616, "grad_norm": 0.32143958977435755, "learning_rate": 5.416869961738463e-06, "loss": 0.0084, "step": 5204 }, { "epoch": 2.368061874431301, "grad_norm": 0.6047381473561789, "learning_rate": 5.415445625045148e-06, "loss": 0.0222, "step": 5205 }, { "epoch": 2.3685168334849864, "grad_norm": 0.2775712802500097, "learning_rate": 5.414021254403493e-06, "loss": 0.0086, "step": 5206 }, { "epoch": 2.3689717925386713, "grad_norm": 0.3913688146883563, "learning_rate": 5.412596849929892e-06, "loss": 0.0123, "step": 5207 }, { "epoch": 2.3694267515923566, "grad_norm": 0.4180126345527299, "learning_rate": 5.411172411740737e-06, "loss": 0.0127, "step": 5208 }, { "epoch": 2.369881710646042, "grad_norm": 0.4145520895468016, "learning_rate": 5.409747939952432e-06, "loss": 0.0115, "step": 5209 }, { "epoch": 2.370336669699727, "grad_norm": 0.46984567291181023, "learning_rate": 5.408323434681375e-06, "loss": 0.0119, "step": 5210 }, { "epoch": 2.370791628753412, "grad_norm": 0.4370725716934907, "learning_rate": 5.40689889604397e-06, "loss": 0.0155, "step": 5211 }, { "epoch": 2.3712465878070974, "grad_norm": 0.24972957311155186, "learning_rate": 5.4054743241566255e-06, "loss": 0.006, "step": 5212 }, { "epoch": 2.3717015468607827, "grad_norm": 0.42555167464637306, "learning_rate": 5.404049719135749e-06, "loss": 0.0101, "step": 5213 }, { "epoch": 2.3721565059144676, "grad_norm": 0.2732869198235925, "learning_rate": 5.4026250810977565e-06, "loss": 0.0061, "step": 5214 }, { "epoch": 2.372611464968153, "grad_norm": 0.23664705125295948, "learning_rate": 5.401200410159059e-06, "loss": 0.0042, "step": 5215 }, { "epoch": 2.373066424021838, "grad_norm": 0.3954694984893938, "learning_rate": 5.3997757064360756e-06, "loss": 0.013, "step": 5216 }, { "epoch": 2.373521383075523, "grad_norm": 0.18180273546718598, "learning_rate": 5.398350970045229e-06, "loss": 0.0047, "step": 5217 }, { "epoch": 2.3739763421292084, "grad_norm": 0.31477234286448397, "learning_rate": 5.396926201102937e-06, "loss": 0.0057, "step": 5218 }, { "epoch": 2.3744313011828937, "grad_norm": 0.36759194140173296, "learning_rate": 5.39550139972563e-06, "loss": 0.0092, "step": 5219 }, { "epoch": 2.3748862602365786, "grad_norm": 0.442239633811156, "learning_rate": 5.394076566029733e-06, "loss": 0.0204, "step": 5220 }, { "epoch": 2.375341219290264, "grad_norm": 0.37882384654604506, "learning_rate": 5.392651700131681e-06, "loss": 0.0138, "step": 5221 }, { "epoch": 2.375796178343949, "grad_norm": 0.45700123117896185, "learning_rate": 5.391226802147904e-06, "loss": 0.0202, "step": 5222 }, { "epoch": 2.376251137397634, "grad_norm": 0.4173548911856212, "learning_rate": 5.38980187219484e-06, "loss": 0.013, "step": 5223 }, { "epoch": 2.3767060964513194, "grad_norm": 0.501526585754104, "learning_rate": 5.388376910388928e-06, "loss": 0.0146, "step": 5224 }, { "epoch": 2.3771610555050047, "grad_norm": 0.4972966797438422, "learning_rate": 5.386951916846608e-06, "loss": 0.023, "step": 5225 }, { "epoch": 2.3776160145586895, "grad_norm": 0.3498558355060645, "learning_rate": 5.385526891684324e-06, "loss": 0.0138, "step": 5226 }, { "epoch": 2.378070973612375, "grad_norm": 0.3700683880723432, "learning_rate": 5.3841018350185244e-06, "loss": 0.0133, "step": 5227 }, { "epoch": 2.37852593266606, "grad_norm": 0.5585648096050481, "learning_rate": 5.3826767469656585e-06, "loss": 0.0156, "step": 5228 }, { "epoch": 2.3789808917197455, "grad_norm": 0.3331006105880638, "learning_rate": 5.381251627642174e-06, "loss": 0.0092, "step": 5229 }, { "epoch": 2.3794358507734303, "grad_norm": 0.3079202123421543, "learning_rate": 5.37982647716453e-06, "loss": 0.0094, "step": 5230 }, { "epoch": 2.3798908098271156, "grad_norm": 0.5613934800867718, "learning_rate": 5.378401295649182e-06, "loss": 0.0172, "step": 5231 }, { "epoch": 2.380345768880801, "grad_norm": 0.3703230818626966, "learning_rate": 5.3769760832125884e-06, "loss": 0.0076, "step": 5232 }, { "epoch": 2.380800727934486, "grad_norm": 0.39393991401324163, "learning_rate": 5.375550839971212e-06, "loss": 0.0088, "step": 5233 }, { "epoch": 2.381255686988171, "grad_norm": 0.2322305193916463, "learning_rate": 5.374125566041517e-06, "loss": 0.0059, "step": 5234 }, { "epoch": 2.3817106460418564, "grad_norm": 0.6230743002805157, "learning_rate": 5.37270026153997e-06, "loss": 0.0217, "step": 5235 }, { "epoch": 2.3821656050955413, "grad_norm": 0.23889548023386997, "learning_rate": 5.37127492658304e-06, "loss": 0.0056, "step": 5236 }, { "epoch": 2.3826205641492266, "grad_norm": 0.45250805993738163, "learning_rate": 5.3698495612872e-06, "loss": 0.0141, "step": 5237 }, { "epoch": 2.383075523202912, "grad_norm": 0.4417927166255723, "learning_rate": 5.368424165768925e-06, "loss": 0.0163, "step": 5238 }, { "epoch": 2.383530482256597, "grad_norm": 0.601485164623354, "learning_rate": 5.366998740144691e-06, "loss": 0.0226, "step": 5239 }, { "epoch": 2.383985441310282, "grad_norm": 0.447241403498978, "learning_rate": 5.365573284530976e-06, "loss": 0.022, "step": 5240 }, { "epoch": 2.3844404003639674, "grad_norm": 0.48628577475368706, "learning_rate": 5.3641477990442645e-06, "loss": 0.0203, "step": 5241 }, { "epoch": 2.3848953594176523, "grad_norm": 0.42848094818559307, "learning_rate": 5.362722283801038e-06, "loss": 0.0078, "step": 5242 }, { "epoch": 2.3853503184713376, "grad_norm": 0.35278326637047613, "learning_rate": 5.361296738917785e-06, "loss": 0.0132, "step": 5243 }, { "epoch": 2.385805277525023, "grad_norm": 0.46467384475872836, "learning_rate": 5.359871164510995e-06, "loss": 0.0207, "step": 5244 }, { "epoch": 2.386260236578708, "grad_norm": 0.4096938326628108, "learning_rate": 5.3584455606971576e-06, "loss": 0.0136, "step": 5245 }, { "epoch": 2.386715195632393, "grad_norm": 0.3279067148690326, "learning_rate": 5.357019927592769e-06, "loss": 0.0071, "step": 5246 }, { "epoch": 2.3871701546860784, "grad_norm": 0.32010669985804296, "learning_rate": 5.355594265314321e-06, "loss": 0.0103, "step": 5247 }, { "epoch": 2.3876251137397633, "grad_norm": 0.4281430187657181, "learning_rate": 5.3541685739783186e-06, "loss": 0.0089, "step": 5248 }, { "epoch": 2.3880800727934486, "grad_norm": 0.6972921855537589, "learning_rate": 5.352742853701259e-06, "loss": 0.0188, "step": 5249 }, { "epoch": 2.388535031847134, "grad_norm": 0.3369695950732781, "learning_rate": 5.351317104599646e-06, "loss": 0.0118, "step": 5250 }, { "epoch": 2.3889899909008188, "grad_norm": 0.38649180583281173, "learning_rate": 5.3498913267899864e-06, "loss": 0.0102, "step": 5251 }, { "epoch": 2.389444949954504, "grad_norm": 0.2538962592519053, "learning_rate": 5.348465520388787e-06, "loss": 0.0085, "step": 5252 }, { "epoch": 2.3898999090081894, "grad_norm": 0.40680922435796585, "learning_rate": 5.34703968551256e-06, "loss": 0.0115, "step": 5253 }, { "epoch": 2.3903548680618742, "grad_norm": 0.4516969702412574, "learning_rate": 5.345613822277815e-06, "loss": 0.0164, "step": 5254 }, { "epoch": 2.3908098271155596, "grad_norm": 0.43021217049594745, "learning_rate": 5.344187930801072e-06, "loss": 0.0122, "step": 5255 }, { "epoch": 2.391264786169245, "grad_norm": 0.5015071545210491, "learning_rate": 5.342762011198843e-06, "loss": 0.0119, "step": 5256 }, { "epoch": 2.3917197452229297, "grad_norm": 0.6970788943828632, "learning_rate": 5.341336063587651e-06, "loss": 0.0208, "step": 5257 }, { "epoch": 2.392174704276615, "grad_norm": 0.4780454188193996, "learning_rate": 5.3399100880840185e-06, "loss": 0.0117, "step": 5258 }, { "epoch": 2.3926296633303004, "grad_norm": 0.5264740389456065, "learning_rate": 5.338484084804467e-06, "loss": 0.009, "step": 5259 }, { "epoch": 2.3930846223839852, "grad_norm": 0.3439050966830248, "learning_rate": 5.337058053865527e-06, "loss": 0.0106, "step": 5260 }, { "epoch": 2.3935395814376705, "grad_norm": 0.6715181229719871, "learning_rate": 5.335631995383722e-06, "loss": 0.0247, "step": 5261 }, { "epoch": 2.393994540491356, "grad_norm": 0.574221604583767, "learning_rate": 5.3342059094755885e-06, "loss": 0.0156, "step": 5262 }, { "epoch": 2.3944494995450407, "grad_norm": 0.47373211505761376, "learning_rate": 5.332779796257656e-06, "loss": 0.0119, "step": 5263 }, { "epoch": 2.394904458598726, "grad_norm": 0.32577636597481435, "learning_rate": 5.331353655846462e-06, "loss": 0.0146, "step": 5264 }, { "epoch": 2.3953594176524113, "grad_norm": 0.41695373722744733, "learning_rate": 5.329927488358544e-06, "loss": 0.0121, "step": 5265 }, { "epoch": 2.395814376706096, "grad_norm": 0.31340022193006356, "learning_rate": 5.3285012939104395e-06, "loss": 0.0076, "step": 5266 }, { "epoch": 2.3962693357597815, "grad_norm": 0.42920487838433324, "learning_rate": 5.327075072618696e-06, "loss": 0.0117, "step": 5267 }, { "epoch": 2.396724294813467, "grad_norm": 0.36684847916393465, "learning_rate": 5.325648824599853e-06, "loss": 0.0117, "step": 5268 }, { "epoch": 2.397179253867152, "grad_norm": 0.3279088478753296, "learning_rate": 5.324222549970458e-06, "loss": 0.0146, "step": 5269 }, { "epoch": 2.397634212920837, "grad_norm": 0.6377109395841157, "learning_rate": 5.322796248847062e-06, "loss": 0.0188, "step": 5270 }, { "epoch": 2.3980891719745223, "grad_norm": 0.24823435844273126, "learning_rate": 5.321369921346211e-06, "loss": 0.0063, "step": 5271 }, { "epoch": 2.3985441310282076, "grad_norm": 0.3508243366725432, "learning_rate": 5.3199435675844644e-06, "loss": 0.011, "step": 5272 }, { "epoch": 2.3989990900818925, "grad_norm": 0.35664589258764695, "learning_rate": 5.318517187678374e-06, "loss": 0.0126, "step": 5273 }, { "epoch": 2.399454049135578, "grad_norm": 0.49922827351419585, "learning_rate": 5.317090781744497e-06, "loss": 0.0134, "step": 5274 }, { "epoch": 2.399909008189263, "grad_norm": 0.42920600718528423, "learning_rate": 5.315664349899393e-06, "loss": 0.0197, "step": 5275 }, { "epoch": 2.400363967242948, "grad_norm": 0.3636947535999247, "learning_rate": 5.314237892259624e-06, "loss": 0.0129, "step": 5276 }, { "epoch": 2.4008189262966333, "grad_norm": 0.4114786573539662, "learning_rate": 5.312811408941753e-06, "loss": 0.0099, "step": 5277 }, { "epoch": 2.4012738853503186, "grad_norm": 0.4031041535549888, "learning_rate": 5.311384900062346e-06, "loss": 0.0159, "step": 5278 }, { "epoch": 2.4017288444040035, "grad_norm": 0.6712437758510306, "learning_rate": 5.309958365737973e-06, "loss": 0.0182, "step": 5279 }, { "epoch": 2.402183803457689, "grad_norm": 0.43264061757597233, "learning_rate": 5.308531806085202e-06, "loss": 0.0159, "step": 5280 }, { "epoch": 2.402638762511374, "grad_norm": 0.26441178822412903, "learning_rate": 5.307105221220604e-06, "loss": 0.0067, "step": 5281 }, { "epoch": 2.403093721565059, "grad_norm": 0.4404124093235634, "learning_rate": 5.3056786112607535e-06, "loss": 0.0111, "step": 5282 }, { "epoch": 2.4035486806187443, "grad_norm": 0.42989271959707304, "learning_rate": 5.304251976322229e-06, "loss": 0.0119, "step": 5283 }, { "epoch": 2.4040036396724296, "grad_norm": 0.4631793935562148, "learning_rate": 5.302825316521607e-06, "loss": 0.0143, "step": 5284 }, { "epoch": 2.404458598726115, "grad_norm": 0.58463774112294, "learning_rate": 5.301398631975466e-06, "loss": 0.0208, "step": 5285 }, { "epoch": 2.4049135577797998, "grad_norm": 0.5531017946934959, "learning_rate": 5.299971922800391e-06, "loss": 0.0287, "step": 5286 }, { "epoch": 2.405368516833485, "grad_norm": 0.33464890796105184, "learning_rate": 5.298545189112966e-06, "loss": 0.01, "step": 5287 }, { "epoch": 2.4058234758871704, "grad_norm": 0.40252846455148866, "learning_rate": 5.297118431029775e-06, "loss": 0.0175, "step": 5288 }, { "epoch": 2.4062784349408552, "grad_norm": 0.28830007619590997, "learning_rate": 5.295691648667407e-06, "loss": 0.0079, "step": 5289 }, { "epoch": 2.4067333939945406, "grad_norm": 0.36136838045868824, "learning_rate": 5.294264842142454e-06, "loss": 0.0112, "step": 5290 }, { "epoch": 2.407188353048226, "grad_norm": 0.42706958288079705, "learning_rate": 5.292838011571507e-06, "loss": 0.0103, "step": 5291 }, { "epoch": 2.4076433121019107, "grad_norm": 0.5404021820517108, "learning_rate": 5.2914111570711605e-06, "loss": 0.0141, "step": 5292 }, { "epoch": 2.408098271155596, "grad_norm": 0.40642751021129225, "learning_rate": 5.289984278758009e-06, "loss": 0.0153, "step": 5293 }, { "epoch": 2.4085532302092814, "grad_norm": 0.30578502453889306, "learning_rate": 5.2885573767486535e-06, "loss": 0.0113, "step": 5294 }, { "epoch": 2.4090081892629662, "grad_norm": 0.3209377054110551, "learning_rate": 5.28713045115969e-06, "loss": 0.0114, "step": 5295 }, { "epoch": 2.4094631483166515, "grad_norm": 0.48096854465015776, "learning_rate": 5.285703502107724e-06, "loss": 0.017, "step": 5296 }, { "epoch": 2.409918107370337, "grad_norm": 0.4219490254103711, "learning_rate": 5.284276529709358e-06, "loss": 0.0125, "step": 5297 }, { "epoch": 2.4103730664240217, "grad_norm": 0.4636782537011902, "learning_rate": 5.282849534081198e-06, "loss": 0.0191, "step": 5298 }, { "epoch": 2.410828025477707, "grad_norm": 0.42315034221489234, "learning_rate": 5.28142251533985e-06, "loss": 0.0175, "step": 5299 }, { "epoch": 2.4112829845313923, "grad_norm": 0.4673610840212676, "learning_rate": 5.2799954736019264e-06, "loss": 0.016, "step": 5300 }, { "epoch": 2.411737943585077, "grad_norm": 0.40097914710543014, "learning_rate": 5.2785684089840375e-06, "loss": 0.0146, "step": 5301 }, { "epoch": 2.4121929026387625, "grad_norm": 0.445243466924135, "learning_rate": 5.277141321602795e-06, "loss": 0.0173, "step": 5302 }, { "epoch": 2.412647861692448, "grad_norm": 0.37737962994500585, "learning_rate": 5.275714211574816e-06, "loss": 0.0106, "step": 5303 }, { "epoch": 2.4131028207461327, "grad_norm": 0.4112204646422568, "learning_rate": 5.274287079016717e-06, "loss": 0.0157, "step": 5304 }, { "epoch": 2.413557779799818, "grad_norm": 0.33025822861785786, "learning_rate": 5.272859924045116e-06, "loss": 0.0113, "step": 5305 }, { "epoch": 2.4140127388535033, "grad_norm": 0.3300443428375169, "learning_rate": 5.2714327467766335e-06, "loss": 0.0107, "step": 5306 }, { "epoch": 2.414467697907188, "grad_norm": 0.40680442981426845, "learning_rate": 5.2700055473278935e-06, "loss": 0.0111, "step": 5307 }, { "epoch": 2.4149226569608735, "grad_norm": 0.3689628882005614, "learning_rate": 5.268578325815521e-06, "loss": 0.0149, "step": 5308 }, { "epoch": 2.415377616014559, "grad_norm": 0.3866694235292162, "learning_rate": 5.267151082356138e-06, "loss": 0.0103, "step": 5309 }, { "epoch": 2.4158325750682437, "grad_norm": 0.3141065437405475, "learning_rate": 5.265723817066376e-06, "loss": 0.0097, "step": 5310 }, { "epoch": 2.416287534121929, "grad_norm": 0.27490602838646616, "learning_rate": 5.264296530062865e-06, "loss": 0.0079, "step": 5311 }, { "epoch": 2.4167424931756143, "grad_norm": 0.38772971732292993, "learning_rate": 5.262869221462234e-06, "loss": 0.0061, "step": 5312 }, { "epoch": 2.417197452229299, "grad_norm": 0.26314076127170805, "learning_rate": 5.261441891381116e-06, "loss": 0.0087, "step": 5313 }, { "epoch": 2.4176524112829845, "grad_norm": 0.2792754343780329, "learning_rate": 5.260014539936148e-06, "loss": 0.0091, "step": 5314 }, { "epoch": 2.41810737033667, "grad_norm": 0.6497302976664651, "learning_rate": 5.258587167243968e-06, "loss": 0.0184, "step": 5315 }, { "epoch": 2.4185623293903546, "grad_norm": 0.4894356800746713, "learning_rate": 5.257159773421211e-06, "loss": 0.0137, "step": 5316 }, { "epoch": 2.41901728844404, "grad_norm": 0.38163565505543456, "learning_rate": 5.255732358584517e-06, "loss": 0.0171, "step": 5317 }, { "epoch": 2.4194722474977253, "grad_norm": 0.3234146579281654, "learning_rate": 5.2543049228505326e-06, "loss": 0.007, "step": 5318 }, { "epoch": 2.41992720655141, "grad_norm": 0.44372590044724763, "learning_rate": 5.252877466335897e-06, "loss": 0.0182, "step": 5319 }, { "epoch": 2.4203821656050954, "grad_norm": 0.5020596134338452, "learning_rate": 5.251449989157257e-06, "loss": 0.0227, "step": 5320 }, { "epoch": 2.4208371246587808, "grad_norm": 0.4087790160266453, "learning_rate": 5.250022491431259e-06, "loss": 0.0189, "step": 5321 }, { "epoch": 2.421292083712466, "grad_norm": 0.29776606030221714, "learning_rate": 5.2485949732745525e-06, "loss": 0.0064, "step": 5322 }, { "epoch": 2.421747042766151, "grad_norm": 0.4447913938555693, "learning_rate": 5.247167434803787e-06, "loss": 0.0133, "step": 5323 }, { "epoch": 2.4222020018198362, "grad_norm": 0.46365246362920237, "learning_rate": 5.245739876135615e-06, "loss": 0.0173, "step": 5324 }, { "epoch": 2.4226569608735216, "grad_norm": 0.5178806777130259, "learning_rate": 5.244312297386691e-06, "loss": 0.0115, "step": 5325 }, { "epoch": 2.4231119199272064, "grad_norm": 0.35093793454524563, "learning_rate": 5.242884698673668e-06, "loss": 0.0114, "step": 5326 }, { "epoch": 2.4235668789808917, "grad_norm": 0.28422567507689994, "learning_rate": 5.241457080113205e-06, "loss": 0.0081, "step": 5327 }, { "epoch": 2.424021838034577, "grad_norm": 0.48736935552274546, "learning_rate": 5.24002944182196e-06, "loss": 0.0189, "step": 5328 }, { "epoch": 2.424476797088262, "grad_norm": 0.2964000798583057, "learning_rate": 5.2386017839165925e-06, "loss": 0.0147, "step": 5329 }, { "epoch": 2.4249317561419472, "grad_norm": 0.5907284720265954, "learning_rate": 5.237174106513764e-06, "loss": 0.0095, "step": 5330 }, { "epoch": 2.4253867151956325, "grad_norm": 0.4737712799278728, "learning_rate": 5.235746409730139e-06, "loss": 0.0141, "step": 5331 }, { "epoch": 2.4258416742493174, "grad_norm": 0.5550673202170863, "learning_rate": 5.234318693682384e-06, "loss": 0.0173, "step": 5332 }, { "epoch": 2.4262966333030027, "grad_norm": 0.14115228212962203, "learning_rate": 5.232890958487162e-06, "loss": 0.0018, "step": 5333 }, { "epoch": 2.426751592356688, "grad_norm": 0.4631423032957736, "learning_rate": 5.231463204261142e-06, "loss": 0.0193, "step": 5334 }, { "epoch": 2.427206551410373, "grad_norm": 0.4235314876677975, "learning_rate": 5.2300354311209955e-06, "loss": 0.0111, "step": 5335 }, { "epoch": 2.427661510464058, "grad_norm": 0.4684844540332601, "learning_rate": 5.228607639183392e-06, "loss": 0.0198, "step": 5336 }, { "epoch": 2.4281164695177435, "grad_norm": 0.32457884621106575, "learning_rate": 5.227179828565003e-06, "loss": 0.0089, "step": 5337 }, { "epoch": 2.4285714285714284, "grad_norm": 0.23451179427681904, "learning_rate": 5.225751999382507e-06, "loss": 0.0068, "step": 5338 }, { "epoch": 2.4290263876251137, "grad_norm": 0.23318095670248587, "learning_rate": 5.224324151752575e-06, "loss": 0.0063, "step": 5339 }, { "epoch": 2.429481346678799, "grad_norm": 0.4347170570515523, "learning_rate": 5.222896285791889e-06, "loss": 0.0163, "step": 5340 }, { "epoch": 2.4299363057324843, "grad_norm": 0.21176806980673593, "learning_rate": 5.221468401617121e-06, "loss": 0.0036, "step": 5341 }, { "epoch": 2.430391264786169, "grad_norm": 0.3457671056366103, "learning_rate": 5.220040499344958e-06, "loss": 0.0077, "step": 5342 }, { "epoch": 2.4308462238398545, "grad_norm": 0.32392634294488304, "learning_rate": 5.2186125790920796e-06, "loss": 0.0056, "step": 5343 }, { "epoch": 2.43130118289354, "grad_norm": 0.34963533307201805, "learning_rate": 5.217184640975167e-06, "loss": 0.0074, "step": 5344 }, { "epoch": 2.4317561419472247, "grad_norm": 0.5346383300170781, "learning_rate": 5.2157566851109075e-06, "loss": 0.0164, "step": 5345 }, { "epoch": 2.43221110100091, "grad_norm": 0.4217985939266549, "learning_rate": 5.214328711615984e-06, "loss": 0.0133, "step": 5346 }, { "epoch": 2.4326660600545953, "grad_norm": 0.4333819173618331, "learning_rate": 5.2129007206070894e-06, "loss": 0.0081, "step": 5347 }, { "epoch": 2.43312101910828, "grad_norm": 0.49905999345083274, "learning_rate": 5.211472712200905e-06, "loss": 0.0105, "step": 5348 }, { "epoch": 2.4335759781619655, "grad_norm": 0.45412017248375747, "learning_rate": 5.210044686514129e-06, "loss": 0.0113, "step": 5349 }, { "epoch": 2.434030937215651, "grad_norm": 0.30414770683658676, "learning_rate": 5.20861664366345e-06, "loss": 0.0054, "step": 5350 }, { "epoch": 2.4344858962693356, "grad_norm": 0.48930476002695283, "learning_rate": 5.207188583765559e-06, "loss": 0.016, "step": 5351 }, { "epoch": 2.434940855323021, "grad_norm": 0.4009965835101347, "learning_rate": 5.205760506937155e-06, "loss": 0.0092, "step": 5352 }, { "epoch": 2.4353958143767063, "grad_norm": 0.41809624341181123, "learning_rate": 5.204332413294929e-06, "loss": 0.0152, "step": 5353 }, { "epoch": 2.435850773430391, "grad_norm": 0.21306333496842877, "learning_rate": 5.202904302955582e-06, "loss": 0.0039, "step": 5354 }, { "epoch": 2.4363057324840764, "grad_norm": 0.48254684653454416, "learning_rate": 5.201476176035813e-06, "loss": 0.0167, "step": 5355 }, { "epoch": 2.4367606915377618, "grad_norm": 0.4504698553713479, "learning_rate": 5.200048032652319e-06, "loss": 0.0139, "step": 5356 }, { "epoch": 2.4372156505914466, "grad_norm": 0.6854774387741476, "learning_rate": 5.198619872921804e-06, "loss": 0.0226, "step": 5357 }, { "epoch": 2.437670609645132, "grad_norm": 0.41483620737618115, "learning_rate": 5.1971916969609685e-06, "loss": 0.0132, "step": 5358 }, { "epoch": 2.4381255686988172, "grad_norm": 0.5129994167682194, "learning_rate": 5.195763504886518e-06, "loss": 0.0261, "step": 5359 }, { "epoch": 2.438580527752502, "grad_norm": 0.41405688069575036, "learning_rate": 5.19433529681516e-06, "loss": 0.0067, "step": 5360 }, { "epoch": 2.4390354868061874, "grad_norm": 0.43578772096858115, "learning_rate": 5.1929070728635985e-06, "loss": 0.0159, "step": 5361 }, { "epoch": 2.4394904458598727, "grad_norm": 0.31965492899761083, "learning_rate": 5.1914788331485424e-06, "loss": 0.0079, "step": 5362 }, { "epoch": 2.4399454049135576, "grad_norm": 0.40154663104201666, "learning_rate": 5.190050577786699e-06, "loss": 0.0074, "step": 5363 }, { "epoch": 2.440400363967243, "grad_norm": 0.44931254718914193, "learning_rate": 5.188622306894783e-06, "loss": 0.0106, "step": 5364 }, { "epoch": 2.4408553230209282, "grad_norm": 0.33437106617358336, "learning_rate": 5.187194020589501e-06, "loss": 0.005, "step": 5365 }, { "epoch": 2.441310282074613, "grad_norm": 0.20199436605898388, "learning_rate": 5.185765718987571e-06, "loss": 0.0025, "step": 5366 }, { "epoch": 2.4417652411282984, "grad_norm": 0.46926903808957776, "learning_rate": 5.184337402205705e-06, "loss": 0.0194, "step": 5367 }, { "epoch": 2.4422202001819837, "grad_norm": 0.367596857437341, "learning_rate": 5.18290907036062e-06, "loss": 0.0084, "step": 5368 }, { "epoch": 2.4426751592356686, "grad_norm": 0.1327251181749855, "learning_rate": 5.18148072356903e-06, "loss": 0.0025, "step": 5369 }, { "epoch": 2.443130118289354, "grad_norm": 0.4147742303890549, "learning_rate": 5.180052361947656e-06, "loss": 0.0104, "step": 5370 }, { "epoch": 2.443585077343039, "grad_norm": 0.4630253091530012, "learning_rate": 5.178623985613216e-06, "loss": 0.0199, "step": 5371 }, { "epoch": 2.444040036396724, "grad_norm": 0.34218287491377375, "learning_rate": 5.177195594682431e-06, "loss": 0.0108, "step": 5372 }, { "epoch": 2.4444949954504094, "grad_norm": 0.409352010746614, "learning_rate": 5.17576718927202e-06, "loss": 0.0073, "step": 5373 }, { "epoch": 2.4449499545040947, "grad_norm": 0.36350434011146243, "learning_rate": 5.174338769498711e-06, "loss": 0.0118, "step": 5374 }, { "epoch": 2.4454049135577796, "grad_norm": 0.33996923505627474, "learning_rate": 5.172910335479223e-06, "loss": 0.0141, "step": 5375 }, { "epoch": 2.445859872611465, "grad_norm": 0.5691354080089669, "learning_rate": 5.171481887330283e-06, "loss": 0.0275, "step": 5376 }, { "epoch": 2.44631483166515, "grad_norm": 0.587573769102348, "learning_rate": 5.170053425168619e-06, "loss": 0.0171, "step": 5377 }, { "epoch": 2.4467697907188355, "grad_norm": 0.4951503647581403, "learning_rate": 5.168624949110956e-06, "loss": 0.0107, "step": 5378 }, { "epoch": 2.4472247497725204, "grad_norm": 0.5222877485943915, "learning_rate": 5.167196459274024e-06, "loss": 0.0233, "step": 5379 }, { "epoch": 2.4476797088262057, "grad_norm": 0.3857745059781012, "learning_rate": 5.165767955774554e-06, "loss": 0.0122, "step": 5380 }, { "epoch": 2.448134667879891, "grad_norm": 0.4280925056961366, "learning_rate": 5.164339438729273e-06, "loss": 0.0097, "step": 5381 }, { "epoch": 2.448589626933576, "grad_norm": 0.4134588427952028, "learning_rate": 5.162910908254917e-06, "loss": 0.0167, "step": 5382 }, { "epoch": 2.449044585987261, "grad_norm": 0.41863303981195715, "learning_rate": 5.161482364468216e-06, "loss": 0.0254, "step": 5383 }, { "epoch": 2.4494995450409465, "grad_norm": 0.29417651115854027, "learning_rate": 5.1600538074859045e-06, "loss": 0.0059, "step": 5384 }, { "epoch": 2.4499545040946313, "grad_norm": 0.3633088275042752, "learning_rate": 5.15862523742472e-06, "loss": 0.0115, "step": 5385 }, { "epoch": 2.4504094631483166, "grad_norm": 0.44125434053934054, "learning_rate": 5.157196654401397e-06, "loss": 0.0157, "step": 5386 }, { "epoch": 2.450864422202002, "grad_norm": 0.2642282335305633, "learning_rate": 5.155768058532674e-06, "loss": 0.0061, "step": 5387 }, { "epoch": 2.451319381255687, "grad_norm": 0.2843265353073583, "learning_rate": 5.15433944993529e-06, "loss": 0.0105, "step": 5388 }, { "epoch": 2.451774340309372, "grad_norm": 0.4197881250600456, "learning_rate": 5.15291082872598e-06, "loss": 0.0063, "step": 5389 }, { "epoch": 2.4522292993630574, "grad_norm": 0.4789517931451006, "learning_rate": 5.151482195021489e-06, "loss": 0.0234, "step": 5390 }, { "epoch": 2.4526842584167423, "grad_norm": 0.4717176046669697, "learning_rate": 5.150053548938557e-06, "loss": 0.022, "step": 5391 }, { "epoch": 2.4531392174704276, "grad_norm": 0.23207011911006153, "learning_rate": 5.148624890593927e-06, "loss": 0.0062, "step": 5392 }, { "epoch": 2.453594176524113, "grad_norm": 0.459972570924796, "learning_rate": 5.147196220104342e-06, "loss": 0.017, "step": 5393 }, { "epoch": 2.4540491355777982, "grad_norm": 0.5567722874176256, "learning_rate": 5.145767537586546e-06, "loss": 0.0124, "step": 5394 }, { "epoch": 2.454504094631483, "grad_norm": 0.30209411330804076, "learning_rate": 5.144338843157286e-06, "loss": 0.0096, "step": 5395 }, { "epoch": 2.4549590536851684, "grad_norm": 0.2519017471529283, "learning_rate": 5.1429101369333065e-06, "loss": 0.0072, "step": 5396 }, { "epoch": 2.4554140127388537, "grad_norm": 0.6563956864761441, "learning_rate": 5.141481419031357e-06, "loss": 0.0187, "step": 5397 }, { "epoch": 2.4558689717925386, "grad_norm": 0.4708398496905736, "learning_rate": 5.140052689568185e-06, "loss": 0.0162, "step": 5398 }, { "epoch": 2.456323930846224, "grad_norm": 0.17379437121204325, "learning_rate": 5.1386239486605394e-06, "loss": 0.0034, "step": 5399 }, { "epoch": 2.4567788898999092, "grad_norm": 0.2611660630951624, "learning_rate": 5.1371951964251695e-06, "loss": 0.0078, "step": 5400 }, { "epoch": 2.457233848953594, "grad_norm": 0.3978398002330195, "learning_rate": 5.135766432978829e-06, "loss": 0.0127, "step": 5401 }, { "epoch": 2.4576888080072794, "grad_norm": 0.2922957300568856, "learning_rate": 5.134337658438269e-06, "loss": 0.0074, "step": 5402 }, { "epoch": 2.4581437670609647, "grad_norm": 0.2539151326498751, "learning_rate": 5.132908872920242e-06, "loss": 0.0103, "step": 5403 }, { "epoch": 2.4585987261146496, "grad_norm": 0.458182720404031, "learning_rate": 5.1314800765415014e-06, "loss": 0.0092, "step": 5404 }, { "epoch": 2.459053685168335, "grad_norm": 0.3001758441171674, "learning_rate": 5.130051269418804e-06, "loss": 0.0062, "step": 5405 }, { "epoch": 2.45950864422202, "grad_norm": 0.6882974671921861, "learning_rate": 5.128622451668903e-06, "loss": 0.0309, "step": 5406 }, { "epoch": 2.459963603275705, "grad_norm": 0.49732412901573037, "learning_rate": 5.127193623408556e-06, "loss": 0.0141, "step": 5407 }, { "epoch": 2.4604185623293904, "grad_norm": 0.41004480864420567, "learning_rate": 5.125764784754521e-06, "loss": 0.013, "step": 5408 }, { "epoch": 2.4608735213830757, "grad_norm": 0.41635378566858544, "learning_rate": 5.1243359358235555e-06, "loss": 0.0145, "step": 5409 }, { "epoch": 2.4613284804367606, "grad_norm": 0.5225584751956076, "learning_rate": 5.12290707673242e-06, "loss": 0.0136, "step": 5410 }, { "epoch": 2.461783439490446, "grad_norm": 0.40487768289825765, "learning_rate": 5.121478207597871e-06, "loss": 0.0118, "step": 5411 }, { "epoch": 2.462238398544131, "grad_norm": 0.32717360369468845, "learning_rate": 5.120049328536674e-06, "loss": 0.0094, "step": 5412 }, { "epoch": 2.462693357597816, "grad_norm": 0.48756100640197947, "learning_rate": 5.1186204396655855e-06, "loss": 0.0178, "step": 5413 }, { "epoch": 2.4631483166515014, "grad_norm": 0.6624070111868631, "learning_rate": 5.117191541101372e-06, "loss": 0.012, "step": 5414 }, { "epoch": 2.4636032757051867, "grad_norm": 0.6506693017472739, "learning_rate": 5.115762632960795e-06, "loss": 0.0213, "step": 5415 }, { "epoch": 2.4640582347588715, "grad_norm": 0.2881670840004332, "learning_rate": 5.114333715360617e-06, "loss": 0.0071, "step": 5416 }, { "epoch": 2.464513193812557, "grad_norm": 0.26900122959065437, "learning_rate": 5.1129047884176065e-06, "loss": 0.0089, "step": 5417 }, { "epoch": 2.464968152866242, "grad_norm": 0.3812297870362603, "learning_rate": 5.111475852248523e-06, "loss": 0.0139, "step": 5418 }, { "epoch": 2.465423111919927, "grad_norm": 0.4476381917589943, "learning_rate": 5.11004690697014e-06, "loss": 0.0116, "step": 5419 }, { "epoch": 2.4658780709736123, "grad_norm": 0.37913712833335467, "learning_rate": 5.1086179526992185e-06, "loss": 0.0112, "step": 5420 }, { "epoch": 2.4663330300272976, "grad_norm": 0.35300019232667945, "learning_rate": 5.107188989552529e-06, "loss": 0.0157, "step": 5421 }, { "epoch": 2.4667879890809825, "grad_norm": 0.483643978357322, "learning_rate": 5.105760017646839e-06, "loss": 0.0147, "step": 5422 }, { "epoch": 2.467242948134668, "grad_norm": 0.4084233464230104, "learning_rate": 5.1043310370989184e-06, "loss": 0.0174, "step": 5423 }, { "epoch": 2.467697907188353, "grad_norm": 0.3258662727119082, "learning_rate": 5.102902048025538e-06, "loss": 0.013, "step": 5424 }, { "epoch": 2.468152866242038, "grad_norm": 0.47279595459088114, "learning_rate": 5.101473050543464e-06, "loss": 0.0146, "step": 5425 }, { "epoch": 2.4686078252957233, "grad_norm": 0.3251070762000244, "learning_rate": 5.100044044769472e-06, "loss": 0.0089, "step": 5426 }, { "epoch": 2.4690627843494086, "grad_norm": 0.3201788561181316, "learning_rate": 5.098615030820333e-06, "loss": 0.0071, "step": 5427 }, { "epoch": 2.4695177434030935, "grad_norm": 0.28274405624464927, "learning_rate": 5.097186008812818e-06, "loss": 0.0056, "step": 5428 }, { "epoch": 2.469972702456779, "grad_norm": 0.2820569423373688, "learning_rate": 5.095756978863702e-06, "loss": 0.006, "step": 5429 }, { "epoch": 2.470427661510464, "grad_norm": 0.4647302280554121, "learning_rate": 5.094327941089758e-06, "loss": 0.0096, "step": 5430 }, { "epoch": 2.470882620564149, "grad_norm": 0.250235327250325, "learning_rate": 5.0928988956077604e-06, "loss": 0.0084, "step": 5431 }, { "epoch": 2.4713375796178343, "grad_norm": 0.3826893361598013, "learning_rate": 5.0914698425344845e-06, "loss": 0.0087, "step": 5432 }, { "epoch": 2.4717925386715196, "grad_norm": 0.8139476684262739, "learning_rate": 5.090040781986706e-06, "loss": 0.0239, "step": 5433 }, { "epoch": 2.472247497725205, "grad_norm": 0.2749894403917113, "learning_rate": 5.088611714081203e-06, "loss": 0.0068, "step": 5434 }, { "epoch": 2.47270245677889, "grad_norm": 0.5239176380706023, "learning_rate": 5.0871826389347475e-06, "loss": 0.0172, "step": 5435 }, { "epoch": 2.473157415832575, "grad_norm": 0.4481873000513952, "learning_rate": 5.085753556664124e-06, "loss": 0.0102, "step": 5436 }, { "epoch": 2.4736123748862604, "grad_norm": 0.3097559481978516, "learning_rate": 5.084324467386106e-06, "loss": 0.0129, "step": 5437 }, { "epoch": 2.4740673339399453, "grad_norm": 0.3057121182463589, "learning_rate": 5.0828953712174735e-06, "loss": 0.0083, "step": 5438 }, { "epoch": 2.4745222929936306, "grad_norm": 0.4079199912576415, "learning_rate": 5.081466268275005e-06, "loss": 0.0104, "step": 5439 }, { "epoch": 2.474977252047316, "grad_norm": 0.3809173575195753, "learning_rate": 5.080037158675481e-06, "loss": 0.0136, "step": 5440 }, { "epoch": 2.4754322111010008, "grad_norm": 0.30222294439440717, "learning_rate": 5.078608042535682e-06, "loss": 0.0091, "step": 5441 }, { "epoch": 2.475887170154686, "grad_norm": 0.43467376785926043, "learning_rate": 5.077178919972388e-06, "loss": 0.0138, "step": 5442 }, { "epoch": 2.4763421292083714, "grad_norm": 0.5629966441322612, "learning_rate": 5.075749791102382e-06, "loss": 0.0124, "step": 5443 }, { "epoch": 2.4767970882620562, "grad_norm": 0.44402811668745085, "learning_rate": 5.074320656042446e-06, "loss": 0.0148, "step": 5444 }, { "epoch": 2.4772520473157416, "grad_norm": 0.3843873930847924, "learning_rate": 5.072891514909361e-06, "loss": 0.0102, "step": 5445 }, { "epoch": 2.477707006369427, "grad_norm": 0.32882781000696565, "learning_rate": 5.071462367819909e-06, "loss": 0.0119, "step": 5446 }, { "epoch": 2.4781619654231117, "grad_norm": 0.42795726773632586, "learning_rate": 5.070033214890876e-06, "loss": 0.016, "step": 5447 }, { "epoch": 2.478616924476797, "grad_norm": 0.22838737121675054, "learning_rate": 5.068604056239046e-06, "loss": 0.0043, "step": 5448 }, { "epoch": 2.4790718835304824, "grad_norm": 0.34419758854472965, "learning_rate": 5.067174891981201e-06, "loss": 0.0078, "step": 5449 }, { "epoch": 2.4795268425841677, "grad_norm": 0.6802361006015856, "learning_rate": 5.065745722234128e-06, "loss": 0.0235, "step": 5450 }, { "epoch": 2.4799818016378525, "grad_norm": 0.3091308707042837, "learning_rate": 5.064316547114612e-06, "loss": 0.0052, "step": 5451 }, { "epoch": 2.480436760691538, "grad_norm": 0.3555379741422837, "learning_rate": 5.0628873667394364e-06, "loss": 0.0081, "step": 5452 }, { "epoch": 2.480891719745223, "grad_norm": 0.3114904842834978, "learning_rate": 5.061458181225391e-06, "loss": 0.0062, "step": 5453 }, { "epoch": 2.481346678798908, "grad_norm": 0.3427768927053479, "learning_rate": 5.060028990689259e-06, "loss": 0.0087, "step": 5454 }, { "epoch": 2.4818016378525933, "grad_norm": 0.49459219472705657, "learning_rate": 5.05859979524783e-06, "loss": 0.0158, "step": 5455 }, { "epoch": 2.4822565969062786, "grad_norm": 0.30078272464870365, "learning_rate": 5.057170595017891e-06, "loss": 0.0099, "step": 5456 }, { "epoch": 2.4827115559599635, "grad_norm": 0.2252201083771656, "learning_rate": 5.055741390116227e-06, "loss": 0.0036, "step": 5457 }, { "epoch": 2.483166515013649, "grad_norm": 0.5446063405835339, "learning_rate": 5.054312180659631e-06, "loss": 0.0102, "step": 5458 }, { "epoch": 2.483621474067334, "grad_norm": 0.43403876230490246, "learning_rate": 5.0528829667648875e-06, "loss": 0.0092, "step": 5459 }, { "epoch": 2.484076433121019, "grad_norm": 0.45501792804101815, "learning_rate": 5.051453748548786e-06, "loss": 0.015, "step": 5460 }, { "epoch": 2.4845313921747043, "grad_norm": 1.3807537003371522, "learning_rate": 5.050024526128118e-06, "loss": 0.0087, "step": 5461 }, { "epoch": 2.4849863512283896, "grad_norm": 0.4206908833428577, "learning_rate": 5.048595299619671e-06, "loss": 0.018, "step": 5462 }, { "epoch": 2.4854413102820745, "grad_norm": 0.5526855525115705, "learning_rate": 5.047166069140235e-06, "loss": 0.0197, "step": 5463 }, { "epoch": 2.48589626933576, "grad_norm": 0.43880393214773, "learning_rate": 5.0457368348066e-06, "loss": 0.0108, "step": 5464 }, { "epoch": 2.486351228389445, "grad_norm": 0.42724520346313205, "learning_rate": 5.04430759673556e-06, "loss": 0.0078, "step": 5465 }, { "epoch": 2.48680618744313, "grad_norm": 0.5998726297653039, "learning_rate": 5.042878355043902e-06, "loss": 0.0153, "step": 5466 }, { "epoch": 2.4872611464968153, "grad_norm": 0.24920444582854095, "learning_rate": 5.041449109848418e-06, "loss": 0.0044, "step": 5467 }, { "epoch": 2.4877161055505006, "grad_norm": 0.4596108651630432, "learning_rate": 5.040019861265901e-06, "loss": 0.0132, "step": 5468 }, { "epoch": 2.4881710646041855, "grad_norm": 0.26925258867518487, "learning_rate": 5.038590609413141e-06, "loss": 0.0072, "step": 5469 }, { "epoch": 2.488626023657871, "grad_norm": 0.350170449969174, "learning_rate": 5.03716135440693e-06, "loss": 0.0139, "step": 5470 }, { "epoch": 2.489080982711556, "grad_norm": 0.49030337105475924, "learning_rate": 5.035732096364061e-06, "loss": 0.0322, "step": 5471 }, { "epoch": 2.489535941765241, "grad_norm": 0.5060644314900122, "learning_rate": 5.034302835401328e-06, "loss": 0.0139, "step": 5472 }, { "epoch": 2.4899909008189263, "grad_norm": 0.5299328884299171, "learning_rate": 5.032873571635522e-06, "loss": 0.0237, "step": 5473 }, { "epoch": 2.4904458598726116, "grad_norm": 0.4599415921033778, "learning_rate": 5.031444305183435e-06, "loss": 0.0089, "step": 5474 }, { "epoch": 2.4909008189262964, "grad_norm": 0.4580562875637281, "learning_rate": 5.030015036161863e-06, "loss": 0.0147, "step": 5475 }, { "epoch": 2.4913557779799818, "grad_norm": 0.3369733129797372, "learning_rate": 5.028585764687596e-06, "loss": 0.0089, "step": 5476 }, { "epoch": 2.491810737033667, "grad_norm": 0.3201648664458773, "learning_rate": 5.02715649087743e-06, "loss": 0.0076, "step": 5477 }, { "epoch": 2.492265696087352, "grad_norm": 0.4890149308940846, "learning_rate": 5.025727214848158e-06, "loss": 0.0102, "step": 5478 }, { "epoch": 2.4927206551410372, "grad_norm": 0.39312401492933874, "learning_rate": 5.024297936716574e-06, "loss": 0.0088, "step": 5479 }, { "epoch": 2.4931756141947226, "grad_norm": 0.3618076509461657, "learning_rate": 5.0228686565994745e-06, "loss": 0.0109, "step": 5480 }, { "epoch": 2.4936305732484074, "grad_norm": 0.4513013278322856, "learning_rate": 5.021439374613648e-06, "loss": 0.0165, "step": 5481 }, { "epoch": 2.4940855323020927, "grad_norm": 0.671144246567914, "learning_rate": 5.020010090875895e-06, "loss": 0.0137, "step": 5482 }, { "epoch": 2.494540491355778, "grad_norm": 0.4678025512741067, "learning_rate": 5.018580805503007e-06, "loss": 0.0172, "step": 5483 }, { "epoch": 2.494995450409463, "grad_norm": 0.45375195687662523, "learning_rate": 5.01715151861178e-06, "loss": 0.015, "step": 5484 }, { "epoch": 2.4954504094631482, "grad_norm": 0.27386671611422353, "learning_rate": 5.015722230319009e-06, "loss": 0.0078, "step": 5485 }, { "epoch": 2.4959053685168335, "grad_norm": 0.47655367248773756, "learning_rate": 5.014292940741487e-06, "loss": 0.0172, "step": 5486 }, { "epoch": 2.496360327570519, "grad_norm": 0.38754885419391666, "learning_rate": 5.012863649996013e-06, "loss": 0.0149, "step": 5487 }, { "epoch": 2.4968152866242037, "grad_norm": 0.33897821581513504, "learning_rate": 5.011434358199375e-06, "loss": 0.0106, "step": 5488 }, { "epoch": 2.497270245677889, "grad_norm": 0.47996766790953127, "learning_rate": 5.0100050654683766e-06, "loss": 0.0204, "step": 5489 }, { "epoch": 2.4977252047315743, "grad_norm": 0.4268913339924163, "learning_rate": 5.008575771919808e-06, "loss": 0.0082, "step": 5490 }, { "epoch": 2.498180163785259, "grad_norm": 0.3951902395496699, "learning_rate": 5.007146477670466e-06, "loss": 0.0112, "step": 5491 }, { "epoch": 2.4986351228389445, "grad_norm": 0.7081438062828642, "learning_rate": 5.005717182837147e-06, "loss": 0.0305, "step": 5492 }, { "epoch": 2.49909008189263, "grad_norm": 0.42096235736073284, "learning_rate": 5.004287887536645e-06, "loss": 0.0203, "step": 5493 }, { "epoch": 2.4995450409463147, "grad_norm": 0.5346006200366389, "learning_rate": 5.002858591885756e-06, "loss": 0.0189, "step": 5494 }, { "epoch": 2.5, "grad_norm": 0.49017838297375854, "learning_rate": 5.001429296001275e-06, "loss": 0.0145, "step": 5495 }, { "epoch": 2.5004549590536853, "grad_norm": 0.4141991773280852, "learning_rate": 5e-06, "loss": 0.0096, "step": 5496 }, { "epoch": 2.50090991810737, "grad_norm": 0.2467185146822804, "learning_rate": 4.9985707039987256e-06, "loss": 0.0064, "step": 5497 }, { "epoch": 2.5013648771610555, "grad_norm": 0.35852650013127413, "learning_rate": 4.9971414081142455e-06, "loss": 0.0114, "step": 5498 }, { "epoch": 2.501819836214741, "grad_norm": 0.2997727062951807, "learning_rate": 4.995712112463358e-06, "loss": 0.0083, "step": 5499 }, { "epoch": 2.502274795268426, "grad_norm": 0.4678314645703148, "learning_rate": 4.994282817162854e-06, "loss": 0.0246, "step": 5500 }, { "epoch": 2.502729754322111, "grad_norm": 0.37661683265971724, "learning_rate": 4.992853522329535e-06, "loss": 0.01, "step": 5501 }, { "epoch": 2.5031847133757963, "grad_norm": 0.3003152893169655, "learning_rate": 4.991424228080193e-06, "loss": 0.0081, "step": 5502 }, { "epoch": 2.5036396724294816, "grad_norm": 0.27232546572021565, "learning_rate": 4.989994934531625e-06, "loss": 0.0048, "step": 5503 }, { "epoch": 2.5040946314831665, "grad_norm": 0.31903561416643506, "learning_rate": 4.988565641800626e-06, "loss": 0.0103, "step": 5504 }, { "epoch": 2.5045495905368518, "grad_norm": 0.31597363231985026, "learning_rate": 4.98713635000399e-06, "loss": 0.0087, "step": 5505 }, { "epoch": 2.505004549590537, "grad_norm": 0.3289676047625398, "learning_rate": 4.985707059258515e-06, "loss": 0.0092, "step": 5506 }, { "epoch": 2.505459508644222, "grad_norm": 0.36358965738000726, "learning_rate": 4.9842777696809925e-06, "loss": 0.0107, "step": 5507 }, { "epoch": 2.5059144676979073, "grad_norm": 0.3595581796565044, "learning_rate": 4.98284848138822e-06, "loss": 0.0156, "step": 5508 }, { "epoch": 2.5063694267515926, "grad_norm": 0.29196970351493584, "learning_rate": 4.9814191944969934e-06, "loss": 0.0071, "step": 5509 }, { "epoch": 2.5068243858052774, "grad_norm": 0.36578404478530974, "learning_rate": 4.979989909124106e-06, "loss": 0.0102, "step": 5510 }, { "epoch": 2.5072793448589628, "grad_norm": 0.4041831002992017, "learning_rate": 4.978560625386354e-06, "loss": 0.0122, "step": 5511 }, { "epoch": 2.507734303912648, "grad_norm": 0.4143905742161826, "learning_rate": 4.977131343400528e-06, "loss": 0.0137, "step": 5512 }, { "epoch": 2.508189262966333, "grad_norm": 0.40720665401264816, "learning_rate": 4.975702063283428e-06, "loss": 0.0063, "step": 5513 }, { "epoch": 2.5086442220200182, "grad_norm": 0.564393026317754, "learning_rate": 4.974272785151843e-06, "loss": 0.0201, "step": 5514 }, { "epoch": 2.5090991810737036, "grad_norm": 0.30933261805319073, "learning_rate": 4.972843509122571e-06, "loss": 0.0112, "step": 5515 }, { "epoch": 2.5095541401273884, "grad_norm": 0.3069960764395276, "learning_rate": 4.971414235312406e-06, "loss": 0.0082, "step": 5516 }, { "epoch": 2.5100090991810737, "grad_norm": 0.5014702072334418, "learning_rate": 4.9699849638381396e-06, "loss": 0.0128, "step": 5517 }, { "epoch": 2.510464058234759, "grad_norm": 0.36167069762141185, "learning_rate": 4.968555694816567e-06, "loss": 0.0161, "step": 5518 }, { "epoch": 2.510919017288444, "grad_norm": 0.47630341434996465, "learning_rate": 4.96712642836448e-06, "loss": 0.0105, "step": 5519 }, { "epoch": 2.511373976342129, "grad_norm": 0.3495169527672344, "learning_rate": 4.965697164598674e-06, "loss": 0.0086, "step": 5520 }, { "epoch": 2.5118289353958145, "grad_norm": 0.4536091118396002, "learning_rate": 4.964267903635939e-06, "loss": 0.0174, "step": 5521 }, { "epoch": 2.5122838944494994, "grad_norm": 0.5347034444205534, "learning_rate": 4.96283864559307e-06, "loss": 0.0094, "step": 5522 }, { "epoch": 2.5127388535031847, "grad_norm": 0.34504812867014445, "learning_rate": 4.96140939058686e-06, "loss": 0.0131, "step": 5523 }, { "epoch": 2.51319381255687, "grad_norm": 0.3159930637146742, "learning_rate": 4.9599801387341e-06, "loss": 0.007, "step": 5524 }, { "epoch": 2.513648771610555, "grad_norm": 0.4629740972135442, "learning_rate": 4.958550890151584e-06, "loss": 0.011, "step": 5525 }, { "epoch": 2.51410373066424, "grad_norm": 0.3702026628093378, "learning_rate": 4.9571216449561e-06, "loss": 0.0108, "step": 5526 }, { "epoch": 2.5145586897179255, "grad_norm": 0.6242326412181597, "learning_rate": 4.955692403264442e-06, "loss": 0.014, "step": 5527 }, { "epoch": 2.5150136487716104, "grad_norm": 0.5075724414262232, "learning_rate": 4.9542631651934e-06, "loss": 0.0131, "step": 5528 }, { "epoch": 2.5154686078252957, "grad_norm": 0.8756432428133315, "learning_rate": 4.952833930859766e-06, "loss": 0.0436, "step": 5529 }, { "epoch": 2.515923566878981, "grad_norm": 0.5147063715586754, "learning_rate": 4.951404700380331e-06, "loss": 0.0295, "step": 5530 }, { "epoch": 2.516378525932666, "grad_norm": 0.30998361393225377, "learning_rate": 4.9499754738718835e-06, "loss": 0.0093, "step": 5531 }, { "epoch": 2.516833484986351, "grad_norm": 0.3577270446223234, "learning_rate": 4.948546251451216e-06, "loss": 0.0115, "step": 5532 }, { "epoch": 2.5172884440400365, "grad_norm": 0.3958399614718588, "learning_rate": 4.947117033235116e-06, "loss": 0.0187, "step": 5533 }, { "epoch": 2.5177434030937214, "grad_norm": 0.2819495908306721, "learning_rate": 4.945687819340372e-06, "loss": 0.0051, "step": 5534 }, { "epoch": 2.5181983621474067, "grad_norm": 0.5591416708777733, "learning_rate": 4.944258609883773e-06, "loss": 0.0172, "step": 5535 }, { "epoch": 2.518653321201092, "grad_norm": 0.3741700766720031, "learning_rate": 4.942829404982112e-06, "loss": 0.0142, "step": 5536 }, { "epoch": 2.519108280254777, "grad_norm": 0.4945911767497451, "learning_rate": 4.9414002047521705e-06, "loss": 0.019, "step": 5537 }, { "epoch": 2.519563239308462, "grad_norm": 0.31958519221663395, "learning_rate": 4.939971009310743e-06, "loss": 0.0135, "step": 5538 }, { "epoch": 2.5200181983621475, "grad_norm": 0.3592760701659723, "learning_rate": 4.938541818774611e-06, "loss": 0.0124, "step": 5539 }, { "epoch": 2.5204731574158323, "grad_norm": 0.6461483543159348, "learning_rate": 4.937112633260566e-06, "loss": 0.0244, "step": 5540 }, { "epoch": 2.5209281164695176, "grad_norm": 0.34428537904976697, "learning_rate": 4.9356834528853905e-06, "loss": 0.0097, "step": 5541 }, { "epoch": 2.521383075523203, "grad_norm": 0.4758533567881116, "learning_rate": 4.934254277765872e-06, "loss": 0.016, "step": 5542 }, { "epoch": 2.521838034576888, "grad_norm": 0.3571273207320487, "learning_rate": 4.9328251080188e-06, "loss": 0.0093, "step": 5543 }, { "epoch": 2.522292993630573, "grad_norm": 0.31431089221926806, "learning_rate": 4.931395943760955e-06, "loss": 0.0084, "step": 5544 }, { "epoch": 2.5227479526842584, "grad_norm": 4.747832584318776, "learning_rate": 4.929966785109125e-06, "loss": 0.111, "step": 5545 }, { "epoch": 2.5232029117379433, "grad_norm": 0.39506910801267275, "learning_rate": 4.928537632180092e-06, "loss": 0.0102, "step": 5546 }, { "epoch": 2.5236578707916286, "grad_norm": 0.6871172353377315, "learning_rate": 4.927108485090643e-06, "loss": 0.0281, "step": 5547 }, { "epoch": 2.524112829845314, "grad_norm": 0.37466249576280947, "learning_rate": 4.925679343957557e-06, "loss": 0.012, "step": 5548 }, { "epoch": 2.5245677888989992, "grad_norm": 0.34469889399890297, "learning_rate": 4.924250208897619e-06, "loss": 0.0109, "step": 5549 }, { "epoch": 2.525022747952684, "grad_norm": 0.399821198857524, "learning_rate": 4.922821080027613e-06, "loss": 0.0125, "step": 5550 }, { "epoch": 2.5254777070063694, "grad_norm": 0.39885120746996305, "learning_rate": 4.921391957464319e-06, "loss": 0.0129, "step": 5551 }, { "epoch": 2.5259326660600547, "grad_norm": 0.5874602162370535, "learning_rate": 4.919962841324521e-06, "loss": 0.0245, "step": 5552 }, { "epoch": 2.5263876251137396, "grad_norm": 0.33581082842146154, "learning_rate": 4.918533731724997e-06, "loss": 0.0115, "step": 5553 }, { "epoch": 2.526842584167425, "grad_norm": 0.3470907353559372, "learning_rate": 4.917104628782529e-06, "loss": 0.0088, "step": 5554 }, { "epoch": 2.52729754322111, "grad_norm": 0.5267945547914693, "learning_rate": 4.915675532613896e-06, "loss": 0.0142, "step": 5555 }, { "epoch": 2.5277525022747955, "grad_norm": 0.3725294825015107, "learning_rate": 4.914246443335876e-06, "loss": 0.0111, "step": 5556 }, { "epoch": 2.5282074613284804, "grad_norm": 0.2627866071882769, "learning_rate": 4.9128173610652524e-06, "loss": 0.0087, "step": 5557 }, { "epoch": 2.5286624203821657, "grad_norm": 0.40804527856852457, "learning_rate": 4.9113882859187985e-06, "loss": 0.0118, "step": 5558 }, { "epoch": 2.529117379435851, "grad_norm": 0.44875320925114015, "learning_rate": 4.909959218013295e-06, "loss": 0.0208, "step": 5559 }, { "epoch": 2.529572338489536, "grad_norm": 0.439232146447453, "learning_rate": 4.908530157465516e-06, "loss": 0.012, "step": 5560 }, { "epoch": 2.530027297543221, "grad_norm": 0.271794599047473, "learning_rate": 4.907101104392242e-06, "loss": 0.0078, "step": 5561 }, { "epoch": 2.5304822565969065, "grad_norm": 0.8592142893631755, "learning_rate": 4.905672058910243e-06, "loss": 0.0294, "step": 5562 }, { "epoch": 2.5309372156505914, "grad_norm": 0.41292625608557826, "learning_rate": 4.904243021136298e-06, "loss": 0.0107, "step": 5563 }, { "epoch": 2.5313921747042767, "grad_norm": 0.38744962053142673, "learning_rate": 4.902813991187183e-06, "loss": 0.0144, "step": 5564 }, { "epoch": 2.531847133757962, "grad_norm": 0.24441496751971636, "learning_rate": 4.901384969179668e-06, "loss": 0.0078, "step": 5565 }, { "epoch": 2.532302092811647, "grad_norm": 0.4237942333287475, "learning_rate": 4.8999559552305294e-06, "loss": 0.0131, "step": 5566 }, { "epoch": 2.532757051865332, "grad_norm": 0.5124564835590572, "learning_rate": 4.898526949456537e-06, "loss": 0.0092, "step": 5567 }, { "epoch": 2.5332120109190175, "grad_norm": 0.539497107342928, "learning_rate": 4.897097951974465e-06, "loss": 0.0216, "step": 5568 }, { "epoch": 2.5336669699727024, "grad_norm": 0.3139704212663251, "learning_rate": 4.895668962901084e-06, "loss": 0.0084, "step": 5569 }, { "epoch": 2.5341219290263877, "grad_norm": 0.3789700818731656, "learning_rate": 4.894239982353162e-06, "loss": 0.0111, "step": 5570 }, { "epoch": 2.534576888080073, "grad_norm": 0.270368556526887, "learning_rate": 4.892811010447472e-06, "loss": 0.0062, "step": 5571 }, { "epoch": 2.535031847133758, "grad_norm": 0.8572665325618268, "learning_rate": 4.891382047300783e-06, "loss": 0.0237, "step": 5572 }, { "epoch": 2.535486806187443, "grad_norm": 0.3680325899891936, "learning_rate": 4.889953093029862e-06, "loss": 0.0111, "step": 5573 }, { "epoch": 2.5359417652411285, "grad_norm": 0.43888213024771167, "learning_rate": 4.888524147751479e-06, "loss": 0.0141, "step": 5574 }, { "epoch": 2.5363967242948133, "grad_norm": 0.5154113179861864, "learning_rate": 4.887095211582397e-06, "loss": 0.0127, "step": 5575 }, { "epoch": 2.5368516833484986, "grad_norm": 0.3662234379749456, "learning_rate": 4.885666284639385e-06, "loss": 0.0144, "step": 5576 }, { "epoch": 2.537306642402184, "grad_norm": 0.6064861777011574, "learning_rate": 4.884237367039207e-06, "loss": 0.0258, "step": 5577 }, { "epoch": 2.537761601455869, "grad_norm": 0.22811602993101804, "learning_rate": 4.882808458898629e-06, "loss": 0.0058, "step": 5578 }, { "epoch": 2.538216560509554, "grad_norm": 0.47988155796684506, "learning_rate": 4.881379560334416e-06, "loss": 0.0112, "step": 5579 }, { "epoch": 2.5386715195632394, "grad_norm": 0.48247690902291146, "learning_rate": 4.879950671463328e-06, "loss": 0.0113, "step": 5580 }, { "epoch": 2.5391264786169243, "grad_norm": 0.712840557495972, "learning_rate": 4.878521792402131e-06, "loss": 0.0293, "step": 5581 }, { "epoch": 2.5395814376706096, "grad_norm": 0.2824744664732004, "learning_rate": 4.877092923267582e-06, "loss": 0.0056, "step": 5582 }, { "epoch": 2.540036396724295, "grad_norm": 0.3342297770263099, "learning_rate": 4.875664064176447e-06, "loss": 0.0112, "step": 5583 }, { "epoch": 2.54049135577798, "grad_norm": 0.5261592758190419, "learning_rate": 4.87423521524548e-06, "loss": 0.0144, "step": 5584 }, { "epoch": 2.540946314831665, "grad_norm": 0.37226086362158617, "learning_rate": 4.8728063765914446e-06, "loss": 0.0095, "step": 5585 }, { "epoch": 2.5414012738853504, "grad_norm": 0.21794305516565435, "learning_rate": 4.871377548331099e-06, "loss": 0.0044, "step": 5586 }, { "epoch": 2.5418562329390353, "grad_norm": 0.5972324079160436, "learning_rate": 4.869948730581198e-06, "loss": 0.0133, "step": 5587 }, { "epoch": 2.5423111919927206, "grad_norm": 0.3574329092640865, "learning_rate": 4.8685199234585e-06, "loss": 0.013, "step": 5588 }, { "epoch": 2.542766151046406, "grad_norm": 0.39022968181721507, "learning_rate": 4.867091127079759e-06, "loss": 0.0097, "step": 5589 }, { "epoch": 2.5432211101000908, "grad_norm": 0.32666939246421944, "learning_rate": 4.865662341561733e-06, "loss": 0.0098, "step": 5590 }, { "epoch": 2.543676069153776, "grad_norm": 0.4250077147805536, "learning_rate": 4.864233567021171e-06, "loss": 0.0113, "step": 5591 }, { "epoch": 2.5441310282074614, "grad_norm": 0.41558963533530774, "learning_rate": 4.8628048035748304e-06, "loss": 0.011, "step": 5592 }, { "epoch": 2.5445859872611463, "grad_norm": 0.41087461827146887, "learning_rate": 4.861376051339462e-06, "loss": 0.0159, "step": 5593 }, { "epoch": 2.5450409463148316, "grad_norm": 0.3193621099240802, "learning_rate": 4.859947310431816e-06, "loss": 0.006, "step": 5594 }, { "epoch": 2.545495905368517, "grad_norm": 0.35894260549027757, "learning_rate": 4.858518580968644e-06, "loss": 0.0084, "step": 5595 }, { "epoch": 2.5459508644222018, "grad_norm": 0.3013580851069753, "learning_rate": 4.857089863066694e-06, "loss": 0.0118, "step": 5596 }, { "epoch": 2.546405823475887, "grad_norm": 0.4268250695733099, "learning_rate": 4.8556611568427165e-06, "loss": 0.0174, "step": 5597 }, { "epoch": 2.5468607825295724, "grad_norm": 0.4394317936188433, "learning_rate": 4.854232462413455e-06, "loss": 0.0138, "step": 5598 }, { "epoch": 2.5473157415832572, "grad_norm": 0.40675101044833256, "learning_rate": 4.852803779895658e-06, "loss": 0.0168, "step": 5599 }, { "epoch": 2.5477707006369426, "grad_norm": 0.2736244357234857, "learning_rate": 4.8513751094060744e-06, "loss": 0.0076, "step": 5600 }, { "epoch": 2.548225659690628, "grad_norm": 0.3242077509300469, "learning_rate": 4.849946451061444e-06, "loss": 0.0079, "step": 5601 }, { "epoch": 2.548680618744313, "grad_norm": 0.45386624160854155, "learning_rate": 4.848517804978513e-06, "loss": 0.0159, "step": 5602 }, { "epoch": 2.549135577797998, "grad_norm": 0.34118928000231935, "learning_rate": 4.847089171274022e-06, "loss": 0.0161, "step": 5603 }, { "epoch": 2.5495905368516834, "grad_norm": 0.2614801479276341, "learning_rate": 4.845660550064714e-06, "loss": 0.0053, "step": 5604 }, { "epoch": 2.5500454959053687, "grad_norm": 0.4315174445267716, "learning_rate": 4.8442319414673266e-06, "loss": 0.0154, "step": 5605 }, { "epoch": 2.5505004549590535, "grad_norm": 0.7845355806911353, "learning_rate": 4.842803345598604e-06, "loss": 0.0259, "step": 5606 }, { "epoch": 2.550955414012739, "grad_norm": 0.33333613207861285, "learning_rate": 4.841374762575281e-06, "loss": 0.0093, "step": 5607 }, { "epoch": 2.551410373066424, "grad_norm": 0.33005978750881937, "learning_rate": 4.839946192514096e-06, "loss": 0.0104, "step": 5608 }, { "epoch": 2.5518653321201095, "grad_norm": 0.6088233555043218, "learning_rate": 4.838517635531787e-06, "loss": 0.0256, "step": 5609 }, { "epoch": 2.5523202911737943, "grad_norm": 0.3382526567827965, "learning_rate": 4.837089091745086e-06, "loss": 0.0086, "step": 5610 }, { "epoch": 2.5527752502274796, "grad_norm": 0.31057908536556494, "learning_rate": 4.835660561270729e-06, "loss": 0.0077, "step": 5611 }, { "epoch": 2.553230209281165, "grad_norm": 0.551724445181761, "learning_rate": 4.834232044225447e-06, "loss": 0.0227, "step": 5612 }, { "epoch": 2.55368516833485, "grad_norm": 0.29200280313332505, "learning_rate": 4.832803540725977e-06, "loss": 0.0111, "step": 5613 }, { "epoch": 2.554140127388535, "grad_norm": 0.47928702896577746, "learning_rate": 4.831375050889045e-06, "loss": 0.0143, "step": 5614 }, { "epoch": 2.5545950864422204, "grad_norm": 0.2709973614498435, "learning_rate": 4.829946574831383e-06, "loss": 0.0096, "step": 5615 }, { "epoch": 2.5550500454959053, "grad_norm": 0.41319674005519985, "learning_rate": 4.828518112669718e-06, "loss": 0.0143, "step": 5616 }, { "epoch": 2.5555050045495906, "grad_norm": 0.4179304240853926, "learning_rate": 4.827089664520779e-06, "loss": 0.0137, "step": 5617 }, { "epoch": 2.555959963603276, "grad_norm": 0.6009269695179169, "learning_rate": 4.8256612305012915e-06, "loss": 0.0283, "step": 5618 }, { "epoch": 2.556414922656961, "grad_norm": 0.5525798106238338, "learning_rate": 4.8242328107279805e-06, "loss": 0.0157, "step": 5619 }, { "epoch": 2.556869881710646, "grad_norm": 0.3009813287525514, "learning_rate": 4.822804405317571e-06, "loss": 0.0065, "step": 5620 }, { "epoch": 2.5573248407643314, "grad_norm": 0.5024514195761675, "learning_rate": 4.821376014386785e-06, "loss": 0.0141, "step": 5621 }, { "epoch": 2.5577797998180163, "grad_norm": 0.38596782208931674, "learning_rate": 4.819947638052345e-06, "loss": 0.0121, "step": 5622 }, { "epoch": 2.5582347588717016, "grad_norm": 0.4591430541180311, "learning_rate": 4.818519276430971e-06, "loss": 0.0102, "step": 5623 }, { "epoch": 2.558689717925387, "grad_norm": 0.3284450771135685, "learning_rate": 4.8170909296393824e-06, "loss": 0.006, "step": 5624 }, { "epoch": 2.5591446769790718, "grad_norm": 0.4315696126470542, "learning_rate": 4.815662597794296e-06, "loss": 0.0129, "step": 5625 }, { "epoch": 2.559599636032757, "grad_norm": 0.2984890660335619, "learning_rate": 4.814234281012429e-06, "loss": 0.0091, "step": 5626 }, { "epoch": 2.5600545950864424, "grad_norm": 0.28799942970226056, "learning_rate": 4.812805979410499e-06, "loss": 0.0097, "step": 5627 }, { "epoch": 2.5605095541401273, "grad_norm": 0.5168975637701665, "learning_rate": 4.811377693105219e-06, "loss": 0.0211, "step": 5628 }, { "epoch": 2.5609645131938126, "grad_norm": 0.4424202098069847, "learning_rate": 4.809949422213303e-06, "loss": 0.0091, "step": 5629 }, { "epoch": 2.561419472247498, "grad_norm": 0.5069754961694984, "learning_rate": 4.80852116685146e-06, "loss": 0.0175, "step": 5630 }, { "epoch": 2.5618744313011828, "grad_norm": 0.5715217237686248, "learning_rate": 4.807092927136404e-06, "loss": 0.0263, "step": 5631 }, { "epoch": 2.562329390354868, "grad_norm": 0.3962821391351908, "learning_rate": 4.805664703184842e-06, "loss": 0.0125, "step": 5632 }, { "epoch": 2.5627843494085534, "grad_norm": 0.4456271554372272, "learning_rate": 4.804236495113481e-06, "loss": 0.0136, "step": 5633 }, { "epoch": 2.5632393084622382, "grad_norm": 0.3920443337679276, "learning_rate": 4.802808303039032e-06, "loss": 0.0163, "step": 5634 }, { "epoch": 2.5636942675159236, "grad_norm": 0.20440097078867706, "learning_rate": 4.801380127078198e-06, "loss": 0.0042, "step": 5635 }, { "epoch": 2.564149226569609, "grad_norm": 0.30323678238048796, "learning_rate": 4.799951967347683e-06, "loss": 0.0087, "step": 5636 }, { "epoch": 2.5646041856232937, "grad_norm": 0.4519864852017533, "learning_rate": 4.798523823964189e-06, "loss": 0.0155, "step": 5637 }, { "epoch": 2.565059144676979, "grad_norm": 0.45497642018573037, "learning_rate": 4.79709569704442e-06, "loss": 0.0112, "step": 5638 }, { "epoch": 2.5655141037306644, "grad_norm": 0.22147517641216033, "learning_rate": 4.795667586705073e-06, "loss": 0.0062, "step": 5639 }, { "epoch": 2.565969062784349, "grad_norm": 0.14301943004382262, "learning_rate": 4.794239493062846e-06, "loss": 0.0029, "step": 5640 }, { "epoch": 2.5664240218380345, "grad_norm": 0.38343344825948156, "learning_rate": 4.792811416234441e-06, "loss": 0.011, "step": 5641 }, { "epoch": 2.56687898089172, "grad_norm": 0.3722181884774487, "learning_rate": 4.791383356336552e-06, "loss": 0.0158, "step": 5642 }, { "epoch": 2.5673339399454047, "grad_norm": 0.26893801764638015, "learning_rate": 4.7899553134858715e-06, "loss": 0.0075, "step": 5643 }, { "epoch": 2.56778889899909, "grad_norm": 0.2892027359172782, "learning_rate": 4.7885272877990955e-06, "loss": 0.0069, "step": 5644 }, { "epoch": 2.5682438580527753, "grad_norm": 0.44760714500350546, "learning_rate": 4.787099279392913e-06, "loss": 0.0201, "step": 5645 }, { "epoch": 2.56869881710646, "grad_norm": 0.4511090540698227, "learning_rate": 4.7856712883840174e-06, "loss": 0.0074, "step": 5646 }, { "epoch": 2.5691537761601455, "grad_norm": 0.42953413647564953, "learning_rate": 4.784243314889094e-06, "loss": 0.0263, "step": 5647 }, { "epoch": 2.569608735213831, "grad_norm": 0.28324021307789976, "learning_rate": 4.782815359024834e-06, "loss": 0.0056, "step": 5648 }, { "epoch": 2.5700636942675157, "grad_norm": 0.3945037003097103, "learning_rate": 4.781387420907922e-06, "loss": 0.0175, "step": 5649 }, { "epoch": 2.570518653321201, "grad_norm": 0.4455129862590413, "learning_rate": 4.779959500655043e-06, "loss": 0.0146, "step": 5650 }, { "epoch": 2.5709736123748863, "grad_norm": 0.3662410009937011, "learning_rate": 4.77853159838288e-06, "loss": 0.0083, "step": 5651 }, { "epoch": 2.571428571428571, "grad_norm": 0.38796879524415695, "learning_rate": 4.777103714208114e-06, "loss": 0.0088, "step": 5652 }, { "epoch": 2.5718835304822565, "grad_norm": 0.46067658983642434, "learning_rate": 4.775675848247427e-06, "loss": 0.0245, "step": 5653 }, { "epoch": 2.572338489535942, "grad_norm": 0.3793292965702815, "learning_rate": 4.774248000617494e-06, "loss": 0.014, "step": 5654 }, { "epoch": 2.5727934485896267, "grad_norm": 0.5589395168221093, "learning_rate": 4.772820171434997e-06, "loss": 0.0218, "step": 5655 }, { "epoch": 2.573248407643312, "grad_norm": 0.35660954216467167, "learning_rate": 4.77139236081661e-06, "loss": 0.0085, "step": 5656 }, { "epoch": 2.5737033666969973, "grad_norm": 0.37115423687658833, "learning_rate": 4.769964568879006e-06, "loss": 0.017, "step": 5657 }, { "epoch": 2.5741583257506826, "grad_norm": 0.5059021892593047, "learning_rate": 4.768536795738859e-06, "loss": 0.0161, "step": 5658 }, { "epoch": 2.5746132848043675, "grad_norm": 0.39404637850315527, "learning_rate": 4.76710904151284e-06, "loss": 0.0169, "step": 5659 }, { "epoch": 2.5750682438580528, "grad_norm": 0.3251630101652949, "learning_rate": 4.765681306317618e-06, "loss": 0.0087, "step": 5660 }, { "epoch": 2.575523202911738, "grad_norm": 0.3729261098704293, "learning_rate": 4.764253590269861e-06, "loss": 0.0107, "step": 5661 }, { "epoch": 2.575978161965423, "grad_norm": 0.5563546315346025, "learning_rate": 4.762825893486236e-06, "loss": 0.0251, "step": 5662 }, { "epoch": 2.5764331210191083, "grad_norm": 0.46011067744887346, "learning_rate": 4.761398216083409e-06, "loss": 0.0175, "step": 5663 }, { "epoch": 2.5768880800727936, "grad_norm": 0.5243980347635864, "learning_rate": 4.7599705581780415e-06, "loss": 0.0221, "step": 5664 }, { "epoch": 2.577343039126479, "grad_norm": 0.22153214791109685, "learning_rate": 4.7585429198867975e-06, "loss": 0.005, "step": 5665 }, { "epoch": 2.5777979981801638, "grad_norm": 0.3470983000687461, "learning_rate": 4.757115301326334e-06, "loss": 0.0083, "step": 5666 }, { "epoch": 2.578252957233849, "grad_norm": 0.46582878099904523, "learning_rate": 4.755687702613312e-06, "loss": 0.0114, "step": 5667 }, { "epoch": 2.5787079162875344, "grad_norm": 0.3366141366194447, "learning_rate": 4.754260123864386e-06, "loss": 0.0081, "step": 5668 }, { "epoch": 2.5791628753412192, "grad_norm": 0.27418210374134405, "learning_rate": 4.752832565196213e-06, "loss": 0.009, "step": 5669 }, { "epoch": 2.5796178343949046, "grad_norm": 0.4096441211837446, "learning_rate": 4.751405026725449e-06, "loss": 0.0157, "step": 5670 }, { "epoch": 2.58007279344859, "grad_norm": 0.2989812908078907, "learning_rate": 4.749977508568742e-06, "loss": 0.0069, "step": 5671 }, { "epoch": 2.5805277525022747, "grad_norm": 0.3161858485023435, "learning_rate": 4.7485500108427456e-06, "loss": 0.0125, "step": 5672 }, { "epoch": 2.58098271155596, "grad_norm": 0.41529380289389317, "learning_rate": 4.7471225336641045e-06, "loss": 0.0126, "step": 5673 }, { "epoch": 2.5814376706096454, "grad_norm": 0.3891313276093038, "learning_rate": 4.74569507714947e-06, "loss": 0.0091, "step": 5674 }, { "epoch": 2.58189262966333, "grad_norm": 0.36763083573086686, "learning_rate": 4.744267641415483e-06, "loss": 0.0112, "step": 5675 }, { "epoch": 2.5823475887170155, "grad_norm": 0.26062760370540944, "learning_rate": 4.74284022657879e-06, "loss": 0.0095, "step": 5676 }, { "epoch": 2.582802547770701, "grad_norm": 0.4445073634493149, "learning_rate": 4.741412832756034e-06, "loss": 0.0198, "step": 5677 }, { "epoch": 2.5832575068243857, "grad_norm": 0.2525167791360124, "learning_rate": 4.7399854600638524e-06, "loss": 0.0098, "step": 5678 }, { "epoch": 2.583712465878071, "grad_norm": 0.38463953964064057, "learning_rate": 4.738558108618885e-06, "loss": 0.0153, "step": 5679 }, { "epoch": 2.5841674249317563, "grad_norm": 0.40068511122357836, "learning_rate": 4.737130778537769e-06, "loss": 0.0166, "step": 5680 }, { "epoch": 2.584622383985441, "grad_norm": 0.5621317411510426, "learning_rate": 4.735703469937138e-06, "loss": 0.0168, "step": 5681 }, { "epoch": 2.5850773430391265, "grad_norm": 0.458585899105421, "learning_rate": 4.734276182933624e-06, "loss": 0.0235, "step": 5682 }, { "epoch": 2.585532302092812, "grad_norm": 0.23298344983466068, "learning_rate": 4.732848917643863e-06, "loss": 0.0042, "step": 5683 }, { "epoch": 2.5859872611464967, "grad_norm": 0.4890985050410502, "learning_rate": 4.731421674184481e-06, "loss": 0.0163, "step": 5684 }, { "epoch": 2.586442220200182, "grad_norm": 0.3166420086538857, "learning_rate": 4.729994452672108e-06, "loss": 0.0088, "step": 5685 }, { "epoch": 2.5868971792538673, "grad_norm": 0.431078529504673, "learning_rate": 4.728567253223367e-06, "loss": 0.0201, "step": 5686 }, { "epoch": 2.587352138307552, "grad_norm": 0.5884834347804827, "learning_rate": 4.727140075954887e-06, "loss": 0.0236, "step": 5687 }, { "epoch": 2.5878070973612375, "grad_norm": 0.41660555887864803, "learning_rate": 4.725712920983286e-06, "loss": 0.0193, "step": 5688 }, { "epoch": 2.588262056414923, "grad_norm": 0.5891883848732339, "learning_rate": 4.724285788425185e-06, "loss": 0.0096, "step": 5689 }, { "epoch": 2.5887170154686077, "grad_norm": 0.3155735538669077, "learning_rate": 4.722858678397206e-06, "loss": 0.0084, "step": 5690 }, { "epoch": 2.589171974522293, "grad_norm": 0.3079750584560806, "learning_rate": 4.721431591015963e-06, "loss": 0.0061, "step": 5691 }, { "epoch": 2.5896269335759783, "grad_norm": 0.37091097569234943, "learning_rate": 4.720004526398075e-06, "loss": 0.015, "step": 5692 }, { "epoch": 2.590081892629663, "grad_norm": 0.4926220454682441, "learning_rate": 4.7185774846601505e-06, "loss": 0.0085, "step": 5693 }, { "epoch": 2.5905368516833485, "grad_norm": 0.5546824530334568, "learning_rate": 4.717150465918805e-06, "loss": 0.016, "step": 5694 }, { "epoch": 2.5909918107370338, "grad_norm": 0.4540780593471903, "learning_rate": 4.715723470290644e-06, "loss": 0.0153, "step": 5695 }, { "epoch": 2.5914467697907186, "grad_norm": 0.3553708020812148, "learning_rate": 4.714296497892277e-06, "loss": 0.009, "step": 5696 }, { "epoch": 2.591901728844404, "grad_norm": 0.39148283435969855, "learning_rate": 4.712869548840311e-06, "loss": 0.0109, "step": 5697 }, { "epoch": 2.5923566878980893, "grad_norm": 0.43669578541265186, "learning_rate": 4.711442623251349e-06, "loss": 0.016, "step": 5698 }, { "epoch": 2.592811646951774, "grad_norm": 0.27027201782885235, "learning_rate": 4.710015721241993e-06, "loss": 0.0081, "step": 5699 }, { "epoch": 2.5932666060054594, "grad_norm": 0.4110807726380925, "learning_rate": 4.708588842928842e-06, "loss": 0.0162, "step": 5700 }, { "epoch": 2.5937215650591448, "grad_norm": 0.35656479827200116, "learning_rate": 4.7071619884284955e-06, "loss": 0.0103, "step": 5701 }, { "epoch": 2.5941765241128296, "grad_norm": 0.305326393352088, "learning_rate": 4.705735157857548e-06, "loss": 0.0112, "step": 5702 }, { "epoch": 2.594631483166515, "grad_norm": 0.2792248986192964, "learning_rate": 4.704308351332593e-06, "loss": 0.01, "step": 5703 }, { "epoch": 2.5950864422202002, "grad_norm": 0.621058188142095, "learning_rate": 4.702881568970227e-06, "loss": 0.0167, "step": 5704 }, { "epoch": 2.595541401273885, "grad_norm": 0.3892333521186203, "learning_rate": 4.701454810887036e-06, "loss": 0.0114, "step": 5705 }, { "epoch": 2.5959963603275704, "grad_norm": 0.35186659910320217, "learning_rate": 4.70002807719961e-06, "loss": 0.0069, "step": 5706 }, { "epoch": 2.5964513193812557, "grad_norm": 0.40108911457025237, "learning_rate": 4.698601368024535e-06, "loss": 0.0172, "step": 5707 }, { "epoch": 2.5969062784349406, "grad_norm": 0.42373634603180493, "learning_rate": 4.697174683478396e-06, "loss": 0.0157, "step": 5708 }, { "epoch": 2.597361237488626, "grad_norm": 0.6394868852008742, "learning_rate": 4.695748023677773e-06, "loss": 0.0231, "step": 5709 }, { "epoch": 2.597816196542311, "grad_norm": 0.5512344234887848, "learning_rate": 4.6943213887392465e-06, "loss": 0.0159, "step": 5710 }, { "epoch": 2.598271155595996, "grad_norm": 0.1773299977692886, "learning_rate": 4.692894778779398e-06, "loss": 0.0044, "step": 5711 }, { "epoch": 2.5987261146496814, "grad_norm": 0.46249397568887457, "learning_rate": 4.691468193914799e-06, "loss": 0.0172, "step": 5712 }, { "epoch": 2.5991810737033667, "grad_norm": 0.533014196533797, "learning_rate": 4.690041634262028e-06, "loss": 0.0167, "step": 5713 }, { "epoch": 2.599636032757052, "grad_norm": 0.412996319965304, "learning_rate": 4.6886150999376554e-06, "loss": 0.0093, "step": 5714 }, { "epoch": 2.600090991810737, "grad_norm": 0.3674583428485953, "learning_rate": 4.687188591058248e-06, "loss": 0.012, "step": 5715 }, { "epoch": 2.600545950864422, "grad_norm": 0.5397087960063365, "learning_rate": 4.68576210774038e-06, "loss": 0.0294, "step": 5716 }, { "epoch": 2.6010009099181075, "grad_norm": 0.3718017390186533, "learning_rate": 4.684335650100609e-06, "loss": 0.0122, "step": 5717 }, { "epoch": 2.6014558689717924, "grad_norm": 0.5260313811778944, "learning_rate": 4.682909218255505e-06, "loss": 0.0187, "step": 5718 }, { "epoch": 2.6019108280254777, "grad_norm": 0.3005424847198216, "learning_rate": 4.6814828123216285e-06, "loss": 0.008, "step": 5719 }, { "epoch": 2.602365787079163, "grad_norm": 0.33107833266033543, "learning_rate": 4.680056432415536e-06, "loss": 0.0088, "step": 5720 }, { "epoch": 2.6028207461328483, "grad_norm": 0.2958430539835329, "learning_rate": 4.6786300786537905e-06, "loss": 0.0073, "step": 5721 }, { "epoch": 2.603275705186533, "grad_norm": 0.31657097622558167, "learning_rate": 4.677203751152941e-06, "loss": 0.0118, "step": 5722 }, { "epoch": 2.6037306642402185, "grad_norm": 0.4079268321662643, "learning_rate": 4.675777450029545e-06, "loss": 0.0165, "step": 5723 }, { "epoch": 2.604185623293904, "grad_norm": 0.33480528730370973, "learning_rate": 4.6743511754001495e-06, "loss": 0.0083, "step": 5724 }, { "epoch": 2.6046405823475887, "grad_norm": 0.29683596241044735, "learning_rate": 4.6729249273813055e-06, "loss": 0.0075, "step": 5725 }, { "epoch": 2.605095541401274, "grad_norm": 0.4032644218320927, "learning_rate": 4.671498706089561e-06, "loss": 0.0153, "step": 5726 }, { "epoch": 2.6055505004549593, "grad_norm": 0.4074746712570187, "learning_rate": 4.670072511641458e-06, "loss": 0.0131, "step": 5727 }, { "epoch": 2.606005459508644, "grad_norm": 0.3692897583349627, "learning_rate": 4.66864634415354e-06, "loss": 0.0138, "step": 5728 }, { "epoch": 2.6064604185623295, "grad_norm": 0.30749957189884386, "learning_rate": 4.667220203742345e-06, "loss": 0.0065, "step": 5729 }, { "epoch": 2.6069153776160148, "grad_norm": 0.30623565693679805, "learning_rate": 4.665794090524414e-06, "loss": 0.0124, "step": 5730 }, { "epoch": 2.6073703366696996, "grad_norm": 0.500005036560983, "learning_rate": 4.6643680046162785e-06, "loss": 0.0205, "step": 5731 }, { "epoch": 2.607825295723385, "grad_norm": 0.4034681368915988, "learning_rate": 4.662941946134474e-06, "loss": 0.0151, "step": 5732 }, { "epoch": 2.6082802547770703, "grad_norm": 0.43530520324662214, "learning_rate": 4.661515915195534e-06, "loss": 0.0165, "step": 5733 }, { "epoch": 2.608735213830755, "grad_norm": 0.509796889816632, "learning_rate": 4.660089911915983e-06, "loss": 0.0162, "step": 5734 }, { "epoch": 2.6091901728844404, "grad_norm": 0.3612159836198824, "learning_rate": 4.6586639364123505e-06, "loss": 0.0122, "step": 5735 }, { "epoch": 2.6096451319381258, "grad_norm": 0.36888760672195897, "learning_rate": 4.657237988801159e-06, "loss": 0.0083, "step": 5736 }, { "epoch": 2.6101000909918106, "grad_norm": 0.31423597167812467, "learning_rate": 4.655812069198932e-06, "loss": 0.0081, "step": 5737 }, { "epoch": 2.610555050045496, "grad_norm": 0.3897289635639756, "learning_rate": 4.654386177722185e-06, "loss": 0.0131, "step": 5738 }, { "epoch": 2.6110100090991812, "grad_norm": 0.2048674693935203, "learning_rate": 4.652960314487441e-06, "loss": 0.0054, "step": 5739 }, { "epoch": 2.611464968152866, "grad_norm": 0.5026091275956444, "learning_rate": 4.651534479611214e-06, "loss": 0.0135, "step": 5740 }, { "epoch": 2.6119199272065514, "grad_norm": 0.3423561108381752, "learning_rate": 4.650108673210014e-06, "loss": 0.0102, "step": 5741 }, { "epoch": 2.6123748862602367, "grad_norm": 0.3257624307460199, "learning_rate": 4.648682895400356e-06, "loss": 0.0079, "step": 5742 }, { "epoch": 2.6128298453139216, "grad_norm": 0.5221096495513938, "learning_rate": 4.647257146298742e-06, "loss": 0.0123, "step": 5743 }, { "epoch": 2.613284804367607, "grad_norm": 1.364059336614433, "learning_rate": 4.645831426021684e-06, "loss": 0.0303, "step": 5744 }, { "epoch": 2.613739763421292, "grad_norm": 0.551240674369172, "learning_rate": 4.644405734685679e-06, "loss": 0.0273, "step": 5745 }, { "epoch": 2.614194722474977, "grad_norm": 0.38420589237028663, "learning_rate": 4.642980072407233e-06, "loss": 0.0104, "step": 5746 }, { "epoch": 2.6146496815286624, "grad_norm": 0.5384659442716453, "learning_rate": 4.641554439302843e-06, "loss": 0.0143, "step": 5747 }, { "epoch": 2.6151046405823477, "grad_norm": 0.3876530722237191, "learning_rate": 4.640128835489007e-06, "loss": 0.0119, "step": 5748 }, { "epoch": 2.6155595996360326, "grad_norm": 0.3476457709085455, "learning_rate": 4.6387032610822164e-06, "loss": 0.011, "step": 5749 }, { "epoch": 2.616014558689718, "grad_norm": 0.26694074076048757, "learning_rate": 4.637277716198964e-06, "loss": 0.0091, "step": 5750 }, { "epoch": 2.616469517743403, "grad_norm": 0.4197615362874177, "learning_rate": 4.635852200955738e-06, "loss": 0.0176, "step": 5751 }, { "epoch": 2.616924476797088, "grad_norm": 0.48739515766664343, "learning_rate": 4.634426715469024e-06, "loss": 0.022, "step": 5752 }, { "epoch": 2.6173794358507734, "grad_norm": 0.26925881956883646, "learning_rate": 4.633001259855311e-06, "loss": 0.0066, "step": 5753 }, { "epoch": 2.6178343949044587, "grad_norm": 0.3718053136267044, "learning_rate": 4.631575834231076e-06, "loss": 0.0142, "step": 5754 }, { "epoch": 2.6182893539581436, "grad_norm": 0.35008894308345395, "learning_rate": 4.630150438712801e-06, "loss": 0.0137, "step": 5755 }, { "epoch": 2.618744313011829, "grad_norm": 0.2634624680974572, "learning_rate": 4.6287250734169605e-06, "loss": 0.0088, "step": 5756 }, { "epoch": 2.619199272065514, "grad_norm": 0.4188095871063324, "learning_rate": 4.627299738460032e-06, "loss": 0.0133, "step": 5757 }, { "epoch": 2.619654231119199, "grad_norm": 0.37122477130171483, "learning_rate": 4.6258744339584855e-06, "loss": 0.0135, "step": 5758 }, { "epoch": 2.6201091901728844, "grad_norm": 0.41017280481299545, "learning_rate": 4.624449160028789e-06, "loss": 0.011, "step": 5759 }, { "epoch": 2.6205641492265697, "grad_norm": 0.28790189607168515, "learning_rate": 4.623023916787412e-06, "loss": 0.007, "step": 5760 }, { "epoch": 2.6210191082802545, "grad_norm": 0.8095340989341101, "learning_rate": 4.6215987043508185e-06, "loss": 0.0188, "step": 5761 }, { "epoch": 2.62147406733394, "grad_norm": 0.38160155583041777, "learning_rate": 4.620173522835471e-06, "loss": 0.0113, "step": 5762 }, { "epoch": 2.621929026387625, "grad_norm": 0.3070713675822249, "learning_rate": 4.618748372357827e-06, "loss": 0.0076, "step": 5763 }, { "epoch": 2.62238398544131, "grad_norm": 0.34069908038311614, "learning_rate": 4.617323253034345e-06, "loss": 0.0098, "step": 5764 }, { "epoch": 2.6228389444949953, "grad_norm": 0.3435062113384841, "learning_rate": 4.615898164981477e-06, "loss": 0.0111, "step": 5765 }, { "epoch": 2.6232939035486806, "grad_norm": 0.37577842389294913, "learning_rate": 4.614473108315676e-06, "loss": 0.015, "step": 5766 }, { "epoch": 2.623748862602366, "grad_norm": 0.42903703048564007, "learning_rate": 4.613048083153393e-06, "loss": 0.0164, "step": 5767 }, { "epoch": 2.624203821656051, "grad_norm": 0.5280151778638612, "learning_rate": 4.611623089611073e-06, "loss": 0.0141, "step": 5768 }, { "epoch": 2.624658780709736, "grad_norm": 0.2636320380579304, "learning_rate": 4.610198127805161e-06, "loss": 0.0067, "step": 5769 }, { "epoch": 2.6251137397634214, "grad_norm": 0.3750678231721881, "learning_rate": 4.608773197852096e-06, "loss": 0.0124, "step": 5770 }, { "epoch": 2.6255686988171063, "grad_norm": 0.4404046939348186, "learning_rate": 4.607348299868321e-06, "loss": 0.0176, "step": 5771 }, { "epoch": 2.6260236578707916, "grad_norm": 0.5118635401841247, "learning_rate": 4.605923433970268e-06, "loss": 0.0186, "step": 5772 }, { "epoch": 2.626478616924477, "grad_norm": 0.5571878826845478, "learning_rate": 4.604498600274371e-06, "loss": 0.0211, "step": 5773 }, { "epoch": 2.6269335759781622, "grad_norm": 0.22682777512616378, "learning_rate": 4.603073798897064e-06, "loss": 0.0059, "step": 5774 }, { "epoch": 2.627388535031847, "grad_norm": 0.24317710327264414, "learning_rate": 4.601649029954773e-06, "loss": 0.006, "step": 5775 }, { "epoch": 2.6278434940855324, "grad_norm": 0.41648038322407926, "learning_rate": 4.600224293563926e-06, "loss": 0.0092, "step": 5776 }, { "epoch": 2.6282984531392177, "grad_norm": 0.5273847358730055, "learning_rate": 4.598799589840943e-06, "loss": 0.0092, "step": 5777 }, { "epoch": 2.6287534121929026, "grad_norm": 0.32386046473781877, "learning_rate": 4.597374918902247e-06, "loss": 0.0113, "step": 5778 }, { "epoch": 2.629208371246588, "grad_norm": 0.2967163856636597, "learning_rate": 4.595950280864252e-06, "loss": 0.0081, "step": 5779 }, { "epoch": 2.629663330300273, "grad_norm": 0.3618730617503731, "learning_rate": 4.594525675843375e-06, "loss": 0.0088, "step": 5780 }, { "epoch": 2.630118289353958, "grad_norm": 0.3308158172891041, "learning_rate": 4.593101103956031e-06, "loss": 0.0063, "step": 5781 }, { "epoch": 2.6305732484076434, "grad_norm": 0.2711505150736749, "learning_rate": 4.591676565318626e-06, "loss": 0.0081, "step": 5782 }, { "epoch": 2.6310282074613287, "grad_norm": 0.5925624878565041, "learning_rate": 4.5902520600475694e-06, "loss": 0.0168, "step": 5783 }, { "epoch": 2.6314831665150136, "grad_norm": 0.5513124587083298, "learning_rate": 4.588827588259265e-06, "loss": 0.0178, "step": 5784 }, { "epoch": 2.631938125568699, "grad_norm": 0.2665248519466739, "learning_rate": 4.587403150070111e-06, "loss": 0.007, "step": 5785 }, { "epoch": 2.632393084622384, "grad_norm": 0.4397339921938562, "learning_rate": 4.5859787455965095e-06, "loss": 0.0133, "step": 5786 }, { "epoch": 2.632848043676069, "grad_norm": 0.2800285530528093, "learning_rate": 4.584554374954853e-06, "loss": 0.0052, "step": 5787 }, { "epoch": 2.6333030027297544, "grad_norm": 0.3129782224235985, "learning_rate": 4.583130038261538e-06, "loss": 0.0111, "step": 5788 }, { "epoch": 2.6337579617834397, "grad_norm": 0.36646264066442885, "learning_rate": 4.5817057356329545e-06, "loss": 0.0121, "step": 5789 }, { "epoch": 2.6342129208371245, "grad_norm": 0.30701057653401975, "learning_rate": 4.580281467185488e-06, "loss": 0.0084, "step": 5790 }, { "epoch": 2.63466787989081, "grad_norm": 0.40444746509468976, "learning_rate": 4.578857233035527e-06, "loss": 0.0082, "step": 5791 }, { "epoch": 2.635122838944495, "grad_norm": 0.385256018837849, "learning_rate": 4.5774330332994485e-06, "loss": 0.0114, "step": 5792 }, { "epoch": 2.63557779799818, "grad_norm": 0.2883972426981428, "learning_rate": 4.576008868093636e-06, "loss": 0.0073, "step": 5793 }, { "epoch": 2.6360327570518653, "grad_norm": 0.3895013306574242, "learning_rate": 4.574584737534462e-06, "loss": 0.0143, "step": 5794 }, { "epoch": 2.6364877161055507, "grad_norm": 0.2324144680387847, "learning_rate": 4.573160641738303e-06, "loss": 0.0078, "step": 5795 }, { "epoch": 2.6369426751592355, "grad_norm": 0.45599244569563824, "learning_rate": 4.57173658082153e-06, "loss": 0.0128, "step": 5796 }, { "epoch": 2.637397634212921, "grad_norm": 0.5501655745067856, "learning_rate": 4.570312554900508e-06, "loss": 0.0258, "step": 5797 }, { "epoch": 2.637852593266606, "grad_norm": 0.34996166563248526, "learning_rate": 4.568888564091606e-06, "loss": 0.014, "step": 5798 }, { "epoch": 2.638307552320291, "grad_norm": 0.38384660960629524, "learning_rate": 4.567464608511181e-06, "loss": 0.0174, "step": 5799 }, { "epoch": 2.6387625113739763, "grad_norm": 0.4623207748093834, "learning_rate": 4.566040688275597e-06, "loss": 0.0276, "step": 5800 }, { "epoch": 2.6392174704276616, "grad_norm": 0.5223956569811683, "learning_rate": 4.564616803501205e-06, "loss": 0.0264, "step": 5801 }, { "epoch": 2.6396724294813465, "grad_norm": 0.4449564284543771, "learning_rate": 4.563192954304364e-06, "loss": 0.016, "step": 5802 }, { "epoch": 2.640127388535032, "grad_norm": 0.36251810804579676, "learning_rate": 4.561769140801422e-06, "loss": 0.013, "step": 5803 }, { "epoch": 2.640582347588717, "grad_norm": 0.33855626891813634, "learning_rate": 4.5603453631087265e-06, "loss": 0.0127, "step": 5804 }, { "epoch": 2.641037306642402, "grad_norm": 0.34777929454318607, "learning_rate": 4.558921621342623e-06, "loss": 0.0114, "step": 5805 }, { "epoch": 2.6414922656960873, "grad_norm": 0.41291110565820316, "learning_rate": 4.557497915619452e-06, "loss": 0.015, "step": 5806 }, { "epoch": 2.6419472247497726, "grad_norm": 0.5617402299104329, "learning_rate": 4.556074246055555e-06, "loss": 0.013, "step": 5807 }, { "epoch": 2.6424021838034575, "grad_norm": 0.2687758103178183, "learning_rate": 4.5546506127672625e-06, "loss": 0.0086, "step": 5808 }, { "epoch": 2.642857142857143, "grad_norm": 0.22091279175143466, "learning_rate": 4.553227015870912e-06, "loss": 0.0046, "step": 5809 }, { "epoch": 2.643312101910828, "grad_norm": 0.4313278689258374, "learning_rate": 4.551803455482833e-06, "loss": 0.0101, "step": 5810 }, { "epoch": 2.643767060964513, "grad_norm": 0.6047817710725403, "learning_rate": 4.550379931719351e-06, "loss": 0.0096, "step": 5811 }, { "epoch": 2.6442220200181983, "grad_norm": 0.32507370297772714, "learning_rate": 4.548956444696791e-06, "loss": 0.0083, "step": 5812 }, { "epoch": 2.6446769790718836, "grad_norm": 0.2984733283454028, "learning_rate": 4.5475329945314735e-06, "loss": 0.0076, "step": 5813 }, { "epoch": 2.6451319381255685, "grad_norm": 0.4613307935418929, "learning_rate": 4.5461095813397164e-06, "loss": 0.0185, "step": 5814 }, { "epoch": 2.6455868971792538, "grad_norm": 0.4451412694131819, "learning_rate": 4.544686205237833e-06, "loss": 0.0224, "step": 5815 }, { "epoch": 2.646041856232939, "grad_norm": 0.4145702427879785, "learning_rate": 4.543262866342138e-06, "loss": 0.0104, "step": 5816 }, { "epoch": 2.646496815286624, "grad_norm": 0.3356934525716918, "learning_rate": 4.541839564768939e-06, "loss": 0.0082, "step": 5817 }, { "epoch": 2.6469517743403093, "grad_norm": 0.439864585201127, "learning_rate": 4.540416300634541e-06, "loss": 0.0147, "step": 5818 }, { "epoch": 2.6474067333939946, "grad_norm": 0.34928665902959727, "learning_rate": 4.538993074055249e-06, "loss": 0.0123, "step": 5819 }, { "epoch": 2.6478616924476794, "grad_norm": 0.1589131959554854, "learning_rate": 4.537569885147361e-06, "loss": 0.0039, "step": 5820 }, { "epoch": 2.6483166515013647, "grad_norm": 0.5476586369600871, "learning_rate": 4.536146734027173e-06, "loss": 0.0229, "step": 5821 }, { "epoch": 2.64877161055505, "grad_norm": 0.3645269802636522, "learning_rate": 4.534723620810977e-06, "loss": 0.0148, "step": 5822 }, { "epoch": 2.6492265696087354, "grad_norm": 0.4071429204997368, "learning_rate": 4.533300545615068e-06, "loss": 0.013, "step": 5823 }, { "epoch": 2.6496815286624202, "grad_norm": 0.25993932014711674, "learning_rate": 4.53187750855573e-06, "loss": 0.0056, "step": 5824 }, { "epoch": 2.6501364877161055, "grad_norm": 0.3610959108570182, "learning_rate": 4.530454509749249e-06, "loss": 0.0108, "step": 5825 }, { "epoch": 2.650591446769791, "grad_norm": 0.3441179248113602, "learning_rate": 4.529031549311904e-06, "loss": 0.014, "step": 5826 }, { "epoch": 2.6510464058234757, "grad_norm": 0.4457131297216828, "learning_rate": 4.527608627359975e-06, "loss": 0.018, "step": 5827 }, { "epoch": 2.651501364877161, "grad_norm": 0.5060202653904013, "learning_rate": 4.526185744009735e-06, "loss": 0.0227, "step": 5828 }, { "epoch": 2.6519563239308463, "grad_norm": 0.3673643525210136, "learning_rate": 4.524762899377454e-06, "loss": 0.0126, "step": 5829 }, { "epoch": 2.6524112829845317, "grad_norm": 0.44996253774646483, "learning_rate": 4.523340093579406e-06, "loss": 0.0146, "step": 5830 }, { "epoch": 2.6528662420382165, "grad_norm": 0.2976982932735961, "learning_rate": 4.521917326731851e-06, "loss": 0.0073, "step": 5831 }, { "epoch": 2.653321201091902, "grad_norm": 0.33298774146807764, "learning_rate": 4.520494598951055e-06, "loss": 0.0088, "step": 5832 }, { "epoch": 2.653776160145587, "grad_norm": 0.46122820715296065, "learning_rate": 4.519071910353273e-06, "loss": 0.0141, "step": 5833 }, { "epoch": 2.654231119199272, "grad_norm": 0.45337578318684346, "learning_rate": 4.5176492610547645e-06, "loss": 0.0165, "step": 5834 }, { "epoch": 2.6546860782529573, "grad_norm": 0.4976446291006916, "learning_rate": 4.5162266511717786e-06, "loss": 0.0164, "step": 5835 }, { "epoch": 2.6551410373066426, "grad_norm": 0.3929025087484671, "learning_rate": 4.514804080820565e-06, "loss": 0.0125, "step": 5836 }, { "epoch": 2.6555959963603275, "grad_norm": 0.579797174178738, "learning_rate": 4.513381550117373e-06, "loss": 0.0191, "step": 5837 }, { "epoch": 2.656050955414013, "grad_norm": 0.23622730003541972, "learning_rate": 4.511959059178443e-06, "loss": 0.0038, "step": 5838 }, { "epoch": 2.656505914467698, "grad_norm": 0.4543745917329778, "learning_rate": 4.5105366081200145e-06, "loss": 0.0204, "step": 5839 }, { "epoch": 2.656960873521383, "grad_norm": 0.3278508136129625, "learning_rate": 4.509114197058324e-06, "loss": 0.0093, "step": 5840 }, { "epoch": 2.6574158325750683, "grad_norm": 0.3779936944103423, "learning_rate": 4.507691826109604e-06, "loss": 0.0187, "step": 5841 }, { "epoch": 2.6578707916287536, "grad_norm": 0.3390924411890412, "learning_rate": 4.5062694953900844e-06, "loss": 0.0106, "step": 5842 }, { "epoch": 2.6583257506824385, "grad_norm": 0.3444507329542516, "learning_rate": 4.504847205015992e-06, "loss": 0.0115, "step": 5843 }, { "epoch": 2.658780709736124, "grad_norm": 0.49310642630458146, "learning_rate": 4.5034249551035506e-06, "loss": 0.0171, "step": 5844 }, { "epoch": 2.659235668789809, "grad_norm": 0.47513025298327166, "learning_rate": 4.502002745768979e-06, "loss": 0.0171, "step": 5845 }, { "epoch": 2.659690627843494, "grad_norm": 0.3112837906022871, "learning_rate": 4.500580577128495e-06, "loss": 0.01, "step": 5846 }, { "epoch": 2.6601455868971793, "grad_norm": 0.3597869037000204, "learning_rate": 4.49915844929831e-06, "loss": 0.0124, "step": 5847 }, { "epoch": 2.6606005459508646, "grad_norm": 0.6453893116899386, "learning_rate": 4.497736362394636e-06, "loss": 0.0083, "step": 5848 }, { "epoch": 2.6610555050045495, "grad_norm": 0.3001237241602498, "learning_rate": 4.496314316533677e-06, "loss": 0.0084, "step": 5849 }, { "epoch": 2.6615104640582348, "grad_norm": 0.24020844944489492, "learning_rate": 4.494892311831635e-06, "loss": 0.0053, "step": 5850 }, { "epoch": 2.66196542311192, "grad_norm": 0.4048867054822093, "learning_rate": 4.493470348404716e-06, "loss": 0.0105, "step": 5851 }, { "epoch": 2.662420382165605, "grad_norm": 0.47597584535507365, "learning_rate": 4.492048426369111e-06, "loss": 0.0147, "step": 5852 }, { "epoch": 2.6628753412192903, "grad_norm": 0.38895201531794266, "learning_rate": 4.4906265458410155e-06, "loss": 0.0148, "step": 5853 }, { "epoch": 2.6633303002729756, "grad_norm": 0.5263749586215324, "learning_rate": 4.489204706936618e-06, "loss": 0.02, "step": 5854 }, { "epoch": 2.6637852593266604, "grad_norm": 0.4070373856353329, "learning_rate": 4.487782909772106e-06, "loss": 0.0104, "step": 5855 }, { "epoch": 2.6642402183803457, "grad_norm": 0.5081767752708876, "learning_rate": 4.486361154463662e-06, "loss": 0.0101, "step": 5856 }, { "epoch": 2.664695177434031, "grad_norm": 0.464902831281099, "learning_rate": 4.484939441127462e-06, "loss": 0.0122, "step": 5857 }, { "epoch": 2.665150136487716, "grad_norm": 0.2928487182953344, "learning_rate": 4.483517769879686e-06, "loss": 0.0074, "step": 5858 }, { "epoch": 2.6656050955414012, "grad_norm": 0.34151745015555895, "learning_rate": 4.482096140836506e-06, "loss": 0.0137, "step": 5859 }, { "epoch": 2.6660600545950865, "grad_norm": 0.41820223929717, "learning_rate": 4.48067455411409e-06, "loss": 0.0171, "step": 5860 }, { "epoch": 2.6665150136487714, "grad_norm": 0.44755507302270947, "learning_rate": 4.4792530098286055e-06, "loss": 0.011, "step": 5861 }, { "epoch": 2.6669699727024567, "grad_norm": 0.5075248732354104, "learning_rate": 4.477831508096212e-06, "loss": 0.0294, "step": 5862 }, { "epoch": 2.667424931756142, "grad_norm": 0.3649971612422481, "learning_rate": 4.476410049033071e-06, "loss": 0.018, "step": 5863 }, { "epoch": 2.667879890809827, "grad_norm": 0.6136006900786559, "learning_rate": 4.474988632755333e-06, "loss": 0.0188, "step": 5864 }, { "epoch": 2.668334849863512, "grad_norm": 0.31952285704506544, "learning_rate": 4.473567259379155e-06, "loss": 0.0098, "step": 5865 }, { "epoch": 2.6687898089171975, "grad_norm": 0.3303307149310139, "learning_rate": 4.4721459290206845e-06, "loss": 0.009, "step": 5866 }, { "epoch": 2.6692447679708824, "grad_norm": 0.48624774371727625, "learning_rate": 4.470724641796064e-06, "loss": 0.0167, "step": 5867 }, { "epoch": 2.6696997270245677, "grad_norm": 0.5994384400402291, "learning_rate": 4.469303397821436e-06, "loss": 0.0234, "step": 5868 }, { "epoch": 2.670154686078253, "grad_norm": 0.41772145877689015, "learning_rate": 4.467882197212936e-06, "loss": 0.0221, "step": 5869 }, { "epoch": 2.670609645131938, "grad_norm": 0.43795459902089234, "learning_rate": 4.466461040086703e-06, "loss": 0.0164, "step": 5870 }, { "epoch": 2.671064604185623, "grad_norm": 0.47863779803985534, "learning_rate": 4.46503992655886e-06, "loss": 0.0208, "step": 5871 }, { "epoch": 2.6715195632393085, "grad_norm": 0.5909524466084196, "learning_rate": 4.46361885674554e-06, "loss": 0.0139, "step": 5872 }, { "epoch": 2.6719745222929934, "grad_norm": 0.19048398219418494, "learning_rate": 4.462197830762867e-06, "loss": 0.0053, "step": 5873 }, { "epoch": 2.6724294813466787, "grad_norm": 0.26133628017831345, "learning_rate": 4.460776848726956e-06, "loss": 0.0083, "step": 5874 }, { "epoch": 2.672884440400364, "grad_norm": 0.41149130816008755, "learning_rate": 4.459355910753928e-06, "loss": 0.0173, "step": 5875 }, { "epoch": 2.673339399454049, "grad_norm": 0.4524287025288151, "learning_rate": 4.4579350169598926e-06, "loss": 0.0179, "step": 5876 }, { "epoch": 2.673794358507734, "grad_norm": 0.5112956824842843, "learning_rate": 4.456514167460959e-06, "loss": 0.0122, "step": 5877 }, { "epoch": 2.6742493175614195, "grad_norm": 0.25333799765495163, "learning_rate": 4.4550933623732326e-06, "loss": 0.0086, "step": 5878 }, { "epoch": 2.674704276615105, "grad_norm": 0.3001308840457026, "learning_rate": 4.4536726018128165e-06, "loss": 0.0096, "step": 5879 }, { "epoch": 2.6751592356687897, "grad_norm": 0.4096936505479323, "learning_rate": 4.45225188589581e-06, "loss": 0.0135, "step": 5880 }, { "epoch": 2.675614194722475, "grad_norm": 0.5571718231482075, "learning_rate": 4.450831214738303e-06, "loss": 0.0205, "step": 5881 }, { "epoch": 2.6760691537761603, "grad_norm": 0.3091438223005542, "learning_rate": 4.4494105884563915e-06, "loss": 0.0099, "step": 5882 }, { "epoch": 2.676524112829845, "grad_norm": 0.25766124485336955, "learning_rate": 4.447990007166159e-06, "loss": 0.0081, "step": 5883 }, { "epoch": 2.6769790718835305, "grad_norm": 0.5478957271033703, "learning_rate": 4.446569470983692e-06, "loss": 0.0141, "step": 5884 }, { "epoch": 2.6774340309372158, "grad_norm": 0.2775414909517801, "learning_rate": 4.445148980025065e-06, "loss": 0.0102, "step": 5885 }, { "epoch": 2.677888989990901, "grad_norm": 0.24580451578508553, "learning_rate": 4.443728534406359e-06, "loss": 0.0056, "step": 5886 }, { "epoch": 2.678343949044586, "grad_norm": 0.4127465647091162, "learning_rate": 4.442308134243647e-06, "loss": 0.018, "step": 5887 }, { "epoch": 2.6787989080982713, "grad_norm": 0.31431865508607826, "learning_rate": 4.440887779652995e-06, "loss": 0.0096, "step": 5888 }, { "epoch": 2.6792538671519566, "grad_norm": 0.40643493771737643, "learning_rate": 4.439467470750468e-06, "loss": 0.0113, "step": 5889 }, { "epoch": 2.6797088262056414, "grad_norm": 0.29721225550651337, "learning_rate": 4.438047207652129e-06, "loss": 0.0068, "step": 5890 }, { "epoch": 2.6801637852593267, "grad_norm": 0.3345518511805347, "learning_rate": 4.436626990474031e-06, "loss": 0.0086, "step": 5891 }, { "epoch": 2.680618744313012, "grad_norm": 0.5785048947016902, "learning_rate": 4.435206819332235e-06, "loss": 0.0174, "step": 5892 }, { "epoch": 2.681073703366697, "grad_norm": 0.2507398612469783, "learning_rate": 4.433786694342787e-06, "loss": 0.0069, "step": 5893 }, { "epoch": 2.6815286624203822, "grad_norm": 0.42109176587022173, "learning_rate": 4.432366615621731e-06, "loss": 0.0188, "step": 5894 }, { "epoch": 2.6819836214740675, "grad_norm": 0.3116410675804666, "learning_rate": 4.430946583285114e-06, "loss": 0.0069, "step": 5895 }, { "epoch": 2.6824385805277524, "grad_norm": 0.47691225592049874, "learning_rate": 4.429526597448971e-06, "loss": 0.0121, "step": 5896 }, { "epoch": 2.6828935395814377, "grad_norm": 0.3169795585038183, "learning_rate": 4.4281066582293395e-06, "loss": 0.0079, "step": 5897 }, { "epoch": 2.683348498635123, "grad_norm": 0.47180599726734984, "learning_rate": 4.426686765742247e-06, "loss": 0.0171, "step": 5898 }, { "epoch": 2.683803457688808, "grad_norm": 0.3678746859384994, "learning_rate": 4.425266920103724e-06, "loss": 0.0095, "step": 5899 }, { "epoch": 2.684258416742493, "grad_norm": 0.3346651706380877, "learning_rate": 4.423847121429794e-06, "loss": 0.0101, "step": 5900 }, { "epoch": 2.6847133757961785, "grad_norm": 0.31662847366698194, "learning_rate": 4.422427369836474e-06, "loss": 0.0108, "step": 5901 }, { "epoch": 2.6851683348498634, "grad_norm": 0.3108308777458586, "learning_rate": 4.421007665439783e-06, "loss": 0.0089, "step": 5902 }, { "epoch": 2.6856232939035487, "grad_norm": 0.32253387780453324, "learning_rate": 4.419588008355728e-06, "loss": 0.0082, "step": 5903 }, { "epoch": 2.686078252957234, "grad_norm": 0.3299627220985637, "learning_rate": 4.418168398700323e-06, "loss": 0.0103, "step": 5904 }, { "epoch": 2.686533212010919, "grad_norm": 0.35129161189746655, "learning_rate": 4.4167488365895655e-06, "loss": 0.0083, "step": 5905 }, { "epoch": 2.686988171064604, "grad_norm": 0.46063557831796464, "learning_rate": 4.415329322139461e-06, "loss": 0.0139, "step": 5906 }, { "epoch": 2.6874431301182895, "grad_norm": 0.30436466186993605, "learning_rate": 4.413909855466004e-06, "loss": 0.0084, "step": 5907 }, { "epoch": 2.6878980891719744, "grad_norm": 0.4276214090485729, "learning_rate": 4.412490436685186e-06, "loss": 0.012, "step": 5908 }, { "epoch": 2.6883530482256597, "grad_norm": 0.43284047680467447, "learning_rate": 4.411071065912998e-06, "loss": 0.0124, "step": 5909 }, { "epoch": 2.688808007279345, "grad_norm": 0.33056729075812286, "learning_rate": 4.4096517432654214e-06, "loss": 0.0109, "step": 5910 }, { "epoch": 2.68926296633303, "grad_norm": 0.3622732918865968, "learning_rate": 4.40823246885844e-06, "loss": 0.0102, "step": 5911 }, { "epoch": 2.689717925386715, "grad_norm": 0.3749288794205867, "learning_rate": 4.406813242808026e-06, "loss": 0.0091, "step": 5912 }, { "epoch": 2.6901728844404005, "grad_norm": 0.4823955650158816, "learning_rate": 4.405394065230156e-06, "loss": 0.0248, "step": 5913 }, { "epoch": 2.6906278434940853, "grad_norm": 0.4005168714710043, "learning_rate": 4.4039749362408e-06, "loss": 0.0138, "step": 5914 }, { "epoch": 2.6910828025477707, "grad_norm": 0.24863550042056706, "learning_rate": 4.402555855955919e-06, "loss": 0.0065, "step": 5915 }, { "epoch": 2.691537761601456, "grad_norm": 0.37360887296936135, "learning_rate": 4.4011368244914755e-06, "loss": 0.014, "step": 5916 }, { "epoch": 2.691992720655141, "grad_norm": 0.30213901305179247, "learning_rate": 4.399717841963426e-06, "loss": 0.0072, "step": 5917 }, { "epoch": 2.692447679708826, "grad_norm": 0.35931487388605293, "learning_rate": 4.398298908487724e-06, "loss": 0.0124, "step": 5918 }, { "epoch": 2.6929026387625115, "grad_norm": 0.42471054550290893, "learning_rate": 4.396880024180317e-06, "loss": 0.0085, "step": 5919 }, { "epoch": 2.6933575978161963, "grad_norm": 0.41595229154631397, "learning_rate": 4.395461189157151e-06, "loss": 0.0153, "step": 5920 }, { "epoch": 2.6938125568698816, "grad_norm": 0.46265630144973596, "learning_rate": 4.394042403534168e-06, "loss": 0.0211, "step": 5921 }, { "epoch": 2.694267515923567, "grad_norm": 0.3677226498607149, "learning_rate": 4.3926236674273015e-06, "loss": 0.0152, "step": 5922 }, { "epoch": 2.694722474977252, "grad_norm": 0.3296085608612325, "learning_rate": 4.391204980952488e-06, "loss": 0.0053, "step": 5923 }, { "epoch": 2.695177434030937, "grad_norm": 0.24938540793476988, "learning_rate": 4.3897863442256524e-06, "loss": 0.0064, "step": 5924 }, { "epoch": 2.6956323930846224, "grad_norm": 0.3695274301851014, "learning_rate": 4.388367757362722e-06, "loss": 0.0106, "step": 5925 }, { "epoch": 2.6960873521383073, "grad_norm": 0.42640183823398775, "learning_rate": 4.386949220479615e-06, "loss": 0.018, "step": 5926 }, { "epoch": 2.6965423111919926, "grad_norm": 0.5087558783924742, "learning_rate": 4.3855307336922506e-06, "loss": 0.0113, "step": 5927 }, { "epoch": 2.696997270245678, "grad_norm": 0.3870320579224213, "learning_rate": 4.384112297116539e-06, "loss": 0.0095, "step": 5928 }, { "epoch": 2.697452229299363, "grad_norm": 0.3736399356844653, "learning_rate": 4.382693910868391e-06, "loss": 0.0114, "step": 5929 }, { "epoch": 2.697907188353048, "grad_norm": 0.3062519866734188, "learning_rate": 4.381275575063707e-06, "loss": 0.0058, "step": 5930 }, { "epoch": 2.6983621474067334, "grad_norm": 0.4299664104931159, "learning_rate": 4.37985728981839e-06, "loss": 0.0164, "step": 5931 }, { "epoch": 2.6988171064604187, "grad_norm": 0.4447482202242733, "learning_rate": 4.378439055248333e-06, "loss": 0.0221, "step": 5932 }, { "epoch": 2.6992720655141036, "grad_norm": 0.3920079999178914, "learning_rate": 4.37702087146943e-06, "loss": 0.0092, "step": 5933 }, { "epoch": 2.699727024567789, "grad_norm": 0.5715092331290539, "learning_rate": 4.3756027385975695e-06, "loss": 0.0234, "step": 5934 }, { "epoch": 2.700181983621474, "grad_norm": 0.45525462737236416, "learning_rate": 4.374184656748631e-06, "loss": 0.0104, "step": 5935 }, { "epoch": 2.700636942675159, "grad_norm": 0.2209715620676055, "learning_rate": 4.372766626038499e-06, "loss": 0.0053, "step": 5936 }, { "epoch": 2.7010919017288444, "grad_norm": 0.4108139339697753, "learning_rate": 4.371348646583044e-06, "loss": 0.0108, "step": 5937 }, { "epoch": 2.7015468607825297, "grad_norm": 0.37570904277500317, "learning_rate": 4.36993071849814e-06, "loss": 0.0071, "step": 5938 }, { "epoch": 2.702001819836215, "grad_norm": 0.7350727758399006, "learning_rate": 4.368512841899651e-06, "loss": 0.0249, "step": 5939 }, { "epoch": 2.7024567788899, "grad_norm": 0.5080607083946161, "learning_rate": 4.36709501690344e-06, "loss": 0.0156, "step": 5940 }, { "epoch": 2.702911737943585, "grad_norm": 0.3854516790863163, "learning_rate": 4.365677243625367e-06, "loss": 0.0081, "step": 5941 }, { "epoch": 2.7033666969972705, "grad_norm": 0.3410881101028962, "learning_rate": 4.364259522181286e-06, "loss": 0.0103, "step": 5942 }, { "epoch": 2.7038216560509554, "grad_norm": 0.35641324030309507, "learning_rate": 4.362841852687045e-06, "loss": 0.0093, "step": 5943 }, { "epoch": 2.7042766151046407, "grad_norm": 0.5388055690150927, "learning_rate": 4.361424235258491e-06, "loss": 0.0166, "step": 5944 }, { "epoch": 2.704731574158326, "grad_norm": 0.2631136612518011, "learning_rate": 4.360006670011464e-06, "loss": 0.0054, "step": 5945 }, { "epoch": 2.705186533212011, "grad_norm": 0.2893901886937811, "learning_rate": 4.3585891570618025e-06, "loss": 0.0091, "step": 5946 }, { "epoch": 2.705641492265696, "grad_norm": 0.5550008205804519, "learning_rate": 4.357171696525336e-06, "loss": 0.019, "step": 5947 }, { "epoch": 2.7060964513193815, "grad_norm": 0.7036458097520657, "learning_rate": 4.355754288517898e-06, "loss": 0.0192, "step": 5948 }, { "epoch": 2.7065514103730663, "grad_norm": 0.37716798265996226, "learning_rate": 4.3543369331553094e-06, "loss": 0.0098, "step": 5949 }, { "epoch": 2.7070063694267517, "grad_norm": 0.35358119456607356, "learning_rate": 4.352919630553393e-06, "loss": 0.0122, "step": 5950 }, { "epoch": 2.707461328480437, "grad_norm": 0.49091923935457776, "learning_rate": 4.351502380827959e-06, "loss": 0.0105, "step": 5951 }, { "epoch": 2.707916287534122, "grad_norm": 0.3721912755689915, "learning_rate": 4.350085184094824e-06, "loss": 0.0114, "step": 5952 }, { "epoch": 2.708371246587807, "grad_norm": 0.4595936797563464, "learning_rate": 4.348668040469791e-06, "loss": 0.0121, "step": 5953 }, { "epoch": 2.7088262056414925, "grad_norm": 0.18308282044190585, "learning_rate": 4.347250950068665e-06, "loss": 0.0046, "step": 5954 }, { "epoch": 2.7092811646951773, "grad_norm": 0.38133385208471704, "learning_rate": 4.3458339130072435e-06, "loss": 0.0134, "step": 5955 }, { "epoch": 2.7097361237488626, "grad_norm": 0.3324887417715275, "learning_rate": 4.34441692940132e-06, "loss": 0.0086, "step": 5956 }, { "epoch": 2.710191082802548, "grad_norm": 0.4121226392533765, "learning_rate": 4.342999999366687e-06, "loss": 0.011, "step": 5957 }, { "epoch": 2.710646041856233, "grad_norm": 0.3664305268526003, "learning_rate": 4.341583123019124e-06, "loss": 0.0089, "step": 5958 }, { "epoch": 2.711101000909918, "grad_norm": 0.30153585451944653, "learning_rate": 4.340166300474418e-06, "loss": 0.0108, "step": 5959 }, { "epoch": 2.7115559599636034, "grad_norm": 0.2674138630433753, "learning_rate": 4.338749531848339e-06, "loss": 0.0079, "step": 5960 }, { "epoch": 2.7120109190172883, "grad_norm": 0.4596102873673691, "learning_rate": 4.337332817256662e-06, "loss": 0.0158, "step": 5961 }, { "epoch": 2.7124658780709736, "grad_norm": 0.36879563360197226, "learning_rate": 4.3359161568151566e-06, "loss": 0.0173, "step": 5962 }, { "epoch": 2.712920837124659, "grad_norm": 0.31597139069597485, "learning_rate": 4.334499550639583e-06, "loss": 0.0092, "step": 5963 }, { "epoch": 2.713375796178344, "grad_norm": 0.2618434781780756, "learning_rate": 4.333082998845701e-06, "loss": 0.008, "step": 5964 }, { "epoch": 2.713830755232029, "grad_norm": 0.31157091138158305, "learning_rate": 4.331666501549266e-06, "loss": 0.0121, "step": 5965 }, { "epoch": 2.7142857142857144, "grad_norm": 0.42119834558716646, "learning_rate": 4.330250058866025e-06, "loss": 0.0223, "step": 5966 }, { "epoch": 2.7147406733393993, "grad_norm": 0.6547892014861587, "learning_rate": 4.3288336709117246e-06, "loss": 0.0219, "step": 5967 }, { "epoch": 2.7151956323930846, "grad_norm": 0.4372691554822337, "learning_rate": 4.327417337802104e-06, "loss": 0.0121, "step": 5968 }, { "epoch": 2.71565059144677, "grad_norm": 0.40059797983832385, "learning_rate": 4.326001059652903e-06, "loss": 0.0137, "step": 5969 }, { "epoch": 2.7161055505004548, "grad_norm": 0.3263738631147659, "learning_rate": 4.324584836579851e-06, "loss": 0.0054, "step": 5970 }, { "epoch": 2.71656050955414, "grad_norm": 0.32344836807620925, "learning_rate": 4.323168668698677e-06, "loss": 0.0138, "step": 5971 }, { "epoch": 2.7170154686078254, "grad_norm": 0.45925229926678546, "learning_rate": 4.321752556125103e-06, "loss": 0.0124, "step": 5972 }, { "epoch": 2.7174704276615103, "grad_norm": 0.5011067119815998, "learning_rate": 4.320336498974845e-06, "loss": 0.0228, "step": 5973 }, { "epoch": 2.7179253867151956, "grad_norm": 0.7521044804384955, "learning_rate": 4.3189204973636215e-06, "loss": 0.0355, "step": 5974 }, { "epoch": 2.718380345768881, "grad_norm": 0.32854602876346556, "learning_rate": 4.317504551407136e-06, "loss": 0.0077, "step": 5975 }, { "epoch": 2.7188353048225657, "grad_norm": 0.3839985419288797, "learning_rate": 4.316088661221099e-06, "loss": 0.0118, "step": 5976 }, { "epoch": 2.719290263876251, "grad_norm": 0.6425451914377049, "learning_rate": 4.314672826921208e-06, "loss": 0.0281, "step": 5977 }, { "epoch": 2.7197452229299364, "grad_norm": 0.4744921390656975, "learning_rate": 4.313257048623158e-06, "loss": 0.0139, "step": 5978 }, { "epoch": 2.7202001819836212, "grad_norm": 0.33409379873540984, "learning_rate": 4.311841326442642e-06, "loss": 0.008, "step": 5979 }, { "epoch": 2.7206551410373065, "grad_norm": 0.3259174484089608, "learning_rate": 4.310425660495343e-06, "loss": 0.0105, "step": 5980 }, { "epoch": 2.721110100090992, "grad_norm": 0.4861237315830944, "learning_rate": 4.3090100508969465e-06, "loss": 0.0195, "step": 5981 }, { "epoch": 2.7215650591446767, "grad_norm": 0.5213304928328588, "learning_rate": 4.307594497763127e-06, "loss": 0.0148, "step": 5982 }, { "epoch": 2.722020018198362, "grad_norm": 0.34951031591703374, "learning_rate": 4.306179001209558e-06, "loss": 0.0138, "step": 5983 }, { "epoch": 2.7224749772520473, "grad_norm": 0.271038910785206, "learning_rate": 4.304763561351909e-06, "loss": 0.0088, "step": 5984 }, { "epoch": 2.722929936305732, "grad_norm": 0.3836333079471888, "learning_rate": 4.303348178305842e-06, "loss": 0.0165, "step": 5985 }, { "epoch": 2.7233848953594175, "grad_norm": 0.298228828505515, "learning_rate": 4.301932852187016e-06, "loss": 0.0108, "step": 5986 }, { "epoch": 2.723839854413103, "grad_norm": 0.42299373707295224, "learning_rate": 4.300517583111085e-06, "loss": 0.0176, "step": 5987 }, { "epoch": 2.724294813466788, "grad_norm": 0.2785147818525911, "learning_rate": 4.299102371193698e-06, "loss": 0.0082, "step": 5988 }, { "epoch": 2.724749772520473, "grad_norm": 0.2568855967379187, "learning_rate": 4.297687216550498e-06, "loss": 0.0081, "step": 5989 }, { "epoch": 2.7252047315741583, "grad_norm": 0.3593110390610532, "learning_rate": 4.296272119297128e-06, "loss": 0.0127, "step": 5990 }, { "epoch": 2.7256596906278436, "grad_norm": 0.4960810888734688, "learning_rate": 4.294857079549225e-06, "loss": 0.0298, "step": 5991 }, { "epoch": 2.7261146496815285, "grad_norm": 0.4111408777512469, "learning_rate": 4.2934420974224145e-06, "loss": 0.0071, "step": 5992 }, { "epoch": 2.726569608735214, "grad_norm": 0.48783725226573443, "learning_rate": 4.292027173032326e-06, "loss": 0.0167, "step": 5993 }, { "epoch": 2.727024567788899, "grad_norm": 0.6014951972548783, "learning_rate": 4.29061230649458e-06, "loss": 0.0366, "step": 5994 }, { "epoch": 2.7274795268425844, "grad_norm": 0.5639786086544031, "learning_rate": 4.289197497924792e-06, "loss": 0.0158, "step": 5995 }, { "epoch": 2.7279344858962693, "grad_norm": 0.511557736098475, "learning_rate": 4.287782747438573e-06, "loss": 0.0212, "step": 5996 }, { "epoch": 2.7283894449499546, "grad_norm": 0.4175148879191534, "learning_rate": 4.286368055151534e-06, "loss": 0.0188, "step": 5997 }, { "epoch": 2.72884440400364, "grad_norm": 0.42995148142816225, "learning_rate": 4.2849534211792745e-06, "loss": 0.0107, "step": 5998 }, { "epoch": 2.729299363057325, "grad_norm": 0.32554889780250434, "learning_rate": 4.283538845637391e-06, "loss": 0.0062, "step": 5999 }, { "epoch": 2.72975432211101, "grad_norm": 0.2858393798989467, "learning_rate": 4.28212432864148e-06, "loss": 0.0094, "step": 6000 }, { "epoch": 2.7302092811646954, "grad_norm": 0.45764410917087667, "learning_rate": 4.280709870307126e-06, "loss": 0.0172, "step": 6001 }, { "epoch": 2.7306642402183803, "grad_norm": 0.47934274558905793, "learning_rate": 4.279295470749913e-06, "loss": 0.0181, "step": 6002 }, { "epoch": 2.7311191992720656, "grad_norm": 0.48026364129621, "learning_rate": 4.277881130085417e-06, "loss": 0.0108, "step": 6003 }, { "epoch": 2.731574158325751, "grad_norm": 0.2637951056808094, "learning_rate": 4.276466848429216e-06, "loss": 0.0068, "step": 6004 }, { "epoch": 2.7320291173794358, "grad_norm": 0.3407506054823702, "learning_rate": 4.275052625896877e-06, "loss": 0.0076, "step": 6005 }, { "epoch": 2.732484076433121, "grad_norm": 0.2874278977902796, "learning_rate": 4.273638462603963e-06, "loss": 0.0069, "step": 6006 }, { "epoch": 2.7329390354868064, "grad_norm": 0.34675813464582306, "learning_rate": 4.272224358666034e-06, "loss": 0.0094, "step": 6007 }, { "epoch": 2.7333939945404913, "grad_norm": 0.4228462684046438, "learning_rate": 4.270810314198644e-06, "loss": 0.013, "step": 6008 }, { "epoch": 2.7338489535941766, "grad_norm": 0.4965390314312646, "learning_rate": 4.269396329317342e-06, "loss": 0.0172, "step": 6009 }, { "epoch": 2.734303912647862, "grad_norm": 0.5078479135463949, "learning_rate": 4.2679824041376706e-06, "loss": 0.0213, "step": 6010 }, { "epoch": 2.7347588717015467, "grad_norm": 0.4779946394655433, "learning_rate": 4.266568538775174e-06, "loss": 0.0162, "step": 6011 }, { "epoch": 2.735213830755232, "grad_norm": 0.5037789907695531, "learning_rate": 4.265154733345383e-06, "loss": 0.0222, "step": 6012 }, { "epoch": 2.7356687898089174, "grad_norm": 0.48690389553033686, "learning_rate": 4.2637409879638295e-06, "loss": 0.0212, "step": 6013 }, { "epoch": 2.7361237488626022, "grad_norm": 0.3585354660137679, "learning_rate": 4.262327302746037e-06, "loss": 0.0094, "step": 6014 }, { "epoch": 2.7365787079162875, "grad_norm": 0.26747655226336553, "learning_rate": 4.260913677807527e-06, "loss": 0.0079, "step": 6015 }, { "epoch": 2.737033666969973, "grad_norm": 0.45017825990090166, "learning_rate": 4.259500113263812e-06, "loss": 0.0132, "step": 6016 }, { "epoch": 2.7374886260236577, "grad_norm": 0.35508190230188047, "learning_rate": 4.258086609230403e-06, "loss": 0.011, "step": 6017 }, { "epoch": 2.737943585077343, "grad_norm": 0.44162569436143834, "learning_rate": 4.256673165822808e-06, "loss": 0.0102, "step": 6018 }, { "epoch": 2.7383985441310283, "grad_norm": 0.5580313491582966, "learning_rate": 4.255259783156524e-06, "loss": 0.0283, "step": 6019 }, { "epoch": 2.738853503184713, "grad_norm": 0.5012586476460804, "learning_rate": 4.253846461347049e-06, "loss": 0.0149, "step": 6020 }, { "epoch": 2.7393084622383985, "grad_norm": 0.24032698607455938, "learning_rate": 4.252433200509869e-06, "loss": 0.0085, "step": 6021 }, { "epoch": 2.739763421292084, "grad_norm": 0.4447599235839283, "learning_rate": 4.251020000760474e-06, "loss": 0.0228, "step": 6022 }, { "epoch": 2.7402183803457687, "grad_norm": 0.37372332737273567, "learning_rate": 4.2496068622143405e-06, "loss": 0.019, "step": 6023 }, { "epoch": 2.740673339399454, "grad_norm": 0.44428958710931893, "learning_rate": 4.248193784986945e-06, "loss": 0.0158, "step": 6024 }, { "epoch": 2.7411282984531393, "grad_norm": 0.3596671825856569, "learning_rate": 4.24678076919376e-06, "loss": 0.0106, "step": 6025 }, { "epoch": 2.741583257506824, "grad_norm": 0.6336535672429905, "learning_rate": 4.2453678149502485e-06, "loss": 0.034, "step": 6026 }, { "epoch": 2.7420382165605095, "grad_norm": 0.41892272188920054, "learning_rate": 4.243954922371872e-06, "loss": 0.0162, "step": 6027 }, { "epoch": 2.742493175614195, "grad_norm": 0.412957468127852, "learning_rate": 4.242542091574083e-06, "loss": 0.0146, "step": 6028 }, { "epoch": 2.7429481346678797, "grad_norm": 0.2956409275628743, "learning_rate": 4.241129322672336e-06, "loss": 0.0074, "step": 6029 }, { "epoch": 2.743403093721565, "grad_norm": 0.7754413600064651, "learning_rate": 4.239716615782072e-06, "loss": 0.0248, "step": 6030 }, { "epoch": 2.7438580527752503, "grad_norm": 0.3767843229640008, "learning_rate": 4.238303971018732e-06, "loss": 0.0084, "step": 6031 }, { "epoch": 2.744313011828935, "grad_norm": 0.83477873341383, "learning_rate": 4.236891388497754e-06, "loss": 0.0244, "step": 6032 }, { "epoch": 2.7447679708826205, "grad_norm": 0.3487267676364768, "learning_rate": 4.235478868334564e-06, "loss": 0.0143, "step": 6033 }, { "epoch": 2.745222929936306, "grad_norm": 0.4715632280454509, "learning_rate": 4.23406641064459e-06, "loss": 0.0212, "step": 6034 }, { "epoch": 2.7456778889899907, "grad_norm": 0.13743236892341112, "learning_rate": 4.2326540155432495e-06, "loss": 0.0025, "step": 6035 }, { "epoch": 2.746132848043676, "grad_norm": 0.3903137918449913, "learning_rate": 4.231241683145957e-06, "loss": 0.0169, "step": 6036 }, { "epoch": 2.7465878070973613, "grad_norm": 0.33399784550717343, "learning_rate": 4.229829413568123e-06, "loss": 0.0098, "step": 6037 }, { "epoch": 2.747042766151046, "grad_norm": 0.30915255490624294, "learning_rate": 4.228417206925149e-06, "loss": 0.0119, "step": 6038 }, { "epoch": 2.7474977252047315, "grad_norm": 0.4498938401201024, "learning_rate": 4.227005063332438e-06, "loss": 0.0182, "step": 6039 }, { "epoch": 2.7479526842584168, "grad_norm": 0.41696981771742386, "learning_rate": 4.225592982905383e-06, "loss": 0.011, "step": 6040 }, { "epoch": 2.7484076433121016, "grad_norm": 0.5416946682788819, "learning_rate": 4.224180965759371e-06, "loss": 0.0188, "step": 6041 }, { "epoch": 2.748862602365787, "grad_norm": 0.3394706961277096, "learning_rate": 4.222769012009789e-06, "loss": 0.0079, "step": 6042 }, { "epoch": 2.7493175614194723, "grad_norm": 0.3145576845233851, "learning_rate": 4.221357121772012e-06, "loss": 0.0134, "step": 6043 }, { "epoch": 2.7497725204731576, "grad_norm": 0.3780673339666677, "learning_rate": 4.219945295161415e-06, "loss": 0.0163, "step": 6044 }, { "epoch": 2.7502274795268424, "grad_norm": 0.4413501139701228, "learning_rate": 4.218533532293364e-06, "loss": 0.022, "step": 6045 }, { "epoch": 2.7506824385805277, "grad_norm": 0.4744950692812475, "learning_rate": 4.2171218332832255e-06, "loss": 0.0177, "step": 6046 }, { "epoch": 2.751137397634213, "grad_norm": 0.2876589143233397, "learning_rate": 4.215710198246355e-06, "loss": 0.0119, "step": 6047 }, { "epoch": 2.7515923566878984, "grad_norm": 0.3568692776496177, "learning_rate": 4.2142986272981054e-06, "loss": 0.0148, "step": 6048 }, { "epoch": 2.7520473157415832, "grad_norm": 0.3987937262773912, "learning_rate": 4.212887120553824e-06, "loss": 0.0162, "step": 6049 }, { "epoch": 2.7525022747952685, "grad_norm": 0.3970739293642638, "learning_rate": 4.211475678128853e-06, "loss": 0.016, "step": 6050 }, { "epoch": 2.752957233848954, "grad_norm": 0.38257854830908083, "learning_rate": 4.210064300138527e-06, "loss": 0.0195, "step": 6051 }, { "epoch": 2.7534121929026387, "grad_norm": 0.27539639933281646, "learning_rate": 4.208652986698179e-06, "loss": 0.0061, "step": 6052 }, { "epoch": 2.753867151956324, "grad_norm": 0.276362732921801, "learning_rate": 4.2072417379231366e-06, "loss": 0.0092, "step": 6053 }, { "epoch": 2.7543221110100093, "grad_norm": 0.30579099001049087, "learning_rate": 4.205830553928719e-06, "loss": 0.0072, "step": 6054 }, { "epoch": 2.754777070063694, "grad_norm": 0.2791220968846108, "learning_rate": 4.204419434830242e-06, "loss": 0.0066, "step": 6055 }, { "epoch": 2.7552320291173795, "grad_norm": 0.3965659540742579, "learning_rate": 4.203008380743017e-06, "loss": 0.0242, "step": 6056 }, { "epoch": 2.755686988171065, "grad_norm": 0.4539489140290909, "learning_rate": 4.201597391782346e-06, "loss": 0.0139, "step": 6057 }, { "epoch": 2.7561419472247497, "grad_norm": 0.23824613312272833, "learning_rate": 4.200186468063532e-06, "loss": 0.0082, "step": 6058 }, { "epoch": 2.756596906278435, "grad_norm": 0.4477019339717532, "learning_rate": 4.198775609701866e-06, "loss": 0.0194, "step": 6059 }, { "epoch": 2.7570518653321203, "grad_norm": 0.2994317448702024, "learning_rate": 4.19736481681264e-06, "loss": 0.0064, "step": 6060 }, { "epoch": 2.757506824385805, "grad_norm": 0.37626624109614354, "learning_rate": 4.195954089511138e-06, "loss": 0.0109, "step": 6061 }, { "epoch": 2.7579617834394905, "grad_norm": 0.4013219959387168, "learning_rate": 4.194543427912635e-06, "loss": 0.01, "step": 6062 }, { "epoch": 2.758416742493176, "grad_norm": 0.7997522438204837, "learning_rate": 4.1931328321324076e-06, "loss": 0.0263, "step": 6063 }, { "epoch": 2.7588717015468607, "grad_norm": 0.31119835503473015, "learning_rate": 4.191722302285719e-06, "loss": 0.0085, "step": 6064 }, { "epoch": 2.759326660600546, "grad_norm": 0.2853810603396253, "learning_rate": 4.190311838487835e-06, "loss": 0.0092, "step": 6065 }, { "epoch": 2.7597816196542313, "grad_norm": 0.28249320508886516, "learning_rate": 4.18890144085401e-06, "loss": 0.0113, "step": 6066 }, { "epoch": 2.760236578707916, "grad_norm": 0.4242011220249303, "learning_rate": 4.187491109499496e-06, "loss": 0.0177, "step": 6067 }, { "epoch": 2.7606915377616015, "grad_norm": 0.39698604392213305, "learning_rate": 4.186080844539541e-06, "loss": 0.0124, "step": 6068 }, { "epoch": 2.761146496815287, "grad_norm": 0.5918489733481939, "learning_rate": 4.1846706460893835e-06, "loss": 0.0173, "step": 6069 }, { "epoch": 2.7616014558689717, "grad_norm": 0.22072983320116718, "learning_rate": 4.183260514264259e-06, "loss": 0.0049, "step": 6070 }, { "epoch": 2.762056414922657, "grad_norm": 0.33935785502715216, "learning_rate": 4.181850449179397e-06, "loss": 0.0096, "step": 6071 }, { "epoch": 2.7625113739763423, "grad_norm": 0.2104903430711546, "learning_rate": 4.180440450950021e-06, "loss": 0.0053, "step": 6072 }, { "epoch": 2.762966333030027, "grad_norm": 0.28401035184529577, "learning_rate": 4.179030519691349e-06, "loss": 0.01, "step": 6073 }, { "epoch": 2.7634212920837125, "grad_norm": 0.2196656857103231, "learning_rate": 4.1776206555185964e-06, "loss": 0.0065, "step": 6074 }, { "epoch": 2.7638762511373978, "grad_norm": 0.3298230170003272, "learning_rate": 4.17621085854697e-06, "loss": 0.009, "step": 6075 }, { "epoch": 2.7643312101910826, "grad_norm": 0.4293989065704018, "learning_rate": 4.174801128891673e-06, "loss": 0.0163, "step": 6076 }, { "epoch": 2.764786169244768, "grad_norm": 0.3193133909773695, "learning_rate": 4.173391466667901e-06, "loss": 0.0077, "step": 6077 }, { "epoch": 2.7652411282984533, "grad_norm": 0.3839636462875678, "learning_rate": 4.171981871990845e-06, "loss": 0.0127, "step": 6078 }, { "epoch": 2.765696087352138, "grad_norm": 0.3577973140064893, "learning_rate": 4.1705723449756905e-06, "loss": 0.0067, "step": 6079 }, { "epoch": 2.7661510464058234, "grad_norm": 0.4196585376044528, "learning_rate": 4.169162885737617e-06, "loss": 0.0104, "step": 6080 }, { "epoch": 2.7666060054595087, "grad_norm": 0.39230708506159845, "learning_rate": 4.167753494391803e-06, "loss": 0.0067, "step": 6081 }, { "epoch": 2.7670609645131936, "grad_norm": 0.4203626802175535, "learning_rate": 4.166344171053414e-06, "loss": 0.0188, "step": 6082 }, { "epoch": 2.767515923566879, "grad_norm": 0.38508313789767185, "learning_rate": 4.164934915837616e-06, "loss": 0.0114, "step": 6083 }, { "epoch": 2.7679708826205642, "grad_norm": 0.2836512973439608, "learning_rate": 4.163525728859564e-06, "loss": 0.0071, "step": 6084 }, { "epoch": 2.768425841674249, "grad_norm": 0.507854761946816, "learning_rate": 4.162116610234413e-06, "loss": 0.0168, "step": 6085 }, { "epoch": 2.7688808007279344, "grad_norm": 0.44711296495064196, "learning_rate": 4.160707560077308e-06, "loss": 0.0114, "step": 6086 }, { "epoch": 2.7693357597816197, "grad_norm": 0.4402637245154256, "learning_rate": 4.15929857850339e-06, "loss": 0.0135, "step": 6087 }, { "epoch": 2.7697907188353046, "grad_norm": 0.38054718542309746, "learning_rate": 4.157889665627797e-06, "loss": 0.0205, "step": 6088 }, { "epoch": 2.77024567788899, "grad_norm": 0.2969576548015972, "learning_rate": 4.156480821565657e-06, "loss": 0.0094, "step": 6089 }, { "epoch": 2.770700636942675, "grad_norm": 0.40685272359600044, "learning_rate": 4.155072046432096e-06, "loss": 0.0106, "step": 6090 }, { "epoch": 2.77115559599636, "grad_norm": 0.6099433784764327, "learning_rate": 4.15366334034223e-06, "loss": 0.0127, "step": 6091 }, { "epoch": 2.7716105550500454, "grad_norm": 0.3697608331724075, "learning_rate": 4.152254703411176e-06, "loss": 0.0149, "step": 6092 }, { "epoch": 2.7720655141037307, "grad_norm": 0.5553578973041321, "learning_rate": 4.1508461357540375e-06, "loss": 0.0261, "step": 6093 }, { "epoch": 2.7725204731574156, "grad_norm": 0.41108563991522645, "learning_rate": 4.149437637485917e-06, "loss": 0.0131, "step": 6094 }, { "epoch": 2.772975432211101, "grad_norm": 0.2981934428308497, "learning_rate": 4.148029208721914e-06, "loss": 0.0115, "step": 6095 }, { "epoch": 2.773430391264786, "grad_norm": 0.27972718856948525, "learning_rate": 4.146620849577116e-06, "loss": 0.007, "step": 6096 }, { "epoch": 2.7738853503184715, "grad_norm": 0.4735269301944073, "learning_rate": 4.145212560166608e-06, "loss": 0.0158, "step": 6097 }, { "epoch": 2.7743403093721564, "grad_norm": 0.5992133569279419, "learning_rate": 4.14380434060547e-06, "loss": 0.024, "step": 6098 }, { "epoch": 2.7747952684258417, "grad_norm": 0.41263465498683555, "learning_rate": 4.142396191008775e-06, "loss": 0.0136, "step": 6099 }, { "epoch": 2.775250227479527, "grad_norm": 0.4540678890818895, "learning_rate": 4.1409881114915895e-06, "loss": 0.0149, "step": 6100 }, { "epoch": 2.775705186533212, "grad_norm": 0.5350222753468424, "learning_rate": 4.139580102168975e-06, "loss": 0.0145, "step": 6101 }, { "epoch": 2.776160145586897, "grad_norm": 0.5974686412677844, "learning_rate": 4.138172163155991e-06, "loss": 0.0133, "step": 6102 }, { "epoch": 2.7766151046405825, "grad_norm": 0.2485634179454498, "learning_rate": 4.136764294567684e-06, "loss": 0.0069, "step": 6103 }, { "epoch": 2.777070063694268, "grad_norm": 0.4455822554552264, "learning_rate": 4.135356496519103e-06, "loss": 0.0144, "step": 6104 }, { "epoch": 2.7775250227479527, "grad_norm": 0.4095125624931906, "learning_rate": 4.1339487691252835e-06, "loss": 0.0115, "step": 6105 }, { "epoch": 2.777979981801638, "grad_norm": 0.32133424970948393, "learning_rate": 4.1325411125012596e-06, "loss": 0.0108, "step": 6106 }, { "epoch": 2.7784349408553233, "grad_norm": 0.363708717190287, "learning_rate": 4.131133526762059e-06, "loss": 0.0151, "step": 6107 }, { "epoch": 2.778889899909008, "grad_norm": 0.4004201324139527, "learning_rate": 4.129726012022699e-06, "loss": 0.0142, "step": 6108 }, { "epoch": 2.7793448589626935, "grad_norm": 0.40719233124918014, "learning_rate": 4.128318568398203e-06, "loss": 0.0123, "step": 6109 }, { "epoch": 2.7797998180163788, "grad_norm": 0.1069858768999529, "learning_rate": 4.126911196003577e-06, "loss": 0.0016, "step": 6110 }, { "epoch": 2.7802547770700636, "grad_norm": 0.4986757196985265, "learning_rate": 4.125503894953824e-06, "loss": 0.0151, "step": 6111 }, { "epoch": 2.780709736123749, "grad_norm": 0.40006597528261445, "learning_rate": 4.124096665363945e-06, "loss": 0.0097, "step": 6112 }, { "epoch": 2.7811646951774343, "grad_norm": 0.3556187303609429, "learning_rate": 4.122689507348929e-06, "loss": 0.0126, "step": 6113 }, { "epoch": 2.781619654231119, "grad_norm": 0.2980746095953485, "learning_rate": 4.121282421023766e-06, "loss": 0.0064, "step": 6114 }, { "epoch": 2.7820746132848044, "grad_norm": 0.3245688169288705, "learning_rate": 4.119875406503434e-06, "loss": 0.0072, "step": 6115 }, { "epoch": 2.7825295723384897, "grad_norm": 0.3359587015267282, "learning_rate": 4.11846846390291e-06, "loss": 0.0114, "step": 6116 }, { "epoch": 2.7829845313921746, "grad_norm": 0.3609356095211249, "learning_rate": 4.117061593337163e-06, "loss": 0.0108, "step": 6117 }, { "epoch": 2.78343949044586, "grad_norm": 0.3814984149878645, "learning_rate": 4.115654794921154e-06, "loss": 0.012, "step": 6118 }, { "epoch": 2.7838944494995452, "grad_norm": 0.3702545080140778, "learning_rate": 4.114248068769843e-06, "loss": 0.0105, "step": 6119 }, { "epoch": 2.78434940855323, "grad_norm": 0.2089677221723774, "learning_rate": 4.112841414998178e-06, "loss": 0.0062, "step": 6120 }, { "epoch": 2.7848043676069154, "grad_norm": 0.565495547702211, "learning_rate": 4.111434833721108e-06, "loss": 0.009, "step": 6121 }, { "epoch": 2.7852593266606007, "grad_norm": 0.5192264102055605, "learning_rate": 4.110028325053568e-06, "loss": 0.0128, "step": 6122 }, { "epoch": 2.7857142857142856, "grad_norm": 0.3855956011187438, "learning_rate": 4.1086218891104955e-06, "loss": 0.013, "step": 6123 }, { "epoch": 2.786169244767971, "grad_norm": 0.3387686752427193, "learning_rate": 4.107215526006818e-06, "loss": 0.0063, "step": 6124 }, { "epoch": 2.786624203821656, "grad_norm": 0.3411160755212013, "learning_rate": 4.105809235857454e-06, "loss": 0.0114, "step": 6125 }, { "epoch": 2.787079162875341, "grad_norm": 0.4634161517162885, "learning_rate": 4.104403018777323e-06, "loss": 0.0135, "step": 6126 }, { "epoch": 2.7875341219290264, "grad_norm": 0.43586130194323086, "learning_rate": 4.102996874881332e-06, "loss": 0.0144, "step": 6127 }, { "epoch": 2.7879890809827117, "grad_norm": 0.2333417989509398, "learning_rate": 4.101590804284386e-06, "loss": 0.0044, "step": 6128 }, { "epoch": 2.7884440400363966, "grad_norm": 0.37099998245746507, "learning_rate": 4.10018480710138e-06, "loss": 0.0129, "step": 6129 }, { "epoch": 2.788898999090082, "grad_norm": 0.3605216388388408, "learning_rate": 4.09877888344721e-06, "loss": 0.0116, "step": 6130 }, { "epoch": 2.789353958143767, "grad_norm": 0.32127702019694454, "learning_rate": 4.09737303343676e-06, "loss": 0.0078, "step": 6131 }, { "epoch": 2.789808917197452, "grad_norm": 0.4105825824192487, "learning_rate": 4.0959672571849085e-06, "loss": 0.0116, "step": 6132 }, { "epoch": 2.7902638762511374, "grad_norm": 0.5365128626959759, "learning_rate": 4.094561554806532e-06, "loss": 0.03, "step": 6133 }, { "epoch": 2.7907188353048227, "grad_norm": 1.2024839942652652, "learning_rate": 4.093155926416494e-06, "loss": 0.0171, "step": 6134 }, { "epoch": 2.7911737943585075, "grad_norm": 0.42772647366704053, "learning_rate": 4.091750372129661e-06, "loss": 0.0135, "step": 6135 }, { "epoch": 2.791628753412193, "grad_norm": 0.6080903619430011, "learning_rate": 4.090344892060883e-06, "loss": 0.0143, "step": 6136 }, { "epoch": 2.792083712465878, "grad_norm": 0.2714797294601795, "learning_rate": 4.0889394863250135e-06, "loss": 0.0095, "step": 6137 }, { "epoch": 2.792538671519563, "grad_norm": 0.31361914857417433, "learning_rate": 4.087534155036896e-06, "loss": 0.0078, "step": 6138 }, { "epoch": 2.7929936305732483, "grad_norm": 0.5677242944360218, "learning_rate": 4.086128898311365e-06, "loss": 0.0321, "step": 6139 }, { "epoch": 2.7934485896269337, "grad_norm": 0.2690088037282102, "learning_rate": 4.084723716263255e-06, "loss": 0.0063, "step": 6140 }, { "epoch": 2.7939035486806185, "grad_norm": 0.30559644460865787, "learning_rate": 4.08331860900739e-06, "loss": 0.0068, "step": 6141 }, { "epoch": 2.794358507734304, "grad_norm": 0.4406939030506512, "learning_rate": 4.0819135766585875e-06, "loss": 0.0095, "step": 6142 }, { "epoch": 2.794813466787989, "grad_norm": 0.2787607200087893, "learning_rate": 4.08050861933166e-06, "loss": 0.0111, "step": 6143 }, { "epoch": 2.795268425841674, "grad_norm": 0.3172800639046428, "learning_rate": 4.079103737141417e-06, "loss": 0.0116, "step": 6144 }, { "epoch": 2.7957233848953593, "grad_norm": 0.4815081625712856, "learning_rate": 4.077698930202659e-06, "loss": 0.0172, "step": 6145 }, { "epoch": 2.7961783439490446, "grad_norm": 0.39982219763098237, "learning_rate": 4.076294198630179e-06, "loss": 0.0161, "step": 6146 }, { "epoch": 2.7966333030027295, "grad_norm": 0.34409258783246904, "learning_rate": 4.074889542538765e-06, "loss": 0.0085, "step": 6147 }, { "epoch": 2.797088262056415, "grad_norm": 0.3676107651287223, "learning_rate": 4.073484962043201e-06, "loss": 0.0109, "step": 6148 }, { "epoch": 2.7975432211101, "grad_norm": 0.5426235224871229, "learning_rate": 4.0720804572582604e-06, "loss": 0.0179, "step": 6149 }, { "epoch": 2.797998180163785, "grad_norm": 0.29564947110329504, "learning_rate": 4.070676028298713e-06, "loss": 0.0093, "step": 6150 }, { "epoch": 2.7984531392174703, "grad_norm": 0.29361865044206104, "learning_rate": 4.069271675279326e-06, "loss": 0.0058, "step": 6151 }, { "epoch": 2.7989080982711556, "grad_norm": 0.4724242565339271, "learning_rate": 4.067867398314853e-06, "loss": 0.0148, "step": 6152 }, { "epoch": 2.799363057324841, "grad_norm": 0.3957167051302801, "learning_rate": 4.06646319752005e-06, "loss": 0.0126, "step": 6153 }, { "epoch": 2.799818016378526, "grad_norm": 0.2952970132079702, "learning_rate": 4.065059073009656e-06, "loss": 0.0037, "step": 6154 }, { "epoch": 2.800272975432211, "grad_norm": 0.5179131046118575, "learning_rate": 4.063655024898413e-06, "loss": 0.0155, "step": 6155 }, { "epoch": 2.8007279344858964, "grad_norm": 0.32668303841341806, "learning_rate": 4.062251053301053e-06, "loss": 0.0057, "step": 6156 }, { "epoch": 2.8011828935395813, "grad_norm": 0.4738822446233882, "learning_rate": 4.0608471583323e-06, "loss": 0.0123, "step": 6157 }, { "epoch": 2.8016378525932666, "grad_norm": 0.4115028385994002, "learning_rate": 4.059443340106879e-06, "loss": 0.0131, "step": 6158 }, { "epoch": 2.802092811646952, "grad_norm": 0.4445545794301525, "learning_rate": 4.0580395987394985e-06, "loss": 0.0135, "step": 6159 }, { "epoch": 2.802547770700637, "grad_norm": 0.4974983954631685, "learning_rate": 4.05663593434487e-06, "loss": 0.0129, "step": 6160 }, { "epoch": 2.803002729754322, "grad_norm": 0.5583654847070216, "learning_rate": 4.0552323470376916e-06, "loss": 0.0172, "step": 6161 }, { "epoch": 2.8034576888080074, "grad_norm": 0.4321008907232332, "learning_rate": 4.05382883693266e-06, "loss": 0.0222, "step": 6162 }, { "epoch": 2.8039126478616927, "grad_norm": 0.2227093598090755, "learning_rate": 4.052425404144463e-06, "loss": 0.0051, "step": 6163 }, { "epoch": 2.8043676069153776, "grad_norm": 0.3694282504252912, "learning_rate": 4.051022048787781e-06, "loss": 0.0097, "step": 6164 }, { "epoch": 2.804822565969063, "grad_norm": 0.5681438221233656, "learning_rate": 4.049618770977294e-06, "loss": 0.018, "step": 6165 }, { "epoch": 2.805277525022748, "grad_norm": 0.2892593210016612, "learning_rate": 4.048215570827668e-06, "loss": 0.0047, "step": 6166 }, { "epoch": 2.805732484076433, "grad_norm": 0.23883116102697122, "learning_rate": 4.046812448453568e-06, "loss": 0.0071, "step": 6167 }, { "epoch": 2.8061874431301184, "grad_norm": 0.28257402229772005, "learning_rate": 4.045409403969649e-06, "loss": 0.0061, "step": 6168 }, { "epoch": 2.8066424021838037, "grad_norm": 0.4509944900552927, "learning_rate": 4.044006437490564e-06, "loss": 0.0092, "step": 6169 }, { "epoch": 2.8070973612374885, "grad_norm": 0.43804886630996964, "learning_rate": 4.042603549130955e-06, "loss": 0.0126, "step": 6170 }, { "epoch": 2.807552320291174, "grad_norm": 0.25021342325412005, "learning_rate": 4.041200739005459e-06, "loss": 0.0045, "step": 6171 }, { "epoch": 2.808007279344859, "grad_norm": 0.2863598785273324, "learning_rate": 4.039798007228711e-06, "loss": 0.0135, "step": 6172 }, { "epoch": 2.808462238398544, "grad_norm": 0.28327946187334474, "learning_rate": 4.038395353915332e-06, "loss": 0.0095, "step": 6173 }, { "epoch": 2.8089171974522293, "grad_norm": 0.4102568188771912, "learning_rate": 4.036992779179944e-06, "loss": 0.011, "step": 6174 }, { "epoch": 2.8093721565059147, "grad_norm": 0.6984396517670316, "learning_rate": 4.035590283137155e-06, "loss": 0.0185, "step": 6175 }, { "epoch": 2.8098271155595995, "grad_norm": 0.22159635474375433, "learning_rate": 4.034187865901576e-06, "loss": 0.0062, "step": 6176 }, { "epoch": 2.810282074613285, "grad_norm": 0.5517320078633793, "learning_rate": 4.0327855275878005e-06, "loss": 0.022, "step": 6177 }, { "epoch": 2.81073703366697, "grad_norm": 0.33894270663938314, "learning_rate": 4.031383268310422e-06, "loss": 0.0063, "step": 6178 }, { "epoch": 2.811191992720655, "grad_norm": 0.23034929824612319, "learning_rate": 4.029981088184031e-06, "loss": 0.0061, "step": 6179 }, { "epoch": 2.8116469517743403, "grad_norm": 0.3090425666502145, "learning_rate": 4.028578987323206e-06, "loss": 0.0118, "step": 6180 }, { "epoch": 2.8121019108280256, "grad_norm": 0.500033020983605, "learning_rate": 4.027176965842518e-06, "loss": 0.0119, "step": 6181 }, { "epoch": 2.8125568698817105, "grad_norm": 0.3249202590119885, "learning_rate": 4.025775023856535e-06, "loss": 0.0089, "step": 6182 }, { "epoch": 2.813011828935396, "grad_norm": 0.3949380686526961, "learning_rate": 4.024373161479817e-06, "loss": 0.0102, "step": 6183 }, { "epoch": 2.813466787989081, "grad_norm": 0.3923780176585343, "learning_rate": 4.02297137882692e-06, "loss": 0.0117, "step": 6184 }, { "epoch": 2.813921747042766, "grad_norm": 0.4570940911572673, "learning_rate": 4.0215696760123864e-06, "loss": 0.0086, "step": 6185 }, { "epoch": 2.8143767060964513, "grad_norm": 0.2693777188109058, "learning_rate": 4.020168053150763e-06, "loss": 0.0086, "step": 6186 }, { "epoch": 2.8148316651501366, "grad_norm": 0.5132690913024266, "learning_rate": 4.018766510356582e-06, "loss": 0.0195, "step": 6187 }, { "epoch": 2.8152866242038215, "grad_norm": 0.4280831276148228, "learning_rate": 4.017365047744368e-06, "loss": 0.0107, "step": 6188 }, { "epoch": 2.815741583257507, "grad_norm": 0.39207033872188096, "learning_rate": 4.015963665428647e-06, "loss": 0.0131, "step": 6189 }, { "epoch": 2.816196542311192, "grad_norm": 0.37182274264419013, "learning_rate": 4.014562363523931e-06, "loss": 0.0077, "step": 6190 }, { "epoch": 2.816651501364877, "grad_norm": 0.36959480584241267, "learning_rate": 4.013161142144729e-06, "loss": 0.0081, "step": 6191 }, { "epoch": 2.8171064604185623, "grad_norm": 0.48029848145136084, "learning_rate": 4.011760001405539e-06, "loss": 0.0196, "step": 6192 }, { "epoch": 2.8175614194722476, "grad_norm": 0.39938984185530507, "learning_rate": 4.010358941420861e-06, "loss": 0.0144, "step": 6193 }, { "epoch": 2.8180163785259325, "grad_norm": 0.5186541852711295, "learning_rate": 4.008957962305181e-06, "loss": 0.0077, "step": 6194 }, { "epoch": 2.8184713375796178, "grad_norm": 0.4704822675115242, "learning_rate": 4.007557064172981e-06, "loss": 0.0148, "step": 6195 }, { "epoch": 2.818926296633303, "grad_norm": 0.4576744056253656, "learning_rate": 4.0061562471387364e-06, "loss": 0.0168, "step": 6196 }, { "epoch": 2.819381255686988, "grad_norm": 0.4640905261962327, "learning_rate": 4.004755511316913e-06, "loss": 0.0132, "step": 6197 }, { "epoch": 2.8198362147406733, "grad_norm": 0.5765526220206479, "learning_rate": 4.003354856821978e-06, "loss": 0.012, "step": 6198 }, { "epoch": 2.8202911737943586, "grad_norm": 0.4219405088038093, "learning_rate": 4.001954283768379e-06, "loss": 0.0133, "step": 6199 }, { "epoch": 2.8207461328480434, "grad_norm": 0.46041164012862307, "learning_rate": 4.0005537922705715e-06, "loss": 0.0161, "step": 6200 }, { "epoch": 2.8212010919017287, "grad_norm": 0.2979993492422895, "learning_rate": 3.999153382442995e-06, "loss": 0.011, "step": 6201 }, { "epoch": 2.821656050955414, "grad_norm": 0.5193864738255584, "learning_rate": 3.997753054400083e-06, "loss": 0.0224, "step": 6202 }, { "epoch": 2.822111010009099, "grad_norm": 0.263340828216671, "learning_rate": 3.996352808256267e-06, "loss": 0.0068, "step": 6203 }, { "epoch": 2.8225659690627842, "grad_norm": 0.9075308243517225, "learning_rate": 3.994952644125965e-06, "loss": 0.0379, "step": 6204 }, { "epoch": 2.8230209281164695, "grad_norm": 0.4606940433030956, "learning_rate": 3.993552562123596e-06, "loss": 0.0079, "step": 6205 }, { "epoch": 2.823475887170155, "grad_norm": 0.46382897126081213, "learning_rate": 3.9921525623635645e-06, "loss": 0.0242, "step": 6206 }, { "epoch": 2.8239308462238397, "grad_norm": 0.45456026962942275, "learning_rate": 3.990752644960275e-06, "loss": 0.0086, "step": 6207 }, { "epoch": 2.824385805277525, "grad_norm": 0.4109895851058085, "learning_rate": 3.989352810028123e-06, "loss": 0.0144, "step": 6208 }, { "epoch": 2.8248407643312103, "grad_norm": 0.44216401737597316, "learning_rate": 3.987953057681494e-06, "loss": 0.015, "step": 6209 }, { "epoch": 2.825295723384895, "grad_norm": 0.47281748571493637, "learning_rate": 3.986553388034772e-06, "loss": 0.0098, "step": 6210 }, { "epoch": 2.8257506824385805, "grad_norm": 0.5202086496180477, "learning_rate": 3.985153801202329e-06, "loss": 0.0115, "step": 6211 }, { "epoch": 2.826205641492266, "grad_norm": 0.42007418723643686, "learning_rate": 3.983754297298536e-06, "loss": 0.013, "step": 6212 }, { "epoch": 2.826660600545951, "grad_norm": 0.2580674306748603, "learning_rate": 3.98235487643775e-06, "loss": 0.0075, "step": 6213 }, { "epoch": 2.827115559599636, "grad_norm": 0.4831461625018207, "learning_rate": 3.980955538734329e-06, "loss": 0.0148, "step": 6214 }, { "epoch": 2.8275705186533213, "grad_norm": 0.4924236442432863, "learning_rate": 3.9795562843026205e-06, "loss": 0.0156, "step": 6215 }, { "epoch": 2.8280254777070066, "grad_norm": 0.34345252750675803, "learning_rate": 3.9781571132569644e-06, "loss": 0.0068, "step": 6216 }, { "epoch": 2.8284804367606915, "grad_norm": 0.3657755175918713, "learning_rate": 3.976758025711693e-06, "loss": 0.0131, "step": 6217 }, { "epoch": 2.828935395814377, "grad_norm": 0.3521811277945949, "learning_rate": 3.975359021781136e-06, "loss": 0.0114, "step": 6218 }, { "epoch": 2.829390354868062, "grad_norm": 0.6147186033564752, "learning_rate": 3.973960101579611e-06, "loss": 0.0241, "step": 6219 }, { "epoch": 2.829845313921747, "grad_norm": 0.2926985229278627, "learning_rate": 3.9725612652214325e-06, "loss": 0.0073, "step": 6220 }, { "epoch": 2.8303002729754323, "grad_norm": 0.5536621993472162, "learning_rate": 3.971162512820909e-06, "loss": 0.0288, "step": 6221 }, { "epoch": 2.8307552320291176, "grad_norm": 0.2359701509054112, "learning_rate": 3.969763844492338e-06, "loss": 0.0039, "step": 6222 }, { "epoch": 2.8312101910828025, "grad_norm": 0.45417416402719135, "learning_rate": 3.968365260350014e-06, "loss": 0.0095, "step": 6223 }, { "epoch": 2.831665150136488, "grad_norm": 0.4692648445169428, "learning_rate": 3.9669667605082215e-06, "loss": 0.0165, "step": 6224 }, { "epoch": 2.832120109190173, "grad_norm": 0.5301083101280761, "learning_rate": 3.965568345081242e-06, "loss": 0.0205, "step": 6225 }, { "epoch": 2.832575068243858, "grad_norm": 0.408721164035492, "learning_rate": 3.964170014183344e-06, "loss": 0.0111, "step": 6226 }, { "epoch": 2.8330300272975433, "grad_norm": 0.399372569637041, "learning_rate": 3.962771767928793e-06, "loss": 0.016, "step": 6227 }, { "epoch": 2.8334849863512286, "grad_norm": 0.2714697972879553, "learning_rate": 3.961373606431852e-06, "loss": 0.0082, "step": 6228 }, { "epoch": 2.8339399454049135, "grad_norm": 0.2513873152496805, "learning_rate": 3.959975529806767e-06, "loss": 0.006, "step": 6229 }, { "epoch": 2.8343949044585988, "grad_norm": 0.27400233417289505, "learning_rate": 3.958577538167788e-06, "loss": 0.0081, "step": 6230 }, { "epoch": 2.834849863512284, "grad_norm": 0.3389260811669458, "learning_rate": 3.957179631629148e-06, "loss": 0.0098, "step": 6231 }, { "epoch": 2.835304822565969, "grad_norm": 0.307024384609634, "learning_rate": 3.9557818103050794e-06, "loss": 0.0093, "step": 6232 }, { "epoch": 2.8357597816196543, "grad_norm": 0.16866023560329357, "learning_rate": 3.954384074309805e-06, "loss": 0.0032, "step": 6233 }, { "epoch": 2.8362147406733396, "grad_norm": 0.46015536527315654, "learning_rate": 3.952986423757541e-06, "loss": 0.0135, "step": 6234 }, { "epoch": 2.8366696997270244, "grad_norm": 0.3440336028007375, "learning_rate": 3.9515888587625e-06, "loss": 0.0089, "step": 6235 }, { "epoch": 2.8371246587807097, "grad_norm": 0.5030475499553019, "learning_rate": 3.9501913794388826e-06, "loss": 0.0243, "step": 6236 }, { "epoch": 2.837579617834395, "grad_norm": 0.2591496580193672, "learning_rate": 3.9487939859008855e-06, "loss": 0.0118, "step": 6237 }, { "epoch": 2.83803457688808, "grad_norm": 0.3818984107723288, "learning_rate": 3.947396678262696e-06, "loss": 0.0192, "step": 6238 }, { "epoch": 2.8384895359417652, "grad_norm": 0.31222358788521987, "learning_rate": 3.9459994566384965e-06, "loss": 0.0071, "step": 6239 }, { "epoch": 2.8389444949954505, "grad_norm": 0.37310107381749025, "learning_rate": 3.944602321142461e-06, "loss": 0.0109, "step": 6240 }, { "epoch": 2.8393994540491354, "grad_norm": 0.36063037626474487, "learning_rate": 3.943205271888757e-06, "loss": 0.0125, "step": 6241 }, { "epoch": 2.8398544131028207, "grad_norm": 0.3976547008235038, "learning_rate": 3.941808308991548e-06, "loss": 0.0119, "step": 6242 }, { "epoch": 2.840309372156506, "grad_norm": 0.42885646552793905, "learning_rate": 3.940411432564983e-06, "loss": 0.0198, "step": 6243 }, { "epoch": 2.840764331210191, "grad_norm": 0.32609775183349216, "learning_rate": 3.939014642723213e-06, "loss": 0.0082, "step": 6244 }, { "epoch": 2.841219290263876, "grad_norm": 0.2633611789673397, "learning_rate": 3.937617939580374e-06, "loss": 0.0095, "step": 6245 }, { "epoch": 2.8416742493175615, "grad_norm": 0.7468469436321076, "learning_rate": 3.936221323250599e-06, "loss": 0.0387, "step": 6246 }, { "epoch": 2.8421292083712464, "grad_norm": 0.46187922087775524, "learning_rate": 3.9348247938480134e-06, "loss": 0.0189, "step": 6247 }, { "epoch": 2.8425841674249317, "grad_norm": 0.3614147118478118, "learning_rate": 3.9334283514867334e-06, "loss": 0.0154, "step": 6248 }, { "epoch": 2.843039126478617, "grad_norm": 0.4203956918022225, "learning_rate": 3.932031996280875e-06, "loss": 0.0129, "step": 6249 }, { "epoch": 2.843494085532302, "grad_norm": 0.36877925475134965, "learning_rate": 3.9306357283445375e-06, "loss": 0.0072, "step": 6250 }, { "epoch": 2.843949044585987, "grad_norm": 0.31676464172553337, "learning_rate": 3.929239547791821e-06, "loss": 0.0113, "step": 6251 }, { "epoch": 2.8444040036396725, "grad_norm": 0.33370319234845225, "learning_rate": 3.927843454736812e-06, "loss": 0.0083, "step": 6252 }, { "epoch": 2.8448589626933574, "grad_norm": 0.4779432996177093, "learning_rate": 3.926447449293593e-06, "loss": 0.0165, "step": 6253 }, { "epoch": 2.8453139217470427, "grad_norm": 0.3448796729681847, "learning_rate": 3.925051531576242e-06, "loss": 0.0107, "step": 6254 }, { "epoch": 2.845768880800728, "grad_norm": 0.7920240022008865, "learning_rate": 3.923655701698823e-06, "loss": 0.0151, "step": 6255 }, { "epoch": 2.846223839854413, "grad_norm": 0.3929181089463895, "learning_rate": 3.922259959775401e-06, "loss": 0.0074, "step": 6256 }, { "epoch": 2.846678798908098, "grad_norm": 0.48258847782440906, "learning_rate": 3.920864305920028e-06, "loss": 0.0108, "step": 6257 }, { "epoch": 2.8471337579617835, "grad_norm": 0.3296749800384298, "learning_rate": 3.919468740246751e-06, "loss": 0.0087, "step": 6258 }, { "epoch": 2.8475887170154683, "grad_norm": 0.28240203229145405, "learning_rate": 3.9180732628696085e-06, "loss": 0.0069, "step": 6259 }, { "epoch": 2.8480436760691537, "grad_norm": 0.43034205793785113, "learning_rate": 3.916677873902633e-06, "loss": 0.0086, "step": 6260 }, { "epoch": 2.848498635122839, "grad_norm": 0.5889995269761055, "learning_rate": 3.91528257345985e-06, "loss": 0.0214, "step": 6261 }, { "epoch": 2.8489535941765243, "grad_norm": 0.2561110247434447, "learning_rate": 3.913887361655274e-06, "loss": 0.0047, "step": 6262 }, { "epoch": 2.849408553230209, "grad_norm": 0.4018927452114139, "learning_rate": 3.91249223860292e-06, "loss": 0.0103, "step": 6263 }, { "epoch": 2.8498635122838945, "grad_norm": 0.37873751535407935, "learning_rate": 3.9110972044167895e-06, "loss": 0.01, "step": 6264 }, { "epoch": 2.8503184713375798, "grad_norm": 0.30995861125402663, "learning_rate": 3.909702259210877e-06, "loss": 0.0122, "step": 6265 }, { "epoch": 2.8507734303912646, "grad_norm": 0.3161837188029331, "learning_rate": 3.908307403099173e-06, "loss": 0.0093, "step": 6266 }, { "epoch": 2.85122838944495, "grad_norm": 0.3327609618084383, "learning_rate": 3.906912636195658e-06, "loss": 0.0075, "step": 6267 }, { "epoch": 2.8516833484986353, "grad_norm": 0.32864109799528896, "learning_rate": 3.905517958614306e-06, "loss": 0.0075, "step": 6268 }, { "epoch": 2.8521383075523206, "grad_norm": 0.4799548165970622, "learning_rate": 3.904123370469082e-06, "loss": 0.0134, "step": 6269 }, { "epoch": 2.8525932666060054, "grad_norm": 0.42389617533635005, "learning_rate": 3.902728871873948e-06, "loss": 0.013, "step": 6270 }, { "epoch": 2.8530482256596907, "grad_norm": 0.33785952911566375, "learning_rate": 3.901334462942857e-06, "loss": 0.0109, "step": 6271 }, { "epoch": 2.853503184713376, "grad_norm": 0.5898203473860996, "learning_rate": 3.899940143789751e-06, "loss": 0.0194, "step": 6272 }, { "epoch": 2.853958143767061, "grad_norm": 0.3206891964404092, "learning_rate": 3.898545914528569e-06, "loss": 0.0101, "step": 6273 }, { "epoch": 2.8544131028207462, "grad_norm": 0.4364208443227221, "learning_rate": 3.89715177527324e-06, "loss": 0.0107, "step": 6274 }, { "epoch": 2.8548680618744315, "grad_norm": 0.5727255634194993, "learning_rate": 3.895757726137689e-06, "loss": 0.0149, "step": 6275 }, { "epoch": 2.8553230209281164, "grad_norm": 0.3050702500498504, "learning_rate": 3.894363767235827e-06, "loss": 0.0065, "step": 6276 }, { "epoch": 2.8557779799818017, "grad_norm": 0.45008964956059117, "learning_rate": 3.892969898681567e-06, "loss": 0.0128, "step": 6277 }, { "epoch": 2.856232939035487, "grad_norm": 0.512783180225894, "learning_rate": 3.891576120588808e-06, "loss": 0.0124, "step": 6278 }, { "epoch": 2.856687898089172, "grad_norm": 0.28781976414263555, "learning_rate": 3.890182433071442e-06, "loss": 0.0055, "step": 6279 }, { "epoch": 2.857142857142857, "grad_norm": 0.36475905568543177, "learning_rate": 3.8887888362433565e-06, "loss": 0.0114, "step": 6280 }, { "epoch": 2.8575978161965425, "grad_norm": 0.3528387944542802, "learning_rate": 3.887395330218429e-06, "loss": 0.0127, "step": 6281 }, { "epoch": 2.8580527752502274, "grad_norm": 0.49295591560837315, "learning_rate": 3.88600191511053e-06, "loss": 0.026, "step": 6282 }, { "epoch": 2.8585077343039127, "grad_norm": 0.3833227027075929, "learning_rate": 3.8846085910335226e-06, "loss": 0.0116, "step": 6283 }, { "epoch": 2.858962693357598, "grad_norm": 0.3048850317680643, "learning_rate": 3.883215358101266e-06, "loss": 0.0103, "step": 6284 }, { "epoch": 2.859417652411283, "grad_norm": 0.3343925798847499, "learning_rate": 3.881822216427607e-06, "loss": 0.0057, "step": 6285 }, { "epoch": 2.859872611464968, "grad_norm": 0.4629523464228457, "learning_rate": 3.880429166126385e-06, "loss": 0.0157, "step": 6286 }, { "epoch": 2.8603275705186535, "grad_norm": 0.5882468591085122, "learning_rate": 3.879036207311438e-06, "loss": 0.011, "step": 6287 }, { "epoch": 2.8607825295723384, "grad_norm": 0.35458341546101474, "learning_rate": 3.87764334009659e-06, "loss": 0.0163, "step": 6288 }, { "epoch": 2.8612374886260237, "grad_norm": 0.2276758875960399, "learning_rate": 3.876250564595658e-06, "loss": 0.005, "step": 6289 }, { "epoch": 2.861692447679709, "grad_norm": 0.29770107756072245, "learning_rate": 3.874857880922453e-06, "loss": 0.0064, "step": 6290 }, { "epoch": 2.862147406733394, "grad_norm": 0.41935572575138813, "learning_rate": 3.873465289190785e-06, "loss": 0.0092, "step": 6291 }, { "epoch": 2.862602365787079, "grad_norm": 0.3591339221779853, "learning_rate": 3.872072789514444e-06, "loss": 0.0071, "step": 6292 }, { "epoch": 2.8630573248407645, "grad_norm": 0.4375891096587897, "learning_rate": 3.870680382007223e-06, "loss": 0.0113, "step": 6293 }, { "epoch": 2.8635122838944493, "grad_norm": 0.3684019269047062, "learning_rate": 3.869288066782898e-06, "loss": 0.0117, "step": 6294 }, { "epoch": 2.8639672429481347, "grad_norm": 0.40391472870105816, "learning_rate": 3.867895843955249e-06, "loss": 0.0065, "step": 6295 }, { "epoch": 2.86442220200182, "grad_norm": 0.3903734654988649, "learning_rate": 3.866503713638037e-06, "loss": 0.0077, "step": 6296 }, { "epoch": 2.864877161055505, "grad_norm": 0.46125587147120434, "learning_rate": 3.8651116759450225e-06, "loss": 0.0154, "step": 6297 }, { "epoch": 2.86533212010919, "grad_norm": 0.5955787823821846, "learning_rate": 3.863719730989958e-06, "loss": 0.0172, "step": 6298 }, { "epoch": 2.8657870791628755, "grad_norm": 0.366758042196921, "learning_rate": 3.862327878886585e-06, "loss": 0.009, "step": 6299 }, { "epoch": 2.8662420382165603, "grad_norm": 0.5224307717050332, "learning_rate": 3.860936119748642e-06, "loss": 0.0194, "step": 6300 }, { "epoch": 2.8666969972702456, "grad_norm": 0.40543025280630457, "learning_rate": 3.859544453689853e-06, "loss": 0.0133, "step": 6301 }, { "epoch": 2.867151956323931, "grad_norm": 0.36560351191576285, "learning_rate": 3.858152880823943e-06, "loss": 0.0094, "step": 6302 }, { "epoch": 2.867606915377616, "grad_norm": 0.4582924402202266, "learning_rate": 3.856761401264621e-06, "loss": 0.0232, "step": 6303 }, { "epoch": 2.868061874431301, "grad_norm": 0.29322164088269265, "learning_rate": 3.8553700151255935e-06, "loss": 0.0057, "step": 6304 }, { "epoch": 2.8685168334849864, "grad_norm": 0.391556775245077, "learning_rate": 3.853978722520562e-06, "loss": 0.014, "step": 6305 }, { "epoch": 2.8689717925386713, "grad_norm": 0.3714143208914766, "learning_rate": 3.8525875235632125e-06, "loss": 0.011, "step": 6306 }, { "epoch": 2.8694267515923566, "grad_norm": 0.3966665180003109, "learning_rate": 3.85119641836723e-06, "loss": 0.0178, "step": 6307 }, { "epoch": 2.869881710646042, "grad_norm": 0.2592938059452948, "learning_rate": 3.849805407046288e-06, "loss": 0.0049, "step": 6308 }, { "epoch": 2.870336669699727, "grad_norm": 0.5038771572877041, "learning_rate": 3.848414489714054e-06, "loss": 0.0157, "step": 6309 }, { "epoch": 2.870791628753412, "grad_norm": 0.4286736006088806, "learning_rate": 3.847023666484187e-06, "loss": 0.0202, "step": 6310 }, { "epoch": 2.8712465878070974, "grad_norm": 0.47034164071342965, "learning_rate": 3.8456329374703375e-06, "loss": 0.0179, "step": 6311 }, { "epoch": 2.8717015468607823, "grad_norm": 0.34124052176861547, "learning_rate": 3.844242302786153e-06, "loss": 0.0087, "step": 6312 }, { "epoch": 2.8721565059144676, "grad_norm": 0.3814836081503508, "learning_rate": 3.8428517625452675e-06, "loss": 0.0067, "step": 6313 }, { "epoch": 2.872611464968153, "grad_norm": 0.34100076799914025, "learning_rate": 3.841461316861312e-06, "loss": 0.0105, "step": 6314 }, { "epoch": 2.8730664240218378, "grad_norm": 0.6781933350158105, "learning_rate": 3.840070965847904e-06, "loss": 0.0182, "step": 6315 }, { "epoch": 2.873521383075523, "grad_norm": 0.4155545550670509, "learning_rate": 3.83868070961866e-06, "loss": 0.0101, "step": 6316 }, { "epoch": 2.8739763421292084, "grad_norm": 0.4086492845096628, "learning_rate": 3.837290548287183e-06, "loss": 0.0127, "step": 6317 }, { "epoch": 2.8744313011828937, "grad_norm": 0.414005074607464, "learning_rate": 3.83590048196707e-06, "loss": 0.0216, "step": 6318 }, { "epoch": 2.8748862602365786, "grad_norm": 0.3042880022208222, "learning_rate": 3.834510510771914e-06, "loss": 0.0085, "step": 6319 }, { "epoch": 2.875341219290264, "grad_norm": 0.40440982052288527, "learning_rate": 3.833120634815296e-06, "loss": 0.019, "step": 6320 }, { "epoch": 2.875796178343949, "grad_norm": 0.26802996831161435, "learning_rate": 3.831730854210791e-06, "loss": 0.0054, "step": 6321 }, { "epoch": 2.876251137397634, "grad_norm": 0.7323334230102913, "learning_rate": 3.830341169071965e-06, "loss": 0.0242, "step": 6322 }, { "epoch": 2.8767060964513194, "grad_norm": 0.4133971361129842, "learning_rate": 3.828951579512374e-06, "loss": 0.013, "step": 6323 }, { "epoch": 2.8771610555050047, "grad_norm": 0.35954978352833555, "learning_rate": 3.827562085645574e-06, "loss": 0.0161, "step": 6324 }, { "epoch": 2.87761601455869, "grad_norm": 0.30090268898226913, "learning_rate": 3.826172687585104e-06, "loss": 0.0094, "step": 6325 }, { "epoch": 2.878070973612375, "grad_norm": 0.5403862929387374, "learning_rate": 3.824783385444501e-06, "loss": 0.0178, "step": 6326 }, { "epoch": 2.87852593266606, "grad_norm": 0.3140167732550238, "learning_rate": 3.8233941793372934e-06, "loss": 0.0081, "step": 6327 }, { "epoch": 2.8789808917197455, "grad_norm": 0.39000912243012076, "learning_rate": 3.822005069377e-06, "loss": 0.0124, "step": 6328 }, { "epoch": 2.8794358507734303, "grad_norm": 0.3286182522237254, "learning_rate": 3.820616055677132e-06, "loss": 0.0082, "step": 6329 }, { "epoch": 2.8798908098271156, "grad_norm": 0.3373701375250453, "learning_rate": 3.819227138351194e-06, "loss": 0.0096, "step": 6330 }, { "epoch": 2.880345768880801, "grad_norm": 0.5454728178026315, "learning_rate": 3.817838317512683e-06, "loss": 0.0184, "step": 6331 }, { "epoch": 2.880800727934486, "grad_norm": 0.48750702796293294, "learning_rate": 3.8164495932750835e-06, "loss": 0.0143, "step": 6332 }, { "epoch": 2.881255686988171, "grad_norm": 0.3748304025492812, "learning_rate": 3.81506096575188e-06, "loss": 0.013, "step": 6333 }, { "epoch": 2.8817106460418564, "grad_norm": 0.2617319646306041, "learning_rate": 3.813672435056544e-06, "loss": 0.0074, "step": 6334 }, { "epoch": 2.8821656050955413, "grad_norm": 0.3476047473437665, "learning_rate": 3.812284001302538e-06, "loss": 0.0118, "step": 6335 }, { "epoch": 2.8826205641492266, "grad_norm": 0.3286218231881052, "learning_rate": 3.8108956646033214e-06, "loss": 0.0083, "step": 6336 }, { "epoch": 2.883075523202912, "grad_norm": 0.4079602279689255, "learning_rate": 3.809507425072339e-06, "loss": 0.0124, "step": 6337 }, { "epoch": 2.883530482256597, "grad_norm": 0.3473277015907147, "learning_rate": 3.808119282823035e-06, "loss": 0.0078, "step": 6338 }, { "epoch": 2.883985441310282, "grad_norm": 0.6991876672781839, "learning_rate": 3.8067312379688393e-06, "loss": 0.0194, "step": 6339 }, { "epoch": 2.8844404003639674, "grad_norm": 0.22675332639990367, "learning_rate": 3.8053432906231786e-06, "loss": 0.0052, "step": 6340 }, { "epoch": 2.8848953594176523, "grad_norm": 0.49560045249468987, "learning_rate": 3.8039554408994707e-06, "loss": 0.015, "step": 6341 }, { "epoch": 2.8853503184713376, "grad_norm": 0.36076854600275526, "learning_rate": 3.802567688911121e-06, "loss": 0.0103, "step": 6342 }, { "epoch": 2.885805277525023, "grad_norm": 0.40202946834776543, "learning_rate": 3.801180034771534e-06, "loss": 0.0113, "step": 6343 }, { "epoch": 2.886260236578708, "grad_norm": 0.3789777648716472, "learning_rate": 3.7997924785940992e-06, "loss": 0.0128, "step": 6344 }, { "epoch": 2.886715195632393, "grad_norm": 0.503523378489957, "learning_rate": 3.798405020492204e-06, "loss": 0.0177, "step": 6345 }, { "epoch": 2.8871701546860784, "grad_norm": 0.45704213837144136, "learning_rate": 3.7970176605792227e-06, "loss": 0.0218, "step": 6346 }, { "epoch": 2.8876251137397633, "grad_norm": 0.3884640190831784, "learning_rate": 3.7956303989685263e-06, "loss": 0.0132, "step": 6347 }, { "epoch": 2.8880800727934486, "grad_norm": 0.3950973283624611, "learning_rate": 3.7942432357734756e-06, "loss": 0.0165, "step": 6348 }, { "epoch": 2.888535031847134, "grad_norm": 0.36843284102790197, "learning_rate": 3.7928561711074212e-06, "loss": 0.005, "step": 6349 }, { "epoch": 2.8889899909008188, "grad_norm": 0.40221758497654353, "learning_rate": 3.791469205083711e-06, "loss": 0.0147, "step": 6350 }, { "epoch": 2.889444949954504, "grad_norm": 0.4207977527715021, "learning_rate": 3.790082337815678e-06, "loss": 0.0129, "step": 6351 }, { "epoch": 2.8898999090081894, "grad_norm": 0.3079913969975454, "learning_rate": 3.788695569416654e-06, "loss": 0.0068, "step": 6352 }, { "epoch": 2.8903548680618742, "grad_norm": 0.4885305823729606, "learning_rate": 3.7873088999999553e-06, "loss": 0.0181, "step": 6353 }, { "epoch": 2.8908098271155596, "grad_norm": 0.3543962167794279, "learning_rate": 3.785922329678898e-06, "loss": 0.0098, "step": 6354 }, { "epoch": 2.891264786169245, "grad_norm": 0.4228251555050592, "learning_rate": 3.784535858566786e-06, "loss": 0.0214, "step": 6355 }, { "epoch": 2.8917197452229297, "grad_norm": 0.33124874600109977, "learning_rate": 3.7831494867769134e-06, "loss": 0.0071, "step": 6356 }, { "epoch": 2.892174704276615, "grad_norm": 0.41657557881618507, "learning_rate": 3.7817632144225713e-06, "loss": 0.0113, "step": 6357 }, { "epoch": 2.8926296633303004, "grad_norm": 0.3291065357445098, "learning_rate": 3.780377041617037e-06, "loss": 0.0062, "step": 6358 }, { "epoch": 2.8930846223839852, "grad_norm": 0.3615507370786119, "learning_rate": 3.7789909684735825e-06, "loss": 0.0096, "step": 6359 }, { "epoch": 2.8935395814376705, "grad_norm": 0.4012291148091594, "learning_rate": 3.7776049951054718e-06, "loss": 0.0084, "step": 6360 }, { "epoch": 2.893994540491356, "grad_norm": 0.45028283597993485, "learning_rate": 3.7762191216259613e-06, "loss": 0.0134, "step": 6361 }, { "epoch": 2.8944494995450407, "grad_norm": 0.3255175306272122, "learning_rate": 3.7748333481482974e-06, "loss": 0.0125, "step": 6362 }, { "epoch": 2.894904458598726, "grad_norm": 0.31382961258144904, "learning_rate": 3.77344767478572e-06, "loss": 0.0078, "step": 6363 }, { "epoch": 2.8953594176524113, "grad_norm": 0.21500250648441935, "learning_rate": 3.7720621016514593e-06, "loss": 0.0027, "step": 6364 }, { "epoch": 2.895814376706096, "grad_norm": 0.521065049666442, "learning_rate": 3.7706766288587386e-06, "loss": 0.0173, "step": 6365 }, { "epoch": 2.8962693357597815, "grad_norm": 0.319897868405258, "learning_rate": 3.7692912565207716e-06, "loss": 0.0071, "step": 6366 }, { "epoch": 2.896724294813467, "grad_norm": 0.4516402028839931, "learning_rate": 3.7679059847507643e-06, "loss": 0.013, "step": 6367 }, { "epoch": 2.8971792538671517, "grad_norm": 0.3595984074926371, "learning_rate": 3.7665208136619176e-06, "loss": 0.0127, "step": 6368 }, { "epoch": 2.897634212920837, "grad_norm": 0.4642847466205341, "learning_rate": 3.7651357433674187e-06, "loss": 0.0159, "step": 6369 }, { "epoch": 2.8980891719745223, "grad_norm": 0.33327002165436054, "learning_rate": 3.763750773980451e-06, "loss": 0.0067, "step": 6370 }, { "epoch": 2.8985441310282076, "grad_norm": 0.4191259812612057, "learning_rate": 3.762365905614187e-06, "loss": 0.0221, "step": 6371 }, { "epoch": 2.8989990900818925, "grad_norm": 0.4272225721959623, "learning_rate": 3.760981138381793e-06, "loss": 0.0135, "step": 6372 }, { "epoch": 2.899454049135578, "grad_norm": 0.29475046938215244, "learning_rate": 3.7595964723964236e-06, "loss": 0.0068, "step": 6373 }, { "epoch": 2.899909008189263, "grad_norm": 0.5355429451634685, "learning_rate": 3.7582119077712277e-06, "loss": 0.011, "step": 6374 }, { "epoch": 2.900363967242948, "grad_norm": 0.4908348557141388, "learning_rate": 3.7568274446193486e-06, "loss": 0.0158, "step": 6375 }, { "epoch": 2.9008189262966333, "grad_norm": 0.39466071047943796, "learning_rate": 3.7554430830539164e-06, "loss": 0.0124, "step": 6376 }, { "epoch": 2.9012738853503186, "grad_norm": 0.2776723609453031, "learning_rate": 3.7540588231880557e-06, "loss": 0.0046, "step": 6377 }, { "epoch": 2.901728844404004, "grad_norm": 0.7685620841223039, "learning_rate": 3.75267466513488e-06, "loss": 0.0245, "step": 6378 }, { "epoch": 2.902183803457689, "grad_norm": 0.5251658551775495, "learning_rate": 3.7512906090074997e-06, "loss": 0.0172, "step": 6379 }, { "epoch": 2.902638762511374, "grad_norm": 0.3884955696657931, "learning_rate": 3.74990665491901e-06, "loss": 0.0095, "step": 6380 }, { "epoch": 2.9030937215650594, "grad_norm": 0.271630438033093, "learning_rate": 3.748522802982502e-06, "loss": 0.007, "step": 6381 }, { "epoch": 2.9035486806187443, "grad_norm": 0.378895331137123, "learning_rate": 3.747139053311061e-06, "loss": 0.0141, "step": 6382 }, { "epoch": 2.9040036396724296, "grad_norm": 0.3110405139589322, "learning_rate": 3.745755406017758e-06, "loss": 0.0095, "step": 6383 }, { "epoch": 2.904458598726115, "grad_norm": 0.3398426847127566, "learning_rate": 3.74437186121566e-06, "loss": 0.0092, "step": 6384 }, { "epoch": 2.9049135577797998, "grad_norm": 0.3518693864415104, "learning_rate": 3.7429884190178224e-06, "loss": 0.0096, "step": 6385 }, { "epoch": 2.905368516833485, "grad_norm": 0.5256974818520146, "learning_rate": 3.741605079537295e-06, "loss": 0.0167, "step": 6386 }, { "epoch": 2.9058234758871704, "grad_norm": 0.4092907395202486, "learning_rate": 3.740221842887117e-06, "loss": 0.0108, "step": 6387 }, { "epoch": 2.9062784349408552, "grad_norm": 0.34404946823911237, "learning_rate": 3.7388387091803204e-06, "loss": 0.0113, "step": 6388 }, { "epoch": 2.9067333939945406, "grad_norm": 0.28834987095222475, "learning_rate": 3.73745567852993e-06, "loss": 0.0107, "step": 6389 }, { "epoch": 2.907188353048226, "grad_norm": 0.3095882977232282, "learning_rate": 3.73607275104896e-06, "loss": 0.0094, "step": 6390 }, { "epoch": 2.9076433121019107, "grad_norm": 0.3807601767809069, "learning_rate": 3.7346899268504174e-06, "loss": 0.012, "step": 6391 }, { "epoch": 2.908098271155596, "grad_norm": 0.3260098412889043, "learning_rate": 3.7333072060472987e-06, "loss": 0.009, "step": 6392 }, { "epoch": 2.9085532302092814, "grad_norm": 0.5427951597890734, "learning_rate": 3.7319245887525956e-06, "loss": 0.0146, "step": 6393 }, { "epoch": 2.9090081892629662, "grad_norm": 0.39000578811549785, "learning_rate": 3.73054207507929e-06, "loss": 0.0105, "step": 6394 }, { "epoch": 2.9094631483166515, "grad_norm": 0.4186877479062313, "learning_rate": 3.729159665140348e-06, "loss": 0.0117, "step": 6395 }, { "epoch": 2.909918107370337, "grad_norm": 0.4922890078770253, "learning_rate": 3.7277773590487436e-06, "loss": 0.0146, "step": 6396 }, { "epoch": 2.9103730664240217, "grad_norm": 0.3147800308833414, "learning_rate": 3.726395156917428e-06, "loss": 0.0125, "step": 6397 }, { "epoch": 2.910828025477707, "grad_norm": 0.2879171961223307, "learning_rate": 3.7250130588593467e-06, "loss": 0.0086, "step": 6398 }, { "epoch": 2.9112829845313923, "grad_norm": 0.3067206030759571, "learning_rate": 3.723631064987443e-06, "loss": 0.0115, "step": 6399 }, { "epoch": 2.911737943585077, "grad_norm": 0.3049912993314163, "learning_rate": 3.722249175414643e-06, "loss": 0.0103, "step": 6400 }, { "epoch": 2.9121929026387625, "grad_norm": 0.4957445043143095, "learning_rate": 3.7208673902538705e-06, "loss": 0.0124, "step": 6401 }, { "epoch": 2.912647861692448, "grad_norm": 0.41912048390994716, "learning_rate": 3.7194857096180366e-06, "loss": 0.0157, "step": 6402 }, { "epoch": 2.9131028207461327, "grad_norm": 0.3220786981792336, "learning_rate": 3.7181041336200485e-06, "loss": 0.0055, "step": 6403 }, { "epoch": 2.913557779799818, "grad_norm": 0.3922854078706956, "learning_rate": 3.7167226623728035e-06, "loss": 0.0125, "step": 6404 }, { "epoch": 2.9140127388535033, "grad_norm": 0.41602097384647796, "learning_rate": 3.7153412959891856e-06, "loss": 0.0142, "step": 6405 }, { "epoch": 2.914467697907188, "grad_norm": 0.40619529249645336, "learning_rate": 3.713960034582077e-06, "loss": 0.0132, "step": 6406 }, { "epoch": 2.9149226569608735, "grad_norm": 0.2815417818515389, "learning_rate": 3.712578878264345e-06, "loss": 0.0094, "step": 6407 }, { "epoch": 2.915377616014559, "grad_norm": 0.24659431801693033, "learning_rate": 3.7111978271488546e-06, "loss": 0.0063, "step": 6408 }, { "epoch": 2.9158325750682437, "grad_norm": 0.23984002812085403, "learning_rate": 3.709816881348456e-06, "loss": 0.0055, "step": 6409 }, { "epoch": 2.916287534121929, "grad_norm": 0.46157485439988916, "learning_rate": 3.7084360409759956e-06, "loss": 0.0106, "step": 6410 }, { "epoch": 2.9167424931756143, "grad_norm": 0.4257357090766481, "learning_rate": 3.7070553061443106e-06, "loss": 0.0177, "step": 6411 }, { "epoch": 2.917197452229299, "grad_norm": 0.4430704949620385, "learning_rate": 3.705674676966226e-06, "loss": 0.0082, "step": 6412 }, { "epoch": 2.9176524112829845, "grad_norm": 0.43743754168504323, "learning_rate": 3.7042941535545628e-06, "loss": 0.0095, "step": 6413 }, { "epoch": 2.91810737033667, "grad_norm": 0.2983734567492093, "learning_rate": 3.702913736022129e-06, "loss": 0.0071, "step": 6414 }, { "epoch": 2.9185623293903546, "grad_norm": 0.282368325605843, "learning_rate": 3.701533424481728e-06, "loss": 0.0061, "step": 6415 }, { "epoch": 2.91901728844404, "grad_norm": 0.2812285450319493, "learning_rate": 3.7001532190461497e-06, "loss": 0.0035, "step": 6416 }, { "epoch": 2.9194722474977253, "grad_norm": 0.2779456517793231, "learning_rate": 3.698773119828182e-06, "loss": 0.0096, "step": 6417 }, { "epoch": 2.91992720655141, "grad_norm": 0.36421628202503153, "learning_rate": 3.6973931269405994e-06, "loss": 0.0105, "step": 6418 }, { "epoch": 2.9203821656050954, "grad_norm": 0.30592615966929854, "learning_rate": 3.696013240496166e-06, "loss": 0.0092, "step": 6419 }, { "epoch": 2.9208371246587808, "grad_norm": 0.2455435362824365, "learning_rate": 3.694633460607644e-06, "loss": 0.0056, "step": 6420 }, { "epoch": 2.9212920837124656, "grad_norm": 0.4095843281421876, "learning_rate": 3.693253787387779e-06, "loss": 0.0122, "step": 6421 }, { "epoch": 2.921747042766151, "grad_norm": 0.37739285852106713, "learning_rate": 3.691874220949314e-06, "loss": 0.01, "step": 6422 }, { "epoch": 2.9222020018198362, "grad_norm": 0.35981876257094425, "learning_rate": 3.690494761404979e-06, "loss": 0.0068, "step": 6423 }, { "epoch": 2.922656960873521, "grad_norm": 0.38381891847358335, "learning_rate": 3.6891154088674985e-06, "loss": 0.0096, "step": 6424 }, { "epoch": 2.9231119199272064, "grad_norm": 0.5358702715105391, "learning_rate": 3.687736163449589e-06, "loss": 0.0166, "step": 6425 }, { "epoch": 2.9235668789808917, "grad_norm": 0.3463906828200289, "learning_rate": 3.6863570252639522e-06, "loss": 0.0092, "step": 6426 }, { "epoch": 2.924021838034577, "grad_norm": 0.3390637291243101, "learning_rate": 3.6849779944232885e-06, "loss": 0.0069, "step": 6427 }, { "epoch": 2.924476797088262, "grad_norm": 0.4496711440316595, "learning_rate": 3.683599071040283e-06, "loss": 0.0182, "step": 6428 }, { "epoch": 2.9249317561419472, "grad_norm": 0.40106220012278543, "learning_rate": 3.6822202552276176e-06, "loss": 0.0118, "step": 6429 }, { "epoch": 2.9253867151956325, "grad_norm": 0.41005481151564804, "learning_rate": 3.6808415470979602e-06, "loss": 0.0131, "step": 6430 }, { "epoch": 2.9258416742493174, "grad_norm": 0.27832402660490174, "learning_rate": 3.679462946763975e-06, "loss": 0.0074, "step": 6431 }, { "epoch": 2.9262966333030027, "grad_norm": 0.41817783001301695, "learning_rate": 3.678084454338316e-06, "loss": 0.0076, "step": 6432 }, { "epoch": 2.926751592356688, "grad_norm": 0.40978520701397403, "learning_rate": 3.6767060699336253e-06, "loss": 0.0152, "step": 6433 }, { "epoch": 2.9272065514103733, "grad_norm": 0.37563857566681513, "learning_rate": 3.6753277936625374e-06, "loss": 0.0055, "step": 6434 }, { "epoch": 2.927661510464058, "grad_norm": 0.4103564455898853, "learning_rate": 3.6739496256376816e-06, "loss": 0.0069, "step": 6435 }, { "epoch": 2.9281164695177435, "grad_norm": 0.38379673912392054, "learning_rate": 3.672571565971672e-06, "loss": 0.014, "step": 6436 }, { "epoch": 2.928571428571429, "grad_norm": 0.41343362721512666, "learning_rate": 3.671193614777119e-06, "loss": 0.0178, "step": 6437 }, { "epoch": 2.9290263876251137, "grad_norm": 0.5279136335701177, "learning_rate": 3.669815772166625e-06, "loss": 0.0163, "step": 6438 }, { "epoch": 2.929481346678799, "grad_norm": 0.715206835259677, "learning_rate": 3.6684380382527784e-06, "loss": 0.0252, "step": 6439 }, { "epoch": 2.9299363057324843, "grad_norm": 0.4699215733523937, "learning_rate": 3.6670604131481625e-06, "loss": 0.0181, "step": 6440 }, { "epoch": 2.930391264786169, "grad_norm": 0.43942198974305297, "learning_rate": 3.665682896965349e-06, "loss": 0.0204, "step": 6441 }, { "epoch": 2.9308462238398545, "grad_norm": 0.3618401829398822, "learning_rate": 3.664305489816905e-06, "loss": 0.0114, "step": 6442 }, { "epoch": 2.93130118289354, "grad_norm": 0.5945058832165803, "learning_rate": 3.662928191815383e-06, "loss": 0.0215, "step": 6443 }, { "epoch": 2.9317561419472247, "grad_norm": 0.22962348680888112, "learning_rate": 3.6615510030733302e-06, "loss": 0.0076, "step": 6444 }, { "epoch": 2.93221110100091, "grad_norm": 0.3394510298500849, "learning_rate": 3.660173923703287e-06, "loss": 0.0122, "step": 6445 }, { "epoch": 2.9326660600545953, "grad_norm": 0.5258261357226683, "learning_rate": 3.6587969538177793e-06, "loss": 0.0169, "step": 6446 }, { "epoch": 2.93312101910828, "grad_norm": 0.5012672727554425, "learning_rate": 3.657420093529328e-06, "loss": 0.0167, "step": 6447 }, { "epoch": 2.9335759781619655, "grad_norm": 0.2731143247747597, "learning_rate": 3.656043342950443e-06, "loss": 0.0067, "step": 6448 }, { "epoch": 2.934030937215651, "grad_norm": 0.41294977607930694, "learning_rate": 3.6546667021936284e-06, "loss": 0.0115, "step": 6449 }, { "epoch": 2.9344858962693356, "grad_norm": 0.6537779837928394, "learning_rate": 3.6532901713713742e-06, "loss": 0.0189, "step": 6450 }, { "epoch": 2.934940855323021, "grad_norm": 0.3978700759394602, "learning_rate": 3.6519137505961636e-06, "loss": 0.0115, "step": 6451 }, { "epoch": 2.9353958143767063, "grad_norm": 0.3942439895320119, "learning_rate": 3.650537439980476e-06, "loss": 0.0144, "step": 6452 }, { "epoch": 2.935850773430391, "grad_norm": 0.6015376869042698, "learning_rate": 3.649161239636774e-06, "loss": 0.0187, "step": 6453 }, { "epoch": 2.9363057324840764, "grad_norm": 0.3245374606574856, "learning_rate": 3.647785149677516e-06, "loss": 0.0117, "step": 6454 }, { "epoch": 2.9367606915377618, "grad_norm": 0.5037273302868972, "learning_rate": 3.6464091702151484e-06, "loss": 0.026, "step": 6455 }, { "epoch": 2.9372156505914466, "grad_norm": 0.5511274356127235, "learning_rate": 3.645033301362111e-06, "loss": 0.0178, "step": 6456 }, { "epoch": 2.937670609645132, "grad_norm": 0.42975626401719114, "learning_rate": 3.643657543230832e-06, "loss": 0.0127, "step": 6457 }, { "epoch": 2.9381255686988172, "grad_norm": 0.48340126395934097, "learning_rate": 3.6422818959337326e-06, "loss": 0.0229, "step": 6458 }, { "epoch": 2.938580527752502, "grad_norm": 0.3623128074406011, "learning_rate": 3.640906359583228e-06, "loss": 0.0094, "step": 6459 }, { "epoch": 2.9390354868061874, "grad_norm": 0.3466250150212331, "learning_rate": 3.6395309342917163e-06, "loss": 0.0063, "step": 6460 }, { "epoch": 2.9394904458598727, "grad_norm": 0.6088263100387632, "learning_rate": 3.638155620171594e-06, "loss": 0.0186, "step": 6461 }, { "epoch": 2.9399454049135576, "grad_norm": 0.5086846303566309, "learning_rate": 3.6367804173352434e-06, "loss": 0.0161, "step": 6462 }, { "epoch": 2.940400363967243, "grad_norm": 0.28554776742467947, "learning_rate": 3.6354053258950416e-06, "loss": 0.0116, "step": 6463 }, { "epoch": 2.9408553230209282, "grad_norm": 0.33621246049434594, "learning_rate": 3.6340303459633553e-06, "loss": 0.0088, "step": 6464 }, { "epoch": 2.941310282074613, "grad_norm": 0.313175095500926, "learning_rate": 3.6326554776525357e-06, "loss": 0.0079, "step": 6465 }, { "epoch": 2.9417652411282984, "grad_norm": 0.42398447161072816, "learning_rate": 3.63128072107494e-06, "loss": 0.012, "step": 6466 }, { "epoch": 2.9422202001819837, "grad_norm": 0.37481151861391304, "learning_rate": 3.6299060763429016e-06, "loss": 0.0126, "step": 6467 }, { "epoch": 2.9426751592356686, "grad_norm": 0.3641669379794496, "learning_rate": 3.628531543568751e-06, "loss": 0.0175, "step": 6468 }, { "epoch": 2.943130118289354, "grad_norm": 0.39785299336729835, "learning_rate": 3.6271571228648108e-06, "loss": 0.0082, "step": 6469 }, { "epoch": 2.943585077343039, "grad_norm": 0.378413719535719, "learning_rate": 3.6257828143433886e-06, "loss": 0.0108, "step": 6470 }, { "epoch": 2.944040036396724, "grad_norm": 0.5402550307213921, "learning_rate": 3.6244086181167903e-06, "loss": 0.0215, "step": 6471 }, { "epoch": 2.9444949954504094, "grad_norm": 0.33770455857948545, "learning_rate": 3.623034534297306e-06, "loss": 0.0114, "step": 6472 }, { "epoch": 2.9449499545040947, "grad_norm": 0.2765679731681435, "learning_rate": 3.621660562997222e-06, "loss": 0.0088, "step": 6473 }, { "epoch": 2.9454049135577796, "grad_norm": 0.3523143978385394, "learning_rate": 3.6202867043288126e-06, "loss": 0.0081, "step": 6474 }, { "epoch": 2.945859872611465, "grad_norm": 0.5230423166762636, "learning_rate": 3.6189129584043426e-06, "loss": 0.0174, "step": 6475 }, { "epoch": 2.94631483166515, "grad_norm": 0.359405361480886, "learning_rate": 3.6175393253360704e-06, "loss": 0.0089, "step": 6476 }, { "epoch": 2.946769790718835, "grad_norm": 0.3911789410996822, "learning_rate": 3.6161658052362393e-06, "loss": 0.0113, "step": 6477 }, { "epoch": 2.9472247497725204, "grad_norm": 0.3744567594377753, "learning_rate": 3.6147923982170906e-06, "loss": 0.0124, "step": 6478 }, { "epoch": 2.9476797088262057, "grad_norm": 0.3385001640787944, "learning_rate": 3.6134191043908496e-06, "loss": 0.0126, "step": 6479 }, { "epoch": 2.9481346678798905, "grad_norm": 0.4244177104253812, "learning_rate": 3.6120459238697387e-06, "loss": 0.0232, "step": 6480 }, { "epoch": 2.948589626933576, "grad_norm": 0.374756040858105, "learning_rate": 3.610672856765968e-06, "loss": 0.0152, "step": 6481 }, { "epoch": 2.949044585987261, "grad_norm": 0.40130996389176443, "learning_rate": 3.6092999031917366e-06, "loss": 0.0057, "step": 6482 }, { "epoch": 2.9494995450409465, "grad_norm": 0.6906072685479914, "learning_rate": 3.607927063259237e-06, "loss": 0.0253, "step": 6483 }, { "epoch": 2.9499545040946313, "grad_norm": 0.41175261532553814, "learning_rate": 3.6065543370806504e-06, "loss": 0.007, "step": 6484 }, { "epoch": 2.9504094631483166, "grad_norm": 0.4425109093139479, "learning_rate": 3.605181724768152e-06, "loss": 0.0119, "step": 6485 }, { "epoch": 2.950864422202002, "grad_norm": 0.5125659469748413, "learning_rate": 3.603809226433902e-06, "loss": 0.0108, "step": 6486 }, { "epoch": 2.951319381255687, "grad_norm": 0.21762571448281992, "learning_rate": 3.602436842190058e-06, "loss": 0.0061, "step": 6487 }, { "epoch": 2.951774340309372, "grad_norm": 0.28720286244589455, "learning_rate": 3.6010645721487648e-06, "loss": 0.0061, "step": 6488 }, { "epoch": 2.9522292993630574, "grad_norm": 0.2508069191871544, "learning_rate": 3.5996924164221558e-06, "loss": 0.0056, "step": 6489 }, { "epoch": 2.9526842584167428, "grad_norm": 0.339259444336096, "learning_rate": 3.5983203751223605e-06, "loss": 0.0122, "step": 6490 }, { "epoch": 2.9531392174704276, "grad_norm": 0.5387079386311976, "learning_rate": 3.5969484483614923e-06, "loss": 0.0269, "step": 6491 }, { "epoch": 2.953594176524113, "grad_norm": 0.34316022717814354, "learning_rate": 3.595576636251663e-06, "loss": 0.0104, "step": 6492 }, { "epoch": 2.9540491355777982, "grad_norm": 0.346253213252283, "learning_rate": 3.594204938904966e-06, "loss": 0.0095, "step": 6493 }, { "epoch": 2.954504094631483, "grad_norm": 0.41042946731191354, "learning_rate": 3.5928333564334937e-06, "loss": 0.0072, "step": 6494 }, { "epoch": 2.9549590536851684, "grad_norm": 0.3459032386002424, "learning_rate": 3.591461888949326e-06, "loss": 0.0133, "step": 6495 }, { "epoch": 2.9554140127388537, "grad_norm": 0.33990718467059117, "learning_rate": 3.590090536564531e-06, "loss": 0.0111, "step": 6496 }, { "epoch": 2.9558689717925386, "grad_norm": 0.42419889987542986, "learning_rate": 3.588719299391171e-06, "loss": 0.0206, "step": 6497 }, { "epoch": 2.956323930846224, "grad_norm": 0.4111216623523516, "learning_rate": 3.5873481775412957e-06, "loss": 0.0092, "step": 6498 }, { "epoch": 2.9567788898999092, "grad_norm": 0.438032802285073, "learning_rate": 3.5859771711269486e-06, "loss": 0.0139, "step": 6499 }, { "epoch": 2.957233848953594, "grad_norm": 0.3627021964701024, "learning_rate": 3.58460628026016e-06, "loss": 0.011, "step": 6500 }, { "epoch": 2.9576888080072794, "grad_norm": 0.5433206072238611, "learning_rate": 3.583235505052955e-06, "loss": 0.023, "step": 6501 }, { "epoch": 2.9581437670609647, "grad_norm": 0.36726213720614953, "learning_rate": 3.581864845617348e-06, "loss": 0.0062, "step": 6502 }, { "epoch": 2.9585987261146496, "grad_norm": 0.31629345606568876, "learning_rate": 3.5804943020653403e-06, "loss": 0.0124, "step": 6503 }, { "epoch": 2.959053685168335, "grad_norm": 0.38915340933054543, "learning_rate": 3.579123874508927e-06, "loss": 0.0145, "step": 6504 }, { "epoch": 2.95950864422202, "grad_norm": 0.2957833994150592, "learning_rate": 3.5777535630600962e-06, "loss": 0.0102, "step": 6505 }, { "epoch": 2.959963603275705, "grad_norm": 0.32441200112227103, "learning_rate": 3.57638336783082e-06, "loss": 0.0116, "step": 6506 }, { "epoch": 2.9604185623293904, "grad_norm": 0.31312273494997284, "learning_rate": 3.575013288933065e-06, "loss": 0.0113, "step": 6507 }, { "epoch": 2.9608735213830757, "grad_norm": 0.42217355317451716, "learning_rate": 3.5736433264787903e-06, "loss": 0.011, "step": 6508 }, { "epoch": 2.9613284804367606, "grad_norm": 0.43660776250825045, "learning_rate": 3.572273480579941e-06, "loss": 0.0203, "step": 6509 }, { "epoch": 2.961783439490446, "grad_norm": 0.25376051638608066, "learning_rate": 3.5709037513484555e-06, "loss": 0.0077, "step": 6510 }, { "epoch": 2.962238398544131, "grad_norm": 0.5652030775647546, "learning_rate": 3.569534138896262e-06, "loss": 0.0273, "step": 6511 }, { "epoch": 2.962693357597816, "grad_norm": 0.5268701970464702, "learning_rate": 3.568164643335279e-06, "loss": 0.0228, "step": 6512 }, { "epoch": 2.9631483166515014, "grad_norm": 0.4982147251025876, "learning_rate": 3.566795264777414e-06, "loss": 0.0125, "step": 6513 }, { "epoch": 2.9636032757051867, "grad_norm": 0.6654372766865096, "learning_rate": 3.565426003334567e-06, "loss": 0.0237, "step": 6514 }, { "epoch": 2.9640582347588715, "grad_norm": 0.32738325332042234, "learning_rate": 3.564056859118631e-06, "loss": 0.0102, "step": 6515 }, { "epoch": 2.964513193812557, "grad_norm": 0.5996042803569864, "learning_rate": 3.5626878322414824e-06, "loss": 0.0159, "step": 6516 }, { "epoch": 2.964968152866242, "grad_norm": 0.3353125547507947, "learning_rate": 3.5613189228149947e-06, "loss": 0.0146, "step": 6517 }, { "epoch": 2.965423111919927, "grad_norm": 0.5474646876156293, "learning_rate": 3.5599501309510252e-06, "loss": 0.0153, "step": 6518 }, { "epoch": 2.9658780709736123, "grad_norm": 0.435343334679233, "learning_rate": 3.5585814567614307e-06, "loss": 0.0133, "step": 6519 }, { "epoch": 2.9663330300272976, "grad_norm": 0.49327260361722014, "learning_rate": 3.557212900358048e-06, "loss": 0.0217, "step": 6520 }, { "epoch": 2.9667879890809825, "grad_norm": 0.4450911084721028, "learning_rate": 3.555844461852711e-06, "loss": 0.0161, "step": 6521 }, { "epoch": 2.967242948134668, "grad_norm": 0.3584541153371727, "learning_rate": 3.5544761413572444e-06, "loss": 0.009, "step": 6522 }, { "epoch": 2.967697907188353, "grad_norm": 0.3124643595442405, "learning_rate": 3.5531079389834587e-06, "loss": 0.0086, "step": 6523 }, { "epoch": 2.968152866242038, "grad_norm": 0.32194436616331723, "learning_rate": 3.5517398548431592e-06, "loss": 0.0099, "step": 6524 }, { "epoch": 2.9686078252957233, "grad_norm": 0.34931698332817945, "learning_rate": 3.5503718890481376e-06, "loss": 0.0091, "step": 6525 }, { "epoch": 2.9690627843494086, "grad_norm": 0.37815788525833466, "learning_rate": 3.5490040417101795e-06, "loss": 0.0125, "step": 6526 }, { "epoch": 2.9695177434030935, "grad_norm": 0.28238843041660866, "learning_rate": 3.5476363129410576e-06, "loss": 0.008, "step": 6527 }, { "epoch": 2.969972702456779, "grad_norm": 0.3697340753396572, "learning_rate": 3.5462687028525366e-06, "loss": 0.0099, "step": 6528 }, { "epoch": 2.970427661510464, "grad_norm": 0.32393130922575747, "learning_rate": 3.544901211556374e-06, "loss": 0.0087, "step": 6529 }, { "epoch": 2.970882620564149, "grad_norm": 0.4461277427758584, "learning_rate": 3.543533839164312e-06, "loss": 0.0135, "step": 6530 }, { "epoch": 2.9713375796178343, "grad_norm": 0.29835175560956406, "learning_rate": 3.5421665857880887e-06, "loss": 0.0059, "step": 6531 }, { "epoch": 2.9717925386715196, "grad_norm": 0.3606052458056106, "learning_rate": 3.5407994515394273e-06, "loss": 0.0214, "step": 6532 }, { "epoch": 2.9722474977252045, "grad_norm": 0.4187932062463131, "learning_rate": 3.539432436530046e-06, "loss": 0.011, "step": 6533 }, { "epoch": 2.97270245677889, "grad_norm": 0.319007180889365, "learning_rate": 3.538065540871649e-06, "loss": 0.0116, "step": 6534 }, { "epoch": 2.973157415832575, "grad_norm": 0.5531012482531121, "learning_rate": 3.5366987646759333e-06, "loss": 0.0152, "step": 6535 }, { "epoch": 2.9736123748862604, "grad_norm": 0.47758964654481584, "learning_rate": 3.535332108054589e-06, "loss": 0.0179, "step": 6536 }, { "epoch": 2.9740673339399453, "grad_norm": 0.3569075037008427, "learning_rate": 3.5339655711192878e-06, "loss": 0.015, "step": 6537 }, { "epoch": 2.9745222929936306, "grad_norm": 0.5098602727032687, "learning_rate": 3.532599153981702e-06, "loss": 0.0147, "step": 6538 }, { "epoch": 2.974977252047316, "grad_norm": 0.33681859522390956, "learning_rate": 3.531232856753486e-06, "loss": 0.0125, "step": 6539 }, { "epoch": 2.9754322111010008, "grad_norm": 0.552580439927012, "learning_rate": 3.5298666795462865e-06, "loss": 0.0147, "step": 6540 }, { "epoch": 2.975887170154686, "grad_norm": 0.3375126958474858, "learning_rate": 3.528500622471745e-06, "loss": 0.0086, "step": 6541 }, { "epoch": 2.9763421292083714, "grad_norm": 0.5061059149911944, "learning_rate": 3.5271346856414847e-06, "loss": 0.0158, "step": 6542 }, { "epoch": 2.9767970882620567, "grad_norm": 0.2708132117320371, "learning_rate": 3.525768869167128e-06, "loss": 0.0068, "step": 6543 }, { "epoch": 2.9772520473157416, "grad_norm": 0.3515462809804613, "learning_rate": 3.5244031731602824e-06, "loss": 0.0157, "step": 6544 }, { "epoch": 2.977707006369427, "grad_norm": 0.38160272549077745, "learning_rate": 3.523037597732545e-06, "loss": 0.0128, "step": 6545 }, { "epoch": 2.978161965423112, "grad_norm": 0.5097781769845867, "learning_rate": 3.521672142995506e-06, "loss": 0.0253, "step": 6546 }, { "epoch": 2.978616924476797, "grad_norm": 0.3929186750012416, "learning_rate": 3.5203068090607416e-06, "loss": 0.0093, "step": 6547 }, { "epoch": 2.9790718835304824, "grad_norm": 0.5796891004760814, "learning_rate": 3.518941596039825e-06, "loss": 0.0181, "step": 6548 }, { "epoch": 2.9795268425841677, "grad_norm": 0.4339014424134206, "learning_rate": 3.51757650404431e-06, "loss": 0.0277, "step": 6549 }, { "epoch": 2.9799818016378525, "grad_norm": 0.41398487307020093, "learning_rate": 3.5162115331857494e-06, "loss": 0.0186, "step": 6550 }, { "epoch": 2.980436760691538, "grad_norm": 0.3757799362980298, "learning_rate": 3.514846683575683e-06, "loss": 0.0146, "step": 6551 }, { "epoch": 2.980891719745223, "grad_norm": 0.5281256320015878, "learning_rate": 3.5134819553256374e-06, "loss": 0.0146, "step": 6552 }, { "epoch": 2.981346678798908, "grad_norm": 0.42468968336333995, "learning_rate": 3.512117348547134e-06, "loss": 0.0137, "step": 6553 }, { "epoch": 2.9818016378525933, "grad_norm": 0.3359772115067771, "learning_rate": 3.510752863351682e-06, "loss": 0.0114, "step": 6554 }, { "epoch": 2.9822565969062786, "grad_norm": 0.3638159398785847, "learning_rate": 3.50938849985078e-06, "loss": 0.0098, "step": 6555 }, { "epoch": 2.9827115559599635, "grad_norm": 0.19591234810517247, "learning_rate": 3.508024258155918e-06, "loss": 0.0034, "step": 6556 }, { "epoch": 2.983166515013649, "grad_norm": 0.4248982130335413, "learning_rate": 3.506660138378575e-06, "loss": 0.0132, "step": 6557 }, { "epoch": 2.983621474067334, "grad_norm": 0.41451250175402327, "learning_rate": 3.505296140630224e-06, "loss": 0.0177, "step": 6558 }, { "epoch": 2.984076433121019, "grad_norm": 0.38969374572008747, "learning_rate": 3.5039322650223207e-06, "loss": 0.0108, "step": 6559 }, { "epoch": 2.9845313921747043, "grad_norm": 0.38980340160687305, "learning_rate": 3.5025685116663176e-06, "loss": 0.0094, "step": 6560 }, { "epoch": 2.9849863512283896, "grad_norm": 0.4368813537831224, "learning_rate": 3.5012048806736525e-06, "loss": 0.0157, "step": 6561 }, { "epoch": 2.9854413102820745, "grad_norm": 0.445846609961705, "learning_rate": 3.499841372155757e-06, "loss": 0.0111, "step": 6562 }, { "epoch": 2.98589626933576, "grad_norm": 0.36904131373841426, "learning_rate": 3.4984779862240483e-06, "loss": 0.009, "step": 6563 }, { "epoch": 2.986351228389445, "grad_norm": 0.5158383186613557, "learning_rate": 3.497114722989938e-06, "loss": 0.0108, "step": 6564 }, { "epoch": 2.98680618744313, "grad_norm": 0.2709198700370854, "learning_rate": 3.495751582564827e-06, "loss": 0.0091, "step": 6565 }, { "epoch": 2.9872611464968153, "grad_norm": 0.3360782617935061, "learning_rate": 3.4943885650601028e-06, "loss": 0.008, "step": 6566 }, { "epoch": 2.9877161055505006, "grad_norm": 0.4780280782673142, "learning_rate": 3.4930256705871467e-06, "loss": 0.0208, "step": 6567 }, { "epoch": 2.9881710646041855, "grad_norm": 0.44920864250946857, "learning_rate": 3.4916628992573267e-06, "loss": 0.0118, "step": 6568 }, { "epoch": 2.988626023657871, "grad_norm": 0.29882332357265173, "learning_rate": 3.490300251182003e-06, "loss": 0.0075, "step": 6569 }, { "epoch": 2.989080982711556, "grad_norm": 0.5988789623374708, "learning_rate": 3.4889377264725233e-06, "loss": 0.0234, "step": 6570 }, { "epoch": 2.989535941765241, "grad_norm": 0.3983572447931714, "learning_rate": 3.48757532524023e-06, "loss": 0.0126, "step": 6571 }, { "epoch": 2.9899909008189263, "grad_norm": 0.3764680155253722, "learning_rate": 3.4862130475964516e-06, "loss": 0.0115, "step": 6572 }, { "epoch": 2.9904458598726116, "grad_norm": 0.35833475350018384, "learning_rate": 3.4848508936525063e-06, "loss": 0.0138, "step": 6573 }, { "epoch": 2.9909008189262964, "grad_norm": 0.34962169597146103, "learning_rate": 3.4834888635197044e-06, "loss": 0.0075, "step": 6574 }, { "epoch": 2.9913557779799818, "grad_norm": 0.3918155381217852, "learning_rate": 3.482126957309344e-06, "loss": 0.015, "step": 6575 }, { "epoch": 2.991810737033667, "grad_norm": 0.46818754910044025, "learning_rate": 3.4807651751327133e-06, "loss": 0.0139, "step": 6576 }, { "epoch": 2.992265696087352, "grad_norm": 0.48511393425607174, "learning_rate": 3.479403517101091e-06, "loss": 0.0097, "step": 6577 }, { "epoch": 2.9927206551410372, "grad_norm": 0.6301894242314166, "learning_rate": 3.478041983325747e-06, "loss": 0.021, "step": 6578 }, { "epoch": 2.9931756141947226, "grad_norm": 0.3476736060486332, "learning_rate": 3.476680573917939e-06, "loss": 0.0103, "step": 6579 }, { "epoch": 2.9936305732484074, "grad_norm": 0.5028656074679961, "learning_rate": 3.4753192889889166e-06, "loss": 0.014, "step": 6580 }, { "epoch": 2.9940855323020927, "grad_norm": 0.25322291430768457, "learning_rate": 3.4739581286499147e-06, "loss": 0.007, "step": 6581 }, { "epoch": 2.994540491355778, "grad_norm": 0.36743018715888376, "learning_rate": 3.4725970930121646e-06, "loss": 0.0054, "step": 6582 }, { "epoch": 2.994995450409463, "grad_norm": 0.32859431938405953, "learning_rate": 3.4712361821868814e-06, "loss": 0.0101, "step": 6583 }, { "epoch": 2.9954504094631482, "grad_norm": 0.3683224458881264, "learning_rate": 3.4698753962852715e-06, "loss": 0.0097, "step": 6584 }, { "epoch": 2.9959053685168335, "grad_norm": 0.44288889919259156, "learning_rate": 3.468514735418537e-06, "loss": 0.012, "step": 6585 }, { "epoch": 2.9963603275705184, "grad_norm": 0.4542328497419673, "learning_rate": 3.4671541996978607e-06, "loss": 0.013, "step": 6586 }, { "epoch": 2.9968152866242037, "grad_norm": 0.28826436118881377, "learning_rate": 3.465793789234423e-06, "loss": 0.0137, "step": 6587 }, { "epoch": 2.997270245677889, "grad_norm": 0.3456106458761672, "learning_rate": 3.4644335041393867e-06, "loss": 0.0097, "step": 6588 }, { "epoch": 2.997725204731574, "grad_norm": 0.36722041869566846, "learning_rate": 3.463073344523911e-06, "loss": 0.0105, "step": 6589 }, { "epoch": 2.998180163785259, "grad_norm": 0.3671137298434854, "learning_rate": 3.4617133104991396e-06, "loss": 0.0163, "step": 6590 }, { "epoch": 2.9986351228389445, "grad_norm": 0.26960385085062966, "learning_rate": 3.4603534021762088e-06, "loss": 0.0043, "step": 6591 }, { "epoch": 2.99909008189263, "grad_norm": 0.4990967729111016, "learning_rate": 3.458993619666248e-06, "loss": 0.0104, "step": 6592 }, { "epoch": 2.9995450409463147, "grad_norm": 0.5826797641151416, "learning_rate": 3.4576339630803667e-06, "loss": 0.0245, "step": 6593 }, { "epoch": 3.0, "grad_norm": 0.23571962806449165, "learning_rate": 3.456274432529675e-06, "loss": 0.0083, "step": 6594 }, { "epoch": 3.0004549590536853, "grad_norm": 0.0893191704649401, "learning_rate": 3.4549150281252635e-06, "loss": 0.0018, "step": 6595 }, { "epoch": 3.00090991810737, "grad_norm": 0.3347845618716773, "learning_rate": 3.4535557499782195e-06, "loss": 0.0056, "step": 6596 }, { "epoch": 3.0013648771610555, "grad_norm": 0.13635346834634315, "learning_rate": 3.452196598199615e-06, "loss": 0.0024, "step": 6597 }, { "epoch": 3.001819836214741, "grad_norm": 0.16089312403502545, "learning_rate": 3.4508375729005137e-06, "loss": 0.003, "step": 6598 }, { "epoch": 3.0022747952684257, "grad_norm": 0.19438018052149716, "learning_rate": 3.449478674191972e-06, "loss": 0.0057, "step": 6599 }, { "epoch": 3.002729754322111, "grad_norm": 0.1531692653094001, "learning_rate": 3.44811990218503e-06, "loss": 0.0029, "step": 6600 }, { "epoch": 3.0031847133757963, "grad_norm": 0.14561522799142143, "learning_rate": 3.4467612569907226e-06, "loss": 0.0025, "step": 6601 }, { "epoch": 3.003639672429481, "grad_norm": 0.21913303802077205, "learning_rate": 3.4454027387200695e-06, "loss": 0.0043, "step": 6602 }, { "epoch": 3.0040946314831665, "grad_norm": 0.1478213208738507, "learning_rate": 3.4440443474840857e-06, "loss": 0.0021, "step": 6603 }, { "epoch": 3.0045495905368518, "grad_norm": 0.2202665007894661, "learning_rate": 3.4426860833937696e-06, "loss": 0.0062, "step": 6604 }, { "epoch": 3.0050045495905366, "grad_norm": 0.15933037907081252, "learning_rate": 3.4413279465601136e-06, "loss": 0.0022, "step": 6605 }, { "epoch": 3.005459508644222, "grad_norm": 0.2995172416816985, "learning_rate": 3.4399699370940996e-06, "loss": 0.0062, "step": 6606 }, { "epoch": 3.0059144676979073, "grad_norm": 0.34600386926789417, "learning_rate": 3.4386120551066976e-06, "loss": 0.0037, "step": 6607 }, { "epoch": 3.0063694267515926, "grad_norm": 0.20426270040272176, "learning_rate": 3.437254300708868e-06, "loss": 0.0039, "step": 6608 }, { "epoch": 3.0068243858052774, "grad_norm": 0.301874923402197, "learning_rate": 3.4358966740115595e-06, "loss": 0.0073, "step": 6609 }, { "epoch": 3.0072793448589628, "grad_norm": 0.1122449943135855, "learning_rate": 3.4345391751257105e-06, "loss": 0.0014, "step": 6610 }, { "epoch": 3.007734303912648, "grad_norm": 0.1996226528370291, "learning_rate": 3.433181804162251e-06, "loss": 0.0046, "step": 6611 }, { "epoch": 3.008189262966333, "grad_norm": 0.1739934559568579, "learning_rate": 3.4318245612320976e-06, "loss": 0.0023, "step": 6612 }, { "epoch": 3.0086442220200182, "grad_norm": 0.3493893395081199, "learning_rate": 3.4304674464461597e-06, "loss": 0.0091, "step": 6613 }, { "epoch": 3.0090991810737036, "grad_norm": 0.18861323894491727, "learning_rate": 3.429110459915336e-06, "loss": 0.0041, "step": 6614 }, { "epoch": 3.0095541401273884, "grad_norm": 0.2042787293335032, "learning_rate": 3.427753601750509e-06, "loss": 0.0045, "step": 6615 }, { "epoch": 3.0100090991810737, "grad_norm": 0.07566522736113394, "learning_rate": 3.4263968720625597e-06, "loss": 0.0013, "step": 6616 }, { "epoch": 3.010464058234759, "grad_norm": 0.4534280859980601, "learning_rate": 3.4250402709623497e-06, "loss": 0.0051, "step": 6617 }, { "epoch": 3.010919017288444, "grad_norm": 0.23575231717807305, "learning_rate": 3.423683798560738e-06, "loss": 0.0063, "step": 6618 }, { "epoch": 3.011373976342129, "grad_norm": 0.14894053974123447, "learning_rate": 3.4223274549685653e-06, "loss": 0.0032, "step": 6619 }, { "epoch": 3.0118289353958145, "grad_norm": 0.11599223217040529, "learning_rate": 3.4209712402966693e-06, "loss": 0.0021, "step": 6620 }, { "epoch": 3.0122838944494994, "grad_norm": 0.3253214585977508, "learning_rate": 3.419615154655874e-06, "loss": 0.0049, "step": 6621 }, { "epoch": 3.0127388535031847, "grad_norm": 0.14413936950678644, "learning_rate": 3.4182591981569903e-06, "loss": 0.0026, "step": 6622 }, { "epoch": 3.01319381255687, "grad_norm": 0.15902466126091303, "learning_rate": 3.416903370910822e-06, "loss": 0.0028, "step": 6623 }, { "epoch": 3.013648771610555, "grad_norm": 0.3624443200913134, "learning_rate": 3.415547673028161e-06, "loss": 0.0058, "step": 6624 }, { "epoch": 3.01410373066424, "grad_norm": 0.24355384727569604, "learning_rate": 3.41419210461979e-06, "loss": 0.0027, "step": 6625 }, { "epoch": 3.0145586897179255, "grad_norm": 0.3643777170821261, "learning_rate": 3.412836665796476e-06, "loss": 0.012, "step": 6626 }, { "epoch": 3.0150136487716104, "grad_norm": 0.19011536327909298, "learning_rate": 3.4114813566689837e-06, "loss": 0.0026, "step": 6627 }, { "epoch": 3.0154686078252957, "grad_norm": 0.36758426824899176, "learning_rate": 3.410126177348062e-06, "loss": 0.0047, "step": 6628 }, { "epoch": 3.015923566878981, "grad_norm": 0.27298808165345695, "learning_rate": 3.408771127944448e-06, "loss": 0.0033, "step": 6629 }, { "epoch": 3.016378525932666, "grad_norm": 0.264986574263462, "learning_rate": 3.4074162085688734e-06, "loss": 0.0058, "step": 6630 }, { "epoch": 3.016833484986351, "grad_norm": 0.12813541524178768, "learning_rate": 3.4060614193320524e-06, "loss": 0.0017, "step": 6631 }, { "epoch": 3.0172884440400365, "grad_norm": 0.337075641842545, "learning_rate": 3.4047067603446947e-06, "loss": 0.0042, "step": 6632 }, { "epoch": 3.0177434030937214, "grad_norm": 0.3163141420097077, "learning_rate": 3.403352231717495e-06, "loss": 0.0096, "step": 6633 }, { "epoch": 3.0181983621474067, "grad_norm": 0.618042102760366, "learning_rate": 3.4019978335611414e-06, "loss": 0.0058, "step": 6634 }, { "epoch": 3.018653321201092, "grad_norm": 0.2230934149262993, "learning_rate": 3.400643565986309e-06, "loss": 0.0034, "step": 6635 }, { "epoch": 3.0191082802547773, "grad_norm": 0.11737139792400703, "learning_rate": 3.3992894291036615e-06, "loss": 0.0012, "step": 6636 }, { "epoch": 3.019563239308462, "grad_norm": 0.1253771741531208, "learning_rate": 3.3979354230238537e-06, "loss": 0.0017, "step": 6637 }, { "epoch": 3.0200181983621475, "grad_norm": 0.31309338322046454, "learning_rate": 3.396581547857527e-06, "loss": 0.0032, "step": 6638 }, { "epoch": 3.0204731574158328, "grad_norm": 0.2721560884607357, "learning_rate": 3.3952278037153162e-06, "loss": 0.0059, "step": 6639 }, { "epoch": 3.0209281164695176, "grad_norm": 0.19990261394723954, "learning_rate": 3.3938741907078404e-06, "loss": 0.0026, "step": 6640 }, { "epoch": 3.021383075523203, "grad_norm": 0.24426365534373817, "learning_rate": 3.3925207089457134e-06, "loss": 0.0047, "step": 6641 }, { "epoch": 3.0218380345768883, "grad_norm": 0.28575080423457644, "learning_rate": 3.391167358539536e-06, "loss": 0.0037, "step": 6642 }, { "epoch": 3.022292993630573, "grad_norm": 0.138887174100726, "learning_rate": 3.3898141395998957e-06, "loss": 0.0018, "step": 6643 }, { "epoch": 3.0227479526842584, "grad_norm": 0.3332499529648303, "learning_rate": 3.388461052237373e-06, "loss": 0.0105, "step": 6644 }, { "epoch": 3.0232029117379438, "grad_norm": 0.27342978649469785, "learning_rate": 3.3871080965625356e-06, "loss": 0.0022, "step": 6645 }, { "epoch": 3.0236578707916286, "grad_norm": 0.19489859654165118, "learning_rate": 3.3857552726859398e-06, "loss": 0.0039, "step": 6646 }, { "epoch": 3.024112829845314, "grad_norm": 0.14270451832945846, "learning_rate": 3.3844025807181325e-06, "loss": 0.0018, "step": 6647 }, { "epoch": 3.0245677888989992, "grad_norm": 0.35927759277977983, "learning_rate": 3.383050020769652e-06, "loss": 0.0052, "step": 6648 }, { "epoch": 3.025022747952684, "grad_norm": 0.24924094124945648, "learning_rate": 3.381697592951021e-06, "loss": 0.0038, "step": 6649 }, { "epoch": 3.0254777070063694, "grad_norm": 0.2909270553851987, "learning_rate": 3.380345297372756e-06, "loss": 0.0036, "step": 6650 }, { "epoch": 3.0259326660600547, "grad_norm": 0.2911388642080158, "learning_rate": 3.3789931341453564e-06, "loss": 0.0068, "step": 6651 }, { "epoch": 3.0263876251137396, "grad_norm": 0.2488324883508692, "learning_rate": 3.3776411033793198e-06, "loss": 0.0055, "step": 6652 }, { "epoch": 3.026842584167425, "grad_norm": 0.10552758910776536, "learning_rate": 3.376289205185125e-06, "loss": 0.0028, "step": 6653 }, { "epoch": 3.02729754322111, "grad_norm": 0.04360860869686373, "learning_rate": 3.3749374396732413e-06, "loss": 0.0005, "step": 6654 }, { "epoch": 3.027752502274795, "grad_norm": 0.16426236232776292, "learning_rate": 3.3735858069541345e-06, "loss": 0.0023, "step": 6655 }, { "epoch": 3.0282074613284804, "grad_norm": 0.2646664737834769, "learning_rate": 3.372234307138249e-06, "loss": 0.0038, "step": 6656 }, { "epoch": 3.0286624203821657, "grad_norm": 0.12543008094547287, "learning_rate": 3.3708829403360266e-06, "loss": 0.0023, "step": 6657 }, { "epoch": 3.0291173794358506, "grad_norm": 0.23685846621502793, "learning_rate": 3.369531706657892e-06, "loss": 0.0047, "step": 6658 }, { "epoch": 3.029572338489536, "grad_norm": 0.27145480963050583, "learning_rate": 3.368180606214264e-06, "loss": 0.0074, "step": 6659 }, { "epoch": 3.030027297543221, "grad_norm": 0.11211373787488198, "learning_rate": 3.3668296391155473e-06, "loss": 0.002, "step": 6660 }, { "epoch": 3.030482256596906, "grad_norm": 0.22366057001345938, "learning_rate": 3.3654788054721356e-06, "loss": 0.0038, "step": 6661 }, { "epoch": 3.0309372156505914, "grad_norm": 0.3061134159484588, "learning_rate": 3.3641281053944165e-06, "loss": 0.0062, "step": 6662 }, { "epoch": 3.0313921747042767, "grad_norm": 0.19156535723123164, "learning_rate": 3.36277753899276e-06, "loss": 0.0048, "step": 6663 }, { "epoch": 3.031847133757962, "grad_norm": 0.3513658408597901, "learning_rate": 3.3614271063775306e-06, "loss": 0.0042, "step": 6664 }, { "epoch": 3.032302092811647, "grad_norm": 0.39642909000522336, "learning_rate": 3.3600768076590772e-06, "loss": 0.0088, "step": 6665 }, { "epoch": 3.032757051865332, "grad_norm": 0.1440361583079107, "learning_rate": 3.3587266429477426e-06, "loss": 0.0014, "step": 6666 }, { "epoch": 3.0332120109190175, "grad_norm": 0.4184718961994572, "learning_rate": 3.3573766123538536e-06, "loss": 0.0051, "step": 6667 }, { "epoch": 3.0336669699727024, "grad_norm": 0.22512611342607328, "learning_rate": 3.356026715987729e-06, "loss": 0.0024, "step": 6668 }, { "epoch": 3.0341219290263877, "grad_norm": 0.1610965509607099, "learning_rate": 3.354676953959679e-06, "loss": 0.002, "step": 6669 }, { "epoch": 3.034576888080073, "grad_norm": 0.13527929271548852, "learning_rate": 3.353327326379997e-06, "loss": 0.0014, "step": 6670 }, { "epoch": 3.035031847133758, "grad_norm": 0.28141741622039324, "learning_rate": 3.3519778333589702e-06, "loss": 0.0038, "step": 6671 }, { "epoch": 3.035486806187443, "grad_norm": 0.2240547258601244, "learning_rate": 3.3506284750068718e-06, "loss": 0.0033, "step": 6672 }, { "epoch": 3.0359417652411285, "grad_norm": 0.20360780883197763, "learning_rate": 3.3492792514339672e-06, "loss": 0.001, "step": 6673 }, { "epoch": 3.0363967242948133, "grad_norm": 0.11469096789758414, "learning_rate": 3.347930162750505e-06, "loss": 0.001, "step": 6674 }, { "epoch": 3.0368516833484986, "grad_norm": 0.21056526465288855, "learning_rate": 3.3465812090667303e-06, "loss": 0.0028, "step": 6675 }, { "epoch": 3.037306642402184, "grad_norm": 0.2933373381535325, "learning_rate": 3.3452323904928742e-06, "loss": 0.0052, "step": 6676 }, { "epoch": 3.037761601455869, "grad_norm": 0.14648108429263568, "learning_rate": 3.343883707139153e-06, "loss": 0.0016, "step": 6677 }, { "epoch": 3.038216560509554, "grad_norm": 0.3595127209262766, "learning_rate": 3.3425351591157766e-06, "loss": 0.008, "step": 6678 }, { "epoch": 3.0386715195632394, "grad_norm": 0.15990912197878135, "learning_rate": 3.3411867465329416e-06, "loss": 0.0025, "step": 6679 }, { "epoch": 3.0391264786169243, "grad_norm": 0.3206118713392188, "learning_rate": 3.3398384695008356e-06, "loss": 0.0018, "step": 6680 }, { "epoch": 3.0395814376706096, "grad_norm": 0.2639358400102524, "learning_rate": 3.33849032812963e-06, "loss": 0.0026, "step": 6681 }, { "epoch": 3.040036396724295, "grad_norm": 0.6928160680099208, "learning_rate": 3.337142322529493e-06, "loss": 0.0184, "step": 6682 }, { "epoch": 3.04049135577798, "grad_norm": 0.29813064246204063, "learning_rate": 3.3357944528105767e-06, "loss": 0.004, "step": 6683 }, { "epoch": 3.040946314831665, "grad_norm": 0.246077961176862, "learning_rate": 3.334446719083022e-06, "loss": 0.0037, "step": 6684 }, { "epoch": 3.0414012738853504, "grad_norm": 0.1168015668333905, "learning_rate": 3.33309912145696e-06, "loss": 0.001, "step": 6685 }, { "epoch": 3.0418562329390353, "grad_norm": 0.26554698256243703, "learning_rate": 3.3317516600425105e-06, "loss": 0.0047, "step": 6686 }, { "epoch": 3.0423111919927206, "grad_norm": 0.145936907656275, "learning_rate": 3.33040433494978e-06, "loss": 0.0015, "step": 6687 }, { "epoch": 3.042766151046406, "grad_norm": 0.14191997224295935, "learning_rate": 3.3290571462888664e-06, "loss": 0.0022, "step": 6688 }, { "epoch": 3.0432211101000908, "grad_norm": 0.2068582104615765, "learning_rate": 3.3277100941698597e-06, "loss": 0.0029, "step": 6689 }, { "epoch": 3.043676069153776, "grad_norm": 0.12399299402302033, "learning_rate": 3.3263631787028308e-06, "loss": 0.0016, "step": 6690 }, { "epoch": 3.0441310282074614, "grad_norm": 0.15741626964441308, "learning_rate": 3.3250163999978457e-06, "loss": 0.0018, "step": 6691 }, { "epoch": 3.0445859872611467, "grad_norm": 0.3094470919955424, "learning_rate": 3.3236697581649557e-06, "loss": 0.0059, "step": 6692 }, { "epoch": 3.0450409463148316, "grad_norm": 0.18765245098930736, "learning_rate": 3.3223232533142034e-06, "loss": 0.0015, "step": 6693 }, { "epoch": 3.045495905368517, "grad_norm": 0.5725730085518594, "learning_rate": 3.320976885555618e-06, "loss": 0.0036, "step": 6694 }, { "epoch": 3.045950864422202, "grad_norm": 0.23770536112447027, "learning_rate": 3.3196306549992176e-06, "loss": 0.0037, "step": 6695 }, { "epoch": 3.046405823475887, "grad_norm": 0.16804086419320965, "learning_rate": 3.3182845617550137e-06, "loss": 0.0016, "step": 6696 }, { "epoch": 3.0468607825295724, "grad_norm": 0.2221844581540461, "learning_rate": 3.316938605933e-06, "loss": 0.0028, "step": 6697 }, { "epoch": 3.0473157415832577, "grad_norm": 0.10819946158612141, "learning_rate": 3.315592787643164e-06, "loss": 0.001, "step": 6698 }, { "epoch": 3.0477707006369426, "grad_norm": 0.2660761098576769, "learning_rate": 3.3142471069954767e-06, "loss": 0.0025, "step": 6699 }, { "epoch": 3.048225659690628, "grad_norm": 0.18950291654243787, "learning_rate": 3.3129015640999052e-06, "loss": 0.0022, "step": 6700 }, { "epoch": 3.048680618744313, "grad_norm": 0.13500705608327346, "learning_rate": 3.311556159066397e-06, "loss": 0.0032, "step": 6701 }, { "epoch": 3.049135577797998, "grad_norm": 0.42461346784601345, "learning_rate": 3.3102108920048935e-06, "loss": 0.0051, "step": 6702 }, { "epoch": 3.0495905368516834, "grad_norm": 0.29661973482573345, "learning_rate": 3.3088657630253278e-06, "loss": 0.0059, "step": 6703 }, { "epoch": 3.0500454959053687, "grad_norm": 0.21458789712042925, "learning_rate": 3.3075207722376136e-06, "loss": 0.0056, "step": 6704 }, { "epoch": 3.0505004549590535, "grad_norm": 0.22927246194199355, "learning_rate": 3.3061759197516598e-06, "loss": 0.0047, "step": 6705 }, { "epoch": 3.050955414012739, "grad_norm": 0.12383450067305045, "learning_rate": 3.3048312056773592e-06, "loss": 0.002, "step": 6706 }, { "epoch": 3.051410373066424, "grad_norm": 0.28367765063276584, "learning_rate": 3.3034866301245983e-06, "loss": 0.006, "step": 6707 }, { "epoch": 3.051865332120109, "grad_norm": 0.10539871834944999, "learning_rate": 3.302142193203248e-06, "loss": 0.0007, "step": 6708 }, { "epoch": 3.0523202911737943, "grad_norm": 0.20715104476378063, "learning_rate": 3.3007978950231684e-06, "loss": 0.0009, "step": 6709 }, { "epoch": 3.0527752502274796, "grad_norm": 0.12719221782871798, "learning_rate": 3.2994537356942137e-06, "loss": 0.0014, "step": 6710 }, { "epoch": 3.0532302092811645, "grad_norm": 0.16971830470920765, "learning_rate": 3.298109715326219e-06, "loss": 0.0035, "step": 6711 }, { "epoch": 3.05368516833485, "grad_norm": 0.38435683043787533, "learning_rate": 3.296765834029014e-06, "loss": 0.0063, "step": 6712 }, { "epoch": 3.054140127388535, "grad_norm": 0.2242781790282236, "learning_rate": 3.2954220919124125e-06, "loss": 0.0031, "step": 6713 }, { "epoch": 3.05459508644222, "grad_norm": 0.1947451320689837, "learning_rate": 3.29407848908622e-06, "loss": 0.0021, "step": 6714 }, { "epoch": 3.0550500454959053, "grad_norm": 0.27425414421883815, "learning_rate": 3.2927350256602293e-06, "loss": 0.0022, "step": 6715 }, { "epoch": 3.0555050045495906, "grad_norm": 0.342670858910268, "learning_rate": 3.291391701744221e-06, "loss": 0.0046, "step": 6716 }, { "epoch": 3.055959963603276, "grad_norm": 0.40502422693178913, "learning_rate": 3.290048517447969e-06, "loss": 0.0095, "step": 6717 }, { "epoch": 3.056414922656961, "grad_norm": 0.2447947801658237, "learning_rate": 3.2887054728812284e-06, "loss": 0.004, "step": 6718 }, { "epoch": 3.056869881710646, "grad_norm": 0.07084556483498368, "learning_rate": 3.2873625681537503e-06, "loss": 0.0006, "step": 6719 }, { "epoch": 3.0573248407643314, "grad_norm": 0.18491076189575661, "learning_rate": 3.286019803375269e-06, "loss": 0.0017, "step": 6720 }, { "epoch": 3.0577797998180163, "grad_norm": 0.1518483405872594, "learning_rate": 3.2846771786555075e-06, "loss": 0.0016, "step": 6721 }, { "epoch": 3.0582347588717016, "grad_norm": 0.1246677738383808, "learning_rate": 3.2833346941041823e-06, "loss": 0.0012, "step": 6722 }, { "epoch": 3.058689717925387, "grad_norm": 0.24501469373553433, "learning_rate": 3.2819923498309903e-06, "loss": 0.0037, "step": 6723 }, { "epoch": 3.0591446769790718, "grad_norm": 0.34766650866262405, "learning_rate": 3.280650145945627e-06, "loss": 0.0035, "step": 6724 }, { "epoch": 3.059599636032757, "grad_norm": 0.14575844229273618, "learning_rate": 3.27930808255777e-06, "loss": 0.0007, "step": 6725 }, { "epoch": 3.0600545950864424, "grad_norm": 0.2783510874130845, "learning_rate": 3.277966159777085e-06, "loss": 0.0068, "step": 6726 }, { "epoch": 3.0605095541401273, "grad_norm": 0.07908330415622568, "learning_rate": 3.27662437771323e-06, "loss": 0.0006, "step": 6727 }, { "epoch": 3.0609645131938126, "grad_norm": 0.2734536838160686, "learning_rate": 3.2752827364758464e-06, "loss": 0.0023, "step": 6728 }, { "epoch": 3.061419472247498, "grad_norm": 0.18183981401188093, "learning_rate": 3.2739412361745703e-06, "loss": 0.0012, "step": 6729 }, { "epoch": 3.0618744313011828, "grad_norm": 0.3419348643210425, "learning_rate": 3.27259987691902e-06, "loss": 0.0036, "step": 6730 }, { "epoch": 3.062329390354868, "grad_norm": 0.32333906168309456, "learning_rate": 3.2712586588188074e-06, "loss": 0.0056, "step": 6731 }, { "epoch": 3.0627843494085534, "grad_norm": 0.05072137352877194, "learning_rate": 3.269917581983531e-06, "loss": 0.0005, "step": 6732 }, { "epoch": 3.0632393084622382, "grad_norm": 0.185579387618519, "learning_rate": 3.2685766465227766e-06, "loss": 0.0027, "step": 6733 }, { "epoch": 3.0636942675159236, "grad_norm": 0.32487271522956185, "learning_rate": 3.26723585254612e-06, "loss": 0.0076, "step": 6734 }, { "epoch": 3.064149226569609, "grad_norm": 0.41963868473177607, "learning_rate": 3.265895200163123e-06, "loss": 0.0074, "step": 6735 }, { "epoch": 3.0646041856232937, "grad_norm": 0.43032963788354833, "learning_rate": 3.2645546894833415e-06, "loss": 0.0142, "step": 6736 }, { "epoch": 3.065059144676979, "grad_norm": 0.23090880419977639, "learning_rate": 3.2632143206163103e-06, "loss": 0.0058, "step": 6737 }, { "epoch": 3.0655141037306644, "grad_norm": 0.13971581409177106, "learning_rate": 3.2618740936715633e-06, "loss": 0.0027, "step": 6738 }, { "epoch": 3.065969062784349, "grad_norm": 0.27559078548912813, "learning_rate": 3.2605340087586168e-06, "loss": 0.008, "step": 6739 }, { "epoch": 3.0664240218380345, "grad_norm": 0.19149140193008823, "learning_rate": 3.2591940659869747e-06, "loss": 0.0029, "step": 6740 }, { "epoch": 3.06687898089172, "grad_norm": 0.32214302850237414, "learning_rate": 3.2578542654661326e-06, "loss": 0.0045, "step": 6741 }, { "epoch": 3.0673339399454047, "grad_norm": 0.3984638299308933, "learning_rate": 3.256514607305572e-06, "loss": 0.0156, "step": 6742 }, { "epoch": 3.06778889899909, "grad_norm": 0.1549843083152256, "learning_rate": 3.2551750916147656e-06, "loss": 0.002, "step": 6743 }, { "epoch": 3.0682438580527753, "grad_norm": 0.14445799642068108, "learning_rate": 3.2538357185031688e-06, "loss": 0.0015, "step": 6744 }, { "epoch": 3.06869881710646, "grad_norm": 0.286337049654631, "learning_rate": 3.2524964880802324e-06, "loss": 0.0091, "step": 6745 }, { "epoch": 3.0691537761601455, "grad_norm": 0.25286225331619305, "learning_rate": 3.2511574004553924e-06, "loss": 0.0034, "step": 6746 }, { "epoch": 3.069608735213831, "grad_norm": 0.10431242281535952, "learning_rate": 3.2498184557380705e-06, "loss": 0.0012, "step": 6747 }, { "epoch": 3.070063694267516, "grad_norm": 0.16269028958376114, "learning_rate": 3.2484796540376824e-06, "loss": 0.0024, "step": 6748 }, { "epoch": 3.070518653321201, "grad_norm": 0.1963416886866191, "learning_rate": 3.2471409954636256e-06, "loss": 0.0047, "step": 6749 }, { "epoch": 3.0709736123748863, "grad_norm": 0.24824216801087257, "learning_rate": 3.245802480125292e-06, "loss": 0.0027, "step": 6750 }, { "epoch": 3.0714285714285716, "grad_norm": 0.3652436704817453, "learning_rate": 3.244464108132056e-06, "loss": 0.0126, "step": 6751 }, { "epoch": 3.0718835304822565, "grad_norm": 0.256209447243996, "learning_rate": 3.2431258795932863e-06, "loss": 0.0027, "step": 6752 }, { "epoch": 3.072338489535942, "grad_norm": 0.1885877672421738, "learning_rate": 3.241787794618336e-06, "loss": 0.0028, "step": 6753 }, { "epoch": 3.072793448589627, "grad_norm": 0.1684478415436154, "learning_rate": 3.240449853316548e-06, "loss": 0.0038, "step": 6754 }, { "epoch": 3.073248407643312, "grad_norm": 0.2539833830707047, "learning_rate": 3.23911205579725e-06, "loss": 0.0032, "step": 6755 }, { "epoch": 3.0737033666969973, "grad_norm": 0.23607032119302146, "learning_rate": 3.2377744021697643e-06, "loss": 0.004, "step": 6756 }, { "epoch": 3.0741583257506826, "grad_norm": 0.2461559638382031, "learning_rate": 3.2364368925433954e-06, "loss": 0.0057, "step": 6757 }, { "epoch": 3.0746132848043675, "grad_norm": 0.19472830974496605, "learning_rate": 3.235099527027438e-06, "loss": 0.0051, "step": 6758 }, { "epoch": 3.0750682438580528, "grad_norm": 0.30793589428434276, "learning_rate": 3.2337623057311794e-06, "loss": 0.0127, "step": 6759 }, { "epoch": 3.075523202911738, "grad_norm": 0.2218162672251745, "learning_rate": 3.232425228763888e-06, "loss": 0.0016, "step": 6760 }, { "epoch": 3.075978161965423, "grad_norm": 0.15274006563259715, "learning_rate": 3.2310882962348257e-06, "loss": 0.0027, "step": 6761 }, { "epoch": 3.0764331210191083, "grad_norm": 0.35604068911246184, "learning_rate": 3.229751508253238e-06, "loss": 0.0041, "step": 6762 }, { "epoch": 3.0768880800727936, "grad_norm": 0.27756735557983553, "learning_rate": 3.228414864928364e-06, "loss": 0.0088, "step": 6763 }, { "epoch": 3.0773430391264784, "grad_norm": 0.1550615401595909, "learning_rate": 3.2270783663694254e-06, "loss": 0.0025, "step": 6764 }, { "epoch": 3.0777979981801638, "grad_norm": 0.37469727087291216, "learning_rate": 3.2257420126856357e-06, "loss": 0.0054, "step": 6765 }, { "epoch": 3.078252957233849, "grad_norm": 0.3148135804600808, "learning_rate": 3.224405803986198e-06, "loss": 0.0032, "step": 6766 }, { "epoch": 3.078707916287534, "grad_norm": 0.19563974920154628, "learning_rate": 3.223069740380299e-06, "loss": 0.0036, "step": 6767 }, { "epoch": 3.0791628753412192, "grad_norm": 0.15179995299084575, "learning_rate": 3.2217338219771166e-06, "loss": 0.0015, "step": 6768 }, { "epoch": 3.0796178343949046, "grad_norm": 0.5307591429935765, "learning_rate": 3.2203980488858154e-06, "loss": 0.0065, "step": 6769 }, { "epoch": 3.0800727934485894, "grad_norm": 0.09519582501742016, "learning_rate": 3.2190624212155497e-06, "loss": 0.0012, "step": 6770 }, { "epoch": 3.0805277525022747, "grad_norm": 0.31477397210596536, "learning_rate": 3.217726939075459e-06, "loss": 0.0041, "step": 6771 }, { "epoch": 3.08098271155596, "grad_norm": 0.10423859914988316, "learning_rate": 3.2163916025746734e-06, "loss": 0.0018, "step": 6772 }, { "epoch": 3.0814376706096454, "grad_norm": 0.22066632675155598, "learning_rate": 3.215056411822313e-06, "loss": 0.0036, "step": 6773 }, { "epoch": 3.08189262966333, "grad_norm": 0.3322288750822183, "learning_rate": 3.213721366927481e-06, "loss": 0.0102, "step": 6774 }, { "epoch": 3.0823475887170155, "grad_norm": 0.3297821943068105, "learning_rate": 3.2123864679992732e-06, "loss": 0.0028, "step": 6775 }, { "epoch": 3.082802547770701, "grad_norm": 0.2861879548721158, "learning_rate": 3.21105171514677e-06, "loss": 0.0035, "step": 6776 }, { "epoch": 3.0832575068243857, "grad_norm": 0.23409510522565669, "learning_rate": 3.209717108479042e-06, "loss": 0.0011, "step": 6777 }, { "epoch": 3.083712465878071, "grad_norm": 0.36240084758036684, "learning_rate": 3.208382648105147e-06, "loss": 0.0049, "step": 6778 }, { "epoch": 3.0841674249317563, "grad_norm": 0.14284862077915064, "learning_rate": 3.2070483341341295e-06, "loss": 0.0018, "step": 6779 }, { "epoch": 3.084622383985441, "grad_norm": 0.13574095398642788, "learning_rate": 3.205714166675027e-06, "loss": 0.0022, "step": 6780 }, { "epoch": 3.0850773430391265, "grad_norm": 0.254705055854357, "learning_rate": 3.2043801458368597e-06, "loss": 0.0055, "step": 6781 }, { "epoch": 3.085532302092812, "grad_norm": 0.047918884710824824, "learning_rate": 3.203046271728638e-06, "loss": 0.0004, "step": 6782 }, { "epoch": 3.0859872611464967, "grad_norm": 0.23173230609638582, "learning_rate": 3.2017125444593595e-06, "loss": 0.0046, "step": 6783 }, { "epoch": 3.086442220200182, "grad_norm": 0.39728780776283734, "learning_rate": 3.2003789641380115e-06, "loss": 0.0153, "step": 6784 }, { "epoch": 3.0868971792538673, "grad_norm": 0.13449792478069442, "learning_rate": 3.1990455308735667e-06, "loss": 0.0014, "step": 6785 }, { "epoch": 3.087352138307552, "grad_norm": 0.2541201258133481, "learning_rate": 3.1977122447749876e-06, "loss": 0.0032, "step": 6786 }, { "epoch": 3.0878070973612375, "grad_norm": 0.1602163158655709, "learning_rate": 3.196379105951226e-06, "loss": 0.0034, "step": 6787 }, { "epoch": 3.088262056414923, "grad_norm": 0.28588033239075733, "learning_rate": 3.195046114511219e-06, "loss": 0.0098, "step": 6788 }, { "epoch": 3.0887170154686077, "grad_norm": 0.602944255822795, "learning_rate": 3.193713270563892e-06, "loss": 0.0135, "step": 6789 }, { "epoch": 3.089171974522293, "grad_norm": 0.20656631772005207, "learning_rate": 3.1923805742181603e-06, "loss": 0.0026, "step": 6790 }, { "epoch": 3.0896269335759783, "grad_norm": 0.21656989632053653, "learning_rate": 3.1910480255829235e-06, "loss": 0.0038, "step": 6791 }, { "epoch": 3.090081892629663, "grad_norm": 0.14329498352974906, "learning_rate": 3.189715624767074e-06, "loss": 0.0022, "step": 6792 }, { "epoch": 3.0905368516833485, "grad_norm": 0.1746950490511674, "learning_rate": 3.1883833718794863e-06, "loss": 0.0039, "step": 6793 }, { "epoch": 3.0909918107370338, "grad_norm": 0.5528401786430039, "learning_rate": 3.18705126702903e-06, "loss": 0.0073, "step": 6794 }, { "epoch": 3.0914467697907186, "grad_norm": 0.24112917565479014, "learning_rate": 3.185719310324557e-06, "loss": 0.0028, "step": 6795 }, { "epoch": 3.091901728844404, "grad_norm": 0.25085510075313666, "learning_rate": 3.184387501874908e-06, "loss": 0.0027, "step": 6796 }, { "epoch": 3.0923566878980893, "grad_norm": 0.13331791093439024, "learning_rate": 3.1830558417889145e-06, "loss": 0.001, "step": 6797 }, { "epoch": 3.092811646951774, "grad_norm": 0.1334019090615344, "learning_rate": 3.1817243301753912e-06, "loss": 0.0009, "step": 6798 }, { "epoch": 3.0932666060054594, "grad_norm": 0.2206446641484756, "learning_rate": 3.1803929671431457e-06, "loss": 0.0056, "step": 6799 }, { "epoch": 3.0937215650591448, "grad_norm": 0.22346354807526733, "learning_rate": 3.179061752800967e-06, "loss": 0.0034, "step": 6800 }, { "epoch": 3.0941765241128296, "grad_norm": 0.18159113071231728, "learning_rate": 3.1777306872576396e-06, "loss": 0.0039, "step": 6801 }, { "epoch": 3.094631483166515, "grad_norm": 0.23189102644656986, "learning_rate": 3.176399770621933e-06, "loss": 0.0028, "step": 6802 }, { "epoch": 3.0950864422202002, "grad_norm": 0.2646974563604349, "learning_rate": 3.1750690030025998e-06, "loss": 0.0046, "step": 6803 }, { "epoch": 3.0955414012738856, "grad_norm": 0.3552353110594039, "learning_rate": 3.173738384508388e-06, "loss": 0.0061, "step": 6804 }, { "epoch": 3.0959963603275704, "grad_norm": 0.27473740890782317, "learning_rate": 3.172407915248027e-06, "loss": 0.0059, "step": 6805 }, { "epoch": 3.0964513193812557, "grad_norm": 0.21037904857030093, "learning_rate": 3.171077595330239e-06, "loss": 0.0025, "step": 6806 }, { "epoch": 3.096906278434941, "grad_norm": 0.22035385629324789, "learning_rate": 3.169747424863728e-06, "loss": 0.0042, "step": 6807 }, { "epoch": 3.097361237488626, "grad_norm": 0.37125583006058394, "learning_rate": 3.168417403957193e-06, "loss": 0.0051, "step": 6808 }, { "epoch": 3.097816196542311, "grad_norm": 0.22099877904042967, "learning_rate": 3.167087532719318e-06, "loss": 0.005, "step": 6809 }, { "epoch": 3.0982711555959965, "grad_norm": 0.277029472802014, "learning_rate": 3.1657578112587713e-06, "loss": 0.0033, "step": 6810 }, { "epoch": 3.0987261146496814, "grad_norm": 0.13559192176704188, "learning_rate": 3.1644282396842135e-06, "loss": 0.002, "step": 6811 }, { "epoch": 3.0991810737033667, "grad_norm": 0.2477328764641318, "learning_rate": 3.16309881810429e-06, "loss": 0.0035, "step": 6812 }, { "epoch": 3.099636032757052, "grad_norm": 0.20415176153338863, "learning_rate": 3.1617695466276364e-06, "loss": 0.0022, "step": 6813 }, { "epoch": 3.100090991810737, "grad_norm": 0.3369442240246176, "learning_rate": 3.160440425362873e-06, "loss": 0.0037, "step": 6814 }, { "epoch": 3.100545950864422, "grad_norm": 0.45163771156229976, "learning_rate": 3.1591114544186107e-06, "loss": 0.0161, "step": 6815 }, { "epoch": 3.1010009099181075, "grad_norm": 0.23015613448335928, "learning_rate": 3.157782633903448e-06, "loss": 0.0032, "step": 6816 }, { "epoch": 3.1014558689717924, "grad_norm": 0.19549436314524332, "learning_rate": 3.1564539639259685e-06, "loss": 0.0025, "step": 6817 }, { "epoch": 3.1019108280254777, "grad_norm": 0.21312866380568515, "learning_rate": 3.1551254445947468e-06, "loss": 0.0014, "step": 6818 }, { "epoch": 3.102365787079163, "grad_norm": 0.47724103266672474, "learning_rate": 3.1537970760183406e-06, "loss": 0.0105, "step": 6819 }, { "epoch": 3.102820746132848, "grad_norm": 0.18669670815297695, "learning_rate": 3.1524688583053014e-06, "loss": 0.0019, "step": 6820 }, { "epoch": 3.103275705186533, "grad_norm": 0.12167249750420166, "learning_rate": 3.151140791564162e-06, "loss": 0.0012, "step": 6821 }, { "epoch": 3.1037306642402185, "grad_norm": 0.3700964808925218, "learning_rate": 3.1498128759034484e-06, "loss": 0.0098, "step": 6822 }, { "epoch": 3.1041856232939034, "grad_norm": 0.28064010484527385, "learning_rate": 3.1484851114316724e-06, "loss": 0.0035, "step": 6823 }, { "epoch": 3.1046405823475887, "grad_norm": 0.27322401109860034, "learning_rate": 3.1471574982573306e-06, "loss": 0.007, "step": 6824 }, { "epoch": 3.105095541401274, "grad_norm": 0.1695836303368951, "learning_rate": 3.1458300364889118e-06, "loss": 0.0037, "step": 6825 }, { "epoch": 3.105550500454959, "grad_norm": 0.3666144555844518, "learning_rate": 3.144502726234889e-06, "loss": 0.0063, "step": 6826 }, { "epoch": 3.106005459508644, "grad_norm": 0.1254979537275639, "learning_rate": 3.143175567603723e-06, "loss": 0.0008, "step": 6827 }, { "epoch": 3.1064604185623295, "grad_norm": 0.09078910708998472, "learning_rate": 3.141848560703863e-06, "loss": 0.001, "step": 6828 }, { "epoch": 3.1069153776160148, "grad_norm": 0.21858526744185838, "learning_rate": 3.14052170564375e-06, "loss": 0.0043, "step": 6829 }, { "epoch": 3.1073703366696996, "grad_norm": 0.261118643262181, "learning_rate": 3.139195002531804e-06, "loss": 0.0079, "step": 6830 }, { "epoch": 3.107825295723385, "grad_norm": 0.3114192070734795, "learning_rate": 3.1378684514764413e-06, "loss": 0.0021, "step": 6831 }, { "epoch": 3.1082802547770703, "grad_norm": 0.18848825978171801, "learning_rate": 3.1365420525860575e-06, "loss": 0.0017, "step": 6832 }, { "epoch": 3.108735213830755, "grad_norm": 0.2764956274626624, "learning_rate": 3.135215805969043e-06, "loss": 0.0031, "step": 6833 }, { "epoch": 3.1091901728844404, "grad_norm": 0.09770119491979927, "learning_rate": 3.133889711733771e-06, "loss": 0.0013, "step": 6834 }, { "epoch": 3.1096451319381258, "grad_norm": 0.28421163712249997, "learning_rate": 3.1325637699886023e-06, "loss": 0.0066, "step": 6835 }, { "epoch": 3.1101000909918106, "grad_norm": 0.23556076856213476, "learning_rate": 3.1312379808418926e-06, "loss": 0.0025, "step": 6836 }, { "epoch": 3.110555050045496, "grad_norm": 0.10250499014945226, "learning_rate": 3.1299123444019737e-06, "loss": 0.0008, "step": 6837 }, { "epoch": 3.1110100090991812, "grad_norm": 0.28035545990831157, "learning_rate": 3.128586860777174e-06, "loss": 0.0037, "step": 6838 }, { "epoch": 3.111464968152866, "grad_norm": 0.3360707081314679, "learning_rate": 3.127261530075804e-06, "loss": 0.0042, "step": 6839 }, { "epoch": 3.1119199272065514, "grad_norm": 0.23777162093326473, "learning_rate": 3.125936352406166e-06, "loss": 0.0033, "step": 6840 }, { "epoch": 3.1123748862602367, "grad_norm": 0.2611160470051368, "learning_rate": 3.1246113278765442e-06, "loss": 0.0072, "step": 6841 }, { "epoch": 3.1128298453139216, "grad_norm": 0.1334675178409681, "learning_rate": 3.123286456595215e-06, "loss": 0.002, "step": 6842 }, { "epoch": 3.113284804367607, "grad_norm": 0.18728210685285573, "learning_rate": 3.1219617386704433e-06, "loss": 0.0057, "step": 6843 }, { "epoch": 3.113739763421292, "grad_norm": 0.18067937966525086, "learning_rate": 3.1206371742104756e-06, "loss": 0.0015, "step": 6844 }, { "epoch": 3.114194722474977, "grad_norm": 0.2530804121374889, "learning_rate": 3.119312763323553e-06, "loss": 0.0021, "step": 6845 }, { "epoch": 3.1146496815286624, "grad_norm": 0.07786261321008235, "learning_rate": 3.1179885061178965e-06, "loss": 0.001, "step": 6846 }, { "epoch": 3.1151046405823477, "grad_norm": 0.3167869446803168, "learning_rate": 3.116664402701721e-06, "loss": 0.0031, "step": 6847 }, { "epoch": 3.1155595996360326, "grad_norm": 0.3034780237339912, "learning_rate": 3.1153404531832252e-06, "loss": 0.0037, "step": 6848 }, { "epoch": 3.116014558689718, "grad_norm": 0.1430869012028794, "learning_rate": 3.1140166576705955e-06, "loss": 0.0015, "step": 6849 }, { "epoch": 3.116469517743403, "grad_norm": 0.2955575243852622, "learning_rate": 3.1126930162720093e-06, "loss": 0.0063, "step": 6850 }, { "epoch": 3.116924476797088, "grad_norm": 0.18123834513066317, "learning_rate": 3.1113695290956257e-06, "loss": 0.0013, "step": 6851 }, { "epoch": 3.1173794358507734, "grad_norm": 0.18820326429090645, "learning_rate": 3.1100461962495966e-06, "loss": 0.0009, "step": 6852 }, { "epoch": 3.1178343949044587, "grad_norm": 0.3286260862725942, "learning_rate": 3.1087230178420557e-06, "loss": 0.0054, "step": 6853 }, { "epoch": 3.1182893539581436, "grad_norm": 0.41650101855170824, "learning_rate": 3.10739999398113e-06, "loss": 0.014, "step": 6854 }, { "epoch": 3.118744313011829, "grad_norm": 0.23364049257521766, "learning_rate": 3.1060771247749287e-06, "loss": 0.0029, "step": 6855 }, { "epoch": 3.119199272065514, "grad_norm": 0.20554830683417963, "learning_rate": 3.1047544103315515e-06, "loss": 0.0053, "step": 6856 }, { "epoch": 3.1196542311191995, "grad_norm": 0.18823120029122975, "learning_rate": 3.1034318507590867e-06, "loss": 0.0023, "step": 6857 }, { "epoch": 3.1201091901728844, "grad_norm": 0.43839133144778, "learning_rate": 3.102109446165605e-06, "loss": 0.0034, "step": 6858 }, { "epoch": 3.1205641492265697, "grad_norm": 0.05775456441691884, "learning_rate": 3.10078719665917e-06, "loss": 0.0003, "step": 6859 }, { "epoch": 3.121019108280255, "grad_norm": 0.47133052133767855, "learning_rate": 3.0994651023478273e-06, "loss": 0.0043, "step": 6860 }, { "epoch": 3.12147406733394, "grad_norm": 0.2967469487959148, "learning_rate": 3.0981431633396153e-06, "loss": 0.005, "step": 6861 }, { "epoch": 3.121929026387625, "grad_norm": 0.18682640074147638, "learning_rate": 3.0968213797425543e-06, "loss": 0.0034, "step": 6862 }, { "epoch": 3.1223839854413105, "grad_norm": 0.060623855381300544, "learning_rate": 3.0954997516646535e-06, "loss": 0.0005, "step": 6863 }, { "epoch": 3.1228389444949953, "grad_norm": 0.23986937011875514, "learning_rate": 3.094178279213914e-06, "loss": 0.0048, "step": 6864 }, { "epoch": 3.1232939035486806, "grad_norm": 0.030752265465777497, "learning_rate": 3.09285696249832e-06, "loss": 0.0003, "step": 6865 }, { "epoch": 3.123748862602366, "grad_norm": 0.1249131422777919, "learning_rate": 3.091535801625841e-06, "loss": 0.0009, "step": 6866 }, { "epoch": 3.124203821656051, "grad_norm": 0.20426069213763887, "learning_rate": 3.090214796704439e-06, "loss": 0.0021, "step": 6867 }, { "epoch": 3.124658780709736, "grad_norm": 0.06901177869871337, "learning_rate": 3.0888939478420583e-06, "loss": 0.0008, "step": 6868 }, { "epoch": 3.1251137397634214, "grad_norm": 0.10847243942239054, "learning_rate": 3.087573255146634e-06, "loss": 0.0011, "step": 6869 }, { "epoch": 3.1255686988171063, "grad_norm": 0.13784424514041746, "learning_rate": 3.086252718726086e-06, "loss": 0.0015, "step": 6870 }, { "epoch": 3.1260236578707916, "grad_norm": 0.1826519258113166, "learning_rate": 3.0849323386883236e-06, "loss": 0.0021, "step": 6871 }, { "epoch": 3.126478616924477, "grad_norm": 0.14043473789225805, "learning_rate": 3.0836121151412436e-06, "loss": 0.0019, "step": 6872 }, { "epoch": 3.126933575978162, "grad_norm": 0.36756843729354177, "learning_rate": 3.0822920481927255e-06, "loss": 0.0058, "step": 6873 }, { "epoch": 3.127388535031847, "grad_norm": 0.18237617466251585, "learning_rate": 3.080972137950643e-06, "loss": 0.0016, "step": 6874 }, { "epoch": 3.1278434940855324, "grad_norm": 0.3914557887957653, "learning_rate": 3.079652384522849e-06, "loss": 0.007, "step": 6875 }, { "epoch": 3.1282984531392173, "grad_norm": 0.2644660494945359, "learning_rate": 3.0783327880171916e-06, "loss": 0.0032, "step": 6876 }, { "epoch": 3.1287534121929026, "grad_norm": 0.4032487225341823, "learning_rate": 3.077013348541498e-06, "loss": 0.015, "step": 6877 }, { "epoch": 3.129208371246588, "grad_norm": 0.2312442593073337, "learning_rate": 3.075694066203591e-06, "loss": 0.0027, "step": 6878 }, { "epoch": 3.1296633303002728, "grad_norm": 0.08588400873012622, "learning_rate": 3.0743749411112754e-06, "loss": 0.0013, "step": 6879 }, { "epoch": 3.130118289353958, "grad_norm": 0.22002806254206217, "learning_rate": 3.073055973372343e-06, "loss": 0.0027, "step": 6880 }, { "epoch": 3.1305732484076434, "grad_norm": 0.1417886830228247, "learning_rate": 3.071737163094576e-06, "loss": 0.0012, "step": 6881 }, { "epoch": 3.1310282074613287, "grad_norm": 0.1689950083208543, "learning_rate": 3.0704185103857383e-06, "loss": 0.0026, "step": 6882 }, { "epoch": 3.1314831665150136, "grad_norm": 0.04820569512335356, "learning_rate": 3.0691000153535864e-06, "loss": 0.0006, "step": 6883 }, { "epoch": 3.131938125568699, "grad_norm": 0.3576441567792067, "learning_rate": 3.0677816781058604e-06, "loss": 0.0053, "step": 6884 }, { "epoch": 3.132393084622384, "grad_norm": 0.19423967620935864, "learning_rate": 3.0664634987502905e-06, "loss": 0.0032, "step": 6885 }, { "epoch": 3.132848043676069, "grad_norm": 0.27304912751554217, "learning_rate": 3.0651454773945926e-06, "loss": 0.0041, "step": 6886 }, { "epoch": 3.1333030027297544, "grad_norm": 0.14069924471421918, "learning_rate": 3.063827614146468e-06, "loss": 0.0013, "step": 6887 }, { "epoch": 3.1337579617834397, "grad_norm": 0.33853261769032045, "learning_rate": 3.062509909113608e-06, "loss": 0.0018, "step": 6888 }, { "epoch": 3.1342129208371245, "grad_norm": 0.3300657984131354, "learning_rate": 3.061192362403687e-06, "loss": 0.0035, "step": 6889 }, { "epoch": 3.13466787989081, "grad_norm": 0.22586443625227232, "learning_rate": 3.0598749741243717e-06, "loss": 0.0043, "step": 6890 }, { "epoch": 3.135122838944495, "grad_norm": 0.3142204030631487, "learning_rate": 3.05855774438331e-06, "loss": 0.0097, "step": 6891 }, { "epoch": 3.13557779799818, "grad_norm": 0.5347180070280461, "learning_rate": 3.057240673288144e-06, "loss": 0.0071, "step": 6892 }, { "epoch": 3.1360327570518653, "grad_norm": 0.07979869757978177, "learning_rate": 3.0559237609464963e-06, "loss": 0.0007, "step": 6893 }, { "epoch": 3.1364877161055507, "grad_norm": 0.34513907395525717, "learning_rate": 3.0546070074659796e-06, "loss": 0.0094, "step": 6894 }, { "epoch": 3.1369426751592355, "grad_norm": 0.24669283060965713, "learning_rate": 3.0532904129541928e-06, "loss": 0.0054, "step": 6895 }, { "epoch": 3.137397634212921, "grad_norm": 0.26367498137508605, "learning_rate": 3.0519739775187235e-06, "loss": 0.0066, "step": 6896 }, { "epoch": 3.137852593266606, "grad_norm": 0.3499818267724872, "learning_rate": 3.050657701267142e-06, "loss": 0.0033, "step": 6897 }, { "epoch": 3.138307552320291, "grad_norm": 0.23822308511616982, "learning_rate": 3.0493415843070086e-06, "loss": 0.0089, "step": 6898 }, { "epoch": 3.1387625113739763, "grad_norm": 0.3506298325425386, "learning_rate": 3.048025626745874e-06, "loss": 0.0108, "step": 6899 }, { "epoch": 3.1392174704276616, "grad_norm": 0.16713557199923024, "learning_rate": 3.0467098286912696e-06, "loss": 0.0018, "step": 6900 }, { "epoch": 3.1396724294813465, "grad_norm": 0.24614508266707702, "learning_rate": 3.045394190250718e-06, "loss": 0.0067, "step": 6901 }, { "epoch": 3.140127388535032, "grad_norm": 0.15444915730686565, "learning_rate": 3.0440787115317243e-06, "loss": 0.0021, "step": 6902 }, { "epoch": 3.140582347588717, "grad_norm": 0.5231792062704987, "learning_rate": 3.0427633926417876e-06, "loss": 0.0074, "step": 6903 }, { "epoch": 3.141037306642402, "grad_norm": 0.3562110756967007, "learning_rate": 3.0414482336883855e-06, "loss": 0.004, "step": 6904 }, { "epoch": 3.1414922656960873, "grad_norm": 0.5750481940157537, "learning_rate": 3.040133234778989e-06, "loss": 0.0024, "step": 6905 }, { "epoch": 3.1419472247497726, "grad_norm": 0.04883965726067432, "learning_rate": 3.0388183960210554e-06, "loss": 0.0005, "step": 6906 }, { "epoch": 3.1424021838034575, "grad_norm": 0.23197639701457073, "learning_rate": 3.0375037175220247e-06, "loss": 0.005, "step": 6907 }, { "epoch": 3.142857142857143, "grad_norm": 0.4011252824886896, "learning_rate": 3.0361891993893287e-06, "loss": 0.0109, "step": 6908 }, { "epoch": 3.143312101910828, "grad_norm": 0.16329878320363317, "learning_rate": 3.0348748417303826e-06, "loss": 0.0031, "step": 6909 }, { "epoch": 3.143767060964513, "grad_norm": 0.4667068330320742, "learning_rate": 3.03356064465259e-06, "loss": 0.006, "step": 6910 }, { "epoch": 3.1442220200181983, "grad_norm": 0.2126685344140864, "learning_rate": 3.0322466082633405e-06, "loss": 0.0025, "step": 6911 }, { "epoch": 3.1446769790718836, "grad_norm": 0.2922093884797465, "learning_rate": 3.030932732670011e-06, "loss": 0.0079, "step": 6912 }, { "epoch": 3.145131938125569, "grad_norm": 0.3383758073430269, "learning_rate": 3.029619017979969e-06, "loss": 0.0106, "step": 6913 }, { "epoch": 3.1455868971792538, "grad_norm": 0.1872380842348733, "learning_rate": 3.0283054643005605e-06, "loss": 0.0017, "step": 6914 }, { "epoch": 3.146041856232939, "grad_norm": 0.22900712924516234, "learning_rate": 3.026992071739127e-06, "loss": 0.0039, "step": 6915 }, { "epoch": 3.1464968152866244, "grad_norm": 0.1062966484968155, "learning_rate": 3.0256788404029903e-06, "loss": 0.0012, "step": 6916 }, { "epoch": 3.1469517743403093, "grad_norm": 0.11574748561169698, "learning_rate": 3.024365770399464e-06, "loss": 0.0021, "step": 6917 }, { "epoch": 3.1474067333939946, "grad_norm": 0.11166745590869884, "learning_rate": 3.0230528618358435e-06, "loss": 0.0009, "step": 6918 }, { "epoch": 3.14786169244768, "grad_norm": 0.3276043045734579, "learning_rate": 3.021740114819415e-06, "loss": 0.0049, "step": 6919 }, { "epoch": 3.1483166515013647, "grad_norm": 0.16322970016947816, "learning_rate": 3.0204275294574526e-06, "loss": 0.0013, "step": 6920 }, { "epoch": 3.14877161055505, "grad_norm": 0.2662388850182962, "learning_rate": 3.019115105857211e-06, "loss": 0.0038, "step": 6921 }, { "epoch": 3.1492265696087354, "grad_norm": 0.23585195831489067, "learning_rate": 3.017802844125939e-06, "loss": 0.0024, "step": 6922 }, { "epoch": 3.1496815286624202, "grad_norm": 0.24780754269062924, "learning_rate": 3.016490744370866e-06, "loss": 0.0042, "step": 6923 }, { "epoch": 3.1501364877161055, "grad_norm": 0.18098048291154423, "learning_rate": 3.0151788066992125e-06, "loss": 0.0051, "step": 6924 }, { "epoch": 3.150591446769791, "grad_norm": 0.13069298547750907, "learning_rate": 3.013867031218183e-06, "loss": 0.0023, "step": 6925 }, { "epoch": 3.1510464058234757, "grad_norm": 0.30992928731917874, "learning_rate": 3.0125554180349694e-06, "loss": 0.0052, "step": 6926 }, { "epoch": 3.151501364877161, "grad_norm": 0.2334764034735816, "learning_rate": 3.0112439672567526e-06, "loss": 0.0052, "step": 6927 }, { "epoch": 3.1519563239308463, "grad_norm": 0.4362554464038189, "learning_rate": 3.009932678990698e-06, "loss": 0.0108, "step": 6928 }, { "epoch": 3.152411282984531, "grad_norm": 0.26549600857142336, "learning_rate": 3.008621553343959e-06, "loss": 0.0038, "step": 6929 }, { "epoch": 3.1528662420382165, "grad_norm": 0.23576495783450618, "learning_rate": 3.007310590423672e-06, "loss": 0.0024, "step": 6930 }, { "epoch": 3.153321201091902, "grad_norm": 0.18064147428010743, "learning_rate": 3.0059997903369658e-06, "loss": 0.0039, "step": 6931 }, { "epoch": 3.1537761601455867, "grad_norm": 0.2725154124219658, "learning_rate": 3.004689153190952e-06, "loss": 0.006, "step": 6932 }, { "epoch": 3.154231119199272, "grad_norm": 0.06262260074187467, "learning_rate": 3.0033786790927266e-06, "loss": 0.0006, "step": 6933 }, { "epoch": 3.1546860782529573, "grad_norm": 0.1521036144057461, "learning_rate": 3.0020683681493824e-06, "loss": 0.0018, "step": 6934 }, { "epoch": 3.1551410373066426, "grad_norm": 0.24234515722418318, "learning_rate": 3.000758220467988e-06, "loss": 0.0015, "step": 6935 }, { "epoch": 3.1555959963603275, "grad_norm": 0.09706574994749276, "learning_rate": 2.9994482361556026e-06, "loss": 0.0011, "step": 6936 }, { "epoch": 3.156050955414013, "grad_norm": 0.25646361450328253, "learning_rate": 2.9981384153192737e-06, "loss": 0.0057, "step": 6937 }, { "epoch": 3.156505914467698, "grad_norm": 0.09333352082841011, "learning_rate": 2.9968287580660328e-06, "loss": 0.0011, "step": 6938 }, { "epoch": 3.156960873521383, "grad_norm": 0.36507353945184257, "learning_rate": 2.9955192645028995e-06, "loss": 0.0069, "step": 6939 }, { "epoch": 3.1574158325750683, "grad_norm": 0.3736052944418765, "learning_rate": 2.994209934736879e-06, "loss": 0.0077, "step": 6940 }, { "epoch": 3.1578707916287536, "grad_norm": 0.19118053832227974, "learning_rate": 2.9929007688749647e-06, "loss": 0.0045, "step": 6941 }, { "epoch": 3.1583257506824385, "grad_norm": 0.028668478996688996, "learning_rate": 2.991591767024137e-06, "loss": 0.0003, "step": 6942 }, { "epoch": 3.158780709736124, "grad_norm": 0.5005650245540699, "learning_rate": 2.990282929291359e-06, "loss": 0.0041, "step": 6943 }, { "epoch": 3.159235668789809, "grad_norm": 0.21948524129154579, "learning_rate": 2.9889742557835855e-06, "loss": 0.0062, "step": 6944 }, { "epoch": 3.159690627843494, "grad_norm": 0.1688003229431753, "learning_rate": 2.9876657466077523e-06, "loss": 0.0016, "step": 6945 }, { "epoch": 3.1601455868971793, "grad_norm": 0.3526875064917373, "learning_rate": 2.9863574018707887e-06, "loss": 0.0072, "step": 6946 }, { "epoch": 3.1606005459508646, "grad_norm": 0.1432575929253344, "learning_rate": 2.9850492216796016e-06, "loss": 0.0025, "step": 6947 }, { "epoch": 3.1610555050045495, "grad_norm": 0.2527655887021309, "learning_rate": 2.983741206141094e-06, "loss": 0.0027, "step": 6948 }, { "epoch": 3.1615104640582348, "grad_norm": 0.1766949098242639, "learning_rate": 2.9824333553621515e-06, "loss": 0.0049, "step": 6949 }, { "epoch": 3.16196542311192, "grad_norm": 0.17726795076389612, "learning_rate": 2.9811256694496428e-06, "loss": 0.0016, "step": 6950 }, { "epoch": 3.162420382165605, "grad_norm": 0.1951559691219317, "learning_rate": 2.979818148510427e-06, "loss": 0.0026, "step": 6951 }, { "epoch": 3.1628753412192903, "grad_norm": 0.3866619841764015, "learning_rate": 2.978510792651349e-06, "loss": 0.0087, "step": 6952 }, { "epoch": 3.1633303002729756, "grad_norm": 0.1848239122137479, "learning_rate": 2.9772036019792415e-06, "loss": 0.0011, "step": 6953 }, { "epoch": 3.1637852593266604, "grad_norm": 0.39702477405697256, "learning_rate": 2.9758965766009187e-06, "loss": 0.0055, "step": 6954 }, { "epoch": 3.1642402183803457, "grad_norm": 0.21157337658248726, "learning_rate": 2.974589716623187e-06, "loss": 0.0059, "step": 6955 }, { "epoch": 3.164695177434031, "grad_norm": 0.09650726779714776, "learning_rate": 2.9732830221528386e-06, "loss": 0.001, "step": 6956 }, { "epoch": 3.165150136487716, "grad_norm": 0.1483336983921621, "learning_rate": 2.9719764932966477e-06, "loss": 0.0012, "step": 6957 }, { "epoch": 3.1656050955414012, "grad_norm": 0.514786651956486, "learning_rate": 2.9706701301613806e-06, "loss": 0.0151, "step": 6958 }, { "epoch": 3.1660600545950865, "grad_norm": 0.3691817509596384, "learning_rate": 2.969363932853785e-06, "loss": 0.0049, "step": 6959 }, { "epoch": 3.1665150136487714, "grad_norm": 0.2928020644138154, "learning_rate": 2.968057901480599e-06, "loss": 0.0093, "step": 6960 }, { "epoch": 3.1669699727024567, "grad_norm": 0.10110473195819367, "learning_rate": 2.9667520361485435e-06, "loss": 0.001, "step": 6961 }, { "epoch": 3.167424931756142, "grad_norm": 0.21108152820646317, "learning_rate": 2.9654463369643305e-06, "loss": 0.0032, "step": 6962 }, { "epoch": 3.167879890809827, "grad_norm": 0.20160032923496765, "learning_rate": 2.9641408040346563e-06, "loss": 0.0022, "step": 6963 }, { "epoch": 3.168334849863512, "grad_norm": 0.1257116402451997, "learning_rate": 2.9628354374662005e-06, "loss": 0.0017, "step": 6964 }, { "epoch": 3.1687898089171975, "grad_norm": 0.23255984994435436, "learning_rate": 2.961530237365634e-06, "loss": 0.0032, "step": 6965 }, { "epoch": 3.1692447679708824, "grad_norm": 0.16078873437351757, "learning_rate": 2.9602252038396097e-06, "loss": 0.0031, "step": 6966 }, { "epoch": 3.1696997270245677, "grad_norm": 0.11493960089780716, "learning_rate": 2.958920336994772e-06, "loss": 0.0007, "step": 6967 }, { "epoch": 3.170154686078253, "grad_norm": 0.3582624200962682, "learning_rate": 2.957615636937744e-06, "loss": 0.0057, "step": 6968 }, { "epoch": 3.1706096451319383, "grad_norm": 0.36328299559816346, "learning_rate": 2.9563111037751437e-06, "loss": 0.0114, "step": 6969 }, { "epoch": 3.171064604185623, "grad_norm": 0.4499217887781241, "learning_rate": 2.9550067376135723e-06, "loss": 0.0071, "step": 6970 }, { "epoch": 3.1715195632393085, "grad_norm": 0.3041462875162963, "learning_rate": 2.953702538559615e-06, "loss": 0.0048, "step": 6971 }, { "epoch": 3.171974522292994, "grad_norm": 0.282479872234923, "learning_rate": 2.952398506719844e-06, "loss": 0.0017, "step": 6972 }, { "epoch": 3.1724294813466787, "grad_norm": 0.1934463390572384, "learning_rate": 2.951094642200821e-06, "loss": 0.004, "step": 6973 }, { "epoch": 3.172884440400364, "grad_norm": 0.19199945438383587, "learning_rate": 2.9497909451090913e-06, "loss": 0.0034, "step": 6974 }, { "epoch": 3.1733393994540493, "grad_norm": 0.265309250667967, "learning_rate": 2.948487415551185e-06, "loss": 0.0056, "step": 6975 }, { "epoch": 3.173794358507734, "grad_norm": 0.2828975870477447, "learning_rate": 2.947184053633625e-06, "loss": 0.004, "step": 6976 }, { "epoch": 3.1742493175614195, "grad_norm": 0.2673498425994623, "learning_rate": 2.9458808594629117e-06, "loss": 0.0046, "step": 6977 }, { "epoch": 3.174704276615105, "grad_norm": 0.17801218073857059, "learning_rate": 2.94457783314554e-06, "loss": 0.0032, "step": 6978 }, { "epoch": 3.1751592356687897, "grad_norm": 0.4511777180967237, "learning_rate": 2.9432749747879845e-06, "loss": 0.0096, "step": 6979 }, { "epoch": 3.175614194722475, "grad_norm": 0.3218999149293914, "learning_rate": 2.9419722844967113e-06, "loss": 0.0103, "step": 6980 }, { "epoch": 3.1760691537761603, "grad_norm": 0.13031810988975212, "learning_rate": 2.940669762378168e-06, "loss": 0.0017, "step": 6981 }, { "epoch": 3.176524112829845, "grad_norm": 0.16634347564999383, "learning_rate": 2.93936740853879e-06, "loss": 0.003, "step": 6982 }, { "epoch": 3.1769790718835305, "grad_norm": 0.35073646452000296, "learning_rate": 2.9380652230850036e-06, "loss": 0.0055, "step": 6983 }, { "epoch": 3.1774340309372158, "grad_norm": 0.13779006286259865, "learning_rate": 2.9367632061232155e-06, "loss": 0.0017, "step": 6984 }, { "epoch": 3.1778889899909006, "grad_norm": 0.2066628657399087, "learning_rate": 2.935461357759821e-06, "loss": 0.0032, "step": 6985 }, { "epoch": 3.178343949044586, "grad_norm": 0.19396580606858738, "learning_rate": 2.9341596781012004e-06, "loss": 0.0018, "step": 6986 }, { "epoch": 3.1787989080982713, "grad_norm": 0.04884125106200917, "learning_rate": 2.9328581672537227e-06, "loss": 0.0005, "step": 6987 }, { "epoch": 3.179253867151956, "grad_norm": 0.22919642298421322, "learning_rate": 2.9315568253237394e-06, "loss": 0.0018, "step": 6988 }, { "epoch": 3.1797088262056414, "grad_norm": 0.13864618583544894, "learning_rate": 2.930255652417591e-06, "loss": 0.0012, "step": 6989 }, { "epoch": 3.1801637852593267, "grad_norm": 0.3397982807660512, "learning_rate": 2.9289546486416042e-06, "loss": 0.0063, "step": 6990 }, { "epoch": 3.180618744313012, "grad_norm": 0.27321968304937355, "learning_rate": 2.9276538141020907e-06, "loss": 0.0027, "step": 6991 }, { "epoch": 3.181073703366697, "grad_norm": 0.27382106641037923, "learning_rate": 2.92635314890535e-06, "loss": 0.0077, "step": 6992 }, { "epoch": 3.1815286624203822, "grad_norm": 0.1890246170642665, "learning_rate": 2.925052653157664e-06, "loss": 0.0046, "step": 6993 }, { "epoch": 3.1819836214740675, "grad_norm": 0.17113397948676443, "learning_rate": 2.923752326965306e-06, "loss": 0.0038, "step": 6994 }, { "epoch": 3.1824385805277524, "grad_norm": 0.46890350563403316, "learning_rate": 2.922452170434531e-06, "loss": 0.0072, "step": 6995 }, { "epoch": 3.1828935395814377, "grad_norm": 0.18836950500834493, "learning_rate": 2.9211521836715806e-06, "loss": 0.0022, "step": 6996 }, { "epoch": 3.183348498635123, "grad_norm": 0.058613057769979555, "learning_rate": 2.9198523667826885e-06, "loss": 0.0005, "step": 6997 }, { "epoch": 3.183803457688808, "grad_norm": 0.14162693351389868, "learning_rate": 2.9185527198740673e-06, "loss": 0.0015, "step": 6998 }, { "epoch": 3.184258416742493, "grad_norm": 0.2611812239293852, "learning_rate": 2.917253243051915e-06, "loss": 0.0055, "step": 6999 }, { "epoch": 3.1847133757961785, "grad_norm": 0.21605526072712278, "learning_rate": 2.9159539364224254e-06, "loss": 0.004, "step": 7000 }, { "epoch": 3.1851683348498634, "grad_norm": 0.19439346188644074, "learning_rate": 2.914654800091768e-06, "loss": 0.0012, "step": 7001 }, { "epoch": 3.1856232939035487, "grad_norm": 0.33966533894608314, "learning_rate": 2.9133558341661027e-06, "loss": 0.0086, "step": 7002 }, { "epoch": 3.186078252957234, "grad_norm": 0.3850582324069532, "learning_rate": 2.912057038751574e-06, "loss": 0.0056, "step": 7003 }, { "epoch": 3.186533212010919, "grad_norm": 0.22403964117173933, "learning_rate": 2.9107584139543147e-06, "loss": 0.0053, "step": 7004 }, { "epoch": 3.186988171064604, "grad_norm": 0.26608613177073764, "learning_rate": 2.909459959880445e-06, "loss": 0.0054, "step": 7005 }, { "epoch": 3.1874431301182895, "grad_norm": 0.3506465274404851, "learning_rate": 2.9081616766360665e-06, "loss": 0.0049, "step": 7006 }, { "epoch": 3.1878980891719744, "grad_norm": 0.0692620909517553, "learning_rate": 2.906863564327269e-06, "loss": 0.0007, "step": 7007 }, { "epoch": 3.1883530482256597, "grad_norm": 0.26402863797053766, "learning_rate": 2.9055656230601293e-06, "loss": 0.0036, "step": 7008 }, { "epoch": 3.188808007279345, "grad_norm": 0.4588868642887718, "learning_rate": 2.904267852940705e-06, "loss": 0.0071, "step": 7009 }, { "epoch": 3.18926296633303, "grad_norm": 0.3152284566115162, "learning_rate": 2.902970254075049e-06, "loss": 0.0037, "step": 7010 }, { "epoch": 3.189717925386715, "grad_norm": 0.4337128851726491, "learning_rate": 2.901672826569195e-06, "loss": 0.0132, "step": 7011 }, { "epoch": 3.1901728844404005, "grad_norm": 0.5011694619624966, "learning_rate": 2.900375570529162e-06, "loss": 0.0069, "step": 7012 }, { "epoch": 3.1906278434940853, "grad_norm": 0.3373852818093339, "learning_rate": 2.8990784860609555e-06, "loss": 0.0126, "step": 7013 }, { "epoch": 3.1910828025477707, "grad_norm": 0.2903091018628997, "learning_rate": 2.897781573270565e-06, "loss": 0.0064, "step": 7014 }, { "epoch": 3.191537761601456, "grad_norm": 0.3513440393628483, "learning_rate": 2.896484832263973e-06, "loss": 0.0037, "step": 7015 }, { "epoch": 3.191992720655141, "grad_norm": 0.11824010187024156, "learning_rate": 2.895188263147141e-06, "loss": 0.0013, "step": 7016 }, { "epoch": 3.192447679708826, "grad_norm": 0.6185486354105568, "learning_rate": 2.8938918660260173e-06, "loss": 0.0107, "step": 7017 }, { "epoch": 3.1929026387625115, "grad_norm": 0.4431826813186897, "learning_rate": 2.8925956410065414e-06, "loss": 0.0072, "step": 7018 }, { "epoch": 3.1933575978161963, "grad_norm": 0.19884086207376586, "learning_rate": 2.8912995881946303e-06, "loss": 0.0039, "step": 7019 }, { "epoch": 3.1938125568698816, "grad_norm": 0.19480040542304444, "learning_rate": 2.890003707696196e-06, "loss": 0.0033, "step": 7020 }, { "epoch": 3.194267515923567, "grad_norm": 0.1703072898911511, "learning_rate": 2.88870799961713e-06, "loss": 0.0026, "step": 7021 }, { "epoch": 3.194722474977252, "grad_norm": 0.5755124786510741, "learning_rate": 2.8874124640633115e-06, "loss": 0.0082, "step": 7022 }, { "epoch": 3.195177434030937, "grad_norm": 0.4408051162755984, "learning_rate": 2.8861171011406052e-06, "loss": 0.0161, "step": 7023 }, { "epoch": 3.1956323930846224, "grad_norm": 0.22693617648642977, "learning_rate": 2.8848219109548623e-06, "loss": 0.0029, "step": 7024 }, { "epoch": 3.1960873521383077, "grad_norm": 0.560357126303314, "learning_rate": 2.8835268936119233e-06, "loss": 0.0032, "step": 7025 }, { "epoch": 3.1965423111919926, "grad_norm": 0.18668153440199636, "learning_rate": 2.882232049217608e-06, "loss": 0.0028, "step": 7026 }, { "epoch": 3.196997270245678, "grad_norm": 0.27298531267990983, "learning_rate": 2.8809373778777262e-06, "loss": 0.0068, "step": 7027 }, { "epoch": 3.1974522292993632, "grad_norm": 0.29394006000889444, "learning_rate": 2.87964287969807e-06, "loss": 0.0026, "step": 7028 }, { "epoch": 3.197907188353048, "grad_norm": 0.13356383356886484, "learning_rate": 2.8783485547844247e-06, "loss": 0.0026, "step": 7029 }, { "epoch": 3.1983621474067334, "grad_norm": 0.111298059352568, "learning_rate": 2.877054403242554e-06, "loss": 0.0008, "step": 7030 }, { "epoch": 3.1988171064604187, "grad_norm": 0.14570272870682158, "learning_rate": 2.8757604251782077e-06, "loss": 0.0015, "step": 7031 }, { "epoch": 3.1992720655141036, "grad_norm": 0.18254357335485907, "learning_rate": 2.8744666206971295e-06, "loss": 0.0051, "step": 7032 }, { "epoch": 3.199727024567789, "grad_norm": 0.19640566322377645, "learning_rate": 2.8731729899050374e-06, "loss": 0.0066, "step": 7033 }, { "epoch": 3.200181983621474, "grad_norm": 0.17340254397041094, "learning_rate": 2.8718795329076465e-06, "loss": 0.0019, "step": 7034 }, { "epoch": 3.200636942675159, "grad_norm": 0.3315625381531741, "learning_rate": 2.8705862498106496e-06, "loss": 0.008, "step": 7035 }, { "epoch": 3.2010919017288444, "grad_norm": 0.1320339384241569, "learning_rate": 2.8692931407197276e-06, "loss": 0.0012, "step": 7036 }, { "epoch": 3.2015468607825297, "grad_norm": 0.07674289430099869, "learning_rate": 2.8680002057405466e-06, "loss": 0.0007, "step": 7037 }, { "epoch": 3.2020018198362146, "grad_norm": 0.24346719572021686, "learning_rate": 2.86670744497876e-06, "loss": 0.0035, "step": 7038 }, { "epoch": 3.2024567788899, "grad_norm": 0.17979439699527758, "learning_rate": 2.86541485854001e-06, "loss": 0.0029, "step": 7039 }, { "epoch": 3.202911737943585, "grad_norm": 0.15306681712630646, "learning_rate": 2.864122446529918e-06, "loss": 0.0025, "step": 7040 }, { "epoch": 3.20336669699727, "grad_norm": 0.16833353442459245, "learning_rate": 2.8628302090540938e-06, "loss": 0.0033, "step": 7041 }, { "epoch": 3.2038216560509554, "grad_norm": 0.5090333769918682, "learning_rate": 2.861538146218133e-06, "loss": 0.0115, "step": 7042 }, { "epoch": 3.2042766151046407, "grad_norm": 0.058486268020582495, "learning_rate": 2.8602462581276166e-06, "loss": 0.0006, "step": 7043 }, { "epoch": 3.2047315741583255, "grad_norm": 0.2321510470825738, "learning_rate": 2.858954544888114e-06, "loss": 0.0064, "step": 7044 }, { "epoch": 3.205186533212011, "grad_norm": 0.28357815960626026, "learning_rate": 2.857663006605176e-06, "loss": 0.0069, "step": 7045 }, { "epoch": 3.205641492265696, "grad_norm": 0.3739588837097919, "learning_rate": 2.8563716433843438e-06, "loss": 0.0123, "step": 7046 }, { "epoch": 3.2060964513193815, "grad_norm": 0.1739856858299897, "learning_rate": 2.8550804553311407e-06, "loss": 0.0026, "step": 7047 }, { "epoch": 3.2065514103730663, "grad_norm": 0.3729315402442275, "learning_rate": 2.8537894425510743e-06, "loss": 0.0094, "step": 7048 }, { "epoch": 3.2070063694267517, "grad_norm": 0.19879084739198827, "learning_rate": 2.8524986051496438e-06, "loss": 0.0054, "step": 7049 }, { "epoch": 3.207461328480437, "grad_norm": 0.2507443550659683, "learning_rate": 2.85120794323233e-06, "loss": 0.0067, "step": 7050 }, { "epoch": 3.207916287534122, "grad_norm": 0.2034723154478191, "learning_rate": 2.8499174569045997e-06, "loss": 0.0054, "step": 7051 }, { "epoch": 3.208371246587807, "grad_norm": 0.19812395661781815, "learning_rate": 2.8486271462719024e-06, "loss": 0.0039, "step": 7052 }, { "epoch": 3.2088262056414925, "grad_norm": 0.12151231660747276, "learning_rate": 2.847337011439679e-06, "loss": 0.0016, "step": 7053 }, { "epoch": 3.2092811646951773, "grad_norm": 0.06625285711406216, "learning_rate": 2.8460470525133565e-06, "loss": 0.0009, "step": 7054 }, { "epoch": 3.2097361237488626, "grad_norm": 0.2222268868967166, "learning_rate": 2.8447572695983413e-06, "loss": 0.0027, "step": 7055 }, { "epoch": 3.210191082802548, "grad_norm": 0.22375109937504972, "learning_rate": 2.843467662800029e-06, "loss": 0.004, "step": 7056 }, { "epoch": 3.210646041856233, "grad_norm": 0.24148247330602277, "learning_rate": 2.8421782322237983e-06, "loss": 0.0041, "step": 7057 }, { "epoch": 3.211101000909918, "grad_norm": 0.4036283257293504, "learning_rate": 2.8408889779750204e-06, "loss": 0.0079, "step": 7058 }, { "epoch": 3.2115559599636034, "grad_norm": 0.16012253927637513, "learning_rate": 2.839599900159042e-06, "loss": 0.0031, "step": 7059 }, { "epoch": 3.2120109190172883, "grad_norm": 0.19575698891162982, "learning_rate": 2.838310998881206e-06, "loss": 0.004, "step": 7060 }, { "epoch": 3.2124658780709736, "grad_norm": 0.23238409751911412, "learning_rate": 2.8370222742468324e-06, "loss": 0.0028, "step": 7061 }, { "epoch": 3.212920837124659, "grad_norm": 1.1799028822947624, "learning_rate": 2.8357337263612294e-06, "loss": 0.0079, "step": 7062 }, { "epoch": 3.213375796178344, "grad_norm": 0.11704122077802191, "learning_rate": 2.8344453553296942e-06, "loss": 0.0009, "step": 7063 }, { "epoch": 3.213830755232029, "grad_norm": 0.25464919161887095, "learning_rate": 2.833157161257505e-06, "loss": 0.0039, "step": 7064 }, { "epoch": 3.2142857142857144, "grad_norm": 0.1701445119296061, "learning_rate": 2.8318691442499275e-06, "loss": 0.0015, "step": 7065 }, { "epoch": 3.2147406733393993, "grad_norm": 0.022517491579665552, "learning_rate": 2.83058130441221e-06, "loss": 0.0002, "step": 7066 }, { "epoch": 3.2151956323930846, "grad_norm": 0.2986016634993806, "learning_rate": 2.8292936418495913e-06, "loss": 0.0025, "step": 7067 }, { "epoch": 3.21565059144677, "grad_norm": 0.25424537343014675, "learning_rate": 2.8280061566672957e-06, "loss": 0.003, "step": 7068 }, { "epoch": 3.2161055505004548, "grad_norm": 0.21837940611362922, "learning_rate": 2.8267188489705275e-06, "loss": 0.0022, "step": 7069 }, { "epoch": 3.21656050955414, "grad_norm": 0.1724310669243258, "learning_rate": 2.825431718864482e-06, "loss": 0.0019, "step": 7070 }, { "epoch": 3.2170154686078254, "grad_norm": 0.3434573885395083, "learning_rate": 2.824144766454333e-06, "loss": 0.0074, "step": 7071 }, { "epoch": 3.2174704276615103, "grad_norm": 0.17924963646789135, "learning_rate": 2.82285799184525e-06, "loss": 0.0038, "step": 7072 }, { "epoch": 3.2179253867151956, "grad_norm": 0.22808845598214292, "learning_rate": 2.8215713951423772e-06, "loss": 0.0049, "step": 7073 }, { "epoch": 3.218380345768881, "grad_norm": 0.16599836575910004, "learning_rate": 2.8202849764508554e-06, "loss": 0.0022, "step": 7074 }, { "epoch": 3.2188353048225657, "grad_norm": 0.3223551171330232, "learning_rate": 2.8189987358758018e-06, "loss": 0.0124, "step": 7075 }, { "epoch": 3.219290263876251, "grad_norm": 0.12228389008917982, "learning_rate": 2.8177126735223193e-06, "loss": 0.0011, "step": 7076 }, { "epoch": 3.2197452229299364, "grad_norm": 0.20551910180727626, "learning_rate": 2.8164267894955045e-06, "loss": 0.0038, "step": 7077 }, { "epoch": 3.2202001819836217, "grad_norm": 0.15038531484880807, "learning_rate": 2.8151410839004325e-06, "loss": 0.0036, "step": 7078 }, { "epoch": 3.2206551410373065, "grad_norm": 0.4052783988873605, "learning_rate": 2.8138555568421625e-06, "loss": 0.0071, "step": 7079 }, { "epoch": 3.221110100090992, "grad_norm": 0.16447661337097008, "learning_rate": 2.8125702084257432e-06, "loss": 0.0019, "step": 7080 }, { "epoch": 3.221565059144677, "grad_norm": 0.1262644516927067, "learning_rate": 2.8112850387562097e-06, "loss": 0.0008, "step": 7081 }, { "epoch": 3.222020018198362, "grad_norm": 0.11041599551115908, "learning_rate": 2.810000047938577e-06, "loss": 0.0009, "step": 7082 }, { "epoch": 3.2224749772520473, "grad_norm": 0.1798952670690953, "learning_rate": 2.8087152360778513e-06, "loss": 0.0021, "step": 7083 }, { "epoch": 3.2229299363057327, "grad_norm": 0.060136204294505696, "learning_rate": 2.807430603279022e-06, "loss": 0.0005, "step": 7084 }, { "epoch": 3.2233848953594175, "grad_norm": 0.13404582005182145, "learning_rate": 2.806146149647062e-06, "loss": 0.0012, "step": 7085 }, { "epoch": 3.223839854413103, "grad_norm": 0.43716814240493523, "learning_rate": 2.804861875286929e-06, "loss": 0.0083, "step": 7086 }, { "epoch": 3.224294813466788, "grad_norm": 0.3108530374716742, "learning_rate": 2.8035777803035703e-06, "loss": 0.0058, "step": 7087 }, { "epoch": 3.224749772520473, "grad_norm": 0.2317295713780408, "learning_rate": 2.8022938648019187e-06, "loss": 0.002, "step": 7088 }, { "epoch": 3.2252047315741583, "grad_norm": 0.11282755114913963, "learning_rate": 2.801010128886888e-06, "loss": 0.002, "step": 7089 }, { "epoch": 3.2256596906278436, "grad_norm": 0.2165071297806928, "learning_rate": 2.7997265726633783e-06, "loss": 0.003, "step": 7090 }, { "epoch": 3.2261146496815285, "grad_norm": 0.020343813297191718, "learning_rate": 2.7984431962362758e-06, "loss": 0.0002, "step": 7091 }, { "epoch": 3.226569608735214, "grad_norm": 0.06077218066603958, "learning_rate": 2.797159999710454e-06, "loss": 0.0006, "step": 7092 }, { "epoch": 3.227024567788899, "grad_norm": 0.12333300168926505, "learning_rate": 2.7958769831907694e-06, "loss": 0.0016, "step": 7093 }, { "epoch": 3.227479526842584, "grad_norm": 0.5918758359390299, "learning_rate": 2.7945941467820626e-06, "loss": 0.0147, "step": 7094 }, { "epoch": 3.2279344858962693, "grad_norm": 0.20177534987550425, "learning_rate": 2.793311490589164e-06, "loss": 0.002, "step": 7095 }, { "epoch": 3.2283894449499546, "grad_norm": 0.14393835941534916, "learning_rate": 2.792029014716883e-06, "loss": 0.0017, "step": 7096 }, { "epoch": 3.2288444040036395, "grad_norm": 0.14805804298902528, "learning_rate": 2.7907467192700222e-06, "loss": 0.0014, "step": 7097 }, { "epoch": 3.229299363057325, "grad_norm": 0.2160902593340542, "learning_rate": 2.7894646043533623e-06, "loss": 0.0097, "step": 7098 }, { "epoch": 3.22975432211101, "grad_norm": 0.33562528419967586, "learning_rate": 2.7881826700716724e-06, "loss": 0.0051, "step": 7099 }, { "epoch": 3.2302092811646954, "grad_norm": 0.09317744494325432, "learning_rate": 2.7869009165297046e-06, "loss": 0.0009, "step": 7100 }, { "epoch": 3.2306642402183803, "grad_norm": 0.1849985981344789, "learning_rate": 2.785619343832199e-06, "loss": 0.0023, "step": 7101 }, { "epoch": 3.2311191992720656, "grad_norm": 0.04485944164390735, "learning_rate": 2.784337952083883e-06, "loss": 0.0005, "step": 7102 }, { "epoch": 3.231574158325751, "grad_norm": 0.24686562422397823, "learning_rate": 2.783056741389464e-06, "loss": 0.0023, "step": 7103 }, { "epoch": 3.2320291173794358, "grad_norm": 0.2815808173089651, "learning_rate": 2.7817757118536354e-06, "loss": 0.0039, "step": 7104 }, { "epoch": 3.232484076433121, "grad_norm": 0.3456378578190567, "learning_rate": 2.780494863581077e-06, "loss": 0.0058, "step": 7105 }, { "epoch": 3.2329390354868064, "grad_norm": 0.21264962376783553, "learning_rate": 2.779214196676457e-06, "loss": 0.002, "step": 7106 }, { "epoch": 3.2333939945404913, "grad_norm": 0.3646219696749776, "learning_rate": 2.7779337112444238e-06, "loss": 0.009, "step": 7107 }, { "epoch": 3.2338489535941766, "grad_norm": 0.4208559385557234, "learning_rate": 2.776653407389611e-06, "loss": 0.0063, "step": 7108 }, { "epoch": 3.234303912647862, "grad_norm": 0.19330467771961016, "learning_rate": 2.7753732852166428e-06, "loss": 0.0043, "step": 7109 }, { "epoch": 3.2347588717015467, "grad_norm": 0.1344147761618293, "learning_rate": 2.774093344830122e-06, "loss": 0.0015, "step": 7110 }, { "epoch": 3.235213830755232, "grad_norm": 0.2657374909842001, "learning_rate": 2.7728135863346427e-06, "loss": 0.005, "step": 7111 }, { "epoch": 3.2356687898089174, "grad_norm": 0.4072459750414462, "learning_rate": 2.7715340098347794e-06, "loss": 0.0026, "step": 7112 }, { "epoch": 3.2361237488626022, "grad_norm": 0.1442266297551056, "learning_rate": 2.770254615435093e-06, "loss": 0.0036, "step": 7113 }, { "epoch": 3.2365787079162875, "grad_norm": 0.22488065683105946, "learning_rate": 2.76897540324013e-06, "loss": 0.0024, "step": 7114 }, { "epoch": 3.237033666969973, "grad_norm": 0.26262719206301965, "learning_rate": 2.76769637335442e-06, "loss": 0.0033, "step": 7115 }, { "epoch": 3.2374886260236577, "grad_norm": 0.18923068007150018, "learning_rate": 2.766417525882481e-06, "loss": 0.0041, "step": 7116 }, { "epoch": 3.237943585077343, "grad_norm": 0.3162808890372713, "learning_rate": 2.7651388609288177e-06, "loss": 0.0032, "step": 7117 }, { "epoch": 3.2383985441310283, "grad_norm": 0.20015900847028084, "learning_rate": 2.7638603785979133e-06, "loss": 0.0054, "step": 7118 }, { "epoch": 3.238853503184713, "grad_norm": 0.28257635956012495, "learning_rate": 2.762582078994241e-06, "loss": 0.0031, "step": 7119 }, { "epoch": 3.2393084622383985, "grad_norm": 0.043847211302883014, "learning_rate": 2.761303962222255e-06, "loss": 0.0004, "step": 7120 }, { "epoch": 3.239763421292084, "grad_norm": 0.09330151131243765, "learning_rate": 2.760026028386401e-06, "loss": 0.0008, "step": 7121 }, { "epoch": 3.2402183803457687, "grad_norm": 0.15342091258456772, "learning_rate": 2.7587482775911024e-06, "loss": 0.0015, "step": 7122 }, { "epoch": 3.240673339399454, "grad_norm": 0.20887414195070997, "learning_rate": 2.757470709940776e-06, "loss": 0.0034, "step": 7123 }, { "epoch": 3.2411282984531393, "grad_norm": 0.24010222913705828, "learning_rate": 2.7561933255398156e-06, "loss": 0.0037, "step": 7124 }, { "epoch": 3.241583257506824, "grad_norm": 0.275266451650375, "learning_rate": 2.754916124492601e-06, "loss": 0.0028, "step": 7125 }, { "epoch": 3.2420382165605095, "grad_norm": 0.19890008052552316, "learning_rate": 2.7536391069035046e-06, "loss": 0.002, "step": 7126 }, { "epoch": 3.242493175614195, "grad_norm": 0.1416871420440179, "learning_rate": 2.7523622728768757e-06, "loss": 0.0007, "step": 7127 }, { "epoch": 3.2429481346678797, "grad_norm": 0.08239089863556529, "learning_rate": 2.7510856225170513e-06, "loss": 0.0006, "step": 7128 }, { "epoch": 3.243403093721565, "grad_norm": 0.10984184017568707, "learning_rate": 2.7498091559283525e-06, "loss": 0.0007, "step": 7129 }, { "epoch": 3.2438580527752503, "grad_norm": 0.21224609107233322, "learning_rate": 2.7485328732150872e-06, "loss": 0.0051, "step": 7130 }, { "epoch": 3.244313011828935, "grad_norm": 0.3065831000641192, "learning_rate": 2.7472567744815506e-06, "loss": 0.0024, "step": 7131 }, { "epoch": 3.2447679708826205, "grad_norm": 0.3190289370033862, "learning_rate": 2.7459808598320165e-06, "loss": 0.0028, "step": 7132 }, { "epoch": 3.245222929936306, "grad_norm": 0.1621658989961227, "learning_rate": 2.7447051293707476e-06, "loss": 0.0015, "step": 7133 }, { "epoch": 3.245677888989991, "grad_norm": 0.39736181091782363, "learning_rate": 2.7434295832019887e-06, "loss": 0.0061, "step": 7134 }, { "epoch": 3.246132848043676, "grad_norm": 0.3990589807723281, "learning_rate": 2.7421542214299756e-06, "loss": 0.0075, "step": 7135 }, { "epoch": 3.2465878070973613, "grad_norm": 0.20334170027250575, "learning_rate": 2.7408790441589217e-06, "loss": 0.007, "step": 7136 }, { "epoch": 3.2470427661510466, "grad_norm": 0.1532240001827751, "learning_rate": 2.7396040514930316e-06, "loss": 0.0033, "step": 7137 }, { "epoch": 3.2474977252047315, "grad_norm": 0.24004564455352015, "learning_rate": 2.7383292435364908e-06, "loss": 0.0028, "step": 7138 }, { "epoch": 3.2479526842584168, "grad_norm": 0.05297090734355906, "learning_rate": 2.737054620393469e-06, "loss": 0.0003, "step": 7139 }, { "epoch": 3.248407643312102, "grad_norm": 0.10141762631608119, "learning_rate": 2.7357801821681255e-06, "loss": 0.0008, "step": 7140 }, { "epoch": 3.248862602365787, "grad_norm": 0.1890439388552718, "learning_rate": 2.734505928964601e-06, "loss": 0.0015, "step": 7141 }, { "epoch": 3.2493175614194723, "grad_norm": 0.18501097364674118, "learning_rate": 2.733231860887021e-06, "loss": 0.0021, "step": 7142 }, { "epoch": 3.2497725204731576, "grad_norm": 0.03364657933446476, "learning_rate": 2.7319579780394943e-06, "loss": 0.0002, "step": 7143 }, { "epoch": 3.2502274795268424, "grad_norm": 0.34418281924681093, "learning_rate": 2.730684280526119e-06, "loss": 0.0057, "step": 7144 }, { "epoch": 3.2506824385805277, "grad_norm": 0.05360713459967386, "learning_rate": 2.729410768450979e-06, "loss": 0.0003, "step": 7145 }, { "epoch": 3.251137397634213, "grad_norm": 0.169200753952908, "learning_rate": 2.7281374419181367e-06, "loss": 0.0013, "step": 7146 }, { "epoch": 3.251592356687898, "grad_norm": 0.14898552955724903, "learning_rate": 2.726864301031643e-06, "loss": 0.0012, "step": 7147 }, { "epoch": 3.2520473157415832, "grad_norm": 0.0703381828959106, "learning_rate": 2.725591345895533e-06, "loss": 0.0005, "step": 7148 }, { "epoch": 3.2525022747952685, "grad_norm": 0.42677499836340976, "learning_rate": 2.7243185766138257e-06, "loss": 0.0078, "step": 7149 }, { "epoch": 3.2529572338489534, "grad_norm": 0.10280017033093113, "learning_rate": 2.7230459932905275e-06, "loss": 0.0008, "step": 7150 }, { "epoch": 3.2534121929026387, "grad_norm": 0.28943015042820897, "learning_rate": 2.7217735960296295e-06, "loss": 0.0035, "step": 7151 }, { "epoch": 3.253867151956324, "grad_norm": 0.12429545745869343, "learning_rate": 2.720501384935105e-06, "loss": 0.0005, "step": 7152 }, { "epoch": 3.2543221110100093, "grad_norm": 0.06051664488385681, "learning_rate": 2.7192293601109134e-06, "loss": 0.0006, "step": 7153 }, { "epoch": 3.254777070063694, "grad_norm": 0.24896660037657153, "learning_rate": 2.717957521660996e-06, "loss": 0.0049, "step": 7154 }, { "epoch": 3.2552320291173795, "grad_norm": 0.17915147040713456, "learning_rate": 2.7166858696892867e-06, "loss": 0.0037, "step": 7155 }, { "epoch": 3.255686988171065, "grad_norm": 0.20377455551361887, "learning_rate": 2.715414404299697e-06, "loss": 0.002, "step": 7156 }, { "epoch": 3.2561419472247497, "grad_norm": 0.3373844136749358, "learning_rate": 2.7141431255961228e-06, "loss": 0.0038, "step": 7157 }, { "epoch": 3.256596906278435, "grad_norm": 0.32355393248923936, "learning_rate": 2.7128720336824523e-06, "loss": 0.0041, "step": 7158 }, { "epoch": 3.2570518653321203, "grad_norm": 0.14333946189776586, "learning_rate": 2.7116011286625478e-06, "loss": 0.0014, "step": 7159 }, { "epoch": 3.257506824385805, "grad_norm": 0.2600585350353282, "learning_rate": 2.710330410640267e-06, "loss": 0.0023, "step": 7160 }, { "epoch": 3.2579617834394905, "grad_norm": 0.09719011715006663, "learning_rate": 2.7090598797194447e-06, "loss": 0.0022, "step": 7161 }, { "epoch": 3.258416742493176, "grad_norm": 0.11143143654325995, "learning_rate": 2.707789536003903e-06, "loss": 0.0017, "step": 7162 }, { "epoch": 3.2588717015468607, "grad_norm": 0.34203956354533316, "learning_rate": 2.7065193795974474e-06, "loss": 0.0107, "step": 7163 }, { "epoch": 3.259326660600546, "grad_norm": 0.41330817811248277, "learning_rate": 2.705249410603871e-06, "loss": 0.0086, "step": 7164 }, { "epoch": 3.2597816196542313, "grad_norm": 0.09824676055711044, "learning_rate": 2.7039796291269516e-06, "loss": 0.0005, "step": 7165 }, { "epoch": 3.260236578707916, "grad_norm": 0.06807850809841584, "learning_rate": 2.7027100352704484e-06, "loss": 0.0004, "step": 7166 }, { "epoch": 3.2606915377616015, "grad_norm": 0.02428812660313921, "learning_rate": 2.7014406291381057e-06, "loss": 0.0003, "step": 7167 }, { "epoch": 3.261146496815287, "grad_norm": 0.30857124081147036, "learning_rate": 2.7001714108336535e-06, "loss": 0.0093, "step": 7168 }, { "epoch": 3.2616014558689717, "grad_norm": 0.4031854097512622, "learning_rate": 2.6989023804608095e-06, "loss": 0.0034, "step": 7169 }, { "epoch": 3.262056414922657, "grad_norm": 0.10851853189900855, "learning_rate": 2.697633538123271e-06, "loss": 0.0006, "step": 7170 }, { "epoch": 3.2625113739763423, "grad_norm": 0.2690155188250169, "learning_rate": 2.696364883924721e-06, "loss": 0.0033, "step": 7171 }, { "epoch": 3.262966333030027, "grad_norm": 0.2181276629041794, "learning_rate": 2.695096417968831e-06, "loss": 0.0021, "step": 7172 }, { "epoch": 3.2634212920837125, "grad_norm": 0.23758819563699266, "learning_rate": 2.6938281403592508e-06, "loss": 0.0018, "step": 7173 }, { "epoch": 3.2638762511373978, "grad_norm": 0.7967636126170162, "learning_rate": 2.692560051199623e-06, "loss": 0.042, "step": 7174 }, { "epoch": 3.2643312101910826, "grad_norm": 0.4211075960736797, "learning_rate": 2.691292150593567e-06, "loss": 0.0028, "step": 7175 }, { "epoch": 3.264786169244768, "grad_norm": 0.20546727432724868, "learning_rate": 2.6900244386446903e-06, "loss": 0.003, "step": 7176 }, { "epoch": 3.2652411282984533, "grad_norm": 0.20224565986671209, "learning_rate": 2.688756915456583e-06, "loss": 0.002, "step": 7177 }, { "epoch": 3.265696087352138, "grad_norm": 0.32028901986308433, "learning_rate": 2.6874895811328227e-06, "loss": 0.0047, "step": 7178 }, { "epoch": 3.2661510464058234, "grad_norm": 0.3517715475866459, "learning_rate": 2.6862224357769735e-06, "loss": 0.0034, "step": 7179 }, { "epoch": 3.2666060054595087, "grad_norm": 0.16322044110343517, "learning_rate": 2.684955479492577e-06, "loss": 0.001, "step": 7180 }, { "epoch": 3.2670609645131936, "grad_norm": 0.19287541425823002, "learning_rate": 2.6836887123831646e-06, "loss": 0.0026, "step": 7181 }, { "epoch": 3.267515923566879, "grad_norm": 0.21650577938346618, "learning_rate": 2.6824221345522485e-06, "loss": 0.0015, "step": 7182 }, { "epoch": 3.2679708826205642, "grad_norm": 0.26646322209002665, "learning_rate": 2.6811557461033313e-06, "loss": 0.0026, "step": 7183 }, { "epoch": 3.268425841674249, "grad_norm": 0.25724749351765963, "learning_rate": 2.6798895471398945e-06, "loss": 0.0037, "step": 7184 }, { "epoch": 3.2688808007279344, "grad_norm": 0.11269082970141436, "learning_rate": 2.678623537765404e-06, "loss": 0.0013, "step": 7185 }, { "epoch": 3.2693357597816197, "grad_norm": 0.24740827637594645, "learning_rate": 2.6773577180833173e-06, "loss": 0.006, "step": 7186 }, { "epoch": 3.2697907188353046, "grad_norm": 0.24586612229392948, "learning_rate": 2.6760920881970688e-06, "loss": 0.0032, "step": 7187 }, { "epoch": 3.27024567788899, "grad_norm": 0.3197566711858623, "learning_rate": 2.674826648210078e-06, "loss": 0.0041, "step": 7188 }, { "epoch": 3.270700636942675, "grad_norm": 0.15619097272415708, "learning_rate": 2.673561398225755e-06, "loss": 0.0012, "step": 7189 }, { "epoch": 3.2711555959963605, "grad_norm": 0.18916658455143268, "learning_rate": 2.6722963383474888e-06, "loss": 0.0014, "step": 7190 }, { "epoch": 3.2716105550500454, "grad_norm": 0.23624471707474365, "learning_rate": 2.6710314686786544e-06, "loss": 0.0046, "step": 7191 }, { "epoch": 3.2720655141037307, "grad_norm": 0.4881128093896115, "learning_rate": 2.6697667893226077e-06, "loss": 0.0116, "step": 7192 }, { "epoch": 3.272520473157416, "grad_norm": 0.2804352033956888, "learning_rate": 2.6685023003826965e-06, "loss": 0.0027, "step": 7193 }, { "epoch": 3.272975432211101, "grad_norm": 0.2267541804256952, "learning_rate": 2.6672380019622503e-06, "loss": 0.0029, "step": 7194 }, { "epoch": 3.273430391264786, "grad_norm": 0.2757278787292266, "learning_rate": 2.6659738941645797e-06, "loss": 0.003, "step": 7195 }, { "epoch": 3.2738853503184715, "grad_norm": 0.09656253820978641, "learning_rate": 2.6647099770929824e-06, "loss": 0.0009, "step": 7196 }, { "epoch": 3.2743403093721564, "grad_norm": 0.2568811260632643, "learning_rate": 2.6634462508507375e-06, "loss": 0.0021, "step": 7197 }, { "epoch": 3.2747952684258417, "grad_norm": 0.16199808895737475, "learning_rate": 2.662182715541115e-06, "loss": 0.0015, "step": 7198 }, { "epoch": 3.275250227479527, "grad_norm": 0.17744029718226378, "learning_rate": 2.660919371267362e-06, "loss": 0.0031, "step": 7199 }, { "epoch": 3.275705186533212, "grad_norm": 0.29539641511409087, "learning_rate": 2.659656218132717e-06, "loss": 0.0053, "step": 7200 }, { "epoch": 3.276160145586897, "grad_norm": 0.28386411505165304, "learning_rate": 2.658393256240396e-06, "loss": 0.0047, "step": 7201 }, { "epoch": 3.2766151046405825, "grad_norm": 0.14718863575620184, "learning_rate": 2.657130485693602e-06, "loss": 0.0026, "step": 7202 }, { "epoch": 3.2770700636942673, "grad_norm": 0.24779703909292675, "learning_rate": 2.655867906595526e-06, "loss": 0.0052, "step": 7203 }, { "epoch": 3.2775250227479527, "grad_norm": 0.17435684825661954, "learning_rate": 2.65460551904934e-06, "loss": 0.002, "step": 7204 }, { "epoch": 3.277979981801638, "grad_norm": 0.3549564471563982, "learning_rate": 2.653343323158198e-06, "loss": 0.0022, "step": 7205 }, { "epoch": 3.278434940855323, "grad_norm": 0.1819617948665579, "learning_rate": 2.6520813190252404e-06, "loss": 0.0022, "step": 7206 }, { "epoch": 3.278889899909008, "grad_norm": 0.22062113504077227, "learning_rate": 2.6508195067535945e-06, "loss": 0.002, "step": 7207 }, { "epoch": 3.2793448589626935, "grad_norm": 0.31409163144651064, "learning_rate": 2.649557886446372e-06, "loss": 0.0049, "step": 7208 }, { "epoch": 3.2797998180163788, "grad_norm": 0.2796765425652682, "learning_rate": 2.648296458206664e-06, "loss": 0.0021, "step": 7209 }, { "epoch": 3.2802547770700636, "grad_norm": 0.25773002985936555, "learning_rate": 2.6470352221375496e-06, "loss": 0.0031, "step": 7210 }, { "epoch": 3.280709736123749, "grad_norm": 0.241479713747486, "learning_rate": 2.6457741783420885e-06, "loss": 0.0019, "step": 7211 }, { "epoch": 3.2811646951774343, "grad_norm": 0.3235988011783772, "learning_rate": 2.6445133269233325e-06, "loss": 0.0085, "step": 7212 }, { "epoch": 3.281619654231119, "grad_norm": 0.12733756900378967, "learning_rate": 2.6432526679843074e-06, "loss": 0.0014, "step": 7213 }, { "epoch": 3.2820746132848044, "grad_norm": 0.39314498928669095, "learning_rate": 2.641992201628034e-06, "loss": 0.0042, "step": 7214 }, { "epoch": 3.2825295723384897, "grad_norm": 0.09141632687175844, "learning_rate": 2.6407319279575088e-06, "loss": 0.0011, "step": 7215 }, { "epoch": 3.2829845313921746, "grad_norm": 0.2826436391514078, "learning_rate": 2.639471847075714e-06, "loss": 0.0057, "step": 7216 }, { "epoch": 3.28343949044586, "grad_norm": 0.12212552924455057, "learning_rate": 2.6382119590856226e-06, "loss": 0.0011, "step": 7217 }, { "epoch": 3.2838944494995452, "grad_norm": 0.24209539677389158, "learning_rate": 2.6369522640901836e-06, "loss": 0.0018, "step": 7218 }, { "epoch": 3.28434940855323, "grad_norm": 0.35241312909161665, "learning_rate": 2.6356927621923343e-06, "loss": 0.0082, "step": 7219 }, { "epoch": 3.2848043676069154, "grad_norm": 0.38443106051345766, "learning_rate": 2.634433453494993e-06, "loss": 0.0089, "step": 7220 }, { "epoch": 3.2852593266606007, "grad_norm": 0.4712211357254818, "learning_rate": 2.633174338101068e-06, "loss": 0.0046, "step": 7221 }, { "epoch": 3.2857142857142856, "grad_norm": 0.19369858066775172, "learning_rate": 2.6319154161134484e-06, "loss": 0.0021, "step": 7222 }, { "epoch": 3.286169244767971, "grad_norm": 0.3276655948136777, "learning_rate": 2.6306566876350072e-06, "loss": 0.0067, "step": 7223 }, { "epoch": 3.286624203821656, "grad_norm": 0.28093485046800604, "learning_rate": 2.6293981527686018e-06, "loss": 0.0036, "step": 7224 }, { "epoch": 3.287079162875341, "grad_norm": 0.14654869426966358, "learning_rate": 2.6281398116170736e-06, "loss": 0.0014, "step": 7225 }, { "epoch": 3.2875341219290264, "grad_norm": 0.2526362478616213, "learning_rate": 2.626881664283247e-06, "loss": 0.0032, "step": 7226 }, { "epoch": 3.2879890809827117, "grad_norm": 0.11656170348832014, "learning_rate": 2.625623710869934e-06, "loss": 0.0008, "step": 7227 }, { "epoch": 3.2884440400363966, "grad_norm": 0.419202347891847, "learning_rate": 2.62436595147993e-06, "loss": 0.0044, "step": 7228 }, { "epoch": 3.288898999090082, "grad_norm": 0.36359780227547744, "learning_rate": 2.6231083862160134e-06, "loss": 0.0068, "step": 7229 }, { "epoch": 3.289353958143767, "grad_norm": 0.12545000328430567, "learning_rate": 2.621851015180945e-06, "loss": 0.0011, "step": 7230 }, { "epoch": 3.289808917197452, "grad_norm": 0.3613893940093683, "learning_rate": 2.6205938384774698e-06, "loss": 0.0122, "step": 7231 }, { "epoch": 3.2902638762511374, "grad_norm": 0.20055896136181164, "learning_rate": 2.6193368562083226e-06, "loss": 0.0022, "step": 7232 }, { "epoch": 3.2907188353048227, "grad_norm": 0.25846498960539516, "learning_rate": 2.618080068476217e-06, "loss": 0.0044, "step": 7233 }, { "epoch": 3.2911737943585075, "grad_norm": 0.32379062949367765, "learning_rate": 2.6168234753838497e-06, "loss": 0.0038, "step": 7234 }, { "epoch": 3.291628753412193, "grad_norm": 0.20795370844078756, "learning_rate": 2.615567077033907e-06, "loss": 0.0009, "step": 7235 }, { "epoch": 3.292083712465878, "grad_norm": 0.04601790713926328, "learning_rate": 2.6143108735290534e-06, "loss": 0.0003, "step": 7236 }, { "epoch": 3.292538671519563, "grad_norm": 0.3164377804861328, "learning_rate": 2.6130548649719434e-06, "loss": 0.0069, "step": 7237 }, { "epoch": 3.2929936305732483, "grad_norm": 0.24218745571621716, "learning_rate": 2.61179905146521e-06, "loss": 0.0026, "step": 7238 }, { "epoch": 3.2934485896269337, "grad_norm": 0.2755404432763007, "learning_rate": 2.610543433111473e-06, "loss": 0.0044, "step": 7239 }, { "epoch": 3.2939035486806185, "grad_norm": 0.14076124438836563, "learning_rate": 2.609288010013335e-06, "loss": 0.002, "step": 7240 }, { "epoch": 3.294358507734304, "grad_norm": 0.09307802009705414, "learning_rate": 2.6080327822733837e-06, "loss": 0.0005, "step": 7241 }, { "epoch": 3.294813466787989, "grad_norm": 0.09960470798181198, "learning_rate": 2.6067777499941937e-06, "loss": 0.0008, "step": 7242 }, { "epoch": 3.295268425841674, "grad_norm": 0.17779239899983187, "learning_rate": 2.6055229132783175e-06, "loss": 0.003, "step": 7243 }, { "epoch": 3.2957233848953593, "grad_norm": 0.239092146999513, "learning_rate": 2.6042682722282964e-06, "loss": 0.0059, "step": 7244 }, { "epoch": 3.2961783439490446, "grad_norm": 0.26744604832466934, "learning_rate": 2.603013826946651e-06, "loss": 0.003, "step": 7245 }, { "epoch": 3.29663330300273, "grad_norm": 0.12128936745238342, "learning_rate": 2.6017595775358928e-06, "loss": 0.0011, "step": 7246 }, { "epoch": 3.297088262056415, "grad_norm": 0.20695500254881521, "learning_rate": 2.6005055240985113e-06, "loss": 0.0057, "step": 7247 }, { "epoch": 3.2975432211101, "grad_norm": 0.1419702376477733, "learning_rate": 2.5992516667369805e-06, "loss": 0.0025, "step": 7248 }, { "epoch": 3.2979981801637854, "grad_norm": 0.05793491091204328, "learning_rate": 2.597998005553764e-06, "loss": 0.0005, "step": 7249 }, { "epoch": 3.2984531392174703, "grad_norm": 0.08289586536331124, "learning_rate": 2.5967445406513013e-06, "loss": 0.0006, "step": 7250 }, { "epoch": 3.2989080982711556, "grad_norm": 0.2714757900769244, "learning_rate": 2.5954912721320237e-06, "loss": 0.0028, "step": 7251 }, { "epoch": 3.299363057324841, "grad_norm": 0.12976936546940726, "learning_rate": 2.5942382000983403e-06, "loss": 0.0019, "step": 7252 }, { "epoch": 3.299818016378526, "grad_norm": 0.3248448443965714, "learning_rate": 2.5929853246526466e-06, "loss": 0.005, "step": 7253 }, { "epoch": 3.300272975432211, "grad_norm": 0.17074397189179708, "learning_rate": 2.5917326458973224e-06, "loss": 0.0019, "step": 7254 }, { "epoch": 3.3007279344858964, "grad_norm": 0.3065231414289718, "learning_rate": 2.5904801639347273e-06, "loss": 0.0056, "step": 7255 }, { "epoch": 3.3011828935395813, "grad_norm": 0.20273530408422263, "learning_rate": 2.5892278788672154e-06, "loss": 0.002, "step": 7256 }, { "epoch": 3.3016378525932666, "grad_norm": 0.38598289013176906, "learning_rate": 2.5879757907971144e-06, "loss": 0.0066, "step": 7257 }, { "epoch": 3.302092811646952, "grad_norm": 0.136017382148316, "learning_rate": 2.5867238998267386e-06, "loss": 0.0017, "step": 7258 }, { "epoch": 3.3025477707006368, "grad_norm": 0.05194067573736035, "learning_rate": 2.585472206058388e-06, "loss": 0.0005, "step": 7259 }, { "epoch": 3.303002729754322, "grad_norm": 0.23793721769862095, "learning_rate": 2.584220709594343e-06, "loss": 0.0041, "step": 7260 }, { "epoch": 3.3034576888080074, "grad_norm": 0.24718595782286687, "learning_rate": 2.582969410536874e-06, "loss": 0.0026, "step": 7261 }, { "epoch": 3.3039126478616927, "grad_norm": 0.2704220442702209, "learning_rate": 2.5817183089882275e-06, "loss": 0.0052, "step": 7262 }, { "epoch": 3.3043676069153776, "grad_norm": 0.19002404344345794, "learning_rate": 2.580467405050642e-06, "loss": 0.0043, "step": 7263 }, { "epoch": 3.304822565969063, "grad_norm": 0.0949216897112524, "learning_rate": 2.5792166988263336e-06, "loss": 0.0007, "step": 7264 }, { "epoch": 3.305277525022748, "grad_norm": 0.7628543889003218, "learning_rate": 2.5779661904175022e-06, "loss": 0.0011, "step": 7265 }, { "epoch": 3.305732484076433, "grad_norm": 0.23094332899454187, "learning_rate": 2.576715879926338e-06, "loss": 0.0027, "step": 7266 }, { "epoch": 3.3061874431301184, "grad_norm": 0.3845595692052192, "learning_rate": 2.575465767455009e-06, "loss": 0.002, "step": 7267 }, { "epoch": 3.3066424021838037, "grad_norm": 0.20459573434649453, "learning_rate": 2.574215853105667e-06, "loss": 0.0017, "step": 7268 }, { "epoch": 3.3070973612374885, "grad_norm": 0.3196180698891119, "learning_rate": 2.5729661369804505e-06, "loss": 0.0045, "step": 7269 }, { "epoch": 3.307552320291174, "grad_norm": 0.2073632429701408, "learning_rate": 2.5717166191814803e-06, "loss": 0.0021, "step": 7270 }, { "epoch": 3.308007279344859, "grad_norm": 0.5950699211150948, "learning_rate": 2.5704672998108636e-06, "loss": 0.0132, "step": 7271 }, { "epoch": 3.308462238398544, "grad_norm": 0.2997257838226498, "learning_rate": 2.569218178970688e-06, "loss": 0.0027, "step": 7272 }, { "epoch": 3.3089171974522293, "grad_norm": 0.2924224676762152, "learning_rate": 2.5679692567630247e-06, "loss": 0.0041, "step": 7273 }, { "epoch": 3.3093721565059147, "grad_norm": 0.518688078078415, "learning_rate": 2.5667205332899294e-06, "loss": 0.005, "step": 7274 }, { "epoch": 3.3098271155595995, "grad_norm": 0.13013443653274556, "learning_rate": 2.5654720086534456e-06, "loss": 0.0013, "step": 7275 }, { "epoch": 3.310282074613285, "grad_norm": 0.37097330863903055, "learning_rate": 2.5642236829555926e-06, "loss": 0.0064, "step": 7276 }, { "epoch": 3.31073703366697, "grad_norm": 0.4071420739095773, "learning_rate": 2.5629755562983827e-06, "loss": 0.0088, "step": 7277 }, { "epoch": 3.311191992720655, "grad_norm": 0.3013908187952351, "learning_rate": 2.5617276287838043e-06, "loss": 0.0041, "step": 7278 }, { "epoch": 3.3116469517743403, "grad_norm": 0.13989079123637085, "learning_rate": 2.560479900513832e-06, "loss": 0.0022, "step": 7279 }, { "epoch": 3.3121019108280256, "grad_norm": 0.14256871449962055, "learning_rate": 2.5592323715904266e-06, "loss": 0.0029, "step": 7280 }, { "epoch": 3.3125568698817105, "grad_norm": 0.12064162971566546, "learning_rate": 2.5579850421155294e-06, "loss": 0.0022, "step": 7281 }, { "epoch": 3.313011828935396, "grad_norm": 0.14792415961492336, "learning_rate": 2.5567379121910672e-06, "loss": 0.0017, "step": 7282 }, { "epoch": 3.313466787989081, "grad_norm": 0.1547308310214879, "learning_rate": 2.5554909819189466e-06, "loss": 0.0011, "step": 7283 }, { "epoch": 3.313921747042766, "grad_norm": 0.2690986210507297, "learning_rate": 2.5542442514010635e-06, "loss": 0.0035, "step": 7284 }, { "epoch": 3.3143767060964513, "grad_norm": 0.12918828812883756, "learning_rate": 2.5529977207392977e-06, "loss": 0.0019, "step": 7285 }, { "epoch": 3.3148316651501366, "grad_norm": 0.5004581771599083, "learning_rate": 2.551751390035507e-06, "loss": 0.009, "step": 7286 }, { "epoch": 3.3152866242038215, "grad_norm": 0.2188150802674912, "learning_rate": 2.550505259391537e-06, "loss": 0.0042, "step": 7287 }, { "epoch": 3.315741583257507, "grad_norm": 0.32237899611928544, "learning_rate": 2.549259328909214e-06, "loss": 0.0038, "step": 7288 }, { "epoch": 3.316196542311192, "grad_norm": 0.17433601843193414, "learning_rate": 2.548013598690352e-06, "loss": 0.0022, "step": 7289 }, { "epoch": 3.316651501364877, "grad_norm": 0.32272099671720045, "learning_rate": 2.5467680688367437e-06, "loss": 0.0045, "step": 7290 }, { "epoch": 3.3171064604185623, "grad_norm": 0.2581902236623843, "learning_rate": 2.5455227394501726e-06, "loss": 0.0069, "step": 7291 }, { "epoch": 3.3175614194722476, "grad_norm": 0.35864153150367284, "learning_rate": 2.5442776106323984e-06, "loss": 0.0041, "step": 7292 }, { "epoch": 3.3180163785259325, "grad_norm": 0.14772981343671274, "learning_rate": 2.5430326824851693e-06, "loss": 0.0009, "step": 7293 }, { "epoch": 3.3184713375796178, "grad_norm": 0.13298019476464154, "learning_rate": 2.5417879551102104e-06, "loss": 0.001, "step": 7294 }, { "epoch": 3.318926296633303, "grad_norm": 0.34506515118859876, "learning_rate": 2.540543428609241e-06, "loss": 0.0035, "step": 7295 }, { "epoch": 3.319381255686988, "grad_norm": 0.13203322940229437, "learning_rate": 2.539299103083956e-06, "loss": 0.0012, "step": 7296 }, { "epoch": 3.3198362147406733, "grad_norm": 0.19859864419203488, "learning_rate": 2.5380549786360335e-06, "loss": 0.0012, "step": 7297 }, { "epoch": 3.3202911737943586, "grad_norm": 0.36402931140326145, "learning_rate": 2.5368110553671426e-06, "loss": 0.0093, "step": 7298 }, { "epoch": 3.3207461328480434, "grad_norm": 0.49718953879928546, "learning_rate": 2.5355673333789264e-06, "loss": 0.0051, "step": 7299 }, { "epoch": 3.3212010919017287, "grad_norm": 0.20120116580885394, "learning_rate": 2.53432381277302e-06, "loss": 0.0038, "step": 7300 }, { "epoch": 3.321656050955414, "grad_norm": 0.20356369716200837, "learning_rate": 2.5330804936510374e-06, "loss": 0.0022, "step": 7301 }, { "epoch": 3.3221110100090994, "grad_norm": 0.15078844520382614, "learning_rate": 2.5318373761145757e-06, "loss": 0.0028, "step": 7302 }, { "epoch": 3.3225659690627842, "grad_norm": 0.2013139263014182, "learning_rate": 2.530594460265217e-06, "loss": 0.002, "step": 7303 }, { "epoch": 3.3230209281164695, "grad_norm": 0.23668974653933753, "learning_rate": 2.5293517462045254e-06, "loss": 0.0039, "step": 7304 }, { "epoch": 3.323475887170155, "grad_norm": 0.21387998420937504, "learning_rate": 2.528109234034054e-06, "loss": 0.0012, "step": 7305 }, { "epoch": 3.3239308462238397, "grad_norm": 0.1727488684922544, "learning_rate": 2.5268669238553337e-06, "loss": 0.0042, "step": 7306 }, { "epoch": 3.324385805277525, "grad_norm": 0.27516283501786926, "learning_rate": 2.52562481576988e-06, "loss": 0.0041, "step": 7307 }, { "epoch": 3.3248407643312103, "grad_norm": 0.08728974391034555, "learning_rate": 2.5243829098791894e-06, "loss": 0.001, "step": 7308 }, { "epoch": 3.325295723384895, "grad_norm": 0.36755171543684856, "learning_rate": 2.5231412062847502e-06, "loss": 0.0073, "step": 7309 }, { "epoch": 3.3257506824385805, "grad_norm": 0.2876866632856369, "learning_rate": 2.5218997050880262e-06, "loss": 0.004, "step": 7310 }, { "epoch": 3.326205641492266, "grad_norm": 0.11053008624322136, "learning_rate": 2.5206584063904648e-06, "loss": 0.0015, "step": 7311 }, { "epoch": 3.3266606005459507, "grad_norm": 0.20030627286537156, "learning_rate": 2.5194173102935044e-06, "loss": 0.0026, "step": 7312 }, { "epoch": 3.327115559599636, "grad_norm": 0.16950799475825842, "learning_rate": 2.5181764168985566e-06, "loss": 0.0033, "step": 7313 }, { "epoch": 3.3275705186533213, "grad_norm": 0.2842261936564266, "learning_rate": 2.516935726307027e-06, "loss": 0.0027, "step": 7314 }, { "epoch": 3.328025477707006, "grad_norm": 0.41526373932969185, "learning_rate": 2.515695238620296e-06, "loss": 0.0021, "step": 7315 }, { "epoch": 3.3284804367606915, "grad_norm": 0.21075976304945018, "learning_rate": 2.514454953939731e-06, "loss": 0.0034, "step": 7316 }, { "epoch": 3.328935395814377, "grad_norm": 0.07055641907169599, "learning_rate": 2.5132148723666807e-06, "loss": 0.0007, "step": 7317 }, { "epoch": 3.329390354868062, "grad_norm": 1.0278173050813473, "learning_rate": 2.5119749940024805e-06, "loss": 0.0095, "step": 7318 }, { "epoch": 3.329845313921747, "grad_norm": 0.1438870759025006, "learning_rate": 2.5107353189484503e-06, "loss": 0.0022, "step": 7319 }, { "epoch": 3.3303002729754323, "grad_norm": 0.33844938099288047, "learning_rate": 2.5094958473058883e-06, "loss": 0.0051, "step": 7320 }, { "epoch": 3.3307552320291176, "grad_norm": 0.3393158344571439, "learning_rate": 2.508256579176078e-06, "loss": 0.0068, "step": 7321 }, { "epoch": 3.3312101910828025, "grad_norm": 0.3775418266642367, "learning_rate": 2.5070175146602864e-06, "loss": 0.0082, "step": 7322 }, { "epoch": 3.331665150136488, "grad_norm": 0.3513833090670177, "learning_rate": 2.5057786538597674e-06, "loss": 0.0031, "step": 7323 }, { "epoch": 3.332120109190173, "grad_norm": 0.07797340683218945, "learning_rate": 2.504539996875752e-06, "loss": 0.0009, "step": 7324 }, { "epoch": 3.332575068243858, "grad_norm": 0.189112346004838, "learning_rate": 2.5033015438094577e-06, "loss": 0.0022, "step": 7325 }, { "epoch": 3.3330300272975433, "grad_norm": 0.26356992815314156, "learning_rate": 2.502063294762087e-06, "loss": 0.005, "step": 7326 }, { "epoch": 3.3334849863512286, "grad_norm": 0.10282269568175559, "learning_rate": 2.500825249834823e-06, "loss": 0.0009, "step": 7327 }, { "epoch": 3.3339399454049135, "grad_norm": 0.1778713733816966, "learning_rate": 2.4995874091288343e-06, "loss": 0.0023, "step": 7328 }, { "epoch": 3.3343949044585988, "grad_norm": 0.24928156816519922, "learning_rate": 2.4983497727452704e-06, "loss": 0.001, "step": 7329 }, { "epoch": 3.334849863512284, "grad_norm": 0.33541814228695904, "learning_rate": 2.4971123407852665e-06, "loss": 0.01, "step": 7330 }, { "epoch": 3.335304822565969, "grad_norm": 0.5295496967837838, "learning_rate": 2.495875113349939e-06, "loss": 0.0066, "step": 7331 }, { "epoch": 3.3357597816196543, "grad_norm": 0.28325256526663395, "learning_rate": 2.494638090540387e-06, "loss": 0.0061, "step": 7332 }, { "epoch": 3.3362147406733396, "grad_norm": 0.09104853173261164, "learning_rate": 2.493401272457695e-06, "loss": 0.001, "step": 7333 }, { "epoch": 3.3366696997270244, "grad_norm": 0.36272303356851776, "learning_rate": 2.492164659202934e-06, "loss": 0.0128, "step": 7334 }, { "epoch": 3.3371246587807097, "grad_norm": 0.35061986939040046, "learning_rate": 2.490928250877152e-06, "loss": 0.0111, "step": 7335 }, { "epoch": 3.337579617834395, "grad_norm": 0.28032885436245897, "learning_rate": 2.4896920475813824e-06, "loss": 0.0047, "step": 7336 }, { "epoch": 3.33803457688808, "grad_norm": 0.46575974164271244, "learning_rate": 2.48845604941664e-06, "loss": 0.0109, "step": 7337 }, { "epoch": 3.3384895359417652, "grad_norm": 0.44632320593972635, "learning_rate": 2.4872202564839293e-06, "loss": 0.0105, "step": 7338 }, { "epoch": 3.3389444949954505, "grad_norm": 0.29875495923786044, "learning_rate": 2.48598466888423e-06, "loss": 0.0046, "step": 7339 }, { "epoch": 3.3393994540491354, "grad_norm": 0.2570759464781434, "learning_rate": 2.4847492867185113e-06, "loss": 0.0028, "step": 7340 }, { "epoch": 3.3398544131028207, "grad_norm": 0.25893243182426584, "learning_rate": 2.483514110087723e-06, "loss": 0.0074, "step": 7341 }, { "epoch": 3.340309372156506, "grad_norm": 0.26430627311799326, "learning_rate": 2.482279139092795e-06, "loss": 0.0016, "step": 7342 }, { "epoch": 3.340764331210191, "grad_norm": 0.1980482553148076, "learning_rate": 2.481044373834648e-06, "loss": 0.0023, "step": 7343 }, { "epoch": 3.341219290263876, "grad_norm": 0.364216865329497, "learning_rate": 2.479809814414179e-06, "loss": 0.0066, "step": 7344 }, { "epoch": 3.3416742493175615, "grad_norm": 0.2557590655299494, "learning_rate": 2.4785754609322714e-06, "loss": 0.0059, "step": 7345 }, { "epoch": 3.3421292083712464, "grad_norm": 0.1370077657388034, "learning_rate": 2.477341313489788e-06, "loss": 0.0017, "step": 7346 }, { "epoch": 3.3425841674249317, "grad_norm": 0.06275266553004565, "learning_rate": 2.4761073721875805e-06, "loss": 0.0005, "step": 7347 }, { "epoch": 3.343039126478617, "grad_norm": 0.10547302869649244, "learning_rate": 2.4748736371264825e-06, "loss": 0.0007, "step": 7348 }, { "epoch": 3.343494085532302, "grad_norm": 0.2881291699184336, "learning_rate": 2.4736401084073074e-06, "loss": 0.0052, "step": 7349 }, { "epoch": 3.343949044585987, "grad_norm": 0.23454407724560167, "learning_rate": 2.4724067861308544e-06, "loss": 0.0032, "step": 7350 }, { "epoch": 3.3444040036396725, "grad_norm": 0.16237935164582346, "learning_rate": 2.4711736703979015e-06, "loss": 0.0012, "step": 7351 }, { "epoch": 3.3448589626933574, "grad_norm": 0.17893016342068024, "learning_rate": 2.4699407613092185e-06, "loss": 0.0014, "step": 7352 }, { "epoch": 3.3453139217470427, "grad_norm": 0.28489482923608267, "learning_rate": 2.4687080589655494e-06, "loss": 0.003, "step": 7353 }, { "epoch": 3.345768880800728, "grad_norm": 0.40093990448855094, "learning_rate": 2.467475563467628e-06, "loss": 0.009, "step": 7354 }, { "epoch": 3.3462238398544133, "grad_norm": 0.1603316142781402, "learning_rate": 2.4662432749161664e-06, "loss": 0.0028, "step": 7355 }, { "epoch": 3.346678798908098, "grad_norm": 0.35500619515811727, "learning_rate": 2.4650111934118604e-06, "loss": 0.0032, "step": 7356 }, { "epoch": 3.3471337579617835, "grad_norm": 0.04077092516052504, "learning_rate": 2.463779319055394e-06, "loss": 0.0004, "step": 7357 }, { "epoch": 3.347588717015469, "grad_norm": 0.2930415679876122, "learning_rate": 2.462547651947428e-06, "loss": 0.0049, "step": 7358 }, { "epoch": 3.3480436760691537, "grad_norm": 0.14966112595626418, "learning_rate": 2.4613161921886087e-06, "loss": 0.0016, "step": 7359 }, { "epoch": 3.348498635122839, "grad_norm": 0.09521370717364151, "learning_rate": 2.4600849398795633e-06, "loss": 0.0013, "step": 7360 }, { "epoch": 3.3489535941765243, "grad_norm": 0.26831455339138277, "learning_rate": 2.458853895120907e-06, "loss": 0.004, "step": 7361 }, { "epoch": 3.349408553230209, "grad_norm": 0.3522142055830625, "learning_rate": 2.4576230580132366e-06, "loss": 0.0031, "step": 7362 }, { "epoch": 3.3498635122838945, "grad_norm": 0.3136183709621097, "learning_rate": 2.4563924286571285e-06, "loss": 0.004, "step": 7363 }, { "epoch": 3.3503184713375798, "grad_norm": 0.38469428663494953, "learning_rate": 2.455162007153144e-06, "loss": 0.0053, "step": 7364 }, { "epoch": 3.3507734303912646, "grad_norm": 0.2608359331901631, "learning_rate": 2.4539317936018287e-06, "loss": 0.0043, "step": 7365 }, { "epoch": 3.35122838944495, "grad_norm": 0.28166058044681713, "learning_rate": 2.452701788103707e-06, "loss": 0.0019, "step": 7366 }, { "epoch": 3.3516833484986353, "grad_norm": 0.2311103793427457, "learning_rate": 2.451471990759291e-06, "loss": 0.0024, "step": 7367 }, { "epoch": 3.35213830755232, "grad_norm": 0.2940347344513648, "learning_rate": 2.4502424016690775e-06, "loss": 0.0049, "step": 7368 }, { "epoch": 3.3525932666060054, "grad_norm": 0.4669024471263398, "learning_rate": 2.44901302093354e-06, "loss": 0.0095, "step": 7369 }, { "epoch": 3.3530482256596907, "grad_norm": 0.0961448192715872, "learning_rate": 2.4477838486531386e-06, "loss": 0.0023, "step": 7370 }, { "epoch": 3.3535031847133756, "grad_norm": 0.138231832817281, "learning_rate": 2.446554884928313e-06, "loss": 0.0007, "step": 7371 }, { "epoch": 3.353958143767061, "grad_norm": 0.4425843182085254, "learning_rate": 2.445326129859493e-06, "loss": 0.0042, "step": 7372 }, { "epoch": 3.3544131028207462, "grad_norm": 0.13892725791334185, "learning_rate": 2.4440975835470853e-06, "loss": 0.0018, "step": 7373 }, { "epoch": 3.3548680618744315, "grad_norm": 0.2050029208884926, "learning_rate": 2.4428692460914783e-06, "loss": 0.0034, "step": 7374 }, { "epoch": 3.3553230209281164, "grad_norm": 0.5872584964516175, "learning_rate": 2.441641117593051e-06, "loss": 0.0157, "step": 7375 }, { "epoch": 3.3557779799818017, "grad_norm": 0.2075077060962585, "learning_rate": 2.440413198152156e-06, "loss": 0.0025, "step": 7376 }, { "epoch": 3.356232939035487, "grad_norm": 0.21060675410282897, "learning_rate": 2.4391854878691374e-06, "loss": 0.0021, "step": 7377 }, { "epoch": 3.356687898089172, "grad_norm": 0.18972519948504624, "learning_rate": 2.437957986844316e-06, "loss": 0.0016, "step": 7378 }, { "epoch": 3.357142857142857, "grad_norm": 0.5664575246739697, "learning_rate": 2.436730695177998e-06, "loss": 0.0037, "step": 7379 }, { "epoch": 3.3575978161965425, "grad_norm": 0.35747484587848555, "learning_rate": 2.43550361297047e-06, "loss": 0.0081, "step": 7380 }, { "epoch": 3.3580527752502274, "grad_norm": 0.2616627947472333, "learning_rate": 2.434276740322005e-06, "loss": 0.0094, "step": 7381 }, { "epoch": 3.3585077343039127, "grad_norm": 0.2449991978069107, "learning_rate": 2.4330500773328608e-06, "loss": 0.0041, "step": 7382 }, { "epoch": 3.358962693357598, "grad_norm": 0.3051261543090273, "learning_rate": 2.4318236241032723e-06, "loss": 0.0027, "step": 7383 }, { "epoch": 3.359417652411283, "grad_norm": 0.1995524845485516, "learning_rate": 2.430597380733459e-06, "loss": 0.0055, "step": 7384 }, { "epoch": 3.359872611464968, "grad_norm": 0.5443547108548171, "learning_rate": 2.429371347323622e-06, "loss": 0.0051, "step": 7385 }, { "epoch": 3.3603275705186535, "grad_norm": 0.21093484627895018, "learning_rate": 2.428145523973952e-06, "loss": 0.0023, "step": 7386 }, { "epoch": 3.3607825295723384, "grad_norm": 0.13761078709243324, "learning_rate": 2.426919910784615e-06, "loss": 0.0014, "step": 7387 }, { "epoch": 3.3612374886260237, "grad_norm": 0.49437877181380013, "learning_rate": 2.425694507855762e-06, "loss": 0.0123, "step": 7388 }, { "epoch": 3.361692447679709, "grad_norm": 0.5302513007493147, "learning_rate": 2.42446931528753e-06, "loss": 0.004, "step": 7389 }, { "epoch": 3.362147406733394, "grad_norm": 0.11245617563980523, "learning_rate": 2.423244333180032e-06, "loss": 0.0009, "step": 7390 }, { "epoch": 3.362602365787079, "grad_norm": 0.08739279737890837, "learning_rate": 2.422019561633373e-06, "loss": 0.0016, "step": 7391 }, { "epoch": 3.3630573248407645, "grad_norm": 0.3547527842961154, "learning_rate": 2.4207950007476335e-06, "loss": 0.0057, "step": 7392 }, { "epoch": 3.3635122838944493, "grad_norm": 0.33975658421196914, "learning_rate": 2.4195706506228785e-06, "loss": 0.0034, "step": 7393 }, { "epoch": 3.3639672429481347, "grad_norm": 0.3250004927194024, "learning_rate": 2.4183465113591547e-06, "loss": 0.0038, "step": 7394 }, { "epoch": 3.36442220200182, "grad_norm": 0.28608833459689476, "learning_rate": 2.417122583056496e-06, "loss": 0.0043, "step": 7395 }, { "epoch": 3.364877161055505, "grad_norm": 0.2405906832521087, "learning_rate": 2.4158988658149173e-06, "loss": 0.0043, "step": 7396 }, { "epoch": 3.36533212010919, "grad_norm": 0.4235606741284786, "learning_rate": 2.4146753597344136e-06, "loss": 0.006, "step": 7397 }, { "epoch": 3.3657870791628755, "grad_norm": 0.09472292713394488, "learning_rate": 2.4134520649149646e-06, "loss": 0.0009, "step": 7398 }, { "epoch": 3.3662420382165603, "grad_norm": 0.1446411078510499, "learning_rate": 2.4122289814565312e-06, "loss": 0.0021, "step": 7399 }, { "epoch": 3.3666969972702456, "grad_norm": 0.15640355506080575, "learning_rate": 2.4110061094590583e-06, "loss": 0.0024, "step": 7400 }, { "epoch": 3.367151956323931, "grad_norm": 0.2408129485245961, "learning_rate": 2.4097834490224754e-06, "loss": 0.0056, "step": 7401 }, { "epoch": 3.367606915377616, "grad_norm": 0.335113110031722, "learning_rate": 2.4085610002466904e-06, "loss": 0.0086, "step": 7402 }, { "epoch": 3.368061874431301, "grad_norm": 0.08420324284226373, "learning_rate": 2.407338763231599e-06, "loss": 0.0013, "step": 7403 }, { "epoch": 3.3685168334849864, "grad_norm": 0.1396450007182465, "learning_rate": 2.4061167380770763e-06, "loss": 0.0011, "step": 7404 }, { "epoch": 3.3689717925386713, "grad_norm": 0.16999508428944177, "learning_rate": 2.404894924882977e-06, "loss": 0.0018, "step": 7405 }, { "epoch": 3.3694267515923566, "grad_norm": 0.3526039573547545, "learning_rate": 2.4036733237491476e-06, "loss": 0.0073, "step": 7406 }, { "epoch": 3.369881710646042, "grad_norm": 0.23492435210891607, "learning_rate": 2.4024519347754093e-06, "loss": 0.0026, "step": 7407 }, { "epoch": 3.370336669699727, "grad_norm": 0.15506720423048814, "learning_rate": 2.4012307580615687e-06, "loss": 0.0025, "step": 7408 }, { "epoch": 3.370791628753412, "grad_norm": 0.3132982903346134, "learning_rate": 2.400009793707412e-06, "loss": 0.0026, "step": 7409 }, { "epoch": 3.3712465878070974, "grad_norm": 0.5417676611954173, "learning_rate": 2.3987890418127135e-06, "loss": 0.0067, "step": 7410 }, { "epoch": 3.3717015468607827, "grad_norm": 0.11962600915485652, "learning_rate": 2.39756850247723e-06, "loss": 0.0011, "step": 7411 }, { "epoch": 3.3721565059144676, "grad_norm": 0.29016847422403347, "learning_rate": 2.3963481758006958e-06, "loss": 0.0047, "step": 7412 }, { "epoch": 3.372611464968153, "grad_norm": 0.06873271751268913, "learning_rate": 2.3951280618828305e-06, "loss": 0.0006, "step": 7413 }, { "epoch": 3.373066424021838, "grad_norm": 0.1891045428343446, "learning_rate": 2.393908160823335e-06, "loss": 0.0022, "step": 7414 }, { "epoch": 3.373521383075523, "grad_norm": 0.2432783585012321, "learning_rate": 2.3926884727218975e-06, "loss": 0.0021, "step": 7415 }, { "epoch": 3.3739763421292084, "grad_norm": 0.34727501362713886, "learning_rate": 2.3914689976781807e-06, "loss": 0.0069, "step": 7416 }, { "epoch": 3.3744313011828937, "grad_norm": 0.28857604910638013, "learning_rate": 2.3902497357918404e-06, "loss": 0.0027, "step": 7417 }, { "epoch": 3.3748862602365786, "grad_norm": 0.24525753408560597, "learning_rate": 2.3890306871625058e-06, "loss": 0.004, "step": 7418 }, { "epoch": 3.375341219290264, "grad_norm": 0.2027320126370087, "learning_rate": 2.3878118518897905e-06, "loss": 0.0055, "step": 7419 }, { "epoch": 3.375796178343949, "grad_norm": 0.20894508240817183, "learning_rate": 2.3865932300732954e-06, "loss": 0.0028, "step": 7420 }, { "epoch": 3.376251137397634, "grad_norm": 0.26394897429274966, "learning_rate": 2.3853748218126e-06, "loss": 0.0027, "step": 7421 }, { "epoch": 3.3767060964513194, "grad_norm": 0.2262767108381828, "learning_rate": 2.384156627207267e-06, "loss": 0.0019, "step": 7422 }, { "epoch": 3.3771610555050047, "grad_norm": 0.15183545450199315, "learning_rate": 2.3829386463568388e-06, "loss": 0.0017, "step": 7423 }, { "epoch": 3.3776160145586895, "grad_norm": 0.1648028546920776, "learning_rate": 2.3817208793608467e-06, "loss": 0.0016, "step": 7424 }, { "epoch": 3.378070973612375, "grad_norm": 0.1332240175041799, "learning_rate": 2.380503326318801e-06, "loss": 0.0014, "step": 7425 }, { "epoch": 3.37852593266606, "grad_norm": 0.05527216943138619, "learning_rate": 2.379285987330195e-06, "loss": 0.0006, "step": 7426 }, { "epoch": 3.3789808917197455, "grad_norm": 0.22076078562524457, "learning_rate": 2.3780688624945026e-06, "loss": 0.002, "step": 7427 }, { "epoch": 3.3794358507734303, "grad_norm": 0.22583059364757874, "learning_rate": 2.3768519519111804e-06, "loss": 0.007, "step": 7428 }, { "epoch": 3.3798908098271156, "grad_norm": 0.24204613519324628, "learning_rate": 2.3756352556796726e-06, "loss": 0.0026, "step": 7429 }, { "epoch": 3.380345768880801, "grad_norm": 0.19870045843796788, "learning_rate": 2.374418773899398e-06, "loss": 0.0037, "step": 7430 }, { "epoch": 3.380800727934486, "grad_norm": 0.1063779813856768, "learning_rate": 2.3732025066697667e-06, "loss": 0.001, "step": 7431 }, { "epoch": 3.381255686988171, "grad_norm": 0.26323668100938086, "learning_rate": 2.3719864540901634e-06, "loss": 0.0038, "step": 7432 }, { "epoch": 3.3817106460418564, "grad_norm": 0.24662734188438992, "learning_rate": 2.3707706162599573e-06, "loss": 0.0074, "step": 7433 }, { "epoch": 3.3821656050955413, "grad_norm": 0.27595005994695637, "learning_rate": 2.369554993278505e-06, "loss": 0.0033, "step": 7434 }, { "epoch": 3.3826205641492266, "grad_norm": 0.127101063642665, "learning_rate": 2.3683395852451396e-06, "loss": 0.0007, "step": 7435 }, { "epoch": 3.383075523202912, "grad_norm": 0.1374146327909923, "learning_rate": 2.367124392259179e-06, "loss": 0.0008, "step": 7436 }, { "epoch": 3.383530482256597, "grad_norm": 0.13515885020106613, "learning_rate": 2.3659094144199214e-06, "loss": 0.0018, "step": 7437 }, { "epoch": 3.383985441310282, "grad_norm": 0.40005077561699465, "learning_rate": 2.3646946518266522e-06, "loss": 0.0046, "step": 7438 }, { "epoch": 3.3844404003639674, "grad_norm": 0.11524035274942056, "learning_rate": 2.3634801045786338e-06, "loss": 0.0015, "step": 7439 }, { "epoch": 3.3848953594176523, "grad_norm": 0.1960265772957544, "learning_rate": 2.362265772775116e-06, "loss": 0.0017, "step": 7440 }, { "epoch": 3.3853503184713376, "grad_norm": 0.2299423310415387, "learning_rate": 2.361051656515328e-06, "loss": 0.002, "step": 7441 }, { "epoch": 3.385805277525023, "grad_norm": 0.22255715423427594, "learning_rate": 2.3598377558984814e-06, "loss": 0.0029, "step": 7442 }, { "epoch": 3.386260236578708, "grad_norm": 0.430756961990995, "learning_rate": 2.3586240710237685e-06, "loss": 0.011, "step": 7443 }, { "epoch": 3.386715195632393, "grad_norm": 0.03173160400022317, "learning_rate": 2.3574106019903673e-06, "loss": 0.0003, "step": 7444 }, { "epoch": 3.3871701546860784, "grad_norm": 0.2615814142769935, "learning_rate": 2.356197348897441e-06, "loss": 0.0068, "step": 7445 }, { "epoch": 3.3876251137397633, "grad_norm": 0.3088409936110604, "learning_rate": 2.3549843118441275e-06, "loss": 0.0057, "step": 7446 }, { "epoch": 3.3880800727934486, "grad_norm": 0.17585992559994212, "learning_rate": 2.3537714909295513e-06, "loss": 0.0026, "step": 7447 }, { "epoch": 3.388535031847134, "grad_norm": 0.30041111394092374, "learning_rate": 2.3525588862528163e-06, "loss": 0.0031, "step": 7448 }, { "epoch": 3.3889899909008188, "grad_norm": 0.6346439900667901, "learning_rate": 2.351346497913016e-06, "loss": 0.0198, "step": 7449 }, { "epoch": 3.389444949954504, "grad_norm": 0.4954806140291429, "learning_rate": 2.350134326009218e-06, "loss": 0.0033, "step": 7450 }, { "epoch": 3.3898999090081894, "grad_norm": 0.3460371370826556, "learning_rate": 2.348922370640475e-06, "loss": 0.0059, "step": 7451 }, { "epoch": 3.3903548680618742, "grad_norm": 0.41469027172683753, "learning_rate": 2.347710631905825e-06, "loss": 0.0083, "step": 7452 }, { "epoch": 3.3908098271155596, "grad_norm": 0.29431864933926416, "learning_rate": 2.3464991099042826e-06, "loss": 0.0043, "step": 7453 }, { "epoch": 3.391264786169245, "grad_norm": 0.21397868198234682, "learning_rate": 2.345287804734852e-06, "loss": 0.0029, "step": 7454 }, { "epoch": 3.3917197452229297, "grad_norm": 0.21561317801769855, "learning_rate": 2.3440767164965137e-06, "loss": 0.0028, "step": 7455 }, { "epoch": 3.392174704276615, "grad_norm": 0.3482130736958066, "learning_rate": 2.342865845288232e-06, "loss": 0.0037, "step": 7456 }, { "epoch": 3.3926296633303004, "grad_norm": 0.24450700003734802, "learning_rate": 2.3416551912089513e-06, "loss": 0.0042, "step": 7457 }, { "epoch": 3.3930846223839852, "grad_norm": 0.13762153687268255, "learning_rate": 2.340444754357604e-06, "loss": 0.0017, "step": 7458 }, { "epoch": 3.3935395814376705, "grad_norm": 0.44899251478874885, "learning_rate": 2.339234534833103e-06, "loss": 0.0051, "step": 7459 }, { "epoch": 3.393994540491356, "grad_norm": 0.4440731458246905, "learning_rate": 2.33802453273434e-06, "loss": 0.0094, "step": 7460 }, { "epoch": 3.3944494995450407, "grad_norm": 0.32028308463595856, "learning_rate": 2.33681474816019e-06, "loss": 0.004, "step": 7461 }, { "epoch": 3.394904458598726, "grad_norm": 0.19976710451337512, "learning_rate": 2.3356051812095104e-06, "loss": 0.0078, "step": 7462 }, { "epoch": 3.3953594176524113, "grad_norm": 0.10010372295754415, "learning_rate": 2.334395831981145e-06, "loss": 0.0017, "step": 7463 }, { "epoch": 3.395814376706096, "grad_norm": 0.31711059250387985, "learning_rate": 2.3331867005739127e-06, "loss": 0.0023, "step": 7464 }, { "epoch": 3.3962693357597815, "grad_norm": 0.3296500083697068, "learning_rate": 2.3319777870866217e-06, "loss": 0.0042, "step": 7465 }, { "epoch": 3.396724294813467, "grad_norm": 0.31665319933384933, "learning_rate": 2.3307690916180575e-06, "loss": 0.0026, "step": 7466 }, { "epoch": 3.397179253867152, "grad_norm": 0.2826439077134469, "learning_rate": 2.329560614266987e-06, "loss": 0.0037, "step": 7467 }, { "epoch": 3.397634212920837, "grad_norm": 0.2984721192995415, "learning_rate": 2.328352355132165e-06, "loss": 0.0074, "step": 7468 }, { "epoch": 3.3980891719745223, "grad_norm": 0.1621392575248319, "learning_rate": 2.327144314312324e-06, "loss": 0.002, "step": 7469 }, { "epoch": 3.3985441310282076, "grad_norm": 0.2551895988159972, "learning_rate": 2.3259364919061795e-06, "loss": 0.0049, "step": 7470 }, { "epoch": 3.3989990900818925, "grad_norm": 0.2108534720772892, "learning_rate": 2.3247288880124265e-06, "loss": 0.0015, "step": 7471 }, { "epoch": 3.399454049135578, "grad_norm": 0.13524097984038214, "learning_rate": 2.323521502729747e-06, "loss": 0.0023, "step": 7472 }, { "epoch": 3.399909008189263, "grad_norm": 0.1704708922074107, "learning_rate": 2.322314336156806e-06, "loss": 0.002, "step": 7473 }, { "epoch": 3.400363967242948, "grad_norm": 0.1267406621929312, "learning_rate": 2.3211073883922447e-06, "loss": 0.0019, "step": 7474 }, { "epoch": 3.4008189262966333, "grad_norm": 0.3100928218013081, "learning_rate": 2.31990065953469e-06, "loss": 0.0046, "step": 7475 }, { "epoch": 3.4012738853503186, "grad_norm": 0.20746714353843193, "learning_rate": 2.31869414968275e-06, "loss": 0.0032, "step": 7476 }, { "epoch": 3.4017288444040035, "grad_norm": 0.11569388270920063, "learning_rate": 2.3174878589350135e-06, "loss": 0.0015, "step": 7477 }, { "epoch": 3.402183803457689, "grad_norm": 0.39858592777381013, "learning_rate": 2.3162817873900556e-06, "loss": 0.0046, "step": 7478 }, { "epoch": 3.402638762511374, "grad_norm": 0.2415748693095528, "learning_rate": 2.315075935146432e-06, "loss": 0.0037, "step": 7479 }, { "epoch": 3.403093721565059, "grad_norm": 0.16769446953265724, "learning_rate": 2.3138703023026775e-06, "loss": 0.002, "step": 7480 }, { "epoch": 3.4035486806187443, "grad_norm": 0.19158668731085754, "learning_rate": 2.3126648889573124e-06, "loss": 0.002, "step": 7481 }, { "epoch": 3.4040036396724296, "grad_norm": 0.1710038402115662, "learning_rate": 2.311459695208834e-06, "loss": 0.0041, "step": 7482 }, { "epoch": 3.404458598726115, "grad_norm": 0.12239022636371179, "learning_rate": 2.31025472115573e-06, "loss": 0.0012, "step": 7483 }, { "epoch": 3.4049135577797998, "grad_norm": 0.20696613435734043, "learning_rate": 2.3090499668964637e-06, "loss": 0.0035, "step": 7484 }, { "epoch": 3.405368516833485, "grad_norm": 0.5902100957614066, "learning_rate": 2.3078454325294797e-06, "loss": 0.0055, "step": 7485 }, { "epoch": 3.4058234758871704, "grad_norm": 0.2742027427827176, "learning_rate": 2.3066411181532113e-06, "loss": 0.004, "step": 7486 }, { "epoch": 3.4062784349408552, "grad_norm": 0.14160244714288026, "learning_rate": 2.3054370238660655e-06, "loss": 0.0028, "step": 7487 }, { "epoch": 3.4067333939945406, "grad_norm": 0.10936957664965745, "learning_rate": 2.30423314976644e-06, "loss": 0.001, "step": 7488 }, { "epoch": 3.407188353048226, "grad_norm": 0.27197434499031803, "learning_rate": 2.3030294959527073e-06, "loss": 0.0053, "step": 7489 }, { "epoch": 3.4076433121019107, "grad_norm": 0.31420550405197456, "learning_rate": 2.3018260625232246e-06, "loss": 0.0088, "step": 7490 }, { "epoch": 3.408098271155596, "grad_norm": 0.35484494683380813, "learning_rate": 2.3006228495763295e-06, "loss": 0.0057, "step": 7491 }, { "epoch": 3.4085532302092814, "grad_norm": 0.29627024364808935, "learning_rate": 2.299419857210345e-06, "loss": 0.0025, "step": 7492 }, { "epoch": 3.4090081892629662, "grad_norm": 0.3488972186549872, "learning_rate": 2.298217085523576e-06, "loss": 0.0076, "step": 7493 }, { "epoch": 3.4094631483166515, "grad_norm": 0.2041676568134566, "learning_rate": 2.2970145346143045e-06, "loss": 0.0022, "step": 7494 }, { "epoch": 3.409918107370337, "grad_norm": 0.3644252519731684, "learning_rate": 2.2958122045808002e-06, "loss": 0.0073, "step": 7495 }, { "epoch": 3.4103730664240217, "grad_norm": 0.17796643268697715, "learning_rate": 2.294610095521308e-06, "loss": 0.0012, "step": 7496 }, { "epoch": 3.410828025477707, "grad_norm": 0.26542944580127786, "learning_rate": 2.293408207534063e-06, "loss": 0.0015, "step": 7497 }, { "epoch": 3.4112829845313923, "grad_norm": 0.02401730874842608, "learning_rate": 2.2922065407172767e-06, "loss": 0.0003, "step": 7498 }, { "epoch": 3.411737943585077, "grad_norm": 0.15404227228074097, "learning_rate": 2.2910050951691416e-06, "loss": 0.0029, "step": 7499 }, { "epoch": 3.4121929026387625, "grad_norm": 0.08787003145192555, "learning_rate": 2.2898038709878386e-06, "loss": 0.0006, "step": 7500 }, { "epoch": 3.412647861692448, "grad_norm": 0.2818577725151577, "learning_rate": 2.2886028682715217e-06, "loss": 0.0033, "step": 7501 }, { "epoch": 3.4131028207461327, "grad_norm": 0.18935098847517642, "learning_rate": 2.287402087118336e-06, "loss": 0.0024, "step": 7502 }, { "epoch": 3.413557779799818, "grad_norm": 0.13854282299343776, "learning_rate": 2.2862015276264016e-06, "loss": 0.0029, "step": 7503 }, { "epoch": 3.4140127388535033, "grad_norm": 0.3039917831277023, "learning_rate": 2.2850011898938236e-06, "loss": 0.0024, "step": 7504 }, { "epoch": 3.414467697907188, "grad_norm": 0.23879128032948413, "learning_rate": 2.283801074018685e-06, "loss": 0.0045, "step": 7505 }, { "epoch": 3.4149226569608735, "grad_norm": 0.40633003185693745, "learning_rate": 2.2826011800990567e-06, "loss": 0.0053, "step": 7506 }, { "epoch": 3.415377616014559, "grad_norm": 0.16206510040827052, "learning_rate": 2.28140150823299e-06, "loss": 0.0031, "step": 7507 }, { "epoch": 3.4158325750682437, "grad_norm": 0.14917788836790466, "learning_rate": 2.280202058518515e-06, "loss": 0.0022, "step": 7508 }, { "epoch": 3.416287534121929, "grad_norm": 0.3271005830438918, "learning_rate": 2.279002831053645e-06, "loss": 0.006, "step": 7509 }, { "epoch": 3.4167424931756143, "grad_norm": 0.07984297227321478, "learning_rate": 2.277803825936376e-06, "loss": 0.0005, "step": 7510 }, { "epoch": 3.417197452229299, "grad_norm": 0.06900928897055819, "learning_rate": 2.2766050432646835e-06, "loss": 0.0007, "step": 7511 }, { "epoch": 3.4176524112829845, "grad_norm": 0.24834426694486775, "learning_rate": 2.2754064831365296e-06, "loss": 0.0027, "step": 7512 }, { "epoch": 3.41810737033667, "grad_norm": 0.07036224443674961, "learning_rate": 2.2742081456498517e-06, "loss": 0.0007, "step": 7513 }, { "epoch": 3.4185623293903546, "grad_norm": 0.1131181170368926, "learning_rate": 2.2730100309025765e-06, "loss": 0.0009, "step": 7514 }, { "epoch": 3.41901728844404, "grad_norm": 0.2982112452094316, "learning_rate": 2.271812138992607e-06, "loss": 0.006, "step": 7515 }, { "epoch": 3.4194722474977253, "grad_norm": 0.2993485471057614, "learning_rate": 2.270614470017827e-06, "loss": 0.0043, "step": 7516 }, { "epoch": 3.41992720655141, "grad_norm": 0.37038025888018483, "learning_rate": 2.2694170240761086e-06, "loss": 0.0048, "step": 7517 }, { "epoch": 3.4203821656050954, "grad_norm": 0.23874094536173626, "learning_rate": 2.2682198012653e-06, "loss": 0.0026, "step": 7518 }, { "epoch": 3.4208371246587808, "grad_norm": 0.22512967728517105, "learning_rate": 2.2670228016832325e-06, "loss": 0.0017, "step": 7519 }, { "epoch": 3.421292083712466, "grad_norm": 0.28465327850834565, "learning_rate": 2.2658260254277176e-06, "loss": 0.0043, "step": 7520 }, { "epoch": 3.421747042766151, "grad_norm": 0.17985399793542411, "learning_rate": 2.2646294725965522e-06, "loss": 0.003, "step": 7521 }, { "epoch": 3.4222020018198362, "grad_norm": 0.11797323942667824, "learning_rate": 2.2634331432875163e-06, "loss": 0.0011, "step": 7522 }, { "epoch": 3.4226569608735216, "grad_norm": 0.04658003610474827, "learning_rate": 2.262237037598365e-06, "loss": 0.0005, "step": 7523 }, { "epoch": 3.4231119199272064, "grad_norm": 0.2269317789310602, "learning_rate": 2.261041155626839e-06, "loss": 0.0042, "step": 7524 }, { "epoch": 3.4235668789808917, "grad_norm": 0.20585873166901278, "learning_rate": 2.2598454974706595e-06, "loss": 0.0022, "step": 7525 }, { "epoch": 3.424021838034577, "grad_norm": 0.028660333040078968, "learning_rate": 2.2586500632275333e-06, "loss": 0.0002, "step": 7526 }, { "epoch": 3.424476797088262, "grad_norm": 0.17937247856404254, "learning_rate": 2.2574548529951423e-06, "loss": 0.0021, "step": 7527 }, { "epoch": 3.4249317561419472, "grad_norm": 0.36347026246348846, "learning_rate": 2.256259866871157e-06, "loss": 0.0088, "step": 7528 }, { "epoch": 3.4253867151956325, "grad_norm": 0.29857263800353295, "learning_rate": 2.2550651049532253e-06, "loss": 0.004, "step": 7529 }, { "epoch": 3.4258416742493174, "grad_norm": 0.2619645740528489, "learning_rate": 2.2538705673389747e-06, "loss": 0.0024, "step": 7530 }, { "epoch": 3.4262966333030027, "grad_norm": 0.37703136795917297, "learning_rate": 2.252676254126022e-06, "loss": 0.0024, "step": 7531 }, { "epoch": 3.426751592356688, "grad_norm": 0.09039222248055129, "learning_rate": 2.251482165411959e-06, "loss": 0.0005, "step": 7532 }, { "epoch": 3.427206551410373, "grad_norm": 0.27748057243558133, "learning_rate": 2.2502883012943614e-06, "loss": 0.0033, "step": 7533 }, { "epoch": 3.427661510464058, "grad_norm": 0.28617041874724686, "learning_rate": 2.2490946618707844e-06, "loss": 0.0039, "step": 7534 }, { "epoch": 3.4281164695177435, "grad_norm": 0.47224638433142235, "learning_rate": 2.2479012472387685e-06, "loss": 0.007, "step": 7535 }, { "epoch": 3.4285714285714284, "grad_norm": 0.0772165856397902, "learning_rate": 2.2467080574958365e-06, "loss": 0.0006, "step": 7536 }, { "epoch": 3.4290263876251137, "grad_norm": 0.3347847903742624, "learning_rate": 2.245515092739488e-06, "loss": 0.0125, "step": 7537 }, { "epoch": 3.429481346678799, "grad_norm": 0.48537822056152635, "learning_rate": 2.244322353067207e-06, "loss": 0.0038, "step": 7538 }, { "epoch": 3.4299363057324843, "grad_norm": 0.29186139958278556, "learning_rate": 2.2431298385764565e-06, "loss": 0.0041, "step": 7539 }, { "epoch": 3.430391264786169, "grad_norm": 0.2796719230790243, "learning_rate": 2.241937549364688e-06, "loss": 0.0032, "step": 7540 }, { "epoch": 3.4308462238398545, "grad_norm": 0.14853241686614696, "learning_rate": 2.240745485529326e-06, "loss": 0.002, "step": 7541 }, { "epoch": 3.43130118289354, "grad_norm": 0.23798983423878695, "learning_rate": 2.2395536471677835e-06, "loss": 0.0018, "step": 7542 }, { "epoch": 3.4317561419472247, "grad_norm": 0.20385346619196504, "learning_rate": 2.2383620343774516e-06, "loss": 0.0018, "step": 7543 }, { "epoch": 3.43221110100091, "grad_norm": 0.23529455413456232, "learning_rate": 2.2371706472557026e-06, "loss": 0.0013, "step": 7544 }, { "epoch": 3.4326660600545953, "grad_norm": 0.0993919378156682, "learning_rate": 2.2359794858998894e-06, "loss": 0.0012, "step": 7545 }, { "epoch": 3.43312101910828, "grad_norm": 0.17278011269367857, "learning_rate": 2.2347885504073525e-06, "loss": 0.0033, "step": 7546 }, { "epoch": 3.4335759781619655, "grad_norm": 0.292090133582803, "learning_rate": 2.233597840875407e-06, "loss": 0.0069, "step": 7547 }, { "epoch": 3.434030937215651, "grad_norm": 0.28677597066255733, "learning_rate": 2.232407357401352e-06, "loss": 0.0052, "step": 7548 }, { "epoch": 3.4344858962693356, "grad_norm": 0.16510230830411007, "learning_rate": 2.231217100082471e-06, "loss": 0.0024, "step": 7549 }, { "epoch": 3.434940855323021, "grad_norm": 0.21829917839424848, "learning_rate": 2.230027069016023e-06, "loss": 0.0037, "step": 7550 }, { "epoch": 3.4353958143767063, "grad_norm": 0.18653216734594502, "learning_rate": 2.2288372642992557e-06, "loss": 0.0018, "step": 7551 }, { "epoch": 3.435850773430391, "grad_norm": 0.3865186293895805, "learning_rate": 2.227647686029392e-06, "loss": 0.0043, "step": 7552 }, { "epoch": 3.4363057324840764, "grad_norm": 0.21625871006312322, "learning_rate": 2.2264583343036406e-06, "loss": 0.0049, "step": 7553 }, { "epoch": 3.4367606915377618, "grad_norm": 0.20052999210861674, "learning_rate": 2.2252692092191864e-06, "loss": 0.002, "step": 7554 }, { "epoch": 3.4372156505914466, "grad_norm": 0.32467840049268315, "learning_rate": 2.2240803108732024e-06, "loss": 0.0059, "step": 7555 }, { "epoch": 3.437670609645132, "grad_norm": 0.1844286177408229, "learning_rate": 2.2228916393628407e-06, "loss": 0.0015, "step": 7556 }, { "epoch": 3.4381255686988172, "grad_norm": 0.2418682440631349, "learning_rate": 2.2217031947852336e-06, "loss": 0.0032, "step": 7557 }, { "epoch": 3.438580527752502, "grad_norm": 0.4792988592408678, "learning_rate": 2.220514977237494e-06, "loss": 0.0089, "step": 7558 }, { "epoch": 3.4390354868061874, "grad_norm": 0.38816660052508606, "learning_rate": 2.219326986816717e-06, "loss": 0.0093, "step": 7559 }, { "epoch": 3.4394904458598727, "grad_norm": 0.12074161677617608, "learning_rate": 2.218139223619983e-06, "loss": 0.001, "step": 7560 }, { "epoch": 3.4399454049135576, "grad_norm": 0.34773008386237036, "learning_rate": 2.2169516877443487e-06, "loss": 0.0048, "step": 7561 }, { "epoch": 3.440400363967243, "grad_norm": 0.423207554559181, "learning_rate": 2.215764379286853e-06, "loss": 0.0064, "step": 7562 }, { "epoch": 3.4408553230209282, "grad_norm": 0.04242146456268766, "learning_rate": 2.21457729834452e-06, "loss": 0.0004, "step": 7563 }, { "epoch": 3.441310282074613, "grad_norm": 1.2470397646639153, "learning_rate": 2.2133904450143502e-06, "loss": 0.0155, "step": 7564 }, { "epoch": 3.4417652411282984, "grad_norm": 0.13792975474096358, "learning_rate": 2.2122038193933297e-06, "loss": 0.0023, "step": 7565 }, { "epoch": 3.4422202001819837, "grad_norm": 0.09370130857618474, "learning_rate": 2.211017421578425e-06, "loss": 0.0011, "step": 7566 }, { "epoch": 3.4426751592356686, "grad_norm": 0.23737595228068492, "learning_rate": 2.2098312516665806e-06, "loss": 0.0054, "step": 7567 }, { "epoch": 3.443130118289354, "grad_norm": 0.2664383212374511, "learning_rate": 2.2086453097547244e-06, "loss": 0.003, "step": 7568 }, { "epoch": 3.443585077343039, "grad_norm": 0.16132888935395276, "learning_rate": 2.2074595959397675e-06, "loss": 0.0028, "step": 7569 }, { "epoch": 3.444040036396724, "grad_norm": 0.5772967328366053, "learning_rate": 2.2062741103186037e-06, "loss": 0.0108, "step": 7570 }, { "epoch": 3.4444949954504094, "grad_norm": 0.3199739605964402, "learning_rate": 2.205088852988103e-06, "loss": 0.0082, "step": 7571 }, { "epoch": 3.4449499545040947, "grad_norm": 0.38774756555729817, "learning_rate": 2.20390382404512e-06, "loss": 0.0092, "step": 7572 }, { "epoch": 3.4454049135577796, "grad_norm": 0.1104623776806025, "learning_rate": 2.2027190235864875e-06, "loss": 0.0008, "step": 7573 }, { "epoch": 3.445859872611465, "grad_norm": 0.10013143827437533, "learning_rate": 2.201534451709025e-06, "loss": 0.0011, "step": 7574 }, { "epoch": 3.44631483166515, "grad_norm": 0.2688118064240035, "learning_rate": 2.20035010850953e-06, "loss": 0.0021, "step": 7575 }, { "epoch": 3.4467697907188355, "grad_norm": 0.2948865089535925, "learning_rate": 2.1991659940847797e-06, "loss": 0.0032, "step": 7576 }, { "epoch": 3.4472247497725204, "grad_norm": 0.4027637446370561, "learning_rate": 2.197982108531537e-06, "loss": 0.003, "step": 7577 }, { "epoch": 3.4476797088262057, "grad_norm": 0.081179246782656, "learning_rate": 2.1967984519465414e-06, "loss": 0.0006, "step": 7578 }, { "epoch": 3.448134667879891, "grad_norm": 0.2759429853936018, "learning_rate": 2.1956150244265184e-06, "loss": 0.008, "step": 7579 }, { "epoch": 3.448589626933576, "grad_norm": 0.06454804157775773, "learning_rate": 2.1944318260681715e-06, "loss": 0.0004, "step": 7580 }, { "epoch": 3.449044585987261, "grad_norm": 0.22803415435311675, "learning_rate": 2.193248856968185e-06, "loss": 0.0012, "step": 7581 }, { "epoch": 3.4494995450409465, "grad_norm": 0.03709175174358703, "learning_rate": 2.192066117223228e-06, "loss": 0.0003, "step": 7582 }, { "epoch": 3.4499545040946313, "grad_norm": 0.22274667973839699, "learning_rate": 2.190883606929945e-06, "loss": 0.0027, "step": 7583 }, { "epoch": 3.4504094631483166, "grad_norm": 0.27241348710092617, "learning_rate": 2.1897013261849678e-06, "loss": 0.0042, "step": 7584 }, { "epoch": 3.450864422202002, "grad_norm": 0.2061567636885349, "learning_rate": 2.1885192750849087e-06, "loss": 0.0021, "step": 7585 }, { "epoch": 3.451319381255687, "grad_norm": 0.25342431366619456, "learning_rate": 2.187337453726358e-06, "loss": 0.002, "step": 7586 }, { "epoch": 3.451774340309372, "grad_norm": 0.20710263181165453, "learning_rate": 2.186155862205889e-06, "loss": 0.0021, "step": 7587 }, { "epoch": 3.4522292993630574, "grad_norm": 0.08750458274873917, "learning_rate": 2.1849745006200536e-06, "loss": 0.0005, "step": 7588 }, { "epoch": 3.4526842584167423, "grad_norm": 0.14585580421027522, "learning_rate": 2.183793369065391e-06, "loss": 0.0033, "step": 7589 }, { "epoch": 3.4531392174704276, "grad_norm": 0.2129683517211805, "learning_rate": 2.182612467638415e-06, "loss": 0.0034, "step": 7590 }, { "epoch": 3.453594176524113, "grad_norm": 0.18623176925640242, "learning_rate": 2.181431796435627e-06, "loss": 0.002, "step": 7591 }, { "epoch": 3.4540491355777982, "grad_norm": 0.3411416919071153, "learning_rate": 2.1802513555535038e-06, "loss": 0.0074, "step": 7592 }, { "epoch": 3.454504094631483, "grad_norm": 0.06228605865594398, "learning_rate": 2.1790711450885038e-06, "loss": 0.0007, "step": 7593 }, { "epoch": 3.4549590536851684, "grad_norm": 0.20682244288660823, "learning_rate": 2.1778911651370728e-06, "loss": 0.003, "step": 7594 }, { "epoch": 3.4554140127388537, "grad_norm": 0.37674912856008497, "learning_rate": 2.176711415795631e-06, "loss": 0.0049, "step": 7595 }, { "epoch": 3.4558689717925386, "grad_norm": 0.20956696853928675, "learning_rate": 2.1755318971605828e-06, "loss": 0.0048, "step": 7596 }, { "epoch": 3.456323930846224, "grad_norm": 0.13996537141149556, "learning_rate": 2.1743526093283106e-06, "loss": 0.0014, "step": 7597 }, { "epoch": 3.4567788898999092, "grad_norm": 0.11111230907779905, "learning_rate": 2.1731735523951832e-06, "loss": 0.0013, "step": 7598 }, { "epoch": 3.457233848953594, "grad_norm": 0.3765596737913546, "learning_rate": 2.1719947264575484e-06, "loss": 0.0091, "step": 7599 }, { "epoch": 3.4576888080072794, "grad_norm": 0.21079256008288408, "learning_rate": 2.1708161316117338e-06, "loss": 0.0021, "step": 7600 }, { "epoch": 3.4581437670609647, "grad_norm": 0.053408008752552555, "learning_rate": 2.169637767954048e-06, "loss": 0.0006, "step": 7601 }, { "epoch": 3.4585987261146496, "grad_norm": 0.40635097820663696, "learning_rate": 2.1684596355807807e-06, "loss": 0.0068, "step": 7602 }, { "epoch": 3.459053685168335, "grad_norm": 0.1806692460087327, "learning_rate": 2.167281734588207e-06, "loss": 0.0027, "step": 7603 }, { "epoch": 3.45950864422202, "grad_norm": 0.4497160257864994, "learning_rate": 2.166104065072575e-06, "loss": 0.0101, "step": 7604 }, { "epoch": 3.459963603275705, "grad_norm": 0.1287875528761719, "learning_rate": 2.164926627130123e-06, "loss": 0.0021, "step": 7605 }, { "epoch": 3.4604185623293904, "grad_norm": 0.19435939703400862, "learning_rate": 2.163749420857064e-06, "loss": 0.0013, "step": 7606 }, { "epoch": 3.4608735213830757, "grad_norm": 0.3117927694283147, "learning_rate": 2.162572446349592e-06, "loss": 0.0036, "step": 7607 }, { "epoch": 3.4613284804367606, "grad_norm": 0.20144599399754456, "learning_rate": 2.161395703703888e-06, "loss": 0.0026, "step": 7608 }, { "epoch": 3.461783439490446, "grad_norm": 0.20076451955079833, "learning_rate": 2.160219193016108e-06, "loss": 0.0047, "step": 7609 }, { "epoch": 3.462238398544131, "grad_norm": 0.2855118399669933, "learning_rate": 2.159042914382391e-06, "loss": 0.0078, "step": 7610 }, { "epoch": 3.462693357597816, "grad_norm": 0.2858193240528223, "learning_rate": 2.1578668678988556e-06, "loss": 0.0023, "step": 7611 }, { "epoch": 3.4631483166515014, "grad_norm": 0.35197519842563885, "learning_rate": 2.1566910536616052e-06, "loss": 0.0031, "step": 7612 }, { "epoch": 3.4636032757051867, "grad_norm": 0.6211274895977381, "learning_rate": 2.155515471766723e-06, "loss": 0.0085, "step": 7613 }, { "epoch": 3.4640582347588715, "grad_norm": 0.1082607290666677, "learning_rate": 2.154340122310271e-06, "loss": 0.0014, "step": 7614 }, { "epoch": 3.464513193812557, "grad_norm": 0.11953090664049322, "learning_rate": 2.1531650053882934e-06, "loss": 0.0014, "step": 7615 }, { "epoch": 3.464968152866242, "grad_norm": 0.701244752820876, "learning_rate": 2.151990121096816e-06, "loss": 0.0082, "step": 7616 }, { "epoch": 3.465423111919927, "grad_norm": 0.3539661775632979, "learning_rate": 2.1508154695318417e-06, "loss": 0.0054, "step": 7617 }, { "epoch": 3.4658780709736123, "grad_norm": 0.1994463520339777, "learning_rate": 2.1496410507893608e-06, "loss": 0.0034, "step": 7618 }, { "epoch": 3.4663330300272976, "grad_norm": 0.21445122631720664, "learning_rate": 2.148466864965343e-06, "loss": 0.001, "step": 7619 }, { "epoch": 3.4667879890809825, "grad_norm": 0.5655065115031422, "learning_rate": 2.147292912155735e-06, "loss": 0.0091, "step": 7620 }, { "epoch": 3.467242948134668, "grad_norm": 0.27343652443409633, "learning_rate": 2.146119192456468e-06, "loss": 0.0053, "step": 7621 }, { "epoch": 3.467697907188353, "grad_norm": 0.3044766199111716, "learning_rate": 2.1449457059634505e-06, "loss": 0.0045, "step": 7622 }, { "epoch": 3.468152866242038, "grad_norm": 0.20504372124105646, "learning_rate": 2.1437724527725785e-06, "loss": 0.0036, "step": 7623 }, { "epoch": 3.4686078252957233, "grad_norm": 0.3428906131655945, "learning_rate": 2.1425994329797233e-06, "loss": 0.0038, "step": 7624 }, { "epoch": 3.4690627843494086, "grad_norm": 0.32566883383029654, "learning_rate": 2.1414266466807365e-06, "loss": 0.0046, "step": 7625 }, { "epoch": 3.4695177434030935, "grad_norm": 0.0828636824706321, "learning_rate": 2.1402540939714565e-06, "loss": 0.0004, "step": 7626 }, { "epoch": 3.469972702456779, "grad_norm": 0.21200476841826338, "learning_rate": 2.139081774947696e-06, "loss": 0.0019, "step": 7627 }, { "epoch": 3.470427661510464, "grad_norm": 0.2749722201481947, "learning_rate": 2.1379096897052547e-06, "loss": 0.0041, "step": 7628 }, { "epoch": 3.470882620564149, "grad_norm": 0.2897640267601307, "learning_rate": 2.136737838339909e-06, "loss": 0.0036, "step": 7629 }, { "epoch": 3.4713375796178343, "grad_norm": 0.3814692753140629, "learning_rate": 2.135566220947416e-06, "loss": 0.0112, "step": 7630 }, { "epoch": 3.4717925386715196, "grad_norm": 0.2363076761425247, "learning_rate": 2.1343948376235146e-06, "loss": 0.0056, "step": 7631 }, { "epoch": 3.472247497725205, "grad_norm": 0.7851429315806875, "learning_rate": 2.1332236884639256e-06, "loss": 0.0036, "step": 7632 }, { "epoch": 3.47270245677889, "grad_norm": 0.21600176693922699, "learning_rate": 2.1320527735643526e-06, "loss": 0.0055, "step": 7633 }, { "epoch": 3.473157415832575, "grad_norm": 0.21443155601202132, "learning_rate": 2.1308820930204753e-06, "loss": 0.0016, "step": 7634 }, { "epoch": 3.4736123748862604, "grad_norm": 0.041329900558874755, "learning_rate": 2.1297116469279566e-06, "loss": 0.0004, "step": 7635 }, { "epoch": 3.4740673339399453, "grad_norm": 0.42409122140892475, "learning_rate": 2.128541435382438e-06, "loss": 0.0041, "step": 7636 }, { "epoch": 3.4745222929936306, "grad_norm": 0.36781015367786757, "learning_rate": 2.127371458479548e-06, "loss": 0.0097, "step": 7637 }, { "epoch": 3.474977252047316, "grad_norm": 0.23064456548736212, "learning_rate": 2.1262017163148895e-06, "loss": 0.0053, "step": 7638 }, { "epoch": 3.4754322111010008, "grad_norm": 0.06676349607223435, "learning_rate": 2.1250322089840477e-06, "loss": 0.0004, "step": 7639 }, { "epoch": 3.475887170154686, "grad_norm": 0.25347342042534343, "learning_rate": 2.1238629365825913e-06, "loss": 0.003, "step": 7640 }, { "epoch": 3.4763421292083714, "grad_norm": 0.236096168728993, "learning_rate": 2.1226938992060658e-06, "loss": 0.004, "step": 7641 }, { "epoch": 3.4767970882620562, "grad_norm": 0.18380529756649028, "learning_rate": 2.121525096950003e-06, "loss": 0.0022, "step": 7642 }, { "epoch": 3.4772520473157416, "grad_norm": 0.18749254168424617, "learning_rate": 2.12035652990991e-06, "loss": 0.0034, "step": 7643 }, { "epoch": 3.477707006369427, "grad_norm": 0.4144093987784156, "learning_rate": 2.1191881981812775e-06, "loss": 0.0047, "step": 7644 }, { "epoch": 3.4781619654231117, "grad_norm": 0.09958505928649422, "learning_rate": 2.118020101859573e-06, "loss": 0.0007, "step": 7645 }, { "epoch": 3.478616924476797, "grad_norm": 0.06464789867528721, "learning_rate": 2.116852241040252e-06, "loss": 0.0005, "step": 7646 }, { "epoch": 3.4790718835304824, "grad_norm": 0.27935140306616363, "learning_rate": 2.115684615818747e-06, "loss": 0.0072, "step": 7647 }, { "epoch": 3.4795268425841677, "grad_norm": 0.27625905261306927, "learning_rate": 2.1145172262904695e-06, "loss": 0.0034, "step": 7648 }, { "epoch": 3.4799818016378525, "grad_norm": 0.18352368089048088, "learning_rate": 2.1133500725508138e-06, "loss": 0.0015, "step": 7649 }, { "epoch": 3.480436760691538, "grad_norm": 0.3003965541889959, "learning_rate": 2.1121831546951523e-06, "loss": 0.005, "step": 7650 }, { "epoch": 3.480891719745223, "grad_norm": 0.3302318788975426, "learning_rate": 2.1110164728188444e-06, "loss": 0.0029, "step": 7651 }, { "epoch": 3.481346678798908, "grad_norm": 0.2517251573498449, "learning_rate": 2.1098500270172227e-06, "loss": 0.0057, "step": 7652 }, { "epoch": 3.4818016378525933, "grad_norm": 0.13703521113700673, "learning_rate": 2.108683817385604e-06, "loss": 0.0021, "step": 7653 }, { "epoch": 3.4822565969062786, "grad_norm": 0.17465017489104626, "learning_rate": 2.1075178440192883e-06, "loss": 0.0026, "step": 7654 }, { "epoch": 3.4827115559599635, "grad_norm": 0.3465028784605604, "learning_rate": 2.1063521070135524e-06, "loss": 0.0113, "step": 7655 }, { "epoch": 3.483166515013649, "grad_norm": 0.3202340544807439, "learning_rate": 2.105186606463653e-06, "loss": 0.0031, "step": 7656 }, { "epoch": 3.483621474067334, "grad_norm": 0.2195421656372889, "learning_rate": 2.104021342464832e-06, "loss": 0.0053, "step": 7657 }, { "epoch": 3.484076433121019, "grad_norm": 0.3054300703290584, "learning_rate": 2.10285631511231e-06, "loss": 0.0039, "step": 7658 }, { "epoch": 3.4845313921747043, "grad_norm": 0.23760591490469737, "learning_rate": 2.101691524501286e-06, "loss": 0.0021, "step": 7659 }, { "epoch": 3.4849863512283896, "grad_norm": 0.04292499430560187, "learning_rate": 2.10052697072694e-06, "loss": 0.0004, "step": 7660 }, { "epoch": 3.4854413102820745, "grad_norm": 0.14970135576185403, "learning_rate": 2.099362653884436e-06, "loss": 0.0022, "step": 7661 }, { "epoch": 3.48589626933576, "grad_norm": 0.49591442435479727, "learning_rate": 2.0981985740689186e-06, "loss": 0.0122, "step": 7662 }, { "epoch": 3.486351228389445, "grad_norm": 0.013466798394920708, "learning_rate": 2.0970347313755095e-06, "loss": 0.0001, "step": 7663 }, { "epoch": 3.48680618744313, "grad_norm": 0.34123558150053246, "learning_rate": 2.0958711258993126e-06, "loss": 0.0035, "step": 7664 }, { "epoch": 3.4872611464968153, "grad_norm": 0.45707477527665535, "learning_rate": 2.0947077577354102e-06, "loss": 0.0089, "step": 7665 }, { "epoch": 3.4877161055505006, "grad_norm": 0.14902716143499972, "learning_rate": 2.0935446269788718e-06, "loss": 0.0011, "step": 7666 }, { "epoch": 3.4881710646041855, "grad_norm": 0.096295257932685, "learning_rate": 2.0923817337247394e-06, "loss": 0.0006, "step": 7667 }, { "epoch": 3.488626023657871, "grad_norm": 0.04511701075256628, "learning_rate": 2.0912190780680425e-06, "loss": 0.0004, "step": 7668 }, { "epoch": 3.489080982711556, "grad_norm": 0.15709166808886962, "learning_rate": 2.0900566601037865e-06, "loss": 0.0024, "step": 7669 }, { "epoch": 3.489535941765241, "grad_norm": 0.30102747564671173, "learning_rate": 2.0888944799269573e-06, "loss": 0.0034, "step": 7670 }, { "epoch": 3.4899909008189263, "grad_norm": 0.04903034125974509, "learning_rate": 2.087732537632527e-06, "loss": 0.0004, "step": 7671 }, { "epoch": 3.4904458598726116, "grad_norm": 0.13787578985762405, "learning_rate": 2.0865708333154415e-06, "loss": 0.0012, "step": 7672 }, { "epoch": 3.4909008189262964, "grad_norm": 0.4930112044362682, "learning_rate": 2.085409367070631e-06, "loss": 0.0078, "step": 7673 }, { "epoch": 3.4913557779799818, "grad_norm": 0.3331102491912277, "learning_rate": 2.0842481389930024e-06, "loss": 0.0042, "step": 7674 }, { "epoch": 3.491810737033667, "grad_norm": 0.24518165619961485, "learning_rate": 2.083087149177449e-06, "loss": 0.0055, "step": 7675 }, { "epoch": 3.492265696087352, "grad_norm": 0.26806674327420144, "learning_rate": 2.0819263977188433e-06, "loss": 0.0044, "step": 7676 }, { "epoch": 3.4927206551410372, "grad_norm": 0.12831954387596262, "learning_rate": 2.0807658847120336e-06, "loss": 0.0011, "step": 7677 }, { "epoch": 3.4931756141947226, "grad_norm": 0.3574771135467804, "learning_rate": 2.079605610251853e-06, "loss": 0.0088, "step": 7678 }, { "epoch": 3.4936305732484074, "grad_norm": 0.9454144648403213, "learning_rate": 2.0784455744331115e-06, "loss": 0.0033, "step": 7679 }, { "epoch": 3.4940855323020927, "grad_norm": 0.4379124975836111, "learning_rate": 2.077285777350606e-06, "loss": 0.0078, "step": 7680 }, { "epoch": 3.494540491355778, "grad_norm": 0.4310839291740187, "learning_rate": 2.0761262190991065e-06, "loss": 0.0043, "step": 7681 }, { "epoch": 3.494995450409463, "grad_norm": 0.2914926678941282, "learning_rate": 2.07496689977337e-06, "loss": 0.0031, "step": 7682 }, { "epoch": 3.4954504094631482, "grad_norm": 0.24195137796044136, "learning_rate": 2.073807819468129e-06, "loss": 0.0016, "step": 7683 }, { "epoch": 3.4959053685168335, "grad_norm": 0.21132836396361337, "learning_rate": 2.072648978278096e-06, "loss": 0.0026, "step": 7684 }, { "epoch": 3.496360327570519, "grad_norm": 0.43372639824221204, "learning_rate": 2.0714903762979716e-06, "loss": 0.0091, "step": 7685 }, { "epoch": 3.4968152866242037, "grad_norm": 0.4669176399759277, "learning_rate": 2.0703320136224276e-06, "loss": 0.0052, "step": 7686 }, { "epoch": 3.497270245677889, "grad_norm": 0.34361418459078286, "learning_rate": 2.0691738903461218e-06, "loss": 0.0024, "step": 7687 }, { "epoch": 3.4977252047315743, "grad_norm": 0.05989770662271216, "learning_rate": 2.0680160065636883e-06, "loss": 0.0004, "step": 7688 }, { "epoch": 3.498180163785259, "grad_norm": 0.2598226848700686, "learning_rate": 2.0668583623697473e-06, "loss": 0.0057, "step": 7689 }, { "epoch": 3.4986351228389445, "grad_norm": 0.12947657831849282, "learning_rate": 2.065700957858894e-06, "loss": 0.001, "step": 7690 }, { "epoch": 3.49909008189263, "grad_norm": 0.26414260804908984, "learning_rate": 2.0645437931257084e-06, "loss": 0.0029, "step": 7691 }, { "epoch": 3.4995450409463147, "grad_norm": 0.2324277480681154, "learning_rate": 2.063386868264748e-06, "loss": 0.0039, "step": 7692 }, { "epoch": 3.5, "grad_norm": 0.3713831590109256, "learning_rate": 2.062230183370551e-06, "loss": 0.0129, "step": 7693 }, { "epoch": 3.5004549590536853, "grad_norm": 0.3314732097380107, "learning_rate": 2.061073738537635e-06, "loss": 0.0049, "step": 7694 }, { "epoch": 3.50090991810737, "grad_norm": 0.154796189639817, "learning_rate": 2.0599175338605003e-06, "loss": 0.0017, "step": 7695 }, { "epoch": 3.5013648771610555, "grad_norm": 0.13762628370203114, "learning_rate": 2.05876156943363e-06, "loss": 0.0017, "step": 7696 }, { "epoch": 3.501819836214741, "grad_norm": 0.24251629376883618, "learning_rate": 2.0576058453514813e-06, "loss": 0.0023, "step": 7697 }, { "epoch": 3.502274795268426, "grad_norm": 0.09266297595642309, "learning_rate": 2.056450361708495e-06, "loss": 0.0007, "step": 7698 }, { "epoch": 3.502729754322111, "grad_norm": 0.21739943901262987, "learning_rate": 2.055295118599091e-06, "loss": 0.0017, "step": 7699 }, { "epoch": 3.5031847133757963, "grad_norm": 0.4813147020638217, "learning_rate": 2.0541401161176734e-06, "loss": 0.0056, "step": 7700 }, { "epoch": 3.5036396724294816, "grad_norm": 0.28823743863945117, "learning_rate": 2.052985354358622e-06, "loss": 0.0034, "step": 7701 }, { "epoch": 3.5040946314831665, "grad_norm": 0.5030958069763313, "learning_rate": 2.0518308334162967e-06, "loss": 0.0066, "step": 7702 }, { "epoch": 3.5045495905368518, "grad_norm": 0.35179867111866353, "learning_rate": 2.0506765533850443e-06, "loss": 0.0027, "step": 7703 }, { "epoch": 3.505004549590537, "grad_norm": 0.1695077708781563, "learning_rate": 2.049522514359183e-06, "loss": 0.0017, "step": 7704 }, { "epoch": 3.505459508644222, "grad_norm": 0.13925839508068513, "learning_rate": 2.048368716433019e-06, "loss": 0.001, "step": 7705 }, { "epoch": 3.5059144676979073, "grad_norm": 0.20857052666077638, "learning_rate": 2.0472151597008343e-06, "loss": 0.0018, "step": 7706 }, { "epoch": 3.5063694267515926, "grad_norm": 0.32588152477384214, "learning_rate": 2.046061844256892e-06, "loss": 0.0063, "step": 7707 }, { "epoch": 3.5068243858052774, "grad_norm": 0.2159640328455583, "learning_rate": 2.044908770195434e-06, "loss": 0.0045, "step": 7708 }, { "epoch": 3.5072793448589628, "grad_norm": 0.0927780252449845, "learning_rate": 2.043755937610686e-06, "loss": 0.0009, "step": 7709 }, { "epoch": 3.507734303912648, "grad_norm": 0.3699721690080576, "learning_rate": 2.042603346596855e-06, "loss": 0.0036, "step": 7710 }, { "epoch": 3.508189262966333, "grad_norm": 0.14692602124506826, "learning_rate": 2.041450997248123e-06, "loss": 0.001, "step": 7711 }, { "epoch": 3.5086442220200182, "grad_norm": 0.13063334895354267, "learning_rate": 2.0402988896586544e-06, "loss": 0.0016, "step": 7712 }, { "epoch": 3.5090991810737036, "grad_norm": 0.11577327532530884, "learning_rate": 2.039147023922593e-06, "loss": 0.0009, "step": 7713 }, { "epoch": 3.5095541401273884, "grad_norm": 0.27500033312287503, "learning_rate": 2.0379954001340676e-06, "loss": 0.0032, "step": 7714 }, { "epoch": 3.5100090991810737, "grad_norm": 0.20955556319604213, "learning_rate": 2.0368440183871812e-06, "loss": 0.0029, "step": 7715 }, { "epoch": 3.510464058234759, "grad_norm": 0.3950502179693419, "learning_rate": 2.035692878776019e-06, "loss": 0.0072, "step": 7716 }, { "epoch": 3.510919017288444, "grad_norm": 0.12650653859475156, "learning_rate": 2.03454198139465e-06, "loss": 0.0021, "step": 7717 }, { "epoch": 3.511373976342129, "grad_norm": 0.20906515268210074, "learning_rate": 2.0333913263371157e-06, "loss": 0.0059, "step": 7718 }, { "epoch": 3.5118289353958145, "grad_norm": 0.12271949406361587, "learning_rate": 2.032240913697448e-06, "loss": 0.0011, "step": 7719 }, { "epoch": 3.5122838944494994, "grad_norm": 0.17998330403633808, "learning_rate": 2.03109074356965e-06, "loss": 0.0026, "step": 7720 }, { "epoch": 3.5127388535031847, "grad_norm": 0.3815183137353262, "learning_rate": 2.0299408160477084e-06, "loss": 0.003, "step": 7721 }, { "epoch": 3.51319381255687, "grad_norm": 0.14660418518619533, "learning_rate": 2.0287911312255916e-06, "loss": 0.0022, "step": 7722 }, { "epoch": 3.513648771610555, "grad_norm": 0.40032699275641115, "learning_rate": 2.0276416891972416e-06, "loss": 0.007, "step": 7723 }, { "epoch": 3.51410373066424, "grad_norm": 0.09232605913048236, "learning_rate": 2.0264924900565937e-06, "loss": 0.0014, "step": 7724 }, { "epoch": 3.5145586897179255, "grad_norm": 0.16072560504405328, "learning_rate": 2.0253435338975506e-06, "loss": 0.0021, "step": 7725 }, { "epoch": 3.5150136487716104, "grad_norm": 0.0888364786911226, "learning_rate": 2.024194820814001e-06, "loss": 0.0009, "step": 7726 }, { "epoch": 3.5154686078252957, "grad_norm": 0.2534815771670263, "learning_rate": 2.023046350899812e-06, "loss": 0.0039, "step": 7727 }, { "epoch": 3.515923566878981, "grad_norm": 0.136855460118477, "learning_rate": 2.0218981242488295e-06, "loss": 0.0015, "step": 7728 }, { "epoch": 3.516378525932666, "grad_norm": 0.2555215073819053, "learning_rate": 2.0207501409548854e-06, "loss": 0.0087, "step": 7729 }, { "epoch": 3.516833484986351, "grad_norm": 0.3361774757234204, "learning_rate": 2.019602401111783e-06, "loss": 0.0119, "step": 7730 }, { "epoch": 3.5172884440400365, "grad_norm": 0.19381321912426963, "learning_rate": 2.0184549048133157e-06, "loss": 0.0035, "step": 7731 }, { "epoch": 3.5177434030937214, "grad_norm": 0.43141822202194136, "learning_rate": 2.0173076521532485e-06, "loss": 0.0064, "step": 7732 }, { "epoch": 3.5181983621474067, "grad_norm": 0.19249594679272394, "learning_rate": 2.016160643225329e-06, "loss": 0.0035, "step": 7733 }, { "epoch": 3.518653321201092, "grad_norm": 0.3965970263838499, "learning_rate": 2.015013878123288e-06, "loss": 0.0059, "step": 7734 }, { "epoch": 3.519108280254777, "grad_norm": 0.23543948713337504, "learning_rate": 2.013867356940833e-06, "loss": 0.0032, "step": 7735 }, { "epoch": 3.519563239308462, "grad_norm": 0.15592626464027823, "learning_rate": 2.0127210797716522e-06, "loss": 0.0008, "step": 7736 }, { "epoch": 3.5200181983621475, "grad_norm": 0.2949229386168134, "learning_rate": 2.0115750467094132e-06, "loss": 0.0075, "step": 7737 }, { "epoch": 3.5204731574158323, "grad_norm": 0.23403578862731103, "learning_rate": 2.010429257847765e-06, "loss": 0.0028, "step": 7738 }, { "epoch": 3.5209281164695176, "grad_norm": 0.35966101897758035, "learning_rate": 2.0092837132803396e-06, "loss": 0.0085, "step": 7739 }, { "epoch": 3.521383075523203, "grad_norm": 0.2564744225208268, "learning_rate": 2.0081384131007425e-06, "loss": 0.0023, "step": 7740 }, { "epoch": 3.521838034576888, "grad_norm": 0.3584577571659201, "learning_rate": 2.0069933574025634e-06, "loss": 0.007, "step": 7741 }, { "epoch": 3.522292993630573, "grad_norm": 0.39500534624058986, "learning_rate": 2.0058485462793693e-06, "loss": 0.0039, "step": 7742 }, { "epoch": 3.5227479526842584, "grad_norm": 0.2156540854241804, "learning_rate": 2.004703979824712e-06, "loss": 0.0036, "step": 7743 }, { "epoch": 3.5232029117379433, "grad_norm": 0.3463758948091541, "learning_rate": 2.003559658132117e-06, "loss": 0.0068, "step": 7744 }, { "epoch": 3.5236578707916286, "grad_norm": 0.11934460967375501, "learning_rate": 2.0024155812950967e-06, "loss": 0.0021, "step": 7745 }, { "epoch": 3.524112829845314, "grad_norm": 0.10718495765205331, "learning_rate": 2.0012717494071384e-06, "loss": 0.0022, "step": 7746 }, { "epoch": 3.5245677888989992, "grad_norm": 0.2771520953346642, "learning_rate": 2.0001281625617086e-06, "loss": 0.0072, "step": 7747 }, { "epoch": 3.525022747952684, "grad_norm": 0.2886675412571639, "learning_rate": 1.99898482085226e-06, "loss": 0.0054, "step": 7748 }, { "epoch": 3.5254777070063694, "grad_norm": 0.29857230312345473, "learning_rate": 1.9978417243722192e-06, "loss": 0.0025, "step": 7749 }, { "epoch": 3.5259326660600547, "grad_norm": 0.24868240714629586, "learning_rate": 1.996698873214995e-06, "loss": 0.0041, "step": 7750 }, { "epoch": 3.5263876251137396, "grad_norm": 0.1859194084631994, "learning_rate": 1.995556267473975e-06, "loss": 0.0049, "step": 7751 }, { "epoch": 3.526842584167425, "grad_norm": 0.12545191760091903, "learning_rate": 1.9944139072425276e-06, "loss": 0.0017, "step": 7752 }, { "epoch": 3.52729754322111, "grad_norm": 0.2047816519840243, "learning_rate": 1.9932717926140055e-06, "loss": 0.0022, "step": 7753 }, { "epoch": 3.5277525022747955, "grad_norm": 0.31282131817938136, "learning_rate": 1.992129923681734e-06, "loss": 0.0097, "step": 7754 }, { "epoch": 3.5282074613284804, "grad_norm": 0.24047159717721744, "learning_rate": 1.9909883005390213e-06, "loss": 0.0029, "step": 7755 }, { "epoch": 3.5286624203821657, "grad_norm": 0.3848924828105359, "learning_rate": 1.9898469232791546e-06, "loss": 0.0062, "step": 7756 }, { "epoch": 3.529117379435851, "grad_norm": 0.20400027152249636, "learning_rate": 1.9887057919954056e-06, "loss": 0.0037, "step": 7757 }, { "epoch": 3.529572338489536, "grad_norm": 0.21319407139990143, "learning_rate": 1.9875649067810184e-06, "loss": 0.0022, "step": 7758 }, { "epoch": 3.530027297543221, "grad_norm": 0.20840203584149328, "learning_rate": 1.9864242677292244e-06, "loss": 0.0017, "step": 7759 }, { "epoch": 3.5304822565969065, "grad_norm": 0.10516358843511583, "learning_rate": 1.9852838749332304e-06, "loss": 0.0015, "step": 7760 }, { "epoch": 3.5309372156505914, "grad_norm": 0.19790041912850087, "learning_rate": 1.984143728486224e-06, "loss": 0.0041, "step": 7761 }, { "epoch": 3.5313921747042767, "grad_norm": 0.21096348895533565, "learning_rate": 1.9830038284813708e-06, "loss": 0.0038, "step": 7762 }, { "epoch": 3.531847133757962, "grad_norm": 0.12389181466543855, "learning_rate": 1.981864175011822e-06, "loss": 0.0022, "step": 7763 }, { "epoch": 3.532302092811647, "grad_norm": 0.06832625849825473, "learning_rate": 1.980724768170702e-06, "loss": 0.0009, "step": 7764 }, { "epoch": 3.532757051865332, "grad_norm": 0.26512545294472484, "learning_rate": 1.9795856080511184e-06, "loss": 0.0033, "step": 7765 }, { "epoch": 3.5332120109190175, "grad_norm": 0.09913613064610376, "learning_rate": 1.97844669474616e-06, "loss": 0.0008, "step": 7766 }, { "epoch": 3.5336669699727024, "grad_norm": 0.11892462331092149, "learning_rate": 1.977308028348891e-06, "loss": 0.0015, "step": 7767 }, { "epoch": 3.5341219290263877, "grad_norm": 0.21379987765787165, "learning_rate": 1.976169608952361e-06, "loss": 0.0043, "step": 7768 }, { "epoch": 3.534576888080073, "grad_norm": 0.4392171257258942, "learning_rate": 1.9750314366495953e-06, "loss": 0.0077, "step": 7769 }, { "epoch": 3.535031847133758, "grad_norm": 0.5114805694341462, "learning_rate": 1.9738935115336004e-06, "loss": 0.0048, "step": 7770 }, { "epoch": 3.535486806187443, "grad_norm": 0.3052064749925445, "learning_rate": 1.9727558336973594e-06, "loss": 0.0052, "step": 7771 }, { "epoch": 3.5359417652411285, "grad_norm": 0.26725770758324674, "learning_rate": 1.9716184032338415e-06, "loss": 0.003, "step": 7772 }, { "epoch": 3.5363967242948133, "grad_norm": 0.4597987824136517, "learning_rate": 1.9704812202359928e-06, "loss": 0.009, "step": 7773 }, { "epoch": 3.5368516833484986, "grad_norm": 0.11087107979603508, "learning_rate": 1.9693442847967383e-06, "loss": 0.0008, "step": 7774 }, { "epoch": 3.537306642402184, "grad_norm": 0.23878980076655165, "learning_rate": 1.9682075970089815e-06, "loss": 0.0049, "step": 7775 }, { "epoch": 3.537761601455869, "grad_norm": 0.34025913021293663, "learning_rate": 1.9670711569656076e-06, "loss": 0.004, "step": 7776 }, { "epoch": 3.538216560509554, "grad_norm": 0.2054945240914702, "learning_rate": 1.9659349647594835e-06, "loss": 0.004, "step": 7777 }, { "epoch": 3.5386715195632394, "grad_norm": 0.26478068246536995, "learning_rate": 1.964799020483452e-06, "loss": 0.0067, "step": 7778 }, { "epoch": 3.5391264786169243, "grad_norm": 0.11053642628877315, "learning_rate": 1.9636633242303365e-06, "loss": 0.0015, "step": 7779 }, { "epoch": 3.5395814376706096, "grad_norm": 0.20097693948923082, "learning_rate": 1.962527876092944e-06, "loss": 0.0035, "step": 7780 }, { "epoch": 3.540036396724295, "grad_norm": 0.3552994232820875, "learning_rate": 1.9613926761640543e-06, "loss": 0.0046, "step": 7781 }, { "epoch": 3.54049135577798, "grad_norm": 0.2165973136872899, "learning_rate": 1.9602577245364345e-06, "loss": 0.0037, "step": 7782 }, { "epoch": 3.540946314831665, "grad_norm": 0.22827804602151244, "learning_rate": 1.9591230213028265e-06, "loss": 0.0022, "step": 7783 }, { "epoch": 3.5414012738853504, "grad_norm": 0.20769097356530997, "learning_rate": 1.957988566555953e-06, "loss": 0.003, "step": 7784 }, { "epoch": 3.5418562329390353, "grad_norm": 0.25811787146593734, "learning_rate": 1.9568543603885136e-06, "loss": 0.0026, "step": 7785 }, { "epoch": 3.5423111919927206, "grad_norm": 0.3676202785795054, "learning_rate": 1.9557204028931936e-06, "loss": 0.0089, "step": 7786 }, { "epoch": 3.542766151046406, "grad_norm": 0.3947073011485478, "learning_rate": 1.9545866941626563e-06, "loss": 0.0084, "step": 7787 }, { "epoch": 3.5432211101000908, "grad_norm": 0.16825838161603163, "learning_rate": 1.9534532342895413e-06, "loss": 0.0014, "step": 7788 }, { "epoch": 3.543676069153776, "grad_norm": 0.3284350768006685, "learning_rate": 1.9523200233664695e-06, "loss": 0.0069, "step": 7789 }, { "epoch": 3.5441310282074614, "grad_norm": 0.14180787120331206, "learning_rate": 1.9511870614860407e-06, "loss": 0.0019, "step": 7790 }, { "epoch": 3.5445859872611463, "grad_norm": 0.14790928009158866, "learning_rate": 1.950054348740839e-06, "loss": 0.002, "step": 7791 }, { "epoch": 3.5450409463148316, "grad_norm": 0.06680045802963924, "learning_rate": 1.948921885223422e-06, "loss": 0.0007, "step": 7792 }, { "epoch": 3.545495905368517, "grad_norm": 0.2869613031072459, "learning_rate": 1.9477896710263285e-06, "loss": 0.0083, "step": 7793 }, { "epoch": 3.5459508644222018, "grad_norm": 0.5072236925227637, "learning_rate": 1.946657706242081e-06, "loss": 0.0098, "step": 7794 }, { "epoch": 3.546405823475887, "grad_norm": 0.22275331235240897, "learning_rate": 1.945525990963176e-06, "loss": 0.0026, "step": 7795 }, { "epoch": 3.5468607825295724, "grad_norm": 0.2513136107376008, "learning_rate": 1.944394525282094e-06, "loss": 0.0028, "step": 7796 }, { "epoch": 3.5473157415832572, "grad_norm": 0.12425764201258375, "learning_rate": 1.9432633092912924e-06, "loss": 0.0007, "step": 7797 }, { "epoch": 3.5477707006369426, "grad_norm": 0.3561145669451658, "learning_rate": 1.9421323430832097e-06, "loss": 0.0056, "step": 7798 }, { "epoch": 3.548225659690628, "grad_norm": 0.25548502182389277, "learning_rate": 1.941001626750262e-06, "loss": 0.0033, "step": 7799 }, { "epoch": 3.548680618744313, "grad_norm": 0.3356442032343971, "learning_rate": 1.939871160384846e-06, "loss": 0.0029, "step": 7800 }, { "epoch": 3.549135577797998, "grad_norm": 0.29472476245464385, "learning_rate": 1.9387409440793387e-06, "loss": 0.0061, "step": 7801 }, { "epoch": 3.5495905368516834, "grad_norm": 0.08043940012829344, "learning_rate": 1.9376109779260986e-06, "loss": 0.0005, "step": 7802 }, { "epoch": 3.5500454959053687, "grad_norm": 0.16168538080218328, "learning_rate": 1.9364812620174607e-06, "loss": 0.002, "step": 7803 }, { "epoch": 3.5505004549590535, "grad_norm": 0.30321264558747174, "learning_rate": 1.9353517964457386e-06, "loss": 0.0026, "step": 7804 }, { "epoch": 3.550955414012739, "grad_norm": 0.16581168105425526, "learning_rate": 1.934222581303226e-06, "loss": 0.0018, "step": 7805 }, { "epoch": 3.551410373066424, "grad_norm": 0.20053437534470278, "learning_rate": 1.933093616682201e-06, "loss": 0.0034, "step": 7806 }, { "epoch": 3.5518653321201095, "grad_norm": 0.16316560799208163, "learning_rate": 1.9319649026749144e-06, "loss": 0.002, "step": 7807 }, { "epoch": 3.5523202911737943, "grad_norm": 0.12272105120085568, "learning_rate": 1.9308364393736025e-06, "loss": 0.0009, "step": 7808 }, { "epoch": 3.5527752502274796, "grad_norm": 0.04301336602681302, "learning_rate": 1.9297082268704758e-06, "loss": 0.0004, "step": 7809 }, { "epoch": 3.553230209281165, "grad_norm": 0.04921376032330394, "learning_rate": 1.9285802652577262e-06, "loss": 0.0005, "step": 7810 }, { "epoch": 3.55368516833485, "grad_norm": 0.4000595697202519, "learning_rate": 1.9274525546275284e-06, "loss": 0.0022, "step": 7811 }, { "epoch": 3.554140127388535, "grad_norm": 0.19376601167534413, "learning_rate": 1.926325095072033e-06, "loss": 0.0012, "step": 7812 }, { "epoch": 3.5545950864422204, "grad_norm": 0.17456277815504023, "learning_rate": 1.9251978866833696e-06, "loss": 0.0015, "step": 7813 }, { "epoch": 3.5550500454959053, "grad_norm": 0.271046223458997, "learning_rate": 1.924070929553648e-06, "loss": 0.0032, "step": 7814 }, { "epoch": 3.5555050045495906, "grad_norm": 0.27826881314759416, "learning_rate": 1.922944223774959e-06, "loss": 0.0039, "step": 7815 }, { "epoch": 3.555959963603276, "grad_norm": 0.066469035444489, "learning_rate": 1.9218177694393737e-06, "loss": 0.0007, "step": 7816 }, { "epoch": 3.556414922656961, "grad_norm": 0.28801098648844486, "learning_rate": 1.9206915666389396e-06, "loss": 0.0031, "step": 7817 }, { "epoch": 3.556869881710646, "grad_norm": 0.2410917775885526, "learning_rate": 1.9195656154656844e-06, "loss": 0.007, "step": 7818 }, { "epoch": 3.5573248407643314, "grad_norm": 0.3350517672704401, "learning_rate": 1.9184399160116146e-06, "loss": 0.0043, "step": 7819 }, { "epoch": 3.5577797998180163, "grad_norm": 0.07255469096683553, "learning_rate": 1.91731446836872e-06, "loss": 0.0006, "step": 7820 }, { "epoch": 3.5582347588717016, "grad_norm": 0.2816743657966873, "learning_rate": 1.9161892726289643e-06, "loss": 0.0031, "step": 7821 }, { "epoch": 3.558689717925387, "grad_norm": 0.23167362359732568, "learning_rate": 1.9150643288842963e-06, "loss": 0.0038, "step": 7822 }, { "epoch": 3.5591446769790718, "grad_norm": 0.15723888410495052, "learning_rate": 1.9139396372266407e-06, "loss": 0.002, "step": 7823 }, { "epoch": 3.559599636032757, "grad_norm": 8.291661300439266, "learning_rate": 1.912815197747899e-06, "loss": 0.0929, "step": 7824 }, { "epoch": 3.5600545950864424, "grad_norm": 0.556252010177556, "learning_rate": 1.9116910105399594e-06, "loss": 0.0119, "step": 7825 }, { "epoch": 3.5605095541401273, "grad_norm": 0.10335399382757667, "learning_rate": 1.9105670756946836e-06, "loss": 0.0017, "step": 7826 }, { "epoch": 3.5609645131938126, "grad_norm": 0.3029963896566323, "learning_rate": 1.909443393303915e-06, "loss": 0.0062, "step": 7827 }, { "epoch": 3.561419472247498, "grad_norm": 0.24255008071947937, "learning_rate": 1.908319963459474e-06, "loss": 0.0037, "step": 7828 }, { "epoch": 3.5618744313011828, "grad_norm": 0.31374557388326757, "learning_rate": 1.9071967862531632e-06, "loss": 0.0027, "step": 7829 }, { "epoch": 3.562329390354868, "grad_norm": 0.2839158633267819, "learning_rate": 1.906073861776766e-06, "loss": 0.0084, "step": 7830 }, { "epoch": 3.5627843494085534, "grad_norm": 0.4561988777530323, "learning_rate": 1.9049511901220409e-06, "loss": 0.0099, "step": 7831 }, { "epoch": 3.5632393084622382, "grad_norm": 0.19015944658395145, "learning_rate": 1.9038287713807269e-06, "loss": 0.0048, "step": 7832 }, { "epoch": 3.5636942675159236, "grad_norm": 0.2685832904432481, "learning_rate": 1.9027066056445437e-06, "loss": 0.0031, "step": 7833 }, { "epoch": 3.564149226569609, "grad_norm": 0.4365573333097983, "learning_rate": 1.9015846930051879e-06, "loss": 0.0092, "step": 7834 }, { "epoch": 3.5646041856232937, "grad_norm": 0.2956580581449613, "learning_rate": 1.900463033554339e-06, "loss": 0.0073, "step": 7835 }, { "epoch": 3.565059144676979, "grad_norm": 0.1827958823749623, "learning_rate": 1.8993416273836546e-06, "loss": 0.0013, "step": 7836 }, { "epoch": 3.5655141037306644, "grad_norm": 0.2399800437682335, "learning_rate": 1.8982204745847704e-06, "loss": 0.0029, "step": 7837 }, { "epoch": 3.565969062784349, "grad_norm": 0.17654268778036145, "learning_rate": 1.8970995752493016e-06, "loss": 0.0022, "step": 7838 }, { "epoch": 3.5664240218380345, "grad_norm": 0.12701314508844222, "learning_rate": 1.8959789294688408e-06, "loss": 0.0019, "step": 7839 }, { "epoch": 3.56687898089172, "grad_norm": 0.03969785212810466, "learning_rate": 1.8948585373349665e-06, "loss": 0.0003, "step": 7840 }, { "epoch": 3.5673339399454047, "grad_norm": 0.05554512170704364, "learning_rate": 1.8937383989392294e-06, "loss": 0.0005, "step": 7841 }, { "epoch": 3.56778889899909, "grad_norm": 0.19775444531922698, "learning_rate": 1.8926185143731607e-06, "loss": 0.0023, "step": 7842 }, { "epoch": 3.5682438580527753, "grad_norm": 0.1915942271992767, "learning_rate": 1.8914988837282767e-06, "loss": 0.0025, "step": 7843 }, { "epoch": 3.56869881710646, "grad_norm": 0.1826400534228145, "learning_rate": 1.8903795070960635e-06, "loss": 0.004, "step": 7844 }, { "epoch": 3.5691537761601455, "grad_norm": 0.1141328912592994, "learning_rate": 1.8892603845679963e-06, "loss": 0.0014, "step": 7845 }, { "epoch": 3.569608735213831, "grad_norm": 0.07758567332736764, "learning_rate": 1.8881415162355222e-06, "loss": 0.001, "step": 7846 }, { "epoch": 3.5700636942675157, "grad_norm": 0.17736716274854417, "learning_rate": 1.8870229021900706e-06, "loss": 0.0028, "step": 7847 }, { "epoch": 3.570518653321201, "grad_norm": 0.24901034747236278, "learning_rate": 1.8859045425230477e-06, "loss": 0.0072, "step": 7848 }, { "epoch": 3.5709736123748863, "grad_norm": 0.15898630125207755, "learning_rate": 1.8847864373258417e-06, "loss": 0.0026, "step": 7849 }, { "epoch": 3.571428571428571, "grad_norm": 0.360024037313287, "learning_rate": 1.8836685866898224e-06, "loss": 0.0042, "step": 7850 }, { "epoch": 3.5718835304822565, "grad_norm": 0.18931597313542467, "learning_rate": 1.8825509907063328e-06, "loss": 0.0024, "step": 7851 }, { "epoch": 3.572338489535942, "grad_norm": 0.3368471376035797, "learning_rate": 1.8814336494666979e-06, "loss": 0.0034, "step": 7852 }, { "epoch": 3.5727934485896267, "grad_norm": 0.1877896723592328, "learning_rate": 1.88031656306222e-06, "loss": 0.0021, "step": 7853 }, { "epoch": 3.573248407643312, "grad_norm": 0.30689448214652615, "learning_rate": 1.8791997315841865e-06, "loss": 0.003, "step": 7854 }, { "epoch": 3.5737033666969973, "grad_norm": 0.37648531464817464, "learning_rate": 1.8780831551238566e-06, "loss": 0.0054, "step": 7855 }, { "epoch": 3.5741583257506826, "grad_norm": 0.14790533285685548, "learning_rate": 1.8769668337724717e-06, "loss": 0.0019, "step": 7856 }, { "epoch": 3.5746132848043675, "grad_norm": 0.3290089136734179, "learning_rate": 1.875850767621255e-06, "loss": 0.0081, "step": 7857 }, { "epoch": 3.5750682438580528, "grad_norm": 0.2438159577849764, "learning_rate": 1.8747349567614036e-06, "loss": 0.0026, "step": 7858 }, { "epoch": 3.575523202911738, "grad_norm": 0.44908224923679907, "learning_rate": 1.8736194012840996e-06, "loss": 0.0064, "step": 7859 }, { "epoch": 3.575978161965423, "grad_norm": 0.2612222870249877, "learning_rate": 1.8725041012804994e-06, "loss": 0.0038, "step": 7860 }, { "epoch": 3.5764331210191083, "grad_norm": 0.2538727072882192, "learning_rate": 1.8713890568417408e-06, "loss": 0.0047, "step": 7861 }, { "epoch": 3.5768880800727936, "grad_norm": 0.37328615462974646, "learning_rate": 1.870274268058938e-06, "loss": 0.0044, "step": 7862 }, { "epoch": 3.577343039126479, "grad_norm": 0.23852059542601314, "learning_rate": 1.8691597350231877e-06, "loss": 0.0036, "step": 7863 }, { "epoch": 3.5777979981801638, "grad_norm": 0.1648965035834699, "learning_rate": 1.8680454578255674e-06, "loss": 0.002, "step": 7864 }, { "epoch": 3.578252957233849, "grad_norm": 0.5781760248014696, "learning_rate": 1.8669314365571285e-06, "loss": 0.0057, "step": 7865 }, { "epoch": 3.5787079162875344, "grad_norm": 0.15098789223124823, "learning_rate": 1.8658176713089038e-06, "loss": 0.0033, "step": 7866 }, { "epoch": 3.5791628753412192, "grad_norm": 0.2569999285657466, "learning_rate": 1.8647041621719047e-06, "loss": 0.0062, "step": 7867 }, { "epoch": 3.5796178343949046, "grad_norm": 0.22836082376197347, "learning_rate": 1.8635909092371214e-06, "loss": 0.0018, "step": 7868 }, { "epoch": 3.58007279344859, "grad_norm": 0.2125034388435507, "learning_rate": 1.862477912595526e-06, "loss": 0.0029, "step": 7869 }, { "epoch": 3.5805277525022747, "grad_norm": 0.14826069261954608, "learning_rate": 1.861365172338066e-06, "loss": 0.0008, "step": 7870 }, { "epoch": 3.58098271155596, "grad_norm": 0.25791990914333773, "learning_rate": 1.8602526885556715e-06, "loss": 0.0035, "step": 7871 }, { "epoch": 3.5814376706096454, "grad_norm": 0.24349151276618372, "learning_rate": 1.8591404613392477e-06, "loss": 0.0053, "step": 7872 }, { "epoch": 3.58189262966333, "grad_norm": 0.2541523185075842, "learning_rate": 1.8580284907796803e-06, "loss": 0.0038, "step": 7873 }, { "epoch": 3.5823475887170155, "grad_norm": 0.13711911289964399, "learning_rate": 1.8569167769678375e-06, "loss": 0.0023, "step": 7874 }, { "epoch": 3.582802547770701, "grad_norm": 0.4239392401036632, "learning_rate": 1.8558053199945613e-06, "loss": 0.0128, "step": 7875 }, { "epoch": 3.5832575068243857, "grad_norm": 0.21898069795300995, "learning_rate": 1.8546941199506752e-06, "loss": 0.0043, "step": 7876 }, { "epoch": 3.583712465878071, "grad_norm": 0.35824719370429525, "learning_rate": 1.8535831769269802e-06, "loss": 0.0098, "step": 7877 }, { "epoch": 3.5841674249317563, "grad_norm": 0.19549547883919538, "learning_rate": 1.8524724910142588e-06, "loss": 0.0015, "step": 7878 }, { "epoch": 3.584622383985441, "grad_norm": 0.34835104718669907, "learning_rate": 1.851362062303273e-06, "loss": 0.007, "step": 7879 }, { "epoch": 3.5850773430391265, "grad_norm": 0.04346302220808581, "learning_rate": 1.85025189088476e-06, "loss": 0.0005, "step": 7880 }, { "epoch": 3.585532302092812, "grad_norm": 0.31511780552489604, "learning_rate": 1.849141976849439e-06, "loss": 0.0045, "step": 7881 }, { "epoch": 3.5859872611464967, "grad_norm": 0.2975742152769068, "learning_rate": 1.848032320288004e-06, "loss": 0.0037, "step": 7882 }, { "epoch": 3.586442220200182, "grad_norm": 0.09078263429941769, "learning_rate": 1.8469229212911361e-06, "loss": 0.0009, "step": 7883 }, { "epoch": 3.5868971792538673, "grad_norm": 0.11573700181135811, "learning_rate": 1.8458137799494857e-06, "loss": 0.0006, "step": 7884 }, { "epoch": 3.587352138307552, "grad_norm": 0.14498108831423218, "learning_rate": 1.8447048963536908e-06, "loss": 0.0011, "step": 7885 }, { "epoch": 3.5878070973612375, "grad_norm": 0.0983077504321627, "learning_rate": 1.8435962705943628e-06, "loss": 0.0012, "step": 7886 }, { "epoch": 3.588262056414923, "grad_norm": 0.3040958269451847, "learning_rate": 1.8424879027620913e-06, "loss": 0.0035, "step": 7887 }, { "epoch": 3.5887170154686077, "grad_norm": 0.33771776352356425, "learning_rate": 1.8413797929474515e-06, "loss": 0.0058, "step": 7888 }, { "epoch": 3.589171974522293, "grad_norm": 0.19632673041120569, "learning_rate": 1.8402719412409904e-06, "loss": 0.0041, "step": 7889 }, { "epoch": 3.5896269335759783, "grad_norm": 0.22556125154060808, "learning_rate": 1.8391643477332367e-06, "loss": 0.0052, "step": 7890 }, { "epoch": 3.590081892629663, "grad_norm": 0.16926219057323924, "learning_rate": 1.8380570125146968e-06, "loss": 0.0024, "step": 7891 }, { "epoch": 3.5905368516833485, "grad_norm": 0.2824537823006504, "learning_rate": 1.8369499356758592e-06, "loss": 0.0023, "step": 7892 }, { "epoch": 3.5909918107370338, "grad_norm": 0.2907508835953086, "learning_rate": 1.83584311730719e-06, "loss": 0.0039, "step": 7893 }, { "epoch": 3.5914467697907186, "grad_norm": 0.39093362476591115, "learning_rate": 1.8347365574991317e-06, "loss": 0.0078, "step": 7894 }, { "epoch": 3.591901728844404, "grad_norm": 0.21703840186106568, "learning_rate": 1.8336302563421083e-06, "loss": 0.0022, "step": 7895 }, { "epoch": 3.5923566878980893, "grad_norm": 0.30147901577619696, "learning_rate": 1.8325242139265192e-06, "loss": 0.0021, "step": 7896 }, { "epoch": 3.592811646951774, "grad_norm": 0.3958194604680672, "learning_rate": 1.8314184303427484e-06, "loss": 0.0024, "step": 7897 }, { "epoch": 3.5932666060054594, "grad_norm": 0.11906332456432153, "learning_rate": 1.830312905681153e-06, "loss": 0.001, "step": 7898 }, { "epoch": 3.5937215650591448, "grad_norm": 0.24610698279584053, "learning_rate": 1.8292076400320746e-06, "loss": 0.0028, "step": 7899 }, { "epoch": 3.5941765241128296, "grad_norm": 0.21960152706123276, "learning_rate": 1.8281026334858287e-06, "loss": 0.0058, "step": 7900 }, { "epoch": 3.594631483166515, "grad_norm": 0.1790492789710415, "learning_rate": 1.8269978861327097e-06, "loss": 0.0017, "step": 7901 }, { "epoch": 3.5950864422202002, "grad_norm": 0.32270491510546734, "learning_rate": 1.8258933980629957e-06, "loss": 0.0045, "step": 7902 }, { "epoch": 3.595541401273885, "grad_norm": 0.22759880789456016, "learning_rate": 1.8247891693669394e-06, "loss": 0.0013, "step": 7903 }, { "epoch": 3.5959963603275704, "grad_norm": 0.5020246925343776, "learning_rate": 1.8236852001347728e-06, "loss": 0.009, "step": 7904 }, { "epoch": 3.5964513193812557, "grad_norm": 0.08959192426591882, "learning_rate": 1.8225814904567057e-06, "loss": 0.0018, "step": 7905 }, { "epoch": 3.5969062784349406, "grad_norm": 0.1358588758268522, "learning_rate": 1.821478040422932e-06, "loss": 0.0018, "step": 7906 }, { "epoch": 3.597361237488626, "grad_norm": 0.13531912890129033, "learning_rate": 1.8203748501236173e-06, "loss": 0.0019, "step": 7907 }, { "epoch": 3.597816196542311, "grad_norm": 0.19270377915409623, "learning_rate": 1.8192719196489123e-06, "loss": 0.0021, "step": 7908 }, { "epoch": 3.598271155595996, "grad_norm": 0.07305642433280565, "learning_rate": 1.8181692490889418e-06, "loss": 0.0006, "step": 7909 }, { "epoch": 3.5987261146496814, "grad_norm": 0.6475939154284186, "learning_rate": 1.8170668385338113e-06, "loss": 0.0101, "step": 7910 }, { "epoch": 3.5991810737033667, "grad_norm": 0.2817718685548278, "learning_rate": 1.8159646880736036e-06, "loss": 0.002, "step": 7911 }, { "epoch": 3.599636032757052, "grad_norm": 0.26163244962577487, "learning_rate": 1.8148627977983817e-06, "loss": 0.003, "step": 7912 }, { "epoch": 3.600090991810737, "grad_norm": 0.1663032670859049, "learning_rate": 1.8137611677981904e-06, "loss": 0.002, "step": 7913 }, { "epoch": 3.600545950864422, "grad_norm": 0.2697742508820168, "learning_rate": 1.8126597981630474e-06, "loss": 0.0027, "step": 7914 }, { "epoch": 3.6010009099181075, "grad_norm": 0.228454837473255, "learning_rate": 1.8115586889829517e-06, "loss": 0.0025, "step": 7915 }, { "epoch": 3.6014558689717924, "grad_norm": 0.22736868081564568, "learning_rate": 1.8104578403478794e-06, "loss": 0.0027, "step": 7916 }, { "epoch": 3.6019108280254777, "grad_norm": 0.2126793718386172, "learning_rate": 1.8093572523477904e-06, "loss": 0.0018, "step": 7917 }, { "epoch": 3.602365787079163, "grad_norm": 0.26137924219270103, "learning_rate": 1.8082569250726179e-06, "loss": 0.0051, "step": 7918 }, { "epoch": 3.6028207461328483, "grad_norm": 0.1931062911369165, "learning_rate": 1.8071568586122733e-06, "loss": 0.0051, "step": 7919 }, { "epoch": 3.603275705186533, "grad_norm": 0.1332182802182727, "learning_rate": 1.806057053056654e-06, "loss": 0.0018, "step": 7920 }, { "epoch": 3.6037306642402185, "grad_norm": 0.21635079571295868, "learning_rate": 1.8049575084956266e-06, "loss": 0.0035, "step": 7921 }, { "epoch": 3.604185623293904, "grad_norm": 0.3301968051704073, "learning_rate": 1.8038582250190445e-06, "loss": 0.0053, "step": 7922 }, { "epoch": 3.6046405823475887, "grad_norm": 0.37722946083861625, "learning_rate": 1.802759202716735e-06, "loss": 0.0031, "step": 7923 }, { "epoch": 3.605095541401274, "grad_norm": 0.1582487674967987, "learning_rate": 1.8016604416785043e-06, "loss": 0.0026, "step": 7924 }, { "epoch": 3.6055505004549593, "grad_norm": 0.26033746966944415, "learning_rate": 1.8005619419941372e-06, "loss": 0.0049, "step": 7925 }, { "epoch": 3.606005459508644, "grad_norm": 0.6888754296815256, "learning_rate": 1.7994637037534003e-06, "loss": 0.0048, "step": 7926 }, { "epoch": 3.6064604185623295, "grad_norm": 0.17130425264638557, "learning_rate": 1.798365727046037e-06, "loss": 0.0015, "step": 7927 }, { "epoch": 3.6069153776160148, "grad_norm": 0.18849109292648805, "learning_rate": 1.797268011961768e-06, "loss": 0.0033, "step": 7928 }, { "epoch": 3.6073703366696996, "grad_norm": 0.22730640992688486, "learning_rate": 1.7961705585902945e-06, "loss": 0.0025, "step": 7929 }, { "epoch": 3.607825295723385, "grad_norm": 0.4602083535252957, "learning_rate": 1.7950733670212921e-06, "loss": 0.0104, "step": 7930 }, { "epoch": 3.6082802547770703, "grad_norm": 0.5105317258007144, "learning_rate": 1.7939764373444223e-06, "loss": 0.0069, "step": 7931 }, { "epoch": 3.608735213830755, "grad_norm": 0.24042606244188347, "learning_rate": 1.7928797696493204e-06, "loss": 0.0032, "step": 7932 }, { "epoch": 3.6091901728844404, "grad_norm": 0.37057816240218805, "learning_rate": 1.7917833640255988e-06, "loss": 0.0045, "step": 7933 }, { "epoch": 3.6096451319381258, "grad_norm": 0.18373547765656353, "learning_rate": 1.790687220562854e-06, "loss": 0.0019, "step": 7934 }, { "epoch": 3.6101000909918106, "grad_norm": 0.16822871767184222, "learning_rate": 1.7895913393506547e-06, "loss": 0.002, "step": 7935 }, { "epoch": 3.610555050045496, "grad_norm": 0.9120036062826911, "learning_rate": 1.7884957204785546e-06, "loss": 0.0301, "step": 7936 }, { "epoch": 3.6110100090991812, "grad_norm": 0.18320868208439767, "learning_rate": 1.7874003640360816e-06, "loss": 0.0025, "step": 7937 }, { "epoch": 3.611464968152866, "grad_norm": 0.38050779048434175, "learning_rate": 1.7863052701127426e-06, "loss": 0.003, "step": 7938 }, { "epoch": 3.6119199272065514, "grad_norm": 0.3179570980024949, "learning_rate": 1.785210438798024e-06, "loss": 0.0051, "step": 7939 }, { "epoch": 3.6123748862602367, "grad_norm": 0.2466072577767851, "learning_rate": 1.7841158701813872e-06, "loss": 0.0039, "step": 7940 }, { "epoch": 3.6128298453139216, "grad_norm": 0.2947505388506555, "learning_rate": 1.7830215643522818e-06, "loss": 0.0045, "step": 7941 }, { "epoch": 3.613284804367607, "grad_norm": 0.1261551585566211, "learning_rate": 1.7819275214001263e-06, "loss": 0.0014, "step": 7942 }, { "epoch": 3.613739763421292, "grad_norm": 0.376726998153657, "learning_rate": 1.7808337414143218e-06, "loss": 0.0082, "step": 7943 }, { "epoch": 3.614194722474977, "grad_norm": 0.19294331327608216, "learning_rate": 1.779740224484246e-06, "loss": 0.0028, "step": 7944 }, { "epoch": 3.6146496815286624, "grad_norm": 0.07491206481414248, "learning_rate": 1.7786469706992542e-06, "loss": 0.0007, "step": 7945 }, { "epoch": 3.6151046405823477, "grad_norm": 0.3390974285656792, "learning_rate": 1.7775539801486868e-06, "loss": 0.0043, "step": 7946 }, { "epoch": 3.6155595996360326, "grad_norm": 0.28674394998514324, "learning_rate": 1.7764612529218538e-06, "loss": 0.0028, "step": 7947 }, { "epoch": 3.616014558689718, "grad_norm": 0.19709725492535993, "learning_rate": 1.7753687891080517e-06, "loss": 0.0017, "step": 7948 }, { "epoch": 3.616469517743403, "grad_norm": 0.11491431563050752, "learning_rate": 1.77427658879655e-06, "loss": 0.0008, "step": 7949 }, { "epoch": 3.616924476797088, "grad_norm": 0.32651563866476874, "learning_rate": 1.7731846520765962e-06, "loss": 0.0074, "step": 7950 }, { "epoch": 3.6173794358507734, "grad_norm": 0.3122939474441103, "learning_rate": 1.7720929790374225e-06, "loss": 0.0038, "step": 7951 }, { "epoch": 3.6178343949044587, "grad_norm": 0.37129059422111754, "learning_rate": 1.7710015697682332e-06, "loss": 0.0049, "step": 7952 }, { "epoch": 3.6182893539581436, "grad_norm": 0.2332594988565104, "learning_rate": 1.7699104243582133e-06, "loss": 0.004, "step": 7953 }, { "epoch": 3.618744313011829, "grad_norm": 0.09925808851744253, "learning_rate": 1.768819542896525e-06, "loss": 0.0009, "step": 7954 }, { "epoch": 3.619199272065514, "grad_norm": 0.31852423028987725, "learning_rate": 1.7677289254723124e-06, "loss": 0.0084, "step": 7955 }, { "epoch": 3.619654231119199, "grad_norm": 0.2439366003167992, "learning_rate": 1.766638572174696e-06, "loss": 0.0031, "step": 7956 }, { "epoch": 3.6201091901728844, "grad_norm": 0.2634222070561646, "learning_rate": 1.7655484830927743e-06, "loss": 0.003, "step": 7957 }, { "epoch": 3.6205641492265697, "grad_norm": 0.14678145811124868, "learning_rate": 1.7644586583156237e-06, "loss": 0.0017, "step": 7958 }, { "epoch": 3.6210191082802545, "grad_norm": 0.07528320992932991, "learning_rate": 1.7633690979322986e-06, "loss": 0.0008, "step": 7959 }, { "epoch": 3.62147406733394, "grad_norm": 0.19911298795477617, "learning_rate": 1.7622798020318354e-06, "loss": 0.0029, "step": 7960 }, { "epoch": 3.621929026387625, "grad_norm": 0.17775238000837332, "learning_rate": 1.7611907707032444e-06, "loss": 0.0015, "step": 7961 }, { "epoch": 3.62238398544131, "grad_norm": 0.10451954365692988, "learning_rate": 1.7601020040355182e-06, "loss": 0.001, "step": 7962 }, { "epoch": 3.6228389444949953, "grad_norm": 0.3015366289508726, "learning_rate": 1.7590135021176258e-06, "loss": 0.0021, "step": 7963 }, { "epoch": 3.6232939035486806, "grad_norm": 0.5457679843974467, "learning_rate": 1.7579252650385114e-06, "loss": 0.0042, "step": 7964 }, { "epoch": 3.623748862602366, "grad_norm": 0.10670382197885002, "learning_rate": 1.7568372928871053e-06, "loss": 0.0007, "step": 7965 }, { "epoch": 3.624203821656051, "grad_norm": 0.25542607912323717, "learning_rate": 1.7557495857523098e-06, "loss": 0.0021, "step": 7966 }, { "epoch": 3.624658780709736, "grad_norm": 0.19018887122334371, "learning_rate": 1.7546621437230071e-06, "loss": 0.0008, "step": 7967 }, { "epoch": 3.6251137397634214, "grad_norm": 0.3269645623973134, "learning_rate": 1.7535749668880563e-06, "loss": 0.0039, "step": 7968 }, { "epoch": 3.6255686988171063, "grad_norm": 0.1778802123521978, "learning_rate": 1.752488055336299e-06, "loss": 0.002, "step": 7969 }, { "epoch": 3.6260236578707916, "grad_norm": 0.1470751908641553, "learning_rate": 1.7514014091565535e-06, "loss": 0.0012, "step": 7970 }, { "epoch": 3.626478616924477, "grad_norm": 0.36141962596043486, "learning_rate": 1.7503150284376142e-06, "loss": 0.0088, "step": 7971 }, { "epoch": 3.6269335759781622, "grad_norm": 0.2724935962348301, "learning_rate": 1.7492289132682554e-06, "loss": 0.0027, "step": 7972 }, { "epoch": 3.627388535031847, "grad_norm": 0.29107249038998095, "learning_rate": 1.7481430637372298e-06, "loss": 0.0041, "step": 7973 }, { "epoch": 3.6278434940855324, "grad_norm": 0.13134200796209422, "learning_rate": 1.7470574799332658e-06, "loss": 0.0008, "step": 7974 }, { "epoch": 3.6282984531392177, "grad_norm": 0.4090567447742412, "learning_rate": 1.7459721619450743e-06, "loss": 0.0064, "step": 7975 }, { "epoch": 3.6287534121929026, "grad_norm": 0.20124356356315617, "learning_rate": 1.7448871098613446e-06, "loss": 0.0016, "step": 7976 }, { "epoch": 3.629208371246588, "grad_norm": 0.15918472248295626, "learning_rate": 1.7438023237707403e-06, "loss": 0.0022, "step": 7977 }, { "epoch": 3.629663330300273, "grad_norm": 0.20179590639826822, "learning_rate": 1.7427178037619046e-06, "loss": 0.0019, "step": 7978 }, { "epoch": 3.630118289353958, "grad_norm": 0.11466157783046807, "learning_rate": 1.7416335499234593e-06, "loss": 0.0008, "step": 7979 }, { "epoch": 3.6305732484076434, "grad_norm": 0.3021783554825343, "learning_rate": 1.740549562344007e-06, "loss": 0.0053, "step": 7980 }, { "epoch": 3.6310282074613287, "grad_norm": 0.31465809219243523, "learning_rate": 1.739465841112125e-06, "loss": 0.003, "step": 7981 }, { "epoch": 3.6314831665150136, "grad_norm": 0.17248181798206758, "learning_rate": 1.7383823863163685e-06, "loss": 0.002, "step": 7982 }, { "epoch": 3.631938125568699, "grad_norm": 0.22433963000160198, "learning_rate": 1.7372991980452753e-06, "loss": 0.0019, "step": 7983 }, { "epoch": 3.632393084622384, "grad_norm": 0.23492332221439477, "learning_rate": 1.7362162763873557e-06, "loss": 0.0066, "step": 7984 }, { "epoch": 3.632848043676069, "grad_norm": 0.07711611078754155, "learning_rate": 1.7351336214311055e-06, "loss": 0.0007, "step": 7985 }, { "epoch": 3.6333030027297544, "grad_norm": 0.36145023436743134, "learning_rate": 1.7340512332649905e-06, "loss": 0.0049, "step": 7986 }, { "epoch": 3.6337579617834397, "grad_norm": 0.16489699725141843, "learning_rate": 1.7329691119774606e-06, "loss": 0.0035, "step": 7987 }, { "epoch": 3.6342129208371245, "grad_norm": 0.07212114779748191, "learning_rate": 1.7318872576569396e-06, "loss": 0.0007, "step": 7988 }, { "epoch": 3.63466787989081, "grad_norm": 0.15231096026220012, "learning_rate": 1.7308056703918324e-06, "loss": 0.001, "step": 7989 }, { "epoch": 3.635122838944495, "grad_norm": 0.2462416491193636, "learning_rate": 1.7297243502705247e-06, "loss": 0.0038, "step": 7990 }, { "epoch": 3.63557779799818, "grad_norm": 0.18542740966174365, "learning_rate": 1.7286432973813744e-06, "loss": 0.0016, "step": 7991 }, { "epoch": 3.6360327570518653, "grad_norm": 0.3046324467845501, "learning_rate": 1.7275625118127203e-06, "loss": 0.0042, "step": 7992 }, { "epoch": 3.6364877161055507, "grad_norm": 0.15460995311172396, "learning_rate": 1.726481993652878e-06, "loss": 0.0021, "step": 7993 }, { "epoch": 3.6369426751592355, "grad_norm": 0.28842121453416686, "learning_rate": 1.725401742990146e-06, "loss": 0.0045, "step": 7994 }, { "epoch": 3.637397634212921, "grad_norm": 0.20665613874312755, "learning_rate": 1.7243217599127953e-06, "loss": 0.003, "step": 7995 }, { "epoch": 3.637852593266606, "grad_norm": 0.20818202801446936, "learning_rate": 1.7232420445090765e-06, "loss": 0.0028, "step": 7996 }, { "epoch": 3.638307552320291, "grad_norm": 0.27258458591774315, "learning_rate": 1.7221625968672212e-06, "loss": 0.0056, "step": 7997 }, { "epoch": 3.6387625113739763, "grad_norm": 0.06792279719819914, "learning_rate": 1.7210834170754342e-06, "loss": 0.0008, "step": 7998 }, { "epoch": 3.6392174704276616, "grad_norm": 0.26170658844395617, "learning_rate": 1.7200045052219044e-06, "loss": 0.0031, "step": 7999 }, { "epoch": 3.6396724294813465, "grad_norm": 0.09230026412826589, "learning_rate": 1.7189258613947945e-06, "loss": 0.0009, "step": 8000 }, { "epoch": 3.640127388535032, "grad_norm": 0.3259220385786079, "learning_rate": 1.7178474856822457e-06, "loss": 0.0061, "step": 8001 }, { "epoch": 3.640582347588717, "grad_norm": 0.4320532035267409, "learning_rate": 1.716769378172377e-06, "loss": 0.0056, "step": 8002 }, { "epoch": 3.641037306642402, "grad_norm": 0.2802724673216492, "learning_rate": 1.715691538953288e-06, "loss": 0.0052, "step": 8003 }, { "epoch": 3.6414922656960873, "grad_norm": 0.38924744222299684, "learning_rate": 1.7146139681130557e-06, "loss": 0.0033, "step": 8004 }, { "epoch": 3.6419472247497726, "grad_norm": 0.5772229222338136, "learning_rate": 1.7135366657397335e-06, "loss": 0.0056, "step": 8005 }, { "epoch": 3.6424021838034575, "grad_norm": 0.19733163639925325, "learning_rate": 1.7124596319213532e-06, "loss": 0.004, "step": 8006 }, { "epoch": 3.642857142857143, "grad_norm": 0.2547387512497557, "learning_rate": 1.7113828667459242e-06, "loss": 0.0032, "step": 8007 }, { "epoch": 3.643312101910828, "grad_norm": 0.20781975644724582, "learning_rate": 1.7103063703014372e-06, "loss": 0.0018, "step": 8008 }, { "epoch": 3.643767060964513, "grad_norm": 0.18637383797821747, "learning_rate": 1.709230142675858e-06, "loss": 0.0038, "step": 8009 }, { "epoch": 3.6442220200181983, "grad_norm": 0.24572797256397994, "learning_rate": 1.7081541839571285e-06, "loss": 0.0021, "step": 8010 }, { "epoch": 3.6446769790718836, "grad_norm": 0.38239060579784134, "learning_rate": 1.7070784942331753e-06, "loss": 0.0044, "step": 8011 }, { "epoch": 3.6451319381255685, "grad_norm": 0.30692054265423613, "learning_rate": 1.7060030735918963e-06, "loss": 0.0132, "step": 8012 }, { "epoch": 3.6455868971792538, "grad_norm": 0.11880490417194524, "learning_rate": 1.7049279221211696e-06, "loss": 0.0009, "step": 8013 }, { "epoch": 3.646041856232939, "grad_norm": 0.26737440297820153, "learning_rate": 1.7038530399088538e-06, "loss": 0.0073, "step": 8014 }, { "epoch": 3.646496815286624, "grad_norm": 0.15832893330790165, "learning_rate": 1.7027784270427822e-06, "loss": 0.0012, "step": 8015 }, { "epoch": 3.6469517743403093, "grad_norm": 0.22136569941688236, "learning_rate": 1.701704083610768e-06, "loss": 0.0011, "step": 8016 }, { "epoch": 3.6474067333939946, "grad_norm": 0.15909962981524223, "learning_rate": 1.700630009700599e-06, "loss": 0.002, "step": 8017 }, { "epoch": 3.6478616924476794, "grad_norm": 0.14564063705330876, "learning_rate": 1.6995562054000459e-06, "loss": 0.002, "step": 8018 }, { "epoch": 3.6483166515013647, "grad_norm": 0.26482202710233876, "learning_rate": 1.6984826707968566e-06, "loss": 0.0035, "step": 8019 }, { "epoch": 3.64877161055505, "grad_norm": 0.0418102071312338, "learning_rate": 1.6974094059787544e-06, "loss": 0.0003, "step": 8020 }, { "epoch": 3.6492265696087354, "grad_norm": 0.2365175368480019, "learning_rate": 1.6963364110334407e-06, "loss": 0.002, "step": 8021 }, { "epoch": 3.6496815286624202, "grad_norm": 0.18684998075962683, "learning_rate": 1.6952636860485944e-06, "loss": 0.0023, "step": 8022 }, { "epoch": 3.6501364877161055, "grad_norm": 0.3184178875394961, "learning_rate": 1.694191231111878e-06, "loss": 0.0029, "step": 8023 }, { "epoch": 3.650591446769791, "grad_norm": 0.3355623492931194, "learning_rate": 1.6931190463109231e-06, "loss": 0.0105, "step": 8024 }, { "epoch": 3.6510464058234757, "grad_norm": 0.1440589001383886, "learning_rate": 1.6920471317333476e-06, "loss": 0.0013, "step": 8025 }, { "epoch": 3.651501364877161, "grad_norm": 0.2217250047560994, "learning_rate": 1.6909754874667422e-06, "loss": 0.0031, "step": 8026 }, { "epoch": 3.6519563239308463, "grad_norm": 0.07925508024230195, "learning_rate": 1.689904113598675e-06, "loss": 0.0006, "step": 8027 }, { "epoch": 3.6524112829845317, "grad_norm": 0.23261960660017306, "learning_rate": 1.6888330102166966e-06, "loss": 0.0021, "step": 8028 }, { "epoch": 3.6528662420382165, "grad_norm": 0.3905453071373662, "learning_rate": 1.687762177408332e-06, "loss": 0.0065, "step": 8029 }, { "epoch": 3.653321201091902, "grad_norm": 0.643251963619827, "learning_rate": 1.6866916152610836e-06, "loss": 0.0067, "step": 8030 }, { "epoch": 3.653776160145587, "grad_norm": 0.03428659765720983, "learning_rate": 1.6856213238624324e-06, "loss": 0.0003, "step": 8031 }, { "epoch": 3.654231119199272, "grad_norm": 0.2959269804489528, "learning_rate": 1.6845513032998389e-06, "loss": 0.0018, "step": 8032 }, { "epoch": 3.6546860782529573, "grad_norm": 0.20584201943035565, "learning_rate": 1.6834815536607424e-06, "loss": 0.0031, "step": 8033 }, { "epoch": 3.6551410373066426, "grad_norm": 0.115908312475804, "learning_rate": 1.6824120750325562e-06, "loss": 0.0008, "step": 8034 }, { "epoch": 3.6555959963603275, "grad_norm": 0.43484571627949, "learning_rate": 1.6813428675026728e-06, "loss": 0.0057, "step": 8035 }, { "epoch": 3.656050955414013, "grad_norm": 0.2567462497537789, "learning_rate": 1.6802739311584615e-06, "loss": 0.0039, "step": 8036 }, { "epoch": 3.656505914467698, "grad_norm": 0.07363581620074681, "learning_rate": 1.6792052660872749e-06, "loss": 0.0009, "step": 8037 }, { "epoch": 3.656960873521383, "grad_norm": 0.138696403855459, "learning_rate": 1.6781368723764352e-06, "loss": 0.0014, "step": 8038 }, { "epoch": 3.6574158325750683, "grad_norm": 0.24446612454313577, "learning_rate": 1.677068750113251e-06, "loss": 0.0056, "step": 8039 }, { "epoch": 3.6578707916287536, "grad_norm": 0.19522023722854703, "learning_rate": 1.6760008993850024e-06, "loss": 0.0024, "step": 8040 }, { "epoch": 3.6583257506824385, "grad_norm": 0.2936910334169346, "learning_rate": 1.6749333202789474e-06, "loss": 0.0055, "step": 8041 }, { "epoch": 3.658780709736124, "grad_norm": 0.2594041278930158, "learning_rate": 1.673866012882327e-06, "loss": 0.0039, "step": 8042 }, { "epoch": 3.659235668789809, "grad_norm": 0.256542480986476, "learning_rate": 1.6727989772823556e-06, "loss": 0.002, "step": 8043 }, { "epoch": 3.659690627843494, "grad_norm": 0.05315534347099337, "learning_rate": 1.6717322135662262e-06, "loss": 0.0007, "step": 8044 }, { "epoch": 3.6601455868971793, "grad_norm": 0.22839544401872766, "learning_rate": 1.6706657218211087e-06, "loss": 0.0018, "step": 8045 }, { "epoch": 3.6606005459508646, "grad_norm": 0.2895208201895555, "learning_rate": 1.6695995021341526e-06, "loss": 0.0058, "step": 8046 }, { "epoch": 3.6610555050045495, "grad_norm": 0.27113804214984594, "learning_rate": 1.6685335545924874e-06, "loss": 0.0017, "step": 8047 }, { "epoch": 3.6615104640582348, "grad_norm": 0.20008265866310768, "learning_rate": 1.667467879283215e-06, "loss": 0.001, "step": 8048 }, { "epoch": 3.66196542311192, "grad_norm": 0.36496285492596525, "learning_rate": 1.6664024762934183e-06, "loss": 0.0057, "step": 8049 }, { "epoch": 3.662420382165605, "grad_norm": 0.22717373419090375, "learning_rate": 1.6653373457101562e-06, "loss": 0.0019, "step": 8050 }, { "epoch": 3.6628753412192903, "grad_norm": 0.16463648813290777, "learning_rate": 1.6642724876204658e-06, "loss": 0.0008, "step": 8051 }, { "epoch": 3.6633303002729756, "grad_norm": 0.34860700898580577, "learning_rate": 1.663207902111364e-06, "loss": 0.0023, "step": 8052 }, { "epoch": 3.6637852593266604, "grad_norm": 0.1897024665327669, "learning_rate": 1.6621435892698452e-06, "loss": 0.0008, "step": 8053 }, { "epoch": 3.6642402183803457, "grad_norm": 0.2538522484070175, "learning_rate": 1.661079549182878e-06, "loss": 0.0024, "step": 8054 }, { "epoch": 3.664695177434031, "grad_norm": 0.13633945559507546, "learning_rate": 1.660015781937412e-06, "loss": 0.0009, "step": 8055 }, { "epoch": 3.665150136487716, "grad_norm": 0.05667062613502048, "learning_rate": 1.6589522876203717e-06, "loss": 0.0005, "step": 8056 }, { "epoch": 3.6656050955414012, "grad_norm": 0.9296746127418903, "learning_rate": 1.6578890663186637e-06, "loss": 0.0218, "step": 8057 }, { "epoch": 3.6660600545950865, "grad_norm": 0.16994736190385232, "learning_rate": 1.6568261181191687e-06, "loss": 0.0018, "step": 8058 }, { "epoch": 3.6665150136487714, "grad_norm": 0.20970246978452106, "learning_rate": 1.6557634431087433e-06, "loss": 0.0046, "step": 8059 }, { "epoch": 3.6669699727024567, "grad_norm": 0.3238131627995037, "learning_rate": 1.6547010413742292e-06, "loss": 0.0021, "step": 8060 }, { "epoch": 3.667424931756142, "grad_norm": 0.34887487019946234, "learning_rate": 1.653638913002437e-06, "loss": 0.0025, "step": 8061 }, { "epoch": 3.667879890809827, "grad_norm": 0.21373648284817673, "learning_rate": 1.6525770580801626e-06, "loss": 0.0023, "step": 8062 }, { "epoch": 3.668334849863512, "grad_norm": 0.26000117786148963, "learning_rate": 1.6515154766941738e-06, "loss": 0.0079, "step": 8063 }, { "epoch": 3.6687898089171975, "grad_norm": 0.2316191875370578, "learning_rate": 1.6504541689312186e-06, "loss": 0.0019, "step": 8064 }, { "epoch": 3.6692447679708824, "grad_norm": 0.2233798900373078, "learning_rate": 1.6493931348780211e-06, "loss": 0.0056, "step": 8065 }, { "epoch": 3.6696997270245677, "grad_norm": 0.3112488602656874, "learning_rate": 1.6483323746212854e-06, "loss": 0.0028, "step": 8066 }, { "epoch": 3.670154686078253, "grad_norm": 0.07397477435980715, "learning_rate": 1.6472718882476934e-06, "loss": 0.0005, "step": 8067 }, { "epoch": 3.670609645131938, "grad_norm": 0.21372708894174636, "learning_rate": 1.6462116758439018e-06, "loss": 0.0023, "step": 8068 }, { "epoch": 3.671064604185623, "grad_norm": 0.11587699128581494, "learning_rate": 1.6451517374965465e-06, "loss": 0.0005, "step": 8069 }, { "epoch": 3.6715195632393085, "grad_norm": 0.23728992814098315, "learning_rate": 1.6440920732922395e-06, "loss": 0.0027, "step": 8070 }, { "epoch": 3.6719745222929934, "grad_norm": 0.2498084519480596, "learning_rate": 1.6430326833175747e-06, "loss": 0.0007, "step": 8071 }, { "epoch": 3.6724294813466787, "grad_norm": 0.2033303305879698, "learning_rate": 1.6419735676591192e-06, "loss": 0.0018, "step": 8072 }, { "epoch": 3.672884440400364, "grad_norm": 0.09336571464239883, "learning_rate": 1.640914726403417e-06, "loss": 0.0007, "step": 8073 }, { "epoch": 3.673339399454049, "grad_norm": 0.23707085945734882, "learning_rate": 1.6398561596369955e-06, "loss": 0.0015, "step": 8074 }, { "epoch": 3.673794358507734, "grad_norm": 0.18057782993863786, "learning_rate": 1.6387978674463528e-06, "loss": 0.0025, "step": 8075 }, { "epoch": 3.6742493175614195, "grad_norm": 0.1877921614722231, "learning_rate": 1.6377398499179714e-06, "loss": 0.004, "step": 8076 }, { "epoch": 3.674704276615105, "grad_norm": 0.18128735508895163, "learning_rate": 1.6366821071383054e-06, "loss": 0.0034, "step": 8077 }, { "epoch": 3.6751592356687897, "grad_norm": 0.4882826172237317, "learning_rate": 1.6356246391937886e-06, "loss": 0.0077, "step": 8078 }, { "epoch": 3.675614194722475, "grad_norm": 0.1487306066833931, "learning_rate": 1.6345674461708316e-06, "loss": 0.0027, "step": 8079 }, { "epoch": 3.6760691537761603, "grad_norm": 0.2807606238463104, "learning_rate": 1.633510528155825e-06, "loss": 0.0033, "step": 8080 }, { "epoch": 3.676524112829845, "grad_norm": 0.6105029591761452, "learning_rate": 1.6324538852351363e-06, "loss": 0.0054, "step": 8081 }, { "epoch": 3.6769790718835305, "grad_norm": 0.31109172914956656, "learning_rate": 1.6313975174951085e-06, "loss": 0.0094, "step": 8082 }, { "epoch": 3.6774340309372158, "grad_norm": 0.07440605530093407, "learning_rate": 1.6303414250220634e-06, "loss": 0.0004, "step": 8083 }, { "epoch": 3.677888989990901, "grad_norm": 0.2508677618629711, "learning_rate": 1.6292856079022995e-06, "loss": 0.0071, "step": 8084 }, { "epoch": 3.678343949044586, "grad_norm": 0.11832609471604028, "learning_rate": 1.6282300662220918e-06, "loss": 0.0008, "step": 8085 }, { "epoch": 3.6787989080982713, "grad_norm": 0.1776783336318369, "learning_rate": 1.6271748000676984e-06, "loss": 0.0029, "step": 8086 }, { "epoch": 3.6792538671519566, "grad_norm": 0.18193949597186546, "learning_rate": 1.6261198095253471e-06, "loss": 0.0008, "step": 8087 }, { "epoch": 3.6797088262056414, "grad_norm": 0.4059062526547424, "learning_rate": 1.62506509468125e-06, "loss": 0.0034, "step": 8088 }, { "epoch": 3.6801637852593267, "grad_norm": 0.09739922675376, "learning_rate": 1.6240106556215928e-06, "loss": 0.001, "step": 8089 }, { "epoch": 3.680618744313012, "grad_norm": 0.18279291383397514, "learning_rate": 1.6229564924325368e-06, "loss": 0.0013, "step": 8090 }, { "epoch": 3.681073703366697, "grad_norm": 0.07938296220456725, "learning_rate": 1.621902605200228e-06, "loss": 0.0009, "step": 8091 }, { "epoch": 3.6815286624203822, "grad_norm": 0.4295045590791644, "learning_rate": 1.6208489940107824e-06, "loss": 0.0133, "step": 8092 }, { "epoch": 3.6819836214740675, "grad_norm": 0.13567236342070368, "learning_rate": 1.6197956589502967e-06, "loss": 0.0011, "step": 8093 }, { "epoch": 3.6824385805277524, "grad_norm": 0.237125267127452, "learning_rate": 1.6187426001048434e-06, "loss": 0.0051, "step": 8094 }, { "epoch": 3.6828935395814377, "grad_norm": 0.0921147379700198, "learning_rate": 1.6176898175604756e-06, "loss": 0.001, "step": 8095 }, { "epoch": 3.683348498635123, "grad_norm": 0.45689369392245593, "learning_rate": 1.616637311403223e-06, "loss": 0.004, "step": 8096 }, { "epoch": 3.683803457688808, "grad_norm": 0.0788016536765662, "learning_rate": 1.61558508171909e-06, "loss": 0.0003, "step": 8097 }, { "epoch": 3.684258416742493, "grad_norm": 0.3398853455146879, "learning_rate": 1.6145331285940603e-06, "loss": 0.0041, "step": 8098 }, { "epoch": 3.6847133757961785, "grad_norm": 0.2181534229852083, "learning_rate": 1.613481452114093e-06, "loss": 0.0045, "step": 8099 }, { "epoch": 3.6851683348498634, "grad_norm": 0.15704801098337517, "learning_rate": 1.6124300523651298e-06, "loss": 0.0012, "step": 8100 }, { "epoch": 3.6856232939035487, "grad_norm": 0.4041858294364238, "learning_rate": 1.611378929433083e-06, "loss": 0.0079, "step": 8101 }, { "epoch": 3.686078252957234, "grad_norm": 0.32553459565945025, "learning_rate": 1.6103280834038488e-06, "loss": 0.0035, "step": 8102 }, { "epoch": 3.686533212010919, "grad_norm": 0.1751796301007587, "learning_rate": 1.609277514363296e-06, "loss": 0.001, "step": 8103 }, { "epoch": 3.686988171064604, "grad_norm": 0.17804523665708427, "learning_rate": 1.6082272223972705e-06, "loss": 0.0019, "step": 8104 }, { "epoch": 3.6874431301182895, "grad_norm": 0.4336705678119796, "learning_rate": 1.6071772075916015e-06, "loss": 0.0039, "step": 8105 }, { "epoch": 3.6878980891719744, "grad_norm": 0.17729697885502382, "learning_rate": 1.6061274700320884e-06, "loss": 0.002, "step": 8106 }, { "epoch": 3.6883530482256597, "grad_norm": 0.3183949662331046, "learning_rate": 1.6050780098045126e-06, "loss": 0.002, "step": 8107 }, { "epoch": 3.688808007279345, "grad_norm": 0.40763133301734894, "learning_rate": 1.6040288269946286e-06, "loss": 0.0173, "step": 8108 }, { "epoch": 3.68926296633303, "grad_norm": 0.2337827993703391, "learning_rate": 1.6029799216881726e-06, "loss": 0.003, "step": 8109 }, { "epoch": 3.689717925386715, "grad_norm": 0.17871030055580414, "learning_rate": 1.6019312939708588e-06, "loss": 0.0039, "step": 8110 }, { "epoch": 3.6901728844404005, "grad_norm": 0.09853067750951439, "learning_rate": 1.6008829439283736e-06, "loss": 0.0008, "step": 8111 }, { "epoch": 3.6906278434940853, "grad_norm": 0.14965381136569647, "learning_rate": 1.5998348716463834e-06, "loss": 0.0011, "step": 8112 }, { "epoch": 3.6910828025477707, "grad_norm": 0.08220692669256428, "learning_rate": 1.5987870772105318e-06, "loss": 0.0008, "step": 8113 }, { "epoch": 3.691537761601456, "grad_norm": 0.35664315589602713, "learning_rate": 1.5977395607064417e-06, "loss": 0.0073, "step": 8114 }, { "epoch": 3.691992720655141, "grad_norm": 0.5066667458416546, "learning_rate": 1.596692322219709e-06, "loss": 0.0103, "step": 8115 }, { "epoch": 3.692447679708826, "grad_norm": 0.24439120936355235, "learning_rate": 1.595645361835912e-06, "loss": 0.0044, "step": 8116 }, { "epoch": 3.6929026387625115, "grad_norm": 0.07884822485577705, "learning_rate": 1.5945986796406016e-06, "loss": 0.0007, "step": 8117 }, { "epoch": 3.6933575978161963, "grad_norm": 0.3541099715146426, "learning_rate": 1.593552275719309e-06, "loss": 0.0038, "step": 8118 }, { "epoch": 3.6938125568698816, "grad_norm": 0.2727755087120767, "learning_rate": 1.5925061501575395e-06, "loss": 0.0038, "step": 8119 }, { "epoch": 3.694267515923567, "grad_norm": 0.3138646035146125, "learning_rate": 1.5914603030407804e-06, "loss": 0.0037, "step": 8120 }, { "epoch": 3.694722474977252, "grad_norm": 0.36335095432959763, "learning_rate": 1.5904147344544928e-06, "loss": 0.0032, "step": 8121 }, { "epoch": 3.695177434030937, "grad_norm": 0.3252670029539399, "learning_rate": 1.589369444484114e-06, "loss": 0.0071, "step": 8122 }, { "epoch": 3.6956323930846224, "grad_norm": 0.17722976254242095, "learning_rate": 1.5883244332150633e-06, "loss": 0.0016, "step": 8123 }, { "epoch": 3.6960873521383073, "grad_norm": 0.21793092151447244, "learning_rate": 1.5872797007327317e-06, "loss": 0.0009, "step": 8124 }, { "epoch": 3.6965423111919926, "grad_norm": 0.2453560035524618, "learning_rate": 1.5862352471224924e-06, "loss": 0.0022, "step": 8125 }, { "epoch": 3.696997270245678, "grad_norm": 0.2802691468742985, "learning_rate": 1.5851910724696928e-06, "loss": 0.0031, "step": 8126 }, { "epoch": 3.697452229299363, "grad_norm": 0.2714291926028238, "learning_rate": 1.5841471768596572e-06, "loss": 0.0036, "step": 8127 }, { "epoch": 3.697907188353048, "grad_norm": 0.1686808066666587, "learning_rate": 1.5831035603776868e-06, "loss": 0.0017, "step": 8128 }, { "epoch": 3.6983621474067334, "grad_norm": 0.3231812351426906, "learning_rate": 1.5820602231090632e-06, "loss": 0.0055, "step": 8129 }, { "epoch": 3.6988171064604187, "grad_norm": 0.1333500586490056, "learning_rate": 1.5810171651390444e-06, "loss": 0.0014, "step": 8130 }, { "epoch": 3.6992720655141036, "grad_norm": 0.13082552313962106, "learning_rate": 1.5799743865528628e-06, "loss": 0.001, "step": 8131 }, { "epoch": 3.699727024567789, "grad_norm": 0.3775599061582529, "learning_rate": 1.5789318874357296e-06, "loss": 0.0036, "step": 8132 }, { "epoch": 3.700181983621474, "grad_norm": 0.30490658121115527, "learning_rate": 1.5778896678728317e-06, "loss": 0.0061, "step": 8133 }, { "epoch": 3.700636942675159, "grad_norm": 0.47510571798683454, "learning_rate": 1.576847727949337e-06, "loss": 0.0066, "step": 8134 }, { "epoch": 3.7010919017288444, "grad_norm": 0.6801230806386573, "learning_rate": 1.5758060677503879e-06, "loss": 0.0082, "step": 8135 }, { "epoch": 3.7015468607825297, "grad_norm": 0.45149534343087655, "learning_rate": 1.5747646873611016e-06, "loss": 0.0049, "step": 8136 }, { "epoch": 3.702001819836215, "grad_norm": 0.08401680854604955, "learning_rate": 1.5737235868665785e-06, "loss": 0.001, "step": 8137 }, { "epoch": 3.7024567788899, "grad_norm": 0.15202672484368168, "learning_rate": 1.5726827663518896e-06, "loss": 0.0012, "step": 8138 }, { "epoch": 3.702911737943585, "grad_norm": 0.18858998473549005, "learning_rate": 1.5716422259020887e-06, "loss": 0.0035, "step": 8139 }, { "epoch": 3.7033666969972705, "grad_norm": 0.14498326515775617, "learning_rate": 1.5706019656022026e-06, "loss": 0.0016, "step": 8140 }, { "epoch": 3.7038216560509554, "grad_norm": 0.11211350283108794, "learning_rate": 1.5695619855372368e-06, "loss": 0.0015, "step": 8141 }, { "epoch": 3.7042766151046407, "grad_norm": 0.1600336110538318, "learning_rate": 1.5685222857921723e-06, "loss": 0.0021, "step": 8142 }, { "epoch": 3.704731574158326, "grad_norm": 0.14413269398601997, "learning_rate": 1.5674828664519703e-06, "loss": 0.0014, "step": 8143 }, { "epoch": 3.705186533212011, "grad_norm": 0.02911049526708629, "learning_rate": 1.5664437276015692e-06, "loss": 0.0003, "step": 8144 }, { "epoch": 3.705641492265696, "grad_norm": 0.21041983313542045, "learning_rate": 1.5654048693258805e-06, "loss": 0.0016, "step": 8145 }, { "epoch": 3.7060964513193815, "grad_norm": 0.304157616495816, "learning_rate": 1.5643662917097956e-06, "loss": 0.0089, "step": 8146 }, { "epoch": 3.7065514103730663, "grad_norm": 0.2615352239004681, "learning_rate": 1.5633279948381802e-06, "loss": 0.0025, "step": 8147 }, { "epoch": 3.7070063694267517, "grad_norm": 0.27691913461101136, "learning_rate": 1.5622899787958833e-06, "loss": 0.0049, "step": 8148 }, { "epoch": 3.707461328480437, "grad_norm": 0.114621549822485, "learning_rate": 1.5612522436677246e-06, "loss": 0.0009, "step": 8149 }, { "epoch": 3.707916287534122, "grad_norm": 0.5218349233293614, "learning_rate": 1.5602147895385017e-06, "loss": 0.0053, "step": 8150 }, { "epoch": 3.708371246587807, "grad_norm": 0.143448234861761, "learning_rate": 1.5591776164929934e-06, "loss": 0.0019, "step": 8151 }, { "epoch": 3.7088262056414925, "grad_norm": 0.3200096242316086, "learning_rate": 1.5581407246159508e-06, "loss": 0.0034, "step": 8152 }, { "epoch": 3.7092811646951773, "grad_norm": 0.2091570438104385, "learning_rate": 1.557104113992106e-06, "loss": 0.0012, "step": 8153 }, { "epoch": 3.7097361237488626, "grad_norm": 0.6784293195389853, "learning_rate": 1.556067784706165e-06, "loss": 0.0039, "step": 8154 }, { "epoch": 3.710191082802548, "grad_norm": 0.20314360753897395, "learning_rate": 1.5550317368428125e-06, "loss": 0.0026, "step": 8155 }, { "epoch": 3.710646041856233, "grad_norm": 0.3866123087416582, "learning_rate": 1.5539959704867086e-06, "loss": 0.0041, "step": 8156 }, { "epoch": 3.711101000909918, "grad_norm": 0.22551460892807806, "learning_rate": 1.5529604857224906e-06, "loss": 0.0039, "step": 8157 }, { "epoch": 3.7115559599636034, "grad_norm": 0.27405852867762126, "learning_rate": 1.5519252826347747e-06, "loss": 0.0025, "step": 8158 }, { "epoch": 3.7120109190172883, "grad_norm": 0.3565648505068062, "learning_rate": 1.5508903613081556e-06, "loss": 0.0039, "step": 8159 }, { "epoch": 3.7124658780709736, "grad_norm": 0.1750687873245234, "learning_rate": 1.5498557218271992e-06, "loss": 0.0011, "step": 8160 }, { "epoch": 3.712920837124659, "grad_norm": 0.1352421615919305, "learning_rate": 1.5488213642764532e-06, "loss": 0.0028, "step": 8161 }, { "epoch": 3.713375796178344, "grad_norm": 0.202231039324682, "learning_rate": 1.5477872887404382e-06, "loss": 0.0013, "step": 8162 }, { "epoch": 3.713830755232029, "grad_norm": 0.24228110286430646, "learning_rate": 1.5467534953036572e-06, "loss": 0.0013, "step": 8163 }, { "epoch": 3.7142857142857144, "grad_norm": 0.22773854068191546, "learning_rate": 1.545719984050585e-06, "loss": 0.004, "step": 8164 }, { "epoch": 3.7147406733393993, "grad_norm": 0.2722710264362533, "learning_rate": 1.544686755065677e-06, "loss": 0.0043, "step": 8165 }, { "epoch": 3.7151956323930846, "grad_norm": 0.21025749108698538, "learning_rate": 1.5436538084333635e-06, "loss": 0.0018, "step": 8166 }, { "epoch": 3.71565059144677, "grad_norm": 0.3469508797420825, "learning_rate": 1.5426211442380513e-06, "loss": 0.0067, "step": 8167 }, { "epoch": 3.7161055505004548, "grad_norm": 0.2183605406128026, "learning_rate": 1.5415887625641264e-06, "loss": 0.0012, "step": 8168 }, { "epoch": 3.71656050955414, "grad_norm": 0.3629561416787955, "learning_rate": 1.5405566634959502e-06, "loss": 0.0103, "step": 8169 }, { "epoch": 3.7170154686078254, "grad_norm": 0.24427040328021304, "learning_rate": 1.5395248471178609e-06, "loss": 0.0079, "step": 8170 }, { "epoch": 3.7174704276615103, "grad_norm": 0.576094040455724, "learning_rate": 1.5384933135141716e-06, "loss": 0.0046, "step": 8171 }, { "epoch": 3.7179253867151956, "grad_norm": 0.40737176015701665, "learning_rate": 1.5374620627691772e-06, "loss": 0.006, "step": 8172 }, { "epoch": 3.718380345768881, "grad_norm": 0.6436231479428369, "learning_rate": 1.5364310949671479e-06, "loss": 0.0053, "step": 8173 }, { "epoch": 3.7188353048225657, "grad_norm": 0.20807227823414023, "learning_rate": 1.535400410192328e-06, "loss": 0.0023, "step": 8174 }, { "epoch": 3.719290263876251, "grad_norm": 0.2543467034198584, "learning_rate": 1.5343700085289404e-06, "loss": 0.007, "step": 8175 }, { "epoch": 3.7197452229299364, "grad_norm": 0.16510189582639836, "learning_rate": 1.533339890061184e-06, "loss": 0.0022, "step": 8176 }, { "epoch": 3.7202001819836212, "grad_norm": 0.33333657645841364, "learning_rate": 1.5323100548732378e-06, "loss": 0.0025, "step": 8177 }, { "epoch": 3.7206551410373065, "grad_norm": 0.11701843279350102, "learning_rate": 1.5312805030492522e-06, "loss": 0.0011, "step": 8178 }, { "epoch": 3.721110100090992, "grad_norm": 0.2816843841249177, "learning_rate": 1.530251234673361e-06, "loss": 0.0048, "step": 8179 }, { "epoch": 3.7215650591446767, "grad_norm": 0.31726450797492217, "learning_rate": 1.5292222498296699e-06, "loss": 0.0083, "step": 8180 }, { "epoch": 3.722020018198362, "grad_norm": 0.3807455373911141, "learning_rate": 1.5281935486022609e-06, "loss": 0.008, "step": 8181 }, { "epoch": 3.7224749772520473, "grad_norm": 0.3212487310627123, "learning_rate": 1.5271651310751979e-06, "loss": 0.0056, "step": 8182 }, { "epoch": 3.722929936305732, "grad_norm": 0.1564252903280299, "learning_rate": 1.5261369973325173e-06, "loss": 0.0028, "step": 8183 }, { "epoch": 3.7233848953594175, "grad_norm": 0.2053746298802703, "learning_rate": 1.5251091474582337e-06, "loss": 0.0038, "step": 8184 }, { "epoch": 3.723839854413103, "grad_norm": 0.11648992545027957, "learning_rate": 1.5240815815363363e-06, "loss": 0.0008, "step": 8185 }, { "epoch": 3.724294813466788, "grad_norm": 0.2745476945485481, "learning_rate": 1.5230542996507951e-06, "loss": 0.0054, "step": 8186 }, { "epoch": 3.724749772520473, "grad_norm": 0.3817400439444805, "learning_rate": 1.5220273018855565e-06, "loss": 0.012, "step": 8187 }, { "epoch": 3.7252047315741583, "grad_norm": 0.22620017285496435, "learning_rate": 1.52100058832454e-06, "loss": 0.0021, "step": 8188 }, { "epoch": 3.7256596906278436, "grad_norm": 0.23609135270221862, "learning_rate": 1.5199741590516449e-06, "loss": 0.004, "step": 8189 }, { "epoch": 3.7261146496815285, "grad_norm": 0.20238060883779085, "learning_rate": 1.518948014150745e-06, "loss": 0.0049, "step": 8190 }, { "epoch": 3.726569608735214, "grad_norm": 0.26415892273089064, "learning_rate": 1.517922153705692e-06, "loss": 0.0058, "step": 8191 }, { "epoch": 3.727024567788899, "grad_norm": 0.29755471660063304, "learning_rate": 1.516896577800316e-06, "loss": 0.0047, "step": 8192 }, { "epoch": 3.7274795268425844, "grad_norm": 0.20014416089509243, "learning_rate": 1.5158712865184233e-06, "loss": 0.0015, "step": 8193 }, { "epoch": 3.7279344858962693, "grad_norm": 0.17539450605563967, "learning_rate": 1.5148462799437952e-06, "loss": 0.0012, "step": 8194 }, { "epoch": 3.7283894449499546, "grad_norm": 0.26433059508538664, "learning_rate": 1.51382155816019e-06, "loss": 0.0023, "step": 8195 }, { "epoch": 3.72884440400364, "grad_norm": 0.3998943484669266, "learning_rate": 1.512797121251342e-06, "loss": 0.0092, "step": 8196 }, { "epoch": 3.729299363057325, "grad_norm": 0.18197829623673958, "learning_rate": 1.5117729693009669e-06, "loss": 0.0017, "step": 8197 }, { "epoch": 3.72975432211101, "grad_norm": 0.19680326817115126, "learning_rate": 1.5107491023927523e-06, "loss": 0.0049, "step": 8198 }, { "epoch": 3.7302092811646954, "grad_norm": 0.30362799043348515, "learning_rate": 1.5097255206103617e-06, "loss": 0.0042, "step": 8199 }, { "epoch": 3.7306642402183803, "grad_norm": 0.3683103144138177, "learning_rate": 1.5087022240374417e-06, "loss": 0.0082, "step": 8200 }, { "epoch": 3.7311191992720656, "grad_norm": 0.32816132325601566, "learning_rate": 1.5076792127576074e-06, "loss": 0.0031, "step": 8201 }, { "epoch": 3.731574158325751, "grad_norm": 0.18096225981957614, "learning_rate": 1.5066564868544587e-06, "loss": 0.0055, "step": 8202 }, { "epoch": 3.7320291173794358, "grad_norm": 0.3953806872657199, "learning_rate": 1.5056340464115653e-06, "loss": 0.0049, "step": 8203 }, { "epoch": 3.732484076433121, "grad_norm": 0.2410208030875781, "learning_rate": 1.504611891512478e-06, "loss": 0.0039, "step": 8204 }, { "epoch": 3.7329390354868064, "grad_norm": 0.3200857663453817, "learning_rate": 1.5035900222407197e-06, "loss": 0.0139, "step": 8205 }, { "epoch": 3.7333939945404913, "grad_norm": 0.12241225924537646, "learning_rate": 1.5025684386797957e-06, "loss": 0.0009, "step": 8206 }, { "epoch": 3.7338489535941766, "grad_norm": 0.23738776768358139, "learning_rate": 1.501547140913186e-06, "loss": 0.0066, "step": 8207 }, { "epoch": 3.734303912647862, "grad_norm": 0.3078124828880467, "learning_rate": 1.5005261290243445e-06, "loss": 0.0034, "step": 8208 }, { "epoch": 3.7347588717015467, "grad_norm": 0.21207605564515963, "learning_rate": 1.499505403096705e-06, "loss": 0.0034, "step": 8209 }, { "epoch": 3.735213830755232, "grad_norm": 0.1886399794420435, "learning_rate": 1.498484963213674e-06, "loss": 0.0035, "step": 8210 }, { "epoch": 3.7356687898089174, "grad_norm": 0.13404938480224354, "learning_rate": 1.4974648094586408e-06, "loss": 0.0016, "step": 8211 }, { "epoch": 3.7361237488626022, "grad_norm": 0.13570478317238555, "learning_rate": 1.4964449419149657e-06, "loss": 0.0011, "step": 8212 }, { "epoch": 3.7365787079162875, "grad_norm": 0.2640883114846296, "learning_rate": 1.4954253606659868e-06, "loss": 0.0032, "step": 8213 }, { "epoch": 3.737033666969973, "grad_norm": 0.12715737080694361, "learning_rate": 1.4944060657950227e-06, "loss": 0.0015, "step": 8214 }, { "epoch": 3.7374886260236577, "grad_norm": 0.11243982104331475, "learning_rate": 1.4933870573853616e-06, "loss": 0.0018, "step": 8215 }, { "epoch": 3.737943585077343, "grad_norm": 0.2366247478197491, "learning_rate": 1.4923683355202761e-06, "loss": 0.0025, "step": 8216 }, { "epoch": 3.7383985441310283, "grad_norm": 0.38383205174148627, "learning_rate": 1.4913499002830106e-06, "loss": 0.0043, "step": 8217 }, { "epoch": 3.738853503184713, "grad_norm": 0.24298155221940393, "learning_rate": 1.4903317517567856e-06, "loss": 0.0037, "step": 8218 }, { "epoch": 3.7393084622383985, "grad_norm": 0.1498799121177, "learning_rate": 1.4893138900247989e-06, "loss": 0.0038, "step": 8219 }, { "epoch": 3.739763421292084, "grad_norm": 0.23918706551676, "learning_rate": 1.4882963151702272e-06, "loss": 0.0033, "step": 8220 }, { "epoch": 3.7402183803457687, "grad_norm": 0.30941297576676496, "learning_rate": 1.4872790272762234e-06, "loss": 0.0045, "step": 8221 }, { "epoch": 3.740673339399454, "grad_norm": 0.02228059940016356, "learning_rate": 1.4862620264259142e-06, "loss": 0.0002, "step": 8222 }, { "epoch": 3.7411282984531393, "grad_norm": 0.09162878481786349, "learning_rate": 1.4852453127024042e-06, "loss": 0.0012, "step": 8223 }, { "epoch": 3.741583257506824, "grad_norm": 0.17340731818855434, "learning_rate": 1.4842288861887732e-06, "loss": 0.0036, "step": 8224 }, { "epoch": 3.7420382165605095, "grad_norm": 0.3006127733886893, "learning_rate": 1.4832127469680823e-06, "loss": 0.0053, "step": 8225 }, { "epoch": 3.742493175614195, "grad_norm": 0.16527196990196308, "learning_rate": 1.482196895123364e-06, "loss": 0.0011, "step": 8226 }, { "epoch": 3.7429481346678797, "grad_norm": 0.08198894408476844, "learning_rate": 1.4811813307376271e-06, "loss": 0.0008, "step": 8227 }, { "epoch": 3.743403093721565, "grad_norm": 0.22901257424191804, "learning_rate": 1.4801660538938633e-06, "loss": 0.0016, "step": 8228 }, { "epoch": 3.7438580527752503, "grad_norm": 0.21571914853572963, "learning_rate": 1.479151064675034e-06, "loss": 0.0032, "step": 8229 }, { "epoch": 3.744313011828935, "grad_norm": 0.25112107983233956, "learning_rate": 1.4781363631640777e-06, "loss": 0.0035, "step": 8230 }, { "epoch": 3.7447679708826205, "grad_norm": 0.1768253906755621, "learning_rate": 1.4771219494439148e-06, "loss": 0.0014, "step": 8231 }, { "epoch": 3.745222929936306, "grad_norm": 0.3420258053192273, "learning_rate": 1.4761078235974374e-06, "loss": 0.0044, "step": 8232 }, { "epoch": 3.7456778889899907, "grad_norm": 0.3239689734533046, "learning_rate": 1.4750939857075147e-06, "loss": 0.0042, "step": 8233 }, { "epoch": 3.746132848043676, "grad_norm": 0.43480442220027377, "learning_rate": 1.4740804358569916e-06, "loss": 0.0039, "step": 8234 }, { "epoch": 3.7465878070973613, "grad_norm": 0.2456495241766836, "learning_rate": 1.4730671741286923e-06, "loss": 0.0021, "step": 8235 }, { "epoch": 3.747042766151046, "grad_norm": 0.2388725584202595, "learning_rate": 1.4720542006054178e-06, "loss": 0.0038, "step": 8236 }, { "epoch": 3.7474977252047315, "grad_norm": 0.23301887945847435, "learning_rate": 1.471041515369942e-06, "loss": 0.0036, "step": 8237 }, { "epoch": 3.7479526842584168, "grad_norm": 0.17543611593853725, "learning_rate": 1.4700291185050164e-06, "loss": 0.0025, "step": 8238 }, { "epoch": 3.7484076433121016, "grad_norm": 0.26831077593504893, "learning_rate": 1.4690170100933692e-06, "loss": 0.0064, "step": 8239 }, { "epoch": 3.748862602365787, "grad_norm": 0.1613720374260712, "learning_rate": 1.4680051902177073e-06, "loss": 0.0017, "step": 8240 }, { "epoch": 3.7493175614194723, "grad_norm": 0.1760288579310544, "learning_rate": 1.4669936589607092e-06, "loss": 0.0019, "step": 8241 }, { "epoch": 3.7497725204731576, "grad_norm": 0.46488460827072936, "learning_rate": 1.4659824164050363e-06, "loss": 0.0042, "step": 8242 }, { "epoch": 3.7502274795268424, "grad_norm": 0.1676211883362639, "learning_rate": 1.4649714626333206e-06, "loss": 0.0021, "step": 8243 }, { "epoch": 3.7506824385805277, "grad_norm": 0.2138238858681873, "learning_rate": 1.4639607977281716e-06, "loss": 0.0051, "step": 8244 }, { "epoch": 3.751137397634213, "grad_norm": 0.19386391574906334, "learning_rate": 1.462950421772179e-06, "loss": 0.0026, "step": 8245 }, { "epoch": 3.7515923566878984, "grad_norm": 0.30300347693637725, "learning_rate": 1.4619403348479045e-06, "loss": 0.0048, "step": 8246 }, { "epoch": 3.7520473157415832, "grad_norm": 0.1297172804999276, "learning_rate": 1.4609305370378867e-06, "loss": 0.0011, "step": 8247 }, { "epoch": 3.7525022747952685, "grad_norm": 0.09863438238686904, "learning_rate": 1.4599210284246452e-06, "loss": 0.0011, "step": 8248 }, { "epoch": 3.752957233848954, "grad_norm": 0.03359004436647919, "learning_rate": 1.4589118090906684e-06, "loss": 0.0002, "step": 8249 }, { "epoch": 3.7534121929026387, "grad_norm": 0.26956004939560535, "learning_rate": 1.4579028791184286e-06, "loss": 0.0032, "step": 8250 }, { "epoch": 3.753867151956324, "grad_norm": 0.15930204581725518, "learning_rate": 1.4568942385903695e-06, "loss": 0.0016, "step": 8251 }, { "epoch": 3.7543221110100093, "grad_norm": 0.4694956725111323, "learning_rate": 1.455885887588913e-06, "loss": 0.0104, "step": 8252 }, { "epoch": 3.754777070063694, "grad_norm": 0.12168435882028572, "learning_rate": 1.4548778261964552e-06, "loss": 0.0016, "step": 8253 }, { "epoch": 3.7552320291173795, "grad_norm": 0.24991258948964318, "learning_rate": 1.4538700544953715e-06, "loss": 0.0035, "step": 8254 }, { "epoch": 3.755686988171065, "grad_norm": 0.16153944886396596, "learning_rate": 1.4528625725680146e-06, "loss": 0.0024, "step": 8255 }, { "epoch": 3.7561419472247497, "grad_norm": 0.2742082198279699, "learning_rate": 1.4518553804967094e-06, "loss": 0.003, "step": 8256 }, { "epoch": 3.756596906278435, "grad_norm": 0.21943261188891286, "learning_rate": 1.4508484783637588e-06, "loss": 0.0039, "step": 8257 }, { "epoch": 3.7570518653321203, "grad_norm": 0.40702778771251596, "learning_rate": 1.4498418662514418e-06, "loss": 0.0088, "step": 8258 }, { "epoch": 3.757506824385805, "grad_norm": 0.32585313264357846, "learning_rate": 1.4488355442420166e-06, "loss": 0.0113, "step": 8259 }, { "epoch": 3.7579617834394905, "grad_norm": 0.21473201616683416, "learning_rate": 1.4478295124177133e-06, "loss": 0.0021, "step": 8260 }, { "epoch": 3.758416742493176, "grad_norm": 0.12206682889929317, "learning_rate": 1.4468237708607397e-06, "loss": 0.0012, "step": 8261 }, { "epoch": 3.7588717015468607, "grad_norm": 0.08287384135932507, "learning_rate": 1.4458183196532833e-06, "loss": 0.001, "step": 8262 }, { "epoch": 3.759326660600546, "grad_norm": 0.09582947209908133, "learning_rate": 1.4448131588775026e-06, "loss": 0.0014, "step": 8263 }, { "epoch": 3.7597816196542313, "grad_norm": 0.04708513880623716, "learning_rate": 1.4438082886155347e-06, "loss": 0.0003, "step": 8264 }, { "epoch": 3.760236578707916, "grad_norm": 0.4133360982095818, "learning_rate": 1.4428037089494946e-06, "loss": 0.011, "step": 8265 }, { "epoch": 3.7606915377616015, "grad_norm": 0.22983535595942226, "learning_rate": 1.4417994199614716e-06, "loss": 0.0024, "step": 8266 }, { "epoch": 3.761146496815287, "grad_norm": 0.30595199856839916, "learning_rate": 1.4407954217335312e-06, "loss": 0.006, "step": 8267 }, { "epoch": 3.7616014558689717, "grad_norm": 0.28199614845423543, "learning_rate": 1.4397917143477146e-06, "loss": 0.0108, "step": 8268 }, { "epoch": 3.762056414922657, "grad_norm": 0.4448458371879213, "learning_rate": 1.4387882978860412e-06, "loss": 0.0047, "step": 8269 }, { "epoch": 3.7625113739763423, "grad_norm": 0.09856097507428246, "learning_rate": 1.437785172430507e-06, "loss": 0.001, "step": 8270 }, { "epoch": 3.762966333030027, "grad_norm": 0.34702751501721035, "learning_rate": 1.436782338063082e-06, "loss": 0.0068, "step": 8271 }, { "epoch": 3.7634212920837125, "grad_norm": 0.3459473693444633, "learning_rate": 1.4357797948657126e-06, "loss": 0.0067, "step": 8272 }, { "epoch": 3.7638762511373978, "grad_norm": 0.1000976792686654, "learning_rate": 1.4347775429203215e-06, "loss": 0.0013, "step": 8273 }, { "epoch": 3.7643312101910826, "grad_norm": 0.12371231083008664, "learning_rate": 1.43377558230881e-06, "loss": 0.0011, "step": 8274 }, { "epoch": 3.764786169244768, "grad_norm": 0.6908311924322025, "learning_rate": 1.432773913113052e-06, "loss": 0.0054, "step": 8275 }, { "epoch": 3.7652411282984533, "grad_norm": 0.4250515237535653, "learning_rate": 1.431772535414902e-06, "loss": 0.0035, "step": 8276 }, { "epoch": 3.765696087352138, "grad_norm": 0.2581985701494445, "learning_rate": 1.430771449296186e-06, "loss": 0.0041, "step": 8277 }, { "epoch": 3.7661510464058234, "grad_norm": 0.22040733457862796, "learning_rate": 1.4297706548387074e-06, "loss": 0.0023, "step": 8278 }, { "epoch": 3.7666060054595087, "grad_norm": 0.3236483680786687, "learning_rate": 1.4287701521242493e-06, "loss": 0.0068, "step": 8279 }, { "epoch": 3.7670609645131936, "grad_norm": 0.24037905617484695, "learning_rate": 1.427769941234567e-06, "loss": 0.0042, "step": 8280 }, { "epoch": 3.767515923566879, "grad_norm": 0.08354808399660561, "learning_rate": 1.426770022251393e-06, "loss": 0.0008, "step": 8281 }, { "epoch": 3.7679708826205642, "grad_norm": 0.2610880841770453, "learning_rate": 1.4257703952564344e-06, "loss": 0.0041, "step": 8282 }, { "epoch": 3.768425841674249, "grad_norm": 0.37604638537049256, "learning_rate": 1.4247710603313785e-06, "loss": 0.0071, "step": 8283 }, { "epoch": 3.7688808007279344, "grad_norm": 0.21831117309163686, "learning_rate": 1.4237720175578873e-06, "loss": 0.0052, "step": 8284 }, { "epoch": 3.7693357597816197, "grad_norm": 0.22212900825780707, "learning_rate": 1.4227732670175963e-06, "loss": 0.0049, "step": 8285 }, { "epoch": 3.7697907188353046, "grad_norm": 0.2692388023971826, "learning_rate": 1.4217748087921202e-06, "loss": 0.0071, "step": 8286 }, { "epoch": 3.77024567788899, "grad_norm": 0.4182857081945022, "learning_rate": 1.4207766429630453e-06, "loss": 0.0016, "step": 8287 }, { "epoch": 3.770700636942675, "grad_norm": 0.12810129756546632, "learning_rate": 1.4197787696119414e-06, "loss": 0.001, "step": 8288 }, { "epoch": 3.77115559599636, "grad_norm": 0.17563388815871478, "learning_rate": 1.4187811888203468e-06, "loss": 0.0024, "step": 8289 }, { "epoch": 3.7716105550500454, "grad_norm": 0.28458649825509774, "learning_rate": 1.4177839006697818e-06, "loss": 0.0059, "step": 8290 }, { "epoch": 3.7720655141037307, "grad_norm": 0.16885255227897664, "learning_rate": 1.41678690524174e-06, "loss": 0.0022, "step": 8291 }, { "epoch": 3.7725204731574156, "grad_norm": 0.3219999041978894, "learning_rate": 1.415790202617689e-06, "loss": 0.0022, "step": 8292 }, { "epoch": 3.772975432211101, "grad_norm": 0.2659269968266485, "learning_rate": 1.4147937928790778e-06, "loss": 0.0064, "step": 8293 }, { "epoch": 3.773430391264786, "grad_norm": 0.12091441191771202, "learning_rate": 1.4137976761073269e-06, "loss": 0.0007, "step": 8294 }, { "epoch": 3.7738853503184715, "grad_norm": 0.10390529647426298, "learning_rate": 1.4128018523838355e-06, "loss": 0.0007, "step": 8295 }, { "epoch": 3.7743403093721564, "grad_norm": 0.39299677619197315, "learning_rate": 1.4118063217899746e-06, "loss": 0.0026, "step": 8296 }, { "epoch": 3.7747952684258417, "grad_norm": 0.38015204142968195, "learning_rate": 1.4108110844070977e-06, "loss": 0.007, "step": 8297 }, { "epoch": 3.775250227479527, "grad_norm": 0.129346704475973, "learning_rate": 1.4098161403165317e-06, "loss": 0.0007, "step": 8298 }, { "epoch": 3.775705186533212, "grad_norm": 0.2602422039857844, "learning_rate": 1.4088214895995777e-06, "loss": 0.0057, "step": 8299 }, { "epoch": 3.776160145586897, "grad_norm": 0.27091709573008166, "learning_rate": 1.4078271323375137e-06, "loss": 0.0034, "step": 8300 }, { "epoch": 3.7766151046405825, "grad_norm": 0.7141580285719477, "learning_rate": 1.4068330686115943e-06, "loss": 0.0135, "step": 8301 }, { "epoch": 3.777070063694268, "grad_norm": 0.27640944266949646, "learning_rate": 1.4058392985030488e-06, "loss": 0.0074, "step": 8302 }, { "epoch": 3.7775250227479527, "grad_norm": 0.3385462216465802, "learning_rate": 1.4048458220930843e-06, "loss": 0.004, "step": 8303 }, { "epoch": 3.777979981801638, "grad_norm": 0.08786583811062756, "learning_rate": 1.4038526394628854e-06, "loss": 0.0008, "step": 8304 }, { "epoch": 3.7784349408553233, "grad_norm": 0.40261208409509985, "learning_rate": 1.4028597506936086e-06, "loss": 0.0104, "step": 8305 }, { "epoch": 3.778889899909008, "grad_norm": 0.24271452189170495, "learning_rate": 1.4018671558663888e-06, "loss": 0.004, "step": 8306 }, { "epoch": 3.7793448589626935, "grad_norm": 0.1524327901050344, "learning_rate": 1.4008748550623342e-06, "loss": 0.0024, "step": 8307 }, { "epoch": 3.7797998180163788, "grad_norm": 0.22063041266979924, "learning_rate": 1.3998828483625343e-06, "loss": 0.0025, "step": 8308 }, { "epoch": 3.7802547770700636, "grad_norm": 0.14460719120690937, "learning_rate": 1.3988911358480506e-06, "loss": 0.0024, "step": 8309 }, { "epoch": 3.780709736123749, "grad_norm": 0.10118189583718977, "learning_rate": 1.3978997175999186e-06, "loss": 0.0012, "step": 8310 }, { "epoch": 3.7811646951774343, "grad_norm": 0.11685587624071721, "learning_rate": 1.3969085936991567e-06, "loss": 0.0014, "step": 8311 }, { "epoch": 3.781619654231119, "grad_norm": 0.20108530234702668, "learning_rate": 1.3959177642267513e-06, "loss": 0.0033, "step": 8312 }, { "epoch": 3.7820746132848044, "grad_norm": 0.10795246525356737, "learning_rate": 1.3949272292636722e-06, "loss": 0.0011, "step": 8313 }, { "epoch": 3.7825295723384897, "grad_norm": 0.26094487332367433, "learning_rate": 1.3939369888908593e-06, "loss": 0.0043, "step": 8314 }, { "epoch": 3.7829845313921746, "grad_norm": 0.24836976639729275, "learning_rate": 1.392947043189231e-06, "loss": 0.0022, "step": 8315 }, { "epoch": 3.78343949044586, "grad_norm": 0.15770979557828438, "learning_rate": 1.3919573922396796e-06, "loss": 0.0011, "step": 8316 }, { "epoch": 3.7838944494995452, "grad_norm": 0.3270618825843036, "learning_rate": 1.390968036123076e-06, "loss": 0.0048, "step": 8317 }, { "epoch": 3.78434940855323, "grad_norm": 0.24141696524179324, "learning_rate": 1.3899789749202674e-06, "loss": 0.0025, "step": 8318 }, { "epoch": 3.7848043676069154, "grad_norm": 0.28267758275750954, "learning_rate": 1.388990208712075e-06, "loss": 0.0044, "step": 8319 }, { "epoch": 3.7852593266606007, "grad_norm": 0.1399486288759601, "learning_rate": 1.3880017375792953e-06, "loss": 0.0007, "step": 8320 }, { "epoch": 3.7857142857142856, "grad_norm": 0.22842964329284515, "learning_rate": 1.3870135616027003e-06, "loss": 0.0016, "step": 8321 }, { "epoch": 3.786169244767971, "grad_norm": 0.35524751768780877, "learning_rate": 1.3860256808630429e-06, "loss": 0.0036, "step": 8322 }, { "epoch": 3.786624203821656, "grad_norm": 0.3221021611325404, "learning_rate": 1.385038095441046e-06, "loss": 0.0019, "step": 8323 }, { "epoch": 3.787079162875341, "grad_norm": 0.2024406933552888, "learning_rate": 1.3840508054174095e-06, "loss": 0.0016, "step": 8324 }, { "epoch": 3.7875341219290264, "grad_norm": 0.20208271857860724, "learning_rate": 1.3830638108728128e-06, "loss": 0.0018, "step": 8325 }, { "epoch": 3.7879890809827117, "grad_norm": 0.0751246125810503, "learning_rate": 1.3820771118879067e-06, "loss": 0.0005, "step": 8326 }, { "epoch": 3.7884440400363966, "grad_norm": 0.2707006913872268, "learning_rate": 1.3810907085433216e-06, "loss": 0.0027, "step": 8327 }, { "epoch": 3.788898999090082, "grad_norm": 0.5023881792721455, "learning_rate": 1.3801046009196612e-06, "loss": 0.0112, "step": 8328 }, { "epoch": 3.789353958143767, "grad_norm": 0.13838993811751352, "learning_rate": 1.3791187890975055e-06, "loss": 0.0017, "step": 8329 }, { "epoch": 3.789808917197452, "grad_norm": 0.23351490105407538, "learning_rate": 1.3781332731574087e-06, "loss": 0.0043, "step": 8330 }, { "epoch": 3.7902638762511374, "grad_norm": 0.22232711243570125, "learning_rate": 1.3771480531799054e-06, "loss": 0.0059, "step": 8331 }, { "epoch": 3.7907188353048227, "grad_norm": 0.30010517879141546, "learning_rate": 1.3761631292455036e-06, "loss": 0.0056, "step": 8332 }, { "epoch": 3.7911737943585075, "grad_norm": 0.3390281605040097, "learning_rate": 1.3751785014346853e-06, "loss": 0.0043, "step": 8333 }, { "epoch": 3.791628753412193, "grad_norm": 0.3379033481004833, "learning_rate": 1.374194169827911e-06, "loss": 0.0049, "step": 8334 }, { "epoch": 3.792083712465878, "grad_norm": 0.15248759514781857, "learning_rate": 1.3732101345056149e-06, "loss": 0.0015, "step": 8335 }, { "epoch": 3.792538671519563, "grad_norm": 0.3090673122352973, "learning_rate": 1.3722263955482068e-06, "loss": 0.0029, "step": 8336 }, { "epoch": 3.7929936305732483, "grad_norm": 0.30032816807117646, "learning_rate": 1.371242953036076e-06, "loss": 0.0046, "step": 8337 }, { "epoch": 3.7934485896269337, "grad_norm": 0.47663178964047737, "learning_rate": 1.3702598070495826e-06, "loss": 0.0093, "step": 8338 }, { "epoch": 3.7939035486806185, "grad_norm": 0.17488689932511098, "learning_rate": 1.3692769576690674e-06, "loss": 0.0011, "step": 8339 }, { "epoch": 3.794358507734304, "grad_norm": 0.22410253015377568, "learning_rate": 1.3682944049748425e-06, "loss": 0.0052, "step": 8340 }, { "epoch": 3.794813466787989, "grad_norm": 0.1786421527765028, "learning_rate": 1.3673121490471975e-06, "loss": 0.0034, "step": 8341 }, { "epoch": 3.795268425841674, "grad_norm": 0.2517136326053243, "learning_rate": 1.3663301899663995e-06, "loss": 0.0063, "step": 8342 }, { "epoch": 3.7957233848953593, "grad_norm": 0.048507668053566086, "learning_rate": 1.3653485278126894e-06, "loss": 0.0003, "step": 8343 }, { "epoch": 3.7961783439490446, "grad_norm": 0.20757721788522404, "learning_rate": 1.364367162666283e-06, "loss": 0.0035, "step": 8344 }, { "epoch": 3.7966333030027295, "grad_norm": 0.2612240603399998, "learning_rate": 1.363386094607373e-06, "loss": 0.0041, "step": 8345 }, { "epoch": 3.797088262056415, "grad_norm": 0.08271865212704826, "learning_rate": 1.3624053237161278e-06, "loss": 0.0011, "step": 8346 }, { "epoch": 3.7975432211101, "grad_norm": 0.3715588667918637, "learning_rate": 1.361424850072694e-06, "loss": 0.0039, "step": 8347 }, { "epoch": 3.797998180163785, "grad_norm": 0.20936961372647586, "learning_rate": 1.3604446737571902e-06, "loss": 0.0054, "step": 8348 }, { "epoch": 3.7984531392174703, "grad_norm": 0.19051753243209604, "learning_rate": 1.3594647948497113e-06, "loss": 0.0037, "step": 8349 }, { "epoch": 3.7989080982711556, "grad_norm": 0.28101577844919723, "learning_rate": 1.3584852134303273e-06, "loss": 0.005, "step": 8350 }, { "epoch": 3.799363057324841, "grad_norm": 0.3966610896334549, "learning_rate": 1.3575059295790882e-06, "loss": 0.0132, "step": 8351 }, { "epoch": 3.799818016378526, "grad_norm": 0.3114847369521246, "learning_rate": 1.3565269433760137e-06, "loss": 0.0106, "step": 8352 }, { "epoch": 3.800272975432211, "grad_norm": 0.18758962670137064, "learning_rate": 1.355548254901105e-06, "loss": 0.002, "step": 8353 }, { "epoch": 3.8007279344858964, "grad_norm": 0.24626540682506912, "learning_rate": 1.3545698642343351e-06, "loss": 0.0033, "step": 8354 }, { "epoch": 3.8011828935395813, "grad_norm": 0.07727662074760829, "learning_rate": 1.3535917714556512e-06, "loss": 0.0006, "step": 8355 }, { "epoch": 3.8016378525932666, "grad_norm": 0.3433344176285079, "learning_rate": 1.352613976644983e-06, "loss": 0.005, "step": 8356 }, { "epoch": 3.802092811646952, "grad_norm": 0.1406963305448382, "learning_rate": 1.3516364798822284e-06, "loss": 0.0013, "step": 8357 }, { "epoch": 3.802547770700637, "grad_norm": 0.15787679477952846, "learning_rate": 1.3506592812472653e-06, "loss": 0.0018, "step": 8358 }, { "epoch": 3.803002729754322, "grad_norm": 0.12541486370887497, "learning_rate": 1.3496823808199438e-06, "loss": 0.0016, "step": 8359 }, { "epoch": 3.8034576888080074, "grad_norm": 0.2008632031837807, "learning_rate": 1.3487057786800932e-06, "loss": 0.0031, "step": 8360 }, { "epoch": 3.8039126478616927, "grad_norm": 0.2754841460669745, "learning_rate": 1.3477294749075194e-06, "loss": 0.0078, "step": 8361 }, { "epoch": 3.8043676069153776, "grad_norm": 0.22427736264793874, "learning_rate": 1.346753469581999e-06, "loss": 0.0046, "step": 8362 }, { "epoch": 3.804822565969063, "grad_norm": 0.2092053635397987, "learning_rate": 1.3457777627832868e-06, "loss": 0.0028, "step": 8363 }, { "epoch": 3.805277525022748, "grad_norm": 0.17228733251915468, "learning_rate": 1.3448023545911126e-06, "loss": 0.001, "step": 8364 }, { "epoch": 3.805732484076433, "grad_norm": 0.1674441467315862, "learning_rate": 1.3438272450851846e-06, "loss": 0.0016, "step": 8365 }, { "epoch": 3.8061874431301184, "grad_norm": 0.12685419369546294, "learning_rate": 1.3428524343451809e-06, "loss": 0.0007, "step": 8366 }, { "epoch": 3.8066424021838037, "grad_norm": 0.20988870320387673, "learning_rate": 1.3418779224507634e-06, "loss": 0.0016, "step": 8367 }, { "epoch": 3.8070973612374885, "grad_norm": 0.08262350636139937, "learning_rate": 1.3409037094815613e-06, "loss": 0.0006, "step": 8368 }, { "epoch": 3.807552320291174, "grad_norm": 0.17581154904944118, "learning_rate": 1.3399297955171825e-06, "loss": 0.0019, "step": 8369 }, { "epoch": 3.808007279344859, "grad_norm": 0.16225362274054209, "learning_rate": 1.338956180637213e-06, "loss": 0.0021, "step": 8370 }, { "epoch": 3.808462238398544, "grad_norm": 0.2001268120216354, "learning_rate": 1.3379828649212123e-06, "loss": 0.0035, "step": 8371 }, { "epoch": 3.8089171974522293, "grad_norm": 0.33097145827180485, "learning_rate": 1.3370098484487138e-06, "loss": 0.0079, "step": 8372 }, { "epoch": 3.8093721565059147, "grad_norm": 0.2258307707906472, "learning_rate": 1.336037131299227e-06, "loss": 0.0052, "step": 8373 }, { "epoch": 3.8098271155595995, "grad_norm": 0.3399619825879456, "learning_rate": 1.335064713552241e-06, "loss": 0.0098, "step": 8374 }, { "epoch": 3.810282074613285, "grad_norm": 0.09768137874619251, "learning_rate": 1.3340925952872147e-06, "loss": 0.0007, "step": 8375 }, { "epoch": 3.81073703366697, "grad_norm": 0.08380580073280516, "learning_rate": 1.3331207765835875e-06, "loss": 0.0009, "step": 8376 }, { "epoch": 3.811191992720655, "grad_norm": 0.05436590886491927, "learning_rate": 1.332149257520771e-06, "loss": 0.0005, "step": 8377 }, { "epoch": 3.8116469517743403, "grad_norm": 0.12454516050396389, "learning_rate": 1.3311780381781537e-06, "loss": 0.0021, "step": 8378 }, { "epoch": 3.8121019108280256, "grad_norm": 0.4294764678194112, "learning_rate": 1.3302071186350972e-06, "loss": 0.0053, "step": 8379 }, { "epoch": 3.8125568698817105, "grad_norm": 0.20353694606732922, "learning_rate": 1.3292364989709422e-06, "loss": 0.0039, "step": 8380 }, { "epoch": 3.813011828935396, "grad_norm": 0.25911207089557037, "learning_rate": 1.3282661792650054e-06, "loss": 0.0025, "step": 8381 }, { "epoch": 3.813466787989081, "grad_norm": 0.18024098799730776, "learning_rate": 1.3272961595965743e-06, "loss": 0.0027, "step": 8382 }, { "epoch": 3.813921747042766, "grad_norm": 0.18652213195706055, "learning_rate": 1.3263264400449161e-06, "loss": 0.0014, "step": 8383 }, { "epoch": 3.8143767060964513, "grad_norm": 0.4040370739991496, "learning_rate": 1.325357020689269e-06, "loss": 0.0077, "step": 8384 }, { "epoch": 3.8148316651501366, "grad_norm": 0.09313504076322941, "learning_rate": 1.3243879016088534e-06, "loss": 0.0007, "step": 8385 }, { "epoch": 3.8152866242038215, "grad_norm": 0.23092393538599115, "learning_rate": 1.3234190828828591e-06, "loss": 0.0036, "step": 8386 }, { "epoch": 3.815741583257507, "grad_norm": 0.2571674933170574, "learning_rate": 1.3224505645904534e-06, "loss": 0.0035, "step": 8387 }, { "epoch": 3.816196542311192, "grad_norm": 0.45823777636308516, "learning_rate": 1.3214823468107807e-06, "loss": 0.0106, "step": 8388 }, { "epoch": 3.816651501364877, "grad_norm": 0.19575203408620434, "learning_rate": 1.3205144296229572e-06, "loss": 0.0025, "step": 8389 }, { "epoch": 3.8171064604185623, "grad_norm": 0.14813728090937492, "learning_rate": 1.3195468131060796e-06, "loss": 0.0048, "step": 8390 }, { "epoch": 3.8175614194722476, "grad_norm": 0.460510606669038, "learning_rate": 1.3185794973392158e-06, "loss": 0.0153, "step": 8391 }, { "epoch": 3.8180163785259325, "grad_norm": 0.0692284420265699, "learning_rate": 1.3176124824014102e-06, "loss": 0.0007, "step": 8392 }, { "epoch": 3.8184713375796178, "grad_norm": 0.10343261170357118, "learning_rate": 1.3166457683716815e-06, "loss": 0.0007, "step": 8393 }, { "epoch": 3.818926296633303, "grad_norm": 0.2958141511157044, "learning_rate": 1.3156793553290271e-06, "loss": 0.0056, "step": 8394 }, { "epoch": 3.819381255686988, "grad_norm": 0.1438216881899908, "learning_rate": 1.3147132433524184e-06, "loss": 0.0025, "step": 8395 }, { "epoch": 3.8198362147406733, "grad_norm": 0.10786947358240591, "learning_rate": 1.313747432520801e-06, "loss": 0.0009, "step": 8396 }, { "epoch": 3.8202911737943586, "grad_norm": 0.25833041543913837, "learning_rate": 1.3127819229130967e-06, "loss": 0.006, "step": 8397 }, { "epoch": 3.8207461328480434, "grad_norm": 0.1456879020024286, "learning_rate": 1.3118167146082005e-06, "loss": 0.0018, "step": 8398 }, { "epoch": 3.8212010919017287, "grad_norm": 0.1095285919176108, "learning_rate": 1.3108518076849886e-06, "loss": 0.0017, "step": 8399 }, { "epoch": 3.821656050955414, "grad_norm": 0.2114624496544202, "learning_rate": 1.3098872022223069e-06, "loss": 0.0031, "step": 8400 }, { "epoch": 3.822111010009099, "grad_norm": 0.4166027951841225, "learning_rate": 1.3089228982989771e-06, "loss": 0.0077, "step": 8401 }, { "epoch": 3.8225659690627842, "grad_norm": 0.1341179200124046, "learning_rate": 1.3079588959938006e-06, "loss": 0.0012, "step": 8402 }, { "epoch": 3.8230209281164695, "grad_norm": 0.2815178784042401, "learning_rate": 1.3069951953855486e-06, "loss": 0.0018, "step": 8403 }, { "epoch": 3.823475887170155, "grad_norm": 0.3038368919220553, "learning_rate": 1.3060317965529734e-06, "loss": 0.004, "step": 8404 }, { "epoch": 3.8239308462238397, "grad_norm": 0.12260925707408848, "learning_rate": 1.305068699574798e-06, "loss": 0.0015, "step": 8405 }, { "epoch": 3.824385805277525, "grad_norm": 0.2630754045078848, "learning_rate": 1.3041059045297217e-06, "loss": 0.0016, "step": 8406 }, { "epoch": 3.8248407643312103, "grad_norm": 0.41332693376846336, "learning_rate": 1.303143411496421e-06, "loss": 0.0051, "step": 8407 }, { "epoch": 3.825295723384895, "grad_norm": 0.1368726520759684, "learning_rate": 1.3021812205535444e-06, "loss": 0.0007, "step": 8408 }, { "epoch": 3.8257506824385805, "grad_norm": 0.09424341000898812, "learning_rate": 1.3012193317797189e-06, "loss": 0.001, "step": 8409 }, { "epoch": 3.826205641492266, "grad_norm": 0.06180291258421272, "learning_rate": 1.3002577452535475e-06, "loss": 0.0004, "step": 8410 }, { "epoch": 3.826660600545951, "grad_norm": 0.3137531774726185, "learning_rate": 1.2992964610536057e-06, "loss": 0.0044, "step": 8411 }, { "epoch": 3.827115559599636, "grad_norm": 0.19275950127678917, "learning_rate": 1.2983354792584446e-06, "loss": 0.0035, "step": 8412 }, { "epoch": 3.8275705186533213, "grad_norm": 0.3541223724062894, "learning_rate": 1.2973747999465903e-06, "loss": 0.0038, "step": 8413 }, { "epoch": 3.8280254777070066, "grad_norm": 0.07837287848896765, "learning_rate": 1.2964144231965475e-06, "loss": 0.0011, "step": 8414 }, { "epoch": 3.8284804367606915, "grad_norm": 0.2019202511765073, "learning_rate": 1.2954543490867917e-06, "loss": 0.0025, "step": 8415 }, { "epoch": 3.828935395814377, "grad_norm": 0.34803643060160927, "learning_rate": 1.2944945776957778e-06, "loss": 0.0035, "step": 8416 }, { "epoch": 3.829390354868062, "grad_norm": 0.254793766134315, "learning_rate": 1.2935351091019338e-06, "loss": 0.0027, "step": 8417 }, { "epoch": 3.829845313921747, "grad_norm": 0.08242862581297755, "learning_rate": 1.2925759433836604e-06, "loss": 0.001, "step": 8418 }, { "epoch": 3.8303002729754323, "grad_norm": 0.2402643718597356, "learning_rate": 1.29161708061934e-06, "loss": 0.0061, "step": 8419 }, { "epoch": 3.8307552320291176, "grad_norm": 0.31272566727538187, "learning_rate": 1.2906585208873251e-06, "loss": 0.0077, "step": 8420 }, { "epoch": 3.8312101910828025, "grad_norm": 0.10375993159218067, "learning_rate": 1.2897002642659444e-06, "loss": 0.0009, "step": 8421 }, { "epoch": 3.831665150136488, "grad_norm": 0.1822856543304169, "learning_rate": 1.2887423108335012e-06, "loss": 0.0029, "step": 8422 }, { "epoch": 3.832120109190173, "grad_norm": 0.05261446379952179, "learning_rate": 1.2877846606682764e-06, "loss": 0.0003, "step": 8423 }, { "epoch": 3.832575068243858, "grad_norm": 0.2633675845918805, "learning_rate": 1.2868273138485265e-06, "loss": 0.0069, "step": 8424 }, { "epoch": 3.8330300272975433, "grad_norm": 0.38249460070364494, "learning_rate": 1.2858702704524801e-06, "loss": 0.0045, "step": 8425 }, { "epoch": 3.8334849863512286, "grad_norm": 0.2211986689376906, "learning_rate": 1.284913530558342e-06, "loss": 0.0058, "step": 8426 }, { "epoch": 3.8339399454049135, "grad_norm": 0.04571734758670077, "learning_rate": 1.2839570942442918e-06, "loss": 0.0003, "step": 8427 }, { "epoch": 3.8343949044585988, "grad_norm": 0.26529399465953435, "learning_rate": 1.2830009615884876e-06, "loss": 0.0057, "step": 8428 }, { "epoch": 3.834849863512284, "grad_norm": 0.14497405512497816, "learning_rate": 1.2820451326690576e-06, "loss": 0.002, "step": 8429 }, { "epoch": 3.835304822565969, "grad_norm": 0.12265915889052215, "learning_rate": 1.2810896075641106e-06, "loss": 0.0016, "step": 8430 }, { "epoch": 3.8357597816196543, "grad_norm": 0.2923179320068547, "learning_rate": 1.2801343863517269e-06, "loss": 0.0051, "step": 8431 }, { "epoch": 3.8362147406733396, "grad_norm": 0.3477059185150881, "learning_rate": 1.2791794691099603e-06, "loss": 0.0097, "step": 8432 }, { "epoch": 3.8366696997270244, "grad_norm": 0.6442459221866703, "learning_rate": 1.2782248559168458e-06, "loss": 0.0088, "step": 8433 }, { "epoch": 3.8371246587807097, "grad_norm": 0.4303015594796121, "learning_rate": 1.2772705468503892e-06, "loss": 0.0042, "step": 8434 }, { "epoch": 3.837579617834395, "grad_norm": 0.3089108860974287, "learning_rate": 1.2763165419885714e-06, "loss": 0.0053, "step": 8435 }, { "epoch": 3.83803457688808, "grad_norm": 0.13206227245372856, "learning_rate": 1.2753628414093489e-06, "loss": 0.0007, "step": 8436 }, { "epoch": 3.8384895359417652, "grad_norm": 0.1514964065398938, "learning_rate": 1.274409445190654e-06, "loss": 0.0016, "step": 8437 }, { "epoch": 3.8389444949954505, "grad_norm": 0.18536084132664352, "learning_rate": 1.2734563534103967e-06, "loss": 0.0015, "step": 8438 }, { "epoch": 3.8393994540491354, "grad_norm": 0.2714714388232563, "learning_rate": 1.2725035661464568e-06, "loss": 0.0053, "step": 8439 }, { "epoch": 3.8398544131028207, "grad_norm": 0.22143921333332356, "learning_rate": 1.2715510834766925e-06, "loss": 0.0035, "step": 8440 }, { "epoch": 3.840309372156506, "grad_norm": 0.6099050324716289, "learning_rate": 1.2705989054789358e-06, "loss": 0.0103, "step": 8441 }, { "epoch": 3.840764331210191, "grad_norm": 0.07760202526070378, "learning_rate": 1.269647032230994e-06, "loss": 0.0008, "step": 8442 }, { "epoch": 3.841219290263876, "grad_norm": 0.14988794337877911, "learning_rate": 1.2686954638106497e-06, "loss": 0.0027, "step": 8443 }, { "epoch": 3.8416742493175615, "grad_norm": 0.09512279442014934, "learning_rate": 1.2677442002956636e-06, "loss": 0.0009, "step": 8444 }, { "epoch": 3.8421292083712464, "grad_norm": 0.21960955604778923, "learning_rate": 1.2667932417637669e-06, "loss": 0.0045, "step": 8445 }, { "epoch": 3.8425841674249317, "grad_norm": 0.049301179279957294, "learning_rate": 1.2658425882926672e-06, "loss": 0.0004, "step": 8446 }, { "epoch": 3.843039126478617, "grad_norm": 0.18069259601796572, "learning_rate": 1.2648922399600467e-06, "loss": 0.0022, "step": 8447 }, { "epoch": 3.843494085532302, "grad_norm": 0.19142766700652458, "learning_rate": 1.2639421968435655e-06, "loss": 0.003, "step": 8448 }, { "epoch": 3.843949044585987, "grad_norm": 0.1523308102062212, "learning_rate": 1.262992459020857e-06, "loss": 0.0016, "step": 8449 }, { "epoch": 3.8444040036396725, "grad_norm": 0.2619310242968084, "learning_rate": 1.2620430265695267e-06, "loss": 0.0061, "step": 8450 }, { "epoch": 3.8448589626933574, "grad_norm": 0.28502517658670345, "learning_rate": 1.2610938995671606e-06, "loss": 0.0039, "step": 8451 }, { "epoch": 3.8453139217470427, "grad_norm": 0.12442702407848295, "learning_rate": 1.2601450780913161e-06, "loss": 0.0015, "step": 8452 }, { "epoch": 3.845768880800728, "grad_norm": 0.24018405930013872, "learning_rate": 1.2591965622195274e-06, "loss": 0.0046, "step": 8453 }, { "epoch": 3.846223839854413, "grad_norm": 0.08288531940869734, "learning_rate": 1.2582483520293026e-06, "loss": 0.0006, "step": 8454 }, { "epoch": 3.846678798908098, "grad_norm": 0.2280538494024286, "learning_rate": 1.2573004475981243e-06, "loss": 0.0026, "step": 8455 }, { "epoch": 3.8471337579617835, "grad_norm": 0.12883334774563363, "learning_rate": 1.256352849003451e-06, "loss": 0.0013, "step": 8456 }, { "epoch": 3.8475887170154683, "grad_norm": 0.22662915829767713, "learning_rate": 1.2554055563227163e-06, "loss": 0.0022, "step": 8457 }, { "epoch": 3.8480436760691537, "grad_norm": 0.27566415303598973, "learning_rate": 1.2544585696333305e-06, "loss": 0.0041, "step": 8458 }, { "epoch": 3.848498635122839, "grad_norm": 0.28085231845099684, "learning_rate": 1.2535118890126758e-06, "loss": 0.0023, "step": 8459 }, { "epoch": 3.8489535941765243, "grad_norm": 0.2588290618037598, "learning_rate": 1.2525655145381104e-06, "loss": 0.0046, "step": 8460 }, { "epoch": 3.849408553230209, "grad_norm": 0.03194820568526538, "learning_rate": 1.2516194462869663e-06, "loss": 0.0003, "step": 8461 }, { "epoch": 3.8498635122838945, "grad_norm": 0.1768107842451328, "learning_rate": 1.2506736843365552e-06, "loss": 0.0039, "step": 8462 }, { "epoch": 3.8503184713375798, "grad_norm": 0.3924196110233749, "learning_rate": 1.2497282287641588e-06, "loss": 0.0053, "step": 8463 }, { "epoch": 3.8507734303912646, "grad_norm": 0.5109022811396491, "learning_rate": 1.248783079647034e-06, "loss": 0.0072, "step": 8464 }, { "epoch": 3.85122838944495, "grad_norm": 0.1933094018038453, "learning_rate": 1.247838237062417e-06, "loss": 0.0018, "step": 8465 }, { "epoch": 3.8516833484986353, "grad_norm": 0.4144346490474305, "learning_rate": 1.2468937010875131e-06, "loss": 0.0085, "step": 8466 }, { "epoch": 3.8521383075523206, "grad_norm": 0.1355727849874782, "learning_rate": 1.2459494717995085e-06, "loss": 0.0012, "step": 8467 }, { "epoch": 3.8525932666060054, "grad_norm": 0.16928611993835407, "learning_rate": 1.2450055492755602e-06, "loss": 0.0029, "step": 8468 }, { "epoch": 3.8530482256596907, "grad_norm": 0.40246200344720284, "learning_rate": 1.244061933592801e-06, "loss": 0.0041, "step": 8469 }, { "epoch": 3.853503184713376, "grad_norm": 0.13060140981445278, "learning_rate": 1.2431186248283373e-06, "loss": 0.0008, "step": 8470 }, { "epoch": 3.853958143767061, "grad_norm": 0.2299546387610646, "learning_rate": 1.2421756230592535e-06, "loss": 0.002, "step": 8471 }, { "epoch": 3.8544131028207462, "grad_norm": 0.1902407967740717, "learning_rate": 1.2412329283626096e-06, "loss": 0.0021, "step": 8472 }, { "epoch": 3.8548680618744315, "grad_norm": 0.3665417651145956, "learning_rate": 1.2402905408154359e-06, "loss": 0.0033, "step": 8473 }, { "epoch": 3.8553230209281164, "grad_norm": 0.20410041093409834, "learning_rate": 1.2393484604947403e-06, "loss": 0.0028, "step": 8474 }, { "epoch": 3.8557779799818017, "grad_norm": 0.16998252128460117, "learning_rate": 1.2384066874775047e-06, "loss": 0.0029, "step": 8475 }, { "epoch": 3.856232939035487, "grad_norm": 0.254858957722109, "learning_rate": 1.2374652218406884e-06, "loss": 0.0041, "step": 8476 }, { "epoch": 3.856687898089172, "grad_norm": 0.16426591782009645, "learning_rate": 1.236524063661223e-06, "loss": 0.0013, "step": 8477 }, { "epoch": 3.857142857142857, "grad_norm": 0.2147436072461515, "learning_rate": 1.2355832130160134e-06, "loss": 0.0017, "step": 8478 }, { "epoch": 3.8575978161965425, "grad_norm": 0.12819228530685667, "learning_rate": 1.234642669981946e-06, "loss": 0.0014, "step": 8479 }, { "epoch": 3.8580527752502274, "grad_norm": 0.17063038965755264, "learning_rate": 1.2337024346358745e-06, "loss": 0.0015, "step": 8480 }, { "epoch": 3.8585077343039127, "grad_norm": 0.07538577281816423, "learning_rate": 1.2327625070546306e-06, "loss": 0.0006, "step": 8481 }, { "epoch": 3.858962693357598, "grad_norm": 0.21829555406319018, "learning_rate": 1.2318228873150233e-06, "loss": 0.0014, "step": 8482 }, { "epoch": 3.859417652411283, "grad_norm": 0.8538086274154935, "learning_rate": 1.230883575493833e-06, "loss": 0.0204, "step": 8483 }, { "epoch": 3.859872611464968, "grad_norm": 0.2527606116905976, "learning_rate": 1.2299445716678155e-06, "loss": 0.0056, "step": 8484 }, { "epoch": 3.8603275705186535, "grad_norm": 0.23098909363938505, "learning_rate": 1.2290058759137008e-06, "loss": 0.0015, "step": 8485 }, { "epoch": 3.8607825295723384, "grad_norm": 0.47281148816795787, "learning_rate": 1.228067488308196e-06, "loss": 0.0082, "step": 8486 }, { "epoch": 3.8612374886260237, "grad_norm": 0.2939325031824273, "learning_rate": 1.227129408927984e-06, "loss": 0.0038, "step": 8487 }, { "epoch": 3.861692447679709, "grad_norm": 0.22152083230825947, "learning_rate": 1.2261916378497185e-06, "loss": 0.0035, "step": 8488 }, { "epoch": 3.862147406733394, "grad_norm": 0.13073537912552963, "learning_rate": 1.2252541751500297e-06, "loss": 0.0018, "step": 8489 }, { "epoch": 3.862602365787079, "grad_norm": 0.25251644999039125, "learning_rate": 1.2243170209055216e-06, "loss": 0.003, "step": 8490 }, { "epoch": 3.8630573248407645, "grad_norm": 0.3489997487440649, "learning_rate": 1.2233801751927777e-06, "loss": 0.007, "step": 8491 }, { "epoch": 3.8635122838944493, "grad_norm": 0.24501311776491316, "learning_rate": 1.2224436380883492e-06, "loss": 0.0072, "step": 8492 }, { "epoch": 3.8639672429481347, "grad_norm": 0.32105220703151655, "learning_rate": 1.2215074096687685e-06, "loss": 0.0099, "step": 8493 }, { "epoch": 3.86442220200182, "grad_norm": 0.2453375811247342, "learning_rate": 1.2205714900105387e-06, "loss": 0.0081, "step": 8494 }, { "epoch": 3.864877161055505, "grad_norm": 0.14215486028298338, "learning_rate": 1.219635879190138e-06, "loss": 0.002, "step": 8495 }, { "epoch": 3.86533212010919, "grad_norm": 0.1657498998523969, "learning_rate": 1.2187005772840222e-06, "loss": 0.0022, "step": 8496 }, { "epoch": 3.8657870791628755, "grad_norm": 0.3151440894059005, "learning_rate": 1.2177655843686193e-06, "loss": 0.0066, "step": 8497 }, { "epoch": 3.8662420382165603, "grad_norm": 0.4361697324762576, "learning_rate": 1.2168309005203321e-06, "loss": 0.0035, "step": 8498 }, { "epoch": 3.8666969972702456, "grad_norm": 0.18600029231156295, "learning_rate": 1.215896525815538e-06, "loss": 0.0028, "step": 8499 }, { "epoch": 3.867151956323931, "grad_norm": 0.32574708560126603, "learning_rate": 1.214962460330591e-06, "loss": 0.0079, "step": 8500 }, { "epoch": 3.867606915377616, "grad_norm": 0.4790888601671271, "learning_rate": 1.2140287041418203e-06, "loss": 0.0063, "step": 8501 }, { "epoch": 3.868061874431301, "grad_norm": 0.2894038847868456, "learning_rate": 1.2130952573255261e-06, "loss": 0.0049, "step": 8502 }, { "epoch": 3.8685168334849864, "grad_norm": 0.2575133700033376, "learning_rate": 1.212162119957986e-06, "loss": 0.0042, "step": 8503 }, { "epoch": 3.8689717925386713, "grad_norm": 0.16055335960243933, "learning_rate": 1.2112292921154507e-06, "loss": 0.0015, "step": 8504 }, { "epoch": 3.8694267515923566, "grad_norm": 0.5268595075573901, "learning_rate": 1.210296773874149e-06, "loss": 0.0055, "step": 8505 }, { "epoch": 3.869881710646042, "grad_norm": 0.14380315121523776, "learning_rate": 1.2093645653102787e-06, "loss": 0.0015, "step": 8506 }, { "epoch": 3.870336669699727, "grad_norm": 0.3030518155517926, "learning_rate": 1.2084326665000201e-06, "loss": 0.0039, "step": 8507 }, { "epoch": 3.870791628753412, "grad_norm": 0.3005729703259829, "learning_rate": 1.2075010775195205e-06, "loss": 0.0047, "step": 8508 }, { "epoch": 3.8712465878070974, "grad_norm": 0.19318436861989613, "learning_rate": 1.2065697984449055e-06, "loss": 0.0039, "step": 8509 }, { "epoch": 3.8717015468607823, "grad_norm": 0.22623508334349976, "learning_rate": 1.2056388293522768e-06, "loss": 0.004, "step": 8510 }, { "epoch": 3.8721565059144676, "grad_norm": 0.2713342001914328, "learning_rate": 1.2047081703177077e-06, "loss": 0.0025, "step": 8511 }, { "epoch": 3.872611464968153, "grad_norm": 0.13520350593071404, "learning_rate": 1.2037778214172475e-06, "loss": 0.001, "step": 8512 }, { "epoch": 3.8730664240218378, "grad_norm": 0.05526102019123737, "learning_rate": 1.2028477827269186e-06, "loss": 0.0006, "step": 8513 }, { "epoch": 3.873521383075523, "grad_norm": 0.21448112011511056, "learning_rate": 1.2019180543227216e-06, "loss": 0.0045, "step": 8514 }, { "epoch": 3.8739763421292084, "grad_norm": 0.3353278890425571, "learning_rate": 1.20098863628063e-06, "loss": 0.0073, "step": 8515 }, { "epoch": 3.8744313011828937, "grad_norm": 0.3159457572771494, "learning_rate": 1.2000595286765914e-06, "loss": 0.0081, "step": 8516 }, { "epoch": 3.8748862602365786, "grad_norm": 0.09978703021591674, "learning_rate": 1.1991307315865274e-06, "loss": 0.001, "step": 8517 }, { "epoch": 3.875341219290264, "grad_norm": 0.2664293463872522, "learning_rate": 1.1982022450863358e-06, "loss": 0.0054, "step": 8518 }, { "epoch": 3.875796178343949, "grad_norm": 0.23266523081990464, "learning_rate": 1.1972740692518858e-06, "loss": 0.0059, "step": 8519 }, { "epoch": 3.876251137397634, "grad_norm": 0.10827081850758029, "learning_rate": 1.1963462041590262e-06, "loss": 0.0012, "step": 8520 }, { "epoch": 3.8767060964513194, "grad_norm": 0.15410971891546507, "learning_rate": 1.1954186498835797e-06, "loss": 0.0024, "step": 8521 }, { "epoch": 3.8771610555050047, "grad_norm": 0.3021095692725338, "learning_rate": 1.194491406501339e-06, "loss": 0.0035, "step": 8522 }, { "epoch": 3.87761601455869, "grad_norm": 0.045179863353742786, "learning_rate": 1.193564474088076e-06, "loss": 0.0003, "step": 8523 }, { "epoch": 3.878070973612375, "grad_norm": 0.3901003588986076, "learning_rate": 1.192637852719532e-06, "loss": 0.0107, "step": 8524 }, { "epoch": 3.87852593266606, "grad_norm": 0.24040628801795585, "learning_rate": 1.1917115424714305e-06, "loss": 0.0021, "step": 8525 }, { "epoch": 3.8789808917197455, "grad_norm": 0.29613167821824377, "learning_rate": 1.1907855434194637e-06, "loss": 0.0056, "step": 8526 }, { "epoch": 3.8794358507734303, "grad_norm": 0.25052492781274743, "learning_rate": 1.1898598556392987e-06, "loss": 0.0038, "step": 8527 }, { "epoch": 3.8798908098271156, "grad_norm": 0.44246555835890966, "learning_rate": 1.1889344792065816e-06, "loss": 0.0066, "step": 8528 }, { "epoch": 3.880345768880801, "grad_norm": 0.2918796780310605, "learning_rate": 1.1880094141969262e-06, "loss": 0.0061, "step": 8529 }, { "epoch": 3.880800727934486, "grad_norm": 0.14354874224811812, "learning_rate": 1.1870846606859288e-06, "loss": 0.0009, "step": 8530 }, { "epoch": 3.881255686988171, "grad_norm": 0.17765159955535334, "learning_rate": 1.1861602187491533e-06, "loss": 0.0021, "step": 8531 }, { "epoch": 3.8817106460418564, "grad_norm": 0.3821340989360063, "learning_rate": 1.1852360884621417e-06, "loss": 0.0093, "step": 8532 }, { "epoch": 3.8821656050955413, "grad_norm": 0.1782471975402449, "learning_rate": 1.1843122699004083e-06, "loss": 0.0028, "step": 8533 }, { "epoch": 3.8826205641492266, "grad_norm": 0.17999876619669206, "learning_rate": 1.1833887631394447e-06, "loss": 0.0027, "step": 8534 }, { "epoch": 3.883075523202912, "grad_norm": 0.14798802286978838, "learning_rate": 1.1824655682547176e-06, "loss": 0.0013, "step": 8535 }, { "epoch": 3.883530482256597, "grad_norm": 0.13713911919604432, "learning_rate": 1.181542685321664e-06, "loss": 0.0026, "step": 8536 }, { "epoch": 3.883985441310282, "grad_norm": 0.28753625801362137, "learning_rate": 1.180620114415698e-06, "loss": 0.0065, "step": 8537 }, { "epoch": 3.8844404003639674, "grad_norm": 0.24528139972882526, "learning_rate": 1.1796978556122069e-06, "loss": 0.0057, "step": 8538 }, { "epoch": 3.8848953594176523, "grad_norm": 0.2208511421476422, "learning_rate": 1.178775908986556e-06, "loss": 0.0026, "step": 8539 }, { "epoch": 3.8853503184713376, "grad_norm": 0.3498990752877721, "learning_rate": 1.1778542746140814e-06, "loss": 0.0087, "step": 8540 }, { "epoch": 3.885805277525023, "grad_norm": 0.077305454218025, "learning_rate": 1.1769329525700934e-06, "loss": 0.0008, "step": 8541 }, { "epoch": 3.886260236578708, "grad_norm": 0.13867732645712094, "learning_rate": 1.176011942929881e-06, "loss": 0.0026, "step": 8542 }, { "epoch": 3.886715195632393, "grad_norm": 2.021104381740242, "learning_rate": 1.1750912457687024e-06, "loss": 0.0092, "step": 8543 }, { "epoch": 3.8871701546860784, "grad_norm": 0.05688037376082502, "learning_rate": 1.1741708611617951e-06, "loss": 0.0004, "step": 8544 }, { "epoch": 3.8876251137397633, "grad_norm": 0.11688013987991248, "learning_rate": 1.1732507891843681e-06, "loss": 0.0013, "step": 8545 }, { "epoch": 3.8880800727934486, "grad_norm": 0.11431852676949443, "learning_rate": 1.1723310299116052e-06, "loss": 0.0009, "step": 8546 }, { "epoch": 3.888535031847134, "grad_norm": 0.12562883668862718, "learning_rate": 1.1714115834186646e-06, "loss": 0.0013, "step": 8547 }, { "epoch": 3.8889899909008188, "grad_norm": 0.17462077871472168, "learning_rate": 1.1704924497806775e-06, "loss": 0.002, "step": 8548 }, { "epoch": 3.889444949954504, "grad_norm": 0.2662032111623803, "learning_rate": 1.1695736290727554e-06, "loss": 0.002, "step": 8549 }, { "epoch": 3.8898999090081894, "grad_norm": 0.1087899932136938, "learning_rate": 1.1686551213699788e-06, "loss": 0.0009, "step": 8550 }, { "epoch": 3.8903548680618742, "grad_norm": 0.3749712407882381, "learning_rate": 1.1677369267474036e-06, "loss": 0.0037, "step": 8551 }, { "epoch": 3.8908098271155596, "grad_norm": 0.35472025867000434, "learning_rate": 1.1668190452800604e-06, "loss": 0.0039, "step": 8552 }, { "epoch": 3.891264786169245, "grad_norm": 0.1307639121413599, "learning_rate": 1.1659014770429527e-06, "loss": 0.001, "step": 8553 }, { "epoch": 3.8917197452229297, "grad_norm": 0.15643581691221203, "learning_rate": 1.164984222111063e-06, "loss": 0.0015, "step": 8554 }, { "epoch": 3.892174704276615, "grad_norm": 0.15113369685521766, "learning_rate": 1.1640672805593423e-06, "loss": 0.0032, "step": 8555 }, { "epoch": 3.8926296633303004, "grad_norm": 0.2700194885635221, "learning_rate": 1.1631506524627223e-06, "loss": 0.0042, "step": 8556 }, { "epoch": 3.8930846223839852, "grad_norm": 0.22335646873162818, "learning_rate": 1.1622343378961037e-06, "loss": 0.0034, "step": 8557 }, { "epoch": 3.8935395814376705, "grad_norm": 0.3530776068168374, "learning_rate": 1.1613183369343627e-06, "loss": 0.0031, "step": 8558 }, { "epoch": 3.893994540491356, "grad_norm": 0.14898826965234926, "learning_rate": 1.1604026496523536e-06, "loss": 0.0015, "step": 8559 }, { "epoch": 3.8944494995450407, "grad_norm": 0.0900944793650691, "learning_rate": 1.1594872761249e-06, "loss": 0.0007, "step": 8560 }, { "epoch": 3.894904458598726, "grad_norm": 0.2546589358428222, "learning_rate": 1.1585722164268021e-06, "loss": 0.0038, "step": 8561 }, { "epoch": 3.8953594176524113, "grad_norm": 0.20274418218456525, "learning_rate": 1.1576574706328342e-06, "loss": 0.0029, "step": 8562 }, { "epoch": 3.895814376706096, "grad_norm": 0.382352250268221, "learning_rate": 1.1567430388177459e-06, "loss": 0.0058, "step": 8563 }, { "epoch": 3.8962693357597815, "grad_norm": 0.3740862719077101, "learning_rate": 1.1558289210562618e-06, "loss": 0.0038, "step": 8564 }, { "epoch": 3.896724294813467, "grad_norm": 0.1599488372275841, "learning_rate": 1.1549151174230778e-06, "loss": 0.0025, "step": 8565 }, { "epoch": 3.8971792538671517, "grad_norm": 0.41750466773791584, "learning_rate": 1.1540016279928668e-06, "loss": 0.006, "step": 8566 }, { "epoch": 3.897634212920837, "grad_norm": 0.12327063563732767, "learning_rate": 1.1530884528402724e-06, "loss": 0.0016, "step": 8567 }, { "epoch": 3.8980891719745223, "grad_norm": 0.2799985029804694, "learning_rate": 1.1521755920399191e-06, "loss": 0.0079, "step": 8568 }, { "epoch": 3.8985441310282076, "grad_norm": 0.30444484701947416, "learning_rate": 1.1512630456663976e-06, "loss": 0.0022, "step": 8569 }, { "epoch": 3.8989990900818925, "grad_norm": 0.12678300534776715, "learning_rate": 1.1503508137942814e-06, "loss": 0.0013, "step": 8570 }, { "epoch": 3.899454049135578, "grad_norm": 0.3200922001692808, "learning_rate": 1.1494388964981117e-06, "loss": 0.009, "step": 8571 }, { "epoch": 3.899909008189263, "grad_norm": 0.3025651795182405, "learning_rate": 1.1485272938524045e-06, "loss": 0.002, "step": 8572 }, { "epoch": 3.900363967242948, "grad_norm": 0.1477858119731619, "learning_rate": 1.147616005931656e-06, "loss": 0.0012, "step": 8573 }, { "epoch": 3.9008189262966333, "grad_norm": 0.3644421033195017, "learning_rate": 1.1467050328103295e-06, "loss": 0.0041, "step": 8574 }, { "epoch": 3.9012738853503186, "grad_norm": 0.412765249106771, "learning_rate": 1.145794374562867e-06, "loss": 0.0112, "step": 8575 }, { "epoch": 3.901728844404004, "grad_norm": 0.09425648281105749, "learning_rate": 1.1448840312636812e-06, "loss": 0.0013, "step": 8576 }, { "epoch": 3.902183803457689, "grad_norm": 0.11709607267456766, "learning_rate": 1.1439740029871622e-06, "loss": 0.001, "step": 8577 }, { "epoch": 3.902638762511374, "grad_norm": 0.41047764808237097, "learning_rate": 1.143064289807676e-06, "loss": 0.008, "step": 8578 }, { "epoch": 3.9030937215650594, "grad_norm": 0.44103632497336703, "learning_rate": 1.1421548917995584e-06, "loss": 0.0054, "step": 8579 }, { "epoch": 3.9035486806187443, "grad_norm": 0.08228339709205519, "learning_rate": 1.1412458090371208e-06, "loss": 0.0012, "step": 8580 }, { "epoch": 3.9040036396724296, "grad_norm": 0.11814590257555363, "learning_rate": 1.1403370415946486e-06, "loss": 0.0014, "step": 8581 }, { "epoch": 3.904458598726115, "grad_norm": 0.2564362949576262, "learning_rate": 1.1394285895464041e-06, "loss": 0.0064, "step": 8582 }, { "epoch": 3.9049135577797998, "grad_norm": 0.5530443241890499, "learning_rate": 1.1385204529666205e-06, "loss": 0.0034, "step": 8583 }, { "epoch": 3.905368516833485, "grad_norm": 0.1478147299889684, "learning_rate": 1.1376126319295078e-06, "loss": 0.0014, "step": 8584 }, { "epoch": 3.9058234758871704, "grad_norm": 0.47332719412921304, "learning_rate": 1.1367051265092487e-06, "loss": 0.009, "step": 8585 }, { "epoch": 3.9062784349408552, "grad_norm": 0.21666486995780654, "learning_rate": 1.1357979367800004e-06, "loss": 0.0023, "step": 8586 }, { "epoch": 3.9067333939945406, "grad_norm": 0.27718541865223906, "learning_rate": 1.1348910628158927e-06, "loss": 0.0055, "step": 8587 }, { "epoch": 3.907188353048226, "grad_norm": 0.3178236785516548, "learning_rate": 1.1339845046910342e-06, "loss": 0.0079, "step": 8588 }, { "epoch": 3.9076433121019107, "grad_norm": 0.2835566572880014, "learning_rate": 1.1330782624795027e-06, "loss": 0.0044, "step": 8589 }, { "epoch": 3.908098271155596, "grad_norm": 0.17121068411044227, "learning_rate": 1.1321723362553516e-06, "loss": 0.0035, "step": 8590 }, { "epoch": 3.9085532302092814, "grad_norm": 0.2633371349136533, "learning_rate": 1.131266726092612e-06, "loss": 0.0052, "step": 8591 }, { "epoch": 3.9090081892629662, "grad_norm": 0.18550586432732438, "learning_rate": 1.1303614320652828e-06, "loss": 0.0012, "step": 8592 }, { "epoch": 3.9094631483166515, "grad_norm": 0.14134717139327005, "learning_rate": 1.1294564542473435e-06, "loss": 0.0009, "step": 8593 }, { "epoch": 3.909918107370337, "grad_norm": 0.14179189989250485, "learning_rate": 1.1285517927127438e-06, "loss": 0.0011, "step": 8594 }, { "epoch": 3.9103730664240217, "grad_norm": 0.18480139294796458, "learning_rate": 1.1276474475354077e-06, "loss": 0.0042, "step": 8595 }, { "epoch": 3.910828025477707, "grad_norm": 0.26152965664440003, "learning_rate": 1.126743418789234e-06, "loss": 0.0021, "step": 8596 }, { "epoch": 3.9112829845313923, "grad_norm": 0.25280587208806443, "learning_rate": 1.1258397065480963e-06, "loss": 0.0022, "step": 8597 }, { "epoch": 3.911737943585077, "grad_norm": 0.26530461494558766, "learning_rate": 1.124936310885844e-06, "loss": 0.0056, "step": 8598 }, { "epoch": 3.9121929026387625, "grad_norm": 0.30633216410793296, "learning_rate": 1.1240332318762964e-06, "loss": 0.0085, "step": 8599 }, { "epoch": 3.912647861692448, "grad_norm": 0.08594344662114259, "learning_rate": 1.1231304695932494e-06, "loss": 0.0016, "step": 8600 }, { "epoch": 3.9131028207461327, "grad_norm": 0.10880268538920568, "learning_rate": 1.1222280241104716e-06, "loss": 0.0012, "step": 8601 }, { "epoch": 3.913557779799818, "grad_norm": 0.5411180940024666, "learning_rate": 1.1213258955017086e-06, "loss": 0.0097, "step": 8602 }, { "epoch": 3.9140127388535033, "grad_norm": 0.2871672628937993, "learning_rate": 1.1204240838406782e-06, "loss": 0.0056, "step": 8603 }, { "epoch": 3.914467697907188, "grad_norm": 0.3146656867161822, "learning_rate": 1.1195225892010697e-06, "loss": 0.0084, "step": 8604 }, { "epoch": 3.9149226569608735, "grad_norm": 0.2352121235369682, "learning_rate": 1.118621411656553e-06, "loss": 0.002, "step": 8605 }, { "epoch": 3.915377616014559, "grad_norm": 0.09586231537241285, "learning_rate": 1.1177205512807643e-06, "loss": 0.0007, "step": 8606 }, { "epoch": 3.9158325750682437, "grad_norm": 0.21579455644758452, "learning_rate": 1.1168200081473219e-06, "loss": 0.0019, "step": 8607 }, { "epoch": 3.916287534121929, "grad_norm": 0.2781448682024323, "learning_rate": 1.1159197823298117e-06, "loss": 0.0039, "step": 8608 }, { "epoch": 3.9167424931756143, "grad_norm": 0.30543288704648824, "learning_rate": 1.115019873901797e-06, "loss": 0.0096, "step": 8609 }, { "epoch": 3.917197452229299, "grad_norm": 0.0742914789727566, "learning_rate": 1.1141202829368124e-06, "loss": 0.0011, "step": 8610 }, { "epoch": 3.9176524112829845, "grad_norm": 0.19062290156766876, "learning_rate": 1.1132210095083696e-06, "loss": 0.0068, "step": 8611 }, { "epoch": 3.91810737033667, "grad_norm": 0.26470427971734123, "learning_rate": 1.112322053689955e-06, "loss": 0.0043, "step": 8612 }, { "epoch": 3.9185623293903546, "grad_norm": 0.16327070324816795, "learning_rate": 1.111423415555025e-06, "loss": 0.0016, "step": 8613 }, { "epoch": 3.91901728844404, "grad_norm": 0.2462024846666314, "learning_rate": 1.110525095177013e-06, "loss": 0.0019, "step": 8614 }, { "epoch": 3.9194722474977253, "grad_norm": 0.18678889734389317, "learning_rate": 1.1096270926293245e-06, "loss": 0.0017, "step": 8615 }, { "epoch": 3.91992720655141, "grad_norm": 0.2198330200083498, "learning_rate": 1.1087294079853423e-06, "loss": 0.0056, "step": 8616 }, { "epoch": 3.9203821656050954, "grad_norm": 0.1754214115924083, "learning_rate": 1.10783204131842e-06, "loss": 0.0024, "step": 8617 }, { "epoch": 3.9208371246587808, "grad_norm": 0.09310448686369165, "learning_rate": 1.1069349927018858e-06, "loss": 0.0009, "step": 8618 }, { "epoch": 3.9212920837124656, "grad_norm": 0.3972172442181105, "learning_rate": 1.1060382622090437e-06, "loss": 0.0047, "step": 8619 }, { "epoch": 3.921747042766151, "grad_norm": 0.13143351817243823, "learning_rate": 1.1051418499131683e-06, "loss": 0.0011, "step": 8620 }, { "epoch": 3.9222020018198362, "grad_norm": 0.10110188430928709, "learning_rate": 1.1042457558875135e-06, "loss": 0.0012, "step": 8621 }, { "epoch": 3.922656960873521, "grad_norm": 0.16142768137410202, "learning_rate": 1.1033499802053027e-06, "loss": 0.0016, "step": 8622 }, { "epoch": 3.9231119199272064, "grad_norm": 0.34715387308860346, "learning_rate": 1.1024545229397344e-06, "loss": 0.0029, "step": 8623 }, { "epoch": 3.9235668789808917, "grad_norm": 0.39642159044255004, "learning_rate": 1.101559384163981e-06, "loss": 0.0061, "step": 8624 }, { "epoch": 3.924021838034577, "grad_norm": 0.3881289778789691, "learning_rate": 1.1006645639511881e-06, "loss": 0.0139, "step": 8625 }, { "epoch": 3.924476797088262, "grad_norm": 0.1449078177475703, "learning_rate": 1.0997700623744784e-06, "loss": 0.0012, "step": 8626 }, { "epoch": 3.9249317561419472, "grad_norm": 0.11366266129974073, "learning_rate": 1.0988758795069465e-06, "loss": 0.0006, "step": 8627 }, { "epoch": 3.9253867151956325, "grad_norm": 0.07754432110257026, "learning_rate": 1.0979820154216608e-06, "loss": 0.0005, "step": 8628 }, { "epoch": 3.9258416742493174, "grad_norm": 0.07008705442776036, "learning_rate": 1.0970884701916634e-06, "loss": 0.0009, "step": 8629 }, { "epoch": 3.9262966333030027, "grad_norm": 0.2680134664292478, "learning_rate": 1.0961952438899699e-06, "loss": 0.003, "step": 8630 }, { "epoch": 3.926751592356688, "grad_norm": 0.1646752181540853, "learning_rate": 1.0953023365895721e-06, "loss": 0.0022, "step": 8631 }, { "epoch": 3.9272065514103733, "grad_norm": 0.23522267023706603, "learning_rate": 1.094409748363433e-06, "loss": 0.0037, "step": 8632 }, { "epoch": 3.927661510464058, "grad_norm": 0.19158777285507161, "learning_rate": 1.0935174792844934e-06, "loss": 0.0061, "step": 8633 }, { "epoch": 3.9281164695177435, "grad_norm": 0.21099724199086758, "learning_rate": 1.0926255294256638e-06, "loss": 0.0031, "step": 8634 }, { "epoch": 3.928571428571429, "grad_norm": 0.12153992936245393, "learning_rate": 1.0917338988598287e-06, "loss": 0.0008, "step": 8635 }, { "epoch": 3.9290263876251137, "grad_norm": 0.47079421823001855, "learning_rate": 1.0908425876598512e-06, "loss": 0.0085, "step": 8636 }, { "epoch": 3.929481346678799, "grad_norm": 0.30617255225261963, "learning_rate": 1.0899515958985641e-06, "loss": 0.0032, "step": 8637 }, { "epoch": 3.9299363057324843, "grad_norm": 0.1920210400975485, "learning_rate": 1.0890609236487748e-06, "loss": 0.002, "step": 8638 }, { "epoch": 3.930391264786169, "grad_norm": 0.4275090553301554, "learning_rate": 1.088170570983264e-06, "loss": 0.0043, "step": 8639 }, { "epoch": 3.9308462238398545, "grad_norm": 0.16416958893908284, "learning_rate": 1.0872805379747881e-06, "loss": 0.0031, "step": 8640 }, { "epoch": 3.93130118289354, "grad_norm": 0.40803310219522615, "learning_rate": 1.0863908246960786e-06, "loss": 0.0042, "step": 8641 }, { "epoch": 3.9317561419472247, "grad_norm": 0.30289168127297866, "learning_rate": 1.085501431219837e-06, "loss": 0.0069, "step": 8642 }, { "epoch": 3.93221110100091, "grad_norm": 0.19369172892118913, "learning_rate": 1.0846123576187413e-06, "loss": 0.0037, "step": 8643 }, { "epoch": 3.9326660600545953, "grad_norm": 0.14886037559979806, "learning_rate": 1.0837236039654397e-06, "loss": 0.0021, "step": 8644 }, { "epoch": 3.93312101910828, "grad_norm": 0.05580514991875835, "learning_rate": 1.0828351703325612e-06, "loss": 0.0005, "step": 8645 }, { "epoch": 3.9335759781619655, "grad_norm": 0.20148014456043523, "learning_rate": 1.0819470567927021e-06, "loss": 0.0014, "step": 8646 }, { "epoch": 3.934030937215651, "grad_norm": 0.16647964047422256, "learning_rate": 1.0810592634184364e-06, "loss": 0.0006, "step": 8647 }, { "epoch": 3.9344858962693356, "grad_norm": 0.061531122052330224, "learning_rate": 1.08017179028231e-06, "loss": 0.0005, "step": 8648 }, { "epoch": 3.934940855323021, "grad_norm": 0.3815404935382793, "learning_rate": 1.0792846374568416e-06, "loss": 0.0102, "step": 8649 }, { "epoch": 3.9353958143767063, "grad_norm": 0.11426919162438308, "learning_rate": 1.0783978050145288e-06, "loss": 0.0009, "step": 8650 }, { "epoch": 3.935850773430391, "grad_norm": 0.49548187169426167, "learning_rate": 1.077511293027837e-06, "loss": 0.0069, "step": 8651 }, { "epoch": 3.9363057324840764, "grad_norm": 0.2875600916353048, "learning_rate": 1.0766251015692086e-06, "loss": 0.006, "step": 8652 }, { "epoch": 3.9367606915377618, "grad_norm": 0.23380131484145028, "learning_rate": 1.075739230711058e-06, "loss": 0.0074, "step": 8653 }, { "epoch": 3.9372156505914466, "grad_norm": 0.2477451242343582, "learning_rate": 1.0748536805257753e-06, "loss": 0.0034, "step": 8654 }, { "epoch": 3.937670609645132, "grad_norm": 0.1261165654044897, "learning_rate": 1.0739684510857257e-06, "loss": 0.0011, "step": 8655 }, { "epoch": 3.9381255686988172, "grad_norm": 0.26204275295761664, "learning_rate": 1.0730835424632446e-06, "loss": 0.0028, "step": 8656 }, { "epoch": 3.938580527752502, "grad_norm": 0.18508101639745386, "learning_rate": 1.0721989547306423e-06, "loss": 0.0039, "step": 8657 }, { "epoch": 3.9390354868061874, "grad_norm": 0.21769222540241673, "learning_rate": 1.0713146879602038e-06, "loss": 0.0025, "step": 8658 }, { "epoch": 3.9394904458598727, "grad_norm": 0.10034400825997478, "learning_rate": 1.0704307422241856e-06, "loss": 0.0014, "step": 8659 }, { "epoch": 3.9399454049135576, "grad_norm": 0.3513842150791386, "learning_rate": 1.0695471175948213e-06, "loss": 0.0024, "step": 8660 }, { "epoch": 3.940400363967243, "grad_norm": 0.24255234185569405, "learning_rate": 1.0686638141443184e-06, "loss": 0.0013, "step": 8661 }, { "epoch": 3.9408553230209282, "grad_norm": 0.15771631194738922, "learning_rate": 1.067780831944855e-06, "loss": 0.0032, "step": 8662 }, { "epoch": 3.941310282074613, "grad_norm": 0.20513929273822182, "learning_rate": 1.0668981710685844e-06, "loss": 0.001, "step": 8663 }, { "epoch": 3.9417652411282984, "grad_norm": 0.07935303674519927, "learning_rate": 1.0660158315876318e-06, "loss": 0.0018, "step": 8664 }, { "epoch": 3.9422202001819837, "grad_norm": 0.09263657522623965, "learning_rate": 1.0651338135741006e-06, "loss": 0.001, "step": 8665 }, { "epoch": 3.9426751592356686, "grad_norm": 0.24340118318335507, "learning_rate": 1.0642521171000653e-06, "loss": 0.0023, "step": 8666 }, { "epoch": 3.943130118289354, "grad_norm": 0.17367882379298985, "learning_rate": 1.0633707422375716e-06, "loss": 0.0018, "step": 8667 }, { "epoch": 3.943585077343039, "grad_norm": 0.07771508279610274, "learning_rate": 1.062489689058645e-06, "loss": 0.0008, "step": 8668 }, { "epoch": 3.944040036396724, "grad_norm": 0.2898127528353958, "learning_rate": 1.0616089576352774e-06, "loss": 0.0074, "step": 8669 }, { "epoch": 3.9444949954504094, "grad_norm": 0.24703088495388803, "learning_rate": 1.060728548039442e-06, "loss": 0.0026, "step": 8670 }, { "epoch": 3.9449499545040947, "grad_norm": 0.26944606176110175, "learning_rate": 1.0598484603430797e-06, "loss": 0.0032, "step": 8671 }, { "epoch": 3.9454049135577796, "grad_norm": 0.29121094145548027, "learning_rate": 1.0589686946181078e-06, "loss": 0.0023, "step": 8672 }, { "epoch": 3.945859872611465, "grad_norm": 0.20062648290840376, "learning_rate": 1.0580892509364149e-06, "loss": 0.0024, "step": 8673 }, { "epoch": 3.94631483166515, "grad_norm": 0.3499803264391899, "learning_rate": 1.0572101293698671e-06, "loss": 0.0032, "step": 8674 }, { "epoch": 3.946769790718835, "grad_norm": 0.21998699471541316, "learning_rate": 1.056331329990304e-06, "loss": 0.0021, "step": 8675 }, { "epoch": 3.9472247497725204, "grad_norm": 0.20345501118890286, "learning_rate": 1.0554528528695346e-06, "loss": 0.0038, "step": 8676 }, { "epoch": 3.9476797088262057, "grad_norm": 0.129402120440634, "learning_rate": 1.0545746980793447e-06, "loss": 0.0011, "step": 8677 }, { "epoch": 3.9481346678798905, "grad_norm": 0.17145867970903111, "learning_rate": 1.0536968656914914e-06, "loss": 0.002, "step": 8678 }, { "epoch": 3.948589626933576, "grad_norm": 0.2681611999526238, "learning_rate": 1.052819355777711e-06, "loss": 0.0064, "step": 8679 }, { "epoch": 3.949044585987261, "grad_norm": 0.254791795465932, "learning_rate": 1.051942168409707e-06, "loss": 0.0025, "step": 8680 }, { "epoch": 3.9494995450409465, "grad_norm": 0.6065577504385965, "learning_rate": 1.0510653036591583e-06, "loss": 0.0085, "step": 8681 }, { "epoch": 3.9499545040946313, "grad_norm": 0.156835419474489, "learning_rate": 1.0501887615977214e-06, "loss": 0.0011, "step": 8682 }, { "epoch": 3.9504094631483166, "grad_norm": 0.2636604698049228, "learning_rate": 1.0493125422970202e-06, "loss": 0.0054, "step": 8683 }, { "epoch": 3.950864422202002, "grad_norm": 0.13970453218294443, "learning_rate": 1.0484366458286587e-06, "loss": 0.0015, "step": 8684 }, { "epoch": 3.951319381255687, "grad_norm": 0.23198002437201626, "learning_rate": 1.0475610722642088e-06, "loss": 0.0046, "step": 8685 }, { "epoch": 3.951774340309372, "grad_norm": 0.41613148640674474, "learning_rate": 1.0466858216752195e-06, "loss": 0.0098, "step": 8686 }, { "epoch": 3.9522292993630574, "grad_norm": 0.11465905064390043, "learning_rate": 1.04581089413321e-06, "loss": 0.0011, "step": 8687 }, { "epoch": 3.9526842584167428, "grad_norm": 0.4837982363440678, "learning_rate": 1.0449362897096776e-06, "loss": 0.0046, "step": 8688 }, { "epoch": 3.9531392174704276, "grad_norm": 0.6149581324428018, "learning_rate": 1.0440620084760922e-06, "loss": 0.0052, "step": 8689 }, { "epoch": 3.953594176524113, "grad_norm": 0.29608630177877293, "learning_rate": 1.0431880505038945e-06, "loss": 0.0048, "step": 8690 }, { "epoch": 3.9540491355777982, "grad_norm": 0.6426721614861997, "learning_rate": 1.0423144158644999e-06, "loss": 0.0147, "step": 8691 }, { "epoch": 3.954504094631483, "grad_norm": 0.08263751493435481, "learning_rate": 1.0414411046292994e-06, "loss": 0.0013, "step": 8692 }, { "epoch": 3.9549590536851684, "grad_norm": 0.11949983173572434, "learning_rate": 1.040568116869653e-06, "loss": 0.0008, "step": 8693 }, { "epoch": 3.9554140127388537, "grad_norm": 0.14808758997571178, "learning_rate": 1.0396954526569014e-06, "loss": 0.001, "step": 8694 }, { "epoch": 3.9558689717925386, "grad_norm": 0.4391031935388645, "learning_rate": 1.038823112062351e-06, "loss": 0.0062, "step": 8695 }, { "epoch": 3.956323930846224, "grad_norm": 0.3449639865614129, "learning_rate": 1.0379510951572891e-06, "loss": 0.0058, "step": 8696 }, { "epoch": 3.9567788898999092, "grad_norm": 0.22060024698376487, "learning_rate": 1.037079402012971e-06, "loss": 0.0033, "step": 8697 }, { "epoch": 3.957233848953594, "grad_norm": 0.19863491841459927, "learning_rate": 1.0362080327006263e-06, "loss": 0.0014, "step": 8698 }, { "epoch": 3.9576888080072794, "grad_norm": 0.6450457156491471, "learning_rate": 1.0353369872914626e-06, "loss": 0.0092, "step": 8699 }, { "epoch": 3.9581437670609647, "grad_norm": 0.1812422798582844, "learning_rate": 1.0344662658566562e-06, "loss": 0.0029, "step": 8700 }, { "epoch": 3.9585987261146496, "grad_norm": 0.3499859683753853, "learning_rate": 1.0335958684673574e-06, "loss": 0.0048, "step": 8701 }, { "epoch": 3.959053685168335, "grad_norm": 0.33754610678884184, "learning_rate": 1.0327257951946917e-06, "loss": 0.006, "step": 8702 }, { "epoch": 3.95950864422202, "grad_norm": 0.2245844092477358, "learning_rate": 1.0318560461097577e-06, "loss": 0.0018, "step": 8703 }, { "epoch": 3.959963603275705, "grad_norm": 0.12315408931465745, "learning_rate": 1.030986621283629e-06, "loss": 0.0012, "step": 8704 }, { "epoch": 3.9604185623293904, "grad_norm": 0.12010861141309559, "learning_rate": 1.0301175207873492e-06, "loss": 0.0017, "step": 8705 }, { "epoch": 3.9608735213830757, "grad_norm": 1.2467247797502008, "learning_rate": 1.0292487446919385e-06, "loss": 0.0094, "step": 8706 }, { "epoch": 3.9613284804367606, "grad_norm": 0.1702753689624994, "learning_rate": 1.0283802930683866e-06, "loss": 0.0021, "step": 8707 }, { "epoch": 3.961783439490446, "grad_norm": 0.19280587099263943, "learning_rate": 1.0275121659876636e-06, "loss": 0.0054, "step": 8708 }, { "epoch": 3.962238398544131, "grad_norm": 0.12907783862629738, "learning_rate": 1.0266443635207052e-06, "loss": 0.0008, "step": 8709 }, { "epoch": 3.962693357597816, "grad_norm": 0.0991430962001798, "learning_rate": 1.0257768857384271e-06, "loss": 0.0012, "step": 8710 }, { "epoch": 3.9631483166515014, "grad_norm": 0.2809748907686127, "learning_rate": 1.0249097327117142e-06, "loss": 0.0057, "step": 8711 }, { "epoch": 3.9636032757051867, "grad_norm": 0.1621882457127811, "learning_rate": 1.0240429045114258e-06, "loss": 0.0016, "step": 8712 }, { "epoch": 3.9640582347588715, "grad_norm": 0.13838844775693793, "learning_rate": 1.0231764012083966e-06, "loss": 0.0017, "step": 8713 }, { "epoch": 3.964513193812557, "grad_norm": 0.2524560728919716, "learning_rate": 1.0223102228734332e-06, "loss": 0.0039, "step": 8714 }, { "epoch": 3.964968152866242, "grad_norm": 0.1490474408984266, "learning_rate": 1.0214443695773152e-06, "loss": 0.0022, "step": 8715 }, { "epoch": 3.965423111919927, "grad_norm": 0.15521693607656253, "learning_rate": 1.0205788413907952e-06, "loss": 0.0011, "step": 8716 }, { "epoch": 3.9658780709736123, "grad_norm": 0.25999108001055726, "learning_rate": 1.0197136383846013e-06, "loss": 0.0028, "step": 8717 }, { "epoch": 3.9663330300272976, "grad_norm": 0.3463144321518056, "learning_rate": 1.018848760629435e-06, "loss": 0.0068, "step": 8718 }, { "epoch": 3.9667879890809825, "grad_norm": 0.17594896148628147, "learning_rate": 1.0179842081959695e-06, "loss": 0.0028, "step": 8719 }, { "epoch": 3.967242948134668, "grad_norm": 0.04295292711472292, "learning_rate": 1.0171199811548522e-06, "loss": 0.0005, "step": 8720 }, { "epoch": 3.967697907188353, "grad_norm": 0.28654923641368224, "learning_rate": 1.0162560795767019e-06, "loss": 0.003, "step": 8721 }, { "epoch": 3.968152866242038, "grad_norm": 0.3053671988277895, "learning_rate": 1.0153925035321155e-06, "loss": 0.0102, "step": 8722 }, { "epoch": 3.9686078252957233, "grad_norm": 0.2715425283881052, "learning_rate": 1.0145292530916584e-06, "loss": 0.0025, "step": 8723 }, { "epoch": 3.9690627843494086, "grad_norm": 0.07658153294032666, "learning_rate": 1.0136663283258734e-06, "loss": 0.0005, "step": 8724 }, { "epoch": 3.9695177434030935, "grad_norm": 0.13775655789012115, "learning_rate": 1.0128037293052744e-06, "loss": 0.0012, "step": 8725 }, { "epoch": 3.969972702456779, "grad_norm": 0.2597407304558315, "learning_rate": 1.0119414561003472e-06, "loss": 0.0034, "step": 8726 }, { "epoch": 3.970427661510464, "grad_norm": 0.23116775928116218, "learning_rate": 1.0110795087815555e-06, "loss": 0.003, "step": 8727 }, { "epoch": 3.970882620564149, "grad_norm": 0.21092958003181436, "learning_rate": 1.0102178874193324e-06, "loss": 0.0041, "step": 8728 }, { "epoch": 3.9713375796178343, "grad_norm": 0.3633862961488543, "learning_rate": 1.0093565920840863e-06, "loss": 0.0028, "step": 8729 }, { "epoch": 3.9717925386715196, "grad_norm": 0.3691079082696472, "learning_rate": 1.0084956228461962e-06, "loss": 0.006, "step": 8730 }, { "epoch": 3.9722474977252045, "grad_norm": 0.14908440249447896, "learning_rate": 1.0076349797760199e-06, "loss": 0.0025, "step": 8731 }, { "epoch": 3.97270245677889, "grad_norm": 0.34556387668714117, "learning_rate": 1.0067746629438819e-06, "loss": 0.003, "step": 8732 }, { "epoch": 3.973157415832575, "grad_norm": 0.32848764113459716, "learning_rate": 1.0059146724200869e-06, "loss": 0.0094, "step": 8733 }, { "epoch": 3.9736123748862604, "grad_norm": 0.23966969552121023, "learning_rate": 1.0050550082749077e-06, "loss": 0.004, "step": 8734 }, { "epoch": 3.9740673339399453, "grad_norm": 0.21141709663998878, "learning_rate": 1.0041956705785921e-06, "loss": 0.0033, "step": 8735 }, { "epoch": 3.9745222929936306, "grad_norm": 0.34271889423516433, "learning_rate": 1.0033366594013605e-06, "loss": 0.006, "step": 8736 }, { "epoch": 3.974977252047316, "grad_norm": 0.16438627342977627, "learning_rate": 1.0024779748134077e-06, "loss": 0.0023, "step": 8737 }, { "epoch": 3.9754322111010008, "grad_norm": 0.21527411005713984, "learning_rate": 1.001619616884904e-06, "loss": 0.0033, "step": 8738 }, { "epoch": 3.975887170154686, "grad_norm": 0.1608473656316147, "learning_rate": 1.0007615856859882e-06, "loss": 0.0009, "step": 8739 }, { "epoch": 3.9763421292083714, "grad_norm": 0.3284809627554831, "learning_rate": 9.999038812867757e-07, "loss": 0.0074, "step": 8740 }, { "epoch": 3.9767970882620567, "grad_norm": 0.1462191760856074, "learning_rate": 9.990465037573522e-07, "loss": 0.0015, "step": 8741 }, { "epoch": 3.9772520473157416, "grad_norm": 0.32529967232499146, "learning_rate": 9.981894531677811e-07, "loss": 0.0019, "step": 8742 }, { "epoch": 3.977707006369427, "grad_norm": 0.1776392132258705, "learning_rate": 9.973327295880962e-07, "loss": 0.0029, "step": 8743 }, { "epoch": 3.978161965423112, "grad_norm": 0.3820563221184206, "learning_rate": 9.964763330883037e-07, "loss": 0.0058, "step": 8744 }, { "epoch": 3.978616924476797, "grad_norm": 0.30612370402455663, "learning_rate": 9.956202637383872e-07, "loss": 0.0051, "step": 8745 }, { "epoch": 3.9790718835304824, "grad_norm": 0.3191255868166321, "learning_rate": 9.947645216082969e-07, "loss": 0.0094, "step": 8746 }, { "epoch": 3.9795268425841677, "grad_norm": 0.4696820632872373, "learning_rate": 9.93909106767964e-07, "loss": 0.0103, "step": 8747 }, { "epoch": 3.9799818016378525, "grad_norm": 0.042044907583024684, "learning_rate": 9.930540192872878e-07, "loss": 0.0004, "step": 8748 }, { "epoch": 3.980436760691538, "grad_norm": 0.11795179096727024, "learning_rate": 9.921992592361417e-07, "loss": 0.0025, "step": 8749 }, { "epoch": 3.980891719745223, "grad_norm": 0.2046039392004054, "learning_rate": 9.913448266843723e-07, "loss": 0.0033, "step": 8750 }, { "epoch": 3.981346678798908, "grad_norm": 0.5841869736893668, "learning_rate": 9.904907217018e-07, "loss": 0.0086, "step": 8751 }, { "epoch": 3.9818016378525933, "grad_norm": 0.11895797035409238, "learning_rate": 9.89636944358221e-07, "loss": 0.0017, "step": 8752 }, { "epoch": 3.9822565969062786, "grad_norm": 0.2358466084469214, "learning_rate": 9.887834947233998e-07, "loss": 0.0029, "step": 8753 }, { "epoch": 3.9827115559599635, "grad_norm": 0.10090868860955018, "learning_rate": 9.879303728670769e-07, "loss": 0.0007, "step": 8754 }, { "epoch": 3.983166515013649, "grad_norm": 0.21601646307537486, "learning_rate": 9.87077578858965e-07, "loss": 0.0026, "step": 8755 }, { "epoch": 3.983621474067334, "grad_norm": 0.4228229652085819, "learning_rate": 9.862251127687517e-07, "loss": 0.0072, "step": 8756 }, { "epoch": 3.984076433121019, "grad_norm": 0.2028143509325717, "learning_rate": 9.853729746660967e-07, "loss": 0.0019, "step": 8757 }, { "epoch": 3.9845313921747043, "grad_norm": 0.033332071780349914, "learning_rate": 9.845211646206303e-07, "loss": 0.0004, "step": 8758 }, { "epoch": 3.9849863512283896, "grad_norm": 0.24264904644566826, "learning_rate": 9.836696827019626e-07, "loss": 0.0082, "step": 8759 }, { "epoch": 3.9854413102820745, "grad_norm": 0.26325127541901877, "learning_rate": 9.828185289796694e-07, "loss": 0.0039, "step": 8760 }, { "epoch": 3.98589626933576, "grad_norm": 0.3430827811238145, "learning_rate": 9.819677035233056e-07, "loss": 0.0048, "step": 8761 }, { "epoch": 3.986351228389445, "grad_norm": 0.3358286935788147, "learning_rate": 9.81117206402396e-07, "loss": 0.0034, "step": 8762 }, { "epoch": 3.98680618744313, "grad_norm": 0.12137190403464901, "learning_rate": 9.802670376864388e-07, "loss": 0.0011, "step": 8763 }, { "epoch": 3.9872611464968153, "grad_norm": 0.14336226362938656, "learning_rate": 9.794171974449067e-07, "loss": 0.0009, "step": 8764 }, { "epoch": 3.9877161055505006, "grad_norm": 0.11599236315923966, "learning_rate": 9.785676857472421e-07, "loss": 0.0009, "step": 8765 }, { "epoch": 3.9881710646041855, "grad_norm": 0.3456247368378552, "learning_rate": 9.777185026628676e-07, "loss": 0.0017, "step": 8766 }, { "epoch": 3.988626023657871, "grad_norm": 0.029247647986222928, "learning_rate": 9.768696482611728e-07, "loss": 0.0002, "step": 8767 }, { "epoch": 3.989080982711556, "grad_norm": 0.10692913485883968, "learning_rate": 9.760211226115224e-07, "loss": 0.0015, "step": 8768 }, { "epoch": 3.989535941765241, "grad_norm": 0.23407586241030387, "learning_rate": 9.751729257832532e-07, "loss": 0.0021, "step": 8769 }, { "epoch": 3.9899909008189263, "grad_norm": 0.12988169044897596, "learning_rate": 9.743250578456752e-07, "loss": 0.0013, "step": 8770 }, { "epoch": 3.9904458598726116, "grad_norm": 0.4082665644860765, "learning_rate": 9.734775188680756e-07, "loss": 0.0059, "step": 8771 }, { "epoch": 3.9909008189262964, "grad_norm": 0.16520521011067488, "learning_rate": 9.726303089197082e-07, "loss": 0.0017, "step": 8772 }, { "epoch": 3.9913557779799818, "grad_norm": 0.19565360378040844, "learning_rate": 9.717834280698052e-07, "loss": 0.0017, "step": 8773 }, { "epoch": 3.991810737033667, "grad_norm": 0.226198635897474, "learning_rate": 9.709368763875693e-07, "loss": 0.003, "step": 8774 }, { "epoch": 3.992265696087352, "grad_norm": 0.17248274805901515, "learning_rate": 9.700906539421756e-07, "loss": 0.0018, "step": 8775 }, { "epoch": 3.9927206551410372, "grad_norm": 0.3293312271683953, "learning_rate": 9.692447608027767e-07, "loss": 0.0074, "step": 8776 }, { "epoch": 3.9931756141947226, "grad_norm": 0.30023329038908825, "learning_rate": 9.683991970384926e-07, "loss": 0.0073, "step": 8777 }, { "epoch": 3.9936305732484074, "grad_norm": 0.1969723331175485, "learning_rate": 9.675539627184194e-07, "loss": 0.0032, "step": 8778 }, { "epoch": 3.9940855323020927, "grad_norm": 0.04680802015481922, "learning_rate": 9.66709057911625e-07, "loss": 0.0004, "step": 8779 }, { "epoch": 3.994540491355778, "grad_norm": 0.29455246796677664, "learning_rate": 9.658644826871521e-07, "loss": 0.0079, "step": 8780 }, { "epoch": 3.994995450409463, "grad_norm": 0.1850009513454034, "learning_rate": 9.65020237114017e-07, "loss": 0.0016, "step": 8781 }, { "epoch": 3.9954504094631482, "grad_norm": 0.16219225526352227, "learning_rate": 9.641763212612065e-07, "loss": 0.0028, "step": 8782 }, { "epoch": 3.9959053685168335, "grad_norm": 0.02569460864892824, "learning_rate": 9.633327351976812e-07, "loss": 0.0002, "step": 8783 }, { "epoch": 3.9963603275705184, "grad_norm": 0.2568372854164518, "learning_rate": 9.62489478992374e-07, "loss": 0.0023, "step": 8784 }, { "epoch": 3.9968152866242037, "grad_norm": 0.14499148233375353, "learning_rate": 9.616465527141944e-07, "loss": 0.0012, "step": 8785 }, { "epoch": 3.997270245677889, "grad_norm": 0.309844440479213, "learning_rate": 9.60803956432021e-07, "loss": 0.0029, "step": 8786 }, { "epoch": 3.997725204731574, "grad_norm": 0.3518639185091935, "learning_rate": 9.599616902147079e-07, "loss": 0.0026, "step": 8787 }, { "epoch": 3.998180163785259, "grad_norm": 0.2909013371052998, "learning_rate": 9.591197541310815e-07, "loss": 0.0067, "step": 8788 }, { "epoch": 3.9986351228389445, "grad_norm": 0.11106972171052319, "learning_rate": 9.582781482499382e-07, "loss": 0.0007, "step": 8789 }, { "epoch": 3.99909008189263, "grad_norm": 0.12271487168672769, "learning_rate": 9.574368726400546e-07, "loss": 0.0022, "step": 8790 }, { "epoch": 3.9995450409463147, "grad_norm": 0.1912244248318796, "learning_rate": 9.565959273701731e-07, "loss": 0.0026, "step": 8791 }, { "epoch": 4.0, "grad_norm": 0.2508029985197783, "learning_rate": 9.557553125090125e-07, "loss": 0.003, "step": 8792 }, { "epoch": 4.000454959053685, "grad_norm": 0.16629222058490714, "learning_rate": 9.549150281252633e-07, "loss": 0.001, "step": 8793 }, { "epoch": 4.000909918107371, "grad_norm": 0.05719935228050158, "learning_rate": 9.540750742875905e-07, "loss": 0.0004, "step": 8794 }, { "epoch": 4.0013648771610555, "grad_norm": 0.06236427885770259, "learning_rate": 9.532354510646324e-07, "loss": 0.0003, "step": 8795 }, { "epoch": 4.00181983621474, "grad_norm": 0.12152999402084956, "learning_rate": 9.52396158524998e-07, "loss": 0.0022, "step": 8796 }, { "epoch": 4.002274795268426, "grad_norm": 0.05899456985133848, "learning_rate": 9.515571967372711e-07, "loss": 0.0007, "step": 8797 }, { "epoch": 4.002729754322111, "grad_norm": 0.08540582284877513, "learning_rate": 9.507185657700063e-07, "loss": 0.0008, "step": 8798 }, { "epoch": 4.003184713375796, "grad_norm": 0.03038397634794022, "learning_rate": 9.49880265691735e-07, "loss": 0.0002, "step": 8799 }, { "epoch": 4.003639672429482, "grad_norm": 0.09453161125851471, "learning_rate": 9.490422965709567e-07, "loss": 0.0012, "step": 8800 }, { "epoch": 4.0040946314831665, "grad_norm": 0.023004927038856763, "learning_rate": 9.482046584761496e-07, "loss": 0.0002, "step": 8801 }, { "epoch": 4.004549590536851, "grad_norm": 0.05805642327320453, "learning_rate": 9.473673514757597e-07, "loss": 0.001, "step": 8802 }, { "epoch": 4.005004549590537, "grad_norm": 0.10417321598418562, "learning_rate": 9.465303756382089e-07, "loss": 0.0009, "step": 8803 }, { "epoch": 4.005459508644222, "grad_norm": 0.060553228852067764, "learning_rate": 9.456937310318887e-07, "loss": 0.0008, "step": 8804 }, { "epoch": 4.005914467697907, "grad_norm": 0.11718271171656236, "learning_rate": 9.44857417725169e-07, "loss": 0.0019, "step": 8805 }, { "epoch": 4.006369426751593, "grad_norm": 0.1525982819267108, "learning_rate": 9.440214357863886e-07, "loss": 0.0011, "step": 8806 }, { "epoch": 4.0068243858052774, "grad_norm": 0.15186714338180338, "learning_rate": 9.431857852838583e-07, "loss": 0.0017, "step": 8807 }, { "epoch": 4.007279344858962, "grad_norm": 0.056470550793833195, "learning_rate": 9.423504662858668e-07, "loss": 0.0008, "step": 8808 }, { "epoch": 4.007734303912648, "grad_norm": 0.03213406387937676, "learning_rate": 9.415154788606695e-07, "loss": 0.0004, "step": 8809 }, { "epoch": 4.008189262966333, "grad_norm": 0.10789827861267955, "learning_rate": 9.406808230765003e-07, "loss": 0.0017, "step": 8810 }, { "epoch": 4.008644222020018, "grad_norm": 0.03326499372539869, "learning_rate": 9.398464990015632e-07, "loss": 0.0003, "step": 8811 }, { "epoch": 4.0090991810737036, "grad_norm": 0.13732606415092802, "learning_rate": 9.39012506704034e-07, "loss": 0.0009, "step": 8812 }, { "epoch": 4.009554140127388, "grad_norm": 0.06052085080997063, "learning_rate": 9.381788462520625e-07, "loss": 0.0007, "step": 8813 }, { "epoch": 4.010009099181073, "grad_norm": 0.03687619251212416, "learning_rate": 9.37345517713773e-07, "loss": 0.0006, "step": 8814 }, { "epoch": 4.010464058234759, "grad_norm": 0.08696989433821824, "learning_rate": 9.365125211572618e-07, "loss": 0.0006, "step": 8815 }, { "epoch": 4.010919017288444, "grad_norm": 0.10067405821078737, "learning_rate": 9.35679856650597e-07, "loss": 0.001, "step": 8816 }, { "epoch": 4.011373976342129, "grad_norm": 0.29326634481674474, "learning_rate": 9.3484752426182e-07, "loss": 0.0031, "step": 8817 }, { "epoch": 4.0118289353958145, "grad_norm": 0.02722245843376445, "learning_rate": 9.340155240589438e-07, "loss": 0.0004, "step": 8818 }, { "epoch": 4.012283894449499, "grad_norm": 0.08483441130339463, "learning_rate": 9.331838561099588e-07, "loss": 0.0008, "step": 8819 }, { "epoch": 4.012738853503185, "grad_norm": 0.2814341617235241, "learning_rate": 9.323525204828232e-07, "loss": 0.0018, "step": 8820 }, { "epoch": 4.01319381255687, "grad_norm": 0.03885371775112174, "learning_rate": 9.315215172454689e-07, "loss": 0.0005, "step": 8821 }, { "epoch": 4.013648771610555, "grad_norm": 0.0801769474773646, "learning_rate": 9.306908464658049e-07, "loss": 0.0014, "step": 8822 }, { "epoch": 4.014103730664241, "grad_norm": 0.13932865631304017, "learning_rate": 9.298605082117062e-07, "loss": 0.0024, "step": 8823 }, { "epoch": 4.0145586897179255, "grad_norm": 0.1737163295052175, "learning_rate": 9.290305025510282e-07, "loss": 0.0023, "step": 8824 }, { "epoch": 4.01501364877161, "grad_norm": 0.059422585767260366, "learning_rate": 9.282008295515926e-07, "loss": 0.0006, "step": 8825 }, { "epoch": 4.015468607825296, "grad_norm": 0.05645889787498745, "learning_rate": 9.273714892811975e-07, "loss": 0.0007, "step": 8826 }, { "epoch": 4.015923566878981, "grad_norm": 0.1218436780245377, "learning_rate": 9.265424818076107e-07, "loss": 0.0027, "step": 8827 }, { "epoch": 4.016378525932666, "grad_norm": 0.06894785952639247, "learning_rate": 9.257138071985771e-07, "loss": 0.0005, "step": 8828 }, { "epoch": 4.016833484986352, "grad_norm": 0.02303475730372948, "learning_rate": 9.248854655218131e-07, "loss": 0.0002, "step": 8829 }, { "epoch": 4.0172884440400365, "grad_norm": 0.01997278340019789, "learning_rate": 9.240574568450056e-07, "loss": 0.0002, "step": 8830 }, { "epoch": 4.017743403093721, "grad_norm": 0.04785533783856743, "learning_rate": 9.232297812358166e-07, "loss": 0.0006, "step": 8831 }, { "epoch": 4.018198362147407, "grad_norm": 0.043167284016806806, "learning_rate": 9.224024387618774e-07, "loss": 0.0004, "step": 8832 }, { "epoch": 4.018653321201092, "grad_norm": 0.0715606613607931, "learning_rate": 9.21575429490798e-07, "loss": 0.0003, "step": 8833 }, { "epoch": 4.019108280254777, "grad_norm": 0.05578037900399803, "learning_rate": 9.207487534901565e-07, "loss": 0.0006, "step": 8834 }, { "epoch": 4.019563239308463, "grad_norm": 0.055469182131497125, "learning_rate": 9.19922410827504e-07, "loss": 0.0003, "step": 8835 }, { "epoch": 4.0200181983621475, "grad_norm": 0.18119416154441015, "learning_rate": 9.190964015703679e-07, "loss": 0.003, "step": 8836 }, { "epoch": 4.020473157415832, "grad_norm": 0.03358839125611996, "learning_rate": 9.182707257862444e-07, "loss": 0.0003, "step": 8837 }, { "epoch": 4.020928116469518, "grad_norm": 0.09534904427799977, "learning_rate": 9.174453835426034e-07, "loss": 0.0023, "step": 8838 }, { "epoch": 4.021383075523203, "grad_norm": 0.3361905090875473, "learning_rate": 9.166203749068897e-07, "loss": 0.0014, "step": 8839 }, { "epoch": 4.021838034576888, "grad_norm": 0.049679718973019385, "learning_rate": 9.157956999465189e-07, "loss": 0.0005, "step": 8840 }, { "epoch": 4.022292993630574, "grad_norm": 0.02174356134841266, "learning_rate": 9.149713587288795e-07, "loss": 0.0002, "step": 8841 }, { "epoch": 4.022747952684258, "grad_norm": 0.1900377877472236, "learning_rate": 9.141473513213317e-07, "loss": 0.002, "step": 8842 }, { "epoch": 4.023202911737943, "grad_norm": 0.07960843514729345, "learning_rate": 9.133236777912107e-07, "loss": 0.0006, "step": 8843 }, { "epoch": 4.023657870791629, "grad_norm": 0.02187814862049447, "learning_rate": 9.125003382058245e-07, "loss": 0.0003, "step": 8844 }, { "epoch": 4.024112829845314, "grad_norm": 0.027882909629560287, "learning_rate": 9.116773326324518e-07, "loss": 0.0003, "step": 8845 }, { "epoch": 4.024567788898999, "grad_norm": 0.07110757169703331, "learning_rate": 9.10854661138345e-07, "loss": 0.0006, "step": 8846 }, { "epoch": 4.0250227479526846, "grad_norm": 0.022934152761742876, "learning_rate": 9.100323237907272e-07, "loss": 0.0002, "step": 8847 }, { "epoch": 4.025477707006369, "grad_norm": 0.0357790498890144, "learning_rate": 9.092103206567993e-07, "loss": 0.0005, "step": 8848 }, { "epoch": 4.025932666060054, "grad_norm": 0.03831240959878874, "learning_rate": 9.083886518037288e-07, "loss": 0.0004, "step": 8849 }, { "epoch": 4.02638762511374, "grad_norm": 0.03401280425837219, "learning_rate": 9.075673172986615e-07, "loss": 0.0004, "step": 8850 }, { "epoch": 4.026842584167425, "grad_norm": 0.1535679340541849, "learning_rate": 9.067463172087115e-07, "loss": 0.0024, "step": 8851 }, { "epoch": 4.02729754322111, "grad_norm": 0.016958990476787006, "learning_rate": 9.059256516009662e-07, "loss": 0.0002, "step": 8852 }, { "epoch": 4.0277525022747955, "grad_norm": 0.06832549703698518, "learning_rate": 9.051053205424898e-07, "loss": 0.0009, "step": 8853 }, { "epoch": 4.02820746132848, "grad_norm": 0.034531653057010385, "learning_rate": 9.042853241003136e-07, "loss": 0.0003, "step": 8854 }, { "epoch": 4.028662420382165, "grad_norm": 0.11969217982314607, "learning_rate": 9.034656623414451e-07, "loss": 0.0012, "step": 8855 }, { "epoch": 4.029117379435851, "grad_norm": 0.05900885201897496, "learning_rate": 9.026463353328613e-07, "loss": 0.0008, "step": 8856 }, { "epoch": 4.029572338489536, "grad_norm": 0.012944271170558883, "learning_rate": 9.018273431415159e-07, "loss": 0.0001, "step": 8857 }, { "epoch": 4.030027297543221, "grad_norm": 0.03252348541151254, "learning_rate": 9.010086858343337e-07, "loss": 0.0003, "step": 8858 }, { "epoch": 4.0304822565969065, "grad_norm": 0.010421336056305555, "learning_rate": 9.00190363478211e-07, "loss": 0.0001, "step": 8859 }, { "epoch": 4.030937215650591, "grad_norm": 0.0385043168603255, "learning_rate": 8.99372376140017e-07, "loss": 0.0004, "step": 8860 }, { "epoch": 4.031392174704276, "grad_norm": 0.021166918386738422, "learning_rate": 8.985547238865932e-07, "loss": 0.0001, "step": 8861 }, { "epoch": 4.031847133757962, "grad_norm": 0.06351162398142775, "learning_rate": 8.977374067847566e-07, "loss": 0.0007, "step": 8862 }, { "epoch": 4.032302092811647, "grad_norm": 0.014223236880788444, "learning_rate": 8.96920424901292e-07, "loss": 0.0001, "step": 8863 }, { "epoch": 4.032757051865332, "grad_norm": 0.2142073923819191, "learning_rate": 8.961037783029619e-07, "loss": 0.0018, "step": 8864 }, { "epoch": 4.0332120109190175, "grad_norm": 0.03503072632918302, "learning_rate": 8.952874670564987e-07, "loss": 0.0005, "step": 8865 }, { "epoch": 4.033666969972702, "grad_norm": 0.056155964747450354, "learning_rate": 8.944714912286051e-07, "loss": 0.0003, "step": 8866 }, { "epoch": 4.034121929026387, "grad_norm": 0.1566431753653592, "learning_rate": 8.936558508859627e-07, "loss": 0.0011, "step": 8867 }, { "epoch": 4.034576888080073, "grad_norm": 0.0293734711298181, "learning_rate": 8.9284054609522e-07, "loss": 0.0002, "step": 8868 }, { "epoch": 4.035031847133758, "grad_norm": 0.12108081780178453, "learning_rate": 8.920255769229996e-07, "loss": 0.0015, "step": 8869 }, { "epoch": 4.035486806187443, "grad_norm": 0.038777185050657005, "learning_rate": 8.912109434358967e-07, "loss": 0.0002, "step": 8870 }, { "epoch": 4.0359417652411285, "grad_norm": 0.06502099659443437, "learning_rate": 8.903966457004804e-07, "loss": 0.0004, "step": 8871 }, { "epoch": 4.036396724294813, "grad_norm": 0.10969463517955318, "learning_rate": 8.895826837832928e-07, "loss": 0.0004, "step": 8872 }, { "epoch": 4.036851683348498, "grad_norm": 0.1924762552943705, "learning_rate": 8.887690577508451e-07, "loss": 0.001, "step": 8873 }, { "epoch": 4.037306642402184, "grad_norm": 0.022684012443389672, "learning_rate": 8.879557676696243e-07, "loss": 0.0002, "step": 8874 }, { "epoch": 4.037761601455869, "grad_norm": 0.017218707485729432, "learning_rate": 8.871428136060883e-07, "loss": 0.0001, "step": 8875 }, { "epoch": 4.038216560509555, "grad_norm": 0.08411827591407593, "learning_rate": 8.863301956266673e-07, "loss": 0.0007, "step": 8876 }, { "epoch": 4.038671519563239, "grad_norm": 0.0490676564588678, "learning_rate": 8.855179137977649e-07, "loss": 0.0005, "step": 8877 }, { "epoch": 4.039126478616924, "grad_norm": 0.025097860062216235, "learning_rate": 8.847059681857595e-07, "loss": 0.0003, "step": 8878 }, { "epoch": 4.03958143767061, "grad_norm": 0.03299646778254466, "learning_rate": 8.838943588569976e-07, "loss": 0.0003, "step": 8879 }, { "epoch": 4.040036396724295, "grad_norm": 0.044524123147625376, "learning_rate": 8.830830858778001e-07, "loss": 0.0003, "step": 8880 }, { "epoch": 4.04049135577798, "grad_norm": 0.02142811088835668, "learning_rate": 8.822721493144603e-07, "loss": 0.0002, "step": 8881 }, { "epoch": 4.0409463148316656, "grad_norm": 0.0543461562792307, "learning_rate": 8.814615492332462e-07, "loss": 0.0004, "step": 8882 }, { "epoch": 4.04140127388535, "grad_norm": 0.24011374656624696, "learning_rate": 8.806512857003951e-07, "loss": 0.0007, "step": 8883 }, { "epoch": 4.041856232939035, "grad_norm": 0.0538549023953534, "learning_rate": 8.798413587821164e-07, "loss": 0.0003, "step": 8884 }, { "epoch": 4.042311191992721, "grad_norm": 0.14560340598590216, "learning_rate": 8.790317685445971e-07, "loss": 0.0027, "step": 8885 }, { "epoch": 4.042766151046406, "grad_norm": 0.15049845263483105, "learning_rate": 8.782225150539903e-07, "loss": 0.0026, "step": 8886 }, { "epoch": 4.043221110100091, "grad_norm": 0.045227957975040284, "learning_rate": 8.77413598376427e-07, "loss": 0.0003, "step": 8887 }, { "epoch": 4.0436760691537765, "grad_norm": 0.02505410553777718, "learning_rate": 8.766050185780067e-07, "loss": 0.0002, "step": 8888 }, { "epoch": 4.044131028207461, "grad_norm": 0.032392681875533055, "learning_rate": 8.757967757248037e-07, "loss": 0.0002, "step": 8889 }, { "epoch": 4.044585987261146, "grad_norm": 0.04228755254587372, "learning_rate": 8.749888698828618e-07, "loss": 0.0008, "step": 8890 }, { "epoch": 4.045040946314832, "grad_norm": 0.06947840080519249, "learning_rate": 8.741813011182015e-07, "loss": 0.0007, "step": 8891 }, { "epoch": 4.045495905368517, "grad_norm": 0.027067513906722015, "learning_rate": 8.73374069496814e-07, "loss": 0.0003, "step": 8892 }, { "epoch": 4.045950864422202, "grad_norm": 0.05425715690999819, "learning_rate": 8.725671750846621e-07, "loss": 0.0004, "step": 8893 }, { "epoch": 4.0464058234758875, "grad_norm": 0.12118559688524527, "learning_rate": 8.717606179476811e-07, "loss": 0.001, "step": 8894 }, { "epoch": 4.046860782529572, "grad_norm": 0.016646865707413114, "learning_rate": 8.709543981517787e-07, "loss": 0.0001, "step": 8895 }, { "epoch": 4.047315741583257, "grad_norm": 0.08585894100632072, "learning_rate": 8.70148515762837e-07, "loss": 0.0009, "step": 8896 }, { "epoch": 4.047770700636943, "grad_norm": 0.09019787343404101, "learning_rate": 8.693429708467089e-07, "loss": 0.0013, "step": 8897 }, { "epoch": 4.048225659690628, "grad_norm": 0.1757845738174576, "learning_rate": 8.685377634692177e-07, "loss": 0.002, "step": 8898 }, { "epoch": 4.048680618744313, "grad_norm": 0.2648062681391176, "learning_rate": 8.677328936961643e-07, "loss": 0.0018, "step": 8899 }, { "epoch": 4.0491355777979985, "grad_norm": 0.04054065784768181, "learning_rate": 8.669283615933161e-07, "loss": 0.0002, "step": 8900 }, { "epoch": 4.049590536851683, "grad_norm": 0.028120667044778246, "learning_rate": 8.661241672264193e-07, "loss": 0.0001, "step": 8901 }, { "epoch": 4.050045495905368, "grad_norm": 0.20649495046568908, "learning_rate": 8.653203106611868e-07, "loss": 0.0012, "step": 8902 }, { "epoch": 4.050500454959054, "grad_norm": 0.07667435440191665, "learning_rate": 8.645167919633063e-07, "loss": 0.0004, "step": 8903 }, { "epoch": 4.050955414012739, "grad_norm": 0.18083107846473992, "learning_rate": 8.637136111984368e-07, "loss": 0.0044, "step": 8904 }, { "epoch": 4.051410373066424, "grad_norm": 0.04045093253658398, "learning_rate": 8.629107684322113e-07, "loss": 0.0003, "step": 8905 }, { "epoch": 4.0518653321201095, "grad_norm": 0.051477488497152786, "learning_rate": 8.621082637302369e-07, "loss": 0.0006, "step": 8906 }, { "epoch": 4.052320291173794, "grad_norm": 0.09126540470406105, "learning_rate": 8.613060971580878e-07, "loss": 0.0005, "step": 8907 }, { "epoch": 4.052775250227479, "grad_norm": 0.029071446374448463, "learning_rate": 8.605042687813148e-07, "loss": 0.0003, "step": 8908 }, { "epoch": 4.053230209281165, "grad_norm": 0.04933030792822622, "learning_rate": 8.597027786654388e-07, "loss": 0.0006, "step": 8909 }, { "epoch": 4.05368516833485, "grad_norm": 0.06666459880024085, "learning_rate": 8.589016268759537e-07, "loss": 0.0006, "step": 8910 }, { "epoch": 4.054140127388535, "grad_norm": 0.022547665925337156, "learning_rate": 8.581008134783275e-07, "loss": 0.0002, "step": 8911 }, { "epoch": 4.05459508644222, "grad_norm": 0.015496997476280678, "learning_rate": 8.573003385379969e-07, "loss": 0.0001, "step": 8912 }, { "epoch": 4.055050045495905, "grad_norm": 0.05036702898357478, "learning_rate": 8.565002021203755e-07, "loss": 0.0007, "step": 8913 }, { "epoch": 4.05550500454959, "grad_norm": 0.02838668507755251, "learning_rate": 8.557004042908457e-07, "loss": 0.0002, "step": 8914 }, { "epoch": 4.055959963603276, "grad_norm": 0.031068534901867645, "learning_rate": 8.549009451147622e-07, "loss": 0.0003, "step": 8915 }, { "epoch": 4.056414922656961, "grad_norm": 0.020284579951607155, "learning_rate": 8.541018246574556e-07, "loss": 0.0002, "step": 8916 }, { "epoch": 4.056869881710646, "grad_norm": 0.07834446973243235, "learning_rate": 8.533030429842254e-07, "loss": 0.001, "step": 8917 }, { "epoch": 4.057324840764331, "grad_norm": 0.016110756599252412, "learning_rate": 8.525046001603437e-07, "loss": 0.0002, "step": 8918 }, { "epoch": 4.057779799818016, "grad_norm": 0.02754572370681331, "learning_rate": 8.517064962510552e-07, "loss": 0.0003, "step": 8919 }, { "epoch": 4.058234758871701, "grad_norm": 0.016533551553972476, "learning_rate": 8.509087313215786e-07, "loss": 0.0002, "step": 8920 }, { "epoch": 4.058689717925387, "grad_norm": 0.013175663004793819, "learning_rate": 8.501113054371041e-07, "loss": 0.0001, "step": 8921 }, { "epoch": 4.059144676979072, "grad_norm": 0.06306415838667524, "learning_rate": 8.493142186627934e-07, "loss": 0.0008, "step": 8922 }, { "epoch": 4.059599636032757, "grad_norm": 0.03610607875264149, "learning_rate": 8.485174710637801e-07, "loss": 0.0004, "step": 8923 }, { "epoch": 4.060054595086442, "grad_norm": 0.06247731857500946, "learning_rate": 8.477210627051702e-07, "loss": 0.0008, "step": 8924 }, { "epoch": 4.060509554140127, "grad_norm": 0.09650801477478864, "learning_rate": 8.469249936520446e-07, "loss": 0.0015, "step": 8925 }, { "epoch": 4.060964513193812, "grad_norm": 0.02882791154111998, "learning_rate": 8.461292639694519e-07, "loss": 0.0002, "step": 8926 }, { "epoch": 4.061419472247498, "grad_norm": 0.08301781116966915, "learning_rate": 8.453338737224187e-07, "loss": 0.0009, "step": 8927 }, { "epoch": 4.061874431301183, "grad_norm": 0.1050407721071887, "learning_rate": 8.445388229759388e-07, "loss": 0.0015, "step": 8928 }, { "epoch": 4.0623293903548685, "grad_norm": 0.10116590468987932, "learning_rate": 8.437441117949791e-07, "loss": 0.0006, "step": 8929 }, { "epoch": 4.062784349408553, "grad_norm": 0.10710108256887711, "learning_rate": 8.429497402444825e-07, "loss": 0.0012, "step": 8930 }, { "epoch": 4.063239308462238, "grad_norm": 0.14542571030514667, "learning_rate": 8.4215570838936e-07, "loss": 0.0028, "step": 8931 }, { "epoch": 4.063694267515924, "grad_norm": 0.04838768933729137, "learning_rate": 8.413620162944963e-07, "loss": 0.0004, "step": 8932 }, { "epoch": 4.064149226569609, "grad_norm": 0.025306242244517784, "learning_rate": 8.405686640247473e-07, "loss": 0.0001, "step": 8933 }, { "epoch": 4.064604185623294, "grad_norm": 0.06418806936722982, "learning_rate": 8.397756516449429e-07, "loss": 0.0007, "step": 8934 }, { "epoch": 4.0650591446769795, "grad_norm": 0.023924411571602452, "learning_rate": 8.389829792198867e-07, "loss": 0.0002, "step": 8935 }, { "epoch": 4.065514103730664, "grad_norm": 0.09422613042342526, "learning_rate": 8.381906468143497e-07, "loss": 0.0014, "step": 8936 }, { "epoch": 4.065969062784349, "grad_norm": 0.09464187290411909, "learning_rate": 8.37398654493079e-07, "loss": 0.0011, "step": 8937 }, { "epoch": 4.066424021838035, "grad_norm": 0.023980285812512162, "learning_rate": 8.366070023207906e-07, "loss": 0.0002, "step": 8938 }, { "epoch": 4.06687898089172, "grad_norm": 0.04353005793073376, "learning_rate": 8.358156903621778e-07, "loss": 0.0002, "step": 8939 }, { "epoch": 4.067333939945405, "grad_norm": 0.11476803805506165, "learning_rate": 8.350247186818999e-07, "loss": 0.0017, "step": 8940 }, { "epoch": 4.0677888989990905, "grad_norm": 0.048915106122024446, "learning_rate": 8.342340873445948e-07, "loss": 0.0009, "step": 8941 }, { "epoch": 4.068243858052775, "grad_norm": 0.06616594207739772, "learning_rate": 8.334437964148673e-07, "loss": 0.0005, "step": 8942 }, { "epoch": 4.06869881710646, "grad_norm": 0.043982404447476434, "learning_rate": 8.326538459572953e-07, "loss": 0.0003, "step": 8943 }, { "epoch": 4.069153776160146, "grad_norm": 0.025750021686192535, "learning_rate": 8.318642360364332e-07, "loss": 0.0003, "step": 8944 }, { "epoch": 4.069608735213831, "grad_norm": 0.16312377108904488, "learning_rate": 8.310749667168022e-07, "loss": 0.0026, "step": 8945 }, { "epoch": 4.070063694267516, "grad_norm": 0.07553269579476078, "learning_rate": 8.302860380628985e-07, "loss": 0.0011, "step": 8946 }, { "epoch": 4.070518653321201, "grad_norm": 0.023597852511622334, "learning_rate": 8.294974501391884e-07, "loss": 0.0002, "step": 8947 }, { "epoch": 4.070973612374886, "grad_norm": 0.019405493109933992, "learning_rate": 8.287092030101135e-07, "loss": 0.0001, "step": 8948 }, { "epoch": 4.071428571428571, "grad_norm": 0.029065838050355623, "learning_rate": 8.279212967400846e-07, "loss": 0.0002, "step": 8949 }, { "epoch": 4.071883530482257, "grad_norm": 0.11550024324451541, "learning_rate": 8.271337313934869e-07, "loss": 0.0004, "step": 8950 }, { "epoch": 4.072338489535942, "grad_norm": 0.08340750659124711, "learning_rate": 8.263465070346765e-07, "loss": 0.0009, "step": 8951 }, { "epoch": 4.072793448589627, "grad_norm": 0.009318842729009603, "learning_rate": 8.255596237279817e-07, "loss": 0.0001, "step": 8952 }, { "epoch": 4.073248407643312, "grad_norm": 0.05436080997286999, "learning_rate": 8.247730815377014e-07, "loss": 0.0002, "step": 8953 }, { "epoch": 4.073703366696997, "grad_norm": 0.12221853508932347, "learning_rate": 8.239868805281098e-07, "loss": 0.0006, "step": 8954 }, { "epoch": 4.074158325750682, "grad_norm": 0.23702379713841448, "learning_rate": 8.232010207634527e-07, "loss": 0.004, "step": 8955 }, { "epoch": 4.074613284804368, "grad_norm": 0.03331357686505322, "learning_rate": 8.224155023079461e-07, "loss": 0.0003, "step": 8956 }, { "epoch": 4.075068243858053, "grad_norm": 0.18943491853875813, "learning_rate": 8.216303252257791e-07, "loss": 0.0024, "step": 8957 }, { "epoch": 4.075523202911738, "grad_norm": 0.006566208299789381, "learning_rate": 8.208454895811108e-07, "loss": 0.0001, "step": 8958 }, { "epoch": 4.075978161965423, "grad_norm": 0.12251875752424424, "learning_rate": 8.200609954380778e-07, "loss": 0.0013, "step": 8959 }, { "epoch": 4.076433121019108, "grad_norm": 0.10692732317604908, "learning_rate": 8.192768428607839e-07, "loss": 0.0016, "step": 8960 }, { "epoch": 4.076888080072793, "grad_norm": 0.028289276831684338, "learning_rate": 8.18493031913305e-07, "loss": 0.0002, "step": 8961 }, { "epoch": 4.077343039126479, "grad_norm": 0.019305883017379562, "learning_rate": 8.177095626596932e-07, "loss": 0.0001, "step": 8962 }, { "epoch": 4.077797998180164, "grad_norm": 0.13520783450211288, "learning_rate": 8.169264351639672e-07, "loss": 0.0007, "step": 8963 }, { "epoch": 4.078252957233849, "grad_norm": 0.014133256391491447, "learning_rate": 8.161436494901242e-07, "loss": 0.0001, "step": 8964 }, { "epoch": 4.078707916287534, "grad_norm": 0.03690999317822864, "learning_rate": 8.153612057021276e-07, "loss": 0.0003, "step": 8965 }, { "epoch": 4.079162875341219, "grad_norm": 0.026264170966425613, "learning_rate": 8.145791038639161e-07, "loss": 0.0003, "step": 8966 }, { "epoch": 4.079617834394904, "grad_norm": 0.009964548093883467, "learning_rate": 8.137973440393976e-07, "loss": 0.0001, "step": 8967 }, { "epoch": 4.08007279344859, "grad_norm": 0.1445759254143165, "learning_rate": 8.130159262924553e-07, "loss": 0.0048, "step": 8968 }, { "epoch": 4.080527752502275, "grad_norm": 0.1467094810700478, "learning_rate": 8.122348506869448e-07, "loss": 0.0013, "step": 8969 }, { "epoch": 4.08098271155596, "grad_norm": 0.07713309591746362, "learning_rate": 8.114541172866902e-07, "loss": 0.0008, "step": 8970 }, { "epoch": 4.081437670609645, "grad_norm": 0.1432465944909815, "learning_rate": 8.106737261554897e-07, "loss": 0.0006, "step": 8971 }, { "epoch": 4.08189262966333, "grad_norm": 0.015942642563481164, "learning_rate": 8.098936773571126e-07, "loss": 0.0001, "step": 8972 }, { "epoch": 4.082347588717015, "grad_norm": 0.025928907252859866, "learning_rate": 8.091139709553031e-07, "loss": 0.0003, "step": 8973 }, { "epoch": 4.082802547770701, "grad_norm": 0.027104632299115566, "learning_rate": 8.083346070137737e-07, "loss": 0.0002, "step": 8974 }, { "epoch": 4.083257506824386, "grad_norm": 0.13595448163868545, "learning_rate": 8.075555855962097e-07, "loss": 0.0029, "step": 8975 }, { "epoch": 4.083712465878071, "grad_norm": 0.26466784047835923, "learning_rate": 8.067769067662718e-07, "loss": 0.0024, "step": 8976 }, { "epoch": 4.084167424931756, "grad_norm": 0.007845274885144131, "learning_rate": 8.059985705875873e-07, "loss": 0.0001, "step": 8977 }, { "epoch": 4.084622383985441, "grad_norm": 0.01942580719732545, "learning_rate": 8.052205771237603e-07, "loss": 0.0002, "step": 8978 }, { "epoch": 4.085077343039126, "grad_norm": 0.025475048300551154, "learning_rate": 8.044429264383652e-07, "loss": 0.0002, "step": 8979 }, { "epoch": 4.085532302092812, "grad_norm": 0.09668303521085533, "learning_rate": 8.036656185949466e-07, "loss": 0.0008, "step": 8980 }, { "epoch": 4.085987261146497, "grad_norm": 0.11432311808470239, "learning_rate": 8.028886536570235e-07, "loss": 0.0017, "step": 8981 }, { "epoch": 4.0864422202001816, "grad_norm": 0.026269303380395737, "learning_rate": 8.021120316880843e-07, "loss": 0.0001, "step": 8982 }, { "epoch": 4.086897179253867, "grad_norm": 0.03710699175861286, "learning_rate": 8.01335752751592e-07, "loss": 0.0002, "step": 8983 }, { "epoch": 4.087352138307552, "grad_norm": 0.010914013634221788, "learning_rate": 8.005598169109829e-07, "loss": 0.0001, "step": 8984 }, { "epoch": 4.087807097361237, "grad_norm": 0.047489529010998566, "learning_rate": 7.997842242296605e-07, "loss": 0.0004, "step": 8985 }, { "epoch": 4.088262056414923, "grad_norm": 0.19870774277849554, "learning_rate": 7.990089747710033e-07, "loss": 0.0015, "step": 8986 }, { "epoch": 4.088717015468608, "grad_norm": 0.14171092159495802, "learning_rate": 7.982340685983602e-07, "loss": 0.0021, "step": 8987 }, { "epoch": 4.089171974522293, "grad_norm": 0.04923405617189786, "learning_rate": 7.97459505775055e-07, "loss": 0.0004, "step": 8988 }, { "epoch": 4.089626933575978, "grad_norm": 0.23756487347204336, "learning_rate": 7.966852863643798e-07, "loss": 0.0039, "step": 8989 }, { "epoch": 4.090081892629663, "grad_norm": 0.0848559727271451, "learning_rate": 7.959114104296017e-07, "loss": 0.0004, "step": 8990 }, { "epoch": 4.090536851683349, "grad_norm": 0.059109878803031216, "learning_rate": 7.951378780339581e-07, "loss": 0.0002, "step": 8991 }, { "epoch": 4.090991810737034, "grad_norm": 0.010690299268787687, "learning_rate": 7.943646892406564e-07, "loss": 0.0001, "step": 8992 }, { "epoch": 4.091446769790719, "grad_norm": 0.016695187742703434, "learning_rate": 7.93591844112881e-07, "loss": 0.0002, "step": 8993 }, { "epoch": 4.091901728844404, "grad_norm": 0.08820152133944004, "learning_rate": 7.928193427137848e-07, "loss": 0.0005, "step": 8994 }, { "epoch": 4.092356687898089, "grad_norm": 0.27176668344128196, "learning_rate": 7.920471851064915e-07, "loss": 0.0012, "step": 8995 }, { "epoch": 4.092811646951774, "grad_norm": 0.2599016186439286, "learning_rate": 7.912753713540988e-07, "loss": 0.0016, "step": 8996 }, { "epoch": 4.09326660600546, "grad_norm": 0.24508168228113944, "learning_rate": 7.905039015196764e-07, "loss": 0.0029, "step": 8997 }, { "epoch": 4.093721565059145, "grad_norm": 0.16552595439233694, "learning_rate": 7.89732775666266e-07, "loss": 0.0003, "step": 8998 }, { "epoch": 4.09417652411283, "grad_norm": 0.03168636056515445, "learning_rate": 7.889619938568799e-07, "loss": 0.0003, "step": 8999 }, { "epoch": 4.094631483166515, "grad_norm": 0.03534578011139962, "learning_rate": 7.881915561545028e-07, "loss": 0.0002, "step": 9000 }, { "epoch": 4.0950864422202, "grad_norm": 0.03524834656645179, "learning_rate": 7.8742146262209e-07, "loss": 0.0003, "step": 9001 }, { "epoch": 4.095541401273885, "grad_norm": 0.033614103644947185, "learning_rate": 7.866517133225726e-07, "loss": 0.0003, "step": 9002 }, { "epoch": 4.095996360327571, "grad_norm": 0.11298186897731866, "learning_rate": 7.858823083188494e-07, "loss": 0.0012, "step": 9003 }, { "epoch": 4.096451319381256, "grad_norm": 0.00536461124229854, "learning_rate": 7.851132476737938e-07, "loss": 0.0, "step": 9004 }, { "epoch": 4.096906278434941, "grad_norm": 0.10682383732275096, "learning_rate": 7.843445314502491e-07, "loss": 0.0006, "step": 9005 }, { "epoch": 4.097361237488626, "grad_norm": 0.13699461915240788, "learning_rate": 7.835761597110308e-07, "loss": 0.001, "step": 9006 }, { "epoch": 4.097816196542311, "grad_norm": 0.25655542712667856, "learning_rate": 7.828081325189285e-07, "loss": 0.0011, "step": 9007 }, { "epoch": 4.098271155595996, "grad_norm": 0.1295219593968848, "learning_rate": 7.820404499367012e-07, "loss": 0.0016, "step": 9008 }, { "epoch": 4.098726114649682, "grad_norm": 0.010228893357943289, "learning_rate": 7.8127311202708e-07, "loss": 0.0001, "step": 9009 }, { "epoch": 4.099181073703367, "grad_norm": 0.06387778339624936, "learning_rate": 7.805061188527674e-07, "loss": 0.0005, "step": 9010 }, { "epoch": 4.099636032757052, "grad_norm": 0.04014668054526879, "learning_rate": 7.797394704764394e-07, "loss": 0.0003, "step": 9011 }, { "epoch": 4.100090991810737, "grad_norm": 0.15261539973298907, "learning_rate": 7.789731669607447e-07, "loss": 0.0026, "step": 9012 }, { "epoch": 4.100545950864422, "grad_norm": 0.013924416974846462, "learning_rate": 7.782072083683012e-07, "loss": 0.0001, "step": 9013 }, { "epoch": 4.101000909918107, "grad_norm": 0.048890312162571815, "learning_rate": 7.774415947616987e-07, "loss": 0.0004, "step": 9014 }, { "epoch": 4.101455868971793, "grad_norm": 0.1745776853619407, "learning_rate": 7.766763262035004e-07, "loss": 0.0006, "step": 9015 }, { "epoch": 4.101910828025478, "grad_norm": 0.026931852751956892, "learning_rate": 7.759114027562387e-07, "loss": 0.0002, "step": 9016 }, { "epoch": 4.1023657870791626, "grad_norm": 0.04054561665703668, "learning_rate": 7.751468244824217e-07, "loss": 0.0003, "step": 9017 }, { "epoch": 4.102820746132848, "grad_norm": 0.012079983109997021, "learning_rate": 7.743825914445285e-07, "loss": 0.0001, "step": 9018 }, { "epoch": 4.103275705186533, "grad_norm": 0.009909822756248432, "learning_rate": 7.736187037050069e-07, "loss": 0.0001, "step": 9019 }, { "epoch": 4.103730664240218, "grad_norm": 0.255041745201386, "learning_rate": 7.728551613262786e-07, "loss": 0.0023, "step": 9020 }, { "epoch": 4.104185623293904, "grad_norm": 0.07233558674493862, "learning_rate": 7.720919643707359e-07, "loss": 0.0009, "step": 9021 }, { "epoch": 4.104640582347589, "grad_norm": 0.028764691554044784, "learning_rate": 7.713291129007455e-07, "loss": 0.0001, "step": 9022 }, { "epoch": 4.1050955414012735, "grad_norm": 0.04014556800112941, "learning_rate": 7.705666069786438e-07, "loss": 0.0003, "step": 9023 }, { "epoch": 4.105550500454959, "grad_norm": 0.10363523284450694, "learning_rate": 7.698044466667381e-07, "loss": 0.0011, "step": 9024 }, { "epoch": 4.106005459508644, "grad_norm": 0.03143944711491693, "learning_rate": 7.690426320273104e-07, "loss": 0.0003, "step": 9025 }, { "epoch": 4.106460418562329, "grad_norm": 0.026968636564706217, "learning_rate": 7.682811631226112e-07, "loss": 0.0003, "step": 9026 }, { "epoch": 4.106915377616015, "grad_norm": 0.13690237862947646, "learning_rate": 7.675200400148658e-07, "loss": 0.003, "step": 9027 }, { "epoch": 4.1073703366697, "grad_norm": 0.24160531306052005, "learning_rate": 7.66759262766269e-07, "loss": 0.0032, "step": 9028 }, { "epoch": 4.1078252957233845, "grad_norm": 0.046244627139069, "learning_rate": 7.659988314389887e-07, "loss": 0.0005, "step": 9029 }, { "epoch": 4.10828025477707, "grad_norm": 0.06866401152338791, "learning_rate": 7.652387460951616e-07, "loss": 0.0008, "step": 9030 }, { "epoch": 4.108735213830755, "grad_norm": 0.03827093977408107, "learning_rate": 7.644790067969005e-07, "loss": 0.0003, "step": 9031 }, { "epoch": 4.10919017288444, "grad_norm": 0.034568775397146846, "learning_rate": 7.637196136062886e-07, "loss": 0.0003, "step": 9032 }, { "epoch": 4.109645131938126, "grad_norm": 0.7420127393791961, "learning_rate": 7.62960566585379e-07, "loss": 0.0003, "step": 9033 }, { "epoch": 4.110100090991811, "grad_norm": 0.07435224717976553, "learning_rate": 7.622018657961972e-07, "loss": 0.0007, "step": 9034 }, { "epoch": 4.1105550500454955, "grad_norm": 0.14420560210490088, "learning_rate": 7.614435113007406e-07, "loss": 0.0006, "step": 9035 }, { "epoch": 4.111010009099181, "grad_norm": 0.07155717824907191, "learning_rate": 7.6068550316098e-07, "loss": 0.0013, "step": 9036 }, { "epoch": 4.111464968152866, "grad_norm": 0.17961858540151596, "learning_rate": 7.599278414388544e-07, "loss": 0.0024, "step": 9037 }, { "epoch": 4.111919927206552, "grad_norm": 0.02222899526199989, "learning_rate": 7.591705261962784e-07, "loss": 0.0002, "step": 9038 }, { "epoch": 4.112374886260237, "grad_norm": 0.09069728412670236, "learning_rate": 7.584135574951362e-07, "loss": 0.0009, "step": 9039 }, { "epoch": 4.112829845313922, "grad_norm": 0.036027378337994576, "learning_rate": 7.576569353972818e-07, "loss": 0.0003, "step": 9040 }, { "epoch": 4.113284804367607, "grad_norm": 0.0427730374010759, "learning_rate": 7.569006599645456e-07, "loss": 0.0005, "step": 9041 }, { "epoch": 4.113739763421292, "grad_norm": 0.04034244624040654, "learning_rate": 7.561447312587256e-07, "loss": 0.0004, "step": 9042 }, { "epoch": 4.114194722474977, "grad_norm": 0.1526865906571951, "learning_rate": 7.553891493415933e-07, "loss": 0.0011, "step": 9043 }, { "epoch": 4.114649681528663, "grad_norm": 0.1703628161788179, "learning_rate": 7.546339142748899e-07, "loss": 0.005, "step": 9044 }, { "epoch": 4.115104640582348, "grad_norm": 0.009205656991178103, "learning_rate": 7.53879026120331e-07, "loss": 0.0001, "step": 9045 }, { "epoch": 4.115559599636033, "grad_norm": 0.04278017196341073, "learning_rate": 7.531244849396041e-07, "loss": 0.0003, "step": 9046 }, { "epoch": 4.116014558689718, "grad_norm": 0.012008839485111932, "learning_rate": 7.523702907943658e-07, "loss": 0.0001, "step": 9047 }, { "epoch": 4.116469517743403, "grad_norm": 0.0787735252110677, "learning_rate": 7.516164437462453e-07, "loss": 0.0004, "step": 9048 }, { "epoch": 4.116924476797088, "grad_norm": 0.014533373140239067, "learning_rate": 7.508629438568415e-07, "loss": 0.0001, "step": 9049 }, { "epoch": 4.117379435850774, "grad_norm": 0.06570845393221407, "learning_rate": 7.501097911877308e-07, "loss": 0.0006, "step": 9050 }, { "epoch": 4.117834394904459, "grad_norm": 0.10137236537515193, "learning_rate": 7.493569858004546e-07, "loss": 0.0013, "step": 9051 }, { "epoch": 4.1182893539581436, "grad_norm": 0.07812229924381484, "learning_rate": 7.486045277565307e-07, "loss": 0.0006, "step": 9052 }, { "epoch": 4.118744313011829, "grad_norm": 0.10653101170381142, "learning_rate": 7.478524171174456e-07, "loss": 0.0007, "step": 9053 }, { "epoch": 4.119199272065514, "grad_norm": 0.14023018899099193, "learning_rate": 7.471006539446585e-07, "loss": 0.0016, "step": 9054 }, { "epoch": 4.119654231119199, "grad_norm": 0.09752839565509065, "learning_rate": 7.46349238299599e-07, "loss": 0.0013, "step": 9055 }, { "epoch": 4.120109190172885, "grad_norm": 0.07994606258243822, "learning_rate": 7.455981702436715e-07, "loss": 0.0003, "step": 9056 }, { "epoch": 4.12056414922657, "grad_norm": 0.06766490200755171, "learning_rate": 7.448474498382491e-07, "loss": 0.0005, "step": 9057 }, { "epoch": 4.1210191082802545, "grad_norm": 0.07879223370637874, "learning_rate": 7.440970771446754e-07, "loss": 0.0011, "step": 9058 }, { "epoch": 4.12147406733394, "grad_norm": 0.05380977953691529, "learning_rate": 7.433470522242702e-07, "loss": 0.0012, "step": 9059 }, { "epoch": 4.121929026387625, "grad_norm": 0.06098484058756868, "learning_rate": 7.425973751383203e-07, "loss": 0.0004, "step": 9060 }, { "epoch": 4.12238398544131, "grad_norm": 0.13783283295433033, "learning_rate": 7.41848045948087e-07, "loss": 0.0017, "step": 9061 }, { "epoch": 4.122838944494996, "grad_norm": 0.0743303095245618, "learning_rate": 7.410990647148025e-07, "loss": 0.0009, "step": 9062 }, { "epoch": 4.123293903548681, "grad_norm": 0.04413514441662232, "learning_rate": 7.40350431499669e-07, "loss": 0.0004, "step": 9063 }, { "epoch": 4.1237488626023655, "grad_norm": 0.023929063696521806, "learning_rate": 7.396021463638608e-07, "loss": 0.0001, "step": 9064 }, { "epoch": 4.124203821656051, "grad_norm": 0.10022273116903364, "learning_rate": 7.388542093685258e-07, "loss": 0.0007, "step": 9065 }, { "epoch": 4.124658780709736, "grad_norm": 0.12799627777300554, "learning_rate": 7.381066205747822e-07, "loss": 0.0013, "step": 9066 }, { "epoch": 4.125113739763421, "grad_norm": 0.008836613750157247, "learning_rate": 7.373593800437196e-07, "loss": 0.0001, "step": 9067 }, { "epoch": 4.125568698817107, "grad_norm": 0.12382721197088449, "learning_rate": 7.366124878363984e-07, "loss": 0.0015, "step": 9068 }, { "epoch": 4.126023657870792, "grad_norm": 0.023711049079336122, "learning_rate": 7.358659440138499e-07, "loss": 0.0002, "step": 9069 }, { "epoch": 4.1264786169244765, "grad_norm": 0.056088606069611455, "learning_rate": 7.351197486370809e-07, "loss": 0.0004, "step": 9070 }, { "epoch": 4.126933575978162, "grad_norm": 0.026745150542072205, "learning_rate": 7.343739017670665e-07, "loss": 0.0002, "step": 9071 }, { "epoch": 4.127388535031847, "grad_norm": 0.12775574427460795, "learning_rate": 7.336284034647517e-07, "loss": 0.0022, "step": 9072 }, { "epoch": 4.127843494085532, "grad_norm": 0.029243124692943483, "learning_rate": 7.328832537910585e-07, "loss": 0.0002, "step": 9073 }, { "epoch": 4.128298453139218, "grad_norm": 0.374559378736072, "learning_rate": 7.321384528068748e-07, "loss": 0.0027, "step": 9074 }, { "epoch": 4.128753412192903, "grad_norm": 0.12335953680153586, "learning_rate": 7.313940005730641e-07, "loss": 0.002, "step": 9075 }, { "epoch": 4.1292083712465875, "grad_norm": 0.1683120319833235, "learning_rate": 7.306498971504589e-07, "loss": 0.0016, "step": 9076 }, { "epoch": 4.129663330300273, "grad_norm": 0.02827564683645294, "learning_rate": 7.299061425998638e-07, "loss": 0.0002, "step": 9077 }, { "epoch": 4.130118289353958, "grad_norm": 0.013370817807326758, "learning_rate": 7.291627369820542e-07, "loss": 0.0001, "step": 9078 }, { "epoch": 4.130573248407643, "grad_norm": 0.02059884797082071, "learning_rate": 7.284196803577787e-07, "loss": 0.0001, "step": 9079 }, { "epoch": 4.131028207461329, "grad_norm": 0.027242690545820625, "learning_rate": 7.276769727877575e-07, "loss": 0.0003, "step": 9080 }, { "epoch": 4.131483166515014, "grad_norm": 0.03871781328085265, "learning_rate": 7.269346143326805e-07, "loss": 0.0003, "step": 9081 }, { "epoch": 4.131938125568698, "grad_norm": 0.01139347640688604, "learning_rate": 7.261926050532103e-07, "loss": 0.0001, "step": 9082 }, { "epoch": 4.132393084622384, "grad_norm": 0.06897621353278155, "learning_rate": 7.254509450099784e-07, "loss": 0.0006, "step": 9083 }, { "epoch": 4.132848043676069, "grad_norm": 0.015195485397562918, "learning_rate": 7.247096342635929e-07, "loss": 0.0002, "step": 9084 }, { "epoch": 4.133303002729754, "grad_norm": 0.20469708096473443, "learning_rate": 7.239686728746292e-07, "loss": 0.0025, "step": 9085 }, { "epoch": 4.13375796178344, "grad_norm": 0.010152895976561375, "learning_rate": 7.232280609036341e-07, "loss": 0.0001, "step": 9086 }, { "epoch": 4.1342129208371245, "grad_norm": 0.04787282604103635, "learning_rate": 7.224877984111289e-07, "loss": 0.0004, "step": 9087 }, { "epoch": 4.134667879890809, "grad_norm": 0.06828705294116683, "learning_rate": 7.217478854576026e-07, "loss": 0.0009, "step": 9088 }, { "epoch": 4.135122838944495, "grad_norm": 0.13268464470767546, "learning_rate": 7.210083221035202e-07, "loss": 0.0012, "step": 9089 }, { "epoch": 4.13557779799818, "grad_norm": 0.009507070639699456, "learning_rate": 7.202691084093138e-07, "loss": 0.0001, "step": 9090 }, { "epoch": 4.136032757051865, "grad_norm": 0.07720946676333287, "learning_rate": 7.195302444353886e-07, "loss": 0.0004, "step": 9091 }, { "epoch": 4.136487716105551, "grad_norm": 0.16378615861974807, "learning_rate": 7.187917302421216e-07, "loss": 0.0017, "step": 9092 }, { "epoch": 4.1369426751592355, "grad_norm": 0.16474450040440558, "learning_rate": 7.180535658898596e-07, "loss": 0.0011, "step": 9093 }, { "epoch": 4.13739763421292, "grad_norm": 0.004564027357272073, "learning_rate": 7.173157514389228e-07, "loss": 0.0, "step": 9094 }, { "epoch": 4.137852593266606, "grad_norm": 0.10569007381464603, "learning_rate": 7.165782869496035e-07, "loss": 0.0015, "step": 9095 }, { "epoch": 4.138307552320291, "grad_norm": 0.09926510154592078, "learning_rate": 7.158411724821629e-07, "loss": 0.0005, "step": 9096 }, { "epoch": 4.138762511373977, "grad_norm": 0.0625997135512905, "learning_rate": 7.151044080968344e-07, "loss": 0.0013, "step": 9097 }, { "epoch": 4.139217470427662, "grad_norm": 0.009962982838196358, "learning_rate": 7.143679938538228e-07, "loss": 0.0001, "step": 9098 }, { "epoch": 4.1396724294813465, "grad_norm": 0.07108107178098812, "learning_rate": 7.136319298133054e-07, "loss": 0.0004, "step": 9099 }, { "epoch": 4.140127388535032, "grad_norm": 0.0874769956895481, "learning_rate": 7.128962160354291e-07, "loss": 0.0004, "step": 9100 }, { "epoch": 4.140582347588717, "grad_norm": 0.01717447621969392, "learning_rate": 7.121608525803142e-07, "loss": 0.0001, "step": 9101 }, { "epoch": 4.141037306642402, "grad_norm": 0.00924791996682537, "learning_rate": 7.114258395080509e-07, "loss": 0.0001, "step": 9102 }, { "epoch": 4.141492265696088, "grad_norm": 0.13289027307934165, "learning_rate": 7.106911768787e-07, "loss": 0.0014, "step": 9103 }, { "epoch": 4.141947224749773, "grad_norm": 0.019391147405749938, "learning_rate": 7.09956864752297e-07, "loss": 0.0001, "step": 9104 }, { "epoch": 4.1424021838034575, "grad_norm": 0.23201577816908722, "learning_rate": 7.092229031888448e-07, "loss": 0.0021, "step": 9105 }, { "epoch": 4.142857142857143, "grad_norm": 0.0814584473978802, "learning_rate": 7.084892922483205e-07, "loss": 0.0013, "step": 9106 }, { "epoch": 4.143312101910828, "grad_norm": 0.05315791364710135, "learning_rate": 7.077560319906696e-07, "loss": 0.0002, "step": 9107 }, { "epoch": 4.143767060964513, "grad_norm": 0.07157720774010551, "learning_rate": 7.070231224758123e-07, "loss": 0.0007, "step": 9108 }, { "epoch": 4.144222020018199, "grad_norm": 0.10267073891209148, "learning_rate": 7.062905637636397e-07, "loss": 0.0016, "step": 9109 }, { "epoch": 4.144676979071884, "grad_norm": 0.014047374623526305, "learning_rate": 7.055583559140116e-07, "loss": 0.0001, "step": 9110 }, { "epoch": 4.1451319381255685, "grad_norm": 0.06027531730501482, "learning_rate": 7.048264989867615e-07, "loss": 0.0004, "step": 9111 }, { "epoch": 4.145586897179254, "grad_norm": 0.10077012797627584, "learning_rate": 7.040949930416918e-07, "loss": 0.0019, "step": 9112 }, { "epoch": 4.146041856232939, "grad_norm": 0.07578086815231937, "learning_rate": 7.033638381385804e-07, "loss": 0.0007, "step": 9113 }, { "epoch": 4.146496815286624, "grad_norm": 0.2161013725003285, "learning_rate": 7.026330343371712e-07, "loss": 0.0009, "step": 9114 }, { "epoch": 4.14695177434031, "grad_norm": 0.0402471231960349, "learning_rate": 7.019025816971852e-07, "loss": 0.0002, "step": 9115 }, { "epoch": 4.147406733393995, "grad_norm": 0.027199882194507998, "learning_rate": 7.011724802783104e-07, "loss": 0.0002, "step": 9116 }, { "epoch": 4.147861692447679, "grad_norm": 0.03104854755254246, "learning_rate": 7.004427301402055e-07, "loss": 0.0003, "step": 9117 }, { "epoch": 4.148316651501365, "grad_norm": 0.009678113573452149, "learning_rate": 6.997133313425058e-07, "loss": 0.0001, "step": 9118 }, { "epoch": 4.14877161055505, "grad_norm": 0.04210170721353225, "learning_rate": 6.989842839448125e-07, "loss": 0.0004, "step": 9119 }, { "epoch": 4.149226569608735, "grad_norm": 0.0644528736816396, "learning_rate": 6.982555880066999e-07, "loss": 0.0012, "step": 9120 }, { "epoch": 4.149681528662421, "grad_norm": 0.1057194319425339, "learning_rate": 6.975272435877134e-07, "loss": 0.0014, "step": 9121 }, { "epoch": 4.1501364877161055, "grad_norm": 0.006663769364760856, "learning_rate": 6.967992507473703e-07, "loss": 0.0001, "step": 9122 }, { "epoch": 4.15059144676979, "grad_norm": 0.022638692143098008, "learning_rate": 6.960716095451608e-07, "loss": 0.0002, "step": 9123 }, { "epoch": 4.151046405823476, "grad_norm": 0.02387221776472979, "learning_rate": 6.95344320040543e-07, "loss": 0.0001, "step": 9124 }, { "epoch": 4.151501364877161, "grad_norm": 0.03687433742698374, "learning_rate": 6.946173822929481e-07, "loss": 0.0004, "step": 9125 }, { "epoch": 4.151956323930846, "grad_norm": 0.07900027947464566, "learning_rate": 6.938907963617775e-07, "loss": 0.0008, "step": 9126 }, { "epoch": 4.152411282984532, "grad_norm": 0.006882607438259228, "learning_rate": 6.931645623064031e-07, "loss": 0.0, "step": 9127 }, { "epoch": 4.1528662420382165, "grad_norm": 0.04867944724074638, "learning_rate": 6.924386801861721e-07, "loss": 0.0006, "step": 9128 }, { "epoch": 4.153321201091901, "grad_norm": 0.07776516566746394, "learning_rate": 6.917131500603996e-07, "loss": 0.0006, "step": 9129 }, { "epoch": 4.153776160145587, "grad_norm": 0.028951684215992028, "learning_rate": 6.909879719883733e-07, "loss": 0.0002, "step": 9130 }, { "epoch": 4.154231119199272, "grad_norm": 0.08448020514966632, "learning_rate": 6.902631460293501e-07, "loss": 0.0007, "step": 9131 }, { "epoch": 4.154686078252957, "grad_norm": 0.06194416165232362, "learning_rate": 6.89538672242559e-07, "loss": 0.0006, "step": 9132 }, { "epoch": 4.155141037306643, "grad_norm": 0.3397159170814614, "learning_rate": 6.888145506872029e-07, "loss": 0.0038, "step": 9133 }, { "epoch": 4.1555959963603275, "grad_norm": 0.20356016980668376, "learning_rate": 6.880907814224524e-07, "loss": 0.0012, "step": 9134 }, { "epoch": 4.156050955414012, "grad_norm": 0.07823289180254378, "learning_rate": 6.873673645074497e-07, "loss": 0.0006, "step": 9135 }, { "epoch": 4.156505914467698, "grad_norm": 0.08199286828782393, "learning_rate": 6.866443000013117e-07, "loss": 0.0007, "step": 9136 }, { "epoch": 4.156960873521383, "grad_norm": 0.03775878067999496, "learning_rate": 6.859215879631215e-07, "loss": 0.0002, "step": 9137 }, { "epoch": 4.157415832575068, "grad_norm": 0.25692113151172263, "learning_rate": 6.851992284519377e-07, "loss": 0.005, "step": 9138 }, { "epoch": 4.157870791628754, "grad_norm": 0.052334533110741516, "learning_rate": 6.844772215267875e-07, "loss": 0.0002, "step": 9139 }, { "epoch": 4.1583257506824385, "grad_norm": 0.12715905993237583, "learning_rate": 6.837555672466701e-07, "loss": 0.0009, "step": 9140 }, { "epoch": 4.158780709736123, "grad_norm": 0.07667259298747758, "learning_rate": 6.830342656705546e-07, "loss": 0.0007, "step": 9141 }, { "epoch": 4.159235668789809, "grad_norm": 0.10471670082195757, "learning_rate": 6.823133168573836e-07, "loss": 0.0008, "step": 9142 }, { "epoch": 4.159690627843494, "grad_norm": 0.11208945904229069, "learning_rate": 6.815927208660711e-07, "loss": 0.001, "step": 9143 }, { "epoch": 4.160145586897179, "grad_norm": 0.24994788542120291, "learning_rate": 6.808724777554998e-07, "loss": 0.0014, "step": 9144 }, { "epoch": 4.160600545950865, "grad_norm": 0.04349589401000913, "learning_rate": 6.801525875845244e-07, "loss": 0.0004, "step": 9145 }, { "epoch": 4.1610555050045495, "grad_norm": 0.19759787328195044, "learning_rate": 6.794330504119706e-07, "loss": 0.004, "step": 9146 }, { "epoch": 4.161510464058235, "grad_norm": 0.030857087462157606, "learning_rate": 6.787138662966369e-07, "loss": 0.0002, "step": 9147 }, { "epoch": 4.16196542311192, "grad_norm": 0.11520096368589824, "learning_rate": 6.779950352972919e-07, "loss": 0.0018, "step": 9148 }, { "epoch": 4.162420382165605, "grad_norm": 0.1908976447168784, "learning_rate": 6.77276557472673e-07, "loss": 0.001, "step": 9149 }, { "epoch": 4.162875341219291, "grad_norm": 0.07705742660010427, "learning_rate": 6.76558432881494e-07, "loss": 0.0006, "step": 9150 }, { "epoch": 4.163330300272976, "grad_norm": 0.011536493961918222, "learning_rate": 6.758406615824342e-07, "loss": 0.0001, "step": 9151 }, { "epoch": 4.16378525932666, "grad_norm": 0.12534348159449027, "learning_rate": 6.751232436341487e-07, "loss": 0.0008, "step": 9152 }, { "epoch": 4.164240218380346, "grad_norm": 0.04610340944499061, "learning_rate": 6.744061790952611e-07, "loss": 0.0003, "step": 9153 }, { "epoch": 4.164695177434031, "grad_norm": 0.016528406575559255, "learning_rate": 6.736894680243661e-07, "loss": 0.0001, "step": 9154 }, { "epoch": 4.165150136487716, "grad_norm": 0.14485888727291657, "learning_rate": 6.729731104800292e-07, "loss": 0.0012, "step": 9155 }, { "epoch": 4.165605095541402, "grad_norm": 0.028025914044455785, "learning_rate": 6.722571065207889e-07, "loss": 0.0002, "step": 9156 }, { "epoch": 4.1660600545950865, "grad_norm": 0.035713522445234486, "learning_rate": 6.715414562051553e-07, "loss": 0.0003, "step": 9157 }, { "epoch": 4.166515013648771, "grad_norm": 0.12071765878038823, "learning_rate": 6.708261595916071e-07, "loss": 0.0016, "step": 9158 }, { "epoch": 4.166969972702457, "grad_norm": 0.019864769840983554, "learning_rate": 6.701112167385943e-07, "loss": 0.0001, "step": 9159 }, { "epoch": 4.167424931756142, "grad_norm": 0.04428397490285644, "learning_rate": 6.693966277045394e-07, "loss": 0.0002, "step": 9160 }, { "epoch": 4.167879890809827, "grad_norm": 0.041786114304456355, "learning_rate": 6.686823925478336e-07, "loss": 0.0001, "step": 9161 }, { "epoch": 4.168334849863513, "grad_norm": 0.07412171322175884, "learning_rate": 6.679685113268447e-07, "loss": 0.0007, "step": 9162 }, { "epoch": 4.1687898089171975, "grad_norm": 0.023537291646692, "learning_rate": 6.672549840999037e-07, "loss": 0.0001, "step": 9163 }, { "epoch": 4.169244767970882, "grad_norm": 0.17160225817536284, "learning_rate": 6.665418109253207e-07, "loss": 0.0048, "step": 9164 }, { "epoch": 4.169699727024568, "grad_norm": 0.00893465321613862, "learning_rate": 6.658289918613709e-07, "loss": 0.0001, "step": 9165 }, { "epoch": 4.170154686078253, "grad_norm": 0.1361831934819655, "learning_rate": 6.651165269663018e-07, "loss": 0.0029, "step": 9166 }, { "epoch": 4.170609645131938, "grad_norm": 0.031564461284722276, "learning_rate": 6.644044162983355e-07, "loss": 0.0002, "step": 9167 }, { "epoch": 4.171064604185624, "grad_norm": 0.0770030833103646, "learning_rate": 6.636926599156601e-07, "loss": 0.0003, "step": 9168 }, { "epoch": 4.1715195632393085, "grad_norm": 0.006766130064054846, "learning_rate": 6.629812578764389e-07, "loss": 0.0001, "step": 9169 }, { "epoch": 4.171974522292993, "grad_norm": 0.024778419877562463, "learning_rate": 6.622702102388018e-07, "loss": 0.0002, "step": 9170 }, { "epoch": 4.172429481346679, "grad_norm": 0.05138540127432682, "learning_rate": 6.615595170608541e-07, "loss": 0.0003, "step": 9171 }, { "epoch": 4.172884440400364, "grad_norm": 0.023476761991421933, "learning_rate": 6.608491784006715e-07, "loss": 0.0002, "step": 9172 }, { "epoch": 4.173339399454049, "grad_norm": 0.0560661430328377, "learning_rate": 6.601391943162989e-07, "loss": 0.0005, "step": 9173 }, { "epoch": 4.173794358507735, "grad_norm": 0.059333921481697464, "learning_rate": 6.594295648657528e-07, "loss": 0.0005, "step": 9174 }, { "epoch": 4.1742493175614195, "grad_norm": 0.02768945967891085, "learning_rate": 6.587202901070194e-07, "loss": 0.0003, "step": 9175 }, { "epoch": 4.174704276615104, "grad_norm": 0.009875111081740105, "learning_rate": 6.5801137009806e-07, "loss": 0.0001, "step": 9176 }, { "epoch": 4.17515923566879, "grad_norm": 0.09459231301077736, "learning_rate": 6.573028048968022e-07, "loss": 0.0006, "step": 9177 }, { "epoch": 4.175614194722475, "grad_norm": 0.23591128626393254, "learning_rate": 6.565945945611485e-07, "loss": 0.0077, "step": 9178 }, { "epoch": 4.17606915377616, "grad_norm": 0.005969329223529804, "learning_rate": 6.558867391489703e-07, "loss": 0.0001, "step": 9179 }, { "epoch": 4.176524112829846, "grad_norm": 0.08929087480993467, "learning_rate": 6.551792387181089e-07, "loss": 0.0007, "step": 9180 }, { "epoch": 4.1769790718835305, "grad_norm": 0.02797661408841475, "learning_rate": 6.544720933263798e-07, "loss": 0.0002, "step": 9181 }, { "epoch": 4.177434030937215, "grad_norm": 0.013424039313164264, "learning_rate": 6.537653030315671e-07, "loss": 0.0001, "step": 9182 }, { "epoch": 4.177888989990901, "grad_norm": 0.0034710398803660517, "learning_rate": 6.530588678914263e-07, "loss": 0.0, "step": 9183 }, { "epoch": 4.178343949044586, "grad_norm": 0.031620278929570436, "learning_rate": 6.523527879636837e-07, "loss": 0.0002, "step": 9184 }, { "epoch": 4.178798908098271, "grad_norm": 0.029953733539113833, "learning_rate": 6.516470633060368e-07, "loss": 0.0003, "step": 9185 }, { "epoch": 4.179253867151957, "grad_norm": 0.10342099574404647, "learning_rate": 6.509416939761565e-07, "loss": 0.0019, "step": 9186 }, { "epoch": 4.179708826205641, "grad_norm": 0.16248306500458215, "learning_rate": 6.502366800316801e-07, "loss": 0.0009, "step": 9187 }, { "epoch": 4.180163785259326, "grad_norm": 0.013022761017413183, "learning_rate": 6.495320215302192e-07, "loss": 0.0001, "step": 9188 }, { "epoch": 4.180618744313012, "grad_norm": 0.013148338493735616, "learning_rate": 6.48827718529354e-07, "loss": 0.0001, "step": 9189 }, { "epoch": 4.181073703366697, "grad_norm": 0.16967024721292967, "learning_rate": 6.481237710866389e-07, "loss": 0.002, "step": 9190 }, { "epoch": 4.181528662420382, "grad_norm": 0.08451525810806661, "learning_rate": 6.474201792595958e-07, "loss": 0.001, "step": 9191 }, { "epoch": 4.1819836214740675, "grad_norm": 0.07785399963556024, "learning_rate": 6.467169431057202e-07, "loss": 0.0006, "step": 9192 }, { "epoch": 4.182438580527752, "grad_norm": 0.010904243117567148, "learning_rate": 6.460140626824763e-07, "loss": 0.0001, "step": 9193 }, { "epoch": 4.182893539581437, "grad_norm": 0.03257330343874157, "learning_rate": 6.453115380473001e-07, "loss": 0.0002, "step": 9194 }, { "epoch": 4.183348498635123, "grad_norm": 0.03690040075654703, "learning_rate": 6.446093692576005e-07, "loss": 0.0003, "step": 9195 }, { "epoch": 4.183803457688808, "grad_norm": 0.029523620598909953, "learning_rate": 6.439075563707548e-07, "loss": 0.0001, "step": 9196 }, { "epoch": 4.184258416742493, "grad_norm": 0.03577807208194298, "learning_rate": 6.432060994441114e-07, "loss": 0.0003, "step": 9197 }, { "epoch": 4.1847133757961785, "grad_norm": 0.05449437440305742, "learning_rate": 6.425049985349891e-07, "loss": 0.0006, "step": 9198 }, { "epoch": 4.185168334849863, "grad_norm": 0.14825086389507425, "learning_rate": 6.418042537006813e-07, "loss": 0.0017, "step": 9199 }, { "epoch": 4.185623293903548, "grad_norm": 0.027539289611392494, "learning_rate": 6.411038649984474e-07, "loss": 0.0002, "step": 9200 }, { "epoch": 4.186078252957234, "grad_norm": 0.10060678633737367, "learning_rate": 6.404038324855222e-07, "loss": 0.0018, "step": 9201 }, { "epoch": 4.186533212010919, "grad_norm": 0.008703302032577048, "learning_rate": 6.397041562191081e-07, "loss": 0.0001, "step": 9202 }, { "epoch": 4.186988171064604, "grad_norm": 0.1556722438759862, "learning_rate": 6.390048362563789e-07, "loss": 0.0022, "step": 9203 }, { "epoch": 4.1874431301182895, "grad_norm": 0.010543354781474334, "learning_rate": 6.383058726544799e-07, "loss": 0.0001, "step": 9204 }, { "epoch": 4.187898089171974, "grad_norm": 0.18037695066916648, "learning_rate": 6.376072654705274e-07, "loss": 0.002, "step": 9205 }, { "epoch": 4.188353048225659, "grad_norm": 0.16796778212296457, "learning_rate": 6.369090147616103e-07, "loss": 0.0021, "step": 9206 }, { "epoch": 4.188808007279345, "grad_norm": 0.019603927792413246, "learning_rate": 6.362111205847843e-07, "loss": 0.0001, "step": 9207 }, { "epoch": 4.18926296633303, "grad_norm": 0.3861315637527699, "learning_rate": 6.355135829970794e-07, "loss": 0.0022, "step": 9208 }, { "epoch": 4.189717925386716, "grad_norm": 0.16050242908147475, "learning_rate": 6.348164020554936e-07, "loss": 0.0043, "step": 9209 }, { "epoch": 4.1901728844404005, "grad_norm": 0.14560812987755511, "learning_rate": 6.341195778169989e-07, "loss": 0.0016, "step": 9210 }, { "epoch": 4.190627843494085, "grad_norm": 0.1310902894596744, "learning_rate": 6.334231103385369e-07, "loss": 0.0009, "step": 9211 }, { "epoch": 4.191082802547771, "grad_norm": 0.01991136686751464, "learning_rate": 6.327269996770174e-07, "loss": 0.0002, "step": 9212 }, { "epoch": 4.191537761601456, "grad_norm": 0.042213720963715916, "learning_rate": 6.320312458893262e-07, "loss": 0.0004, "step": 9213 }, { "epoch": 4.191992720655141, "grad_norm": 0.1363581936034296, "learning_rate": 6.313358490323152e-07, "loss": 0.0009, "step": 9214 }, { "epoch": 4.192447679708827, "grad_norm": 0.3706466109068231, "learning_rate": 6.306408091628108e-07, "loss": 0.001, "step": 9215 }, { "epoch": 4.1929026387625115, "grad_norm": 0.016176040948542317, "learning_rate": 6.299461263376078e-07, "loss": 0.0001, "step": 9216 }, { "epoch": 4.193357597816196, "grad_norm": 0.05161273612807351, "learning_rate": 6.292518006134723e-07, "loss": 0.0002, "step": 9217 }, { "epoch": 4.193812556869882, "grad_norm": 0.004714421082205538, "learning_rate": 6.285578320471403e-07, "loss": 0.0, "step": 9218 }, { "epoch": 4.194267515923567, "grad_norm": 0.1783373314823882, "learning_rate": 6.278642206953212e-07, "loss": 0.002, "step": 9219 }, { "epoch": 4.194722474977252, "grad_norm": 0.26380081822850604, "learning_rate": 6.271709666146947e-07, "loss": 0.0023, "step": 9220 }, { "epoch": 4.195177434030938, "grad_norm": 0.10525343942278702, "learning_rate": 6.264780698619094e-07, "loss": 0.0002, "step": 9221 }, { "epoch": 4.195632393084622, "grad_norm": 0.1434293007929281, "learning_rate": 6.257855304935851e-07, "loss": 0.0004, "step": 9222 }, { "epoch": 4.196087352138307, "grad_norm": 0.05190244122703771, "learning_rate": 6.250933485663124e-07, "loss": 0.0003, "step": 9223 }, { "epoch": 4.196542311191993, "grad_norm": 0.014246052764307845, "learning_rate": 6.244015241366558e-07, "loss": 0.0001, "step": 9224 }, { "epoch": 4.196997270245678, "grad_norm": 0.034111973326917, "learning_rate": 6.237100572611465e-07, "loss": 0.0003, "step": 9225 }, { "epoch": 4.197452229299363, "grad_norm": 0.2155729294220744, "learning_rate": 6.230189479962873e-07, "loss": 0.0039, "step": 9226 }, { "epoch": 4.1979071883530485, "grad_norm": 0.0065085798646169766, "learning_rate": 6.223281963985539e-07, "loss": 0.0001, "step": 9227 }, { "epoch": 4.198362147406733, "grad_norm": 0.02088539552658922, "learning_rate": 6.216378025243902e-07, "loss": 0.0001, "step": 9228 }, { "epoch": 4.198817106460418, "grad_norm": 0.007280283249482536, "learning_rate": 6.209477664302139e-07, "loss": 0.0001, "step": 9229 }, { "epoch": 4.199272065514104, "grad_norm": 0.019677628734965126, "learning_rate": 6.202580881724107e-07, "loss": 0.0001, "step": 9230 }, { "epoch": 4.199727024567789, "grad_norm": 0.06795452885858172, "learning_rate": 6.195687678073376e-07, "loss": 0.0004, "step": 9231 }, { "epoch": 4.200181983621474, "grad_norm": 0.0936993291356914, "learning_rate": 6.188798053913226e-07, "loss": 0.0011, "step": 9232 }, { "epoch": 4.2006369426751595, "grad_norm": 0.04389398655634868, "learning_rate": 6.181912009806629e-07, "loss": 0.0004, "step": 9233 }, { "epoch": 4.201091901728844, "grad_norm": 0.019063541393414738, "learning_rate": 6.175029546316325e-07, "loss": 0.0002, "step": 9234 }, { "epoch": 4.201546860782529, "grad_norm": 0.16280824663918508, "learning_rate": 6.168150664004696e-07, "loss": 0.0004, "step": 9235 }, { "epoch": 4.202001819836215, "grad_norm": 0.08059903867287413, "learning_rate": 6.16127536343385e-07, "loss": 0.0004, "step": 9236 }, { "epoch": 4.2024567788899, "grad_norm": 0.12587323985074533, "learning_rate": 6.154403645165608e-07, "loss": 0.0009, "step": 9237 }, { "epoch": 4.202911737943585, "grad_norm": 0.004735581879985467, "learning_rate": 6.147535509761487e-07, "loss": 0.0, "step": 9238 }, { "epoch": 4.2033666969972705, "grad_norm": 0.11462942349212701, "learning_rate": 6.140670957782735e-07, "loss": 0.0007, "step": 9239 }, { "epoch": 4.203821656050955, "grad_norm": 0.6553681836037633, "learning_rate": 6.133809989790274e-07, "loss": 0.0047, "step": 9240 }, { "epoch": 4.20427661510464, "grad_norm": 0.07555905467646963, "learning_rate": 6.126952606344777e-07, "loss": 0.0007, "step": 9241 }, { "epoch": 4.204731574158326, "grad_norm": 0.018565144935949913, "learning_rate": 6.120098808006581e-07, "loss": 0.0001, "step": 9242 }, { "epoch": 4.205186533212011, "grad_norm": 0.02181423444709814, "learning_rate": 6.113248595335742e-07, "loss": 0.0002, "step": 9243 }, { "epoch": 4.205641492265696, "grad_norm": 0.5092251492851335, "learning_rate": 6.106401968892045e-07, "loss": 0.0034, "step": 9244 }, { "epoch": 4.2060964513193815, "grad_norm": 0.1322294130743196, "learning_rate": 6.099558929234961e-07, "loss": 0.0013, "step": 9245 }, { "epoch": 4.206551410373066, "grad_norm": 0.09545367306436295, "learning_rate": 6.092719476923664e-07, "loss": 0.0004, "step": 9246 }, { "epoch": 4.207006369426751, "grad_norm": 0.033852493799744435, "learning_rate": 6.085883612517041e-07, "loss": 0.0002, "step": 9247 }, { "epoch": 4.207461328480437, "grad_norm": 0.11487475001328348, "learning_rate": 6.079051336573694e-07, "loss": 0.0005, "step": 9248 }, { "epoch": 4.207916287534122, "grad_norm": 0.1393337908259671, "learning_rate": 6.072222649651938e-07, "loss": 0.0016, "step": 9249 }, { "epoch": 4.208371246587807, "grad_norm": 0.086005086579078, "learning_rate": 6.065397552309765e-07, "loss": 0.0009, "step": 9250 }, { "epoch": 4.2088262056414925, "grad_norm": 0.016164087562687696, "learning_rate": 6.058576045104903e-07, "loss": 0.0001, "step": 9251 }, { "epoch": 4.209281164695177, "grad_norm": 0.0396690418853614, "learning_rate": 6.051758128594759e-07, "loss": 0.0003, "step": 9252 }, { "epoch": 4.209736123748862, "grad_norm": 0.17307388864539813, "learning_rate": 6.044943803336478e-07, "loss": 0.0025, "step": 9253 }, { "epoch": 4.210191082802548, "grad_norm": 0.010909629365686685, "learning_rate": 6.038133069886887e-07, "loss": 0.0001, "step": 9254 }, { "epoch": 4.210646041856233, "grad_norm": 0.04285829641335229, "learning_rate": 6.031325928802534e-07, "loss": 0.0001, "step": 9255 }, { "epoch": 4.211101000909918, "grad_norm": 0.03186494509096555, "learning_rate": 6.024522380639669e-07, "loss": 0.0003, "step": 9256 }, { "epoch": 4.211555959963603, "grad_norm": 0.07519870204749489, "learning_rate": 6.017722425954231e-07, "loss": 0.0008, "step": 9257 }, { "epoch": 4.212010919017288, "grad_norm": 0.10114821368449645, "learning_rate": 6.010926065301909e-07, "loss": 0.001, "step": 9258 }, { "epoch": 4.212465878070974, "grad_norm": 0.005373423467783357, "learning_rate": 6.004133299238052e-07, "loss": 0.0, "step": 9259 }, { "epoch": 4.212920837124659, "grad_norm": 0.06858789496133387, "learning_rate": 5.997344128317739e-07, "loss": 0.0005, "step": 9260 }, { "epoch": 4.213375796178344, "grad_norm": 0.04407547760768694, "learning_rate": 5.990558553095743e-07, "loss": 0.0006, "step": 9261 }, { "epoch": 4.2138307552320295, "grad_norm": 0.01451503844421713, "learning_rate": 5.983776574126554e-07, "loss": 0.0001, "step": 9262 }, { "epoch": 4.214285714285714, "grad_norm": 0.11242653710469215, "learning_rate": 5.976998191964378e-07, "loss": 0.001, "step": 9263 }, { "epoch": 4.214740673339399, "grad_norm": 0.016192436623986877, "learning_rate": 5.9702234071631e-07, "loss": 0.0001, "step": 9264 }, { "epoch": 4.215195632393085, "grad_norm": 0.0494932513329367, "learning_rate": 5.963452220276333e-07, "loss": 0.0002, "step": 9265 }, { "epoch": 4.21565059144677, "grad_norm": 0.1596845483423612, "learning_rate": 5.956684631857385e-07, "loss": 0.0009, "step": 9266 }, { "epoch": 4.216105550500455, "grad_norm": 0.014596039927809067, "learning_rate": 5.949920642459256e-07, "loss": 0.0001, "step": 9267 }, { "epoch": 4.2165605095541405, "grad_norm": 0.16087612869616275, "learning_rate": 5.943160252634688e-07, "loss": 0.0031, "step": 9268 }, { "epoch": 4.217015468607825, "grad_norm": 0.01839831589538077, "learning_rate": 5.936403462936113e-07, "loss": 0.0001, "step": 9269 }, { "epoch": 4.21747042766151, "grad_norm": 0.0702892304218404, "learning_rate": 5.92965027391566e-07, "loss": 0.0005, "step": 9270 }, { "epoch": 4.217925386715196, "grad_norm": 0.17384681246707095, "learning_rate": 5.922900686125166e-07, "loss": 0.0007, "step": 9271 }, { "epoch": 4.218380345768881, "grad_norm": 0.12433413518160925, "learning_rate": 5.916154700116161e-07, "loss": 0.0013, "step": 9272 }, { "epoch": 4.218835304822566, "grad_norm": 0.04959066437792704, "learning_rate": 5.909412316439933e-07, "loss": 0.0007, "step": 9273 }, { "epoch": 4.2192902638762515, "grad_norm": 0.03396951723817135, "learning_rate": 5.902673535647413e-07, "loss": 0.0002, "step": 9274 }, { "epoch": 4.219745222929936, "grad_norm": 0.2632647347248532, "learning_rate": 5.89593835828926e-07, "loss": 0.0055, "step": 9275 }, { "epoch": 4.220200181983621, "grad_norm": 0.006930218834141626, "learning_rate": 5.889206784915863e-07, "loss": 0.0, "step": 9276 }, { "epoch": 4.220655141037307, "grad_norm": 0.10977733810756486, "learning_rate": 5.882478816077275e-07, "loss": 0.0008, "step": 9277 }, { "epoch": 4.221110100090992, "grad_norm": 0.07528702005502176, "learning_rate": 5.875754452323296e-07, "loss": 0.0009, "step": 9278 }, { "epoch": 4.221565059144677, "grad_norm": 0.026784072640724205, "learning_rate": 5.869033694203402e-07, "loss": 0.0002, "step": 9279 }, { "epoch": 4.2220200181983625, "grad_norm": 0.05917152904685677, "learning_rate": 5.862316542266777e-07, "loss": 0.0005, "step": 9280 }, { "epoch": 4.222474977252047, "grad_norm": 0.05384090420509348, "learning_rate": 5.85560299706231e-07, "loss": 0.0003, "step": 9281 }, { "epoch": 4.222929936305732, "grad_norm": 0.275294963965191, "learning_rate": 5.848893059138616e-07, "loss": 0.0009, "step": 9282 }, { "epoch": 4.223384895359418, "grad_norm": 0.20394959958015133, "learning_rate": 5.842186729044003e-07, "loss": 0.0029, "step": 9283 }, { "epoch": 4.223839854413103, "grad_norm": 0.03366684958534347, "learning_rate": 5.835484007326475e-07, "loss": 0.0001, "step": 9284 }, { "epoch": 4.224294813466788, "grad_norm": 0.34716471218447637, "learning_rate": 5.828784894533751e-07, "loss": 0.0039, "step": 9285 }, { "epoch": 4.2247497725204735, "grad_norm": 0.09420327678053593, "learning_rate": 5.822089391213237e-07, "loss": 0.0007, "step": 9286 }, { "epoch": 4.225204731574158, "grad_norm": 0.0740663765362519, "learning_rate": 5.815397497912084e-07, "loss": 0.0006, "step": 9287 }, { "epoch": 4.225659690627843, "grad_norm": 0.02050749025065289, "learning_rate": 5.808709215177111e-07, "loss": 0.0001, "step": 9288 }, { "epoch": 4.226114649681529, "grad_norm": 0.04191357099582751, "learning_rate": 5.802024543554846e-07, "loss": 0.0009, "step": 9289 }, { "epoch": 4.226569608735214, "grad_norm": 0.023610364371886728, "learning_rate": 5.795343483591548e-07, "loss": 0.0001, "step": 9290 }, { "epoch": 4.227024567788899, "grad_norm": 0.027975160420341295, "learning_rate": 5.788666035833146e-07, "loss": 0.0002, "step": 9291 }, { "epoch": 4.227479526842584, "grad_norm": 0.14526961551799208, "learning_rate": 5.781992200825309e-07, "loss": 0.0015, "step": 9292 }, { "epoch": 4.227934485896269, "grad_norm": 0.06404254366889023, "learning_rate": 5.77532197911338e-07, "loss": 0.0003, "step": 9293 }, { "epoch": 4.228389444949954, "grad_norm": 0.05215656286078959, "learning_rate": 5.768655371242421e-07, "loss": 0.0004, "step": 9294 }, { "epoch": 4.22884440400364, "grad_norm": 0.18349748769479132, "learning_rate": 5.761992377757192e-07, "loss": 0.0012, "step": 9295 }, { "epoch": 4.229299363057325, "grad_norm": 0.028345648795295303, "learning_rate": 5.755332999202168e-07, "loss": 0.0003, "step": 9296 }, { "epoch": 4.22975432211101, "grad_norm": 0.07408040339289708, "learning_rate": 5.74867723612153e-07, "loss": 0.0006, "step": 9297 }, { "epoch": 4.230209281164695, "grad_norm": 0.07599486844267275, "learning_rate": 5.742025089059155e-07, "loss": 0.0002, "step": 9298 }, { "epoch": 4.23066424021838, "grad_norm": 0.22352911402439368, "learning_rate": 5.735376558558625e-07, "loss": 0.0045, "step": 9299 }, { "epoch": 4.231119199272065, "grad_norm": 0.06511784495072674, "learning_rate": 5.72873164516321e-07, "loss": 0.0004, "step": 9300 }, { "epoch": 4.231574158325751, "grad_norm": 0.09486577965947564, "learning_rate": 5.722090349415932e-07, "loss": 0.001, "step": 9301 }, { "epoch": 4.232029117379436, "grad_norm": 0.140978365623315, "learning_rate": 5.715452671859468e-07, "loss": 0.0023, "step": 9302 }, { "epoch": 4.232484076433121, "grad_norm": 0.023789570101093938, "learning_rate": 5.708818613036221e-07, "loss": 0.0002, "step": 9303 }, { "epoch": 4.232939035486806, "grad_norm": 0.06217827631473419, "learning_rate": 5.702188173488304e-07, "loss": 0.0006, "step": 9304 }, { "epoch": 4.233393994540491, "grad_norm": 0.029948375685679478, "learning_rate": 5.695561353757523e-07, "loss": 0.0001, "step": 9305 }, { "epoch": 4.233848953594176, "grad_norm": 0.06481214941934971, "learning_rate": 5.688938154385382e-07, "loss": 0.0006, "step": 9306 }, { "epoch": 4.234303912647862, "grad_norm": 0.015754880391528853, "learning_rate": 5.682318575913121e-07, "loss": 0.0001, "step": 9307 }, { "epoch": 4.234758871701547, "grad_norm": 0.1370104022465754, "learning_rate": 5.675702618881645e-07, "loss": 0.0032, "step": 9308 }, { "epoch": 4.235213830755232, "grad_norm": 0.030985005776223276, "learning_rate": 5.669090283831585e-07, "loss": 0.0002, "step": 9309 }, { "epoch": 4.235668789808917, "grad_norm": 0.019916934754802586, "learning_rate": 5.662481571303264e-07, "loss": 0.0001, "step": 9310 }, { "epoch": 4.236123748862602, "grad_norm": 0.33903228752697046, "learning_rate": 5.655876481836719e-07, "loss": 0.002, "step": 9311 }, { "epoch": 4.236578707916287, "grad_norm": 0.15377719459580733, "learning_rate": 5.649275015971706e-07, "loss": 0.0025, "step": 9312 }, { "epoch": 4.237033666969973, "grad_norm": 0.12203306522676567, "learning_rate": 5.642677174247646e-07, "loss": 0.0014, "step": 9313 }, { "epoch": 4.237488626023658, "grad_norm": 0.09511660950678767, "learning_rate": 5.636082957203698e-07, "loss": 0.0011, "step": 9314 }, { "epoch": 4.237943585077343, "grad_norm": 0.05398236632756204, "learning_rate": 5.629492365378691e-07, "loss": 0.0019, "step": 9315 }, { "epoch": 4.238398544131028, "grad_norm": 0.029383209840151758, "learning_rate": 5.622905399311201e-07, "loss": 0.0002, "step": 9316 }, { "epoch": 4.238853503184713, "grad_norm": 0.1474027183066491, "learning_rate": 5.616322059539469e-07, "loss": 0.0016, "step": 9317 }, { "epoch": 4.239308462238399, "grad_norm": 0.03116306615580843, "learning_rate": 5.609742346601471e-07, "loss": 0.0002, "step": 9318 }, { "epoch": 4.239763421292084, "grad_norm": 0.06595812418215023, "learning_rate": 5.603166261034865e-07, "loss": 0.0007, "step": 9319 }, { "epoch": 4.240218380345769, "grad_norm": 0.009383657030537765, "learning_rate": 5.59659380337701e-07, "loss": 0.0001, "step": 9320 }, { "epoch": 4.2406733393994545, "grad_norm": 0.025587756274128386, "learning_rate": 5.590024974164993e-07, "loss": 0.0001, "step": 9321 }, { "epoch": 4.241128298453139, "grad_norm": 0.07414089223016224, "learning_rate": 5.583459773935584e-07, "loss": 0.0009, "step": 9322 }, { "epoch": 4.241583257506824, "grad_norm": 0.2552290100720014, "learning_rate": 5.576898203225256e-07, "loss": 0.0014, "step": 9323 }, { "epoch": 4.24203821656051, "grad_norm": 0.21993919821261146, "learning_rate": 5.570340262570184e-07, "loss": 0.0033, "step": 9324 }, { "epoch": 4.242493175614195, "grad_norm": 0.04946897779007293, "learning_rate": 5.563785952506267e-07, "loss": 0.0005, "step": 9325 }, { "epoch": 4.24294813466788, "grad_norm": 0.1987220738552609, "learning_rate": 5.557235273569094e-07, "loss": 0.0036, "step": 9326 }, { "epoch": 4.243403093721565, "grad_norm": 0.12016134997718976, "learning_rate": 5.55068822629396e-07, "loss": 0.0003, "step": 9327 }, { "epoch": 4.24385805277525, "grad_norm": 0.17516747446154912, "learning_rate": 5.544144811215845e-07, "loss": 0.0015, "step": 9328 }, { "epoch": 4.244313011828935, "grad_norm": 0.053407387325286236, "learning_rate": 5.537605028869453e-07, "loss": 0.0006, "step": 9329 }, { "epoch": 4.244767970882621, "grad_norm": 0.0790862010567722, "learning_rate": 5.531068879789192e-07, "loss": 0.0008, "step": 9330 }, { "epoch": 4.245222929936306, "grad_norm": 0.02954864414504281, "learning_rate": 5.524536364509153e-07, "loss": 0.0002, "step": 9331 }, { "epoch": 4.245677888989991, "grad_norm": 0.05049424345756518, "learning_rate": 5.518007483563165e-07, "loss": 0.0004, "step": 9332 }, { "epoch": 4.246132848043676, "grad_norm": 0.10101528020228469, "learning_rate": 5.511482237484722e-07, "loss": 0.0006, "step": 9333 }, { "epoch": 4.246587807097361, "grad_norm": 0.052945242228749474, "learning_rate": 5.504960626807038e-07, "loss": 0.0004, "step": 9334 }, { "epoch": 4.247042766151046, "grad_norm": 0.03692554971228215, "learning_rate": 5.49844265206304e-07, "loss": 0.0001, "step": 9335 }, { "epoch": 4.247497725204732, "grad_norm": 0.009485519778121981, "learning_rate": 5.491928313785344e-07, "loss": 0.0001, "step": 9336 }, { "epoch": 4.247952684258417, "grad_norm": 0.009598598109304232, "learning_rate": 5.485417612506267e-07, "loss": 0.0001, "step": 9337 }, { "epoch": 4.248407643312102, "grad_norm": 0.02561969510083542, "learning_rate": 5.478910548757827e-07, "loss": 0.0001, "step": 9338 }, { "epoch": 4.248862602365787, "grad_norm": 0.10708835994655748, "learning_rate": 5.47240712307176e-07, "loss": 0.0029, "step": 9339 }, { "epoch": 4.249317561419472, "grad_norm": 0.09559653245999736, "learning_rate": 5.465907335979514e-07, "loss": 0.001, "step": 9340 }, { "epoch": 4.249772520473157, "grad_norm": 0.15467428574057263, "learning_rate": 5.459411188012198e-07, "loss": 0.0017, "step": 9341 }, { "epoch": 4.250227479526843, "grad_norm": 0.02213983272205055, "learning_rate": 5.452918679700664e-07, "loss": 0.0001, "step": 9342 }, { "epoch": 4.250682438580528, "grad_norm": 0.09463856127072065, "learning_rate": 5.446429811575438e-07, "loss": 0.0006, "step": 9343 }, { "epoch": 4.251137397634213, "grad_norm": 0.04324646566908147, "learning_rate": 5.439944584166756e-07, "loss": 0.0002, "step": 9344 }, { "epoch": 4.251592356687898, "grad_norm": 0.10622226285871803, "learning_rate": 5.433462998004574e-07, "loss": 0.0012, "step": 9345 }, { "epoch": 4.252047315741583, "grad_norm": 0.005745622212430651, "learning_rate": 5.426985053618545e-07, "loss": 0.0, "step": 9346 }, { "epoch": 4.252502274795268, "grad_norm": 0.05565024068149903, "learning_rate": 5.420510751538005e-07, "loss": 0.0001, "step": 9347 }, { "epoch": 4.252957233848954, "grad_norm": 0.12740508911108422, "learning_rate": 5.414040092292006e-07, "loss": 0.002, "step": 9348 }, { "epoch": 4.253412192902639, "grad_norm": 0.00713767722317854, "learning_rate": 5.407573076409295e-07, "loss": 0.0, "step": 9349 }, { "epoch": 4.253867151956324, "grad_norm": 0.18049298993801566, "learning_rate": 5.401109704418339e-07, "loss": 0.0016, "step": 9350 }, { "epoch": 4.254322111010009, "grad_norm": 0.141920957311733, "learning_rate": 5.3946499768473e-07, "loss": 0.0024, "step": 9351 }, { "epoch": 4.254777070063694, "grad_norm": 0.12093676476602054, "learning_rate": 5.388193894224014e-07, "loss": 0.001, "step": 9352 }, { "epoch": 4.255232029117379, "grad_norm": 0.02067757886910532, "learning_rate": 5.381741457076068e-07, "loss": 0.0001, "step": 9353 }, { "epoch": 4.255686988171065, "grad_norm": 0.4797307568601315, "learning_rate": 5.375292665930703e-07, "loss": 0.014, "step": 9354 }, { "epoch": 4.25614194722475, "grad_norm": 0.006306633377726425, "learning_rate": 5.368847521314912e-07, "loss": 0.0, "step": 9355 }, { "epoch": 4.256596906278435, "grad_norm": 0.0990989142607219, "learning_rate": 5.362406023755351e-07, "loss": 0.001, "step": 9356 }, { "epoch": 4.25705186533212, "grad_norm": 0.15494132093358595, "learning_rate": 5.355968173778386e-07, "loss": 0.0009, "step": 9357 }, { "epoch": 4.257506824385805, "grad_norm": 0.05255021309603676, "learning_rate": 5.349533971910081e-07, "loss": 0.0006, "step": 9358 }, { "epoch": 4.25796178343949, "grad_norm": 0.16000169484884014, "learning_rate": 5.343103418676215e-07, "loss": 0.0018, "step": 9359 }, { "epoch": 4.258416742493176, "grad_norm": 0.01837174525114683, "learning_rate": 5.336676514602285e-07, "loss": 0.0001, "step": 9360 }, { "epoch": 4.258871701546861, "grad_norm": 0.12017248432687933, "learning_rate": 5.330253260213452e-07, "loss": 0.0017, "step": 9361 }, { "epoch": 4.2593266606005455, "grad_norm": 0.011715700179753999, "learning_rate": 5.323833656034594e-07, "loss": 0.0001, "step": 9362 }, { "epoch": 4.259781619654231, "grad_norm": 0.040199168037055026, "learning_rate": 5.317417702590283e-07, "loss": 0.0004, "step": 9363 }, { "epoch": 4.260236578707916, "grad_norm": 0.05866484084764532, "learning_rate": 5.311005400404828e-07, "loss": 0.0003, "step": 9364 }, { "epoch": 4.260691537761602, "grad_norm": 0.0509454176492725, "learning_rate": 5.304596750002195e-07, "loss": 0.0004, "step": 9365 }, { "epoch": 4.261146496815287, "grad_norm": 0.005881427861288782, "learning_rate": 5.298191751906057e-07, "loss": 0.0, "step": 9366 }, { "epoch": 4.261601455868972, "grad_norm": 0.040475060842325425, "learning_rate": 5.291790406639836e-07, "loss": 0.0002, "step": 9367 }, { "epoch": 4.262056414922657, "grad_norm": 0.03195688298089145, "learning_rate": 5.285392714726589e-07, "loss": 0.0002, "step": 9368 }, { "epoch": 4.262511373976342, "grad_norm": 0.06749456836920831, "learning_rate": 5.278998676689129e-07, "loss": 0.0005, "step": 9369 }, { "epoch": 4.262966333030027, "grad_norm": 0.00963296156122002, "learning_rate": 5.272608293049941e-07, "loss": 0.0001, "step": 9370 }, { "epoch": 4.263421292083713, "grad_norm": 0.13375752970379154, "learning_rate": 5.266221564331214e-07, "loss": 0.0013, "step": 9371 }, { "epoch": 4.263876251137398, "grad_norm": 0.016385319406306883, "learning_rate": 5.259838491054836e-07, "loss": 0.0001, "step": 9372 }, { "epoch": 4.264331210191083, "grad_norm": 0.02039640415875035, "learning_rate": 5.253459073742411e-07, "loss": 0.0002, "step": 9373 }, { "epoch": 4.264786169244768, "grad_norm": 0.15402696514031883, "learning_rate": 5.247083312915247e-07, "loss": 0.0006, "step": 9374 }, { "epoch": 4.265241128298453, "grad_norm": 0.00758210887343876, "learning_rate": 5.240711209094335e-07, "loss": 0.0001, "step": 9375 }, { "epoch": 4.265696087352138, "grad_norm": 0.06534292542961342, "learning_rate": 5.234342762800365e-07, "loss": 0.0007, "step": 9376 }, { "epoch": 4.266151046405824, "grad_norm": 0.08075096057316716, "learning_rate": 5.227977974553749e-07, "loss": 0.0006, "step": 9377 }, { "epoch": 4.266606005459509, "grad_norm": 0.08130011001507072, "learning_rate": 5.221616844874577e-07, "loss": 0.0007, "step": 9378 }, { "epoch": 4.267060964513194, "grad_norm": 0.13562851418401212, "learning_rate": 5.215259374282666e-07, "loss": 0.0022, "step": 9379 }, { "epoch": 4.267515923566879, "grad_norm": 0.11049870894123699, "learning_rate": 5.2089055632975e-07, "loss": 0.0004, "step": 9380 }, { "epoch": 4.267970882620564, "grad_norm": 0.03203315008822801, "learning_rate": 5.202555412438309e-07, "loss": 0.0002, "step": 9381 }, { "epoch": 4.268425841674249, "grad_norm": 0.11246568988555525, "learning_rate": 5.196208922223988e-07, "loss": 0.0014, "step": 9382 }, { "epoch": 4.268880800727935, "grad_norm": 0.0484548824835751, "learning_rate": 5.189866093173135e-07, "loss": 0.0004, "step": 9383 }, { "epoch": 4.26933575978162, "grad_norm": 0.02306155849820032, "learning_rate": 5.183526925804067e-07, "loss": 0.0002, "step": 9384 }, { "epoch": 4.269790718835305, "grad_norm": 0.1073189119835409, "learning_rate": 5.177191420634792e-07, "loss": 0.0012, "step": 9385 }, { "epoch": 4.27024567788899, "grad_norm": 0.02978896684836263, "learning_rate": 5.170859578183019e-07, "loss": 0.0002, "step": 9386 }, { "epoch": 4.270700636942675, "grad_norm": 0.02872018391760248, "learning_rate": 5.164531398966138e-07, "loss": 0.0002, "step": 9387 }, { "epoch": 4.27115559599636, "grad_norm": 0.024454689149395675, "learning_rate": 5.158206883501282e-07, "loss": 0.0001, "step": 9388 }, { "epoch": 4.271610555050046, "grad_norm": 0.309703972484005, "learning_rate": 5.151886032305265e-07, "loss": 0.0012, "step": 9389 }, { "epoch": 4.272065514103731, "grad_norm": 0.025025046483026826, "learning_rate": 5.145568845894583e-07, "loss": 0.0001, "step": 9390 }, { "epoch": 4.272520473157416, "grad_norm": 0.160686964743567, "learning_rate": 5.139255324785458e-07, "loss": 0.0015, "step": 9391 }, { "epoch": 4.272975432211101, "grad_norm": 0.035499169518848406, "learning_rate": 5.132945469493788e-07, "loss": 0.0002, "step": 9392 }, { "epoch": 4.273430391264786, "grad_norm": 0.20541328876179651, "learning_rate": 5.126639280535211e-07, "loss": 0.0016, "step": 9393 }, { "epoch": 4.273885350318471, "grad_norm": 0.010777903427906499, "learning_rate": 5.12033675842501e-07, "loss": 0.0001, "step": 9394 }, { "epoch": 4.274340309372157, "grad_norm": 0.11456487271800445, "learning_rate": 5.114037903678227e-07, "loss": 0.0008, "step": 9395 }, { "epoch": 4.274795268425842, "grad_norm": 0.037896546187512016, "learning_rate": 5.107742716809566e-07, "loss": 0.0002, "step": 9396 }, { "epoch": 4.2752502274795265, "grad_norm": 0.03162816545617232, "learning_rate": 5.101451198333423e-07, "loss": 0.0002, "step": 9397 }, { "epoch": 4.275705186533212, "grad_norm": 0.08485308349316291, "learning_rate": 5.095163348763943e-07, "loss": 0.0007, "step": 9398 }, { "epoch": 4.276160145586897, "grad_norm": 0.10601461335791529, "learning_rate": 5.088879168614918e-07, "loss": 0.0014, "step": 9399 }, { "epoch": 4.276615104640582, "grad_norm": 0.04136955354817777, "learning_rate": 5.082598658399879e-07, "loss": 0.0002, "step": 9400 }, { "epoch": 4.277070063694268, "grad_norm": 0.18810748326277346, "learning_rate": 5.076321818632018e-07, "loss": 0.0039, "step": 9401 }, { "epoch": 4.277525022747953, "grad_norm": 0.09148530578537371, "learning_rate": 5.070048649824267e-07, "loss": 0.0004, "step": 9402 }, { "epoch": 4.2779799818016375, "grad_norm": 0.12006026487055753, "learning_rate": 5.063779152489245e-07, "loss": 0.0011, "step": 9403 }, { "epoch": 4.278434940855323, "grad_norm": 0.011503973504918717, "learning_rate": 5.057513327139263e-07, "loss": 0.0001, "step": 9404 }, { "epoch": 4.278889899909008, "grad_norm": 0.02451284363597608, "learning_rate": 5.051251174286331e-07, "loss": 0.0001, "step": 9405 }, { "epoch": 4.279344858962693, "grad_norm": 0.05348024494939322, "learning_rate": 5.044992694442158e-07, "loss": 0.0005, "step": 9406 }, { "epoch": 4.279799818016379, "grad_norm": 0.02422143268384626, "learning_rate": 5.038737888118178e-07, "loss": 0.0002, "step": 9407 }, { "epoch": 4.280254777070064, "grad_norm": 0.007810394825914106, "learning_rate": 5.032486755825484e-07, "loss": 0.0001, "step": 9408 }, { "epoch": 4.2807097361237485, "grad_norm": 0.009611292354112944, "learning_rate": 5.026239298074909e-07, "loss": 0.0001, "step": 9409 }, { "epoch": 4.281164695177434, "grad_norm": 0.1992134259239629, "learning_rate": 5.019995515376963e-07, "loss": 0.0013, "step": 9410 }, { "epoch": 4.281619654231119, "grad_norm": 0.03746306288079818, "learning_rate": 5.01375540824185e-07, "loss": 0.0002, "step": 9411 }, { "epoch": 4.282074613284804, "grad_norm": 0.007442771113152153, "learning_rate": 5.007518977179482e-07, "loss": 0.0, "step": 9412 }, { "epoch": 4.28252957233849, "grad_norm": 0.07102955563686467, "learning_rate": 5.001286222699491e-07, "loss": 0.0004, "step": 9413 }, { "epoch": 4.282984531392175, "grad_norm": 0.2658893686714957, "learning_rate": 4.995057145311172e-07, "loss": 0.0016, "step": 9414 }, { "epoch": 4.2834394904458595, "grad_norm": 0.01212385410953683, "learning_rate": 4.988831745523537e-07, "loss": 0.0001, "step": 9415 }, { "epoch": 4.283894449499545, "grad_norm": 0.08025179085578654, "learning_rate": 4.982610023845313e-07, "loss": 0.0006, "step": 9416 }, { "epoch": 4.28434940855323, "grad_norm": 0.03395557345563699, "learning_rate": 4.976391980784889e-07, "loss": 0.0002, "step": 9417 }, { "epoch": 4.284804367606915, "grad_norm": 0.28009571920375315, "learning_rate": 4.970177616850397e-07, "loss": 0.0035, "step": 9418 }, { "epoch": 4.285259326660601, "grad_norm": 0.2121134074411285, "learning_rate": 4.963966932549641e-07, "loss": 0.0022, "step": 9419 }, { "epoch": 4.285714285714286, "grad_norm": 0.019556036543950005, "learning_rate": 4.957759928390121e-07, "loss": 0.0001, "step": 9420 }, { "epoch": 4.2861692447679705, "grad_norm": 0.1880748330002612, "learning_rate": 4.951556604879049e-07, "loss": 0.0008, "step": 9421 }, { "epoch": 4.286624203821656, "grad_norm": 0.11429922284627615, "learning_rate": 4.945356962523329e-07, "loss": 0.0013, "step": 9422 }, { "epoch": 4.287079162875341, "grad_norm": 0.04548185066711093, "learning_rate": 4.939161001829579e-07, "loss": 0.0001, "step": 9423 }, { "epoch": 4.287534121929026, "grad_norm": 0.11820784247438874, "learning_rate": 4.932968723304105e-07, "loss": 0.0012, "step": 9424 }, { "epoch": 4.287989080982712, "grad_norm": 0.12433797446899217, "learning_rate": 4.926780127452901e-07, "loss": 0.0016, "step": 9425 }, { "epoch": 4.288444040036397, "grad_norm": 0.07678094650993814, "learning_rate": 4.920595214781671e-07, "loss": 0.0004, "step": 9426 }, { "epoch": 4.288898999090081, "grad_norm": 0.11673110203288319, "learning_rate": 4.914413985795829e-07, "loss": 0.0013, "step": 9427 }, { "epoch": 4.289353958143767, "grad_norm": 0.15560004974938102, "learning_rate": 4.908236441000474e-07, "loss": 0.0024, "step": 9428 }, { "epoch": 4.289808917197452, "grad_norm": 0.021967800806110525, "learning_rate": 4.902062580900396e-07, "loss": 0.0002, "step": 9429 }, { "epoch": 4.290263876251138, "grad_norm": 0.029300951166889445, "learning_rate": 4.895892406000113e-07, "loss": 0.0002, "step": 9430 }, { "epoch": 4.290718835304823, "grad_norm": 0.26672052180912426, "learning_rate": 4.889725916803801e-07, "loss": 0.0013, "step": 9431 }, { "epoch": 4.2911737943585075, "grad_norm": 0.14466017685738486, "learning_rate": 4.883563113815388e-07, "loss": 0.0014, "step": 9432 }, { "epoch": 4.291628753412193, "grad_norm": 0.03234383339019922, "learning_rate": 4.877403997538443e-07, "loss": 0.0002, "step": 9433 }, { "epoch": 4.292083712465878, "grad_norm": 0.01765892589603697, "learning_rate": 4.871248568476278e-07, "loss": 0.0001, "step": 9434 }, { "epoch": 4.292538671519563, "grad_norm": 0.030799748763440448, "learning_rate": 4.865096827131871e-07, "loss": 0.0003, "step": 9435 }, { "epoch": 4.292993630573249, "grad_norm": 0.12289043677660492, "learning_rate": 4.858948774007921e-07, "loss": 0.0006, "step": 9436 }, { "epoch": 4.293448589626934, "grad_norm": 0.10882075896221208, "learning_rate": 4.852804409606832e-07, "loss": 0.0007, "step": 9437 }, { "epoch": 4.2939035486806185, "grad_norm": 0.13369377006159008, "learning_rate": 4.846663734430684e-07, "loss": 0.0031, "step": 9438 }, { "epoch": 4.294358507734304, "grad_norm": 0.2689251459401683, "learning_rate": 4.840526748981267e-07, "loss": 0.0026, "step": 9439 }, { "epoch": 4.294813466787989, "grad_norm": 0.06324588817919938, "learning_rate": 4.83439345376005e-07, "loss": 0.0002, "step": 9440 }, { "epoch": 4.295268425841674, "grad_norm": 0.01046168002163692, "learning_rate": 4.82826384926825e-07, "loss": 0.0001, "step": 9441 }, { "epoch": 4.29572338489536, "grad_norm": 0.3567896872767112, "learning_rate": 4.822137936006732e-07, "loss": 0.0043, "step": 9442 }, { "epoch": 4.296178343949045, "grad_norm": 0.12613594506668488, "learning_rate": 4.816015714476074e-07, "loss": 0.0008, "step": 9443 }, { "epoch": 4.2966333030027295, "grad_norm": 0.13948413840303564, "learning_rate": 4.809897185176566e-07, "loss": 0.0006, "step": 9444 }, { "epoch": 4.297088262056415, "grad_norm": 0.06265948200416833, "learning_rate": 4.803782348608177e-07, "loss": 0.0006, "step": 9445 }, { "epoch": 4.2975432211101, "grad_norm": 0.010579017652374813, "learning_rate": 4.797671205270604e-07, "loss": 0.0001, "step": 9446 }, { "epoch": 4.297998180163785, "grad_norm": 0.1152175307054137, "learning_rate": 4.791563755663203e-07, "loss": 0.0023, "step": 9447 }, { "epoch": 4.298453139217471, "grad_norm": 0.015339585207144988, "learning_rate": 4.785460000285053e-07, "loss": 0.0001, "step": 9448 }, { "epoch": 4.298908098271156, "grad_norm": 0.005743888658135438, "learning_rate": 4.779359939634926e-07, "loss": 0.0, "step": 9449 }, { "epoch": 4.2993630573248405, "grad_norm": 0.164605272005598, "learning_rate": 4.773263574211279e-07, "loss": 0.0006, "step": 9450 }, { "epoch": 4.299818016378526, "grad_norm": 0.05192488266792382, "learning_rate": 4.7671709045122914e-07, "loss": 0.0012, "step": 9451 }, { "epoch": 4.300272975432211, "grad_norm": 0.34121220992405865, "learning_rate": 4.761081931035838e-07, "loss": 0.0059, "step": 9452 }, { "epoch": 4.300727934485896, "grad_norm": 0.12996774576180398, "learning_rate": 4.7549966542794703e-07, "loss": 0.001, "step": 9453 }, { "epoch": 4.301182893539582, "grad_norm": 0.04212948058346271, "learning_rate": 4.748915074740451e-07, "loss": 0.0001, "step": 9454 }, { "epoch": 4.301637852593267, "grad_norm": 0.06553098037555283, "learning_rate": 4.7428371929157333e-07, "loss": 0.0005, "step": 9455 }, { "epoch": 4.3020928116469515, "grad_norm": 0.01692617983929504, "learning_rate": 4.736763009301987e-07, "loss": 0.0001, "step": 9456 }, { "epoch": 4.302547770700637, "grad_norm": 0.0914594123086564, "learning_rate": 4.730692524395553e-07, "loss": 0.0009, "step": 9457 }, { "epoch": 4.303002729754322, "grad_norm": 0.07401523962304041, "learning_rate": 4.724625738692501e-07, "loss": 0.0003, "step": 9458 }, { "epoch": 4.303457688808007, "grad_norm": 0.05006230651350633, "learning_rate": 4.718562652688574e-07, "loss": 0.0002, "step": 9459 }, { "epoch": 4.303912647861693, "grad_norm": 0.12790973820871607, "learning_rate": 4.7125032668792036e-07, "loss": 0.001, "step": 9460 }, { "epoch": 4.304367606915378, "grad_norm": 0.09872796404226394, "learning_rate": 4.70644758175956e-07, "loss": 0.0009, "step": 9461 }, { "epoch": 4.304822565969062, "grad_norm": 0.09948775177384377, "learning_rate": 4.700395597824481e-07, "loss": 0.0007, "step": 9462 }, { "epoch": 4.305277525022748, "grad_norm": 0.19927698689297516, "learning_rate": 4.6943473155684983e-07, "loss": 0.0009, "step": 9463 }, { "epoch": 4.305732484076433, "grad_norm": 0.03053122579911819, "learning_rate": 4.6883027354858447e-07, "loss": 0.0002, "step": 9464 }, { "epoch": 4.306187443130118, "grad_norm": 0.3646886361853749, "learning_rate": 4.6822618580704694e-07, "loss": 0.0043, "step": 9465 }, { "epoch": 4.306642402183804, "grad_norm": 0.04212069404476076, "learning_rate": 4.676224683816005e-07, "loss": 0.0005, "step": 9466 }, { "epoch": 4.3070973612374885, "grad_norm": 0.1519858996652111, "learning_rate": 4.6701912132157854e-07, "loss": 0.0009, "step": 9467 }, { "epoch": 4.307552320291173, "grad_norm": 0.020674840563901613, "learning_rate": 4.664161446762827e-07, "loss": 0.0002, "step": 9468 }, { "epoch": 4.308007279344859, "grad_norm": 0.022163062605841628, "learning_rate": 4.6581353849498576e-07, "loss": 0.0001, "step": 9469 }, { "epoch": 4.308462238398544, "grad_norm": 0.03481091456227857, "learning_rate": 4.652113028269306e-07, "loss": 0.0002, "step": 9470 }, { "epoch": 4.308917197452229, "grad_norm": 0.02440523714926293, "learning_rate": 4.646094377213284e-07, "loss": 0.0001, "step": 9471 }, { "epoch": 4.309372156505915, "grad_norm": 0.18156949652247606, "learning_rate": 4.640079432273614e-07, "loss": 0.0013, "step": 9472 }, { "epoch": 4.3098271155595995, "grad_norm": 0.014171935112652133, "learning_rate": 4.6340681939418155e-07, "loss": 0.0001, "step": 9473 }, { "epoch": 4.310282074613285, "grad_norm": 0.10118555549312674, "learning_rate": 4.628060662709083e-07, "loss": 0.0005, "step": 9474 }, { "epoch": 4.31073703366697, "grad_norm": 0.07053292161657575, "learning_rate": 4.6220568390663465e-07, "loss": 0.0006, "step": 9475 }, { "epoch": 4.311191992720655, "grad_norm": 0.01780630011510076, "learning_rate": 4.6160567235041974e-07, "loss": 0.0001, "step": 9476 }, { "epoch": 4.311646951774341, "grad_norm": 0.07682431226079638, "learning_rate": 4.610060316512943e-07, "loss": 0.0008, "step": 9477 }, { "epoch": 4.312101910828026, "grad_norm": 0.07722225271805593, "learning_rate": 4.6040676185825696e-07, "loss": 0.0004, "step": 9478 }, { "epoch": 4.3125568698817105, "grad_norm": 0.009418970620694796, "learning_rate": 4.5980786302027846e-07, "loss": 0.0001, "step": 9479 }, { "epoch": 4.313011828935396, "grad_norm": 0.02565495146216306, "learning_rate": 4.592093351862992e-07, "loss": 0.0002, "step": 9480 }, { "epoch": 4.313466787989081, "grad_norm": 0.09445978290931722, "learning_rate": 4.5861117840522664e-07, "loss": 0.001, "step": 9481 }, { "epoch": 4.313921747042766, "grad_norm": 0.012392574447745174, "learning_rate": 4.5801339272594004e-07, "loss": 0.0, "step": 9482 }, { "epoch": 4.314376706096452, "grad_norm": 0.01611543225774431, "learning_rate": 4.574159781972876e-07, "loss": 0.0001, "step": 9483 }, { "epoch": 4.314831665150137, "grad_norm": 0.07530056225211533, "learning_rate": 4.5681893486808625e-07, "loss": 0.0004, "step": 9484 }, { "epoch": 4.3152866242038215, "grad_norm": 0.13969925655438337, "learning_rate": 4.562222627871249e-07, "loss": 0.0012, "step": 9485 }, { "epoch": 4.315741583257507, "grad_norm": 0.09025074476596753, "learning_rate": 4.556259620031617e-07, "loss": 0.0004, "step": 9486 }, { "epoch": 4.316196542311192, "grad_norm": 0.03016822732471592, "learning_rate": 4.550300325649226e-07, "loss": 0.0003, "step": 9487 }, { "epoch": 4.316651501364877, "grad_norm": 0.1167719769355159, "learning_rate": 4.544344745211038e-07, "loss": 0.0022, "step": 9488 }, { "epoch": 4.317106460418563, "grad_norm": 0.024338125589698348, "learning_rate": 4.538392879203718e-07, "loss": 0.0002, "step": 9489 }, { "epoch": 4.317561419472248, "grad_norm": 0.023710959654620913, "learning_rate": 4.5324447281136383e-07, "loss": 0.0003, "step": 9490 }, { "epoch": 4.3180163785259325, "grad_norm": 0.1678662928253239, "learning_rate": 4.5265002924268443e-07, "loss": 0.0024, "step": 9491 }, { "epoch": 4.318471337579618, "grad_norm": 0.08620100759381095, "learning_rate": 4.5205595726290795e-07, "loss": 0.0004, "step": 9492 }, { "epoch": 4.318926296633303, "grad_norm": 0.026232198256582, "learning_rate": 4.5146225692058174e-07, "loss": 0.0002, "step": 9493 }, { "epoch": 4.319381255686988, "grad_norm": 0.05810398380901062, "learning_rate": 4.5086892826421757e-07, "loss": 0.0003, "step": 9494 }, { "epoch": 4.319836214740674, "grad_norm": 0.030043277667089033, "learning_rate": 4.502759713423016e-07, "loss": 0.0001, "step": 9495 }, { "epoch": 4.320291173794359, "grad_norm": 0.14353490409139388, "learning_rate": 4.496833862032873e-07, "loss": 0.001, "step": 9496 }, { "epoch": 4.320746132848043, "grad_norm": 0.2908135803299327, "learning_rate": 4.4909117289559713e-07, "loss": 0.002, "step": 9497 }, { "epoch": 4.321201091901729, "grad_norm": 0.054794526283025904, "learning_rate": 4.484993314676239e-07, "loss": 0.0005, "step": 9498 }, { "epoch": 4.321656050955414, "grad_norm": 0.28404728994912926, "learning_rate": 4.479078619677313e-07, "loss": 0.0019, "step": 9499 }, { "epoch": 4.322111010009099, "grad_norm": 0.0231018922164419, "learning_rate": 4.4731676444425165e-07, "loss": 0.0002, "step": 9500 }, { "epoch": 4.322565969062785, "grad_norm": 0.002971606315893479, "learning_rate": 4.467260389454864e-07, "loss": 0.0, "step": 9501 }, { "epoch": 4.3230209281164695, "grad_norm": 0.12189084857917876, "learning_rate": 4.4613568551970687e-07, "loss": 0.0012, "step": 9502 }, { "epoch": 4.323475887170154, "grad_norm": 0.1379315244153815, "learning_rate": 4.455457042151529e-07, "loss": 0.0005, "step": 9503 }, { "epoch": 4.32393084622384, "grad_norm": 0.12266870676968793, "learning_rate": 4.4495609508003747e-07, "loss": 0.0003, "step": 9504 }, { "epoch": 4.324385805277525, "grad_norm": 0.11118893112725846, "learning_rate": 4.443668581625393e-07, "loss": 0.0013, "step": 9505 }, { "epoch": 4.32484076433121, "grad_norm": 0.10323862197120745, "learning_rate": 4.4377799351080776e-07, "loss": 0.001, "step": 9506 }, { "epoch": 4.325295723384896, "grad_norm": 0.2674571804087863, "learning_rate": 4.431895011729637e-07, "loss": 0.0014, "step": 9507 }, { "epoch": 4.3257506824385805, "grad_norm": 0.007617615129750181, "learning_rate": 4.426013811970942e-07, "loss": 0.0001, "step": 9508 }, { "epoch": 4.326205641492265, "grad_norm": 0.01794300954381809, "learning_rate": 4.420136336312597e-07, "loss": 0.0001, "step": 9509 }, { "epoch": 4.326660600545951, "grad_norm": 0.18656816292993106, "learning_rate": 4.414262585234874e-07, "loss": 0.0035, "step": 9510 }, { "epoch": 4.327115559599636, "grad_norm": 0.03864447361427777, "learning_rate": 4.40839255921775e-07, "loss": 0.0002, "step": 9511 }, { "epoch": 4.327570518653321, "grad_norm": 0.09947989222315545, "learning_rate": 4.402526258740886e-07, "loss": 0.0006, "step": 9512 }, { "epoch": 4.328025477707007, "grad_norm": 0.017387686072632164, "learning_rate": 4.396663684283664e-07, "loss": 0.0001, "step": 9513 }, { "epoch": 4.3284804367606915, "grad_norm": 0.041729592064830776, "learning_rate": 4.3908048363251464e-07, "loss": 0.0002, "step": 9514 }, { "epoch": 4.328935395814376, "grad_norm": 0.017757007796318103, "learning_rate": 4.384949715344089e-07, "loss": 0.0001, "step": 9515 }, { "epoch": 4.329390354868062, "grad_norm": 0.034981319824325924, "learning_rate": 4.379098321818948e-07, "loss": 0.0002, "step": 9516 }, { "epoch": 4.329845313921747, "grad_norm": 0.09251019222117542, "learning_rate": 4.373250656227862e-07, "loss": 0.0006, "step": 9517 }, { "epoch": 4.330300272975432, "grad_norm": 0.046653612926894385, "learning_rate": 4.367406719048689e-07, "loss": 0.0005, "step": 9518 }, { "epoch": 4.330755232029118, "grad_norm": 0.08693001754714129, "learning_rate": 4.361566510758963e-07, "loss": 0.0008, "step": 9519 }, { "epoch": 4.3312101910828025, "grad_norm": 0.03530663400392882, "learning_rate": 4.355730031835914e-07, "loss": 0.0002, "step": 9520 }, { "epoch": 4.331665150136487, "grad_norm": 0.011049055356075959, "learning_rate": 4.349897282756488e-07, "loss": 0.0, "step": 9521 }, { "epoch": 4.332120109190173, "grad_norm": 0.12178748468935524, "learning_rate": 4.344068263997303e-07, "loss": 0.0019, "step": 9522 }, { "epoch": 4.332575068243858, "grad_norm": 0.11680242767992842, "learning_rate": 4.338242976034668e-07, "loss": 0.0013, "step": 9523 }, { "epoch": 4.333030027297543, "grad_norm": 0.07923184615410592, "learning_rate": 4.3324214193446233e-07, "loss": 0.0001, "step": 9524 }, { "epoch": 4.333484986351229, "grad_norm": 0.1089654569742116, "learning_rate": 4.326603594402862e-07, "loss": 0.0012, "step": 9525 }, { "epoch": 4.3339399454049135, "grad_norm": 0.05951894803069833, "learning_rate": 4.3207895016847966e-07, "loss": 0.0007, "step": 9526 }, { "epoch": 4.334394904458598, "grad_norm": 0.23333836884089046, "learning_rate": 4.3149791416655206e-07, "loss": 0.0022, "step": 9527 }, { "epoch": 4.334849863512284, "grad_norm": 0.009065358724321152, "learning_rate": 4.309172514819837e-07, "loss": 0.0, "step": 9528 }, { "epoch": 4.335304822565969, "grad_norm": 0.026820224032963576, "learning_rate": 4.303369621622244e-07, "loss": 0.0002, "step": 9529 }, { "epoch": 4.335759781619654, "grad_norm": 0.003668693652079188, "learning_rate": 4.297570462546924e-07, "loss": 0.0, "step": 9530 }, { "epoch": 4.33621474067334, "grad_norm": 0.01269806947788099, "learning_rate": 4.2917750380677583e-07, "loss": 0.0001, "step": 9531 }, { "epoch": 4.336669699727024, "grad_norm": 0.04999190269455574, "learning_rate": 4.285983348658307e-07, "loss": 0.0004, "step": 9532 }, { "epoch": 4.337124658780709, "grad_norm": 0.015292310527195728, "learning_rate": 4.280195394791864e-07, "loss": 0.0001, "step": 9533 }, { "epoch": 4.337579617834395, "grad_norm": 0.052062547937813335, "learning_rate": 4.274411176941373e-07, "loss": 0.0002, "step": 9534 }, { "epoch": 4.33803457688808, "grad_norm": 0.11064949050872513, "learning_rate": 4.2686306955795173e-07, "loss": 0.0009, "step": 9535 }, { "epoch": 4.338489535941765, "grad_norm": 0.19712257675213457, "learning_rate": 4.2628539511786417e-07, "loss": 0.0037, "step": 9536 }, { "epoch": 4.3389444949954505, "grad_norm": 0.48844836229017824, "learning_rate": 4.2570809442107785e-07, "loss": 0.0026, "step": 9537 }, { "epoch": 4.339399454049135, "grad_norm": 0.034426168785234586, "learning_rate": 4.2513116751477013e-07, "loss": 0.0002, "step": 9538 }, { "epoch": 4.339854413102821, "grad_norm": 0.11093734819230727, "learning_rate": 4.245546144460838e-07, "loss": 0.001, "step": 9539 }, { "epoch": 4.340309372156506, "grad_norm": 0.011211162420060433, "learning_rate": 4.2397843526213124e-07, "loss": 0.0001, "step": 9540 }, { "epoch": 4.340764331210191, "grad_norm": 0.036901415893791166, "learning_rate": 4.2340263000999526e-07, "loss": 0.0004, "step": 9541 }, { "epoch": 4.341219290263877, "grad_norm": 0.14939522086022908, "learning_rate": 4.228271987367283e-07, "loss": 0.0018, "step": 9542 }, { "epoch": 4.3416742493175615, "grad_norm": 0.010760449335293095, "learning_rate": 4.222521414893538e-07, "loss": 0.0001, "step": 9543 }, { "epoch": 4.342129208371246, "grad_norm": 0.1323723659924108, "learning_rate": 4.216774583148608e-07, "loss": 0.0027, "step": 9544 }, { "epoch": 4.342584167424932, "grad_norm": 0.056197498836121025, "learning_rate": 4.2110314926021024e-07, "loss": 0.0004, "step": 9545 }, { "epoch": 4.343039126478617, "grad_norm": 0.016802289941257622, "learning_rate": 4.205292143723322e-07, "loss": 0.0001, "step": 9546 }, { "epoch": 4.343494085532302, "grad_norm": 0.013156974205455137, "learning_rate": 4.199556536981264e-07, "loss": 0.0001, "step": 9547 }, { "epoch": 4.343949044585988, "grad_norm": 0.016014505988141435, "learning_rate": 4.1938246728445986e-07, "loss": 0.0002, "step": 9548 }, { "epoch": 4.3444040036396725, "grad_norm": 0.031143187808687866, "learning_rate": 4.1880965517817396e-07, "loss": 0.0003, "step": 9549 }, { "epoch": 4.344858962693357, "grad_norm": 0.04607120566065836, "learning_rate": 4.18237217426074e-07, "loss": 0.0003, "step": 9550 }, { "epoch": 4.345313921747043, "grad_norm": 0.1270663262246374, "learning_rate": 4.176651540749371e-07, "loss": 0.0018, "step": 9551 }, { "epoch": 4.345768880800728, "grad_norm": 0.08882594961851505, "learning_rate": 4.1709346517151084e-07, "loss": 0.0007, "step": 9552 }, { "epoch": 4.346223839854413, "grad_norm": 0.037274770161642, "learning_rate": 4.165221507625106e-07, "loss": 0.0004, "step": 9553 }, { "epoch": 4.346678798908099, "grad_norm": 0.1558727170497576, "learning_rate": 4.1595121089462123e-07, "loss": 0.0033, "step": 9554 }, { "epoch": 4.3471337579617835, "grad_norm": 0.1721808385426201, "learning_rate": 4.1538064561449653e-07, "loss": 0.0008, "step": 9555 }, { "epoch": 4.347588717015468, "grad_norm": 0.12713874106425976, "learning_rate": 4.148104549687626e-07, "loss": 0.0006, "step": 9556 }, { "epoch": 4.348043676069154, "grad_norm": 0.10572219599084051, "learning_rate": 4.1424063900401046e-07, "loss": 0.0016, "step": 9557 }, { "epoch": 4.348498635122839, "grad_norm": 0.23264285433577755, "learning_rate": 4.1367119776680566e-07, "loss": 0.0037, "step": 9558 }, { "epoch": 4.348953594176524, "grad_norm": 0.10998039199353982, "learning_rate": 4.131021313036787e-07, "loss": 0.001, "step": 9559 }, { "epoch": 4.34940855323021, "grad_norm": 0.046975576352913544, "learning_rate": 4.1253343966113133e-07, "loss": 0.0002, "step": 9560 }, { "epoch": 4.3498635122838945, "grad_norm": 0.10266679880834646, "learning_rate": 4.119651228856331e-07, "loss": 0.0004, "step": 9561 }, { "epoch": 4.350318471337579, "grad_norm": 0.026858915138591524, "learning_rate": 4.113971810236261e-07, "loss": 0.0003, "step": 9562 }, { "epoch": 4.350773430391265, "grad_norm": 0.010133092996935038, "learning_rate": 4.1082961412152065e-07, "loss": 0.0001, "step": 9563 }, { "epoch": 4.35122838944495, "grad_norm": 0.04774426939198638, "learning_rate": 4.10262422225694e-07, "loss": 0.0004, "step": 9564 }, { "epoch": 4.351683348498635, "grad_norm": 0.06513242498920833, "learning_rate": 4.0969560538249574e-07, "loss": 0.0006, "step": 9565 }, { "epoch": 4.352138307552321, "grad_norm": 0.17211349660931294, "learning_rate": 4.0912916363824165e-07, "loss": 0.0018, "step": 9566 }, { "epoch": 4.352593266606005, "grad_norm": 0.08361707745428784, "learning_rate": 4.0856309703922124e-07, "loss": 0.001, "step": 9567 }, { "epoch": 4.35304822565969, "grad_norm": 0.08447822346193756, "learning_rate": 4.0799740563168934e-07, "loss": 0.0003, "step": 9568 }, { "epoch": 4.353503184713376, "grad_norm": 0.03343690160485407, "learning_rate": 4.074320894618716e-07, "loss": 0.0002, "step": 9569 }, { "epoch": 4.353958143767061, "grad_norm": 0.020594602393960933, "learning_rate": 4.068671485759651e-07, "loss": 0.0001, "step": 9570 }, { "epoch": 4.354413102820746, "grad_norm": 0.3461345788314648, "learning_rate": 4.0630258302013115e-07, "loss": 0.0043, "step": 9571 }, { "epoch": 4.3548680618744315, "grad_norm": 0.03684470442623735, "learning_rate": 4.057383928405062e-07, "loss": 0.0003, "step": 9572 }, { "epoch": 4.355323020928116, "grad_norm": 0.056045711349917225, "learning_rate": 4.0517457808319225e-07, "loss": 0.0003, "step": 9573 }, { "epoch": 4.355777979981801, "grad_norm": 0.01219136046219526, "learning_rate": 4.0461113879426197e-07, "loss": 0.0001, "step": 9574 }, { "epoch": 4.356232939035487, "grad_norm": 0.046258575449575595, "learning_rate": 4.0404807501975617e-07, "loss": 0.0004, "step": 9575 }, { "epoch": 4.356687898089172, "grad_norm": 0.03301448378595439, "learning_rate": 4.0348538680568595e-07, "loss": 0.0003, "step": 9576 }, { "epoch": 4.357142857142857, "grad_norm": 0.03625878472038226, "learning_rate": 4.0292307419803333e-07, "loss": 0.0001, "step": 9577 }, { "epoch": 4.3575978161965425, "grad_norm": 0.12908302766485893, "learning_rate": 4.0236113724274716e-07, "loss": 0.0014, "step": 9578 }, { "epoch": 4.358052775250227, "grad_norm": 0.0314803481001272, "learning_rate": 4.017995759857457e-07, "loss": 0.0002, "step": 9579 }, { "epoch": 4.358507734303912, "grad_norm": 0.17385132170622852, "learning_rate": 4.012383904729167e-07, "loss": 0.0005, "step": 9580 }, { "epoch": 4.358962693357598, "grad_norm": 0.17489838147799963, "learning_rate": 4.0067758075012006e-07, "loss": 0.0015, "step": 9581 }, { "epoch": 4.359417652411283, "grad_norm": 0.04735051412179548, "learning_rate": 4.001171468631809e-07, "loss": 0.0002, "step": 9582 }, { "epoch": 4.359872611464968, "grad_norm": 0.011593250507184535, "learning_rate": 3.995570888578942e-07, "loss": 0.0001, "step": 9583 }, { "epoch": 4.3603275705186535, "grad_norm": 0.10384953810570867, "learning_rate": 3.9899740678002843e-07, "loss": 0.0014, "step": 9584 }, { "epoch": 4.360782529572338, "grad_norm": 0.012969577997066021, "learning_rate": 3.984381006753152e-07, "loss": 0.0001, "step": 9585 }, { "epoch": 4.361237488626024, "grad_norm": 0.10846099358239839, "learning_rate": 3.978791705894608e-07, "loss": 0.0006, "step": 9586 }, { "epoch": 4.361692447679709, "grad_norm": 0.2728546015870505, "learning_rate": 3.9732061656813816e-07, "loss": 0.0013, "step": 9587 }, { "epoch": 4.362147406733394, "grad_norm": 0.04057684681920179, "learning_rate": 3.9676243865698847e-07, "loss": 0.0002, "step": 9588 }, { "epoch": 4.36260236578708, "grad_norm": 0.053313456372892414, "learning_rate": 3.962046369016248e-07, "loss": 0.0003, "step": 9589 }, { "epoch": 4.3630573248407645, "grad_norm": 0.08024098869555892, "learning_rate": 3.956472113476256e-07, "loss": 0.0007, "step": 9590 }, { "epoch": 4.363512283894449, "grad_norm": 0.005280989948720423, "learning_rate": 3.9509016204054506e-07, "loss": 0.0, "step": 9591 }, { "epoch": 4.363967242948135, "grad_norm": 0.08780626355445023, "learning_rate": 3.945334890259012e-07, "loss": 0.0006, "step": 9592 }, { "epoch": 4.36442220200182, "grad_norm": 0.0980754369380235, "learning_rate": 3.93977192349182e-07, "loss": 0.0007, "step": 9593 }, { "epoch": 4.364877161055505, "grad_norm": 0.024347136947112283, "learning_rate": 3.9342127205584615e-07, "loss": 0.0002, "step": 9594 }, { "epoch": 4.365332120109191, "grad_norm": 0.17287816110303303, "learning_rate": 3.928657281913201e-07, "loss": 0.0014, "step": 9595 }, { "epoch": 4.3657870791628755, "grad_norm": 0.09864768338881905, "learning_rate": 3.9231056080100196e-07, "loss": 0.0006, "step": 9596 }, { "epoch": 4.36624203821656, "grad_norm": 0.02510646786256023, "learning_rate": 3.91755769930256e-07, "loss": 0.0002, "step": 9597 }, { "epoch": 4.366696997270246, "grad_norm": 0.019304989411127894, "learning_rate": 3.912013556244182e-07, "loss": 0.0001, "step": 9598 }, { "epoch": 4.367151956323931, "grad_norm": 0.017127464710915995, "learning_rate": 3.9064731792879283e-07, "loss": 0.0001, "step": 9599 }, { "epoch": 4.367606915377616, "grad_norm": 0.08523559660315795, "learning_rate": 3.9009365688865207e-07, "loss": 0.0005, "step": 9600 }, { "epoch": 4.368061874431302, "grad_norm": 0.019164847643307118, "learning_rate": 3.8954037254924026e-07, "loss": 0.0001, "step": 9601 }, { "epoch": 4.368516833484986, "grad_norm": 0.11275523122428278, "learning_rate": 3.88987464955769e-07, "loss": 0.0016, "step": 9602 }, { "epoch": 4.368971792538671, "grad_norm": 0.039825553630519164, "learning_rate": 3.8843493415341826e-07, "loss": 0.0001, "step": 9603 }, { "epoch": 4.369426751592357, "grad_norm": 0.08636715304684091, "learning_rate": 3.878827801873386e-07, "loss": 0.0005, "step": 9604 }, { "epoch": 4.369881710646042, "grad_norm": 0.009525632671508556, "learning_rate": 3.8733100310265e-07, "loss": 0.0001, "step": 9605 }, { "epoch": 4.370336669699727, "grad_norm": 0.05250451534109771, "learning_rate": 3.8677960294444207e-07, "loss": 0.0002, "step": 9606 }, { "epoch": 4.3707916287534125, "grad_norm": 0.04944772206670779, "learning_rate": 3.8622857975777195e-07, "loss": 0.0004, "step": 9607 }, { "epoch": 4.371246587807097, "grad_norm": 0.00660304792501937, "learning_rate": 3.856779335876665e-07, "loss": 0.0, "step": 9608 }, { "epoch": 4.371701546860782, "grad_norm": 0.17920775271069825, "learning_rate": 3.8512766447912133e-07, "loss": 0.0009, "step": 9609 }, { "epoch": 4.372156505914468, "grad_norm": 0.12288588892681677, "learning_rate": 3.8457777247710384e-07, "loss": 0.0008, "step": 9610 }, { "epoch": 4.372611464968153, "grad_norm": 0.011675052209256074, "learning_rate": 3.8402825762654636e-07, "loss": 0.0001, "step": 9611 }, { "epoch": 4.373066424021838, "grad_norm": 0.05153275009335694, "learning_rate": 3.834791199723559e-07, "loss": 0.0004, "step": 9612 }, { "epoch": 4.3735213830755235, "grad_norm": 0.026925588330581448, "learning_rate": 3.8293035955940304e-07, "loss": 0.0002, "step": 9613 }, { "epoch": 4.373976342129208, "grad_norm": 0.31727413466743354, "learning_rate": 3.8238197643252984e-07, "loss": 0.0009, "step": 9614 }, { "epoch": 4.374431301182893, "grad_norm": 0.028318927128566894, "learning_rate": 3.818339706365498e-07, "loss": 0.0001, "step": 9615 }, { "epoch": 4.374886260236579, "grad_norm": 0.48466827051088585, "learning_rate": 3.812863422162422e-07, "loss": 0.004, "step": 9616 }, { "epoch": 4.375341219290264, "grad_norm": 0.09230738948824826, "learning_rate": 3.807390912163561e-07, "loss": 0.0018, "step": 9617 }, { "epoch": 4.375796178343949, "grad_norm": 0.008232451756627379, "learning_rate": 3.8019221768161087e-07, "loss": 0.0001, "step": 9618 }, { "epoch": 4.3762511373976345, "grad_norm": 0.046883467641014004, "learning_rate": 3.7964572165669456e-07, "loss": 0.0004, "step": 9619 }, { "epoch": 4.376706096451319, "grad_norm": 0.08010664195395718, "learning_rate": 3.790996031862654e-07, "loss": 0.0008, "step": 9620 }, { "epoch": 4.377161055505004, "grad_norm": 0.0029841600905793422, "learning_rate": 3.785538623149493e-07, "loss": 0.0, "step": 9621 }, { "epoch": 4.37761601455869, "grad_norm": 0.14784348278396045, "learning_rate": 3.7800849908734063e-07, "loss": 0.0024, "step": 9622 }, { "epoch": 4.378070973612375, "grad_norm": 0.19231677265268154, "learning_rate": 3.7746351354800425e-07, "loss": 0.0012, "step": 9623 }, { "epoch": 4.37852593266606, "grad_norm": 0.11553539412378755, "learning_rate": 3.769189057414752e-07, "loss": 0.0012, "step": 9624 }, { "epoch": 4.3789808917197455, "grad_norm": 0.10429395319435979, "learning_rate": 3.763746757122544e-07, "loss": 0.0003, "step": 9625 }, { "epoch": 4.37943585077343, "grad_norm": 0.10200522298621524, "learning_rate": 3.758308235048158e-07, "loss": 0.0014, "step": 9626 }, { "epoch": 4.379890809827115, "grad_norm": 0.18028201088531204, "learning_rate": 3.752873491636e-07, "loss": 0.0019, "step": 9627 }, { "epoch": 4.380345768880801, "grad_norm": 0.5178074103618024, "learning_rate": 3.7474425273301696e-07, "loss": 0.0028, "step": 9628 }, { "epoch": 4.380800727934486, "grad_norm": 0.2714839734938491, "learning_rate": 3.742015342574451e-07, "loss": 0.0019, "step": 9629 }, { "epoch": 4.381255686988171, "grad_norm": 0.04783280340686421, "learning_rate": 3.7365919378123507e-07, "loss": 0.0004, "step": 9630 }, { "epoch": 4.3817106460418564, "grad_norm": 0.010833970302822507, "learning_rate": 3.7311723134870304e-07, "loss": 0.0001, "step": 9631 }, { "epoch": 4.382165605095541, "grad_norm": 0.012692716406864656, "learning_rate": 3.7257564700413527e-07, "loss": 0.0001, "step": 9632 }, { "epoch": 4.382620564149226, "grad_norm": 0.15486804200847887, "learning_rate": 3.7203444079178977e-07, "loss": 0.0008, "step": 9633 }, { "epoch": 4.383075523202912, "grad_norm": 0.017057050576178897, "learning_rate": 3.7149361275588826e-07, "loss": 0.0001, "step": 9634 }, { "epoch": 4.383530482256597, "grad_norm": 0.05739298411511804, "learning_rate": 3.7095316294062824e-07, "loss": 0.0003, "step": 9635 }, { "epoch": 4.383985441310282, "grad_norm": 0.11215266516873934, "learning_rate": 3.7041309139017046e-07, "loss": 0.0016, "step": 9636 }, { "epoch": 4.384440400363967, "grad_norm": 0.18286538353234372, "learning_rate": 3.698733981486485e-07, "loss": 0.0016, "step": 9637 }, { "epoch": 4.384895359417652, "grad_norm": 0.10212499367328176, "learning_rate": 3.6933408326016164e-07, "loss": 0.0012, "step": 9638 }, { "epoch": 4.385350318471337, "grad_norm": 0.05009482784609614, "learning_rate": 3.687951467687817e-07, "loss": 0.0004, "step": 9639 }, { "epoch": 4.385805277525023, "grad_norm": 0.15005315396638721, "learning_rate": 3.6825658871854906e-07, "loss": 0.0008, "step": 9640 }, { "epoch": 4.386260236578708, "grad_norm": 0.048455735934680506, "learning_rate": 3.677184091534708e-07, "loss": 0.0004, "step": 9641 }, { "epoch": 4.386715195632393, "grad_norm": 0.017641312748139105, "learning_rate": 3.671806081175255e-07, "loss": 0.0001, "step": 9642 }, { "epoch": 4.387170154686078, "grad_norm": 0.0702345354656347, "learning_rate": 3.666431856546582e-07, "loss": 0.0011, "step": 9643 }, { "epoch": 4.387625113739763, "grad_norm": 0.043257386507523775, "learning_rate": 3.6610614180878636e-07, "loss": 0.0002, "step": 9644 }, { "epoch": 4.388080072793448, "grad_norm": 0.024067102843430388, "learning_rate": 3.6556947662379436e-07, "loss": 0.0001, "step": 9645 }, { "epoch": 4.388535031847134, "grad_norm": 0.013900689815314704, "learning_rate": 3.65033190143535e-07, "loss": 0.0001, "step": 9646 }, { "epoch": 4.388989990900819, "grad_norm": 0.012122857913547336, "learning_rate": 3.6449728241183256e-07, "loss": 0.0001, "step": 9647 }, { "epoch": 4.389444949954504, "grad_norm": 0.19466210882837834, "learning_rate": 3.639617534724782e-07, "loss": 0.0024, "step": 9648 }, { "epoch": 4.389899909008189, "grad_norm": 0.06278658953638261, "learning_rate": 3.634266033692335e-07, "loss": 0.0004, "step": 9649 }, { "epoch": 4.390354868061874, "grad_norm": 0.006738354979878417, "learning_rate": 3.6289183214582854e-07, "loss": 0.0001, "step": 9650 }, { "epoch": 4.39080982711556, "grad_norm": 0.1281438172455948, "learning_rate": 3.623574398459617e-07, "loss": 0.0007, "step": 9651 }, { "epoch": 4.391264786169245, "grad_norm": 0.009725434907386876, "learning_rate": 3.6182342651330083e-07, "loss": 0.0, "step": 9652 }, { "epoch": 4.39171974522293, "grad_norm": 0.10971134435132887, "learning_rate": 3.612897921914837e-07, "loss": 0.0008, "step": 9653 }, { "epoch": 4.3921747042766155, "grad_norm": 0.14548375277724887, "learning_rate": 3.607565369241173e-07, "loss": 0.0013, "step": 9654 }, { "epoch": 4.3926296633303, "grad_norm": 0.05081155951627734, "learning_rate": 3.60223660754776e-07, "loss": 0.0001, "step": 9655 }, { "epoch": 4.393084622383985, "grad_norm": 0.30777792060884823, "learning_rate": 3.596911637270045e-07, "loss": 0.0053, "step": 9656 }, { "epoch": 4.393539581437671, "grad_norm": 0.03905229247285689, "learning_rate": 3.591590458843142e-07, "loss": 0.0002, "step": 9657 }, { "epoch": 4.393994540491356, "grad_norm": 0.23688943423319317, "learning_rate": 3.586273072701901e-07, "loss": 0.0031, "step": 9658 }, { "epoch": 4.394449499545041, "grad_norm": 0.15961072087549197, "learning_rate": 3.580959479280821e-07, "loss": 0.0035, "step": 9659 }, { "epoch": 4.3949044585987265, "grad_norm": 0.3084126059174271, "learning_rate": 3.575649679014098e-07, "loss": 0.0007, "step": 9660 }, { "epoch": 4.395359417652411, "grad_norm": 0.07068959984525587, "learning_rate": 3.570343672335641e-07, "loss": 0.0004, "step": 9661 }, { "epoch": 4.395814376706096, "grad_norm": 0.01760097716547311, "learning_rate": 3.5650414596790137e-07, "loss": 0.0001, "step": 9662 }, { "epoch": 4.396269335759782, "grad_norm": 0.017771311226987968, "learning_rate": 3.559743041477509e-07, "loss": 0.0001, "step": 9663 }, { "epoch": 4.396724294813467, "grad_norm": 0.08812991679779757, "learning_rate": 3.5544484181640804e-07, "loss": 0.0008, "step": 9664 }, { "epoch": 4.397179253867152, "grad_norm": 0.09367364024309256, "learning_rate": 3.5491575901713815e-07, "loss": 0.0008, "step": 9665 }, { "epoch": 4.3976342129208374, "grad_norm": 0.039712037928291505, "learning_rate": 3.5438705579317557e-07, "loss": 0.0003, "step": 9666 }, { "epoch": 4.398089171974522, "grad_norm": 0.1434547622828054, "learning_rate": 3.538587321877224e-07, "loss": 0.0014, "step": 9667 }, { "epoch": 4.398544131028207, "grad_norm": 0.0647965971418509, "learning_rate": 3.5333078824395237e-07, "loss": 0.0007, "step": 9668 }, { "epoch": 4.398999090081893, "grad_norm": 0.12102628074073363, "learning_rate": 3.528032240050061e-07, "loss": 0.0007, "step": 9669 }, { "epoch": 4.399454049135578, "grad_norm": 0.20675964229726468, "learning_rate": 3.522760395139946e-07, "loss": 0.0008, "step": 9670 }, { "epoch": 4.399909008189263, "grad_norm": 0.014355203381870482, "learning_rate": 3.517492348139956e-07, "loss": 0.0001, "step": 9671 }, { "epoch": 4.400363967242948, "grad_norm": 0.18179559570037063, "learning_rate": 3.5122280994805747e-07, "loss": 0.0011, "step": 9672 }, { "epoch": 4.400818926296633, "grad_norm": 0.006988816690587801, "learning_rate": 3.50696764959198e-07, "loss": 0.0, "step": 9673 }, { "epoch": 4.401273885350318, "grad_norm": 0.06264685905739664, "learning_rate": 3.5017109989040234e-07, "loss": 0.0006, "step": 9674 }, { "epoch": 4.401728844404004, "grad_norm": 0.07370856029500973, "learning_rate": 3.4964581478462654e-07, "loss": 0.0005, "step": 9675 }, { "epoch": 4.402183803457689, "grad_norm": 0.15611045884952449, "learning_rate": 3.4912090968479417e-07, "loss": 0.0029, "step": 9676 }, { "epoch": 4.402638762511374, "grad_norm": 0.06902600631497491, "learning_rate": 3.4859638463379695e-07, "loss": 0.0007, "step": 9677 }, { "epoch": 4.403093721565059, "grad_norm": 0.2762981361753823, "learning_rate": 3.480722396744984e-07, "loss": 0.0023, "step": 9678 }, { "epoch": 4.403548680618744, "grad_norm": 0.008827958162164893, "learning_rate": 3.4754847484972877e-07, "loss": 0.0001, "step": 9679 }, { "epoch": 4.404003639672429, "grad_norm": 0.12605560627352566, "learning_rate": 3.47025090202287e-07, "loss": 0.0008, "step": 9680 }, { "epoch": 4.404458598726115, "grad_norm": 0.15338898186435398, "learning_rate": 3.4650208577494185e-07, "loss": 0.0007, "step": 9681 }, { "epoch": 4.4049135577798, "grad_norm": 0.06408781723482458, "learning_rate": 3.4597946161043063e-07, "loss": 0.0007, "step": 9682 }, { "epoch": 4.405368516833485, "grad_norm": 0.03052030364532293, "learning_rate": 3.45457217751462e-07, "loss": 0.0002, "step": 9683 }, { "epoch": 4.40582347588717, "grad_norm": 0.15679009916566664, "learning_rate": 3.4493535424070913e-07, "loss": 0.0011, "step": 9684 }, { "epoch": 4.406278434940855, "grad_norm": 0.031935211938976964, "learning_rate": 3.4441387112081724e-07, "loss": 0.0002, "step": 9685 }, { "epoch": 4.40673339399454, "grad_norm": 0.04184156567991399, "learning_rate": 3.43892768434399e-07, "loss": 0.0004, "step": 9686 }, { "epoch": 4.407188353048226, "grad_norm": 0.005955905539000646, "learning_rate": 3.433720462240375e-07, "loss": 0.0, "step": 9687 }, { "epoch": 4.407643312101911, "grad_norm": 0.051772575472979976, "learning_rate": 3.4285170453228214e-07, "loss": 0.0004, "step": 9688 }, { "epoch": 4.408098271155596, "grad_norm": 0.05526077577904048, "learning_rate": 3.4233174340165486e-07, "loss": 0.0003, "step": 9689 }, { "epoch": 4.408553230209281, "grad_norm": 0.12090852165042458, "learning_rate": 3.41812162874644e-07, "loss": 0.0013, "step": 9690 }, { "epoch": 4.409008189262966, "grad_norm": 0.057057035027325675, "learning_rate": 3.412929629937062e-07, "loss": 0.0002, "step": 9691 }, { "epoch": 4.409463148316651, "grad_norm": 0.15785618181098338, "learning_rate": 3.407741438012691e-07, "loss": 0.0031, "step": 9692 }, { "epoch": 4.409918107370337, "grad_norm": 0.06391119199517106, "learning_rate": 3.4025570533972876e-07, "loss": 0.0005, "step": 9693 }, { "epoch": 4.410373066424022, "grad_norm": 0.03104074143741479, "learning_rate": 3.397376476514486e-07, "loss": 0.0002, "step": 9694 }, { "epoch": 4.4108280254777075, "grad_norm": 0.038138633398526085, "learning_rate": 3.392199707787619e-07, "loss": 0.0002, "step": 9695 }, { "epoch": 4.411282984531392, "grad_norm": 0.01833962905184601, "learning_rate": 3.38702674763971e-07, "loss": 0.0002, "step": 9696 }, { "epoch": 4.411737943585077, "grad_norm": 0.019255456029010715, "learning_rate": 3.3818575964934764e-07, "loss": 0.0001, "step": 9697 }, { "epoch": 4.412192902638763, "grad_norm": 0.05634132477630591, "learning_rate": 3.376692254771324e-07, "loss": 0.0011, "step": 9698 }, { "epoch": 4.412647861692448, "grad_norm": 0.17832163922167318, "learning_rate": 3.371530722895322e-07, "loss": 0.0025, "step": 9699 }, { "epoch": 4.413102820746133, "grad_norm": 0.19601540817641203, "learning_rate": 3.3663730012872654e-07, "loss": 0.0028, "step": 9700 }, { "epoch": 4.4135577797998184, "grad_norm": 0.03220956884829947, "learning_rate": 3.3612190903686e-07, "loss": 0.0002, "step": 9701 }, { "epoch": 4.414012738853503, "grad_norm": 0.18995225346002148, "learning_rate": 3.35606899056049e-07, "loss": 0.0012, "step": 9702 }, { "epoch": 4.414467697907188, "grad_norm": 0.12808194422439712, "learning_rate": 3.3509227022837876e-07, "loss": 0.0016, "step": 9703 }, { "epoch": 4.414922656960874, "grad_norm": 0.0900081386892947, "learning_rate": 3.3457802259590165e-07, "loss": 0.0008, "step": 9704 }, { "epoch": 4.415377616014559, "grad_norm": 7.8541099720937915, "learning_rate": 3.3406415620064024e-07, "loss": 0.0555, "step": 9705 }, { "epoch": 4.415832575068244, "grad_norm": 0.009981937382365271, "learning_rate": 3.335506710845837e-07, "loss": 0.0001, "step": 9706 }, { "epoch": 4.416287534121929, "grad_norm": 0.11658179895853803, "learning_rate": 3.330375672896935e-07, "loss": 0.0005, "step": 9707 }, { "epoch": 4.416742493175614, "grad_norm": 0.21280310225032892, "learning_rate": 3.3252484485789716e-07, "loss": 0.0014, "step": 9708 }, { "epoch": 4.417197452229299, "grad_norm": 0.02765464238733074, "learning_rate": 3.320125038310923e-07, "loss": 0.0002, "step": 9709 }, { "epoch": 4.417652411282985, "grad_norm": 0.00996669636979424, "learning_rate": 3.315005442511454e-07, "loss": 0.0001, "step": 9710 }, { "epoch": 4.41810737033667, "grad_norm": 0.05063264672279933, "learning_rate": 3.3098896615989085e-07, "loss": 0.0004, "step": 9711 }, { "epoch": 4.418562329390355, "grad_norm": 0.07702474173078007, "learning_rate": 3.304777695991335e-07, "loss": 0.0002, "step": 9712 }, { "epoch": 4.41901728844404, "grad_norm": 0.026721926559433458, "learning_rate": 3.299669546106454e-07, "loss": 0.0002, "step": 9713 }, { "epoch": 4.419472247497725, "grad_norm": 0.010899089291708826, "learning_rate": 3.294565212361683e-07, "loss": 0.0001, "step": 9714 }, { "epoch": 4.41992720655141, "grad_norm": 0.03366531450123238, "learning_rate": 3.289464695174111e-07, "loss": 0.0002, "step": 9715 }, { "epoch": 4.420382165605096, "grad_norm": 0.009923945204701358, "learning_rate": 3.2843679949605466e-07, "loss": 0.0, "step": 9716 }, { "epoch": 4.420837124658781, "grad_norm": 0.06063481948797327, "learning_rate": 3.27927511213747e-07, "loss": 0.0002, "step": 9717 }, { "epoch": 4.421292083712466, "grad_norm": 0.024555764257756893, "learning_rate": 3.2741860471210364e-07, "loss": 0.0002, "step": 9718 }, { "epoch": 4.421747042766151, "grad_norm": 0.4021708913069274, "learning_rate": 3.269100800327113e-07, "loss": 0.0048, "step": 9719 }, { "epoch": 4.422202001819836, "grad_norm": 0.28517867811205955, "learning_rate": 3.2640193721712286e-07, "loss": 0.0027, "step": 9720 }, { "epoch": 4.422656960873521, "grad_norm": 0.3135499781605325, "learning_rate": 3.258941763068635e-07, "loss": 0.0041, "step": 9721 }, { "epoch": 4.423111919927207, "grad_norm": 0.07550189281238037, "learning_rate": 3.2538679734342327e-07, "loss": 0.0004, "step": 9722 }, { "epoch": 4.423566878980892, "grad_norm": 0.18851424145062828, "learning_rate": 3.248798003682629e-07, "loss": 0.0029, "step": 9723 }, { "epoch": 4.424021838034577, "grad_norm": 0.07922482042576807, "learning_rate": 3.243731854228138e-07, "loss": 0.0008, "step": 9724 }, { "epoch": 4.424476797088262, "grad_norm": 0.020516938144610152, "learning_rate": 3.238669525484722e-07, "loss": 0.0001, "step": 9725 }, { "epoch": 4.424931756141947, "grad_norm": 0.2950157139685972, "learning_rate": 3.2336110178660676e-07, "loss": 0.0038, "step": 9726 }, { "epoch": 4.425386715195632, "grad_norm": 0.21229129188726475, "learning_rate": 3.2285563317855207e-07, "loss": 0.0015, "step": 9727 }, { "epoch": 4.425841674249318, "grad_norm": 0.0949715281069857, "learning_rate": 3.223505467656135e-07, "loss": 0.0013, "step": 9728 }, { "epoch": 4.426296633303003, "grad_norm": 0.009500179798080936, "learning_rate": 3.218458425890636e-07, "loss": 0.0, "step": 9729 }, { "epoch": 4.426751592356688, "grad_norm": 0.20256127931545861, "learning_rate": 3.213415206901449e-07, "loss": 0.0023, "step": 9730 }, { "epoch": 4.427206551410373, "grad_norm": 0.09059227155147881, "learning_rate": 3.2083758111006946e-07, "loss": 0.0003, "step": 9731 }, { "epoch": 4.427661510464058, "grad_norm": 0.043626402178712116, "learning_rate": 3.20334023890016e-07, "loss": 0.0004, "step": 9732 }, { "epoch": 4.428116469517743, "grad_norm": 0.06265446526543357, "learning_rate": 3.198308490711327e-07, "loss": 0.0005, "step": 9733 }, { "epoch": 4.428571428571429, "grad_norm": 0.40477377674568726, "learning_rate": 3.1932805669453724e-07, "loss": 0.0027, "step": 9734 }, { "epoch": 4.429026387625114, "grad_norm": 0.14063853884218894, "learning_rate": 3.18825646801314e-07, "loss": 0.0005, "step": 9735 }, { "epoch": 4.429481346678799, "grad_norm": 0.07899892714086733, "learning_rate": 3.183236194325201e-07, "loss": 0.0005, "step": 9736 }, { "epoch": 4.429936305732484, "grad_norm": 0.006900169526151869, "learning_rate": 3.1782197462917664e-07, "loss": 0.0, "step": 9737 }, { "epoch": 4.430391264786169, "grad_norm": 0.017784397873982963, "learning_rate": 3.17320712432278e-07, "loss": 0.0002, "step": 9738 }, { "epoch": 4.430846223839854, "grad_norm": 0.006386438345564333, "learning_rate": 3.1681983288278375e-07, "loss": 0.0, "step": 9739 }, { "epoch": 4.43130118289354, "grad_norm": 0.06859236760882528, "learning_rate": 3.1631933602162326e-07, "loss": 0.0004, "step": 9740 }, { "epoch": 4.431756141947225, "grad_norm": 0.05124514038916852, "learning_rate": 3.1581922188969605e-07, "loss": 0.0003, "step": 9741 }, { "epoch": 4.4322111010009095, "grad_norm": 0.17893493472605435, "learning_rate": 3.153194905278678e-07, "loss": 0.0015, "step": 9742 }, { "epoch": 4.432666060054595, "grad_norm": 0.012273655532279657, "learning_rate": 3.1482014197697584e-07, "loss": 0.0001, "step": 9743 }, { "epoch": 4.43312101910828, "grad_norm": 0.008279159542419746, "learning_rate": 3.143211762778226e-07, "loss": 0.0, "step": 9744 }, { "epoch": 4.433575978161965, "grad_norm": 0.17784125720879143, "learning_rate": 3.1382259347118195e-07, "loss": 0.0037, "step": 9745 }, { "epoch": 4.434030937215651, "grad_norm": 0.027285914082301757, "learning_rate": 3.133243935977981e-07, "loss": 0.0001, "step": 9746 }, { "epoch": 4.434485896269336, "grad_norm": 0.3335437766838001, "learning_rate": 3.1282657669837956e-07, "loss": 0.0036, "step": 9747 }, { "epoch": 4.4349408553230205, "grad_norm": 0.17411214302792816, "learning_rate": 3.1232914281360607e-07, "loss": 0.0028, "step": 9748 }, { "epoch": 4.435395814376706, "grad_norm": 0.01583613165235244, "learning_rate": 3.1183209198412446e-07, "loss": 0.0001, "step": 9749 }, { "epoch": 4.435850773430391, "grad_norm": 0.009105500980011135, "learning_rate": 3.1133542425055394e-07, "loss": 0.0001, "step": 9750 }, { "epoch": 4.436305732484076, "grad_norm": 0.353614607238613, "learning_rate": 3.1083913965347824e-07, "loss": 0.0061, "step": 9751 }, { "epoch": 4.436760691537762, "grad_norm": 0.008776523309563642, "learning_rate": 3.1034323823345256e-07, "loss": 0.0001, "step": 9752 }, { "epoch": 4.437215650591447, "grad_norm": 0.016079597583139957, "learning_rate": 3.0984772003099905e-07, "loss": 0.0001, "step": 9753 }, { "epoch": 4.4376706096451315, "grad_norm": 0.1863709299910358, "learning_rate": 3.09352585086608e-07, "loss": 0.0015, "step": 9754 }, { "epoch": 4.438125568698817, "grad_norm": 0.037387267554902175, "learning_rate": 3.088578334407427e-07, "loss": 0.0003, "step": 9755 }, { "epoch": 4.438580527752502, "grad_norm": 0.4293941149466844, "learning_rate": 3.0836346513382963e-07, "loss": 0.002, "step": 9756 }, { "epoch": 4.439035486806187, "grad_norm": 0.016602409146596518, "learning_rate": 3.0786948020626706e-07, "loss": 0.0001, "step": 9757 }, { "epoch": 4.439490445859873, "grad_norm": 0.2978140116003094, "learning_rate": 3.0737587869842045e-07, "loss": 0.003, "step": 9758 }, { "epoch": 4.439945404913558, "grad_norm": 0.04684090690398713, "learning_rate": 3.0688266065062535e-07, "loss": 0.0002, "step": 9759 }, { "epoch": 4.440400363967243, "grad_norm": 0.09876346914508588, "learning_rate": 3.063898261031856e-07, "loss": 0.001, "step": 9760 }, { "epoch": 4.440855323020928, "grad_norm": 0.009139626111730102, "learning_rate": 3.058973750963734e-07, "loss": 0.0001, "step": 9761 }, { "epoch": 4.441310282074613, "grad_norm": 0.16439642644320687, "learning_rate": 3.0540530767042944e-07, "loss": 0.0013, "step": 9762 }, { "epoch": 4.441765241128299, "grad_norm": 0.022623478061656294, "learning_rate": 3.0491362386556254e-07, "loss": 0.0002, "step": 9763 }, { "epoch": 4.442220200181984, "grad_norm": 0.09362669926828819, "learning_rate": 3.044223237219518e-07, "loss": 0.0005, "step": 9764 }, { "epoch": 4.442675159235669, "grad_norm": 0.0885582827009058, "learning_rate": 3.0393140727974334e-07, "loss": 0.0004, "step": 9765 }, { "epoch": 4.443130118289354, "grad_norm": 0.22615170263271775, "learning_rate": 3.0344087457905344e-07, "loss": 0.0013, "step": 9766 }, { "epoch": 4.443585077343039, "grad_norm": 0.017767123213202212, "learning_rate": 3.029507256599662e-07, "loss": 0.0001, "step": 9767 }, { "epoch": 4.444040036396724, "grad_norm": 0.01997416799800844, "learning_rate": 3.024609605625328e-07, "loss": 0.0001, "step": 9768 }, { "epoch": 4.44449499545041, "grad_norm": 0.11262580844406248, "learning_rate": 3.019715793267769e-07, "loss": 0.0013, "step": 9769 }, { "epoch": 4.444949954504095, "grad_norm": 0.006793420981161215, "learning_rate": 3.01482581992687e-07, "loss": 0.0, "step": 9770 }, { "epoch": 4.44540491355778, "grad_norm": 0.07252150471494055, "learning_rate": 3.009939686002228e-07, "loss": 0.0007, "step": 9771 }, { "epoch": 4.445859872611465, "grad_norm": 0.0687106592733224, "learning_rate": 3.0050573918930957e-07, "loss": 0.0006, "step": 9772 }, { "epoch": 4.44631483166515, "grad_norm": 0.015071387322499517, "learning_rate": 3.0001789379984547e-07, "loss": 0.0001, "step": 9773 }, { "epoch": 4.446769790718835, "grad_norm": 0.08161484190804871, "learning_rate": 2.9953043247169355e-07, "loss": 0.0006, "step": 9774 }, { "epoch": 4.447224749772521, "grad_norm": 0.2128520990957493, "learning_rate": 2.9904335524468866e-07, "loss": 0.0039, "step": 9775 }, { "epoch": 4.447679708826206, "grad_norm": 0.08737768827246814, "learning_rate": 2.985566621586311e-07, "loss": 0.0011, "step": 9776 }, { "epoch": 4.4481346678798905, "grad_norm": 0.39062062527920377, "learning_rate": 2.9807035325329136e-07, "loss": 0.0004, "step": 9777 }, { "epoch": 4.448589626933576, "grad_norm": 0.0054942624799455, "learning_rate": 2.975844285684082e-07, "loss": 0.0, "step": 9778 }, { "epoch": 4.449044585987261, "grad_norm": 0.1983831108843659, "learning_rate": 2.9709888814368926e-07, "loss": 0.0044, "step": 9779 }, { "epoch": 4.449499545040946, "grad_norm": 0.012994031604128303, "learning_rate": 2.966137320188123e-07, "loss": 0.0001, "step": 9780 }, { "epoch": 4.449954504094632, "grad_norm": 0.036988204299509365, "learning_rate": 2.961289602334205e-07, "loss": 0.0002, "step": 9781 }, { "epoch": 4.450409463148317, "grad_norm": 0.13197265729278673, "learning_rate": 2.9564457282712787e-07, "loss": 0.0019, "step": 9782 }, { "epoch": 4.4508644222020015, "grad_norm": 0.039774391063334864, "learning_rate": 2.951605698395149e-07, "loss": 0.0002, "step": 9783 }, { "epoch": 4.451319381255687, "grad_norm": 0.006074030432539295, "learning_rate": 2.946769513101344e-07, "loss": 0.0, "step": 9784 }, { "epoch": 4.451774340309372, "grad_norm": 0.3046659119210393, "learning_rate": 2.941937172785042e-07, "loss": 0.0047, "step": 9785 }, { "epoch": 4.452229299363057, "grad_norm": 0.04319090795977366, "learning_rate": 2.937108677841116e-07, "loss": 0.0003, "step": 9786 }, { "epoch": 4.452684258416743, "grad_norm": 0.10283298670020563, "learning_rate": 2.932284028664145e-07, "loss": 0.0019, "step": 9787 }, { "epoch": 4.453139217470428, "grad_norm": 0.00943886261722189, "learning_rate": 2.9274632256483526e-07, "loss": 0.0001, "step": 9788 }, { "epoch": 4.4535941765241125, "grad_norm": 0.08254570558061936, "learning_rate": 2.922646269187701e-07, "loss": 0.0005, "step": 9789 }, { "epoch": 4.454049135577798, "grad_norm": 0.017799630563506644, "learning_rate": 2.917833159675792e-07, "loss": 0.0001, "step": 9790 }, { "epoch": 4.454504094631483, "grad_norm": 0.06126000346464943, "learning_rate": 2.91302389750594e-07, "loss": 0.0005, "step": 9791 }, { "epoch": 4.454959053685168, "grad_norm": 0.004720162083997436, "learning_rate": 2.908218483071124e-07, "loss": 0.0, "step": 9792 }, { "epoch": 4.455414012738854, "grad_norm": 0.019631397622341947, "learning_rate": 2.903416916764029e-07, "loss": 0.0001, "step": 9793 }, { "epoch": 4.455868971792539, "grad_norm": 0.01802349236863066, "learning_rate": 2.898619198977026e-07, "loss": 0.0002, "step": 9794 }, { "epoch": 4.4563239308462235, "grad_norm": 0.43256760719446097, "learning_rate": 2.893825330102151e-07, "loss": 0.0211, "step": 9795 }, { "epoch": 4.456778889899909, "grad_norm": 0.025298544497720836, "learning_rate": 2.889035310531141e-07, "loss": 0.0002, "step": 9796 }, { "epoch": 4.457233848953594, "grad_norm": 0.06483438654538458, "learning_rate": 2.8842491406554094e-07, "loss": 0.0003, "step": 9797 }, { "epoch": 4.457688808007279, "grad_norm": 0.16805962708227917, "learning_rate": 2.879466820866067e-07, "loss": 0.0013, "step": 9798 }, { "epoch": 4.458143767060965, "grad_norm": 0.02271715470086333, "learning_rate": 2.874688351553906e-07, "loss": 0.0002, "step": 9799 }, { "epoch": 4.45859872611465, "grad_norm": 0.20073293686823662, "learning_rate": 2.869913733109386e-07, "loss": 0.0017, "step": 9800 }, { "epoch": 4.4590536851683344, "grad_norm": 0.14380132232382806, "learning_rate": 2.8651429659226906e-07, "loss": 0.0012, "step": 9801 }, { "epoch": 4.45950864422202, "grad_norm": 0.012534494839139832, "learning_rate": 2.8603760503836454e-07, "loss": 0.0001, "step": 9802 }, { "epoch": 4.459963603275705, "grad_norm": 0.038842044308688276, "learning_rate": 2.8556129868817893e-07, "loss": 0.0001, "step": 9803 }, { "epoch": 4.460418562329391, "grad_norm": 1.0275785904236199, "learning_rate": 2.850853775806345e-07, "loss": 0.003, "step": 9804 }, { "epoch": 4.460873521383076, "grad_norm": 0.30648582761010784, "learning_rate": 2.846098417546206e-07, "loss": 0.0019, "step": 9805 }, { "epoch": 4.461328480436761, "grad_norm": 0.022603007775520007, "learning_rate": 2.841346912489956e-07, "loss": 0.0002, "step": 9806 }, { "epoch": 4.461783439490446, "grad_norm": 0.05186014983687546, "learning_rate": 2.836599261025852e-07, "loss": 0.0002, "step": 9807 }, { "epoch": 4.462238398544131, "grad_norm": 0.03837095601837969, "learning_rate": 2.831855463541888e-07, "loss": 0.0001, "step": 9808 }, { "epoch": 4.462693357597816, "grad_norm": 0.014959444476450033, "learning_rate": 2.8271155204256826e-07, "loss": 0.0001, "step": 9809 }, { "epoch": 4.463148316651502, "grad_norm": 0.0661108077314431, "learning_rate": 2.8223794320645705e-07, "loss": 0.0003, "step": 9810 }, { "epoch": 4.463603275705187, "grad_norm": 0.1025476499277121, "learning_rate": 2.817647198845558e-07, "loss": 0.0009, "step": 9811 }, { "epoch": 4.4640582347588715, "grad_norm": 0.04266926187999061, "learning_rate": 2.812918821155336e-07, "loss": 0.0002, "step": 9812 }, { "epoch": 4.464513193812557, "grad_norm": 0.17872743192093948, "learning_rate": 2.808194299380296e-07, "loss": 0.0006, "step": 9813 }, { "epoch": 4.464968152866242, "grad_norm": 0.14087940767837617, "learning_rate": 2.803473633906495e-07, "loss": 0.0009, "step": 9814 }, { "epoch": 4.465423111919927, "grad_norm": 0.014106178327896889, "learning_rate": 2.7987568251197027e-07, "loss": 0.0001, "step": 9815 }, { "epoch": 4.465878070973613, "grad_norm": 0.026955016264711362, "learning_rate": 2.794043873405339e-07, "loss": 0.0002, "step": 9816 }, { "epoch": 4.466333030027298, "grad_norm": 0.14319096113865132, "learning_rate": 2.789334779148528e-07, "loss": 0.0032, "step": 9817 }, { "epoch": 4.4667879890809825, "grad_norm": 0.01537631016289191, "learning_rate": 2.78462954273408e-07, "loss": 0.0001, "step": 9818 }, { "epoch": 4.467242948134668, "grad_norm": 0.10932080023011205, "learning_rate": 2.77992816454648e-07, "loss": 0.0017, "step": 9819 }, { "epoch": 4.467697907188353, "grad_norm": 0.011293209914633813, "learning_rate": 2.775230644969906e-07, "loss": 0.0001, "step": 9820 }, { "epoch": 4.468152866242038, "grad_norm": 0.16308582746928127, "learning_rate": 2.7705369843882223e-07, "loss": 0.001, "step": 9821 }, { "epoch": 4.468607825295724, "grad_norm": 0.0814893870538949, "learning_rate": 2.7658471831849664e-07, "loss": 0.0006, "step": 9822 }, { "epoch": 4.469062784349409, "grad_norm": 0.13535296181518966, "learning_rate": 2.761161241743371e-07, "loss": 0.0006, "step": 9823 }, { "epoch": 4.4695177434030935, "grad_norm": 0.057057305749909186, "learning_rate": 2.756479160446357e-07, "loss": 0.0008, "step": 9824 }, { "epoch": 4.469972702456779, "grad_norm": 0.013109914042717294, "learning_rate": 2.751800939676513e-07, "loss": 0.0001, "step": 9825 }, { "epoch": 4.470427661510464, "grad_norm": 0.12042985313853603, "learning_rate": 2.747126579816117e-07, "loss": 0.0022, "step": 9826 }, { "epoch": 4.470882620564149, "grad_norm": 0.010020386345474681, "learning_rate": 2.7424560812471466e-07, "loss": 0.0001, "step": 9827 }, { "epoch": 4.471337579617835, "grad_norm": 0.1843457913919323, "learning_rate": 2.737789444351263e-07, "loss": 0.002, "step": 9828 }, { "epoch": 4.47179253867152, "grad_norm": 0.1273467604869257, "learning_rate": 2.733126669509789e-07, "loss": 0.0011, "step": 9829 }, { "epoch": 4.4722474977252045, "grad_norm": 0.054725157669334536, "learning_rate": 2.728467757103748e-07, "loss": 0.0004, "step": 9830 }, { "epoch": 4.47270245677889, "grad_norm": 0.19774346462172113, "learning_rate": 2.7238127075138345e-07, "loss": 0.0043, "step": 9831 }, { "epoch": 4.473157415832575, "grad_norm": 0.012177097946780433, "learning_rate": 2.7191615211204613e-07, "loss": 0.0001, "step": 9832 }, { "epoch": 4.47361237488626, "grad_norm": 0.023122379653789975, "learning_rate": 2.714514198303686e-07, "loss": 0.0001, "step": 9833 }, { "epoch": 4.474067333939946, "grad_norm": 0.009683029066415974, "learning_rate": 2.7098707394432653e-07, "loss": 0.0001, "step": 9834 }, { "epoch": 4.474522292993631, "grad_norm": 0.05642467243687519, "learning_rate": 2.7052311449186577e-07, "loss": 0.0005, "step": 9835 }, { "epoch": 4.4749772520473154, "grad_norm": 0.08345240618601117, "learning_rate": 2.70059541510897e-07, "loss": 0.0005, "step": 9836 }, { "epoch": 4.475432211101001, "grad_norm": 0.0031893396602070702, "learning_rate": 2.6959635503930327e-07, "loss": 0.0, "step": 9837 }, { "epoch": 4.475887170154686, "grad_norm": 0.01898662942622112, "learning_rate": 2.691335551149327e-07, "loss": 0.0002, "step": 9838 }, { "epoch": 4.476342129208371, "grad_norm": 0.006469085962762504, "learning_rate": 2.686711417756038e-07, "loss": 0.0, "step": 9839 }, { "epoch": 4.476797088262057, "grad_norm": 0.07524928980180771, "learning_rate": 2.6820911505910263e-07, "loss": 0.0003, "step": 9840 }, { "epoch": 4.477252047315742, "grad_norm": 0.11950012732445724, "learning_rate": 2.677474750031822e-07, "loss": 0.0018, "step": 9841 }, { "epoch": 4.477707006369426, "grad_norm": 0.00783898214449213, "learning_rate": 2.672862216455696e-07, "loss": 0.0, "step": 9842 }, { "epoch": 4.478161965423112, "grad_norm": 0.022946767666007787, "learning_rate": 2.6682535502395356e-07, "loss": 0.0001, "step": 9843 }, { "epoch": 4.478616924476797, "grad_norm": 0.28792011528851646, "learning_rate": 2.663648751759951e-07, "loss": 0.0043, "step": 9844 }, { "epoch": 4.479071883530482, "grad_norm": 0.09237528427888451, "learning_rate": 2.6590478213932236e-07, "loss": 0.0011, "step": 9845 }, { "epoch": 4.479526842584168, "grad_norm": 0.21573638918480934, "learning_rate": 2.6544507595153036e-07, "loss": 0.0023, "step": 9846 }, { "epoch": 4.4799818016378525, "grad_norm": 0.005402306940496315, "learning_rate": 2.649857566501873e-07, "loss": 0.0001, "step": 9847 }, { "epoch": 4.480436760691537, "grad_norm": 0.039497324320792275, "learning_rate": 2.6452682427282383e-07, "loss": 0.0002, "step": 9848 }, { "epoch": 4.480891719745223, "grad_norm": 0.014236832125789911, "learning_rate": 2.6406827885694377e-07, "loss": 0.0001, "step": 9849 }, { "epoch": 4.481346678798908, "grad_norm": 0.022868857639931274, "learning_rate": 2.6361012044001654e-07, "loss": 0.0001, "step": 9850 }, { "epoch": 4.481801637852593, "grad_norm": 0.011047213994452935, "learning_rate": 2.6315234905948063e-07, "loss": 0.0001, "step": 9851 }, { "epoch": 4.482256596906279, "grad_norm": 0.06548485286490154, "learning_rate": 2.626949647527438e-07, "loss": 0.0006, "step": 9852 }, { "epoch": 4.4827115559599635, "grad_norm": 0.02527050144323225, "learning_rate": 2.622379675571812e-07, "loss": 0.0002, "step": 9853 }, { "epoch": 4.483166515013648, "grad_norm": 0.06102208484773239, "learning_rate": 2.6178135751013576e-07, "loss": 0.0006, "step": 9854 }, { "epoch": 4.483621474067334, "grad_norm": 0.02332799082250286, "learning_rate": 2.613251346489204e-07, "loss": 0.0002, "step": 9855 }, { "epoch": 4.484076433121019, "grad_norm": 0.021532156027894378, "learning_rate": 2.608692990108147e-07, "loss": 0.0001, "step": 9856 }, { "epoch": 4.484531392174704, "grad_norm": 0.00914003022193629, "learning_rate": 2.604138506330689e-07, "loss": 0.0001, "step": 9857 }, { "epoch": 4.48498635122839, "grad_norm": 0.02846928576310341, "learning_rate": 2.5995878955289933e-07, "loss": 0.0002, "step": 9858 }, { "epoch": 4.4854413102820745, "grad_norm": 0.21378335888448957, "learning_rate": 2.5950411580749235e-07, "loss": 0.0009, "step": 9859 }, { "epoch": 4.485896269335759, "grad_norm": 0.18327109367967684, "learning_rate": 2.5904982943399993e-07, "loss": 0.0021, "step": 9860 }, { "epoch": 4.486351228389445, "grad_norm": 0.04540874112772403, "learning_rate": 2.585959304695462e-07, "loss": 0.0001, "step": 9861 }, { "epoch": 4.48680618744313, "grad_norm": 0.1299983675938219, "learning_rate": 2.581424189512205e-07, "loss": 0.0012, "step": 9862 }, { "epoch": 4.487261146496815, "grad_norm": 0.06896514245733236, "learning_rate": 2.576892949160825e-07, "loss": 0.0006, "step": 9863 }, { "epoch": 4.487716105550501, "grad_norm": 0.0576733226007471, "learning_rate": 2.572365584011599e-07, "loss": 0.0005, "step": 9864 }, { "epoch": 4.4881710646041855, "grad_norm": 0.10620583590965074, "learning_rate": 2.567842094434464e-07, "loss": 0.0007, "step": 9865 }, { "epoch": 4.48862602365787, "grad_norm": 0.013973205863701958, "learning_rate": 2.56332248079908e-07, "loss": 0.0001, "step": 9866 }, { "epoch": 4.489080982711556, "grad_norm": 0.014089263477686374, "learning_rate": 2.558806743474762e-07, "loss": 0.0001, "step": 9867 }, { "epoch": 4.489535941765241, "grad_norm": 0.05230955011793734, "learning_rate": 2.5542948828305104e-07, "loss": 0.0004, "step": 9868 }, { "epoch": 4.489990900818927, "grad_norm": 0.010346874428992351, "learning_rate": 2.5497868992350184e-07, "loss": 0.0, "step": 9869 }, { "epoch": 4.490445859872612, "grad_norm": 0.02913525149330349, "learning_rate": 2.545282793056653e-07, "loss": 0.0002, "step": 9870 }, { "epoch": 4.4909008189262964, "grad_norm": 0.16629691698138255, "learning_rate": 2.54078256466348e-07, "loss": 0.0017, "step": 9871 }, { "epoch": 4.491355777979982, "grad_norm": 0.05124612674058239, "learning_rate": 2.5362862144232336e-07, "loss": 0.0003, "step": 9872 }, { "epoch": 4.491810737033667, "grad_norm": 0.24363180548823193, "learning_rate": 2.531793742703331e-07, "loss": 0.0013, "step": 9873 }, { "epoch": 4.492265696087352, "grad_norm": 0.021724911127147375, "learning_rate": 2.527305149870879e-07, "loss": 0.0001, "step": 9874 }, { "epoch": 4.492720655141038, "grad_norm": 0.2283618288108952, "learning_rate": 2.522820436292667e-07, "loss": 0.0034, "step": 9875 }, { "epoch": 4.4931756141947226, "grad_norm": 0.15896141386604887, "learning_rate": 2.5183396023351567e-07, "loss": 0.0012, "step": 9876 }, { "epoch": 4.493630573248407, "grad_norm": 0.12338300992770802, "learning_rate": 2.513862648364518e-07, "loss": 0.0012, "step": 9877 }, { "epoch": 4.494085532302093, "grad_norm": 0.011596265287238042, "learning_rate": 2.509389574746579e-07, "loss": 0.0001, "step": 9878 }, { "epoch": 4.494540491355778, "grad_norm": 0.14052493615326575, "learning_rate": 2.5049203818468537e-07, "loss": 0.0004, "step": 9879 }, { "epoch": 4.494995450409463, "grad_norm": 0.03327082778941423, "learning_rate": 2.500455070030544e-07, "loss": 0.0001, "step": 9880 }, { "epoch": 4.495450409463149, "grad_norm": 0.054568840509463375, "learning_rate": 2.495993639662547e-07, "loss": 0.0004, "step": 9881 }, { "epoch": 4.4959053685168335, "grad_norm": 0.1119712940527868, "learning_rate": 2.491536091107427e-07, "loss": 0.0008, "step": 9882 }, { "epoch": 4.496360327570518, "grad_norm": 0.1320771439856323, "learning_rate": 2.48708242472942e-07, "loss": 0.0007, "step": 9883 }, { "epoch": 4.496815286624204, "grad_norm": 0.07088333664436038, "learning_rate": 2.4826326408924795e-07, "loss": 0.0005, "step": 9884 }, { "epoch": 4.497270245677889, "grad_norm": 0.021123830596431616, "learning_rate": 2.4781867399602033e-07, "loss": 0.0001, "step": 9885 }, { "epoch": 4.497725204731574, "grad_norm": 0.28202501994495616, "learning_rate": 2.473744722295912e-07, "loss": 0.0038, "step": 9886 }, { "epoch": 4.49818016378526, "grad_norm": 0.027920598785741005, "learning_rate": 2.469306588262571e-07, "loss": 0.0002, "step": 9887 }, { "epoch": 4.4986351228389445, "grad_norm": 0.011353369280796066, "learning_rate": 2.4648723382228513e-07, "loss": 0.0001, "step": 9888 }, { "epoch": 4.499090081892629, "grad_norm": 0.012219980387193708, "learning_rate": 2.4604419725390906e-07, "loss": 0.0001, "step": 9889 }, { "epoch": 4.499545040946315, "grad_norm": 0.04660593775571815, "learning_rate": 2.456015491573327e-07, "loss": 0.0002, "step": 9890 }, { "epoch": 4.5, "grad_norm": 0.04883669816785238, "learning_rate": 2.4515928956872716e-07, "loss": 0.0004, "step": 9891 }, { "epoch": 4.500454959053685, "grad_norm": 0.010114742930456203, "learning_rate": 2.447174185242324e-07, "loss": 0.0001, "step": 9892 }, { "epoch": 4.500909918107371, "grad_norm": 0.4722986074888995, "learning_rate": 2.4427593605995505e-07, "loss": 0.0014, "step": 9893 }, { "epoch": 4.5013648771610555, "grad_norm": 0.045812815872761495, "learning_rate": 2.438348422119713e-07, "loss": 0.0003, "step": 9894 }, { "epoch": 4.50181983621474, "grad_norm": 0.004933274382772532, "learning_rate": 2.4339413701632617e-07, "loss": 0.0, "step": 9895 }, { "epoch": 4.502274795268426, "grad_norm": 0.020359412302883535, "learning_rate": 2.4295382050903147e-07, "loss": 0.0001, "step": 9896 }, { "epoch": 4.502729754322111, "grad_norm": 0.023678091742378612, "learning_rate": 2.4251389272606674e-07, "loss": 0.0002, "step": 9897 }, { "epoch": 4.503184713375796, "grad_norm": 0.12961085944872078, "learning_rate": 2.4207435370338374e-07, "loss": 0.0012, "step": 9898 }, { "epoch": 4.503639672429482, "grad_norm": 0.06681443463339036, "learning_rate": 2.416352034768965e-07, "loss": 0.0004, "step": 9899 }, { "epoch": 4.5040946314831665, "grad_norm": 0.052942536083503584, "learning_rate": 2.411964420824925e-07, "loss": 0.0004, "step": 9900 }, { "epoch": 4.504549590536851, "grad_norm": 0.0482369922530543, "learning_rate": 2.407580695560252e-07, "loss": 0.0002, "step": 9901 }, { "epoch": 4.505004549590537, "grad_norm": 0.09560473705119904, "learning_rate": 2.4032008593331544e-07, "loss": 0.0011, "step": 9902 }, { "epoch": 4.505459508644222, "grad_norm": 0.2272654642815518, "learning_rate": 2.398824912501535e-07, "loss": 0.0012, "step": 9903 }, { "epoch": 4.505914467697907, "grad_norm": 0.04665829289119957, "learning_rate": 2.3944528554229795e-07, "loss": 0.0002, "step": 9904 }, { "epoch": 4.506369426751593, "grad_norm": 0.04837064452101903, "learning_rate": 2.390084688454752e-07, "loss": 0.0002, "step": 9905 }, { "epoch": 4.5068243858052774, "grad_norm": 0.015412511288154834, "learning_rate": 2.3857204119538016e-07, "loss": 0.0001, "step": 9906 }, { "epoch": 4.507279344858962, "grad_norm": 0.06909807662246724, "learning_rate": 2.3813600262767578e-07, "loss": 0.0005, "step": 9907 }, { "epoch": 4.507734303912648, "grad_norm": 0.019903526284543397, "learning_rate": 2.37700353177992e-07, "loss": 0.0001, "step": 9908 }, { "epoch": 4.508189262966333, "grad_norm": 0.34300063357150967, "learning_rate": 2.3726509288192977e-07, "loss": 0.0013, "step": 9909 }, { "epoch": 4.508644222020019, "grad_norm": 0.006279392129840297, "learning_rate": 2.3683022177505565e-07, "loss": 0.0, "step": 9910 }, { "epoch": 4.5090991810737036, "grad_norm": 0.02269171887515086, "learning_rate": 2.363957398929051e-07, "loss": 0.0001, "step": 9911 }, { "epoch": 4.509554140127388, "grad_norm": 0.04685891906308402, "learning_rate": 2.3596164727098304e-07, "loss": 0.0003, "step": 9912 }, { "epoch": 4.510009099181074, "grad_norm": 0.005558002100558519, "learning_rate": 2.3552794394476053e-07, "loss": 0.0, "step": 9913 }, { "epoch": 4.510464058234759, "grad_norm": 0.11995212190840954, "learning_rate": 2.3509462994967868e-07, "loss": 0.0018, "step": 9914 }, { "epoch": 4.510919017288444, "grad_norm": 0.1817786184983082, "learning_rate": 2.346617053211453e-07, "loss": 0.0017, "step": 9915 }, { "epoch": 4.51137397634213, "grad_norm": 0.324050688034584, "learning_rate": 2.342291700945376e-07, "loss": 0.0062, "step": 9916 }, { "epoch": 4.5118289353958145, "grad_norm": 0.04382471818462054, "learning_rate": 2.3379702430520013e-07, "loss": 0.0002, "step": 9917 }, { "epoch": 4.512283894449499, "grad_norm": 0.11465236073782037, "learning_rate": 2.3336526798844517e-07, "loss": 0.0006, "step": 9918 }, { "epoch": 4.512738853503185, "grad_norm": 0.05984669697104292, "learning_rate": 2.3293390117955394e-07, "loss": 0.0005, "step": 9919 }, { "epoch": 4.51319381255687, "grad_norm": 0.2909316393569093, "learning_rate": 2.325029239137777e-07, "loss": 0.0031, "step": 9920 }, { "epoch": 4.513648771610555, "grad_norm": 0.07713496730362766, "learning_rate": 2.3207233622633275e-07, "loss": 0.0003, "step": 9921 }, { "epoch": 4.514103730664241, "grad_norm": 0.11585690530414175, "learning_rate": 2.3164213815240476e-07, "loss": 0.0018, "step": 9922 }, { "epoch": 4.5145586897179255, "grad_norm": 0.300016029472826, "learning_rate": 2.312123297271468e-07, "loss": 0.004, "step": 9923 }, { "epoch": 4.51501364877161, "grad_norm": 0.06862492293195506, "learning_rate": 2.3078291098568184e-07, "loss": 0.0008, "step": 9924 }, { "epoch": 4.515468607825296, "grad_norm": 0.20677295281035155, "learning_rate": 2.303538819630996e-07, "loss": 0.0042, "step": 9925 }, { "epoch": 4.515923566878981, "grad_norm": 0.008484838853610744, "learning_rate": 2.299252426944587e-07, "loss": 0.0001, "step": 9926 }, { "epoch": 4.516378525932666, "grad_norm": 0.088911079301893, "learning_rate": 2.2949699321478612e-07, "loss": 0.0004, "step": 9927 }, { "epoch": 4.516833484986352, "grad_norm": 0.3833271543482625, "learning_rate": 2.29069133559075e-07, "loss": 0.009, "step": 9928 }, { "epoch": 4.5172884440400365, "grad_norm": 0.3084607327507269, "learning_rate": 2.286416637622896e-07, "loss": 0.0035, "step": 9929 }, { "epoch": 4.517743403093721, "grad_norm": 0.2540997972747094, "learning_rate": 2.2821458385936025e-07, "loss": 0.0053, "step": 9930 }, { "epoch": 4.518198362147407, "grad_norm": 0.005081740422745523, "learning_rate": 2.2778789388518573e-07, "loss": 0.0, "step": 9931 }, { "epoch": 4.518653321201092, "grad_norm": 0.008606815220825223, "learning_rate": 2.2736159387463264e-07, "loss": 0.0, "step": 9932 }, { "epoch": 4.519108280254777, "grad_norm": 0.08896213781823452, "learning_rate": 2.269356838625375e-07, "loss": 0.0007, "step": 9933 }, { "epoch": 4.519563239308463, "grad_norm": 0.04512275087179489, "learning_rate": 2.2651016388370361e-07, "loss": 0.0003, "step": 9934 }, { "epoch": 4.5200181983621475, "grad_norm": 0.09403573057466452, "learning_rate": 2.2608503397290203e-07, "loss": 0.0009, "step": 9935 }, { "epoch": 4.520473157415832, "grad_norm": 0.16105008189387443, "learning_rate": 2.2566029416487333e-07, "loss": 0.0004, "step": 9936 }, { "epoch": 4.520928116469518, "grad_norm": 0.24235688766509916, "learning_rate": 2.252359444943236e-07, "loss": 0.0033, "step": 9937 }, { "epoch": 4.521383075523203, "grad_norm": 0.05418808698563803, "learning_rate": 2.248119849959307e-07, "loss": 0.0005, "step": 9938 }, { "epoch": 4.521838034576888, "grad_norm": 0.10286283469703374, "learning_rate": 2.243884157043369e-07, "loss": 0.001, "step": 9939 }, { "epoch": 4.522292993630574, "grad_norm": 0.05820316958147569, "learning_rate": 2.2396523665415615e-07, "loss": 0.0003, "step": 9940 }, { "epoch": 4.522747952684258, "grad_norm": 0.0851026713285884, "learning_rate": 2.2354244787996748e-07, "loss": 0.0005, "step": 9941 }, { "epoch": 4.523202911737943, "grad_norm": 0.04007618882852361, "learning_rate": 2.2312004941631936e-07, "loss": 0.0004, "step": 9942 }, { "epoch": 4.523657870791629, "grad_norm": 0.05060604467978839, "learning_rate": 2.226980412977292e-07, "loss": 0.0004, "step": 9943 }, { "epoch": 4.524112829845314, "grad_norm": 0.6355256071773773, "learning_rate": 2.2227642355868107e-07, "loss": 0.0009, "step": 9944 }, { "epoch": 4.524567788898999, "grad_norm": 0.2100560207057896, "learning_rate": 2.218551962336274e-07, "loss": 0.0018, "step": 9945 }, { "epoch": 4.5250227479526846, "grad_norm": 0.03370484451732676, "learning_rate": 2.2143435935698897e-07, "loss": 0.0001, "step": 9946 }, { "epoch": 4.525477707006369, "grad_norm": 0.024307783656008334, "learning_rate": 2.2101391296315444e-07, "loss": 0.0002, "step": 9947 }, { "epoch": 4.525932666060054, "grad_norm": 0.2007286369976889, "learning_rate": 2.2059385708648183e-07, "loss": 0.0029, "step": 9948 }, { "epoch": 4.52638762511374, "grad_norm": 0.016459887056630526, "learning_rate": 2.2017419176129596e-07, "loss": 0.0001, "step": 9949 }, { "epoch": 4.526842584167425, "grad_norm": 0.019678851060355582, "learning_rate": 2.197549170218899e-07, "loss": 0.0001, "step": 9950 }, { "epoch": 4.52729754322111, "grad_norm": 0.024405814092407618, "learning_rate": 2.1933603290252404e-07, "loss": 0.0002, "step": 9951 }, { "epoch": 4.5277525022747955, "grad_norm": 0.033450062320797204, "learning_rate": 2.1891753943742766e-07, "loss": 0.0002, "step": 9952 }, { "epoch": 4.52820746132848, "grad_norm": 0.060115051084917326, "learning_rate": 2.1849943666079899e-07, "loss": 0.0003, "step": 9953 }, { "epoch": 4.528662420382165, "grad_norm": 0.15643968431611702, "learning_rate": 2.1808172460680399e-07, "loss": 0.0011, "step": 9954 }, { "epoch": 4.529117379435851, "grad_norm": 0.35152894563582243, "learning_rate": 2.176644033095754e-07, "loss": 0.0014, "step": 9955 }, { "epoch": 4.529572338489536, "grad_norm": 0.13707314479892121, "learning_rate": 2.172474728032148e-07, "loss": 0.001, "step": 9956 }, { "epoch": 4.530027297543221, "grad_norm": 0.17159233844994978, "learning_rate": 2.168309331217916e-07, "loss": 0.0014, "step": 9957 }, { "epoch": 4.5304822565969065, "grad_norm": 0.19317591511616486, "learning_rate": 2.1641478429934415e-07, "loss": 0.0021, "step": 9958 }, { "epoch": 4.530937215650591, "grad_norm": 0.1222003441188645, "learning_rate": 2.1599902636987858e-07, "loss": 0.0011, "step": 9959 }, { "epoch": 4.531392174704276, "grad_norm": 0.3074446990591251, "learning_rate": 2.1558365936736715e-07, "loss": 0.0009, "step": 9960 }, { "epoch": 4.531847133757962, "grad_norm": 0.11791867665102791, "learning_rate": 2.151686833257538e-07, "loss": 0.0015, "step": 9961 }, { "epoch": 4.532302092811647, "grad_norm": 0.014327161167464516, "learning_rate": 2.14754098278947e-07, "loss": 0.0001, "step": 9962 }, { "epoch": 4.532757051865332, "grad_norm": 0.10183073000157901, "learning_rate": 2.1433990426082574e-07, "loss": 0.0009, "step": 9963 }, { "epoch": 4.5332120109190175, "grad_norm": 0.04708548455085974, "learning_rate": 2.1392610130523574e-07, "loss": 0.0001, "step": 9964 }, { "epoch": 4.533666969972702, "grad_norm": 0.07008209420271527, "learning_rate": 2.13512689445991e-07, "loss": 0.0003, "step": 9965 }, { "epoch": 4.534121929026387, "grad_norm": 0.027011181244921068, "learning_rate": 2.1309966871687292e-07, "loss": 0.0001, "step": 9966 }, { "epoch": 4.534576888080073, "grad_norm": 0.18192988706667365, "learning_rate": 2.1268703915163225e-07, "loss": 0.0018, "step": 9967 }, { "epoch": 4.535031847133758, "grad_norm": 0.10342716911334705, "learning_rate": 2.1227480078398866e-07, "loss": 0.002, "step": 9968 }, { "epoch": 4.535486806187443, "grad_norm": 0.010329184257664265, "learning_rate": 2.1186295364762687e-07, "loss": 0.0001, "step": 9969 }, { "epoch": 4.5359417652411285, "grad_norm": 0.011847811971578165, "learning_rate": 2.114514977762011e-07, "loss": 0.0001, "step": 9970 }, { "epoch": 4.536396724294813, "grad_norm": 0.028508735206714417, "learning_rate": 2.1104043320333388e-07, "loss": 0.0002, "step": 9971 }, { "epoch": 4.536851683348498, "grad_norm": 0.02471678864265696, "learning_rate": 2.1062975996261615e-07, "loss": 0.0001, "step": 9972 }, { "epoch": 4.537306642402184, "grad_norm": 0.15598499056191162, "learning_rate": 2.1021947808760602e-07, "loss": 0.001, "step": 9973 }, { "epoch": 4.537761601455869, "grad_norm": 0.15395266129932716, "learning_rate": 2.098095876118289e-07, "loss": 0.0007, "step": 9974 }, { "epoch": 4.538216560509554, "grad_norm": 0.01510795330811687, "learning_rate": 2.094000885687808e-07, "loss": 0.0001, "step": 9975 }, { "epoch": 4.538671519563239, "grad_norm": 0.16855095827378602, "learning_rate": 2.0899098099192273e-07, "loss": 0.0028, "step": 9976 }, { "epoch": 4.539126478616924, "grad_norm": 0.003939923343996043, "learning_rate": 2.085822649146857e-07, "loss": 0.0, "step": 9977 }, { "epoch": 4.539581437670609, "grad_norm": 0.20812141974751644, "learning_rate": 2.0817394037046856e-07, "loss": 0.001, "step": 9978 }, { "epoch": 4.540036396724295, "grad_norm": 0.007092484874895985, "learning_rate": 2.0776600739263742e-07, "loss": 0.0, "step": 9979 }, { "epoch": 4.54049135577798, "grad_norm": 0.07559988175948198, "learning_rate": 2.0735846601452613e-07, "loss": 0.0008, "step": 9980 }, { "epoch": 4.540946314831665, "grad_norm": 0.26008675786349245, "learning_rate": 2.06951316269437e-07, "loss": 0.0066, "step": 9981 }, { "epoch": 4.54140127388535, "grad_norm": 0.009340358641583231, "learning_rate": 2.0654455819064222e-07, "loss": 0.0, "step": 9982 }, { "epoch": 4.541856232939035, "grad_norm": 0.07075574860979925, "learning_rate": 2.0613819181137918e-07, "loss": 0.0006, "step": 9983 }, { "epoch": 4.542311191992721, "grad_norm": 0.0681312860868519, "learning_rate": 2.0573221716485402e-07, "loss": 0.0005, "step": 9984 }, { "epoch": 4.542766151046406, "grad_norm": 0.1720352875540983, "learning_rate": 2.0532663428424138e-07, "loss": 0.003, "step": 9985 }, { "epoch": 4.543221110100091, "grad_norm": 0.06982106929868871, "learning_rate": 2.0492144320268247e-07, "loss": 0.0006, "step": 9986 }, { "epoch": 4.5436760691537765, "grad_norm": 0.013441651889484069, "learning_rate": 2.045166439532903e-07, "loss": 0.0001, "step": 9987 }, { "epoch": 4.544131028207461, "grad_norm": 0.09723690321988937, "learning_rate": 2.0411223656914058e-07, "loss": 0.0007, "step": 9988 }, { "epoch": 4.544585987261146, "grad_norm": 0.0589015189687312, "learning_rate": 2.0370822108328191e-07, "loss": 0.0001, "step": 9989 }, { "epoch": 4.545040946314832, "grad_norm": 0.04474927927366833, "learning_rate": 2.0330459752872734e-07, "loss": 0.0002, "step": 9990 }, { "epoch": 4.545495905368517, "grad_norm": 0.015049182841729113, "learning_rate": 2.0290136593845821e-07, "loss": 0.0001, "step": 9991 }, { "epoch": 4.545950864422202, "grad_norm": 0.016189247709023722, "learning_rate": 2.024985263454271e-07, "loss": 0.0001, "step": 9992 }, { "epoch": 4.5464058234758875, "grad_norm": 0.027049167856260638, "learning_rate": 2.0209607878255156e-07, "loss": 0.0002, "step": 9993 }, { "epoch": 4.546860782529572, "grad_norm": 0.010416542932749248, "learning_rate": 2.0169402328271637e-07, "loss": 0.0, "step": 9994 }, { "epoch": 4.547315741583257, "grad_norm": 0.009078294144646514, "learning_rate": 2.0129235987877694e-07, "loss": 0.0, "step": 9995 }, { "epoch": 4.547770700636943, "grad_norm": 0.07828669015635847, "learning_rate": 2.0089108860355422e-07, "loss": 0.0009, "step": 9996 }, { "epoch": 4.548225659690628, "grad_norm": 0.04651502968247596, "learning_rate": 2.0049020948984033e-07, "loss": 0.0003, "step": 9997 }, { "epoch": 4.548680618744313, "grad_norm": 0.03401092455072713, "learning_rate": 2.0008972257039184e-07, "loss": 0.0002, "step": 9998 }, { "epoch": 4.5491355777979985, "grad_norm": 0.011284210409231964, "learning_rate": 1.9968962787793534e-07, "loss": 0.0001, "step": 9999 }, { "epoch": 4.549590536851683, "grad_norm": 0.05301564104224029, "learning_rate": 1.9928992544516356e-07, "loss": 0.0001, "step": 10000 }, { "epoch": 4.550045495905368, "grad_norm": 0.042940566955574654, "learning_rate": 1.9889061530473986e-07, "loss": 0.0002, "step": 10001 }, { "epoch": 4.550500454959054, "grad_norm": 0.07118417003742626, "learning_rate": 1.984916974892931e-07, "loss": 0.0009, "step": 10002 }, { "epoch": 4.550955414012739, "grad_norm": 0.01901971376805956, "learning_rate": 1.9809317203142165e-07, "loss": 0.0002, "step": 10003 }, { "epoch": 4.551410373066424, "grad_norm": 0.08501062045859577, "learning_rate": 1.9769503896369167e-07, "loss": 0.0006, "step": 10004 }, { "epoch": 4.5518653321201095, "grad_norm": 0.046243475480679676, "learning_rate": 1.9729729831863497e-07, "loss": 0.0003, "step": 10005 }, { "epoch": 4.552320291173794, "grad_norm": 0.029729060496125403, "learning_rate": 1.968999501287544e-07, "loss": 0.0002, "step": 10006 }, { "epoch": 4.552775250227479, "grad_norm": 0.020723444242764136, "learning_rate": 1.965029944265201e-07, "loss": 0.0001, "step": 10007 }, { "epoch": 4.553230209281165, "grad_norm": 0.030507413711084074, "learning_rate": 1.9610643124436834e-07, "loss": 0.0002, "step": 10008 }, { "epoch": 4.55368516833485, "grad_norm": 0.05761848108370722, "learning_rate": 1.9571026061470432e-07, "loss": 0.0002, "step": 10009 }, { "epoch": 4.554140127388535, "grad_norm": 0.3637030185561456, "learning_rate": 1.953144825699016e-07, "loss": 0.005, "step": 10010 }, { "epoch": 4.55459508644222, "grad_norm": 0.1264561631131197, "learning_rate": 1.9491909714230207e-07, "loss": 0.0006, "step": 10011 }, { "epoch": 4.555050045495905, "grad_norm": 0.02897131742345199, "learning_rate": 1.9452410436421486e-07, "loss": 0.0001, "step": 10012 }, { "epoch": 4.55550500454959, "grad_norm": 0.0543235407243238, "learning_rate": 1.9412950426791645e-07, "loss": 0.0001, "step": 10013 }, { "epoch": 4.555959963603276, "grad_norm": 0.0335056341710435, "learning_rate": 1.9373529688565095e-07, "loss": 0.0002, "step": 10014 }, { "epoch": 4.556414922656961, "grad_norm": 0.09350690681472225, "learning_rate": 1.9334148224963267e-07, "loss": 0.001, "step": 10015 }, { "epoch": 4.556869881710646, "grad_norm": 0.09716939135450206, "learning_rate": 1.9294806039204139e-07, "loss": 0.001, "step": 10016 }, { "epoch": 4.557324840764331, "grad_norm": 0.0810259986721641, "learning_rate": 1.925550313450264e-07, "loss": 0.0008, "step": 10017 }, { "epoch": 4.557779799818016, "grad_norm": 0.010121372960152175, "learning_rate": 1.9216239514070422e-07, "loss": 0.0001, "step": 10018 }, { "epoch": 4.558234758871702, "grad_norm": 0.004985594013211901, "learning_rate": 1.9177015181115866e-07, "loss": 0.0, "step": 10019 }, { "epoch": 4.558689717925387, "grad_norm": 0.07959891846905738, "learning_rate": 1.9137830138844295e-07, "loss": 0.0007, "step": 10020 }, { "epoch": 4.559144676979072, "grad_norm": 0.011407829444275024, "learning_rate": 1.90986843904577e-07, "loss": 0.0001, "step": 10021 }, { "epoch": 4.5595996360327575, "grad_norm": 0.003986694614751018, "learning_rate": 1.9059577939154917e-07, "loss": 0.0, "step": 10022 }, { "epoch": 4.560054595086442, "grad_norm": 0.06184289434632973, "learning_rate": 1.9020510788131385e-07, "loss": 0.0003, "step": 10023 }, { "epoch": 4.560509554140127, "grad_norm": 0.009696954144415907, "learning_rate": 1.898148294057972e-07, "loss": 0.0001, "step": 10024 }, { "epoch": 4.560964513193813, "grad_norm": 0.014845767753796878, "learning_rate": 1.8942494399688983e-07, "loss": 0.0001, "step": 10025 }, { "epoch": 4.561419472247498, "grad_norm": 0.030705769926938763, "learning_rate": 1.8903545168645177e-07, "loss": 0.0003, "step": 10026 }, { "epoch": 4.561874431301183, "grad_norm": 0.005480069322467514, "learning_rate": 1.8864635250631091e-07, "loss": 0.0, "step": 10027 }, { "epoch": 4.5623293903548685, "grad_norm": 0.2060006847676165, "learning_rate": 1.8825764648826182e-07, "loss": 0.0017, "step": 10028 }, { "epoch": 4.562784349408553, "grad_norm": 0.02037804494756718, "learning_rate": 1.87869333664068e-07, "loss": 0.0001, "step": 10029 }, { "epoch": 4.563239308462238, "grad_norm": 0.07736862950887789, "learning_rate": 1.8748141406546072e-07, "loss": 0.0004, "step": 10030 }, { "epoch": 4.563694267515924, "grad_norm": 0.003152446657269879, "learning_rate": 1.8709388772413962e-07, "loss": 0.0, "step": 10031 }, { "epoch": 4.564149226569609, "grad_norm": 0.1468126676196871, "learning_rate": 1.8670675467177102e-07, "loss": 0.0017, "step": 10032 }, { "epoch": 4.564604185623294, "grad_norm": 0.038073502984638075, "learning_rate": 1.863200149399902e-07, "loss": 0.0003, "step": 10033 }, { "epoch": 4.5650591446769795, "grad_norm": 0.03545595884241631, "learning_rate": 1.8593366856039852e-07, "loss": 0.0002, "step": 10034 }, { "epoch": 4.565514103730664, "grad_norm": 0.0077038321409879755, "learning_rate": 1.8554771556456796e-07, "loss": 0.0001, "step": 10035 }, { "epoch": 4.565969062784349, "grad_norm": 0.3464187108032849, "learning_rate": 1.8516215598403609e-07, "loss": 0.0076, "step": 10036 }, { "epoch": 4.566424021838035, "grad_norm": 0.07887769120835544, "learning_rate": 1.847769898503088e-07, "loss": 0.0006, "step": 10037 }, { "epoch": 4.56687898089172, "grad_norm": 0.04025740671936166, "learning_rate": 1.8439221719486088e-07, "loss": 0.0007, "step": 10038 }, { "epoch": 4.567333939945405, "grad_norm": 0.01815608476508528, "learning_rate": 1.8400783804913335e-07, "loss": 0.0001, "step": 10039 }, { "epoch": 4.5677888989990905, "grad_norm": 0.12191023964046135, "learning_rate": 1.8362385244453718e-07, "loss": 0.0011, "step": 10040 }, { "epoch": 4.568243858052775, "grad_norm": 0.027642578797082033, "learning_rate": 1.8324026041244947e-07, "loss": 0.0003, "step": 10041 }, { "epoch": 4.56869881710646, "grad_norm": 0.0029447030448456783, "learning_rate": 1.8285706198421516e-07, "loss": 0.0, "step": 10042 }, { "epoch": 4.569153776160146, "grad_norm": 0.10491726350368716, "learning_rate": 1.8247425719114696e-07, "loss": 0.0008, "step": 10043 }, { "epoch": 4.569608735213831, "grad_norm": 0.06091755817156818, "learning_rate": 1.820918460645271e-07, "loss": 0.0003, "step": 10044 }, { "epoch": 4.570063694267516, "grad_norm": 0.012433108077342896, "learning_rate": 1.8170982863560449e-07, "loss": 0.0001, "step": 10045 }, { "epoch": 4.570518653321201, "grad_norm": 0.04716933765658577, "learning_rate": 1.813282049355952e-07, "loss": 0.0002, "step": 10046 }, { "epoch": 4.570973612374886, "grad_norm": 0.03797918315483642, "learning_rate": 1.8094697499568437e-07, "loss": 0.0003, "step": 10047 }, { "epoch": 4.571428571428571, "grad_norm": 0.014231168195735184, "learning_rate": 1.8056613884702313e-07, "loss": 0.0001, "step": 10048 }, { "epoch": 4.571883530482257, "grad_norm": 0.18676111030865297, "learning_rate": 1.801856965207338e-07, "loss": 0.0032, "step": 10049 }, { "epoch": 4.572338489535942, "grad_norm": 0.07688337790373528, "learning_rate": 1.798056480479027e-07, "loss": 0.0007, "step": 10050 }, { "epoch": 4.572793448589627, "grad_norm": 0.009727580828325387, "learning_rate": 1.79425993459586e-07, "loss": 0.0001, "step": 10051 }, { "epoch": 4.573248407643312, "grad_norm": 0.08603696470832439, "learning_rate": 1.7904673278680838e-07, "loss": 0.0002, "step": 10052 }, { "epoch": 4.573703366696997, "grad_norm": 0.10407459027056128, "learning_rate": 1.7866786606055953e-07, "loss": 0.0008, "step": 10053 }, { "epoch": 4.574158325750682, "grad_norm": 0.02914426573795828, "learning_rate": 1.7828939331180073e-07, "loss": 0.0001, "step": 10054 }, { "epoch": 4.574613284804368, "grad_norm": 0.018091803266200208, "learning_rate": 1.7791131457145727e-07, "loss": 0.0001, "step": 10055 }, { "epoch": 4.575068243858053, "grad_norm": 0.011430570573384806, "learning_rate": 1.7753362987042555e-07, "loss": 0.0001, "step": 10056 }, { "epoch": 4.575523202911738, "grad_norm": 0.1189297919880683, "learning_rate": 1.7715633923956753e-07, "loss": 0.0011, "step": 10057 }, { "epoch": 4.575978161965423, "grad_norm": 0.28925825709099545, "learning_rate": 1.7677944270971193e-07, "loss": 0.0028, "step": 10058 }, { "epoch": 4.576433121019108, "grad_norm": 0.12277351557467386, "learning_rate": 1.7640294031166073e-07, "loss": 0.0031, "step": 10059 }, { "epoch": 4.576888080072793, "grad_norm": 0.17140840155551917, "learning_rate": 1.7602683207617766e-07, "loss": 0.0015, "step": 10060 }, { "epoch": 4.577343039126479, "grad_norm": 0.1066701218054566, "learning_rate": 1.7565111803399704e-07, "loss": 0.0003, "step": 10061 }, { "epoch": 4.577797998180164, "grad_norm": 0.045872613806426064, "learning_rate": 1.7527579821582042e-07, "loss": 0.0008, "step": 10062 }, { "epoch": 4.578252957233849, "grad_norm": 0.018937886976938538, "learning_rate": 1.749008726523166e-07, "loss": 0.0001, "step": 10063 }, { "epoch": 4.578707916287534, "grad_norm": 0.06746145909838563, "learning_rate": 1.745263413741244e-07, "loss": 0.0002, "step": 10064 }, { "epoch": 4.579162875341219, "grad_norm": 0.11869234682558218, "learning_rate": 1.741522044118471e-07, "loss": 0.0014, "step": 10065 }, { "epoch": 4.579617834394904, "grad_norm": 0.08853164214495346, "learning_rate": 1.7377846179605918e-07, "loss": 0.0002, "step": 10066 }, { "epoch": 4.58007279344859, "grad_norm": 0.0783136492175814, "learning_rate": 1.7340511355730006e-07, "loss": 0.0003, "step": 10067 }, { "epoch": 4.580527752502275, "grad_norm": 0.03460472027384699, "learning_rate": 1.7303215972607813e-07, "loss": 0.0002, "step": 10068 }, { "epoch": 4.58098271155596, "grad_norm": 0.007659397505490161, "learning_rate": 1.726596003328701e-07, "loss": 0.0, "step": 10069 }, { "epoch": 4.581437670609645, "grad_norm": 0.1955389345567847, "learning_rate": 1.7228743540811943e-07, "loss": 0.0004, "step": 10070 }, { "epoch": 4.58189262966333, "grad_norm": 0.024683740437104366, "learning_rate": 1.719156649822379e-07, "loss": 0.0002, "step": 10071 }, { "epoch": 4.582347588717015, "grad_norm": 0.36227605669775986, "learning_rate": 1.7154428908560394e-07, "loss": 0.0022, "step": 10072 }, { "epoch": 4.582802547770701, "grad_norm": 0.26752782591049856, "learning_rate": 1.7117330774856555e-07, "loss": 0.0013, "step": 10073 }, { "epoch": 4.583257506824386, "grad_norm": 0.06650524457738367, "learning_rate": 1.7080272100143847e-07, "loss": 0.0009, "step": 10074 }, { "epoch": 4.583712465878071, "grad_norm": 0.06080457162296043, "learning_rate": 1.7043252887450456e-07, "loss": 0.0003, "step": 10075 }, { "epoch": 4.584167424931756, "grad_norm": 0.01789797385823625, "learning_rate": 1.700627313980141e-07, "loss": 0.0002, "step": 10076 }, { "epoch": 4.584622383985441, "grad_norm": 0.006409853048861973, "learning_rate": 1.6969332860218514e-07, "loss": 0.0, "step": 10077 }, { "epoch": 4.585077343039126, "grad_norm": 0.17817406065578753, "learning_rate": 1.6932432051720405e-07, "loss": 0.0027, "step": 10078 }, { "epoch": 4.585532302092812, "grad_norm": 0.0927044138884308, "learning_rate": 1.6895570717322397e-07, "loss": 0.0011, "step": 10079 }, { "epoch": 4.585987261146497, "grad_norm": 0.02084884267294616, "learning_rate": 1.6858748860036745e-07, "loss": 0.0001, "step": 10080 }, { "epoch": 4.5864422202001816, "grad_norm": 0.14161989033558514, "learning_rate": 1.6821966482872264e-07, "loss": 0.001, "step": 10081 }, { "epoch": 4.586897179253867, "grad_norm": 0.1833901036683208, "learning_rate": 1.678522358883461e-07, "loss": 0.0055, "step": 10082 }, { "epoch": 4.587352138307552, "grad_norm": 0.03488514339002122, "learning_rate": 1.6748520180926376e-07, "loss": 0.0001, "step": 10083 }, { "epoch": 4.587807097361237, "grad_norm": 0.025599110099182264, "learning_rate": 1.671185626214672e-07, "loss": 0.0001, "step": 10084 }, { "epoch": 4.588262056414923, "grad_norm": 0.09677698649934258, "learning_rate": 1.667523183549169e-07, "loss": 0.0009, "step": 10085 }, { "epoch": 4.588717015468608, "grad_norm": 0.14701762866160098, "learning_rate": 1.6638646903953947e-07, "loss": 0.002, "step": 10086 }, { "epoch": 4.5891719745222925, "grad_norm": 0.03505381396798078, "learning_rate": 1.6602101470523158e-07, "loss": 0.0003, "step": 10087 }, { "epoch": 4.589626933575978, "grad_norm": 0.010135940452738574, "learning_rate": 1.6565595538185707e-07, "loss": 0.0001, "step": 10088 }, { "epoch": 4.590081892629663, "grad_norm": 0.010896084217685007, "learning_rate": 1.6529129109924547e-07, "loss": 0.0001, "step": 10089 }, { "epoch": 4.590536851683348, "grad_norm": 0.006505564507405903, "learning_rate": 1.649270218871968e-07, "loss": 0.0, "step": 10090 }, { "epoch": 4.590991810737034, "grad_norm": 0.2488147576899445, "learning_rate": 1.645631477754761e-07, "loss": 0.0019, "step": 10091 }, { "epoch": 4.591446769790719, "grad_norm": 0.04038727042596749, "learning_rate": 1.641996687938191e-07, "loss": 0.0004, "step": 10092 }, { "epoch": 4.5919017288444035, "grad_norm": 0.01763999325373993, "learning_rate": 1.6383658497192646e-07, "loss": 0.0002, "step": 10093 }, { "epoch": 4.592356687898089, "grad_norm": 0.022453668418020135, "learning_rate": 1.634738963394683e-07, "loss": 0.0002, "step": 10094 }, { "epoch": 4.592811646951774, "grad_norm": 0.08384841100925537, "learning_rate": 1.6311160292608208e-07, "loss": 0.0003, "step": 10095 }, { "epoch": 4.59326660600546, "grad_norm": 0.018475629340727648, "learning_rate": 1.627497047613724e-07, "loss": 0.0001, "step": 10096 }, { "epoch": 4.593721565059145, "grad_norm": 0.08583954796010979, "learning_rate": 1.623882018749112e-07, "loss": 0.0009, "step": 10097 }, { "epoch": 4.59417652411283, "grad_norm": 0.15376879717192948, "learning_rate": 1.6202709429624043e-07, "loss": 0.0016, "step": 10098 }, { "epoch": 4.594631483166515, "grad_norm": 0.0964049554766292, "learning_rate": 1.616663820548675e-07, "loss": 0.0011, "step": 10099 }, { "epoch": 4.5950864422202, "grad_norm": 0.13252709056548337, "learning_rate": 1.6130606518026725e-07, "loss": 0.0021, "step": 10100 }, { "epoch": 4.595541401273885, "grad_norm": 0.0038090229864312414, "learning_rate": 1.60946143701885e-07, "loss": 0.0, "step": 10101 }, { "epoch": 4.595996360327571, "grad_norm": 0.07459162772236673, "learning_rate": 1.6058661764912997e-07, "loss": 0.0004, "step": 10102 }, { "epoch": 4.596451319381256, "grad_norm": 0.26031320818769427, "learning_rate": 1.6022748705138313e-07, "loss": 0.0005, "step": 10103 }, { "epoch": 4.596906278434941, "grad_norm": 0.16112833984187921, "learning_rate": 1.5986875193798934e-07, "loss": 0.0017, "step": 10104 }, { "epoch": 4.597361237488626, "grad_norm": 0.10327430322060341, "learning_rate": 1.5951041233826347e-07, "loss": 0.0016, "step": 10105 }, { "epoch": 4.597816196542311, "grad_norm": 0.0335273949812391, "learning_rate": 1.5915246828148657e-07, "loss": 0.0003, "step": 10106 }, { "epoch": 4.598271155595996, "grad_norm": 0.02388105255093149, "learning_rate": 1.5879491979690854e-07, "loss": 0.0001, "step": 10107 }, { "epoch": 4.598726114649682, "grad_norm": 0.21977463164474043, "learning_rate": 1.5843776691374823e-07, "loss": 0.0007, "step": 10108 }, { "epoch": 4.599181073703367, "grad_norm": 0.016489540639126124, "learning_rate": 1.5808100966118844e-07, "loss": 0.0001, "step": 10109 }, { "epoch": 4.599636032757052, "grad_norm": 0.00642479285954726, "learning_rate": 1.577246480683836e-07, "loss": 0.0, "step": 10110 }, { "epoch": 4.600090991810737, "grad_norm": 0.03219413412893527, "learning_rate": 1.5736868216445155e-07, "loss": 0.0002, "step": 10111 }, { "epoch": 4.600545950864422, "grad_norm": 0.019346077148516046, "learning_rate": 1.570131119784829e-07, "loss": 0.0001, "step": 10112 }, { "epoch": 4.601000909918107, "grad_norm": 0.006218382792734654, "learning_rate": 1.5665793753953162e-07, "loss": 0.0, "step": 10113 }, { "epoch": 4.601455868971793, "grad_norm": 0.015622294368870639, "learning_rate": 1.5630315887662117e-07, "loss": 0.0001, "step": 10114 }, { "epoch": 4.601910828025478, "grad_norm": 0.017345444433933915, "learning_rate": 1.559487760187428e-07, "loss": 0.0002, "step": 10115 }, { "epoch": 4.6023657870791626, "grad_norm": 0.2160180379150265, "learning_rate": 1.5559478899485447e-07, "loss": 0.0016, "step": 10116 }, { "epoch": 4.602820746132848, "grad_norm": 0.0026341134473061, "learning_rate": 1.552411978338836e-07, "loss": 0.0, "step": 10117 }, { "epoch": 4.603275705186533, "grad_norm": 0.01083704962400509, "learning_rate": 1.5488800256472315e-07, "loss": 0.0001, "step": 10118 }, { "epoch": 4.603730664240218, "grad_norm": 0.15260011786840505, "learning_rate": 1.545352032162345e-07, "loss": 0.0028, "step": 10119 }, { "epoch": 4.604185623293904, "grad_norm": 0.07914157037190421, "learning_rate": 1.5418279981724683e-07, "loss": 0.0005, "step": 10120 }, { "epoch": 4.604640582347589, "grad_norm": 0.026299070051616653, "learning_rate": 1.5383079239655762e-07, "loss": 0.0002, "step": 10121 }, { "epoch": 4.6050955414012735, "grad_norm": 0.11537894084000455, "learning_rate": 1.5347918098293114e-07, "loss": 0.0011, "step": 10122 }, { "epoch": 4.605550500454959, "grad_norm": 0.12214011477653744, "learning_rate": 1.5312796560509935e-07, "loss": 0.0014, "step": 10123 }, { "epoch": 4.606005459508644, "grad_norm": 0.034621923594996946, "learning_rate": 1.5277714629176155e-07, "loss": 0.0002, "step": 10124 }, { "epoch": 4.606460418562329, "grad_norm": 0.006269139712969645, "learning_rate": 1.5242672307158534e-07, "loss": 0.0, "step": 10125 }, { "epoch": 4.606915377616015, "grad_norm": 0.0763685326895611, "learning_rate": 1.5207669597320618e-07, "loss": 0.0004, "step": 10126 }, { "epoch": 4.6073703366697, "grad_norm": 0.006566883940682684, "learning_rate": 1.5172706502522672e-07, "loss": 0.0, "step": 10127 }, { "epoch": 4.607825295723385, "grad_norm": 0.016857567333840084, "learning_rate": 1.5137783025621634e-07, "loss": 0.0001, "step": 10128 }, { "epoch": 4.60828025477707, "grad_norm": 0.07832595525597381, "learning_rate": 1.5102899169471386e-07, "loss": 0.0004, "step": 10129 }, { "epoch": 4.608735213830755, "grad_norm": 0.007996000943571764, "learning_rate": 1.5068054936922426e-07, "loss": 0.0, "step": 10130 }, { "epoch": 4.609190172884441, "grad_norm": 0.13135011514864436, "learning_rate": 1.5033250330822036e-07, "loss": 0.0038, "step": 10131 }, { "epoch": 4.609645131938126, "grad_norm": 0.008050907150329823, "learning_rate": 1.4998485354014436e-07, "loss": 0.0, "step": 10132 }, { "epoch": 4.610100090991811, "grad_norm": 0.025811960139408, "learning_rate": 1.49637600093403e-07, "loss": 0.0002, "step": 10133 }, { "epoch": 4.610555050045496, "grad_norm": 0.13329781420957088, "learning_rate": 1.492907429963736e-07, "loss": 0.0005, "step": 10134 }, { "epoch": 4.611010009099181, "grad_norm": 0.3968536018348747, "learning_rate": 1.4894428227739787e-07, "loss": 0.0021, "step": 10135 }, { "epoch": 4.611464968152866, "grad_norm": 0.019323539051422454, "learning_rate": 1.485982179647888e-07, "loss": 0.0001, "step": 10136 }, { "epoch": 4.611919927206552, "grad_norm": 0.05566346265320087, "learning_rate": 1.4825255008682483e-07, "loss": 0.0002, "step": 10137 }, { "epoch": 4.612374886260237, "grad_norm": 0.0908920460291126, "learning_rate": 1.4790727867175224e-07, "loss": 0.0003, "step": 10138 }, { "epoch": 4.612829845313922, "grad_norm": 0.06582488252393662, "learning_rate": 1.4756240374778463e-07, "loss": 0.0003, "step": 10139 }, { "epoch": 4.613284804367607, "grad_norm": 0.007087593541376637, "learning_rate": 1.4721792534310386e-07, "loss": 0.0, "step": 10140 }, { "epoch": 4.613739763421292, "grad_norm": 0.041696913325147585, "learning_rate": 1.4687384348585964e-07, "loss": 0.0004, "step": 10141 }, { "epoch": 4.614194722474977, "grad_norm": 0.022933051631727406, "learning_rate": 1.465301582041684e-07, "loss": 0.0001, "step": 10142 }, { "epoch": 4.614649681528663, "grad_norm": 1.303792613442042, "learning_rate": 1.4618686952611428e-07, "loss": 0.0118, "step": 10143 }, { "epoch": 4.615104640582348, "grad_norm": 0.032707504441561695, "learning_rate": 1.4584397747974987e-07, "loss": 0.0001, "step": 10144 }, { "epoch": 4.615559599636033, "grad_norm": 0.013022017326902556, "learning_rate": 1.455014820930939e-07, "loss": 0.0001, "step": 10145 }, { "epoch": 4.616014558689718, "grad_norm": 0.07130696505473208, "learning_rate": 1.4515938339413504e-07, "loss": 0.0006, "step": 10146 }, { "epoch": 4.616469517743403, "grad_norm": 0.07214436881185252, "learning_rate": 1.4481768141082652e-07, "loss": 0.0017, "step": 10147 }, { "epoch": 4.616924476797088, "grad_norm": 0.1033760209287351, "learning_rate": 1.4447637617109157e-07, "loss": 0.0011, "step": 10148 }, { "epoch": 4.617379435850774, "grad_norm": 0.010718819245754766, "learning_rate": 1.4413546770281893e-07, "loss": 0.0001, "step": 10149 }, { "epoch": 4.617834394904459, "grad_norm": 0.1250026119179043, "learning_rate": 1.437949560338675e-07, "loss": 0.0004, "step": 10150 }, { "epoch": 4.6182893539581436, "grad_norm": 0.013741319867024624, "learning_rate": 1.4345484119206222e-07, "loss": 0.0001, "step": 10151 }, { "epoch": 4.618744313011829, "grad_norm": 0.09472888366923614, "learning_rate": 1.4311512320519528e-07, "loss": 0.0012, "step": 10152 }, { "epoch": 4.619199272065514, "grad_norm": 0.08998392552959748, "learning_rate": 1.427758021010267e-07, "loss": 0.0003, "step": 10153 }, { "epoch": 4.619654231119199, "grad_norm": 0.06160328013824963, "learning_rate": 1.4243687790728433e-07, "loss": 0.0005, "step": 10154 }, { "epoch": 4.620109190172885, "grad_norm": 0.24342276091398451, "learning_rate": 1.4209835065166433e-07, "loss": 0.006, "step": 10155 }, { "epoch": 4.62056414922657, "grad_norm": 0.14924390832608933, "learning_rate": 1.417602203618279e-07, "loss": 0.0053, "step": 10156 }, { "epoch": 4.6210191082802545, "grad_norm": 0.011802511419363603, "learning_rate": 1.4142248706540796e-07, "loss": 0.0001, "step": 10157 }, { "epoch": 4.62147406733394, "grad_norm": 0.14234054105551489, "learning_rate": 1.4108515079000075e-07, "loss": 0.0015, "step": 10158 }, { "epoch": 4.621929026387625, "grad_norm": 0.029955309913570694, "learning_rate": 1.4074821156317197e-07, "loss": 0.0002, "step": 10159 }, { "epoch": 4.62238398544131, "grad_norm": 0.049696155983342795, "learning_rate": 1.4041166941245578e-07, "loss": 0.0003, "step": 10160 }, { "epoch": 4.622838944494996, "grad_norm": 0.00600299962862912, "learning_rate": 1.4007552436535176e-07, "loss": 0.0, "step": 10161 }, { "epoch": 4.623293903548681, "grad_norm": 0.0095031794806623, "learning_rate": 1.3973977644932913e-07, "loss": 0.0001, "step": 10162 }, { "epoch": 4.6237488626023655, "grad_norm": 0.3235915088642451, "learning_rate": 1.3940442569182255e-07, "loss": 0.0057, "step": 10163 }, { "epoch": 4.624203821656051, "grad_norm": 0.010381106645430217, "learning_rate": 1.3906947212023625e-07, "loss": 0.0001, "step": 10164 }, { "epoch": 4.624658780709736, "grad_norm": 0.24804601954697872, "learning_rate": 1.3873491576194165e-07, "loss": 0.0061, "step": 10165 }, { "epoch": 4.625113739763421, "grad_norm": 0.40596109806754843, "learning_rate": 1.3840075664427577e-07, "loss": 0.0047, "step": 10166 }, { "epoch": 4.625568698817107, "grad_norm": 0.05214450095762541, "learning_rate": 1.3806699479454567e-07, "loss": 0.0006, "step": 10167 }, { "epoch": 4.626023657870792, "grad_norm": 0.033912337877299234, "learning_rate": 1.377336302400245e-07, "loss": 0.0002, "step": 10168 }, { "epoch": 4.6264786169244765, "grad_norm": 0.13646254961781484, "learning_rate": 1.3740066300795274e-07, "loss": 0.0019, "step": 10169 }, { "epoch": 4.626933575978162, "grad_norm": 0.03438743028185538, "learning_rate": 1.3706809312553914e-07, "loss": 0.0002, "step": 10170 }, { "epoch": 4.627388535031847, "grad_norm": 0.013365391720777744, "learning_rate": 1.3673592061996087e-07, "loss": 0.0001, "step": 10171 }, { "epoch": 4.627843494085532, "grad_norm": 0.2200304686770983, "learning_rate": 1.3640414551836122e-07, "loss": 0.0046, "step": 10172 }, { "epoch": 4.628298453139218, "grad_norm": 0.009235076324389713, "learning_rate": 1.3607276784785074e-07, "loss": 0.0001, "step": 10173 }, { "epoch": 4.628753412192903, "grad_norm": 0.10847971104135985, "learning_rate": 1.3574178763550772e-07, "loss": 0.0009, "step": 10174 }, { "epoch": 4.6292083712465875, "grad_norm": 0.044590173955268986, "learning_rate": 1.3541120490837943e-07, "loss": 0.0002, "step": 10175 }, { "epoch": 4.629663330300273, "grad_norm": 0.02157787933266301, "learning_rate": 1.3508101969347986e-07, "loss": 0.0001, "step": 10176 }, { "epoch": 4.630118289353958, "grad_norm": 0.561859082001676, "learning_rate": 1.347512320177885e-07, "loss": 0.0027, "step": 10177 }, { "epoch": 4.630573248407643, "grad_norm": 0.008779433087511678, "learning_rate": 1.3442184190825547e-07, "loss": 0.0001, "step": 10178 }, { "epoch": 4.631028207461329, "grad_norm": 0.034840075704382495, "learning_rate": 1.34092849391797e-07, "loss": 0.0002, "step": 10179 }, { "epoch": 4.631483166515014, "grad_norm": 0.019750917666495425, "learning_rate": 1.3376425449529661e-07, "loss": 0.0001, "step": 10180 }, { "epoch": 4.631938125568698, "grad_norm": 0.28808741898602425, "learning_rate": 1.334360572456056e-07, "loss": 0.0017, "step": 10181 }, { "epoch": 4.632393084622384, "grad_norm": 0.05012937701671048, "learning_rate": 1.3310825766954305e-07, "loss": 0.001, "step": 10182 }, { "epoch": 4.632848043676069, "grad_norm": 0.010303042243101506, "learning_rate": 1.327808557938942e-07, "loss": 0.0001, "step": 10183 }, { "epoch": 4.633303002729754, "grad_norm": 0.04690058144319214, "learning_rate": 1.324538516454138e-07, "loss": 0.0002, "step": 10184 }, { "epoch": 4.63375796178344, "grad_norm": 0.18241458562307544, "learning_rate": 1.3212724525082376e-07, "loss": 0.002, "step": 10185 }, { "epoch": 4.6342129208371245, "grad_norm": 0.018623052885392427, "learning_rate": 1.3180103663681165e-07, "loss": 0.0001, "step": 10186 }, { "epoch": 4.634667879890809, "grad_norm": 0.008191792748821817, "learning_rate": 1.3147522583003448e-07, "loss": 0.0, "step": 10187 }, { "epoch": 4.635122838944495, "grad_norm": 0.10315490954016021, "learning_rate": 1.3114981285711538e-07, "loss": 0.0013, "step": 10188 }, { "epoch": 4.63557779799818, "grad_norm": 0.03132705029902822, "learning_rate": 1.308247977446464e-07, "loss": 0.0002, "step": 10189 }, { "epoch": 4.636032757051865, "grad_norm": 0.06235343469483827, "learning_rate": 1.3050018051918578e-07, "loss": 0.0006, "step": 10190 }, { "epoch": 4.636487716105551, "grad_norm": 0.009128196767433108, "learning_rate": 1.3017596120725952e-07, "loss": 0.0, "step": 10191 }, { "epoch": 4.6369426751592355, "grad_norm": 0.22046398257934455, "learning_rate": 1.298521398353625e-07, "loss": 0.0013, "step": 10192 }, { "epoch": 4.63739763421292, "grad_norm": 0.333970918208788, "learning_rate": 1.295287164299547e-07, "loss": 0.001, "step": 10193 }, { "epoch": 4.637852593266606, "grad_norm": 0.09685445479325029, "learning_rate": 1.292056910174655e-07, "loss": 0.0006, "step": 10194 }, { "epoch": 4.638307552320291, "grad_norm": 0.01494019227754789, "learning_rate": 1.288830636242916e-07, "loss": 0.0001, "step": 10195 }, { "epoch": 4.638762511373976, "grad_norm": 0.17916318720598856, "learning_rate": 1.2856083427679522e-07, "loss": 0.0009, "step": 10196 }, { "epoch": 4.639217470427662, "grad_norm": 0.1609286785384726, "learning_rate": 1.2823900300130808e-07, "loss": 0.0012, "step": 10197 }, { "epoch": 4.6396724294813465, "grad_norm": 0.10144835259228799, "learning_rate": 1.2791756982412917e-07, "loss": 0.0005, "step": 10198 }, { "epoch": 4.640127388535031, "grad_norm": 0.060658527836725366, "learning_rate": 1.2759653477152412e-07, "loss": 0.0003, "step": 10199 }, { "epoch": 4.640582347588717, "grad_norm": 0.0227548642896754, "learning_rate": 1.272758978697275e-07, "loss": 0.0002, "step": 10200 }, { "epoch": 4.641037306642402, "grad_norm": 0.015299341637384929, "learning_rate": 1.269556591449389e-07, "loss": 0.0001, "step": 10201 }, { "epoch": 4.641492265696087, "grad_norm": 0.06898131151561372, "learning_rate": 1.2663581862332741e-07, "loss": 0.0005, "step": 10202 }, { "epoch": 4.641947224749773, "grad_norm": 0.005027214953384058, "learning_rate": 1.2631637633102878e-07, "loss": 0.0, "step": 10203 }, { "epoch": 4.6424021838034575, "grad_norm": 0.006431894331208697, "learning_rate": 1.2599733229414656e-07, "loss": 0.0, "step": 10204 }, { "epoch": 4.642857142857143, "grad_norm": 0.012310336207182297, "learning_rate": 1.256786865387516e-07, "loss": 0.0, "step": 10205 }, { "epoch": 4.643312101910828, "grad_norm": 0.03426431921104071, "learning_rate": 1.253604390908819e-07, "loss": 0.0005, "step": 10206 }, { "epoch": 4.643767060964513, "grad_norm": 0.033299214981028294, "learning_rate": 1.2504258997654395e-07, "loss": 0.0001, "step": 10207 }, { "epoch": 4.644222020018199, "grad_norm": 0.11587346777721734, "learning_rate": 1.247251392217097e-07, "loss": 0.0007, "step": 10208 }, { "epoch": 4.644676979071884, "grad_norm": 0.04997306509282845, "learning_rate": 1.244080868523212e-07, "loss": 0.0003, "step": 10209 }, { "epoch": 4.6451319381255685, "grad_norm": 0.09826598425246098, "learning_rate": 1.2409143289428606e-07, "loss": 0.001, "step": 10210 }, { "epoch": 4.645586897179254, "grad_norm": 0.06985262563333702, "learning_rate": 1.237751773734791e-07, "loss": 0.0005, "step": 10211 }, { "epoch": 4.646041856232939, "grad_norm": 0.001415377924834579, "learning_rate": 1.234593203157436e-07, "loss": 0.0, "step": 10212 }, { "epoch": 4.646496815286624, "grad_norm": 0.08676187334445322, "learning_rate": 1.2314386174689052e-07, "loss": 0.0007, "step": 10213 }, { "epoch": 4.64695177434031, "grad_norm": 0.08094879529359549, "learning_rate": 1.2282880169269707e-07, "loss": 0.0006, "step": 10214 }, { "epoch": 4.647406733393995, "grad_norm": 0.08958867163380105, "learning_rate": 1.2251414017890928e-07, "loss": 0.0004, "step": 10215 }, { "epoch": 4.647861692447679, "grad_norm": 0.03489213435693581, "learning_rate": 1.2219987723123939e-07, "loss": 0.0001, "step": 10216 }, { "epoch": 4.648316651501365, "grad_norm": 0.04566919553045858, "learning_rate": 1.218860128753674e-07, "loss": 0.0003, "step": 10217 }, { "epoch": 4.64877161055505, "grad_norm": 0.225069329146852, "learning_rate": 1.215725471369411e-07, "loss": 0.0028, "step": 10218 }, { "epoch": 4.649226569608735, "grad_norm": 0.041881852562165846, "learning_rate": 1.2125948004157506e-07, "loss": 0.0002, "step": 10219 }, { "epoch": 4.649681528662421, "grad_norm": 0.008070887556896545, "learning_rate": 1.2094681161485267e-07, "loss": 0.0001, "step": 10220 }, { "epoch": 4.6501364877161055, "grad_norm": 0.01901146623798444, "learning_rate": 1.206345418823235e-07, "loss": 0.0001, "step": 10221 }, { "epoch": 4.65059144676979, "grad_norm": 0.36257880272150783, "learning_rate": 1.2032267086950378e-07, "loss": 0.0107, "step": 10222 }, { "epoch": 4.651046405823476, "grad_norm": 0.34285106672170484, "learning_rate": 1.2001119860187928e-07, "loss": 0.0039, "step": 10223 }, { "epoch": 4.651501364877161, "grad_norm": 0.004741663391622923, "learning_rate": 1.197001251049018e-07, "loss": 0.0, "step": 10224 }, { "epoch": 4.651956323930846, "grad_norm": 0.18874095547053066, "learning_rate": 1.1938945040399107e-07, "loss": 0.0013, "step": 10225 }, { "epoch": 4.652411282984532, "grad_norm": 0.1467383327015633, "learning_rate": 1.1907917452453344e-07, "loss": 0.0024, "step": 10226 }, { "epoch": 4.6528662420382165, "grad_norm": 0.0624467326070796, "learning_rate": 1.187692974918836e-07, "loss": 0.0004, "step": 10227 }, { "epoch": 4.653321201091901, "grad_norm": 0.11143894385287034, "learning_rate": 1.1845981933136352e-07, "loss": 0.0003, "step": 10228 }, { "epoch": 4.653776160145587, "grad_norm": 0.20219272049674344, "learning_rate": 1.1815074006826243e-07, "loss": 0.001, "step": 10229 }, { "epoch": 4.654231119199272, "grad_norm": 0.05480471236745072, "learning_rate": 1.178420597278368e-07, "loss": 0.0005, "step": 10230 }, { "epoch": 4.654686078252957, "grad_norm": 0.4792014577162499, "learning_rate": 1.1753377833530922e-07, "loss": 0.0037, "step": 10231 }, { "epoch": 4.655141037306643, "grad_norm": 0.03812148473216996, "learning_rate": 1.1722589591587342e-07, "loss": 0.0002, "step": 10232 }, { "epoch": 4.6555959963603275, "grad_norm": 0.059543807501753955, "learning_rate": 1.169184124946865e-07, "loss": 0.0005, "step": 10233 }, { "epoch": 4.656050955414012, "grad_norm": 0.05482593952747607, "learning_rate": 1.1661132809687504e-07, "loss": 0.0002, "step": 10234 }, { "epoch": 4.656505914467698, "grad_norm": 0.0259659490851893, "learning_rate": 1.1630464274753284e-07, "loss": 0.0002, "step": 10235 }, { "epoch": 4.656960873521383, "grad_norm": 0.0027512639589397808, "learning_rate": 1.1599835647172042e-07, "loss": 0.0, "step": 10236 }, { "epoch": 4.657415832575068, "grad_norm": 0.05746948692003079, "learning_rate": 1.1569246929446665e-07, "loss": 0.0005, "step": 10237 }, { "epoch": 4.657870791628754, "grad_norm": 0.09449271092010088, "learning_rate": 1.1538698124076709e-07, "loss": 0.0005, "step": 10238 }, { "epoch": 4.6583257506824385, "grad_norm": 0.0814308319838571, "learning_rate": 1.1508189233558453e-07, "loss": 0.0011, "step": 10239 }, { "epoch": 4.658780709736124, "grad_norm": 0.1277713641581946, "learning_rate": 1.1477720260384962e-07, "loss": 0.0008, "step": 10240 }, { "epoch": 4.659235668789809, "grad_norm": 0.012715468486772056, "learning_rate": 1.1447291207046019e-07, "loss": 0.0, "step": 10241 }, { "epoch": 4.659690627843494, "grad_norm": 0.015274381293408386, "learning_rate": 1.1416902076028136e-07, "loss": 0.0001, "step": 10242 }, { "epoch": 4.66014558689718, "grad_norm": 0.11634578067259214, "learning_rate": 1.1386552869814604e-07, "loss": 0.0015, "step": 10243 }, { "epoch": 4.660600545950865, "grad_norm": 0.03943422738373436, "learning_rate": 1.1356243590885441e-07, "loss": 0.0003, "step": 10244 }, { "epoch": 4.6610555050045495, "grad_norm": 0.0076796104732176846, "learning_rate": 1.1325974241717385e-07, "loss": 0.0, "step": 10245 }, { "epoch": 4.661510464058235, "grad_norm": 0.1559827480698999, "learning_rate": 1.1295744824783794e-07, "loss": 0.0027, "step": 10246 }, { "epoch": 4.66196542311192, "grad_norm": 0.04058144076867448, "learning_rate": 1.1265555342554968e-07, "loss": 0.0003, "step": 10247 }, { "epoch": 4.662420382165605, "grad_norm": 0.08963031140961897, "learning_rate": 1.1235405797497933e-07, "loss": 0.0006, "step": 10248 }, { "epoch": 4.662875341219291, "grad_norm": 0.03604198077039828, "learning_rate": 1.1205296192076275e-07, "loss": 0.0003, "step": 10249 }, { "epoch": 4.663330300272976, "grad_norm": 0.036854331060587636, "learning_rate": 1.1175226528750416e-07, "loss": 0.0003, "step": 10250 }, { "epoch": 4.66378525932666, "grad_norm": 0.003971254287974712, "learning_rate": 1.1145196809977499e-07, "loss": 0.0, "step": 10251 }, { "epoch": 4.664240218380346, "grad_norm": 0.12978544118659024, "learning_rate": 1.1115207038211507e-07, "loss": 0.0009, "step": 10252 }, { "epoch": 4.664695177434031, "grad_norm": 0.15588645365758752, "learning_rate": 1.1085257215902978e-07, "loss": 0.0019, "step": 10253 }, { "epoch": 4.665150136487716, "grad_norm": 0.004813458787314431, "learning_rate": 1.105534734549929e-07, "loss": 0.0, "step": 10254 }, { "epoch": 4.665605095541402, "grad_norm": 0.04465227128931435, "learning_rate": 1.1025477429444598e-07, "loss": 0.0004, "step": 10255 }, { "epoch": 4.6660600545950865, "grad_norm": 0.024452297518496233, "learning_rate": 1.0995647470179672e-07, "loss": 0.0002, "step": 10256 }, { "epoch": 4.666515013648771, "grad_norm": 0.014451426336303646, "learning_rate": 1.0965857470142172e-07, "loss": 0.0001, "step": 10257 }, { "epoch": 4.666969972702457, "grad_norm": 0.0044332680286918264, "learning_rate": 1.0936107431766319e-07, "loss": 0.0, "step": 10258 }, { "epoch": 4.667424931756142, "grad_norm": 0.05971002350169554, "learning_rate": 1.0906397357483167e-07, "loss": 0.0008, "step": 10259 }, { "epoch": 4.667879890809827, "grad_norm": 0.007070738488226248, "learning_rate": 1.0876727249720443e-07, "loss": 0.0001, "step": 10260 }, { "epoch": 4.668334849863513, "grad_norm": 0.014440176375710932, "learning_rate": 1.0847097110902704e-07, "loss": 0.0001, "step": 10261 }, { "epoch": 4.6687898089171975, "grad_norm": 0.029342900994430036, "learning_rate": 1.0817506943451239e-07, "loss": 0.0001, "step": 10262 }, { "epoch": 4.669244767970882, "grad_norm": 0.032939897933142996, "learning_rate": 1.0787956749784002e-07, "loss": 0.0003, "step": 10263 }, { "epoch": 4.669699727024568, "grad_norm": 0.08960649802277025, "learning_rate": 1.075844653231567e-07, "loss": 0.0007, "step": 10264 }, { "epoch": 4.670154686078253, "grad_norm": 0.0073237852704496455, "learning_rate": 1.0728976293457649e-07, "loss": 0.0001, "step": 10265 }, { "epoch": 4.670609645131938, "grad_norm": 0.05709982276190452, "learning_rate": 1.069954603561818e-07, "loss": 0.0006, "step": 10266 }, { "epoch": 4.671064604185624, "grad_norm": 0.12135944219803936, "learning_rate": 1.0670155761202172e-07, "loss": 0.0004, "step": 10267 }, { "epoch": 4.6715195632393085, "grad_norm": 0.05891484746184756, "learning_rate": 1.0640805472611204e-07, "loss": 0.0002, "step": 10268 }, { "epoch": 4.671974522292993, "grad_norm": 0.1087781571722819, "learning_rate": 1.061149517224369e-07, "loss": 0.0013, "step": 10269 }, { "epoch": 4.672429481346679, "grad_norm": 0.07540830259997695, "learning_rate": 1.0582224862494716e-07, "loss": 0.0004, "step": 10270 }, { "epoch": 4.672884440400364, "grad_norm": 0.07659004129922979, "learning_rate": 1.0552994545756201e-07, "loss": 0.0003, "step": 10271 }, { "epoch": 4.673339399454049, "grad_norm": 0.006056208404833988, "learning_rate": 1.0523804224416623e-07, "loss": 0.0, "step": 10272 }, { "epoch": 4.673794358507735, "grad_norm": 0.05847826235197548, "learning_rate": 1.0494653900861296e-07, "loss": 0.0006, "step": 10273 }, { "epoch": 4.6742493175614195, "grad_norm": 0.059442494775477576, "learning_rate": 1.0465543577472314e-07, "loss": 0.0005, "step": 10274 }, { "epoch": 4.674704276615104, "grad_norm": 0.022137740314037393, "learning_rate": 1.0436473256628277e-07, "loss": 0.0001, "step": 10275 }, { "epoch": 4.67515923566879, "grad_norm": 0.07105261175453277, "learning_rate": 1.0407442940704837e-07, "loss": 0.0006, "step": 10276 }, { "epoch": 4.675614194722475, "grad_norm": 0.09515882674468688, "learning_rate": 1.037845263207421e-07, "loss": 0.0012, "step": 10277 }, { "epoch": 4.67606915377616, "grad_norm": 0.010268336694827337, "learning_rate": 1.0349502333105333e-07, "loss": 0.0001, "step": 10278 }, { "epoch": 4.676524112829846, "grad_norm": 0.014583361848813897, "learning_rate": 1.0320592046163925e-07, "loss": 0.0001, "step": 10279 }, { "epoch": 4.6769790718835305, "grad_norm": 0.024517182114238003, "learning_rate": 1.0291721773612263e-07, "loss": 0.0001, "step": 10280 }, { "epoch": 4.677434030937215, "grad_norm": 0.13133312676613232, "learning_rate": 1.0262891517809626e-07, "loss": 0.0016, "step": 10281 }, { "epoch": 4.677888989990901, "grad_norm": 0.05639198907159649, "learning_rate": 1.0234101281111852e-07, "loss": 0.0003, "step": 10282 }, { "epoch": 4.678343949044586, "grad_norm": 0.11769120137609002, "learning_rate": 1.0205351065871615e-07, "loss": 0.001, "step": 10283 }, { "epoch": 4.678798908098271, "grad_norm": 0.11845125324173367, "learning_rate": 1.0176640874438148e-07, "loss": 0.0029, "step": 10284 }, { "epoch": 4.679253867151957, "grad_norm": 0.09705450154745682, "learning_rate": 1.0147970709157573e-07, "loss": 0.0005, "step": 10285 }, { "epoch": 4.679708826205641, "grad_norm": 0.048168920994499494, "learning_rate": 1.0119340572372683e-07, "loss": 0.0003, "step": 10286 }, { "epoch": 4.680163785259326, "grad_norm": 0.009505128742610303, "learning_rate": 1.0090750466422994e-07, "loss": 0.0, "step": 10287 }, { "epoch": 4.680618744313012, "grad_norm": 0.2689474288340439, "learning_rate": 1.0062200393644806e-07, "loss": 0.0017, "step": 10288 }, { "epoch": 4.681073703366697, "grad_norm": 0.43896843966617727, "learning_rate": 1.0033690356370973e-07, "loss": 0.0017, "step": 10289 }, { "epoch": 4.681528662420382, "grad_norm": 0.012701161599689375, "learning_rate": 1.0005220356931355e-07, "loss": 0.0001, "step": 10290 }, { "epoch": 4.6819836214740675, "grad_norm": 0.050418455025712544, "learning_rate": 9.976790397652314e-08, "loss": 0.0001, "step": 10291 }, { "epoch": 4.682438580527752, "grad_norm": 0.017990743852292785, "learning_rate": 9.948400480857101e-08, "loss": 0.0001, "step": 10292 }, { "epoch": 4.682893539581437, "grad_norm": 0.00863934575465124, "learning_rate": 9.920050608865473e-08, "loss": 0.0, "step": 10293 }, { "epoch": 4.683348498635123, "grad_norm": 0.006880755402871023, "learning_rate": 9.89174078399413e-08, "loss": 0.0, "step": 10294 }, { "epoch": 4.683803457688808, "grad_norm": 0.03217173446263907, "learning_rate": 9.863471008556447e-08, "loss": 0.0001, "step": 10295 }, { "epoch": 4.684258416742493, "grad_norm": 0.011075899015750672, "learning_rate": 9.835241284862462e-08, "loss": 0.0001, "step": 10296 }, { "epoch": 4.6847133757961785, "grad_norm": 0.0816900350939216, "learning_rate": 9.807051615218999e-08, "loss": 0.0017, "step": 10297 }, { "epoch": 4.685168334849863, "grad_norm": 0.2035813405958449, "learning_rate": 9.778902001929602e-08, "loss": 0.0025, "step": 10298 }, { "epoch": 4.685623293903548, "grad_norm": 0.18314774013053128, "learning_rate": 9.750792447294489e-08, "loss": 0.0005, "step": 10299 }, { "epoch": 4.686078252957234, "grad_norm": 0.009352875103096634, "learning_rate": 9.722722953610708e-08, "loss": 0.0001, "step": 10300 }, { "epoch": 4.686533212010919, "grad_norm": 0.02418393477665069, "learning_rate": 9.694693523171927e-08, "loss": 0.0001, "step": 10301 }, { "epoch": 4.686988171064604, "grad_norm": 0.051276069106874204, "learning_rate": 9.666704158268592e-08, "loss": 0.0002, "step": 10302 }, { "epoch": 4.6874431301182895, "grad_norm": 0.009813481523625078, "learning_rate": 9.638754861187816e-08, "loss": 0.0001, "step": 10303 }, { "epoch": 4.687898089171974, "grad_norm": 0.03243343470503502, "learning_rate": 9.610845634213551e-08, "loss": 0.0001, "step": 10304 }, { "epoch": 4.688353048225659, "grad_norm": 0.10791818056928296, "learning_rate": 9.582976479626471e-08, "loss": 0.0011, "step": 10305 }, { "epoch": 4.688808007279345, "grad_norm": 0.01373580111942201, "learning_rate": 9.555147399703813e-08, "loss": 0.0001, "step": 10306 }, { "epoch": 4.68926296633303, "grad_norm": 0.01609801415912017, "learning_rate": 9.527358396719699e-08, "loss": 0.0001, "step": 10307 }, { "epoch": 4.689717925386715, "grad_norm": 0.05467212082069933, "learning_rate": 9.49960947294487e-08, "loss": 0.0004, "step": 10308 }, { "epoch": 4.6901728844404005, "grad_norm": 0.10471969069327443, "learning_rate": 9.471900630646847e-08, "loss": 0.0002, "step": 10309 }, { "epoch": 4.690627843494085, "grad_norm": 0.010687898036433899, "learning_rate": 9.444231872089927e-08, "loss": 0.0001, "step": 10310 }, { "epoch": 4.69108280254777, "grad_norm": 0.016634716213867702, "learning_rate": 9.416603199535079e-08, "loss": 0.0001, "step": 10311 }, { "epoch": 4.691537761601456, "grad_norm": 0.009502206992849859, "learning_rate": 9.389014615239944e-08, "loss": 0.0, "step": 10312 }, { "epoch": 4.691992720655141, "grad_norm": 0.12690626687443035, "learning_rate": 9.36146612145894e-08, "loss": 0.0013, "step": 10313 }, { "epoch": 4.692447679708827, "grad_norm": 0.08252042132018089, "learning_rate": 9.333957720443209e-08, "loss": 0.0007, "step": 10314 }, { "epoch": 4.6929026387625115, "grad_norm": 0.10958263836203827, "learning_rate": 9.306489414440678e-08, "loss": 0.0017, "step": 10315 }, { "epoch": 4.693357597816196, "grad_norm": 0.13190084520488968, "learning_rate": 9.279061205695828e-08, "loss": 0.0006, "step": 10316 }, { "epoch": 4.693812556869882, "grad_norm": 0.033410268197950864, "learning_rate": 9.251673096450032e-08, "loss": 0.0003, "step": 10317 }, { "epoch": 4.694267515923567, "grad_norm": 0.2418523170102415, "learning_rate": 9.224325088941332e-08, "loss": 0.0008, "step": 10318 }, { "epoch": 4.694722474977252, "grad_norm": 0.15226320856554007, "learning_rate": 9.197017185404444e-08, "loss": 0.0008, "step": 10319 }, { "epoch": 4.695177434030938, "grad_norm": 0.032085887105710253, "learning_rate": 9.169749388070859e-08, "loss": 0.0001, "step": 10320 }, { "epoch": 4.695632393084622, "grad_norm": 0.2769150562142887, "learning_rate": 9.142521699168794e-08, "loss": 0.006, "step": 10321 }, { "epoch": 4.696087352138307, "grad_norm": 0.14071008323329443, "learning_rate": 9.115334120923191e-08, "loss": 0.0003, "step": 10322 }, { "epoch": 4.696542311191993, "grad_norm": 0.08100280793718856, "learning_rate": 9.088186655555608e-08, "loss": 0.0009, "step": 10323 }, { "epoch": 4.696997270245678, "grad_norm": 0.2314546957753362, "learning_rate": 9.061079305284492e-08, "loss": 0.0037, "step": 10324 }, { "epoch": 4.697452229299363, "grad_norm": 0.010094699283373984, "learning_rate": 9.03401207232496e-08, "loss": 0.0, "step": 10325 }, { "epoch": 4.6979071883530485, "grad_norm": 0.28837022294452597, "learning_rate": 9.006984958888742e-08, "loss": 0.0052, "step": 10326 }, { "epoch": 4.698362147406733, "grad_norm": 0.04588712277186774, "learning_rate": 8.979997967184462e-08, "loss": 0.0002, "step": 10327 }, { "epoch": 4.698817106460418, "grad_norm": 0.05293351730519668, "learning_rate": 8.953051099417242e-08, "loss": 0.0004, "step": 10328 }, { "epoch": 4.699272065514104, "grad_norm": 0.04172704486171423, "learning_rate": 8.926144357789158e-08, "loss": 0.0003, "step": 10329 }, { "epoch": 4.699727024567789, "grad_norm": 0.017897674231096176, "learning_rate": 8.899277744498891e-08, "loss": 0.0001, "step": 10330 }, { "epoch": 4.700181983621474, "grad_norm": 0.058405896279421984, "learning_rate": 8.872451261741854e-08, "loss": 0.0006, "step": 10331 }, { "epoch": 4.7006369426751595, "grad_norm": 0.09739571362233164, "learning_rate": 8.845664911710239e-08, "loss": 0.0004, "step": 10332 }, { "epoch": 4.701091901728844, "grad_norm": 0.10881848118936635, "learning_rate": 8.818918696592737e-08, "loss": 0.0008, "step": 10333 }, { "epoch": 4.701546860782529, "grad_norm": 0.08470917904403007, "learning_rate": 8.792212618575158e-08, "loss": 0.0008, "step": 10334 }, { "epoch": 4.702001819836215, "grad_norm": 0.03494299040115534, "learning_rate": 8.765546679839643e-08, "loss": 0.0001, "step": 10335 }, { "epoch": 4.7024567788899, "grad_norm": 0.007312358652135778, "learning_rate": 8.738920882565283e-08, "loss": 0.0, "step": 10336 }, { "epoch": 4.702911737943585, "grad_norm": 0.029885048881680767, "learning_rate": 8.712335228927782e-08, "loss": 0.0002, "step": 10337 }, { "epoch": 4.7033666969972705, "grad_norm": 0.056061374928798396, "learning_rate": 8.685789721099569e-08, "loss": 0.0005, "step": 10338 }, { "epoch": 4.703821656050955, "grad_norm": 0.06543708507156024, "learning_rate": 8.659284361249909e-08, "loss": 0.0005, "step": 10339 }, { "epoch": 4.70427661510464, "grad_norm": 0.10127908135566865, "learning_rate": 8.632819151544681e-08, "loss": 0.0007, "step": 10340 }, { "epoch": 4.704731574158326, "grad_norm": 0.018628976793558953, "learning_rate": 8.606394094146431e-08, "loss": 0.0001, "step": 10341 }, { "epoch": 4.705186533212011, "grad_norm": 0.028999291563235746, "learning_rate": 8.580009191214544e-08, "loss": 0.0003, "step": 10342 }, { "epoch": 4.705641492265696, "grad_norm": 0.03454009066266616, "learning_rate": 8.553664444905074e-08, "loss": 0.0002, "step": 10343 }, { "epoch": 4.7060964513193815, "grad_norm": 0.0578044670367911, "learning_rate": 8.527359857370799e-08, "loss": 0.0004, "step": 10344 }, { "epoch": 4.706551410373066, "grad_norm": 0.1353017322841546, "learning_rate": 8.501095430761219e-08, "loss": 0.0015, "step": 10345 }, { "epoch": 4.707006369426751, "grad_norm": 0.019850752998643802, "learning_rate": 8.474871167222509e-08, "loss": 0.0002, "step": 10346 }, { "epoch": 4.707461328480437, "grad_norm": 0.046681506511303344, "learning_rate": 8.448687068897676e-08, "loss": 0.0003, "step": 10347 }, { "epoch": 4.707916287534122, "grad_norm": 0.0048335019185104236, "learning_rate": 8.422543137926231e-08, "loss": 0.0, "step": 10348 }, { "epoch": 4.708371246587808, "grad_norm": 0.018965589136176502, "learning_rate": 8.396439376444631e-08, "loss": 0.0002, "step": 10349 }, { "epoch": 4.7088262056414925, "grad_norm": 0.00948008505034407, "learning_rate": 8.370375786586005e-08, "loss": 0.0, "step": 10350 }, { "epoch": 4.709281164695177, "grad_norm": 0.06281868868250738, "learning_rate": 8.344352370480036e-08, "loss": 0.0006, "step": 10351 }, { "epoch": 4.709736123748863, "grad_norm": 0.05066419694732638, "learning_rate": 8.318369130253301e-08, "loss": 0.0006, "step": 10352 }, { "epoch": 4.710191082802548, "grad_norm": 0.16733808203378805, "learning_rate": 8.292426068028992e-08, "loss": 0.0007, "step": 10353 }, { "epoch": 4.710646041856233, "grad_norm": 0.19217003206127167, "learning_rate": 8.266523185927134e-08, "loss": 0.0032, "step": 10354 }, { "epoch": 4.711101000909919, "grad_norm": 0.045897348967784274, "learning_rate": 8.240660486064367e-08, "loss": 0.0002, "step": 10355 }, { "epoch": 4.711555959963603, "grad_norm": 0.018638677900911078, "learning_rate": 8.214837970554057e-08, "loss": 0.0001, "step": 10356 }, { "epoch": 4.712010919017288, "grad_norm": 0.06399199253328469, "learning_rate": 8.189055641506293e-08, "loss": 0.0002, "step": 10357 }, { "epoch": 4.712465878070974, "grad_norm": 0.27939902481595374, "learning_rate": 8.163313501027892e-08, "loss": 0.005, "step": 10358 }, { "epoch": 4.712920837124659, "grad_norm": 0.2456729056592538, "learning_rate": 8.137611551222391e-08, "loss": 0.0019, "step": 10359 }, { "epoch": 4.713375796178344, "grad_norm": 0.025587039217796363, "learning_rate": 8.111949794190055e-08, "loss": 0.0001, "step": 10360 }, { "epoch": 4.7138307552320295, "grad_norm": 0.10163891544422349, "learning_rate": 8.086328232027874e-08, "loss": 0.001, "step": 10361 }, { "epoch": 4.714285714285714, "grad_norm": 0.05639120408845683, "learning_rate": 8.060746866829394e-08, "loss": 0.0006, "step": 10362 }, { "epoch": 4.714740673339399, "grad_norm": 0.011926202335770674, "learning_rate": 8.035205700685167e-08, "loss": 0.0001, "step": 10363 }, { "epoch": 4.715195632393085, "grad_norm": 0.04077934688115527, "learning_rate": 8.009704735682244e-08, "loss": 0.0002, "step": 10364 }, { "epoch": 4.71565059144677, "grad_norm": 0.18752014827787267, "learning_rate": 7.98424397390446e-08, "loss": 0.0037, "step": 10365 }, { "epoch": 4.716105550500455, "grad_norm": 0.007376981776185636, "learning_rate": 7.95882341743226e-08, "loss": 0.0, "step": 10366 }, { "epoch": 4.7165605095541405, "grad_norm": 0.06875385197181354, "learning_rate": 7.933443068342983e-08, "loss": 0.0003, "step": 10367 }, { "epoch": 4.717015468607825, "grad_norm": 0.08718207021948843, "learning_rate": 7.908102928710637e-08, "loss": 0.0004, "step": 10368 }, { "epoch": 4.71747042766151, "grad_norm": 0.11941730741458896, "learning_rate": 7.882803000605843e-08, "loss": 0.0007, "step": 10369 }, { "epoch": 4.717925386715196, "grad_norm": 0.007507273381700301, "learning_rate": 7.857543286096003e-08, "loss": 0.0, "step": 10370 }, { "epoch": 4.718380345768881, "grad_norm": 0.006327290062892956, "learning_rate": 7.832323787245188e-08, "loss": 0.0, "step": 10371 }, { "epoch": 4.718835304822566, "grad_norm": 0.07743546431310214, "learning_rate": 7.807144506114306e-08, "loss": 0.0003, "step": 10372 }, { "epoch": 4.7192902638762515, "grad_norm": 0.008904934451756781, "learning_rate": 7.782005444760821e-08, "loss": 0.0001, "step": 10373 }, { "epoch": 4.719745222929936, "grad_norm": 0.08263727860855853, "learning_rate": 7.756906605239089e-08, "loss": 0.0006, "step": 10374 }, { "epoch": 4.720200181983621, "grad_norm": 0.1537215297408134, "learning_rate": 7.731847989599916e-08, "loss": 0.0008, "step": 10375 }, { "epoch": 4.720655141037307, "grad_norm": 0.019947675947161292, "learning_rate": 7.706829599891107e-08, "loss": 0.0001, "step": 10376 }, { "epoch": 4.721110100090992, "grad_norm": 0.04268760277897016, "learning_rate": 7.681851438156973e-08, "loss": 0.0005, "step": 10377 }, { "epoch": 4.721565059144677, "grad_norm": 0.0710877894113309, "learning_rate": 7.656913506438712e-08, "loss": 0.0006, "step": 10378 }, { "epoch": 4.7220200181983625, "grad_norm": 0.05359024275467646, "learning_rate": 7.63201580677403e-08, "loss": 0.0002, "step": 10379 }, { "epoch": 4.722474977252047, "grad_norm": 0.016204789150332372, "learning_rate": 7.607158341197462e-08, "loss": 0.0001, "step": 10380 }, { "epoch": 4.722929936305732, "grad_norm": 0.022401934156869894, "learning_rate": 7.582341111740332e-08, "loss": 0.0002, "step": 10381 }, { "epoch": 4.723384895359418, "grad_norm": 0.1992439838520583, "learning_rate": 7.557564120430572e-08, "loss": 0.0013, "step": 10382 }, { "epoch": 4.723839854413103, "grad_norm": 0.0063882251300269605, "learning_rate": 7.532827369292783e-08, "loss": 0.0, "step": 10383 }, { "epoch": 4.724294813466788, "grad_norm": 0.0304810265854276, "learning_rate": 7.508130860348406e-08, "loss": 0.0001, "step": 10384 }, { "epoch": 4.7247497725204735, "grad_norm": 0.04008498112719115, "learning_rate": 7.483474595615491e-08, "loss": 0.0002, "step": 10385 }, { "epoch": 4.725204731574158, "grad_norm": 0.011124581368161962, "learning_rate": 7.458858577108818e-08, "loss": 0.0001, "step": 10386 }, { "epoch": 4.725659690627843, "grad_norm": 0.029634245073102704, "learning_rate": 7.434282806839944e-08, "loss": 0.0003, "step": 10387 }, { "epoch": 4.726114649681529, "grad_norm": 0.022487009086170885, "learning_rate": 7.409747286817093e-08, "loss": 0.0005, "step": 10388 }, { "epoch": 4.726569608735214, "grad_norm": 0.06382411591487908, "learning_rate": 7.385252019045164e-08, "loss": 0.0006, "step": 10389 }, { "epoch": 4.727024567788899, "grad_norm": 0.006060846013447366, "learning_rate": 7.360797005525833e-08, "loss": 0.0, "step": 10390 }, { "epoch": 4.727479526842584, "grad_norm": 0.37412869852178027, "learning_rate": 7.33638224825739e-08, "loss": 0.0032, "step": 10391 }, { "epoch": 4.727934485896269, "grad_norm": 0.22282554896505724, "learning_rate": 7.312007749234961e-08, "loss": 0.0016, "step": 10392 }, { "epoch": 4.728389444949954, "grad_norm": 0.025689585202640138, "learning_rate": 7.287673510450343e-08, "loss": 0.0002, "step": 10393 }, { "epoch": 4.72884440400364, "grad_norm": 0.005680373871094684, "learning_rate": 7.26337953389189e-08, "loss": 0.0, "step": 10394 }, { "epoch": 4.729299363057325, "grad_norm": 0.020321580200183473, "learning_rate": 7.239125821544957e-08, "loss": 0.0001, "step": 10395 }, { "epoch": 4.72975432211101, "grad_norm": 0.323477508892141, "learning_rate": 7.214912375391291e-08, "loss": 0.0008, "step": 10396 }, { "epoch": 4.730209281164695, "grad_norm": 0.0990288296925892, "learning_rate": 7.190739197409646e-08, "loss": 0.0009, "step": 10397 }, { "epoch": 4.73066424021838, "grad_norm": 0.11962998423555951, "learning_rate": 7.166606289575274e-08, "loss": 0.0008, "step": 10398 }, { "epoch": 4.731119199272065, "grad_norm": 0.009435083007987862, "learning_rate": 7.142513653860261e-08, "loss": 0.0, "step": 10399 }, { "epoch": 4.731574158325751, "grad_norm": 0.03480302526753761, "learning_rate": 7.118461292233258e-08, "loss": 0.0002, "step": 10400 }, { "epoch": 4.732029117379436, "grad_norm": 0.012640705639870676, "learning_rate": 7.094449206659748e-08, "loss": 0.0001, "step": 10401 }, { "epoch": 4.732484076433121, "grad_norm": 0.008999708535897202, "learning_rate": 7.070477399101938e-08, "loss": 0.0001, "step": 10402 }, { "epoch": 4.732939035486806, "grad_norm": 0.008499970566807848, "learning_rate": 7.046545871518651e-08, "loss": 0.0, "step": 10403 }, { "epoch": 4.733393994540491, "grad_norm": 0.0851887875865808, "learning_rate": 7.022654625865544e-08, "loss": 0.0003, "step": 10404 }, { "epoch": 4.733848953594176, "grad_norm": 0.15044690086974163, "learning_rate": 6.998803664094723e-08, "loss": 0.0004, "step": 10405 }, { "epoch": 4.734303912647862, "grad_norm": 0.01890569048704366, "learning_rate": 6.974992988155405e-08, "loss": 0.0001, "step": 10406 }, { "epoch": 4.734758871701547, "grad_norm": 0.013506523684603035, "learning_rate": 6.951222599993091e-08, "loss": 0.0001, "step": 10407 }, { "epoch": 4.735213830755232, "grad_norm": 0.014481787776529292, "learning_rate": 6.927492501550282e-08, "loss": 0.0001, "step": 10408 }, { "epoch": 4.735668789808917, "grad_norm": 0.49245972111253755, "learning_rate": 6.903802694766148e-08, "loss": 0.0029, "step": 10409 }, { "epoch": 4.736123748862602, "grad_norm": 0.020938023767858617, "learning_rate": 6.880153181576421e-08, "loss": 0.0001, "step": 10410 }, { "epoch": 4.736578707916287, "grad_norm": 0.12405111402772104, "learning_rate": 6.856543963913665e-08, "loss": 0.0002, "step": 10411 }, { "epoch": 4.737033666969973, "grad_norm": 0.13137879328676527, "learning_rate": 6.832975043707168e-08, "loss": 0.0005, "step": 10412 }, { "epoch": 4.737488626023658, "grad_norm": 0.11925776144501105, "learning_rate": 6.809446422882782e-08, "loss": 0.0014, "step": 10413 }, { "epoch": 4.737943585077343, "grad_norm": 0.05406749377476792, "learning_rate": 6.785958103363244e-08, "loss": 0.0002, "step": 10414 }, { "epoch": 4.738398544131028, "grad_norm": 0.13982304862638056, "learning_rate": 6.762510087067741e-08, "loss": 0.0014, "step": 10415 }, { "epoch": 4.738853503184713, "grad_norm": 0.04421406450067183, "learning_rate": 6.739102375912577e-08, "loss": 0.0004, "step": 10416 }, { "epoch": 4.739308462238398, "grad_norm": 0.012554461966809179, "learning_rate": 6.715734971810439e-08, "loss": 0.0001, "step": 10417 }, { "epoch": 4.739763421292084, "grad_norm": 0.014902518578885133, "learning_rate": 6.692407876670803e-08, "loss": 0.0001, "step": 10418 }, { "epoch": 4.740218380345769, "grad_norm": 0.0090459033328069, "learning_rate": 6.669121092399811e-08, "loss": 0.0001, "step": 10419 }, { "epoch": 4.740673339399454, "grad_norm": 0.26291839367654923, "learning_rate": 6.645874620900328e-08, "loss": 0.0069, "step": 10420 }, { "epoch": 4.741128298453139, "grad_norm": 0.104180717169593, "learning_rate": 6.622668464072057e-08, "loss": 0.0004, "step": 10421 }, { "epoch": 4.741583257506824, "grad_norm": 0.011277401545510287, "learning_rate": 6.599502623811204e-08, "loss": 0.0001, "step": 10422 }, { "epoch": 4.742038216560509, "grad_norm": 0.09708480399271728, "learning_rate": 6.576377102010866e-08, "loss": 0.0002, "step": 10423 }, { "epoch": 4.742493175614195, "grad_norm": 0.030156295580507506, "learning_rate": 6.5532919005607e-08, "loss": 0.0001, "step": 10424 }, { "epoch": 4.74294813466788, "grad_norm": 0.011097368621538343, "learning_rate": 6.53024702134708e-08, "loss": 0.0001, "step": 10425 }, { "epoch": 4.743403093721565, "grad_norm": 0.02677174333287603, "learning_rate": 6.507242466253283e-08, "loss": 0.0001, "step": 10426 }, { "epoch": 4.74385805277525, "grad_norm": 0.014480153932460315, "learning_rate": 6.48427823715897e-08, "loss": 0.0001, "step": 10427 }, { "epoch": 4.744313011828935, "grad_norm": 0.008721429054318669, "learning_rate": 6.461354335940807e-08, "loss": 0.0, "step": 10428 }, { "epoch": 4.744767970882621, "grad_norm": 0.013046979546112676, "learning_rate": 6.438470764471849e-08, "loss": 0.0001, "step": 10429 }, { "epoch": 4.745222929936306, "grad_norm": 0.13028606938694787, "learning_rate": 6.415627524622214e-08, "loss": 0.0015, "step": 10430 }, { "epoch": 4.745677888989991, "grad_norm": 0.012711252711354327, "learning_rate": 6.39282461825852e-08, "loss": 0.0001, "step": 10431 }, { "epoch": 4.746132848043676, "grad_norm": 0.05103948090065527, "learning_rate": 6.370062047244052e-08, "loss": 0.0004, "step": 10432 }, { "epoch": 4.746587807097361, "grad_norm": 0.03951076272118454, "learning_rate": 6.347339813438935e-08, "loss": 0.0003, "step": 10433 }, { "epoch": 4.747042766151046, "grad_norm": 0.011223772264066753, "learning_rate": 6.32465791869985e-08, "loss": 0.0001, "step": 10434 }, { "epoch": 4.747497725204732, "grad_norm": 0.028980534885325128, "learning_rate": 6.30201636488037e-08, "loss": 0.0001, "step": 10435 }, { "epoch": 4.747952684258417, "grad_norm": 0.010274337536098235, "learning_rate": 6.279415153830515e-08, "loss": 0.0001, "step": 10436 }, { "epoch": 4.748407643312102, "grad_norm": 0.01082462532463599, "learning_rate": 6.256854287397251e-08, "loss": 0.0001, "step": 10437 }, { "epoch": 4.748862602365787, "grad_norm": 0.00578098534764013, "learning_rate": 6.234333767424161e-08, "loss": 0.0, "step": 10438 }, { "epoch": 4.749317561419472, "grad_norm": 0.240120978350182, "learning_rate": 6.211853595751493e-08, "loss": 0.0036, "step": 10439 }, { "epoch": 4.749772520473157, "grad_norm": 0.25957538728202784, "learning_rate": 6.189413774216168e-08, "loss": 0.0026, "step": 10440 }, { "epoch": 4.750227479526843, "grad_norm": 0.24991561994030595, "learning_rate": 6.167014304651997e-08, "loss": 0.0011, "step": 10441 }, { "epoch": 4.750682438580528, "grad_norm": 0.003873625455326493, "learning_rate": 6.144655188889237e-08, "loss": 0.0, "step": 10442 }, { "epoch": 4.751137397634213, "grad_norm": 0.05548854489336238, "learning_rate": 6.122336428755038e-08, "loss": 0.0003, "step": 10443 }, { "epoch": 4.751592356687898, "grad_norm": 0.1288444402855887, "learning_rate": 6.100058026073108e-08, "loss": 0.0019, "step": 10444 }, { "epoch": 4.752047315741583, "grad_norm": 0.009789255576044124, "learning_rate": 6.077819982664101e-08, "loss": 0.0001, "step": 10445 }, { "epoch": 4.752502274795268, "grad_norm": 0.30060800972959284, "learning_rate": 6.055622300345066e-08, "loss": 0.0052, "step": 10446 }, { "epoch": 4.752957233848954, "grad_norm": 0.021005981817564315, "learning_rate": 6.033464980929993e-08, "loss": 0.0001, "step": 10447 }, { "epoch": 4.753412192902639, "grad_norm": 0.015513802068342611, "learning_rate": 6.011348026229324e-08, "loss": 0.0001, "step": 10448 }, { "epoch": 4.753867151956324, "grad_norm": 0.0983825208169601, "learning_rate": 5.989271438050558e-08, "loss": 0.0008, "step": 10449 }, { "epoch": 4.754322111010009, "grad_norm": 0.10051818823520767, "learning_rate": 5.967235218197531e-08, "loss": 0.001, "step": 10450 }, { "epoch": 4.754777070063694, "grad_norm": 0.11061422999148696, "learning_rate": 5.945239368471079e-08, "loss": 0.0015, "step": 10451 }, { "epoch": 4.755232029117379, "grad_norm": 0.050694672602760595, "learning_rate": 5.923283890668485e-08, "loss": 0.0003, "step": 10452 }, { "epoch": 4.755686988171065, "grad_norm": 0.06085398896416968, "learning_rate": 5.9013687865839273e-08, "loss": 0.0002, "step": 10453 }, { "epoch": 4.75614194722475, "grad_norm": 0.11032030590496665, "learning_rate": 5.8794940580081394e-08, "loss": 0.001, "step": 10454 }, { "epoch": 4.756596906278435, "grad_norm": 0.05142475024000202, "learning_rate": 5.857659706728691e-08, "loss": 0.0004, "step": 10455 }, { "epoch": 4.75705186533212, "grad_norm": 0.2125310134176713, "learning_rate": 5.835865734529822e-08, "loss": 0.0015, "step": 10456 }, { "epoch": 4.757506824385805, "grad_norm": 0.0184904908420292, "learning_rate": 5.814112143192274e-08, "loss": 0.0001, "step": 10457 }, { "epoch": 4.757961783439491, "grad_norm": 0.011255046644503164, "learning_rate": 5.792398934493848e-08, "loss": 0.0001, "step": 10458 }, { "epoch": 4.758416742493176, "grad_norm": 0.20133832984465874, "learning_rate": 5.7707261102086795e-08, "loss": 0.0012, "step": 10459 }, { "epoch": 4.758871701546861, "grad_norm": 0.09829205116312933, "learning_rate": 5.749093672107908e-08, "loss": 0.0014, "step": 10460 }, { "epoch": 4.759326660600546, "grad_norm": 0.05813792526776249, "learning_rate": 5.727501621959175e-08, "loss": 0.0003, "step": 10461 }, { "epoch": 4.759781619654231, "grad_norm": 0.1677711352494823, "learning_rate": 5.705949961526902e-08, "loss": 0.0013, "step": 10462 }, { "epoch": 4.760236578707916, "grad_norm": 0.11609644692104817, "learning_rate": 5.6844386925721805e-08, "loss": 0.0008, "step": 10463 }, { "epoch": 4.760691537761602, "grad_norm": 0.24157437545876134, "learning_rate": 5.6629678168527715e-08, "loss": 0.0045, "step": 10464 }, { "epoch": 4.761146496815287, "grad_norm": 0.2621746892735075, "learning_rate": 5.641537336123271e-08, "loss": 0.0026, "step": 10465 }, { "epoch": 4.761601455868972, "grad_norm": 0.013228704115087659, "learning_rate": 5.6201472521348885e-08, "loss": 0.0001, "step": 10466 }, { "epoch": 4.762056414922657, "grad_norm": 0.04136793599242851, "learning_rate": 5.5987975666353944e-08, "loss": 0.0003, "step": 10467 }, { "epoch": 4.762511373976342, "grad_norm": 0.02573806361551556, "learning_rate": 5.577488281369503e-08, "loss": 0.0001, "step": 10468 }, { "epoch": 4.762966333030027, "grad_norm": 0.18254481099006487, "learning_rate": 5.5562193980784886e-08, "loss": 0.0005, "step": 10469 }, { "epoch": 4.763421292083713, "grad_norm": 0.003874586068290597, "learning_rate": 5.534990918500294e-08, "loss": 0.0, "step": 10470 }, { "epoch": 4.763876251137398, "grad_norm": 0.02404991758021951, "learning_rate": 5.513802844369642e-08, "loss": 0.0001, "step": 10471 }, { "epoch": 4.764331210191083, "grad_norm": 0.16636884874536972, "learning_rate": 5.492655177418038e-08, "loss": 0.0028, "step": 10472 }, { "epoch": 4.764786169244768, "grad_norm": 0.008467201396735673, "learning_rate": 5.471547919373377e-08, "loss": 0.0, "step": 10473 }, { "epoch": 4.765241128298453, "grad_norm": 0.0835722910822301, "learning_rate": 5.4504810719606114e-08, "loss": 0.0004, "step": 10474 }, { "epoch": 4.765696087352138, "grad_norm": 0.016304320981645384, "learning_rate": 5.429454636901144e-08, "loss": 0.0001, "step": 10475 }, { "epoch": 4.766151046405824, "grad_norm": 0.06329695406533259, "learning_rate": 5.4084686159132096e-08, "loss": 0.0003, "step": 10476 }, { "epoch": 4.766606005459509, "grad_norm": 0.1303457832223836, "learning_rate": 5.3875230107116036e-08, "loss": 0.0016, "step": 10477 }, { "epoch": 4.767060964513194, "grad_norm": 0.035704064059513985, "learning_rate": 5.366617823007958e-08, "loss": 0.0002, "step": 10478 }, { "epoch": 4.767515923566879, "grad_norm": 0.061168384178163226, "learning_rate": 5.345753054510627e-08, "loss": 0.0003, "step": 10479 }, { "epoch": 4.767970882620564, "grad_norm": 0.009029478221275073, "learning_rate": 5.324928706924526e-08, "loss": 0.0001, "step": 10480 }, { "epoch": 4.768425841674249, "grad_norm": 0.2831000742667647, "learning_rate": 5.3041447819512925e-08, "loss": 0.0025, "step": 10481 }, { "epoch": 4.768880800727935, "grad_norm": 0.15296905017968024, "learning_rate": 5.283401281289291e-08, "loss": 0.0005, "step": 10482 }, { "epoch": 4.76933575978162, "grad_norm": 0.008065879076170169, "learning_rate": 5.26269820663361e-08, "loss": 0.0001, "step": 10483 }, { "epoch": 4.769790718835305, "grad_norm": 0.11772679437127716, "learning_rate": 5.242035559676062e-08, "loss": 0.0009, "step": 10484 }, { "epoch": 4.77024567788899, "grad_norm": 0.031230829898169044, "learning_rate": 5.22141334210502e-08, "loss": 0.0003, "step": 10485 }, { "epoch": 4.770700636942675, "grad_norm": 0.14587626768329914, "learning_rate": 5.200831555605745e-08, "loss": 0.0008, "step": 10486 }, { "epoch": 4.77115559599636, "grad_norm": 0.0858717029576134, "learning_rate": 5.18029020185995e-08, "loss": 0.0007, "step": 10487 }, { "epoch": 4.771610555050046, "grad_norm": 0.033812059761657534, "learning_rate": 5.1597892825462904e-08, "loss": 0.0001, "step": 10488 }, { "epoch": 4.772065514103731, "grad_norm": 0.006557216341889427, "learning_rate": 5.139328799339982e-08, "loss": 0.0, "step": 10489 }, { "epoch": 4.772520473157416, "grad_norm": 0.023955501475005192, "learning_rate": 5.1189087539129656e-08, "loss": 0.0002, "step": 10490 }, { "epoch": 4.772975432211101, "grad_norm": 0.04585853373194461, "learning_rate": 5.0985291479338506e-08, "loss": 0.0001, "step": 10491 }, { "epoch": 4.773430391264786, "grad_norm": 0.021702837462913984, "learning_rate": 5.078189983067916e-08, "loss": 0.0001, "step": 10492 }, { "epoch": 4.773885350318471, "grad_norm": 0.17922102132954887, "learning_rate": 5.057891260977277e-08, "loss": 0.0007, "step": 10493 }, { "epoch": 4.774340309372157, "grad_norm": 0.026902093312953422, "learning_rate": 5.037632983320662e-08, "loss": 0.0001, "step": 10494 }, { "epoch": 4.774795268425842, "grad_norm": 0.07414970298959292, "learning_rate": 5.017415151753413e-08, "loss": 0.0009, "step": 10495 }, { "epoch": 4.7752502274795265, "grad_norm": 0.004481096511032117, "learning_rate": 4.9972377679277096e-08, "loss": 0.0, "step": 10496 }, { "epoch": 4.775705186533212, "grad_norm": 0.07155299636763211, "learning_rate": 4.9771008334922874e-08, "loss": 0.0002, "step": 10497 }, { "epoch": 4.776160145586897, "grad_norm": 0.043105064651908984, "learning_rate": 4.95700435009272e-08, "loss": 0.0004, "step": 10498 }, { "epoch": 4.776615104640582, "grad_norm": 0.11334264726492232, "learning_rate": 4.9369483193711375e-08, "loss": 0.0008, "step": 10499 }, { "epoch": 4.777070063694268, "grad_norm": 0.04372280713250435, "learning_rate": 4.9169327429664513e-08, "loss": 0.0003, "step": 10500 }, { "epoch": 4.777525022747953, "grad_norm": 0.0567245885903299, "learning_rate": 4.896957622514298e-08, "loss": 0.0002, "step": 10501 }, { "epoch": 4.7779799818016375, "grad_norm": 0.05539153255073634, "learning_rate": 4.877022959646816e-08, "loss": 0.0005, "step": 10502 }, { "epoch": 4.778434940855323, "grad_norm": 0.18441064518113084, "learning_rate": 4.85712875599309e-08, "loss": 0.003, "step": 10503 }, { "epoch": 4.778889899909008, "grad_norm": 0.18221204838936633, "learning_rate": 4.8372750131788214e-08, "loss": 0.0046, "step": 10504 }, { "epoch": 4.779344858962693, "grad_norm": 0.16788315445887017, "learning_rate": 4.8174617328262675e-08, "loss": 0.0004, "step": 10505 }, { "epoch": 4.779799818016379, "grad_norm": 0.02893776665167774, "learning_rate": 4.797688916554466e-08, "loss": 0.0001, "step": 10506 }, { "epoch": 4.780254777070064, "grad_norm": 0.06444160325200556, "learning_rate": 4.777956565979236e-08, "loss": 0.0005, "step": 10507 }, { "epoch": 4.7807097361237485, "grad_norm": 0.05512069424498918, "learning_rate": 4.75826468271301e-08, "loss": 0.0001, "step": 10508 }, { "epoch": 4.781164695177434, "grad_norm": 0.06292658594426694, "learning_rate": 4.738613268364889e-08, "loss": 0.0009, "step": 10509 }, { "epoch": 4.781619654231119, "grad_norm": 0.006702160578139643, "learning_rate": 4.719002324540756e-08, "loss": 0.0001, "step": 10510 }, { "epoch": 4.782074613284804, "grad_norm": 0.037029417114959204, "learning_rate": 4.699431852842995e-08, "loss": 0.001, "step": 10511 }, { "epoch": 4.78252957233849, "grad_norm": 0.12414774787413381, "learning_rate": 4.679901854870994e-08, "loss": 0.0012, "step": 10512 }, { "epoch": 4.782984531392175, "grad_norm": 0.019471087984977813, "learning_rate": 4.660412332220476e-08, "loss": 0.0002, "step": 10513 }, { "epoch": 4.7834394904458595, "grad_norm": 0.2795646762845415, "learning_rate": 4.640963286484224e-08, "loss": 0.0031, "step": 10514 }, { "epoch": 4.783894449499545, "grad_norm": 0.02901829717063289, "learning_rate": 4.6215547192514085e-08, "loss": 0.0002, "step": 10515 }, { "epoch": 4.78434940855323, "grad_norm": 0.08182638069428427, "learning_rate": 4.602186632107985e-08, "loss": 0.0007, "step": 10516 }, { "epoch": 4.784804367606915, "grad_norm": 0.1631063372427597, "learning_rate": 4.582859026636688e-08, "loss": 0.0008, "step": 10517 }, { "epoch": 4.785259326660601, "grad_norm": 0.1844271632020055, "learning_rate": 4.5635719044169194e-08, "loss": 0.0032, "step": 10518 }, { "epoch": 4.785714285714286, "grad_norm": 0.07665528665097694, "learning_rate": 4.5443252670246404e-08, "loss": 0.0002, "step": 10519 }, { "epoch": 4.7861692447679705, "grad_norm": 0.030584329823764238, "learning_rate": 4.52511911603265e-08, "loss": 0.0001, "step": 10520 }, { "epoch": 4.786624203821656, "grad_norm": 0.07700336738538648, "learning_rate": 4.505953453010359e-08, "loss": 0.0002, "step": 10521 }, { "epoch": 4.787079162875341, "grad_norm": 0.07314791019480306, "learning_rate": 4.4868282795239026e-08, "loss": 0.001, "step": 10522 }, { "epoch": 4.787534121929026, "grad_norm": 0.08562270387581507, "learning_rate": 4.4677435971361985e-08, "loss": 0.0006, "step": 10523 }, { "epoch": 4.787989080982712, "grad_norm": 0.04221491144114259, "learning_rate": 4.448699407406665e-08, "loss": 0.0001, "step": 10524 }, { "epoch": 4.788444040036397, "grad_norm": 0.08206179531749871, "learning_rate": 4.429695711891502e-08, "loss": 0.0005, "step": 10525 }, { "epoch": 4.788898999090081, "grad_norm": 0.006166077996794459, "learning_rate": 4.410732512143634e-08, "loss": 0.0, "step": 10526 }, { "epoch": 4.789353958143767, "grad_norm": 0.05715006313493763, "learning_rate": 4.3918098097125994e-08, "loss": 0.0001, "step": 10527 }, { "epoch": 4.789808917197452, "grad_norm": 0.005578505933585062, "learning_rate": 4.372927606144772e-08, "loss": 0.0, "step": 10528 }, { "epoch": 4.790263876251137, "grad_norm": 0.05571538995352204, "learning_rate": 4.354085902983085e-08, "loss": 0.0005, "step": 10529 }, { "epoch": 4.790718835304823, "grad_norm": 0.006223425459099669, "learning_rate": 4.3352847017671396e-08, "loss": 0.0, "step": 10530 }, { "epoch": 4.7911737943585075, "grad_norm": 0.05182659626430289, "learning_rate": 4.316524004033318e-08, "loss": 0.0005, "step": 10531 }, { "epoch": 4.791628753412192, "grad_norm": 0.02142757918277954, "learning_rate": 4.297803811314727e-08, "loss": 0.0001, "step": 10532 }, { "epoch": 4.792083712465878, "grad_norm": 0.013843673384548316, "learning_rate": 4.2791241251409765e-08, "loss": 0.0, "step": 10533 }, { "epoch": 4.792538671519563, "grad_norm": 0.11427663225571885, "learning_rate": 4.260484947038568e-08, "loss": 0.0009, "step": 10534 }, { "epoch": 4.792993630573249, "grad_norm": 0.17773758151955296, "learning_rate": 4.2418862785306156e-08, "loss": 0.0028, "step": 10535 }, { "epoch": 4.793448589626934, "grad_norm": 0.027262757429204452, "learning_rate": 4.2233281211368494e-08, "loss": 0.0001, "step": 10536 }, { "epoch": 4.7939035486806185, "grad_norm": 0.06796770268321947, "learning_rate": 4.204810476373833e-08, "loss": 0.0003, "step": 10537 }, { "epoch": 4.794358507734304, "grad_norm": 0.10100379583039645, "learning_rate": 4.1863333457546895e-08, "loss": 0.0007, "step": 10538 }, { "epoch": 4.794813466787989, "grad_norm": 0.017344385957671453, "learning_rate": 4.1678967307893225e-08, "loss": 0.0001, "step": 10539 }, { "epoch": 4.795268425841674, "grad_norm": 0.0074177376043692254, "learning_rate": 4.1495006329843044e-08, "loss": 0.0, "step": 10540 }, { "epoch": 4.79572338489536, "grad_norm": 0.034727937445359014, "learning_rate": 4.1311450538427666e-08, "loss": 0.0004, "step": 10541 }, { "epoch": 4.796178343949045, "grad_norm": 0.021418066558488196, "learning_rate": 4.112829994864842e-08, "loss": 0.0001, "step": 10542 }, { "epoch": 4.7966333030027295, "grad_norm": 0.18629067898945587, "learning_rate": 4.094555457547e-08, "loss": 0.0021, "step": 10543 }, { "epoch": 4.797088262056415, "grad_norm": 0.2011310590971207, "learning_rate": 4.0763214433826024e-08, "loss": 0.001, "step": 10544 }, { "epoch": 4.7975432211101, "grad_norm": 0.2967322944483974, "learning_rate": 4.058127953861568e-08, "loss": 0.0065, "step": 10545 }, { "epoch": 4.797998180163785, "grad_norm": 0.02446076337060346, "learning_rate": 4.039974990470763e-08, "loss": 0.0001, "step": 10546 }, { "epoch": 4.798453139217471, "grad_norm": 0.016107392145627154, "learning_rate": 4.0218625546934455e-08, "loss": 0.0001, "step": 10547 }, { "epoch": 4.798908098271156, "grad_norm": 0.11004472475863146, "learning_rate": 4.0037906480096535e-08, "loss": 0.0007, "step": 10548 }, { "epoch": 4.7993630573248405, "grad_norm": 0.13171680692725785, "learning_rate": 3.985759271896261e-08, "loss": 0.0003, "step": 10549 }, { "epoch": 4.799818016378526, "grad_norm": 0.0857020502943738, "learning_rate": 3.9677684278265905e-08, "loss": 0.0008, "step": 10550 }, { "epoch": 4.800272975432211, "grad_norm": 0.007328631718289658, "learning_rate": 3.949818117270798e-08, "loss": 0.0001, "step": 10551 }, { "epoch": 4.800727934485896, "grad_norm": 0.01539439469606408, "learning_rate": 3.931908341695767e-08, "loss": 0.0001, "step": 10552 }, { "epoch": 4.801182893539582, "grad_norm": 0.05837276969107897, "learning_rate": 3.9140391025649925e-08, "loss": 0.0004, "step": 10553 }, { "epoch": 4.801637852593267, "grad_norm": 0.02905753305664897, "learning_rate": 3.896210401338585e-08, "loss": 0.0001, "step": 10554 }, { "epoch": 4.8020928116469515, "grad_norm": 0.046825068689999395, "learning_rate": 3.878422239473489e-08, "loss": 0.0003, "step": 10555 }, { "epoch": 4.802547770700637, "grad_norm": 0.005290289391284484, "learning_rate": 3.860674618423266e-08, "loss": 0.0, "step": 10556 }, { "epoch": 4.803002729754322, "grad_norm": 0.358722514954816, "learning_rate": 3.8429675396381985e-08, "loss": 0.0012, "step": 10557 }, { "epoch": 4.803457688808007, "grad_norm": 0.08606302424667361, "learning_rate": 3.825301004565185e-08, "loss": 0.001, "step": 10558 }, { "epoch": 4.803912647861693, "grad_norm": 0.029532784151471023, "learning_rate": 3.807675014647849e-08, "loss": 0.0005, "step": 10559 }, { "epoch": 4.804367606915378, "grad_norm": 0.11996673715780519, "learning_rate": 3.790089571326539e-08, "loss": 0.0013, "step": 10560 }, { "epoch": 4.804822565969062, "grad_norm": 0.14253178438403938, "learning_rate": 3.7725446760382144e-08, "loss": 0.0015, "step": 10561 }, { "epoch": 4.805277525022748, "grad_norm": 0.005642503228703067, "learning_rate": 3.7550403302166194e-08, "loss": 0.0, "step": 10562 }, { "epoch": 4.805732484076433, "grad_norm": 0.2927128660137623, "learning_rate": 3.737576535292109e-08, "loss": 0.0009, "step": 10563 }, { "epoch": 4.806187443130118, "grad_norm": 0.0164660350078814, "learning_rate": 3.720153292691764e-08, "loss": 0.0001, "step": 10564 }, { "epoch": 4.806642402183804, "grad_norm": 0.04231952990489744, "learning_rate": 3.7027706038392785e-08, "loss": 0.0003, "step": 10565 }, { "epoch": 4.8070973612374885, "grad_norm": 0.11948580104133229, "learning_rate": 3.685428470155128e-08, "loss": 0.0021, "step": 10566 }, { "epoch": 4.807552320291173, "grad_norm": 0.08263066159525466, "learning_rate": 3.668126893056456e-08, "loss": 0.0005, "step": 10567 }, { "epoch": 4.808007279344859, "grad_norm": 0.015181381716912952, "learning_rate": 3.650865873957021e-08, "loss": 0.0001, "step": 10568 }, { "epoch": 4.808462238398544, "grad_norm": 0.09240990068313966, "learning_rate": 3.633645414267306e-08, "loss": 0.0007, "step": 10569 }, { "epoch": 4.80891719745223, "grad_norm": 0.15355266112605442, "learning_rate": 3.616465515394518e-08, "loss": 0.0015, "step": 10570 }, { "epoch": 4.809372156505915, "grad_norm": 0.04200299837285704, "learning_rate": 3.599326178742535e-08, "loss": 0.0002, "step": 10571 }, { "epoch": 4.8098271155595995, "grad_norm": 0.0022545921513059154, "learning_rate": 3.582227405711902e-08, "loss": 0.0, "step": 10572 }, { "epoch": 4.810282074613285, "grad_norm": 0.01979978553526252, "learning_rate": 3.565169197699836e-08, "loss": 0.0001, "step": 10573 }, { "epoch": 4.81073703366697, "grad_norm": 0.01864082875974144, "learning_rate": 3.5481515561002766e-08, "loss": 0.0001, "step": 10574 }, { "epoch": 4.811191992720655, "grad_norm": 0.32465274689827534, "learning_rate": 3.5311744823037784e-08, "loss": 0.0007, "step": 10575 }, { "epoch": 4.811646951774341, "grad_norm": 0.016403546473606315, "learning_rate": 3.514237977697677e-08, "loss": 0.0001, "step": 10576 }, { "epoch": 4.812101910828026, "grad_norm": 0.027332910884705977, "learning_rate": 3.4973420436659745e-08, "loss": 0.0002, "step": 10577 }, { "epoch": 4.8125568698817105, "grad_norm": 0.026092405420165925, "learning_rate": 3.48048668158929e-08, "loss": 0.0001, "step": 10578 }, { "epoch": 4.813011828935396, "grad_norm": 0.017456939732913984, "learning_rate": 3.463671892844911e-08, "loss": 0.0001, "step": 10579 }, { "epoch": 4.813466787989081, "grad_norm": 0.03469750406876294, "learning_rate": 3.4468976788069596e-08, "loss": 0.0002, "step": 10580 }, { "epoch": 4.813921747042766, "grad_norm": 0.06642568455859754, "learning_rate": 3.4301640408461736e-08, "loss": 0.001, "step": 10581 }, { "epoch": 4.814376706096452, "grad_norm": 0.012568092935371502, "learning_rate": 3.413470980329847e-08, "loss": 0.0, "step": 10582 }, { "epoch": 4.814831665150137, "grad_norm": 0.007418483492608347, "learning_rate": 3.396818498622057e-08, "loss": 0.0, "step": 10583 }, { "epoch": 4.8152866242038215, "grad_norm": 0.18845215214961689, "learning_rate": 3.380206597083657e-08, "loss": 0.003, "step": 10584 }, { "epoch": 4.815741583257507, "grad_norm": 0.29583943446069366, "learning_rate": 3.3636352770720636e-08, "loss": 0.0013, "step": 10585 }, { "epoch": 4.816196542311192, "grad_norm": 0.07717152792916233, "learning_rate": 3.347104539941415e-08, "loss": 0.0004, "step": 10586 }, { "epoch": 4.816651501364877, "grad_norm": 0.01616399208865998, "learning_rate": 3.3306143870425744e-08, "loss": 0.0001, "step": 10587 }, { "epoch": 4.817106460418563, "grad_norm": 0.18726096813923176, "learning_rate": 3.3141648197229094e-08, "loss": 0.0018, "step": 10588 }, { "epoch": 4.817561419472248, "grad_norm": 0.05492710753916651, "learning_rate": 3.2977558393266774e-08, "loss": 0.0004, "step": 10589 }, { "epoch": 4.8180163785259325, "grad_norm": 0.08499093214175243, "learning_rate": 3.2813874471947506e-08, "loss": 0.0008, "step": 10590 }, { "epoch": 4.818471337579618, "grad_norm": 0.017809717834770953, "learning_rate": 3.265059644664725e-08, "loss": 0.0001, "step": 10591 }, { "epoch": 4.818926296633303, "grad_norm": 0.09934791525058385, "learning_rate": 3.248772433070757e-08, "loss": 0.0007, "step": 10592 }, { "epoch": 4.819381255686988, "grad_norm": 0.3402813866866615, "learning_rate": 3.232525813743781e-08, "loss": 0.0079, "step": 10593 }, { "epoch": 4.819836214740674, "grad_norm": 0.18440729692997979, "learning_rate": 3.2163197880114575e-08, "loss": 0.0008, "step": 10594 }, { "epoch": 4.820291173794359, "grad_norm": 0.046284525256954104, "learning_rate": 3.200154357198004e-08, "loss": 0.0002, "step": 10595 }, { "epoch": 4.820746132848043, "grad_norm": 0.1508279458121791, "learning_rate": 3.184029522624421e-08, "loss": 0.0017, "step": 10596 }, { "epoch": 4.821201091901729, "grad_norm": 0.056949666435369764, "learning_rate": 3.1679452856083204e-08, "loss": 0.0004, "step": 10597 }, { "epoch": 4.821656050955414, "grad_norm": 0.23372843492565107, "learning_rate": 3.151901647464095e-08, "loss": 0.0034, "step": 10598 }, { "epoch": 4.822111010009099, "grad_norm": 0.06029492930837099, "learning_rate": 3.1358986095026964e-08, "loss": 0.0001, "step": 10599 }, { "epoch": 4.822565969062785, "grad_norm": 0.09152515600055088, "learning_rate": 3.119936173031857e-08, "loss": 0.0008, "step": 10600 }, { "epoch": 4.8230209281164695, "grad_norm": 0.05125030283962127, "learning_rate": 3.104014339355921e-08, "loss": 0.0001, "step": 10601 }, { "epoch": 4.823475887170154, "grad_norm": 0.03963411058092666, "learning_rate": 3.088133109775959e-08, "loss": 0.0002, "step": 10602 }, { "epoch": 4.82393084622384, "grad_norm": 0.03372813182074394, "learning_rate": 3.0722924855897675e-08, "loss": 0.0002, "step": 10603 }, { "epoch": 4.824385805277525, "grad_norm": 0.08031103445161011, "learning_rate": 3.0564924680916986e-08, "loss": 0.0007, "step": 10604 }, { "epoch": 4.82484076433121, "grad_norm": 0.08041060202909711, "learning_rate": 3.040733058572887e-08, "loss": 0.001, "step": 10605 }, { "epoch": 4.825295723384896, "grad_norm": 0.19228550631166832, "learning_rate": 3.025014258321135e-08, "loss": 0.0017, "step": 10606 }, { "epoch": 4.8257506824385805, "grad_norm": 0.004968914261770674, "learning_rate": 3.009336068620916e-08, "loss": 0.0, "step": 10607 }, { "epoch": 4.826205641492265, "grad_norm": 0.22648649720504208, "learning_rate": 2.993698490753316e-08, "loss": 0.0009, "step": 10608 }, { "epoch": 4.826660600545951, "grad_norm": 0.0157318108343989, "learning_rate": 2.9781015259962576e-08, "loss": 0.0001, "step": 10609 }, { "epoch": 4.827115559599636, "grad_norm": 0.04121070327770224, "learning_rate": 2.9625451756241653e-08, "loss": 0.0001, "step": 10610 }, { "epoch": 4.827570518653321, "grad_norm": 0.014543193824360968, "learning_rate": 2.9470294409083e-08, "loss": 0.0001, "step": 10611 }, { "epoch": 4.828025477707007, "grad_norm": 0.06290233521550924, "learning_rate": 2.9315543231165366e-08, "loss": 0.0003, "step": 10612 }, { "epoch": 4.8284804367606915, "grad_norm": 0.011504104732439714, "learning_rate": 2.916119823513419e-08, "loss": 0.0001, "step": 10613 }, { "epoch": 4.828935395814376, "grad_norm": 0.033088749539481745, "learning_rate": 2.9007259433601608e-08, "loss": 0.0001, "step": 10614 }, { "epoch": 4.829390354868062, "grad_norm": 0.006712217621457061, "learning_rate": 2.8853726839147e-08, "loss": 0.0001, "step": 10615 }, { "epoch": 4.829845313921747, "grad_norm": 0.09153658316753538, "learning_rate": 2.870060046431644e-08, "loss": 0.0007, "step": 10616 }, { "epoch": 4.830300272975432, "grad_norm": 0.0587349130977096, "learning_rate": 2.8547880321622146e-08, "loss": 0.0005, "step": 10617 }, { "epoch": 4.830755232029118, "grad_norm": 0.04762494490565836, "learning_rate": 2.8395566423544686e-08, "loss": 0.0005, "step": 10618 }, { "epoch": 4.8312101910828025, "grad_norm": 0.04377171640039872, "learning_rate": 2.824365878252966e-08, "loss": 0.0004, "step": 10619 }, { "epoch": 4.831665150136487, "grad_norm": 0.010692326472147465, "learning_rate": 2.8092157410991028e-08, "loss": 0.0001, "step": 10620 }, { "epoch": 4.832120109190173, "grad_norm": 0.06545845433752331, "learning_rate": 2.794106232130833e-08, "loss": 0.0002, "step": 10621 }, { "epoch": 4.832575068243858, "grad_norm": 0.04931854595661653, "learning_rate": 2.7790373525827806e-08, "loss": 0.0004, "step": 10622 }, { "epoch": 4.833030027297543, "grad_norm": 0.009207938430837646, "learning_rate": 2.764009103686405e-08, "loss": 0.0001, "step": 10623 }, { "epoch": 4.833484986351229, "grad_norm": 0.015861222940373815, "learning_rate": 2.749021486669723e-08, "loss": 0.0001, "step": 10624 }, { "epoch": 4.8339399454049135, "grad_norm": 0.06850436907731229, "learning_rate": 2.7340745027574222e-08, "loss": 0.0006, "step": 10625 }, { "epoch": 4.834394904458598, "grad_norm": 0.00834391922843764, "learning_rate": 2.7191681531709147e-08, "loss": 0.0, "step": 10626 }, { "epoch": 4.834849863512284, "grad_norm": 0.13244810782001767, "learning_rate": 2.7043024391282814e-08, "loss": 0.0006, "step": 10627 }, { "epoch": 4.835304822565969, "grad_norm": 0.02351061306544875, "learning_rate": 2.6894773618442727e-08, "loss": 0.0001, "step": 10628 }, { "epoch": 4.835759781619654, "grad_norm": 0.08782471584334889, "learning_rate": 2.6746929225303642e-08, "loss": 0.0003, "step": 10629 }, { "epoch": 4.83621474067334, "grad_norm": 0.14719306004175872, "learning_rate": 2.659949122394645e-08, "loss": 0.0017, "step": 10630 }, { "epoch": 4.836669699727024, "grad_norm": 0.14312679067719997, "learning_rate": 2.645245962641929e-08, "loss": 0.0005, "step": 10631 }, { "epoch": 4.837124658780709, "grad_norm": 0.15687896467210338, "learning_rate": 2.6305834444736443e-08, "loss": 0.0016, "step": 10632 }, { "epoch": 4.837579617834395, "grad_norm": 0.013264322189545407, "learning_rate": 2.6159615690879992e-08, "loss": 0.0001, "step": 10633 }, { "epoch": 4.83803457688808, "grad_norm": 0.26436055477768866, "learning_rate": 2.6013803376798154e-08, "loss": 0.0018, "step": 10634 }, { "epoch": 4.838489535941765, "grad_norm": 0.016990845860600595, "learning_rate": 2.586839751440584e-08, "loss": 0.0, "step": 10635 }, { "epoch": 4.8389444949954505, "grad_norm": 0.0093148023201663, "learning_rate": 2.5723398115585774e-08, "loss": 0.0, "step": 10636 }, { "epoch": 4.839399454049135, "grad_norm": 0.009548493280044586, "learning_rate": 2.557880519218514e-08, "loss": 0.0, "step": 10637 }, { "epoch": 4.83985441310282, "grad_norm": 0.0335180642475159, "learning_rate": 2.5434618756020048e-08, "loss": 0.0001, "step": 10638 }, { "epoch": 4.840309372156506, "grad_norm": 0.11792350498693562, "learning_rate": 2.5290838818873842e-08, "loss": 0.0016, "step": 10639 }, { "epoch": 4.840764331210191, "grad_norm": 0.09128113575614688, "learning_rate": 2.514746539249435e-08, "loss": 0.0009, "step": 10640 }, { "epoch": 4.841219290263876, "grad_norm": 0.05620321006347124, "learning_rate": 2.500449848859776e-08, "loss": 0.0003, "step": 10641 }, { "epoch": 4.8416742493175615, "grad_norm": 0.08296479532707239, "learning_rate": 2.486193811886639e-08, "loss": 0.001, "step": 10642 }, { "epoch": 4.842129208371246, "grad_norm": 0.09480030830167191, "learning_rate": 2.4719784294949812e-08, "loss": 0.0009, "step": 10643 }, { "epoch": 4.842584167424932, "grad_norm": 0.042633327507038424, "learning_rate": 2.4578037028464842e-08, "loss": 0.0002, "step": 10644 }, { "epoch": 4.843039126478617, "grad_norm": 0.07367148583993635, "learning_rate": 2.4436696330993326e-08, "loss": 0.0004, "step": 10645 }, { "epoch": 4.843494085532302, "grad_norm": 0.024125952967483922, "learning_rate": 2.4295762214086027e-08, "loss": 0.0001, "step": 10646 }, { "epoch": 4.843949044585988, "grad_norm": 0.12441582649777472, "learning_rate": 2.4155234689258733e-08, "loss": 0.001, "step": 10647 }, { "epoch": 4.8444040036396725, "grad_norm": 0.02080198029374671, "learning_rate": 2.4015113767995036e-08, "loss": 0.0001, "step": 10648 }, { "epoch": 4.844858962693357, "grad_norm": 0.00855759792218338, "learning_rate": 2.387539946174522e-08, "loss": 0.0001, "step": 10649 }, { "epoch": 4.845313921747043, "grad_norm": 0.12399501575836457, "learning_rate": 2.3736091781925152e-08, "loss": 0.0004, "step": 10650 }, { "epoch": 4.845768880800728, "grad_norm": 0.01717303314278382, "learning_rate": 2.3597190739919064e-08, "loss": 0.0001, "step": 10651 }, { "epoch": 4.846223839854413, "grad_norm": 0.10593128706377823, "learning_rate": 2.345869634707787e-08, "loss": 0.0017, "step": 10652 }, { "epoch": 4.846678798908099, "grad_norm": 0.037310170014038156, "learning_rate": 2.332060861471752e-08, "loss": 0.0002, "step": 10653 }, { "epoch": 4.8471337579617835, "grad_norm": 0.22969565668342115, "learning_rate": 2.318292755412288e-08, "loss": 0.0006, "step": 10654 }, { "epoch": 4.847588717015468, "grad_norm": 0.1161783607683092, "learning_rate": 2.3045653176544392e-08, "loss": 0.0015, "step": 10655 }, { "epoch": 4.848043676069154, "grad_norm": 0.0020548125619038646, "learning_rate": 2.2908785493199192e-08, "loss": 0.0, "step": 10656 }, { "epoch": 4.848498635122839, "grad_norm": 0.08025111528509074, "learning_rate": 2.2772324515272225e-08, "loss": 0.001, "step": 10657 }, { "epoch": 4.848953594176524, "grad_norm": 0.022869723755637377, "learning_rate": 2.263627025391346e-08, "loss": 0.0001, "step": 10658 }, { "epoch": 4.84940855323021, "grad_norm": 0.012717143838263565, "learning_rate": 2.250062272024067e-08, "loss": 0.0001, "step": 10659 }, { "epoch": 4.8498635122838945, "grad_norm": 0.014129267222265735, "learning_rate": 2.2365381925339437e-08, "loss": 0.0001, "step": 10660 }, { "epoch": 4.850318471337579, "grad_norm": 0.035537994703623906, "learning_rate": 2.2230547880260356e-08, "loss": 0.0002, "step": 10661 }, { "epoch": 4.850773430391265, "grad_norm": 0.024043356715591935, "learning_rate": 2.2096120596021288e-08, "loss": 0.0001, "step": 10662 }, { "epoch": 4.85122838944495, "grad_norm": 0.14259522511204478, "learning_rate": 2.196210008360733e-08, "loss": 0.0008, "step": 10663 }, { "epoch": 4.851683348498635, "grad_norm": 0.03224475369778581, "learning_rate": 2.182848635397028e-08, "loss": 0.0003, "step": 10664 }, { "epoch": 4.852138307552321, "grad_norm": 0.05021504722988465, "learning_rate": 2.1695279418027514e-08, "loss": 0.0002, "step": 10665 }, { "epoch": 4.852593266606005, "grad_norm": 0.6022559623835628, "learning_rate": 2.1562479286664774e-08, "loss": 0.0073, "step": 10666 }, { "epoch": 4.85304822565969, "grad_norm": 0.14734256186708095, "learning_rate": 2.1430085970733928e-08, "loss": 0.0015, "step": 10667 }, { "epoch": 4.853503184713376, "grad_norm": 0.22444124392230336, "learning_rate": 2.1298099481053547e-08, "loss": 0.0012, "step": 10668 }, { "epoch": 4.853958143767061, "grad_norm": 0.18129506635800877, "learning_rate": 2.1166519828408892e-08, "loss": 0.0061, "step": 10669 }, { "epoch": 4.854413102820746, "grad_norm": 0.023582643451061067, "learning_rate": 2.103534702355192e-08, "loss": 0.0002, "step": 10670 }, { "epoch": 4.8548680618744315, "grad_norm": 0.029595311350128547, "learning_rate": 2.090458107720128e-08, "loss": 0.0003, "step": 10671 }, { "epoch": 4.855323020928116, "grad_norm": 0.004580527862419473, "learning_rate": 2.077422200004342e-08, "loss": 0.0, "step": 10672 }, { "epoch": 4.855777979981801, "grad_norm": 0.04994110836346044, "learning_rate": 2.064426980272982e-08, "loss": 0.0003, "step": 10673 }, { "epoch": 4.856232939035487, "grad_norm": 0.012475817436306325, "learning_rate": 2.051472449587977e-08, "loss": 0.0001, "step": 10674 }, { "epoch": 4.856687898089172, "grad_norm": 0.2544304006916711, "learning_rate": 2.038558609007979e-08, "loss": 0.0014, "step": 10675 }, { "epoch": 4.857142857142857, "grad_norm": 0.04518085471272285, "learning_rate": 2.025685459588145e-08, "loss": 0.0003, "step": 10676 }, { "epoch": 4.8575978161965425, "grad_norm": 0.06443166085751145, "learning_rate": 2.012853002380466e-08, "loss": 0.0004, "step": 10677 }, { "epoch": 4.858052775250227, "grad_norm": 0.03864194596464599, "learning_rate": 2.000061238433604e-08, "loss": 0.0003, "step": 10678 }, { "epoch": 4.858507734303913, "grad_norm": 0.010347136542787348, "learning_rate": 1.9873101687927775e-08, "loss": 0.0001, "step": 10679 }, { "epoch": 4.858962693357598, "grad_norm": 0.017414706808795203, "learning_rate": 1.9745997944998762e-08, "loss": 0.0001, "step": 10680 }, { "epoch": 4.859417652411283, "grad_norm": 0.009343436338073375, "learning_rate": 1.96193011659368e-08, "loss": 0.0001, "step": 10681 }, { "epoch": 4.859872611464969, "grad_norm": 0.17415439483500175, "learning_rate": 1.9493011361094716e-08, "loss": 0.0004, "step": 10682 }, { "epoch": 4.8603275705186535, "grad_norm": 0.07901538823059047, "learning_rate": 1.936712854079148e-08, "loss": 0.0007, "step": 10683 }, { "epoch": 4.860782529572338, "grad_norm": 0.1214173235780222, "learning_rate": 1.924165271531442e-08, "loss": 0.001, "step": 10684 }, { "epoch": 4.861237488626024, "grad_norm": 0.011098271197636941, "learning_rate": 1.9116583894915886e-08, "loss": 0.0001, "step": 10685 }, { "epoch": 4.861692447679709, "grad_norm": 0.047815362864294504, "learning_rate": 1.8991922089817703e-08, "loss": 0.0003, "step": 10686 }, { "epoch": 4.862147406733394, "grad_norm": 0.16758691708770435, "learning_rate": 1.8867667310204506e-08, "loss": 0.0029, "step": 10687 }, { "epoch": 4.86260236578708, "grad_norm": 0.00555308586160886, "learning_rate": 1.8743819566232057e-08, "loss": 0.0, "step": 10688 }, { "epoch": 4.8630573248407645, "grad_norm": 0.1612602743086112, "learning_rate": 1.862037886801893e-08, "loss": 0.0023, "step": 10689 }, { "epoch": 4.863512283894449, "grad_norm": 0.26120594656365803, "learning_rate": 1.849734522565261e-08, "loss": 0.0043, "step": 10690 }, { "epoch": 4.863967242948135, "grad_norm": 0.3493591190706625, "learning_rate": 1.8374718649187273e-08, "loss": 0.0055, "step": 10691 }, { "epoch": 4.86442220200182, "grad_norm": 0.08966898662117236, "learning_rate": 1.825249914864269e-08, "loss": 0.001, "step": 10692 }, { "epoch": 4.864877161055505, "grad_norm": 0.17451184077112178, "learning_rate": 1.8130686734006976e-08, "loss": 0.0024, "step": 10693 }, { "epoch": 4.865332120109191, "grad_norm": 0.08165617807963745, "learning_rate": 1.8009281415233282e-08, "loss": 0.0006, "step": 10694 }, { "epoch": 4.8657870791628755, "grad_norm": 0.05492175604924547, "learning_rate": 1.7888283202243117e-08, "loss": 0.0002, "step": 10695 }, { "epoch": 4.86624203821656, "grad_norm": 0.009346683821212861, "learning_rate": 1.776769210492302e-08, "loss": 0.0001, "step": 10696 }, { "epoch": 4.866696997270246, "grad_norm": 0.05318933491948676, "learning_rate": 1.7647508133127324e-08, "loss": 0.0002, "step": 10697 }, { "epoch": 4.867151956323931, "grad_norm": 0.03708503214947641, "learning_rate": 1.7527731296677618e-08, "loss": 0.0001, "step": 10698 }, { "epoch": 4.867606915377616, "grad_norm": 0.06246760727852482, "learning_rate": 1.740836160536108e-08, "loss": 0.0004, "step": 10699 }, { "epoch": 4.868061874431302, "grad_norm": 0.12212586053328098, "learning_rate": 1.7289399068932122e-08, "loss": 0.0015, "step": 10700 }, { "epoch": 4.868516833484986, "grad_norm": 0.08069567226941834, "learning_rate": 1.7170843697111307e-08, "loss": 0.0006, "step": 10701 }, { "epoch": 4.868971792538671, "grad_norm": 0.0754183913702691, "learning_rate": 1.705269549958699e-08, "loss": 0.0006, "step": 10702 }, { "epoch": 4.869426751592357, "grad_norm": 0.026070492248567615, "learning_rate": 1.6934954486013676e-08, "loss": 0.0001, "step": 10703 }, { "epoch": 4.869881710646042, "grad_norm": 0.006645493865083179, "learning_rate": 1.6817620666012557e-08, "loss": 0.0, "step": 10704 }, { "epoch": 4.870336669699727, "grad_norm": 0.04662025494129306, "learning_rate": 1.6700694049172073e-08, "loss": 0.0002, "step": 10705 }, { "epoch": 4.8707916287534125, "grad_norm": 0.1127424108606032, "learning_rate": 1.6584174645045693e-08, "loss": 0.0008, "step": 10706 }, { "epoch": 4.871246587807097, "grad_norm": 0.11513075254810022, "learning_rate": 1.6468062463156354e-08, "loss": 0.0008, "step": 10707 }, { "epoch": 4.871701546860782, "grad_norm": 0.03678174666343284, "learning_rate": 1.6352357512990914e-08, "loss": 0.0004, "step": 10708 }, { "epoch": 4.872156505914468, "grad_norm": 0.0876832122619242, "learning_rate": 1.623705980400514e-08, "loss": 0.0002, "step": 10709 }, { "epoch": 4.872611464968153, "grad_norm": 0.2232785292408477, "learning_rate": 1.6122169345620387e-08, "loss": 0.0047, "step": 10710 }, { "epoch": 4.873066424021838, "grad_norm": 0.005717378256438027, "learning_rate": 1.6007686147225254e-08, "loss": 0.0, "step": 10711 }, { "epoch": 4.8735213830755235, "grad_norm": 0.13466244114942846, "learning_rate": 1.5893610218173927e-08, "loss": 0.0016, "step": 10712 }, { "epoch": 4.873976342129208, "grad_norm": 0.10334017862472267, "learning_rate": 1.5779941567789502e-08, "loss": 0.0018, "step": 10713 }, { "epoch": 4.874431301182893, "grad_norm": 0.11841673465487632, "learning_rate": 1.5666680205358997e-08, "loss": 0.0006, "step": 10714 }, { "epoch": 4.874886260236579, "grad_norm": 0.007082742929595013, "learning_rate": 1.555382614013834e-08, "loss": 0.0, "step": 10715 }, { "epoch": 4.875341219290264, "grad_norm": 0.572702655866008, "learning_rate": 1.5441379381349596e-08, "loss": 0.0063, "step": 10716 }, { "epoch": 4.875796178343949, "grad_norm": 0.03563803326572268, "learning_rate": 1.532933993818153e-08, "loss": 0.0004, "step": 10717 }, { "epoch": 4.8762511373976345, "grad_norm": 0.013811663458973661, "learning_rate": 1.521770781978904e-08, "loss": 0.0001, "step": 10718 }, { "epoch": 4.876706096451319, "grad_norm": 0.01900269442838308, "learning_rate": 1.5106483035294273e-08, "loss": 0.0001, "step": 10719 }, { "epoch": 4.877161055505004, "grad_norm": 0.1319220146787873, "learning_rate": 1.4995665593786623e-08, "loss": 0.0011, "step": 10720 }, { "epoch": 4.87761601455869, "grad_norm": 0.045356248448323006, "learning_rate": 1.4885255504320518e-08, "loss": 0.0003, "step": 10721 }, { "epoch": 4.878070973612375, "grad_norm": 0.07151111753363854, "learning_rate": 1.4775252775918735e-08, "loss": 0.0003, "step": 10722 }, { "epoch": 4.87852593266606, "grad_norm": 0.12191493144564972, "learning_rate": 1.4665657417570756e-08, "loss": 0.0017, "step": 10723 }, { "epoch": 4.8789808917197455, "grad_norm": 0.049663083426852536, "learning_rate": 1.455646943823108e-08, "loss": 0.0004, "step": 10724 }, { "epoch": 4.87943585077343, "grad_norm": 0.09088581518362969, "learning_rate": 1.4447688846823127e-08, "loss": 0.0002, "step": 10725 }, { "epoch": 4.879890809827115, "grad_norm": 0.05467995360871926, "learning_rate": 1.4339315652235342e-08, "loss": 0.0004, "step": 10726 }, { "epoch": 4.880345768880801, "grad_norm": 0.018031218064506877, "learning_rate": 1.4231349863323974e-08, "loss": 0.0001, "step": 10727 }, { "epoch": 4.880800727934486, "grad_norm": 0.09508558936694048, "learning_rate": 1.4123791488910298e-08, "loss": 0.0011, "step": 10728 }, { "epoch": 4.881255686988171, "grad_norm": 0.027658735700268045, "learning_rate": 1.4016640537785065e-08, "loss": 0.0002, "step": 10729 }, { "epoch": 4.8817106460418564, "grad_norm": 0.15881865302016243, "learning_rate": 1.3909897018702933e-08, "loss": 0.0016, "step": 10730 }, { "epoch": 4.882165605095541, "grad_norm": 0.0026267244981982947, "learning_rate": 1.3803560940387484e-08, "loss": 0.0, "step": 10731 }, { "epoch": 4.882620564149226, "grad_norm": 0.060817926472952026, "learning_rate": 1.3697632311527875e-08, "loss": 0.0004, "step": 10732 }, { "epoch": 4.883075523202912, "grad_norm": 0.1841841626934046, "learning_rate": 1.3592111140778852e-08, "loss": 0.0023, "step": 10733 }, { "epoch": 4.883530482256597, "grad_norm": 0.014369623887369782, "learning_rate": 1.3486997436765182e-08, "loss": 0.0001, "step": 10734 }, { "epoch": 4.883985441310282, "grad_norm": 0.26336033828275013, "learning_rate": 1.3382291208074438e-08, "loss": 0.0093, "step": 10735 }, { "epoch": 4.884440400363967, "grad_norm": 0.02609010432448341, "learning_rate": 1.3277992463263667e-08, "loss": 0.0002, "step": 10736 }, { "epoch": 4.884895359417652, "grad_norm": 0.07982288461427701, "learning_rate": 1.3174101210855495e-08, "loss": 0.0004, "step": 10737 }, { "epoch": 4.885350318471337, "grad_norm": 0.011918262397551215, "learning_rate": 1.3070617459339241e-08, "loss": 0.0001, "step": 10738 }, { "epoch": 4.885805277525023, "grad_norm": 0.021574047390517718, "learning_rate": 1.2967541217172031e-08, "loss": 0.0001, "step": 10739 }, { "epoch": 4.886260236578708, "grad_norm": 0.715000706337954, "learning_rate": 1.286487249277546e-08, "loss": 0.0035, "step": 10740 }, { "epoch": 4.886715195632393, "grad_norm": 0.06271671748684034, "learning_rate": 1.2762611294540039e-08, "loss": 0.0002, "step": 10741 }, { "epoch": 4.887170154686078, "grad_norm": 0.05085349904594647, "learning_rate": 1.2660757630821864e-08, "loss": 0.0004, "step": 10742 }, { "epoch": 4.887625113739763, "grad_norm": 0.04546335978300146, "learning_rate": 1.2559311509943717e-08, "loss": 0.0003, "step": 10743 }, { "epoch": 4.888080072793448, "grad_norm": 0.028395285989852505, "learning_rate": 1.2458272940196192e-08, "loss": 0.0001, "step": 10744 }, { "epoch": 4.888535031847134, "grad_norm": 0.0033841815149936692, "learning_rate": 1.2357641929834352e-08, "loss": 0.0, "step": 10745 }, { "epoch": 4.888989990900819, "grad_norm": 0.05363076803483577, "learning_rate": 1.2257418487082729e-08, "loss": 0.0002, "step": 10746 }, { "epoch": 4.889444949954504, "grad_norm": 0.04168604120548817, "learning_rate": 1.2157602620129772e-08, "loss": 0.0003, "step": 10747 }, { "epoch": 4.889899909008189, "grad_norm": 0.07072252764377125, "learning_rate": 1.2058194337132844e-08, "loss": 0.0003, "step": 10748 }, { "epoch": 4.890354868061874, "grad_norm": 0.14519807970393397, "learning_rate": 1.1959193646214895e-08, "loss": 0.0011, "step": 10749 }, { "epoch": 4.890809827115559, "grad_norm": 0.042013709190410066, "learning_rate": 1.1860600555465562e-08, "loss": 0.0002, "step": 10750 }, { "epoch": 4.891264786169245, "grad_norm": 0.24802709479249932, "learning_rate": 1.176241507294229e-08, "loss": 0.0025, "step": 10751 }, { "epoch": 4.89171974522293, "grad_norm": 0.00817385730034234, "learning_rate": 1.1664637206667551e-08, "loss": 0.0001, "step": 10752 }, { "epoch": 4.892174704276615, "grad_norm": 0.19669054475268896, "learning_rate": 1.1567266964631618e-08, "loss": 0.0012, "step": 10753 }, { "epoch": 4.8926296633303, "grad_norm": 0.039211262828398916, "learning_rate": 1.1470304354790906e-08, "loss": 0.0003, "step": 10754 }, { "epoch": 4.893084622383985, "grad_norm": 0.12240771752079796, "learning_rate": 1.1373749385069077e-08, "loss": 0.0005, "step": 10755 }, { "epoch": 4.893539581437671, "grad_norm": 0.020290106912240133, "learning_rate": 1.1277602063355931e-08, "loss": 0.0001, "step": 10756 }, { "epoch": 4.893994540491356, "grad_norm": 0.11284311477031184, "learning_rate": 1.118186239750796e-08, "loss": 0.0006, "step": 10757 }, { "epoch": 4.894449499545041, "grad_norm": 0.08064750308957072, "learning_rate": 1.108653039534946e-08, "loss": 0.0007, "step": 10758 }, { "epoch": 4.8949044585987265, "grad_norm": 0.2522100858702282, "learning_rate": 1.0991606064669758e-08, "loss": 0.0051, "step": 10759 }, { "epoch": 4.895359417652411, "grad_norm": 0.026213301617833414, "learning_rate": 1.0897089413225425e-08, "loss": 0.0003, "step": 10760 }, { "epoch": 4.895814376706096, "grad_norm": 0.08663157509145294, "learning_rate": 1.0802980448740841e-08, "loss": 0.0012, "step": 10761 }, { "epoch": 4.896269335759782, "grad_norm": 0.018054555073571746, "learning_rate": 1.0709279178905407e-08, "loss": 0.0, "step": 10762 }, { "epoch": 4.896724294813467, "grad_norm": 0.18121500112857056, "learning_rate": 1.061598561137689e-08, "loss": 0.0015, "step": 10763 }, { "epoch": 4.897179253867152, "grad_norm": 0.016914277131477078, "learning_rate": 1.0523099753777521e-08, "loss": 0.0001, "step": 10764 }, { "epoch": 4.8976342129208374, "grad_norm": 0.11112241061273488, "learning_rate": 1.04306216136979e-08, "loss": 0.0013, "step": 10765 }, { "epoch": 4.898089171974522, "grad_norm": 0.03919722996321961, "learning_rate": 1.0338551198695867e-08, "loss": 0.0003, "step": 10766 }, { "epoch": 4.898544131028207, "grad_norm": 0.08086588025026376, "learning_rate": 1.024688851629374e-08, "loss": 0.0007, "step": 10767 }, { "epoch": 4.898999090081893, "grad_norm": 0.2603661170814079, "learning_rate": 1.0155633573982748e-08, "loss": 0.0023, "step": 10768 }, { "epoch": 4.899454049135578, "grad_norm": 0.04321847487618736, "learning_rate": 1.0064786379219704e-08, "loss": 0.0004, "step": 10769 }, { "epoch": 4.899909008189263, "grad_norm": 0.11928253092338069, "learning_rate": 9.97434693942756e-09, "loss": 0.0015, "step": 10770 }, { "epoch": 4.900363967242948, "grad_norm": 0.05358769534493055, "learning_rate": 9.884315261997068e-09, "loss": 0.0009, "step": 10771 }, { "epoch": 4.900818926296633, "grad_norm": 0.02243894349302724, "learning_rate": 9.794691354285124e-09, "loss": 0.0002, "step": 10772 }, { "epoch": 4.901273885350318, "grad_norm": 0.027918111576004948, "learning_rate": 9.705475223615312e-09, "loss": 0.0001, "step": 10773 }, { "epoch": 4.901728844404004, "grad_norm": 0.012546244359984752, "learning_rate": 9.61666687727847e-09, "loss": 0.0001, "step": 10774 }, { "epoch": 4.902183803457689, "grad_norm": 0.1798864241798382, "learning_rate": 9.52826632253101e-09, "loss": 0.0008, "step": 10775 }, { "epoch": 4.902638762511374, "grad_norm": 0.14727979323852924, "learning_rate": 9.440273566597158e-09, "loss": 0.0005, "step": 10776 }, { "epoch": 4.903093721565059, "grad_norm": 0.07535203419993688, "learning_rate": 9.352688616666162e-09, "loss": 0.0005, "step": 10777 }, { "epoch": 4.903548680618744, "grad_norm": 0.01815432520636852, "learning_rate": 9.265511479896738e-09, "loss": 0.0001, "step": 10778 }, { "epoch": 4.904003639672429, "grad_norm": 0.10589616513103145, "learning_rate": 9.178742163411525e-09, "loss": 0.0006, "step": 10779 }, { "epoch": 4.904458598726115, "grad_norm": 0.18742818347964016, "learning_rate": 9.09238067430096e-09, "loss": 0.0028, "step": 10780 }, { "epoch": 4.9049135577798, "grad_norm": 0.06144831804196491, "learning_rate": 9.006427019622177e-09, "loss": 0.0008, "step": 10781 }, { "epoch": 4.905368516833485, "grad_norm": 0.02301187688956902, "learning_rate": 8.920881206399557e-09, "loss": 0.0002, "step": 10782 }, { "epoch": 4.90582347588717, "grad_norm": 0.06112372291766895, "learning_rate": 8.835743241622507e-09, "loss": 0.0005, "step": 10783 }, { "epoch": 4.906278434940855, "grad_norm": 0.018657109133345284, "learning_rate": 8.751013132249353e-09, "loss": 0.0001, "step": 10784 }, { "epoch": 4.90673339399454, "grad_norm": 0.007524553933591692, "learning_rate": 8.666690885202334e-09, "loss": 0.0, "step": 10785 }, { "epoch": 4.907188353048226, "grad_norm": 0.032113260561605114, "learning_rate": 8.582776507373158e-09, "loss": 0.0002, "step": 10786 }, { "epoch": 4.907643312101911, "grad_norm": 0.01615753480787327, "learning_rate": 8.49927000561801e-09, "loss": 0.0001, "step": 10787 }, { "epoch": 4.9080982711555965, "grad_norm": 0.028929233608338458, "learning_rate": 8.41617138676143e-09, "loss": 0.0001, "step": 10788 }, { "epoch": 4.908553230209281, "grad_norm": 0.013660513032169622, "learning_rate": 8.33348065759354e-09, "loss": 0.0001, "step": 10789 }, { "epoch": 4.909008189262966, "grad_norm": 0.02594513796927825, "learning_rate": 8.25119782487116e-09, "loss": 0.0001, "step": 10790 }, { "epoch": 4.909463148316652, "grad_norm": 0.02207079539334765, "learning_rate": 8.169322895318355e-09, "loss": 0.0001, "step": 10791 }, { "epoch": 4.909918107370337, "grad_norm": 0.23338166565559773, "learning_rate": 8.087855875625882e-09, "loss": 0.0012, "step": 10792 }, { "epoch": 4.910373066424022, "grad_norm": 0.010520331548954286, "learning_rate": 8.006796772450642e-09, "loss": 0.0001, "step": 10793 }, { "epoch": 4.9108280254777075, "grad_norm": 0.28465737810929514, "learning_rate": 7.926145592416224e-09, "loss": 0.0039, "step": 10794 }, { "epoch": 4.911282984531392, "grad_norm": 0.2073272336221534, "learning_rate": 7.84590234211291e-09, "loss": 0.0023, "step": 10795 }, { "epoch": 4.911737943585077, "grad_norm": 0.15812695561363363, "learning_rate": 7.766067028098235e-09, "loss": 0.0011, "step": 10796 }, { "epoch": 4.912192902638763, "grad_norm": 0.22877541380524213, "learning_rate": 7.686639656895867e-09, "loss": 0.0017, "step": 10797 }, { "epoch": 4.912647861692448, "grad_norm": 0.18298067885726269, "learning_rate": 7.607620234996171e-09, "loss": 0.0058, "step": 10798 }, { "epoch": 4.913102820746133, "grad_norm": 0.2620430675692336, "learning_rate": 7.529008768856205e-09, "loss": 0.0034, "step": 10799 }, { "epoch": 4.9135577797998184, "grad_norm": 0.07077031978506924, "learning_rate": 7.450805264900274e-09, "loss": 0.0002, "step": 10800 }, { "epoch": 4.914012738853503, "grad_norm": 0.19064616522197256, "learning_rate": 7.373009729517711e-09, "loss": 0.0016, "step": 10801 }, { "epoch": 4.914467697907188, "grad_norm": 0.05195369449440231, "learning_rate": 7.295622169066763e-09, "loss": 0.0003, "step": 10802 }, { "epoch": 4.914922656960874, "grad_norm": 0.22860573877078047, "learning_rate": 7.2186425898707054e-09, "loss": 0.0023, "step": 10803 }, { "epoch": 4.915377616014559, "grad_norm": 0.05639848286433454, "learning_rate": 7.1420709982200635e-09, "loss": 0.0002, "step": 10804 }, { "epoch": 4.915832575068244, "grad_norm": 0.26493778775102833, "learning_rate": 7.065907400371496e-09, "loss": 0.0027, "step": 10805 }, { "epoch": 4.916287534121929, "grad_norm": 0.25236633467669217, "learning_rate": 6.990151802549472e-09, "loss": 0.0049, "step": 10806 }, { "epoch": 4.916742493175614, "grad_norm": 0.026149374732556022, "learning_rate": 6.914804210943482e-09, "loss": 0.0002, "step": 10807 }, { "epoch": 4.917197452229299, "grad_norm": 0.05002565206471791, "learning_rate": 6.839864631711379e-09, "loss": 0.0004, "step": 10808 }, { "epoch": 4.917652411282985, "grad_norm": 0.011192264309064776, "learning_rate": 6.765333070976599e-09, "loss": 0.0001, "step": 10809 }, { "epoch": 4.91810737033667, "grad_norm": 0.05126323701447767, "learning_rate": 6.6912095348298235e-09, "loss": 0.0004, "step": 10810 }, { "epoch": 4.918562329390355, "grad_norm": 0.0017312084433422313, "learning_rate": 6.617494029327876e-09, "loss": 0.0, "step": 10811 }, { "epoch": 4.91901728844404, "grad_norm": 0.027907354540763343, "learning_rate": 6.544186560493715e-09, "loss": 0.0001, "step": 10812 }, { "epoch": 4.919472247497725, "grad_norm": 0.04023437156138533, "learning_rate": 6.471287134319215e-09, "loss": 0.0003, "step": 10813 }, { "epoch": 4.91992720655141, "grad_norm": 0.03178296590915491, "learning_rate": 6.398795756760168e-09, "loss": 0.0002, "step": 10814 }, { "epoch": 4.920382165605096, "grad_norm": 0.007707836611457692, "learning_rate": 6.326712433740167e-09, "loss": 0.0001, "step": 10815 }, { "epoch": 4.920837124658781, "grad_norm": 0.12920175452020327, "learning_rate": 6.255037171150613e-09, "loss": 0.0012, "step": 10816 }, { "epoch": 4.921292083712466, "grad_norm": 0.039799350772767195, "learning_rate": 6.18376997484793e-09, "loss": 0.0001, "step": 10817 }, { "epoch": 4.921747042766151, "grad_norm": 0.12165495009208412, "learning_rate": 6.112910850655241e-09, "loss": 0.0005, "step": 10818 }, { "epoch": 4.922202001819836, "grad_norm": 0.004964290701576096, "learning_rate": 6.042459804363465e-09, "loss": 0.0, "step": 10819 }, { "epoch": 4.922656960873521, "grad_norm": 0.061223655247116496, "learning_rate": 5.972416841729667e-09, "loss": 0.001, "step": 10820 }, { "epoch": 4.923111919927207, "grad_norm": 0.014053133070010092, "learning_rate": 5.90278196847649e-09, "loss": 0.0002, "step": 10821 }, { "epoch": 4.923566878980892, "grad_norm": 0.03397882153132436, "learning_rate": 5.833555190295492e-09, "loss": 0.0001, "step": 10822 }, { "epoch": 4.924021838034577, "grad_norm": 0.01571616445429705, "learning_rate": 5.7647365128421505e-09, "loss": 0.0001, "step": 10823 }, { "epoch": 4.924476797088262, "grad_norm": 0.1746286142178768, "learning_rate": 5.69632594174141e-09, "loss": 0.0026, "step": 10824 }, { "epoch": 4.924931756141947, "grad_norm": 0.08845289616129574, "learning_rate": 5.628323482582687e-09, "loss": 0.0005, "step": 10825 }, { "epoch": 4.925386715195632, "grad_norm": 0.3738143161808278, "learning_rate": 5.560729140923205e-09, "loss": 0.001, "step": 10826 }, { "epoch": 4.925841674249318, "grad_norm": 0.05370870855382171, "learning_rate": 5.493542922285766e-09, "loss": 0.0004, "step": 10827 }, { "epoch": 4.926296633303003, "grad_norm": 0.05670696706068442, "learning_rate": 5.42676483216098e-09, "loss": 0.0005, "step": 10828 }, { "epoch": 4.926751592356688, "grad_norm": 0.038288095612642, "learning_rate": 5.360394876006147e-09, "loss": 0.0007, "step": 10829 }, { "epoch": 4.927206551410373, "grad_norm": 0.1068322658937891, "learning_rate": 5.294433059244153e-09, "loss": 0.0011, "step": 10830 }, { "epoch": 4.927661510464058, "grad_norm": 0.5060349838924039, "learning_rate": 5.228879387265129e-09, "loss": 0.0033, "step": 10831 }, { "epoch": 4.928116469517743, "grad_norm": 0.02030995797552303, "learning_rate": 5.163733865425901e-09, "loss": 0.0002, "step": 10832 }, { "epoch": 4.928571428571429, "grad_norm": 0.07112347018491112, "learning_rate": 5.098996499049991e-09, "loss": 0.0007, "step": 10833 }, { "epoch": 4.929026387625114, "grad_norm": 0.0193514930291762, "learning_rate": 5.034667293427053e-09, "loss": 0.0002, "step": 10834 }, { "epoch": 4.929481346678799, "grad_norm": 0.23225264723242176, "learning_rate": 4.970746253813996e-09, "loss": 0.0043, "step": 10835 }, { "epoch": 4.929936305732484, "grad_norm": 0.014820280229914546, "learning_rate": 4.907233385434418e-09, "loss": 0.0001, "step": 10836 }, { "epoch": 4.930391264786169, "grad_norm": 0.12370771478270444, "learning_rate": 4.844128693477501e-09, "loss": 0.0014, "step": 10837 }, { "epoch": 4.930846223839854, "grad_norm": 0.0367160626774886, "learning_rate": 4.781432183101342e-09, "loss": 0.0001, "step": 10838 }, { "epoch": 4.93130118289354, "grad_norm": 0.016838456169988348, "learning_rate": 4.719143859427955e-09, "loss": 0.0001, "step": 10839 }, { "epoch": 4.931756141947225, "grad_norm": 0.036085715577643944, "learning_rate": 4.657263727547711e-09, "loss": 0.0001, "step": 10840 }, { "epoch": 4.9322111010009095, "grad_norm": 0.22943523792000506, "learning_rate": 4.595791792516568e-09, "loss": 0.0033, "step": 10841 }, { "epoch": 4.932666060054595, "grad_norm": 0.025290566720896922, "learning_rate": 4.53472805935884e-09, "loss": 0.0002, "step": 10842 }, { "epoch": 4.93312101910828, "grad_norm": 0.13102769572946452, "learning_rate": 4.4740725330638666e-09, "loss": 0.0019, "step": 10843 }, { "epoch": 4.933575978161965, "grad_norm": 0.05022091444162789, "learning_rate": 4.413825218587686e-09, "loss": 0.0003, "step": 10844 }, { "epoch": 4.934030937215651, "grad_norm": 0.003077427005291513, "learning_rate": 4.353986120854137e-09, "loss": 0.0, "step": 10845 }, { "epoch": 4.934485896269336, "grad_norm": 0.008720078645919371, "learning_rate": 4.29455524475264e-09, "loss": 0.0001, "step": 10846 }, { "epoch": 4.9349408553230205, "grad_norm": 0.16518049174005775, "learning_rate": 4.235532595139869e-09, "loss": 0.0019, "step": 10847 }, { "epoch": 4.935395814376706, "grad_norm": 0.019422074116871902, "learning_rate": 4.176918176838629e-09, "loss": 0.0002, "step": 10848 }, { "epoch": 4.935850773430391, "grad_norm": 0.011027263694848586, "learning_rate": 4.118711994638425e-09, "loss": 0.0001, "step": 10849 }, { "epoch": 4.936305732484076, "grad_norm": 0.21621847576582834, "learning_rate": 4.060914053296006e-09, "loss": 0.0009, "step": 10850 }, { "epoch": 4.936760691537762, "grad_norm": 0.04274947221203345, "learning_rate": 4.003524357534261e-09, "loss": 0.0003, "step": 10851 }, { "epoch": 4.937215650591447, "grad_norm": 0.10213078578251676, "learning_rate": 3.946542912042772e-09, "loss": 0.0013, "step": 10852 }, { "epoch": 4.9376706096451315, "grad_norm": 0.07689077244047676, "learning_rate": 3.889969721478371e-09, "loss": 0.0001, "step": 10853 }, { "epoch": 4.938125568698817, "grad_norm": 0.048040071492861755, "learning_rate": 3.833804790462359e-09, "loss": 0.0002, "step": 10854 }, { "epoch": 4.938580527752502, "grad_norm": 0.09687551005824657, "learning_rate": 3.778048123586065e-09, "loss": 0.0012, "step": 10855 }, { "epoch": 4.939035486806187, "grad_norm": 0.021983895893309062, "learning_rate": 3.7226997254047327e-09, "loss": 0.0001, "step": 10856 }, { "epoch": 4.939490445859873, "grad_norm": 0.06693452549745987, "learning_rate": 3.6677596004414117e-09, "loss": 0.0002, "step": 10857 }, { "epoch": 4.939945404913558, "grad_norm": 0.09654262145343878, "learning_rate": 3.613227753185289e-09, "loss": 0.0006, "step": 10858 }, { "epoch": 4.9404003639672425, "grad_norm": 0.15247258419677492, "learning_rate": 3.559104188093354e-09, "loss": 0.0022, "step": 10859 }, { "epoch": 4.940855323020928, "grad_norm": 0.026876130335081058, "learning_rate": 3.505388909587071e-09, "loss": 0.0003, "step": 10860 }, { "epoch": 4.941310282074613, "grad_norm": 0.19408218438570815, "learning_rate": 3.452081922056816e-09, "loss": 0.0017, "step": 10861 }, { "epoch": 4.941765241128298, "grad_norm": 0.047793442137234785, "learning_rate": 3.399183229857994e-09, "loss": 0.0003, "step": 10862 }, { "epoch": 4.942220200181984, "grad_norm": 0.0987802894995168, "learning_rate": 3.346692837313259e-09, "loss": 0.0005, "step": 10863 }, { "epoch": 4.942675159235669, "grad_norm": 0.17813226843398383, "learning_rate": 3.2946107487125123e-09, "loss": 0.0024, "step": 10864 }, { "epoch": 4.943130118289354, "grad_norm": 0.19607491027369808, "learning_rate": 3.2429369683112387e-09, "loss": 0.001, "step": 10865 }, { "epoch": 4.943585077343039, "grad_norm": 0.008386982738270956, "learning_rate": 3.1916715003316168e-09, "loss": 0.0001, "step": 10866 }, { "epoch": 4.944040036396724, "grad_norm": 0.03477479658888475, "learning_rate": 3.1408143489636277e-09, "loss": 0.0002, "step": 10867 }, { "epoch": 4.94449499545041, "grad_norm": 0.020096416631453284, "learning_rate": 3.0903655183622815e-09, "loss": 0.0001, "step": 10868 }, { "epoch": 4.944949954504095, "grad_norm": 0.0636729296061624, "learning_rate": 3.040325012650391e-09, "loss": 0.0003, "step": 10869 }, { "epoch": 4.94540491355778, "grad_norm": 0.011589650278424997, "learning_rate": 2.9906928359174637e-09, "loss": 0.0001, "step": 10870 }, { "epoch": 4.945859872611465, "grad_norm": 0.09220098889108387, "learning_rate": 2.9414689922185878e-09, "loss": 0.001, "step": 10871 }, { "epoch": 4.94631483166515, "grad_norm": 0.0433916007927223, "learning_rate": 2.8926534855766574e-09, "loss": 0.0003, "step": 10872 }, { "epoch": 4.946769790718835, "grad_norm": 0.022197431471557245, "learning_rate": 2.8442463199801486e-09, "loss": 0.0001, "step": 10873 }, { "epoch": 4.947224749772521, "grad_norm": 0.02253388670119882, "learning_rate": 2.7962474993847854e-09, "loss": 0.0002, "step": 10874 }, { "epoch": 4.947679708826206, "grad_norm": 0.04235227552882244, "learning_rate": 2.7486570277129863e-09, "loss": 0.0003, "step": 10875 }, { "epoch": 4.9481346678798905, "grad_norm": 0.08683947003503448, "learning_rate": 2.7014749088533076e-09, "loss": 0.0006, "step": 10876 }, { "epoch": 4.948589626933576, "grad_norm": 0.11018071259813766, "learning_rate": 2.6547011466621086e-09, "loss": 0.0006, "step": 10877 }, { "epoch": 4.949044585987261, "grad_norm": 0.06894228889772958, "learning_rate": 2.608335744960222e-09, "loss": 0.0007, "step": 10878 }, { "epoch": 4.949499545040946, "grad_norm": 0.04483257874866047, "learning_rate": 2.5623787075373942e-09, "loss": 0.0004, "step": 10879 }, { "epoch": 4.949954504094632, "grad_norm": 0.003995904954043218, "learning_rate": 2.5168300381489543e-09, "loss": 0.0, "step": 10880 }, { "epoch": 4.950409463148317, "grad_norm": 0.056300930291868535, "learning_rate": 2.47168974051637e-09, "loss": 0.0003, "step": 10881 }, { "epoch": 4.9508644222020015, "grad_norm": 0.03559941118864991, "learning_rate": 2.4269578183289122e-09, "loss": 0.0002, "step": 10882 }, { "epoch": 4.951319381255687, "grad_norm": 0.019519216013765082, "learning_rate": 2.3826342752414356e-09, "loss": 0.0001, "step": 10883 }, { "epoch": 4.951774340309372, "grad_norm": 0.05904804916348565, "learning_rate": 2.3387191148765977e-09, "loss": 0.0005, "step": 10884 }, { "epoch": 4.952229299363057, "grad_norm": 0.012337956333497945, "learning_rate": 2.295212340821529e-09, "loss": 0.0001, "step": 10885 }, { "epoch": 4.952684258416743, "grad_norm": 0.010984977910480666, "learning_rate": 2.252113956632829e-09, "loss": 0.0001, "step": 10886 }, { "epoch": 4.953139217470428, "grad_norm": 0.04338379930075229, "learning_rate": 2.20942396583157e-09, "loss": 0.0003, "step": 10887 }, { "epoch": 4.9535941765241125, "grad_norm": 0.030292489891455613, "learning_rate": 2.1671423719066278e-09, "loss": 0.0002, "step": 10888 }, { "epoch": 4.954049135577798, "grad_norm": 0.030271013081185478, "learning_rate": 2.125269178312461e-09, "loss": 0.0003, "step": 10889 }, { "epoch": 4.954504094631483, "grad_norm": 0.09019745343520717, "learning_rate": 2.083804388471333e-09, "loss": 0.0004, "step": 10890 }, { "epoch": 4.954959053685168, "grad_norm": 0.17242104413248077, "learning_rate": 2.042748005771089e-09, "loss": 0.0009, "step": 10891 }, { "epoch": 4.955414012738854, "grad_norm": 0.06840531488336897, "learning_rate": 2.002100033567378e-09, "loss": 0.0007, "step": 10892 }, { "epoch": 4.955868971792539, "grad_norm": 0.016322779731294198, "learning_rate": 1.961860475180877e-09, "loss": 0.0001, "step": 10893 }, { "epoch": 4.9563239308462235, "grad_norm": 0.06690903236842381, "learning_rate": 1.9220293339000664e-09, "loss": 0.0004, "step": 10894 }, { "epoch": 4.956778889899909, "grad_norm": 0.0697696982448244, "learning_rate": 1.88260661298012e-09, "loss": 0.0006, "step": 10895 }, { "epoch": 4.957233848953594, "grad_norm": 0.13053152328750317, "learning_rate": 1.8435923156423507e-09, "loss": 0.0012, "step": 10896 }, { "epoch": 4.957688808007279, "grad_norm": 0.019426877670999442, "learning_rate": 1.8049864450742082e-09, "loss": 0.0002, "step": 10897 }, { "epoch": 4.958143767060965, "grad_norm": 0.09106577699100912, "learning_rate": 1.7667890044315018e-09, "loss": 0.0003, "step": 10898 }, { "epoch": 4.95859872611465, "grad_norm": 0.021244645337390455, "learning_rate": 1.728999996835068e-09, "loss": 0.0001, "step": 10899 }, { "epoch": 4.959053685168335, "grad_norm": 0.04205108379292617, "learning_rate": 1.6916194253724373e-09, "loss": 0.0004, "step": 10900 }, { "epoch": 4.95950864422202, "grad_norm": 0.11183715393218424, "learning_rate": 1.6546472930983882e-09, "loss": 0.0014, "step": 10901 }, { "epoch": 4.959963603275705, "grad_norm": 0.029860054735906208, "learning_rate": 1.6180836030343927e-09, "loss": 0.0002, "step": 10902 }, { "epoch": 4.960418562329391, "grad_norm": 0.011092991434746928, "learning_rate": 1.5819283581680612e-09, "loss": 0.0001, "step": 10903 }, { "epoch": 4.960873521383076, "grad_norm": 0.019774998805003913, "learning_rate": 1.5461815614542518e-09, "loss": 0.0, "step": 10904 }, { "epoch": 4.961328480436761, "grad_norm": 0.17414117254728018, "learning_rate": 1.5108432158134067e-09, "loss": 0.0004, "step": 10905 }, { "epoch": 4.961783439490446, "grad_norm": 0.04889916158230268, "learning_rate": 1.4759133241332158e-09, "loss": 0.0004, "step": 10906 }, { "epoch": 4.962238398544131, "grad_norm": 0.09811233631515277, "learning_rate": 1.4413918892686173e-09, "loss": 0.0011, "step": 10907 }, { "epoch": 4.962693357597816, "grad_norm": 0.023526199552622272, "learning_rate": 1.4072789140401334e-09, "loss": 0.0001, "step": 10908 }, { "epoch": 4.963148316651502, "grad_norm": 0.004126976879121829, "learning_rate": 1.3735744012349784e-09, "loss": 0.0, "step": 10909 }, { "epoch": 4.963603275705187, "grad_norm": 0.15541283909576037, "learning_rate": 1.3402783536081709e-09, "loss": 0.0027, "step": 10910 }, { "epoch": 4.9640582347588715, "grad_norm": 0.1000791848380999, "learning_rate": 1.3073907738797576e-09, "loss": 0.0008, "step": 10911 }, { "epoch": 4.964513193812557, "grad_norm": 0.10996003020944184, "learning_rate": 1.274911664737588e-09, "loss": 0.0008, "step": 10912 }, { "epoch": 4.964968152866242, "grad_norm": 0.08669328113412138, "learning_rate": 1.2428410288356507e-09, "loss": 0.0005, "step": 10913 }, { "epoch": 4.965423111919927, "grad_norm": 0.005481736030375833, "learning_rate": 1.211178868794627e-09, "loss": 0.0, "step": 10914 }, { "epoch": 4.965878070973613, "grad_norm": 0.10797169484938719, "learning_rate": 1.1799251872018913e-09, "loss": 0.0004, "step": 10915 }, { "epoch": 4.966333030027298, "grad_norm": 0.05727548922614798, "learning_rate": 1.149079986610957e-09, "loss": 0.0006, "step": 10916 }, { "epoch": 4.9667879890809825, "grad_norm": 0.12283514808926797, "learning_rate": 1.1186432695425853e-09, "loss": 0.0013, "step": 10917 }, { "epoch": 4.967242948134668, "grad_norm": 0.10474177473166287, "learning_rate": 1.0886150384836757e-09, "loss": 0.0008, "step": 10918 }, { "epoch": 4.967697907188353, "grad_norm": 0.45528417992192965, "learning_rate": 1.0589952958883765e-09, "loss": 0.011, "step": 10919 }, { "epoch": 4.968152866242038, "grad_norm": 0.42849149071166887, "learning_rate": 1.0297840441775286e-09, "loss": 0.0078, "step": 10920 }, { "epoch": 4.968607825295724, "grad_norm": 0.05487623026516873, "learning_rate": 1.0009812857370016e-09, "loss": 0.0004, "step": 10921 }, { "epoch": 4.969062784349409, "grad_norm": 0.14756002729907117, "learning_rate": 9.725870229210233e-10, "loss": 0.0021, "step": 10922 }, { "epoch": 4.9695177434030935, "grad_norm": 0.019228895014081548, "learning_rate": 9.446012580499598e-10, "loss": 0.0001, "step": 10923 }, { "epoch": 4.969972702456779, "grad_norm": 0.39566863765498095, "learning_rate": 9.170239934108705e-10, "loss": 0.0016, "step": 10924 }, { "epoch": 4.970427661510464, "grad_norm": 0.06575653520369153, "learning_rate": 8.898552312563979e-10, "loss": 0.0003, "step": 10925 }, { "epoch": 4.970882620564149, "grad_norm": 0.3085913950663694, "learning_rate": 8.630949738075434e-10, "loss": 0.0047, "step": 10926 }, { "epoch": 4.971337579617835, "grad_norm": 0.014646130065314109, "learning_rate": 8.367432232503358e-10, "loss": 0.0001, "step": 10927 }, { "epoch": 4.97179253867152, "grad_norm": 0.038226524376258605, "learning_rate": 8.107999817386081e-10, "loss": 0.0002, "step": 10928 }, { "epoch": 4.9722474977252045, "grad_norm": 0.06293464331183735, "learning_rate": 7.852652513923309e-10, "loss": 0.0007, "step": 10929 }, { "epoch": 4.97270245677889, "grad_norm": 0.1470405363346794, "learning_rate": 7.601390342976134e-10, "loss": 0.0032, "step": 10930 }, { "epoch": 4.973157415832575, "grad_norm": 0.015051105141409322, "learning_rate": 7.354213325083681e-10, "loss": 0.0001, "step": 10931 }, { "epoch": 4.97361237488626, "grad_norm": 0.010646691003337176, "learning_rate": 7.111121480435357e-10, "loss": 0.0001, "step": 10932 }, { "epoch": 4.974067333939946, "grad_norm": 0.0341791703100183, "learning_rate": 6.872114828904153e-10, "loss": 0.0004, "step": 10933 }, { "epoch": 4.974522292993631, "grad_norm": 0.264606686208059, "learning_rate": 6.637193390013341e-10, "loss": 0.0005, "step": 10934 }, { "epoch": 4.9749772520473154, "grad_norm": 0.13725819866682398, "learning_rate": 6.40635718296978e-10, "loss": 0.0021, "step": 10935 }, { "epoch": 4.975432211101001, "grad_norm": 0.1478755800196532, "learning_rate": 6.179606226625057e-10, "loss": 0.0007, "step": 10936 }, { "epoch": 4.975887170154686, "grad_norm": 0.09876605250435293, "learning_rate": 5.956940539508793e-10, "loss": 0.0006, "step": 10937 }, { "epoch": 4.976342129208371, "grad_norm": 0.031755628007786495, "learning_rate": 5.738360139823095e-10, "loss": 0.0003, "step": 10938 }, { "epoch": 4.976797088262057, "grad_norm": 0.32826987217487963, "learning_rate": 5.523865045431454e-10, "loss": 0.0026, "step": 10939 }, { "epoch": 4.977252047315742, "grad_norm": 0.023699946931260252, "learning_rate": 5.313455273853186e-10, "loss": 0.0001, "step": 10940 }, { "epoch": 4.977707006369426, "grad_norm": 0.026987154670057748, "learning_rate": 5.107130842280095e-10, "loss": 0.0001, "step": 10941 }, { "epoch": 4.978161965423112, "grad_norm": 0.03417856109375883, "learning_rate": 4.90489176758202e-10, "loss": 0.0002, "step": 10942 }, { "epoch": 4.978616924476797, "grad_norm": 0.04903854780681204, "learning_rate": 4.706738066284633e-10, "loss": 0.0006, "step": 10943 }, { "epoch": 4.979071883530482, "grad_norm": 0.015288818221339824, "learning_rate": 4.512669754569432e-10, "loss": 0.0001, "step": 10944 }, { "epoch": 4.979526842584168, "grad_norm": 0.09383898237143956, "learning_rate": 4.322686848301505e-10, "loss": 0.001, "step": 10945 }, { "epoch": 4.9799818016378525, "grad_norm": 0.0619081475623028, "learning_rate": 4.136789363007321e-10, "loss": 0.0009, "step": 10946 }, { "epoch": 4.980436760691537, "grad_norm": 0.33728053773709143, "learning_rate": 3.9549773138747305e-10, "loss": 0.0017, "step": 10947 }, { "epoch": 4.980891719745223, "grad_norm": 0.16869697246206847, "learning_rate": 3.777250715764069e-10, "loss": 0.001, "step": 10948 }, { "epoch": 4.981346678798908, "grad_norm": 0.029668207853564724, "learning_rate": 3.603609583191503e-10, "loss": 0.0002, "step": 10949 }, { "epoch": 4.981801637852593, "grad_norm": 0.006061547052200015, "learning_rate": 3.434053930351233e-10, "loss": 0.0, "step": 10950 }, { "epoch": 4.982256596906279, "grad_norm": 0.1312349524258168, "learning_rate": 3.268583771098843e-10, "loss": 0.0015, "step": 10951 }, { "epoch": 4.9827115559599635, "grad_norm": 0.04316970971705657, "learning_rate": 3.107199118951298e-10, "loss": 0.0003, "step": 10952 }, { "epoch": 4.983166515013648, "grad_norm": 0.0031078713161544597, "learning_rate": 2.9498999871035996e-10, "loss": 0.0, "step": 10953 }, { "epoch": 4.983621474067334, "grad_norm": 0.04089414242832978, "learning_rate": 2.7966863884010265e-10, "loss": 0.0003, "step": 10954 }, { "epoch": 4.984076433121019, "grad_norm": 0.013206372443343982, "learning_rate": 2.6475583353724465e-10, "loss": 0.0001, "step": 10955 }, { "epoch": 4.984531392174704, "grad_norm": 0.132860777657425, "learning_rate": 2.502515840197006e-10, "loss": 0.0013, "step": 10956 }, { "epoch": 4.98498635122839, "grad_norm": 0.009705850360911157, "learning_rate": 2.3615589147318875e-10, "loss": 0.0, "step": 10957 }, { "epoch": 4.9854413102820745, "grad_norm": 0.013041234567280421, "learning_rate": 2.2246875704901028e-10, "loss": 0.0001, "step": 10958 }, { "epoch": 4.985896269335759, "grad_norm": 0.07387909375031795, "learning_rate": 2.0919018186571495e-10, "loss": 0.0004, "step": 10959 }, { "epoch": 4.986351228389445, "grad_norm": 0.17894333002034335, "learning_rate": 1.9632016700910085e-10, "loss": 0.0016, "step": 10960 }, { "epoch": 4.98680618744313, "grad_norm": 0.024349979545171582, "learning_rate": 1.8385871352999408e-10, "loss": 0.0001, "step": 10961 }, { "epoch": 4.987261146496815, "grad_norm": 0.15367883750048947, "learning_rate": 1.7180582244646916e-10, "loss": 0.0008, "step": 10962 }, { "epoch": 4.987716105550501, "grad_norm": 0.06784418068375989, "learning_rate": 1.6016149474440412e-10, "loss": 0.0006, "step": 10963 }, { "epoch": 4.9881710646041855, "grad_norm": 0.0072818312395469785, "learning_rate": 1.4892573137526012e-10, "loss": 0.0, "step": 10964 }, { "epoch": 4.98862602365787, "grad_norm": 0.006616929129499675, "learning_rate": 1.3809853325608136e-10, "loss": 0.0, "step": 10965 }, { "epoch": 4.989080982711556, "grad_norm": 0.30928026028322436, "learning_rate": 1.276799012728258e-10, "loss": 0.005, "step": 10966 }, { "epoch": 4.989535941765241, "grad_norm": 0.03938748329528109, "learning_rate": 1.1766983627592432e-10, "loss": 0.0001, "step": 10967 }, { "epoch": 4.989990900818926, "grad_norm": 0.08734276496520511, "learning_rate": 1.0806833908416636e-10, "loss": 0.0003, "step": 10968 }, { "epoch": 4.990445859872612, "grad_norm": 0.004440189874590207, "learning_rate": 9.88754104813694e-11, "loss": 0.0, "step": 10969 }, { "epoch": 4.9909008189262964, "grad_norm": 0.04872669507561509, "learning_rate": 9.009105121970951e-11, "loss": 0.0003, "step": 10970 }, { "epoch": 4.991355777979981, "grad_norm": 0.0887063928171887, "learning_rate": 8.171526201583568e-11, "loss": 0.0011, "step": 10971 }, { "epoch": 4.991810737033667, "grad_norm": 0.039752157915562254, "learning_rate": 7.374804355475551e-11, "loss": 0.0004, "step": 10972 }, { "epoch": 4.992265696087352, "grad_norm": 0.12837500144845662, "learning_rate": 6.618939648816991e-11, "loss": 0.0006, "step": 10973 }, { "epoch": 4.992720655141038, "grad_norm": 0.1408038910303928, "learning_rate": 5.903932143225266e-11, "loss": 0.0017, "step": 10974 }, { "epoch": 4.9931756141947226, "grad_norm": 0.10664898869023143, "learning_rate": 5.229781897264641e-11, "loss": 0.0009, "step": 10975 }, { "epoch": 4.993630573248407, "grad_norm": 0.4310837705795608, "learning_rate": 4.596488965946666e-11, "loss": 0.0015, "step": 10976 }, { "epoch": 4.994085532302093, "grad_norm": 0.029038896366586554, "learning_rate": 4.004053401063246e-11, "loss": 0.0003, "step": 10977 }, { "epoch": 4.994540491355778, "grad_norm": 0.014178700769927291, "learning_rate": 3.452475250964593e-11, "loss": 0.0, "step": 10978 }, { "epoch": 4.994995450409463, "grad_norm": 0.006184474316448662, "learning_rate": 2.941754560836785e-11, "loss": 0.0, "step": 10979 }, { "epoch": 4.995450409463149, "grad_norm": 0.005560398834121418, "learning_rate": 2.4718913723131844e-11, "loss": 0.0, "step": 10980 }, { "epoch": 4.9959053685168335, "grad_norm": 0.029376756478942567, "learning_rate": 2.0428857237519973e-11, "loss": 0.0001, "step": 10981 }, { "epoch": 4.996360327570518, "grad_norm": 0.01022189036277154, "learning_rate": 1.6547376503472935e-11, "loss": 0.0001, "step": 10982 }, { "epoch": 4.996815286624204, "grad_norm": 0.08052039973495588, "learning_rate": 1.307447183740429e-11, "loss": 0.0008, "step": 10983 }, { "epoch": 4.997270245677889, "grad_norm": 0.018650018535295354, "learning_rate": 1.0010143522976024e-11, "loss": 0.0001, "step": 10984 }, { "epoch": 4.997725204731574, "grad_norm": 0.1179079506078607, "learning_rate": 7.354391810543426e-12, "loss": 0.0015, "step": 10985 }, { "epoch": 4.99818016378526, "grad_norm": 0.14276244877351496, "learning_rate": 5.1072169177102115e-12, "loss": 0.0021, "step": 10986 }, { "epoch": 4.9986351228389445, "grad_norm": 0.16779471084499145, "learning_rate": 3.268619027663178e-12, "loss": 0.0016, "step": 10987 }, { "epoch": 4.999090081892629, "grad_norm": 0.010458276110439946, "learning_rate": 1.8385982908375454e-12, "loss": 0.0001, "step": 10988 }, { "epoch": 4.999545040946315, "grad_norm": 0.03923457835929189, "learning_rate": 8.171548238067318e-13, "loss": 0.0004, "step": 10989 }, { "epoch": 5.0, "grad_norm": 0.010678056863082914, "learning_rate": 2.0428870983746353e-13, "loss": 0.0001, "step": 10990 }, { "epoch": 5.0, "step": 10990, "total_flos": 72288269254656.0, "train_loss": 0.026681415885462554, "train_runtime": 13806.7713, "train_samples_per_second": 3.183, "train_steps_per_second": 0.796 } ], "logging_steps": 1, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 555, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 72288269254656.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }