{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999581887360455, "eval_steps": 500, "global_step": 5979, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016724505581803737, "grad_norm": 2.9741883277893066, "learning_rate": 8.361204013377927e-08, "loss": 3.3056, "step": 1 }, { "epoch": 0.00033449011163607474, "grad_norm": 2.948265552520752, "learning_rate": 1.6722408026755853e-07, "loss": 3.5845, "step": 2 }, { "epoch": 0.0005017351674541121, "grad_norm": 3.488875389099121, "learning_rate": 2.508361204013378e-07, "loss": 3.5622, "step": 3 }, { "epoch": 0.0006689802232721495, "grad_norm": 5.2428059577941895, "learning_rate": 3.3444816053511706e-07, "loss": 3.9035, "step": 4 }, { "epoch": 0.0008362252790901868, "grad_norm": 2.8028392791748047, "learning_rate": 4.180602006688963e-07, "loss": 3.6025, "step": 5 }, { "epoch": 0.0010034703349082242, "grad_norm": 3.041893482208252, "learning_rate": 5.016722408026756e-07, "loss": 3.9053, "step": 6 }, { "epoch": 0.0011707153907262616, "grad_norm": 3.2136476039886475, "learning_rate": 5.852842809364548e-07, "loss": 3.5437, "step": 7 }, { "epoch": 0.001337960446544299, "grad_norm": 3.5519180297851562, "learning_rate": 6.688963210702341e-07, "loss": 3.6073, "step": 8 }, { "epoch": 0.0015052055023623363, "grad_norm": 3.5796613693237305, "learning_rate": 7.525083612040134e-07, "loss": 3.4521, "step": 9 }, { "epoch": 0.0016724505581803737, "grad_norm": 2.9603819847106934, "learning_rate": 8.361204013377926e-07, "loss": 3.7029, "step": 10 }, { "epoch": 0.0018396956139984113, "grad_norm": 2.6949944496154785, "learning_rate": 9.19732441471572e-07, "loss": 3.615, "step": 11 }, { "epoch": 0.0020069406698164484, "grad_norm": 2.5304205417633057, "learning_rate": 1.0033444816053512e-06, "loss": 3.0973, "step": 12 }, { "epoch": 0.002174185725634486, "grad_norm": 2.498004913330078, "learning_rate": 1.0869565217391306e-06, "loss": 3.3985, "step": 13 }, { "epoch": 0.002341430781452523, "grad_norm": 3.7959530353546143, "learning_rate": 1.1705685618729096e-06, "loss": 4.1439, "step": 14 }, { "epoch": 0.0025086758372705607, "grad_norm": 2.3117644786834717, "learning_rate": 1.254180602006689e-06, "loss": 3.3801, "step": 15 }, { "epoch": 0.002675920893088598, "grad_norm": 2.679330825805664, "learning_rate": 1.3377926421404683e-06, "loss": 3.6888, "step": 16 }, { "epoch": 0.0028431659489066355, "grad_norm": 2.755466938018799, "learning_rate": 1.4214046822742476e-06, "loss": 3.3999, "step": 17 }, { "epoch": 0.0030104110047246726, "grad_norm": 2.766191005706787, "learning_rate": 1.5050167224080269e-06, "loss": 3.573, "step": 18 }, { "epoch": 0.0031776560605427102, "grad_norm": 3.1202306747436523, "learning_rate": 1.5886287625418062e-06, "loss": 3.0202, "step": 19 }, { "epoch": 0.0033449011163607474, "grad_norm": 2.2878005504608154, "learning_rate": 1.6722408026755853e-06, "loss": 2.9893, "step": 20 }, { "epoch": 0.003512146172178785, "grad_norm": 4.4621171951293945, "learning_rate": 1.7558528428093646e-06, "loss": 4.2294, "step": 21 }, { "epoch": 0.0036793912279968225, "grad_norm": 3.1286187171936035, "learning_rate": 1.839464882943144e-06, "loss": 3.5128, "step": 22 }, { "epoch": 0.0038466362838148597, "grad_norm": 3.992309093475342, "learning_rate": 1.9230769230769234e-06, "loss": 4.2942, "step": 23 }, { "epoch": 0.004013881339632897, "grad_norm": 3.2530720233917236, "learning_rate": 2.0066889632107025e-06, "loss": 3.4474, "step": 24 }, { "epoch": 0.0041811263954509344, "grad_norm": 3.5445921421051025, "learning_rate": 2.0903010033444816e-06, "loss": 3.657, "step": 25 }, { "epoch": 0.004348371451268972, "grad_norm": 2.6539976596832275, "learning_rate": 2.173913043478261e-06, "loss": 3.4162, "step": 26 }, { "epoch": 0.00451561650708701, "grad_norm": 3.0370452404022217, "learning_rate": 2.25752508361204e-06, "loss": 3.6247, "step": 27 }, { "epoch": 0.004682861562905046, "grad_norm": 2.786710262298584, "learning_rate": 2.3411371237458193e-06, "loss": 3.0844, "step": 28 }, { "epoch": 0.004850106618723084, "grad_norm": 4.03367805480957, "learning_rate": 2.424749163879599e-06, "loss": 3.3473, "step": 29 }, { "epoch": 0.0050173516745411215, "grad_norm": 1.9546233415603638, "learning_rate": 2.508361204013378e-06, "loss": 3.2747, "step": 30 }, { "epoch": 0.005184596730359159, "grad_norm": 2.4446868896484375, "learning_rate": 2.5919732441471574e-06, "loss": 3.1357, "step": 31 }, { "epoch": 0.005351841786177196, "grad_norm": 1.9725579023361206, "learning_rate": 2.6755852842809365e-06, "loss": 3.2258, "step": 32 }, { "epoch": 0.005519086841995233, "grad_norm": 2.6416852474212646, "learning_rate": 2.7591973244147156e-06, "loss": 3.3313, "step": 33 }, { "epoch": 0.005686331897813271, "grad_norm": 3.753082752227783, "learning_rate": 2.842809364548495e-06, "loss": 3.7506, "step": 34 }, { "epoch": 0.0058535769536313086, "grad_norm": 3.0078001022338867, "learning_rate": 2.9264214046822746e-06, "loss": 3.4788, "step": 35 }, { "epoch": 0.006020822009449345, "grad_norm": 4.4457855224609375, "learning_rate": 3.0100334448160537e-06, "loss": 3.7324, "step": 36 }, { "epoch": 0.006188067065267383, "grad_norm": 2.7756223678588867, "learning_rate": 3.093645484949833e-06, "loss": 2.945, "step": 37 }, { "epoch": 0.0063553121210854205, "grad_norm": 2.9878220558166504, "learning_rate": 3.1772575250836123e-06, "loss": 3.6185, "step": 38 }, { "epoch": 0.006522557176903458, "grad_norm": 3.9122154712677, "learning_rate": 3.2608695652173914e-06, "loss": 3.5859, "step": 39 }, { "epoch": 0.006689802232721495, "grad_norm": 2.969026565551758, "learning_rate": 3.3444816053511705e-06, "loss": 3.1181, "step": 40 }, { "epoch": 0.006857047288539532, "grad_norm": 2.690906524658203, "learning_rate": 3.4280936454849496e-06, "loss": 3.1339, "step": 41 }, { "epoch": 0.00702429234435757, "grad_norm": 3.9662551879882812, "learning_rate": 3.511705685618729e-06, "loss": 3.4837, "step": 42 }, { "epoch": 0.0071915374001756075, "grad_norm": 3.5100505352020264, "learning_rate": 3.5953177257525082e-06, "loss": 2.9214, "step": 43 }, { "epoch": 0.007358782455993645, "grad_norm": 4.689671516418457, "learning_rate": 3.678929765886288e-06, "loss": 3.7994, "step": 44 }, { "epoch": 0.007526027511811682, "grad_norm": 4.4835429191589355, "learning_rate": 3.7625418060200673e-06, "loss": 3.2114, "step": 45 }, { "epoch": 0.007693272567629719, "grad_norm": 3.569324016571045, "learning_rate": 3.846153846153847e-06, "loss": 3.2308, "step": 46 }, { "epoch": 0.007860517623447756, "grad_norm": 3.1882712841033936, "learning_rate": 3.9297658862876255e-06, "loss": 3.0043, "step": 47 }, { "epoch": 0.008027762679265794, "grad_norm": 4.829992771148682, "learning_rate": 4.013377926421405e-06, "loss": 4.1502, "step": 48 }, { "epoch": 0.008195007735083831, "grad_norm": 3.306385040283203, "learning_rate": 4.0969899665551845e-06, "loss": 3.2589, "step": 49 }, { "epoch": 0.008362252790901869, "grad_norm": 3.7497446537017822, "learning_rate": 4.180602006688963e-06, "loss": 3.5459, "step": 50 }, { "epoch": 0.008529497846719906, "grad_norm": 2.6423192024230957, "learning_rate": 4.264214046822743e-06, "loss": 2.8553, "step": 51 }, { "epoch": 0.008696742902537944, "grad_norm": 3.652660369873047, "learning_rate": 4.347826086956522e-06, "loss": 3.3119, "step": 52 }, { "epoch": 0.008863987958355982, "grad_norm": 3.554577350616455, "learning_rate": 4.431438127090301e-06, "loss": 3.2485, "step": 53 }, { "epoch": 0.00903123301417402, "grad_norm": 4.067053318023682, "learning_rate": 4.51505016722408e-06, "loss": 3.761, "step": 54 }, { "epoch": 0.009198478069992055, "grad_norm": 3.2299931049346924, "learning_rate": 4.598662207357859e-06, "loss": 3.3133, "step": 55 }, { "epoch": 0.009365723125810093, "grad_norm": 4.027376651763916, "learning_rate": 4.682274247491639e-06, "loss": 3.3652, "step": 56 }, { "epoch": 0.00953296818162813, "grad_norm": 2.1754167079925537, "learning_rate": 4.765886287625419e-06, "loss": 3.0193, "step": 57 }, { "epoch": 0.009700213237446168, "grad_norm": 2.784782648086548, "learning_rate": 4.849498327759198e-06, "loss": 3.2275, "step": 58 }, { "epoch": 0.009867458293264205, "grad_norm": 4.317951679229736, "learning_rate": 4.933110367892977e-06, "loss": 4.0795, "step": 59 }, { "epoch": 0.010034703349082243, "grad_norm": 2.8223345279693604, "learning_rate": 5.016722408026756e-06, "loss": 3.0613, "step": 60 }, { "epoch": 0.01020194840490028, "grad_norm": 2.854752540588379, "learning_rate": 5.100334448160535e-06, "loss": 2.9629, "step": 61 }, { "epoch": 0.010369193460718318, "grad_norm": 2.6824591159820557, "learning_rate": 5.183946488294315e-06, "loss": 3.2371, "step": 62 }, { "epoch": 0.010536438516536356, "grad_norm": 4.622383117675781, "learning_rate": 5.2675585284280935e-06, "loss": 3.7907, "step": 63 }, { "epoch": 0.010703683572354392, "grad_norm": 2.6136574745178223, "learning_rate": 5.351170568561873e-06, "loss": 3.0283, "step": 64 }, { "epoch": 0.01087092862817243, "grad_norm": 2.7583680152893066, "learning_rate": 5.4347826086956525e-06, "loss": 3.3, "step": 65 }, { "epoch": 0.011038173683990467, "grad_norm": 3.716066837310791, "learning_rate": 5.518394648829431e-06, "loss": 3.1849, "step": 66 }, { "epoch": 0.011205418739808504, "grad_norm": 2.393721342086792, "learning_rate": 5.602006688963211e-06, "loss": 3.0544, "step": 67 }, { "epoch": 0.011372663795626542, "grad_norm": 2.868586301803589, "learning_rate": 5.68561872909699e-06, "loss": 3.0168, "step": 68 }, { "epoch": 0.01153990885144458, "grad_norm": 4.261537075042725, "learning_rate": 5.76923076923077e-06, "loss": 3.3566, "step": 69 }, { "epoch": 0.011707153907262617, "grad_norm": 2.7678298950195312, "learning_rate": 5.852842809364549e-06, "loss": 3.2407, "step": 70 }, { "epoch": 0.011874398963080655, "grad_norm": 5.096793174743652, "learning_rate": 5.936454849498328e-06, "loss": 3.8776, "step": 71 }, { "epoch": 0.01204164401889869, "grad_norm": 3.592867136001587, "learning_rate": 6.0200668896321075e-06, "loss": 3.4655, "step": 72 }, { "epoch": 0.012208889074716728, "grad_norm": 3.25419282913208, "learning_rate": 6.103678929765887e-06, "loss": 2.6928, "step": 73 }, { "epoch": 0.012376134130534766, "grad_norm": 2.9943008422851562, "learning_rate": 6.187290969899666e-06, "loss": 3.2799, "step": 74 }, { "epoch": 0.012543379186352803, "grad_norm": 4.6188883781433105, "learning_rate": 6.270903010033445e-06, "loss": 3.4216, "step": 75 }, { "epoch": 0.012710624242170841, "grad_norm": 4.306404113769531, "learning_rate": 6.354515050167225e-06, "loss": 3.5028, "step": 76 }, { "epoch": 0.012877869297988878, "grad_norm": 3.586388349533081, "learning_rate": 6.438127090301003e-06, "loss": 3.1702, "step": 77 }, { "epoch": 0.013045114353806916, "grad_norm": 3.657644748687744, "learning_rate": 6.521739130434783e-06, "loss": 3.0215, "step": 78 }, { "epoch": 0.013212359409624954, "grad_norm": 6.131587505340576, "learning_rate": 6.6053511705685616e-06, "loss": 3.3584, "step": 79 }, { "epoch": 0.01337960446544299, "grad_norm": 2.6182494163513184, "learning_rate": 6.688963210702341e-06, "loss": 3.1594, "step": 80 }, { "epoch": 0.013546849521261027, "grad_norm": 2.7246274948120117, "learning_rate": 6.772575250836121e-06, "loss": 2.9681, "step": 81 }, { "epoch": 0.013714094577079065, "grad_norm": 3.1227684020996094, "learning_rate": 6.856187290969899e-06, "loss": 2.8406, "step": 82 }, { "epoch": 0.013881339632897102, "grad_norm": 5.1152663230896, "learning_rate": 6.939799331103679e-06, "loss": 4.2343, "step": 83 }, { "epoch": 0.01404858468871514, "grad_norm": 3.2298457622528076, "learning_rate": 7.023411371237458e-06, "loss": 2.6789, "step": 84 }, { "epoch": 0.014215829744533177, "grad_norm": 4.572878360748291, "learning_rate": 7.107023411371237e-06, "loss": 3.3115, "step": 85 }, { "epoch": 0.014383074800351215, "grad_norm": 4.36238956451416, "learning_rate": 7.1906354515050165e-06, "loss": 3.1356, "step": 86 }, { "epoch": 0.014550319856169253, "grad_norm": 4.772114276885986, "learning_rate": 7.274247491638796e-06, "loss": 3.794, "step": 87 }, { "epoch": 0.01471756491198729, "grad_norm": 3.1816742420196533, "learning_rate": 7.357859531772576e-06, "loss": 3.1803, "step": 88 }, { "epoch": 0.014884809967805326, "grad_norm": 4.5551629066467285, "learning_rate": 7.441471571906356e-06, "loss": 3.1846, "step": 89 }, { "epoch": 0.015052055023623364, "grad_norm": 4.8129987716674805, "learning_rate": 7.5250836120401346e-06, "loss": 3.1304, "step": 90 }, { "epoch": 0.015219300079441401, "grad_norm": 3.195058584213257, "learning_rate": 7.608695652173914e-06, "loss": 2.8714, "step": 91 }, { "epoch": 0.015386545135259439, "grad_norm": 3.0516955852508545, "learning_rate": 7.692307692307694e-06, "loss": 2.999, "step": 92 }, { "epoch": 0.015553790191077476, "grad_norm": 3.7130820751190186, "learning_rate": 7.775919732441473e-06, "loss": 3.2579, "step": 93 }, { "epoch": 0.015721035246895512, "grad_norm": 5.527997016906738, "learning_rate": 7.859531772575251e-06, "loss": 3.1965, "step": 94 }, { "epoch": 0.01588828030271355, "grad_norm": 4.536369323730469, "learning_rate": 7.94314381270903e-06, "loss": 3.3135, "step": 95 }, { "epoch": 0.016055525358531587, "grad_norm": 4.456637859344482, "learning_rate": 8.02675585284281e-06, "loss": 3.0887, "step": 96 }, { "epoch": 0.016222770414349625, "grad_norm": 2.766878128051758, "learning_rate": 8.11036789297659e-06, "loss": 3.0202, "step": 97 }, { "epoch": 0.016390015470167663, "grad_norm": 4.646317481994629, "learning_rate": 8.193979933110369e-06, "loss": 2.9302, "step": 98 }, { "epoch": 0.0165572605259857, "grad_norm": 5.175597190856934, "learning_rate": 8.277591973244147e-06, "loss": 3.314, "step": 99 }, { "epoch": 0.016724505581803738, "grad_norm": 7.062839031219482, "learning_rate": 8.361204013377926e-06, "loss": 3.9225, "step": 100 }, { "epoch": 0.016891750637621775, "grad_norm": 4.512515068054199, "learning_rate": 8.444816053511706e-06, "loss": 3.2447, "step": 101 }, { "epoch": 0.017058995693439813, "grad_norm": 3.7748587131500244, "learning_rate": 8.528428093645485e-06, "loss": 2.5741, "step": 102 }, { "epoch": 0.01722624074925785, "grad_norm": 2.6156973838806152, "learning_rate": 8.612040133779265e-06, "loss": 2.9351, "step": 103 }, { "epoch": 0.017393485805075888, "grad_norm": 2.7578678131103516, "learning_rate": 8.695652173913044e-06, "loss": 2.7103, "step": 104 }, { "epoch": 0.017560730860893926, "grad_norm": 3.948707342147827, "learning_rate": 8.779264214046822e-06, "loss": 3.2245, "step": 105 }, { "epoch": 0.017727975916711963, "grad_norm": 6.611628532409668, "learning_rate": 8.862876254180602e-06, "loss": 3.8439, "step": 106 }, { "epoch": 0.01789522097253, "grad_norm": 3.322476625442505, "learning_rate": 8.946488294314381e-06, "loss": 3.1448, "step": 107 }, { "epoch": 0.01806246602834804, "grad_norm": 5.129034042358398, "learning_rate": 9.03010033444816e-06, "loss": 3.1261, "step": 108 }, { "epoch": 0.018229711084166076, "grad_norm": 3.9007513523101807, "learning_rate": 9.11371237458194e-06, "loss": 3.3979, "step": 109 }, { "epoch": 0.01839695613998411, "grad_norm": 3.264265537261963, "learning_rate": 9.197324414715718e-06, "loss": 2.9538, "step": 110 }, { "epoch": 0.018564201195802148, "grad_norm": 3.403266429901123, "learning_rate": 9.280936454849498e-06, "loss": 3.1599, "step": 111 }, { "epoch": 0.018731446251620185, "grad_norm": 3.379936456680298, "learning_rate": 9.364548494983277e-06, "loss": 3.0262, "step": 112 }, { "epoch": 0.018898691307438223, "grad_norm": 2.8045027256011963, "learning_rate": 9.448160535117058e-06, "loss": 2.7462, "step": 113 }, { "epoch": 0.01906593636325626, "grad_norm": 3.32436203956604, "learning_rate": 9.531772575250838e-06, "loss": 3.1727, "step": 114 }, { "epoch": 0.019233181419074298, "grad_norm": 2.1139793395996094, "learning_rate": 9.615384615384616e-06, "loss": 2.4401, "step": 115 }, { "epoch": 0.019400426474892336, "grad_norm": 2.6965510845184326, "learning_rate": 9.698996655518395e-06, "loss": 2.9416, "step": 116 }, { "epoch": 0.019567671530710373, "grad_norm": 4.17039155960083, "learning_rate": 9.782608695652175e-06, "loss": 3.1558, "step": 117 }, { "epoch": 0.01973491658652841, "grad_norm": 2.516172409057617, "learning_rate": 9.866220735785954e-06, "loss": 2.6599, "step": 118 }, { "epoch": 0.01990216164234645, "grad_norm": 2.842013359069824, "learning_rate": 9.949832775919734e-06, "loss": 2.9721, "step": 119 }, { "epoch": 0.020069406698164486, "grad_norm": 3.2888543605804443, "learning_rate": 1.0033444816053512e-05, "loss": 3.0155, "step": 120 }, { "epoch": 0.020236651753982524, "grad_norm": 3.51027774810791, "learning_rate": 1.0117056856187291e-05, "loss": 2.4694, "step": 121 }, { "epoch": 0.02040389680980056, "grad_norm": 4.568947792053223, "learning_rate": 1.020066889632107e-05, "loss": 2.9181, "step": 122 }, { "epoch": 0.0205711418656186, "grad_norm": 3.3086724281311035, "learning_rate": 1.028428093645485e-05, "loss": 3.2548, "step": 123 }, { "epoch": 0.020738386921436636, "grad_norm": 4.451070785522461, "learning_rate": 1.036789297658863e-05, "loss": 3.3512, "step": 124 }, { "epoch": 0.020905631977254674, "grad_norm": 4.03797721862793, "learning_rate": 1.045150501672241e-05, "loss": 3.114, "step": 125 }, { "epoch": 0.02107287703307271, "grad_norm": 6.2399468421936035, "learning_rate": 1.0535117056856187e-05, "loss": 3.4441, "step": 126 }, { "epoch": 0.021240122088890746, "grad_norm": 2.3957841396331787, "learning_rate": 1.0618729096989967e-05, "loss": 2.9276, "step": 127 }, { "epoch": 0.021407367144708783, "grad_norm": 2.4610610008239746, "learning_rate": 1.0702341137123746e-05, "loss": 2.6918, "step": 128 }, { "epoch": 0.02157461220052682, "grad_norm": 3.1997694969177246, "learning_rate": 1.0785953177257526e-05, "loss": 2.937, "step": 129 }, { "epoch": 0.02174185725634486, "grad_norm": 4.182227611541748, "learning_rate": 1.0869565217391305e-05, "loss": 2.9864, "step": 130 }, { "epoch": 0.021909102312162896, "grad_norm": 4.300537109375, "learning_rate": 1.0953177257525085e-05, "loss": 3.3013, "step": 131 }, { "epoch": 0.022076347367980934, "grad_norm": 4.072454452514648, "learning_rate": 1.1036789297658862e-05, "loss": 2.9373, "step": 132 }, { "epoch": 0.02224359242379897, "grad_norm": 3.2836849689483643, "learning_rate": 1.1120401337792642e-05, "loss": 2.7878, "step": 133 }, { "epoch": 0.02241083747961701, "grad_norm": 3.4962716102600098, "learning_rate": 1.1204013377926421e-05, "loss": 2.8778, "step": 134 }, { "epoch": 0.022578082535435046, "grad_norm": 2.6953587532043457, "learning_rate": 1.1287625418060201e-05, "loss": 3.0202, "step": 135 }, { "epoch": 0.022745327591253084, "grad_norm": 2.928971290588379, "learning_rate": 1.137123745819398e-05, "loss": 2.8958, "step": 136 }, { "epoch": 0.02291257264707112, "grad_norm": 3.0877625942230225, "learning_rate": 1.1454849498327758e-05, "loss": 3.0691, "step": 137 }, { "epoch": 0.02307981770288916, "grad_norm": 3.2704596519470215, "learning_rate": 1.153846153846154e-05, "loss": 2.8533, "step": 138 }, { "epoch": 0.023247062758707197, "grad_norm": 3.473400592803955, "learning_rate": 1.1622073578595319e-05, "loss": 3.1021, "step": 139 }, { "epoch": 0.023414307814525234, "grad_norm": 3.9560110569000244, "learning_rate": 1.1705685618729099e-05, "loss": 3.5274, "step": 140 }, { "epoch": 0.023581552870343272, "grad_norm": 1.6095669269561768, "learning_rate": 1.1789297658862878e-05, "loss": 2.2181, "step": 141 }, { "epoch": 0.02374879792616131, "grad_norm": 3.127284526824951, "learning_rate": 1.1872909698996656e-05, "loss": 2.9185, "step": 142 }, { "epoch": 0.023916042981979344, "grad_norm": 4.572995185852051, "learning_rate": 1.1956521739130435e-05, "loss": 3.2212, "step": 143 }, { "epoch": 0.02408328803779738, "grad_norm": 4.996701240539551, "learning_rate": 1.2040133779264215e-05, "loss": 3.4233, "step": 144 }, { "epoch": 0.02425053309361542, "grad_norm": 3.1315956115722656, "learning_rate": 1.2123745819397994e-05, "loss": 3.2886, "step": 145 }, { "epoch": 0.024417778149433456, "grad_norm": 2.3074121475219727, "learning_rate": 1.2207357859531774e-05, "loss": 2.9583, "step": 146 }, { "epoch": 0.024585023205251494, "grad_norm": 3.039752244949341, "learning_rate": 1.2290969899665552e-05, "loss": 2.9414, "step": 147 }, { "epoch": 0.02475226826106953, "grad_norm": 3.762587547302246, "learning_rate": 1.2374581939799331e-05, "loss": 2.6943, "step": 148 }, { "epoch": 0.02491951331688757, "grad_norm": 3.086115837097168, "learning_rate": 1.245819397993311e-05, "loss": 3.1018, "step": 149 }, { "epoch": 0.025086758372705607, "grad_norm": 3.4456818103790283, "learning_rate": 1.254180602006689e-05, "loss": 3.0469, "step": 150 }, { "epoch": 0.025254003428523644, "grad_norm": 3.9869983196258545, "learning_rate": 1.262541806020067e-05, "loss": 3.2501, "step": 151 }, { "epoch": 0.025421248484341682, "grad_norm": 2.8274381160736084, "learning_rate": 1.270903010033445e-05, "loss": 2.974, "step": 152 }, { "epoch": 0.02558849354015972, "grad_norm": 2.2604081630706787, "learning_rate": 1.2792642140468227e-05, "loss": 2.8994, "step": 153 }, { "epoch": 0.025755738595977757, "grad_norm": 3.5008833408355713, "learning_rate": 1.2876254180602007e-05, "loss": 3.3022, "step": 154 }, { "epoch": 0.025922983651795795, "grad_norm": 4.79054594039917, "learning_rate": 1.2959866220735786e-05, "loss": 3.0874, "step": 155 }, { "epoch": 0.026090228707613832, "grad_norm": 4.137330532073975, "learning_rate": 1.3043478260869566e-05, "loss": 3.0503, "step": 156 }, { "epoch": 0.02625747376343187, "grad_norm": 3.5677638053894043, "learning_rate": 1.3127090301003345e-05, "loss": 3.0319, "step": 157 }, { "epoch": 0.026424718819249907, "grad_norm": 2.5750298500061035, "learning_rate": 1.3210702341137123e-05, "loss": 2.8855, "step": 158 }, { "epoch": 0.026591963875067945, "grad_norm": 2.7348997592926025, "learning_rate": 1.3294314381270903e-05, "loss": 3.1835, "step": 159 }, { "epoch": 0.02675920893088598, "grad_norm": 4.5999274253845215, "learning_rate": 1.3377926421404682e-05, "loss": 3.4192, "step": 160 }, { "epoch": 0.026926453986704017, "grad_norm": 3.1714630126953125, "learning_rate": 1.3461538461538462e-05, "loss": 2.9389, "step": 161 }, { "epoch": 0.027093699042522054, "grad_norm": 2.940396547317505, "learning_rate": 1.3545150501672241e-05, "loss": 2.8103, "step": 162 }, { "epoch": 0.027260944098340092, "grad_norm": 2.753537654876709, "learning_rate": 1.362876254180602e-05, "loss": 2.8998, "step": 163 }, { "epoch": 0.02742818915415813, "grad_norm": 2.867635726928711, "learning_rate": 1.3712374581939799e-05, "loss": 2.9098, "step": 164 }, { "epoch": 0.027595434209976167, "grad_norm": 3.138235330581665, "learning_rate": 1.3795986622073578e-05, "loss": 2.9313, "step": 165 }, { "epoch": 0.027762679265794205, "grad_norm": 5.360775947570801, "learning_rate": 1.3879598662207358e-05, "loss": 3.1759, "step": 166 }, { "epoch": 0.027929924321612242, "grad_norm": 2.971513509750366, "learning_rate": 1.3963210702341137e-05, "loss": 2.6336, "step": 167 }, { "epoch": 0.02809716937743028, "grad_norm": 4.336639404296875, "learning_rate": 1.4046822742474917e-05, "loss": 2.8729, "step": 168 }, { "epoch": 0.028264414433248317, "grad_norm": 2.507906436920166, "learning_rate": 1.4130434782608694e-05, "loss": 2.9646, "step": 169 }, { "epoch": 0.028431659489066355, "grad_norm": 12.57156753540039, "learning_rate": 1.4214046822742474e-05, "loss": 3.3306, "step": 170 }, { "epoch": 0.028598904544884392, "grad_norm": 6.158684253692627, "learning_rate": 1.4297658862876253e-05, "loss": 3.4563, "step": 171 }, { "epoch": 0.02876614960070243, "grad_norm": 3.7421905994415283, "learning_rate": 1.4381270903010033e-05, "loss": 3.4318, "step": 172 }, { "epoch": 0.028933394656520468, "grad_norm": 3.5466387271881104, "learning_rate": 1.4464882943143812e-05, "loss": 2.9069, "step": 173 }, { "epoch": 0.029100639712338505, "grad_norm": 3.0859644412994385, "learning_rate": 1.4548494983277592e-05, "loss": 2.9586, "step": 174 }, { "epoch": 0.029267884768156543, "grad_norm": 3.2324202060699463, "learning_rate": 1.4632107023411373e-05, "loss": 3.034, "step": 175 }, { "epoch": 0.02943512982397458, "grad_norm": 2.817352533340454, "learning_rate": 1.4715719063545153e-05, "loss": 3.0349, "step": 176 }, { "epoch": 0.029602374879792615, "grad_norm": 2.7619965076446533, "learning_rate": 1.4799331103678932e-05, "loss": 3.1813, "step": 177 }, { "epoch": 0.029769619935610652, "grad_norm": 2.246718406677246, "learning_rate": 1.4882943143812712e-05, "loss": 2.9891, "step": 178 }, { "epoch": 0.02993686499142869, "grad_norm": 2.209447145462036, "learning_rate": 1.496655518394649e-05, "loss": 2.965, "step": 179 }, { "epoch": 0.030104110047246727, "grad_norm": 2.18790340423584, "learning_rate": 1.5050167224080269e-05, "loss": 2.7348, "step": 180 }, { "epoch": 0.030271355103064765, "grad_norm": 3.595372200012207, "learning_rate": 1.5133779264214049e-05, "loss": 3.2408, "step": 181 }, { "epoch": 0.030438600158882802, "grad_norm": 2.147653102874756, "learning_rate": 1.5217391304347828e-05, "loss": 2.6932, "step": 182 }, { "epoch": 0.03060584521470084, "grad_norm": 3.784696340560913, "learning_rate": 1.5301003344481608e-05, "loss": 3.2862, "step": 183 }, { "epoch": 0.030773090270518878, "grad_norm": 2.531696319580078, "learning_rate": 1.5384615384615387e-05, "loss": 2.7397, "step": 184 }, { "epoch": 0.030940335326336915, "grad_norm": 2.1138877868652344, "learning_rate": 1.5468227424749167e-05, "loss": 2.6445, "step": 185 }, { "epoch": 0.031107580382154953, "grad_norm": 4.116943359375, "learning_rate": 1.5551839464882946e-05, "loss": 3.3919, "step": 186 }, { "epoch": 0.03127482543797299, "grad_norm": 2.9857711791992188, "learning_rate": 1.5635451505016722e-05, "loss": 3.183, "step": 187 }, { "epoch": 0.031442070493791024, "grad_norm": 3.156313419342041, "learning_rate": 1.5719063545150502e-05, "loss": 2.33, "step": 188 }, { "epoch": 0.031609315549609066, "grad_norm": 2.2884647846221924, "learning_rate": 1.580267558528428e-05, "loss": 2.9114, "step": 189 }, { "epoch": 0.0317765606054271, "grad_norm": 3.370727777481079, "learning_rate": 1.588628762541806e-05, "loss": 3.2533, "step": 190 }, { "epoch": 0.03194380566124514, "grad_norm": 3.622157096862793, "learning_rate": 1.596989966555184e-05, "loss": 3.1144, "step": 191 }, { "epoch": 0.032111050717063175, "grad_norm": 2.4233129024505615, "learning_rate": 1.605351170568562e-05, "loss": 2.6672, "step": 192 }, { "epoch": 0.032278295772881216, "grad_norm": 2.375262975692749, "learning_rate": 1.61371237458194e-05, "loss": 3.1144, "step": 193 }, { "epoch": 0.03244554082869925, "grad_norm": 5.565793991088867, "learning_rate": 1.622073578595318e-05, "loss": 3.4614, "step": 194 }, { "epoch": 0.03261278588451729, "grad_norm": 2.712003469467163, "learning_rate": 1.630434782608696e-05, "loss": 3.1407, "step": 195 }, { "epoch": 0.032780030940335325, "grad_norm": 2.3584096431732178, "learning_rate": 1.6387959866220738e-05, "loss": 3.0196, "step": 196 }, { "epoch": 0.032947275996153366, "grad_norm": 2.500565767288208, "learning_rate": 1.6471571906354518e-05, "loss": 2.7881, "step": 197 }, { "epoch": 0.0331145210519714, "grad_norm": 3.6202588081359863, "learning_rate": 1.6555183946488294e-05, "loss": 3.1601, "step": 198 }, { "epoch": 0.03328176610778944, "grad_norm": 3.287095069885254, "learning_rate": 1.6638795986622073e-05, "loss": 3.2726, "step": 199 }, { "epoch": 0.033449011163607476, "grad_norm": 3.060696840286255, "learning_rate": 1.6722408026755853e-05, "loss": 2.9203, "step": 200 }, { "epoch": 0.03361625621942552, "grad_norm": 2.56316876411438, "learning_rate": 1.6806020066889632e-05, "loss": 3.0714, "step": 201 }, { "epoch": 0.03378350127524355, "grad_norm": 2.273206949234009, "learning_rate": 1.6889632107023412e-05, "loss": 2.4111, "step": 202 }, { "epoch": 0.033950746331061585, "grad_norm": 3.4627323150634766, "learning_rate": 1.697324414715719e-05, "loss": 3.2198, "step": 203 }, { "epoch": 0.034117991386879626, "grad_norm": 3.1882760524749756, "learning_rate": 1.705685618729097e-05, "loss": 2.718, "step": 204 }, { "epoch": 0.03428523644269766, "grad_norm": 2.174759864807129, "learning_rate": 1.714046822742475e-05, "loss": 2.7031, "step": 205 }, { "epoch": 0.0344524814985157, "grad_norm": 5.168435573577881, "learning_rate": 1.722408026755853e-05, "loss": 3.2544, "step": 206 }, { "epoch": 0.034619726554333735, "grad_norm": 2.751798629760742, "learning_rate": 1.730769230769231e-05, "loss": 2.8256, "step": 207 }, { "epoch": 0.034786971610151776, "grad_norm": 3.663996696472168, "learning_rate": 1.739130434782609e-05, "loss": 2.7474, "step": 208 }, { "epoch": 0.03495421666596981, "grad_norm": 2.254035472869873, "learning_rate": 1.7474916387959865e-05, "loss": 2.7654, "step": 209 }, { "epoch": 0.03512146172178785, "grad_norm": 5.582728385925293, "learning_rate": 1.7558528428093644e-05, "loss": 3.0179, "step": 210 }, { "epoch": 0.035288706777605885, "grad_norm": 4.149656295776367, "learning_rate": 1.7642140468227424e-05, "loss": 2.8562, "step": 211 }, { "epoch": 0.03545595183342393, "grad_norm": 3.49183988571167, "learning_rate": 1.7725752508361204e-05, "loss": 2.9096, "step": 212 }, { "epoch": 0.03562319688924196, "grad_norm": 2.52878999710083, "learning_rate": 1.7809364548494983e-05, "loss": 2.8835, "step": 213 }, { "epoch": 0.03579044194506, "grad_norm": 2.6654183864593506, "learning_rate": 1.7892976588628763e-05, "loss": 2.9351, "step": 214 }, { "epoch": 0.035957687000878036, "grad_norm": 2.934947967529297, "learning_rate": 1.7976588628762542e-05, "loss": 2.913, "step": 215 }, { "epoch": 0.03612493205669608, "grad_norm": 2.2010960578918457, "learning_rate": 1.806020066889632e-05, "loss": 2.878, "step": 216 }, { "epoch": 0.03629217711251411, "grad_norm": 2.2171716690063477, "learning_rate": 1.81438127090301e-05, "loss": 2.5743, "step": 217 }, { "epoch": 0.03645942216833215, "grad_norm": 2.6046314239501953, "learning_rate": 1.822742474916388e-05, "loss": 2.9135, "step": 218 }, { "epoch": 0.036626667224150186, "grad_norm": 5.267155170440674, "learning_rate": 1.831103678929766e-05, "loss": 3.0799, "step": 219 }, { "epoch": 0.03679391227996822, "grad_norm": 3.457653284072876, "learning_rate": 1.8394648829431436e-05, "loss": 3.3555, "step": 220 }, { "epoch": 0.03696115733578626, "grad_norm": 2.774623394012451, "learning_rate": 1.8478260869565216e-05, "loss": 2.8864, "step": 221 }, { "epoch": 0.037128402391604295, "grad_norm": 2.7217299938201904, "learning_rate": 1.8561872909698995e-05, "loss": 2.9175, "step": 222 }, { "epoch": 0.037295647447422337, "grad_norm": 2.79276704788208, "learning_rate": 1.8645484949832775e-05, "loss": 2.8278, "step": 223 }, { "epoch": 0.03746289250324037, "grad_norm": 2.7344377040863037, "learning_rate": 1.8729096989966554e-05, "loss": 2.703, "step": 224 }, { "epoch": 0.03763013755905841, "grad_norm": 2.829758882522583, "learning_rate": 1.8812709030100337e-05, "loss": 2.6104, "step": 225 }, { "epoch": 0.037797382614876446, "grad_norm": 3.5322909355163574, "learning_rate": 1.8896321070234117e-05, "loss": 2.906, "step": 226 }, { "epoch": 0.03796462767069449, "grad_norm": 3.3988935947418213, "learning_rate": 1.8979933110367896e-05, "loss": 3.0439, "step": 227 }, { "epoch": 0.03813187272651252, "grad_norm": 1.692403793334961, "learning_rate": 1.9063545150501676e-05, "loss": 2.3132, "step": 228 }, { "epoch": 0.03829911778233056, "grad_norm": 2.5986032485961914, "learning_rate": 1.9147157190635452e-05, "loss": 2.8657, "step": 229 }, { "epoch": 0.038466362838148596, "grad_norm": 3.2588491439819336, "learning_rate": 1.923076923076923e-05, "loss": 2.8546, "step": 230 }, { "epoch": 0.03863360789396664, "grad_norm": 4.370154857635498, "learning_rate": 1.931438127090301e-05, "loss": 3.031, "step": 231 }, { "epoch": 0.03880085294978467, "grad_norm": 2.601893663406372, "learning_rate": 1.939799331103679e-05, "loss": 2.9515, "step": 232 }, { "epoch": 0.03896809800560271, "grad_norm": 2.8003292083740234, "learning_rate": 1.948160535117057e-05, "loss": 3.1102, "step": 233 }, { "epoch": 0.039135343061420746, "grad_norm": 3.0462491512298584, "learning_rate": 1.956521739130435e-05, "loss": 2.7839, "step": 234 }, { "epoch": 0.03930258811723879, "grad_norm": 1.8971421718597412, "learning_rate": 1.964882943143813e-05, "loss": 2.8769, "step": 235 }, { "epoch": 0.03946983317305682, "grad_norm": 2.0805351734161377, "learning_rate": 1.973244147157191e-05, "loss": 2.7141, "step": 236 }, { "epoch": 0.039637078228874856, "grad_norm": 3.0283405780792236, "learning_rate": 1.9816053511705688e-05, "loss": 2.5337, "step": 237 }, { "epoch": 0.0398043232846929, "grad_norm": 2.713172674179077, "learning_rate": 1.9899665551839468e-05, "loss": 2.8075, "step": 238 }, { "epoch": 0.03997156834051093, "grad_norm": 2.854367971420288, "learning_rate": 1.9983277591973247e-05, "loss": 3.2755, "step": 239 }, { "epoch": 0.04013881339632897, "grad_norm": 2.245631217956543, "learning_rate": 2.0066889632107023e-05, "loss": 3.0, "step": 240 }, { "epoch": 0.040306058452147006, "grad_norm": 2.5356593132019043, "learning_rate": 2.0150501672240803e-05, "loss": 2.9145, "step": 241 }, { "epoch": 0.04047330350796505, "grad_norm": 2.472579002380371, "learning_rate": 2.0234113712374582e-05, "loss": 2.9067, "step": 242 }, { "epoch": 0.04064054856378308, "grad_norm": 2.589665174484253, "learning_rate": 2.0317725752508362e-05, "loss": 2.8425, "step": 243 }, { "epoch": 0.04080779361960112, "grad_norm": 4.297053813934326, "learning_rate": 2.040133779264214e-05, "loss": 3.52, "step": 244 }, { "epoch": 0.040975038675419156, "grad_norm": 3.8442001342773438, "learning_rate": 2.048494983277592e-05, "loss": 2.8885, "step": 245 }, { "epoch": 0.0411422837312372, "grad_norm": 2.7822859287261963, "learning_rate": 2.05685618729097e-05, "loss": 3.1807, "step": 246 }, { "epoch": 0.04130952878705523, "grad_norm": 2.322098731994629, "learning_rate": 2.065217391304348e-05, "loss": 2.9452, "step": 247 }, { "epoch": 0.04147677384287327, "grad_norm": 4.250288963317871, "learning_rate": 2.073578595317726e-05, "loss": 3.1077, "step": 248 }, { "epoch": 0.04164401889869131, "grad_norm": 3.7723629474639893, "learning_rate": 2.081939799331104e-05, "loss": 3.7532, "step": 249 }, { "epoch": 0.04181126395450935, "grad_norm": 2.6028764247894287, "learning_rate": 2.090301003344482e-05, "loss": 2.7633, "step": 250 }, { "epoch": 0.04197850901032738, "grad_norm": 2.277993679046631, "learning_rate": 2.0986622073578598e-05, "loss": 2.8844, "step": 251 }, { "epoch": 0.04214575406614542, "grad_norm": 2.3696744441986084, "learning_rate": 2.1070234113712374e-05, "loss": 2.8459, "step": 252 }, { "epoch": 0.04231299912196346, "grad_norm": 4.029857158660889, "learning_rate": 2.1153846153846154e-05, "loss": 3.0905, "step": 253 }, { "epoch": 0.04248024417778149, "grad_norm": 4.339833736419678, "learning_rate": 2.1237458193979933e-05, "loss": 3.0169, "step": 254 }, { "epoch": 0.04264748923359953, "grad_norm": 3.921724557876587, "learning_rate": 2.1321070234113713e-05, "loss": 3.0167, "step": 255 }, { "epoch": 0.042814734289417566, "grad_norm": 2.730450391769409, "learning_rate": 2.1404682274247492e-05, "loss": 2.9528, "step": 256 }, { "epoch": 0.04298197934523561, "grad_norm": 5.046687602996826, "learning_rate": 2.148829431438127e-05, "loss": 3.5738, "step": 257 }, { "epoch": 0.04314922440105364, "grad_norm": 2.9040350914001465, "learning_rate": 2.157190635451505e-05, "loss": 2.9081, "step": 258 }, { "epoch": 0.04331646945687168, "grad_norm": 2.2915031909942627, "learning_rate": 2.165551839464883e-05, "loss": 2.8576, "step": 259 }, { "epoch": 0.04348371451268972, "grad_norm": 3.1598222255706787, "learning_rate": 2.173913043478261e-05, "loss": 3.0263, "step": 260 }, { "epoch": 0.04365095956850776, "grad_norm": 3.3951756954193115, "learning_rate": 2.182274247491639e-05, "loss": 3.2792, "step": 261 }, { "epoch": 0.04381820462432579, "grad_norm": 1.8307420015335083, "learning_rate": 2.190635451505017e-05, "loss": 2.517, "step": 262 }, { "epoch": 0.04398544968014383, "grad_norm": 2.36759090423584, "learning_rate": 2.1989966555183945e-05, "loss": 2.9679, "step": 263 }, { "epoch": 0.04415269473596187, "grad_norm": 3.395207405090332, "learning_rate": 2.2073578595317725e-05, "loss": 2.9514, "step": 264 }, { "epoch": 0.04431993979177991, "grad_norm": 2.8616273403167725, "learning_rate": 2.2157190635451504e-05, "loss": 2.8335, "step": 265 }, { "epoch": 0.04448718484759794, "grad_norm": 4.017574787139893, "learning_rate": 2.2240802675585284e-05, "loss": 3.1815, "step": 266 }, { "epoch": 0.04465442990341598, "grad_norm": 4.17120885848999, "learning_rate": 2.2324414715719063e-05, "loss": 2.9059, "step": 267 }, { "epoch": 0.04482167495923402, "grad_norm": 2.555466890335083, "learning_rate": 2.2408026755852843e-05, "loss": 3.0273, "step": 268 }, { "epoch": 0.04498892001505205, "grad_norm": 2.3844339847564697, "learning_rate": 2.2491638795986622e-05, "loss": 3.0816, "step": 269 }, { "epoch": 0.04515616507087009, "grad_norm": 3.098693609237671, "learning_rate": 2.2575250836120402e-05, "loss": 3.4601, "step": 270 }, { "epoch": 0.04532341012668813, "grad_norm": 2.7326953411102295, "learning_rate": 2.265886287625418e-05, "loss": 2.6442, "step": 271 }, { "epoch": 0.04549065518250617, "grad_norm": 2.4390783309936523, "learning_rate": 2.274247491638796e-05, "loss": 2.8888, "step": 272 }, { "epoch": 0.0456579002383242, "grad_norm": 3.1897687911987305, "learning_rate": 2.282608695652174e-05, "loss": 3.2504, "step": 273 }, { "epoch": 0.04582514529414224, "grad_norm": 3.544348955154419, "learning_rate": 2.2909698996655517e-05, "loss": 2.8481, "step": 274 }, { "epoch": 0.04599239034996028, "grad_norm": 2.7071456909179688, "learning_rate": 2.29933110367893e-05, "loss": 2.7692, "step": 275 }, { "epoch": 0.04615963540577832, "grad_norm": 2.6731061935424805, "learning_rate": 2.307692307692308e-05, "loss": 2.7353, "step": 276 }, { "epoch": 0.04632688046159635, "grad_norm": 5.140926837921143, "learning_rate": 2.316053511705686e-05, "loss": 3.0627, "step": 277 }, { "epoch": 0.04649412551741439, "grad_norm": 2.5443577766418457, "learning_rate": 2.3244147157190638e-05, "loss": 3.0229, "step": 278 }, { "epoch": 0.04666137057323243, "grad_norm": 2.486827850341797, "learning_rate": 2.3327759197324418e-05, "loss": 2.6733, "step": 279 }, { "epoch": 0.04682861562905047, "grad_norm": 3.275817632675171, "learning_rate": 2.3411371237458197e-05, "loss": 2.5261, "step": 280 }, { "epoch": 0.0469958606848685, "grad_norm": 2.5083625316619873, "learning_rate": 2.3494983277591977e-05, "loss": 2.8387, "step": 281 }, { "epoch": 0.047163105740686544, "grad_norm": 3.0467896461486816, "learning_rate": 2.3578595317725756e-05, "loss": 3.0075, "step": 282 }, { "epoch": 0.04733035079650458, "grad_norm": 3.333277940750122, "learning_rate": 2.3662207357859532e-05, "loss": 2.8164, "step": 283 }, { "epoch": 0.04749759585232262, "grad_norm": 1.5239078998565674, "learning_rate": 2.3745819397993312e-05, "loss": 2.5968, "step": 284 }, { "epoch": 0.04766484090814065, "grad_norm": 3.594684600830078, "learning_rate": 2.382943143812709e-05, "loss": 2.6257, "step": 285 }, { "epoch": 0.04783208596395869, "grad_norm": 3.270054817199707, "learning_rate": 2.391304347826087e-05, "loss": 3.0171, "step": 286 }, { "epoch": 0.04799933101977673, "grad_norm": 2.747187852859497, "learning_rate": 2.399665551839465e-05, "loss": 2.928, "step": 287 }, { "epoch": 0.04816657607559476, "grad_norm": 1.994520902633667, "learning_rate": 2.408026755852843e-05, "loss": 2.4856, "step": 288 }, { "epoch": 0.0483338211314128, "grad_norm": 6.787609577178955, "learning_rate": 2.416387959866221e-05, "loss": 3.1104, "step": 289 }, { "epoch": 0.04850106618723084, "grad_norm": 3.098961353302002, "learning_rate": 2.424749163879599e-05, "loss": 2.9406, "step": 290 }, { "epoch": 0.04866831124304888, "grad_norm": 3.216601610183716, "learning_rate": 2.433110367892977e-05, "loss": 2.8698, "step": 291 }, { "epoch": 0.04883555629886691, "grad_norm": 3.2064504623413086, "learning_rate": 2.4414715719063548e-05, "loss": 2.493, "step": 292 }, { "epoch": 0.049002801354684954, "grad_norm": 2.672816276550293, "learning_rate": 2.4498327759197327e-05, "loss": 2.3724, "step": 293 }, { "epoch": 0.04917004641050299, "grad_norm": 2.201211929321289, "learning_rate": 2.4581939799331104e-05, "loss": 2.7577, "step": 294 }, { "epoch": 0.04933729146632103, "grad_norm": 3.2333321571350098, "learning_rate": 2.4665551839464883e-05, "loss": 2.9937, "step": 295 }, { "epoch": 0.04950453652213906, "grad_norm": 3.726701021194458, "learning_rate": 2.4749163879598663e-05, "loss": 2.667, "step": 296 }, { "epoch": 0.049671781577957104, "grad_norm": 2.5842950344085693, "learning_rate": 2.4832775919732442e-05, "loss": 2.7539, "step": 297 }, { "epoch": 0.04983902663377514, "grad_norm": 3.1460447311401367, "learning_rate": 2.491638795986622e-05, "loss": 3.126, "step": 298 }, { "epoch": 0.05000627168959318, "grad_norm": 3.1746737957000732, "learning_rate": 2.5e-05, "loss": 3.0533, "step": 299 }, { "epoch": 0.05017351674541121, "grad_norm": 3.216708183288574, "learning_rate": 2.508361204013378e-05, "loss": 3.0439, "step": 300 }, { "epoch": 0.050340761801229254, "grad_norm": 2.3466835021972656, "learning_rate": 2.516722408026756e-05, "loss": 2.6658, "step": 301 }, { "epoch": 0.05050800685704729, "grad_norm": 3.0401411056518555, "learning_rate": 2.525083612040134e-05, "loss": 2.7531, "step": 302 }, { "epoch": 0.05067525191286532, "grad_norm": 2.818443536758423, "learning_rate": 2.533444816053512e-05, "loss": 3.1605, "step": 303 }, { "epoch": 0.050842496968683364, "grad_norm": 3.152071952819824, "learning_rate": 2.54180602006689e-05, "loss": 2.8766, "step": 304 }, { "epoch": 0.0510097420245014, "grad_norm": 2.5771589279174805, "learning_rate": 2.5501672240802675e-05, "loss": 2.9473, "step": 305 }, { "epoch": 0.05117698708031944, "grad_norm": 5.174061298370361, "learning_rate": 2.5585284280936454e-05, "loss": 3.5162, "step": 306 }, { "epoch": 0.05134423213613747, "grad_norm": 2.4735107421875, "learning_rate": 2.5668896321070234e-05, "loss": 3.2454, "step": 307 }, { "epoch": 0.051511477191955514, "grad_norm": 3.41509747505188, "learning_rate": 2.5752508361204013e-05, "loss": 3.1509, "step": 308 }, { "epoch": 0.05167872224777355, "grad_norm": 2.7404887676239014, "learning_rate": 2.5836120401337793e-05, "loss": 3.2159, "step": 309 }, { "epoch": 0.05184596730359159, "grad_norm": 3.809058666229248, "learning_rate": 2.5919732441471573e-05, "loss": 3.0973, "step": 310 }, { "epoch": 0.05201321235940962, "grad_norm": 2.3356845378875732, "learning_rate": 2.6003344481605352e-05, "loss": 2.874, "step": 311 }, { "epoch": 0.052180457415227664, "grad_norm": 2.1969497203826904, "learning_rate": 2.608695652173913e-05, "loss": 2.7387, "step": 312 }, { "epoch": 0.0523477024710457, "grad_norm": 3.0938234329223633, "learning_rate": 2.617056856187291e-05, "loss": 2.7185, "step": 313 }, { "epoch": 0.05251494752686374, "grad_norm": 2.933333396911621, "learning_rate": 2.625418060200669e-05, "loss": 2.9347, "step": 314 }, { "epoch": 0.052682192582681774, "grad_norm": 2.3226966857910156, "learning_rate": 2.633779264214047e-05, "loss": 2.7018, "step": 315 }, { "epoch": 0.052849437638499815, "grad_norm": 2.9676430225372314, "learning_rate": 2.6421404682274246e-05, "loss": 2.8289, "step": 316 }, { "epoch": 0.05301668269431785, "grad_norm": 2.462031126022339, "learning_rate": 2.6505016722408026e-05, "loss": 2.594, "step": 317 }, { "epoch": 0.05318392775013589, "grad_norm": 3.8577070236206055, "learning_rate": 2.6588628762541805e-05, "loss": 3.0038, "step": 318 }, { "epoch": 0.053351172805953924, "grad_norm": 2.076836347579956, "learning_rate": 2.6672240802675585e-05, "loss": 2.6068, "step": 319 }, { "epoch": 0.05351841786177196, "grad_norm": 2.4710028171539307, "learning_rate": 2.6755852842809364e-05, "loss": 3.2279, "step": 320 }, { "epoch": 0.05368566291759, "grad_norm": 1.3439005613327026, "learning_rate": 2.6839464882943144e-05, "loss": 2.4715, "step": 321 }, { "epoch": 0.05385290797340803, "grad_norm": 1.851946234703064, "learning_rate": 2.6923076923076923e-05, "loss": 2.7011, "step": 322 }, { "epoch": 0.054020153029226074, "grad_norm": 5.426140785217285, "learning_rate": 2.7006688963210703e-05, "loss": 3.1285, "step": 323 }, { "epoch": 0.05418739808504411, "grad_norm": 4.193892002105713, "learning_rate": 2.7090301003344482e-05, "loss": 3.4446, "step": 324 }, { "epoch": 0.05435464314086215, "grad_norm": 3.7150914669036865, "learning_rate": 2.7173913043478262e-05, "loss": 2.636, "step": 325 }, { "epoch": 0.054521888196680184, "grad_norm": 2.4420547485351562, "learning_rate": 2.725752508361204e-05, "loss": 2.7666, "step": 326 }, { "epoch": 0.054689133252498225, "grad_norm": 3.844653606414795, "learning_rate": 2.7341137123745818e-05, "loss": 3.061, "step": 327 }, { "epoch": 0.05485637830831626, "grad_norm": 2.4088969230651855, "learning_rate": 2.7424749163879597e-05, "loss": 2.8347, "step": 328 }, { "epoch": 0.0550236233641343, "grad_norm": 2.9581501483917236, "learning_rate": 2.7508361204013377e-05, "loss": 2.8449, "step": 329 }, { "epoch": 0.055190868419952334, "grad_norm": 3.0160539150238037, "learning_rate": 2.7591973244147156e-05, "loss": 2.7953, "step": 330 }, { "epoch": 0.055358113475770375, "grad_norm": 2.5495786666870117, "learning_rate": 2.7675585284280936e-05, "loss": 3.0453, "step": 331 }, { "epoch": 0.05552535853158841, "grad_norm": 2.2011606693267822, "learning_rate": 2.7759197324414715e-05, "loss": 2.7172, "step": 332 }, { "epoch": 0.05569260358740645, "grad_norm": 3.4139556884765625, "learning_rate": 2.7842809364548495e-05, "loss": 3.2293, "step": 333 }, { "epoch": 0.055859848643224484, "grad_norm": 2.363109827041626, "learning_rate": 2.7926421404682274e-05, "loss": 2.744, "step": 334 }, { "epoch": 0.056027093699042525, "grad_norm": 3.4197423458099365, "learning_rate": 2.8010033444816054e-05, "loss": 3.0647, "step": 335 }, { "epoch": 0.05619433875486056, "grad_norm": 2.679003953933716, "learning_rate": 2.8093645484949833e-05, "loss": 3.0155, "step": 336 }, { "epoch": 0.056361583810678594, "grad_norm": 2.213334321975708, "learning_rate": 2.8177257525083613e-05, "loss": 2.8069, "step": 337 }, { "epoch": 0.056528828866496635, "grad_norm": 2.314166307449341, "learning_rate": 2.826086956521739e-05, "loss": 2.9516, "step": 338 }, { "epoch": 0.05669607392231467, "grad_norm": 1.482600212097168, "learning_rate": 2.834448160535117e-05, "loss": 2.3604, "step": 339 }, { "epoch": 0.05686331897813271, "grad_norm": 2.4307799339294434, "learning_rate": 2.8428093645484948e-05, "loss": 2.8843, "step": 340 }, { "epoch": 0.057030564033950744, "grad_norm": 1.6452877521514893, "learning_rate": 2.8511705685618727e-05, "loss": 2.5897, "step": 341 }, { "epoch": 0.057197809089768785, "grad_norm": 2.3803491592407227, "learning_rate": 2.8595317725752507e-05, "loss": 2.976, "step": 342 }, { "epoch": 0.05736505414558682, "grad_norm": 2.9357717037200928, "learning_rate": 2.8678929765886286e-05, "loss": 2.9847, "step": 343 }, { "epoch": 0.05753229920140486, "grad_norm": 2.328038454055786, "learning_rate": 2.8762541806020066e-05, "loss": 2.5497, "step": 344 }, { "epoch": 0.057699544257222894, "grad_norm": 3.2418501377105713, "learning_rate": 2.8846153846153845e-05, "loss": 3.4176, "step": 345 }, { "epoch": 0.057866789313040935, "grad_norm": 2.195089340209961, "learning_rate": 2.8929765886287625e-05, "loss": 3.0116, "step": 346 }, { "epoch": 0.05803403436885897, "grad_norm": 2.7394402027130127, "learning_rate": 2.9013377926421404e-05, "loss": 3.0174, "step": 347 }, { "epoch": 0.05820127942467701, "grad_norm": 2.8705575466156006, "learning_rate": 2.9096989966555184e-05, "loss": 3.1215, "step": 348 }, { "epoch": 0.058368524480495045, "grad_norm": 2.9989736080169678, "learning_rate": 2.9180602006688967e-05, "loss": 2.9601, "step": 349 }, { "epoch": 0.058535769536313086, "grad_norm": 1.793492078781128, "learning_rate": 2.9264214046822746e-05, "loss": 2.801, "step": 350 }, { "epoch": 0.05870301459213112, "grad_norm": 2.1212775707244873, "learning_rate": 2.9347826086956526e-05, "loss": 2.7815, "step": 351 }, { "epoch": 0.05887025964794916, "grad_norm": 1.7324256896972656, "learning_rate": 2.9431438127090305e-05, "loss": 2.694, "step": 352 }, { "epoch": 0.059037504703767195, "grad_norm": 2.2846250534057617, "learning_rate": 2.9515050167224085e-05, "loss": 2.9848, "step": 353 }, { "epoch": 0.05920474975958523, "grad_norm": 2.430148124694824, "learning_rate": 2.9598662207357864e-05, "loss": 2.97, "step": 354 }, { "epoch": 0.05937199481540327, "grad_norm": 2.3153843879699707, "learning_rate": 2.9682274247491644e-05, "loss": 2.855, "step": 355 }, { "epoch": 0.059539239871221304, "grad_norm": 3.246997594833374, "learning_rate": 2.9765886287625424e-05, "loss": 3.0974, "step": 356 }, { "epoch": 0.059706484927039345, "grad_norm": 3.1925625801086426, "learning_rate": 2.98494983277592e-05, "loss": 2.7053, "step": 357 }, { "epoch": 0.05987372998285738, "grad_norm": 2.7371108531951904, "learning_rate": 2.993311036789298e-05, "loss": 2.9145, "step": 358 }, { "epoch": 0.06004097503867542, "grad_norm": 2.8549253940582275, "learning_rate": 3.001672240802676e-05, "loss": 2.6867, "step": 359 }, { "epoch": 0.060208220094493455, "grad_norm": 2.657221794128418, "learning_rate": 3.0100334448160538e-05, "loss": 2.893, "step": 360 }, { "epoch": 0.060375465150311496, "grad_norm": 3.26680326461792, "learning_rate": 3.0183946488294318e-05, "loss": 3.029, "step": 361 }, { "epoch": 0.06054271020612953, "grad_norm": 2.4816505908966064, "learning_rate": 3.0267558528428097e-05, "loss": 2.7422, "step": 362 }, { "epoch": 0.06070995526194757, "grad_norm": 3.2470695972442627, "learning_rate": 3.0351170568561877e-05, "loss": 2.9265, "step": 363 }, { "epoch": 0.060877200317765605, "grad_norm": 4.237431049346924, "learning_rate": 3.0434782608695656e-05, "loss": 3.2934, "step": 364 }, { "epoch": 0.061044445373583646, "grad_norm": 3.897503614425659, "learning_rate": 3.051839464882943e-05, "loss": 2.9457, "step": 365 }, { "epoch": 0.06121169042940168, "grad_norm": 3.53225040435791, "learning_rate": 3.0602006688963215e-05, "loss": 3.34, "step": 366 }, { "epoch": 0.06137893548521972, "grad_norm": 3.6226885318756104, "learning_rate": 3.068561872909699e-05, "loss": 2.8472, "step": 367 }, { "epoch": 0.061546180541037755, "grad_norm": 3.1881330013275146, "learning_rate": 3.0769230769230774e-05, "loss": 2.7002, "step": 368 }, { "epoch": 0.061713425596855796, "grad_norm": 2.4619321823120117, "learning_rate": 3.085284280936455e-05, "loss": 2.8675, "step": 369 }, { "epoch": 0.06188067065267383, "grad_norm": 3.3086187839508057, "learning_rate": 3.0936454849498333e-05, "loss": 3.1736, "step": 370 }, { "epoch": 0.062047915708491864, "grad_norm": 3.1666252613067627, "learning_rate": 3.102006688963211e-05, "loss": 2.9237, "step": 371 }, { "epoch": 0.062215160764309906, "grad_norm": 3.072474479675293, "learning_rate": 3.110367892976589e-05, "loss": 2.8493, "step": 372 }, { "epoch": 0.06238240582012794, "grad_norm": 4.827178478240967, "learning_rate": 3.118729096989967e-05, "loss": 3.137, "step": 373 }, { "epoch": 0.06254965087594598, "grad_norm": 3.058297872543335, "learning_rate": 3.1270903010033445e-05, "loss": 3.1445, "step": 374 }, { "epoch": 0.06271689593176402, "grad_norm": 4.116382122039795, "learning_rate": 3.135451505016723e-05, "loss": 3.5449, "step": 375 }, { "epoch": 0.06288414098758205, "grad_norm": 4.400504112243652, "learning_rate": 3.1438127090301004e-05, "loss": 2.8952, "step": 376 }, { "epoch": 0.06305138604340009, "grad_norm": 2.603239059448242, "learning_rate": 3.152173913043479e-05, "loss": 2.5225, "step": 377 }, { "epoch": 0.06321863109921813, "grad_norm": 2.315930128097534, "learning_rate": 3.160535117056856e-05, "loss": 2.8894, "step": 378 }, { "epoch": 0.06338587615503617, "grad_norm": 3.490770101547241, "learning_rate": 3.1688963210702346e-05, "loss": 3.0129, "step": 379 }, { "epoch": 0.0635531212108542, "grad_norm": 3.22188401222229, "learning_rate": 3.177257525083612e-05, "loss": 2.4104, "step": 380 }, { "epoch": 0.06372036626667224, "grad_norm": 2.0256898403167725, "learning_rate": 3.1856187290969905e-05, "loss": 2.7555, "step": 381 }, { "epoch": 0.06388761132249028, "grad_norm": 2.930828332901001, "learning_rate": 3.193979933110368e-05, "loss": 3.0689, "step": 382 }, { "epoch": 0.06405485637830832, "grad_norm": 5.68071985244751, "learning_rate": 3.2023411371237464e-05, "loss": 3.4776, "step": 383 }, { "epoch": 0.06422210143412635, "grad_norm": 3.1502158641815186, "learning_rate": 3.210702341137124e-05, "loss": 3.1865, "step": 384 }, { "epoch": 0.06438934648994439, "grad_norm": 6.444261074066162, "learning_rate": 3.2190635451505016e-05, "loss": 3.514, "step": 385 }, { "epoch": 0.06455659154576243, "grad_norm": 3.240809440612793, "learning_rate": 3.22742474916388e-05, "loss": 2.7396, "step": 386 }, { "epoch": 0.06472383660158046, "grad_norm": 6.811744689941406, "learning_rate": 3.2357859531772575e-05, "loss": 3.3581, "step": 387 }, { "epoch": 0.0648910816573985, "grad_norm": 2.571991443634033, "learning_rate": 3.244147157190636e-05, "loss": 3.1503, "step": 388 }, { "epoch": 0.06505832671321654, "grad_norm": 3.3896682262420654, "learning_rate": 3.2525083612040134e-05, "loss": 2.4913, "step": 389 }, { "epoch": 0.06522557176903458, "grad_norm": 2.2832794189453125, "learning_rate": 3.260869565217392e-05, "loss": 2.4966, "step": 390 }, { "epoch": 0.06539281682485261, "grad_norm": 4.9534783363342285, "learning_rate": 3.269230769230769e-05, "loss": 3.0502, "step": 391 }, { "epoch": 0.06556006188067065, "grad_norm": 2.567293882369995, "learning_rate": 3.2775919732441476e-05, "loss": 3.2728, "step": 392 }, { "epoch": 0.06572730693648869, "grad_norm": 3.3909192085266113, "learning_rate": 3.285953177257525e-05, "loss": 2.6515, "step": 393 }, { "epoch": 0.06589455199230673, "grad_norm": 3.820385694503784, "learning_rate": 3.2943143812709035e-05, "loss": 3.0859, "step": 394 }, { "epoch": 0.06606179704812476, "grad_norm": 2.8999719619750977, "learning_rate": 3.302675585284281e-05, "loss": 3.2898, "step": 395 }, { "epoch": 0.0662290421039428, "grad_norm": 2.893812417984009, "learning_rate": 3.311036789297659e-05, "loss": 2.8549, "step": 396 }, { "epoch": 0.06639628715976084, "grad_norm": 2.162724018096924, "learning_rate": 3.319397993311037e-05, "loss": 2.9023, "step": 397 }, { "epoch": 0.06656353221557888, "grad_norm": 2.1580257415771484, "learning_rate": 3.3277591973244146e-05, "loss": 2.9298, "step": 398 }, { "epoch": 0.06673077727139691, "grad_norm": 2.034555435180664, "learning_rate": 3.336120401337793e-05, "loss": 2.6566, "step": 399 }, { "epoch": 0.06689802232721495, "grad_norm": 3.586911916732788, "learning_rate": 3.3444816053511705e-05, "loss": 2.9332, "step": 400 }, { "epoch": 0.06706526738303299, "grad_norm": 2.620753288269043, "learning_rate": 3.352842809364549e-05, "loss": 2.4945, "step": 401 }, { "epoch": 0.06723251243885103, "grad_norm": 3.7173426151275635, "learning_rate": 3.3612040133779264e-05, "loss": 3.0159, "step": 402 }, { "epoch": 0.06739975749466906, "grad_norm": 3.0982704162597656, "learning_rate": 3.369565217391305e-05, "loss": 3.1953, "step": 403 }, { "epoch": 0.0675670025504871, "grad_norm": 3.587805986404419, "learning_rate": 3.3779264214046823e-05, "loss": 3.142, "step": 404 }, { "epoch": 0.06773424760630514, "grad_norm": 2.2752764225006104, "learning_rate": 3.3862876254180606e-05, "loss": 2.8601, "step": 405 }, { "epoch": 0.06790149266212317, "grad_norm": 2.198387622833252, "learning_rate": 3.394648829431438e-05, "loss": 2.8012, "step": 406 }, { "epoch": 0.06806873771794121, "grad_norm": 3.431004524230957, "learning_rate": 3.403010033444816e-05, "loss": 2.6902, "step": 407 }, { "epoch": 0.06823598277375925, "grad_norm": 2.56248140335083, "learning_rate": 3.411371237458194e-05, "loss": 3.1008, "step": 408 }, { "epoch": 0.06840322782957729, "grad_norm": 2.338951349258423, "learning_rate": 3.419732441471572e-05, "loss": 2.8193, "step": 409 }, { "epoch": 0.06857047288539532, "grad_norm": 1.9651976823806763, "learning_rate": 3.42809364548495e-05, "loss": 2.7613, "step": 410 }, { "epoch": 0.06873771794121336, "grad_norm": 2.632683277130127, "learning_rate": 3.436454849498328e-05, "loss": 2.6582, "step": 411 }, { "epoch": 0.0689049629970314, "grad_norm": 2.418351411819458, "learning_rate": 3.444816053511706e-05, "loss": 3.3551, "step": 412 }, { "epoch": 0.06907220805284944, "grad_norm": 3.1330063343048096, "learning_rate": 3.4531772575250836e-05, "loss": 3.1453, "step": 413 }, { "epoch": 0.06923945310866747, "grad_norm": 2.1742377281188965, "learning_rate": 3.461538461538462e-05, "loss": 2.9422, "step": 414 }, { "epoch": 0.06940669816448551, "grad_norm": 1.9282771348953247, "learning_rate": 3.4698996655518395e-05, "loss": 2.823, "step": 415 }, { "epoch": 0.06957394322030355, "grad_norm": 1.88218355178833, "learning_rate": 3.478260869565218e-05, "loss": 2.6697, "step": 416 }, { "epoch": 0.0697411882761216, "grad_norm": 2.8210935592651367, "learning_rate": 3.4866220735785954e-05, "loss": 2.9213, "step": 417 }, { "epoch": 0.06990843333193962, "grad_norm": 3.7556068897247314, "learning_rate": 3.494983277591973e-05, "loss": 3.3385, "step": 418 }, { "epoch": 0.07007567838775766, "grad_norm": 2.3339555263519287, "learning_rate": 3.503344481605351e-05, "loss": 3.1949, "step": 419 }, { "epoch": 0.0702429234435757, "grad_norm": 2.2984628677368164, "learning_rate": 3.511705685618729e-05, "loss": 3.0507, "step": 420 }, { "epoch": 0.07041016849939373, "grad_norm": 2.7668983936309814, "learning_rate": 3.520066889632107e-05, "loss": 2.7914, "step": 421 }, { "epoch": 0.07057741355521177, "grad_norm": 5.563778877258301, "learning_rate": 3.528428093645485e-05, "loss": 3.3208, "step": 422 }, { "epoch": 0.07074465861102981, "grad_norm": 1.7779664993286133, "learning_rate": 3.536789297658863e-05, "loss": 2.7005, "step": 423 }, { "epoch": 0.07091190366684785, "grad_norm": 2.7842960357666016, "learning_rate": 3.545150501672241e-05, "loss": 3.1971, "step": 424 }, { "epoch": 0.07107914872266588, "grad_norm": 2.5281267166137695, "learning_rate": 3.553511705685619e-05, "loss": 3.0627, "step": 425 }, { "epoch": 0.07124639377848392, "grad_norm": 2.295423746109009, "learning_rate": 3.5618729096989966e-05, "loss": 2.6439, "step": 426 }, { "epoch": 0.07141363883430196, "grad_norm": 3.11745285987854, "learning_rate": 3.570234113712375e-05, "loss": 2.6878, "step": 427 }, { "epoch": 0.07158088389012, "grad_norm": 3.3071866035461426, "learning_rate": 3.5785953177257525e-05, "loss": 2.8286, "step": 428 }, { "epoch": 0.07174812894593803, "grad_norm": 2.321533203125, "learning_rate": 3.58695652173913e-05, "loss": 2.945, "step": 429 }, { "epoch": 0.07191537400175607, "grad_norm": 3.069917678833008, "learning_rate": 3.5953177257525084e-05, "loss": 2.9392, "step": 430 }, { "epoch": 0.07208261905757411, "grad_norm": 3.45310115814209, "learning_rate": 3.603678929765886e-05, "loss": 2.8397, "step": 431 }, { "epoch": 0.07224986411339215, "grad_norm": 3.3119537830352783, "learning_rate": 3.612040133779264e-05, "loss": 3.2321, "step": 432 }, { "epoch": 0.07241710916921018, "grad_norm": 4.411495685577393, "learning_rate": 3.620401337792642e-05, "loss": 3.3078, "step": 433 }, { "epoch": 0.07258435422502822, "grad_norm": 3.022550344467163, "learning_rate": 3.62876254180602e-05, "loss": 2.6804, "step": 434 }, { "epoch": 0.07275159928084626, "grad_norm": 2.872744560241699, "learning_rate": 3.637123745819398e-05, "loss": 3.1774, "step": 435 }, { "epoch": 0.0729188443366643, "grad_norm": 3.6469249725341797, "learning_rate": 3.645484949832776e-05, "loss": 3.0517, "step": 436 }, { "epoch": 0.07308608939248233, "grad_norm": 3.234005928039551, "learning_rate": 3.653846153846154e-05, "loss": 2.4697, "step": 437 }, { "epoch": 0.07325333444830037, "grad_norm": 4.272342205047607, "learning_rate": 3.662207357859532e-05, "loss": 2.4999, "step": 438 }, { "epoch": 0.07342057950411841, "grad_norm": 3.7748212814331055, "learning_rate": 3.6705685618729096e-05, "loss": 3.198, "step": 439 }, { "epoch": 0.07358782455993644, "grad_norm": 2.7186408042907715, "learning_rate": 3.678929765886287e-05, "loss": 3.07, "step": 440 }, { "epoch": 0.07375506961575448, "grad_norm": 2.4797282218933105, "learning_rate": 3.6872909698996655e-05, "loss": 2.7251, "step": 441 }, { "epoch": 0.07392231467157252, "grad_norm": 3.1052298545837402, "learning_rate": 3.695652173913043e-05, "loss": 3.2381, "step": 442 }, { "epoch": 0.07408955972739056, "grad_norm": 2.5988056659698486, "learning_rate": 3.7040133779264214e-05, "loss": 3.2832, "step": 443 }, { "epoch": 0.07425680478320859, "grad_norm": 3.413637161254883, "learning_rate": 3.712374581939799e-05, "loss": 3.1279, "step": 444 }, { "epoch": 0.07442404983902663, "grad_norm": 3.2097065448760986, "learning_rate": 3.7207357859531773e-05, "loss": 3.6342, "step": 445 }, { "epoch": 0.07459129489484467, "grad_norm": 4.0559892654418945, "learning_rate": 3.729096989966555e-05, "loss": 3.259, "step": 446 }, { "epoch": 0.07475853995066271, "grad_norm": 3.3015127182006836, "learning_rate": 3.737458193979933e-05, "loss": 2.6927, "step": 447 }, { "epoch": 0.07492578500648074, "grad_norm": 3.0410759449005127, "learning_rate": 3.745819397993311e-05, "loss": 2.9399, "step": 448 }, { "epoch": 0.07509303006229878, "grad_norm": 5.403712749481201, "learning_rate": 3.754180602006689e-05, "loss": 2.8228, "step": 449 }, { "epoch": 0.07526027511811682, "grad_norm": 6.746461391448975, "learning_rate": 3.7625418060200674e-05, "loss": 3.4638, "step": 450 }, { "epoch": 0.07542752017393486, "grad_norm": 3.1841976642608643, "learning_rate": 3.770903010033445e-05, "loss": 2.7079, "step": 451 }, { "epoch": 0.07559476522975289, "grad_norm": 3.8411378860473633, "learning_rate": 3.7792642140468233e-05, "loss": 3.0291, "step": 452 }, { "epoch": 0.07576201028557093, "grad_norm": 3.02731990814209, "learning_rate": 3.787625418060201e-05, "loss": 3.1825, "step": 453 }, { "epoch": 0.07592925534138897, "grad_norm": 2.762439489364624, "learning_rate": 3.795986622073579e-05, "loss": 2.9363, "step": 454 }, { "epoch": 0.076096500397207, "grad_norm": 3.0531787872314453, "learning_rate": 3.804347826086957e-05, "loss": 3.1615, "step": 455 }, { "epoch": 0.07626374545302504, "grad_norm": 3.0016767978668213, "learning_rate": 3.812709030100335e-05, "loss": 3.0524, "step": 456 }, { "epoch": 0.07643099050884308, "grad_norm": 2.439188003540039, "learning_rate": 3.821070234113713e-05, "loss": 3.1133, "step": 457 }, { "epoch": 0.07659823556466112, "grad_norm": 2.8942863941192627, "learning_rate": 3.8294314381270904e-05, "loss": 2.9256, "step": 458 }, { "epoch": 0.07676548062047915, "grad_norm": 2.3794572353363037, "learning_rate": 3.837792642140469e-05, "loss": 2.7124, "step": 459 }, { "epoch": 0.07693272567629719, "grad_norm": 2.521470785140991, "learning_rate": 3.846153846153846e-05, "loss": 3.2262, "step": 460 }, { "epoch": 0.07709997073211523, "grad_norm": 4.833841323852539, "learning_rate": 3.8545150501672246e-05, "loss": 3.7069, "step": 461 }, { "epoch": 0.07726721578793327, "grad_norm": 2.7709591388702393, "learning_rate": 3.862876254180602e-05, "loss": 3.0488, "step": 462 }, { "epoch": 0.0774344608437513, "grad_norm": 2.069821357727051, "learning_rate": 3.8712374581939805e-05, "loss": 2.712, "step": 463 }, { "epoch": 0.07760170589956934, "grad_norm": 2.7856242656707764, "learning_rate": 3.879598662207358e-05, "loss": 3.1227, "step": 464 }, { "epoch": 0.07776895095538738, "grad_norm": 2.752918243408203, "learning_rate": 3.8879598662207364e-05, "loss": 3.0519, "step": 465 }, { "epoch": 0.07793619601120542, "grad_norm": 2.96220326423645, "learning_rate": 3.896321070234114e-05, "loss": 2.7787, "step": 466 }, { "epoch": 0.07810344106702345, "grad_norm": 3.504067897796631, "learning_rate": 3.904682274247492e-05, "loss": 2.7128, "step": 467 }, { "epoch": 0.07827068612284149, "grad_norm": 2.392916440963745, "learning_rate": 3.91304347826087e-05, "loss": 2.9196, "step": 468 }, { "epoch": 0.07843793117865953, "grad_norm": 2.425628423690796, "learning_rate": 3.9214046822742475e-05, "loss": 3.0905, "step": 469 }, { "epoch": 0.07860517623447758, "grad_norm": 2.3471243381500244, "learning_rate": 3.929765886287626e-05, "loss": 2.733, "step": 470 }, { "epoch": 0.0787724212902956, "grad_norm": 2.73205304145813, "learning_rate": 3.9381270903010034e-05, "loss": 2.9615, "step": 471 }, { "epoch": 0.07893966634611364, "grad_norm": 2.3634400367736816, "learning_rate": 3.946488294314382e-05, "loss": 2.8898, "step": 472 }, { "epoch": 0.07910691140193168, "grad_norm": 3.020653247833252, "learning_rate": 3.954849498327759e-05, "loss": 3.0322, "step": 473 }, { "epoch": 0.07927415645774971, "grad_norm": 2.279266119003296, "learning_rate": 3.9632107023411376e-05, "loss": 3.0761, "step": 474 }, { "epoch": 0.07944140151356775, "grad_norm": 6.017271995544434, "learning_rate": 3.971571906354515e-05, "loss": 3.3964, "step": 475 }, { "epoch": 0.0796086465693858, "grad_norm": 7.147303581237793, "learning_rate": 3.9799331103678935e-05, "loss": 3.5973, "step": 476 }, { "epoch": 0.07977589162520383, "grad_norm": 2.451066493988037, "learning_rate": 3.988294314381271e-05, "loss": 2.9069, "step": 477 }, { "epoch": 0.07994313668102186, "grad_norm": 2.4622225761413574, "learning_rate": 3.9966555183946494e-05, "loss": 3.1422, "step": 478 }, { "epoch": 0.0801103817368399, "grad_norm": 1.827227234840393, "learning_rate": 4.005016722408027e-05, "loss": 3.0037, "step": 479 }, { "epoch": 0.08027762679265794, "grad_norm": 3.3249740600585938, "learning_rate": 4.0133779264214046e-05, "loss": 3.0798, "step": 480 }, { "epoch": 0.08044487184847599, "grad_norm": 4.29397439956665, "learning_rate": 4.021739130434783e-05, "loss": 3.1432, "step": 481 }, { "epoch": 0.08061211690429401, "grad_norm": 2.613179922103882, "learning_rate": 4.0301003344481605e-05, "loss": 2.9688, "step": 482 }, { "epoch": 0.08077936196011205, "grad_norm": 2.745164155960083, "learning_rate": 4.038461538461539e-05, "loss": 2.6549, "step": 483 }, { "epoch": 0.0809466070159301, "grad_norm": 2.806901693344116, "learning_rate": 4.0468227424749165e-05, "loss": 3.1231, "step": 484 }, { "epoch": 0.08111385207174814, "grad_norm": 1.6760051250457764, "learning_rate": 4.055183946488295e-05, "loss": 2.8076, "step": 485 }, { "epoch": 0.08128109712756616, "grad_norm": 2.723209857940674, "learning_rate": 4.0635451505016724e-05, "loss": 2.941, "step": 486 }, { "epoch": 0.0814483421833842, "grad_norm": 4.948307037353516, "learning_rate": 4.0719063545150506e-05, "loss": 3.5329, "step": 487 }, { "epoch": 0.08161558723920224, "grad_norm": 2.550595998764038, "learning_rate": 4.080267558528428e-05, "loss": 2.7554, "step": 488 }, { "epoch": 0.08178283229502027, "grad_norm": 2.435180187225342, "learning_rate": 4.0886287625418065e-05, "loss": 2.9709, "step": 489 }, { "epoch": 0.08195007735083831, "grad_norm": 2.4460854530334473, "learning_rate": 4.096989966555184e-05, "loss": 2.7379, "step": 490 }, { "epoch": 0.08211732240665635, "grad_norm": 2.0727951526641846, "learning_rate": 4.105351170568562e-05, "loss": 2.7947, "step": 491 }, { "epoch": 0.0822845674624744, "grad_norm": 3.2144663333892822, "learning_rate": 4.11371237458194e-05, "loss": 3.0438, "step": 492 }, { "epoch": 0.08245181251829242, "grad_norm": 2.535924196243286, "learning_rate": 4.122073578595318e-05, "loss": 2.9344, "step": 493 }, { "epoch": 0.08261905757411046, "grad_norm": 3.3263747692108154, "learning_rate": 4.130434782608696e-05, "loss": 2.9549, "step": 494 }, { "epoch": 0.0827863026299285, "grad_norm": 2.548413038253784, "learning_rate": 4.1387959866220736e-05, "loss": 2.611, "step": 495 }, { "epoch": 0.08295354768574655, "grad_norm": 1.486773133277893, "learning_rate": 4.147157190635452e-05, "loss": 2.5386, "step": 496 }, { "epoch": 0.08312079274156457, "grad_norm": 1.8956315517425537, "learning_rate": 4.1555183946488295e-05, "loss": 2.6791, "step": 497 }, { "epoch": 0.08328803779738261, "grad_norm": 2.615337371826172, "learning_rate": 4.163879598662208e-05, "loss": 3.0233, "step": 498 }, { "epoch": 0.08345528285320065, "grad_norm": 1.8332130908966064, "learning_rate": 4.1722408026755854e-05, "loss": 2.4811, "step": 499 }, { "epoch": 0.0836225279090187, "grad_norm": 2.273568868637085, "learning_rate": 4.180602006688964e-05, "loss": 3.18, "step": 500 }, { "epoch": 0.08378977296483672, "grad_norm": 2.9985435009002686, "learning_rate": 4.188963210702341e-05, "loss": 3.2103, "step": 501 }, { "epoch": 0.08395701802065476, "grad_norm": 2.80599308013916, "learning_rate": 4.1973244147157196e-05, "loss": 2.9555, "step": 502 }, { "epoch": 0.0841242630764728, "grad_norm": 4.00032377243042, "learning_rate": 4.205685618729097e-05, "loss": 2.8451, "step": 503 }, { "epoch": 0.08429150813229085, "grad_norm": 2.258972406387329, "learning_rate": 4.214046822742475e-05, "loss": 2.569, "step": 504 }, { "epoch": 0.08445875318810887, "grad_norm": 2.7033426761627197, "learning_rate": 4.222408026755853e-05, "loss": 3.1104, "step": 505 }, { "epoch": 0.08462599824392691, "grad_norm": 1.90407395362854, "learning_rate": 4.230769230769231e-05, "loss": 2.5809, "step": 506 }, { "epoch": 0.08479324329974496, "grad_norm": 4.017109394073486, "learning_rate": 4.239130434782609e-05, "loss": 3.0585, "step": 507 }, { "epoch": 0.08496048835556298, "grad_norm": 5.408859729766846, "learning_rate": 4.2474916387959866e-05, "loss": 3.1496, "step": 508 }, { "epoch": 0.08512773341138102, "grad_norm": 2.1023685932159424, "learning_rate": 4.255852842809365e-05, "loss": 3.3178, "step": 509 }, { "epoch": 0.08529497846719906, "grad_norm": 2.6143012046813965, "learning_rate": 4.2642140468227425e-05, "loss": 3.3151, "step": 510 }, { "epoch": 0.0854622235230171, "grad_norm": 2.490028142929077, "learning_rate": 4.272575250836121e-05, "loss": 3.1597, "step": 511 }, { "epoch": 0.08562946857883513, "grad_norm": 2.854557752609253, "learning_rate": 4.2809364548494984e-05, "loss": 3.0278, "step": 512 }, { "epoch": 0.08579671363465317, "grad_norm": 5.653354167938232, "learning_rate": 4.289297658862877e-05, "loss": 3.2853, "step": 513 }, { "epoch": 0.08596395869047121, "grad_norm": 2.5116045475006104, "learning_rate": 4.297658862876254e-05, "loss": 2.7297, "step": 514 }, { "epoch": 0.08613120374628926, "grad_norm": 2.757796049118042, "learning_rate": 4.306020066889632e-05, "loss": 2.9686, "step": 515 }, { "epoch": 0.08629844880210728, "grad_norm": 2.5267586708068848, "learning_rate": 4.31438127090301e-05, "loss": 2.817, "step": 516 }, { "epoch": 0.08646569385792532, "grad_norm": 2.502147674560547, "learning_rate": 4.322742474916388e-05, "loss": 2.6515, "step": 517 }, { "epoch": 0.08663293891374337, "grad_norm": 2.8840065002441406, "learning_rate": 4.331103678929766e-05, "loss": 2.7798, "step": 518 }, { "epoch": 0.0868001839695614, "grad_norm": 2.145831823348999, "learning_rate": 4.339464882943144e-05, "loss": 2.4949, "step": 519 }, { "epoch": 0.08696742902537943, "grad_norm": 2.161543846130371, "learning_rate": 4.347826086956522e-05, "loss": 2.6316, "step": 520 }, { "epoch": 0.08713467408119747, "grad_norm": 2.450133800506592, "learning_rate": 4.3561872909698996e-05, "loss": 2.3727, "step": 521 }, { "epoch": 0.08730191913701552, "grad_norm": 2.093007802963257, "learning_rate": 4.364548494983278e-05, "loss": 2.7789, "step": 522 }, { "epoch": 0.08746916419283354, "grad_norm": 1.978015661239624, "learning_rate": 4.3729096989966556e-05, "loss": 2.6919, "step": 523 }, { "epoch": 0.08763640924865158, "grad_norm": 4.8268513679504395, "learning_rate": 4.381270903010034e-05, "loss": 2.9232, "step": 524 }, { "epoch": 0.08780365430446962, "grad_norm": 1.7604217529296875, "learning_rate": 4.3896321070234115e-05, "loss": 2.518, "step": 525 }, { "epoch": 0.08797089936028767, "grad_norm": 4.039196491241455, "learning_rate": 4.397993311036789e-05, "loss": 3.1951, "step": 526 }, { "epoch": 0.0881381444161057, "grad_norm": 2.459641218185425, "learning_rate": 4.4063545150501674e-05, "loss": 3.1874, "step": 527 }, { "epoch": 0.08830538947192373, "grad_norm": 3.4245548248291016, "learning_rate": 4.414715719063545e-05, "loss": 3.2926, "step": 528 }, { "epoch": 0.08847263452774178, "grad_norm": 4.848878860473633, "learning_rate": 4.423076923076923e-05, "loss": 3.0004, "step": 529 }, { "epoch": 0.08863987958355982, "grad_norm": 3.163508176803589, "learning_rate": 4.431438127090301e-05, "loss": 2.8637, "step": 530 }, { "epoch": 0.08880712463937784, "grad_norm": 2.282503366470337, "learning_rate": 4.439799331103679e-05, "loss": 2.8797, "step": 531 }, { "epoch": 0.08897436969519588, "grad_norm": 3.465090274810791, "learning_rate": 4.448160535117057e-05, "loss": 3.057, "step": 532 }, { "epoch": 0.08914161475101393, "grad_norm": 3.3177149295806885, "learning_rate": 4.456521739130435e-05, "loss": 3.1779, "step": 533 }, { "epoch": 0.08930885980683197, "grad_norm": 3.7461469173431396, "learning_rate": 4.464882943143813e-05, "loss": 3.2288, "step": 534 }, { "epoch": 0.08947610486265, "grad_norm": 3.99851655960083, "learning_rate": 4.473244147157191e-05, "loss": 2.8059, "step": 535 }, { "epoch": 0.08964334991846803, "grad_norm": 2.8065969944000244, "learning_rate": 4.4816053511705686e-05, "loss": 2.9231, "step": 536 }, { "epoch": 0.08981059497428608, "grad_norm": 2.3320000171661377, "learning_rate": 4.489966555183946e-05, "loss": 3.0081, "step": 537 }, { "epoch": 0.0899778400301041, "grad_norm": 3.688882350921631, "learning_rate": 4.4983277591973245e-05, "loss": 3.2511, "step": 538 }, { "epoch": 0.09014508508592214, "grad_norm": 2.2491559982299805, "learning_rate": 4.506688963210702e-05, "loss": 2.5912, "step": 539 }, { "epoch": 0.09031233014174019, "grad_norm": 3.5064611434936523, "learning_rate": 4.5150501672240804e-05, "loss": 2.7932, "step": 540 }, { "epoch": 0.09047957519755823, "grad_norm": 7.184377193450928, "learning_rate": 4.523411371237458e-05, "loss": 3.6838, "step": 541 }, { "epoch": 0.09064682025337625, "grad_norm": 4.048781871795654, "learning_rate": 4.531772575250836e-05, "loss": 3.2179, "step": 542 }, { "epoch": 0.0908140653091943, "grad_norm": 3.166337251663208, "learning_rate": 4.540133779264214e-05, "loss": 2.9196, "step": 543 }, { "epoch": 0.09098131036501234, "grad_norm": 2.6877660751342773, "learning_rate": 4.548494983277592e-05, "loss": 2.8334, "step": 544 }, { "epoch": 0.09114855542083038, "grad_norm": 3.8985180854797363, "learning_rate": 4.55685618729097e-05, "loss": 3.0232, "step": 545 }, { "epoch": 0.0913158004766484, "grad_norm": 1.9233304262161255, "learning_rate": 4.565217391304348e-05, "loss": 2.7697, "step": 546 }, { "epoch": 0.09148304553246644, "grad_norm": 2.0788514614105225, "learning_rate": 4.573578595317726e-05, "loss": 2.7203, "step": 547 }, { "epoch": 0.09165029058828449, "grad_norm": 4.00677490234375, "learning_rate": 4.581939799331103e-05, "loss": 3.069, "step": 548 }, { "epoch": 0.09181753564410253, "grad_norm": 2.949918270111084, "learning_rate": 4.590301003344482e-05, "loss": 2.882, "step": 549 }, { "epoch": 0.09198478069992055, "grad_norm": 3.2927207946777344, "learning_rate": 4.59866220735786e-05, "loss": 2.9775, "step": 550 }, { "epoch": 0.0921520257557386, "grad_norm": 4.737890243530273, "learning_rate": 4.607023411371238e-05, "loss": 2.9977, "step": 551 }, { "epoch": 0.09231927081155664, "grad_norm": 4.001957893371582, "learning_rate": 4.615384615384616e-05, "loss": 3.0282, "step": 552 }, { "epoch": 0.09248651586737468, "grad_norm": 2.466604471206665, "learning_rate": 4.623745819397994e-05, "loss": 2.8802, "step": 553 }, { "epoch": 0.0926537609231927, "grad_norm": 2.453812599182129, "learning_rate": 4.632107023411372e-05, "loss": 2.8233, "step": 554 }, { "epoch": 0.09282100597901075, "grad_norm": 3.8131377696990967, "learning_rate": 4.640468227424749e-05, "loss": 3.2322, "step": 555 }, { "epoch": 0.09298825103482879, "grad_norm": 2.298360586166382, "learning_rate": 4.6488294314381276e-05, "loss": 2.6835, "step": 556 }, { "epoch": 0.09315549609064681, "grad_norm": 2.709240436553955, "learning_rate": 4.657190635451505e-05, "loss": 2.7683, "step": 557 }, { "epoch": 0.09332274114646485, "grad_norm": 2.6775803565979004, "learning_rate": 4.6655518394648835e-05, "loss": 3.0388, "step": 558 }, { "epoch": 0.0934899862022829, "grad_norm": 4.882277011871338, "learning_rate": 4.673913043478261e-05, "loss": 3.4278, "step": 559 }, { "epoch": 0.09365723125810094, "grad_norm": 2.8916590213775635, "learning_rate": 4.6822742474916394e-05, "loss": 2.8952, "step": 560 }, { "epoch": 0.09382447631391896, "grad_norm": 3.0521299839019775, "learning_rate": 4.690635451505017e-05, "loss": 2.8233, "step": 561 }, { "epoch": 0.093991721369737, "grad_norm": 2.46439790725708, "learning_rate": 4.698996655518395e-05, "loss": 2.7447, "step": 562 }, { "epoch": 0.09415896642555505, "grad_norm": 5.566948890686035, "learning_rate": 4.707357859531773e-05, "loss": 3.7515, "step": 563 }, { "epoch": 0.09432621148137309, "grad_norm": 2.5461742877960205, "learning_rate": 4.715719063545151e-05, "loss": 2.8935, "step": 564 }, { "epoch": 0.09449345653719111, "grad_norm": 4.556259632110596, "learning_rate": 4.724080267558529e-05, "loss": 3.0152, "step": 565 }, { "epoch": 0.09466070159300916, "grad_norm": 2.0223686695098877, "learning_rate": 4.7324414715719065e-05, "loss": 2.8202, "step": 566 }, { "epoch": 0.0948279466488272, "grad_norm": 3.4800827503204346, "learning_rate": 4.740802675585285e-05, "loss": 3.2629, "step": 567 }, { "epoch": 0.09499519170464524, "grad_norm": 2.793830394744873, "learning_rate": 4.7491638795986624e-05, "loss": 2.9035, "step": 568 }, { "epoch": 0.09516243676046326, "grad_norm": 3.924760580062866, "learning_rate": 4.7575250836120407e-05, "loss": 3.0, "step": 569 }, { "epoch": 0.0953296818162813, "grad_norm": 2.7171742916107178, "learning_rate": 4.765886287625418e-05, "loss": 3.3308, "step": 570 }, { "epoch": 0.09549692687209935, "grad_norm": 2.3963327407836914, "learning_rate": 4.7742474916387966e-05, "loss": 2.9596, "step": 571 }, { "epoch": 0.09566417192791737, "grad_norm": 4.712761878967285, "learning_rate": 4.782608695652174e-05, "loss": 3.2726, "step": 572 }, { "epoch": 0.09583141698373542, "grad_norm": 2.745406150817871, "learning_rate": 4.7909698996655525e-05, "loss": 3.0021, "step": 573 }, { "epoch": 0.09599866203955346, "grad_norm": 2.4656238555908203, "learning_rate": 4.79933110367893e-05, "loss": 2.8759, "step": 574 }, { "epoch": 0.0961659070953715, "grad_norm": 1.5971622467041016, "learning_rate": 4.8076923076923084e-05, "loss": 2.3707, "step": 575 }, { "epoch": 0.09633315215118952, "grad_norm": 2.011336326599121, "learning_rate": 4.816053511705686e-05, "loss": 2.646, "step": 576 }, { "epoch": 0.09650039720700757, "grad_norm": 3.466235399246216, "learning_rate": 4.8244147157190636e-05, "loss": 3.2762, "step": 577 }, { "epoch": 0.0966676422628256, "grad_norm": 2.9777166843414307, "learning_rate": 4.832775919732442e-05, "loss": 2.8262, "step": 578 }, { "epoch": 0.09683488731864365, "grad_norm": 2.2701306343078613, "learning_rate": 4.8411371237458195e-05, "loss": 2.6413, "step": 579 }, { "epoch": 0.09700213237446167, "grad_norm": 3.0545737743377686, "learning_rate": 4.849498327759198e-05, "loss": 2.9603, "step": 580 }, { "epoch": 0.09716937743027972, "grad_norm": 2.251453161239624, "learning_rate": 4.8578595317725754e-05, "loss": 2.9705, "step": 581 }, { "epoch": 0.09733662248609776, "grad_norm": 3.155710220336914, "learning_rate": 4.866220735785954e-05, "loss": 2.7563, "step": 582 }, { "epoch": 0.0975038675419158, "grad_norm": 2.93874454498291, "learning_rate": 4.874581939799331e-05, "loss": 2.7884, "step": 583 }, { "epoch": 0.09767111259773383, "grad_norm": 3.2797529697418213, "learning_rate": 4.8829431438127096e-05, "loss": 3.2133, "step": 584 }, { "epoch": 0.09783835765355187, "grad_norm": 2.5589985847473145, "learning_rate": 4.891304347826087e-05, "loss": 2.9607, "step": 585 }, { "epoch": 0.09800560270936991, "grad_norm": 3.784144878387451, "learning_rate": 4.8996655518394655e-05, "loss": 3.3828, "step": 586 }, { "epoch": 0.09817284776518795, "grad_norm": 2.628394365310669, "learning_rate": 4.908026755852843e-05, "loss": 2.625, "step": 587 }, { "epoch": 0.09834009282100598, "grad_norm": 3.1170599460601807, "learning_rate": 4.916387959866221e-05, "loss": 3.0606, "step": 588 }, { "epoch": 0.09850733787682402, "grad_norm": 2.1702139377593994, "learning_rate": 4.924749163879599e-05, "loss": 2.5045, "step": 589 }, { "epoch": 0.09867458293264206, "grad_norm": 3.8704984188079834, "learning_rate": 4.9331103678929766e-05, "loss": 3.3075, "step": 590 }, { "epoch": 0.09884182798846008, "grad_norm": 2.19712495803833, "learning_rate": 4.941471571906355e-05, "loss": 2.9537, "step": 591 }, { "epoch": 0.09900907304427813, "grad_norm": 2.7391722202301025, "learning_rate": 4.9498327759197325e-05, "loss": 2.988, "step": 592 }, { "epoch": 0.09917631810009617, "grad_norm": 3.17637038230896, "learning_rate": 4.958193979933111e-05, "loss": 3.2191, "step": 593 }, { "epoch": 0.09934356315591421, "grad_norm": 3.9576058387756348, "learning_rate": 4.9665551839464884e-05, "loss": 2.9384, "step": 594 }, { "epoch": 0.09951080821173224, "grad_norm": 2.7109036445617676, "learning_rate": 4.974916387959867e-05, "loss": 2.6752, "step": 595 }, { "epoch": 0.09967805326755028, "grad_norm": 4.1206583976745605, "learning_rate": 4.983277591973244e-05, "loss": 2.7142, "step": 596 }, { "epoch": 0.09984529832336832, "grad_norm": 6.826204776763916, "learning_rate": 4.9916387959866226e-05, "loss": 3.1785, "step": 597 }, { "epoch": 0.10001254337918636, "grad_norm": 2.215266466140747, "learning_rate": 5e-05, "loss": 2.8809, "step": 598 }, { "epoch": 0.10017978843500439, "grad_norm": 2.8870818614959717, "learning_rate": 4.9999995739272476e-05, "loss": 2.6294, "step": 599 }, { "epoch": 0.10034703349082243, "grad_norm": 3.1702349185943604, "learning_rate": 4.999998295709135e-05, "loss": 3.5003, "step": 600 }, { "epoch": 0.10051427854664047, "grad_norm": 4.595864772796631, "learning_rate": 4.999996165346098e-05, "loss": 3.343, "step": 601 }, { "epoch": 0.10068152360245851, "grad_norm": 4.702067852020264, "learning_rate": 4.999993182838863e-05, "loss": 3.5147, "step": 602 }, { "epoch": 0.10084876865827654, "grad_norm": 2.953237295150757, "learning_rate": 4.999989348188445e-05, "loss": 2.7923, "step": 603 }, { "epoch": 0.10101601371409458, "grad_norm": 2.904754161834717, "learning_rate": 4.999984661396154e-05, "loss": 3.0171, "step": 604 }, { "epoch": 0.10118325876991262, "grad_norm": 2.6170008182525635, "learning_rate": 4.999979122463585e-05, "loss": 2.9641, "step": 605 }, { "epoch": 0.10135050382573065, "grad_norm": 3.4213478565216064, "learning_rate": 4.9999727313926276e-05, "loss": 2.9954, "step": 606 }, { "epoch": 0.10151774888154869, "grad_norm": 3.1679115295410156, "learning_rate": 4.9999654881854596e-05, "loss": 3.2712, "step": 607 }, { "epoch": 0.10168499393736673, "grad_norm": 2.3099677562713623, "learning_rate": 4.999957392844549e-05, "loss": 2.8135, "step": 608 }, { "epoch": 0.10185223899318477, "grad_norm": 3.4394614696502686, "learning_rate": 4.999948445372657e-05, "loss": 2.5125, "step": 609 }, { "epoch": 0.1020194840490028, "grad_norm": 2.226670980453491, "learning_rate": 4.999938645772833e-05, "loss": 2.7025, "step": 610 }, { "epoch": 0.10218672910482084, "grad_norm": 2.1458637714385986, "learning_rate": 4.999927994048416e-05, "loss": 2.8471, "step": 611 }, { "epoch": 0.10235397416063888, "grad_norm": 2.1480789184570312, "learning_rate": 4.999916490203037e-05, "loss": 2.7935, "step": 612 }, { "epoch": 0.10252121921645692, "grad_norm": 3.5607407093048096, "learning_rate": 4.999904134240618e-05, "loss": 2.8858, "step": 613 }, { "epoch": 0.10268846427227495, "grad_norm": 2.8406479358673096, "learning_rate": 4.9998909261653707e-05, "loss": 2.9893, "step": 614 }, { "epoch": 0.10285570932809299, "grad_norm": 2.5045430660247803, "learning_rate": 4.999876865981797e-05, "loss": 2.9112, "step": 615 }, { "epoch": 0.10302295438391103, "grad_norm": 3.368617534637451, "learning_rate": 4.999861953694689e-05, "loss": 2.8039, "step": 616 }, { "epoch": 0.10319019943972907, "grad_norm": 4.266275882720947, "learning_rate": 4.99984618930913e-05, "loss": 3.0329, "step": 617 }, { "epoch": 0.1033574444955471, "grad_norm": 3.705104351043701, "learning_rate": 4.999829572830492e-05, "loss": 3.4798, "step": 618 }, { "epoch": 0.10352468955136514, "grad_norm": 4.453060150146484, "learning_rate": 4.999812104264442e-05, "loss": 3.0441, "step": 619 }, { "epoch": 0.10369193460718318, "grad_norm": 3.0145387649536133, "learning_rate": 4.9997937836169315e-05, "loss": 2.9078, "step": 620 }, { "epoch": 0.10385917966300122, "grad_norm": 3.0352790355682373, "learning_rate": 4.999774610894207e-05, "loss": 3.0915, "step": 621 }, { "epoch": 0.10402642471881925, "grad_norm": 2.853654623031616, "learning_rate": 4.999754586102802e-05, "loss": 2.8524, "step": 622 }, { "epoch": 0.10419366977463729, "grad_norm": 4.414551734924316, "learning_rate": 4.999733709249543e-05, "loss": 2.8104, "step": 623 }, { "epoch": 0.10436091483045533, "grad_norm": 1.9161115884780884, "learning_rate": 4.999711980341547e-05, "loss": 2.6947, "step": 624 }, { "epoch": 0.10452815988627336, "grad_norm": 2.7311668395996094, "learning_rate": 4.999689399386219e-05, "loss": 3.1101, "step": 625 }, { "epoch": 0.1046954049420914, "grad_norm": 2.001525402069092, "learning_rate": 4.999665966391256e-05, "loss": 2.6378, "step": 626 }, { "epoch": 0.10486264999790944, "grad_norm": 3.9979467391967773, "learning_rate": 4.9996416813646464e-05, "loss": 2.9856, "step": 627 }, { "epoch": 0.10502989505372748, "grad_norm": 2.6706976890563965, "learning_rate": 4.999616544314668e-05, "loss": 3.2006, "step": 628 }, { "epoch": 0.1051971401095455, "grad_norm": 2.8648200035095215, "learning_rate": 4.999590555249887e-05, "loss": 3.3365, "step": 629 }, { "epoch": 0.10536438516536355, "grad_norm": 3.355009078979492, "learning_rate": 4.999563714179164e-05, "loss": 2.8843, "step": 630 }, { "epoch": 0.10553163022118159, "grad_norm": 2.3864948749542236, "learning_rate": 4.9995360211116474e-05, "loss": 2.9304, "step": 631 }, { "epoch": 0.10569887527699963, "grad_norm": 1.964548110961914, "learning_rate": 4.999507476056776e-05, "loss": 3.0027, "step": 632 }, { "epoch": 0.10586612033281766, "grad_norm": 2.1849634647369385, "learning_rate": 4.99947807902428e-05, "loss": 2.7438, "step": 633 }, { "epoch": 0.1060333653886357, "grad_norm": 1.6892940998077393, "learning_rate": 4.9994478300241796e-05, "loss": 2.8941, "step": 634 }, { "epoch": 0.10620061044445374, "grad_norm": 4.646712303161621, "learning_rate": 4.999416729066786e-05, "loss": 3.1625, "step": 635 }, { "epoch": 0.10636785550027178, "grad_norm": 2.868910551071167, "learning_rate": 4.9993847761627e-05, "loss": 2.7794, "step": 636 }, { "epoch": 0.1065351005560898, "grad_norm": 2.4344096183776855, "learning_rate": 4.999351971322812e-05, "loss": 2.9074, "step": 637 }, { "epoch": 0.10670234561190785, "grad_norm": 2.1295628547668457, "learning_rate": 4.999318314558306e-05, "loss": 2.662, "step": 638 }, { "epoch": 0.10686959066772589, "grad_norm": 4.279069900512695, "learning_rate": 4.999283805880651e-05, "loss": 2.9295, "step": 639 }, { "epoch": 0.10703683572354392, "grad_norm": 2.7644617557525635, "learning_rate": 4.999248445301612e-05, "loss": 3.0761, "step": 640 }, { "epoch": 0.10720408077936196, "grad_norm": 3.0360782146453857, "learning_rate": 4.999212232833241e-05, "loss": 3.0306, "step": 641 }, { "epoch": 0.10737132583518, "grad_norm": 3.0822620391845703, "learning_rate": 4.999175168487883e-05, "loss": 2.4342, "step": 642 }, { "epoch": 0.10753857089099804, "grad_norm": 2.5402750968933105, "learning_rate": 4.999137252278169e-05, "loss": 2.9402, "step": 643 }, { "epoch": 0.10770581594681607, "grad_norm": 2.955796480178833, "learning_rate": 4.9990984842170245e-05, "loss": 3.0474, "step": 644 }, { "epoch": 0.10787306100263411, "grad_norm": 3.001021146774292, "learning_rate": 4.9990588643176636e-05, "loss": 3.0784, "step": 645 }, { "epoch": 0.10804030605845215, "grad_norm": 4.919010639190674, "learning_rate": 4.999018392593592e-05, "loss": 2.9352, "step": 646 }, { "epoch": 0.10820755111427019, "grad_norm": 1.9175080060958862, "learning_rate": 4.998977069058603e-05, "loss": 3.0419, "step": 647 }, { "epoch": 0.10837479617008822, "grad_norm": 3.4278807640075684, "learning_rate": 4.998934893726783e-05, "loss": 3.2697, "step": 648 }, { "epoch": 0.10854204122590626, "grad_norm": 2.8143584728240967, "learning_rate": 4.998891866612509e-05, "loss": 2.7576, "step": 649 }, { "epoch": 0.1087092862817243, "grad_norm": 2.6466827392578125, "learning_rate": 4.9988479877304455e-05, "loss": 3.0772, "step": 650 }, { "epoch": 0.10887653133754234, "grad_norm": 3.6217103004455566, "learning_rate": 4.99880325709555e-05, "loss": 3.0041, "step": 651 }, { "epoch": 0.10904377639336037, "grad_norm": 3.9509363174438477, "learning_rate": 4.998757674723068e-05, "loss": 2.7575, "step": 652 }, { "epoch": 0.10921102144917841, "grad_norm": 3.5483837127685547, "learning_rate": 4.998711240628538e-05, "loss": 2.82, "step": 653 }, { "epoch": 0.10937826650499645, "grad_norm": 3.042452812194824, "learning_rate": 4.998663954827786e-05, "loss": 2.9415, "step": 654 }, { "epoch": 0.10954551156081449, "grad_norm": 2.5541257858276367, "learning_rate": 4.998615817336931e-05, "loss": 2.9503, "step": 655 }, { "epoch": 0.10971275661663252, "grad_norm": 2.6180899143218994, "learning_rate": 4.998566828172381e-05, "loss": 2.7477, "step": 656 }, { "epoch": 0.10988000167245056, "grad_norm": 2.8619933128356934, "learning_rate": 4.998516987350834e-05, "loss": 3.1993, "step": 657 }, { "epoch": 0.1100472467282686, "grad_norm": 5.042386531829834, "learning_rate": 4.998466294889279e-05, "loss": 2.8459, "step": 658 }, { "epoch": 0.11021449178408663, "grad_norm": 2.532939910888672, "learning_rate": 4.9984147508049945e-05, "loss": 2.9642, "step": 659 }, { "epoch": 0.11038173683990467, "grad_norm": 3.8720648288726807, "learning_rate": 4.998362355115549e-05, "loss": 3.2197, "step": 660 }, { "epoch": 0.11054898189572271, "grad_norm": 5.120781898498535, "learning_rate": 4.998309107838804e-05, "loss": 3.4037, "step": 661 }, { "epoch": 0.11071622695154075, "grad_norm": 2.9340667724609375, "learning_rate": 4.998255008992908e-05, "loss": 3.0822, "step": 662 }, { "epoch": 0.11088347200735878, "grad_norm": 2.6100831031799316, "learning_rate": 4.9982000585963004e-05, "loss": 2.7368, "step": 663 }, { "epoch": 0.11105071706317682, "grad_norm": 3.689965009689331, "learning_rate": 4.9981442566677134e-05, "loss": 2.7232, "step": 664 }, { "epoch": 0.11121796211899486, "grad_norm": 3.1951332092285156, "learning_rate": 4.998087603226166e-05, "loss": 3.1685, "step": 665 }, { "epoch": 0.1113852071748129, "grad_norm": 2.0082695484161377, "learning_rate": 4.998030098290969e-05, "loss": 2.7064, "step": 666 }, { "epoch": 0.11155245223063093, "grad_norm": 2.5071628093719482, "learning_rate": 4.997971741881724e-05, "loss": 2.9123, "step": 667 }, { "epoch": 0.11171969728644897, "grad_norm": 3.033701181411743, "learning_rate": 4.9979125340183216e-05, "loss": 2.9744, "step": 668 }, { "epoch": 0.11188694234226701, "grad_norm": 4.606934547424316, "learning_rate": 4.997852474720945e-05, "loss": 3.3427, "step": 669 }, { "epoch": 0.11205418739808505, "grad_norm": 3.42999529838562, "learning_rate": 4.997791564010064e-05, "loss": 3.4904, "step": 670 }, { "epoch": 0.11222143245390308, "grad_norm": 3.481557846069336, "learning_rate": 4.997729801906441e-05, "loss": 2.7632, "step": 671 }, { "epoch": 0.11238867750972112, "grad_norm": 4.434562683105469, "learning_rate": 4.997667188431129e-05, "loss": 2.8543, "step": 672 }, { "epoch": 0.11255592256553916, "grad_norm": 5.441007614135742, "learning_rate": 4.997603723605469e-05, "loss": 3.3316, "step": 673 }, { "epoch": 0.11272316762135719, "grad_norm": 2.511531114578247, "learning_rate": 4.997539407451095e-05, "loss": 2.631, "step": 674 }, { "epoch": 0.11289041267717523, "grad_norm": 3.9592323303222656, "learning_rate": 4.997474239989928e-05, "loss": 3.1441, "step": 675 }, { "epoch": 0.11305765773299327, "grad_norm": 3.8708887100219727, "learning_rate": 4.997408221244182e-05, "loss": 2.777, "step": 676 }, { "epoch": 0.11322490278881131, "grad_norm": 2.3795552253723145, "learning_rate": 4.997341351236359e-05, "loss": 2.7933, "step": 677 }, { "epoch": 0.11339214784462934, "grad_norm": 2.8319790363311768, "learning_rate": 4.997273629989254e-05, "loss": 2.754, "step": 678 }, { "epoch": 0.11355939290044738, "grad_norm": 2.197510242462158, "learning_rate": 4.997205057525949e-05, "loss": 2.8485, "step": 679 }, { "epoch": 0.11372663795626542, "grad_norm": 2.6863725185394287, "learning_rate": 4.997135633869817e-05, "loss": 3.0453, "step": 680 }, { "epoch": 0.11389388301208346, "grad_norm": 2.2702786922454834, "learning_rate": 4.997065359044523e-05, "loss": 2.6886, "step": 681 }, { "epoch": 0.11406112806790149, "grad_norm": 2.5089964866638184, "learning_rate": 4.99699423307402e-05, "loss": 3.0206, "step": 682 }, { "epoch": 0.11422837312371953, "grad_norm": 2.712219715118408, "learning_rate": 4.9969222559825525e-05, "loss": 2.8283, "step": 683 }, { "epoch": 0.11439561817953757, "grad_norm": 2.6860079765319824, "learning_rate": 4.9968494277946534e-05, "loss": 3.0986, "step": 684 }, { "epoch": 0.11456286323535561, "grad_norm": 6.263725757598877, "learning_rate": 4.996775748535147e-05, "loss": 3.2857, "step": 685 }, { "epoch": 0.11473010829117364, "grad_norm": 2.3474090099334717, "learning_rate": 4.996701218229148e-05, "loss": 3.1375, "step": 686 }, { "epoch": 0.11489735334699168, "grad_norm": 2.5577433109283447, "learning_rate": 4.996625836902061e-05, "loss": 2.8036, "step": 687 }, { "epoch": 0.11506459840280972, "grad_norm": 4.5564775466918945, "learning_rate": 4.99654960457958e-05, "loss": 2.8482, "step": 688 }, { "epoch": 0.11523184345862776, "grad_norm": 3.627901077270508, "learning_rate": 4.99647252128769e-05, "loss": 2.9135, "step": 689 }, { "epoch": 0.11539908851444579, "grad_norm": 3.253361701965332, "learning_rate": 4.996394587052663e-05, "loss": 3.2387, "step": 690 }, { "epoch": 0.11556633357026383, "grad_norm": 2.9540932178497314, "learning_rate": 4.996315801901066e-05, "loss": 3.379, "step": 691 }, { "epoch": 0.11573357862608187, "grad_norm": 2.687427043914795, "learning_rate": 4.996236165859753e-05, "loss": 2.8956, "step": 692 }, { "epoch": 0.1159008236818999, "grad_norm": 2.0105161666870117, "learning_rate": 4.9961556789558685e-05, "loss": 2.5116, "step": 693 }, { "epoch": 0.11606806873771794, "grad_norm": 3.718292236328125, "learning_rate": 4.9960743412168463e-05, "loss": 3.2342, "step": 694 }, { "epoch": 0.11623531379353598, "grad_norm": 2.0292601585388184, "learning_rate": 4.995992152670412e-05, "loss": 2.6781, "step": 695 }, { "epoch": 0.11640255884935402, "grad_norm": 3.9491422176361084, "learning_rate": 4.99590911334458e-05, "loss": 2.9786, "step": 696 }, { "epoch": 0.11656980390517205, "grad_norm": 4.446747303009033, "learning_rate": 4.995825223267656e-05, "loss": 2.4746, "step": 697 }, { "epoch": 0.11673704896099009, "grad_norm": 2.7430481910705566, "learning_rate": 4.995740482468232e-05, "loss": 3.0298, "step": 698 }, { "epoch": 0.11690429401680813, "grad_norm": 2.3943698406219482, "learning_rate": 4.9956548909751956e-05, "loss": 2.8596, "step": 699 }, { "epoch": 0.11707153907262617, "grad_norm": 4.455947399139404, "learning_rate": 4.995568448817719e-05, "loss": 3.407, "step": 700 }, { "epoch": 0.1172387841284442, "grad_norm": 3.276171922683716, "learning_rate": 4.995481156025268e-05, "loss": 2.7501, "step": 701 }, { "epoch": 0.11740602918426224, "grad_norm": 2.7432668209075928, "learning_rate": 4.995393012627597e-05, "loss": 2.875, "step": 702 }, { "epoch": 0.11757327424008028, "grad_norm": 3.5378530025482178, "learning_rate": 4.99530401865475e-05, "loss": 2.9815, "step": 703 }, { "epoch": 0.11774051929589832, "grad_norm": 3.180459976196289, "learning_rate": 4.9952141741370606e-05, "loss": 3.1384, "step": 704 }, { "epoch": 0.11790776435171635, "grad_norm": 3.134700298309326, "learning_rate": 4.995123479105154e-05, "loss": 3.1168, "step": 705 }, { "epoch": 0.11807500940753439, "grad_norm": 3.123302698135376, "learning_rate": 4.9950319335899454e-05, "loss": 2.8052, "step": 706 }, { "epoch": 0.11824225446335243, "grad_norm": 2.7749836444854736, "learning_rate": 4.9949395376226366e-05, "loss": 2.9355, "step": 707 }, { "epoch": 0.11840949951917046, "grad_norm": 3.232726812362671, "learning_rate": 4.9948462912347225e-05, "loss": 2.9771, "step": 708 }, { "epoch": 0.1185767445749885, "grad_norm": 3.69875168800354, "learning_rate": 4.994752194457988e-05, "loss": 3.0952, "step": 709 }, { "epoch": 0.11874398963080654, "grad_norm": 2.4349277019500732, "learning_rate": 4.994657247324505e-05, "loss": 3.0227, "step": 710 }, { "epoch": 0.11891123468662458, "grad_norm": 7.107716083526611, "learning_rate": 4.9945614498666385e-05, "loss": 3.4098, "step": 711 }, { "epoch": 0.11907847974244261, "grad_norm": 2.712965250015259, "learning_rate": 4.9944648021170403e-05, "loss": 2.7514, "step": 712 }, { "epoch": 0.11924572479826065, "grad_norm": 3.6287479400634766, "learning_rate": 4.9943673041086546e-05, "loss": 2.7323, "step": 713 }, { "epoch": 0.11941296985407869, "grad_norm": 4.0539116859436035, "learning_rate": 4.994268955874715e-05, "loss": 2.8032, "step": 714 }, { "epoch": 0.11958021490989673, "grad_norm": 5.568749904632568, "learning_rate": 4.994169757448742e-05, "loss": 3.1187, "step": 715 }, { "epoch": 0.11974745996571476, "grad_norm": 4.165975093841553, "learning_rate": 4.994069708864551e-05, "loss": 2.9687, "step": 716 }, { "epoch": 0.1199147050215328, "grad_norm": 2.3578178882598877, "learning_rate": 4.993968810156242e-05, "loss": 2.8734, "step": 717 }, { "epoch": 0.12008195007735084, "grad_norm": 2.1569578647613525, "learning_rate": 4.993867061358208e-05, "loss": 2.7046, "step": 718 }, { "epoch": 0.12024919513316888, "grad_norm": 3.4436519145965576, "learning_rate": 4.993764462505132e-05, "loss": 2.8116, "step": 719 }, { "epoch": 0.12041644018898691, "grad_norm": 2.1866683959960938, "learning_rate": 4.993661013631985e-05, "loss": 2.7804, "step": 720 }, { "epoch": 0.12058368524480495, "grad_norm": 2.9582724571228027, "learning_rate": 4.993556714774027e-05, "loss": 3.2457, "step": 721 }, { "epoch": 0.12075093030062299, "grad_norm": 2.400707244873047, "learning_rate": 4.9934515659668116e-05, "loss": 2.7893, "step": 722 }, { "epoch": 0.12091817535644102, "grad_norm": 5.070247650146484, "learning_rate": 4.9933455672461784e-05, "loss": 3.3186, "step": 723 }, { "epoch": 0.12108542041225906, "grad_norm": 3.240022659301758, "learning_rate": 4.9932387186482565e-05, "loss": 2.9554, "step": 724 }, { "epoch": 0.1212526654680771, "grad_norm": 4.550904273986816, "learning_rate": 4.9931310202094683e-05, "loss": 2.9043, "step": 725 }, { "epoch": 0.12141991052389514, "grad_norm": 3.4847187995910645, "learning_rate": 4.993022471966523e-05, "loss": 3.154, "step": 726 }, { "epoch": 0.12158715557971317, "grad_norm": 2.785179853439331, "learning_rate": 4.9929130739564204e-05, "loss": 2.8536, "step": 727 }, { "epoch": 0.12175440063553121, "grad_norm": 2.897012710571289, "learning_rate": 4.992802826216449e-05, "loss": 2.8802, "step": 728 }, { "epoch": 0.12192164569134925, "grad_norm": 4.094494342803955, "learning_rate": 4.9926917287841865e-05, "loss": 2.7018, "step": 729 }, { "epoch": 0.12208889074716729, "grad_norm": 1.9531452655792236, "learning_rate": 4.992579781697505e-05, "loss": 2.486, "step": 730 }, { "epoch": 0.12225613580298532, "grad_norm": 2.8330042362213135, "learning_rate": 4.992466984994559e-05, "loss": 2.7859, "step": 731 }, { "epoch": 0.12242338085880336, "grad_norm": 3.4791319370269775, "learning_rate": 4.992353338713799e-05, "loss": 2.8041, "step": 732 }, { "epoch": 0.1225906259146214, "grad_norm": 3.500182628631592, "learning_rate": 4.99223884289396e-05, "loss": 3.138, "step": 733 }, { "epoch": 0.12275787097043944, "grad_norm": 4.43774938583374, "learning_rate": 4.992123497574069e-05, "loss": 3.136, "step": 734 }, { "epoch": 0.12292511602625747, "grad_norm": 2.560802936553955, "learning_rate": 4.9920073027934445e-05, "loss": 2.9168, "step": 735 }, { "epoch": 0.12309236108207551, "grad_norm": 5.097366809844971, "learning_rate": 4.991890258591691e-05, "loss": 2.8287, "step": 736 }, { "epoch": 0.12325960613789355, "grad_norm": 5.267874240875244, "learning_rate": 4.9917723650087034e-05, "loss": 3.2044, "step": 737 }, { "epoch": 0.12342685119371159, "grad_norm": 3.686082363128662, "learning_rate": 4.991653622084668e-05, "loss": 2.9971, "step": 738 }, { "epoch": 0.12359409624952962, "grad_norm": 3.854748487472534, "learning_rate": 4.9915340298600584e-05, "loss": 2.9005, "step": 739 }, { "epoch": 0.12376134130534766, "grad_norm": 3.6813154220581055, "learning_rate": 4.9914135883756385e-05, "loss": 2.9164, "step": 740 }, { "epoch": 0.1239285863611657, "grad_norm": 6.267175674438477, "learning_rate": 4.991292297672462e-05, "loss": 3.2557, "step": 741 }, { "epoch": 0.12409583141698373, "grad_norm": 3.1315629482269287, "learning_rate": 4.991170157791873e-05, "loss": 2.47, "step": 742 }, { "epoch": 0.12426307647280177, "grad_norm": 3.3873074054718018, "learning_rate": 4.9910471687755025e-05, "loss": 3.0962, "step": 743 }, { "epoch": 0.12443032152861981, "grad_norm": 5.7802300453186035, "learning_rate": 4.990923330665272e-05, "loss": 3.1892, "step": 744 }, { "epoch": 0.12459756658443785, "grad_norm": 2.272925853729248, "learning_rate": 4.990798643503393e-05, "loss": 2.8588, "step": 745 }, { "epoch": 0.12476481164025588, "grad_norm": 2.73773193359375, "learning_rate": 4.990673107332368e-05, "loss": 2.9109, "step": 746 }, { "epoch": 0.12493205669607392, "grad_norm": 2.611372947692871, "learning_rate": 4.990546722194984e-05, "loss": 2.7866, "step": 747 }, { "epoch": 0.12509930175189196, "grad_norm": 4.2068352699279785, "learning_rate": 4.990419488134323e-05, "loss": 2.7837, "step": 748 }, { "epoch": 0.12526654680771, "grad_norm": 4.5296173095703125, "learning_rate": 4.9902914051937524e-05, "loss": 3.1947, "step": 749 }, { "epoch": 0.12543379186352804, "grad_norm": 2.499241828918457, "learning_rate": 4.9901624734169306e-05, "loss": 3.2332, "step": 750 }, { "epoch": 0.12560103691934607, "grad_norm": 4.029646396636963, "learning_rate": 4.990032692847805e-05, "loss": 3.0389, "step": 751 }, { "epoch": 0.1257682819751641, "grad_norm": 4.44845724105835, "learning_rate": 4.9899020635306124e-05, "loss": 3.4199, "step": 752 }, { "epoch": 0.12593552703098215, "grad_norm": 3.346797466278076, "learning_rate": 4.98977058550988e-05, "loss": 3.4807, "step": 753 }, { "epoch": 0.12610277208680018, "grad_norm": 2.1955745220184326, "learning_rate": 4.9896382588304216e-05, "loss": 2.7587, "step": 754 }, { "epoch": 0.12627001714261823, "grad_norm": 2.601008415222168, "learning_rate": 4.9895050835373416e-05, "loss": 2.8116, "step": 755 }, { "epoch": 0.12643726219843626, "grad_norm": 3.0236427783966064, "learning_rate": 4.9893710596760346e-05, "loss": 2.898, "step": 756 }, { "epoch": 0.1266045072542543, "grad_norm": 2.5923123359680176, "learning_rate": 4.989236187292184e-05, "loss": 2.9379, "step": 757 }, { "epoch": 0.12677175231007234, "grad_norm": 5.02788782119751, "learning_rate": 4.989100466431763e-05, "loss": 3.051, "step": 758 }, { "epoch": 0.12693899736589037, "grad_norm": 2.024794340133667, "learning_rate": 4.988963897141031e-05, "loss": 2.9428, "step": 759 }, { "epoch": 0.1271062424217084, "grad_norm": 2.384519100189209, "learning_rate": 4.988826479466541e-05, "loss": 2.6777, "step": 760 }, { "epoch": 0.12727348747752645, "grad_norm": 3.7775096893310547, "learning_rate": 4.988688213455131e-05, "loss": 3.0094, "step": 761 }, { "epoch": 0.12744073253334448, "grad_norm": 2.453615188598633, "learning_rate": 4.9885490991539304e-05, "loss": 2.9058, "step": 762 }, { "epoch": 0.1276079775891625, "grad_norm": 5.520934104919434, "learning_rate": 4.988409136610358e-05, "loss": 3.5268, "step": 763 }, { "epoch": 0.12777522264498056, "grad_norm": 3.656724452972412, "learning_rate": 4.9882683258721216e-05, "loss": 2.9508, "step": 764 }, { "epoch": 0.1279424677007986, "grad_norm": 2.577235221862793, "learning_rate": 4.988126666987217e-05, "loss": 2.7051, "step": 765 }, { "epoch": 0.12810971275661664, "grad_norm": 3.701707363128662, "learning_rate": 4.987984160003931e-05, "loss": 2.9399, "step": 766 }, { "epoch": 0.12827695781243467, "grad_norm": 2.675867795944214, "learning_rate": 4.987840804970836e-05, "loss": 2.6069, "step": 767 }, { "epoch": 0.1284442028682527, "grad_norm": 4.06071138381958, "learning_rate": 4.9876966019367975e-05, "loss": 3.0995, "step": 768 }, { "epoch": 0.12861144792407075, "grad_norm": 2.74296236038208, "learning_rate": 4.9875515509509686e-05, "loss": 2.7405, "step": 769 }, { "epoch": 0.12877869297988878, "grad_norm": 3.2363839149475098, "learning_rate": 4.987405652062789e-05, "loss": 2.8284, "step": 770 }, { "epoch": 0.1289459380357068, "grad_norm": 6.875770092010498, "learning_rate": 4.987258905321992e-05, "loss": 3.0919, "step": 771 }, { "epoch": 0.12911318309152486, "grad_norm": 3.8777079582214355, "learning_rate": 4.987111310778596e-05, "loss": 2.7659, "step": 772 }, { "epoch": 0.1292804281473429, "grad_norm": 2.47179913520813, "learning_rate": 4.98696286848291e-05, "loss": 2.4689, "step": 773 }, { "epoch": 0.12944767320316092, "grad_norm": 2.6261703968048096, "learning_rate": 4.986813578485532e-05, "loss": 3.0775, "step": 774 }, { "epoch": 0.12961491825897897, "grad_norm": 4.262143135070801, "learning_rate": 4.9866634408373484e-05, "loss": 3.2007, "step": 775 }, { "epoch": 0.129782163314797, "grad_norm": 7.5064897537231445, "learning_rate": 4.986512455589535e-05, "loss": 2.779, "step": 776 }, { "epoch": 0.12994940837061505, "grad_norm": 4.7984747886657715, "learning_rate": 4.986360622793557e-05, "loss": 2.8522, "step": 777 }, { "epoch": 0.13011665342643308, "grad_norm": 2.9100794792175293, "learning_rate": 4.9862079425011664e-05, "loss": 3.032, "step": 778 }, { "epoch": 0.1302838984822511, "grad_norm": 2.9449944496154785, "learning_rate": 4.9860544147644065e-05, "loss": 2.6693, "step": 779 }, { "epoch": 0.13045114353806916, "grad_norm": 4.114263534545898, "learning_rate": 4.985900039635609e-05, "loss": 3.2063, "step": 780 }, { "epoch": 0.1306183885938872, "grad_norm": 4.283298969268799, "learning_rate": 4.985744817167393e-05, "loss": 3.1806, "step": 781 }, { "epoch": 0.13078563364970522, "grad_norm": 2.867211103439331, "learning_rate": 4.9855887474126675e-05, "loss": 2.8248, "step": 782 }, { "epoch": 0.13095287870552327, "grad_norm": 2.5060675144195557, "learning_rate": 4.9854318304246304e-05, "loss": 3.1386, "step": 783 }, { "epoch": 0.1311201237613413, "grad_norm": 3.6026721000671387, "learning_rate": 4.9852740662567685e-05, "loss": 2.9145, "step": 784 }, { "epoch": 0.13128736881715936, "grad_norm": 3.492955207824707, "learning_rate": 4.985115454962856e-05, "loss": 2.9509, "step": 785 }, { "epoch": 0.13145461387297738, "grad_norm": 3.3801512718200684, "learning_rate": 4.9849559965969586e-05, "loss": 2.931, "step": 786 }, { "epoch": 0.1316218589287954, "grad_norm": 4.571837902069092, "learning_rate": 4.984795691213427e-05, "loss": 3.0641, "step": 787 }, { "epoch": 0.13178910398461346, "grad_norm": 3.506398916244507, "learning_rate": 4.984634538866904e-05, "loss": 3.0384, "step": 788 }, { "epoch": 0.1319563490404315, "grad_norm": 3.927084445953369, "learning_rate": 4.984472539612318e-05, "loss": 3.298, "step": 789 }, { "epoch": 0.13212359409624952, "grad_norm": 2.7334961891174316, "learning_rate": 4.984309693504889e-05, "loss": 2.5432, "step": 790 }, { "epoch": 0.13229083915206757, "grad_norm": 6.2230048179626465, "learning_rate": 4.984146000600125e-05, "loss": 3.256, "step": 791 }, { "epoch": 0.1324580842078856, "grad_norm": 3.1181750297546387, "learning_rate": 4.983981460953821e-05, "loss": 2.7963, "step": 792 }, { "epoch": 0.13262532926370363, "grad_norm": 4.825429439544678, "learning_rate": 4.983816074622063e-05, "loss": 3.2091, "step": 793 }, { "epoch": 0.13279257431952168, "grad_norm": 3.4500768184661865, "learning_rate": 4.9836498416612224e-05, "loss": 2.9076, "step": 794 }, { "epoch": 0.1329598193753397, "grad_norm": 1.9775090217590332, "learning_rate": 4.9834827621279625e-05, "loss": 3.3427, "step": 795 }, { "epoch": 0.13312706443115777, "grad_norm": 5.289772987365723, "learning_rate": 4.9833148360792336e-05, "loss": 3.3102, "step": 796 }, { "epoch": 0.1332943094869758, "grad_norm": 4.246041297912598, "learning_rate": 4.983146063572274e-05, "loss": 3.2465, "step": 797 }, { "epoch": 0.13346155454279382, "grad_norm": 2.8196728229522705, "learning_rate": 4.982976444664611e-05, "loss": 2.6256, "step": 798 }, { "epoch": 0.13362879959861187, "grad_norm": 2.880267858505249, "learning_rate": 4.982805979414062e-05, "loss": 2.7121, "step": 799 }, { "epoch": 0.1337960446544299, "grad_norm": 3.552816867828369, "learning_rate": 4.982634667878731e-05, "loss": 2.5457, "step": 800 }, { "epoch": 0.13396328971024793, "grad_norm": 2.7183873653411865, "learning_rate": 4.982462510117009e-05, "loss": 2.4043, "step": 801 }, { "epoch": 0.13413053476606598, "grad_norm": 2.681792736053467, "learning_rate": 4.98228950618758e-05, "loss": 2.5395, "step": 802 }, { "epoch": 0.134297779821884, "grad_norm": 7.300746440887451, "learning_rate": 4.9821156561494125e-05, "loss": 3.3422, "step": 803 }, { "epoch": 0.13446502487770207, "grad_norm": 4.185237884521484, "learning_rate": 4.981940960061765e-05, "loss": 3.0387, "step": 804 }, { "epoch": 0.1346322699335201, "grad_norm": 4.543412685394287, "learning_rate": 4.981765417984184e-05, "loss": 3.2047, "step": 805 }, { "epoch": 0.13479951498933812, "grad_norm": 4.084909915924072, "learning_rate": 4.981589029976505e-05, "loss": 2.8066, "step": 806 }, { "epoch": 0.13496676004515618, "grad_norm": 2.8556268215179443, "learning_rate": 4.9814117960988506e-05, "loss": 2.9118, "step": 807 }, { "epoch": 0.1351340051009742, "grad_norm": 2.7172048091888428, "learning_rate": 4.9812337164116325e-05, "loss": 3.0175, "step": 808 }, { "epoch": 0.13530125015679223, "grad_norm": 3.225558042526245, "learning_rate": 4.981054790975551e-05, "loss": 3.0242, "step": 809 }, { "epoch": 0.13546849521261028, "grad_norm": 5.441693305969238, "learning_rate": 4.9808750198515934e-05, "loss": 3.3786, "step": 810 }, { "epoch": 0.1356357402684283, "grad_norm": 3.134338855743408, "learning_rate": 4.980694403101037e-05, "loss": 2.728, "step": 811 }, { "epoch": 0.13580298532424634, "grad_norm": 3.658837080001831, "learning_rate": 4.980512940785447e-05, "loss": 3.0895, "step": 812 }, { "epoch": 0.1359702303800644, "grad_norm": 2.0654354095458984, "learning_rate": 4.9803306329666756e-05, "loss": 2.7859, "step": 813 }, { "epoch": 0.13613747543588242, "grad_norm": 4.558130264282227, "learning_rate": 4.980147479706864e-05, "loss": 3.3265, "step": 814 }, { "epoch": 0.13630472049170048, "grad_norm": 6.055203914642334, "learning_rate": 4.97996348106844e-05, "loss": 3.6012, "step": 815 }, { "epoch": 0.1364719655475185, "grad_norm": 4.029632091522217, "learning_rate": 4.979778637114124e-05, "loss": 3.5147, "step": 816 }, { "epoch": 0.13663921060333653, "grad_norm": 3.0540897846221924, "learning_rate": 4.97959294790692e-05, "loss": 2.9341, "step": 817 }, { "epoch": 0.13680645565915459, "grad_norm": 3.4658312797546387, "learning_rate": 4.979406413510121e-05, "loss": 2.6603, "step": 818 }, { "epoch": 0.1369737007149726, "grad_norm": 3.4460811614990234, "learning_rate": 4.97921903398731e-05, "loss": 2.9152, "step": 819 }, { "epoch": 0.13714094577079064, "grad_norm": 3.8794186115264893, "learning_rate": 4.979030809402357e-05, "loss": 2.7716, "step": 820 }, { "epoch": 0.1373081908266087, "grad_norm": 2.992248773574829, "learning_rate": 4.9788417398194186e-05, "loss": 2.8267, "step": 821 }, { "epoch": 0.13747543588242672, "grad_norm": 3.708667039871216, "learning_rate": 4.978651825302941e-05, "loss": 2.7202, "step": 822 }, { "epoch": 0.13764268093824478, "grad_norm": 2.825927495956421, "learning_rate": 4.9784610659176596e-05, "loss": 2.7506, "step": 823 }, { "epoch": 0.1378099259940628, "grad_norm": 2.545137882232666, "learning_rate": 4.9782694617285944e-05, "loss": 2.571, "step": 824 }, { "epoch": 0.13797717104988083, "grad_norm": 2.5376436710357666, "learning_rate": 4.9780770128010565e-05, "loss": 3.0534, "step": 825 }, { "epoch": 0.1381444161056989, "grad_norm": 2.673774242401123, "learning_rate": 4.977883719200643e-05, "loss": 2.969, "step": 826 }, { "epoch": 0.1383116611615169, "grad_norm": 3.8221962451934814, "learning_rate": 4.9776895809932403e-05, "loss": 3.0761, "step": 827 }, { "epoch": 0.13847890621733494, "grad_norm": 2.3327362537384033, "learning_rate": 4.977494598245022e-05, "loss": 2.8256, "step": 828 }, { "epoch": 0.138646151273153, "grad_norm": 3.7796435356140137, "learning_rate": 4.977298771022448e-05, "loss": 2.9125, "step": 829 }, { "epoch": 0.13881339632897102, "grad_norm": 2.831695795059204, "learning_rate": 4.977102099392269e-05, "loss": 2.8719, "step": 830 }, { "epoch": 0.13898064138478905, "grad_norm": 4.443613529205322, "learning_rate": 4.9769045834215225e-05, "loss": 3.3423, "step": 831 }, { "epoch": 0.1391478864406071, "grad_norm": 2.1040289402008057, "learning_rate": 4.976706223177532e-05, "loss": 2.9978, "step": 832 }, { "epoch": 0.13931513149642513, "grad_norm": 3.944457769393921, "learning_rate": 4.9765070187279114e-05, "loss": 2.9562, "step": 833 }, { "epoch": 0.1394823765522432, "grad_norm": 3.338014841079712, "learning_rate": 4.9763069701405615e-05, "loss": 3.0351, "step": 834 }, { "epoch": 0.13964962160806121, "grad_norm": 3.786069631576538, "learning_rate": 4.97610607748367e-05, "loss": 2.7373, "step": 835 }, { "epoch": 0.13981686666387924, "grad_norm": 4.269554615020752, "learning_rate": 4.9759043408257114e-05, "loss": 2.9208, "step": 836 }, { "epoch": 0.1399841117196973, "grad_norm": 4.203729152679443, "learning_rate": 4.9757017602354515e-05, "loss": 3.0693, "step": 837 }, { "epoch": 0.14015135677551532, "grad_norm": 3.1127235889434814, "learning_rate": 4.9754983357819406e-05, "loss": 3.0156, "step": 838 }, { "epoch": 0.14031860183133335, "grad_norm": 1.4842658042907715, "learning_rate": 4.9752940675345175e-05, "loss": 2.4036, "step": 839 }, { "epoch": 0.1404858468871514, "grad_norm": 8.155736923217773, "learning_rate": 4.975088955562808e-05, "loss": 2.7715, "step": 840 }, { "epoch": 0.14065309194296943, "grad_norm": 2.456475257873535, "learning_rate": 4.9748829999367276e-05, "loss": 2.5367, "step": 841 }, { "epoch": 0.14082033699878746, "grad_norm": 4.290907859802246, "learning_rate": 4.974676200726478e-05, "loss": 3.1316, "step": 842 }, { "epoch": 0.14098758205460551, "grad_norm": 6.003057479858398, "learning_rate": 4.9744685580025466e-05, "loss": 3.142, "step": 843 }, { "epoch": 0.14115482711042354, "grad_norm": 5.68665885925293, "learning_rate": 4.974260071835712e-05, "loss": 3.3727, "step": 844 }, { "epoch": 0.1413220721662416, "grad_norm": 5.634039402008057, "learning_rate": 4.974050742297037e-05, "loss": 3.055, "step": 845 }, { "epoch": 0.14148931722205962, "grad_norm": 3.1904900074005127, "learning_rate": 4.973840569457875e-05, "loss": 2.7944, "step": 846 }, { "epoch": 0.14165656227787765, "grad_norm": 2.8836147785186768, "learning_rate": 4.973629553389864e-05, "loss": 2.742, "step": 847 }, { "epoch": 0.1418238073336957, "grad_norm": 3.4890224933624268, "learning_rate": 4.97341769416493e-05, "loss": 2.8146, "step": 848 }, { "epoch": 0.14199105238951373, "grad_norm": 5.275979518890381, "learning_rate": 4.9732049918552884e-05, "loss": 2.8207, "step": 849 }, { "epoch": 0.14215829744533176, "grad_norm": 2.2769558429718018, "learning_rate": 4.97299144653344e-05, "loss": 2.749, "step": 850 }, { "epoch": 0.14232554250114982, "grad_norm": 3.486044406890869, "learning_rate": 4.972777058272172e-05, "loss": 2.7548, "step": 851 }, { "epoch": 0.14249278755696784, "grad_norm": 4.818684101104736, "learning_rate": 4.9725618271445626e-05, "loss": 2.6371, "step": 852 }, { "epoch": 0.1426600326127859, "grad_norm": 4.143411636352539, "learning_rate": 4.972345753223974e-05, "loss": 2.9174, "step": 853 }, { "epoch": 0.14282727766860392, "grad_norm": 3.5977275371551514, "learning_rate": 4.9721288365840565e-05, "loss": 2.8536, "step": 854 }, { "epoch": 0.14299452272442195, "grad_norm": 2.6780688762664795, "learning_rate": 4.97191107729875e-05, "loss": 2.741, "step": 855 }, { "epoch": 0.14316176778024, "grad_norm": 5.183584213256836, "learning_rate": 4.971692475442276e-05, "loss": 2.9902, "step": 856 }, { "epoch": 0.14332901283605803, "grad_norm": 2.960965633392334, "learning_rate": 4.971473031089149e-05, "loss": 3.2021, "step": 857 }, { "epoch": 0.14349625789187606, "grad_norm": 3.111020565032959, "learning_rate": 4.971252744314169e-05, "loss": 2.8987, "step": 858 }, { "epoch": 0.14366350294769412, "grad_norm": 3.7345688343048096, "learning_rate": 4.97103161519242e-05, "loss": 3.1791, "step": 859 }, { "epoch": 0.14383074800351214, "grad_norm": 3.4041454792022705, "learning_rate": 4.9708096437992783e-05, "loss": 2.8909, "step": 860 }, { "epoch": 0.14399799305933017, "grad_norm": 3.4879870414733887, "learning_rate": 4.970586830210404e-05, "loss": 3.0678, "step": 861 }, { "epoch": 0.14416523811514823, "grad_norm": 2.0621705055236816, "learning_rate": 4.970363174501743e-05, "loss": 2.6914, "step": 862 }, { "epoch": 0.14433248317096625, "grad_norm": 2.171360492706299, "learning_rate": 4.970138676749533e-05, "loss": 2.7205, "step": 863 }, { "epoch": 0.1444997282267843, "grad_norm": 3.8659486770629883, "learning_rate": 4.969913337030294e-05, "loss": 2.9051, "step": 864 }, { "epoch": 0.14466697328260233, "grad_norm": 3.456695318222046, "learning_rate": 4.969687155420836e-05, "loss": 2.8212, "step": 865 }, { "epoch": 0.14483421833842036, "grad_norm": 3.0224263668060303, "learning_rate": 4.9694601319982545e-05, "loss": 2.6556, "step": 866 }, { "epoch": 0.14500146339423842, "grad_norm": 2.7337958812713623, "learning_rate": 4.9692322668399315e-05, "loss": 3.0903, "step": 867 }, { "epoch": 0.14516870845005644, "grad_norm": 2.9751241207122803, "learning_rate": 4.9690035600235386e-05, "loss": 3.2096, "step": 868 }, { "epoch": 0.14533595350587447, "grad_norm": 2.591764450073242, "learning_rate": 4.96877401162703e-05, "loss": 2.7554, "step": 869 }, { "epoch": 0.14550319856169253, "grad_norm": 3.164517641067505, "learning_rate": 4.9685436217286505e-05, "loss": 2.7966, "step": 870 }, { "epoch": 0.14567044361751055, "grad_norm": 2.51175594329834, "learning_rate": 4.968312390406931e-05, "loss": 2.8682, "step": 871 }, { "epoch": 0.1458376886733286, "grad_norm": 2.7003180980682373, "learning_rate": 4.968080317740687e-05, "loss": 2.6262, "step": 872 }, { "epoch": 0.14600493372914664, "grad_norm": 2.7648818492889404, "learning_rate": 4.9678474038090236e-05, "loss": 3.0528, "step": 873 }, { "epoch": 0.14617217878496466, "grad_norm": 9.203798294067383, "learning_rate": 4.967613648691331e-05, "loss": 3.3997, "step": 874 }, { "epoch": 0.14633942384078272, "grad_norm": 2.911978244781494, "learning_rate": 4.967379052467286e-05, "loss": 2.6501, "step": 875 }, { "epoch": 0.14650666889660074, "grad_norm": 4.370360374450684, "learning_rate": 4.967143615216854e-05, "loss": 3.0269, "step": 876 }, { "epoch": 0.14667391395241877, "grad_norm": 3.071187973022461, "learning_rate": 4.9669073370202846e-05, "loss": 2.7014, "step": 877 }, { "epoch": 0.14684115900823683, "grad_norm": 8.13399600982666, "learning_rate": 4.966670217958115e-05, "loss": 3.0949, "step": 878 }, { "epoch": 0.14700840406405485, "grad_norm": 10.020334243774414, "learning_rate": 4.96643225811117e-05, "loss": 3.2123, "step": 879 }, { "epoch": 0.14717564911987288, "grad_norm": 3.324976682662964, "learning_rate": 4.966193457560561e-05, "loss": 2.5194, "step": 880 }, { "epoch": 0.14734289417569094, "grad_norm": 5.76493501663208, "learning_rate": 4.9659538163876825e-05, "loss": 2.7937, "step": 881 }, { "epoch": 0.14751013923150896, "grad_norm": 2.5634613037109375, "learning_rate": 4.96571333467422e-05, "loss": 2.8246, "step": 882 }, { "epoch": 0.14767738428732702, "grad_norm": 1.9553459882736206, "learning_rate": 4.965472012502144e-05, "loss": 2.6732, "step": 883 }, { "epoch": 0.14784462934314505, "grad_norm": 2.3458220958709717, "learning_rate": 4.965229849953709e-05, "loss": 2.7116, "step": 884 }, { "epoch": 0.14801187439896307, "grad_norm": 2.0655570030212402, "learning_rate": 4.96498684711146e-05, "loss": 2.7435, "step": 885 }, { "epoch": 0.14817911945478113, "grad_norm": 6.013841152191162, "learning_rate": 4.964743004058227e-05, "loss": 2.7609, "step": 886 }, { "epoch": 0.14834636451059915, "grad_norm": 2.8914473056793213, "learning_rate": 4.964498320877124e-05, "loss": 2.9415, "step": 887 }, { "epoch": 0.14851360956641718, "grad_norm": 3.8962459564208984, "learning_rate": 4.964252797651554e-05, "loss": 3.5139, "step": 888 }, { "epoch": 0.14868085462223524, "grad_norm": 3.8408780097961426, "learning_rate": 4.964006434465207e-05, "loss": 3.1223, "step": 889 }, { "epoch": 0.14884809967805326, "grad_norm": 2.800198554992676, "learning_rate": 4.963759231402055e-05, "loss": 2.6089, "step": 890 }, { "epoch": 0.1490153447338713, "grad_norm": 5.9142842292785645, "learning_rate": 4.9635111885463623e-05, "loss": 2.5168, "step": 891 }, { "epoch": 0.14918258978968935, "grad_norm": 2.444817304611206, "learning_rate": 4.963262305982674e-05, "loss": 2.6707, "step": 892 }, { "epoch": 0.14934983484550737, "grad_norm": 5.5943427085876465, "learning_rate": 4.963012583795826e-05, "loss": 3.4165, "step": 893 }, { "epoch": 0.14951707990132543, "grad_norm": 3.0294277667999268, "learning_rate": 4.9627620220709356e-05, "loss": 2.8209, "step": 894 }, { "epoch": 0.14968432495714346, "grad_norm": 3.4258038997650146, "learning_rate": 4.962510620893411e-05, "loss": 2.5412, "step": 895 }, { "epoch": 0.14985157001296148, "grad_norm": 2.23157000541687, "learning_rate": 4.962258380348943e-05, "loss": 2.7811, "step": 896 }, { "epoch": 0.15001881506877954, "grad_norm": 3.4030351638793945, "learning_rate": 4.9620053005235105e-05, "loss": 2.6682, "step": 897 }, { "epoch": 0.15018606012459756, "grad_norm": 5.742190361022949, "learning_rate": 4.961751381503378e-05, "loss": 3.2503, "step": 898 }, { "epoch": 0.1503533051804156, "grad_norm": 2.471904993057251, "learning_rate": 4.9614966233750945e-05, "loss": 3.0621, "step": 899 }, { "epoch": 0.15052055023623365, "grad_norm": 4.1435770988464355, "learning_rate": 4.961241026225498e-05, "loss": 3.0713, "step": 900 }, { "epoch": 0.15068779529205167, "grad_norm": 4.222225666046143, "learning_rate": 4.960984590141711e-05, "loss": 2.7072, "step": 901 }, { "epoch": 0.15085504034786973, "grad_norm": 3.532573699951172, "learning_rate": 4.96072731521114e-05, "loss": 3.1064, "step": 902 }, { "epoch": 0.15102228540368776, "grad_norm": 4.191957473754883, "learning_rate": 4.960469201521481e-05, "loss": 3.1792, "step": 903 }, { "epoch": 0.15118953045950578, "grad_norm": 3.8456504344940186, "learning_rate": 4.960210249160713e-05, "loss": 3.0046, "step": 904 }, { "epoch": 0.15135677551532384, "grad_norm": 3.444807291030884, "learning_rate": 4.959950458217103e-05, "loss": 2.8901, "step": 905 }, { "epoch": 0.15152402057114187, "grad_norm": 5.175858497619629, "learning_rate": 4.959689828779203e-05, "loss": 3.108, "step": 906 }, { "epoch": 0.1516912656269599, "grad_norm": 3.8828248977661133, "learning_rate": 4.959428360935849e-05, "loss": 2.6533, "step": 907 }, { "epoch": 0.15185851068277795, "grad_norm": 5.5801777839660645, "learning_rate": 4.9591660547761664e-05, "loss": 3.2319, "step": 908 }, { "epoch": 0.15202575573859597, "grad_norm": 4.659512996673584, "learning_rate": 4.958902910389563e-05, "loss": 2.9891, "step": 909 }, { "epoch": 0.152193000794414, "grad_norm": 2.4521780014038086, "learning_rate": 4.958638927865735e-05, "loss": 3.078, "step": 910 }, { "epoch": 0.15236024585023206, "grad_norm": 3.572190284729004, "learning_rate": 4.958374107294663e-05, "loss": 3.1924, "step": 911 }, { "epoch": 0.15252749090605008, "grad_norm": 3.0209271907806396, "learning_rate": 4.9581084487666116e-05, "loss": 2.7009, "step": 912 }, { "epoch": 0.15269473596186814, "grad_norm": 2.7508115768432617, "learning_rate": 4.9578419523721344e-05, "loss": 2.6044, "step": 913 }, { "epoch": 0.15286198101768617, "grad_norm": 8.779255867004395, "learning_rate": 4.957574618202068e-05, "loss": 3.5642, "step": 914 }, { "epoch": 0.1530292260735042, "grad_norm": 3.7045867443084717, "learning_rate": 4.957306446347535e-05, "loss": 2.7654, "step": 915 }, { "epoch": 0.15319647112932225, "grad_norm": 3.3532588481903076, "learning_rate": 4.957037436899945e-05, "loss": 3.0435, "step": 916 }, { "epoch": 0.15336371618514028, "grad_norm": 3.2599470615386963, "learning_rate": 4.956767589950992e-05, "loss": 3.1355, "step": 917 }, { "epoch": 0.1535309612409583, "grad_norm": 2.9946463108062744, "learning_rate": 4.956496905592655e-05, "loss": 3.22, "step": 918 }, { "epoch": 0.15369820629677636, "grad_norm": 3.7455027103424072, "learning_rate": 4.956225383917199e-05, "loss": 2.7297, "step": 919 }, { "epoch": 0.15386545135259438, "grad_norm": 5.289287567138672, "learning_rate": 4.955953025017175e-05, "loss": 3.4865, "step": 920 }, { "epoch": 0.15403269640841244, "grad_norm": 3.455836296081543, "learning_rate": 4.955679828985418e-05, "loss": 3.1897, "step": 921 }, { "epoch": 0.15419994146423047, "grad_norm": 2.5963988304138184, "learning_rate": 4.95540579591505e-05, "loss": 3.0342, "step": 922 }, { "epoch": 0.1543671865200485, "grad_norm": 2.6490979194641113, "learning_rate": 4.955130925899477e-05, "loss": 3.0431, "step": 923 }, { "epoch": 0.15453443157586655, "grad_norm": 4.684563636779785, "learning_rate": 4.95485521903239e-05, "loss": 3.6322, "step": 924 }, { "epoch": 0.15470167663168458, "grad_norm": 5.027052879333496, "learning_rate": 4.954578675407767e-05, "loss": 3.1983, "step": 925 }, { "epoch": 0.1548689216875026, "grad_norm": 3.6460015773773193, "learning_rate": 4.95430129511987e-05, "loss": 2.7074, "step": 926 }, { "epoch": 0.15503616674332066, "grad_norm": 3.217562198638916, "learning_rate": 4.954023078263245e-05, "loss": 2.7309, "step": 927 }, { "epoch": 0.15520341179913869, "grad_norm": 4.728087425231934, "learning_rate": 4.953744024932726e-05, "loss": 2.98, "step": 928 }, { "epoch": 0.1553706568549567, "grad_norm": 4.36997127532959, "learning_rate": 4.9534641352234305e-05, "loss": 3.0621, "step": 929 }, { "epoch": 0.15553790191077477, "grad_norm": 3.59924578666687, "learning_rate": 4.953183409230761e-05, "loss": 2.9205, "step": 930 }, { "epoch": 0.1557051469665928, "grad_norm": 1.8278321027755737, "learning_rate": 4.952901847050405e-05, "loss": 2.4468, "step": 931 }, { "epoch": 0.15587239202241085, "grad_norm": 5.605057716369629, "learning_rate": 4.952619448778335e-05, "loss": 3.2014, "step": 932 }, { "epoch": 0.15603963707822888, "grad_norm": 6.013678073883057, "learning_rate": 4.95233621451081e-05, "loss": 2.8853, "step": 933 }, { "epoch": 0.1562068821340469, "grad_norm": 2.1723716259002686, "learning_rate": 4.9520521443443715e-05, "loss": 2.6527, "step": 934 }, { "epoch": 0.15637412718986496, "grad_norm": 2.651848793029785, "learning_rate": 4.9517672383758474e-05, "loss": 3.1687, "step": 935 }, { "epoch": 0.15654137224568299, "grad_norm": 2.1524884700775146, "learning_rate": 4.9514814967023506e-05, "loss": 2.9832, "step": 936 }, { "epoch": 0.156708617301501, "grad_norm": 4.900365829467773, "learning_rate": 4.951194919421278e-05, "loss": 3.8096, "step": 937 }, { "epoch": 0.15687586235731907, "grad_norm": 2.9858908653259277, "learning_rate": 4.9509075066303125e-05, "loss": 2.9513, "step": 938 }, { "epoch": 0.1570431074131371, "grad_norm": 3.5250532627105713, "learning_rate": 4.95061925842742e-05, "loss": 2.9288, "step": 939 }, { "epoch": 0.15721035246895515, "grad_norm": 3.258364200592041, "learning_rate": 4.950330174910854e-05, "loss": 2.5745, "step": 940 }, { "epoch": 0.15737759752477318, "grad_norm": 2.8043529987335205, "learning_rate": 4.9500402561791495e-05, "loss": 2.8623, "step": 941 }, { "epoch": 0.1575448425805912, "grad_norm": 3.0752501487731934, "learning_rate": 4.949749502331128e-05, "loss": 2.9742, "step": 942 }, { "epoch": 0.15771208763640926, "grad_norm": 4.8233819007873535, "learning_rate": 4.9494579134658955e-05, "loss": 2.9643, "step": 943 }, { "epoch": 0.1578793326922273, "grad_norm": 3.4322831630706787, "learning_rate": 4.949165489682842e-05, "loss": 2.5547, "step": 944 }, { "epoch": 0.1580465777480453, "grad_norm": 3.4118566513061523, "learning_rate": 4.9488722310816436e-05, "loss": 2.8478, "step": 945 }, { "epoch": 0.15821382280386337, "grad_norm": 1.407153844833374, "learning_rate": 4.9485781377622597e-05, "loss": 2.3836, "step": 946 }, { "epoch": 0.1583810678596814, "grad_norm": 3.8746774196624756, "learning_rate": 4.948283209824933e-05, "loss": 3.2043, "step": 947 }, { "epoch": 0.15854831291549942, "grad_norm": 3.3952040672302246, "learning_rate": 4.947987447370194e-05, "loss": 2.9817, "step": 948 }, { "epoch": 0.15871555797131748, "grad_norm": 3.0228207111358643, "learning_rate": 4.947690850498854e-05, "loss": 2.947, "step": 949 }, { "epoch": 0.1588828030271355, "grad_norm": 4.366982936859131, "learning_rate": 4.947393419312012e-05, "loss": 3.2105, "step": 950 }, { "epoch": 0.15905004808295356, "grad_norm": 10.299752235412598, "learning_rate": 4.947095153911049e-05, "loss": 3.6343, "step": 951 }, { "epoch": 0.1592172931387716, "grad_norm": 4.23699951171875, "learning_rate": 4.946796054397631e-05, "loss": 3.15, "step": 952 }, { "epoch": 0.15938453819458961, "grad_norm": 5.11250114440918, "learning_rate": 4.9464961208737095e-05, "loss": 2.9689, "step": 953 }, { "epoch": 0.15955178325040767, "grad_norm": 4.18213415145874, "learning_rate": 4.9461953534415184e-05, "loss": 2.7004, "step": 954 }, { "epoch": 0.1597190283062257, "grad_norm": 4.491884708404541, "learning_rate": 4.945893752203577e-05, "loss": 3.1821, "step": 955 }, { "epoch": 0.15988627336204372, "grad_norm": 3.32365345954895, "learning_rate": 4.945591317262689e-05, "loss": 3.1757, "step": 956 }, { "epoch": 0.16005351841786178, "grad_norm": 2.257373809814453, "learning_rate": 4.945288048721941e-05, "loss": 2.4473, "step": 957 }, { "epoch": 0.1602207634736798, "grad_norm": 5.480620861053467, "learning_rate": 4.9449839466847056e-05, "loss": 3.0933, "step": 958 }, { "epoch": 0.16038800852949783, "grad_norm": 3.2782599925994873, "learning_rate": 4.9446790112546374e-05, "loss": 3.206, "step": 959 }, { "epoch": 0.1605552535853159, "grad_norm": 3.5825448036193848, "learning_rate": 4.944373242535677e-05, "loss": 2.8274, "step": 960 }, { "epoch": 0.16072249864113392, "grad_norm": 6.771640777587891, "learning_rate": 4.944066640632048e-05, "loss": 3.2753, "step": 961 }, { "epoch": 0.16088974369695197, "grad_norm": 4.164596080780029, "learning_rate": 4.943759205648258e-05, "loss": 3.4139, "step": 962 }, { "epoch": 0.16105698875277, "grad_norm": 4.680684566497803, "learning_rate": 4.943450937689098e-05, "loss": 2.799, "step": 963 }, { "epoch": 0.16122423380858802, "grad_norm": 3.8590641021728516, "learning_rate": 4.943141836859645e-05, "loss": 2.9151, "step": 964 }, { "epoch": 0.16139147886440608, "grad_norm": 3.3534302711486816, "learning_rate": 4.942831903265257e-05, "loss": 2.9868, "step": 965 }, { "epoch": 0.1615587239202241, "grad_norm": 4.827520370483398, "learning_rate": 4.9425211370115796e-05, "loss": 2.961, "step": 966 }, { "epoch": 0.16172596897604213, "grad_norm": 6.6521992683410645, "learning_rate": 4.9422095382045383e-05, "loss": 3.1902, "step": 967 }, { "epoch": 0.1618932140318602, "grad_norm": 4.556955814361572, "learning_rate": 4.941897106950345e-05, "loss": 3.0861, "step": 968 }, { "epoch": 0.16206045908767822, "grad_norm": 4.333456993103027, "learning_rate": 4.941583843355493e-05, "loss": 3.3742, "step": 969 }, { "epoch": 0.16222770414349627, "grad_norm": 3.0652060508728027, "learning_rate": 4.941269747526762e-05, "loss": 2.984, "step": 970 }, { "epoch": 0.1623949491993143, "grad_norm": 2.0189380645751953, "learning_rate": 4.940954819571215e-05, "loss": 2.7941, "step": 971 }, { "epoch": 0.16256219425513233, "grad_norm": 3.5030956268310547, "learning_rate": 4.940639059596196e-05, "loss": 2.933, "step": 972 }, { "epoch": 0.16272943931095038, "grad_norm": 6.162909030914307, "learning_rate": 4.940322467709335e-05, "loss": 2.6566, "step": 973 }, { "epoch": 0.1628966843667684, "grad_norm": 3.294153928756714, "learning_rate": 4.940005044018545e-05, "loss": 3.3287, "step": 974 }, { "epoch": 0.16306392942258643, "grad_norm": 5.818653583526611, "learning_rate": 4.939686788632023e-05, "loss": 3.1454, "step": 975 }, { "epoch": 0.1632311744784045, "grad_norm": 2.061278820037842, "learning_rate": 4.9393677016582484e-05, "loss": 2.4444, "step": 976 }, { "epoch": 0.16339841953422252, "grad_norm": 4.6899542808532715, "learning_rate": 4.939047783205984e-05, "loss": 3.108, "step": 977 }, { "epoch": 0.16356566459004054, "grad_norm": 3.2128684520721436, "learning_rate": 4.938727033384278e-05, "loss": 2.8494, "step": 978 }, { "epoch": 0.1637329096458586, "grad_norm": 3.7413744926452637, "learning_rate": 4.9384054523024604e-05, "loss": 3.3185, "step": 979 }, { "epoch": 0.16390015470167663, "grad_norm": 2.1858129501342773, "learning_rate": 4.938083040070144e-05, "loss": 2.7503, "step": 980 }, { "epoch": 0.16406739975749468, "grad_norm": 3.656841278076172, "learning_rate": 4.937759796797225e-05, "loss": 2.1052, "step": 981 }, { "epoch": 0.1642346448133127, "grad_norm": 2.39621639251709, "learning_rate": 4.937435722593885e-05, "loss": 2.6412, "step": 982 }, { "epoch": 0.16440188986913074, "grad_norm": 3.0662436485290527, "learning_rate": 4.937110817570587e-05, "loss": 2.7987, "step": 983 }, { "epoch": 0.1645691349249488, "grad_norm": 3.927821397781372, "learning_rate": 4.9367850818380766e-05, "loss": 3.2476, "step": 984 }, { "epoch": 0.16473637998076682, "grad_norm": 2.9279229640960693, "learning_rate": 4.936458515507385e-05, "loss": 2.7904, "step": 985 }, { "epoch": 0.16490362503658484, "grad_norm": 3.0982666015625, "learning_rate": 4.936131118689824e-05, "loss": 2.7476, "step": 986 }, { "epoch": 0.1650708700924029, "grad_norm": 3.5176901817321777, "learning_rate": 4.93580289149699e-05, "loss": 2.6607, "step": 987 }, { "epoch": 0.16523811514822093, "grad_norm": 3.385801315307617, "learning_rate": 4.9354738340407614e-05, "loss": 2.7649, "step": 988 }, { "epoch": 0.16540536020403898, "grad_norm": 4.66230583190918, "learning_rate": 4.9351439464333e-05, "loss": 2.9874, "step": 989 }, { "epoch": 0.165572605259857, "grad_norm": 3.502385377883911, "learning_rate": 4.934813228787052e-05, "loss": 3.0081, "step": 990 }, { "epoch": 0.16573985031567504, "grad_norm": 5.518198013305664, "learning_rate": 4.934481681214744e-05, "loss": 2.355, "step": 991 }, { "epoch": 0.1659070953714931, "grad_norm": 2.603529930114746, "learning_rate": 4.9341493038293874e-05, "loss": 2.6285, "step": 992 }, { "epoch": 0.16607434042731112, "grad_norm": 3.056131601333618, "learning_rate": 4.9338160967442745e-05, "loss": 3.0645, "step": 993 }, { "epoch": 0.16624158548312915, "grad_norm": 2.2692205905914307, "learning_rate": 4.933482060072983e-05, "loss": 2.7363, "step": 994 }, { "epoch": 0.1664088305389472, "grad_norm": 3.5117950439453125, "learning_rate": 4.9331471939293715e-05, "loss": 3.1581, "step": 995 }, { "epoch": 0.16657607559476523, "grad_norm": 2.9542856216430664, "learning_rate": 4.9328114984275817e-05, "loss": 2.8222, "step": 996 }, { "epoch": 0.16674332065058325, "grad_norm": 5.708415985107422, "learning_rate": 4.9324749736820394e-05, "loss": 2.3886, "step": 997 }, { "epoch": 0.1669105657064013, "grad_norm": 3.498958110809326, "learning_rate": 4.93213761980745e-05, "loss": 2.6484, "step": 998 }, { "epoch": 0.16707781076221934, "grad_norm": 3.382742404937744, "learning_rate": 4.9317994369188046e-05, "loss": 2.7628, "step": 999 }, { "epoch": 0.1672450558180374, "grad_norm": 6.719852924346924, "learning_rate": 4.931460425131375e-05, "loss": 3.1871, "step": 1000 }, { "epoch": 0.16741230087385542, "grad_norm": 2.480229616165161, "learning_rate": 4.9311205845607164e-05, "loss": 3.1832, "step": 1001 }, { "epoch": 0.16757954592967345, "grad_norm": 3.1949431896209717, "learning_rate": 4.930779915322666e-05, "loss": 2.8456, "step": 1002 }, { "epoch": 0.1677467909854915, "grad_norm": 4.023664474487305, "learning_rate": 4.930438417533344e-05, "loss": 3.2673, "step": 1003 }, { "epoch": 0.16791403604130953, "grad_norm": 3.9439337253570557, "learning_rate": 4.930096091309153e-05, "loss": 3.2512, "step": 1004 }, { "epoch": 0.16808128109712756, "grad_norm": 5.491034984588623, "learning_rate": 4.929752936766777e-05, "loss": 3.2542, "step": 1005 }, { "epoch": 0.1682485261529456, "grad_norm": 7.270681858062744, "learning_rate": 4.929408954023183e-05, "loss": 2.8822, "step": 1006 }, { "epoch": 0.16841577120876364, "grad_norm": 5.916931629180908, "learning_rate": 4.9290641431956206e-05, "loss": 2.7474, "step": 1007 }, { "epoch": 0.1685830162645817, "grad_norm": 3.1261637210845947, "learning_rate": 4.928718504401623e-05, "loss": 2.775, "step": 1008 }, { "epoch": 0.16875026132039972, "grad_norm": 7.491856098175049, "learning_rate": 4.928372037759001e-05, "loss": 2.7794, "step": 1009 }, { "epoch": 0.16891750637621775, "grad_norm": 4.35812520980835, "learning_rate": 4.928024743385852e-05, "loss": 3.0051, "step": 1010 }, { "epoch": 0.1690847514320358, "grad_norm": 3.1814475059509277, "learning_rate": 4.927676621400555e-05, "loss": 2.6153, "step": 1011 }, { "epoch": 0.16925199648785383, "grad_norm": 4.139153957366943, "learning_rate": 4.92732767192177e-05, "loss": 3.2207, "step": 1012 }, { "epoch": 0.16941924154367186, "grad_norm": 5.09887170791626, "learning_rate": 4.9269778950684376e-05, "loss": 3.076, "step": 1013 }, { "epoch": 0.1695864865994899, "grad_norm": 3.278463363647461, "learning_rate": 4.926627290959784e-05, "loss": 2.5699, "step": 1014 }, { "epoch": 0.16975373165530794, "grad_norm": 4.956247806549072, "learning_rate": 4.926275859715315e-05, "loss": 3.0611, "step": 1015 }, { "epoch": 0.16992097671112597, "grad_norm": 5.205733299255371, "learning_rate": 4.925923601454817e-05, "loss": 2.9025, "step": 1016 }, { "epoch": 0.17008822176694402, "grad_norm": 3.2704849243164062, "learning_rate": 4.925570516298364e-05, "loss": 3.1511, "step": 1017 }, { "epoch": 0.17025546682276205, "grad_norm": 2.2342774868011475, "learning_rate": 4.925216604366304e-05, "loss": 2.7276, "step": 1018 }, { "epoch": 0.1704227118785801, "grad_norm": 3.7788705825805664, "learning_rate": 4.924861865779273e-05, "loss": 2.7753, "step": 1019 }, { "epoch": 0.17058995693439813, "grad_norm": 2.565629243850708, "learning_rate": 4.924506300658186e-05, "loss": 2.9586, "step": 1020 }, { "epoch": 0.17075720199021616, "grad_norm": 3.6045007705688477, "learning_rate": 4.9241499091242406e-05, "loss": 2.9869, "step": 1021 }, { "epoch": 0.1709244470460342, "grad_norm": 5.28696870803833, "learning_rate": 4.923792691298915e-05, "loss": 3.3188, "step": 1022 }, { "epoch": 0.17109169210185224, "grad_norm": 3.423274517059326, "learning_rate": 4.923434647303971e-05, "loss": 2.8713, "step": 1023 }, { "epoch": 0.17125893715767027, "grad_norm": 4.569013595581055, "learning_rate": 4.923075777261449e-05, "loss": 3.0308, "step": 1024 }, { "epoch": 0.17142618221348832, "grad_norm": 4.2942938804626465, "learning_rate": 4.9227160812936746e-05, "loss": 3.06, "step": 1025 }, { "epoch": 0.17159342726930635, "grad_norm": 2.914494752883911, "learning_rate": 4.9223555595232527e-05, "loss": 2.8556, "step": 1026 }, { "epoch": 0.17176067232512437, "grad_norm": 2.73791766166687, "learning_rate": 4.92199421207307e-05, "loss": 2.8399, "step": 1027 }, { "epoch": 0.17192791738094243, "grad_norm": 4.010573863983154, "learning_rate": 4.921632039066294e-05, "loss": 3.0377, "step": 1028 }, { "epoch": 0.17209516243676046, "grad_norm": 3.0981528759002686, "learning_rate": 4.921269040626375e-05, "loss": 3.2683, "step": 1029 }, { "epoch": 0.1722624074925785, "grad_norm": 6.571484088897705, "learning_rate": 4.920905216877044e-05, "loss": 3.4247, "step": 1030 }, { "epoch": 0.17242965254839654, "grad_norm": 1.9649081230163574, "learning_rate": 4.920540567942314e-05, "loss": 2.7156, "step": 1031 }, { "epoch": 0.17259689760421457, "grad_norm": 2.497293472290039, "learning_rate": 4.920175093946478e-05, "loss": 3.1083, "step": 1032 }, { "epoch": 0.17276414266003262, "grad_norm": 4.21910285949707, "learning_rate": 4.919808795014109e-05, "loss": 2.9289, "step": 1033 }, { "epoch": 0.17293138771585065, "grad_norm": 3.2430317401885986, "learning_rate": 4.919441671270066e-05, "loss": 2.8989, "step": 1034 }, { "epoch": 0.17309863277166868, "grad_norm": 1.7863857746124268, "learning_rate": 4.919073722839484e-05, "loss": 2.849, "step": 1035 }, { "epoch": 0.17326587782748673, "grad_norm": 3.0976691246032715, "learning_rate": 4.918704949847782e-05, "loss": 2.7566, "step": 1036 }, { "epoch": 0.17343312288330476, "grad_norm": 1.145336389541626, "learning_rate": 4.91833535242066e-05, "loss": 2.3209, "step": 1037 }, { "epoch": 0.1736003679391228, "grad_norm": 2.9120218753814697, "learning_rate": 4.917964930684097e-05, "loss": 3.129, "step": 1038 }, { "epoch": 0.17376761299494084, "grad_norm": 3.6691486835479736, "learning_rate": 4.917593684764355e-05, "loss": 3.201, "step": 1039 }, { "epoch": 0.17393485805075887, "grad_norm": 4.0041937828063965, "learning_rate": 4.917221614787977e-05, "loss": 2.9315, "step": 1040 }, { "epoch": 0.17410210310657692, "grad_norm": 4.151515960693359, "learning_rate": 4.9168487208817846e-05, "loss": 3.0604, "step": 1041 }, { "epoch": 0.17426934816239495, "grad_norm": 2.5350382328033447, "learning_rate": 4.916475003172882e-05, "loss": 3.0099, "step": 1042 }, { "epoch": 0.17443659321821298, "grad_norm": 3.4214115142822266, "learning_rate": 4.916100461788655e-05, "loss": 2.8545, "step": 1043 }, { "epoch": 0.17460383827403103, "grad_norm": 5.483109474182129, "learning_rate": 4.915725096856769e-05, "loss": 3.0086, "step": 1044 }, { "epoch": 0.17477108332984906, "grad_norm": 6.05507230758667, "learning_rate": 4.915348908505168e-05, "loss": 3.3078, "step": 1045 }, { "epoch": 0.17493832838566709, "grad_norm": 4.287694454193115, "learning_rate": 4.914971896862082e-05, "loss": 3.0935, "step": 1046 }, { "epoch": 0.17510557344148514, "grad_norm": 3.9438233375549316, "learning_rate": 4.914594062056016e-05, "loss": 3.0032, "step": 1047 }, { "epoch": 0.17527281849730317, "grad_norm": 2.2108142375946045, "learning_rate": 4.9142154042157595e-05, "loss": 2.7388, "step": 1048 }, { "epoch": 0.17544006355312122, "grad_norm": 2.4942917823791504, "learning_rate": 4.9138359234703815e-05, "loss": 3.0388, "step": 1049 }, { "epoch": 0.17560730860893925, "grad_norm": 2.5240321159362793, "learning_rate": 4.91345561994923e-05, "loss": 2.6449, "step": 1050 }, { "epoch": 0.17577455366475728, "grad_norm": 2.5153090953826904, "learning_rate": 4.913074493781934e-05, "loss": 2.9018, "step": 1051 }, { "epoch": 0.17594179872057533, "grad_norm": 7.784184455871582, "learning_rate": 4.912692545098405e-05, "loss": 3.4618, "step": 1052 }, { "epoch": 0.17610904377639336, "grad_norm": 5.902459144592285, "learning_rate": 4.912309774028834e-05, "loss": 2.895, "step": 1053 }, { "epoch": 0.1762762888322114, "grad_norm": 2.021613836288452, "learning_rate": 4.911926180703689e-05, "loss": 2.7497, "step": 1054 }, { "epoch": 0.17644353388802944, "grad_norm": 2.7701168060302734, "learning_rate": 4.911541765253723e-05, "loss": 3.0276, "step": 1055 }, { "epoch": 0.17661077894384747, "grad_norm": 5.316072940826416, "learning_rate": 4.9111565278099656e-05, "loss": 3.4204, "step": 1056 }, { "epoch": 0.17677802399966552, "grad_norm": 5.430384635925293, "learning_rate": 4.91077046850373e-05, "loss": 2.6331, "step": 1057 }, { "epoch": 0.17694526905548355, "grad_norm": 4.047879219055176, "learning_rate": 4.9103835874666074e-05, "loss": 3.1512, "step": 1058 }, { "epoch": 0.17711251411130158, "grad_norm": 4.993270397186279, "learning_rate": 4.9099958848304675e-05, "loss": 3.4328, "step": 1059 }, { "epoch": 0.17727975916711963, "grad_norm": 3.610191583633423, "learning_rate": 4.909607360727463e-05, "loss": 3.3773, "step": 1060 }, { "epoch": 0.17744700422293766, "grad_norm": 2.9813756942749023, "learning_rate": 4.909218015290027e-05, "loss": 2.7069, "step": 1061 }, { "epoch": 0.1776142492787557, "grad_norm": 5.061583995819092, "learning_rate": 4.908827848650869e-05, "loss": 2.7553, "step": 1062 }, { "epoch": 0.17778149433457374, "grad_norm": 2.5538523197174072, "learning_rate": 4.908436860942981e-05, "loss": 2.9755, "step": 1063 }, { "epoch": 0.17794873939039177, "grad_norm": 2.3692193031311035, "learning_rate": 4.9080450522996355e-05, "loss": 2.7433, "step": 1064 }, { "epoch": 0.1781159844462098, "grad_norm": 3.1756012439727783, "learning_rate": 4.907652422854382e-05, "loss": 3.337, "step": 1065 }, { "epoch": 0.17828322950202785, "grad_norm": 4.927761077880859, "learning_rate": 4.907258972741052e-05, "loss": 2.5134, "step": 1066 }, { "epoch": 0.17845047455784588, "grad_norm": 3.013889789581299, "learning_rate": 4.906864702093757e-05, "loss": 2.7425, "step": 1067 }, { "epoch": 0.17861771961366393, "grad_norm": 5.149726390838623, "learning_rate": 4.906469611046888e-05, "loss": 2.8619, "step": 1068 }, { "epoch": 0.17878496466948196, "grad_norm": 2.7834670543670654, "learning_rate": 4.906073699735113e-05, "loss": 2.5159, "step": 1069 }, { "epoch": 0.1789522097253, "grad_norm": 5.5360236167907715, "learning_rate": 4.905676968293382e-05, "loss": 3.0103, "step": 1070 }, { "epoch": 0.17911945478111804, "grad_norm": 7.0627827644348145, "learning_rate": 4.905279416856925e-05, "loss": 3.3889, "step": 1071 }, { "epoch": 0.17928669983693607, "grad_norm": 3.191145181655884, "learning_rate": 4.90488104556125e-05, "loss": 3.2054, "step": 1072 }, { "epoch": 0.1794539448927541, "grad_norm": 3.0523056983947754, "learning_rate": 4.9044818545421467e-05, "loss": 2.9173, "step": 1073 }, { "epoch": 0.17962118994857215, "grad_norm": 6.159040927886963, "learning_rate": 4.9040818439356805e-05, "loss": 3.0406, "step": 1074 }, { "epoch": 0.17978843500439018, "grad_norm": 5.8939642906188965, "learning_rate": 4.9036810138781997e-05, "loss": 3.4368, "step": 1075 }, { "epoch": 0.1799556800602082, "grad_norm": 5.667107582092285, "learning_rate": 4.90327936450633e-05, "loss": 3.2237, "step": 1076 }, { "epoch": 0.18012292511602626, "grad_norm": 4.8016815185546875, "learning_rate": 4.902876895956976e-05, "loss": 3.0558, "step": 1077 }, { "epoch": 0.1802901701718443, "grad_norm": 4.439964771270752, "learning_rate": 4.902473608367324e-05, "loss": 2.8794, "step": 1078 }, { "epoch": 0.18045741522766234, "grad_norm": 2.7001640796661377, "learning_rate": 4.902069501874837e-05, "loss": 2.8952, "step": 1079 }, { "epoch": 0.18062466028348037, "grad_norm": 2.9338314533233643, "learning_rate": 4.9016645766172584e-05, "loss": 2.6485, "step": 1080 }, { "epoch": 0.1807919053392984, "grad_norm": 3.735612630844116, "learning_rate": 4.9012588327326104e-05, "loss": 3.0966, "step": 1081 }, { "epoch": 0.18095915039511645, "grad_norm": 2.8932912349700928, "learning_rate": 4.900852270359193e-05, "loss": 2.6371, "step": 1082 }, { "epoch": 0.18112639545093448, "grad_norm": 3.9144272804260254, "learning_rate": 4.900444889635588e-05, "loss": 2.934, "step": 1083 }, { "epoch": 0.1812936405067525, "grad_norm": 2.4812169075012207, "learning_rate": 4.900036690700653e-05, "loss": 2.615, "step": 1084 }, { "epoch": 0.18146088556257056, "grad_norm": 2.1633071899414062, "learning_rate": 4.899627673693527e-05, "loss": 2.7079, "step": 1085 }, { "epoch": 0.1816281306183886, "grad_norm": 3.7712879180908203, "learning_rate": 4.8992178387536264e-05, "loss": 2.799, "step": 1086 }, { "epoch": 0.18179537567420664, "grad_norm": 2.2391879558563232, "learning_rate": 4.8988071860206465e-05, "loss": 2.7833, "step": 1087 }, { "epoch": 0.18196262073002467, "grad_norm": 6.898645877838135, "learning_rate": 4.898395715634562e-05, "loss": 3.261, "step": 1088 }, { "epoch": 0.1821298657858427, "grad_norm": 6.01616096496582, "learning_rate": 4.897983427735626e-05, "loss": 3.0961, "step": 1089 }, { "epoch": 0.18229711084166075, "grad_norm": 5.209904670715332, "learning_rate": 4.8975703224643696e-05, "loss": 2.7537, "step": 1090 }, { "epoch": 0.18246435589747878, "grad_norm": 2.476249933242798, "learning_rate": 4.897156399961604e-05, "loss": 2.7008, "step": 1091 }, { "epoch": 0.1826316009532968, "grad_norm": 2.995408773422241, "learning_rate": 4.896741660368418e-05, "loss": 2.7033, "step": 1092 }, { "epoch": 0.18279884600911486, "grad_norm": 4.466170787811279, "learning_rate": 4.896326103826178e-05, "loss": 3.0674, "step": 1093 }, { "epoch": 0.1829660910649329, "grad_norm": 3.4503443241119385, "learning_rate": 4.895909730476531e-05, "loss": 2.889, "step": 1094 }, { "epoch": 0.18313333612075092, "grad_norm": 6.719694137573242, "learning_rate": 4.895492540461401e-05, "loss": 3.3258, "step": 1095 }, { "epoch": 0.18330058117656897, "grad_norm": 2.9557948112487793, "learning_rate": 4.89507453392299e-05, "loss": 2.9158, "step": 1096 }, { "epoch": 0.183467826232387, "grad_norm": 3.9410359859466553, "learning_rate": 4.894655711003779e-05, "loss": 2.9996, "step": 1097 }, { "epoch": 0.18363507128820505, "grad_norm": 3.9904565811157227, "learning_rate": 4.8942360718465284e-05, "loss": 3.0863, "step": 1098 }, { "epoch": 0.18380231634402308, "grad_norm": 3.711761474609375, "learning_rate": 4.893815616594275e-05, "loss": 2.8775, "step": 1099 }, { "epoch": 0.1839695613998411, "grad_norm": 8.39311695098877, "learning_rate": 4.893394345390334e-05, "loss": 3.0536, "step": 1100 }, { "epoch": 0.18413680645565916, "grad_norm": 3.7164273262023926, "learning_rate": 4.8929722583782995e-05, "loss": 2.6985, "step": 1101 }, { "epoch": 0.1843040515114772, "grad_norm": 4.11097526550293, "learning_rate": 4.892549355702043e-05, "loss": 3.2946, "step": 1102 }, { "epoch": 0.18447129656729522, "grad_norm": 3.1277384757995605, "learning_rate": 4.8921256375057144e-05, "loss": 2.6912, "step": 1103 }, { "epoch": 0.18463854162311327, "grad_norm": 3.8946139812469482, "learning_rate": 4.8917011039337426e-05, "loss": 2.9477, "step": 1104 }, { "epoch": 0.1848057866789313, "grad_norm": 7.040565013885498, "learning_rate": 4.891275755130832e-05, "loss": 3.1332, "step": 1105 }, { "epoch": 0.18497303173474935, "grad_norm": 2.6003944873809814, "learning_rate": 4.890849591241967e-05, "loss": 3.1981, "step": 1106 }, { "epoch": 0.18514027679056738, "grad_norm": 4.635796546936035, "learning_rate": 4.890422612412409e-05, "loss": 3.3411, "step": 1107 }, { "epoch": 0.1853075218463854, "grad_norm": 2.800302505493164, "learning_rate": 4.8899948187876956e-05, "loss": 2.5537, "step": 1108 }, { "epoch": 0.18547476690220346, "grad_norm": 2.694589138031006, "learning_rate": 4.889566210513646e-05, "loss": 2.8271, "step": 1109 }, { "epoch": 0.1856420119580215, "grad_norm": 4.3596367835998535, "learning_rate": 4.8891367877363544e-05, "loss": 3.0145, "step": 1110 }, { "epoch": 0.18580925701383952, "grad_norm": 17.85613441467285, "learning_rate": 4.888706550602193e-05, "loss": 4.1476, "step": 1111 }, { "epoch": 0.18597650206965757, "grad_norm": 5.663623809814453, "learning_rate": 4.88827549925781e-05, "loss": 3.2846, "step": 1112 }, { "epoch": 0.1861437471254756, "grad_norm": 3.3504414558410645, "learning_rate": 4.887843633850136e-05, "loss": 2.7227, "step": 1113 }, { "epoch": 0.18631099218129363, "grad_norm": 4.39896821975708, "learning_rate": 4.887410954526372e-05, "loss": 2.8843, "step": 1114 }, { "epoch": 0.18647823723711168, "grad_norm": 3.3895323276519775, "learning_rate": 4.8869774614340033e-05, "loss": 3.1154, "step": 1115 }, { "epoch": 0.1866454822929297, "grad_norm": 3.397095203399658, "learning_rate": 4.886543154720789e-05, "loss": 3.0653, "step": 1116 }, { "epoch": 0.18681272734874776, "grad_norm": 3.5690619945526123, "learning_rate": 4.8861080345347656e-05, "loss": 2.7945, "step": 1117 }, { "epoch": 0.1869799724045658, "grad_norm": 4.070973873138428, "learning_rate": 4.8856721010242465e-05, "loss": 3.3538, "step": 1118 }, { "epoch": 0.18714721746038382, "grad_norm": 1.9790748357772827, "learning_rate": 4.885235354337825e-05, "loss": 2.527, "step": 1119 }, { "epoch": 0.18731446251620187, "grad_norm": 4.624767780303955, "learning_rate": 4.884797794624368e-05, "loss": 3.1118, "step": 1120 }, { "epoch": 0.1874817075720199, "grad_norm": 2.497506856918335, "learning_rate": 4.8843594220330235e-05, "loss": 2.8103, "step": 1121 }, { "epoch": 0.18764895262783793, "grad_norm": 3.6937246322631836, "learning_rate": 4.883920236713213e-05, "loss": 2.5864, "step": 1122 }, { "epoch": 0.18781619768365598, "grad_norm": 2.662184000015259, "learning_rate": 4.8834802388146364e-05, "loss": 3.1709, "step": 1123 }, { "epoch": 0.187983442739474, "grad_norm": 3.814220666885376, "learning_rate": 4.883039428487271e-05, "loss": 2.9483, "step": 1124 }, { "epoch": 0.18815068779529207, "grad_norm": 4.398327350616455, "learning_rate": 4.8825978058813695e-05, "loss": 3.0432, "step": 1125 }, { "epoch": 0.1883179328511101, "grad_norm": 1.9993137121200562, "learning_rate": 4.8821553711474647e-05, "loss": 2.4378, "step": 1126 }, { "epoch": 0.18848517790692812, "grad_norm": 2.7537593841552734, "learning_rate": 4.881712124436362e-05, "loss": 2.5359, "step": 1127 }, { "epoch": 0.18865242296274617, "grad_norm": 3.809882640838623, "learning_rate": 4.881268065899146e-05, "loss": 2.9499, "step": 1128 }, { "epoch": 0.1888196680185642, "grad_norm": 3.145026683807373, "learning_rate": 4.88082319568718e-05, "loss": 2.9202, "step": 1129 }, { "epoch": 0.18898691307438223, "grad_norm": 4.797821998596191, "learning_rate": 4.880377513952099e-05, "loss": 2.6811, "step": 1130 }, { "epoch": 0.18915415813020028, "grad_norm": 5.488893032073975, "learning_rate": 4.879931020845818e-05, "loss": 3.5815, "step": 1131 }, { "epoch": 0.1893214031860183, "grad_norm": 4.178220272064209, "learning_rate": 4.879483716520529e-05, "loss": 2.5811, "step": 1132 }, { "epoch": 0.18948864824183634, "grad_norm": 4.021731376647949, "learning_rate": 4.879035601128698e-05, "loss": 3.0282, "step": 1133 }, { "epoch": 0.1896558932976544, "grad_norm": 3.562116861343384, "learning_rate": 4.8785866748230684e-05, "loss": 2.637, "step": 1134 }, { "epoch": 0.18982313835347242, "grad_norm": 5.187073230743408, "learning_rate": 4.878136937756662e-05, "loss": 2.6086, "step": 1135 }, { "epoch": 0.18999038340929048, "grad_norm": 7.047665596008301, "learning_rate": 4.877686390082774e-05, "loss": 2.3912, "step": 1136 }, { "epoch": 0.1901576284651085, "grad_norm": 3.066321611404419, "learning_rate": 4.8772350319549784e-05, "loss": 2.6739, "step": 1137 }, { "epoch": 0.19032487352092653, "grad_norm": 3.410684823989868, "learning_rate": 4.876782863527124e-05, "loss": 2.5171, "step": 1138 }, { "epoch": 0.19049211857674458, "grad_norm": 3.9520108699798584, "learning_rate": 4.876329884953335e-05, "loss": 3.0862, "step": 1139 }, { "epoch": 0.1906593636325626, "grad_norm": 2.9469220638275146, "learning_rate": 4.875876096388015e-05, "loss": 2.901, "step": 1140 }, { "epoch": 0.19082660868838064, "grad_norm": 2.908921241760254, "learning_rate": 4.875421497985839e-05, "loss": 3.2015, "step": 1141 }, { "epoch": 0.1909938537441987, "grad_norm": 3.1349356174468994, "learning_rate": 4.8749660899017625e-05, "loss": 2.9837, "step": 1142 }, { "epoch": 0.19116109880001672, "grad_norm": 5.077385425567627, "learning_rate": 4.8745098722910145e-05, "loss": 2.857, "step": 1143 }, { "epoch": 0.19132834385583475, "grad_norm": 4.6072540283203125, "learning_rate": 4.8740528453091004e-05, "loss": 3.2037, "step": 1144 }, { "epoch": 0.1914955889116528, "grad_norm": 3.0631356239318848, "learning_rate": 4.873595009111802e-05, "loss": 3.0136, "step": 1145 }, { "epoch": 0.19166283396747083, "grad_norm": 3.0217983722686768, "learning_rate": 4.873136363855176e-05, "loss": 2.8001, "step": 1146 }, { "epoch": 0.19183007902328889, "grad_norm": 4.596329212188721, "learning_rate": 4.872676909695555e-05, "loss": 2.7715, "step": 1147 }, { "epoch": 0.1919973240791069, "grad_norm": 5.042298316955566, "learning_rate": 4.872216646789549e-05, "loss": 2.9062, "step": 1148 }, { "epoch": 0.19216456913492494, "grad_norm": 3.7145698070526123, "learning_rate": 4.871755575294042e-05, "loss": 2.8479, "step": 1149 }, { "epoch": 0.192331814190743, "grad_norm": 3.840198040008545, "learning_rate": 4.871293695366193e-05, "loss": 3.1549, "step": 1150 }, { "epoch": 0.19249905924656102, "grad_norm": 4.179709434509277, "learning_rate": 4.8708310071634386e-05, "loss": 2.5914, "step": 1151 }, { "epoch": 0.19266630430237905, "grad_norm": 3.9370927810668945, "learning_rate": 4.87036751084349e-05, "loss": 3.0293, "step": 1152 }, { "epoch": 0.1928335493581971, "grad_norm": 3.707364320755005, "learning_rate": 4.869903206564332e-05, "loss": 2.9702, "step": 1153 }, { "epoch": 0.19300079441401513, "grad_norm": 2.730984687805176, "learning_rate": 4.869438094484229e-05, "loss": 2.9435, "step": 1154 }, { "epoch": 0.19316803946983319, "grad_norm": 3.364837169647217, "learning_rate": 4.8689721747617164e-05, "loss": 2.8533, "step": 1155 }, { "epoch": 0.1933352845256512, "grad_norm": 4.059622287750244, "learning_rate": 4.868505447555608e-05, "loss": 2.6496, "step": 1156 }, { "epoch": 0.19350252958146924, "grad_norm": 2.878678560256958, "learning_rate": 4.868037913024991e-05, "loss": 2.7303, "step": 1157 }, { "epoch": 0.1936697746372873, "grad_norm": 5.486072540283203, "learning_rate": 4.867569571329228e-05, "loss": 2.8752, "step": 1158 }, { "epoch": 0.19383701969310532, "grad_norm": 2.37099289894104, "learning_rate": 4.867100422627958e-05, "loss": 2.7615, "step": 1159 }, { "epoch": 0.19400426474892335, "grad_norm": 7.198881149291992, "learning_rate": 4.866630467081094e-05, "loss": 3.6641, "step": 1160 }, { "epoch": 0.1941715098047414, "grad_norm": 1.9120362997055054, "learning_rate": 4.8661597048488225e-05, "loss": 2.6054, "step": 1161 }, { "epoch": 0.19433875486055943, "grad_norm": 8.825066566467285, "learning_rate": 4.865688136091609e-05, "loss": 3.4713, "step": 1162 }, { "epoch": 0.19450599991637746, "grad_norm": 3.053084135055542, "learning_rate": 4.86521576097019e-05, "loss": 2.7778, "step": 1163 }, { "epoch": 0.1946732449721955, "grad_norm": 2.259648323059082, "learning_rate": 4.86474257964558e-05, "loss": 3.0337, "step": 1164 }, { "epoch": 0.19484049002801354, "grad_norm": 3.408273935317993, "learning_rate": 4.864268592279065e-05, "loss": 3.0519, "step": 1165 }, { "epoch": 0.1950077350838316, "grad_norm": 2.3645620346069336, "learning_rate": 4.8637937990322094e-05, "loss": 2.4724, "step": 1166 }, { "epoch": 0.19517498013964962, "grad_norm": 3.6451363563537598, "learning_rate": 4.863318200066848e-05, "loss": 2.8829, "step": 1167 }, { "epoch": 0.19534222519546765, "grad_norm": 2.4679222106933594, "learning_rate": 4.8628417955450945e-05, "loss": 2.7294, "step": 1168 }, { "epoch": 0.1955094702512857, "grad_norm": 4.277161598205566, "learning_rate": 4.8623645856293345e-05, "loss": 3.0717, "step": 1169 }, { "epoch": 0.19567671530710373, "grad_norm": 3.5200629234313965, "learning_rate": 4.861886570482229e-05, "loss": 2.7823, "step": 1170 }, { "epoch": 0.19584396036292176, "grad_norm": 3.960588216781616, "learning_rate": 4.861407750266713e-05, "loss": 3.1927, "step": 1171 }, { "epoch": 0.19601120541873981, "grad_norm": 2.759403944015503, "learning_rate": 4.860928125145998e-05, "loss": 2.7178, "step": 1172 }, { "epoch": 0.19617845047455784, "grad_norm": 5.449185371398926, "learning_rate": 4.860447695283566e-05, "loss": 3.1217, "step": 1173 }, { "epoch": 0.1963456955303759, "grad_norm": 3.6755683422088623, "learning_rate": 4.859966460843177e-05, "loss": 2.7799, "step": 1174 }, { "epoch": 0.19651294058619392, "grad_norm": 4.535074234008789, "learning_rate": 4.8594844219888626e-05, "loss": 2.6574, "step": 1175 }, { "epoch": 0.19668018564201195, "grad_norm": 2.947943687438965, "learning_rate": 4.85900157888493e-05, "loss": 2.7945, "step": 1176 }, { "epoch": 0.19684743069783, "grad_norm": 2.9948387145996094, "learning_rate": 4.8585179316959606e-05, "loss": 3.045, "step": 1177 }, { "epoch": 0.19701467575364803, "grad_norm": 2.130695104598999, "learning_rate": 4.85803348058681e-05, "loss": 2.7696, "step": 1178 }, { "epoch": 0.19718192080946606, "grad_norm": 2.878772020339966, "learning_rate": 4.857548225722606e-05, "loss": 2.9274, "step": 1179 }, { "epoch": 0.19734916586528412, "grad_norm": 4.195512771606445, "learning_rate": 4.857062167268752e-05, "loss": 2.96, "step": 1180 }, { "epoch": 0.19751641092110214, "grad_norm": 3.115865468978882, "learning_rate": 4.8565753053909256e-05, "loss": 2.9591, "step": 1181 }, { "epoch": 0.19768365597692017, "grad_norm": 4.709169387817383, "learning_rate": 4.856087640255078e-05, "loss": 3.2066, "step": 1182 }, { "epoch": 0.19785090103273822, "grad_norm": 3.7177727222442627, "learning_rate": 4.855599172027433e-05, "loss": 2.6908, "step": 1183 }, { "epoch": 0.19801814608855625, "grad_norm": 2.8268134593963623, "learning_rate": 4.855109900874489e-05, "loss": 2.8825, "step": 1184 }, { "epoch": 0.1981853911443743, "grad_norm": 3.140854597091675, "learning_rate": 4.8546198269630185e-05, "loss": 2.3638, "step": 1185 }, { "epoch": 0.19835263620019233, "grad_norm": 3.6860625743865967, "learning_rate": 4.854128950460067e-05, "loss": 2.747, "step": 1186 }, { "epoch": 0.19851988125601036, "grad_norm": 4.045935153961182, "learning_rate": 4.853637271532954e-05, "loss": 2.6922, "step": 1187 }, { "epoch": 0.19868712631182842, "grad_norm": 3.2892701625823975, "learning_rate": 4.8531447903492724e-05, "loss": 3.1101, "step": 1188 }, { "epoch": 0.19885437136764644, "grad_norm": 2.898164987564087, "learning_rate": 4.8526515070768876e-05, "loss": 2.712, "step": 1189 }, { "epoch": 0.19902161642346447, "grad_norm": 3.0304291248321533, "learning_rate": 4.852157421883941e-05, "loss": 2.5732, "step": 1190 }, { "epoch": 0.19918886147928253, "grad_norm": 2.818896770477295, "learning_rate": 4.851662534938844e-05, "loss": 2.8168, "step": 1191 }, { "epoch": 0.19935610653510055, "grad_norm": 7.033897876739502, "learning_rate": 4.851166846410282e-05, "loss": 2.5846, "step": 1192 }, { "epoch": 0.1995233515909186, "grad_norm": 4.727588176727295, "learning_rate": 4.850670356467217e-05, "loss": 3.593, "step": 1193 }, { "epoch": 0.19969059664673663, "grad_norm": 2.420616626739502, "learning_rate": 4.85017306527888e-05, "loss": 3.0711, "step": 1194 }, { "epoch": 0.19985784170255466, "grad_norm": 5.16896915435791, "learning_rate": 4.849674973014778e-05, "loss": 2.8884, "step": 1195 }, { "epoch": 0.20002508675837272, "grad_norm": 3.997472047805786, "learning_rate": 4.8491760798446886e-05, "loss": 2.6767, "step": 1196 }, { "epoch": 0.20019233181419074, "grad_norm": 4.271536350250244, "learning_rate": 4.848676385938665e-05, "loss": 2.9512, "step": 1197 }, { "epoch": 0.20035957687000877, "grad_norm": 2.418776035308838, "learning_rate": 4.84817589146703e-05, "loss": 2.8783, "step": 1198 }, { "epoch": 0.20052682192582683, "grad_norm": 4.054032802581787, "learning_rate": 4.847674596600383e-05, "loss": 2.744, "step": 1199 }, { "epoch": 0.20069406698164485, "grad_norm": 5.5014328956604, "learning_rate": 4.847172501509594e-05, "loss": 2.8859, "step": 1200 }, { "epoch": 0.20086131203746288, "grad_norm": 3.9138543605804443, "learning_rate": 4.8466696063658053e-05, "loss": 3.0285, "step": 1201 }, { "epoch": 0.20102855709328094, "grad_norm": 3.6063599586486816, "learning_rate": 4.846165911340434e-05, "loss": 2.8015, "step": 1202 }, { "epoch": 0.20119580214909896, "grad_norm": 3.4767448902130127, "learning_rate": 4.8456614166051686e-05, "loss": 2.9807, "step": 1203 }, { "epoch": 0.20136304720491702, "grad_norm": 2.7155399322509766, "learning_rate": 4.845156122331969e-05, "loss": 2.5133, "step": 1204 }, { "epoch": 0.20153029226073504, "grad_norm": 4.280088424682617, "learning_rate": 4.844650028693071e-05, "loss": 2.9262, "step": 1205 }, { "epoch": 0.20169753731655307, "grad_norm": 3.565654754638672, "learning_rate": 4.844143135860979e-05, "loss": 2.9265, "step": 1206 }, { "epoch": 0.20186478237237113, "grad_norm": 3.1564314365386963, "learning_rate": 4.843635444008472e-05, "loss": 2.6668, "step": 1207 }, { "epoch": 0.20203202742818915, "grad_norm": 6.715216636657715, "learning_rate": 4.843126953308601e-05, "loss": 3.421, "step": 1208 }, { "epoch": 0.20219927248400718, "grad_norm": 4.200869083404541, "learning_rate": 4.84261766393469e-05, "loss": 3.3497, "step": 1209 }, { "epoch": 0.20236651753982524, "grad_norm": 2.296696186065674, "learning_rate": 4.8421075760603326e-05, "loss": 2.3632, "step": 1210 }, { "epoch": 0.20253376259564326, "grad_norm": 2.7671566009521484, "learning_rate": 4.841596689859399e-05, "loss": 2.6567, "step": 1211 }, { "epoch": 0.2027010076514613, "grad_norm": 5.453179836273193, "learning_rate": 4.8410850055060276e-05, "loss": 3.2083, "step": 1212 }, { "epoch": 0.20286825270727935, "grad_norm": 3.861680030822754, "learning_rate": 4.840572523174629e-05, "loss": 2.9337, "step": 1213 }, { "epoch": 0.20303549776309737, "grad_norm": 5.592348098754883, "learning_rate": 4.84005924303989e-05, "loss": 3.9621, "step": 1214 }, { "epoch": 0.20320274281891543, "grad_norm": 5.376613616943359, "learning_rate": 4.839545165276763e-05, "loss": 2.7551, "step": 1215 }, { "epoch": 0.20336998787473345, "grad_norm": 3.1772940158843994, "learning_rate": 4.8390302900604775e-05, "loss": 2.6795, "step": 1216 }, { "epoch": 0.20353723293055148, "grad_norm": 3.8724350929260254, "learning_rate": 4.838514617566533e-05, "loss": 3.0143, "step": 1217 }, { "epoch": 0.20370447798636954, "grad_norm": 2.854792356491089, "learning_rate": 4.8379981479706995e-05, "loss": 2.6564, "step": 1218 }, { "epoch": 0.20387172304218756, "grad_norm": 4.135457992553711, "learning_rate": 4.837480881449021e-05, "loss": 2.5697, "step": 1219 }, { "epoch": 0.2040389680980056, "grad_norm": 5.041413307189941, "learning_rate": 4.836962818177812e-05, "loss": 3.2098, "step": 1220 }, { "epoch": 0.20420621315382365, "grad_norm": 3.4275596141815186, "learning_rate": 4.836443958333658e-05, "loss": 2.8653, "step": 1221 }, { "epoch": 0.20437345820964167, "grad_norm": 2.569957733154297, "learning_rate": 4.8359243020934176e-05, "loss": 2.8478, "step": 1222 }, { "epoch": 0.20454070326545973, "grad_norm": 4.869930744171143, "learning_rate": 4.8354038496342193e-05, "loss": 2.7594, "step": 1223 }, { "epoch": 0.20470794832127776, "grad_norm": 3.0447633266448975, "learning_rate": 4.8348826011334634e-05, "loss": 2.8848, "step": 1224 }, { "epoch": 0.20487519337709578, "grad_norm": 3.317410945892334, "learning_rate": 4.834360556768821e-05, "loss": 3.0236, "step": 1225 }, { "epoch": 0.20504243843291384, "grad_norm": 4.5428643226623535, "learning_rate": 4.833837716718238e-05, "loss": 2.8855, "step": 1226 }, { "epoch": 0.20520968348873186, "grad_norm": 3.565728187561035, "learning_rate": 4.833314081159925e-05, "loss": 2.9325, "step": 1227 }, { "epoch": 0.2053769285445499, "grad_norm": 4.131716728210449, "learning_rate": 4.83278965027237e-05, "loss": 2.6906, "step": 1228 }, { "epoch": 0.20554417360036795, "grad_norm": 2.5135366916656494, "learning_rate": 4.832264424234329e-05, "loss": 3.4258, "step": 1229 }, { "epoch": 0.20571141865618597, "grad_norm": 5.255706787109375, "learning_rate": 4.8317384032248295e-05, "loss": 2.9737, "step": 1230 }, { "epoch": 0.205878663712004, "grad_norm": 4.959433555603027, "learning_rate": 4.83121158742317e-05, "loss": 3.0424, "step": 1231 }, { "epoch": 0.20604590876782206, "grad_norm": 5.668098449707031, "learning_rate": 4.8306839770089196e-05, "loss": 2.9528, "step": 1232 }, { "epoch": 0.20621315382364008, "grad_norm": 4.543437957763672, "learning_rate": 4.83015557216192e-05, "loss": 3.139, "step": 1233 }, { "epoch": 0.20638039887945814, "grad_norm": 5.569127082824707, "learning_rate": 4.82962637306228e-05, "loss": 2.9548, "step": 1234 }, { "epoch": 0.20654764393527617, "grad_norm": 2.7335026264190674, "learning_rate": 4.8290963798903845e-05, "loss": 2.8303, "step": 1235 }, { "epoch": 0.2067148889910942, "grad_norm": 3.8497395515441895, "learning_rate": 4.828565592826883e-05, "loss": 2.868, "step": 1236 }, { "epoch": 0.20688213404691225, "grad_norm": 5.058129787445068, "learning_rate": 4.8280340120527003e-05, "loss": 3.6506, "step": 1237 }, { "epoch": 0.20704937910273027, "grad_norm": 3.033888101577759, "learning_rate": 4.82750163774903e-05, "loss": 3.0653, "step": 1238 }, { "epoch": 0.2072166241585483, "grad_norm": 6.003199100494385, "learning_rate": 4.826968470097336e-05, "loss": 3.2298, "step": 1239 }, { "epoch": 0.20738386921436636, "grad_norm": 3.1966686248779297, "learning_rate": 4.826434509279353e-05, "loss": 2.9773, "step": 1240 }, { "epoch": 0.20755111427018438, "grad_norm": 3.467613458633423, "learning_rate": 4.8258997554770854e-05, "loss": 2.8109, "step": 1241 }, { "epoch": 0.20771835932600244, "grad_norm": 9.343045234680176, "learning_rate": 4.8253642088728094e-05, "loss": 2.9965, "step": 1242 }, { "epoch": 0.20788560438182047, "grad_norm": 3.621164083480835, "learning_rate": 4.82482786964907e-05, "loss": 3.0476, "step": 1243 }, { "epoch": 0.2080528494376385, "grad_norm": 3.9923720359802246, "learning_rate": 4.824290737988682e-05, "loss": 3.1442, "step": 1244 }, { "epoch": 0.20822009449345655, "grad_norm": 4.042149543762207, "learning_rate": 4.823752814074732e-05, "loss": 3.0432, "step": 1245 }, { "epoch": 0.20838733954927458, "grad_norm": 3.6483752727508545, "learning_rate": 4.823214098090576e-05, "loss": 2.74, "step": 1246 }, { "epoch": 0.2085545846050926, "grad_norm": 3.74108624458313, "learning_rate": 4.822674590219839e-05, "loss": 2.6679, "step": 1247 }, { "epoch": 0.20872182966091066, "grad_norm": 3.148432970046997, "learning_rate": 4.8221342906464174e-05, "loss": 2.8572, "step": 1248 }, { "epoch": 0.20888907471672868, "grad_norm": 5.087082386016846, "learning_rate": 4.8215931995544764e-05, "loss": 3.0352, "step": 1249 }, { "epoch": 0.2090563197725467, "grad_norm": 5.647087097167969, "learning_rate": 4.8210513171284506e-05, "loss": 3.3287, "step": 1250 }, { "epoch": 0.20922356482836477, "grad_norm": 4.060773849487305, "learning_rate": 4.820508643553046e-05, "loss": 3.1807, "step": 1251 }, { "epoch": 0.2093908098841828, "grad_norm": 4.294201850891113, "learning_rate": 4.819965179013237e-05, "loss": 3.2408, "step": 1252 }, { "epoch": 0.20955805494000085, "grad_norm": 2.693338632583618, "learning_rate": 4.8194209236942686e-05, "loss": 2.8667, "step": 1253 }, { "epoch": 0.20972529999581888, "grad_norm": 2.680729866027832, "learning_rate": 4.8188758777816546e-05, "loss": 2.7056, "step": 1254 }, { "epoch": 0.2098925450516369, "grad_norm": 3.736320734024048, "learning_rate": 4.818330041461177e-05, "loss": 2.617, "step": 1255 }, { "epoch": 0.21005979010745496, "grad_norm": 4.893702507019043, "learning_rate": 4.817783414918891e-05, "loss": 3.142, "step": 1256 }, { "epoch": 0.21022703516327299, "grad_norm": 2.5011513233184814, "learning_rate": 4.8172359983411155e-05, "loss": 2.5505, "step": 1257 }, { "epoch": 0.210394280219091, "grad_norm": 2.2320051193237305, "learning_rate": 4.8166877919144445e-05, "loss": 2.8279, "step": 1258 }, { "epoch": 0.21056152527490907, "grad_norm": 4.5419182777404785, "learning_rate": 4.816138795825737e-05, "loss": 2.7334, "step": 1259 }, { "epoch": 0.2107287703307271, "grad_norm": 3.85872220993042, "learning_rate": 4.815589010262125e-05, "loss": 3.1598, "step": 1260 }, { "epoch": 0.21089601538654512, "grad_norm": 7.619253635406494, "learning_rate": 4.8150384354110054e-05, "loss": 3.1601, "step": 1261 }, { "epoch": 0.21106326044236318, "grad_norm": 2.5515096187591553, "learning_rate": 4.814487071460047e-05, "loss": 2.7618, "step": 1262 }, { "epoch": 0.2112305054981812, "grad_norm": 3.2550411224365234, "learning_rate": 4.8139349185971866e-05, "loss": 3.0043, "step": 1263 }, { "epoch": 0.21139775055399926, "grad_norm": 4.658783435821533, "learning_rate": 4.8133819770106305e-05, "loss": 2.9828, "step": 1264 }, { "epoch": 0.21156499560981729, "grad_norm": 3.6708786487579346, "learning_rate": 4.812828246888852e-05, "loss": 2.8966, "step": 1265 }, { "epoch": 0.2117322406656353, "grad_norm": 2.8498246669769287, "learning_rate": 4.8122737284205955e-05, "loss": 2.7149, "step": 1266 }, { "epoch": 0.21189948572145337, "grad_norm": 2.697577714920044, "learning_rate": 4.811718421794874e-05, "loss": 3.1871, "step": 1267 }, { "epoch": 0.2120667307772714, "grad_norm": 4.695863723754883, "learning_rate": 4.811162327200967e-05, "loss": 2.5902, "step": 1268 }, { "epoch": 0.21223397583308942, "grad_norm": 8.364997863769531, "learning_rate": 4.810605444828423e-05, "loss": 3.1553, "step": 1269 }, { "epoch": 0.21240122088890748, "grad_norm": 4.405561447143555, "learning_rate": 4.810047774867063e-05, "loss": 3.2471, "step": 1270 }, { "epoch": 0.2125684659447255, "grad_norm": 3.4752912521362305, "learning_rate": 4.809489317506971e-05, "loss": 2.5773, "step": 1271 }, { "epoch": 0.21273571100054356, "grad_norm": 4.409657001495361, "learning_rate": 4.8089300729385025e-05, "loss": 2.9958, "step": 1272 }, { "epoch": 0.2129029560563616, "grad_norm": 5.3720831871032715, "learning_rate": 4.80837004135228e-05, "loss": 3.2917, "step": 1273 }, { "epoch": 0.2130702011121796, "grad_norm": 3.9324374198913574, "learning_rate": 4.8078092229391966e-05, "loss": 3.0622, "step": 1274 }, { "epoch": 0.21323744616799767, "grad_norm": 3.8348159790039062, "learning_rate": 4.807247617890409e-05, "loss": 3.233, "step": 1275 }, { "epoch": 0.2134046912238157, "grad_norm": 2.538527488708496, "learning_rate": 4.8066852263973475e-05, "loss": 2.7563, "step": 1276 }, { "epoch": 0.21357193627963372, "grad_norm": 6.002004146575928, "learning_rate": 4.8061220486517064e-05, "loss": 3.3881, "step": 1277 }, { "epoch": 0.21373918133545178, "grad_norm": 2.40084171295166, "learning_rate": 4.8055580848454505e-05, "loss": 2.709, "step": 1278 }, { "epoch": 0.2139064263912698, "grad_norm": 3.399263381958008, "learning_rate": 4.804993335170811e-05, "loss": 2.7907, "step": 1279 }, { "epoch": 0.21407367144708783, "grad_norm": 4.750132083892822, "learning_rate": 4.804427799820287e-05, "loss": 2.8811, "step": 1280 }, { "epoch": 0.2142409165029059, "grad_norm": 4.171694278717041, "learning_rate": 4.803861478986646e-05, "loss": 2.8502, "step": 1281 }, { "epoch": 0.21440816155872391, "grad_norm": 3.8132810592651367, "learning_rate": 4.803294372862923e-05, "loss": 3.301, "step": 1282 }, { "epoch": 0.21457540661454197, "grad_norm": 4.274444580078125, "learning_rate": 4.802726481642422e-05, "loss": 2.6393, "step": 1283 }, { "epoch": 0.21474265167036, "grad_norm": 4.319161415100098, "learning_rate": 4.8021578055187125e-05, "loss": 2.8727, "step": 1284 }, { "epoch": 0.21490989672617802, "grad_norm": 4.9232683181762695, "learning_rate": 4.801588344685632e-05, "loss": 2.7316, "step": 1285 }, { "epoch": 0.21507714178199608, "grad_norm": 2.565078020095825, "learning_rate": 4.801018099337286e-05, "loss": 2.7793, "step": 1286 }, { "epoch": 0.2152443868378141, "grad_norm": 3.1935760974884033, "learning_rate": 4.800447069668048e-05, "loss": 2.7417, "step": 1287 }, { "epoch": 0.21541163189363213, "grad_norm": 3.541063070297241, "learning_rate": 4.799875255872558e-05, "loss": 3.0827, "step": 1288 }, { "epoch": 0.2155788769494502, "grad_norm": 3.637702465057373, "learning_rate": 4.799302658145722e-05, "loss": 2.7703, "step": 1289 }, { "epoch": 0.21574612200526821, "grad_norm": 4.976041316986084, "learning_rate": 4.798729276682717e-05, "loss": 2.7303, "step": 1290 }, { "epoch": 0.21591336706108627, "grad_norm": 7.2660417556762695, "learning_rate": 4.798155111678983e-05, "loss": 3.5407, "step": 1291 }, { "epoch": 0.2160806121169043, "grad_norm": 5.283138751983643, "learning_rate": 4.7975801633302294e-05, "loss": 3.3521, "step": 1292 }, { "epoch": 0.21624785717272232, "grad_norm": 3.3907601833343506, "learning_rate": 4.797004431832432e-05, "loss": 2.7342, "step": 1293 }, { "epoch": 0.21641510222854038, "grad_norm": 3.3416504859924316, "learning_rate": 4.796427917381835e-05, "loss": 3.1623, "step": 1294 }, { "epoch": 0.2165823472843584, "grad_norm": 12.395359992980957, "learning_rate": 4.795850620174945e-05, "loss": 3.7449, "step": 1295 }, { "epoch": 0.21674959234017643, "grad_norm": 3.572089195251465, "learning_rate": 4.795272540408542e-05, "loss": 2.8219, "step": 1296 }, { "epoch": 0.2169168373959945, "grad_norm": 6.69376277923584, "learning_rate": 4.794693678279666e-05, "loss": 3.8324, "step": 1297 }, { "epoch": 0.21708408245181252, "grad_norm": 3.0700578689575195, "learning_rate": 4.794114033985629e-05, "loss": 2.9173, "step": 1298 }, { "epoch": 0.21725132750763054, "grad_norm": 3.183117389678955, "learning_rate": 4.793533607724007e-05, "loss": 2.7916, "step": 1299 }, { "epoch": 0.2174185725634486, "grad_norm": 5.978344440460205, "learning_rate": 4.7929523996926424e-05, "loss": 2.9556, "step": 1300 }, { "epoch": 0.21758581761926662, "grad_norm": 2.9593749046325684, "learning_rate": 4.792370410089646e-05, "loss": 2.745, "step": 1301 }, { "epoch": 0.21775306267508468, "grad_norm": 3.765470027923584, "learning_rate": 4.791787639113392e-05, "loss": 2.8609, "step": 1302 }, { "epoch": 0.2179203077309027, "grad_norm": 2.7488696575164795, "learning_rate": 4.791204086962524e-05, "loss": 2.747, "step": 1303 }, { "epoch": 0.21808755278672073, "grad_norm": 5.121306896209717, "learning_rate": 4.790619753835951e-05, "loss": 3.2381, "step": 1304 }, { "epoch": 0.2182547978425388, "grad_norm": 5.04605770111084, "learning_rate": 4.7900346399328464e-05, "loss": 3.0227, "step": 1305 }, { "epoch": 0.21842204289835682, "grad_norm": 5.609301567077637, "learning_rate": 4.789448745452652e-05, "loss": 3.1357, "step": 1306 }, { "epoch": 0.21858928795417484, "grad_norm": 5.2591118812561035, "learning_rate": 4.7888620705950735e-05, "loss": 2.8965, "step": 1307 }, { "epoch": 0.2187565330099929, "grad_norm": 3.6684064865112305, "learning_rate": 4.7882746155600854e-05, "loss": 3.0094, "step": 1308 }, { "epoch": 0.21892377806581093, "grad_norm": 4.028268337249756, "learning_rate": 4.7876863805479253e-05, "loss": 3.0573, "step": 1309 }, { "epoch": 0.21909102312162898, "grad_norm": 5.951987266540527, "learning_rate": 4.787097365759099e-05, "loss": 2.8604, "step": 1310 }, { "epoch": 0.219258268177447, "grad_norm": 7.345202445983887, "learning_rate": 4.7865075713943765e-05, "loss": 2.33, "step": 1311 }, { "epoch": 0.21942551323326503, "grad_norm": 3.597954273223877, "learning_rate": 4.785916997654794e-05, "loss": 2.8241, "step": 1312 }, { "epoch": 0.2195927582890831, "grad_norm": 4.744692802429199, "learning_rate": 4.7853256447416536e-05, "loss": 2.7084, "step": 1313 }, { "epoch": 0.21976000334490112, "grad_norm": 3.191045045852661, "learning_rate": 4.784733512856523e-05, "loss": 2.8219, "step": 1314 }, { "epoch": 0.21992724840071914, "grad_norm": 3.616974115371704, "learning_rate": 4.7841406022012335e-05, "loss": 2.825, "step": 1315 }, { "epoch": 0.2200944934565372, "grad_norm": 7.836925506591797, "learning_rate": 4.7835469129778866e-05, "loss": 3.5525, "step": 1316 }, { "epoch": 0.22026173851235523, "grad_norm": 3.4845545291900635, "learning_rate": 4.782952445388843e-05, "loss": 2.7637, "step": 1317 }, { "epoch": 0.22042898356817325, "grad_norm": 4.098702430725098, "learning_rate": 4.7823571996367344e-05, "loss": 2.8244, "step": 1318 }, { "epoch": 0.2205962286239913, "grad_norm": 6.446091175079346, "learning_rate": 4.781761175924454e-05, "loss": 2.6589, "step": 1319 }, { "epoch": 0.22076347367980934, "grad_norm": 3.843024969100952, "learning_rate": 4.781164374455161e-05, "loss": 2.9634, "step": 1320 }, { "epoch": 0.2209307187356274, "grad_norm": 2.680434226989746, "learning_rate": 4.780566795432281e-05, "loss": 2.699, "step": 1321 }, { "epoch": 0.22109796379144542, "grad_norm": 5.031355857849121, "learning_rate": 4.7799684390595026e-05, "loss": 2.815, "step": 1322 }, { "epoch": 0.22126520884726344, "grad_norm": 3.3839430809020996, "learning_rate": 4.779369305540782e-05, "loss": 3.1433, "step": 1323 }, { "epoch": 0.2214324539030815, "grad_norm": 2.886932849884033, "learning_rate": 4.778769395080338e-05, "loss": 2.697, "step": 1324 }, { "epoch": 0.22159969895889953, "grad_norm": 4.44404935836792, "learning_rate": 4.778168707882654e-05, "loss": 3.3202, "step": 1325 }, { "epoch": 0.22176694401471755, "grad_norm": 4.7765116691589355, "learning_rate": 4.777567244152481e-05, "loss": 2.9467, "step": 1326 }, { "epoch": 0.2219341890705356, "grad_norm": 3.574748992919922, "learning_rate": 4.776965004094831e-05, "loss": 2.6611, "step": 1327 }, { "epoch": 0.22210143412635364, "grad_norm": 2.9955716133117676, "learning_rate": 4.776361987914984e-05, "loss": 3.002, "step": 1328 }, { "epoch": 0.22226867918217166, "grad_norm": 5.1948041915893555, "learning_rate": 4.7757581958184825e-05, "loss": 2.7888, "step": 1329 }, { "epoch": 0.22243592423798972, "grad_norm": 4.402746677398682, "learning_rate": 4.775153628011134e-05, "loss": 3.1607, "step": 1330 }, { "epoch": 0.22260316929380775, "grad_norm": 4.101518154144287, "learning_rate": 4.77454828469901e-05, "loss": 2.7957, "step": 1331 }, { "epoch": 0.2227704143496258, "grad_norm": 3.882535696029663, "learning_rate": 4.7739421660884463e-05, "loss": 2.8072, "step": 1332 }, { "epoch": 0.22293765940544383, "grad_norm": 3.4276325702667236, "learning_rate": 4.773335272386045e-05, "loss": 2.4762, "step": 1333 }, { "epoch": 0.22310490446126185, "grad_norm": 4.3494977951049805, "learning_rate": 4.7727276037986704e-05, "loss": 3.1849, "step": 1334 }, { "epoch": 0.2232721495170799, "grad_norm": 2.5058693885803223, "learning_rate": 4.77211916053345e-05, "loss": 2.7036, "step": 1335 }, { "epoch": 0.22343939457289794, "grad_norm": 6.4419379234313965, "learning_rate": 4.771509942797778e-05, "loss": 2.8917, "step": 1336 }, { "epoch": 0.22360663962871596, "grad_norm": 2.7458102703094482, "learning_rate": 4.77089995079931e-05, "loss": 3.1206, "step": 1337 }, { "epoch": 0.22377388468453402, "grad_norm": 5.263635158538818, "learning_rate": 4.7702891847459685e-05, "loss": 2.92, "step": 1338 }, { "epoch": 0.22394112974035205, "grad_norm": 5.73922061920166, "learning_rate": 4.7696776448459366e-05, "loss": 2.5251, "step": 1339 }, { "epoch": 0.2241083747961701, "grad_norm": 6.380731105804443, "learning_rate": 4.769065331307664e-05, "loss": 2.6309, "step": 1340 }, { "epoch": 0.22427561985198813, "grad_norm": 3.0904195308685303, "learning_rate": 4.7684522443398615e-05, "loss": 2.6869, "step": 1341 }, { "epoch": 0.22444286490780616, "grad_norm": 4.063572406768799, "learning_rate": 4.767838384151505e-05, "loss": 2.9299, "step": 1342 }, { "epoch": 0.2246101099636242, "grad_norm": 6.8489155769348145, "learning_rate": 4.7672237509518346e-05, "loss": 2.9898, "step": 1343 }, { "epoch": 0.22477735501944224, "grad_norm": 4.314727783203125, "learning_rate": 4.766608344950353e-05, "loss": 3.5157, "step": 1344 }, { "epoch": 0.22494460007526026, "grad_norm": 10.099717140197754, "learning_rate": 4.765992166356825e-05, "loss": 2.8234, "step": 1345 }, { "epoch": 0.22511184513107832, "grad_norm": 3.1544950008392334, "learning_rate": 4.765375215381282e-05, "loss": 2.9923, "step": 1346 }, { "epoch": 0.22527909018689635, "grad_norm": 7.672467231750488, "learning_rate": 4.7647574922340156e-05, "loss": 3.3495, "step": 1347 }, { "epoch": 0.22544633524271437, "grad_norm": 2.986626148223877, "learning_rate": 4.764138997125582e-05, "loss": 2.7539, "step": 1348 }, { "epoch": 0.22561358029853243, "grad_norm": 2.3547534942626953, "learning_rate": 4.763519730266801e-05, "loss": 2.6663, "step": 1349 }, { "epoch": 0.22578082535435046, "grad_norm": 3.0261425971984863, "learning_rate": 4.762899691868754e-05, "loss": 2.7693, "step": 1350 }, { "epoch": 0.2259480704101685, "grad_norm": 4.456954002380371, "learning_rate": 4.762278882142787e-05, "loss": 3.0323, "step": 1351 }, { "epoch": 0.22611531546598654, "grad_norm": 3.7083582878112793, "learning_rate": 4.7616573013005066e-05, "loss": 3.0539, "step": 1352 }, { "epoch": 0.22628256052180457, "grad_norm": 3.1790835857391357, "learning_rate": 4.761034949553786e-05, "loss": 2.6977, "step": 1353 }, { "epoch": 0.22644980557762262, "grad_norm": 2.2049880027770996, "learning_rate": 4.7604118271147555e-05, "loss": 2.5453, "step": 1354 }, { "epoch": 0.22661705063344065, "grad_norm": 4.21901798248291, "learning_rate": 4.7597879341958154e-05, "loss": 2.8169, "step": 1355 }, { "epoch": 0.22678429568925867, "grad_norm": 2.5616376399993896, "learning_rate": 4.7591632710096215e-05, "loss": 2.7082, "step": 1356 }, { "epoch": 0.22695154074507673, "grad_norm": 5.84672737121582, "learning_rate": 4.758537837769097e-05, "loss": 3.1239, "step": 1357 }, { "epoch": 0.22711878580089476, "grad_norm": 6.2505879402160645, "learning_rate": 4.7579116346874266e-05, "loss": 2.7666, "step": 1358 }, { "epoch": 0.2272860308567128, "grad_norm": 4.8021392822265625, "learning_rate": 4.7572846619780545e-05, "loss": 3.0126, "step": 1359 }, { "epoch": 0.22745327591253084, "grad_norm": 2.607201337814331, "learning_rate": 4.7566569198546916e-05, "loss": 2.9015, "step": 1360 }, { "epoch": 0.22762052096834887, "grad_norm": 2.6000194549560547, "learning_rate": 4.7560284085313086e-05, "loss": 2.8032, "step": 1361 }, { "epoch": 0.22778776602416692, "grad_norm": 3.158031940460205, "learning_rate": 4.755399128222138e-05, "loss": 2.807, "step": 1362 }, { "epoch": 0.22795501107998495, "grad_norm": 7.71650505065918, "learning_rate": 4.754769079141675e-05, "loss": 2.8653, "step": 1363 }, { "epoch": 0.22812225613580298, "grad_norm": 4.821521759033203, "learning_rate": 4.754138261504678e-05, "loss": 3.2589, "step": 1364 }, { "epoch": 0.22828950119162103, "grad_norm": 2.9269561767578125, "learning_rate": 4.7535066755261656e-05, "loss": 3.0106, "step": 1365 }, { "epoch": 0.22845674624743906, "grad_norm": 4.097954273223877, "learning_rate": 4.7528743214214194e-05, "loss": 2.8944, "step": 1366 }, { "epoch": 0.22862399130325708, "grad_norm": 3.201842784881592, "learning_rate": 4.752241199405982e-05, "loss": 2.9598, "step": 1367 }, { "epoch": 0.22879123635907514, "grad_norm": 2.7583227157592773, "learning_rate": 4.751607309695659e-05, "loss": 2.8482, "step": 1368 }, { "epoch": 0.22895848141489317, "grad_norm": 4.008326053619385, "learning_rate": 4.750972652506517e-05, "loss": 2.8026, "step": 1369 }, { "epoch": 0.22912572647071122, "grad_norm": 3.7473604679107666, "learning_rate": 4.750337228054883e-05, "loss": 2.8601, "step": 1370 }, { "epoch": 0.22929297152652925, "grad_norm": 6.09278678894043, "learning_rate": 4.749701036557347e-05, "loss": 2.8238, "step": 1371 }, { "epoch": 0.22946021658234728, "grad_norm": 2.605675220489502, "learning_rate": 4.749064078230761e-05, "loss": 2.6229, "step": 1372 }, { "epoch": 0.22962746163816533, "grad_norm": 5.108896732330322, "learning_rate": 4.748426353292236e-05, "loss": 2.4593, "step": 1373 }, { "epoch": 0.22979470669398336, "grad_norm": 4.8447585105896, "learning_rate": 4.747787861959148e-05, "loss": 2.4502, "step": 1374 }, { "epoch": 0.22996195174980139, "grad_norm": 3.478970527648926, "learning_rate": 4.747148604449129e-05, "loss": 2.5885, "step": 1375 }, { "epoch": 0.23012919680561944, "grad_norm": 8.391753196716309, "learning_rate": 4.746508580980078e-05, "loss": 2.7168, "step": 1376 }, { "epoch": 0.23029644186143747, "grad_norm": 3.726053237915039, "learning_rate": 4.74586779177015e-05, "loss": 2.9802, "step": 1377 }, { "epoch": 0.23046368691725552, "grad_norm": 3.9546852111816406, "learning_rate": 4.7452262370377654e-05, "loss": 2.9462, "step": 1378 }, { "epoch": 0.23063093197307355, "grad_norm": 4.412408351898193, "learning_rate": 4.744583917001602e-05, "loss": 2.9898, "step": 1379 }, { "epoch": 0.23079817702889158, "grad_norm": 5.159209728240967, "learning_rate": 4.7439408318806e-05, "loss": 2.6404, "step": 1380 }, { "epoch": 0.23096542208470963, "grad_norm": 3.884101390838623, "learning_rate": 4.74329698189396e-05, "loss": 2.6959, "step": 1381 }, { "epoch": 0.23113266714052766, "grad_norm": 2.352579116821289, "learning_rate": 4.742652367261144e-05, "loss": 2.7953, "step": 1382 }, { "epoch": 0.23129991219634569, "grad_norm": 3.2354469299316406, "learning_rate": 4.7420069882018734e-05, "loss": 2.8939, "step": 1383 }, { "epoch": 0.23146715725216374, "grad_norm": 2.7716362476348877, "learning_rate": 4.741360844936134e-05, "loss": 2.8125, "step": 1384 }, { "epoch": 0.23163440230798177, "grad_norm": 3.4270524978637695, "learning_rate": 4.740713937684164e-05, "loss": 2.5676, "step": 1385 }, { "epoch": 0.2318016473637998, "grad_norm": 2.8584582805633545, "learning_rate": 4.7400662666664706e-05, "loss": 2.9177, "step": 1386 }, { "epoch": 0.23196889241961785, "grad_norm": 3.7548797130584717, "learning_rate": 4.739417832103817e-05, "loss": 2.8363, "step": 1387 }, { "epoch": 0.23213613747543588, "grad_norm": 2.22090482711792, "learning_rate": 4.738768634217228e-05, "loss": 2.5782, "step": 1388 }, { "epoch": 0.23230338253125393, "grad_norm": 3.7434661388397217, "learning_rate": 4.738118673227987e-05, "loss": 2.8135, "step": 1389 }, { "epoch": 0.23247062758707196, "grad_norm": 3.225525140762329, "learning_rate": 4.737467949357639e-05, "loss": 2.726, "step": 1390 }, { "epoch": 0.23263787264289, "grad_norm": 7.905787944793701, "learning_rate": 4.7368164628279885e-05, "loss": 3.3707, "step": 1391 }, { "epoch": 0.23280511769870804, "grad_norm": 3.750622034072876, "learning_rate": 4.7361642138611e-05, "loss": 2.9898, "step": 1392 }, { "epoch": 0.23297236275452607, "grad_norm": 1.9234082698822021, "learning_rate": 4.735511202679297e-05, "loss": 2.5642, "step": 1393 }, { "epoch": 0.2331396078103441, "grad_norm": 6.263643264770508, "learning_rate": 4.7348574295051654e-05, "loss": 2.614, "step": 1394 }, { "epoch": 0.23330685286616215, "grad_norm": 2.931565046310425, "learning_rate": 4.734202894561548e-05, "loss": 2.7859, "step": 1395 }, { "epoch": 0.23347409792198018, "grad_norm": 3.030764102935791, "learning_rate": 4.7335475980715495e-05, "loss": 2.7836, "step": 1396 }, { "epoch": 0.2336413429777982, "grad_norm": 1.9930671453475952, "learning_rate": 4.7328915402585315e-05, "loss": 2.5379, "step": 1397 }, { "epoch": 0.23380858803361626, "grad_norm": 2.4780094623565674, "learning_rate": 4.732234721346118e-05, "loss": 2.3616, "step": 1398 }, { "epoch": 0.2339758330894343, "grad_norm": 1.9325222969055176, "learning_rate": 4.731577141558191e-05, "loss": 2.495, "step": 1399 }, { "epoch": 0.23414307814525234, "grad_norm": 3.1302952766418457, "learning_rate": 4.730918801118891e-05, "loss": 2.9474, "step": 1400 }, { "epoch": 0.23431032320107037, "grad_norm": 6.1568403244018555, "learning_rate": 4.730259700252619e-05, "loss": 3.4605, "step": 1401 }, { "epoch": 0.2344775682568884, "grad_norm": 3.7865984439849854, "learning_rate": 4.7295998391840366e-05, "loss": 2.3905, "step": 1402 }, { "epoch": 0.23464481331270645, "grad_norm": 4.458315372467041, "learning_rate": 4.7289392181380606e-05, "loss": 2.8728, "step": 1403 }, { "epoch": 0.23481205836852448, "grad_norm": 6.057563781738281, "learning_rate": 4.72827783733987e-05, "loss": 3.2662, "step": 1404 }, { "epoch": 0.2349793034243425, "grad_norm": 10.371691703796387, "learning_rate": 4.727615697014902e-05, "loss": 4.0495, "step": 1405 }, { "epoch": 0.23514654848016056, "grad_norm": 2.5648179054260254, "learning_rate": 4.726952797388853e-05, "loss": 2.7842, "step": 1406 }, { "epoch": 0.2353137935359786, "grad_norm": 3.4823853969573975, "learning_rate": 4.726289138687677e-05, "loss": 2.6477, "step": 1407 }, { "epoch": 0.23548103859179664, "grad_norm": 4.810610294342041, "learning_rate": 4.725624721137588e-05, "loss": 2.8718, "step": 1408 }, { "epoch": 0.23564828364761467, "grad_norm": 3.2826921939849854, "learning_rate": 4.724959544965057e-05, "loss": 2.7867, "step": 1409 }, { "epoch": 0.2358155287034327, "grad_norm": 3.4068996906280518, "learning_rate": 4.7242936103968164e-05, "loss": 2.6298, "step": 1410 }, { "epoch": 0.23598277375925075, "grad_norm": 7.592313766479492, "learning_rate": 4.7236269176598545e-05, "loss": 2.9179, "step": 1411 }, { "epoch": 0.23615001881506878, "grad_norm": 4.622605323791504, "learning_rate": 4.7229594669814196e-05, "loss": 2.5706, "step": 1412 }, { "epoch": 0.2363172638708868, "grad_norm": 2.562312602996826, "learning_rate": 4.722291258589018e-05, "loss": 2.494, "step": 1413 }, { "epoch": 0.23648450892670486, "grad_norm": 2.495147228240967, "learning_rate": 4.721622292710412e-05, "loss": 2.6938, "step": 1414 }, { "epoch": 0.2366517539825229, "grad_norm": 2.5730926990509033, "learning_rate": 4.720952569573626e-05, "loss": 2.8575, "step": 1415 }, { "epoch": 0.23681899903834092, "grad_norm": 4.760297775268555, "learning_rate": 4.720282089406941e-05, "loss": 3.1813, "step": 1416 }, { "epoch": 0.23698624409415897, "grad_norm": 3.775949001312256, "learning_rate": 4.719610852438894e-05, "loss": 3.0046, "step": 1417 }, { "epoch": 0.237153489149977, "grad_norm": 3.327467918395996, "learning_rate": 4.718938858898282e-05, "loss": 2.6998, "step": 1418 }, { "epoch": 0.23732073420579505, "grad_norm": 5.934675693511963, "learning_rate": 4.71826610901416e-05, "loss": 2.7103, "step": 1419 }, { "epoch": 0.23748797926161308, "grad_norm": 5.932196617126465, "learning_rate": 4.717592603015841e-05, "loss": 2.7408, "step": 1420 }, { "epoch": 0.2376552243174311, "grad_norm": 4.816493034362793, "learning_rate": 4.7169183411328946e-05, "loss": 2.8811, "step": 1421 }, { "epoch": 0.23782246937324916, "grad_norm": 2.7515347003936768, "learning_rate": 4.716243323595147e-05, "loss": 3.2718, "step": 1422 }, { "epoch": 0.2379897144290672, "grad_norm": 3.327047348022461, "learning_rate": 4.7155675506326844e-05, "loss": 3.01, "step": 1423 }, { "epoch": 0.23815695948488522, "grad_norm": 3.2193548679351807, "learning_rate": 4.714891022475851e-05, "loss": 2.8055, "step": 1424 }, { "epoch": 0.23832420454070327, "grad_norm": 3.2266218662261963, "learning_rate": 4.714213739355246e-05, "loss": 2.4122, "step": 1425 }, { "epoch": 0.2384914495965213, "grad_norm": 5.634162902832031, "learning_rate": 4.7135357015017256e-05, "loss": 2.7432, "step": 1426 }, { "epoch": 0.23865869465233935, "grad_norm": 4.2997918128967285, "learning_rate": 4.712856909146406e-05, "loss": 2.7144, "step": 1427 }, { "epoch": 0.23882593970815738, "grad_norm": 4.164741039276123, "learning_rate": 4.712177362520659e-05, "loss": 3.1666, "step": 1428 }, { "epoch": 0.2389931847639754, "grad_norm": 3.483776330947876, "learning_rate": 4.711497061856113e-05, "loss": 2.8745, "step": 1429 }, { "epoch": 0.23916042981979346, "grad_norm": 3.967118740081787, "learning_rate": 4.710816007384655e-05, "loss": 2.8306, "step": 1430 }, { "epoch": 0.2393276748756115, "grad_norm": 2.282618284225464, "learning_rate": 4.710134199338428e-05, "loss": 3.1865, "step": 1431 }, { "epoch": 0.23949491993142952, "grad_norm": 4.7010297775268555, "learning_rate": 4.70945163794983e-05, "loss": 3.0011, "step": 1432 }, { "epoch": 0.23966216498724757, "grad_norm": 3.1358373165130615, "learning_rate": 4.7087683234515186e-05, "loss": 2.8819, "step": 1433 }, { "epoch": 0.2398294100430656, "grad_norm": 5.0080366134643555, "learning_rate": 4.7080842560764095e-05, "loss": 2.9308, "step": 1434 }, { "epoch": 0.23999665509888363, "grad_norm": 5.142736911773682, "learning_rate": 4.7073994360576695e-05, "loss": 2.7388, "step": 1435 }, { "epoch": 0.24016390015470168, "grad_norm": 3.2495651245117188, "learning_rate": 4.7067138636287266e-05, "loss": 2.8198, "step": 1436 }, { "epoch": 0.2403311452105197, "grad_norm": 4.757367134094238, "learning_rate": 4.706027539023263e-05, "loss": 2.6469, "step": 1437 }, { "epoch": 0.24049839026633776, "grad_norm": 3.662146806716919, "learning_rate": 4.705340462475219e-05, "loss": 2.7914, "step": 1438 }, { "epoch": 0.2406656353221558, "grad_norm": 6.000127792358398, "learning_rate": 4.704652634218791e-05, "loss": 2.8567, "step": 1439 }, { "epoch": 0.24083288037797382, "grad_norm": 3.2285749912261963, "learning_rate": 4.7039640544884286e-05, "loss": 2.2736, "step": 1440 }, { "epoch": 0.24100012543379187, "grad_norm": 3.079961061477661, "learning_rate": 4.703274723518841e-05, "loss": 2.5604, "step": 1441 }, { "epoch": 0.2411673704896099, "grad_norm": 4.766537189483643, "learning_rate": 4.7025846415449934e-05, "loss": 3.178, "step": 1442 }, { "epoch": 0.24133461554542793, "grad_norm": 11.248169898986816, "learning_rate": 4.701893808802104e-05, "loss": 3.0213, "step": 1443 }, { "epoch": 0.24150186060124598, "grad_norm": 3.714665412902832, "learning_rate": 4.701202225525649e-05, "loss": 2.4892, "step": 1444 }, { "epoch": 0.241669105657064, "grad_norm": 3.7647199630737305, "learning_rate": 4.700509891951362e-05, "loss": 3.1761, "step": 1445 }, { "epoch": 0.24183635071288204, "grad_norm": 3.406520366668701, "learning_rate": 4.6998168083152295e-05, "loss": 2.7982, "step": 1446 }, { "epoch": 0.2420035957687001, "grad_norm": 3.880436658859253, "learning_rate": 4.699122974853494e-05, "loss": 2.8151, "step": 1447 }, { "epoch": 0.24217084082451812, "grad_norm": 2.395223617553711, "learning_rate": 4.698428391802655e-05, "loss": 2.7048, "step": 1448 }, { "epoch": 0.24233808588033617, "grad_norm": 2.79514741897583, "learning_rate": 4.697733059399467e-05, "loss": 2.4486, "step": 1449 }, { "epoch": 0.2425053309361542, "grad_norm": 5.664627552032471, "learning_rate": 4.6970369778809406e-05, "loss": 2.969, "step": 1450 }, { "epoch": 0.24267257599197223, "grad_norm": 4.3186116218566895, "learning_rate": 4.6963401474843385e-05, "loss": 2.6255, "step": 1451 }, { "epoch": 0.24283982104779028, "grad_norm": 8.207850456237793, "learning_rate": 4.695642568447184e-05, "loss": 3.0097, "step": 1452 }, { "epoch": 0.2430070661036083, "grad_norm": 3.1803245544433594, "learning_rate": 4.69494424100725e-05, "loss": 2.9523, "step": 1453 }, { "epoch": 0.24317431115942634, "grad_norm": 3.585695505142212, "learning_rate": 4.694245165402568e-05, "loss": 2.594, "step": 1454 }, { "epoch": 0.2433415562152444, "grad_norm": 8.791074752807617, "learning_rate": 4.693545341871425e-05, "loss": 4.0905, "step": 1455 }, { "epoch": 0.24350880127106242, "grad_norm": 4.297835826873779, "learning_rate": 4.692844770652359e-05, "loss": 2.613, "step": 1456 }, { "epoch": 0.24367604632688047, "grad_norm": 3.612672805786133, "learning_rate": 4.692143451984168e-05, "loss": 2.8804, "step": 1457 }, { "epoch": 0.2438432913826985, "grad_norm": 4.394129276275635, "learning_rate": 4.6914413861059015e-05, "loss": 3.1466, "step": 1458 }, { "epoch": 0.24401053643851653, "grad_norm": 4.151269435882568, "learning_rate": 4.690738573256863e-05, "loss": 2.8128, "step": 1459 }, { "epoch": 0.24417778149433458, "grad_norm": 5.381654739379883, "learning_rate": 4.690035013676613e-05, "loss": 3.0343, "step": 1460 }, { "epoch": 0.2443450265501526, "grad_norm": 3.429797410964966, "learning_rate": 4.689330707604967e-05, "loss": 3.0687, "step": 1461 }, { "epoch": 0.24451227160597064, "grad_norm": 2.728212594985962, "learning_rate": 4.6886256552819914e-05, "loss": 2.499, "step": 1462 }, { "epoch": 0.2446795166617887, "grad_norm": 4.018295764923096, "learning_rate": 4.6879198569480096e-05, "loss": 2.8568, "step": 1463 }, { "epoch": 0.24484676171760672, "grad_norm": 5.859686374664307, "learning_rate": 4.687213312843599e-05, "loss": 3.1022, "step": 1464 }, { "epoch": 0.24501400677342475, "grad_norm": 2.8669190406799316, "learning_rate": 4.6865060232095915e-05, "loss": 2.6652, "step": 1465 }, { "epoch": 0.2451812518292428, "grad_norm": 4.128539562225342, "learning_rate": 4.6857979882870715e-05, "loss": 2.8055, "step": 1466 }, { "epoch": 0.24534849688506083, "grad_norm": 4.430007457733154, "learning_rate": 4.685089208317379e-05, "loss": 3.1603, "step": 1467 }, { "epoch": 0.24551574194087888, "grad_norm": 3.987570285797119, "learning_rate": 4.684379683542107e-05, "loss": 2.9953, "step": 1468 }, { "epoch": 0.2456829869966969, "grad_norm": 6.960021018981934, "learning_rate": 4.683669414203105e-05, "loss": 2.8692, "step": 1469 }, { "epoch": 0.24585023205251494, "grad_norm": 3.115567684173584, "learning_rate": 4.682958400542471e-05, "loss": 2.8359, "step": 1470 }, { "epoch": 0.246017477108333, "grad_norm": 2.6869804859161377, "learning_rate": 4.682246642802561e-05, "loss": 2.8831, "step": 1471 }, { "epoch": 0.24618472216415102, "grad_norm": 3.082726240158081, "learning_rate": 4.6815341412259844e-05, "loss": 2.7689, "step": 1472 }, { "epoch": 0.24635196721996905, "grad_norm": 4.457955837249756, "learning_rate": 4.680820896055602e-05, "loss": 2.6712, "step": 1473 }, { "epoch": 0.2465192122757871, "grad_norm": 4.530313491821289, "learning_rate": 4.6801069075345305e-05, "loss": 2.7208, "step": 1474 }, { "epoch": 0.24668645733160513, "grad_norm": 2.715955972671509, "learning_rate": 4.6793921759061376e-05, "loss": 2.7506, "step": 1475 }, { "epoch": 0.24685370238742319, "grad_norm": 3.935241937637329, "learning_rate": 4.678676701414046e-05, "loss": 2.8116, "step": 1476 }, { "epoch": 0.2470209474432412, "grad_norm": 3.953498363494873, "learning_rate": 4.677960484302132e-05, "loss": 2.4931, "step": 1477 }, { "epoch": 0.24718819249905924, "grad_norm": 4.801958084106445, "learning_rate": 4.6772435248145216e-05, "loss": 2.7562, "step": 1478 }, { "epoch": 0.2473554375548773, "grad_norm": 1.881885290145874, "learning_rate": 4.676525823195598e-05, "loss": 2.4489, "step": 1479 }, { "epoch": 0.24752268261069532, "grad_norm": 2.743849754333496, "learning_rate": 4.675807379689996e-05, "loss": 2.3923, "step": 1480 }, { "epoch": 0.24768992766651335, "grad_norm": 6.483356475830078, "learning_rate": 4.675088194542602e-05, "loss": 3.2644, "step": 1481 }, { "epoch": 0.2478571727223314, "grad_norm": 3.4948782920837402, "learning_rate": 4.674368267998556e-05, "loss": 3.1795, "step": 1482 }, { "epoch": 0.24802441777814943, "grad_norm": 3.328714609146118, "learning_rate": 4.6736476003032524e-05, "loss": 3.0757, "step": 1483 }, { "epoch": 0.24819166283396746, "grad_norm": 5.473494529724121, "learning_rate": 4.672926191702335e-05, "loss": 2.8088, "step": 1484 }, { "epoch": 0.2483589078897855, "grad_norm": 4.431870460510254, "learning_rate": 4.6722040424417026e-05, "loss": 2.8601, "step": 1485 }, { "epoch": 0.24852615294560354, "grad_norm": 3.8766846656799316, "learning_rate": 4.6714811527675064e-05, "loss": 3.2691, "step": 1486 }, { "epoch": 0.2486933980014216, "grad_norm": 3.6281983852386475, "learning_rate": 4.670757522926148e-05, "loss": 2.8265, "step": 1487 }, { "epoch": 0.24886064305723962, "grad_norm": 5.111774921417236, "learning_rate": 4.670033153164283e-05, "loss": 3.162, "step": 1488 }, { "epoch": 0.24902788811305765, "grad_norm": 4.171443939208984, "learning_rate": 4.6693080437288186e-05, "loss": 2.8809, "step": 1489 }, { "epoch": 0.2491951331688757, "grad_norm": 3.4638099670410156, "learning_rate": 4.668582194866915e-05, "loss": 2.8644, "step": 1490 }, { "epoch": 0.24936237822469373, "grad_norm": 3.8958282470703125, "learning_rate": 4.667855606825983e-05, "loss": 3.064, "step": 1491 }, { "epoch": 0.24952962328051176, "grad_norm": 8.172293663024902, "learning_rate": 4.667128279853686e-05, "loss": 3.3597, "step": 1492 }, { "epoch": 0.2496968683363298, "grad_norm": 8.520477294921875, "learning_rate": 4.666400214197941e-05, "loss": 2.7687, "step": 1493 }, { "epoch": 0.24986411339214784, "grad_norm": 3.096466302871704, "learning_rate": 4.665671410106913e-05, "loss": 2.7733, "step": 1494 }, { "epoch": 0.25003135844796587, "grad_norm": 3.2835075855255127, "learning_rate": 4.664941867829022e-05, "loss": 3.2001, "step": 1495 }, { "epoch": 0.2501986035037839, "grad_norm": 3.4850070476531982, "learning_rate": 4.6642115876129376e-05, "loss": 3.2554, "step": 1496 }, { "epoch": 0.250365848559602, "grad_norm": 3.8269400596618652, "learning_rate": 4.663480569707584e-05, "loss": 3.0794, "step": 1497 }, { "epoch": 0.25053309361542, "grad_norm": 8.346485137939453, "learning_rate": 4.6627488143621315e-05, "loss": 3.2379, "step": 1498 }, { "epoch": 0.25070033867123803, "grad_norm": 5.316501617431641, "learning_rate": 4.662016321826007e-05, "loss": 3.0679, "step": 1499 }, { "epoch": 0.2508675837270561, "grad_norm": 5.615002632141113, "learning_rate": 4.661283092348886e-05, "loss": 3.0818, "step": 1500 }, { "epoch": 0.2510348287828741, "grad_norm": 5.790916919708252, "learning_rate": 4.660549126180696e-05, "loss": 3.3186, "step": 1501 }, { "epoch": 0.25120207383869214, "grad_norm": 2.7448720932006836, "learning_rate": 4.6598144235716154e-05, "loss": 2.3657, "step": 1502 }, { "epoch": 0.2513693188945102, "grad_norm": 3.2495720386505127, "learning_rate": 4.659078984772073e-05, "loss": 3.1012, "step": 1503 }, { "epoch": 0.2515365639503282, "grad_norm": 9.477670669555664, "learning_rate": 4.65834281003275e-05, "loss": 4.2343, "step": 1504 }, { "epoch": 0.25170380900614625, "grad_norm": 3.625392198562622, "learning_rate": 4.657605899604577e-05, "loss": 3.0496, "step": 1505 }, { "epoch": 0.2518710540619643, "grad_norm": 3.9850802421569824, "learning_rate": 4.6568682537387366e-05, "loss": 3.082, "step": 1506 }, { "epoch": 0.2520382991177823, "grad_norm": 3.6818947792053223, "learning_rate": 4.65612987268666e-05, "loss": 3.0214, "step": 1507 }, { "epoch": 0.25220554417360036, "grad_norm": 5.050662517547607, "learning_rate": 4.655390756700032e-05, "loss": 2.8627, "step": 1508 }, { "epoch": 0.2523727892294184, "grad_norm": 3.276740074157715, "learning_rate": 4.654650906030786e-05, "loss": 2.8364, "step": 1509 }, { "epoch": 0.25254003428523647, "grad_norm": 3.8625450134277344, "learning_rate": 4.653910320931106e-05, "loss": 3.096, "step": 1510 }, { "epoch": 0.25270727934105447, "grad_norm": 10.044471740722656, "learning_rate": 4.6531690016534266e-05, "loss": 3.0066, "step": 1511 }, { "epoch": 0.2528745243968725, "grad_norm": 3.8051626682281494, "learning_rate": 4.6524269484504316e-05, "loss": 3.0854, "step": 1512 }, { "epoch": 0.2530417694526906, "grad_norm": 6.7563371658325195, "learning_rate": 4.6516841615750574e-05, "loss": 3.5775, "step": 1513 }, { "epoch": 0.2532090145085086, "grad_norm": 7.161057949066162, "learning_rate": 4.6509406412804883e-05, "loss": 3.0292, "step": 1514 }, { "epoch": 0.25337625956432663, "grad_norm": 3.734959125518799, "learning_rate": 4.650196387820159e-05, "loss": 2.8428, "step": 1515 }, { "epoch": 0.2535435046201447, "grad_norm": 4.95436429977417, "learning_rate": 4.649451401447755e-05, "loss": 2.9079, "step": 1516 }, { "epoch": 0.2537107496759627, "grad_norm": 6.73279333114624, "learning_rate": 4.648705682417211e-05, "loss": 2.4921, "step": 1517 }, { "epoch": 0.25387799473178074, "grad_norm": 4.114083290100098, "learning_rate": 4.647959230982711e-05, "loss": 3.0624, "step": 1518 }, { "epoch": 0.2540452397875988, "grad_norm": 2.8958327770233154, "learning_rate": 4.647212047398689e-05, "loss": 2.821, "step": 1519 }, { "epoch": 0.2542124848434168, "grad_norm": 6.306778907775879, "learning_rate": 4.646464131919829e-05, "loss": 3.4738, "step": 1520 }, { "epoch": 0.25437972989923485, "grad_norm": 6.24006462097168, "learning_rate": 4.645715484801064e-05, "loss": 2.9175, "step": 1521 }, { "epoch": 0.2545469749550529, "grad_norm": 2.54970383644104, "learning_rate": 4.644966106297577e-05, "loss": 2.787, "step": 1522 }, { "epoch": 0.2547142200108709, "grad_norm": 2.8381125926971436, "learning_rate": 4.6442159966647994e-05, "loss": 2.799, "step": 1523 }, { "epoch": 0.25488146506668896, "grad_norm": 3.45841121673584, "learning_rate": 4.643465156158412e-05, "loss": 3.0874, "step": 1524 }, { "epoch": 0.255048710122507, "grad_norm": 3.7562544345855713, "learning_rate": 4.642713585034345e-05, "loss": 2.8078, "step": 1525 }, { "epoch": 0.255215955178325, "grad_norm": 4.927243709564209, "learning_rate": 4.641961283548778e-05, "loss": 2.7479, "step": 1526 }, { "epoch": 0.25538320023414307, "grad_norm": 4.757547378540039, "learning_rate": 4.641208251958139e-05, "loss": 2.8616, "step": 1527 }, { "epoch": 0.2555504452899611, "grad_norm": 5.402925491333008, "learning_rate": 4.6404544905191035e-05, "loss": 3.2773, "step": 1528 }, { "epoch": 0.2557176903457792, "grad_norm": 5.630742073059082, "learning_rate": 4.6396999994886004e-05, "loss": 3.3361, "step": 1529 }, { "epoch": 0.2558849354015972, "grad_norm": 3.4615743160247803, "learning_rate": 4.6389447791238015e-05, "loss": 2.9256, "step": 1530 }, { "epoch": 0.25605218045741523, "grad_norm": 3.2134029865264893, "learning_rate": 4.638188829682131e-05, "loss": 2.6949, "step": 1531 }, { "epoch": 0.2562194255132333, "grad_norm": 3.9513509273529053, "learning_rate": 4.6374321514212606e-05, "loss": 3.0155, "step": 1532 }, { "epoch": 0.2563866705690513, "grad_norm": 2.8825387954711914, "learning_rate": 4.636674744599109e-05, "loss": 2.7442, "step": 1533 }, { "epoch": 0.25655391562486934, "grad_norm": 3.763817071914673, "learning_rate": 4.635916609473846e-05, "loss": 2.8379, "step": 1534 }, { "epoch": 0.2567211606806874, "grad_norm": 4.191507816314697, "learning_rate": 4.635157746303887e-05, "loss": 3.1353, "step": 1535 }, { "epoch": 0.2568884057365054, "grad_norm": 4.289510726928711, "learning_rate": 4.634398155347898e-05, "loss": 3.5333, "step": 1536 }, { "epoch": 0.25705565079232345, "grad_norm": 3.133981704711914, "learning_rate": 4.633637836864791e-05, "loss": 2.9713, "step": 1537 }, { "epoch": 0.2572228958481415, "grad_norm": 5.43375825881958, "learning_rate": 4.6328767911137274e-05, "loss": 2.6729, "step": 1538 }, { "epoch": 0.2573901409039595, "grad_norm": 8.262992858886719, "learning_rate": 4.632115018354114e-05, "loss": 2.8905, "step": 1539 }, { "epoch": 0.25755738595977756, "grad_norm": 4.353831768035889, "learning_rate": 4.6313525188456094e-05, "loss": 2.7983, "step": 1540 }, { "epoch": 0.2577246310155956, "grad_norm": 4.9411540031433105, "learning_rate": 4.6305892928481174e-05, "loss": 2.975, "step": 1541 }, { "epoch": 0.2578918760714136, "grad_norm": 5.094625949859619, "learning_rate": 4.6298253406217894e-05, "loss": 3.3055, "step": 1542 }, { "epoch": 0.25805912112723167, "grad_norm": 3.1445024013519287, "learning_rate": 4.629060662427026e-05, "loss": 3.1006, "step": 1543 }, { "epoch": 0.2582263661830497, "grad_norm": 2.655468702316284, "learning_rate": 4.6282952585244715e-05, "loss": 2.9018, "step": 1544 }, { "epoch": 0.2583936112388677, "grad_norm": 2.676879405975342, "learning_rate": 4.627529129175022e-05, "loss": 2.8975, "step": 1545 }, { "epoch": 0.2585608562946858, "grad_norm": 4.059142112731934, "learning_rate": 4.626762274639819e-05, "loss": 2.8803, "step": 1546 }, { "epoch": 0.25872810135050384, "grad_norm": 2.704961061477661, "learning_rate": 4.62599469518025e-05, "loss": 2.6579, "step": 1547 }, { "epoch": 0.25889534640632184, "grad_norm": 7.2624831199646, "learning_rate": 4.625226391057952e-05, "loss": 2.8914, "step": 1548 }, { "epoch": 0.2590625914621399, "grad_norm": 5.939176082611084, "learning_rate": 4.6244573625348074e-05, "loss": 3.0671, "step": 1549 }, { "epoch": 0.25922983651795795, "grad_norm": 2.654885768890381, "learning_rate": 4.6236876098729454e-05, "loss": 3.1937, "step": 1550 }, { "epoch": 0.259397081573776, "grad_norm": 4.454501628875732, "learning_rate": 4.622917133334743e-05, "loss": 3.1086, "step": 1551 }, { "epoch": 0.259564326629594, "grad_norm": 3.001354455947876, "learning_rate": 4.622145933182823e-05, "loss": 2.4238, "step": 1552 }, { "epoch": 0.25973157168541205, "grad_norm": 5.604628086090088, "learning_rate": 4.6213740096800555e-05, "loss": 2.975, "step": 1553 }, { "epoch": 0.2598988167412301, "grad_norm": 4.262529373168945, "learning_rate": 4.620601363089557e-05, "loss": 2.9829, "step": 1554 }, { "epoch": 0.2600660617970481, "grad_norm": 2.7250945568084717, "learning_rate": 4.619827993674691e-05, "loss": 2.7172, "step": 1555 }, { "epoch": 0.26023330685286616, "grad_norm": 3.0844898223876953, "learning_rate": 4.619053901699066e-05, "loss": 2.5804, "step": 1556 }, { "epoch": 0.2604005519086842, "grad_norm": 4.048311710357666, "learning_rate": 4.618279087426538e-05, "loss": 3.0449, "step": 1557 }, { "epoch": 0.2605677969645022, "grad_norm": 2.944145917892456, "learning_rate": 4.6175035511212085e-05, "loss": 2.6716, "step": 1558 }, { "epoch": 0.2607350420203203, "grad_norm": 3.2839536666870117, "learning_rate": 4.616727293047425e-05, "loss": 3.0096, "step": 1559 }, { "epoch": 0.26090228707613833, "grad_norm": 4.3839006423950195, "learning_rate": 4.615950313469782e-05, "loss": 2.8624, "step": 1560 }, { "epoch": 0.26106953213195633, "grad_norm": 2.6538209915161133, "learning_rate": 4.61517261265312e-05, "loss": 2.5754, "step": 1561 }, { "epoch": 0.2612367771877744, "grad_norm": 4.189825057983398, "learning_rate": 4.614394190862524e-05, "loss": 3.1287, "step": 1562 }, { "epoch": 0.26140402224359244, "grad_norm": 4.650110721588135, "learning_rate": 4.613615048363326e-05, "loss": 2.8604, "step": 1563 }, { "epoch": 0.26157126729941044, "grad_norm": 6.388495445251465, "learning_rate": 4.612835185421102e-05, "loss": 3.0723, "step": 1564 }, { "epoch": 0.2617385123552285, "grad_norm": 4.920874118804932, "learning_rate": 4.612054602301675e-05, "loss": 2.5962, "step": 1565 }, { "epoch": 0.26190575741104655, "grad_norm": 3.057264804840088, "learning_rate": 4.611273299271114e-05, "loss": 2.8305, "step": 1566 }, { "epoch": 0.26207300246686455, "grad_norm": 12.598405838012695, "learning_rate": 4.610491276595732e-05, "loss": 3.6868, "step": 1567 }, { "epoch": 0.2622402475226826, "grad_norm": 4.819662570953369, "learning_rate": 4.609708534542088e-05, "loss": 3.1022, "step": 1568 }, { "epoch": 0.26240749257850066, "grad_norm": 6.384423732757568, "learning_rate": 4.608925073376985e-05, "loss": 2.9828, "step": 1569 }, { "epoch": 0.2625747376343187, "grad_norm": 3.4974374771118164, "learning_rate": 4.608140893367474e-05, "loss": 2.6454, "step": 1570 }, { "epoch": 0.2627419826901367, "grad_norm": 4.071967601776123, "learning_rate": 4.6073559947808475e-05, "loss": 2.9092, "step": 1571 }, { "epoch": 0.26290922774595477, "grad_norm": 2.746671199798584, "learning_rate": 4.606570377884646e-05, "loss": 2.5061, "step": 1572 }, { "epoch": 0.2630764728017728, "grad_norm": 2.249537229537964, "learning_rate": 4.605784042946653e-05, "loss": 2.7322, "step": 1573 }, { "epoch": 0.2632437178575908, "grad_norm": 6.822818279266357, "learning_rate": 4.6049969902348966e-05, "loss": 3.0198, "step": 1574 }, { "epoch": 0.2634109629134089, "grad_norm": 4.669422149658203, "learning_rate": 4.604209220017651e-05, "loss": 2.6555, "step": 1575 }, { "epoch": 0.26357820796922693, "grad_norm": 3.6618762016296387, "learning_rate": 4.6034207325634335e-05, "loss": 2.9063, "step": 1576 }, { "epoch": 0.26374545302504493, "grad_norm": 3.1933364868164062, "learning_rate": 4.6026315281410066e-05, "loss": 3.0011, "step": 1577 }, { "epoch": 0.263912698080863, "grad_norm": 6.558648586273193, "learning_rate": 4.601841607019377e-05, "loss": 3.093, "step": 1578 }, { "epoch": 0.26407994313668104, "grad_norm": 11.353188514709473, "learning_rate": 4.6010509694677965e-05, "loss": 3.2225, "step": 1579 }, { "epoch": 0.26424718819249904, "grad_norm": 5.470324993133545, "learning_rate": 4.60025961575576e-05, "loss": 3.2135, "step": 1580 }, { "epoch": 0.2644144332483171, "grad_norm": 5.5783514976501465, "learning_rate": 4.599467546153007e-05, "loss": 2.6737, "step": 1581 }, { "epoch": 0.26458167830413515, "grad_norm": 3.5553760528564453, "learning_rate": 4.5986747609295214e-05, "loss": 2.803, "step": 1582 }, { "epoch": 0.26474892335995315, "grad_norm": 3.813628673553467, "learning_rate": 4.5978812603555285e-05, "loss": 2.8712, "step": 1583 }, { "epoch": 0.2649161684157712, "grad_norm": 10.437348365783691, "learning_rate": 4.597087044701502e-05, "loss": 2.4647, "step": 1584 }, { "epoch": 0.26508341347158926, "grad_norm": 16.289325714111328, "learning_rate": 4.5962921142381553e-05, "loss": 3.4766, "step": 1585 }, { "epoch": 0.26525065852740726, "grad_norm": 3.245065927505493, "learning_rate": 4.595496469236448e-05, "loss": 2.9111, "step": 1586 }, { "epoch": 0.2654179035832253, "grad_norm": 5.7753400802612305, "learning_rate": 4.59470010996758e-05, "loss": 3.0323, "step": 1587 }, { "epoch": 0.26558514863904337, "grad_norm": 9.724771499633789, "learning_rate": 4.593903036703e-05, "loss": 3.2866, "step": 1588 }, { "epoch": 0.2657523936948614, "grad_norm": 4.5186944007873535, "learning_rate": 4.593105249714396e-05, "loss": 3.2105, "step": 1589 }, { "epoch": 0.2659196387506794, "grad_norm": 4.638010025024414, "learning_rate": 4.592306749273698e-05, "loss": 3.2052, "step": 1590 }, { "epoch": 0.2660868838064975, "grad_norm": 4.369907379150391, "learning_rate": 4.591507535653085e-05, "loss": 3.0173, "step": 1591 }, { "epoch": 0.26625412886231553, "grad_norm": 4.735449314117432, "learning_rate": 4.5907076091249715e-05, "loss": 3.0948, "step": 1592 }, { "epoch": 0.26642137391813353, "grad_norm": 4.828398704528809, "learning_rate": 4.589906969962022e-05, "loss": 3.1845, "step": 1593 }, { "epoch": 0.2665886189739516, "grad_norm": 3.6175971031188965, "learning_rate": 4.589105618437141e-05, "loss": 3.1532, "step": 1594 }, { "epoch": 0.26675586402976964, "grad_norm": 7.015432357788086, "learning_rate": 4.588303554823474e-05, "loss": 2.6918, "step": 1595 }, { "epoch": 0.26692310908558764, "grad_norm": 4.237622261047363, "learning_rate": 4.5875007793944123e-05, "loss": 2.9558, "step": 1596 }, { "epoch": 0.2670903541414057, "grad_norm": 2.7422127723693848, "learning_rate": 4.586697292423588e-05, "loss": 2.9481, "step": 1597 }, { "epoch": 0.26725759919722375, "grad_norm": 4.934504985809326, "learning_rate": 4.585893094184876e-05, "loss": 3.2802, "step": 1598 }, { "epoch": 0.26742484425304175, "grad_norm": 2.9455649852752686, "learning_rate": 4.585088184952394e-05, "loss": 2.7704, "step": 1599 }, { "epoch": 0.2675920893088598, "grad_norm": 4.720494270324707, "learning_rate": 4.584282565000502e-05, "loss": 2.9644, "step": 1600 }, { "epoch": 0.26775933436467786, "grad_norm": 4.360541343688965, "learning_rate": 4.5834762346038024e-05, "loss": 2.6787, "step": 1601 }, { "epoch": 0.26792657942049586, "grad_norm": 2.217719078063965, "learning_rate": 4.582669194037139e-05, "loss": 2.5577, "step": 1602 }, { "epoch": 0.2680938244763139, "grad_norm": 2.730802059173584, "learning_rate": 4.581861443575599e-05, "loss": 2.9353, "step": 1603 }, { "epoch": 0.26826106953213197, "grad_norm": 4.729639053344727, "learning_rate": 4.581052983494511e-05, "loss": 2.8012, "step": 1604 }, { "epoch": 0.26842831458794997, "grad_norm": 4.87349271774292, "learning_rate": 4.580243814069443e-05, "loss": 3.3337, "step": 1605 }, { "epoch": 0.268595559643768, "grad_norm": 3.7976508140563965, "learning_rate": 4.579433935576208e-05, "loss": 2.9297, "step": 1606 }, { "epoch": 0.2687628046995861, "grad_norm": 7.3435444831848145, "learning_rate": 4.578623348290861e-05, "loss": 3.123, "step": 1607 }, { "epoch": 0.26893004975540413, "grad_norm": 5.428923606872559, "learning_rate": 4.5778120524896976e-05, "loss": 2.9775, "step": 1608 }, { "epoch": 0.26909729481122213, "grad_norm": 3.1783478260040283, "learning_rate": 4.577000048449253e-05, "loss": 2.6682, "step": 1609 }, { "epoch": 0.2692645398670402, "grad_norm": 2.600634813308716, "learning_rate": 4.576187336446306e-05, "loss": 2.3146, "step": 1610 }, { "epoch": 0.26943178492285824, "grad_norm": 3.2901575565338135, "learning_rate": 4.575373916757876e-05, "loss": 2.9137, "step": 1611 }, { "epoch": 0.26959902997867624, "grad_norm": 3.8706343173980713, "learning_rate": 4.574559789661225e-05, "loss": 2.53, "step": 1612 }, { "epoch": 0.2697662750344943, "grad_norm": 3.4502816200256348, "learning_rate": 4.573744955433853e-05, "loss": 2.6837, "step": 1613 }, { "epoch": 0.26993352009031235, "grad_norm": 3.3359265327453613, "learning_rate": 4.572929414353503e-05, "loss": 3.0494, "step": 1614 }, { "epoch": 0.27010076514613035, "grad_norm": 3.957695245742798, "learning_rate": 4.572113166698161e-05, "loss": 2.6285, "step": 1615 }, { "epoch": 0.2702680102019484, "grad_norm": 2.2081522941589355, "learning_rate": 4.57129621274605e-05, "loss": 2.3608, "step": 1616 }, { "epoch": 0.27043525525776646, "grad_norm": 2.4502906799316406, "learning_rate": 4.5704785527756355e-05, "loss": 2.8596, "step": 1617 }, { "epoch": 0.27060250031358446, "grad_norm": 3.627042531967163, "learning_rate": 4.5696601870656244e-05, "loss": 2.9724, "step": 1618 }, { "epoch": 0.2707697453694025, "grad_norm": 3.9233949184417725, "learning_rate": 4.5688411158949634e-05, "loss": 2.9851, "step": 1619 }, { "epoch": 0.27093699042522057, "grad_norm": 6.550014972686768, "learning_rate": 4.5680213395428386e-05, "loss": 2.9338, "step": 1620 }, { "epoch": 0.27110423548103857, "grad_norm": 6.227407455444336, "learning_rate": 4.567200858288678e-05, "loss": 2.4646, "step": 1621 }, { "epoch": 0.2712714805368566, "grad_norm": 3.564263105392456, "learning_rate": 4.566379672412149e-05, "loss": 2.9984, "step": 1622 }, { "epoch": 0.2714387255926747, "grad_norm": 3.7461400032043457, "learning_rate": 4.565557782193161e-05, "loss": 3.2309, "step": 1623 }, { "epoch": 0.2716059706484927, "grad_norm": 5.929854869842529, "learning_rate": 4.564735187911861e-05, "loss": 3.2683, "step": 1624 }, { "epoch": 0.27177321570431073, "grad_norm": 2.141589641571045, "learning_rate": 4.5639118898486354e-05, "loss": 2.876, "step": 1625 }, { "epoch": 0.2719404607601288, "grad_norm": 2.6686251163482666, "learning_rate": 4.563087888284115e-05, "loss": 2.5776, "step": 1626 }, { "epoch": 0.27210770581594684, "grad_norm": 4.202892780303955, "learning_rate": 4.562263183499166e-05, "loss": 3.3159, "step": 1627 }, { "epoch": 0.27227495087176484, "grad_norm": 4.473474025726318, "learning_rate": 4.561437775774895e-05, "loss": 2.7694, "step": 1628 }, { "epoch": 0.2724421959275829, "grad_norm": 6.514625072479248, "learning_rate": 4.560611665392651e-05, "loss": 2.9161, "step": 1629 }, { "epoch": 0.27260944098340095, "grad_norm": 4.685281753540039, "learning_rate": 4.559784852634019e-05, "loss": 3.1169, "step": 1630 }, { "epoch": 0.27277668603921895, "grad_norm": 8.433526039123535, "learning_rate": 4.558957337780826e-05, "loss": 2.7195, "step": 1631 }, { "epoch": 0.272943931095037, "grad_norm": 4.62741756439209, "learning_rate": 4.5581291211151353e-05, "loss": 3.2047, "step": 1632 }, { "epoch": 0.27311117615085506, "grad_norm": 3.609666585922241, "learning_rate": 4.557300202919254e-05, "loss": 2.8211, "step": 1633 }, { "epoch": 0.27327842120667306, "grad_norm": 3.6036274433135986, "learning_rate": 4.556470583475724e-05, "loss": 2.956, "step": 1634 }, { "epoch": 0.2734456662624911, "grad_norm": 4.860374450683594, "learning_rate": 4.555640263067328e-05, "loss": 2.8254, "step": 1635 }, { "epoch": 0.27361291131830917, "grad_norm": 3.2731010913848877, "learning_rate": 4.554809241977087e-05, "loss": 2.8471, "step": 1636 }, { "epoch": 0.27378015637412717, "grad_norm": 4.6778106689453125, "learning_rate": 4.553977520488263e-05, "loss": 3.232, "step": 1637 }, { "epoch": 0.2739474014299452, "grad_norm": 5.1672868728637695, "learning_rate": 4.553145098884354e-05, "loss": 3.0881, "step": 1638 }, { "epoch": 0.2741146464857633, "grad_norm": 4.327518939971924, "learning_rate": 4.5523119774490975e-05, "loss": 2.9817, "step": 1639 }, { "epoch": 0.2742818915415813, "grad_norm": 6.6139092445373535, "learning_rate": 4.5514781564664704e-05, "loss": 3.3157, "step": 1640 }, { "epoch": 0.27444913659739933, "grad_norm": 4.093576908111572, "learning_rate": 4.5506436362206874e-05, "loss": 3.2081, "step": 1641 }, { "epoch": 0.2746163816532174, "grad_norm": 3.9777965545654297, "learning_rate": 4.5498084169962006e-05, "loss": 2.7588, "step": 1642 }, { "epoch": 0.2747836267090354, "grad_norm": 3.813206195831299, "learning_rate": 4.548972499077703e-05, "loss": 2.6099, "step": 1643 }, { "epoch": 0.27495087176485344, "grad_norm": 3.5361380577087402, "learning_rate": 4.548135882750123e-05, "loss": 2.7471, "step": 1644 }, { "epoch": 0.2751181168206715, "grad_norm": 4.181602954864502, "learning_rate": 4.547298568298628e-05, "loss": 3.132, "step": 1645 }, { "epoch": 0.27528536187648955, "grad_norm": 5.826547145843506, "learning_rate": 4.546460556008624e-05, "loss": 2.8787, "step": 1646 }, { "epoch": 0.27545260693230755, "grad_norm": 4.063530445098877, "learning_rate": 4.5456218461657543e-05, "loss": 2.9037, "step": 1647 }, { "epoch": 0.2756198519881256, "grad_norm": 4.704963684082031, "learning_rate": 4.5447824390559e-05, "loss": 2.752, "step": 1648 }, { "epoch": 0.27578709704394366, "grad_norm": 6.809920310974121, "learning_rate": 4.5439423349651786e-05, "loss": 3.0767, "step": 1649 }, { "epoch": 0.27595434209976166, "grad_norm": 1.5691672563552856, "learning_rate": 4.5431015341799485e-05, "loss": 2.5762, "step": 1650 }, { "epoch": 0.2761215871555797, "grad_norm": 2.598796844482422, "learning_rate": 4.542260036986803e-05, "loss": 3.0165, "step": 1651 }, { "epoch": 0.2762888322113978, "grad_norm": 2.880626678466797, "learning_rate": 4.541417843672573e-05, "loss": 2.6901, "step": 1652 }, { "epoch": 0.27645607726721577, "grad_norm": 3.484653949737549, "learning_rate": 4.540574954524326e-05, "loss": 3.4008, "step": 1653 }, { "epoch": 0.2766233223230338, "grad_norm": 3.8534977436065674, "learning_rate": 4.53973136982937e-05, "loss": 2.5677, "step": 1654 }, { "epoch": 0.2767905673788519, "grad_norm": 2.340280055999756, "learning_rate": 4.5388870898752453e-05, "loss": 2.4204, "step": 1655 }, { "epoch": 0.2769578124346699, "grad_norm": 3.5583646297454834, "learning_rate": 4.538042114949734e-05, "loss": 3.0963, "step": 1656 }, { "epoch": 0.27712505749048794, "grad_norm": 4.005722522735596, "learning_rate": 4.5371964453408504e-05, "loss": 2.8067, "step": 1657 }, { "epoch": 0.277292302546306, "grad_norm": 2.7107152938842773, "learning_rate": 4.53635008133685e-05, "loss": 2.3482, "step": 1658 }, { "epoch": 0.277459547602124, "grad_norm": 6.630146503448486, "learning_rate": 4.535503023226221e-05, "loss": 2.5456, "step": 1659 }, { "epoch": 0.27762679265794205, "grad_norm": 4.769135475158691, "learning_rate": 4.534655271297692e-05, "loss": 2.6752, "step": 1660 }, { "epoch": 0.2777940377137601, "grad_norm": 5.041027069091797, "learning_rate": 4.533806825840225e-05, "loss": 2.6728, "step": 1661 }, { "epoch": 0.2779612827695781, "grad_norm": 4.1058573722839355, "learning_rate": 4.5329576871430195e-05, "loss": 2.4722, "step": 1662 }, { "epoch": 0.27812852782539615, "grad_norm": 2.646583080291748, "learning_rate": 4.532107855495512e-05, "loss": 2.4845, "step": 1663 }, { "epoch": 0.2782957728812142, "grad_norm": 5.620547294616699, "learning_rate": 4.5312573311873747e-05, "loss": 2.9589, "step": 1664 }, { "epoch": 0.2784630179370322, "grad_norm": 4.202587604522705, "learning_rate": 4.5304061145085155e-05, "loss": 2.4303, "step": 1665 }, { "epoch": 0.27863026299285026, "grad_norm": 4.841327667236328, "learning_rate": 4.529554205749078e-05, "loss": 2.5722, "step": 1666 }, { "epoch": 0.2787975080486683, "grad_norm": 3.803678512573242, "learning_rate": 4.5287016051994425e-05, "loss": 3.1344, "step": 1667 }, { "epoch": 0.2789647531044864, "grad_norm": 4.668429851531982, "learning_rate": 4.5278483131502255e-05, "loss": 2.7635, "step": 1668 }, { "epoch": 0.2791319981603044, "grad_norm": 3.312225103378296, "learning_rate": 4.526994329892279e-05, "loss": 2.996, "step": 1669 }, { "epoch": 0.27929924321612243, "grad_norm": 8.473593711853027, "learning_rate": 4.526139655716689e-05, "loss": 2.7495, "step": 1670 }, { "epoch": 0.2794664882719405, "grad_norm": 2.930959701538086, "learning_rate": 4.525284290914779e-05, "loss": 2.7065, "step": 1671 }, { "epoch": 0.2796337333277585, "grad_norm": 4.31817626953125, "learning_rate": 4.524428235778105e-05, "loss": 2.9972, "step": 1672 }, { "epoch": 0.27980097838357654, "grad_norm": 4.184410572052002, "learning_rate": 4.523571490598464e-05, "loss": 3.0748, "step": 1673 }, { "epoch": 0.2799682234393946, "grad_norm": 3.4628474712371826, "learning_rate": 4.5227140556678824e-05, "loss": 2.8362, "step": 1674 }, { "epoch": 0.2801354684952126, "grad_norm": 4.658220291137695, "learning_rate": 4.5218559312786244e-05, "loss": 2.8147, "step": 1675 }, { "epoch": 0.28030271355103065, "grad_norm": 4.295240879058838, "learning_rate": 4.520997117723189e-05, "loss": 3.0673, "step": 1676 }, { "epoch": 0.2804699586068487, "grad_norm": 7.5712714195251465, "learning_rate": 4.52013761529431e-05, "loss": 3.2558, "step": 1677 }, { "epoch": 0.2806372036626667, "grad_norm": 6.8220295906066895, "learning_rate": 4.5192774242849544e-05, "loss": 2.9213, "step": 1678 }, { "epoch": 0.28080444871848476, "grad_norm": 5.057028770446777, "learning_rate": 4.5184165449883265e-05, "loss": 3.1903, "step": 1679 }, { "epoch": 0.2809716937743028, "grad_norm": 2.5614097118377686, "learning_rate": 4.517554977697864e-05, "loss": 2.3962, "step": 1680 }, { "epoch": 0.2811389388301208, "grad_norm": 5.085020542144775, "learning_rate": 4.51669272270724e-05, "loss": 2.5791, "step": 1681 }, { "epoch": 0.28130618388593887, "grad_norm": 7.223097801208496, "learning_rate": 4.5158297803103585e-05, "loss": 3.2706, "step": 1682 }, { "epoch": 0.2814734289417569, "grad_norm": 3.966895818710327, "learning_rate": 4.514966150801364e-05, "loss": 3.1041, "step": 1683 }, { "epoch": 0.2816406739975749, "grad_norm": 5.523657321929932, "learning_rate": 4.514101834474629e-05, "loss": 3.0116, "step": 1684 }, { "epoch": 0.281807919053393, "grad_norm": 4.346877574920654, "learning_rate": 4.513236831624764e-05, "loss": 3.004, "step": 1685 }, { "epoch": 0.28197516410921103, "grad_norm": 3.7346553802490234, "learning_rate": 4.5123711425466126e-05, "loss": 2.7387, "step": 1686 }, { "epoch": 0.2821424091650291, "grad_norm": 6.19805908203125, "learning_rate": 4.5115047675352514e-05, "loss": 3.1357, "step": 1687 }, { "epoch": 0.2823096542208471, "grad_norm": 12.329728126525879, "learning_rate": 4.5106377068859905e-05, "loss": 2.7927, "step": 1688 }, { "epoch": 0.28247689927666514, "grad_norm": 4.547236442565918, "learning_rate": 4.509769960894377e-05, "loss": 2.8296, "step": 1689 }, { "epoch": 0.2826441443324832, "grad_norm": 7.3184285163879395, "learning_rate": 4.508901529856188e-05, "loss": 3.2874, "step": 1690 }, { "epoch": 0.2828113893883012, "grad_norm": 3.8873887062072754, "learning_rate": 4.5080324140674346e-05, "loss": 2.8691, "step": 1691 }, { "epoch": 0.28297863444411925, "grad_norm": 2.8999507427215576, "learning_rate": 4.5071626138243626e-05, "loss": 2.7633, "step": 1692 }, { "epoch": 0.2831458794999373, "grad_norm": 2.8137497901916504, "learning_rate": 4.506292129423451e-05, "loss": 2.7983, "step": 1693 }, { "epoch": 0.2833131245557553, "grad_norm": 4.230241298675537, "learning_rate": 4.505420961161411e-05, "loss": 3.1079, "step": 1694 }, { "epoch": 0.28348036961157336, "grad_norm": 3.5766894817352295, "learning_rate": 4.504549109335188e-05, "loss": 2.7943, "step": 1695 }, { "epoch": 0.2836476146673914, "grad_norm": 2.960934638977051, "learning_rate": 4.5036765742419594e-05, "loss": 2.5139, "step": 1696 }, { "epoch": 0.2838148597232094, "grad_norm": 5.492838382720947, "learning_rate": 4.502803356179136e-05, "loss": 2.9217, "step": 1697 }, { "epoch": 0.28398210477902747, "grad_norm": 3.217665910720825, "learning_rate": 4.501929455444361e-05, "loss": 2.296, "step": 1698 }, { "epoch": 0.2841493498348455, "grad_norm": 6.211584568023682, "learning_rate": 4.5010548723355115e-05, "loss": 2.9847, "step": 1699 }, { "epoch": 0.2843165948906635, "grad_norm": 6.205479621887207, "learning_rate": 4.5001796071506964e-05, "loss": 3.2263, "step": 1700 }, { "epoch": 0.2844838399464816, "grad_norm": 15.00015640258789, "learning_rate": 4.499303660188256e-05, "loss": 3.544, "step": 1701 }, { "epoch": 0.28465108500229963, "grad_norm": 7.467742443084717, "learning_rate": 4.4984270317467644e-05, "loss": 3.5035, "step": 1702 }, { "epoch": 0.28481833005811763, "grad_norm": 5.842050075531006, "learning_rate": 4.497549722125027e-05, "loss": 2.7806, "step": 1703 }, { "epoch": 0.2849855751139357, "grad_norm": 4.112623691558838, "learning_rate": 4.496671731622084e-05, "loss": 2.7369, "step": 1704 }, { "epoch": 0.28515282016975374, "grad_norm": 4.639468193054199, "learning_rate": 4.4957930605372036e-05, "loss": 2.8639, "step": 1705 }, { "epoch": 0.2853200652255718, "grad_norm": 3.502230644226074, "learning_rate": 4.4949137091698887e-05, "loss": 2.8641, "step": 1706 }, { "epoch": 0.2854873102813898, "grad_norm": 4.04213285446167, "learning_rate": 4.494033677819874e-05, "loss": 3.3644, "step": 1707 }, { "epoch": 0.28565455533720785, "grad_norm": 3.013875722885132, "learning_rate": 4.493152966787124e-05, "loss": 2.8743, "step": 1708 }, { "epoch": 0.2858218003930259, "grad_norm": 3.291621685028076, "learning_rate": 4.4922715763718384e-05, "loss": 2.733, "step": 1709 }, { "epoch": 0.2859890454488439, "grad_norm": 3.121678590774536, "learning_rate": 4.491389506874444e-05, "loss": 3.0719, "step": 1710 }, { "epoch": 0.28615629050466196, "grad_norm": 4.6122236251831055, "learning_rate": 4.490506758595603e-05, "loss": 3.2347, "step": 1711 }, { "epoch": 0.28632353556048, "grad_norm": 3.0540406703948975, "learning_rate": 4.4896233318362066e-05, "loss": 2.6822, "step": 1712 }, { "epoch": 0.286490780616298, "grad_norm": 2.3753087520599365, "learning_rate": 4.488739226897379e-05, "loss": 2.706, "step": 1713 }, { "epoch": 0.28665802567211607, "grad_norm": 5.372673988342285, "learning_rate": 4.4878544440804736e-05, "loss": 3.0591, "step": 1714 }, { "epoch": 0.2868252707279341, "grad_norm": 3.203874111175537, "learning_rate": 4.4869689836870766e-05, "loss": 2.6554, "step": 1715 }, { "epoch": 0.2869925157837521, "grad_norm": 4.611466407775879, "learning_rate": 4.486082846019004e-05, "loss": 3.1241, "step": 1716 }, { "epoch": 0.2871597608395702, "grad_norm": 3.763899564743042, "learning_rate": 4.485196031378303e-05, "loss": 2.4879, "step": 1717 }, { "epoch": 0.28732700589538823, "grad_norm": 3.8748373985290527, "learning_rate": 4.484308540067253e-05, "loss": 2.9454, "step": 1718 }, { "epoch": 0.28749425095120623, "grad_norm": 4.751715183258057, "learning_rate": 4.48342037238836e-05, "loss": 2.993, "step": 1719 }, { "epoch": 0.2876614960070243, "grad_norm": 5.262035846710205, "learning_rate": 4.482531528644366e-05, "loss": 2.3289, "step": 1720 }, { "epoch": 0.28782874106284234, "grad_norm": 5.2646164894104, "learning_rate": 4.481642009138238e-05, "loss": 2.5732, "step": 1721 }, { "epoch": 0.28799598611866034, "grad_norm": 3.2033631801605225, "learning_rate": 4.4807518141731784e-05, "loss": 2.6995, "step": 1722 }, { "epoch": 0.2881632311744784, "grad_norm": 5.5208210945129395, "learning_rate": 4.4798609440526176e-05, "loss": 2.8801, "step": 1723 }, { "epoch": 0.28833047623029645, "grad_norm": 3.7432076930999756, "learning_rate": 4.4789693990802136e-05, "loss": 2.9635, "step": 1724 }, { "epoch": 0.2884977212861145, "grad_norm": 5.726268291473389, "learning_rate": 4.478077179559858e-05, "loss": 3.091, "step": 1725 }, { "epoch": 0.2886649663419325, "grad_norm": 3.4497761726379395, "learning_rate": 4.477184285795672e-05, "loss": 2.6892, "step": 1726 }, { "epoch": 0.28883221139775056, "grad_norm": 3.750185489654541, "learning_rate": 4.4762907180920055e-05, "loss": 2.8417, "step": 1727 }, { "epoch": 0.2889994564535686, "grad_norm": 1.8158217668533325, "learning_rate": 4.475396476753437e-05, "loss": 2.5852, "step": 1728 }, { "epoch": 0.2891667015093866, "grad_norm": 1.8801790475845337, "learning_rate": 4.4745015620847775e-05, "loss": 3.1461, "step": 1729 }, { "epoch": 0.28933394656520467, "grad_norm": 6.002145290374756, "learning_rate": 4.473605974391065e-05, "loss": 3.356, "step": 1730 }, { "epoch": 0.2895011916210227, "grad_norm": 4.387648582458496, "learning_rate": 4.472709713977569e-05, "loss": 3.5352, "step": 1731 }, { "epoch": 0.2896684366768407, "grad_norm": 2.7741706371307373, "learning_rate": 4.471812781149786e-05, "loss": 3.1542, "step": 1732 }, { "epoch": 0.2898356817326588, "grad_norm": 4.132819175720215, "learning_rate": 4.4709151762134437e-05, "loss": 2.8357, "step": 1733 }, { "epoch": 0.29000292678847683, "grad_norm": 4.848496437072754, "learning_rate": 4.470016899474498e-05, "loss": 2.7457, "step": 1734 }, { "epoch": 0.29017017184429483, "grad_norm": 3.2364463806152344, "learning_rate": 4.469117951239134e-05, "loss": 2.587, "step": 1735 }, { "epoch": 0.2903374169001129, "grad_norm": 7.502954959869385, "learning_rate": 4.468218331813765e-05, "loss": 3.2236, "step": 1736 }, { "epoch": 0.29050466195593094, "grad_norm": 4.125027656555176, "learning_rate": 4.4673180415050335e-05, "loss": 2.7044, "step": 1737 }, { "epoch": 0.29067190701174894, "grad_norm": 6.577972412109375, "learning_rate": 4.466417080619812e-05, "loss": 3.0114, "step": 1738 }, { "epoch": 0.290839152067567, "grad_norm": 2.9730441570281982, "learning_rate": 4.4655154494651994e-05, "loss": 2.842, "step": 1739 }, { "epoch": 0.29100639712338505, "grad_norm": 4.449933052062988, "learning_rate": 4.464613148348525e-05, "loss": 3.0085, "step": 1740 }, { "epoch": 0.29117364217920305, "grad_norm": 4.006748676300049, "learning_rate": 4.463710177577345e-05, "loss": 2.9843, "step": 1741 }, { "epoch": 0.2913408872350211, "grad_norm": 2.1271421909332275, "learning_rate": 4.4628065374594444e-05, "loss": 2.9454, "step": 1742 }, { "epoch": 0.29150813229083916, "grad_norm": 4.533725261688232, "learning_rate": 4.4619022283028356e-05, "loss": 2.503, "step": 1743 }, { "epoch": 0.2916753773466572, "grad_norm": 3.4708921909332275, "learning_rate": 4.4609972504157607e-05, "loss": 2.4197, "step": 1744 }, { "epoch": 0.2918426224024752, "grad_norm": 8.371623039245605, "learning_rate": 4.460091604106689e-05, "loss": 2.9031, "step": 1745 }, { "epoch": 0.29200986745829327, "grad_norm": 7.194721221923828, "learning_rate": 4.4591852896843165e-05, "loss": 3.0674, "step": 1746 }, { "epoch": 0.2921771125141113, "grad_norm": 4.399629592895508, "learning_rate": 4.45827830745757e-05, "loss": 2.5819, "step": 1747 }, { "epoch": 0.2923443575699293, "grad_norm": 7.88906717300415, "learning_rate": 4.4573706577355986e-05, "loss": 2.8318, "step": 1748 }, { "epoch": 0.2925116026257474, "grad_norm": 4.829985618591309, "learning_rate": 4.456462340827785e-05, "loss": 2.6431, "step": 1749 }, { "epoch": 0.29267884768156543, "grad_norm": 3.637259006500244, "learning_rate": 4.455553357043735e-05, "loss": 2.7413, "step": 1750 }, { "epoch": 0.29284609273738343, "grad_norm": 9.177029609680176, "learning_rate": 4.4546437066932834e-05, "loss": 3.3596, "step": 1751 }, { "epoch": 0.2930133377932015, "grad_norm": 3.361025333404541, "learning_rate": 4.453733390086492e-05, "loss": 2.9063, "step": 1752 }, { "epoch": 0.29318058284901954, "grad_norm": 5.088170051574707, "learning_rate": 4.4528224075336504e-05, "loss": 2.8209, "step": 1753 }, { "epoch": 0.29334782790483754, "grad_norm": 3.990691900253296, "learning_rate": 4.4519107593452734e-05, "loss": 2.6571, "step": 1754 }, { "epoch": 0.2935150729606556, "grad_norm": 4.160034656524658, "learning_rate": 4.450998445832105e-05, "loss": 2.64, "step": 1755 }, { "epoch": 0.29368231801647365, "grad_norm": 4.156070709228516, "learning_rate": 4.450085467305114e-05, "loss": 2.5439, "step": 1756 }, { "epoch": 0.29384956307229165, "grad_norm": 3.114392042160034, "learning_rate": 4.449171824075496e-05, "loss": 2.9192, "step": 1757 }, { "epoch": 0.2940168081281097, "grad_norm": 4.108543395996094, "learning_rate": 4.4482575164546746e-05, "loss": 2.6821, "step": 1758 }, { "epoch": 0.29418405318392776, "grad_norm": 3.379845380783081, "learning_rate": 4.4473425447542985e-05, "loss": 2.9302, "step": 1759 }, { "epoch": 0.29435129823974576, "grad_norm": 6.444211483001709, "learning_rate": 4.446426909286244e-05, "loss": 3.0827, "step": 1760 }, { "epoch": 0.2945185432955638, "grad_norm": 5.5595598220825195, "learning_rate": 4.445510610362612e-05, "loss": 3.4957, "step": 1761 }, { "epoch": 0.29468578835138187, "grad_norm": 3.8112075328826904, "learning_rate": 4.444593648295733e-05, "loss": 2.9138, "step": 1762 }, { "epoch": 0.2948530334071999, "grad_norm": 4.559789657592773, "learning_rate": 4.443676023398158e-05, "loss": 2.5447, "step": 1763 }, { "epoch": 0.2950202784630179, "grad_norm": 6.181759834289551, "learning_rate": 4.4427577359826676e-05, "loss": 3.4244, "step": 1764 }, { "epoch": 0.295187523518836, "grad_norm": 3.441086530685425, "learning_rate": 4.441838786362268e-05, "loss": 2.5512, "step": 1765 }, { "epoch": 0.29535476857465404, "grad_norm": 12.556044578552246, "learning_rate": 4.4409191748501925e-05, "loss": 3.7825, "step": 1766 }, { "epoch": 0.29552201363047204, "grad_norm": 3.276071548461914, "learning_rate": 4.439998901759895e-05, "loss": 2.7103, "step": 1767 }, { "epoch": 0.2956892586862901, "grad_norm": 5.226451873779297, "learning_rate": 4.43907796740506e-05, "loss": 3.6808, "step": 1768 }, { "epoch": 0.29585650374210815, "grad_norm": 2.977020740509033, "learning_rate": 4.438156372099596e-05, "loss": 3.1555, "step": 1769 }, { "epoch": 0.29602374879792615, "grad_norm": 3.655348539352417, "learning_rate": 4.437234116157635e-05, "loss": 2.7891, "step": 1770 }, { "epoch": 0.2961909938537442, "grad_norm": 4.580479145050049, "learning_rate": 4.436311199893536e-05, "loss": 2.985, "step": 1771 }, { "epoch": 0.29635823890956225, "grad_norm": 2.124558925628662, "learning_rate": 4.435387623621883e-05, "loss": 3.0873, "step": 1772 }, { "epoch": 0.29652548396538025, "grad_norm": 3.9041852951049805, "learning_rate": 4.434463387657484e-05, "loss": 2.7868, "step": 1773 }, { "epoch": 0.2966927290211983, "grad_norm": 3.187666177749634, "learning_rate": 4.433538492315372e-05, "loss": 3.2478, "step": 1774 }, { "epoch": 0.29685997407701636, "grad_norm": 2.671687602996826, "learning_rate": 4.432612937910807e-05, "loss": 2.533, "step": 1775 }, { "epoch": 0.29702721913283436, "grad_norm": 2.765446662902832, "learning_rate": 4.4316867247592696e-05, "loss": 2.7202, "step": 1776 }, { "epoch": 0.2971944641886524, "grad_norm": 4.399050235748291, "learning_rate": 4.430759853176469e-05, "loss": 2.7669, "step": 1777 }, { "epoch": 0.2973617092444705, "grad_norm": 4.445383548736572, "learning_rate": 4.4298323234783356e-05, "loss": 2.9765, "step": 1778 }, { "epoch": 0.2975289543002885, "grad_norm": 5.740195274353027, "learning_rate": 4.428904135981026e-05, "loss": 2.9217, "step": 1779 }, { "epoch": 0.29769619935610653, "grad_norm": 3.07580304145813, "learning_rate": 4.42797529100092e-05, "loss": 2.6483, "step": 1780 }, { "epoch": 0.2978634444119246, "grad_norm": 3.1386733055114746, "learning_rate": 4.427045788854623e-05, "loss": 2.7409, "step": 1781 }, { "epoch": 0.2980306894677426, "grad_norm": 7.49973726272583, "learning_rate": 4.426115629858963e-05, "loss": 2.5213, "step": 1782 }, { "epoch": 0.29819793452356064, "grad_norm": 4.435051441192627, "learning_rate": 4.4251848143309925e-05, "loss": 3.1868, "step": 1783 }, { "epoch": 0.2983651795793787, "grad_norm": 3.1993637084960938, "learning_rate": 4.424253342587988e-05, "loss": 3.0383, "step": 1784 }, { "epoch": 0.29853242463519675, "grad_norm": 5.440072059631348, "learning_rate": 4.4233212149474476e-05, "loss": 2.8976, "step": 1785 }, { "epoch": 0.29869966969101475, "grad_norm": 3.8013482093811035, "learning_rate": 4.422388431727097e-05, "loss": 2.8121, "step": 1786 }, { "epoch": 0.2988669147468328, "grad_norm": 9.665240287780762, "learning_rate": 4.421454993244881e-05, "loss": 3.2339, "step": 1787 }, { "epoch": 0.29903415980265086, "grad_norm": 4.489198684692383, "learning_rate": 4.42052089981897e-05, "loss": 2.8964, "step": 1788 }, { "epoch": 0.29920140485846886, "grad_norm": 8.820923805236816, "learning_rate": 4.4195861517677584e-05, "loss": 2.9608, "step": 1789 }, { "epoch": 0.2993686499142869, "grad_norm": 4.8875603675842285, "learning_rate": 4.4186507494098625e-05, "loss": 2.7104, "step": 1790 }, { "epoch": 0.29953589497010497, "grad_norm": 3.1315200328826904, "learning_rate": 4.417714693064122e-05, "loss": 2.7383, "step": 1791 }, { "epoch": 0.29970314002592296, "grad_norm": 4.91286039352417, "learning_rate": 4.416777983049598e-05, "loss": 2.7803, "step": 1792 }, { "epoch": 0.299870385081741, "grad_norm": 5.490210056304932, "learning_rate": 4.415840619685577e-05, "loss": 3.0582, "step": 1793 }, { "epoch": 0.3000376301375591, "grad_norm": 3.6296896934509277, "learning_rate": 4.414902603291567e-05, "loss": 2.7106, "step": 1794 }, { "epoch": 0.3002048751933771, "grad_norm": 3.7691354751586914, "learning_rate": 4.4139639341872994e-05, "loss": 2.9449, "step": 1795 }, { "epoch": 0.30037212024919513, "grad_norm": 9.149449348449707, "learning_rate": 4.4130246126927256e-05, "loss": 3.0321, "step": 1796 }, { "epoch": 0.3005393653050132, "grad_norm": 2.573679208755493, "learning_rate": 4.4120846391280214e-05, "loss": 2.8138, "step": 1797 }, { "epoch": 0.3007066103608312, "grad_norm": 3.6780552864074707, "learning_rate": 4.411144013813584e-05, "loss": 2.6879, "step": 1798 }, { "epoch": 0.30087385541664924, "grad_norm": 4.8922834396362305, "learning_rate": 4.410202737070036e-05, "loss": 2.9689, "step": 1799 }, { "epoch": 0.3010411004724673, "grad_norm": 4.218683242797852, "learning_rate": 4.409260809218216e-05, "loss": 2.7035, "step": 1800 }, { "epoch": 0.3012083455282853, "grad_norm": 4.304034233093262, "learning_rate": 4.408318230579189e-05, "loss": 2.614, "step": 1801 }, { "epoch": 0.30137559058410335, "grad_norm": 3.98429536819458, "learning_rate": 4.407375001474242e-05, "loss": 3.0294, "step": 1802 }, { "epoch": 0.3015428356399214, "grad_norm": 4.74074125289917, "learning_rate": 4.40643112222488e-05, "loss": 3.0362, "step": 1803 }, { "epoch": 0.30171008069573946, "grad_norm": 8.230903625488281, "learning_rate": 4.405486593152834e-05, "loss": 3.4556, "step": 1804 }, { "epoch": 0.30187732575155746, "grad_norm": 5.635186195373535, "learning_rate": 4.4045414145800535e-05, "loss": 2.3674, "step": 1805 }, { "epoch": 0.3020445708073755, "grad_norm": 3.3161423206329346, "learning_rate": 4.4035955868287105e-05, "loss": 3.0879, "step": 1806 }, { "epoch": 0.30221181586319357, "grad_norm": 4.569234848022461, "learning_rate": 4.402649110221198e-05, "loss": 3.1179, "step": 1807 }, { "epoch": 0.30237906091901157, "grad_norm": 3.3022916316986084, "learning_rate": 4.401701985080131e-05, "loss": 2.7347, "step": 1808 }, { "epoch": 0.3025463059748296, "grad_norm": 4.84174108505249, "learning_rate": 4.400754211728344e-05, "loss": 2.9273, "step": 1809 }, { "epoch": 0.3027135510306477, "grad_norm": 2.941394805908203, "learning_rate": 4.3998057904888934e-05, "loss": 2.752, "step": 1810 }, { "epoch": 0.3028807960864657, "grad_norm": 4.8688154220581055, "learning_rate": 4.3988567216850576e-05, "loss": 3.2065, "step": 1811 }, { "epoch": 0.30304804114228373, "grad_norm": 4.5678791999816895, "learning_rate": 4.3979070056403326e-05, "loss": 2.7768, "step": 1812 }, { "epoch": 0.3032152861981018, "grad_norm": 2.6413748264312744, "learning_rate": 4.396956642678438e-05, "loss": 3.0289, "step": 1813 }, { "epoch": 0.3033825312539198, "grad_norm": 4.144144535064697, "learning_rate": 4.3960056331233134e-05, "loss": 2.4999, "step": 1814 }, { "epoch": 0.30354977630973784, "grad_norm": 5.577715873718262, "learning_rate": 4.395053977299117e-05, "loss": 2.8498, "step": 1815 }, { "epoch": 0.3037170213655559, "grad_norm": 6.295572280883789, "learning_rate": 4.394101675530229e-05, "loss": 2.3576, "step": 1816 }, { "epoch": 0.3038842664213739, "grad_norm": 5.117003917694092, "learning_rate": 4.393148728141249e-05, "loss": 2.9442, "step": 1817 }, { "epoch": 0.30405151147719195, "grad_norm": 7.44506311416626, "learning_rate": 4.392195135456997e-05, "loss": 3.0439, "step": 1818 }, { "epoch": 0.30421875653301, "grad_norm": 3.0893540382385254, "learning_rate": 4.3912408978025134e-05, "loss": 2.9604, "step": 1819 }, { "epoch": 0.304386001588828, "grad_norm": 4.523839950561523, "learning_rate": 4.390286015503058e-05, "loss": 2.6383, "step": 1820 }, { "epoch": 0.30455324664464606, "grad_norm": 7.709164142608643, "learning_rate": 4.389330488884109e-05, "loss": 3.4991, "step": 1821 }, { "epoch": 0.3047204917004641, "grad_norm": 5.405585289001465, "learning_rate": 4.3883743182713666e-05, "loss": 3.4628, "step": 1822 }, { "epoch": 0.30488773675628217, "grad_norm": 5.614868640899658, "learning_rate": 4.3874175039907496e-05, "loss": 2.954, "step": 1823 }, { "epoch": 0.30505498181210017, "grad_norm": 8.060934066772461, "learning_rate": 4.3864600463683955e-05, "loss": 3.137, "step": 1824 }, { "epoch": 0.3052222268679182, "grad_norm": 3.2063162326812744, "learning_rate": 4.3855019457306615e-05, "loss": 3.1283, "step": 1825 }, { "epoch": 0.3053894719237363, "grad_norm": 7.301669597625732, "learning_rate": 4.3845432024041234e-05, "loss": 2.6704, "step": 1826 }, { "epoch": 0.3055567169795543, "grad_norm": 4.972240924835205, "learning_rate": 4.383583816715579e-05, "loss": 3.0046, "step": 1827 }, { "epoch": 0.30572396203537233, "grad_norm": 4.447844982147217, "learning_rate": 4.38262378899204e-05, "loss": 2.9554, "step": 1828 }, { "epoch": 0.3058912070911904, "grad_norm": 5.238903522491455, "learning_rate": 4.3816631195607414e-05, "loss": 2.6383, "step": 1829 }, { "epoch": 0.3060584521470084, "grad_norm": 5.326669692993164, "learning_rate": 4.3807018087491346e-05, "loss": 3.1793, "step": 1830 }, { "epoch": 0.30622569720282644, "grad_norm": 3.9125781059265137, "learning_rate": 4.3797398568848906e-05, "loss": 3.0081, "step": 1831 }, { "epoch": 0.3063929422586445, "grad_norm": 7.178601264953613, "learning_rate": 4.378777264295899e-05, "loss": 3.5272, "step": 1832 }, { "epoch": 0.3065601873144625, "grad_norm": 2.830749750137329, "learning_rate": 4.377814031310267e-05, "loss": 2.6232, "step": 1833 }, { "epoch": 0.30672743237028055, "grad_norm": 4.0188984870910645, "learning_rate": 4.3768501582563195e-05, "loss": 2.8937, "step": 1834 }, { "epoch": 0.3068946774260986, "grad_norm": 5.74681282043457, "learning_rate": 4.375885645462602e-05, "loss": 2.8451, "step": 1835 }, { "epoch": 0.3070619224819166, "grad_norm": 3.84515118598938, "learning_rate": 4.374920493257876e-05, "loss": 2.6605, "step": 1836 }, { "epoch": 0.30722916753773466, "grad_norm": 3.9232585430145264, "learning_rate": 4.3739547019711214e-05, "loss": 2.953, "step": 1837 }, { "epoch": 0.3073964125935527, "grad_norm": 5.439550876617432, "learning_rate": 4.3729882719315355e-05, "loss": 3.0526, "step": 1838 }, { "epoch": 0.3075636576493707, "grad_norm": 4.395126819610596, "learning_rate": 4.372021203468536e-05, "loss": 2.8577, "step": 1839 }, { "epoch": 0.30773090270518877, "grad_norm": 5.704842567443848, "learning_rate": 4.3710534969117535e-05, "loss": 2.8911, "step": 1840 }, { "epoch": 0.3078981477610068, "grad_norm": 3.5285091400146484, "learning_rate": 4.3700851525910403e-05, "loss": 2.9437, "step": 1841 }, { "epoch": 0.3080653928168249, "grad_norm": 6.547240734100342, "learning_rate": 4.369116170836465e-05, "loss": 3.2152, "step": 1842 }, { "epoch": 0.3082326378726429, "grad_norm": 4.225785255432129, "learning_rate": 4.368146551978311e-05, "loss": 3.4916, "step": 1843 }, { "epoch": 0.30839988292846093, "grad_norm": 2.4446229934692383, "learning_rate": 4.367176296347083e-05, "loss": 2.6144, "step": 1844 }, { "epoch": 0.308567127984279, "grad_norm": 4.041599273681641, "learning_rate": 4.366205404273499e-05, "loss": 2.9716, "step": 1845 }, { "epoch": 0.308734373040097, "grad_norm": 5.086023330688477, "learning_rate": 4.365233876088496e-05, "loss": 2.6931, "step": 1846 }, { "epoch": 0.30890161809591504, "grad_norm": 2.6128721237182617, "learning_rate": 4.3642617121232276e-05, "loss": 2.6, "step": 1847 }, { "epoch": 0.3090688631517331, "grad_norm": 3.8443596363067627, "learning_rate": 4.3632889127090636e-05, "loss": 3.0401, "step": 1848 }, { "epoch": 0.3092361082075511, "grad_norm": 2.8798978328704834, "learning_rate": 4.362315478177591e-05, "loss": 2.6821, "step": 1849 }, { "epoch": 0.30940335326336915, "grad_norm": 2.1007537841796875, "learning_rate": 4.361341408860613e-05, "loss": 2.4919, "step": 1850 }, { "epoch": 0.3095705983191872, "grad_norm": 3.8827946186065674, "learning_rate": 4.360366705090149e-05, "loss": 3.3028, "step": 1851 }, { "epoch": 0.3097378433750052, "grad_norm": 2.969543695449829, "learning_rate": 4.359391367198433e-05, "loss": 2.5859, "step": 1852 }, { "epoch": 0.30990508843082326, "grad_norm": 4.503615379333496, "learning_rate": 4.358415395517919e-05, "loss": 2.7828, "step": 1853 }, { "epoch": 0.3100723334866413, "grad_norm": 6.032732009887695, "learning_rate": 4.357438790381276e-05, "loss": 3.1393, "step": 1854 }, { "epoch": 0.3102395785424593, "grad_norm": 5.71703577041626, "learning_rate": 4.356461552121386e-05, "loss": 3.1614, "step": 1855 }, { "epoch": 0.31040682359827737, "grad_norm": 4.801906585693359, "learning_rate": 4.3554836810713486e-05, "loss": 3.3287, "step": 1856 }, { "epoch": 0.3105740686540954, "grad_norm": 2.804518699645996, "learning_rate": 4.354505177564479e-05, "loss": 3.1027, "step": 1857 }, { "epoch": 0.3107413137099134, "grad_norm": 4.287504196166992, "learning_rate": 4.3535260419343095e-05, "loss": 2.3118, "step": 1858 }, { "epoch": 0.3109085587657315, "grad_norm": 5.285549163818359, "learning_rate": 4.352546274514585e-05, "loss": 2.7514, "step": 1859 }, { "epoch": 0.31107580382154953, "grad_norm": 7.823976993560791, "learning_rate": 4.351565875639268e-05, "loss": 2.9629, "step": 1860 }, { "epoch": 0.3112430488773676, "grad_norm": 2.921846389770508, "learning_rate": 4.350584845642537e-05, "loss": 3.0279, "step": 1861 }, { "epoch": 0.3114102939331856, "grad_norm": 2.688385486602783, "learning_rate": 4.349603184858781e-05, "loss": 2.9105, "step": 1862 }, { "epoch": 0.31157753898900364, "grad_norm": 5.472375392913818, "learning_rate": 4.348620893622609e-05, "loss": 2.9344, "step": 1863 }, { "epoch": 0.3117447840448217, "grad_norm": 2.2592382431030273, "learning_rate": 4.3476379722688435e-05, "loss": 2.4475, "step": 1864 }, { "epoch": 0.3119120291006397, "grad_norm": 3.7292327880859375, "learning_rate": 4.346654421132521e-05, "loss": 2.4111, "step": 1865 }, { "epoch": 0.31207927415645775, "grad_norm": 4.085743427276611, "learning_rate": 4.3456702405488915e-05, "loss": 2.9156, "step": 1866 }, { "epoch": 0.3122465192122758, "grad_norm": 2.9453542232513428, "learning_rate": 4.3446854308534224e-05, "loss": 2.7653, "step": 1867 }, { "epoch": 0.3124137642680938, "grad_norm": 4.903128147125244, "learning_rate": 4.3436999923817936e-05, "loss": 2.8926, "step": 1868 }, { "epoch": 0.31258100932391186, "grad_norm": 3.734124183654785, "learning_rate": 4.3427139254699e-05, "loss": 3.1153, "step": 1869 }, { "epoch": 0.3127482543797299, "grad_norm": 4.373854637145996, "learning_rate": 4.341727230453851e-05, "loss": 3.1765, "step": 1870 }, { "epoch": 0.3129154994355479, "grad_norm": 7.377640724182129, "learning_rate": 4.34073990766997e-05, "loss": 2.9976, "step": 1871 }, { "epoch": 0.31308274449136597, "grad_norm": 4.414946556091309, "learning_rate": 4.339751957454793e-05, "loss": 3.1202, "step": 1872 }, { "epoch": 0.313249989547184, "grad_norm": 4.033491134643555, "learning_rate": 4.338763380145071e-05, "loss": 3.0683, "step": 1873 }, { "epoch": 0.313417234603002, "grad_norm": 2.386878728866577, "learning_rate": 4.33777417607777e-05, "loss": 2.64, "step": 1874 }, { "epoch": 0.3135844796588201, "grad_norm": 6.829686164855957, "learning_rate": 4.336784345590067e-05, "loss": 3.3398, "step": 1875 }, { "epoch": 0.31375172471463814, "grad_norm": 3.789813995361328, "learning_rate": 4.335793889019354e-05, "loss": 2.7728, "step": 1876 }, { "epoch": 0.31391896977045614, "grad_norm": 2.8792173862457275, "learning_rate": 4.334802806703237e-05, "loss": 2.9333, "step": 1877 }, { "epoch": 0.3140862148262742, "grad_norm": 5.821765899658203, "learning_rate": 4.333811098979534e-05, "loss": 2.9791, "step": 1878 }, { "epoch": 0.31425345988209225, "grad_norm": 4.614078521728516, "learning_rate": 4.3328187661862764e-05, "loss": 3.2553, "step": 1879 }, { "epoch": 0.3144207049379103, "grad_norm": 1.5425963401794434, "learning_rate": 4.3318258086617095e-05, "loss": 2.4228, "step": 1880 }, { "epoch": 0.3145879499937283, "grad_norm": 2.675985097885132, "learning_rate": 4.3308322267442915e-05, "loss": 2.6546, "step": 1881 }, { "epoch": 0.31475519504954635, "grad_norm": 3.296445369720459, "learning_rate": 4.329838020772691e-05, "loss": 3.4571, "step": 1882 }, { "epoch": 0.3149224401053644, "grad_norm": 2.78728985786438, "learning_rate": 4.328843191085793e-05, "loss": 2.6475, "step": 1883 }, { "epoch": 0.3150896851611824, "grad_norm": 4.007457733154297, "learning_rate": 4.327847738022694e-05, "loss": 2.6964, "step": 1884 }, { "epoch": 0.31525693021700046, "grad_norm": 3.376396894454956, "learning_rate": 4.3268516619227e-05, "loss": 2.5644, "step": 1885 }, { "epoch": 0.3154241752728185, "grad_norm": 5.846045970916748, "learning_rate": 4.325854963125333e-05, "loss": 2.9898, "step": 1886 }, { "epoch": 0.3155914203286365, "grad_norm": 6.38449239730835, "learning_rate": 4.3248576419703266e-05, "loss": 2.8146, "step": 1887 }, { "epoch": 0.3157586653844546, "grad_norm": 5.799415111541748, "learning_rate": 4.323859698797625e-05, "loss": 2.9442, "step": 1888 }, { "epoch": 0.31592591044027263, "grad_norm": 2.6659719944000244, "learning_rate": 4.322861133947385e-05, "loss": 2.6448, "step": 1889 }, { "epoch": 0.3160931554960906, "grad_norm": 4.048138618469238, "learning_rate": 4.321861947759976e-05, "loss": 3.1276, "step": 1890 }, { "epoch": 0.3162604005519087, "grad_norm": 3.766430616378784, "learning_rate": 4.3208621405759806e-05, "loss": 2.997, "step": 1891 }, { "epoch": 0.31642764560772674, "grad_norm": 3.8337202072143555, "learning_rate": 4.319861712736188e-05, "loss": 2.8235, "step": 1892 }, { "epoch": 0.31659489066354474, "grad_norm": 3.917555809020996, "learning_rate": 4.318860664581605e-05, "loss": 2.8647, "step": 1893 }, { "epoch": 0.3167621357193628, "grad_norm": 3.242671251296997, "learning_rate": 4.317858996453445e-05, "loss": 2.6758, "step": 1894 }, { "epoch": 0.31692938077518085, "grad_norm": 7.787165641784668, "learning_rate": 4.3168567086931366e-05, "loss": 3.6934, "step": 1895 }, { "epoch": 0.31709662583099885, "grad_norm": 2.2663705348968506, "learning_rate": 4.315853801642317e-05, "loss": 2.5967, "step": 1896 }, { "epoch": 0.3172638708868169, "grad_norm": 3.574653387069702, "learning_rate": 4.314850275642834e-05, "loss": 2.6493, "step": 1897 }, { "epoch": 0.31743111594263496, "grad_norm": 7.406806945800781, "learning_rate": 4.3138461310367495e-05, "loss": 3.2013, "step": 1898 }, { "epoch": 0.317598360998453, "grad_norm": 2.569453001022339, "learning_rate": 4.312841368166335e-05, "loss": 2.6517, "step": 1899 }, { "epoch": 0.317765606054271, "grad_norm": 4.738746166229248, "learning_rate": 4.31183598737407e-05, "loss": 2.9845, "step": 1900 }, { "epoch": 0.31793285111008907, "grad_norm": 3.1842098236083984, "learning_rate": 4.310829989002647e-05, "loss": 2.947, "step": 1901 }, { "epoch": 0.3181000961659071, "grad_norm": 6.0765862464904785, "learning_rate": 4.309823373394971e-05, "loss": 2.6427, "step": 1902 }, { "epoch": 0.3182673412217251, "grad_norm": 3.9738245010375977, "learning_rate": 4.308816140894153e-05, "loss": 2.9561, "step": 1903 }, { "epoch": 0.3184345862775432, "grad_norm": 4.366907119750977, "learning_rate": 4.3078082918435176e-05, "loss": 3.1766, "step": 1904 }, { "epoch": 0.31860183133336123, "grad_norm": 3.4294168949127197, "learning_rate": 4.306799826586598e-05, "loss": 3.2957, "step": 1905 }, { "epoch": 0.31876907638917923, "grad_norm": 4.801085472106934, "learning_rate": 4.3057907454671374e-05, "loss": 2.9099, "step": 1906 }, { "epoch": 0.3189363214449973, "grad_norm": 2.717808961868286, "learning_rate": 4.3047810488290905e-05, "loss": 2.5485, "step": 1907 }, { "epoch": 0.31910356650081534, "grad_norm": 6.6500420570373535, "learning_rate": 4.30377073701662e-05, "loss": 3.3041, "step": 1908 }, { "epoch": 0.31927081155663334, "grad_norm": 6.315479755401611, "learning_rate": 4.3027598103740986e-05, "loss": 2.8613, "step": 1909 }, { "epoch": 0.3194380566124514, "grad_norm": 6.077008247375488, "learning_rate": 4.30174826924611e-05, "loss": 3.1873, "step": 1910 }, { "epoch": 0.31960530166826945, "grad_norm": 5.171370029449463, "learning_rate": 4.300736113977445e-05, "loss": 2.8696, "step": 1911 }, { "epoch": 0.31977254672408745, "grad_norm": 6.180931091308594, "learning_rate": 4.299723344913106e-05, "loss": 2.6747, "step": 1912 }, { "epoch": 0.3199397917799055, "grad_norm": 4.652810096740723, "learning_rate": 4.2987099623983034e-05, "loss": 2.9732, "step": 1913 }, { "epoch": 0.32010703683572356, "grad_norm": 13.354399681091309, "learning_rate": 4.297695966778457e-05, "loss": 3.4125, "step": 1914 }, { "epoch": 0.32027428189154156, "grad_norm": 4.645137786865234, "learning_rate": 4.2966813583991947e-05, "loss": 2.9272, "step": 1915 }, { "epoch": 0.3204415269473596, "grad_norm": 10.716592788696289, "learning_rate": 4.2956661376063555e-05, "loss": 2.9503, "step": 1916 }, { "epoch": 0.32060877200317767, "grad_norm": 3.0873429775238037, "learning_rate": 4.2946503047459853e-05, "loss": 2.8365, "step": 1917 }, { "epoch": 0.32077601705899567, "grad_norm": 5.814124584197998, "learning_rate": 4.293633860164338e-05, "loss": 2.57, "step": 1918 }, { "epoch": 0.3209432621148137, "grad_norm": 9.838218688964844, "learning_rate": 4.292616804207878e-05, "loss": 4.2579, "step": 1919 }, { "epoch": 0.3211105071706318, "grad_norm": 1.623528003692627, "learning_rate": 4.2915991372232765e-05, "loss": 2.3799, "step": 1920 }, { "epoch": 0.32127775222644983, "grad_norm": 3.1773924827575684, "learning_rate": 4.2905808595574155e-05, "loss": 2.581, "step": 1921 }, { "epoch": 0.32144499728226783, "grad_norm": 3.2550220489501953, "learning_rate": 4.289561971557381e-05, "loss": 2.6789, "step": 1922 }, { "epoch": 0.3216122423380859, "grad_norm": 3.553375482559204, "learning_rate": 4.28854247357047e-05, "loss": 2.6089, "step": 1923 }, { "epoch": 0.32177948739390394, "grad_norm": 4.314040660858154, "learning_rate": 4.2875223659441884e-05, "loss": 2.7571, "step": 1924 }, { "epoch": 0.32194673244972194, "grad_norm": 3.583649158477783, "learning_rate": 4.2865016490262454e-05, "loss": 3.1372, "step": 1925 }, { "epoch": 0.32211397750554, "grad_norm": 6.901949882507324, "learning_rate": 4.285480323164563e-05, "loss": 2.8413, "step": 1926 }, { "epoch": 0.32228122256135805, "grad_norm": 2.240621328353882, "learning_rate": 4.2844583887072676e-05, "loss": 2.774, "step": 1927 }, { "epoch": 0.32244846761717605, "grad_norm": 2.639983892440796, "learning_rate": 4.2834358460026944e-05, "loss": 2.5902, "step": 1928 }, { "epoch": 0.3226157126729941, "grad_norm": 8.329880714416504, "learning_rate": 4.282412695399385e-05, "loss": 3.3754, "step": 1929 }, { "epoch": 0.32278295772881216, "grad_norm": 2.9294960498809814, "learning_rate": 4.281388937246088e-05, "loss": 2.8154, "step": 1930 }, { "epoch": 0.32295020278463016, "grad_norm": 3.3302152156829834, "learning_rate": 4.280364571891761e-05, "loss": 2.8225, "step": 1931 }, { "epoch": 0.3231174478404482, "grad_norm": 6.6479811668396, "learning_rate": 4.279339599685567e-05, "loss": 3.3818, "step": 1932 }, { "epoch": 0.32328469289626627, "grad_norm": 3.4750466346740723, "learning_rate": 4.278314020976876e-05, "loss": 2.8161, "step": 1933 }, { "epoch": 0.32345193795208427, "grad_norm": 3.689955949783325, "learning_rate": 4.2772878361152655e-05, "loss": 3.2132, "step": 1934 }, { "epoch": 0.3236191830079023, "grad_norm": 5.513306617736816, "learning_rate": 4.276261045450518e-05, "loss": 3.0398, "step": 1935 }, { "epoch": 0.3237864280637204, "grad_norm": 6.434723377227783, "learning_rate": 4.275233649332624e-05, "loss": 3.1258, "step": 1936 }, { "epoch": 0.3239536731195384, "grad_norm": 3.599724054336548, "learning_rate": 4.27420564811178e-05, "loss": 2.8623, "step": 1937 }, { "epoch": 0.32412091817535643, "grad_norm": 2.8131051063537598, "learning_rate": 4.273177042138389e-05, "loss": 3.2877, "step": 1938 }, { "epoch": 0.3242881632311745, "grad_norm": 2.8549537658691406, "learning_rate": 4.272147831763059e-05, "loss": 3.0223, "step": 1939 }, { "epoch": 0.32445540828699254, "grad_norm": 3.2326037883758545, "learning_rate": 4.271118017336605e-05, "loss": 2.4997, "step": 1940 }, { "epoch": 0.32462265334281054, "grad_norm": 4.091347694396973, "learning_rate": 4.270087599210048e-05, "loss": 2.7415, "step": 1941 }, { "epoch": 0.3247898983986286, "grad_norm": 3.8966574668884277, "learning_rate": 4.269056577734615e-05, "loss": 2.6539, "step": 1942 }, { "epoch": 0.32495714345444665, "grad_norm": 1.9946322441101074, "learning_rate": 4.268024953261736e-05, "loss": 3.2257, "step": 1943 }, { "epoch": 0.32512438851026465, "grad_norm": 7.1934814453125, "learning_rate": 4.2669927261430516e-05, "loss": 3.4636, "step": 1944 }, { "epoch": 0.3252916335660827, "grad_norm": 5.938238620758057, "learning_rate": 4.2659598967304025e-05, "loss": 2.9865, "step": 1945 }, { "epoch": 0.32545887862190076, "grad_norm": 4.428837299346924, "learning_rate": 4.264926465375838e-05, "loss": 2.989, "step": 1946 }, { "epoch": 0.32562612367771876, "grad_norm": 4.466572284698486, "learning_rate": 4.2638924324316124e-05, "loss": 3.6735, "step": 1947 }, { "epoch": 0.3257933687335368, "grad_norm": 3.5516631603240967, "learning_rate": 4.2628577982501824e-05, "loss": 2.7311, "step": 1948 }, { "epoch": 0.32596061378935487, "grad_norm": 3.716730833053589, "learning_rate": 4.261822563184213e-05, "loss": 2.307, "step": 1949 }, { "epoch": 0.32612785884517287, "grad_norm": 4.79249382019043, "learning_rate": 4.2607867275865735e-05, "loss": 3.0064, "step": 1950 }, { "epoch": 0.3262951039009909, "grad_norm": 4.19760799407959, "learning_rate": 4.259750291810334e-05, "loss": 2.5131, "step": 1951 }, { "epoch": 0.326462348956809, "grad_norm": 5.312543869018555, "learning_rate": 4.258713256208775e-05, "loss": 3.0211, "step": 1952 }, { "epoch": 0.326629594012627, "grad_norm": 2.9033987522125244, "learning_rate": 4.257675621135376e-05, "loss": 2.7635, "step": 1953 }, { "epoch": 0.32679683906844503, "grad_norm": 4.440387725830078, "learning_rate": 4.2566373869438255e-05, "loss": 2.8387, "step": 1954 }, { "epoch": 0.3269640841242631, "grad_norm": 6.071342945098877, "learning_rate": 4.255598553988015e-05, "loss": 3.1194, "step": 1955 }, { "epoch": 0.3271313291800811, "grad_norm": 2.4908556938171387, "learning_rate": 4.254559122622036e-05, "loss": 2.9522, "step": 1956 }, { "epoch": 0.32729857423589914, "grad_norm": 3.9658448696136475, "learning_rate": 4.2535190932001904e-05, "loss": 2.7179, "step": 1957 }, { "epoch": 0.3274658192917172, "grad_norm": 3.624687910079956, "learning_rate": 4.252478466076978e-05, "loss": 2.6737, "step": 1958 }, { "epoch": 0.32763306434753525, "grad_norm": 4.328098773956299, "learning_rate": 4.251437241607107e-05, "loss": 2.8019, "step": 1959 }, { "epoch": 0.32780030940335325, "grad_norm": 4.221250057220459, "learning_rate": 4.250395420145486e-05, "loss": 2.6316, "step": 1960 }, { "epoch": 0.3279675544591713, "grad_norm": 3.9896185398101807, "learning_rate": 4.2493530020472296e-05, "loss": 2.5021, "step": 1961 }, { "epoch": 0.32813479951498936, "grad_norm": 5.810196399688721, "learning_rate": 4.2483099876676536e-05, "loss": 3.1305, "step": 1962 }, { "epoch": 0.32830204457080736, "grad_norm": 2.7076990604400635, "learning_rate": 4.247266377362279e-05, "loss": 3.017, "step": 1963 }, { "epoch": 0.3284692896266254, "grad_norm": 5.562281608581543, "learning_rate": 4.246222171486828e-05, "loss": 3.1985, "step": 1964 }, { "epoch": 0.32863653468244347, "grad_norm": 4.349507808685303, "learning_rate": 4.245177370397227e-05, "loss": 2.8954, "step": 1965 }, { "epoch": 0.32880377973826147, "grad_norm": 4.0609846115112305, "learning_rate": 4.244131974449606e-05, "loss": 2.7564, "step": 1966 }, { "epoch": 0.3289710247940795, "grad_norm": 4.889737129211426, "learning_rate": 4.2430859840002955e-05, "loss": 2.7237, "step": 1967 }, { "epoch": 0.3291382698498976, "grad_norm": 7.3542585372924805, "learning_rate": 4.24203939940583e-05, "loss": 2.3732, "step": 1968 }, { "epoch": 0.3293055149057156, "grad_norm": 7.237390995025635, "learning_rate": 4.240992221022947e-05, "loss": 3.3953, "step": 1969 }, { "epoch": 0.32947275996153363, "grad_norm": 3.1158530712127686, "learning_rate": 4.239944449208586e-05, "loss": 2.6855, "step": 1970 }, { "epoch": 0.3296400050173517, "grad_norm": 4.656050682067871, "learning_rate": 4.238896084319888e-05, "loss": 2.7796, "step": 1971 }, { "epoch": 0.3298072500731697, "grad_norm": 3.5850002765655518, "learning_rate": 4.237847126714196e-05, "loss": 3.0538, "step": 1972 }, { "epoch": 0.32997449512898774, "grad_norm": 5.112659931182861, "learning_rate": 4.236797576749059e-05, "loss": 2.7778, "step": 1973 }, { "epoch": 0.3301417401848058, "grad_norm": 3.761660575866699, "learning_rate": 4.235747434782221e-05, "loss": 2.8562, "step": 1974 }, { "epoch": 0.3303089852406238, "grad_norm": 5.849134922027588, "learning_rate": 4.2346967011716325e-05, "loss": 3.1242, "step": 1975 }, { "epoch": 0.33047623029644185, "grad_norm": 4.471280574798584, "learning_rate": 4.2336453762754465e-05, "loss": 3.2327, "step": 1976 }, { "epoch": 0.3306434753522599, "grad_norm": 4.40237283706665, "learning_rate": 4.232593460452014e-05, "loss": 3.0818, "step": 1977 }, { "epoch": 0.33081072040807796, "grad_norm": 3.951179027557373, "learning_rate": 4.231540954059889e-05, "loss": 2.9881, "step": 1978 }, { "epoch": 0.33097796546389596, "grad_norm": 2.7680702209472656, "learning_rate": 4.2304878574578276e-05, "loss": 2.5966, "step": 1979 }, { "epoch": 0.331145210519714, "grad_norm": 3.914487838745117, "learning_rate": 4.229434171004787e-05, "loss": 2.9297, "step": 1980 }, { "epoch": 0.33131245557553207, "grad_norm": 3.4134206771850586, "learning_rate": 4.228379895059924e-05, "loss": 2.8533, "step": 1981 }, { "epoch": 0.33147970063135007, "grad_norm": 5.9506754875183105, "learning_rate": 4.227325029982597e-05, "loss": 3.0916, "step": 1982 }, { "epoch": 0.3316469456871681, "grad_norm": 3.440929412841797, "learning_rate": 4.226269576132366e-05, "loss": 2.6525, "step": 1983 }, { "epoch": 0.3318141907429862, "grad_norm": 3.92964506149292, "learning_rate": 4.2252135338689916e-05, "loss": 3.1008, "step": 1984 }, { "epoch": 0.3319814357988042, "grad_norm": 2.32197904586792, "learning_rate": 4.2241569035524325e-05, "loss": 2.6956, "step": 1985 }, { "epoch": 0.33214868085462224, "grad_norm": 5.426007270812988, "learning_rate": 4.223099685542852e-05, "loss": 2.4476, "step": 1986 }, { "epoch": 0.3323159259104403, "grad_norm": 2.990340232849121, "learning_rate": 4.22204188020061e-05, "loss": 2.6656, "step": 1987 }, { "epoch": 0.3324831709662583, "grad_norm": 3.684333562850952, "learning_rate": 4.220983487886269e-05, "loss": 2.7784, "step": 1988 }, { "epoch": 0.33265041602207635, "grad_norm": 4.973931312561035, "learning_rate": 4.21992450896059e-05, "loss": 3.1419, "step": 1989 }, { "epoch": 0.3328176610778944, "grad_norm": 3.39973521232605, "learning_rate": 4.2188649437845356e-05, "loss": 2.5692, "step": 1990 }, { "epoch": 0.3329849061337124, "grad_norm": 4.1224212646484375, "learning_rate": 4.2178047927192666e-05, "loss": 2.6943, "step": 1991 }, { "epoch": 0.33315215118953045, "grad_norm": 3.158156633377075, "learning_rate": 4.216744056126144e-05, "loss": 3.1614, "step": 1992 }, { "epoch": 0.3333193962453485, "grad_norm": 3.446702003479004, "learning_rate": 4.215682734366729e-05, "loss": 2.7312, "step": 1993 }, { "epoch": 0.3334866413011665, "grad_norm": 5.1350483894348145, "learning_rate": 4.2146208278027826e-05, "loss": 2.6464, "step": 1994 }, { "epoch": 0.33365388635698456, "grad_norm": 4.580522537231445, "learning_rate": 4.213558336796263e-05, "loss": 3.0736, "step": 1995 }, { "epoch": 0.3338211314128026, "grad_norm": 7.391678333282471, "learning_rate": 4.212495261709329e-05, "loss": 2.7895, "step": 1996 }, { "epoch": 0.3339883764686207, "grad_norm": 2.8180949687957764, "learning_rate": 4.21143160290434e-05, "loss": 3.0215, "step": 1997 }, { "epoch": 0.3341556215244387, "grad_norm": 5.221526145935059, "learning_rate": 4.210367360743851e-05, "loss": 3.1191, "step": 1998 }, { "epoch": 0.33432286658025673, "grad_norm": 15.405474662780762, "learning_rate": 4.209302535590618e-05, "loss": 3.4556, "step": 1999 }, { "epoch": 0.3344901116360748, "grad_norm": 5.784823894500732, "learning_rate": 4.2082371278075966e-05, "loss": 2.6907, "step": 2000 }, { "epoch": 0.3346573566918928, "grad_norm": 4.700861930847168, "learning_rate": 4.207171137757939e-05, "loss": 2.8327, "step": 2001 }, { "epoch": 0.33482460174771084, "grad_norm": 7.612462043762207, "learning_rate": 4.206104565804997e-05, "loss": 3.122, "step": 2002 }, { "epoch": 0.3349918468035289, "grad_norm": 3.853128433227539, "learning_rate": 4.205037412312319e-05, "loss": 2.8309, "step": 2003 }, { "epoch": 0.3351590918593469, "grad_norm": 3.2603580951690674, "learning_rate": 4.2039696776436545e-05, "loss": 2.5677, "step": 2004 }, { "epoch": 0.33532633691516495, "grad_norm": 6.105206489562988, "learning_rate": 4.20290136216295e-05, "loss": 2.9916, "step": 2005 }, { "epoch": 0.335493581970983, "grad_norm": 4.510067939758301, "learning_rate": 4.201832466234348e-05, "loss": 2.6697, "step": 2006 }, { "epoch": 0.335660827026801, "grad_norm": 3.9837100505828857, "learning_rate": 4.2007629902221914e-05, "loss": 2.7833, "step": 2007 }, { "epoch": 0.33582807208261906, "grad_norm": 10.377662658691406, "learning_rate": 4.1996929344910194e-05, "loss": 3.2935, "step": 2008 }, { "epoch": 0.3359953171384371, "grad_norm": 4.564599990844727, "learning_rate": 4.1986222994055696e-05, "loss": 2.7589, "step": 2009 }, { "epoch": 0.3361625621942551, "grad_norm": 3.922545909881592, "learning_rate": 4.197551085330777e-05, "loss": 2.8471, "step": 2010 }, { "epoch": 0.33632980725007317, "grad_norm": 4.503798484802246, "learning_rate": 4.196479292631774e-05, "loss": 2.8204, "step": 2011 }, { "epoch": 0.3364970523058912, "grad_norm": 3.9774649143218994, "learning_rate": 4.195406921673888e-05, "loss": 2.7066, "step": 2012 }, { "epoch": 0.3366642973617092, "grad_norm": 4.261986255645752, "learning_rate": 4.1943339728226475e-05, "loss": 2.8174, "step": 2013 }, { "epoch": 0.3368315424175273, "grad_norm": 5.021264553070068, "learning_rate": 4.193260446443775e-05, "loss": 2.8594, "step": 2014 }, { "epoch": 0.33699878747334533, "grad_norm": 4.444366931915283, "learning_rate": 4.192186342903191e-05, "loss": 3.0685, "step": 2015 }, { "epoch": 0.3371660325291634, "grad_norm": 4.081801414489746, "learning_rate": 4.191111662567013e-05, "loss": 2.8652, "step": 2016 }, { "epoch": 0.3373332775849814, "grad_norm": 6.336332321166992, "learning_rate": 4.190036405801555e-05, "loss": 2.7531, "step": 2017 }, { "epoch": 0.33750052264079944, "grad_norm": 2.786384344100952, "learning_rate": 4.188960572973325e-05, "loss": 2.685, "step": 2018 }, { "epoch": 0.3376677676966175, "grad_norm": 4.053229331970215, "learning_rate": 4.187884164449031e-05, "loss": 2.7431, "step": 2019 }, { "epoch": 0.3378350127524355, "grad_norm": 3.8898849487304688, "learning_rate": 4.1868071805955746e-05, "loss": 2.4039, "step": 2020 }, { "epoch": 0.33800225780825355, "grad_norm": 5.628301620483398, "learning_rate": 4.1857296217800555e-05, "loss": 2.8719, "step": 2021 }, { "epoch": 0.3381695028640716, "grad_norm": 4.575279235839844, "learning_rate": 4.184651488369769e-05, "loss": 2.8579, "step": 2022 }, { "epoch": 0.3383367479198896, "grad_norm": 2.1460680961608887, "learning_rate": 4.183572780732204e-05, "loss": 2.7044, "step": 2023 }, { "epoch": 0.33850399297570766, "grad_norm": 4.718812465667725, "learning_rate": 4.182493499235048e-05, "loss": 3.1138, "step": 2024 }, { "epoch": 0.3386712380315257, "grad_norm": 3.6240220069885254, "learning_rate": 4.1814136442461824e-05, "loss": 2.6492, "step": 2025 }, { "epoch": 0.3388384830873437, "grad_norm": 6.295439720153809, "learning_rate": 4.180333216133686e-05, "loss": 3.0126, "step": 2026 }, { "epoch": 0.33900572814316177, "grad_norm": 5.436554431915283, "learning_rate": 4.1792522152658306e-05, "loss": 3.3378, "step": 2027 }, { "epoch": 0.3391729731989798, "grad_norm": 5.7955498695373535, "learning_rate": 4.1781706420110835e-05, "loss": 2.856, "step": 2028 }, { "epoch": 0.3393402182547978, "grad_norm": 2.8037428855895996, "learning_rate": 4.177088496738108e-05, "loss": 2.8572, "step": 2029 }, { "epoch": 0.3395074633106159, "grad_norm": 3.2423861026763916, "learning_rate": 4.1760057798157644e-05, "loss": 2.5705, "step": 2030 }, { "epoch": 0.33967470836643393, "grad_norm": 13.447222709655762, "learning_rate": 4.1749224916131024e-05, "loss": 3.1386, "step": 2031 }, { "epoch": 0.33984195342225193, "grad_norm": 4.138230800628662, "learning_rate": 4.1738386324993716e-05, "loss": 2.7101, "step": 2032 }, { "epoch": 0.34000919847807, "grad_norm": 5.600181579589844, "learning_rate": 4.1727542028440146e-05, "loss": 2.8163, "step": 2033 }, { "epoch": 0.34017644353388804, "grad_norm": 4.435468673706055, "learning_rate": 4.171669203016667e-05, "loss": 3.0244, "step": 2034 }, { "epoch": 0.34034368858970604, "grad_norm": 3.105405569076538, "learning_rate": 4.1705836333871604e-05, "loss": 2.6463, "step": 2035 }, { "epoch": 0.3405109336455241, "grad_norm": 4.283094882965088, "learning_rate": 4.1694974943255194e-05, "loss": 2.7503, "step": 2036 }, { "epoch": 0.34067817870134215, "grad_norm": 5.516997814178467, "learning_rate": 4.168410786201965e-05, "loss": 2.9256, "step": 2037 }, { "epoch": 0.3408454237571602, "grad_norm": 4.088803291320801, "learning_rate": 4.167323509386909e-05, "loss": 2.6912, "step": 2038 }, { "epoch": 0.3410126688129782, "grad_norm": 3.4144928455352783, "learning_rate": 4.1662356642509594e-05, "loss": 2.7473, "step": 2039 }, { "epoch": 0.34117991386879626, "grad_norm": 3.3685176372528076, "learning_rate": 4.165147251164916e-05, "loss": 2.5715, "step": 2040 }, { "epoch": 0.3413471589246143, "grad_norm": 6.146759033203125, "learning_rate": 4.164058270499775e-05, "loss": 2.6903, "step": 2041 }, { "epoch": 0.3415144039804323, "grad_norm": 4.03030252456665, "learning_rate": 4.1629687226267244e-05, "loss": 2.6713, "step": 2042 }, { "epoch": 0.34168164903625037, "grad_norm": 3.1066858768463135, "learning_rate": 4.161878607917145e-05, "loss": 2.8052, "step": 2043 }, { "epoch": 0.3418488940920684, "grad_norm": 7.342145919799805, "learning_rate": 4.16078792674261e-05, "loss": 2.4362, "step": 2044 }, { "epoch": 0.3420161391478864, "grad_norm": 7.897299766540527, "learning_rate": 4.159696679474888e-05, "loss": 2.7103, "step": 2045 }, { "epoch": 0.3421833842037045, "grad_norm": 3.9199349880218506, "learning_rate": 4.15860486648594e-05, "loss": 3.182, "step": 2046 }, { "epoch": 0.34235062925952253, "grad_norm": 3.234351396560669, "learning_rate": 4.15751248814792e-05, "loss": 2.5973, "step": 2047 }, { "epoch": 0.34251787431534053, "grad_norm": 5.346128463745117, "learning_rate": 4.156419544833173e-05, "loss": 2.6339, "step": 2048 }, { "epoch": 0.3426851193711586, "grad_norm": 2.5909111499786377, "learning_rate": 4.155326036914237e-05, "loss": 2.4124, "step": 2049 }, { "epoch": 0.34285236442697664, "grad_norm": 5.5628485679626465, "learning_rate": 4.154231964763845e-05, "loss": 2.8433, "step": 2050 }, { "epoch": 0.34301960948279464, "grad_norm": 3.850487470626831, "learning_rate": 4.15313732875492e-05, "loss": 2.8724, "step": 2051 }, { "epoch": 0.3431868545386127, "grad_norm": 2.636874198913574, "learning_rate": 4.1520421292605764e-05, "loss": 2.6746, "step": 2052 }, { "epoch": 0.34335409959443075, "grad_norm": 10.235560417175293, "learning_rate": 4.150946366654123e-05, "loss": 3.1131, "step": 2053 }, { "epoch": 0.34352134465024875, "grad_norm": 3.0532987117767334, "learning_rate": 4.14985004130906e-05, "loss": 2.6119, "step": 2054 }, { "epoch": 0.3436885897060668, "grad_norm": 5.453422546386719, "learning_rate": 4.148753153599078e-05, "loss": 2.5362, "step": 2055 }, { "epoch": 0.34385583476188486, "grad_norm": 5.215536594390869, "learning_rate": 4.147655703898059e-05, "loss": 2.857, "step": 2056 }, { "epoch": 0.3440230798177029, "grad_norm": 4.400965213775635, "learning_rate": 4.14655769258008e-05, "loss": 3.0778, "step": 2057 }, { "epoch": 0.3441903248735209, "grad_norm": 4.589625835418701, "learning_rate": 4.145459120019406e-05, "loss": 2.9954, "step": 2058 }, { "epoch": 0.34435756992933897, "grad_norm": 5.332126140594482, "learning_rate": 4.1443599865904944e-05, "loss": 2.7404, "step": 2059 }, { "epoch": 0.344524814985157, "grad_norm": 6.37963342666626, "learning_rate": 4.143260292667995e-05, "loss": 2.8949, "step": 2060 }, { "epoch": 0.344692060040975, "grad_norm": 4.061391353607178, "learning_rate": 4.142160038626746e-05, "loss": 2.821, "step": 2061 }, { "epoch": 0.3448593050967931, "grad_norm": 3.291536331176758, "learning_rate": 4.1410592248417787e-05, "loss": 2.8205, "step": 2062 }, { "epoch": 0.34502655015261113, "grad_norm": 10.164863586425781, "learning_rate": 4.139957851688314e-05, "loss": 2.8573, "step": 2063 }, { "epoch": 0.34519379520842913, "grad_norm": 3.0865399837493896, "learning_rate": 4.138855919541766e-05, "loss": 2.5867, "step": 2064 }, { "epoch": 0.3453610402642472, "grad_norm": 6.574512958526611, "learning_rate": 4.137753428777733e-05, "loss": 3.021, "step": 2065 }, { "epoch": 0.34552828532006524, "grad_norm": 3.127948522567749, "learning_rate": 4.136650379772013e-05, "loss": 2.8292, "step": 2066 }, { "epoch": 0.34569553037588324, "grad_norm": 5.669129371643066, "learning_rate": 4.135546772900586e-05, "loss": 2.9958, "step": 2067 }, { "epoch": 0.3458627754317013, "grad_norm": 8.498010635375977, "learning_rate": 4.134442608539628e-05, "loss": 2.8633, "step": 2068 }, { "epoch": 0.34603002048751935, "grad_norm": 3.660940170288086, "learning_rate": 4.1333378870654995e-05, "loss": 2.8011, "step": 2069 }, { "epoch": 0.34619726554333735, "grad_norm": 10.169939041137695, "learning_rate": 4.132232608854755e-05, "loss": 3.0222, "step": 2070 }, { "epoch": 0.3463645105991554, "grad_norm": 7.700591087341309, "learning_rate": 4.1311267742841385e-05, "loss": 2.8604, "step": 2071 }, { "epoch": 0.34653175565497346, "grad_norm": 11.108731269836426, "learning_rate": 4.130020383730583e-05, "loss": 3.9784, "step": 2072 }, { "epoch": 0.34669900071079146, "grad_norm": 3.9169206619262695, "learning_rate": 4.128913437571209e-05, "loss": 2.9373, "step": 2073 }, { "epoch": 0.3468662457666095, "grad_norm": 7.0897417068481445, "learning_rate": 4.12780593618333e-05, "loss": 3.0607, "step": 2074 }, { "epoch": 0.34703349082242757, "grad_norm": 8.011955261230469, "learning_rate": 4.1266978799444464e-05, "loss": 3.0286, "step": 2075 }, { "epoch": 0.3472007358782456, "grad_norm": 6.523968696594238, "learning_rate": 4.125589269232247e-05, "loss": 2.9064, "step": 2076 }, { "epoch": 0.3473679809340636, "grad_norm": 5.402623653411865, "learning_rate": 4.1244801044246126e-05, "loss": 3.014, "step": 2077 }, { "epoch": 0.3475352259898817, "grad_norm": 4.403411388397217, "learning_rate": 4.123370385899611e-05, "loss": 2.7544, "step": 2078 }, { "epoch": 0.34770247104569973, "grad_norm": 3.8623926639556885, "learning_rate": 4.122260114035497e-05, "loss": 2.8263, "step": 2079 }, { "epoch": 0.34786971610151773, "grad_norm": 3.2124578952789307, "learning_rate": 4.121149289210718e-05, "loss": 2.6997, "step": 2080 }, { "epoch": 0.3480369611573358, "grad_norm": 2.597919464111328, "learning_rate": 4.1200379118039064e-05, "loss": 3.0423, "step": 2081 }, { "epoch": 0.34820420621315384, "grad_norm": 2.3268141746520996, "learning_rate": 4.1189259821938854e-05, "loss": 2.5948, "step": 2082 }, { "epoch": 0.34837145126897184, "grad_norm": 3.5590782165527344, "learning_rate": 4.117813500759664e-05, "loss": 3.0125, "step": 2083 }, { "epoch": 0.3485386963247899, "grad_norm": 1.703224539756775, "learning_rate": 4.116700467880442e-05, "loss": 2.2831, "step": 2084 }, { "epoch": 0.34870594138060795, "grad_norm": 8.495203018188477, "learning_rate": 4.1155868839356046e-05, "loss": 3.1271, "step": 2085 }, { "epoch": 0.34887318643642595, "grad_norm": 4.4230241775512695, "learning_rate": 4.114472749304727e-05, "loss": 3.1157, "step": 2086 }, { "epoch": 0.349040431492244, "grad_norm": 1.5821654796600342, "learning_rate": 4.1133580643675706e-05, "loss": 2.1805, "step": 2087 }, { "epoch": 0.34920767654806206, "grad_norm": 4.633881092071533, "learning_rate": 4.112242829504085e-05, "loss": 2.8127, "step": 2088 }, { "epoch": 0.34937492160388006, "grad_norm": 6.801895618438721, "learning_rate": 4.111127045094407e-05, "loss": 3.0388, "step": 2089 }, { "epoch": 0.3495421666596981, "grad_norm": 3.9224510192871094, "learning_rate": 4.1100107115188616e-05, "loss": 2.7735, "step": 2090 }, { "epoch": 0.34970941171551617, "grad_norm": 6.072331428527832, "learning_rate": 4.108893829157958e-05, "loss": 3.4921, "step": 2091 }, { "epoch": 0.34987665677133417, "grad_norm": 3.322667360305786, "learning_rate": 4.1077763983923975e-05, "loss": 2.9819, "step": 2092 }, { "epoch": 0.3500439018271522, "grad_norm": 2.4951531887054443, "learning_rate": 4.106658419603066e-05, "loss": 2.4913, "step": 2093 }, { "epoch": 0.3502111468829703, "grad_norm": 3.3210535049438477, "learning_rate": 4.105539893171032e-05, "loss": 2.8316, "step": 2094 }, { "epoch": 0.35037839193878834, "grad_norm": 3.902832269668579, "learning_rate": 4.104420819477557e-05, "loss": 2.7967, "step": 2095 }, { "epoch": 0.35054563699460634, "grad_norm": 4.264398097991943, "learning_rate": 4.103301198904086e-05, "loss": 2.5013, "step": 2096 }, { "epoch": 0.3507128820504244, "grad_norm": 4.706326961517334, "learning_rate": 4.102181031832251e-05, "loss": 2.7356, "step": 2097 }, { "epoch": 0.35088012710624245, "grad_norm": 9.653465270996094, "learning_rate": 4.10106031864387e-05, "loss": 3.4272, "step": 2098 }, { "epoch": 0.35104737216206044, "grad_norm": 4.883925437927246, "learning_rate": 4.0999390597209475e-05, "loss": 2.8656, "step": 2099 }, { "epoch": 0.3512146172178785, "grad_norm": 3.9231088161468506, "learning_rate": 4.098817255445674e-05, "loss": 2.8244, "step": 2100 }, { "epoch": 0.35138186227369655, "grad_norm": 4.104783058166504, "learning_rate": 4.0976949062004246e-05, "loss": 2.5776, "step": 2101 }, { "epoch": 0.35154910732951455, "grad_norm": 5.520167350769043, "learning_rate": 4.096572012367762e-05, "loss": 3.4679, "step": 2102 }, { "epoch": 0.3517163523853326, "grad_norm": 3.751569986343384, "learning_rate": 4.095448574330434e-05, "loss": 2.8238, "step": 2103 }, { "epoch": 0.35188359744115066, "grad_norm": 6.446794033050537, "learning_rate": 4.094324592471372e-05, "loss": 3.3357, "step": 2104 }, { "epoch": 0.35205084249696866, "grad_norm": 5.905561447143555, "learning_rate": 4.093200067173698e-05, "loss": 3.1069, "step": 2105 }, { "epoch": 0.3522180875527867, "grad_norm": 5.879213809967041, "learning_rate": 4.092074998820712e-05, "loss": 2.8089, "step": 2106 }, { "epoch": 0.3523853326086048, "grad_norm": 3.522529363632202, "learning_rate": 4.090949387795905e-05, "loss": 2.9643, "step": 2107 }, { "epoch": 0.3525525776644228, "grad_norm": 2.9742085933685303, "learning_rate": 4.089823234482949e-05, "loss": 2.3463, "step": 2108 }, { "epoch": 0.3527198227202408, "grad_norm": 3.0864548683166504, "learning_rate": 4.0886965392657045e-05, "loss": 2.8403, "step": 2109 }, { "epoch": 0.3528870677760589, "grad_norm": 5.6686930656433105, "learning_rate": 4.087569302528214e-05, "loss": 3.535, "step": 2110 }, { "epoch": 0.3530543128318769, "grad_norm": 2.4369125366210938, "learning_rate": 4.086441524654705e-05, "loss": 2.83, "step": 2111 }, { "epoch": 0.35322155788769494, "grad_norm": 2.576749563217163, "learning_rate": 4.0853132060295896e-05, "loss": 2.6358, "step": 2112 }, { "epoch": 0.353388802943513, "grad_norm": 4.5668768882751465, "learning_rate": 4.084184347037466e-05, "loss": 2.6424, "step": 2113 }, { "epoch": 0.35355604799933105, "grad_norm": 3.693408250808716, "learning_rate": 4.083054948063113e-05, "loss": 2.5526, "step": 2114 }, { "epoch": 0.35372329305514905, "grad_norm": 4.101284027099609, "learning_rate": 4.081925009491497e-05, "loss": 2.8381, "step": 2115 }, { "epoch": 0.3538905381109671, "grad_norm": 2.285313844680786, "learning_rate": 4.0807945317077665e-05, "loss": 2.5998, "step": 2116 }, { "epoch": 0.35405778316678516, "grad_norm": 2.8875362873077393, "learning_rate": 4.079663515097254e-05, "loss": 3.2929, "step": 2117 }, { "epoch": 0.35422502822260316, "grad_norm": 4.495602607727051, "learning_rate": 4.0785319600454754e-05, "loss": 2.8723, "step": 2118 }, { "epoch": 0.3543922732784212, "grad_norm": 4.271526336669922, "learning_rate": 4.0773998669381306e-05, "loss": 3.3757, "step": 2119 }, { "epoch": 0.35455951833423927, "grad_norm": 5.275744438171387, "learning_rate": 4.076267236161104e-05, "loss": 2.9145, "step": 2120 }, { "epoch": 0.35472676339005726, "grad_norm": 3.0197198390960693, "learning_rate": 4.075134068100461e-05, "loss": 2.7483, "step": 2121 }, { "epoch": 0.3548940084458753, "grad_norm": 4.198792934417725, "learning_rate": 4.074000363142452e-05, "loss": 2.8971, "step": 2122 }, { "epoch": 0.3550612535016934, "grad_norm": 6.470603942871094, "learning_rate": 4.072866121673507e-05, "loss": 3.2269, "step": 2123 }, { "epoch": 0.3552284985575114, "grad_norm": 5.035688400268555, "learning_rate": 4.0717313440802464e-05, "loss": 2.6879, "step": 2124 }, { "epoch": 0.35539574361332943, "grad_norm": 3.963409423828125, "learning_rate": 4.070596030749464e-05, "loss": 2.8875, "step": 2125 }, { "epoch": 0.3555629886691475, "grad_norm": 3.9470713138580322, "learning_rate": 4.0694601820681425e-05, "loss": 2.8702, "step": 2126 }, { "epoch": 0.3557302337249655, "grad_norm": 4.466644763946533, "learning_rate": 4.068323798423445e-05, "loss": 2.4372, "step": 2127 }, { "epoch": 0.35589747878078354, "grad_norm": 11.164373397827148, "learning_rate": 4.067186880202717e-05, "loss": 3.5506, "step": 2128 }, { "epoch": 0.3560647238366016, "grad_norm": 3.1501662731170654, "learning_rate": 4.0660494277934866e-05, "loss": 2.7598, "step": 2129 }, { "epoch": 0.3562319688924196, "grad_norm": 2.8661205768585205, "learning_rate": 4.0649114415834646e-05, "loss": 2.4452, "step": 2130 }, { "epoch": 0.35639921394823765, "grad_norm": 4.408041477203369, "learning_rate": 4.063772921960541e-05, "loss": 2.9407, "step": 2131 }, { "epoch": 0.3565664590040557, "grad_norm": 3.153670072555542, "learning_rate": 4.062633869312792e-05, "loss": 2.5596, "step": 2132 }, { "epoch": 0.35673370405987376, "grad_norm": 5.296205043792725, "learning_rate": 4.061494284028471e-05, "loss": 2.7668, "step": 2133 }, { "epoch": 0.35690094911569176, "grad_norm": 2.8445241451263428, "learning_rate": 4.0603541664960156e-05, "loss": 2.4697, "step": 2134 }, { "epoch": 0.3570681941715098, "grad_norm": 4.486558437347412, "learning_rate": 4.0592135171040444e-05, "loss": 3.4679, "step": 2135 }, { "epoch": 0.35723543922732787, "grad_norm": 3.355323314666748, "learning_rate": 4.058072336241357e-05, "loss": 3.1906, "step": 2136 }, { "epoch": 0.35740268428314587, "grad_norm": 4.3569655418396, "learning_rate": 4.056930624296935e-05, "loss": 2.9113, "step": 2137 }, { "epoch": 0.3575699293389639, "grad_norm": 4.949251651763916, "learning_rate": 4.0557883816599386e-05, "loss": 2.5283, "step": 2138 }, { "epoch": 0.357737174394782, "grad_norm": 7.15925407409668, "learning_rate": 4.054645608719712e-05, "loss": 3.0098, "step": 2139 }, { "epoch": 0.3579044194506, "grad_norm": 5.092924118041992, "learning_rate": 4.053502305865778e-05, "loss": 3.2714, "step": 2140 }, { "epoch": 0.35807166450641803, "grad_norm": 5.509977340698242, "learning_rate": 4.0523584734878415e-05, "loss": 2.6052, "step": 2141 }, { "epoch": 0.3582389095622361, "grad_norm": 3.365187883377075, "learning_rate": 4.051214111975786e-05, "loss": 2.5683, "step": 2142 }, { "epoch": 0.3584061546180541, "grad_norm": 4.833772659301758, "learning_rate": 4.050069221719677e-05, "loss": 2.8299, "step": 2143 }, { "epoch": 0.35857339967387214, "grad_norm": 6.554752826690674, "learning_rate": 4.048923803109761e-05, "loss": 3.1168, "step": 2144 }, { "epoch": 0.3587406447296902, "grad_norm": 11.510537147521973, "learning_rate": 4.0477778565364616e-05, "loss": 3.9384, "step": 2145 }, { "epoch": 0.3589078897855082, "grad_norm": 3.203388214111328, "learning_rate": 4.046631382390384e-05, "loss": 3.4311, "step": 2146 }, { "epoch": 0.35907513484132625, "grad_norm": 5.0647149085998535, "learning_rate": 4.045484381062316e-05, "loss": 2.6367, "step": 2147 }, { "epoch": 0.3592423798971443, "grad_norm": 5.392296314239502, "learning_rate": 4.044336852943219e-05, "loss": 2.8036, "step": 2148 }, { "epoch": 0.3594096249529623, "grad_norm": 4.21710729598999, "learning_rate": 4.043188798424239e-05, "loss": 2.8041, "step": 2149 }, { "epoch": 0.35957687000878036, "grad_norm": 4.773560047149658, "learning_rate": 4.0420402178967e-05, "loss": 2.7678, "step": 2150 }, { "epoch": 0.3597441150645984, "grad_norm": 4.721332550048828, "learning_rate": 4.040891111752104e-05, "loss": 3.0721, "step": 2151 }, { "epoch": 0.3599113601204164, "grad_norm": 4.03021764755249, "learning_rate": 4.0397414803821343e-05, "loss": 2.9975, "step": 2152 }, { "epoch": 0.36007860517623447, "grad_norm": 3.0793473720550537, "learning_rate": 4.038591324178652e-05, "loss": 2.4525, "step": 2153 }, { "epoch": 0.3602458502320525, "grad_norm": 3.254359722137451, "learning_rate": 4.0374406435336975e-05, "loss": 3.1416, "step": 2154 }, { "epoch": 0.3604130952878706, "grad_norm": 2.281355381011963, "learning_rate": 4.036289438839489e-05, "loss": 2.7251, "step": 2155 }, { "epoch": 0.3605803403436886, "grad_norm": 3.723801374435425, "learning_rate": 4.0351377104884246e-05, "loss": 3.0046, "step": 2156 }, { "epoch": 0.36074758539950663, "grad_norm": 4.4744696617126465, "learning_rate": 4.0339854588730796e-05, "loss": 2.9059, "step": 2157 }, { "epoch": 0.3609148304553247, "grad_norm": 5.506280422210693, "learning_rate": 4.03283268438621e-05, "loss": 2.6884, "step": 2158 }, { "epoch": 0.3610820755111427, "grad_norm": 6.384836196899414, "learning_rate": 4.031679387420747e-05, "loss": 2.756, "step": 2159 }, { "epoch": 0.36124932056696074, "grad_norm": 5.162683010101318, "learning_rate": 4.030525568369802e-05, "loss": 2.7354, "step": 2160 }, { "epoch": 0.3614165656227788, "grad_norm": 4.003694534301758, "learning_rate": 4.029371227626663e-05, "loss": 3.0707, "step": 2161 }, { "epoch": 0.3615838106785968, "grad_norm": 2.838674545288086, "learning_rate": 4.028216365584797e-05, "loss": 2.7424, "step": 2162 }, { "epoch": 0.36175105573441485, "grad_norm": 5.208304405212402, "learning_rate": 4.027060982637848e-05, "loss": 3.0553, "step": 2163 }, { "epoch": 0.3619183007902329, "grad_norm": 4.682592868804932, "learning_rate": 4.025905079179638e-05, "loss": 2.8698, "step": 2164 }, { "epoch": 0.3620855458460509, "grad_norm": 8.498798370361328, "learning_rate": 4.024748655604166e-05, "loss": 2.9502, "step": 2165 }, { "epoch": 0.36225279090186896, "grad_norm": 3.17220401763916, "learning_rate": 4.023591712305609e-05, "loss": 2.6733, "step": 2166 }, { "epoch": 0.362420035957687, "grad_norm": 7.359817981719971, "learning_rate": 4.02243424967832e-05, "loss": 3.0855, "step": 2167 }, { "epoch": 0.362587281013505, "grad_norm": 5.516249656677246, "learning_rate": 4.0212762681168296e-05, "loss": 3.1613, "step": 2168 }, { "epoch": 0.36275452606932307, "grad_norm": 3.959703207015991, "learning_rate": 4.0201177680158466e-05, "loss": 2.8752, "step": 2169 }, { "epoch": 0.3629217711251411, "grad_norm": 9.333991050720215, "learning_rate": 4.018958749770253e-05, "loss": 3.4742, "step": 2170 }, { "epoch": 0.3630890161809591, "grad_norm": 3.8967065811157227, "learning_rate": 4.017799213775111e-05, "loss": 2.9171, "step": 2171 }, { "epoch": 0.3632562612367772, "grad_norm": 5.193264007568359, "learning_rate": 4.016639160425658e-05, "loss": 3.16, "step": 2172 }, { "epoch": 0.36342350629259523, "grad_norm": 5.640976428985596, "learning_rate": 4.015478590117307e-05, "loss": 2.7032, "step": 2173 }, { "epoch": 0.3635907513484133, "grad_norm": 4.802267074584961, "learning_rate": 4.014317503245649e-05, "loss": 2.7224, "step": 2174 }, { "epoch": 0.3637579964042313, "grad_norm": 5.221104621887207, "learning_rate": 4.013155900206449e-05, "loss": 2.4523, "step": 2175 }, { "epoch": 0.36392524146004934, "grad_norm": 3.201125144958496, "learning_rate": 4.01199378139565e-05, "loss": 2.8689, "step": 2176 }, { "epoch": 0.3640924865158674, "grad_norm": 5.070505142211914, "learning_rate": 4.0108311472093685e-05, "loss": 2.915, "step": 2177 }, { "epoch": 0.3642597315716854, "grad_norm": 3.21215558052063, "learning_rate": 4.0096679980438984e-05, "loss": 2.888, "step": 2178 }, { "epoch": 0.36442697662750345, "grad_norm": 2.052504062652588, "learning_rate": 4.0085043342957085e-05, "loss": 2.5523, "step": 2179 }, { "epoch": 0.3645942216833215, "grad_norm": 5.484819412231445, "learning_rate": 4.007340156361443e-05, "loss": 2.8514, "step": 2180 }, { "epoch": 0.3647614667391395, "grad_norm": 6.102361679077148, "learning_rate": 4.0061754646379235e-05, "loss": 2.8189, "step": 2181 }, { "epoch": 0.36492871179495756, "grad_norm": 6.427310466766357, "learning_rate": 4.005010259522142e-05, "loss": 2.983, "step": 2182 }, { "epoch": 0.3650959568507756, "grad_norm": 6.321053981781006, "learning_rate": 4.003844541411269e-05, "loss": 3.0812, "step": 2183 }, { "epoch": 0.3652632019065936, "grad_norm": 3.103813886642456, "learning_rate": 4.00267831070265e-05, "loss": 2.7185, "step": 2184 }, { "epoch": 0.36543044696241167, "grad_norm": 4.8612961769104, "learning_rate": 4.001511567793802e-05, "loss": 2.6364, "step": 2185 }, { "epoch": 0.3655976920182297, "grad_norm": 6.221328258514404, "learning_rate": 4.000344313082423e-05, "loss": 3.254, "step": 2186 }, { "epoch": 0.3657649370740477, "grad_norm": 5.325887203216553, "learning_rate": 3.9991765469663774e-05, "loss": 2.8393, "step": 2187 }, { "epoch": 0.3659321821298658, "grad_norm": 3.959763526916504, "learning_rate": 3.9980082698437094e-05, "loss": 2.9547, "step": 2188 }, { "epoch": 0.36609942718568383, "grad_norm": 2.8099985122680664, "learning_rate": 3.9968394821126355e-05, "loss": 2.6336, "step": 2189 }, { "epoch": 0.36626667224150183, "grad_norm": 4.078843116760254, "learning_rate": 3.995670184171548e-05, "loss": 2.4526, "step": 2190 }, { "epoch": 0.3664339172973199, "grad_norm": 5.427728176116943, "learning_rate": 3.994500376419009e-05, "loss": 2.7541, "step": 2191 }, { "epoch": 0.36660116235313794, "grad_norm": 3.8347973823547363, "learning_rate": 3.99333005925376e-05, "loss": 2.3376, "step": 2192 }, { "epoch": 0.366768407408956, "grad_norm": 8.850035667419434, "learning_rate": 3.992159233074711e-05, "loss": 3.4748, "step": 2193 }, { "epoch": 0.366935652464774, "grad_norm": 4.436789512634277, "learning_rate": 3.990987898280949e-05, "loss": 2.5384, "step": 2194 }, { "epoch": 0.36710289752059205, "grad_norm": 3.1046512126922607, "learning_rate": 3.9898160552717324e-05, "loss": 2.4974, "step": 2195 }, { "epoch": 0.3672701425764101, "grad_norm": 5.0558953285217285, "learning_rate": 3.988643704446493e-05, "loss": 3.0117, "step": 2196 }, { "epoch": 0.3674373876322281, "grad_norm": 3.8358569145202637, "learning_rate": 3.987470846204838e-05, "loss": 2.507, "step": 2197 }, { "epoch": 0.36760463268804616, "grad_norm": 7.396771430969238, "learning_rate": 3.986297480946544e-05, "loss": 2.9489, "step": 2198 }, { "epoch": 0.3677718777438642, "grad_norm": 3.58029842376709, "learning_rate": 3.985123609071563e-05, "loss": 2.5705, "step": 2199 }, { "epoch": 0.3679391227996822, "grad_norm": 4.6637043952941895, "learning_rate": 3.9839492309800195e-05, "loss": 3.1166, "step": 2200 }, { "epoch": 0.36810636785550027, "grad_norm": 3.180877923965454, "learning_rate": 3.9827743470722076e-05, "loss": 2.6287, "step": 2201 }, { "epoch": 0.3682736129113183, "grad_norm": 5.36077356338501, "learning_rate": 3.9815989577485985e-05, "loss": 3.3653, "step": 2202 }, { "epoch": 0.3684408579671363, "grad_norm": 6.901640892028809, "learning_rate": 3.980423063409832e-05, "loss": 2.7492, "step": 2203 }, { "epoch": 0.3686081030229544, "grad_norm": 9.325579643249512, "learning_rate": 3.979246664456721e-05, "loss": 3.5687, "step": 2204 }, { "epoch": 0.36877534807877244, "grad_norm": 3.232632875442505, "learning_rate": 3.9780697612902526e-05, "loss": 2.7728, "step": 2205 }, { "epoch": 0.36894259313459044, "grad_norm": 9.644865989685059, "learning_rate": 3.976892354311583e-05, "loss": 2.6319, "step": 2206 }, { "epoch": 0.3691098381904085, "grad_norm": 4.194123268127441, "learning_rate": 3.97571444392204e-05, "loss": 2.9406, "step": 2207 }, { "epoch": 0.36927708324622655, "grad_norm": 8.144009590148926, "learning_rate": 3.974536030523126e-05, "loss": 2.9132, "step": 2208 }, { "epoch": 0.36944432830204454, "grad_norm": 3.9011075496673584, "learning_rate": 3.9733571145165105e-05, "loss": 2.841, "step": 2209 }, { "epoch": 0.3696115733578626, "grad_norm": 3.118797540664673, "learning_rate": 3.972177696304038e-05, "loss": 2.5473, "step": 2210 }, { "epoch": 0.36977881841368065, "grad_norm": 4.334004878997803, "learning_rate": 3.9709977762877223e-05, "loss": 2.9911, "step": 2211 }, { "epoch": 0.3699460634694987, "grad_norm": 5.968229293823242, "learning_rate": 3.9698173548697506e-05, "loss": 3.1392, "step": 2212 }, { "epoch": 0.3701133085253167, "grad_norm": 2.781682252883911, "learning_rate": 3.9686364324524765e-05, "loss": 2.9217, "step": 2213 }, { "epoch": 0.37028055358113476, "grad_norm": 4.912415027618408, "learning_rate": 3.967455009438429e-05, "loss": 3.0145, "step": 2214 }, { "epoch": 0.3704477986369528, "grad_norm": 3.5599172115325928, "learning_rate": 3.966273086230305e-05, "loss": 3.1351, "step": 2215 }, { "epoch": 0.3706150436927708, "grad_norm": 2.7812790870666504, "learning_rate": 3.9650906632309736e-05, "loss": 2.3758, "step": 2216 }, { "epoch": 0.3707822887485889, "grad_norm": 4.726672172546387, "learning_rate": 3.963907740843472e-05, "loss": 2.2684, "step": 2217 }, { "epoch": 0.37094953380440693, "grad_norm": 8.936923027038574, "learning_rate": 3.962724319471011e-05, "loss": 3.2613, "step": 2218 }, { "epoch": 0.3711167788602249, "grad_norm": 3.8070833683013916, "learning_rate": 3.9615403995169674e-05, "loss": 2.6293, "step": 2219 }, { "epoch": 0.371284023916043, "grad_norm": 4.8537917137146, "learning_rate": 3.960355981384891e-05, "loss": 3.0613, "step": 2220 }, { "epoch": 0.37145126897186104, "grad_norm": 11.239511489868164, "learning_rate": 3.9591710654785e-05, "loss": 3.1649, "step": 2221 }, { "epoch": 0.37161851402767904, "grad_norm": 7.789117813110352, "learning_rate": 3.9579856522016834e-05, "loss": 2.8919, "step": 2222 }, { "epoch": 0.3717857590834971, "grad_norm": 2.5643935203552246, "learning_rate": 3.9567997419584983e-05, "loss": 2.6504, "step": 2223 }, { "epoch": 0.37195300413931515, "grad_norm": 6.073233127593994, "learning_rate": 3.9556133351531734e-05, "loss": 3.1031, "step": 2224 }, { "epoch": 0.37212024919513315, "grad_norm": 7.559192657470703, "learning_rate": 3.954426432190102e-05, "loss": 3.5917, "step": 2225 }, { "epoch": 0.3722874942509512, "grad_norm": 4.595912933349609, "learning_rate": 3.953239033473853e-05, "loss": 2.9518, "step": 2226 }, { "epoch": 0.37245473930676926, "grad_norm": 3.9838380813598633, "learning_rate": 3.9520511394091605e-05, "loss": 2.6935, "step": 2227 }, { "epoch": 0.37262198436258726, "grad_norm": 3.4424848556518555, "learning_rate": 3.9508627504009266e-05, "loss": 2.9975, "step": 2228 }, { "epoch": 0.3727892294184053, "grad_norm": 3.729421854019165, "learning_rate": 3.949673866854224e-05, "loss": 2.9753, "step": 2229 }, { "epoch": 0.37295647447422337, "grad_norm": 4.116894721984863, "learning_rate": 3.9484844891742936e-05, "loss": 2.6155, "step": 2230 }, { "epoch": 0.3731237195300414, "grad_norm": 3.9871273040771484, "learning_rate": 3.9472946177665444e-05, "loss": 2.4487, "step": 2231 }, { "epoch": 0.3732909645858594, "grad_norm": 3.3307223320007324, "learning_rate": 3.946104253036554e-05, "loss": 2.9689, "step": 2232 }, { "epoch": 0.3734582096416775, "grad_norm": 6.442385196685791, "learning_rate": 3.9449133953900673e-05, "loss": 2.7419, "step": 2233 }, { "epoch": 0.37362545469749553, "grad_norm": 3.8460495471954346, "learning_rate": 3.9437220452329984e-05, "loss": 2.7502, "step": 2234 }, { "epoch": 0.37379269975331353, "grad_norm": 4.948921203613281, "learning_rate": 3.94253020297143e-05, "loss": 2.9516, "step": 2235 }, { "epoch": 0.3739599448091316, "grad_norm": 4.014152526855469, "learning_rate": 3.9413378690116087e-05, "loss": 2.7527, "step": 2236 }, { "epoch": 0.37412718986494964, "grad_norm": 2.672684907913208, "learning_rate": 3.940145043759953e-05, "loss": 2.7937, "step": 2237 }, { "epoch": 0.37429443492076764, "grad_norm": 3.744354486465454, "learning_rate": 3.938951727623047e-05, "loss": 2.9049, "step": 2238 }, { "epoch": 0.3744616799765857, "grad_norm": 4.658524513244629, "learning_rate": 3.937757921007642e-05, "loss": 2.9906, "step": 2239 }, { "epoch": 0.37462892503240375, "grad_norm": 3.7683961391448975, "learning_rate": 3.936563624320658e-05, "loss": 2.5914, "step": 2240 }, { "epoch": 0.37479617008822175, "grad_norm": 6.320616245269775, "learning_rate": 3.935368837969179e-05, "loss": 2.9901, "step": 2241 }, { "epoch": 0.3749634151440398, "grad_norm": 3.655290365219116, "learning_rate": 3.934173562360458e-05, "loss": 2.8547, "step": 2242 }, { "epoch": 0.37513066019985786, "grad_norm": 4.2875213623046875, "learning_rate": 3.9329777979019154e-05, "loss": 3.0105, "step": 2243 }, { "epoch": 0.37529790525567586, "grad_norm": 4.370471477508545, "learning_rate": 3.931781545001137e-05, "loss": 2.8846, "step": 2244 }, { "epoch": 0.3754651503114939, "grad_norm": 4.097388744354248, "learning_rate": 3.930584804065876e-05, "loss": 2.6766, "step": 2245 }, { "epoch": 0.37563239536731197, "grad_norm": 2.7673163414001465, "learning_rate": 3.929387575504049e-05, "loss": 2.9069, "step": 2246 }, { "epoch": 0.37579964042312997, "grad_norm": 4.057861328125, "learning_rate": 3.9281898597237444e-05, "loss": 2.7661, "step": 2247 }, { "epoch": 0.375966885478948, "grad_norm": 3.784785270690918, "learning_rate": 3.926991657133212e-05, "loss": 2.7431, "step": 2248 }, { "epoch": 0.3761341305347661, "grad_norm": 3.940498113632202, "learning_rate": 3.925792968140868e-05, "loss": 3.0785, "step": 2249 }, { "epoch": 0.37630137559058413, "grad_norm": 6.901661396026611, "learning_rate": 3.924593793155297e-05, "loss": 3.1263, "step": 2250 }, { "epoch": 0.37646862064640213, "grad_norm": 2.894620656967163, "learning_rate": 3.9233941325852466e-05, "loss": 2.7723, "step": 2251 }, { "epoch": 0.3766358657022202, "grad_norm": 4.373434543609619, "learning_rate": 3.922193986839631e-05, "loss": 2.9502, "step": 2252 }, { "epoch": 0.37680311075803824, "grad_norm": 4.998827934265137, "learning_rate": 3.92099335632753e-05, "loss": 2.8893, "step": 2253 }, { "epoch": 0.37697035581385624, "grad_norm": 6.298311233520508, "learning_rate": 3.9197922414581875e-05, "loss": 3.0121, "step": 2254 }, { "epoch": 0.3771376008696743, "grad_norm": 3.089679002761841, "learning_rate": 3.918590642641016e-05, "loss": 2.8733, "step": 2255 }, { "epoch": 0.37730484592549235, "grad_norm": 7.536619663238525, "learning_rate": 3.917388560285587e-05, "loss": 3.0058, "step": 2256 }, { "epoch": 0.37747209098131035, "grad_norm": 12.007668495178223, "learning_rate": 3.9161859948016435e-05, "loss": 2.4551, "step": 2257 }, { "epoch": 0.3776393360371284, "grad_norm": 7.268791675567627, "learning_rate": 3.914982946599086e-05, "loss": 2.4502, "step": 2258 }, { "epoch": 0.37780658109294646, "grad_norm": 4.0674238204956055, "learning_rate": 3.913779416087986e-05, "loss": 2.8319, "step": 2259 }, { "epoch": 0.37797382614876446, "grad_norm": 5.13723087310791, "learning_rate": 3.912575403678576e-05, "loss": 3.1382, "step": 2260 }, { "epoch": 0.3781410712045825, "grad_norm": 3.691744565963745, "learning_rate": 3.9113709097812535e-05, "loss": 2.8627, "step": 2261 }, { "epoch": 0.37830831626040057, "grad_norm": 4.356899738311768, "learning_rate": 3.91016593480658e-05, "loss": 2.6444, "step": 2262 }, { "epoch": 0.37847556131621857, "grad_norm": 5.566958904266357, "learning_rate": 3.908960479165282e-05, "loss": 2.8186, "step": 2263 }, { "epoch": 0.3786428063720366, "grad_norm": 6.385880470275879, "learning_rate": 3.907754543268247e-05, "loss": 3.0717, "step": 2264 }, { "epoch": 0.3788100514278547, "grad_norm": 4.942833423614502, "learning_rate": 3.9065481275265307e-05, "loss": 3.1847, "step": 2265 }, { "epoch": 0.3789772964836727, "grad_norm": 2.2484238147735596, "learning_rate": 3.9053412323513474e-05, "loss": 3.0203, "step": 2266 }, { "epoch": 0.37914454153949073, "grad_norm": 5.323022842407227, "learning_rate": 3.904133858154079e-05, "loss": 2.936, "step": 2267 }, { "epoch": 0.3793117865953088, "grad_norm": 2.4883816242218018, "learning_rate": 3.902926005346267e-05, "loss": 2.4756, "step": 2268 }, { "epoch": 0.37947903165112684, "grad_norm": 4.245055675506592, "learning_rate": 3.901717674339621e-05, "loss": 3.1328, "step": 2269 }, { "epoch": 0.37964627670694484, "grad_norm": 3.761925458908081, "learning_rate": 3.900508865546007e-05, "loss": 2.8357, "step": 2270 }, { "epoch": 0.3798135217627629, "grad_norm": 5.312649726867676, "learning_rate": 3.8992995793774596e-05, "loss": 2.6242, "step": 2271 }, { "epoch": 0.37998076681858095, "grad_norm": 4.535066604614258, "learning_rate": 3.898089816246173e-05, "loss": 2.5243, "step": 2272 }, { "epoch": 0.38014801187439895, "grad_norm": 2.752199172973633, "learning_rate": 3.896879576564506e-05, "loss": 2.5572, "step": 2273 }, { "epoch": 0.380315256930217, "grad_norm": 3.7955541610717773, "learning_rate": 3.895668860744977e-05, "loss": 2.7997, "step": 2274 }, { "epoch": 0.38048250198603506, "grad_norm": 3.636723279953003, "learning_rate": 3.8944576692002696e-05, "loss": 2.6815, "step": 2275 }, { "epoch": 0.38064974704185306, "grad_norm": 5.669368743896484, "learning_rate": 3.893246002343229e-05, "loss": 2.861, "step": 2276 }, { "epoch": 0.3808169920976711, "grad_norm": 3.3921751976013184, "learning_rate": 3.89203386058686e-05, "loss": 2.6562, "step": 2277 }, { "epoch": 0.38098423715348917, "grad_norm": 4.523271560668945, "learning_rate": 3.8908212443443324e-05, "loss": 2.9247, "step": 2278 }, { "epoch": 0.38115148220930717, "grad_norm": 3.3059260845184326, "learning_rate": 3.8896081540289754e-05, "loss": 2.7041, "step": 2279 }, { "epoch": 0.3813187272651252, "grad_norm": 6.8038201332092285, "learning_rate": 3.8883945900542816e-05, "loss": 2.9395, "step": 2280 }, { "epoch": 0.3814859723209433, "grad_norm": 4.942650318145752, "learning_rate": 3.887180552833905e-05, "loss": 2.576, "step": 2281 }, { "epoch": 0.3816532173767613, "grad_norm": 4.529291152954102, "learning_rate": 3.885966042781658e-05, "loss": 3.2654, "step": 2282 }, { "epoch": 0.38182046243257933, "grad_norm": 4.414721965789795, "learning_rate": 3.884751060311518e-05, "loss": 2.9477, "step": 2283 }, { "epoch": 0.3819877074883974, "grad_norm": 8.713486671447754, "learning_rate": 3.883535605837621e-05, "loss": 3.6349, "step": 2284 }, { "epoch": 0.3821549525442154, "grad_norm": 3.148411512374878, "learning_rate": 3.8823196797742643e-05, "loss": 2.7615, "step": 2285 }, { "epoch": 0.38232219760003344, "grad_norm": 10.803143501281738, "learning_rate": 3.881103282535908e-05, "loss": 2.6128, "step": 2286 }, { "epoch": 0.3824894426558515, "grad_norm": 6.562727928161621, "learning_rate": 3.8798864145371684e-05, "loss": 2.0101, "step": 2287 }, { "epoch": 0.3826566877116695, "grad_norm": 3.230128288269043, "learning_rate": 3.878669076192827e-05, "loss": 2.8343, "step": 2288 }, { "epoch": 0.38282393276748755, "grad_norm": 4.09574556350708, "learning_rate": 3.877451267917823e-05, "loss": 2.7624, "step": 2289 }, { "epoch": 0.3829911778233056, "grad_norm": 8.990440368652344, "learning_rate": 3.876232990127257e-05, "loss": 2.8496, "step": 2290 }, { "epoch": 0.38315842287912366, "grad_norm": 4.775485515594482, "learning_rate": 3.875014243236388e-05, "loss": 3.2037, "step": 2291 }, { "epoch": 0.38332566793494166, "grad_norm": 5.094475269317627, "learning_rate": 3.873795027660636e-05, "loss": 3.2273, "step": 2292 }, { "epoch": 0.3834929129907597, "grad_norm": 4.8333892822265625, "learning_rate": 3.87257534381558e-05, "loss": 3.2444, "step": 2293 }, { "epoch": 0.38366015804657777, "grad_norm": 4.242833614349365, "learning_rate": 3.871355192116961e-05, "loss": 2.4772, "step": 2294 }, { "epoch": 0.38382740310239577, "grad_norm": 6.594497203826904, "learning_rate": 3.870134572980676e-05, "loss": 3.0755, "step": 2295 }, { "epoch": 0.3839946481582138, "grad_norm": 9.770747184753418, "learning_rate": 3.868913486822784e-05, "loss": 3.4536, "step": 2296 }, { "epoch": 0.3841618932140319, "grad_norm": 5.248602867126465, "learning_rate": 3.867691934059502e-05, "loss": 3.2244, "step": 2297 }, { "epoch": 0.3843291382698499, "grad_norm": 8.031108856201172, "learning_rate": 3.866469915107207e-05, "loss": 2.9955, "step": 2298 }, { "epoch": 0.38449638332566793, "grad_norm": 5.452845573425293, "learning_rate": 3.865247430382433e-05, "loss": 2.7042, "step": 2299 }, { "epoch": 0.384663628381486, "grad_norm": 5.66351842880249, "learning_rate": 3.864024480301874e-05, "loss": 2.5213, "step": 2300 }, { "epoch": 0.384830873437304, "grad_norm": 3.7808289527893066, "learning_rate": 3.8628010652823835e-05, "loss": 2.884, "step": 2301 }, { "epoch": 0.38499811849312204, "grad_norm": 11.350418090820312, "learning_rate": 3.861577185740972e-05, "loss": 3.2668, "step": 2302 }, { "epoch": 0.3851653635489401, "grad_norm": 5.030455112457275, "learning_rate": 3.860352842094809e-05, "loss": 2.7841, "step": 2303 }, { "epoch": 0.3853326086047581, "grad_norm": 4.949851036071777, "learning_rate": 3.8591280347612214e-05, "loss": 3.271, "step": 2304 }, { "epoch": 0.38549985366057615, "grad_norm": 4.504682540893555, "learning_rate": 3.857902764157696e-05, "loss": 2.7095, "step": 2305 }, { "epoch": 0.3856670987163942, "grad_norm": 6.8189239501953125, "learning_rate": 3.856677030701875e-05, "loss": 2.6261, "step": 2306 }, { "epoch": 0.3858343437722122, "grad_norm": 5.561661720275879, "learning_rate": 3.855450834811561e-05, "loss": 3.1, "step": 2307 }, { "epoch": 0.38600158882803026, "grad_norm": 9.234230041503906, "learning_rate": 3.854224176904713e-05, "loss": 2.9219, "step": 2308 }, { "epoch": 0.3861688338838483, "grad_norm": 3.5841071605682373, "learning_rate": 3.8529970573994455e-05, "loss": 2.7556, "step": 2309 }, { "epoch": 0.38633607893966637, "grad_norm": 4.41111421585083, "learning_rate": 3.851769476714034e-05, "loss": 2.6789, "step": 2310 }, { "epoch": 0.38650332399548437, "grad_norm": 3.958214521408081, "learning_rate": 3.85054143526691e-05, "loss": 2.662, "step": 2311 }, { "epoch": 0.3866705690513024, "grad_norm": 6.364981651306152, "learning_rate": 3.849312933476659e-05, "loss": 2.7574, "step": 2312 }, { "epoch": 0.3868378141071205, "grad_norm": 2.786710023880005, "learning_rate": 3.8480839717620285e-05, "loss": 2.873, "step": 2313 }, { "epoch": 0.3870050591629385, "grad_norm": 4.438518047332764, "learning_rate": 3.846854550541918e-05, "loss": 2.6462, "step": 2314 }, { "epoch": 0.38717230421875654, "grad_norm": 8.740317344665527, "learning_rate": 3.845624670235388e-05, "loss": 2.9139, "step": 2315 }, { "epoch": 0.3873395492745746, "grad_norm": 3.1161911487579346, "learning_rate": 3.8443943312616514e-05, "loss": 2.4449, "step": 2316 }, { "epoch": 0.3875067943303926, "grad_norm": 3.320859432220459, "learning_rate": 3.8431635340400805e-05, "loss": 2.6485, "step": 2317 }, { "epoch": 0.38767403938621064, "grad_norm": 4.571425914764404, "learning_rate": 3.8419322789902025e-05, "loss": 2.9258, "step": 2318 }, { "epoch": 0.3878412844420287, "grad_norm": 4.307285308837891, "learning_rate": 3.8407005665317e-05, "loss": 2.8616, "step": 2319 }, { "epoch": 0.3880085294978467, "grad_norm": 5.342110633850098, "learning_rate": 3.8394683970844126e-05, "loss": 2.916, "step": 2320 }, { "epoch": 0.38817577455366475, "grad_norm": 12.54957389831543, "learning_rate": 3.838235771068336e-05, "loss": 3.0683, "step": 2321 }, { "epoch": 0.3883430196094828, "grad_norm": 6.488868713378906, "learning_rate": 3.83700268890362e-05, "loss": 3.1689, "step": 2322 }, { "epoch": 0.3885102646653008, "grad_norm": 3.858682870864868, "learning_rate": 3.835769151010571e-05, "loss": 2.5792, "step": 2323 }, { "epoch": 0.38867750972111886, "grad_norm": 6.505568027496338, "learning_rate": 3.834535157809651e-05, "loss": 3.2226, "step": 2324 }, { "epoch": 0.3888447547769369, "grad_norm": 5.602285861968994, "learning_rate": 3.833300709721477e-05, "loss": 2.4812, "step": 2325 }, { "epoch": 0.3890119998327549, "grad_norm": 4.564261436462402, "learning_rate": 3.83206580716682e-05, "loss": 2.704, "step": 2326 }, { "epoch": 0.389179244888573, "grad_norm": 3.8989689350128174, "learning_rate": 3.830830450566607e-05, "loss": 3.2201, "step": 2327 }, { "epoch": 0.389346489944391, "grad_norm": 5.160497188568115, "learning_rate": 3.829594640341918e-05, "loss": 3.2393, "step": 2328 }, { "epoch": 0.3895137350002091, "grad_norm": 6.113044261932373, "learning_rate": 3.828358376913992e-05, "loss": 3.1111, "step": 2329 }, { "epoch": 0.3896809800560271, "grad_norm": 4.160523891448975, "learning_rate": 3.827121660704217e-05, "loss": 2.849, "step": 2330 }, { "epoch": 0.38984822511184514, "grad_norm": 4.020809650421143, "learning_rate": 3.8258844921341393e-05, "loss": 2.61, "step": 2331 }, { "epoch": 0.3900154701676632, "grad_norm": 3.3300414085388184, "learning_rate": 3.824646871625457e-05, "loss": 2.5374, "step": 2332 }, { "epoch": 0.3901827152234812, "grad_norm": 5.931562900543213, "learning_rate": 3.823408799600024e-05, "loss": 3.0656, "step": 2333 }, { "epoch": 0.39034996027929925, "grad_norm": 3.0005879402160645, "learning_rate": 3.822170276479846e-05, "loss": 2.7152, "step": 2334 }, { "epoch": 0.3905172053351173, "grad_norm": 11.922579765319824, "learning_rate": 3.820931302687085e-05, "loss": 3.9863, "step": 2335 }, { "epoch": 0.3906844503909353, "grad_norm": 2.7197773456573486, "learning_rate": 3.8196918786440557e-05, "loss": 2.7338, "step": 2336 }, { "epoch": 0.39085169544675336, "grad_norm": 3.9468655586242676, "learning_rate": 3.818452004773224e-05, "loss": 2.9257, "step": 2337 }, { "epoch": 0.3910189405025714, "grad_norm": 4.606106281280518, "learning_rate": 3.817211681497214e-05, "loss": 2.6545, "step": 2338 }, { "epoch": 0.3911861855583894, "grad_norm": 4.639556407928467, "learning_rate": 3.8159709092387975e-05, "loss": 3.1612, "step": 2339 }, { "epoch": 0.39135343061420746, "grad_norm": 3.1372015476226807, "learning_rate": 3.814729688420903e-05, "loss": 2.8095, "step": 2340 }, { "epoch": 0.3915206756700255, "grad_norm": 4.125485897064209, "learning_rate": 3.813488019466609e-05, "loss": 2.973, "step": 2341 }, { "epoch": 0.3916879207258435, "grad_norm": 3.999826192855835, "learning_rate": 3.8122459027991526e-05, "loss": 2.6372, "step": 2342 }, { "epoch": 0.3918551657816616, "grad_norm": 3.3851733207702637, "learning_rate": 3.811003338841916e-05, "loss": 2.4948, "step": 2343 }, { "epoch": 0.39202241083747963, "grad_norm": 4.309061527252197, "learning_rate": 3.8097603280184385e-05, "loss": 2.7335, "step": 2344 }, { "epoch": 0.39218965589329763, "grad_norm": 3.145214796066284, "learning_rate": 3.80851687075241e-05, "loss": 2.831, "step": 2345 }, { "epoch": 0.3923569009491157, "grad_norm": 3.3855013847351074, "learning_rate": 3.807272967467674e-05, "loss": 3.1141, "step": 2346 }, { "epoch": 0.39252414600493374, "grad_norm": 4.348839282989502, "learning_rate": 3.806028618588224e-05, "loss": 2.4769, "step": 2347 }, { "epoch": 0.3926913910607518, "grad_norm": 5.469672203063965, "learning_rate": 3.8047838245382086e-05, "loss": 3.1002, "step": 2348 }, { "epoch": 0.3928586361165698, "grad_norm": 6.206658840179443, "learning_rate": 3.803538585741923e-05, "loss": 3.0053, "step": 2349 }, { "epoch": 0.39302588117238785, "grad_norm": 4.202328205108643, "learning_rate": 3.802292902623819e-05, "loss": 2.7069, "step": 2350 }, { "epoch": 0.3931931262282059, "grad_norm": 51.555152893066406, "learning_rate": 3.8010467756084974e-05, "loss": 3.3852, "step": 2351 }, { "epoch": 0.3933603712840239, "grad_norm": 4.521754741668701, "learning_rate": 3.799800205120712e-05, "loss": 2.594, "step": 2352 }, { "epoch": 0.39352761633984196, "grad_norm": 5.674177169799805, "learning_rate": 3.7985531915853645e-05, "loss": 3.1792, "step": 2353 }, { "epoch": 0.39369486139566, "grad_norm": 5.109877109527588, "learning_rate": 3.797305735427511e-05, "loss": 2.6276, "step": 2354 }, { "epoch": 0.393862106451478, "grad_norm": 6.5439276695251465, "learning_rate": 3.796057837072358e-05, "loss": 3.5415, "step": 2355 }, { "epoch": 0.39402935150729607, "grad_norm": 5.586864471435547, "learning_rate": 3.794809496945259e-05, "loss": 2.6358, "step": 2356 }, { "epoch": 0.3941965965631141, "grad_norm": 2.837106943130493, "learning_rate": 3.793560715471723e-05, "loss": 3.0887, "step": 2357 }, { "epoch": 0.3943638416189321, "grad_norm": 4.101597785949707, "learning_rate": 3.792311493077408e-05, "loss": 2.8993, "step": 2358 }, { "epoch": 0.3945310866747502, "grad_norm": 7.328614711761475, "learning_rate": 3.791061830188122e-05, "loss": 3.1374, "step": 2359 }, { "epoch": 0.39469833173056823, "grad_norm": 4.128460884094238, "learning_rate": 3.78981172722982e-05, "loss": 2.8629, "step": 2360 }, { "epoch": 0.39486557678638623, "grad_norm": 2.956561326980591, "learning_rate": 3.788561184628612e-05, "loss": 2.8386, "step": 2361 }, { "epoch": 0.3950328218422043, "grad_norm": 3.0823421478271484, "learning_rate": 3.7873102028107556e-05, "loss": 2.7324, "step": 2362 }, { "epoch": 0.39520006689802234, "grad_norm": 5.031204700469971, "learning_rate": 3.7860587822026575e-05, "loss": 3.0685, "step": 2363 }, { "epoch": 0.39536731195384034, "grad_norm": 3.2398135662078857, "learning_rate": 3.784806923230875e-05, "loss": 2.5478, "step": 2364 }, { "epoch": 0.3955345570096584, "grad_norm": 1.9911688566207886, "learning_rate": 3.7835546263221146e-05, "loss": 2.5773, "step": 2365 }, { "epoch": 0.39570180206547645, "grad_norm": 6.768765449523926, "learning_rate": 3.782301891903233e-05, "loss": 3.8502, "step": 2366 }, { "epoch": 0.3958690471212945, "grad_norm": 3.2375550270080566, "learning_rate": 3.781048720401232e-05, "loss": 3.0587, "step": 2367 }, { "epoch": 0.3960362921771125, "grad_norm": 6.523290634155273, "learning_rate": 3.779795112243268e-05, "loss": 3.3211, "step": 2368 }, { "epoch": 0.39620353723293056, "grad_norm": 4.02286958694458, "learning_rate": 3.778541067856645e-05, "loss": 2.6296, "step": 2369 }, { "epoch": 0.3963707822887486, "grad_norm": 4.051251411437988, "learning_rate": 3.777286587668809e-05, "loss": 2.6049, "step": 2370 }, { "epoch": 0.3965380273445666, "grad_norm": 6.635698318481445, "learning_rate": 3.7760316721073655e-05, "loss": 3.1222, "step": 2371 }, { "epoch": 0.39670527240038467, "grad_norm": 3.9048142433166504, "learning_rate": 3.7747763216000595e-05, "loss": 2.836, "step": 2372 }, { "epoch": 0.3968725174562027, "grad_norm": 3.384138584136963, "learning_rate": 3.773520536574788e-05, "loss": 2.6679, "step": 2373 }, { "epoch": 0.3970397625120207, "grad_norm": 14.745644569396973, "learning_rate": 3.7722643174595975e-05, "loss": 3.6882, "step": 2374 }, { "epoch": 0.3972070075678388, "grad_norm": 4.052648067474365, "learning_rate": 3.7710076646826774e-05, "loss": 3.2001, "step": 2375 }, { "epoch": 0.39737425262365683, "grad_norm": 2.2837607860565186, "learning_rate": 3.769750578672371e-05, "loss": 2.6035, "step": 2376 }, { "epoch": 0.39754149767947483, "grad_norm": 12.489408493041992, "learning_rate": 3.768493059857165e-05, "loss": 3.6433, "step": 2377 }, { "epoch": 0.3977087427352929, "grad_norm": 4.271955490112305, "learning_rate": 3.767235108665694e-05, "loss": 2.7966, "step": 2378 }, { "epoch": 0.39787598779111094, "grad_norm": 2.7068428993225098, "learning_rate": 3.765976725526744e-05, "loss": 2.9937, "step": 2379 }, { "epoch": 0.39804323284692894, "grad_norm": 3.8248305320739746, "learning_rate": 3.764717910869242e-05, "loss": 2.6157, "step": 2380 }, { "epoch": 0.398210477902747, "grad_norm": 4.357799530029297, "learning_rate": 3.763458665122268e-05, "loss": 2.8393, "step": 2381 }, { "epoch": 0.39837772295856505, "grad_norm": 8.104601860046387, "learning_rate": 3.762198988715043e-05, "loss": 3.2406, "step": 2382 }, { "epoch": 0.39854496801438305, "grad_norm": 3.3306994438171387, "learning_rate": 3.760938882076942e-05, "loss": 2.5151, "step": 2383 }, { "epoch": 0.3987122130702011, "grad_norm": 2.5593671798706055, "learning_rate": 3.75967834563748e-05, "loss": 2.8863, "step": 2384 }, { "epoch": 0.39887945812601916, "grad_norm": 4.384718894958496, "learning_rate": 3.7584173798263214e-05, "loss": 2.8057, "step": 2385 }, { "epoch": 0.3990467031818372, "grad_norm": 5.272550582885742, "learning_rate": 3.757155985073277e-05, "loss": 3.0308, "step": 2386 }, { "epoch": 0.3992139482376552, "grad_norm": 4.171778202056885, "learning_rate": 3.755894161808304e-05, "loss": 3.1415, "step": 2387 }, { "epoch": 0.39938119329347327, "grad_norm": 3.7600531578063965, "learning_rate": 3.7546319104615056e-05, "loss": 2.5173, "step": 2388 }, { "epoch": 0.3995484383492913, "grad_norm": 3.8602848052978516, "learning_rate": 3.75336923146313e-05, "loss": 2.9621, "step": 2389 }, { "epoch": 0.3997156834051093, "grad_norm": 2.190941095352173, "learning_rate": 3.752106125243571e-05, "loss": 2.754, "step": 2390 }, { "epoch": 0.3998829284609274, "grad_norm": 2.903258800506592, "learning_rate": 3.750842592233369e-05, "loss": 2.7587, "step": 2391 }, { "epoch": 0.40005017351674543, "grad_norm": 3.3671538829803467, "learning_rate": 3.74957863286321e-05, "loss": 2.5881, "step": 2392 }, { "epoch": 0.40021741857256343, "grad_norm": 4.332389831542969, "learning_rate": 3.748314247563926e-05, "loss": 3.0654, "step": 2393 }, { "epoch": 0.4003846636283815, "grad_norm": 3.588890790939331, "learning_rate": 3.74704943676649e-05, "loss": 2.5849, "step": 2394 }, { "epoch": 0.40055190868419954, "grad_norm": 6.070440292358398, "learning_rate": 3.745784200902026e-05, "loss": 3.1421, "step": 2395 }, { "epoch": 0.40071915374001754, "grad_norm": 4.641623020172119, "learning_rate": 3.744518540401799e-05, "loss": 2.6809, "step": 2396 }, { "epoch": 0.4008863987958356, "grad_norm": 3.778353452682495, "learning_rate": 3.743252455697219e-05, "loss": 2.9259, "step": 2397 }, { "epoch": 0.40105364385165365, "grad_norm": 2.935297727584839, "learning_rate": 3.741985947219843e-05, "loss": 2.9297, "step": 2398 }, { "epoch": 0.40122088890747165, "grad_norm": 5.020274639129639, "learning_rate": 3.74071901540137e-05, "loss": 3.5055, "step": 2399 }, { "epoch": 0.4013881339632897, "grad_norm": 3.489565134048462, "learning_rate": 3.739451660673644e-05, "loss": 2.9513, "step": 2400 }, { "epoch": 0.40155537901910776, "grad_norm": 6.002955436706543, "learning_rate": 3.738183883468653e-05, "loss": 2.8716, "step": 2401 }, { "epoch": 0.40172262407492576, "grad_norm": 3.7859103679656982, "learning_rate": 3.736915684218529e-05, "loss": 2.5497, "step": 2402 }, { "epoch": 0.4018898691307438, "grad_norm": 11.139647483825684, "learning_rate": 3.735647063355551e-05, "loss": 3.3788, "step": 2403 }, { "epoch": 0.40205711418656187, "grad_norm": 5.578611373901367, "learning_rate": 3.734378021312135e-05, "loss": 2.761, "step": 2404 }, { "epoch": 0.40222435924237987, "grad_norm": 3.2936978340148926, "learning_rate": 3.733108558520846e-05, "loss": 2.7212, "step": 2405 }, { "epoch": 0.4023916042981979, "grad_norm": 4.0414886474609375, "learning_rate": 3.7318386754143906e-05, "loss": 3.1897, "step": 2406 }, { "epoch": 0.402558849354016, "grad_norm": 2.9578287601470947, "learning_rate": 3.730568372425618e-05, "loss": 2.6785, "step": 2407 }, { "epoch": 0.40272609440983403, "grad_norm": 5.281961917877197, "learning_rate": 3.7292976499875245e-05, "loss": 2.743, "step": 2408 }, { "epoch": 0.40289333946565203, "grad_norm": 3.8097727298736572, "learning_rate": 3.728026508533243e-05, "loss": 2.4899, "step": 2409 }, { "epoch": 0.4030605845214701, "grad_norm": 5.311424255371094, "learning_rate": 3.726754948496054e-05, "loss": 2.8507, "step": 2410 }, { "epoch": 0.40322782957728814, "grad_norm": 4.299258708953857, "learning_rate": 3.725482970309377e-05, "loss": 2.9473, "step": 2411 }, { "epoch": 0.40339507463310614, "grad_norm": 2.6682558059692383, "learning_rate": 3.72421057440678e-05, "loss": 2.5344, "step": 2412 }, { "epoch": 0.4035623196889242, "grad_norm": 3.2956674098968506, "learning_rate": 3.7229377612219677e-05, "loss": 3.2311, "step": 2413 }, { "epoch": 0.40372956474474225, "grad_norm": 3.7211620807647705, "learning_rate": 3.7216645311887874e-05, "loss": 2.7433, "step": 2414 }, { "epoch": 0.40389680980056025, "grad_norm": 7.169750213623047, "learning_rate": 3.720390884741232e-05, "loss": 3.2442, "step": 2415 }, { "epoch": 0.4040640548563783, "grad_norm": 4.610621929168701, "learning_rate": 3.719116822313434e-05, "loss": 2.2844, "step": 2416 }, { "epoch": 0.40423129991219636, "grad_norm": 6.308140277862549, "learning_rate": 3.717842344339668e-05, "loss": 3.2226, "step": 2417 }, { "epoch": 0.40439854496801436, "grad_norm": 4.780753135681152, "learning_rate": 3.71656745125435e-05, "loss": 2.8946, "step": 2418 }, { "epoch": 0.4045657900238324, "grad_norm": 3.3275935649871826, "learning_rate": 3.7152921434920375e-05, "loss": 2.9312, "step": 2419 }, { "epoch": 0.40473303507965047, "grad_norm": 4.683706760406494, "learning_rate": 3.71401642148743e-05, "loss": 2.7158, "step": 2420 }, { "epoch": 0.40490028013546847, "grad_norm": 3.3142316341400146, "learning_rate": 3.712740285675367e-05, "loss": 2.55, "step": 2421 }, { "epoch": 0.4050675251912865, "grad_norm": 2.9567079544067383, "learning_rate": 3.711463736490832e-05, "loss": 2.8757, "step": 2422 }, { "epoch": 0.4052347702471046, "grad_norm": 4.3135881423950195, "learning_rate": 3.710186774368945e-05, "loss": 2.5996, "step": 2423 }, { "epoch": 0.4054020153029226, "grad_norm": 2.0709054470062256, "learning_rate": 3.70890939974497e-05, "loss": 2.4728, "step": 2424 }, { "epoch": 0.40556926035874064, "grad_norm": 6.876306533813477, "learning_rate": 3.70763161305431e-05, "loss": 2.9369, "step": 2425 }, { "epoch": 0.4057365054145587, "grad_norm": 5.174966335296631, "learning_rate": 3.70635341473251e-05, "loss": 2.858, "step": 2426 }, { "epoch": 0.40590375047037675, "grad_norm": 3.1196393966674805, "learning_rate": 3.7050748052152536e-05, "loss": 2.6046, "step": 2427 }, { "epoch": 0.40607099552619474, "grad_norm": 3.546504020690918, "learning_rate": 3.7037957849383656e-05, "loss": 2.6957, "step": 2428 }, { "epoch": 0.4062382405820128, "grad_norm": 2.696834087371826, "learning_rate": 3.702516354337811e-05, "loss": 2.5113, "step": 2429 }, { "epoch": 0.40640548563783085, "grad_norm": 4.30765438079834, "learning_rate": 3.7012365138496943e-05, "loss": 2.8174, "step": 2430 }, { "epoch": 0.40657273069364885, "grad_norm": 9.047168731689453, "learning_rate": 3.699956263910258e-05, "loss": 2.622, "step": 2431 }, { "epoch": 0.4067399757494669, "grad_norm": 3.075723886489868, "learning_rate": 3.698675604955888e-05, "loss": 2.0971, "step": 2432 }, { "epoch": 0.40690722080528496, "grad_norm": 5.674729824066162, "learning_rate": 3.697394537423106e-05, "loss": 2.5898, "step": 2433 }, { "epoch": 0.40707446586110296, "grad_norm": 3.955933094024658, "learning_rate": 3.696113061748575e-05, "loss": 2.8882, "step": 2434 }, { "epoch": 0.407241710916921, "grad_norm": 11.25613784790039, "learning_rate": 3.694831178369095e-05, "loss": 2.9777, "step": 2435 }, { "epoch": 0.4074089559727391, "grad_norm": 5.862634658813477, "learning_rate": 3.693548887721609e-05, "loss": 2.8776, "step": 2436 }, { "epoch": 0.4075762010285571, "grad_norm": 3.1274619102478027, "learning_rate": 3.692266190243195e-05, "loss": 2.7437, "step": 2437 }, { "epoch": 0.4077434460843751, "grad_norm": 2.4201066493988037, "learning_rate": 3.6909830863710706e-05, "loss": 2.7914, "step": 2438 }, { "epoch": 0.4079106911401932, "grad_norm": 9.937926292419434, "learning_rate": 3.689699576542592e-05, "loss": 3.2919, "step": 2439 }, { "epoch": 0.4080779361960112, "grad_norm": 5.363358020782471, "learning_rate": 3.6884156611952554e-05, "loss": 3.2712, "step": 2440 }, { "epoch": 0.40824518125182924, "grad_norm": 6.00119686126709, "learning_rate": 3.687131340766693e-05, "loss": 2.9056, "step": 2441 }, { "epoch": 0.4084124263076473, "grad_norm": 2.3774590492248535, "learning_rate": 3.6858466156946755e-05, "loss": 2.6118, "step": 2442 }, { "epoch": 0.4085796713634653, "grad_norm": 4.844610214233398, "learning_rate": 3.684561486417114e-05, "loss": 3.1465, "step": 2443 }, { "epoch": 0.40874691641928335, "grad_norm": 4.140345573425293, "learning_rate": 3.683275953372052e-05, "loss": 2.6387, "step": 2444 }, { "epoch": 0.4089141614751014, "grad_norm": 6.463188648223877, "learning_rate": 3.6819900169976774e-05, "loss": 2.8469, "step": 2445 }, { "epoch": 0.40908140653091946, "grad_norm": 5.6157684326171875, "learning_rate": 3.68070367773231e-05, "loss": 2.6675, "step": 2446 }, { "epoch": 0.40924865158673746, "grad_norm": 3.9712846279144287, "learning_rate": 3.679416936014411e-05, "loss": 2.6853, "step": 2447 }, { "epoch": 0.4094158966425555, "grad_norm": 4.682806015014648, "learning_rate": 3.6781297922825745e-05, "loss": 2.9811, "step": 2448 }, { "epoch": 0.40958314169837357, "grad_norm": 2.5090785026550293, "learning_rate": 3.676842246975535e-05, "loss": 2.6021, "step": 2449 }, { "epoch": 0.40975038675419156, "grad_norm": 7.388420104980469, "learning_rate": 3.675554300532164e-05, "loss": 2.3886, "step": 2450 }, { "epoch": 0.4099176318100096, "grad_norm": 1.9170849323272705, "learning_rate": 3.6742659533914666e-05, "loss": 2.4086, "step": 2451 }, { "epoch": 0.4100848768658277, "grad_norm": 3.1079962253570557, "learning_rate": 3.672977205992587e-05, "loss": 2.5465, "step": 2452 }, { "epoch": 0.4102521219216457, "grad_norm": 3.734808921813965, "learning_rate": 3.671688058774807e-05, "loss": 2.744, "step": 2453 }, { "epoch": 0.41041936697746373, "grad_norm": 6.573564052581787, "learning_rate": 3.670398512177541e-05, "loss": 2.7485, "step": 2454 }, { "epoch": 0.4105866120332818, "grad_norm": 11.059652328491211, "learning_rate": 3.669108566640343e-05, "loss": 2.869, "step": 2455 }, { "epoch": 0.4107538570890998, "grad_norm": 3.358837366104126, "learning_rate": 3.667818222602901e-05, "loss": 2.76, "step": 2456 }, { "epoch": 0.41092110214491784, "grad_norm": 4.869976997375488, "learning_rate": 3.666527480505039e-05, "loss": 2.6133, "step": 2457 }, { "epoch": 0.4110883472007359, "grad_norm": 5.479213237762451, "learning_rate": 3.6652363407867174e-05, "loss": 3.0214, "step": 2458 }, { "epoch": 0.4112555922565539, "grad_norm": 8.298322677612305, "learning_rate": 3.6639448038880315e-05, "loss": 3.0564, "step": 2459 }, { "epoch": 0.41142283731237195, "grad_norm": 4.702263832092285, "learning_rate": 3.662652870249211e-05, "loss": 3.0161, "step": 2460 }, { "epoch": 0.41159008236819, "grad_norm": 3.5469744205474854, "learning_rate": 3.661360540310626e-05, "loss": 2.5912, "step": 2461 }, { "epoch": 0.411757327424008, "grad_norm": 2.583139181137085, "learning_rate": 3.6600678145127735e-05, "loss": 2.59, "step": 2462 }, { "epoch": 0.41192457247982606, "grad_norm": 4.162140369415283, "learning_rate": 3.658774693296292e-05, "loss": 2.8626, "step": 2463 }, { "epoch": 0.4120918175356441, "grad_norm": 3.058053731918335, "learning_rate": 3.657481177101952e-05, "loss": 2.8231, "step": 2464 }, { "epoch": 0.41225906259146217, "grad_norm": 3.562882661819458, "learning_rate": 3.656187266370659e-05, "loss": 2.8377, "step": 2465 }, { "epoch": 0.41242630764728017, "grad_norm": 7.292907238006592, "learning_rate": 3.6548929615434527e-05, "loss": 3.1418, "step": 2466 }, { "epoch": 0.4125935527030982, "grad_norm": 5.678032398223877, "learning_rate": 3.653598263061508e-05, "loss": 3.3731, "step": 2467 }, { "epoch": 0.4127607977589163, "grad_norm": 4.043649196624756, "learning_rate": 3.652303171366133e-05, "loss": 2.5023, "step": 2468 }, { "epoch": 0.4129280428147343, "grad_norm": 3.5435566902160645, "learning_rate": 3.6510076868987705e-05, "loss": 2.7279, "step": 2469 }, { "epoch": 0.41309528787055233, "grad_norm": 5.008689880371094, "learning_rate": 3.649711810100999e-05, "loss": 3.0633, "step": 2470 }, { "epoch": 0.4132625329263704, "grad_norm": 3.170131206512451, "learning_rate": 3.648415541414525e-05, "loss": 2.7849, "step": 2471 }, { "epoch": 0.4134297779821884, "grad_norm": 4.8178486824035645, "learning_rate": 3.647118881281195e-05, "loss": 2.5319, "step": 2472 }, { "epoch": 0.41359702303800644, "grad_norm": 6.053962230682373, "learning_rate": 3.645821830142985e-05, "loss": 3.2113, "step": 2473 }, { "epoch": 0.4137642680938245, "grad_norm": 4.517093181610107, "learning_rate": 3.6445243884420066e-05, "loss": 3.0294, "step": 2474 }, { "epoch": 0.4139315131496425, "grad_norm": 8.566788673400879, "learning_rate": 3.643226556620503e-05, "loss": 3.0532, "step": 2475 }, { "epoch": 0.41409875820546055, "grad_norm": 6.753882884979248, "learning_rate": 3.64192833512085e-05, "loss": 2.8451, "step": 2476 }, { "epoch": 0.4142660032612786, "grad_norm": 4.0887274742126465, "learning_rate": 3.640629724385559e-05, "loss": 2.7853, "step": 2477 }, { "epoch": 0.4144332483170966, "grad_norm": 5.001009464263916, "learning_rate": 3.63933072485727e-05, "loss": 2.8076, "step": 2478 }, { "epoch": 0.41460049337291466, "grad_norm": 2.3960721492767334, "learning_rate": 3.638031336978759e-05, "loss": 2.3307, "step": 2479 }, { "epoch": 0.4147677384287327, "grad_norm": 4.789402008056641, "learning_rate": 3.636731561192932e-05, "loss": 2.9685, "step": 2480 }, { "epoch": 0.4149349834845507, "grad_norm": 3.686969518661499, "learning_rate": 3.635431397942828e-05, "loss": 2.3948, "step": 2481 }, { "epoch": 0.41510222854036877, "grad_norm": 6.319107532501221, "learning_rate": 3.63413084767162e-05, "loss": 2.8698, "step": 2482 }, { "epoch": 0.4152694735961868, "grad_norm": 11.529572486877441, "learning_rate": 3.6328299108226104e-05, "loss": 3.7339, "step": 2483 }, { "epoch": 0.4154367186520049, "grad_norm": 6.432526111602783, "learning_rate": 3.6315285878392336e-05, "loss": 3.0263, "step": 2484 }, { "epoch": 0.4156039637078229, "grad_norm": 4.3036627769470215, "learning_rate": 3.630226879165057e-05, "loss": 3.2551, "step": 2485 }, { "epoch": 0.41577120876364093, "grad_norm": 5.766621112823486, "learning_rate": 3.628924785243778e-05, "loss": 2.7742, "step": 2486 }, { "epoch": 0.415938453819459, "grad_norm": 3.1232855319976807, "learning_rate": 3.627622306519227e-05, "loss": 2.8399, "step": 2487 }, { "epoch": 0.416105698875277, "grad_norm": 2.914320468902588, "learning_rate": 3.6263194434353636e-05, "loss": 2.7836, "step": 2488 }, { "epoch": 0.41627294393109504, "grad_norm": 4.401488780975342, "learning_rate": 3.625016196436279e-05, "loss": 2.4416, "step": 2489 }, { "epoch": 0.4164401889869131, "grad_norm": 6.222341537475586, "learning_rate": 3.6237125659661976e-05, "loss": 2.654, "step": 2490 }, { "epoch": 0.4166074340427311, "grad_norm": 3.851619243621826, "learning_rate": 3.6224085524694705e-05, "loss": 2.7063, "step": 2491 }, { "epoch": 0.41677467909854915, "grad_norm": 5.133815288543701, "learning_rate": 3.621104156390583e-05, "loss": 2.8168, "step": 2492 }, { "epoch": 0.4169419241543672, "grad_norm": 5.6387529373168945, "learning_rate": 3.619799378174147e-05, "loss": 2.9747, "step": 2493 }, { "epoch": 0.4171091692101852, "grad_norm": 3.6544549465179443, "learning_rate": 3.618494218264909e-05, "loss": 2.8357, "step": 2494 }, { "epoch": 0.41727641426600326, "grad_norm": 6.467111587524414, "learning_rate": 3.617188677107742e-05, "loss": 2.8896, "step": 2495 }, { "epoch": 0.4174436593218213, "grad_norm": 3.6658527851104736, "learning_rate": 3.6158827551476525e-05, "loss": 2.8002, "step": 2496 }, { "epoch": 0.4176109043776393, "grad_norm": 4.319884777069092, "learning_rate": 3.614576452829772e-05, "loss": 3.0071, "step": 2497 }, { "epoch": 0.41777814943345737, "grad_norm": 3.182084083557129, "learning_rate": 3.613269770599366e-05, "loss": 2.6578, "step": 2498 }, { "epoch": 0.4179453944892754, "grad_norm": 4.273075580596924, "learning_rate": 3.6119627089018276e-05, "loss": 2.8251, "step": 2499 }, { "epoch": 0.4181126395450934, "grad_norm": 17.230133056640625, "learning_rate": 3.6106552681826794e-05, "loss": 3.6793, "step": 2500 }, { "epoch": 0.4182798846009115, "grad_norm": 6.397789478302002, "learning_rate": 3.609347448887573e-05, "loss": 2.6914, "step": 2501 }, { "epoch": 0.41844712965672953, "grad_norm": 5.525222301483154, "learning_rate": 3.60803925146229e-05, "loss": 2.7283, "step": 2502 }, { "epoch": 0.4186143747125476, "grad_norm": 5.152113437652588, "learning_rate": 3.60673067635274e-05, "loss": 2.6366, "step": 2503 }, { "epoch": 0.4187816197683656, "grad_norm": 4.433398723602295, "learning_rate": 3.6054217240049616e-05, "loss": 2.778, "step": 2504 }, { "epoch": 0.41894886482418364, "grad_norm": 4.304104804992676, "learning_rate": 3.604112394865121e-05, "loss": 2.6542, "step": 2505 }, { "epoch": 0.4191161098800017, "grad_norm": 4.4151740074157715, "learning_rate": 3.6028026893795155e-05, "loss": 2.8436, "step": 2506 }, { "epoch": 0.4192833549358197, "grad_norm": 3.122807025909424, "learning_rate": 3.601492607994567e-05, "loss": 2.7818, "step": 2507 }, { "epoch": 0.41945059999163775, "grad_norm": 4.052037239074707, "learning_rate": 3.600182151156829e-05, "loss": 2.989, "step": 2508 }, { "epoch": 0.4196178450474558, "grad_norm": 9.076407432556152, "learning_rate": 3.5988713193129806e-05, "loss": 2.9146, "step": 2509 }, { "epoch": 0.4197850901032738, "grad_norm": 4.622187614440918, "learning_rate": 3.59756011290983e-05, "loss": 2.7231, "step": 2510 }, { "epoch": 0.41995233515909186, "grad_norm": 4.289161205291748, "learning_rate": 3.596248532394313e-05, "loss": 2.6144, "step": 2511 }, { "epoch": 0.4201195802149099, "grad_norm": 4.021975517272949, "learning_rate": 3.594936578213492e-05, "loss": 2.6185, "step": 2512 }, { "epoch": 0.4202868252707279, "grad_norm": 2.101064682006836, "learning_rate": 3.593624250814558e-05, "loss": 2.84, "step": 2513 }, { "epoch": 0.42045407032654597, "grad_norm": 6.105985641479492, "learning_rate": 3.5923115506448266e-05, "loss": 2.871, "step": 2514 }, { "epoch": 0.420621315382364, "grad_norm": 3.670258045196533, "learning_rate": 3.5909984781517445e-05, "loss": 2.5779, "step": 2515 }, { "epoch": 0.420788560438182, "grad_norm": 7.11684513092041, "learning_rate": 3.5896850337828833e-05, "loss": 3.2222, "step": 2516 }, { "epoch": 0.4209558054940001, "grad_norm": 6.308962821960449, "learning_rate": 3.5883712179859406e-05, "loss": 2.6769, "step": 2517 }, { "epoch": 0.42112305054981813, "grad_norm": 5.967824935913086, "learning_rate": 3.5870570312087405e-05, "loss": 2.8348, "step": 2518 }, { "epoch": 0.42129029560563613, "grad_norm": 2.7130284309387207, "learning_rate": 3.5857424738992356e-05, "loss": 2.644, "step": 2519 }, { "epoch": 0.4214575406614542, "grad_norm": 5.438333988189697, "learning_rate": 3.5844275465055034e-05, "loss": 2.9428, "step": 2520 }, { "epoch": 0.42162478571727224, "grad_norm": 5.764557361602783, "learning_rate": 3.583112249475747e-05, "loss": 2.5348, "step": 2521 }, { "epoch": 0.42179203077309024, "grad_norm": 4.548177242279053, "learning_rate": 3.5817965832582964e-05, "loss": 2.6318, "step": 2522 }, { "epoch": 0.4219592758289083, "grad_norm": 5.876018047332764, "learning_rate": 3.580480548301607e-05, "loss": 2.8019, "step": 2523 }, { "epoch": 0.42212652088472635, "grad_norm": 3.490492105484009, "learning_rate": 3.5791641450542615e-05, "loss": 2.5265, "step": 2524 }, { "epoch": 0.4222937659405444, "grad_norm": 19.452619552612305, "learning_rate": 3.577847373964966e-05, "loss": 3.7905, "step": 2525 }, { "epoch": 0.4224610109963624, "grad_norm": 4.256584167480469, "learning_rate": 3.57653023548255e-05, "loss": 2.364, "step": 2526 }, { "epoch": 0.42262825605218046, "grad_norm": 6.589704990386963, "learning_rate": 3.5752127300559755e-05, "loss": 2.9991, "step": 2527 }, { "epoch": 0.4227955011079985, "grad_norm": 3.5843188762664795, "learning_rate": 3.5738948581343224e-05, "loss": 2.3357, "step": 2528 }, { "epoch": 0.4229627461638165, "grad_norm": 3.629159927368164, "learning_rate": 3.572576620166799e-05, "loss": 2.2608, "step": 2529 }, { "epoch": 0.42312999121963457, "grad_norm": 3.78996205329895, "learning_rate": 3.571258016602737e-05, "loss": 2.5104, "step": 2530 }, { "epoch": 0.4232972362754526, "grad_norm": 5.914097785949707, "learning_rate": 3.569939047891593e-05, "loss": 2.9604, "step": 2531 }, { "epoch": 0.4234644813312706, "grad_norm": 3.493499517440796, "learning_rate": 3.568619714482949e-05, "loss": 2.5671, "step": 2532 }, { "epoch": 0.4236317263870887, "grad_norm": 2.4566287994384766, "learning_rate": 3.567300016826511e-05, "loss": 2.5733, "step": 2533 }, { "epoch": 0.42379897144290674, "grad_norm": 5.756424903869629, "learning_rate": 3.5659799553721075e-05, "loss": 3.4938, "step": 2534 }, { "epoch": 0.42396621649872474, "grad_norm": 3.0591318607330322, "learning_rate": 3.5646595305696926e-05, "loss": 2.6474, "step": 2535 }, { "epoch": 0.4241334615545428, "grad_norm": 3.7535934448242188, "learning_rate": 3.563338742869345e-05, "loss": 2.8204, "step": 2536 }, { "epoch": 0.42430070661036084, "grad_norm": 2.919597625732422, "learning_rate": 3.562017592721265e-05, "loss": 2.5185, "step": 2537 }, { "epoch": 0.42446795166617884, "grad_norm": 3.0312745571136475, "learning_rate": 3.5606960805757774e-05, "loss": 2.5899, "step": 2538 }, { "epoch": 0.4246351967219969, "grad_norm": 4.3286967277526855, "learning_rate": 3.559374206883331e-05, "loss": 2.749, "step": 2539 }, { "epoch": 0.42480244177781495, "grad_norm": 7.73644495010376, "learning_rate": 3.558051972094497e-05, "loss": 2.8132, "step": 2540 }, { "epoch": 0.42496968683363295, "grad_norm": 5.257650852203369, "learning_rate": 3.556729376659971e-05, "loss": 2.5274, "step": 2541 }, { "epoch": 0.425136931889451, "grad_norm": 4.152524948120117, "learning_rate": 3.555406421030568e-05, "loss": 2.7097, "step": 2542 }, { "epoch": 0.42530417694526906, "grad_norm": 4.008425712585449, "learning_rate": 3.55408310565723e-05, "loss": 3.1064, "step": 2543 }, { "epoch": 0.4254714220010871, "grad_norm": 9.253071784973145, "learning_rate": 3.5527594309910204e-05, "loss": 2.8645, "step": 2544 }, { "epoch": 0.4256386670569051, "grad_norm": 4.053314208984375, "learning_rate": 3.551435397483124e-05, "loss": 2.9696, "step": 2545 }, { "epoch": 0.4258059121127232, "grad_norm": 5.940432071685791, "learning_rate": 3.550111005584847e-05, "loss": 2.8169, "step": 2546 }, { "epoch": 0.4259731571685412, "grad_norm": 4.279513835906982, "learning_rate": 3.5487862557476224e-05, "loss": 2.4113, "step": 2547 }, { "epoch": 0.4261404022243592, "grad_norm": 2.7410783767700195, "learning_rate": 3.547461148422999e-05, "loss": 2.6029, "step": 2548 }, { "epoch": 0.4263076472801773, "grad_norm": 4.225793361663818, "learning_rate": 3.5461356840626524e-05, "loss": 2.7816, "step": 2549 }, { "epoch": 0.42647489233599534, "grad_norm": 2.763251543045044, "learning_rate": 3.544809863118378e-05, "loss": 2.5873, "step": 2550 }, { "epoch": 0.42664213739181334, "grad_norm": 5.779864311218262, "learning_rate": 3.543483686042091e-05, "loss": 2.799, "step": 2551 }, { "epoch": 0.4268093824476314, "grad_norm": 3.7189855575561523, "learning_rate": 3.542157153285831e-05, "loss": 2.6052, "step": 2552 }, { "epoch": 0.42697662750344945, "grad_norm": 4.466254711151123, "learning_rate": 3.540830265301758e-05, "loss": 2.5332, "step": 2553 }, { "epoch": 0.42714387255926745, "grad_norm": 7.498110294342041, "learning_rate": 3.539503022542151e-05, "loss": 2.8597, "step": 2554 }, { "epoch": 0.4273111176150855, "grad_norm": 4.081950664520264, "learning_rate": 3.538175425459413e-05, "loss": 2.5538, "step": 2555 }, { "epoch": 0.42747836267090356, "grad_norm": 3.0192339420318604, "learning_rate": 3.5368474745060664e-05, "loss": 2.5403, "step": 2556 }, { "epoch": 0.42764560772672155, "grad_norm": 5.594992637634277, "learning_rate": 3.5355191701347534e-05, "loss": 3.0123, "step": 2557 }, { "epoch": 0.4278128527825396, "grad_norm": 5.024133205413818, "learning_rate": 3.5341905127982384e-05, "loss": 3.239, "step": 2558 }, { "epoch": 0.42798009783835766, "grad_norm": 3.6631875038146973, "learning_rate": 3.532861502949403e-05, "loss": 2.9663, "step": 2559 }, { "epoch": 0.42814734289417566, "grad_norm": 2.968170642852783, "learning_rate": 3.531532141041254e-05, "loss": 2.8311, "step": 2560 }, { "epoch": 0.4283145879499937, "grad_norm": 8.417374610900879, "learning_rate": 3.530202427526914e-05, "loss": 3.129, "step": 2561 }, { "epoch": 0.4284818330058118, "grad_norm": 6.836033344268799, "learning_rate": 3.5288723628596265e-05, "loss": 3.5529, "step": 2562 }, { "epoch": 0.42864907806162983, "grad_norm": 3.640509605407715, "learning_rate": 3.527541947492755e-05, "loss": 2.5791, "step": 2563 }, { "epoch": 0.42881632311744783, "grad_norm": 5.467000961303711, "learning_rate": 3.5262111818797835e-05, "loss": 2.6876, "step": 2564 }, { "epoch": 0.4289835681732659, "grad_norm": 4.587298393249512, "learning_rate": 3.524880066474312e-05, "loss": 2.9253, "step": 2565 }, { "epoch": 0.42915081322908394, "grad_norm": 2.5102314949035645, "learning_rate": 3.523548601730065e-05, "loss": 2.4445, "step": 2566 }, { "epoch": 0.42931805828490194, "grad_norm": 2.7889363765716553, "learning_rate": 3.5222167881008814e-05, "loss": 2.6575, "step": 2567 }, { "epoch": 0.42948530334072, "grad_norm": 4.854933738708496, "learning_rate": 3.520884626040721e-05, "loss": 2.6254, "step": 2568 }, { "epoch": 0.42965254839653805, "grad_norm": 3.8827943801879883, "learning_rate": 3.5195521160036626e-05, "loss": 3.1317, "step": 2569 }, { "epoch": 0.42981979345235605, "grad_norm": 4.327825546264648, "learning_rate": 3.518219258443903e-05, "loss": 2.7184, "step": 2570 }, { "epoch": 0.4299870385081741, "grad_norm": 2.805140733718872, "learning_rate": 3.5168860538157564e-05, "loss": 2.7008, "step": 2571 }, { "epoch": 0.43015428356399216, "grad_norm": 2.938091516494751, "learning_rate": 3.515552502573659e-05, "loss": 2.5138, "step": 2572 }, { "epoch": 0.43032152861981016, "grad_norm": 3.8909053802490234, "learning_rate": 3.514218605172161e-05, "loss": 2.6143, "step": 2573 }, { "epoch": 0.4304887736756282, "grad_norm": 5.493832111358643, "learning_rate": 3.512884362065933e-05, "loss": 2.8515, "step": 2574 }, { "epoch": 0.43065601873144627, "grad_norm": 2.9926202297210693, "learning_rate": 3.511549773709762e-05, "loss": 2.6588, "step": 2575 }, { "epoch": 0.43082326378726427, "grad_norm": 4.265646457672119, "learning_rate": 3.510214840558555e-05, "loss": 2.2927, "step": 2576 }, { "epoch": 0.4309905088430823, "grad_norm": 5.10701322555542, "learning_rate": 3.5088795630673324e-05, "loss": 2.5253, "step": 2577 }, { "epoch": 0.4311577538989004, "grad_norm": 3.673614501953125, "learning_rate": 3.507543941691236e-05, "loss": 2.7382, "step": 2578 }, { "epoch": 0.4313249989547184, "grad_norm": 2.9125545024871826, "learning_rate": 3.5062079768855235e-05, "loss": 2.8377, "step": 2579 }, { "epoch": 0.43149224401053643, "grad_norm": 5.467430114746094, "learning_rate": 3.5048716691055685e-05, "loss": 3.0062, "step": 2580 }, { "epoch": 0.4316594890663545, "grad_norm": 3.136491537094116, "learning_rate": 3.503535018806863e-05, "loss": 2.3071, "step": 2581 }, { "epoch": 0.43182673412217254, "grad_norm": 3.0559587478637695, "learning_rate": 3.502198026445016e-05, "loss": 2.7023, "step": 2582 }, { "epoch": 0.43199397917799054, "grad_norm": 6.489109992980957, "learning_rate": 3.50086069247575e-05, "loss": 3.3116, "step": 2583 }, { "epoch": 0.4321612242338086, "grad_norm": 8.163722038269043, "learning_rate": 3.499523017354909e-05, "loss": 3.0178, "step": 2584 }, { "epoch": 0.43232846928962665, "grad_norm": 7.332399368286133, "learning_rate": 3.498185001538449e-05, "loss": 2.999, "step": 2585 }, { "epoch": 0.43249571434544465, "grad_norm": 2.640761613845825, "learning_rate": 3.496846645482445e-05, "loss": 2.4205, "step": 2586 }, { "epoch": 0.4326629594012627, "grad_norm": 2.61495304107666, "learning_rate": 3.495507949643084e-05, "loss": 2.8867, "step": 2587 }, { "epoch": 0.43283020445708076, "grad_norm": 2.998173236846924, "learning_rate": 3.4941689144766734e-05, "loss": 2.9443, "step": 2588 }, { "epoch": 0.43299744951289876, "grad_norm": 6.024118423461914, "learning_rate": 3.492829540439635e-05, "loss": 2.9958, "step": 2589 }, { "epoch": 0.4331646945687168, "grad_norm": 7.735472679138184, "learning_rate": 3.491489827988503e-05, "loss": 3.2356, "step": 2590 }, { "epoch": 0.43333193962453487, "grad_norm": 6.0049591064453125, "learning_rate": 3.4901497775799316e-05, "loss": 2.5697, "step": 2591 }, { "epoch": 0.43349918468035287, "grad_norm": 3.54941987991333, "learning_rate": 3.488809389670687e-05, "loss": 2.7484, "step": 2592 }, { "epoch": 0.4336664297361709, "grad_norm": 4.416013240814209, "learning_rate": 3.487468664717652e-05, "loss": 3.0881, "step": 2593 }, { "epoch": 0.433833674791989, "grad_norm": 4.704264163970947, "learning_rate": 3.4861276031778235e-05, "loss": 2.6247, "step": 2594 }, { "epoch": 0.434000919847807, "grad_norm": 3.8944287300109863, "learning_rate": 3.4847862055083134e-05, "loss": 2.7967, "step": 2595 }, { "epoch": 0.43416816490362503, "grad_norm": 4.763270378112793, "learning_rate": 3.483444472166346e-05, "loss": 2.9756, "step": 2596 }, { "epoch": 0.4343354099594431, "grad_norm": 3.763389825820923, "learning_rate": 3.482102403609265e-05, "loss": 2.8581, "step": 2597 }, { "epoch": 0.4345026550152611, "grad_norm": 4.861575126647949, "learning_rate": 3.4807600002945244e-05, "loss": 2.776, "step": 2598 }, { "epoch": 0.43466990007107914, "grad_norm": 3.535306453704834, "learning_rate": 3.479417262679694e-05, "loss": 2.5623, "step": 2599 }, { "epoch": 0.4348371451268972, "grad_norm": 2.893958806991577, "learning_rate": 3.478074191222455e-05, "loss": 2.6906, "step": 2600 }, { "epoch": 0.43500439018271525, "grad_norm": 3.186826705932617, "learning_rate": 3.4767307863806064e-05, "loss": 2.5488, "step": 2601 }, { "epoch": 0.43517163523853325, "grad_norm": 4.962519645690918, "learning_rate": 3.4753870486120574e-05, "loss": 2.7078, "step": 2602 }, { "epoch": 0.4353388802943513, "grad_norm": 5.056987762451172, "learning_rate": 3.474042978374834e-05, "loss": 2.9714, "step": 2603 }, { "epoch": 0.43550612535016936, "grad_norm": 3.613903522491455, "learning_rate": 3.4726985761270706e-05, "loss": 2.6829, "step": 2604 }, { "epoch": 0.43567337040598736, "grad_norm": 5.3246588706970215, "learning_rate": 3.47135384232702e-05, "loss": 2.6028, "step": 2605 }, { "epoch": 0.4358406154618054, "grad_norm": 5.825587749481201, "learning_rate": 3.470008777433045e-05, "loss": 2.9313, "step": 2606 }, { "epoch": 0.43600786051762347, "grad_norm": 9.978293418884277, "learning_rate": 3.468663381903623e-05, "loss": 2.1824, "step": 2607 }, { "epoch": 0.43617510557344147, "grad_norm": 4.201888084411621, "learning_rate": 3.4673176561973414e-05, "loss": 2.8344, "step": 2608 }, { "epoch": 0.4363423506292595, "grad_norm": 4.791114330291748, "learning_rate": 3.4659716007729027e-05, "loss": 2.8766, "step": 2609 }, { "epoch": 0.4365095956850776, "grad_norm": 4.110183238983154, "learning_rate": 3.464625216089121e-05, "loss": 2.9306, "step": 2610 }, { "epoch": 0.4366768407408956, "grad_norm": 4.22127628326416, "learning_rate": 3.463278502604922e-05, "loss": 2.4742, "step": 2611 }, { "epoch": 0.43684408579671363, "grad_norm": 4.301149368286133, "learning_rate": 3.461931460779344e-05, "loss": 2.8538, "step": 2612 }, { "epoch": 0.4370113308525317, "grad_norm": 6.452043533325195, "learning_rate": 3.460584091071539e-05, "loss": 2.6478, "step": 2613 }, { "epoch": 0.4371785759083497, "grad_norm": 4.2410407066345215, "learning_rate": 3.459236393940767e-05, "loss": 2.5084, "step": 2614 }, { "epoch": 0.43734582096416774, "grad_norm": 5.189432144165039, "learning_rate": 3.457888369846403e-05, "loss": 2.8396, "step": 2615 }, { "epoch": 0.4375130660199858, "grad_norm": 5.197939395904541, "learning_rate": 3.45654001924793e-05, "loss": 2.5456, "step": 2616 }, { "epoch": 0.4376803110758038, "grad_norm": 2.2332067489624023, "learning_rate": 3.455191342604947e-05, "loss": 2.7921, "step": 2617 }, { "epoch": 0.43784755613162185, "grad_norm": 3.5980589389801025, "learning_rate": 3.453842340377159e-05, "loss": 2.1199, "step": 2618 }, { "epoch": 0.4380148011874399, "grad_norm": 5.61405611038208, "learning_rate": 3.452493013024387e-05, "loss": 3.0813, "step": 2619 }, { "epoch": 0.43818204624325796, "grad_norm": 5.053504467010498, "learning_rate": 3.4511433610065594e-05, "loss": 2.8012, "step": 2620 }, { "epoch": 0.43834929129907596, "grad_norm": 4.2540507316589355, "learning_rate": 3.449793384783715e-05, "loss": 2.8966, "step": 2621 }, { "epoch": 0.438516536354894, "grad_norm": 3.6609251499176025, "learning_rate": 3.448443084816005e-05, "loss": 3.0997, "step": 2622 }, { "epoch": 0.43868378141071207, "grad_norm": 3.751577138900757, "learning_rate": 3.447092461563691e-05, "loss": 2.6956, "step": 2623 }, { "epoch": 0.43885102646653007, "grad_norm": 3.3572065830230713, "learning_rate": 3.4457415154871424e-05, "loss": 3.0183, "step": 2624 }, { "epoch": 0.4390182715223481, "grad_norm": 4.727490425109863, "learning_rate": 3.444390247046841e-05, "loss": 2.9716, "step": 2625 }, { "epoch": 0.4391855165781662, "grad_norm": 4.67921781539917, "learning_rate": 3.443038656703378e-05, "loss": 2.6807, "step": 2626 }, { "epoch": 0.4393527616339842, "grad_norm": 4.211130619049072, "learning_rate": 3.441686744917455e-05, "loss": 3.0857, "step": 2627 }, { "epoch": 0.43952000668980223, "grad_norm": 5.872931003570557, "learning_rate": 3.440334512149881e-05, "loss": 2.5154, "step": 2628 }, { "epoch": 0.4396872517456203, "grad_norm": 3.157461166381836, "learning_rate": 3.4389819588615744e-05, "loss": 2.9384, "step": 2629 }, { "epoch": 0.4398544968014383, "grad_norm": 5.840359210968018, "learning_rate": 3.437629085513566e-05, "loss": 2.7742, "step": 2630 }, { "epoch": 0.44002174185725634, "grad_norm": 5.587758541107178, "learning_rate": 3.436275892566994e-05, "loss": 2.9584, "step": 2631 }, { "epoch": 0.4401889869130744, "grad_norm": 12.881531715393066, "learning_rate": 3.4349223804831044e-05, "loss": 3.179, "step": 2632 }, { "epoch": 0.4403562319688924, "grad_norm": 4.595426082611084, "learning_rate": 3.433568549723252e-05, "loss": 2.9745, "step": 2633 }, { "epoch": 0.44052347702471045, "grad_norm": 14.56092357635498, "learning_rate": 3.432214400748903e-05, "loss": 3.2775, "step": 2634 }, { "epoch": 0.4406907220805285, "grad_norm": 2.8893208503723145, "learning_rate": 3.43085993402163e-05, "loss": 2.9376, "step": 2635 }, { "epoch": 0.4408579671363465, "grad_norm": 3.315563440322876, "learning_rate": 3.429505150003113e-05, "loss": 2.856, "step": 2636 }, { "epoch": 0.44102521219216456, "grad_norm": 2.498013734817505, "learning_rate": 3.4281500491551414e-05, "loss": 2.5648, "step": 2637 }, { "epoch": 0.4411924572479826, "grad_norm": 3.548940896987915, "learning_rate": 3.426794631939613e-05, "loss": 3.3152, "step": 2638 }, { "epoch": 0.4413597023038006, "grad_norm": 3.6011385917663574, "learning_rate": 3.4254388988185325e-05, "loss": 2.6149, "step": 2639 }, { "epoch": 0.44152694735961867, "grad_norm": 8.40599536895752, "learning_rate": 3.424082850254013e-05, "loss": 3.5145, "step": 2640 }, { "epoch": 0.4416941924154367, "grad_norm": 3.4974546432495117, "learning_rate": 3.4227264867082744e-05, "loss": 2.8451, "step": 2641 }, { "epoch": 0.4418614374712548, "grad_norm": 2.9390907287597656, "learning_rate": 3.421369808643644e-05, "loss": 2.6386, "step": 2642 }, { "epoch": 0.4420286825270728, "grad_norm": 5.509983062744141, "learning_rate": 3.4200128165225575e-05, "loss": 2.8014, "step": 2643 }, { "epoch": 0.44219592758289084, "grad_norm": 3.1659486293792725, "learning_rate": 3.418655510807557e-05, "loss": 2.488, "step": 2644 }, { "epoch": 0.4423631726387089, "grad_norm": 2.5254554748535156, "learning_rate": 3.4172978919612896e-05, "loss": 2.5323, "step": 2645 }, { "epoch": 0.4425304176945269, "grad_norm": 5.184157848358154, "learning_rate": 3.415939960446513e-05, "loss": 2.9452, "step": 2646 }, { "epoch": 0.44269766275034494, "grad_norm": 6.765993595123291, "learning_rate": 3.414581716726087e-05, "loss": 2.4685, "step": 2647 }, { "epoch": 0.442864907806163, "grad_norm": 4.972833156585693, "learning_rate": 3.413223161262982e-05, "loss": 2.5019, "step": 2648 }, { "epoch": 0.443032152861981, "grad_norm": 3.7821898460388184, "learning_rate": 3.411864294520273e-05, "loss": 2.6694, "step": 2649 }, { "epoch": 0.44319939791779905, "grad_norm": 2.932685375213623, "learning_rate": 3.410505116961139e-05, "loss": 2.6969, "step": 2650 }, { "epoch": 0.4433666429736171, "grad_norm": 5.253076553344727, "learning_rate": 3.409145629048868e-05, "loss": 2.8481, "step": 2651 }, { "epoch": 0.4435338880294351, "grad_norm": 2.1462671756744385, "learning_rate": 3.407785831246852e-05, "loss": 2.6337, "step": 2652 }, { "epoch": 0.44370113308525316, "grad_norm": 6.226997375488281, "learning_rate": 3.4064257240185903e-05, "loss": 3.061, "step": 2653 }, { "epoch": 0.4438683781410712, "grad_norm": 3.6949145793914795, "learning_rate": 3.4050653078276856e-05, "loss": 2.6946, "step": 2654 }, { "epoch": 0.4440356231968892, "grad_norm": 6.4193267822265625, "learning_rate": 3.403704583137847e-05, "loss": 2.9024, "step": 2655 }, { "epoch": 0.4442028682527073, "grad_norm": 5.5297770500183105, "learning_rate": 3.40234355041289e-05, "loss": 3.0457, "step": 2656 }, { "epoch": 0.4443701133085253, "grad_norm": 10.007364273071289, "learning_rate": 3.400982210116732e-05, "loss": 2.7583, "step": 2657 }, { "epoch": 0.4445373583643433, "grad_norm": 5.850971698760986, "learning_rate": 3.399620562713397e-05, "loss": 2.5503, "step": 2658 }, { "epoch": 0.4447046034201614, "grad_norm": 5.765912055969238, "learning_rate": 3.398258608667016e-05, "loss": 2.8296, "step": 2659 }, { "epoch": 0.44487184847597944, "grad_norm": 3.8774688243865967, "learning_rate": 3.3968963484418195e-05, "loss": 2.9414, "step": 2660 }, { "epoch": 0.4450390935317975, "grad_norm": 5.197319030761719, "learning_rate": 3.395533782502146e-05, "loss": 2.6322, "step": 2661 }, { "epoch": 0.4452063385876155, "grad_norm": 3.6945042610168457, "learning_rate": 3.394170911312437e-05, "loss": 2.6174, "step": 2662 }, { "epoch": 0.44537358364343355, "grad_norm": 7.892509460449219, "learning_rate": 3.39280773533724e-05, "loss": 2.8354, "step": 2663 }, { "epoch": 0.4455408286992516, "grad_norm": 5.511669635772705, "learning_rate": 3.3914442550412035e-05, "loss": 2.6711, "step": 2664 }, { "epoch": 0.4457080737550696, "grad_norm": 3.6831912994384766, "learning_rate": 3.39008047088908e-05, "loss": 2.7275, "step": 2665 }, { "epoch": 0.44587531881088766, "grad_norm": 9.05732250213623, "learning_rate": 3.388716383345729e-05, "loss": 2.8864, "step": 2666 }, { "epoch": 0.4460425638667057, "grad_norm": 2.0668063163757324, "learning_rate": 3.3873519928761075e-05, "loss": 2.379, "step": 2667 }, { "epoch": 0.4462098089225237, "grad_norm": 10.413956642150879, "learning_rate": 3.385987299945283e-05, "loss": 2.852, "step": 2668 }, { "epoch": 0.44637705397834176, "grad_norm": 3.799875259399414, "learning_rate": 3.384622305018419e-05, "loss": 2.8828, "step": 2669 }, { "epoch": 0.4465442990341598, "grad_norm": 6.161764621734619, "learning_rate": 3.383257008560786e-05, "loss": 3.0656, "step": 2670 }, { "epoch": 0.4467115440899778, "grad_norm": 6.535909652709961, "learning_rate": 3.381891411037758e-05, "loss": 2.7513, "step": 2671 }, { "epoch": 0.4468787891457959, "grad_norm": 8.002820014953613, "learning_rate": 3.3805255129148086e-05, "loss": 2.8773, "step": 2672 }, { "epoch": 0.44704603420161393, "grad_norm": 2.9862728118896484, "learning_rate": 3.3791593146575166e-05, "loss": 2.5715, "step": 2673 }, { "epoch": 0.44721327925743193, "grad_norm": 5.644151210784912, "learning_rate": 3.377792816731561e-05, "loss": 2.6956, "step": 2674 }, { "epoch": 0.44738052431325, "grad_norm": 4.209527969360352, "learning_rate": 3.3764260196027244e-05, "loss": 3.0807, "step": 2675 }, { "epoch": 0.44754776936906804, "grad_norm": 4.878886699676514, "learning_rate": 3.375058923736889e-05, "loss": 2.8033, "step": 2676 }, { "epoch": 0.44771501442488604, "grad_norm": 5.974396228790283, "learning_rate": 3.373691529600044e-05, "loss": 2.6402, "step": 2677 }, { "epoch": 0.4478822594807041, "grad_norm": 4.581871509552002, "learning_rate": 3.372323837658274e-05, "loss": 2.543, "step": 2678 }, { "epoch": 0.44804950453652215, "grad_norm": 4.0188727378845215, "learning_rate": 3.370955848377769e-05, "loss": 2.6876, "step": 2679 }, { "epoch": 0.4482167495923402, "grad_norm": 3.3618123531341553, "learning_rate": 3.36958756222482e-05, "loss": 2.9161, "step": 2680 }, { "epoch": 0.4483839946481582, "grad_norm": 6.2909135818481445, "learning_rate": 3.368218979665818e-05, "loss": 2.9959, "step": 2681 }, { "epoch": 0.44855123970397626, "grad_norm": 14.851531982421875, "learning_rate": 3.366850101167254e-05, "loss": 3.544, "step": 2682 }, { "epoch": 0.4487184847597943, "grad_norm": 3.520900249481201, "learning_rate": 3.365480927195724e-05, "loss": 2.4627, "step": 2683 }, { "epoch": 0.4488857298156123, "grad_norm": 3.9633803367614746, "learning_rate": 3.364111458217922e-05, "loss": 2.6352, "step": 2684 }, { "epoch": 0.44905297487143037, "grad_norm": 5.238057613372803, "learning_rate": 3.362741694700641e-05, "loss": 2.332, "step": 2685 }, { "epoch": 0.4492202199272484, "grad_norm": 4.4461140632629395, "learning_rate": 3.361371637110777e-05, "loss": 2.9494, "step": 2686 }, { "epoch": 0.4493874649830664, "grad_norm": 4.049163818359375, "learning_rate": 3.360001285915326e-05, "loss": 2.6252, "step": 2687 }, { "epoch": 0.4495547100388845, "grad_norm": 6.007683753967285, "learning_rate": 3.3586306415813824e-05, "loss": 2.9397, "step": 2688 }, { "epoch": 0.44972195509470253, "grad_norm": 7.447728633880615, "learning_rate": 3.3572597045761425e-05, "loss": 3.0311, "step": 2689 }, { "epoch": 0.44988920015052053, "grad_norm": 4.763315677642822, "learning_rate": 3.3558884753668996e-05, "loss": 2.947, "step": 2690 }, { "epoch": 0.4500564452063386, "grad_norm": 5.828144550323486, "learning_rate": 3.354516954421051e-05, "loss": 2.6937, "step": 2691 }, { "epoch": 0.45022369026215664, "grad_norm": 4.845881462097168, "learning_rate": 3.353145142206089e-05, "loss": 2.7964, "step": 2692 }, { "epoch": 0.45039093531797464, "grad_norm": 8.293279647827148, "learning_rate": 3.3517730391896085e-05, "loss": 3.3369, "step": 2693 }, { "epoch": 0.4505581803737927, "grad_norm": 3.632573366165161, "learning_rate": 3.3504006458393014e-05, "loss": 2.7919, "step": 2694 }, { "epoch": 0.45072542542961075, "grad_norm": 8.549243927001953, "learning_rate": 3.349027962622958e-05, "loss": 2.7411, "step": 2695 }, { "epoch": 0.45089267048542875, "grad_norm": 2.7619152069091797, "learning_rate": 3.3476549900084706e-05, "loss": 2.4363, "step": 2696 }, { "epoch": 0.4510599155412468, "grad_norm": 2.2152225971221924, "learning_rate": 3.346281728463827e-05, "loss": 2.5406, "step": 2697 }, { "epoch": 0.45122716059706486, "grad_norm": 3.7596256732940674, "learning_rate": 3.3449081784571145e-05, "loss": 2.8932, "step": 2698 }, { "epoch": 0.4513944056528829, "grad_norm": 11.80684757232666, "learning_rate": 3.343534340456519e-05, "loss": 3.6043, "step": 2699 }, { "epoch": 0.4515616507087009, "grad_norm": 3.8632924556732178, "learning_rate": 3.3421602149303256e-05, "loss": 2.3281, "step": 2700 }, { "epoch": 0.45172889576451897, "grad_norm": 2.969160318374634, "learning_rate": 3.340785802346915e-05, "loss": 3.0204, "step": 2701 }, { "epoch": 0.451896140820337, "grad_norm": 5.767897129058838, "learning_rate": 3.339411103174768e-05, "loss": 3.4407, "step": 2702 }, { "epoch": 0.452063385876155, "grad_norm": 3.490203380584717, "learning_rate": 3.33803611788246e-05, "loss": 2.9186, "step": 2703 }, { "epoch": 0.4522306309319731, "grad_norm": 5.673913955688477, "learning_rate": 3.33666084693867e-05, "loss": 2.7229, "step": 2704 }, { "epoch": 0.45239787598779113, "grad_norm": 4.26692533493042, "learning_rate": 3.335285290812166e-05, "loss": 2.867, "step": 2705 }, { "epoch": 0.45256512104360913, "grad_norm": 6.061853885650635, "learning_rate": 3.3339094499718205e-05, "loss": 2.7031, "step": 2706 }, { "epoch": 0.4527323660994272, "grad_norm": 2.307478427886963, "learning_rate": 3.3325333248865986e-05, "loss": 2.7047, "step": 2707 }, { "epoch": 0.45289961115524524, "grad_norm": 13.087883949279785, "learning_rate": 3.331156916025564e-05, "loss": 3.7724, "step": 2708 }, { "epoch": 0.45306685621106324, "grad_norm": 3.7967193126678467, "learning_rate": 3.3297802238578793e-05, "loss": 2.6844, "step": 2709 }, { "epoch": 0.4532341012668813, "grad_norm": 3.878228187561035, "learning_rate": 3.3284032488527976e-05, "loss": 2.7358, "step": 2710 }, { "epoch": 0.45340134632269935, "grad_norm": 4.388489246368408, "learning_rate": 3.3270259914796735e-05, "loss": 2.4695, "step": 2711 }, { "epoch": 0.45356859137851735, "grad_norm": 5.168774604797363, "learning_rate": 3.325648452207958e-05, "loss": 2.5203, "step": 2712 }, { "epoch": 0.4537358364343354, "grad_norm": 4.19241189956665, "learning_rate": 3.3242706315071946e-05, "loss": 2.9547, "step": 2713 }, { "epoch": 0.45390308149015346, "grad_norm": 4.796122074127197, "learning_rate": 3.3228925298470265e-05, "loss": 2.8661, "step": 2714 }, { "epoch": 0.45407032654597146, "grad_norm": 12.667119979858398, "learning_rate": 3.321514147697189e-05, "loss": 2.7743, "step": 2715 }, { "epoch": 0.4542375716017895, "grad_norm": 2.8338217735290527, "learning_rate": 3.320135485527517e-05, "loss": 2.7862, "step": 2716 }, { "epoch": 0.45440481665760757, "grad_norm": 7.256214141845703, "learning_rate": 3.318756543807938e-05, "loss": 3.2499, "step": 2717 }, { "epoch": 0.4545720617134256, "grad_norm": 8.770246505737305, "learning_rate": 3.3173773230084754e-05, "loss": 2.9738, "step": 2718 }, { "epoch": 0.4547393067692436, "grad_norm": 2.5629799365997314, "learning_rate": 3.3159978235992474e-05, "loss": 2.6156, "step": 2719 }, { "epoch": 0.4549065518250617, "grad_norm": 4.929149627685547, "learning_rate": 3.314618046050469e-05, "loss": 2.7287, "step": 2720 }, { "epoch": 0.45507379688087973, "grad_norm": 2.8721306324005127, "learning_rate": 3.313237990832447e-05, "loss": 2.7633, "step": 2721 }, { "epoch": 0.45524104193669773, "grad_norm": 6.477436065673828, "learning_rate": 3.311857658415587e-05, "loss": 2.787, "step": 2722 }, { "epoch": 0.4554082869925158, "grad_norm": 2.95931339263916, "learning_rate": 3.3104770492703846e-05, "loss": 2.8245, "step": 2723 }, { "epoch": 0.45557553204833384, "grad_norm": 3.1964051723480225, "learning_rate": 3.309096163867432e-05, "loss": 2.9104, "step": 2724 }, { "epoch": 0.45574277710415184, "grad_norm": 4.1161675453186035, "learning_rate": 3.307715002677416e-05, "loss": 3.2454, "step": 2725 }, { "epoch": 0.4559100221599699, "grad_norm": 8.135499954223633, "learning_rate": 3.3063335661711166e-05, "loss": 2.7266, "step": 2726 }, { "epoch": 0.45607726721578795, "grad_norm": 4.582993984222412, "learning_rate": 3.3049518548194064e-05, "loss": 2.7459, "step": 2727 }, { "epoch": 0.45624451227160595, "grad_norm": 5.232402801513672, "learning_rate": 3.303569869093255e-05, "loss": 2.5234, "step": 2728 }, { "epoch": 0.456411757327424, "grad_norm": 7.122742176055908, "learning_rate": 3.302187609463723e-05, "loss": 3.0397, "step": 2729 }, { "epoch": 0.45657900238324206, "grad_norm": 3.479363441467285, "learning_rate": 3.3008050764019636e-05, "loss": 2.6652, "step": 2730 }, { "epoch": 0.45674624743906006, "grad_norm": 3.56815767288208, "learning_rate": 3.2994222703792266e-05, "loss": 2.389, "step": 2731 }, { "epoch": 0.4569134924948781, "grad_norm": 7.179554462432861, "learning_rate": 3.298039191866851e-05, "loss": 2.9917, "step": 2732 }, { "epoch": 0.45708073755069617, "grad_norm": 4.486080169677734, "learning_rate": 3.296655841336272e-05, "loss": 3.1893, "step": 2733 }, { "epoch": 0.45724798260651417, "grad_norm": 12.062623977661133, "learning_rate": 3.2952722192590146e-05, "loss": 3.3691, "step": 2734 }, { "epoch": 0.4574152276623322, "grad_norm": 5.97243070602417, "learning_rate": 3.293888326106699e-05, "loss": 2.5293, "step": 2735 }, { "epoch": 0.4575824727181503, "grad_norm": 4.957681655883789, "learning_rate": 3.292504162351035e-05, "loss": 2.9527, "step": 2736 }, { "epoch": 0.45774971777396833, "grad_norm": 5.356862545013428, "learning_rate": 3.2911197284638276e-05, "loss": 2.8423, "step": 2737 }, { "epoch": 0.45791696282978633, "grad_norm": 6.198226451873779, "learning_rate": 3.289735024916972e-05, "loss": 2.8046, "step": 2738 }, { "epoch": 0.4580842078856044, "grad_norm": 4.001804351806641, "learning_rate": 3.288350052182456e-05, "loss": 2.5732, "step": 2739 }, { "epoch": 0.45825145294142244, "grad_norm": 7.543381214141846, "learning_rate": 3.286964810732358e-05, "loss": 3.119, "step": 2740 }, { "epoch": 0.45841869799724044, "grad_norm": 10.126092910766602, "learning_rate": 3.285579301038849e-05, "loss": 3.1001, "step": 2741 }, { "epoch": 0.4585859430530585, "grad_norm": 6.301828384399414, "learning_rate": 3.2841935235741924e-05, "loss": 3.2192, "step": 2742 }, { "epoch": 0.45875318810887655, "grad_norm": 6.835843563079834, "learning_rate": 3.2828074788107414e-05, "loss": 2.8109, "step": 2743 }, { "epoch": 0.45892043316469455, "grad_norm": 3.8638525009155273, "learning_rate": 3.2814211672209404e-05, "loss": 2.7949, "step": 2744 }, { "epoch": 0.4590876782205126, "grad_norm": 7.751282691955566, "learning_rate": 3.2800345892773256e-05, "loss": 3.1566, "step": 2745 }, { "epoch": 0.45925492327633066, "grad_norm": 6.28481912612915, "learning_rate": 3.2786477454525224e-05, "loss": 2.703, "step": 2746 }, { "epoch": 0.45942216833214866, "grad_norm": 4.845148086547852, "learning_rate": 3.277260636219249e-05, "loss": 3.209, "step": 2747 }, { "epoch": 0.4595894133879667, "grad_norm": 5.921385288238525, "learning_rate": 3.2758732620503116e-05, "loss": 2.5565, "step": 2748 }, { "epoch": 0.45975665844378477, "grad_norm": 3.29708194732666, "learning_rate": 3.27448562341861e-05, "loss": 2.802, "step": 2749 }, { "epoch": 0.45992390349960277, "grad_norm": 5.106003284454346, "learning_rate": 3.2730977207971314e-05, "loss": 2.9323, "step": 2750 }, { "epoch": 0.4600911485554208, "grad_norm": 5.337779998779297, "learning_rate": 3.271709554658953e-05, "loss": 3.4261, "step": 2751 }, { "epoch": 0.4602583936112389, "grad_norm": 4.607872009277344, "learning_rate": 3.270321125477244e-05, "loss": 3.1571, "step": 2752 }, { "epoch": 0.4604256386670569, "grad_norm": 5.400260925292969, "learning_rate": 3.268932433725261e-05, "loss": 2.6578, "step": 2753 }, { "epoch": 0.46059288372287494, "grad_norm": 2.0116567611694336, "learning_rate": 3.26754347987635e-05, "loss": 2.6039, "step": 2754 }, { "epoch": 0.460760128778693, "grad_norm": 4.469540119171143, "learning_rate": 3.26615426440395e-05, "loss": 2.7849, "step": 2755 }, { "epoch": 0.46092737383451104, "grad_norm": 5.739303112030029, "learning_rate": 3.2647647877815836e-05, "loss": 2.6115, "step": 2756 }, { "epoch": 0.46109461889032904, "grad_norm": 4.847532749176025, "learning_rate": 3.263375050482867e-05, "loss": 2.8272, "step": 2757 }, { "epoch": 0.4612618639461471, "grad_norm": 3.098142147064209, "learning_rate": 3.2619850529815035e-05, "loss": 2.8497, "step": 2758 }, { "epoch": 0.46142910900196515, "grad_norm": 10.274462699890137, "learning_rate": 3.260594795751284e-05, "loss": 2.975, "step": 2759 }, { "epoch": 0.46159635405778315, "grad_norm": 4.443307876586914, "learning_rate": 3.2592042792660903e-05, "loss": 2.7782, "step": 2760 }, { "epoch": 0.4617635991136012, "grad_norm": 9.716309547424316, "learning_rate": 3.257813503999891e-05, "loss": 3.2692, "step": 2761 }, { "epoch": 0.46193084416941926, "grad_norm": 3.0056021213531494, "learning_rate": 3.256422470426743e-05, "loss": 2.5124, "step": 2762 }, { "epoch": 0.46209808922523726, "grad_norm": 2.929952383041382, "learning_rate": 3.255031179020792e-05, "loss": 2.5371, "step": 2763 }, { "epoch": 0.4622653342810553, "grad_norm": 4.332847595214844, "learning_rate": 3.25363963025627e-05, "loss": 2.654, "step": 2764 }, { "epoch": 0.4624325793368734, "grad_norm": 5.5058722496032715, "learning_rate": 3.2522478246074996e-05, "loss": 3.2648, "step": 2765 }, { "epoch": 0.46259982439269137, "grad_norm": 7.081038951873779, "learning_rate": 3.2508557625488876e-05, "loss": 3.0501, "step": 2766 }, { "epoch": 0.4627670694485094, "grad_norm": 3.5014309883117676, "learning_rate": 3.2494634445549303e-05, "loss": 2.5726, "step": 2767 }, { "epoch": 0.4629343145043275, "grad_norm": 3.0770134925842285, "learning_rate": 3.2480708711002116e-05, "loss": 2.7953, "step": 2768 }, { "epoch": 0.4631015595601455, "grad_norm": 6.720861434936523, "learning_rate": 3.246678042659399e-05, "loss": 3.3256, "step": 2769 }, { "epoch": 0.46326880461596354, "grad_norm": 13.001111030578613, "learning_rate": 3.2452849597072524e-05, "loss": 3.1234, "step": 2770 }, { "epoch": 0.4634360496717816, "grad_norm": 4.951101303100586, "learning_rate": 3.2438916227186145e-05, "loss": 2.9005, "step": 2771 }, { "epoch": 0.4636032947275996, "grad_norm": 2.863924503326416, "learning_rate": 3.2424980321684154e-05, "loss": 2.6733, "step": 2772 }, { "epoch": 0.46377053978341765, "grad_norm": 2.4103269577026367, "learning_rate": 3.241104188531671e-05, "loss": 2.7572, "step": 2773 }, { "epoch": 0.4639377848392357, "grad_norm": 7.87831974029541, "learning_rate": 3.239710092283486e-05, "loss": 2.8739, "step": 2774 }, { "epoch": 0.4641050298950537, "grad_norm": 6.763671398162842, "learning_rate": 3.238315743899049e-05, "loss": 2.4852, "step": 2775 }, { "epoch": 0.46427227495087176, "grad_norm": 2.2267444133758545, "learning_rate": 3.236921143853635e-05, "loss": 2.4161, "step": 2776 }, { "epoch": 0.4644395200066898, "grad_norm": 2.8764333724975586, "learning_rate": 3.235526292622604e-05, "loss": 2.7586, "step": 2777 }, { "epoch": 0.46460676506250786, "grad_norm": 12.651801109313965, "learning_rate": 3.2341311906814044e-05, "loss": 2.9155, "step": 2778 }, { "epoch": 0.46477401011832586, "grad_norm": 2.1149837970733643, "learning_rate": 3.232735838505566e-05, "loss": 2.3525, "step": 2779 }, { "epoch": 0.4649412551741439, "grad_norm": 4.674714088439941, "learning_rate": 3.231340236570708e-05, "loss": 2.6303, "step": 2780 }, { "epoch": 0.465108500229962, "grad_norm": 6.797060012817383, "learning_rate": 3.2299443853525315e-05, "loss": 2.5173, "step": 2781 }, { "epoch": 0.46527574528578, "grad_norm": 5.972956657409668, "learning_rate": 3.228548285326825e-05, "loss": 2.8264, "step": 2782 }, { "epoch": 0.46544299034159803, "grad_norm": 7.763673782348633, "learning_rate": 3.227151936969458e-05, "loss": 2.9764, "step": 2783 }, { "epoch": 0.4656102353974161, "grad_norm": 3.5150303840637207, "learning_rate": 3.22575534075639e-05, "loss": 2.5363, "step": 2784 }, { "epoch": 0.4657774804532341, "grad_norm": 4.857624053955078, "learning_rate": 3.224358497163661e-05, "loss": 3.0687, "step": 2785 }, { "epoch": 0.46594472550905214, "grad_norm": 3.755931854248047, "learning_rate": 3.2229614066673974e-05, "loss": 3.2493, "step": 2786 }, { "epoch": 0.4661119705648702, "grad_norm": 2.9021518230438232, "learning_rate": 3.221564069743808e-05, "loss": 3.1517, "step": 2787 }, { "epoch": 0.4662792156206882, "grad_norm": 4.405710220336914, "learning_rate": 3.220166486869188e-05, "loss": 2.7679, "step": 2788 }, { "epoch": 0.46644646067650625, "grad_norm": 3.4416425228118896, "learning_rate": 3.218768658519913e-05, "loss": 3.0691, "step": 2789 }, { "epoch": 0.4666137057323243, "grad_norm": 4.584331512451172, "learning_rate": 3.217370585172445e-05, "loss": 2.8639, "step": 2790 }, { "epoch": 0.4667809507881423, "grad_norm": 6.197233200073242, "learning_rate": 3.215972267303331e-05, "loss": 2.6441, "step": 2791 }, { "epoch": 0.46694819584396036, "grad_norm": 4.2125163078308105, "learning_rate": 3.2145737053891964e-05, "loss": 2.7751, "step": 2792 }, { "epoch": 0.4671154408997784, "grad_norm": 3.816497325897217, "learning_rate": 3.213174899906752e-05, "loss": 2.8974, "step": 2793 }, { "epoch": 0.4672826859555964, "grad_norm": 8.123418807983398, "learning_rate": 3.2117758513327944e-05, "loss": 2.5066, "step": 2794 }, { "epoch": 0.46744993101141447, "grad_norm": 7.1080827713012695, "learning_rate": 3.2103765601442006e-05, "loss": 2.5833, "step": 2795 }, { "epoch": 0.4676171760672325, "grad_norm": 2.936511516571045, "learning_rate": 3.208977026817929e-05, "loss": 2.4613, "step": 2796 }, { "epoch": 0.4677844211230506, "grad_norm": 4.709719657897949, "learning_rate": 3.207577251831022e-05, "loss": 2.8333, "step": 2797 }, { "epoch": 0.4679516661788686, "grad_norm": 2.9223432540893555, "learning_rate": 3.206177235660606e-05, "loss": 2.5437, "step": 2798 }, { "epoch": 0.46811891123468663, "grad_norm": 5.244682312011719, "learning_rate": 3.2047769787838867e-05, "loss": 2.7598, "step": 2799 }, { "epoch": 0.4682861562905047, "grad_norm": 4.149872779846191, "learning_rate": 3.203376481678154e-05, "loss": 3.0568, "step": 2800 }, { "epoch": 0.4684534013463227, "grad_norm": 3.9034907817840576, "learning_rate": 3.201975744820778e-05, "loss": 3.0826, "step": 2801 }, { "epoch": 0.46862064640214074, "grad_norm": 8.490144729614258, "learning_rate": 3.2005747686892105e-05, "loss": 2.9862, "step": 2802 }, { "epoch": 0.4687878914579588, "grad_norm": 8.064421653747559, "learning_rate": 3.199173553760988e-05, "loss": 2.8983, "step": 2803 }, { "epoch": 0.4689551365137768, "grad_norm": 4.405576705932617, "learning_rate": 3.197772100513725e-05, "loss": 3.0033, "step": 2804 }, { "epoch": 0.46912238156959485, "grad_norm": 6.056215286254883, "learning_rate": 3.1963704094251183e-05, "loss": 2.5308, "step": 2805 }, { "epoch": 0.4692896266254129, "grad_norm": 2.3713057041168213, "learning_rate": 3.194968480972945e-05, "loss": 2.99, "step": 2806 }, { "epoch": 0.4694568716812309, "grad_norm": 5.238874435424805, "learning_rate": 3.193566315635066e-05, "loss": 2.6244, "step": 2807 }, { "epoch": 0.46962411673704896, "grad_norm": 4.986017227172852, "learning_rate": 3.192163913889419e-05, "loss": 2.8085, "step": 2808 }, { "epoch": 0.469791361792867, "grad_norm": 9.006490707397461, "learning_rate": 3.190761276214025e-05, "loss": 3.4621, "step": 2809 }, { "epoch": 0.469958606848685, "grad_norm": 6.053554058074951, "learning_rate": 3.189358403086984e-05, "loss": 3.1915, "step": 2810 }, { "epoch": 0.47012585190450307, "grad_norm": 6.36543083190918, "learning_rate": 3.1879552949864774e-05, "loss": 3.0813, "step": 2811 }, { "epoch": 0.4702930969603211, "grad_norm": 4.011750221252441, "learning_rate": 3.186551952390766e-05, "loss": 2.8175, "step": 2812 }, { "epoch": 0.4704603420161391, "grad_norm": 3.367924928665161, "learning_rate": 3.1851483757781904e-05, "loss": 3.0158, "step": 2813 }, { "epoch": 0.4706275870719572, "grad_norm": 4.170313835144043, "learning_rate": 3.1837445656271706e-05, "loss": 2.617, "step": 2814 }, { "epoch": 0.47079483212777523, "grad_norm": 4.323908805847168, "learning_rate": 3.182340522416207e-05, "loss": 2.5015, "step": 2815 }, { "epoch": 0.4709620771835933, "grad_norm": 3.907839775085449, "learning_rate": 3.180936246623881e-05, "loss": 2.8133, "step": 2816 }, { "epoch": 0.4711293222394113, "grad_norm": 2.754929542541504, "learning_rate": 3.17953173872885e-05, "loss": 2.6151, "step": 2817 }, { "epoch": 0.47129656729522934, "grad_norm": 3.292386054992676, "learning_rate": 3.178126999209851e-05, "loss": 2.9042, "step": 2818 }, { "epoch": 0.4714638123510474, "grad_norm": 4.59695291519165, "learning_rate": 3.176722028545703e-05, "loss": 2.8699, "step": 2819 }, { "epoch": 0.4716310574068654, "grad_norm": 3.2638888359069824, "learning_rate": 3.175316827215302e-05, "loss": 2.6898, "step": 2820 }, { "epoch": 0.47179830246268345, "grad_norm": 5.10040283203125, "learning_rate": 3.17391139569762e-05, "loss": 3.1001, "step": 2821 }, { "epoch": 0.4719655475185015, "grad_norm": 4.508291244506836, "learning_rate": 3.172505734471712e-05, "loss": 2.9523, "step": 2822 }, { "epoch": 0.4721327925743195, "grad_norm": 4.6342315673828125, "learning_rate": 3.1710998440167084e-05, "loss": 2.61, "step": 2823 }, { "epoch": 0.47230003763013756, "grad_norm": 3.364240884780884, "learning_rate": 3.169693724811818e-05, "loss": 3.0105, "step": 2824 }, { "epoch": 0.4724672826859556, "grad_norm": 4.053812503814697, "learning_rate": 3.1682873773363294e-05, "loss": 2.9501, "step": 2825 }, { "epoch": 0.4726345277417736, "grad_norm": 6.404336929321289, "learning_rate": 3.1668808020696064e-05, "loss": 3.1763, "step": 2826 }, { "epoch": 0.47280177279759167, "grad_norm": 3.7550208568573, "learning_rate": 3.165473999491092e-05, "loss": 2.6661, "step": 2827 }, { "epoch": 0.4729690178534097, "grad_norm": 3.0362250804901123, "learning_rate": 3.164066970080307e-05, "loss": 2.7923, "step": 2828 }, { "epoch": 0.4731362629092277, "grad_norm": 6.343136310577393, "learning_rate": 3.162659714316848e-05, "loss": 3.2019, "step": 2829 }, { "epoch": 0.4733035079650458, "grad_norm": 5.305064678192139, "learning_rate": 3.161252232680391e-05, "loss": 2.7441, "step": 2830 }, { "epoch": 0.47347075302086383, "grad_norm": 2.8165738582611084, "learning_rate": 3.159844525650686e-05, "loss": 2.8164, "step": 2831 }, { "epoch": 0.47363799807668183, "grad_norm": 3.1505234241485596, "learning_rate": 3.1584365937075626e-05, "loss": 2.5419, "step": 2832 }, { "epoch": 0.4738052431324999, "grad_norm": 2.4925551414489746, "learning_rate": 3.1570284373309254e-05, "loss": 2.5724, "step": 2833 }, { "epoch": 0.47397248818831794, "grad_norm": 3.9852311611175537, "learning_rate": 3.1556200570007564e-05, "loss": 2.6035, "step": 2834 }, { "epoch": 0.474139733244136, "grad_norm": 6.9373064041137695, "learning_rate": 3.154211453197114e-05, "loss": 3.0997, "step": 2835 }, { "epoch": 0.474306978299954, "grad_norm": 5.482633113861084, "learning_rate": 3.152802626400131e-05, "loss": 3.2723, "step": 2836 }, { "epoch": 0.47447422335577205, "grad_norm": 7.416325092315674, "learning_rate": 3.1513935770900195e-05, "loss": 2.5772, "step": 2837 }, { "epoch": 0.4746414684115901, "grad_norm": 3.641672134399414, "learning_rate": 3.1499843057470634e-05, "loss": 2.7685, "step": 2838 }, { "epoch": 0.4748087134674081, "grad_norm": 3.2854089736938477, "learning_rate": 3.148574812851626e-05, "loss": 2.7111, "step": 2839 }, { "epoch": 0.47497595852322616, "grad_norm": 4.720738887786865, "learning_rate": 3.147165098884144e-05, "loss": 2.5788, "step": 2840 }, { "epoch": 0.4751432035790442, "grad_norm": 3.119321584701538, "learning_rate": 3.1457551643251295e-05, "loss": 2.8935, "step": 2841 }, { "epoch": 0.4753104486348622, "grad_norm": 4.756255149841309, "learning_rate": 3.1443450096551715e-05, "loss": 2.8661, "step": 2842 }, { "epoch": 0.47547769369068027, "grad_norm": 3.529388189315796, "learning_rate": 3.14293463535493e-05, "loss": 2.6499, "step": 2843 }, { "epoch": 0.4756449387464983, "grad_norm": 2.4454550743103027, "learning_rate": 3.1415240419051465e-05, "loss": 2.9424, "step": 2844 }, { "epoch": 0.4758121838023163, "grad_norm": 4.676028728485107, "learning_rate": 3.1401132297866306e-05, "loss": 2.8652, "step": 2845 }, { "epoch": 0.4759794288581344, "grad_norm": 4.985014915466309, "learning_rate": 3.138702199480271e-05, "loss": 2.901, "step": 2846 }, { "epoch": 0.47614667391395243, "grad_norm": 9.370128631591797, "learning_rate": 3.137290951467027e-05, "loss": 3.362, "step": 2847 }, { "epoch": 0.47631391896977043, "grad_norm": 3.1018097400665283, "learning_rate": 3.135879486227935e-05, "loss": 2.649, "step": 2848 }, { "epoch": 0.4764811640255885, "grad_norm": 7.778246879577637, "learning_rate": 3.134467804244106e-05, "loss": 3.6368, "step": 2849 }, { "epoch": 0.47664840908140654, "grad_norm": 4.368343830108643, "learning_rate": 3.1330559059967204e-05, "loss": 2.8109, "step": 2850 }, { "epoch": 0.47681565413722454, "grad_norm": 5.38142728805542, "learning_rate": 3.1316437919670364e-05, "loss": 3.0692, "step": 2851 }, { "epoch": 0.4769828991930426, "grad_norm": 6.235647678375244, "learning_rate": 3.130231462636387e-05, "loss": 2.754, "step": 2852 }, { "epoch": 0.47715014424886065, "grad_norm": 5.700519561767578, "learning_rate": 3.128818918486173e-05, "loss": 2.774, "step": 2853 }, { "epoch": 0.4773173893046787, "grad_norm": 2.243556022644043, "learning_rate": 3.127406159997873e-05, "loss": 2.4289, "step": 2854 }, { "epoch": 0.4774846343604967, "grad_norm": 3.130079984664917, "learning_rate": 3.125993187653038e-05, "loss": 2.6167, "step": 2855 }, { "epoch": 0.47765187941631476, "grad_norm": 3.8546228408813477, "learning_rate": 3.12458000193329e-05, "loss": 2.931, "step": 2856 }, { "epoch": 0.4778191244721328, "grad_norm": 7.628808975219727, "learning_rate": 3.123166603320325e-05, "loss": 2.8143, "step": 2857 }, { "epoch": 0.4779863695279508, "grad_norm": 6.238963603973389, "learning_rate": 3.1217529922959134e-05, "loss": 2.5515, "step": 2858 }, { "epoch": 0.47815361458376887, "grad_norm": 3.150146722793579, "learning_rate": 3.120339169341893e-05, "loss": 2.5778, "step": 2859 }, { "epoch": 0.4783208596395869, "grad_norm": 3.9962542057037354, "learning_rate": 3.11892513494018e-05, "loss": 2.8864, "step": 2860 }, { "epoch": 0.4784881046954049, "grad_norm": 5.051580905914307, "learning_rate": 3.117510889572758e-05, "loss": 2.7854, "step": 2861 }, { "epoch": 0.478655349751223, "grad_norm": 6.348480701446533, "learning_rate": 3.116096433721684e-05, "loss": 2.8052, "step": 2862 }, { "epoch": 0.47882259480704104, "grad_norm": 12.705005645751953, "learning_rate": 3.1146817678690866e-05, "loss": 3.8771, "step": 2863 }, { "epoch": 0.47898983986285903, "grad_norm": 4.30927038192749, "learning_rate": 3.113266892497168e-05, "loss": 2.4762, "step": 2864 }, { "epoch": 0.4791570849186771, "grad_norm": 4.179882049560547, "learning_rate": 3.111851808088199e-05, "loss": 2.887, "step": 2865 }, { "epoch": 0.47932432997449514, "grad_norm": 3.115438461303711, "learning_rate": 3.1104365151245225e-05, "loss": 2.7422, "step": 2866 }, { "epoch": 0.47949157503031314, "grad_norm": 8.371920585632324, "learning_rate": 3.109021014088553e-05, "loss": 3.4295, "step": 2867 }, { "epoch": 0.4796588200861312, "grad_norm": 4.809936046600342, "learning_rate": 3.1076053054627754e-05, "loss": 2.9277, "step": 2868 }, { "epoch": 0.47982606514194925, "grad_norm": 6.0667009353637695, "learning_rate": 3.106189389729746e-05, "loss": 2.4827, "step": 2869 }, { "epoch": 0.47999331019776725, "grad_norm": 3.8321642875671387, "learning_rate": 3.104773267372091e-05, "loss": 2.7627, "step": 2870 }, { "epoch": 0.4801605552535853, "grad_norm": 3.308108329772949, "learning_rate": 3.1033569388725065e-05, "loss": 2.5654, "step": 2871 }, { "epoch": 0.48032780030940336, "grad_norm": 4.020956039428711, "learning_rate": 3.1019404047137615e-05, "loss": 2.4764, "step": 2872 }, { "epoch": 0.4804950453652214, "grad_norm": 4.314621448516846, "learning_rate": 3.1005236653786925e-05, "loss": 2.6762, "step": 2873 }, { "epoch": 0.4806622904210394, "grad_norm": 6.256698131561279, "learning_rate": 3.099106721350206e-05, "loss": 2.795, "step": 2874 }, { "epoch": 0.4808295354768575, "grad_norm": 7.049123764038086, "learning_rate": 3.09768957311128e-05, "loss": 3.221, "step": 2875 }, { "epoch": 0.4809967805326755, "grad_norm": 5.59953498840332, "learning_rate": 3.09627222114496e-05, "loss": 2.8257, "step": 2876 }, { "epoch": 0.4811640255884935, "grad_norm": 3.2941572666168213, "learning_rate": 3.0948546659343636e-05, "loss": 2.7456, "step": 2877 }, { "epoch": 0.4813312706443116, "grad_norm": 2.5855796337127686, "learning_rate": 3.093436907962674e-05, "loss": 2.5982, "step": 2878 }, { "epoch": 0.48149851570012964, "grad_norm": 6.35115909576416, "learning_rate": 3.092018947713149e-05, "loss": 3.2413, "step": 2879 }, { "epoch": 0.48166576075594764, "grad_norm": 4.622556209564209, "learning_rate": 3.090600785669108e-05, "loss": 2.6609, "step": 2880 }, { "epoch": 0.4818330058117657, "grad_norm": 3.220813512802124, "learning_rate": 3.089182422313945e-05, "loss": 2.9845, "step": 2881 }, { "epoch": 0.48200025086758375, "grad_norm": 5.295070648193359, "learning_rate": 3.087763858131122e-05, "loss": 2.6752, "step": 2882 }, { "epoch": 0.48216749592340175, "grad_norm": 4.779241561889648, "learning_rate": 3.0863450936041666e-05, "loss": 2.4447, "step": 2883 }, { "epoch": 0.4823347409792198, "grad_norm": 4.897919654846191, "learning_rate": 3.084926129216677e-05, "loss": 2.7474, "step": 2884 }, { "epoch": 0.48250198603503786, "grad_norm": 4.113572120666504, "learning_rate": 3.083506965452318e-05, "loss": 2.6674, "step": 2885 }, { "epoch": 0.48266923109085585, "grad_norm": 4.127908229827881, "learning_rate": 3.082087602794825e-05, "loss": 2.9027, "step": 2886 }, { "epoch": 0.4828364761466739, "grad_norm": 4.786901950836182, "learning_rate": 3.080668041727997e-05, "loss": 2.6667, "step": 2887 }, { "epoch": 0.48300372120249196, "grad_norm": 4.103257179260254, "learning_rate": 3.079248282735704e-05, "loss": 2.8722, "step": 2888 }, { "epoch": 0.48317096625830996, "grad_norm": 6.329278469085693, "learning_rate": 3.077828326301884e-05, "loss": 3.0906, "step": 2889 }, { "epoch": 0.483338211314128, "grad_norm": 4.548722743988037, "learning_rate": 3.076408172910539e-05, "loss": 2.924, "step": 2890 }, { "epoch": 0.4835054563699461, "grad_norm": 8.18855094909668, "learning_rate": 3.07498782304574e-05, "loss": 2.9989, "step": 2891 }, { "epoch": 0.4836727014257641, "grad_norm": 4.329102039337158, "learning_rate": 3.073567277191626e-05, "loss": 2.4857, "step": 2892 }, { "epoch": 0.48383994648158213, "grad_norm": 3.2363882064819336, "learning_rate": 3.072146535832401e-05, "loss": 3.0065, "step": 2893 }, { "epoch": 0.4840071915374002, "grad_norm": 4.676276206970215, "learning_rate": 3.070725599452336e-05, "loss": 2.8087, "step": 2894 }, { "epoch": 0.48417443659321824, "grad_norm": 5.57070779800415, "learning_rate": 3.0693044685357694e-05, "loss": 3.1695, "step": 2895 }, { "epoch": 0.48434168164903624, "grad_norm": 5.369676113128662, "learning_rate": 3.067883143567106e-05, "loss": 2.7019, "step": 2896 }, { "epoch": 0.4845089267048543, "grad_norm": 3.7709197998046875, "learning_rate": 3.066461625030815e-05, "loss": 2.7322, "step": 2897 }, { "epoch": 0.48467617176067235, "grad_norm": 3.2773935794830322, "learning_rate": 3.065039913411432e-05, "loss": 2.7822, "step": 2898 }, { "epoch": 0.48484341681649035, "grad_norm": 4.051985740661621, "learning_rate": 3.063618009193561e-05, "loss": 3.0232, "step": 2899 }, { "epoch": 0.4850106618723084, "grad_norm": 6.929952621459961, "learning_rate": 3.0621959128618674e-05, "loss": 2.7054, "step": 2900 }, { "epoch": 0.48517790692812646, "grad_norm": 6.956687927246094, "learning_rate": 3.060773624901087e-05, "loss": 2.8456, "step": 2901 }, { "epoch": 0.48534515198394446, "grad_norm": 5.420561790466309, "learning_rate": 3.059351145796017e-05, "loss": 3.0825, "step": 2902 }, { "epoch": 0.4855123970397625, "grad_norm": 6.840163707733154, "learning_rate": 3.057928476031521e-05, "loss": 3.1479, "step": 2903 }, { "epoch": 0.48567964209558057, "grad_norm": 5.7858381271362305, "learning_rate": 3.0565056160925265e-05, "loss": 2.4797, "step": 2904 }, { "epoch": 0.48584688715139857, "grad_norm": 4.9681196212768555, "learning_rate": 3.055082566464029e-05, "loss": 2.7486, "step": 2905 }, { "epoch": 0.4860141322072166, "grad_norm": 4.383670330047607, "learning_rate": 3.0536593276310864e-05, "loss": 3.2245, "step": 2906 }, { "epoch": 0.4861813772630347, "grad_norm": 6.76078987121582, "learning_rate": 3.05223590007882e-05, "loss": 2.9403, "step": 2907 }, { "epoch": 0.4863486223188527, "grad_norm": 5.517353534698486, "learning_rate": 3.050812284292418e-05, "loss": 3.2287, "step": 2908 }, { "epoch": 0.48651586737467073, "grad_norm": 1.9948031902313232, "learning_rate": 3.049388480757131e-05, "loss": 2.8284, "step": 2909 }, { "epoch": 0.4866831124304888, "grad_norm": 5.341765403747559, "learning_rate": 3.047964489958274e-05, "loss": 2.801, "step": 2910 }, { "epoch": 0.4868503574863068, "grad_norm": 5.3349833488464355, "learning_rate": 3.046540312381227e-05, "loss": 2.9843, "step": 2911 }, { "epoch": 0.48701760254212484, "grad_norm": 3.111042022705078, "learning_rate": 3.045115948511431e-05, "loss": 2.6683, "step": 2912 }, { "epoch": 0.4871848475979429, "grad_norm": 2.858727216720581, "learning_rate": 3.043691398834393e-05, "loss": 2.9035, "step": 2913 }, { "epoch": 0.48735209265376095, "grad_norm": 5.601384162902832, "learning_rate": 3.0422666638356822e-05, "loss": 3.1432, "step": 2914 }, { "epoch": 0.48751933770957895, "grad_norm": 4.305600643157959, "learning_rate": 3.0408417440009318e-05, "loss": 3.1568, "step": 2915 }, { "epoch": 0.487686582765397, "grad_norm": 3.455940008163452, "learning_rate": 3.0394166398158362e-05, "loss": 2.5974, "step": 2916 }, { "epoch": 0.48785382782121506, "grad_norm": 3.042714834213257, "learning_rate": 3.0379913517661546e-05, "loss": 2.6921, "step": 2917 }, { "epoch": 0.48802107287703306, "grad_norm": 3.4122445583343506, "learning_rate": 3.0365658803377077e-05, "loss": 2.8627, "step": 2918 }, { "epoch": 0.4881883179328511, "grad_norm": 3.865459680557251, "learning_rate": 3.0351402260163802e-05, "loss": 2.5885, "step": 2919 }, { "epoch": 0.48835556298866917, "grad_norm": 3.228307008743286, "learning_rate": 3.0337143892881176e-05, "loss": 2.578, "step": 2920 }, { "epoch": 0.48852280804448717, "grad_norm": 5.086442947387695, "learning_rate": 3.032288370638927e-05, "loss": 2.9978, "step": 2921 }, { "epoch": 0.4886900531003052, "grad_norm": 7.3732008934021, "learning_rate": 3.03086217055488e-05, "loss": 2.9786, "step": 2922 }, { "epoch": 0.4888572981561233, "grad_norm": 3.0388846397399902, "learning_rate": 3.0294357895221077e-05, "loss": 2.478, "step": 2923 }, { "epoch": 0.4890245432119413, "grad_norm": 3.206815004348755, "learning_rate": 3.028009228026804e-05, "loss": 2.4787, "step": 2924 }, { "epoch": 0.48919178826775933, "grad_norm": 8.370271682739258, "learning_rate": 3.0265824865552245e-05, "loss": 3.4288, "step": 2925 }, { "epoch": 0.4893590333235774, "grad_norm": 6.550251483917236, "learning_rate": 3.0251555655936853e-05, "loss": 2.6164, "step": 2926 }, { "epoch": 0.4895262783793954, "grad_norm": 4.159440517425537, "learning_rate": 3.0237284656285643e-05, "loss": 3.1376, "step": 2927 }, { "epoch": 0.48969352343521344, "grad_norm": 5.840036869049072, "learning_rate": 3.0223011871462992e-05, "loss": 2.904, "step": 2928 }, { "epoch": 0.4898607684910315, "grad_norm": 8.113520622253418, "learning_rate": 3.0208737306333906e-05, "loss": 2.9155, "step": 2929 }, { "epoch": 0.4900280135468495, "grad_norm": 6.806482791900635, "learning_rate": 3.0194460965763992e-05, "loss": 2.8597, "step": 2930 }, { "epoch": 0.49019525860266755, "grad_norm": 2.7418181896209717, "learning_rate": 3.0180182854619456e-05, "loss": 2.2571, "step": 2931 }, { "epoch": 0.4903625036584856, "grad_norm": 2.0961685180664062, "learning_rate": 3.0165902977767103e-05, "loss": 2.7337, "step": 2932 }, { "epoch": 0.49052974871430366, "grad_norm": 2.9251649379730225, "learning_rate": 3.015162134007434e-05, "loss": 2.5714, "step": 2933 }, { "epoch": 0.49069699377012166, "grad_norm": 4.168354034423828, "learning_rate": 3.0137337946409195e-05, "loss": 2.9413, "step": 2934 }, { "epoch": 0.4908642388259397, "grad_norm": 3.8439807891845703, "learning_rate": 3.0123052801640274e-05, "loss": 2.7809, "step": 2935 }, { "epoch": 0.49103148388175777, "grad_norm": 4.512439250946045, "learning_rate": 3.0108765910636786e-05, "loss": 2.8314, "step": 2936 }, { "epoch": 0.49119872893757577, "grad_norm": 5.66618013381958, "learning_rate": 3.009447727826853e-05, "loss": 2.7935, "step": 2937 }, { "epoch": 0.4913659739933938, "grad_norm": 3.7202208042144775, "learning_rate": 3.0080186909405918e-05, "loss": 2.5768, "step": 2938 }, { "epoch": 0.4915332190492119, "grad_norm": 7.673431396484375, "learning_rate": 3.006589480891992e-05, "loss": 2.6112, "step": 2939 }, { "epoch": 0.4917004641050299, "grad_norm": 3.586440324783325, "learning_rate": 3.005160098168213e-05, "loss": 2.5086, "step": 2940 }, { "epoch": 0.49186770916084793, "grad_norm": 2.6718974113464355, "learning_rate": 3.003730543256471e-05, "loss": 2.4252, "step": 2941 }, { "epoch": 0.492034954216666, "grad_norm": 6.641931533813477, "learning_rate": 3.0023008166440425e-05, "loss": 3.3909, "step": 2942 }, { "epoch": 0.492202199272484, "grad_norm": 4.574799537658691, "learning_rate": 3.00087091881826e-05, "loss": 2.3062, "step": 2943 }, { "epoch": 0.49236944432830204, "grad_norm": 3.18830943107605, "learning_rate": 2.9994408502665167e-05, "loss": 2.9062, "step": 2944 }, { "epoch": 0.4925366893841201, "grad_norm": 5.093817234039307, "learning_rate": 2.9980106114762625e-05, "loss": 3.0454, "step": 2945 }, { "epoch": 0.4927039344399381, "grad_norm": 4.089663982391357, "learning_rate": 2.996580202935007e-05, "loss": 2.98, "step": 2946 }, { "epoch": 0.49287117949575615, "grad_norm": 6.609007358551025, "learning_rate": 2.9951496251303168e-05, "loss": 3.0865, "step": 2947 }, { "epoch": 0.4930384245515742, "grad_norm": 4.200178146362305, "learning_rate": 2.993718878549815e-05, "loss": 2.6006, "step": 2948 }, { "epoch": 0.4932056696073922, "grad_norm": 11.421857833862305, "learning_rate": 2.9922879636811842e-05, "loss": 3.2849, "step": 2949 }, { "epoch": 0.49337291466321026, "grad_norm": 6.589548587799072, "learning_rate": 2.990856881012163e-05, "loss": 3.0534, "step": 2950 }, { "epoch": 0.4935401597190283, "grad_norm": 3.698119878768921, "learning_rate": 2.989425631030548e-05, "loss": 2.7539, "step": 2951 }, { "epoch": 0.49370740477484637, "grad_norm": 3.2065646648406982, "learning_rate": 2.9879942142241924e-05, "loss": 2.6596, "step": 2952 }, { "epoch": 0.49387464983066437, "grad_norm": 4.993155002593994, "learning_rate": 2.9865626310810058e-05, "loss": 3.0934, "step": 2953 }, { "epoch": 0.4940418948864824, "grad_norm": 9.62667179107666, "learning_rate": 2.9851308820889562e-05, "loss": 2.8889, "step": 2954 }, { "epoch": 0.4942091399423005, "grad_norm": 3.4661333560943604, "learning_rate": 2.983698967736066e-05, "loss": 2.3305, "step": 2955 }, { "epoch": 0.4943763849981185, "grad_norm": 6.611898422241211, "learning_rate": 2.9822668885104155e-05, "loss": 2.9654, "step": 2956 }, { "epoch": 0.49454363005393653, "grad_norm": 5.334603309631348, "learning_rate": 2.9808346449001408e-05, "loss": 2.5514, "step": 2957 }, { "epoch": 0.4947108751097546, "grad_norm": 3.117326021194458, "learning_rate": 2.9794022373934323e-05, "loss": 2.4971, "step": 2958 }, { "epoch": 0.4948781201655726, "grad_norm": 2.111342668533325, "learning_rate": 2.9779696664785394e-05, "loss": 2.5392, "step": 2959 }, { "epoch": 0.49504536522139064, "grad_norm": 2.8716301918029785, "learning_rate": 2.9765369326437658e-05, "loss": 2.8484, "step": 2960 }, { "epoch": 0.4952126102772087, "grad_norm": 4.1390485763549805, "learning_rate": 2.97510403637747e-05, "loss": 2.6349, "step": 2961 }, { "epoch": 0.4953798553330267, "grad_norm": 4.57918643951416, "learning_rate": 2.9736709781680666e-05, "loss": 2.6267, "step": 2962 }, { "epoch": 0.49554710038884475, "grad_norm": 12.460389137268066, "learning_rate": 2.972237758504025e-05, "loss": 3.6052, "step": 2963 }, { "epoch": 0.4957143454446628, "grad_norm": 5.580864906311035, "learning_rate": 2.9708043778738702e-05, "loss": 3.0877, "step": 2964 }, { "epoch": 0.4958815905004808, "grad_norm": 3.570781707763672, "learning_rate": 2.9693708367661815e-05, "loss": 3.132, "step": 2965 }, { "epoch": 0.49604883555629886, "grad_norm": 5.525310039520264, "learning_rate": 2.9679371356695928e-05, "loss": 2.4694, "step": 2966 }, { "epoch": 0.4962160806121169, "grad_norm": 3.857578754425049, "learning_rate": 2.966503275072794e-05, "loss": 2.6582, "step": 2967 }, { "epoch": 0.4963833256679349, "grad_norm": 3.641406297683716, "learning_rate": 2.9650692554645277e-05, "loss": 2.7703, "step": 2968 }, { "epoch": 0.49655057072375297, "grad_norm": 3.722712516784668, "learning_rate": 2.9636350773335907e-05, "loss": 2.8936, "step": 2969 }, { "epoch": 0.496717815779571, "grad_norm": 3.2751271724700928, "learning_rate": 2.962200741168835e-05, "loss": 2.8589, "step": 2970 }, { "epoch": 0.4968850608353891, "grad_norm": 9.429491996765137, "learning_rate": 2.9607662474591656e-05, "loss": 2.7624, "step": 2971 }, { "epoch": 0.4970523058912071, "grad_norm": 5.190647125244141, "learning_rate": 2.959331596693542e-05, "loss": 2.913, "step": 2972 }, { "epoch": 0.49721955094702514, "grad_norm": 2.9054126739501953, "learning_rate": 2.9578967893609753e-05, "loss": 2.731, "step": 2973 }, { "epoch": 0.4973867960028432, "grad_norm": 3.118767023086548, "learning_rate": 2.9564618259505322e-05, "loss": 2.6657, "step": 2974 }, { "epoch": 0.4975540410586612, "grad_norm": 4.583472728729248, "learning_rate": 2.955026706951332e-05, "loss": 2.8436, "step": 2975 }, { "epoch": 0.49772128611447924, "grad_norm": 6.25074577331543, "learning_rate": 2.953591432852547e-05, "loss": 3.2137, "step": 2976 }, { "epoch": 0.4978885311702973, "grad_norm": 7.021403789520264, "learning_rate": 2.9521560041434015e-05, "loss": 2.8799, "step": 2977 }, { "epoch": 0.4980557762261153, "grad_norm": 4.8616766929626465, "learning_rate": 2.950720421313173e-05, "loss": 2.8461, "step": 2978 }, { "epoch": 0.49822302128193335, "grad_norm": 3.8536550998687744, "learning_rate": 2.9492846848511917e-05, "loss": 2.6433, "step": 2979 }, { "epoch": 0.4983902663377514, "grad_norm": 4.833449363708496, "learning_rate": 2.9478487952468414e-05, "loss": 3.0755, "step": 2980 }, { "epoch": 0.4985575113935694, "grad_norm": 4.645150184631348, "learning_rate": 2.9464127529895557e-05, "loss": 2.7202, "step": 2981 }, { "epoch": 0.49872475644938746, "grad_norm": 3.5133090019226074, "learning_rate": 2.9449765585688217e-05, "loss": 2.603, "step": 2982 }, { "epoch": 0.4988920015052055, "grad_norm": 4.312957286834717, "learning_rate": 2.9435402124741768e-05, "loss": 3.104, "step": 2983 }, { "epoch": 0.4990592465610235, "grad_norm": 3.944045066833496, "learning_rate": 2.942103715195214e-05, "loss": 3.3267, "step": 2984 }, { "epoch": 0.4992264916168416, "grad_norm": 6.22388219833374, "learning_rate": 2.9406670672215737e-05, "loss": 3.3265, "step": 2985 }, { "epoch": 0.4993937366726596, "grad_norm": 5.284175395965576, "learning_rate": 2.9392302690429486e-05, "loss": 2.9045, "step": 2986 }, { "epoch": 0.4995609817284776, "grad_norm": 4.224052429199219, "learning_rate": 2.9377933211490827e-05, "loss": 2.5326, "step": 2987 }, { "epoch": 0.4997282267842957, "grad_norm": 4.3119707107543945, "learning_rate": 2.936356224029774e-05, "loss": 2.9178, "step": 2988 }, { "epoch": 0.49989547184011374, "grad_norm": 11.012482643127441, "learning_rate": 2.9349189781748666e-05, "loss": 3.1087, "step": 2989 }, { "epoch": 0.5000627168959317, "grad_norm": 9.313746452331543, "learning_rate": 2.933481584074258e-05, "loss": 3.0293, "step": 2990 }, { "epoch": 0.5002299619517498, "grad_norm": 10.70989990234375, "learning_rate": 2.932044042217896e-05, "loss": 3.1095, "step": 2991 }, { "epoch": 0.5003972070075678, "grad_norm": 5.845628261566162, "learning_rate": 2.9306063530957788e-05, "loss": 2.7187, "step": 2992 }, { "epoch": 0.5005644520633858, "grad_norm": 2.7106850147247314, "learning_rate": 2.9291685171979545e-05, "loss": 2.6471, "step": 2993 }, { "epoch": 0.500731697119204, "grad_norm": 2.8527653217315674, "learning_rate": 2.9277305350145208e-05, "loss": 2.6291, "step": 2994 }, { "epoch": 0.500898942175022, "grad_norm": 2.8746912479400635, "learning_rate": 2.926292407035625e-05, "loss": 2.5109, "step": 2995 }, { "epoch": 0.50106618723084, "grad_norm": 3.8542237281799316, "learning_rate": 2.9248541337514672e-05, "loss": 2.2888, "step": 2996 }, { "epoch": 0.5012334322866581, "grad_norm": 2.754331588745117, "learning_rate": 2.9234157156522928e-05, "loss": 2.5718, "step": 2997 }, { "epoch": 0.5014006773424761, "grad_norm": 5.4707159996032715, "learning_rate": 2.9219771532283986e-05, "loss": 2.717, "step": 2998 }, { "epoch": 0.5015679223982941, "grad_norm": 5.78064489364624, "learning_rate": 2.9205384469701307e-05, "loss": 3.3015, "step": 2999 }, { "epoch": 0.5017351674541122, "grad_norm": 4.332867622375488, "learning_rate": 2.9190995973678842e-05, "loss": 2.8923, "step": 3000 }, { "epoch": 0.5019024125099302, "grad_norm": 5.872550964355469, "learning_rate": 2.917660604912102e-05, "loss": 3.0967, "step": 3001 }, { "epoch": 0.5020696575657482, "grad_norm": 4.833941459655762, "learning_rate": 2.9162214700932773e-05, "loss": 3.0929, "step": 3002 }, { "epoch": 0.5022369026215663, "grad_norm": 2.5433645248413086, "learning_rate": 2.9147821934019497e-05, "loss": 2.7785, "step": 3003 }, { "epoch": 0.5024041476773843, "grad_norm": 4.793759822845459, "learning_rate": 2.91334277532871e-05, "loss": 2.7366, "step": 3004 }, { "epoch": 0.5025713927332023, "grad_norm": 5.333278656005859, "learning_rate": 2.9119032163641952e-05, "loss": 2.8156, "step": 3005 }, { "epoch": 0.5027386377890204, "grad_norm": 6.659074783325195, "learning_rate": 2.91046351699909e-05, "loss": 2.5677, "step": 3006 }, { "epoch": 0.5029058828448384, "grad_norm": 7.13813591003418, "learning_rate": 2.9090236777241288e-05, "loss": 2.772, "step": 3007 }, { "epoch": 0.5030731279006564, "grad_norm": 3.258162498474121, "learning_rate": 2.907583699030092e-05, "loss": 2.4727, "step": 3008 }, { "epoch": 0.5032403729564745, "grad_norm": 4.631686687469482, "learning_rate": 2.9061435814078085e-05, "loss": 2.7215, "step": 3009 }, { "epoch": 0.5034076180122925, "grad_norm": 1.803758144378662, "learning_rate": 2.9047033253481547e-05, "loss": 2.3468, "step": 3010 }, { "epoch": 0.5035748630681105, "grad_norm": 3.6274619102478027, "learning_rate": 2.9032629313420517e-05, "loss": 3.0198, "step": 3011 }, { "epoch": 0.5037421081239286, "grad_norm": 5.404469013214111, "learning_rate": 2.9018223998804716e-05, "loss": 2.9607, "step": 3012 }, { "epoch": 0.5039093531797466, "grad_norm": 4.318949222564697, "learning_rate": 2.9003817314544308e-05, "loss": 2.7944, "step": 3013 }, { "epoch": 0.5040765982355646, "grad_norm": 4.7319512367248535, "learning_rate": 2.8989409265549937e-05, "loss": 2.9125, "step": 3014 }, { "epoch": 0.5042438432913827, "grad_norm": 5.017335414886475, "learning_rate": 2.897499985673268e-05, "loss": 2.9092, "step": 3015 }, { "epoch": 0.5044110883472007, "grad_norm": 3.1331701278686523, "learning_rate": 2.8960589093004125e-05, "loss": 2.6435, "step": 3016 }, { "epoch": 0.5045783334030187, "grad_norm": 4.017138481140137, "learning_rate": 2.8946176979276297e-05, "loss": 2.6545, "step": 3017 }, { "epoch": 0.5047455784588368, "grad_norm": 11.56563663482666, "learning_rate": 2.8931763520461673e-05, "loss": 3.0817, "step": 3018 }, { "epoch": 0.5049128235146548, "grad_norm": 5.716440677642822, "learning_rate": 2.8917348721473208e-05, "loss": 2.7538, "step": 3019 }, { "epoch": 0.5050800685704729, "grad_norm": 5.439677715301514, "learning_rate": 2.890293258722429e-05, "loss": 2.727, "step": 3020 }, { "epoch": 0.5052473136262909, "grad_norm": 6.792258262634277, "learning_rate": 2.88885151226288e-05, "loss": 2.7847, "step": 3021 }, { "epoch": 0.5054145586821089, "grad_norm": 3.9173264503479004, "learning_rate": 2.887409633260103e-05, "loss": 2.9904, "step": 3022 }, { "epoch": 0.505581803737927, "grad_norm": 2.936354398727417, "learning_rate": 2.8859676222055754e-05, "loss": 2.625, "step": 3023 }, { "epoch": 0.505749048793745, "grad_norm": 2.8494503498077393, "learning_rate": 2.8845254795908174e-05, "loss": 2.5467, "step": 3024 }, { "epoch": 0.505916293849563, "grad_norm": 3.802537679672241, "learning_rate": 2.8830832059073958e-05, "loss": 2.4582, "step": 3025 }, { "epoch": 0.5060835389053812, "grad_norm": 3.6083688735961914, "learning_rate": 2.881640801646922e-05, "loss": 3.0173, "step": 3026 }, { "epoch": 0.5062507839611992, "grad_norm": 3.6214940547943115, "learning_rate": 2.8801982673010503e-05, "loss": 2.7775, "step": 3027 }, { "epoch": 0.5064180290170172, "grad_norm": 4.8553009033203125, "learning_rate": 2.878755603361481e-05, "loss": 2.9244, "step": 3028 }, { "epoch": 0.5065852740728353, "grad_norm": 6.653963088989258, "learning_rate": 2.8773128103199574e-05, "loss": 3.1392, "step": 3029 }, { "epoch": 0.5067525191286533, "grad_norm": 4.018600940704346, "learning_rate": 2.875869888668268e-05, "loss": 2.6298, "step": 3030 }, { "epoch": 0.5069197641844713, "grad_norm": 4.179348945617676, "learning_rate": 2.8744268388982437e-05, "loss": 2.4856, "step": 3031 }, { "epoch": 0.5070870092402894, "grad_norm": 5.363978862762451, "learning_rate": 2.87298366150176e-05, "loss": 2.8571, "step": 3032 }, { "epoch": 0.5072542542961074, "grad_norm": 13.268026351928711, "learning_rate": 2.8715403569707365e-05, "loss": 3.0312, "step": 3033 }, { "epoch": 0.5074214993519254, "grad_norm": 8.973628997802734, "learning_rate": 2.870096925797135e-05, "loss": 3.0055, "step": 3034 }, { "epoch": 0.5075887444077435, "grad_norm": 14.020593643188477, "learning_rate": 2.8686533684729604e-05, "loss": 3.2497, "step": 3035 }, { "epoch": 0.5077559894635615, "grad_norm": 2.224900722503662, "learning_rate": 2.8672096854902614e-05, "loss": 2.5035, "step": 3036 }, { "epoch": 0.5079232345193795, "grad_norm": 9.725306510925293, "learning_rate": 2.8657658773411293e-05, "loss": 3.1381, "step": 3037 }, { "epoch": 0.5080904795751976, "grad_norm": 2.915496587753296, "learning_rate": 2.8643219445176982e-05, "loss": 2.752, "step": 3038 }, { "epoch": 0.5082577246310156, "grad_norm": 5.192761421203613, "learning_rate": 2.8628778875121443e-05, "loss": 2.9516, "step": 3039 }, { "epoch": 0.5084249696868336, "grad_norm": 7.2841925621032715, "learning_rate": 2.8614337068166846e-05, "loss": 2.9205, "step": 3040 }, { "epoch": 0.5085922147426517, "grad_norm": 3.254154920578003, "learning_rate": 2.859989402923583e-05, "loss": 2.8932, "step": 3041 }, { "epoch": 0.5087594597984697, "grad_norm": 3.4575631618499756, "learning_rate": 2.8585449763251397e-05, "loss": 2.7147, "step": 3042 }, { "epoch": 0.5089267048542877, "grad_norm": 8.451218605041504, "learning_rate": 2.8571004275137015e-05, "loss": 3.2639, "step": 3043 }, { "epoch": 0.5090939499101058, "grad_norm": 3.0859298706054688, "learning_rate": 2.855655756981653e-05, "loss": 2.6426, "step": 3044 }, { "epoch": 0.5092611949659238, "grad_norm": 3.5409934520721436, "learning_rate": 2.854210965221422e-05, "loss": 2.9366, "step": 3045 }, { "epoch": 0.5094284400217418, "grad_norm": 3.69792103767395, "learning_rate": 2.852766052725479e-05, "loss": 2.5207, "step": 3046 }, { "epoch": 0.5095956850775599, "grad_norm": 4.2363057136535645, "learning_rate": 2.8513210199863337e-05, "loss": 2.5531, "step": 3047 }, { "epoch": 0.5097629301333779, "grad_norm": 4.695446014404297, "learning_rate": 2.849875867496536e-05, "loss": 3.101, "step": 3048 }, { "epoch": 0.5099301751891959, "grad_norm": 5.06190299987793, "learning_rate": 2.84843059574868e-05, "loss": 2.5722, "step": 3049 }, { "epoch": 0.510097420245014, "grad_norm": 4.762596130371094, "learning_rate": 2.8469852052353973e-05, "loss": 2.6899, "step": 3050 }, { "epoch": 0.510264665300832, "grad_norm": 3.6031789779663086, "learning_rate": 2.845539696449362e-05, "loss": 2.7458, "step": 3051 }, { "epoch": 0.51043191035665, "grad_norm": 3.1957757472991943, "learning_rate": 2.8440940698832858e-05, "loss": 2.6707, "step": 3052 }, { "epoch": 0.5105991554124681, "grad_norm": 4.954264163970947, "learning_rate": 2.842648326029925e-05, "loss": 2.9483, "step": 3053 }, { "epoch": 0.5107664004682861, "grad_norm": 3.246110200881958, "learning_rate": 2.8412024653820708e-05, "loss": 2.7874, "step": 3054 }, { "epoch": 0.5109336455241041, "grad_norm": 5.539830207824707, "learning_rate": 2.8397564884325577e-05, "loss": 3.0144, "step": 3055 }, { "epoch": 0.5111008905799223, "grad_norm": 6.122608184814453, "learning_rate": 2.8383103956742597e-05, "loss": 3.1971, "step": 3056 }, { "epoch": 0.5112681356357403, "grad_norm": 3.5607361793518066, "learning_rate": 2.8368641876000878e-05, "loss": 2.8769, "step": 3057 }, { "epoch": 0.5114353806915584, "grad_norm": 4.628219127655029, "learning_rate": 2.8354178647029946e-05, "loss": 2.5229, "step": 3058 }, { "epoch": 0.5116026257473764, "grad_norm": 4.811853885650635, "learning_rate": 2.833971427475971e-05, "loss": 2.926, "step": 3059 }, { "epoch": 0.5117698708031944, "grad_norm": 3.0814216136932373, "learning_rate": 2.8325248764120476e-05, "loss": 2.2423, "step": 3060 }, { "epoch": 0.5119371158590125, "grad_norm": 5.736023426055908, "learning_rate": 2.8310782120042916e-05, "loss": 3.0333, "step": 3061 }, { "epoch": 0.5121043609148305, "grad_norm": 3.773975133895874, "learning_rate": 2.8296314347458126e-05, "loss": 2.4626, "step": 3062 }, { "epoch": 0.5122716059706485, "grad_norm": 3.8122901916503906, "learning_rate": 2.8281845451297546e-05, "loss": 2.8996, "step": 3063 }, { "epoch": 0.5124388510264666, "grad_norm": 6.847386360168457, "learning_rate": 2.8267375436493033e-05, "loss": 3.4653, "step": 3064 }, { "epoch": 0.5126060960822846, "grad_norm": 4.875690460205078, "learning_rate": 2.825290430797679e-05, "loss": 2.714, "step": 3065 }, { "epoch": 0.5127733411381026, "grad_norm": 5.650965690612793, "learning_rate": 2.823843207068144e-05, "loss": 2.5305, "step": 3066 }, { "epoch": 0.5129405861939207, "grad_norm": 3.4341580867767334, "learning_rate": 2.822395872953996e-05, "loss": 3.1516, "step": 3067 }, { "epoch": 0.5131078312497387, "grad_norm": 4.866370677947998, "learning_rate": 2.8209484289485705e-05, "loss": 2.9787, "step": 3068 }, { "epoch": 0.5132750763055567, "grad_norm": 4.3161468505859375, "learning_rate": 2.81950087554524e-05, "loss": 2.4992, "step": 3069 }, { "epoch": 0.5134423213613748, "grad_norm": 2.25247859954834, "learning_rate": 2.818053213237416e-05, "loss": 2.6374, "step": 3070 }, { "epoch": 0.5136095664171928, "grad_norm": 5.331046104431152, "learning_rate": 2.816605442518545e-05, "loss": 3.3416, "step": 3071 }, { "epoch": 0.5137768114730108, "grad_norm": 6.573606491088867, "learning_rate": 2.8151575638821127e-05, "loss": 2.9274, "step": 3072 }, { "epoch": 0.5139440565288289, "grad_norm": 4.22003698348999, "learning_rate": 2.8137095778216394e-05, "loss": 2.844, "step": 3073 }, { "epoch": 0.5141113015846469, "grad_norm": 9.321552276611328, "learning_rate": 2.812261484830684e-05, "loss": 3.4059, "step": 3074 }, { "epoch": 0.5142785466404649, "grad_norm": 5.33750581741333, "learning_rate": 2.8108132854028396e-05, "loss": 2.8783, "step": 3075 }, { "epoch": 0.514445791696283, "grad_norm": 3.684384822845459, "learning_rate": 2.8093649800317378e-05, "loss": 2.3735, "step": 3076 }, { "epoch": 0.514613036752101, "grad_norm": 4.950192451477051, "learning_rate": 2.8079165692110443e-05, "loss": 3.0288, "step": 3077 }, { "epoch": 0.514780281807919, "grad_norm": 2.8315558433532715, "learning_rate": 2.8064680534344634e-05, "loss": 2.7672, "step": 3078 }, { "epoch": 0.5149475268637371, "grad_norm": 3.5651473999023438, "learning_rate": 2.805019433195733e-05, "loss": 2.6059, "step": 3079 }, { "epoch": 0.5151147719195551, "grad_norm": 6.1370768547058105, "learning_rate": 2.803570708988627e-05, "loss": 2.4524, "step": 3080 }, { "epoch": 0.5152820169753731, "grad_norm": 5.7827277183532715, "learning_rate": 2.802121881306954e-05, "loss": 2.2294, "step": 3081 }, { "epoch": 0.5154492620311912, "grad_norm": 2.2943315505981445, "learning_rate": 2.8006729506445607e-05, "loss": 2.336, "step": 3082 }, { "epoch": 0.5156165070870092, "grad_norm": 6.077810764312744, "learning_rate": 2.7992239174953255e-05, "loss": 3.2228, "step": 3083 }, { "epoch": 0.5157837521428272, "grad_norm": 5.809459686279297, "learning_rate": 2.7977747823531642e-05, "loss": 3.1426, "step": 3084 }, { "epoch": 0.5159509971986453, "grad_norm": 7.4663848876953125, "learning_rate": 2.7963255457120253e-05, "loss": 2.4338, "step": 3085 }, { "epoch": 0.5161182422544633, "grad_norm": 5.624415397644043, "learning_rate": 2.794876208065894e-05, "loss": 2.7974, "step": 3086 }, { "epoch": 0.5162854873102813, "grad_norm": 3.774395704269409, "learning_rate": 2.793426769908789e-05, "loss": 2.791, "step": 3087 }, { "epoch": 0.5164527323660995, "grad_norm": 3.6204428672790527, "learning_rate": 2.7919772317347626e-05, "loss": 2.9545, "step": 3088 }, { "epoch": 0.5166199774219175, "grad_norm": 3.570150136947632, "learning_rate": 2.790527594037901e-05, "loss": 2.6268, "step": 3089 }, { "epoch": 0.5167872224777355, "grad_norm": 5.523581027984619, "learning_rate": 2.7890778573123265e-05, "loss": 2.8369, "step": 3090 }, { "epoch": 0.5169544675335536, "grad_norm": 2.9207088947296143, "learning_rate": 2.787628022052194e-05, "loss": 2.5157, "step": 3091 }, { "epoch": 0.5171217125893716, "grad_norm": 3.3700881004333496, "learning_rate": 2.7861780887516904e-05, "loss": 2.8245, "step": 3092 }, { "epoch": 0.5172889576451896, "grad_norm": 4.166431903839111, "learning_rate": 2.7847280579050383e-05, "loss": 2.8872, "step": 3093 }, { "epoch": 0.5174562027010077, "grad_norm": 4.1954426765441895, "learning_rate": 2.7832779300064916e-05, "loss": 2.6951, "step": 3094 }, { "epoch": 0.5176234477568257, "grad_norm": 3.3132853507995605, "learning_rate": 2.7818277055503394e-05, "loss": 2.6315, "step": 3095 }, { "epoch": 0.5177906928126437, "grad_norm": 5.251841068267822, "learning_rate": 2.7803773850309022e-05, "loss": 2.7394, "step": 3096 }, { "epoch": 0.5179579378684618, "grad_norm": 5.619594097137451, "learning_rate": 2.7789269689425334e-05, "loss": 2.459, "step": 3097 }, { "epoch": 0.5181251829242798, "grad_norm": 6.644158363342285, "learning_rate": 2.77747645777962e-05, "loss": 2.5145, "step": 3098 }, { "epoch": 0.5182924279800979, "grad_norm": 3.7096188068389893, "learning_rate": 2.7760258520365795e-05, "loss": 3.111, "step": 3099 }, { "epoch": 0.5184596730359159, "grad_norm": 3.125216245651245, "learning_rate": 2.7745751522078634e-05, "loss": 2.7658, "step": 3100 }, { "epoch": 0.5186269180917339, "grad_norm": 2.7645421028137207, "learning_rate": 2.7731243587879545e-05, "loss": 3.0296, "step": 3101 }, { "epoch": 0.518794163147552, "grad_norm": 4.751526355743408, "learning_rate": 2.7716734722713682e-05, "loss": 3.2057, "step": 3102 }, { "epoch": 0.51896140820337, "grad_norm": 7.704139709472656, "learning_rate": 2.7702224931526506e-05, "loss": 2.8726, "step": 3103 }, { "epoch": 0.519128653259188, "grad_norm": 7.436704158782959, "learning_rate": 2.7687714219263793e-05, "loss": 2.1516, "step": 3104 }, { "epoch": 0.5192958983150061, "grad_norm": 6.139180660247803, "learning_rate": 2.767320259087165e-05, "loss": 3.006, "step": 3105 }, { "epoch": 0.5194631433708241, "grad_norm": 6.780179977416992, "learning_rate": 2.765869005129647e-05, "loss": 3.3065, "step": 3106 }, { "epoch": 0.5196303884266421, "grad_norm": 4.100700855255127, "learning_rate": 2.7644176605484985e-05, "loss": 2.9286, "step": 3107 }, { "epoch": 0.5197976334824602, "grad_norm": 3.1721925735473633, "learning_rate": 2.7629662258384216e-05, "loss": 2.6336, "step": 3108 }, { "epoch": 0.5199648785382782, "grad_norm": 4.548224925994873, "learning_rate": 2.7615147014941496e-05, "loss": 2.5689, "step": 3109 }, { "epoch": 0.5201321235940962, "grad_norm": 4.402547359466553, "learning_rate": 2.7600630880104454e-05, "loss": 2.7234, "step": 3110 }, { "epoch": 0.5202993686499143, "grad_norm": 4.113526344299316, "learning_rate": 2.758611385882106e-05, "loss": 3.2932, "step": 3111 }, { "epoch": 0.5204666137057323, "grad_norm": 14.481661796569824, "learning_rate": 2.7571595956039547e-05, "loss": 3.2952, "step": 3112 }, { "epoch": 0.5206338587615503, "grad_norm": 3.7324700355529785, "learning_rate": 2.7557077176708455e-05, "loss": 3.0796, "step": 3113 }, { "epoch": 0.5208011038173684, "grad_norm": 6.371458530426025, "learning_rate": 2.7542557525776636e-05, "loss": 2.8821, "step": 3114 }, { "epoch": 0.5209683488731864, "grad_norm": 3.0502970218658447, "learning_rate": 2.752803700819323e-05, "loss": 2.6075, "step": 3115 }, { "epoch": 0.5211355939290044, "grad_norm": 3.572669744491577, "learning_rate": 2.751351562890767e-05, "loss": 2.7319, "step": 3116 }, { "epoch": 0.5213028389848225, "grad_norm": 3.121718168258667, "learning_rate": 2.74989933928697e-05, "loss": 2.451, "step": 3117 }, { "epoch": 0.5214700840406405, "grad_norm": 3.2513842582702637, "learning_rate": 2.7484470305029326e-05, "loss": 2.6976, "step": 3118 }, { "epoch": 0.5216373290964585, "grad_norm": 2.494527816772461, "learning_rate": 2.7469946370336874e-05, "loss": 2.5205, "step": 3119 }, { "epoch": 0.5218045741522767, "grad_norm": 3.9188334941864014, "learning_rate": 2.7455421593742942e-05, "loss": 2.767, "step": 3120 }, { "epoch": 0.5219718192080947, "grad_norm": 4.827089786529541, "learning_rate": 2.7440895980198422e-05, "loss": 2.8525, "step": 3121 }, { "epoch": 0.5221390642639127, "grad_norm": 2.293349027633667, "learning_rate": 2.7426369534654485e-05, "loss": 2.5744, "step": 3122 }, { "epoch": 0.5223063093197308, "grad_norm": 4.628221035003662, "learning_rate": 2.7411842262062587e-05, "loss": 2.4171, "step": 3123 }, { "epoch": 0.5224735543755488, "grad_norm": 3.491091728210449, "learning_rate": 2.7397314167374477e-05, "loss": 2.4956, "step": 3124 }, { "epoch": 0.5226407994313668, "grad_norm": 3.1372084617614746, "learning_rate": 2.738278525554217e-05, "loss": 2.4245, "step": 3125 }, { "epoch": 0.5228080444871849, "grad_norm": 4.1341447830200195, "learning_rate": 2.7368255531517966e-05, "loss": 2.7171, "step": 3126 }, { "epoch": 0.5229752895430029, "grad_norm": 5.887813568115234, "learning_rate": 2.7353725000254432e-05, "loss": 3.1707, "step": 3127 }, { "epoch": 0.5231425345988209, "grad_norm": 5.604085445404053, "learning_rate": 2.7339193666704432e-05, "loss": 2.7069, "step": 3128 }, { "epoch": 0.523309779654639, "grad_norm": 3.8573226928710938, "learning_rate": 2.732466153582109e-05, "loss": 2.6415, "step": 3129 }, { "epoch": 0.523477024710457, "grad_norm": 5.523350715637207, "learning_rate": 2.731012861255779e-05, "loss": 2.8956, "step": 3130 }, { "epoch": 0.523644269766275, "grad_norm": 4.056521892547607, "learning_rate": 2.7295594901868204e-05, "loss": 2.8705, "step": 3131 }, { "epoch": 0.5238115148220931, "grad_norm": 10.069238662719727, "learning_rate": 2.7281060408706264e-05, "loss": 2.8297, "step": 3132 }, { "epoch": 0.5239787598779111, "grad_norm": 4.584534168243408, "learning_rate": 2.7266525138026183e-05, "loss": 2.7337, "step": 3133 }, { "epoch": 0.5241460049337291, "grad_norm": 6.198544025421143, "learning_rate": 2.7251989094782417e-05, "loss": 2.8513, "step": 3134 }, { "epoch": 0.5243132499895472, "grad_norm": 3.9820775985717773, "learning_rate": 2.7237452283929686e-05, "loss": 2.8828, "step": 3135 }, { "epoch": 0.5244804950453652, "grad_norm": 3.0868732929229736, "learning_rate": 2.7222914710422997e-05, "loss": 3.1146, "step": 3136 }, { "epoch": 0.5246477401011833, "grad_norm": 9.505027770996094, "learning_rate": 2.7208376379217606e-05, "loss": 2.6823, "step": 3137 }, { "epoch": 0.5248149851570013, "grad_norm": 6.574689865112305, "learning_rate": 2.7193837295269002e-05, "loss": 2.8787, "step": 3138 }, { "epoch": 0.5249822302128193, "grad_norm": 4.313689708709717, "learning_rate": 2.717929746353296e-05, "loss": 2.2468, "step": 3139 }, { "epoch": 0.5251494752686374, "grad_norm": 3.7155425548553467, "learning_rate": 2.716475688896551e-05, "loss": 2.9433, "step": 3140 }, { "epoch": 0.5253167203244554, "grad_norm": 3.260544538497925, "learning_rate": 2.715021557652292e-05, "loss": 2.5471, "step": 3141 }, { "epoch": 0.5254839653802734, "grad_norm": 3.5409414768218994, "learning_rate": 2.7135673531161708e-05, "loss": 2.8706, "step": 3142 }, { "epoch": 0.5256512104360915, "grad_norm": 5.851815223693848, "learning_rate": 2.712113075783866e-05, "loss": 2.8867, "step": 3143 }, { "epoch": 0.5258184554919095, "grad_norm": 3.7663581371307373, "learning_rate": 2.7106587261510797e-05, "loss": 2.7099, "step": 3144 }, { "epoch": 0.5259857005477275, "grad_norm": 11.493402481079102, "learning_rate": 2.7092043047135386e-05, "loss": 4.3581, "step": 3145 }, { "epoch": 0.5261529456035456, "grad_norm": 4.214013576507568, "learning_rate": 2.707749811966994e-05, "loss": 2.8357, "step": 3146 }, { "epoch": 0.5263201906593636, "grad_norm": 3.5712199211120605, "learning_rate": 2.7062952484072218e-05, "loss": 2.8069, "step": 3147 }, { "epoch": 0.5264874357151816, "grad_norm": 4.443002223968506, "learning_rate": 2.7048406145300225e-05, "loss": 2.8781, "step": 3148 }, { "epoch": 0.5266546807709998, "grad_norm": 6.2658281326293945, "learning_rate": 2.7033859108312197e-05, "loss": 2.8683, "step": 3149 }, { "epoch": 0.5268219258268177, "grad_norm": 3.082001209259033, "learning_rate": 2.7019311378066607e-05, "loss": 2.2524, "step": 3150 }, { "epoch": 0.5269891708826357, "grad_norm": 4.635153293609619, "learning_rate": 2.7004762959522174e-05, "loss": 2.9201, "step": 3151 }, { "epoch": 0.5271564159384539, "grad_norm": 10.672508239746094, "learning_rate": 2.699021385763784e-05, "loss": 2.7514, "step": 3152 }, { "epoch": 0.5273236609942719, "grad_norm": 9.836735725402832, "learning_rate": 2.6975664077372792e-05, "loss": 2.8661, "step": 3153 }, { "epoch": 0.5274909060500899, "grad_norm": 9.464338302612305, "learning_rate": 2.696111362368644e-05, "loss": 2.6502, "step": 3154 }, { "epoch": 0.527658151105908, "grad_norm": 3.6969873905181885, "learning_rate": 2.6946562501538413e-05, "loss": 2.5489, "step": 3155 }, { "epoch": 0.527825396161726, "grad_norm": 4.00490665435791, "learning_rate": 2.69320107158886e-05, "loss": 3.1784, "step": 3156 }, { "epoch": 0.527992641217544, "grad_norm": 4.511627674102783, "learning_rate": 2.6917458271697093e-05, "loss": 2.7598, "step": 3157 }, { "epoch": 0.5281598862733621, "grad_norm": 7.775540351867676, "learning_rate": 2.6902905173924202e-05, "loss": 3.112, "step": 3158 }, { "epoch": 0.5283271313291801, "grad_norm": 3.6549599170684814, "learning_rate": 2.6888351427530472e-05, "loss": 3.0793, "step": 3159 }, { "epoch": 0.5284943763849981, "grad_norm": 5.56593132019043, "learning_rate": 2.6873797037476673e-05, "loss": 2.8317, "step": 3160 }, { "epoch": 0.5286616214408162, "grad_norm": 5.166120529174805, "learning_rate": 2.6859242008723786e-05, "loss": 2.6973, "step": 3161 }, { "epoch": 0.5288288664966342, "grad_norm": 2.8481638431549072, "learning_rate": 2.6844686346233007e-05, "loss": 2.6881, "step": 3162 }, { "epoch": 0.5289961115524522, "grad_norm": 4.336769104003906, "learning_rate": 2.6830130054965758e-05, "loss": 2.9144, "step": 3163 }, { "epoch": 0.5291633566082703, "grad_norm": 3.3901617527008057, "learning_rate": 2.6815573139883667e-05, "loss": 2.8676, "step": 3164 }, { "epoch": 0.5293306016640883, "grad_norm": 4.406470775604248, "learning_rate": 2.680101560594858e-05, "loss": 2.7657, "step": 3165 }, { "epoch": 0.5294978467199063, "grad_norm": 3.9122366905212402, "learning_rate": 2.6786457458122554e-05, "loss": 2.7381, "step": 3166 }, { "epoch": 0.5296650917757244, "grad_norm": 4.761398792266846, "learning_rate": 2.677189870136785e-05, "loss": 2.862, "step": 3167 }, { "epoch": 0.5298323368315424, "grad_norm": 11.651016235351562, "learning_rate": 2.6757339340646936e-05, "loss": 2.9499, "step": 3168 }, { "epoch": 0.5299995818873604, "grad_norm": 4.748629093170166, "learning_rate": 2.6742779380922494e-05, "loss": 3.0071, "step": 3169 }, { "epoch": 0.5301668269431785, "grad_norm": 6.148697376251221, "learning_rate": 2.6728218827157413e-05, "loss": 2.5176, "step": 3170 }, { "epoch": 0.5303340719989965, "grad_norm": 5.878487586975098, "learning_rate": 2.6713657684314764e-05, "loss": 2.8925, "step": 3171 }, { "epoch": 0.5305013170548145, "grad_norm": 5.118287086486816, "learning_rate": 2.669909595735784e-05, "loss": 2.6868, "step": 3172 }, { "epoch": 0.5306685621106326, "grad_norm": 2.5549235343933105, "learning_rate": 2.668453365125012e-05, "loss": 2.9314, "step": 3173 }, { "epoch": 0.5308358071664506, "grad_norm": 4.784734725952148, "learning_rate": 2.6669970770955292e-05, "loss": 3.1964, "step": 3174 }, { "epoch": 0.5310030522222687, "grad_norm": 4.471218109130859, "learning_rate": 2.665540732143723e-05, "loss": 2.7626, "step": 3175 }, { "epoch": 0.5311702972780867, "grad_norm": 4.975656032562256, "learning_rate": 2.6640843307659997e-05, "loss": 2.5757, "step": 3176 }, { "epoch": 0.5313375423339047, "grad_norm": 5.622832775115967, "learning_rate": 2.6626278734587873e-05, "loss": 2.8406, "step": 3177 }, { "epoch": 0.5315047873897228, "grad_norm": 3.003981828689575, "learning_rate": 2.6611713607185296e-05, "loss": 2.6958, "step": 3178 }, { "epoch": 0.5316720324455408, "grad_norm": 3.026662826538086, "learning_rate": 2.6597147930416922e-05, "loss": 2.5292, "step": 3179 }, { "epoch": 0.5318392775013588, "grad_norm": 5.660758018493652, "learning_rate": 2.658258170924757e-05, "loss": 2.7995, "step": 3180 }, { "epoch": 0.532006522557177, "grad_norm": 5.592401504516602, "learning_rate": 2.6568014948642266e-05, "loss": 2.7955, "step": 3181 }, { "epoch": 0.532173767612995, "grad_norm": 13.457381248474121, "learning_rate": 2.655344765356621e-05, "loss": 3.8412, "step": 3182 }, { "epoch": 0.532341012668813, "grad_norm": 5.12701416015625, "learning_rate": 2.6538879828984776e-05, "loss": 2.6402, "step": 3183 }, { "epoch": 0.5325082577246311, "grad_norm": 9.225018501281738, "learning_rate": 2.6524311479863524e-05, "loss": 2.6502, "step": 3184 }, { "epoch": 0.5326755027804491, "grad_norm": 8.69658088684082, "learning_rate": 2.650974261116821e-05, "loss": 2.6889, "step": 3185 }, { "epoch": 0.5328427478362671, "grad_norm": 4.9013991355896, "learning_rate": 2.649517322786474e-05, "loss": 2.6016, "step": 3186 }, { "epoch": 0.5330099928920852, "grad_norm": 3.6428322792053223, "learning_rate": 2.6480603334919213e-05, "loss": 2.6287, "step": 3187 }, { "epoch": 0.5331772379479032, "grad_norm": 3.112133741378784, "learning_rate": 2.6466032937297895e-05, "loss": 2.0154, "step": 3188 }, { "epoch": 0.5333444830037212, "grad_norm": 5.664663791656494, "learning_rate": 2.6451462039967228e-05, "loss": 2.8895, "step": 3189 }, { "epoch": 0.5335117280595393, "grad_norm": 4.55488395690918, "learning_rate": 2.6436890647893814e-05, "loss": 2.8733, "step": 3190 }, { "epoch": 0.5336789731153573, "grad_norm": 15.57378101348877, "learning_rate": 2.6422318766044446e-05, "loss": 3.2387, "step": 3191 }, { "epoch": 0.5338462181711753, "grad_norm": 5.137446403503418, "learning_rate": 2.6407746399386047e-05, "loss": 3.0344, "step": 3192 }, { "epoch": 0.5340134632269934, "grad_norm": 8.943876266479492, "learning_rate": 2.6393173552885753e-05, "loss": 3.1798, "step": 3193 }, { "epoch": 0.5341807082828114, "grad_norm": 8.070920944213867, "learning_rate": 2.637860023151082e-05, "loss": 2.7433, "step": 3194 }, { "epoch": 0.5343479533386294, "grad_norm": 5.280412197113037, "learning_rate": 2.6364026440228696e-05, "loss": 2.8485, "step": 3195 }, { "epoch": 0.5345151983944475, "grad_norm": 3.0082082748413086, "learning_rate": 2.6349452184006962e-05, "loss": 2.3628, "step": 3196 }, { "epoch": 0.5346824434502655, "grad_norm": 3.9818568229675293, "learning_rate": 2.633487746781339e-05, "loss": 2.5707, "step": 3197 }, { "epoch": 0.5348496885060835, "grad_norm": 3.4259800910949707, "learning_rate": 2.6320302296615883e-05, "loss": 2.5491, "step": 3198 }, { "epoch": 0.5350169335619016, "grad_norm": 2.3688926696777344, "learning_rate": 2.6305726675382512e-05, "loss": 2.7956, "step": 3199 }, { "epoch": 0.5351841786177196, "grad_norm": 4.271066188812256, "learning_rate": 2.6291150609081494e-05, "loss": 2.8558, "step": 3200 }, { "epoch": 0.5353514236735376, "grad_norm": 6.758511066436768, "learning_rate": 2.62765741026812e-05, "loss": 3.1768, "step": 3201 }, { "epoch": 0.5355186687293557, "grad_norm": 5.910772800445557, "learning_rate": 2.6261997161150155e-05, "loss": 2.9605, "step": 3202 }, { "epoch": 0.5356859137851737, "grad_norm": 10.28618049621582, "learning_rate": 2.6247419789457022e-05, "loss": 2.8814, "step": 3203 }, { "epoch": 0.5358531588409917, "grad_norm": 7.530214309692383, "learning_rate": 2.6232841992570622e-05, "loss": 2.9027, "step": 3204 }, { "epoch": 0.5360204038968098, "grad_norm": 14.826898574829102, "learning_rate": 2.621826377545992e-05, "loss": 3.023, "step": 3205 }, { "epoch": 0.5361876489526278, "grad_norm": 3.6257388591766357, "learning_rate": 2.620368514309401e-05, "loss": 2.8162, "step": 3206 }, { "epoch": 0.5363548940084458, "grad_norm": 6.609095573425293, "learning_rate": 2.6189106100442152e-05, "loss": 3.0508, "step": 3207 }, { "epoch": 0.5365221390642639, "grad_norm": 8.754345893859863, "learning_rate": 2.617452665247373e-05, "loss": 3.3976, "step": 3208 }, { "epoch": 0.5366893841200819, "grad_norm": 9.14730167388916, "learning_rate": 2.6159946804158253e-05, "loss": 3.1644, "step": 3209 }, { "epoch": 0.5368566291758999, "grad_norm": 4.022088527679443, "learning_rate": 2.6145366560465407e-05, "loss": 2.7879, "step": 3210 }, { "epoch": 0.537023874231718, "grad_norm": 9.303924560546875, "learning_rate": 2.613078592636497e-05, "loss": 2.7949, "step": 3211 }, { "epoch": 0.537191119287536, "grad_norm": 8.164093971252441, "learning_rate": 2.6116204906826873e-05, "loss": 3.0357, "step": 3212 }, { "epoch": 0.537358364343354, "grad_norm": 3.8426811695098877, "learning_rate": 2.6101623506821175e-05, "loss": 2.7061, "step": 3213 }, { "epoch": 0.5375256093991722, "grad_norm": 5.641882419586182, "learning_rate": 2.6087041731318074e-05, "loss": 2.8718, "step": 3214 }, { "epoch": 0.5376928544549902, "grad_norm": 4.5168633460998535, "learning_rate": 2.6072459585287884e-05, "loss": 2.6362, "step": 3215 }, { "epoch": 0.5378600995108083, "grad_norm": 4.113743305206299, "learning_rate": 2.605787707370105e-05, "loss": 2.842, "step": 3216 }, { "epoch": 0.5380273445666263, "grad_norm": 3.456580400466919, "learning_rate": 2.6043294201528134e-05, "loss": 3.2179, "step": 3217 }, { "epoch": 0.5381945896224443, "grad_norm": 15.60109806060791, "learning_rate": 2.6028710973739834e-05, "loss": 2.7244, "step": 3218 }, { "epoch": 0.5383618346782624, "grad_norm": 4.732779502868652, "learning_rate": 2.6014127395306966e-05, "loss": 2.8427, "step": 3219 }, { "epoch": 0.5385290797340804, "grad_norm": 8.976261138916016, "learning_rate": 2.599954347120045e-05, "loss": 3.0464, "step": 3220 }, { "epoch": 0.5386963247898984, "grad_norm": 4.55327844619751, "learning_rate": 2.5984959206391342e-05, "loss": 2.6184, "step": 3221 }, { "epoch": 0.5388635698457165, "grad_norm": 3.310645580291748, "learning_rate": 2.5970374605850812e-05, "loss": 3.0869, "step": 3222 }, { "epoch": 0.5390308149015345, "grad_norm": 7.620575904846191, "learning_rate": 2.595578967455014e-05, "loss": 2.7151, "step": 3223 }, { "epoch": 0.5391980599573525, "grad_norm": 5.675690174102783, "learning_rate": 2.594120441746072e-05, "loss": 2.7962, "step": 3224 }, { "epoch": 0.5393653050131706, "grad_norm": 4.605445384979248, "learning_rate": 2.5926618839554045e-05, "loss": 2.7493, "step": 3225 }, { "epoch": 0.5395325500689886, "grad_norm": 2.486682415008545, "learning_rate": 2.5912032945801745e-05, "loss": 2.3304, "step": 3226 }, { "epoch": 0.5396997951248066, "grad_norm": 7.302953243255615, "learning_rate": 2.5897446741175536e-05, "loss": 2.594, "step": 3227 }, { "epoch": 0.5398670401806247, "grad_norm": 3.8346641063690186, "learning_rate": 2.5882860230647244e-05, "loss": 2.5441, "step": 3228 }, { "epoch": 0.5400342852364427, "grad_norm": 4.411869049072266, "learning_rate": 2.5868273419188793e-05, "loss": 3.2227, "step": 3229 }, { "epoch": 0.5402015302922607, "grad_norm": 4.785394191741943, "learning_rate": 2.585368631177224e-05, "loss": 2.6349, "step": 3230 }, { "epoch": 0.5403687753480788, "grad_norm": 6.081172943115234, "learning_rate": 2.5839098913369704e-05, "loss": 2.7852, "step": 3231 }, { "epoch": 0.5405360204038968, "grad_norm": 4.374632358551025, "learning_rate": 2.5824511228953412e-05, "loss": 2.5859, "step": 3232 }, { "epoch": 0.5407032654597148, "grad_norm": 14.444986343383789, "learning_rate": 2.5809923263495705e-05, "loss": 2.1357, "step": 3233 }, { "epoch": 0.5408705105155329, "grad_norm": 3.7058427333831787, "learning_rate": 2.5795335021969015e-05, "loss": 2.5366, "step": 3234 }, { "epoch": 0.5410377555713509, "grad_norm": 2.5005154609680176, "learning_rate": 2.5780746509345856e-05, "loss": 2.5872, "step": 3235 }, { "epoch": 0.5412050006271689, "grad_norm": 5.498269557952881, "learning_rate": 2.576615773059885e-05, "loss": 2.8535, "step": 3236 }, { "epoch": 0.541372245682987, "grad_norm": 2.6597342491149902, "learning_rate": 2.5751568690700696e-05, "loss": 2.6609, "step": 3237 }, { "epoch": 0.541539490738805, "grad_norm": 5.574611663818359, "learning_rate": 2.5736979394624183e-05, "loss": 2.8546, "step": 3238 }, { "epoch": 0.541706735794623, "grad_norm": 5.08323335647583, "learning_rate": 2.5722389847342205e-05, "loss": 2.8596, "step": 3239 }, { "epoch": 0.5418739808504411, "grad_norm": 6.171603202819824, "learning_rate": 2.570780005382772e-05, "loss": 2.8061, "step": 3240 }, { "epoch": 0.5420412259062591, "grad_norm": 5.816606521606445, "learning_rate": 2.5693210019053783e-05, "loss": 2.671, "step": 3241 }, { "epoch": 0.5422084709620771, "grad_norm": 3.317129373550415, "learning_rate": 2.5678619747993522e-05, "loss": 2.8083, "step": 3242 }, { "epoch": 0.5423757160178952, "grad_norm": 6.775679588317871, "learning_rate": 2.5664029245620154e-05, "loss": 3.2302, "step": 3243 }, { "epoch": 0.5425429610737132, "grad_norm": 6.861614227294922, "learning_rate": 2.5649438516906975e-05, "loss": 3.2002, "step": 3244 }, { "epoch": 0.5427102061295312, "grad_norm": 3.8094637393951416, "learning_rate": 2.5634847566827346e-05, "loss": 2.6222, "step": 3245 }, { "epoch": 0.5428774511853494, "grad_norm": 10.02008056640625, "learning_rate": 2.5620256400354726e-05, "loss": 2.7282, "step": 3246 }, { "epoch": 0.5430446962411674, "grad_norm": 7.058468341827393, "learning_rate": 2.5605665022462615e-05, "loss": 2.7486, "step": 3247 }, { "epoch": 0.5432119412969854, "grad_norm": 6.790282726287842, "learning_rate": 2.559107343812462e-05, "loss": 3.2432, "step": 3248 }, { "epoch": 0.5433791863528035, "grad_norm": 6.433040618896484, "learning_rate": 2.5576481652314394e-05, "loss": 2.2857, "step": 3249 }, { "epoch": 0.5435464314086215, "grad_norm": 7.300047397613525, "learning_rate": 2.5561889670005663e-05, "loss": 2.938, "step": 3250 }, { "epoch": 0.5437136764644395, "grad_norm": 6.542567729949951, "learning_rate": 2.554729749617223e-05, "loss": 2.7916, "step": 3251 }, { "epoch": 0.5438809215202576, "grad_norm": 3.7539188861846924, "learning_rate": 2.553270513578796e-05, "loss": 3.2965, "step": 3252 }, { "epoch": 0.5440481665760756, "grad_norm": 2.8393638134002686, "learning_rate": 2.5518112593826778e-05, "loss": 2.6962, "step": 3253 }, { "epoch": 0.5442154116318937, "grad_norm": 4.171432971954346, "learning_rate": 2.550351987526266e-05, "loss": 2.5869, "step": 3254 }, { "epoch": 0.5443826566877117, "grad_norm": 6.497957229614258, "learning_rate": 2.5488926985069665e-05, "loss": 3.1566, "step": 3255 }, { "epoch": 0.5445499017435297, "grad_norm": 4.770649433135986, "learning_rate": 2.5474333928221904e-05, "loss": 3.0689, "step": 3256 }, { "epoch": 0.5447171467993478, "grad_norm": 7.623207092285156, "learning_rate": 2.545974070969353e-05, "loss": 3.0639, "step": 3257 }, { "epoch": 0.5448843918551658, "grad_norm": 5.244846343994141, "learning_rate": 2.544514733445876e-05, "loss": 2.5285, "step": 3258 }, { "epoch": 0.5450516369109838, "grad_norm": 5.155587196350098, "learning_rate": 2.5430553807491875e-05, "loss": 2.6166, "step": 3259 }, { "epoch": 0.5452188819668019, "grad_norm": 7.965775012969971, "learning_rate": 2.5415960133767196e-05, "loss": 2.9419, "step": 3260 }, { "epoch": 0.5453861270226199, "grad_norm": 5.599846363067627, "learning_rate": 2.540136631825909e-05, "loss": 2.5456, "step": 3261 }, { "epoch": 0.5455533720784379, "grad_norm": 4.604198455810547, "learning_rate": 2.5386772365941973e-05, "loss": 3.0828, "step": 3262 }, { "epoch": 0.545720617134256, "grad_norm": 14.258877754211426, "learning_rate": 2.5372178281790332e-05, "loss": 3.2314, "step": 3263 }, { "epoch": 0.545887862190074, "grad_norm": 5.632051944732666, "learning_rate": 2.535758407077867e-05, "loss": 3.274, "step": 3264 }, { "epoch": 0.546055107245892, "grad_norm": 4.861525058746338, "learning_rate": 2.5342989737881545e-05, "loss": 2.5385, "step": 3265 }, { "epoch": 0.5462223523017101, "grad_norm": 5.163285255432129, "learning_rate": 2.5328395288073553e-05, "loss": 2.7948, "step": 3266 }, { "epoch": 0.5463895973575281, "grad_norm": 2.255234718322754, "learning_rate": 2.5313800726329334e-05, "loss": 2.5542, "step": 3267 }, { "epoch": 0.5465568424133461, "grad_norm": 6.41292142868042, "learning_rate": 2.5299206057623563e-05, "loss": 2.6537, "step": 3268 }, { "epoch": 0.5467240874691642, "grad_norm": 2.8978748321533203, "learning_rate": 2.528461128693096e-05, "loss": 2.7744, "step": 3269 }, { "epoch": 0.5468913325249822, "grad_norm": 3.423196315765381, "learning_rate": 2.527001641922625e-05, "loss": 2.8092, "step": 3270 }, { "epoch": 0.5470585775808002, "grad_norm": 4.161614894866943, "learning_rate": 2.525542145948424e-05, "loss": 3.0147, "step": 3271 }, { "epoch": 0.5472258226366183, "grad_norm": 17.97797966003418, "learning_rate": 2.524082641267973e-05, "loss": 3.0038, "step": 3272 }, { "epoch": 0.5473930676924363, "grad_norm": 3.8939168453216553, "learning_rate": 2.5226231283787565e-05, "loss": 2.4161, "step": 3273 }, { "epoch": 0.5475603127482543, "grad_norm": 5.805858612060547, "learning_rate": 2.5211636077782608e-05, "loss": 3.025, "step": 3274 }, { "epoch": 0.5477275578040725, "grad_norm": 4.0631327629089355, "learning_rate": 2.519704079963976e-05, "loss": 2.4077, "step": 3275 }, { "epoch": 0.5478948028598905, "grad_norm": 3.741769313812256, "learning_rate": 2.5182445454333936e-05, "loss": 2.9443, "step": 3276 }, { "epoch": 0.5480620479157085, "grad_norm": 6.392513751983643, "learning_rate": 2.516785004684008e-05, "loss": 3.418, "step": 3277 }, { "epoch": 0.5482292929715266, "grad_norm": 3.732273817062378, "learning_rate": 2.5153254582133167e-05, "loss": 2.9076, "step": 3278 }, { "epoch": 0.5483965380273446, "grad_norm": 10.507570266723633, "learning_rate": 2.5138659065188162e-05, "loss": 1.5224, "step": 3279 }, { "epoch": 0.5485637830831626, "grad_norm": 2.4484519958496094, "learning_rate": 2.5124063500980084e-05, "loss": 2.3741, "step": 3280 }, { "epoch": 0.5487310281389807, "grad_norm": 2.922632932662964, "learning_rate": 2.5109467894483947e-05, "loss": 2.3643, "step": 3281 }, { "epoch": 0.5488982731947987, "grad_norm": 5.8869218826293945, "learning_rate": 2.5094872250674772e-05, "loss": 2.9566, "step": 3282 }, { "epoch": 0.5490655182506167, "grad_norm": 3.043200969696045, "learning_rate": 2.508027657452761e-05, "loss": 2.745, "step": 3283 }, { "epoch": 0.5492327633064348, "grad_norm": 3.8970704078674316, "learning_rate": 2.5065680871017527e-05, "loss": 2.5735, "step": 3284 }, { "epoch": 0.5494000083622528, "grad_norm": 2.786454677581787, "learning_rate": 2.5051085145119584e-05, "loss": 2.5377, "step": 3285 }, { "epoch": 0.5495672534180708, "grad_norm": 3.7385988235473633, "learning_rate": 2.5036489401808845e-05, "loss": 2.689, "step": 3286 }, { "epoch": 0.5497344984738889, "grad_norm": 5.931396007537842, "learning_rate": 2.502189364606039e-05, "loss": 3.0596, "step": 3287 }, { "epoch": 0.5499017435297069, "grad_norm": 7.684423446655273, "learning_rate": 2.500729788284931e-05, "loss": 3.5485, "step": 3288 }, { "epoch": 0.5500689885855249, "grad_norm": 5.424783229827881, "learning_rate": 2.4992702117150695e-05, "loss": 2.6762, "step": 3289 }, { "epoch": 0.550236233641343, "grad_norm": 7.933459758758545, "learning_rate": 2.4978106353939616e-05, "loss": 2.8711, "step": 3290 }, { "epoch": 0.550403478697161, "grad_norm": 6.501350402832031, "learning_rate": 2.4963510598191167e-05, "loss": 2.4609, "step": 3291 }, { "epoch": 0.5505707237529791, "grad_norm": 4.606449604034424, "learning_rate": 2.494891485488043e-05, "loss": 2.5996, "step": 3292 }, { "epoch": 0.5507379688087971, "grad_norm": 4.260654926300049, "learning_rate": 2.493431912898247e-05, "loss": 2.92, "step": 3293 }, { "epoch": 0.5509052138646151, "grad_norm": 3.7255682945251465, "learning_rate": 2.491972342547239e-05, "loss": 2.8022, "step": 3294 }, { "epoch": 0.5510724589204332, "grad_norm": 3.4048476219177246, "learning_rate": 2.4905127749325234e-05, "loss": 2.3055, "step": 3295 }, { "epoch": 0.5512397039762512, "grad_norm": 5.025787830352783, "learning_rate": 2.4890532105516062e-05, "loss": 2.9005, "step": 3296 }, { "epoch": 0.5514069490320692, "grad_norm": 4.8434672355651855, "learning_rate": 2.4875936499019922e-05, "loss": 3.0041, "step": 3297 }, { "epoch": 0.5515741940878873, "grad_norm": 4.002682685852051, "learning_rate": 2.4861340934811844e-05, "loss": 2.7638, "step": 3298 }, { "epoch": 0.5517414391437053, "grad_norm": 5.6782732009887695, "learning_rate": 2.4846745417866845e-05, "loss": 2.5895, "step": 3299 }, { "epoch": 0.5519086841995233, "grad_norm": 3.3286468982696533, "learning_rate": 2.4832149953159928e-05, "loss": 2.7012, "step": 3300 }, { "epoch": 0.5520759292553414, "grad_norm": 4.352336883544922, "learning_rate": 2.481755454566607e-05, "loss": 2.9699, "step": 3301 }, { "epoch": 0.5522431743111594, "grad_norm": 3.5013020038604736, "learning_rate": 2.4802959200360247e-05, "loss": 2.934, "step": 3302 }, { "epoch": 0.5524104193669774, "grad_norm": 5.614786148071289, "learning_rate": 2.4788363922217398e-05, "loss": 2.8437, "step": 3303 }, { "epoch": 0.5525776644227955, "grad_norm": 6.519090175628662, "learning_rate": 2.477376871621244e-05, "loss": 2.9153, "step": 3304 }, { "epoch": 0.5527449094786135, "grad_norm": 4.442521095275879, "learning_rate": 2.4759173587320273e-05, "loss": 2.728, "step": 3305 }, { "epoch": 0.5529121545344315, "grad_norm": 4.637145042419434, "learning_rate": 2.4744578540515766e-05, "loss": 2.6205, "step": 3306 }, { "epoch": 0.5530793995902497, "grad_norm": 4.257297039031982, "learning_rate": 2.4729983580773753e-05, "loss": 3.0783, "step": 3307 }, { "epoch": 0.5532466446460677, "grad_norm": 8.573330879211426, "learning_rate": 2.4715388713069053e-05, "loss": 3.131, "step": 3308 }, { "epoch": 0.5534138897018857, "grad_norm": 3.7349042892456055, "learning_rate": 2.4700793942376446e-05, "loss": 2.8849, "step": 3309 }, { "epoch": 0.5535811347577038, "grad_norm": 4.131507396697998, "learning_rate": 2.4686199273670665e-05, "loss": 2.7879, "step": 3310 }, { "epoch": 0.5537483798135218, "grad_norm": 3.021564483642578, "learning_rate": 2.467160471192645e-05, "loss": 2.5148, "step": 3311 }, { "epoch": 0.5539156248693398, "grad_norm": 6.571287155151367, "learning_rate": 2.4657010262118457e-05, "loss": 2.7361, "step": 3312 }, { "epoch": 0.5540828699251579, "grad_norm": 4.514445781707764, "learning_rate": 2.464241592922133e-05, "loss": 2.7737, "step": 3313 }, { "epoch": 0.5542501149809759, "grad_norm": 3.940917491912842, "learning_rate": 2.4627821718209674e-05, "loss": 2.7705, "step": 3314 }, { "epoch": 0.5544173600367939, "grad_norm": 6.936313152313232, "learning_rate": 2.461322763405803e-05, "loss": 2.9758, "step": 3315 }, { "epoch": 0.554584605092612, "grad_norm": 4.644308567047119, "learning_rate": 2.4598633681740924e-05, "loss": 2.3951, "step": 3316 }, { "epoch": 0.55475185014843, "grad_norm": 3.449061632156372, "learning_rate": 2.4584039866232816e-05, "loss": 2.5643, "step": 3317 }, { "epoch": 0.554919095204248, "grad_norm": 5.263500690460205, "learning_rate": 2.4569446192508127e-05, "loss": 3.0316, "step": 3318 }, { "epoch": 0.5550863402600661, "grad_norm": 5.303912162780762, "learning_rate": 2.455485266554124e-05, "loss": 2.4603, "step": 3319 }, { "epoch": 0.5552535853158841, "grad_norm": 3.9077494144439697, "learning_rate": 2.4540259290306476e-05, "loss": 2.4092, "step": 3320 }, { "epoch": 0.5554208303717021, "grad_norm": 3.35345196723938, "learning_rate": 2.4525666071778102e-05, "loss": 2.3884, "step": 3321 }, { "epoch": 0.5555880754275202, "grad_norm": 6.211004734039307, "learning_rate": 2.4511073014930337e-05, "loss": 2.7794, "step": 3322 }, { "epoch": 0.5557553204833382, "grad_norm": 3.949704885482788, "learning_rate": 2.4496480124737345e-05, "loss": 2.592, "step": 3323 }, { "epoch": 0.5559225655391562, "grad_norm": 3.1405582427978516, "learning_rate": 2.4481887406173235e-05, "loss": 2.9534, "step": 3324 }, { "epoch": 0.5560898105949743, "grad_norm": 3.6915090084075928, "learning_rate": 2.4467294864212048e-05, "loss": 2.5332, "step": 3325 }, { "epoch": 0.5562570556507923, "grad_norm": 5.1028666496276855, "learning_rate": 2.4452702503827766e-05, "loss": 2.6302, "step": 3326 }, { "epoch": 0.5564243007066103, "grad_norm": 5.100492000579834, "learning_rate": 2.443811032999434e-05, "loss": 2.8115, "step": 3327 }, { "epoch": 0.5565915457624284, "grad_norm": 3.509079933166504, "learning_rate": 2.4423518347685612e-05, "loss": 3.0062, "step": 3328 }, { "epoch": 0.5567587908182464, "grad_norm": 4.668586730957031, "learning_rate": 2.4408926561875386e-05, "loss": 2.6632, "step": 3329 }, { "epoch": 0.5569260358740644, "grad_norm": 4.422085762023926, "learning_rate": 2.4394334977537388e-05, "loss": 2.6481, "step": 3330 }, { "epoch": 0.5570932809298825, "grad_norm": 3.0227692127227783, "learning_rate": 2.4379743599645283e-05, "loss": 2.7248, "step": 3331 }, { "epoch": 0.5572605259857005, "grad_norm": 4.0112738609313965, "learning_rate": 2.4365152433172656e-05, "loss": 2.4585, "step": 3332 }, { "epoch": 0.5574277710415186, "grad_norm": 5.636368751525879, "learning_rate": 2.435056148309303e-05, "loss": 2.4406, "step": 3333 }, { "epoch": 0.5575950160973366, "grad_norm": 4.736163139343262, "learning_rate": 2.433597075437985e-05, "loss": 2.9618, "step": 3334 }, { "epoch": 0.5577622611531546, "grad_norm": 5.247453212738037, "learning_rate": 2.432138025200648e-05, "loss": 2.3713, "step": 3335 }, { "epoch": 0.5579295062089727, "grad_norm": 3.007375478744507, "learning_rate": 2.4306789980946223e-05, "loss": 2.6681, "step": 3336 }, { "epoch": 0.5580967512647907, "grad_norm": 4.180466175079346, "learning_rate": 2.4292199946172287e-05, "loss": 3.0066, "step": 3337 }, { "epoch": 0.5582639963206087, "grad_norm": 3.3689937591552734, "learning_rate": 2.4277610152657804e-05, "loss": 2.6729, "step": 3338 }, { "epoch": 0.5584312413764269, "grad_norm": 8.719054222106934, "learning_rate": 2.4263020605375826e-05, "loss": 3.5495, "step": 3339 }, { "epoch": 0.5585984864322449, "grad_norm": 3.9284451007843018, "learning_rate": 2.4248431309299317e-05, "loss": 2.534, "step": 3340 }, { "epoch": 0.5587657314880629, "grad_norm": 4.282293796539307, "learning_rate": 2.4233842269401163e-05, "loss": 2.5561, "step": 3341 }, { "epoch": 0.558932976543881, "grad_norm": 3.963540554046631, "learning_rate": 2.4219253490654147e-05, "loss": 3.0908, "step": 3342 }, { "epoch": 0.559100221599699, "grad_norm": 7.81181526184082, "learning_rate": 2.4204664978030987e-05, "loss": 2.6526, "step": 3343 }, { "epoch": 0.559267466655517, "grad_norm": 8.07853889465332, "learning_rate": 2.4190076736504297e-05, "loss": 2.8882, "step": 3344 }, { "epoch": 0.5594347117113351, "grad_norm": 5.51242733001709, "learning_rate": 2.4175488771046594e-05, "loss": 3.0633, "step": 3345 }, { "epoch": 0.5596019567671531, "grad_norm": 2.812042713165283, "learning_rate": 2.4160901086630305e-05, "loss": 2.745, "step": 3346 }, { "epoch": 0.5597692018229711, "grad_norm": 11.528167724609375, "learning_rate": 2.4146313688227763e-05, "loss": 3.8871, "step": 3347 }, { "epoch": 0.5599364468787892, "grad_norm": 4.036991119384766, "learning_rate": 2.413172658081121e-05, "loss": 2.7844, "step": 3348 }, { "epoch": 0.5601036919346072, "grad_norm": 4.839345932006836, "learning_rate": 2.411713976935277e-05, "loss": 3.1414, "step": 3349 }, { "epoch": 0.5602709369904252, "grad_norm": 4.0247578620910645, "learning_rate": 2.4102553258824476e-05, "loss": 2.7001, "step": 3350 }, { "epoch": 0.5604381820462433, "grad_norm": 3.3140878677368164, "learning_rate": 2.4087967054198254e-05, "loss": 2.6206, "step": 3351 }, { "epoch": 0.5606054271020613, "grad_norm": 5.104462146759033, "learning_rate": 2.4073381160445957e-05, "loss": 2.819, "step": 3352 }, { "epoch": 0.5607726721578793, "grad_norm": 4.2328033447265625, "learning_rate": 2.4058795582539287e-05, "loss": 3.0777, "step": 3353 }, { "epoch": 0.5609399172136974, "grad_norm": 6.773633003234863, "learning_rate": 2.404421032544987e-05, "loss": 3.4297, "step": 3354 }, { "epoch": 0.5611071622695154, "grad_norm": 5.537255764007568, "learning_rate": 2.4029625394149194e-05, "loss": 2.6353, "step": 3355 }, { "epoch": 0.5612744073253334, "grad_norm": 3.5164287090301514, "learning_rate": 2.4015040793608667e-05, "loss": 2.7755, "step": 3356 }, { "epoch": 0.5614416523811515, "grad_norm": 3.0604188442230225, "learning_rate": 2.400045652879956e-05, "loss": 2.5979, "step": 3357 }, { "epoch": 0.5616088974369695, "grad_norm": 3.3262288570404053, "learning_rate": 2.398587260469305e-05, "loss": 2.4233, "step": 3358 }, { "epoch": 0.5617761424927875, "grad_norm": 3.1158368587493896, "learning_rate": 2.397128902626017e-05, "loss": 2.5632, "step": 3359 }, { "epoch": 0.5619433875486056, "grad_norm": 4.553134918212891, "learning_rate": 2.395670579847187e-05, "loss": 3.1128, "step": 3360 }, { "epoch": 0.5621106326044236, "grad_norm": 6.797251224517822, "learning_rate": 2.3942122926298957e-05, "loss": 2.6367, "step": 3361 }, { "epoch": 0.5622778776602416, "grad_norm": 6.118861675262451, "learning_rate": 2.392754041471212e-05, "loss": 2.3695, "step": 3362 }, { "epoch": 0.5624451227160597, "grad_norm": 3.8031837940216064, "learning_rate": 2.391295826868193e-05, "loss": 2.9095, "step": 3363 }, { "epoch": 0.5626123677718777, "grad_norm": 2.9005286693573, "learning_rate": 2.3898376493178827e-05, "loss": 2.265, "step": 3364 }, { "epoch": 0.5627796128276957, "grad_norm": 3.9444713592529297, "learning_rate": 2.3883795093173136e-05, "loss": 2.6142, "step": 3365 }, { "epoch": 0.5629468578835138, "grad_norm": 5.891763210296631, "learning_rate": 2.3869214073635044e-05, "loss": 2.8658, "step": 3366 }, { "epoch": 0.5631141029393318, "grad_norm": 5.741815567016602, "learning_rate": 2.3854633439534592e-05, "loss": 3.1051, "step": 3367 }, { "epoch": 0.5632813479951498, "grad_norm": 3.3879518508911133, "learning_rate": 2.3840053195841743e-05, "loss": 2.7775, "step": 3368 }, { "epoch": 0.563448593050968, "grad_norm": 4.755329608917236, "learning_rate": 2.3825473347526275e-05, "loss": 3.3742, "step": 3369 }, { "epoch": 0.563615838106786, "grad_norm": 5.768465518951416, "learning_rate": 2.381089389955785e-05, "loss": 2.9126, "step": 3370 }, { "epoch": 0.5637830831626041, "grad_norm": 5.053259372711182, "learning_rate": 2.3796314856905993e-05, "loss": 3.0025, "step": 3371 }, { "epoch": 0.5639503282184221, "grad_norm": 5.350028038024902, "learning_rate": 2.378173622454009e-05, "loss": 3.2347, "step": 3372 }, { "epoch": 0.5641175732742401, "grad_norm": 3.216212272644043, "learning_rate": 2.3767158007429387e-05, "loss": 2.4738, "step": 3373 }, { "epoch": 0.5642848183300582, "grad_norm": 9.717513084411621, "learning_rate": 2.375258021054299e-05, "loss": 3.4962, "step": 3374 }, { "epoch": 0.5644520633858762, "grad_norm": 3.887312412261963, "learning_rate": 2.373800283884985e-05, "loss": 2.7001, "step": 3375 }, { "epoch": 0.5646193084416942, "grad_norm": 3.9655165672302246, "learning_rate": 2.3723425897318805e-05, "loss": 2.692, "step": 3376 }, { "epoch": 0.5647865534975123, "grad_norm": 4.441561698913574, "learning_rate": 2.3708849390918512e-05, "loss": 2.172, "step": 3377 }, { "epoch": 0.5649537985533303, "grad_norm": 8.524123191833496, "learning_rate": 2.369427332461749e-05, "loss": 3.0488, "step": 3378 }, { "epoch": 0.5651210436091483, "grad_norm": 4.635868549346924, "learning_rate": 2.367969770338412e-05, "loss": 2.4435, "step": 3379 }, { "epoch": 0.5652882886649664, "grad_norm": 4.45917272567749, "learning_rate": 2.366512253218661e-05, "loss": 2.9055, "step": 3380 }, { "epoch": 0.5654555337207844, "grad_norm": 2.7160110473632812, "learning_rate": 2.3650547815993044e-05, "loss": 2.5742, "step": 3381 }, { "epoch": 0.5656227787766024, "grad_norm": 7.6580376625061035, "learning_rate": 2.3635973559771317e-05, "loss": 3.0551, "step": 3382 }, { "epoch": 0.5657900238324205, "grad_norm": 8.91563606262207, "learning_rate": 2.3621399768489193e-05, "loss": 2.8855, "step": 3383 }, { "epoch": 0.5659572688882385, "grad_norm": 3.9310145378112793, "learning_rate": 2.360682644711425e-05, "loss": 3.0618, "step": 3384 }, { "epoch": 0.5661245139440565, "grad_norm": 5.291264533996582, "learning_rate": 2.3592253600613952e-05, "loss": 2.5894, "step": 3385 }, { "epoch": 0.5662917589998746, "grad_norm": 4.149991512298584, "learning_rate": 2.3577681233955563e-05, "loss": 2.8469, "step": 3386 }, { "epoch": 0.5664590040556926, "grad_norm": 5.8073883056640625, "learning_rate": 2.3563109352106188e-05, "loss": 3.1173, "step": 3387 }, { "epoch": 0.5666262491115106, "grad_norm": 6.197672367095947, "learning_rate": 2.3548537960032778e-05, "loss": 2.7446, "step": 3388 }, { "epoch": 0.5667934941673287, "grad_norm": 4.541050910949707, "learning_rate": 2.353396706270211e-05, "loss": 2.7825, "step": 3389 }, { "epoch": 0.5669607392231467, "grad_norm": 8.88914966583252, "learning_rate": 2.3519396665080793e-05, "loss": 2.8232, "step": 3390 }, { "epoch": 0.5671279842789647, "grad_norm": 5.561491012573242, "learning_rate": 2.3504826772135265e-05, "loss": 3.0374, "step": 3391 }, { "epoch": 0.5672952293347828, "grad_norm": 3.0744423866271973, "learning_rate": 2.3490257388831793e-05, "loss": 2.6875, "step": 3392 }, { "epoch": 0.5674624743906008, "grad_norm": 10.879633903503418, "learning_rate": 2.347568852013648e-05, "loss": 3.6624, "step": 3393 }, { "epoch": 0.5676297194464188, "grad_norm": 7.839295864105225, "learning_rate": 2.346112017101523e-05, "loss": 2.509, "step": 3394 }, { "epoch": 0.5677969645022369, "grad_norm": 3.850477933883667, "learning_rate": 2.3446552346433798e-05, "loss": 2.421, "step": 3395 }, { "epoch": 0.5679642095580549, "grad_norm": 6.501573085784912, "learning_rate": 2.343198505135774e-05, "loss": 2.9809, "step": 3396 }, { "epoch": 0.5681314546138729, "grad_norm": 3.4226202964782715, "learning_rate": 2.3417418290752434e-05, "loss": 3.1407, "step": 3397 }, { "epoch": 0.568298699669691, "grad_norm": 9.084012985229492, "learning_rate": 2.3402852069583087e-05, "loss": 2.5361, "step": 3398 }, { "epoch": 0.568465944725509, "grad_norm": 8.773322105407715, "learning_rate": 2.3388286392814714e-05, "loss": 3.26, "step": 3399 }, { "epoch": 0.568633189781327, "grad_norm": 4.082138538360596, "learning_rate": 2.337372126541213e-05, "loss": 2.6168, "step": 3400 }, { "epoch": 0.5688004348371452, "grad_norm": 2.7654712200164795, "learning_rate": 2.3359156692340002e-05, "loss": 2.6041, "step": 3401 }, { "epoch": 0.5689676798929632, "grad_norm": 8.76997184753418, "learning_rate": 2.3344592678562775e-05, "loss": 2.9597, "step": 3402 }, { "epoch": 0.5691349249487812, "grad_norm": 3.9922995567321777, "learning_rate": 2.333002922904471e-05, "loss": 3.0676, "step": 3403 }, { "epoch": 0.5693021700045993, "grad_norm": 3.40864634513855, "learning_rate": 2.3315466348749883e-05, "loss": 2.6674, "step": 3404 }, { "epoch": 0.5694694150604173, "grad_norm": 7.8561692237854, "learning_rate": 2.3300904042642163e-05, "loss": 3.2497, "step": 3405 }, { "epoch": 0.5696366601162353, "grad_norm": 6.333088397979736, "learning_rate": 2.328634231568524e-05, "loss": 2.8465, "step": 3406 }, { "epoch": 0.5698039051720534, "grad_norm": 4.598640441894531, "learning_rate": 2.3271781172842593e-05, "loss": 2.7654, "step": 3407 }, { "epoch": 0.5699711502278714, "grad_norm": 9.302179336547852, "learning_rate": 2.3257220619077505e-05, "loss": 2.946, "step": 3408 }, { "epoch": 0.5701383952836895, "grad_norm": 5.849001407623291, "learning_rate": 2.3242660659353066e-05, "loss": 2.8213, "step": 3409 }, { "epoch": 0.5703056403395075, "grad_norm": 5.567190647125244, "learning_rate": 2.3228101298632157e-05, "loss": 3.1773, "step": 3410 }, { "epoch": 0.5704728853953255, "grad_norm": 4.277426242828369, "learning_rate": 2.3213542541877455e-05, "loss": 2.671, "step": 3411 }, { "epoch": 0.5706401304511436, "grad_norm": 4.867709636688232, "learning_rate": 2.3198984394051425e-05, "loss": 2.4782, "step": 3412 }, { "epoch": 0.5708073755069616, "grad_norm": 7.40592622756958, "learning_rate": 2.318442686011634e-05, "loss": 2.598, "step": 3413 }, { "epoch": 0.5709746205627796, "grad_norm": 3.416700839996338, "learning_rate": 2.316986994503425e-05, "loss": 2.2513, "step": 3414 }, { "epoch": 0.5711418656185977, "grad_norm": 3.2043018341064453, "learning_rate": 2.3155313653767002e-05, "loss": 2.9005, "step": 3415 }, { "epoch": 0.5713091106744157, "grad_norm": 3.103545904159546, "learning_rate": 2.3140757991276224e-05, "loss": 2.7446, "step": 3416 }, { "epoch": 0.5714763557302337, "grad_norm": 5.603225231170654, "learning_rate": 2.312620296252333e-05, "loss": 2.6385, "step": 3417 }, { "epoch": 0.5716436007860518, "grad_norm": 5.882975101470947, "learning_rate": 2.3111648572469534e-05, "loss": 2.4413, "step": 3418 }, { "epoch": 0.5718108458418698, "grad_norm": 6.760990619659424, "learning_rate": 2.3097094826075803e-05, "loss": 2.8811, "step": 3419 }, { "epoch": 0.5719780908976878, "grad_norm": 7.99689245223999, "learning_rate": 2.3082541728302913e-05, "loss": 3.5901, "step": 3420 }, { "epoch": 0.5721453359535059, "grad_norm": 6.615955829620361, "learning_rate": 2.30679892841114e-05, "loss": 3.118, "step": 3421 }, { "epoch": 0.5723125810093239, "grad_norm": 4.304279804229736, "learning_rate": 2.305343749846159e-05, "loss": 2.6838, "step": 3422 }, { "epoch": 0.5724798260651419, "grad_norm": 4.377318859100342, "learning_rate": 2.3038886376313573e-05, "loss": 2.7684, "step": 3423 }, { "epoch": 0.57264707112096, "grad_norm": 3.8275070190429688, "learning_rate": 2.3024335922627217e-05, "loss": 2.6729, "step": 3424 }, { "epoch": 0.572814316176778, "grad_norm": 9.698385238647461, "learning_rate": 2.300978614236216e-05, "loss": 3.0721, "step": 3425 }, { "epoch": 0.572981561232596, "grad_norm": 3.368293523788452, "learning_rate": 2.299523704047783e-05, "loss": 2.831, "step": 3426 }, { "epoch": 0.5731488062884141, "grad_norm": 6.184434413909912, "learning_rate": 2.2980688621933395e-05, "loss": 2.8647, "step": 3427 }, { "epoch": 0.5733160513442321, "grad_norm": 6.255228042602539, "learning_rate": 2.296614089168781e-05, "loss": 3.0027, "step": 3428 }, { "epoch": 0.5734832964000501, "grad_norm": 5.366077423095703, "learning_rate": 2.295159385469978e-05, "loss": 2.9433, "step": 3429 }, { "epoch": 0.5736505414558682, "grad_norm": 4.323173522949219, "learning_rate": 2.2937047515927788e-05, "loss": 2.7259, "step": 3430 }, { "epoch": 0.5738177865116862, "grad_norm": 3.782792806625366, "learning_rate": 2.2922501880330072e-05, "loss": 3.0685, "step": 3431 }, { "epoch": 0.5739850315675042, "grad_norm": 5.674475193023682, "learning_rate": 2.290795695286463e-05, "loss": 2.984, "step": 3432 }, { "epoch": 0.5741522766233224, "grad_norm": 3.6893842220306396, "learning_rate": 2.2893412738489212e-05, "loss": 2.7613, "step": 3433 }, { "epoch": 0.5743195216791404, "grad_norm": 5.550006866455078, "learning_rate": 2.2878869242161343e-05, "loss": 2.8387, "step": 3434 }, { "epoch": 0.5744867667349584, "grad_norm": 3.753077745437622, "learning_rate": 2.2864326468838295e-05, "loss": 3.0835, "step": 3435 }, { "epoch": 0.5746540117907765, "grad_norm": 3.665271520614624, "learning_rate": 2.284978442347709e-05, "loss": 2.9763, "step": 3436 }, { "epoch": 0.5748212568465945, "grad_norm": 4.055235385894775, "learning_rate": 2.2835243111034495e-05, "loss": 2.6277, "step": 3437 }, { "epoch": 0.5749885019024125, "grad_norm": 4.0839314460754395, "learning_rate": 2.2820702536467045e-05, "loss": 2.6009, "step": 3438 }, { "epoch": 0.5751557469582306, "grad_norm": 2.7435944080352783, "learning_rate": 2.280616270473101e-05, "loss": 2.4981, "step": 3439 }, { "epoch": 0.5753229920140486, "grad_norm": 3.1216928958892822, "learning_rate": 2.279162362078241e-05, "loss": 2.6505, "step": 3440 }, { "epoch": 0.5754902370698666, "grad_norm": 3.4045908451080322, "learning_rate": 2.2777085289577e-05, "loss": 2.3992, "step": 3441 }, { "epoch": 0.5756574821256847, "grad_norm": 9.9126558303833, "learning_rate": 2.2762547716070316e-05, "loss": 2.8806, "step": 3442 }, { "epoch": 0.5758247271815027, "grad_norm": 8.688063621520996, "learning_rate": 2.2748010905217592e-05, "loss": 2.9372, "step": 3443 }, { "epoch": 0.5759919722373207, "grad_norm": 3.5326902866363525, "learning_rate": 2.2733474861973823e-05, "loss": 2.8482, "step": 3444 }, { "epoch": 0.5761592172931388, "grad_norm": 7.280900001525879, "learning_rate": 2.2718939591293738e-05, "loss": 3.1255, "step": 3445 }, { "epoch": 0.5763264623489568, "grad_norm": 6.872743606567383, "learning_rate": 2.2704405098131802e-05, "loss": 2.7449, "step": 3446 }, { "epoch": 0.5764937074047748, "grad_norm": 5.30918550491333, "learning_rate": 2.2689871387442216e-05, "loss": 2.985, "step": 3447 }, { "epoch": 0.5766609524605929, "grad_norm": 6.474517345428467, "learning_rate": 2.267533846417892e-05, "loss": 3.1452, "step": 3448 }, { "epoch": 0.5768281975164109, "grad_norm": 2.6041529178619385, "learning_rate": 2.2660806333295564e-05, "loss": 2.9986, "step": 3449 }, { "epoch": 0.576995442572229, "grad_norm": 4.148865222930908, "learning_rate": 2.2646274999745563e-05, "loss": 2.6497, "step": 3450 }, { "epoch": 0.577162687628047, "grad_norm": 7.261341571807861, "learning_rate": 2.263174446848204e-05, "loss": 3.1409, "step": 3451 }, { "epoch": 0.577329932683865, "grad_norm": 7.070272445678711, "learning_rate": 2.2617214744457832e-05, "loss": 3.1395, "step": 3452 }, { "epoch": 0.5774971777396831, "grad_norm": 6.887009143829346, "learning_rate": 2.2602685832625525e-05, "loss": 2.7386, "step": 3453 }, { "epoch": 0.5776644227955011, "grad_norm": 4.571042060852051, "learning_rate": 2.2588157737937415e-05, "loss": 3.0187, "step": 3454 }, { "epoch": 0.5778316678513191, "grad_norm": 3.7878952026367188, "learning_rate": 2.2573630465345524e-05, "loss": 2.7087, "step": 3455 }, { "epoch": 0.5779989129071372, "grad_norm": 3.25519061088562, "learning_rate": 2.2559104019801587e-05, "loss": 2.4114, "step": 3456 }, { "epoch": 0.5781661579629552, "grad_norm": 3.001103162765503, "learning_rate": 2.254457840625707e-05, "loss": 2.6911, "step": 3457 }, { "epoch": 0.5783334030187732, "grad_norm": 6.21985387802124, "learning_rate": 2.253005362966313e-05, "loss": 2.9937, "step": 3458 }, { "epoch": 0.5785006480745913, "grad_norm": 6.906620025634766, "learning_rate": 2.2515529694970676e-05, "loss": 2.7873, "step": 3459 }, { "epoch": 0.5786678931304093, "grad_norm": 5.032602310180664, "learning_rate": 2.2501006607130306e-05, "loss": 2.6393, "step": 3460 }, { "epoch": 0.5788351381862273, "grad_norm": 4.376297950744629, "learning_rate": 2.2486484371092332e-05, "loss": 2.6411, "step": 3461 }, { "epoch": 0.5790023832420454, "grad_norm": 5.515293121337891, "learning_rate": 2.247196299180678e-05, "loss": 2.8598, "step": 3462 }, { "epoch": 0.5791696282978634, "grad_norm": 3.7100350856781006, "learning_rate": 2.245744247422337e-05, "loss": 2.6983, "step": 3463 }, { "epoch": 0.5793368733536814, "grad_norm": 3.6732892990112305, "learning_rate": 2.244292282329155e-05, "loss": 2.8406, "step": 3464 }, { "epoch": 0.5795041184094996, "grad_norm": 2.3247697353363037, "learning_rate": 2.242840404396046e-05, "loss": 2.6458, "step": 3465 }, { "epoch": 0.5796713634653176, "grad_norm": 5.780785083770752, "learning_rate": 2.241388614117894e-05, "loss": 3.0016, "step": 3466 }, { "epoch": 0.5798386085211356, "grad_norm": 6.515728950500488, "learning_rate": 2.239936911989554e-05, "loss": 3.1218, "step": 3467 }, { "epoch": 0.5800058535769537, "grad_norm": 2.20945405960083, "learning_rate": 2.2384852985058513e-05, "loss": 2.7039, "step": 3468 }, { "epoch": 0.5801730986327717, "grad_norm": 13.76756763458252, "learning_rate": 2.2370337741615793e-05, "loss": 3.3574, "step": 3469 }, { "epoch": 0.5803403436885897, "grad_norm": 5.805243015289307, "learning_rate": 2.2355823394515024e-05, "loss": 2.8624, "step": 3470 }, { "epoch": 0.5805075887444078, "grad_norm": 5.635258197784424, "learning_rate": 2.234130994870354e-05, "loss": 2.6495, "step": 3471 }, { "epoch": 0.5806748338002258, "grad_norm": 4.120258808135986, "learning_rate": 2.232679740912836e-05, "loss": 2.7296, "step": 3472 }, { "epoch": 0.5808420788560438, "grad_norm": 7.534624099731445, "learning_rate": 2.2312285780736213e-05, "loss": 3.041, "step": 3473 }, { "epoch": 0.5810093239118619, "grad_norm": 4.46177864074707, "learning_rate": 2.2297775068473496e-05, "loss": 2.6432, "step": 3474 }, { "epoch": 0.5811765689676799, "grad_norm": 8.559663772583008, "learning_rate": 2.2283265277286323e-05, "loss": 2.8553, "step": 3475 }, { "epoch": 0.5813438140234979, "grad_norm": 4.4365010261535645, "learning_rate": 2.2268756412120457e-05, "loss": 2.7389, "step": 3476 }, { "epoch": 0.581511059079316, "grad_norm": 4.3700480461120605, "learning_rate": 2.2254248477921372e-05, "loss": 2.4141, "step": 3477 }, { "epoch": 0.581678304135134, "grad_norm": 4.904073238372803, "learning_rate": 2.2239741479634214e-05, "loss": 2.831, "step": 3478 }, { "epoch": 0.581845549190952, "grad_norm": 4.268773078918457, "learning_rate": 2.222523542220381e-05, "loss": 2.7413, "step": 3479 }, { "epoch": 0.5820127942467701, "grad_norm": 5.399546146392822, "learning_rate": 2.2210730310574672e-05, "loss": 2.542, "step": 3480 }, { "epoch": 0.5821800393025881, "grad_norm": 10.926070213317871, "learning_rate": 2.219622614969099e-05, "loss": 3.8423, "step": 3481 }, { "epoch": 0.5823472843584061, "grad_norm": 4.461344242095947, "learning_rate": 2.218172294449661e-05, "loss": 2.4865, "step": 3482 }, { "epoch": 0.5825145294142242, "grad_norm": 3.955815076828003, "learning_rate": 2.2167220699935086e-05, "loss": 2.7064, "step": 3483 }, { "epoch": 0.5826817744700422, "grad_norm": 5.43054723739624, "learning_rate": 2.2152719420949626e-05, "loss": 2.6511, "step": 3484 }, { "epoch": 0.5828490195258602, "grad_norm": 6.301039695739746, "learning_rate": 2.2138219112483102e-05, "loss": 2.4648, "step": 3485 }, { "epoch": 0.5830162645816783, "grad_norm": 5.4989142417907715, "learning_rate": 2.212371977947807e-05, "loss": 2.7675, "step": 3486 }, { "epoch": 0.5831835096374963, "grad_norm": 3.391913890838623, "learning_rate": 2.210922142687674e-05, "loss": 2.6334, "step": 3487 }, { "epoch": 0.5833507546933144, "grad_norm": 6.689277172088623, "learning_rate": 2.2094724059621e-05, "loss": 2.1792, "step": 3488 }, { "epoch": 0.5835179997491324, "grad_norm": 4.671609401702881, "learning_rate": 2.2080227682652387e-05, "loss": 2.3444, "step": 3489 }, { "epoch": 0.5836852448049504, "grad_norm": 8.480521202087402, "learning_rate": 2.2065732300912123e-05, "loss": 2.6043, "step": 3490 }, { "epoch": 0.5838524898607685, "grad_norm": 4.402759552001953, "learning_rate": 2.205123791934106e-05, "loss": 2.9548, "step": 3491 }, { "epoch": 0.5840197349165865, "grad_norm": 4.696617603302002, "learning_rate": 2.203674454287975e-05, "loss": 2.6338, "step": 3492 }, { "epoch": 0.5841869799724045, "grad_norm": 4.292108058929443, "learning_rate": 2.2022252176468367e-05, "loss": 2.7363, "step": 3493 }, { "epoch": 0.5843542250282227, "grad_norm": 4.377951622009277, "learning_rate": 2.200776082504675e-05, "loss": 2.7309, "step": 3494 }, { "epoch": 0.5845214700840407, "grad_norm": 4.947636127471924, "learning_rate": 2.1993270493554402e-05, "loss": 2.6656, "step": 3495 }, { "epoch": 0.5846887151398587, "grad_norm": 6.96194314956665, "learning_rate": 2.1978781186930466e-05, "loss": 2.6836, "step": 3496 }, { "epoch": 0.5848559601956768, "grad_norm": 8.19409465789795, "learning_rate": 2.1964292910113744e-05, "loss": 2.8103, "step": 3497 }, { "epoch": 0.5850232052514948, "grad_norm": 8.446910858154297, "learning_rate": 2.194980566804268e-05, "loss": 3.0518, "step": 3498 }, { "epoch": 0.5851904503073128, "grad_norm": 4.190689563751221, "learning_rate": 2.1935319465655362e-05, "loss": 2.3612, "step": 3499 }, { "epoch": 0.5853576953631309, "grad_norm": 5.0151214599609375, "learning_rate": 2.1920834307889553e-05, "loss": 2.5662, "step": 3500 }, { "epoch": 0.5855249404189489, "grad_norm": 12.196157455444336, "learning_rate": 2.190635019968263e-05, "loss": 2.3916, "step": 3501 }, { "epoch": 0.5856921854747669, "grad_norm": 3.963925361633301, "learning_rate": 2.189186714597161e-05, "loss": 2.6432, "step": 3502 }, { "epoch": 0.585859430530585, "grad_norm": 13.108072280883789, "learning_rate": 2.187738515169317e-05, "loss": 2.6223, "step": 3503 }, { "epoch": 0.586026675586403, "grad_norm": 3.5332047939300537, "learning_rate": 2.1862904221783608e-05, "loss": 2.4681, "step": 3504 }, { "epoch": 0.586193920642221, "grad_norm": 7.331363677978516, "learning_rate": 2.184842436117888e-05, "loss": 2.8011, "step": 3505 }, { "epoch": 0.5863611656980391, "grad_norm": 4.485180854797363, "learning_rate": 2.1833945574814553e-05, "loss": 2.3526, "step": 3506 }, { "epoch": 0.5865284107538571, "grad_norm": 6.103482246398926, "learning_rate": 2.1819467867625843e-05, "loss": 2.9544, "step": 3507 }, { "epoch": 0.5866956558096751, "grad_norm": 4.14151668548584, "learning_rate": 2.18049912445476e-05, "loss": 2.7677, "step": 3508 }, { "epoch": 0.5868629008654932, "grad_norm": 3.681506633758545, "learning_rate": 2.1790515710514298e-05, "loss": 2.6597, "step": 3509 }, { "epoch": 0.5870301459213112, "grad_norm": 4.084831237792969, "learning_rate": 2.177604127046004e-05, "loss": 3.1006, "step": 3510 }, { "epoch": 0.5871973909771292, "grad_norm": 6.794727802276611, "learning_rate": 2.176156792931856e-05, "loss": 2.8003, "step": 3511 }, { "epoch": 0.5873646360329473, "grad_norm": 4.7179412841796875, "learning_rate": 2.1747095692023214e-05, "loss": 2.6186, "step": 3512 }, { "epoch": 0.5875318810887653, "grad_norm": 5.926383972167969, "learning_rate": 2.1732624563506983e-05, "loss": 3.1004, "step": 3513 }, { "epoch": 0.5876991261445833, "grad_norm": 6.002767086029053, "learning_rate": 2.1718154548702464e-05, "loss": 3.2194, "step": 3514 }, { "epoch": 0.5878663712004014, "grad_norm": 5.809793949127197, "learning_rate": 2.1703685652541876e-05, "loss": 2.4958, "step": 3515 }, { "epoch": 0.5880336162562194, "grad_norm": 4.055419445037842, "learning_rate": 2.1689217879957083e-05, "loss": 2.625, "step": 3516 }, { "epoch": 0.5882008613120374, "grad_norm": 4.042679786682129, "learning_rate": 2.167475123587953e-05, "loss": 2.6231, "step": 3517 }, { "epoch": 0.5883681063678555, "grad_norm": 4.964648723602295, "learning_rate": 2.166028572524029e-05, "loss": 2.9715, "step": 3518 }, { "epoch": 0.5885353514236735, "grad_norm": 3.2490363121032715, "learning_rate": 2.1645821352970057e-05, "loss": 2.5638, "step": 3519 }, { "epoch": 0.5887025964794915, "grad_norm": 4.186033248901367, "learning_rate": 2.1631358123999128e-05, "loss": 2.8688, "step": 3520 }, { "epoch": 0.5888698415353096, "grad_norm": 5.819522857666016, "learning_rate": 2.1616896043257412e-05, "loss": 2.9484, "step": 3521 }, { "epoch": 0.5890370865911276, "grad_norm": 4.054407596588135, "learning_rate": 2.1602435115674425e-05, "loss": 3.1795, "step": 3522 }, { "epoch": 0.5892043316469456, "grad_norm": 6.2132954597473145, "learning_rate": 2.158797534617929e-05, "loss": 2.9615, "step": 3523 }, { "epoch": 0.5893715767027637, "grad_norm": 6.968730926513672, "learning_rate": 2.1573516739700754e-05, "loss": 3.2238, "step": 3524 }, { "epoch": 0.5895388217585817, "grad_norm": 7.900485515594482, "learning_rate": 2.1559059301167138e-05, "loss": 2.7186, "step": 3525 }, { "epoch": 0.5897060668143999, "grad_norm": 3.988508701324463, "learning_rate": 2.1544603035506386e-05, "loss": 3.0304, "step": 3526 }, { "epoch": 0.5898733118702179, "grad_norm": 7.534481525421143, "learning_rate": 2.153014794764603e-05, "loss": 2.865, "step": 3527 }, { "epoch": 0.5900405569260359, "grad_norm": 4.995853424072266, "learning_rate": 2.1515694042513205e-05, "loss": 2.6326, "step": 3528 }, { "epoch": 0.590207801981854, "grad_norm": 2.955970525741577, "learning_rate": 2.1501241325034642e-05, "loss": 2.4353, "step": 3529 }, { "epoch": 0.590375047037672, "grad_norm": 3.508430242538452, "learning_rate": 2.1486789800136676e-05, "loss": 2.4572, "step": 3530 }, { "epoch": 0.59054229209349, "grad_norm": 3.4169745445251465, "learning_rate": 2.1472339472745216e-05, "loss": 2.8905, "step": 3531 }, { "epoch": 0.5907095371493081, "grad_norm": 4.044239044189453, "learning_rate": 2.1457890347785774e-05, "loss": 2.7047, "step": 3532 }, { "epoch": 0.5908767822051261, "grad_norm": 3.6238224506378174, "learning_rate": 2.144344243018347e-05, "loss": 2.5664, "step": 3533 }, { "epoch": 0.5910440272609441, "grad_norm": 5.757742404937744, "learning_rate": 2.1428995724862987e-05, "loss": 3.0528, "step": 3534 }, { "epoch": 0.5912112723167622, "grad_norm": 3.917135715484619, "learning_rate": 2.1414550236748605e-05, "loss": 2.6845, "step": 3535 }, { "epoch": 0.5913785173725802, "grad_norm": 3.230069398880005, "learning_rate": 2.140010597076418e-05, "loss": 2.9017, "step": 3536 }, { "epoch": 0.5915457624283982, "grad_norm": 8.697614669799805, "learning_rate": 2.1385662931833156e-05, "loss": 2.4011, "step": 3537 }, { "epoch": 0.5917130074842163, "grad_norm": 3.936352252960205, "learning_rate": 2.1371221124878573e-05, "loss": 2.6409, "step": 3538 }, { "epoch": 0.5918802525400343, "grad_norm": 4.865544319152832, "learning_rate": 2.135678055482303e-05, "loss": 2.9396, "step": 3539 }, { "epoch": 0.5920474975958523, "grad_norm": 4.9968976974487305, "learning_rate": 2.1342341226588706e-05, "loss": 2.6302, "step": 3540 }, { "epoch": 0.5922147426516704, "grad_norm": 7.376828193664551, "learning_rate": 2.1327903145097385e-05, "loss": 2.9003, "step": 3541 }, { "epoch": 0.5923819877074884, "grad_norm": 5.308356285095215, "learning_rate": 2.13134663152704e-05, "loss": 2.9102, "step": 3542 }, { "epoch": 0.5925492327633064, "grad_norm": 4.384665012359619, "learning_rate": 2.1299030742028654e-05, "loss": 2.5431, "step": 3543 }, { "epoch": 0.5927164778191245, "grad_norm": 6.194285869598389, "learning_rate": 2.128459643029264e-05, "loss": 2.5289, "step": 3544 }, { "epoch": 0.5928837228749425, "grad_norm": 5.872435092926025, "learning_rate": 2.1270163384982404e-05, "loss": 3.0021, "step": 3545 }, { "epoch": 0.5930509679307605, "grad_norm": 5.7762956619262695, "learning_rate": 2.1255731611017573e-05, "loss": 3.074, "step": 3546 }, { "epoch": 0.5932182129865786, "grad_norm": 5.460101127624512, "learning_rate": 2.1241301113317334e-05, "loss": 2.7856, "step": 3547 }, { "epoch": 0.5933854580423966, "grad_norm": 3.2480499744415283, "learning_rate": 2.122687189680043e-05, "loss": 2.8791, "step": 3548 }, { "epoch": 0.5935527030982146, "grad_norm": 4.776412010192871, "learning_rate": 2.1212443966385197e-05, "loss": 2.866, "step": 3549 }, { "epoch": 0.5937199481540327, "grad_norm": 3.038984537124634, "learning_rate": 2.1198017326989503e-05, "loss": 2.4895, "step": 3550 }, { "epoch": 0.5938871932098507, "grad_norm": 7.78411340713501, "learning_rate": 2.1183591983530786e-05, "loss": 2.9563, "step": 3551 }, { "epoch": 0.5940544382656687, "grad_norm": 6.697144508361816, "learning_rate": 2.1169167940926045e-05, "loss": 3.1837, "step": 3552 }, { "epoch": 0.5942216833214868, "grad_norm": 4.496632099151611, "learning_rate": 2.1154745204091832e-05, "loss": 2.9258, "step": 3553 }, { "epoch": 0.5943889283773048, "grad_norm": 5.322939872741699, "learning_rate": 2.1140323777944255e-05, "loss": 2.8818, "step": 3554 }, { "epoch": 0.5945561734331228, "grad_norm": 4.704375743865967, "learning_rate": 2.112590366739898e-05, "loss": 2.5969, "step": 3555 }, { "epoch": 0.594723418488941, "grad_norm": 3.3341522216796875, "learning_rate": 2.1111484877371203e-05, "loss": 3.074, "step": 3556 }, { "epoch": 0.594890663544759, "grad_norm": 3.2236225605010986, "learning_rate": 2.1097067412775708e-05, "loss": 2.7053, "step": 3557 }, { "epoch": 0.595057908600577, "grad_norm": 6.098731994628906, "learning_rate": 2.1082651278526798e-05, "loss": 2.606, "step": 3558 }, { "epoch": 0.5952251536563951, "grad_norm": 8.997207641601562, "learning_rate": 2.1068236479538333e-05, "loss": 2.9087, "step": 3559 }, { "epoch": 0.5953923987122131, "grad_norm": 4.560131072998047, "learning_rate": 2.105382302072371e-05, "loss": 2.8666, "step": 3560 }, { "epoch": 0.595559643768031, "grad_norm": 5.279417037963867, "learning_rate": 2.1039410906995878e-05, "loss": 2.8261, "step": 3561 }, { "epoch": 0.5957268888238492, "grad_norm": 3.8421249389648438, "learning_rate": 2.1025000143267325e-05, "loss": 2.7726, "step": 3562 }, { "epoch": 0.5958941338796672, "grad_norm": 5.944949150085449, "learning_rate": 2.1010590734450076e-05, "loss": 2.8311, "step": 3563 }, { "epoch": 0.5960613789354852, "grad_norm": 7.260647773742676, "learning_rate": 2.0996182685455695e-05, "loss": 3.7491, "step": 3564 }, { "epoch": 0.5962286239913033, "grad_norm": 5.914389610290527, "learning_rate": 2.0981776001195283e-05, "loss": 3.2152, "step": 3565 }, { "epoch": 0.5963958690471213, "grad_norm": 3.23297381401062, "learning_rate": 2.0967370686579482e-05, "loss": 2.7472, "step": 3566 }, { "epoch": 0.5965631141029394, "grad_norm": 3.1378061771392822, "learning_rate": 2.095296674651846e-05, "loss": 2.468, "step": 3567 }, { "epoch": 0.5967303591587574, "grad_norm": 4.357882022857666, "learning_rate": 2.0938564185921917e-05, "loss": 2.837, "step": 3568 }, { "epoch": 0.5968976042145754, "grad_norm": 8.868453025817871, "learning_rate": 2.0924163009699084e-05, "loss": 3.3323, "step": 3569 }, { "epoch": 0.5970648492703935, "grad_norm": 6.62494421005249, "learning_rate": 2.0909763222758714e-05, "loss": 2.6718, "step": 3570 }, { "epoch": 0.5972320943262115, "grad_norm": 4.898979663848877, "learning_rate": 2.0895364830009108e-05, "loss": 2.7064, "step": 3571 }, { "epoch": 0.5973993393820295, "grad_norm": 3.688948154449463, "learning_rate": 2.088096783635806e-05, "loss": 2.8708, "step": 3572 }, { "epoch": 0.5975665844378476, "grad_norm": 3.778496742248535, "learning_rate": 2.0866572246712902e-05, "loss": 2.5247, "step": 3573 }, { "epoch": 0.5977338294936656, "grad_norm": 2.3775010108947754, "learning_rate": 2.0852178065980505e-05, "loss": 2.299, "step": 3574 }, { "epoch": 0.5979010745494836, "grad_norm": 3.2327284812927246, "learning_rate": 2.0837785299067233e-05, "loss": 2.8027, "step": 3575 }, { "epoch": 0.5980683196053017, "grad_norm": 3.570462703704834, "learning_rate": 2.0823393950878985e-05, "loss": 2.5513, "step": 3576 }, { "epoch": 0.5982355646611197, "grad_norm": 7.741060256958008, "learning_rate": 2.0809004026321167e-05, "loss": 2.6955, "step": 3577 }, { "epoch": 0.5984028097169377, "grad_norm": 3.441566228866577, "learning_rate": 2.07946155302987e-05, "loss": 2.744, "step": 3578 }, { "epoch": 0.5985700547727558, "grad_norm": 8.602303504943848, "learning_rate": 2.0780228467716016e-05, "loss": 2.9178, "step": 3579 }, { "epoch": 0.5987372998285738, "grad_norm": 2.7660136222839355, "learning_rate": 2.0765842843477078e-05, "loss": 2.7539, "step": 3580 }, { "epoch": 0.5989045448843918, "grad_norm": 3.0268564224243164, "learning_rate": 2.075145866248533e-05, "loss": 2.8031, "step": 3581 }, { "epoch": 0.5990717899402099, "grad_norm": 5.586061954498291, "learning_rate": 2.0737075929643745e-05, "loss": 2.5753, "step": 3582 }, { "epoch": 0.5992390349960279, "grad_norm": 4.667654514312744, "learning_rate": 2.0722694649854798e-05, "loss": 2.5881, "step": 3583 }, { "epoch": 0.5994062800518459, "grad_norm": 4.777454853057861, "learning_rate": 2.070831482802046e-05, "loss": 3.1254, "step": 3584 }, { "epoch": 0.599573525107664, "grad_norm": 5.036238193511963, "learning_rate": 2.0693936469042218e-05, "loss": 2.7141, "step": 3585 }, { "epoch": 0.599740770163482, "grad_norm": 3.19610857963562, "learning_rate": 2.0679559577821044e-05, "loss": 2.3951, "step": 3586 }, { "epoch": 0.5999080152193, "grad_norm": 3.6717112064361572, "learning_rate": 2.066518415925743e-05, "loss": 2.8592, "step": 3587 }, { "epoch": 0.6000752602751181, "grad_norm": 3.8518900871276855, "learning_rate": 2.0650810218251347e-05, "loss": 2.609, "step": 3588 }, { "epoch": 0.6002425053309361, "grad_norm": 6.294150352478027, "learning_rate": 2.0636437759702264e-05, "loss": 2.3917, "step": 3589 }, { "epoch": 0.6004097503867541, "grad_norm": 5.833480358123779, "learning_rate": 2.062206678850917e-05, "loss": 3.1786, "step": 3590 }, { "epoch": 0.6005769954425723, "grad_norm": 7.443213939666748, "learning_rate": 2.0607697309570523e-05, "loss": 2.8828, "step": 3591 }, { "epoch": 0.6007442404983903, "grad_norm": 3.0347182750701904, "learning_rate": 2.0593329327784272e-05, "loss": 2.4801, "step": 3592 }, { "epoch": 0.6009114855542083, "grad_norm": 5.142729759216309, "learning_rate": 2.0578962848047866e-05, "loss": 2.7179, "step": 3593 }, { "epoch": 0.6010787306100264, "grad_norm": 7.335456848144531, "learning_rate": 2.0564597875258235e-05, "loss": 2.06, "step": 3594 }, { "epoch": 0.6012459756658444, "grad_norm": 3.1442630290985107, "learning_rate": 2.05502344143118e-05, "loss": 2.6331, "step": 3595 }, { "epoch": 0.6014132207216624, "grad_norm": 9.565868377685547, "learning_rate": 2.0535872470104456e-05, "loss": 2.9795, "step": 3596 }, { "epoch": 0.6015804657774805, "grad_norm": 4.7943315505981445, "learning_rate": 2.0521512047531595e-05, "loss": 3.0609, "step": 3597 }, { "epoch": 0.6017477108332985, "grad_norm": 4.21230936050415, "learning_rate": 2.050715315148808e-05, "loss": 2.7008, "step": 3598 }, { "epoch": 0.6019149558891165, "grad_norm": 3.798954963684082, "learning_rate": 2.0492795786868273e-05, "loss": 2.9805, "step": 3599 }, { "epoch": 0.6020822009449346, "grad_norm": 3.2187352180480957, "learning_rate": 2.0478439958565994e-05, "loss": 2.6169, "step": 3600 }, { "epoch": 0.6022494460007526, "grad_norm": 5.134038925170898, "learning_rate": 2.0464085671474535e-05, "loss": 2.8076, "step": 3601 }, { "epoch": 0.6024166910565706, "grad_norm": 3.2674832344055176, "learning_rate": 2.0449732930486685e-05, "loss": 3.133, "step": 3602 }, { "epoch": 0.6025839361123887, "grad_norm": 3.0135414600372314, "learning_rate": 2.0435381740494684e-05, "loss": 2.7806, "step": 3603 }, { "epoch": 0.6027511811682067, "grad_norm": 4.845764636993408, "learning_rate": 2.0421032106390256e-05, "loss": 2.4945, "step": 3604 }, { "epoch": 0.6029184262240248, "grad_norm": 5.393866062164307, "learning_rate": 2.0406684033064594e-05, "loss": 2.746, "step": 3605 }, { "epoch": 0.6030856712798428, "grad_norm": 3.453505277633667, "learning_rate": 2.0392337525408346e-05, "loss": 2.5508, "step": 3606 }, { "epoch": 0.6032529163356608, "grad_norm": 9.536645889282227, "learning_rate": 2.0377992588311655e-05, "loss": 3.3272, "step": 3607 }, { "epoch": 0.6034201613914789, "grad_norm": 5.692997455596924, "learning_rate": 2.0363649226664096e-05, "loss": 4.04, "step": 3608 }, { "epoch": 0.6035874064472969, "grad_norm": 6.70174503326416, "learning_rate": 2.034930744535473e-05, "loss": 2.5068, "step": 3609 }, { "epoch": 0.6037546515031149, "grad_norm": 4.083108425140381, "learning_rate": 2.0334967249272063e-05, "loss": 2.6957, "step": 3610 }, { "epoch": 0.603921896558933, "grad_norm": 5.462594509124756, "learning_rate": 2.0320628643304074e-05, "loss": 2.4931, "step": 3611 }, { "epoch": 0.604089141614751, "grad_norm": 6.348516941070557, "learning_rate": 2.0306291632338194e-05, "loss": 2.8324, "step": 3612 }, { "epoch": 0.604256386670569, "grad_norm": 4.708584308624268, "learning_rate": 2.029195622126131e-05, "loss": 2.1011, "step": 3613 }, { "epoch": 0.6044236317263871, "grad_norm": 7.930304050445557, "learning_rate": 2.027762241495975e-05, "loss": 2.9957, "step": 3614 }, { "epoch": 0.6045908767822051, "grad_norm": 5.821415424346924, "learning_rate": 2.0263290218319336e-05, "loss": 2.8524, "step": 3615 }, { "epoch": 0.6047581218380231, "grad_norm": 4.8877949714660645, "learning_rate": 2.02489596362253e-05, "loss": 2.9939, "step": 3616 }, { "epoch": 0.6049253668938412, "grad_norm": 8.554951667785645, "learning_rate": 2.0234630673562344e-05, "loss": 2.6204, "step": 3617 }, { "epoch": 0.6050926119496592, "grad_norm": 3.295123815536499, "learning_rate": 2.022030333521461e-05, "loss": 2.5683, "step": 3618 }, { "epoch": 0.6052598570054772, "grad_norm": 5.4477362632751465, "learning_rate": 2.0205977626065683e-05, "loss": 2.1236, "step": 3619 }, { "epoch": 0.6054271020612954, "grad_norm": 10.270236015319824, "learning_rate": 2.0191653550998608e-05, "loss": 3.2697, "step": 3620 }, { "epoch": 0.6055943471171134, "grad_norm": 8.063796997070312, "learning_rate": 2.0177331114895858e-05, "loss": 3.4675, "step": 3621 }, { "epoch": 0.6057615921729314, "grad_norm": 5.379790782928467, "learning_rate": 2.0163010322639346e-05, "loss": 2.7193, "step": 3622 }, { "epoch": 0.6059288372287495, "grad_norm": 5.465414524078369, "learning_rate": 2.0148691179110444e-05, "loss": 3.022, "step": 3623 }, { "epoch": 0.6060960822845675, "grad_norm": 1.9777859449386597, "learning_rate": 2.0134373689189944e-05, "loss": 2.2171, "step": 3624 }, { "epoch": 0.6062633273403855, "grad_norm": 9.783770561218262, "learning_rate": 2.0120057857758085e-05, "loss": 3.1124, "step": 3625 }, { "epoch": 0.6064305723962036, "grad_norm": 2.85927414894104, "learning_rate": 2.0105743689694528e-05, "loss": 2.5427, "step": 3626 }, { "epoch": 0.6065978174520216, "grad_norm": 10.831384658813477, "learning_rate": 2.009143118987838e-05, "loss": 3.3736, "step": 3627 }, { "epoch": 0.6067650625078396, "grad_norm": 2.8799121379852295, "learning_rate": 2.0077120363188167e-05, "loss": 2.6986, "step": 3628 }, { "epoch": 0.6069323075636577, "grad_norm": 3.742656707763672, "learning_rate": 2.006281121450186e-05, "loss": 2.6222, "step": 3629 }, { "epoch": 0.6070995526194757, "grad_norm": 4.074070930480957, "learning_rate": 2.0048503748696835e-05, "loss": 2.6199, "step": 3630 }, { "epoch": 0.6072667976752937, "grad_norm": 8.305510520935059, "learning_rate": 2.0034197970649932e-05, "loss": 3.0851, "step": 3631 }, { "epoch": 0.6074340427311118, "grad_norm": 4.269957065582275, "learning_rate": 2.0019893885237378e-05, "loss": 3.0216, "step": 3632 }, { "epoch": 0.6076012877869298, "grad_norm": 4.9617838859558105, "learning_rate": 2.0005591497334842e-05, "loss": 2.5295, "step": 3633 }, { "epoch": 0.6077685328427478, "grad_norm": 3.1550345420837402, "learning_rate": 1.999129081181741e-05, "loss": 2.5221, "step": 3634 }, { "epoch": 0.6079357778985659, "grad_norm": 6.584998607635498, "learning_rate": 1.9976991833559585e-05, "loss": 2.9511, "step": 3635 }, { "epoch": 0.6081030229543839, "grad_norm": 8.826692581176758, "learning_rate": 1.996269456743529e-05, "loss": 2.9005, "step": 3636 }, { "epoch": 0.6082702680102019, "grad_norm": 5.698314666748047, "learning_rate": 1.9948399018317874e-05, "loss": 2.7576, "step": 3637 }, { "epoch": 0.60843751306602, "grad_norm": 4.956974983215332, "learning_rate": 1.9934105191080082e-05, "loss": 2.4985, "step": 3638 }, { "epoch": 0.608604758121838, "grad_norm": 5.098049640655518, "learning_rate": 1.9919813090594084e-05, "loss": 2.8455, "step": 3639 }, { "epoch": 0.608772003177656, "grad_norm": 2.457550048828125, "learning_rate": 1.9905522721731466e-05, "loss": 2.4215, "step": 3640 }, { "epoch": 0.6089392482334741, "grad_norm": 4.155354022979736, "learning_rate": 1.9891234089363216e-05, "loss": 3.2603, "step": 3641 }, { "epoch": 0.6091064932892921, "grad_norm": 3.724465847015381, "learning_rate": 1.987694719835973e-05, "loss": 2.6717, "step": 3642 }, { "epoch": 0.6092737383451102, "grad_norm": 8.34815788269043, "learning_rate": 1.986266205359081e-05, "loss": 3.0936, "step": 3643 }, { "epoch": 0.6094409834009282, "grad_norm": 3.998157262802124, "learning_rate": 1.9848378659925664e-05, "loss": 2.7931, "step": 3644 }, { "epoch": 0.6096082284567462, "grad_norm": 6.9869184494018555, "learning_rate": 1.9834097022232913e-05, "loss": 3.1601, "step": 3645 }, { "epoch": 0.6097754735125643, "grad_norm": 4.100867748260498, "learning_rate": 1.9819817145380557e-05, "loss": 2.9553, "step": 3646 }, { "epoch": 0.6099427185683823, "grad_norm": 4.193237781524658, "learning_rate": 1.9805539034236007e-05, "loss": 2.7542, "step": 3647 }, { "epoch": 0.6101099636242003, "grad_norm": 4.034468173980713, "learning_rate": 1.979126269366609e-05, "loss": 2.5605, "step": 3648 }, { "epoch": 0.6102772086800184, "grad_norm": 6.319310665130615, "learning_rate": 1.977698812853701e-05, "loss": 2.853, "step": 3649 }, { "epoch": 0.6104444537358364, "grad_norm": 4.591775417327881, "learning_rate": 1.9762715343714366e-05, "loss": 2.6269, "step": 3650 }, { "epoch": 0.6106116987916544, "grad_norm": 5.402958393096924, "learning_rate": 1.9748444344063156e-05, "loss": 2.8459, "step": 3651 }, { "epoch": 0.6107789438474726, "grad_norm": 5.499091148376465, "learning_rate": 1.973417513444776e-05, "loss": 2.8043, "step": 3652 }, { "epoch": 0.6109461889032906, "grad_norm": 3.9377989768981934, "learning_rate": 1.971990771973197e-05, "loss": 3.0252, "step": 3653 }, { "epoch": 0.6111134339591086, "grad_norm": 11.184733390808105, "learning_rate": 1.9705642104778936e-05, "loss": 2.6903, "step": 3654 }, { "epoch": 0.6112806790149267, "grad_norm": 5.417604923248291, "learning_rate": 1.9691378294451203e-05, "loss": 2.7538, "step": 3655 }, { "epoch": 0.6114479240707447, "grad_norm": 6.044201374053955, "learning_rate": 1.9677116293610733e-05, "loss": 2.885, "step": 3656 }, { "epoch": 0.6116151691265627, "grad_norm": 4.632783889770508, "learning_rate": 1.966285610711883e-05, "loss": 2.6123, "step": 3657 }, { "epoch": 0.6117824141823808, "grad_norm": 2.9139349460601807, "learning_rate": 1.96485977398362e-05, "loss": 2.3812, "step": 3658 }, { "epoch": 0.6119496592381988, "grad_norm": 2.6452443599700928, "learning_rate": 1.9634341196622925e-05, "loss": 2.4147, "step": 3659 }, { "epoch": 0.6121169042940168, "grad_norm": 4.3147807121276855, "learning_rate": 1.9620086482338464e-05, "loss": 2.581, "step": 3660 }, { "epoch": 0.6122841493498349, "grad_norm": 3.856644868850708, "learning_rate": 1.9605833601841647e-05, "loss": 2.7333, "step": 3661 }, { "epoch": 0.6124513944056529, "grad_norm": 8.273821830749512, "learning_rate": 1.9591582559990695e-05, "loss": 2.2953, "step": 3662 }, { "epoch": 0.6126186394614709, "grad_norm": 3.7360661029815674, "learning_rate": 1.957733336164318e-05, "loss": 2.6205, "step": 3663 }, { "epoch": 0.612785884517289, "grad_norm": 4.293590068817139, "learning_rate": 1.9563086011656072e-05, "loss": 2.3924, "step": 3664 }, { "epoch": 0.612953129573107, "grad_norm": 2.2021026611328125, "learning_rate": 1.9548840514885695e-05, "loss": 2.3052, "step": 3665 }, { "epoch": 0.613120374628925, "grad_norm": 2.353166341781616, "learning_rate": 1.9534596876187737e-05, "loss": 2.3689, "step": 3666 }, { "epoch": 0.6132876196847431, "grad_norm": 5.3397345542907715, "learning_rate": 1.952035510041726e-05, "loss": 2.8246, "step": 3667 }, { "epoch": 0.6134548647405611, "grad_norm": 3.51047945022583, "learning_rate": 1.950611519242869e-05, "loss": 2.7589, "step": 3668 }, { "epoch": 0.6136221097963791, "grad_norm": 4.23015832901001, "learning_rate": 1.9491877157075825e-05, "loss": 2.4132, "step": 3669 }, { "epoch": 0.6137893548521972, "grad_norm": 4.393100261688232, "learning_rate": 1.9477640999211806e-05, "loss": 2.9402, "step": 3670 }, { "epoch": 0.6139565999080152, "grad_norm": 3.819322109222412, "learning_rate": 1.9463406723689145e-05, "loss": 2.5281, "step": 3671 }, { "epoch": 0.6141238449638332, "grad_norm": 6.415349006652832, "learning_rate": 1.9449174335359706e-05, "loss": 3.1405, "step": 3672 }, { "epoch": 0.6142910900196513, "grad_norm": 3.5635812282562256, "learning_rate": 1.9434943839074734e-05, "loss": 2.7278, "step": 3673 }, { "epoch": 0.6144583350754693, "grad_norm": 6.967138767242432, "learning_rate": 1.94207152396848e-05, "loss": 3.1625, "step": 3674 }, { "epoch": 0.6146255801312873, "grad_norm": 11.462828636169434, "learning_rate": 1.9406488542039834e-05, "loss": 2.8096, "step": 3675 }, { "epoch": 0.6147928251871054, "grad_norm": 9.430855751037598, "learning_rate": 1.9392263750989134e-05, "loss": 2.4542, "step": 3676 }, { "epoch": 0.6149600702429234, "grad_norm": 6.5126729011535645, "learning_rate": 1.9378040871381325e-05, "loss": 2.8924, "step": 3677 }, { "epoch": 0.6151273152987414, "grad_norm": 4.9165472984313965, "learning_rate": 1.93638199080644e-05, "loss": 2.7606, "step": 3678 }, { "epoch": 0.6152945603545595, "grad_norm": 7.948554515838623, "learning_rate": 1.9349600865885687e-05, "loss": 2.2964, "step": 3679 }, { "epoch": 0.6154618054103775, "grad_norm": 3.968233346939087, "learning_rate": 1.933538374969186e-05, "loss": 2.7712, "step": 3680 }, { "epoch": 0.6156290504661955, "grad_norm": 5.376319408416748, "learning_rate": 1.9321168564328945e-05, "loss": 2.5331, "step": 3681 }, { "epoch": 0.6157962955220136, "grad_norm": 6.285268306732178, "learning_rate": 1.9306955314642308e-05, "loss": 2.6375, "step": 3682 }, { "epoch": 0.6159635405778316, "grad_norm": 3.859678030014038, "learning_rate": 1.9292744005476647e-05, "loss": 2.637, "step": 3683 }, { "epoch": 0.6161307856336498, "grad_norm": 2.1643919944763184, "learning_rate": 1.9278534641676e-05, "loss": 2.0728, "step": 3684 }, { "epoch": 0.6162980306894678, "grad_norm": 4.63027811050415, "learning_rate": 1.926432722808375e-05, "loss": 3.088, "step": 3685 }, { "epoch": 0.6164652757452858, "grad_norm": 2.1423070430755615, "learning_rate": 1.925012176954261e-05, "loss": 2.6235, "step": 3686 }, { "epoch": 0.6166325208011039, "grad_norm": 4.436881065368652, "learning_rate": 1.9235918270894624e-05, "loss": 2.6857, "step": 3687 }, { "epoch": 0.6167997658569219, "grad_norm": 4.757719993591309, "learning_rate": 1.922171673698116e-05, "loss": 2.5076, "step": 3688 }, { "epoch": 0.6169670109127399, "grad_norm": 9.525437355041504, "learning_rate": 1.920751717264296e-05, "loss": 3.2472, "step": 3689 }, { "epoch": 0.617134255968558, "grad_norm": 6.084108829498291, "learning_rate": 1.9193319582720036e-05, "loss": 2.6019, "step": 3690 }, { "epoch": 0.617301501024376, "grad_norm": 5.705394744873047, "learning_rate": 1.917912397205176e-05, "loss": 2.9045, "step": 3691 }, { "epoch": 0.617468746080194, "grad_norm": 3.737058162689209, "learning_rate": 1.9164930345476823e-05, "loss": 2.5033, "step": 3692 }, { "epoch": 0.6176359911360121, "grad_norm": 5.8205132484436035, "learning_rate": 1.9150738707833237e-05, "loss": 2.6319, "step": 3693 }, { "epoch": 0.6178032361918301, "grad_norm": 7.168110370635986, "learning_rate": 1.9136549063958336e-05, "loss": 3.2468, "step": 3694 }, { "epoch": 0.6179704812476481, "grad_norm": 7.844515800476074, "learning_rate": 1.9122361418688784e-05, "loss": 2.8585, "step": 3695 }, { "epoch": 0.6181377263034662, "grad_norm": 4.438600540161133, "learning_rate": 1.9108175776860543e-05, "loss": 2.5281, "step": 3696 }, { "epoch": 0.6183049713592842, "grad_norm": 2.574274778366089, "learning_rate": 1.909399214330892e-05, "loss": 2.3369, "step": 3697 }, { "epoch": 0.6184722164151022, "grad_norm": 7.080519676208496, "learning_rate": 1.907981052286852e-05, "loss": 2.8899, "step": 3698 }, { "epoch": 0.6186394614709203, "grad_norm": 3.1580283641815186, "learning_rate": 1.9065630920373256e-05, "loss": 2.9338, "step": 3699 }, { "epoch": 0.6188067065267383, "grad_norm": 6.33699369430542, "learning_rate": 1.9051453340656374e-05, "loss": 2.9433, "step": 3700 }, { "epoch": 0.6189739515825563, "grad_norm": 4.557555675506592, "learning_rate": 1.9037277788550406e-05, "loss": 2.9324, "step": 3701 }, { "epoch": 0.6191411966383744, "grad_norm": 7.720186710357666, "learning_rate": 1.902310426888721e-05, "loss": 2.7572, "step": 3702 }, { "epoch": 0.6193084416941924, "grad_norm": 4.730926513671875, "learning_rate": 1.9008932786497953e-05, "loss": 2.6121, "step": 3703 }, { "epoch": 0.6194756867500104, "grad_norm": 5.459977626800537, "learning_rate": 1.899476334621309e-05, "loss": 2.4794, "step": 3704 }, { "epoch": 0.6196429318058285, "grad_norm": 5.150100231170654, "learning_rate": 1.8980595952862388e-05, "loss": 2.764, "step": 3705 }, { "epoch": 0.6198101768616465, "grad_norm": 3.4221606254577637, "learning_rate": 1.8966430611274934e-05, "loss": 2.9248, "step": 3706 }, { "epoch": 0.6199774219174645, "grad_norm": 7.868557929992676, "learning_rate": 1.8952267326279098e-05, "loss": 3.0354, "step": 3707 }, { "epoch": 0.6201446669732826, "grad_norm": 5.805881023406982, "learning_rate": 1.893810610270255e-05, "loss": 2.2031, "step": 3708 }, { "epoch": 0.6203119120291006, "grad_norm": 3.9205405712127686, "learning_rate": 1.8923946945372252e-05, "loss": 2.4568, "step": 3709 }, { "epoch": 0.6204791570849186, "grad_norm": 4.912517070770264, "learning_rate": 1.890978985911448e-05, "loss": 2.6295, "step": 3710 }, { "epoch": 0.6206464021407367, "grad_norm": 5.281459331512451, "learning_rate": 1.8895634848754784e-05, "loss": 2.6992, "step": 3711 }, { "epoch": 0.6208136471965547, "grad_norm": 2.5492746829986572, "learning_rate": 1.888148191911802e-05, "loss": 2.4621, "step": 3712 }, { "epoch": 0.6209808922523727, "grad_norm": 31.982608795166016, "learning_rate": 1.886733107502832e-05, "loss": 2.7032, "step": 3713 }, { "epoch": 0.6211481373081909, "grad_norm": 5.2499189376831055, "learning_rate": 1.8853182321309133e-05, "loss": 2.7823, "step": 3714 }, { "epoch": 0.6213153823640089, "grad_norm": 5.771376132965088, "learning_rate": 1.8839035662783165e-05, "loss": 2.6172, "step": 3715 }, { "epoch": 0.6214826274198268, "grad_norm": 3.480346202850342, "learning_rate": 1.882489110427243e-05, "loss": 2.4864, "step": 3716 }, { "epoch": 0.621649872475645, "grad_norm": 5.253401279449463, "learning_rate": 1.8810748650598205e-05, "loss": 2.9179, "step": 3717 }, { "epoch": 0.621817117531463, "grad_norm": 4.195467948913574, "learning_rate": 1.879660830658107e-05, "loss": 2.8398, "step": 3718 }, { "epoch": 0.621984362587281, "grad_norm": 3.71916127204895, "learning_rate": 1.8782470077040875e-05, "loss": 2.7745, "step": 3719 }, { "epoch": 0.6221516076430991, "grad_norm": 4.130202293395996, "learning_rate": 1.8768333966796753e-05, "loss": 2.9531, "step": 3720 }, { "epoch": 0.6223188526989171, "grad_norm": 6.842179298400879, "learning_rate": 1.8754199980667104e-05, "loss": 2.9126, "step": 3721 }, { "epoch": 0.6224860977547352, "grad_norm": 3.4415123462677, "learning_rate": 1.8740068123469627e-05, "loss": 2.6971, "step": 3722 }, { "epoch": 0.6226533428105532, "grad_norm": 5.507798194885254, "learning_rate": 1.8725938400021274e-05, "loss": 2.9996, "step": 3723 }, { "epoch": 0.6228205878663712, "grad_norm": 5.1119279861450195, "learning_rate": 1.8711810815138274e-05, "loss": 2.8085, "step": 3724 }, { "epoch": 0.6229878329221893, "grad_norm": 4.5962910652160645, "learning_rate": 1.869768537363614e-05, "loss": 3.288, "step": 3725 }, { "epoch": 0.6231550779780073, "grad_norm": 9.068204879760742, "learning_rate": 1.8683562080329638e-05, "loss": 3.2357, "step": 3726 }, { "epoch": 0.6233223230338253, "grad_norm": 3.3499603271484375, "learning_rate": 1.8669440940032805e-05, "loss": 2.4274, "step": 3727 }, { "epoch": 0.6234895680896434, "grad_norm": 2.4893078804016113, "learning_rate": 1.8655321957558958e-05, "loss": 2.4448, "step": 3728 }, { "epoch": 0.6236568131454614, "grad_norm": 2.016932249069214, "learning_rate": 1.8641205137720646e-05, "loss": 2.451, "step": 3729 }, { "epoch": 0.6238240582012794, "grad_norm": 5.6324334144592285, "learning_rate": 1.8627090485329732e-05, "loss": 2.7261, "step": 3730 }, { "epoch": 0.6239913032570975, "grad_norm": 6.942343235015869, "learning_rate": 1.8612978005197295e-05, "loss": 3.2196, "step": 3731 }, { "epoch": 0.6241585483129155, "grad_norm": 6.444573402404785, "learning_rate": 1.8598867702133693e-05, "loss": 2.7448, "step": 3732 }, { "epoch": 0.6243257933687335, "grad_norm": 4.830702304840088, "learning_rate": 1.858475958094854e-05, "loss": 2.5691, "step": 3733 }, { "epoch": 0.6244930384245516, "grad_norm": 4.8297553062438965, "learning_rate": 1.85706536464507e-05, "loss": 2.5674, "step": 3734 }, { "epoch": 0.6246602834803696, "grad_norm": 5.202088356018066, "learning_rate": 1.85565499034483e-05, "loss": 2.977, "step": 3735 }, { "epoch": 0.6248275285361876, "grad_norm": 3.5651323795318604, "learning_rate": 1.8542448356748714e-05, "loss": 2.6845, "step": 3736 }, { "epoch": 0.6249947735920057, "grad_norm": 3.7216413021087646, "learning_rate": 1.8528349011158566e-05, "loss": 2.6901, "step": 3737 }, { "epoch": 0.6251620186478237, "grad_norm": 4.438888072967529, "learning_rate": 1.8514251871483745e-05, "loss": 2.9278, "step": 3738 }, { "epoch": 0.6253292637036417, "grad_norm": 7.314457416534424, "learning_rate": 1.850015694252937e-05, "loss": 2.9696, "step": 3739 }, { "epoch": 0.6254965087594598, "grad_norm": 2.4626235961914062, "learning_rate": 1.8486064229099814e-05, "loss": 2.8369, "step": 3740 }, { "epoch": 0.6256637538152778, "grad_norm": 2.842716932296753, "learning_rate": 1.8471973735998693e-05, "loss": 2.7614, "step": 3741 }, { "epoch": 0.6258309988710958, "grad_norm": 4.02866268157959, "learning_rate": 1.845788546802887e-05, "loss": 2.9687, "step": 3742 }, { "epoch": 0.6259982439269139, "grad_norm": 4.1550374031066895, "learning_rate": 1.8443799429992442e-05, "loss": 2.4856, "step": 3743 }, { "epoch": 0.6261654889827319, "grad_norm": 7.359994888305664, "learning_rate": 1.8429715626690755e-05, "loss": 3.4403, "step": 3744 }, { "epoch": 0.6263327340385499, "grad_norm": 5.5724616050720215, "learning_rate": 1.8415634062924387e-05, "loss": 2.4351, "step": 3745 }, { "epoch": 0.626499979094368, "grad_norm": 5.750912666320801, "learning_rate": 1.8401554743493142e-05, "loss": 2.7138, "step": 3746 }, { "epoch": 0.626667224150186, "grad_norm": 4.951770782470703, "learning_rate": 1.8387477673196098e-05, "loss": 2.8373, "step": 3747 }, { "epoch": 0.626834469206004, "grad_norm": 3.277707099914551, "learning_rate": 1.837340285683152e-05, "loss": 2.7316, "step": 3748 }, { "epoch": 0.6270017142618222, "grad_norm": 6.306072235107422, "learning_rate": 1.8359330299196934e-05, "loss": 2.2243, "step": 3749 }, { "epoch": 0.6271689593176402, "grad_norm": 4.923126220703125, "learning_rate": 1.834526000508908e-05, "loss": 2.739, "step": 3750 }, { "epoch": 0.6273362043734582, "grad_norm": 5.626863479614258, "learning_rate": 1.8331191979303942e-05, "loss": 2.7905, "step": 3751 }, { "epoch": 0.6275034494292763, "grad_norm": 10.522805213928223, "learning_rate": 1.831712622663671e-05, "loss": 2.5621, "step": 3752 }, { "epoch": 0.6276706944850943, "grad_norm": 7.549012184143066, "learning_rate": 1.8303062751881823e-05, "loss": 2.5913, "step": 3753 }, { "epoch": 0.6278379395409123, "grad_norm": 4.136613845825195, "learning_rate": 1.8289001559832918e-05, "loss": 2.9261, "step": 3754 }, { "epoch": 0.6280051845967304, "grad_norm": 4.923119068145752, "learning_rate": 1.8274942655282884e-05, "loss": 2.7508, "step": 3755 }, { "epoch": 0.6281724296525484, "grad_norm": 5.644711017608643, "learning_rate": 1.82608860430238e-05, "loss": 2.4354, "step": 3756 }, { "epoch": 0.6283396747083664, "grad_norm": 7.150838851928711, "learning_rate": 1.824683172784699e-05, "loss": 2.2779, "step": 3757 }, { "epoch": 0.6285069197641845, "grad_norm": 3.829030752182007, "learning_rate": 1.823277971454297e-05, "loss": 2.8172, "step": 3758 }, { "epoch": 0.6286741648200025, "grad_norm": 4.99474573135376, "learning_rate": 1.8218730007901494e-05, "loss": 3.0076, "step": 3759 }, { "epoch": 0.6288414098758206, "grad_norm": 4.325135231018066, "learning_rate": 1.8204682612711514e-05, "loss": 2.6463, "step": 3760 }, { "epoch": 0.6290086549316386, "grad_norm": 4.823440074920654, "learning_rate": 1.8190637533761203e-05, "loss": 2.9358, "step": 3761 }, { "epoch": 0.6291758999874566, "grad_norm": 4.112978935241699, "learning_rate": 1.8176594775837926e-05, "loss": 2.3381, "step": 3762 }, { "epoch": 0.6293431450432747, "grad_norm": 4.375173091888428, "learning_rate": 1.81625543437283e-05, "loss": 2.6209, "step": 3763 }, { "epoch": 0.6295103900990927, "grad_norm": 14.924942016601562, "learning_rate": 1.8148516242218105e-05, "loss": 3.3201, "step": 3764 }, { "epoch": 0.6296776351549107, "grad_norm": 11.927820205688477, "learning_rate": 1.8134480476092347e-05, "loss": 3.1747, "step": 3765 }, { "epoch": 0.6298448802107288, "grad_norm": 4.4918293952941895, "learning_rate": 1.8120447050135232e-05, "loss": 2.7623, "step": 3766 }, { "epoch": 0.6300121252665468, "grad_norm": 9.50061321258545, "learning_rate": 1.8106415969130162e-05, "loss": 3.1738, "step": 3767 }, { "epoch": 0.6301793703223648, "grad_norm": 3.963066577911377, "learning_rate": 1.8092387237859754e-05, "loss": 3.3301, "step": 3768 }, { "epoch": 0.6303466153781829, "grad_norm": 2.691875696182251, "learning_rate": 1.8078360861105814e-05, "loss": 2.6447, "step": 3769 }, { "epoch": 0.6305138604340009, "grad_norm": 9.067344665527344, "learning_rate": 1.806433684364934e-05, "loss": 2.7259, "step": 3770 }, { "epoch": 0.6306811054898189, "grad_norm": 3.0332682132720947, "learning_rate": 1.8050315190270546e-05, "loss": 2.5523, "step": 3771 }, { "epoch": 0.630848350545637, "grad_norm": 5.610728740692139, "learning_rate": 1.8036295905748822e-05, "loss": 2.9734, "step": 3772 }, { "epoch": 0.631015595601455, "grad_norm": 10.78840446472168, "learning_rate": 1.8022278994862756e-05, "loss": 2.7565, "step": 3773 }, { "epoch": 0.631182840657273, "grad_norm": 5.514931678771973, "learning_rate": 1.8008264462390126e-05, "loss": 2.7068, "step": 3774 }, { "epoch": 0.6313500857130911, "grad_norm": 5.265707492828369, "learning_rate": 1.79942523131079e-05, "loss": 2.6816, "step": 3775 }, { "epoch": 0.6315173307689091, "grad_norm": 7.072509765625, "learning_rate": 1.7980242551792237e-05, "loss": 2.2639, "step": 3776 }, { "epoch": 0.6316845758247271, "grad_norm": 2.3045079708099365, "learning_rate": 1.7966235183218478e-05, "loss": 2.3966, "step": 3777 }, { "epoch": 0.6318518208805453, "grad_norm": 3.2400941848754883, "learning_rate": 1.7952230212161146e-05, "loss": 2.9085, "step": 3778 }, { "epoch": 0.6320190659363633, "grad_norm": 6.891081809997559, "learning_rate": 1.7938227643393942e-05, "loss": 2.8877, "step": 3779 }, { "epoch": 0.6321863109921813, "grad_norm": 3.9212000370025635, "learning_rate": 1.7924227481689786e-05, "loss": 2.7374, "step": 3780 }, { "epoch": 0.6323535560479994, "grad_norm": 4.328762531280518, "learning_rate": 1.791022973182072e-05, "loss": 2.8608, "step": 3781 }, { "epoch": 0.6325208011038174, "grad_norm": 12.846704483032227, "learning_rate": 1.7896234398558004e-05, "loss": 2.7061, "step": 3782 }, { "epoch": 0.6326880461596354, "grad_norm": 6.762118339538574, "learning_rate": 1.788224148667206e-05, "loss": 2.5773, "step": 3783 }, { "epoch": 0.6328552912154535, "grad_norm": 4.862046241760254, "learning_rate": 1.7868251000932484e-05, "loss": 2.6044, "step": 3784 }, { "epoch": 0.6330225362712715, "grad_norm": 4.553565979003906, "learning_rate": 1.7854262946108052e-05, "loss": 2.9462, "step": 3785 }, { "epoch": 0.6331897813270895, "grad_norm": 6.520255088806152, "learning_rate": 1.7840277326966705e-05, "loss": 3.1539, "step": 3786 }, { "epoch": 0.6333570263829076, "grad_norm": 3.5869503021240234, "learning_rate": 1.782629414827554e-05, "loss": 2.3985, "step": 3787 }, { "epoch": 0.6335242714387256, "grad_norm": 2.751145839691162, "learning_rate": 1.7812313414800868e-05, "loss": 2.5047, "step": 3788 }, { "epoch": 0.6336915164945436, "grad_norm": 3.79736065864563, "learning_rate": 1.7798335131308126e-05, "loss": 2.5845, "step": 3789 }, { "epoch": 0.6338587615503617, "grad_norm": 2.592050313949585, "learning_rate": 1.778435930256192e-05, "loss": 2.5509, "step": 3790 }, { "epoch": 0.6340260066061797, "grad_norm": 9.093317985534668, "learning_rate": 1.7770385933326032e-05, "loss": 3.1449, "step": 3791 }, { "epoch": 0.6341932516619977, "grad_norm": 5.722671031951904, "learning_rate": 1.7756415028363392e-05, "loss": 2.7127, "step": 3792 }, { "epoch": 0.6343604967178158, "grad_norm": 5.39863395690918, "learning_rate": 1.7742446592436108e-05, "loss": 2.724, "step": 3793 }, { "epoch": 0.6345277417736338, "grad_norm": 3.3498458862304688, "learning_rate": 1.772848063030543e-05, "loss": 2.7866, "step": 3794 }, { "epoch": 0.6346949868294518, "grad_norm": 3.35135555267334, "learning_rate": 1.7714517146731762e-05, "loss": 2.6518, "step": 3795 }, { "epoch": 0.6348622318852699, "grad_norm": 4.845488548278809, "learning_rate": 1.7700556146474688e-05, "loss": 2.9231, "step": 3796 }, { "epoch": 0.6350294769410879, "grad_norm": 4.598339557647705, "learning_rate": 1.7686597634292924e-05, "loss": 3.4434, "step": 3797 }, { "epoch": 0.635196721996906, "grad_norm": 9.037178993225098, "learning_rate": 1.767264161494434e-05, "loss": 2.644, "step": 3798 }, { "epoch": 0.635363967052724, "grad_norm": 4.625882148742676, "learning_rate": 1.7658688093185962e-05, "loss": 2.8432, "step": 3799 }, { "epoch": 0.635531212108542, "grad_norm": 2.57057785987854, "learning_rate": 1.7644737073773963e-05, "loss": 2.4446, "step": 3800 }, { "epoch": 0.6356984571643601, "grad_norm": 4.742549896240234, "learning_rate": 1.763078856146366e-05, "loss": 2.9054, "step": 3801 }, { "epoch": 0.6358657022201781, "grad_norm": 4.347770690917969, "learning_rate": 1.7616842561009517e-05, "loss": 2.8255, "step": 3802 }, { "epoch": 0.6360329472759961, "grad_norm": 6.000614643096924, "learning_rate": 1.7602899077165136e-05, "loss": 2.6754, "step": 3803 }, { "epoch": 0.6362001923318142, "grad_norm": 3.9218480587005615, "learning_rate": 1.7588958114683286e-05, "loss": 3.0419, "step": 3804 }, { "epoch": 0.6363674373876322, "grad_norm": 3.619107484817505, "learning_rate": 1.7575019678315852e-05, "loss": 2.6373, "step": 3805 }, { "epoch": 0.6365346824434502, "grad_norm": 5.200246810913086, "learning_rate": 1.7561083772813858e-05, "loss": 2.3875, "step": 3806 }, { "epoch": 0.6367019274992683, "grad_norm": 7.04906702041626, "learning_rate": 1.7547150402927475e-05, "loss": 2.8597, "step": 3807 }, { "epoch": 0.6368691725550863, "grad_norm": 3.500810384750366, "learning_rate": 1.7533219573406013e-05, "loss": 2.4177, "step": 3808 }, { "epoch": 0.6370364176109043, "grad_norm": 4.080125331878662, "learning_rate": 1.7519291288997896e-05, "loss": 2.9382, "step": 3809 }, { "epoch": 0.6372036626667225, "grad_norm": 3.850417137145996, "learning_rate": 1.7505365554450702e-05, "loss": 2.4427, "step": 3810 }, { "epoch": 0.6373709077225405, "grad_norm": 5.874654769897461, "learning_rate": 1.7491442374511123e-05, "loss": 3.5152, "step": 3811 }, { "epoch": 0.6375381527783585, "grad_norm": 3.1604630947113037, "learning_rate": 1.747752175392501e-05, "loss": 2.6171, "step": 3812 }, { "epoch": 0.6377053978341766, "grad_norm": 5.330573081970215, "learning_rate": 1.74636036974373e-05, "loss": 3.2094, "step": 3813 }, { "epoch": 0.6378726428899946, "grad_norm": 6.330973148345947, "learning_rate": 1.7449688209792087e-05, "loss": 3.1309, "step": 3814 }, { "epoch": 0.6380398879458126, "grad_norm": 3.537266254425049, "learning_rate": 1.7435775295732577e-05, "loss": 2.539, "step": 3815 }, { "epoch": 0.6382071330016307, "grad_norm": 2.601524591445923, "learning_rate": 1.7421864960001096e-05, "loss": 2.6407, "step": 3816 }, { "epoch": 0.6383743780574487, "grad_norm": 5.069955825805664, "learning_rate": 1.7407957207339102e-05, "loss": 2.7004, "step": 3817 }, { "epoch": 0.6385416231132667, "grad_norm": 5.720069408416748, "learning_rate": 1.7394052042487168e-05, "loss": 3.3892, "step": 3818 }, { "epoch": 0.6387088681690848, "grad_norm": 2.6810295581817627, "learning_rate": 1.738014947018498e-05, "loss": 2.4163, "step": 3819 }, { "epoch": 0.6388761132249028, "grad_norm": 4.778876781463623, "learning_rate": 1.736624949517133e-05, "loss": 2.7606, "step": 3820 }, { "epoch": 0.6390433582807208, "grad_norm": 3.688612699508667, "learning_rate": 1.7352352122184166e-05, "loss": 2.1715, "step": 3821 }, { "epoch": 0.6392106033365389, "grad_norm": 4.0118408203125, "learning_rate": 1.7338457355960506e-05, "loss": 2.8663, "step": 3822 }, { "epoch": 0.6393778483923569, "grad_norm": 7.8887619972229, "learning_rate": 1.7324565201236496e-05, "loss": 2.752, "step": 3823 }, { "epoch": 0.6395450934481749, "grad_norm": 6.3802666664123535, "learning_rate": 1.7310675662747396e-05, "loss": 2.7234, "step": 3824 }, { "epoch": 0.639712338503993, "grad_norm": 3.502182722091675, "learning_rate": 1.7296788745227567e-05, "loss": 2.844, "step": 3825 }, { "epoch": 0.639879583559811, "grad_norm": 4.031065464019775, "learning_rate": 1.728290445341047e-05, "loss": 2.724, "step": 3826 }, { "epoch": 0.640046828615629, "grad_norm": 2.492504119873047, "learning_rate": 1.726902279202869e-05, "loss": 2.7606, "step": 3827 }, { "epoch": 0.6402140736714471, "grad_norm": 5.0422539710998535, "learning_rate": 1.7255143765813897e-05, "loss": 2.6497, "step": 3828 }, { "epoch": 0.6403813187272651, "grad_norm": 3.0694544315338135, "learning_rate": 1.724126737949688e-05, "loss": 2.6099, "step": 3829 }, { "epoch": 0.6405485637830831, "grad_norm": 3.3699519634246826, "learning_rate": 1.7227393637807516e-05, "loss": 2.7787, "step": 3830 }, { "epoch": 0.6407158088389012, "grad_norm": 7.915779113769531, "learning_rate": 1.7213522545474782e-05, "loss": 2.8599, "step": 3831 }, { "epoch": 0.6408830538947192, "grad_norm": 11.93416690826416, "learning_rate": 1.7199654107226753e-05, "loss": 3.3093, "step": 3832 }, { "epoch": 0.6410502989505372, "grad_norm": 4.908160209655762, "learning_rate": 1.71857883277906e-05, "loss": 2.9327, "step": 3833 }, { "epoch": 0.6412175440063553, "grad_norm": 3.1814804077148438, "learning_rate": 1.7171925211892592e-05, "loss": 2.8696, "step": 3834 }, { "epoch": 0.6413847890621733, "grad_norm": 3.475897789001465, "learning_rate": 1.715806476425808e-05, "loss": 2.6641, "step": 3835 }, { "epoch": 0.6415520341179913, "grad_norm": 4.584676742553711, "learning_rate": 1.7144206989611507e-05, "loss": 2.6398, "step": 3836 }, { "epoch": 0.6417192791738094, "grad_norm": 9.30288028717041, "learning_rate": 1.7130351892676423e-05, "loss": 2.9135, "step": 3837 }, { "epoch": 0.6418865242296274, "grad_norm": 5.456047058105469, "learning_rate": 1.7116499478175444e-05, "loss": 3.172, "step": 3838 }, { "epoch": 0.6420537692854456, "grad_norm": 7.831234931945801, "learning_rate": 1.710264975083028e-05, "loss": 3.0696, "step": 3839 }, { "epoch": 0.6422210143412636, "grad_norm": 7.234132289886475, "learning_rate": 1.708880271536173e-05, "loss": 2.7384, "step": 3840 }, { "epoch": 0.6423882593970816, "grad_norm": 6.865843772888184, "learning_rate": 1.7074958376489652e-05, "loss": 2.5186, "step": 3841 }, { "epoch": 0.6425555044528997, "grad_norm": 5.833865165710449, "learning_rate": 1.706111673893302e-05, "loss": 2.9318, "step": 3842 }, { "epoch": 0.6427227495087177, "grad_norm": 5.3126540184021, "learning_rate": 1.7047277807409863e-05, "loss": 2.8109, "step": 3843 }, { "epoch": 0.6428899945645357, "grad_norm": 5.329280376434326, "learning_rate": 1.7033441586637283e-05, "loss": 2.7557, "step": 3844 }, { "epoch": 0.6430572396203538, "grad_norm": 3.929020643234253, "learning_rate": 1.701960808133149e-05, "loss": 2.613, "step": 3845 }, { "epoch": 0.6432244846761718, "grad_norm": 8.328234672546387, "learning_rate": 1.7005777296207736e-05, "loss": 3.517, "step": 3846 }, { "epoch": 0.6433917297319898, "grad_norm": 4.534942150115967, "learning_rate": 1.6991949235980363e-05, "loss": 2.6978, "step": 3847 }, { "epoch": 0.6435589747878079, "grad_norm": 5.904055595397949, "learning_rate": 1.697812390536278e-05, "loss": 2.7373, "step": 3848 }, { "epoch": 0.6437262198436259, "grad_norm": 6.875117301940918, "learning_rate": 1.6964301309067458e-05, "loss": 2.8712, "step": 3849 }, { "epoch": 0.6438934648994439, "grad_norm": 13.691344261169434, "learning_rate": 1.6950481451805942e-05, "loss": 3.3637, "step": 3850 }, { "epoch": 0.644060709955262, "grad_norm": 5.838823318481445, "learning_rate": 1.693666433828885e-05, "loss": 2.7595, "step": 3851 }, { "epoch": 0.64422795501108, "grad_norm": 6.6484880447387695, "learning_rate": 1.692284997322585e-05, "loss": 2.9771, "step": 3852 }, { "epoch": 0.644395200066898, "grad_norm": 7.206340312957764, "learning_rate": 1.6909038361325686e-05, "loss": 3.1955, "step": 3853 }, { "epoch": 0.6445624451227161, "grad_norm": 7.50755500793457, "learning_rate": 1.6895229507296163e-05, "loss": 2.2226, "step": 3854 }, { "epoch": 0.6447296901785341, "grad_norm": 4.617588043212891, "learning_rate": 1.6881423415844138e-05, "loss": 2.7659, "step": 3855 }, { "epoch": 0.6448969352343521, "grad_norm": 4.028453826904297, "learning_rate": 1.686762009167553e-05, "loss": 2.9315, "step": 3856 }, { "epoch": 0.6450641802901702, "grad_norm": 5.628993511199951, "learning_rate": 1.6853819539495318e-05, "loss": 2.8437, "step": 3857 }, { "epoch": 0.6452314253459882, "grad_norm": 4.628597736358643, "learning_rate": 1.6840021764007532e-05, "loss": 2.8559, "step": 3858 }, { "epoch": 0.6453986704018062, "grad_norm": 4.780325412750244, "learning_rate": 1.682622676991526e-05, "loss": 2.676, "step": 3859 }, { "epoch": 0.6455659154576243, "grad_norm": 3.8497138023376465, "learning_rate": 1.6812434561920632e-05, "loss": 2.6031, "step": 3860 }, { "epoch": 0.6457331605134423, "grad_norm": 3.1993768215179443, "learning_rate": 1.6798645144724827e-05, "loss": 3.1997, "step": 3861 }, { "epoch": 0.6459004055692603, "grad_norm": 2.481847047805786, "learning_rate": 1.6784858523028108e-05, "loss": 2.3454, "step": 3862 }, { "epoch": 0.6460676506250784, "grad_norm": 5.213744163513184, "learning_rate": 1.677107470152974e-05, "loss": 2.9539, "step": 3863 }, { "epoch": 0.6462348956808964, "grad_norm": 3.417383909225464, "learning_rate": 1.6757293684928053e-05, "loss": 2.7302, "step": 3864 }, { "epoch": 0.6464021407367144, "grad_norm": 4.430137634277344, "learning_rate": 1.674351547792043e-05, "loss": 2.9268, "step": 3865 }, { "epoch": 0.6465693857925325, "grad_norm": 6.002292156219482, "learning_rate": 1.6729740085203267e-05, "loss": 2.4972, "step": 3866 }, { "epoch": 0.6467366308483505, "grad_norm": 7.91270112991333, "learning_rate": 1.6715967511472036e-05, "loss": 2.6408, "step": 3867 }, { "epoch": 0.6469038759041685, "grad_norm": 4.115695953369141, "learning_rate": 1.6702197761421222e-05, "loss": 2.8963, "step": 3868 }, { "epoch": 0.6470711209599866, "grad_norm": 6.012133598327637, "learning_rate": 1.6688430839744357e-05, "loss": 2.5643, "step": 3869 }, { "epoch": 0.6472383660158046, "grad_norm": 4.668788909912109, "learning_rate": 1.6674666751134017e-05, "loss": 2.5816, "step": 3870 }, { "epoch": 0.6474056110716226, "grad_norm": 6.43157958984375, "learning_rate": 1.66609055002818e-05, "loss": 2.8923, "step": 3871 }, { "epoch": 0.6475728561274408, "grad_norm": 4.73787784576416, "learning_rate": 1.6647147091878342e-05, "loss": 3.3744, "step": 3872 }, { "epoch": 0.6477401011832588, "grad_norm": 3.1681368350982666, "learning_rate": 1.663339153061331e-05, "loss": 2.5781, "step": 3873 }, { "epoch": 0.6479073462390768, "grad_norm": 2.3819220066070557, "learning_rate": 1.66196388211754e-05, "loss": 2.4866, "step": 3874 }, { "epoch": 0.6480745912948949, "grad_norm": 4.842905044555664, "learning_rate": 1.660588896825233e-05, "loss": 2.592, "step": 3875 }, { "epoch": 0.6482418363507129, "grad_norm": 5.3346076011657715, "learning_rate": 1.659214197653086e-05, "loss": 3.0288, "step": 3876 }, { "epoch": 0.648409081406531, "grad_norm": 8.037290573120117, "learning_rate": 1.6578397850696743e-05, "loss": 2.7934, "step": 3877 }, { "epoch": 0.648576326462349, "grad_norm": 2.9616363048553467, "learning_rate": 1.656465659543481e-05, "loss": 2.5377, "step": 3878 }, { "epoch": 0.648743571518167, "grad_norm": 4.708240032196045, "learning_rate": 1.655091821542886e-05, "loss": 2.3876, "step": 3879 }, { "epoch": 0.6489108165739851, "grad_norm": 3.609867572784424, "learning_rate": 1.653718271536174e-05, "loss": 2.2576, "step": 3880 }, { "epoch": 0.6490780616298031, "grad_norm": 5.309292793273926, "learning_rate": 1.6523450099915304e-05, "loss": 2.6376, "step": 3881 }, { "epoch": 0.6492453066856211, "grad_norm": 6.358583927154541, "learning_rate": 1.6509720373770425e-05, "loss": 3.1231, "step": 3882 }, { "epoch": 0.6494125517414392, "grad_norm": 9.046334266662598, "learning_rate": 1.6495993541606995e-05, "loss": 3.4203, "step": 3883 }, { "epoch": 0.6495797967972572, "grad_norm": 3.426180601119995, "learning_rate": 1.648226960810392e-05, "loss": 2.8881, "step": 3884 }, { "epoch": 0.6497470418530752, "grad_norm": 1.8677431344985962, "learning_rate": 1.6468548577939113e-05, "loss": 2.2453, "step": 3885 }, { "epoch": 0.6499142869088933, "grad_norm": 2.09354829788208, "learning_rate": 1.6454830455789492e-05, "loss": 2.4704, "step": 3886 }, { "epoch": 0.6500815319647113, "grad_norm": 5.226034164428711, "learning_rate": 1.6441115246331e-05, "loss": 2.9722, "step": 3887 }, { "epoch": 0.6502487770205293, "grad_norm": 3.567728042602539, "learning_rate": 1.6427402954238587e-05, "loss": 3.0948, "step": 3888 }, { "epoch": 0.6504160220763474, "grad_norm": 4.638893127441406, "learning_rate": 1.6413693584186182e-05, "loss": 2.9786, "step": 3889 }, { "epoch": 0.6505832671321654, "grad_norm": 4.087899684906006, "learning_rate": 1.639998714084675e-05, "loss": 2.6201, "step": 3890 }, { "epoch": 0.6507505121879834, "grad_norm": 9.371967315673828, "learning_rate": 1.6386283628892235e-05, "loss": 3.0167, "step": 3891 }, { "epoch": 0.6509177572438015, "grad_norm": 4.968016147613525, "learning_rate": 1.6372583052993596e-05, "loss": 2.7874, "step": 3892 }, { "epoch": 0.6510850022996195, "grad_norm": 3.250472068786621, "learning_rate": 1.635888541782079e-05, "loss": 2.642, "step": 3893 }, { "epoch": 0.6512522473554375, "grad_norm": 6.279197692871094, "learning_rate": 1.634519072804275e-05, "loss": 2.9413, "step": 3894 }, { "epoch": 0.6514194924112556, "grad_norm": 3.6681816577911377, "learning_rate": 1.6331498988327454e-05, "loss": 2.9313, "step": 3895 }, { "epoch": 0.6515867374670736, "grad_norm": 4.263389587402344, "learning_rate": 1.631781020334183e-05, "loss": 2.5879, "step": 3896 }, { "epoch": 0.6517539825228916, "grad_norm": 8.403786659240723, "learning_rate": 1.6304124377751803e-05, "loss": 3.1038, "step": 3897 }, { "epoch": 0.6519212275787097, "grad_norm": 4.1622843742370605, "learning_rate": 1.629044151622231e-05, "loss": 2.6818, "step": 3898 }, { "epoch": 0.6520884726345277, "grad_norm": 4.522341728210449, "learning_rate": 1.627676162341727e-05, "loss": 2.7218, "step": 3899 }, { "epoch": 0.6522557176903457, "grad_norm": 3.679506778717041, "learning_rate": 1.6263084703999572e-05, "loss": 2.6867, "step": 3900 }, { "epoch": 0.6524229627461638, "grad_norm": 3.018371343612671, "learning_rate": 1.6249410762631113e-05, "loss": 2.5453, "step": 3901 }, { "epoch": 0.6525902078019818, "grad_norm": 3.960871696472168, "learning_rate": 1.623573980397276e-05, "loss": 2.8781, "step": 3902 }, { "epoch": 0.6527574528577998, "grad_norm": 5.53309440612793, "learning_rate": 1.6222071832684395e-05, "loss": 2.5082, "step": 3903 }, { "epoch": 0.652924697913618, "grad_norm": 3.450801134109497, "learning_rate": 1.6208406853424836e-05, "loss": 2.4606, "step": 3904 }, { "epoch": 0.653091942969436, "grad_norm": 8.188538551330566, "learning_rate": 1.6194744870851913e-05, "loss": 3.0044, "step": 3905 }, { "epoch": 0.653259188025254, "grad_norm": 5.846758842468262, "learning_rate": 1.6181085889622427e-05, "loss": 3.0495, "step": 3906 }, { "epoch": 0.6534264330810721, "grad_norm": 10.226333618164062, "learning_rate": 1.6167429914392144e-05, "loss": 3.1952, "step": 3907 }, { "epoch": 0.6535936781368901, "grad_norm": 7.836836814880371, "learning_rate": 1.6153776949815824e-05, "loss": 3.3741, "step": 3908 }, { "epoch": 0.6537609231927081, "grad_norm": 2.9586191177368164, "learning_rate": 1.6140127000547185e-05, "loss": 2.5099, "step": 3909 }, { "epoch": 0.6539281682485262, "grad_norm": 4.146342754364014, "learning_rate": 1.6126480071238924e-05, "loss": 2.7312, "step": 3910 }, { "epoch": 0.6540954133043442, "grad_norm": 4.214728355407715, "learning_rate": 1.611283616654272e-05, "loss": 3.0831, "step": 3911 }, { "epoch": 0.6542626583601622, "grad_norm": 3.040491819381714, "learning_rate": 1.60991952911092e-05, "loss": 2.733, "step": 3912 }, { "epoch": 0.6544299034159803, "grad_norm": 5.800417423248291, "learning_rate": 1.6085557449587975e-05, "loss": 3.2382, "step": 3913 }, { "epoch": 0.6545971484717983, "grad_norm": 3.919419765472412, "learning_rate": 1.6071922646627603e-05, "loss": 2.5211, "step": 3914 }, { "epoch": 0.6547643935276164, "grad_norm": 3.2085297107696533, "learning_rate": 1.6058290886875633e-05, "loss": 2.762, "step": 3915 }, { "epoch": 0.6549316385834344, "grad_norm": 5.549785614013672, "learning_rate": 1.604466217497855e-05, "loss": 2.8343, "step": 3916 }, { "epoch": 0.6550988836392524, "grad_norm": 5.52817964553833, "learning_rate": 1.603103651558182e-05, "loss": 2.6601, "step": 3917 }, { "epoch": 0.6552661286950705, "grad_norm": 5.268473148345947, "learning_rate": 1.601741391332985e-05, "loss": 2.8654, "step": 3918 }, { "epoch": 0.6554333737508885, "grad_norm": 2.887836456298828, "learning_rate": 1.600379437286603e-05, "loss": 2.6203, "step": 3919 }, { "epoch": 0.6556006188067065, "grad_norm": 4.110098361968994, "learning_rate": 1.5990177898832687e-05, "loss": 2.6739, "step": 3920 }, { "epoch": 0.6557678638625246, "grad_norm": 7.931051731109619, "learning_rate": 1.597656449587111e-05, "loss": 3.5181, "step": 3921 }, { "epoch": 0.6559351089183426, "grad_norm": 6.375673294067383, "learning_rate": 1.596295416862153e-05, "loss": 2.9316, "step": 3922 }, { "epoch": 0.6561023539741606, "grad_norm": 2.784280300140381, "learning_rate": 1.5949346921723153e-05, "loss": 2.6697, "step": 3923 }, { "epoch": 0.6562695990299787, "grad_norm": 4.330873012542725, "learning_rate": 1.5935742759814102e-05, "loss": 2.9794, "step": 3924 }, { "epoch": 0.6564368440857967, "grad_norm": 3.7207207679748535, "learning_rate": 1.5922141687531487e-05, "loss": 2.5727, "step": 3925 }, { "epoch": 0.6566040891416147, "grad_norm": 7.046762466430664, "learning_rate": 1.590854370951133e-05, "loss": 2.7895, "step": 3926 }, { "epoch": 0.6567713341974328, "grad_norm": 3.9868853092193604, "learning_rate": 1.5894948830388618e-05, "loss": 1.9735, "step": 3927 }, { "epoch": 0.6569385792532508, "grad_norm": 4.494137763977051, "learning_rate": 1.588135705479728e-05, "loss": 3.0386, "step": 3928 }, { "epoch": 0.6571058243090688, "grad_norm": 5.099454402923584, "learning_rate": 1.586776838737018e-05, "loss": 2.7567, "step": 3929 }, { "epoch": 0.6572730693648869, "grad_norm": 5.327308654785156, "learning_rate": 1.5854182832739133e-05, "loss": 2.7681, "step": 3930 }, { "epoch": 0.6574403144207049, "grad_norm": 3.5808284282684326, "learning_rate": 1.584060039553488e-05, "loss": 2.6989, "step": 3931 }, { "epoch": 0.6576075594765229, "grad_norm": 5.972328186035156, "learning_rate": 1.582702108038711e-05, "loss": 2.7618, "step": 3932 }, { "epoch": 0.657774804532341, "grad_norm": 7.723893642425537, "learning_rate": 1.5813444891924444e-05, "loss": 2.445, "step": 3933 }, { "epoch": 0.657942049588159, "grad_norm": 3.4955997467041016, "learning_rate": 1.5799871834774434e-05, "loss": 2.3301, "step": 3934 }, { "epoch": 0.658109294643977, "grad_norm": 4.264527797698975, "learning_rate": 1.578630191356356e-05, "loss": 2.6033, "step": 3935 }, { "epoch": 0.6582765396997952, "grad_norm": 4.656225204467773, "learning_rate": 1.577273513291726e-05, "loss": 2.4483, "step": 3936 }, { "epoch": 0.6584437847556132, "grad_norm": 14.345069885253906, "learning_rate": 1.5759171497459872e-05, "loss": 2.7459, "step": 3937 }, { "epoch": 0.6586110298114312, "grad_norm": 5.7522358894348145, "learning_rate": 1.574561101181468e-05, "loss": 2.7493, "step": 3938 }, { "epoch": 0.6587782748672493, "grad_norm": 6.336310863494873, "learning_rate": 1.5732053680603875e-05, "loss": 2.8613, "step": 3939 }, { "epoch": 0.6589455199230673, "grad_norm": 6.259703636169434, "learning_rate": 1.571849950844859e-05, "loss": 2.7083, "step": 3940 }, { "epoch": 0.6591127649788853, "grad_norm": 4.549555778503418, "learning_rate": 1.5704948499968875e-05, "loss": 2.4386, "step": 3941 }, { "epoch": 0.6592800100347034, "grad_norm": 6.351601600646973, "learning_rate": 1.5691400659783707e-05, "loss": 2.904, "step": 3942 }, { "epoch": 0.6594472550905214, "grad_norm": 4.086788177490234, "learning_rate": 1.5677855992510966e-05, "loss": 3.0624, "step": 3943 }, { "epoch": 0.6596145001463394, "grad_norm": 3.8338422775268555, "learning_rate": 1.5664314502767475e-05, "loss": 2.5174, "step": 3944 }, { "epoch": 0.6597817452021575, "grad_norm": 4.013120651245117, "learning_rate": 1.5650776195168958e-05, "loss": 2.569, "step": 3945 }, { "epoch": 0.6599489902579755, "grad_norm": 6.107297420501709, "learning_rate": 1.5637241074330064e-05, "loss": 3.4505, "step": 3946 }, { "epoch": 0.6601162353137935, "grad_norm": 7.463050842285156, "learning_rate": 1.562370914486434e-05, "loss": 3.3897, "step": 3947 }, { "epoch": 0.6602834803696116, "grad_norm": 2.742060661315918, "learning_rate": 1.5610180411384262e-05, "loss": 2.3763, "step": 3948 }, { "epoch": 0.6604507254254296, "grad_norm": 3.5281829833984375, "learning_rate": 1.5596654878501204e-05, "loss": 2.8835, "step": 3949 }, { "epoch": 0.6606179704812476, "grad_norm": 11.327028274536133, "learning_rate": 1.5583132550825462e-05, "loss": 4.2708, "step": 3950 }, { "epoch": 0.6607852155370657, "grad_norm": 9.439321517944336, "learning_rate": 1.5569613432966217e-05, "loss": 2.9863, "step": 3951 }, { "epoch": 0.6609524605928837, "grad_norm": 4.977163314819336, "learning_rate": 1.555609752953159e-05, "loss": 2.4732, "step": 3952 }, { "epoch": 0.6611197056487017, "grad_norm": 5.029117107391357, "learning_rate": 1.5542584845128582e-05, "loss": 3.0097, "step": 3953 }, { "epoch": 0.6612869507045198, "grad_norm": 4.613850116729736, "learning_rate": 1.5529075384363102e-05, "loss": 2.765, "step": 3954 }, { "epoch": 0.6614541957603378, "grad_norm": 7.652193546295166, "learning_rate": 1.5515569151839952e-05, "loss": 2.8929, "step": 3955 }, { "epoch": 0.6616214408161559, "grad_norm": 7.342623233795166, "learning_rate": 1.5502066152162858e-05, "loss": 2.7101, "step": 3956 }, { "epoch": 0.6617886858719739, "grad_norm": 3.9656147956848145, "learning_rate": 1.5488566389934416e-05, "loss": 2.6679, "step": 3957 }, { "epoch": 0.6619559309277919, "grad_norm": 5.02079963684082, "learning_rate": 1.5475069869756136e-05, "loss": 2.625, "step": 3958 }, { "epoch": 0.66212317598361, "grad_norm": 6.637492656707764, "learning_rate": 1.5461576596228415e-05, "loss": 2.8615, "step": 3959 }, { "epoch": 0.662290421039428, "grad_norm": 9.929197311401367, "learning_rate": 1.5448086573950535e-05, "loss": 2.8183, "step": 3960 }, { "epoch": 0.662457666095246, "grad_norm": 3.3156049251556396, "learning_rate": 1.5434599807520704e-05, "loss": 2.9325, "step": 3961 }, { "epoch": 0.6626249111510641, "grad_norm": 5.075874328613281, "learning_rate": 1.542111630153598e-05, "loss": 2.7787, "step": 3962 }, { "epoch": 0.6627921562068821, "grad_norm": 3.8324596881866455, "learning_rate": 1.5407636060592336e-05, "loss": 2.6291, "step": 3963 }, { "epoch": 0.6629594012627001, "grad_norm": 5.974466800689697, "learning_rate": 1.539415908928462e-05, "loss": 2.7928, "step": 3964 }, { "epoch": 0.6631266463185183, "grad_norm": 13.587583541870117, "learning_rate": 1.538068539220656e-05, "loss": 3.1794, "step": 3965 }, { "epoch": 0.6632938913743363, "grad_norm": 6.183319091796875, "learning_rate": 1.5367214973950788e-05, "loss": 3.2049, "step": 3966 }, { "epoch": 0.6634611364301543, "grad_norm": 3.4161434173583984, "learning_rate": 1.5353747839108802e-05, "loss": 2.7036, "step": 3967 }, { "epoch": 0.6636283814859724, "grad_norm": 3.398043155670166, "learning_rate": 1.534028399227098e-05, "loss": 2.6459, "step": 3968 }, { "epoch": 0.6637956265417904, "grad_norm": 6.655284881591797, "learning_rate": 1.5326823438026595e-05, "loss": 2.8824, "step": 3969 }, { "epoch": 0.6639628715976084, "grad_norm": 8.815752983093262, "learning_rate": 1.531336618096378e-05, "loss": 2.7963, "step": 3970 }, { "epoch": 0.6641301166534265, "grad_norm": 2.8620543479919434, "learning_rate": 1.529991222566955e-05, "loss": 2.3885, "step": 3971 }, { "epoch": 0.6642973617092445, "grad_norm": 3.0645570755004883, "learning_rate": 1.5286461576729806e-05, "loss": 2.5135, "step": 3972 }, { "epoch": 0.6644646067650625, "grad_norm": 3.929450750350952, "learning_rate": 1.52730142387293e-05, "loss": 2.8613, "step": 3973 }, { "epoch": 0.6646318518208806, "grad_norm": 3.3572654724121094, "learning_rate": 1.5259570216251672e-05, "loss": 2.3671, "step": 3974 }, { "epoch": 0.6647990968766986, "grad_norm": 5.646228313446045, "learning_rate": 1.5246129513879431e-05, "loss": 2.6988, "step": 3975 }, { "epoch": 0.6649663419325166, "grad_norm": 8.635396003723145, "learning_rate": 1.5232692136193938e-05, "loss": 1.9925, "step": 3976 }, { "epoch": 0.6651335869883347, "grad_norm": 3.2180211544036865, "learning_rate": 1.5219258087775451e-05, "loss": 2.6954, "step": 3977 }, { "epoch": 0.6653008320441527, "grad_norm": 5.1684250831604, "learning_rate": 1.5205827373203069e-05, "loss": 2.605, "step": 3978 }, { "epoch": 0.6654680770999707, "grad_norm": 4.556488990783691, "learning_rate": 1.5192399997054757e-05, "loss": 2.331, "step": 3979 }, { "epoch": 0.6656353221557888, "grad_norm": 6.3118896484375, "learning_rate": 1.5178975963907354e-05, "loss": 3.2785, "step": 3980 }, { "epoch": 0.6658025672116068, "grad_norm": 8.329947471618652, "learning_rate": 1.5165555278336546e-05, "loss": 3.432, "step": 3981 }, { "epoch": 0.6659698122674248, "grad_norm": 2.76436710357666, "learning_rate": 1.5152137944916882e-05, "loss": 2.7475, "step": 3982 }, { "epoch": 0.6661370573232429, "grad_norm": 6.655486583709717, "learning_rate": 1.5138723968221774e-05, "loss": 3.2504, "step": 3983 }, { "epoch": 0.6663043023790609, "grad_norm": 4.9133453369140625, "learning_rate": 1.5125313352823478e-05, "loss": 2.7729, "step": 3984 }, { "epoch": 0.6664715474348789, "grad_norm": 6.774014949798584, "learning_rate": 1.5111906103293127e-05, "loss": 3.0476, "step": 3985 }, { "epoch": 0.666638792490697, "grad_norm": 3.9590208530426025, "learning_rate": 1.5098502224200683e-05, "loss": 2.5946, "step": 3986 }, { "epoch": 0.666806037546515, "grad_norm": 6.5320329666137695, "learning_rate": 1.5085101720114972e-05, "loss": 2.4259, "step": 3987 }, { "epoch": 0.666973282602333, "grad_norm": 4.436924457550049, "learning_rate": 1.5071704595603659e-05, "loss": 2.6204, "step": 3988 }, { "epoch": 0.6671405276581511, "grad_norm": 4.205306529998779, "learning_rate": 1.5058310855233272e-05, "loss": 2.6741, "step": 3989 }, { "epoch": 0.6673077727139691, "grad_norm": 3.0880043506622314, "learning_rate": 1.5044920503569168e-05, "loss": 2.7287, "step": 3990 }, { "epoch": 0.6674750177697871, "grad_norm": 4.168480396270752, "learning_rate": 1.5031533545175565e-05, "loss": 2.6745, "step": 3991 }, { "epoch": 0.6676422628256052, "grad_norm": 3.3411364555358887, "learning_rate": 1.5018149984615506e-05, "loss": 2.7043, "step": 3992 }, { "epoch": 0.6678095078814232, "grad_norm": 3.593954086303711, "learning_rate": 1.500476982645091e-05, "loss": 2.7127, "step": 3993 }, { "epoch": 0.6679767529372413, "grad_norm": 3.908499240875244, "learning_rate": 1.4991393075242497e-05, "loss": 2.4742, "step": 3994 }, { "epoch": 0.6681439979930593, "grad_norm": 3.2244455814361572, "learning_rate": 1.4978019735549847e-05, "loss": 2.7287, "step": 3995 }, { "epoch": 0.6683112430488773, "grad_norm": 5.717851638793945, "learning_rate": 1.4964649811931371e-05, "loss": 2.956, "step": 3996 }, { "epoch": 0.6684784881046955, "grad_norm": 4.6269025802612305, "learning_rate": 1.495128330894432e-05, "loss": 2.4888, "step": 3997 }, { "epoch": 0.6686457331605135, "grad_norm": 6.1578569412231445, "learning_rate": 1.4937920231144772e-05, "loss": 2.9199, "step": 3998 }, { "epoch": 0.6688129782163315, "grad_norm": 4.952661037445068, "learning_rate": 1.4924560583087643e-05, "loss": 3.0536, "step": 3999 }, { "epoch": 0.6689802232721496, "grad_norm": 5.217294216156006, "learning_rate": 1.4911204369326682e-05, "loss": 2.5944, "step": 4000 }, { "epoch": 0.6691474683279676, "grad_norm": 5.717238903045654, "learning_rate": 1.4897851594414455e-05, "loss": 2.9593, "step": 4001 }, { "epoch": 0.6693147133837856, "grad_norm": 6.55879020690918, "learning_rate": 1.4884502262902378e-05, "loss": 3.0425, "step": 4002 }, { "epoch": 0.6694819584396037, "grad_norm": 4.209293842315674, "learning_rate": 1.487115637934067e-05, "loss": 2.6765, "step": 4003 }, { "epoch": 0.6696492034954217, "grad_norm": 4.630340576171875, "learning_rate": 1.485781394827839e-05, "loss": 2.8087, "step": 4004 }, { "epoch": 0.6698164485512397, "grad_norm": 6.203700065612793, "learning_rate": 1.4844474974263412e-05, "loss": 2.3114, "step": 4005 }, { "epoch": 0.6699836936070578, "grad_norm": 4.031764984130859, "learning_rate": 1.4831139461842435e-05, "loss": 2.6488, "step": 4006 }, { "epoch": 0.6701509386628758, "grad_norm": 4.859573841094971, "learning_rate": 1.4817807415560982e-05, "loss": 2.903, "step": 4007 }, { "epoch": 0.6703181837186938, "grad_norm": 3.821202278137207, "learning_rate": 1.4804478839963387e-05, "loss": 2.6708, "step": 4008 }, { "epoch": 0.6704854287745119, "grad_norm": 3.3460049629211426, "learning_rate": 1.4791153739592793e-05, "loss": 2.6329, "step": 4009 }, { "epoch": 0.6706526738303299, "grad_norm": 7.778902530670166, "learning_rate": 1.4777832118991192e-05, "loss": 3.0458, "step": 4010 }, { "epoch": 0.6708199188861479, "grad_norm": 3.889526605606079, "learning_rate": 1.4764513982699354e-05, "loss": 2.3517, "step": 4011 }, { "epoch": 0.670987163941966, "grad_norm": 3.745624542236328, "learning_rate": 1.475119933525688e-05, "loss": 2.5618, "step": 4012 }, { "epoch": 0.671154408997784, "grad_norm": 4.574234962463379, "learning_rate": 1.4737888181202176e-05, "loss": 2.6603, "step": 4013 }, { "epoch": 0.671321654053602, "grad_norm": 12.781251907348633, "learning_rate": 1.4724580525072451e-05, "loss": 2.987, "step": 4014 }, { "epoch": 0.6714888991094201, "grad_norm": 6.936892986297607, "learning_rate": 1.4711276371403739e-05, "loss": 3.18, "step": 4015 }, { "epoch": 0.6716561441652381, "grad_norm": 4.403237342834473, "learning_rate": 1.4697975724730863e-05, "loss": 2.3448, "step": 4016 }, { "epoch": 0.6718233892210561, "grad_norm": 4.474015235900879, "learning_rate": 1.4684678589587458e-05, "loss": 2.6845, "step": 4017 }, { "epoch": 0.6719906342768742, "grad_norm": 5.938997268676758, "learning_rate": 1.4671384970505964e-05, "loss": 2.9611, "step": 4018 }, { "epoch": 0.6721578793326922, "grad_norm": 4.5181660652160645, "learning_rate": 1.4658094872017625e-05, "loss": 2.7093, "step": 4019 }, { "epoch": 0.6723251243885102, "grad_norm": 2.9384188652038574, "learning_rate": 1.4644808298652468e-05, "loss": 2.6325, "step": 4020 }, { "epoch": 0.6724923694443283, "grad_norm": 6.239288330078125, "learning_rate": 1.463152525493934e-05, "loss": 2.7846, "step": 4021 }, { "epoch": 0.6726596145001463, "grad_norm": 5.704988479614258, "learning_rate": 1.4618245745405873e-05, "loss": 2.8518, "step": 4022 }, { "epoch": 0.6728268595559643, "grad_norm": 3.7650110721588135, "learning_rate": 1.4604969774578498e-05, "loss": 2.8609, "step": 4023 }, { "epoch": 0.6729941046117824, "grad_norm": 4.286537170410156, "learning_rate": 1.4591697346982434e-05, "loss": 2.6393, "step": 4024 }, { "epoch": 0.6731613496676004, "grad_norm": 7.866906642913818, "learning_rate": 1.4578428467141692e-05, "loss": 3.0229, "step": 4025 }, { "epoch": 0.6733285947234184, "grad_norm": 5.708764553070068, "learning_rate": 1.4565163139579091e-05, "loss": 2.9654, "step": 4026 }, { "epoch": 0.6734958397792365, "grad_norm": 4.19986629486084, "learning_rate": 1.455190136881623e-05, "loss": 2.8041, "step": 4027 }, { "epoch": 0.6736630848350545, "grad_norm": 5.846632957458496, "learning_rate": 1.4538643159373472e-05, "loss": 2.8738, "step": 4028 }, { "epoch": 0.6738303298908725, "grad_norm": 2.3797178268432617, "learning_rate": 1.452538851577001e-05, "loss": 2.3571, "step": 4029 }, { "epoch": 0.6739975749466907, "grad_norm": 4.435003280639648, "learning_rate": 1.4512137442523777e-05, "loss": 2.5416, "step": 4030 }, { "epoch": 0.6741648200025087, "grad_norm": 2.782670736312866, "learning_rate": 1.4498889944151528e-05, "loss": 3.1009, "step": 4031 }, { "epoch": 0.6743320650583268, "grad_norm": 4.358791351318359, "learning_rate": 1.4485646025168764e-05, "loss": 2.4213, "step": 4032 }, { "epoch": 0.6744993101141448, "grad_norm": 3.5689263343811035, "learning_rate": 1.4472405690089801e-05, "loss": 2.6681, "step": 4033 }, { "epoch": 0.6746665551699628, "grad_norm": 5.0613274574279785, "learning_rate": 1.4459168943427703e-05, "loss": 3.4409, "step": 4034 }, { "epoch": 0.6748338002257809, "grad_norm": 6.976962089538574, "learning_rate": 1.444593578969432e-05, "loss": 2.7695, "step": 4035 }, { "epoch": 0.6750010452815989, "grad_norm": 4.058055877685547, "learning_rate": 1.4432706233400306e-05, "loss": 2.7266, "step": 4036 }, { "epoch": 0.6751682903374169, "grad_norm": 5.595638751983643, "learning_rate": 1.4419480279055032e-05, "loss": 2.6258, "step": 4037 }, { "epoch": 0.675335535393235, "grad_norm": 4.249335289001465, "learning_rate": 1.4406257931166697e-05, "loss": 2.6198, "step": 4038 }, { "epoch": 0.675502780449053, "grad_norm": 5.392949104309082, "learning_rate": 1.439303919424223e-05, "loss": 2.6245, "step": 4039 }, { "epoch": 0.675670025504871, "grad_norm": 5.159661293029785, "learning_rate": 1.4379824072787362e-05, "loss": 2.7661, "step": 4040 }, { "epoch": 0.6758372705606891, "grad_norm": 3.7448267936706543, "learning_rate": 1.4366612571306559e-05, "loss": 2.4377, "step": 4041 }, { "epoch": 0.6760045156165071, "grad_norm": 5.159013271331787, "learning_rate": 1.4353404694303074e-05, "loss": 2.7494, "step": 4042 }, { "epoch": 0.6761717606723251, "grad_norm": 6.937900066375732, "learning_rate": 1.4340200446278923e-05, "loss": 2.9957, "step": 4043 }, { "epoch": 0.6763390057281432, "grad_norm": 14.384710311889648, "learning_rate": 1.4326999831734896e-05, "loss": 3.7007, "step": 4044 }, { "epoch": 0.6765062507839612, "grad_norm": 9.43464469909668, "learning_rate": 1.4313802855170509e-05, "loss": 2.7697, "step": 4045 }, { "epoch": 0.6766734958397792, "grad_norm": 3.2253048419952393, "learning_rate": 1.4300609521084074e-05, "loss": 2.6145, "step": 4046 }, { "epoch": 0.6768407408955973, "grad_norm": 9.860648155212402, "learning_rate": 1.4287419833972634e-05, "loss": 2.8564, "step": 4047 }, { "epoch": 0.6770079859514153, "grad_norm": 3.2782795429229736, "learning_rate": 1.427423379833202e-05, "loss": 2.6509, "step": 4048 }, { "epoch": 0.6771752310072333, "grad_norm": 4.889387130737305, "learning_rate": 1.4261051418656779e-05, "loss": 2.8009, "step": 4049 }, { "epoch": 0.6773424760630514, "grad_norm": 4.11689567565918, "learning_rate": 1.4247872699440246e-05, "loss": 2.6945, "step": 4050 }, { "epoch": 0.6775097211188694, "grad_norm": 3.1678712368011475, "learning_rate": 1.4234697645174499e-05, "loss": 2.419, "step": 4051 }, { "epoch": 0.6776769661746874, "grad_norm": 2.618826150894165, "learning_rate": 1.4221526260350354e-05, "loss": 2.1425, "step": 4052 }, { "epoch": 0.6778442112305055, "grad_norm": 4.194958686828613, "learning_rate": 1.4208358549457396e-05, "loss": 2.631, "step": 4053 }, { "epoch": 0.6780114562863235, "grad_norm": 2.7583491802215576, "learning_rate": 1.4195194516983928e-05, "loss": 2.5543, "step": 4054 }, { "epoch": 0.6781787013421415, "grad_norm": 5.6926422119140625, "learning_rate": 1.4182034167417047e-05, "loss": 2.3416, "step": 4055 }, { "epoch": 0.6783459463979596, "grad_norm": 4.2574663162231445, "learning_rate": 1.4168877505242539e-05, "loss": 2.6278, "step": 4056 }, { "epoch": 0.6785131914537776, "grad_norm": 8.424081802368164, "learning_rate": 1.415572453494498e-05, "loss": 3.7378, "step": 4057 }, { "epoch": 0.6786804365095956, "grad_norm": 5.686334133148193, "learning_rate": 1.414257526100764e-05, "loss": 2.4114, "step": 4058 }, { "epoch": 0.6788476815654138, "grad_norm": 4.189460754394531, "learning_rate": 1.4129429687912595e-05, "loss": 2.7077, "step": 4059 }, { "epoch": 0.6790149266212318, "grad_norm": 4.254843235015869, "learning_rate": 1.4116287820140595e-05, "loss": 2.8551, "step": 4060 }, { "epoch": 0.6791821716770498, "grad_norm": 4.039995193481445, "learning_rate": 1.4103149662171169e-05, "loss": 2.4978, "step": 4061 }, { "epoch": 0.6793494167328679, "grad_norm": 5.588167667388916, "learning_rate": 1.409001521848255e-05, "loss": 2.7555, "step": 4062 }, { "epoch": 0.6795166617886859, "grad_norm": 4.775662422180176, "learning_rate": 1.4076884493551736e-05, "loss": 2.3861, "step": 4063 }, { "epoch": 0.6796839068445039, "grad_norm": 3.2782235145568848, "learning_rate": 1.406375749185443e-05, "loss": 2.951, "step": 4064 }, { "epoch": 0.679851151900322, "grad_norm": 2.981173515319824, "learning_rate": 1.4050634217865089e-05, "loss": 2.499, "step": 4065 }, { "epoch": 0.68001839695614, "grad_norm": 7.255524158477783, "learning_rate": 1.4037514676056872e-05, "loss": 2.4, "step": 4066 }, { "epoch": 0.680185642011958, "grad_norm": 4.140323638916016, "learning_rate": 1.4024398870901697e-05, "loss": 2.3906, "step": 4067 }, { "epoch": 0.6803528870677761, "grad_norm": 4.973621845245361, "learning_rate": 1.40112868068702e-05, "loss": 3.0873, "step": 4068 }, { "epoch": 0.6805201321235941, "grad_norm": 4.36267614364624, "learning_rate": 1.3998178488431712e-05, "loss": 2.7294, "step": 4069 }, { "epoch": 0.6806873771794121, "grad_norm": 5.370552062988281, "learning_rate": 1.3985073920054337e-05, "loss": 2.7964, "step": 4070 }, { "epoch": 0.6808546222352302, "grad_norm": 5.392683506011963, "learning_rate": 1.3971973106204856e-05, "loss": 2.6064, "step": 4071 }, { "epoch": 0.6810218672910482, "grad_norm": 4.819271087646484, "learning_rate": 1.3958876051348798e-05, "loss": 2.763, "step": 4072 }, { "epoch": 0.6811891123468663, "grad_norm": 5.021207809448242, "learning_rate": 1.3945782759950393e-05, "loss": 2.7951, "step": 4073 }, { "epoch": 0.6813563574026843, "grad_norm": 4.253841876983643, "learning_rate": 1.393269323647261e-05, "loss": 2.727, "step": 4074 }, { "epoch": 0.6815236024585023, "grad_norm": 7.775555610656738, "learning_rate": 1.3919607485377094e-05, "loss": 2.9884, "step": 4075 }, { "epoch": 0.6816908475143204, "grad_norm": 4.3784894943237305, "learning_rate": 1.390652551112427e-05, "loss": 2.5879, "step": 4076 }, { "epoch": 0.6818580925701384, "grad_norm": 4.2864203453063965, "learning_rate": 1.3893447318173205e-05, "loss": 2.9156, "step": 4077 }, { "epoch": 0.6820253376259564, "grad_norm": 6.518377780914307, "learning_rate": 1.388037291098173e-05, "loss": 2.6571, "step": 4078 }, { "epoch": 0.6821925826817745, "grad_norm": 4.527149677276611, "learning_rate": 1.386730229400634e-05, "loss": 2.3838, "step": 4079 }, { "epoch": 0.6823598277375925, "grad_norm": 4.097616195678711, "learning_rate": 1.3854235471702287e-05, "loss": 2.5604, "step": 4080 }, { "epoch": 0.6825270727934105, "grad_norm": 4.942432880401611, "learning_rate": 1.3841172448523492e-05, "loss": 2.7327, "step": 4081 }, { "epoch": 0.6826943178492286, "grad_norm": 3.523773670196533, "learning_rate": 1.3828113228922585e-05, "loss": 2.4534, "step": 4082 }, { "epoch": 0.6828615629050466, "grad_norm": 7.602453231811523, "learning_rate": 1.3815057817350915e-05, "loss": 3.0743, "step": 4083 }, { "epoch": 0.6830288079608646, "grad_norm": 4.255028247833252, "learning_rate": 1.380200621825853e-05, "loss": 2.6638, "step": 4084 }, { "epoch": 0.6831960530166827, "grad_norm": 4.773059368133545, "learning_rate": 1.3788958436094182e-05, "loss": 2.5516, "step": 4085 }, { "epoch": 0.6833632980725007, "grad_norm": 6.93398380279541, "learning_rate": 1.3775914475305296e-05, "loss": 2.4026, "step": 4086 }, { "epoch": 0.6835305431283187, "grad_norm": 3.475966691970825, "learning_rate": 1.3762874340338034e-05, "loss": 2.5607, "step": 4087 }, { "epoch": 0.6836977881841368, "grad_norm": 5.60888147354126, "learning_rate": 1.3749838035637208e-05, "loss": 3.219, "step": 4088 }, { "epoch": 0.6838650332399548, "grad_norm": 4.627189636230469, "learning_rate": 1.3736805565646377e-05, "loss": 2.775, "step": 4089 }, { "epoch": 0.6840322782957728, "grad_norm": 5.217837333679199, "learning_rate": 1.3723776934807736e-05, "loss": 2.9157, "step": 4090 }, { "epoch": 0.684199523351591, "grad_norm": 4.220655918121338, "learning_rate": 1.371075214756222e-05, "loss": 2.4388, "step": 4091 }, { "epoch": 0.684366768407409, "grad_norm": 8.147871017456055, "learning_rate": 1.3697731208349438e-05, "loss": 2.974, "step": 4092 }, { "epoch": 0.684534013463227, "grad_norm": 5.036416530609131, "learning_rate": 1.3684714121607668e-05, "loss": 2.233, "step": 4093 }, { "epoch": 0.6847012585190451, "grad_norm": 5.557186126708984, "learning_rate": 1.3671700891773908e-05, "loss": 2.1467, "step": 4094 }, { "epoch": 0.6848685035748631, "grad_norm": 9.626578330993652, "learning_rate": 1.3658691523283804e-05, "loss": 3.1121, "step": 4095 }, { "epoch": 0.6850357486306811, "grad_norm": 3.67147159576416, "learning_rate": 1.3645686020571729e-05, "loss": 3.0004, "step": 4096 }, { "epoch": 0.6852029936864992, "grad_norm": 4.74141788482666, "learning_rate": 1.3632684388070693e-05, "loss": 2.8766, "step": 4097 }, { "epoch": 0.6853702387423172, "grad_norm": 5.945125102996826, "learning_rate": 1.3619686630212431e-05, "loss": 2.6494, "step": 4098 }, { "epoch": 0.6855374837981352, "grad_norm": 6.3758955001831055, "learning_rate": 1.36066927514273e-05, "loss": 2.9856, "step": 4099 }, { "epoch": 0.6857047288539533, "grad_norm": 2.80767822265625, "learning_rate": 1.3593702756144417e-05, "loss": 2.7973, "step": 4100 }, { "epoch": 0.6858719739097713, "grad_norm": 4.161080837249756, "learning_rate": 1.3580716648791492e-05, "loss": 2.7312, "step": 4101 }, { "epoch": 0.6860392189655893, "grad_norm": 2.5620484352111816, "learning_rate": 1.3567734433794973e-05, "loss": 2.469, "step": 4102 }, { "epoch": 0.6862064640214074, "grad_norm": 3.5903866291046143, "learning_rate": 1.355475611557993e-05, "loss": 2.8618, "step": 4103 }, { "epoch": 0.6863737090772254, "grad_norm": 5.460768699645996, "learning_rate": 1.3541781698570153e-05, "loss": 2.9905, "step": 4104 }, { "epoch": 0.6865409541330434, "grad_norm": 6.323298454284668, "learning_rate": 1.352881118718805e-05, "loss": 2.8626, "step": 4105 }, { "epoch": 0.6867081991888615, "grad_norm": 4.221624851226807, "learning_rate": 1.3515844585854758e-05, "loss": 2.8416, "step": 4106 }, { "epoch": 0.6868754442446795, "grad_norm": 3.475215435028076, "learning_rate": 1.3502881898990022e-05, "loss": 2.8602, "step": 4107 }, { "epoch": 0.6870426893004975, "grad_norm": 3.922999858856201, "learning_rate": 1.3489923131012289e-05, "loss": 2.5936, "step": 4108 }, { "epoch": 0.6872099343563156, "grad_norm": 4.424213409423828, "learning_rate": 1.3476968286338675e-05, "loss": 2.4651, "step": 4109 }, { "epoch": 0.6873771794121336, "grad_norm": 3.0172979831695557, "learning_rate": 1.3464017369384924e-05, "loss": 2.3801, "step": 4110 }, { "epoch": 0.6875444244679517, "grad_norm": 7.945366382598877, "learning_rate": 1.3451070384565484e-05, "loss": 2.7402, "step": 4111 }, { "epoch": 0.6877116695237697, "grad_norm": 3.1600089073181152, "learning_rate": 1.3438127336293418e-05, "loss": 2.6608, "step": 4112 }, { "epoch": 0.6878789145795877, "grad_norm": 6.083408832550049, "learning_rate": 1.342518822898049e-05, "loss": 2.6733, "step": 4113 }, { "epoch": 0.6880461596354058, "grad_norm": 4.993020057678223, "learning_rate": 1.3412253067037084e-05, "loss": 3.408, "step": 4114 }, { "epoch": 0.6882134046912238, "grad_norm": 6.982968807220459, "learning_rate": 1.3399321854872274e-05, "loss": 2.739, "step": 4115 }, { "epoch": 0.6883806497470418, "grad_norm": 8.360477447509766, "learning_rate": 1.3386394596893741e-05, "loss": 2.7663, "step": 4116 }, { "epoch": 0.6885478948028599, "grad_norm": 3.7547574043273926, "learning_rate": 1.3373471297507884e-05, "loss": 2.6858, "step": 4117 }, { "epoch": 0.6887151398586779, "grad_norm": 7.536102771759033, "learning_rate": 1.3360551961119688e-05, "loss": 2.5801, "step": 4118 }, { "epoch": 0.6888823849144959, "grad_norm": 4.568572998046875, "learning_rate": 1.3347636592132835e-05, "loss": 3.1112, "step": 4119 }, { "epoch": 0.689049629970314, "grad_norm": 3.647118091583252, "learning_rate": 1.3334725194949613e-05, "loss": 2.5405, "step": 4120 }, { "epoch": 0.689216875026132, "grad_norm": 6.568499565124512, "learning_rate": 1.3321817773970995e-05, "loss": 2.8284, "step": 4121 }, { "epoch": 0.68938412008195, "grad_norm": 3.1105310916900635, "learning_rate": 1.3308914333596573e-05, "loss": 2.8229, "step": 4122 }, { "epoch": 0.6895513651377682, "grad_norm": 4.887767791748047, "learning_rate": 1.3296014878224594e-05, "loss": 2.8321, "step": 4123 }, { "epoch": 0.6897186101935862, "grad_norm": 5.199182987213135, "learning_rate": 1.3283119412251937e-05, "loss": 3.0612, "step": 4124 }, { "epoch": 0.6898858552494042, "grad_norm": 4.665002822875977, "learning_rate": 1.3270227940074129e-05, "loss": 2.8135, "step": 4125 }, { "epoch": 0.6900531003052223, "grad_norm": 4.860243320465088, "learning_rate": 1.3257340466085345e-05, "loss": 2.8978, "step": 4126 }, { "epoch": 0.6902203453610403, "grad_norm": 3.1412408351898193, "learning_rate": 1.3244456994678372e-05, "loss": 2.5964, "step": 4127 }, { "epoch": 0.6903875904168583, "grad_norm": 4.054561138153076, "learning_rate": 1.323157753024466e-05, "loss": 2.6249, "step": 4128 }, { "epoch": 0.6905548354726764, "grad_norm": 7.311315059661865, "learning_rate": 1.3218702077174264e-05, "loss": 3.0894, "step": 4129 }, { "epoch": 0.6907220805284944, "grad_norm": 8.579916000366211, "learning_rate": 1.3205830639855904e-05, "loss": 2.7612, "step": 4130 }, { "epoch": 0.6908893255843124, "grad_norm": 4.547669410705566, "learning_rate": 1.3192963222676902e-05, "loss": 2.818, "step": 4131 }, { "epoch": 0.6910565706401305, "grad_norm": 4.294390678405762, "learning_rate": 1.3180099830023226e-05, "loss": 3.3811, "step": 4132 }, { "epoch": 0.6912238156959485, "grad_norm": 4.333841323852539, "learning_rate": 1.3167240466279473e-05, "loss": 2.8908, "step": 4133 }, { "epoch": 0.6913910607517665, "grad_norm": 6.221395492553711, "learning_rate": 1.3154385135828866e-05, "loss": 2.6291, "step": 4134 }, { "epoch": 0.6915583058075846, "grad_norm": 4.233438968658447, "learning_rate": 1.3141533843053239e-05, "loss": 2.8819, "step": 4135 }, { "epoch": 0.6917255508634026, "grad_norm": 11.914573669433594, "learning_rate": 1.3128686592333078e-05, "loss": 3.3721, "step": 4136 }, { "epoch": 0.6918927959192206, "grad_norm": 6.24445104598999, "learning_rate": 1.311584338804745e-05, "loss": 2.4356, "step": 4137 }, { "epoch": 0.6920600409750387, "grad_norm": 5.23052978515625, "learning_rate": 1.3103004234574085e-05, "loss": 3.2422, "step": 4138 }, { "epoch": 0.6922272860308567, "grad_norm": 5.645590305328369, "learning_rate": 1.3090169136289301e-05, "loss": 3.0188, "step": 4139 }, { "epoch": 0.6923945310866747, "grad_norm": 5.763430595397949, "learning_rate": 1.3077338097568062e-05, "loss": 2.6109, "step": 4140 }, { "epoch": 0.6925617761424928, "grad_norm": 24.41903305053711, "learning_rate": 1.3064511122783912e-05, "loss": 3.4415, "step": 4141 }, { "epoch": 0.6927290211983108, "grad_norm": 5.564511299133301, "learning_rate": 1.3051688216309046e-05, "loss": 2.8562, "step": 4142 }, { "epoch": 0.6928962662541288, "grad_norm": 5.231406211853027, "learning_rate": 1.3038869382514263e-05, "loss": 2.4499, "step": 4143 }, { "epoch": 0.6930635113099469, "grad_norm": 5.391275882720947, "learning_rate": 1.3026054625768944e-05, "loss": 2.7835, "step": 4144 }, { "epoch": 0.6932307563657649, "grad_norm": 6.092835903167725, "learning_rate": 1.3013243950441129e-05, "loss": 2.7148, "step": 4145 }, { "epoch": 0.6933980014215829, "grad_norm": 4.026411533355713, "learning_rate": 1.3000437360897421e-05, "loss": 2.8506, "step": 4146 }, { "epoch": 0.693565246477401, "grad_norm": 13.313684463500977, "learning_rate": 1.2987634861503067e-05, "loss": 2.4273, "step": 4147 }, { "epoch": 0.693732491533219, "grad_norm": 4.534210205078125, "learning_rate": 1.2974836456621892e-05, "loss": 2.3742, "step": 4148 }, { "epoch": 0.6938997365890371, "grad_norm": 4.819547176361084, "learning_rate": 1.2962042150616343e-05, "loss": 2.5939, "step": 4149 }, { "epoch": 0.6940669816448551, "grad_norm": 3.758559465408325, "learning_rate": 1.2949251947847472e-05, "loss": 2.8003, "step": 4150 }, { "epoch": 0.6942342267006731, "grad_norm": 6.2649688720703125, "learning_rate": 1.2936465852674906e-05, "loss": 3.1919, "step": 4151 }, { "epoch": 0.6944014717564913, "grad_norm": 10.600600242614746, "learning_rate": 1.292368386945691e-05, "loss": 2.6856, "step": 4152 }, { "epoch": 0.6945687168123093, "grad_norm": 5.592875957489014, "learning_rate": 1.291090600255031e-05, "loss": 2.9359, "step": 4153 }, { "epoch": 0.6947359618681272, "grad_norm": 9.136013984680176, "learning_rate": 1.2898132256310564e-05, "loss": 2.975, "step": 4154 }, { "epoch": 0.6949032069239454, "grad_norm": 2.760650396347046, "learning_rate": 1.2885362635091692e-05, "loss": 2.9978, "step": 4155 }, { "epoch": 0.6950704519797634, "grad_norm": 11.677093505859375, "learning_rate": 1.287259714324634e-05, "loss": 3.5211, "step": 4156 }, { "epoch": 0.6952376970355814, "grad_norm": 6.540205001831055, "learning_rate": 1.2859835785125702e-05, "loss": 1.9717, "step": 4157 }, { "epoch": 0.6954049420913995, "grad_norm": 3.155702590942383, "learning_rate": 1.2847078565079632e-05, "loss": 2.3766, "step": 4158 }, { "epoch": 0.6955721871472175, "grad_norm": 3.4289839267730713, "learning_rate": 1.2834325487456502e-05, "loss": 2.8692, "step": 4159 }, { "epoch": 0.6957394322030355, "grad_norm": 5.83174991607666, "learning_rate": 1.2821576556603326e-05, "loss": 2.4709, "step": 4160 }, { "epoch": 0.6959066772588536, "grad_norm": 4.306151866912842, "learning_rate": 1.280883177686566e-05, "loss": 2.6418, "step": 4161 }, { "epoch": 0.6960739223146716, "grad_norm": 6.234220027923584, "learning_rate": 1.2796091152587683e-05, "loss": 2.6814, "step": 4162 }, { "epoch": 0.6962411673704896, "grad_norm": 5.624479293823242, "learning_rate": 1.2783354688112125e-05, "loss": 3.3351, "step": 4163 }, { "epoch": 0.6964084124263077, "grad_norm": 4.026275634765625, "learning_rate": 1.2770622387780338e-05, "loss": 2.6625, "step": 4164 }, { "epoch": 0.6965756574821257, "grad_norm": 3.002798080444336, "learning_rate": 1.2757894255932203e-05, "loss": 2.4195, "step": 4165 }, { "epoch": 0.6967429025379437, "grad_norm": 3.2861056327819824, "learning_rate": 1.2745170296906225e-05, "loss": 2.5332, "step": 4166 }, { "epoch": 0.6969101475937618, "grad_norm": 7.82907247543335, "learning_rate": 1.2732450515039477e-05, "loss": 3.1146, "step": 4167 }, { "epoch": 0.6970773926495798, "grad_norm": 5.399588584899902, "learning_rate": 1.271973491466758e-05, "loss": 2.9229, "step": 4168 }, { "epoch": 0.6972446377053978, "grad_norm": 6.496389865875244, "learning_rate": 1.2707023500124771e-05, "loss": 2.8271, "step": 4169 }, { "epoch": 0.6974118827612159, "grad_norm": 3.994016408920288, "learning_rate": 1.269431627574382e-05, "loss": 2.7892, "step": 4170 }, { "epoch": 0.6975791278170339, "grad_norm": 4.924968242645264, "learning_rate": 1.2681613245856112e-05, "loss": 2.6616, "step": 4171 }, { "epoch": 0.6977463728728519, "grad_norm": 4.076845169067383, "learning_rate": 1.2668914414791553e-05, "loss": 2.4969, "step": 4172 }, { "epoch": 0.69791361792867, "grad_norm": 4.960830211639404, "learning_rate": 1.2656219786878659e-05, "loss": 2.7147, "step": 4173 }, { "epoch": 0.698080862984488, "grad_norm": 2.739008903503418, "learning_rate": 1.2643529366444496e-05, "loss": 2.3851, "step": 4174 }, { "epoch": 0.698248108040306, "grad_norm": 4.313978672027588, "learning_rate": 1.2630843157814703e-05, "loss": 2.6724, "step": 4175 }, { "epoch": 0.6984153530961241, "grad_norm": 4.580832481384277, "learning_rate": 1.261816116531347e-05, "loss": 2.2722, "step": 4176 }, { "epoch": 0.6985825981519421, "grad_norm": 5.015343189239502, "learning_rate": 1.2605483393263567e-05, "loss": 2.791, "step": 4177 }, { "epoch": 0.6987498432077601, "grad_norm": 3.0352494716644287, "learning_rate": 1.25928098459863e-05, "loss": 2.7844, "step": 4178 }, { "epoch": 0.6989170882635782, "grad_norm": 5.243936061859131, "learning_rate": 1.2580140527801575e-05, "loss": 2.6133, "step": 4179 }, { "epoch": 0.6990843333193962, "grad_norm": 5.048190593719482, "learning_rate": 1.256747544302781e-05, "loss": 2.8078, "step": 4180 }, { "epoch": 0.6992515783752142, "grad_norm": 3.8147521018981934, "learning_rate": 1.2554814595982023e-05, "loss": 2.6488, "step": 4181 }, { "epoch": 0.6994188234310323, "grad_norm": 5.1742844581604, "learning_rate": 1.2542157990979746e-05, "loss": 2.9325, "step": 4182 }, { "epoch": 0.6995860684868503, "grad_norm": 4.8718414306640625, "learning_rate": 1.25295056323351e-05, "loss": 2.803, "step": 4183 }, { "epoch": 0.6997533135426683, "grad_norm": 2.829014778137207, "learning_rate": 1.2516857524360753e-05, "loss": 2.4175, "step": 4184 }, { "epoch": 0.6999205585984865, "grad_norm": 5.893111705780029, "learning_rate": 1.2504213671367901e-05, "loss": 3.3812, "step": 4185 }, { "epoch": 0.7000878036543045, "grad_norm": 4.530887126922607, "learning_rate": 1.2491574077666317e-05, "loss": 2.3974, "step": 4186 }, { "epoch": 0.7002550487101225, "grad_norm": 6.824902534484863, "learning_rate": 1.2478938747564297e-05, "loss": 2.6645, "step": 4187 }, { "epoch": 0.7004222937659406, "grad_norm": 5.408566474914551, "learning_rate": 1.246630768536871e-05, "loss": 2.4703, "step": 4188 }, { "epoch": 0.7005895388217586, "grad_norm": 3.12423038482666, "learning_rate": 1.2453680895384948e-05, "loss": 2.7497, "step": 4189 }, { "epoch": 0.7007567838775767, "grad_norm": 3.406522035598755, "learning_rate": 1.2441058381916953e-05, "loss": 2.3257, "step": 4190 }, { "epoch": 0.7009240289333947, "grad_norm": 3.8622324466705322, "learning_rate": 1.2428440149267223e-05, "loss": 2.414, "step": 4191 }, { "epoch": 0.7010912739892127, "grad_norm": 5.538946628570557, "learning_rate": 1.241582620173679e-05, "loss": 2.6471, "step": 4192 }, { "epoch": 0.7012585190450308, "grad_norm": 3.82353138923645, "learning_rate": 1.2403216543625204e-05, "loss": 2.1658, "step": 4193 }, { "epoch": 0.7014257641008488, "grad_norm": 4.156881809234619, "learning_rate": 1.2390611179230587e-05, "loss": 2.4607, "step": 4194 }, { "epoch": 0.7015930091566668, "grad_norm": 7.142679691314697, "learning_rate": 1.2378010112849564e-05, "loss": 2.9856, "step": 4195 }, { "epoch": 0.7017602542124849, "grad_norm": 5.671872138977051, "learning_rate": 1.2365413348777333e-05, "loss": 2.5995, "step": 4196 }, { "epoch": 0.7019274992683029, "grad_norm": 4.813528060913086, "learning_rate": 1.235282089130758e-05, "loss": 2.8692, "step": 4197 }, { "epoch": 0.7020947443241209, "grad_norm": 6.18093204498291, "learning_rate": 1.2340232744732563e-05, "loss": 3.0746, "step": 4198 }, { "epoch": 0.702261989379939, "grad_norm": 3.3124430179595947, "learning_rate": 1.2327648913343059e-05, "loss": 2.7388, "step": 4199 }, { "epoch": 0.702429234435757, "grad_norm": 5.50970458984375, "learning_rate": 1.2315069401428355e-05, "loss": 2.7546, "step": 4200 }, { "epoch": 0.702596479491575, "grad_norm": 4.061094284057617, "learning_rate": 1.23024942132763e-05, "loss": 3.1726, "step": 4201 }, { "epoch": 0.7027637245473931, "grad_norm": 4.956970691680908, "learning_rate": 1.2289923353173227e-05, "loss": 2.8724, "step": 4202 }, { "epoch": 0.7029309696032111, "grad_norm": 5.322290897369385, "learning_rate": 1.227735682540404e-05, "loss": 2.8494, "step": 4203 }, { "epoch": 0.7030982146590291, "grad_norm": 2.412689208984375, "learning_rate": 1.2264794634252122e-05, "loss": 2.5983, "step": 4204 }, { "epoch": 0.7032654597148472, "grad_norm": 7.25974702835083, "learning_rate": 1.2252236783999416e-05, "loss": 2.9785, "step": 4205 }, { "epoch": 0.7034327047706652, "grad_norm": 5.408334732055664, "learning_rate": 1.2239683278926344e-05, "loss": 2.433, "step": 4206 }, { "epoch": 0.7035999498264832, "grad_norm": 4.979634761810303, "learning_rate": 1.2227134123311907e-05, "loss": 2.7184, "step": 4207 }, { "epoch": 0.7037671948823013, "grad_norm": 4.857287406921387, "learning_rate": 1.2214589321433567e-05, "loss": 2.749, "step": 4208 }, { "epoch": 0.7039344399381193, "grad_norm": 4.78774881362915, "learning_rate": 1.2202048877567315e-05, "loss": 2.6749, "step": 4209 }, { "epoch": 0.7041016849939373, "grad_norm": 9.779616355895996, "learning_rate": 1.2189512795987684e-05, "loss": 2.9787, "step": 4210 }, { "epoch": 0.7042689300497554, "grad_norm": 5.455239295959473, "learning_rate": 1.2176981080967681e-05, "loss": 3.277, "step": 4211 }, { "epoch": 0.7044361751055734, "grad_norm": 8.328086853027344, "learning_rate": 1.2164453736778863e-05, "loss": 2.8489, "step": 4212 }, { "epoch": 0.7046034201613914, "grad_norm": 3.649038076400757, "learning_rate": 1.2151930767691256e-05, "loss": 2.7962, "step": 4213 }, { "epoch": 0.7047706652172095, "grad_norm": 6.020130634307861, "learning_rate": 1.2139412177973439e-05, "loss": 3.0815, "step": 4214 }, { "epoch": 0.7049379102730275, "grad_norm": 4.548501968383789, "learning_rate": 1.2126897971892445e-05, "loss": 3.1683, "step": 4215 }, { "epoch": 0.7051051553288455, "grad_norm": 8.684638023376465, "learning_rate": 1.2114388153713882e-05, "loss": 2.7567, "step": 4216 }, { "epoch": 0.7052724003846637, "grad_norm": 3.93182110786438, "learning_rate": 1.21018827277018e-05, "loss": 2.5188, "step": 4217 }, { "epoch": 0.7054396454404817, "grad_norm": 4.7221879959106445, "learning_rate": 1.208938169811879e-05, "loss": 2.6835, "step": 4218 }, { "epoch": 0.7056068904962997, "grad_norm": 5.881104469299316, "learning_rate": 1.2076885069225912e-05, "loss": 3.263, "step": 4219 }, { "epoch": 0.7057741355521178, "grad_norm": 4.979895114898682, "learning_rate": 1.2064392845282768e-05, "loss": 2.2878, "step": 4220 }, { "epoch": 0.7059413806079358, "grad_norm": 10.021673202514648, "learning_rate": 1.205190503054741e-05, "loss": 3.0993, "step": 4221 }, { "epoch": 0.7061086256637538, "grad_norm": 3.3851096630096436, "learning_rate": 1.2039421629276434e-05, "loss": 2.6412, "step": 4222 }, { "epoch": 0.7062758707195719, "grad_norm": 4.166059970855713, "learning_rate": 1.2026942645724893e-05, "loss": 2.5308, "step": 4223 }, { "epoch": 0.7064431157753899, "grad_norm": 3.943336009979248, "learning_rate": 1.2014468084146354e-05, "loss": 2.7143, "step": 4224 }, { "epoch": 0.7066103608312079, "grad_norm": 7.176669597625732, "learning_rate": 1.200199794879289e-05, "loss": 2.5373, "step": 4225 }, { "epoch": 0.706777605887026, "grad_norm": 5.338301181793213, "learning_rate": 1.1989532243915025e-05, "loss": 2.5661, "step": 4226 }, { "epoch": 0.706944850942844, "grad_norm": 11.267152786254883, "learning_rate": 1.1977070973761818e-05, "loss": 3.2246, "step": 4227 }, { "epoch": 0.7071120959986621, "grad_norm": 6.414507865905762, "learning_rate": 1.1964614142580777e-05, "loss": 3.3953, "step": 4228 }, { "epoch": 0.7072793410544801, "grad_norm": 3.586308240890503, "learning_rate": 1.1952161754617933e-05, "loss": 2.5078, "step": 4229 }, { "epoch": 0.7074465861102981, "grad_norm": 3.734772205352783, "learning_rate": 1.1939713814117761e-05, "loss": 2.8169, "step": 4230 }, { "epoch": 0.7076138311661162, "grad_norm": 6.96466588973999, "learning_rate": 1.1927270325323264e-05, "loss": 3.1814, "step": 4231 }, { "epoch": 0.7077810762219342, "grad_norm": 4.694899559020996, "learning_rate": 1.1914831292475897e-05, "loss": 3.1037, "step": 4232 }, { "epoch": 0.7079483212777522, "grad_norm": 4.616008281707764, "learning_rate": 1.190239671981562e-05, "loss": 2.8101, "step": 4233 }, { "epoch": 0.7081155663335703, "grad_norm": 6.1031084060668945, "learning_rate": 1.188996661158084e-05, "loss": 3.2199, "step": 4234 }, { "epoch": 0.7082828113893883, "grad_norm": 4.682427406311035, "learning_rate": 1.187754097200848e-05, "loss": 2.5255, "step": 4235 }, { "epoch": 0.7084500564452063, "grad_norm": 3.8007309436798096, "learning_rate": 1.1865119805333904e-05, "loss": 2.7295, "step": 4236 }, { "epoch": 0.7086173015010244, "grad_norm": 3.561382532119751, "learning_rate": 1.1852703115790984e-05, "loss": 2.5932, "step": 4237 }, { "epoch": 0.7087845465568424, "grad_norm": 4.603872299194336, "learning_rate": 1.1840290907612034e-05, "loss": 2.8213, "step": 4238 }, { "epoch": 0.7089517916126604, "grad_norm": 8.511421203613281, "learning_rate": 1.1827883185027867e-05, "loss": 2.2323, "step": 4239 }, { "epoch": 0.7091190366684785, "grad_norm": 5.814254283905029, "learning_rate": 1.1815479952267763e-05, "loss": 2.7287, "step": 4240 }, { "epoch": 0.7092862817242965, "grad_norm": 7.757671356201172, "learning_rate": 1.180308121355945e-05, "loss": 3.6333, "step": 4241 }, { "epoch": 0.7094535267801145, "grad_norm": 4.688182353973389, "learning_rate": 1.1790686973129155e-05, "loss": 2.4395, "step": 4242 }, { "epoch": 0.7096207718359326, "grad_norm": 3.381458282470703, "learning_rate": 1.1778297235201543e-05, "loss": 2.6843, "step": 4243 }, { "epoch": 0.7097880168917506, "grad_norm": 3.81624698638916, "learning_rate": 1.1765912003999774e-05, "loss": 2.5591, "step": 4244 }, { "epoch": 0.7099552619475686, "grad_norm": 5.832765579223633, "learning_rate": 1.1753531283745436e-05, "loss": 2.5609, "step": 4245 }, { "epoch": 0.7101225070033867, "grad_norm": 5.71767520904541, "learning_rate": 1.174115507865862e-05, "loss": 2.7447, "step": 4246 }, { "epoch": 0.7102897520592047, "grad_norm": 5.4305949211120605, "learning_rate": 1.1728783392957834e-05, "loss": 2.6288, "step": 4247 }, { "epoch": 0.7104569971150227, "grad_norm": 2.6473400592803955, "learning_rate": 1.1716416230860084e-05, "loss": 2.6297, "step": 4248 }, { "epoch": 0.7106242421708409, "grad_norm": 3.4088563919067383, "learning_rate": 1.170405359658081e-05, "loss": 2.2172, "step": 4249 }, { "epoch": 0.7107914872266589, "grad_norm": 2.964857816696167, "learning_rate": 1.1691695494333937e-05, "loss": 2.7181, "step": 4250 }, { "epoch": 0.7109587322824769, "grad_norm": 6.205248832702637, "learning_rate": 1.16793419283318e-05, "loss": 2.7616, "step": 4251 }, { "epoch": 0.711125977338295, "grad_norm": 4.345443248748779, "learning_rate": 1.1666992902785234e-05, "loss": 3.0046, "step": 4252 }, { "epoch": 0.711293222394113, "grad_norm": 4.566990375518799, "learning_rate": 1.1654648421903489e-05, "loss": 2.8332, "step": 4253 }, { "epoch": 0.711460467449931, "grad_norm": 6.572324752807617, "learning_rate": 1.1642308489894294e-05, "loss": 2.5753, "step": 4254 }, { "epoch": 0.7116277125057491, "grad_norm": 4.658884525299072, "learning_rate": 1.1629973110963808e-05, "loss": 2.9247, "step": 4255 }, { "epoch": 0.7117949575615671, "grad_norm": 4.442417621612549, "learning_rate": 1.1617642289316643e-05, "loss": 2.7778, "step": 4256 }, { "epoch": 0.7119622026173851, "grad_norm": 3.825622797012329, "learning_rate": 1.1605316029155883e-05, "loss": 2.7045, "step": 4257 }, { "epoch": 0.7121294476732032, "grad_norm": 3.15873646736145, "learning_rate": 1.1592994334683008e-05, "loss": 2.8514, "step": 4258 }, { "epoch": 0.7122966927290212, "grad_norm": 2.8934195041656494, "learning_rate": 1.1580677210097989e-05, "loss": 3.0704, "step": 4259 }, { "epoch": 0.7124639377848392, "grad_norm": 2.832733154296875, "learning_rate": 1.1568364659599202e-05, "loss": 2.6232, "step": 4260 }, { "epoch": 0.7126311828406573, "grad_norm": 5.341838359832764, "learning_rate": 1.1556056687383495e-05, "loss": 2.8025, "step": 4261 }, { "epoch": 0.7127984278964753, "grad_norm": 6.073269844055176, "learning_rate": 1.1543753297646127e-05, "loss": 2.3139, "step": 4262 }, { "epoch": 0.7129656729522933, "grad_norm": 3.7907509803771973, "learning_rate": 1.1531454494580826e-05, "loss": 2.5398, "step": 4263 }, { "epoch": 0.7131329180081114, "grad_norm": 4.590839862823486, "learning_rate": 1.1519160282379716e-05, "loss": 2.7698, "step": 4264 }, { "epoch": 0.7133001630639294, "grad_norm": 5.138202667236328, "learning_rate": 1.150687066523341e-05, "loss": 2.6125, "step": 4265 }, { "epoch": 0.7134674081197475, "grad_norm": 4.195893287658691, "learning_rate": 1.1494585647330902e-05, "loss": 2.3077, "step": 4266 }, { "epoch": 0.7136346531755655, "grad_norm": 6.065313339233398, "learning_rate": 1.1482305232859661e-05, "loss": 2.8445, "step": 4267 }, { "epoch": 0.7138018982313835, "grad_norm": 4.038660049438477, "learning_rate": 1.1470029426005552e-05, "loss": 2.7843, "step": 4268 }, { "epoch": 0.7139691432872016, "grad_norm": 4.517635822296143, "learning_rate": 1.145775823095288e-05, "loss": 2.8054, "step": 4269 }, { "epoch": 0.7141363883430196, "grad_norm": 5.132694244384766, "learning_rate": 1.14454916518844e-05, "loss": 3.0751, "step": 4270 }, { "epoch": 0.7143036333988376, "grad_norm": 6.227969169616699, "learning_rate": 1.1433229692981254e-05, "loss": 2.5676, "step": 4271 }, { "epoch": 0.7144708784546557, "grad_norm": 3.507523536682129, "learning_rate": 1.1420972358423047e-05, "loss": 2.6844, "step": 4272 }, { "epoch": 0.7146381235104737, "grad_norm": 16.140018463134766, "learning_rate": 1.1408719652387786e-05, "loss": 3.2328, "step": 4273 }, { "epoch": 0.7148053685662917, "grad_norm": 13.263899803161621, "learning_rate": 1.139647157905192e-05, "loss": 2.8295, "step": 4274 }, { "epoch": 0.7149726136221098, "grad_norm": 5.583302974700928, "learning_rate": 1.1384228142590284e-05, "loss": 2.8889, "step": 4275 }, { "epoch": 0.7151398586779278, "grad_norm": 4.324066162109375, "learning_rate": 1.1371989347176173e-05, "loss": 2.8532, "step": 4276 }, { "epoch": 0.7153071037337458, "grad_norm": 3.6856720447540283, "learning_rate": 1.1359755196981262e-05, "loss": 2.9328, "step": 4277 }, { "epoch": 0.715474348789564, "grad_norm": 4.607168197631836, "learning_rate": 1.134752569617568e-05, "loss": 2.5403, "step": 4278 }, { "epoch": 0.715641593845382, "grad_norm": 6.202007293701172, "learning_rate": 1.1335300848927938e-05, "loss": 2.457, "step": 4279 }, { "epoch": 0.7158088389012, "grad_norm": 9.692631721496582, "learning_rate": 1.1323080659404977e-05, "loss": 2.7745, "step": 4280 }, { "epoch": 0.7159760839570181, "grad_norm": 5.413043022155762, "learning_rate": 1.1310865131772163e-05, "loss": 2.7872, "step": 4281 }, { "epoch": 0.7161433290128361, "grad_norm": 3.753819227218628, "learning_rate": 1.1298654270193242e-05, "loss": 2.6945, "step": 4282 }, { "epoch": 0.7163105740686541, "grad_norm": 3.222909927368164, "learning_rate": 1.1286448078830402e-05, "loss": 2.5843, "step": 4283 }, { "epoch": 0.7164778191244722, "grad_norm": 4.881777286529541, "learning_rate": 1.1274246561844203e-05, "loss": 2.4913, "step": 4284 }, { "epoch": 0.7166450641802902, "grad_norm": 4.504857540130615, "learning_rate": 1.1262049723393656e-05, "loss": 2.9773, "step": 4285 }, { "epoch": 0.7168123092361082, "grad_norm": 5.832816123962402, "learning_rate": 1.1249857567636129e-05, "loss": 2.9874, "step": 4286 }, { "epoch": 0.7169795542919263, "grad_norm": 3.8851494789123535, "learning_rate": 1.1237670098727443e-05, "loss": 2.7631, "step": 4287 }, { "epoch": 0.7171467993477443, "grad_norm": 3.2879209518432617, "learning_rate": 1.1225487320821773e-05, "loss": 2.3867, "step": 4288 }, { "epoch": 0.7173140444035623, "grad_norm": 8.482224464416504, "learning_rate": 1.1213309238071728e-05, "loss": 3.1894, "step": 4289 }, { "epoch": 0.7174812894593804, "grad_norm": 3.1223044395446777, "learning_rate": 1.1201135854628315e-05, "loss": 2.5411, "step": 4290 }, { "epoch": 0.7176485345151984, "grad_norm": 3.0767390727996826, "learning_rate": 1.118896717464093e-05, "loss": 2.7506, "step": 4291 }, { "epoch": 0.7178157795710164, "grad_norm": 5.786077499389648, "learning_rate": 1.1176803202257356e-05, "loss": 2.6284, "step": 4292 }, { "epoch": 0.7179830246268345, "grad_norm": 6.413300514221191, "learning_rate": 1.1164643941623797e-05, "loss": 2.2218, "step": 4293 }, { "epoch": 0.7181502696826525, "grad_norm": 4.427346229553223, "learning_rate": 1.1152489396884825e-05, "loss": 2.4, "step": 4294 }, { "epoch": 0.7183175147384705, "grad_norm": 3.565035581588745, "learning_rate": 1.1140339572183428e-05, "loss": 2.7562, "step": 4295 }, { "epoch": 0.7184847597942886, "grad_norm": 4.037796497344971, "learning_rate": 1.1128194471660957e-05, "loss": 2.4659, "step": 4296 }, { "epoch": 0.7186520048501066, "grad_norm": 2.534038543701172, "learning_rate": 1.111605409945718e-05, "loss": 2.4225, "step": 4297 }, { "epoch": 0.7188192499059246, "grad_norm": 4.665956020355225, "learning_rate": 1.110391845971025e-05, "loss": 2.451, "step": 4298 }, { "epoch": 0.7189864949617427, "grad_norm": 3.5915071964263916, "learning_rate": 1.109178755655668e-05, "loss": 2.6547, "step": 4299 }, { "epoch": 0.7191537400175607, "grad_norm": 3.4058678150177, "learning_rate": 1.1079661394131408e-05, "loss": 2.8924, "step": 4300 }, { "epoch": 0.7193209850733787, "grad_norm": 4.474489688873291, "learning_rate": 1.1067539976567718e-05, "loss": 2.5805, "step": 4301 }, { "epoch": 0.7194882301291968, "grad_norm": 6.5769524574279785, "learning_rate": 1.105542330799731e-05, "loss": 3.1877, "step": 4302 }, { "epoch": 0.7196554751850148, "grad_norm": 4.330998420715332, "learning_rate": 1.1043311392550235e-05, "loss": 3.1577, "step": 4303 }, { "epoch": 0.7198227202408328, "grad_norm": 4.56954288482666, "learning_rate": 1.1031204234354953e-05, "loss": 2.6016, "step": 4304 }, { "epoch": 0.7199899652966509, "grad_norm": 6.156902313232422, "learning_rate": 1.1019101837538266e-05, "loss": 2.945, "step": 4305 }, { "epoch": 0.7201572103524689, "grad_norm": 4.4082818031311035, "learning_rate": 1.1007004206225408e-05, "loss": 2.9798, "step": 4306 }, { "epoch": 0.720324455408287, "grad_norm": 5.022252559661865, "learning_rate": 1.0994911344539928e-05, "loss": 2.8424, "step": 4307 }, { "epoch": 0.720491700464105, "grad_norm": 10.43213176727295, "learning_rate": 1.09828232566038e-05, "loss": 2.4987, "step": 4308 }, { "epoch": 0.720658945519923, "grad_norm": 4.969090461730957, "learning_rate": 1.0970739946537326e-05, "loss": 2.6371, "step": 4309 }, { "epoch": 0.7208261905757412, "grad_norm": 6.676119327545166, "learning_rate": 1.095866141845922e-05, "loss": 2.7738, "step": 4310 }, { "epoch": 0.7209934356315592, "grad_norm": 3.7204887866973877, "learning_rate": 1.0946587676486528e-05, "loss": 3.0493, "step": 4311 }, { "epoch": 0.7211606806873772, "grad_norm": 4.792177677154541, "learning_rate": 1.0934518724734704e-05, "loss": 2.7105, "step": 4312 }, { "epoch": 0.7213279257431953, "grad_norm": 3.912888288497925, "learning_rate": 1.092245456731753e-05, "loss": 2.7147, "step": 4313 }, { "epoch": 0.7214951707990133, "grad_norm": 4.655920505523682, "learning_rate": 1.0910395208347182e-05, "loss": 2.9462, "step": 4314 }, { "epoch": 0.7216624158548313, "grad_norm": 3.2458536624908447, "learning_rate": 1.0898340651934202e-05, "loss": 2.8299, "step": 4315 }, { "epoch": 0.7218296609106494, "grad_norm": 12.473393440246582, "learning_rate": 1.0886290902187466e-05, "loss": 3.0753, "step": 4316 }, { "epoch": 0.7219969059664674, "grad_norm": 5.171153545379639, "learning_rate": 1.0874245963214248e-05, "loss": 2.6936, "step": 4317 }, { "epoch": 0.7221641510222854, "grad_norm": 5.4356369972229, "learning_rate": 1.0862205839120143e-05, "loss": 2.9011, "step": 4318 }, { "epoch": 0.7223313960781035, "grad_norm": 7.884683609008789, "learning_rate": 1.085017053400915e-05, "loss": 3.0963, "step": 4319 }, { "epoch": 0.7224986411339215, "grad_norm": 4.208898544311523, "learning_rate": 1.0838140051983577e-05, "loss": 2.2391, "step": 4320 }, { "epoch": 0.7226658861897395, "grad_norm": 5.16409969329834, "learning_rate": 1.0826114397144135e-05, "loss": 2.3829, "step": 4321 }, { "epoch": 0.7228331312455576, "grad_norm": 3.545076847076416, "learning_rate": 1.0814093573589839e-05, "loss": 2.3318, "step": 4322 }, { "epoch": 0.7230003763013756, "grad_norm": 4.453977108001709, "learning_rate": 1.0802077585418122e-05, "loss": 2.5816, "step": 4323 }, { "epoch": 0.7231676213571936, "grad_norm": 3.617222547531128, "learning_rate": 1.0790066436724703e-05, "loss": 3.0588, "step": 4324 }, { "epoch": 0.7233348664130117, "grad_norm": 4.802639484405518, "learning_rate": 1.0778060131603699e-05, "loss": 2.8769, "step": 4325 }, { "epoch": 0.7235021114688297, "grad_norm": 5.16600227355957, "learning_rate": 1.076605867414754e-05, "loss": 2.9056, "step": 4326 }, { "epoch": 0.7236693565246477, "grad_norm": 5.124411106109619, "learning_rate": 1.075406206844704e-05, "loss": 2.8117, "step": 4327 }, { "epoch": 0.7238366015804658, "grad_norm": 5.675736427307129, "learning_rate": 1.0742070318591332e-05, "loss": 2.6228, "step": 4328 }, { "epoch": 0.7240038466362838, "grad_norm": 2.7787649631500244, "learning_rate": 1.0730083428667892e-05, "loss": 2.7491, "step": 4329 }, { "epoch": 0.7241710916921018, "grad_norm": 5.750668048858643, "learning_rate": 1.071810140276256e-05, "loss": 3.1261, "step": 4330 }, { "epoch": 0.7243383367479199, "grad_norm": 4.662039279937744, "learning_rate": 1.0706124244959504e-05, "loss": 2.9685, "step": 4331 }, { "epoch": 0.7245055818037379, "grad_norm": 14.357001304626465, "learning_rate": 1.069415195934125e-05, "loss": 2.4444, "step": 4332 }, { "epoch": 0.7246728268595559, "grad_norm": 3.301590919494629, "learning_rate": 1.068218454998863e-05, "loss": 2.3713, "step": 4333 }, { "epoch": 0.724840071915374, "grad_norm": 4.661069393157959, "learning_rate": 1.067022202098085e-05, "loss": 2.7966, "step": 4334 }, { "epoch": 0.725007316971192, "grad_norm": 8.18974494934082, "learning_rate": 1.0658264376395422e-05, "loss": 2.5247, "step": 4335 }, { "epoch": 0.72517456202701, "grad_norm": 4.511560440063477, "learning_rate": 1.0646311620308221e-05, "loss": 2.6255, "step": 4336 }, { "epoch": 0.7253418070828281, "grad_norm": 16.792922973632812, "learning_rate": 1.0634363756793428e-05, "loss": 2.281, "step": 4337 }, { "epoch": 0.7255090521386461, "grad_norm": 4.519766330718994, "learning_rate": 1.0622420789923577e-05, "loss": 2.3892, "step": 4338 }, { "epoch": 0.7256762971944641, "grad_norm": 8.071016311645508, "learning_rate": 1.0610482723769535e-05, "loss": 2.5219, "step": 4339 }, { "epoch": 0.7258435422502822, "grad_norm": 4.002042293548584, "learning_rate": 1.0598549562400472e-05, "loss": 2.5105, "step": 4340 }, { "epoch": 0.7260107873061002, "grad_norm": 4.05426549911499, "learning_rate": 1.0586621309883923e-05, "loss": 2.66, "step": 4341 }, { "epoch": 0.7261780323619182, "grad_norm": 5.307985305786133, "learning_rate": 1.057469797028571e-05, "loss": 2.6877, "step": 4342 }, { "epoch": 0.7263452774177364, "grad_norm": 4.763155460357666, "learning_rate": 1.0562779547670024e-05, "loss": 3.0868, "step": 4343 }, { "epoch": 0.7265125224735544, "grad_norm": 4.549712181091309, "learning_rate": 1.0550866046099334e-05, "loss": 2.5069, "step": 4344 }, { "epoch": 0.7266797675293725, "grad_norm": 9.072540283203125, "learning_rate": 1.0538957469634474e-05, "loss": 3.0249, "step": 4345 }, { "epoch": 0.7268470125851905, "grad_norm": 1.6699861288070679, "learning_rate": 1.0527053822334557e-05, "loss": 2.61, "step": 4346 }, { "epoch": 0.7270142576410085, "grad_norm": 4.625550746917725, "learning_rate": 1.051515510825707e-05, "loss": 2.3621, "step": 4347 }, { "epoch": 0.7271815026968266, "grad_norm": 4.910833835601807, "learning_rate": 1.0503261331457758e-05, "loss": 2.6632, "step": 4348 }, { "epoch": 0.7273487477526446, "grad_norm": 4.031731128692627, "learning_rate": 1.049137249599074e-05, "loss": 2.5247, "step": 4349 }, { "epoch": 0.7275159928084626, "grad_norm": 3.414724349975586, "learning_rate": 1.0479488605908397e-05, "loss": 2.9876, "step": 4350 }, { "epoch": 0.7276832378642807, "grad_norm": 4.898738861083984, "learning_rate": 1.0467609665261468e-05, "loss": 3.2954, "step": 4351 }, { "epoch": 0.7278504829200987, "grad_norm": 3.782891273498535, "learning_rate": 1.0455735678098976e-05, "loss": 2.7433, "step": 4352 }, { "epoch": 0.7280177279759167, "grad_norm": 3.7309231758117676, "learning_rate": 1.044386664846828e-05, "loss": 2.7278, "step": 4353 }, { "epoch": 0.7281849730317348, "grad_norm": 4.3312225341796875, "learning_rate": 1.043200258041502e-05, "loss": 2.7154, "step": 4354 }, { "epoch": 0.7283522180875528, "grad_norm": 4.183990955352783, "learning_rate": 1.0420143477983169e-05, "loss": 2.7318, "step": 4355 }, { "epoch": 0.7285194631433708, "grad_norm": 8.018410682678223, "learning_rate": 1.0408289345215005e-05, "loss": 2.7811, "step": 4356 }, { "epoch": 0.7286867081991889, "grad_norm": 3.4371209144592285, "learning_rate": 1.0396440186151096e-05, "loss": 2.9252, "step": 4357 }, { "epoch": 0.7288539532550069, "grad_norm": 5.169920444488525, "learning_rate": 1.0384596004830335e-05, "loss": 3.1399, "step": 4358 }, { "epoch": 0.7290211983108249, "grad_norm": 3.3663766384124756, "learning_rate": 1.0372756805289896e-05, "loss": 2.9259, "step": 4359 }, { "epoch": 0.729188443366643, "grad_norm": 3.229036808013916, "learning_rate": 1.0360922591565284e-05, "loss": 2.6806, "step": 4360 }, { "epoch": 0.729355688422461, "grad_norm": 4.915264129638672, "learning_rate": 1.0349093367690268e-05, "loss": 2.8493, "step": 4361 }, { "epoch": 0.729522933478279, "grad_norm": 6.618992805480957, "learning_rate": 1.0337269137696956e-05, "loss": 2.8955, "step": 4362 }, { "epoch": 0.7296901785340971, "grad_norm": 5.661863803863525, "learning_rate": 1.0325449905615708e-05, "loss": 2.543, "step": 4363 }, { "epoch": 0.7298574235899151, "grad_norm": 4.383277893066406, "learning_rate": 1.0313635675475236e-05, "loss": 2.9335, "step": 4364 }, { "epoch": 0.7300246686457331, "grad_norm": 5.646833419799805, "learning_rate": 1.0301826451302498e-05, "loss": 2.7316, "step": 4365 }, { "epoch": 0.7301919137015512, "grad_norm": 5.141000270843506, "learning_rate": 1.0290022237122779e-05, "loss": 2.3929, "step": 4366 }, { "epoch": 0.7303591587573692, "grad_norm": 3.0488083362579346, "learning_rate": 1.0278223036959623e-05, "loss": 2.2268, "step": 4367 }, { "epoch": 0.7305264038131872, "grad_norm": 4.416526794433594, "learning_rate": 1.0266428854834904e-05, "loss": 2.1888, "step": 4368 }, { "epoch": 0.7306936488690053, "grad_norm": 4.448939323425293, "learning_rate": 1.0254639694768748e-05, "loss": 2.9765, "step": 4369 }, { "epoch": 0.7308608939248233, "grad_norm": 4.217455863952637, "learning_rate": 1.0242855560779601e-05, "loss": 2.6622, "step": 4370 }, { "epoch": 0.7310281389806413, "grad_norm": 8.743438720703125, "learning_rate": 1.0231076456884173e-05, "loss": 2.989, "step": 4371 }, { "epoch": 0.7311953840364595, "grad_norm": 5.574118614196777, "learning_rate": 1.0219302387097468e-05, "loss": 2.6611, "step": 4372 }, { "epoch": 0.7313626290922774, "grad_norm": 4.872920513153076, "learning_rate": 1.0207533355432786e-05, "loss": 3.0038, "step": 4373 }, { "epoch": 0.7315298741480954, "grad_norm": 4.233184814453125, "learning_rate": 1.019576936590168e-05, "loss": 2.599, "step": 4374 }, { "epoch": 0.7316971192039136, "grad_norm": 6.243813514709473, "learning_rate": 1.0184010422514024e-05, "loss": 2.6967, "step": 4375 }, { "epoch": 0.7318643642597316, "grad_norm": 4.692346572875977, "learning_rate": 1.0172256529277926e-05, "loss": 2.853, "step": 4376 }, { "epoch": 0.7320316093155496, "grad_norm": 2.77646541595459, "learning_rate": 1.016050769019982e-05, "loss": 2.2628, "step": 4377 }, { "epoch": 0.7321988543713677, "grad_norm": 7.051612854003906, "learning_rate": 1.0148763909284372e-05, "loss": 2.3681, "step": 4378 }, { "epoch": 0.7323660994271857, "grad_norm": 3.566725015640259, "learning_rate": 1.013702519053456e-05, "loss": 2.4618, "step": 4379 }, { "epoch": 0.7325333444830037, "grad_norm": 5.045804500579834, "learning_rate": 1.012529153795162e-05, "loss": 2.8325, "step": 4380 }, { "epoch": 0.7327005895388218, "grad_norm": 4.610067367553711, "learning_rate": 1.011356295553507e-05, "loss": 2.6473, "step": 4381 }, { "epoch": 0.7328678345946398, "grad_norm": 2.7466511726379395, "learning_rate": 1.0101839447282679e-05, "loss": 2.58, "step": 4382 }, { "epoch": 0.7330350796504579, "grad_norm": 5.381248474121094, "learning_rate": 1.0090121017190518e-05, "loss": 2.6429, "step": 4383 }, { "epoch": 0.7332023247062759, "grad_norm": 3.0026793479919434, "learning_rate": 1.0078407669252892e-05, "loss": 2.287, "step": 4384 }, { "epoch": 0.7333695697620939, "grad_norm": 3.5858254432678223, "learning_rate": 1.006669940746241e-05, "loss": 2.4813, "step": 4385 }, { "epoch": 0.733536814817912, "grad_norm": 3.8296735286712646, "learning_rate": 1.0054996235809919e-05, "loss": 2.5536, "step": 4386 }, { "epoch": 0.73370405987373, "grad_norm": 3.5263829231262207, "learning_rate": 1.0043298158284525e-05, "loss": 2.6597, "step": 4387 }, { "epoch": 0.733871304929548, "grad_norm": 5.078731060028076, "learning_rate": 1.0031605178873649e-05, "loss": 3.064, "step": 4388 }, { "epoch": 0.7340385499853661, "grad_norm": 3.822460412979126, "learning_rate": 1.001991730156291e-05, "loss": 2.4873, "step": 4389 }, { "epoch": 0.7342057950411841, "grad_norm": 3.217503070831299, "learning_rate": 1.0008234530336236e-05, "loss": 2.6273, "step": 4390 }, { "epoch": 0.7343730400970021, "grad_norm": 5.111030578613281, "learning_rate": 9.996556869175777e-06, "loss": 2.5834, "step": 4391 }, { "epoch": 0.7345402851528202, "grad_norm": 2.9361538887023926, "learning_rate": 9.98488432206198e-06, "loss": 2.6437, "step": 4392 }, { "epoch": 0.7347075302086382, "grad_norm": 8.432393074035645, "learning_rate": 9.973216892973509e-06, "loss": 2.5237, "step": 4393 }, { "epoch": 0.7348747752644562, "grad_norm": 7.379847049713135, "learning_rate": 9.96155458588732e-06, "loss": 3.777, "step": 4394 }, { "epoch": 0.7350420203202743, "grad_norm": 3.4134061336517334, "learning_rate": 9.949897404778588e-06, "loss": 2.7146, "step": 4395 }, { "epoch": 0.7352092653760923, "grad_norm": 6.504240989685059, "learning_rate": 9.93824535362077e-06, "loss": 2.9395, "step": 4396 }, { "epoch": 0.7353765104319103, "grad_norm": 5.135034561157227, "learning_rate": 9.926598436385571e-06, "loss": 2.8311, "step": 4397 }, { "epoch": 0.7355437554877284, "grad_norm": 6.2809834480285645, "learning_rate": 9.914956657042917e-06, "loss": 3.0731, "step": 4398 }, { "epoch": 0.7357110005435464, "grad_norm": 5.091394901275635, "learning_rate": 9.903320019561027e-06, "loss": 2.8499, "step": 4399 }, { "epoch": 0.7358782455993644, "grad_norm": 14.009598731994629, "learning_rate": 9.891688527906323e-06, "loss": 3.5165, "step": 4400 }, { "epoch": 0.7360454906551825, "grad_norm": 7.440865993499756, "learning_rate": 9.880062186043516e-06, "loss": 2.5769, "step": 4401 }, { "epoch": 0.7362127357110005, "grad_norm": 6.142072677612305, "learning_rate": 9.868440997935515e-06, "loss": 2.9285, "step": 4402 }, { "epoch": 0.7363799807668185, "grad_norm": 9.941740989685059, "learning_rate": 9.856824967543524e-06, "loss": 2.8926, "step": 4403 }, { "epoch": 0.7365472258226367, "grad_norm": 14.425402641296387, "learning_rate": 9.84521409882693e-06, "loss": 3.0898, "step": 4404 }, { "epoch": 0.7367144708784547, "grad_norm": 4.272504806518555, "learning_rate": 9.83360839574343e-06, "loss": 2.8836, "step": 4405 }, { "epoch": 0.7368817159342727, "grad_norm": 5.862778186798096, "learning_rate": 9.822007862248894e-06, "loss": 3.0031, "step": 4406 }, { "epoch": 0.7370489609900908, "grad_norm": 5.315934658050537, "learning_rate": 9.810412502297481e-06, "loss": 2.7258, "step": 4407 }, { "epoch": 0.7372162060459088, "grad_norm": 3.0955233573913574, "learning_rate": 9.798822319841544e-06, "loss": 2.4922, "step": 4408 }, { "epoch": 0.7373834511017268, "grad_norm": 2.45697283744812, "learning_rate": 9.78723731883171e-06, "loss": 2.0798, "step": 4409 }, { "epoch": 0.7375506961575449, "grad_norm": 5.7462921142578125, "learning_rate": 9.775657503216803e-06, "loss": 2.7448, "step": 4410 }, { "epoch": 0.7377179412133629, "grad_norm": 3.3839378356933594, "learning_rate": 9.764082876943916e-06, "loss": 2.7123, "step": 4411 }, { "epoch": 0.7378851862691809, "grad_norm": 4.446228981018066, "learning_rate": 9.752513443958342e-06, "loss": 2.3419, "step": 4412 }, { "epoch": 0.738052431324999, "grad_norm": 4.292930603027344, "learning_rate": 9.74094920820362e-06, "loss": 2.9405, "step": 4413 }, { "epoch": 0.738219676380817, "grad_norm": 3.175572156906128, "learning_rate": 9.729390173621528e-06, "loss": 2.5173, "step": 4414 }, { "epoch": 0.738386921436635, "grad_norm": 3.7842047214508057, "learning_rate": 9.717836344152037e-06, "loss": 2.3814, "step": 4415 }, { "epoch": 0.7385541664924531, "grad_norm": 3.0237061977386475, "learning_rate": 9.706287723733382e-06, "loss": 2.9422, "step": 4416 }, { "epoch": 0.7387214115482711, "grad_norm": 6.528133392333984, "learning_rate": 9.69474431630199e-06, "loss": 3.1139, "step": 4417 }, { "epoch": 0.7388886566040891, "grad_norm": 3.6223185062408447, "learning_rate": 9.683206125792543e-06, "loss": 2.7379, "step": 4418 }, { "epoch": 0.7390559016599072, "grad_norm": 4.037101745605469, "learning_rate": 9.671673156137905e-06, "loss": 2.7748, "step": 4419 }, { "epoch": 0.7392231467157252, "grad_norm": 2.9244298934936523, "learning_rate": 9.660145411269203e-06, "loss": 2.4782, "step": 4420 }, { "epoch": 0.7393903917715432, "grad_norm": 5.592029094696045, "learning_rate": 9.648622895115753e-06, "loss": 2.5747, "step": 4421 }, { "epoch": 0.7395576368273613, "grad_norm": 5.416989803314209, "learning_rate": 9.637105611605113e-06, "loss": 2.2903, "step": 4422 }, { "epoch": 0.7397248818831793, "grad_norm": 12.672222137451172, "learning_rate": 9.625593564663024e-06, "loss": 3.2382, "step": 4423 }, { "epoch": 0.7398921269389974, "grad_norm": 3.37444806098938, "learning_rate": 9.61408675821348e-06, "loss": 2.5557, "step": 4424 }, { "epoch": 0.7400593719948154, "grad_norm": 4.211962699890137, "learning_rate": 9.602585196178654e-06, "loss": 2.7409, "step": 4425 }, { "epoch": 0.7402266170506334, "grad_norm": 6.7532267570495605, "learning_rate": 9.591088882478963e-06, "loss": 3.3169, "step": 4426 }, { "epoch": 0.7403938621064515, "grad_norm": 2.182913064956665, "learning_rate": 9.579597821033004e-06, "loss": 2.1105, "step": 4427 }, { "epoch": 0.7405611071622695, "grad_norm": 4.064446449279785, "learning_rate": 9.568112015757618e-06, "loss": 2.7896, "step": 4428 }, { "epoch": 0.7407283522180875, "grad_norm": 5.334422588348389, "learning_rate": 9.556631470567815e-06, "loss": 2.7818, "step": 4429 }, { "epoch": 0.7408955972739056, "grad_norm": 6.8458170890808105, "learning_rate": 9.545156189376844e-06, "loss": 2.8749, "step": 4430 }, { "epoch": 0.7410628423297236, "grad_norm": 5.0264997482299805, "learning_rate": 9.533686176096159e-06, "loss": 2.8673, "step": 4431 }, { "epoch": 0.7412300873855416, "grad_norm": 6.465572834014893, "learning_rate": 9.522221434635386e-06, "loss": 2.9254, "step": 4432 }, { "epoch": 0.7413973324413597, "grad_norm": 3.227018117904663, "learning_rate": 9.510761968902398e-06, "loss": 2.6311, "step": 4433 }, { "epoch": 0.7415645774971777, "grad_norm": 2.9651594161987305, "learning_rate": 9.49930778280323e-06, "loss": 2.5115, "step": 4434 }, { "epoch": 0.7417318225529957, "grad_norm": 4.02671480178833, "learning_rate": 9.48785888024215e-06, "loss": 2.5155, "step": 4435 }, { "epoch": 0.7418990676088139, "grad_norm": 3.7153279781341553, "learning_rate": 9.476415265121596e-06, "loss": 2.5654, "step": 4436 }, { "epoch": 0.7420663126646319, "grad_norm": 4.826799392700195, "learning_rate": 9.464976941342223e-06, "loss": 2.5496, "step": 4437 }, { "epoch": 0.7422335577204499, "grad_norm": 5.882707595825195, "learning_rate": 9.453543912802882e-06, "loss": 2.4817, "step": 4438 }, { "epoch": 0.742400802776268, "grad_norm": 5.449031829833984, "learning_rate": 9.44211618340062e-06, "loss": 3.0517, "step": 4439 }, { "epoch": 0.742568047832086, "grad_norm": 4.341989517211914, "learning_rate": 9.430693757030657e-06, "loss": 2.8869, "step": 4440 }, { "epoch": 0.742735292887904, "grad_norm": 4.166437149047852, "learning_rate": 9.419276637586435e-06, "loss": 2.2658, "step": 4441 }, { "epoch": 0.7429025379437221, "grad_norm": 3.881448984146118, "learning_rate": 9.40786482895956e-06, "loss": 2.7203, "step": 4442 }, { "epoch": 0.7430697829995401, "grad_norm": 2.901719570159912, "learning_rate": 9.396458335039851e-06, "loss": 2.6279, "step": 4443 }, { "epoch": 0.7432370280553581, "grad_norm": 5.05027437210083, "learning_rate": 9.385057159715296e-06, "loss": 2.4556, "step": 4444 }, { "epoch": 0.7434042731111762, "grad_norm": 3.263686418533325, "learning_rate": 9.373661306872081e-06, "loss": 2.6316, "step": 4445 }, { "epoch": 0.7435715181669942, "grad_norm": 3.9038150310516357, "learning_rate": 9.362270780394588e-06, "loss": 2.4534, "step": 4446 }, { "epoch": 0.7437387632228122, "grad_norm": 3.162064790725708, "learning_rate": 9.350885584165355e-06, "loss": 2.7462, "step": 4447 }, { "epoch": 0.7439060082786303, "grad_norm": 6.165729999542236, "learning_rate": 9.339505722065136e-06, "loss": 2.9675, "step": 4448 }, { "epoch": 0.7440732533344483, "grad_norm": 5.5695037841796875, "learning_rate": 9.328131197972831e-06, "loss": 2.9005, "step": 4449 }, { "epoch": 0.7442404983902663, "grad_norm": 4.389410018920898, "learning_rate": 9.31676201576556e-06, "loss": 2.8637, "step": 4450 }, { "epoch": 0.7444077434460844, "grad_norm": 4.741387367248535, "learning_rate": 9.305398179318581e-06, "loss": 2.668, "step": 4451 }, { "epoch": 0.7445749885019024, "grad_norm": 4.807168960571289, "learning_rate": 9.294039692505372e-06, "loss": 2.9781, "step": 4452 }, { "epoch": 0.7447422335577204, "grad_norm": 4.61118221282959, "learning_rate": 9.282686559197537e-06, "loss": 1.9663, "step": 4453 }, { "epoch": 0.7449094786135385, "grad_norm": 5.361738681793213, "learning_rate": 9.271338783264922e-06, "loss": 2.7414, "step": 4454 }, { "epoch": 0.7450767236693565, "grad_norm": 2.627535104751587, "learning_rate": 9.259996368575491e-06, "loss": 2.8494, "step": 4455 }, { "epoch": 0.7452439687251745, "grad_norm": 8.525969505310059, "learning_rate": 9.248659318995393e-06, "loss": 3.5803, "step": 4456 }, { "epoch": 0.7454112137809926, "grad_norm": 4.082057952880859, "learning_rate": 9.237327638388965e-06, "loss": 2.6395, "step": 4457 }, { "epoch": 0.7455784588368106, "grad_norm": 6.145197868347168, "learning_rate": 9.226001330618691e-06, "loss": 2.9114, "step": 4458 }, { "epoch": 0.7457457038926286, "grad_norm": 7.746750354766846, "learning_rate": 9.214680399545255e-06, "loss": 3.3187, "step": 4459 }, { "epoch": 0.7459129489484467, "grad_norm": 5.678595542907715, "learning_rate": 9.203364849027468e-06, "loss": 2.99, "step": 4460 }, { "epoch": 0.7460801940042647, "grad_norm": 6.620910167694092, "learning_rate": 9.192054682922339e-06, "loss": 2.6819, "step": 4461 }, { "epoch": 0.7462474390600828, "grad_norm": 7.426158905029297, "learning_rate": 9.180749905085029e-06, "loss": 3.137, "step": 4462 }, { "epoch": 0.7464146841159008, "grad_norm": 4.895527362823486, "learning_rate": 9.169450519368874e-06, "loss": 2.8019, "step": 4463 }, { "epoch": 0.7465819291717188, "grad_norm": 6.066232204437256, "learning_rate": 9.158156529625344e-06, "loss": 2.9224, "step": 4464 }, { "epoch": 0.746749174227537, "grad_norm": 7.732909202575684, "learning_rate": 9.14686793970411e-06, "loss": 3.2139, "step": 4465 }, { "epoch": 0.746916419283355, "grad_norm": 4.7190165519714355, "learning_rate": 9.135584753452955e-06, "loss": 2.2527, "step": 4466 }, { "epoch": 0.747083664339173, "grad_norm": 7.6130876541137695, "learning_rate": 9.124306974717872e-06, "loss": 3.2792, "step": 4467 }, { "epoch": 0.7472509093949911, "grad_norm": 7.461098670959473, "learning_rate": 9.11303460734296e-06, "loss": 3.1619, "step": 4468 }, { "epoch": 0.7474181544508091, "grad_norm": 4.873455047607422, "learning_rate": 9.101767655170518e-06, "loss": 2.5569, "step": 4469 }, { "epoch": 0.7475853995066271, "grad_norm": 3.5444018840789795, "learning_rate": 9.09050612204096e-06, "loss": 2.9149, "step": 4470 }, { "epoch": 0.7477526445624452, "grad_norm": 7.890201568603516, "learning_rate": 9.079250011792883e-06, "loss": 2.7882, "step": 4471 }, { "epoch": 0.7479198896182632, "grad_norm": 7.490345001220703, "learning_rate": 9.067999328263032e-06, "loss": 3.1816, "step": 4472 }, { "epoch": 0.7480871346740812, "grad_norm": 3.923417806625366, "learning_rate": 9.056754075286277e-06, "loss": 2.6325, "step": 4473 }, { "epoch": 0.7482543797298993, "grad_norm": 2.458958148956299, "learning_rate": 9.045514256695676e-06, "loss": 2.6221, "step": 4474 }, { "epoch": 0.7484216247857173, "grad_norm": 10.008612632751465, "learning_rate": 9.034279876322388e-06, "loss": 3.3277, "step": 4475 }, { "epoch": 0.7485888698415353, "grad_norm": 4.853857517242432, "learning_rate": 9.02305093799577e-06, "loss": 2.229, "step": 4476 }, { "epoch": 0.7487561148973534, "grad_norm": 3.5783774852752686, "learning_rate": 9.011827445543272e-06, "loss": 2.6525, "step": 4477 }, { "epoch": 0.7489233599531714, "grad_norm": 3.226348400115967, "learning_rate": 9.000609402790527e-06, "loss": 2.7236, "step": 4478 }, { "epoch": 0.7490906050089894, "grad_norm": 8.779886245727539, "learning_rate": 8.989396813561298e-06, "loss": 3.5066, "step": 4479 }, { "epoch": 0.7492578500648075, "grad_norm": 7.83176851272583, "learning_rate": 8.978189681677493e-06, "loss": 2.789, "step": 4480 }, { "epoch": 0.7494250951206255, "grad_norm": 4.7844719886779785, "learning_rate": 8.96698801095914e-06, "loss": 2.2238, "step": 4481 }, { "epoch": 0.7495923401764435, "grad_norm": 3.881502389907837, "learning_rate": 8.95579180522444e-06, "loss": 2.8269, "step": 4482 }, { "epoch": 0.7497595852322616, "grad_norm": 5.491853713989258, "learning_rate": 8.944601068289687e-06, "loss": 2.4295, "step": 4483 }, { "epoch": 0.7499268302880796, "grad_norm": 3.917844772338867, "learning_rate": 8.933415803969358e-06, "loss": 2.7864, "step": 4484 }, { "epoch": 0.7500940753438976, "grad_norm": 5.839964866638184, "learning_rate": 8.922236016076024e-06, "loss": 3.5145, "step": 4485 }, { "epoch": 0.7502613203997157, "grad_norm": 11.443772315979004, "learning_rate": 8.911061708420415e-06, "loss": 2.2477, "step": 4486 }, { "epoch": 0.7504285654555337, "grad_norm": 5.590603351593018, "learning_rate": 8.899892884811395e-06, "loss": 2.6435, "step": 4487 }, { "epoch": 0.7505958105113517, "grad_norm": 8.937637329101562, "learning_rate": 8.888729549055933e-06, "loss": 3.2845, "step": 4488 }, { "epoch": 0.7507630555671698, "grad_norm": 7.387848377227783, "learning_rate": 8.877571704959159e-06, "loss": 3.2458, "step": 4489 }, { "epoch": 0.7509303006229878, "grad_norm": 5.535523414611816, "learning_rate": 8.866419356324298e-06, "loss": 2.3153, "step": 4490 }, { "epoch": 0.7510975456788058, "grad_norm": 5.061375141143799, "learning_rate": 8.855272506952739e-06, "loss": 2.6639, "step": 4491 }, { "epoch": 0.7512647907346239, "grad_norm": 5.189950942993164, "learning_rate": 8.844131160643956e-06, "loss": 2.6321, "step": 4492 }, { "epoch": 0.7514320357904419, "grad_norm": 4.517870903015137, "learning_rate": 8.832995321195591e-06, "loss": 3.1117, "step": 4493 }, { "epoch": 0.7515992808462599, "grad_norm": 3.996547222137451, "learning_rate": 8.821864992403356e-06, "loss": 2.8388, "step": 4494 }, { "epoch": 0.751766525902078, "grad_norm": 4.208589553833008, "learning_rate": 8.81074017806115e-06, "loss": 2.4865, "step": 4495 }, { "epoch": 0.751933770957896, "grad_norm": 4.1053972244262695, "learning_rate": 8.799620881960935e-06, "loss": 2.6125, "step": 4496 }, { "epoch": 0.752101016013714, "grad_norm": 6.011171340942383, "learning_rate": 8.788507107892824e-06, "loss": 3.0199, "step": 4497 }, { "epoch": 0.7522682610695322, "grad_norm": 3.6976587772369385, "learning_rate": 8.777398859645029e-06, "loss": 2.648, "step": 4498 }, { "epoch": 0.7524355061253502, "grad_norm": 4.926847457885742, "learning_rate": 8.7662961410039e-06, "loss": 2.3542, "step": 4499 }, { "epoch": 0.7526027511811683, "grad_norm": 4.00570011138916, "learning_rate": 8.755198955753873e-06, "loss": 2.9182, "step": 4500 }, { "epoch": 0.7527699962369863, "grad_norm": 4.783298492431641, "learning_rate": 8.744107307677533e-06, "loss": 2.6898, "step": 4501 }, { "epoch": 0.7529372412928043, "grad_norm": 2.992678642272949, "learning_rate": 8.733021200555542e-06, "loss": 2.2824, "step": 4502 }, { "epoch": 0.7531044863486224, "grad_norm": 11.25572395324707, "learning_rate": 8.721940638166697e-06, "loss": 3.3966, "step": 4503 }, { "epoch": 0.7532717314044404, "grad_norm": 5.121528625488281, "learning_rate": 8.710865624287911e-06, "loss": 2.9347, "step": 4504 }, { "epoch": 0.7534389764602584, "grad_norm": 5.838565349578857, "learning_rate": 8.699796162694174e-06, "loss": 2.4945, "step": 4505 }, { "epoch": 0.7536062215160765, "grad_norm": 3.255167245864868, "learning_rate": 8.688732257158616e-06, "loss": 2.6665, "step": 4506 }, { "epoch": 0.7537734665718945, "grad_norm": 7.149387836456299, "learning_rate": 8.67767391145245e-06, "loss": 2.6027, "step": 4507 }, { "epoch": 0.7539407116277125, "grad_norm": 3.302642345428467, "learning_rate": 8.666621129345018e-06, "loss": 2.6484, "step": 4508 }, { "epoch": 0.7541079566835306, "grad_norm": 2.8176112174987793, "learning_rate": 8.655573914603732e-06, "loss": 2.4981, "step": 4509 }, { "epoch": 0.7542752017393486, "grad_norm": 3.236888885498047, "learning_rate": 8.644532270994147e-06, "loss": 2.9128, "step": 4510 }, { "epoch": 0.7544424467951666, "grad_norm": 4.177479267120361, "learning_rate": 8.633496202279867e-06, "loss": 2.5329, "step": 4511 }, { "epoch": 0.7546096918509847, "grad_norm": 6.739837646484375, "learning_rate": 8.622465712222666e-06, "loss": 2.4002, "step": 4512 }, { "epoch": 0.7547769369068027, "grad_norm": 4.870169162750244, "learning_rate": 8.611440804582358e-06, "loss": 3.0854, "step": 4513 }, { "epoch": 0.7549441819626207, "grad_norm": 6.383517742156982, "learning_rate": 8.60042148311686e-06, "loss": 2.9032, "step": 4514 }, { "epoch": 0.7551114270184388, "grad_norm": 3.4202370643615723, "learning_rate": 8.589407751582224e-06, "loss": 2.9286, "step": 4515 }, { "epoch": 0.7552786720742568, "grad_norm": 6.87025785446167, "learning_rate": 8.57839961373255e-06, "loss": 2.9356, "step": 4516 }, { "epoch": 0.7554459171300748, "grad_norm": 4.05291223526001, "learning_rate": 8.567397073320063e-06, "loss": 2.577, "step": 4517 }, { "epoch": 0.7556131621858929, "grad_norm": 5.036474704742432, "learning_rate": 8.55640013409506e-06, "loss": 3.0391, "step": 4518 }, { "epoch": 0.7557804072417109, "grad_norm": 6.138881206512451, "learning_rate": 8.545408799805946e-06, "loss": 2.3917, "step": 4519 }, { "epoch": 0.7559476522975289, "grad_norm": 5.776637077331543, "learning_rate": 8.534423074199202e-06, "loss": 3.1436, "step": 4520 }, { "epoch": 0.756114897353347, "grad_norm": 3.280256986618042, "learning_rate": 8.523442961019415e-06, "loss": 2.4632, "step": 4521 }, { "epoch": 0.756282142409165, "grad_norm": 4.652491092681885, "learning_rate": 8.51246846400923e-06, "loss": 2.8871, "step": 4522 }, { "epoch": 0.756449387464983, "grad_norm": 4.229855537414551, "learning_rate": 8.50149958690941e-06, "loss": 2.7499, "step": 4523 }, { "epoch": 0.7566166325208011, "grad_norm": 5.762433052062988, "learning_rate": 8.49053633345877e-06, "loss": 3.0765, "step": 4524 }, { "epoch": 0.7567838775766191, "grad_norm": 3.8169548511505127, "learning_rate": 8.479578707394242e-06, "loss": 2.7263, "step": 4525 }, { "epoch": 0.7569511226324371, "grad_norm": 3.483090400695801, "learning_rate": 8.468626712450806e-06, "loss": 2.5149, "step": 4526 }, { "epoch": 0.7571183676882552, "grad_norm": 7.895653247833252, "learning_rate": 8.457680352361546e-06, "loss": 3.1532, "step": 4527 }, { "epoch": 0.7572856127440732, "grad_norm": 5.585296630859375, "learning_rate": 8.44673963085763e-06, "loss": 2.6851, "step": 4528 }, { "epoch": 0.7574528577998912, "grad_norm": 4.800823211669922, "learning_rate": 8.435804551668275e-06, "loss": 2.5677, "step": 4529 }, { "epoch": 0.7576201028557094, "grad_norm": 5.308714866638184, "learning_rate": 8.424875118520809e-06, "loss": 3.0149, "step": 4530 }, { "epoch": 0.7577873479115274, "grad_norm": 3.2263245582580566, "learning_rate": 8.413951335140599e-06, "loss": 2.6202, "step": 4531 }, { "epoch": 0.7579545929673454, "grad_norm": 9.412346839904785, "learning_rate": 8.403033205251129e-06, "loss": 2.7758, "step": 4532 }, { "epoch": 0.7581218380231635, "grad_norm": 3.457900285720825, "learning_rate": 8.39212073257391e-06, "loss": 2.9188, "step": 4533 }, { "epoch": 0.7582890830789815, "grad_norm": 10.533369064331055, "learning_rate": 8.381213920828568e-06, "loss": 2.8665, "step": 4534 }, { "epoch": 0.7584563281347995, "grad_norm": 4.014355659484863, "learning_rate": 8.370312773732764e-06, "loss": 2.4392, "step": 4535 }, { "epoch": 0.7586235731906176, "grad_norm": 3.6115167140960693, "learning_rate": 8.359417295002248e-06, "loss": 2.4923, "step": 4536 }, { "epoch": 0.7587908182464356, "grad_norm": 5.155210971832275, "learning_rate": 8.348527488350837e-06, "loss": 2.6385, "step": 4537 }, { "epoch": 0.7589580633022537, "grad_norm": 4.697580814361572, "learning_rate": 8.337643357490415e-06, "loss": 2.4869, "step": 4538 }, { "epoch": 0.7591253083580717, "grad_norm": 4.737175941467285, "learning_rate": 8.326764906130915e-06, "loss": 2.8847, "step": 4539 }, { "epoch": 0.7592925534138897, "grad_norm": 6.350038051605225, "learning_rate": 8.31589213798036e-06, "loss": 2.6612, "step": 4540 }, { "epoch": 0.7594597984697078, "grad_norm": 3.9859166145324707, "learning_rate": 8.305025056744808e-06, "loss": 2.3453, "step": 4541 }, { "epoch": 0.7596270435255258, "grad_norm": 5.238142967224121, "learning_rate": 8.294163666128407e-06, "loss": 2.5324, "step": 4542 }, { "epoch": 0.7597942885813438, "grad_norm": 2.9059624671936035, "learning_rate": 8.283307969833334e-06, "loss": 2.6306, "step": 4543 }, { "epoch": 0.7599615336371619, "grad_norm": 3.75927472114563, "learning_rate": 8.272457971559855e-06, "loss": 2.6931, "step": 4544 }, { "epoch": 0.7601287786929799, "grad_norm": 6.305368423461914, "learning_rate": 8.261613675006286e-06, "loss": 2.593, "step": 4545 }, { "epoch": 0.7602960237487979, "grad_norm": 4.151636600494385, "learning_rate": 8.250775083868975e-06, "loss": 2.6429, "step": 4546 }, { "epoch": 0.760463268804616, "grad_norm": 6.180394172668457, "learning_rate": 8.239942201842365e-06, "loss": 3.0777, "step": 4547 }, { "epoch": 0.760630513860434, "grad_norm": 3.4459009170532227, "learning_rate": 8.229115032618917e-06, "loss": 2.8868, "step": 4548 }, { "epoch": 0.760797758916252, "grad_norm": 4.785977840423584, "learning_rate": 8.218293579889174e-06, "loss": 2.5039, "step": 4549 }, { "epoch": 0.7609650039720701, "grad_norm": 5.154587745666504, "learning_rate": 8.207477847341702e-06, "loss": 2.3191, "step": 4550 }, { "epoch": 0.7611322490278881, "grad_norm": 8.859968185424805, "learning_rate": 8.196667838663149e-06, "loss": 3.0014, "step": 4551 }, { "epoch": 0.7612994940837061, "grad_norm": 8.167197227478027, "learning_rate": 8.185863557538168e-06, "loss": 2.7059, "step": 4552 }, { "epoch": 0.7614667391395242, "grad_norm": 5.207485198974609, "learning_rate": 8.175065007649524e-06, "loss": 2.8019, "step": 4553 }, { "epoch": 0.7616339841953422, "grad_norm": 4.815952301025391, "learning_rate": 8.164272192677963e-06, "loss": 2.7649, "step": 4554 }, { "epoch": 0.7618012292511602, "grad_norm": 5.141364097595215, "learning_rate": 8.15348511630232e-06, "loss": 2.5684, "step": 4555 }, { "epoch": 0.7619684743069783, "grad_norm": 3.6281042098999023, "learning_rate": 8.142703782199449e-06, "loss": 2.5355, "step": 4556 }, { "epoch": 0.7621357193627963, "grad_norm": 7.751478672027588, "learning_rate": 8.131928194044263e-06, "loss": 2.3566, "step": 4557 }, { "epoch": 0.7623029644186143, "grad_norm": 5.187582015991211, "learning_rate": 8.1211583555097e-06, "loss": 2.0654, "step": 4558 }, { "epoch": 0.7624702094744324, "grad_norm": 3.591481924057007, "learning_rate": 8.110394270266763e-06, "loss": 2.6745, "step": 4559 }, { "epoch": 0.7626374545302504, "grad_norm": 3.4844815731048584, "learning_rate": 8.09963594198446e-06, "loss": 2.6389, "step": 4560 }, { "epoch": 0.7628046995860684, "grad_norm": 6.057557582855225, "learning_rate": 8.088883374329865e-06, "loss": 2.3648, "step": 4561 }, { "epoch": 0.7629719446418866, "grad_norm": 4.802995204925537, "learning_rate": 8.078136570968086e-06, "loss": 2.5329, "step": 4562 }, { "epoch": 0.7631391896977046, "grad_norm": 5.128208160400391, "learning_rate": 8.067395535562247e-06, "loss": 2.413, "step": 4563 }, { "epoch": 0.7633064347535226, "grad_norm": 6.9774489402771, "learning_rate": 8.05666027177353e-06, "loss": 3.0908, "step": 4564 }, { "epoch": 0.7634736798093407, "grad_norm": 5.108948707580566, "learning_rate": 8.045930783261121e-06, "loss": 2.6416, "step": 4565 }, { "epoch": 0.7636409248651587, "grad_norm": 9.483901977539062, "learning_rate": 8.035207073682274e-06, "loss": 3.1998, "step": 4566 }, { "epoch": 0.7638081699209767, "grad_norm": 9.151754379272461, "learning_rate": 8.024489146692235e-06, "loss": 3.4174, "step": 4567 }, { "epoch": 0.7639754149767948, "grad_norm": 5.733241081237793, "learning_rate": 8.013777005944306e-06, "loss": 2.8163, "step": 4568 }, { "epoch": 0.7641426600326128, "grad_norm": 6.957686901092529, "learning_rate": 8.003070655089805e-06, "loss": 2.5026, "step": 4569 }, { "epoch": 0.7643099050884308, "grad_norm": 5.6144938468933105, "learning_rate": 7.992370097778093e-06, "loss": 2.8122, "step": 4570 }, { "epoch": 0.7644771501442489, "grad_norm": 5.422502040863037, "learning_rate": 7.981675337656525e-06, "loss": 2.371, "step": 4571 }, { "epoch": 0.7646443952000669, "grad_norm": 6.778336048126221, "learning_rate": 7.970986378370509e-06, "loss": 2.5762, "step": 4572 }, { "epoch": 0.7648116402558849, "grad_norm": 11.074893951416016, "learning_rate": 7.960303223563461e-06, "loss": 2.5964, "step": 4573 }, { "epoch": 0.764978885311703, "grad_norm": 4.085208892822266, "learning_rate": 7.949625876876816e-06, "loss": 2.6198, "step": 4574 }, { "epoch": 0.765146130367521, "grad_norm": 4.406464576721191, "learning_rate": 7.938954341950044e-06, "loss": 2.4434, "step": 4575 }, { "epoch": 0.765313375423339, "grad_norm": 4.864837646484375, "learning_rate": 7.928288622420618e-06, "loss": 3.1098, "step": 4576 }, { "epoch": 0.7654806204791571, "grad_norm": 3.529651641845703, "learning_rate": 7.917628721924036e-06, "loss": 2.7801, "step": 4577 }, { "epoch": 0.7656478655349751, "grad_norm": 16.455299377441406, "learning_rate": 7.906974644093817e-06, "loss": 3.4651, "step": 4578 }, { "epoch": 0.7658151105907932, "grad_norm": 7.467899322509766, "learning_rate": 7.896326392561496e-06, "loss": 2.8794, "step": 4579 }, { "epoch": 0.7659823556466112, "grad_norm": 3.4963080883026123, "learning_rate": 7.885683970956607e-06, "loss": 2.4148, "step": 4580 }, { "epoch": 0.7661496007024292, "grad_norm": 2.9815640449523926, "learning_rate": 7.875047382906714e-06, "loss": 2.5095, "step": 4581 }, { "epoch": 0.7663168457582473, "grad_norm": 5.691580772399902, "learning_rate": 7.864416632037375e-06, "loss": 2.7779, "step": 4582 }, { "epoch": 0.7664840908140653, "grad_norm": 4.453370571136475, "learning_rate": 7.853791721972181e-06, "loss": 2.6917, "step": 4583 }, { "epoch": 0.7666513358698833, "grad_norm": 5.651138782501221, "learning_rate": 7.843172656332708e-06, "loss": 2.7844, "step": 4584 }, { "epoch": 0.7668185809257014, "grad_norm": 8.49561882019043, "learning_rate": 7.83255943873856e-06, "loss": 3.1243, "step": 4585 }, { "epoch": 0.7669858259815194, "grad_norm": 4.337004661560059, "learning_rate": 7.82195207280734e-06, "loss": 2.3504, "step": 4586 }, { "epoch": 0.7671530710373374, "grad_norm": 6.301113605499268, "learning_rate": 7.811350562154648e-06, "loss": 3.0545, "step": 4587 }, { "epoch": 0.7673203160931555, "grad_norm": 10.498684883117676, "learning_rate": 7.800754910394106e-06, "loss": 2.6556, "step": 4588 }, { "epoch": 0.7674875611489735, "grad_norm": 8.494911193847656, "learning_rate": 7.790165121137315e-06, "loss": 3.2392, "step": 4589 }, { "epoch": 0.7676548062047915, "grad_norm": 4.924398899078369, "learning_rate": 7.779581197993913e-06, "loss": 2.5172, "step": 4590 }, { "epoch": 0.7678220512606097, "grad_norm": 4.684637069702148, "learning_rate": 7.76900314457149e-06, "loss": 3.0466, "step": 4591 }, { "epoch": 0.7679892963164276, "grad_norm": 4.687937259674072, "learning_rate": 7.758430964475685e-06, "loss": 2.692, "step": 4592 }, { "epoch": 0.7681565413722456, "grad_norm": 3.7278690338134766, "learning_rate": 7.74786466131009e-06, "loss": 2.8885, "step": 4593 }, { "epoch": 0.7683237864280638, "grad_norm": 7.127924919128418, "learning_rate": 7.737304238676343e-06, "loss": 2.62, "step": 4594 }, { "epoch": 0.7684910314838818, "grad_norm": 6.787526607513428, "learning_rate": 7.72674970017403e-06, "loss": 2.8117, "step": 4595 }, { "epoch": 0.7686582765396998, "grad_norm": 3.8452308177948, "learning_rate": 7.716201049400764e-06, "loss": 2.6174, "step": 4596 }, { "epoch": 0.7688255215955179, "grad_norm": 5.327746391296387, "learning_rate": 7.70565828995213e-06, "loss": 2.6843, "step": 4597 }, { "epoch": 0.7689927666513359, "grad_norm": 6.455427169799805, "learning_rate": 7.695121425421724e-06, "loss": 2.4999, "step": 4598 }, { "epoch": 0.7691600117071539, "grad_norm": 6.525650978088379, "learning_rate": 7.684590459401111e-06, "loss": 3.0576, "step": 4599 }, { "epoch": 0.769327256762972, "grad_norm": 4.974756240844727, "learning_rate": 7.67406539547987e-06, "loss": 2.3042, "step": 4600 }, { "epoch": 0.76949450181879, "grad_norm": 3.6960623264312744, "learning_rate": 7.663546237245542e-06, "loss": 2.8514, "step": 4601 }, { "epoch": 0.769661746874608, "grad_norm": 6.440914630889893, "learning_rate": 7.653032988283674e-06, "loss": 2.6291, "step": 4602 }, { "epoch": 0.7698289919304261, "grad_norm": 4.481714248657227, "learning_rate": 7.6425256521778e-06, "loss": 2.8678, "step": 4603 }, { "epoch": 0.7699962369862441, "grad_norm": 4.389357089996338, "learning_rate": 7.632024232509419e-06, "loss": 3.0835, "step": 4604 }, { "epoch": 0.7701634820420621, "grad_norm": 4.5098876953125, "learning_rate": 7.621528732858041e-06, "loss": 2.1851, "step": 4605 }, { "epoch": 0.7703307270978802, "grad_norm": 3.5349323749542236, "learning_rate": 7.611039156801125e-06, "loss": 2.3589, "step": 4606 }, { "epoch": 0.7704979721536982, "grad_norm": 5.259244441986084, "learning_rate": 7.60055550791415e-06, "loss": 2.4749, "step": 4607 }, { "epoch": 0.7706652172095162, "grad_norm": 3.7163751125335693, "learning_rate": 7.590077789770533e-06, "loss": 2.5412, "step": 4608 }, { "epoch": 0.7708324622653343, "grad_norm": 2.9577348232269287, "learning_rate": 7.579606005941708e-06, "loss": 2.6047, "step": 4609 }, { "epoch": 0.7709997073211523, "grad_norm": 8.639564514160156, "learning_rate": 7.569140159997043e-06, "loss": 2.7466, "step": 4610 }, { "epoch": 0.7711669523769703, "grad_norm": 3.7244315147399902, "learning_rate": 7.558680255503942e-06, "loss": 2.5, "step": 4611 }, { "epoch": 0.7713341974327884, "grad_norm": 5.761275291442871, "learning_rate": 7.548226296027725e-06, "loss": 2.8127, "step": 4612 }, { "epoch": 0.7715014424886064, "grad_norm": 3.296964645385742, "learning_rate": 7.537778285131722e-06, "loss": 2.8554, "step": 4613 }, { "epoch": 0.7716686875444244, "grad_norm": 7.845616340637207, "learning_rate": 7.52733622637721e-06, "loss": 2.357, "step": 4614 }, { "epoch": 0.7718359326002425, "grad_norm": 4.375486850738525, "learning_rate": 7.516900123323467e-06, "loss": 2.6266, "step": 4615 }, { "epoch": 0.7720031776560605, "grad_norm": 4.988513469696045, "learning_rate": 7.506469979527708e-06, "loss": 2.3344, "step": 4616 }, { "epoch": 0.7721704227118786, "grad_norm": 6.285513877868652, "learning_rate": 7.496045798545148e-06, "loss": 2.4319, "step": 4617 }, { "epoch": 0.7723376677676966, "grad_norm": 7.322163105010986, "learning_rate": 7.485627583928939e-06, "loss": 3.0284, "step": 4618 }, { "epoch": 0.7725049128235146, "grad_norm": 4.199641704559326, "learning_rate": 7.475215339230223e-06, "loss": 2.3889, "step": 4619 }, { "epoch": 0.7726721578793327, "grad_norm": 6.597107887268066, "learning_rate": 7.464809067998107e-06, "loss": 2.7189, "step": 4620 }, { "epoch": 0.7728394029351507, "grad_norm": 4.6346282958984375, "learning_rate": 7.454408773779639e-06, "loss": 2.7993, "step": 4621 }, { "epoch": 0.7730066479909687, "grad_norm": 5.66965389251709, "learning_rate": 7.444014460119861e-06, "loss": 2.7294, "step": 4622 }, { "epoch": 0.7731738930467869, "grad_norm": 6.750359058380127, "learning_rate": 7.433626130561741e-06, "loss": 2.6584, "step": 4623 }, { "epoch": 0.7733411381026049, "grad_norm": 5.531951427459717, "learning_rate": 7.423243788646245e-06, "loss": 2.7987, "step": 4624 }, { "epoch": 0.7735083831584229, "grad_norm": 3.923097610473633, "learning_rate": 7.4128674379122606e-06, "loss": 2.4984, "step": 4625 }, { "epoch": 0.773675628214241, "grad_norm": 5.409636497497559, "learning_rate": 7.402497081896661e-06, "loss": 2.3961, "step": 4626 }, { "epoch": 0.773842873270059, "grad_norm": 5.900990962982178, "learning_rate": 7.3921327241342704e-06, "loss": 2.7798, "step": 4627 }, { "epoch": 0.774010118325877, "grad_norm": 4.413610935211182, "learning_rate": 7.38177436815787e-06, "loss": 2.6565, "step": 4628 }, { "epoch": 0.7741773633816951, "grad_norm": 4.974040985107422, "learning_rate": 7.371422017498175e-06, "loss": 2.4988, "step": 4629 }, { "epoch": 0.7743446084375131, "grad_norm": 7.906752586364746, "learning_rate": 7.361075675683887e-06, "loss": 2.8624, "step": 4630 }, { "epoch": 0.7745118534933311, "grad_norm": 5.544703006744385, "learning_rate": 7.350735346241622e-06, "loss": 3.0604, "step": 4631 }, { "epoch": 0.7746790985491492, "grad_norm": 10.452054977416992, "learning_rate": 7.340401032695982e-06, "loss": 3.2181, "step": 4632 }, { "epoch": 0.7748463436049672, "grad_norm": 5.394006729125977, "learning_rate": 7.330072738569499e-06, "loss": 2.5416, "step": 4633 }, { "epoch": 0.7750135886607852, "grad_norm": 4.044349670410156, "learning_rate": 7.319750467382636e-06, "loss": 2.4129, "step": 4634 }, { "epoch": 0.7751808337166033, "grad_norm": 3.682363510131836, "learning_rate": 7.30943422265386e-06, "loss": 2.6279, "step": 4635 }, { "epoch": 0.7753480787724213, "grad_norm": 5.334381103515625, "learning_rate": 7.299124007899519e-06, "loss": 3.1376, "step": 4636 }, { "epoch": 0.7755153238282393, "grad_norm": 4.987429618835449, "learning_rate": 7.288819826633955e-06, "loss": 3.0218, "step": 4637 }, { "epoch": 0.7756825688840574, "grad_norm": 4.04842472076416, "learning_rate": 7.278521682369413e-06, "loss": 2.7533, "step": 4638 }, { "epoch": 0.7758498139398754, "grad_norm": 16.16722297668457, "learning_rate": 7.268229578616118e-06, "loss": 3.0268, "step": 4639 }, { "epoch": 0.7760170589956934, "grad_norm": 7.779079437255859, "learning_rate": 7.257943518882202e-06, "loss": 2.4309, "step": 4640 }, { "epoch": 0.7761843040515115, "grad_norm": 4.3412909507751465, "learning_rate": 7.247663506673766e-06, "loss": 2.9788, "step": 4641 }, { "epoch": 0.7763515491073295, "grad_norm": 7.474921703338623, "learning_rate": 7.237389545494824e-06, "loss": 2.5818, "step": 4642 }, { "epoch": 0.7765187941631475, "grad_norm": 7.3992767333984375, "learning_rate": 7.227121638847348e-06, "loss": 2.7403, "step": 4643 }, { "epoch": 0.7766860392189656, "grad_norm": 4.516554355621338, "learning_rate": 7.216859790231242e-06, "loss": 2.5794, "step": 4644 }, { "epoch": 0.7768532842747836, "grad_norm": 3.772817611694336, "learning_rate": 7.206604003144329e-06, "loss": 2.8867, "step": 4645 }, { "epoch": 0.7770205293306016, "grad_norm": 2.983074188232422, "learning_rate": 7.196354281082396e-06, "loss": 2.6766, "step": 4646 }, { "epoch": 0.7771877743864197, "grad_norm": 5.753443241119385, "learning_rate": 7.186110627539122e-06, "loss": 2.4726, "step": 4647 }, { "epoch": 0.7773550194422377, "grad_norm": 5.968169212341309, "learning_rate": 7.175873046006162e-06, "loss": 3.2446, "step": 4648 }, { "epoch": 0.7775222644980557, "grad_norm": 6.613035202026367, "learning_rate": 7.165641539973064e-06, "loss": 3.5763, "step": 4649 }, { "epoch": 0.7776895095538738, "grad_norm": 3.261693239212036, "learning_rate": 7.155416112927333e-06, "loss": 2.7293, "step": 4650 }, { "epoch": 0.7778567546096918, "grad_norm": 10.443603515625, "learning_rate": 7.145196768354367e-06, "loss": 2.1909, "step": 4651 }, { "epoch": 0.7780239996655098, "grad_norm": 3.7787790298461914, "learning_rate": 7.134983509737544e-06, "loss": 2.7553, "step": 4652 }, { "epoch": 0.778191244721328, "grad_norm": 4.04592752456665, "learning_rate": 7.12477634055812e-06, "loss": 2.4576, "step": 4653 }, { "epoch": 0.778358489777146, "grad_norm": 9.40493392944336, "learning_rate": 7.114575264295298e-06, "loss": 2.912, "step": 4654 }, { "epoch": 0.7785257348329641, "grad_norm": 2.9068284034729004, "learning_rate": 7.10438028442619e-06, "loss": 2.4421, "step": 4655 }, { "epoch": 0.778692979888782, "grad_norm": 3.663703441619873, "learning_rate": 7.094191404425854e-06, "loss": 2.4624, "step": 4656 }, { "epoch": 0.7788602249446, "grad_norm": 5.246114253997803, "learning_rate": 7.084008627767233e-06, "loss": 2.3589, "step": 4657 }, { "epoch": 0.7790274700004182, "grad_norm": 3.533069372177124, "learning_rate": 7.0738319579212285e-06, "loss": 2.3177, "step": 4658 }, { "epoch": 0.7791947150562362, "grad_norm": 5.709691047668457, "learning_rate": 7.063661398356628e-06, "loss": 2.7893, "step": 4659 }, { "epoch": 0.7793619601120542, "grad_norm": 6.103975296020508, "learning_rate": 7.053496952540153e-06, "loss": 2.7787, "step": 4660 }, { "epoch": 0.7795292051678723, "grad_norm": 6.692827224731445, "learning_rate": 7.043338623936452e-06, "loss": 2.7658, "step": 4661 }, { "epoch": 0.7796964502236903, "grad_norm": 6.106437683105469, "learning_rate": 7.033186416008053e-06, "loss": 2.5136, "step": 4662 }, { "epoch": 0.7798636952795083, "grad_norm": 3.6255297660827637, "learning_rate": 7.023040332215438e-06, "loss": 2.7699, "step": 4663 }, { "epoch": 0.7800309403353264, "grad_norm": 8.227151870727539, "learning_rate": 7.01290037601697e-06, "loss": 3.2665, "step": 4664 }, { "epoch": 0.7801981853911444, "grad_norm": 4.61199951171875, "learning_rate": 7.0027665508689475e-06, "loss": 2.7443, "step": 4665 }, { "epoch": 0.7803654304469624, "grad_norm": 5.6063079833984375, "learning_rate": 6.992638860225556e-06, "loss": 2.6419, "step": 4666 }, { "epoch": 0.7805326755027805, "grad_norm": 5.509608268737793, "learning_rate": 6.982517307538905e-06, "loss": 2.7237, "step": 4667 }, { "epoch": 0.7806999205585985, "grad_norm": 6.710029602050781, "learning_rate": 6.972401896259012e-06, "loss": 3.1657, "step": 4668 }, { "epoch": 0.7808671656144165, "grad_norm": 5.401461601257324, "learning_rate": 6.962292629833805e-06, "loss": 2.5857, "step": 4669 }, { "epoch": 0.7810344106702346, "grad_norm": 7.383265018463135, "learning_rate": 6.952189511709095e-06, "loss": 2.8455, "step": 4670 }, { "epoch": 0.7812016557260526, "grad_norm": 4.217738151550293, "learning_rate": 6.94209254532863e-06, "loss": 3.06, "step": 4671 }, { "epoch": 0.7813689007818706, "grad_norm": 3.1575822830200195, "learning_rate": 6.932001734134025e-06, "loss": 2.3661, "step": 4672 }, { "epoch": 0.7815361458376887, "grad_norm": 7.785317897796631, "learning_rate": 6.921917081564832e-06, "loss": 3.1621, "step": 4673 }, { "epoch": 0.7817033908935067, "grad_norm": 4.319643974304199, "learning_rate": 6.911838591058473e-06, "loss": 2.407, "step": 4674 }, { "epoch": 0.7818706359493247, "grad_norm": 2.3820106983184814, "learning_rate": 6.901766266050292e-06, "loss": 2.527, "step": 4675 }, { "epoch": 0.7820378810051428, "grad_norm": 3.5268468856811523, "learning_rate": 6.891700109973531e-06, "loss": 2.1844, "step": 4676 }, { "epoch": 0.7822051260609608, "grad_norm": 6.073051929473877, "learning_rate": 6.881640126259306e-06, "loss": 2.5644, "step": 4677 }, { "epoch": 0.7823723711167788, "grad_norm": 6.599061489105225, "learning_rate": 6.871586318336662e-06, "loss": 3.1578, "step": 4678 }, { "epoch": 0.7825396161725969, "grad_norm": 7.561093330383301, "learning_rate": 6.861538689632504e-06, "loss": 3.2591, "step": 4679 }, { "epoch": 0.7827068612284149, "grad_norm": 3.8271028995513916, "learning_rate": 6.851497243571664e-06, "loss": 2.5567, "step": 4680 }, { "epoch": 0.7828741062842329, "grad_norm": 4.436853408813477, "learning_rate": 6.841461983576842e-06, "loss": 2.4604, "step": 4681 }, { "epoch": 0.783041351340051, "grad_norm": 3.4893641471862793, "learning_rate": 6.831432913068644e-06, "loss": 2.8403, "step": 4682 }, { "epoch": 0.783208596395869, "grad_norm": 4.555424213409424, "learning_rate": 6.821410035465553e-06, "loss": 2.6631, "step": 4683 }, { "epoch": 0.783375841451687, "grad_norm": 6.50808572769165, "learning_rate": 6.8113933541839535e-06, "loss": 2.556, "step": 4684 }, { "epoch": 0.7835430865075051, "grad_norm": 11.72445297241211, "learning_rate": 6.801382872638115e-06, "loss": 3.125, "step": 4685 }, { "epoch": 0.7837103315633231, "grad_norm": 4.594075679779053, "learning_rate": 6.7913785942402e-06, "loss": 2.6031, "step": 4686 }, { "epoch": 0.7838775766191411, "grad_norm": 5.292758941650391, "learning_rate": 6.781380522400233e-06, "loss": 2.461, "step": 4687 }, { "epoch": 0.7840448216749593, "grad_norm": 3.927380323410034, "learning_rate": 6.771388660526154e-06, "loss": 2.847, "step": 4688 }, { "epoch": 0.7842120667307773, "grad_norm": 5.624163627624512, "learning_rate": 6.761403012023754e-06, "loss": 2.8145, "step": 4689 }, { "epoch": 0.7843793117865953, "grad_norm": 2.845022678375244, "learning_rate": 6.751423580296743e-06, "loss": 2.4452, "step": 4690 }, { "epoch": 0.7845465568424134, "grad_norm": 3.287027597427368, "learning_rate": 6.741450368746679e-06, "loss": 2.7246, "step": 4691 }, { "epoch": 0.7847138018982314, "grad_norm": 6.018945217132568, "learning_rate": 6.7314833807730015e-06, "loss": 2.9549, "step": 4692 }, { "epoch": 0.7848810469540494, "grad_norm": 4.042255401611328, "learning_rate": 6.721522619773068e-06, "loss": 2.6022, "step": 4693 }, { "epoch": 0.7850482920098675, "grad_norm": 9.778560638427734, "learning_rate": 6.711568089142068e-06, "loss": 3.4845, "step": 4694 }, { "epoch": 0.7852155370656855, "grad_norm": 4.870809078216553, "learning_rate": 6.701619792273095e-06, "loss": 2.4661, "step": 4695 }, { "epoch": 0.7853827821215036, "grad_norm": 6.3645219802856445, "learning_rate": 6.691677732557094e-06, "loss": 3.0154, "step": 4696 }, { "epoch": 0.7855500271773216, "grad_norm": 4.786750793457031, "learning_rate": 6.68174191338291e-06, "loss": 2.5282, "step": 4697 }, { "epoch": 0.7857172722331396, "grad_norm": 4.830770969390869, "learning_rate": 6.6718123381372395e-06, "loss": 2.7935, "step": 4698 }, { "epoch": 0.7858845172889577, "grad_norm": 5.806929588317871, "learning_rate": 6.66188901020467e-06, "loss": 2.9375, "step": 4699 }, { "epoch": 0.7860517623447757, "grad_norm": 6.722040176391602, "learning_rate": 6.651971932967635e-06, "loss": 3.2612, "step": 4700 }, { "epoch": 0.7862190074005937, "grad_norm": 9.898653984069824, "learning_rate": 6.642061109806461e-06, "loss": 3.0492, "step": 4701 }, { "epoch": 0.7863862524564118, "grad_norm": 5.465662956237793, "learning_rate": 6.6321565440993355e-06, "loss": 2.8446, "step": 4702 }, { "epoch": 0.7865534975122298, "grad_norm": 4.512746334075928, "learning_rate": 6.622258239222304e-06, "loss": 3.1065, "step": 4703 }, { "epoch": 0.7867207425680478, "grad_norm": 3.7123422622680664, "learning_rate": 6.612366198549294e-06, "loss": 2.6226, "step": 4704 }, { "epoch": 0.7868879876238659, "grad_norm": 4.528548717498779, "learning_rate": 6.602480425452074e-06, "loss": 3.1079, "step": 4705 }, { "epoch": 0.7870552326796839, "grad_norm": 6.820843696594238, "learning_rate": 6.59260092330031e-06, "loss": 3.0353, "step": 4706 }, { "epoch": 0.7872224777355019, "grad_norm": 3.722944498062134, "learning_rate": 6.5827276954614905e-06, "loss": 2.5084, "step": 4707 }, { "epoch": 0.78738972279132, "grad_norm": 4.212715148925781, "learning_rate": 6.572860745300999e-06, "loss": 2.6961, "step": 4708 }, { "epoch": 0.787556967847138, "grad_norm": 7.582869052886963, "learning_rate": 6.563000076182063e-06, "loss": 2.9705, "step": 4709 }, { "epoch": 0.787724212902956, "grad_norm": 4.422638416290283, "learning_rate": 6.553145691465784e-06, "loss": 2.8501, "step": 4710 }, { "epoch": 0.7878914579587741, "grad_norm": 3.4505317211151123, "learning_rate": 6.543297594511089e-06, "loss": 2.5779, "step": 4711 }, { "epoch": 0.7880587030145921, "grad_norm": 8.016464233398438, "learning_rate": 6.533455788674803e-06, "loss": 3.326, "step": 4712 }, { "epoch": 0.7882259480704101, "grad_norm": 6.025285720825195, "learning_rate": 6.523620277311565e-06, "loss": 2.9694, "step": 4713 }, { "epoch": 0.7883931931262282, "grad_norm": 5.952260971069336, "learning_rate": 6.513791063773913e-06, "loss": 2.5722, "step": 4714 }, { "epoch": 0.7885604381820462, "grad_norm": 8.738877296447754, "learning_rate": 6.5039681514121934e-06, "loss": 3.442, "step": 4715 }, { "epoch": 0.7887276832378642, "grad_norm": 6.518337726593018, "learning_rate": 6.494151543574645e-06, "loss": 3.2428, "step": 4716 }, { "epoch": 0.7888949282936824, "grad_norm": 4.333138942718506, "learning_rate": 6.48434124360732e-06, "loss": 2.8467, "step": 4717 }, { "epoch": 0.7890621733495004, "grad_norm": 5.157438278198242, "learning_rate": 6.474537254854152e-06, "loss": 2.9625, "step": 4718 }, { "epoch": 0.7892294184053184, "grad_norm": 2.846113920211792, "learning_rate": 6.464739580656915e-06, "loss": 2.3378, "step": 4719 }, { "epoch": 0.7893966634611365, "grad_norm": 7.147388935089111, "learning_rate": 6.4549482243552145e-06, "loss": 2.6188, "step": 4720 }, { "epoch": 0.7895639085169545, "grad_norm": 6.5098185539245605, "learning_rate": 6.445163189286527e-06, "loss": 2.4869, "step": 4721 }, { "epoch": 0.7897311535727725, "grad_norm": 3.653627634048462, "learning_rate": 6.435384478786149e-06, "loss": 2.5903, "step": 4722 }, { "epoch": 0.7898983986285906, "grad_norm": 4.774979114532471, "learning_rate": 6.425612096187247e-06, "loss": 2.2774, "step": 4723 }, { "epoch": 0.7900656436844086, "grad_norm": 3.8514161109924316, "learning_rate": 6.415846044820806e-06, "loss": 2.7565, "step": 4724 }, { "epoch": 0.7902328887402266, "grad_norm": 4.369378566741943, "learning_rate": 6.40608632801567e-06, "loss": 2.6845, "step": 4725 }, { "epoch": 0.7904001337960447, "grad_norm": 3.722034215927124, "learning_rate": 6.396332949098516e-06, "loss": 2.8602, "step": 4726 }, { "epoch": 0.7905673788518627, "grad_norm": 5.688478469848633, "learning_rate": 6.386585911393875e-06, "loss": 2.688, "step": 4727 }, { "epoch": 0.7907346239076807, "grad_norm": 4.68977689743042, "learning_rate": 6.376845218224089e-06, "loss": 2.0285, "step": 4728 }, { "epoch": 0.7909018689634988, "grad_norm": 15.299909591674805, "learning_rate": 6.367110872909368e-06, "loss": 4.1249, "step": 4729 }, { "epoch": 0.7910691140193168, "grad_norm": 4.444928169250488, "learning_rate": 6.3573828787677255e-06, "loss": 2.9217, "step": 4730 }, { "epoch": 0.7912363590751348, "grad_norm": 5.369839668273926, "learning_rate": 6.3476612391150465e-06, "loss": 3.1243, "step": 4731 }, { "epoch": 0.7914036041309529, "grad_norm": 3.841668128967285, "learning_rate": 6.337945957265015e-06, "loss": 2.6593, "step": 4732 }, { "epoch": 0.7915708491867709, "grad_norm": 4.017642498016357, "learning_rate": 6.328237036529172e-06, "loss": 2.434, "step": 4733 }, { "epoch": 0.791738094242589, "grad_norm": 4.077638149261475, "learning_rate": 6.318534480216892e-06, "loss": 2.6498, "step": 4734 }, { "epoch": 0.791905339298407, "grad_norm": 11.13046646118164, "learning_rate": 6.308838291635355e-06, "loss": 3.0549, "step": 4735 }, { "epoch": 0.792072584354225, "grad_norm": 11.517773628234863, "learning_rate": 6.2991484740896e-06, "loss": 3.3226, "step": 4736 }, { "epoch": 0.7922398294100431, "grad_norm": 10.233646392822266, "learning_rate": 6.289465030882469e-06, "loss": 2.1235, "step": 4737 }, { "epoch": 0.7924070744658611, "grad_norm": 5.457786560058594, "learning_rate": 6.279787965314654e-06, "loss": 2.3362, "step": 4738 }, { "epoch": 0.7925743195216791, "grad_norm": 3.4196786880493164, "learning_rate": 6.270117280684648e-06, "loss": 2.9958, "step": 4739 }, { "epoch": 0.7927415645774972, "grad_norm": 4.268697738647461, "learning_rate": 6.2604529802888e-06, "loss": 2.6184, "step": 4740 }, { "epoch": 0.7929088096333152, "grad_norm": 7.0807719230651855, "learning_rate": 6.250795067421244e-06, "loss": 2.358, "step": 4741 }, { "epoch": 0.7930760546891332, "grad_norm": 6.292612552642822, "learning_rate": 6.241143545373987e-06, "loss": 2.9566, "step": 4742 }, { "epoch": 0.7932432997449513, "grad_norm": 8.428791046142578, "learning_rate": 6.2314984174368065e-06, "loss": 3.0134, "step": 4743 }, { "epoch": 0.7934105448007693, "grad_norm": 4.38985538482666, "learning_rate": 6.221859686897341e-06, "loss": 2.8309, "step": 4744 }, { "epoch": 0.7935777898565873, "grad_norm": 8.09278392791748, "learning_rate": 6.212227357041015e-06, "loss": 2.6408, "step": 4745 }, { "epoch": 0.7937450349124054, "grad_norm": 2.257078170776367, "learning_rate": 6.202601431151101e-06, "loss": 2.2841, "step": 4746 }, { "epoch": 0.7939122799682234, "grad_norm": 5.640878677368164, "learning_rate": 6.192981912508658e-06, "loss": 2.8264, "step": 4747 }, { "epoch": 0.7940795250240414, "grad_norm": 6.310675144195557, "learning_rate": 6.183368804392597e-06, "loss": 2.8022, "step": 4748 }, { "epoch": 0.7942467700798596, "grad_norm": 8.047863960266113, "learning_rate": 6.173762110079609e-06, "loss": 2.8079, "step": 4749 }, { "epoch": 0.7944140151356776, "grad_norm": 5.499382495880127, "learning_rate": 6.164161832844217e-06, "loss": 3.1539, "step": 4750 }, { "epoch": 0.7945812601914956, "grad_norm": 4.682917594909668, "learning_rate": 6.154567975958769e-06, "loss": 2.0594, "step": 4751 }, { "epoch": 0.7947485052473137, "grad_norm": 7.531745910644531, "learning_rate": 6.144980542693393e-06, "loss": 3.0602, "step": 4752 }, { "epoch": 0.7949157503031317, "grad_norm": 5.1806817054748535, "learning_rate": 6.135399536316056e-06, "loss": 2.7724, "step": 4753 }, { "epoch": 0.7950829953589497, "grad_norm": 4.629027366638184, "learning_rate": 6.125824960092508e-06, "loss": 2.9023, "step": 4754 }, { "epoch": 0.7952502404147678, "grad_norm": 6.373608589172363, "learning_rate": 6.116256817286339e-06, "loss": 2.6488, "step": 4755 }, { "epoch": 0.7954174854705858, "grad_norm": 6.1713995933532715, "learning_rate": 6.106695111158914e-06, "loss": 2.8644, "step": 4756 }, { "epoch": 0.7955847305264038, "grad_norm": 3.28525447845459, "learning_rate": 6.097139844969432e-06, "loss": 2.5488, "step": 4757 }, { "epoch": 0.7957519755822219, "grad_norm": 5.60606050491333, "learning_rate": 6.087591021974864e-06, "loss": 2.5197, "step": 4758 }, { "epoch": 0.7959192206380399, "grad_norm": 4.913180351257324, "learning_rate": 6.078048645430032e-06, "loss": 2.7714, "step": 4759 }, { "epoch": 0.7960864656938579, "grad_norm": 4.3573832511901855, "learning_rate": 6.068512718587518e-06, "loss": 3.0198, "step": 4760 }, { "epoch": 0.796253710749676, "grad_norm": 6.0592474937438965, "learning_rate": 6.058983244697719e-06, "loss": 2.6691, "step": 4761 }, { "epoch": 0.796420955805494, "grad_norm": 4.472130298614502, "learning_rate": 6.049460227008841e-06, "loss": 2.6204, "step": 4762 }, { "epoch": 0.796588200861312, "grad_norm": 9.679295539855957, "learning_rate": 6.039943668766876e-06, "loss": 2.9972, "step": 4763 }, { "epoch": 0.7967554459171301, "grad_norm": 8.115713119506836, "learning_rate": 6.030433573215627e-06, "loss": 2.8676, "step": 4764 }, { "epoch": 0.7969226909729481, "grad_norm": 4.671666145324707, "learning_rate": 6.0209299435966815e-06, "loss": 2.3737, "step": 4765 }, { "epoch": 0.7970899360287661, "grad_norm": 4.5006632804870605, "learning_rate": 6.011432783149432e-06, "loss": 2.7233, "step": 4766 }, { "epoch": 0.7972571810845842, "grad_norm": 4.772523403167725, "learning_rate": 6.0019420951110655e-06, "loss": 2.7758, "step": 4767 }, { "epoch": 0.7974244261404022, "grad_norm": 5.614901065826416, "learning_rate": 5.992457882716568e-06, "loss": 3.0435, "step": 4768 }, { "epoch": 0.7975916711962202, "grad_norm": 4.073661804199219, "learning_rate": 5.9829801491986945e-06, "loss": 2.523, "step": 4769 }, { "epoch": 0.7977589162520383, "grad_norm": 7.398709297180176, "learning_rate": 5.973508897788027e-06, "loss": 2.5076, "step": 4770 }, { "epoch": 0.7979261613078563, "grad_norm": 3.5000369548797607, "learning_rate": 5.964044131712901e-06, "loss": 2.7497, "step": 4771 }, { "epoch": 0.7980934063636744, "grad_norm": 4.095185279846191, "learning_rate": 5.954585854199474e-06, "loss": 2.3191, "step": 4772 }, { "epoch": 0.7982606514194924, "grad_norm": 4.506287097930908, "learning_rate": 5.945134068471664e-06, "loss": 2.4766, "step": 4773 }, { "epoch": 0.7984278964753104, "grad_norm": 3.2130236625671387, "learning_rate": 5.9356887777512e-06, "loss": 2.4328, "step": 4774 }, { "epoch": 0.7985951415311285, "grad_norm": 5.097899436950684, "learning_rate": 5.92624998525759e-06, "loss": 2.6239, "step": 4775 }, { "epoch": 0.7987623865869465, "grad_norm": 2.6447689533233643, "learning_rate": 5.91681769420811e-06, "loss": 2.7642, "step": 4776 }, { "epoch": 0.7989296316427645, "grad_norm": 3.6875576972961426, "learning_rate": 5.907391907817847e-06, "loss": 3.0908, "step": 4777 }, { "epoch": 0.7990968766985826, "grad_norm": 6.072707653045654, "learning_rate": 5.897972629299647e-06, "loss": 2.6647, "step": 4778 }, { "epoch": 0.7992641217544006, "grad_norm": 5.760492324829102, "learning_rate": 5.888559861864162e-06, "loss": 2.9731, "step": 4779 }, { "epoch": 0.7994313668102186, "grad_norm": 4.847591876983643, "learning_rate": 5.879153608719792e-06, "loss": 2.8125, "step": 4780 }, { "epoch": 0.7995986118660368, "grad_norm": 4.451694011688232, "learning_rate": 5.869753873072756e-06, "loss": 2.3892, "step": 4781 }, { "epoch": 0.7997658569218548, "grad_norm": 3.427318572998047, "learning_rate": 5.860360658127007e-06, "loss": 2.6691, "step": 4782 }, { "epoch": 0.7999331019776728, "grad_norm": 4.5234456062316895, "learning_rate": 5.850973967084328e-06, "loss": 2.4824, "step": 4783 }, { "epoch": 0.8001003470334909, "grad_norm": 6.470495700836182, "learning_rate": 5.841593803144227e-06, "loss": 2.5658, "step": 4784 }, { "epoch": 0.8002675920893089, "grad_norm": 6.567051410675049, "learning_rate": 5.832220169504024e-06, "loss": 2.3809, "step": 4785 }, { "epoch": 0.8004348371451269, "grad_norm": 2.9089205265045166, "learning_rate": 5.822853069358786e-06, "loss": 2.809, "step": 4786 }, { "epoch": 0.800602082200945, "grad_norm": 4.04157018661499, "learning_rate": 5.81349250590138e-06, "loss": 2.7873, "step": 4787 }, { "epoch": 0.800769327256763, "grad_norm": 4.4101691246032715, "learning_rate": 5.804138482322416e-06, "loss": 2.7586, "step": 4788 }, { "epoch": 0.800936572312581, "grad_norm": 5.818861961364746, "learning_rate": 5.794791001810307e-06, "loss": 2.8621, "step": 4789 }, { "epoch": 0.8011038173683991, "grad_norm": 4.627352714538574, "learning_rate": 5.785450067551198e-06, "loss": 2.3112, "step": 4790 }, { "epoch": 0.8012710624242171, "grad_norm": 3.3560895919799805, "learning_rate": 5.776115682729036e-06, "loss": 2.3886, "step": 4791 }, { "epoch": 0.8014383074800351, "grad_norm": 6.711406230926514, "learning_rate": 5.7667878505255265e-06, "loss": 3.0291, "step": 4792 }, { "epoch": 0.8016055525358532, "grad_norm": 6.505064010620117, "learning_rate": 5.757466574120124e-06, "loss": 2.9314, "step": 4793 }, { "epoch": 0.8017727975916712, "grad_norm": 4.801153182983398, "learning_rate": 5.748151856690076e-06, "loss": 2.8119, "step": 4794 }, { "epoch": 0.8019400426474892, "grad_norm": 4.0382866859436035, "learning_rate": 5.738843701410368e-06, "loss": 2.5325, "step": 4795 }, { "epoch": 0.8021072877033073, "grad_norm": 8.970230102539062, "learning_rate": 5.729542111453773e-06, "loss": 3.0994, "step": 4796 }, { "epoch": 0.8022745327591253, "grad_norm": 6.498626708984375, "learning_rate": 5.7202470899908015e-06, "loss": 2.9743, "step": 4797 }, { "epoch": 0.8024417778149433, "grad_norm": 5.86052131652832, "learning_rate": 5.7109586401897506e-06, "loss": 2.8647, "step": 4798 }, { "epoch": 0.8026090228707614, "grad_norm": 7.452024936676025, "learning_rate": 5.701676765216643e-06, "loss": 2.8552, "step": 4799 }, { "epoch": 0.8027762679265794, "grad_norm": 7.075765132904053, "learning_rate": 5.692401468235315e-06, "loss": 2.7311, "step": 4800 }, { "epoch": 0.8029435129823974, "grad_norm": 4.906827449798584, "learning_rate": 5.6831327524073e-06, "loss": 2.4682, "step": 4801 }, { "epoch": 0.8031107580382155, "grad_norm": 3.653677225112915, "learning_rate": 5.673870620891933e-06, "loss": 2.6734, "step": 4802 }, { "epoch": 0.8032780030940335, "grad_norm": 4.398099899291992, "learning_rate": 5.664615076846275e-06, "loss": 2.4804, "step": 4803 }, { "epoch": 0.8034452481498515, "grad_norm": 4.595213413238525, "learning_rate": 5.655366123425166e-06, "loss": 2.7989, "step": 4804 }, { "epoch": 0.8036124932056696, "grad_norm": 7.085729122161865, "learning_rate": 5.646123763781172e-06, "loss": 2.8496, "step": 4805 }, { "epoch": 0.8037797382614876, "grad_norm": 3.534729242324829, "learning_rate": 5.636888001064644e-06, "loss": 2.8476, "step": 4806 }, { "epoch": 0.8039469833173056, "grad_norm": 3.4338178634643555, "learning_rate": 5.627658838423655e-06, "loss": 2.5061, "step": 4807 }, { "epoch": 0.8041142283731237, "grad_norm": 4.709340572357178, "learning_rate": 5.6184362790040415e-06, "loss": 2.708, "step": 4808 }, { "epoch": 0.8042814734289417, "grad_norm": 8.328669548034668, "learning_rate": 5.609220325949402e-06, "loss": 3.2549, "step": 4809 }, { "epoch": 0.8044487184847597, "grad_norm": 7.319459438323975, "learning_rate": 5.600010982401052e-06, "loss": 2.8219, "step": 4810 }, { "epoch": 0.8046159635405778, "grad_norm": 7.294244289398193, "learning_rate": 5.590808251498086e-06, "loss": 3.102, "step": 4811 }, { "epoch": 0.8047832085963958, "grad_norm": 4.6359710693359375, "learning_rate": 5.581612136377318e-06, "loss": 2.4531, "step": 4812 }, { "epoch": 0.804950453652214, "grad_norm": 7.80157470703125, "learning_rate": 5.572422640173333e-06, "loss": 2.742, "step": 4813 }, { "epoch": 0.805117698708032, "grad_norm": 4.318916320800781, "learning_rate": 5.56323976601843e-06, "loss": 2.6807, "step": 4814 }, { "epoch": 0.80528494376385, "grad_norm": 6.976205348968506, "learning_rate": 5.5540635170426765e-06, "loss": 2.4373, "step": 4815 }, { "epoch": 0.8054521888196681, "grad_norm": 5.105635643005371, "learning_rate": 5.5448938963738735e-06, "loss": 3.0821, "step": 4816 }, { "epoch": 0.8056194338754861, "grad_norm": 8.007562637329102, "learning_rate": 5.535730907137562e-06, "loss": 2.8326, "step": 4817 }, { "epoch": 0.8057866789313041, "grad_norm": 3.1349575519561768, "learning_rate": 5.526574552457015e-06, "loss": 2.573, "step": 4818 }, { "epoch": 0.8059539239871222, "grad_norm": 6.811776638031006, "learning_rate": 5.517424835453261e-06, "loss": 2.4617, "step": 4819 }, { "epoch": 0.8061211690429402, "grad_norm": 5.258831977844238, "learning_rate": 5.5082817592450516e-06, "loss": 2.5254, "step": 4820 }, { "epoch": 0.8062884140987582, "grad_norm": 3.7439868450164795, "learning_rate": 5.499145326948868e-06, "loss": 2.8711, "step": 4821 }, { "epoch": 0.8064556591545763, "grad_norm": 5.046473503112793, "learning_rate": 5.490015541678958e-06, "loss": 2.702, "step": 4822 }, { "epoch": 0.8066229042103943, "grad_norm": 4.396510601043701, "learning_rate": 5.480892406547261e-06, "loss": 2.6616, "step": 4823 }, { "epoch": 0.8067901492662123, "grad_norm": 3.82529878616333, "learning_rate": 5.471775924663497e-06, "loss": 2.547, "step": 4824 }, { "epoch": 0.8069573943220304, "grad_norm": 4.499103546142578, "learning_rate": 5.462666099135075e-06, "loss": 3.3714, "step": 4825 }, { "epoch": 0.8071246393778484, "grad_norm": 9.019579887390137, "learning_rate": 5.453562933067169e-06, "loss": 2.8823, "step": 4826 }, { "epoch": 0.8072918844336664, "grad_norm": 3.4488234519958496, "learning_rate": 5.444466429562653e-06, "loss": 2.703, "step": 4827 }, { "epoch": 0.8074591294894845, "grad_norm": 6.438562870025635, "learning_rate": 5.435376591722157e-06, "loss": 2.9193, "step": 4828 }, { "epoch": 0.8076263745453025, "grad_norm": 3.3613274097442627, "learning_rate": 5.426293422644016e-06, "loss": 2.5461, "step": 4829 }, { "epoch": 0.8077936196011205, "grad_norm": 4.195059776306152, "learning_rate": 5.4172169254243126e-06, "loss": 2.5556, "step": 4830 }, { "epoch": 0.8079608646569386, "grad_norm": 6.007268905639648, "learning_rate": 5.408147103156835e-06, "loss": 2.5917, "step": 4831 }, { "epoch": 0.8081281097127566, "grad_norm": 4.777346611022949, "learning_rate": 5.399083958933113e-06, "loss": 2.5004, "step": 4832 }, { "epoch": 0.8082953547685746, "grad_norm": 3.859391689300537, "learning_rate": 5.390027495842398e-06, "loss": 2.4778, "step": 4833 }, { "epoch": 0.8084625998243927, "grad_norm": 3.005439519882202, "learning_rate": 5.38097771697165e-06, "loss": 2.6688, "step": 4834 }, { "epoch": 0.8086298448802107, "grad_norm": 3.2867209911346436, "learning_rate": 5.3719346254055694e-06, "loss": 2.6956, "step": 4835 }, { "epoch": 0.8087970899360287, "grad_norm": 6.32511568069458, "learning_rate": 5.362898224226556e-06, "loss": 3.1064, "step": 4836 }, { "epoch": 0.8089643349918468, "grad_norm": 8.019476890563965, "learning_rate": 5.3538685165147566e-06, "loss": 4.2303, "step": 4837 }, { "epoch": 0.8091315800476648, "grad_norm": 5.93538761138916, "learning_rate": 5.344845505348009e-06, "loss": 3.2184, "step": 4838 }, { "epoch": 0.8092988251034828, "grad_norm": 3.6941123008728027, "learning_rate": 5.335829193801889e-06, "loss": 2.5829, "step": 4839 }, { "epoch": 0.8094660701593009, "grad_norm": 3.0932793617248535, "learning_rate": 5.326819584949663e-06, "loss": 2.3764, "step": 4840 }, { "epoch": 0.8096333152151189, "grad_norm": 3.6294915676116943, "learning_rate": 5.317816681862358e-06, "loss": 2.7349, "step": 4841 }, { "epoch": 0.8098005602709369, "grad_norm": 4.913646221160889, "learning_rate": 5.308820487608665e-06, "loss": 2.664, "step": 4842 }, { "epoch": 0.809967805326755, "grad_norm": 9.007634162902832, "learning_rate": 5.2998310052550266e-06, "loss": 2.3032, "step": 4843 }, { "epoch": 0.810135050382573, "grad_norm": 16.537561416625977, "learning_rate": 5.2908482378655675e-06, "loss": 4.7996, "step": 4844 }, { "epoch": 0.810302295438391, "grad_norm": 7.04978609085083, "learning_rate": 5.281872188502146e-06, "loss": 2.6649, "step": 4845 }, { "epoch": 0.8104695404942092, "grad_norm": 4.328455448150635, "learning_rate": 5.272902860224316e-06, "loss": 2.7386, "step": 4846 }, { "epoch": 0.8106367855500272, "grad_norm": 4.904548168182373, "learning_rate": 5.263940256089356e-06, "loss": 2.8451, "step": 4847 }, { "epoch": 0.8108040306058452, "grad_norm": 15.608811378479004, "learning_rate": 5.254984379152231e-06, "loss": 3.2779, "step": 4848 }, { "epoch": 0.8109712756616633, "grad_norm": 11.341690063476562, "learning_rate": 5.246035232465632e-06, "loss": 3.3235, "step": 4849 }, { "epoch": 0.8111385207174813, "grad_norm": 5.339177131652832, "learning_rate": 5.237092819079955e-06, "loss": 2.8098, "step": 4850 }, { "epoch": 0.8113057657732994, "grad_norm": 4.134941101074219, "learning_rate": 5.22815714204328e-06, "loss": 2.671, "step": 4851 }, { "epoch": 0.8114730108291174, "grad_norm": 4.89901065826416, "learning_rate": 5.219228204401424e-06, "loss": 2.8141, "step": 4852 }, { "epoch": 0.8116402558849354, "grad_norm": 5.164276123046875, "learning_rate": 5.210306009197871e-06, "loss": 2.8031, "step": 4853 }, { "epoch": 0.8118075009407535, "grad_norm": 6.515478134155273, "learning_rate": 5.201390559473837e-06, "loss": 2.8569, "step": 4854 }, { "epoch": 0.8119747459965715, "grad_norm": 4.177342414855957, "learning_rate": 5.192481858268216e-06, "loss": 2.6376, "step": 4855 }, { "epoch": 0.8121419910523895, "grad_norm": 2.673025131225586, "learning_rate": 5.183579908617619e-06, "loss": 3.0354, "step": 4856 }, { "epoch": 0.8123092361082076, "grad_norm": 4.1720733642578125, "learning_rate": 5.174684713556346e-06, "loss": 2.0314, "step": 4857 }, { "epoch": 0.8124764811640256, "grad_norm": 5.351958274841309, "learning_rate": 5.165796276116405e-06, "loss": 2.6392, "step": 4858 }, { "epoch": 0.8126437262198436, "grad_norm": 8.005772590637207, "learning_rate": 5.156914599327478e-06, "loss": 2.3436, "step": 4859 }, { "epoch": 0.8128109712756617, "grad_norm": 7.3500471115112305, "learning_rate": 5.148039686216974e-06, "loss": 2.4795, "step": 4860 }, { "epoch": 0.8129782163314797, "grad_norm": 4.370339393615723, "learning_rate": 5.139171539809962e-06, "loss": 2.5722, "step": 4861 }, { "epoch": 0.8131454613872977, "grad_norm": 9.620964050292969, "learning_rate": 5.13031016312924e-06, "loss": 2.5523, "step": 4862 }, { "epoch": 0.8133127064431158, "grad_norm": 7.065378665924072, "learning_rate": 5.121455559195265e-06, "loss": 2.9197, "step": 4863 }, { "epoch": 0.8134799514989338, "grad_norm": 3.2615866661071777, "learning_rate": 5.1126077310262175e-06, "loss": 2.5297, "step": 4864 }, { "epoch": 0.8136471965547518, "grad_norm": 5.765289306640625, "learning_rate": 5.103766681637937e-06, "loss": 3.1238, "step": 4865 }, { "epoch": 0.8138144416105699, "grad_norm": 4.311836242675781, "learning_rate": 5.094932414043974e-06, "loss": 2.7424, "step": 4866 }, { "epoch": 0.8139816866663879, "grad_norm": 5.9812188148498535, "learning_rate": 5.086104931255567e-06, "loss": 2.4298, "step": 4867 }, { "epoch": 0.8141489317222059, "grad_norm": 2.823162317276001, "learning_rate": 5.077284236281624e-06, "loss": 2.7201, "step": 4868 }, { "epoch": 0.814316176778024, "grad_norm": 6.987941265106201, "learning_rate": 5.068470332128764e-06, "loss": 3.49, "step": 4869 }, { "epoch": 0.814483421833842, "grad_norm": 3.5943076610565186, "learning_rate": 5.059663221801267e-06, "loss": 2.5627, "step": 4870 }, { "epoch": 0.81465066688966, "grad_norm": 3.5504584312438965, "learning_rate": 5.050862908301121e-06, "loss": 2.746, "step": 4871 }, { "epoch": 0.8148179119454781, "grad_norm": 4.103069305419922, "learning_rate": 5.042069394627968e-06, "loss": 2.6992, "step": 4872 }, { "epoch": 0.8149851570012961, "grad_norm": 14.091602325439453, "learning_rate": 5.033282683779164e-06, "loss": 3.7637, "step": 4873 }, { "epoch": 0.8151524020571141, "grad_norm": 5.9760050773620605, "learning_rate": 5.024502778749724e-06, "loss": 2.7119, "step": 4874 }, { "epoch": 0.8153196471129323, "grad_norm": 3.496980667114258, "learning_rate": 5.015729682532361e-06, "loss": 2.2751, "step": 4875 }, { "epoch": 0.8154868921687503, "grad_norm": 5.86085319519043, "learning_rate": 5.006963398117442e-06, "loss": 3.1942, "step": 4876 }, { "epoch": 0.8156541372245683, "grad_norm": 4.523303508758545, "learning_rate": 4.998203928493042e-06, "loss": 2.9496, "step": 4877 }, { "epoch": 0.8158213822803864, "grad_norm": 13.770041465759277, "learning_rate": 4.989451276644888e-06, "loss": 2.3888, "step": 4878 }, { "epoch": 0.8159886273362044, "grad_norm": 5.080413818359375, "learning_rate": 4.980705445556391e-06, "loss": 3.0471, "step": 4879 }, { "epoch": 0.8161558723920224, "grad_norm": 3.9628846645355225, "learning_rate": 4.97196643820865e-06, "loss": 3.2074, "step": 4880 }, { "epoch": 0.8163231174478405, "grad_norm": 7.042820930480957, "learning_rate": 4.963234257580407e-06, "loss": 2.452, "step": 4881 }, { "epoch": 0.8164903625036585, "grad_norm": 3.3273749351501465, "learning_rate": 4.954508906648123e-06, "loss": 2.5864, "step": 4882 }, { "epoch": 0.8166576075594765, "grad_norm": 8.148648262023926, "learning_rate": 4.94579038838589e-06, "loss": 3.0577, "step": 4883 }, { "epoch": 0.8168248526152946, "grad_norm": 9.088695526123047, "learning_rate": 4.9370787057654966e-06, "loss": 3.4129, "step": 4884 }, { "epoch": 0.8169920976711126, "grad_norm": 6.5263471603393555, "learning_rate": 4.928373861756378e-06, "loss": 3.1781, "step": 4885 }, { "epoch": 0.8171593427269306, "grad_norm": 5.077635765075684, "learning_rate": 4.919675859325665e-06, "loss": 2.9895, "step": 4886 }, { "epoch": 0.8173265877827487, "grad_norm": 5.8283371925354, "learning_rate": 4.910984701438129e-06, "loss": 2.7613, "step": 4887 }, { "epoch": 0.8174938328385667, "grad_norm": 6.127190113067627, "learning_rate": 4.902300391056236e-06, "loss": 2.651, "step": 4888 }, { "epoch": 0.8176610778943848, "grad_norm": 4.84967565536499, "learning_rate": 4.893622931140096e-06, "loss": 2.8179, "step": 4889 }, { "epoch": 0.8178283229502028, "grad_norm": 3.906505584716797, "learning_rate": 4.884952324647491e-06, "loss": 2.4716, "step": 4890 }, { "epoch": 0.8179955680060208, "grad_norm": 3.523444652557373, "learning_rate": 4.87628857453388e-06, "loss": 2.4976, "step": 4891 }, { "epoch": 0.8181628130618389, "grad_norm": 3.5478639602661133, "learning_rate": 4.8676316837523605e-06, "loss": 2.6682, "step": 4892 }, { "epoch": 0.8183300581176569, "grad_norm": 5.197030544281006, "learning_rate": 4.8589816552537165e-06, "loss": 3.0973, "step": 4893 }, { "epoch": 0.8184973031734749, "grad_norm": 1.3584232330322266, "learning_rate": 4.850338491986367e-06, "loss": 2.3152, "step": 4894 }, { "epoch": 0.818664548229293, "grad_norm": 4.179235458374023, "learning_rate": 4.84170219689642e-06, "loss": 2.6672, "step": 4895 }, { "epoch": 0.818831793285111, "grad_norm": 3.9051167964935303, "learning_rate": 4.833072772927611e-06, "loss": 2.5438, "step": 4896 }, { "epoch": 0.818999038340929, "grad_norm": 4.757745742797852, "learning_rate": 4.824450223021368e-06, "loss": 2.5698, "step": 4897 }, { "epoch": 0.8191662833967471, "grad_norm": 8.950657844543457, "learning_rate": 4.815834550116735e-06, "loss": 3.8757, "step": 4898 }, { "epoch": 0.8193335284525651, "grad_norm": 8.143211364746094, "learning_rate": 4.807225757150461e-06, "loss": 2.959, "step": 4899 }, { "epoch": 0.8195007735083831, "grad_norm": 4.077327728271484, "learning_rate": 4.798623847056904e-06, "loss": 2.7114, "step": 4900 }, { "epoch": 0.8196680185642012, "grad_norm": 7.099188804626465, "learning_rate": 4.790028822768111e-06, "loss": 3.0433, "step": 4901 }, { "epoch": 0.8198352636200192, "grad_norm": 4.704214572906494, "learning_rate": 4.781440687213753e-06, "loss": 2.7441, "step": 4902 }, { "epoch": 0.8200025086758372, "grad_norm": 9.198271751403809, "learning_rate": 4.772859443321179e-06, "loss": 2.9912, "step": 4903 }, { "epoch": 0.8201697537316553, "grad_norm": 3.791957378387451, "learning_rate": 4.764285094015361e-06, "loss": 2.9287, "step": 4904 }, { "epoch": 0.8203369987874733, "grad_norm": 3.5789737701416016, "learning_rate": 4.7557176422189516e-06, "loss": 2.8771, "step": 4905 }, { "epoch": 0.8205042438432913, "grad_norm": 4.216577529907227, "learning_rate": 4.747157090852222e-06, "loss": 3.015, "step": 4906 }, { "epoch": 0.8206714888991095, "grad_norm": 13.079646110534668, "learning_rate": 4.738603442833115e-06, "loss": 3.3107, "step": 4907 }, { "epoch": 0.8208387339549275, "grad_norm": 4.729413032531738, "learning_rate": 4.730056701077218e-06, "loss": 2.7272, "step": 4908 }, { "epoch": 0.8210059790107455, "grad_norm": 6.680239200592041, "learning_rate": 4.7215168684977464e-06, "loss": 2.5563, "step": 4909 }, { "epoch": 0.8211732240665636, "grad_norm": 5.746894836425781, "learning_rate": 4.712983948005581e-06, "loss": 2.7633, "step": 4910 }, { "epoch": 0.8213404691223816, "grad_norm": 8.074914932250977, "learning_rate": 4.7044579425092274e-06, "loss": 3.5355, "step": 4911 }, { "epoch": 0.8215077141781996, "grad_norm": 8.936540603637695, "learning_rate": 4.695938854914858e-06, "loss": 3.2651, "step": 4912 }, { "epoch": 0.8216749592340177, "grad_norm": 7.798746585845947, "learning_rate": 4.687426688126259e-06, "loss": 2.5563, "step": 4913 }, { "epoch": 0.8218422042898357, "grad_norm": 8.654975891113281, "learning_rate": 4.678921445044882e-06, "loss": 2.88, "step": 4914 }, { "epoch": 0.8220094493456537, "grad_norm": 4.5897603034973145, "learning_rate": 4.6704231285698035e-06, "loss": 2.6773, "step": 4915 }, { "epoch": 0.8221766944014718, "grad_norm": 4.6897969245910645, "learning_rate": 4.6619317415977554e-06, "loss": 2.4664, "step": 4916 }, { "epoch": 0.8223439394572898, "grad_norm": 4.673471450805664, "learning_rate": 4.653447287023083e-06, "loss": 2.6376, "step": 4917 }, { "epoch": 0.8225111845131078, "grad_norm": 4.805413722991943, "learning_rate": 4.644969767737792e-06, "loss": 2.6606, "step": 4918 }, { "epoch": 0.8226784295689259, "grad_norm": 3.5889477729797363, "learning_rate": 4.636499186631504e-06, "loss": 2.7423, "step": 4919 }, { "epoch": 0.8228456746247439, "grad_norm": 4.890689373016357, "learning_rate": 4.6280355465915e-06, "loss": 2.9435, "step": 4920 }, { "epoch": 0.8230129196805619, "grad_norm": 6.50131368637085, "learning_rate": 4.619578850502667e-06, "loss": 2.8761, "step": 4921 }, { "epoch": 0.82318016473638, "grad_norm": 13.609007835388184, "learning_rate": 4.611129101247544e-06, "loss": 3.9812, "step": 4922 }, { "epoch": 0.823347409792198, "grad_norm": 7.78747034072876, "learning_rate": 4.602686301706308e-06, "loss": 2.9258, "step": 4923 }, { "epoch": 0.823514654848016, "grad_norm": 6.152140140533447, "learning_rate": 4.594250454756738e-06, "loss": 3.1772, "step": 4924 }, { "epoch": 0.8236818999038341, "grad_norm": 4.242938041687012, "learning_rate": 4.585821563274278e-06, "loss": 2.3868, "step": 4925 }, { "epoch": 0.8238491449596521, "grad_norm": 6.07140588760376, "learning_rate": 4.577399630131973e-06, "loss": 2.7426, "step": 4926 }, { "epoch": 0.8240163900154701, "grad_norm": 8.68221378326416, "learning_rate": 4.5689846582005205e-06, "loss": 2.7225, "step": 4927 }, { "epoch": 0.8241836350712882, "grad_norm": 4.354984760284424, "learning_rate": 4.560576650348214e-06, "loss": 2.8517, "step": 4928 }, { "epoch": 0.8243508801271062, "grad_norm": 4.674553394317627, "learning_rate": 4.552175609441015e-06, "loss": 2.4733, "step": 4929 }, { "epoch": 0.8245181251829243, "grad_norm": 7.719079494476318, "learning_rate": 4.5437815383424564e-06, "loss": 2.7048, "step": 4930 }, { "epoch": 0.8246853702387423, "grad_norm": 11.1497802734375, "learning_rate": 4.535394439913762e-06, "loss": 2.883, "step": 4931 }, { "epoch": 0.8248526152945603, "grad_norm": 5.6133131980896, "learning_rate": 4.52701431701372e-06, "loss": 3.0431, "step": 4932 }, { "epoch": 0.8250198603503784, "grad_norm": 3.5935819149017334, "learning_rate": 4.518641172498772e-06, "loss": 2.7563, "step": 4933 }, { "epoch": 0.8251871054061964, "grad_norm": 3.977858781814575, "learning_rate": 4.51027500922297e-06, "loss": 2.4965, "step": 4934 }, { "epoch": 0.8253543504620144, "grad_norm": 5.0183868408203125, "learning_rate": 4.501915830037992e-06, "loss": 2.8535, "step": 4935 }, { "epoch": 0.8255215955178326, "grad_norm": 6.857435703277588, "learning_rate": 4.4935636377931275e-06, "loss": 2.5444, "step": 4936 }, { "epoch": 0.8256888405736506, "grad_norm": 3.248380661010742, "learning_rate": 4.4852184353353004e-06, "loss": 2.6342, "step": 4937 }, { "epoch": 0.8258560856294686, "grad_norm": 5.8249053955078125, "learning_rate": 4.476880225509034e-06, "loss": 2.7905, "step": 4938 }, { "epoch": 0.8260233306852867, "grad_norm": 5.625945091247559, "learning_rate": 4.468549011156462e-06, "loss": 2.7608, "step": 4939 }, { "epoch": 0.8261905757411047, "grad_norm": 6.179443359375, "learning_rate": 4.460224795117374e-06, "loss": 2.6804, "step": 4940 }, { "epoch": 0.8263578207969227, "grad_norm": 7.076389789581299, "learning_rate": 4.451907580229128e-06, "loss": 3.1047, "step": 4941 }, { "epoch": 0.8265250658527408, "grad_norm": 7.658638000488281, "learning_rate": 4.44359736932673e-06, "loss": 2.7704, "step": 4942 }, { "epoch": 0.8266923109085588, "grad_norm": 5.0610480308532715, "learning_rate": 4.435294165242765e-06, "loss": 2.8476, "step": 4943 }, { "epoch": 0.8268595559643768, "grad_norm": 4.34113073348999, "learning_rate": 4.426997970807467e-06, "loss": 2.6544, "step": 4944 }, { "epoch": 0.8270268010201949, "grad_norm": 3.1303060054779053, "learning_rate": 4.418708788848646e-06, "loss": 2.4754, "step": 4945 }, { "epoch": 0.8271940460760129, "grad_norm": 7.3371710777282715, "learning_rate": 4.410426622191752e-06, "loss": 2.7024, "step": 4946 }, { "epoch": 0.8273612911318309, "grad_norm": 7.772311687469482, "learning_rate": 4.402151473659813e-06, "loss": 1.7044, "step": 4947 }, { "epoch": 0.827528536187649, "grad_norm": 3.3433167934417725, "learning_rate": 4.393883346073494e-06, "loss": 2.9732, "step": 4948 }, { "epoch": 0.827695781243467, "grad_norm": 7.078674793243408, "learning_rate": 4.385622242251053e-06, "loss": 2.429, "step": 4949 }, { "epoch": 0.827863026299285, "grad_norm": 3.800166368484497, "learning_rate": 4.3773681650083495e-06, "loss": 3.2378, "step": 4950 }, { "epoch": 0.8280302713551031, "grad_norm": 4.805545806884766, "learning_rate": 4.3691211171588615e-06, "loss": 2.7081, "step": 4951 }, { "epoch": 0.8281975164109211, "grad_norm": 4.60774040222168, "learning_rate": 4.36088110151365e-06, "loss": 2.3643, "step": 4952 }, { "epoch": 0.8283647614667391, "grad_norm": 4.388908386230469, "learning_rate": 4.352648120881409e-06, "loss": 2.7326, "step": 4953 }, { "epoch": 0.8285320065225572, "grad_norm": 3.1051251888275146, "learning_rate": 4.3444221780684e-06, "loss": 2.3439, "step": 4954 }, { "epoch": 0.8286992515783752, "grad_norm": 3.835063934326172, "learning_rate": 4.336203275878509e-06, "loss": 2.8438, "step": 4955 }, { "epoch": 0.8288664966341932, "grad_norm": 6.280572414398193, "learning_rate": 4.3279914171132225e-06, "loss": 2.8887, "step": 4956 }, { "epoch": 0.8290337416900113, "grad_norm": 3.4362151622772217, "learning_rate": 4.319786604571621e-06, "loss": 2.7871, "step": 4957 }, { "epoch": 0.8292009867458293, "grad_norm": 7.899212837219238, "learning_rate": 4.3115888410503705e-06, "loss": 3.2603, "step": 4958 }, { "epoch": 0.8293682318016473, "grad_norm": 1.9582403898239136, "learning_rate": 4.303398129343758e-06, "loss": 2.0651, "step": 4959 }, { "epoch": 0.8295354768574654, "grad_norm": 3.4014809131622314, "learning_rate": 4.295214472243645e-06, "loss": 2.5458, "step": 4960 }, { "epoch": 0.8297027219132834, "grad_norm": 5.961167335510254, "learning_rate": 4.2870378725395066e-06, "loss": 3.0205, "step": 4961 }, { "epoch": 0.8298699669691014, "grad_norm": 7.9768171310424805, "learning_rate": 4.278868333018393e-06, "loss": 2.9168, "step": 4962 }, { "epoch": 0.8300372120249195, "grad_norm": 5.145112991333008, "learning_rate": 4.270705856464966e-06, "loss": 2.4953, "step": 4963 }, { "epoch": 0.8302044570807375, "grad_norm": 3.1214585304260254, "learning_rate": 4.262550445661479e-06, "loss": 2.1493, "step": 4964 }, { "epoch": 0.8303717021365555, "grad_norm": 5.724051475524902, "learning_rate": 4.254402103387758e-06, "loss": 3.1292, "step": 4965 }, { "epoch": 0.8305389471923736, "grad_norm": 2.296285390853882, "learning_rate": 4.246260832421245e-06, "loss": 2.3755, "step": 4966 }, { "epoch": 0.8307061922481916, "grad_norm": 4.231016159057617, "learning_rate": 4.238126635536943e-06, "loss": 2.6979, "step": 4967 }, { "epoch": 0.8308734373040098, "grad_norm": 10.759556770324707, "learning_rate": 4.229999515507477e-06, "loss": 3.0462, "step": 4968 }, { "epoch": 0.8310406823598278, "grad_norm": 4.441195487976074, "learning_rate": 4.2218794751030275e-06, "loss": 2.7368, "step": 4969 }, { "epoch": 0.8312079274156458, "grad_norm": 5.623783111572266, "learning_rate": 4.213766517091391e-06, "loss": 2.3368, "step": 4970 }, { "epoch": 0.8313751724714639, "grad_norm": 3.9906258583068848, "learning_rate": 4.205660644237922e-06, "loss": 2.8094, "step": 4971 }, { "epoch": 0.8315424175272819, "grad_norm": 6.030663967132568, "learning_rate": 4.197561859305579e-06, "loss": 2.6429, "step": 4972 }, { "epoch": 0.8317096625830999, "grad_norm": 4.891695499420166, "learning_rate": 4.189470165054901e-06, "loss": 2.8801, "step": 4973 }, { "epoch": 0.831876907638918, "grad_norm": 6.084169387817383, "learning_rate": 4.181385564244014e-06, "loss": 2.6714, "step": 4974 }, { "epoch": 0.832044152694736, "grad_norm": 4.670328140258789, "learning_rate": 4.173308059628609e-06, "loss": 2.5123, "step": 4975 }, { "epoch": 0.832211397750554, "grad_norm": 4.329651355743408, "learning_rate": 4.165237653961978e-06, "loss": 2.7434, "step": 4976 }, { "epoch": 0.8323786428063721, "grad_norm": 5.083930492401123, "learning_rate": 4.15717434999498e-06, "loss": 2.419, "step": 4977 }, { "epoch": 0.8325458878621901, "grad_norm": 3.033236026763916, "learning_rate": 4.149118150476064e-06, "loss": 2.4339, "step": 4978 }, { "epoch": 0.8327131329180081, "grad_norm": 9.021098136901855, "learning_rate": 4.141069058151242e-06, "loss": 3.473, "step": 4979 }, { "epoch": 0.8328803779738262, "grad_norm": 4.889710426330566, "learning_rate": 4.13302707576412e-06, "loss": 2.4527, "step": 4980 }, { "epoch": 0.8330476230296442, "grad_norm": 4.798369407653809, "learning_rate": 4.124992206055881e-06, "loss": 2.4577, "step": 4981 }, { "epoch": 0.8332148680854622, "grad_norm": 6.730278491973877, "learning_rate": 4.116964451765257e-06, "loss": 2.6926, "step": 4982 }, { "epoch": 0.8333821131412803, "grad_norm": 3.8533787727355957, "learning_rate": 4.108943815628593e-06, "loss": 2.7934, "step": 4983 }, { "epoch": 0.8335493581970983, "grad_norm": 2.4219655990600586, "learning_rate": 4.100930300379777e-06, "loss": 2.1456, "step": 4984 }, { "epoch": 0.8337166032529163, "grad_norm": 6.67440128326416, "learning_rate": 4.0929239087502875e-06, "loss": 2.7349, "step": 4985 }, { "epoch": 0.8338838483087344, "grad_norm": 3.9136781692504883, "learning_rate": 4.0849246434691624e-06, "loss": 2.532, "step": 4986 }, { "epoch": 0.8340510933645524, "grad_norm": 5.269946575164795, "learning_rate": 4.076932507263026e-06, "loss": 3.0152, "step": 4987 }, { "epoch": 0.8342183384203704, "grad_norm": 10.438421249389648, "learning_rate": 4.068947502856046e-06, "loss": 2.8161, "step": 4988 }, { "epoch": 0.8343855834761885, "grad_norm": 7.3225226402282715, "learning_rate": 4.060969632970002e-06, "loss": 2.937, "step": 4989 }, { "epoch": 0.8345528285320065, "grad_norm": 2.739819049835205, "learning_rate": 4.052998900324195e-06, "loss": 2.399, "step": 4990 }, { "epoch": 0.8347200735878245, "grad_norm": 4.541650772094727, "learning_rate": 4.045035307635531e-06, "loss": 2.9803, "step": 4991 }, { "epoch": 0.8348873186436426, "grad_norm": 9.047296524047852, "learning_rate": 4.037078857618448e-06, "loss": 2.7825, "step": 4992 }, { "epoch": 0.8350545636994606, "grad_norm": 10.190052032470703, "learning_rate": 4.029129552984986e-06, "loss": 2.2878, "step": 4993 }, { "epoch": 0.8352218087552786, "grad_norm": 7.631012439727783, "learning_rate": 4.0211873964447145e-06, "loss": 3.3291, "step": 4994 }, { "epoch": 0.8353890538110967, "grad_norm": 6.824099063873291, "learning_rate": 4.013252390704797e-06, "loss": 2.849, "step": 4995 }, { "epoch": 0.8355562988669147, "grad_norm": 4.04900598526001, "learning_rate": 4.005324538469932e-06, "loss": 2.6952, "step": 4996 }, { "epoch": 0.8357235439227327, "grad_norm": 8.889842987060547, "learning_rate": 3.997403842442399e-06, "loss": 2.7326, "step": 4997 }, { "epoch": 0.8358907889785508, "grad_norm": 3.9806714057922363, "learning_rate": 3.9894903053220355e-06, "loss": 2.1816, "step": 4998 }, { "epoch": 0.8360580340343688, "grad_norm": 9.27423095703125, "learning_rate": 3.9815839298062294e-06, "loss": 2.79, "step": 4999 }, { "epoch": 0.8362252790901868, "grad_norm": 4.780155658721924, "learning_rate": 3.973684718589943e-06, "loss": 2.6451, "step": 5000 }, { "epoch": 0.836392524146005, "grad_norm": 3.410822629928589, "learning_rate": 3.965792674365673e-06, "loss": 2.623, "step": 5001 }, { "epoch": 0.836559769201823, "grad_norm": 4.528405666351318, "learning_rate": 3.957907799823501e-06, "loss": 2.6666, "step": 5002 }, { "epoch": 0.836727014257641, "grad_norm": 9.380829811096191, "learning_rate": 3.950030097651039e-06, "loss": 2.9483, "step": 5003 }, { "epoch": 0.8368942593134591, "grad_norm": 6.871382236480713, "learning_rate": 3.94215957053348e-06, "loss": 3.0957, "step": 5004 }, { "epoch": 0.8370615043692771, "grad_norm": 4.688058376312256, "learning_rate": 3.934296221153544e-06, "loss": 3.3879, "step": 5005 }, { "epoch": 0.8372287494250952, "grad_norm": 6.058874607086182, "learning_rate": 3.926440052191524e-06, "loss": 2.7567, "step": 5006 }, { "epoch": 0.8373959944809132, "grad_norm": 9.372459411621094, "learning_rate": 3.918591066325267e-06, "loss": 2.5696, "step": 5007 }, { "epoch": 0.8375632395367312, "grad_norm": 3.671502113342285, "learning_rate": 3.910749266230152e-06, "loss": 2.8434, "step": 5008 }, { "epoch": 0.8377304845925493, "grad_norm": 5.903204441070557, "learning_rate": 3.902914654579132e-06, "loss": 3.3619, "step": 5009 }, { "epoch": 0.8378977296483673, "grad_norm": 3.9195706844329834, "learning_rate": 3.895087234042686e-06, "loss": 2.7111, "step": 5010 }, { "epoch": 0.8380649747041853, "grad_norm": 14.848851203918457, "learning_rate": 3.8872670072888665e-06, "loss": 2.6354, "step": 5011 }, { "epoch": 0.8382322197600034, "grad_norm": 6.585175514221191, "learning_rate": 3.879453976983255e-06, "loss": 2.5337, "step": 5012 }, { "epoch": 0.8383994648158214, "grad_norm": 2.899019241333008, "learning_rate": 3.871648145788987e-06, "loss": 2.6919, "step": 5013 }, { "epoch": 0.8385667098716394, "grad_norm": 3.2141549587249756, "learning_rate": 3.863849516366744e-06, "loss": 2.4959, "step": 5014 }, { "epoch": 0.8387339549274575, "grad_norm": 6.826817035675049, "learning_rate": 3.856058091374762e-06, "loss": 3.0396, "step": 5015 }, { "epoch": 0.8389011999832755, "grad_norm": 11.702659606933594, "learning_rate": 3.848273873468797e-06, "loss": 2.7064, "step": 5016 }, { "epoch": 0.8390684450390935, "grad_norm": 4.0539774894714355, "learning_rate": 3.840496865302179e-06, "loss": 2.5605, "step": 5017 }, { "epoch": 0.8392356900949116, "grad_norm": 6.47318172454834, "learning_rate": 3.832727069525752e-06, "loss": 3.1011, "step": 5018 }, { "epoch": 0.8394029351507296, "grad_norm": 5.157046794891357, "learning_rate": 3.824964488787924e-06, "loss": 2.6758, "step": 5019 }, { "epoch": 0.8395701802065476, "grad_norm": 4.693551540374756, "learning_rate": 3.817209125734628e-06, "loss": 2.9361, "step": 5020 }, { "epoch": 0.8397374252623657, "grad_norm": 4.34956169128418, "learning_rate": 3.8094609830093412e-06, "loss": 2.926, "step": 5021 }, { "epoch": 0.8399046703181837, "grad_norm": 4.570329189300537, "learning_rate": 3.8017200632530947e-06, "loss": 2.8092, "step": 5022 }, { "epoch": 0.8400719153740017, "grad_norm": 4.673158168792725, "learning_rate": 3.79398636910443e-06, "loss": 2.7793, "step": 5023 }, { "epoch": 0.8402391604298198, "grad_norm": 7.568717002868652, "learning_rate": 3.7862599031994487e-06, "loss": 2.9078, "step": 5024 }, { "epoch": 0.8404064054856378, "grad_norm": 7.0878496170043945, "learning_rate": 3.7785406681717732e-06, "loss": 3.0762, "step": 5025 }, { "epoch": 0.8405736505414558, "grad_norm": 6.079555988311768, "learning_rate": 3.7708286666525787e-06, "loss": 3.0867, "step": 5026 }, { "epoch": 0.8407408955972739, "grad_norm": 5.141243934631348, "learning_rate": 3.7631239012705514e-06, "loss": 1.9204, "step": 5027 }, { "epoch": 0.8409081406530919, "grad_norm": 3.4557883739471436, "learning_rate": 3.7554263746519352e-06, "loss": 2.9852, "step": 5028 }, { "epoch": 0.8410753857089099, "grad_norm": 6.682114124298096, "learning_rate": 3.7477360894204782e-06, "loss": 2.7733, "step": 5029 }, { "epoch": 0.841242630764728, "grad_norm": 2.2407174110412598, "learning_rate": 3.7400530481975005e-06, "loss": 2.2629, "step": 5030 }, { "epoch": 0.841409875820546, "grad_norm": 7.482152462005615, "learning_rate": 3.7323772536018097e-06, "loss": 2.5482, "step": 5031 }, { "epoch": 0.841577120876364, "grad_norm": 6.5958943367004395, "learning_rate": 3.7247087082497827e-06, "loss": 3.0901, "step": 5032 }, { "epoch": 0.8417443659321822, "grad_norm": 4.188297271728516, "learning_rate": 3.717047414755284e-06, "loss": 2.6044, "step": 5033 }, { "epoch": 0.8419116109880002, "grad_norm": 3.763449192047119, "learning_rate": 3.70939337572975e-06, "loss": 2.8475, "step": 5034 }, { "epoch": 0.8420788560438182, "grad_norm": 4.849785804748535, "learning_rate": 3.7017465937821045e-06, "loss": 2.5804, "step": 5035 }, { "epoch": 0.8422461010996363, "grad_norm": 5.16277551651001, "learning_rate": 3.694107071518829e-06, "loss": 2.3348, "step": 5036 }, { "epoch": 0.8424133461554543, "grad_norm": 4.384265899658203, "learning_rate": 3.6864748115439053e-06, "loss": 2.7281, "step": 5037 }, { "epoch": 0.8425805912112723, "grad_norm": 5.143332004547119, "learning_rate": 3.67884981645886e-06, "loss": 2.596, "step": 5038 }, { "epoch": 0.8427478362670904, "grad_norm": 4.2014079093933105, "learning_rate": 3.671232088862736e-06, "loss": 2.7418, "step": 5039 }, { "epoch": 0.8429150813229084, "grad_norm": 4.872739315032959, "learning_rate": 3.663621631352093e-06, "loss": 2.4104, "step": 5040 }, { "epoch": 0.8430823263787264, "grad_norm": 4.721876621246338, "learning_rate": 3.6560184465210257e-06, "loss": 3.4582, "step": 5041 }, { "epoch": 0.8432495714345445, "grad_norm": 3.7532708644866943, "learning_rate": 3.6484225369611326e-06, "loss": 2.483, "step": 5042 }, { "epoch": 0.8434168164903625, "grad_norm": 5.8904852867126465, "learning_rate": 3.640833905261551e-06, "loss": 2.1816, "step": 5043 }, { "epoch": 0.8435840615461805, "grad_norm": 8.536558151245117, "learning_rate": 3.6332525540089147e-06, "loss": 2.6077, "step": 5044 }, { "epoch": 0.8437513066019986, "grad_norm": 7.253062725067139, "learning_rate": 3.6256784857874065e-06, "loss": 2.4055, "step": 5045 }, { "epoch": 0.8439185516578166, "grad_norm": 9.031614303588867, "learning_rate": 3.618111703178689e-06, "loss": 3.3145, "step": 5046 }, { "epoch": 0.8440857967136347, "grad_norm": 4.150712966918945, "learning_rate": 3.6105522087619888e-06, "loss": 2.6152, "step": 5047 }, { "epoch": 0.8442530417694527, "grad_norm": 5.272751808166504, "learning_rate": 3.6030000051139988e-06, "loss": 2.8795, "step": 5048 }, { "epoch": 0.8444202868252707, "grad_norm": 3.306688070297241, "learning_rate": 3.595455094808964e-06, "loss": 2.5599, "step": 5049 }, { "epoch": 0.8445875318810888, "grad_norm": 4.235332012176514, "learning_rate": 3.587917480418618e-06, "loss": 2.8056, "step": 5050 }, { "epoch": 0.8447547769369068, "grad_norm": 5.243941307067871, "learning_rate": 3.5803871645122276e-06, "loss": 2.2344, "step": 5051 }, { "epoch": 0.8449220219927248, "grad_norm": 4.187727451324463, "learning_rate": 3.5728641496565547e-06, "loss": 2.8656, "step": 5052 }, { "epoch": 0.8450892670485429, "grad_norm": 5.3889241218566895, "learning_rate": 3.5653484384158887e-06, "loss": 2.8145, "step": 5053 }, { "epoch": 0.8452565121043609, "grad_norm": 4.398578643798828, "learning_rate": 3.557840033352011e-06, "loss": 2.5862, "step": 5054 }, { "epoch": 0.8454237571601789, "grad_norm": 5.204013347625732, "learning_rate": 3.550338937024231e-06, "loss": 2.3603, "step": 5055 }, { "epoch": 0.845591002215997, "grad_norm": 2.9704630374908447, "learning_rate": 3.54284515198936e-06, "loss": 2.1279, "step": 5056 }, { "epoch": 0.845758247271815, "grad_norm": 7.358898162841797, "learning_rate": 3.535358680801712e-06, "loss": 2.6615, "step": 5057 }, { "epoch": 0.845925492327633, "grad_norm": 9.855672836303711, "learning_rate": 3.5278795260131144e-06, "loss": 2.458, "step": 5058 }, { "epoch": 0.8460927373834511, "grad_norm": 11.05049991607666, "learning_rate": 3.5204076901728946e-06, "loss": 2.6963, "step": 5059 }, { "epoch": 0.8462599824392691, "grad_norm": 6.124732971191406, "learning_rate": 3.5129431758278984e-06, "loss": 2.8899, "step": 5060 }, { "epoch": 0.8464272274950871, "grad_norm": 5.0608811378479, "learning_rate": 3.5054859855224525e-06, "loss": 2.5563, "step": 5061 }, { "epoch": 0.8465944725509053, "grad_norm": 19.711040496826172, "learning_rate": 3.4980361217984093e-06, "loss": 2.9539, "step": 5062 }, { "epoch": 0.8467617176067233, "grad_norm": 5.886483192443848, "learning_rate": 3.490593587195118e-06, "loss": 2.6415, "step": 5063 }, { "epoch": 0.8469289626625413, "grad_norm": 4.413772106170654, "learning_rate": 3.483158384249427e-06, "loss": 2.7504, "step": 5064 }, { "epoch": 0.8470962077183594, "grad_norm": 4.343077659606934, "learning_rate": 3.475730515495687e-06, "loss": 2.9235, "step": 5065 }, { "epoch": 0.8472634527741774, "grad_norm": 4.448087215423584, "learning_rate": 3.4683099834657396e-06, "loss": 2.7612, "step": 5066 }, { "epoch": 0.8474306978299954, "grad_norm": 18.963411331176758, "learning_rate": 3.460896790688947e-06, "loss": 3.4514, "step": 5067 }, { "epoch": 0.8475979428858135, "grad_norm": 7.339010238647461, "learning_rate": 3.453490939692142e-06, "loss": 2.5786, "step": 5068 }, { "epoch": 0.8477651879416315, "grad_norm": 6.406414985656738, "learning_rate": 3.446092432999684e-06, "loss": 2.4365, "step": 5069 }, { "epoch": 0.8479324329974495, "grad_norm": 4.727849960327148, "learning_rate": 3.438701273133399e-06, "loss": 2.7709, "step": 5070 }, { "epoch": 0.8480996780532676, "grad_norm": 6.356373310089111, "learning_rate": 3.4313174626126404e-06, "loss": 3.0901, "step": 5071 }, { "epoch": 0.8482669231090856, "grad_norm": 7.337518215179443, "learning_rate": 3.4239410039542315e-06, "loss": 2.6895, "step": 5072 }, { "epoch": 0.8484341681649036, "grad_norm": 4.176826000213623, "learning_rate": 3.416571899672505e-06, "loss": 2.883, "step": 5073 }, { "epoch": 0.8486014132207217, "grad_norm": 2.278505325317383, "learning_rate": 3.4092101522792713e-06, "loss": 2.4154, "step": 5074 }, { "epoch": 0.8487686582765397, "grad_norm": 9.058781623840332, "learning_rate": 3.401855764283854e-06, "loss": 2.6222, "step": 5075 }, { "epoch": 0.8489359033323577, "grad_norm": 9.11232852935791, "learning_rate": 3.3945087381930447e-06, "loss": 3.1823, "step": 5076 }, { "epoch": 0.8491031483881758, "grad_norm": 2.9943196773529053, "learning_rate": 3.387169076511146e-06, "loss": 2.2365, "step": 5077 }, { "epoch": 0.8492703934439938, "grad_norm": 10.467034339904785, "learning_rate": 3.379836781739934e-06, "loss": 2.6211, "step": 5078 }, { "epoch": 0.8494376384998118, "grad_norm": 7.763972282409668, "learning_rate": 3.372511856378688e-06, "loss": 3.1861, "step": 5079 }, { "epoch": 0.8496048835556299, "grad_norm": 7.657201290130615, "learning_rate": 3.3651943029241712e-06, "loss": 2.4367, "step": 5080 }, { "epoch": 0.8497721286114479, "grad_norm": 12.802032470703125, "learning_rate": 3.3578841238706247e-06, "loss": 2.6711, "step": 5081 }, { "epoch": 0.8499393736672659, "grad_norm": 5.623006343841553, "learning_rate": 3.350581321709789e-06, "loss": 2.7438, "step": 5082 }, { "epoch": 0.850106618723084, "grad_norm": 3.3584883213043213, "learning_rate": 3.3432858989308758e-06, "loss": 2.4129, "step": 5083 }, { "epoch": 0.850273863778902, "grad_norm": 6.300833702087402, "learning_rate": 3.3359978580206007e-06, "loss": 3.4022, "step": 5084 }, { "epoch": 0.8504411088347201, "grad_norm": 4.686997413635254, "learning_rate": 3.328717201463141e-06, "loss": 2.4996, "step": 5085 }, { "epoch": 0.8506083538905381, "grad_norm": 4.550241947174072, "learning_rate": 3.3214439317401784e-06, "loss": 2.5397, "step": 5086 }, { "epoch": 0.8507755989463561, "grad_norm": 2.9628822803497314, "learning_rate": 3.314178051330852e-06, "loss": 2.6489, "step": 5087 }, { "epoch": 0.8509428440021742, "grad_norm": 12.814192771911621, "learning_rate": 3.3069195627118186e-06, "loss": 2.3162, "step": 5088 }, { "epoch": 0.8511100890579922, "grad_norm": 5.928567886352539, "learning_rate": 3.2996684683571755e-06, "loss": 2.6365, "step": 5089 }, { "epoch": 0.8512773341138102, "grad_norm": 3.7530157566070557, "learning_rate": 3.292424770738528e-06, "loss": 2.6453, "step": 5090 }, { "epoch": 0.8514445791696283, "grad_norm": 5.510869026184082, "learning_rate": 3.285188472324943e-06, "loss": 2.8172, "step": 5091 }, { "epoch": 0.8516118242254463, "grad_norm": 5.510162353515625, "learning_rate": 3.2779595755829783e-06, "loss": 2.7027, "step": 5092 }, { "epoch": 0.8517790692812643, "grad_norm": 3.378939390182495, "learning_rate": 3.270738082976654e-06, "loss": 2.3997, "step": 5093 }, { "epoch": 0.8519463143370825, "grad_norm": 7.211568832397461, "learning_rate": 3.2635239969674852e-06, "loss": 2.9978, "step": 5094 }, { "epoch": 0.8521135593929005, "grad_norm": 7.566339015960693, "learning_rate": 3.256317320014443e-06, "loss": 2.6443, "step": 5095 }, { "epoch": 0.8522808044487185, "grad_norm": 5.069674015045166, "learning_rate": 3.249118054573985e-06, "loss": 2.7028, "step": 5096 }, { "epoch": 0.8524480495045366, "grad_norm": 6.437502384185791, "learning_rate": 3.2419262031000487e-06, "loss": 2.5501, "step": 5097 }, { "epoch": 0.8526152945603546, "grad_norm": 2.803046703338623, "learning_rate": 3.2347417680440224e-06, "loss": 2.692, "step": 5098 }, { "epoch": 0.8527825396161726, "grad_norm": 4.956595420837402, "learning_rate": 3.2275647518547917e-06, "loss": 2.8769, "step": 5099 }, { "epoch": 0.8529497846719907, "grad_norm": 6.787181377410889, "learning_rate": 3.2203951569786893e-06, "loss": 2.5486, "step": 5100 }, { "epoch": 0.8531170297278087, "grad_norm": 6.357896327972412, "learning_rate": 3.2132329858595433e-06, "loss": 2.7683, "step": 5101 }, { "epoch": 0.8532842747836267, "grad_norm": 4.577526092529297, "learning_rate": 3.206078240938626e-06, "loss": 2.9056, "step": 5102 }, { "epoch": 0.8534515198394448, "grad_norm": 3.7935142517089844, "learning_rate": 3.1989309246546946e-06, "loss": 2.8592, "step": 5103 }, { "epoch": 0.8536187648952628, "grad_norm": 3.0745301246643066, "learning_rate": 3.1917910394439756e-06, "loss": 2.4021, "step": 5104 }, { "epoch": 0.8537860099510808, "grad_norm": 4.677557945251465, "learning_rate": 3.184658587740158e-06, "loss": 2.2929, "step": 5105 }, { "epoch": 0.8539532550068989, "grad_norm": 17.588685989379883, "learning_rate": 3.1775335719743894e-06, "loss": 2.9336, "step": 5106 }, { "epoch": 0.8541205000627169, "grad_norm": 6.8988542556762695, "learning_rate": 3.170415994575299e-06, "loss": 3.0458, "step": 5107 }, { "epoch": 0.8542877451185349, "grad_norm": 2.5071072578430176, "learning_rate": 3.1633058579689593e-06, "loss": 2.6177, "step": 5108 }, { "epoch": 0.854454990174353, "grad_norm": 7.051429748535156, "learning_rate": 3.15620316457893e-06, "loss": 2.5427, "step": 5109 }, { "epoch": 0.854622235230171, "grad_norm": 4.671677112579346, "learning_rate": 3.1491079168262133e-06, "loss": 2.7659, "step": 5110 }, { "epoch": 0.854789480285989, "grad_norm": 6.370981693267822, "learning_rate": 3.142020117129288e-06, "loss": 2.9117, "step": 5111 }, { "epoch": 0.8549567253418071, "grad_norm": 4.115886211395264, "learning_rate": 3.134939767904091e-06, "loss": 2.5705, "step": 5112 }, { "epoch": 0.8551239703976251, "grad_norm": 4.520246505737305, "learning_rate": 3.1278668715640087e-06, "loss": 2.7605, "step": 5113 }, { "epoch": 0.8552912154534431, "grad_norm": 4.963520526885986, "learning_rate": 3.120801430519907e-06, "loss": 2.596, "step": 5114 }, { "epoch": 0.8554584605092612, "grad_norm": 11.833632469177246, "learning_rate": 3.11374344718009e-06, "loss": 2.3161, "step": 5115 }, { "epoch": 0.8556257055650792, "grad_norm": 4.741958141326904, "learning_rate": 3.1066929239503355e-06, "loss": 2.5717, "step": 5116 }, { "epoch": 0.8557929506208972, "grad_norm": 4.837205410003662, "learning_rate": 3.0996498632338667e-06, "loss": 2.5036, "step": 5117 }, { "epoch": 0.8559601956767153, "grad_norm": 4.1379923820495605, "learning_rate": 3.092614267431376e-06, "loss": 2.7425, "step": 5118 }, { "epoch": 0.8561274407325333, "grad_norm": 7.043578147888184, "learning_rate": 3.0855861389409935e-06, "loss": 2.8221, "step": 5119 }, { "epoch": 0.8562946857883513, "grad_norm": 2.536895751953125, "learning_rate": 3.0785654801583212e-06, "loss": 2.1654, "step": 5120 }, { "epoch": 0.8564619308441694, "grad_norm": 4.442600727081299, "learning_rate": 3.071552293476407e-06, "loss": 3.107, "step": 5121 }, { "epoch": 0.8566291758999874, "grad_norm": 3.517634153366089, "learning_rate": 3.064546581285757e-06, "loss": 2.4772, "step": 5122 }, { "epoch": 0.8567964209558055, "grad_norm": 7.508183479309082, "learning_rate": 3.0575483459743203e-06, "loss": 2.6551, "step": 5123 }, { "epoch": 0.8569636660116235, "grad_norm": 6.123786926269531, "learning_rate": 3.050557589927508e-06, "loss": 2.7713, "step": 5124 }, { "epoch": 0.8571309110674415, "grad_norm": 8.114075660705566, "learning_rate": 3.043574315528172e-06, "loss": 2.7555, "step": 5125 }, { "epoch": 0.8572981561232597, "grad_norm": 3.3295984268188477, "learning_rate": 3.036598525156617e-06, "loss": 2.5982, "step": 5126 }, { "epoch": 0.8574654011790777, "grad_norm": 13.440391540527344, "learning_rate": 3.029630221190605e-06, "loss": 2.2935, "step": 5127 }, { "epoch": 0.8576326462348957, "grad_norm": 3.243175745010376, "learning_rate": 3.0226694060053266e-06, "loss": 2.4532, "step": 5128 }, { "epoch": 0.8577998912907138, "grad_norm": 6.8567328453063965, "learning_rate": 3.015716081973452e-06, "loss": 2.5821, "step": 5129 }, { "epoch": 0.8579671363465318, "grad_norm": 4.979224681854248, "learning_rate": 3.0087702514650622e-06, "loss": 2.7851, "step": 5130 }, { "epoch": 0.8581343814023498, "grad_norm": 6.643808364868164, "learning_rate": 3.001831916847714e-06, "loss": 2.8966, "step": 5131 }, { "epoch": 0.8583016264581679, "grad_norm": 4.518975734710693, "learning_rate": 2.9949010804863816e-06, "loss": 2.3664, "step": 5132 }, { "epoch": 0.8584688715139859, "grad_norm": 5.8963494300842285, "learning_rate": 2.9879777447435113e-06, "loss": 2.5506, "step": 5133 }, { "epoch": 0.8586361165698039, "grad_norm": 4.7900390625, "learning_rate": 2.981061911978966e-06, "loss": 2.6925, "step": 5134 }, { "epoch": 0.858803361625622, "grad_norm": 5.1283674240112305, "learning_rate": 2.9741535845500768e-06, "loss": 2.7177, "step": 5135 }, { "epoch": 0.85897060668144, "grad_norm": 8.067514419555664, "learning_rate": 2.96725276481159e-06, "loss": 3.5046, "step": 5136 }, { "epoch": 0.859137851737258, "grad_norm": 7.315921306610107, "learning_rate": 2.960359455115716e-06, "loss": 2.6221, "step": 5137 }, { "epoch": 0.8593050967930761, "grad_norm": 11.797233581542969, "learning_rate": 2.9534736578120973e-06, "loss": 3.6515, "step": 5138 }, { "epoch": 0.8594723418488941, "grad_norm": 5.365890979766846, "learning_rate": 2.9465953752478043e-06, "loss": 2.9612, "step": 5139 }, { "epoch": 0.8596395869047121, "grad_norm": 4.059781551361084, "learning_rate": 2.9397246097673693e-06, "loss": 2.6507, "step": 5140 }, { "epoch": 0.8598068319605302, "grad_norm": 5.817462921142578, "learning_rate": 2.9328613637127385e-06, "loss": 2.9226, "step": 5141 }, { "epoch": 0.8599740770163482, "grad_norm": 5.006478786468506, "learning_rate": 2.9260056394233126e-06, "loss": 2.9868, "step": 5142 }, { "epoch": 0.8601413220721662, "grad_norm": 11.010635375976562, "learning_rate": 2.9191574392359116e-06, "loss": 3.1339, "step": 5143 }, { "epoch": 0.8603085671279843, "grad_norm": 5.543432712554932, "learning_rate": 2.91231676548481e-06, "loss": 2.717, "step": 5144 }, { "epoch": 0.8604758121838023, "grad_norm": 5.15162992477417, "learning_rate": 2.9054836205017034e-06, "loss": 2.3295, "step": 5145 }, { "epoch": 0.8606430572396203, "grad_norm": 5.037103176116943, "learning_rate": 2.8986580066157343e-06, "loss": 2.5218, "step": 5146 }, { "epoch": 0.8608103022954384, "grad_norm": 8.967065811157227, "learning_rate": 2.891839926153453e-06, "loss": 2.6267, "step": 5147 }, { "epoch": 0.8609775473512564, "grad_norm": 5.238532066345215, "learning_rate": 2.885029381438875e-06, "loss": 2.7822, "step": 5148 }, { "epoch": 0.8611447924070744, "grad_norm": 5.891392230987549, "learning_rate": 2.8782263747934136e-06, "loss": 2.7908, "step": 5149 }, { "epoch": 0.8613120374628925, "grad_norm": 8.463727951049805, "learning_rate": 2.8714309085359466e-06, "loss": 2.875, "step": 5150 }, { "epoch": 0.8614792825187105, "grad_norm": 5.5065789222717285, "learning_rate": 2.864642984982749e-06, "loss": 2.8692, "step": 5151 }, { "epoch": 0.8616465275745285, "grad_norm": 6.48697566986084, "learning_rate": 2.8578626064475505e-06, "loss": 2.9886, "step": 5152 }, { "epoch": 0.8618137726303466, "grad_norm": 5.1550068855285645, "learning_rate": 2.8510897752414916e-06, "loss": 2.301, "step": 5153 }, { "epoch": 0.8619810176861646, "grad_norm": 6.893200397491455, "learning_rate": 2.8443244936731505e-06, "loss": 2.5661, "step": 5154 }, { "epoch": 0.8621482627419826, "grad_norm": 4.1540021896362305, "learning_rate": 2.8375667640485353e-06, "loss": 2.3387, "step": 5155 }, { "epoch": 0.8623155077978008, "grad_norm": 7.229578971862793, "learning_rate": 2.8308165886710645e-06, "loss": 2.535, "step": 5156 }, { "epoch": 0.8624827528536188, "grad_norm": 8.691336631774902, "learning_rate": 2.824073969841595e-06, "loss": 2.5967, "step": 5157 }, { "epoch": 0.8626499979094367, "grad_norm": 11.041668891906738, "learning_rate": 2.8173389098584e-06, "loss": 3.2836, "step": 5158 }, { "epoch": 0.8628172429652549, "grad_norm": 6.149074554443359, "learning_rate": 2.810611411017186e-06, "loss": 2.7942, "step": 5159 }, { "epoch": 0.8629844880210729, "grad_norm": 7.553060054779053, "learning_rate": 2.80389147561107e-06, "loss": 2.8965, "step": 5160 }, { "epoch": 0.8631517330768909, "grad_norm": 4.207579612731934, "learning_rate": 2.7971791059305974e-06, "loss": 2.9362, "step": 5161 }, { "epoch": 0.863318978132709, "grad_norm": 6.770568370819092, "learning_rate": 2.790474304263738e-06, "loss": 3.2271, "step": 5162 }, { "epoch": 0.863486223188527, "grad_norm": 2.5599849224090576, "learning_rate": 2.7837770728958824e-06, "loss": 2.3071, "step": 5163 }, { "epoch": 0.8636534682443451, "grad_norm": 7.815213680267334, "learning_rate": 2.7770874141098275e-06, "loss": 2.9698, "step": 5164 }, { "epoch": 0.8638207133001631, "grad_norm": 4.006940841674805, "learning_rate": 2.7704053301858075e-06, "loss": 2.167, "step": 5165 }, { "epoch": 0.8639879583559811, "grad_norm": 3.1392509937286377, "learning_rate": 2.763730823401453e-06, "loss": 2.476, "step": 5166 }, { "epoch": 0.8641552034117992, "grad_norm": 4.3266191482543945, "learning_rate": 2.757063896031839e-06, "loss": 2.8932, "step": 5167 }, { "epoch": 0.8643224484676172, "grad_norm": 3.389923334121704, "learning_rate": 2.750404550349431e-06, "loss": 2.3073, "step": 5168 }, { "epoch": 0.8644896935234352, "grad_norm": 4.751375675201416, "learning_rate": 2.7437527886241266e-06, "loss": 2.3691, "step": 5169 }, { "epoch": 0.8646569385792533, "grad_norm": 7.138225078582764, "learning_rate": 2.737108613123235e-06, "loss": 2.3783, "step": 5170 }, { "epoch": 0.8648241836350713, "grad_norm": 3.4688236713409424, "learning_rate": 2.7304720261114745e-06, "loss": 2.4529, "step": 5171 }, { "epoch": 0.8649914286908893, "grad_norm": 5.289647579193115, "learning_rate": 2.7238430298509837e-06, "loss": 2.7322, "step": 5172 }, { "epoch": 0.8651586737467074, "grad_norm": 10.236954689025879, "learning_rate": 2.717221626601302e-06, "loss": 3.4623, "step": 5173 }, { "epoch": 0.8653259188025254, "grad_norm": 2.8359508514404297, "learning_rate": 2.710607818619401e-06, "loss": 2.7587, "step": 5174 }, { "epoch": 0.8654931638583434, "grad_norm": 3.8157403469085693, "learning_rate": 2.7040016081596426e-06, "loss": 2.483, "step": 5175 }, { "epoch": 0.8656604089141615, "grad_norm": 11.969720840454102, "learning_rate": 2.6974029974738137e-06, "loss": 2.8472, "step": 5176 }, { "epoch": 0.8658276539699795, "grad_norm": 5.117988586425781, "learning_rate": 2.690811988811093e-06, "loss": 3.0758, "step": 5177 }, { "epoch": 0.8659948990257975, "grad_norm": 3.597445487976074, "learning_rate": 2.684228584418097e-06, "loss": 2.2457, "step": 5178 }, { "epoch": 0.8661621440816156, "grad_norm": 3.9555318355560303, "learning_rate": 2.67765278653882e-06, "loss": 2.561, "step": 5179 }, { "epoch": 0.8663293891374336, "grad_norm": 4.675159454345703, "learning_rate": 2.6710845974146888e-06, "loss": 2.7058, "step": 5180 }, { "epoch": 0.8664966341932516, "grad_norm": 9.10074520111084, "learning_rate": 2.664524019284509e-06, "loss": 2.856, "step": 5181 }, { "epoch": 0.8666638792490697, "grad_norm": 3.8507933616638184, "learning_rate": 2.6579710543845225e-06, "loss": 3.077, "step": 5182 }, { "epoch": 0.8668311243048877, "grad_norm": 5.427565574645996, "learning_rate": 2.6514257049483544e-06, "loss": 2.8237, "step": 5183 }, { "epoch": 0.8669983693607057, "grad_norm": 5.700568199157715, "learning_rate": 2.6448879732070332e-06, "loss": 2.8473, "step": 5184 }, { "epoch": 0.8671656144165238, "grad_norm": 4.675955772399902, "learning_rate": 2.6383578613890125e-06, "loss": 3.1427, "step": 5185 }, { "epoch": 0.8673328594723418, "grad_norm": 4.12567138671875, "learning_rate": 2.6318353717201203e-06, "loss": 2.5706, "step": 5186 }, { "epoch": 0.8675001045281598, "grad_norm": 8.727436065673828, "learning_rate": 2.625320506423615e-06, "loss": 2.9229, "step": 5187 }, { "epoch": 0.867667349583978, "grad_norm": 6.358867168426514, "learning_rate": 2.61881326772013e-06, "loss": 3.1785, "step": 5188 }, { "epoch": 0.867834594639796, "grad_norm": 6.716037273406982, "learning_rate": 2.6123136578277226e-06, "loss": 2.8695, "step": 5189 }, { "epoch": 0.868001839695614, "grad_norm": 10.066751480102539, "learning_rate": 2.6058216789618256e-06, "loss": 3.3473, "step": 5190 }, { "epoch": 0.8681690847514321, "grad_norm": 3.846135139465332, "learning_rate": 2.5993373333352942e-06, "loss": 2.5751, "step": 5191 }, { "epoch": 0.8683363298072501, "grad_norm": 4.924368858337402, "learning_rate": 2.5928606231583606e-06, "loss": 2.7342, "step": 5192 }, { "epoch": 0.8685035748630681, "grad_norm": 7.418871879577637, "learning_rate": 2.5863915506386755e-06, "loss": 2.6725, "step": 5193 }, { "epoch": 0.8686708199188862, "grad_norm": 3.8071281909942627, "learning_rate": 2.5799301179812635e-06, "loss": 2.6677, "step": 5194 }, { "epoch": 0.8688380649747042, "grad_norm": 8.082538604736328, "learning_rate": 2.573476327388566e-06, "loss": 2.907, "step": 5195 }, { "epoch": 0.8690053100305222, "grad_norm": 5.339237213134766, "learning_rate": 2.567030181060409e-06, "loss": 2.6782, "step": 5196 }, { "epoch": 0.8691725550863403, "grad_norm": 8.186684608459473, "learning_rate": 2.5605916811940077e-06, "loss": 3.0905, "step": 5197 }, { "epoch": 0.8693398001421583, "grad_norm": 5.121346950531006, "learning_rate": 2.55416082998399e-06, "loss": 2.5915, "step": 5198 }, { "epoch": 0.8695070451979763, "grad_norm": 5.241125106811523, "learning_rate": 2.5477376296223513e-06, "loss": 3.177, "step": 5199 }, { "epoch": 0.8696742902537944, "grad_norm": 3.784055471420288, "learning_rate": 2.541322082298503e-06, "loss": 2.6164, "step": 5200 }, { "epoch": 0.8698415353096124, "grad_norm": 9.034314155578613, "learning_rate": 2.5349141901992277e-06, "loss": 2.9479, "step": 5201 }, { "epoch": 0.8700087803654305, "grad_norm": 4.913028717041016, "learning_rate": 2.52851395550871e-06, "loss": 2.3699, "step": 5202 }, { "epoch": 0.8701760254212485, "grad_norm": 6.180242538452148, "learning_rate": 2.5221213804085265e-06, "loss": 3.1013, "step": 5203 }, { "epoch": 0.8703432704770665, "grad_norm": 5.613980293273926, "learning_rate": 2.5157364670776396e-06, "loss": 2.6583, "step": 5204 }, { "epoch": 0.8705105155328846, "grad_norm": 6.027699947357178, "learning_rate": 2.5093592176923946e-06, "loss": 2.7655, "step": 5205 }, { "epoch": 0.8706777605887026, "grad_norm": 3.45556378364563, "learning_rate": 2.5029896344265335e-06, "loss": 2.6008, "step": 5206 }, { "epoch": 0.8708450056445206, "grad_norm": 4.0079569816589355, "learning_rate": 2.496627719451175e-06, "loss": 2.6957, "step": 5207 }, { "epoch": 0.8710122507003387, "grad_norm": 4.13932991027832, "learning_rate": 2.49027347493484e-06, "loss": 2.6773, "step": 5208 }, { "epoch": 0.8711794957561567, "grad_norm": 6.109152317047119, "learning_rate": 2.4839269030434125e-06, "loss": 2.881, "step": 5209 }, { "epoch": 0.8713467408119747, "grad_norm": 10.481668472290039, "learning_rate": 2.477588005940179e-06, "loss": 3.0366, "step": 5210 }, { "epoch": 0.8715139858677928, "grad_norm": 4.423671722412109, "learning_rate": 2.471256785785811e-06, "loss": 2.6968, "step": 5211 }, { "epoch": 0.8716812309236108, "grad_norm": 6.732901096343994, "learning_rate": 2.464933244738346e-06, "loss": 2.4607, "step": 5212 }, { "epoch": 0.8718484759794288, "grad_norm": 5.970854759216309, "learning_rate": 2.4586173849532268e-06, "loss": 2.4259, "step": 5213 }, { "epoch": 0.8720157210352469, "grad_norm": 5.109378814697266, "learning_rate": 2.4523092085832535e-06, "loss": 2.6714, "step": 5214 }, { "epoch": 0.8721829660910649, "grad_norm": 3.2922515869140625, "learning_rate": 2.446008717778628e-06, "loss": 2.3423, "step": 5215 }, { "epoch": 0.8723502111468829, "grad_norm": 3.7570533752441406, "learning_rate": 2.4397159146869188e-06, "loss": 2.4951, "step": 5216 }, { "epoch": 0.872517456202701, "grad_norm": 6.841156005859375, "learning_rate": 2.4334308014530878e-06, "loss": 2.7108, "step": 5217 }, { "epoch": 0.872684701258519, "grad_norm": 4.856072902679443, "learning_rate": 2.427153380219452e-06, "loss": 2.9532, "step": 5218 }, { "epoch": 0.872851946314337, "grad_norm": 5.216188430786133, "learning_rate": 2.420883653125741e-06, "loss": 2.5295, "step": 5219 }, { "epoch": 0.8730191913701552, "grad_norm": 5.846280574798584, "learning_rate": 2.4146216223090294e-06, "loss": 2.3391, "step": 5220 }, { "epoch": 0.8731864364259732, "grad_norm": 5.649930953979492, "learning_rate": 2.40836728990379e-06, "loss": 2.4073, "step": 5221 }, { "epoch": 0.8733536814817912, "grad_norm": 6.650360107421875, "learning_rate": 2.4021206580418536e-06, "loss": 2.5206, "step": 5222 }, { "epoch": 0.8735209265376093, "grad_norm": 4.192843914031982, "learning_rate": 2.3958817288524476e-06, "loss": 2.9567, "step": 5223 }, { "epoch": 0.8736881715934273, "grad_norm": 14.522388458251953, "learning_rate": 2.3896505044621515e-06, "loss": 3.262, "step": 5224 }, { "epoch": 0.8738554166492453, "grad_norm": 7.313216209411621, "learning_rate": 2.3834269869949382e-06, "loss": 2.5667, "step": 5225 }, { "epoch": 0.8740226617050634, "grad_norm": 4.603438377380371, "learning_rate": 2.3772111785721364e-06, "loss": 2.4412, "step": 5226 }, { "epoch": 0.8741899067608814, "grad_norm": 6.281129837036133, "learning_rate": 2.371003081312459e-06, "loss": 2.7197, "step": 5227 }, { "epoch": 0.8743571518166994, "grad_norm": 4.71771764755249, "learning_rate": 2.3648026973319913e-06, "loss": 2.8821, "step": 5228 }, { "epoch": 0.8745243968725175, "grad_norm": 7.076990604400635, "learning_rate": 2.3586100287441766e-06, "loss": 3.1666, "step": 5229 }, { "epoch": 0.8746916419283355, "grad_norm": 4.4265031814575195, "learning_rate": 2.352425077659848e-06, "loss": 2.6171, "step": 5230 }, { "epoch": 0.8748588869841535, "grad_norm": 2.90143084526062, "learning_rate": 2.346247846187183e-06, "loss": 2.5208, "step": 5231 }, { "epoch": 0.8750261320399716, "grad_norm": 5.072590351104736, "learning_rate": 2.340078336431753e-06, "loss": 3.1445, "step": 5232 }, { "epoch": 0.8751933770957896, "grad_norm": 6.58900260925293, "learning_rate": 2.333916550496476e-06, "loss": 3.0228, "step": 5233 }, { "epoch": 0.8753606221516076, "grad_norm": 3.935560941696167, "learning_rate": 2.3277624904816607e-06, "loss": 2.6295, "step": 5234 }, { "epoch": 0.8755278672074257, "grad_norm": 4.238943576812744, "learning_rate": 2.3216161584849487e-06, "loss": 2.3972, "step": 5235 }, { "epoch": 0.8756951122632437, "grad_norm": 4.899868965148926, "learning_rate": 2.315477556601392e-06, "loss": 2.5885, "step": 5236 }, { "epoch": 0.8758623573190617, "grad_norm": 6.877076148986816, "learning_rate": 2.3093466869233635e-06, "loss": 2.5789, "step": 5237 }, { "epoch": 0.8760296023748798, "grad_norm": 3.0044949054718018, "learning_rate": 2.303223551540637e-06, "loss": 2.3156, "step": 5238 }, { "epoch": 0.8761968474306978, "grad_norm": 2.59704852104187, "learning_rate": 2.297108152540317e-06, "loss": 2.0963, "step": 5239 }, { "epoch": 0.8763640924865159, "grad_norm": 12.402918815612793, "learning_rate": 2.2910004920069006e-06, "loss": 3.0984, "step": 5240 }, { "epoch": 0.8765313375423339, "grad_norm": 5.726922988891602, "learning_rate": 2.2849005720222248e-06, "loss": 2.7702, "step": 5241 }, { "epoch": 0.8766985825981519, "grad_norm": 11.751184463500977, "learning_rate": 2.2788083946655047e-06, "loss": 3.5829, "step": 5242 }, { "epoch": 0.87686582765397, "grad_norm": 5.069803237915039, "learning_rate": 2.272723962013301e-06, "loss": 2.8876, "step": 5243 }, { "epoch": 0.877033072709788, "grad_norm": 4.789041996002197, "learning_rate": 2.266647276139547e-06, "loss": 2.4623, "step": 5244 }, { "epoch": 0.877200317765606, "grad_norm": 9.497261047363281, "learning_rate": 2.2605783391155377e-06, "loss": 2.6662, "step": 5245 }, { "epoch": 0.8773675628214241, "grad_norm": 5.407397270202637, "learning_rate": 2.2545171530099052e-06, "loss": 2.8744, "step": 5246 }, { "epoch": 0.8775348078772421, "grad_norm": 4.121047496795654, "learning_rate": 2.248463719888669e-06, "loss": 2.4162, "step": 5247 }, { "epoch": 0.8777020529330601, "grad_norm": 6.175109386444092, "learning_rate": 2.242418041815178e-06, "loss": 2.5993, "step": 5248 }, { "epoch": 0.8778692979888782, "grad_norm": 7.2970380783081055, "learning_rate": 2.236380120850165e-06, "loss": 2.7231, "step": 5249 }, { "epoch": 0.8780365430446962, "grad_norm": 4.035260200500488, "learning_rate": 2.230349959051692e-06, "loss": 2.8414, "step": 5250 }, { "epoch": 0.8782037881005142, "grad_norm": 4.288783550262451, "learning_rate": 2.224327558475195e-06, "loss": 2.7506, "step": 5251 }, { "epoch": 0.8783710331563324, "grad_norm": 4.2283430099487305, "learning_rate": 2.2183129211734653e-06, "loss": 2.7183, "step": 5252 }, { "epoch": 0.8785382782121504, "grad_norm": 5.604532718658447, "learning_rate": 2.2123060491966275e-06, "loss": 2.4437, "step": 5253 }, { "epoch": 0.8787055232679684, "grad_norm": 7.173028469085693, "learning_rate": 2.2063069445921846e-06, "loss": 3.0478, "step": 5254 }, { "epoch": 0.8788727683237865, "grad_norm": 8.009917259216309, "learning_rate": 2.2003156094049738e-06, "loss": 3.132, "step": 5255 }, { "epoch": 0.8790400133796045, "grad_norm": 4.061643600463867, "learning_rate": 2.194332045677197e-06, "loss": 2.4743, "step": 5256 }, { "epoch": 0.8792072584354225, "grad_norm": 3.7475545406341553, "learning_rate": 2.188356255448393e-06, "loss": 2.6137, "step": 5257 }, { "epoch": 0.8793745034912406, "grad_norm": 4.363213062286377, "learning_rate": 2.18238824075547e-06, "loss": 2.7119, "step": 5258 }, { "epoch": 0.8795417485470586, "grad_norm": 3.7451884746551514, "learning_rate": 2.176428003632661e-06, "loss": 2.8167, "step": 5259 }, { "epoch": 0.8797089936028766, "grad_norm": 6.460381031036377, "learning_rate": 2.1704755461115693e-06, "loss": 2.7943, "step": 5260 }, { "epoch": 0.8798762386586947, "grad_norm": 4.209609508514404, "learning_rate": 2.1645308702211376e-06, "loss": 2.6525, "step": 5261 }, { "epoch": 0.8800434837145127, "grad_norm": 4.100426197052002, "learning_rate": 2.1585939779876633e-06, "loss": 2.5135, "step": 5262 }, { "epoch": 0.8802107287703307, "grad_norm": 6.685253620147705, "learning_rate": 2.152664871434776e-06, "loss": 2.3479, "step": 5263 }, { "epoch": 0.8803779738261488, "grad_norm": 5.400805473327637, "learning_rate": 2.146743552583469e-06, "loss": 3.1252, "step": 5264 }, { "epoch": 0.8805452188819668, "grad_norm": 4.308763027191162, "learning_rate": 2.1408300234520616e-06, "loss": 3.0447, "step": 5265 }, { "epoch": 0.8807124639377848, "grad_norm": 4.266951084136963, "learning_rate": 2.134924286056239e-06, "loss": 2.9565, "step": 5266 }, { "epoch": 0.8808797089936029, "grad_norm": 6.339324951171875, "learning_rate": 2.12902634240901e-06, "loss": 2.7003, "step": 5267 }, { "epoch": 0.8810469540494209, "grad_norm": 4.511047840118408, "learning_rate": 2.1231361945207447e-06, "loss": 2.3514, "step": 5268 }, { "epoch": 0.8812141991052389, "grad_norm": 6.766480445861816, "learning_rate": 2.11725384439915e-06, "loss": 2.9862, "step": 5269 }, { "epoch": 0.881381444161057, "grad_norm": 6.09503173828125, "learning_rate": 2.1113792940492675e-06, "loss": 2.4727, "step": 5270 }, { "epoch": 0.881548689216875, "grad_norm": 3.003695011138916, "learning_rate": 2.10551254547349e-06, "loss": 2.5637, "step": 5271 }, { "epoch": 0.881715934272693, "grad_norm": 5.110933780670166, "learning_rate": 2.0996536006715415e-06, "loss": 2.4904, "step": 5272 }, { "epoch": 0.8818831793285111, "grad_norm": 3.8732924461364746, "learning_rate": 2.0938024616404985e-06, "loss": 2.5536, "step": 5273 }, { "epoch": 0.8820504243843291, "grad_norm": 6.092417240142822, "learning_rate": 2.0879591303747616e-06, "loss": 2.9421, "step": 5274 }, { "epoch": 0.8822176694401471, "grad_norm": 14.269906997680664, "learning_rate": 2.082123608866085e-06, "loss": 2.3507, "step": 5275 }, { "epoch": 0.8823849144959652, "grad_norm": 6.068264007568359, "learning_rate": 2.0762958991035476e-06, "loss": 2.4426, "step": 5276 }, { "epoch": 0.8825521595517832, "grad_norm": 3.812016248703003, "learning_rate": 2.070476003073582e-06, "loss": 2.6779, "step": 5277 }, { "epoch": 0.8827194046076012, "grad_norm": 4.100927829742432, "learning_rate": 2.064663922759935e-06, "loss": 2.4567, "step": 5278 }, { "epoch": 0.8828866496634193, "grad_norm": 3.399949073791504, "learning_rate": 2.0588596601437166e-06, "loss": 2.7213, "step": 5279 }, { "epoch": 0.8830538947192373, "grad_norm": 8.265006065368652, "learning_rate": 2.0530632172033436e-06, "loss": 2.8642, "step": 5280 }, { "epoch": 0.8832211397750555, "grad_norm": 6.0497002601623535, "learning_rate": 2.0472745959145916e-06, "loss": 2.7208, "step": 5281 }, { "epoch": 0.8833883848308735, "grad_norm": 3.2470972537994385, "learning_rate": 2.041493798250549e-06, "loss": 2.6256, "step": 5282 }, { "epoch": 0.8835556298866915, "grad_norm": 3.607302188873291, "learning_rate": 2.0357208261816583e-06, "loss": 2.7245, "step": 5283 }, { "epoch": 0.8837228749425096, "grad_norm": 3.190889596939087, "learning_rate": 2.029955681675677e-06, "loss": 2.3379, "step": 5284 }, { "epoch": 0.8838901199983276, "grad_norm": 6.397571086883545, "learning_rate": 2.024198366697705e-06, "loss": 2.7915, "step": 5285 }, { "epoch": 0.8840573650541456, "grad_norm": 5.015715599060059, "learning_rate": 2.018448883210172e-06, "loss": 2.3406, "step": 5286 }, { "epoch": 0.8842246101099637, "grad_norm": 7.0810465812683105, "learning_rate": 2.0127072331728338e-06, "loss": 3.2645, "step": 5287 }, { "epoch": 0.8843918551657817, "grad_norm": 3.579272508621216, "learning_rate": 2.006973418542782e-06, "loss": 2.8642, "step": 5288 }, { "epoch": 0.8845591002215997, "grad_norm": 5.020390033721924, "learning_rate": 2.0012474412744285e-06, "loss": 2.4819, "step": 5289 }, { "epoch": 0.8847263452774178, "grad_norm": 5.132533073425293, "learning_rate": 1.9955293033195265e-06, "loss": 3.0299, "step": 5290 }, { "epoch": 0.8848935903332358, "grad_norm": 5.6946635246276855, "learning_rate": 1.9898190066271446e-06, "loss": 2.8839, "step": 5291 }, { "epoch": 0.8850608353890538, "grad_norm": 5.690025806427002, "learning_rate": 1.9841165531436835e-06, "loss": 2.6351, "step": 5292 }, { "epoch": 0.8852280804448719, "grad_norm": 4.133336067199707, "learning_rate": 1.9784219448128776e-06, "loss": 2.5289, "step": 5293 }, { "epoch": 0.8853953255006899, "grad_norm": 4.36055326461792, "learning_rate": 1.97273518357578e-06, "loss": 2.6467, "step": 5294 }, { "epoch": 0.8855625705565079, "grad_norm": 5.944424152374268, "learning_rate": 1.967056271370765e-06, "loss": 2.889, "step": 5295 }, { "epoch": 0.885729815612326, "grad_norm": 4.3217339515686035, "learning_rate": 1.9613852101335427e-06, "loss": 2.4361, "step": 5296 }, { "epoch": 0.885897060668144, "grad_norm": 6.963741779327393, "learning_rate": 1.9557220017971332e-06, "loss": 2.2952, "step": 5297 }, { "epoch": 0.886064305723962, "grad_norm": 7.796229362487793, "learning_rate": 1.9500666482918955e-06, "loss": 2.2906, "step": 5298 }, { "epoch": 0.8862315507797801, "grad_norm": 6.183309555053711, "learning_rate": 1.9444191515454957e-06, "loss": 2.9239, "step": 5299 }, { "epoch": 0.8863987958355981, "grad_norm": 3.9297780990600586, "learning_rate": 1.9387795134829355e-06, "loss": 2.6376, "step": 5300 }, { "epoch": 0.8865660408914161, "grad_norm": 4.04302453994751, "learning_rate": 1.933147736026525e-06, "loss": 2.7134, "step": 5301 }, { "epoch": 0.8867332859472342, "grad_norm": 7.538610458374023, "learning_rate": 1.927523821095906e-06, "loss": 2.9291, "step": 5302 }, { "epoch": 0.8869005310030522, "grad_norm": 5.725361347198486, "learning_rate": 1.921907770608042e-06, "loss": 3.0327, "step": 5303 }, { "epoch": 0.8870677760588702, "grad_norm": 4.924356460571289, "learning_rate": 1.9162995864771995e-06, "loss": 2.721, "step": 5304 }, { "epoch": 0.8872350211146883, "grad_norm": 3.8593339920043945, "learning_rate": 1.9106992706149797e-06, "loss": 2.3903, "step": 5305 }, { "epoch": 0.8874022661705063, "grad_norm": 3.636610269546509, "learning_rate": 1.9051068249302916e-06, "loss": 2.954, "step": 5306 }, { "epoch": 0.8875695112263243, "grad_norm": 7.185811519622803, "learning_rate": 1.8995222513293748e-06, "loss": 3.1052, "step": 5307 }, { "epoch": 0.8877367562821424, "grad_norm": 6.794389724731445, "learning_rate": 1.8939455517157673e-06, "loss": 2.3772, "step": 5308 }, { "epoch": 0.8879040013379604, "grad_norm": 4.324968338012695, "learning_rate": 1.8883767279903375e-06, "loss": 2.6623, "step": 5309 }, { "epoch": 0.8880712463937784, "grad_norm": 2.4945318698883057, "learning_rate": 1.88281578205127e-06, "loss": 2.555, "step": 5310 }, { "epoch": 0.8882384914495965, "grad_norm": 4.619632244110107, "learning_rate": 1.8772627157940453e-06, "loss": 3.1636, "step": 5311 }, { "epoch": 0.8884057365054145, "grad_norm": 10.295694351196289, "learning_rate": 1.871717531111486e-06, "loss": 4.2666, "step": 5312 }, { "epoch": 0.8885729815612325, "grad_norm": 3.606560707092285, "learning_rate": 1.8661802298937049e-06, "loss": 2.7269, "step": 5313 }, { "epoch": 0.8887402266170507, "grad_norm": 11.543442726135254, "learning_rate": 1.8606508140281392e-06, "loss": 2.8798, "step": 5314 }, { "epoch": 0.8889074716728687, "grad_norm": 4.0649638175964355, "learning_rate": 1.8551292853995345e-06, "loss": 3.0517, "step": 5315 }, { "epoch": 0.8890747167286867, "grad_norm": 4.347663402557373, "learning_rate": 1.8496156458899516e-06, "loss": 2.8916, "step": 5316 }, { "epoch": 0.8892419617845048, "grad_norm": 6.711286544799805, "learning_rate": 1.8441098973787513e-06, "loss": 3.2382, "step": 5317 }, { "epoch": 0.8894092068403228, "grad_norm": 4.458380222320557, "learning_rate": 1.8386120417426294e-06, "loss": 2.8975, "step": 5318 }, { "epoch": 0.8895764518961409, "grad_norm": 5.068148612976074, "learning_rate": 1.8331220808555593e-06, "loss": 2.5321, "step": 5319 }, { "epoch": 0.8897436969519589, "grad_norm": 5.181546688079834, "learning_rate": 1.8276400165888524e-06, "loss": 2.566, "step": 5320 }, { "epoch": 0.8899109420077769, "grad_norm": 5.950967788696289, "learning_rate": 1.8221658508111028e-06, "loss": 2.641, "step": 5321 }, { "epoch": 0.890078187063595, "grad_norm": 6.949619293212891, "learning_rate": 1.8166995853882318e-06, "loss": 2.4588, "step": 5322 }, { "epoch": 0.890245432119413, "grad_norm": 3.3236069679260254, "learning_rate": 1.81124122218346e-06, "loss": 2.4271, "step": 5323 }, { "epoch": 0.890412677175231, "grad_norm": 8.653010368347168, "learning_rate": 1.8057907630573162e-06, "loss": 2.8915, "step": 5324 }, { "epoch": 0.8905799222310491, "grad_norm": 4.474353313446045, "learning_rate": 1.8003482098676305e-06, "loss": 2.573, "step": 5325 }, { "epoch": 0.8907471672868671, "grad_norm": 9.356247901916504, "learning_rate": 1.7949135644695413e-06, "loss": 3.105, "step": 5326 }, { "epoch": 0.8909144123426851, "grad_norm": 7.615756034851074, "learning_rate": 1.7894868287155004e-06, "loss": 2.569, "step": 5327 }, { "epoch": 0.8910816573985032, "grad_norm": 3.4516031742095947, "learning_rate": 1.7840680044552444e-06, "loss": 3.0873, "step": 5328 }, { "epoch": 0.8912489024543212, "grad_norm": 8.218615531921387, "learning_rate": 1.7786570935358349e-06, "loss": 3.4232, "step": 5329 }, { "epoch": 0.8914161475101392, "grad_norm": 5.176138401031494, "learning_rate": 1.773254097801616e-06, "loss": 2.6925, "step": 5330 }, { "epoch": 0.8915833925659573, "grad_norm": 3.9558475017547607, "learning_rate": 1.7678590190942478e-06, "loss": 2.8613, "step": 5331 }, { "epoch": 0.8917506376217753, "grad_norm": 6.880895614624023, "learning_rate": 1.7624718592526845e-06, "loss": 3.15, "step": 5332 }, { "epoch": 0.8919178826775933, "grad_norm": 8.224952697753906, "learning_rate": 1.7570926201131877e-06, "loss": 2.8712, "step": 5333 }, { "epoch": 0.8920851277334114, "grad_norm": 11.860028266906738, "learning_rate": 1.7517213035093044e-06, "loss": 3.3216, "step": 5334 }, { "epoch": 0.8922523727892294, "grad_norm": 4.078397274017334, "learning_rate": 1.7463579112719092e-06, "loss": 2.6951, "step": 5335 }, { "epoch": 0.8924196178450474, "grad_norm": 6.7064313888549805, "learning_rate": 1.7410024452291423e-06, "loss": 2.5281, "step": 5336 }, { "epoch": 0.8925868629008655, "grad_norm": 4.8440704345703125, "learning_rate": 1.7356549072064714e-06, "loss": 2.9387, "step": 5337 }, { "epoch": 0.8927541079566835, "grad_norm": 7.493864059448242, "learning_rate": 1.7303152990266386e-06, "loss": 2.7757, "step": 5338 }, { "epoch": 0.8929213530125015, "grad_norm": 4.629037857055664, "learning_rate": 1.7249836225097015e-06, "loss": 2.4869, "step": 5339 }, { "epoch": 0.8930885980683196, "grad_norm": 3.846902370452881, "learning_rate": 1.7196598794729958e-06, "loss": 2.715, "step": 5340 }, { "epoch": 0.8932558431241376, "grad_norm": 4.616375923156738, "learning_rate": 1.7143440717311749e-06, "loss": 2.9475, "step": 5341 }, { "epoch": 0.8934230881799556, "grad_norm": 2.924095630645752, "learning_rate": 1.709036201096162e-06, "loss": 2.5206, "step": 5342 }, { "epoch": 0.8935903332357737, "grad_norm": 3.9507436752319336, "learning_rate": 1.7037362693771986e-06, "loss": 2.3253, "step": 5343 }, { "epoch": 0.8937575782915917, "grad_norm": 4.927101135253906, "learning_rate": 1.698444278380809e-06, "loss": 2.8558, "step": 5344 }, { "epoch": 0.8939248233474097, "grad_norm": 6.445294380187988, "learning_rate": 1.6931602299108063e-06, "loss": 2.3397, "step": 5345 }, { "epoch": 0.8940920684032279, "grad_norm": 3.1328396797180176, "learning_rate": 1.6878841257683077e-06, "loss": 2.4057, "step": 5346 }, { "epoch": 0.8942593134590459, "grad_norm": 7.881208896636963, "learning_rate": 1.6826159677517079e-06, "loss": 2.8743, "step": 5347 }, { "epoch": 0.8944265585148639, "grad_norm": 5.876470565795898, "learning_rate": 1.677355757656715e-06, "loss": 2.7147, "step": 5348 }, { "epoch": 0.894593803570682, "grad_norm": 8.550921440124512, "learning_rate": 1.6721034972763033e-06, "loss": 2.7474, "step": 5349 }, { "epoch": 0.8947610486265, "grad_norm": 9.681849479675293, "learning_rate": 1.6668591884007517e-06, "loss": 2.6589, "step": 5350 }, { "epoch": 0.894928293682318, "grad_norm": 4.406521797180176, "learning_rate": 1.6616228328176275e-06, "loss": 2.6017, "step": 5351 }, { "epoch": 0.8950955387381361, "grad_norm": 4.028356552124023, "learning_rate": 1.6563944323117863e-06, "loss": 2.8277, "step": 5352 }, { "epoch": 0.8952627837939541, "grad_norm": 5.013889312744141, "learning_rate": 1.6511739886653694e-06, "loss": 2.3586, "step": 5353 }, { "epoch": 0.8954300288497721, "grad_norm": 7.362745761871338, "learning_rate": 1.6459615036578113e-06, "loss": 3.0453, "step": 5354 }, { "epoch": 0.8955972739055902, "grad_norm": 9.489303588867188, "learning_rate": 1.6407569790658246e-06, "loss": 2.92, "step": 5355 }, { "epoch": 0.8957645189614082, "grad_norm": 3.62514328956604, "learning_rate": 1.6355604166634204e-06, "loss": 2.7499, "step": 5356 }, { "epoch": 0.8959317640172263, "grad_norm": 3.7131447792053223, "learning_rate": 1.630371818221882e-06, "loss": 2.4888, "step": 5357 }, { "epoch": 0.8960990090730443, "grad_norm": 4.4774885177612305, "learning_rate": 1.625191185509789e-06, "loss": 3.0574, "step": 5358 }, { "epoch": 0.8962662541288623, "grad_norm": 3.2748522758483887, "learning_rate": 1.6200185202930097e-06, "loss": 3.0658, "step": 5359 }, { "epoch": 0.8964334991846804, "grad_norm": 4.889610290527344, "learning_rate": 1.6148538243346777e-06, "loss": 2.2655, "step": 5360 }, { "epoch": 0.8966007442404984, "grad_norm": 4.395793437957764, "learning_rate": 1.6096970993952325e-06, "loss": 2.9814, "step": 5361 }, { "epoch": 0.8967679892963164, "grad_norm": 14.071759223937988, "learning_rate": 1.6045483472323763e-06, "loss": 2.431, "step": 5362 }, { "epoch": 0.8969352343521345, "grad_norm": 3.8885014057159424, "learning_rate": 1.5994075696011136e-06, "loss": 3.1639, "step": 5363 }, { "epoch": 0.8971024794079525, "grad_norm": 7.345308780670166, "learning_rate": 1.5942747682537119e-06, "loss": 2.4999, "step": 5364 }, { "epoch": 0.8972697244637705, "grad_norm": 2.7314836978912354, "learning_rate": 1.589149944939733e-06, "loss": 2.6476, "step": 5365 }, { "epoch": 0.8974369695195886, "grad_norm": 5.330618858337402, "learning_rate": 1.5840331014060129e-06, "loss": 2.6043, "step": 5366 }, { "epoch": 0.8976042145754066, "grad_norm": 7.93289852142334, "learning_rate": 1.5789242393966702e-06, "loss": 3.0034, "step": 5367 }, { "epoch": 0.8977714596312246, "grad_norm": 5.721530437469482, "learning_rate": 1.5738233606531038e-06, "loss": 2.477, "step": 5368 }, { "epoch": 0.8979387046870427, "grad_norm": 9.098907470703125, "learning_rate": 1.5687304669139925e-06, "loss": 2.3321, "step": 5369 }, { "epoch": 0.8981059497428607, "grad_norm": 8.145036697387695, "learning_rate": 1.563645559915289e-06, "loss": 2.9656, "step": 5370 }, { "epoch": 0.8982731947986787, "grad_norm": 5.3588948249816895, "learning_rate": 1.5585686413902178e-06, "loss": 2.2637, "step": 5371 }, { "epoch": 0.8984404398544968, "grad_norm": 6.895659446716309, "learning_rate": 1.5534997130693008e-06, "loss": 2.5869, "step": 5372 }, { "epoch": 0.8986076849103148, "grad_norm": 7.203694820404053, "learning_rate": 1.5484387766803137e-06, "loss": 3.6697, "step": 5373 }, { "epoch": 0.8987749299661328, "grad_norm": 2.0560100078582764, "learning_rate": 1.5433858339483242e-06, "loss": 2.5156, "step": 5374 }, { "epoch": 0.898942175021951, "grad_norm": 7.300956726074219, "learning_rate": 1.5383408865956628e-06, "loss": 3.0385, "step": 5375 }, { "epoch": 0.899109420077769, "grad_norm": 10.979328155517578, "learning_rate": 1.533303936341951e-06, "loss": 2.5663, "step": 5376 }, { "epoch": 0.899276665133587, "grad_norm": 7.38287878036499, "learning_rate": 1.5282749849040655e-06, "loss": 2.4841, "step": 5377 }, { "epoch": 0.8994439101894051, "grad_norm": 5.0817766189575195, "learning_rate": 1.5232540339961744e-06, "loss": 2.4071, "step": 5378 }, { "epoch": 0.8996111552452231, "grad_norm": 7.085948944091797, "learning_rate": 1.5182410853297025e-06, "loss": 2.585, "step": 5379 }, { "epoch": 0.8997784003010411, "grad_norm": 4.873412609100342, "learning_rate": 1.5132361406133584e-06, "loss": 3.1914, "step": 5380 }, { "epoch": 0.8999456453568592, "grad_norm": 5.27073860168457, "learning_rate": 1.5082392015531132e-06, "loss": 2.7034, "step": 5381 }, { "epoch": 0.9001128904126772, "grad_norm": 4.527646064758301, "learning_rate": 1.5032502698522244e-06, "loss": 2.8087, "step": 5382 }, { "epoch": 0.9002801354684952, "grad_norm": 5.126293659210205, "learning_rate": 1.4982693472111981e-06, "loss": 2.4148, "step": 5383 }, { "epoch": 0.9004473805243133, "grad_norm": 2.550096035003662, "learning_rate": 1.4932964353278317e-06, "loss": 2.1366, "step": 5384 }, { "epoch": 0.9006146255801313, "grad_norm": 5.327191352844238, "learning_rate": 1.4883315358971834e-06, "loss": 2.6485, "step": 5385 }, { "epoch": 0.9007818706359493, "grad_norm": 9.543914794921875, "learning_rate": 1.4833746506115713e-06, "loss": 3.4719, "step": 5386 }, { "epoch": 0.9009491156917674, "grad_norm": 5.721414089202881, "learning_rate": 1.4784257811606001e-06, "loss": 2.8571, "step": 5387 }, { "epoch": 0.9011163607475854, "grad_norm": 5.845968723297119, "learning_rate": 1.4734849292311255e-06, "loss": 2.8763, "step": 5388 }, { "epoch": 0.9012836058034034, "grad_norm": 6.612632751464844, "learning_rate": 1.4685520965072835e-06, "loss": 2.5229, "step": 5389 }, { "epoch": 0.9014508508592215, "grad_norm": 6.77650260925293, "learning_rate": 1.463627284670463e-06, "loss": 2.6746, "step": 5390 }, { "epoch": 0.9016180959150395, "grad_norm": 24.572153091430664, "learning_rate": 1.458710495399332e-06, "loss": 2.2715, "step": 5391 }, { "epoch": 0.9017853409708575, "grad_norm": 5.269320964813232, "learning_rate": 1.4538017303698166e-06, "loss": 2.7061, "step": 5392 }, { "epoch": 0.9019525860266756, "grad_norm": 10.849899291992188, "learning_rate": 1.4489009912551144e-06, "loss": 2.8482, "step": 5393 }, { "epoch": 0.9021198310824936, "grad_norm": 4.406374931335449, "learning_rate": 1.4440082797256755e-06, "loss": 2.8749, "step": 5394 }, { "epoch": 0.9022870761383117, "grad_norm": 4.428279399871826, "learning_rate": 1.4391235974492267e-06, "loss": 2.5703, "step": 5395 }, { "epoch": 0.9024543211941297, "grad_norm": 6.241497039794922, "learning_rate": 1.4342469460907448e-06, "loss": 2.4536, "step": 5396 }, { "epoch": 0.9026215662499477, "grad_norm": 4.4558939933776855, "learning_rate": 1.4293783273124832e-06, "loss": 2.2803, "step": 5397 }, { "epoch": 0.9027888113057658, "grad_norm": 10.378976821899414, "learning_rate": 1.4245177427739453e-06, "loss": 2.9467, "step": 5398 }, { "epoch": 0.9029560563615838, "grad_norm": 2.91723370552063, "learning_rate": 1.4196651941319056e-06, "loss": 2.5664, "step": 5399 }, { "epoch": 0.9031233014174018, "grad_norm": 12.106935501098633, "learning_rate": 1.4148206830403964e-06, "loss": 3.5089, "step": 5400 }, { "epoch": 0.9032905464732199, "grad_norm": 4.6773152351379395, "learning_rate": 1.4099842111507e-06, "loss": 2.4219, "step": 5401 }, { "epoch": 0.9034577915290379, "grad_norm": 4.928563594818115, "learning_rate": 1.4051557801113808e-06, "loss": 2.6149, "step": 5402 }, { "epoch": 0.9036250365848559, "grad_norm": 4.346463680267334, "learning_rate": 1.400335391568236e-06, "loss": 2.5882, "step": 5403 }, { "epoch": 0.903792281640674, "grad_norm": 5.656641960144043, "learning_rate": 1.3955230471643433e-06, "loss": 2.6833, "step": 5404 }, { "epoch": 0.903959526696492, "grad_norm": 4.252095699310303, "learning_rate": 1.3907187485400236e-06, "loss": 2.8648, "step": 5405 }, { "epoch": 0.90412677175231, "grad_norm": 2.8849408626556396, "learning_rate": 1.3859224973328699e-06, "loss": 1.996, "step": 5406 }, { "epoch": 0.9042940168081282, "grad_norm": 6.183009624481201, "learning_rate": 1.3811342951777134e-06, "loss": 2.3643, "step": 5407 }, { "epoch": 0.9044612618639462, "grad_norm": 3.9866888523101807, "learning_rate": 1.3763541437066568e-06, "loss": 2.5981, "step": 5408 }, { "epoch": 0.9046285069197642, "grad_norm": 4.468308925628662, "learning_rate": 1.371582044549058e-06, "loss": 2.8731, "step": 5409 }, { "epoch": 0.9047957519755823, "grad_norm": 4.737344264984131, "learning_rate": 1.366817999331524e-06, "loss": 3.0129, "step": 5410 }, { "epoch": 0.9049629970314003, "grad_norm": 1.3444324731826782, "learning_rate": 1.3620620096779114e-06, "loss": 2.0913, "step": 5411 }, { "epoch": 0.9051302420872183, "grad_norm": 4.146517276763916, "learning_rate": 1.3573140772093512e-06, "loss": 2.5142, "step": 5412 }, { "epoch": 0.9052974871430364, "grad_norm": 5.192017078399658, "learning_rate": 1.3525742035442018e-06, "loss": 2.5821, "step": 5413 }, { "epoch": 0.9054647321988544, "grad_norm": 5.160160064697266, "learning_rate": 1.3478423902981012e-06, "loss": 2.8653, "step": 5414 }, { "epoch": 0.9056319772546724, "grad_norm": 4.599849224090576, "learning_rate": 1.3431186390839122e-06, "loss": 3.0404, "step": 5415 }, { "epoch": 0.9057992223104905, "grad_norm": 8.339733123779297, "learning_rate": 1.3384029515117747e-06, "loss": 3.2942, "step": 5416 }, { "epoch": 0.9059664673663085, "grad_norm": 3.1671814918518066, "learning_rate": 1.3336953291890691e-06, "loss": 2.362, "step": 5417 }, { "epoch": 0.9061337124221265, "grad_norm": 3.8472836017608643, "learning_rate": 1.3289957737204207e-06, "loss": 2.9779, "step": 5418 }, { "epoch": 0.9063009574779446, "grad_norm": 2.8001348972320557, "learning_rate": 1.3243042867077198e-06, "loss": 2.2837, "step": 5419 }, { "epoch": 0.9064682025337626, "grad_norm": 5.900055885314941, "learning_rate": 1.3196208697500905e-06, "loss": 2.878, "step": 5420 }, { "epoch": 0.9066354475895806, "grad_norm": 13.703722953796387, "learning_rate": 1.3149455244439197e-06, "loss": 4.3084, "step": 5421 }, { "epoch": 0.9068026926453987, "grad_norm": 5.959255695343018, "learning_rate": 1.3102782523828323e-06, "loss": 2.9383, "step": 5422 }, { "epoch": 0.9069699377012167, "grad_norm": 4.296502113342285, "learning_rate": 1.3056190551577114e-06, "loss": 3.126, "step": 5423 }, { "epoch": 0.9071371827570347, "grad_norm": 8.43901252746582, "learning_rate": 1.3009679343566727e-06, "loss": 2.6612, "step": 5424 }, { "epoch": 0.9073044278128528, "grad_norm": 11.144301414489746, "learning_rate": 1.296324891565104e-06, "loss": 3.2236, "step": 5425 }, { "epoch": 0.9074716728686708, "grad_norm": 5.191564083099365, "learning_rate": 1.2916899283656137e-06, "loss": 2.8671, "step": 5426 }, { "epoch": 0.9076389179244888, "grad_norm": 9.442760467529297, "learning_rate": 1.2870630463380717e-06, "loss": 3.405, "step": 5427 }, { "epoch": 0.9078061629803069, "grad_norm": 14.399473190307617, "learning_rate": 1.2824442470595831e-06, "loss": 3.0788, "step": 5428 }, { "epoch": 0.9079734080361249, "grad_norm": 4.746778964996338, "learning_rate": 1.277833532104511e-06, "loss": 2.4338, "step": 5429 }, { "epoch": 0.9081406530919429, "grad_norm": 5.487277984619141, "learning_rate": 1.2732309030444533e-06, "loss": 3.037, "step": 5430 }, { "epoch": 0.908307898147761, "grad_norm": 8.222587585449219, "learning_rate": 1.2686363614482472e-06, "loss": 3.1909, "step": 5431 }, { "epoch": 0.908475143203579, "grad_norm": 5.924144744873047, "learning_rate": 1.264049908881987e-06, "loss": 2.6667, "step": 5432 }, { "epoch": 0.908642388259397, "grad_norm": 3.394078254699707, "learning_rate": 1.2594715469089969e-06, "loss": 2.6316, "step": 5433 }, { "epoch": 0.9088096333152151, "grad_norm": 4.864422798156738, "learning_rate": 1.2549012770898593e-06, "loss": 2.8293, "step": 5434 }, { "epoch": 0.9089768783710331, "grad_norm": 5.457976341247559, "learning_rate": 1.2503391009823783e-06, "loss": 2.5249, "step": 5435 }, { "epoch": 0.9091441234268512, "grad_norm": 8.000348091125488, "learning_rate": 1.2457850201416155e-06, "loss": 3.584, "step": 5436 }, { "epoch": 0.9093113684826692, "grad_norm": 4.4420905113220215, "learning_rate": 1.2412390361198594e-06, "loss": 2.7965, "step": 5437 }, { "epoch": 0.9094786135384872, "grad_norm": 7.71448278427124, "learning_rate": 1.2367011504666543e-06, "loss": 2.8603, "step": 5438 }, { "epoch": 0.9096458585943054, "grad_norm": 3.8289971351623535, "learning_rate": 1.2321713647287653e-06, "loss": 2.7323, "step": 5439 }, { "epoch": 0.9098131036501234, "grad_norm": 7.387739658355713, "learning_rate": 1.2276496804502214e-06, "loss": 2.4673, "step": 5440 }, { "epoch": 0.9099803487059414, "grad_norm": 5.484954833984375, "learning_rate": 1.223136099172262e-06, "loss": 2.6721, "step": 5441 }, { "epoch": 0.9101475937617595, "grad_norm": 7.440786838531494, "learning_rate": 1.2186306224333838e-06, "loss": 2.7155, "step": 5442 }, { "epoch": 0.9103148388175775, "grad_norm": 4.680426120758057, "learning_rate": 1.21413325176932e-06, "loss": 2.241, "step": 5443 }, { "epoch": 0.9104820838733955, "grad_norm": 9.869094848632812, "learning_rate": 1.2096439887130274e-06, "loss": 3.207, "step": 5444 }, { "epoch": 0.9106493289292136, "grad_norm": 4.695989608764648, "learning_rate": 1.205162834794718e-06, "loss": 3.0572, "step": 5445 }, { "epoch": 0.9108165739850316, "grad_norm": 3.09794282913208, "learning_rate": 1.2006897915418202e-06, "loss": 2.6677, "step": 5446 }, { "epoch": 0.9109838190408496, "grad_norm": 3.5635809898376465, "learning_rate": 1.1962248604790143e-06, "loss": 2.7387, "step": 5447 }, { "epoch": 0.9111510640966677, "grad_norm": 5.284529209136963, "learning_rate": 1.1917680431282052e-06, "loss": 2.6652, "step": 5448 }, { "epoch": 0.9113183091524857, "grad_norm": 4.119052886962891, "learning_rate": 1.187319341008536e-06, "loss": 2.5379, "step": 5449 }, { "epoch": 0.9114855542083037, "grad_norm": 6.4975080490112305, "learning_rate": 1.1828787556363825e-06, "loss": 2.8105, "step": 5450 }, { "epoch": 0.9116527992641218, "grad_norm": 8.476025581359863, "learning_rate": 1.1784462885253616e-06, "loss": 3.1338, "step": 5451 }, { "epoch": 0.9118200443199398, "grad_norm": 3.0320041179656982, "learning_rate": 1.1740219411863068e-06, "loss": 2.4031, "step": 5452 }, { "epoch": 0.9119872893757578, "grad_norm": 4.849741458892822, "learning_rate": 1.1696057151272977e-06, "loss": 2.7749, "step": 5453 }, { "epoch": 0.9121545344315759, "grad_norm": 8.887347221374512, "learning_rate": 1.1651976118536413e-06, "loss": 3.1745, "step": 5454 }, { "epoch": 0.9123217794873939, "grad_norm": 7.942831039428711, "learning_rate": 1.1607976328678772e-06, "loss": 2.9517, "step": 5455 }, { "epoch": 0.9124890245432119, "grad_norm": 2.437243700027466, "learning_rate": 1.156405779669767e-06, "loss": 2.5434, "step": 5456 }, { "epoch": 0.91265626959903, "grad_norm": 8.357685089111328, "learning_rate": 1.152022053756316e-06, "loss": 2.865, "step": 5457 }, { "epoch": 0.912823514654848, "grad_norm": 4.293981075286865, "learning_rate": 1.1476464566217537e-06, "loss": 2.9948, "step": 5458 }, { "epoch": 0.912990759710666, "grad_norm": 2.537734270095825, "learning_rate": 1.1432789897575374e-06, "loss": 2.2696, "step": 5459 }, { "epoch": 0.9131580047664841, "grad_norm": 4.7268500328063965, "learning_rate": 1.138919654652354e-06, "loss": 2.779, "step": 5460 }, { "epoch": 0.9133252498223021, "grad_norm": 7.306709289550781, "learning_rate": 1.1345684527921147e-06, "loss": 2.7068, "step": 5461 }, { "epoch": 0.9134924948781201, "grad_norm": 4.53432035446167, "learning_rate": 1.1302253856599692e-06, "loss": 2.6343, "step": 5462 }, { "epoch": 0.9136597399339382, "grad_norm": 4.904445648193359, "learning_rate": 1.125890454736278e-06, "loss": 2.4122, "step": 5463 }, { "epoch": 0.9138269849897562, "grad_norm": 4.0059990882873535, "learning_rate": 1.1215636614986508e-06, "loss": 2.7423, "step": 5464 }, { "epoch": 0.9139942300455742, "grad_norm": 3.167473554611206, "learning_rate": 1.1172450074218965e-06, "loss": 2.5916, "step": 5465 }, { "epoch": 0.9141614751013923, "grad_norm": 4.27986478805542, "learning_rate": 1.1129344939780766e-06, "loss": 2.6687, "step": 5466 }, { "epoch": 0.9143287201572103, "grad_norm": 3.461885452270508, "learning_rate": 1.108632122636455e-06, "loss": 2.7505, "step": 5467 }, { "epoch": 0.9144959652130283, "grad_norm": 5.957283973693848, "learning_rate": 1.1043378948635386e-06, "loss": 2.7043, "step": 5468 }, { "epoch": 0.9146632102688464, "grad_norm": 3.2727746963500977, "learning_rate": 1.100051812123043e-06, "loss": 2.5103, "step": 5469 }, { "epoch": 0.9148304553246644, "grad_norm": 5.082544326782227, "learning_rate": 1.095773875875919e-06, "loss": 2.6768, "step": 5470 }, { "epoch": 0.9149977003804824, "grad_norm": 5.097360610961914, "learning_rate": 1.0915040875803334e-06, "loss": 2.6347, "step": 5471 }, { "epoch": 0.9151649454363006, "grad_norm": 3.871170997619629, "learning_rate": 1.0872424486916833e-06, "loss": 2.3943, "step": 5472 }, { "epoch": 0.9153321904921186, "grad_norm": 4.477044582366943, "learning_rate": 1.0829889606625759e-06, "loss": 2.53, "step": 5473 }, { "epoch": 0.9154994355479367, "grad_norm": 4.576572895050049, "learning_rate": 1.0787436249428516e-06, "loss": 2.7427, "step": 5474 }, { "epoch": 0.9156666806037547, "grad_norm": 2.125699520111084, "learning_rate": 1.0745064429795725e-06, "loss": 2.1405, "step": 5475 }, { "epoch": 0.9158339256595727, "grad_norm": 5.190334796905518, "learning_rate": 1.0702774162170081e-06, "loss": 2.5023, "step": 5476 }, { "epoch": 0.9160011707153908, "grad_norm": 18.024017333984375, "learning_rate": 1.0660565460966638e-06, "loss": 2.1851, "step": 5477 }, { "epoch": 0.9161684157712088, "grad_norm": 3.865297555923462, "learning_rate": 1.0618438340572524e-06, "loss": 2.5825, "step": 5478 }, { "epoch": 0.9163356608270268, "grad_norm": 4.025332927703857, "learning_rate": 1.057639281534717e-06, "loss": 2.7097, "step": 5479 }, { "epoch": 0.9165029058828449, "grad_norm": 5.038557529449463, "learning_rate": 1.0534428899622085e-06, "loss": 2.1334, "step": 5480 }, { "epoch": 0.9166701509386629, "grad_norm": 5.2893781661987305, "learning_rate": 1.0492546607701077e-06, "loss": 2.4768, "step": 5481 }, { "epoch": 0.9168373959944809, "grad_norm": 8.895092010498047, "learning_rate": 1.0450745953859946e-06, "loss": 3.5372, "step": 5482 }, { "epoch": 0.917004641050299, "grad_norm": 4.263413429260254, "learning_rate": 1.0409026952346912e-06, "loss": 3.0173, "step": 5483 }, { "epoch": 0.917171886106117, "grad_norm": 3.7460427284240723, "learning_rate": 1.0367389617382207e-06, "loss": 2.1313, "step": 5484 }, { "epoch": 0.917339131161935, "grad_norm": 4.700969219207764, "learning_rate": 1.0325833963158255e-06, "loss": 2.7027, "step": 5485 }, { "epoch": 0.9175063762177531, "grad_norm": 11.415138244628906, "learning_rate": 1.0284360003839616e-06, "loss": 2.1575, "step": 5486 }, { "epoch": 0.9176736212735711, "grad_norm": 5.389650821685791, "learning_rate": 1.0242967753563065e-06, "loss": 2.5028, "step": 5487 }, { "epoch": 0.9178408663293891, "grad_norm": 9.607047080993652, "learning_rate": 1.0201657226437484e-06, "loss": 2.6905, "step": 5488 }, { "epoch": 0.9180081113852072, "grad_norm": 4.867250442504883, "learning_rate": 1.0160428436543857e-06, "loss": 2.8936, "step": 5489 }, { "epoch": 0.9181753564410252, "grad_norm": 10.242938995361328, "learning_rate": 1.0119281397935387e-06, "loss": 2.4355, "step": 5490 }, { "epoch": 0.9183426014968432, "grad_norm": 3.7552735805511475, "learning_rate": 1.0078216124637384e-06, "loss": 2.3213, "step": 5491 }, { "epoch": 0.9185098465526613, "grad_norm": 3.5489625930786133, "learning_rate": 1.003723263064732e-06, "loss": 2.4886, "step": 5492 }, { "epoch": 0.9186770916084793, "grad_norm": 5.488532543182373, "learning_rate": 9.996330929934683e-07, "loss": 2.5689, "step": 5493 }, { "epoch": 0.9188443366642973, "grad_norm": 6.5570759773254395, "learning_rate": 9.95551103644124e-07, "loss": 2.6714, "step": 5494 }, { "epoch": 0.9190115817201154, "grad_norm": 3.5693037509918213, "learning_rate": 9.914772964080693e-07, "loss": 2.6073, "step": 5495 }, { "epoch": 0.9191788267759334, "grad_norm": 6.12166690826416, "learning_rate": 9.874116726739018e-07, "loss": 3.0161, "step": 5496 }, { "epoch": 0.9193460718317514, "grad_norm": 4.939452171325684, "learning_rate": 9.83354233827416e-07, "loss": 3.0163, "step": 5497 }, { "epoch": 0.9195133168875695, "grad_norm": 5.869290351867676, "learning_rate": 9.793049812516303e-07, "loss": 2.7677, "step": 5498 }, { "epoch": 0.9196805619433875, "grad_norm": 5.733816623687744, "learning_rate": 9.75263916326763e-07, "loss": 2.5947, "step": 5499 }, { "epoch": 0.9198478069992055, "grad_norm": 6.369095325469971, "learning_rate": 9.712310404302399e-07, "loss": 2.8543, "step": 5500 }, { "epoch": 0.9200150520550237, "grad_norm": 4.93766450881958, "learning_rate": 9.67206354936709e-07, "loss": 2.7447, "step": 5501 }, { "epoch": 0.9201822971108417, "grad_norm": 5.854793548583984, "learning_rate": 9.631898612180084e-07, "loss": 2.7596, "step": 5502 }, { "epoch": 0.9203495421666597, "grad_norm": 4.597932815551758, "learning_rate": 9.591815606431992e-07, "loss": 2.7848, "step": 5503 }, { "epoch": 0.9205167872224778, "grad_norm": 13.038863182067871, "learning_rate": 9.551814545785354e-07, "loss": 2.4676, "step": 5504 }, { "epoch": 0.9206840322782958, "grad_norm": 5.1957550048828125, "learning_rate": 9.511895443874985e-07, "loss": 2.5465, "step": 5505 }, { "epoch": 0.9208512773341138, "grad_norm": 3.659773588180542, "learning_rate": 9.472058314307475e-07, "loss": 2.5173, "step": 5506 }, { "epoch": 0.9210185223899319, "grad_norm": 3.9289004802703857, "learning_rate": 9.432303170661822e-07, "loss": 2.9726, "step": 5507 }, { "epoch": 0.9211857674457499, "grad_norm": 5.103517532348633, "learning_rate": 9.392630026488741e-07, "loss": 2.7584, "step": 5508 }, { "epoch": 0.9213530125015679, "grad_norm": 3.8743083477020264, "learning_rate": 9.35303889531125e-07, "loss": 2.9824, "step": 5509 }, { "epoch": 0.921520257557386, "grad_norm": 6.039096355438232, "learning_rate": 9.313529790624248e-07, "loss": 2.8739, "step": 5510 }, { "epoch": 0.921687502613204, "grad_norm": 7.662531852722168, "learning_rate": 9.274102725894768e-07, "loss": 2.9128, "step": 5511 }, { "epoch": 0.9218547476690221, "grad_norm": 4.94014835357666, "learning_rate": 9.234757714561837e-07, "loss": 2.6105, "step": 5512 }, { "epoch": 0.9220219927248401, "grad_norm": 4.832831859588623, "learning_rate": 9.195494770036533e-07, "loss": 2.716, "step": 5513 }, { "epoch": 0.9221892377806581, "grad_norm": 5.740993499755859, "learning_rate": 9.156313905701902e-07, "loss": 2.6866, "step": 5514 }, { "epoch": 0.9223564828364762, "grad_norm": 8.097668647766113, "learning_rate": 9.117215134913149e-07, "loss": 3.0213, "step": 5515 }, { "epoch": 0.9225237278922942, "grad_norm": 3.340135335922241, "learning_rate": 9.078198470997362e-07, "loss": 3.094, "step": 5516 }, { "epoch": 0.9226909729481122, "grad_norm": 3.6110854148864746, "learning_rate": 9.039263927253683e-07, "loss": 2.8309, "step": 5517 }, { "epoch": 0.9228582180039303, "grad_norm": 7.497878074645996, "learning_rate": 9.0004115169533e-07, "loss": 2.8228, "step": 5518 }, { "epoch": 0.9230254630597483, "grad_norm": 4.585825443267822, "learning_rate": 8.961641253339342e-07, "loss": 2.5999, "step": 5519 }, { "epoch": 0.9231927081155663, "grad_norm": 3.214254856109619, "learning_rate": 8.922953149627017e-07, "loss": 2.2881, "step": 5520 }, { "epoch": 0.9233599531713844, "grad_norm": 6.199071407318115, "learning_rate": 8.884347219003441e-07, "loss": 2.5959, "step": 5521 }, { "epoch": 0.9235271982272024, "grad_norm": 7.652985572814941, "learning_rate": 8.845823474627785e-07, "loss": 2.4638, "step": 5522 }, { "epoch": 0.9236944432830204, "grad_norm": 7.175296306610107, "learning_rate": 8.807381929631125e-07, "loss": 2.8917, "step": 5523 }, { "epoch": 0.9238616883388385, "grad_norm": 4.457863807678223, "learning_rate": 8.769022597116677e-07, "loss": 2.8337, "step": 5524 }, { "epoch": 0.9240289333946565, "grad_norm": 6.567732334136963, "learning_rate": 8.730745490159481e-07, "loss": 2.7269, "step": 5525 }, { "epoch": 0.9241961784504745, "grad_norm": 4.2891764640808105, "learning_rate": 8.6925506218066e-07, "loss": 2.6195, "step": 5526 }, { "epoch": 0.9243634235062926, "grad_norm": 4.4577460289001465, "learning_rate": 8.654438005077064e-07, "loss": 2.5261, "step": 5527 }, { "epoch": 0.9245306685621106, "grad_norm": 6.601644039154053, "learning_rate": 8.6164076529619e-07, "loss": 2.553, "step": 5528 }, { "epoch": 0.9246979136179286, "grad_norm": 4.471607208251953, "learning_rate": 8.578459578424041e-07, "loss": 2.8195, "step": 5529 }, { "epoch": 0.9248651586737467, "grad_norm": 3.719282388687134, "learning_rate": 8.540593794398421e-07, "loss": 2.4898, "step": 5530 }, { "epoch": 0.9250324037295647, "grad_norm": 17.67975616455078, "learning_rate": 8.502810313791854e-07, "loss": 3.0336, "step": 5531 }, { "epoch": 0.9251996487853827, "grad_norm": 3.7146718502044678, "learning_rate": 8.465109149483175e-07, "loss": 2.7466, "step": 5532 }, { "epoch": 0.9253668938412009, "grad_norm": 12.5942964553833, "learning_rate": 8.427490314323161e-07, "loss": 3.5848, "step": 5533 }, { "epoch": 0.9255341388970189, "grad_norm": 6.366203784942627, "learning_rate": 8.389953821134472e-07, "loss": 3.0646, "step": 5534 }, { "epoch": 0.9257013839528369, "grad_norm": 3.270090103149414, "learning_rate": 8.352499682711762e-07, "loss": 2.7318, "step": 5535 }, { "epoch": 0.925868629008655, "grad_norm": 5.942275524139404, "learning_rate": 8.31512791182154e-07, "loss": 2.8081, "step": 5536 }, { "epoch": 0.926035874064473, "grad_norm": 7.451598644256592, "learning_rate": 8.277838521202314e-07, "loss": 3.2672, "step": 5537 }, { "epoch": 0.926203119120291, "grad_norm": 5.54301643371582, "learning_rate": 8.240631523564441e-07, "loss": 2.7784, "step": 5538 }, { "epoch": 0.9263703641761091, "grad_norm": 6.661081790924072, "learning_rate": 8.203506931590277e-07, "loss": 2.9418, "step": 5539 }, { "epoch": 0.9265376092319271, "grad_norm": 6.071676254272461, "learning_rate": 8.166464757933978e-07, "loss": 3.0114, "step": 5540 }, { "epoch": 0.9267048542877451, "grad_norm": 4.907977104187012, "learning_rate": 8.129505015221778e-07, "loss": 2.8325, "step": 5541 }, { "epoch": 0.9268720993435632, "grad_norm": 4.853057861328125, "learning_rate": 8.092627716051598e-07, "loss": 2.8767, "step": 5542 }, { "epoch": 0.9270393443993812, "grad_norm": 5.745611667633057, "learning_rate": 8.055832872993441e-07, "loss": 2.6903, "step": 5543 }, { "epoch": 0.9272065894551992, "grad_norm": 3.1465744972229004, "learning_rate": 8.019120498589106e-07, "loss": 2.9619, "step": 5544 }, { "epoch": 0.9273738345110173, "grad_norm": 5.3940205574035645, "learning_rate": 7.982490605352305e-07, "loss": 3.211, "step": 5545 }, { "epoch": 0.9275410795668353, "grad_norm": 5.779568672180176, "learning_rate": 7.945943205768636e-07, "loss": 2.4928, "step": 5546 }, { "epoch": 0.9277083246226533, "grad_norm": 5.069723129272461, "learning_rate": 7.909478312295577e-07, "loss": 2.5337, "step": 5547 }, { "epoch": 0.9278755696784714, "grad_norm": 5.3263702392578125, "learning_rate": 7.87309593736249e-07, "loss": 2.7942, "step": 5548 }, { "epoch": 0.9280428147342894, "grad_norm": 9.357943534851074, "learning_rate": 7.836796093370624e-07, "loss": 2.8537, "step": 5549 }, { "epoch": 0.9282100597901074, "grad_norm": 3.768944501876831, "learning_rate": 7.800578792693053e-07, "loss": 2.6977, "step": 5550 }, { "epoch": 0.9283773048459255, "grad_norm": 3.6926519870758057, "learning_rate": 7.764444047674735e-07, "loss": 2.7306, "step": 5551 }, { "epoch": 0.9285445499017435, "grad_norm": 7.754231929779053, "learning_rate": 7.72839187063254e-07, "loss": 3.3842, "step": 5552 }, { "epoch": 0.9287117949575616, "grad_norm": 4.801486015319824, "learning_rate": 7.69242227385511e-07, "loss": 2.3616, "step": 5553 }, { "epoch": 0.9288790400133796, "grad_norm": 3.6538169384002686, "learning_rate": 7.656535269602999e-07, "loss": 2.2936, "step": 5554 }, { "epoch": 0.9290462850691976, "grad_norm": 5.421813011169434, "learning_rate": 7.620730870108533e-07, "loss": 2.6715, "step": 5555 }, { "epoch": 0.9292135301250157, "grad_norm": 4.0734357833862305, "learning_rate": 7.585009087575978e-07, "loss": 2.5717, "step": 5556 }, { "epoch": 0.9293807751808337, "grad_norm": 5.818288326263428, "learning_rate": 7.549369934181427e-07, "loss": 2.7325, "step": 5557 }, { "epoch": 0.9295480202366517, "grad_norm": 4.764278888702393, "learning_rate": 7.513813422072747e-07, "loss": 2.8998, "step": 5558 }, { "epoch": 0.9297152652924698, "grad_norm": 5.114161014556885, "learning_rate": 7.478339563369657e-07, "loss": 2.3541, "step": 5559 }, { "epoch": 0.9298825103482878, "grad_norm": 4.702939987182617, "learning_rate": 7.442948370163683e-07, "loss": 2.5598, "step": 5560 }, { "epoch": 0.9300497554041058, "grad_norm": 4.260854244232178, "learning_rate": 7.407639854518283e-07, "loss": 2.5228, "step": 5561 }, { "epoch": 0.930217000459924, "grad_norm": 10.038850784301758, "learning_rate": 7.372414028468611e-07, "loss": 3.4237, "step": 5562 }, { "epoch": 0.930384245515742, "grad_norm": 5.883835792541504, "learning_rate": 7.33727090402167e-07, "loss": 3.281, "step": 5563 }, { "epoch": 0.93055149057156, "grad_norm": 3.7855968475341797, "learning_rate": 7.302210493156242e-07, "loss": 2.7574, "step": 5564 }, { "epoch": 0.9307187356273781, "grad_norm": 4.800691604614258, "learning_rate": 7.267232807823072e-07, "loss": 3.0152, "step": 5565 }, { "epoch": 0.9308859806831961, "grad_norm": 6.638973712921143, "learning_rate": 7.232337859944482e-07, "loss": 2.3581, "step": 5566 }, { "epoch": 0.9310532257390141, "grad_norm": 2.8607771396636963, "learning_rate": 7.197525661414795e-07, "loss": 2.4633, "step": 5567 }, { "epoch": 0.9312204707948322, "grad_norm": 5.138278484344482, "learning_rate": 7.162796224099933e-07, "loss": 2.9556, "step": 5568 }, { "epoch": 0.9313877158506502, "grad_norm": 4.469886302947998, "learning_rate": 7.128149559837816e-07, "loss": 2.507, "step": 5569 }, { "epoch": 0.9315549609064682, "grad_norm": 13.452753067016602, "learning_rate": 7.093585680437914e-07, "loss": 3.5094, "step": 5570 }, { "epoch": 0.9317222059622863, "grad_norm": 6.588552951812744, "learning_rate": 7.059104597681748e-07, "loss": 2.7407, "step": 5571 }, { "epoch": 0.9318894510181043, "grad_norm": 6.874495983123779, "learning_rate": 7.024706323322361e-07, "loss": 3.0272, "step": 5572 }, { "epoch": 0.9320566960739223, "grad_norm": 6.607324123382568, "learning_rate": 6.990390869084762e-07, "loss": 2.6021, "step": 5573 }, { "epoch": 0.9322239411297404, "grad_norm": 9.294548988342285, "learning_rate": 6.95615824666565e-07, "loss": 2.7357, "step": 5574 }, { "epoch": 0.9323911861855584, "grad_norm": 3.4935221672058105, "learning_rate": 6.922008467733443e-07, "loss": 2.6697, "step": 5575 }, { "epoch": 0.9325584312413764, "grad_norm": 5.099630832672119, "learning_rate": 6.887941543928411e-07, "loss": 2.3045, "step": 5576 }, { "epoch": 0.9327256762971945, "grad_norm": 5.771602630615234, "learning_rate": 6.853957486862544e-07, "loss": 2.6126, "step": 5577 }, { "epoch": 0.9328929213530125, "grad_norm": 5.338258743286133, "learning_rate": 6.820056308119605e-07, "loss": 2.6481, "step": 5578 }, { "epoch": 0.9330601664088305, "grad_norm": 2.9823498725891113, "learning_rate": 6.786238019255042e-07, "loss": 2.3209, "step": 5579 }, { "epoch": 0.9332274114646486, "grad_norm": 5.313453674316406, "learning_rate": 6.752502631796109e-07, "loss": 2.7427, "step": 5580 }, { "epoch": 0.9333946565204666, "grad_norm": 3.409245491027832, "learning_rate": 6.718850157241802e-07, "loss": 2.6915, "step": 5581 }, { "epoch": 0.9335619015762846, "grad_norm": 3.882349729537964, "learning_rate": 6.68528060706286e-07, "loss": 2.6899, "step": 5582 }, { "epoch": 0.9337291466321027, "grad_norm": 8.318379402160645, "learning_rate": 6.651793992701716e-07, "loss": 3.3859, "step": 5583 }, { "epoch": 0.9338963916879207, "grad_norm": 7.204057693481445, "learning_rate": 6.618390325572571e-07, "loss": 2.7111, "step": 5584 }, { "epoch": 0.9340636367437387, "grad_norm": 7.600271224975586, "learning_rate": 6.585069617061318e-07, "loss": 2.9857, "step": 5585 }, { "epoch": 0.9342308817995568, "grad_norm": 8.553564071655273, "learning_rate": 6.551831878525622e-07, "loss": 2.4292, "step": 5586 }, { "epoch": 0.9343981268553748, "grad_norm": 4.708897590637207, "learning_rate": 6.518677121294809e-07, "loss": 2.8371, "step": 5587 }, { "epoch": 0.9345653719111928, "grad_norm": 7.458796501159668, "learning_rate": 6.485605356669977e-07, "loss": 2.8801, "step": 5588 }, { "epoch": 0.9347326169670109, "grad_norm": 7.198914051055908, "learning_rate": 6.452616595923861e-07, "loss": 2.414, "step": 5589 }, { "epoch": 0.9348998620228289, "grad_norm": 5.763337135314941, "learning_rate": 6.419710850300992e-07, "loss": 2.1791, "step": 5590 }, { "epoch": 0.935067107078647, "grad_norm": 8.378960609436035, "learning_rate": 6.386888131017594e-07, "loss": 3.116, "step": 5591 }, { "epoch": 0.935234352134465, "grad_norm": 3.95295786857605, "learning_rate": 6.354148449261494e-07, "loss": 2.4583, "step": 5592 }, { "epoch": 0.935401597190283, "grad_norm": 3.6740572452545166, "learning_rate": 6.321491816192321e-07, "loss": 2.1603, "step": 5593 }, { "epoch": 0.9355688422461012, "grad_norm": 6.602855682373047, "learning_rate": 6.288918242941339e-07, "loss": 2.594, "step": 5594 }, { "epoch": 0.9357360873019192, "grad_norm": 4.983553886413574, "learning_rate": 6.256427740611525e-07, "loss": 2.7453, "step": 5595 }, { "epoch": 0.9359033323577371, "grad_norm": 5.087932586669922, "learning_rate": 6.224020320277524e-07, "loss": 2.8348, "step": 5596 }, { "epoch": 0.9360705774135553, "grad_norm": 5.39533805847168, "learning_rate": 6.191695992985663e-07, "loss": 2.396, "step": 5597 }, { "epoch": 0.9362378224693733, "grad_norm": 5.344443321228027, "learning_rate": 6.159454769753992e-07, "loss": 2.8501, "step": 5598 }, { "epoch": 0.9364050675251913, "grad_norm": 3.6696572303771973, "learning_rate": 6.127296661572191e-07, "loss": 2.6257, "step": 5599 }, { "epoch": 0.9365723125810094, "grad_norm": 5.765847682952881, "learning_rate": 6.095221679401547e-07, "loss": 2.6088, "step": 5600 }, { "epoch": 0.9367395576368274, "grad_norm": 6.111525058746338, "learning_rate": 6.063229834175177e-07, "loss": 2.2934, "step": 5601 }, { "epoch": 0.9369068026926454, "grad_norm": 6.405860424041748, "learning_rate": 6.03132113679769e-07, "loss": 2.9051, "step": 5602 }, { "epoch": 0.9370740477484635, "grad_norm": 6.5107316970825195, "learning_rate": 5.999495598145499e-07, "loss": 3.0085, "step": 5603 }, { "epoch": 0.9372412928042815, "grad_norm": 3.302609920501709, "learning_rate": 5.967753229066508e-07, "loss": 2.619, "step": 5604 }, { "epoch": 0.9374085378600995, "grad_norm": 12.682585716247559, "learning_rate": 5.936094040380424e-07, "loss": 3.2446, "step": 5605 }, { "epoch": 0.9375757829159176, "grad_norm": 4.714587211608887, "learning_rate": 5.90451804287856e-07, "loss": 2.4012, "step": 5606 }, { "epoch": 0.9377430279717356, "grad_norm": 5.449630260467529, "learning_rate": 5.873025247323776e-07, "loss": 2.6133, "step": 5607 }, { "epoch": 0.9379102730275536, "grad_norm": 5.309162616729736, "learning_rate": 5.84161566445074e-07, "loss": 2.1442, "step": 5608 }, { "epoch": 0.9380775180833717, "grad_norm": 6.891894817352295, "learning_rate": 5.810289304965605e-07, "loss": 3.036, "step": 5609 }, { "epoch": 0.9382447631391897, "grad_norm": 5.435842514038086, "learning_rate": 5.779046179546221e-07, "loss": 2.7085, "step": 5610 }, { "epoch": 0.9384120081950077, "grad_norm": 3.6266772747039795, "learning_rate": 5.747886298842098e-07, "loss": 2.2721, "step": 5611 }, { "epoch": 0.9385792532508258, "grad_norm": 6.0675129890441895, "learning_rate": 5.716809673474321e-07, "loss": 2.8721, "step": 5612 }, { "epoch": 0.9387464983066438, "grad_norm": 6.269439220428467, "learning_rate": 5.685816314035558e-07, "loss": 2.6504, "step": 5613 }, { "epoch": 0.9389137433624618, "grad_norm": 6.186154365539551, "learning_rate": 5.654906231090246e-07, "loss": 2.6853, "step": 5614 }, { "epoch": 0.9390809884182799, "grad_norm": 3.557142734527588, "learning_rate": 5.624079435174318e-07, "loss": 2.3217, "step": 5615 }, { "epoch": 0.9392482334740979, "grad_norm": 5.673525333404541, "learning_rate": 5.593335936795258e-07, "loss": 2.8061, "step": 5616 }, { "epoch": 0.9394154785299159, "grad_norm": 6.858178615570068, "learning_rate": 5.56267574643235e-07, "loss": 2.5854, "step": 5617 }, { "epoch": 0.939582723585734, "grad_norm": 7.215294361114502, "learning_rate": 5.532098874536291e-07, "loss": 2.5174, "step": 5618 }, { "epoch": 0.939749968641552, "grad_norm": 14.125140190124512, "learning_rate": 5.501605331529491e-07, "loss": 3.2868, "step": 5619 }, { "epoch": 0.93991721369737, "grad_norm": 3.167513847351074, "learning_rate": 5.471195127805917e-07, "loss": 2.4166, "step": 5620 }, { "epoch": 0.9400844587531881, "grad_norm": 7.076135158538818, "learning_rate": 5.440868273731164e-07, "loss": 2.7321, "step": 5621 }, { "epoch": 0.9402517038090061, "grad_norm": 9.906133651733398, "learning_rate": 5.410624779642298e-07, "loss": 2.8115, "step": 5622 }, { "epoch": 0.9404189488648241, "grad_norm": 4.746105670928955, "learning_rate": 5.380464655848183e-07, "loss": 2.5533, "step": 5623 }, { "epoch": 0.9405861939206422, "grad_norm": 5.765876293182373, "learning_rate": 5.350387912629096e-07, "loss": 2.6839, "step": 5624 }, { "epoch": 0.9407534389764602, "grad_norm": 5.733865261077881, "learning_rate": 5.320394560236919e-07, "loss": 3.2143, "step": 5625 }, { "epoch": 0.9409206840322782, "grad_norm": 4.589909076690674, "learning_rate": 5.29048460889514e-07, "loss": 2.3102, "step": 5626 }, { "epoch": 0.9410879290880964, "grad_norm": 5.669680118560791, "learning_rate": 5.260658068798857e-07, "loss": 3.077, "step": 5627 }, { "epoch": 0.9412551741439144, "grad_norm": 25.920270919799805, "learning_rate": 5.23091495011463e-07, "loss": 2.4016, "step": 5628 }, { "epoch": 0.9414224191997325, "grad_norm": 5.855594158172607, "learning_rate": 5.201255262980681e-07, "loss": 2.9233, "step": 5629 }, { "epoch": 0.9415896642555505, "grad_norm": 4.763648986816406, "learning_rate": 5.171679017506731e-07, "loss": 2.6829, "step": 5630 }, { "epoch": 0.9417569093113685, "grad_norm": 3.164358377456665, "learning_rate": 5.142186223774076e-07, "loss": 2.2831, "step": 5631 }, { "epoch": 0.9419241543671866, "grad_norm": 3.6025354862213135, "learning_rate": 5.112776891835647e-07, "loss": 2.7756, "step": 5632 }, { "epoch": 0.9420913994230046, "grad_norm": 10.55051040649414, "learning_rate": 5.083451031715786e-07, "loss": 3.0956, "step": 5633 }, { "epoch": 0.9422586444788226, "grad_norm": 6.1416015625, "learning_rate": 5.054208653410497e-07, "loss": 2.4517, "step": 5634 }, { "epoch": 0.9424258895346407, "grad_norm": 6.066828727722168, "learning_rate": 5.025049766887252e-07, "loss": 2.68, "step": 5635 }, { "epoch": 0.9425931345904587, "grad_norm": 8.028677940368652, "learning_rate": 4.995974382085128e-07, "loss": 2.5315, "step": 5636 }, { "epoch": 0.9427603796462767, "grad_norm": 6.534561634063721, "learning_rate": 4.966982508914641e-07, "loss": 2.732, "step": 5637 }, { "epoch": 0.9429276247020948, "grad_norm": 4.724697113037109, "learning_rate": 4.938074157257999e-07, "loss": 2.8141, "step": 5638 }, { "epoch": 0.9430948697579128, "grad_norm": 7.048883438110352, "learning_rate": 4.90924933696879e-07, "loss": 2.7953, "step": 5639 }, { "epoch": 0.9432621148137308, "grad_norm": 5.02522611618042, "learning_rate": 4.88050805787224e-07, "loss": 2.6363, "step": 5640 }, { "epoch": 0.9434293598695489, "grad_norm": 5.697553634643555, "learning_rate": 4.851850329764984e-07, "loss": 2.6514, "step": 5641 }, { "epoch": 0.9435966049253669, "grad_norm": 5.6726226806640625, "learning_rate": 4.823276162415292e-07, "loss": 2.7412, "step": 5642 }, { "epoch": 0.9437638499811849, "grad_norm": 5.384840965270996, "learning_rate": 4.794785565562898e-07, "loss": 2.5323, "step": 5643 }, { "epoch": 0.943931095037003, "grad_norm": 7.355140686035156, "learning_rate": 4.7663785489190373e-07, "loss": 2.3409, "step": 5644 }, { "epoch": 0.944098340092821, "grad_norm": 8.128437995910645, "learning_rate": 4.7380551221664906e-07, "loss": 2.8342, "step": 5645 }, { "epoch": 0.944265585148639, "grad_norm": 4.795982837677002, "learning_rate": 4.7098152949595075e-07, "loss": 2.3497, "step": 5646 }, { "epoch": 0.9444328302044571, "grad_norm": 5.473878383636475, "learning_rate": 4.6816590769239164e-07, "loss": 2.8645, "step": 5647 }, { "epoch": 0.9446000752602751, "grad_norm": 4.610753536224365, "learning_rate": 4.653586477656957e-07, "loss": 2.2606, "step": 5648 }, { "epoch": 0.9447673203160931, "grad_norm": 7.058230876922607, "learning_rate": 4.6255975067273916e-07, "loss": 2.5898, "step": 5649 }, { "epoch": 0.9449345653719112, "grad_norm": 4.75299596786499, "learning_rate": 4.5976921736755053e-07, "loss": 2.8734, "step": 5650 }, { "epoch": 0.9451018104277292, "grad_norm": 4.827449321746826, "learning_rate": 4.5698704880130784e-07, "loss": 2.4068, "step": 5651 }, { "epoch": 0.9452690554835472, "grad_norm": 5.514195442199707, "learning_rate": 4.5421324592233307e-07, "loss": 2.6922, "step": 5652 }, { "epoch": 0.9454363005393653, "grad_norm": 2.296638011932373, "learning_rate": 4.514478096761032e-07, "loss": 2.4349, "step": 5653 }, { "epoch": 0.9456035455951833, "grad_norm": 4.981836318969727, "learning_rate": 4.4869074100523653e-07, "loss": 2.6207, "step": 5654 }, { "epoch": 0.9457707906510013, "grad_norm": 9.00262451171875, "learning_rate": 4.4594204084950075e-07, "loss": 2.7366, "step": 5655 }, { "epoch": 0.9459380357068194, "grad_norm": 5.353426456451416, "learning_rate": 4.432017101458186e-07, "loss": 2.7648, "step": 5656 }, { "epoch": 0.9461052807626374, "grad_norm": 5.649375915527344, "learning_rate": 4.4046974982825117e-07, "loss": 2.9056, "step": 5657 }, { "epoch": 0.9462725258184554, "grad_norm": 5.302669525146484, "learning_rate": 4.3774616082800923e-07, "loss": 3.0827, "step": 5658 }, { "epoch": 0.9464397708742736, "grad_norm": 5.591965675354004, "learning_rate": 4.350309440734557e-07, "loss": 2.7071, "step": 5659 }, { "epoch": 0.9466070159300916, "grad_norm": 3.6497743129730225, "learning_rate": 4.323241004900835e-07, "loss": 2.3643, "step": 5660 }, { "epoch": 0.9467742609859096, "grad_norm": 5.035357475280762, "learning_rate": 4.29625631000552e-07, "loss": 2.8014, "step": 5661 }, { "epoch": 0.9469415060417277, "grad_norm": 6.282066822052002, "learning_rate": 4.26935536524653e-07, "loss": 2.6548, "step": 5662 }, { "epoch": 0.9471087510975457, "grad_norm": 7.312952995300293, "learning_rate": 4.242538179793282e-07, "loss": 2.491, "step": 5663 }, { "epoch": 0.9472759961533637, "grad_norm": 5.0274529457092285, "learning_rate": 4.215804762786629e-07, "loss": 2.7475, "step": 5664 }, { "epoch": 0.9474432412091818, "grad_norm": 4.579266548156738, "learning_rate": 4.189155123338867e-07, "loss": 2.9729, "step": 5665 }, { "epoch": 0.9476104862649998, "grad_norm": 11.114564895629883, "learning_rate": 4.1625892705337564e-07, "loss": 2.6658, "step": 5666 }, { "epoch": 0.9477777313208178, "grad_norm": 5.385292053222656, "learning_rate": 4.136107213426471e-07, "loss": 2.8887, "step": 5667 }, { "epoch": 0.9479449763766359, "grad_norm": 8.011484146118164, "learning_rate": 4.109708961043679e-07, "loss": 2.8435, "step": 5668 }, { "epoch": 0.9481122214324539, "grad_norm": 5.638619899749756, "learning_rate": 4.083394522383377e-07, "loss": 2.4433, "step": 5669 }, { "epoch": 0.948279466488272, "grad_norm": 5.535667896270752, "learning_rate": 4.05716390641514e-07, "loss": 2.6477, "step": 5670 }, { "epoch": 0.94844671154409, "grad_norm": 5.774267673492432, "learning_rate": 4.031017122079761e-07, "loss": 2.5593, "step": 5671 }, { "epoch": 0.948613956599908, "grad_norm": 6.232032299041748, "learning_rate": 4.0049541782897216e-07, "loss": 2.62, "step": 5672 }, { "epoch": 0.9487812016557261, "grad_norm": 9.09248161315918, "learning_rate": 3.978975083928721e-07, "loss": 3.0383, "step": 5673 }, { "epoch": 0.9489484467115441, "grad_norm": 4.248410701751709, "learning_rate": 3.9530798478519817e-07, "loss": 2.5649, "step": 5674 }, { "epoch": 0.9491156917673621, "grad_norm": 4.906217098236084, "learning_rate": 3.9272684788860537e-07, "loss": 2.8092, "step": 5675 }, { "epoch": 0.9492829368231802, "grad_norm": 4.071691989898682, "learning_rate": 3.901540985828983e-07, "loss": 2.7639, "step": 5676 }, { "epoch": 0.9494501818789982, "grad_norm": 5.270440101623535, "learning_rate": 3.875897377450227e-07, "loss": 2.4368, "step": 5677 }, { "epoch": 0.9496174269348162, "grad_norm": 5.629199028015137, "learning_rate": 3.850337662490572e-07, "loss": 2.5805, "step": 5678 }, { "epoch": 0.9497846719906343, "grad_norm": 5.720698356628418, "learning_rate": 3.82486184966227e-07, "loss": 2.8518, "step": 5679 }, { "epoch": 0.9499519170464523, "grad_norm": 3.1585347652435303, "learning_rate": 3.799469947648959e-07, "loss": 2.6469, "step": 5680 }, { "epoch": 0.9501191621022703, "grad_norm": 6.203394412994385, "learning_rate": 3.774161965105688e-07, "loss": 2.655, "step": 5681 }, { "epoch": 0.9502864071580884, "grad_norm": 3.579051971435547, "learning_rate": 3.74893791065889e-07, "loss": 2.6954, "step": 5682 }, { "epoch": 0.9504536522139064, "grad_norm": 6.902586936950684, "learning_rate": 3.723797792906408e-07, "loss": 3.4282, "step": 5683 }, { "epoch": 0.9506208972697244, "grad_norm": 5.914361000061035, "learning_rate": 3.6987416204174176e-07, "loss": 3.037, "step": 5684 }, { "epoch": 0.9507881423255425, "grad_norm": 8.997753143310547, "learning_rate": 3.6737694017325573e-07, "loss": 2.7642, "step": 5685 }, { "epoch": 0.9509553873813605, "grad_norm": 5.432087421417236, "learning_rate": 3.6488811453637696e-07, "loss": 3.367, "step": 5686 }, { "epoch": 0.9511226324371785, "grad_norm": 3.771697998046875, "learning_rate": 3.624076859794462e-07, "loss": 2.7358, "step": 5687 }, { "epoch": 0.9512898774929966, "grad_norm": 7.465067386627197, "learning_rate": 3.5993565534793715e-07, "loss": 2.6706, "step": 5688 }, { "epoch": 0.9514571225488146, "grad_norm": 3.4386396408081055, "learning_rate": 3.574720234844592e-07, "loss": 2.9646, "step": 5689 }, { "epoch": 0.9516243676046326, "grad_norm": 1.8254220485687256, "learning_rate": 3.5501679122876276e-07, "loss": 2.2626, "step": 5690 }, { "epoch": 0.9517916126604508, "grad_norm": 2.695876359939575, "learning_rate": 3.5256995941773674e-07, "loss": 2.5223, "step": 5691 }, { "epoch": 0.9519588577162688, "grad_norm": 5.3182373046875, "learning_rate": 3.501315288854001e-07, "loss": 2.6269, "step": 5692 }, { "epoch": 0.9521261027720868, "grad_norm": 10.014904022216797, "learning_rate": 3.4770150046291305e-07, "loss": 2.7684, "step": 5693 }, { "epoch": 0.9522933478279049, "grad_norm": 3.6873035430908203, "learning_rate": 3.452798749785713e-07, "loss": 2.4874, "step": 5694 }, { "epoch": 0.9524605928837229, "grad_norm": 6.849990367889404, "learning_rate": 3.428666532578034e-07, "loss": 2.8535, "step": 5695 }, { "epoch": 0.9526278379395409, "grad_norm": 2.2082736492156982, "learning_rate": 3.404618361231793e-07, "loss": 2.2434, "step": 5696 }, { "epoch": 0.952795082995359, "grad_norm": 5.47413444519043, "learning_rate": 3.3806542439439594e-07, "loss": 2.4285, "step": 5697 }, { "epoch": 0.952962328051177, "grad_norm": 4.918098449707031, "learning_rate": 3.3567741888829717e-07, "loss": 2.7182, "step": 5698 }, { "epoch": 0.953129573106995, "grad_norm": 11.365853309631348, "learning_rate": 3.332978204188486e-07, "loss": 3.2974, "step": 5699 }, { "epoch": 0.9532968181628131, "grad_norm": 5.030725955963135, "learning_rate": 3.309266297971597e-07, "loss": 3.0683, "step": 5700 }, { "epoch": 0.9534640632186311, "grad_norm": 4.700732707977295, "learning_rate": 3.2856384783146454e-07, "loss": 2.3558, "step": 5701 }, { "epoch": 0.9536313082744491, "grad_norm": 5.8616251945495605, "learning_rate": 3.262094753271411e-07, "loss": 2.6144, "step": 5702 }, { "epoch": 0.9537985533302672, "grad_norm": 3.6122663021087646, "learning_rate": 3.238635130866946e-07, "loss": 2.283, "step": 5703 }, { "epoch": 0.9539657983860852, "grad_norm": 4.145473480224609, "learning_rate": 3.21525961909766e-07, "loss": 2.3672, "step": 5704 }, { "epoch": 0.9541330434419032, "grad_norm": 3.994170665740967, "learning_rate": 3.1919682259313187e-07, "loss": 2.3302, "step": 5705 }, { "epoch": 0.9543002884977213, "grad_norm": 4.4488701820373535, "learning_rate": 3.16876095930696e-07, "loss": 2.3951, "step": 5706 }, { "epoch": 0.9544675335535393, "grad_norm": 5.31576681137085, "learning_rate": 3.1456378271349507e-07, "loss": 2.8047, "step": 5707 }, { "epoch": 0.9546347786093574, "grad_norm": 3.9168951511383057, "learning_rate": 3.122598837297014e-07, "loss": 2.437, "step": 5708 }, { "epoch": 0.9548020236651754, "grad_norm": 3.7113380432128906, "learning_rate": 3.0996439976462286e-07, "loss": 2.4638, "step": 5709 }, { "epoch": 0.9549692687209934, "grad_norm": 5.420399188995361, "learning_rate": 3.076773316006837e-07, "loss": 2.6587, "step": 5710 }, { "epoch": 0.9551365137768115, "grad_norm": 4.794158935546875, "learning_rate": 3.053986800174602e-07, "loss": 2.6729, "step": 5711 }, { "epoch": 0.9553037588326295, "grad_norm": 8.662692070007324, "learning_rate": 3.0312844579163944e-07, "loss": 2.6729, "step": 5712 }, { "epoch": 0.9554710038884475, "grad_norm": 6.364772796630859, "learning_rate": 3.0086662969705806e-07, "loss": 2.9137, "step": 5713 }, { "epoch": 0.9556382489442656, "grad_norm": 6.182532787322998, "learning_rate": 2.986132325046714e-07, "loss": 2.6767, "step": 5714 }, { "epoch": 0.9558054940000836, "grad_norm": 9.693269729614258, "learning_rate": 2.9636825498256803e-07, "loss": 2.1869, "step": 5715 }, { "epoch": 0.9559727390559016, "grad_norm": 4.306026458740234, "learning_rate": 2.9413169789596627e-07, "loss": 2.4033, "step": 5716 }, { "epoch": 0.9561399841117197, "grad_norm": 14.665205955505371, "learning_rate": 2.9190356200722023e-07, "loss": 2.5706, "step": 5717 }, { "epoch": 0.9563072291675377, "grad_norm": 5.394385814666748, "learning_rate": 2.896838480758002e-07, "loss": 2.4048, "step": 5718 }, { "epoch": 0.9564744742233557, "grad_norm": 8.422616004943848, "learning_rate": 2.874725568583175e-07, "loss": 2.4164, "step": 5719 }, { "epoch": 0.9566417192791739, "grad_norm": 3.9133658409118652, "learning_rate": 2.8526968910851106e-07, "loss": 2.7507, "step": 5720 }, { "epoch": 0.9568089643349919, "grad_norm": 4.9862470626831055, "learning_rate": 2.830752455772412e-07, "loss": 2.604, "step": 5721 }, { "epoch": 0.9569762093908099, "grad_norm": 2.9696578979492188, "learning_rate": 2.8088922701250964e-07, "loss": 2.6531, "step": 5722 }, { "epoch": 0.957143454446628, "grad_norm": 6.1013383865356445, "learning_rate": 2.787116341594315e-07, "loss": 3.0735, "step": 5723 }, { "epoch": 0.957310699502446, "grad_norm": 6.518977165222168, "learning_rate": 2.7654246776026015e-07, "loss": 2.6008, "step": 5724 }, { "epoch": 0.957477944558264, "grad_norm": 7.726726531982422, "learning_rate": 2.743817285543737e-07, "loss": 2.3293, "step": 5725 }, { "epoch": 0.9576451896140821, "grad_norm": 9.546602249145508, "learning_rate": 2.7222941727827743e-07, "loss": 2.4728, "step": 5726 }, { "epoch": 0.9578124346699001, "grad_norm": 4.794469356536865, "learning_rate": 2.700855346656067e-07, "loss": 2.6328, "step": 5727 }, { "epoch": 0.9579796797257181, "grad_norm": 8.138178825378418, "learning_rate": 2.679500814471186e-07, "loss": 3.1449, "step": 5728 }, { "epoch": 0.9581469247815362, "grad_norm": 4.158497333526611, "learning_rate": 2.6582305835069764e-07, "loss": 2.7705, "step": 5729 }, { "epoch": 0.9583141698373542, "grad_norm": 5.135260581970215, "learning_rate": 2.63704466101361e-07, "loss": 2.7152, "step": 5730 }, { "epoch": 0.9584814148931722, "grad_norm": 4.336676120758057, "learning_rate": 2.6159430542124775e-07, "loss": 2.3632, "step": 5731 }, { "epoch": 0.9586486599489903, "grad_norm": 4.889263153076172, "learning_rate": 2.594925770296269e-07, "loss": 2.3034, "step": 5732 }, { "epoch": 0.9588159050048083, "grad_norm": 4.2594170570373535, "learning_rate": 2.5739928164288107e-07, "loss": 2.2827, "step": 5733 }, { "epoch": 0.9589831500606263, "grad_norm": 4.031805515289307, "learning_rate": 2.5531441997453387e-07, "loss": 2.9325, "step": 5734 }, { "epoch": 0.9591503951164444, "grad_norm": 4.564347743988037, "learning_rate": 2.5323799273522796e-07, "loss": 2.7482, "step": 5735 }, { "epoch": 0.9593176401722624, "grad_norm": 7.605842590332031, "learning_rate": 2.51170000632725e-07, "loss": 2.816, "step": 5736 }, { "epoch": 0.9594848852280804, "grad_norm": 3.228604555130005, "learning_rate": 2.491104443719222e-07, "loss": 2.702, "step": 5737 }, { "epoch": 0.9596521302838985, "grad_norm": 13.108623504638672, "learning_rate": 2.470593246548303e-07, "loss": 2.8544, "step": 5738 }, { "epoch": 0.9598193753397165, "grad_norm": 5.721864223480225, "learning_rate": 2.4501664218060103e-07, "loss": 3.0969, "step": 5739 }, { "epoch": 0.9599866203955345, "grad_norm": 4.794158935546875, "learning_rate": 2.429823976454887e-07, "loss": 2.6523, "step": 5740 }, { "epoch": 0.9601538654513526, "grad_norm": 5.750003337860107, "learning_rate": 2.409565917428858e-07, "loss": 2.4967, "step": 5741 }, { "epoch": 0.9603211105071706, "grad_norm": 5.3498148918151855, "learning_rate": 2.3893922516330656e-07, "loss": 2.5526, "step": 5742 }, { "epoch": 0.9604883555629886, "grad_norm": 3.9837746620178223, "learning_rate": 2.369302985943872e-07, "loss": 2.6324, "step": 5743 }, { "epoch": 0.9606556006188067, "grad_norm": 2.8513052463531494, "learning_rate": 2.3492981272088267e-07, "loss": 2.2969, "step": 5744 }, { "epoch": 0.9608228456746247, "grad_norm": 4.667818069458008, "learning_rate": 2.3293776822467805e-07, "loss": 2.5167, "step": 5745 }, { "epoch": 0.9609900907304428, "grad_norm": 7.691545009613037, "learning_rate": 2.3095416578478024e-07, "loss": 2.7127, "step": 5746 }, { "epoch": 0.9611573357862608, "grad_norm": 5.1676154136657715, "learning_rate": 2.2897900607731227e-07, "loss": 1.9609, "step": 5747 }, { "epoch": 0.9613245808420788, "grad_norm": 3.8186774253845215, "learning_rate": 2.2701228977552448e-07, "loss": 2.623, "step": 5748 }, { "epoch": 0.961491825897897, "grad_norm": 7.1512250900268555, "learning_rate": 2.2505401754978895e-07, "loss": 2.01, "step": 5749 }, { "epoch": 0.961659070953715, "grad_norm": 10.798931121826172, "learning_rate": 2.231041900676023e-07, "loss": 2.7243, "step": 5750 }, { "epoch": 0.9618263160095329, "grad_norm": 4.158714771270752, "learning_rate": 2.211628079935718e-07, "loss": 3.0129, "step": 5751 }, { "epoch": 0.961993561065351, "grad_norm": 4.716842174530029, "learning_rate": 2.1922987198943758e-07, "loss": 2.9431, "step": 5752 }, { "epoch": 0.962160806121169, "grad_norm": 4.630619525909424, "learning_rate": 2.1730538271405588e-07, "loss": 2.42, "step": 5753 }, { "epoch": 0.962328051176987, "grad_norm": 6.290351390838623, "learning_rate": 2.1538934082340757e-07, "loss": 2.8095, "step": 5754 }, { "epoch": 0.9624952962328052, "grad_norm": 4.536811351776123, "learning_rate": 2.1348174697058687e-07, "loss": 2.7688, "step": 5755 }, { "epoch": 0.9626625412886232, "grad_norm": 6.094025611877441, "learning_rate": 2.1158260180581813e-07, "loss": 2.6263, "step": 5756 }, { "epoch": 0.9628297863444412, "grad_norm": 5.329250335693359, "learning_rate": 2.096919059764335e-07, "loss": 2.7049, "step": 5757 }, { "epoch": 0.9629970314002593, "grad_norm": 9.112748146057129, "learning_rate": 2.0780966012690085e-07, "loss": 3.5588, "step": 5758 }, { "epoch": 0.9631642764560773, "grad_norm": 13.173627853393555, "learning_rate": 2.0593586489879024e-07, "loss": 3.8444, "step": 5759 }, { "epoch": 0.9633315215118953, "grad_norm": 5.748164176940918, "learning_rate": 2.040705209308047e-07, "loss": 2.8468, "step": 5760 }, { "epoch": 0.9634987665677134, "grad_norm": 4.090095043182373, "learning_rate": 2.022136288587606e-07, "loss": 2.5835, "step": 5761 }, { "epoch": 0.9636660116235314, "grad_norm": 8.871829986572266, "learning_rate": 2.0036518931559612e-07, "loss": 2.8578, "step": 5762 }, { "epoch": 0.9638332566793494, "grad_norm": 6.414892196655273, "learning_rate": 1.9852520293136834e-07, "loss": 2.7546, "step": 5763 }, { "epoch": 0.9640005017351675, "grad_norm": 2.584711790084839, "learning_rate": 1.9669367033324782e-07, "loss": 2.544, "step": 5764 }, { "epoch": 0.9641677467909855, "grad_norm": 13.741525650024414, "learning_rate": 1.9487059214553238e-07, "loss": 4.2797, "step": 5765 }, { "epoch": 0.9643349918468035, "grad_norm": 6.200967788696289, "learning_rate": 1.9305596898962774e-07, "loss": 2.5301, "step": 5766 }, { "epoch": 0.9645022369026216, "grad_norm": 6.4713215827941895, "learning_rate": 1.912498014840669e-07, "loss": 2.7422, "step": 5767 }, { "epoch": 0.9646694819584396, "grad_norm": 5.081512451171875, "learning_rate": 1.8945209024449628e-07, "loss": 3.4072, "step": 5768 }, { "epoch": 0.9648367270142576, "grad_norm": 5.358619689941406, "learning_rate": 1.8766283588368127e-07, "loss": 3.0253, "step": 5769 }, { "epoch": 0.9650039720700757, "grad_norm": 2.8926355838775635, "learning_rate": 1.8588203901149792e-07, "loss": 2.4453, "step": 5770 }, { "epoch": 0.9651712171258937, "grad_norm": 4.1386308670043945, "learning_rate": 1.8410970023495232e-07, "loss": 2.9993, "step": 5771 }, { "epoch": 0.9653384621817117, "grad_norm": 3.5281431674957275, "learning_rate": 1.823458201581585e-07, "loss": 2.5362, "step": 5772 }, { "epoch": 0.9655057072375298, "grad_norm": 5.1408185958862305, "learning_rate": 1.8059039938235212e-07, "loss": 2.8226, "step": 5773 }, { "epoch": 0.9656729522933478, "grad_norm": 8.912006378173828, "learning_rate": 1.788434385058768e-07, "loss": 2.789, "step": 5774 }, { "epoch": 0.9658401973491658, "grad_norm": 4.358989238739014, "learning_rate": 1.7710493812420347e-07, "loss": 3.1268, "step": 5775 }, { "epoch": 0.9660074424049839, "grad_norm": 5.533484935760498, "learning_rate": 1.7537489882991086e-07, "loss": 2.5931, "step": 5776 }, { "epoch": 0.9661746874608019, "grad_norm": 4.655098915100098, "learning_rate": 1.736533212126995e-07, "loss": 2.6475, "step": 5777 }, { "epoch": 0.9663419325166199, "grad_norm": 4.950163841247559, "learning_rate": 1.7194020585938332e-07, "loss": 2.7337, "step": 5778 }, { "epoch": 0.966509177572438, "grad_norm": 5.073270320892334, "learning_rate": 1.7023555335388963e-07, "loss": 2.8498, "step": 5779 }, { "epoch": 0.966676422628256, "grad_norm": 5.699751853942871, "learning_rate": 1.685393642772648e-07, "loss": 2.8049, "step": 5780 }, { "epoch": 0.966843667684074, "grad_norm": 5.26263427734375, "learning_rate": 1.6685163920766855e-07, "loss": 3.0268, "step": 5781 }, { "epoch": 0.9670109127398921, "grad_norm": 9.936155319213867, "learning_rate": 1.6517237872037684e-07, "loss": 2.9311, "step": 5782 }, { "epoch": 0.9671781577957101, "grad_norm": 3.858610153198242, "learning_rate": 1.635015833877762e-07, "loss": 2.7333, "step": 5783 }, { "epoch": 0.9673454028515281, "grad_norm": 6.143043518066406, "learning_rate": 1.6183925377937504e-07, "loss": 2.9279, "step": 5784 }, { "epoch": 0.9675126479073463, "grad_norm": 4.812164783477783, "learning_rate": 1.601853904617867e-07, "loss": 2.6085, "step": 5785 }, { "epoch": 0.9676798929631643, "grad_norm": 6.644602298736572, "learning_rate": 1.585399939987492e-07, "loss": 2.7589, "step": 5786 }, { "epoch": 0.9678471380189824, "grad_norm": 6.426840305328369, "learning_rate": 1.569030649511083e-07, "loss": 3.1144, "step": 5787 }, { "epoch": 0.9680143830748004, "grad_norm": 4.581924915313721, "learning_rate": 1.5527460387682324e-07, "loss": 2.2822, "step": 5788 }, { "epoch": 0.9681816281306184, "grad_norm": 7.2356486320495605, "learning_rate": 1.5365461133096947e-07, "loss": 2.748, "step": 5789 }, { "epoch": 0.9683488731864365, "grad_norm": 7.671509265899658, "learning_rate": 1.5204308786573584e-07, "loss": 3.0896, "step": 5790 }, { "epoch": 0.9685161182422545, "grad_norm": 6.3458991050720215, "learning_rate": 1.5044003403041905e-07, "loss": 3.2842, "step": 5791 }, { "epoch": 0.9686833632980725, "grad_norm": 5.877579689025879, "learning_rate": 1.488454503714376e-07, "loss": 2.3029, "step": 5792 }, { "epoch": 0.9688506083538906, "grad_norm": 5.631944179534912, "learning_rate": 1.4725933743231502e-07, "loss": 3.0986, "step": 5793 }, { "epoch": 0.9690178534097086, "grad_norm": 7.130926132202148, "learning_rate": 1.4568169575369383e-07, "loss": 3.2815, "step": 5794 }, { "epoch": 0.9691850984655266, "grad_norm": 6.623461723327637, "learning_rate": 1.4411252587332447e-07, "loss": 2.7883, "step": 5795 }, { "epoch": 0.9693523435213447, "grad_norm": 4.458541393280029, "learning_rate": 1.4255182832607074e-07, "loss": 2.1485, "step": 5796 }, { "epoch": 0.9695195885771627, "grad_norm": 3.127737522125244, "learning_rate": 1.4099960364391261e-07, "loss": 2.1501, "step": 5797 }, { "epoch": 0.9696868336329807, "grad_norm": 6.669377326965332, "learning_rate": 1.3945585235593528e-07, "loss": 2.5121, "step": 5798 }, { "epoch": 0.9698540786887988, "grad_norm": 5.726391792297363, "learning_rate": 1.3792057498833998e-07, "loss": 2.2482, "step": 5799 }, { "epoch": 0.9700213237446168, "grad_norm": 4.484707832336426, "learning_rate": 1.363937720644387e-07, "loss": 2.432, "step": 5800 }, { "epoch": 0.9701885688004348, "grad_norm": 5.765923500061035, "learning_rate": 1.3487544410465404e-07, "loss": 2.7797, "step": 5801 }, { "epoch": 0.9703558138562529, "grad_norm": 5.396022796630859, "learning_rate": 1.3336559162652195e-07, "loss": 2.9606, "step": 5802 }, { "epoch": 0.9705230589120709, "grad_norm": 5.74508810043335, "learning_rate": 1.318642151446864e-07, "loss": 3.1912, "step": 5803 }, { "epoch": 0.9706903039678889, "grad_norm": 6.346031188964844, "learning_rate": 1.3037131517090462e-07, "loss": 2.7818, "step": 5804 }, { "epoch": 0.970857549023707, "grad_norm": 3.8892481327056885, "learning_rate": 1.288868922140446e-07, "loss": 2.6181, "step": 5805 }, { "epoch": 0.971024794079525, "grad_norm": 2.937753438949585, "learning_rate": 1.2741094678008213e-07, "loss": 2.8954, "step": 5806 }, { "epoch": 0.971192039135343, "grad_norm": 4.0948076248168945, "learning_rate": 1.2594347937210927e-07, "loss": 2.5589, "step": 5807 }, { "epoch": 0.9713592841911611, "grad_norm": 4.72580623626709, "learning_rate": 1.244844904903203e-07, "loss": 2.8045, "step": 5808 }, { "epoch": 0.9715265292469791, "grad_norm": 6.819939613342285, "learning_rate": 1.2303398063202297e-07, "loss": 2.9335, "step": 5809 }, { "epoch": 0.9716937743027971, "grad_norm": 5.342371940612793, "learning_rate": 1.2159195029164128e-07, "loss": 2.7294, "step": 5810 }, { "epoch": 0.9718610193586152, "grad_norm": 8.220560073852539, "learning_rate": 1.2015839996069312e-07, "loss": 3.3071, "step": 5811 }, { "epoch": 0.9720282644144332, "grad_norm": 3.543635606765747, "learning_rate": 1.187333301278265e-07, "loss": 2.2812, "step": 5812 }, { "epoch": 0.9721955094702512, "grad_norm": 3.523185968399048, "learning_rate": 1.1731674127878344e-07, "loss": 2.4211, "step": 5813 }, { "epoch": 0.9723627545260694, "grad_norm": 6.262028217315674, "learning_rate": 1.1590863389641938e-07, "loss": 2.8164, "step": 5814 }, { "epoch": 0.9725299995818873, "grad_norm": 14.02365779876709, "learning_rate": 1.1450900846070035e-07, "loss": 2.6566, "step": 5815 }, { "epoch": 0.9726972446377053, "grad_norm": 6.180339813232422, "learning_rate": 1.1311786544870029e-07, "loss": 3.1188, "step": 5816 }, { "epoch": 0.9728644896935235, "grad_norm": 6.3090128898620605, "learning_rate": 1.1173520533459825e-07, "loss": 2.8089, "step": 5817 }, { "epoch": 0.9730317347493415, "grad_norm": 7.776222229003906, "learning_rate": 1.1036102858969222e-07, "loss": 2.5341, "step": 5818 }, { "epoch": 0.9731989798051595, "grad_norm": 6.05702018737793, "learning_rate": 1.0899533568237419e-07, "loss": 2.8575, "step": 5819 }, { "epoch": 0.9733662248609776, "grad_norm": 5.446789741516113, "learning_rate": 1.0763812707815513e-07, "loss": 2.5368, "step": 5820 }, { "epoch": 0.9735334699167956, "grad_norm": 2.9088263511657715, "learning_rate": 1.0628940323965386e-07, "loss": 2.3413, "step": 5821 }, { "epoch": 0.9737007149726136, "grad_norm": 3.937028169631958, "learning_rate": 1.0494916462658877e-07, "loss": 2.3577, "step": 5822 }, { "epoch": 0.9738679600284317, "grad_norm": 3.7760493755340576, "learning_rate": 1.0361741169579442e-07, "loss": 2.6941, "step": 5823 }, { "epoch": 0.9740352050842497, "grad_norm": 8.0503568649292, "learning_rate": 1.0229414490120492e-07, "loss": 2.6131, "step": 5824 }, { "epoch": 0.9742024501400678, "grad_norm": 7.0625739097595215, "learning_rate": 1.0097936469387615e-07, "loss": 2.5472, "step": 5825 }, { "epoch": 0.9743696951958858, "grad_norm": 5.0691962242126465, "learning_rate": 9.967307152195237e-08, "loss": 2.7899, "step": 5826 }, { "epoch": 0.9745369402517038, "grad_norm": 4.981039047241211, "learning_rate": 9.83752658306969e-08, "loss": 2.7983, "step": 5827 }, { "epoch": 0.9747041853075219, "grad_norm": 9.824261665344238, "learning_rate": 9.708594806247806e-08, "loss": 2.919, "step": 5828 }, { "epoch": 0.9748714303633399, "grad_norm": 3.0932304859161377, "learning_rate": 9.580511865677211e-08, "loss": 2.4985, "step": 5829 }, { "epoch": 0.9750386754191579, "grad_norm": 4.7484307289123535, "learning_rate": 9.453277805016037e-08, "loss": 2.6721, "step": 5830 }, { "epoch": 0.975205920474976, "grad_norm": 5.158802509307861, "learning_rate": 9.326892667632648e-08, "loss": 2.6178, "step": 5831 }, { "epoch": 0.975373165530794, "grad_norm": 5.726059436798096, "learning_rate": 9.20135649660675e-08, "loss": 2.8006, "step": 5832 }, { "epoch": 0.975540410586612, "grad_norm": 4.058821678161621, "learning_rate": 9.07666933472856e-08, "loss": 2.4055, "step": 5833 }, { "epoch": 0.9757076556424301, "grad_norm": 3.9657602310180664, "learning_rate": 8.952831224498248e-08, "loss": 2.6397, "step": 5834 }, { "epoch": 0.9758749006982481, "grad_norm": 3.5632638931274414, "learning_rate": 8.829842208127604e-08, "loss": 2.5895, "step": 5835 }, { "epoch": 0.9760421457540661, "grad_norm": 4.232386112213135, "learning_rate": 8.707702327537815e-08, "loss": 2.7762, "step": 5836 }, { "epoch": 0.9762093908098842, "grad_norm": 3.9486188888549805, "learning_rate": 8.58641162436169e-08, "loss": 2.6747, "step": 5837 }, { "epoch": 0.9763766358657022, "grad_norm": 10.178568840026855, "learning_rate": 8.465970139941993e-08, "loss": 3.7308, "step": 5838 }, { "epoch": 0.9765438809215202, "grad_norm": 4.9340667724609375, "learning_rate": 8.346377915332271e-08, "loss": 2.8975, "step": 5839 }, { "epoch": 0.9767111259773383, "grad_norm": 3.1367671489715576, "learning_rate": 8.227634991296584e-08, "loss": 2.7534, "step": 5840 }, { "epoch": 0.9768783710331563, "grad_norm": 4.392470359802246, "learning_rate": 8.109741408309223e-08, "loss": 2.6425, "step": 5841 }, { "epoch": 0.9770456160889743, "grad_norm": 5.095688819885254, "learning_rate": 7.992697206555544e-08, "loss": 2.7747, "step": 5842 }, { "epoch": 0.9772128611447924, "grad_norm": 7.070742130279541, "learning_rate": 7.876502425930577e-08, "loss": 2.7342, "step": 5843 }, { "epoch": 0.9773801062006104, "grad_norm": 9.08476448059082, "learning_rate": 7.761157106040418e-08, "loss": 2.9143, "step": 5844 }, { "epoch": 0.9775473512564284, "grad_norm": 14.5781888961792, "learning_rate": 7.646661286201672e-08, "loss": 2.7687, "step": 5845 }, { "epoch": 0.9777145963122466, "grad_norm": 5.163862228393555, "learning_rate": 7.533015005440902e-08, "loss": 2.5549, "step": 5846 }, { "epoch": 0.9778818413680646, "grad_norm": 4.893588066101074, "learning_rate": 7.420218302495452e-08, "loss": 3.0691, "step": 5847 }, { "epoch": 0.9780490864238826, "grad_norm": 3.44484806060791, "learning_rate": 7.308271215813178e-08, "loss": 2.6553, "step": 5848 }, { "epoch": 0.9782163314797007, "grad_norm": 3.6726129055023193, "learning_rate": 7.197173783552169e-08, "loss": 3.0064, "step": 5849 }, { "epoch": 0.9783835765355187, "grad_norm": 4.422990322113037, "learning_rate": 7.086926043580466e-08, "loss": 2.728, "step": 5850 }, { "epoch": 0.9785508215913367, "grad_norm": 4.510922431945801, "learning_rate": 6.977528033477454e-08, "loss": 2.8075, "step": 5851 }, { "epoch": 0.9787180666471548, "grad_norm": 5.67623233795166, "learning_rate": 6.868979790531916e-08, "loss": 2.9806, "step": 5852 }, { "epoch": 0.9788853117029728, "grad_norm": 7.538052082061768, "learning_rate": 6.761281351743698e-08, "loss": 2.5914, "step": 5853 }, { "epoch": 0.9790525567587908, "grad_norm": 5.392258167266846, "learning_rate": 6.654432753822326e-08, "loss": 2.4602, "step": 5854 }, { "epoch": 0.9792198018146089, "grad_norm": 3.045635461807251, "learning_rate": 6.548434033188667e-08, "loss": 2.4473, "step": 5855 }, { "epoch": 0.9793870468704269, "grad_norm": 5.573869228363037, "learning_rate": 6.443285225972706e-08, "loss": 2.6294, "step": 5856 }, { "epoch": 0.9795542919262449, "grad_norm": 4.8956074714660645, "learning_rate": 6.338986368015498e-08, "loss": 2.8775, "step": 5857 }, { "epoch": 0.979721536982063, "grad_norm": 3.983020067214966, "learning_rate": 6.235537494868049e-08, "loss": 2.3854, "step": 5858 }, { "epoch": 0.979888782037881, "grad_norm": 4.172373294830322, "learning_rate": 6.132938641791874e-08, "loss": 2.6275, "step": 5859 }, { "epoch": 0.980056027093699, "grad_norm": 5.772435188293457, "learning_rate": 6.031189843758445e-08, "loss": 2.6713, "step": 5860 }, { "epoch": 0.9802232721495171, "grad_norm": 7.891183376312256, "learning_rate": 5.930291135450017e-08, "loss": 2.5121, "step": 5861 }, { "epoch": 0.9803905172053351, "grad_norm": 14.122135162353516, "learning_rate": 5.830242551258247e-08, "loss": 2.4378, "step": 5862 }, { "epoch": 0.9805577622611532, "grad_norm": 10.712884902954102, "learning_rate": 5.731044125286134e-08, "loss": 3.11, "step": 5863 }, { "epoch": 0.9807250073169712, "grad_norm": 5.099606513977051, "learning_rate": 5.6326958913457964e-08, "loss": 2.777, "step": 5864 }, { "epoch": 0.9808922523727892, "grad_norm": 4.440805912017822, "learning_rate": 5.535197882959864e-08, "loss": 2.6348, "step": 5865 }, { "epoch": 0.9810594974286073, "grad_norm": 4.3766703605651855, "learning_rate": 5.43855013336203e-08, "loss": 2.2369, "step": 5866 }, { "epoch": 0.9812267424844253, "grad_norm": 4.664854049682617, "learning_rate": 5.342752675495111e-08, "loss": 2.4495, "step": 5867 }, { "epoch": 0.9813939875402433, "grad_norm": 4.994537830352783, "learning_rate": 5.247805542012152e-08, "loss": 3.0979, "step": 5868 }, { "epoch": 0.9815612325960614, "grad_norm": 4.5275774002075195, "learning_rate": 5.1537087652772656e-08, "loss": 2.8945, "step": 5869 }, { "epoch": 0.9817284776518794, "grad_norm": 5.3145246505737305, "learning_rate": 5.060462377363684e-08, "loss": 2.5145, "step": 5870 }, { "epoch": 0.9818957227076974, "grad_norm": 8.362918853759766, "learning_rate": 4.96806641005515e-08, "loss": 2.6927, "step": 5871 }, { "epoch": 0.9820629677635155, "grad_norm": 8.439339637756348, "learning_rate": 4.8765208948459154e-08, "loss": 2.2618, "step": 5872 }, { "epoch": 0.9822302128193335, "grad_norm": 3.872117280960083, "learning_rate": 4.78582586293963e-08, "loss": 2.5414, "step": 5873 }, { "epoch": 0.9823974578751515, "grad_norm": 5.159261703491211, "learning_rate": 4.69598134525101e-08, "loss": 2.8219, "step": 5874 }, { "epoch": 0.9825647029309696, "grad_norm": 3.5096144676208496, "learning_rate": 4.606987372403615e-08, "loss": 2.3374, "step": 5875 }, { "epoch": 0.9827319479867876, "grad_norm": 3.6688921451568604, "learning_rate": 4.518843974732345e-08, "loss": 2.5919, "step": 5876 }, { "epoch": 0.9828991930426056, "grad_norm": 7.28072452545166, "learning_rate": 4.4315511822812236e-08, "loss": 2.7415, "step": 5877 }, { "epoch": 0.9830664380984238, "grad_norm": 4.540366172790527, "learning_rate": 4.3451090248050606e-08, "loss": 2.6583, "step": 5878 }, { "epoch": 0.9832336831542418, "grad_norm": 6.171463489532471, "learning_rate": 4.2595175317680645e-08, "loss": 3.2797, "step": 5879 }, { "epoch": 0.9834009282100598, "grad_norm": 4.3530707359313965, "learning_rate": 4.174776732344676e-08, "loss": 2.8694, "step": 5880 }, { "epoch": 0.9835681732658779, "grad_norm": 3.9458961486816406, "learning_rate": 4.0908866554198453e-08, "loss": 2.7194, "step": 5881 }, { "epoch": 0.9837354183216959, "grad_norm": 4.5140814781188965, "learning_rate": 4.0078473295881994e-08, "loss": 3.0798, "step": 5882 }, { "epoch": 0.9839026633775139, "grad_norm": 4.411372661590576, "learning_rate": 3.9256587831540406e-08, "loss": 2.6147, "step": 5883 }, { "epoch": 0.984069908433332, "grad_norm": 6.039856910705566, "learning_rate": 3.8443210441324597e-08, "loss": 3.1559, "step": 5884 }, { "epoch": 0.98423715348915, "grad_norm": 5.120823383331299, "learning_rate": 3.763834140247391e-08, "loss": 2.6602, "step": 5885 }, { "epoch": 0.984404398544968, "grad_norm": 4.664632797241211, "learning_rate": 3.6841980989343886e-08, "loss": 2.6888, "step": 5886 }, { "epoch": 0.9845716436007861, "grad_norm": 6.4578094482421875, "learning_rate": 3.605412947337294e-08, "loss": 2.7633, "step": 5887 }, { "epoch": 0.9847388886566041, "grad_norm": 7.6717753410339355, "learning_rate": 3.527478712311016e-08, "loss": 2.7977, "step": 5888 }, { "epoch": 0.9849061337124221, "grad_norm": 4.201443195343018, "learning_rate": 3.450395420419861e-08, "loss": 2.6202, "step": 5889 }, { "epoch": 0.9850733787682402, "grad_norm": 4.311738014221191, "learning_rate": 3.374163097938643e-08, "loss": 3.2517, "step": 5890 }, { "epoch": 0.9852406238240582, "grad_norm": 5.78620719909668, "learning_rate": 3.298781770851578e-08, "loss": 3.2208, "step": 5891 }, { "epoch": 0.9854078688798762, "grad_norm": 4.630188465118408, "learning_rate": 3.2242514648528344e-08, "loss": 2.5405, "step": 5892 }, { "epoch": 0.9855751139356943, "grad_norm": 4.757447719573975, "learning_rate": 3.1505722053468136e-08, "loss": 2.5697, "step": 5893 }, { "epoch": 0.9857423589915123, "grad_norm": 6.112194538116455, "learning_rate": 3.07774401744787e-08, "loss": 2.9789, "step": 5894 }, { "epoch": 0.9859096040473303, "grad_norm": 3.3824727535247803, "learning_rate": 3.005766925980036e-08, "loss": 2.9201, "step": 5895 }, { "epoch": 0.9860768491031484, "grad_norm": 4.512463092803955, "learning_rate": 2.9346409554770193e-08, "loss": 2.0314, "step": 5896 }, { "epoch": 0.9862440941589664, "grad_norm": 8.088712692260742, "learning_rate": 2.8643661301830383e-08, "loss": 2.8565, "step": 5897 }, { "epoch": 0.9864113392147844, "grad_norm": 4.06563138961792, "learning_rate": 2.7949424740517095e-08, "loss": 2.7655, "step": 5898 }, { "epoch": 0.9865785842706025, "grad_norm": 5.54555606842041, "learning_rate": 2.7263700107463264e-08, "loss": 2.3747, "step": 5899 }, { "epoch": 0.9867458293264205, "grad_norm": 4.043916702270508, "learning_rate": 2.6586487636409695e-08, "loss": 2.5343, "step": 5900 }, { "epoch": 0.9869130743822385, "grad_norm": 15.451604843139648, "learning_rate": 2.5917787558185636e-08, "loss": 4.1078, "step": 5901 }, { "epoch": 0.9870803194380566, "grad_norm": 7.214953422546387, "learning_rate": 2.525760010072542e-08, "loss": 2.4802, "step": 5902 }, { "epoch": 0.9872475644938746, "grad_norm": 6.911556720733643, "learning_rate": 2.460592548905738e-08, "loss": 2.9537, "step": 5903 }, { "epoch": 0.9874148095496927, "grad_norm": 4.844998836517334, "learning_rate": 2.396276394531216e-08, "loss": 2.901, "step": 5904 }, { "epoch": 0.9875820546055107, "grad_norm": 6.198072910308838, "learning_rate": 2.3328115688714402e-08, "loss": 2.803, "step": 5905 }, { "epoch": 0.9877492996613287, "grad_norm": 8.776344299316406, "learning_rate": 2.2701980935588284e-08, "loss": 2.9669, "step": 5906 }, { "epoch": 0.9879165447171468, "grad_norm": 36.7845458984375, "learning_rate": 2.2084359899363083e-08, "loss": 2.0354, "step": 5907 }, { "epoch": 0.9880837897729648, "grad_norm": 5.129694938659668, "learning_rate": 2.1475252790553736e-08, "loss": 2.3872, "step": 5908 }, { "epoch": 0.9882510348287828, "grad_norm": 7.918032169342041, "learning_rate": 2.0874659816780273e-08, "loss": 3.0791, "step": 5909 }, { "epoch": 0.988418279884601, "grad_norm": 4.385732173919678, "learning_rate": 2.0282581182762273e-08, "loss": 2.8844, "step": 5910 }, { "epoch": 0.988585524940419, "grad_norm": 8.399837493896484, "learning_rate": 1.96990170903133e-08, "loss": 2.3673, "step": 5911 }, { "epoch": 0.988752769996237, "grad_norm": 3.865755796432495, "learning_rate": 1.912396773834646e-08, "loss": 2.5537, "step": 5912 }, { "epoch": 0.9889200150520551, "grad_norm": 4.399834632873535, "learning_rate": 1.8557433322868857e-08, "loss": 2.6708, "step": 5913 }, { "epoch": 0.9890872601078731, "grad_norm": 3.6262619495391846, "learning_rate": 1.799941403699268e-08, "loss": 2.7224, "step": 5914 }, { "epoch": 0.9892545051636911, "grad_norm": 5.634056568145752, "learning_rate": 1.7449910070924115e-08, "loss": 3.0297, "step": 5915 }, { "epoch": 0.9894217502195092, "grad_norm": 6.623456001281738, "learning_rate": 1.690892161196056e-08, "loss": 3.2568, "step": 5916 }, { "epoch": 0.9895889952753272, "grad_norm": 5.166713714599609, "learning_rate": 1.6376448844507285e-08, "loss": 3.0862, "step": 5917 }, { "epoch": 0.9897562403311452, "grad_norm": 4.771968364715576, "learning_rate": 1.5852491950058e-08, "loss": 2.5701, "step": 5918 }, { "epoch": 0.9899234853869633, "grad_norm": 2.707681179046631, "learning_rate": 1.5337051107211508e-08, "loss": 2.431, "step": 5919 }, { "epoch": 0.9900907304427813, "grad_norm": 4.365352153778076, "learning_rate": 1.48301264916606e-08, "loss": 2.44, "step": 5920 }, { "epoch": 0.9902579754985993, "grad_norm": 4.811575412750244, "learning_rate": 1.4331718276189287e-08, "loss": 3.0742, "step": 5921 }, { "epoch": 0.9904252205544174, "grad_norm": 8.666050910949707, "learning_rate": 1.3841826630689448e-08, "loss": 2.6553, "step": 5922 }, { "epoch": 0.9905924656102354, "grad_norm": 3.2178728580474854, "learning_rate": 1.3360451722141398e-08, "loss": 2.3309, "step": 5923 }, { "epoch": 0.9907597106660534, "grad_norm": 6.603808403015137, "learning_rate": 1.2887593714627776e-08, "loss": 3.1719, "step": 5924 }, { "epoch": 0.9909269557218715, "grad_norm": 3.955395460128784, "learning_rate": 1.242325276932521e-08, "loss": 2.7597, "step": 5925 }, { "epoch": 0.9910942007776895, "grad_norm": 6.202901363372803, "learning_rate": 1.1967429044507095e-08, "loss": 3.1493, "step": 5926 }, { "epoch": 0.9912614458335075, "grad_norm": 4.726330280303955, "learning_rate": 1.1520122695546365e-08, "loss": 2.694, "step": 5927 }, { "epoch": 0.9914286908893256, "grad_norm": 7.390532970428467, "learning_rate": 1.1081333874912724e-08, "loss": 2.8452, "step": 5928 }, { "epoch": 0.9915959359451436, "grad_norm": 8.459007263183594, "learning_rate": 1.0651062732167094e-08, "loss": 2.394, "step": 5929 }, { "epoch": 0.9917631810009616, "grad_norm": 6.039361476898193, "learning_rate": 1.022930941397271e-08, "loss": 3.0185, "step": 5930 }, { "epoch": 0.9919304260567797, "grad_norm": 3.7567179203033447, "learning_rate": 9.8160740640868e-09, "loss": 2.8098, "step": 5931 }, { "epoch": 0.9920976711125977, "grad_norm": 4.90992546081543, "learning_rate": 9.41135682336336e-09, "loss": 2.8089, "step": 5932 }, { "epoch": 0.9922649161684157, "grad_norm": 5.323075771331787, "learning_rate": 9.015157829755927e-09, "loss": 2.6928, "step": 5933 }, { "epoch": 0.9924321612242338, "grad_norm": 6.484539031982422, "learning_rate": 8.62747721831203e-09, "loss": 2.9647, "step": 5934 }, { "epoch": 0.9925994062800518, "grad_norm": 9.741381645202637, "learning_rate": 8.248315121175964e-09, "loss": 3.2419, "step": 5935 }, { "epoch": 0.9927666513358698, "grad_norm": 3.284675121307373, "learning_rate": 7.877671667586017e-09, "loss": 3.1087, "step": 5936 }, { "epoch": 0.9929338963916879, "grad_norm": 5.218320846557617, "learning_rate": 7.515546983880017e-09, "loss": 2.6584, "step": 5937 }, { "epoch": 0.9931011414475059, "grad_norm": 7.416790962219238, "learning_rate": 7.161941193489785e-09, "loss": 3.0437, "step": 5938 }, { "epoch": 0.9932683865033239, "grad_norm": 4.982656478881836, "learning_rate": 6.816854416949458e-09, "loss": 2.7335, "step": 5939 }, { "epoch": 0.993435631559142, "grad_norm": 6.600374221801758, "learning_rate": 6.480286771878841e-09, "loss": 2.637, "step": 5940 }, { "epoch": 0.99360287661496, "grad_norm": 4.569051742553711, "learning_rate": 6.1522383730056036e-09, "loss": 2.6008, "step": 5941 }, { "epoch": 0.9937701216707782, "grad_norm": 7.356767654418945, "learning_rate": 5.832709332140307e-09, "loss": 2.8751, "step": 5942 }, { "epoch": 0.9939373667265962, "grad_norm": 4.66675329208374, "learning_rate": 5.521699758204158e-09, "loss": 2.3397, "step": 5943 }, { "epoch": 0.9941046117824142, "grad_norm": 2.739680290222168, "learning_rate": 5.2192097572040245e-09, "loss": 2.6127, "step": 5944 }, { "epoch": 0.9942718568382323, "grad_norm": 5.744908332824707, "learning_rate": 4.925239432246321e-09, "loss": 2.5129, "step": 5945 }, { "epoch": 0.9944391018940503, "grad_norm": 3.319920778274536, "learning_rate": 4.639788883534224e-09, "loss": 2.4167, "step": 5946 }, { "epoch": 0.9946063469498683, "grad_norm": 4.93839693069458, "learning_rate": 4.362858208364906e-09, "loss": 2.8262, "step": 5947 }, { "epoch": 0.9947735920056864, "grad_norm": 6.634796142578125, "learning_rate": 4.094447501132303e-09, "loss": 3.1441, "step": 5948 }, { "epoch": 0.9949408370615044, "grad_norm": 5.7452168464660645, "learning_rate": 3.834556853329896e-09, "loss": 3.1472, "step": 5949 }, { "epoch": 0.9951080821173224, "grad_norm": 8.677360534667969, "learning_rate": 3.583186353536827e-09, "loss": 2.0786, "step": 5950 }, { "epoch": 0.9952753271731405, "grad_norm": 4.957656383514404, "learning_rate": 3.3403360874401103e-09, "loss": 2.8479, "step": 5951 }, { "epoch": 0.9954425722289585, "grad_norm": 3.4464590549468994, "learning_rate": 3.1060061378151983e-09, "loss": 2.8668, "step": 5952 }, { "epoch": 0.9956098172847765, "grad_norm": 3.1797192096710205, "learning_rate": 2.8801965845343114e-09, "loss": 2.306, "step": 5953 }, { "epoch": 0.9957770623405946, "grad_norm": 6.98383903503418, "learning_rate": 2.662907504569212e-09, "loss": 2.4599, "step": 5954 }, { "epoch": 0.9959443073964126, "grad_norm": 3.6638922691345215, "learning_rate": 2.4541389719828778e-09, "loss": 2.569, "step": 5955 }, { "epoch": 0.9961115524522306, "grad_norm": 4.800361633300781, "learning_rate": 2.2538910579378294e-09, "loss": 2.3287, "step": 5956 }, { "epoch": 0.9962787975080487, "grad_norm": 4.6165452003479, "learning_rate": 2.062163830687802e-09, "loss": 2.8186, "step": 5957 }, { "epoch": 0.9964460425638667, "grad_norm": 5.757997035980225, "learning_rate": 1.878957355586075e-09, "loss": 2.7671, "step": 5958 }, { "epoch": 0.9966132876196847, "grad_norm": 3.10366153717041, "learning_rate": 1.704271695077142e-09, "loss": 3.051, "step": 5959 }, { "epoch": 0.9967805326755028, "grad_norm": 4.10654354095459, "learning_rate": 1.538106908707815e-09, "loss": 2.6949, "step": 5960 }, { "epoch": 0.9969477777313208, "grad_norm": 4.076190948486328, "learning_rate": 1.380463053116121e-09, "loss": 2.6455, "step": 5961 }, { "epoch": 0.9971150227871388, "grad_norm": 11.726816177368164, "learning_rate": 1.2313401820340797e-09, "loss": 3.0753, "step": 5962 }, { "epoch": 0.9972822678429569, "grad_norm": 5.404442310333252, "learning_rate": 1.0907383462932519e-09, "loss": 2.5644, "step": 5963 }, { "epoch": 0.9974495128987749, "grad_norm": 6.301368713378906, "learning_rate": 9.5865759381919e-10, "loss": 2.8465, "step": 5964 }, { "epoch": 0.9976167579545929, "grad_norm": 6.097677707672119, "learning_rate": 8.350979696314376e-10, "loss": 2.7941, "step": 5965 }, { "epoch": 0.997784003010411, "grad_norm": 4.015034198760986, "learning_rate": 7.200595158490808e-10, "loss": 2.5779, "step": 5966 }, { "epoch": 0.997951248066229, "grad_norm": 5.040926456451416, "learning_rate": 6.135422716768702e-10, "loss": 2.9063, "step": 5967 }, { "epoch": 0.998118493122047, "grad_norm": 10.062379837036133, "learning_rate": 5.155462734302008e-10, "loss": 3.1822, "step": 5968 }, { "epoch": 0.9982857381778651, "grad_norm": 3.053220510482788, "learning_rate": 4.260715545101324e-10, "loss": 2.4087, "step": 5969 }, { "epoch": 0.9984529832336831, "grad_norm": 10.1709623336792, "learning_rate": 3.4511814540894026e-10, "loss": 3.1863, "step": 5970 }, { "epoch": 0.9986202282895011, "grad_norm": 4.317211627960205, "learning_rate": 2.726860737267689e-10, "loss": 2.5505, "step": 5971 }, { "epoch": 0.9987874733453193, "grad_norm": 3.9773740768432617, "learning_rate": 2.0877536415220277e-10, "loss": 2.5077, "step": 5972 }, { "epoch": 0.9989547184011373, "grad_norm": 13.281478881835938, "learning_rate": 1.5338603846504208e-10, "loss": 2.8204, "step": 5973 }, { "epoch": 0.9991219634569553, "grad_norm": 4.644124984741211, "learning_rate": 1.0651811555018043e-10, "loss": 2.4474, "step": 5974 }, { "epoch": 0.9992892085127734, "grad_norm": 7.178278923034668, "learning_rate": 6.817161137817607e-11, "loss": 3.1254, "step": 5975 }, { "epoch": 0.9994564535685914, "grad_norm": 3.214143991470337, "learning_rate": 3.834653902468066e-11, "loss": 2.7088, "step": 5976 }, { "epoch": 0.9996236986244094, "grad_norm": 4.297837734222412, "learning_rate": 1.7042908653785993e-11, "loss": 2.857, "step": 5977 }, { "epoch": 0.9997909436802275, "grad_norm": 7.052220821380615, "learning_rate": 4.260727526350649e-12, "loss": 3.0834, "step": 5978 }, { "epoch": 0.9999581887360455, "grad_norm": 6.18245267868042, "learning_rate": 0.0, "loss": 2.4525, "step": 5979 } ], "logging_steps": 1, "max_steps": 5979, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.404064772443835e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }