{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996942830938551, "eval_steps": 500, "global_step": 1635, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.6230373155383, "learning_rate": 2.0000000000000002e-07, "loss": 1.0665, "step": 1 }, { "epoch": 0.0, "grad_norm": 6.01161593196551, "learning_rate": 4.0000000000000003e-07, "loss": 1.0829, "step": 2 }, { "epoch": 0.0, "grad_norm": 5.53722803622308, "learning_rate": 6.000000000000001e-07, "loss": 1.0876, "step": 3 }, { "epoch": 0.0, "grad_norm": 7.93426726662942, "learning_rate": 8.000000000000001e-07, "loss": 1.0719, "step": 4 }, { "epoch": 0.0, "grad_norm": 7.031649978841274, "learning_rate": 1.0000000000000002e-06, "loss": 1.0572, "step": 5 }, { "epoch": 0.0, "grad_norm": 6.3575449397660835, "learning_rate": 1.2000000000000002e-06, "loss": 1.1097, "step": 6 }, { "epoch": 0.0, "grad_norm": 6.573922198067659, "learning_rate": 1.4000000000000001e-06, "loss": 1.1593, "step": 7 }, { "epoch": 0.0, "grad_norm": 5.564979871860141, "learning_rate": 1.6000000000000001e-06, "loss": 1.0538, "step": 8 }, { "epoch": 0.01, "grad_norm": 6.0509131309088655, "learning_rate": 1.8000000000000001e-06, "loss": 1.0715, "step": 9 }, { "epoch": 0.01, "grad_norm": 6.12848977059448, "learning_rate": 2.0000000000000003e-06, "loss": 0.9286, "step": 10 }, { "epoch": 0.01, "grad_norm": 4.916177727066278, "learning_rate": 2.2e-06, "loss": 0.9492, "step": 11 }, { "epoch": 0.01, "grad_norm": 4.8581443117369405, "learning_rate": 2.4000000000000003e-06, "loss": 1.0133, "step": 12 }, { "epoch": 0.01, "grad_norm": 3.908502671210593, "learning_rate": 2.6e-06, "loss": 0.9243, "step": 13 }, { "epoch": 0.01, "grad_norm": 4.034985649544406, "learning_rate": 2.8000000000000003e-06, "loss": 1.0416, "step": 14 }, { "epoch": 0.01, "grad_norm": 3.507758376052119, "learning_rate": 3e-06, "loss": 0.9373, "step": 15 }, { "epoch": 0.01, "grad_norm": 3.663272180369727, "learning_rate": 3.2000000000000003e-06, "loss": 0.8673, "step": 16 }, { "epoch": 0.01, "grad_norm": 3.508069835907157, "learning_rate": 3.4000000000000005e-06, "loss": 0.8894, "step": 17 }, { "epoch": 0.01, "grad_norm": 3.294815456496393, "learning_rate": 3.6000000000000003e-06, "loss": 0.8841, "step": 18 }, { "epoch": 0.01, "grad_norm": 2.877754612416487, "learning_rate": 3.8000000000000005e-06, "loss": 0.768, "step": 19 }, { "epoch": 0.01, "grad_norm": 2.664239443889974, "learning_rate": 4.000000000000001e-06, "loss": 0.7173, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.900279841618844, "learning_rate": 4.2000000000000004e-06, "loss": 0.7689, "step": 21 }, { "epoch": 0.01, "grad_norm": 3.0487383417411658, "learning_rate": 4.4e-06, "loss": 0.7327, "step": 22 }, { "epoch": 0.01, "grad_norm": 2.9928876018893447, "learning_rate": 4.600000000000001e-06, "loss": 0.8763, "step": 23 }, { "epoch": 0.01, "grad_norm": 3.031747010513625, "learning_rate": 4.800000000000001e-06, "loss": 0.812, "step": 24 }, { "epoch": 0.02, "grad_norm": 2.5408522914684863, "learning_rate": 5e-06, "loss": 0.8598, "step": 25 }, { "epoch": 0.02, "grad_norm": 2.7561902015944253, "learning_rate": 5.2e-06, "loss": 0.8967, "step": 26 }, { "epoch": 0.02, "grad_norm": 2.976716291176, "learning_rate": 5.400000000000001e-06, "loss": 0.7226, "step": 27 }, { "epoch": 0.02, "grad_norm": 2.9056079278585227, "learning_rate": 5.600000000000001e-06, "loss": 0.8096, "step": 28 }, { "epoch": 0.02, "grad_norm": 2.4758306534625802, "learning_rate": 5.8e-06, "loss": 0.8044, "step": 29 }, { "epoch": 0.02, "grad_norm": 2.3717118099560217, "learning_rate": 6e-06, "loss": 0.6461, "step": 30 }, { "epoch": 0.02, "grad_norm": 2.6666631455135335, "learning_rate": 6.200000000000001e-06, "loss": 0.8049, "step": 31 }, { "epoch": 0.02, "grad_norm": 2.8733424047344993, "learning_rate": 6.4000000000000006e-06, "loss": 0.8736, "step": 32 }, { "epoch": 0.02, "grad_norm": 2.6648200702201637, "learning_rate": 6.600000000000001e-06, "loss": 0.7235, "step": 33 }, { "epoch": 0.02, "grad_norm": 2.5601473220515056, "learning_rate": 6.800000000000001e-06, "loss": 0.68, "step": 34 }, { "epoch": 0.02, "grad_norm": 2.7840115776082466, "learning_rate": 7e-06, "loss": 0.7816, "step": 35 }, { "epoch": 0.02, "grad_norm": 2.524287013051412, "learning_rate": 7.2000000000000005e-06, "loss": 0.7471, "step": 36 }, { "epoch": 0.02, "grad_norm": 2.4550748912153613, "learning_rate": 7.4e-06, "loss": 0.7719, "step": 37 }, { "epoch": 0.02, "grad_norm": 3.1526664248369936, "learning_rate": 7.600000000000001e-06, "loss": 0.8696, "step": 38 }, { "epoch": 0.02, "grad_norm": 2.6121499364302383, "learning_rate": 7.800000000000002e-06, "loss": 0.8445, "step": 39 }, { "epoch": 0.02, "grad_norm": 2.862520896543254, "learning_rate": 8.000000000000001e-06, "loss": 0.7393, "step": 40 }, { "epoch": 0.03, "grad_norm": 2.4539611276002877, "learning_rate": 8.2e-06, "loss": 0.7101, "step": 41 }, { "epoch": 0.03, "grad_norm": 2.4407347141807656, "learning_rate": 8.400000000000001e-06, "loss": 0.6975, "step": 42 }, { "epoch": 0.03, "grad_norm": 2.2749278520807903, "learning_rate": 8.6e-06, "loss": 0.6533, "step": 43 }, { "epoch": 0.03, "grad_norm": 2.666183851086396, "learning_rate": 8.8e-06, "loss": 0.7896, "step": 44 }, { "epoch": 0.03, "grad_norm": 2.6380466110589214, "learning_rate": 9e-06, "loss": 0.7127, "step": 45 }, { "epoch": 0.03, "grad_norm": 2.6229206709830577, "learning_rate": 9.200000000000002e-06, "loss": 0.8321, "step": 46 }, { "epoch": 0.03, "grad_norm": 2.439693927487141, "learning_rate": 9.4e-06, "loss": 0.6645, "step": 47 }, { "epoch": 0.03, "grad_norm": 2.3073039773943127, "learning_rate": 9.600000000000001e-06, "loss": 0.6964, "step": 48 }, { "epoch": 0.03, "grad_norm": 2.4261514181880757, "learning_rate": 9.800000000000001e-06, "loss": 0.7328, "step": 49 }, { "epoch": 0.03, "grad_norm": 2.6298961031353434, "learning_rate": 1e-05, "loss": 0.7843, "step": 50 }, { "epoch": 0.03, "grad_norm": 2.368357019452094, "learning_rate": 9.999990178426327e-06, "loss": 0.668, "step": 51 }, { "epoch": 0.03, "grad_norm": 2.7183273232506373, "learning_rate": 9.999960713743888e-06, "loss": 0.9064, "step": 52 }, { "epoch": 0.03, "grad_norm": 2.4366531712753416, "learning_rate": 9.99991160606844e-06, "loss": 0.6861, "step": 53 }, { "epoch": 0.03, "grad_norm": 2.594914984056206, "learning_rate": 9.999842855592912e-06, "loss": 0.6947, "step": 54 }, { "epoch": 0.03, "grad_norm": 2.631921187839759, "learning_rate": 9.999754462587396e-06, "loss": 0.8039, "step": 55 }, { "epoch": 0.03, "grad_norm": 2.5158584160059085, "learning_rate": 9.999646427399155e-06, "loss": 0.7604, "step": 56 }, { "epoch": 0.03, "grad_norm": 2.4520389413039703, "learning_rate": 9.999518750452622e-06, "loss": 0.8429, "step": 57 }, { "epoch": 0.04, "grad_norm": 2.511219649765081, "learning_rate": 9.99937143224939e-06, "loss": 0.6068, "step": 58 }, { "epoch": 0.04, "grad_norm": 2.2738544213871683, "learning_rate": 9.999204473368218e-06, "loss": 0.7126, "step": 59 }, { "epoch": 0.04, "grad_norm": 2.2995201634238365, "learning_rate": 9.999017874465028e-06, "loss": 0.7117, "step": 60 }, { "epoch": 0.04, "grad_norm": 2.3985761980411646, "learning_rate": 9.998811636272893e-06, "loss": 0.6838, "step": 61 }, { "epoch": 0.04, "grad_norm": 2.2586065910900595, "learning_rate": 9.998585759602052e-06, "loss": 0.6853, "step": 62 }, { "epoch": 0.04, "grad_norm": 2.585278752939056, "learning_rate": 9.998340245339888e-06, "loss": 0.8295, "step": 63 }, { "epoch": 0.04, "grad_norm": 2.4881456153581327, "learning_rate": 9.998075094450935e-06, "loss": 0.6653, "step": 64 }, { "epoch": 0.04, "grad_norm": 2.2548663647589096, "learning_rate": 9.997790307976874e-06, "loss": 0.6354, "step": 65 }, { "epoch": 0.04, "grad_norm": 2.229109711289454, "learning_rate": 9.997485887036524e-06, "loss": 0.6932, "step": 66 }, { "epoch": 0.04, "grad_norm": 2.452224143427515, "learning_rate": 9.997161832825843e-06, "loss": 0.7654, "step": 67 }, { "epoch": 0.04, "grad_norm": 2.2297142951513824, "learning_rate": 9.996818146617922e-06, "loss": 0.6906, "step": 68 }, { "epoch": 0.04, "grad_norm": 2.2791022191610946, "learning_rate": 9.996454829762973e-06, "loss": 0.754, "step": 69 }, { "epoch": 0.04, "grad_norm": 2.4219124929553515, "learning_rate": 9.996071883688333e-06, "loss": 0.6764, "step": 70 }, { "epoch": 0.04, "grad_norm": 2.3501944220961146, "learning_rate": 9.99566930989846e-06, "loss": 0.7615, "step": 71 }, { "epoch": 0.04, "grad_norm": 2.344366272975179, "learning_rate": 9.995247109974915e-06, "loss": 0.7134, "step": 72 }, { "epoch": 0.04, "grad_norm": 2.2314263082377934, "learning_rate": 9.994805285576364e-06, "loss": 0.7631, "step": 73 }, { "epoch": 0.05, "grad_norm": 2.2834194270284636, "learning_rate": 9.99434383843857e-06, "loss": 0.8618, "step": 74 }, { "epoch": 0.05, "grad_norm": 2.1338375917815813, "learning_rate": 9.99386277037439e-06, "loss": 0.6911, "step": 75 }, { "epoch": 0.05, "grad_norm": 2.3756649144864954, "learning_rate": 9.993362083273763e-06, "loss": 0.6907, "step": 76 }, { "epoch": 0.05, "grad_norm": 2.243315382854752, "learning_rate": 9.992841779103701e-06, "loss": 0.7424, "step": 77 }, { "epoch": 0.05, "grad_norm": 2.2756590145094773, "learning_rate": 9.992301859908289e-06, "loss": 0.7107, "step": 78 }, { "epoch": 0.05, "grad_norm": 2.4499338417790324, "learning_rate": 9.991742327808667e-06, "loss": 0.6014, "step": 79 }, { "epoch": 0.05, "grad_norm": 2.5224681026150524, "learning_rate": 9.991163185003028e-06, "loss": 0.7545, "step": 80 }, { "epoch": 0.05, "grad_norm": 2.255631190324587, "learning_rate": 9.990564433766615e-06, "loss": 0.6931, "step": 81 }, { "epoch": 0.05, "grad_norm": 2.3236984572992045, "learning_rate": 9.989946076451693e-06, "loss": 0.7708, "step": 82 }, { "epoch": 0.05, "grad_norm": 2.3364421780529887, "learning_rate": 9.989308115487563e-06, "loss": 0.6633, "step": 83 }, { "epoch": 0.05, "grad_norm": 2.4163016525364336, "learning_rate": 9.988650553380537e-06, "loss": 0.7195, "step": 84 }, { "epoch": 0.05, "grad_norm": 2.296872674947527, "learning_rate": 9.987973392713932e-06, "loss": 0.6912, "step": 85 }, { "epoch": 0.05, "grad_norm": 2.236387998298346, "learning_rate": 9.987276636148062e-06, "loss": 0.6737, "step": 86 }, { "epoch": 0.05, "grad_norm": 2.277025205153288, "learning_rate": 9.986560286420224e-06, "loss": 0.7312, "step": 87 }, { "epoch": 0.05, "grad_norm": 2.200531820532158, "learning_rate": 9.985824346344692e-06, "loss": 0.6251, "step": 88 }, { "epoch": 0.05, "grad_norm": 2.488944948555403, "learning_rate": 9.9850688188127e-06, "loss": 0.7303, "step": 89 }, { "epoch": 0.06, "grad_norm": 2.260371060935904, "learning_rate": 9.984293706792438e-06, "loss": 0.7546, "step": 90 }, { "epoch": 0.06, "grad_norm": 2.4449103602612245, "learning_rate": 9.983499013329035e-06, "loss": 0.7119, "step": 91 }, { "epoch": 0.06, "grad_norm": 2.0759253415676246, "learning_rate": 9.982684741544543e-06, "loss": 0.6844, "step": 92 }, { "epoch": 0.06, "grad_norm": 2.387552887692122, "learning_rate": 9.981850894637937e-06, "loss": 0.6649, "step": 93 }, { "epoch": 0.06, "grad_norm": 2.3995087357031797, "learning_rate": 9.980997475885092e-06, "loss": 0.6547, "step": 94 }, { "epoch": 0.06, "grad_norm": 2.0904111698207495, "learning_rate": 9.980124488638774e-06, "loss": 0.6566, "step": 95 }, { "epoch": 0.06, "grad_norm": 2.156307458383918, "learning_rate": 9.979231936328627e-06, "loss": 0.6928, "step": 96 }, { "epoch": 0.06, "grad_norm": 2.24711153397962, "learning_rate": 9.978319822461156e-06, "loss": 0.6853, "step": 97 }, { "epoch": 0.06, "grad_norm": 2.1649921192352317, "learning_rate": 9.97738815061972e-06, "loss": 0.7694, "step": 98 }, { "epoch": 0.06, "grad_norm": 2.374786305390187, "learning_rate": 9.976436924464513e-06, "loss": 0.5882, "step": 99 }, { "epoch": 0.06, "grad_norm": 2.4306832331036436, "learning_rate": 9.975466147732551e-06, "loss": 0.7988, "step": 100 }, { "epoch": 0.06, "grad_norm": 2.407124728247618, "learning_rate": 9.974475824237653e-06, "loss": 0.7287, "step": 101 }, { "epoch": 0.06, "grad_norm": 2.264722126370796, "learning_rate": 9.973465957870437e-06, "loss": 0.629, "step": 102 }, { "epoch": 0.06, "grad_norm": 2.1005973714933273, "learning_rate": 9.972436552598287e-06, "loss": 0.6619, "step": 103 }, { "epoch": 0.06, "grad_norm": 2.270262489712689, "learning_rate": 9.971387612465364e-06, "loss": 0.7143, "step": 104 }, { "epoch": 0.06, "grad_norm": 2.265439764692705, "learning_rate": 9.970319141592559e-06, "loss": 0.7442, "step": 105 }, { "epoch": 0.06, "grad_norm": 2.4319715531705213, "learning_rate": 9.9692311441775e-06, "loss": 0.7737, "step": 106 }, { "epoch": 0.07, "grad_norm": 2.491658969929883, "learning_rate": 9.968123624494525e-06, "loss": 0.7946, "step": 107 }, { "epoch": 0.07, "grad_norm": 2.027048425981043, "learning_rate": 9.966996586894669e-06, "loss": 0.6461, "step": 108 }, { "epoch": 0.07, "grad_norm": 2.3372539030580874, "learning_rate": 9.965850035805647e-06, "loss": 0.6859, "step": 109 }, { "epoch": 0.07, "grad_norm": 2.2435812305648057, "learning_rate": 9.964683975731828e-06, "loss": 0.7748, "step": 110 }, { "epoch": 0.07, "grad_norm": 2.315025140176691, "learning_rate": 9.963498411254235e-06, "loss": 0.659, "step": 111 }, { "epoch": 0.07, "grad_norm": 2.074191768339382, "learning_rate": 9.96229334703051e-06, "loss": 0.6609, "step": 112 }, { "epoch": 0.07, "grad_norm": 2.145042227584639, "learning_rate": 9.961068787794905e-06, "loss": 0.6756, "step": 113 }, { "epoch": 0.07, "grad_norm": 2.131588951231997, "learning_rate": 9.959824738358257e-06, "loss": 0.6334, "step": 114 }, { "epoch": 0.07, "grad_norm": 2.069665378872537, "learning_rate": 9.958561203607975e-06, "loss": 0.6443, "step": 115 }, { "epoch": 0.07, "grad_norm": 2.3057314770322646, "learning_rate": 9.957278188508023e-06, "loss": 0.7952, "step": 116 }, { "epoch": 0.07, "grad_norm": 2.2378897605544474, "learning_rate": 9.955975698098887e-06, "loss": 0.7272, "step": 117 }, { "epoch": 0.07, "grad_norm": 2.0307818678260654, "learning_rate": 9.954653737497573e-06, "loss": 0.5701, "step": 118 }, { "epoch": 0.07, "grad_norm": 2.24162131248661, "learning_rate": 9.953312311897573e-06, "loss": 0.7793, "step": 119 }, { "epoch": 0.07, "grad_norm": 2.3552427687921393, "learning_rate": 9.951951426568852e-06, "loss": 0.7209, "step": 120 }, { "epoch": 0.07, "grad_norm": 2.141178442038793, "learning_rate": 9.950571086857821e-06, "loss": 0.6716, "step": 121 }, { "epoch": 0.07, "grad_norm": 2.140261901239331, "learning_rate": 9.949171298187328e-06, "loss": 0.6743, "step": 122 }, { "epoch": 0.08, "grad_norm": 2.4084672072357907, "learning_rate": 9.94775206605662e-06, "loss": 0.7973, "step": 123 }, { "epoch": 0.08, "grad_norm": 2.025773154875324, "learning_rate": 9.946313396041334e-06, "loss": 0.7025, "step": 124 }, { "epoch": 0.08, "grad_norm": 2.150720688365092, "learning_rate": 9.944855293793477e-06, "loss": 0.6128, "step": 125 }, { "epoch": 0.08, "grad_norm": 2.161613123811631, "learning_rate": 9.943377765041385e-06, "loss": 0.6306, "step": 126 }, { "epoch": 0.08, "grad_norm": 2.593842849691437, "learning_rate": 9.941880815589726e-06, "loss": 0.5894, "step": 127 }, { "epoch": 0.08, "grad_norm": 2.504320662466919, "learning_rate": 9.94036445131946e-06, "loss": 0.772, "step": 128 }, { "epoch": 0.08, "grad_norm": 2.1848467219695498, "learning_rate": 9.938828678187816e-06, "loss": 0.6397, "step": 129 }, { "epoch": 0.08, "grad_norm": 2.27661938480736, "learning_rate": 9.937273502228283e-06, "loss": 0.6975, "step": 130 }, { "epoch": 0.08, "grad_norm": 2.3673651764999573, "learning_rate": 9.935698929550565e-06, "loss": 0.7621, "step": 131 }, { "epoch": 0.08, "grad_norm": 2.26639915751491, "learning_rate": 9.934104966340582e-06, "loss": 0.6551, "step": 132 }, { "epoch": 0.08, "grad_norm": 2.3961035912104927, "learning_rate": 9.932491618860419e-06, "loss": 0.7304, "step": 133 }, { "epoch": 0.08, "grad_norm": 2.2515245018919505, "learning_rate": 9.93085889344832e-06, "loss": 0.6655, "step": 134 }, { "epoch": 0.08, "grad_norm": 2.254087375166334, "learning_rate": 9.929206796518663e-06, "loss": 0.666, "step": 135 }, { "epoch": 0.08, "grad_norm": 2.31262707571602, "learning_rate": 9.927535334561922e-06, "loss": 0.7362, "step": 136 }, { "epoch": 0.08, "grad_norm": 2.443869927919863, "learning_rate": 9.925844514144651e-06, "loss": 0.6805, "step": 137 }, { "epoch": 0.08, "grad_norm": 2.2594952238504904, "learning_rate": 9.924134341909459e-06, "loss": 0.6936, "step": 138 }, { "epoch": 0.08, "grad_norm": 2.3645395585503013, "learning_rate": 9.922404824574976e-06, "loss": 0.6318, "step": 139 }, { "epoch": 0.09, "grad_norm": 1.952498408951814, "learning_rate": 9.920655968935839e-06, "loss": 0.6884, "step": 140 }, { "epoch": 0.09, "grad_norm": 1.9398379720377739, "learning_rate": 9.91888778186265e-06, "loss": 0.5678, "step": 141 }, { "epoch": 0.09, "grad_norm": 2.213422596923249, "learning_rate": 9.917100270301963e-06, "loss": 0.6868, "step": 142 }, { "epoch": 0.09, "grad_norm": 2.2025927290666307, "learning_rate": 9.915293441276246e-06, "loss": 0.7192, "step": 143 }, { "epoch": 0.09, "grad_norm": 2.80775911051946, "learning_rate": 9.913467301883863e-06, "loss": 0.784, "step": 144 }, { "epoch": 0.09, "grad_norm": 2.0925302922228175, "learning_rate": 9.91162185929904e-06, "loss": 0.7198, "step": 145 }, { "epoch": 0.09, "grad_norm": 2.334842810434387, "learning_rate": 9.909757120771835e-06, "loss": 0.7402, "step": 146 }, { "epoch": 0.09, "grad_norm": 2.3908201091336805, "learning_rate": 9.907873093628115e-06, "loss": 0.6787, "step": 147 }, { "epoch": 0.09, "grad_norm": 2.0776209566634316, "learning_rate": 9.905969785269527e-06, "loss": 0.6842, "step": 148 }, { "epoch": 0.09, "grad_norm": 1.997004958324354, "learning_rate": 9.904047203173462e-06, "loss": 0.5733, "step": 149 }, { "epoch": 0.09, "grad_norm": 2.3139630172641508, "learning_rate": 9.90210535489303e-06, "loss": 0.6647, "step": 150 }, { "epoch": 0.09, "grad_norm": 1.9690194811261275, "learning_rate": 9.90014424805704e-06, "loss": 0.7311, "step": 151 }, { "epoch": 0.09, "grad_norm": 2.3995016349539617, "learning_rate": 9.898163890369948e-06, "loss": 0.7473, "step": 152 }, { "epoch": 0.09, "grad_norm": 2.1433446483892324, "learning_rate": 9.896164289611849e-06, "loss": 0.7016, "step": 153 }, { "epoch": 0.09, "grad_norm": 2.190153295128278, "learning_rate": 9.894145453638433e-06, "loss": 0.67, "step": 154 }, { "epoch": 0.09, "grad_norm": 2.4110621078855394, "learning_rate": 9.892107390380959e-06, "loss": 0.6655, "step": 155 }, { "epoch": 0.1, "grad_norm": 2.2675629481717667, "learning_rate": 9.890050107846219e-06, "loss": 0.6459, "step": 156 }, { "epoch": 0.1, "grad_norm": 2.274763942094237, "learning_rate": 9.887973614116517e-06, "loss": 0.6077, "step": 157 }, { "epoch": 0.1, "grad_norm": 2.4805064767185314, "learning_rate": 9.885877917349626e-06, "loss": 0.7247, "step": 158 }, { "epoch": 0.1, "grad_norm": 2.002019535180498, "learning_rate": 9.883763025778766e-06, "loss": 0.5373, "step": 159 }, { "epoch": 0.1, "grad_norm": 2.337028939246668, "learning_rate": 9.881628947712556e-06, "loss": 0.7776, "step": 160 }, { "epoch": 0.1, "grad_norm": 2.1479302094948247, "learning_rate": 9.879475691535e-06, "loss": 0.6499, "step": 161 }, { "epoch": 0.1, "grad_norm": 1.9593814752135854, "learning_rate": 9.87730326570545e-06, "loss": 0.5575, "step": 162 }, { "epoch": 0.1, "grad_norm": 2.3120091764398714, "learning_rate": 9.875111678758553e-06, "loss": 0.8117, "step": 163 }, { "epoch": 0.1, "grad_norm": 2.58203181614075, "learning_rate": 9.872900939304246e-06, "loss": 0.7774, "step": 164 }, { "epoch": 0.1, "grad_norm": 2.1359404314304173, "learning_rate": 9.870671056027705e-06, "loss": 0.7738, "step": 165 }, { "epoch": 0.1, "grad_norm": 2.0558218346618484, "learning_rate": 9.868422037689316e-06, "loss": 0.6216, "step": 166 }, { "epoch": 0.1, "grad_norm": 2.243529236580866, "learning_rate": 9.866153893124638e-06, "loss": 0.6684, "step": 167 }, { "epoch": 0.1, "grad_norm": 2.1145868718408316, "learning_rate": 9.863866631244371e-06, "loss": 0.6847, "step": 168 }, { "epoch": 0.1, "grad_norm": 1.9024551152535656, "learning_rate": 9.861560261034319e-06, "loss": 0.5933, "step": 169 }, { "epoch": 0.1, "grad_norm": 2.1434847962939396, "learning_rate": 9.859234791555356e-06, "loss": 0.6503, "step": 170 }, { "epoch": 0.1, "grad_norm": 2.1711654094618713, "learning_rate": 9.856890231943389e-06, "loss": 0.6768, "step": 171 }, { "epoch": 0.11, "grad_norm": 2.4118520851923524, "learning_rate": 9.854526591409325e-06, "loss": 0.7787, "step": 172 }, { "epoch": 0.11, "grad_norm": 2.0666041021999977, "learning_rate": 9.852143879239032e-06, "loss": 0.6304, "step": 173 }, { "epoch": 0.11, "grad_norm": 2.0942561413873806, "learning_rate": 9.849742104793303e-06, "loss": 0.659, "step": 174 }, { "epoch": 0.11, "grad_norm": 1.9464606223168837, "learning_rate": 9.847321277507821e-06, "loss": 0.5119, "step": 175 }, { "epoch": 0.11, "grad_norm": 2.0838258311462443, "learning_rate": 9.844881406893118e-06, "loss": 0.6413, "step": 176 }, { "epoch": 0.11, "grad_norm": 2.2413046501319136, "learning_rate": 9.842422502534542e-06, "loss": 0.6781, "step": 177 }, { "epoch": 0.11, "grad_norm": 2.0434533347433392, "learning_rate": 9.839944574092215e-06, "loss": 0.7173, "step": 178 }, { "epoch": 0.11, "grad_norm": 2.0756948950274956, "learning_rate": 9.837447631301003e-06, "loss": 0.691, "step": 179 }, { "epoch": 0.11, "grad_norm": 2.096621527142799, "learning_rate": 9.834931683970468e-06, "loss": 0.6164, "step": 180 }, { "epoch": 0.11, "grad_norm": 2.325512127795748, "learning_rate": 9.832396741984834e-06, "loss": 0.7617, "step": 181 }, { "epoch": 0.11, "grad_norm": 2.1101717813557723, "learning_rate": 9.829842815302951e-06, "loss": 0.5708, "step": 182 }, { "epoch": 0.11, "grad_norm": 2.1629292221166567, "learning_rate": 9.827269913958247e-06, "loss": 0.7347, "step": 183 }, { "epoch": 0.11, "grad_norm": 2.3197439288304906, "learning_rate": 9.8246780480587e-06, "loss": 0.7264, "step": 184 }, { "epoch": 0.11, "grad_norm": 2.1964017610148843, "learning_rate": 9.822067227786794e-06, "loss": 0.759, "step": 185 }, { "epoch": 0.11, "grad_norm": 2.272214121168636, "learning_rate": 9.819437463399468e-06, "loss": 0.6904, "step": 186 }, { "epoch": 0.11, "grad_norm": 2.3994827721770684, "learning_rate": 9.816788765228095e-06, "loss": 0.7399, "step": 187 }, { "epoch": 0.11, "grad_norm": 2.111638499108041, "learning_rate": 9.81412114367843e-06, "loss": 0.7612, "step": 188 }, { "epoch": 0.12, "grad_norm": 1.9840878981791865, "learning_rate": 9.81143460923057e-06, "loss": 0.5875, "step": 189 }, { "epoch": 0.12, "grad_norm": 2.1652359918123167, "learning_rate": 9.808729172438909e-06, "loss": 0.724, "step": 190 }, { "epoch": 0.12, "grad_norm": 1.9395882749114965, "learning_rate": 9.80600484393211e-06, "loss": 0.5528, "step": 191 }, { "epoch": 0.12, "grad_norm": 2.1604646292422527, "learning_rate": 9.803261634413049e-06, "loss": 0.5707, "step": 192 }, { "epoch": 0.12, "grad_norm": 2.016941878380451, "learning_rate": 9.80049955465878e-06, "loss": 0.6385, "step": 193 }, { "epoch": 0.12, "grad_norm": 2.532345965216657, "learning_rate": 9.797718615520488e-06, "loss": 0.6938, "step": 194 }, { "epoch": 0.12, "grad_norm": 2.2511667943525038, "learning_rate": 9.794918827923458e-06, "loss": 0.753, "step": 195 }, { "epoch": 0.12, "grad_norm": 2.1447413054072517, "learning_rate": 9.792100202867014e-06, "loss": 0.6697, "step": 196 }, { "epoch": 0.12, "grad_norm": 2.189313732305628, "learning_rate": 9.78926275142449e-06, "loss": 0.616, "step": 197 }, { "epoch": 0.12, "grad_norm": 2.268919346618177, "learning_rate": 9.786406484743183e-06, "loss": 0.7412, "step": 198 }, { "epoch": 0.12, "grad_norm": 2.2333800757193276, "learning_rate": 9.783531414044304e-06, "loss": 0.6923, "step": 199 }, { "epoch": 0.12, "grad_norm": 2.055136095603475, "learning_rate": 9.780637550622943e-06, "loss": 0.7031, "step": 200 }, { "epoch": 0.12, "grad_norm": 2.226669249666592, "learning_rate": 9.777724905848013e-06, "loss": 0.7531, "step": 201 }, { "epoch": 0.12, "grad_norm": 2.232153739326395, "learning_rate": 9.774793491162221e-06, "loss": 0.6133, "step": 202 }, { "epoch": 0.12, "grad_norm": 2.264407351963802, "learning_rate": 9.771843318082008e-06, "loss": 0.5952, "step": 203 }, { "epoch": 0.12, "grad_norm": 2.294784666626483, "learning_rate": 9.76887439819751e-06, "loss": 0.8448, "step": 204 }, { "epoch": 0.13, "grad_norm": 2.1307772179440936, "learning_rate": 9.765886743172512e-06, "loss": 0.5948, "step": 205 }, { "epoch": 0.13, "grad_norm": 1.83961485752976, "learning_rate": 9.762880364744404e-06, "loss": 0.5447, "step": 206 }, { "epoch": 0.13, "grad_norm": 2.031812871238954, "learning_rate": 9.759855274724137e-06, "loss": 0.6538, "step": 207 }, { "epoch": 0.13, "grad_norm": 2.0040113831634745, "learning_rate": 9.756811484996162e-06, "loss": 0.6421, "step": 208 }, { "epoch": 0.13, "grad_norm": 1.7984932798626254, "learning_rate": 9.753749007518407e-06, "loss": 0.5157, "step": 209 }, { "epoch": 0.13, "grad_norm": 2.0477874542114916, "learning_rate": 9.750667854322207e-06, "loss": 0.6199, "step": 210 }, { "epoch": 0.13, "grad_norm": 2.206751428653727, "learning_rate": 9.747568037512274e-06, "loss": 0.6161, "step": 211 }, { "epoch": 0.13, "grad_norm": 2.3285004622852767, "learning_rate": 9.744449569266637e-06, "loss": 0.7607, "step": 212 }, { "epoch": 0.13, "grad_norm": 2.1011118330445475, "learning_rate": 9.741312461836606e-06, "loss": 0.666, "step": 213 }, { "epoch": 0.13, "grad_norm": 2.397695659444179, "learning_rate": 9.738156727546711e-06, "loss": 0.7105, "step": 214 }, { "epoch": 0.13, "grad_norm": 1.91331183170689, "learning_rate": 9.734982378794662e-06, "loss": 0.619, "step": 215 }, { "epoch": 0.13, "grad_norm": 2.1362268650914125, "learning_rate": 9.731789428051302e-06, "loss": 0.7317, "step": 216 }, { "epoch": 0.13, "grad_norm": 2.04421825962035, "learning_rate": 9.72857788786055e-06, "loss": 0.6309, "step": 217 }, { "epoch": 0.13, "grad_norm": 2.1550284488031473, "learning_rate": 9.725347770839356e-06, "loss": 0.6768, "step": 218 }, { "epoch": 0.13, "grad_norm": 2.049439369305375, "learning_rate": 9.722099089677655e-06, "loss": 0.6423, "step": 219 }, { "epoch": 0.13, "grad_norm": 2.122940983855365, "learning_rate": 9.718831857138308e-06, "loss": 0.6345, "step": 220 }, { "epoch": 0.14, "grad_norm": 2.274655471484878, "learning_rate": 9.715546086057066e-06, "loss": 0.5792, "step": 221 }, { "epoch": 0.14, "grad_norm": 1.8385884175738376, "learning_rate": 9.712241789342504e-06, "loss": 0.656, "step": 222 }, { "epoch": 0.14, "grad_norm": 2.2460907183322933, "learning_rate": 9.708918979975982e-06, "loss": 0.6417, "step": 223 }, { "epoch": 0.14, "grad_norm": 1.9323218121201529, "learning_rate": 9.705577671011579e-06, "loss": 0.6371, "step": 224 }, { "epoch": 0.14, "grad_norm": 2.0382284797399293, "learning_rate": 9.702217875576067e-06, "loss": 0.591, "step": 225 }, { "epoch": 0.14, "grad_norm": 2.0698398210796567, "learning_rate": 9.698839606868835e-06, "loss": 0.5794, "step": 226 }, { "epoch": 0.14, "grad_norm": 1.9440879438361034, "learning_rate": 9.69544287816185e-06, "loss": 0.6745, "step": 227 }, { "epoch": 0.14, "grad_norm": 1.9900929740524849, "learning_rate": 9.6920277027996e-06, "loss": 0.6757, "step": 228 }, { "epoch": 0.14, "grad_norm": 2.1940921838343446, "learning_rate": 9.688594094199043e-06, "loss": 0.6472, "step": 229 }, { "epoch": 0.14, "grad_norm": 2.1958440427756636, "learning_rate": 9.685142065849556e-06, "loss": 0.7342, "step": 230 }, { "epoch": 0.14, "grad_norm": 2.165725186559193, "learning_rate": 9.681671631312876e-06, "loss": 0.6485, "step": 231 }, { "epoch": 0.14, "grad_norm": 2.1894781279792443, "learning_rate": 9.67818280422306e-06, "loss": 0.6896, "step": 232 }, { "epoch": 0.14, "grad_norm": 2.195740186965468, "learning_rate": 9.674675598286414e-06, "loss": 0.6974, "step": 233 }, { "epoch": 0.14, "grad_norm": 2.2452322721170668, "learning_rate": 9.671150027281449e-06, "loss": 0.6163, "step": 234 }, { "epoch": 0.14, "grad_norm": 2.326336053478045, "learning_rate": 9.667606105058828e-06, "loss": 0.6448, "step": 235 }, { "epoch": 0.14, "grad_norm": 2.0032814883659036, "learning_rate": 9.66404384554131e-06, "loss": 0.619, "step": 236 }, { "epoch": 0.14, "grad_norm": 1.932037123804567, "learning_rate": 9.660463262723691e-06, "loss": 0.5897, "step": 237 }, { "epoch": 0.15, "grad_norm": 2.340804976271579, "learning_rate": 9.656864370672757e-06, "loss": 0.8023, "step": 238 }, { "epoch": 0.15, "grad_norm": 2.1022457172205327, "learning_rate": 9.653247183527216e-06, "loss": 0.7218, "step": 239 }, { "epoch": 0.15, "grad_norm": 1.8423012822099027, "learning_rate": 9.649611715497662e-06, "loss": 0.6005, "step": 240 }, { "epoch": 0.15, "grad_norm": 1.8546169042416565, "learning_rate": 9.645957980866499e-06, "loss": 0.5888, "step": 241 }, { "epoch": 0.15, "grad_norm": 1.9846561311341997, "learning_rate": 9.642285993987895e-06, "loss": 0.5579, "step": 242 }, { "epoch": 0.15, "grad_norm": 2.239777371231001, "learning_rate": 9.63859576928773e-06, "loss": 0.755, "step": 243 }, { "epoch": 0.15, "grad_norm": 2.0594775910705083, "learning_rate": 9.634887321263525e-06, "loss": 0.6442, "step": 244 }, { "epoch": 0.15, "grad_norm": 2.176249216011953, "learning_rate": 9.631160664484398e-06, "loss": 0.8016, "step": 245 }, { "epoch": 0.15, "grad_norm": 2.085520659410642, "learning_rate": 9.627415813591007e-06, "loss": 0.6368, "step": 246 }, { "epoch": 0.15, "grad_norm": 2.021294737304931, "learning_rate": 9.623652783295483e-06, "loss": 0.5614, "step": 247 }, { "epoch": 0.15, "grad_norm": 2.231835038374354, "learning_rate": 9.619871588381376e-06, "loss": 0.7216, "step": 248 }, { "epoch": 0.15, "grad_norm": 1.910271586943225, "learning_rate": 9.616072243703598e-06, "loss": 0.5791, "step": 249 }, { "epoch": 0.15, "grad_norm": 1.8003560124729412, "learning_rate": 9.612254764188368e-06, "loss": 0.5448, "step": 250 }, { "epoch": 0.15, "grad_norm": 2.2007549176054404, "learning_rate": 9.608419164833152e-06, "loss": 0.7257, "step": 251 }, { "epoch": 0.15, "grad_norm": 2.0571622186487044, "learning_rate": 9.604565460706592e-06, "loss": 0.6335, "step": 252 }, { "epoch": 0.15, "grad_norm": 2.2497466958972154, "learning_rate": 9.60069366694847e-06, "loss": 0.6597, "step": 253 }, { "epoch": 0.16, "grad_norm": 1.9612178789599213, "learning_rate": 9.596803798769626e-06, "loss": 0.7287, "step": 254 }, { "epoch": 0.16, "grad_norm": 2.1104419163141115, "learning_rate": 9.592895871451908e-06, "loss": 0.6671, "step": 255 }, { "epoch": 0.16, "grad_norm": 2.0822889988204305, "learning_rate": 9.58896990034812e-06, "loss": 0.7013, "step": 256 }, { "epoch": 0.16, "grad_norm": 2.248918383189871, "learning_rate": 9.585025900881944e-06, "loss": 0.7042, "step": 257 }, { "epoch": 0.16, "grad_norm": 2.0495739015390857, "learning_rate": 9.581063888547895e-06, "loss": 0.6913, "step": 258 }, { "epoch": 0.16, "grad_norm": 1.759437262151785, "learning_rate": 9.57708387891125e-06, "loss": 0.5709, "step": 259 }, { "epoch": 0.16, "grad_norm": 2.420770662182739, "learning_rate": 9.573085887607991e-06, "loss": 0.6814, "step": 260 }, { "epoch": 0.16, "grad_norm": 2.130894942110269, "learning_rate": 9.569069930344746e-06, "loss": 0.6187, "step": 261 }, { "epoch": 0.16, "grad_norm": 1.9591579536191646, "learning_rate": 9.565036022898723e-06, "loss": 0.5882, "step": 262 }, { "epoch": 0.16, "grad_norm": 1.8084671651408435, "learning_rate": 9.56098418111765e-06, "loss": 0.6313, "step": 263 }, { "epoch": 0.16, "grad_norm": 2.1526443073933086, "learning_rate": 9.556914420919711e-06, "loss": 0.6102, "step": 264 }, { "epoch": 0.16, "grad_norm": 2.194691833732068, "learning_rate": 9.552826758293487e-06, "loss": 0.6878, "step": 265 }, { "epoch": 0.16, "grad_norm": 2.501846785947928, "learning_rate": 9.548721209297889e-06, "loss": 0.6596, "step": 266 }, { "epoch": 0.16, "grad_norm": 2.0669308931128123, "learning_rate": 9.544597790062098e-06, "loss": 0.6224, "step": 267 }, { "epoch": 0.16, "grad_norm": 2.2681106555575052, "learning_rate": 9.5404565167855e-06, "loss": 0.6786, "step": 268 }, { "epoch": 0.16, "grad_norm": 2.0576613963336445, "learning_rate": 9.536297405737624e-06, "loss": 0.5946, "step": 269 }, { "epoch": 0.17, "grad_norm": 2.0426855396207264, "learning_rate": 9.532120473258075e-06, "loss": 0.641, "step": 270 }, { "epoch": 0.17, "grad_norm": 2.1061182708873973, "learning_rate": 9.527925735756473e-06, "loss": 0.6379, "step": 271 }, { "epoch": 0.17, "grad_norm": 1.6795883204795699, "learning_rate": 9.52371320971239e-06, "loss": 0.5449, "step": 272 }, { "epoch": 0.17, "grad_norm": 1.9653754600396853, "learning_rate": 9.519482911675278e-06, "loss": 0.6875, "step": 273 }, { "epoch": 0.17, "grad_norm": 2.1944241074929534, "learning_rate": 9.51523485826441e-06, "loss": 0.6387, "step": 274 }, { "epoch": 0.17, "grad_norm": 2.2621502173726418, "learning_rate": 9.510969066168814e-06, "loss": 0.7567, "step": 275 }, { "epoch": 0.17, "grad_norm": 2.0713889312949623, "learning_rate": 9.506685552147208e-06, "loss": 0.8003, "step": 276 }, { "epoch": 0.17, "grad_norm": 2.161082236049816, "learning_rate": 9.502384333027929e-06, "loss": 0.7317, "step": 277 }, { "epoch": 0.17, "grad_norm": 2.099935757711094, "learning_rate": 9.498065425708878e-06, "loss": 0.6365, "step": 278 }, { "epoch": 0.17, "grad_norm": 2.0269709971121768, "learning_rate": 9.493728847157436e-06, "loss": 0.6275, "step": 279 }, { "epoch": 0.17, "grad_norm": 1.9361261922037705, "learning_rate": 9.489374614410413e-06, "loss": 0.6505, "step": 280 }, { "epoch": 0.17, "grad_norm": 2.276393660154319, "learning_rate": 9.485002744573982e-06, "loss": 0.7315, "step": 281 }, { "epoch": 0.17, "grad_norm": 1.9425800587711943, "learning_rate": 9.480613254823595e-06, "loss": 0.6143, "step": 282 }, { "epoch": 0.17, "grad_norm": 2.2697653664784534, "learning_rate": 9.476206162403933e-06, "loss": 0.6727, "step": 283 }, { "epoch": 0.17, "grad_norm": 2.1842308672307063, "learning_rate": 9.471781484628828e-06, "loss": 0.6416, "step": 284 }, { "epoch": 0.17, "grad_norm": 1.9855040976893727, "learning_rate": 9.467339238881199e-06, "loss": 0.6107, "step": 285 }, { "epoch": 0.17, "grad_norm": 1.9633251460753256, "learning_rate": 9.462879442612984e-06, "loss": 0.5977, "step": 286 }, { "epoch": 0.18, "grad_norm": 2.0847506096615094, "learning_rate": 9.458402113345071e-06, "loss": 0.5964, "step": 287 }, { "epoch": 0.18, "grad_norm": 2.071971955099866, "learning_rate": 9.453907268667226e-06, "loss": 0.6926, "step": 288 }, { "epoch": 0.18, "grad_norm": 2.1433436130539074, "learning_rate": 9.44939492623803e-06, "loss": 0.5979, "step": 289 }, { "epoch": 0.18, "grad_norm": 2.262092646829491, "learning_rate": 9.444865103784803e-06, "loss": 0.6555, "step": 290 }, { "epoch": 0.18, "grad_norm": 2.1581337027107192, "learning_rate": 9.440317819103542e-06, "loss": 0.7022, "step": 291 }, { "epoch": 0.18, "grad_norm": 2.207365600854885, "learning_rate": 9.435753090058839e-06, "loss": 0.6298, "step": 292 }, { "epoch": 0.18, "grad_norm": 1.9716493031373659, "learning_rate": 9.431170934583826e-06, "loss": 0.6057, "step": 293 }, { "epoch": 0.18, "grad_norm": 1.8605748771934563, "learning_rate": 9.426571370680094e-06, "loss": 0.6488, "step": 294 }, { "epoch": 0.18, "grad_norm": 2.101750848753291, "learning_rate": 9.421954416417624e-06, "loss": 0.6334, "step": 295 }, { "epoch": 0.18, "grad_norm": 2.189471586472517, "learning_rate": 9.417320089934721e-06, "loss": 0.6478, "step": 296 }, { "epoch": 0.18, "grad_norm": 1.8693406953522982, "learning_rate": 9.412668409437934e-06, "loss": 0.5423, "step": 297 }, { "epoch": 0.18, "grad_norm": 2.1604549448326207, "learning_rate": 9.407999393201992e-06, "loss": 0.6778, "step": 298 }, { "epoch": 0.18, "grad_norm": 2.260164616585325, "learning_rate": 9.403313059569729e-06, "loss": 0.7631, "step": 299 }, { "epoch": 0.18, "grad_norm": 2.0264565243677652, "learning_rate": 9.398609426952019e-06, "loss": 0.6039, "step": 300 }, { "epoch": 0.18, "grad_norm": 2.0077711587490987, "learning_rate": 9.393888513827686e-06, "loss": 0.6003, "step": 301 }, { "epoch": 0.18, "grad_norm": 2.0401684479993563, "learning_rate": 9.389150338743451e-06, "loss": 0.6232, "step": 302 }, { "epoch": 0.19, "grad_norm": 2.0592434888026467, "learning_rate": 9.384394920313847e-06, "loss": 0.692, "step": 303 }, { "epoch": 0.19, "grad_norm": 2.127639974580799, "learning_rate": 9.379622277221152e-06, "loss": 0.6403, "step": 304 }, { "epoch": 0.19, "grad_norm": 2.1637948423090596, "learning_rate": 9.37483242821531e-06, "loss": 0.7911, "step": 305 }, { "epoch": 0.19, "grad_norm": 1.96655742278293, "learning_rate": 9.370025392113866e-06, "loss": 0.6817, "step": 306 }, { "epoch": 0.19, "grad_norm": 2.4075353559357375, "learning_rate": 9.365201187801884e-06, "loss": 0.7468, "step": 307 }, { "epoch": 0.19, "grad_norm": 2.170026110189212, "learning_rate": 9.360359834231873e-06, "loss": 0.7148, "step": 308 }, { "epoch": 0.19, "grad_norm": 2.0723680052005378, "learning_rate": 9.355501350423717e-06, "loss": 0.6234, "step": 309 }, { "epoch": 0.19, "grad_norm": 2.111461085654852, "learning_rate": 9.3506257554646e-06, "loss": 0.6659, "step": 310 }, { "epoch": 0.19, "grad_norm": 2.2496008204864104, "learning_rate": 9.345733068508929e-06, "loss": 0.7033, "step": 311 }, { "epoch": 0.19, "grad_norm": 1.9996970862014591, "learning_rate": 9.340823308778255e-06, "loss": 0.7255, "step": 312 }, { "epoch": 0.19, "grad_norm": 1.8792750115155255, "learning_rate": 9.335896495561207e-06, "loss": 0.6429, "step": 313 }, { "epoch": 0.19, "grad_norm": 2.12929524638482, "learning_rate": 9.33095264821341e-06, "loss": 0.6596, "step": 314 }, { "epoch": 0.19, "grad_norm": 1.9025785100638457, "learning_rate": 9.325991786157405e-06, "loss": 0.6464, "step": 315 }, { "epoch": 0.19, "grad_norm": 1.9934226422368588, "learning_rate": 9.321013928882583e-06, "loss": 0.5929, "step": 316 }, { "epoch": 0.19, "grad_norm": 2.1268235022475697, "learning_rate": 9.3160190959451e-06, "loss": 0.6511, "step": 317 }, { "epoch": 0.19, "grad_norm": 1.9740290904745004, "learning_rate": 9.311007306967805e-06, "loss": 0.5765, "step": 318 }, { "epoch": 0.2, "grad_norm": 2.0624735759975823, "learning_rate": 9.305978581640157e-06, "loss": 0.7006, "step": 319 }, { "epoch": 0.2, "grad_norm": 2.2850117779663144, "learning_rate": 9.300932939718159e-06, "loss": 0.6555, "step": 320 }, { "epoch": 0.2, "grad_norm": 1.9373893589189668, "learning_rate": 9.295870401024266e-06, "loss": 0.6105, "step": 321 }, { "epoch": 0.2, "grad_norm": 1.9939827504480299, "learning_rate": 9.290790985447316e-06, "loss": 0.574, "step": 322 }, { "epoch": 0.2, "grad_norm": 2.518967684022752, "learning_rate": 9.285694712942453e-06, "loss": 0.737, "step": 323 }, { "epoch": 0.2, "grad_norm": 2.061941526906131, "learning_rate": 9.28058160353104e-06, "loss": 0.6289, "step": 324 }, { "epoch": 0.2, "grad_norm": 2.046986924521927, "learning_rate": 9.275451677300591e-06, "loss": 0.6026, "step": 325 }, { "epoch": 0.2, "grad_norm": 1.9643917949664476, "learning_rate": 9.270304954404688e-06, "loss": 0.6726, "step": 326 }, { "epoch": 0.2, "grad_norm": 2.030653402715337, "learning_rate": 9.265141455062894e-06, "loss": 0.6522, "step": 327 }, { "epoch": 0.2, "grad_norm": 2.083183062824829, "learning_rate": 9.259961199560686e-06, "loss": 0.6681, "step": 328 }, { "epoch": 0.2, "grad_norm": 2.0946830011733955, "learning_rate": 9.254764208249369e-06, "loss": 0.7092, "step": 329 }, { "epoch": 0.2, "grad_norm": 2.1225126781820283, "learning_rate": 9.249550501545998e-06, "loss": 0.67, "step": 330 }, { "epoch": 0.2, "grad_norm": 2.026753617785709, "learning_rate": 9.244320099933291e-06, "loss": 0.6578, "step": 331 }, { "epoch": 0.2, "grad_norm": 2.3515094288361125, "learning_rate": 9.239073023959562e-06, "loss": 0.7187, "step": 332 }, { "epoch": 0.2, "grad_norm": 2.1066900321641655, "learning_rate": 9.233809294238625e-06, "loss": 0.709, "step": 333 }, { "epoch": 0.2, "grad_norm": 2.2215204725890416, "learning_rate": 9.228528931449724e-06, "loss": 0.7507, "step": 334 }, { "epoch": 0.2, "grad_norm": 2.2519794194499068, "learning_rate": 9.22323195633745e-06, "loss": 0.804, "step": 335 }, { "epoch": 0.21, "grad_norm": 2.1625190316209792, "learning_rate": 9.217918389711652e-06, "loss": 0.7088, "step": 336 }, { "epoch": 0.21, "grad_norm": 2.095704698093118, "learning_rate": 9.21258825244737e-06, "loss": 0.6989, "step": 337 }, { "epoch": 0.21, "grad_norm": 2.330487081066773, "learning_rate": 9.207241565484733e-06, "loss": 0.7033, "step": 338 }, { "epoch": 0.21, "grad_norm": 2.132239589678436, "learning_rate": 9.201878349828897e-06, "loss": 0.6656, "step": 339 }, { "epoch": 0.21, "grad_norm": 2.158342662695929, "learning_rate": 9.196498626549944e-06, "loss": 0.5794, "step": 340 }, { "epoch": 0.21, "grad_norm": 1.9571490668467135, "learning_rate": 9.191102416782819e-06, "loss": 0.5614, "step": 341 }, { "epoch": 0.21, "grad_norm": 2.0828460881254154, "learning_rate": 9.185689741727229e-06, "loss": 0.7618, "step": 342 }, { "epoch": 0.21, "grad_norm": 1.945496988662335, "learning_rate": 9.180260622647565e-06, "loss": 0.6134, "step": 343 }, { "epoch": 0.21, "grad_norm": 2.1345490610587006, "learning_rate": 9.174815080872829e-06, "loss": 0.6491, "step": 344 }, { "epoch": 0.21, "grad_norm": 1.888910241101656, "learning_rate": 9.169353137796533e-06, "loss": 0.5433, "step": 345 }, { "epoch": 0.21, "grad_norm": 2.2231218683381346, "learning_rate": 9.163874814876632e-06, "loss": 0.6674, "step": 346 }, { "epoch": 0.21, "grad_norm": 1.8397128888601602, "learning_rate": 9.158380133635425e-06, "loss": 0.5104, "step": 347 }, { "epoch": 0.21, "grad_norm": 1.8435017185456046, "learning_rate": 9.152869115659474e-06, "loss": 0.5708, "step": 348 }, { "epoch": 0.21, "grad_norm": 2.004371953603906, "learning_rate": 9.147341782599534e-06, "loss": 0.5923, "step": 349 }, { "epoch": 0.21, "grad_norm": 2.1426206185002523, "learning_rate": 9.141798156170447e-06, "loss": 0.6067, "step": 350 }, { "epoch": 0.21, "grad_norm": 2.0707774939518435, "learning_rate": 9.136238258151063e-06, "loss": 0.621, "step": 351 }, { "epoch": 0.22, "grad_norm": 2.0756065658076808, "learning_rate": 9.130662110384163e-06, "loss": 0.609, "step": 352 }, { "epoch": 0.22, "grad_norm": 1.864542155335991, "learning_rate": 9.125069734776367e-06, "loss": 0.5795, "step": 353 }, { "epoch": 0.22, "grad_norm": 2.247405694299018, "learning_rate": 9.119461153298045e-06, "loss": 0.6788, "step": 354 }, { "epoch": 0.22, "grad_norm": 2.0281028236357908, "learning_rate": 9.113836387983239e-06, "loss": 0.6667, "step": 355 }, { "epoch": 0.22, "grad_norm": 2.1739992658132126, "learning_rate": 9.108195460929563e-06, "loss": 0.6559, "step": 356 }, { "epoch": 0.22, "grad_norm": 1.844308015715884, "learning_rate": 9.10253839429813e-06, "loss": 0.5637, "step": 357 }, { "epoch": 0.22, "grad_norm": 2.158849134009064, "learning_rate": 9.096865210313461e-06, "loss": 0.6977, "step": 358 }, { "epoch": 0.22, "grad_norm": 1.9857083622278322, "learning_rate": 9.091175931263395e-06, "loss": 0.7014, "step": 359 }, { "epoch": 0.22, "grad_norm": 2.083743100705083, "learning_rate": 9.085470579498996e-06, "loss": 0.6288, "step": 360 }, { "epoch": 0.22, "grad_norm": 2.0848589757887304, "learning_rate": 9.079749177434481e-06, "loss": 0.5892, "step": 361 }, { "epoch": 0.22, "grad_norm": 2.2211766443468073, "learning_rate": 9.074011747547118e-06, "loss": 0.6756, "step": 362 }, { "epoch": 0.22, "grad_norm": 2.259415733177512, "learning_rate": 9.068258312377143e-06, "loss": 0.637, "step": 363 }, { "epoch": 0.22, "grad_norm": 1.8541880063977976, "learning_rate": 9.06248889452767e-06, "loss": 0.5564, "step": 364 }, { "epoch": 0.22, "grad_norm": 2.043397669872696, "learning_rate": 9.056703516664606e-06, "loss": 0.5995, "step": 365 }, { "epoch": 0.22, "grad_norm": 1.9966425012080062, "learning_rate": 9.050902201516555e-06, "loss": 0.5602, "step": 366 }, { "epoch": 0.22, "grad_norm": 2.097676942573622, "learning_rate": 9.045084971874738e-06, "loss": 0.669, "step": 367 }, { "epoch": 0.23, "grad_norm": 2.0595811405443016, "learning_rate": 9.039251850592892e-06, "loss": 0.6529, "step": 368 }, { "epoch": 0.23, "grad_norm": 2.0255229150761576, "learning_rate": 9.033402860587187e-06, "loss": 0.5948, "step": 369 }, { "epoch": 0.23, "grad_norm": 2.0548212104417276, "learning_rate": 9.027538024836143e-06, "loss": 0.6584, "step": 370 }, { "epoch": 0.23, "grad_norm": 2.2114936351325465, "learning_rate": 9.021657366380521e-06, "loss": 0.6837, "step": 371 }, { "epoch": 0.23, "grad_norm": 1.9893474856689934, "learning_rate": 9.015760908323253e-06, "loss": 0.5977, "step": 372 }, { "epoch": 0.23, "grad_norm": 1.9935862578665022, "learning_rate": 9.009848673829337e-06, "loss": 0.6574, "step": 373 }, { "epoch": 0.23, "grad_norm": 1.8536984972638404, "learning_rate": 9.00392068612575e-06, "loss": 0.5571, "step": 374 }, { "epoch": 0.23, "grad_norm": 2.07272622617217, "learning_rate": 8.997976968501362e-06, "loss": 0.6437, "step": 375 }, { "epoch": 0.23, "grad_norm": 1.9669798106315952, "learning_rate": 8.992017544306834e-06, "loss": 0.6805, "step": 376 }, { "epoch": 0.23, "grad_norm": 2.243741605970751, "learning_rate": 8.986042436954538e-06, "loss": 0.7328, "step": 377 }, { "epoch": 0.23, "grad_norm": 2.011662513116711, "learning_rate": 8.980051669918458e-06, "loss": 0.6209, "step": 378 }, { "epoch": 0.23, "grad_norm": 2.1937242214026007, "learning_rate": 8.974045266734094e-06, "loss": 0.7434, "step": 379 }, { "epoch": 0.23, "grad_norm": 2.132031298132569, "learning_rate": 8.96802325099838e-06, "loss": 0.6832, "step": 380 }, { "epoch": 0.23, "grad_norm": 1.7086865848142259, "learning_rate": 8.961985646369587e-06, "loss": 0.5608, "step": 381 }, { "epoch": 0.23, "grad_norm": 1.9009692420702806, "learning_rate": 8.955932476567224e-06, "loss": 0.6121, "step": 382 }, { "epoch": 0.23, "grad_norm": 1.9044767808035803, "learning_rate": 8.949863765371952e-06, "loss": 0.6172, "step": 383 }, { "epoch": 0.23, "grad_norm": 2.1087095562200946, "learning_rate": 8.943779536625489e-06, "loss": 0.7064, "step": 384 }, { "epoch": 0.24, "grad_norm": 2.17610448059507, "learning_rate": 8.937679814230517e-06, "loss": 0.6725, "step": 385 }, { "epoch": 0.24, "grad_norm": 2.006215616453568, "learning_rate": 8.931564622150583e-06, "loss": 0.6987, "step": 386 }, { "epoch": 0.24, "grad_norm": 1.9223795578578178, "learning_rate": 8.925433984410012e-06, "loss": 0.5192, "step": 387 }, { "epoch": 0.24, "grad_norm": 1.8039652484819113, "learning_rate": 8.919287925093808e-06, "loss": 0.628, "step": 388 }, { "epoch": 0.24, "grad_norm": 1.769522071377601, "learning_rate": 8.913126468347561e-06, "loss": 0.4867, "step": 389 }, { "epoch": 0.24, "grad_norm": 2.290538985245612, "learning_rate": 8.906949638377352e-06, "loss": 0.6833, "step": 390 }, { "epoch": 0.24, "grad_norm": 2.1791089656581764, "learning_rate": 8.900757459449655e-06, "loss": 0.76, "step": 391 }, { "epoch": 0.24, "grad_norm": 2.0748417472498537, "learning_rate": 8.894549955891247e-06, "loss": 0.6931, "step": 392 }, { "epoch": 0.24, "grad_norm": 2.007190815984241, "learning_rate": 8.888327152089112e-06, "loss": 0.6713, "step": 393 }, { "epoch": 0.24, "grad_norm": 1.7418731560021379, "learning_rate": 8.882089072490339e-06, "loss": 0.5852, "step": 394 }, { "epoch": 0.24, "grad_norm": 1.9279427627473156, "learning_rate": 8.875835741602031e-06, "loss": 0.5998, "step": 395 }, { "epoch": 0.24, "grad_norm": 2.011804969137247, "learning_rate": 8.869567183991208e-06, "loss": 0.7047, "step": 396 }, { "epoch": 0.24, "grad_norm": 2.0919216640489577, "learning_rate": 8.86328342428471e-06, "loss": 0.6773, "step": 397 }, { "epoch": 0.24, "grad_norm": 2.1708903996053994, "learning_rate": 8.856984487169102e-06, "loss": 0.6511, "step": 398 }, { "epoch": 0.24, "grad_norm": 2.0482302804600954, "learning_rate": 8.85067039739057e-06, "loss": 0.6458, "step": 399 }, { "epoch": 0.24, "grad_norm": 2.2389233691566184, "learning_rate": 8.84434117975484e-06, "loss": 0.6042, "step": 400 }, { "epoch": 0.25, "grad_norm": 2.3238794603179365, "learning_rate": 8.837996859127056e-06, "loss": 0.6536, "step": 401 }, { "epoch": 0.25, "grad_norm": 1.9634398094275907, "learning_rate": 8.831637460431708e-06, "loss": 0.6009, "step": 402 }, { "epoch": 0.25, "grad_norm": 2.1849384771988167, "learning_rate": 8.825263008652513e-06, "loss": 0.6747, "step": 403 }, { "epoch": 0.25, "grad_norm": 1.9367137201020725, "learning_rate": 8.818873528832334e-06, "loss": 0.5679, "step": 404 }, { "epoch": 0.25, "grad_norm": 1.9444930407351348, "learning_rate": 8.812469046073069e-06, "loss": 0.5809, "step": 405 }, { "epoch": 0.25, "grad_norm": 2.0382647745048263, "learning_rate": 8.806049585535554e-06, "loss": 0.5664, "step": 406 }, { "epoch": 0.25, "grad_norm": 2.1047083940033944, "learning_rate": 8.799615172439475e-06, "loss": 0.5677, "step": 407 }, { "epoch": 0.25, "grad_norm": 1.8990495481992753, "learning_rate": 8.793165832063254e-06, "loss": 0.6238, "step": 408 }, { "epoch": 0.25, "grad_norm": 2.096972722468596, "learning_rate": 8.786701589743965e-06, "loss": 0.6452, "step": 409 }, { "epoch": 0.25, "grad_norm": 1.8640478732019463, "learning_rate": 8.780222470877213e-06, "loss": 0.5267, "step": 410 }, { "epoch": 0.25, "grad_norm": 1.9247739069634147, "learning_rate": 8.77372850091706e-06, "loss": 0.6142, "step": 411 }, { "epoch": 0.25, "grad_norm": 2.087695274157492, "learning_rate": 8.76721970537591e-06, "loss": 0.6652, "step": 412 }, { "epoch": 0.25, "grad_norm": 1.850788744558352, "learning_rate": 8.760696109824403e-06, "loss": 0.5258, "step": 413 }, { "epoch": 0.25, "grad_norm": 2.118016145296157, "learning_rate": 8.754157739891332e-06, "loss": 0.6427, "step": 414 }, { "epoch": 0.25, "grad_norm": 2.1471953099208525, "learning_rate": 8.74760462126353e-06, "loss": 0.6361, "step": 415 }, { "epoch": 0.25, "grad_norm": 1.9116075357657814, "learning_rate": 8.741036779685771e-06, "loss": 0.5885, "step": 416 }, { "epoch": 0.25, "grad_norm": 2.186935678265862, "learning_rate": 8.734454240960672e-06, "loss": 0.7819, "step": 417 }, { "epoch": 0.26, "grad_norm": 1.9329278314726581, "learning_rate": 8.727857030948587e-06, "loss": 0.6089, "step": 418 }, { "epoch": 0.26, "grad_norm": 2.29559179529083, "learning_rate": 8.721245175567513e-06, "loss": 0.6536, "step": 419 }, { "epoch": 0.26, "grad_norm": 1.949685346432584, "learning_rate": 8.714618700792975e-06, "loss": 0.588, "step": 420 }, { "epoch": 0.26, "grad_norm": 2.1364335358069555, "learning_rate": 8.707977632657942e-06, "loss": 0.5693, "step": 421 }, { "epoch": 0.26, "grad_norm": 2.1052630337837646, "learning_rate": 8.701321997252707e-06, "loss": 0.6618, "step": 422 }, { "epoch": 0.26, "grad_norm": 1.9419341419413294, "learning_rate": 8.694651820724796e-06, "loss": 0.6432, "step": 423 }, { "epoch": 0.26, "grad_norm": 2.1809423639777847, "learning_rate": 8.687967129278863e-06, "loss": 0.6786, "step": 424 }, { "epoch": 0.26, "grad_norm": 2.048911015295105, "learning_rate": 8.68126794917658e-06, "loss": 0.6848, "step": 425 }, { "epoch": 0.26, "grad_norm": 2.1090753469834076, "learning_rate": 8.674554306736545e-06, "loss": 0.6447, "step": 426 }, { "epoch": 0.26, "grad_norm": 2.106438933355631, "learning_rate": 8.667826228334173e-06, "loss": 0.5551, "step": 427 }, { "epoch": 0.26, "grad_norm": 2.0203947279705226, "learning_rate": 8.66108374040159e-06, "loss": 0.5717, "step": 428 }, { "epoch": 0.26, "grad_norm": 2.0615093467875854, "learning_rate": 8.654326869427533e-06, "loss": 0.6311, "step": 429 }, { "epoch": 0.26, "grad_norm": 2.1329704988537665, "learning_rate": 8.647555641957243e-06, "loss": 0.6243, "step": 430 }, { "epoch": 0.26, "grad_norm": 1.8315214353591525, "learning_rate": 8.640770084592367e-06, "loss": 0.5547, "step": 431 }, { "epoch": 0.26, "grad_norm": 2.210411556217951, "learning_rate": 8.633970223990841e-06, "loss": 0.6408, "step": 432 }, { "epoch": 0.26, "grad_norm": 2.007561414582103, "learning_rate": 8.627156086866804e-06, "loss": 0.5894, "step": 433 }, { "epoch": 0.27, "grad_norm": 1.939162087316279, "learning_rate": 8.620327699990469e-06, "loss": 0.5772, "step": 434 }, { "epoch": 0.27, "grad_norm": 2.1700787302690094, "learning_rate": 8.613485090188044e-06, "loss": 0.6095, "step": 435 }, { "epoch": 0.27, "grad_norm": 2.0168885983710703, "learning_rate": 8.606628284341603e-06, "loss": 0.6537, "step": 436 }, { "epoch": 0.27, "grad_norm": 1.9420432087459054, "learning_rate": 8.599757309388998e-06, "loss": 0.5503, "step": 437 }, { "epoch": 0.27, "grad_norm": 1.9005641654421328, "learning_rate": 8.592872192323742e-06, "loss": 0.5285, "step": 438 }, { "epoch": 0.27, "grad_norm": 2.310866512162385, "learning_rate": 8.58597296019491e-06, "loss": 0.6925, "step": 439 }, { "epoch": 0.27, "grad_norm": 2.0185104038237283, "learning_rate": 8.57905964010703e-06, "loss": 0.6208, "step": 440 }, { "epoch": 0.27, "grad_norm": 2.227534027585251, "learning_rate": 8.572132259219973e-06, "loss": 0.6722, "step": 441 }, { "epoch": 0.27, "grad_norm": 1.972521989095671, "learning_rate": 8.565190844748852e-06, "loss": 0.6204, "step": 442 }, { "epoch": 0.27, "grad_norm": 2.192852585817164, "learning_rate": 8.558235423963912e-06, "loss": 0.6615, "step": 443 }, { "epoch": 0.27, "grad_norm": 2.1499672574920883, "learning_rate": 8.551266024190425e-06, "loss": 0.5939, "step": 444 }, { "epoch": 0.27, "grad_norm": 2.0344036721852303, "learning_rate": 8.54428267280858e-06, "loss": 0.609, "step": 445 }, { "epoch": 0.27, "grad_norm": 2.100328047808317, "learning_rate": 8.537285397253378e-06, "loss": 0.6728, "step": 446 }, { "epoch": 0.27, "grad_norm": 1.9522804167083359, "learning_rate": 8.53027422501452e-06, "loss": 0.5963, "step": 447 }, { "epoch": 0.27, "grad_norm": 2.0276370479958663, "learning_rate": 8.523249183636303e-06, "loss": 0.6615, "step": 448 }, { "epoch": 0.27, "grad_norm": 1.969628544118802, "learning_rate": 8.516210300717519e-06, "loss": 0.6111, "step": 449 }, { "epoch": 0.28, "grad_norm": 2.029052699494888, "learning_rate": 8.50915760391132e-06, "loss": 0.6396, "step": 450 }, { "epoch": 0.28, "grad_norm": 1.8921377970210058, "learning_rate": 8.502091120925147e-06, "loss": 0.6135, "step": 451 }, { "epoch": 0.28, "grad_norm": 1.905825397098304, "learning_rate": 8.49501087952059e-06, "loss": 0.6531, "step": 452 }, { "epoch": 0.28, "grad_norm": 2.062356135135231, "learning_rate": 8.487916907513291e-06, "loss": 0.6511, "step": 453 }, { "epoch": 0.28, "grad_norm": 2.0500672806486047, "learning_rate": 8.480809232772845e-06, "loss": 0.6973, "step": 454 }, { "epoch": 0.28, "grad_norm": 1.9780923474909595, "learning_rate": 8.473687883222665e-06, "loss": 0.5567, "step": 455 }, { "epoch": 0.28, "grad_norm": 2.001802531470044, "learning_rate": 8.4665528868399e-06, "loss": 0.6096, "step": 456 }, { "epoch": 0.28, "grad_norm": 2.0486427239843343, "learning_rate": 8.459404271655304e-06, "loss": 0.7061, "step": 457 }, { "epoch": 0.28, "grad_norm": 2.1064266393636113, "learning_rate": 8.452242065753138e-06, "loss": 0.6797, "step": 458 }, { "epoch": 0.28, "grad_norm": 2.3915047992203, "learning_rate": 8.445066297271055e-06, "loss": 0.6238, "step": 459 }, { "epoch": 0.28, "grad_norm": 2.0029093719066053, "learning_rate": 8.437876994399992e-06, "loss": 0.5708, "step": 460 }, { "epoch": 0.28, "grad_norm": 2.0251422666051178, "learning_rate": 8.430674185384054e-06, "loss": 0.6305, "step": 461 }, { "epoch": 0.28, "grad_norm": 2.3215947492777222, "learning_rate": 8.423457898520411e-06, "loss": 0.6077, "step": 462 }, { "epoch": 0.28, "grad_norm": 1.9799905222032952, "learning_rate": 8.416228162159178e-06, "loss": 0.5937, "step": 463 }, { "epoch": 0.28, "grad_norm": 1.9919779229157657, "learning_rate": 8.408985004703312e-06, "loss": 0.6588, "step": 464 }, { "epoch": 0.28, "grad_norm": 1.8545854124573158, "learning_rate": 8.401728454608495e-06, "loss": 0.5624, "step": 465 }, { "epoch": 0.28, "grad_norm": 1.9951991842396126, "learning_rate": 8.394458540383021e-06, "loss": 0.6586, "step": 466 }, { "epoch": 0.29, "grad_norm": 1.9832046641551582, "learning_rate": 8.387175290587692e-06, "loss": 0.6178, "step": 467 }, { "epoch": 0.29, "grad_norm": 2.1705961273936456, "learning_rate": 8.379878733835697e-06, "loss": 0.6783, "step": 468 }, { "epoch": 0.29, "grad_norm": 2.0865192960586323, "learning_rate": 8.372568898792504e-06, "loss": 0.6141, "step": 469 }, { "epoch": 0.29, "grad_norm": 1.965297643743764, "learning_rate": 8.365245814175744e-06, "loss": 0.5656, "step": 470 }, { "epoch": 0.29, "grad_norm": 1.994816838265779, "learning_rate": 8.357909508755106e-06, "loss": 0.5594, "step": 471 }, { "epoch": 0.29, "grad_norm": 2.081917736412011, "learning_rate": 8.350560011352217e-06, "loss": 0.6753, "step": 472 }, { "epoch": 0.29, "grad_norm": 1.9190310357166047, "learning_rate": 8.343197350840525e-06, "loss": 0.5778, "step": 473 }, { "epoch": 0.29, "grad_norm": 1.9990825253769382, "learning_rate": 8.335821556145196e-06, "loss": 0.5679, "step": 474 }, { "epoch": 0.29, "grad_norm": 1.8601731215327446, "learning_rate": 8.328432656242998e-06, "loss": 0.5376, "step": 475 }, { "epoch": 0.29, "grad_norm": 1.8011984252968534, "learning_rate": 8.321030680162177e-06, "loss": 0.5679, "step": 476 }, { "epoch": 0.29, "grad_norm": 2.2985243811453637, "learning_rate": 8.313615656982354e-06, "loss": 0.6887, "step": 477 }, { "epoch": 0.29, "grad_norm": 2.0168932090236624, "learning_rate": 8.306187615834411e-06, "loss": 0.6523, "step": 478 }, { "epoch": 0.29, "grad_norm": 2.007847760050486, "learning_rate": 8.298746585900367e-06, "loss": 0.6079, "step": 479 }, { "epoch": 0.29, "grad_norm": 2.1580333815191914, "learning_rate": 8.291292596413272e-06, "loss": 0.7007, "step": 480 }, { "epoch": 0.29, "grad_norm": 2.0148089585758857, "learning_rate": 8.28382567665709e-06, "loss": 0.6778, "step": 481 }, { "epoch": 0.29, "grad_norm": 2.0624654834089697, "learning_rate": 8.276345855966579e-06, "loss": 0.618, "step": 482 }, { "epoch": 0.3, "grad_norm": 1.9930903577238281, "learning_rate": 8.268853163727184e-06, "loss": 0.6011, "step": 483 }, { "epoch": 0.3, "grad_norm": 1.9326313111875104, "learning_rate": 8.26134762937492e-06, "loss": 0.5755, "step": 484 }, { "epoch": 0.3, "grad_norm": 2.1052378837219283, "learning_rate": 8.253829282396246e-06, "loss": 0.576, "step": 485 }, { "epoch": 0.3, "grad_norm": 1.9490986209974357, "learning_rate": 8.246298152327965e-06, "loss": 0.4944, "step": 486 }, { "epoch": 0.3, "grad_norm": 2.0980391720214002, "learning_rate": 8.238754268757092e-06, "loss": 0.7186, "step": 487 }, { "epoch": 0.3, "grad_norm": 1.881768667514335, "learning_rate": 8.231197661320755e-06, "loss": 0.6097, "step": 488 }, { "epoch": 0.3, "grad_norm": 1.9646616396951349, "learning_rate": 8.223628359706063e-06, "loss": 0.6717, "step": 489 }, { "epoch": 0.3, "grad_norm": 1.9845900062528004, "learning_rate": 8.216046393649997e-06, "loss": 0.5794, "step": 490 }, { "epoch": 0.3, "grad_norm": 1.975691200881602, "learning_rate": 8.20845179293929e-06, "loss": 0.6777, "step": 491 }, { "epoch": 0.3, "grad_norm": 1.9790588163074925, "learning_rate": 8.20084458741032e-06, "loss": 0.5762, "step": 492 }, { "epoch": 0.3, "grad_norm": 2.033854053229917, "learning_rate": 8.193224806948975e-06, "loss": 0.6425, "step": 493 }, { "epoch": 0.3, "grad_norm": 2.0564703779289855, "learning_rate": 8.185592481490549e-06, "loss": 0.5421, "step": 494 }, { "epoch": 0.3, "grad_norm": 1.9024282460009037, "learning_rate": 8.177947641019622e-06, "loss": 0.5416, "step": 495 }, { "epoch": 0.3, "grad_norm": 1.8428880273743034, "learning_rate": 8.170290315569937e-06, "loss": 0.5476, "step": 496 }, { "epoch": 0.3, "grad_norm": 2.0314873709790517, "learning_rate": 8.16262053522429e-06, "loss": 0.6254, "step": 497 }, { "epoch": 0.3, "grad_norm": 1.959747747554248, "learning_rate": 8.154938330114407e-06, "loss": 0.6715, "step": 498 }, { "epoch": 0.31, "grad_norm": 1.9605352675210954, "learning_rate": 8.147243730420827e-06, "loss": 0.5389, "step": 499 }, { "epoch": 0.31, "grad_norm": 1.9808533481893225, "learning_rate": 8.139536766372775e-06, "loss": 0.5917, "step": 500 }, { "epoch": 0.31, "grad_norm": 2.124751570239496, "learning_rate": 8.131817468248064e-06, "loss": 0.646, "step": 501 }, { "epoch": 0.31, "grad_norm": 1.9453603552598644, "learning_rate": 8.124085866372952e-06, "loss": 0.6475, "step": 502 }, { "epoch": 0.31, "grad_norm": 2.284493964086694, "learning_rate": 8.116341991122038e-06, "loss": 0.657, "step": 503 }, { "epoch": 0.31, "grad_norm": 2.173487845748996, "learning_rate": 8.108585872918142e-06, "loss": 0.6072, "step": 504 }, { "epoch": 0.31, "grad_norm": 1.9740790341680636, "learning_rate": 8.100817542232175e-06, "loss": 0.6192, "step": 505 }, { "epoch": 0.31, "grad_norm": 1.9882407145838754, "learning_rate": 8.09303702958303e-06, "loss": 0.7174, "step": 506 }, { "epoch": 0.31, "grad_norm": 1.765767752810985, "learning_rate": 8.085244365537459e-06, "loss": 0.5659, "step": 507 }, { "epoch": 0.31, "grad_norm": 2.0919873787965018, "learning_rate": 8.077439580709954e-06, "loss": 0.7014, "step": 508 }, { "epoch": 0.31, "grad_norm": 2.0909317709072597, "learning_rate": 8.069622705762619e-06, "loss": 0.6553, "step": 509 }, { "epoch": 0.31, "grad_norm": 2.0985013077972163, "learning_rate": 8.06179377140506e-06, "loss": 0.5996, "step": 510 }, { "epoch": 0.31, "grad_norm": 1.875167456622752, "learning_rate": 8.05395280839426e-06, "loss": 0.4977, "step": 511 }, { "epoch": 0.31, "grad_norm": 1.8642775987752205, "learning_rate": 8.046099847534458e-06, "loss": 0.516, "step": 512 }, { "epoch": 0.31, "grad_norm": 1.8047762854038711, "learning_rate": 8.038234919677029e-06, "loss": 0.5456, "step": 513 }, { "epoch": 0.31, "grad_norm": 2.054027103241828, "learning_rate": 8.030358055720355e-06, "loss": 0.6449, "step": 514 }, { "epoch": 0.31, "grad_norm": 1.88938224837625, "learning_rate": 8.02246928660972e-06, "loss": 0.5853, "step": 515 }, { "epoch": 0.32, "grad_norm": 1.9949053145025524, "learning_rate": 8.014568643337175e-06, "loss": 0.6374, "step": 516 }, { "epoch": 0.32, "grad_norm": 2.181017234415942, "learning_rate": 8.006656156941418e-06, "loss": 0.6383, "step": 517 }, { "epoch": 0.32, "grad_norm": 2.1231246586306325, "learning_rate": 7.998731858507675e-06, "loss": 0.6517, "step": 518 }, { "epoch": 0.32, "grad_norm": 2.014704090337969, "learning_rate": 7.990795779167584e-06, "loss": 0.6203, "step": 519 }, { "epoch": 0.32, "grad_norm": 2.1690171263618785, "learning_rate": 7.982847950099055e-06, "loss": 0.7135, "step": 520 }, { "epoch": 0.32, "grad_norm": 1.8051904968061352, "learning_rate": 7.974888402526166e-06, "loss": 0.5658, "step": 521 }, { "epoch": 0.32, "grad_norm": 2.0602443463430555, "learning_rate": 7.966917167719029e-06, "loss": 0.6524, "step": 522 }, { "epoch": 0.32, "grad_norm": 1.8973152827677298, "learning_rate": 7.95893427699367e-06, "loss": 0.5758, "step": 523 }, { "epoch": 0.32, "grad_norm": 2.019571292211095, "learning_rate": 7.950939761711915e-06, "loss": 0.6241, "step": 524 }, { "epoch": 0.32, "grad_norm": 1.8785901266881793, "learning_rate": 7.942933653281245e-06, "loss": 0.5769, "step": 525 }, { "epoch": 0.32, "grad_norm": 2.166266745685418, "learning_rate": 7.934915983154698e-06, "loss": 0.6663, "step": 526 }, { "epoch": 0.32, "grad_norm": 2.123451242286571, "learning_rate": 7.92688678283073e-06, "loss": 0.6527, "step": 527 }, { "epoch": 0.32, "grad_norm": 1.9075717966913297, "learning_rate": 7.918846083853089e-06, "loss": 0.6569, "step": 528 }, { "epoch": 0.32, "grad_norm": 1.9000996169691746, "learning_rate": 7.910793917810707e-06, "loss": 0.6385, "step": 529 }, { "epoch": 0.32, "grad_norm": 1.927434736070484, "learning_rate": 7.902730316337556e-06, "loss": 0.5631, "step": 530 }, { "epoch": 0.32, "grad_norm": 1.9790989754571544, "learning_rate": 7.894655311112545e-06, "loss": 0.6068, "step": 531 }, { "epoch": 0.33, "grad_norm": 2.1188146600936535, "learning_rate": 7.886568933859372e-06, "loss": 0.696, "step": 532 }, { "epoch": 0.33, "grad_norm": 1.9800997991892215, "learning_rate": 7.878471216346418e-06, "loss": 0.6283, "step": 533 }, { "epoch": 0.33, "grad_norm": 1.8749813668837976, "learning_rate": 7.870362190386616e-06, "loss": 0.5925, "step": 534 }, { "epoch": 0.33, "grad_norm": 2.030181629804673, "learning_rate": 7.862241887837322e-06, "loss": 0.5838, "step": 535 }, { "epoch": 0.33, "grad_norm": 2.191116164536583, "learning_rate": 7.854110340600199e-06, "loss": 0.6621, "step": 536 }, { "epoch": 0.33, "grad_norm": 2.1322352729861747, "learning_rate": 7.845967580621082e-06, "loss": 0.7296, "step": 537 }, { "epoch": 0.33, "grad_norm": 1.9503331877159438, "learning_rate": 7.837813639889858e-06, "loss": 0.6131, "step": 538 }, { "epoch": 0.33, "grad_norm": 2.0706877886034802, "learning_rate": 7.829648550440337e-06, "loss": 0.6048, "step": 539 }, { "epoch": 0.33, "grad_norm": 2.0116116138534617, "learning_rate": 7.821472344350131e-06, "loss": 0.6343, "step": 540 }, { "epoch": 0.33, "grad_norm": 1.9969976539512104, "learning_rate": 7.813285053740526e-06, "loss": 0.6453, "step": 541 }, { "epoch": 0.33, "grad_norm": 2.1284306784258638, "learning_rate": 7.805086710776353e-06, "loss": 0.6498, "step": 542 }, { "epoch": 0.33, "grad_norm": 1.9199881561880785, "learning_rate": 7.796877347665861e-06, "loss": 0.5469, "step": 543 }, { "epoch": 0.33, "grad_norm": 2.190799717584273, "learning_rate": 7.788656996660596e-06, "loss": 0.6443, "step": 544 }, { "epoch": 0.33, "grad_norm": 2.0667507128163525, "learning_rate": 7.780425690055275e-06, "loss": 0.6689, "step": 545 }, { "epoch": 0.33, "grad_norm": 2.0614204138949077, "learning_rate": 7.772183460187647e-06, "loss": 0.7005, "step": 546 }, { "epoch": 0.33, "grad_norm": 1.8873750448102828, "learning_rate": 7.763930339438383e-06, "loss": 0.54, "step": 547 }, { "epoch": 0.34, "grad_norm": 1.8982868706163196, "learning_rate": 7.755666360230933e-06, "loss": 0.6, "step": 548 }, { "epoch": 0.34, "grad_norm": 1.9491417935122528, "learning_rate": 7.747391555031414e-06, "loss": 0.5981, "step": 549 }, { "epoch": 0.34, "grad_norm": 2.082993477568864, "learning_rate": 7.739105956348465e-06, "loss": 0.6724, "step": 550 }, { "epoch": 0.34, "grad_norm": 1.9771665196367632, "learning_rate": 7.730809596733136e-06, "loss": 0.6199, "step": 551 }, { "epoch": 0.34, "grad_norm": 1.9910315139065318, "learning_rate": 7.722502508778747e-06, "loss": 0.6237, "step": 552 }, { "epoch": 0.34, "grad_norm": 1.8173451465224066, "learning_rate": 7.71418472512077e-06, "loss": 0.5711, "step": 553 }, { "epoch": 0.34, "grad_norm": 2.017970082027841, "learning_rate": 7.705856278436696e-06, "loss": 0.568, "step": 554 }, { "epoch": 0.34, "grad_norm": 1.756994616816012, "learning_rate": 7.697517201445906e-06, "loss": 0.4771, "step": 555 }, { "epoch": 0.34, "grad_norm": 1.783316776618109, "learning_rate": 7.689167526909542e-06, "loss": 0.5154, "step": 556 }, { "epoch": 0.34, "grad_norm": 2.019720522663777, "learning_rate": 7.680807287630383e-06, "loss": 0.6041, "step": 557 }, { "epoch": 0.34, "grad_norm": 2.0783908508210622, "learning_rate": 7.67243651645271e-06, "loss": 0.5921, "step": 558 }, { "epoch": 0.34, "grad_norm": 1.8528382833978114, "learning_rate": 7.664055246262183e-06, "loss": 0.5604, "step": 559 }, { "epoch": 0.34, "grad_norm": 2.117910334131364, "learning_rate": 7.655663509985707e-06, "loss": 0.6059, "step": 560 }, { "epoch": 0.34, "grad_norm": 1.9758606107637775, "learning_rate": 7.647261340591303e-06, "loss": 0.6412, "step": 561 }, { "epoch": 0.34, "grad_norm": 1.9290348907447834, "learning_rate": 7.638848771087982e-06, "loss": 0.5705, "step": 562 }, { "epoch": 0.34, "grad_norm": 1.8960214594144043, "learning_rate": 7.63042583452561e-06, "loss": 0.6163, "step": 563 }, { "epoch": 0.34, "grad_norm": 1.7445681487714644, "learning_rate": 7.621992563994789e-06, "loss": 0.5722, "step": 564 }, { "epoch": 0.35, "grad_norm": 1.9279287791801931, "learning_rate": 7.613548992626711e-06, "loss": 0.5845, "step": 565 }, { "epoch": 0.35, "grad_norm": 1.8792256339894968, "learning_rate": 7.605095153593038e-06, "loss": 0.5947, "step": 566 }, { "epoch": 0.35, "grad_norm": 1.9849662044668719, "learning_rate": 7.596631080105774e-06, "loss": 0.6454, "step": 567 }, { "epoch": 0.35, "grad_norm": 2.1721543557468643, "learning_rate": 7.588156805417126e-06, "loss": 0.5729, "step": 568 }, { "epoch": 0.35, "grad_norm": 2.0849048969435136, "learning_rate": 7.5796723628193815e-06, "loss": 0.5947, "step": 569 }, { "epoch": 0.35, "grad_norm": 2.2968962836408324, "learning_rate": 7.571177785644766e-06, "loss": 0.6569, "step": 570 }, { "epoch": 0.35, "grad_norm": 1.9267367648655322, "learning_rate": 7.562673107265333e-06, "loss": 0.5691, "step": 571 }, { "epoch": 0.35, "grad_norm": 1.9013543447575418, "learning_rate": 7.554158361092807e-06, "loss": 0.5434, "step": 572 }, { "epoch": 0.35, "grad_norm": 1.82007897551597, "learning_rate": 7.545633580578474e-06, "loss": 0.6298, "step": 573 }, { "epoch": 0.35, "grad_norm": 2.04394739282291, "learning_rate": 7.537098799213036e-06, "loss": 0.622, "step": 574 }, { "epoch": 0.35, "grad_norm": 2.075155956099819, "learning_rate": 7.528554050526489e-06, "loss": 0.6556, "step": 575 }, { "epoch": 0.35, "grad_norm": 2.0315943098160236, "learning_rate": 7.519999368087982e-06, "loss": 0.6453, "step": 576 }, { "epoch": 0.35, "grad_norm": 2.0428160155679786, "learning_rate": 7.511434785505693e-06, "loss": 0.7135, "step": 577 }, { "epoch": 0.35, "grad_norm": 2.222904873771381, "learning_rate": 7.502860336426696e-06, "loss": 0.6357, "step": 578 }, { "epoch": 0.35, "grad_norm": 1.7822608620060818, "learning_rate": 7.494276054536821e-06, "loss": 0.5291, "step": 579 }, { "epoch": 0.35, "grad_norm": 2.0685923305897624, "learning_rate": 7.485681973560532e-06, "loss": 0.5797, "step": 580 }, { "epoch": 0.36, "grad_norm": 1.8859535667625311, "learning_rate": 7.4770781272607895e-06, "loss": 0.548, "step": 581 }, { "epoch": 0.36, "grad_norm": 2.2063159373182093, "learning_rate": 7.468464549438916e-06, "loss": 0.5926, "step": 582 }, { "epoch": 0.36, "grad_norm": 1.8937819274172978, "learning_rate": 7.45984127393447e-06, "loss": 0.6131, "step": 583 }, { "epoch": 0.36, "grad_norm": 2.0362363463005506, "learning_rate": 7.4512083346251026e-06, "loss": 0.5862, "step": 584 }, { "epoch": 0.36, "grad_norm": 2.2409987097836717, "learning_rate": 7.442565765426436e-06, "loss": 0.6329, "step": 585 }, { "epoch": 0.36, "grad_norm": 1.984238720215282, "learning_rate": 7.433913600291921e-06, "loss": 0.6436, "step": 586 }, { "epoch": 0.36, "grad_norm": 2.104431967032393, "learning_rate": 7.425251873212709e-06, "loss": 0.7334, "step": 587 }, { "epoch": 0.36, "grad_norm": 2.1700383544690096, "learning_rate": 7.416580618217515e-06, "loss": 0.6976, "step": 588 }, { "epoch": 0.36, "grad_norm": 2.2359253381623345, "learning_rate": 7.407899869372489e-06, "loss": 0.6529, "step": 589 }, { "epoch": 0.36, "grad_norm": 2.207059006099314, "learning_rate": 7.399209660781075e-06, "loss": 0.6742, "step": 590 }, { "epoch": 0.36, "grad_norm": 2.261704338275933, "learning_rate": 7.390510026583884e-06, "loss": 0.6153, "step": 591 }, { "epoch": 0.36, "grad_norm": 2.0605566199360004, "learning_rate": 7.381801000958554e-06, "loss": 0.6127, "step": 592 }, { "epoch": 0.36, "grad_norm": 1.8448877901078287, "learning_rate": 7.3730826181196206e-06, "loss": 0.5658, "step": 593 }, { "epoch": 0.36, "grad_norm": 1.813700393133709, "learning_rate": 7.364354912318379e-06, "loss": 0.5713, "step": 594 }, { "epoch": 0.36, "grad_norm": 1.8851240301079237, "learning_rate": 7.355617917842751e-06, "loss": 0.551, "step": 595 }, { "epoch": 0.36, "grad_norm": 1.7963234257752434, "learning_rate": 7.346871669017153e-06, "loss": 0.5825, "step": 596 }, { "epoch": 0.37, "grad_norm": 1.9811136419250976, "learning_rate": 7.338116200202352e-06, "loss": 0.6257, "step": 597 }, { "epoch": 0.37, "grad_norm": 2.113085248629396, "learning_rate": 7.329351545795345e-06, "loss": 0.6154, "step": 598 }, { "epoch": 0.37, "grad_norm": 1.664313245894575, "learning_rate": 7.320577740229208e-06, "loss": 0.5348, "step": 599 }, { "epoch": 0.37, "grad_norm": 2.0938115916173095, "learning_rate": 7.311794817972975e-06, "loss": 0.619, "step": 600 }, { "epoch": 0.37, "grad_norm": 1.9914918015459053, "learning_rate": 7.3030028135314905e-06, "loss": 0.5977, "step": 601 }, { "epoch": 0.37, "grad_norm": 1.8973404703566117, "learning_rate": 7.294201761445284e-06, "loss": 0.6016, "step": 602 }, { "epoch": 0.37, "grad_norm": 2.686916719491371, "learning_rate": 7.285391696290427e-06, "loss": 0.5594, "step": 603 }, { "epoch": 0.37, "grad_norm": 1.8914279547975104, "learning_rate": 7.276572652678403e-06, "loss": 0.6548, "step": 604 }, { "epoch": 0.37, "grad_norm": 1.9045214756464477, "learning_rate": 7.267744665255966e-06, "loss": 0.5625, "step": 605 }, { "epoch": 0.37, "grad_norm": 2.0508726477606416, "learning_rate": 7.258907768705006e-06, "loss": 0.5994, "step": 606 }, { "epoch": 0.37, "grad_norm": 2.1572541720871206, "learning_rate": 7.2500619977424154e-06, "loss": 0.6259, "step": 607 }, { "epoch": 0.37, "grad_norm": 1.8740105212119254, "learning_rate": 7.241207387119953e-06, "loss": 0.5498, "step": 608 }, { "epoch": 0.37, "grad_norm": 2.11048827570066, "learning_rate": 7.2323439716241e-06, "loss": 0.6176, "step": 609 }, { "epoch": 0.37, "grad_norm": 2.138108461906426, "learning_rate": 7.223471786075934e-06, "loss": 0.7467, "step": 610 }, { "epoch": 0.37, "grad_norm": 2.1086676582577035, "learning_rate": 7.214590865330984e-06, "loss": 0.6513, "step": 611 }, { "epoch": 0.37, "grad_norm": 1.8758955739955738, "learning_rate": 7.2057012442790975e-06, "loss": 0.5449, "step": 612 }, { "epoch": 0.37, "grad_norm": 1.682499149886398, "learning_rate": 7.1968029578443e-06, "loss": 0.525, "step": 613 }, { "epoch": 0.38, "grad_norm": 1.9868958931777934, "learning_rate": 7.187896040984661e-06, "loss": 0.626, "step": 614 }, { "epoch": 0.38, "grad_norm": 1.9273119955565226, "learning_rate": 7.178980528692161e-06, "loss": 0.6012, "step": 615 }, { "epoch": 0.38, "grad_norm": 2.0192285563021466, "learning_rate": 7.170056455992541e-06, "loss": 0.6065, "step": 616 }, { "epoch": 0.38, "grad_norm": 1.9019456790022062, "learning_rate": 7.161123857945177e-06, "loss": 0.6329, "step": 617 }, { "epoch": 0.38, "grad_norm": 2.024610041244123, "learning_rate": 7.152182769642936e-06, "loss": 0.6359, "step": 618 }, { "epoch": 0.38, "grad_norm": 1.9325040211739186, "learning_rate": 7.143233226212042e-06, "loss": 0.5215, "step": 619 }, { "epoch": 0.38, "grad_norm": 2.012751842328307, "learning_rate": 7.134275262811935e-06, "loss": 0.6432, "step": 620 }, { "epoch": 0.38, "grad_norm": 1.7572649666598243, "learning_rate": 7.1253089146351325e-06, "loss": 0.5677, "step": 621 }, { "epoch": 0.38, "grad_norm": 1.7788331655165412, "learning_rate": 7.116334216907097e-06, "loss": 0.5215, "step": 622 }, { "epoch": 0.38, "grad_norm": 1.9050240439242967, "learning_rate": 7.107351204886088e-06, "loss": 0.5178, "step": 623 }, { "epoch": 0.38, "grad_norm": 2.041894936366493, "learning_rate": 7.098359913863034e-06, "loss": 0.6043, "step": 624 }, { "epoch": 0.38, "grad_norm": 2.0308197433902797, "learning_rate": 7.089360379161381e-06, "loss": 0.6213, "step": 625 }, { "epoch": 0.38, "grad_norm": 1.95222686445269, "learning_rate": 7.08035263613697e-06, "loss": 0.5971, "step": 626 }, { "epoch": 0.38, "grad_norm": 2.017912918523442, "learning_rate": 7.071336720177886e-06, "loss": 0.6046, "step": 627 }, { "epoch": 0.38, "grad_norm": 2.166790566645372, "learning_rate": 7.062312666704321e-06, "loss": 0.5927, "step": 628 }, { "epoch": 0.38, "grad_norm": 2.2400201098243544, "learning_rate": 7.053280511168437e-06, "loss": 0.7107, "step": 629 }, { "epoch": 0.39, "grad_norm": 1.8761384322160164, "learning_rate": 7.044240289054227e-06, "loss": 0.5877, "step": 630 }, { "epoch": 0.39, "grad_norm": 1.8121190685789235, "learning_rate": 7.035192035877374e-06, "loss": 0.5278, "step": 631 }, { "epoch": 0.39, "grad_norm": 2.291146349707187, "learning_rate": 7.026135787185113e-06, "loss": 0.6674, "step": 632 }, { "epoch": 0.39, "grad_norm": 1.9115866137344344, "learning_rate": 7.017071578556088e-06, "loss": 0.6101, "step": 633 }, { "epoch": 0.39, "grad_norm": 2.2159502369044746, "learning_rate": 7.007999445600216e-06, "loss": 0.6451, "step": 634 }, { "epoch": 0.39, "grad_norm": 1.94968047657449, "learning_rate": 6.998919423958548e-06, "loss": 0.6115, "step": 635 }, { "epoch": 0.39, "grad_norm": 2.0483508425325208, "learning_rate": 6.989831549303121e-06, "loss": 0.5641, "step": 636 }, { "epoch": 0.39, "grad_norm": 2.078362428704396, "learning_rate": 6.980735857336831e-06, "loss": 0.5859, "step": 637 }, { "epoch": 0.39, "grad_norm": 2.102194806164863, "learning_rate": 6.971632383793278e-06, "loss": 0.5956, "step": 638 }, { "epoch": 0.39, "grad_norm": 2.1562012485508766, "learning_rate": 6.962521164436641e-06, "loss": 0.6522, "step": 639 }, { "epoch": 0.39, "grad_norm": 1.942549118113248, "learning_rate": 6.953402235061519e-06, "loss": 0.5656, "step": 640 }, { "epoch": 0.39, "grad_norm": 2.032598660713363, "learning_rate": 6.944275631492813e-06, "loss": 0.6328, "step": 641 }, { "epoch": 0.39, "grad_norm": 2.120207767189764, "learning_rate": 6.935141389585562e-06, "loss": 0.6283, "step": 642 }, { "epoch": 0.39, "grad_norm": 1.846891984881128, "learning_rate": 6.925999545224819e-06, "loss": 0.5348, "step": 643 }, { "epoch": 0.39, "grad_norm": 2.2117072258313515, "learning_rate": 6.916850134325505e-06, "loss": 0.5428, "step": 644 }, { "epoch": 0.39, "grad_norm": 1.9428888699888005, "learning_rate": 6.907693192832263e-06, "loss": 0.6194, "step": 645 }, { "epoch": 0.39, "grad_norm": 2.324654552066874, "learning_rate": 6.898528756719325e-06, "loss": 0.6157, "step": 646 }, { "epoch": 0.4, "grad_norm": 2.105488134378262, "learning_rate": 6.8893568619903625e-06, "loss": 0.6574, "step": 647 }, { "epoch": 0.4, "grad_norm": 2.0741815083758803, "learning_rate": 6.8801775446783545e-06, "loss": 0.681, "step": 648 }, { "epoch": 0.4, "grad_norm": 1.8599982138936229, "learning_rate": 6.870990840845435e-06, "loss": 0.532, "step": 649 }, { "epoch": 0.4, "grad_norm": 1.8066163998362903, "learning_rate": 6.861796786582761e-06, "loss": 0.5864, "step": 650 }, { "epoch": 0.4, "grad_norm": 2.103633090126261, "learning_rate": 6.852595418010364e-06, "loss": 0.6276, "step": 651 }, { "epoch": 0.4, "grad_norm": 2.0271857194621994, "learning_rate": 6.843386771277012e-06, "loss": 0.6113, "step": 652 }, { "epoch": 0.4, "grad_norm": 1.9892261757816698, "learning_rate": 6.834170882560066e-06, "loss": 0.6066, "step": 653 }, { "epoch": 0.4, "grad_norm": 2.0920982211462142, "learning_rate": 6.824947788065339e-06, "loss": 0.6631, "step": 654 }, { "epoch": 0.4, "grad_norm": 1.7304966407527353, "learning_rate": 6.8157175240269495e-06, "loss": 0.5458, "step": 655 }, { "epoch": 0.4, "grad_norm": 1.9003138804763595, "learning_rate": 6.806480126707187e-06, "loss": 0.6121, "step": 656 }, { "epoch": 0.4, "grad_norm": 1.9727053822571718, "learning_rate": 6.797235632396362e-06, "loss": 0.6235, "step": 657 }, { "epoch": 0.4, "grad_norm": 2.1447934975774325, "learning_rate": 6.787984077412666e-06, "loss": 0.652, "step": 658 }, { "epoch": 0.4, "grad_norm": 2.0660746773365775, "learning_rate": 6.7787254981020335e-06, "loss": 0.6679, "step": 659 }, { "epoch": 0.4, "grad_norm": 2.0622987551332597, "learning_rate": 6.7694599308379895e-06, "loss": 0.6033, "step": 660 }, { "epoch": 0.4, "grad_norm": 1.9723578845421632, "learning_rate": 6.760187412021516e-06, "loss": 0.6082, "step": 661 }, { "epoch": 0.4, "grad_norm": 1.7982428478028805, "learning_rate": 6.750907978080902e-06, "loss": 0.5334, "step": 662 }, { "epoch": 0.41, "grad_norm": 2.036081125390073, "learning_rate": 6.741621665471607e-06, "loss": 0.6212, "step": 663 }, { "epoch": 0.41, "grad_norm": 2.1493033458896664, "learning_rate": 6.732328510676111e-06, "loss": 0.6751, "step": 664 }, { "epoch": 0.41, "grad_norm": 2.070635996051103, "learning_rate": 6.723028550203779e-06, "loss": 0.5758, "step": 665 }, { "epoch": 0.41, "grad_norm": 1.9050719437104036, "learning_rate": 6.7137218205907036e-06, "loss": 0.54, "step": 666 }, { "epoch": 0.41, "grad_norm": 1.7928377835662002, "learning_rate": 6.704408358399583e-06, "loss": 0.5676, "step": 667 }, { "epoch": 0.41, "grad_norm": 2.027588661623482, "learning_rate": 6.695088200219557e-06, "loss": 0.5546, "step": 668 }, { "epoch": 0.41, "grad_norm": 1.9325373078264918, "learning_rate": 6.6857613826660714e-06, "loss": 0.5941, "step": 669 }, { "epoch": 0.41, "grad_norm": 1.9172475345332523, "learning_rate": 6.676427942380741e-06, "loss": 0.5328, "step": 670 }, { "epoch": 0.41, "grad_norm": 2.1396308238670367, "learning_rate": 6.667087916031192e-06, "loss": 0.6748, "step": 671 }, { "epoch": 0.41, "grad_norm": 1.8568393271779622, "learning_rate": 6.657741340310927e-06, "loss": 0.5975, "step": 672 }, { "epoch": 0.41, "grad_norm": 2.0294986249307394, "learning_rate": 6.648388251939177e-06, "loss": 0.6111, "step": 673 }, { "epoch": 0.41, "grad_norm": 1.9234325700371586, "learning_rate": 6.639028687660766e-06, "loss": 0.596, "step": 674 }, { "epoch": 0.41, "grad_norm": 2.0366203200056088, "learning_rate": 6.629662684245949e-06, "loss": 0.5688, "step": 675 }, { "epoch": 0.41, "grad_norm": 1.8958934625265222, "learning_rate": 6.620290278490284e-06, "loss": 0.5791, "step": 676 }, { "epoch": 0.41, "grad_norm": 1.806822731262611, "learning_rate": 6.610911507214482e-06, "loss": 0.5465, "step": 677 }, { "epoch": 0.41, "grad_norm": 1.8876652924106438, "learning_rate": 6.601526407264261e-06, "loss": 0.5537, "step": 678 }, { "epoch": 0.42, "grad_norm": 2.0030181777961156, "learning_rate": 6.592135015510197e-06, "loss": 0.6045, "step": 679 }, { "epoch": 0.42, "grad_norm": 1.97710315660336, "learning_rate": 6.5827373688475925e-06, "loss": 0.5725, "step": 680 }, { "epoch": 0.42, "grad_norm": 2.01659583449962, "learning_rate": 6.5733335041963175e-06, "loss": 0.6237, "step": 681 }, { "epoch": 0.42, "grad_norm": 1.8805423446258591, "learning_rate": 6.563923458500672e-06, "loss": 0.5479, "step": 682 }, { "epoch": 0.42, "grad_norm": 1.7669993233431147, "learning_rate": 6.554507268729238e-06, "loss": 0.5109, "step": 683 }, { "epoch": 0.42, "grad_norm": 1.8269014493705453, "learning_rate": 6.545084971874738e-06, "loss": 0.5462, "step": 684 }, { "epoch": 0.42, "grad_norm": 2.0359088067386786, "learning_rate": 6.535656604953884e-06, "loss": 0.6384, "step": 685 }, { "epoch": 0.42, "grad_norm": 1.9599189973913222, "learning_rate": 6.526222205007236e-06, "loss": 0.5452, "step": 686 }, { "epoch": 0.42, "grad_norm": 1.7645021807661985, "learning_rate": 6.516781809099055e-06, "loss": 0.4752, "step": 687 }, { "epoch": 0.42, "grad_norm": 1.9917551528037687, "learning_rate": 6.507335454317161e-06, "loss": 0.5545, "step": 688 }, { "epoch": 0.42, "grad_norm": 2.026042072169137, "learning_rate": 6.497883177772779e-06, "loss": 0.627, "step": 689 }, { "epoch": 0.42, "grad_norm": 1.8861715324088848, "learning_rate": 6.488425016600403e-06, "loss": 0.6235, "step": 690 }, { "epoch": 0.42, "grad_norm": 1.957150362035283, "learning_rate": 6.4789610079576426e-06, "loss": 0.5386, "step": 691 }, { "epoch": 0.42, "grad_norm": 2.1453992014090364, "learning_rate": 6.469491189025081e-06, "loss": 0.6518, "step": 692 }, { "epoch": 0.42, "grad_norm": 1.765906720730784, "learning_rate": 6.46001559700613e-06, "loss": 0.6203, "step": 693 }, { "epoch": 0.42, "grad_norm": 2.260525086713535, "learning_rate": 6.450534269126878e-06, "loss": 0.6806, "step": 694 }, { "epoch": 0.42, "grad_norm": 2.171042858463267, "learning_rate": 6.441047242635947e-06, "loss": 0.6542, "step": 695 }, { "epoch": 0.43, "grad_norm": 1.9103992956138385, "learning_rate": 6.431554554804353e-06, "loss": 0.6342, "step": 696 }, { "epoch": 0.43, "grad_norm": 2.0002914383944974, "learning_rate": 6.422056242925347e-06, "loss": 0.561, "step": 697 }, { "epoch": 0.43, "grad_norm": 1.8379292896391608, "learning_rate": 6.412552344314279e-06, "loss": 0.5599, "step": 698 }, { "epoch": 0.43, "grad_norm": 1.9748932397312229, "learning_rate": 6.40304289630844e-06, "loss": 0.5952, "step": 699 }, { "epoch": 0.43, "grad_norm": 2.1614553712542253, "learning_rate": 6.3935279362669335e-06, "loss": 0.6412, "step": 700 }, { "epoch": 0.43, "grad_norm": 1.9859247585151905, "learning_rate": 6.384007501570509e-06, "loss": 0.6359, "step": 701 }, { "epoch": 0.43, "grad_norm": 1.8136261988901872, "learning_rate": 6.374481629621427e-06, "loss": 0.5893, "step": 702 }, { "epoch": 0.43, "grad_norm": 2.0161944625478574, "learning_rate": 6.364950357843309e-06, "loss": 0.5371, "step": 703 }, { "epoch": 0.43, "grad_norm": 1.8707294403008965, "learning_rate": 6.355413723680991e-06, "loss": 0.606, "step": 704 }, { "epoch": 0.43, "grad_norm": 2.083870266773342, "learning_rate": 6.3458717646003746e-06, "loss": 0.5857, "step": 705 }, { "epoch": 0.43, "grad_norm": 2.21481293408251, "learning_rate": 6.33632451808828e-06, "loss": 0.5945, "step": 706 }, { "epoch": 0.43, "grad_norm": 1.8209514169327161, "learning_rate": 6.326772021652303e-06, "loss": 0.561, "step": 707 }, { "epoch": 0.43, "grad_norm": 1.7765877330909154, "learning_rate": 6.317214312820662e-06, "loss": 0.5808, "step": 708 }, { "epoch": 0.43, "grad_norm": 2.301880122628837, "learning_rate": 6.307651429142053e-06, "loss": 0.6169, "step": 709 }, { "epoch": 0.43, "grad_norm": 1.9511858179855806, "learning_rate": 6.298083408185503e-06, "loss": 0.5485, "step": 710 }, { "epoch": 0.43, "grad_norm": 1.8733084336465669, "learning_rate": 6.288510287540221e-06, "loss": 0.5414, "step": 711 }, { "epoch": 0.44, "grad_norm": 1.9100686536267126, "learning_rate": 6.278932104815453e-06, "loss": 0.5177, "step": 712 }, { "epoch": 0.44, "grad_norm": 1.980724894367988, "learning_rate": 6.269348897640327e-06, "loss": 0.5847, "step": 713 }, { "epoch": 0.44, "grad_norm": 2.4372622910469515, "learning_rate": 6.259760703663713e-06, "loss": 0.6332, "step": 714 }, { "epoch": 0.44, "grad_norm": 1.7666399516614875, "learning_rate": 6.2501675605540755e-06, "loss": 0.4731, "step": 715 }, { "epoch": 0.44, "grad_norm": 1.9269831431359743, "learning_rate": 6.240569505999317e-06, "loss": 0.5864, "step": 716 }, { "epoch": 0.44, "grad_norm": 2.1889446969469306, "learning_rate": 6.230966577706637e-06, "loss": 0.6465, "step": 717 }, { "epoch": 0.44, "grad_norm": 2.0970566330713036, "learning_rate": 6.221358813402383e-06, "loss": 0.6136, "step": 718 }, { "epoch": 0.44, "grad_norm": 2.05054134285554, "learning_rate": 6.211746250831902e-06, "loss": 0.6313, "step": 719 }, { "epoch": 0.44, "grad_norm": 1.997693167566272, "learning_rate": 6.202128927759391e-06, "loss": 0.5838, "step": 720 }, { "epoch": 0.44, "grad_norm": 1.902952046522087, "learning_rate": 6.192506881967746e-06, "loss": 0.5913, "step": 721 }, { "epoch": 0.44, "grad_norm": 2.118349972159298, "learning_rate": 6.182880151258422e-06, "loss": 0.6401, "step": 722 }, { "epoch": 0.44, "grad_norm": 1.7923389650042116, "learning_rate": 6.173248773451278e-06, "loss": 0.4488, "step": 723 }, { "epoch": 0.44, "grad_norm": 2.0358206659939206, "learning_rate": 6.163612786384426e-06, "loss": 0.5871, "step": 724 }, { "epoch": 0.44, "grad_norm": 1.9373816200953502, "learning_rate": 6.153972227914089e-06, "loss": 0.6472, "step": 725 }, { "epoch": 0.44, "grad_norm": 1.7741801403344204, "learning_rate": 6.144327135914452e-06, "loss": 0.5512, "step": 726 }, { "epoch": 0.44, "grad_norm": 2.000681880884647, "learning_rate": 6.134677548277504e-06, "loss": 0.6792, "step": 727 }, { "epoch": 0.45, "grad_norm": 2.030783168366151, "learning_rate": 6.125023502912901e-06, "loss": 0.6046, "step": 728 }, { "epoch": 0.45, "grad_norm": 2.0794685094461802, "learning_rate": 6.1153650377478116e-06, "loss": 0.6356, "step": 729 }, { "epoch": 0.45, "grad_norm": 2.0447620516144394, "learning_rate": 6.105702190726765e-06, "loss": 0.6179, "step": 730 }, { "epoch": 0.45, "grad_norm": 2.0749186785935803, "learning_rate": 6.096034999811507e-06, "loss": 0.6269, "step": 731 }, { "epoch": 0.45, "grad_norm": 2.099298563572386, "learning_rate": 6.086363502980848e-06, "loss": 0.5107, "step": 732 }, { "epoch": 0.45, "grad_norm": 2.143595827433539, "learning_rate": 6.076687738230517e-06, "loss": 0.613, "step": 733 }, { "epoch": 0.45, "grad_norm": 2.2534551834297574, "learning_rate": 6.067007743573007e-06, "loss": 0.6627, "step": 734 }, { "epoch": 0.45, "grad_norm": 2.196821231032266, "learning_rate": 6.0573235570374315e-06, "loss": 0.6868, "step": 735 }, { "epoch": 0.45, "grad_norm": 1.9274444301473674, "learning_rate": 6.04763521666937e-06, "loss": 0.6234, "step": 736 }, { "epoch": 0.45, "grad_norm": 2.0827989695986906, "learning_rate": 6.037942760530722e-06, "loss": 0.5338, "step": 737 }, { "epoch": 0.45, "grad_norm": 1.7904575978539012, "learning_rate": 6.028246226699559e-06, "loss": 0.5255, "step": 738 }, { "epoch": 0.45, "grad_norm": 1.8597042651935416, "learning_rate": 6.018545653269967e-06, "loss": 0.5604, "step": 739 }, { "epoch": 0.45, "grad_norm": 1.925996097217488, "learning_rate": 6.008841078351903e-06, "loss": 0.5435, "step": 740 }, { "epoch": 0.45, "grad_norm": 1.726239669389769, "learning_rate": 5.9991325400710506e-06, "loss": 0.5033, "step": 741 }, { "epoch": 0.45, "grad_norm": 1.8668368547030405, "learning_rate": 5.9894200765686574e-06, "loss": 0.5801, "step": 742 }, { "epoch": 0.45, "grad_norm": 2.0334452116466037, "learning_rate": 5.9797037260013915e-06, "loss": 0.6715, "step": 743 }, { "epoch": 0.45, "grad_norm": 2.1692961748152384, "learning_rate": 5.969983526541197e-06, "loss": 0.6002, "step": 744 }, { "epoch": 0.46, "grad_norm": 1.8242102907354445, "learning_rate": 5.960259516375134e-06, "loss": 0.5459, "step": 745 }, { "epoch": 0.46, "grad_norm": 1.9465644945877867, "learning_rate": 5.950531733705237e-06, "loss": 0.5633, "step": 746 }, { "epoch": 0.46, "grad_norm": 2.033416173745934, "learning_rate": 5.940800216748357e-06, "loss": 0.595, "step": 747 }, { "epoch": 0.46, "grad_norm": 2.029664712146445, "learning_rate": 5.9310650037360226e-06, "loss": 0.636, "step": 748 }, { "epoch": 0.46, "grad_norm": 1.949211782466542, "learning_rate": 5.921326132914275e-06, "loss": 0.5598, "step": 749 }, { "epoch": 0.46, "grad_norm": 2.130586940499767, "learning_rate": 5.911583642543532e-06, "loss": 0.6793, "step": 750 }, { "epoch": 0.46, "grad_norm": 2.037641386476974, "learning_rate": 5.901837570898425e-06, "loss": 0.6281, "step": 751 }, { "epoch": 0.46, "grad_norm": 2.063158938340875, "learning_rate": 5.892087956267659e-06, "loss": 0.5975, "step": 752 }, { "epoch": 0.46, "grad_norm": 2.0120237220111954, "learning_rate": 5.88233483695386e-06, "loss": 0.5072, "step": 753 }, { "epoch": 0.46, "grad_norm": 1.8881476646969595, "learning_rate": 5.872578251273418e-06, "loss": 0.5661, "step": 754 }, { "epoch": 0.46, "grad_norm": 1.8984742507906354, "learning_rate": 5.862818237556344e-06, "loss": 0.5364, "step": 755 }, { "epoch": 0.46, "grad_norm": 2.041836701931837, "learning_rate": 5.8530548341461125e-06, "loss": 0.6654, "step": 756 }, { "epoch": 0.46, "grad_norm": 1.9187386355732121, "learning_rate": 5.843288079399523e-06, "loss": 0.5945, "step": 757 }, { "epoch": 0.46, "grad_norm": 1.929660691723023, "learning_rate": 5.833518011686531e-06, "loss": 0.523, "step": 758 }, { "epoch": 0.46, "grad_norm": 2.150008834991264, "learning_rate": 5.823744669390115e-06, "loss": 0.669, "step": 759 }, { "epoch": 0.46, "grad_norm": 2.124617912097345, "learning_rate": 5.813968090906117e-06, "loss": 0.5963, "step": 760 }, { "epoch": 0.47, "grad_norm": 2.0097512818088563, "learning_rate": 5.804188314643088e-06, "loss": 0.5946, "step": 761 }, { "epoch": 0.47, "grad_norm": 1.7904830455893548, "learning_rate": 5.794405379022147e-06, "loss": 0.5818, "step": 762 }, { "epoch": 0.47, "grad_norm": 2.1097636641498805, "learning_rate": 5.784619322476822e-06, "loss": 0.5711, "step": 763 }, { "epoch": 0.47, "grad_norm": 2.1115025364007636, "learning_rate": 5.774830183452905e-06, "loss": 0.5844, "step": 764 }, { "epoch": 0.47, "grad_norm": 2.0129995897966833, "learning_rate": 5.765038000408295e-06, "loss": 0.5759, "step": 765 }, { "epoch": 0.47, "grad_norm": 1.9329010751228102, "learning_rate": 5.755242811812851e-06, "loss": 0.5464, "step": 766 }, { "epoch": 0.47, "grad_norm": 2.0085429975812055, "learning_rate": 5.74544465614824e-06, "loss": 0.5751, "step": 767 }, { "epoch": 0.47, "grad_norm": 1.9903327654676763, "learning_rate": 5.735643571907785e-06, "loss": 0.6458, "step": 768 }, { "epoch": 0.47, "grad_norm": 2.1067344237718393, "learning_rate": 5.725839597596312e-06, "loss": 0.6115, "step": 769 }, { "epoch": 0.47, "grad_norm": 1.9189891230884772, "learning_rate": 5.716032771730008e-06, "loss": 0.5441, "step": 770 }, { "epoch": 0.47, "grad_norm": 2.4011718668135993, "learning_rate": 5.706223132836255e-06, "loss": 0.5773, "step": 771 }, { "epoch": 0.47, "grad_norm": 2.072775046614376, "learning_rate": 5.69641071945349e-06, "loss": 0.6463, "step": 772 }, { "epoch": 0.47, "grad_norm": 1.9935114006477437, "learning_rate": 5.686595570131048e-06, "loss": 0.5186, "step": 773 }, { "epoch": 0.47, "grad_norm": 1.844020593200682, "learning_rate": 5.6767777234290165e-06, "loss": 0.5469, "step": 774 }, { "epoch": 0.47, "grad_norm": 2.0393335340113743, "learning_rate": 5.666957217918076e-06, "loss": 0.6512, "step": 775 }, { "epoch": 0.47, "grad_norm": 1.9394154490786393, "learning_rate": 5.657134092179354e-06, "loss": 0.603, "step": 776 }, { "epoch": 0.48, "grad_norm": 2.005329689227122, "learning_rate": 5.647308384804272e-06, "loss": 0.6182, "step": 777 }, { "epoch": 0.48, "grad_norm": 2.170791771636413, "learning_rate": 5.637480134394394e-06, "loss": 0.6277, "step": 778 }, { "epoch": 0.48, "grad_norm": 2.001689578136617, "learning_rate": 5.627649379561273e-06, "loss": 0.6089, "step": 779 }, { "epoch": 0.48, "grad_norm": 2.0454235186622114, "learning_rate": 5.617816158926303e-06, "loss": 0.5878, "step": 780 }, { "epoch": 0.48, "grad_norm": 1.8823086840222076, "learning_rate": 5.607980511120565e-06, "loss": 0.5706, "step": 781 }, { "epoch": 0.48, "grad_norm": 1.80269943780875, "learning_rate": 5.598142474784671e-06, "loss": 0.4961, "step": 782 }, { "epoch": 0.48, "grad_norm": 2.033336270252808, "learning_rate": 5.588302088568625e-06, "loss": 0.556, "step": 783 }, { "epoch": 0.48, "grad_norm": 1.9778139575223732, "learning_rate": 5.578459391131657e-06, "loss": 0.5302, "step": 784 }, { "epoch": 0.48, "grad_norm": 2.020736561300123, "learning_rate": 5.568614421142078e-06, "loss": 0.5978, "step": 785 }, { "epoch": 0.48, "grad_norm": 1.9174222510669499, "learning_rate": 5.558767217277127e-06, "loss": 0.4907, "step": 786 }, { "epoch": 0.48, "grad_norm": 1.8250592907674714, "learning_rate": 5.548917818222818e-06, "loss": 0.5262, "step": 787 }, { "epoch": 0.48, "grad_norm": 1.966787711230393, "learning_rate": 5.539066262673793e-06, "loss": 0.6737, "step": 788 }, { "epoch": 0.48, "grad_norm": 1.855643229521311, "learning_rate": 5.529212589333163e-06, "loss": 0.5955, "step": 789 }, { "epoch": 0.48, "grad_norm": 2.1346079614371543, "learning_rate": 5.5193568369123576e-06, "loss": 0.5729, "step": 790 }, { "epoch": 0.48, "grad_norm": 2.0515766199556706, "learning_rate": 5.509499044130977e-06, "loss": 0.5719, "step": 791 }, { "epoch": 0.48, "grad_norm": 2.0107844202744336, "learning_rate": 5.4996392497166375e-06, "loss": 0.6046, "step": 792 }, { "epoch": 0.48, "grad_norm": 1.9275125168152694, "learning_rate": 5.489777492404818e-06, "loss": 0.5201, "step": 793 }, { "epoch": 0.49, "grad_norm": 2.0862073890728428, "learning_rate": 5.479913810938706e-06, "loss": 0.6474, "step": 794 }, { "epoch": 0.49, "grad_norm": 1.947899506277237, "learning_rate": 5.470048244069055e-06, "loss": 0.6276, "step": 795 }, { "epoch": 0.49, "grad_norm": 1.938490880806152, "learning_rate": 5.46018083055402e-06, "loss": 0.573, "step": 796 }, { "epoch": 0.49, "grad_norm": 1.9073385387532762, "learning_rate": 5.450311609159013e-06, "loss": 0.5404, "step": 797 }, { "epoch": 0.49, "grad_norm": 1.809537621124168, "learning_rate": 5.4404406186565465e-06, "loss": 0.541, "step": 798 }, { "epoch": 0.49, "grad_norm": 1.9527437330399584, "learning_rate": 5.430567897826086e-06, "loss": 0.6258, "step": 799 }, { "epoch": 0.49, "grad_norm": 1.7508264552185595, "learning_rate": 5.420693485453893e-06, "loss": 0.5149, "step": 800 }, { "epoch": 0.49, "grad_norm": 2.0056514367983858, "learning_rate": 5.410817420332876e-06, "loss": 0.5755, "step": 801 }, { "epoch": 0.49, "grad_norm": 2.035682132284417, "learning_rate": 5.400939741262434e-06, "loss": 0.6091, "step": 802 }, { "epoch": 0.49, "grad_norm": 2.039715629729808, "learning_rate": 5.39106048704831e-06, "loss": 0.6284, "step": 803 }, { "epoch": 0.49, "grad_norm": 1.9509583461522269, "learning_rate": 5.381179696502432e-06, "loss": 0.6541, "step": 804 }, { "epoch": 0.49, "grad_norm": 2.101000418400194, "learning_rate": 5.371297408442765e-06, "loss": 0.719, "step": 805 }, { "epoch": 0.49, "grad_norm": 2.1274177930152187, "learning_rate": 5.361413661693157e-06, "loss": 0.6933, "step": 806 }, { "epoch": 0.49, "grad_norm": 1.9350531172343641, "learning_rate": 5.351528495083187e-06, "loss": 0.5427, "step": 807 }, { "epoch": 0.49, "grad_norm": 1.9236019600576935, "learning_rate": 5.341641947448011e-06, "loss": 0.5427, "step": 808 }, { "epoch": 0.49, "grad_norm": 1.896084411851985, "learning_rate": 5.331754057628212e-06, "loss": 0.5404, "step": 809 }, { "epoch": 0.5, "grad_norm": 2.1943365136159345, "learning_rate": 5.321864864469646e-06, "loss": 0.6178, "step": 810 }, { "epoch": 0.5, "grad_norm": 2.0214323469529307, "learning_rate": 5.311974406823288e-06, "loss": 0.5394, "step": 811 }, { "epoch": 0.5, "grad_norm": 1.867537229859426, "learning_rate": 5.3020827235450815e-06, "loss": 0.5502, "step": 812 }, { "epoch": 0.5, "grad_norm": 2.234343663037103, "learning_rate": 5.292189853495784e-06, "loss": 0.6277, "step": 813 }, { "epoch": 0.5, "grad_norm": 2.013802275182187, "learning_rate": 5.282295835540818e-06, "loss": 0.6056, "step": 814 }, { "epoch": 0.5, "grad_norm": 1.9513906655142625, "learning_rate": 5.272400708550114e-06, "loss": 0.5685, "step": 815 }, { "epoch": 0.5, "grad_norm": 1.9338299630529332, "learning_rate": 5.262504511397959e-06, "loss": 0.592, "step": 816 }, { "epoch": 0.5, "grad_norm": 1.7548541609411559, "learning_rate": 5.252607282962843e-06, "loss": 0.526, "step": 817 }, { "epoch": 0.5, "grad_norm": 2.0616714683528667, "learning_rate": 5.2427090621273114e-06, "loss": 0.5529, "step": 818 }, { "epoch": 0.5, "grad_norm": 1.7804791451461532, "learning_rate": 5.232809887777807e-06, "loss": 0.5478, "step": 819 }, { "epoch": 0.5, "grad_norm": 1.826725496699057, "learning_rate": 5.222909798804515e-06, "loss": 0.5544, "step": 820 }, { "epoch": 0.5, "grad_norm": 2.138811923531637, "learning_rate": 5.213008834101218e-06, "loss": 0.643, "step": 821 }, { "epoch": 0.5, "grad_norm": 1.9873736384117076, "learning_rate": 5.20310703256514e-06, "loss": 0.6616, "step": 822 }, { "epoch": 0.5, "grad_norm": 1.9208415386150814, "learning_rate": 5.193204433096787e-06, "loss": 0.5055, "step": 823 }, { "epoch": 0.5, "grad_norm": 1.9813842045072931, "learning_rate": 5.183301074599805e-06, "loss": 0.6327, "step": 824 }, { "epoch": 0.5, "grad_norm": 1.908652107185451, "learning_rate": 5.173396995980818e-06, "loss": 0.6359, "step": 825 }, { "epoch": 0.51, "grad_norm": 2.0742468419024847, "learning_rate": 5.1634922361492845e-06, "loss": 0.6413, "step": 826 }, { "epoch": 0.51, "grad_norm": 1.9352720515169122, "learning_rate": 5.153586834017333e-06, "loss": 0.4937, "step": 827 }, { "epoch": 0.51, "grad_norm": 1.8636055456230387, "learning_rate": 5.14368082849962e-06, "loss": 0.5491, "step": 828 }, { "epoch": 0.51, "grad_norm": 2.2525115422822255, "learning_rate": 5.133774258513168e-06, "loss": 0.6518, "step": 829 }, { "epoch": 0.51, "grad_norm": 1.976929887451241, "learning_rate": 5.123867162977224e-06, "loss": 0.5955, "step": 830 }, { "epoch": 0.51, "grad_norm": 2.1238491116296787, "learning_rate": 5.1139595808130915e-06, "loss": 0.5438, "step": 831 }, { "epoch": 0.51, "grad_norm": 1.9460536517410532, "learning_rate": 5.1040515509439926e-06, "loss": 0.6111, "step": 832 }, { "epoch": 0.51, "grad_norm": 1.8502322758352145, "learning_rate": 5.0941431122949044e-06, "loss": 0.5802, "step": 833 }, { "epoch": 0.51, "grad_norm": 2.1312052237471226, "learning_rate": 5.08423430379241e-06, "loss": 0.6531, "step": 834 }, { "epoch": 0.51, "grad_norm": 1.955948461366251, "learning_rate": 5.074325164364549e-06, "loss": 0.576, "step": 835 }, { "epoch": 0.51, "grad_norm": 2.2603660355638016, "learning_rate": 5.064415732940654e-06, "loss": 0.6709, "step": 836 }, { "epoch": 0.51, "grad_norm": 2.2004715834934854, "learning_rate": 5.054506048451214e-06, "loss": 0.7273, "step": 837 }, { "epoch": 0.51, "grad_norm": 1.9625833391118874, "learning_rate": 5.044596149827705e-06, "loss": 0.5655, "step": 838 }, { "epoch": 0.51, "grad_norm": 2.0367810488166196, "learning_rate": 5.034686076002447e-06, "loss": 0.5503, "step": 839 }, { "epoch": 0.51, "grad_norm": 2.0781271470418865, "learning_rate": 5.024775865908451e-06, "loss": 0.5408, "step": 840 }, { "epoch": 0.51, "grad_norm": 1.8174563416303517, "learning_rate": 5.014865558479257e-06, "loss": 0.5601, "step": 841 }, { "epoch": 0.51, "grad_norm": 2.04027597278746, "learning_rate": 5.004955192648791e-06, "loss": 0.5129, "step": 842 }, { "epoch": 0.52, "grad_norm": 1.929086047655504, "learning_rate": 4.9950448073512096e-06, "loss": 0.6012, "step": 843 }, { "epoch": 0.52, "grad_norm": 2.0846476788174018, "learning_rate": 4.9851344415207455e-06, "loss": 0.5691, "step": 844 }, { "epoch": 0.52, "grad_norm": 2.015199227101593, "learning_rate": 4.975224134091551e-06, "loss": 0.626, "step": 845 }, { "epoch": 0.52, "grad_norm": 2.005830472801361, "learning_rate": 4.965313923997552e-06, "loss": 0.5876, "step": 846 }, { "epoch": 0.52, "grad_norm": 2.11312125492647, "learning_rate": 4.955403850172297e-06, "loss": 0.5779, "step": 847 }, { "epoch": 0.52, "grad_norm": 1.964404887222109, "learning_rate": 4.945493951548788e-06, "loss": 0.5264, "step": 848 }, { "epoch": 0.52, "grad_norm": 1.632455019293396, "learning_rate": 4.935584267059346e-06, "loss": 0.4701, "step": 849 }, { "epoch": 0.52, "grad_norm": 1.9988491228675496, "learning_rate": 4.925674835635455e-06, "loss": 0.604, "step": 850 }, { "epoch": 0.52, "grad_norm": 1.9517240575905959, "learning_rate": 4.915765696207591e-06, "loss": 0.6134, "step": 851 }, { "epoch": 0.52, "grad_norm": 1.8771210243391112, "learning_rate": 4.905856887705097e-06, "loss": 0.5352, "step": 852 }, { "epoch": 0.52, "grad_norm": 1.9010355843007118, "learning_rate": 4.895948449056008e-06, "loss": 0.5825, "step": 853 }, { "epoch": 0.52, "grad_norm": 1.8640061544368143, "learning_rate": 4.886040419186909e-06, "loss": 0.536, "step": 854 }, { "epoch": 0.52, "grad_norm": 2.127850537210119, "learning_rate": 4.876132837022778e-06, "loss": 0.7484, "step": 855 }, { "epoch": 0.52, "grad_norm": 2.035416663771683, "learning_rate": 4.866225741486833e-06, "loss": 0.5556, "step": 856 }, { "epoch": 0.52, "grad_norm": 1.8112553709887884, "learning_rate": 4.856319171500382e-06, "loss": 0.5089, "step": 857 }, { "epoch": 0.52, "grad_norm": 1.7461261116319204, "learning_rate": 4.846413165982668e-06, "loss": 0.5798, "step": 858 }, { "epoch": 0.53, "grad_norm": 1.8200383652508103, "learning_rate": 4.836507763850717e-06, "loss": 0.5644, "step": 859 }, { "epoch": 0.53, "grad_norm": 2.0210685681015517, "learning_rate": 4.826603004019182e-06, "loss": 0.6028, "step": 860 }, { "epoch": 0.53, "grad_norm": 2.0488262467671654, "learning_rate": 4.816698925400197e-06, "loss": 0.6634, "step": 861 }, { "epoch": 0.53, "grad_norm": 1.9045043503411678, "learning_rate": 4.806795566903214e-06, "loss": 0.5246, "step": 862 }, { "epoch": 0.53, "grad_norm": 1.903132223526836, "learning_rate": 4.796892967434861e-06, "loss": 0.5501, "step": 863 }, { "epoch": 0.53, "grad_norm": 1.9775121455691418, "learning_rate": 4.7869911658987825e-06, "loss": 0.5821, "step": 864 }, { "epoch": 0.53, "grad_norm": 2.134944135303822, "learning_rate": 4.777090201195486e-06, "loss": 0.5914, "step": 865 }, { "epoch": 0.53, "grad_norm": 1.8017818510043424, "learning_rate": 4.767190112222196e-06, "loss": 0.5215, "step": 866 }, { "epoch": 0.53, "grad_norm": 1.8986193975250871, "learning_rate": 4.757290937872689e-06, "loss": 0.5674, "step": 867 }, { "epoch": 0.53, "grad_norm": 2.198006939268661, "learning_rate": 4.747392717037158e-06, "loss": 0.6696, "step": 868 }, { "epoch": 0.53, "grad_norm": 1.9844558939372063, "learning_rate": 4.737495488602044e-06, "loss": 0.6495, "step": 869 }, { "epoch": 0.53, "grad_norm": 1.8377231311260462, "learning_rate": 4.727599291449887e-06, "loss": 0.526, "step": 870 }, { "epoch": 0.53, "grad_norm": 2.1843148217052795, "learning_rate": 4.717704164459182e-06, "loss": 0.6569, "step": 871 }, { "epoch": 0.53, "grad_norm": 2.0731163232163525, "learning_rate": 4.707810146504217e-06, "loss": 0.6277, "step": 872 }, { "epoch": 0.53, "grad_norm": 1.8835943474176664, "learning_rate": 4.697917276454919e-06, "loss": 0.5287, "step": 873 }, { "epoch": 0.53, "grad_norm": 2.0281931145371828, "learning_rate": 4.688025593176713e-06, "loss": 0.5604, "step": 874 }, { "epoch": 0.54, "grad_norm": 1.9088774231682988, "learning_rate": 4.6781351355303555e-06, "loss": 0.554, "step": 875 }, { "epoch": 0.54, "grad_norm": 1.9551048202904684, "learning_rate": 4.668245942371789e-06, "loss": 0.6467, "step": 876 }, { "epoch": 0.54, "grad_norm": 2.067313801101298, "learning_rate": 4.658358052551992e-06, "loss": 0.5992, "step": 877 }, { "epoch": 0.54, "grad_norm": 1.902021092998417, "learning_rate": 4.648471504916815e-06, "loss": 0.5812, "step": 878 }, { "epoch": 0.54, "grad_norm": 1.8922807148527254, "learning_rate": 4.638586338306845e-06, "loss": 0.5374, "step": 879 }, { "epoch": 0.54, "grad_norm": 1.884819760587392, "learning_rate": 4.628702591557237e-06, "loss": 0.5056, "step": 880 }, { "epoch": 0.54, "grad_norm": 1.809064236289934, "learning_rate": 4.61882030349757e-06, "loss": 0.5311, "step": 881 }, { "epoch": 0.54, "grad_norm": 1.939206185133062, "learning_rate": 4.60893951295169e-06, "loss": 0.5821, "step": 882 }, { "epoch": 0.54, "grad_norm": 2.1796594240586518, "learning_rate": 4.599060258737567e-06, "loss": 0.6658, "step": 883 }, { "epoch": 0.54, "grad_norm": 2.103575194199594, "learning_rate": 4.589182579667125e-06, "loss": 0.6145, "step": 884 }, { "epoch": 0.54, "grad_norm": 2.159018112419537, "learning_rate": 4.579306514546107e-06, "loss": 0.6203, "step": 885 }, { "epoch": 0.54, "grad_norm": 1.9460192510920176, "learning_rate": 4.569432102173917e-06, "loss": 0.5578, "step": 886 }, { "epoch": 0.54, "grad_norm": 1.8654041708472648, "learning_rate": 4.559559381343455e-06, "loss": 0.528, "step": 887 }, { "epoch": 0.54, "grad_norm": 1.9680995454358476, "learning_rate": 4.5496883908409905e-06, "loss": 0.6183, "step": 888 }, { "epoch": 0.54, "grad_norm": 1.9853793075023518, "learning_rate": 4.539819169445982e-06, "loss": 0.5658, "step": 889 }, { "epoch": 0.54, "grad_norm": 2.1272265558554695, "learning_rate": 4.529951755930946e-06, "loss": 0.6413, "step": 890 }, { "epoch": 0.54, "grad_norm": 2.0536110493039827, "learning_rate": 4.5200861890612955e-06, "loss": 0.5394, "step": 891 }, { "epoch": 0.55, "grad_norm": 1.9694789258484728, "learning_rate": 4.510222507595185e-06, "loss": 0.5543, "step": 892 }, { "epoch": 0.55, "grad_norm": 2.0637295858493214, "learning_rate": 4.500360750283363e-06, "loss": 0.6254, "step": 893 }, { "epoch": 0.55, "grad_norm": 2.2382680799881762, "learning_rate": 4.490500955869025e-06, "loss": 0.5594, "step": 894 }, { "epoch": 0.55, "grad_norm": 1.9215522609744207, "learning_rate": 4.480643163087644e-06, "loss": 0.5565, "step": 895 }, { "epoch": 0.55, "grad_norm": 1.8992138968072834, "learning_rate": 4.4707874106668406e-06, "loss": 0.5549, "step": 896 }, { "epoch": 0.55, "grad_norm": 2.053529626956222, "learning_rate": 4.460933737326208e-06, "loss": 0.5997, "step": 897 }, { "epoch": 0.55, "grad_norm": 1.9545793745044062, "learning_rate": 4.4510821817771825e-06, "loss": 0.5397, "step": 898 }, { "epoch": 0.55, "grad_norm": 2.085152918955289, "learning_rate": 4.441232782722875e-06, "loss": 0.6005, "step": 899 }, { "epoch": 0.55, "grad_norm": 2.1202774407600926, "learning_rate": 4.431385578857924e-06, "loss": 0.5819, "step": 900 }, { "epoch": 0.55, "grad_norm": 1.8352961154836602, "learning_rate": 4.421540608868344e-06, "loss": 0.5951, "step": 901 }, { "epoch": 0.55, "grad_norm": 2.1495914883931904, "learning_rate": 4.411697911431376e-06, "loss": 0.6428, "step": 902 }, { "epoch": 0.55, "grad_norm": 2.1564746769491876, "learning_rate": 4.4018575252153295e-06, "loss": 0.6402, "step": 903 }, { "epoch": 0.55, "grad_norm": 1.8954514160537663, "learning_rate": 4.392019488879438e-06, "loss": 0.6072, "step": 904 }, { "epoch": 0.55, "grad_norm": 1.8105483820540889, "learning_rate": 4.382183841073698e-06, "loss": 0.5387, "step": 905 }, { "epoch": 0.55, "grad_norm": 1.9485751025827374, "learning_rate": 4.372350620438728e-06, "loss": 0.531, "step": 906 }, { "epoch": 0.55, "grad_norm": 1.9608862157969138, "learning_rate": 4.362519865605608e-06, "loss": 0.5402, "step": 907 }, { "epoch": 0.56, "grad_norm": 1.9691930324266667, "learning_rate": 4.352691615195729e-06, "loss": 0.5624, "step": 908 }, { "epoch": 0.56, "grad_norm": 1.8973081884631189, "learning_rate": 4.342865907820647e-06, "loss": 0.5595, "step": 909 }, { "epoch": 0.56, "grad_norm": 1.9717587970990957, "learning_rate": 4.333042782081926e-06, "loss": 0.662, "step": 910 }, { "epoch": 0.56, "grad_norm": 2.007138023783923, "learning_rate": 4.323222276570984e-06, "loss": 0.5723, "step": 911 }, { "epoch": 0.56, "grad_norm": 2.22307977714878, "learning_rate": 4.313404429868952e-06, "loss": 0.6789, "step": 912 }, { "epoch": 0.56, "grad_norm": 2.190452780908872, "learning_rate": 4.303589280546513e-06, "loss": 0.6045, "step": 913 }, { "epoch": 0.56, "grad_norm": 1.9051078417596634, "learning_rate": 4.293776867163746e-06, "loss": 0.5001, "step": 914 }, { "epoch": 0.56, "grad_norm": 2.098076895433394, "learning_rate": 4.283967228269993e-06, "loss": 0.6982, "step": 915 }, { "epoch": 0.56, "grad_norm": 2.135564782739449, "learning_rate": 4.274160402403689e-06, "loss": 0.6086, "step": 916 }, { "epoch": 0.56, "grad_norm": 2.322595532423094, "learning_rate": 4.264356428092217e-06, "loss": 0.6274, "step": 917 }, { "epoch": 0.56, "grad_norm": 1.9740445661634287, "learning_rate": 4.254555343851762e-06, "loss": 0.6254, "step": 918 }, { "epoch": 0.56, "grad_norm": 2.1042432186823965, "learning_rate": 4.24475718818715e-06, "loss": 0.4925, "step": 919 }, { "epoch": 0.56, "grad_norm": 2.092018762885259, "learning_rate": 4.234961999591706e-06, "loss": 0.638, "step": 920 }, { "epoch": 0.56, "grad_norm": 1.819978568221369, "learning_rate": 4.2251698165470965e-06, "loss": 0.5285, "step": 921 }, { "epoch": 0.56, "grad_norm": 2.0575179276629685, "learning_rate": 4.215380677523179e-06, "loss": 0.5426, "step": 922 }, { "epoch": 0.56, "grad_norm": 1.8926418567324161, "learning_rate": 4.205594620977854e-06, "loss": 0.5378, "step": 923 }, { "epoch": 0.56, "grad_norm": 2.055749823208842, "learning_rate": 4.195811685356914e-06, "loss": 0.5888, "step": 924 }, { "epoch": 0.57, "grad_norm": 2.0051242608320745, "learning_rate": 4.186031909093884e-06, "loss": 0.5652, "step": 925 }, { "epoch": 0.57, "grad_norm": 1.8592910852701108, "learning_rate": 4.176255330609885e-06, "loss": 0.487, "step": 926 }, { "epoch": 0.57, "grad_norm": 2.2411604400071674, "learning_rate": 4.16648198831347e-06, "loss": 0.6867, "step": 927 }, { "epoch": 0.57, "grad_norm": 2.072512037016527, "learning_rate": 4.156711920600479e-06, "loss": 0.6362, "step": 928 }, { "epoch": 0.57, "grad_norm": 1.9250096482155696, "learning_rate": 4.146945165853888e-06, "loss": 0.4271, "step": 929 }, { "epoch": 0.57, "grad_norm": 2.1874561938184898, "learning_rate": 4.137181762443658e-06, "loss": 0.5753, "step": 930 }, { "epoch": 0.57, "grad_norm": 1.8676645313834617, "learning_rate": 4.127421748726583e-06, "loss": 0.5137, "step": 931 }, { "epoch": 0.57, "grad_norm": 2.211228766881823, "learning_rate": 4.117665163046141e-06, "loss": 0.6821, "step": 932 }, { "epoch": 0.57, "grad_norm": 2.095689790428209, "learning_rate": 4.107912043732342e-06, "loss": 0.5183, "step": 933 }, { "epoch": 0.57, "grad_norm": 2.3214361789624944, "learning_rate": 4.098162429101576e-06, "loss": 0.588, "step": 934 }, { "epoch": 0.57, "grad_norm": 1.9276332877997406, "learning_rate": 4.088416357456471e-06, "loss": 0.5425, "step": 935 }, { "epoch": 0.57, "grad_norm": 1.8872033903192418, "learning_rate": 4.0786738670857254e-06, "loss": 0.5275, "step": 936 }, { "epoch": 0.57, "grad_norm": 2.173402848844034, "learning_rate": 4.068934996263978e-06, "loss": 0.6501, "step": 937 }, { "epoch": 0.57, "grad_norm": 2.216415986512064, "learning_rate": 4.059199783251644e-06, "loss": 0.5988, "step": 938 }, { "epoch": 0.57, "grad_norm": 1.8838143839651054, "learning_rate": 4.049468266294765e-06, "loss": 0.6169, "step": 939 }, { "epoch": 0.57, "grad_norm": 2.028659010558423, "learning_rate": 4.039740483624869e-06, "loss": 0.6277, "step": 940 }, { "epoch": 0.58, "grad_norm": 2.0194050107448303, "learning_rate": 4.030016473458805e-06, "loss": 0.6028, "step": 941 }, { "epoch": 0.58, "grad_norm": 2.1557586383454574, "learning_rate": 4.020296273998609e-06, "loss": 0.6176, "step": 942 }, { "epoch": 0.58, "grad_norm": 1.8682573092015993, "learning_rate": 4.010579923431346e-06, "loss": 0.5763, "step": 943 }, { "epoch": 0.58, "grad_norm": 2.1548920939456093, "learning_rate": 4.00086745992895e-06, "loss": 0.6331, "step": 944 }, { "epoch": 0.58, "grad_norm": 2.267061719065842, "learning_rate": 3.991158921648096e-06, "loss": 0.7066, "step": 945 }, { "epoch": 0.58, "grad_norm": 1.9935522203843874, "learning_rate": 3.981454346730036e-06, "loss": 0.5729, "step": 946 }, { "epoch": 0.58, "grad_norm": 1.8026214689706248, "learning_rate": 3.9717537733004415e-06, "loss": 0.5706, "step": 947 }, { "epoch": 0.58, "grad_norm": 1.904682640150856, "learning_rate": 3.9620572394692776e-06, "loss": 0.5683, "step": 948 }, { "epoch": 0.58, "grad_norm": 2.136529894979737, "learning_rate": 3.952364783330632e-06, "loss": 0.651, "step": 949 }, { "epoch": 0.58, "grad_norm": 2.233298404795316, "learning_rate": 3.942676442962569e-06, "loss": 0.5268, "step": 950 }, { "epoch": 0.58, "grad_norm": 2.2946703794023486, "learning_rate": 3.932992256426995e-06, "loss": 0.676, "step": 951 }, { "epoch": 0.58, "grad_norm": 1.9555055432357573, "learning_rate": 3.923312261769485e-06, "loss": 0.598, "step": 952 }, { "epoch": 0.58, "grad_norm": 2.3078155728224212, "learning_rate": 3.913636497019154e-06, "loss": 0.6872, "step": 953 }, { "epoch": 0.58, "grad_norm": 1.8574615796272702, "learning_rate": 3.903965000188495e-06, "loss": 0.5518, "step": 954 }, { "epoch": 0.58, "grad_norm": 2.213864265081535, "learning_rate": 3.894297809273237e-06, "loss": 0.5652, "step": 955 }, { "epoch": 0.58, "grad_norm": 1.8234823525571142, "learning_rate": 3.884634962252189e-06, "loss": 0.4526, "step": 956 }, { "epoch": 0.59, "grad_norm": 1.8652657269096666, "learning_rate": 3.8749764970871e-06, "loss": 0.5418, "step": 957 }, { "epoch": 0.59, "grad_norm": 2.0976734402107224, "learning_rate": 3.8653224517224965e-06, "loss": 0.5637, "step": 958 }, { "epoch": 0.59, "grad_norm": 2.0254191334608826, "learning_rate": 3.855672864085549e-06, "loss": 0.5265, "step": 959 }, { "epoch": 0.59, "grad_norm": 1.8196043256300247, "learning_rate": 3.846027772085912e-06, "loss": 0.5179, "step": 960 }, { "epoch": 0.59, "grad_norm": 1.931679412683687, "learning_rate": 3.836387213615576e-06, "loss": 0.5646, "step": 961 }, { "epoch": 0.59, "grad_norm": 1.9232046934900524, "learning_rate": 3.826751226548725e-06, "loss": 0.4793, "step": 962 }, { "epoch": 0.59, "grad_norm": 1.8622914220495714, "learning_rate": 3.817119848741579e-06, "loss": 0.5253, "step": 963 }, { "epoch": 0.59, "grad_norm": 2.294972552628036, "learning_rate": 3.8074931180322544e-06, "loss": 0.6577, "step": 964 }, { "epoch": 0.59, "grad_norm": 1.9186117148347783, "learning_rate": 3.7978710722406113e-06, "loss": 0.5449, "step": 965 }, { "epoch": 0.59, "grad_norm": 2.1934148088583014, "learning_rate": 3.7882537491680992e-06, "loss": 0.5944, "step": 966 }, { "epoch": 0.59, "grad_norm": 2.0440451523844816, "learning_rate": 3.7786411865976167e-06, "loss": 0.5916, "step": 967 }, { "epoch": 0.59, "grad_norm": 2.18697015319259, "learning_rate": 3.7690334222933654e-06, "loss": 0.5679, "step": 968 }, { "epoch": 0.59, "grad_norm": 2.3700476225659957, "learning_rate": 3.7594304940006846e-06, "loss": 0.7297, "step": 969 }, { "epoch": 0.59, "grad_norm": 2.032165887398491, "learning_rate": 3.7498324394459253e-06, "loss": 0.5391, "step": 970 }, { "epoch": 0.59, "grad_norm": 2.0047253114127006, "learning_rate": 3.7402392963362878e-06, "loss": 0.6912, "step": 971 }, { "epoch": 0.59, "grad_norm": 1.925349824119937, "learning_rate": 3.7306511023596743e-06, "loss": 0.4714, "step": 972 }, { "epoch": 0.59, "grad_norm": 1.8222273181991067, "learning_rate": 3.721067895184549e-06, "loss": 0.5714, "step": 973 }, { "epoch": 0.6, "grad_norm": 1.8642832971234993, "learning_rate": 3.711489712459779e-06, "loss": 0.5697, "step": 974 }, { "epoch": 0.6, "grad_norm": 2.028076998382638, "learning_rate": 3.7019165918144974e-06, "loss": 0.6216, "step": 975 }, { "epoch": 0.6, "grad_norm": 2.272782312304872, "learning_rate": 3.6923485708579487e-06, "loss": 0.4969, "step": 976 }, { "epoch": 0.6, "grad_norm": 2.1112583503988525, "learning_rate": 3.6827856871793393e-06, "loss": 0.5942, "step": 977 }, { "epoch": 0.6, "grad_norm": 2.0753754185170243, "learning_rate": 3.673227978347698e-06, "loss": 0.5954, "step": 978 }, { "epoch": 0.6, "grad_norm": 1.8568456850511224, "learning_rate": 3.6636754819117213e-06, "loss": 0.5574, "step": 979 }, { "epoch": 0.6, "grad_norm": 2.165971383751749, "learning_rate": 3.6541282353996275e-06, "loss": 0.5837, "step": 980 }, { "epoch": 0.6, "grad_norm": 1.9460733583421799, "learning_rate": 3.6445862763190104e-06, "loss": 0.5682, "step": 981 }, { "epoch": 0.6, "grad_norm": 2.1328727066329525, "learning_rate": 3.635049642156692e-06, "loss": 0.6156, "step": 982 }, { "epoch": 0.6, "grad_norm": 2.206066665837199, "learning_rate": 3.6255183703785735e-06, "loss": 0.5946, "step": 983 }, { "epoch": 0.6, "grad_norm": 2.0007589219567854, "learning_rate": 3.615992498429493e-06, "loss": 0.5819, "step": 984 }, { "epoch": 0.6, "grad_norm": 2.038096288010089, "learning_rate": 3.6064720637330673e-06, "loss": 0.5356, "step": 985 }, { "epoch": 0.6, "grad_norm": 1.978722860188176, "learning_rate": 3.5969571036915596e-06, "loss": 0.5895, "step": 986 }, { "epoch": 0.6, "grad_norm": 1.9480013936453797, "learning_rate": 3.587447655685724e-06, "loss": 0.5308, "step": 987 }, { "epoch": 0.6, "grad_norm": 2.1945763438024453, "learning_rate": 3.5779437570746536e-06, "loss": 0.6562, "step": 988 }, { "epoch": 0.6, "grad_norm": 2.048811833634992, "learning_rate": 3.568445445195647e-06, "loss": 0.5449, "step": 989 }, { "epoch": 0.61, "grad_norm": 1.9036038375948279, "learning_rate": 3.5589527573640537e-06, "loss": 0.5552, "step": 990 }, { "epoch": 0.61, "grad_norm": 1.5575371034983034, "learning_rate": 3.549465730873124e-06, "loss": 0.4615, "step": 991 }, { "epoch": 0.61, "grad_norm": 1.981358268031162, "learning_rate": 3.5399844029938724e-06, "loss": 0.5655, "step": 992 }, { "epoch": 0.61, "grad_norm": 1.9338027875151251, "learning_rate": 3.5305088109749196e-06, "loss": 0.4972, "step": 993 }, { "epoch": 0.61, "grad_norm": 1.9485561300488783, "learning_rate": 3.5210389920423582e-06, "loss": 0.5759, "step": 994 }, { "epoch": 0.61, "grad_norm": 1.860842546149849, "learning_rate": 3.511574983399599e-06, "loss": 0.5328, "step": 995 }, { "epoch": 0.61, "grad_norm": 2.1337146376104985, "learning_rate": 3.5021168222272227e-06, "loss": 0.6441, "step": 996 }, { "epoch": 0.61, "grad_norm": 2.041043993151766, "learning_rate": 3.49266454568284e-06, "loss": 0.543, "step": 997 }, { "epoch": 0.61, "grad_norm": 1.9008324125548013, "learning_rate": 3.4832181909009467e-06, "loss": 0.5582, "step": 998 }, { "epoch": 0.61, "grad_norm": 2.0009175266246113, "learning_rate": 3.473777794992765e-06, "loss": 0.5657, "step": 999 }, { "epoch": 0.61, "grad_norm": 2.0488988575321274, "learning_rate": 3.4643433950461175e-06, "loss": 0.5898, "step": 1000 }, { "epoch": 0.61, "grad_norm": 2.164963300418657, "learning_rate": 3.4549150281252635e-06, "loss": 0.5981, "step": 1001 }, { "epoch": 0.61, "grad_norm": 1.9763623413542644, "learning_rate": 3.4454927312707633e-06, "loss": 0.6106, "step": 1002 }, { "epoch": 0.61, "grad_norm": 1.9962996245557485, "learning_rate": 3.43607654149933e-06, "loss": 0.5782, "step": 1003 }, { "epoch": 0.61, "grad_norm": 2.2151735776108, "learning_rate": 3.4266664958036838e-06, "loss": 0.5685, "step": 1004 }, { "epoch": 0.61, "grad_norm": 2.018094880353655, "learning_rate": 3.417262631152409e-06, "loss": 0.528, "step": 1005 }, { "epoch": 0.62, "grad_norm": 1.7630000234879197, "learning_rate": 3.4078649844898045e-06, "loss": 0.5205, "step": 1006 }, { "epoch": 0.62, "grad_norm": 1.7955549765689147, "learning_rate": 3.3984735927357414e-06, "loss": 0.4731, "step": 1007 }, { "epoch": 0.62, "grad_norm": 1.8797578836131676, "learning_rate": 3.3890884927855185e-06, "loss": 0.5603, "step": 1008 }, { "epoch": 0.62, "grad_norm": 1.909711782728961, "learning_rate": 3.3797097215097173e-06, "loss": 0.5129, "step": 1009 }, { "epoch": 0.62, "grad_norm": 1.8389954494108633, "learning_rate": 3.3703373157540525e-06, "loss": 0.5193, "step": 1010 }, { "epoch": 0.62, "grad_norm": 1.8474429582879734, "learning_rate": 3.3609713123392352e-06, "loss": 0.4737, "step": 1011 }, { "epoch": 0.62, "grad_norm": 2.3345478444238354, "learning_rate": 3.3516117480608234e-06, "loss": 0.7071, "step": 1012 }, { "epoch": 0.62, "grad_norm": 2.175175435777652, "learning_rate": 3.3422586596890742e-06, "loss": 0.5722, "step": 1013 }, { "epoch": 0.62, "grad_norm": 2.1305366509524055, "learning_rate": 3.3329120839688102e-06, "loss": 0.6892, "step": 1014 }, { "epoch": 0.62, "grad_norm": 2.0052757824888037, "learning_rate": 3.32357205761926e-06, "loss": 0.5995, "step": 1015 }, { "epoch": 0.62, "grad_norm": 2.0159203979737668, "learning_rate": 3.314238617333928e-06, "loss": 0.6025, "step": 1016 }, { "epoch": 0.62, "grad_norm": 2.012244600426063, "learning_rate": 3.304911799780445e-06, "loss": 0.5673, "step": 1017 }, { "epoch": 0.62, "grad_norm": 1.9361641471312003, "learning_rate": 3.295591641600418e-06, "loss": 0.5838, "step": 1018 }, { "epoch": 0.62, "grad_norm": 1.8304544679156056, "learning_rate": 3.2862781794092964e-06, "loss": 0.5585, "step": 1019 }, { "epoch": 0.62, "grad_norm": 2.149167385207215, "learning_rate": 3.2769714497962235e-06, "loss": 0.5886, "step": 1020 }, { "epoch": 0.62, "grad_norm": 2.04272728052408, "learning_rate": 3.267671489323889e-06, "loss": 0.5355, "step": 1021 }, { "epoch": 0.62, "grad_norm": 1.9965937830873703, "learning_rate": 3.258378334528393e-06, "loss": 0.5976, "step": 1022 }, { "epoch": 0.63, "grad_norm": 2.126700139225219, "learning_rate": 3.249092021919099e-06, "loss": 0.5431, "step": 1023 }, { "epoch": 0.63, "grad_norm": 1.8129449899934444, "learning_rate": 3.239812587978485e-06, "loss": 0.5674, "step": 1024 }, { "epoch": 0.63, "grad_norm": 2.0383597018537865, "learning_rate": 3.2305400691620126e-06, "loss": 0.6182, "step": 1025 }, { "epoch": 0.63, "grad_norm": 2.050473137758968, "learning_rate": 3.221274501897968e-06, "loss": 0.5404, "step": 1026 }, { "epoch": 0.63, "grad_norm": 1.9168650695196385, "learning_rate": 3.212015922587335e-06, "loss": 0.5563, "step": 1027 }, { "epoch": 0.63, "grad_norm": 1.98106980221109, "learning_rate": 3.2027643676036402e-06, "loss": 0.5734, "step": 1028 }, { "epoch": 0.63, "grad_norm": 1.847757494976792, "learning_rate": 3.193519873292815e-06, "loss": 0.5501, "step": 1029 }, { "epoch": 0.63, "grad_norm": 2.0870831968238965, "learning_rate": 3.1842824759730518e-06, "loss": 0.5744, "step": 1030 }, { "epoch": 0.63, "grad_norm": 1.8249978023375093, "learning_rate": 3.1750522119346626e-06, "loss": 0.5438, "step": 1031 }, { "epoch": 0.63, "grad_norm": 1.9231526649033666, "learning_rate": 3.165829117439935e-06, "loss": 0.529, "step": 1032 }, { "epoch": 0.63, "grad_norm": 2.0472491835741873, "learning_rate": 3.1566132287229876e-06, "loss": 0.5332, "step": 1033 }, { "epoch": 0.63, "grad_norm": 2.1920921831656375, "learning_rate": 3.1474045819896374e-06, "loss": 0.5604, "step": 1034 }, { "epoch": 0.63, "grad_norm": 1.9909841140079707, "learning_rate": 3.1382032134172395e-06, "loss": 0.5111, "step": 1035 }, { "epoch": 0.63, "grad_norm": 1.942915636194669, "learning_rate": 3.129009159154567e-06, "loss": 0.5641, "step": 1036 }, { "epoch": 0.63, "grad_norm": 2.11037815260772, "learning_rate": 3.1198224553216472e-06, "loss": 0.593, "step": 1037 }, { "epoch": 0.63, "grad_norm": 2.1107440825628494, "learning_rate": 3.1106431380096374e-06, "loss": 0.5313, "step": 1038 }, { "epoch": 0.64, "grad_norm": 2.0760892689013324, "learning_rate": 3.101471243280677e-06, "loss": 0.5261, "step": 1039 }, { "epoch": 0.64, "grad_norm": 1.9189444275549399, "learning_rate": 3.092306807167738e-06, "loss": 0.5436, "step": 1040 }, { "epoch": 0.64, "grad_norm": 2.0604725562399913, "learning_rate": 3.083149865674496e-06, "loss": 0.6429, "step": 1041 }, { "epoch": 0.64, "grad_norm": 1.8122634340035755, "learning_rate": 3.0740004547751824e-06, "loss": 0.5544, "step": 1042 }, { "epoch": 0.64, "grad_norm": 1.8800400262197519, "learning_rate": 3.0648586104144397e-06, "loss": 0.5622, "step": 1043 }, { "epoch": 0.64, "grad_norm": 2.0644669592689127, "learning_rate": 3.0557243685071874e-06, "loss": 0.6323, "step": 1044 }, { "epoch": 0.64, "grad_norm": 2.127783852090682, "learning_rate": 3.0465977649384813e-06, "loss": 0.6729, "step": 1045 }, { "epoch": 0.64, "grad_norm": 1.7585798179204468, "learning_rate": 3.03747883556336e-06, "loss": 0.5283, "step": 1046 }, { "epoch": 0.64, "grad_norm": 2.122048769461784, "learning_rate": 3.0283676162067234e-06, "loss": 0.6467, "step": 1047 }, { "epoch": 0.64, "grad_norm": 2.0797175331249282, "learning_rate": 3.0192641426631707e-06, "loss": 0.5904, "step": 1048 }, { "epoch": 0.64, "grad_norm": 1.911144315914054, "learning_rate": 3.010168450696879e-06, "loss": 0.5504, "step": 1049 }, { "epoch": 0.64, "grad_norm": 1.733484390437723, "learning_rate": 3.0010805760414544e-06, "loss": 0.4998, "step": 1050 }, { "epoch": 0.64, "grad_norm": 2.102048422163863, "learning_rate": 2.9920005543997847e-06, "loss": 0.5276, "step": 1051 }, { "epoch": 0.64, "grad_norm": 1.9804256500678663, "learning_rate": 2.982928421443914e-06, "loss": 0.4796, "step": 1052 }, { "epoch": 0.64, "grad_norm": 2.058252604257508, "learning_rate": 2.9738642128148887e-06, "loss": 0.5238, "step": 1053 }, { "epoch": 0.64, "grad_norm": 1.9511201686457833, "learning_rate": 2.9648079641226267e-06, "loss": 0.5746, "step": 1054 }, { "epoch": 0.65, "grad_norm": 2.1199181959460947, "learning_rate": 2.955759710945773e-06, "loss": 0.5502, "step": 1055 }, { "epoch": 0.65, "grad_norm": 2.3766960612858234, "learning_rate": 2.946719488831564e-06, "loss": 0.518, "step": 1056 }, { "epoch": 0.65, "grad_norm": 1.960044833847147, "learning_rate": 2.93768733329568e-06, "loss": 0.5366, "step": 1057 }, { "epoch": 0.65, "grad_norm": 2.1052778798118643, "learning_rate": 2.928663279822116e-06, "loss": 0.6107, "step": 1058 }, { "epoch": 0.65, "grad_norm": 1.9902325930624214, "learning_rate": 2.919647363863031e-06, "loss": 0.5625, "step": 1059 }, { "epoch": 0.65, "grad_norm": 1.8937657013896414, "learning_rate": 2.910639620838619e-06, "loss": 0.5431, "step": 1060 }, { "epoch": 0.65, "grad_norm": 2.081716192850459, "learning_rate": 2.901640086136969e-06, "loss": 0.504, "step": 1061 }, { "epoch": 0.65, "grad_norm": 2.1189522676194037, "learning_rate": 2.892648795113912e-06, "loss": 0.6598, "step": 1062 }, { "epoch": 0.65, "grad_norm": 1.907842433021825, "learning_rate": 2.8836657830929048e-06, "loss": 0.5169, "step": 1063 }, { "epoch": 0.65, "grad_norm": 1.9520656325829888, "learning_rate": 2.874691085364868e-06, "loss": 0.5536, "step": 1064 }, { "epoch": 0.65, "grad_norm": 2.072673902863609, "learning_rate": 2.865724737188067e-06, "loss": 0.579, "step": 1065 }, { "epoch": 0.65, "grad_norm": 2.0924189620942775, "learning_rate": 2.856766773787959e-06, "loss": 0.5745, "step": 1066 }, { "epoch": 0.65, "grad_norm": 1.7781952691539604, "learning_rate": 2.847817230357066e-06, "loss": 0.5756, "step": 1067 }, { "epoch": 0.65, "grad_norm": 2.034790297885924, "learning_rate": 2.838876142054825e-06, "loss": 0.4909, "step": 1068 }, { "epoch": 0.65, "grad_norm": 2.17420374619433, "learning_rate": 2.8299435440074596e-06, "loss": 0.5831, "step": 1069 }, { "epoch": 0.65, "grad_norm": 1.9647837412380398, "learning_rate": 2.8210194713078408e-06, "loss": 0.5177, "step": 1070 }, { "epoch": 0.65, "grad_norm": 2.2111415620011017, "learning_rate": 2.81210395901534e-06, "loss": 0.625, "step": 1071 }, { "epoch": 0.66, "grad_norm": 2.2292783394598406, "learning_rate": 2.8031970421557035e-06, "loss": 0.6244, "step": 1072 }, { "epoch": 0.66, "grad_norm": 2.079126116302448, "learning_rate": 2.7942987557209054e-06, "loss": 0.5667, "step": 1073 }, { "epoch": 0.66, "grad_norm": 1.8984066910628055, "learning_rate": 2.785409134669017e-06, "loss": 0.5423, "step": 1074 }, { "epoch": 0.66, "grad_norm": 1.8127916497049823, "learning_rate": 2.776528213924068e-06, "loss": 0.494, "step": 1075 }, { "epoch": 0.66, "grad_norm": 2.06245637765686, "learning_rate": 2.7676560283759013e-06, "loss": 0.5621, "step": 1076 }, { "epoch": 0.66, "grad_norm": 2.114616000228047, "learning_rate": 2.7587926128800503e-06, "loss": 0.582, "step": 1077 }, { "epoch": 0.66, "grad_norm": 1.8380497026526659, "learning_rate": 2.7499380022575862e-06, "loss": 0.5381, "step": 1078 }, { "epoch": 0.66, "grad_norm": 1.898889400333682, "learning_rate": 2.7410922312949955e-06, "loss": 0.543, "step": 1079 }, { "epoch": 0.66, "grad_norm": 2.2504985306779677, "learning_rate": 2.7322553347440368e-06, "loss": 0.5839, "step": 1080 }, { "epoch": 0.66, "grad_norm": 2.202704207784561, "learning_rate": 2.723427347321598e-06, "loss": 0.6228, "step": 1081 }, { "epoch": 0.66, "grad_norm": 2.0144401171881405, "learning_rate": 2.7146083037095726e-06, "loss": 0.5422, "step": 1082 }, { "epoch": 0.66, "grad_norm": 1.736757757636721, "learning_rate": 2.705798238554718e-06, "loss": 0.5307, "step": 1083 }, { "epoch": 0.66, "grad_norm": 1.8565795545311183, "learning_rate": 2.696997186468511e-06, "loss": 0.5413, "step": 1084 }, { "epoch": 0.66, "grad_norm": 2.2977818371394663, "learning_rate": 2.688205182027026e-06, "loss": 0.6052, "step": 1085 }, { "epoch": 0.66, "grad_norm": 1.9308033883834883, "learning_rate": 2.6794222597707937e-06, "loss": 0.5361, "step": 1086 }, { "epoch": 0.66, "grad_norm": 1.9086632142591924, "learning_rate": 2.6706484542046564e-06, "loss": 0.5446, "step": 1087 }, { "epoch": 0.67, "grad_norm": 1.9880975815360458, "learning_rate": 2.6618837997976497e-06, "loss": 0.5471, "step": 1088 }, { "epoch": 0.67, "grad_norm": 2.080617319056223, "learning_rate": 2.6531283309828493e-06, "loss": 0.6338, "step": 1089 }, { "epoch": 0.67, "grad_norm": 1.9076523429048307, "learning_rate": 2.6443820821572496e-06, "loss": 0.5312, "step": 1090 }, { "epoch": 0.67, "grad_norm": 2.0510429772806784, "learning_rate": 2.635645087681623e-06, "loss": 0.6337, "step": 1091 }, { "epoch": 0.67, "grad_norm": 1.8261749722012643, "learning_rate": 2.626917381880381e-06, "loss": 0.4953, "step": 1092 }, { "epoch": 0.67, "grad_norm": 1.9732654093784376, "learning_rate": 2.618198999041447e-06, "loss": 0.5538, "step": 1093 }, { "epoch": 0.67, "grad_norm": 2.012868194526242, "learning_rate": 2.609489973416118e-06, "loss": 0.6014, "step": 1094 }, { "epoch": 0.67, "grad_norm": 2.0313320710830274, "learning_rate": 2.600790339218926e-06, "loss": 0.5784, "step": 1095 }, { "epoch": 0.67, "grad_norm": 2.1398171593710185, "learning_rate": 2.5921001306275116e-06, "loss": 0.5516, "step": 1096 }, { "epoch": 0.67, "grad_norm": 2.120355813263771, "learning_rate": 2.5834193817824865e-06, "loss": 0.5909, "step": 1097 }, { "epoch": 0.67, "grad_norm": 2.030214931297693, "learning_rate": 2.5747481267872925e-06, "loss": 0.5592, "step": 1098 }, { "epoch": 0.67, "grad_norm": 1.9767602094754053, "learning_rate": 2.5660863997080808e-06, "loss": 0.5503, "step": 1099 }, { "epoch": 0.67, "grad_norm": 1.9483830769279278, "learning_rate": 2.557434234573565e-06, "loss": 0.5671, "step": 1100 }, { "epoch": 0.67, "grad_norm": 1.9276456895969436, "learning_rate": 2.548791665374898e-06, "loss": 0.5127, "step": 1101 }, { "epoch": 0.67, "grad_norm": 1.8090439396291618, "learning_rate": 2.540158726065532e-06, "loss": 0.5713, "step": 1102 }, { "epoch": 0.67, "grad_norm": 2.320785088443513, "learning_rate": 2.5315354505610847e-06, "loss": 0.6488, "step": 1103 }, { "epoch": 0.68, "grad_norm": 1.9982757056307234, "learning_rate": 2.522921872739211e-06, "loss": 0.5425, "step": 1104 }, { "epoch": 0.68, "grad_norm": 1.9334400895537176, "learning_rate": 2.514318026439469e-06, "loss": 0.6033, "step": 1105 }, { "epoch": 0.68, "grad_norm": 2.027430128222646, "learning_rate": 2.50572394546318e-06, "loss": 0.5551, "step": 1106 }, { "epoch": 0.68, "grad_norm": 2.212527434238601, "learning_rate": 2.4971396635733043e-06, "loss": 0.6576, "step": 1107 }, { "epoch": 0.68, "grad_norm": 2.047142954880681, "learning_rate": 2.488565214494307e-06, "loss": 0.6133, "step": 1108 }, { "epoch": 0.68, "grad_norm": 1.855345806040437, "learning_rate": 2.480000631912018e-06, "loss": 0.5198, "step": 1109 }, { "epoch": 0.68, "grad_norm": 2.1075238338772895, "learning_rate": 2.471445949473512e-06, "loss": 0.5667, "step": 1110 }, { "epoch": 0.68, "grad_norm": 1.9673428615144855, "learning_rate": 2.4629012007869634e-06, "loss": 0.5715, "step": 1111 }, { "epoch": 0.68, "grad_norm": 2.2624688260095382, "learning_rate": 2.4543664194215272e-06, "loss": 0.7673, "step": 1112 }, { "epoch": 0.68, "grad_norm": 2.0285948397058626, "learning_rate": 2.445841638907194e-06, "loss": 0.5768, "step": 1113 }, { "epoch": 0.68, "grad_norm": 2.088805134401383, "learning_rate": 2.4373268927346678e-06, "loss": 0.5607, "step": 1114 }, { "epoch": 0.68, "grad_norm": 1.7787754535359048, "learning_rate": 2.428822214355235e-06, "loss": 0.5723, "step": 1115 }, { "epoch": 0.68, "grad_norm": 2.149439034712146, "learning_rate": 2.4203276371806206e-06, "loss": 0.6358, "step": 1116 }, { "epoch": 0.68, "grad_norm": 1.8509674900388537, "learning_rate": 2.4118431945828757e-06, "loss": 0.5393, "step": 1117 }, { "epoch": 0.68, "grad_norm": 1.8927674268591554, "learning_rate": 2.4033689198942272e-06, "loss": 0.5846, "step": 1118 }, { "epoch": 0.68, "grad_norm": 1.9917147049157173, "learning_rate": 2.394904846406964e-06, "loss": 0.6189, "step": 1119 }, { "epoch": 0.68, "grad_norm": 1.9129347000522996, "learning_rate": 2.3864510073732914e-06, "loss": 0.5045, "step": 1120 }, { "epoch": 0.69, "grad_norm": 2.050425773481486, "learning_rate": 2.378007436005214e-06, "loss": 0.5873, "step": 1121 }, { "epoch": 0.69, "grad_norm": 1.9719631201639327, "learning_rate": 2.3695741654743913e-06, "loss": 0.5375, "step": 1122 }, { "epoch": 0.69, "grad_norm": 2.0214460337530946, "learning_rate": 2.3611512289120208e-06, "loss": 0.5548, "step": 1123 }, { "epoch": 0.69, "grad_norm": 1.8042574192263385, "learning_rate": 2.3527386594087003e-06, "loss": 0.5189, "step": 1124 }, { "epoch": 0.69, "grad_norm": 1.9600273923574374, "learning_rate": 2.344336490014295e-06, "loss": 0.5378, "step": 1125 }, { "epoch": 0.69, "grad_norm": 1.9432405793477807, "learning_rate": 2.3359447537378173e-06, "loss": 0.5354, "step": 1126 }, { "epoch": 0.69, "grad_norm": 2.04898553718917, "learning_rate": 2.3275634835472914e-06, "loss": 0.6216, "step": 1127 }, { "epoch": 0.69, "grad_norm": 1.9391818082210701, "learning_rate": 2.3191927123696185e-06, "loss": 0.5523, "step": 1128 }, { "epoch": 0.69, "grad_norm": 1.9965210994972027, "learning_rate": 2.3108324730904584e-06, "loss": 0.5929, "step": 1129 }, { "epoch": 0.69, "grad_norm": 1.8480997829156387, "learning_rate": 2.302482798554096e-06, "loss": 0.5467, "step": 1130 }, { "epoch": 0.69, "grad_norm": 1.8430449159542786, "learning_rate": 2.2941437215633043e-06, "loss": 0.5267, "step": 1131 }, { "epoch": 0.69, "grad_norm": 2.0808145765908543, "learning_rate": 2.2858152748792316e-06, "loss": 0.6113, "step": 1132 }, { "epoch": 0.69, "grad_norm": 1.8218623081262797, "learning_rate": 2.277497491221255e-06, "loss": 0.4938, "step": 1133 }, { "epoch": 0.69, "grad_norm": 1.9041064505829883, "learning_rate": 2.269190403266866e-06, "loss": 0.5633, "step": 1134 }, { "epoch": 0.69, "grad_norm": 1.652734879699611, "learning_rate": 2.260894043651537e-06, "loss": 0.5735, "step": 1135 }, { "epoch": 0.69, "grad_norm": 2.2538231832723445, "learning_rate": 2.2526084449685876e-06, "loss": 0.6128, "step": 1136 }, { "epoch": 0.7, "grad_norm": 1.7960651904147913, "learning_rate": 2.244333639769066e-06, "loss": 0.4856, "step": 1137 }, { "epoch": 0.7, "grad_norm": 1.9343813745783291, "learning_rate": 2.236069660561619e-06, "loss": 0.5552, "step": 1138 }, { "epoch": 0.7, "grad_norm": 2.0101528659989025, "learning_rate": 2.2278165398123538e-06, "loss": 0.5589, "step": 1139 }, { "epoch": 0.7, "grad_norm": 2.1101574118483826, "learning_rate": 2.2195743099447257e-06, "loss": 0.5837, "step": 1140 }, { "epoch": 0.7, "grad_norm": 1.7157523251139841, "learning_rate": 2.211343003339405e-06, "loss": 0.4769, "step": 1141 }, { "epoch": 0.7, "grad_norm": 1.7800599491375297, "learning_rate": 2.203122652334141e-06, "loss": 0.5251, "step": 1142 }, { "epoch": 0.7, "grad_norm": 1.9387781774592656, "learning_rate": 2.1949132892236495e-06, "loss": 0.5669, "step": 1143 }, { "epoch": 0.7, "grad_norm": 1.9580346237978052, "learning_rate": 2.1867149462594745e-06, "loss": 0.6192, "step": 1144 }, { "epoch": 0.7, "grad_norm": 1.8050769261944541, "learning_rate": 2.178527655649868e-06, "loss": 0.5353, "step": 1145 }, { "epoch": 0.7, "grad_norm": 2.028704396965185, "learning_rate": 2.1703514495596643e-06, "loss": 0.565, "step": 1146 }, { "epoch": 0.7, "grad_norm": 1.8852797512232322, "learning_rate": 2.1621863601101434e-06, "loss": 0.4691, "step": 1147 }, { "epoch": 0.7, "grad_norm": 2.0307600694751566, "learning_rate": 2.1540324193789177e-06, "loss": 0.6075, "step": 1148 }, { "epoch": 0.7, "grad_norm": 2.0635630868289274, "learning_rate": 2.145889659399801e-06, "loss": 0.5713, "step": 1149 }, { "epoch": 0.7, "grad_norm": 1.8359324107226915, "learning_rate": 2.137758112162678e-06, "loss": 0.5419, "step": 1150 }, { "epoch": 0.7, "grad_norm": 1.9327432699880955, "learning_rate": 2.1296378096133863e-06, "loss": 0.5219, "step": 1151 }, { "epoch": 0.7, "grad_norm": 1.9646016130657808, "learning_rate": 2.1215287836535836e-06, "loss": 0.5865, "step": 1152 }, { "epoch": 0.7, "grad_norm": 2.053237316493875, "learning_rate": 2.1134310661406293e-06, "loss": 0.5495, "step": 1153 }, { "epoch": 0.71, "grad_norm": 1.9224877080515406, "learning_rate": 2.1053446888874575e-06, "loss": 0.57, "step": 1154 }, { "epoch": 0.71, "grad_norm": 2.0094429579360296, "learning_rate": 2.097269683662444e-06, "loss": 0.5966, "step": 1155 }, { "epoch": 0.71, "grad_norm": 2.214480865585653, "learning_rate": 2.089206082189294e-06, "loss": 0.6409, "step": 1156 }, { "epoch": 0.71, "grad_norm": 2.0300256495896516, "learning_rate": 2.0811539161469126e-06, "loss": 0.5318, "step": 1157 }, { "epoch": 0.71, "grad_norm": 1.9501820223073412, "learning_rate": 2.073113217169272e-06, "loss": 0.5289, "step": 1158 }, { "epoch": 0.71, "grad_norm": 2.073955162988734, "learning_rate": 2.065084016845301e-06, "loss": 0.6114, "step": 1159 }, { "epoch": 0.71, "grad_norm": 2.2617011183013505, "learning_rate": 2.0570663467187556e-06, "loss": 0.692, "step": 1160 }, { "epoch": 0.71, "grad_norm": 1.9412567792801219, "learning_rate": 2.049060238288086e-06, "loss": 0.5781, "step": 1161 }, { "epoch": 0.71, "grad_norm": 1.9161885318665781, "learning_rate": 2.0410657230063304e-06, "loss": 0.4698, "step": 1162 }, { "epoch": 0.71, "grad_norm": 1.9595900836322337, "learning_rate": 2.0330828322809727e-06, "loss": 0.5868, "step": 1163 }, { "epoch": 0.71, "grad_norm": 1.860761169455946, "learning_rate": 2.025111597473836e-06, "loss": 0.5014, "step": 1164 }, { "epoch": 0.71, "grad_norm": 1.9607972138295733, "learning_rate": 2.0171520499009457e-06, "loss": 0.5398, "step": 1165 }, { "epoch": 0.71, "grad_norm": 1.8911857797620755, "learning_rate": 2.009204220832418e-06, "loss": 0.5382, "step": 1166 }, { "epoch": 0.71, "grad_norm": 2.149350473430931, "learning_rate": 2.0012681414923254e-06, "loss": 0.5554, "step": 1167 }, { "epoch": 0.71, "grad_norm": 1.8778243908792742, "learning_rate": 1.993343843058585e-06, "loss": 0.5085, "step": 1168 }, { "epoch": 0.71, "grad_norm": 1.9294134892146952, "learning_rate": 1.9854313566628273e-06, "loss": 0.5678, "step": 1169 }, { "epoch": 0.72, "grad_norm": 1.9503939087047788, "learning_rate": 1.977530713390281e-06, "loss": 0.5656, "step": 1170 }, { "epoch": 0.72, "grad_norm": 2.092855618928881, "learning_rate": 1.9696419442796474e-06, "loss": 0.5589, "step": 1171 }, { "epoch": 0.72, "grad_norm": 2.0558632332328366, "learning_rate": 1.9617650803229736e-06, "loss": 0.565, "step": 1172 }, { "epoch": 0.72, "grad_norm": 1.8074897396978709, "learning_rate": 1.953900152465544e-06, "loss": 0.5278, "step": 1173 }, { "epoch": 0.72, "grad_norm": 1.9979141137083714, "learning_rate": 1.9460471916057415e-06, "loss": 0.542, "step": 1174 }, { "epoch": 0.72, "grad_norm": 2.031624720114998, "learning_rate": 1.9382062285949416e-06, "loss": 0.4827, "step": 1175 }, { "epoch": 0.72, "grad_norm": 2.109236104508361, "learning_rate": 1.9303772942373846e-06, "loss": 0.5567, "step": 1176 }, { "epoch": 0.72, "grad_norm": 2.2956266675459576, "learning_rate": 1.9225604192900488e-06, "loss": 0.6067, "step": 1177 }, { "epoch": 0.72, "grad_norm": 1.9743192317750047, "learning_rate": 1.914755634462542e-06, "loss": 0.4976, "step": 1178 }, { "epoch": 0.72, "grad_norm": 1.8028667562352623, "learning_rate": 1.9069629704169723e-06, "loss": 0.509, "step": 1179 }, { "epoch": 0.72, "grad_norm": 1.7362668980411815, "learning_rate": 1.8991824577678269e-06, "loss": 0.5544, "step": 1180 }, { "epoch": 0.72, "grad_norm": 2.0285534879494715, "learning_rate": 1.8914141270818593e-06, "loss": 0.4984, "step": 1181 }, { "epoch": 0.72, "grad_norm": 2.046266381772136, "learning_rate": 1.8836580088779628e-06, "loss": 0.59, "step": 1182 }, { "epoch": 0.72, "grad_norm": 1.9988343414891951, "learning_rate": 1.8759141336270486e-06, "loss": 0.5491, "step": 1183 }, { "epoch": 0.72, "grad_norm": 2.0240438299435968, "learning_rate": 1.868182531751938e-06, "loss": 0.5816, "step": 1184 }, { "epoch": 0.72, "grad_norm": 2.1594937237415226, "learning_rate": 1.8604632336272249e-06, "loss": 0.5865, "step": 1185 }, { "epoch": 0.73, "grad_norm": 2.0666115681553032, "learning_rate": 1.8527562695791746e-06, "loss": 0.5231, "step": 1186 }, { "epoch": 0.73, "grad_norm": 2.0866219869375064, "learning_rate": 1.8450616698855938e-06, "loss": 0.5465, "step": 1187 }, { "epoch": 0.73, "grad_norm": 1.9640838158081952, "learning_rate": 1.8373794647757105e-06, "loss": 0.5484, "step": 1188 }, { "epoch": 0.73, "grad_norm": 1.8827641315498593, "learning_rate": 1.8297096844300638e-06, "loss": 0.5447, "step": 1189 }, { "epoch": 0.73, "grad_norm": 1.9270241151498686, "learning_rate": 1.8220523589803808e-06, "loss": 0.5148, "step": 1190 }, { "epoch": 0.73, "grad_norm": 1.8983158181788506, "learning_rate": 1.8144075185094523e-06, "loss": 0.5089, "step": 1191 }, { "epoch": 0.73, "grad_norm": 1.9391982904773535, "learning_rate": 1.8067751930510258e-06, "loss": 0.6062, "step": 1192 }, { "epoch": 0.73, "grad_norm": 1.8572456201220782, "learning_rate": 1.799155412589681e-06, "loss": 0.4707, "step": 1193 }, { "epoch": 0.73, "grad_norm": 2.180002124115602, "learning_rate": 1.7915482070607094e-06, "loss": 0.597, "step": 1194 }, { "epoch": 0.73, "grad_norm": 1.8462663478460364, "learning_rate": 1.783953606350005e-06, "loss": 0.5577, "step": 1195 }, { "epoch": 0.73, "grad_norm": 1.889638886458579, "learning_rate": 1.7763716402939385e-06, "loss": 0.519, "step": 1196 }, { "epoch": 0.73, "grad_norm": 1.726660341693367, "learning_rate": 1.7688023386792452e-06, "loss": 0.4718, "step": 1197 }, { "epoch": 0.73, "grad_norm": 2.2128024274172655, "learning_rate": 1.7612457312429093e-06, "loss": 0.6105, "step": 1198 }, { "epoch": 0.73, "grad_norm": 1.8977444616718881, "learning_rate": 1.7537018476720369e-06, "loss": 0.5442, "step": 1199 }, { "epoch": 0.73, "grad_norm": 1.7605812625109447, "learning_rate": 1.7461707176037546e-06, "loss": 0.4897, "step": 1200 }, { "epoch": 0.73, "grad_norm": 2.1962948774783655, "learning_rate": 1.738652370625082e-06, "loss": 0.5795, "step": 1201 }, { "epoch": 0.73, "grad_norm": 1.8753753975602796, "learning_rate": 1.7311468362728163e-06, "loss": 0.5267, "step": 1202 }, { "epoch": 0.74, "grad_norm": 2.2401308043999877, "learning_rate": 1.723654144033422e-06, "loss": 0.5422, "step": 1203 }, { "epoch": 0.74, "grad_norm": 2.360182834014352, "learning_rate": 1.7161743233429123e-06, "loss": 0.5932, "step": 1204 }, { "epoch": 0.74, "grad_norm": 2.1509906603775675, "learning_rate": 1.7087074035867284e-06, "loss": 0.5336, "step": 1205 }, { "epoch": 0.74, "grad_norm": 2.2167606816262215, "learning_rate": 1.7012534140996351e-06, "loss": 0.6204, "step": 1206 }, { "epoch": 0.74, "grad_norm": 2.2516401026227193, "learning_rate": 1.69381238416559e-06, "loss": 0.6229, "step": 1207 }, { "epoch": 0.74, "grad_norm": 2.040572811584935, "learning_rate": 1.6863843430176464e-06, "loss": 0.5554, "step": 1208 }, { "epoch": 0.74, "grad_norm": 2.001470306966103, "learning_rate": 1.6789693198378254e-06, "loss": 0.5494, "step": 1209 }, { "epoch": 0.74, "grad_norm": 2.0804007666629434, "learning_rate": 1.6715673437570035e-06, "loss": 0.6031, "step": 1210 }, { "epoch": 0.74, "grad_norm": 2.017960337685253, "learning_rate": 1.6641784438548048e-06, "loss": 0.5567, "step": 1211 }, { "epoch": 0.74, "grad_norm": 2.084312076747243, "learning_rate": 1.6568026491594763e-06, "loss": 0.5529, "step": 1212 }, { "epoch": 0.74, "grad_norm": 2.016310638065491, "learning_rate": 1.6494399886477859e-06, "loss": 0.5525, "step": 1213 }, { "epoch": 0.74, "grad_norm": 1.8351394186785017, "learning_rate": 1.6420904912448942e-06, "loss": 0.5631, "step": 1214 }, { "epoch": 0.74, "grad_norm": 1.978522191746191, "learning_rate": 1.634754185824256e-06, "loss": 0.5075, "step": 1215 }, { "epoch": 0.74, "grad_norm": 2.223631750342603, "learning_rate": 1.6274311012074984e-06, "loss": 0.6659, "step": 1216 }, { "epoch": 0.74, "grad_norm": 1.9821415990981424, "learning_rate": 1.6201212661643045e-06, "loss": 0.5744, "step": 1217 }, { "epoch": 0.74, "grad_norm": 1.9744423033984106, "learning_rate": 1.61282470941231e-06, "loss": 0.6117, "step": 1218 }, { "epoch": 0.75, "grad_norm": 2.065211898764052, "learning_rate": 1.6055414596169806e-06, "loss": 0.5691, "step": 1219 }, { "epoch": 0.75, "grad_norm": 1.98605861724129, "learning_rate": 1.5982715453915082e-06, "loss": 0.4985, "step": 1220 }, { "epoch": 0.75, "grad_norm": 2.04457416568264, "learning_rate": 1.5910149952966898e-06, "loss": 0.5538, "step": 1221 }, { "epoch": 0.75, "grad_norm": 2.0121702166230895, "learning_rate": 1.583771837840823e-06, "loss": 0.5658, "step": 1222 }, { "epoch": 0.75, "grad_norm": 2.0092606234614694, "learning_rate": 1.5765421014795911e-06, "loss": 0.5113, "step": 1223 }, { "epoch": 0.75, "grad_norm": 1.8935411378036877, "learning_rate": 1.569325814615947e-06, "loss": 0.507, "step": 1224 }, { "epoch": 0.75, "grad_norm": 2.34585820804892, "learning_rate": 1.562123005600009e-06, "loss": 0.5769, "step": 1225 }, { "epoch": 0.75, "grad_norm": 2.1528289643549234, "learning_rate": 1.5549337027289468e-06, "loss": 0.5501, "step": 1226 }, { "epoch": 0.75, "grad_norm": 1.8301608372784568, "learning_rate": 1.5477579342468634e-06, "loss": 0.5208, "step": 1227 }, { "epoch": 0.75, "grad_norm": 2.176141189999809, "learning_rate": 1.5405957283446987e-06, "loss": 0.6609, "step": 1228 }, { "epoch": 0.75, "grad_norm": 2.0473489032027565, "learning_rate": 1.5334471131601025e-06, "loss": 0.5715, "step": 1229 }, { "epoch": 0.75, "grad_norm": 2.1029826913605647, "learning_rate": 1.526312116777336e-06, "loss": 0.4786, "step": 1230 }, { "epoch": 0.75, "grad_norm": 1.9871418869111652, "learning_rate": 1.5191907672271582e-06, "loss": 0.4602, "step": 1231 }, { "epoch": 0.75, "grad_norm": 1.9159131882394276, "learning_rate": 1.5120830924867098e-06, "loss": 0.508, "step": 1232 }, { "epoch": 0.75, "grad_norm": 2.282250060067885, "learning_rate": 1.5049891204794125e-06, "loss": 0.5567, "step": 1233 }, { "epoch": 0.75, "grad_norm": 1.9230524058397154, "learning_rate": 1.4979088790748553e-06, "loss": 0.5514, "step": 1234 }, { "epoch": 0.76, "grad_norm": 2.036428797678635, "learning_rate": 1.4908423960886808e-06, "loss": 0.5909, "step": 1235 }, { "epoch": 0.76, "grad_norm": 2.1397315181807506, "learning_rate": 1.4837896992824835e-06, "loss": 0.6168, "step": 1236 }, { "epoch": 0.76, "grad_norm": 2.197275580787461, "learning_rate": 1.4767508163636968e-06, "loss": 0.5636, "step": 1237 }, { "epoch": 0.76, "grad_norm": 1.9450184678272302, "learning_rate": 1.4697257749854815e-06, "loss": 0.5576, "step": 1238 }, { "epoch": 0.76, "grad_norm": 1.708185896072239, "learning_rate": 1.4627146027466248e-06, "loss": 0.5048, "step": 1239 }, { "epoch": 0.76, "grad_norm": 1.931851292310904, "learning_rate": 1.4557173271914216e-06, "loss": 0.6003, "step": 1240 }, { "epoch": 0.76, "grad_norm": 1.7739213428365466, "learning_rate": 1.4487339758095758e-06, "loss": 0.4847, "step": 1241 }, { "epoch": 0.76, "grad_norm": 2.065334796741025, "learning_rate": 1.4417645760360899e-06, "loss": 0.4995, "step": 1242 }, { "epoch": 0.76, "grad_norm": 1.9060500200669357, "learning_rate": 1.4348091552511496e-06, "loss": 0.4772, "step": 1243 }, { "epoch": 0.76, "grad_norm": 1.8926362460364048, "learning_rate": 1.427867740780028e-06, "loss": 0.4678, "step": 1244 }, { "epoch": 0.76, "grad_norm": 2.230768332923383, "learning_rate": 1.4209403598929711e-06, "loss": 0.5556, "step": 1245 }, { "epoch": 0.76, "grad_norm": 2.2444577467361078, "learning_rate": 1.4140270398050899e-06, "loss": 0.6313, "step": 1246 }, { "epoch": 0.76, "grad_norm": 2.0449648982479385, "learning_rate": 1.407127807676259e-06, "loss": 0.5457, "step": 1247 }, { "epoch": 0.76, "grad_norm": 2.0517370766830707, "learning_rate": 1.4002426906110034e-06, "loss": 0.539, "step": 1248 }, { "epoch": 0.76, "grad_norm": 2.0847967027851375, "learning_rate": 1.3933717156583975e-06, "loss": 0.5256, "step": 1249 }, { "epoch": 0.76, "grad_norm": 2.1822442085414386, "learning_rate": 1.386514909811958e-06, "loss": 0.5648, "step": 1250 }, { "epoch": 0.76, "grad_norm": 2.01409537100995, "learning_rate": 1.3796723000095312e-06, "loss": 0.5878, "step": 1251 }, { "epoch": 0.77, "grad_norm": 2.2139379142084357, "learning_rate": 1.3728439131331972e-06, "loss": 0.5724, "step": 1252 }, { "epoch": 0.77, "grad_norm": 2.022116537133516, "learning_rate": 1.366029776009159e-06, "loss": 0.5686, "step": 1253 }, { "epoch": 0.77, "grad_norm": 2.3703607571589265, "learning_rate": 1.3592299154076344e-06, "loss": 0.676, "step": 1254 }, { "epoch": 0.77, "grad_norm": 2.1435846797265317, "learning_rate": 1.3524443580427565e-06, "loss": 0.6176, "step": 1255 }, { "epoch": 0.77, "grad_norm": 1.9476892823094056, "learning_rate": 1.3456731305724685e-06, "loss": 0.5245, "step": 1256 }, { "epoch": 0.77, "grad_norm": 1.7780892199670588, "learning_rate": 1.3389162595984106e-06, "loss": 0.4913, "step": 1257 }, { "epoch": 0.77, "grad_norm": 2.147087917412656, "learning_rate": 1.3321737716658284e-06, "loss": 0.5712, "step": 1258 }, { "epoch": 0.77, "grad_norm": 1.9396671940173766, "learning_rate": 1.3254456932634557e-06, "loss": 0.5236, "step": 1259 }, { "epoch": 0.77, "grad_norm": 2.105081861677922, "learning_rate": 1.3187320508234208e-06, "loss": 0.528, "step": 1260 }, { "epoch": 0.77, "grad_norm": 2.119521541148906, "learning_rate": 1.3120328707211394e-06, "loss": 0.5511, "step": 1261 }, { "epoch": 0.77, "grad_norm": 1.9411630633184176, "learning_rate": 1.3053481792752044e-06, "loss": 0.5692, "step": 1262 }, { "epoch": 0.77, "grad_norm": 2.13953797890098, "learning_rate": 1.298678002747294e-06, "loss": 0.6083, "step": 1263 }, { "epoch": 0.77, "grad_norm": 1.696024362037954, "learning_rate": 1.2920223673420584e-06, "loss": 0.4515, "step": 1264 }, { "epoch": 0.77, "grad_norm": 1.8867892952085517, "learning_rate": 1.285381299207026e-06, "loss": 0.5367, "step": 1265 }, { "epoch": 0.77, "grad_norm": 2.052265073795798, "learning_rate": 1.2787548244324888e-06, "loss": 0.6345, "step": 1266 }, { "epoch": 0.77, "grad_norm": 1.9697590487372725, "learning_rate": 1.2721429690514142e-06, "loss": 0.5131, "step": 1267 }, { "epoch": 0.78, "grad_norm": 1.950928985112725, "learning_rate": 1.26554575903933e-06, "loss": 0.5065, "step": 1268 }, { "epoch": 0.78, "grad_norm": 2.040285812654646, "learning_rate": 1.2589632203142316e-06, "loss": 0.6118, "step": 1269 }, { "epoch": 0.78, "grad_norm": 2.0858975965094095, "learning_rate": 1.2523953787364723e-06, "loss": 0.5986, "step": 1270 }, { "epoch": 0.78, "grad_norm": 1.96463477151841, "learning_rate": 1.24584226010867e-06, "loss": 0.5598, "step": 1271 }, { "epoch": 0.78, "grad_norm": 2.1145952141868434, "learning_rate": 1.2393038901756e-06, "loss": 0.5922, "step": 1272 }, { "epoch": 0.78, "grad_norm": 1.8968425608898443, "learning_rate": 1.232780294624093e-06, "loss": 0.5095, "step": 1273 }, { "epoch": 0.78, "grad_norm": 2.0301980708684866, "learning_rate": 1.22627149908294e-06, "loss": 0.5498, "step": 1274 }, { "epoch": 0.78, "grad_norm": 1.8270409790198994, "learning_rate": 1.2197775291227887e-06, "loss": 0.4714, "step": 1275 }, { "epoch": 0.78, "grad_norm": 2.0963357615007703, "learning_rate": 1.2132984102560374e-06, "loss": 0.6149, "step": 1276 }, { "epoch": 0.78, "grad_norm": 1.9801877339079816, "learning_rate": 1.2068341679367452e-06, "loss": 0.5337, "step": 1277 }, { "epoch": 0.78, "grad_norm": 1.980282736079062, "learning_rate": 1.2003848275605263e-06, "loss": 0.5857, "step": 1278 }, { "epoch": 0.78, "grad_norm": 1.835028289058003, "learning_rate": 1.1939504144644464e-06, "loss": 0.5959, "step": 1279 }, { "epoch": 0.78, "grad_norm": 2.120379341814735, "learning_rate": 1.1875309539269332e-06, "loss": 0.5015, "step": 1280 }, { "epoch": 0.78, "grad_norm": 1.93303989363275, "learning_rate": 1.1811264711676661e-06, "loss": 0.5125, "step": 1281 }, { "epoch": 0.78, "grad_norm": 1.973773020853456, "learning_rate": 1.1747369913474866e-06, "loss": 0.5864, "step": 1282 }, { "epoch": 0.78, "grad_norm": 2.0447822785351994, "learning_rate": 1.1683625395682935e-06, "loss": 0.572, "step": 1283 }, { "epoch": 0.79, "grad_norm": 1.9251270954167108, "learning_rate": 1.1620031408729443e-06, "loss": 0.5745, "step": 1284 }, { "epoch": 0.79, "grad_norm": 1.881635576782646, "learning_rate": 1.1556588202451613e-06, "loss": 0.4638, "step": 1285 }, { "epoch": 0.79, "grad_norm": 1.7896548110439616, "learning_rate": 1.1493296026094302e-06, "loss": 0.5252, "step": 1286 }, { "epoch": 0.79, "grad_norm": 1.9326908202122164, "learning_rate": 1.1430155128309e-06, "loss": 0.4933, "step": 1287 }, { "epoch": 0.79, "grad_norm": 1.9844579045400434, "learning_rate": 1.1367165757152905e-06, "loss": 0.5393, "step": 1288 }, { "epoch": 0.79, "grad_norm": 2.0830711516317573, "learning_rate": 1.1304328160087935e-06, "loss": 0.6165, "step": 1289 }, { "epoch": 0.79, "grad_norm": 2.0820587210774875, "learning_rate": 1.12416425839797e-06, "loss": 0.5735, "step": 1290 }, { "epoch": 0.79, "grad_norm": 1.9942738518131777, "learning_rate": 1.1179109275096628e-06, "loss": 0.5331, "step": 1291 }, { "epoch": 0.79, "grad_norm": 2.09146135494079, "learning_rate": 1.1116728479108884e-06, "loss": 0.4912, "step": 1292 }, { "epoch": 0.79, "grad_norm": 2.0181223861753685, "learning_rate": 1.105450044108753e-06, "loss": 0.5767, "step": 1293 }, { "epoch": 0.79, "grad_norm": 1.7770312565830746, "learning_rate": 1.099242540550347e-06, "loss": 0.5222, "step": 1294 }, { "epoch": 0.79, "grad_norm": 1.9504760795862741, "learning_rate": 1.0930503616226495e-06, "loss": 0.605, "step": 1295 }, { "epoch": 0.79, "grad_norm": 1.8039286890109292, "learning_rate": 1.0868735316524387e-06, "loss": 0.439, "step": 1296 }, { "epoch": 0.79, "grad_norm": 1.8781587995004858, "learning_rate": 1.0807120749061923e-06, "loss": 0.4785, "step": 1297 }, { "epoch": 0.79, "grad_norm": 2.1386175502459466, "learning_rate": 1.0745660155899878e-06, "loss": 0.6047, "step": 1298 }, { "epoch": 0.79, "grad_norm": 2.0105202575538255, "learning_rate": 1.0684353778494166e-06, "loss": 0.6412, "step": 1299 }, { "epoch": 0.79, "grad_norm": 2.110747721557068, "learning_rate": 1.0623201857694837e-06, "loss": 0.5084, "step": 1300 }, { "epoch": 0.8, "grad_norm": 2.0266382461097576, "learning_rate": 1.056220463374511e-06, "loss": 0.5513, "step": 1301 }, { "epoch": 0.8, "grad_norm": 1.941131403268958, "learning_rate": 1.0501362346280492e-06, "loss": 0.5362, "step": 1302 }, { "epoch": 0.8, "grad_norm": 2.179036766615289, "learning_rate": 1.0440675234327774e-06, "loss": 0.5566, "step": 1303 }, { "epoch": 0.8, "grad_norm": 1.9388634477990079, "learning_rate": 1.0380143536304133e-06, "loss": 0.5316, "step": 1304 }, { "epoch": 0.8, "grad_norm": 1.9303281784663635, "learning_rate": 1.0319767490016196e-06, "loss": 0.5194, "step": 1305 }, { "epoch": 0.8, "grad_norm": 2.3880703426336356, "learning_rate": 1.0259547332659065e-06, "loss": 0.7486, "step": 1306 }, { "epoch": 0.8, "grad_norm": 1.9926065703927103, "learning_rate": 1.0199483300815421e-06, "loss": 0.527, "step": 1307 }, { "epoch": 0.8, "grad_norm": 1.9341158574219506, "learning_rate": 1.0139575630454618e-06, "loss": 0.5403, "step": 1308 }, { "epoch": 0.8, "grad_norm": 1.8567155488677838, "learning_rate": 1.0079824556931655e-06, "loss": 0.548, "step": 1309 }, { "epoch": 0.8, "grad_norm": 2.0404242620951436, "learning_rate": 1.0020230314986395e-06, "loss": 0.498, "step": 1310 }, { "epoch": 0.8, "grad_norm": 2.0107577837327457, "learning_rate": 9.960793138742503e-07, "loss": 0.58, "step": 1311 }, { "epoch": 0.8, "grad_norm": 1.764122033295333, "learning_rate": 9.901513261706652e-07, "loss": 0.4909, "step": 1312 }, { "epoch": 0.8, "grad_norm": 2.1074644373274225, "learning_rate": 9.84239091676748e-07, "loss": 0.5358, "step": 1313 }, { "epoch": 0.8, "grad_norm": 1.8835155965933714, "learning_rate": 9.783426336194807e-07, "loss": 0.5683, "step": 1314 }, { "epoch": 0.8, "grad_norm": 1.9460151844862306, "learning_rate": 9.724619751638598e-07, "loss": 0.5901, "step": 1315 }, { "epoch": 0.8, "grad_norm": 1.9960692765948507, "learning_rate": 9.665971394128137e-07, "loss": 0.5299, "step": 1316 }, { "epoch": 0.81, "grad_norm": 1.904429551506709, "learning_rate": 9.607481494071107e-07, "loss": 0.5077, "step": 1317 }, { "epoch": 0.81, "grad_norm": 2.0807551324391618, "learning_rate": 9.549150281252633e-07, "loss": 0.4932, "step": 1318 }, { "epoch": 0.81, "grad_norm": 1.880996194999353, "learning_rate": 9.490977984834454e-07, "loss": 0.5256, "step": 1319 }, { "epoch": 0.81, "grad_norm": 2.292702889415888, "learning_rate": 9.432964833353947e-07, "loss": 0.5633, "step": 1320 }, { "epoch": 0.81, "grad_norm": 1.8157124323149034, "learning_rate": 9.375111054723301e-07, "loss": 0.5443, "step": 1321 }, { "epoch": 0.81, "grad_norm": 1.997225759296561, "learning_rate": 9.317416876228591e-07, "loss": 0.6053, "step": 1322 }, { "epoch": 0.81, "grad_norm": 2.2930834747649187, "learning_rate": 9.259882524528835e-07, "loss": 0.647, "step": 1323 }, { "epoch": 0.81, "grad_norm": 1.7855930884686897, "learning_rate": 9.202508225655216e-07, "loss": 0.4861, "step": 1324 }, { "epoch": 0.81, "grad_norm": 2.041666912419482, "learning_rate": 9.145294205010058e-07, "loss": 0.5105, "step": 1325 }, { "epoch": 0.81, "grad_norm": 2.1259688035148496, "learning_rate": 9.088240687366073e-07, "loss": 0.6038, "step": 1326 }, { "epoch": 0.81, "grad_norm": 2.0323306545521436, "learning_rate": 9.0313478968654e-07, "loss": 0.5853, "step": 1327 }, { "epoch": 0.81, "grad_norm": 2.044128726826921, "learning_rate": 8.974616057018709e-07, "loss": 0.5153, "step": 1328 }, { "epoch": 0.81, "grad_norm": 1.944192337336169, "learning_rate": 8.918045390704383e-07, "loss": 0.5475, "step": 1329 }, { "epoch": 0.81, "grad_norm": 2.062636307328745, "learning_rate": 8.861636120167632e-07, "loss": 0.5959, "step": 1330 }, { "epoch": 0.81, "grad_norm": 2.1486851598365946, "learning_rate": 8.805388467019549e-07, "loss": 0.5959, "step": 1331 }, { "epoch": 0.81, "grad_norm": 2.2040420193997483, "learning_rate": 8.749302652236341e-07, "loss": 0.6322, "step": 1332 }, { "epoch": 0.82, "grad_norm": 2.20406327847528, "learning_rate": 8.693378896158377e-07, "loss": 0.6114, "step": 1333 }, { "epoch": 0.82, "grad_norm": 2.1795279212664886, "learning_rate": 8.637617418489386e-07, "loss": 0.5828, "step": 1334 }, { "epoch": 0.82, "grad_norm": 2.041352967900095, "learning_rate": 8.582018438295553e-07, "loss": 0.5139, "step": 1335 }, { "epoch": 0.82, "grad_norm": 2.0047043727475167, "learning_rate": 8.52658217400466e-07, "loss": 0.5492, "step": 1336 }, { "epoch": 0.82, "grad_norm": 1.9340395107266044, "learning_rate": 8.471308843405252e-07, "loss": 0.5404, "step": 1337 }, { "epoch": 0.82, "grad_norm": 1.9443767009288708, "learning_rate": 8.416198663645775e-07, "loss": 0.6145, "step": 1338 }, { "epoch": 0.82, "grad_norm": 2.0047578158470385, "learning_rate": 8.361251851233687e-07, "loss": 0.5147, "step": 1339 }, { "epoch": 0.82, "grad_norm": 1.9775796297395034, "learning_rate": 8.306468622034663e-07, "loss": 0.4914, "step": 1340 }, { "epoch": 0.82, "grad_norm": 2.0350740385108783, "learning_rate": 8.251849191271727e-07, "loss": 0.5988, "step": 1341 }, { "epoch": 0.82, "grad_norm": 1.810070925909079, "learning_rate": 8.197393773524359e-07, "loss": 0.4841, "step": 1342 }, { "epoch": 0.82, "grad_norm": 1.9563215783299615, "learning_rate": 8.143102582727741e-07, "loss": 0.5356, "step": 1343 }, { "epoch": 0.82, "grad_norm": 1.8199362899555016, "learning_rate": 8.088975832171819e-07, "loss": 0.4712, "step": 1344 }, { "epoch": 0.82, "grad_norm": 2.309704865979389, "learning_rate": 8.035013734500557e-07, "loss": 0.6218, "step": 1345 }, { "epoch": 0.82, "grad_norm": 2.0272627809313923, "learning_rate": 7.981216501711053e-07, "loss": 0.5838, "step": 1346 }, { "epoch": 0.82, "grad_norm": 2.0114764889056613, "learning_rate": 7.927584345152672e-07, "loss": 0.5609, "step": 1347 }, { "epoch": 0.82, "grad_norm": 1.9745269009451916, "learning_rate": 7.874117475526305e-07, "loss": 0.4989, "step": 1348 }, { "epoch": 0.82, "grad_norm": 2.253717109150984, "learning_rate": 7.820816102883477e-07, "loss": 0.6223, "step": 1349 }, { "epoch": 0.83, "grad_norm": 2.098840934097801, "learning_rate": 7.767680436625513e-07, "loss": 0.5429, "step": 1350 }, { "epoch": 0.83, "grad_norm": 1.9010834528450948, "learning_rate": 7.714710685502764e-07, "loss": 0.5055, "step": 1351 }, { "epoch": 0.83, "grad_norm": 2.0272090911880176, "learning_rate": 7.661907057613766e-07, "loss": 0.5749, "step": 1352 }, { "epoch": 0.83, "grad_norm": 2.1091229605800645, "learning_rate": 7.609269760404392e-07, "loss": 0.5019, "step": 1353 }, { "epoch": 0.83, "grad_norm": 1.9509717065137873, "learning_rate": 7.556799000667097e-07, "loss": 0.4808, "step": 1354 }, { "epoch": 0.83, "grad_norm": 1.873704324767879, "learning_rate": 7.504494984540033e-07, "loss": 0.4928, "step": 1355 }, { "epoch": 0.83, "grad_norm": 1.987007022534763, "learning_rate": 7.452357917506309e-07, "loss": 0.5312, "step": 1356 }, { "epoch": 0.83, "grad_norm": 2.1167223759818503, "learning_rate": 7.40038800439315e-07, "loss": 0.5532, "step": 1357 }, { "epoch": 0.83, "grad_norm": 1.8820691821813884, "learning_rate": 7.348585449371076e-07, "loss": 0.5615, "step": 1358 }, { "epoch": 0.83, "grad_norm": 1.9800556013383923, "learning_rate": 7.296950455953145e-07, "loss": 0.5546, "step": 1359 }, { "epoch": 0.83, "grad_norm": 2.278698097867691, "learning_rate": 7.245483226994094e-07, "loss": 0.6373, "step": 1360 }, { "epoch": 0.83, "grad_norm": 1.8404133278696633, "learning_rate": 7.19418396468961e-07, "loss": 0.5212, "step": 1361 }, { "epoch": 0.83, "grad_norm": 2.082478736477018, "learning_rate": 7.14305287057549e-07, "loss": 0.5405, "step": 1362 }, { "epoch": 0.83, "grad_norm": 2.274543877989927, "learning_rate": 7.092090145526842e-07, "loss": 0.5788, "step": 1363 }, { "epoch": 0.83, "grad_norm": 2.1975775615791284, "learning_rate": 7.041295989757352e-07, "loss": 0.5705, "step": 1364 }, { "epoch": 0.83, "grad_norm": 2.3957382130192744, "learning_rate": 6.990670602818412e-07, "loss": 0.5319, "step": 1365 }, { "epoch": 0.84, "grad_norm": 1.9572975655214617, "learning_rate": 6.940214183598431e-07, "loss": 0.4977, "step": 1366 }, { "epoch": 0.84, "grad_norm": 2.0343363601457796, "learning_rate": 6.889926930321961e-07, "loss": 0.5601, "step": 1367 }, { "epoch": 0.84, "grad_norm": 2.2173297633145497, "learning_rate": 6.839809040549017e-07, "loss": 0.5652, "step": 1368 }, { "epoch": 0.84, "grad_norm": 1.9898806046683502, "learning_rate": 6.789860711174184e-07, "loss": 0.5604, "step": 1369 }, { "epoch": 0.84, "grad_norm": 1.755937949976552, "learning_rate": 6.740082138425963e-07, "loss": 0.5268, "step": 1370 }, { "epoch": 0.84, "grad_norm": 2.059314359247024, "learning_rate": 6.690473517865925e-07, "loss": 0.5516, "step": 1371 }, { "epoch": 0.84, "grad_norm": 1.921092025716401, "learning_rate": 6.641035044387939e-07, "loss": 0.5282, "step": 1372 }, { "epoch": 0.84, "grad_norm": 2.1611589441440904, "learning_rate": 6.591766912217456e-07, "loss": 0.5721, "step": 1373 }, { "epoch": 0.84, "grad_norm": 2.07998470722096, "learning_rate": 6.542669314910732e-07, "loss": 0.616, "step": 1374 }, { "epoch": 0.84, "grad_norm": 1.8276327443296445, "learning_rate": 6.493742445354012e-07, "loss": 0.4733, "step": 1375 }, { "epoch": 0.84, "grad_norm": 2.0470888106096568, "learning_rate": 6.44498649576285e-07, "loss": 0.6115, "step": 1376 }, { "epoch": 0.84, "grad_norm": 1.930085414769607, "learning_rate": 6.39640165768129e-07, "loss": 0.5524, "step": 1377 }, { "epoch": 0.84, "grad_norm": 1.992132029715179, "learning_rate": 6.347988121981175e-07, "loss": 0.5116, "step": 1378 }, { "epoch": 0.84, "grad_norm": 1.9749787878274394, "learning_rate": 6.299746078861346e-07, "loss": 0.5243, "step": 1379 }, { "epoch": 0.84, "grad_norm": 2.1338239619198003, "learning_rate": 6.251675717846905e-07, "loss": 0.6601, "step": 1380 }, { "epoch": 0.84, "grad_norm": 1.9037772081403355, "learning_rate": 6.203777227788493e-07, "loss": 0.537, "step": 1381 }, { "epoch": 0.85, "grad_norm": 1.982734111031249, "learning_rate": 6.156050796861551e-07, "loss": 0.5447, "step": 1382 }, { "epoch": 0.85, "grad_norm": 1.9647030662707663, "learning_rate": 6.108496612565507e-07, "loss": 0.5572, "step": 1383 }, { "epoch": 0.85, "grad_norm": 2.20503978552407, "learning_rate": 6.061114861723144e-07, "loss": 0.5847, "step": 1384 }, { "epoch": 0.85, "grad_norm": 1.8985370142150249, "learning_rate": 6.013905730479824e-07, "loss": 0.5245, "step": 1385 }, { "epoch": 0.85, "grad_norm": 2.0108479395190204, "learning_rate": 5.966869404302705e-07, "loss": 0.4869, "step": 1386 }, { "epoch": 0.85, "grad_norm": 1.6778892025630097, "learning_rate": 5.920006067980105e-07, "loss": 0.4713, "step": 1387 }, { "epoch": 0.85, "grad_norm": 1.9622392449880077, "learning_rate": 5.873315905620685e-07, "loss": 0.5619, "step": 1388 }, { "epoch": 0.85, "grad_norm": 1.8986949233718315, "learning_rate": 5.826799100652802e-07, "loss": 0.5944, "step": 1389 }, { "epoch": 0.85, "grad_norm": 2.0094350555686726, "learning_rate": 5.780455835823767e-07, "loss": 0.6029, "step": 1390 }, { "epoch": 0.85, "grad_norm": 2.01549443606308, "learning_rate": 5.734286293199065e-07, "loss": 0.5168, "step": 1391 }, { "epoch": 0.85, "grad_norm": 1.9026568713715968, "learning_rate": 5.688290654161738e-07, "loss": 0.4661, "step": 1392 }, { "epoch": 0.85, "grad_norm": 2.090067485392997, "learning_rate": 5.642469099411619e-07, "loss": 0.5773, "step": 1393 }, { "epoch": 0.85, "grad_norm": 2.041801412484887, "learning_rate": 5.596821808964592e-07, "loss": 0.5174, "step": 1394 }, { "epoch": 0.85, "grad_norm": 1.8047865906862048, "learning_rate": 5.551348962151965e-07, "loss": 0.5096, "step": 1395 }, { "epoch": 0.85, "grad_norm": 1.7749522813672998, "learning_rate": 5.506050737619706e-07, "loss": 0.4149, "step": 1396 }, { "epoch": 0.85, "grad_norm": 1.9669167109351449, "learning_rate": 5.460927313327746e-07, "loss": 0.5318, "step": 1397 }, { "epoch": 0.85, "grad_norm": 2.0851119947571397, "learning_rate": 5.415978866549309e-07, "loss": 0.5206, "step": 1398 }, { "epoch": 0.86, "grad_norm": 1.770305337825321, "learning_rate": 5.371205573870169e-07, "loss": 0.5146, "step": 1399 }, { "epoch": 0.86, "grad_norm": 2.0529803134924793, "learning_rate": 5.326607611188023e-07, "loss": 0.5925, "step": 1400 }, { "epoch": 0.86, "grad_norm": 1.8408476413034762, "learning_rate": 5.282185153711739e-07, "loss": 0.5419, "step": 1401 }, { "epoch": 0.86, "grad_norm": 2.1277879535451887, "learning_rate": 5.237938375960683e-07, "loss": 0.5522, "step": 1402 }, { "epoch": 0.86, "grad_norm": 1.798009767692874, "learning_rate": 5.19386745176405e-07, "loss": 0.4908, "step": 1403 }, { "epoch": 0.86, "grad_norm": 1.9218825894337166, "learning_rate": 5.149972554260191e-07, "loss": 0.5907, "step": 1404 }, { "epoch": 0.86, "grad_norm": 1.7135646036349135, "learning_rate": 5.106253855895865e-07, "loss": 0.5325, "step": 1405 }, { "epoch": 0.86, "grad_norm": 2.0051336782599916, "learning_rate": 5.062711528425657e-07, "loss": 0.552, "step": 1406 }, { "epoch": 0.86, "grad_norm": 1.823540052411061, "learning_rate": 5.019345742911241e-07, "loss": 0.5279, "step": 1407 }, { "epoch": 0.86, "grad_norm": 1.9498421802995456, "learning_rate": 4.976156669720706e-07, "loss": 0.4684, "step": 1408 }, { "epoch": 0.86, "grad_norm": 2.0510089396163464, "learning_rate": 4.933144478527929e-07, "loss": 0.5733, "step": 1409 }, { "epoch": 0.86, "grad_norm": 1.718921166851957, "learning_rate": 4.890309338311861e-07, "loss": 0.4503, "step": 1410 }, { "epoch": 0.86, "grad_norm": 2.066841848267666, "learning_rate": 4.847651417355914e-07, "loss": 0.5523, "step": 1411 }, { "epoch": 0.86, "grad_norm": 1.935126073825529, "learning_rate": 4.805170883247228e-07, "loss": 0.5709, "step": 1412 }, { "epoch": 0.86, "grad_norm": 1.7529161435960845, "learning_rate": 4.7628679028761114e-07, "loss": 0.4784, "step": 1413 }, { "epoch": 0.86, "grad_norm": 1.9418531589165946, "learning_rate": 4.720742642435272e-07, "loss": 0.5417, "step": 1414 }, { "epoch": 0.87, "grad_norm": 2.256822591186601, "learning_rate": 4.678795267419267e-07, "loss": 0.5787, "step": 1415 }, { "epoch": 0.87, "grad_norm": 1.7848727741194108, "learning_rate": 4.63702594262378e-07, "loss": 0.4068, "step": 1416 }, { "epoch": 0.87, "grad_norm": 2.1119584563313314, "learning_rate": 4.595434832145013e-07, "loss": 0.6635, "step": 1417 }, { "epoch": 0.87, "grad_norm": 2.047030257561458, "learning_rate": 4.554022099379035e-07, "loss": 0.5171, "step": 1418 }, { "epoch": 0.87, "grad_norm": 1.9273060718802395, "learning_rate": 4.5127879070211213e-07, "loss": 0.5597, "step": 1419 }, { "epoch": 0.87, "grad_norm": 2.1177945388342727, "learning_rate": 4.471732417065144e-07, "loss": 0.5861, "step": 1420 }, { "epoch": 0.87, "grad_norm": 2.1663172290104002, "learning_rate": 4.430855790802896e-07, "loss": 0.5851, "step": 1421 }, { "epoch": 0.87, "grad_norm": 2.1902483639800887, "learning_rate": 4.3901581888235067e-07, "loss": 0.5485, "step": 1422 }, { "epoch": 0.87, "grad_norm": 2.27940192829255, "learning_rate": 4.3496397710127756e-07, "loss": 0.5683, "step": 1423 }, { "epoch": 0.87, "grad_norm": 1.6787266853728975, "learning_rate": 4.3093006965525483e-07, "loss": 0.4487, "step": 1424 }, { "epoch": 0.87, "grad_norm": 2.2173461979846554, "learning_rate": 4.2691411239201007e-07, "loss": 0.6181, "step": 1425 }, { "epoch": 0.87, "grad_norm": 2.0362219497157663, "learning_rate": 4.2291612108875226e-07, "loss": 0.5827, "step": 1426 }, { "epoch": 0.87, "grad_norm": 1.984683976482598, "learning_rate": 4.189361114521062e-07, "loss": 0.5687, "step": 1427 }, { "epoch": 0.87, "grad_norm": 2.010835056818883, "learning_rate": 4.149740991180573e-07, "loss": 0.5484, "step": 1428 }, { "epoch": 0.87, "grad_norm": 2.113774248995943, "learning_rate": 4.1103009965188125e-07, "loss": 0.598, "step": 1429 }, { "epoch": 0.87, "grad_norm": 2.0151775021994385, "learning_rate": 4.0710412854809255e-07, "loss": 0.4896, "step": 1430 }, { "epoch": 0.87, "grad_norm": 1.9514989153137001, "learning_rate": 4.0319620123037697e-07, "loss": 0.5659, "step": 1431 }, { "epoch": 0.88, "grad_norm": 2.07614867510403, "learning_rate": 3.9930633305153177e-07, "loss": 0.4641, "step": 1432 }, { "epoch": 0.88, "grad_norm": 2.2405590611515933, "learning_rate": 3.9543453929340834e-07, "loss": 0.5112, "step": 1433 }, { "epoch": 0.88, "grad_norm": 2.14510497102877, "learning_rate": 3.9158083516685043e-07, "loss": 0.6867, "step": 1434 }, { "epoch": 0.88, "grad_norm": 1.8206597677309004, "learning_rate": 3.8774523581163236e-07, "loss": 0.5024, "step": 1435 }, { "epoch": 0.88, "grad_norm": 1.9730244343440717, "learning_rate": 3.8392775629640275e-07, "loss": 0.6115, "step": 1436 }, { "epoch": 0.88, "grad_norm": 2.0113186406624677, "learning_rate": 3.80128411618626e-07, "loss": 0.5308, "step": 1437 }, { "epoch": 0.88, "grad_norm": 2.291565453318076, "learning_rate": 3.763472167045179e-07, "loss": 0.5849, "step": 1438 }, { "epoch": 0.88, "grad_norm": 1.865162322112732, "learning_rate": 3.72584186408993e-07, "loss": 0.4345, "step": 1439 }, { "epoch": 0.88, "grad_norm": 1.7806466246249277, "learning_rate": 3.688393355156022e-07, "loss": 0.4976, "step": 1440 }, { "epoch": 0.88, "grad_norm": 1.837932154589643, "learning_rate": 3.6511267873647725e-07, "loss": 0.5382, "step": 1441 }, { "epoch": 0.88, "grad_norm": 1.900042396349841, "learning_rate": 3.614042307122728e-07, "loss": 0.5135, "step": 1442 }, { "epoch": 0.88, "grad_norm": 2.0377705675722435, "learning_rate": 3.577140060121059e-07, "loss": 0.6439, "step": 1443 }, { "epoch": 0.88, "grad_norm": 1.9812798873664048, "learning_rate": 3.54042019133502e-07, "loss": 0.518, "step": 1444 }, { "epoch": 0.88, "grad_norm": 1.9801105899864375, "learning_rate": 3.5038828450233874e-07, "loss": 0.5513, "step": 1445 }, { "epoch": 0.88, "grad_norm": 1.787558486401592, "learning_rate": 3.4675281647278346e-07, "loss": 0.4717, "step": 1446 }, { "epoch": 0.88, "grad_norm": 1.7350169337556531, "learning_rate": 3.431356293272442e-07, "loss": 0.4517, "step": 1447 }, { "epoch": 0.89, "grad_norm": 1.9036391339274414, "learning_rate": 3.395367372763092e-07, "loss": 0.4952, "step": 1448 }, { "epoch": 0.89, "grad_norm": 2.1326612564420078, "learning_rate": 3.3595615445869033e-07, "loss": 0.665, "step": 1449 }, { "epoch": 0.89, "grad_norm": 1.9766629375982392, "learning_rate": 3.3239389494117316e-07, "loss": 0.4712, "step": 1450 }, { "epoch": 0.89, "grad_norm": 2.085629103255329, "learning_rate": 3.288499727185529e-07, "loss": 0.5991, "step": 1451 }, { "epoch": 0.89, "grad_norm": 1.9952308886565442, "learning_rate": 3.253244017135876e-07, "loss": 0.5492, "step": 1452 }, { "epoch": 0.89, "grad_norm": 1.9823172713501394, "learning_rate": 3.218171957769411e-07, "loss": 0.5133, "step": 1453 }, { "epoch": 0.89, "grad_norm": 2.243504252708836, "learning_rate": 3.183283686871236e-07, "loss": 0.5375, "step": 1454 }, { "epoch": 0.89, "grad_norm": 2.160814779756097, "learning_rate": 3.1485793415044483e-07, "loss": 0.5441, "step": 1455 }, { "epoch": 0.89, "grad_norm": 2.1575822253594397, "learning_rate": 3.1140590580095777e-07, "loss": 0.5261, "step": 1456 }, { "epoch": 0.89, "grad_norm": 2.072867103693364, "learning_rate": 3.079722972004007e-07, "loss": 0.528, "step": 1457 }, { "epoch": 0.89, "grad_norm": 2.067717194924602, "learning_rate": 3.0455712183815044e-07, "loss": 0.5705, "step": 1458 }, { "epoch": 0.89, "grad_norm": 2.2018910614851053, "learning_rate": 3.011603931311652e-07, "loss": 0.6087, "step": 1459 }, { "epoch": 0.89, "grad_norm": 1.799733514347377, "learning_rate": 2.9778212442393373e-07, "loss": 0.3817, "step": 1460 }, { "epoch": 0.89, "grad_norm": 2.0297829733559096, "learning_rate": 2.9442232898842184e-07, "loss": 0.5627, "step": 1461 }, { "epoch": 0.89, "grad_norm": 2.1127229638235314, "learning_rate": 2.910810200240205e-07, "loss": 0.6539, "step": 1462 }, { "epoch": 0.89, "grad_norm": 2.2173549226318148, "learning_rate": 2.877582106574961e-07, "loss": 0.6292, "step": 1463 }, { "epoch": 0.9, "grad_norm": 1.7665892894148392, "learning_rate": 2.8445391394293364e-07, "loss": 0.536, "step": 1464 }, { "epoch": 0.9, "grad_norm": 1.9459724264850673, "learning_rate": 2.811681428616919e-07, "loss": 0.506, "step": 1465 }, { "epoch": 0.9, "grad_norm": 2.055519305865498, "learning_rate": 2.779009103223473e-07, "loss": 0.5743, "step": 1466 }, { "epoch": 0.9, "grad_norm": 1.927080536920677, "learning_rate": 2.746522291606463e-07, "loss": 0.5181, "step": 1467 }, { "epoch": 0.9, "grad_norm": 2.04096009635488, "learning_rate": 2.7142211213945224e-07, "loss": 0.564, "step": 1468 }, { "epoch": 0.9, "grad_norm": 1.9317429342953267, "learning_rate": 2.682105719486994e-07, "loss": 0.5655, "step": 1469 }, { "epoch": 0.9, "grad_norm": 2.1614143475246803, "learning_rate": 2.65017621205339e-07, "loss": 0.5385, "step": 1470 }, { "epoch": 0.9, "grad_norm": 1.9518158323053523, "learning_rate": 2.61843272453291e-07, "loss": 0.5292, "step": 1471 }, { "epoch": 0.9, "grad_norm": 1.988402222876475, "learning_rate": 2.5868753816339574e-07, "loss": 0.4855, "step": 1472 }, { "epoch": 0.9, "grad_norm": 2.2461460760928973, "learning_rate": 2.5555043073336394e-07, "loss": 0.545, "step": 1473 }, { "epoch": 0.9, "grad_norm": 2.016749712197434, "learning_rate": 2.524319624877275e-07, "loss": 0.5487, "step": 1474 }, { "epoch": 0.9, "grad_norm": 1.8374656172629769, "learning_rate": 2.4933214567779473e-07, "loss": 0.4698, "step": 1475 }, { "epoch": 0.9, "grad_norm": 2.1647972903983224, "learning_rate": 2.462509924815948e-07, "loss": 0.6418, "step": 1476 }, { "epoch": 0.9, "grad_norm": 1.9502593053235608, "learning_rate": 2.4318851500383823e-07, "loss": 0.5, "step": 1477 }, { "epoch": 0.9, "grad_norm": 1.9211282369606173, "learning_rate": 2.4014472527586483e-07, "loss": 0.4927, "step": 1478 }, { "epoch": 0.9, "grad_norm": 2.329177425501773, "learning_rate": 2.3711963525559544e-07, "loss": 0.5993, "step": 1479 }, { "epoch": 0.9, "grad_norm": 2.225800165473701, "learning_rate": 2.3411325682748843e-07, "loss": 0.6954, "step": 1480 }, { "epoch": 0.91, "grad_norm": 2.0522726914562703, "learning_rate": 2.3112560180249154e-07, "loss": 0.5618, "step": 1481 }, { "epoch": 0.91, "grad_norm": 2.1168420702658657, "learning_rate": 2.2815668191799255e-07, "loss": 0.5674, "step": 1482 }, { "epoch": 0.91, "grad_norm": 2.032866002096309, "learning_rate": 2.2520650883777917e-07, "loss": 0.5903, "step": 1483 }, { "epoch": 0.91, "grad_norm": 1.927079943417791, "learning_rate": 2.222750941519869e-07, "loss": 0.5379, "step": 1484 }, { "epoch": 0.91, "grad_norm": 2.230233104141958, "learning_rate": 2.193624493770591e-07, "loss": 0.5362, "step": 1485 }, { "epoch": 0.91, "grad_norm": 1.9871866256133242, "learning_rate": 2.1646858595569754e-07, "loss": 0.5402, "step": 1486 }, { "epoch": 0.91, "grad_norm": 2.0451780616189827, "learning_rate": 2.135935152568186e-07, "loss": 0.5671, "step": 1487 }, { "epoch": 0.91, "grad_norm": 1.9066201288251257, "learning_rate": 2.107372485755105e-07, "loss": 0.5467, "step": 1488 }, { "epoch": 0.91, "grad_norm": 1.797950359009013, "learning_rate": 2.0789979713298714e-07, "loss": 0.5164, "step": 1489 }, { "epoch": 0.91, "grad_norm": 2.071072453128202, "learning_rate": 2.0508117207654276e-07, "loss": 0.5991, "step": 1490 }, { "epoch": 0.91, "grad_norm": 2.317654978785418, "learning_rate": 2.0228138447951128e-07, "loss": 0.6293, "step": 1491 }, { "epoch": 0.91, "grad_norm": 2.053251641022129, "learning_rate": 1.9950044534122138e-07, "loss": 0.5853, "step": 1492 }, { "epoch": 0.91, "grad_norm": 1.971312933014048, "learning_rate": 1.9673836558695148e-07, "loss": 0.4732, "step": 1493 }, { "epoch": 0.91, "grad_norm": 1.9713907215979547, "learning_rate": 1.9399515606789098e-07, "loss": 0.6066, "step": 1494 }, { "epoch": 0.91, "grad_norm": 1.9568482450287366, "learning_rate": 1.9127082756109138e-07, "loss": 0.547, "step": 1495 }, { "epoch": 0.91, "grad_norm": 1.7163468064474119, "learning_rate": 1.8856539076943126e-07, "loss": 0.3999, "step": 1496 }, { "epoch": 0.92, "grad_norm": 1.9178630187217227, "learning_rate": 1.858788563215702e-07, "loss": 0.5042, "step": 1497 }, { "epoch": 0.92, "grad_norm": 2.127461173898097, "learning_rate": 1.8321123477190506e-07, "loss": 0.6439, "step": 1498 }, { "epoch": 0.92, "grad_norm": 2.231384706660567, "learning_rate": 1.8056253660053258e-07, "loss": 0.5077, "step": 1499 }, { "epoch": 0.92, "grad_norm": 1.9635997200687012, "learning_rate": 1.7793277221320794e-07, "loss": 0.5042, "step": 1500 }, { "epoch": 0.92, "grad_norm": 2.073065458571614, "learning_rate": 1.7532195194129964e-07, "loss": 0.5212, "step": 1501 }, { "epoch": 0.92, "grad_norm": 1.977411836335037, "learning_rate": 1.7273008604175301e-07, "loss": 0.5035, "step": 1502 }, { "epoch": 0.92, "grad_norm": 1.9442884403854808, "learning_rate": 1.7015718469705066e-07, "loss": 0.5649, "step": 1503 }, { "epoch": 0.92, "grad_norm": 1.9764790370004715, "learning_rate": 1.6760325801516597e-07, "loss": 0.5255, "step": 1504 }, { "epoch": 0.92, "grad_norm": 1.9950729966418053, "learning_rate": 1.6506831602953298e-07, "loss": 0.5285, "step": 1505 }, { "epoch": 0.92, "grad_norm": 1.876091437155584, "learning_rate": 1.625523686989977e-07, "loss": 0.4915, "step": 1506 }, { "epoch": 0.92, "grad_norm": 2.1698152647396465, "learning_rate": 1.6005542590778521e-07, "loss": 0.6394, "step": 1507 }, { "epoch": 0.92, "grad_norm": 1.9390084511570913, "learning_rate": 1.5757749746546037e-07, "loss": 0.5461, "step": 1508 }, { "epoch": 0.92, "grad_norm": 2.0186157583922135, "learning_rate": 1.5511859310688326e-07, "loss": 0.5515, "step": 1509 }, { "epoch": 0.92, "grad_norm": 1.908132809961028, "learning_rate": 1.5267872249217997e-07, "loss": 0.4557, "step": 1510 }, { "epoch": 0.92, "grad_norm": 2.189140445691145, "learning_rate": 1.5025789520669688e-07, "loss": 0.5904, "step": 1511 }, { "epoch": 0.92, "grad_norm": 2.1203241389390257, "learning_rate": 1.4785612076096856e-07, "loss": 0.5698, "step": 1512 }, { "epoch": 0.93, "grad_norm": 1.8864144078816145, "learning_rate": 1.454734085906756e-07, "loss": 0.5211, "step": 1513 }, { "epoch": 0.93, "grad_norm": 2.0640726129454867, "learning_rate": 1.4310976805661237e-07, "loss": 0.534, "step": 1514 }, { "epoch": 0.93, "grad_norm": 1.9593391676293495, "learning_rate": 1.407652084446459e-07, "loss": 0.5575, "step": 1515 }, { "epoch": 0.93, "grad_norm": 1.9956718882503213, "learning_rate": 1.3843973896568275e-07, "loss": 0.4995, "step": 1516 }, { "epoch": 0.93, "grad_norm": 2.0309484325202027, "learning_rate": 1.3613336875563045e-07, "loss": 0.5561, "step": 1517 }, { "epoch": 0.93, "grad_norm": 2.2977806815622808, "learning_rate": 1.338461068753627e-07, "loss": 0.6895, "step": 1518 }, { "epoch": 0.93, "grad_norm": 1.9903192519492166, "learning_rate": 1.3157796231068497e-07, "loss": 0.5644, "step": 1519 }, { "epoch": 0.93, "grad_norm": 1.9448808087386893, "learning_rate": 1.293289439722961e-07, "loss": 0.5146, "step": 1520 }, { "epoch": 0.93, "grad_norm": 2.115645901357327, "learning_rate": 1.2709906069575561e-07, "loss": 0.5702, "step": 1521 }, { "epoch": 0.93, "grad_norm": 2.1808080246002457, "learning_rate": 1.2488832124144923e-07, "loss": 0.4805, "step": 1522 }, { "epoch": 0.93, "grad_norm": 1.8228170984929375, "learning_rate": 1.2269673429455287e-07, "loss": 0.4851, "step": 1523 }, { "epoch": 0.93, "grad_norm": 2.038456785853388, "learning_rate": 1.2052430846499984e-07, "loss": 0.5771, "step": 1524 }, { "epoch": 0.93, "grad_norm": 1.8904101603890644, "learning_rate": 1.183710522874454e-07, "loss": 0.4813, "step": 1525 }, { "epoch": 0.93, "grad_norm": 1.7215684414215477, "learning_rate": 1.1623697422123603e-07, "loss": 0.4418, "step": 1526 }, { "epoch": 0.93, "grad_norm": 1.949814139587883, "learning_rate": 1.1412208265037417e-07, "loss": 0.4467, "step": 1527 }, { "epoch": 0.93, "grad_norm": 1.989698073002338, "learning_rate": 1.1202638588348413e-07, "loss": 0.479, "step": 1528 }, { "epoch": 0.93, "grad_norm": 1.9796631752851332, "learning_rate": 1.0994989215378227e-07, "loss": 0.6001, "step": 1529 }, { "epoch": 0.94, "grad_norm": 2.0989381422150837, "learning_rate": 1.0789260961904357e-07, "loss": 0.5106, "step": 1530 }, { "epoch": 0.94, "grad_norm": 1.971183598874908, "learning_rate": 1.0585454636156788e-07, "loss": 0.5654, "step": 1531 }, { "epoch": 0.94, "grad_norm": 1.8203580247750317, "learning_rate": 1.0383571038815155e-07, "loss": 0.4136, "step": 1532 }, { "epoch": 0.94, "grad_norm": 2.1265452345564264, "learning_rate": 1.0183610963005298e-07, "loss": 0.6466, "step": 1533 }, { "epoch": 0.94, "grad_norm": 1.9020416641784, "learning_rate": 9.98557519429616e-08, "loss": 0.5149, "step": 1534 }, { "epoch": 0.94, "grad_norm": 1.9565333994726395, "learning_rate": 9.789464510697011e-08, "loss": 0.6182, "step": 1535 }, { "epoch": 0.94, "grad_norm": 2.192339437616818, "learning_rate": 9.595279682654002e-08, "loss": 0.5793, "step": 1536 }, { "epoch": 0.94, "grad_norm": 1.752208514931023, "learning_rate": 9.40302147304739e-08, "loss": 0.4573, "step": 1537 }, { "epoch": 0.94, "grad_norm": 1.9487144688198685, "learning_rate": 9.212690637188492e-08, "loss": 0.5045, "step": 1538 }, { "epoch": 0.94, "grad_norm": 2.1355012550891095, "learning_rate": 9.024287922816566e-08, "loss": 0.5376, "step": 1539 }, { "epoch": 0.94, "grad_norm": 1.830404039518353, "learning_rate": 8.83781407009604e-08, "loss": 0.5084, "step": 1540 }, { "epoch": 0.94, "grad_norm": 2.0061575400005927, "learning_rate": 8.653269811613685e-08, "loss": 0.5405, "step": 1541 }, { "epoch": 0.94, "grad_norm": 1.9051869282056297, "learning_rate": 8.4706558723755e-08, "loss": 0.4705, "step": 1542 }, { "epoch": 0.94, "grad_norm": 1.970236377135854, "learning_rate": 8.289972969803884e-08, "loss": 0.4761, "step": 1543 }, { "epoch": 0.94, "grad_norm": 2.393702968269512, "learning_rate": 8.111221813735137e-08, "loss": 0.5913, "step": 1544 }, { "epoch": 0.94, "grad_norm": 2.1643166252959807, "learning_rate": 7.934403106416245e-08, "loss": 0.6399, "step": 1545 }, { "epoch": 0.95, "grad_norm": 2.24965238428619, "learning_rate": 7.759517542502426e-08, "loss": 0.5946, "step": 1546 }, { "epoch": 0.95, "grad_norm": 2.0038056323845663, "learning_rate": 7.586565809054258e-08, "loss": 0.5606, "step": 1547 }, { "epoch": 0.95, "grad_norm": 2.1007936583122397, "learning_rate": 7.415548585534949e-08, "loss": 0.6222, "step": 1548 }, { "epoch": 0.95, "grad_norm": 2.0990617623271497, "learning_rate": 7.246466543807951e-08, "loss": 0.6033, "step": 1549 }, { "epoch": 0.95, "grad_norm": 2.235834211479321, "learning_rate": 7.0793203481338e-08, "loss": 0.5658, "step": 1550 }, { "epoch": 0.95, "grad_norm": 2.0971634187847843, "learning_rate": 6.914110655168005e-08, "loss": 0.5197, "step": 1551 }, { "epoch": 0.95, "grad_norm": 2.2723168272777845, "learning_rate": 6.750838113958381e-08, "loss": 0.5444, "step": 1552 }, { "epoch": 0.95, "grad_norm": 2.14031105890706, "learning_rate": 6.589503365941996e-08, "loss": 0.5484, "step": 1553 }, { "epoch": 0.95, "grad_norm": 1.9836284579510188, "learning_rate": 6.430107044943512e-08, "loss": 0.5281, "step": 1554 }, { "epoch": 0.95, "grad_norm": 1.8255909265594834, "learning_rate": 6.272649777171902e-08, "loss": 0.4866, "step": 1555 }, { "epoch": 0.95, "grad_norm": 2.107443031046224, "learning_rate": 6.117132181218454e-08, "loss": 0.5199, "step": 1556 }, { "epoch": 0.95, "grad_norm": 2.1682405735610626, "learning_rate": 5.963554868054167e-08, "loss": 0.539, "step": 1557 }, { "epoch": 0.95, "grad_norm": 2.24775315981034, "learning_rate": 5.8119184410274085e-08, "loss": 0.5139, "step": 1558 }, { "epoch": 0.95, "grad_norm": 1.998700285054948, "learning_rate": 5.662223495861596e-08, "loss": 0.5518, "step": 1559 }, { "epoch": 0.95, "grad_norm": 2.006720101430102, "learning_rate": 5.5144706206525235e-08, "loss": 0.5034, "step": 1560 }, { "epoch": 0.95, "grad_norm": 1.9240681602880376, "learning_rate": 5.368660395866643e-08, "loss": 0.5566, "step": 1561 }, { "epoch": 0.96, "grad_norm": 2.367451906830005, "learning_rate": 5.2247933943382344e-08, "loss": 0.6171, "step": 1562 }, { "epoch": 0.96, "grad_norm": 1.9591190023164184, "learning_rate": 5.0828701812674074e-08, "loss": 0.5367, "step": 1563 }, { "epoch": 0.96, "grad_norm": 2.0598213561773577, "learning_rate": 4.94289131421799e-08, "loss": 0.5737, "step": 1564 }, { "epoch": 0.96, "grad_norm": 2.0711836264717634, "learning_rate": 4.804857343114977e-08, "loss": 0.5522, "step": 1565 }, { "epoch": 0.96, "grad_norm": 1.803888225967423, "learning_rate": 4.668768810242752e-08, "loss": 0.441, "step": 1566 }, { "epoch": 0.96, "grad_norm": 2.0820741333254507, "learning_rate": 4.534626250242702e-08, "loss": 0.5394, "step": 1567 }, { "epoch": 0.96, "grad_norm": 2.2847763509593477, "learning_rate": 4.4024301901113285e-08, "loss": 0.5254, "step": 1568 }, { "epoch": 0.96, "grad_norm": 1.9411071290247621, "learning_rate": 4.2721811491978626e-08, "loss": 0.5702, "step": 1569 }, { "epoch": 0.96, "grad_norm": 1.745037379504202, "learning_rate": 4.1438796392025416e-08, "loss": 0.4511, "step": 1570 }, { "epoch": 0.96, "grad_norm": 2.24676269385492, "learning_rate": 4.017526164174501e-08, "loss": 0.5475, "step": 1571 }, { "epoch": 0.96, "grad_norm": 1.8719954199651734, "learning_rate": 3.8931212205096655e-08, "loss": 0.5167, "step": 1572 }, { "epoch": 0.96, "grad_norm": 2.1782725251581443, "learning_rate": 3.770665296949028e-08, "loss": 0.583, "step": 1573 }, { "epoch": 0.96, "grad_norm": 2.084620351446328, "learning_rate": 3.650158874576537e-08, "loss": 0.5573, "step": 1574 }, { "epoch": 0.96, "grad_norm": 1.9942648310254383, "learning_rate": 3.5316024268172713e-08, "loss": 0.5195, "step": 1575 }, { "epoch": 0.96, "grad_norm": 1.9276106667007256, "learning_rate": 3.41499641943549e-08, "loss": 0.461, "step": 1576 }, { "epoch": 0.96, "grad_norm": 2.0002769940610676, "learning_rate": 3.3003413105331396e-08, "loss": 0.5253, "step": 1577 }, { "epoch": 0.96, "grad_norm": 2.3233564755810336, "learning_rate": 3.187637550547573e-08, "loss": 0.6343, "step": 1578 }, { "epoch": 0.97, "grad_norm": 1.9669404121882792, "learning_rate": 3.076885582250111e-08, "loss": 0.5298, "step": 1579 }, { "epoch": 0.97, "grad_norm": 2.160706870886176, "learning_rate": 2.9680858407441503e-08, "loss": 0.5412, "step": 1580 }, { "epoch": 0.97, "grad_norm": 2.1010182979674363, "learning_rate": 2.8612387534636687e-08, "loss": 0.5874, "step": 1581 }, { "epoch": 0.97, "grad_norm": 2.3185770521155464, "learning_rate": 2.756344740171224e-08, "loss": 0.5676, "step": 1582 }, { "epoch": 0.97, "grad_norm": 1.928726797539864, "learning_rate": 2.653404212956512e-08, "loss": 0.5199, "step": 1583 }, { "epoch": 0.97, "grad_norm": 2.374638635528354, "learning_rate": 2.552417576234756e-08, "loss": 0.5822, "step": 1584 }, { "epoch": 0.97, "grad_norm": 2.1312842535423666, "learning_rate": 2.4533852267450976e-08, "loss": 0.5486, "step": 1585 }, { "epoch": 0.97, "grad_norm": 1.887474697390811, "learning_rate": 2.3563075535487646e-08, "loss": 0.5318, "step": 1586 }, { "epoch": 0.97, "grad_norm": 2.162989562166336, "learning_rate": 2.2611849380280715e-08, "loss": 0.5646, "step": 1587 }, { "epoch": 0.97, "grad_norm": 1.8865197403491825, "learning_rate": 2.1680177538845882e-08, "loss": 0.511, "step": 1588 }, { "epoch": 0.97, "grad_norm": 2.0221844144721084, "learning_rate": 2.0768063671375292e-08, "loss": 0.5605, "step": 1589 }, { "epoch": 0.97, "grad_norm": 1.7341292361915335, "learning_rate": 1.9875511361227562e-08, "loss": 0.5054, "step": 1590 }, { "epoch": 0.97, "grad_norm": 2.1625957875579194, "learning_rate": 1.9002524114909438e-08, "loss": 0.5484, "step": 1591 }, { "epoch": 0.97, "grad_norm": 1.890787277810342, "learning_rate": 1.8149105362064157e-08, "loss": 0.4912, "step": 1592 }, { "epoch": 0.97, "grad_norm": 1.89646065518405, "learning_rate": 1.731525845545812e-08, "loss": 0.4987, "step": 1593 }, { "epoch": 0.97, "grad_norm": 2.000138281803427, "learning_rate": 1.6500986670966444e-08, "loss": 0.5334, "step": 1594 }, { "epoch": 0.98, "grad_norm": 1.9360765732063108, "learning_rate": 1.5706293207561896e-08, "loss": 0.5272, "step": 1595 }, { "epoch": 0.98, "grad_norm": 1.8899446974988472, "learning_rate": 1.4931181187300413e-08, "loss": 0.5351, "step": 1596 }, { "epoch": 0.98, "grad_norm": 2.083452401449017, "learning_rate": 1.4175653655309484e-08, "loss": 0.5654, "step": 1597 }, { "epoch": 0.98, "grad_norm": 1.7902773793269389, "learning_rate": 1.3439713579777025e-08, "loss": 0.4598, "step": 1598 }, { "epoch": 0.98, "grad_norm": 1.9347948196954086, "learning_rate": 1.2723363851939175e-08, "loss": 0.5546, "step": 1599 }, { "epoch": 0.98, "grad_norm": 2.1793801518923117, "learning_rate": 1.2026607286068637e-08, "loss": 0.5184, "step": 1600 }, { "epoch": 0.98, "grad_norm": 2.1162767261686497, "learning_rate": 1.1349446619463578e-08, "loss": 0.5576, "step": 1601 }, { "epoch": 0.98, "grad_norm": 1.9064679554702675, "learning_rate": 1.0691884512437078e-08, "loss": 0.5593, "step": 1602 }, { "epoch": 0.98, "grad_norm": 1.9599229582469748, "learning_rate": 1.0053923548307698e-08, "loss": 0.5226, "step": 1603 }, { "epoch": 0.98, "grad_norm": 2.3703310743026367, "learning_rate": 9.435566233387261e-09, "loss": 0.6998, "step": 1604 }, { "epoch": 0.98, "grad_norm": 1.8265190173903945, "learning_rate": 8.836814996971977e-09, "loss": 0.5149, "step": 1605 }, { "epoch": 0.98, "grad_norm": 1.919991660951489, "learning_rate": 8.257672191334664e-09, "loss": 0.5058, "step": 1606 }, { "epoch": 0.98, "grad_norm": 2.226643529735206, "learning_rate": 7.698140091712547e-09, "loss": 0.5828, "step": 1607 }, { "epoch": 0.98, "grad_norm": 1.971584660303256, "learning_rate": 7.158220896298917e-09, "loss": 0.5688, "step": 1608 }, { "epoch": 0.98, "grad_norm": 2.0480628954718223, "learning_rate": 6.637916726237592e-09, "loss": 0.5851, "step": 1609 }, { "epoch": 0.98, "grad_norm": 2.2325632034699905, "learning_rate": 6.1372296256101414e-09, "loss": 0.5965, "step": 1610 }, { "epoch": 0.99, "grad_norm": 2.0574293300744557, "learning_rate": 5.6561615614314505e-09, "loss": 0.5216, "step": 1611 }, { "epoch": 0.99, "grad_norm": 2.0626282148992625, "learning_rate": 5.194714423638059e-09, "loss": 0.5443, "step": 1612 }, { "epoch": 0.99, "grad_norm": 1.8130730344455217, "learning_rate": 4.752890025086499e-09, "loss": 0.5109, "step": 1613 }, { "epoch": 0.99, "grad_norm": 2.1550184229788463, "learning_rate": 4.330690101539969e-09, "loss": 0.6121, "step": 1614 }, { "epoch": 0.99, "grad_norm": 1.795454334127577, "learning_rate": 3.928116311666119e-09, "loss": 0.4971, "step": 1615 }, { "epoch": 0.99, "grad_norm": 1.8886493347103483, "learning_rate": 3.5451702370281616e-09, "loss": 0.4622, "step": 1616 }, { "epoch": 0.99, "grad_norm": 1.7323735035854002, "learning_rate": 3.181853382079325e-09, "loss": 0.4841, "step": 1617 }, { "epoch": 0.99, "grad_norm": 1.9823067756781059, "learning_rate": 2.8381671741567475e-09, "loss": 0.5521, "step": 1618 }, { "epoch": 0.99, "grad_norm": 2.127563659134435, "learning_rate": 2.514112963476478e-09, "loss": 0.5391, "step": 1619 }, { "epoch": 0.99, "grad_norm": 2.124869138735419, "learning_rate": 2.209692023126819e-09, "loss": 0.6062, "step": 1620 }, { "epoch": 0.99, "grad_norm": 1.986396902793689, "learning_rate": 1.9249055490655477e-09, "loss": 0.4799, "step": 1621 }, { "epoch": 0.99, "grad_norm": 1.9573989864120591, "learning_rate": 1.6597546601127001e-09, "loss": 0.5542, "step": 1622 }, { "epoch": 0.99, "grad_norm": 1.8291458436134207, "learning_rate": 1.4142403979483522e-09, "loss": 0.4648, "step": 1623 }, { "epoch": 0.99, "grad_norm": 2.0950932153675312, "learning_rate": 1.1883637271065118e-09, "loss": 0.5056, "step": 1624 }, { "epoch": 0.99, "grad_norm": 1.8233239812965405, "learning_rate": 9.821255349734548e-10, "loss": 0.5067, "step": 1625 }, { "epoch": 0.99, "grad_norm": 2.1447666317815934, "learning_rate": 7.955266317821731e-10, "loss": 0.5142, "step": 1626 }, { "epoch": 0.99, "grad_norm": 2.0265645163369683, "learning_rate": 6.28567750610709e-10, "loss": 0.5296, "step": 1627 }, { "epoch": 1.0, "grad_norm": 1.8561507428186768, "learning_rate": 4.812495473788259e-10, "loss": 0.5011, "step": 1628 }, { "epoch": 1.0, "grad_norm": 2.2576656272188163, "learning_rate": 3.5357260084523114e-10, "loss": 0.5452, "step": 1629 }, { "epoch": 1.0, "grad_norm": 2.224588449835144, "learning_rate": 2.4553741260535667e-10, "loss": 0.56, "step": 1630 }, { "epoch": 1.0, "grad_norm": 2.027404547376381, "learning_rate": 1.5714440708913815e-10, "loss": 0.5433, "step": 1631 }, { "epoch": 1.0, "grad_norm": 1.9249522577325628, "learning_rate": 8.839393155990472e-11, "loss": 0.5585, "step": 1632 }, { "epoch": 1.0, "grad_norm": 1.916155539003378, "learning_rate": 3.9286256113268973e-11, "loss": 0.5188, "step": 1633 }, { "epoch": 1.0, "grad_norm": 2.0829325561732537, "learning_rate": 9.821573674906326e-12, "loss": 0.5721, "step": 1634 }, { "epoch": 1.0, "grad_norm": 2.060500339504095, "learning_rate": 0.0, "loss": 0.5242, "step": 1635 }, { "epoch": 1.0, "step": 1635, "total_flos": 669628105687040.0, "train_loss": 0.6020827626780997, "train_runtime": 53189.986, "train_samples_per_second": 3.936, "train_steps_per_second": 0.031 } ], "logging_steps": 1.0, "max_steps": 1635, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 669628105687040.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }