{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 2008, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00398406374501992, "grad_norm": 4.348448276519775, "learning_rate": 5.940594059405941e-08, "loss": 2.1171607971191406, "step": 2 }, { "epoch": 0.00796812749003984, "grad_norm": 2.55696177482605, "learning_rate": 1.782178217821782e-07, "loss": 2.068465232849121, "step": 4 }, { "epoch": 0.01195219123505976, "grad_norm": 3.159899950027466, "learning_rate": 2.9702970297029703e-07, "loss": 2.136167287826538, "step": 6 }, { "epoch": 0.01593625498007968, "grad_norm": 2.0796260833740234, "learning_rate": 4.158415841584159e-07, "loss": 1.8786698579788208, "step": 8 }, { "epoch": 0.0199203187250996, "grad_norm": 5.41955041885376, "learning_rate": 5.346534653465346e-07, "loss": 1.9257912635803223, "step": 10 }, { "epoch": 0.02390438247011952, "grad_norm": 11.406185150146484, "learning_rate": 6.534653465346535e-07, "loss": 2.368868827819824, "step": 12 }, { "epoch": 0.027888446215139442, "grad_norm": 1.901093602180481, "learning_rate": 7.722772277227723e-07, "loss": 1.9428346157073975, "step": 14 }, { "epoch": 0.03187250996015936, "grad_norm": 1.393601894378662, "learning_rate": 8.910891089108911e-07, "loss": 1.7873543500900269, "step": 16 }, { "epoch": 0.035856573705179286, "grad_norm": 1.4436230659484863, "learning_rate": 1.00990099009901e-06, "loss": 1.2166668176651, "step": 18 }, { "epoch": 0.0398406374501992, "grad_norm": 1.8145285844802856, "learning_rate": 1.1287128712871288e-06, "loss": 1.6057647466659546, "step": 20 }, { "epoch": 0.043824701195219126, "grad_norm": 1.2188401222229004, "learning_rate": 1.2475247524752474e-06, "loss": 1.7550266981124878, "step": 22 }, { "epoch": 0.04780876494023904, "grad_norm": 1.664843201637268, "learning_rate": 1.3663366336633665e-06, "loss": 1.5540839433670044, "step": 24 }, { "epoch": 0.05179282868525897, "grad_norm": 9.42098617553711, "learning_rate": 1.4851485148514852e-06, "loss": 2.344756841659546, "step": 26 }, { "epoch": 0.055776892430278883, "grad_norm": 2.532942771911621, "learning_rate": 1.603960396039604e-06, "loss": 1.6220295429229736, "step": 28 }, { "epoch": 0.05976095617529881, "grad_norm": 14.82198429107666, "learning_rate": 1.7227722772277227e-06, "loss": 1.83803129196167, "step": 30 }, { "epoch": 0.06374501992031872, "grad_norm": 4.736570358276367, "learning_rate": 1.8415841584158415e-06, "loss": 0.9668034315109253, "step": 32 }, { "epoch": 0.06772908366533864, "grad_norm": 1.077216386795044, "learning_rate": 1.9603960396039604e-06, "loss": 1.5424432754516602, "step": 34 }, { "epoch": 0.07171314741035857, "grad_norm": 2.908050060272217, "learning_rate": 2.079207920792079e-06, "loss": 1.578654408454895, "step": 36 }, { "epoch": 0.07569721115537849, "grad_norm": 1.1227400302886963, "learning_rate": 2.198019801980198e-06, "loss": 1.4881534576416016, "step": 38 }, { "epoch": 0.0796812749003984, "grad_norm": 1.1037142276763916, "learning_rate": 2.316831683168317e-06, "loss": 1.4990899562835693, "step": 40 }, { "epoch": 0.08366533864541832, "grad_norm": 11.898150444030762, "learning_rate": 2.4356435643564358e-06, "loss": 0.6503542065620422, "step": 42 }, { "epoch": 0.08764940239043825, "grad_norm": 1.8804869651794434, "learning_rate": 2.5544554455445544e-06, "loss": 1.5055851936340332, "step": 44 }, { "epoch": 0.09163346613545817, "grad_norm": 1.0558547973632812, "learning_rate": 2.6732673267326735e-06, "loss": 1.4046030044555664, "step": 46 }, { "epoch": 0.09561752988047809, "grad_norm": 1.6279054880142212, "learning_rate": 2.792079207920792e-06, "loss": 1.228537678718567, "step": 48 }, { "epoch": 0.099601593625498, "grad_norm": 9.176322937011719, "learning_rate": 2.9108910891089108e-06, "loss": 1.4280906915664673, "step": 50 }, { "epoch": 0.10358565737051793, "grad_norm": 0.7322860956192017, "learning_rate": 3.02970297029703e-06, "loss": 1.3861970901489258, "step": 52 }, { "epoch": 0.10756972111553785, "grad_norm": 1.9037761688232422, "learning_rate": 3.148514851485149e-06, "loss": 1.8983885049819946, "step": 54 }, { "epoch": 0.11155378486055777, "grad_norm": 2.3619227409362793, "learning_rate": 3.2673267326732676e-06, "loss": 1.265608549118042, "step": 56 }, { "epoch": 0.11553784860557768, "grad_norm": 1.079222559928894, "learning_rate": 3.3861386138613858e-06, "loss": 1.3720718622207642, "step": 58 }, { "epoch": 0.11952191235059761, "grad_norm": 2.0091183185577393, "learning_rate": 3.504950495049505e-06, "loss": 1.221197485923767, "step": 60 }, { "epoch": 0.12350597609561753, "grad_norm": 1.6702687740325928, "learning_rate": 3.623762376237624e-06, "loss": 1.3659617900848389, "step": 62 }, { "epoch": 0.12749003984063745, "grad_norm": 1.4772052764892578, "learning_rate": 3.7425742574257425e-06, "loss": 0.9504954218864441, "step": 64 }, { "epoch": 0.13147410358565736, "grad_norm": 1.1839888095855713, "learning_rate": 3.861386138613861e-06, "loss": 1.3058485984802246, "step": 66 }, { "epoch": 0.13545816733067728, "grad_norm": 2.274195671081543, "learning_rate": 3.98019801980198e-06, "loss": 1.142093300819397, "step": 68 }, { "epoch": 0.1394422310756972, "grad_norm": 1.23581063747406, "learning_rate": 4.099009900990099e-06, "loss": 1.3501553535461426, "step": 70 }, { "epoch": 0.14342629482071714, "grad_norm": 0.9609231352806091, "learning_rate": 4.2178217821782175e-06, "loss": 1.394975185394287, "step": 72 }, { "epoch": 0.14741035856573706, "grad_norm": 1.0941250324249268, "learning_rate": 4.336633663366337e-06, "loss": 1.3037439584732056, "step": 74 }, { "epoch": 0.15139442231075698, "grad_norm": 2.8824353218078613, "learning_rate": 4.455445544554456e-06, "loss": 1.0416653156280518, "step": 76 }, { "epoch": 0.1553784860557769, "grad_norm": 0.9365191459655762, "learning_rate": 4.574257425742575e-06, "loss": 1.2687112092971802, "step": 78 }, { "epoch": 0.1593625498007968, "grad_norm": 1.5261850357055664, "learning_rate": 4.693069306930693e-06, "loss": 1.317929744720459, "step": 80 }, { "epoch": 0.16334661354581673, "grad_norm": 1.2774893045425415, "learning_rate": 4.811881188118812e-06, "loss": 1.0010876655578613, "step": 82 }, { "epoch": 0.16733067729083664, "grad_norm": 2.2119550704956055, "learning_rate": 4.93069306930693e-06, "loss": 0.4978000223636627, "step": 84 }, { "epoch": 0.17131474103585656, "grad_norm": 0.8584203124046326, "learning_rate": 5.049504950495049e-06, "loss": 0.5417638421058655, "step": 86 }, { "epoch": 0.1752988047808765, "grad_norm": 1.1234948635101318, "learning_rate": 5.168316831683168e-06, "loss": 1.3661960363388062, "step": 88 }, { "epoch": 0.17928286852589642, "grad_norm": 1.9819002151489258, "learning_rate": 5.2871287128712874e-06, "loss": 0.8402650952339172, "step": 90 }, { "epoch": 0.18326693227091634, "grad_norm": 9.981027603149414, "learning_rate": 5.4059405940594065e-06, "loss": 1.0407862663269043, "step": 92 }, { "epoch": 0.18725099601593626, "grad_norm": 5.5226335525512695, "learning_rate": 5.524752475247525e-06, "loss": 1.3026604652404785, "step": 94 }, { "epoch": 0.19123505976095617, "grad_norm": 2.536931037902832, "learning_rate": 5.643564356435644e-06, "loss": 1.050534963607788, "step": 96 }, { "epoch": 0.1952191235059761, "grad_norm": 2.8480377197265625, "learning_rate": 5.762376237623762e-06, "loss": 1.2697981595993042, "step": 98 }, { "epoch": 0.199203187250996, "grad_norm": 1.2788549661636353, "learning_rate": 5.881188118811881e-06, "loss": 1.3857513666152954, "step": 100 }, { "epoch": 0.20318725099601595, "grad_norm": 4.938348293304443, "learning_rate": 6e-06, "loss": 1.2503310441970825, "step": 102 }, { "epoch": 0.20717131474103587, "grad_norm": 1.3763278722763062, "learning_rate": 5.99998534480079e-06, "loss": 1.316627025604248, "step": 104 }, { "epoch": 0.21115537848605578, "grad_norm": 1.2016820907592773, "learning_rate": 5.9999413793622525e-06, "loss": 1.3336181640625, "step": 106 }, { "epoch": 0.2151394422310757, "grad_norm": 2.7037742137908936, "learning_rate": 5.9998681041616624e-06, "loss": 0.848972737789154, "step": 108 }, { "epoch": 0.21912350597609562, "grad_norm": 2.082820177078247, "learning_rate": 5.999765519994475e-06, "loss": 1.1773113012313843, "step": 110 }, { "epoch": 0.22310756972111553, "grad_norm": 1.349158525466919, "learning_rate": 5.999633627974312e-06, "loss": 1.838499903678894, "step": 112 }, { "epoch": 0.22709163346613545, "grad_norm": 1.0457987785339355, "learning_rate": 5.9994724295329546e-06, "loss": 1.2931954860687256, "step": 114 }, { "epoch": 0.23107569721115537, "grad_norm": 1.0233925580978394, "learning_rate": 5.999281926420326e-06, "loss": 1.3657619953155518, "step": 116 }, { "epoch": 0.2350597609561753, "grad_norm": 1.456226110458374, "learning_rate": 5.999062120704471e-06, "loss": 0.39271149039268494, "step": 118 }, { "epoch": 0.23904382470119523, "grad_norm": 1.1591161489486694, "learning_rate": 5.998813014771534e-06, "loss": 1.283569097518921, "step": 120 }, { "epoch": 0.24302788844621515, "grad_norm": 1.4893031120300293, "learning_rate": 5.998534611325737e-06, "loss": 1.3696374893188477, "step": 122 }, { "epoch": 0.24701195219123506, "grad_norm": 1.0916317701339722, "learning_rate": 5.998226913389344e-06, "loss": 1.2977485656738281, "step": 124 }, { "epoch": 0.250996015936255, "grad_norm": 1.5058797597885132, "learning_rate": 5.997889924302632e-06, "loss": 1.2800962924957275, "step": 126 }, { "epoch": 0.2549800796812749, "grad_norm": 2.89294695854187, "learning_rate": 5.997523647723856e-06, "loss": 0.9177144169807434, "step": 128 }, { "epoch": 0.2589641434262948, "grad_norm": 2.416161060333252, "learning_rate": 5.997128087629205e-06, "loss": 1.280983567237854, "step": 130 }, { "epoch": 0.26294820717131473, "grad_norm": 1.2975496053695679, "learning_rate": 5.996703248312762e-06, "loss": 1.2503688335418701, "step": 132 }, { "epoch": 0.26693227091633465, "grad_norm": 0.9795719385147095, "learning_rate": 5.996249134386455e-06, "loss": 1.2679003477096558, "step": 134 }, { "epoch": 0.27091633466135456, "grad_norm": 1.4742954969406128, "learning_rate": 5.995765750780013e-06, "loss": 0.5531994700431824, "step": 136 }, { "epoch": 0.2749003984063745, "grad_norm": 2.563380241394043, "learning_rate": 5.995253102740903e-06, "loss": 1.901612401008606, "step": 138 }, { "epoch": 0.2788844621513944, "grad_norm": 1.4704535007476807, "learning_rate": 5.994711195834279e-06, "loss": 1.1717365980148315, "step": 140 }, { "epoch": 0.28286852589641437, "grad_norm": 1.1811615228652954, "learning_rate": 5.994140035942923e-06, "loss": 0.7471544742584229, "step": 142 }, { "epoch": 0.2868525896414343, "grad_norm": 1.6094988584518433, "learning_rate": 5.993539629267178e-06, "loss": 0.9018757939338684, "step": 144 }, { "epoch": 0.2908366533864542, "grad_norm": 2.305218458175659, "learning_rate": 5.992909982324879e-06, "loss": 1.277273178100586, "step": 146 }, { "epoch": 0.2948207171314741, "grad_norm": 3.697319746017456, "learning_rate": 5.992251101951287e-06, "loss": 1.0025593042373657, "step": 148 }, { "epoch": 0.29880478087649404, "grad_norm": 1.539844036102295, "learning_rate": 5.991562995299011e-06, "loss": 1.3024755716323853, "step": 150 }, { "epoch": 0.30278884462151395, "grad_norm": 1.0249600410461426, "learning_rate": 5.990845669837933e-06, "loss": 1.5959429740905762, "step": 152 }, { "epoch": 0.30677290836653387, "grad_norm": 0.8561967015266418, "learning_rate": 5.990099133355126e-06, "loss": 1.2801433801651, "step": 154 }, { "epoch": 0.3107569721115538, "grad_norm": 4.086156845092773, "learning_rate": 5.989323393954767e-06, "loss": 0.4956245422363281, "step": 156 }, { "epoch": 0.3147410358565737, "grad_norm": 3.771010398864746, "learning_rate": 5.988518460058054e-06, "loss": 0.4668130576610565, "step": 158 }, { "epoch": 0.3187250996015936, "grad_norm": 1.3703054189682007, "learning_rate": 5.9876843404031096e-06, "loss": 1.2212884426116943, "step": 160 }, { "epoch": 0.32270916334661354, "grad_norm": 1.210668921470642, "learning_rate": 5.986821044044889e-06, "loss": 1.7916109561920166, "step": 162 }, { "epoch": 0.32669322709163345, "grad_norm": 1.0227242708206177, "learning_rate": 5.985928580355082e-06, "loss": 0.8739029765129089, "step": 164 }, { "epoch": 0.33067729083665337, "grad_norm": 2.860746383666992, "learning_rate": 5.985006959022008e-06, "loss": 0.4693869352340698, "step": 166 }, { "epoch": 0.3346613545816733, "grad_norm": 1.755257487297058, "learning_rate": 5.984056190050517e-06, "loss": 1.324602723121643, "step": 168 }, { "epoch": 0.3386454183266932, "grad_norm": 7.148312568664551, "learning_rate": 5.983076283761872e-06, "loss": 1.3821817636489868, "step": 170 }, { "epoch": 0.3426294820717131, "grad_norm": 1.2952216863632202, "learning_rate": 5.982067250793646e-06, "loss": 1.2612062692642212, "step": 172 }, { "epoch": 0.3466135458167331, "grad_norm": 1.727574348449707, "learning_rate": 5.981029102099601e-06, "loss": 1.341408133506775, "step": 174 }, { "epoch": 0.350597609561753, "grad_norm": 2.543426513671875, "learning_rate": 5.979961848949572e-06, "loss": 0.5157387852668762, "step": 176 }, { "epoch": 0.3545816733067729, "grad_norm": 1.489472508430481, "learning_rate": 5.978865502929343e-06, "loss": 1.3691034317016602, "step": 178 }, { "epoch": 0.35856573705179284, "grad_norm": 3.3407742977142334, "learning_rate": 5.977740075940517e-06, "loss": 1.2798420190811157, "step": 180 }, { "epoch": 0.36254980079681276, "grad_norm": 0.7936763763427734, "learning_rate": 5.976585580200399e-06, "loss": 1.2865771055221558, "step": 182 }, { "epoch": 0.3665338645418327, "grad_norm": 1.722764492034912, "learning_rate": 5.9754020282418505e-06, "loss": 0.9274950623512268, "step": 184 }, { "epoch": 0.3705179282868526, "grad_norm": 1.4277971982955933, "learning_rate": 5.974189432913161e-06, "loss": 1.2118057012557983, "step": 186 }, { "epoch": 0.3745019920318725, "grad_norm": 0.7755621671676636, "learning_rate": 5.972947807377905e-06, "loss": 1.262542724609375, "step": 188 }, { "epoch": 0.3784860557768924, "grad_norm": 2.0006139278411865, "learning_rate": 5.971677165114801e-06, "loss": 1.1163339614868164, "step": 190 }, { "epoch": 0.38247011952191234, "grad_norm": 1.9247850179672241, "learning_rate": 5.970377519917563e-06, "loss": 1.0671018362045288, "step": 192 }, { "epoch": 0.38645418326693226, "grad_norm": 1.1371593475341797, "learning_rate": 5.969048885894754e-06, "loss": 1.2458205223083496, "step": 194 }, { "epoch": 0.3904382470119522, "grad_norm": 1.5814062356948853, "learning_rate": 5.967691277469631e-06, "loss": 1.2479208707809448, "step": 196 }, { "epoch": 0.3944223107569721, "grad_norm": 1.3527947664260864, "learning_rate": 5.9663047093799874e-06, "loss": 0.46853581070899963, "step": 198 }, { "epoch": 0.398406374501992, "grad_norm": 0.9908071160316467, "learning_rate": 5.964889196677996e-06, "loss": 1.2344821691513062, "step": 200 }, { "epoch": 0.40239043824701193, "grad_norm": 0.9923727512359619, "learning_rate": 5.9634447547300415e-06, "loss": 1.2732172012329102, "step": 202 }, { "epoch": 0.4063745019920319, "grad_norm": 2.537524700164795, "learning_rate": 5.961971399216556e-06, "loss": 1.234106183052063, "step": 204 }, { "epoch": 0.4103585657370518, "grad_norm": 3.067852735519409, "learning_rate": 5.960469146131851e-06, "loss": 0.38716864585876465, "step": 206 }, { "epoch": 0.41434262948207173, "grad_norm": 0.8039565086364746, "learning_rate": 5.95893801178394e-06, "loss": 1.223067045211792, "step": 208 }, { "epoch": 0.41832669322709165, "grad_norm": 1.5125787258148193, "learning_rate": 5.957378012794361e-06, "loss": 0.698806881904602, "step": 210 }, { "epoch": 0.42231075697211157, "grad_norm": 1.2418526411056519, "learning_rate": 5.955789166098002e-06, "loss": 0.7970227599143982, "step": 212 }, { "epoch": 0.4262948207171315, "grad_norm": 2.7106666564941406, "learning_rate": 5.954171488942911e-06, "loss": 0.8325067758560181, "step": 214 }, { "epoch": 0.4302788844621514, "grad_norm": 3.5096561908721924, "learning_rate": 5.952524998890109e-06, "loss": 1.1556031703948975, "step": 216 }, { "epoch": 0.4342629482071713, "grad_norm": 1.513983130455017, "learning_rate": 5.950849713813405e-06, "loss": 1.263627529144287, "step": 218 }, { "epoch": 0.43824701195219123, "grad_norm": 0.7860940098762512, "learning_rate": 5.949145651899196e-06, "loss": 1.2762495279312134, "step": 220 }, { "epoch": 0.44223107569721115, "grad_norm": 1.6819899082183838, "learning_rate": 5.947412831646271e-06, "loss": 0.5981872081756592, "step": 222 }, { "epoch": 0.44621513944223107, "grad_norm": 1.2630786895751953, "learning_rate": 5.945651271865616e-06, "loss": 1.120012879371643, "step": 224 }, { "epoch": 0.450199203187251, "grad_norm": 0.9950310587882996, "learning_rate": 5.943860991680195e-06, "loss": 1.2754716873168945, "step": 226 }, { "epoch": 0.4541832669322709, "grad_norm": 1.6684496402740479, "learning_rate": 5.942042010524764e-06, "loss": 0.9846575856208801, "step": 228 }, { "epoch": 0.4581673306772908, "grad_norm": 1.4847872257232666, "learning_rate": 5.9401943481456386e-06, "loss": 1.2583152055740356, "step": 230 }, { "epoch": 0.46215139442231074, "grad_norm": 0.9578908681869507, "learning_rate": 5.9383180246004935e-06, "loss": 1.2739794254302979, "step": 232 }, { "epoch": 0.46613545816733065, "grad_norm": 1.1821162700653076, "learning_rate": 5.936413060258143e-06, "loss": 1.4074854850769043, "step": 234 }, { "epoch": 0.4701195219123506, "grad_norm": 0.8178677558898926, "learning_rate": 5.9344794757983115e-06, "loss": 1.2413185834884644, "step": 236 }, { "epoch": 0.47410358565737054, "grad_norm": 2.4166979789733887, "learning_rate": 5.932517292211418e-06, "loss": 1.1744059324264526, "step": 238 }, { "epoch": 0.47808764940239046, "grad_norm": 1.1220707893371582, "learning_rate": 5.930526530798347e-06, "loss": 1.2574900388717651, "step": 240 }, { "epoch": 0.4820717131474104, "grad_norm": 0.7189679741859436, "learning_rate": 5.928507213170211e-06, "loss": 1.2059662342071533, "step": 242 }, { "epoch": 0.4860557768924303, "grad_norm": 1.4799033403396606, "learning_rate": 5.926459361248125e-06, "loss": 0.7257046103477478, "step": 244 }, { "epoch": 0.4900398406374502, "grad_norm": 8.812633514404297, "learning_rate": 5.9243829972629584e-06, "loss": 1.0781515836715698, "step": 246 }, { "epoch": 0.4940239043824701, "grad_norm": 2.5435431003570557, "learning_rate": 5.922278143755105e-06, "loss": 0.9890032410621643, "step": 248 }, { "epoch": 0.49800796812749004, "grad_norm": 1.1066993474960327, "learning_rate": 5.920144823574229e-06, "loss": 1.275596261024475, "step": 250 }, { "epoch": 0.50199203187251, "grad_norm": 3.8385164737701416, "learning_rate": 5.917983059879021e-06, "loss": 0.5777413249015808, "step": 252 }, { "epoch": 0.5059760956175299, "grad_norm": 2.5549728870391846, "learning_rate": 5.915792876136944e-06, "loss": 1.2903834581375122, "step": 254 }, { "epoch": 0.5099601593625498, "grad_norm": 1.1752848625183105, "learning_rate": 5.913574296123985e-06, "loss": 1.2607370615005493, "step": 256 }, { "epoch": 0.5139442231075697, "grad_norm": 3.4985756874084473, "learning_rate": 5.9113273439243885e-06, "loss": 0.6077223420143127, "step": 258 }, { "epoch": 0.5179282868525896, "grad_norm": 0.8346880674362183, "learning_rate": 5.909052043930402e-06, "loss": 1.2486491203308105, "step": 260 }, { "epoch": 0.5219123505976095, "grad_norm": 1.6400198936462402, "learning_rate": 5.9067484208420046e-06, "loss": 0.3859616219997406, "step": 262 }, { "epoch": 0.5258964143426295, "grad_norm": 2.0709147453308105, "learning_rate": 5.904416499666646e-06, "loss": 1.250545620918274, "step": 264 }, { "epoch": 0.5298804780876494, "grad_norm": 3.2738661766052246, "learning_rate": 5.902056305718969e-06, "loss": 0.5132614970207214, "step": 266 }, { "epoch": 0.5338645418326693, "grad_norm": 1.4471163749694824, "learning_rate": 5.89966786462054e-06, "loss": 1.2536060810089111, "step": 268 }, { "epoch": 0.5378486055776892, "grad_norm": 2.023653030395508, "learning_rate": 5.897251202299566e-06, "loss": 1.7837636470794678, "step": 270 }, { "epoch": 0.5418326693227091, "grad_norm": 0.7867792248725891, "learning_rate": 5.894806344990614e-06, "loss": 0.7907792329788208, "step": 272 }, { "epoch": 0.545816733067729, "grad_norm": 0.9616872072219849, "learning_rate": 5.892333319234332e-06, "loss": 1.240364670753479, "step": 274 }, { "epoch": 0.549800796812749, "grad_norm": 1.5364048480987549, "learning_rate": 5.889832151877152e-06, "loss": 0.6271519064903259, "step": 276 }, { "epoch": 0.5537848605577689, "grad_norm": 1.9956889152526855, "learning_rate": 5.887302870071004e-06, "loss": 1.354748010635376, "step": 278 }, { "epoch": 0.5577689243027888, "grad_norm": 3.179105043411255, "learning_rate": 5.88474550127302e-06, "loss": 0.7769224047660828, "step": 280 }, { "epoch": 0.5617529880478087, "grad_norm": 2.1050288677215576, "learning_rate": 5.882160073245238e-06, "loss": 0.7815161347389221, "step": 282 }, { "epoch": 0.5657370517928287, "grad_norm": 1.0835380554199219, "learning_rate": 5.879546614054295e-06, "loss": 1.2420227527618408, "step": 284 }, { "epoch": 0.5697211155378487, "grad_norm": 0.9784935712814331, "learning_rate": 5.876905152071131e-06, "loss": 1.2437528371810913, "step": 286 }, { "epoch": 0.5737051792828686, "grad_norm": 1.059682011604309, "learning_rate": 5.874235715970671e-06, "loss": 1.1747212409973145, "step": 288 }, { "epoch": 0.5776892430278885, "grad_norm": 1.0844000577926636, "learning_rate": 5.87153833473152e-06, "loss": 1.2218478918075562, "step": 290 }, { "epoch": 0.5816733067729084, "grad_norm": 1.2831990718841553, "learning_rate": 5.868813037635649e-06, "loss": 1.1690454483032227, "step": 292 }, { "epoch": 0.5856573705179283, "grad_norm": 2.694718360900879, "learning_rate": 5.866059854268076e-06, "loss": 0.49895596504211426, "step": 294 }, { "epoch": 0.5896414342629482, "grad_norm": 1.1014599800109863, "learning_rate": 5.863278814516539e-06, "loss": 1.4519755840301514, "step": 296 }, { "epoch": 0.5936254980079682, "grad_norm": 6.0046305656433105, "learning_rate": 5.860469948571181e-06, "loss": 0.6872335076332092, "step": 298 }, { "epoch": 0.5976095617529881, "grad_norm": 1.493370771408081, "learning_rate": 5.857633286924219e-06, "loss": 1.241629958152771, "step": 300 }, { "epoch": 0.601593625498008, "grad_norm": 1.3740859031677246, "learning_rate": 5.854768860369607e-06, "loss": 1.0279847383499146, "step": 302 }, { "epoch": 0.6055776892430279, "grad_norm": 4.5894083976745605, "learning_rate": 5.85187670000271e-06, "loss": 0.8594214916229248, "step": 304 }, { "epoch": 0.6095617529880478, "grad_norm": 1.9348714351654053, "learning_rate": 5.848956837219964e-06, "loss": 1.1640937328338623, "step": 306 }, { "epoch": 0.6135458167330677, "grad_norm": 3.6650631427764893, "learning_rate": 5.846009303718529e-06, "loss": 1.083706259727478, "step": 308 }, { "epoch": 0.6175298804780877, "grad_norm": 0.8985078930854797, "learning_rate": 5.8430341314959565e-06, "loss": 1.2840549945831299, "step": 310 }, { "epoch": 0.6215139442231076, "grad_norm": 3.3366034030914307, "learning_rate": 5.840031352849833e-06, "loss": 0.6729341149330139, "step": 312 }, { "epoch": 0.6254980079681275, "grad_norm": 0.5400150418281555, "learning_rate": 5.83700100037743e-06, "loss": 0.9031069874763489, "step": 314 }, { "epoch": 0.6294820717131474, "grad_norm": 0.8818338513374329, "learning_rate": 5.833943106975355e-06, "loss": 1.403872013092041, "step": 316 }, { "epoch": 0.6334661354581673, "grad_norm": 0.9534677267074585, "learning_rate": 5.830857705839191e-06, "loss": 0.7257641553878784, "step": 318 }, { "epoch": 0.6374501992031872, "grad_norm": 1.2703937292099, "learning_rate": 5.8277448304631385e-06, "loss": 1.2789297103881836, "step": 320 }, { "epoch": 0.6414342629482072, "grad_norm": 2.5597033500671387, "learning_rate": 5.824604514639647e-06, "loss": 0.5666279792785645, "step": 322 }, { "epoch": 0.6454183266932271, "grad_norm": 1.932152509689331, "learning_rate": 5.8214367924590515e-06, "loss": 0.9416989088058472, "step": 324 }, { "epoch": 0.649402390438247, "grad_norm": 2.5085222721099854, "learning_rate": 5.818241698309205e-06, "loss": 0.9871986508369446, "step": 326 }, { "epoch": 0.6533864541832669, "grad_norm": 0.8283513784408569, "learning_rate": 5.8150192668751015e-06, "loss": 1.2529672384262085, "step": 328 }, { "epoch": 0.6573705179282868, "grad_norm": 7.669778347015381, "learning_rate": 5.811769533138499e-06, "loss": 0.46496719121932983, "step": 330 }, { "epoch": 0.6613545816733067, "grad_norm": 3.1111960411071777, "learning_rate": 5.808492532377542e-06, "loss": 1.1308894157409668, "step": 332 }, { "epoch": 0.6653386454183267, "grad_norm": 1.0599477291107178, "learning_rate": 5.805188300166379e-06, "loss": 1.1927093267440796, "step": 334 }, { "epoch": 0.6693227091633466, "grad_norm": 0.7919442653656006, "learning_rate": 5.801856872374772e-06, "loss": 1.2229902744293213, "step": 336 }, { "epoch": 0.6733067729083665, "grad_norm": 0.874751627445221, "learning_rate": 5.798498285167714e-06, "loss": 1.239054560661316, "step": 338 }, { "epoch": 0.6772908366533864, "grad_norm": 3.267413854598999, "learning_rate": 5.795112575005031e-06, "loss": 0.5422060489654541, "step": 340 }, { "epoch": 0.6812749003984063, "grad_norm": 0.603284478187561, "learning_rate": 5.791699778640985e-06, "loss": 0.5057201385498047, "step": 342 }, { "epoch": 0.6852589641434262, "grad_norm": 1.073237419128418, "learning_rate": 5.788259933123882e-06, "loss": 1.212401270866394, "step": 344 }, { "epoch": 0.6892430278884463, "grad_norm": 0.9039257168769836, "learning_rate": 5.7847930757956626e-06, "loss": 1.2373487949371338, "step": 346 }, { "epoch": 0.6932270916334662, "grad_norm": 0.6864405870437622, "learning_rate": 5.7812992442915016e-06, "loss": 1.1827311515808105, "step": 348 }, { "epoch": 0.6972111553784861, "grad_norm": 1.7330577373504639, "learning_rate": 5.777778476539397e-06, "loss": 0.7856748104095459, "step": 350 }, { "epoch": 0.701195219123506, "grad_norm": 4.816940784454346, "learning_rate": 5.774230810759756e-06, "loss": 0.7216228246688843, "step": 352 }, { "epoch": 0.7051792828685259, "grad_norm": 2.1332626342773438, "learning_rate": 5.7706562854649866e-06, "loss": 0.49049532413482666, "step": 354 }, { "epoch": 0.7091633466135459, "grad_norm": 2.8059940338134766, "learning_rate": 5.767054939459075e-06, "loss": 1.3019351959228516, "step": 356 }, { "epoch": 0.7131474103585658, "grad_norm": 4.427498817443848, "learning_rate": 5.763426811837164e-06, "loss": 0.48208871483802795, "step": 358 }, { "epoch": 0.7171314741035857, "grad_norm": 4.743298530578613, "learning_rate": 5.759771941985128e-06, "loss": 1.6483818292617798, "step": 360 }, { "epoch": 0.7211155378486056, "grad_norm": 0.8030229210853577, "learning_rate": 5.75609036957915e-06, "loss": 0.7936917543411255, "step": 362 }, { "epoch": 0.7250996015936255, "grad_norm": 4.138736248016357, "learning_rate": 5.752382134585289e-06, "loss": 0.19702184200286865, "step": 364 }, { "epoch": 0.7290836653386454, "grad_norm": 0.7204448580741882, "learning_rate": 5.748647277259041e-06, "loss": 1.3097480535507202, "step": 366 }, { "epoch": 0.7330677290836654, "grad_norm": 0.6811744570732117, "learning_rate": 5.744885838144908e-06, "loss": 1.282241702079773, "step": 368 }, { "epoch": 0.7370517928286853, "grad_norm": 1.3216296434402466, "learning_rate": 5.741097858075958e-06, "loss": 1.1899917125701904, "step": 370 }, { "epoch": 0.7410358565737052, "grad_norm": 0.7291891574859619, "learning_rate": 5.737283378173377e-06, "loss": 1.289171576499939, "step": 372 }, { "epoch": 0.7450199203187251, "grad_norm": 1.4926878213882446, "learning_rate": 5.733442439846028e-06, "loss": 0.9133517742156982, "step": 374 }, { "epoch": 0.749003984063745, "grad_norm": 1.1999213695526123, "learning_rate": 5.729575084789995e-06, "loss": 1.2485815286636353, "step": 376 }, { "epoch": 0.7529880478087649, "grad_norm": 0.4571026563644409, "learning_rate": 5.725681354988137e-06, "loss": 0.41173255443573, "step": 378 }, { "epoch": 0.7569721115537849, "grad_norm": 0.9662789106369019, "learning_rate": 5.72176129270963e-06, "loss": 1.3222002983093262, "step": 380 }, { "epoch": 0.7609561752988048, "grad_norm": 0.8864423036575317, "learning_rate": 5.717814940509503e-06, "loss": 1.2533366680145264, "step": 382 }, { "epoch": 0.7649402390438247, "grad_norm": 1.8013001680374146, "learning_rate": 5.713842341228187e-06, "loss": 1.132637858390808, "step": 384 }, { "epoch": 0.7689243027888446, "grad_norm": 1.4815607070922852, "learning_rate": 5.70984353799104e-06, "loss": 0.28086692094802856, "step": 386 }, { "epoch": 0.7729083665338645, "grad_norm": 0.8467429280281067, "learning_rate": 5.705818574207883e-06, "loss": 1.4608538150787354, "step": 388 }, { "epoch": 0.7768924302788844, "grad_norm": 2.4864161014556885, "learning_rate": 5.701767493572526e-06, "loss": 0.7464155554771423, "step": 390 }, { "epoch": 0.7808764940239044, "grad_norm": 2.4926576614379883, "learning_rate": 5.6976903400623e-06, "loss": 0.5242215991020203, "step": 392 }, { "epoch": 0.7848605577689243, "grad_norm": 3.3884170055389404, "learning_rate": 5.693587157937572e-06, "loss": 0.7744420766830444, "step": 394 }, { "epoch": 0.7888446215139442, "grad_norm": 1.3466330766677856, "learning_rate": 5.689457991741267e-06, "loss": 0.8062616586685181, "step": 396 }, { "epoch": 0.7928286852589641, "grad_norm": 0.8415664434432983, "learning_rate": 5.685302886298392e-06, "loss": 0.9788842797279358, "step": 398 }, { "epoch": 0.796812749003984, "grad_norm": 1.0375547409057617, "learning_rate": 5.681121886715534e-06, "loss": 1.068263053894043, "step": 400 }, { "epoch": 0.8007968127490039, "grad_norm": 1.184495210647583, "learning_rate": 5.676915038380384e-06, "loss": 0.7641897797584534, "step": 402 }, { "epoch": 0.8047808764940239, "grad_norm": 0.5623915195465088, "learning_rate": 5.67268238696124e-06, "loss": 1.194584846496582, "step": 404 }, { "epoch": 0.8087649402390438, "grad_norm": 1.6544809341430664, "learning_rate": 5.668423978406509e-06, "loss": 1.8557928800582886, "step": 406 }, { "epoch": 0.8127490039840638, "grad_norm": 0.9776933193206787, "learning_rate": 5.664139858944209e-06, "loss": 1.157083511352539, "step": 408 }, { "epoch": 0.8167330677290837, "grad_norm": 0.9368433356285095, "learning_rate": 5.65983007508147e-06, "loss": 1.1894208192825317, "step": 410 }, { "epoch": 0.8207171314741036, "grad_norm": 1.024929165840149, "learning_rate": 5.655494673604024e-06, "loss": 1.2211333513259888, "step": 412 }, { "epoch": 0.8247011952191236, "grad_norm": 0.9331441521644592, "learning_rate": 5.651133701575706e-06, "loss": 0.9813644289970398, "step": 414 }, { "epoch": 0.8286852589641435, "grad_norm": 0.43455296754837036, "learning_rate": 5.64674720633793e-06, "loss": 0.2262841910123825, "step": 416 }, { "epoch": 0.8326693227091634, "grad_norm": 0.9842036366462708, "learning_rate": 5.642335235509189e-06, "loss": 1.2737834453582764, "step": 418 }, { "epoch": 0.8366533864541833, "grad_norm": 1.0286755561828613, "learning_rate": 5.637897836984526e-06, "loss": 1.2228126525878906, "step": 420 }, { "epoch": 0.8406374501992032, "grad_norm": 0.8756253123283386, "learning_rate": 5.633435058935023e-06, "loss": 1.1928170919418335, "step": 422 }, { "epoch": 0.8446215139442231, "grad_norm": 0.758901834487915, "learning_rate": 5.628946949807274e-06, "loss": 1.1966356039047241, "step": 424 }, { "epoch": 0.848605577689243, "grad_norm": 2.6789400577545166, "learning_rate": 5.624433558322859e-06, "loss": 0.7115716338157654, "step": 426 }, { "epoch": 0.852589641434263, "grad_norm": 1.1329255104064941, "learning_rate": 5.619894933477816e-06, "loss": 1.2351547479629517, "step": 428 }, { "epoch": 0.8565737051792829, "grad_norm": 0.8669703602790833, "learning_rate": 5.615331124542109e-06, "loss": 1.0460853576660156, "step": 430 }, { "epoch": 0.8605577689243028, "grad_norm": 1.4718725681304932, "learning_rate": 5.610742181059092e-06, "loss": 1.8136500120162964, "step": 432 }, { "epoch": 0.8645418326693227, "grad_norm": 1.955024003982544, "learning_rate": 5.606128152844975e-06, "loss": 1.2090433835983276, "step": 434 }, { "epoch": 0.8685258964143426, "grad_norm": 2.959174156188965, "learning_rate": 5.601489089988277e-06, "loss": 0.4959055483341217, "step": 436 }, { "epoch": 0.8725099601593626, "grad_norm": 0.8022291660308838, "learning_rate": 5.596825042849287e-06, "loss": 1.2489244937896729, "step": 438 }, { "epoch": 0.8764940239043825, "grad_norm": 0.867755651473999, "learning_rate": 5.592136062059517e-06, "loss": 1.187935709953308, "step": 440 }, { "epoch": 0.8804780876494024, "grad_norm": 2.0213284492492676, "learning_rate": 5.587422198521149e-06, "loss": 1.6624571084976196, "step": 442 }, { "epoch": 0.8844621513944223, "grad_norm": 1.8472967147827148, "learning_rate": 5.582683503406488e-06, "loss": 1.3048073053359985, "step": 444 }, { "epoch": 0.8884462151394422, "grad_norm": 0.8281286954879761, "learning_rate": 5.5779200281574e-06, "loss": 1.043340802192688, "step": 446 }, { "epoch": 0.8924302788844621, "grad_norm": 1.8063609600067139, "learning_rate": 5.573131824484758e-06, "loss": 0.371786892414093, "step": 448 }, { "epoch": 0.896414342629482, "grad_norm": 0.8337019681930542, "learning_rate": 5.56831894436788e-06, "loss": 1.1593928337097168, "step": 450 }, { "epoch": 0.900398406374502, "grad_norm": 0.808246374130249, "learning_rate": 5.563481440053964e-06, "loss": 0.8130660057067871, "step": 452 }, { "epoch": 0.9043824701195219, "grad_norm": 0.7648867964744568, "learning_rate": 5.55861936405752e-06, "loss": 1.2445188760757446, "step": 454 }, { "epoch": 0.9083665338645418, "grad_norm": 4.679040431976318, "learning_rate": 5.5537327691598026e-06, "loss": 0.9090757966041565, "step": 456 }, { "epoch": 0.9123505976095617, "grad_norm": 0.8703306317329407, "learning_rate": 5.548821708408234e-06, "loss": 1.2912606000900269, "step": 458 }, { "epoch": 0.9163346613545816, "grad_norm": 3.33894681930542, "learning_rate": 5.543886235115832e-06, "loss": 1.0427659749984741, "step": 460 }, { "epoch": 0.9203187250996016, "grad_norm": 1.598880410194397, "learning_rate": 5.538926402860631e-06, "loss": 1.2816940546035767, "step": 462 }, { "epoch": 0.9243027888446215, "grad_norm": 1.35460364818573, "learning_rate": 5.533942265485095e-06, "loss": 1.3399840593338013, "step": 464 }, { "epoch": 0.9282868525896414, "grad_norm": 7.064363956451416, "learning_rate": 5.528933877095541e-06, "loss": 0.40876510739326477, "step": 466 }, { "epoch": 0.9322709163346613, "grad_norm": 0.7858706712722778, "learning_rate": 5.523901292061547e-06, "loss": 1.1805975437164307, "step": 468 }, { "epoch": 0.9362549800796812, "grad_norm": 8.24327278137207, "learning_rate": 5.518844565015361e-06, "loss": 0.38794469833374023, "step": 470 }, { "epoch": 0.9402390438247012, "grad_norm": 0.7928199768066406, "learning_rate": 5.51376375085131e-06, "loss": 1.2316607236862183, "step": 472 }, { "epoch": 0.9442231075697212, "grad_norm": 4.031145095825195, "learning_rate": 5.508658904725206e-06, "loss": 0.5695405602455139, "step": 474 }, { "epoch": 0.9482071713147411, "grad_norm": 2.9237377643585205, "learning_rate": 5.503530082053741e-06, "loss": 0.338968962430954, "step": 476 }, { "epoch": 0.952191235059761, "grad_norm": 0.8833221793174744, "learning_rate": 5.498377338513894e-06, "loss": 1.2102028131484985, "step": 478 }, { "epoch": 0.9561752988047809, "grad_norm": 25.611223220825195, "learning_rate": 5.493200730042317e-06, "loss": 0.4739567041397095, "step": 480 }, { "epoch": 0.9601593625498008, "grad_norm": 5.376172065734863, "learning_rate": 5.488000312834735e-06, "loss": 0.9883483648300171, "step": 482 }, { "epoch": 0.9641434262948207, "grad_norm": 1.7662686109542847, "learning_rate": 5.482776143345333e-06, "loss": 1.2430894374847412, "step": 484 }, { "epoch": 0.9681274900398407, "grad_norm": 2.5627293586730957, "learning_rate": 5.477528278286145e-06, "loss": 1.2240179777145386, "step": 486 }, { "epoch": 0.9721115537848606, "grad_norm": 0.8417234420776367, "learning_rate": 5.472256774626435e-06, "loss": 1.1680150032043457, "step": 488 }, { "epoch": 0.9760956175298805, "grad_norm": 0.8709147572517395, "learning_rate": 5.4669616895920826e-06, "loss": 1.2006162405014038, "step": 490 }, { "epoch": 0.9800796812749004, "grad_norm": 5.11852502822876, "learning_rate": 5.46164308066496e-06, "loss": 0.7005679607391357, "step": 492 }, { "epoch": 0.9840637450199203, "grad_norm": 2.7665576934814453, "learning_rate": 5.456301005582304e-06, "loss": 0.7001307606697083, "step": 494 }, { "epoch": 0.9880478087649402, "grad_norm": 0.8219811320304871, "learning_rate": 5.4509355223360956e-06, "loss": 1.254296898841858, "step": 496 }, { "epoch": 0.9920318725099602, "grad_norm": 1.0245788097381592, "learning_rate": 5.445546689172432e-06, "loss": 1.267047643661499, "step": 498 }, { "epoch": 0.9960159362549801, "grad_norm": 1.1505917310714722, "learning_rate": 5.440134564590883e-06, "loss": 0.7141546010971069, "step": 500 }, { "epoch": 1.0, "grad_norm": 6.24027681350708, "learning_rate": 5.434699207343867e-06, "loss": 1.0391122102737427, "step": 502 }, { "epoch": 1.00398406374502, "grad_norm": 1.2134792804718018, "learning_rate": 5.429240676436008e-06, "loss": 0.7802969217300415, "step": 504 }, { "epoch": 1.0079681274900398, "grad_norm": 1.5164703130722046, "learning_rate": 5.423759031123498e-06, "loss": 0.31817543506622314, "step": 506 }, { "epoch": 1.0119521912350598, "grad_norm": 0.6141365170478821, "learning_rate": 5.41825433091345e-06, "loss": 1.0097558498382568, "step": 508 }, { "epoch": 1.0159362549800797, "grad_norm": 0.8733232021331787, "learning_rate": 5.4127266355632575e-06, "loss": 1.0352897644042969, "step": 510 }, { "epoch": 1.0199203187250996, "grad_norm": 2.5583245754241943, "learning_rate": 5.407176005079938e-06, "loss": 1.0885701179504395, "step": 512 }, { "epoch": 1.0239043824701195, "grad_norm": 1.0007575750350952, "learning_rate": 5.401602499719488e-06, "loss": 1.0486167669296265, "step": 514 }, { "epoch": 1.0278884462151394, "grad_norm": 1.1661553382873535, "learning_rate": 5.396006179986228e-06, "loss": 1.0347387790679932, "step": 516 }, { "epoch": 1.0318725099601593, "grad_norm": 0.8863986134529114, "learning_rate": 5.390387106632143e-06, "loss": 1.0672526359558105, "step": 518 }, { "epoch": 1.0358565737051793, "grad_norm": 2.13053035736084, "learning_rate": 5.384745340656227e-06, "loss": 0.8640899062156677, "step": 520 }, { "epoch": 1.0398406374501992, "grad_norm": 2.6343281269073486, "learning_rate": 5.379080943303814e-06, "loss": 0.943762481212616, "step": 522 }, { "epoch": 1.043824701195219, "grad_norm": 1.45510733127594, "learning_rate": 5.373393976065921e-06, "loss": 0.9649692177772522, "step": 524 }, { "epoch": 1.047808764940239, "grad_norm": 1.4119848012924194, "learning_rate": 5.367684500678576e-06, "loss": 1.1445621252059937, "step": 526 }, { "epoch": 1.051792828685259, "grad_norm": 1.0543644428253174, "learning_rate": 5.361952579122149e-06, "loss": 0.9114750027656555, "step": 528 }, { "epoch": 1.0557768924302788, "grad_norm": 1.5039920806884766, "learning_rate": 5.356198273620678e-06, "loss": 0.8998257517814636, "step": 530 }, { "epoch": 1.0597609561752988, "grad_norm": 2.6351239681243896, "learning_rate": 5.350421646641195e-06, "loss": 0.3897404074668884, "step": 532 }, { "epoch": 1.0637450199203187, "grad_norm": 1.1779015064239502, "learning_rate": 5.344622760893049e-06, "loss": 1.2084486484527588, "step": 534 }, { "epoch": 1.0677290836653386, "grad_norm": 0.50465989112854, "learning_rate": 5.338801679327221e-06, "loss": 0.48134946823120117, "step": 536 }, { "epoch": 1.0717131474103585, "grad_norm": 6.834875106811523, "learning_rate": 5.332958465135645e-06, "loss": 0.8534721732139587, "step": 538 }, { "epoch": 1.0756972111553784, "grad_norm": 0.8775362372398376, "learning_rate": 5.327093181750519e-06, "loss": 0.1745588630437851, "step": 540 }, { "epoch": 1.0796812749003983, "grad_norm": 0.8401792049407959, "learning_rate": 5.3212058928436175e-06, "loss": 1.0862375497817993, "step": 542 }, { "epoch": 1.0836653386454183, "grad_norm": 1.2075270414352417, "learning_rate": 5.3152966623256026e-06, "loss": 1.2837507724761963, "step": 544 }, { "epoch": 1.0876494023904382, "grad_norm": 3.44868803024292, "learning_rate": 5.309365554345325e-06, "loss": 0.4348865747451782, "step": 546 }, { "epoch": 1.091633466135458, "grad_norm": 1.060323715209961, "learning_rate": 5.303412633289133e-06, "loss": 0.7609821557998657, "step": 548 }, { "epoch": 1.095617529880478, "grad_norm": 0.48030683398246765, "learning_rate": 5.297437963780171e-06, "loss": 0.5199949741363525, "step": 550 }, { "epoch": 1.099601593625498, "grad_norm": 0.8254769444465637, "learning_rate": 5.2914416106776745e-06, "loss": 1.0883558988571167, "step": 552 }, { "epoch": 1.1035856573705178, "grad_norm": 2.637892246246338, "learning_rate": 5.2854236390762755e-06, "loss": 0.48916831612586975, "step": 554 }, { "epoch": 1.1075697211155378, "grad_norm": 1.684272050857544, "learning_rate": 5.2793841143052855e-06, "loss": 1.0254663228988647, "step": 556 }, { "epoch": 1.1115537848605577, "grad_norm": 2.17739200592041, "learning_rate": 5.273323101927994e-06, "loss": 0.9679847359657288, "step": 558 }, { "epoch": 1.1155378486055776, "grad_norm": 5.525514125823975, "learning_rate": 5.26724066774095e-06, "loss": 0.9007784128189087, "step": 560 }, { "epoch": 1.1195219123505975, "grad_norm": 1.1246291399002075, "learning_rate": 5.261136877773254e-06, "loss": 1.0599032640457153, "step": 562 }, { "epoch": 1.1235059760956174, "grad_norm": 1.811063289642334, "learning_rate": 5.255011798285838e-06, "loss": 1.053318738937378, "step": 564 }, { "epoch": 1.1274900398406373, "grad_norm": 1.0067085027694702, "learning_rate": 5.248865495770747e-06, "loss": 1.0161441564559937, "step": 566 }, { "epoch": 1.1314741035856573, "grad_norm": 1.653944730758667, "learning_rate": 5.242698036950416e-06, "loss": 1.211927890777588, "step": 568 }, { "epoch": 1.1354581673306772, "grad_norm": 5.520211219787598, "learning_rate": 5.236509488776946e-06, "loss": 0.2512112259864807, "step": 570 }, { "epoch": 1.139442231075697, "grad_norm": 0.6854221224784851, "learning_rate": 5.230299918431381e-06, "loss": 0.20837584137916565, "step": 572 }, { "epoch": 1.1434262948207172, "grad_norm": 1.0965662002563477, "learning_rate": 5.224069393322971e-06, "loss": 0.8550689220428467, "step": 574 }, { "epoch": 1.1474103585657371, "grad_norm": 0.5142279863357544, "learning_rate": 5.2178179810884465e-06, "loss": 0.5071516633033752, "step": 576 }, { "epoch": 1.151394422310757, "grad_norm": 1.3928073644638062, "learning_rate": 5.211545749591285e-06, "loss": 1.1629210710525513, "step": 578 }, { "epoch": 1.155378486055777, "grad_norm": 4.516799449920654, "learning_rate": 5.205252766920967e-06, "loss": 0.615897536277771, "step": 580 }, { "epoch": 1.159362549800797, "grad_norm": 1.9076368808746338, "learning_rate": 5.198939101392247e-06, "loss": 0.6484902501106262, "step": 582 }, { "epoch": 1.1633466135458168, "grad_norm": 2.9412710666656494, "learning_rate": 5.192604821544402e-06, "loss": 0.22438056766986847, "step": 584 }, { "epoch": 1.1673306772908367, "grad_norm": 0.8736124038696289, "learning_rate": 5.186249996140492e-06, "loss": 1.1574631929397583, "step": 586 }, { "epoch": 1.1713147410358566, "grad_norm": 1.56623375415802, "learning_rate": 5.179874694166617e-06, "loss": 1.0566999912261963, "step": 588 }, { "epoch": 1.1752988047808766, "grad_norm": 3.406691551208496, "learning_rate": 5.1734789848311635e-06, "loss": 1.28257417678833, "step": 590 }, { "epoch": 1.1792828685258965, "grad_norm": 1.163465976715088, "learning_rate": 5.16706293756405e-06, "loss": 1.0826280117034912, "step": 592 }, { "epoch": 1.1832669322709164, "grad_norm": 3.0535504817962646, "learning_rate": 5.160626622015983e-06, "loss": 1.4529417753219604, "step": 594 }, { "epoch": 1.1872509960159363, "grad_norm": 0.8099126815795898, "learning_rate": 5.154170108057693e-06, "loss": 1.1337939500808716, "step": 596 }, { "epoch": 1.1912350597609562, "grad_norm": 3.8160228729248047, "learning_rate": 5.147693465779179e-06, "loss": 0.3046616017818451, "step": 598 }, { "epoch": 1.1952191235059761, "grad_norm": 1.2103179693222046, "learning_rate": 5.141196765488946e-06, "loss": 0.8739789724349976, "step": 600 }, { "epoch": 1.199203187250996, "grad_norm": 3.3165013790130615, "learning_rate": 5.134680077713244e-06, "loss": 0.5771604776382446, "step": 602 }, { "epoch": 1.203187250996016, "grad_norm": 1.3412213325500488, "learning_rate": 5.1281434731953e-06, "loss": 1.1980223655700684, "step": 604 }, { "epoch": 1.207171314741036, "grad_norm": 14.288922309875488, "learning_rate": 5.121587022894554e-06, "loss": 0.4752068817615509, "step": 606 }, { "epoch": 1.2111553784860558, "grad_norm": 0.9397494196891785, "learning_rate": 5.115010797985882e-06, "loss": 0.5870952010154724, "step": 608 }, { "epoch": 1.2151394422310757, "grad_norm": 0.735195517539978, "learning_rate": 5.108414869858831e-06, "loss": 1.0899227857589722, "step": 610 }, { "epoch": 1.2191235059760956, "grad_norm": 0.9480123519897461, "learning_rate": 5.1017993101168374e-06, "loss": 1.1740434169769287, "step": 612 }, { "epoch": 1.2231075697211156, "grad_norm": 1.5338431596755981, "learning_rate": 5.095164190576452e-06, "loss": 1.4396584033966064, "step": 614 }, { "epoch": 1.2270916334661355, "grad_norm": 11.36307144165039, "learning_rate": 5.0885095832665666e-06, "loss": 0.3999689817428589, "step": 616 }, { "epoch": 1.2310756972111554, "grad_norm": 1.546046495437622, "learning_rate": 5.081835560427619e-06, "loss": 0.9995384812355042, "step": 618 }, { "epoch": 1.2350597609561753, "grad_norm": 1.254744291305542, "learning_rate": 5.075142194510823e-06, "loss": 1.0542714595794678, "step": 620 }, { "epoch": 1.2390438247011952, "grad_norm": 2.047104597091675, "learning_rate": 5.068429558177369e-06, "loss": 0.9798321723937988, "step": 622 }, { "epoch": 1.2430278884462151, "grad_norm": 1.0986047983169556, "learning_rate": 5.061697724297646e-06, "loss": 1.068199872970581, "step": 624 }, { "epoch": 1.247011952191235, "grad_norm": 1.8080114126205444, "learning_rate": 5.054946765950443e-06, "loss": 0.9513214230537415, "step": 626 }, { "epoch": 1.250996015936255, "grad_norm": 1.3059947490692139, "learning_rate": 5.048176756422159e-06, "loss": 0.7849744558334351, "step": 628 }, { "epoch": 1.254980079681275, "grad_norm": 0.7330244779586792, "learning_rate": 5.041387769206009e-06, "loss": 1.0498535633087158, "step": 630 }, { "epoch": 1.2589641434262948, "grad_norm": 5.962719440460205, "learning_rate": 5.034579878001222e-06, "loss": 0.2894093096256256, "step": 632 }, { "epoch": 1.2629482071713147, "grad_norm": 4.925858974456787, "learning_rate": 5.027753156712246e-06, "loss": 0.36715632677078247, "step": 634 }, { "epoch": 1.2669322709163346, "grad_norm": 3.4104573726654053, "learning_rate": 5.020907679447936e-06, "loss": 0.844882071018219, "step": 636 }, { "epoch": 1.2709163346613546, "grad_norm": 1.9961673021316528, "learning_rate": 5.0140435205207636e-06, "loss": 0.8165204524993896, "step": 638 }, { "epoch": 1.2749003984063745, "grad_norm": 2.4332053661346436, "learning_rate": 5.007160754446002e-06, "loss": 0.3054620623588562, "step": 640 }, { "epoch": 1.2788844621513944, "grad_norm": 0.6446577906608582, "learning_rate": 5.000259455940913e-06, "loss": 0.9809127449989319, "step": 642 }, { "epoch": 1.2828685258964143, "grad_norm": 1.2125827074050903, "learning_rate": 4.9933396999239455e-06, "loss": 0.7705118060112, "step": 644 }, { "epoch": 1.2868525896414342, "grad_norm": 0.7487397193908691, "learning_rate": 4.986401561513917e-06, "loss": 1.0824811458587646, "step": 646 }, { "epoch": 1.2908366533864541, "grad_norm": 1.9600952863693237, "learning_rate": 4.979445116029199e-06, "loss": 0.6253088116645813, "step": 648 }, { "epoch": 1.294820717131474, "grad_norm": 1.7079068422317505, "learning_rate": 4.972470438986896e-06, "loss": 1.5013655424118042, "step": 650 }, { "epoch": 1.298804780876494, "grad_norm": 1.1496132612228394, "learning_rate": 4.965477606102033e-06, "loss": 0.8948485255241394, "step": 652 }, { "epoch": 1.302788844621514, "grad_norm": 1.8034613132476807, "learning_rate": 4.9584666932867285e-06, "loss": 0.24509888887405396, "step": 654 }, { "epoch": 1.3067729083665338, "grad_norm": 0.6996963620185852, "learning_rate": 4.951437776649368e-06, "loss": 1.0769448280334473, "step": 656 }, { "epoch": 1.3107569721115537, "grad_norm": 0.571880578994751, "learning_rate": 4.944390932493787e-06, "loss": 0.8138774633407593, "step": 658 }, { "epoch": 1.3147410358565736, "grad_norm": 0.9483959674835205, "learning_rate": 4.937326237318431e-06, "loss": 0.6459387540817261, "step": 660 }, { "epoch": 1.3187250996015936, "grad_norm": 0.9495901465415955, "learning_rate": 4.930243767815534e-06, "loss": 1.1829910278320312, "step": 662 }, { "epoch": 1.3227091633466135, "grad_norm": 1.2907254695892334, "learning_rate": 4.923143600870284e-06, "loss": 0.5661064386367798, "step": 664 }, { "epoch": 1.3266932270916334, "grad_norm": 1.5633907318115234, "learning_rate": 4.916025813559983e-06, "loss": 0.8189319372177124, "step": 666 }, { "epoch": 1.3306772908366533, "grad_norm": 1.9113082885742188, "learning_rate": 4.908890483153218e-06, "loss": 0.38532766699790955, "step": 668 }, { "epoch": 1.3346613545816732, "grad_norm": 0.9342731237411499, "learning_rate": 4.901737687109019e-06, "loss": 1.0321613550186157, "step": 670 }, { "epoch": 1.3386454183266931, "grad_norm": 3.1048390865325928, "learning_rate": 4.894567503076014e-06, "loss": 0.5770927667617798, "step": 672 }, { "epoch": 1.342629482071713, "grad_norm": 0.820324182510376, "learning_rate": 4.887380008891593e-06, "loss": 1.0886192321777344, "step": 674 }, { "epoch": 1.3466135458167332, "grad_norm": 1.3751561641693115, "learning_rate": 4.880175282581059e-06, "loss": 0.97751384973526, "step": 676 }, { "epoch": 1.3505976095617531, "grad_norm": 0.7426400184631348, "learning_rate": 4.872953402356782e-06, "loss": 1.076625943183899, "step": 678 }, { "epoch": 1.354581673306773, "grad_norm": 1.1565395593643188, "learning_rate": 4.86571444661735e-06, "loss": 1.0121248960494995, "step": 680 }, { "epoch": 1.358565737051793, "grad_norm": 0.7444704174995422, "learning_rate": 4.858458493946716e-06, "loss": 1.0811046361923218, "step": 682 }, { "epoch": 1.3625498007968129, "grad_norm": 1.0144495964050293, "learning_rate": 4.851185623113349e-06, "loss": 1.1279915571212769, "step": 684 }, { "epoch": 1.3665338645418328, "grad_norm": 0.7559702396392822, "learning_rate": 4.843895913069377e-06, "loss": 1.0942429304122925, "step": 686 }, { "epoch": 1.3705179282868527, "grad_norm": 0.8456003069877625, "learning_rate": 4.836589442949727e-06, "loss": 1.0091909170150757, "step": 688 }, { "epoch": 1.3745019920318726, "grad_norm": 0.7402591705322266, "learning_rate": 4.829266292071268e-06, "loss": 0.9695682525634766, "step": 690 }, { "epoch": 1.3784860557768925, "grad_norm": 1.815006136894226, "learning_rate": 4.821926539931952e-06, "loss": 0.3355652689933777, "step": 692 }, { "epoch": 1.3824701195219125, "grad_norm": 1.0571285486221313, "learning_rate": 4.814570266209952e-06, "loss": 1.1081352233886719, "step": 694 }, { "epoch": 1.3864541832669324, "grad_norm": 1.3027758598327637, "learning_rate": 4.80719755076279e-06, "loss": 1.0507612228393555, "step": 696 }, { "epoch": 1.3904382470119523, "grad_norm": 0.9322640299797058, "learning_rate": 4.799808473626476e-06, "loss": 1.1305720806121826, "step": 698 }, { "epoch": 1.3944223107569722, "grad_norm": 1.1364309787750244, "learning_rate": 4.792403115014637e-06, "loss": 0.1400398164987564, "step": 700 }, { "epoch": 1.3984063745019921, "grad_norm": 1.2325326204299927, "learning_rate": 4.7849815553176476e-06, "loss": 1.1220163106918335, "step": 702 }, { "epoch": 1.402390438247012, "grad_norm": 1.0282156467437744, "learning_rate": 4.777543875101757e-06, "loss": 1.0591614246368408, "step": 704 }, { "epoch": 1.406374501992032, "grad_norm": 0.7515193223953247, "learning_rate": 4.770090155108215e-06, "loss": 1.1357749700546265, "step": 706 }, { "epoch": 1.4103585657370519, "grad_norm": 1.05164635181427, "learning_rate": 4.7626204762523905e-06, "loss": 0.9992522597312927, "step": 708 }, { "epoch": 1.4143426294820718, "grad_norm": 0.7848185896873474, "learning_rate": 4.755134919622901e-06, "loss": 1.0771911144256592, "step": 710 }, { "epoch": 1.4183266932270917, "grad_norm": 2.0036990642547607, "learning_rate": 4.747633566480726e-06, "loss": 0.6499975323677063, "step": 712 }, { "epoch": 1.4223107569721116, "grad_norm": 1.088212251663208, "learning_rate": 4.740116498258328e-06, "loss": 1.0736567974090576, "step": 714 }, { "epoch": 1.4262948207171315, "grad_norm": 1.0202051401138306, "learning_rate": 4.73258379655877e-06, "loss": 1.1317867040634155, "step": 716 }, { "epoch": 1.4302788844621515, "grad_norm": 0.6986392140388489, "learning_rate": 4.7250355431548244e-06, "loss": 0.1079653948545456, "step": 718 }, { "epoch": 1.4342629482071714, "grad_norm": 1.2315129041671753, "learning_rate": 4.717471819988088e-06, "loss": 1.070616364479065, "step": 720 }, { "epoch": 1.4382470119521913, "grad_norm": 2.786571502685547, "learning_rate": 4.709892709168096e-06, "loss": 0.2563188672065735, "step": 722 }, { "epoch": 1.4422310756972112, "grad_norm": 0.634524941444397, "learning_rate": 4.702298292971422e-06, "loss": 1.0500552654266357, "step": 724 }, { "epoch": 1.4462151394422311, "grad_norm": 0.7324956059455872, "learning_rate": 4.6946886538407975e-06, "loss": 1.092575192451477, "step": 726 }, { "epoch": 1.450199203187251, "grad_norm": 1.8564890623092651, "learning_rate": 4.687063874384204e-06, "loss": 0.8989277482032776, "step": 728 }, { "epoch": 1.454183266932271, "grad_norm": 0.6646371483802795, "learning_rate": 4.679424037373984e-06, "loss": 1.0014073848724365, "step": 730 }, { "epoch": 1.4581673306772909, "grad_norm": 2.136218786239624, "learning_rate": 4.671769225745939e-06, "loss": 1.0647640228271484, "step": 732 }, { "epoch": 1.4621513944223108, "grad_norm": 0.5179296135902405, "learning_rate": 4.664099522598432e-06, "loss": 0.12710000574588776, "step": 734 }, { "epoch": 1.4661354581673307, "grad_norm": 0.8502590656280518, "learning_rate": 4.656415011191484e-06, "loss": 1.085228681564331, "step": 736 }, { "epoch": 1.4701195219123506, "grad_norm": 1.1160621643066406, "learning_rate": 4.648715774945869e-06, "loss": 1.1700797080993652, "step": 738 }, { "epoch": 1.4741035856573705, "grad_norm": 4.530128002166748, "learning_rate": 4.641001897442209e-06, "loss": 0.19807864725589752, "step": 740 }, { "epoch": 1.4780876494023905, "grad_norm": 1.182551383972168, "learning_rate": 4.633273462420069e-06, "loss": 1.2210465669631958, "step": 742 }, { "epoch": 1.4820717131474104, "grad_norm": 7.367408752441406, "learning_rate": 4.625530553777045e-06, "loss": 1.2010120153427124, "step": 744 }, { "epoch": 1.4860557768924303, "grad_norm": 0.8875226378440857, "learning_rate": 4.617773255567855e-06, "loss": 1.0283279418945312, "step": 746 }, { "epoch": 1.4900398406374502, "grad_norm": 1.780938744544983, "learning_rate": 4.610001652003426e-06, "loss": 1.0667709112167358, "step": 748 }, { "epoch": 1.4940239043824701, "grad_norm": 1.2433035373687744, "learning_rate": 4.602215827449976e-06, "loss": 1.0492123365402222, "step": 750 }, { "epoch": 1.49800796812749, "grad_norm": 0.8798750638961792, "learning_rate": 4.594415866428108e-06, "loss": 1.0049997568130493, "step": 752 }, { "epoch": 1.50199203187251, "grad_norm": 1.146921992301941, "learning_rate": 4.586601853611882e-06, "loss": 0.994334876537323, "step": 754 }, { "epoch": 1.5059760956175299, "grad_norm": 3.869616746902466, "learning_rate": 4.578773873827901e-06, "loss": 0.7532044053077698, "step": 756 }, { "epoch": 1.5099601593625498, "grad_norm": 1.7733598947525024, "learning_rate": 4.57093201205439e-06, "loss": 1.0711463689804077, "step": 758 }, { "epoch": 1.5139442231075697, "grad_norm": 4.040090560913086, "learning_rate": 4.563076353420272e-06, "loss": 1.1239742040634155, "step": 760 }, { "epoch": 1.5179282868525896, "grad_norm": 1.1118268966674805, "learning_rate": 4.5552069832042455e-06, "loss": 0.22398273646831512, "step": 762 }, { "epoch": 1.5219123505976095, "grad_norm": 0.8436402678489685, "learning_rate": 4.547323986833857e-06, "loss": 1.0367255210876465, "step": 764 }, { "epoch": 1.5258964143426295, "grad_norm": 1.7664424180984497, "learning_rate": 4.539427449884576e-06, "loss": 0.7687526941299438, "step": 766 }, { "epoch": 1.5298804780876494, "grad_norm": 1.0416488647460938, "learning_rate": 4.53151745807886e-06, "loss": 0.5652468204498291, "step": 768 }, { "epoch": 1.5338645418326693, "grad_norm": 1.3710383176803589, "learning_rate": 4.523594097285234e-06, "loss": 1.0875599384307861, "step": 770 }, { "epoch": 1.5378486055776892, "grad_norm": 1.310120701789856, "learning_rate": 4.51565745351735e-06, "loss": 0.8149851560592651, "step": 772 }, { "epoch": 1.5418326693227091, "grad_norm": 1.0462884902954102, "learning_rate": 4.507707612933059e-06, "loss": 1.044182300567627, "step": 774 }, { "epoch": 1.545816733067729, "grad_norm": 2.2944624423980713, "learning_rate": 4.4997446618334664e-06, "loss": 1.1731159687042236, "step": 776 }, { "epoch": 1.549800796812749, "grad_norm": 6.394598960876465, "learning_rate": 4.491768686662005e-06, "loss": 0.5516869425773621, "step": 778 }, { "epoch": 1.5537848605577689, "grad_norm": 2.329699754714966, "learning_rate": 4.483779774003498e-06, "loss": 0.5405542850494385, "step": 780 }, { "epoch": 1.5577689243027888, "grad_norm": 0.42006587982177734, "learning_rate": 4.475778010583205e-06, "loss": 0.20549674332141876, "step": 782 }, { "epoch": 1.5617529880478087, "grad_norm": 2.271444082260132, "learning_rate": 4.467763483265897e-06, "loss": 0.9095351696014404, "step": 784 }, { "epoch": 1.5657370517928286, "grad_norm": 1.6157774925231934, "learning_rate": 4.459736279054901e-06, "loss": 1.3291853666305542, "step": 786 }, { "epoch": 1.5697211155378485, "grad_norm": 4.978515625, "learning_rate": 4.451696485091164e-06, "loss": 0.7586594223976135, "step": 788 }, { "epoch": 1.5737051792828685, "grad_norm": 1.2765519618988037, "learning_rate": 4.4436441886523025e-06, "loss": 1.1358023881912231, "step": 790 }, { "epoch": 1.5776892430278884, "grad_norm": 8.105411529541016, "learning_rate": 4.435579477151655e-06, "loss": 0.8000907897949219, "step": 792 }, { "epoch": 1.5816733067729083, "grad_norm": 0.7435089349746704, "learning_rate": 4.427502438137337e-06, "loss": 1.073531150817871, "step": 794 }, { "epoch": 1.5856573705179282, "grad_norm": 0.9908289313316345, "learning_rate": 4.419413159291284e-06, "loss": 1.011960744857788, "step": 796 }, { "epoch": 1.5896414342629481, "grad_norm": 1.1573151350021362, "learning_rate": 4.411311728428307e-06, "loss": 0.8743354082107544, "step": 798 }, { "epoch": 1.593625498007968, "grad_norm": 6.756656646728516, "learning_rate": 4.403198233495133e-06, "loss": 0.32545700669288635, "step": 800 }, { "epoch": 1.597609561752988, "grad_norm": 1.2311936616897583, "learning_rate": 4.395072762569457e-06, "loss": 0.9778568744659424, "step": 802 }, { "epoch": 1.6015936254980079, "grad_norm": 3.5830166339874268, "learning_rate": 4.386935403858977e-06, "loss": 1.0981725454330444, "step": 804 }, { "epoch": 1.6055776892430278, "grad_norm": 0.9334324598312378, "learning_rate": 4.378786245700443e-06, "loss": 1.3115934133529663, "step": 806 }, { "epoch": 1.6095617529880477, "grad_norm": 0.8329153656959534, "learning_rate": 4.370625376558698e-06, "loss": 1.028051733970642, "step": 808 }, { "epoch": 1.6135458167330676, "grad_norm": 1.030179500579834, "learning_rate": 4.362452885025713e-06, "loss": 0.9735574722290039, "step": 810 }, { "epoch": 1.6175298804780875, "grad_norm": 6.181675434112549, "learning_rate": 4.35426885981963e-06, "loss": 0.42590758204460144, "step": 812 }, { "epoch": 1.6215139442231075, "grad_norm": 3.902128219604492, "learning_rate": 4.346073389783799e-06, "loss": 0.7486605048179626, "step": 814 }, { "epoch": 1.6254980079681274, "grad_norm": 0.6811983585357666, "learning_rate": 4.337866563885808e-06, "loss": 0.2310914248228073, "step": 816 }, { "epoch": 1.6294820717131473, "grad_norm": 0.7712540030479431, "learning_rate": 4.329648471216523e-06, "loss": 1.112511157989502, "step": 818 }, { "epoch": 1.6334661354581672, "grad_norm": 1.0290017127990723, "learning_rate": 4.321419200989117e-06, "loss": 0.287282794713974, "step": 820 }, { "epoch": 1.6374501992031871, "grad_norm": 2.3703389167785645, "learning_rate": 4.313178842538107e-06, "loss": 0.7247891426086426, "step": 822 }, { "epoch": 1.641434262948207, "grad_norm": 1.919006586074829, "learning_rate": 4.304927485318375e-06, "loss": 0.21648265421390533, "step": 824 }, { "epoch": 1.645418326693227, "grad_norm": 1.1350631713867188, "learning_rate": 4.296665218904207e-06, "loss": 1.0472216606140137, "step": 826 }, { "epoch": 1.6494023904382469, "grad_norm": 0.42043375968933105, "learning_rate": 4.288392132988313e-06, "loss": 0.40000608563423157, "step": 828 }, { "epoch": 1.6533864541832668, "grad_norm": 1.6645681858062744, "learning_rate": 4.280108317380859e-06, "loss": 0.4568580985069275, "step": 830 }, { "epoch": 1.6573705179282867, "grad_norm": 1.5291117429733276, "learning_rate": 4.27181386200849e-06, "loss": 0.9923895597457886, "step": 832 }, { "epoch": 1.6613545816733066, "grad_norm": 1.294873833656311, "learning_rate": 4.263508856913346e-06, "loss": 0.994326651096344, "step": 834 }, { "epoch": 1.6653386454183265, "grad_norm": 2.7709615230560303, "learning_rate": 4.2551933922521e-06, "loss": 0.8918184041976929, "step": 836 }, { "epoch": 1.6693227091633465, "grad_norm": 1.2106887102127075, "learning_rate": 4.246867558294967e-06, "loss": 1.1439393758773804, "step": 838 }, { "epoch": 1.6733067729083664, "grad_norm": 1.091464877128601, "learning_rate": 4.2385314454247275e-06, "loss": 1.0264958143234253, "step": 840 }, { "epoch": 1.6772908366533863, "grad_norm": 1.5609543323516846, "learning_rate": 4.230185144135749e-06, "loss": 0.8460158109664917, "step": 842 }, { "epoch": 1.6812749003984062, "grad_norm": 0.8120943903923035, "learning_rate": 4.221828745033002e-06, "loss": 1.0981191396713257, "step": 844 }, { "epoch": 1.6852589641434261, "grad_norm": 1.0494468212127686, "learning_rate": 4.2134623388310706e-06, "loss": 0.3851274847984314, "step": 846 }, { "epoch": 1.6892430278884463, "grad_norm": 1.039975643157959, "learning_rate": 4.20508601635318e-06, "loss": 0.7145401239395142, "step": 848 }, { "epoch": 1.6932270916334662, "grad_norm": 1.385925054550171, "learning_rate": 4.1966998685302e-06, "loss": 1.1264657974243164, "step": 850 }, { "epoch": 1.697211155378486, "grad_norm": 0.7857804894447327, "learning_rate": 4.18830398639966e-06, "loss": 1.1105672121047974, "step": 852 }, { "epoch": 1.701195219123506, "grad_norm": 1.1625089645385742, "learning_rate": 4.179898461104764e-06, "loss": 1.078861117362976, "step": 854 }, { "epoch": 1.705179282868526, "grad_norm": 0.9041614532470703, "learning_rate": 4.1714833838934006e-06, "loss": 1.0313189029693604, "step": 856 }, { "epoch": 1.7091633466135459, "grad_norm": 0.8065091967582703, "learning_rate": 4.163058846117148e-06, "loss": 0.34671998023986816, "step": 858 }, { "epoch": 1.7131474103585658, "grad_norm": 1.2888925075531006, "learning_rate": 4.154624939230289e-06, "loss": 1.031374454498291, "step": 860 }, { "epoch": 1.7171314741035857, "grad_norm": 0.8425755500793457, "learning_rate": 4.146181754788813e-06, "loss": 1.0426599979400635, "step": 862 }, { "epoch": 1.7211155378486056, "grad_norm": 1.4209198951721191, "learning_rate": 4.13772938444942e-06, "loss": 0.6024843454360962, "step": 864 }, { "epoch": 1.7250996015936255, "grad_norm": 1.0409010648727417, "learning_rate": 4.129267919968536e-06, "loss": 0.4379670023918152, "step": 866 }, { "epoch": 1.7290836653386454, "grad_norm": 1.4887381792068481, "learning_rate": 4.120797453201309e-06, "loss": 0.8161473274230957, "step": 868 }, { "epoch": 1.7330677290836654, "grad_norm": 12.129778861999512, "learning_rate": 4.112318076100608e-06, "loss": 0.22986909747123718, "step": 870 }, { "epoch": 1.7370517928286853, "grad_norm": 2.050231456756592, "learning_rate": 4.103829880716036e-06, "loss": 0.5155397057533264, "step": 872 }, { "epoch": 1.7410358565737052, "grad_norm": 3.127119541168213, "learning_rate": 4.0953329591929204e-06, "loss": 0.42298442125320435, "step": 874 }, { "epoch": 1.745019920318725, "grad_norm": 1.210281491279602, "learning_rate": 4.08682740377132e-06, "loss": 1.0322401523590088, "step": 876 }, { "epoch": 1.749003984063745, "grad_norm": 0.7078624367713928, "learning_rate": 4.0783133067850185e-06, "loss": 1.0741485357284546, "step": 878 }, { "epoch": 1.752988047808765, "grad_norm": 0.9627106189727783, "learning_rate": 4.069790760660525e-06, "loss": 0.08892940729856491, "step": 880 }, { "epoch": 1.7569721115537849, "grad_norm": 2.872758388519287, "learning_rate": 4.06125985791607e-06, "loss": 1.2808747291564941, "step": 882 }, { "epoch": 1.7609561752988048, "grad_norm": 1.4781732559204102, "learning_rate": 4.0527206911606025e-06, "loss": 1.6314507722854614, "step": 884 }, { "epoch": 1.7649402390438247, "grad_norm": 0.4292491674423218, "learning_rate": 4.044173353092779e-06, "loss": 0.2118670642375946, "step": 886 }, { "epoch": 1.7689243027888446, "grad_norm": 1.0890276432037354, "learning_rate": 4.035617936499967e-06, "loss": 1.1356523036956787, "step": 888 }, { "epoch": 1.7729083665338645, "grad_norm": 1.0168540477752686, "learning_rate": 4.0270545342572265e-06, "loss": 0.9910404086112976, "step": 890 }, { "epoch": 1.7768924302788844, "grad_norm": 0.8853142261505127, "learning_rate": 4.018483239326312e-06, "loss": 0.9891409277915955, "step": 892 }, { "epoch": 1.7808764940239044, "grad_norm": 0.7593168020248413, "learning_rate": 4.009904144754655e-06, "loss": 1.1023067235946655, "step": 894 }, { "epoch": 1.7848605577689243, "grad_norm": 3.0125675201416016, "learning_rate": 4.00131734367436e-06, "loss": 0.9771660566329956, "step": 896 }, { "epoch": 1.7888446215139442, "grad_norm": 1.7285772562026978, "learning_rate": 3.99272292930119e-06, "loss": 0.5689830780029297, "step": 898 }, { "epoch": 1.792828685258964, "grad_norm": 0.7325118184089661, "learning_rate": 3.984120994933558e-06, "loss": 1.026572823524475, "step": 900 }, { "epoch": 1.796812749003984, "grad_norm": 1.3268436193466187, "learning_rate": 3.975511633951506e-06, "loss": 0.5517056584358215, "step": 902 }, { "epoch": 1.800796812749004, "grad_norm": 0.8117510676383972, "learning_rate": 3.966894939815702e-06, "loss": 0.3609198033809662, "step": 904 }, { "epoch": 1.8047808764940239, "grad_norm": 1.122198224067688, "learning_rate": 3.958271006066421e-06, "loss": 0.9236494898796082, "step": 906 }, { "epoch": 1.8087649402390438, "grad_norm": 2.9102554321289062, "learning_rate": 3.949639926322527e-06, "loss": 0.8726416230201721, "step": 908 }, { "epoch": 1.812749003984064, "grad_norm": 13.756661415100098, "learning_rate": 3.941001794280458e-06, "loss": 1.0099586248397827, "step": 910 }, { "epoch": 1.8167330677290838, "grad_norm": 3.1848342418670654, "learning_rate": 3.932356703713212e-06, "loss": 0.25727564096450806, "step": 912 }, { "epoch": 1.8207171314741037, "grad_norm": 1.389024019241333, "learning_rate": 3.923704748469326e-06, "loss": 1.0060839653015137, "step": 914 }, { "epoch": 1.8247011952191237, "grad_norm": 0.8609137535095215, "learning_rate": 3.915046022471857e-06, "loss": 1.0158603191375732, "step": 916 }, { "epoch": 1.8286852589641436, "grad_norm": 0.8087533116340637, "learning_rate": 3.906380619717363e-06, "loss": 1.0479439496994019, "step": 918 }, { "epoch": 1.8326693227091635, "grad_norm": 3.3105380535125732, "learning_rate": 3.897708634274886e-06, "loss": 0.36958053708076477, "step": 920 }, { "epoch": 1.8366533864541834, "grad_norm": 1.9331108331680298, "learning_rate": 3.889030160284922e-06, "loss": 0.35556235909461975, "step": 922 }, { "epoch": 1.8406374501992033, "grad_norm": 0.7566105723381042, "learning_rate": 3.88034529195841e-06, "loss": 1.1607534885406494, "step": 924 }, { "epoch": 1.8446215139442232, "grad_norm": 0.2870655953884125, "learning_rate": 3.871654123575704e-06, "loss": 0.14478978514671326, "step": 926 }, { "epoch": 1.8486055776892432, "grad_norm": 0.3280292749404907, "learning_rate": 3.8629567494855445e-06, "loss": 0.0896715372800827, "step": 928 }, { "epoch": 1.852589641434263, "grad_norm": 1.354030728340149, "learning_rate": 3.854253264104045e-06, "loss": 1.078214168548584, "step": 930 }, { "epoch": 1.856573705179283, "grad_norm": 1.015066146850586, "learning_rate": 3.845543761913657e-06, "loss": 1.114577293395996, "step": 932 }, { "epoch": 1.860557768924303, "grad_norm": 0.39395958185195923, "learning_rate": 3.836828337462152e-06, "loss": 0.5930612087249756, "step": 934 }, { "epoch": 1.8645418326693228, "grad_norm": 3.372042417526245, "learning_rate": 3.82810708536159e-06, "loss": 0.34988486766815186, "step": 936 }, { "epoch": 1.8685258964143427, "grad_norm": 1.3925652503967285, "learning_rate": 3.819380100287294e-06, "loss": 1.0657780170440674, "step": 938 }, { "epoch": 1.8725099601593627, "grad_norm": 1.6448031663894653, "learning_rate": 3.810647476976824e-06, "loss": 1.0907565355300903, "step": 940 }, { "epoch": 1.8764940239043826, "grad_norm": 0.7891445159912109, "learning_rate": 3.801909310228945e-06, "loss": 0.35766711831092834, "step": 942 }, { "epoch": 1.8804780876494025, "grad_norm": 1.724031686782837, "learning_rate": 3.7931656949026028e-06, "loss": 1.7528119087219238, "step": 944 }, { "epoch": 1.8844621513944224, "grad_norm": 1.0190646648406982, "learning_rate": 3.784416725915887e-06, "loss": 0.706551194190979, "step": 946 }, { "epoch": 1.8884462151394423, "grad_norm": 3.7524330615997314, "learning_rate": 3.7756624982450105e-06, "loss": 1.3365905284881592, "step": 948 }, { "epoch": 1.8924302788844622, "grad_norm": 1.1480021476745605, "learning_rate": 3.7669031069232684e-06, "loss": 0.7811166048049927, "step": 950 }, { "epoch": 1.8964143426294822, "grad_norm": 0.7147510647773743, "learning_rate": 3.7581386470400106e-06, "loss": 1.0117745399475098, "step": 952 }, { "epoch": 1.900398406374502, "grad_norm": 2.004282236099243, "learning_rate": 3.7493692137396153e-06, "loss": 0.5164535045623779, "step": 954 }, { "epoch": 1.904382470119522, "grad_norm": 0.7438123822212219, "learning_rate": 3.7405949022204435e-06, "loss": 1.0378838777542114, "step": 956 }, { "epoch": 1.908366533864542, "grad_norm": 3.5988733768463135, "learning_rate": 3.731815807733818e-06, "loss": 0.6023346781730652, "step": 958 }, { "epoch": 1.9123505976095618, "grad_norm": 2.4353888034820557, "learning_rate": 3.723032025582982e-06, "loss": 0.5875221490859985, "step": 960 }, { "epoch": 1.9163346613545817, "grad_norm": 1.3933720588684082, "learning_rate": 3.7142436511220676e-06, "loss": 0.1774052381515503, "step": 962 }, { "epoch": 1.9203187250996017, "grad_norm": 2.9852864742279053, "learning_rate": 3.7054507797550564e-06, "loss": 1.3314721584320068, "step": 964 }, { "epoch": 1.9243027888446216, "grad_norm": 0.7507312893867493, "learning_rate": 3.6966535069347523e-06, "loss": 1.0096935033798218, "step": 966 }, { "epoch": 1.9282868525896415, "grad_norm": 1.7996251583099365, "learning_rate": 3.6878519281617354e-06, "loss": 1.0307931900024414, "step": 968 }, { "epoch": 1.9322709163346614, "grad_norm": 1.16811203956604, "learning_rate": 3.6790461389833317e-06, "loss": 0.9180192351341248, "step": 970 }, { "epoch": 1.9362549800796813, "grad_norm": 0.7789274454116821, "learning_rate": 3.670236234992576e-06, "loss": 1.1056816577911377, "step": 972 }, { "epoch": 1.9402390438247012, "grad_norm": 0.8071714639663696, "learning_rate": 3.661422311827169e-06, "loss": 1.061263084411621, "step": 974 }, { "epoch": 1.9442231075697212, "grad_norm": 2.5436365604400635, "learning_rate": 3.652604465168444e-06, "loss": 0.9830687642097473, "step": 976 }, { "epoch": 1.948207171314741, "grad_norm": 0.7201181054115295, "learning_rate": 3.6437827907403273e-06, "loss": 1.0000416040420532, "step": 978 }, { "epoch": 1.952191235059761, "grad_norm": 0.7345990538597107, "learning_rate": 3.6349573843082966e-06, "loss": 1.0285298824310303, "step": 980 }, { "epoch": 1.956175298804781, "grad_norm": 0.6029013395309448, "learning_rate": 3.6261283416783447e-06, "loss": 0.3689904808998108, "step": 982 }, { "epoch": 1.9601593625498008, "grad_norm": 5.31935977935791, "learning_rate": 3.6172957586959372e-06, "loss": 1.075624704360962, "step": 984 }, { "epoch": 1.9641434262948207, "grad_norm": 2.391829252243042, "learning_rate": 3.6084597312449725e-06, "loss": 0.8474624156951904, "step": 986 }, { "epoch": 1.9681274900398407, "grad_norm": 5.1822967529296875, "learning_rate": 3.599620355246742e-06, "loss": 0.31603577733039856, "step": 988 }, { "epoch": 1.9721115537848606, "grad_norm": 1.8022582530975342, "learning_rate": 3.5907777266588856e-06, "loss": 0.911726713180542, "step": 990 }, { "epoch": 1.9760956175298805, "grad_norm": 0.7391871213912964, "learning_rate": 3.5819319414743555e-06, "loss": 1.0421473979949951, "step": 992 }, { "epoch": 1.9800796812749004, "grad_norm": 1.211188554763794, "learning_rate": 3.573083095720369e-06, "loss": 1.0375580787658691, "step": 994 }, { "epoch": 1.9840637450199203, "grad_norm": 6.231225967407227, "learning_rate": 3.5642312854573686e-06, "loss": 0.5392568707466125, "step": 996 }, { "epoch": 1.9880478087649402, "grad_norm": 1.1782855987548828, "learning_rate": 3.5553766067779785e-06, "loss": 1.188450813293457, "step": 998 }, { "epoch": 1.9920318725099602, "grad_norm": 0.6256092190742493, "learning_rate": 3.546519155805962e-06, "loss": 1.0698131322860718, "step": 1000 }, { "epoch": 1.99601593625498, "grad_norm": 0.89486163854599, "learning_rate": 3.5376590286951774e-06, "loss": 1.02101469039917, "step": 1002 }, { "epoch": 2.0, "grad_norm": 0.5744116902351379, "learning_rate": 3.5287963216285337e-06, "loss": 0.08481757342815399, "step": 1004 }, { "epoch": 2.00398406374502, "grad_norm": 0.4444674849510193, "learning_rate": 3.519931130816947e-06, "loss": 0.14744052290916443, "step": 1006 }, { "epoch": 2.00796812749004, "grad_norm": 1.0349431037902832, "learning_rate": 3.511063552498299e-06, "loss": 0.894745945930481, "step": 1008 }, { "epoch": 2.0119521912350598, "grad_norm": 0.5005489587783813, "learning_rate": 3.502193682936385e-06, "loss": 0.29803839325904846, "step": 1010 }, { "epoch": 2.0159362549800797, "grad_norm": 1.0027674436569214, "learning_rate": 3.493321618419877e-06, "loss": 0.6132505536079407, "step": 1012 }, { "epoch": 2.0199203187250996, "grad_norm": 0.722247302532196, "learning_rate": 3.484447455261272e-06, "loss": 0.8650059700012207, "step": 1014 }, { "epoch": 2.0239043824701195, "grad_norm": 0.1125183254480362, "learning_rate": 3.4755712897958524e-06, "loss": 0.06626415252685547, "step": 1016 }, { "epoch": 2.0278884462151394, "grad_norm": 2.244713306427002, "learning_rate": 3.4666932183806345e-06, "loss": 0.6729474663734436, "step": 1018 }, { "epoch": 2.0318725099601593, "grad_norm": 0.8710299730300903, "learning_rate": 3.4578133373933263e-06, "loss": 0.8701741099357605, "step": 1020 }, { "epoch": 2.0358565737051793, "grad_norm": 0.8872413635253906, "learning_rate": 3.4489317432312796e-06, "loss": 0.8716042041778564, "step": 1022 }, { "epoch": 2.039840637450199, "grad_norm": 1.219373106956482, "learning_rate": 3.4400485323104426e-06, "loss": 0.34580960869789124, "step": 1024 }, { "epoch": 2.043824701195219, "grad_norm": 1.7070385217666626, "learning_rate": 3.431163801064317e-06, "loss": 0.3066391348838806, "step": 1026 }, { "epoch": 2.047808764940239, "grad_norm": 3.4397644996643066, "learning_rate": 3.422277645942907e-06, "loss": 0.3099243938922882, "step": 1028 }, { "epoch": 2.051792828685259, "grad_norm": 20.93805694580078, "learning_rate": 3.413390163411675e-06, "loss": 0.6691966652870178, "step": 1030 }, { "epoch": 2.055776892430279, "grad_norm": 1.0854685306549072, "learning_rate": 3.4045014499504923e-06, "loss": 0.8780809640884399, "step": 1032 }, { "epoch": 2.0597609561752988, "grad_norm": 11.395671844482422, "learning_rate": 3.3956116020525924e-06, "loss": 0.2683337926864624, "step": 1034 }, { "epoch": 2.0637450199203187, "grad_norm": 2.4742014408111572, "learning_rate": 3.3867207162235272e-06, "loss": 0.7748890519142151, "step": 1036 }, { "epoch": 2.0677290836653386, "grad_norm": 2.432234525680542, "learning_rate": 3.377828888980112e-06, "loss": 0.8894884586334229, "step": 1038 }, { "epoch": 2.0717131474103585, "grad_norm": 2.468468427658081, "learning_rate": 3.3689362168493844e-06, "loss": 0.6649755239486694, "step": 1040 }, { "epoch": 2.0756972111553784, "grad_norm": 0.6127830147743225, "learning_rate": 3.3600427963675516e-06, "loss": 0.8452335596084595, "step": 1042 }, { "epoch": 2.0796812749003983, "grad_norm": 1.180112361907959, "learning_rate": 3.3511487240789483e-06, "loss": 0.929725170135498, "step": 1044 }, { "epoch": 2.0836653386454183, "grad_norm": 0.738735020160675, "learning_rate": 3.3422540965349806e-06, "loss": 0.8923982381820679, "step": 1046 }, { "epoch": 2.087649402390438, "grad_norm": 3.025284767150879, "learning_rate": 3.333359010293085e-06, "loss": 0.9607875347137451, "step": 1048 }, { "epoch": 2.091633466135458, "grad_norm": 0.7996847033500671, "learning_rate": 3.3244635619156786e-06, "loss": 0.4797319769859314, "step": 1050 }, { "epoch": 2.095617529880478, "grad_norm": 10.094463348388672, "learning_rate": 3.315567847969106e-06, "loss": 0.2578115165233612, "step": 1052 }, { "epoch": 2.099601593625498, "grad_norm": 0.6219993233680725, "learning_rate": 3.306671965022598e-06, "loss": 0.315256267786026, "step": 1054 }, { "epoch": 2.103585657370518, "grad_norm": 1.1088297367095947, "learning_rate": 3.2977760096472184e-06, "loss": 0.9286193251609802, "step": 1056 }, { "epoch": 2.1075697211155378, "grad_norm": 1.1025009155273438, "learning_rate": 3.2888800784148174e-06, "loss": 0.7976268529891968, "step": 1058 }, { "epoch": 2.1115537848605577, "grad_norm": 0.7398043274879456, "learning_rate": 3.2799842678969835e-06, "loss": 0.3379042148590088, "step": 1060 }, { "epoch": 2.1155378486055776, "grad_norm": 1.8223795890808105, "learning_rate": 3.2710886746639964e-06, "loss": 0.29785844683647156, "step": 1062 }, { "epoch": 2.1195219123505975, "grad_norm": 0.9167846441268921, "learning_rate": 3.262193395283773e-06, "loss": 0.10107379406690598, "step": 1064 }, { "epoch": 2.1235059760956174, "grad_norm": 6.6176300048828125, "learning_rate": 3.2532985263208266e-06, "loss": 0.4440305829048157, "step": 1066 }, { "epoch": 2.1274900398406373, "grad_norm": 0.8213241696357727, "learning_rate": 3.244404164335213e-06, "loss": 0.8258364796638489, "step": 1068 }, { "epoch": 2.1314741035856573, "grad_norm": 2.339560031890869, "learning_rate": 3.2355104058814874e-06, "loss": 0.9001627564430237, "step": 1070 }, { "epoch": 2.135458167330677, "grad_norm": 1.07158625125885, "learning_rate": 3.226617347507649e-06, "loss": 0.3943869471549988, "step": 1072 }, { "epoch": 2.139442231075697, "grad_norm": 0.9587336182594299, "learning_rate": 3.2177250857541007e-06, "loss": 1.0341042280197144, "step": 1074 }, { "epoch": 2.143426294820717, "grad_norm": 0.8883066773414612, "learning_rate": 3.208833717152594e-06, "loss": 0.19238322973251343, "step": 1076 }, { "epoch": 2.147410358565737, "grad_norm": 1.4621644020080566, "learning_rate": 3.199943338225189e-06, "loss": 0.7075263261795044, "step": 1078 }, { "epoch": 2.151394422310757, "grad_norm": 0.9659390449523926, "learning_rate": 3.1910540454832e-06, "loss": 0.9844989776611328, "step": 1080 }, { "epoch": 2.1553784860557768, "grad_norm": 0.9126376509666443, "learning_rate": 3.1821659354261478e-06, "loss": 0.8773077130317688, "step": 1082 }, { "epoch": 2.1593625498007967, "grad_norm": 1.5047764778137207, "learning_rate": 3.173279104540719e-06, "loss": 0.7283194065093994, "step": 1084 }, { "epoch": 2.1633466135458166, "grad_norm": 2.4488370418548584, "learning_rate": 3.164393649299711e-06, "loss": 1.0191715955734253, "step": 1086 }, { "epoch": 2.1673306772908365, "grad_norm": 0.6298505663871765, "learning_rate": 3.155509666160986e-06, "loss": 0.19404178857803345, "step": 1088 }, { "epoch": 2.1713147410358564, "grad_norm": 3.298346519470215, "learning_rate": 3.1466272515664287e-06, "loss": 0.4330817759037018, "step": 1090 }, { "epoch": 2.1752988047808763, "grad_norm": 1.4736095666885376, "learning_rate": 3.137746501940894e-06, "loss": 0.8412344455718994, "step": 1092 }, { "epoch": 2.1792828685258963, "grad_norm": 1.3612383604049683, "learning_rate": 3.1288675136911653e-06, "loss": 0.7719582915306091, "step": 1094 }, { "epoch": 2.183266932270916, "grad_norm": 1.6760456562042236, "learning_rate": 3.1199903832049025e-06, "loss": 0.8681936264038086, "step": 1096 }, { "epoch": 2.187250996015936, "grad_norm": 0.9944242238998413, "learning_rate": 3.1111152068495982e-06, "loss": 0.8590313196182251, "step": 1098 }, { "epoch": 2.191235059760956, "grad_norm": 1.1411633491516113, "learning_rate": 3.102242080971531e-06, "loss": 0.8502429723739624, "step": 1100 }, { "epoch": 2.195219123505976, "grad_norm": 1.0093145370483398, "learning_rate": 3.0933711018947217e-06, "loss": 0.8326080441474915, "step": 1102 }, { "epoch": 2.199203187250996, "grad_norm": 1.3518801927566528, "learning_rate": 3.084502365919887e-06, "loss": 0.31851112842559814, "step": 1104 }, { "epoch": 2.2031872509960158, "grad_norm": 0.8486732840538025, "learning_rate": 3.0756359693233897e-06, "loss": 0.12462817877531052, "step": 1106 }, { "epoch": 2.2071713147410357, "grad_norm": 3.158237934112549, "learning_rate": 3.066772008356201e-06, "loss": 0.7065569162368774, "step": 1108 }, { "epoch": 2.2111553784860556, "grad_norm": 1.6595673561096191, "learning_rate": 3.057910579242848e-06, "loss": 0.32911333441734314, "step": 1110 }, { "epoch": 2.2151394422310755, "grad_norm": 0.9766960740089417, "learning_rate": 3.0490517781803748e-06, "loss": 0.8282409906387329, "step": 1112 }, { "epoch": 2.2191235059760954, "grad_norm": 2.551868438720703, "learning_rate": 3.040195701337296e-06, "loss": 0.8591130971908569, "step": 1114 }, { "epoch": 2.2231075697211153, "grad_norm": 2.4142255783081055, "learning_rate": 3.0313424448525513e-06, "loss": 0.6863746643066406, "step": 1116 }, { "epoch": 2.2270916334661353, "grad_norm": 1.8660197257995605, "learning_rate": 3.022492104834467e-06, "loss": 0.867939829826355, "step": 1118 }, { "epoch": 2.231075697211155, "grad_norm": 1.012052297592163, "learning_rate": 3.013644777359706e-06, "loss": 0.862476110458374, "step": 1120 }, { "epoch": 2.235059760956175, "grad_norm": 1.3242058753967285, "learning_rate": 3.004800558472228e-06, "loss": 0.8478327393531799, "step": 1122 }, { "epoch": 2.239043824701195, "grad_norm": 1.5202715396881104, "learning_rate": 2.995959544182248e-06, "loss": 0.8780950307846069, "step": 1124 }, { "epoch": 2.243027888446215, "grad_norm": 1.5164873600006104, "learning_rate": 2.9871218304651926e-06, "loss": 0.8773269653320312, "step": 1126 }, { "epoch": 2.247011952191235, "grad_norm": 12.062283515930176, "learning_rate": 2.9782875132606573e-06, "loss": 0.5782788991928101, "step": 1128 }, { "epoch": 2.2509960159362548, "grad_norm": 0.4626627266407013, "learning_rate": 2.969456688471368e-06, "loss": 0.17795492708683014, "step": 1130 }, { "epoch": 2.2549800796812747, "grad_norm": 8.622909545898438, "learning_rate": 2.960629451962137e-06, "loss": 0.876864492893219, "step": 1132 }, { "epoch": 2.2589641434262946, "grad_norm": 2.5603370666503906, "learning_rate": 2.9518058995588217e-06, "loss": 0.5039679408073425, "step": 1134 }, { "epoch": 2.2629482071713145, "grad_norm": 1.9047883749008179, "learning_rate": 2.9429861270472884e-06, "loss": 0.8298702836036682, "step": 1136 }, { "epoch": 2.2669322709163344, "grad_norm": 1.333377480506897, "learning_rate": 2.9341702301723704e-06, "loss": 0.8177191019058228, "step": 1138 }, { "epoch": 2.2709163346613543, "grad_norm": 0.8072558641433716, "learning_rate": 2.9253583046368243e-06, "loss": 0.8483671545982361, "step": 1140 }, { "epoch": 2.2749003984063743, "grad_norm": 1.162376046180725, "learning_rate": 2.916550446100299e-06, "loss": 0.8442429900169373, "step": 1142 }, { "epoch": 2.278884462151394, "grad_norm": 2.1500282287597656, "learning_rate": 2.907746750178293e-06, "loss": 0.40876924991607666, "step": 1144 }, { "epoch": 2.2828685258964145, "grad_norm": 1.5930662155151367, "learning_rate": 2.8989473124411136e-06, "loss": 0.3929884433746338, "step": 1146 }, { "epoch": 2.2868525896414345, "grad_norm": 0.9812231659889221, "learning_rate": 2.8901522284128454e-06, "loss": 0.8924030661582947, "step": 1148 }, { "epoch": 2.2908366533864544, "grad_norm": 4.809815883636475, "learning_rate": 2.881361593570308e-06, "loss": 0.412593275308609, "step": 1150 }, { "epoch": 2.2948207171314743, "grad_norm": 0.34295371174812317, "learning_rate": 2.872575503342027e-06, "loss": 0.07170237600803375, "step": 1152 }, { "epoch": 2.298804780876494, "grad_norm": 2.6662888526916504, "learning_rate": 2.8637940531071856e-06, "loss": 0.9125880599021912, "step": 1154 }, { "epoch": 2.302788844621514, "grad_norm": 1.016099214553833, "learning_rate": 2.8550173381946035e-06, "loss": 0.20460867881774902, "step": 1156 }, { "epoch": 2.306772908366534, "grad_norm": 1.2535561323165894, "learning_rate": 2.84624545388169e-06, "loss": 0.18213213980197906, "step": 1158 }, { "epoch": 2.310756972111554, "grad_norm": 5.914939880371094, "learning_rate": 2.837478495393418e-06, "loss": 1.015434980392456, "step": 1160 }, { "epoch": 2.314741035856574, "grad_norm": 3.516514539718628, "learning_rate": 2.828716557901286e-06, "loss": 0.4791782796382904, "step": 1162 }, { "epoch": 2.318725099601594, "grad_norm": 1.2415333986282349, "learning_rate": 2.819959736522286e-06, "loss": 0.6430278420448303, "step": 1164 }, { "epoch": 2.3227091633466137, "grad_norm": 6.374106407165527, "learning_rate": 2.8112081263178727e-06, "loss": 0.7340620756149292, "step": 1166 }, { "epoch": 2.3266932270916336, "grad_norm": 0.7349236011505127, "learning_rate": 2.8024618222929257e-06, "loss": 0.8904776573181152, "step": 1168 }, { "epoch": 2.3306772908366535, "grad_norm": 3.1692311763763428, "learning_rate": 2.793720919394726e-06, "loss": 0.3335300385951996, "step": 1170 }, { "epoch": 2.3346613545816735, "grad_norm": 1.9627305269241333, "learning_rate": 2.7849855125119204e-06, "loss": 0.9338223338127136, "step": 1172 }, { "epoch": 2.3386454183266934, "grad_norm": 1.715811014175415, "learning_rate": 2.7762556964734925e-06, "loss": 0.8548279404640198, "step": 1174 }, { "epoch": 2.3426294820717133, "grad_norm": 1.2761598825454712, "learning_rate": 2.7675315660477342e-06, "loss": 0.6551219820976257, "step": 1176 }, { "epoch": 2.346613545816733, "grad_norm": 0.5829970836639404, "learning_rate": 2.7588132159412153e-06, "loss": 0.8633916974067688, "step": 1178 }, { "epoch": 2.350597609561753, "grad_norm": 0.8791594505310059, "learning_rate": 2.7501007407977554e-06, "loss": 0.8312200903892517, "step": 1180 }, { "epoch": 2.354581673306773, "grad_norm": 0.8145209550857544, "learning_rate": 2.7413942351973994e-06, "loss": 0.8451777696609497, "step": 1182 }, { "epoch": 2.358565737051793, "grad_norm": 0.8338920474052429, "learning_rate": 2.7326937936553845e-06, "loss": 0.9415311813354492, "step": 1184 }, { "epoch": 2.362549800796813, "grad_norm": 0.9346828460693359, "learning_rate": 2.7239995106211244e-06, "loss": 0.8471455574035645, "step": 1186 }, { "epoch": 2.366533864541833, "grad_norm": 1.4322340488433838, "learning_rate": 2.715311480477173e-06, "loss": 0.30060604214668274, "step": 1188 }, { "epoch": 2.3705179282868527, "grad_norm": 1.1024688482284546, "learning_rate": 2.7066297975382065e-06, "loss": 0.7530568838119507, "step": 1190 }, { "epoch": 2.3745019920318726, "grad_norm": 0.5967240333557129, "learning_rate": 2.697954556049997e-06, "loss": 0.867277204990387, "step": 1192 }, { "epoch": 2.3784860557768925, "grad_norm": 0.9026405811309814, "learning_rate": 2.689285850188391e-06, "loss": 0.9335858225822449, "step": 1194 }, { "epoch": 2.3824701195219125, "grad_norm": 0.48514679074287415, "learning_rate": 2.6806237740582855e-06, "loss": 0.2793917655944824, "step": 1196 }, { "epoch": 2.3864541832669324, "grad_norm": 2.9039154052734375, "learning_rate": 2.671968421692607e-06, "loss": 1.4733071327209473, "step": 1198 }, { "epoch": 2.3904382470119523, "grad_norm": 3.6072850227355957, "learning_rate": 2.6633198870512927e-06, "loss": 0.3655731976032257, "step": 1200 }, { "epoch": 2.394422310756972, "grad_norm": 0.6584874391555786, "learning_rate": 2.6546782640202666e-06, "loss": 0.8660189509391785, "step": 1202 }, { "epoch": 2.398406374501992, "grad_norm": 0.5407839417457581, "learning_rate": 2.6460436464104216e-06, "loss": 0.848800003528595, "step": 1204 }, { "epoch": 2.402390438247012, "grad_norm": 1.0635416507720947, "learning_rate": 2.6374161279566035e-06, "loss": 0.9516815543174744, "step": 1206 }, { "epoch": 2.406374501992032, "grad_norm": 0.41980046033859253, "learning_rate": 2.628795802316591e-06, "loss": 0.120535708963871, "step": 1208 }, { "epoch": 2.410358565737052, "grad_norm": 0.3191829323768616, "learning_rate": 2.620182763070081e-06, "loss": 0.023226367309689522, "step": 1210 }, { "epoch": 2.414342629482072, "grad_norm": 1.4996663331985474, "learning_rate": 2.61157710371767e-06, "loss": 0.45069432258605957, "step": 1212 }, { "epoch": 2.4183266932270917, "grad_norm": 1.0962636470794678, "learning_rate": 2.6029789176798417e-06, "loss": 0.6983217000961304, "step": 1214 }, { "epoch": 2.4223107569721116, "grad_norm": 0.8529632091522217, "learning_rate": 2.594388298295949e-06, "loss": 0.17169800400733948, "step": 1216 }, { "epoch": 2.4262948207171315, "grad_norm": 0.9947030544281006, "learning_rate": 2.585805338823208e-06, "loss": 0.8718166947364807, "step": 1218 }, { "epoch": 2.4302788844621515, "grad_norm": 0.39905738830566406, "learning_rate": 2.577230132435678e-06, "loss": 0.5236790776252747, "step": 1220 }, { "epoch": 2.4342629482071714, "grad_norm": 1.6986416578292847, "learning_rate": 2.5686627722232518e-06, "loss": 0.4206949770450592, "step": 1222 }, { "epoch": 2.4382470119521913, "grad_norm": 0.8914661407470703, "learning_rate": 2.560103351190651e-06, "loss": 0.8530100584030151, "step": 1224 }, { "epoch": 2.442231075697211, "grad_norm": 1.940697193145752, "learning_rate": 2.5515519622564086e-06, "loss": 0.03098766878247261, "step": 1226 }, { "epoch": 2.446215139442231, "grad_norm": 0.740294873714447, "learning_rate": 2.543008698251863e-06, "loss": 0.8904476165771484, "step": 1228 }, { "epoch": 2.450199203187251, "grad_norm": 1.2256784439086914, "learning_rate": 2.534473651920153e-06, "loss": 0.6660670042037964, "step": 1230 }, { "epoch": 2.454183266932271, "grad_norm": 1.3577665090560913, "learning_rate": 2.5259469159152063e-06, "loss": 0.8957257270812988, "step": 1232 }, { "epoch": 2.458167330677291, "grad_norm": 5.5895209312438965, "learning_rate": 2.5174285828007387e-06, "loss": 0.4879809319972992, "step": 1234 }, { "epoch": 2.462151394422311, "grad_norm": 1.602962851524353, "learning_rate": 2.5089187450492464e-06, "loss": 0.8527651429176331, "step": 1236 }, { "epoch": 2.4661354581673307, "grad_norm": 1.6139048337936401, "learning_rate": 2.5004174950409996e-06, "loss": 0.814254641532898, "step": 1238 }, { "epoch": 2.4701195219123506, "grad_norm": 2.1591413021087646, "learning_rate": 2.4919249250630463e-06, "loss": 0.620861828327179, "step": 1240 }, { "epoch": 2.4741035856573705, "grad_norm": 2.2499430179595947, "learning_rate": 2.483441127308202e-06, "loss": 0.622882068157196, "step": 1242 }, { "epoch": 2.4780876494023905, "grad_norm": 0.8735558390617371, "learning_rate": 2.47496619387406e-06, "loss": 0.8819273114204407, "step": 1244 }, { "epoch": 2.4820717131474104, "grad_norm": 1.0973459482192993, "learning_rate": 2.4665002167619798e-06, "loss": 0.85080885887146, "step": 1246 }, { "epoch": 2.4860557768924303, "grad_norm": 1.19606351852417, "learning_rate": 2.4580432878760968e-06, "loss": 0.5080418586730957, "step": 1248 }, { "epoch": 2.49003984063745, "grad_norm": 0.36084145307540894, "learning_rate": 2.449595499022318e-06, "loss": 0.3111553192138672, "step": 1250 }, { "epoch": 2.49402390438247, "grad_norm": 0.7546538710594177, "learning_rate": 2.441156941907333e-06, "loss": 0.6624001264572144, "step": 1252 }, { "epoch": 2.49800796812749, "grad_norm": 0.7720620632171631, "learning_rate": 2.432727708137612e-06, "loss": 0.7852078676223755, "step": 1254 }, { "epoch": 2.50199203187251, "grad_norm": 2.640068292617798, "learning_rate": 2.424307889218414e-06, "loss": 0.9888243079185486, "step": 1256 }, { "epoch": 2.50597609561753, "grad_norm": 0.47891512513160706, "learning_rate": 2.415897576552795e-06, "loss": 0.11806351691484451, "step": 1258 }, { "epoch": 2.50996015936255, "grad_norm": 1.773125171661377, "learning_rate": 2.407496861440611e-06, "loss": 0.712026834487915, "step": 1260 }, { "epoch": 2.5139442231075697, "grad_norm": 0.8916162848472595, "learning_rate": 2.3991058350775316e-06, "loss": 0.27510854601860046, "step": 1262 }, { "epoch": 2.5179282868525896, "grad_norm": 2.915144205093384, "learning_rate": 2.3907245885540473e-06, "loss": 0.5907682180404663, "step": 1264 }, { "epoch": 2.5219123505976095, "grad_norm": 0.7523391842842102, "learning_rate": 2.382353212854483e-06, "loss": 0.875799298286438, "step": 1266 }, { "epoch": 2.5258964143426295, "grad_norm": 0.7640947699546814, "learning_rate": 2.373991798856008e-06, "loss": 0.8100597858428955, "step": 1268 }, { "epoch": 2.5298804780876494, "grad_norm": 0.9602063894271851, "learning_rate": 2.3656404373276496e-06, "loss": 0.8617823719978333, "step": 1270 }, { "epoch": 2.5338645418326693, "grad_norm": 1.0857386589050293, "learning_rate": 2.35729921892931e-06, "loss": 0.7695320248603821, "step": 1272 }, { "epoch": 2.537848605577689, "grad_norm": 2.655921220779419, "learning_rate": 2.3489682342107787e-06, "loss": 1.0393037796020508, "step": 1274 }, { "epoch": 2.541832669322709, "grad_norm": 1.602705478668213, "learning_rate": 2.3406475736107537e-06, "loss": 0.8128276467323303, "step": 1276 }, { "epoch": 2.545816733067729, "grad_norm": 1.7629623413085938, "learning_rate": 2.332337327455856e-06, "loss": 0.8416529893875122, "step": 1278 }, { "epoch": 2.549800796812749, "grad_norm": 0.3072420656681061, "learning_rate": 2.3240375859596493e-06, "loss": 0.21107147634029388, "step": 1280 }, { "epoch": 2.553784860557769, "grad_norm": 0.7584460973739624, "learning_rate": 2.3157484392216645e-06, "loss": 0.7613718509674072, "step": 1282 }, { "epoch": 2.557768924302789, "grad_norm": 0.7467636466026306, "learning_rate": 2.3074699772264184e-06, "loss": 0.9068883657455444, "step": 1284 }, { "epoch": 2.5617529880478087, "grad_norm": 2.827934503555298, "learning_rate": 2.2992022898424358e-06, "loss": 0.9814170002937317, "step": 1286 }, { "epoch": 2.5657370517928286, "grad_norm": 0.6314749717712402, "learning_rate": 2.2909454668212763e-06, "loss": 0.9777659177780151, "step": 1288 }, { "epoch": 2.5697211155378485, "grad_norm": 1.5785683393478394, "learning_rate": 2.2826995977965586e-06, "loss": 0.14857736229896545, "step": 1290 }, { "epoch": 2.5737051792828685, "grad_norm": 0.8036978244781494, "learning_rate": 2.27446477228299e-06, "loss": 0.9405508041381836, "step": 1292 }, { "epoch": 2.5776892430278884, "grad_norm": 0.7155508399009705, "learning_rate": 2.2662410796753924e-06, "loss": 0.8522077202796936, "step": 1294 }, { "epoch": 2.5816733067729083, "grad_norm": 1.1586476564407349, "learning_rate": 2.2580286092477285e-06, "loss": 0.8515244722366333, "step": 1296 }, { "epoch": 2.585657370517928, "grad_norm": 1.105276346206665, "learning_rate": 2.2498274501521414e-06, "loss": 0.8348259925842285, "step": 1298 }, { "epoch": 2.589641434262948, "grad_norm": 0.5298115611076355, "learning_rate": 2.2416376914179776e-06, "loss": 0.37851282954216003, "step": 1300 }, { "epoch": 2.593625498007968, "grad_norm": 0.8865681290626526, "learning_rate": 2.2334594219508283e-06, "loss": 0.493791401386261, "step": 1302 }, { "epoch": 2.597609561752988, "grad_norm": 0.8937894105911255, "learning_rate": 2.2252927305315587e-06, "loss": 0.768490731716156, "step": 1304 }, { "epoch": 2.601593625498008, "grad_norm": 2.249807119369507, "learning_rate": 2.2171377058153465e-06, "loss": 0.28239089250564575, "step": 1306 }, { "epoch": 2.605577689243028, "grad_norm": 0.7723252773284912, "learning_rate": 2.2089944363307165e-06, "loss": 0.8856875896453857, "step": 1308 }, { "epoch": 2.6095617529880477, "grad_norm": 0.43645548820495605, "learning_rate": 2.2008630104785874e-06, "loss": 0.352665513753891, "step": 1310 }, { "epoch": 2.6135458167330676, "grad_norm": 2.615204095840454, "learning_rate": 2.1927435165313036e-06, "loss": 0.1691545695066452, "step": 1312 }, { "epoch": 2.6175298804780875, "grad_norm": 0.7458433508872986, "learning_rate": 2.184636042631679e-06, "loss": 0.06585448980331421, "step": 1314 }, { "epoch": 2.6215139442231075, "grad_norm": 1.3437604904174805, "learning_rate": 2.176540676792046e-06, "loss": 0.956698477268219, "step": 1316 }, { "epoch": 2.6254980079681274, "grad_norm": 2.3479928970336914, "learning_rate": 2.168457506893292e-06, "loss": 0.669885516166687, "step": 1318 }, { "epoch": 2.6294820717131473, "grad_norm": 0.6726356744766235, "learning_rate": 2.1603866206839074e-06, "loss": 0.9108378887176514, "step": 1320 }, { "epoch": 2.633466135458167, "grad_norm": 0.6728199124336243, "learning_rate": 2.152328105779041e-06, "loss": 0.46163687109947205, "step": 1322 }, { "epoch": 2.637450199203187, "grad_norm": 3.6970763206481934, "learning_rate": 2.1442820496595337e-06, "loss": 1.0799225568771362, "step": 1324 }, { "epoch": 2.641434262948207, "grad_norm": 2.347198009490967, "learning_rate": 2.1362485396709847e-06, "loss": 0.2297479808330536, "step": 1326 }, { "epoch": 2.645418326693227, "grad_norm": 1.014694094657898, "learning_rate": 2.128227663022794e-06, "loss": 0.7543836832046509, "step": 1328 }, { "epoch": 2.649402390438247, "grad_norm": 1.9803884029388428, "learning_rate": 2.1202195067872153e-06, "loss": 0.8650748133659363, "step": 1330 }, { "epoch": 2.653386454183267, "grad_norm": 1.038819432258606, "learning_rate": 2.112224157898416e-06, "loss": 0.7467201352119446, "step": 1332 }, { "epoch": 2.6573705179282867, "grad_norm": 4.248292922973633, "learning_rate": 2.1042417031515303e-06, "loss": 1.0267494916915894, "step": 1334 }, { "epoch": 2.6613545816733066, "grad_norm": 0.40952640771865845, "learning_rate": 2.096272229201716e-06, "loss": 0.06949189305305481, "step": 1336 }, { "epoch": 2.6653386454183265, "grad_norm": 1.2858881950378418, "learning_rate": 2.0883158225632168e-06, "loss": 0.9944968223571777, "step": 1338 }, { "epoch": 2.6693227091633465, "grad_norm": 1.2663077116012573, "learning_rate": 2.0803725696084224e-06, "loss": 0.32381299138069153, "step": 1340 }, { "epoch": 2.6733067729083664, "grad_norm": 2.5092110633850098, "learning_rate": 2.072442556566928e-06, "loss": 0.5067175626754761, "step": 1342 }, { "epoch": 2.6772908366533863, "grad_norm": 0.4816880226135254, "learning_rate": 2.0645258695245993e-06, "loss": 0.06836852431297302, "step": 1344 }, { "epoch": 2.681274900398406, "grad_norm": 0.8811363577842712, "learning_rate": 2.0566225944226414e-06, "loss": 0.8118082284927368, "step": 1346 }, { "epoch": 2.685258964143426, "grad_norm": 0.7595816850662231, "learning_rate": 2.0487328170566643e-06, "loss": 0.833029568195343, "step": 1348 }, { "epoch": 2.6892430278884465, "grad_norm": 0.9555457830429077, "learning_rate": 2.0408566230757465e-06, "loss": 0.8837859034538269, "step": 1350 }, { "epoch": 2.6932270916334664, "grad_norm": 2.7736618518829346, "learning_rate": 2.0329940979815116e-06, "loss": 0.3744777739048004, "step": 1352 }, { "epoch": 2.6972111553784863, "grad_norm": 1.4651148319244385, "learning_rate": 2.0251453271272e-06, "loss": 0.3069399297237396, "step": 1354 }, { "epoch": 2.7011952191235062, "grad_norm": 1.0298899412155151, "learning_rate": 2.0173103957167367e-06, "loss": 0.8419727087020874, "step": 1356 }, { "epoch": 2.705179282868526, "grad_norm": 1.365960955619812, "learning_rate": 2.009489388803809e-06, "loss": 0.8394007682800293, "step": 1358 }, { "epoch": 2.709163346613546, "grad_norm": 0.9906344413757324, "learning_rate": 2.0016823912909486e-06, "loss": 0.8413975238800049, "step": 1360 }, { "epoch": 2.713147410358566, "grad_norm": 0.6724693775177002, "learning_rate": 1.9938894879286024e-06, "loss": 0.8469905853271484, "step": 1362 }, { "epoch": 2.717131474103586, "grad_norm": 1.9248793125152588, "learning_rate": 1.9861107633142155e-06, "loss": 0.8509299755096436, "step": 1364 }, { "epoch": 2.721115537848606, "grad_norm": 1.4797543287277222, "learning_rate": 1.978346301891312e-06, "loss": 0.35483643412590027, "step": 1366 }, { "epoch": 2.7250996015936257, "grad_norm": 0.8299886584281921, "learning_rate": 1.9705961879485813e-06, "loss": 0.8987928628921509, "step": 1368 }, { "epoch": 2.7290836653386457, "grad_norm": 1.4776321649551392, "learning_rate": 1.962860505618958e-06, "loss": 0.6491652131080627, "step": 1370 }, { "epoch": 2.7330677290836656, "grad_norm": 6.724909782409668, "learning_rate": 1.955139338878714e-06, "loss": 0.19401389360427856, "step": 1372 }, { "epoch": 2.7370517928286855, "grad_norm": 0.943676233291626, "learning_rate": 1.9474327715465444e-06, "loss": 0.8299869894981384, "step": 1374 }, { "epoch": 2.7410358565737054, "grad_norm": 1.2990317344665527, "learning_rate": 1.9397408872826545e-06, "loss": 0.871895968914032, "step": 1376 }, { "epoch": 2.7450199203187253, "grad_norm": 1.9206279516220093, "learning_rate": 1.9320637695878555e-06, "loss": 0.30201855301856995, "step": 1378 }, { "epoch": 2.7490039840637452, "grad_norm": 0.7692667841911316, "learning_rate": 1.924401501802659e-06, "loss": 0.6371020078659058, "step": 1380 }, { "epoch": 2.752988047808765, "grad_norm": 0.8262352347373962, "learning_rate": 1.9167541671063703e-06, "loss": 0.9497525691986084, "step": 1382 }, { "epoch": 2.756972111553785, "grad_norm": 1.0128363370895386, "learning_rate": 1.9091218485161824e-06, "loss": 0.9976522922515869, "step": 1384 }, { "epoch": 2.760956175298805, "grad_norm": 0.8022831082344055, "learning_rate": 1.9015046288862815e-06, "loss": 0.8430491089820862, "step": 1386 }, { "epoch": 2.764940239043825, "grad_norm": 1.4386292695999146, "learning_rate": 1.893902590906943e-06, "loss": 0.6075490117073059, "step": 1388 }, { "epoch": 2.768924302788845, "grad_norm": 1.3775461912155151, "learning_rate": 1.8863158171036336e-06, "loss": 0.12825116515159607, "step": 1390 }, { "epoch": 2.7729083665338647, "grad_norm": 1.3699278831481934, "learning_rate": 1.8787443898361158e-06, "loss": 1.1316020488739014, "step": 1392 }, { "epoch": 2.7768924302788847, "grad_norm": 0.8569239377975464, "learning_rate": 1.8711883912975575e-06, "loss": 0.655997633934021, "step": 1394 }, { "epoch": 2.7808764940239046, "grad_norm": 0.7035950422286987, "learning_rate": 1.8636479035136368e-06, "loss": 0.8871821165084839, "step": 1396 }, { "epoch": 2.7848605577689245, "grad_norm": 0.7683161497116089, "learning_rate": 1.8561230083416488e-06, "loss": 0.9570977687835693, "step": 1398 }, { "epoch": 2.7888446215139444, "grad_norm": 0.8087801337242126, "learning_rate": 1.8486137874696223e-06, "loss": 0.8703477382659912, "step": 1400 }, { "epoch": 2.7928286852589643, "grad_norm": 0.9088819622993469, "learning_rate": 1.8411203224154289e-06, "loss": 0.8619301915168762, "step": 1402 }, { "epoch": 2.7968127490039842, "grad_norm": 0.3485574424266815, "learning_rate": 1.833642694525902e-06, "loss": 0.13462619483470917, "step": 1404 }, { "epoch": 2.800796812749004, "grad_norm": 0.9604331851005554, "learning_rate": 1.826180984975948e-06, "loss": 0.8676316142082214, "step": 1406 }, { "epoch": 2.804780876494024, "grad_norm": 1.302273154258728, "learning_rate": 1.8187352747676718e-06, "loss": 1.241036295890808, "step": 1408 }, { "epoch": 2.808764940239044, "grad_norm": 1.2466564178466797, "learning_rate": 1.8113056447294936e-06, "loss": 1.0569744110107422, "step": 1410 }, { "epoch": 2.812749003984064, "grad_norm": 0.9512035846710205, "learning_rate": 1.8038921755152704e-06, "loss": 0.8206438422203064, "step": 1412 }, { "epoch": 2.816733067729084, "grad_norm": 1.0051904916763306, "learning_rate": 1.7964949476034223e-06, "loss": 0.9369583129882812, "step": 1414 }, { "epoch": 2.8207171314741037, "grad_norm": 3.8374409675598145, "learning_rate": 1.7891140412960615e-06, "loss": 1.116792917251587, "step": 1416 }, { "epoch": 2.8247011952191237, "grad_norm": 1.1146875619888306, "learning_rate": 1.7817495367181132e-06, "loss": 0.8257051110267639, "step": 1418 }, { "epoch": 2.8286852589641436, "grad_norm": 0.2130766063928604, "learning_rate": 1.774401513816454e-06, "loss": 0.08374066650867462, "step": 1420 }, { "epoch": 2.8326693227091635, "grad_norm": 0.8484716415405273, "learning_rate": 1.76707005235904e-06, "loss": 0.9364421963691711, "step": 1422 }, { "epoch": 2.8366533864541834, "grad_norm": 0.7365440130233765, "learning_rate": 1.759755231934039e-06, "loss": 0.9269137978553772, "step": 1424 }, { "epoch": 2.8406374501992033, "grad_norm": 0.9674385190010071, "learning_rate": 1.7524571319489695e-06, "loss": 0.24093596637248993, "step": 1426 }, { "epoch": 2.8446215139442232, "grad_norm": 0.8217137455940247, "learning_rate": 1.7451758316298386e-06, "loss": 0.8590070605278015, "step": 1428 }, { "epoch": 2.848605577689243, "grad_norm": 0.818912148475647, "learning_rate": 1.7379114100202824e-06, "loss": 0.8883748650550842, "step": 1430 }, { "epoch": 2.852589641434263, "grad_norm": 2.239244222640991, "learning_rate": 1.7306639459807026e-06, "loss": 0.8789231777191162, "step": 1432 }, { "epoch": 2.856573705179283, "grad_norm": 1.3130366802215576, "learning_rate": 1.7234335181874197e-06, "loss": 0.41715553402900696, "step": 1434 }, { "epoch": 2.860557768924303, "grad_norm": 2.1881866455078125, "learning_rate": 1.7162202051318092e-06, "loss": 0.8317433595657349, "step": 1436 }, { "epoch": 2.864541832669323, "grad_norm": 0.4997340440750122, "learning_rate": 1.7090240851194576e-06, "loss": 0.06248881667852402, "step": 1438 }, { "epoch": 2.8685258964143427, "grad_norm": 0.7684650421142578, "learning_rate": 1.7018452362693062e-06, "loss": 0.9771674871444702, "step": 1440 }, { "epoch": 2.8725099601593627, "grad_norm": 2.6358094215393066, "learning_rate": 1.694683736512807e-06, "loss": 0.4274534285068512, "step": 1442 }, { "epoch": 2.8764940239043826, "grad_norm": 3.7041735649108887, "learning_rate": 1.6875396635930767e-06, "loss": 0.8502193689346313, "step": 1444 }, { "epoch": 2.8804780876494025, "grad_norm": 1.7656716108322144, "learning_rate": 1.6804130950640492e-06, "loss": 0.2269526571035385, "step": 1446 }, { "epoch": 2.8844621513944224, "grad_norm": 0.9704077839851379, "learning_rate": 1.6733041082896355e-06, "loss": 0.9017117619514465, "step": 1448 }, { "epoch": 2.8884462151394423, "grad_norm": 1.1423131227493286, "learning_rate": 1.666212780442887e-06, "loss": 0.7310890555381775, "step": 1450 }, { "epoch": 2.8924302788844622, "grad_norm": 0.8818380832672119, "learning_rate": 1.659139188505152e-06, "loss": 0.9649314880371094, "step": 1452 }, { "epoch": 2.896414342629482, "grad_norm": 0.9627234935760498, "learning_rate": 1.652083409265246e-06, "loss": 0.1323651671409607, "step": 1454 }, { "epoch": 2.900398406374502, "grad_norm": 0.625633955001831, "learning_rate": 1.6450455193186137e-06, "loss": 0.8300275206565857, "step": 1456 }, { "epoch": 2.904382470119522, "grad_norm": 1.691175103187561, "learning_rate": 1.638025595066499e-06, "loss": 0.7612891793251038, "step": 1458 }, { "epoch": 2.908366533864542, "grad_norm": 0.9278882145881653, "learning_rate": 1.6310237127151137e-06, "loss": 0.9076191782951355, "step": 1460 }, { "epoch": 2.912350597609562, "grad_norm": 2.7954494953155518, "learning_rate": 1.624039948274815e-06, "loss": 0.37150129675865173, "step": 1462 }, { "epoch": 2.9163346613545817, "grad_norm": 0.423910528421402, "learning_rate": 1.6170743775592773e-06, "loss": 0.20058873295783997, "step": 1464 }, { "epoch": 2.9203187250996017, "grad_norm": 0.9244667887687683, "learning_rate": 1.610127076184667e-06, "loss": 0.8625198602676392, "step": 1466 }, { "epoch": 2.9243027888446216, "grad_norm": 0.8803090453147888, "learning_rate": 1.6031981195688252e-06, "loss": 0.9291595816612244, "step": 1468 }, { "epoch": 2.9282868525896415, "grad_norm": 1.0361244678497314, "learning_rate": 1.59628758293045e-06, "loss": 0.23180729150772095, "step": 1470 }, { "epoch": 2.9322709163346614, "grad_norm": 5.147000789642334, "learning_rate": 1.5893955412882733e-06, "loss": 0.5987867712974548, "step": 1472 }, { "epoch": 2.9362549800796813, "grad_norm": 0.5982325673103333, "learning_rate": 1.582522069460253e-06, "loss": 0.8363850116729736, "step": 1474 }, { "epoch": 2.9402390438247012, "grad_norm": 3.7226884365081787, "learning_rate": 1.5756672420627596e-06, "loss": 0.8606371283531189, "step": 1476 }, { "epoch": 2.944223107569721, "grad_norm": 1.0484495162963867, "learning_rate": 1.5688311335097646e-06, "loss": 0.9633500576019287, "step": 1478 }, { "epoch": 2.948207171314741, "grad_norm": 0.7016828656196594, "learning_rate": 1.5620138180120331e-06, "loss": 0.8571369647979736, "step": 1480 }, { "epoch": 2.952191235059761, "grad_norm": 2.1188414096832275, "learning_rate": 1.5552153695763156e-06, "loss": 0.44183531403541565, "step": 1482 }, { "epoch": 2.956175298804781, "grad_norm": 2.2254960536956787, "learning_rate": 1.5484358620045534e-06, "loss": 0.28760015964508057, "step": 1484 }, { "epoch": 2.960159362549801, "grad_norm": 2.748490333557129, "learning_rate": 1.5416753688930654e-06, "loss": 0.6493697166442871, "step": 1486 }, { "epoch": 2.9641434262948207, "grad_norm": 1.3967127799987793, "learning_rate": 1.5349339636317584e-06, "loss": 0.8622140288352966, "step": 1488 }, { "epoch": 2.9681274900398407, "grad_norm": 1.959518313407898, "learning_rate": 1.528211719403328e-06, "loss": 0.722124457359314, "step": 1490 }, { "epoch": 2.9721115537848606, "grad_norm": 1.3386509418487549, "learning_rate": 1.521508709182461e-06, "loss": 0.9694193601608276, "step": 1492 }, { "epoch": 2.9760956175298805, "grad_norm": 0.9864974617958069, "learning_rate": 1.514825005735045e-06, "loss": 0.8088407516479492, "step": 1494 }, { "epoch": 2.9800796812749004, "grad_norm": 2.115551471710205, "learning_rate": 1.5081606816173814e-06, "loss": 0.12242338061332703, "step": 1496 }, { "epoch": 2.9840637450199203, "grad_norm": 0.75198894739151, "learning_rate": 1.5015158091753958e-06, "loss": 0.1432493031024933, "step": 1498 }, { "epoch": 2.9880478087649402, "grad_norm": 1.4102544784545898, "learning_rate": 1.4948904605438477e-06, "loss": 0.0790117010474205, "step": 1500 }, { "epoch": 2.99203187250996, "grad_norm": 0.6461302638053894, "learning_rate": 1.488284707645557e-06, "loss": 0.7927932739257812, "step": 1502 }, { "epoch": 2.99601593625498, "grad_norm": 0.9944819211959839, "learning_rate": 1.4816986221906159e-06, "loss": 0.8774588704109192, "step": 1504 }, { "epoch": 3.0, "grad_norm": 2.3869407176971436, "learning_rate": 1.4751322756756127e-06, "loss": 0.23395386338233948, "step": 1506 }, { "epoch": 3.00398406374502, "grad_norm": 0.6929567456245422, "learning_rate": 1.4685857393828543e-06, "loss": 0.6813750267028809, "step": 1508 }, { "epoch": 3.00796812749004, "grad_norm": 1.4428455829620361, "learning_rate": 1.4620590843795967e-06, "loss": 0.27471280097961426, "step": 1510 }, { "epoch": 3.0119521912350598, "grad_norm": 1.1208453178405762, "learning_rate": 1.4555523815172693e-06, "loss": 0.7926130294799805, "step": 1512 }, { "epoch": 3.0159362549800797, "grad_norm": 1.4112131595611572, "learning_rate": 1.449065701430705e-06, "loss": 0.3855717182159424, "step": 1514 }, { "epoch": 3.0199203187250996, "grad_norm": 7.652811527252197, "learning_rate": 1.4425991145373788e-06, "loss": 0.1316222846508026, "step": 1516 }, { "epoch": 3.0239043824701195, "grad_norm": 1.6621893644332886, "learning_rate": 1.4361526910366368e-06, "loss": 0.2520155906677246, "step": 1518 }, { "epoch": 3.0278884462151394, "grad_norm": 0.8125709891319275, "learning_rate": 1.4297265009089397e-06, "loss": 0.7272902727127075, "step": 1520 }, { "epoch": 3.0318725099601593, "grad_norm": 1.4255092144012451, "learning_rate": 1.423320613915099e-06, "loss": 0.5655202865600586, "step": 1522 }, { "epoch": 3.0358565737051793, "grad_norm": 1.9694007635116577, "learning_rate": 1.416935099595522e-06, "loss": 0.21059830486774445, "step": 1524 }, { "epoch": 3.039840637450199, "grad_norm": 0.7592612504959106, "learning_rate": 1.4105700272694578e-06, "loss": 0.6575446724891663, "step": 1526 }, { "epoch": 3.043824701195219, "grad_norm": 1.133392572402954, "learning_rate": 1.4042254660342408e-06, "loss": 0.9429333209991455, "step": 1528 }, { "epoch": 3.047808764940239, "grad_norm": 1.231631875038147, "learning_rate": 1.3979014847645435e-06, "loss": 0.2242284119129181, "step": 1530 }, { "epoch": 3.051792828685259, "grad_norm": 1.1999961137771606, "learning_rate": 1.391598152111631e-06, "loss": 0.15949700772762299, "step": 1532 }, { "epoch": 3.055776892430279, "grad_norm": 1.6939618587493896, "learning_rate": 1.385315536502609e-06, "loss": 0.21413640677928925, "step": 1534 }, { "epoch": 3.0597609561752988, "grad_norm": 1.3219988346099854, "learning_rate": 1.3790537061396887e-06, "loss": 0.6202045679092407, "step": 1536 }, { "epoch": 3.0637450199203187, "grad_norm": 0.998444676399231, "learning_rate": 1.372812728999442e-06, "loss": 0.7671471238136292, "step": 1538 }, { "epoch": 3.0677290836653386, "grad_norm": 1.4698975086212158, "learning_rate": 1.3665926728320632e-06, "loss": 0.47750726342201233, "step": 1540 }, { "epoch": 3.0717131474103585, "grad_norm": 0.9587137699127197, "learning_rate": 1.3603936051606346e-06, "loss": 0.7269394397735596, "step": 1542 }, { "epoch": 3.0756972111553784, "grad_norm": 2.3286054134368896, "learning_rate": 1.3542155932803954e-06, "loss": 0.7805855870246887, "step": 1544 }, { "epoch": 3.0796812749003983, "grad_norm": 0.7439804077148438, "learning_rate": 1.3480587042580092e-06, "loss": 0.6787388324737549, "step": 1546 }, { "epoch": 3.0836653386454183, "grad_norm": 1.8882228136062622, "learning_rate": 1.3419230049308333e-06, "loss": 0.6134771108627319, "step": 1548 }, { "epoch": 3.087649402390438, "grad_norm": 1.0494561195373535, "learning_rate": 1.3358085619062003e-06, "loss": 0.7737662196159363, "step": 1550 }, { "epoch": 3.091633466135458, "grad_norm": 0.31838488578796387, "learning_rate": 1.3297154415606864e-06, "loss": 0.034840308129787445, "step": 1552 }, { "epoch": 3.095617529880478, "grad_norm": 1.5378990173339844, "learning_rate": 1.3236437100393992e-06, "loss": 0.21899044513702393, "step": 1554 }, { "epoch": 3.099601593625498, "grad_norm": 0.9580462574958801, "learning_rate": 1.3175934332552511e-06, "loss": 0.635277271270752, "step": 1556 }, { "epoch": 3.103585657370518, "grad_norm": 1.2689288854599, "learning_rate": 1.3115646768882522e-06, "loss": 0.6710810661315918, "step": 1558 }, { "epoch": 3.1075697211155378, "grad_norm": 0.9133360385894775, "learning_rate": 1.3055575063847923e-06, "loss": 0.7197314500808716, "step": 1560 }, { "epoch": 3.1115537848605577, "grad_norm": 3.067455768585205, "learning_rate": 1.29957198695693e-06, "loss": 0.21895435452461243, "step": 1562 }, { "epoch": 3.1155378486055776, "grad_norm": 0.27349138259887695, "learning_rate": 1.2936081835816867e-06, "loss": 0.19600287079811096, "step": 1564 }, { "epoch": 3.1195219123505975, "grad_norm": 1.1419686079025269, "learning_rate": 1.2876661610003428e-06, "loss": 0.7878577709197998, "step": 1566 }, { "epoch": 3.1235059760956174, "grad_norm": 1.1395351886749268, "learning_rate": 1.2817459837177298e-06, "loss": 0.7802326679229736, "step": 1568 }, { "epoch": 3.1274900398406373, "grad_norm": 1.9237797260284424, "learning_rate": 1.2758477160015355e-06, "loss": 0.5069929361343384, "step": 1570 }, { "epoch": 3.1314741035856573, "grad_norm": 0.7889575958251953, "learning_rate": 1.2699714218816036e-06, "loss": 0.6714158654212952, "step": 1572 }, { "epoch": 3.135458167330677, "grad_norm": 0.9449037313461304, "learning_rate": 1.2641171651492383e-06, "loss": 0.6565294861793518, "step": 1574 }, { "epoch": 3.139442231075697, "grad_norm": 1.7222603559494019, "learning_rate": 1.2582850093565115e-06, "loss": 0.2423674762248993, "step": 1576 }, { "epoch": 3.143426294820717, "grad_norm": 0.8361628651618958, "learning_rate": 1.2524750178155762e-06, "loss": 0.6483781933784485, "step": 1578 }, { "epoch": 3.147410358565737, "grad_norm": 0.4106227159500122, "learning_rate": 1.2466872535979755e-06, "loss": 0.06941226869821548, "step": 1580 }, { "epoch": 3.151394422310757, "grad_norm": 1.131303071975708, "learning_rate": 1.2409217795339592e-06, "loss": 0.6722179651260376, "step": 1582 }, { "epoch": 3.1553784860557768, "grad_norm": 1.3526575565338135, "learning_rate": 1.2351786582118018e-06, "loss": 0.37432199716567993, "step": 1584 }, { "epoch": 3.1593625498007967, "grad_norm": 1.5046707391738892, "learning_rate": 1.2294579519771246e-06, "loss": 0.36908501386642456, "step": 1586 }, { "epoch": 3.1633466135458166, "grad_norm": 0.14365744590759277, "learning_rate": 1.2237597229322155e-06, "loss": 0.01732539013028145, "step": 1588 }, { "epoch": 3.1673306772908365, "grad_norm": 0.7536062598228455, "learning_rate": 1.2180840329353564e-06, "loss": 0.2823001444339752, "step": 1590 }, { "epoch": 3.1713147410358564, "grad_norm": 6.318256855010986, "learning_rate": 1.2124309436001533e-06, "loss": 0.5411125421524048, "step": 1592 }, { "epoch": 3.1752988047808763, "grad_norm": 1.1654754877090454, "learning_rate": 1.2068005162948668e-06, "loss": 0.7602944374084473, "step": 1594 }, { "epoch": 3.1792828685258963, "grad_norm": 2.5576841831207275, "learning_rate": 1.2011928121417431e-06, "loss": 0.1262691169977188, "step": 1596 }, { "epoch": 3.183266932270916, "grad_norm": 1.2924350500106812, "learning_rate": 1.195607892016354e-06, "loss": 0.6975268721580505, "step": 1598 }, { "epoch": 3.187250996015936, "grad_norm": 2.0278656482696533, "learning_rate": 1.1900458165469345e-06, "loss": 0.5072341561317444, "step": 1600 }, { "epoch": 3.191235059760956, "grad_norm": 2.13330078125, "learning_rate": 1.184506646113724e-06, "loss": 0.7287152409553528, "step": 1602 }, { "epoch": 3.195219123505976, "grad_norm": 0.19735604524612427, "learning_rate": 1.1789904408483123e-06, "loss": 0.20490704476833344, "step": 1604 }, { "epoch": 3.199203187250996, "grad_norm": 2.342869997024536, "learning_rate": 1.1734972606329874e-06, "loss": 0.6201443076133728, "step": 1606 }, { "epoch": 3.2031872509960158, "grad_norm": 1.9951808452606201, "learning_rate": 1.1680271651000819e-06, "loss": 0.2740911543369293, "step": 1608 }, { "epoch": 3.2071713147410357, "grad_norm": 1.075411319732666, "learning_rate": 1.162580213631328e-06, "loss": 0.6568232774734497, "step": 1610 }, { "epoch": 3.2111553784860556, "grad_norm": 2.3391730785369873, "learning_rate": 1.1571564653572148e-06, "loss": 1.0995919704437256, "step": 1612 }, { "epoch": 3.2151394422310755, "grad_norm": 0.11555808782577515, "learning_rate": 1.1517559791563439e-06, "loss": 0.003191891126334667, "step": 1614 }, { "epoch": 3.2191235059760954, "grad_norm": 2.371424674987793, "learning_rate": 1.1463788136547887e-06, "loss": 0.396582692861557, "step": 1616 }, { "epoch": 3.2231075697211153, "grad_norm": 1.8076469898223877, "learning_rate": 1.141025027225463e-06, "loss": 0.3241533637046814, "step": 1618 }, { "epoch": 3.2270916334661353, "grad_norm": 0.9942080974578857, "learning_rate": 1.1356946779874825e-06, "loss": 0.6740264296531677, "step": 1620 }, { "epoch": 3.231075697211155, "grad_norm": 1.624965786933899, "learning_rate": 1.1303878238055357e-06, "loss": 0.44572022557258606, "step": 1622 }, { "epoch": 3.235059760956175, "grad_norm": 1.6572600603103638, "learning_rate": 1.1251045222892553e-06, "loss": 0.21951913833618164, "step": 1624 }, { "epoch": 3.239043824701195, "grad_norm": 1.5844409465789795, "learning_rate": 1.119844830792595e-06, "loss": 0.7072573900222778, "step": 1626 }, { "epoch": 3.243027888446215, "grad_norm": 1.0160541534423828, "learning_rate": 1.1146088064132052e-06, "loss": 0.6218189001083374, "step": 1628 }, { "epoch": 3.247011952191235, "grad_norm": 0.6660611033439636, "learning_rate": 1.10939650599181e-06, "loss": 0.15160006284713745, "step": 1630 }, { "epoch": 3.2509960159362548, "grad_norm": 4.854979038238525, "learning_rate": 1.1042079861115967e-06, "loss": 0.4013654887676239, "step": 1632 }, { "epoch": 3.2549800796812747, "grad_norm": 1.7456501722335815, "learning_rate": 1.099043303097596e-06, "loss": 0.6942977905273438, "step": 1634 }, { "epoch": 3.2589641434262946, "grad_norm": 0.6688535809516907, "learning_rate": 1.0939025130160743e-06, "loss": 0.7660707831382751, "step": 1636 }, { "epoch": 3.2629482071713145, "grad_norm": 1.3489729166030884, "learning_rate": 1.088785671673921e-06, "loss": 0.4087866544723511, "step": 1638 }, { "epoch": 3.2669322709163344, "grad_norm": 3.7537801265716553, "learning_rate": 1.0836928346180481e-06, "loss": 0.26779600977897644, "step": 1640 }, { "epoch": 3.2709163346613543, "grad_norm": 1.0913664102554321, "learning_rate": 1.0786240571347827e-06, "loss": 0.11661072820425034, "step": 1642 }, { "epoch": 3.2749003984063743, "grad_norm": 1.3544014692306519, "learning_rate": 1.0735793942492676e-06, "loss": 0.9415394067764282, "step": 1644 }, { "epoch": 3.278884462151394, "grad_norm": 1.880513072013855, "learning_rate": 1.068558900724865e-06, "loss": 0.6600284576416016, "step": 1646 }, { "epoch": 3.2828685258964145, "grad_norm": 2.517366647720337, "learning_rate": 1.0635626310625637e-06, "loss": 0.3240680694580078, "step": 1648 }, { "epoch": 3.2868525896414345, "grad_norm": 0.825859010219574, "learning_rate": 1.058590639500382e-06, "loss": 0.6646403074264526, "step": 1650 }, { "epoch": 3.2908366533864544, "grad_norm": 0.9859835505485535, "learning_rate": 1.0536429800127851e-06, "loss": 0.642147958278656, "step": 1652 }, { "epoch": 3.2948207171314743, "grad_norm": 1.7152155637741089, "learning_rate": 1.0487197063100961e-06, "loss": 0.7060829401016235, "step": 1654 }, { "epoch": 3.298804780876494, "grad_norm": 1.7756178379058838, "learning_rate": 1.0438208718379124e-06, "loss": 0.7361951470375061, "step": 1656 }, { "epoch": 3.302788844621514, "grad_norm": 1.7107096910476685, "learning_rate": 1.0389465297765253e-06, "loss": 0.6126337647438049, "step": 1658 }, { "epoch": 3.306772908366534, "grad_norm": 1.4858530759811401, "learning_rate": 1.0340967330403468e-06, "loss": 0.614052414894104, "step": 1660 }, { "epoch": 3.310756972111554, "grad_norm": 6.398506164550781, "learning_rate": 1.02927153427733e-06, "loss": 0.6388739347457886, "step": 1662 }, { "epoch": 3.314741035856574, "grad_norm": 4.903992652893066, "learning_rate": 1.0244709858683996e-06, "loss": 0.1377391517162323, "step": 1664 }, { "epoch": 3.318725099601594, "grad_norm": 1.644950270652771, "learning_rate": 1.0196951399268847e-06, "loss": 0.3214379549026489, "step": 1666 }, { "epoch": 3.3227091633466137, "grad_norm": 6.5153608322143555, "learning_rate": 1.0149440482979503e-06, "loss": 0.23638975620269775, "step": 1668 }, { "epoch": 3.3266932270916336, "grad_norm": 1.4857839345932007, "learning_rate": 1.0102177625580375e-06, "loss": 0.22218865156173706, "step": 1670 }, { "epoch": 3.3306772908366535, "grad_norm": 8.828252792358398, "learning_rate": 1.0055163340143e-06, "loss": 0.6645467877388, "step": 1672 }, { "epoch": 3.3346613545816735, "grad_norm": 1.0999014377593994, "learning_rate": 1.0008398137040507e-06, "loss": 0.5620592832565308, "step": 1674 }, { "epoch": 3.3386454183266934, "grad_norm": 2.528717279434204, "learning_rate": 9.961882523942068e-07, "loss": 0.6080818176269531, "step": 1676 }, { "epoch": 3.3426294820717133, "grad_norm": 0.1366569846868515, "learning_rate": 9.915617005807357e-07, "loss": 0.01138792559504509, "step": 1678 }, { "epoch": 3.346613545816733, "grad_norm": 5.231603622436523, "learning_rate": 9.869602084881103e-07, "loss": 0.29557374119758606, "step": 1680 }, { "epoch": 3.350597609561753, "grad_norm": 0.9051138758659363, "learning_rate": 9.823838260687635e-07, "loss": 0.41544756293296814, "step": 1682 }, { "epoch": 3.354581673306773, "grad_norm": 1.6163842678070068, "learning_rate": 9.778326030025432e-07, "loss": 0.45938849449157715, "step": 1684 }, { "epoch": 3.358565737051793, "grad_norm": 7.820988178253174, "learning_rate": 9.733065886961764e-07, "loss": 0.4935106337070465, "step": 1686 }, { "epoch": 3.362549800796813, "grad_norm": 1.3769513368606567, "learning_rate": 9.688058322827313e-07, "loss": 0.5252028107643127, "step": 1688 }, { "epoch": 3.366533864541833, "grad_norm": 1.0140272378921509, "learning_rate": 9.643303826210824e-07, "loss": 0.7207529544830322, "step": 1690 }, { "epoch": 3.3705179282868527, "grad_norm": 1.3448855876922607, "learning_rate": 9.598802882953828e-07, "loss": 0.7529066205024719, "step": 1692 }, { "epoch": 3.3745019920318726, "grad_norm": 1.0818604230880737, "learning_rate": 9.554555976145349e-07, "loss": 0.6526249647140503, "step": 1694 }, { "epoch": 3.3784860557768925, "grad_norm": 0.858180046081543, "learning_rate": 9.510563586116686e-07, "loss": 0.6609078645706177, "step": 1696 }, { "epoch": 3.3824701195219125, "grad_norm": 1.1475756168365479, "learning_rate": 9.466826190436147e-07, "loss": 0.7812352180480957, "step": 1698 }, { "epoch": 3.3864541832669324, "grad_norm": 2.1600332260131836, "learning_rate": 9.423344263903926e-07, "loss": 0.7400810122489929, "step": 1700 }, { "epoch": 3.3904382470119523, "grad_norm": 1.9892051219940186, "learning_rate": 9.380118278546906e-07, "loss": 0.6348077654838562, "step": 1702 }, { "epoch": 3.394422310756972, "grad_norm": 0.9929773211479187, "learning_rate": 9.337148703613554e-07, "loss": 0.6541098356246948, "step": 1704 }, { "epoch": 3.398406374501992, "grad_norm": 5.213384628295898, "learning_rate": 9.29443600556881e-07, "loss": 0.20520062744617462, "step": 1706 }, { "epoch": 3.402390438247012, "grad_norm": 1.8277703523635864, "learning_rate": 9.251980648089045e-07, "loss": 0.596899688243866, "step": 1708 }, { "epoch": 3.406374501992032, "grad_norm": 0.9781650304794312, "learning_rate": 9.209783092057025e-07, "loss": 0.7202063202857971, "step": 1710 }, { "epoch": 3.410358565737052, "grad_norm": 1.2887661457061768, "learning_rate": 9.16784379555688e-07, "loss": 0.668391764163971, "step": 1712 }, { "epoch": 3.414342629482072, "grad_norm": 1.2524248361587524, "learning_rate": 9.126163213869171e-07, "loss": 0.6738901138305664, "step": 1714 }, { "epoch": 3.4183266932270917, "grad_norm": 0.8974006772041321, "learning_rate": 9.084741799465915e-07, "loss": 0.6369835734367371, "step": 1716 }, { "epoch": 3.4223107569721116, "grad_norm": 0.9271976351737976, "learning_rate": 9.043580002005681e-07, "loss": 0.7468122839927673, "step": 1718 }, { "epoch": 3.4262948207171315, "grad_norm": 0.9398600459098816, "learning_rate": 9.002678268328732e-07, "loss": 0.6316313743591309, "step": 1720 }, { "epoch": 3.4302788844621515, "grad_norm": 3.112898111343384, "learning_rate": 8.962037042452146e-07, "loss": 0.3467191755771637, "step": 1722 }, { "epoch": 3.4342629482071714, "grad_norm": 0.8903955817222595, "learning_rate": 8.921656765564998e-07, "loss": 0.5496594309806824, "step": 1724 }, { "epoch": 3.4382470119521913, "grad_norm": 2.7363598346710205, "learning_rate": 8.881537876023597e-07, "loss": 0.6846615076065063, "step": 1726 }, { "epoch": 3.442231075697211, "grad_norm": 1.7913397550582886, "learning_rate": 8.841680809346684e-07, "loss": 0.4614332914352417, "step": 1728 }, { "epoch": 3.446215139442231, "grad_norm": 2.286719560623169, "learning_rate": 8.802085998210754e-07, "loss": 0.6514830589294434, "step": 1730 }, { "epoch": 3.450199203187251, "grad_norm": 1.2754535675048828, "learning_rate": 8.762753872445316e-07, "loss": 0.6596709489822388, "step": 1732 }, { "epoch": 3.454183266932271, "grad_norm": 3.1973865032196045, "learning_rate": 8.723684859028244e-07, "loss": 0.3601575791835785, "step": 1734 }, { "epoch": 3.458167330677291, "grad_norm": 1.0521482229232788, "learning_rate": 8.684879382081163e-07, "loss": 0.6533339023590088, "step": 1736 }, { "epoch": 3.462151394422311, "grad_norm": 1.2476742267608643, "learning_rate": 8.646337862864804e-07, "loss": 0.7225340604782104, "step": 1738 }, { "epoch": 3.4661354581673307, "grad_norm": 8.218664169311523, "learning_rate": 8.608060719774452e-07, "loss": 0.14243163168430328, "step": 1740 }, { "epoch": 3.4701195219123506, "grad_norm": 0.8877552151679993, "learning_rate": 8.570048368335411e-07, "loss": 0.7387225031852722, "step": 1742 }, { "epoch": 3.4741035856573705, "grad_norm": 0.26608389616012573, "learning_rate": 8.532301221198491e-07, "loss": 0.060973528772592545, "step": 1744 }, { "epoch": 3.4780876494023905, "grad_norm": 1.655069351196289, "learning_rate": 8.494819688135502e-07, "loss": 0.6722233891487122, "step": 1746 }, { "epoch": 3.4820717131474104, "grad_norm": 0.3229190409183502, "learning_rate": 8.457604176034851e-07, "loss": 0.16490302979946136, "step": 1748 }, { "epoch": 3.4860557768924303, "grad_norm": 0.3072760999202728, "learning_rate": 8.42065508889708e-07, "loss": 0.06224316358566284, "step": 1750 }, { "epoch": 3.49003984063745, "grad_norm": 1.0425161123275757, "learning_rate": 8.383972827830517e-07, "loss": 0.6595985293388367, "step": 1752 }, { "epoch": 3.49402390438247, "grad_norm": 1.6916478872299194, "learning_rate": 8.347557791046892e-07, "loss": 0.18403995037078857, "step": 1754 }, { "epoch": 3.49800796812749, "grad_norm": 0.8162530064582825, "learning_rate": 8.311410373857033e-07, "loss": 0.6693860292434692, "step": 1756 }, { "epoch": 3.50199203187251, "grad_norm": 3.898818254470825, "learning_rate": 8.275530968666578e-07, "loss": 0.5436112880706787, "step": 1758 }, { "epoch": 3.50597609561753, "grad_norm": 0.576738178730011, "learning_rate": 8.239919964971689e-07, "loss": 0.1252291202545166, "step": 1760 }, { "epoch": 3.50996015936255, "grad_norm": 0.9629335403442383, "learning_rate": 8.20457774935485e-07, "loss": 0.2324841022491455, "step": 1762 }, { "epoch": 3.5139442231075697, "grad_norm": 1.051251769065857, "learning_rate": 8.16950470548067e-07, "loss": 0.5175900459289551, "step": 1764 }, { "epoch": 3.5179282868525896, "grad_norm": 5.374156951904297, "learning_rate": 8.134701214091691e-07, "loss": 0.19936859607696533, "step": 1766 }, { "epoch": 3.5219123505976095, "grad_norm": 1.134244680404663, "learning_rate": 8.100167653004285e-07, "loss": 0.09222012758255005, "step": 1768 }, { "epoch": 3.5258964143426295, "grad_norm": 1.0654293298721313, "learning_rate": 8.065904397104543e-07, "loss": 0.6717595458030701, "step": 1770 }, { "epoch": 3.5298804780876494, "grad_norm": 2.4975504875183105, "learning_rate": 8.031911818344201e-07, "loss": 0.5180625915527344, "step": 1772 }, { "epoch": 3.5338645418326693, "grad_norm": 0.9296510219573975, "learning_rate": 7.998190285736589e-07, "loss": 0.6407575607299805, "step": 1774 }, { "epoch": 3.537848605577689, "grad_norm": 2.6143455505371094, "learning_rate": 7.964740165352664e-07, "loss": 0.6667947769165039, "step": 1776 }, { "epoch": 3.541832669322709, "grad_norm": 0.19827701151371002, "learning_rate": 7.931561820317005e-07, "loss": 0.023438258096575737, "step": 1778 }, { "epoch": 3.545816733067729, "grad_norm": 1.148992657661438, "learning_rate": 7.898655610803869e-07, "loss": 0.6734960675239563, "step": 1780 }, { "epoch": 3.549800796812749, "grad_norm": 1.8085567951202393, "learning_rate": 7.866021894033296e-07, "loss": 0.6972249150276184, "step": 1782 }, { "epoch": 3.553784860557769, "grad_norm": 2.9096920490264893, "learning_rate": 7.833661024267235e-07, "loss": 0.6476399302482605, "step": 1784 }, { "epoch": 3.557768924302789, "grad_norm": 0.7224079966545105, "learning_rate": 7.80157335280568e-07, "loss": 0.9946411848068237, "step": 1786 }, { "epoch": 3.5617529880478087, "grad_norm": 1.2070460319519043, "learning_rate": 7.769759227982855e-07, "loss": 0.711801290512085, "step": 1788 }, { "epoch": 3.5657370517928286, "grad_norm": 2.714474678039551, "learning_rate": 7.738218995163462e-07, "loss": 0.15059031546115875, "step": 1790 }, { "epoch": 3.5697211155378485, "grad_norm": 1.3999918699264526, "learning_rate": 7.70695299673891e-07, "loss": 0.139665424823761, "step": 1792 }, { "epoch": 3.5737051792828685, "grad_norm": 0.37299129366874695, "learning_rate": 7.67596157212359e-07, "loss": 0.11374976485967636, "step": 1794 }, { "epoch": 3.5776892430278884, "grad_norm": 0.8067252039909363, "learning_rate": 7.645245057751201e-07, "loss": 0.6304631233215332, "step": 1796 }, { "epoch": 3.5816733067729083, "grad_norm": 1.578432559967041, "learning_rate": 7.614803787071115e-07, "loss": 0.22770892083644867, "step": 1798 }, { "epoch": 3.585657370517928, "grad_norm": 3.3027656078338623, "learning_rate": 7.584638090544717e-07, "loss": 0.20699705183506012, "step": 1800 }, { "epoch": 3.589641434262948, "grad_norm": 0.14634272456169128, "learning_rate": 7.554748295641862e-07, "loss": 0.055411506444215775, "step": 1802 }, { "epoch": 3.593625498007968, "grad_norm": 1.2589038610458374, "learning_rate": 7.525134726837289e-07, "loss": 0.15108336508274078, "step": 1804 }, { "epoch": 3.597609561752988, "grad_norm": 1.8965911865234375, "learning_rate": 7.49579770560711e-07, "loss": 0.4452376961708069, "step": 1806 }, { "epoch": 3.601593625498008, "grad_norm": 1.1629970073699951, "learning_rate": 7.46673755042531e-07, "loss": 0.6423868536949158, "step": 1808 }, { "epoch": 3.605577689243028, "grad_norm": 0.5293740630149841, "learning_rate": 7.437954576760312e-07, "loss": 0.21336103975772858, "step": 1810 }, { "epoch": 3.6095617529880477, "grad_norm": 1.164920449256897, "learning_rate": 7.409449097071536e-07, "loss": 0.5466434359550476, "step": 1812 }, { "epoch": 3.6135458167330676, "grad_norm": 1.1033563613891602, "learning_rate": 7.381221420805999e-07, "loss": 0.6399943232536316, "step": 1814 }, { "epoch": 3.6175298804780875, "grad_norm": 1.056943416595459, "learning_rate": 7.353271854394979e-07, "loss": 0.5917325019836426, "step": 1816 }, { "epoch": 3.6215139442231075, "grad_norm": 0.9444670677185059, "learning_rate": 7.325600701250674e-07, "loss": 0.7685708403587341, "step": 1818 }, { "epoch": 3.6254980079681274, "grad_norm": 1.8602865934371948, "learning_rate": 7.298208261762906e-07, "loss": 0.45633015036582947, "step": 1820 }, { "epoch": 3.6294820717131473, "grad_norm": 0.10787267237901688, "learning_rate": 7.271094833295859e-07, "loss": 0.011536069214344025, "step": 1822 }, { "epoch": 3.633466135458167, "grad_norm": 0.2886284291744232, "learning_rate": 7.244260710184868e-07, "loss": 0.024275042116642, "step": 1824 }, { "epoch": 3.637450199203187, "grad_norm": 0.6795600652694702, "learning_rate": 7.21770618373321e-07, "loss": 0.45940348505973816, "step": 1826 }, { "epoch": 3.641434262948207, "grad_norm": 2.2104618549346924, "learning_rate": 7.191431542208935e-07, "loss": 0.6470014452934265, "step": 1828 }, { "epoch": 3.645418326693227, "grad_norm": 1.12752103805542, "learning_rate": 7.165437070841758e-07, "loss": 0.7721574902534485, "step": 1830 }, { "epoch": 3.649402390438247, "grad_norm": 6.11736536026001, "learning_rate": 7.139723051819938e-07, "loss": 0.5740348696708679, "step": 1832 }, { "epoch": 3.653386454183267, "grad_norm": 0.4044356048107147, "learning_rate": 7.114289764287227e-07, "loss": 0.05502355471253395, "step": 1834 }, { "epoch": 3.6573705179282867, "grad_norm": 4.303436279296875, "learning_rate": 7.08913748433985e-07, "loss": 0.17597807943820953, "step": 1836 }, { "epoch": 3.6613545816733066, "grad_norm": 1.0884654521942139, "learning_rate": 7.064266485023493e-07, "loss": 0.6930414438247681, "step": 1838 }, { "epoch": 3.6653386454183265, "grad_norm": 2.256512403488159, "learning_rate": 7.039677036330331e-07, "loss": 0.6587978601455688, "step": 1840 }, { "epoch": 3.6693227091633465, "grad_norm": 0.19702738523483276, "learning_rate": 7.015369405196132e-07, "loss": 0.016245799139142036, "step": 1842 }, { "epoch": 3.6733067729083664, "grad_norm": 0.9400996565818787, "learning_rate": 6.991343855497312e-07, "loss": 0.15207843482494354, "step": 1844 }, { "epoch": 3.6772908366533863, "grad_norm": 1.0055437088012695, "learning_rate": 6.967600648048113e-07, "loss": 0.6164069175720215, "step": 1846 }, { "epoch": 3.681274900398406, "grad_norm": 1.8582080602645874, "learning_rate": 6.944140040597742e-07, "loss": 0.7226882576942444, "step": 1848 }, { "epoch": 3.685258964143426, "grad_norm": 1.656290054321289, "learning_rate": 6.920962287827587e-07, "loss": 0.07943466305732727, "step": 1850 }, { "epoch": 3.6892430278884465, "grad_norm": 1.666813611984253, "learning_rate": 6.898067641348459e-07, "loss": 0.30842339992523193, "step": 1852 }, { "epoch": 3.6932270916334664, "grad_norm": 0.8802257776260376, "learning_rate": 6.875456349697834e-07, "loss": 0.6316725611686707, "step": 1854 }, { "epoch": 3.6972111553784863, "grad_norm": 2.5803232192993164, "learning_rate": 6.853128658337188e-07, "loss": 0.09659645706415176, "step": 1856 }, { "epoch": 3.7011952191235062, "grad_norm": 1.351311206817627, "learning_rate": 6.831084809649302e-07, "loss": 0.6809911131858826, "step": 1858 }, { "epoch": 3.705179282868526, "grad_norm": 1.1612941026687622, "learning_rate": 6.809325042935666e-07, "loss": 0.3540644943714142, "step": 1860 }, { "epoch": 3.709163346613546, "grad_norm": 0.9889734387397766, "learning_rate": 6.787849594413833e-07, "loss": 0.6793351173400879, "step": 1862 }, { "epoch": 3.713147410358566, "grad_norm": 1.0778642892837524, "learning_rate": 6.766658697214906e-07, "loss": 0.6664227247238159, "step": 1864 }, { "epoch": 3.717131474103586, "grad_norm": 2.6285629272460938, "learning_rate": 6.745752581380965e-07, "loss": 0.33559897541999817, "step": 1866 }, { "epoch": 3.721115537848606, "grad_norm": 1.0389450788497925, "learning_rate": 6.72513147386261e-07, "loss": 0.5156994462013245, "step": 1868 }, { "epoch": 3.7250996015936257, "grad_norm": 0.9331614375114441, "learning_rate": 6.704795598516451e-07, "loss": 0.5414950251579285, "step": 1870 }, { "epoch": 3.7290836653386457, "grad_norm": 1.0866365432739258, "learning_rate": 6.684745176102714e-07, "loss": 0.735094428062439, "step": 1872 }, { "epoch": 3.7330677290836656, "grad_norm": 1.4017014503479004, "learning_rate": 6.664980424282842e-07, "loss": 0.2802731692790985, "step": 1874 }, { "epoch": 3.7370517928286855, "grad_norm": 2.2784199714660645, "learning_rate": 6.645501557617104e-07, "loss": 0.5592929124832153, "step": 1876 }, { "epoch": 3.7410358565737054, "grad_norm": 4.115759372711182, "learning_rate": 6.626308787562294e-07, "loss": 0.41764435172080994, "step": 1878 }, { "epoch": 3.7450199203187253, "grad_norm": 0.9289363622665405, "learning_rate": 6.607402322469429e-07, "loss": 0.6480333209037781, "step": 1880 }, { "epoch": 3.7490039840637452, "grad_norm": 2.0568838119506836, "learning_rate": 6.588782367581475e-07, "loss": 0.773093581199646, "step": 1882 }, { "epoch": 3.752988047808765, "grad_norm": 3.918016195297241, "learning_rate": 6.570449125031144e-07, "loss": 0.5592324137687683, "step": 1884 }, { "epoch": 3.756972111553785, "grad_norm": 0.8172755241394043, "learning_rate": 6.552402793838667e-07, "loss": 0.6393176913261414, "step": 1886 }, { "epoch": 3.760956175298805, "grad_norm": 0.3844411075115204, "learning_rate": 6.534643569909665e-07, "loss": 0.08161535859107971, "step": 1888 }, { "epoch": 3.764940239043825, "grad_norm": 2.660936117172241, "learning_rate": 6.517171646032988e-07, "loss": 0.7531623244285583, "step": 1890 }, { "epoch": 3.768924302788845, "grad_norm": 2.1934661865234375, "learning_rate": 6.499987211878666e-07, "loss": 0.6893159747123718, "step": 1892 }, { "epoch": 3.7729083665338647, "grad_norm": 1.1734172105789185, "learning_rate": 6.483090453995811e-07, "loss": 0.09743469953536987, "step": 1894 }, { "epoch": 3.7768924302788847, "grad_norm": 1.5317673683166504, "learning_rate": 6.466481555810608e-07, "loss": 0.6921253204345703, "step": 1896 }, { "epoch": 3.7808764940239046, "grad_norm": 0.8458757996559143, "learning_rate": 6.450160697624327e-07, "loss": 0.6649323105812073, "step": 1898 }, { "epoch": 3.7848605577689245, "grad_norm": 1.0291515588760376, "learning_rate": 6.434128056611361e-07, "loss": 0.6685061454772949, "step": 1900 }, { "epoch": 3.7888446215139444, "grad_norm": 0.8199694156646729, "learning_rate": 6.418383806817298e-07, "loss": 0.7103414535522461, "step": 1902 }, { "epoch": 3.7928286852589643, "grad_norm": 0.8696004748344421, "learning_rate": 6.40292811915704e-07, "loss": 0.6235980987548828, "step": 1904 }, { "epoch": 3.7968127490039842, "grad_norm": 2.7558107376098633, "learning_rate": 6.387761161412942e-07, "loss": 0.14641408622264862, "step": 1906 }, { "epoch": 3.800796812749004, "grad_norm": 0.8049102425575256, "learning_rate": 6.372883098232999e-07, "loss": 0.6313645839691162, "step": 1908 }, { "epoch": 3.804780876494024, "grad_norm": 1.0484040975570679, "learning_rate": 6.358294091129044e-07, "loss": 0.689453661441803, "step": 1910 }, { "epoch": 3.808764940239044, "grad_norm": 1.3624324798583984, "learning_rate": 6.34399429847501e-07, "loss": 0.4293438196182251, "step": 1912 }, { "epoch": 3.812749003984064, "grad_norm": 2.118128538131714, "learning_rate": 6.329983875505202e-07, "loss": 0.7885560989379883, "step": 1914 }, { "epoch": 3.816733067729084, "grad_norm": 1.88889479637146, "learning_rate": 6.316262974312607e-07, "loss": 0.12458698451519012, "step": 1916 }, { "epoch": 3.8207171314741037, "grad_norm": 2.0474905967712402, "learning_rate": 6.302831743847255e-07, "loss": 0.7278786897659302, "step": 1918 }, { "epoch": 3.8247011952191237, "grad_norm": 1.8699114322662354, "learning_rate": 6.289690329914599e-07, "loss": 0.10339318215847015, "step": 1920 }, { "epoch": 3.8286852589641436, "grad_norm": 0.9766838550567627, "learning_rate": 6.276838875173931e-07, "loss": 0.7524492144584656, "step": 1922 }, { "epoch": 3.8326693227091635, "grad_norm": 0.34323349595069885, "learning_rate": 6.264277519136821e-07, "loss": 0.051684651523828506, "step": 1924 }, { "epoch": 3.8366533864541834, "grad_norm": 1.1233506202697754, "learning_rate": 6.252006398165622e-07, "loss": 0.7036517262458801, "step": 1926 }, { "epoch": 3.8406374501992033, "grad_norm": 1.529929757118225, "learning_rate": 6.240025645471986e-07, "loss": 0.8575693368911743, "step": 1928 }, { "epoch": 3.8446215139442232, "grad_norm": 0.11210882663726807, "learning_rate": 6.228335391115402e-07, "loss": 0.02451253868639469, "step": 1930 }, { "epoch": 3.848605577689243, "grad_norm": 1.864715576171875, "learning_rate": 6.216935762001803e-07, "loss": 0.5305463671684265, "step": 1932 }, { "epoch": 3.852589641434263, "grad_norm": 1.8157854080200195, "learning_rate": 6.205826881882179e-07, "loss": 0.13252875208854675, "step": 1934 }, { "epoch": 3.856573705179283, "grad_norm": 0.9740794897079468, "learning_rate": 6.195008871351232e-07, "loss": 0.7859750986099243, "step": 1936 }, { "epoch": 3.860557768924303, "grad_norm": 1.070713758468628, "learning_rate": 6.184481847846074e-07, "loss": 0.7027934789657593, "step": 1938 }, { "epoch": 3.864541832669323, "grad_norm": 1.440918207168579, "learning_rate": 6.174245925644948e-07, "loss": 0.30577710270881653, "step": 1940 }, { "epoch": 3.8685258964143427, "grad_norm": 2.0320322513580322, "learning_rate": 6.164301215865982e-07, "loss": 0.9369683265686035, "step": 1942 }, { "epoch": 3.8725099601593627, "grad_norm": 0.6125801801681519, "learning_rate": 6.154647826465999e-07, "loss": 0.03845952823758125, "step": 1944 }, { "epoch": 3.8764940239043826, "grad_norm": 3.9984986782073975, "learning_rate": 6.145285862239327e-07, "loss": 0.6496099233627319, "step": 1946 }, { "epoch": 3.8804780876494025, "grad_norm": 0.08795814216136932, "learning_rate": 6.136215424816668e-07, "loss": 0.04779617115855217, "step": 1948 }, { "epoch": 3.8844621513944224, "grad_norm": 0.9127535820007324, "learning_rate": 6.127436612664e-07, "loss": 0.6776239275932312, "step": 1950 }, { "epoch": 3.8884462151394423, "grad_norm": 1.5462641716003418, "learning_rate": 6.118949521081495e-07, "loss": 0.7221356630325317, "step": 1952 }, { "epoch": 3.8924302788844622, "grad_norm": 0.6864924430847168, "learning_rate": 6.11075424220251e-07, "loss": 0.6018074154853821, "step": 1954 }, { "epoch": 3.896414342629482, "grad_norm": 8.130626678466797, "learning_rate": 6.102850864992553e-07, "loss": 0.15544459223747253, "step": 1956 }, { "epoch": 3.900398406374502, "grad_norm": 1.5887444019317627, "learning_rate": 6.095239475248345e-07, "loss": 0.5947393178939819, "step": 1958 }, { "epoch": 3.904382470119522, "grad_norm": 0.9882814288139343, "learning_rate": 6.087920155596867e-07, "loss": 0.016275843605399132, "step": 1960 }, { "epoch": 3.908366533864542, "grad_norm": 0.3859656751155853, "learning_rate": 6.080892985494482e-07, "loss": 0.04228988662362099, "step": 1962 }, { "epoch": 3.912350597609562, "grad_norm": 1.2562545537948608, "learning_rate": 6.074158041226068e-07, "loss": 0.6111615300178528, "step": 1964 }, { "epoch": 3.9163346613545817, "grad_norm": 3.6256649494171143, "learning_rate": 6.067715395904173e-07, "loss": 0.6986129283905029, "step": 1966 }, { "epoch": 3.9203187250996017, "grad_norm": 1.0995627641677856, "learning_rate": 6.061565119468247e-07, "loss": 0.7141016125679016, "step": 1968 }, { "epoch": 3.9243027888446216, "grad_norm": 2.30956768989563, "learning_rate": 6.055707278683863e-07, "loss": 0.22550952434539795, "step": 1970 }, { "epoch": 3.9282868525896415, "grad_norm": 1.4764176607131958, "learning_rate": 6.050141937142003e-07, "loss": 0.1283264309167862, "step": 1972 }, { "epoch": 3.9322709163346614, "grad_norm": 0.9012427926063538, "learning_rate": 6.04486915525836e-07, "loss": 0.8311380743980408, "step": 1974 }, { "epoch": 3.9362549800796813, "grad_norm": 1.559435486793518, "learning_rate": 6.039888990272691e-07, "loss": 0.1916397362947464, "step": 1976 }, { "epoch": 3.9402390438247012, "grad_norm": 0.8929998874664307, "learning_rate": 6.035201496248188e-07, "loss": 0.6807030439376831, "step": 1978 }, { "epoch": 3.944223107569721, "grad_norm": 0.25589969754219055, "learning_rate": 6.030806724070893e-07, "loss": 0.07943480461835861, "step": 1980 }, { "epoch": 3.948207171314741, "grad_norm": 1.3471908569335938, "learning_rate": 6.026704721449152e-07, "loss": 0.805228590965271, "step": 1982 }, { "epoch": 3.952191235059761, "grad_norm": 0.9127321243286133, "learning_rate": 6.022895532913081e-07, "loss": 0.6197107434272766, "step": 1984 }, { "epoch": 3.956175298804781, "grad_norm": 2.661827802658081, "learning_rate": 6.019379199814108e-07, "loss": 0.49690714478492737, "step": 1986 }, { "epoch": 3.960159362549801, "grad_norm": 0.08383038640022278, "learning_rate": 6.016155760324495e-07, "loss": 0.00437126774340868, "step": 1988 }, { "epoch": 3.9641434262948207, "grad_norm": 0.9041069746017456, "learning_rate": 6.013225249436945e-07, "loss": 0.7191581726074219, "step": 1990 }, { "epoch": 3.9681274900398407, "grad_norm": 1.6254363059997559, "learning_rate": 6.010587698964216e-07, "loss": 0.5217870473861694, "step": 1992 }, { "epoch": 3.9721115537848606, "grad_norm": 1.7610574960708618, "learning_rate": 6.008243137538774e-07, "loss": 0.7896353006362915, "step": 1994 }, { "epoch": 3.9760956175298805, "grad_norm": 0.506505012512207, "learning_rate": 6.006191590612478e-07, "loss": 0.06072104722261429, "step": 1996 }, { "epoch": 3.9800796812749004, "grad_norm": 1.679490566253662, "learning_rate": 6.004433080456312e-07, "loss": 0.0873764306306839, "step": 1998 }, { "epoch": 3.9840637450199203, "grad_norm": 1.07437002658844, "learning_rate": 6.002967626160147e-07, "loss": 0.6510695219039917, "step": 2000 }, { "epoch": 3.9880478087649402, "grad_norm": 1.063508152961731, "learning_rate": 6.001795243632514e-07, "loss": 0.6352625489234924, "step": 2002 }, { "epoch": 3.99203187250996, "grad_norm": 0.9537666440010071, "learning_rate": 6.00091594560045e-07, "loss": 0.7177177667617798, "step": 2004 }, { "epoch": 3.99601593625498, "grad_norm": 4.541738986968994, "learning_rate": 6.000329741609355e-07, "loss": 0.23844213783740997, "step": 2006 }, { "epoch": 4.0, "grad_norm": 0.5011924505233765, "learning_rate": 6.000036638022886e-07, "loss": 0.15317194163799286, "step": 2008 }, { "epoch": 4.0, "step": 2008, "total_flos": 3.519329208629199e+18, "train_loss": 0.7788769946752703, "train_runtime": 8944.5824, "train_samples_per_second": 6.735, "train_steps_per_second": 0.224 } ], "logging_steps": 2, "max_steps": 2008, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.519329208629199e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }