{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 9.999629974365234, "learning_rate": 4.25531914893617e-08, "loss": 1.0338, "step": 5 }, { "epoch": 0.0032, "grad_norm": 7.134084224700928, "learning_rate": 9.574468085106382e-08, "loss": 1.1826, "step": 10 }, { "epoch": 0.0048, "grad_norm": 5.787372589111328, "learning_rate": 1.4893617021276595e-07, "loss": 1.145, "step": 15 }, { "epoch": 0.0064, "grad_norm": 13.603924751281738, "learning_rate": 2.0212765957446807e-07, "loss": 1.1501, "step": 20 }, { "epoch": 0.008, "grad_norm": 13.660343170166016, "learning_rate": 2.5531914893617016e-07, "loss": 1.1397, "step": 25 }, { "epoch": 0.0096, "grad_norm": 5.391168594360352, "learning_rate": 3.085106382978723e-07, "loss": 1.0871, "step": 30 }, { "epoch": 0.0112, "grad_norm": 9.299227714538574, "learning_rate": 3.617021276595745e-07, "loss": 1.2235, "step": 35 }, { "epoch": 0.0128, "grad_norm": 4.13670539855957, "learning_rate": 4.148936170212766e-07, "loss": 1.0978, "step": 40 }, { "epoch": 0.0144, "grad_norm": 5.579084396362305, "learning_rate": 4.6808510638297873e-07, "loss": 1.2814, "step": 45 }, { "epoch": 0.016, "grad_norm": 13.503495216369629, "learning_rate": 5.212765957446809e-07, "loss": 0.9962, "step": 50 }, { "epoch": 0.0176, "grad_norm": 16.958608627319336, "learning_rate": 5.74468085106383e-07, "loss": 1.0797, "step": 55 }, { "epoch": 0.0192, "grad_norm": 8.709441184997559, "learning_rate": 6.276595744680851e-07, "loss": 1.1269, "step": 60 }, { "epoch": 0.0208, "grad_norm": 6.347010612487793, "learning_rate": 6.808510638297872e-07, "loss": 1.1201, "step": 65 }, { "epoch": 0.0224, "grad_norm": 12.77128791809082, "learning_rate": 7.340425531914893e-07, "loss": 1.0846, "step": 70 }, { "epoch": 0.024, "grad_norm": 8.114184379577637, "learning_rate": 7.872340425531915e-07, "loss": 1.151, "step": 75 }, { "epoch": 0.0256, "grad_norm": 17.945985794067383, "learning_rate": 8.404255319148936e-07, "loss": 1.0711, "step": 80 }, { "epoch": 0.0272, "grad_norm": 5.469882965087891, "learning_rate": 8.936170212765957e-07, "loss": 0.9901, "step": 85 }, { "epoch": 0.0288, "grad_norm": 10.689380645751953, "learning_rate": 9.468085106382978e-07, "loss": 1.2635, "step": 90 }, { "epoch": 0.0304, "grad_norm": 9.217676162719727, "learning_rate": 1e-06, "loss": 1.0604, "step": 95 }, { "epoch": 0.032, "grad_norm": 4.253987789154053, "learning_rate": 9.98350379412735e-07, "loss": 1.1865, "step": 100 }, { "epoch": 0.0336, "grad_norm": 10.984722137451172, "learning_rate": 9.967007588254702e-07, "loss": 1.1504, "step": 105 }, { "epoch": 0.0352, "grad_norm": 6.79901647567749, "learning_rate": 9.950511382382052e-07, "loss": 1.0338, "step": 110 }, { "epoch": 0.0368, "grad_norm": 10.826879501342773, "learning_rate": 9.934015176509404e-07, "loss": 1.1058, "step": 115 }, { "epoch": 0.0384, "grad_norm": 6.437893390655518, "learning_rate": 9.917518970636754e-07, "loss": 1.0317, "step": 120 }, { "epoch": 0.04, "grad_norm": 3.419424295425415, "learning_rate": 9.901022764764103e-07, "loss": 1.0573, "step": 125 }, { "epoch": 0.0416, "grad_norm": 14.196756362915039, "learning_rate": 9.884526558891456e-07, "loss": 1.0892, "step": 130 }, { "epoch": 0.0432, "grad_norm": 4.492501258850098, "learning_rate": 9.868030353018806e-07, "loss": 0.9772, "step": 135 }, { "epoch": 0.0448, "grad_norm": 8.752493858337402, "learning_rate": 9.851534147146155e-07, "loss": 0.984, "step": 140 }, { "epoch": 0.0464, "grad_norm": 3.8549039363861084, "learning_rate": 9.835037941273505e-07, "loss": 0.9935, "step": 145 }, { "epoch": 0.048, "grad_norm": 5.623340129852295, "learning_rate": 9.818541735400857e-07, "loss": 1.1332, "step": 150 }, { "epoch": 0.0496, "grad_norm": 5.848334789276123, "learning_rate": 9.802045529528207e-07, "loss": 0.912, "step": 155 }, { "epoch": 0.0512, "grad_norm": 7.2841877937316895, "learning_rate": 9.78554932365556e-07, "loss": 1.1562, "step": 160 }, { "epoch": 0.0528, "grad_norm": 5.735965728759766, "learning_rate": 9.76905311778291e-07, "loss": 0.9819, "step": 165 }, { "epoch": 0.0544, "grad_norm": 3.197845220565796, "learning_rate": 9.75255691191026e-07, "loss": 0.9843, "step": 170 }, { "epoch": 0.056, "grad_norm": 8.21580696105957, "learning_rate": 9.736060706037611e-07, "loss": 0.9533, "step": 175 }, { "epoch": 0.0576, "grad_norm": 5.551872730255127, "learning_rate": 9.719564500164961e-07, "loss": 1.0205, "step": 180 }, { "epoch": 0.0592, "grad_norm": 5.322945594787598, "learning_rate": 9.703068294292313e-07, "loss": 1.0036, "step": 185 }, { "epoch": 0.0608, "grad_norm": 4.90363883972168, "learning_rate": 9.686572088419663e-07, "loss": 1.0135, "step": 190 }, { "epoch": 0.0624, "grad_norm": 8.087169647216797, "learning_rate": 9.670075882547013e-07, "loss": 1.0293, "step": 195 }, { "epoch": 0.064, "grad_norm": 4.1587910652160645, "learning_rate": 9.653579676674365e-07, "loss": 1.0719, "step": 200 }, { "epoch": 0.0656, "grad_norm": 3.2837698459625244, "learning_rate": 9.637083470801715e-07, "loss": 1.0107, "step": 205 }, { "epoch": 0.0672, "grad_norm": 5.359975814819336, "learning_rate": 9.620587264929065e-07, "loss": 0.9036, "step": 210 }, { "epoch": 0.0688, "grad_norm": 5.2580037117004395, "learning_rate": 9.604091059056415e-07, "loss": 0.9704, "step": 215 }, { "epoch": 0.0704, "grad_norm": 9.796117782592773, "learning_rate": 9.587594853183767e-07, "loss": 0.9456, "step": 220 }, { "epoch": 0.072, "grad_norm": 10.24465560913086, "learning_rate": 9.571098647311117e-07, "loss": 1.0502, "step": 225 }, { "epoch": 0.0736, "grad_norm": 7.899555683135986, "learning_rate": 9.55460244143847e-07, "loss": 0.9849, "step": 230 }, { "epoch": 0.0752, "grad_norm": 6.677064418792725, "learning_rate": 9.53810623556582e-07, "loss": 1.103, "step": 235 }, { "epoch": 0.0768, "grad_norm": 8.85185718536377, "learning_rate": 9.52161002969317e-07, "loss": 0.9351, "step": 240 }, { "epoch": 0.0784, "grad_norm": 7.826456069946289, "learning_rate": 9.505113823820521e-07, "loss": 0.9369, "step": 245 }, { "epoch": 0.08, "grad_norm": 7.929803371429443, "learning_rate": 9.488617617947871e-07, "loss": 0.8399, "step": 250 }, { "epoch": 0.0816, "grad_norm": 4.857858180999756, "learning_rate": 9.472121412075222e-07, "loss": 0.9738, "step": 255 }, { "epoch": 0.0832, "grad_norm": 7.609739303588867, "learning_rate": 9.455625206202573e-07, "loss": 1.0397, "step": 260 }, { "epoch": 0.0848, "grad_norm": 4.687152862548828, "learning_rate": 9.439129000329924e-07, "loss": 1.0207, "step": 265 }, { "epoch": 0.0864, "grad_norm": 3.707120180130005, "learning_rate": 9.422632794457274e-07, "loss": 1.0404, "step": 270 }, { "epoch": 0.088, "grad_norm": 7.183946132659912, "learning_rate": 9.406136588584625e-07, "loss": 0.9329, "step": 275 }, { "epoch": 0.0896, "grad_norm": 4.107320785522461, "learning_rate": 9.389640382711976e-07, "loss": 0.9674, "step": 280 }, { "epoch": 0.0912, "grad_norm": 2.987569808959961, "learning_rate": 9.373144176839326e-07, "loss": 1.0431, "step": 285 }, { "epoch": 0.0928, "grad_norm": 7.492343902587891, "learning_rate": 9.356647970966677e-07, "loss": 0.8819, "step": 290 }, { "epoch": 0.0944, "grad_norm": 3.460360050201416, "learning_rate": 9.340151765094027e-07, "loss": 1.1553, "step": 295 }, { "epoch": 0.096, "grad_norm": 4.670125961303711, "learning_rate": 9.323655559221378e-07, "loss": 0.9511, "step": 300 }, { "epoch": 0.0976, "grad_norm": 6.766526699066162, "learning_rate": 9.307159353348729e-07, "loss": 0.9625, "step": 305 }, { "epoch": 0.0992, "grad_norm": 3.6760966777801514, "learning_rate": 9.29066314747608e-07, "loss": 0.9503, "step": 310 }, { "epoch": 0.1008, "grad_norm": 9.794249534606934, "learning_rate": 9.27416694160343e-07, "loss": 0.9533, "step": 315 }, { "epoch": 0.1024, "grad_norm": 8.894811630249023, "learning_rate": 9.257670735730781e-07, "loss": 0.8668, "step": 320 }, { "epoch": 0.104, "grad_norm": 3.3411269187927246, "learning_rate": 9.241174529858132e-07, "loss": 0.9171, "step": 325 }, { "epoch": 0.1056, "grad_norm": 9.227668762207031, "learning_rate": 9.224678323985483e-07, "loss": 0.9065, "step": 330 }, { "epoch": 0.1072, "grad_norm": 3.217501640319824, "learning_rate": 9.208182118112834e-07, "loss": 0.945, "step": 335 }, { "epoch": 0.1088, "grad_norm": 3.9283738136291504, "learning_rate": 9.191685912240184e-07, "loss": 0.9166, "step": 340 }, { "epoch": 0.1104, "grad_norm": 7.905593395233154, "learning_rate": 9.175189706367535e-07, "loss": 1.0471, "step": 345 }, { "epoch": 0.112, "grad_norm": 3.8356964588165283, "learning_rate": 9.158693500494886e-07, "loss": 0.9593, "step": 350 }, { "epoch": 0.1136, "grad_norm": 4.25161600112915, "learning_rate": 9.142197294622237e-07, "loss": 0.9251, "step": 355 }, { "epoch": 0.1152, "grad_norm": 2.885007381439209, "learning_rate": 9.125701088749587e-07, "loss": 0.9124, "step": 360 }, { "epoch": 0.1168, "grad_norm": 6.2251763343811035, "learning_rate": 9.109204882876937e-07, "loss": 1.0204, "step": 365 }, { "epoch": 0.1184, "grad_norm": 5.769200801849365, "learning_rate": 9.092708677004288e-07, "loss": 0.8877, "step": 370 }, { "epoch": 0.12, "grad_norm": 6.167758941650391, "learning_rate": 9.076212471131639e-07, "loss": 0.9371, "step": 375 }, { "epoch": 0.1216, "grad_norm": 4.101468563079834, "learning_rate": 9.05971626525899e-07, "loss": 0.9571, "step": 380 }, { "epoch": 0.1232, "grad_norm": 3.352560043334961, "learning_rate": 9.04322005938634e-07, "loss": 0.9988, "step": 385 }, { "epoch": 0.1248, "grad_norm": 4.724895000457764, "learning_rate": 9.026723853513691e-07, "loss": 0.972, "step": 390 }, { "epoch": 0.1264, "grad_norm": 5.613089561462402, "learning_rate": 9.010227647641042e-07, "loss": 0.8806, "step": 395 }, { "epoch": 0.128, "grad_norm": 4.489896774291992, "learning_rate": 8.993731441768393e-07, "loss": 0.9119, "step": 400 }, { "epoch": 0.1296, "grad_norm": 6.169275760650635, "learning_rate": 8.977235235895744e-07, "loss": 0.9942, "step": 405 }, { "epoch": 0.1312, "grad_norm": 4.484738826751709, "learning_rate": 8.960739030023094e-07, "loss": 0.8567, "step": 410 }, { "epoch": 0.1328, "grad_norm": 3.1110870838165283, "learning_rate": 8.944242824150445e-07, "loss": 0.912, "step": 415 }, { "epoch": 0.1344, "grad_norm": 7.398248672485352, "learning_rate": 8.927746618277796e-07, "loss": 0.9913, "step": 420 }, { "epoch": 0.136, "grad_norm": 6.651194095611572, "learning_rate": 8.911250412405147e-07, "loss": 0.8492, "step": 425 }, { "epoch": 0.1376, "grad_norm": 6.037364482879639, "learning_rate": 8.894754206532498e-07, "loss": 0.9839, "step": 430 }, { "epoch": 0.1392, "grad_norm": 3.9697699546813965, "learning_rate": 8.878258000659847e-07, "loss": 0.9343, "step": 435 }, { "epoch": 0.1408, "grad_norm": 4.4155497550964355, "learning_rate": 8.861761794787198e-07, "loss": 0.9474, "step": 440 }, { "epoch": 0.1424, "grad_norm": 4.292988300323486, "learning_rate": 8.845265588914549e-07, "loss": 1.0596, "step": 445 }, { "epoch": 0.144, "grad_norm": 2.772756338119507, "learning_rate": 8.8287693830419e-07, "loss": 0.9765, "step": 450 }, { "epoch": 0.1456, "grad_norm": 7.738980770111084, "learning_rate": 8.81227317716925e-07, "loss": 0.8599, "step": 455 }, { "epoch": 0.1472, "grad_norm": 9.246415138244629, "learning_rate": 8.795776971296601e-07, "loss": 0.9711, "step": 460 }, { "epoch": 0.1488, "grad_norm": 5.940875053405762, "learning_rate": 8.779280765423952e-07, "loss": 0.9433, "step": 465 }, { "epoch": 0.1504, "grad_norm": 6.259022235870361, "learning_rate": 8.762784559551303e-07, "loss": 0.9859, "step": 470 }, { "epoch": 0.152, "grad_norm": 7.941705226898193, "learning_rate": 8.746288353678654e-07, "loss": 0.8696, "step": 475 }, { "epoch": 0.1536, "grad_norm": 3.571704626083374, "learning_rate": 8.729792147806004e-07, "loss": 0.943, "step": 480 }, { "epoch": 0.1552, "grad_norm": 4.129303455352783, "learning_rate": 8.713295941933355e-07, "loss": 0.8251, "step": 485 }, { "epoch": 0.1568, "grad_norm": 8.326216697692871, "learning_rate": 8.696799736060706e-07, "loss": 0.9393, "step": 490 }, { "epoch": 0.1584, "grad_norm": 2.903012275695801, "learning_rate": 8.680303530188057e-07, "loss": 0.8944, "step": 495 }, { "epoch": 0.16, "grad_norm": 5.4961628913879395, "learning_rate": 8.663807324315408e-07, "loss": 1.0191, "step": 500 }, { "epoch": 0.1616, "grad_norm": 17.958810806274414, "learning_rate": 8.647311118442758e-07, "loss": 0.8689, "step": 505 }, { "epoch": 0.1632, "grad_norm": 7.708248138427734, "learning_rate": 8.630814912570108e-07, "loss": 0.9344, "step": 510 }, { "epoch": 0.1648, "grad_norm": 3.0089898109436035, "learning_rate": 8.614318706697459e-07, "loss": 0.9085, "step": 515 }, { "epoch": 0.1664, "grad_norm": 3.333603858947754, "learning_rate": 8.59782250082481e-07, "loss": 0.9389, "step": 520 }, { "epoch": 0.168, "grad_norm": 4.273075580596924, "learning_rate": 8.58132629495216e-07, "loss": 0.9545, "step": 525 }, { "epoch": 0.1696, "grad_norm": 3.9365367889404297, "learning_rate": 8.564830089079511e-07, "loss": 1.044, "step": 530 }, { "epoch": 0.1712, "grad_norm": 8.090559959411621, "learning_rate": 8.548333883206862e-07, "loss": 0.8949, "step": 535 }, { "epoch": 0.1728, "grad_norm": 3.675675868988037, "learning_rate": 8.531837677334213e-07, "loss": 0.8984, "step": 540 }, { "epoch": 0.1744, "grad_norm": 4.396546840667725, "learning_rate": 8.515341471461564e-07, "loss": 0.8276, "step": 545 }, { "epoch": 0.176, "grad_norm": 5.8129706382751465, "learning_rate": 8.498845265588914e-07, "loss": 0.9485, "step": 550 }, { "epoch": 0.1776, "grad_norm": 4.243994235992432, "learning_rate": 8.482349059716265e-07, "loss": 0.8613, "step": 555 }, { "epoch": 0.1792, "grad_norm": 3.252338409423828, "learning_rate": 8.465852853843616e-07, "loss": 0.9014, "step": 560 }, { "epoch": 0.1808, "grad_norm": 2.9563217163085938, "learning_rate": 8.449356647970967e-07, "loss": 0.9632, "step": 565 }, { "epoch": 0.1824, "grad_norm": 4.2105302810668945, "learning_rate": 8.432860442098317e-07, "loss": 0.8758, "step": 570 }, { "epoch": 0.184, "grad_norm": 4.756136417388916, "learning_rate": 8.416364236225668e-07, "loss": 0.9102, "step": 575 }, { "epoch": 0.1856, "grad_norm": 3.9406795501708984, "learning_rate": 8.399868030353019e-07, "loss": 0.9286, "step": 580 }, { "epoch": 0.1872, "grad_norm": 5.04837703704834, "learning_rate": 8.383371824480369e-07, "loss": 1.0353, "step": 585 }, { "epoch": 0.1888, "grad_norm": 2.621293306350708, "learning_rate": 8.36687561860772e-07, "loss": 0.9047, "step": 590 }, { "epoch": 0.1904, "grad_norm": 4.60697078704834, "learning_rate": 8.35037941273507e-07, "loss": 0.9403, "step": 595 }, { "epoch": 0.192, "grad_norm": 3.5132675170898438, "learning_rate": 8.333883206862421e-07, "loss": 0.9576, "step": 600 }, { "epoch": 0.1936, "grad_norm": 6.349253177642822, "learning_rate": 8.317387000989772e-07, "loss": 0.9443, "step": 605 }, { "epoch": 0.1952, "grad_norm": 4.456959247589111, "learning_rate": 8.300890795117123e-07, "loss": 0.9476, "step": 610 }, { "epoch": 0.1968, "grad_norm": 3.2254133224487305, "learning_rate": 8.284394589244474e-07, "loss": 0.9401, "step": 615 }, { "epoch": 0.1984, "grad_norm": 4.775053977966309, "learning_rate": 8.267898383371824e-07, "loss": 1.0189, "step": 620 }, { "epoch": 0.2, "grad_norm": 5.755006790161133, "learning_rate": 8.251402177499175e-07, "loss": 0.8521, "step": 625 }, { "epoch": 0.2016, "grad_norm": 4.0426716804504395, "learning_rate": 8.234905971626526e-07, "loss": 0.9075, "step": 630 }, { "epoch": 0.2032, "grad_norm": 4.348297119140625, "learning_rate": 8.218409765753877e-07, "loss": 0.9625, "step": 635 }, { "epoch": 0.2048, "grad_norm": 6.654307842254639, "learning_rate": 8.201913559881227e-07, "loss": 0.8582, "step": 640 }, { "epoch": 0.2064, "grad_norm": 4.090488910675049, "learning_rate": 8.185417354008578e-07, "loss": 0.9359, "step": 645 }, { "epoch": 0.208, "grad_norm": 5.286700248718262, "learning_rate": 8.168921148135929e-07, "loss": 0.9783, "step": 650 }, { "epoch": 0.2096, "grad_norm": 4.438581466674805, "learning_rate": 8.15242494226328e-07, "loss": 0.9526, "step": 655 }, { "epoch": 0.2112, "grad_norm": 3.836512565612793, "learning_rate": 8.13592873639063e-07, "loss": 0.8768, "step": 660 }, { "epoch": 0.2128, "grad_norm": 3.1037371158599854, "learning_rate": 8.11943253051798e-07, "loss": 0.963, "step": 665 }, { "epoch": 0.2144, "grad_norm": 5.103884696960449, "learning_rate": 8.102936324645331e-07, "loss": 0.9322, "step": 670 }, { "epoch": 0.216, "grad_norm": 3.707827091217041, "learning_rate": 8.086440118772682e-07, "loss": 0.8691, "step": 675 }, { "epoch": 0.2176, "grad_norm": 3.7925477027893066, "learning_rate": 8.069943912900033e-07, "loss": 0.9636, "step": 680 }, { "epoch": 0.2192, "grad_norm": 4.857919692993164, "learning_rate": 8.053447707027383e-07, "loss": 0.8867, "step": 685 }, { "epoch": 0.2208, "grad_norm": 3.513091564178467, "learning_rate": 8.036951501154734e-07, "loss": 0.9646, "step": 690 }, { "epoch": 0.2224, "grad_norm": 3.4893958568573, "learning_rate": 8.020455295282085e-07, "loss": 0.9276, "step": 695 }, { "epoch": 0.224, "grad_norm": 3.087334156036377, "learning_rate": 8.003959089409436e-07, "loss": 0.89, "step": 700 }, { "epoch": 0.2256, "grad_norm": 4.584767818450928, "learning_rate": 7.987462883536787e-07, "loss": 1.0421, "step": 705 }, { "epoch": 0.2272, "grad_norm": 3.7277486324310303, "learning_rate": 7.970966677664137e-07, "loss": 0.8757, "step": 710 }, { "epoch": 0.2288, "grad_norm": 4.989358425140381, "learning_rate": 7.954470471791488e-07, "loss": 0.8421, "step": 715 }, { "epoch": 0.2304, "grad_norm": 4.230960845947266, "learning_rate": 7.937974265918839e-07, "loss": 0.883, "step": 720 }, { "epoch": 0.232, "grad_norm": 7.118892192840576, "learning_rate": 7.92147806004619e-07, "loss": 0.9549, "step": 725 }, { "epoch": 0.2336, "grad_norm": 10.041189193725586, "learning_rate": 7.904981854173541e-07, "loss": 0.9897, "step": 730 }, { "epoch": 0.2352, "grad_norm": 3.822767734527588, "learning_rate": 7.88848564830089e-07, "loss": 0.8711, "step": 735 }, { "epoch": 0.2368, "grad_norm": 4.475744724273682, "learning_rate": 7.871989442428241e-07, "loss": 0.9175, "step": 740 }, { "epoch": 0.2384, "grad_norm": 5.1056976318359375, "learning_rate": 7.855493236555592e-07, "loss": 0.9022, "step": 745 }, { "epoch": 0.24, "grad_norm": 4.522522926330566, "learning_rate": 7.838997030682943e-07, "loss": 1.0007, "step": 750 }, { "epoch": 0.2416, "grad_norm": 4.390966415405273, "learning_rate": 7.822500824810293e-07, "loss": 0.9074, "step": 755 }, { "epoch": 0.2432, "grad_norm": 7.068999290466309, "learning_rate": 7.806004618937644e-07, "loss": 0.8454, "step": 760 }, { "epoch": 0.2448, "grad_norm": 3.558549642562866, "learning_rate": 7.789508413064995e-07, "loss": 0.88, "step": 765 }, { "epoch": 0.2464, "grad_norm": 4.7729949951171875, "learning_rate": 7.773012207192346e-07, "loss": 0.8577, "step": 770 }, { "epoch": 0.248, "grad_norm": 3.512878894805908, "learning_rate": 7.756516001319697e-07, "loss": 1.0939, "step": 775 }, { "epoch": 0.2496, "grad_norm": 2.6263558864593506, "learning_rate": 7.740019795447047e-07, "loss": 0.8807, "step": 780 }, { "epoch": 0.2512, "grad_norm": 2.518568992614746, "learning_rate": 7.723523589574398e-07, "loss": 0.7706, "step": 785 }, { "epoch": 0.2528, "grad_norm": 5.156455993652344, "learning_rate": 7.707027383701749e-07, "loss": 0.8809, "step": 790 }, { "epoch": 0.2544, "grad_norm": 7.734130382537842, "learning_rate": 7.6905311778291e-07, "loss": 0.967, "step": 795 }, { "epoch": 0.256, "grad_norm": 2.6134588718414307, "learning_rate": 7.674034971956451e-07, "loss": 0.7965, "step": 800 }, { "epoch": 0.2576, "grad_norm": 9.95977783203125, "learning_rate": 7.657538766083801e-07, "loss": 0.9095, "step": 805 }, { "epoch": 0.2592, "grad_norm": 3.093651533126831, "learning_rate": 7.64104256021115e-07, "loss": 0.9707, "step": 810 }, { "epoch": 0.2608, "grad_norm": 3.039573907852173, "learning_rate": 7.624546354338501e-07, "loss": 0.9317, "step": 815 }, { "epoch": 0.2624, "grad_norm": 4.40300989151001, "learning_rate": 7.608050148465853e-07, "loss": 0.8223, "step": 820 }, { "epoch": 0.264, "grad_norm": 5.527564525604248, "learning_rate": 7.591553942593202e-07, "loss": 0.9234, "step": 825 }, { "epoch": 0.2656, "grad_norm": 5.654271602630615, "learning_rate": 7.575057736720553e-07, "loss": 0.9437, "step": 830 }, { "epoch": 0.2672, "grad_norm": 5.316553115844727, "learning_rate": 7.558561530847904e-07, "loss": 0.8862, "step": 835 }, { "epoch": 0.2688, "grad_norm": 2.8125505447387695, "learning_rate": 7.542065324975255e-07, "loss": 0.8235, "step": 840 }, { "epoch": 0.2704, "grad_norm": 5.254530429840088, "learning_rate": 7.525569119102606e-07, "loss": 0.8986, "step": 845 }, { "epoch": 0.272, "grad_norm": 5.69275426864624, "learning_rate": 7.509072913229956e-07, "loss": 0.9043, "step": 850 }, { "epoch": 0.2736, "grad_norm": 4.995587348937988, "learning_rate": 7.492576707357307e-07, "loss": 0.8242, "step": 855 }, { "epoch": 0.2752, "grad_norm": 8.568499565124512, "learning_rate": 7.476080501484658e-07, "loss": 0.9116, "step": 860 }, { "epoch": 0.2768, "grad_norm": 5.804699420928955, "learning_rate": 7.45958429561201e-07, "loss": 0.863, "step": 865 }, { "epoch": 0.2784, "grad_norm": 7.921741962432861, "learning_rate": 7.44308808973936e-07, "loss": 0.9719, "step": 870 }, { "epoch": 0.28, "grad_norm": 5.888009071350098, "learning_rate": 7.42659188386671e-07, "loss": 0.8999, "step": 875 }, { "epoch": 0.2816, "grad_norm": 3.9009482860565186, "learning_rate": 7.410095677994061e-07, "loss": 0.9543, "step": 880 }, { "epoch": 0.2832, "grad_norm": 3.502060651779175, "learning_rate": 7.393599472121411e-07, "loss": 0.8367, "step": 885 }, { "epoch": 0.2848, "grad_norm": 4.675789833068848, "learning_rate": 7.377103266248762e-07, "loss": 0.8843, "step": 890 }, { "epoch": 0.2864, "grad_norm": 3.246445417404175, "learning_rate": 7.360607060376112e-07, "loss": 0.9663, "step": 895 }, { "epoch": 0.288, "grad_norm": 3.168081521987915, "learning_rate": 7.344110854503463e-07, "loss": 0.8186, "step": 900 }, { "epoch": 0.2896, "grad_norm": 5.229098320007324, "learning_rate": 7.327614648630814e-07, "loss": 0.9686, "step": 905 }, { "epoch": 0.2912, "grad_norm": 6.258688926696777, "learning_rate": 7.311118442758165e-07, "loss": 0.8538, "step": 910 }, { "epoch": 0.2928, "grad_norm": 5.021489143371582, "learning_rate": 7.294622236885516e-07, "loss": 0.7937, "step": 915 }, { "epoch": 0.2944, "grad_norm": 3.9986860752105713, "learning_rate": 7.278126031012866e-07, "loss": 0.8503, "step": 920 }, { "epoch": 0.296, "grad_norm": 3.62813663482666, "learning_rate": 7.261629825140217e-07, "loss": 0.8199, "step": 925 }, { "epoch": 0.2976, "grad_norm": 4.517638683319092, "learning_rate": 7.245133619267568e-07, "loss": 0.8521, "step": 930 }, { "epoch": 0.2992, "grad_norm": 8.5663423538208, "learning_rate": 7.228637413394919e-07, "loss": 0.9132, "step": 935 }, { "epoch": 0.3008, "grad_norm": 4.776329040527344, "learning_rate": 7.212141207522269e-07, "loss": 0.8566, "step": 940 }, { "epoch": 0.3024, "grad_norm": 4.764708518981934, "learning_rate": 7.19564500164962e-07, "loss": 0.8181, "step": 945 }, { "epoch": 0.304, "grad_norm": 3.810011863708496, "learning_rate": 7.179148795776971e-07, "loss": 1.003, "step": 950 }, { "epoch": 0.3056, "grad_norm": 4.385900020599365, "learning_rate": 7.162652589904322e-07, "loss": 0.8525, "step": 955 }, { "epoch": 0.3072, "grad_norm": 8.322861671447754, "learning_rate": 7.146156384031672e-07, "loss": 0.9047, "step": 960 }, { "epoch": 0.3088, "grad_norm": 6.5899224281311035, "learning_rate": 7.129660178159022e-07, "loss": 0.7103, "step": 965 }, { "epoch": 0.3104, "grad_norm": 6.054207801818848, "learning_rate": 7.113163972286373e-07, "loss": 0.876, "step": 970 }, { "epoch": 0.312, "grad_norm": 3.6956799030303955, "learning_rate": 7.096667766413724e-07, "loss": 0.9382, "step": 975 }, { "epoch": 0.3136, "grad_norm": 4.055649757385254, "learning_rate": 7.080171560541075e-07, "loss": 0.8944, "step": 980 }, { "epoch": 0.3152, "grad_norm": 2.8398051261901855, "learning_rate": 7.063675354668426e-07, "loss": 0.8233, "step": 985 }, { "epoch": 0.3168, "grad_norm": 5.903645992279053, "learning_rate": 7.047179148795776e-07, "loss": 1.0067, "step": 990 }, { "epoch": 0.3184, "grad_norm": 5.374630451202393, "learning_rate": 7.030682942923127e-07, "loss": 0.8423, "step": 995 }, { "epoch": 0.32, "grad_norm": 6.729516506195068, "learning_rate": 7.014186737050478e-07, "loss": 1.0209, "step": 1000 }, { "epoch": 0.3216, "grad_norm": 3.4207727909088135, "learning_rate": 6.997690531177829e-07, "loss": 0.9196, "step": 1005 }, { "epoch": 0.3232, "grad_norm": 5.797353744506836, "learning_rate": 6.981194325305179e-07, "loss": 0.921, "step": 1010 }, { "epoch": 0.3248, "grad_norm": 4.802167892456055, "learning_rate": 6.96469811943253e-07, "loss": 0.8844, "step": 1015 }, { "epoch": 0.3264, "grad_norm": 6.671936511993408, "learning_rate": 6.948201913559881e-07, "loss": 0.8399, "step": 1020 }, { "epoch": 0.328, "grad_norm": 4.027926921844482, "learning_rate": 6.931705707687232e-07, "loss": 0.8733, "step": 1025 }, { "epoch": 0.3296, "grad_norm": 7.0996575355529785, "learning_rate": 6.915209501814583e-07, "loss": 0.8886, "step": 1030 }, { "epoch": 0.3312, "grad_norm": 3.9534752368927, "learning_rate": 6.898713295941932e-07, "loss": 0.9144, "step": 1035 }, { "epoch": 0.3328, "grad_norm": 5.016911506652832, "learning_rate": 6.882217090069283e-07, "loss": 0.9936, "step": 1040 }, { "epoch": 0.3344, "grad_norm": 3.6231181621551514, "learning_rate": 6.865720884196634e-07, "loss": 0.7205, "step": 1045 }, { "epoch": 0.336, "grad_norm": 3.6556529998779297, "learning_rate": 6.849224678323985e-07, "loss": 0.914, "step": 1050 }, { "epoch": 0.3376, "grad_norm": 3.27970552444458, "learning_rate": 6.832728472451335e-07, "loss": 0.8306, "step": 1055 }, { "epoch": 0.3392, "grad_norm": 3.816570997238159, "learning_rate": 6.816232266578686e-07, "loss": 0.9458, "step": 1060 }, { "epoch": 0.3408, "grad_norm": 7.391907215118408, "learning_rate": 6.799736060706037e-07, "loss": 0.8389, "step": 1065 }, { "epoch": 0.3424, "grad_norm": 3.756998300552368, "learning_rate": 6.783239854833388e-07, "loss": 0.8889, "step": 1070 }, { "epoch": 0.344, "grad_norm": 4.19740629196167, "learning_rate": 6.766743648960739e-07, "loss": 0.8942, "step": 1075 }, { "epoch": 0.3456, "grad_norm": 4.351824760437012, "learning_rate": 6.750247443088089e-07, "loss": 0.9602, "step": 1080 }, { "epoch": 0.3472, "grad_norm": 3.371953010559082, "learning_rate": 6.73375123721544e-07, "loss": 0.9365, "step": 1085 }, { "epoch": 0.3488, "grad_norm": 5.847254753112793, "learning_rate": 6.717255031342791e-07, "loss": 0.7705, "step": 1090 }, { "epoch": 0.3504, "grad_norm": 5.160730361938477, "learning_rate": 6.700758825470142e-07, "loss": 0.9221, "step": 1095 }, { "epoch": 0.352, "grad_norm": 3.4430952072143555, "learning_rate": 6.684262619597493e-07, "loss": 0.9484, "step": 1100 }, { "epoch": 0.3536, "grad_norm": 4.218683242797852, "learning_rate": 6.667766413724843e-07, "loss": 0.8446, "step": 1105 }, { "epoch": 0.3552, "grad_norm": 5.120244026184082, "learning_rate": 6.651270207852193e-07, "loss": 0.9515, "step": 1110 }, { "epoch": 0.3568, "grad_norm": 5.609252452850342, "learning_rate": 6.634774001979544e-07, "loss": 0.8694, "step": 1115 }, { "epoch": 0.3584, "grad_norm": 3.753680467605591, "learning_rate": 6.618277796106895e-07, "loss": 0.9242, "step": 1120 }, { "epoch": 0.36, "grad_norm": 3.665069580078125, "learning_rate": 6.601781590234245e-07, "loss": 0.9034, "step": 1125 }, { "epoch": 0.3616, "grad_norm": 4.715619087219238, "learning_rate": 6.585285384361596e-07, "loss": 0.8842, "step": 1130 }, { "epoch": 0.3632, "grad_norm": 4.438577651977539, "learning_rate": 6.568789178488947e-07, "loss": 0.9432, "step": 1135 }, { "epoch": 0.3648, "grad_norm": 3.8930490016937256, "learning_rate": 6.552292972616298e-07, "loss": 0.8113, "step": 1140 }, { "epoch": 0.3664, "grad_norm": 4.182096004486084, "learning_rate": 6.535796766743649e-07, "loss": 0.7999, "step": 1145 }, { "epoch": 0.368, "grad_norm": 5.353331089019775, "learning_rate": 6.519300560870999e-07, "loss": 0.8499, "step": 1150 }, { "epoch": 0.3696, "grad_norm": 3.641796588897705, "learning_rate": 6.50280435499835e-07, "loss": 0.9247, "step": 1155 }, { "epoch": 0.3712, "grad_norm": 4.2418646812438965, "learning_rate": 6.486308149125701e-07, "loss": 1.0406, "step": 1160 }, { "epoch": 0.3728, "grad_norm": 2.948838233947754, "learning_rate": 6.469811943253052e-07, "loss": 0.8411, "step": 1165 }, { "epoch": 0.3744, "grad_norm": 7.832685947418213, "learning_rate": 6.453315737380403e-07, "loss": 0.9339, "step": 1170 }, { "epoch": 0.376, "grad_norm": 3.882305145263672, "learning_rate": 6.436819531507753e-07, "loss": 0.9936, "step": 1175 }, { "epoch": 0.3776, "grad_norm": 6.916220664978027, "learning_rate": 6.420323325635104e-07, "loss": 1.0441, "step": 1180 }, { "epoch": 0.3792, "grad_norm": 6.770009517669678, "learning_rate": 6.403827119762454e-07, "loss": 0.9299, "step": 1185 }, { "epoch": 0.3808, "grad_norm": 4.584465980529785, "learning_rate": 6.387330913889805e-07, "loss": 0.9169, "step": 1190 }, { "epoch": 0.3824, "grad_norm": 8.000226974487305, "learning_rate": 6.370834708017155e-07, "loss": 1.0345, "step": 1195 }, { "epoch": 0.384, "grad_norm": 13.314818382263184, "learning_rate": 6.354338502144506e-07, "loss": 0.919, "step": 1200 }, { "epoch": 0.3856, "grad_norm": 3.7661311626434326, "learning_rate": 6.337842296271857e-07, "loss": 0.9678, "step": 1205 }, { "epoch": 0.3872, "grad_norm": 4.133317470550537, "learning_rate": 6.321346090399208e-07, "loss": 0.9533, "step": 1210 }, { "epoch": 0.3888, "grad_norm": 2.6607346534729004, "learning_rate": 6.304849884526559e-07, "loss": 0.9946, "step": 1215 }, { "epoch": 0.3904, "grad_norm": 3.8332831859588623, "learning_rate": 6.288353678653909e-07, "loss": 0.8293, "step": 1220 }, { "epoch": 0.392, "grad_norm": 3.6170003414154053, "learning_rate": 6.27185747278126e-07, "loss": 0.8952, "step": 1225 }, { "epoch": 0.3936, "grad_norm": 5.026386737823486, "learning_rate": 6.255361266908611e-07, "loss": 0.917, "step": 1230 }, { "epoch": 0.3952, "grad_norm": 10.285544395446777, "learning_rate": 6.238865061035962e-07, "loss": 0.8624, "step": 1235 }, { "epoch": 0.3968, "grad_norm": 2.899703025817871, "learning_rate": 6.222368855163313e-07, "loss": 0.9242, "step": 1240 }, { "epoch": 0.3984, "grad_norm": 6.087869167327881, "learning_rate": 6.205872649290663e-07, "loss": 0.8574, "step": 1245 }, { "epoch": 0.4, "grad_norm": 4.513827323913574, "learning_rate": 6.189376443418014e-07, "loss": 0.9205, "step": 1250 }, { "epoch": 0.4016, "grad_norm": 3.0381855964660645, "learning_rate": 6.172880237545365e-07, "loss": 0.8779, "step": 1255 }, { "epoch": 0.4032, "grad_norm": 9.265677452087402, "learning_rate": 6.156384031672715e-07, "loss": 0.8645, "step": 1260 }, { "epoch": 0.4048, "grad_norm": 5.159205436706543, "learning_rate": 6.139887825800065e-07, "loss": 0.7365, "step": 1265 }, { "epoch": 0.4064, "grad_norm": 4.157783031463623, "learning_rate": 6.123391619927416e-07, "loss": 1.003, "step": 1270 }, { "epoch": 0.408, "grad_norm": 4.129422187805176, "learning_rate": 6.106895414054767e-07, "loss": 1.0333, "step": 1275 }, { "epoch": 0.4096, "grad_norm": 5.1480536460876465, "learning_rate": 6.090399208182118e-07, "loss": 0.9097, "step": 1280 }, { "epoch": 0.4112, "grad_norm": 6.21195650100708, "learning_rate": 6.073903002309469e-07, "loss": 0.9753, "step": 1285 }, { "epoch": 0.4128, "grad_norm": 4.375741481781006, "learning_rate": 6.057406796436819e-07, "loss": 0.8613, "step": 1290 }, { "epoch": 0.4144, "grad_norm": 6.381781578063965, "learning_rate": 6.04091059056417e-07, "loss": 0.8447, "step": 1295 }, { "epoch": 0.416, "grad_norm": 6.689861297607422, "learning_rate": 6.024414384691521e-07, "loss": 0.9417, "step": 1300 }, { "epoch": 0.4176, "grad_norm": 7.573152542114258, "learning_rate": 6.007918178818872e-07, "loss": 0.8559, "step": 1305 }, { "epoch": 0.4192, "grad_norm": 3.2965288162231445, "learning_rate": 5.991421972946222e-07, "loss": 0.8483, "step": 1310 }, { "epoch": 0.4208, "grad_norm": 3.5138070583343506, "learning_rate": 5.974925767073573e-07, "loss": 0.7122, "step": 1315 }, { "epoch": 0.4224, "grad_norm": 3.6955363750457764, "learning_rate": 5.958429561200924e-07, "loss": 0.8294, "step": 1320 }, { "epoch": 0.424, "grad_norm": 3.6129462718963623, "learning_rate": 5.941933355328275e-07, "loss": 1.0534, "step": 1325 }, { "epoch": 0.4256, "grad_norm": 4.549454212188721, "learning_rate": 5.925437149455626e-07, "loss": 0.8865, "step": 1330 }, { "epoch": 0.4272, "grad_norm": 5.264537811279297, "learning_rate": 5.908940943582975e-07, "loss": 0.9219, "step": 1335 }, { "epoch": 0.4288, "grad_norm": 5.2592267990112305, "learning_rate": 5.892444737710326e-07, "loss": 0.8414, "step": 1340 }, { "epoch": 0.4304, "grad_norm": 7.621036529541016, "learning_rate": 5.875948531837677e-07, "loss": 0.8553, "step": 1345 }, { "epoch": 0.432, "grad_norm": 4.718931674957275, "learning_rate": 5.859452325965028e-07, "loss": 1.0172, "step": 1350 }, { "epoch": 0.4336, "grad_norm": 3.382890224456787, "learning_rate": 5.842956120092379e-07, "loss": 0.9208, "step": 1355 }, { "epoch": 0.4352, "grad_norm": 5.720285415649414, "learning_rate": 5.826459914219729e-07, "loss": 0.9376, "step": 1360 }, { "epoch": 0.4368, "grad_norm": 6.348506450653076, "learning_rate": 5.80996370834708e-07, "loss": 0.8526, "step": 1365 }, { "epoch": 0.4384, "grad_norm": 3.639775276184082, "learning_rate": 5.793467502474431e-07, "loss": 0.9067, "step": 1370 }, { "epoch": 0.44, "grad_norm": 4.016887187957764, "learning_rate": 5.776971296601782e-07, "loss": 0.9526, "step": 1375 }, { "epoch": 0.4416, "grad_norm": 6.822007656097412, "learning_rate": 5.760475090729132e-07, "loss": 0.776, "step": 1380 }, { "epoch": 0.4432, "grad_norm": 4.3558125495910645, "learning_rate": 5.743978884856483e-07, "loss": 0.8932, "step": 1385 }, { "epoch": 0.4448, "grad_norm": 6.4160475730896, "learning_rate": 5.727482678983834e-07, "loss": 0.9081, "step": 1390 }, { "epoch": 0.4464, "grad_norm": 3.1769394874572754, "learning_rate": 5.710986473111185e-07, "loss": 0.8688, "step": 1395 }, { "epoch": 0.448, "grad_norm": 3.9851887226104736, "learning_rate": 5.694490267238536e-07, "loss": 0.877, "step": 1400 }, { "epoch": 0.4496, "grad_norm": 5.6506571769714355, "learning_rate": 5.677994061365886e-07, "loss": 0.8325, "step": 1405 }, { "epoch": 0.4512, "grad_norm": 3.549743175506592, "learning_rate": 5.661497855493236e-07, "loss": 0.8987, "step": 1410 }, { "epoch": 0.4528, "grad_norm": 3.143094062805176, "learning_rate": 5.645001649620587e-07, "loss": 0.9429, "step": 1415 }, { "epoch": 0.4544, "grad_norm": 8.156094551086426, "learning_rate": 5.628505443747938e-07, "loss": 0.7903, "step": 1420 }, { "epoch": 0.456, "grad_norm": 4.86202335357666, "learning_rate": 5.612009237875289e-07, "loss": 0.933, "step": 1425 }, { "epoch": 0.4576, "grad_norm": 5.636049270629883, "learning_rate": 5.595513032002639e-07, "loss": 0.921, "step": 1430 }, { "epoch": 0.4592, "grad_norm": 3.5446226596832275, "learning_rate": 5.57901682612999e-07, "loss": 0.8239, "step": 1435 }, { "epoch": 0.4608, "grad_norm": 3.516528606414795, "learning_rate": 5.562520620257341e-07, "loss": 0.894, "step": 1440 }, { "epoch": 0.4624, "grad_norm": 3.1388487815856934, "learning_rate": 5.546024414384692e-07, "loss": 0.9401, "step": 1445 }, { "epoch": 0.464, "grad_norm": 3.378370761871338, "learning_rate": 5.529528208512042e-07, "loss": 0.8786, "step": 1450 }, { "epoch": 0.4656, "grad_norm": 4.898928165435791, "learning_rate": 5.513032002639393e-07, "loss": 0.8457, "step": 1455 }, { "epoch": 0.4672, "grad_norm": 8.320155143737793, "learning_rate": 5.496535796766744e-07, "loss": 0.9857, "step": 1460 }, { "epoch": 0.4688, "grad_norm": 12.393474578857422, "learning_rate": 5.480039590894095e-07, "loss": 1.0179, "step": 1465 }, { "epoch": 0.4704, "grad_norm": 3.834761142730713, "learning_rate": 5.463543385021446e-07, "loss": 0.8355, "step": 1470 }, { "epoch": 0.472, "grad_norm": 5.7657694816589355, "learning_rate": 5.447047179148796e-07, "loss": 0.906, "step": 1475 }, { "epoch": 0.4736, "grad_norm": 2.89928936958313, "learning_rate": 5.430550973276147e-07, "loss": 0.8453, "step": 1480 }, { "epoch": 0.4752, "grad_norm": 3.30023455619812, "learning_rate": 5.414054767403497e-07, "loss": 0.863, "step": 1485 }, { "epoch": 0.4768, "grad_norm": 6.904449462890625, "learning_rate": 5.397558561530848e-07, "loss": 0.8722, "step": 1490 }, { "epoch": 0.4784, "grad_norm": 2.936325788497925, "learning_rate": 5.381062355658197e-07, "loss": 0.8565, "step": 1495 }, { "epoch": 0.48, "grad_norm": 6.707699775695801, "learning_rate": 5.364566149785548e-07, "loss": 0.827, "step": 1500 }, { "epoch": 0.4816, "grad_norm": 3.5800673961639404, "learning_rate": 5.3480699439129e-07, "loss": 0.8582, "step": 1505 }, { "epoch": 0.4832, "grad_norm": 5.940330505371094, "learning_rate": 5.33157373804025e-07, "loss": 0.9983, "step": 1510 }, { "epoch": 0.4848, "grad_norm": 4.438694000244141, "learning_rate": 5.315077532167602e-07, "loss": 0.8307, "step": 1515 }, { "epoch": 0.4864, "grad_norm": 6.149857044219971, "learning_rate": 5.298581326294951e-07, "loss": 0.9128, "step": 1520 }, { "epoch": 0.488, "grad_norm": 3.874925136566162, "learning_rate": 5.282085120422302e-07, "loss": 0.966, "step": 1525 }, { "epoch": 0.4896, "grad_norm": 17.836402893066406, "learning_rate": 5.265588914549653e-07, "loss": 0.8737, "step": 1530 }, { "epoch": 0.4912, "grad_norm": 5.488133430480957, "learning_rate": 5.249092708677005e-07, "loss": 0.9353, "step": 1535 }, { "epoch": 0.4928, "grad_norm": 4.590605735778809, "learning_rate": 5.232596502804356e-07, "loss": 0.8757, "step": 1540 }, { "epoch": 0.4944, "grad_norm": 3.0040788650512695, "learning_rate": 5.216100296931705e-07, "loss": 0.8116, "step": 1545 }, { "epoch": 0.496, "grad_norm": 3.2542080879211426, "learning_rate": 5.199604091059056e-07, "loss": 0.9621, "step": 1550 }, { "epoch": 0.4976, "grad_norm": 4.786755561828613, "learning_rate": 5.183107885186407e-07, "loss": 0.9551, "step": 1555 }, { "epoch": 0.4992, "grad_norm": 5.058788299560547, "learning_rate": 5.166611679313757e-07, "loss": 0.7788, "step": 1560 }, { "epoch": 0.5008, "grad_norm": 6.7137131690979, "learning_rate": 5.150115473441107e-07, "loss": 0.8605, "step": 1565 }, { "epoch": 0.5024, "grad_norm": 5.942770004272461, "learning_rate": 5.133619267568458e-07, "loss": 0.8731, "step": 1570 }, { "epoch": 0.504, "grad_norm": 3.7935171127319336, "learning_rate": 5.117123061695809e-07, "loss": 0.8126, "step": 1575 }, { "epoch": 0.5056, "grad_norm": 3.8675737380981445, "learning_rate": 5.10062685582316e-07, "loss": 0.8163, "step": 1580 }, { "epoch": 0.5072, "grad_norm": 3.6353890895843506, "learning_rate": 5.084130649950511e-07, "loss": 0.8407, "step": 1585 }, { "epoch": 0.5088, "grad_norm": 6.919312477111816, "learning_rate": 5.067634444077861e-07, "loss": 0.8906, "step": 1590 }, { "epoch": 0.5104, "grad_norm": 4.731250286102295, "learning_rate": 5.051138238205212e-07, "loss": 0.7421, "step": 1595 }, { "epoch": 0.512, "grad_norm": 3.6495304107666016, "learning_rate": 5.034642032332563e-07, "loss": 0.8691, "step": 1600 }, { "epoch": 0.5136, "grad_norm": 3.6082992553710938, "learning_rate": 5.018145826459914e-07, "loss": 0.9252, "step": 1605 }, { "epoch": 0.5152, "grad_norm": 2.5912933349609375, "learning_rate": 5.001649620587265e-07, "loss": 0.8538, "step": 1610 }, { "epoch": 0.5168, "grad_norm": 7.729884624481201, "learning_rate": 4.985153414714615e-07, "loss": 0.7578, "step": 1615 }, { "epoch": 0.5184, "grad_norm": 4.614051342010498, "learning_rate": 4.968657208841966e-07, "loss": 0.8363, "step": 1620 }, { "epoch": 0.52, "grad_norm": 3.5848758220672607, "learning_rate": 4.952161002969316e-07, "loss": 0.902, "step": 1625 }, { "epoch": 0.5216, "grad_norm": 4.744536399841309, "learning_rate": 4.935664797096667e-07, "loss": 0.8356, "step": 1630 }, { "epoch": 0.5232, "grad_norm": 6.719925880432129, "learning_rate": 4.919168591224018e-07, "loss": 0.8663, "step": 1635 }, { "epoch": 0.5248, "grad_norm": 5.994638442993164, "learning_rate": 4.902672385351369e-07, "loss": 0.8854, "step": 1640 }, { "epoch": 0.5264, "grad_norm": 3.5340418815612793, "learning_rate": 4.88617617947872e-07, "loss": 0.738, "step": 1645 }, { "epoch": 0.528, "grad_norm": 4.414712905883789, "learning_rate": 4.86967997360607e-07, "loss": 0.8637, "step": 1650 }, { "epoch": 0.5296, "grad_norm": 3.8119003772735596, "learning_rate": 4.853183767733421e-07, "loss": 0.8948, "step": 1655 }, { "epoch": 0.5312, "grad_norm": 3.453695058822632, "learning_rate": 4.836687561860771e-07, "loss": 0.9324, "step": 1660 }, { "epoch": 0.5328, "grad_norm": 8.695696830749512, "learning_rate": 4.820191355988122e-07, "loss": 0.8486, "step": 1665 }, { "epoch": 0.5344, "grad_norm": 3.696438789367676, "learning_rate": 4.803695150115473e-07, "loss": 0.8801, "step": 1670 }, { "epoch": 0.536, "grad_norm": 5.598580360412598, "learning_rate": 4.787198944242824e-07, "loss": 0.9598, "step": 1675 }, { "epoch": 0.5376, "grad_norm": 7.447549819946289, "learning_rate": 4.770702738370175e-07, "loss": 0.7981, "step": 1680 }, { "epoch": 0.5392, "grad_norm": 3.8933768272399902, "learning_rate": 4.754206532497526e-07, "loss": 0.7836, "step": 1685 }, { "epoch": 0.5408, "grad_norm": 4.233343124389648, "learning_rate": 4.737710326624876e-07, "loss": 0.9953, "step": 1690 }, { "epoch": 0.5424, "grad_norm": 4.121957302093506, "learning_rate": 4.721214120752227e-07, "loss": 0.8909, "step": 1695 }, { "epoch": 0.544, "grad_norm": 3.326876640319824, "learning_rate": 4.704717914879577e-07, "loss": 0.8207, "step": 1700 }, { "epoch": 0.5456, "grad_norm": 4.2965006828308105, "learning_rate": 4.688221709006928e-07, "loss": 0.7338, "step": 1705 }, { "epoch": 0.5472, "grad_norm": 12.319995880126953, "learning_rate": 4.6717255031342787e-07, "loss": 0.8716, "step": 1710 }, { "epoch": 0.5488, "grad_norm": 3.4306647777557373, "learning_rate": 4.6552292972616297e-07, "loss": 0.8844, "step": 1715 }, { "epoch": 0.5504, "grad_norm": 3.8839616775512695, "learning_rate": 4.638733091388981e-07, "loss": 0.6966, "step": 1720 }, { "epoch": 0.552, "grad_norm": 4.802063941955566, "learning_rate": 4.622236885516331e-07, "loss": 1.0128, "step": 1725 }, { "epoch": 0.5536, "grad_norm": 3.7047386169433594, "learning_rate": 4.6057406796436817e-07, "loss": 0.7706, "step": 1730 }, { "epoch": 0.5552, "grad_norm": 5.304298400878906, "learning_rate": 4.589244473771032e-07, "loss": 0.7627, "step": 1735 }, { "epoch": 0.5568, "grad_norm": 3.211620330810547, "learning_rate": 4.572748267898383e-07, "loss": 0.8409, "step": 1740 }, { "epoch": 0.5584, "grad_norm": 4.873741149902344, "learning_rate": 4.5562520620257337e-07, "loss": 0.9454, "step": 1745 }, { "epoch": 0.56, "grad_norm": 3.801036834716797, "learning_rate": 4.5397558561530847e-07, "loss": 0.8148, "step": 1750 }, { "epoch": 0.5616, "grad_norm": 4.238209247589111, "learning_rate": 4.5232596502804357e-07, "loss": 0.907, "step": 1755 }, { "epoch": 0.5632, "grad_norm": 5.311016082763672, "learning_rate": 4.506763444407786e-07, "loss": 0.8865, "step": 1760 }, { "epoch": 0.5648, "grad_norm": 5.096076011657715, "learning_rate": 4.4902672385351366e-07, "loss": 0.8967, "step": 1765 }, { "epoch": 0.5664, "grad_norm": 3.0391855239868164, "learning_rate": 4.473771032662487e-07, "loss": 0.8671, "step": 1770 }, { "epoch": 0.568, "grad_norm": 3.2100040912628174, "learning_rate": 4.457274826789838e-07, "loss": 0.8144, "step": 1775 }, { "epoch": 0.5696, "grad_norm": 4.312873840332031, "learning_rate": 4.4407786209171886e-07, "loss": 0.9743, "step": 1780 }, { "epoch": 0.5712, "grad_norm": 4.577536582946777, "learning_rate": 4.4242824150445396e-07, "loss": 0.8988, "step": 1785 }, { "epoch": 0.5728, "grad_norm": 6.181716442108154, "learning_rate": 4.40778620917189e-07, "loss": 0.8709, "step": 1790 }, { "epoch": 0.5744, "grad_norm": 3.878676176071167, "learning_rate": 4.391290003299241e-07, "loss": 0.9212, "step": 1795 }, { "epoch": 0.576, "grad_norm": 3.634641170501709, "learning_rate": 4.374793797426592e-07, "loss": 0.9252, "step": 1800 }, { "epoch": 0.5776, "grad_norm": 4.589493274688721, "learning_rate": 4.358297591553942e-07, "loss": 0.9442, "step": 1805 }, { "epoch": 0.5792, "grad_norm": 3.5581719875335693, "learning_rate": 4.341801385681293e-07, "loss": 0.8068, "step": 1810 }, { "epoch": 0.5808, "grad_norm": 10.048519134521484, "learning_rate": 4.3253051798086436e-07, "loss": 0.9334, "step": 1815 }, { "epoch": 0.5824, "grad_norm": 7.136456489562988, "learning_rate": 4.3088089739359946e-07, "loss": 0.8711, "step": 1820 }, { "epoch": 0.584, "grad_norm": 12.951844215393066, "learning_rate": 4.292312768063345e-07, "loss": 0.8709, "step": 1825 }, { "epoch": 0.5856, "grad_norm": 4.377983093261719, "learning_rate": 4.275816562190696e-07, "loss": 0.8509, "step": 1830 }, { "epoch": 0.5872, "grad_norm": 4.836514472961426, "learning_rate": 4.259320356318047e-07, "loss": 0.882, "step": 1835 }, { "epoch": 0.5888, "grad_norm": 3.0401344299316406, "learning_rate": 4.242824150445397e-07, "loss": 0.9351, "step": 1840 }, { "epoch": 0.5904, "grad_norm": 4.854428768157959, "learning_rate": 4.226327944572748e-07, "loss": 0.9465, "step": 1845 }, { "epoch": 0.592, "grad_norm": 3.092222213745117, "learning_rate": 4.2098317387000985e-07, "loss": 0.8038, "step": 1850 }, { "epoch": 0.5936, "grad_norm": 5.498143196105957, "learning_rate": 4.1933355328274495e-07, "loss": 0.769, "step": 1855 }, { "epoch": 0.5952, "grad_norm": 2.4063949584960938, "learning_rate": 4.1768393269548e-07, "loss": 0.8057, "step": 1860 }, { "epoch": 0.5968, "grad_norm": 5.123895168304443, "learning_rate": 4.160343121082151e-07, "loss": 0.9554, "step": 1865 }, { "epoch": 0.5984, "grad_norm": 7.29245662689209, "learning_rate": 4.143846915209502e-07, "loss": 0.9079, "step": 1870 }, { "epoch": 0.6, "grad_norm": 2.9312267303466797, "learning_rate": 4.1273507093368525e-07, "loss": 0.8765, "step": 1875 }, { "epoch": 0.6016, "grad_norm": 3.0390522480010986, "learning_rate": 4.110854503464203e-07, "loss": 0.9539, "step": 1880 }, { "epoch": 0.6032, "grad_norm": 3.8350090980529785, "learning_rate": 4.0943582975915535e-07, "loss": 0.8397, "step": 1885 }, { "epoch": 0.6048, "grad_norm": 3.9119083881378174, "learning_rate": 4.0778620917189045e-07, "loss": 0.8769, "step": 1890 }, { "epoch": 0.6064, "grad_norm": 3.361199378967285, "learning_rate": 4.061365885846255e-07, "loss": 0.9028, "step": 1895 }, { "epoch": 0.608, "grad_norm": 4.87637186050415, "learning_rate": 4.044869679973606e-07, "loss": 0.8022, "step": 1900 }, { "epoch": 0.6096, "grad_norm": 4.546545505523682, "learning_rate": 4.028373474100957e-07, "loss": 0.7343, "step": 1905 }, { "epoch": 0.6112, "grad_norm": 2.975339651107788, "learning_rate": 4.0118772682283075e-07, "loss": 0.8335, "step": 1910 }, { "epoch": 0.6128, "grad_norm": 3.8709919452667236, "learning_rate": 3.995381062355658e-07, "loss": 0.7803, "step": 1915 }, { "epoch": 0.6144, "grad_norm": 2.690919876098633, "learning_rate": 3.9788848564830084e-07, "loss": 1.0247, "step": 1920 }, { "epoch": 0.616, "grad_norm": 4.163801193237305, "learning_rate": 3.9623886506103594e-07, "loss": 0.8764, "step": 1925 }, { "epoch": 0.6176, "grad_norm": 5.445613384246826, "learning_rate": 3.94589244473771e-07, "loss": 0.9047, "step": 1930 }, { "epoch": 0.6192, "grad_norm": 3.3369109630584717, "learning_rate": 3.929396238865061e-07, "loss": 0.9644, "step": 1935 }, { "epoch": 0.6208, "grad_norm": 2.8063957691192627, "learning_rate": 3.912900032992412e-07, "loss": 0.8411, "step": 1940 }, { "epoch": 0.6224, "grad_norm": 3.369598865509033, "learning_rate": 3.8964038271197624e-07, "loss": 0.8904, "step": 1945 }, { "epoch": 0.624, "grad_norm": 11.861967086791992, "learning_rate": 3.8799076212471134e-07, "loss": 0.8503, "step": 1950 }, { "epoch": 0.6256, "grad_norm": 3.746105670928955, "learning_rate": 3.8634114153744634e-07, "loss": 0.8549, "step": 1955 }, { "epoch": 0.6272, "grad_norm": 4.717544078826904, "learning_rate": 3.8469152095018144e-07, "loss": 0.7668, "step": 1960 }, { "epoch": 0.6288, "grad_norm": 3.0035829544067383, "learning_rate": 3.830419003629165e-07, "loss": 0.7519, "step": 1965 }, { "epoch": 0.6304, "grad_norm": 4.065003395080566, "learning_rate": 3.813922797756516e-07, "loss": 0.8116, "step": 1970 }, { "epoch": 0.632, "grad_norm": 5.251111030578613, "learning_rate": 3.7974265918838663e-07, "loss": 0.8879, "step": 1975 }, { "epoch": 0.6336, "grad_norm": 5.612459659576416, "learning_rate": 3.7809303860112173e-07, "loss": 0.7931, "step": 1980 }, { "epoch": 0.6352, "grad_norm": 4.041755199432373, "learning_rate": 3.7644341801385684e-07, "loss": 0.7963, "step": 1985 }, { "epoch": 0.6368, "grad_norm": 9.98974609375, "learning_rate": 3.7479379742659183e-07, "loss": 0.8962, "step": 1990 }, { "epoch": 0.6384, "grad_norm": 3.949065685272217, "learning_rate": 3.7314417683932693e-07, "loss": 0.9132, "step": 1995 }, { "epoch": 0.64, "grad_norm": 4.0974297523498535, "learning_rate": 3.71494556252062e-07, "loss": 0.9125, "step": 2000 }, { "epoch": 0.6416, "grad_norm": 3.70499324798584, "learning_rate": 3.698449356647971e-07, "loss": 0.7575, "step": 2005 }, { "epoch": 0.6432, "grad_norm": 4.345754623413086, "learning_rate": 3.6819531507753213e-07, "loss": 0.7473, "step": 2010 }, { "epoch": 0.6448, "grad_norm": 2.8242263793945312, "learning_rate": 3.6654569449026723e-07, "loss": 0.959, "step": 2015 }, { "epoch": 0.6464, "grad_norm": 3.6714463233947754, "learning_rate": 3.6489607390300233e-07, "loss": 0.7814, "step": 2020 }, { "epoch": 0.648, "grad_norm": 4.4022908210754395, "learning_rate": 3.632464533157374e-07, "loss": 0.8977, "step": 2025 }, { "epoch": 0.6496, "grad_norm": 3.5451886653900146, "learning_rate": 3.6159683272847243e-07, "loss": 0.8835, "step": 2030 }, { "epoch": 0.6512, "grad_norm": 5.699954509735107, "learning_rate": 3.599472121412075e-07, "loss": 0.8339, "step": 2035 }, { "epoch": 0.6528, "grad_norm": 3.2886204719543457, "learning_rate": 3.582975915539426e-07, "loss": 0.85, "step": 2040 }, { "epoch": 0.6544, "grad_norm": 2.8363375663757324, "learning_rate": 3.566479709666776e-07, "loss": 0.8195, "step": 2045 }, { "epoch": 0.656, "grad_norm": 3.734877824783325, "learning_rate": 3.549983503794127e-07, "loss": 0.8803, "step": 2050 }, { "epoch": 0.6576, "grad_norm": 2.6836330890655518, "learning_rate": 3.533487297921478e-07, "loss": 0.8574, "step": 2055 }, { "epoch": 0.6592, "grad_norm": 3.9296648502349854, "learning_rate": 3.516991092048829e-07, "loss": 0.7938, "step": 2060 }, { "epoch": 0.6608, "grad_norm": 2.973696231842041, "learning_rate": 3.500494886176179e-07, "loss": 0.8593, "step": 2065 }, { "epoch": 0.6624, "grad_norm": 4.675530433654785, "learning_rate": 3.4839986803035297e-07, "loss": 0.8456, "step": 2070 }, { "epoch": 0.664, "grad_norm": 4.891861915588379, "learning_rate": 3.4675024744308807e-07, "loss": 0.9814, "step": 2075 }, { "epoch": 0.6656, "grad_norm": 3.9921982288360596, "learning_rate": 3.451006268558231e-07, "loss": 0.8416, "step": 2080 }, { "epoch": 0.6672, "grad_norm": 3.1958041191101074, "learning_rate": 3.434510062685582e-07, "loss": 0.905, "step": 2085 }, { "epoch": 0.6688, "grad_norm": 4.344924449920654, "learning_rate": 3.418013856812933e-07, "loss": 0.8202, "step": 2090 }, { "epoch": 0.6704, "grad_norm": 7.5191569328308105, "learning_rate": 3.4015176509402837e-07, "loss": 0.9426, "step": 2095 }, { "epoch": 0.672, "grad_norm": 4.440326690673828, "learning_rate": 3.3850214450676347e-07, "loss": 0.8205, "step": 2100 }, { "epoch": 0.6736, "grad_norm": 6.4901123046875, "learning_rate": 3.3685252391949846e-07, "loss": 0.7936, "step": 2105 }, { "epoch": 0.6752, "grad_norm": 3.9426374435424805, "learning_rate": 3.3520290333223357e-07, "loss": 0.793, "step": 2110 }, { "epoch": 0.6768, "grad_norm": 5.018584728240967, "learning_rate": 3.335532827449686e-07, "loss": 0.7799, "step": 2115 }, { "epoch": 0.6784, "grad_norm": 3.7835421562194824, "learning_rate": 3.319036621577037e-07, "loss": 0.8696, "step": 2120 }, { "epoch": 0.68, "grad_norm": 6.0190839767456055, "learning_rate": 3.302540415704388e-07, "loss": 0.9139, "step": 2125 }, { "epoch": 0.6816, "grad_norm": 5.751317977905273, "learning_rate": 3.2860442098317386e-07, "loss": 0.8522, "step": 2130 }, { "epoch": 0.6832, "grad_norm": 6.684688091278076, "learning_rate": 3.2695480039590896e-07, "loss": 0.8061, "step": 2135 }, { "epoch": 0.6848, "grad_norm": 2.783705234527588, "learning_rate": 3.2530517980864396e-07, "loss": 0.7725, "step": 2140 }, { "epoch": 0.6864, "grad_norm": 4.636482238769531, "learning_rate": 3.2365555922137906e-07, "loss": 0.8479, "step": 2145 }, { "epoch": 0.688, "grad_norm": 5.260950565338135, "learning_rate": 3.220059386341141e-07, "loss": 1.0717, "step": 2150 }, { "epoch": 0.6896, "grad_norm": 5.191953659057617, "learning_rate": 3.203563180468492e-07, "loss": 0.8714, "step": 2155 }, { "epoch": 0.6912, "grad_norm": 7.279730796813965, "learning_rate": 3.187066974595843e-07, "loss": 0.8246, "step": 2160 }, { "epoch": 0.6928, "grad_norm": 2.966627359390259, "learning_rate": 3.1705707687231936e-07, "loss": 0.9053, "step": 2165 }, { "epoch": 0.6944, "grad_norm": 8.789515495300293, "learning_rate": 3.1540745628505446e-07, "loss": 0.7851, "step": 2170 }, { "epoch": 0.696, "grad_norm": 2.929105520248413, "learning_rate": 3.137578356977895e-07, "loss": 0.956, "step": 2175 }, { "epoch": 0.6976, "grad_norm": 5.6356096267700195, "learning_rate": 3.1210821511052456e-07, "loss": 0.7946, "step": 2180 }, { "epoch": 0.6992, "grad_norm": 3.3033862113952637, "learning_rate": 3.104585945232596e-07, "loss": 0.884, "step": 2185 }, { "epoch": 0.7008, "grad_norm": 5.996482849121094, "learning_rate": 3.088089739359947e-07, "loss": 0.8342, "step": 2190 }, { "epoch": 0.7024, "grad_norm": 7.644280910491943, "learning_rate": 3.0715935334872975e-07, "loss": 0.8575, "step": 2195 }, { "epoch": 0.704, "grad_norm": 5.780369281768799, "learning_rate": 3.0550973276146485e-07, "loss": 0.84, "step": 2200 }, { "epoch": 0.7056, "grad_norm": 3.7677314281463623, "learning_rate": 3.0386011217419995e-07, "loss": 0.8764, "step": 2205 }, { "epoch": 0.7072, "grad_norm": 4.153870105743408, "learning_rate": 3.02210491586935e-07, "loss": 0.8447, "step": 2210 }, { "epoch": 0.7088, "grad_norm": 6.395594120025635, "learning_rate": 3.0056087099967005e-07, "loss": 0.8499, "step": 2215 }, { "epoch": 0.7104, "grad_norm": 3.4210963249206543, "learning_rate": 2.989112504124051e-07, "loss": 0.7547, "step": 2220 }, { "epoch": 0.712, "grad_norm": 2.710740327835083, "learning_rate": 2.972616298251402e-07, "loss": 0.8286, "step": 2225 }, { "epoch": 0.7136, "grad_norm": 5.014111042022705, "learning_rate": 2.9561200923787525e-07, "loss": 0.8706, "step": 2230 }, { "epoch": 0.7152, "grad_norm": 3.8330109119415283, "learning_rate": 2.9396238865061035e-07, "loss": 0.8221, "step": 2235 }, { "epoch": 0.7168, "grad_norm": 5.695978164672852, "learning_rate": 2.9231276806334545e-07, "loss": 0.8412, "step": 2240 }, { "epoch": 0.7184, "grad_norm": 5.974388599395752, "learning_rate": 2.906631474760805e-07, "loss": 0.8235, "step": 2245 }, { "epoch": 0.72, "grad_norm": 2.9334166049957275, "learning_rate": 2.890135268888156e-07, "loss": 0.8965, "step": 2250 }, { "epoch": 0.7216, "grad_norm": 8.407828330993652, "learning_rate": 2.873639063015506e-07, "loss": 0.8991, "step": 2255 }, { "epoch": 0.7232, "grad_norm": 4.443752765655518, "learning_rate": 2.857142857142857e-07, "loss": 0.7565, "step": 2260 }, { "epoch": 0.7248, "grad_norm": 6.351187229156494, "learning_rate": 2.8406466512702074e-07, "loss": 0.8202, "step": 2265 }, { "epoch": 0.7264, "grad_norm": 4.715820789337158, "learning_rate": 2.8241504453975584e-07, "loss": 0.877, "step": 2270 }, { "epoch": 0.728, "grad_norm": 3.1347246170043945, "learning_rate": 2.8076542395249094e-07, "loss": 0.8227, "step": 2275 }, { "epoch": 0.7296, "grad_norm": 3.8322551250457764, "learning_rate": 2.79115803365226e-07, "loss": 0.7839, "step": 2280 }, { "epoch": 0.7312, "grad_norm": 4.289877414703369, "learning_rate": 2.774661827779611e-07, "loss": 0.7952, "step": 2285 }, { "epoch": 0.7328, "grad_norm": 3.775768995285034, "learning_rate": 2.758165621906961e-07, "loss": 0.7924, "step": 2290 }, { "epoch": 0.7344, "grad_norm": 4.233770370483398, "learning_rate": 2.741669416034312e-07, "loss": 0.8008, "step": 2295 }, { "epoch": 0.736, "grad_norm": 5.131399154663086, "learning_rate": 2.7251732101616624e-07, "loss": 0.8103, "step": 2300 }, { "epoch": 0.7376, "grad_norm": 7.184566497802734, "learning_rate": 2.7086770042890134e-07, "loss": 0.9024, "step": 2305 }, { "epoch": 0.7392, "grad_norm": 3.6044952869415283, "learning_rate": 2.6921807984163644e-07, "loss": 0.8925, "step": 2310 }, { "epoch": 0.7408, "grad_norm": 8.124959945678711, "learning_rate": 2.675684592543715e-07, "loss": 0.8227, "step": 2315 }, { "epoch": 0.7424, "grad_norm": 5.050447463989258, "learning_rate": 2.659188386671066e-07, "loss": 0.87, "step": 2320 }, { "epoch": 0.744, "grad_norm": 3.2532646656036377, "learning_rate": 2.6426921807984164e-07, "loss": 0.9024, "step": 2325 }, { "epoch": 0.7456, "grad_norm": 7.244692325592041, "learning_rate": 2.626195974925767e-07, "loss": 0.7402, "step": 2330 }, { "epoch": 0.7472, "grad_norm": 5.4176435470581055, "learning_rate": 2.6096997690531173e-07, "loss": 0.9476, "step": 2335 }, { "epoch": 0.7488, "grad_norm": 13.94157886505127, "learning_rate": 2.5932035631804683e-07, "loss": 0.7621, "step": 2340 }, { "epoch": 0.7504, "grad_norm": 2.509117603302002, "learning_rate": 2.5767073573078193e-07, "loss": 0.7259, "step": 2345 }, { "epoch": 0.752, "grad_norm": 3.073138952255249, "learning_rate": 2.56021115143517e-07, "loss": 0.8224, "step": 2350 }, { "epoch": 0.7536, "grad_norm": 3.9155077934265137, "learning_rate": 2.543714945562521e-07, "loss": 0.8596, "step": 2355 }, { "epoch": 0.7552, "grad_norm": 6.405920028686523, "learning_rate": 2.5272187396898713e-07, "loss": 0.8152, "step": 2360 }, { "epoch": 0.7568, "grad_norm": 3.8203928470611572, "learning_rate": 2.510722533817222e-07, "loss": 0.8907, "step": 2365 }, { "epoch": 0.7584, "grad_norm": 3.9368674755096436, "learning_rate": 2.494226327944573e-07, "loss": 0.8804, "step": 2370 }, { "epoch": 0.76, "grad_norm": 4.452835559844971, "learning_rate": 2.4777301220719233e-07, "loss": 0.8304, "step": 2375 }, { "epoch": 0.7616, "grad_norm": 4.987030982971191, "learning_rate": 2.461233916199274e-07, "loss": 0.9112, "step": 2380 }, { "epoch": 0.7632, "grad_norm": 7.84393310546875, "learning_rate": 2.444737710326625e-07, "loss": 0.8584, "step": 2385 }, { "epoch": 0.7648, "grad_norm": 3.063011646270752, "learning_rate": 2.428241504453976e-07, "loss": 0.8648, "step": 2390 }, { "epoch": 0.7664, "grad_norm": 5.494943618774414, "learning_rate": 2.411745298581326e-07, "loss": 0.8633, "step": 2395 }, { "epoch": 0.768, "grad_norm": 3.209425449371338, "learning_rate": 2.3952490927086767e-07, "loss": 0.7618, "step": 2400 }, { "epoch": 0.7696, "grad_norm": 3.163612127304077, "learning_rate": 2.3787528868360277e-07, "loss": 0.7093, "step": 2405 }, { "epoch": 0.7712, "grad_norm": 4.022956848144531, "learning_rate": 2.3622566809633785e-07, "loss": 0.7705, "step": 2410 }, { "epoch": 0.7728, "grad_norm": 3.748598575592041, "learning_rate": 2.345760475090729e-07, "loss": 0.942, "step": 2415 }, { "epoch": 0.7744, "grad_norm": 3.5295920372009277, "learning_rate": 2.3292642692180797e-07, "loss": 0.8094, "step": 2420 }, { "epoch": 0.776, "grad_norm": 5.767539978027344, "learning_rate": 2.3127680633454305e-07, "loss": 0.8425, "step": 2425 }, { "epoch": 0.7776, "grad_norm": 3.4246678352355957, "learning_rate": 2.2962718574727812e-07, "loss": 0.7777, "step": 2430 }, { "epoch": 0.7792, "grad_norm": 3.2887349128723145, "learning_rate": 2.2797756516001317e-07, "loss": 0.8842, "step": 2435 }, { "epoch": 0.7808, "grad_norm": 4.332666397094727, "learning_rate": 2.2632794457274827e-07, "loss": 0.8946, "step": 2440 }, { "epoch": 0.7824, "grad_norm": 3.6099178791046143, "learning_rate": 2.2467832398548334e-07, "loss": 0.929, "step": 2445 }, { "epoch": 0.784, "grad_norm": 2.9944612979888916, "learning_rate": 2.230287033982184e-07, "loss": 0.9407, "step": 2450 }, { "epoch": 0.7856, "grad_norm": 4.5248613357543945, "learning_rate": 2.2137908281095347e-07, "loss": 0.9392, "step": 2455 }, { "epoch": 0.7872, "grad_norm": 2.7737669944763184, "learning_rate": 2.1972946222368854e-07, "loss": 0.627, "step": 2460 }, { "epoch": 0.7888, "grad_norm": 2.95540452003479, "learning_rate": 2.1807984163642361e-07, "loss": 0.7346, "step": 2465 }, { "epoch": 0.7904, "grad_norm": 8.727768898010254, "learning_rate": 2.1643022104915866e-07, "loss": 0.9009, "step": 2470 }, { "epoch": 0.792, "grad_norm": 4.173437118530273, "learning_rate": 2.1478060046189376e-07, "loss": 0.8968, "step": 2475 }, { "epoch": 0.7936, "grad_norm": 3.4933767318725586, "learning_rate": 2.1313097987462884e-07, "loss": 0.868, "step": 2480 }, { "epoch": 0.7952, "grad_norm": 3.33754301071167, "learning_rate": 2.114813592873639e-07, "loss": 0.8185, "step": 2485 }, { "epoch": 0.7968, "grad_norm": 7.453437328338623, "learning_rate": 2.0983173870009896e-07, "loss": 0.8654, "step": 2490 }, { "epoch": 0.7984, "grad_norm": 9.287111282348633, "learning_rate": 2.0818211811283404e-07, "loss": 0.7744, "step": 2495 }, { "epoch": 0.8, "grad_norm": 4.195357322692871, "learning_rate": 2.065324975255691e-07, "loss": 0.8295, "step": 2500 }, { "epoch": 0.8016, "grad_norm": 4.878857612609863, "learning_rate": 2.0488287693830418e-07, "loss": 0.7917, "step": 2505 }, { "epoch": 0.8032, "grad_norm": 3.239182710647583, "learning_rate": 2.0323325635103923e-07, "loss": 0.8339, "step": 2510 }, { "epoch": 0.8048, "grad_norm": 4.196946144104004, "learning_rate": 2.0158363576377433e-07, "loss": 0.9609, "step": 2515 }, { "epoch": 0.8064, "grad_norm": 2.7448809146881104, "learning_rate": 1.999340151765094e-07, "loss": 0.8516, "step": 2520 }, { "epoch": 0.808, "grad_norm": 4.893881320953369, "learning_rate": 1.9828439458924446e-07, "loss": 0.7908, "step": 2525 }, { "epoch": 0.8096, "grad_norm": 3.318279266357422, "learning_rate": 1.9663477400197953e-07, "loss": 0.7966, "step": 2530 }, { "epoch": 0.8112, "grad_norm": 2.901827573776245, "learning_rate": 1.949851534147146e-07, "loss": 0.7831, "step": 2535 }, { "epoch": 0.8128, "grad_norm": 5.1762847900390625, "learning_rate": 1.9333553282744968e-07, "loss": 0.9897, "step": 2540 }, { "epoch": 0.8144, "grad_norm": 6.721929550170898, "learning_rate": 1.9168591224018473e-07, "loss": 0.8774, "step": 2545 }, { "epoch": 0.816, "grad_norm": 3.2159762382507324, "learning_rate": 1.9003629165291983e-07, "loss": 0.8543, "step": 2550 }, { "epoch": 0.8176, "grad_norm": 2.2698450088500977, "learning_rate": 1.883866710656549e-07, "loss": 0.7665, "step": 2555 }, { "epoch": 0.8192, "grad_norm": 6.745720386505127, "learning_rate": 1.8673705047838998e-07, "loss": 0.8497, "step": 2560 }, { "epoch": 0.8208, "grad_norm": 7.653261661529541, "learning_rate": 1.8508742989112503e-07, "loss": 0.8156, "step": 2565 }, { "epoch": 0.8224, "grad_norm": 5.070962905883789, "learning_rate": 1.834378093038601e-07, "loss": 0.8599, "step": 2570 }, { "epoch": 0.824, "grad_norm": 3.101536750793457, "learning_rate": 1.8178818871659517e-07, "loss": 0.9103, "step": 2575 }, { "epoch": 0.8256, "grad_norm": 5.420032024383545, "learning_rate": 1.8013856812933025e-07, "loss": 0.8812, "step": 2580 }, { "epoch": 0.8272, "grad_norm": 6.0531697273254395, "learning_rate": 1.7848894754206532e-07, "loss": 0.9167, "step": 2585 }, { "epoch": 0.8288, "grad_norm": 7.951639175415039, "learning_rate": 1.768393269548004e-07, "loss": 0.958, "step": 2590 }, { "epoch": 0.8304, "grad_norm": 3.8448524475097656, "learning_rate": 1.7518970636753547e-07, "loss": 0.9107, "step": 2595 }, { "epoch": 0.832, "grad_norm": 6.432617664337158, "learning_rate": 1.7354008578027052e-07, "loss": 0.9313, "step": 2600 }, { "epoch": 0.8336, "grad_norm": 6.240530967712402, "learning_rate": 1.718904651930056e-07, "loss": 0.8152, "step": 2605 }, { "epoch": 0.8352, "grad_norm": 3.8396613597869873, "learning_rate": 1.7024084460574067e-07, "loss": 0.8105, "step": 2610 }, { "epoch": 0.8368, "grad_norm": 5.357729434967041, "learning_rate": 1.6859122401847574e-07, "loss": 0.8994, "step": 2615 }, { "epoch": 0.8384, "grad_norm": 3.711209535598755, "learning_rate": 1.669416034312108e-07, "loss": 0.8206, "step": 2620 }, { "epoch": 0.84, "grad_norm": 8.217768669128418, "learning_rate": 1.652919828439459e-07, "loss": 0.8569, "step": 2625 }, { "epoch": 0.8416, "grad_norm": 3.3683290481567383, "learning_rate": 1.6364236225668097e-07, "loss": 0.8385, "step": 2630 }, { "epoch": 0.8432, "grad_norm": 5.228672981262207, "learning_rate": 1.6199274166941604e-07, "loss": 0.9177, "step": 2635 }, { "epoch": 0.8448, "grad_norm": 4.824789524078369, "learning_rate": 1.603431210821511e-07, "loss": 0.8195, "step": 2640 }, { "epoch": 0.8464, "grad_norm": 3.6725375652313232, "learning_rate": 1.5869350049488616e-07, "loss": 0.8017, "step": 2645 }, { "epoch": 0.848, "grad_norm": 3.130878210067749, "learning_rate": 1.5704387990762124e-07, "loss": 0.8088, "step": 2650 }, { "epoch": 0.8496, "grad_norm": 4.449658393859863, "learning_rate": 1.553942593203563e-07, "loss": 0.7835, "step": 2655 }, { "epoch": 0.8512, "grad_norm": 4.97245454788208, "learning_rate": 1.537446387330914e-07, "loss": 0.837, "step": 2660 }, { "epoch": 0.8528, "grad_norm": 16.94793128967285, "learning_rate": 1.5209501814582646e-07, "loss": 0.8113, "step": 2665 }, { "epoch": 0.8544, "grad_norm": 4.743756294250488, "learning_rate": 1.5044539755856154e-07, "loss": 0.8042, "step": 2670 }, { "epoch": 0.856, "grad_norm": 10.094191551208496, "learning_rate": 1.4879577697129658e-07, "loss": 0.8571, "step": 2675 }, { "epoch": 0.8576, "grad_norm": 5.925148010253906, "learning_rate": 1.4714615638403166e-07, "loss": 0.8749, "step": 2680 }, { "epoch": 0.8592, "grad_norm": 3.691056251525879, "learning_rate": 1.4549653579676673e-07, "loss": 0.8824, "step": 2685 }, { "epoch": 0.8608, "grad_norm": 3.58223295211792, "learning_rate": 1.438469152095018e-07, "loss": 0.8761, "step": 2690 }, { "epoch": 0.8624, "grad_norm": 2.6448755264282227, "learning_rate": 1.4219729462223686e-07, "loss": 0.8868, "step": 2695 }, { "epoch": 0.864, "grad_norm": 3.1470277309417725, "learning_rate": 1.4054767403497196e-07, "loss": 0.738, "step": 2700 }, { "epoch": 0.8656, "grad_norm": 2.670072317123413, "learning_rate": 1.3889805344770703e-07, "loss": 0.834, "step": 2705 }, { "epoch": 0.8672, "grad_norm": 3.6198477745056152, "learning_rate": 1.372484328604421e-07, "loss": 0.866, "step": 2710 }, { "epoch": 0.8688, "grad_norm": 3.4495441913604736, "learning_rate": 1.3559881227317715e-07, "loss": 0.8393, "step": 2715 }, { "epoch": 0.8704, "grad_norm": 4.266736030578613, "learning_rate": 1.3394919168591223e-07, "loss": 0.6669, "step": 2720 }, { "epoch": 0.872, "grad_norm": 7.86549711227417, "learning_rate": 1.322995710986473e-07, "loss": 0.8842, "step": 2725 }, { "epoch": 0.8736, "grad_norm": 9.990023612976074, "learning_rate": 1.3064995051138238e-07, "loss": 0.8435, "step": 2730 }, { "epoch": 0.8752, "grad_norm": 5.904709815979004, "learning_rate": 1.2900032992411745e-07, "loss": 0.8569, "step": 2735 }, { "epoch": 0.8768, "grad_norm": 3.9784185886383057, "learning_rate": 1.2735070933685253e-07, "loss": 0.8613, "step": 2740 }, { "epoch": 0.8784, "grad_norm": 3.423604965209961, "learning_rate": 1.257010887495876e-07, "loss": 0.8349, "step": 2745 }, { "epoch": 0.88, "grad_norm": 7.008749961853027, "learning_rate": 1.2405146816232267e-07, "loss": 0.739, "step": 2750 }, { "epoch": 0.8816, "grad_norm": 4.69740104675293, "learning_rate": 1.2240184757505772e-07, "loss": 0.8429, "step": 2755 }, { "epoch": 0.8832, "grad_norm": 11.424933433532715, "learning_rate": 1.207522269877928e-07, "loss": 0.8223, "step": 2760 }, { "epoch": 0.8848, "grad_norm": 9.028188705444336, "learning_rate": 1.1910260640052787e-07, "loss": 0.8795, "step": 2765 }, { "epoch": 0.8864, "grad_norm": 4.878159999847412, "learning_rate": 1.1745298581326295e-07, "loss": 0.8503, "step": 2770 }, { "epoch": 0.888, "grad_norm": 4.5786237716674805, "learning_rate": 1.1580336522599801e-07, "loss": 0.7894, "step": 2775 }, { "epoch": 0.8896, "grad_norm": 5.562559127807617, "learning_rate": 1.141537446387331e-07, "loss": 0.9339, "step": 2780 }, { "epoch": 0.8912, "grad_norm": 5.035255432128906, "learning_rate": 1.1250412405146816e-07, "loss": 0.9041, "step": 2785 }, { "epoch": 0.8928, "grad_norm": 3.3735504150390625, "learning_rate": 1.1085450346420323e-07, "loss": 0.8033, "step": 2790 }, { "epoch": 0.8944, "grad_norm": 3.345130443572998, "learning_rate": 1.0920488287693829e-07, "loss": 0.7649, "step": 2795 }, { "epoch": 0.896, "grad_norm": 2.9101784229278564, "learning_rate": 1.0755526228967338e-07, "loss": 0.7702, "step": 2800 }, { "epoch": 0.8976, "grad_norm": 8.602867126464844, "learning_rate": 1.0590564170240844e-07, "loss": 0.8769, "step": 2805 }, { "epoch": 0.8992, "grad_norm": 3.7691822052001953, "learning_rate": 1.042560211151435e-07, "loss": 0.7437, "step": 2810 }, { "epoch": 0.9008, "grad_norm": 3.548344612121582, "learning_rate": 1.0260640052787859e-07, "loss": 0.7927, "step": 2815 }, { "epoch": 0.9024, "grad_norm": 6.130397319793701, "learning_rate": 1.0095677994061365e-07, "loss": 0.8964, "step": 2820 }, { "epoch": 0.904, "grad_norm": 6.0678935050964355, "learning_rate": 9.930715935334873e-08, "loss": 0.8846, "step": 2825 }, { "epoch": 0.9056, "grad_norm": 3.202853202819824, "learning_rate": 9.765753876608379e-08, "loss": 0.7872, "step": 2830 }, { "epoch": 0.9072, "grad_norm": 7.076948165893555, "learning_rate": 9.600791817881887e-08, "loss": 0.841, "step": 2835 }, { "epoch": 0.9088, "grad_norm": 4.527687072753906, "learning_rate": 9.435829759155394e-08, "loss": 0.8376, "step": 2840 }, { "epoch": 0.9104, "grad_norm": 3.8655998706817627, "learning_rate": 9.270867700428901e-08, "loss": 0.9163, "step": 2845 }, { "epoch": 0.912, "grad_norm": 4.189991474151611, "learning_rate": 9.105905641702407e-08, "loss": 0.9171, "step": 2850 }, { "epoch": 0.9136, "grad_norm": 3.6782002449035645, "learning_rate": 8.940943582975916e-08, "loss": 0.8107, "step": 2855 }, { "epoch": 0.9152, "grad_norm": 3.8793959617614746, "learning_rate": 8.775981524249422e-08, "loss": 0.8324, "step": 2860 }, { "epoch": 0.9168, "grad_norm": 6.360919952392578, "learning_rate": 8.61101946552293e-08, "loss": 0.901, "step": 2865 }, { "epoch": 0.9184, "grad_norm": 4.218044757843018, "learning_rate": 8.446057406796437e-08, "loss": 0.7871, "step": 2870 }, { "epoch": 0.92, "grad_norm": 6.177008628845215, "learning_rate": 8.281095348069944e-08, "loss": 0.841, "step": 2875 }, { "epoch": 0.9216, "grad_norm": 2.739051103591919, "learning_rate": 8.11613328934345e-08, "loss": 0.7872, "step": 2880 }, { "epoch": 0.9232, "grad_norm": 8.707544326782227, "learning_rate": 7.951171230616957e-08, "loss": 0.9274, "step": 2885 }, { "epoch": 0.9248, "grad_norm": 4.484316825866699, "learning_rate": 7.786209171890465e-08, "loss": 0.9121, "step": 2890 }, { "epoch": 0.9264, "grad_norm": 3.211519479751587, "learning_rate": 7.621247113163972e-08, "loss": 0.8737, "step": 2895 }, { "epoch": 0.928, "grad_norm": 5.020310878753662, "learning_rate": 7.456285054437479e-08, "loss": 0.8358, "step": 2900 }, { "epoch": 0.9296, "grad_norm": 5.312314510345459, "learning_rate": 7.291322995710985e-08, "loss": 0.831, "step": 2905 }, { "epoch": 0.9312, "grad_norm": 2.917203903198242, "learning_rate": 7.126360936984494e-08, "loss": 0.8756, "step": 2910 }, { "epoch": 0.9328, "grad_norm": 3.924370288848877, "learning_rate": 6.961398878258e-08, "loss": 0.7825, "step": 2915 }, { "epoch": 0.9344, "grad_norm": 3.571991205215454, "learning_rate": 6.796436819531507e-08, "loss": 0.807, "step": 2920 }, { "epoch": 0.936, "grad_norm": 5.816591739654541, "learning_rate": 6.631474760805014e-08, "loss": 0.8491, "step": 2925 }, { "epoch": 0.9376, "grad_norm": 2.8010520935058594, "learning_rate": 6.466512702078522e-08, "loss": 0.9383, "step": 2930 }, { "epoch": 0.9392, "grad_norm": 4.16404914855957, "learning_rate": 6.301550643352028e-08, "loss": 0.9048, "step": 2935 }, { "epoch": 0.9408, "grad_norm": 3.1094634532928467, "learning_rate": 6.136588584625536e-08, "loss": 0.8829, "step": 2940 }, { "epoch": 0.9424, "grad_norm": 6.206966400146484, "learning_rate": 5.971626525899043e-08, "loss": 0.8568, "step": 2945 }, { "epoch": 0.944, "grad_norm": 3.327371120452881, "learning_rate": 5.80666446717255e-08, "loss": 0.8895, "step": 2950 }, { "epoch": 0.9456, "grad_norm": 3.068650722503662, "learning_rate": 5.641702408446057e-08, "loss": 0.8451, "step": 2955 }, { "epoch": 0.9472, "grad_norm": 18.251916885375977, "learning_rate": 5.4767403497195644e-08, "loss": 0.8896, "step": 2960 }, { "epoch": 0.9488, "grad_norm": 6.762292385101318, "learning_rate": 5.311778290993071e-08, "loss": 0.775, "step": 2965 }, { "epoch": 0.9504, "grad_norm": 3.4393362998962402, "learning_rate": 5.1468162322665786e-08, "loss": 0.8701, "step": 2970 }, { "epoch": 0.952, "grad_norm": 3.25495982170105, "learning_rate": 4.9818541735400854e-08, "loss": 0.9626, "step": 2975 }, { "epoch": 0.9536, "grad_norm": 3.750603437423706, "learning_rate": 4.816892114813593e-08, "loss": 0.8311, "step": 2980 }, { "epoch": 0.9552, "grad_norm": 8.03615951538086, "learning_rate": 4.6519300560871e-08, "loss": 0.8256, "step": 2985 }, { "epoch": 0.9568, "grad_norm": 3.2003724575042725, "learning_rate": 4.486967997360607e-08, "loss": 0.8626, "step": 2990 }, { "epoch": 0.9584, "grad_norm": 4.359886169433594, "learning_rate": 4.3220059386341145e-08, "loss": 0.8756, "step": 2995 }, { "epoch": 0.96, "grad_norm": 6.897675514221191, "learning_rate": 4.1570438799076207e-08, "loss": 0.8767, "step": 3000 }, { "epoch": 0.9616, "grad_norm": 3.0032472610473633, "learning_rate": 3.992081821181128e-08, "loss": 0.9283, "step": 3005 }, { "epoch": 0.9632, "grad_norm": 4.568953037261963, "learning_rate": 3.827119762454635e-08, "loss": 0.6707, "step": 3010 }, { "epoch": 0.9648, "grad_norm": 6.175788879394531, "learning_rate": 3.6621577037281423e-08, "loss": 0.7988, "step": 3015 }, { "epoch": 0.9664, "grad_norm": 6.2108354568481445, "learning_rate": 3.497195645001649e-08, "loss": 0.8649, "step": 3020 }, { "epoch": 0.968, "grad_norm": 5.617148399353027, "learning_rate": 3.3322335862751566e-08, "loss": 0.8005, "step": 3025 }, { "epoch": 0.9696, "grad_norm": 2.560255765914917, "learning_rate": 3.1672715275486634e-08, "loss": 0.8828, "step": 3030 }, { "epoch": 0.9712, "grad_norm": 3.895402669906616, "learning_rate": 3.002309468822171e-08, "loss": 0.9119, "step": 3035 }, { "epoch": 0.9728, "grad_norm": 3.2274169921875, "learning_rate": 2.837347410095678e-08, "loss": 0.9227, "step": 3040 }, { "epoch": 0.9744, "grad_norm": 3.9759535789489746, "learning_rate": 2.672385351369185e-08, "loss": 0.7694, "step": 3045 }, { "epoch": 0.976, "grad_norm": 5.453105926513672, "learning_rate": 2.507423292642692e-08, "loss": 0.8129, "step": 3050 }, { "epoch": 0.9776, "grad_norm": 3.8954710960388184, "learning_rate": 2.342461233916199e-08, "loss": 0.8646, "step": 3055 }, { "epoch": 0.9792, "grad_norm": 5.158627033233643, "learning_rate": 2.177499175189706e-08, "loss": 0.873, "step": 3060 }, { "epoch": 0.9808, "grad_norm": 3.405482769012451, "learning_rate": 2.0125371164632132e-08, "loss": 0.6943, "step": 3065 }, { "epoch": 0.9824, "grad_norm": 4.411757469177246, "learning_rate": 1.8475750577367203e-08, "loss": 0.8178, "step": 3070 }, { "epoch": 0.984, "grad_norm": 3.8500680923461914, "learning_rate": 1.6826129990102277e-08, "loss": 0.8075, "step": 3075 }, { "epoch": 0.9856, "grad_norm": 9.081463813781738, "learning_rate": 1.517650940283735e-08, "loss": 0.8583, "step": 3080 }, { "epoch": 0.9872, "grad_norm": 3.0819270610809326, "learning_rate": 1.3526888815572416e-08, "loss": 0.7818, "step": 3085 }, { "epoch": 0.9888, "grad_norm": 12.406457901000977, "learning_rate": 1.187726822830749e-08, "loss": 0.9238, "step": 3090 }, { "epoch": 0.9904, "grad_norm": 4.571506977081299, "learning_rate": 1.022764764104256e-08, "loss": 0.788, "step": 3095 }, { "epoch": 0.992, "grad_norm": 2.9352900981903076, "learning_rate": 8.578027053777632e-09, "loss": 0.9279, "step": 3100 }, { "epoch": 0.9936, "grad_norm": 4.189651966094971, "learning_rate": 6.928406466512702e-09, "loss": 0.8379, "step": 3105 }, { "epoch": 0.9952, "grad_norm": 3.8705317974090576, "learning_rate": 5.278785879247773e-09, "loss": 0.778, "step": 3110 }, { "epoch": 0.9968, "grad_norm": 2.701988458633423, "learning_rate": 3.629165291982844e-09, "loss": 0.9363, "step": 3115 }, { "epoch": 0.9984, "grad_norm": 3.2893664836883545, "learning_rate": 1.9795447047179146e-09, "loss": 0.7902, "step": 3120 }, { "epoch": 1.0, "grad_norm": 4.953529357910156, "learning_rate": 3.299241174529858e-10, "loss": 0.7962, "step": 3125 }, { "epoch": 1.0, "step": 3125, "total_flos": 1.0275244834155397e+18, "train_loss": 0.8923269732666016, "train_runtime": 7058.0642, "train_samples_per_second": 7.084, "train_steps_per_second": 0.443 } ], "logging_steps": 5, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0275244834155397e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }