{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 10710, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009337068160597573, "grad_norm": 24.474673838237816, "learning_rate": 0.0, "loss": 0.8882, "step": 1 }, { "epoch": 0.0018674136321195146, "grad_norm": 39.469397513448925, "learning_rate": 3.1055900621118015e-08, "loss": 1.2827, "step": 2 }, { "epoch": 0.0028011204481792717, "grad_norm": 23.17863262688958, "learning_rate": 6.211180124223603e-08, "loss": 1.0869, "step": 3 }, { "epoch": 0.003734827264239029, "grad_norm": 21.178026807466882, "learning_rate": 9.316770186335405e-08, "loss": 0.8127, "step": 4 }, { "epoch": 0.004668534080298786, "grad_norm": 24.57582041577202, "learning_rate": 1.2422360248447206e-07, "loss": 0.8678, "step": 5 }, { "epoch": 0.0056022408963585435, "grad_norm": 15.874670883943848, "learning_rate": 1.5527950310559006e-07, "loss": 0.7354, "step": 6 }, { "epoch": 0.006535947712418301, "grad_norm": 19.023881941599424, "learning_rate": 1.863354037267081e-07, "loss": 0.7972, "step": 7 }, { "epoch": 0.007469654528478058, "grad_norm": 21.252843927126968, "learning_rate": 2.173913043478261e-07, "loss": 0.8618, "step": 8 }, { "epoch": 0.008403361344537815, "grad_norm": 29.143916020933318, "learning_rate": 2.484472049689441e-07, "loss": 1.126, "step": 9 }, { "epoch": 0.009337068160597572, "grad_norm": 20.855264769481558, "learning_rate": 2.795031055900621e-07, "loss": 0.9344, "step": 10 }, { "epoch": 0.01027077497665733, "grad_norm": 26.146790642902538, "learning_rate": 3.1055900621118013e-07, "loss": 0.9497, "step": 11 }, { "epoch": 0.011204481792717087, "grad_norm": 19.939320583571053, "learning_rate": 3.416149068322982e-07, "loss": 0.9076, "step": 12 }, { "epoch": 0.012138188608776844, "grad_norm": 20.268878693661055, "learning_rate": 3.726708074534162e-07, "loss": 0.8634, "step": 13 }, { "epoch": 0.013071895424836602, "grad_norm": 20.2695883346598, "learning_rate": 4.037267080745342e-07, "loss": 0.932, "step": 14 }, { "epoch": 0.014005602240896359, "grad_norm": 18.974157219689673, "learning_rate": 4.347826086956522e-07, "loss": 0.7643, "step": 15 }, { "epoch": 0.014939309056956116, "grad_norm": 16.81878378356936, "learning_rate": 4.658385093167702e-07, "loss": 0.8395, "step": 16 }, { "epoch": 0.015873015873015872, "grad_norm": 18.822878848836883, "learning_rate": 4.968944099378882e-07, "loss": 0.8896, "step": 17 }, { "epoch": 0.01680672268907563, "grad_norm": 18.81193046403055, "learning_rate": 5.279503105590063e-07, "loss": 0.8774, "step": 18 }, { "epoch": 0.017740429505135387, "grad_norm": 16.54085834444693, "learning_rate": 5.590062111801243e-07, "loss": 0.8148, "step": 19 }, { "epoch": 0.018674136321195144, "grad_norm": 20.47351336072656, "learning_rate": 5.900621118012423e-07, "loss": 0.8668, "step": 20 }, { "epoch": 0.0196078431372549, "grad_norm": 16.7518697790092, "learning_rate": 6.211180124223603e-07, "loss": 0.7517, "step": 21 }, { "epoch": 0.02054154995331466, "grad_norm": 19.418526359060845, "learning_rate": 6.521739130434783e-07, "loss": 0.7591, "step": 22 }, { "epoch": 0.021475256769374416, "grad_norm": 60.90655631122115, "learning_rate": 6.832298136645964e-07, "loss": 1.5147, "step": 23 }, { "epoch": 0.022408963585434174, "grad_norm": 22.148194570425947, "learning_rate": 7.142857142857143e-07, "loss": 0.7759, "step": 24 }, { "epoch": 0.02334267040149393, "grad_norm": 23.278662747729904, "learning_rate": 7.453416149068324e-07, "loss": 0.9543, "step": 25 }, { "epoch": 0.02427637721755369, "grad_norm": 16.738726858301344, "learning_rate": 7.763975155279503e-07, "loss": 0.7951, "step": 26 }, { "epoch": 0.025210084033613446, "grad_norm": 17.03618210769562, "learning_rate": 8.074534161490684e-07, "loss": 0.6865, "step": 27 }, { "epoch": 0.026143790849673203, "grad_norm": 21.098891839859554, "learning_rate": 8.385093167701864e-07, "loss": 0.8546, "step": 28 }, { "epoch": 0.02707749766573296, "grad_norm": 14.823063699971865, "learning_rate": 8.695652173913044e-07, "loss": 0.6737, "step": 29 }, { "epoch": 0.028011204481792718, "grad_norm": 54.22130227812077, "learning_rate": 9.006211180124224e-07, "loss": 1.0722, "step": 30 }, { "epoch": 0.028944911297852476, "grad_norm": 14.311807583044814, "learning_rate": 9.316770186335404e-07, "loss": 0.5855, "step": 31 }, { "epoch": 0.029878618113912233, "grad_norm": 15.53952139867147, "learning_rate": 9.627329192546585e-07, "loss": 0.6449, "step": 32 }, { "epoch": 0.03081232492997199, "grad_norm": 17.032352719288415, "learning_rate": 9.937888198757765e-07, "loss": 0.7397, "step": 33 }, { "epoch": 0.031746031746031744, "grad_norm": 11.82115577380343, "learning_rate": 1.0248447204968944e-06, "loss": 0.6099, "step": 34 }, { "epoch": 0.032679738562091505, "grad_norm": 11.462334955391942, "learning_rate": 1.0559006211180126e-06, "loss": 0.6044, "step": 35 }, { "epoch": 0.03361344537815126, "grad_norm": 12.807743403344297, "learning_rate": 1.0869565217391306e-06, "loss": 0.7419, "step": 36 }, { "epoch": 0.03454715219421102, "grad_norm": 8.87648849412914, "learning_rate": 1.1180124223602485e-06, "loss": 0.4672, "step": 37 }, { "epoch": 0.035480859010270774, "grad_norm": 13.879142679076807, "learning_rate": 1.1490683229813664e-06, "loss": 0.6211, "step": 38 }, { "epoch": 0.036414565826330535, "grad_norm": 9.475275015636266, "learning_rate": 1.1801242236024846e-06, "loss": 0.5035, "step": 39 }, { "epoch": 0.03734827264239029, "grad_norm": 9.342512504561586, "learning_rate": 1.2111801242236026e-06, "loss": 0.4472, "step": 40 }, { "epoch": 0.03828197945845005, "grad_norm": 9.099881685170999, "learning_rate": 1.2422360248447205e-06, "loss": 0.4896, "step": 41 }, { "epoch": 0.0392156862745098, "grad_norm": 9.649917587873572, "learning_rate": 1.2732919254658385e-06, "loss": 0.4286, "step": 42 }, { "epoch": 0.040149393090569564, "grad_norm": 8.852175321587666, "learning_rate": 1.3043478260869566e-06, "loss": 0.4097, "step": 43 }, { "epoch": 0.04108309990662932, "grad_norm": 9.838747356156427, "learning_rate": 1.3354037267080746e-06, "loss": 0.4379, "step": 44 }, { "epoch": 0.04201680672268908, "grad_norm": 8.923981141398546, "learning_rate": 1.3664596273291927e-06, "loss": 0.4378, "step": 45 }, { "epoch": 0.04295051353874883, "grad_norm": 12.177372418830734, "learning_rate": 1.3975155279503105e-06, "loss": 0.6094, "step": 46 }, { "epoch": 0.04388422035480859, "grad_norm": 8.188411900651252, "learning_rate": 1.4285714285714286e-06, "loss": 0.3885, "step": 47 }, { "epoch": 0.04481792717086835, "grad_norm": 8.92157366981287, "learning_rate": 1.4596273291925466e-06, "loss": 0.3989, "step": 48 }, { "epoch": 0.0457516339869281, "grad_norm": 7.108118170470213, "learning_rate": 1.4906832298136647e-06, "loss": 0.3712, "step": 49 }, { "epoch": 0.04668534080298786, "grad_norm": 7.071860497422837, "learning_rate": 1.521739130434783e-06, "loss": 0.376, "step": 50 }, { "epoch": 0.047619047619047616, "grad_norm": 6.671139803809393, "learning_rate": 1.5527950310559006e-06, "loss": 0.2636, "step": 51 }, { "epoch": 0.04855275443510738, "grad_norm": 8.03362669267229, "learning_rate": 1.5838509316770188e-06, "loss": 0.4664, "step": 52 }, { "epoch": 0.04948646125116713, "grad_norm": 24.594462159127342, "learning_rate": 1.6149068322981367e-06, "loss": 0.4577, "step": 53 }, { "epoch": 0.05042016806722689, "grad_norm": 6.903993053793768, "learning_rate": 1.645962732919255e-06, "loss": 0.3442, "step": 54 }, { "epoch": 0.051353874883286646, "grad_norm": 8.658378780331752, "learning_rate": 1.6770186335403729e-06, "loss": 0.3204, "step": 55 }, { "epoch": 0.05228758169934641, "grad_norm": 6.441103612432593, "learning_rate": 1.7080745341614908e-06, "loss": 0.2675, "step": 56 }, { "epoch": 0.05322128851540616, "grad_norm": 16.46872187643785, "learning_rate": 1.7391304347826088e-06, "loss": 0.4594, "step": 57 }, { "epoch": 0.05415499533146592, "grad_norm": 10.114421336119905, "learning_rate": 1.770186335403727e-06, "loss": 0.382, "step": 58 }, { "epoch": 0.055088702147525676, "grad_norm": 9.970703323237156, "learning_rate": 1.8012422360248449e-06, "loss": 0.4594, "step": 59 }, { "epoch": 0.056022408963585436, "grad_norm": 9.493826613402016, "learning_rate": 1.832298136645963e-06, "loss": 0.3455, "step": 60 }, { "epoch": 0.05695611577964519, "grad_norm": 10.537325944894715, "learning_rate": 1.8633540372670808e-06, "loss": 0.4058, "step": 61 }, { "epoch": 0.05788982259570495, "grad_norm": 8.05332021450219, "learning_rate": 1.894409937888199e-06, "loss": 0.1806, "step": 62 }, { "epoch": 0.058823529411764705, "grad_norm": 10.418619418906014, "learning_rate": 1.925465838509317e-06, "loss": 0.3915, "step": 63 }, { "epoch": 0.059757236227824466, "grad_norm": 6.921402343153753, "learning_rate": 1.956521739130435e-06, "loss": 0.2359, "step": 64 }, { "epoch": 0.06069094304388422, "grad_norm": 5.085662593597982, "learning_rate": 1.987577639751553e-06, "loss": 0.3222, "step": 65 }, { "epoch": 0.06162464985994398, "grad_norm": 5.872790751265629, "learning_rate": 2.018633540372671e-06, "loss": 0.2953, "step": 66 }, { "epoch": 0.06255835667600373, "grad_norm": 7.5048032877333, "learning_rate": 2.049689440993789e-06, "loss": 0.3356, "step": 67 }, { "epoch": 0.06349206349206349, "grad_norm": 6.834820659283478, "learning_rate": 2.0807453416149073e-06, "loss": 0.335, "step": 68 }, { "epoch": 0.06442577030812324, "grad_norm": 13.823349146828512, "learning_rate": 2.111801242236025e-06, "loss": 0.4417, "step": 69 }, { "epoch": 0.06535947712418301, "grad_norm": 7.767273203672307, "learning_rate": 2.1428571428571427e-06, "loss": 0.4616, "step": 70 }, { "epoch": 0.06629318394024276, "grad_norm": 4.644061029916878, "learning_rate": 2.173913043478261e-06, "loss": 0.2758, "step": 71 }, { "epoch": 0.06722689075630252, "grad_norm": 7.135004681769124, "learning_rate": 2.204968944099379e-06, "loss": 0.3571, "step": 72 }, { "epoch": 0.06816059757236227, "grad_norm": 12.127974865593838, "learning_rate": 2.236024844720497e-06, "loss": 0.367, "step": 73 }, { "epoch": 0.06909430438842204, "grad_norm": 5.651868530631708, "learning_rate": 2.2670807453416154e-06, "loss": 0.1724, "step": 74 }, { "epoch": 0.0700280112044818, "grad_norm": 4.996999497934144, "learning_rate": 2.298136645962733e-06, "loss": 0.2899, "step": 75 }, { "epoch": 0.07096171802054155, "grad_norm": 9.249714414813827, "learning_rate": 2.3291925465838513e-06, "loss": 0.4365, "step": 76 }, { "epoch": 0.0718954248366013, "grad_norm": 10.130536954404192, "learning_rate": 2.3602484472049692e-06, "loss": 0.4152, "step": 77 }, { "epoch": 0.07282913165266107, "grad_norm": 7.886072990162823, "learning_rate": 2.391304347826087e-06, "loss": 0.3584, "step": 78 }, { "epoch": 0.07376283846872082, "grad_norm": 10.705136936987468, "learning_rate": 2.422360248447205e-06, "loss": 0.3746, "step": 79 }, { "epoch": 0.07469654528478058, "grad_norm": 6.338520782222879, "learning_rate": 2.453416149068323e-06, "loss": 0.3374, "step": 80 }, { "epoch": 0.07563025210084033, "grad_norm": 15.764021534615928, "learning_rate": 2.484472049689441e-06, "loss": 0.4306, "step": 81 }, { "epoch": 0.0765639589169001, "grad_norm": 6.696535285136097, "learning_rate": 2.515527950310559e-06, "loss": 0.2005, "step": 82 }, { "epoch": 0.07749766573295985, "grad_norm": 6.457512928408381, "learning_rate": 2.546583850931677e-06, "loss": 0.1799, "step": 83 }, { "epoch": 0.0784313725490196, "grad_norm": 8.395373920564941, "learning_rate": 2.5776397515527953e-06, "loss": 0.3149, "step": 84 }, { "epoch": 0.07936507936507936, "grad_norm": 5.654955884413909, "learning_rate": 2.6086956521739132e-06, "loss": 0.1646, "step": 85 }, { "epoch": 0.08029878618113913, "grad_norm": 6.291567826319513, "learning_rate": 2.639751552795031e-06, "loss": 0.2716, "step": 86 }, { "epoch": 0.08123249299719888, "grad_norm": 8.335086496360354, "learning_rate": 2.670807453416149e-06, "loss": 0.2693, "step": 87 }, { "epoch": 0.08216619981325864, "grad_norm": 4.860192694507268, "learning_rate": 2.7018633540372675e-06, "loss": 0.2493, "step": 88 }, { "epoch": 0.08309990662931839, "grad_norm": 7.733026919613304, "learning_rate": 2.7329192546583855e-06, "loss": 0.3095, "step": 89 }, { "epoch": 0.08403361344537816, "grad_norm": 8.79291549433535, "learning_rate": 2.7639751552795034e-06, "loss": 0.1971, "step": 90 }, { "epoch": 0.08496732026143791, "grad_norm": 5.44733284723899, "learning_rate": 2.795031055900621e-06, "loss": 0.3086, "step": 91 }, { "epoch": 0.08590102707749767, "grad_norm": 6.8868404125793745, "learning_rate": 2.8260869565217393e-06, "loss": 0.3026, "step": 92 }, { "epoch": 0.08683473389355742, "grad_norm": 5.4043976621501315, "learning_rate": 2.8571428571428573e-06, "loss": 0.2477, "step": 93 }, { "epoch": 0.08776844070961717, "grad_norm": 24.217050545948553, "learning_rate": 2.888198757763975e-06, "loss": 0.3642, "step": 94 }, { "epoch": 0.08870214752567694, "grad_norm": 6.472384556203597, "learning_rate": 2.919254658385093e-06, "loss": 0.3378, "step": 95 }, { "epoch": 0.0896358543417367, "grad_norm": 4.716208610272063, "learning_rate": 2.9503105590062115e-06, "loss": 0.2532, "step": 96 }, { "epoch": 0.09056956115779645, "grad_norm": 7.604969514470815, "learning_rate": 2.9813664596273295e-06, "loss": 0.1514, "step": 97 }, { "epoch": 0.0915032679738562, "grad_norm": 9.817496689091302, "learning_rate": 3.0124223602484474e-06, "loss": 0.5209, "step": 98 }, { "epoch": 0.09243697478991597, "grad_norm": 7.612285422747696, "learning_rate": 3.043478260869566e-06, "loss": 0.2732, "step": 99 }, { "epoch": 0.09337068160597572, "grad_norm": 6.669835249450553, "learning_rate": 3.0745341614906837e-06, "loss": 0.2635, "step": 100 }, { "epoch": 0.09430438842203548, "grad_norm": 5.341716954454641, "learning_rate": 3.1055900621118013e-06, "loss": 0.1831, "step": 101 }, { "epoch": 0.09523809523809523, "grad_norm": 5.574131478367205, "learning_rate": 3.1366459627329192e-06, "loss": 0.1443, "step": 102 }, { "epoch": 0.096171802054155, "grad_norm": 6.440052186380905, "learning_rate": 3.1677018633540376e-06, "loss": 0.2417, "step": 103 }, { "epoch": 0.09710550887021475, "grad_norm": 6.448139110997804, "learning_rate": 3.1987577639751555e-06, "loss": 0.2912, "step": 104 }, { "epoch": 0.09803921568627451, "grad_norm": 7.173070917137347, "learning_rate": 3.2298136645962735e-06, "loss": 0.2191, "step": 105 }, { "epoch": 0.09897292250233426, "grad_norm": 8.56926180360254, "learning_rate": 3.2608695652173914e-06, "loss": 0.3189, "step": 106 }, { "epoch": 0.09990662931839403, "grad_norm": 6.560468564142049, "learning_rate": 3.29192546583851e-06, "loss": 0.123, "step": 107 }, { "epoch": 0.10084033613445378, "grad_norm": 4.399434653945426, "learning_rate": 3.3229813664596278e-06, "loss": 0.0835, "step": 108 }, { "epoch": 0.10177404295051354, "grad_norm": 4.877503163883028, "learning_rate": 3.3540372670807457e-06, "loss": 0.1826, "step": 109 }, { "epoch": 0.10270774976657329, "grad_norm": 6.47044428612467, "learning_rate": 3.3850931677018632e-06, "loss": 0.1288, "step": 110 }, { "epoch": 0.10364145658263306, "grad_norm": 7.02496381252311, "learning_rate": 3.4161490683229816e-06, "loss": 0.3253, "step": 111 }, { "epoch": 0.10457516339869281, "grad_norm": 6.904514151006274, "learning_rate": 3.4472049689440996e-06, "loss": 0.2256, "step": 112 }, { "epoch": 0.10550887021475257, "grad_norm": 13.783514193461015, "learning_rate": 3.4782608695652175e-06, "loss": 0.3964, "step": 113 }, { "epoch": 0.10644257703081232, "grad_norm": 6.097231871713247, "learning_rate": 3.5093167701863355e-06, "loss": 0.1693, "step": 114 }, { "epoch": 0.10737628384687208, "grad_norm": 7.426460162776171, "learning_rate": 3.540372670807454e-06, "loss": 0.2038, "step": 115 }, { "epoch": 0.10830999066293184, "grad_norm": 6.519622425051044, "learning_rate": 3.5714285714285718e-06, "loss": 0.25, "step": 116 }, { "epoch": 0.1092436974789916, "grad_norm": 7.081748287101973, "learning_rate": 3.6024844720496897e-06, "loss": 0.251, "step": 117 }, { "epoch": 0.11017740429505135, "grad_norm": 5.734184342259299, "learning_rate": 3.633540372670808e-06, "loss": 0.2582, "step": 118 }, { "epoch": 0.1111111111111111, "grad_norm": 7.3650867090008285, "learning_rate": 3.664596273291926e-06, "loss": 0.2686, "step": 119 }, { "epoch": 0.11204481792717087, "grad_norm": 4.509872755865839, "learning_rate": 3.6956521739130436e-06, "loss": 0.1503, "step": 120 }, { "epoch": 0.11297852474323063, "grad_norm": 7.719541227061605, "learning_rate": 3.7267080745341615e-06, "loss": 0.3347, "step": 121 }, { "epoch": 0.11391223155929038, "grad_norm": 4.065337126761521, "learning_rate": 3.7577639751552795e-06, "loss": 0.1974, "step": 122 }, { "epoch": 0.11484593837535013, "grad_norm": 4.856256117695872, "learning_rate": 3.788819875776398e-06, "loss": 0.2351, "step": 123 }, { "epoch": 0.1157796451914099, "grad_norm": 6.432188205936963, "learning_rate": 3.819875776397516e-06, "loss": 0.1408, "step": 124 }, { "epoch": 0.11671335200746966, "grad_norm": 4.500543972558549, "learning_rate": 3.850931677018634e-06, "loss": 0.2913, "step": 125 }, { "epoch": 0.11764705882352941, "grad_norm": 3.440374360418336, "learning_rate": 3.881987577639752e-06, "loss": 0.114, "step": 126 }, { "epoch": 0.11858076563958916, "grad_norm": 7.629183437697692, "learning_rate": 3.91304347826087e-06, "loss": 0.2032, "step": 127 }, { "epoch": 0.11951447245564893, "grad_norm": 6.994901799766309, "learning_rate": 3.9440993788819884e-06, "loss": 0.2701, "step": 128 }, { "epoch": 0.12044817927170869, "grad_norm": 4.907867462293705, "learning_rate": 3.975155279503106e-06, "loss": 0.1464, "step": 129 }, { "epoch": 0.12138188608776844, "grad_norm": 6.1318149528233095, "learning_rate": 4.0062111801242235e-06, "loss": 0.2401, "step": 130 }, { "epoch": 0.1223155929038282, "grad_norm": 4.0629340048846245, "learning_rate": 4.037267080745342e-06, "loss": 0.1164, "step": 131 }, { "epoch": 0.12324929971988796, "grad_norm": 7.176074307435528, "learning_rate": 4.06832298136646e-06, "loss": 0.4233, "step": 132 }, { "epoch": 0.12418300653594772, "grad_norm": 6.2860887240540455, "learning_rate": 4.099378881987578e-06, "loss": 0.2218, "step": 133 }, { "epoch": 0.12511671335200747, "grad_norm": 12.94204061943663, "learning_rate": 4.130434782608696e-06, "loss": 0.3139, "step": 134 }, { "epoch": 0.12605042016806722, "grad_norm": 7.750046366417605, "learning_rate": 4.1614906832298145e-06, "loss": 0.2302, "step": 135 }, { "epoch": 0.12698412698412698, "grad_norm": 7.748836391789648, "learning_rate": 4.192546583850932e-06, "loss": 0.1593, "step": 136 }, { "epoch": 0.12791783380018673, "grad_norm": 5.554948957573974, "learning_rate": 4.22360248447205e-06, "loss": 0.1898, "step": 137 }, { "epoch": 0.12885154061624648, "grad_norm": 5.685951654726603, "learning_rate": 4.254658385093168e-06, "loss": 0.2244, "step": 138 }, { "epoch": 0.12978524743230627, "grad_norm": 30.436948917382413, "learning_rate": 4.2857142857142855e-06, "loss": 0.5976, "step": 139 }, { "epoch": 0.13071895424836602, "grad_norm": 6.627905590095537, "learning_rate": 4.316770186335404e-06, "loss": 0.2749, "step": 140 }, { "epoch": 0.13165266106442577, "grad_norm": 8.216408211436704, "learning_rate": 4.347826086956522e-06, "loss": 0.3709, "step": 141 }, { "epoch": 0.13258636788048553, "grad_norm": 7.996381388752662, "learning_rate": 4.37888198757764e-06, "loss": 0.2021, "step": 142 }, { "epoch": 0.13352007469654528, "grad_norm": 6.384560221121178, "learning_rate": 4.409937888198758e-06, "loss": 0.1536, "step": 143 }, { "epoch": 0.13445378151260504, "grad_norm": 8.828308341267046, "learning_rate": 4.4409937888198765e-06, "loss": 0.2244, "step": 144 }, { "epoch": 0.1353874883286648, "grad_norm": 7.7940773498276075, "learning_rate": 4.472049689440994e-06, "loss": 0.243, "step": 145 }, { "epoch": 0.13632119514472454, "grad_norm": 5.2572755458567855, "learning_rate": 4.503105590062112e-06, "loss": 0.1757, "step": 146 }, { "epoch": 0.13725490196078433, "grad_norm": 5.298173459491019, "learning_rate": 4.534161490683231e-06, "loss": 0.0399, "step": 147 }, { "epoch": 0.13818860877684408, "grad_norm": 4.559512630968876, "learning_rate": 4.565217391304348e-06, "loss": 0.2832, "step": 148 }, { "epoch": 0.13912231559290383, "grad_norm": 7.474627927886245, "learning_rate": 4.596273291925466e-06, "loss": 0.231, "step": 149 }, { "epoch": 0.1400560224089636, "grad_norm": 8.144375821181198, "learning_rate": 4.627329192546584e-06, "loss": 0.3172, "step": 150 }, { "epoch": 0.14098972922502334, "grad_norm": 7.469528679133387, "learning_rate": 4.6583850931677025e-06, "loss": 0.2879, "step": 151 }, { "epoch": 0.1419234360410831, "grad_norm": 7.3313032751097085, "learning_rate": 4.68944099378882e-06, "loss": 0.3748, "step": 152 }, { "epoch": 0.14285714285714285, "grad_norm": 4.238598899479954, "learning_rate": 4.7204968944099384e-06, "loss": 0.2273, "step": 153 }, { "epoch": 0.1437908496732026, "grad_norm": 8.173201829530978, "learning_rate": 4.751552795031056e-06, "loss": 0.1837, "step": 154 }, { "epoch": 0.14472455648926238, "grad_norm": 6.479930907180178, "learning_rate": 4.782608695652174e-06, "loss": 0.2961, "step": 155 }, { "epoch": 0.14565826330532214, "grad_norm": 8.346799404264187, "learning_rate": 4.813664596273293e-06, "loss": 0.3469, "step": 156 }, { "epoch": 0.1465919701213819, "grad_norm": 5.066220473939228, "learning_rate": 4.84472049689441e-06, "loss": 0.3124, "step": 157 }, { "epoch": 0.14752567693744165, "grad_norm": 5.698897750142967, "learning_rate": 4.875776397515528e-06, "loss": 0.1846, "step": 158 }, { "epoch": 0.1484593837535014, "grad_norm": 5.977842767542157, "learning_rate": 4.906832298136646e-06, "loss": 0.2425, "step": 159 }, { "epoch": 0.14939309056956115, "grad_norm": 4.958048006594576, "learning_rate": 4.9378881987577645e-06, "loss": 0.2241, "step": 160 }, { "epoch": 0.1503267973856209, "grad_norm": 5.0996951844267695, "learning_rate": 4.968944099378882e-06, "loss": 0.1625, "step": 161 }, { "epoch": 0.15126050420168066, "grad_norm": 4.756628324415835, "learning_rate": 5e-06, "loss": 0.2945, "step": 162 }, { "epoch": 0.15219421101774042, "grad_norm": 5.635148360108371, "learning_rate": 5.031055900621118e-06, "loss": 0.2818, "step": 163 }, { "epoch": 0.1531279178338002, "grad_norm": 4.771524057473491, "learning_rate": 5.062111801242236e-06, "loss": 0.1742, "step": 164 }, { "epoch": 0.15406162464985995, "grad_norm": 6.899047780620031, "learning_rate": 5.093167701863354e-06, "loss": 0.1904, "step": 165 }, { "epoch": 0.1549953314659197, "grad_norm": 4.958205257244631, "learning_rate": 5.124223602484473e-06, "loss": 0.1885, "step": 166 }, { "epoch": 0.15592903828197946, "grad_norm": 4.919880525283771, "learning_rate": 5.155279503105591e-06, "loss": 0.1288, "step": 167 }, { "epoch": 0.1568627450980392, "grad_norm": 5.684587723470519, "learning_rate": 5.186335403726709e-06, "loss": 0.1766, "step": 168 }, { "epoch": 0.15779645191409897, "grad_norm": 5.380401414391444, "learning_rate": 5.2173913043478265e-06, "loss": 0.1239, "step": 169 }, { "epoch": 0.15873015873015872, "grad_norm": 8.655046483557623, "learning_rate": 5.248447204968945e-06, "loss": 0.2321, "step": 170 }, { "epoch": 0.15966386554621848, "grad_norm": 5.226081447110192, "learning_rate": 5.279503105590062e-06, "loss": 0.1414, "step": 171 }, { "epoch": 0.16059757236227826, "grad_norm": 9.126533533687125, "learning_rate": 5.31055900621118e-06, "loss": 0.3414, "step": 172 }, { "epoch": 0.161531279178338, "grad_norm": 6.396353507711828, "learning_rate": 5.341614906832298e-06, "loss": 0.1427, "step": 173 }, { "epoch": 0.16246498599439776, "grad_norm": 10.631060974979992, "learning_rate": 5.372670807453416e-06, "loss": 0.3133, "step": 174 }, { "epoch": 0.16339869281045752, "grad_norm": 13.380314583869879, "learning_rate": 5.403726708074535e-06, "loss": 0.2988, "step": 175 }, { "epoch": 0.16433239962651727, "grad_norm": 7.847945335450852, "learning_rate": 5.4347826086956525e-06, "loss": 0.1565, "step": 176 }, { "epoch": 0.16526610644257703, "grad_norm": 8.07062448428823, "learning_rate": 5.465838509316771e-06, "loss": 0.3591, "step": 177 }, { "epoch": 0.16619981325863678, "grad_norm": 10.55649472459891, "learning_rate": 5.4968944099378884e-06, "loss": 0.3099, "step": 178 }, { "epoch": 0.16713352007469653, "grad_norm": 7.074106747432758, "learning_rate": 5.527950310559007e-06, "loss": 0.0766, "step": 179 }, { "epoch": 0.16806722689075632, "grad_norm": 8.162479520432445, "learning_rate": 5.559006211180124e-06, "loss": 0.3417, "step": 180 }, { "epoch": 0.16900093370681607, "grad_norm": 5.505536734694231, "learning_rate": 5.590062111801242e-06, "loss": 0.2212, "step": 181 }, { "epoch": 0.16993464052287582, "grad_norm": 6.838106639384378, "learning_rate": 5.621118012422361e-06, "loss": 0.2656, "step": 182 }, { "epoch": 0.17086834733893558, "grad_norm": 11.752547146528359, "learning_rate": 5.652173913043479e-06, "loss": 0.2573, "step": 183 }, { "epoch": 0.17180205415499533, "grad_norm": 5.661199371566618, "learning_rate": 5.683229813664597e-06, "loss": 0.3045, "step": 184 }, { "epoch": 0.17273576097105509, "grad_norm": 5.6269233830351215, "learning_rate": 5.7142857142857145e-06, "loss": 0.1471, "step": 185 }, { "epoch": 0.17366946778711484, "grad_norm": 5.290042076991005, "learning_rate": 5.745341614906833e-06, "loss": 0.0985, "step": 186 }, { "epoch": 0.1746031746031746, "grad_norm": 4.80892953451703, "learning_rate": 5.77639751552795e-06, "loss": 0.0963, "step": 187 }, { "epoch": 0.17553688141923435, "grad_norm": 10.610811894515109, "learning_rate": 5.80745341614907e-06, "loss": 0.2631, "step": 188 }, { "epoch": 0.17647058823529413, "grad_norm": 5.882603810221271, "learning_rate": 5.838509316770186e-06, "loss": 0.132, "step": 189 }, { "epoch": 0.17740429505135388, "grad_norm": 13.143833320079677, "learning_rate": 5.8695652173913055e-06, "loss": 0.4008, "step": 190 }, { "epoch": 0.17833800186741364, "grad_norm": 6.993871239488991, "learning_rate": 5.900621118012423e-06, "loss": 0.2621, "step": 191 }, { "epoch": 0.1792717086834734, "grad_norm": 10.788987305788343, "learning_rate": 5.931677018633541e-06, "loss": 0.1856, "step": 192 }, { "epoch": 0.18020541549953314, "grad_norm": 5.52310305553004, "learning_rate": 5.962732919254659e-06, "loss": 0.2406, "step": 193 }, { "epoch": 0.1811391223155929, "grad_norm": 5.924911025459489, "learning_rate": 5.9937888198757765e-06, "loss": 0.2583, "step": 194 }, { "epoch": 0.18207282913165265, "grad_norm": 5.155728048220187, "learning_rate": 6.024844720496895e-06, "loss": 0.1684, "step": 195 }, { "epoch": 0.1830065359477124, "grad_norm": 6.28125635245583, "learning_rate": 6.055900621118012e-06, "loss": 0.2722, "step": 196 }, { "epoch": 0.1839402427637722, "grad_norm": 7.119623393596324, "learning_rate": 6.086956521739132e-06, "loss": 0.2748, "step": 197 }, { "epoch": 0.18487394957983194, "grad_norm": 8.533152537261836, "learning_rate": 6.118012422360249e-06, "loss": 0.3696, "step": 198 }, { "epoch": 0.1858076563958917, "grad_norm": 5.0317593218139764, "learning_rate": 6.1490683229813675e-06, "loss": 0.146, "step": 199 }, { "epoch": 0.18674136321195145, "grad_norm": 5.30893368730107, "learning_rate": 6.180124223602485e-06, "loss": 0.201, "step": 200 }, { "epoch": 0.1876750700280112, "grad_norm": 5.616900515871352, "learning_rate": 6.2111801242236025e-06, "loss": 0.2007, "step": 201 }, { "epoch": 0.18860877684407096, "grad_norm": 4.318337195009417, "learning_rate": 6.242236024844721e-06, "loss": 0.183, "step": 202 }, { "epoch": 0.1895424836601307, "grad_norm": 5.98626867702937, "learning_rate": 6.2732919254658384e-06, "loss": 0.282, "step": 203 }, { "epoch": 0.19047619047619047, "grad_norm": 4.365177073920947, "learning_rate": 6.304347826086958e-06, "loss": 0.1978, "step": 204 }, { "epoch": 0.19140989729225025, "grad_norm": 4.029619203861666, "learning_rate": 6.335403726708075e-06, "loss": 0.2428, "step": 205 }, { "epoch": 0.19234360410831, "grad_norm": 4.626582411118401, "learning_rate": 6.3664596273291936e-06, "loss": 0.1804, "step": 206 }, { "epoch": 0.19327731092436976, "grad_norm": 6.593385379136394, "learning_rate": 6.397515527950311e-06, "loss": 0.2398, "step": 207 }, { "epoch": 0.1942110177404295, "grad_norm": 8.642768386780622, "learning_rate": 6.4285714285714295e-06, "loss": 0.3105, "step": 208 }, { "epoch": 0.19514472455648926, "grad_norm": 4.4598344925871105, "learning_rate": 6.459627329192547e-06, "loss": 0.1366, "step": 209 }, { "epoch": 0.19607843137254902, "grad_norm": 4.913577218462294, "learning_rate": 6.4906832298136645e-06, "loss": 0.0952, "step": 210 }, { "epoch": 0.19701213818860877, "grad_norm": 9.50057070240653, "learning_rate": 6.521739130434783e-06, "loss": 0.2998, "step": 211 }, { "epoch": 0.19794584500466852, "grad_norm": 7.913407420389643, "learning_rate": 6.5527950310559e-06, "loss": 0.2907, "step": 212 }, { "epoch": 0.19887955182072828, "grad_norm": 6.776313787395195, "learning_rate": 6.58385093167702e-06, "loss": 0.3156, "step": 213 }, { "epoch": 0.19981325863678806, "grad_norm": 9.530250649196915, "learning_rate": 6.614906832298137e-06, "loss": 0.3115, "step": 214 }, { "epoch": 0.20074696545284781, "grad_norm": 5.111655406536103, "learning_rate": 6.6459627329192555e-06, "loss": 0.2419, "step": 215 }, { "epoch": 0.20168067226890757, "grad_norm": 4.080828700286765, "learning_rate": 6.677018633540373e-06, "loss": 0.1055, "step": 216 }, { "epoch": 0.20261437908496732, "grad_norm": 4.582633037631629, "learning_rate": 6.7080745341614914e-06, "loss": 0.1736, "step": 217 }, { "epoch": 0.20354808590102708, "grad_norm": 6.606124440756107, "learning_rate": 6.739130434782609e-06, "loss": 0.1442, "step": 218 }, { "epoch": 0.20448179271708683, "grad_norm": 6.5084545811031065, "learning_rate": 6.7701863354037265e-06, "loss": 0.3279, "step": 219 }, { "epoch": 0.20541549953314658, "grad_norm": 4.0857843171820045, "learning_rate": 6.801242236024846e-06, "loss": 0.0926, "step": 220 }, { "epoch": 0.20634920634920634, "grad_norm": 6.364206826689976, "learning_rate": 6.832298136645963e-06, "loss": 0.2724, "step": 221 }, { "epoch": 0.20728291316526612, "grad_norm": 6.185596423305357, "learning_rate": 6.863354037267082e-06, "loss": 0.3021, "step": 222 }, { "epoch": 0.20821661998132587, "grad_norm": 5.2651387778796925, "learning_rate": 6.894409937888199e-06, "loss": 0.219, "step": 223 }, { "epoch": 0.20915032679738563, "grad_norm": 4.521832458248844, "learning_rate": 6.9254658385093175e-06, "loss": 0.2098, "step": 224 }, { "epoch": 0.21008403361344538, "grad_norm": 6.489617886504977, "learning_rate": 6.956521739130435e-06, "loss": 0.2813, "step": 225 }, { "epoch": 0.21101774042950514, "grad_norm": 6.930607296646899, "learning_rate": 6.987577639751553e-06, "loss": 0.3458, "step": 226 }, { "epoch": 0.2119514472455649, "grad_norm": 5.504205051279705, "learning_rate": 7.018633540372671e-06, "loss": 0.254, "step": 227 }, { "epoch": 0.21288515406162464, "grad_norm": 8.168536695159332, "learning_rate": 7.04968944099379e-06, "loss": 0.3565, "step": 228 }, { "epoch": 0.2138188608776844, "grad_norm": 6.049253626870813, "learning_rate": 7.080745341614908e-06, "loss": 0.2541, "step": 229 }, { "epoch": 0.21475256769374415, "grad_norm": 4.385787086993419, "learning_rate": 7.111801242236025e-06, "loss": 0.1496, "step": 230 }, { "epoch": 0.21568627450980393, "grad_norm": 4.519480122303107, "learning_rate": 7.1428571428571436e-06, "loss": 0.1277, "step": 231 }, { "epoch": 0.2166199813258637, "grad_norm": 4.673277473556723, "learning_rate": 7.173913043478261e-06, "loss": 0.1534, "step": 232 }, { "epoch": 0.21755368814192344, "grad_norm": 3.7532396245764157, "learning_rate": 7.2049689440993795e-06, "loss": 0.105, "step": 233 }, { "epoch": 0.2184873949579832, "grad_norm": 5.277810449938521, "learning_rate": 7.236024844720497e-06, "loss": 0.1532, "step": 234 }, { "epoch": 0.21942110177404295, "grad_norm": 5.465513735554339, "learning_rate": 7.267080745341616e-06, "loss": 0.2034, "step": 235 }, { "epoch": 0.2203548085901027, "grad_norm": 4.320796275483751, "learning_rate": 7.298136645962734e-06, "loss": 0.1612, "step": 236 }, { "epoch": 0.22128851540616246, "grad_norm": 12.491579308147879, "learning_rate": 7.329192546583852e-06, "loss": 0.3272, "step": 237 }, { "epoch": 0.2222222222222222, "grad_norm": 5.982391515917491, "learning_rate": 7.36024844720497e-06, "loss": 0.1867, "step": 238 }, { "epoch": 0.223155929038282, "grad_norm": 6.4731138386503835, "learning_rate": 7.391304347826087e-06, "loss": 0.2175, "step": 239 }, { "epoch": 0.22408963585434175, "grad_norm": 8.668211740303237, "learning_rate": 7.4223602484472055e-06, "loss": 0.2482, "step": 240 }, { "epoch": 0.2250233426704015, "grad_norm": 16.428475601532387, "learning_rate": 7.453416149068323e-06, "loss": 0.2794, "step": 241 }, { "epoch": 0.22595704948646125, "grad_norm": 5.600510992574216, "learning_rate": 7.484472049689442e-06, "loss": 0.0819, "step": 242 }, { "epoch": 0.226890756302521, "grad_norm": 5.854914382283012, "learning_rate": 7.515527950310559e-06, "loss": 0.1816, "step": 243 }, { "epoch": 0.22782446311858076, "grad_norm": 3.3668144399984508, "learning_rate": 7.546583850931678e-06, "loss": 0.0844, "step": 244 }, { "epoch": 0.22875816993464052, "grad_norm": 6.992780526944702, "learning_rate": 7.577639751552796e-06, "loss": 0.3247, "step": 245 }, { "epoch": 0.22969187675070027, "grad_norm": 3.5029310245437815, "learning_rate": 7.608695652173914e-06, "loss": 0.1477, "step": 246 }, { "epoch": 0.23062558356676005, "grad_norm": 5.755680822522221, "learning_rate": 7.639751552795032e-06, "loss": 0.1495, "step": 247 }, { "epoch": 0.2315592903828198, "grad_norm": 5.266591670791916, "learning_rate": 7.670807453416149e-06, "loss": 0.2638, "step": 248 }, { "epoch": 0.23249299719887956, "grad_norm": 5.895270450400852, "learning_rate": 7.701863354037268e-06, "loss": 0.1543, "step": 249 }, { "epoch": 0.2334267040149393, "grad_norm": 3.632395794202594, "learning_rate": 7.732919254658386e-06, "loss": 0.1738, "step": 250 }, { "epoch": 0.23436041083099907, "grad_norm": 9.789185085769644, "learning_rate": 7.763975155279503e-06, "loss": 0.2019, "step": 251 }, { "epoch": 0.23529411764705882, "grad_norm": 4.2927460140121845, "learning_rate": 7.795031055900621e-06, "loss": 0.1984, "step": 252 }, { "epoch": 0.23622782446311857, "grad_norm": 4.660651452362528, "learning_rate": 7.82608695652174e-06, "loss": 0.1877, "step": 253 }, { "epoch": 0.23716153127917833, "grad_norm": 5.317961199459378, "learning_rate": 7.857142857142858e-06, "loss": 0.222, "step": 254 }, { "epoch": 0.23809523809523808, "grad_norm": 5.384257411485096, "learning_rate": 7.888198757763977e-06, "loss": 0.2963, "step": 255 }, { "epoch": 0.23902894491129786, "grad_norm": 3.6469091719562363, "learning_rate": 7.919254658385094e-06, "loss": 0.1368, "step": 256 }, { "epoch": 0.23996265172735762, "grad_norm": 3.6362734235374785, "learning_rate": 7.950310559006212e-06, "loss": 0.2549, "step": 257 }, { "epoch": 0.24089635854341737, "grad_norm": 4.419261845304813, "learning_rate": 7.98136645962733e-06, "loss": 0.1397, "step": 258 }, { "epoch": 0.24183006535947713, "grad_norm": 4.547012810718411, "learning_rate": 8.012422360248447e-06, "loss": 0.1574, "step": 259 }, { "epoch": 0.24276377217553688, "grad_norm": 4.966997291800919, "learning_rate": 8.043478260869566e-06, "loss": 0.1502, "step": 260 }, { "epoch": 0.24369747899159663, "grad_norm": 3.8015297701276074, "learning_rate": 8.074534161490684e-06, "loss": 0.199, "step": 261 }, { "epoch": 0.2446311858076564, "grad_norm": 4.126218303214326, "learning_rate": 8.105590062111803e-06, "loss": 0.1847, "step": 262 }, { "epoch": 0.24556489262371614, "grad_norm": 7.707868696135542, "learning_rate": 8.13664596273292e-06, "loss": 0.2705, "step": 263 }, { "epoch": 0.24649859943977592, "grad_norm": 3.869984247913388, "learning_rate": 8.167701863354038e-06, "loss": 0.1041, "step": 264 }, { "epoch": 0.24743230625583568, "grad_norm": 3.466524199274959, "learning_rate": 8.198757763975156e-06, "loss": 0.1181, "step": 265 }, { "epoch": 0.24836601307189543, "grad_norm": 5.289105644675294, "learning_rate": 8.229813664596275e-06, "loss": 0.1645, "step": 266 }, { "epoch": 0.24929971988795518, "grad_norm": 7.31841986506117, "learning_rate": 8.260869565217392e-06, "loss": 0.2881, "step": 267 }, { "epoch": 0.25023342670401494, "grad_norm": 3.5616026929050433, "learning_rate": 8.29192546583851e-06, "loss": 0.0832, "step": 268 }, { "epoch": 0.2511671335200747, "grad_norm": 6.240194535222705, "learning_rate": 8.322981366459629e-06, "loss": 0.2269, "step": 269 }, { "epoch": 0.25210084033613445, "grad_norm": 5.090564937456406, "learning_rate": 8.354037267080745e-06, "loss": 0.2652, "step": 270 }, { "epoch": 0.2530345471521942, "grad_norm": 3.069601443140852, "learning_rate": 8.385093167701864e-06, "loss": 0.0918, "step": 271 }, { "epoch": 0.25396825396825395, "grad_norm": 9.165576106584918, "learning_rate": 8.416149068322982e-06, "loss": 0.3709, "step": 272 }, { "epoch": 0.2549019607843137, "grad_norm": 3.8591796906093236, "learning_rate": 8.4472049689441e-06, "loss": 0.1175, "step": 273 }, { "epoch": 0.25583566760037346, "grad_norm": 6.4340012890950895, "learning_rate": 8.478260869565218e-06, "loss": 0.3037, "step": 274 }, { "epoch": 0.2567693744164332, "grad_norm": 3.373513244382824, "learning_rate": 8.509316770186336e-06, "loss": 0.1485, "step": 275 }, { "epoch": 0.25770308123249297, "grad_norm": 4.759224817070829, "learning_rate": 8.540372670807453e-06, "loss": 0.2226, "step": 276 }, { "epoch": 0.2586367880485528, "grad_norm": 5.077237550146845, "learning_rate": 8.571428571428571e-06, "loss": 0.2689, "step": 277 }, { "epoch": 0.25957049486461253, "grad_norm": 5.165562632475677, "learning_rate": 8.60248447204969e-06, "loss": 0.2114, "step": 278 }, { "epoch": 0.2605042016806723, "grad_norm": 4.023901597068931, "learning_rate": 8.633540372670808e-06, "loss": 0.0926, "step": 279 }, { "epoch": 0.26143790849673204, "grad_norm": 6.798832857235012, "learning_rate": 8.664596273291927e-06, "loss": 0.2344, "step": 280 }, { "epoch": 0.2623716153127918, "grad_norm": 9.144125067563566, "learning_rate": 8.695652173913044e-06, "loss": 0.3501, "step": 281 }, { "epoch": 0.26330532212885155, "grad_norm": 6.762173014181721, "learning_rate": 8.726708074534162e-06, "loss": 0.2827, "step": 282 }, { "epoch": 0.2642390289449113, "grad_norm": 5.2854008631136935, "learning_rate": 8.75776397515528e-06, "loss": 0.207, "step": 283 }, { "epoch": 0.26517273576097106, "grad_norm": 6.06186570017828, "learning_rate": 8.788819875776399e-06, "loss": 0.1893, "step": 284 }, { "epoch": 0.2661064425770308, "grad_norm": 5.796517548413642, "learning_rate": 8.819875776397516e-06, "loss": 0.2458, "step": 285 }, { "epoch": 0.26704014939309056, "grad_norm": 4.843633816938901, "learning_rate": 8.850931677018634e-06, "loss": 0.2073, "step": 286 }, { "epoch": 0.2679738562091503, "grad_norm": 6.138790246589221, "learning_rate": 8.881987577639753e-06, "loss": 0.2285, "step": 287 }, { "epoch": 0.2689075630252101, "grad_norm": 4.811845592208753, "learning_rate": 8.91304347826087e-06, "loss": 0.1344, "step": 288 }, { "epoch": 0.2698412698412698, "grad_norm": 5.103476921167675, "learning_rate": 8.944099378881988e-06, "loss": 0.173, "step": 289 }, { "epoch": 0.2707749766573296, "grad_norm": 3.7199629227686644, "learning_rate": 8.975155279503106e-06, "loss": 0.1252, "step": 290 }, { "epoch": 0.27170868347338933, "grad_norm": 5.517090942754365, "learning_rate": 9.006211180124225e-06, "loss": 0.1753, "step": 291 }, { "epoch": 0.2726423902894491, "grad_norm": 6.359631706536928, "learning_rate": 9.037267080745342e-06, "loss": 0.3006, "step": 292 }, { "epoch": 0.2735760971055089, "grad_norm": 3.5056279746892383, "learning_rate": 9.068322981366461e-06, "loss": 0.168, "step": 293 }, { "epoch": 0.27450980392156865, "grad_norm": 6.847966844547178, "learning_rate": 9.099378881987579e-06, "loss": 0.2564, "step": 294 }, { "epoch": 0.2754435107376284, "grad_norm": 4.236947558501779, "learning_rate": 9.130434782608697e-06, "loss": 0.1894, "step": 295 }, { "epoch": 0.27637721755368816, "grad_norm": 4.597496564136178, "learning_rate": 9.161490683229814e-06, "loss": 0.1183, "step": 296 }, { "epoch": 0.2773109243697479, "grad_norm": 10.507247313366493, "learning_rate": 9.192546583850932e-06, "loss": 0.346, "step": 297 }, { "epoch": 0.27824463118580767, "grad_norm": 69.42119822954322, "learning_rate": 9.22360248447205e-06, "loss": 0.9198, "step": 298 }, { "epoch": 0.2791783380018674, "grad_norm": 5.730699575313489, "learning_rate": 9.254658385093168e-06, "loss": 0.2044, "step": 299 }, { "epoch": 0.2801120448179272, "grad_norm": 9.074975598002657, "learning_rate": 9.285714285714288e-06, "loss": 0.3204, "step": 300 }, { "epoch": 0.28104575163398693, "grad_norm": 6.223407247352979, "learning_rate": 9.316770186335405e-06, "loss": 0.1704, "step": 301 }, { "epoch": 0.2819794584500467, "grad_norm": 4.185490662659638, "learning_rate": 9.347826086956523e-06, "loss": 0.0859, "step": 302 }, { "epoch": 0.28291316526610644, "grad_norm": 3.8528631184674933, "learning_rate": 9.37888198757764e-06, "loss": 0.2055, "step": 303 }, { "epoch": 0.2838468720821662, "grad_norm": 4.670800621882704, "learning_rate": 9.40993788819876e-06, "loss": 0.2612, "step": 304 }, { "epoch": 0.28478057889822594, "grad_norm": 7.497122505835605, "learning_rate": 9.440993788819877e-06, "loss": 0.3462, "step": 305 }, { "epoch": 0.2857142857142857, "grad_norm": 5.192492031190614, "learning_rate": 9.472049689440994e-06, "loss": 0.2551, "step": 306 }, { "epoch": 0.28664799253034545, "grad_norm": 5.603270360198832, "learning_rate": 9.503105590062112e-06, "loss": 0.3498, "step": 307 }, { "epoch": 0.2875816993464052, "grad_norm": 4.757311250030508, "learning_rate": 9.53416149068323e-06, "loss": 0.1952, "step": 308 }, { "epoch": 0.28851540616246496, "grad_norm": 3.7515530630866936, "learning_rate": 9.565217391304349e-06, "loss": 0.2156, "step": 309 }, { "epoch": 0.28944911297852477, "grad_norm": 3.5770780429391658, "learning_rate": 9.596273291925466e-06, "loss": 0.1524, "step": 310 }, { "epoch": 0.2903828197945845, "grad_norm": 4.577012764181959, "learning_rate": 9.627329192546585e-06, "loss": 0.2363, "step": 311 }, { "epoch": 0.2913165266106443, "grad_norm": 9.204672775445022, "learning_rate": 9.658385093167703e-06, "loss": 0.4537, "step": 312 }, { "epoch": 0.29225023342670403, "grad_norm": 5.7336968782302735, "learning_rate": 9.68944099378882e-06, "loss": 0.1572, "step": 313 }, { "epoch": 0.2931839402427638, "grad_norm": 5.879283897334004, "learning_rate": 9.720496894409938e-06, "loss": 0.1336, "step": 314 }, { "epoch": 0.29411764705882354, "grad_norm": 10.767486232654266, "learning_rate": 9.751552795031056e-06, "loss": 0.4026, "step": 315 }, { "epoch": 0.2950513538748833, "grad_norm": 6.958564140849953, "learning_rate": 9.782608695652175e-06, "loss": 0.1557, "step": 316 }, { "epoch": 0.29598506069094305, "grad_norm": 7.356508897091506, "learning_rate": 9.813664596273292e-06, "loss": 0.1922, "step": 317 }, { "epoch": 0.2969187675070028, "grad_norm": 5.931910668970306, "learning_rate": 9.844720496894411e-06, "loss": 0.3268, "step": 318 }, { "epoch": 0.29785247432306255, "grad_norm": 5.26331397132265, "learning_rate": 9.875776397515529e-06, "loss": 0.3235, "step": 319 }, { "epoch": 0.2987861811391223, "grad_norm": 2.6782053122813285, "learning_rate": 9.906832298136647e-06, "loss": 0.0863, "step": 320 }, { "epoch": 0.29971988795518206, "grad_norm": 3.8106592746620627, "learning_rate": 9.937888198757764e-06, "loss": 0.1753, "step": 321 }, { "epoch": 0.3006535947712418, "grad_norm": 2.379049305689277, "learning_rate": 9.968944099378883e-06, "loss": 0.113, "step": 322 }, { "epoch": 0.30158730158730157, "grad_norm": 3.252212973141822, "learning_rate": 1e-05, "loss": 0.1088, "step": 323 }, { "epoch": 0.3025210084033613, "grad_norm": 7.214982012468996, "learning_rate": 9.999999771347546e-06, "loss": 0.3966, "step": 324 }, { "epoch": 0.3034547152194211, "grad_norm": 2.4351849964990557, "learning_rate": 9.999999085390203e-06, "loss": 0.0813, "step": 325 }, { "epoch": 0.30438842203548083, "grad_norm": 4.880513081007667, "learning_rate": 9.99999794212803e-06, "loss": 0.2678, "step": 326 }, { "epoch": 0.30532212885154064, "grad_norm": 6.4813166172974706, "learning_rate": 9.999996341561138e-06, "loss": 0.3511, "step": 327 }, { "epoch": 0.3062558356676004, "grad_norm": 4.405014616105612, "learning_rate": 9.999994283689668e-06, "loss": 0.2179, "step": 328 }, { "epoch": 0.30718954248366015, "grad_norm": 5.051020259964896, "learning_rate": 9.999991768513814e-06, "loss": 0.2633, "step": 329 }, { "epoch": 0.3081232492997199, "grad_norm": 3.8809499677587027, "learning_rate": 9.999988796033801e-06, "loss": 0.1898, "step": 330 }, { "epoch": 0.30905695611577966, "grad_norm": 4.72649846735322, "learning_rate": 9.999985366249902e-06, "loss": 0.2339, "step": 331 }, { "epoch": 0.3099906629318394, "grad_norm": 5.780230324284134, "learning_rate": 9.999981479162432e-06, "loss": 0.2538, "step": 332 }, { "epoch": 0.31092436974789917, "grad_norm": 4.7771765580201, "learning_rate": 9.999977134771746e-06, "loss": 0.2051, "step": 333 }, { "epoch": 0.3118580765639589, "grad_norm": 5.711452509746245, "learning_rate": 9.99997233307824e-06, "loss": 0.3577, "step": 334 }, { "epoch": 0.3127917833800187, "grad_norm": 2.857615355319642, "learning_rate": 9.999967074082356e-06, "loss": 0.062, "step": 335 }, { "epoch": 0.3137254901960784, "grad_norm": 4.074981409826805, "learning_rate": 9.999961357784571e-06, "loss": 0.1262, "step": 336 }, { "epoch": 0.3146591970121382, "grad_norm": 4.197710033284552, "learning_rate": 9.999955184185413e-06, "loss": 0.1499, "step": 337 }, { "epoch": 0.31559290382819793, "grad_norm": 6.8303744343879, "learning_rate": 9.999948553285442e-06, "loss": 0.375, "step": 338 }, { "epoch": 0.3165266106442577, "grad_norm": 4.055286612944565, "learning_rate": 9.999941465085265e-06, "loss": 0.1444, "step": 339 }, { "epoch": 0.31746031746031744, "grad_norm": 3.6841659911636087, "learning_rate": 9.999933919585533e-06, "loss": 0.1953, "step": 340 }, { "epoch": 0.3183940242763772, "grad_norm": 2.5589253632566105, "learning_rate": 9.999925916786934e-06, "loss": 0.1259, "step": 341 }, { "epoch": 0.31932773109243695, "grad_norm": 5.636380793697199, "learning_rate": 9.999917456690203e-06, "loss": 0.2514, "step": 342 }, { "epoch": 0.3202614379084967, "grad_norm": 3.8942690013870522, "learning_rate": 9.999908539296109e-06, "loss": 0.1837, "step": 343 }, { "epoch": 0.3211951447245565, "grad_norm": 3.5352875123434764, "learning_rate": 9.999899164605469e-06, "loss": 0.1605, "step": 344 }, { "epoch": 0.32212885154061627, "grad_norm": 4.748429401482719, "learning_rate": 9.999889332619144e-06, "loss": 0.2435, "step": 345 }, { "epoch": 0.323062558356676, "grad_norm": 5.736892582651714, "learning_rate": 9.999879043338027e-06, "loss": 0.2543, "step": 346 }, { "epoch": 0.3239962651727358, "grad_norm": 4.054243382027467, "learning_rate": 9.999868296763067e-06, "loss": 0.2348, "step": 347 }, { "epoch": 0.32492997198879553, "grad_norm": 5.205413459056311, "learning_rate": 9.99985709289524e-06, "loss": 0.2231, "step": 348 }, { "epoch": 0.3258636788048553, "grad_norm": 7.072911203985967, "learning_rate": 9.999845431735573e-06, "loss": 0.2984, "step": 349 }, { "epoch": 0.32679738562091504, "grad_norm": 3.617808545106347, "learning_rate": 9.999833313285135e-06, "loss": 0.1455, "step": 350 }, { "epoch": 0.3277310924369748, "grad_norm": 3.801676848395678, "learning_rate": 9.99982073754503e-06, "loss": 0.1723, "step": 351 }, { "epoch": 0.32866479925303455, "grad_norm": 3.319104395249914, "learning_rate": 9.999807704516411e-06, "loss": 0.159, "step": 352 }, { "epoch": 0.3295985060690943, "grad_norm": 3.527903981429738, "learning_rate": 9.999794214200469e-06, "loss": 0.1334, "step": 353 }, { "epoch": 0.33053221288515405, "grad_norm": 3.4187759197305274, "learning_rate": 9.999780266598437e-06, "loss": 0.176, "step": 354 }, { "epoch": 0.3314659197012138, "grad_norm": 2.5293889909328517, "learning_rate": 9.999765861711594e-06, "loss": 0.1398, "step": 355 }, { "epoch": 0.33239962651727356, "grad_norm": 5.134730124589937, "learning_rate": 9.999750999541255e-06, "loss": 0.225, "step": 356 }, { "epoch": 0.3333333333333333, "grad_norm": 3.971959918234676, "learning_rate": 9.99973568008878e-06, "loss": 0.2629, "step": 357 }, { "epoch": 0.33426704014939307, "grad_norm": 7.25056074371194, "learning_rate": 9.999719903355568e-06, "loss": 0.2619, "step": 358 }, { "epoch": 0.3352007469654528, "grad_norm": 2.3057057095836595, "learning_rate": 9.999703669343067e-06, "loss": 0.0746, "step": 359 }, { "epoch": 0.33613445378151263, "grad_norm": 1.8046357958375123, "learning_rate": 9.999686978052756e-06, "loss": 0.0461, "step": 360 }, { "epoch": 0.3370681605975724, "grad_norm": 4.247270547917305, "learning_rate": 9.999669829486165e-06, "loss": 0.172, "step": 361 }, { "epoch": 0.33800186741363214, "grad_norm": 4.600281187363716, "learning_rate": 9.99965222364486e-06, "loss": 0.2006, "step": 362 }, { "epoch": 0.3389355742296919, "grad_norm": 3.7148023212307337, "learning_rate": 9.999634160530455e-06, "loss": 0.1311, "step": 363 }, { "epoch": 0.33986928104575165, "grad_norm": 3.8245569896250164, "learning_rate": 9.999615640144599e-06, "loss": 0.1521, "step": 364 }, { "epoch": 0.3408029878618114, "grad_norm": 5.018027767305163, "learning_rate": 9.999596662488987e-06, "loss": 0.2807, "step": 365 }, { "epoch": 0.34173669467787116, "grad_norm": 3.9451461504587186, "learning_rate": 9.999577227565354e-06, "loss": 0.2087, "step": 366 }, { "epoch": 0.3426704014939309, "grad_norm": 6.090497740041675, "learning_rate": 9.999557335375478e-06, "loss": 0.167, "step": 367 }, { "epoch": 0.34360410830999066, "grad_norm": 5.959713233407351, "learning_rate": 9.999536985921178e-06, "loss": 0.2544, "step": 368 }, { "epoch": 0.3445378151260504, "grad_norm": 2.0195039858334156, "learning_rate": 9.999516179204316e-06, "loss": 0.081, "step": 369 }, { "epoch": 0.34547152194211017, "grad_norm": 4.099961444240055, "learning_rate": 9.999494915226796e-06, "loss": 0.2586, "step": 370 }, { "epoch": 0.3464052287581699, "grad_norm": 3.0297593853114013, "learning_rate": 9.99947319399056e-06, "loss": 0.0917, "step": 371 }, { "epoch": 0.3473389355742297, "grad_norm": 3.776704055661631, "learning_rate": 9.999451015497595e-06, "loss": 0.1744, "step": 372 }, { "epoch": 0.34827264239028943, "grad_norm": 6.802918169183637, "learning_rate": 9.999428379749932e-06, "loss": 0.4089, "step": 373 }, { "epoch": 0.3492063492063492, "grad_norm": 2.6866118796299308, "learning_rate": 9.99940528674964e-06, "loss": 0.0691, "step": 374 }, { "epoch": 0.35014005602240894, "grad_norm": 3.7630999515182832, "learning_rate": 9.99938173649883e-06, "loss": 0.0706, "step": 375 }, { "epoch": 0.3510737628384687, "grad_norm": 3.834233526956242, "learning_rate": 9.999357728999657e-06, "loss": 0.1308, "step": 376 }, { "epoch": 0.3520074696545285, "grad_norm": 3.597711857860343, "learning_rate": 9.999333264254316e-06, "loss": 0.1797, "step": 377 }, { "epoch": 0.35294117647058826, "grad_norm": 4.433806583296204, "learning_rate": 9.999308342265046e-06, "loss": 0.1492, "step": 378 }, { "epoch": 0.353874883286648, "grad_norm": 7.404260771275956, "learning_rate": 9.999282963034126e-06, "loss": 0.3156, "step": 379 }, { "epoch": 0.35480859010270777, "grad_norm": 3.322763146388615, "learning_rate": 9.999257126563876e-06, "loss": 0.0943, "step": 380 }, { "epoch": 0.3557422969187675, "grad_norm": 3.63109849921369, "learning_rate": 9.99923083285666e-06, "loss": 0.1371, "step": 381 }, { "epoch": 0.3566760037348273, "grad_norm": 3.1165822260582057, "learning_rate": 9.999204081914881e-06, "loss": 0.0993, "step": 382 }, { "epoch": 0.357609710550887, "grad_norm": 6.116510959489865, "learning_rate": 9.99917687374099e-06, "loss": 0.2811, "step": 383 }, { "epoch": 0.3585434173669468, "grad_norm": 4.234896093954266, "learning_rate": 9.999149208337472e-06, "loss": 0.1291, "step": 384 }, { "epoch": 0.35947712418300654, "grad_norm": 5.3513017202359405, "learning_rate": 9.999121085706857e-06, "loss": 0.194, "step": 385 }, { "epoch": 0.3604108309990663, "grad_norm": 2.557876245393736, "learning_rate": 9.99909250585172e-06, "loss": 0.0952, "step": 386 }, { "epoch": 0.36134453781512604, "grad_norm": 2.6647140360367847, "learning_rate": 9.999063468774672e-06, "loss": 0.0717, "step": 387 }, { "epoch": 0.3622782446311858, "grad_norm": 3.345342513746036, "learning_rate": 9.999033974478369e-06, "loss": 0.1585, "step": 388 }, { "epoch": 0.36321195144724555, "grad_norm": 3.47686916964757, "learning_rate": 9.99900402296551e-06, "loss": 0.1745, "step": 389 }, { "epoch": 0.3641456582633053, "grad_norm": 5.475279948664227, "learning_rate": 9.998973614238835e-06, "loss": 0.2946, "step": 390 }, { "epoch": 0.36507936507936506, "grad_norm": 3.777069124818802, "learning_rate": 9.998942748301123e-06, "loss": 0.1434, "step": 391 }, { "epoch": 0.3660130718954248, "grad_norm": 3.4547005209653294, "learning_rate": 9.9989114251552e-06, "loss": 0.1737, "step": 392 }, { "epoch": 0.36694677871148457, "grad_norm": 2.7753194726202026, "learning_rate": 9.998879644803927e-06, "loss": 0.156, "step": 393 }, { "epoch": 0.3678804855275444, "grad_norm": 3.6155107626615473, "learning_rate": 9.998847407250212e-06, "loss": 0.2221, "step": 394 }, { "epoch": 0.36881419234360413, "grad_norm": 4.777103812769329, "learning_rate": 9.998814712497007e-06, "loss": 0.2168, "step": 395 }, { "epoch": 0.3697478991596639, "grad_norm": 4.5861667415670855, "learning_rate": 9.998781560547297e-06, "loss": 0.1117, "step": 396 }, { "epoch": 0.37068160597572364, "grad_norm": 3.698493456987846, "learning_rate": 9.998747951404117e-06, "loss": 0.1282, "step": 397 }, { "epoch": 0.3716153127917834, "grad_norm": 2.907635675360706, "learning_rate": 9.998713885070541e-06, "loss": 0.1415, "step": 398 }, { "epoch": 0.37254901960784315, "grad_norm": 6.887710956980117, "learning_rate": 9.998679361549682e-06, "loss": 0.4458, "step": 399 }, { "epoch": 0.3734827264239029, "grad_norm": 3.090281465935092, "learning_rate": 9.998644380844702e-06, "loss": 0.1823, "step": 400 }, { "epoch": 0.37441643323996265, "grad_norm": 4.2087215974666865, "learning_rate": 9.998608942958798e-06, "loss": 0.2369, "step": 401 }, { "epoch": 0.3753501400560224, "grad_norm": 3.353444331966039, "learning_rate": 9.998573047895211e-06, "loss": 0.1871, "step": 402 }, { "epoch": 0.37628384687208216, "grad_norm": 1.9845874719722763, "learning_rate": 9.998536695657224e-06, "loss": 0.0864, "step": 403 }, { "epoch": 0.3772175536881419, "grad_norm": 3.1049038797458763, "learning_rate": 9.99849988624816e-06, "loss": 0.0956, "step": 404 }, { "epoch": 0.37815126050420167, "grad_norm": 3.6141939264820366, "learning_rate": 9.998462619671391e-06, "loss": 0.1698, "step": 405 }, { "epoch": 0.3790849673202614, "grad_norm": 3.3482147355557856, "learning_rate": 9.998424895930321e-06, "loss": 0.0831, "step": 406 }, { "epoch": 0.3800186741363212, "grad_norm": 12.937751125327633, "learning_rate": 9.998386715028403e-06, "loss": 0.4509, "step": 407 }, { "epoch": 0.38095238095238093, "grad_norm": 4.09688519118855, "learning_rate": 9.998348076969125e-06, "loss": 0.1707, "step": 408 }, { "epoch": 0.3818860877684407, "grad_norm": 3.8399014306065116, "learning_rate": 9.998308981756023e-06, "loss": 0.2189, "step": 409 }, { "epoch": 0.3828197945845005, "grad_norm": 6.280334896226895, "learning_rate": 9.998269429392676e-06, "loss": 0.1753, "step": 410 }, { "epoch": 0.38375350140056025, "grad_norm": 5.538045156790274, "learning_rate": 9.998229419882697e-06, "loss": 0.2954, "step": 411 }, { "epoch": 0.38468720821662, "grad_norm": 3.159003794231097, "learning_rate": 9.998188953229746e-06, "loss": 0.1604, "step": 412 }, { "epoch": 0.38562091503267976, "grad_norm": 3.6493958692450437, "learning_rate": 9.998148029437525e-06, "loss": 0.1445, "step": 413 }, { "epoch": 0.3865546218487395, "grad_norm": 5.752850171323055, "learning_rate": 9.998106648509778e-06, "loss": 0.2592, "step": 414 }, { "epoch": 0.38748832866479926, "grad_norm": 3.39200886308657, "learning_rate": 9.998064810450287e-06, "loss": 0.1282, "step": 415 }, { "epoch": 0.388422035480859, "grad_norm": 3.5386335555187842, "learning_rate": 9.998022515262881e-06, "loss": 0.0717, "step": 416 }, { "epoch": 0.38935574229691877, "grad_norm": 5.1334732596672215, "learning_rate": 9.997979762951428e-06, "loss": 0.2331, "step": 417 }, { "epoch": 0.3902894491129785, "grad_norm": 3.120423157246127, "learning_rate": 9.997936553519839e-06, "loss": 0.1368, "step": 418 }, { "epoch": 0.3912231559290383, "grad_norm": 4.725809012374557, "learning_rate": 9.997892886972063e-06, "loss": 0.1864, "step": 419 }, { "epoch": 0.39215686274509803, "grad_norm": 6.01311386355843, "learning_rate": 9.997848763312096e-06, "loss": 0.2997, "step": 420 }, { "epoch": 0.3930905695611578, "grad_norm": 3.1026055808757658, "learning_rate": 9.997804182543973e-06, "loss": 0.1342, "step": 421 }, { "epoch": 0.39402427637721754, "grad_norm": 3.1823563724685973, "learning_rate": 9.997759144671771e-06, "loss": 0.1124, "step": 422 }, { "epoch": 0.3949579831932773, "grad_norm": 5.568750382729109, "learning_rate": 9.99771364969961e-06, "loss": 0.2428, "step": 423 }, { "epoch": 0.39589169000933705, "grad_norm": 5.693644977304309, "learning_rate": 9.99766769763165e-06, "loss": 0.2484, "step": 424 }, { "epoch": 0.3968253968253968, "grad_norm": 7.001531493487862, "learning_rate": 9.997621288472095e-06, "loss": 0.1856, "step": 425 }, { "epoch": 0.39775910364145656, "grad_norm": 5.601061571573185, "learning_rate": 9.997574422225188e-06, "loss": 0.2198, "step": 426 }, { "epoch": 0.39869281045751637, "grad_norm": 8.898376603301367, "learning_rate": 9.99752709889522e-06, "loss": 0.1674, "step": 427 }, { "epoch": 0.3996265172735761, "grad_norm": 5.692660126539724, "learning_rate": 9.997479318486512e-06, "loss": 0.2564, "step": 428 }, { "epoch": 0.4005602240896359, "grad_norm": 7.2090440693161915, "learning_rate": 9.99743108100344e-06, "loss": 0.1698, "step": 429 }, { "epoch": 0.40149393090569563, "grad_norm": 4.112564397532336, "learning_rate": 9.997382386450414e-06, "loss": 0.0637, "step": 430 }, { "epoch": 0.4024276377217554, "grad_norm": 7.0390266560658965, "learning_rate": 9.997333234831886e-06, "loss": 0.2073, "step": 431 }, { "epoch": 0.40336134453781514, "grad_norm": 8.081200557718462, "learning_rate": 9.997283626152354e-06, "loss": 0.3172, "step": 432 }, { "epoch": 0.4042950513538749, "grad_norm": 4.895750824429287, "learning_rate": 9.997233560416353e-06, "loss": 0.3058, "step": 433 }, { "epoch": 0.40522875816993464, "grad_norm": 2.1545341923373282, "learning_rate": 9.997183037628463e-06, "loss": 0.0698, "step": 434 }, { "epoch": 0.4061624649859944, "grad_norm": 2.536024439786811, "learning_rate": 9.997132057793306e-06, "loss": 0.1401, "step": 435 }, { "epoch": 0.40709617180205415, "grad_norm": 3.2775065866420774, "learning_rate": 9.997080620915542e-06, "loss": 0.138, "step": 436 }, { "epoch": 0.4080298786181139, "grad_norm": 3.4222720618777536, "learning_rate": 9.997028726999877e-06, "loss": 0.0719, "step": 437 }, { "epoch": 0.40896358543417366, "grad_norm": 4.308232224018676, "learning_rate": 9.99697637605106e-06, "loss": 0.1554, "step": 438 }, { "epoch": 0.4098972922502334, "grad_norm": 5.018842661391261, "learning_rate": 9.996923568073875e-06, "loss": 0.2962, "step": 439 }, { "epoch": 0.41083099906629317, "grad_norm": 3.4711845961529413, "learning_rate": 9.996870303073154e-06, "loss": 0.1411, "step": 440 }, { "epoch": 0.4117647058823529, "grad_norm": 2.516661106450011, "learning_rate": 9.996816581053766e-06, "loss": 0.0831, "step": 441 }, { "epoch": 0.4126984126984127, "grad_norm": 5.376622596297656, "learning_rate": 9.996762402020627e-06, "loss": 0.2666, "step": 442 }, { "epoch": 0.41363211951447243, "grad_norm": 5.85130002069025, "learning_rate": 9.996707765978694e-06, "loss": 0.2194, "step": 443 }, { "epoch": 0.41456582633053224, "grad_norm": 4.343548990016964, "learning_rate": 9.99665267293296e-06, "loss": 0.2695, "step": 444 }, { "epoch": 0.415499533146592, "grad_norm": 7.902413300720541, "learning_rate": 9.996597122888467e-06, "loss": 0.3179, "step": 445 }, { "epoch": 0.41643323996265175, "grad_norm": 7.310896531830179, "learning_rate": 9.996541115850292e-06, "loss": 0.2775, "step": 446 }, { "epoch": 0.4173669467787115, "grad_norm": 3.372744953701315, "learning_rate": 9.996484651823562e-06, "loss": 0.0701, "step": 447 }, { "epoch": 0.41830065359477125, "grad_norm": 6.369913120899294, "learning_rate": 9.996427730813439e-06, "loss": 0.309, "step": 448 }, { "epoch": 0.419234360410831, "grad_norm": 3.1591817288437474, "learning_rate": 9.996370352825126e-06, "loss": 0.0587, "step": 449 }, { "epoch": 0.42016806722689076, "grad_norm": 4.249222269464909, "learning_rate": 9.996312517863878e-06, "loss": 0.1096, "step": 450 }, { "epoch": 0.4211017740429505, "grad_norm": 4.560476745280689, "learning_rate": 9.996254225934978e-06, "loss": 0.099, "step": 451 }, { "epoch": 0.42203548085901027, "grad_norm": 3.7707743490222847, "learning_rate": 9.996195477043759e-06, "loss": 0.1818, "step": 452 }, { "epoch": 0.42296918767507, "grad_norm": 5.756927993980001, "learning_rate": 9.996136271195598e-06, "loss": 0.1512, "step": 453 }, { "epoch": 0.4239028944911298, "grad_norm": 4.026105596308351, "learning_rate": 9.996076608395905e-06, "loss": 0.1409, "step": 454 }, { "epoch": 0.42483660130718953, "grad_norm": 6.389502977305879, "learning_rate": 9.996016488650142e-06, "loss": 0.3158, "step": 455 }, { "epoch": 0.4257703081232493, "grad_norm": 3.9288009493208893, "learning_rate": 9.995955911963804e-06, "loss": 0.2, "step": 456 }, { "epoch": 0.42670401493930904, "grad_norm": 4.717165371447782, "learning_rate": 9.99589487834243e-06, "loss": 0.1942, "step": 457 }, { "epoch": 0.4276377217553688, "grad_norm": 4.932002941292291, "learning_rate": 9.995833387791604e-06, "loss": 0.1075, "step": 458 }, { "epoch": 0.42857142857142855, "grad_norm": 3.4129052858382543, "learning_rate": 9.995771440316953e-06, "loss": 0.1248, "step": 459 }, { "epoch": 0.4295051353874883, "grad_norm": 3.8326440483406445, "learning_rate": 9.995709035924139e-06, "loss": 0.1689, "step": 460 }, { "epoch": 0.4304388422035481, "grad_norm": 5.68907861181186, "learning_rate": 9.99564617461887e-06, "loss": 0.1692, "step": 461 }, { "epoch": 0.43137254901960786, "grad_norm": 2.8258750197235245, "learning_rate": 9.995582856406897e-06, "loss": 0.0787, "step": 462 }, { "epoch": 0.4323062558356676, "grad_norm": 4.356764858735526, "learning_rate": 9.995519081294009e-06, "loss": 0.1415, "step": 463 }, { "epoch": 0.4332399626517274, "grad_norm": 5.8294434460045, "learning_rate": 9.995454849286042e-06, "loss": 0.1286, "step": 464 }, { "epoch": 0.4341736694677871, "grad_norm": 3.1417724687517916, "learning_rate": 9.995390160388867e-06, "loss": 0.1343, "step": 465 }, { "epoch": 0.4351073762838469, "grad_norm": 3.0027525857536412, "learning_rate": 9.995325014608402e-06, "loss": 0.1369, "step": 466 }, { "epoch": 0.43604108309990663, "grad_norm": 3.8470307598057003, "learning_rate": 9.995259411950607e-06, "loss": 0.1902, "step": 467 }, { "epoch": 0.4369747899159664, "grad_norm": 2.5468342647843603, "learning_rate": 9.99519335242148e-06, "loss": 0.0736, "step": 468 }, { "epoch": 0.43790849673202614, "grad_norm": 3.7223648998354997, "learning_rate": 9.995126836027063e-06, "loss": 0.1707, "step": 469 }, { "epoch": 0.4388422035480859, "grad_norm": 3.7659337878524877, "learning_rate": 9.99505986277344e-06, "loss": 0.1439, "step": 470 }, { "epoch": 0.43977591036414565, "grad_norm": 4.35569131427319, "learning_rate": 9.994992432666737e-06, "loss": 0.2532, "step": 471 }, { "epoch": 0.4407096171802054, "grad_norm": 5.084431858066571, "learning_rate": 9.994924545713121e-06, "loss": 0.2652, "step": 472 }, { "epoch": 0.44164332399626516, "grad_norm": 5.373607364070299, "learning_rate": 9.994856201918802e-06, "loss": 0.1976, "step": 473 }, { "epoch": 0.4425770308123249, "grad_norm": 4.073882852619903, "learning_rate": 9.99478740129003e-06, "loss": 0.2044, "step": 474 }, { "epoch": 0.44351073762838467, "grad_norm": 3.1887504571009098, "learning_rate": 9.994718143833095e-06, "loss": 0.1742, "step": 475 }, { "epoch": 0.4444444444444444, "grad_norm": 5.138668827280845, "learning_rate": 9.994648429554335e-06, "loss": 0.2903, "step": 476 }, { "epoch": 0.44537815126050423, "grad_norm": 3.911520574111662, "learning_rate": 9.994578258460124e-06, "loss": 0.1203, "step": 477 }, { "epoch": 0.446311858076564, "grad_norm": 4.895444818602745, "learning_rate": 9.99450763055688e-06, "loss": 0.2153, "step": 478 }, { "epoch": 0.44724556489262374, "grad_norm": 3.389576828009551, "learning_rate": 9.994436545851064e-06, "loss": 0.1231, "step": 479 }, { "epoch": 0.4481792717086835, "grad_norm": 3.067893925872565, "learning_rate": 9.994365004349178e-06, "loss": 0.1397, "step": 480 }, { "epoch": 0.44911297852474324, "grad_norm": 3.101276783127866, "learning_rate": 9.994293006057762e-06, "loss": 0.0765, "step": 481 }, { "epoch": 0.450046685340803, "grad_norm": 5.11993235889534, "learning_rate": 9.994220550983404e-06, "loss": 0.2067, "step": 482 }, { "epoch": 0.45098039215686275, "grad_norm": 10.870514175877474, "learning_rate": 9.994147639132732e-06, "loss": 0.3653, "step": 483 }, { "epoch": 0.4519140989729225, "grad_norm": 2.244920322215628, "learning_rate": 9.99407427051241e-06, "loss": 0.0849, "step": 484 }, { "epoch": 0.45284780578898226, "grad_norm": 4.613514873769769, "learning_rate": 9.994000445129152e-06, "loss": 0.2182, "step": 485 }, { "epoch": 0.453781512605042, "grad_norm": 6.856739011883591, "learning_rate": 9.993926162989709e-06, "loss": 0.1331, "step": 486 }, { "epoch": 0.45471521942110177, "grad_norm": 2.409348026488325, "learning_rate": 9.993851424100874e-06, "loss": 0.0959, "step": 487 }, { "epoch": 0.4556489262371615, "grad_norm": 4.319014597560454, "learning_rate": 9.993776228469484e-06, "loss": 0.1775, "step": 488 }, { "epoch": 0.4565826330532213, "grad_norm": 4.196216402398226, "learning_rate": 9.993700576102416e-06, "loss": 0.1127, "step": 489 }, { "epoch": 0.45751633986928103, "grad_norm": 2.7276069148479998, "learning_rate": 9.99362446700659e-06, "loss": 0.1178, "step": 490 }, { "epoch": 0.4584500466853408, "grad_norm": 2.626669796577096, "learning_rate": 9.993547901188966e-06, "loss": 0.1038, "step": 491 }, { "epoch": 0.45938375350140054, "grad_norm": 10.312943308667325, "learning_rate": 9.993470878656546e-06, "loss": 0.3703, "step": 492 }, { "epoch": 0.4603174603174603, "grad_norm": 4.255236301542817, "learning_rate": 9.993393399416378e-06, "loss": 0.2461, "step": 493 }, { "epoch": 0.4612511671335201, "grad_norm": 3.11964674859829, "learning_rate": 9.993315463475543e-06, "loss": 0.1831, "step": 494 }, { "epoch": 0.46218487394957986, "grad_norm": 6.298402568067235, "learning_rate": 9.993237070841176e-06, "loss": 0.1699, "step": 495 }, { "epoch": 0.4631185807656396, "grad_norm": 4.816920239839375, "learning_rate": 9.99315822152044e-06, "loss": 0.2624, "step": 496 }, { "epoch": 0.46405228758169936, "grad_norm": 5.747507567754676, "learning_rate": 9.99307891552055e-06, "loss": 0.3497, "step": 497 }, { "epoch": 0.4649859943977591, "grad_norm": 6.083581997034444, "learning_rate": 9.99299915284876e-06, "loss": 0.2056, "step": 498 }, { "epoch": 0.46591970121381887, "grad_norm": 3.492854129718511, "learning_rate": 9.992918933512363e-06, "loss": 0.2113, "step": 499 }, { "epoch": 0.4668534080298786, "grad_norm": 3.8700408880040666, "learning_rate": 9.992838257518698e-06, "loss": 0.151, "step": 500 }, { "epoch": 0.4677871148459384, "grad_norm": 3.502514220898818, "learning_rate": 9.992757124875143e-06, "loss": 0.2104, "step": 501 }, { "epoch": 0.46872082166199813, "grad_norm": 3.643156022218275, "learning_rate": 9.992675535589117e-06, "loss": 0.1618, "step": 502 }, { "epoch": 0.4696545284780579, "grad_norm": 4.140596876942036, "learning_rate": 9.992593489668086e-06, "loss": 0.2479, "step": 503 }, { "epoch": 0.47058823529411764, "grad_norm": 2.22539265852568, "learning_rate": 9.992510987119551e-06, "loss": 0.0995, "step": 504 }, { "epoch": 0.4715219421101774, "grad_norm": 3.2387633227938233, "learning_rate": 9.992428027951055e-06, "loss": 0.1362, "step": 505 }, { "epoch": 0.47245564892623715, "grad_norm": 4.581969387573396, "learning_rate": 9.992344612170192e-06, "loss": 0.1254, "step": 506 }, { "epoch": 0.4733893557422969, "grad_norm": 3.027265832406344, "learning_rate": 9.992260739784588e-06, "loss": 0.1428, "step": 507 }, { "epoch": 0.47432306255835666, "grad_norm": 4.778277347671885, "learning_rate": 9.992176410801913e-06, "loss": 0.2783, "step": 508 }, { "epoch": 0.4752567693744164, "grad_norm": 7.643389731686421, "learning_rate": 9.992091625229883e-06, "loss": 0.2983, "step": 509 }, { "epoch": 0.47619047619047616, "grad_norm": 5.7865062110460315, "learning_rate": 9.992006383076248e-06, "loss": 0.2262, "step": 510 }, { "epoch": 0.477124183006536, "grad_norm": 4.746897992249887, "learning_rate": 9.991920684348807e-06, "loss": 0.2248, "step": 511 }, { "epoch": 0.4780578898225957, "grad_norm": 4.1650218073786585, "learning_rate": 9.9918345290554e-06, "loss": 0.0954, "step": 512 }, { "epoch": 0.4789915966386555, "grad_norm": 3.9774789934901422, "learning_rate": 9.991747917203904e-06, "loss": 0.1412, "step": 513 }, { "epoch": 0.47992530345471524, "grad_norm": 4.464849362936283, "learning_rate": 9.99166084880224e-06, "loss": 0.275, "step": 514 }, { "epoch": 0.480859010270775, "grad_norm": 4.736276384442491, "learning_rate": 9.991573323858374e-06, "loss": 0.229, "step": 515 }, { "epoch": 0.48179271708683474, "grad_norm": 4.2421794041886205, "learning_rate": 9.991485342380312e-06, "loss": 0.1959, "step": 516 }, { "epoch": 0.4827264239028945, "grad_norm": 2.7892438338932064, "learning_rate": 9.991396904376097e-06, "loss": 0.0699, "step": 517 }, { "epoch": 0.48366013071895425, "grad_norm": 3.645228023138488, "learning_rate": 9.991308009853818e-06, "loss": 0.1627, "step": 518 }, { "epoch": 0.484593837535014, "grad_norm": 7.260792819503719, "learning_rate": 9.991218658821609e-06, "loss": 0.3259, "step": 519 }, { "epoch": 0.48552754435107376, "grad_norm": 3.358224991846664, "learning_rate": 9.991128851287638e-06, "loss": 0.1532, "step": 520 }, { "epoch": 0.4864612511671335, "grad_norm": 2.3779227356646335, "learning_rate": 9.991038587260121e-06, "loss": 0.1254, "step": 521 }, { "epoch": 0.48739495798319327, "grad_norm": 5.676419272123507, "learning_rate": 9.990947866747315e-06, "loss": 0.1642, "step": 522 }, { "epoch": 0.488328664799253, "grad_norm": 1.7171206057787562, "learning_rate": 9.990856689757514e-06, "loss": 0.0867, "step": 523 }, { "epoch": 0.4892623716153128, "grad_norm": 3.531922726317776, "learning_rate": 9.990765056299061e-06, "loss": 0.1332, "step": 524 }, { "epoch": 0.49019607843137253, "grad_norm": 8.19780159936167, "learning_rate": 9.990672966380333e-06, "loss": 0.0697, "step": 525 }, { "epoch": 0.4911297852474323, "grad_norm": 2.4794284245680034, "learning_rate": 9.990580420009756e-06, "loss": 0.0869, "step": 526 }, { "epoch": 0.49206349206349204, "grad_norm": 4.674128651602755, "learning_rate": 9.990487417195792e-06, "loss": 0.2253, "step": 527 }, { "epoch": 0.49299719887955185, "grad_norm": 6.874629038189787, "learning_rate": 9.990393957946947e-06, "loss": 0.2931, "step": 528 }, { "epoch": 0.4939309056956116, "grad_norm": 2.8274746011046465, "learning_rate": 9.990300042271772e-06, "loss": 0.0804, "step": 529 }, { "epoch": 0.49486461251167135, "grad_norm": 3.005679652445484, "learning_rate": 9.990205670178855e-06, "loss": 0.1177, "step": 530 }, { "epoch": 0.4957983193277311, "grad_norm": 3.933439972116848, "learning_rate": 9.990110841676823e-06, "loss": 0.147, "step": 531 }, { "epoch": 0.49673202614379086, "grad_norm": 3.3354891053537683, "learning_rate": 9.990015556774357e-06, "loss": 0.1512, "step": 532 }, { "epoch": 0.4976657329598506, "grad_norm": 1.1761432454306844, "learning_rate": 9.989919815480166e-06, "loss": 0.022, "step": 533 }, { "epoch": 0.49859943977591037, "grad_norm": 6.0954795235678585, "learning_rate": 9.989823617803008e-06, "loss": 0.2311, "step": 534 }, { "epoch": 0.4995331465919701, "grad_norm": 5.783163027363899, "learning_rate": 9.989726963751683e-06, "loss": 0.1793, "step": 535 }, { "epoch": 0.5004668534080299, "grad_norm": 4.955265870924981, "learning_rate": 9.98962985333503e-06, "loss": 0.2408, "step": 536 }, { "epoch": 0.5014005602240896, "grad_norm": 2.6370479361108505, "learning_rate": 9.98953228656193e-06, "loss": 0.1207, "step": 537 }, { "epoch": 0.5023342670401494, "grad_norm": 3.826757601648785, "learning_rate": 9.989434263441305e-06, "loss": 0.134, "step": 538 }, { "epoch": 0.5032679738562091, "grad_norm": 2.6697590836182563, "learning_rate": 9.989335783982125e-06, "loss": 0.1704, "step": 539 }, { "epoch": 0.5042016806722689, "grad_norm": 4.540114806345516, "learning_rate": 9.989236848193394e-06, "loss": 0.2323, "step": 540 }, { "epoch": 0.5051353874883286, "grad_norm": 3.3462153136023334, "learning_rate": 9.989137456084162e-06, "loss": 0.1478, "step": 541 }, { "epoch": 0.5060690943043884, "grad_norm": 3.3848480201574636, "learning_rate": 9.989037607663518e-06, "loss": 0.2039, "step": 542 }, { "epoch": 0.5070028011204482, "grad_norm": 6.072139297032275, "learning_rate": 9.988937302940595e-06, "loss": 0.1925, "step": 543 }, { "epoch": 0.5079365079365079, "grad_norm": 4.784109586408362, "learning_rate": 9.988836541924566e-06, "loss": 0.138, "step": 544 }, { "epoch": 0.5088702147525677, "grad_norm": 4.9525553435511105, "learning_rate": 9.988735324624649e-06, "loss": 0.1848, "step": 545 }, { "epoch": 0.5098039215686274, "grad_norm": 5.127886879255578, "learning_rate": 9.988633651050101e-06, "loss": 0.3064, "step": 546 }, { "epoch": 0.5107376283846872, "grad_norm": 1.188828449440995, "learning_rate": 9.988531521210219e-06, "loss": 0.0276, "step": 547 }, { "epoch": 0.5116713352007469, "grad_norm": 3.3046774343184744, "learning_rate": 9.988428935114346e-06, "loss": 0.2256, "step": 548 }, { "epoch": 0.5126050420168067, "grad_norm": 2.5956586805162982, "learning_rate": 9.988325892771861e-06, "loss": 0.0868, "step": 549 }, { "epoch": 0.5135387488328664, "grad_norm": 3.905455249728397, "learning_rate": 9.988222394192194e-06, "loss": 0.2462, "step": 550 }, { "epoch": 0.5144724556489262, "grad_norm": 4.947646985134975, "learning_rate": 9.988118439384807e-06, "loss": 0.2285, "step": 551 }, { "epoch": 0.5154061624649859, "grad_norm": 3.3818642878600342, "learning_rate": 9.98801402835921e-06, "loss": 0.2042, "step": 552 }, { "epoch": 0.5163398692810458, "grad_norm": 3.6122352696983713, "learning_rate": 9.987909161124951e-06, "loss": 0.0845, "step": 553 }, { "epoch": 0.5172735760971056, "grad_norm": 3.2191243835493437, "learning_rate": 9.987803837691622e-06, "loss": 0.0673, "step": 554 }, { "epoch": 0.5182072829131653, "grad_norm": 2.671474271282329, "learning_rate": 9.987698058068856e-06, "loss": 0.1575, "step": 555 }, { "epoch": 0.5191409897292251, "grad_norm": 6.641177324651936, "learning_rate": 9.987591822266327e-06, "loss": 0.2462, "step": 556 }, { "epoch": 0.5200746965452848, "grad_norm": 4.620826766960176, "learning_rate": 9.987485130293753e-06, "loss": 0.2583, "step": 557 }, { "epoch": 0.5210084033613446, "grad_norm": 4.626693457546501, "learning_rate": 9.987377982160888e-06, "loss": 0.2142, "step": 558 }, { "epoch": 0.5219421101774043, "grad_norm": 9.171623887491856, "learning_rate": 9.987270377877539e-06, "loss": 0.0956, "step": 559 }, { "epoch": 0.5228758169934641, "grad_norm": 3.8754158358062303, "learning_rate": 9.987162317453542e-06, "loss": 0.1641, "step": 560 }, { "epoch": 0.5238095238095238, "grad_norm": 10.50129914628969, "learning_rate": 9.98705380089878e-06, "loss": 0.287, "step": 561 }, { "epoch": 0.5247432306255836, "grad_norm": 3.927646350685739, "learning_rate": 9.986944828223183e-06, "loss": 0.1857, "step": 562 }, { "epoch": 0.5256769374416433, "grad_norm": 1.600269827907874, "learning_rate": 9.986835399436714e-06, "loss": 0.0426, "step": 563 }, { "epoch": 0.5266106442577031, "grad_norm": 5.749202105202749, "learning_rate": 9.98672551454938e-06, "loss": 0.2473, "step": 564 }, { "epoch": 0.5275443510737629, "grad_norm": 5.160143022355809, "learning_rate": 9.986615173571237e-06, "loss": 0.2401, "step": 565 }, { "epoch": 0.5284780578898226, "grad_norm": 5.772850401476032, "learning_rate": 9.98650437651237e-06, "loss": 0.1239, "step": 566 }, { "epoch": 0.5294117647058824, "grad_norm": 5.9372481184784744, "learning_rate": 9.986393123382916e-06, "loss": 0.2079, "step": 567 }, { "epoch": 0.5303454715219421, "grad_norm": 3.3102282616909746, "learning_rate": 9.98628141419305e-06, "loss": 0.142, "step": 568 }, { "epoch": 0.5312791783380019, "grad_norm": 5.213788036928291, "learning_rate": 9.986169248952991e-06, "loss": 0.1637, "step": 569 }, { "epoch": 0.5322128851540616, "grad_norm": 3.0162339226901564, "learning_rate": 9.986056627672995e-06, "loss": 0.0894, "step": 570 }, { "epoch": 0.5331465919701214, "grad_norm": 3.6756272632102363, "learning_rate": 9.985943550363364e-06, "loss": 0.1582, "step": 571 }, { "epoch": 0.5340802987861811, "grad_norm": 4.31313747823727, "learning_rate": 9.98583001703444e-06, "loss": 0.2684, "step": 572 }, { "epoch": 0.5350140056022409, "grad_norm": 2.994997243443638, "learning_rate": 9.985716027696606e-06, "loss": 0.194, "step": 573 }, { "epoch": 0.5359477124183006, "grad_norm": 3.8731368886766564, "learning_rate": 9.985601582360285e-06, "loss": 0.2164, "step": 574 }, { "epoch": 0.5368814192343604, "grad_norm": 2.6096341031584243, "learning_rate": 9.98548668103595e-06, "loss": 0.1497, "step": 575 }, { "epoch": 0.5378151260504201, "grad_norm": 3.2549773259566104, "learning_rate": 9.985371323734107e-06, "loss": 0.2143, "step": 576 }, { "epoch": 0.5387488328664799, "grad_norm": 4.171096967775366, "learning_rate": 9.985255510465306e-06, "loss": 0.1722, "step": 577 }, { "epoch": 0.5396825396825397, "grad_norm": 3.8235424983238055, "learning_rate": 9.985139241240142e-06, "loss": 0.1409, "step": 578 }, { "epoch": 0.5406162464985994, "grad_norm": 3.5621571918393893, "learning_rate": 9.985022516069247e-06, "loss": 0.1511, "step": 579 }, { "epoch": 0.5415499533146592, "grad_norm": 1.4136496589135152, "learning_rate": 9.984905334963296e-06, "loss": 0.0336, "step": 580 }, { "epoch": 0.5424836601307189, "grad_norm": 5.230479839278366, "learning_rate": 9.98478769793301e-06, "loss": 0.2271, "step": 581 }, { "epoch": 0.5434173669467787, "grad_norm": 6.006917124648363, "learning_rate": 9.984669604989144e-06, "loss": 0.17, "step": 582 }, { "epoch": 0.5443510737628384, "grad_norm": 5.772306365527682, "learning_rate": 9.984551056142501e-06, "loss": 0.2115, "step": 583 }, { "epoch": 0.5452847805788982, "grad_norm": 1.0926397938928052, "learning_rate": 9.984432051403922e-06, "loss": 0.0283, "step": 584 }, { "epoch": 0.5462184873949579, "grad_norm": 2.743553450087076, "learning_rate": 9.984312590784294e-06, "loss": 0.0806, "step": 585 }, { "epoch": 0.5471521942110178, "grad_norm": 1.5907162733608837, "learning_rate": 9.984192674294543e-06, "loss": 0.0518, "step": 586 }, { "epoch": 0.5480859010270775, "grad_norm": 3.4692732053669735, "learning_rate": 9.984072301945636e-06, "loss": 0.188, "step": 587 }, { "epoch": 0.5490196078431373, "grad_norm": 7.300535774129544, "learning_rate": 9.983951473748579e-06, "loss": 0.1668, "step": 588 }, { "epoch": 0.5499533146591971, "grad_norm": 3.6241448378536196, "learning_rate": 9.983830189714428e-06, "loss": 0.2284, "step": 589 }, { "epoch": 0.5508870214752568, "grad_norm": 3.783577979399508, "learning_rate": 9.983708449854273e-06, "loss": 0.1906, "step": 590 }, { "epoch": 0.5518207282913166, "grad_norm": 2.3332866820364324, "learning_rate": 9.983586254179246e-06, "loss": 0.0631, "step": 591 }, { "epoch": 0.5527544351073763, "grad_norm": 4.926739159899868, "learning_rate": 9.98346360270053e-06, "loss": 0.2867, "step": 592 }, { "epoch": 0.5536881419234361, "grad_norm": 2.5943905138567467, "learning_rate": 9.983340495429338e-06, "loss": 0.1132, "step": 593 }, { "epoch": 0.5546218487394958, "grad_norm": 4.1062929142490034, "learning_rate": 9.983216932376931e-06, "loss": 0.2538, "step": 594 }, { "epoch": 0.5555555555555556, "grad_norm": 4.346060248961884, "learning_rate": 9.983092913554608e-06, "loss": 0.2145, "step": 595 }, { "epoch": 0.5564892623716153, "grad_norm": 1.6354200845773017, "learning_rate": 9.982968438973715e-06, "loss": 0.0731, "step": 596 }, { "epoch": 0.5574229691876751, "grad_norm": 2.4272643862855214, "learning_rate": 9.982843508645634e-06, "loss": 0.1611, "step": 597 }, { "epoch": 0.5583566760037348, "grad_norm": 2.2430777490557827, "learning_rate": 9.982718122581793e-06, "loss": 0.061, "step": 598 }, { "epoch": 0.5592903828197946, "grad_norm": 9.641963522545545, "learning_rate": 9.98259228079366e-06, "loss": 0.1975, "step": 599 }, { "epoch": 0.5602240896358543, "grad_norm": 2.569539819827914, "learning_rate": 9.982465983292741e-06, "loss": 0.1186, "step": 600 }, { "epoch": 0.5611577964519141, "grad_norm": 6.009681590484396, "learning_rate": 9.982339230090594e-06, "loss": 0.3535, "step": 601 }, { "epoch": 0.5620915032679739, "grad_norm": 3.1272461743032114, "learning_rate": 9.982212021198806e-06, "loss": 0.1286, "step": 602 }, { "epoch": 0.5630252100840336, "grad_norm": 3.567071350363439, "learning_rate": 9.982084356629014e-06, "loss": 0.1488, "step": 603 }, { "epoch": 0.5639589169000934, "grad_norm": 3.499524151482478, "learning_rate": 9.981956236392896e-06, "loss": 0.1902, "step": 604 }, { "epoch": 0.5648926237161531, "grad_norm": 3.0588826065183192, "learning_rate": 9.981827660502166e-06, "loss": 0.1564, "step": 605 }, { "epoch": 0.5658263305322129, "grad_norm": 4.057415000088292, "learning_rate": 9.981698628968587e-06, "loss": 0.1863, "step": 606 }, { "epoch": 0.5667600373482726, "grad_norm": 3.6585202998531448, "learning_rate": 9.981569141803959e-06, "loss": 0.1823, "step": 607 }, { "epoch": 0.5676937441643324, "grad_norm": 2.0849622823388443, "learning_rate": 9.981439199020124e-06, "loss": 0.081, "step": 608 }, { "epoch": 0.5686274509803921, "grad_norm": 2.8374978609524506, "learning_rate": 9.981308800628971e-06, "loss": 0.1408, "step": 609 }, { "epoch": 0.5695611577964519, "grad_norm": 3.113899495935193, "learning_rate": 9.98117794664242e-06, "loss": 0.1936, "step": 610 }, { "epoch": 0.5704948646125116, "grad_norm": 4.535962807503257, "learning_rate": 9.981046637072445e-06, "loss": 0.2241, "step": 611 }, { "epoch": 0.5714285714285714, "grad_norm": 4.4611612579802475, "learning_rate": 9.980914871931052e-06, "loss": 0.1361, "step": 612 }, { "epoch": 0.5723622782446312, "grad_norm": 3.5752477184167897, "learning_rate": 9.980782651230292e-06, "loss": 0.1657, "step": 613 }, { "epoch": 0.5732959850606909, "grad_norm": 2.866317216341854, "learning_rate": 9.98064997498226e-06, "loss": 0.1589, "step": 614 }, { "epoch": 0.5742296918767507, "grad_norm": 2.184210680286673, "learning_rate": 9.98051684319909e-06, "loss": 0.0944, "step": 615 }, { "epoch": 0.5751633986928104, "grad_norm": 7.45827929697658, "learning_rate": 9.98038325589296e-06, "loss": 0.2481, "step": 616 }, { "epoch": 0.5760971055088702, "grad_norm": 5.610948793945887, "learning_rate": 9.980249213076085e-06, "loss": 0.2665, "step": 617 }, { "epoch": 0.5770308123249299, "grad_norm": 2.888928646274913, "learning_rate": 9.980114714760727e-06, "loss": 0.1574, "step": 618 }, { "epoch": 0.5779645191409897, "grad_norm": 1.5989840133388906, "learning_rate": 9.979979760959186e-06, "loss": 0.0441, "step": 619 }, { "epoch": 0.5788982259570495, "grad_norm": 2.6611490716131265, "learning_rate": 9.979844351683807e-06, "loss": 0.1499, "step": 620 }, { "epoch": 0.5798319327731093, "grad_norm": 7.096646234479516, "learning_rate": 9.979708486946972e-06, "loss": 0.2383, "step": 621 }, { "epoch": 0.580765639589169, "grad_norm": 4.133395604732259, "learning_rate": 9.97957216676111e-06, "loss": 0.2922, "step": 622 }, { "epoch": 0.5816993464052288, "grad_norm": 3.3997253127222233, "learning_rate": 9.979435391138685e-06, "loss": 0.0996, "step": 623 }, { "epoch": 0.5826330532212886, "grad_norm": 6.237127406255877, "learning_rate": 9.97929816009221e-06, "loss": 0.3624, "step": 624 }, { "epoch": 0.5835667600373483, "grad_norm": 4.366670375547621, "learning_rate": 9.979160473634236e-06, "loss": 0.3019, "step": 625 }, { "epoch": 0.5845004668534081, "grad_norm": 2.8155106230393594, "learning_rate": 9.979022331777355e-06, "loss": 0.1368, "step": 626 }, { "epoch": 0.5854341736694678, "grad_norm": 3.752393319200759, "learning_rate": 9.978883734534202e-06, "loss": 0.1413, "step": 627 }, { "epoch": 0.5863678804855276, "grad_norm": 1.4422769620131966, "learning_rate": 9.978744681917454e-06, "loss": 0.043, "step": 628 }, { "epoch": 0.5873015873015873, "grad_norm": 4.0786977428471145, "learning_rate": 9.978605173939828e-06, "loss": 0.1711, "step": 629 }, { "epoch": 0.5882352941176471, "grad_norm": 5.755502404965847, "learning_rate": 9.978465210614084e-06, "loss": 0.3081, "step": 630 }, { "epoch": 0.5891690009337068, "grad_norm": 4.329564751362549, "learning_rate": 9.97832479195302e-06, "loss": 0.2337, "step": 631 }, { "epoch": 0.5901027077497666, "grad_norm": 3.498951374705164, "learning_rate": 9.978183917969484e-06, "loss": 0.2175, "step": 632 }, { "epoch": 0.5910364145658263, "grad_norm": 2.995210985061113, "learning_rate": 9.978042588676358e-06, "loss": 0.1131, "step": 633 }, { "epoch": 0.5919701213818861, "grad_norm": 4.1484974120570755, "learning_rate": 9.977900804086569e-06, "loss": 0.2952, "step": 634 }, { "epoch": 0.5929038281979458, "grad_norm": 3.169463549940716, "learning_rate": 9.977758564213083e-06, "loss": 0.1578, "step": 635 }, { "epoch": 0.5938375350140056, "grad_norm": 2.2732384916293706, "learning_rate": 9.977615869068911e-06, "loss": 0.1642, "step": 636 }, { "epoch": 0.5947712418300654, "grad_norm": 4.660451196627954, "learning_rate": 9.977472718667102e-06, "loss": 0.2648, "step": 637 }, { "epoch": 0.5957049486461251, "grad_norm": 2.56122451451977, "learning_rate": 9.97732911302075e-06, "loss": 0.0992, "step": 638 }, { "epoch": 0.5966386554621849, "grad_norm": 2.137142785035431, "learning_rate": 9.977185052142991e-06, "loss": 0.0461, "step": 639 }, { "epoch": 0.5975723622782446, "grad_norm": 4.689541976350505, "learning_rate": 9.977040536046998e-06, "loss": 0.0838, "step": 640 }, { "epoch": 0.5985060690943044, "grad_norm": 4.159689600709485, "learning_rate": 9.976895564745993e-06, "loss": 0.2348, "step": 641 }, { "epoch": 0.5994397759103641, "grad_norm": 2.0156802796537563, "learning_rate": 9.976750138253228e-06, "loss": 0.0953, "step": 642 }, { "epoch": 0.6003734827264239, "grad_norm": 3.6367671972783864, "learning_rate": 9.97660425658201e-06, "loss": 0.1455, "step": 643 }, { "epoch": 0.6013071895424836, "grad_norm": 3.4465432185563225, "learning_rate": 9.97645791974568e-06, "loss": 0.2003, "step": 644 }, { "epoch": 0.6022408963585434, "grad_norm": 4.602242910240695, "learning_rate": 9.97631112775762e-06, "loss": 0.1443, "step": 645 }, { "epoch": 0.6031746031746031, "grad_norm": 1.7034920579288413, "learning_rate": 9.97616388063126e-06, "loss": 0.0455, "step": 646 }, { "epoch": 0.6041083099906629, "grad_norm": 7.285908655525203, "learning_rate": 9.976016178380063e-06, "loss": 0.0502, "step": 647 }, { "epoch": 0.6050420168067226, "grad_norm": 3.2073970994492367, "learning_rate": 9.97586802101754e-06, "loss": 0.1282, "step": 648 }, { "epoch": 0.6059757236227824, "grad_norm": 4.129993962380863, "learning_rate": 9.97571940855724e-06, "loss": 0.0936, "step": 649 }, { "epoch": 0.6069094304388422, "grad_norm": 4.793323127649811, "learning_rate": 9.975570341012759e-06, "loss": 0.1658, "step": 650 }, { "epoch": 0.6078431372549019, "grad_norm": 4.328035383543886, "learning_rate": 9.975420818397727e-06, "loss": 0.1719, "step": 651 }, { "epoch": 0.6087768440709617, "grad_norm": 7.002883556458926, "learning_rate": 9.975270840725822e-06, "loss": 0.2563, "step": 652 }, { "epoch": 0.6097105508870215, "grad_norm": 2.5635783104080554, "learning_rate": 9.975120408010758e-06, "loss": 0.1657, "step": 653 }, { "epoch": 0.6106442577030813, "grad_norm": 4.648957307517926, "learning_rate": 9.974969520266298e-06, "loss": 0.2184, "step": 654 }, { "epoch": 0.611577964519141, "grad_norm": 5.352534354014314, "learning_rate": 9.974818177506238e-06, "loss": 0.3638, "step": 655 }, { "epoch": 0.6125116713352008, "grad_norm": 2.788617647920601, "learning_rate": 9.974666379744423e-06, "loss": 0.1399, "step": 656 }, { "epoch": 0.6134453781512605, "grad_norm": 2.7635661813927994, "learning_rate": 9.974514126994736e-06, "loss": 0.1327, "step": 657 }, { "epoch": 0.6143790849673203, "grad_norm": 2.873831881843825, "learning_rate": 9.974361419271101e-06, "loss": 0.1275, "step": 658 }, { "epoch": 0.61531279178338, "grad_norm": 5.132434466441912, "learning_rate": 9.974208256587488e-06, "loss": 0.134, "step": 659 }, { "epoch": 0.6162464985994398, "grad_norm": 4.2288641013068675, "learning_rate": 9.9740546389579e-06, "loss": 0.0897, "step": 660 }, { "epoch": 0.6171802054154996, "grad_norm": 8.989509309118349, "learning_rate": 9.973900566396391e-06, "loss": 0.3206, "step": 661 }, { "epoch": 0.6181139122315593, "grad_norm": 4.649389191574115, "learning_rate": 9.973746038917052e-06, "loss": 0.2419, "step": 662 }, { "epoch": 0.6190476190476191, "grad_norm": 7.2959568399242025, "learning_rate": 9.973591056534016e-06, "loss": 0.0999, "step": 663 }, { "epoch": 0.6199813258636788, "grad_norm": 2.150644666456355, "learning_rate": 9.973435619261459e-06, "loss": 0.0819, "step": 664 }, { "epoch": 0.6209150326797386, "grad_norm": 4.965182643571926, "learning_rate": 9.973279727113594e-06, "loss": 0.2293, "step": 665 }, { "epoch": 0.6218487394957983, "grad_norm": 2.603780662929895, "learning_rate": 9.97312338010468e-06, "loss": 0.1592, "step": 666 }, { "epoch": 0.6227824463118581, "grad_norm": 3.786620673283724, "learning_rate": 9.972966578249021e-06, "loss": 0.2247, "step": 667 }, { "epoch": 0.6237161531279178, "grad_norm": 4.173641455858341, "learning_rate": 9.972809321560954e-06, "loss": 0.117, "step": 668 }, { "epoch": 0.6246498599439776, "grad_norm": 3.9863524541503543, "learning_rate": 9.972651610054861e-06, "loss": 0.2793, "step": 669 }, { "epoch": 0.6255835667600373, "grad_norm": 1.7483567315990654, "learning_rate": 9.972493443745173e-06, "loss": 0.0527, "step": 670 }, { "epoch": 0.6265172735760971, "grad_norm": 3.3743208998546357, "learning_rate": 9.972334822646348e-06, "loss": 0.1626, "step": 671 }, { "epoch": 0.6274509803921569, "grad_norm": 3.4991966376213663, "learning_rate": 9.972175746772899e-06, "loss": 0.1589, "step": 672 }, { "epoch": 0.6283846872082166, "grad_norm": 4.918949372923897, "learning_rate": 9.972016216139372e-06, "loss": 0.2779, "step": 673 }, { "epoch": 0.6293183940242764, "grad_norm": 2.9960576304631408, "learning_rate": 9.971856230760359e-06, "loss": 0.0617, "step": 674 }, { "epoch": 0.6302521008403361, "grad_norm": 5.508297162094768, "learning_rate": 9.971695790650493e-06, "loss": 0.1811, "step": 675 }, { "epoch": 0.6311858076563959, "grad_norm": 5.269068502691729, "learning_rate": 9.971534895824448e-06, "loss": 0.1759, "step": 676 }, { "epoch": 0.6321195144724556, "grad_norm": 2.8272694976077757, "learning_rate": 9.971373546296939e-06, "loss": 0.0762, "step": 677 }, { "epoch": 0.6330532212885154, "grad_norm": 3.2357230444949803, "learning_rate": 9.971211742082723e-06, "loss": 0.1461, "step": 678 }, { "epoch": 0.6339869281045751, "grad_norm": 4.788786431698696, "learning_rate": 9.9710494831966e-06, "loss": 0.3064, "step": 679 }, { "epoch": 0.6349206349206349, "grad_norm": 4.483099103123373, "learning_rate": 9.970886769653409e-06, "loss": 0.1967, "step": 680 }, { "epoch": 0.6358543417366946, "grad_norm": 4.915355433213198, "learning_rate": 9.970723601468034e-06, "loss": 0.2196, "step": 681 }, { "epoch": 0.6367880485527544, "grad_norm": 2.025135662733985, "learning_rate": 9.970559978655394e-06, "loss": 0.0859, "step": 682 }, { "epoch": 0.6377217553688141, "grad_norm": 3.777677378188102, "learning_rate": 9.970395901230459e-06, "loss": 0.1102, "step": 683 }, { "epoch": 0.6386554621848739, "grad_norm": 3.6960779277517397, "learning_rate": 9.970231369208234e-06, "loss": 0.1346, "step": 684 }, { "epoch": 0.6395891690009337, "grad_norm": 5.274159773780511, "learning_rate": 9.970066382603767e-06, "loss": 0.1702, "step": 685 }, { "epoch": 0.6405228758169934, "grad_norm": 3.050442356532048, "learning_rate": 9.969900941432146e-06, "loss": 0.1266, "step": 686 }, { "epoch": 0.6414565826330533, "grad_norm": 3.5106442242002243, "learning_rate": 9.969735045708506e-06, "loss": 0.166, "step": 687 }, { "epoch": 0.642390289449113, "grad_norm": 2.7312580362295122, "learning_rate": 9.969568695448018e-06, "loss": 0.084, "step": 688 }, { "epoch": 0.6433239962651728, "grad_norm": 1.9756802249106948, "learning_rate": 9.969401890665897e-06, "loss": 0.0726, "step": 689 }, { "epoch": 0.6442577030812325, "grad_norm": 3.0483056401333113, "learning_rate": 9.969234631377399e-06, "loss": 0.1346, "step": 690 }, { "epoch": 0.6451914098972923, "grad_norm": 3.2321605373175766, "learning_rate": 9.969066917597822e-06, "loss": 0.0772, "step": 691 }, { "epoch": 0.646125116713352, "grad_norm": 2.537312529338533, "learning_rate": 9.968898749342505e-06, "loss": 0.0924, "step": 692 }, { "epoch": 0.6470588235294118, "grad_norm": 6.268716901364519, "learning_rate": 9.968730126626827e-06, "loss": 0.1538, "step": 693 }, { "epoch": 0.6479925303454716, "grad_norm": 5.209361074143544, "learning_rate": 9.968561049466214e-06, "loss": 0.1622, "step": 694 }, { "epoch": 0.6489262371615313, "grad_norm": 9.837352575633227, "learning_rate": 9.968391517876127e-06, "loss": 0.3474, "step": 695 }, { "epoch": 0.6498599439775911, "grad_norm": 4.139061597388758, "learning_rate": 9.968221531872074e-06, "loss": 0.1471, "step": 696 }, { "epoch": 0.6507936507936508, "grad_norm": 2.05613058294094, "learning_rate": 9.9680510914696e-06, "loss": 0.0867, "step": 697 }, { "epoch": 0.6517273576097106, "grad_norm": 3.10124621414902, "learning_rate": 9.967880196684295e-06, "loss": 0.148, "step": 698 }, { "epoch": 0.6526610644257703, "grad_norm": 5.03916585085019, "learning_rate": 9.967708847531787e-06, "loss": 0.1106, "step": 699 }, { "epoch": 0.6535947712418301, "grad_norm": 3.227729473006707, "learning_rate": 9.96753704402775e-06, "loss": 0.1502, "step": 700 }, { "epoch": 0.6545284780578898, "grad_norm": 3.369565251859761, "learning_rate": 9.967364786187896e-06, "loss": 0.1325, "step": 701 }, { "epoch": 0.6554621848739496, "grad_norm": 5.186063033938718, "learning_rate": 9.967192074027982e-06, "loss": 0.1738, "step": 702 }, { "epoch": 0.6563958916900093, "grad_norm": 3.47113689236691, "learning_rate": 9.967018907563803e-06, "loss": 0.1659, "step": 703 }, { "epoch": 0.6573295985060691, "grad_norm": 4.561575146338716, "learning_rate": 9.966845286811196e-06, "loss": 0.2799, "step": 704 }, { "epoch": 0.6582633053221288, "grad_norm": 3.569488315623707, "learning_rate": 9.966671211786041e-06, "loss": 0.1781, "step": 705 }, { "epoch": 0.6591970121381886, "grad_norm": 6.6494401306692374, "learning_rate": 9.96649668250426e-06, "loss": 0.3581, "step": 706 }, { "epoch": 0.6601307189542484, "grad_norm": 1.6086819650520157, "learning_rate": 9.966321698981816e-06, "loss": 0.0715, "step": 707 }, { "epoch": 0.6610644257703081, "grad_norm": 2.5368335731416285, "learning_rate": 9.966146261234712e-06, "loss": 0.1143, "step": 708 }, { "epoch": 0.6619981325863679, "grad_norm": 0.9677756079737113, "learning_rate": 9.965970369278994e-06, "loss": 0.0313, "step": 709 }, { "epoch": 0.6629318394024276, "grad_norm": 2.387714173120096, "learning_rate": 9.96579402313075e-06, "loss": 0.1524, "step": 710 }, { "epoch": 0.6638655462184874, "grad_norm": 2.6892175103220177, "learning_rate": 9.965617222806106e-06, "loss": 0.1767, "step": 711 }, { "epoch": 0.6647992530345471, "grad_norm": 3.6712641364157204, "learning_rate": 9.965439968321236e-06, "loss": 0.2189, "step": 712 }, { "epoch": 0.6657329598506069, "grad_norm": 5.99178569472302, "learning_rate": 9.96526225969235e-06, "loss": 0.1725, "step": 713 }, { "epoch": 0.6666666666666666, "grad_norm": 2.4740416475944156, "learning_rate": 9.965084096935702e-06, "loss": 0.1445, "step": 714 }, { "epoch": 0.6676003734827264, "grad_norm": 6.82707051739829, "learning_rate": 9.964905480067585e-06, "loss": 0.2345, "step": 715 }, { "epoch": 0.6685340802987861, "grad_norm": 2.572789869445823, "learning_rate": 9.96472640910434e-06, "loss": 0.1619, "step": 716 }, { "epoch": 0.6694677871148459, "grad_norm": 2.87812575823915, "learning_rate": 9.96454688406234e-06, "loss": 0.1941, "step": 717 }, { "epoch": 0.6704014939309056, "grad_norm": 6.993139608829777, "learning_rate": 9.964366904958008e-06, "loss": 0.2319, "step": 718 }, { "epoch": 0.6713352007469654, "grad_norm": 3.174276043693055, "learning_rate": 9.964186471807803e-06, "loss": 0.1342, "step": 719 }, { "epoch": 0.6722689075630253, "grad_norm": 2.9785279448158577, "learning_rate": 9.964005584628227e-06, "loss": 0.1967, "step": 720 }, { "epoch": 0.673202614379085, "grad_norm": 1.38198397222932, "learning_rate": 9.963824243435826e-06, "loss": 0.0559, "step": 721 }, { "epoch": 0.6741363211951448, "grad_norm": 4.878539126716048, "learning_rate": 9.963642448247185e-06, "loss": 0.2475, "step": 722 }, { "epoch": 0.6750700280112045, "grad_norm": 3.047319085061251, "learning_rate": 9.963460199078932e-06, "loss": 0.2044, "step": 723 }, { "epoch": 0.6760037348272643, "grad_norm": 2.038642124172725, "learning_rate": 9.963277495947733e-06, "loss": 0.0931, "step": 724 }, { "epoch": 0.676937441643324, "grad_norm": 4.030955110670916, "learning_rate": 9.963094338870303e-06, "loss": 0.0967, "step": 725 }, { "epoch": 0.6778711484593838, "grad_norm": 3.8771255347958795, "learning_rate": 9.96291072786339e-06, "loss": 0.2117, "step": 726 }, { "epoch": 0.6788048552754435, "grad_norm": 2.0683746014439595, "learning_rate": 9.962726662943787e-06, "loss": 0.0623, "step": 727 }, { "epoch": 0.6797385620915033, "grad_norm": 2.6124902145072793, "learning_rate": 9.962542144128331e-06, "loss": 0.106, "step": 728 }, { "epoch": 0.680672268907563, "grad_norm": 6.925208466713234, "learning_rate": 9.962357171433895e-06, "loss": 0.3652, "step": 729 }, { "epoch": 0.6816059757236228, "grad_norm": 2.5869829463211933, "learning_rate": 9.962171744877403e-06, "loss": 0.172, "step": 730 }, { "epoch": 0.6825396825396826, "grad_norm": 3.4058702261891574, "learning_rate": 9.961985864475806e-06, "loss": 0.1955, "step": 731 }, { "epoch": 0.6834733893557423, "grad_norm": 4.010665041346329, "learning_rate": 9.961799530246112e-06, "loss": 0.2071, "step": 732 }, { "epoch": 0.6844070961718021, "grad_norm": 4.563912980284291, "learning_rate": 9.96161274220536e-06, "loss": 0.3024, "step": 733 }, { "epoch": 0.6853408029878618, "grad_norm": 2.1830786373509072, "learning_rate": 9.961425500370632e-06, "loss": 0.0801, "step": 734 }, { "epoch": 0.6862745098039216, "grad_norm": 2.934894352591943, "learning_rate": 9.961237804759057e-06, "loss": 0.1203, "step": 735 }, { "epoch": 0.6872082166199813, "grad_norm": 2.974431302406517, "learning_rate": 9.9610496553878e-06, "loss": 0.0603, "step": 736 }, { "epoch": 0.6881419234360411, "grad_norm": 3.648976250255267, "learning_rate": 9.96086105227407e-06, "loss": 0.1386, "step": 737 }, { "epoch": 0.6890756302521008, "grad_norm": 4.702380432471398, "learning_rate": 9.960671995435115e-06, "loss": 0.2903, "step": 738 }, { "epoch": 0.6900093370681606, "grad_norm": 1.8730852161605602, "learning_rate": 9.96048248488823e-06, "loss": 0.0482, "step": 739 }, { "epoch": 0.6909430438842203, "grad_norm": 1.8863623132178404, "learning_rate": 9.960292520650744e-06, "loss": 0.0668, "step": 740 }, { "epoch": 0.6918767507002801, "grad_norm": 3.4078690531481133, "learning_rate": 9.960102102740033e-06, "loss": 0.2599, "step": 741 }, { "epoch": 0.6928104575163399, "grad_norm": 1.753841824742856, "learning_rate": 9.959911231173514e-06, "loss": 0.083, "step": 742 }, { "epoch": 0.6937441643323996, "grad_norm": 6.410595106372833, "learning_rate": 9.959719905968642e-06, "loss": 0.2547, "step": 743 }, { "epoch": 0.6946778711484594, "grad_norm": 3.421610892304902, "learning_rate": 9.959528127142917e-06, "loss": 0.1442, "step": 744 }, { "epoch": 0.6956115779645191, "grad_norm": 4.291187287370661, "learning_rate": 9.959335894713877e-06, "loss": 0.2069, "step": 745 }, { "epoch": 0.6965452847805789, "grad_norm": 1.3495999950292898, "learning_rate": 9.95914320869911e-06, "loss": 0.0424, "step": 746 }, { "epoch": 0.6974789915966386, "grad_norm": 3.2845485261253913, "learning_rate": 9.95895006911623e-06, "loss": 0.1218, "step": 747 }, { "epoch": 0.6984126984126984, "grad_norm": 2.8800019589438968, "learning_rate": 9.95875647598291e-06, "loss": 0.1546, "step": 748 }, { "epoch": 0.6993464052287581, "grad_norm": 3.480295476027096, "learning_rate": 9.958562429316854e-06, "loss": 0.1505, "step": 749 }, { "epoch": 0.7002801120448179, "grad_norm": 2.850484932457255, "learning_rate": 9.958367929135808e-06, "loss": 0.1855, "step": 750 }, { "epoch": 0.7012138188608776, "grad_norm": 2.432822307955079, "learning_rate": 9.958172975457561e-06, "loss": 0.1086, "step": 751 }, { "epoch": 0.7021475256769374, "grad_norm": 2.5469608297213417, "learning_rate": 9.957977568299943e-06, "loss": 0.1451, "step": 752 }, { "epoch": 0.7030812324929971, "grad_norm": 5.439764585252835, "learning_rate": 9.957781707680831e-06, "loss": 0.2764, "step": 753 }, { "epoch": 0.704014939309057, "grad_norm": 3.5876252931914716, "learning_rate": 9.957585393618135e-06, "loss": 0.0644, "step": 754 }, { "epoch": 0.7049486461251168, "grad_norm": 4.184917232210895, "learning_rate": 9.957388626129808e-06, "loss": 0.1755, "step": 755 }, { "epoch": 0.7058823529411765, "grad_norm": 4.2088795233735725, "learning_rate": 9.95719140523385e-06, "loss": 0.0343, "step": 756 }, { "epoch": 0.7068160597572363, "grad_norm": 2.6821338688320067, "learning_rate": 9.956993730948299e-06, "loss": 0.1089, "step": 757 }, { "epoch": 0.707749766573296, "grad_norm": 3.4387217321504173, "learning_rate": 9.956795603291231e-06, "loss": 0.1454, "step": 758 }, { "epoch": 0.7086834733893558, "grad_norm": 2.928351785391986, "learning_rate": 9.956597022280772e-06, "loss": 0.074, "step": 759 }, { "epoch": 0.7096171802054155, "grad_norm": 4.16882387319146, "learning_rate": 9.95639798793508e-06, "loss": 0.1813, "step": 760 }, { "epoch": 0.7105508870214753, "grad_norm": 2.6013904475472893, "learning_rate": 9.956198500272362e-06, "loss": 0.1074, "step": 761 }, { "epoch": 0.711484593837535, "grad_norm": 6.998432408938549, "learning_rate": 9.955998559310862e-06, "loss": 0.2972, "step": 762 }, { "epoch": 0.7124183006535948, "grad_norm": 4.43972470229218, "learning_rate": 9.955798165068866e-06, "loss": 0.2139, "step": 763 }, { "epoch": 0.7133520074696545, "grad_norm": 4.164322990549817, "learning_rate": 9.955597317564705e-06, "loss": 0.1756, "step": 764 }, { "epoch": 0.7142857142857143, "grad_norm": 3.110827176383738, "learning_rate": 9.955396016816745e-06, "loss": 0.0793, "step": 765 }, { "epoch": 0.715219421101774, "grad_norm": 6.7922593892379295, "learning_rate": 9.955194262843398e-06, "loss": 0.3261, "step": 766 }, { "epoch": 0.7161531279178338, "grad_norm": 1.6096347925837864, "learning_rate": 9.95499205566312e-06, "loss": 0.0469, "step": 767 }, { "epoch": 0.7170868347338936, "grad_norm": 3.528010315040101, "learning_rate": 9.954789395294401e-06, "loss": 0.1154, "step": 768 }, { "epoch": 0.7180205415499533, "grad_norm": 3.326185573736356, "learning_rate": 9.954586281755779e-06, "loss": 0.1614, "step": 769 }, { "epoch": 0.7189542483660131, "grad_norm": 4.46197495341431, "learning_rate": 9.954382715065829e-06, "loss": 0.1981, "step": 770 }, { "epoch": 0.7198879551820728, "grad_norm": 2.480424271142451, "learning_rate": 9.954178695243171e-06, "loss": 0.133, "step": 771 }, { "epoch": 0.7208216619981326, "grad_norm": 3.6066004825892937, "learning_rate": 9.953974222306464e-06, "loss": 0.1526, "step": 772 }, { "epoch": 0.7217553688141923, "grad_norm": 3.292452215306781, "learning_rate": 9.95376929627441e-06, "loss": 0.1159, "step": 773 }, { "epoch": 0.7226890756302521, "grad_norm": 5.750842953057026, "learning_rate": 9.953563917165752e-06, "loss": 0.2524, "step": 774 }, { "epoch": 0.7236227824463118, "grad_norm": 11.458565442431377, "learning_rate": 9.953358084999274e-06, "loss": 0.143, "step": 775 }, { "epoch": 0.7245564892623716, "grad_norm": 1.0279769462643136, "learning_rate": 9.953151799793799e-06, "loss": 0.0327, "step": 776 }, { "epoch": 0.7254901960784313, "grad_norm": 3.3986957058909253, "learning_rate": 9.952945061568198e-06, "loss": 0.0768, "step": 777 }, { "epoch": 0.7264239028944911, "grad_norm": 5.834474784645139, "learning_rate": 9.952737870341378e-06, "loss": 0.2332, "step": 778 }, { "epoch": 0.7273576097105509, "grad_norm": 2.2472356584307343, "learning_rate": 9.952530226132288e-06, "loss": 0.083, "step": 779 }, { "epoch": 0.7282913165266106, "grad_norm": 2.36005690557205, "learning_rate": 9.952322128959919e-06, "loss": 0.103, "step": 780 }, { "epoch": 0.7292250233426704, "grad_norm": 4.469576136247529, "learning_rate": 9.952113578843305e-06, "loss": 0.238, "step": 781 }, { "epoch": 0.7301587301587301, "grad_norm": 6.619981200217861, "learning_rate": 9.951904575801522e-06, "loss": 0.3368, "step": 782 }, { "epoch": 0.7310924369747899, "grad_norm": 9.31739342093873, "learning_rate": 9.951695119853681e-06, "loss": 0.3505, "step": 783 }, { "epoch": 0.7320261437908496, "grad_norm": 3.3919921299373477, "learning_rate": 9.951485211018943e-06, "loss": 0.1979, "step": 784 }, { "epoch": 0.7329598506069094, "grad_norm": 1.9869757238683756, "learning_rate": 9.951274849316505e-06, "loss": 0.0606, "step": 785 }, { "epoch": 0.7338935574229691, "grad_norm": 2.8487031501083235, "learning_rate": 9.951064034765607e-06, "loss": 0.134, "step": 786 }, { "epoch": 0.734827264239029, "grad_norm": 2.301175226028639, "learning_rate": 9.95085276738553e-06, "loss": 0.1102, "step": 787 }, { "epoch": 0.7357609710550888, "grad_norm": 2.765699544217147, "learning_rate": 9.950641047195596e-06, "loss": 0.1017, "step": 788 }, { "epoch": 0.7366946778711485, "grad_norm": 3.598094910055165, "learning_rate": 9.950428874215172e-06, "loss": 0.1603, "step": 789 }, { "epoch": 0.7376283846872083, "grad_norm": 1.1931513717522737, "learning_rate": 9.950216248463661e-06, "loss": 0.024, "step": 790 }, { "epoch": 0.738562091503268, "grad_norm": 4.903169077110706, "learning_rate": 9.950003169960511e-06, "loss": 0.1908, "step": 791 }, { "epoch": 0.7394957983193278, "grad_norm": 5.231090110995694, "learning_rate": 9.949789638725208e-06, "loss": 0.1049, "step": 792 }, { "epoch": 0.7404295051353875, "grad_norm": 3.716746164833035, "learning_rate": 9.949575654777285e-06, "loss": 0.1125, "step": 793 }, { "epoch": 0.7413632119514473, "grad_norm": 3.3090844138561684, "learning_rate": 9.949361218136314e-06, "loss": 0.1252, "step": 794 }, { "epoch": 0.742296918767507, "grad_norm": 1.920371841102993, "learning_rate": 9.949146328821904e-06, "loss": 0.0708, "step": 795 }, { "epoch": 0.7432306255835668, "grad_norm": 3.338648614176514, "learning_rate": 9.94893098685371e-06, "loss": 0.0409, "step": 796 }, { "epoch": 0.7441643323996265, "grad_norm": 6.280302128857458, "learning_rate": 9.948715192251428e-06, "loss": 0.2795, "step": 797 }, { "epoch": 0.7450980392156863, "grad_norm": 5.934486471348006, "learning_rate": 9.948498945034796e-06, "loss": 0.2609, "step": 798 }, { "epoch": 0.746031746031746, "grad_norm": 4.456107364410252, "learning_rate": 9.94828224522359e-06, "loss": 0.1811, "step": 799 }, { "epoch": 0.7469654528478058, "grad_norm": 3.010284547593809, "learning_rate": 9.948065092837631e-06, "loss": 0.0901, "step": 800 }, { "epoch": 0.7478991596638656, "grad_norm": 2.3346332661148512, "learning_rate": 9.947847487896778e-06, "loss": 0.0769, "step": 801 }, { "epoch": 0.7488328664799253, "grad_norm": 4.422313454910725, "learning_rate": 9.947629430420936e-06, "loss": 0.2028, "step": 802 }, { "epoch": 0.7497665732959851, "grad_norm": 2.4211345649981277, "learning_rate": 9.947410920430048e-06, "loss": 0.0649, "step": 803 }, { "epoch": 0.7507002801120448, "grad_norm": 3.456914580036419, "learning_rate": 9.947191957944098e-06, "loss": 0.166, "step": 804 }, { "epoch": 0.7516339869281046, "grad_norm": 2.349848900008704, "learning_rate": 9.946972542983112e-06, "loss": 0.0998, "step": 805 }, { "epoch": 0.7525676937441643, "grad_norm": 4.6350424851331455, "learning_rate": 9.946752675567161e-06, "loss": 0.1795, "step": 806 }, { "epoch": 0.7535014005602241, "grad_norm": 2.1874078491459903, "learning_rate": 9.946532355716351e-06, "loss": 0.0838, "step": 807 }, { "epoch": 0.7544351073762838, "grad_norm": 4.028757757703133, "learning_rate": 9.946311583450834e-06, "loss": 0.2508, "step": 808 }, { "epoch": 0.7553688141923436, "grad_norm": 4.066344354926907, "learning_rate": 9.946090358790804e-06, "loss": 0.1128, "step": 809 }, { "epoch": 0.7563025210084033, "grad_norm": 2.0441750391157836, "learning_rate": 9.945868681756492e-06, "loss": 0.0611, "step": 810 }, { "epoch": 0.7572362278244631, "grad_norm": 2.5218722482793527, "learning_rate": 9.945646552368173e-06, "loss": 0.1468, "step": 811 }, { "epoch": 0.7581699346405228, "grad_norm": 3.3854027757767495, "learning_rate": 9.945423970646163e-06, "loss": 0.2552, "step": 812 }, { "epoch": 0.7591036414565826, "grad_norm": 2.066243436376451, "learning_rate": 9.945200936610821e-06, "loss": 0.1026, "step": 813 }, { "epoch": 0.7600373482726424, "grad_norm": 3.684442319637859, "learning_rate": 9.944977450282545e-06, "loss": 0.1189, "step": 814 }, { "epoch": 0.7609710550887021, "grad_norm": 4.182505388126459, "learning_rate": 9.944753511681775e-06, "loss": 0.1488, "step": 815 }, { "epoch": 0.7619047619047619, "grad_norm": 3.543421054480762, "learning_rate": 9.944529120828991e-06, "loss": 0.179, "step": 816 }, { "epoch": 0.7628384687208216, "grad_norm": 1.6295356289559577, "learning_rate": 9.94430427774472e-06, "loss": 0.0498, "step": 817 }, { "epoch": 0.7637721755368814, "grad_norm": 3.5001707656457057, "learning_rate": 9.944078982449524e-06, "loss": 0.1691, "step": 818 }, { "epoch": 0.7647058823529411, "grad_norm": 2.34378090443754, "learning_rate": 9.94385323496401e-06, "loss": 0.0685, "step": 819 }, { "epoch": 0.765639589169001, "grad_norm": 1.0100321486795671, "learning_rate": 9.943627035308822e-06, "loss": 0.0291, "step": 820 }, { "epoch": 0.7665732959850607, "grad_norm": 2.8854632888236114, "learning_rate": 9.943400383504652e-06, "loss": 0.1561, "step": 821 }, { "epoch": 0.7675070028011205, "grad_norm": 2.3603796981987237, "learning_rate": 9.943173279572228e-06, "loss": 0.0499, "step": 822 }, { "epoch": 0.7684407096171803, "grad_norm": 3.5623335835649126, "learning_rate": 9.94294572353232e-06, "loss": 0.0773, "step": 823 }, { "epoch": 0.76937441643324, "grad_norm": 3.580132801836317, "learning_rate": 9.942717715405745e-06, "loss": 0.164, "step": 824 }, { "epoch": 0.7703081232492998, "grad_norm": 6.253055251797678, "learning_rate": 9.942489255213353e-06, "loss": 0.3016, "step": 825 }, { "epoch": 0.7712418300653595, "grad_norm": 2.2308211689123523, "learning_rate": 9.942260342976038e-06, "loss": 0.0493, "step": 826 }, { "epoch": 0.7721755368814193, "grad_norm": 3.4545630896074204, "learning_rate": 9.94203097871474e-06, "loss": 0.1965, "step": 827 }, { "epoch": 0.773109243697479, "grad_norm": 2.795927873424985, "learning_rate": 9.941801162450434e-06, "loss": 0.1293, "step": 828 }, { "epoch": 0.7740429505135388, "grad_norm": 4.554969019074254, "learning_rate": 9.941570894204144e-06, "loss": 0.1198, "step": 829 }, { "epoch": 0.7749766573295985, "grad_norm": 3.7191253121524936, "learning_rate": 9.941340173996926e-06, "loss": 0.1737, "step": 830 }, { "epoch": 0.7759103641456583, "grad_norm": 2.426902359462844, "learning_rate": 9.941109001849881e-06, "loss": 0.1405, "step": 831 }, { "epoch": 0.776844070961718, "grad_norm": 2.039165158689037, "learning_rate": 9.940877377784156e-06, "loss": 0.0644, "step": 832 }, { "epoch": 0.7777777777777778, "grad_norm": 2.435930670627958, "learning_rate": 9.940645301820933e-06, "loss": 0.0741, "step": 833 }, { "epoch": 0.7787114845938375, "grad_norm": 8.086674322133895, "learning_rate": 9.940412773981441e-06, "loss": 0.2693, "step": 834 }, { "epoch": 0.7796451914098973, "grad_norm": 2.7234198532791183, "learning_rate": 9.940179794286943e-06, "loss": 0.1803, "step": 835 }, { "epoch": 0.780578898225957, "grad_norm": 2.3663175554083495, "learning_rate": 9.93994636275875e-06, "loss": 0.0801, "step": 836 }, { "epoch": 0.7815126050420168, "grad_norm": 2.139440763229443, "learning_rate": 9.93971247941821e-06, "loss": 0.064, "step": 837 }, { "epoch": 0.7824463118580766, "grad_norm": 3.966130215481024, "learning_rate": 9.939478144286718e-06, "loss": 0.1908, "step": 838 }, { "epoch": 0.7833800186741363, "grad_norm": 3.1529930142690517, "learning_rate": 9.939243357385703e-06, "loss": 0.124, "step": 839 }, { "epoch": 0.7843137254901961, "grad_norm": 3.735634697281142, "learning_rate": 9.939008118736641e-06, "loss": 0.1365, "step": 840 }, { "epoch": 0.7852474323062558, "grad_norm": 2.4032819625427595, "learning_rate": 9.938772428361047e-06, "loss": 0.0818, "step": 841 }, { "epoch": 0.7861811391223156, "grad_norm": 2.4675307028770894, "learning_rate": 9.938536286280475e-06, "loss": 0.0988, "step": 842 }, { "epoch": 0.7871148459383753, "grad_norm": 3.3595508408614525, "learning_rate": 9.938299692516524e-06, "loss": 0.0503, "step": 843 }, { "epoch": 0.7880485527544351, "grad_norm": 2.7758335463098263, "learning_rate": 9.938062647090835e-06, "loss": 0.1511, "step": 844 }, { "epoch": 0.7889822595704948, "grad_norm": 2.8668994237272725, "learning_rate": 9.937825150025086e-06, "loss": 0.1218, "step": 845 }, { "epoch": 0.7899159663865546, "grad_norm": 2.2129522596155393, "learning_rate": 9.937587201341001e-06, "loss": 0.1323, "step": 846 }, { "epoch": 0.7908496732026143, "grad_norm": 3.1222500506636623, "learning_rate": 9.937348801060341e-06, "loss": 0.1174, "step": 847 }, { "epoch": 0.7917833800186741, "grad_norm": 5.2204687945889265, "learning_rate": 9.93710994920491e-06, "loss": 0.2087, "step": 848 }, { "epoch": 0.7927170868347339, "grad_norm": 1.630954159905446, "learning_rate": 9.936870645796555e-06, "loss": 0.0439, "step": 849 }, { "epoch": 0.7936507936507936, "grad_norm": 5.716301693235619, "learning_rate": 9.936630890857164e-06, "loss": 0.3179, "step": 850 }, { "epoch": 0.7945845004668534, "grad_norm": 3.236073791042338, "learning_rate": 9.936390684408662e-06, "loss": 0.1429, "step": 851 }, { "epoch": 0.7955182072829131, "grad_norm": 3.0164747645138843, "learning_rate": 9.936150026473022e-06, "loss": 0.1202, "step": 852 }, { "epoch": 0.7964519140989729, "grad_norm": 2.2033272384775677, "learning_rate": 9.935908917072253e-06, "loss": 0.1316, "step": 853 }, { "epoch": 0.7973856209150327, "grad_norm": 2.0172733263813325, "learning_rate": 9.935667356228405e-06, "loss": 0.0986, "step": 854 }, { "epoch": 0.7983193277310925, "grad_norm": 1.8160348496490095, "learning_rate": 9.935425343963574e-06, "loss": 0.0819, "step": 855 }, { "epoch": 0.7992530345471522, "grad_norm": 3.1553045523990604, "learning_rate": 9.935182880299897e-06, "loss": 0.1818, "step": 856 }, { "epoch": 0.800186741363212, "grad_norm": 1.5872456352483033, "learning_rate": 9.934939965259545e-06, "loss": 0.0709, "step": 857 }, { "epoch": 0.8011204481792717, "grad_norm": 2.2413497287840443, "learning_rate": 9.934696598864737e-06, "loss": 0.0946, "step": 858 }, { "epoch": 0.8020541549953315, "grad_norm": 1.8943289458975154, "learning_rate": 9.934452781137733e-06, "loss": 0.0925, "step": 859 }, { "epoch": 0.8029878618113913, "grad_norm": 2.746706488870039, "learning_rate": 9.934208512100833e-06, "loss": 0.1423, "step": 860 }, { "epoch": 0.803921568627451, "grad_norm": 1.5584830110390275, "learning_rate": 9.933963791776376e-06, "loss": 0.0389, "step": 861 }, { "epoch": 0.8048552754435108, "grad_norm": 2.5845817935573985, "learning_rate": 9.933718620186745e-06, "loss": 0.1183, "step": 862 }, { "epoch": 0.8057889822595705, "grad_norm": 4.555536212040681, "learning_rate": 9.933472997354364e-06, "loss": 0.2266, "step": 863 }, { "epoch": 0.8067226890756303, "grad_norm": 3.176857730250523, "learning_rate": 9.933226923301697e-06, "loss": 0.1965, "step": 864 }, { "epoch": 0.80765639589169, "grad_norm": 5.074711531497866, "learning_rate": 9.932980398051253e-06, "loss": 0.3669, "step": 865 }, { "epoch": 0.8085901027077498, "grad_norm": 2.6707859679634907, "learning_rate": 9.932733421625577e-06, "loss": 0.1257, "step": 866 }, { "epoch": 0.8095238095238095, "grad_norm": 3.633122907043409, "learning_rate": 9.932485994047258e-06, "loss": 0.1221, "step": 867 }, { "epoch": 0.8104575163398693, "grad_norm": 2.5832554940135433, "learning_rate": 9.932238115338925e-06, "loss": 0.0874, "step": 868 }, { "epoch": 0.811391223155929, "grad_norm": 3.1639124159730905, "learning_rate": 9.93198978552325e-06, "loss": 0.1386, "step": 869 }, { "epoch": 0.8123249299719888, "grad_norm": 2.055187234634805, "learning_rate": 9.931741004622947e-06, "loss": 0.0573, "step": 870 }, { "epoch": 0.8132586367880486, "grad_norm": 4.677525893782564, "learning_rate": 9.93149177266077e-06, "loss": 0.1867, "step": 871 }, { "epoch": 0.8141923436041083, "grad_norm": 1.8328874613684363, "learning_rate": 9.93124208965951e-06, "loss": 0.1019, "step": 872 }, { "epoch": 0.8151260504201681, "grad_norm": 5.186665278495043, "learning_rate": 9.930991955642006e-06, "loss": 0.3243, "step": 873 }, { "epoch": 0.8160597572362278, "grad_norm": 1.6862525358580958, "learning_rate": 9.930741370631137e-06, "loss": 0.0584, "step": 874 }, { "epoch": 0.8169934640522876, "grad_norm": 4.0995983207296955, "learning_rate": 9.93049033464982e-06, "loss": 0.1703, "step": 875 }, { "epoch": 0.8179271708683473, "grad_norm": 8.633296250943896, "learning_rate": 9.930238847721015e-06, "loss": 0.2433, "step": 876 }, { "epoch": 0.8188608776844071, "grad_norm": 4.126640968808962, "learning_rate": 9.929986909867721e-06, "loss": 0.2012, "step": 877 }, { "epoch": 0.8197945845004668, "grad_norm": 3.1808860805572823, "learning_rate": 9.929734521112986e-06, "loss": 0.1156, "step": 878 }, { "epoch": 0.8207282913165266, "grad_norm": 2.1955710860268924, "learning_rate": 9.929481681479889e-06, "loss": 0.0535, "step": 879 }, { "epoch": 0.8216619981325863, "grad_norm": 2.1639020170079166, "learning_rate": 9.929228390991557e-06, "loss": 0.076, "step": 880 }, { "epoch": 0.8225957049486461, "grad_norm": 2.426860885999806, "learning_rate": 9.928974649671155e-06, "loss": 0.1204, "step": 881 }, { "epoch": 0.8235294117647058, "grad_norm": 2.853335835569478, "learning_rate": 9.928720457541892e-06, "loss": 0.1576, "step": 882 }, { "epoch": 0.8244631185807656, "grad_norm": 3.1453460889671874, "learning_rate": 9.928465814627016e-06, "loss": 0.1541, "step": 883 }, { "epoch": 0.8253968253968254, "grad_norm": 3.5401129141821843, "learning_rate": 9.928210720949814e-06, "loss": 0.1541, "step": 884 }, { "epoch": 0.8263305322128851, "grad_norm": 1.4085304219957233, "learning_rate": 9.927955176533622e-06, "loss": 0.0594, "step": 885 }, { "epoch": 0.8272642390289449, "grad_norm": 2.6188352730784126, "learning_rate": 9.927699181401811e-06, "loss": 0.1368, "step": 886 }, { "epoch": 0.8281979458450047, "grad_norm": 3.105824074966025, "learning_rate": 9.927442735577793e-06, "loss": 0.1276, "step": 887 }, { "epoch": 0.8291316526610645, "grad_norm": 4.661557605980531, "learning_rate": 9.927185839085021e-06, "loss": 0.1965, "step": 888 }, { "epoch": 0.8300653594771242, "grad_norm": 3.8273894255804914, "learning_rate": 9.926928491946996e-06, "loss": 0.1476, "step": 889 }, { "epoch": 0.830999066293184, "grad_norm": 3.165142862918767, "learning_rate": 9.926670694187252e-06, "loss": 0.21, "step": 890 }, { "epoch": 0.8319327731092437, "grad_norm": 4.028369483704292, "learning_rate": 9.92641244582937e-06, "loss": 0.1235, "step": 891 }, { "epoch": 0.8328664799253035, "grad_norm": 4.318669727311548, "learning_rate": 9.926153746896964e-06, "loss": 0.2259, "step": 892 }, { "epoch": 0.8338001867413632, "grad_norm": 2.473154418811061, "learning_rate": 9.925894597413703e-06, "loss": 0.0966, "step": 893 }, { "epoch": 0.834733893557423, "grad_norm": 1.4858876731627688, "learning_rate": 9.925634997403281e-06, "loss": 0.0549, "step": 894 }, { "epoch": 0.8356676003734828, "grad_norm": 3.6036010398821405, "learning_rate": 9.925374946889448e-06, "loss": 0.2031, "step": 895 }, { "epoch": 0.8366013071895425, "grad_norm": 4.252344508972572, "learning_rate": 9.925114445895985e-06, "loss": 0.2235, "step": 896 }, { "epoch": 0.8375350140056023, "grad_norm": 2.8146718166420817, "learning_rate": 9.924853494446716e-06, "loss": 0.1259, "step": 897 }, { "epoch": 0.838468720821662, "grad_norm": 6.897137568288283, "learning_rate": 9.924592092565512e-06, "loss": 0.2271, "step": 898 }, { "epoch": 0.8394024276377218, "grad_norm": 3.486887587864985, "learning_rate": 9.924330240276279e-06, "loss": 0.0993, "step": 899 }, { "epoch": 0.8403361344537815, "grad_norm": 2.94838832143815, "learning_rate": 9.924067937602965e-06, "loss": 0.0647, "step": 900 }, { "epoch": 0.8412698412698413, "grad_norm": 2.8301414683869193, "learning_rate": 9.923805184569562e-06, "loss": 0.1136, "step": 901 }, { "epoch": 0.842203548085901, "grad_norm": 9.15788439300488, "learning_rate": 9.923541981200103e-06, "loss": 0.219, "step": 902 }, { "epoch": 0.8431372549019608, "grad_norm": 3.8732110715289036, "learning_rate": 9.923278327518659e-06, "loss": 0.0816, "step": 903 }, { "epoch": 0.8440709617180205, "grad_norm": 4.0444471450220245, "learning_rate": 9.923014223549343e-06, "loss": 0.1626, "step": 904 }, { "epoch": 0.8450046685340803, "grad_norm": 3.071415602549754, "learning_rate": 9.922749669316312e-06, "loss": 0.0601, "step": 905 }, { "epoch": 0.84593837535014, "grad_norm": 4.974541834111028, "learning_rate": 9.922484664843763e-06, "loss": 0.2315, "step": 906 }, { "epoch": 0.8468720821661998, "grad_norm": 3.717881474740215, "learning_rate": 9.922219210155932e-06, "loss": 0.1668, "step": 907 }, { "epoch": 0.8478057889822596, "grad_norm": 1.1916462109150008, "learning_rate": 9.921953305277097e-06, "loss": 0.047, "step": 908 }, { "epoch": 0.8487394957983193, "grad_norm": 1.7310061561133148, "learning_rate": 9.921686950231581e-06, "loss": 0.0612, "step": 909 }, { "epoch": 0.8496732026143791, "grad_norm": 4.806251258995923, "learning_rate": 9.921420145043743e-06, "loss": 0.2983, "step": 910 }, { "epoch": 0.8506069094304388, "grad_norm": 3.5879252321365502, "learning_rate": 9.921152889737985e-06, "loss": 0.1704, "step": 911 }, { "epoch": 0.8515406162464986, "grad_norm": 1.128447978457779, "learning_rate": 9.920885184338751e-06, "loss": 0.027, "step": 912 }, { "epoch": 0.8524743230625583, "grad_norm": 2.1749203925614817, "learning_rate": 9.920617028870524e-06, "loss": 0.1231, "step": 913 }, { "epoch": 0.8534080298786181, "grad_norm": 9.175651724594834, "learning_rate": 9.920348423357835e-06, "loss": 0.1473, "step": 914 }, { "epoch": 0.8543417366946778, "grad_norm": 2.2343853449681332, "learning_rate": 9.920079367825244e-06, "loss": 0.1104, "step": 915 }, { "epoch": 0.8552754435107376, "grad_norm": 3.046737068729024, "learning_rate": 9.919809862297364e-06, "loss": 0.2303, "step": 916 }, { "epoch": 0.8562091503267973, "grad_norm": 3.5758597757414776, "learning_rate": 9.919539906798844e-06, "loss": 0.1729, "step": 917 }, { "epoch": 0.8571428571428571, "grad_norm": 6.059965411696225, "learning_rate": 9.91926950135437e-06, "loss": 0.3084, "step": 918 }, { "epoch": 0.8580765639589168, "grad_norm": 3.4758428186355617, "learning_rate": 9.918998645988678e-06, "loss": 0.2327, "step": 919 }, { "epoch": 0.8590102707749766, "grad_norm": 6.47229338882104, "learning_rate": 9.918727340726538e-06, "loss": 0.2347, "step": 920 }, { "epoch": 0.8599439775910365, "grad_norm": 4.329056273102721, "learning_rate": 9.918455585592767e-06, "loss": 0.2889, "step": 921 }, { "epoch": 0.8608776844070962, "grad_norm": 5.369140970297625, "learning_rate": 9.918183380612216e-06, "loss": 0.3377, "step": 922 }, { "epoch": 0.861811391223156, "grad_norm": 5.4797383912631705, "learning_rate": 9.917910725809786e-06, "loss": 0.0887, "step": 923 }, { "epoch": 0.8627450980392157, "grad_norm": 2.638429843525787, "learning_rate": 9.917637621210407e-06, "loss": 0.1405, "step": 924 }, { "epoch": 0.8636788048552755, "grad_norm": 3.466083463320452, "learning_rate": 9.917364066839066e-06, "loss": 0.1427, "step": 925 }, { "epoch": 0.8646125116713352, "grad_norm": 3.9914542667314032, "learning_rate": 9.917090062720778e-06, "loss": 0.0855, "step": 926 }, { "epoch": 0.865546218487395, "grad_norm": 2.8087231462325266, "learning_rate": 9.916815608880602e-06, "loss": 0.0653, "step": 927 }, { "epoch": 0.8664799253034547, "grad_norm": 7.113998522023931, "learning_rate": 9.916540705343643e-06, "loss": 0.2236, "step": 928 }, { "epoch": 0.8674136321195145, "grad_norm": 3.547481330555779, "learning_rate": 9.916265352135043e-06, "loss": 0.1167, "step": 929 }, { "epoch": 0.8683473389355743, "grad_norm": 2.8510115918681542, "learning_rate": 9.915989549279986e-06, "loss": 0.0869, "step": 930 }, { "epoch": 0.869281045751634, "grad_norm": 2.5189405020272044, "learning_rate": 9.915713296803697e-06, "loss": 0.1124, "step": 931 }, { "epoch": 0.8702147525676938, "grad_norm": 3.703374180909856, "learning_rate": 9.915436594731443e-06, "loss": 0.1212, "step": 932 }, { "epoch": 0.8711484593837535, "grad_norm": 3.474692408498576, "learning_rate": 9.91515944308853e-06, "loss": 0.0932, "step": 933 }, { "epoch": 0.8720821661998133, "grad_norm": 4.256677453542541, "learning_rate": 9.914881841900307e-06, "loss": 0.1042, "step": 934 }, { "epoch": 0.873015873015873, "grad_norm": 2.037959034237598, "learning_rate": 9.914603791192166e-06, "loss": 0.0706, "step": 935 }, { "epoch": 0.8739495798319328, "grad_norm": 2.40037672383084, "learning_rate": 9.914325290989535e-06, "loss": 0.0551, "step": 936 }, { "epoch": 0.8748832866479925, "grad_norm": 4.086491296705496, "learning_rate": 9.914046341317887e-06, "loss": 0.0765, "step": 937 }, { "epoch": 0.8758169934640523, "grad_norm": 1.7997911892420944, "learning_rate": 9.913766942202735e-06, "loss": 0.084, "step": 938 }, { "epoch": 0.876750700280112, "grad_norm": 4.147088789352655, "learning_rate": 9.913487093669633e-06, "loss": 0.2715, "step": 939 }, { "epoch": 0.8776844070961718, "grad_norm": 1.1544159356649477, "learning_rate": 9.913206795744177e-06, "loss": 0.0272, "step": 940 }, { "epoch": 0.8786181139122315, "grad_norm": 1.806203708430463, "learning_rate": 9.912926048452001e-06, "loss": 0.0474, "step": 941 }, { "epoch": 0.8795518207282913, "grad_norm": 3.172533136745077, "learning_rate": 9.912644851818784e-06, "loss": 0.1256, "step": 942 }, { "epoch": 0.880485527544351, "grad_norm": 1.294537183769138, "learning_rate": 9.912363205870245e-06, "loss": 0.0395, "step": 943 }, { "epoch": 0.8814192343604108, "grad_norm": 1.5896725452132554, "learning_rate": 9.912081110632143e-06, "loss": 0.0522, "step": 944 }, { "epoch": 0.8823529411764706, "grad_norm": 3.6715878780639337, "learning_rate": 9.91179856613028e-06, "loss": 0.1718, "step": 945 }, { "epoch": 0.8832866479925303, "grad_norm": 4.723016025825213, "learning_rate": 9.911515572390496e-06, "loss": 0.2223, "step": 946 }, { "epoch": 0.8842203548085901, "grad_norm": 3.209553615056048, "learning_rate": 9.911232129438672e-06, "loss": 0.1673, "step": 947 }, { "epoch": 0.8851540616246498, "grad_norm": 0.9135346853610421, "learning_rate": 9.910948237300737e-06, "loss": 0.0269, "step": 948 }, { "epoch": 0.8860877684407096, "grad_norm": 2.101125136735973, "learning_rate": 9.910663896002653e-06, "loss": 0.1146, "step": 949 }, { "epoch": 0.8870214752567693, "grad_norm": 3.3307291858251853, "learning_rate": 9.910379105570427e-06, "loss": 0.1349, "step": 950 }, { "epoch": 0.8879551820728291, "grad_norm": 4.488488636536478, "learning_rate": 9.910093866030107e-06, "loss": 0.1908, "step": 951 }, { "epoch": 0.8888888888888888, "grad_norm": 1.9655297909584286, "learning_rate": 9.909808177407778e-06, "loss": 0.08, "step": 952 }, { "epoch": 0.8898225957049486, "grad_norm": 3.478310750539363, "learning_rate": 9.909522039729571e-06, "loss": 0.1012, "step": 953 }, { "epoch": 0.8907563025210085, "grad_norm": 2.553594335272661, "learning_rate": 9.90923545302166e-06, "loss": 0.1157, "step": 954 }, { "epoch": 0.8916900093370682, "grad_norm": 2.2612777027897533, "learning_rate": 9.90894841731025e-06, "loss": 0.1176, "step": 955 }, { "epoch": 0.892623716153128, "grad_norm": 2.9944515278360613, "learning_rate": 9.9086609326216e-06, "loss": 0.1842, "step": 956 }, { "epoch": 0.8935574229691877, "grad_norm": 3.623089521629253, "learning_rate": 9.908372998981998e-06, "loss": 0.1501, "step": 957 }, { "epoch": 0.8944911297852475, "grad_norm": 2.9945064957741403, "learning_rate": 9.90808461641778e-06, "loss": 0.1641, "step": 958 }, { "epoch": 0.8954248366013072, "grad_norm": 3.4685375819976407, "learning_rate": 9.907795784955327e-06, "loss": 0.1538, "step": 959 }, { "epoch": 0.896358543417367, "grad_norm": 2.4836095537129186, "learning_rate": 9.907506504621052e-06, "loss": 0.1345, "step": 960 }, { "epoch": 0.8972922502334267, "grad_norm": 2.701240609406823, "learning_rate": 9.90721677544141e-06, "loss": 0.1736, "step": 961 }, { "epoch": 0.8982259570494865, "grad_norm": 4.753791168103312, "learning_rate": 9.906926597442905e-06, "loss": 0.2021, "step": 962 }, { "epoch": 0.8991596638655462, "grad_norm": 1.5689673032928362, "learning_rate": 9.906635970652071e-06, "loss": 0.0535, "step": 963 }, { "epoch": 0.900093370681606, "grad_norm": 2.2703707582110884, "learning_rate": 9.906344895095496e-06, "loss": 0.0616, "step": 964 }, { "epoch": 0.9010270774976658, "grad_norm": 4.25898017586217, "learning_rate": 9.906053370799799e-06, "loss": 0.1421, "step": 965 }, { "epoch": 0.9019607843137255, "grad_norm": 2.9864802679487767, "learning_rate": 9.905761397791642e-06, "loss": 0.1563, "step": 966 }, { "epoch": 0.9028944911297853, "grad_norm": 5.159036580216084, "learning_rate": 9.90546897609773e-06, "loss": 0.2494, "step": 967 }, { "epoch": 0.903828197945845, "grad_norm": 3.0442254468463372, "learning_rate": 9.905176105744807e-06, "loss": 0.133, "step": 968 }, { "epoch": 0.9047619047619048, "grad_norm": 1.6457922327277914, "learning_rate": 9.904882786759661e-06, "loss": 0.0533, "step": 969 }, { "epoch": 0.9056956115779645, "grad_norm": 2.754087802348758, "learning_rate": 9.904589019169118e-06, "loss": 0.1736, "step": 970 }, { "epoch": 0.9066293183940243, "grad_norm": 2.5272977401890726, "learning_rate": 9.904294803000048e-06, "loss": 0.1239, "step": 971 }, { "epoch": 0.907563025210084, "grad_norm": 3.7915084028839874, "learning_rate": 9.904000138279359e-06, "loss": 0.1863, "step": 972 }, { "epoch": 0.9084967320261438, "grad_norm": 2.920597118990808, "learning_rate": 9.903705025034001e-06, "loss": 0.1339, "step": 973 }, { "epoch": 0.9094304388422035, "grad_norm": 2.456840250887364, "learning_rate": 9.903409463290964e-06, "loss": 0.0822, "step": 974 }, { "epoch": 0.9103641456582633, "grad_norm": 3.0119837853414175, "learning_rate": 9.903113453077285e-06, "loss": 0.151, "step": 975 }, { "epoch": 0.911297852474323, "grad_norm": 3.0861024185480868, "learning_rate": 9.902816994420034e-06, "loss": 0.0635, "step": 976 }, { "epoch": 0.9122315592903828, "grad_norm": 1.5562594815604187, "learning_rate": 9.902520087346326e-06, "loss": 0.054, "step": 977 }, { "epoch": 0.9131652661064426, "grad_norm": 3.192113568862246, "learning_rate": 9.902222731883316e-06, "loss": 0.1609, "step": 978 }, { "epoch": 0.9140989729225023, "grad_norm": 2.077074002094318, "learning_rate": 9.901924928058201e-06, "loss": 0.0861, "step": 979 }, { "epoch": 0.9150326797385621, "grad_norm": 2.60512911966638, "learning_rate": 9.901626675898217e-06, "loss": 0.112, "step": 980 }, { "epoch": 0.9159663865546218, "grad_norm": 3.930057919643749, "learning_rate": 9.901327975430645e-06, "loss": 0.2102, "step": 981 }, { "epoch": 0.9169000933706816, "grad_norm": 6.386265428388982, "learning_rate": 9.901028826682803e-06, "loss": 0.2984, "step": 982 }, { "epoch": 0.9178338001867413, "grad_norm": 1.8826502268895096, "learning_rate": 9.900729229682051e-06, "loss": 0.093, "step": 983 }, { "epoch": 0.9187675070028011, "grad_norm": 4.110355090707916, "learning_rate": 9.900429184455792e-06, "loss": 0.1266, "step": 984 }, { "epoch": 0.9197012138188608, "grad_norm": 3.5937334757658443, "learning_rate": 9.900128691031466e-06, "loss": 0.1634, "step": 985 }, { "epoch": 0.9206349206349206, "grad_norm": 1.6266568745539158, "learning_rate": 9.89982774943656e-06, "loss": 0.0859, "step": 986 }, { "epoch": 0.9215686274509803, "grad_norm": 4.610974290257633, "learning_rate": 9.899526359698594e-06, "loss": 0.1231, "step": 987 }, { "epoch": 0.9225023342670402, "grad_norm": 2.005350189687377, "learning_rate": 9.899224521845136e-06, "loss": 0.0807, "step": 988 }, { "epoch": 0.9234360410831, "grad_norm": 2.905599739062576, "learning_rate": 9.898922235903793e-06, "loss": 0.1948, "step": 989 }, { "epoch": 0.9243697478991597, "grad_norm": 3.8565025783060114, "learning_rate": 9.898619501902211e-06, "loss": 0.1307, "step": 990 }, { "epoch": 0.9253034547152195, "grad_norm": 4.3007508151713125, "learning_rate": 9.89831631986808e-06, "loss": 0.2167, "step": 991 }, { "epoch": 0.9262371615312792, "grad_norm": 2.653842067072618, "learning_rate": 9.898012689829125e-06, "loss": 0.133, "step": 992 }, { "epoch": 0.927170868347339, "grad_norm": 1.5839616439380977, "learning_rate": 9.897708611813121e-06, "loss": 0.0626, "step": 993 }, { "epoch": 0.9281045751633987, "grad_norm": 1.9159189061060402, "learning_rate": 9.897404085847879e-06, "loss": 0.1037, "step": 994 }, { "epoch": 0.9290382819794585, "grad_norm": 1.5408691070857645, "learning_rate": 9.897099111961248e-06, "loss": 0.0712, "step": 995 }, { "epoch": 0.9299719887955182, "grad_norm": 4.278856760791173, "learning_rate": 9.896793690181123e-06, "loss": 0.0674, "step": 996 }, { "epoch": 0.930905695611578, "grad_norm": 2.868878216155147, "learning_rate": 9.89648782053544e-06, "loss": 0.0919, "step": 997 }, { "epoch": 0.9318394024276377, "grad_norm": 3.8665620149568944, "learning_rate": 9.896181503052171e-06, "loss": 0.1955, "step": 998 }, { "epoch": 0.9327731092436975, "grad_norm": 3.0575706161784053, "learning_rate": 9.895874737759334e-06, "loss": 0.1209, "step": 999 }, { "epoch": 0.9337068160597572, "grad_norm": 3.4089337697300515, "learning_rate": 9.895567524684986e-06, "loss": 0.2259, "step": 1000 }, { "epoch": 0.934640522875817, "grad_norm": 2.195694085320903, "learning_rate": 9.895259863857223e-06, "loss": 0.0626, "step": 1001 }, { "epoch": 0.9355742296918768, "grad_norm": 2.811194777232517, "learning_rate": 9.894951755304187e-06, "loss": 0.1028, "step": 1002 }, { "epoch": 0.9365079365079365, "grad_norm": 3.1430697266691103, "learning_rate": 9.894643199054057e-06, "loss": 0.1887, "step": 1003 }, { "epoch": 0.9374416433239963, "grad_norm": 1.675589489259222, "learning_rate": 9.894334195135052e-06, "loss": 0.0284, "step": 1004 }, { "epoch": 0.938375350140056, "grad_norm": 8.528410814844325, "learning_rate": 9.894024743575436e-06, "loss": 0.2373, "step": 1005 }, { "epoch": 0.9393090569561158, "grad_norm": 1.343868718716114, "learning_rate": 9.89371484440351e-06, "loss": 0.0365, "step": 1006 }, { "epoch": 0.9402427637721755, "grad_norm": 1.4026750269210921, "learning_rate": 9.89340449764762e-06, "loss": 0.0475, "step": 1007 }, { "epoch": 0.9411764705882353, "grad_norm": 6.405944150151547, "learning_rate": 9.893093703336147e-06, "loss": 0.3502, "step": 1008 }, { "epoch": 0.942110177404295, "grad_norm": 1.928261436417829, "learning_rate": 9.892782461497521e-06, "loss": 0.0507, "step": 1009 }, { "epoch": 0.9430438842203548, "grad_norm": 1.698661003519393, "learning_rate": 9.892470772160205e-06, "loss": 0.0482, "step": 1010 }, { "epoch": 0.9439775910364145, "grad_norm": 2.7335900401625275, "learning_rate": 9.892158635352707e-06, "loss": 0.129, "step": 1011 }, { "epoch": 0.9449112978524743, "grad_norm": 4.684515296783952, "learning_rate": 9.891846051103578e-06, "loss": 0.0991, "step": 1012 }, { "epoch": 0.945845004668534, "grad_norm": 2.169298833252371, "learning_rate": 9.891533019441404e-06, "loss": 0.0607, "step": 1013 }, { "epoch": 0.9467787114845938, "grad_norm": 2.445258384155542, "learning_rate": 9.891219540394816e-06, "loss": 0.0463, "step": 1014 }, { "epoch": 0.9477124183006536, "grad_norm": 4.089506988499461, "learning_rate": 9.890905613992485e-06, "loss": 0.2262, "step": 1015 }, { "epoch": 0.9486461251167133, "grad_norm": 6.092947142876529, "learning_rate": 9.890591240263125e-06, "loss": 0.252, "step": 1016 }, { "epoch": 0.9495798319327731, "grad_norm": 7.934925711756923, "learning_rate": 9.890276419235488e-06, "loss": 0.3156, "step": 1017 }, { "epoch": 0.9505135387488328, "grad_norm": 2.6226818554158595, "learning_rate": 9.889961150938366e-06, "loss": 0.0694, "step": 1018 }, { "epoch": 0.9514472455648926, "grad_norm": 4.735733829437665, "learning_rate": 9.889645435400594e-06, "loss": 0.1279, "step": 1019 }, { "epoch": 0.9523809523809523, "grad_norm": 8.11177247345657, "learning_rate": 9.889329272651049e-06, "loss": 0.1468, "step": 1020 }, { "epoch": 0.9533146591970122, "grad_norm": 6.003281925529701, "learning_rate": 9.889012662718648e-06, "loss": 0.1719, "step": 1021 }, { "epoch": 0.954248366013072, "grad_norm": 2.9964866042288154, "learning_rate": 9.888695605632347e-06, "loss": 0.1171, "step": 1022 }, { "epoch": 0.9551820728291317, "grad_norm": 7.848951417108149, "learning_rate": 9.888378101421148e-06, "loss": 0.3013, "step": 1023 }, { "epoch": 0.9561157796451915, "grad_norm": 8.023097734977094, "learning_rate": 9.888060150114084e-06, "loss": 0.2271, "step": 1024 }, { "epoch": 0.9570494864612512, "grad_norm": 15.042424330119767, "learning_rate": 9.88774175174024e-06, "loss": 0.1136, "step": 1025 }, { "epoch": 0.957983193277311, "grad_norm": 6.907509032299053, "learning_rate": 9.887422906328735e-06, "loss": 0.0941, "step": 1026 }, { "epoch": 0.9589169000933707, "grad_norm": 1.7327978435711828, "learning_rate": 9.88710361390873e-06, "loss": 0.0357, "step": 1027 }, { "epoch": 0.9598506069094305, "grad_norm": 3.269118499998131, "learning_rate": 9.88678387450943e-06, "loss": 0.1013, "step": 1028 }, { "epoch": 0.9607843137254902, "grad_norm": 6.0669425947255275, "learning_rate": 9.88646368816008e-06, "loss": 0.3174, "step": 1029 }, { "epoch": 0.96171802054155, "grad_norm": 2.803777850674087, "learning_rate": 9.88614305488996e-06, "loss": 0.1369, "step": 1030 }, { "epoch": 0.9626517273576097, "grad_norm": 2.6465277746257647, "learning_rate": 9.8858219747284e-06, "loss": 0.0655, "step": 1031 }, { "epoch": 0.9635854341736695, "grad_norm": 3.2300618319263728, "learning_rate": 9.885500447704762e-06, "loss": 0.0848, "step": 1032 }, { "epoch": 0.9645191409897292, "grad_norm": 5.450409282686068, "learning_rate": 9.885178473848455e-06, "loss": 0.2007, "step": 1033 }, { "epoch": 0.965452847805789, "grad_norm": 3.902521498612869, "learning_rate": 9.88485605318893e-06, "loss": 0.1672, "step": 1034 }, { "epoch": 0.9663865546218487, "grad_norm": 4.540344040383726, "learning_rate": 9.88453318575567e-06, "loss": 0.2512, "step": 1035 }, { "epoch": 0.9673202614379085, "grad_norm": 4.003452802016604, "learning_rate": 9.884209871578213e-06, "loss": 0.1127, "step": 1036 }, { "epoch": 0.9682539682539683, "grad_norm": 2.090295548302232, "learning_rate": 9.88388611068612e-06, "loss": 0.0994, "step": 1037 }, { "epoch": 0.969187675070028, "grad_norm": 2.747332666906141, "learning_rate": 9.88356190310901e-06, "loss": 0.0856, "step": 1038 }, { "epoch": 0.9701213818860878, "grad_norm": 1.3922497002514853, "learning_rate": 9.88323724887653e-06, "loss": 0.0515, "step": 1039 }, { "epoch": 0.9710550887021475, "grad_norm": 1.901543287663392, "learning_rate": 9.882912148018378e-06, "loss": 0.0658, "step": 1040 }, { "epoch": 0.9719887955182073, "grad_norm": 3.7860565169110383, "learning_rate": 9.882586600564284e-06, "loss": 0.1365, "step": 1041 }, { "epoch": 0.972922502334267, "grad_norm": 3.365831713118152, "learning_rate": 9.882260606544025e-06, "loss": 0.0811, "step": 1042 }, { "epoch": 0.9738562091503268, "grad_norm": 2.4501961471365554, "learning_rate": 9.881934165987416e-06, "loss": 0.1061, "step": 1043 }, { "epoch": 0.9747899159663865, "grad_norm": 4.510118515480747, "learning_rate": 9.881607278924317e-06, "loss": 0.2423, "step": 1044 }, { "epoch": 0.9757236227824463, "grad_norm": 3.794248418644263, "learning_rate": 9.88127994538462e-06, "loss": 0.2576, "step": 1045 }, { "epoch": 0.976657329598506, "grad_norm": 2.172320530773855, "learning_rate": 9.880952165398265e-06, "loss": 0.1063, "step": 1046 }, { "epoch": 0.9775910364145658, "grad_norm": 3.1266931400272795, "learning_rate": 9.880623938995232e-06, "loss": 0.1557, "step": 1047 }, { "epoch": 0.9785247432306255, "grad_norm": 2.225061806461758, "learning_rate": 9.880295266205542e-06, "loss": 0.0683, "step": 1048 }, { "epoch": 0.9794584500466853, "grad_norm": 3.9165907634781845, "learning_rate": 9.879966147059254e-06, "loss": 0.1102, "step": 1049 }, { "epoch": 0.9803921568627451, "grad_norm": 3.1910336757817217, "learning_rate": 9.879636581586469e-06, "loss": 0.0931, "step": 1050 }, { "epoch": 0.9813258636788048, "grad_norm": 4.356078756492667, "learning_rate": 9.879306569817331e-06, "loss": 0.1683, "step": 1051 }, { "epoch": 0.9822595704948646, "grad_norm": 2.009972635702513, "learning_rate": 9.878976111782022e-06, "loss": 0.1024, "step": 1052 }, { "epoch": 0.9831932773109243, "grad_norm": 6.0817359500384, "learning_rate": 9.878645207510767e-06, "loss": 0.3411, "step": 1053 }, { "epoch": 0.9841269841269841, "grad_norm": 4.303132265952971, "learning_rate": 9.87831385703383e-06, "loss": 0.2847, "step": 1054 }, { "epoch": 0.9850606909430439, "grad_norm": 1.6175339673827531, "learning_rate": 9.877982060381516e-06, "loss": 0.0602, "step": 1055 }, { "epoch": 0.9859943977591037, "grad_norm": 2.659599118078244, "learning_rate": 9.877649817584174e-06, "loss": 0.0729, "step": 1056 }, { "epoch": 0.9869281045751634, "grad_norm": 2.23191333232627, "learning_rate": 9.877317128672188e-06, "loss": 0.1218, "step": 1057 }, { "epoch": 0.9878618113912232, "grad_norm": 3.722497381366038, "learning_rate": 9.87698399367599e-06, "loss": 0.1119, "step": 1058 }, { "epoch": 0.988795518207283, "grad_norm": 4.9322201918673345, "learning_rate": 9.876650412626045e-06, "loss": 0.1581, "step": 1059 }, { "epoch": 0.9897292250233427, "grad_norm": 2.176586253534966, "learning_rate": 9.876316385552865e-06, "loss": 0.0824, "step": 1060 }, { "epoch": 0.9906629318394025, "grad_norm": 2.191793019247684, "learning_rate": 9.875981912486998e-06, "loss": 0.1092, "step": 1061 }, { "epoch": 0.9915966386554622, "grad_norm": 3.7168678821861656, "learning_rate": 9.875646993459038e-06, "loss": 0.1535, "step": 1062 }, { "epoch": 0.992530345471522, "grad_norm": 2.0266994630690855, "learning_rate": 9.875311628499618e-06, "loss": 0.0579, "step": 1063 }, { "epoch": 0.9934640522875817, "grad_norm": 2.6627897589957366, "learning_rate": 9.874975817639406e-06, "loss": 0.1424, "step": 1064 }, { "epoch": 0.9943977591036415, "grad_norm": 2.887634811264857, "learning_rate": 9.874639560909118e-06, "loss": 0.1804, "step": 1065 }, { "epoch": 0.9953314659197012, "grad_norm": 2.9086375057282012, "learning_rate": 9.87430285833951e-06, "loss": 0.1701, "step": 1066 }, { "epoch": 0.996265172735761, "grad_norm": 2.0028589419658807, "learning_rate": 9.873965709961378e-06, "loss": 0.0701, "step": 1067 }, { "epoch": 0.9971988795518207, "grad_norm": 4.462822406718407, "learning_rate": 9.873628115805551e-06, "loss": 0.2035, "step": 1068 }, { "epoch": 0.9981325863678805, "grad_norm": 2.535981969735211, "learning_rate": 9.873290075902914e-06, "loss": 0.0891, "step": 1069 }, { "epoch": 0.9990662931839402, "grad_norm": 2.5819075508513323, "learning_rate": 9.87295159028438e-06, "loss": 0.1614, "step": 1070 }, { "epoch": 1.0, "grad_norm": 4.24611575718345, "learning_rate": 9.872612658980908e-06, "loss": 0.1754, "step": 1071 }, { "epoch": 1.0009337068160598, "grad_norm": 5.494060855332059, "learning_rate": 9.872273282023497e-06, "loss": 0.3099, "step": 1072 }, { "epoch": 1.0018674136321195, "grad_norm": 4.712846067213594, "learning_rate": 9.871933459443188e-06, "loss": 0.2613, "step": 1073 }, { "epoch": 1.0028011204481793, "grad_norm": 3.4867528798586225, "learning_rate": 9.871593191271059e-06, "loss": 0.1781, "step": 1074 }, { "epoch": 1.003734827264239, "grad_norm": 1.860464898297681, "learning_rate": 9.871252477538233e-06, "loss": 0.0979, "step": 1075 }, { "epoch": 1.0046685340802988, "grad_norm": 2.929679699083356, "learning_rate": 9.870911318275871e-06, "loss": 0.0941, "step": 1076 }, { "epoch": 1.0056022408963585, "grad_norm": 2.363053030590289, "learning_rate": 9.87056971351518e-06, "loss": 0.0979, "step": 1077 }, { "epoch": 1.0065359477124183, "grad_norm": 1.931507036934167, "learning_rate": 9.870227663287396e-06, "loss": 0.0575, "step": 1078 }, { "epoch": 1.007469654528478, "grad_norm": 3.083690376832284, "learning_rate": 9.869885167623808e-06, "loss": 0.1042, "step": 1079 }, { "epoch": 1.0084033613445378, "grad_norm": 3.107623055018518, "learning_rate": 9.86954222655574e-06, "loss": 0.168, "step": 1080 }, { "epoch": 1.0093370681605975, "grad_norm": 1.6193046840017857, "learning_rate": 9.86919884011456e-06, "loss": 0.0854, "step": 1081 }, { "epoch": 1.0102707749766573, "grad_norm": 6.754502460566785, "learning_rate": 9.86885500833167e-06, "loss": 0.1832, "step": 1082 }, { "epoch": 1.011204481792717, "grad_norm": 2.9544950322072756, "learning_rate": 9.868510731238522e-06, "loss": 0.1653, "step": 1083 }, { "epoch": 1.0121381886087768, "grad_norm": 1.5873989636513692, "learning_rate": 9.868166008866599e-06, "loss": 0.0697, "step": 1084 }, { "epoch": 1.0130718954248366, "grad_norm": 2.716437286289235, "learning_rate": 9.867820841247433e-06, "loss": 0.1519, "step": 1085 }, { "epoch": 1.0140056022408963, "grad_norm": 1.9113389407221175, "learning_rate": 9.867475228412593e-06, "loss": 0.144, "step": 1086 }, { "epoch": 1.014939309056956, "grad_norm": 4.514990993271559, "learning_rate": 9.867129170393688e-06, "loss": 0.1821, "step": 1087 }, { "epoch": 1.0158730158730158, "grad_norm": 3.4816042199213135, "learning_rate": 9.86678266722237e-06, "loss": 0.1338, "step": 1088 }, { "epoch": 1.0168067226890756, "grad_norm": 5.380316783332016, "learning_rate": 9.866435718930329e-06, "loss": 0.2689, "step": 1089 }, { "epoch": 1.0177404295051353, "grad_norm": 1.551718434618846, "learning_rate": 9.8660883255493e-06, "loss": 0.0646, "step": 1090 }, { "epoch": 1.018674136321195, "grad_norm": 1.1104280430760805, "learning_rate": 9.86574048711105e-06, "loss": 0.0321, "step": 1091 }, { "epoch": 1.0196078431372548, "grad_norm": 1.8861327197219677, "learning_rate": 9.865392203647402e-06, "loss": 0.0504, "step": 1092 }, { "epoch": 1.0205415499533146, "grad_norm": 4.595551929295361, "learning_rate": 9.865043475190202e-06, "loss": 0.1602, "step": 1093 }, { "epoch": 1.0214752567693743, "grad_norm": 4.002502026637323, "learning_rate": 9.864694301771348e-06, "loss": 0.152, "step": 1094 }, { "epoch": 1.022408963585434, "grad_norm": 2.99492023935074, "learning_rate": 9.864344683422777e-06, "loss": 0.1464, "step": 1095 }, { "epoch": 1.0233426704014938, "grad_norm": 4.538804171937684, "learning_rate": 9.863994620176464e-06, "loss": 0.1188, "step": 1096 }, { "epoch": 1.0242763772175536, "grad_norm": 2.1330956545965645, "learning_rate": 9.863644112064426e-06, "loss": 0.0844, "step": 1097 }, { "epoch": 1.0252100840336134, "grad_norm": 2.5909174784473517, "learning_rate": 9.86329315911872e-06, "loss": 0.1307, "step": 1098 }, { "epoch": 1.026143790849673, "grad_norm": 1.7766550156676717, "learning_rate": 9.862941761371449e-06, "loss": 0.0898, "step": 1099 }, { "epoch": 1.0270774976657329, "grad_norm": 5.933756662243653, "learning_rate": 9.862589918854747e-06, "loss": 0.2249, "step": 1100 }, { "epoch": 1.0280112044817926, "grad_norm": 2.9283418823896605, "learning_rate": 9.862237631600796e-06, "loss": 0.1954, "step": 1101 }, { "epoch": 1.0289449112978524, "grad_norm": 3.219876323715372, "learning_rate": 9.861884899641815e-06, "loss": 0.1735, "step": 1102 }, { "epoch": 1.0298786181139121, "grad_norm": 2.306707175732204, "learning_rate": 9.86153172301007e-06, "loss": 0.0593, "step": 1103 }, { "epoch": 1.0308123249299719, "grad_norm": 2.2329467881000054, "learning_rate": 9.861178101737857e-06, "loss": 0.0927, "step": 1104 }, { "epoch": 1.0317460317460316, "grad_norm": 2.9377864258824617, "learning_rate": 9.860824035857521e-06, "loss": 0.0938, "step": 1105 }, { "epoch": 1.0326797385620916, "grad_norm": 3.704402296251938, "learning_rate": 9.860469525401446e-06, "loss": 0.1654, "step": 1106 }, { "epoch": 1.0336134453781514, "grad_norm": 1.388867189797726, "learning_rate": 9.860114570402055e-06, "loss": 0.0465, "step": 1107 }, { "epoch": 1.0345471521942111, "grad_norm": 2.6374144619765763, "learning_rate": 9.859759170891812e-06, "loss": 0.1151, "step": 1108 }, { "epoch": 1.0354808590102709, "grad_norm": 4.564919432339159, "learning_rate": 9.859403326903224e-06, "loss": 0.1947, "step": 1109 }, { "epoch": 1.0364145658263306, "grad_norm": 1.2438918221464013, "learning_rate": 9.859047038468835e-06, "loss": 0.0545, "step": 1110 }, { "epoch": 1.0373482726423904, "grad_norm": 3.955812274579812, "learning_rate": 9.858690305621231e-06, "loss": 0.161, "step": 1111 }, { "epoch": 1.0382819794584501, "grad_norm": 5.257492637960816, "learning_rate": 9.858333128393041e-06, "loss": 0.2243, "step": 1112 }, { "epoch": 1.0392156862745099, "grad_norm": 0.8620240799223473, "learning_rate": 9.857975506816932e-06, "loss": 0.031, "step": 1113 }, { "epoch": 1.0401493930905696, "grad_norm": 4.6523833741518175, "learning_rate": 9.857617440925613e-06, "loss": 0.2298, "step": 1114 }, { "epoch": 1.0410830999066294, "grad_norm": 3.5694647994687796, "learning_rate": 9.857258930751832e-06, "loss": 0.1167, "step": 1115 }, { "epoch": 1.0420168067226891, "grad_norm": 2.842855390563923, "learning_rate": 9.856899976328377e-06, "loss": 0.1577, "step": 1116 }, { "epoch": 1.042950513538749, "grad_norm": 3.1015154001328296, "learning_rate": 9.856540577688085e-06, "loss": 0.1609, "step": 1117 }, { "epoch": 1.0438842203548087, "grad_norm": 2.7889450470540575, "learning_rate": 9.85618073486382e-06, "loss": 0.0585, "step": 1118 }, { "epoch": 1.0448179271708684, "grad_norm": 1.7310798344964515, "learning_rate": 9.855820447888496e-06, "loss": 0.0624, "step": 1119 }, { "epoch": 1.0457516339869282, "grad_norm": 2.0260350719611138, "learning_rate": 9.855459716795066e-06, "loss": 0.0713, "step": 1120 }, { "epoch": 1.046685340802988, "grad_norm": 1.3943661270005765, "learning_rate": 9.855098541616522e-06, "loss": 0.0397, "step": 1121 }, { "epoch": 1.0476190476190477, "grad_norm": 3.2814191075367063, "learning_rate": 9.854736922385897e-06, "loss": 0.1931, "step": 1122 }, { "epoch": 1.0485527544351074, "grad_norm": 3.558825285198161, "learning_rate": 9.854374859136266e-06, "loss": 0.279, "step": 1123 }, { "epoch": 1.0494864612511672, "grad_norm": 1.031611021216216, "learning_rate": 9.854012351900743e-06, "loss": 0.0266, "step": 1124 }, { "epoch": 1.050420168067227, "grad_norm": 4.288207959114594, "learning_rate": 9.853649400712484e-06, "loss": 0.1541, "step": 1125 }, { "epoch": 1.0513538748832867, "grad_norm": 2.257904631350803, "learning_rate": 9.853286005604683e-06, "loss": 0.1176, "step": 1126 }, { "epoch": 1.0522875816993464, "grad_norm": 5.095550264471739, "learning_rate": 9.85292216661058e-06, "loss": 0.2867, "step": 1127 }, { "epoch": 1.0532212885154062, "grad_norm": 2.3259990978763847, "learning_rate": 9.852557883763447e-06, "loss": 0.1081, "step": 1128 }, { "epoch": 1.054154995331466, "grad_norm": 5.515697346788587, "learning_rate": 9.852193157096607e-06, "loss": 0.3017, "step": 1129 }, { "epoch": 1.0550887021475257, "grad_norm": 2.0649200568607844, "learning_rate": 9.851827986643415e-06, "loss": 0.1031, "step": 1130 }, { "epoch": 1.0560224089635855, "grad_norm": 1.572252548639418, "learning_rate": 9.85146237243727e-06, "loss": 0.04, "step": 1131 }, { "epoch": 1.0569561157796452, "grad_norm": 2.1392861465738022, "learning_rate": 9.851096314511614e-06, "loss": 0.1093, "step": 1132 }, { "epoch": 1.057889822595705, "grad_norm": 3.014938874477779, "learning_rate": 9.850729812899925e-06, "loss": 0.1482, "step": 1133 }, { "epoch": 1.0588235294117647, "grad_norm": 2.623028539131156, "learning_rate": 9.850362867635722e-06, "loss": 0.0317, "step": 1134 }, { "epoch": 1.0597572362278245, "grad_norm": 3.694679373917601, "learning_rate": 9.849995478752568e-06, "loss": 0.1429, "step": 1135 }, { "epoch": 1.0606909430438842, "grad_norm": 3.28273026661535, "learning_rate": 9.849627646284065e-06, "loss": 0.0837, "step": 1136 }, { "epoch": 1.061624649859944, "grad_norm": 2.3684494065730792, "learning_rate": 9.849259370263855e-06, "loss": 0.109, "step": 1137 }, { "epoch": 1.0625583566760037, "grad_norm": 4.052071440594696, "learning_rate": 9.848890650725622e-06, "loss": 0.2048, "step": 1138 }, { "epoch": 1.0634920634920635, "grad_norm": 4.352013258974517, "learning_rate": 9.848521487703089e-06, "loss": 0.1276, "step": 1139 }, { "epoch": 1.0644257703081232, "grad_norm": 3.730579695268801, "learning_rate": 9.848151881230016e-06, "loss": 0.1472, "step": 1140 }, { "epoch": 1.065359477124183, "grad_norm": 1.9818633497529412, "learning_rate": 9.847781831340212e-06, "loss": 0.0385, "step": 1141 }, { "epoch": 1.0662931839402428, "grad_norm": 2.355315200538246, "learning_rate": 9.847411338067522e-06, "loss": 0.125, "step": 1142 }, { "epoch": 1.0672268907563025, "grad_norm": 6.601558726947356, "learning_rate": 9.847040401445831e-06, "loss": 0.2169, "step": 1143 }, { "epoch": 1.0681605975723623, "grad_norm": 2.14022137544987, "learning_rate": 9.846669021509065e-06, "loss": 0.1036, "step": 1144 }, { "epoch": 1.069094304388422, "grad_norm": 2.0984907202296688, "learning_rate": 9.84629719829119e-06, "loss": 0.0406, "step": 1145 }, { "epoch": 1.0700280112044818, "grad_norm": 3.0805257309497187, "learning_rate": 9.845924931826215e-06, "loss": 0.1706, "step": 1146 }, { "epoch": 1.0709617180205415, "grad_norm": 3.160332757686331, "learning_rate": 9.845552222148185e-06, "loss": 0.1925, "step": 1147 }, { "epoch": 1.0718954248366013, "grad_norm": 3.3648375317416614, "learning_rate": 9.84517906929119e-06, "loss": 0.1866, "step": 1148 }, { "epoch": 1.072829131652661, "grad_norm": 2.748511190394799, "learning_rate": 9.844805473289363e-06, "loss": 0.1552, "step": 1149 }, { "epoch": 1.0737628384687208, "grad_norm": 0.7718046262444455, "learning_rate": 9.844431434176866e-06, "loss": 0.0229, "step": 1150 }, { "epoch": 1.0746965452847805, "grad_norm": 3.0907580481247146, "learning_rate": 9.844056951987915e-06, "loss": 0.1711, "step": 1151 }, { "epoch": 1.0756302521008403, "grad_norm": 2.774903920307838, "learning_rate": 9.843682026756755e-06, "loss": 0.1893, "step": 1152 }, { "epoch": 1.0765639589169, "grad_norm": 3.855850084218911, "learning_rate": 9.843306658517683e-06, "loss": 0.2226, "step": 1153 }, { "epoch": 1.0774976657329598, "grad_norm": 3.2848984240152452, "learning_rate": 9.842930847305028e-06, "loss": 0.2195, "step": 1154 }, { "epoch": 1.0784313725490196, "grad_norm": 5.837427476808251, "learning_rate": 9.84255459315316e-06, "loss": 0.3094, "step": 1155 }, { "epoch": 1.0793650793650793, "grad_norm": 1.489650318360177, "learning_rate": 9.842177896096495e-06, "loss": 0.0564, "step": 1156 }, { "epoch": 1.080298786181139, "grad_norm": 2.515096947736772, "learning_rate": 9.841800756169481e-06, "loss": 0.0777, "step": 1157 }, { "epoch": 1.0812324929971988, "grad_norm": 1.2472649516278684, "learning_rate": 9.84142317340662e-06, "loss": 0.0302, "step": 1158 }, { "epoch": 1.0821661998132586, "grad_norm": 2.7406302861778538, "learning_rate": 9.841045147842438e-06, "loss": 0.1082, "step": 1159 }, { "epoch": 1.0830999066293183, "grad_norm": 2.8982181258004256, "learning_rate": 9.840666679511512e-06, "loss": 0.119, "step": 1160 }, { "epoch": 1.084033613445378, "grad_norm": 3.4066109266973514, "learning_rate": 9.84028776844846e-06, "loss": 0.2189, "step": 1161 }, { "epoch": 1.0849673202614378, "grad_norm": 2.6823560812705503, "learning_rate": 9.839908414687935e-06, "loss": 0.1275, "step": 1162 }, { "epoch": 1.0859010270774976, "grad_norm": 2.9049407048959903, "learning_rate": 9.839528618264633e-06, "loss": 0.1133, "step": 1163 }, { "epoch": 1.0868347338935573, "grad_norm": 0.7879714987706057, "learning_rate": 9.83914837921329e-06, "loss": 0.0294, "step": 1164 }, { "epoch": 1.087768440709617, "grad_norm": 2.493521843422881, "learning_rate": 9.838767697568686e-06, "loss": 0.1187, "step": 1165 }, { "epoch": 1.0887021475256768, "grad_norm": 2.400026882868003, "learning_rate": 9.838386573365635e-06, "loss": 0.1419, "step": 1166 }, { "epoch": 1.0896358543417366, "grad_norm": 1.0456464591127483, "learning_rate": 9.838005006638997e-06, "loss": 0.0404, "step": 1167 }, { "epoch": 1.0905695611577964, "grad_norm": 3.2293405223119453, "learning_rate": 9.83762299742367e-06, "loss": 0.0778, "step": 1168 }, { "epoch": 1.091503267973856, "grad_norm": 4.218614903412301, "learning_rate": 9.837240545754594e-06, "loss": 0.1769, "step": 1169 }, { "epoch": 1.092436974789916, "grad_norm": 1.605373345418663, "learning_rate": 9.836857651666747e-06, "loss": 0.0452, "step": 1170 }, { "epoch": 1.0933706816059758, "grad_norm": 2.6750261565702096, "learning_rate": 9.836474315195148e-06, "loss": 0.1138, "step": 1171 }, { "epoch": 1.0943043884220356, "grad_norm": 3.389452300668123, "learning_rate": 9.836090536374859e-06, "loss": 0.1282, "step": 1172 }, { "epoch": 1.0952380952380953, "grad_norm": 2.251267027354964, "learning_rate": 9.83570631524098e-06, "loss": 0.1086, "step": 1173 }, { "epoch": 1.096171802054155, "grad_norm": 2.2896291232623582, "learning_rate": 9.835321651828653e-06, "loss": 0.0875, "step": 1174 }, { "epoch": 1.0971055088702149, "grad_norm": 5.4176789685463955, "learning_rate": 9.834936546173061e-06, "loss": 0.2112, "step": 1175 }, { "epoch": 1.0980392156862746, "grad_norm": 3.748569835236193, "learning_rate": 9.834550998309422e-06, "loss": 0.2475, "step": 1176 }, { "epoch": 1.0989729225023344, "grad_norm": 2.7544654195739913, "learning_rate": 9.834165008273002e-06, "loss": 0.1241, "step": 1177 }, { "epoch": 1.0999066293183941, "grad_norm": 2.207706013813258, "learning_rate": 9.833778576099104e-06, "loss": 0.0345, "step": 1178 }, { "epoch": 1.1008403361344539, "grad_norm": 3.104465684122127, "learning_rate": 9.833391701823069e-06, "loss": 0.1165, "step": 1179 }, { "epoch": 1.1017740429505136, "grad_norm": 5.899854994364641, "learning_rate": 9.833004385480283e-06, "loss": 0.1094, "step": 1180 }, { "epoch": 1.1027077497665734, "grad_norm": 3.9003636846153174, "learning_rate": 9.832616627106169e-06, "loss": 0.1352, "step": 1181 }, { "epoch": 1.1036414565826331, "grad_norm": 2.2580604534561433, "learning_rate": 9.832228426736194e-06, "loss": 0.143, "step": 1182 }, { "epoch": 1.1045751633986929, "grad_norm": 1.883075266863754, "learning_rate": 9.83183978440586e-06, "loss": 0.0557, "step": 1183 }, { "epoch": 1.1055088702147526, "grad_norm": 3.3362997407724273, "learning_rate": 9.831450700150716e-06, "loss": 0.1271, "step": 1184 }, { "epoch": 1.1064425770308124, "grad_norm": 2.2848168586990742, "learning_rate": 9.831061174006344e-06, "loss": 0.0491, "step": 1185 }, { "epoch": 1.1073762838468721, "grad_norm": 1.0774256345300346, "learning_rate": 9.830671206008375e-06, "loss": 0.0184, "step": 1186 }, { "epoch": 1.108309990662932, "grad_norm": 3.2012514980181055, "learning_rate": 9.830280796192473e-06, "loss": 0.0634, "step": 1187 }, { "epoch": 1.1092436974789917, "grad_norm": 7.795264581914308, "learning_rate": 9.829889944594345e-06, "loss": 0.1416, "step": 1188 }, { "epoch": 1.1101774042950514, "grad_norm": 2.3114389010068273, "learning_rate": 9.82949865124974e-06, "loss": 0.06, "step": 1189 }, { "epoch": 1.1111111111111112, "grad_norm": 3.6057445771644447, "learning_rate": 9.829106916194446e-06, "loss": 0.1705, "step": 1190 }, { "epoch": 1.112044817927171, "grad_norm": 1.0446399791137821, "learning_rate": 9.82871473946429e-06, "loss": 0.0257, "step": 1191 }, { "epoch": 1.1129785247432307, "grad_norm": 1.2308072615054075, "learning_rate": 9.828322121095144e-06, "loss": 0.0496, "step": 1192 }, { "epoch": 1.1139122315592904, "grad_norm": 1.589708986308654, "learning_rate": 9.827929061122914e-06, "loss": 0.0242, "step": 1193 }, { "epoch": 1.1148459383753502, "grad_norm": 4.391099607477734, "learning_rate": 9.827535559583553e-06, "loss": 0.1781, "step": 1194 }, { "epoch": 1.11577964519141, "grad_norm": 1.9299312968974247, "learning_rate": 9.827141616513046e-06, "loss": 0.0747, "step": 1195 }, { "epoch": 1.1167133520074697, "grad_norm": 3.2218692692794537, "learning_rate": 9.826747231947427e-06, "loss": 0.1773, "step": 1196 }, { "epoch": 1.1176470588235294, "grad_norm": 2.215738286056874, "learning_rate": 9.826352405922766e-06, "loss": 0.0413, "step": 1197 }, { "epoch": 1.1185807656395892, "grad_norm": 2.600376587298697, "learning_rate": 9.825957138475175e-06, "loss": 0.1488, "step": 1198 }, { "epoch": 1.119514472455649, "grad_norm": 2.7820827147582037, "learning_rate": 9.825561429640804e-06, "loss": 0.1233, "step": 1199 }, { "epoch": 1.1204481792717087, "grad_norm": 8.190955242881527, "learning_rate": 9.825165279455847e-06, "loss": 0.2583, "step": 1200 }, { "epoch": 1.1213818860877685, "grad_norm": 2.1557704603655243, "learning_rate": 9.824768687956535e-06, "loss": 0.1082, "step": 1201 }, { "epoch": 1.1223155929038282, "grad_norm": 2.4649816354369216, "learning_rate": 9.82437165517914e-06, "loss": 0.1036, "step": 1202 }, { "epoch": 1.123249299719888, "grad_norm": 3.657534302437441, "learning_rate": 9.823974181159976e-06, "loss": 0.2293, "step": 1203 }, { "epoch": 1.1241830065359477, "grad_norm": 2.07722284111398, "learning_rate": 9.823576265935395e-06, "loss": 0.0958, "step": 1204 }, { "epoch": 1.1251167133520075, "grad_norm": 1.227460798147538, "learning_rate": 9.823177909541795e-06, "loss": 0.0362, "step": 1205 }, { "epoch": 1.1260504201680672, "grad_norm": 2.6920345387068974, "learning_rate": 9.822779112015604e-06, "loss": 0.0957, "step": 1206 }, { "epoch": 1.126984126984127, "grad_norm": 1.5973842324628238, "learning_rate": 9.822379873393299e-06, "loss": 0.0519, "step": 1207 }, { "epoch": 1.1279178338001867, "grad_norm": 7.828655167877014, "learning_rate": 9.821980193711397e-06, "loss": 0.1781, "step": 1208 }, { "epoch": 1.1288515406162465, "grad_norm": 1.9892742949174838, "learning_rate": 9.82158007300645e-06, "loss": 0.0618, "step": 1209 }, { "epoch": 1.1297852474323062, "grad_norm": 1.8270469506403098, "learning_rate": 9.821179511315055e-06, "loss": 0.0723, "step": 1210 }, { "epoch": 1.130718954248366, "grad_norm": 2.686117862260423, "learning_rate": 9.820778508673846e-06, "loss": 0.1003, "step": 1211 }, { "epoch": 1.1316526610644257, "grad_norm": 2.756738822293755, "learning_rate": 9.820377065119502e-06, "loss": 0.142, "step": 1212 }, { "epoch": 1.1325863678804855, "grad_norm": 4.850818041338777, "learning_rate": 9.819975180688739e-06, "loss": 0.1271, "step": 1213 }, { "epoch": 1.1335200746965453, "grad_norm": 1.4465613961103532, "learning_rate": 9.819572855418311e-06, "loss": 0.0666, "step": 1214 }, { "epoch": 1.134453781512605, "grad_norm": 1.8647017703186537, "learning_rate": 9.819170089345015e-06, "loss": 0.0682, "step": 1215 }, { "epoch": 1.1353874883286648, "grad_norm": 3.1748828405301404, "learning_rate": 9.818766882505693e-06, "loss": 0.1242, "step": 1216 }, { "epoch": 1.1363211951447245, "grad_norm": 6.978946133258739, "learning_rate": 9.818363234937218e-06, "loss": 0.3061, "step": 1217 }, { "epoch": 1.1372549019607843, "grad_norm": 2.8096928456339922, "learning_rate": 9.81795914667651e-06, "loss": 0.0733, "step": 1218 }, { "epoch": 1.138188608776844, "grad_norm": 2.327251017133324, "learning_rate": 9.817554617760529e-06, "loss": 0.0943, "step": 1219 }, { "epoch": 1.1391223155929038, "grad_norm": 1.0199253385142104, "learning_rate": 9.81714964822627e-06, "loss": 0.0354, "step": 1220 }, { "epoch": 1.1400560224089635, "grad_norm": 7.515392437295214, "learning_rate": 9.816744238110774e-06, "loss": 0.3193, "step": 1221 }, { "epoch": 1.1409897292250233, "grad_norm": 3.1794425868143414, "learning_rate": 9.816338387451119e-06, "loss": 0.1837, "step": 1222 }, { "epoch": 1.141923436041083, "grad_norm": 1.662136080904597, "learning_rate": 9.815932096284425e-06, "loss": 0.0434, "step": 1223 }, { "epoch": 1.1428571428571428, "grad_norm": 2.4990498662343192, "learning_rate": 9.815525364647853e-06, "loss": 0.0647, "step": 1224 }, { "epoch": 1.1437908496732025, "grad_norm": 3.2891483577321443, "learning_rate": 9.815118192578602e-06, "loss": 0.1665, "step": 1225 }, { "epoch": 1.1447245564892623, "grad_norm": 6.0559488390679, "learning_rate": 9.814710580113912e-06, "loss": 0.2488, "step": 1226 }, { "epoch": 1.145658263305322, "grad_norm": 2.07588996089809, "learning_rate": 9.814302527291065e-06, "loss": 0.0627, "step": 1227 }, { "epoch": 1.1465919701213818, "grad_norm": 2.463514283561062, "learning_rate": 9.81389403414738e-06, "loss": 0.1217, "step": 1228 }, { "epoch": 1.1475256769374416, "grad_norm": 8.745930765322024, "learning_rate": 9.81348510072022e-06, "loss": 0.1963, "step": 1229 }, { "epoch": 1.1484593837535013, "grad_norm": 6.553443265301286, "learning_rate": 9.813075727046986e-06, "loss": 0.1138, "step": 1230 }, { "epoch": 1.149393090569561, "grad_norm": 3.700221929215869, "learning_rate": 9.812665913165118e-06, "loss": 0.2423, "step": 1231 }, { "epoch": 1.1503267973856208, "grad_norm": 4.717183382912566, "learning_rate": 9.812255659112099e-06, "loss": 0.2005, "step": 1232 }, { "epoch": 1.1512605042016806, "grad_norm": 1.791371820427958, "learning_rate": 9.811844964925455e-06, "loss": 0.0644, "step": 1233 }, { "epoch": 1.1521942110177403, "grad_norm": 1.6610260845704095, "learning_rate": 9.811433830642742e-06, "loss": 0.0785, "step": 1234 }, { "epoch": 1.1531279178338, "grad_norm": 5.805138464712445, "learning_rate": 9.811022256301568e-06, "loss": 0.2002, "step": 1235 }, { "epoch": 1.1540616246498598, "grad_norm": 3.415620145878457, "learning_rate": 9.810610241939572e-06, "loss": 0.0891, "step": 1236 }, { "epoch": 1.1549953314659196, "grad_norm": 2.0096423670808186, "learning_rate": 9.81019778759444e-06, "loss": 0.1051, "step": 1237 }, { "epoch": 1.1559290382819793, "grad_norm": 1.1101618920085214, "learning_rate": 9.809784893303896e-06, "loss": 0.0349, "step": 1238 }, { "epoch": 1.156862745098039, "grad_norm": 2.7648127699895606, "learning_rate": 9.8093715591057e-06, "loss": 0.0542, "step": 1239 }, { "epoch": 1.1577964519140989, "grad_norm": 3.6688883809638293, "learning_rate": 9.808957785037658e-06, "loss": 0.1705, "step": 1240 }, { "epoch": 1.1587301587301586, "grad_norm": 2.1026254488100324, "learning_rate": 9.808543571137617e-06, "loss": 0.1103, "step": 1241 }, { "epoch": 1.1596638655462184, "grad_norm": 2.0743869380601443, "learning_rate": 9.808128917443456e-06, "loss": 0.0495, "step": 1242 }, { "epoch": 1.1605975723622783, "grad_norm": 6.312197720078128, "learning_rate": 9.807713823993104e-06, "loss": 0.3138, "step": 1243 }, { "epoch": 1.161531279178338, "grad_norm": 3.30570153675493, "learning_rate": 9.807298290824524e-06, "loss": 0.159, "step": 1244 }, { "epoch": 1.1624649859943978, "grad_norm": 5.051473294607086, "learning_rate": 9.806882317975721e-06, "loss": 0.1007, "step": 1245 }, { "epoch": 1.1633986928104576, "grad_norm": 2.4968803304590637, "learning_rate": 9.80646590548474e-06, "loss": 0.0403, "step": 1246 }, { "epoch": 1.1643323996265174, "grad_norm": 1.3958576960841238, "learning_rate": 9.80604905338967e-06, "loss": 0.0158, "step": 1247 }, { "epoch": 1.165266106442577, "grad_norm": 1.4092934649625737, "learning_rate": 9.805631761728633e-06, "loss": 0.0209, "step": 1248 }, { "epoch": 1.1661998132586369, "grad_norm": 6.677779379831306, "learning_rate": 9.805214030539794e-06, "loss": 0.1374, "step": 1249 }, { "epoch": 1.1671335200746966, "grad_norm": 3.4054562033745115, "learning_rate": 9.804795859861362e-06, "loss": 0.1506, "step": 1250 }, { "epoch": 1.1680672268907564, "grad_norm": 4.094148055220302, "learning_rate": 9.804377249731582e-06, "loss": 0.041, "step": 1251 }, { "epoch": 1.1690009337068161, "grad_norm": 4.126829327310927, "learning_rate": 9.80395820018874e-06, "loss": 0.2331, "step": 1252 }, { "epoch": 1.1699346405228759, "grad_norm": 3.123882423884055, "learning_rate": 9.803538711271166e-06, "loss": 0.0696, "step": 1253 }, { "epoch": 1.1708683473389356, "grad_norm": 1.5815239095548277, "learning_rate": 9.803118783017221e-06, "loss": 0.0717, "step": 1254 }, { "epoch": 1.1718020541549954, "grad_norm": 4.038691496014968, "learning_rate": 9.802698415465317e-06, "loss": 0.1217, "step": 1255 }, { "epoch": 1.1727357609710551, "grad_norm": 2.4193401530860057, "learning_rate": 9.802277608653898e-06, "loss": 0.0681, "step": 1256 }, { "epoch": 1.173669467787115, "grad_norm": 1.7168414132078555, "learning_rate": 9.801856362621455e-06, "loss": 0.043, "step": 1257 }, { "epoch": 1.1746031746031746, "grad_norm": 2.884506135485723, "learning_rate": 9.801434677406512e-06, "loss": 0.1221, "step": 1258 }, { "epoch": 1.1755368814192344, "grad_norm": 5.32450220340766, "learning_rate": 9.80101255304764e-06, "loss": 0.2133, "step": 1259 }, { "epoch": 1.1764705882352942, "grad_norm": 3.477739973616463, "learning_rate": 9.800589989583445e-06, "loss": 0.0998, "step": 1260 }, { "epoch": 1.177404295051354, "grad_norm": 1.7927229052480003, "learning_rate": 9.800166987052572e-06, "loss": 0.1042, "step": 1261 }, { "epoch": 1.1783380018674137, "grad_norm": 3.9906633794177564, "learning_rate": 9.799743545493715e-06, "loss": 0.2079, "step": 1262 }, { "epoch": 1.1792717086834734, "grad_norm": 7.053065951599343, "learning_rate": 9.7993196649456e-06, "loss": 0.2215, "step": 1263 }, { "epoch": 1.1802054154995332, "grad_norm": 3.7607116695155267, "learning_rate": 9.798895345446995e-06, "loss": 0.2414, "step": 1264 }, { "epoch": 1.181139122315593, "grad_norm": 1.4536490285870234, "learning_rate": 9.798470587036708e-06, "loss": 0.0385, "step": 1265 }, { "epoch": 1.1820728291316527, "grad_norm": 2.3748216045712285, "learning_rate": 9.79804538975359e-06, "loss": 0.0901, "step": 1266 }, { "epoch": 1.1830065359477124, "grad_norm": 2.550389436922358, "learning_rate": 9.797619753636529e-06, "loss": 0.0561, "step": 1267 }, { "epoch": 1.1839402427637722, "grad_norm": 2.839255913148862, "learning_rate": 9.797193678724454e-06, "loss": 0.1443, "step": 1268 }, { "epoch": 1.184873949579832, "grad_norm": 2.8483414670753846, "learning_rate": 9.796767165056334e-06, "loss": 0.0821, "step": 1269 }, { "epoch": 1.1858076563958917, "grad_norm": 3.8449204711702847, "learning_rate": 9.796340212671178e-06, "loss": 0.2151, "step": 1270 }, { "epoch": 1.1867413632119514, "grad_norm": 1.4632316258092832, "learning_rate": 9.795912821608035e-06, "loss": 0.0646, "step": 1271 }, { "epoch": 1.1876750700280112, "grad_norm": 2.2486965100367375, "learning_rate": 9.795484991905997e-06, "loss": 0.0627, "step": 1272 }, { "epoch": 1.188608776844071, "grad_norm": 0.9057512438457352, "learning_rate": 9.795056723604192e-06, "loss": 0.016, "step": 1273 }, { "epoch": 1.1895424836601307, "grad_norm": 3.6199817329499133, "learning_rate": 9.79462801674179e-06, "loss": 0.1981, "step": 1274 }, { "epoch": 1.1904761904761905, "grad_norm": 4.178461083062617, "learning_rate": 9.794198871358002e-06, "loss": 0.188, "step": 1275 }, { "epoch": 1.1914098972922502, "grad_norm": 2.313436323213474, "learning_rate": 9.793769287492077e-06, "loss": 0.0499, "step": 1276 }, { "epoch": 1.19234360410831, "grad_norm": 2.3617147604470743, "learning_rate": 9.793339265183303e-06, "loss": 0.1231, "step": 1277 }, { "epoch": 1.1932773109243697, "grad_norm": 6.711252873686599, "learning_rate": 9.792908804471015e-06, "loss": 0.2282, "step": 1278 }, { "epoch": 1.1942110177404295, "grad_norm": 4.4423824286804745, "learning_rate": 9.792477905394581e-06, "loss": 0.1395, "step": 1279 }, { "epoch": 1.1951447245564892, "grad_norm": 3.2647378604221275, "learning_rate": 9.792046567993411e-06, "loss": 0.1656, "step": 1280 }, { "epoch": 1.196078431372549, "grad_norm": 1.328410733020697, "learning_rate": 9.791614792306954e-06, "loss": 0.0371, "step": 1281 }, { "epoch": 1.1970121381886087, "grad_norm": 8.110761096848684, "learning_rate": 9.791182578374704e-06, "loss": 0.1996, "step": 1282 }, { "epoch": 1.1979458450046685, "grad_norm": 4.847333020374388, "learning_rate": 9.790749926236191e-06, "loss": 0.2391, "step": 1283 }, { "epoch": 1.1988795518207283, "grad_norm": 1.830063490601548, "learning_rate": 9.790316835930985e-06, "loss": 0.0546, "step": 1284 }, { "epoch": 1.199813258636788, "grad_norm": 2.9278264029488335, "learning_rate": 9.789883307498697e-06, "loss": 0.1296, "step": 1285 }, { "epoch": 1.2007469654528478, "grad_norm": 2.1988606743485306, "learning_rate": 9.789449340978978e-06, "loss": 0.0578, "step": 1286 }, { "epoch": 1.2016806722689075, "grad_norm": 4.929862836485705, "learning_rate": 9.789014936411519e-06, "loss": 0.1749, "step": 1287 }, { "epoch": 1.2026143790849673, "grad_norm": 5.612061216675074, "learning_rate": 9.788580093836048e-06, "loss": 0.1157, "step": 1288 }, { "epoch": 1.203548085901027, "grad_norm": 3.452610145946566, "learning_rate": 9.788144813292343e-06, "loss": 0.1559, "step": 1289 }, { "epoch": 1.2044817927170868, "grad_norm": 4.442426336708184, "learning_rate": 9.78770909482021e-06, "loss": 0.1923, "step": 1290 }, { "epoch": 1.2054154995331465, "grad_norm": 5.105852884070466, "learning_rate": 9.7872729384595e-06, "loss": 0.1941, "step": 1291 }, { "epoch": 1.2063492063492063, "grad_norm": 4.741582628589483, "learning_rate": 9.786836344250108e-06, "loss": 0.2111, "step": 1292 }, { "epoch": 1.207282913165266, "grad_norm": 4.1258438857756365, "learning_rate": 9.786399312231961e-06, "loss": 0.1779, "step": 1293 }, { "epoch": 1.2082166199813258, "grad_norm": 3.3154801878027973, "learning_rate": 9.785961842445033e-06, "loss": 0.1171, "step": 1294 }, { "epoch": 1.2091503267973855, "grad_norm": 2.6042924372848835, "learning_rate": 9.785523934929337e-06, "loss": 0.1736, "step": 1295 }, { "epoch": 1.2100840336134453, "grad_norm": 3.321230086462192, "learning_rate": 9.78508558972492e-06, "loss": 0.0613, "step": 1296 }, { "epoch": 1.211017740429505, "grad_norm": 3.3769416879705063, "learning_rate": 9.784646806871876e-06, "loss": 0.2225, "step": 1297 }, { "epoch": 1.2119514472455648, "grad_norm": 2.740333174459273, "learning_rate": 9.784207586410338e-06, "loss": 0.1734, "step": 1298 }, { "epoch": 1.2128851540616246, "grad_norm": 5.989563618932578, "learning_rate": 9.783767928380474e-06, "loss": 0.1244, "step": 1299 }, { "epoch": 1.2138188608776843, "grad_norm": 1.6898485790673952, "learning_rate": 9.7833278328225e-06, "loss": 0.0512, "step": 1300 }, { "epoch": 1.214752567693744, "grad_norm": 3.137814711604599, "learning_rate": 9.782887299776663e-06, "loss": 0.1449, "step": 1301 }, { "epoch": 1.215686274509804, "grad_norm": 2.1742533884506976, "learning_rate": 9.782446329283258e-06, "loss": 0.0991, "step": 1302 }, { "epoch": 1.2166199813258638, "grad_norm": 2.1374648860217933, "learning_rate": 9.782004921382612e-06, "loss": 0.1535, "step": 1303 }, { "epoch": 1.2175536881419236, "grad_norm": 1.836483935369599, "learning_rate": 9.781563076115104e-06, "loss": 0.1, "step": 1304 }, { "epoch": 1.2184873949579833, "grad_norm": 3.225973772840015, "learning_rate": 9.781120793521138e-06, "loss": 0.2226, "step": 1305 }, { "epoch": 1.219421101774043, "grad_norm": 2.1730658387483586, "learning_rate": 9.78067807364117e-06, "loss": 0.051, "step": 1306 }, { "epoch": 1.2203548085901028, "grad_norm": 1.2473857005452205, "learning_rate": 9.780234916515691e-06, "loss": 0.0622, "step": 1307 }, { "epoch": 1.2212885154061626, "grad_norm": 7.005493449931224, "learning_rate": 9.779791322185234e-06, "loss": 0.3122, "step": 1308 }, { "epoch": 1.2222222222222223, "grad_norm": 3.7497625160658505, "learning_rate": 9.779347290690366e-06, "loss": 0.1688, "step": 1309 }, { "epoch": 1.223155929038282, "grad_norm": 4.8752546964791454, "learning_rate": 9.778902822071704e-06, "loss": 0.313, "step": 1310 }, { "epoch": 1.2240896358543418, "grad_norm": 2.322359275429854, "learning_rate": 9.778457916369894e-06, "loss": 0.1657, "step": 1311 }, { "epoch": 1.2250233426704016, "grad_norm": 1.1174854030151755, "learning_rate": 9.778012573625631e-06, "loss": 0.0531, "step": 1312 }, { "epoch": 1.2259570494864613, "grad_norm": 2.3553186616188904, "learning_rate": 9.777566793879647e-06, "loss": 0.1126, "step": 1313 }, { "epoch": 1.226890756302521, "grad_norm": 5.4637046919051935, "learning_rate": 9.777120577172712e-06, "loss": 0.2446, "step": 1314 }, { "epoch": 1.2278244631185808, "grad_norm": 4.498249715374323, "learning_rate": 9.776673923545637e-06, "loss": 0.1351, "step": 1315 }, { "epoch": 1.2287581699346406, "grad_norm": 3.379043234068137, "learning_rate": 9.776226833039274e-06, "loss": 0.1869, "step": 1316 }, { "epoch": 1.2296918767507004, "grad_norm": 4.653258061644359, "learning_rate": 9.775779305694516e-06, "loss": 0.2389, "step": 1317 }, { "epoch": 1.23062558356676, "grad_norm": 2.6711993182715217, "learning_rate": 9.775331341552293e-06, "loss": 0.1064, "step": 1318 }, { "epoch": 1.2315592903828199, "grad_norm": 1.7801851958502286, "learning_rate": 9.774882940653574e-06, "loss": 0.1067, "step": 1319 }, { "epoch": 1.2324929971988796, "grad_norm": 3.9108698676849616, "learning_rate": 9.774434103039371e-06, "loss": 0.1467, "step": 1320 }, { "epoch": 1.2334267040149394, "grad_norm": 2.424364304849191, "learning_rate": 9.77398482875074e-06, "loss": 0.114, "step": 1321 }, { "epoch": 1.2343604108309991, "grad_norm": 2.512963771599349, "learning_rate": 9.773535117828766e-06, "loss": 0.1765, "step": 1322 }, { "epoch": 1.2352941176470589, "grad_norm": 2.694688396058572, "learning_rate": 9.773084970314584e-06, "loss": 0.1653, "step": 1323 }, { "epoch": 1.2362278244631186, "grad_norm": 2.090107367497019, "learning_rate": 9.772634386249362e-06, "loss": 0.0575, "step": 1324 }, { "epoch": 1.2371615312791784, "grad_norm": 0.7980414537634805, "learning_rate": 9.772183365674311e-06, "loss": 0.0189, "step": 1325 }, { "epoch": 1.2380952380952381, "grad_norm": 2.59698871340065, "learning_rate": 9.771731908630686e-06, "loss": 0.0999, "step": 1326 }, { "epoch": 1.239028944911298, "grad_norm": 2.1561552210032033, "learning_rate": 9.771280015159772e-06, "loss": 0.0548, "step": 1327 }, { "epoch": 1.2399626517273576, "grad_norm": 2.5296532470579463, "learning_rate": 9.770827685302904e-06, "loss": 0.1831, "step": 1328 }, { "epoch": 1.2408963585434174, "grad_norm": 5.195638487634428, "learning_rate": 9.770374919101449e-06, "loss": 0.2102, "step": 1329 }, { "epoch": 1.2418300653594772, "grad_norm": 3.0998308133083925, "learning_rate": 9.76992171659682e-06, "loss": 0.0352, "step": 1330 }, { "epoch": 1.242763772175537, "grad_norm": 0.9048640245653599, "learning_rate": 9.769468077830467e-06, "loss": 0.0324, "step": 1331 }, { "epoch": 1.2436974789915967, "grad_norm": 6.787050902430783, "learning_rate": 9.76901400284388e-06, "loss": 0.0777, "step": 1332 }, { "epoch": 1.2446311858076564, "grad_norm": 2.275892853173079, "learning_rate": 9.768559491678588e-06, "loss": 0.1447, "step": 1333 }, { "epoch": 1.2455648926237162, "grad_norm": 2.529847100035499, "learning_rate": 9.768104544376162e-06, "loss": 0.1096, "step": 1334 }, { "epoch": 1.246498599439776, "grad_norm": 0.6847773329540632, "learning_rate": 9.767649160978214e-06, "loss": 0.0251, "step": 1335 }, { "epoch": 1.2474323062558357, "grad_norm": 1.112017667174244, "learning_rate": 9.76719334152639e-06, "loss": 0.0267, "step": 1336 }, { "epoch": 1.2483660130718954, "grad_norm": 7.222183059149078, "learning_rate": 9.766737086062382e-06, "loss": 0.1611, "step": 1337 }, { "epoch": 1.2492997198879552, "grad_norm": 2.9209359268235597, "learning_rate": 9.766280394627919e-06, "loss": 0.2258, "step": 1338 }, { "epoch": 1.250233426704015, "grad_norm": 2.638774360364688, "learning_rate": 9.765823267264768e-06, "loss": 0.0951, "step": 1339 }, { "epoch": 1.2511671335200747, "grad_norm": 3.016094816683215, "learning_rate": 9.765365704014745e-06, "loss": 0.1703, "step": 1340 }, { "epoch": 1.2521008403361344, "grad_norm": 7.404952561908548, "learning_rate": 9.764907704919693e-06, "loss": 0.2974, "step": 1341 }, { "epoch": 1.2530345471521942, "grad_norm": 1.5685562672390017, "learning_rate": 9.764449270021503e-06, "loss": 0.0653, "step": 1342 }, { "epoch": 1.253968253968254, "grad_norm": 4.753425392028086, "learning_rate": 9.763990399362104e-06, "loss": 0.3007, "step": 1343 }, { "epoch": 1.2549019607843137, "grad_norm": 3.036244861746218, "learning_rate": 9.763531092983464e-06, "loss": 0.0943, "step": 1344 }, { "epoch": 1.2558356676003735, "grad_norm": 2.6391316364905024, "learning_rate": 9.763071350927594e-06, "loss": 0.0558, "step": 1345 }, { "epoch": 1.2567693744164332, "grad_norm": 1.096687058261006, "learning_rate": 9.762611173236537e-06, "loss": 0.0343, "step": 1346 }, { "epoch": 1.257703081232493, "grad_norm": 1.2764961459392767, "learning_rate": 9.76215055995239e-06, "loss": 0.0456, "step": 1347 }, { "epoch": 1.2586367880485527, "grad_norm": 3.59409461267877, "learning_rate": 9.761689511117273e-06, "loss": 0.2604, "step": 1348 }, { "epoch": 1.2595704948646125, "grad_norm": 3.1662302227553645, "learning_rate": 9.76122802677336e-06, "loss": 0.0936, "step": 1349 }, { "epoch": 1.2605042016806722, "grad_norm": 2.110067036165786, "learning_rate": 9.760766106962853e-06, "loss": 0.1201, "step": 1350 }, { "epoch": 1.261437908496732, "grad_norm": 2.161031453419568, "learning_rate": 9.760303751728004e-06, "loss": 0.0775, "step": 1351 }, { "epoch": 1.2623716153127917, "grad_norm": 5.495563733191344, "learning_rate": 9.759840961111098e-06, "loss": 0.3467, "step": 1352 }, { "epoch": 1.2633053221288515, "grad_norm": 3.834373203096184, "learning_rate": 9.759377735154464e-06, "loss": 0.0604, "step": 1353 }, { "epoch": 1.2642390289449112, "grad_norm": 3.668789831176171, "learning_rate": 9.75891407390047e-06, "loss": 0.2334, "step": 1354 }, { "epoch": 1.265172735760971, "grad_norm": 2.871498263749925, "learning_rate": 9.75844997739152e-06, "loss": 0.126, "step": 1355 }, { "epoch": 1.2661064425770308, "grad_norm": 3.253460422680439, "learning_rate": 9.757985445670062e-06, "loss": 0.1612, "step": 1356 }, { "epoch": 1.2670401493930905, "grad_norm": 1.545927511242103, "learning_rate": 9.757520478778581e-06, "loss": 0.0349, "step": 1357 }, { "epoch": 1.2679738562091503, "grad_norm": 3.063830084764931, "learning_rate": 9.757055076759608e-06, "loss": 0.1046, "step": 1358 }, { "epoch": 1.26890756302521, "grad_norm": 3.639132808445678, "learning_rate": 9.756589239655705e-06, "loss": 0.1932, "step": 1359 }, { "epoch": 1.2698412698412698, "grad_norm": 3.472685718460533, "learning_rate": 9.756122967509479e-06, "loss": 0.1629, "step": 1360 }, { "epoch": 1.2707749766573295, "grad_norm": 1.5077468177536324, "learning_rate": 9.755656260363576e-06, "loss": 0.0359, "step": 1361 }, { "epoch": 1.2717086834733893, "grad_norm": 3.8339813900299813, "learning_rate": 9.755189118260681e-06, "loss": 0.1676, "step": 1362 }, { "epoch": 1.272642390289449, "grad_norm": 6.116597179779356, "learning_rate": 9.754721541243519e-06, "loss": 0.3798, "step": 1363 }, { "epoch": 1.2735760971055088, "grad_norm": 1.886615249305293, "learning_rate": 9.754253529354855e-06, "loss": 0.042, "step": 1364 }, { "epoch": 1.2745098039215685, "grad_norm": 2.604024255608858, "learning_rate": 9.753785082637497e-06, "loss": 0.1676, "step": 1365 }, { "epoch": 1.2754435107376283, "grad_norm": 1.7651248153772492, "learning_rate": 9.753316201134284e-06, "loss": 0.0674, "step": 1366 }, { "epoch": 1.276377217553688, "grad_norm": 3.4171000487561463, "learning_rate": 9.752846884888105e-06, "loss": 0.1133, "step": 1367 }, { "epoch": 1.2773109243697478, "grad_norm": 3.0812527169092534, "learning_rate": 9.752377133941881e-06, "loss": 0.1326, "step": 1368 }, { "epoch": 1.2782446311858076, "grad_norm": 2.5930044005886628, "learning_rate": 9.75190694833858e-06, "loss": 0.106, "step": 1369 }, { "epoch": 1.2791783380018673, "grad_norm": 2.7206813127174203, "learning_rate": 9.7514363281212e-06, "loss": 0.1195, "step": 1370 }, { "epoch": 1.280112044817927, "grad_norm": 4.644824368336307, "learning_rate": 9.75096527333279e-06, "loss": 0.1623, "step": 1371 }, { "epoch": 1.2810457516339868, "grad_norm": 4.448305354810876, "learning_rate": 9.750493784016429e-06, "loss": 0.1918, "step": 1372 }, { "epoch": 1.2819794584500466, "grad_norm": 2.807665962584065, "learning_rate": 9.750021860215242e-06, "loss": 0.1389, "step": 1373 }, { "epoch": 1.2829131652661063, "grad_norm": 6.634100373540942, "learning_rate": 9.74954950197239e-06, "loss": 0.0731, "step": 1374 }, { "epoch": 1.283846872082166, "grad_norm": 4.980629025071476, "learning_rate": 9.749076709331078e-06, "loss": 0.2166, "step": 1375 }, { "epoch": 1.2847805788982258, "grad_norm": 1.9710427606332124, "learning_rate": 9.748603482334545e-06, "loss": 0.0381, "step": 1376 }, { "epoch": 1.2857142857142856, "grad_norm": 3.9797588449804486, "learning_rate": 9.748129821026073e-06, "loss": 0.2137, "step": 1377 }, { "epoch": 1.2866479925303453, "grad_norm": 4.917162099054147, "learning_rate": 9.747655725448988e-06, "loss": 0.2384, "step": 1378 }, { "epoch": 1.287581699346405, "grad_norm": 2.8230141991918596, "learning_rate": 9.747181195646647e-06, "loss": 0.1884, "step": 1379 }, { "epoch": 1.2885154061624648, "grad_norm": 2.0980567134827632, "learning_rate": 9.746706231662451e-06, "loss": 0.1137, "step": 1380 }, { "epoch": 1.2894491129785248, "grad_norm": 3.7978909619617487, "learning_rate": 9.74623083353984e-06, "loss": 0.2568, "step": 1381 }, { "epoch": 1.2903828197945846, "grad_norm": 3.3611937536695584, "learning_rate": 9.745755001322299e-06, "loss": 0.0673, "step": 1382 }, { "epoch": 1.2913165266106443, "grad_norm": 1.781235508242761, "learning_rate": 9.745278735053345e-06, "loss": 0.0393, "step": 1383 }, { "epoch": 1.292250233426704, "grad_norm": 3.7494227320339304, "learning_rate": 9.744802034776536e-06, "loss": 0.1799, "step": 1384 }, { "epoch": 1.2931839402427638, "grad_norm": 1.8364483087645385, "learning_rate": 9.744324900535475e-06, "loss": 0.1463, "step": 1385 }, { "epoch": 1.2941176470588236, "grad_norm": 5.808754422864392, "learning_rate": 9.743847332373797e-06, "loss": 0.0506, "step": 1386 }, { "epoch": 1.2950513538748833, "grad_norm": 2.4245565364307176, "learning_rate": 9.743369330335186e-06, "loss": 0.07, "step": 1387 }, { "epoch": 1.295985060690943, "grad_norm": 3.955354996831996, "learning_rate": 9.742890894463357e-06, "loss": 0.1574, "step": 1388 }, { "epoch": 1.2969187675070029, "grad_norm": 2.177699205731622, "learning_rate": 9.74241202480207e-06, "loss": 0.0434, "step": 1389 }, { "epoch": 1.2978524743230626, "grad_norm": 1.7863454607749332, "learning_rate": 9.741932721395122e-06, "loss": 0.0704, "step": 1390 }, { "epoch": 1.2987861811391224, "grad_norm": 2.3350259754258964, "learning_rate": 9.741452984286347e-06, "loss": 0.0623, "step": 1391 }, { "epoch": 1.2997198879551821, "grad_norm": 4.858176868183641, "learning_rate": 9.74097281351963e-06, "loss": 0.2219, "step": 1392 }, { "epoch": 1.3006535947712419, "grad_norm": 2.23271650368672, "learning_rate": 9.740492209138883e-06, "loss": 0.1355, "step": 1393 }, { "epoch": 1.3015873015873016, "grad_norm": 0.8686604158389086, "learning_rate": 9.740011171188062e-06, "loss": 0.0218, "step": 1394 }, { "epoch": 1.3025210084033614, "grad_norm": 2.5248637956678155, "learning_rate": 9.739529699711166e-06, "loss": 0.0604, "step": 1395 }, { "epoch": 1.3034547152194211, "grad_norm": 4.459981118276416, "learning_rate": 9.739047794752229e-06, "loss": 0.2004, "step": 1396 }, { "epoch": 1.3043884220354809, "grad_norm": 2.2732729101299305, "learning_rate": 9.738565456355324e-06, "loss": 0.0699, "step": 1397 }, { "epoch": 1.3053221288515406, "grad_norm": 2.2593926743511306, "learning_rate": 9.738082684564572e-06, "loss": 0.0424, "step": 1398 }, { "epoch": 1.3062558356676004, "grad_norm": 2.2429981820715996, "learning_rate": 9.737599479424123e-06, "loss": 0.0979, "step": 1399 }, { "epoch": 1.3071895424836601, "grad_norm": 2.657652918852371, "learning_rate": 9.737115840978172e-06, "loss": 0.1394, "step": 1400 }, { "epoch": 1.30812324929972, "grad_norm": 3.781597102414449, "learning_rate": 9.736631769270958e-06, "loss": 0.2408, "step": 1401 }, { "epoch": 1.3090569561157797, "grad_norm": 1.9052795685175694, "learning_rate": 9.736147264346748e-06, "loss": 0.0943, "step": 1402 }, { "epoch": 1.3099906629318394, "grad_norm": 2.598831657468855, "learning_rate": 9.73566232624986e-06, "loss": 0.1274, "step": 1403 }, { "epoch": 1.3109243697478992, "grad_norm": 3.1952730497753072, "learning_rate": 9.735176955024643e-06, "loss": 0.2225, "step": 1404 }, { "epoch": 1.311858076563959, "grad_norm": 3.063972995901817, "learning_rate": 9.734691150715493e-06, "loss": 0.2206, "step": 1405 }, { "epoch": 1.3127917833800187, "grad_norm": 1.832211413129397, "learning_rate": 9.73420491336684e-06, "loss": 0.1316, "step": 1406 }, { "epoch": 1.3137254901960784, "grad_norm": 1.352704439341591, "learning_rate": 9.733718243023159e-06, "loss": 0.0465, "step": 1407 }, { "epoch": 1.3146591970121382, "grad_norm": 3.3169194615087774, "learning_rate": 9.733231139728956e-06, "loss": 0.1393, "step": 1408 }, { "epoch": 1.315592903828198, "grad_norm": 1.7692237876317733, "learning_rate": 9.732743603528785e-06, "loss": 0.1294, "step": 1409 }, { "epoch": 1.3165266106442577, "grad_norm": 6.405040044468377, "learning_rate": 9.732255634467238e-06, "loss": 0.0629, "step": 1410 }, { "epoch": 1.3174603174603174, "grad_norm": 2.4568346385484374, "learning_rate": 9.73176723258894e-06, "loss": 0.1233, "step": 1411 }, { "epoch": 1.3183940242763772, "grad_norm": 1.8495128640970053, "learning_rate": 9.731278397938568e-06, "loss": 0.0826, "step": 1412 }, { "epoch": 1.319327731092437, "grad_norm": 1.4808415102142485, "learning_rate": 9.730789130560825e-06, "loss": 0.0359, "step": 1413 }, { "epoch": 1.3202614379084967, "grad_norm": 4.139823030943098, "learning_rate": 9.730299430500463e-06, "loss": 0.0867, "step": 1414 }, { "epoch": 1.3211951447245565, "grad_norm": 3.7343804828587692, "learning_rate": 9.72980929780227e-06, "loss": 0.075, "step": 1415 }, { "epoch": 1.3221288515406162, "grad_norm": 3.272993255297751, "learning_rate": 9.729318732511075e-06, "loss": 0.1071, "step": 1416 }, { "epoch": 1.323062558356676, "grad_norm": 1.9916398124686978, "learning_rate": 9.728827734671741e-06, "loss": 0.1226, "step": 1417 }, { "epoch": 1.3239962651727357, "grad_norm": 5.942696692096217, "learning_rate": 9.728336304329182e-06, "loss": 0.1617, "step": 1418 }, { "epoch": 1.3249299719887955, "grad_norm": 0.6647132724839026, "learning_rate": 9.72784444152834e-06, "loss": 0.0251, "step": 1419 }, { "epoch": 1.3258636788048552, "grad_norm": 1.8978125774469192, "learning_rate": 9.727352146314203e-06, "loss": 0.0878, "step": 1420 }, { "epoch": 1.326797385620915, "grad_norm": 1.1899164381342615, "learning_rate": 9.726859418731796e-06, "loss": 0.0965, "step": 1421 }, { "epoch": 1.3277310924369747, "grad_norm": 1.9114009422984566, "learning_rate": 9.726366258826183e-06, "loss": 0.1227, "step": 1422 }, { "epoch": 1.3286647992530345, "grad_norm": 1.9512534987259746, "learning_rate": 9.725872666642473e-06, "loss": 0.0973, "step": 1423 }, { "epoch": 1.3295985060690942, "grad_norm": 0.7700273616744044, "learning_rate": 9.725378642225807e-06, "loss": 0.0273, "step": 1424 }, { "epoch": 1.330532212885154, "grad_norm": 1.9253996326639629, "learning_rate": 9.72488418562137e-06, "loss": 0.1026, "step": 1425 }, { "epoch": 1.3314659197012138, "grad_norm": 2.791637872612841, "learning_rate": 9.724389296874383e-06, "loss": 0.0771, "step": 1426 }, { "epoch": 1.3323996265172735, "grad_norm": 3.51135684778979, "learning_rate": 9.723893976030115e-06, "loss": 0.2655, "step": 1427 }, { "epoch": 1.3333333333333333, "grad_norm": 4.452603125286827, "learning_rate": 9.723398223133863e-06, "loss": 0.268, "step": 1428 }, { "epoch": 1.334267040149393, "grad_norm": 2.2839630182785653, "learning_rate": 9.722902038230973e-06, "loss": 0.116, "step": 1429 }, { "epoch": 1.3352007469654528, "grad_norm": 2.3820388093078484, "learning_rate": 9.722405421366822e-06, "loss": 0.1153, "step": 1430 }, { "epoch": 1.3361344537815127, "grad_norm": 3.7114671093608314, "learning_rate": 9.721908372586835e-06, "loss": 0.2246, "step": 1431 }, { "epoch": 1.3370681605975725, "grad_norm": 2.25972569531072, "learning_rate": 9.72141089193647e-06, "loss": 0.1262, "step": 1432 }, { "epoch": 1.3380018674136323, "grad_norm": 4.126323202784483, "learning_rate": 9.720912979461229e-06, "loss": 0.0559, "step": 1433 }, { "epoch": 1.338935574229692, "grad_norm": 1.5286825299656075, "learning_rate": 9.72041463520665e-06, "loss": 0.0951, "step": 1434 }, { "epoch": 1.3398692810457518, "grad_norm": 0.8146477436267575, "learning_rate": 9.719915859218313e-06, "loss": 0.0339, "step": 1435 }, { "epoch": 1.3408029878618115, "grad_norm": 2.0857757289991743, "learning_rate": 9.719416651541839e-06, "loss": 0.106, "step": 1436 }, { "epoch": 1.3417366946778713, "grad_norm": 3.127308771405091, "learning_rate": 9.718917012222881e-06, "loss": 0.1536, "step": 1437 }, { "epoch": 1.342670401493931, "grad_norm": 3.5623006008521525, "learning_rate": 9.718416941307139e-06, "loss": 0.1615, "step": 1438 }, { "epoch": 1.3436041083099908, "grad_norm": 2.240567479103467, "learning_rate": 9.717916438840351e-06, "loss": 0.0733, "step": 1439 }, { "epoch": 1.3445378151260505, "grad_norm": 2.983721419787584, "learning_rate": 9.717415504868292e-06, "loss": 0.2118, "step": 1440 }, { "epoch": 1.3454715219421103, "grad_norm": 3.5808544371157454, "learning_rate": 9.716914139436777e-06, "loss": 0.1219, "step": 1441 }, { "epoch": 1.34640522875817, "grad_norm": 3.495316907833415, "learning_rate": 9.716412342591664e-06, "loss": 0.1527, "step": 1442 }, { "epoch": 1.3473389355742298, "grad_norm": 2.687040123327191, "learning_rate": 9.715910114378845e-06, "loss": 0.1392, "step": 1443 }, { "epoch": 1.3482726423902895, "grad_norm": 1.9485144221019088, "learning_rate": 9.71540745484426e-06, "loss": 0.1226, "step": 1444 }, { "epoch": 1.3492063492063493, "grad_norm": 3.1469095078955456, "learning_rate": 9.714904364033873e-06, "loss": 0.1162, "step": 1445 }, { "epoch": 1.350140056022409, "grad_norm": 3.3303690009213343, "learning_rate": 9.714400841993706e-06, "loss": 0.2077, "step": 1446 }, { "epoch": 1.3510737628384688, "grad_norm": 1.1781335534740904, "learning_rate": 9.713896888769807e-06, "loss": 0.0427, "step": 1447 }, { "epoch": 1.3520074696545286, "grad_norm": 3.1234830627281736, "learning_rate": 9.71339250440827e-06, "loss": 0.2088, "step": 1448 }, { "epoch": 1.3529411764705883, "grad_norm": 8.531633955151687, "learning_rate": 9.712887688955226e-06, "loss": 0.0423, "step": 1449 }, { "epoch": 1.353874883286648, "grad_norm": 1.6867416038402556, "learning_rate": 9.712382442456845e-06, "loss": 0.0894, "step": 1450 }, { "epoch": 1.3548085901027078, "grad_norm": 3.0324830844743564, "learning_rate": 9.711876764959338e-06, "loss": 0.2368, "step": 1451 }, { "epoch": 1.3557422969187676, "grad_norm": 1.3334117280106494, "learning_rate": 9.711370656508957e-06, "loss": 0.0347, "step": 1452 }, { "epoch": 1.3566760037348273, "grad_norm": 1.1250532373044593, "learning_rate": 9.710864117151985e-06, "loss": 0.0223, "step": 1453 }, { "epoch": 1.357609710550887, "grad_norm": 1.0463847324176578, "learning_rate": 9.710357146934758e-06, "loss": 0.0486, "step": 1454 }, { "epoch": 1.3585434173669468, "grad_norm": 3.1367758129158494, "learning_rate": 9.70984974590364e-06, "loss": 0.197, "step": 1455 }, { "epoch": 1.3594771241830066, "grad_norm": 1.3057619079682594, "learning_rate": 9.709341914105037e-06, "loss": 0.0569, "step": 1456 }, { "epoch": 1.3604108309990663, "grad_norm": 1.4665911908135767, "learning_rate": 9.708833651585402e-06, "loss": 0.0825, "step": 1457 }, { "epoch": 1.361344537815126, "grad_norm": 3.064705737675689, "learning_rate": 9.708324958391212e-06, "loss": 0.1754, "step": 1458 }, { "epoch": 1.3622782446311859, "grad_norm": 4.343013577124443, "learning_rate": 9.707815834569001e-06, "loss": 0.1273, "step": 1459 }, { "epoch": 1.3632119514472456, "grad_norm": 2.2126899825141004, "learning_rate": 9.70730628016533e-06, "loss": 0.1565, "step": 1460 }, { "epoch": 1.3641456582633054, "grad_norm": 2.781690968113164, "learning_rate": 9.706796295226802e-06, "loss": 0.1653, "step": 1461 }, { "epoch": 1.3650793650793651, "grad_norm": 2.8394777929977537, "learning_rate": 9.706285879800064e-06, "loss": 0.094, "step": 1462 }, { "epoch": 1.3660130718954249, "grad_norm": 2.283755843811091, "learning_rate": 9.705775033931799e-06, "loss": 0.1077, "step": 1463 }, { "epoch": 1.3669467787114846, "grad_norm": 4.418163161831906, "learning_rate": 9.705263757668726e-06, "loss": 0.232, "step": 1464 }, { "epoch": 1.3678804855275444, "grad_norm": 2.848003011032414, "learning_rate": 9.70475205105761e-06, "loss": 0.0852, "step": 1465 }, { "epoch": 1.3688141923436041, "grad_norm": 1.193298250347857, "learning_rate": 9.70423991414525e-06, "loss": 0.0363, "step": 1466 }, { "epoch": 1.3697478991596639, "grad_norm": 1.582100309017699, "learning_rate": 9.70372734697849e-06, "loss": 0.0504, "step": 1467 }, { "epoch": 1.3706816059757236, "grad_norm": 5.625604054046142, "learning_rate": 9.703214349604206e-06, "loss": 0.3471, "step": 1468 }, { "epoch": 1.3716153127917834, "grad_norm": 1.8565282619103038, "learning_rate": 9.702700922069319e-06, "loss": 0.0483, "step": 1469 }, { "epoch": 1.3725490196078431, "grad_norm": 2.4205811509819215, "learning_rate": 9.702187064420789e-06, "loss": 0.1549, "step": 1470 }, { "epoch": 1.373482726423903, "grad_norm": 1.5143300684359826, "learning_rate": 9.70167277670561e-06, "loss": 0.0571, "step": 1471 }, { "epoch": 1.3744164332399627, "grad_norm": 3.7672006405186282, "learning_rate": 9.701158058970823e-06, "loss": 0.1068, "step": 1472 }, { "epoch": 1.3753501400560224, "grad_norm": 5.145261655205022, "learning_rate": 9.700642911263503e-06, "loss": 0.2383, "step": 1473 }, { "epoch": 1.3762838468720822, "grad_norm": 2.521844523067596, "learning_rate": 9.700127333630766e-06, "loss": 0.0726, "step": 1474 }, { "epoch": 1.377217553688142, "grad_norm": 0.9165574335187399, "learning_rate": 9.69961132611977e-06, "loss": 0.0217, "step": 1475 }, { "epoch": 1.3781512605042017, "grad_norm": 1.52239611687748, "learning_rate": 9.699094888777703e-06, "loss": 0.0341, "step": 1476 }, { "epoch": 1.3790849673202614, "grad_norm": 2.220555440724562, "learning_rate": 9.698578021651804e-06, "loss": 0.1445, "step": 1477 }, { "epoch": 1.3800186741363212, "grad_norm": 1.2461111294284535, "learning_rate": 9.698060724789347e-06, "loss": 0.0718, "step": 1478 }, { "epoch": 1.380952380952381, "grad_norm": 2.5329330193481487, "learning_rate": 9.69754299823764e-06, "loss": 0.1052, "step": 1479 }, { "epoch": 1.3818860877684407, "grad_norm": 5.9842617624812995, "learning_rate": 9.697024842044038e-06, "loss": 0.2181, "step": 1480 }, { "epoch": 1.3828197945845004, "grad_norm": 1.8690502410893037, "learning_rate": 9.696506256255931e-06, "loss": 0.0791, "step": 1481 }, { "epoch": 1.3837535014005602, "grad_norm": 4.612106139309425, "learning_rate": 9.69598724092075e-06, "loss": 0.3103, "step": 1482 }, { "epoch": 1.38468720821662, "grad_norm": 2.4096118063438343, "learning_rate": 9.695467796085964e-06, "loss": 0.0398, "step": 1483 }, { "epoch": 1.3856209150326797, "grad_norm": 2.218990179745229, "learning_rate": 9.694947921799083e-06, "loss": 0.0832, "step": 1484 }, { "epoch": 1.3865546218487395, "grad_norm": 4.790558974332546, "learning_rate": 9.694427618107655e-06, "loss": 0.1445, "step": 1485 }, { "epoch": 1.3874883286647992, "grad_norm": 2.658135336201529, "learning_rate": 9.693906885059265e-06, "loss": 0.15, "step": 1486 }, { "epoch": 1.388422035480859, "grad_norm": 2.9801113497190475, "learning_rate": 9.693385722701543e-06, "loss": 0.1587, "step": 1487 }, { "epoch": 1.3893557422969187, "grad_norm": 3.8596662869418763, "learning_rate": 9.692864131082153e-06, "loss": 0.1681, "step": 1488 }, { "epoch": 1.3902894491129785, "grad_norm": 1.0921951248844126, "learning_rate": 9.692342110248802e-06, "loss": 0.0296, "step": 1489 }, { "epoch": 1.3912231559290382, "grad_norm": 1.515376977047756, "learning_rate": 9.691819660249234e-06, "loss": 0.0277, "step": 1490 }, { "epoch": 1.392156862745098, "grad_norm": 1.7665007961828847, "learning_rate": 9.691296781131232e-06, "loss": 0.052, "step": 1491 }, { "epoch": 1.3930905695611577, "grad_norm": 3.1701132919429993, "learning_rate": 9.690773472942618e-06, "loss": 0.148, "step": 1492 }, { "epoch": 1.3940242763772175, "grad_norm": 3.294406662550643, "learning_rate": 9.690249735731256e-06, "loss": 0.0429, "step": 1493 }, { "epoch": 1.3949579831932772, "grad_norm": 3.3530624557879216, "learning_rate": 9.689725569545048e-06, "loss": 0.1914, "step": 1494 }, { "epoch": 1.395891690009337, "grad_norm": 4.08209858337951, "learning_rate": 9.689200974431934e-06, "loss": 0.1731, "step": 1495 }, { "epoch": 1.3968253968253967, "grad_norm": 3.5847254875827703, "learning_rate": 9.688675950439894e-06, "loss": 0.0938, "step": 1496 }, { "epoch": 1.3977591036414565, "grad_norm": 3.2085553506178233, "learning_rate": 9.688150497616947e-06, "loss": 0.2087, "step": 1497 }, { "epoch": 1.3986928104575163, "grad_norm": 4.667476725823043, "learning_rate": 9.687624616011151e-06, "loss": 0.1074, "step": 1498 }, { "epoch": 1.399626517273576, "grad_norm": 4.257129414180453, "learning_rate": 9.687098305670606e-06, "loss": 0.2422, "step": 1499 }, { "epoch": 1.4005602240896358, "grad_norm": 4.580498406978629, "learning_rate": 9.686571566643446e-06, "loss": 0.0921, "step": 1500 }, { "epoch": 1.4014939309056955, "grad_norm": 7.345156512765325, "learning_rate": 9.686044398977847e-06, "loss": 0.2608, "step": 1501 }, { "epoch": 1.4024276377217553, "grad_norm": 2.0365514765891595, "learning_rate": 9.685516802722026e-06, "loss": 0.0589, "step": 1502 }, { "epoch": 1.403361344537815, "grad_norm": 2.789493033732231, "learning_rate": 9.684988777924237e-06, "loss": 0.1238, "step": 1503 }, { "epoch": 1.4042950513538748, "grad_norm": 2.93263672574988, "learning_rate": 9.684460324632774e-06, "loss": 0.0615, "step": 1504 }, { "epoch": 1.4052287581699345, "grad_norm": 1.5353893560680554, "learning_rate": 9.683931442895968e-06, "loss": 0.0694, "step": 1505 }, { "epoch": 1.4061624649859943, "grad_norm": 3.1393054142402965, "learning_rate": 9.683402132762194e-06, "loss": 0.1293, "step": 1506 }, { "epoch": 1.407096171802054, "grad_norm": 2.3084921808370344, "learning_rate": 9.682872394279863e-06, "loss": 0.1061, "step": 1507 }, { "epoch": 1.4080298786181138, "grad_norm": 11.259806735638712, "learning_rate": 9.68234222749742e-06, "loss": 0.1379, "step": 1508 }, { "epoch": 1.4089635854341735, "grad_norm": 2.464205336769419, "learning_rate": 9.681811632463362e-06, "loss": 0.0948, "step": 1509 }, { "epoch": 1.4098972922502333, "grad_norm": 1.8114555131982693, "learning_rate": 9.681280609226213e-06, "loss": 0.0705, "step": 1510 }, { "epoch": 1.410830999066293, "grad_norm": 4.584622096572452, "learning_rate": 9.680749157834543e-06, "loss": 0.0679, "step": 1511 }, { "epoch": 1.4117647058823528, "grad_norm": 1.5703660445312815, "learning_rate": 9.680217278336958e-06, "loss": 0.0811, "step": 1512 }, { "epoch": 1.4126984126984126, "grad_norm": 2.5402896124839778, "learning_rate": 9.679684970782105e-06, "loss": 0.0851, "step": 1513 }, { "epoch": 1.4136321195144723, "grad_norm": 2.4409415760954647, "learning_rate": 9.67915223521867e-06, "loss": 0.1052, "step": 1514 }, { "epoch": 1.4145658263305323, "grad_norm": 2.395994519813661, "learning_rate": 9.678619071695374e-06, "loss": 0.0541, "step": 1515 }, { "epoch": 1.415499533146592, "grad_norm": 2.7042983753185976, "learning_rate": 9.678085480260985e-06, "loss": 0.163, "step": 1516 }, { "epoch": 1.4164332399626518, "grad_norm": 10.192283340096692, "learning_rate": 9.677551460964303e-06, "loss": 0.3576, "step": 1517 }, { "epoch": 1.4173669467787116, "grad_norm": 3.8664984373180507, "learning_rate": 9.677017013854171e-06, "loss": 0.2272, "step": 1518 }, { "epoch": 1.4183006535947713, "grad_norm": 1.4893427256990068, "learning_rate": 9.67648213897947e-06, "loss": 0.0457, "step": 1519 }, { "epoch": 1.419234360410831, "grad_norm": 2.1809293261723943, "learning_rate": 9.67594683638912e-06, "loss": 0.0685, "step": 1520 }, { "epoch": 1.4201680672268908, "grad_norm": 2.141901672624546, "learning_rate": 9.675411106132082e-06, "loss": 0.0775, "step": 1521 }, { "epoch": 1.4211017740429506, "grad_norm": 1.9438462618854142, "learning_rate": 9.674874948257352e-06, "loss": 0.0424, "step": 1522 }, { "epoch": 1.4220354808590103, "grad_norm": 1.4580953876113953, "learning_rate": 9.674338362813966e-06, "loss": 0.0651, "step": 1523 }, { "epoch": 1.42296918767507, "grad_norm": 6.05528876177192, "learning_rate": 9.673801349851006e-06, "loss": 0.1926, "step": 1524 }, { "epoch": 1.4239028944911298, "grad_norm": 8.414620384131856, "learning_rate": 9.673263909417583e-06, "loss": 0.3415, "step": 1525 }, { "epoch": 1.4248366013071896, "grad_norm": 2.142064864558311, "learning_rate": 9.672726041562854e-06, "loss": 0.0658, "step": 1526 }, { "epoch": 1.4257703081232493, "grad_norm": 5.622926913855413, "learning_rate": 9.672187746336012e-06, "loss": 0.1737, "step": 1527 }, { "epoch": 1.426704014939309, "grad_norm": 4.260044394813395, "learning_rate": 9.671649023786291e-06, "loss": 0.0711, "step": 1528 }, { "epoch": 1.4276377217553688, "grad_norm": 1.3736616481668207, "learning_rate": 9.671109873962963e-06, "loss": 0.0415, "step": 1529 }, { "epoch": 1.4285714285714286, "grad_norm": 2.199396012285479, "learning_rate": 9.670570296915338e-06, "loss": 0.103, "step": 1530 }, { "epoch": 1.4295051353874884, "grad_norm": 3.685420916157017, "learning_rate": 9.670030292692768e-06, "loss": 0.0511, "step": 1531 }, { "epoch": 1.430438842203548, "grad_norm": 10.343550901472844, "learning_rate": 9.66948986134464e-06, "loss": 0.2531, "step": 1532 }, { "epoch": 1.4313725490196079, "grad_norm": 4.5681624138596835, "learning_rate": 9.668949002920386e-06, "loss": 0.1209, "step": 1533 }, { "epoch": 1.4323062558356676, "grad_norm": 7.935263577456579, "learning_rate": 9.668407717469468e-06, "loss": 0.2187, "step": 1534 }, { "epoch": 1.4332399626517274, "grad_norm": 3.989463747081464, "learning_rate": 9.667866005041398e-06, "loss": 0.1683, "step": 1535 }, { "epoch": 1.4341736694677871, "grad_norm": 3.1693372984394648, "learning_rate": 9.667323865685718e-06, "loss": 0.1273, "step": 1536 }, { "epoch": 1.4351073762838469, "grad_norm": 5.919515803663811, "learning_rate": 9.666781299452014e-06, "loss": 0.3098, "step": 1537 }, { "epoch": 1.4360410830999066, "grad_norm": 4.798320725756062, "learning_rate": 9.66623830638991e-06, "loss": 0.3496, "step": 1538 }, { "epoch": 1.4369747899159664, "grad_norm": 1.8845124037728858, "learning_rate": 9.665694886549067e-06, "loss": 0.0554, "step": 1539 }, { "epoch": 1.4379084967320261, "grad_norm": 1.8026580721810064, "learning_rate": 9.665151039979189e-06, "loss": 0.042, "step": 1540 }, { "epoch": 1.438842203548086, "grad_norm": 4.204788584787051, "learning_rate": 9.664606766730014e-06, "loss": 0.1803, "step": 1541 }, { "epoch": 1.4397759103641457, "grad_norm": 2.619149106236627, "learning_rate": 9.664062066851325e-06, "loss": 0.2092, "step": 1542 }, { "epoch": 1.4407096171802054, "grad_norm": 1.4786758334476515, "learning_rate": 9.66351694039294e-06, "loss": 0.0397, "step": 1543 }, { "epoch": 1.4416433239962652, "grad_norm": 3.1193446757896512, "learning_rate": 9.662971387404712e-06, "loss": 0.1742, "step": 1544 }, { "epoch": 1.442577030812325, "grad_norm": 2.343544757370146, "learning_rate": 9.662425407936544e-06, "loss": 0.1086, "step": 1545 }, { "epoch": 1.4435107376283847, "grad_norm": 4.126875269712366, "learning_rate": 9.66187900203837e-06, "loss": 0.2541, "step": 1546 }, { "epoch": 1.4444444444444444, "grad_norm": 2.30872680939944, "learning_rate": 9.661332169760164e-06, "loss": 0.03, "step": 1547 }, { "epoch": 1.4453781512605042, "grad_norm": 4.002334852531082, "learning_rate": 9.66078491115194e-06, "loss": 0.272, "step": 1548 }, { "epoch": 1.446311858076564, "grad_norm": 1.2596255929266678, "learning_rate": 9.660237226263751e-06, "loss": 0.0721, "step": 1549 }, { "epoch": 1.4472455648926237, "grad_norm": 3.2569305836584275, "learning_rate": 9.65968911514569e-06, "loss": 0.1466, "step": 1550 }, { "epoch": 1.4481792717086834, "grad_norm": 1.6369093070040244, "learning_rate": 9.659140577847886e-06, "loss": 0.0672, "step": 1551 }, { "epoch": 1.4491129785247432, "grad_norm": 3.5356490242711014, "learning_rate": 9.658591614420507e-06, "loss": 0.2098, "step": 1552 }, { "epoch": 1.450046685340803, "grad_norm": 2.457192822784734, "learning_rate": 9.658042224913766e-06, "loss": 0.1036, "step": 1553 }, { "epoch": 1.4509803921568627, "grad_norm": 2.220704946616647, "learning_rate": 9.657492409377908e-06, "loss": 0.0634, "step": 1554 }, { "epoch": 1.4519140989729225, "grad_norm": 1.815988970309413, "learning_rate": 9.656942167863221e-06, "loss": 0.1075, "step": 1555 }, { "epoch": 1.4528478057889822, "grad_norm": 3.128415123772387, "learning_rate": 9.65639150042003e-06, "loss": 0.1181, "step": 1556 }, { "epoch": 1.453781512605042, "grad_norm": 1.066986406367883, "learning_rate": 9.6558404070987e-06, "loss": 0.0427, "step": 1557 }, { "epoch": 1.4547152194211017, "grad_norm": 1.0316326220091163, "learning_rate": 9.655288887949634e-06, "loss": 0.0324, "step": 1558 }, { "epoch": 1.4556489262371615, "grad_norm": 4.148286912150839, "learning_rate": 9.654736943023276e-06, "loss": 0.0653, "step": 1559 }, { "epoch": 1.4565826330532212, "grad_norm": 1.504334457371611, "learning_rate": 9.654184572370104e-06, "loss": 0.0614, "step": 1560 }, { "epoch": 1.457516339869281, "grad_norm": 3.8456676037388373, "learning_rate": 9.653631776040641e-06, "loss": 0.1799, "step": 1561 }, { "epoch": 1.4584500466853407, "grad_norm": 4.577707138084448, "learning_rate": 9.653078554085446e-06, "loss": 0.3047, "step": 1562 }, { "epoch": 1.4593837535014005, "grad_norm": 2.453362333812573, "learning_rate": 9.652524906555116e-06, "loss": 0.0783, "step": 1563 }, { "epoch": 1.4603174603174602, "grad_norm": 2.199583198569151, "learning_rate": 9.651970833500291e-06, "loss": 0.0867, "step": 1564 }, { "epoch": 1.4612511671335202, "grad_norm": 3.0940528868008945, "learning_rate": 9.651416334971643e-06, "loss": 0.2058, "step": 1565 }, { "epoch": 1.46218487394958, "grad_norm": 2.6654217412612202, "learning_rate": 9.650861411019891e-06, "loss": 0.0544, "step": 1566 }, { "epoch": 1.4631185807656397, "grad_norm": 5.7552845122615865, "learning_rate": 9.650306061695787e-06, "loss": 0.2201, "step": 1567 }, { "epoch": 1.4640522875816995, "grad_norm": 2.938212368662597, "learning_rate": 9.649750287050123e-06, "loss": 0.0898, "step": 1568 }, { "epoch": 1.4649859943977592, "grad_norm": 1.5091194170843867, "learning_rate": 9.649194087133731e-06, "loss": 0.0448, "step": 1569 }, { "epoch": 1.465919701213819, "grad_norm": 4.432769205996084, "learning_rate": 9.648637461997483e-06, "loss": 0.1284, "step": 1570 }, { "epoch": 1.4668534080298787, "grad_norm": 2.749376559138755, "learning_rate": 9.648080411692287e-06, "loss": 0.1306, "step": 1571 }, { "epoch": 1.4677871148459385, "grad_norm": 1.156744913052019, "learning_rate": 9.647522936269092e-06, "loss": 0.0355, "step": 1572 }, { "epoch": 1.4687208216619982, "grad_norm": 0.9819750723265197, "learning_rate": 9.646965035778885e-06, "loss": 0.0231, "step": 1573 }, { "epoch": 1.469654528478058, "grad_norm": 2.8377322903101794, "learning_rate": 9.646406710272694e-06, "loss": 0.1632, "step": 1574 }, { "epoch": 1.4705882352941178, "grad_norm": 1.6543138529663641, "learning_rate": 9.645847959801581e-06, "loss": 0.0708, "step": 1575 }, { "epoch": 1.4715219421101775, "grad_norm": 2.8913919403798425, "learning_rate": 9.645288784416653e-06, "loss": 0.1284, "step": 1576 }, { "epoch": 1.4724556489262373, "grad_norm": 1.8179362951910294, "learning_rate": 9.64472918416905e-06, "loss": 0.0215, "step": 1577 }, { "epoch": 1.473389355742297, "grad_norm": 2.8325799275610137, "learning_rate": 9.644169159109952e-06, "loss": 0.0316, "step": 1578 }, { "epoch": 1.4743230625583568, "grad_norm": 1.863275260451088, "learning_rate": 9.643608709290586e-06, "loss": 0.0827, "step": 1579 }, { "epoch": 1.4752567693744165, "grad_norm": 2.6263689475582175, "learning_rate": 9.643047834762205e-06, "loss": 0.1546, "step": 1580 }, { "epoch": 1.4761904761904763, "grad_norm": 2.4150755396256387, "learning_rate": 9.64248653557611e-06, "loss": 0.1311, "step": 1581 }, { "epoch": 1.477124183006536, "grad_norm": 2.092638325799082, "learning_rate": 9.641924811783636e-06, "loss": 0.0533, "step": 1582 }, { "epoch": 1.4780578898225958, "grad_norm": 6.527893911246904, "learning_rate": 9.641362663436162e-06, "loss": 0.3039, "step": 1583 }, { "epoch": 1.4789915966386555, "grad_norm": 1.6557066471038104, "learning_rate": 9.640800090585101e-06, "loss": 0.0393, "step": 1584 }, { "epoch": 1.4799253034547153, "grad_norm": 1.8425080133779184, "learning_rate": 9.640237093281906e-06, "loss": 0.0989, "step": 1585 }, { "epoch": 1.480859010270775, "grad_norm": 4.144985419136028, "learning_rate": 9.639673671578068e-06, "loss": 0.2196, "step": 1586 }, { "epoch": 1.4817927170868348, "grad_norm": 7.463370152266463, "learning_rate": 9.63910982552512e-06, "loss": 0.0866, "step": 1587 }, { "epoch": 1.4827264239028946, "grad_norm": 1.4535613079690168, "learning_rate": 9.638545555174634e-06, "loss": 0.0769, "step": 1588 }, { "epoch": 1.4836601307189543, "grad_norm": 1.8551447679515005, "learning_rate": 9.637980860578214e-06, "loss": 0.0561, "step": 1589 }, { "epoch": 1.484593837535014, "grad_norm": 0.8044134641980261, "learning_rate": 9.63741574178751e-06, "loss": 0.0312, "step": 1590 }, { "epoch": 1.4855275443510738, "grad_norm": 5.435420701274796, "learning_rate": 9.636850198854209e-06, "loss": 0.32, "step": 1591 }, { "epoch": 1.4864612511671336, "grad_norm": 2.8937507844500208, "learning_rate": 9.636284231830035e-06, "loss": 0.1088, "step": 1592 }, { "epoch": 1.4873949579831933, "grad_norm": 4.830291223497085, "learning_rate": 9.635717840766751e-06, "loss": 0.2632, "step": 1593 }, { "epoch": 1.488328664799253, "grad_norm": 2.257521176916934, "learning_rate": 9.635151025716162e-06, "loss": 0.1549, "step": 1594 }, { "epoch": 1.4892623716153128, "grad_norm": 0.8249895570808746, "learning_rate": 9.63458378673011e-06, "loss": 0.023, "step": 1595 }, { "epoch": 1.4901960784313726, "grad_norm": 1.2935825584703344, "learning_rate": 9.634016123860473e-06, "loss": 0.0578, "step": 1596 }, { "epoch": 1.4911297852474323, "grad_norm": 0.8047747703174761, "learning_rate": 9.633448037159167e-06, "loss": 0.0211, "step": 1597 }, { "epoch": 1.492063492063492, "grad_norm": 3.5342835115251554, "learning_rate": 9.632879526678156e-06, "loss": 0.1863, "step": 1598 }, { "epoch": 1.4929971988795518, "grad_norm": 3.0182012748585327, "learning_rate": 9.632310592469435e-06, "loss": 0.1336, "step": 1599 }, { "epoch": 1.4939309056956116, "grad_norm": 2.7064497989980443, "learning_rate": 9.631741234585036e-06, "loss": 0.1832, "step": 1600 }, { "epoch": 1.4948646125116714, "grad_norm": 1.7807505022824668, "learning_rate": 9.631171453077037e-06, "loss": 0.1227, "step": 1601 }, { "epoch": 1.495798319327731, "grad_norm": 3.7305070237386078, "learning_rate": 9.630601247997547e-06, "loss": 0.0518, "step": 1602 }, { "epoch": 1.4967320261437909, "grad_norm": 5.2588213350633914, "learning_rate": 9.63003061939872e-06, "loss": 0.265, "step": 1603 }, { "epoch": 1.4976657329598506, "grad_norm": 1.8827220529188438, "learning_rate": 9.629459567332747e-06, "loss": 0.1163, "step": 1604 }, { "epoch": 1.4985994397759104, "grad_norm": 2.516634696172235, "learning_rate": 9.628888091851854e-06, "loss": 0.1562, "step": 1605 }, { "epoch": 1.4995331465919701, "grad_norm": 1.9279413740148599, "learning_rate": 9.628316193008311e-06, "loss": 0.0916, "step": 1606 }, { "epoch": 1.5004668534080299, "grad_norm": 3.7339016869515382, "learning_rate": 9.627743870854424e-06, "loss": 0.1961, "step": 1607 }, { "epoch": 1.5014005602240896, "grad_norm": 3.3137930096826334, "learning_rate": 9.627171125442538e-06, "loss": 0.1051, "step": 1608 }, { "epoch": 1.5023342670401494, "grad_norm": 1.9101361361645177, "learning_rate": 9.626597956825036e-06, "loss": 0.1052, "step": 1609 }, { "epoch": 1.5032679738562091, "grad_norm": 1.0376300861116214, "learning_rate": 9.626024365054342e-06, "loss": 0.0504, "step": 1610 }, { "epoch": 1.504201680672269, "grad_norm": 2.232828473797589, "learning_rate": 9.625450350182919e-06, "loss": 0.1012, "step": 1611 }, { "epoch": 1.5051353874883286, "grad_norm": 3.9749844688358893, "learning_rate": 9.624875912263261e-06, "loss": 0.1673, "step": 1612 }, { "epoch": 1.5060690943043884, "grad_norm": 3.230964428601176, "learning_rate": 9.624301051347913e-06, "loss": 0.0909, "step": 1613 }, { "epoch": 1.5070028011204482, "grad_norm": 1.927330779093572, "learning_rate": 9.623725767489448e-06, "loss": 0.0997, "step": 1614 }, { "epoch": 1.507936507936508, "grad_norm": 1.896111090065582, "learning_rate": 9.623150060740483e-06, "loss": 0.0595, "step": 1615 }, { "epoch": 1.5088702147525677, "grad_norm": 1.044743731161175, "learning_rate": 9.622573931153676e-06, "loss": 0.0387, "step": 1616 }, { "epoch": 1.5098039215686274, "grad_norm": 1.3719311925513444, "learning_rate": 9.621997378781715e-06, "loss": 0.0607, "step": 1617 }, { "epoch": 1.5107376283846872, "grad_norm": 3.7259004949380126, "learning_rate": 9.621420403677338e-06, "loss": 0.1477, "step": 1618 }, { "epoch": 1.511671335200747, "grad_norm": 4.410144713491308, "learning_rate": 9.620843005893308e-06, "loss": 0.2003, "step": 1619 }, { "epoch": 1.5126050420168067, "grad_norm": 2.229616743095976, "learning_rate": 9.620265185482442e-06, "loss": 0.0379, "step": 1620 }, { "epoch": 1.5135387488328664, "grad_norm": 0.9193450216835584, "learning_rate": 9.619686942497584e-06, "loss": 0.0352, "step": 1621 }, { "epoch": 1.5144724556489262, "grad_norm": 3.721285563820914, "learning_rate": 9.619108276991621e-06, "loss": 0.1419, "step": 1622 }, { "epoch": 1.515406162464986, "grad_norm": 4.627755883682286, "learning_rate": 9.61852918901748e-06, "loss": 0.2451, "step": 1623 }, { "epoch": 1.5163398692810457, "grad_norm": 1.8272009825398987, "learning_rate": 9.617949678628124e-06, "loss": 0.1058, "step": 1624 }, { "epoch": 1.5172735760971054, "grad_norm": 6.776138145980427, "learning_rate": 9.617369745876555e-06, "loss": 0.3798, "step": 1625 }, { "epoch": 1.5182072829131652, "grad_norm": 1.0867202191437642, "learning_rate": 9.616789390815815e-06, "loss": 0.0304, "step": 1626 }, { "epoch": 1.519140989729225, "grad_norm": 1.4850633721813487, "learning_rate": 9.616208613498982e-06, "loss": 0.0744, "step": 1627 }, { "epoch": 1.5200746965452847, "grad_norm": 4.547550351344697, "learning_rate": 9.615627413979178e-06, "loss": 0.252, "step": 1628 }, { "epoch": 1.5210084033613445, "grad_norm": 1.4346435767736911, "learning_rate": 9.615045792309557e-06, "loss": 0.1174, "step": 1629 }, { "epoch": 1.5219421101774042, "grad_norm": 3.1368699527982855, "learning_rate": 9.614463748543315e-06, "loss": 0.0527, "step": 1630 }, { "epoch": 1.522875816993464, "grad_norm": 4.3908460899602995, "learning_rate": 9.613881282733688e-06, "loss": 0.2269, "step": 1631 }, { "epoch": 1.5238095238095237, "grad_norm": 4.411656749538173, "learning_rate": 9.613298394933948e-06, "loss": 0.2196, "step": 1632 }, { "epoch": 1.5247432306255835, "grad_norm": 1.5246502419094834, "learning_rate": 9.612715085197406e-06, "loss": 0.0489, "step": 1633 }, { "epoch": 1.5256769374416432, "grad_norm": 8.240754347667965, "learning_rate": 9.612131353577413e-06, "loss": 0.1302, "step": 1634 }, { "epoch": 1.526610644257703, "grad_norm": 1.0370937831579996, "learning_rate": 9.611547200127356e-06, "loss": 0.0365, "step": 1635 }, { "epoch": 1.5275443510737627, "grad_norm": 4.374554968875638, "learning_rate": 9.610962624900664e-06, "loss": 0.2283, "step": 1636 }, { "epoch": 1.5284780578898225, "grad_norm": 1.9642867721643826, "learning_rate": 9.610377627950802e-06, "loss": 0.0518, "step": 1637 }, { "epoch": 1.5294117647058822, "grad_norm": 1.871332289779012, "learning_rate": 9.609792209331274e-06, "loss": 0.1037, "step": 1638 }, { "epoch": 1.530345471521942, "grad_norm": 1.486879517685751, "learning_rate": 9.609206369095626e-06, "loss": 0.0659, "step": 1639 }, { "epoch": 1.5312791783380018, "grad_norm": 2.9899339208388436, "learning_rate": 9.608620107297436e-06, "loss": 0.2078, "step": 1640 }, { "epoch": 1.5322128851540615, "grad_norm": 3.485219245386019, "learning_rate": 9.608033423990325e-06, "loss": 0.172, "step": 1641 }, { "epoch": 1.5331465919701213, "grad_norm": 1.8884953150414183, "learning_rate": 9.60744631922795e-06, "loss": 0.1244, "step": 1642 }, { "epoch": 1.534080298786181, "grad_norm": 6.881574085817088, "learning_rate": 9.60685879306401e-06, "loss": 0.1575, "step": 1643 }, { "epoch": 1.5350140056022408, "grad_norm": 3.103212118081127, "learning_rate": 9.606270845552243e-06, "loss": 0.2065, "step": 1644 }, { "epoch": 1.5359477124183005, "grad_norm": 3.2341233845916295, "learning_rate": 9.60568247674642e-06, "loss": 0.0409, "step": 1645 }, { "epoch": 1.5368814192343603, "grad_norm": 3.6027177152852956, "learning_rate": 9.605093686700356e-06, "loss": 0.1974, "step": 1646 }, { "epoch": 1.53781512605042, "grad_norm": 3.052765157295625, "learning_rate": 9.6045044754679e-06, "loss": 0.2138, "step": 1647 }, { "epoch": 1.5387488328664798, "grad_norm": 2.0110916203059777, "learning_rate": 9.603914843102941e-06, "loss": 0.1003, "step": 1648 }, { "epoch": 1.5396825396825395, "grad_norm": 2.1873919712542365, "learning_rate": 9.603324789659412e-06, "loss": 0.1157, "step": 1649 }, { "epoch": 1.5406162464985993, "grad_norm": 1.5715454317147617, "learning_rate": 9.602734315191275e-06, "loss": 0.0748, "step": 1650 }, { "epoch": 1.541549953314659, "grad_norm": 7.445578221159724, "learning_rate": 9.60214341975254e-06, "loss": 0.324, "step": 1651 }, { "epoch": 1.5424836601307188, "grad_norm": 0.9633737979171236, "learning_rate": 9.601552103397248e-06, "loss": 0.0271, "step": 1652 }, { "epoch": 1.5434173669467786, "grad_norm": 2.9824590448366246, "learning_rate": 9.60096036617948e-06, "loss": 0.2738, "step": 1653 }, { "epoch": 1.5443510737628383, "grad_norm": 1.4316416960384, "learning_rate": 9.600368208153359e-06, "loss": 0.0632, "step": 1654 }, { "epoch": 1.545284780578898, "grad_norm": 2.255372011269888, "learning_rate": 9.599775629373045e-06, "loss": 0.147, "step": 1655 }, { "epoch": 1.5462184873949578, "grad_norm": 2.8222449370703337, "learning_rate": 9.599182629892735e-06, "loss": 0.135, "step": 1656 }, { "epoch": 1.5471521942110178, "grad_norm": 1.7138041758920248, "learning_rate": 9.598589209766664e-06, "loss": 0.0606, "step": 1657 }, { "epoch": 1.5480859010270775, "grad_norm": 2.721730935115081, "learning_rate": 9.597995369049107e-06, "loss": 0.0875, "step": 1658 }, { "epoch": 1.5490196078431373, "grad_norm": 1.3946987172720822, "learning_rate": 9.59740110779438e-06, "loss": 0.0496, "step": 1659 }, { "epoch": 1.549953314659197, "grad_norm": 8.007733934749819, "learning_rate": 9.596806426056833e-06, "loss": 0.2144, "step": 1660 }, { "epoch": 1.5508870214752568, "grad_norm": 2.111628463618387, "learning_rate": 9.596211323890855e-06, "loss": 0.1113, "step": 1661 }, { "epoch": 1.5518207282913166, "grad_norm": 1.261134973886681, "learning_rate": 9.595615801350876e-06, "loss": 0.0523, "step": 1662 }, { "epoch": 1.5527544351073763, "grad_norm": 1.064965131895747, "learning_rate": 9.595019858491364e-06, "loss": 0.0334, "step": 1663 }, { "epoch": 1.553688141923436, "grad_norm": 1.9124997996129174, "learning_rate": 9.59442349536682e-06, "loss": 0.1213, "step": 1664 }, { "epoch": 1.5546218487394958, "grad_norm": 2.717240178559544, "learning_rate": 9.593826712031793e-06, "loss": 0.0964, "step": 1665 }, { "epoch": 1.5555555555555556, "grad_norm": 1.5877143488854317, "learning_rate": 9.593229508540864e-06, "loss": 0.0648, "step": 1666 }, { "epoch": 1.5564892623716153, "grad_norm": 1.075302214512788, "learning_rate": 9.592631884948654e-06, "loss": 0.0307, "step": 1667 }, { "epoch": 1.557422969187675, "grad_norm": 2.6437237357076464, "learning_rate": 9.592033841309821e-06, "loss": 0.1207, "step": 1668 }, { "epoch": 1.5583566760037348, "grad_norm": 1.705332482000671, "learning_rate": 9.591435377679064e-06, "loss": 0.0885, "step": 1669 }, { "epoch": 1.5592903828197946, "grad_norm": 2.799310700086446, "learning_rate": 9.590836494111118e-06, "loss": 0.0918, "step": 1670 }, { "epoch": 1.5602240896358543, "grad_norm": 7.019742128688856, "learning_rate": 9.590237190660758e-06, "loss": 0.2247, "step": 1671 }, { "epoch": 1.561157796451914, "grad_norm": 2.6035571607061136, "learning_rate": 9.589637467382795e-06, "loss": 0.1653, "step": 1672 }, { "epoch": 1.5620915032679739, "grad_norm": 5.626722023748838, "learning_rate": 9.589037324332084e-06, "loss": 0.3058, "step": 1673 }, { "epoch": 1.5630252100840336, "grad_norm": 4.122910665450975, "learning_rate": 9.588436761563513e-06, "loss": 0.2522, "step": 1674 }, { "epoch": 1.5639589169000934, "grad_norm": 1.5207134230317412, "learning_rate": 9.58783577913201e-06, "loss": 0.0713, "step": 1675 }, { "epoch": 1.5648926237161531, "grad_norm": 1.3602747429631377, "learning_rate": 9.587234377092541e-06, "loss": 0.0396, "step": 1676 }, { "epoch": 1.5658263305322129, "grad_norm": 3.657305183106329, "learning_rate": 9.586632555500109e-06, "loss": 0.2906, "step": 1677 }, { "epoch": 1.5667600373482726, "grad_norm": 3.7326261078770946, "learning_rate": 9.586030314409761e-06, "loss": 0.235, "step": 1678 }, { "epoch": 1.5676937441643324, "grad_norm": 4.391070679655278, "learning_rate": 9.585427653876577e-06, "loss": 0.1853, "step": 1679 }, { "epoch": 1.5686274509803921, "grad_norm": 1.408317409999076, "learning_rate": 9.584824573955675e-06, "loss": 0.0601, "step": 1680 }, { "epoch": 1.569561157796452, "grad_norm": 2.265598354226704, "learning_rate": 9.584221074702219e-06, "loss": 0.1692, "step": 1681 }, { "epoch": 1.5704948646125116, "grad_norm": 1.9056326025673354, "learning_rate": 9.583617156171397e-06, "loss": 0.1088, "step": 1682 }, { "epoch": 1.5714285714285714, "grad_norm": 2.226666714871257, "learning_rate": 9.58301281841845e-06, "loss": 0.1074, "step": 1683 }, { "epoch": 1.5723622782446312, "grad_norm": 1.360109018884013, "learning_rate": 9.582408061498651e-06, "loss": 0.0577, "step": 1684 }, { "epoch": 1.573295985060691, "grad_norm": 1.5417093080434157, "learning_rate": 9.58180288546731e-06, "loss": 0.1052, "step": 1685 }, { "epoch": 1.5742296918767507, "grad_norm": 1.6244631327696197, "learning_rate": 9.581197290379778e-06, "loss": 0.033, "step": 1686 }, { "epoch": 1.5751633986928104, "grad_norm": 1.540769154995024, "learning_rate": 9.580591276291442e-06, "loss": 0.0675, "step": 1687 }, { "epoch": 1.5760971055088702, "grad_norm": 1.8458058456669202, "learning_rate": 9.579984843257731e-06, "loss": 0.1275, "step": 1688 }, { "epoch": 1.57703081232493, "grad_norm": 2.0344146084776455, "learning_rate": 9.57937799133411e-06, "loss": 0.1382, "step": 1689 }, { "epoch": 1.5779645191409897, "grad_norm": 0.6807436633311574, "learning_rate": 9.578770720576077e-06, "loss": 0.0167, "step": 1690 }, { "epoch": 1.5788982259570497, "grad_norm": 1.9096107446801829, "learning_rate": 9.578163031039178e-06, "loss": 0.0783, "step": 1691 }, { "epoch": 1.5798319327731094, "grad_norm": 3.187752273909705, "learning_rate": 9.577554922778994e-06, "loss": 0.2029, "step": 1692 }, { "epoch": 1.5807656395891692, "grad_norm": 3.565155904479389, "learning_rate": 9.576946395851141e-06, "loss": 0.2172, "step": 1693 }, { "epoch": 1.581699346405229, "grad_norm": 2.5875669936260506, "learning_rate": 9.576337450311275e-06, "loss": 0.1407, "step": 1694 }, { "epoch": 1.5826330532212887, "grad_norm": 2.611886272134902, "learning_rate": 9.575728086215093e-06, "loss": 0.0978, "step": 1695 }, { "epoch": 1.5835667600373484, "grad_norm": 3.1094526473233204, "learning_rate": 9.575118303618326e-06, "loss": 0.1603, "step": 1696 }, { "epoch": 1.5845004668534082, "grad_norm": 2.5757748656717663, "learning_rate": 9.574508102576747e-06, "loss": 0.1611, "step": 1697 }, { "epoch": 1.585434173669468, "grad_norm": 2.368689573547699, "learning_rate": 9.573897483146163e-06, "loss": 0.1494, "step": 1698 }, { "epoch": 1.5863678804855277, "grad_norm": 1.6441612359844964, "learning_rate": 9.573286445382425e-06, "loss": 0.0909, "step": 1699 }, { "epoch": 1.5873015873015874, "grad_norm": 1.545436302241553, "learning_rate": 9.572674989341416e-06, "loss": 0.0469, "step": 1700 }, { "epoch": 1.5882352941176472, "grad_norm": 0.7574374780956064, "learning_rate": 9.572063115079063e-06, "loss": 0.0264, "step": 1701 }, { "epoch": 1.589169000933707, "grad_norm": 2.60258705686552, "learning_rate": 9.571450822651327e-06, "loss": 0.1255, "step": 1702 }, { "epoch": 1.5901027077497667, "grad_norm": 3.5188156928525154, "learning_rate": 9.57083811211421e-06, "loss": 0.1212, "step": 1703 }, { "epoch": 1.5910364145658265, "grad_norm": 4.226540823747601, "learning_rate": 9.570224983523751e-06, "loss": 0.2348, "step": 1704 }, { "epoch": 1.5919701213818862, "grad_norm": 2.097990381253172, "learning_rate": 9.569611436936026e-06, "loss": 0.1287, "step": 1705 }, { "epoch": 1.592903828197946, "grad_norm": 3.876880998289964, "learning_rate": 9.568997472407152e-06, "loss": 0.1775, "step": 1706 }, { "epoch": 1.5938375350140057, "grad_norm": 4.556037417659876, "learning_rate": 9.568383089993283e-06, "loss": 0.2117, "step": 1707 }, { "epoch": 1.5947712418300655, "grad_norm": 0.728403752699978, "learning_rate": 9.567768289750609e-06, "loss": 0.0208, "step": 1708 }, { "epoch": 1.5957049486461252, "grad_norm": 3.4689585180936215, "learning_rate": 9.567153071735361e-06, "loss": 0.1245, "step": 1709 }, { "epoch": 1.596638655462185, "grad_norm": 1.842095159676327, "learning_rate": 9.56653743600381e-06, "loss": 0.1218, "step": 1710 }, { "epoch": 1.5975723622782447, "grad_norm": 5.600043727594362, "learning_rate": 9.565921382612258e-06, "loss": 0.2089, "step": 1711 }, { "epoch": 1.5985060690943045, "grad_norm": 1.947646448884611, "learning_rate": 9.565304911617055e-06, "loss": 0.0972, "step": 1712 }, { "epoch": 1.5994397759103642, "grad_norm": 3.507693182366708, "learning_rate": 9.564688023074579e-06, "loss": 0.0884, "step": 1713 }, { "epoch": 1.600373482726424, "grad_norm": 2.9636234822682526, "learning_rate": 9.564070717041256e-06, "loss": 0.1283, "step": 1714 }, { "epoch": 1.6013071895424837, "grad_norm": 1.0914838949383812, "learning_rate": 9.563452993573541e-06, "loss": 0.0295, "step": 1715 }, { "epoch": 1.6022408963585435, "grad_norm": 2.2837174888595695, "learning_rate": 9.562834852727935e-06, "loss": 0.106, "step": 1716 }, { "epoch": 1.6031746031746033, "grad_norm": 2.0669640895530548, "learning_rate": 9.562216294560973e-06, "loss": 0.1042, "step": 1717 }, { "epoch": 1.604108309990663, "grad_norm": 1.706153995875339, "learning_rate": 9.56159731912923e-06, "loss": 0.101, "step": 1718 }, { "epoch": 1.6050420168067228, "grad_norm": 2.0307985267114135, "learning_rate": 9.560977926489314e-06, "loss": 0.0434, "step": 1719 }, { "epoch": 1.6059757236227825, "grad_norm": 3.8763896885918188, "learning_rate": 9.560358116697878e-06, "loss": 0.1799, "step": 1720 }, { "epoch": 1.6069094304388423, "grad_norm": 2.56677905416428, "learning_rate": 9.559737889811612e-06, "loss": 0.179, "step": 1721 }, { "epoch": 1.607843137254902, "grad_norm": 2.6139316049630854, "learning_rate": 9.559117245887238e-06, "loss": 0.1566, "step": 1722 }, { "epoch": 1.6087768440709618, "grad_norm": 4.727972640447235, "learning_rate": 9.558496184981525e-06, "loss": 0.2303, "step": 1723 }, { "epoch": 1.6097105508870215, "grad_norm": 2.059303829303892, "learning_rate": 9.557874707151274e-06, "loss": 0.0403, "step": 1724 }, { "epoch": 1.6106442577030813, "grad_norm": 4.712408543790666, "learning_rate": 9.557252812453326e-06, "loss": 0.1674, "step": 1725 }, { "epoch": 1.611577964519141, "grad_norm": 2.9135067253181237, "learning_rate": 9.556630500944561e-06, "loss": 0.1393, "step": 1726 }, { "epoch": 1.6125116713352008, "grad_norm": 1.6443807191238617, "learning_rate": 9.556007772681897e-06, "loss": 0.0534, "step": 1727 }, { "epoch": 1.6134453781512605, "grad_norm": 2.199886085270681, "learning_rate": 9.555384627722286e-06, "loss": 0.0939, "step": 1728 }, { "epoch": 1.6143790849673203, "grad_norm": 4.599886603183836, "learning_rate": 9.554761066122724e-06, "loss": 0.1681, "step": 1729 }, { "epoch": 1.61531279178338, "grad_norm": 1.0550580583129734, "learning_rate": 9.554137087940241e-06, "loss": 0.0344, "step": 1730 }, { "epoch": 1.6162464985994398, "grad_norm": 2.040631423062139, "learning_rate": 9.55351269323191e-06, "loss": 0.1077, "step": 1731 }, { "epoch": 1.6171802054154996, "grad_norm": 2.5801622848881824, "learning_rate": 9.552887882054835e-06, "loss": 0.0409, "step": 1732 }, { "epoch": 1.6181139122315593, "grad_norm": 6.1134179557179396, "learning_rate": 9.552262654466162e-06, "loss": 0.2695, "step": 1733 }, { "epoch": 1.619047619047619, "grad_norm": 2.5455456750806413, "learning_rate": 9.551637010523078e-06, "loss": 0.1769, "step": 1734 }, { "epoch": 1.6199813258636788, "grad_norm": 3.4449523727573736, "learning_rate": 9.551010950282802e-06, "loss": 0.1859, "step": 1735 }, { "epoch": 1.6209150326797386, "grad_norm": 1.4923892205944187, "learning_rate": 9.550384473802596e-06, "loss": 0.0532, "step": 1736 }, { "epoch": 1.6218487394957983, "grad_norm": 1.1894713045346579, "learning_rate": 9.549757581139758e-06, "loss": 0.0353, "step": 1737 }, { "epoch": 1.622782446311858, "grad_norm": 1.598077052330874, "learning_rate": 9.549130272351621e-06, "loss": 0.0742, "step": 1738 }, { "epoch": 1.6237161531279178, "grad_norm": 3.9838220980377455, "learning_rate": 9.548502547495565e-06, "loss": 0.196, "step": 1739 }, { "epoch": 1.6246498599439776, "grad_norm": 3.497869950356952, "learning_rate": 9.547874406628998e-06, "loss": 0.2481, "step": 1740 }, { "epoch": 1.6255835667600373, "grad_norm": 3.841551230126482, "learning_rate": 9.547245849809371e-06, "loss": 0.1714, "step": 1741 }, { "epoch": 1.626517273576097, "grad_norm": 2.6328427554239684, "learning_rate": 9.546616877094173e-06, "loss": 0.1209, "step": 1742 }, { "epoch": 1.6274509803921569, "grad_norm": 1.1005137161684277, "learning_rate": 9.54598748854093e-06, "loss": 0.0248, "step": 1743 }, { "epoch": 1.6283846872082166, "grad_norm": 2.9022752922016517, "learning_rate": 9.54535768420721e-06, "loss": 0.1105, "step": 1744 }, { "epoch": 1.6293183940242764, "grad_norm": 1.9538634993848667, "learning_rate": 9.54472746415061e-06, "loss": 0.1149, "step": 1745 }, { "epoch": 1.6302521008403361, "grad_norm": 2.6697221019845485, "learning_rate": 9.544096828428775e-06, "loss": 0.1836, "step": 1746 }, { "epoch": 1.6311858076563959, "grad_norm": 1.967137349571421, "learning_rate": 9.543465777099378e-06, "loss": 0.0894, "step": 1747 }, { "epoch": 1.6321195144724556, "grad_norm": 1.6270571076896918, "learning_rate": 9.542834310220143e-06, "loss": 0.0597, "step": 1748 }, { "epoch": 1.6330532212885154, "grad_norm": 2.2683167153088752, "learning_rate": 9.542202427848819e-06, "loss": 0.1378, "step": 1749 }, { "epoch": 1.6339869281045751, "grad_norm": 1.2087779196289365, "learning_rate": 9.541570130043201e-06, "loss": 0.0347, "step": 1750 }, { "epoch": 1.6349206349206349, "grad_norm": 2.153041693285009, "learning_rate": 9.540937416861117e-06, "loss": 0.0609, "step": 1751 }, { "epoch": 1.6358543417366946, "grad_norm": 5.044143322359289, "learning_rate": 9.54030428836044e-06, "loss": 0.1585, "step": 1752 }, { "epoch": 1.6367880485527544, "grad_norm": 1.6212713137791326, "learning_rate": 9.539670744599072e-06, "loss": 0.0764, "step": 1753 }, { "epoch": 1.6377217553688141, "grad_norm": 2.9210847471936803, "learning_rate": 9.539036785634961e-06, "loss": 0.1461, "step": 1754 }, { "epoch": 1.638655462184874, "grad_norm": 2.366559220369335, "learning_rate": 9.53840241152609e-06, "loss": 0.0792, "step": 1755 }, { "epoch": 1.6395891690009337, "grad_norm": 2.7334372928323143, "learning_rate": 9.537767622330473e-06, "loss": 0.2385, "step": 1756 }, { "epoch": 1.6405228758169934, "grad_norm": 3.1040967229867347, "learning_rate": 9.537132418106176e-06, "loss": 0.1225, "step": 1757 }, { "epoch": 1.6414565826330532, "grad_norm": 1.9781555363504786, "learning_rate": 9.536496798911293e-06, "loss": 0.1026, "step": 1758 }, { "epoch": 1.642390289449113, "grad_norm": 2.110504918012052, "learning_rate": 9.535860764803955e-06, "loss": 0.1047, "step": 1759 }, { "epoch": 1.6433239962651727, "grad_norm": 4.716733445444416, "learning_rate": 9.53522431584234e-06, "loss": 0.2202, "step": 1760 }, { "epoch": 1.6442577030812324, "grad_norm": 2.128714349754415, "learning_rate": 9.534587452084653e-06, "loss": 0.1157, "step": 1761 }, { "epoch": 1.6451914098972922, "grad_norm": 1.2454342636225268, "learning_rate": 9.533950173589145e-06, "loss": 0.0873, "step": 1762 }, { "epoch": 1.646125116713352, "grad_norm": 2.259279901259521, "learning_rate": 9.533312480414103e-06, "loss": 0.1667, "step": 1763 }, { "epoch": 1.6470588235294117, "grad_norm": 1.757433287105761, "learning_rate": 9.532674372617849e-06, "loss": 0.0751, "step": 1764 }, { "epoch": 1.6479925303454714, "grad_norm": 5.459057959464994, "learning_rate": 9.532035850258745e-06, "loss": 0.3292, "step": 1765 }, { "epoch": 1.6489262371615312, "grad_norm": 2.267772838355241, "learning_rate": 9.531396913395193e-06, "loss": 0.1184, "step": 1766 }, { "epoch": 1.649859943977591, "grad_norm": 2.039253044209648, "learning_rate": 9.530757562085627e-06, "loss": 0.0431, "step": 1767 }, { "epoch": 1.6507936507936507, "grad_norm": 3.755622456869109, "learning_rate": 9.530117796388527e-06, "loss": 0.1325, "step": 1768 }, { "epoch": 1.6517273576097105, "grad_norm": 2.657366114605349, "learning_rate": 9.529477616362404e-06, "loss": 0.0998, "step": 1769 }, { "epoch": 1.6526610644257702, "grad_norm": 1.612667729748863, "learning_rate": 9.52883702206581e-06, "loss": 0.0843, "step": 1770 }, { "epoch": 1.65359477124183, "grad_norm": 2.244570501707427, "learning_rate": 9.528196013557335e-06, "loss": 0.1182, "step": 1771 }, { "epoch": 1.6545284780578897, "grad_norm": 2.283138607734862, "learning_rate": 9.527554590895604e-06, "loss": 0.1454, "step": 1772 }, { "epoch": 1.6554621848739495, "grad_norm": 0.984591609795398, "learning_rate": 9.526912754139286e-06, "loss": 0.0228, "step": 1773 }, { "epoch": 1.6563958916900092, "grad_norm": 4.992202862964228, "learning_rate": 9.526270503347081e-06, "loss": 0.2624, "step": 1774 }, { "epoch": 1.657329598506069, "grad_norm": 0.6854657158927584, "learning_rate": 9.525627838577732e-06, "loss": 0.0215, "step": 1775 }, { "epoch": 1.6582633053221287, "grad_norm": 1.0908694350627568, "learning_rate": 9.524984759890015e-06, "loss": 0.0284, "step": 1776 }, { "epoch": 1.6591970121381885, "grad_norm": 2.3622123691861416, "learning_rate": 9.52434126734275e-06, "loss": 0.1185, "step": 1777 }, { "epoch": 1.6601307189542482, "grad_norm": 2.1832478527998838, "learning_rate": 9.523697360994788e-06, "loss": 0.1243, "step": 1778 }, { "epoch": 1.661064425770308, "grad_norm": 1.0688999400597936, "learning_rate": 9.523053040905024e-06, "loss": 0.059, "step": 1779 }, { "epoch": 1.6619981325863677, "grad_norm": 3.3046783974126397, "learning_rate": 9.522408307132386e-06, "loss": 0.1426, "step": 1780 }, { "epoch": 1.6629318394024275, "grad_norm": 2.014967366203356, "learning_rate": 9.521763159735843e-06, "loss": 0.0824, "step": 1781 }, { "epoch": 1.6638655462184873, "grad_norm": 1.7547932049817734, "learning_rate": 9.521117598774401e-06, "loss": 0.0984, "step": 1782 }, { "epoch": 1.664799253034547, "grad_norm": 2.90499251372725, "learning_rate": 9.520471624307105e-06, "loss": 0.1049, "step": 1783 }, { "epoch": 1.6657329598506068, "grad_norm": 1.8884924599795854, "learning_rate": 9.519825236393033e-06, "loss": 0.0782, "step": 1784 }, { "epoch": 1.6666666666666665, "grad_norm": 2.356051652174269, "learning_rate": 9.519178435091308e-06, "loss": 0.0864, "step": 1785 }, { "epoch": 1.6676003734827263, "grad_norm": 1.3865956168280005, "learning_rate": 9.518531220461086e-06, "loss": 0.0374, "step": 1786 }, { "epoch": 1.668534080298786, "grad_norm": 2.2211694010707292, "learning_rate": 9.517883592561559e-06, "loss": 0.1556, "step": 1787 }, { "epoch": 1.6694677871148458, "grad_norm": 1.2101484936866802, "learning_rate": 9.517235551451962e-06, "loss": 0.0652, "step": 1788 }, { "epoch": 1.6704014939309055, "grad_norm": 2.1481112204748003, "learning_rate": 9.516587097191565e-06, "loss": 0.1211, "step": 1789 }, { "epoch": 1.6713352007469653, "grad_norm": 2.5304912325990903, "learning_rate": 9.515938229839678e-06, "loss": 0.0788, "step": 1790 }, { "epoch": 1.6722689075630253, "grad_norm": 4.230348576399718, "learning_rate": 9.515288949455645e-06, "loss": 0.1898, "step": 1791 }, { "epoch": 1.673202614379085, "grad_norm": 3.4518166349884067, "learning_rate": 9.514639256098852e-06, "loss": 0.2314, "step": 1792 }, { "epoch": 1.6741363211951448, "grad_norm": 3.8212749297176294, "learning_rate": 9.513989149828718e-06, "loss": 0.1067, "step": 1793 }, { "epoch": 1.6750700280112045, "grad_norm": 3.8590463502131636, "learning_rate": 9.513338630704703e-06, "loss": 0.2379, "step": 1794 }, { "epoch": 1.6760037348272643, "grad_norm": 1.0385490911383175, "learning_rate": 9.512687698786307e-06, "loss": 0.0505, "step": 1795 }, { "epoch": 1.676937441643324, "grad_norm": 1.694614785107867, "learning_rate": 9.51203635413306e-06, "loss": 0.0727, "step": 1796 }, { "epoch": 1.6778711484593838, "grad_norm": 1.2417943552400115, "learning_rate": 9.511384596804539e-06, "loss": 0.0463, "step": 1797 }, { "epoch": 1.6788048552754435, "grad_norm": 1.6063333609609258, "learning_rate": 9.510732426860349e-06, "loss": 0.0522, "step": 1798 }, { "epoch": 1.6797385620915033, "grad_norm": 2.7200067277793045, "learning_rate": 9.510079844360145e-06, "loss": 0.1545, "step": 1799 }, { "epoch": 1.680672268907563, "grad_norm": 2.8426251923440082, "learning_rate": 9.509426849363608e-06, "loss": 0.1766, "step": 1800 }, { "epoch": 1.6816059757236228, "grad_norm": 1.1447562050816382, "learning_rate": 9.508773441930464e-06, "loss": 0.0244, "step": 1801 }, { "epoch": 1.6825396825396826, "grad_norm": 2.037940363660738, "learning_rate": 9.50811962212047e-06, "loss": 0.1299, "step": 1802 }, { "epoch": 1.6834733893557423, "grad_norm": 1.0776381637966819, "learning_rate": 9.507465389993431e-06, "loss": 0.0226, "step": 1803 }, { "epoch": 1.684407096171802, "grad_norm": 1.6649431117739937, "learning_rate": 9.50681074560918e-06, "loss": 0.0624, "step": 1804 }, { "epoch": 1.6853408029878618, "grad_norm": 1.5022887560025673, "learning_rate": 9.506155689027595e-06, "loss": 0.0619, "step": 1805 }, { "epoch": 1.6862745098039216, "grad_norm": 0.5000793116733006, "learning_rate": 9.505500220308583e-06, "loss": 0.0124, "step": 1806 }, { "epoch": 1.6872082166199813, "grad_norm": 1.830567183529938, "learning_rate": 9.504844339512096e-06, "loss": 0.087, "step": 1807 }, { "epoch": 1.688141923436041, "grad_norm": 2.0069529451936674, "learning_rate": 9.504188046698122e-06, "loss": 0.0577, "step": 1808 }, { "epoch": 1.6890756302521008, "grad_norm": 1.9543739410456817, "learning_rate": 9.503531341926687e-06, "loss": 0.0981, "step": 1809 }, { "epoch": 1.6900093370681606, "grad_norm": 1.080929540391815, "learning_rate": 9.502874225257852e-06, "loss": 0.0531, "step": 1810 }, { "epoch": 1.6909430438842203, "grad_norm": 1.8583888312498267, "learning_rate": 9.50221669675172e-06, "loss": 0.1168, "step": 1811 }, { "epoch": 1.69187675070028, "grad_norm": 1.190729006718797, "learning_rate": 9.501558756468426e-06, "loss": 0.0296, "step": 1812 }, { "epoch": 1.6928104575163399, "grad_norm": 5.809195132586833, "learning_rate": 9.500900404468147e-06, "loss": 0.1603, "step": 1813 }, { "epoch": 1.6937441643323996, "grad_norm": 2.2836685055617747, "learning_rate": 9.500241640811099e-06, "loss": 0.1077, "step": 1814 }, { "epoch": 1.6946778711484594, "grad_norm": 2.086271038373127, "learning_rate": 9.49958246555753e-06, "loss": 0.1067, "step": 1815 }, { "epoch": 1.6956115779645191, "grad_norm": 1.746736165469523, "learning_rate": 9.49892287876773e-06, "loss": 0.0884, "step": 1816 }, { "epoch": 1.6965452847805789, "grad_norm": 3.509074487287132, "learning_rate": 9.498262880502026e-06, "loss": 0.1443, "step": 1817 }, { "epoch": 1.6974789915966386, "grad_norm": 1.6432530400383611, "learning_rate": 9.49760247082078e-06, "loss": 0.0768, "step": 1818 }, { "epoch": 1.6984126984126984, "grad_norm": 2.934946853396911, "learning_rate": 9.496941649784396e-06, "loss": 0.1566, "step": 1819 }, { "epoch": 1.6993464052287581, "grad_norm": 6.833258030957113, "learning_rate": 9.496280417453311e-06, "loss": 0.2902, "step": 1820 }, { "epoch": 1.7002801120448179, "grad_norm": 2.780029087032045, "learning_rate": 9.495618773888007e-06, "loss": 0.1625, "step": 1821 }, { "epoch": 1.7012138188608776, "grad_norm": 1.899022809178191, "learning_rate": 9.494956719148993e-06, "loss": 0.0928, "step": 1822 }, { "epoch": 1.7021475256769374, "grad_norm": 1.0441776125844384, "learning_rate": 9.494294253296822e-06, "loss": 0.0356, "step": 1823 }, { "epoch": 1.7030812324929971, "grad_norm": 0.9820054995995788, "learning_rate": 9.493631376392087e-06, "loss": 0.0324, "step": 1824 }, { "epoch": 1.7040149393090571, "grad_norm": 1.6787060178561621, "learning_rate": 9.492968088495413e-06, "loss": 0.0989, "step": 1825 }, { "epoch": 1.7049486461251169, "grad_norm": 1.5055734945343784, "learning_rate": 9.492304389667466e-06, "loss": 0.1115, "step": 1826 }, { "epoch": 1.7058823529411766, "grad_norm": 1.0970630924472324, "learning_rate": 9.491640279968948e-06, "loss": 0.0366, "step": 1827 }, { "epoch": 1.7068160597572364, "grad_norm": 2.588713578301614, "learning_rate": 9.490975759460597e-06, "loss": 0.1911, "step": 1828 }, { "epoch": 1.7077497665732961, "grad_norm": 0.8760278365970918, "learning_rate": 9.490310828203195e-06, "loss": 0.0393, "step": 1829 }, { "epoch": 1.708683473389356, "grad_norm": 2.03333060022264, "learning_rate": 9.489645486257555e-06, "loss": 0.0391, "step": 1830 }, { "epoch": 1.7096171802054156, "grad_norm": 2.998091184481655, "learning_rate": 9.488979733684529e-06, "loss": 0.2188, "step": 1831 }, { "epoch": 1.7105508870214754, "grad_norm": 4.809680987516048, "learning_rate": 9.488313570545007e-06, "loss": 0.1682, "step": 1832 }, { "epoch": 1.7114845938375352, "grad_norm": 3.795188389062766, "learning_rate": 9.48764699689992e-06, "loss": 0.2135, "step": 1833 }, { "epoch": 1.712418300653595, "grad_norm": 1.0210906884894013, "learning_rate": 9.486980012810233e-06, "loss": 0.0389, "step": 1834 }, { "epoch": 1.7133520074696547, "grad_norm": 5.386716742650556, "learning_rate": 9.486312618336944e-06, "loss": 0.2285, "step": 1835 }, { "epoch": 1.7142857142857144, "grad_norm": 1.8698032713474901, "learning_rate": 9.4856448135411e-06, "loss": 0.1272, "step": 1836 }, { "epoch": 1.7152194211017742, "grad_norm": 2.5073186795260827, "learning_rate": 9.484976598483774e-06, "loss": 0.1563, "step": 1837 }, { "epoch": 1.716153127917834, "grad_norm": 2.3018915752723, "learning_rate": 9.484307973226085e-06, "loss": 0.1331, "step": 1838 }, { "epoch": 1.7170868347338937, "grad_norm": 6.381969757522983, "learning_rate": 9.483638937829185e-06, "loss": 0.2426, "step": 1839 }, { "epoch": 1.7180205415499534, "grad_norm": 0.8260146310739459, "learning_rate": 9.482969492354266e-06, "loss": 0.0185, "step": 1840 }, { "epoch": 1.7189542483660132, "grad_norm": 2.779476497369881, "learning_rate": 9.482299636862553e-06, "loss": 0.1697, "step": 1841 }, { "epoch": 1.719887955182073, "grad_norm": 2.000431510202891, "learning_rate": 9.481629371415315e-06, "loss": 0.0728, "step": 1842 }, { "epoch": 1.7208216619981327, "grad_norm": 0.5863379902777811, "learning_rate": 9.480958696073852e-06, "loss": 0.0286, "step": 1843 }, { "epoch": 1.7217553688141924, "grad_norm": 2.937975341562664, "learning_rate": 9.480287610899507e-06, "loss": 0.2052, "step": 1844 }, { "epoch": 1.7226890756302522, "grad_norm": 3.07229619497638, "learning_rate": 9.479616115953657e-06, "loss": 0.1242, "step": 1845 }, { "epoch": 1.723622782446312, "grad_norm": 2.7396920330965164, "learning_rate": 9.478944211297719e-06, "loss": 0.1269, "step": 1846 }, { "epoch": 1.7245564892623717, "grad_norm": 3.4074491781285343, "learning_rate": 9.478271896993144e-06, "loss": 0.2035, "step": 1847 }, { "epoch": 1.7254901960784315, "grad_norm": 2.0216306228801186, "learning_rate": 9.477599173101425e-06, "loss": 0.1377, "step": 1848 }, { "epoch": 1.7264239028944912, "grad_norm": 2.1775806875634625, "learning_rate": 9.476926039684085e-06, "loss": 0.0861, "step": 1849 }, { "epoch": 1.727357609710551, "grad_norm": 5.355897430667129, "learning_rate": 9.476252496802696e-06, "loss": 0.1572, "step": 1850 }, { "epoch": 1.7282913165266107, "grad_norm": 4.748420406947119, "learning_rate": 9.475578544518855e-06, "loss": 0.1877, "step": 1851 }, { "epoch": 1.7292250233426705, "grad_norm": 0.8855028496517748, "learning_rate": 9.474904182894207e-06, "loss": 0.031, "step": 1852 }, { "epoch": 1.7301587301587302, "grad_norm": 2.961357541157364, "learning_rate": 9.47422941199043e-06, "loss": 0.0741, "step": 1853 }, { "epoch": 1.73109243697479, "grad_norm": 2.8675200617683085, "learning_rate": 9.473554231869234e-06, "loss": 0.1526, "step": 1854 }, { "epoch": 1.7320261437908497, "grad_norm": 1.6857995141755604, "learning_rate": 9.472878642592376e-06, "loss": 0.043, "step": 1855 }, { "epoch": 1.7329598506069095, "grad_norm": 4.095251467941194, "learning_rate": 9.472202644221644e-06, "loss": 0.3419, "step": 1856 }, { "epoch": 1.7338935574229692, "grad_norm": 2.322566218124089, "learning_rate": 9.471526236818867e-06, "loss": 0.1298, "step": 1857 }, { "epoch": 1.734827264239029, "grad_norm": 2.44758393928944, "learning_rate": 9.47084942044591e-06, "loss": 0.1387, "step": 1858 }, { "epoch": 1.7357609710550888, "grad_norm": 2.22065615152886, "learning_rate": 9.470172195164673e-06, "loss": 0.1109, "step": 1859 }, { "epoch": 1.7366946778711485, "grad_norm": 3.0271986789116982, "learning_rate": 9.469494561037097e-06, "loss": 0.1305, "step": 1860 }, { "epoch": 1.7376283846872083, "grad_norm": 2.9915273197468055, "learning_rate": 9.468816518125162e-06, "loss": 0.0307, "step": 1861 }, { "epoch": 1.738562091503268, "grad_norm": 2.6203452649471495, "learning_rate": 9.468138066490877e-06, "loss": 0.1639, "step": 1862 }, { "epoch": 1.7394957983193278, "grad_norm": 2.9155749180552277, "learning_rate": 9.467459206196298e-06, "loss": 0.1435, "step": 1863 }, { "epoch": 1.7404295051353875, "grad_norm": 2.463672079583639, "learning_rate": 9.466779937303511e-06, "loss": 0.1005, "step": 1864 }, { "epoch": 1.7413632119514473, "grad_norm": 1.210273510881077, "learning_rate": 9.466100259874647e-06, "loss": 0.0624, "step": 1865 }, { "epoch": 1.742296918767507, "grad_norm": 1.066916050288903, "learning_rate": 9.465420173971867e-06, "loss": 0.0555, "step": 1866 }, { "epoch": 1.7432306255835668, "grad_norm": 1.289455394030475, "learning_rate": 9.464739679657371e-06, "loss": 0.0696, "step": 1867 }, { "epoch": 1.7441643323996265, "grad_norm": 1.9755841018800522, "learning_rate": 9.4640587769934e-06, "loss": 0.0394, "step": 1868 }, { "epoch": 1.7450980392156863, "grad_norm": 3.33003766204254, "learning_rate": 9.463377466042229e-06, "loss": 0.1972, "step": 1869 }, { "epoch": 1.746031746031746, "grad_norm": 3.3142241101496226, "learning_rate": 9.462695746866173e-06, "loss": 0.1346, "step": 1870 }, { "epoch": 1.7469654528478058, "grad_norm": 2.5016722732605476, "learning_rate": 9.462013619527581e-06, "loss": 0.1639, "step": 1871 }, { "epoch": 1.7478991596638656, "grad_norm": 2.3002392162184973, "learning_rate": 9.46133108408884e-06, "loss": 0.0922, "step": 1872 }, { "epoch": 1.7488328664799253, "grad_norm": 0.9436792809665957, "learning_rate": 9.460648140612379e-06, "loss": 0.0154, "step": 1873 }, { "epoch": 1.749766573295985, "grad_norm": 5.586001612615263, "learning_rate": 9.459964789160659e-06, "loss": 0.2435, "step": 1874 }, { "epoch": 1.7507002801120448, "grad_norm": 1.3635059803910237, "learning_rate": 9.459281029796177e-06, "loss": 0.056, "step": 1875 }, { "epoch": 1.7516339869281046, "grad_norm": 2.0065201731458298, "learning_rate": 9.458596862581476e-06, "loss": 0.0307, "step": 1876 }, { "epoch": 1.7525676937441643, "grad_norm": 2.5497321189028175, "learning_rate": 9.457912287579125e-06, "loss": 0.144, "step": 1877 }, { "epoch": 1.753501400560224, "grad_norm": 0.5934173581330368, "learning_rate": 9.457227304851739e-06, "loss": 0.0065, "step": 1878 }, { "epoch": 1.7544351073762838, "grad_norm": 0.7460558468871772, "learning_rate": 9.456541914461966e-06, "loss": 0.0294, "step": 1879 }, { "epoch": 1.7553688141923436, "grad_norm": 3.19232136817103, "learning_rate": 9.455856116472492e-06, "loss": 0.17, "step": 1880 }, { "epoch": 1.7563025210084033, "grad_norm": 1.2617069001348333, "learning_rate": 9.455169910946044e-06, "loss": 0.0424, "step": 1881 }, { "epoch": 1.757236227824463, "grad_norm": 4.438132032736028, "learning_rate": 9.45448329794538e-06, "loss": 0.1314, "step": 1882 }, { "epoch": 1.7581699346405228, "grad_norm": 3.208536188612863, "learning_rate": 9.4537962775333e-06, "loss": 0.1452, "step": 1883 }, { "epoch": 1.7591036414565826, "grad_norm": 5.498859134747674, "learning_rate": 9.45310884977264e-06, "loss": 0.3323, "step": 1884 }, { "epoch": 1.7600373482726424, "grad_norm": 1.8595561207470424, "learning_rate": 9.452421014726268e-06, "loss": 0.0447, "step": 1885 }, { "epoch": 1.760971055088702, "grad_norm": 3.6258584738854287, "learning_rate": 9.4517327724571e-06, "loss": 0.1914, "step": 1886 }, { "epoch": 1.7619047619047619, "grad_norm": 1.770273618342888, "learning_rate": 9.45104412302808e-06, "loss": 0.09, "step": 1887 }, { "epoch": 1.7628384687208216, "grad_norm": 3.505301177925097, "learning_rate": 9.450355066502192e-06, "loss": 0.342, "step": 1888 }, { "epoch": 1.7637721755368814, "grad_norm": 2.008773079995072, "learning_rate": 9.449665602942461e-06, "loss": 0.0255, "step": 1889 }, { "epoch": 1.7647058823529411, "grad_norm": 1.6067455307863785, "learning_rate": 9.448975732411944e-06, "loss": 0.0422, "step": 1890 }, { "epoch": 1.7656395891690009, "grad_norm": 1.7938381046420258, "learning_rate": 9.448285454973739e-06, "loss": 0.0956, "step": 1891 }, { "epoch": 1.7665732959850606, "grad_norm": 1.5685961170954443, "learning_rate": 9.447594770690975e-06, "loss": 0.0501, "step": 1892 }, { "epoch": 1.7675070028011204, "grad_norm": 0.8810447264866351, "learning_rate": 9.446903679626827e-06, "loss": 0.0273, "step": 1893 }, { "epoch": 1.7684407096171801, "grad_norm": 3.316489849864457, "learning_rate": 9.446212181844501e-06, "loss": 0.1416, "step": 1894 }, { "epoch": 1.76937441643324, "grad_norm": 4.562432914087363, "learning_rate": 9.445520277407243e-06, "loss": 0.1662, "step": 1895 }, { "epoch": 1.7703081232492996, "grad_norm": 3.603433966779492, "learning_rate": 9.444827966378333e-06, "loss": 0.0412, "step": 1896 }, { "epoch": 1.7712418300653594, "grad_norm": 0.7606703887553756, "learning_rate": 9.444135248821096e-06, "loss": 0.0215, "step": 1897 }, { "epoch": 1.7721755368814192, "grad_norm": 4.001828827116805, "learning_rate": 9.443442124798882e-06, "loss": 0.179, "step": 1898 }, { "epoch": 1.773109243697479, "grad_norm": 2.323304732822115, "learning_rate": 9.44274859437509e-06, "loss": 0.1037, "step": 1899 }, { "epoch": 1.7740429505135387, "grad_norm": 5.536632150796641, "learning_rate": 9.442054657613146e-06, "loss": 0.061, "step": 1900 }, { "epoch": 1.7749766573295984, "grad_norm": 0.9179357565679765, "learning_rate": 9.441360314576522e-06, "loss": 0.03, "step": 1901 }, { "epoch": 1.7759103641456582, "grad_norm": 2.1755362477808444, "learning_rate": 9.440665565328722e-06, "loss": 0.0914, "step": 1902 }, { "epoch": 1.776844070961718, "grad_norm": 3.4162953183426614, "learning_rate": 9.439970409933288e-06, "loss": 0.1674, "step": 1903 }, { "epoch": 1.7777777777777777, "grad_norm": 5.060592536146544, "learning_rate": 9.4392748484538e-06, "loss": 0.1098, "step": 1904 }, { "epoch": 1.7787114845938374, "grad_norm": 8.077044612190836, "learning_rate": 9.438578880953875e-06, "loss": 0.0538, "step": 1905 }, { "epoch": 1.7796451914098972, "grad_norm": 4.86416163404006, "learning_rate": 9.437882507497167e-06, "loss": 0.0837, "step": 1906 }, { "epoch": 1.780578898225957, "grad_norm": 1.9141049922879, "learning_rate": 9.437185728147366e-06, "loss": 0.0272, "step": 1907 }, { "epoch": 1.7815126050420167, "grad_norm": 1.9520795548799252, "learning_rate": 9.436488542968203e-06, "loss": 0.1031, "step": 1908 }, { "epoch": 1.7824463118580764, "grad_norm": 2.033113942171644, "learning_rate": 9.43579095202344e-06, "loss": 0.0553, "step": 1909 }, { "epoch": 1.7833800186741362, "grad_norm": 4.293170536404016, "learning_rate": 9.43509295537688e-06, "loss": 0.2008, "step": 1910 }, { "epoch": 1.784313725490196, "grad_norm": 5.385840836113783, "learning_rate": 9.434394553092363e-06, "loss": 0.381, "step": 1911 }, { "epoch": 1.7852474323062557, "grad_norm": 8.082763419319427, "learning_rate": 9.433695745233767e-06, "loss": 0.2551, "step": 1912 }, { "epoch": 1.7861811391223155, "grad_norm": 2.1243298696197086, "learning_rate": 9.432996531865001e-06, "loss": 0.1656, "step": 1913 }, { "epoch": 1.7871148459383752, "grad_norm": 5.515272233300086, "learning_rate": 9.432296913050021e-06, "loss": 0.0813, "step": 1914 }, { "epoch": 1.788048552754435, "grad_norm": 1.3048753083868343, "learning_rate": 9.431596888852812e-06, "loss": 0.069, "step": 1915 }, { "epoch": 1.7889822595704947, "grad_norm": 3.0026755062881163, "learning_rate": 9.4308964593374e-06, "loss": 0.1056, "step": 1916 }, { "epoch": 1.7899159663865545, "grad_norm": 1.5676105352710115, "learning_rate": 9.430195624567846e-06, "loss": 0.0549, "step": 1917 }, { "epoch": 1.7908496732026142, "grad_norm": 1.765244834237023, "learning_rate": 9.42949438460825e-06, "loss": 0.0565, "step": 1918 }, { "epoch": 1.791783380018674, "grad_norm": 1.6246224208916058, "learning_rate": 9.428792739522747e-06, "loss": 0.0362, "step": 1919 }, { "epoch": 1.7927170868347337, "grad_norm": 1.9687348517442693, "learning_rate": 9.428090689375513e-06, "loss": 0.0874, "step": 1920 }, { "epoch": 1.7936507936507935, "grad_norm": 1.4822653470307912, "learning_rate": 9.427388234230754e-06, "loss": 0.0413, "step": 1921 }, { "epoch": 1.7945845004668532, "grad_norm": 4.977073736188033, "learning_rate": 9.426685374152718e-06, "loss": 0.1716, "step": 1922 }, { "epoch": 1.795518207282913, "grad_norm": 2.8966799150866387, "learning_rate": 9.425982109205693e-06, "loss": 0.0814, "step": 1923 }, { "epoch": 1.7964519140989728, "grad_norm": 1.9424428892493713, "learning_rate": 9.425278439453997e-06, "loss": 0.0518, "step": 1924 }, { "epoch": 1.7973856209150327, "grad_norm": 2.5628132988587407, "learning_rate": 9.424574364961989e-06, "loss": 0.1446, "step": 1925 }, { "epoch": 1.7983193277310925, "grad_norm": 1.9564167557081138, "learning_rate": 9.423869885794064e-06, "loss": 0.0975, "step": 1926 }, { "epoch": 1.7992530345471522, "grad_norm": 2.837939143769396, "learning_rate": 9.423165002014655e-06, "loss": 0.1134, "step": 1927 }, { "epoch": 1.800186741363212, "grad_norm": 2.3529564817065807, "learning_rate": 9.422459713688233e-06, "loss": 0.102, "step": 1928 }, { "epoch": 1.8011204481792717, "grad_norm": 4.478394436820673, "learning_rate": 9.4217540208793e-06, "loss": 0.074, "step": 1929 }, { "epoch": 1.8020541549953315, "grad_norm": 1.197146669714803, "learning_rate": 9.421047923652404e-06, "loss": 0.0481, "step": 1930 }, { "epoch": 1.8029878618113913, "grad_norm": 2.0784942216829783, "learning_rate": 9.420341422072124e-06, "loss": 0.0864, "step": 1931 }, { "epoch": 1.803921568627451, "grad_norm": 1.917897081091645, "learning_rate": 9.419634516203074e-06, "loss": 0.0833, "step": 1932 }, { "epoch": 1.8048552754435108, "grad_norm": 2.5984904524368844, "learning_rate": 9.418927206109914e-06, "loss": 0.0671, "step": 1933 }, { "epoch": 1.8057889822595705, "grad_norm": 3.9476176925433846, "learning_rate": 9.418219491857329e-06, "loss": 0.2018, "step": 1934 }, { "epoch": 1.8067226890756303, "grad_norm": 1.0722927411773508, "learning_rate": 9.417511373510052e-06, "loss": 0.057, "step": 1935 }, { "epoch": 1.80765639589169, "grad_norm": 4.771938530478485, "learning_rate": 9.416802851132847e-06, "loss": 0.1994, "step": 1936 }, { "epoch": 1.8085901027077498, "grad_norm": 3.786891649656435, "learning_rate": 9.416093924790515e-06, "loss": 0.0377, "step": 1937 }, { "epoch": 1.8095238095238095, "grad_norm": 2.6031722482845807, "learning_rate": 9.415384594547897e-06, "loss": 0.1044, "step": 1938 }, { "epoch": 1.8104575163398693, "grad_norm": 4.542417168578123, "learning_rate": 9.414674860469865e-06, "loss": 0.1419, "step": 1939 }, { "epoch": 1.811391223155929, "grad_norm": 4.23187210899233, "learning_rate": 9.413964722621339e-06, "loss": 0.3348, "step": 1940 }, { "epoch": 1.8123249299719888, "grad_norm": 4.812303331979852, "learning_rate": 9.413254181067261e-06, "loss": 0.1702, "step": 1941 }, { "epoch": 1.8132586367880486, "grad_norm": 3.324837315782138, "learning_rate": 9.412543235872624e-06, "loss": 0.1485, "step": 1942 }, { "epoch": 1.8141923436041083, "grad_norm": 2.221249822312652, "learning_rate": 9.411831887102449e-06, "loss": 0.1172, "step": 1943 }, { "epoch": 1.815126050420168, "grad_norm": 3.7568504121254116, "learning_rate": 9.411120134821796e-06, "loss": 0.1266, "step": 1944 }, { "epoch": 1.8160597572362278, "grad_norm": 1.0090883165198659, "learning_rate": 9.410407979095765e-06, "loss": 0.0181, "step": 1945 }, { "epoch": 1.8169934640522876, "grad_norm": 3.270182625979887, "learning_rate": 9.409695419989488e-06, "loss": 0.1099, "step": 1946 }, { "epoch": 1.8179271708683473, "grad_norm": 1.702159540762874, "learning_rate": 9.408982457568138e-06, "loss": 0.0838, "step": 1947 }, { "epoch": 1.818860877684407, "grad_norm": 2.0698327762182807, "learning_rate": 9.408269091896923e-06, "loss": 0.0737, "step": 1948 }, { "epoch": 1.8197945845004668, "grad_norm": 1.452202449270915, "learning_rate": 9.407555323041088e-06, "loss": 0.0385, "step": 1949 }, { "epoch": 1.8207282913165266, "grad_norm": 1.1159019384589086, "learning_rate": 9.406841151065914e-06, "loss": 0.0424, "step": 1950 }, { "epoch": 1.8216619981325863, "grad_norm": 3.41485955631308, "learning_rate": 9.40612657603672e-06, "loss": 0.1732, "step": 1951 }, { "epoch": 1.822595704948646, "grad_norm": 1.358362603244683, "learning_rate": 9.405411598018864e-06, "loss": 0.0778, "step": 1952 }, { "epoch": 1.8235294117647058, "grad_norm": 2.0472084746653296, "learning_rate": 9.404696217077735e-06, "loss": 0.0742, "step": 1953 }, { "epoch": 1.8244631185807656, "grad_norm": 1.232072804912432, "learning_rate": 9.403980433278768e-06, "loss": 0.1076, "step": 1954 }, { "epoch": 1.8253968253968254, "grad_norm": 1.259399901035565, "learning_rate": 9.403264246687423e-06, "loss": 0.0689, "step": 1955 }, { "epoch": 1.826330532212885, "grad_norm": 2.23978421696753, "learning_rate": 9.402547657369206e-06, "loss": 0.1051, "step": 1956 }, { "epoch": 1.8272642390289449, "grad_norm": 2.62261880847571, "learning_rate": 9.401830665389658e-06, "loss": 0.1446, "step": 1957 }, { "epoch": 1.8281979458450048, "grad_norm": 1.9034546734271696, "learning_rate": 9.401113270814352e-06, "loss": 0.0201, "step": 1958 }, { "epoch": 1.8291316526610646, "grad_norm": 1.6556728159029217, "learning_rate": 9.400395473708907e-06, "loss": 0.0972, "step": 1959 }, { "epoch": 1.8300653594771243, "grad_norm": 0.9331420113690944, "learning_rate": 9.399677274138969e-06, "loss": 0.0317, "step": 1960 }, { "epoch": 1.830999066293184, "grad_norm": 1.8707926876580434, "learning_rate": 9.398958672170227e-06, "loss": 0.0414, "step": 1961 }, { "epoch": 1.8319327731092439, "grad_norm": 3.804555749029013, "learning_rate": 9.398239667868405e-06, "loss": 0.1773, "step": 1962 }, { "epoch": 1.8328664799253036, "grad_norm": 0.5015916631128209, "learning_rate": 9.397520261299265e-06, "loss": 0.0078, "step": 1963 }, { "epoch": 1.8338001867413634, "grad_norm": 2.8364574929513613, "learning_rate": 9.396800452528604e-06, "loss": 0.1358, "step": 1964 }, { "epoch": 1.8347338935574231, "grad_norm": 2.6764551703924195, "learning_rate": 9.396080241622252e-06, "loss": 0.0901, "step": 1965 }, { "epoch": 1.8356676003734829, "grad_norm": 1.639056338330199, "learning_rate": 9.395359628646087e-06, "loss": 0.0801, "step": 1966 }, { "epoch": 1.8366013071895426, "grad_norm": 1.296654630744137, "learning_rate": 9.394638613666013e-06, "loss": 0.0407, "step": 1967 }, { "epoch": 1.8375350140056024, "grad_norm": 1.3371550668957177, "learning_rate": 9.393917196747976e-06, "loss": 0.0427, "step": 1968 }, { "epoch": 1.8384687208216621, "grad_norm": 2.180476199480724, "learning_rate": 9.393195377957958e-06, "loss": 0.0962, "step": 1969 }, { "epoch": 1.8394024276377219, "grad_norm": 0.9682443239521774, "learning_rate": 9.392473157361974e-06, "loss": 0.0452, "step": 1970 }, { "epoch": 1.8403361344537816, "grad_norm": 2.0025180954276185, "learning_rate": 9.391750535026085e-06, "loss": 0.0717, "step": 1971 }, { "epoch": 1.8412698412698414, "grad_norm": 2.997412158419331, "learning_rate": 9.391027511016377e-06, "loss": 0.1154, "step": 1972 }, { "epoch": 1.8422035480859011, "grad_norm": 4.27846866217144, "learning_rate": 9.39030408539898e-06, "loss": 0.1665, "step": 1973 }, { "epoch": 1.843137254901961, "grad_norm": 2.1474441436052745, "learning_rate": 9.389580258240062e-06, "loss": 0.1187, "step": 1974 }, { "epoch": 1.8440709617180207, "grad_norm": 3.0790690737880317, "learning_rate": 9.388856029605821e-06, "loss": 0.2142, "step": 1975 }, { "epoch": 1.8450046685340804, "grad_norm": 3.7273842892033975, "learning_rate": 9.3881313995625e-06, "loss": 0.1713, "step": 1976 }, { "epoch": 1.8459383753501402, "grad_norm": 2.8895521583866834, "learning_rate": 9.38740636817637e-06, "loss": 0.0623, "step": 1977 }, { "epoch": 1.8468720821662, "grad_norm": 3.442132006446569, "learning_rate": 9.386680935513746e-06, "loss": 0.1692, "step": 1978 }, { "epoch": 1.8478057889822597, "grad_norm": 1.7708959208872586, "learning_rate": 9.385955101640974e-06, "loss": 0.0619, "step": 1979 }, { "epoch": 1.8487394957983194, "grad_norm": 6.001708753106752, "learning_rate": 9.385228866624443e-06, "loss": 0.2408, "step": 1980 }, { "epoch": 1.8496732026143792, "grad_norm": 1.231903933164756, "learning_rate": 9.384502230530572e-06, "loss": 0.0379, "step": 1981 }, { "epoch": 1.850606909430439, "grad_norm": 2.1847002650106124, "learning_rate": 9.383775193425823e-06, "loss": 0.1541, "step": 1982 }, { "epoch": 1.8515406162464987, "grad_norm": 1.8147544980102859, "learning_rate": 9.383047755376688e-06, "loss": 0.0551, "step": 1983 }, { "epoch": 1.8524743230625584, "grad_norm": 3.046618145886767, "learning_rate": 9.382319916449703e-06, "loss": 0.1077, "step": 1984 }, { "epoch": 1.8534080298786182, "grad_norm": 2.476692084445991, "learning_rate": 9.381591676711432e-06, "loss": 0.1048, "step": 1985 }, { "epoch": 1.854341736694678, "grad_norm": 1.1695721027566282, "learning_rate": 9.380863036228488e-06, "loss": 0.0325, "step": 1986 }, { "epoch": 1.8552754435107377, "grad_norm": 1.0409515763563646, "learning_rate": 9.380133995067503e-06, "loss": 0.0418, "step": 1987 }, { "epoch": 1.8562091503267975, "grad_norm": 6.041240681470535, "learning_rate": 9.379404553295165e-06, "loss": 0.2366, "step": 1988 }, { "epoch": 1.8571428571428572, "grad_norm": 1.7874746697559893, "learning_rate": 9.378674710978185e-06, "loss": 0.0859, "step": 1989 }, { "epoch": 1.858076563958917, "grad_norm": 2.5979669158138883, "learning_rate": 9.377944468183315e-06, "loss": 0.1945, "step": 1990 }, { "epoch": 1.8590102707749767, "grad_norm": 1.4604477052510811, "learning_rate": 9.377213824977347e-06, "loss": 0.0491, "step": 1991 }, { "epoch": 1.8599439775910365, "grad_norm": 1.7598361670706608, "learning_rate": 9.376482781427101e-06, "loss": 0.0427, "step": 1992 }, { "epoch": 1.8608776844070962, "grad_norm": 1.657675121703688, "learning_rate": 9.375751337599444e-06, "loss": 0.0274, "step": 1993 }, { "epoch": 1.861811391223156, "grad_norm": 3.4838590884316143, "learning_rate": 9.375019493561273e-06, "loss": 0.1826, "step": 1994 }, { "epoch": 1.8627450980392157, "grad_norm": 1.497711391163974, "learning_rate": 9.37428724937952e-06, "loss": 0.0681, "step": 1995 }, { "epoch": 1.8636788048552755, "grad_norm": 1.5267544635027293, "learning_rate": 9.373554605121163e-06, "loss": 0.0519, "step": 1996 }, { "epoch": 1.8646125116713352, "grad_norm": 1.6195933223854915, "learning_rate": 9.372821560853205e-06, "loss": 0.0881, "step": 1997 }, { "epoch": 1.865546218487395, "grad_norm": 1.5370906722017283, "learning_rate": 9.372088116642693e-06, "loss": 0.0746, "step": 1998 }, { "epoch": 1.8664799253034547, "grad_norm": 3.630121997058771, "learning_rate": 9.371354272556707e-06, "loss": 0.1579, "step": 1999 }, { "epoch": 1.8674136321195145, "grad_norm": 6.579268953353069, "learning_rate": 9.370620028662368e-06, "loss": 0.2054, "step": 2000 }, { "epoch": 1.8683473389355743, "grad_norm": 3.157078733246885, "learning_rate": 9.36988538502683e-06, "loss": 0.1727, "step": 2001 }, { "epoch": 1.869281045751634, "grad_norm": 1.1030853897170865, "learning_rate": 9.369150341717282e-06, "loss": 0.0454, "step": 2002 }, { "epoch": 1.8702147525676938, "grad_norm": 0.7571059068426924, "learning_rate": 9.368414898800951e-06, "loss": 0.0237, "step": 2003 }, { "epoch": 1.8711484593837535, "grad_norm": 2.5426835058994364, "learning_rate": 9.367679056345107e-06, "loss": 0.12, "step": 2004 }, { "epoch": 1.8720821661998133, "grad_norm": 4.580988327178791, "learning_rate": 9.366942814417046e-06, "loss": 0.2691, "step": 2005 }, { "epoch": 1.873015873015873, "grad_norm": 2.5746204477634804, "learning_rate": 9.366206173084105e-06, "loss": 0.0607, "step": 2006 }, { "epoch": 1.8739495798319328, "grad_norm": 3.321314722539813, "learning_rate": 9.365469132413661e-06, "loss": 0.1716, "step": 2007 }, { "epoch": 1.8748832866479925, "grad_norm": 3.9353682333139353, "learning_rate": 9.364731692473124e-06, "loss": 0.1939, "step": 2008 }, { "epoch": 1.8758169934640523, "grad_norm": 1.5349648657440351, "learning_rate": 9.36399385332994e-06, "loss": 0.0751, "step": 2009 }, { "epoch": 1.876750700280112, "grad_norm": 1.5953524847668006, "learning_rate": 9.363255615051591e-06, "loss": 0.0599, "step": 2010 }, { "epoch": 1.8776844070961718, "grad_norm": 0.9988568855311079, "learning_rate": 9.3625169777056e-06, "loss": 0.0294, "step": 2011 }, { "epoch": 1.8786181139122315, "grad_norm": 1.2295381944150001, "learning_rate": 9.361777941359522e-06, "loss": 0.056, "step": 2012 }, { "epoch": 1.8795518207282913, "grad_norm": 1.4619096051807652, "learning_rate": 9.361038506080949e-06, "loss": 0.084, "step": 2013 }, { "epoch": 1.880485527544351, "grad_norm": 8.170275855329564, "learning_rate": 9.360298671937514e-06, "loss": 0.3461, "step": 2014 }, { "epoch": 1.8814192343604108, "grad_norm": 2.1003847547329415, "learning_rate": 9.359558438996878e-06, "loss": 0.1161, "step": 2015 }, { "epoch": 1.8823529411764706, "grad_norm": 1.9055700352390974, "learning_rate": 9.358817807326745e-06, "loss": 0.0569, "step": 2016 }, { "epoch": 1.8832866479925303, "grad_norm": 3.8594332345225615, "learning_rate": 9.358076776994858e-06, "loss": 0.2113, "step": 2017 }, { "epoch": 1.88422035480859, "grad_norm": 0.8507604626148614, "learning_rate": 9.357335348068988e-06, "loss": 0.0341, "step": 2018 }, { "epoch": 1.8851540616246498, "grad_norm": 1.0299662643931355, "learning_rate": 9.356593520616948e-06, "loss": 0.0286, "step": 2019 }, { "epoch": 1.8860877684407096, "grad_norm": 2.181473862418066, "learning_rate": 9.355851294706587e-06, "loss": 0.1105, "step": 2020 }, { "epoch": 1.8870214752567693, "grad_norm": 2.731601225386847, "learning_rate": 9.355108670405786e-06, "loss": 0.1581, "step": 2021 }, { "epoch": 1.887955182072829, "grad_norm": 5.597040635967341, "learning_rate": 9.354365647782472e-06, "loss": 0.1706, "step": 2022 }, { "epoch": 1.8888888888888888, "grad_norm": 2.615048753930359, "learning_rate": 9.3536222269046e-06, "loss": 0.1239, "step": 2023 }, { "epoch": 1.8898225957049486, "grad_norm": 2.272395787135475, "learning_rate": 9.352878407840162e-06, "loss": 0.1127, "step": 2024 }, { "epoch": 1.8907563025210083, "grad_norm": 1.3803804676732865, "learning_rate": 9.35213419065719e-06, "loss": 0.0513, "step": 2025 }, { "epoch": 1.891690009337068, "grad_norm": 0.7378269678044179, "learning_rate": 9.351389575423752e-06, "loss": 0.0246, "step": 2026 }, { "epoch": 1.8926237161531279, "grad_norm": 7.57725930373036, "learning_rate": 9.350644562207951e-06, "loss": 0.1921, "step": 2027 }, { "epoch": 1.8935574229691876, "grad_norm": 3.277708817699089, "learning_rate": 9.349899151077925e-06, "loss": 0.1209, "step": 2028 }, { "epoch": 1.8944911297852474, "grad_norm": 2.31952143046171, "learning_rate": 9.34915334210185e-06, "loss": 0.1077, "step": 2029 }, { "epoch": 1.8954248366013071, "grad_norm": 2.2366631031611086, "learning_rate": 9.348407135347941e-06, "loss": 0.1485, "step": 2030 }, { "epoch": 1.8963585434173669, "grad_norm": 3.101020134395407, "learning_rate": 9.347660530884443e-06, "loss": 0.1326, "step": 2031 }, { "epoch": 1.8972922502334266, "grad_norm": 1.3403289512890366, "learning_rate": 9.346913528779646e-06, "loss": 0.0661, "step": 2032 }, { "epoch": 1.8982259570494864, "grad_norm": 2.6309761967699887, "learning_rate": 9.346166129101867e-06, "loss": 0.0746, "step": 2033 }, { "epoch": 1.8991596638655461, "grad_norm": 3.675740640355808, "learning_rate": 9.345418331919467e-06, "loss": 0.1575, "step": 2034 }, { "epoch": 1.9000933706816059, "grad_norm": 3.710813541017335, "learning_rate": 9.344670137300837e-06, "loss": 0.1512, "step": 2035 }, { "epoch": 1.9010270774976656, "grad_norm": 1.968801109853747, "learning_rate": 9.343921545314412e-06, "loss": 0.1502, "step": 2036 }, { "epoch": 1.9019607843137254, "grad_norm": 1.532634101624356, "learning_rate": 9.343172556028655e-06, "loss": 0.0484, "step": 2037 }, { "epoch": 1.9028944911297851, "grad_norm": 1.8609562187169697, "learning_rate": 9.342423169512072e-06, "loss": 0.1345, "step": 2038 }, { "epoch": 1.903828197945845, "grad_norm": 0.801736756350878, "learning_rate": 9.341673385833201e-06, "loss": 0.0252, "step": 2039 }, { "epoch": 1.9047619047619047, "grad_norm": 3.8396599365706714, "learning_rate": 9.340923205060617e-06, "loss": 0.2204, "step": 2040 }, { "epoch": 1.9056956115779644, "grad_norm": 2.6236460270399147, "learning_rate": 9.340172627262935e-06, "loss": 0.107, "step": 2041 }, { "epoch": 1.9066293183940242, "grad_norm": 2.0975606783897587, "learning_rate": 9.339421652508803e-06, "loss": 0.0904, "step": 2042 }, { "epoch": 1.907563025210084, "grad_norm": 1.3797925196733531, "learning_rate": 9.338670280866905e-06, "loss": 0.0617, "step": 2043 }, { "epoch": 1.9084967320261437, "grad_norm": 1.2882121961764195, "learning_rate": 9.337918512405961e-06, "loss": 0.0573, "step": 2044 }, { "epoch": 1.9094304388422034, "grad_norm": 0.596323805765308, "learning_rate": 9.337166347194732e-06, "loss": 0.0291, "step": 2045 }, { "epoch": 1.9103641456582632, "grad_norm": 1.4084467704693018, "learning_rate": 9.336413785302007e-06, "loss": 0.0932, "step": 2046 }, { "epoch": 1.911297852474323, "grad_norm": 0.9835277889101605, "learning_rate": 9.335660826796622e-06, "loss": 0.0275, "step": 2047 }, { "epoch": 1.9122315592903827, "grad_norm": 3.755543538513655, "learning_rate": 9.334907471747438e-06, "loss": 0.0744, "step": 2048 }, { "epoch": 1.9131652661064424, "grad_norm": 3.896481223478455, "learning_rate": 9.33415372022336e-06, "loss": 0.0395, "step": 2049 }, { "epoch": 1.9140989729225022, "grad_norm": 4.308311795425239, "learning_rate": 9.333399572293325e-06, "loss": 0.1858, "step": 2050 }, { "epoch": 1.915032679738562, "grad_norm": 2.2101798311535776, "learning_rate": 9.332645028026312e-06, "loss": 0.0965, "step": 2051 }, { "epoch": 1.9159663865546217, "grad_norm": 2.9002336322626516, "learning_rate": 9.331890087491328e-06, "loss": 0.1197, "step": 2052 }, { "epoch": 1.9169000933706815, "grad_norm": 0.6344450977671839, "learning_rate": 9.331134750757425e-06, "loss": 0.0204, "step": 2053 }, { "epoch": 1.9178338001867412, "grad_norm": 2.2924179553121062, "learning_rate": 9.330379017893683e-06, "loss": 0.1449, "step": 2054 }, { "epoch": 1.918767507002801, "grad_norm": 1.2756634767107031, "learning_rate": 9.329622888969221e-06, "loss": 0.0245, "step": 2055 }, { "epoch": 1.9197012138188607, "grad_norm": 1.4436848755377643, "learning_rate": 9.328866364053202e-06, "loss": 0.0871, "step": 2056 }, { "epoch": 1.9206349206349205, "grad_norm": 0.7383889941850373, "learning_rate": 9.328109443214812e-06, "loss": 0.0178, "step": 2057 }, { "epoch": 1.9215686274509802, "grad_norm": 2.143746745705035, "learning_rate": 9.327352126523282e-06, "loss": 0.0874, "step": 2058 }, { "epoch": 1.9225023342670402, "grad_norm": 0.6414419791442038, "learning_rate": 9.326594414047877e-06, "loss": 0.023, "step": 2059 }, { "epoch": 1.9234360410831, "grad_norm": 3.339274056775656, "learning_rate": 9.325836305857898e-06, "loss": 0.1763, "step": 2060 }, { "epoch": 1.9243697478991597, "grad_norm": 2.6708683792434686, "learning_rate": 9.325077802022682e-06, "loss": 0.1593, "step": 2061 }, { "epoch": 1.9253034547152195, "grad_norm": 3.2696065167095942, "learning_rate": 9.324318902611604e-06, "loss": 0.122, "step": 2062 }, { "epoch": 1.9262371615312792, "grad_norm": 3.1812506617887, "learning_rate": 9.323559607694072e-06, "loss": 0.1499, "step": 2063 }, { "epoch": 1.927170868347339, "grad_norm": 0.8524680893927084, "learning_rate": 9.322799917339532e-06, "loss": 0.0331, "step": 2064 }, { "epoch": 1.9281045751633987, "grad_norm": 0.9387215464006282, "learning_rate": 9.322039831617466e-06, "loss": 0.034, "step": 2065 }, { "epoch": 1.9290382819794585, "grad_norm": 1.4520822372907634, "learning_rate": 9.321279350597394e-06, "loss": 0.0566, "step": 2066 }, { "epoch": 1.9299719887955182, "grad_norm": 2.414608618305295, "learning_rate": 9.320518474348868e-06, "loss": 0.1522, "step": 2067 }, { "epoch": 1.930905695611578, "grad_norm": 1.1204940060633974, "learning_rate": 9.31975720294148e-06, "loss": 0.0729, "step": 2068 }, { "epoch": 1.9318394024276377, "grad_norm": 2.7431436489011616, "learning_rate": 9.318995536444857e-06, "loss": 0.1342, "step": 2069 }, { "epoch": 1.9327731092436975, "grad_norm": 3.2280466334455773, "learning_rate": 9.31823347492866e-06, "loss": 0.2382, "step": 2070 }, { "epoch": 1.9337068160597572, "grad_norm": 1.6401964256809571, "learning_rate": 9.317471018462587e-06, "loss": 0.0474, "step": 2071 }, { "epoch": 1.934640522875817, "grad_norm": 2.424564321031303, "learning_rate": 9.316708167116377e-06, "loss": 0.0764, "step": 2072 }, { "epoch": 1.9355742296918768, "grad_norm": 3.6245231667229976, "learning_rate": 9.315944920959798e-06, "loss": 0.0984, "step": 2073 }, { "epoch": 1.9365079365079365, "grad_norm": 2.188231272735473, "learning_rate": 9.315181280062658e-06, "loss": 0.054, "step": 2074 }, { "epoch": 1.9374416433239963, "grad_norm": 1.7375342109113774, "learning_rate": 9.314417244494801e-06, "loss": 0.0236, "step": 2075 }, { "epoch": 1.938375350140056, "grad_norm": 2.2652297765977627, "learning_rate": 9.313652814326106e-06, "loss": 0.124, "step": 2076 }, { "epoch": 1.9393090569561158, "grad_norm": 8.774796840362665, "learning_rate": 9.312887989626488e-06, "loss": 0.3397, "step": 2077 }, { "epoch": 1.9402427637721755, "grad_norm": 1.7693422408797743, "learning_rate": 9.312122770465899e-06, "loss": 0.0705, "step": 2078 }, { "epoch": 1.9411764705882353, "grad_norm": 3.0161637794308103, "learning_rate": 9.311357156914327e-06, "loss": 0.1627, "step": 2079 }, { "epoch": 1.942110177404295, "grad_norm": 7.463123713234923, "learning_rate": 9.310591149041795e-06, "loss": 0.184, "step": 2080 }, { "epoch": 1.9430438842203548, "grad_norm": 1.8612773237375317, "learning_rate": 9.309824746918365e-06, "loss": 0.0611, "step": 2081 }, { "epoch": 1.9439775910364145, "grad_norm": 2.2077816729436677, "learning_rate": 9.309057950614129e-06, "loss": 0.1216, "step": 2082 }, { "epoch": 1.9449112978524743, "grad_norm": 3.0395829906314433, "learning_rate": 9.308290760199224e-06, "loss": 0.1222, "step": 2083 }, { "epoch": 1.945845004668534, "grad_norm": 3.106538412785197, "learning_rate": 9.307523175743812e-06, "loss": 0.1629, "step": 2084 }, { "epoch": 1.9467787114845938, "grad_norm": 4.809439641966218, "learning_rate": 9.306755197318103e-06, "loss": 0.0531, "step": 2085 }, { "epoch": 1.9477124183006536, "grad_norm": 1.363340732836654, "learning_rate": 9.305986824992332e-06, "loss": 0.0642, "step": 2086 }, { "epoch": 1.9486461251167133, "grad_norm": 1.7692552110392528, "learning_rate": 9.305218058836778e-06, "loss": 0.0404, "step": 2087 }, { "epoch": 1.949579831932773, "grad_norm": 1.1682945415704387, "learning_rate": 9.304448898921753e-06, "loss": 0.0435, "step": 2088 }, { "epoch": 1.9505135387488328, "grad_norm": 2.754283476981043, "learning_rate": 9.303679345317604e-06, "loss": 0.0829, "step": 2089 }, { "epoch": 1.9514472455648926, "grad_norm": 1.3283831601536522, "learning_rate": 9.302909398094718e-06, "loss": 0.0355, "step": 2090 }, { "epoch": 1.9523809523809523, "grad_norm": 1.311876809005338, "learning_rate": 9.302139057323509e-06, "loss": 0.0533, "step": 2091 }, { "epoch": 1.9533146591970123, "grad_norm": 6.735397566348657, "learning_rate": 9.30136832307444e-06, "loss": 0.1204, "step": 2092 }, { "epoch": 1.954248366013072, "grad_norm": 1.4267851868807577, "learning_rate": 9.300597195418e-06, "loss": 0.067, "step": 2093 }, { "epoch": 1.9551820728291318, "grad_norm": 1.654466050234769, "learning_rate": 9.299825674424714e-06, "loss": 0.083, "step": 2094 }, { "epoch": 1.9561157796451916, "grad_norm": 3.1071868278767867, "learning_rate": 9.299053760165153e-06, "loss": 0.1591, "step": 2095 }, { "epoch": 1.9570494864612513, "grad_norm": 3.9278580423685696, "learning_rate": 9.298281452709911e-06, "loss": 0.1394, "step": 2096 }, { "epoch": 1.957983193277311, "grad_norm": 1.2018331377246625, "learning_rate": 9.297508752129626e-06, "loss": 0.0685, "step": 2097 }, { "epoch": 1.9589169000933708, "grad_norm": 6.838391200030262, "learning_rate": 9.296735658494973e-06, "loss": 0.1039, "step": 2098 }, { "epoch": 1.9598506069094306, "grad_norm": 2.1655655966066125, "learning_rate": 9.295962171876656e-06, "loss": 0.0967, "step": 2099 }, { "epoch": 1.9607843137254903, "grad_norm": 1.6882883828612858, "learning_rate": 9.295188292345418e-06, "loss": 0.0228, "step": 2100 }, { "epoch": 1.96171802054155, "grad_norm": 2.5725187091210278, "learning_rate": 9.294414019972043e-06, "loss": 0.1035, "step": 2101 }, { "epoch": 1.9626517273576098, "grad_norm": 2.9867099816283407, "learning_rate": 9.293639354827343e-06, "loss": 0.0566, "step": 2102 }, { "epoch": 1.9635854341736696, "grad_norm": 5.290746999513022, "learning_rate": 9.292864296982174e-06, "loss": 0.3465, "step": 2103 }, { "epoch": 1.9645191409897294, "grad_norm": 1.351765491983201, "learning_rate": 9.292088846507418e-06, "loss": 0.0715, "step": 2104 }, { "epoch": 1.965452847805789, "grad_norm": 3.32697837825502, "learning_rate": 9.291313003474001e-06, "loss": 0.1618, "step": 2105 }, { "epoch": 1.9663865546218489, "grad_norm": 2.3002722894830567, "learning_rate": 9.290536767952883e-06, "loss": 0.1153, "step": 2106 }, { "epoch": 1.9673202614379086, "grad_norm": 2.3038808395255557, "learning_rate": 9.289760140015059e-06, "loss": 0.099, "step": 2107 }, { "epoch": 1.9682539682539684, "grad_norm": 3.0638030736767408, "learning_rate": 9.28898311973156e-06, "loss": 0.0418, "step": 2108 }, { "epoch": 1.9691876750700281, "grad_norm": 1.7830040912303105, "learning_rate": 9.288205707173453e-06, "loss": 0.1347, "step": 2109 }, { "epoch": 1.9701213818860879, "grad_norm": 0.9944626070700625, "learning_rate": 9.28742790241184e-06, "loss": 0.0385, "step": 2110 }, { "epoch": 1.9710550887021476, "grad_norm": 2.3131595434372603, "learning_rate": 9.286649705517861e-06, "loss": 0.0381, "step": 2111 }, { "epoch": 1.9719887955182074, "grad_norm": 1.0575348881322357, "learning_rate": 9.28587111656269e-06, "loss": 0.0341, "step": 2112 }, { "epoch": 1.9729225023342671, "grad_norm": 1.5473486483433263, "learning_rate": 9.285092135617539e-06, "loss": 0.0499, "step": 2113 }, { "epoch": 1.973856209150327, "grad_norm": 1.2334276240030237, "learning_rate": 9.284312762753653e-06, "loss": 0.041, "step": 2114 }, { "epoch": 1.9747899159663866, "grad_norm": 2.291788250309788, "learning_rate": 9.283532998042314e-06, "loss": 0.0995, "step": 2115 }, { "epoch": 1.9757236227824464, "grad_norm": 3.477152055712409, "learning_rate": 9.28275284155484e-06, "loss": 0.0408, "step": 2116 }, { "epoch": 1.9766573295985062, "grad_norm": 1.0589981294111481, "learning_rate": 9.281972293362587e-06, "loss": 0.0442, "step": 2117 }, { "epoch": 1.977591036414566, "grad_norm": 2.1958190252739413, "learning_rate": 9.28119135353694e-06, "loss": 0.1769, "step": 2118 }, { "epoch": 1.9785247432306257, "grad_norm": 1.5814813764272058, "learning_rate": 9.28041002214933e-06, "loss": 0.0899, "step": 2119 }, { "epoch": 1.9794584500466854, "grad_norm": 1.9927572544889023, "learning_rate": 9.279628299271214e-06, "loss": 0.1108, "step": 2120 }, { "epoch": 1.9803921568627452, "grad_norm": 2.3948062485342043, "learning_rate": 9.278846184974094e-06, "loss": 0.1372, "step": 2121 }, { "epoch": 1.981325863678805, "grad_norm": 3.9644871547882334, "learning_rate": 9.278063679329498e-06, "loss": 0.1867, "step": 2122 }, { "epoch": 1.9822595704948647, "grad_norm": 4.1320221631438, "learning_rate": 9.277280782408996e-06, "loss": 0.2313, "step": 2123 }, { "epoch": 1.9831932773109244, "grad_norm": 1.548913342629816, "learning_rate": 9.276497494284193e-06, "loss": 0.0799, "step": 2124 }, { "epoch": 1.9841269841269842, "grad_norm": 3.200769942207771, "learning_rate": 9.275713815026732e-06, "loss": 0.1281, "step": 2125 }, { "epoch": 1.985060690943044, "grad_norm": 2.556181170345708, "learning_rate": 9.274929744708285e-06, "loss": 0.1609, "step": 2126 }, { "epoch": 1.9859943977591037, "grad_norm": 1.6285293248801116, "learning_rate": 9.274145283400566e-06, "loss": 0.0983, "step": 2127 }, { "epoch": 1.9869281045751634, "grad_norm": 1.7388365877521057, "learning_rate": 9.27336043117532e-06, "loss": 0.0744, "step": 2128 }, { "epoch": 1.9878618113912232, "grad_norm": 4.783705127870393, "learning_rate": 9.272575188104335e-06, "loss": 0.213, "step": 2129 }, { "epoch": 1.988795518207283, "grad_norm": 3.719287470681492, "learning_rate": 9.271789554259426e-06, "loss": 0.0522, "step": 2130 }, { "epoch": 1.9897292250233427, "grad_norm": 2.19277824069662, "learning_rate": 9.27100352971245e-06, "loss": 0.0689, "step": 2131 }, { "epoch": 1.9906629318394025, "grad_norm": 2.451262745486086, "learning_rate": 9.270217114535297e-06, "loss": 0.1558, "step": 2132 }, { "epoch": 1.9915966386554622, "grad_norm": 3.1408507233755727, "learning_rate": 9.269430308799893e-06, "loss": 0.2231, "step": 2133 }, { "epoch": 1.992530345471522, "grad_norm": 2.0460436950791236, "learning_rate": 9.2686431125782e-06, "loss": 0.0399, "step": 2134 }, { "epoch": 1.9934640522875817, "grad_norm": 2.1266332122780134, "learning_rate": 9.267855525942215e-06, "loss": 0.0902, "step": 2135 }, { "epoch": 1.9943977591036415, "grad_norm": 1.8613090776792447, "learning_rate": 9.267067548963975e-06, "loss": 0.0872, "step": 2136 }, { "epoch": 1.9953314659197012, "grad_norm": 0.79046205212187, "learning_rate": 9.266279181715546e-06, "loss": 0.0235, "step": 2137 }, { "epoch": 1.996265172735761, "grad_norm": 2.099848886314738, "learning_rate": 9.26549042426903e-06, "loss": 0.1254, "step": 2138 }, { "epoch": 1.9971988795518207, "grad_norm": 2.843217876941497, "learning_rate": 9.264701276696576e-06, "loss": 0.0521, "step": 2139 }, { "epoch": 1.9981325863678805, "grad_norm": 1.0034173951406349, "learning_rate": 9.263911739070351e-06, "loss": 0.0259, "step": 2140 }, { "epoch": 1.9990662931839402, "grad_norm": 5.068059679211491, "learning_rate": 9.263121811462573e-06, "loss": 0.2935, "step": 2141 }, { "epoch": 2.0, "grad_norm": 2.147595712844217, "learning_rate": 9.262331493945487e-06, "loss": 0.0261, "step": 2142 }, { "epoch": 2.0009337068160598, "grad_norm": 1.5368785713497446, "learning_rate": 9.261540786591377e-06, "loss": 0.101, "step": 2143 }, { "epoch": 2.0018674136321195, "grad_norm": 1.882063457291364, "learning_rate": 9.260749689472562e-06, "loss": 0.1018, "step": 2144 }, { "epoch": 2.0028011204481793, "grad_norm": 0.8377922827194187, "learning_rate": 9.259958202661397e-06, "loss": 0.0307, "step": 2145 }, { "epoch": 2.003734827264239, "grad_norm": 1.3583787673290084, "learning_rate": 9.259166326230269e-06, "loss": 0.0406, "step": 2146 }, { "epoch": 2.0046685340802988, "grad_norm": 1.1296575150527939, "learning_rate": 9.258374060251608e-06, "loss": 0.0358, "step": 2147 }, { "epoch": 2.0056022408963585, "grad_norm": 1.9238868423092228, "learning_rate": 9.257581404797873e-06, "loss": 0.0244, "step": 2148 }, { "epoch": 2.0065359477124183, "grad_norm": 1.0179012761598403, "learning_rate": 9.256788359941562e-06, "loss": 0.0308, "step": 2149 }, { "epoch": 2.007469654528478, "grad_norm": 0.8311646264306571, "learning_rate": 9.255994925755208e-06, "loss": 0.0227, "step": 2150 }, { "epoch": 2.008403361344538, "grad_norm": 4.9582444668808465, "learning_rate": 9.255201102311377e-06, "loss": 0.1903, "step": 2151 }, { "epoch": 2.0093370681605975, "grad_norm": 1.2251396966071486, "learning_rate": 9.254406889682674e-06, "loss": 0.0248, "step": 2152 }, { "epoch": 2.0102707749766573, "grad_norm": 4.384081100617994, "learning_rate": 9.25361228794174e-06, "loss": 0.1712, "step": 2153 }, { "epoch": 2.011204481792717, "grad_norm": 3.217356605042895, "learning_rate": 9.252817297161249e-06, "loss": 0.0626, "step": 2154 }, { "epoch": 2.012138188608777, "grad_norm": 1.8206981872509986, "learning_rate": 9.252021917413912e-06, "loss": 0.092, "step": 2155 }, { "epoch": 2.0130718954248366, "grad_norm": 1.5199609129009402, "learning_rate": 9.251226148772474e-06, "loss": 0.0658, "step": 2156 }, { "epoch": 2.0140056022408963, "grad_norm": 3.5486003154654195, "learning_rate": 9.250429991309719e-06, "loss": 0.1773, "step": 2157 }, { "epoch": 2.014939309056956, "grad_norm": 2.2946726784826468, "learning_rate": 9.24963344509846e-06, "loss": 0.0891, "step": 2158 }, { "epoch": 2.015873015873016, "grad_norm": 2.3510456858042064, "learning_rate": 9.248836510211557e-06, "loss": 0.0982, "step": 2159 }, { "epoch": 2.0168067226890756, "grad_norm": 0.9019963948323411, "learning_rate": 9.248039186721891e-06, "loss": 0.0325, "step": 2160 }, { "epoch": 2.0177404295051353, "grad_norm": 2.9605512607550195, "learning_rate": 9.24724147470239e-06, "loss": 0.0996, "step": 2161 }, { "epoch": 2.018674136321195, "grad_norm": 3.992575705117996, "learning_rate": 9.246443374226013e-06, "loss": 0.2339, "step": 2162 }, { "epoch": 2.019607843137255, "grad_norm": 2.1901765064891543, "learning_rate": 9.245644885365754e-06, "loss": 0.0479, "step": 2163 }, { "epoch": 2.0205415499533146, "grad_norm": 2.117236500835536, "learning_rate": 9.244846008194645e-06, "loss": 0.0826, "step": 2164 }, { "epoch": 2.0214752567693743, "grad_norm": 3.7316323206553794, "learning_rate": 9.244046742785752e-06, "loss": 0.1678, "step": 2165 }, { "epoch": 2.022408963585434, "grad_norm": 1.7620665171699699, "learning_rate": 9.243247089212177e-06, "loss": 0.0751, "step": 2166 }, { "epoch": 2.023342670401494, "grad_norm": 1.5292149890904574, "learning_rate": 9.242447047547054e-06, "loss": 0.0621, "step": 2167 }, { "epoch": 2.0242763772175536, "grad_norm": 3.8027725980822265, "learning_rate": 9.24164661786356e-06, "loss": 0.3326, "step": 2168 }, { "epoch": 2.0252100840336134, "grad_norm": 4.438365438177853, "learning_rate": 9.2408458002349e-06, "loss": 0.1608, "step": 2169 }, { "epoch": 2.026143790849673, "grad_norm": 1.1957909921863055, "learning_rate": 9.24004459473432e-06, "loss": 0.0199, "step": 2170 }, { "epoch": 2.027077497665733, "grad_norm": 7.755115285937373, "learning_rate": 9.239243001435095e-06, "loss": 0.2164, "step": 2171 }, { "epoch": 2.0280112044817926, "grad_norm": 1.8066225465856418, "learning_rate": 9.238441020410543e-06, "loss": 0.1193, "step": 2172 }, { "epoch": 2.0289449112978524, "grad_norm": 2.0876402997680144, "learning_rate": 9.237638651734012e-06, "loss": 0.0765, "step": 2173 }, { "epoch": 2.029878618113912, "grad_norm": 1.911808981820444, "learning_rate": 9.23683589547889e-06, "loss": 0.1194, "step": 2174 }, { "epoch": 2.030812324929972, "grad_norm": 1.357456453689775, "learning_rate": 9.236032751718595e-06, "loss": 0.0611, "step": 2175 }, { "epoch": 2.0317460317460316, "grad_norm": 3.1000386128417983, "learning_rate": 9.235229220526585e-06, "loss": 0.2132, "step": 2176 }, { "epoch": 2.0326797385620914, "grad_norm": 1.3211251212841422, "learning_rate": 9.23442530197635e-06, "loss": 0.0527, "step": 2177 }, { "epoch": 2.033613445378151, "grad_norm": 2.4810924758435893, "learning_rate": 9.233620996141421e-06, "loss": 0.0914, "step": 2178 }, { "epoch": 2.034547152194211, "grad_norm": 1.7452534799928683, "learning_rate": 9.232816303095358e-06, "loss": 0.0906, "step": 2179 }, { "epoch": 2.0354808590102706, "grad_norm": 1.574156892893405, "learning_rate": 9.232011222911759e-06, "loss": 0.0445, "step": 2180 }, { "epoch": 2.0364145658263304, "grad_norm": 1.6940659435038765, "learning_rate": 9.231205755664256e-06, "loss": 0.0798, "step": 2181 }, { "epoch": 2.03734827264239, "grad_norm": 1.1453455215061077, "learning_rate": 9.230399901426521e-06, "loss": 0.0505, "step": 2182 }, { "epoch": 2.03828197945845, "grad_norm": 3.174559276263881, "learning_rate": 9.229593660272255e-06, "loss": 0.135, "step": 2183 }, { "epoch": 2.0392156862745097, "grad_norm": 1.205688467183678, "learning_rate": 9.228787032275201e-06, "loss": 0.0341, "step": 2184 }, { "epoch": 2.0401493930905694, "grad_norm": 0.8169595204605123, "learning_rate": 9.22798001750913e-06, "loss": 0.0161, "step": 2185 }, { "epoch": 2.041083099906629, "grad_norm": 3.1584419177706597, "learning_rate": 9.227172616047857e-06, "loss": 0.1159, "step": 2186 }, { "epoch": 2.042016806722689, "grad_norm": 2.0571801548169297, "learning_rate": 9.226364827965224e-06, "loss": 0.1097, "step": 2187 }, { "epoch": 2.0429505135387487, "grad_norm": 1.5191509436643142, "learning_rate": 9.225556653335115e-06, "loss": 0.0345, "step": 2188 }, { "epoch": 2.0438842203548084, "grad_norm": 2.4547448831657634, "learning_rate": 9.224748092231442e-06, "loss": 0.1431, "step": 2189 }, { "epoch": 2.044817927170868, "grad_norm": 2.062389384418952, "learning_rate": 9.223939144728162e-06, "loss": 0.201, "step": 2190 }, { "epoch": 2.045751633986928, "grad_norm": 1.9163575352283018, "learning_rate": 9.223129810899258e-06, "loss": 0.1294, "step": 2191 }, { "epoch": 2.0466853408029877, "grad_norm": 1.220589149561724, "learning_rate": 9.222320090818757e-06, "loss": 0.0524, "step": 2192 }, { "epoch": 2.0476190476190474, "grad_norm": 3.8247382097830815, "learning_rate": 9.22150998456071e-06, "loss": 0.1554, "step": 2193 }, { "epoch": 2.048552754435107, "grad_norm": 1.527739665329963, "learning_rate": 9.22069949219922e-06, "loss": 0.0919, "step": 2194 }, { "epoch": 2.049486461251167, "grad_norm": 2.4000206121023986, "learning_rate": 9.219888613808404e-06, "loss": 0.1026, "step": 2195 }, { "epoch": 2.0504201680672267, "grad_norm": 1.8035495053811235, "learning_rate": 9.219077349462433e-06, "loss": 0.0903, "step": 2196 }, { "epoch": 2.0513538748832865, "grad_norm": 1.5684661761885788, "learning_rate": 9.218265699235505e-06, "loss": 0.0948, "step": 2197 }, { "epoch": 2.052287581699346, "grad_norm": 0.9124783954463559, "learning_rate": 9.217453663201853e-06, "loss": 0.0553, "step": 2198 }, { "epoch": 2.053221288515406, "grad_norm": 1.94018115492391, "learning_rate": 9.216641241435748e-06, "loss": 0.093, "step": 2199 }, { "epoch": 2.0541549953314657, "grad_norm": 2.406051123644837, "learning_rate": 9.215828434011494e-06, "loss": 0.1577, "step": 2200 }, { "epoch": 2.0550887021475255, "grad_norm": 1.8172332148342027, "learning_rate": 9.215015241003432e-06, "loss": 0.0302, "step": 2201 }, { "epoch": 2.0560224089635852, "grad_norm": 2.8459370934325965, "learning_rate": 9.214201662485935e-06, "loss": 0.1578, "step": 2202 }, { "epoch": 2.056956115779645, "grad_norm": 1.3229482229922003, "learning_rate": 9.213387698533416e-06, "loss": 0.0693, "step": 2203 }, { "epoch": 2.0578898225957047, "grad_norm": 2.419966886864347, "learning_rate": 9.212573349220322e-06, "loss": 0.0918, "step": 2204 }, { "epoch": 2.0588235294117645, "grad_norm": 1.9242239338092815, "learning_rate": 9.21175861462113e-06, "loss": 0.0374, "step": 2205 }, { "epoch": 2.0597572362278243, "grad_norm": 1.1367945897633243, "learning_rate": 9.210943494810362e-06, "loss": 0.0246, "step": 2206 }, { "epoch": 2.060690943043884, "grad_norm": 3.9701086026743493, "learning_rate": 9.210127989862566e-06, "loss": 0.2104, "step": 2207 }, { "epoch": 2.0616246498599438, "grad_norm": 1.3933410110160702, "learning_rate": 9.209312099852328e-06, "loss": 0.0742, "step": 2208 }, { "epoch": 2.0625583566760035, "grad_norm": 1.2303782481737224, "learning_rate": 9.208495824854272e-06, "loss": 0.0299, "step": 2209 }, { "epoch": 2.0634920634920633, "grad_norm": 1.6075977373852819, "learning_rate": 9.207679164943055e-06, "loss": 0.1029, "step": 2210 }, { "epoch": 2.064425770308123, "grad_norm": 1.463419752820759, "learning_rate": 9.20686212019337e-06, "loss": 0.0606, "step": 2211 }, { "epoch": 2.065359477124183, "grad_norm": 2.06386985939237, "learning_rate": 9.206044690679944e-06, "loss": 0.0896, "step": 2212 }, { "epoch": 2.066293183940243, "grad_norm": 1.8793428117437951, "learning_rate": 9.20522687647754e-06, "loss": 0.1033, "step": 2213 }, { "epoch": 2.0672268907563027, "grad_norm": 1.4105878632407747, "learning_rate": 9.204408677660957e-06, "loss": 0.0592, "step": 2214 }, { "epoch": 2.0681605975723625, "grad_norm": 2.4771195816062965, "learning_rate": 9.203590094305027e-06, "loss": 0.0759, "step": 2215 }, { "epoch": 2.0690943043884222, "grad_norm": 0.626236879206348, "learning_rate": 9.202771126484618e-06, "loss": 0.0231, "step": 2216 }, { "epoch": 2.070028011204482, "grad_norm": 2.0437333797235775, "learning_rate": 9.201951774274635e-06, "loss": 0.1422, "step": 2217 }, { "epoch": 2.0709617180205417, "grad_norm": 1.142344282035207, "learning_rate": 9.201132037750017e-06, "loss": 0.043, "step": 2218 }, { "epoch": 2.0718954248366015, "grad_norm": 4.299343034736592, "learning_rate": 9.200311916985738e-06, "loss": 0.1079, "step": 2219 }, { "epoch": 2.0728291316526612, "grad_norm": 1.5244778151899245, "learning_rate": 9.199491412056805e-06, "loss": 0.0996, "step": 2220 }, { "epoch": 2.073762838468721, "grad_norm": 2.0350579943155407, "learning_rate": 9.198670523038263e-06, "loss": 0.0729, "step": 2221 }, { "epoch": 2.0746965452847808, "grad_norm": 0.5367704479693212, "learning_rate": 9.197849250005192e-06, "loss": 0.0208, "step": 2222 }, { "epoch": 2.0756302521008405, "grad_norm": 1.3125437586404334, "learning_rate": 9.197027593032705e-06, "loss": 0.0702, "step": 2223 }, { "epoch": 2.0765639589169003, "grad_norm": 1.6956673561710658, "learning_rate": 9.196205552195954e-06, "loss": 0.0814, "step": 2224 }, { "epoch": 2.07749766573296, "grad_norm": 1.3640370399246047, "learning_rate": 9.195383127570123e-06, "loss": 0.0851, "step": 2225 }, { "epoch": 2.0784313725490198, "grad_norm": 1.1679819456695826, "learning_rate": 9.19456031923043e-06, "loss": 0.0327, "step": 2226 }, { "epoch": 2.0793650793650795, "grad_norm": 1.156308308486371, "learning_rate": 9.193737127252132e-06, "loss": 0.0247, "step": 2227 }, { "epoch": 2.0802987861811393, "grad_norm": 1.3003942605404624, "learning_rate": 9.192913551710518e-06, "loss": 0.0627, "step": 2228 }, { "epoch": 2.081232492997199, "grad_norm": 1.8789485683139568, "learning_rate": 9.192089592680913e-06, "loss": 0.079, "step": 2229 }, { "epoch": 2.082166199813259, "grad_norm": 1.8240179631960736, "learning_rate": 9.191265250238676e-06, "loss": 0.0897, "step": 2230 }, { "epoch": 2.0830999066293185, "grad_norm": 1.7568947990800192, "learning_rate": 9.190440524459203e-06, "loss": 0.095, "step": 2231 }, { "epoch": 2.0840336134453783, "grad_norm": 3.235013445782492, "learning_rate": 9.189615415417926e-06, "loss": 0.2051, "step": 2232 }, { "epoch": 2.084967320261438, "grad_norm": 1.5430371987364397, "learning_rate": 9.188789923190309e-06, "loss": 0.0387, "step": 2233 }, { "epoch": 2.085901027077498, "grad_norm": 4.25103837628406, "learning_rate": 9.187964047851851e-06, "loss": 0.195, "step": 2234 }, { "epoch": 2.0868347338935576, "grad_norm": 0.9479002144738151, "learning_rate": 9.187137789478089e-06, "loss": 0.0284, "step": 2235 }, { "epoch": 2.0877684407096173, "grad_norm": 1.7309207464742966, "learning_rate": 9.186311148144593e-06, "loss": 0.0624, "step": 2236 }, { "epoch": 2.088702147525677, "grad_norm": 1.2394084837704908, "learning_rate": 9.185484123926966e-06, "loss": 0.0387, "step": 2237 }, { "epoch": 2.089635854341737, "grad_norm": 3.034347338948596, "learning_rate": 9.184656716900853e-06, "loss": 0.1305, "step": 2238 }, { "epoch": 2.0905695611577966, "grad_norm": 2.601762621594668, "learning_rate": 9.183828927141927e-06, "loss": 0.1341, "step": 2239 }, { "epoch": 2.0915032679738563, "grad_norm": 1.256039217372379, "learning_rate": 9.183000754725899e-06, "loss": 0.067, "step": 2240 }, { "epoch": 2.092436974789916, "grad_norm": 2.5805860491192054, "learning_rate": 9.182172199728514e-06, "loss": 0.0736, "step": 2241 }, { "epoch": 2.093370681605976, "grad_norm": 1.5328462862109002, "learning_rate": 9.181343262225551e-06, "loss": 0.0543, "step": 2242 }, { "epoch": 2.0943043884220356, "grad_norm": 2.702851048646312, "learning_rate": 9.180513942292827e-06, "loss": 0.1302, "step": 2243 }, { "epoch": 2.0952380952380953, "grad_norm": 3.506859079477558, "learning_rate": 9.179684240006195e-06, "loss": 0.099, "step": 2244 }, { "epoch": 2.096171802054155, "grad_norm": 0.7054918393666862, "learning_rate": 9.178854155441537e-06, "loss": 0.0255, "step": 2245 }, { "epoch": 2.097105508870215, "grad_norm": 2.9225900769182838, "learning_rate": 9.178023688674773e-06, "loss": 0.1347, "step": 2246 }, { "epoch": 2.0980392156862746, "grad_norm": 1.0160020126855254, "learning_rate": 9.177192839781861e-06, "loss": 0.0221, "step": 2247 }, { "epoch": 2.0989729225023344, "grad_norm": 3.2168685368304835, "learning_rate": 9.17636160883879e-06, "loss": 0.2379, "step": 2248 }, { "epoch": 2.099906629318394, "grad_norm": 2.550927762221844, "learning_rate": 9.175529995921583e-06, "loss": 0.1185, "step": 2249 }, { "epoch": 2.100840336134454, "grad_norm": 1.4717783257624617, "learning_rate": 9.174698001106303e-06, "loss": 0.0414, "step": 2250 }, { "epoch": 2.1017740429505136, "grad_norm": 3.6814140832408704, "learning_rate": 9.173865624469043e-06, "loss": 0.143, "step": 2251 }, { "epoch": 2.1027077497665734, "grad_norm": 4.027358134742863, "learning_rate": 9.173032866085936e-06, "loss": 0.2054, "step": 2252 }, { "epoch": 2.103641456582633, "grad_norm": 2.2566423833252935, "learning_rate": 9.172199726033145e-06, "loss": 0.0187, "step": 2253 }, { "epoch": 2.104575163398693, "grad_norm": 1.8487132752323863, "learning_rate": 9.171366204386869e-06, "loss": 0.0703, "step": 2254 }, { "epoch": 2.1055088702147526, "grad_norm": 0.6163549926236241, "learning_rate": 9.170532301223343e-06, "loss": 0.0137, "step": 2255 }, { "epoch": 2.1064425770308124, "grad_norm": 1.2374191091365745, "learning_rate": 9.169698016618838e-06, "loss": 0.0349, "step": 2256 }, { "epoch": 2.107376283846872, "grad_norm": 1.8300302418221734, "learning_rate": 9.168863350649656e-06, "loss": 0.0886, "step": 2257 }, { "epoch": 2.108309990662932, "grad_norm": 1.2661307681669642, "learning_rate": 9.16802830339214e-06, "loss": 0.0247, "step": 2258 }, { "epoch": 2.1092436974789917, "grad_norm": 2.8671415309464607, "learning_rate": 9.167192874922661e-06, "loss": 0.0355, "step": 2259 }, { "epoch": 2.1101774042950514, "grad_norm": 1.953371981727541, "learning_rate": 9.166357065317631e-06, "loss": 0.0469, "step": 2260 }, { "epoch": 2.111111111111111, "grad_norm": 0.4151041341371444, "learning_rate": 9.16552087465349e-06, "loss": 0.01, "step": 2261 }, { "epoch": 2.112044817927171, "grad_norm": 7.5715417507647595, "learning_rate": 9.16468430300672e-06, "loss": 0.292, "step": 2262 }, { "epoch": 2.1129785247432307, "grad_norm": 2.952374272957204, "learning_rate": 9.163847350453835e-06, "loss": 0.1645, "step": 2263 }, { "epoch": 2.1139122315592904, "grad_norm": 0.500567250462549, "learning_rate": 9.16301001707138e-06, "loss": 0.0061, "step": 2264 }, { "epoch": 2.11484593837535, "grad_norm": 3.481140498010539, "learning_rate": 9.162172302935942e-06, "loss": 0.1318, "step": 2265 }, { "epoch": 2.11577964519141, "grad_norm": 1.0444471246222125, "learning_rate": 9.161334208124138e-06, "loss": 0.0229, "step": 2266 }, { "epoch": 2.1167133520074697, "grad_norm": 1.8267491436539327, "learning_rate": 9.160495732712622e-06, "loss": 0.0733, "step": 2267 }, { "epoch": 2.1176470588235294, "grad_norm": 3.482341152253371, "learning_rate": 9.159656876778077e-06, "loss": 0.1778, "step": 2268 }, { "epoch": 2.118580765639589, "grad_norm": 3.2123391034128184, "learning_rate": 9.158817640397233e-06, "loss": 0.1855, "step": 2269 }, { "epoch": 2.119514472455649, "grad_norm": 2.0598550001540423, "learning_rate": 9.15797802364684e-06, "loss": 0.0384, "step": 2270 }, { "epoch": 2.1204481792717087, "grad_norm": 1.2023875699325248, "learning_rate": 9.157138026603695e-06, "loss": 0.0461, "step": 2271 }, { "epoch": 2.1213818860877685, "grad_norm": 2.4142762668894027, "learning_rate": 9.156297649344625e-06, "loss": 0.0232, "step": 2272 }, { "epoch": 2.122315592903828, "grad_norm": 1.8170471979751717, "learning_rate": 9.15545689194649e-06, "loss": 0.1029, "step": 2273 }, { "epoch": 2.123249299719888, "grad_norm": 1.015299005063448, "learning_rate": 9.154615754486186e-06, "loss": 0.0253, "step": 2274 }, { "epoch": 2.1241830065359477, "grad_norm": 1.3470777106928224, "learning_rate": 9.153774237040645e-06, "loss": 0.0517, "step": 2275 }, { "epoch": 2.1251167133520075, "grad_norm": 2.2285182537658366, "learning_rate": 9.152932339686833e-06, "loss": 0.112, "step": 2276 }, { "epoch": 2.1260504201680672, "grad_norm": 6.332424730472683, "learning_rate": 9.152090062501752e-06, "loss": 0.1365, "step": 2277 }, { "epoch": 2.126984126984127, "grad_norm": 1.132452941355117, "learning_rate": 9.151247405562436e-06, "loss": 0.0427, "step": 2278 }, { "epoch": 2.1279178338001867, "grad_norm": 6.062344527438495, "learning_rate": 9.150404368945955e-06, "loss": 0.0922, "step": 2279 }, { "epoch": 2.1288515406162465, "grad_norm": 4.090940568885878, "learning_rate": 9.149560952729417e-06, "loss": 0.1865, "step": 2280 }, { "epoch": 2.1297852474323062, "grad_norm": 3.384125473007568, "learning_rate": 9.148717156989956e-06, "loss": 0.1841, "step": 2281 }, { "epoch": 2.130718954248366, "grad_norm": 1.7718571416319138, "learning_rate": 9.14787298180475e-06, "loss": 0.042, "step": 2282 }, { "epoch": 2.1316526610644257, "grad_norm": 1.7763672564566655, "learning_rate": 9.14702842725101e-06, "loss": 0.0778, "step": 2283 }, { "epoch": 2.1325863678804855, "grad_norm": 0.9929647760470373, "learning_rate": 9.146183493405976e-06, "loss": 0.034, "step": 2284 }, { "epoch": 2.1335200746965453, "grad_norm": 2.4916200764539784, "learning_rate": 9.145338180346927e-06, "loss": 0.1039, "step": 2285 }, { "epoch": 2.134453781512605, "grad_norm": 4.2660101928612795, "learning_rate": 9.14449248815118e-06, "loss": 0.1478, "step": 2286 }, { "epoch": 2.1353874883286648, "grad_norm": 5.025851423912928, "learning_rate": 9.143646416896079e-06, "loss": 0.1869, "step": 2287 }, { "epoch": 2.1363211951447245, "grad_norm": 2.2842062630657733, "learning_rate": 9.142799966659006e-06, "loss": 0.108, "step": 2288 }, { "epoch": 2.1372549019607843, "grad_norm": 2.2147367483322578, "learning_rate": 9.141953137517381e-06, "loss": 0.1316, "step": 2289 }, { "epoch": 2.138188608776844, "grad_norm": 2.14390266594622, "learning_rate": 9.141105929548654e-06, "loss": 0.0711, "step": 2290 }, { "epoch": 2.139122315592904, "grad_norm": 2.034213346842289, "learning_rate": 9.140258342830312e-06, "loss": 0.0837, "step": 2291 }, { "epoch": 2.1400560224089635, "grad_norm": 1.8138562917656902, "learning_rate": 9.139410377439877e-06, "loss": 0.0913, "step": 2292 }, { "epoch": 2.1409897292250233, "grad_norm": 3.4580339072695705, "learning_rate": 9.138562033454903e-06, "loss": 0.1267, "step": 2293 }, { "epoch": 2.141923436041083, "grad_norm": 2.2058841547212427, "learning_rate": 9.13771331095298e-06, "loss": 0.1285, "step": 2294 }, { "epoch": 2.142857142857143, "grad_norm": 1.9044194511595542, "learning_rate": 9.136864210011735e-06, "loss": 0.106, "step": 2295 }, { "epoch": 2.1437908496732025, "grad_norm": 1.0542328801366472, "learning_rate": 9.136014730708829e-06, "loss": 0.0194, "step": 2296 }, { "epoch": 2.1447245564892623, "grad_norm": 1.3389642873700973, "learning_rate": 9.135164873121952e-06, "loss": 0.0853, "step": 2297 }, { "epoch": 2.145658263305322, "grad_norm": 1.826695602830777, "learning_rate": 9.134314637328835e-06, "loss": 0.083, "step": 2298 }, { "epoch": 2.146591970121382, "grad_norm": 1.8283025752183144, "learning_rate": 9.133464023407242e-06, "loss": 0.106, "step": 2299 }, { "epoch": 2.1475256769374416, "grad_norm": 0.8308141189224512, "learning_rate": 9.13261303143497e-06, "loss": 0.0179, "step": 2300 }, { "epoch": 2.1484593837535013, "grad_norm": 3.320530855457528, "learning_rate": 9.131761661489851e-06, "loss": 0.2459, "step": 2301 }, { "epoch": 2.149393090569561, "grad_norm": 6.943170530907429, "learning_rate": 9.130909913649753e-06, "loss": 0.1723, "step": 2302 }, { "epoch": 2.150326797385621, "grad_norm": 1.1655214102924083, "learning_rate": 9.130057787992577e-06, "loss": 0.0264, "step": 2303 }, { "epoch": 2.1512605042016806, "grad_norm": 2.082367669312573, "learning_rate": 9.129205284596263e-06, "loss": 0.1061, "step": 2304 }, { "epoch": 2.1521942110177403, "grad_norm": 5.001085576187427, "learning_rate": 9.128352403538776e-06, "loss": 0.2932, "step": 2305 }, { "epoch": 2.1531279178338, "grad_norm": 2.334962058503767, "learning_rate": 9.127499144898124e-06, "loss": 0.1219, "step": 2306 }, { "epoch": 2.15406162464986, "grad_norm": 1.2830336093348165, "learning_rate": 9.126645508752347e-06, "loss": 0.0282, "step": 2307 }, { "epoch": 2.1549953314659196, "grad_norm": 2.7720609323190915, "learning_rate": 9.125791495179519e-06, "loss": 0.1343, "step": 2308 }, { "epoch": 2.1559290382819793, "grad_norm": 2.307527733790301, "learning_rate": 9.124937104257751e-06, "loss": 0.1669, "step": 2309 }, { "epoch": 2.156862745098039, "grad_norm": 1.0580910638404117, "learning_rate": 9.124082336065182e-06, "loss": 0.052, "step": 2310 }, { "epoch": 2.157796451914099, "grad_norm": 2.654825070097057, "learning_rate": 9.123227190679995e-06, "loss": 0.0332, "step": 2311 }, { "epoch": 2.1587301587301586, "grad_norm": 1.6739802326477526, "learning_rate": 9.122371668180399e-06, "loss": 0.0401, "step": 2312 }, { "epoch": 2.1596638655462184, "grad_norm": 3.4023370821834016, "learning_rate": 9.121515768644642e-06, "loss": 0.2568, "step": 2313 }, { "epoch": 2.160597572362278, "grad_norm": 1.1614854546518343, "learning_rate": 9.120659492151005e-06, "loss": 0.0397, "step": 2314 }, { "epoch": 2.161531279178338, "grad_norm": 3.7975722977958295, "learning_rate": 9.119802838777806e-06, "loss": 0.2589, "step": 2315 }, { "epoch": 2.1624649859943976, "grad_norm": 2.3306112663322827, "learning_rate": 9.118945808603391e-06, "loss": 0.1332, "step": 2316 }, { "epoch": 2.1633986928104574, "grad_norm": 1.1498155569019335, "learning_rate": 9.118088401706148e-06, "loss": 0.0343, "step": 2317 }, { "epoch": 2.164332399626517, "grad_norm": 1.8502226454514625, "learning_rate": 9.117230618164496e-06, "loss": 0.0697, "step": 2318 }, { "epoch": 2.165266106442577, "grad_norm": 2.6997513785164307, "learning_rate": 9.11637245805689e-06, "loss": 0.1215, "step": 2319 }, { "epoch": 2.1661998132586366, "grad_norm": 1.074792144472581, "learning_rate": 9.115513921461814e-06, "loss": 0.0267, "step": 2320 }, { "epoch": 2.1671335200746964, "grad_norm": 2.028855943371394, "learning_rate": 9.114655008457795e-06, "loss": 0.1012, "step": 2321 }, { "epoch": 2.168067226890756, "grad_norm": 0.9678223956184321, "learning_rate": 9.113795719123387e-06, "loss": 0.0529, "step": 2322 }, { "epoch": 2.169000933706816, "grad_norm": 0.7074455983361209, "learning_rate": 9.112936053537184e-06, "loss": 0.0254, "step": 2323 }, { "epoch": 2.1699346405228757, "grad_norm": 1.9827990839608152, "learning_rate": 9.112076011777808e-06, "loss": 0.1207, "step": 2324 }, { "epoch": 2.1708683473389354, "grad_norm": 1.226177661070389, "learning_rate": 9.111215593923925e-06, "loss": 0.0442, "step": 2325 }, { "epoch": 2.171802054154995, "grad_norm": 1.265528806328375, "learning_rate": 9.110354800054224e-06, "loss": 0.0507, "step": 2326 }, { "epoch": 2.172735760971055, "grad_norm": 0.6826494873632085, "learning_rate": 9.109493630247438e-06, "loss": 0.0234, "step": 2327 }, { "epoch": 2.1736694677871147, "grad_norm": 1.788024218472137, "learning_rate": 9.108632084582329e-06, "loss": 0.0439, "step": 2328 }, { "epoch": 2.1746031746031744, "grad_norm": 4.647251296195825, "learning_rate": 9.107770163137693e-06, "loss": 0.2335, "step": 2329 }, { "epoch": 2.175536881419234, "grad_norm": 3.0530150989247877, "learning_rate": 9.106907865992365e-06, "loss": 0.109, "step": 2330 }, { "epoch": 2.176470588235294, "grad_norm": 1.1318108228021286, "learning_rate": 9.10604519322521e-06, "loss": 0.022, "step": 2331 }, { "epoch": 2.1774042950513537, "grad_norm": 0.9032181201291322, "learning_rate": 9.10518214491513e-06, "loss": 0.03, "step": 2332 }, { "epoch": 2.1783380018674134, "grad_norm": 1.0184003088073297, "learning_rate": 9.10431872114106e-06, "loss": 0.0385, "step": 2333 }, { "epoch": 2.179271708683473, "grad_norm": 0.9339263209353988, "learning_rate": 9.103454921981969e-06, "loss": 0.0385, "step": 2334 }, { "epoch": 2.180205415499533, "grad_norm": 2.0850090822434453, "learning_rate": 9.102590747516862e-06, "loss": 0.1134, "step": 2335 }, { "epoch": 2.1811391223155927, "grad_norm": 1.2515710505374067, "learning_rate": 9.101726197824774e-06, "loss": 0.0509, "step": 2336 }, { "epoch": 2.1820728291316525, "grad_norm": 1.4041213625979816, "learning_rate": 9.10086127298478e-06, "loss": 0.0445, "step": 2337 }, { "epoch": 2.183006535947712, "grad_norm": 3.4251313734828557, "learning_rate": 9.099995973075987e-06, "loss": 0.3281, "step": 2338 }, { "epoch": 2.1839402427637724, "grad_norm": 2.90865001970434, "learning_rate": 9.099130298177538e-06, "loss": 0.1486, "step": 2339 }, { "epoch": 2.184873949579832, "grad_norm": 2.7713071251003414, "learning_rate": 9.098264248368604e-06, "loss": 0.0927, "step": 2340 }, { "epoch": 2.185807656395892, "grad_norm": 0.6433227050602736, "learning_rate": 9.0973978237284e-06, "loss": 0.0127, "step": 2341 }, { "epoch": 2.1867413632119517, "grad_norm": 2.7977720671482977, "learning_rate": 9.096531024336164e-06, "loss": 0.0989, "step": 2342 }, { "epoch": 2.1876750700280114, "grad_norm": 1.9492734794950233, "learning_rate": 9.09566385027118e-06, "loss": 0.0914, "step": 2343 }, { "epoch": 2.188608776844071, "grad_norm": 2.574017756827439, "learning_rate": 9.094796301612758e-06, "loss": 0.1516, "step": 2344 }, { "epoch": 2.189542483660131, "grad_norm": 8.293378277113911, "learning_rate": 9.093928378440244e-06, "loss": 0.2883, "step": 2345 }, { "epoch": 2.1904761904761907, "grad_norm": 1.4524807098192285, "learning_rate": 9.09306008083302e-06, "loss": 0.0579, "step": 2346 }, { "epoch": 2.1914098972922504, "grad_norm": 1.06943781965255, "learning_rate": 9.092191408870503e-06, "loss": 0.053, "step": 2347 }, { "epoch": 2.19234360410831, "grad_norm": 1.099157053374722, "learning_rate": 9.09132236263214e-06, "loss": 0.0484, "step": 2348 }, { "epoch": 2.19327731092437, "grad_norm": 3.1745536485707184, "learning_rate": 9.090452942197417e-06, "loss": 0.1143, "step": 2349 }, { "epoch": 2.1942110177404297, "grad_norm": 0.9278695805119879, "learning_rate": 9.08958314764585e-06, "loss": 0.0226, "step": 2350 }, { "epoch": 2.1951447245564895, "grad_norm": 1.2658810039914865, "learning_rate": 9.088712979056992e-06, "loss": 0.0646, "step": 2351 }, { "epoch": 2.196078431372549, "grad_norm": 1.008534794144837, "learning_rate": 9.08784243651043e-06, "loss": 0.0243, "step": 2352 }, { "epoch": 2.197012138188609, "grad_norm": 3.5882959017606666, "learning_rate": 9.086971520085784e-06, "loss": 0.2466, "step": 2353 }, { "epoch": 2.1979458450046687, "grad_norm": 3.1177820998517713, "learning_rate": 9.086100229862708e-06, "loss": 0.1051, "step": 2354 }, { "epoch": 2.1988795518207285, "grad_norm": 2.4491738196020414, "learning_rate": 9.085228565920897e-06, "loss": 0.0714, "step": 2355 }, { "epoch": 2.1998132586367882, "grad_norm": 3.211739345151393, "learning_rate": 9.084356528340066e-06, "loss": 0.141, "step": 2356 }, { "epoch": 2.200746965452848, "grad_norm": 1.1696837871107404, "learning_rate": 9.083484117199975e-06, "loss": 0.0489, "step": 2357 }, { "epoch": 2.2016806722689077, "grad_norm": 1.995281602921752, "learning_rate": 9.082611332580418e-06, "loss": 0.0681, "step": 2358 }, { "epoch": 2.2026143790849675, "grad_norm": 2.6413211381455968, "learning_rate": 9.081738174561219e-06, "loss": 0.0954, "step": 2359 }, { "epoch": 2.2035480859010272, "grad_norm": 4.105240467123111, "learning_rate": 9.080864643222237e-06, "loss": 0.1989, "step": 2360 }, { "epoch": 2.204481792717087, "grad_norm": 1.5823416342244472, "learning_rate": 9.07999073864337e-06, "loss": 0.0706, "step": 2361 }, { "epoch": 2.2054154995331468, "grad_norm": 2.2385776522448997, "learning_rate": 9.079116460904541e-06, "loss": 0.1267, "step": 2362 }, { "epoch": 2.2063492063492065, "grad_norm": 1.3337963275014597, "learning_rate": 9.078241810085715e-06, "loss": 0.0518, "step": 2363 }, { "epoch": 2.2072829131652663, "grad_norm": 4.583148725233816, "learning_rate": 9.07736678626689e-06, "loss": 0.2226, "step": 2364 }, { "epoch": 2.208216619981326, "grad_norm": 1.648092785539779, "learning_rate": 9.076491389528093e-06, "loss": 0.0815, "step": 2365 }, { "epoch": 2.2091503267973858, "grad_norm": 2.5364904525505745, "learning_rate": 9.07561561994939e-06, "loss": 0.1854, "step": 2366 }, { "epoch": 2.2100840336134455, "grad_norm": 2.217464932026842, "learning_rate": 9.07473947761088e-06, "loss": 0.1116, "step": 2367 }, { "epoch": 2.2110177404295053, "grad_norm": 0.8334228344347674, "learning_rate": 9.073862962592696e-06, "loss": 0.0191, "step": 2368 }, { "epoch": 2.211951447245565, "grad_norm": 3.356552759800953, "learning_rate": 9.072986074975006e-06, "loss": 0.1541, "step": 2369 }, { "epoch": 2.212885154061625, "grad_norm": 3.9395177054090684, "learning_rate": 9.07210881483801e-06, "loss": 0.173, "step": 2370 }, { "epoch": 2.2138188608776845, "grad_norm": 1.059785754770871, "learning_rate": 9.071231182261941e-06, "loss": 0.0288, "step": 2371 }, { "epoch": 2.2147525676937443, "grad_norm": 3.4032957889219375, "learning_rate": 9.070353177327071e-06, "loss": 0.071, "step": 2372 }, { "epoch": 2.215686274509804, "grad_norm": 5.851508964934679, "learning_rate": 9.0694748001137e-06, "loss": 0.2257, "step": 2373 }, { "epoch": 2.216619981325864, "grad_norm": 6.6150344872332, "learning_rate": 9.068596050702171e-06, "loss": 0.197, "step": 2374 }, { "epoch": 2.2175536881419236, "grad_norm": 4.773268918504549, "learning_rate": 9.06771692917285e-06, "loss": 0.2024, "step": 2375 }, { "epoch": 2.2184873949579833, "grad_norm": 2.385288183458224, "learning_rate": 9.066837435606143e-06, "loss": 0.151, "step": 2376 }, { "epoch": 2.219421101774043, "grad_norm": 1.6426340276671552, "learning_rate": 9.065957570082493e-06, "loss": 0.0935, "step": 2377 }, { "epoch": 2.220354808590103, "grad_norm": 1.3560861109394555, "learning_rate": 9.065077332682367e-06, "loss": 0.0817, "step": 2378 }, { "epoch": 2.2212885154061626, "grad_norm": 3.608996649264056, "learning_rate": 9.06419672348628e-06, "loss": 0.2192, "step": 2379 }, { "epoch": 2.2222222222222223, "grad_norm": 0.9174349472917519, "learning_rate": 9.063315742574767e-06, "loss": 0.0214, "step": 2380 }, { "epoch": 2.223155929038282, "grad_norm": 2.134192199917555, "learning_rate": 9.062434390028407e-06, "loss": 0.0725, "step": 2381 }, { "epoch": 2.224089635854342, "grad_norm": 2.4027876649703996, "learning_rate": 9.061552665927808e-06, "loss": 0.0928, "step": 2382 }, { "epoch": 2.2250233426704016, "grad_norm": 2.964347373052273, "learning_rate": 9.060670570353613e-06, "loss": 0.1726, "step": 2383 }, { "epoch": 2.2259570494864613, "grad_norm": 2.7786608166152176, "learning_rate": 9.059788103386502e-06, "loss": 0.1489, "step": 2384 }, { "epoch": 2.226890756302521, "grad_norm": 3.7873415229313987, "learning_rate": 9.058905265107182e-06, "loss": 0.1728, "step": 2385 }, { "epoch": 2.227824463118581, "grad_norm": 4.113735457515813, "learning_rate": 9.0580220555964e-06, "loss": 0.1778, "step": 2386 }, { "epoch": 2.2287581699346406, "grad_norm": 1.2363936019306327, "learning_rate": 9.057138474934939e-06, "loss": 0.0449, "step": 2387 }, { "epoch": 2.2296918767507004, "grad_norm": 0.7449976742011165, "learning_rate": 9.056254523203604e-06, "loss": 0.0342, "step": 2388 }, { "epoch": 2.23062558356676, "grad_norm": 2.155714765724157, "learning_rate": 9.055370200483252e-06, "loss": 0.146, "step": 2389 }, { "epoch": 2.23155929038282, "grad_norm": 2.2098631168826337, "learning_rate": 9.054485506854756e-06, "loss": 0.0996, "step": 2390 }, { "epoch": 2.2324929971988796, "grad_norm": 1.5104273619944415, "learning_rate": 9.053600442399034e-06, "loss": 0.0907, "step": 2391 }, { "epoch": 2.2334267040149394, "grad_norm": 2.5033604038015462, "learning_rate": 9.052715007197036e-06, "loss": 0.2036, "step": 2392 }, { "epoch": 2.234360410830999, "grad_norm": 2.4500630287651064, "learning_rate": 9.051829201329743e-06, "loss": 0.1676, "step": 2393 }, { "epoch": 2.235294117647059, "grad_norm": 3.03292120678963, "learning_rate": 9.050943024878172e-06, "loss": 0.2589, "step": 2394 }, { "epoch": 2.2362278244631186, "grad_norm": 1.4650529473456644, "learning_rate": 9.050056477923373e-06, "loss": 0.1016, "step": 2395 }, { "epoch": 2.2371615312791784, "grad_norm": 4.644327837760132, "learning_rate": 9.049169560546433e-06, "loss": 0.1513, "step": 2396 }, { "epoch": 2.238095238095238, "grad_norm": 1.1956572837355617, "learning_rate": 9.048282272828465e-06, "loss": 0.0556, "step": 2397 }, { "epoch": 2.239028944911298, "grad_norm": 1.6164240501214682, "learning_rate": 9.047394614850628e-06, "loss": 0.1018, "step": 2398 }, { "epoch": 2.2399626517273576, "grad_norm": 1.8683774932429185, "learning_rate": 9.046506586694107e-06, "loss": 0.1284, "step": 2399 }, { "epoch": 2.2408963585434174, "grad_norm": 4.270876053370232, "learning_rate": 9.045618188440116e-06, "loss": 0.1373, "step": 2400 }, { "epoch": 2.241830065359477, "grad_norm": 2.2892124280727804, "learning_rate": 9.044729420169914e-06, "loss": 0.23, "step": 2401 }, { "epoch": 2.242763772175537, "grad_norm": 1.2112478129887136, "learning_rate": 9.043840281964787e-06, "loss": 0.041, "step": 2402 }, { "epoch": 2.2436974789915967, "grad_norm": 2.092329798012599, "learning_rate": 9.042950773906055e-06, "loss": 0.1395, "step": 2403 }, { "epoch": 2.2446311858076564, "grad_norm": 7.013199420066738, "learning_rate": 9.042060896075077e-06, "loss": 0.2652, "step": 2404 }, { "epoch": 2.245564892623716, "grad_norm": 1.898240224878989, "learning_rate": 9.04117064855324e-06, "loss": 0.0854, "step": 2405 }, { "epoch": 2.246498599439776, "grad_norm": 1.0270600394906009, "learning_rate": 9.040280031421965e-06, "loss": 0.042, "step": 2406 }, { "epoch": 2.2474323062558357, "grad_norm": 0.9650075061995385, "learning_rate": 9.039389044762712e-06, "loss": 0.0628, "step": 2407 }, { "epoch": 2.2483660130718954, "grad_norm": 6.642838208455547, "learning_rate": 9.038497688656971e-06, "loss": 0.1933, "step": 2408 }, { "epoch": 2.249299719887955, "grad_norm": 1.991227401725207, "learning_rate": 9.037605963186265e-06, "loss": 0.111, "step": 2409 }, { "epoch": 2.250233426704015, "grad_norm": 2.0526468310413546, "learning_rate": 9.03671386843215e-06, "loss": 0.1322, "step": 2410 }, { "epoch": 2.2511671335200747, "grad_norm": 1.4850528285136833, "learning_rate": 9.035821404476223e-06, "loss": 0.0707, "step": 2411 }, { "epoch": 2.2521008403361344, "grad_norm": 3.217741130231553, "learning_rate": 9.034928571400107e-06, "loss": 0.1563, "step": 2412 }, { "epoch": 2.253034547152194, "grad_norm": 3.678010398924817, "learning_rate": 9.034035369285461e-06, "loss": 0.2069, "step": 2413 }, { "epoch": 2.253968253968254, "grad_norm": 2.133841813268947, "learning_rate": 9.033141798213978e-06, "loss": 0.058, "step": 2414 }, { "epoch": 2.2549019607843137, "grad_norm": 2.026403722501059, "learning_rate": 9.032247858267387e-06, "loss": 0.087, "step": 2415 }, { "epoch": 2.2558356676003735, "grad_norm": 1.9209939641077072, "learning_rate": 9.031353549527445e-06, "loss": 0.0624, "step": 2416 }, { "epoch": 2.256769374416433, "grad_norm": 2.696299684983707, "learning_rate": 9.03045887207595e-06, "loss": 0.0708, "step": 2417 }, { "epoch": 2.257703081232493, "grad_norm": 4.308888182164846, "learning_rate": 9.029563825994727e-06, "loss": 0.3498, "step": 2418 }, { "epoch": 2.2586367880485527, "grad_norm": 3.8423365906448397, "learning_rate": 9.028668411365642e-06, "loss": 0.2485, "step": 2419 }, { "epoch": 2.2595704948646125, "grad_norm": 2.3755205768545147, "learning_rate": 9.027772628270585e-06, "loss": 0.1559, "step": 2420 }, { "epoch": 2.2605042016806722, "grad_norm": 2.930466731141526, "learning_rate": 9.02687647679149e-06, "loss": 0.1326, "step": 2421 }, { "epoch": 2.261437908496732, "grad_norm": 2.749550980837556, "learning_rate": 9.025979957010317e-06, "loss": 0.1818, "step": 2422 }, { "epoch": 2.2623716153127917, "grad_norm": 1.2095158117814897, "learning_rate": 9.025083069009065e-06, "loss": 0.037, "step": 2423 }, { "epoch": 2.2633053221288515, "grad_norm": 2.3209218653508397, "learning_rate": 9.024185812869759e-06, "loss": 0.1288, "step": 2424 }, { "epoch": 2.2642390289449112, "grad_norm": 3.7293209939483787, "learning_rate": 9.023288188674469e-06, "loss": 0.1441, "step": 2425 }, { "epoch": 2.265172735760971, "grad_norm": 2.1027680900696266, "learning_rate": 9.02239019650529e-06, "loss": 0.1486, "step": 2426 }, { "epoch": 2.2661064425770308, "grad_norm": 0.7046783785687595, "learning_rate": 9.021491836444353e-06, "loss": 0.0244, "step": 2427 }, { "epoch": 2.2670401493930905, "grad_norm": 1.2701489821981906, "learning_rate": 9.020593108573826e-06, "loss": 0.0445, "step": 2428 }, { "epoch": 2.2679738562091503, "grad_norm": 2.662234553088074, "learning_rate": 9.019694012975904e-06, "loss": 0.031, "step": 2429 }, { "epoch": 2.26890756302521, "grad_norm": 2.1890175430884717, "learning_rate": 9.018794549732819e-06, "loss": 0.1494, "step": 2430 }, { "epoch": 2.2698412698412698, "grad_norm": 1.444044210843907, "learning_rate": 9.017894718926838e-06, "loss": 0.0732, "step": 2431 }, { "epoch": 2.2707749766573295, "grad_norm": 0.8422306105724416, "learning_rate": 9.01699452064026e-06, "loss": 0.0255, "step": 2432 }, { "epoch": 2.2717086834733893, "grad_norm": 4.532975449034702, "learning_rate": 9.016093954955418e-06, "loss": 0.2122, "step": 2433 }, { "epoch": 2.272642390289449, "grad_norm": 5.140314898140526, "learning_rate": 9.01519302195468e-06, "loss": 0.0418, "step": 2434 }, { "epoch": 2.273576097105509, "grad_norm": 1.332205511572415, "learning_rate": 9.014291721720445e-06, "loss": 0.0948, "step": 2435 }, { "epoch": 2.2745098039215685, "grad_norm": 1.5060678122174749, "learning_rate": 9.013390054335148e-06, "loss": 0.0683, "step": 2436 }, { "epoch": 2.2754435107376283, "grad_norm": 2.5714938570450627, "learning_rate": 9.012488019881253e-06, "loss": 0.1205, "step": 2437 }, { "epoch": 2.276377217553688, "grad_norm": 0.7852024706222329, "learning_rate": 9.011585618441265e-06, "loss": 0.0137, "step": 2438 }, { "epoch": 2.277310924369748, "grad_norm": 2.2187458961754447, "learning_rate": 9.010682850097716e-06, "loss": 0.0394, "step": 2439 }, { "epoch": 2.2782446311858076, "grad_norm": 3.0320785212527004, "learning_rate": 9.009779714933176e-06, "loss": 0.2206, "step": 2440 }, { "epoch": 2.2791783380018673, "grad_norm": 1.0143944977706039, "learning_rate": 9.008876213030244e-06, "loss": 0.0507, "step": 2441 }, { "epoch": 2.280112044817927, "grad_norm": 1.683557881623694, "learning_rate": 9.007972344471556e-06, "loss": 0.0791, "step": 2442 }, { "epoch": 2.281045751633987, "grad_norm": 0.7670504767974448, "learning_rate": 9.007068109339783e-06, "loss": 0.0251, "step": 2443 }, { "epoch": 2.2819794584500466, "grad_norm": 2.6030651468536323, "learning_rate": 9.006163507717628e-06, "loss": 0.1824, "step": 2444 }, { "epoch": 2.2829131652661063, "grad_norm": 1.7852279582228079, "learning_rate": 9.00525853968782e-06, "loss": 0.0839, "step": 2445 }, { "epoch": 2.283846872082166, "grad_norm": 0.6157199608306244, "learning_rate": 9.004353205333136e-06, "loss": 0.0198, "step": 2446 }, { "epoch": 2.284780578898226, "grad_norm": 4.896439502362942, "learning_rate": 9.003447504736374e-06, "loss": 0.1028, "step": 2447 }, { "epoch": 2.2857142857142856, "grad_norm": 2.2722306874376317, "learning_rate": 9.00254143798037e-06, "loss": 0.1148, "step": 2448 }, { "epoch": 2.2866479925303453, "grad_norm": 1.7664500034269757, "learning_rate": 9.001635005147998e-06, "loss": 0.0378, "step": 2449 }, { "epoch": 2.287581699346405, "grad_norm": 2.381137136613355, "learning_rate": 9.000728206322157e-06, "loss": 0.2134, "step": 2450 }, { "epoch": 2.288515406162465, "grad_norm": 1.9531514203703066, "learning_rate": 8.999821041585788e-06, "loss": 0.1891, "step": 2451 }, { "epoch": 2.2894491129785246, "grad_norm": 3.4347144208281377, "learning_rate": 8.998913511021857e-06, "loss": 0.0616, "step": 2452 }, { "epoch": 2.2903828197945844, "grad_norm": 1.7231612921338715, "learning_rate": 8.998005614713368e-06, "loss": 0.0817, "step": 2453 }, { "epoch": 2.291316526610644, "grad_norm": 1.0820504766316368, "learning_rate": 8.99709735274336e-06, "loss": 0.0408, "step": 2454 }, { "epoch": 2.292250233426704, "grad_norm": 7.505670536648044, "learning_rate": 8.996188725194904e-06, "loss": 0.418, "step": 2455 }, { "epoch": 2.2931839402427636, "grad_norm": 1.1503484554296166, "learning_rate": 8.995279732151103e-06, "loss": 0.0605, "step": 2456 }, { "epoch": 2.2941176470588234, "grad_norm": 2.6256743371386273, "learning_rate": 8.994370373695091e-06, "loss": 0.0423, "step": 2457 }, { "epoch": 2.295051353874883, "grad_norm": 3.7733679416723485, "learning_rate": 8.993460649910046e-06, "loss": 0.2265, "step": 2458 }, { "epoch": 2.295985060690943, "grad_norm": 5.1239734920533735, "learning_rate": 8.992550560879167e-06, "loss": 0.15, "step": 2459 }, { "epoch": 2.2969187675070026, "grad_norm": 2.0899859734689916, "learning_rate": 8.991640106685692e-06, "loss": 0.077, "step": 2460 }, { "epoch": 2.2978524743230624, "grad_norm": 2.120989332158025, "learning_rate": 8.990729287412893e-06, "loss": 0.0947, "step": 2461 }, { "epoch": 2.298786181139122, "grad_norm": 1.121436798017504, "learning_rate": 8.989818103144075e-06, "loss": 0.0301, "step": 2462 }, { "epoch": 2.299719887955182, "grad_norm": 1.7230365401720908, "learning_rate": 8.988906553962576e-06, "loss": 0.0275, "step": 2463 }, { "epoch": 2.3006535947712417, "grad_norm": 3.8076126762331777, "learning_rate": 8.987994639951763e-06, "loss": 0.2077, "step": 2464 }, { "epoch": 2.3015873015873014, "grad_norm": 3.7455818360726156, "learning_rate": 8.987082361195047e-06, "loss": 0.2407, "step": 2465 }, { "epoch": 2.302521008403361, "grad_norm": 1.8591687722470736, "learning_rate": 8.986169717775862e-06, "loss": 0.0748, "step": 2466 }, { "epoch": 2.303454715219421, "grad_norm": 1.7041922695529101, "learning_rate": 8.98525670977768e-06, "loss": 0.0648, "step": 2467 }, { "epoch": 2.3043884220354807, "grad_norm": 2.157144495137762, "learning_rate": 8.984343337284006e-06, "loss": 0.111, "step": 2468 }, { "epoch": 2.3053221288515404, "grad_norm": 2.2879497574791468, "learning_rate": 8.983429600378376e-06, "loss": 0.15, "step": 2469 }, { "epoch": 2.3062558356676, "grad_norm": 0.5467068667273849, "learning_rate": 8.982515499144364e-06, "loss": 0.0113, "step": 2470 }, { "epoch": 2.30718954248366, "grad_norm": 4.088139806932059, "learning_rate": 8.981601033665575e-06, "loss": 0.1366, "step": 2471 }, { "epoch": 2.3081232492997197, "grad_norm": 1.4850763361152726, "learning_rate": 8.980686204025645e-06, "loss": 0.0541, "step": 2472 }, { "epoch": 2.3090569561157794, "grad_norm": 2.3158060838445653, "learning_rate": 8.979771010308243e-06, "loss": 0.1223, "step": 2473 }, { "epoch": 2.309990662931839, "grad_norm": 2.3876721887901775, "learning_rate": 8.978855452597078e-06, "loss": 0.1444, "step": 2474 }, { "epoch": 2.310924369747899, "grad_norm": 3.092589683970796, "learning_rate": 8.977939530975885e-06, "loss": 0.1813, "step": 2475 }, { "epoch": 2.3118580765639587, "grad_norm": 1.8475588236835334, "learning_rate": 8.977023245528438e-06, "loss": 0.0982, "step": 2476 }, { "epoch": 2.3127917833800185, "grad_norm": 3.4531680657861905, "learning_rate": 8.976106596338537e-06, "loss": 0.2211, "step": 2477 }, { "epoch": 2.313725490196078, "grad_norm": 1.4054822646254805, "learning_rate": 8.975189583490023e-06, "loss": 0.0303, "step": 2478 }, { "epoch": 2.314659197012138, "grad_norm": 0.5976586791859178, "learning_rate": 8.974272207066767e-06, "loss": 0.0238, "step": 2479 }, { "epoch": 2.3155929038281977, "grad_norm": 8.376653269239231, "learning_rate": 8.97335446715267e-06, "loss": 0.229, "step": 2480 }, { "epoch": 2.3165266106442575, "grad_norm": 1.0611821684293643, "learning_rate": 8.972436363831673e-06, "loss": 0.049, "step": 2481 }, { "epoch": 2.317460317460317, "grad_norm": 2.1647417301097467, "learning_rate": 8.971517897187745e-06, "loss": 0.0439, "step": 2482 }, { "epoch": 2.318394024276377, "grad_norm": 4.022538639181435, "learning_rate": 8.97059906730489e-06, "loss": 0.1727, "step": 2483 }, { "epoch": 2.3193277310924367, "grad_norm": 1.7461831960755285, "learning_rate": 8.969679874267146e-06, "loss": 0.0577, "step": 2484 }, { "epoch": 2.3202614379084965, "grad_norm": 2.9057186745975714, "learning_rate": 8.96876031815858e-06, "loss": 0.1852, "step": 2485 }, { "epoch": 2.3211951447245567, "grad_norm": 5.338961979703752, "learning_rate": 8.967840399063298e-06, "loss": 0.2111, "step": 2486 }, { "epoch": 2.3221288515406164, "grad_norm": 2.3974988699443136, "learning_rate": 8.966920117065439e-06, "loss": 0.1274, "step": 2487 }, { "epoch": 2.323062558356676, "grad_norm": 1.400589568705195, "learning_rate": 8.96599947224917e-06, "loss": 0.0425, "step": 2488 }, { "epoch": 2.323996265172736, "grad_norm": 3.545906288748642, "learning_rate": 8.965078464698694e-06, "loss": 0.1787, "step": 2489 }, { "epoch": 2.3249299719887957, "grad_norm": 4.714701474642147, "learning_rate": 8.964157094498245e-06, "loss": 0.2486, "step": 2490 }, { "epoch": 2.3258636788048555, "grad_norm": 3.2673676527380318, "learning_rate": 8.963235361732098e-06, "loss": 0.1749, "step": 2491 }, { "epoch": 2.326797385620915, "grad_norm": 1.9621910973650252, "learning_rate": 8.962313266484552e-06, "loss": 0.0903, "step": 2492 }, { "epoch": 2.327731092436975, "grad_norm": 4.281596089922368, "learning_rate": 8.961390808839945e-06, "loss": 0.1962, "step": 2493 }, { "epoch": 2.3286647992530347, "grad_norm": 3.055443101303314, "learning_rate": 8.960467988882643e-06, "loss": 0.2001, "step": 2494 }, { "epoch": 2.3295985060690945, "grad_norm": 2.7425620112672116, "learning_rate": 8.959544806697048e-06, "loss": 0.1294, "step": 2495 }, { "epoch": 2.330532212885154, "grad_norm": 2.958751524194259, "learning_rate": 8.9586212623676e-06, "loss": 0.1591, "step": 2496 }, { "epoch": 2.331465919701214, "grad_norm": 0.6779097455117863, "learning_rate": 8.957697355978761e-06, "loss": 0.029, "step": 2497 }, { "epoch": 2.3323996265172737, "grad_norm": 2.056989661342427, "learning_rate": 8.956773087615036e-06, "loss": 0.1146, "step": 2498 }, { "epoch": 2.3333333333333335, "grad_norm": 1.2796397058781128, "learning_rate": 8.95584845736096e-06, "loss": 0.0814, "step": 2499 }, { "epoch": 2.3342670401493932, "grad_norm": 0.4012071967637215, "learning_rate": 8.9549234653011e-06, "loss": 0.0067, "step": 2500 }, { "epoch": 2.335200746965453, "grad_norm": 2.2677734691420937, "learning_rate": 8.953998111520052e-06, "loss": 0.101, "step": 2501 }, { "epoch": 2.3361344537815127, "grad_norm": 1.2675488561591115, "learning_rate": 8.953072396102458e-06, "loss": 0.0587, "step": 2502 }, { "epoch": 2.3370681605975725, "grad_norm": 2.9616712771266474, "learning_rate": 8.952146319132978e-06, "loss": 0.138, "step": 2503 }, { "epoch": 2.3380018674136323, "grad_norm": 3.79049185529053, "learning_rate": 8.951219880696315e-06, "loss": 0.2169, "step": 2504 }, { "epoch": 2.338935574229692, "grad_norm": 1.9422666869401062, "learning_rate": 8.950293080877202e-06, "loss": 0.1205, "step": 2505 }, { "epoch": 2.3398692810457518, "grad_norm": 1.5485994676492014, "learning_rate": 8.949365919760404e-06, "loss": 0.0837, "step": 2506 }, { "epoch": 2.3408029878618115, "grad_norm": 0.7570403600164446, "learning_rate": 8.948438397430721e-06, "loss": 0.028, "step": 2507 }, { "epoch": 2.3417366946778713, "grad_norm": 2.8282980358223346, "learning_rate": 8.947510513972985e-06, "loss": 0.0715, "step": 2508 }, { "epoch": 2.342670401493931, "grad_norm": 1.0838142526704861, "learning_rate": 8.946582269472059e-06, "loss": 0.0467, "step": 2509 }, { "epoch": 2.3436041083099908, "grad_norm": 1.5648545357144035, "learning_rate": 8.945653664012845e-06, "loss": 0.0565, "step": 2510 }, { "epoch": 2.3445378151260505, "grad_norm": 2.84804519905626, "learning_rate": 8.944724697680271e-06, "loss": 0.1064, "step": 2511 }, { "epoch": 2.3454715219421103, "grad_norm": 6.080746434096064, "learning_rate": 8.943795370559303e-06, "loss": 0.247, "step": 2512 }, { "epoch": 2.34640522875817, "grad_norm": 7.727125337094631, "learning_rate": 8.942865682734938e-06, "loss": 0.2204, "step": 2513 }, { "epoch": 2.34733893557423, "grad_norm": 4.930307085943278, "learning_rate": 8.941935634292205e-06, "loss": 0.2123, "step": 2514 }, { "epoch": 2.3482726423902895, "grad_norm": 1.9220738438810772, "learning_rate": 8.941005225316168e-06, "loss": 0.1033, "step": 2515 }, { "epoch": 2.3492063492063493, "grad_norm": 1.1312342541188383, "learning_rate": 8.940074455891921e-06, "loss": 0.0296, "step": 2516 }, { "epoch": 2.350140056022409, "grad_norm": 0.3727801451866279, "learning_rate": 8.939143326104598e-06, "loss": 0.0189, "step": 2517 }, { "epoch": 2.351073762838469, "grad_norm": 1.758083714519215, "learning_rate": 8.938211836039356e-06, "loss": 0.0801, "step": 2518 }, { "epoch": 2.3520074696545286, "grad_norm": 2.443923972305091, "learning_rate": 8.937279985781393e-06, "loss": 0.1581, "step": 2519 }, { "epoch": 2.3529411764705883, "grad_norm": 2.9074997390148276, "learning_rate": 8.936347775415935e-06, "loss": 0.0783, "step": 2520 }, { "epoch": 2.353874883286648, "grad_norm": 1.986492474836599, "learning_rate": 8.935415205028243e-06, "loss": 0.1031, "step": 2521 }, { "epoch": 2.354808590102708, "grad_norm": 1.6832801540717868, "learning_rate": 8.934482274703615e-06, "loss": 0.0871, "step": 2522 }, { "epoch": 2.3557422969187676, "grad_norm": 1.5705363184985792, "learning_rate": 8.933548984527372e-06, "loss": 0.088, "step": 2523 }, { "epoch": 2.3566760037348273, "grad_norm": 1.2044914012305474, "learning_rate": 8.932615334584876e-06, "loss": 0.054, "step": 2524 }, { "epoch": 2.357609710550887, "grad_norm": 1.7551198886155548, "learning_rate": 8.931681324961521e-06, "loss": 0.0794, "step": 2525 }, { "epoch": 2.358543417366947, "grad_norm": 2.0990962714056267, "learning_rate": 8.930746955742728e-06, "loss": 0.1101, "step": 2526 }, { "epoch": 2.3594771241830066, "grad_norm": 2.9075885805810406, "learning_rate": 8.92981222701396e-06, "loss": 0.0886, "step": 2527 }, { "epoch": 2.3604108309990663, "grad_norm": 1.3390104070205442, "learning_rate": 8.928877138860708e-06, "loss": 0.0527, "step": 2528 }, { "epoch": 2.361344537815126, "grad_norm": 2.2085932302106475, "learning_rate": 8.927941691368493e-06, "loss": 0.1505, "step": 2529 }, { "epoch": 2.362278244631186, "grad_norm": 2.247089240596894, "learning_rate": 8.927005884622875e-06, "loss": 0.1785, "step": 2530 }, { "epoch": 2.3632119514472456, "grad_norm": 3.312206326357402, "learning_rate": 8.92606971870944e-06, "loss": 0.2261, "step": 2531 }, { "epoch": 2.3641456582633054, "grad_norm": 1.6456083342810683, "learning_rate": 8.925133193713815e-06, "loss": 0.0941, "step": 2532 }, { "epoch": 2.365079365079365, "grad_norm": 2.416965460083342, "learning_rate": 8.924196309721652e-06, "loss": 0.1654, "step": 2533 }, { "epoch": 2.366013071895425, "grad_norm": 0.5796928436427415, "learning_rate": 8.923259066818642e-06, "loss": 0.0167, "step": 2534 }, { "epoch": 2.3669467787114846, "grad_norm": 1.6593812531058598, "learning_rate": 8.922321465090504e-06, "loss": 0.0381, "step": 2535 }, { "epoch": 2.3678804855275444, "grad_norm": 7.518097822531058, "learning_rate": 8.921383504622993e-06, "loss": 0.3179, "step": 2536 }, { "epoch": 2.368814192343604, "grad_norm": 2.008616733227253, "learning_rate": 8.920445185501898e-06, "loss": 0.0479, "step": 2537 }, { "epoch": 2.369747899159664, "grad_norm": 1.8503097318293444, "learning_rate": 8.919506507813035e-06, "loss": 0.1623, "step": 2538 }, { "epoch": 2.3706816059757236, "grad_norm": 7.230935385604306, "learning_rate": 8.918567471642256e-06, "loss": 0.0868, "step": 2539 }, { "epoch": 2.3716153127917834, "grad_norm": 5.198798739910717, "learning_rate": 8.917628077075449e-06, "loss": 0.2516, "step": 2540 }, { "epoch": 2.372549019607843, "grad_norm": 1.7819691204598733, "learning_rate": 8.916688324198531e-06, "loss": 0.1329, "step": 2541 }, { "epoch": 2.373482726423903, "grad_norm": 1.0993142845761366, "learning_rate": 8.915748213097453e-06, "loss": 0.0732, "step": 2542 }, { "epoch": 2.3744164332399627, "grad_norm": 3.0528313513609517, "learning_rate": 8.914807743858197e-06, "loss": 0.2178, "step": 2543 }, { "epoch": 2.3753501400560224, "grad_norm": 0.8875177468575277, "learning_rate": 8.913866916566781e-06, "loss": 0.029, "step": 2544 }, { "epoch": 2.376283846872082, "grad_norm": 1.4815851919846592, "learning_rate": 8.912925731309254e-06, "loss": 0.0254, "step": 2545 }, { "epoch": 2.377217553688142, "grad_norm": 4.098868479345849, "learning_rate": 8.911984188171695e-06, "loss": 0.2259, "step": 2546 }, { "epoch": 2.3781512605042017, "grad_norm": 4.873846118159706, "learning_rate": 8.91104228724022e-06, "loss": 0.0419, "step": 2547 }, { "epoch": 2.3790849673202614, "grad_norm": 1.523540308995799, "learning_rate": 8.91010002860098e-06, "loss": 0.0468, "step": 2548 }, { "epoch": 2.380018674136321, "grad_norm": 1.5751545924103771, "learning_rate": 8.90915741234015e-06, "loss": 0.0522, "step": 2549 }, { "epoch": 2.380952380952381, "grad_norm": 1.400004354282381, "learning_rate": 8.908214438543943e-06, "loss": 0.0398, "step": 2550 }, { "epoch": 2.3818860877684407, "grad_norm": 2.818616121938522, "learning_rate": 8.907271107298608e-06, "loss": 0.0566, "step": 2551 }, { "epoch": 2.3828197945845004, "grad_norm": 1.0729563350596807, "learning_rate": 8.906327418690417e-06, "loss": 0.0548, "step": 2552 }, { "epoch": 2.38375350140056, "grad_norm": 1.8296191380793827, "learning_rate": 8.905383372805686e-06, "loss": 0.0836, "step": 2553 }, { "epoch": 2.38468720821662, "grad_norm": 3.0582464988066707, "learning_rate": 8.904438969730758e-06, "loss": 0.072, "step": 2554 }, { "epoch": 2.3856209150326797, "grad_norm": 3.072758005866715, "learning_rate": 8.903494209552006e-06, "loss": 0.1444, "step": 2555 }, { "epoch": 2.3865546218487395, "grad_norm": 3.32903621376931, "learning_rate": 8.90254909235584e-06, "loss": 0.1666, "step": 2556 }, { "epoch": 2.387488328664799, "grad_norm": 2.7671303103573397, "learning_rate": 8.901603618228702e-06, "loss": 0.0999, "step": 2557 }, { "epoch": 2.388422035480859, "grad_norm": 1.545899702331326, "learning_rate": 8.900657787257065e-06, "loss": 0.0387, "step": 2558 }, { "epoch": 2.3893557422969187, "grad_norm": 1.1166548175611504, "learning_rate": 8.899711599527437e-06, "loss": 0.0145, "step": 2559 }, { "epoch": 2.3902894491129785, "grad_norm": 1.256210347484311, "learning_rate": 8.898765055126356e-06, "loss": 0.0178, "step": 2560 }, { "epoch": 2.3912231559290382, "grad_norm": 2.715245829644891, "learning_rate": 8.897818154140393e-06, "loss": 0.0959, "step": 2561 }, { "epoch": 2.392156862745098, "grad_norm": 1.7892388696435726, "learning_rate": 8.896870896656156e-06, "loss": 0.0654, "step": 2562 }, { "epoch": 2.3930905695611577, "grad_norm": 2.0666773952035085, "learning_rate": 8.89592328276028e-06, "loss": 0.0275, "step": 2563 }, { "epoch": 2.3940242763772175, "grad_norm": 1.552255256809444, "learning_rate": 8.894975312539434e-06, "loss": 0.0571, "step": 2564 }, { "epoch": 2.3949579831932772, "grad_norm": 3.355048470953625, "learning_rate": 8.89402698608032e-06, "loss": 0.1114, "step": 2565 }, { "epoch": 2.395891690009337, "grad_norm": 2.464844477167918, "learning_rate": 8.893078303469675e-06, "loss": 0.1452, "step": 2566 }, { "epoch": 2.3968253968253967, "grad_norm": 3.598779347794711, "learning_rate": 8.892129264794265e-06, "loss": 0.2004, "step": 2567 }, { "epoch": 2.3977591036414565, "grad_norm": 3.9246675568816154, "learning_rate": 8.891179870140888e-06, "loss": 0.123, "step": 2568 }, { "epoch": 2.3986928104575163, "grad_norm": 0.6330887963238765, "learning_rate": 8.890230119596382e-06, "loss": 0.0209, "step": 2569 }, { "epoch": 2.399626517273576, "grad_norm": 1.339588604555377, "learning_rate": 8.889280013247609e-06, "loss": 0.0496, "step": 2570 }, { "epoch": 2.4005602240896358, "grad_norm": 5.305087730522086, "learning_rate": 8.888329551181464e-06, "loss": 0.1211, "step": 2571 }, { "epoch": 2.4014939309056955, "grad_norm": 1.4948842661610224, "learning_rate": 8.887378733484881e-06, "loss": 0.0887, "step": 2572 }, { "epoch": 2.4024276377217553, "grad_norm": 2.1478061890298323, "learning_rate": 8.886427560244822e-06, "loss": 0.1917, "step": 2573 }, { "epoch": 2.403361344537815, "grad_norm": 2.8145803248417742, "learning_rate": 8.885476031548282e-06, "loss": 0.1581, "step": 2574 }, { "epoch": 2.404295051353875, "grad_norm": 3.995883269582093, "learning_rate": 8.884524147482287e-06, "loss": 0.2419, "step": 2575 }, { "epoch": 2.4052287581699345, "grad_norm": 4.643290306480158, "learning_rate": 8.8835719081339e-06, "loss": 0.1891, "step": 2576 }, { "epoch": 2.4061624649859943, "grad_norm": 4.156362637101995, "learning_rate": 8.882619313590212e-06, "loss": 0.1614, "step": 2577 }, { "epoch": 2.407096171802054, "grad_norm": 1.882168543819971, "learning_rate": 8.88166636393835e-06, "loss": 0.0578, "step": 2578 }, { "epoch": 2.408029878618114, "grad_norm": 2.02941849670856, "learning_rate": 8.880713059265469e-06, "loss": 0.0232, "step": 2579 }, { "epoch": 2.4089635854341735, "grad_norm": 3.780033835517736, "learning_rate": 8.879759399658761e-06, "loss": 0.0343, "step": 2580 }, { "epoch": 2.4098972922502333, "grad_norm": 1.350210276866617, "learning_rate": 8.878805385205449e-06, "loss": 0.1, "step": 2581 }, { "epoch": 2.410830999066293, "grad_norm": 1.5841808837746865, "learning_rate": 8.877851015992786e-06, "loss": 0.0542, "step": 2582 }, { "epoch": 2.411764705882353, "grad_norm": 3.2048491813547955, "learning_rate": 8.876896292108062e-06, "loss": 0.1667, "step": 2583 }, { "epoch": 2.4126984126984126, "grad_norm": 1.5201734184988913, "learning_rate": 8.875941213638595e-06, "loss": 0.0529, "step": 2584 }, { "epoch": 2.4136321195144723, "grad_norm": 8.173269090480773, "learning_rate": 8.874985780671739e-06, "loss": 0.1586, "step": 2585 }, { "epoch": 2.414565826330532, "grad_norm": 2.449861791607262, "learning_rate": 8.874029993294878e-06, "loss": 0.0928, "step": 2586 }, { "epoch": 2.415499533146592, "grad_norm": 2.6063106365766817, "learning_rate": 8.873073851595429e-06, "loss": 0.1676, "step": 2587 }, { "epoch": 2.4164332399626516, "grad_norm": 1.7408616043200962, "learning_rate": 8.872117355660841e-06, "loss": 0.0672, "step": 2588 }, { "epoch": 2.4173669467787113, "grad_norm": 3.79794997139812, "learning_rate": 8.871160505578597e-06, "loss": 0.0447, "step": 2589 }, { "epoch": 2.418300653594771, "grad_norm": 1.8741232337672233, "learning_rate": 8.870203301436214e-06, "loss": 0.1358, "step": 2590 }, { "epoch": 2.419234360410831, "grad_norm": 2.8986187140376907, "learning_rate": 8.869245743321235e-06, "loss": 0.1617, "step": 2591 }, { "epoch": 2.4201680672268906, "grad_norm": 3.751424624106718, "learning_rate": 8.868287831321242e-06, "loss": 0.1246, "step": 2592 }, { "epoch": 2.4211017740429503, "grad_norm": 2.2782210899042954, "learning_rate": 8.867329565523843e-06, "loss": 0.1253, "step": 2593 }, { "epoch": 2.42203548085901, "grad_norm": 2.618735105135262, "learning_rate": 8.866370946016685e-06, "loss": 0.1744, "step": 2594 }, { "epoch": 2.42296918767507, "grad_norm": 2.1187073782046815, "learning_rate": 8.865411972887444e-06, "loss": 0.0899, "step": 2595 }, { "epoch": 2.4239028944911296, "grad_norm": 2.1782501527014464, "learning_rate": 8.864452646223828e-06, "loss": 0.106, "step": 2596 }, { "epoch": 2.4248366013071894, "grad_norm": 0.9880975733661087, "learning_rate": 8.863492966113578e-06, "loss": 0.0339, "step": 2597 }, { "epoch": 2.425770308123249, "grad_norm": 1.2144357438785087, "learning_rate": 8.862532932644466e-06, "loss": 0.0342, "step": 2598 }, { "epoch": 2.426704014939309, "grad_norm": 2.052685830356198, "learning_rate": 8.8615725459043e-06, "loss": 0.0721, "step": 2599 }, { "epoch": 2.4276377217553686, "grad_norm": 1.9315895619803534, "learning_rate": 8.860611805980916e-06, "loss": 0.0515, "step": 2600 }, { "epoch": 2.4285714285714284, "grad_norm": 4.125230082151471, "learning_rate": 8.859650712962185e-06, "loss": 0.2059, "step": 2601 }, { "epoch": 2.429505135387488, "grad_norm": 1.7092902939534573, "learning_rate": 8.85868926693601e-06, "loss": 0.108, "step": 2602 }, { "epoch": 2.4304388422035483, "grad_norm": 3.360357191971547, "learning_rate": 8.857727467990325e-06, "loss": 0.1558, "step": 2603 }, { "epoch": 2.431372549019608, "grad_norm": 5.908654522846203, "learning_rate": 8.856765316213097e-06, "loss": 0.244, "step": 2604 }, { "epoch": 2.432306255835668, "grad_norm": 1.3390948675238699, "learning_rate": 8.855802811692327e-06, "loss": 0.0925, "step": 2605 }, { "epoch": 2.4332399626517276, "grad_norm": 4.37946710551446, "learning_rate": 8.854839954516043e-06, "loss": 0.2892, "step": 2606 }, { "epoch": 2.4341736694677873, "grad_norm": 1.9237857557720717, "learning_rate": 8.853876744772314e-06, "loss": 0.0832, "step": 2607 }, { "epoch": 2.435107376283847, "grad_norm": 5.758219619331248, "learning_rate": 8.852913182549232e-06, "loss": 0.1189, "step": 2608 }, { "epoch": 2.436041083099907, "grad_norm": 1.7515210876442127, "learning_rate": 8.851949267934925e-06, "loss": 0.0587, "step": 2609 }, { "epoch": 2.4369747899159666, "grad_norm": 3.5950929192645873, "learning_rate": 8.850985001017559e-06, "loss": 0.1602, "step": 2610 }, { "epoch": 2.4379084967320264, "grad_norm": 5.442993548602286, "learning_rate": 8.850020381885319e-06, "loss": 0.2517, "step": 2611 }, { "epoch": 2.438842203548086, "grad_norm": 6.518873518410625, "learning_rate": 8.849055410626436e-06, "loss": 0.0588, "step": 2612 }, { "epoch": 2.439775910364146, "grad_norm": 3.2672032033879574, "learning_rate": 8.848090087329165e-06, "loss": 0.0655, "step": 2613 }, { "epoch": 2.4407096171802056, "grad_norm": 3.1221876080726956, "learning_rate": 8.847124412081796e-06, "loss": 0.1805, "step": 2614 }, { "epoch": 2.4416433239962654, "grad_norm": 1.7701863004801612, "learning_rate": 8.846158384972651e-06, "loss": 0.0521, "step": 2615 }, { "epoch": 2.442577030812325, "grad_norm": 1.4625860762485936, "learning_rate": 8.845192006090082e-06, "loss": 0.0546, "step": 2616 }, { "epoch": 2.443510737628385, "grad_norm": 1.9404403608208012, "learning_rate": 8.844225275522475e-06, "loss": 0.1077, "step": 2617 }, { "epoch": 2.4444444444444446, "grad_norm": 1.6612596314152328, "learning_rate": 8.843258193358251e-06, "loss": 0.0743, "step": 2618 }, { "epoch": 2.4453781512605044, "grad_norm": 4.477591265779747, "learning_rate": 8.842290759685857e-06, "loss": 0.3387, "step": 2619 }, { "epoch": 2.446311858076564, "grad_norm": 2.703009196969407, "learning_rate": 8.841322974593778e-06, "loss": 0.0788, "step": 2620 }, { "epoch": 2.447245564892624, "grad_norm": 3.9304720231867027, "learning_rate": 8.840354838170528e-06, "loss": 0.1944, "step": 2621 }, { "epoch": 2.4481792717086837, "grad_norm": 0.9751435953388853, "learning_rate": 8.839386350504651e-06, "loss": 0.0387, "step": 2622 }, { "epoch": 2.4491129785247434, "grad_norm": 3.4717863464430527, "learning_rate": 8.83841751168473e-06, "loss": 0.1681, "step": 2623 }, { "epoch": 2.450046685340803, "grad_norm": 2.331452625376318, "learning_rate": 8.837448321799373e-06, "loss": 0.1281, "step": 2624 }, { "epoch": 2.450980392156863, "grad_norm": 0.9630339052970973, "learning_rate": 8.836478780937224e-06, "loss": 0.0243, "step": 2625 }, { "epoch": 2.4519140989729227, "grad_norm": 1.619549573872213, "learning_rate": 8.835508889186957e-06, "loss": 0.095, "step": 2626 }, { "epoch": 2.4528478057889824, "grad_norm": 2.854643139856339, "learning_rate": 8.834538646637282e-06, "loss": 0.0419, "step": 2627 }, { "epoch": 2.453781512605042, "grad_norm": 1.4917976318097805, "learning_rate": 8.833568053376937e-06, "loss": 0.0826, "step": 2628 }, { "epoch": 2.454715219421102, "grad_norm": 1.872269701697886, "learning_rate": 8.832597109494691e-06, "loss": 0.091, "step": 2629 }, { "epoch": 2.4556489262371617, "grad_norm": 3.6898158687848412, "learning_rate": 8.831625815079351e-06, "loss": 0.1428, "step": 2630 }, { "epoch": 2.4565826330532214, "grad_norm": 1.2165651640242934, "learning_rate": 8.830654170219752e-06, "loss": 0.0689, "step": 2631 }, { "epoch": 2.457516339869281, "grad_norm": 1.3738081719968878, "learning_rate": 8.829682175004761e-06, "loss": 0.0731, "step": 2632 }, { "epoch": 2.458450046685341, "grad_norm": 3.8017459885658784, "learning_rate": 8.828709829523277e-06, "loss": 0.3566, "step": 2633 }, { "epoch": 2.4593837535014007, "grad_norm": 12.421360068517778, "learning_rate": 8.82773713386423e-06, "loss": 0.1794, "step": 2634 }, { "epoch": 2.4603174603174605, "grad_norm": 2.0731798608714676, "learning_rate": 8.826764088116588e-06, "loss": 0.0648, "step": 2635 }, { "epoch": 2.46125116713352, "grad_norm": 2.1750905717986835, "learning_rate": 8.825790692369344e-06, "loss": 0.1153, "step": 2636 }, { "epoch": 2.46218487394958, "grad_norm": 2.1112610785738273, "learning_rate": 8.824816946711525e-06, "loss": 0.0455, "step": 2637 }, { "epoch": 2.4631185807656397, "grad_norm": 1.558229576025645, "learning_rate": 8.823842851232194e-06, "loss": 0.0717, "step": 2638 }, { "epoch": 2.4640522875816995, "grad_norm": 0.8496954825912675, "learning_rate": 8.82286840602044e-06, "loss": 0.0436, "step": 2639 }, { "epoch": 2.4649859943977592, "grad_norm": 1.537369852656203, "learning_rate": 8.821893611165387e-06, "loss": 0.0201, "step": 2640 }, { "epoch": 2.465919701213819, "grad_norm": 4.696597792151453, "learning_rate": 8.820918466756189e-06, "loss": 0.1055, "step": 2641 }, { "epoch": 2.4668534080298787, "grad_norm": 3.00316113093418, "learning_rate": 8.81994297288204e-06, "loss": 0.1401, "step": 2642 }, { "epoch": 2.4677871148459385, "grad_norm": 1.0048308090736096, "learning_rate": 8.818967129632153e-06, "loss": 0.028, "step": 2643 }, { "epoch": 2.4687208216619982, "grad_norm": 1.9746169185392328, "learning_rate": 8.817990937095782e-06, "loss": 0.0697, "step": 2644 }, { "epoch": 2.469654528478058, "grad_norm": 1.586387015246486, "learning_rate": 8.817014395362212e-06, "loss": 0.0824, "step": 2645 }, { "epoch": 2.4705882352941178, "grad_norm": 2.4593183128686906, "learning_rate": 8.816037504520753e-06, "loss": 0.0798, "step": 2646 }, { "epoch": 2.4715219421101775, "grad_norm": 1.7240968230465827, "learning_rate": 8.815060264660759e-06, "loss": 0.0603, "step": 2647 }, { "epoch": 2.4724556489262373, "grad_norm": 3.4308820439117294, "learning_rate": 8.814082675871609e-06, "loss": 0.1504, "step": 2648 }, { "epoch": 2.473389355742297, "grad_norm": 3.9213610504356673, "learning_rate": 8.81310473824271e-06, "loss": 0.1325, "step": 2649 }, { "epoch": 2.4743230625583568, "grad_norm": 2.2072288548320027, "learning_rate": 8.812126451863506e-06, "loss": 0.1097, "step": 2650 }, { "epoch": 2.4752567693744165, "grad_norm": 3.8968085042201666, "learning_rate": 8.811147816823474e-06, "loss": 0.046, "step": 2651 }, { "epoch": 2.4761904761904763, "grad_norm": 1.5004549173156807, "learning_rate": 8.81016883321212e-06, "loss": 0.1059, "step": 2652 }, { "epoch": 2.477124183006536, "grad_norm": 2.7395026438896393, "learning_rate": 8.809189501118981e-06, "loss": 0.183, "step": 2653 }, { "epoch": 2.478057889822596, "grad_norm": 1.1265554172760976, "learning_rate": 8.808209820633633e-06, "loss": 0.0314, "step": 2654 }, { "epoch": 2.4789915966386555, "grad_norm": 2.414459134193562, "learning_rate": 8.807229791845673e-06, "loss": 0.1274, "step": 2655 }, { "epoch": 2.4799253034547153, "grad_norm": 1.603051073446151, "learning_rate": 8.806249414844738e-06, "loss": 0.1068, "step": 2656 }, { "epoch": 2.480859010270775, "grad_norm": 1.7822563017668738, "learning_rate": 8.805268689720493e-06, "loss": 0.0816, "step": 2657 }, { "epoch": 2.481792717086835, "grad_norm": 2.4001664132790546, "learning_rate": 8.804287616562637e-06, "loss": 0.1568, "step": 2658 }, { "epoch": 2.4827264239028946, "grad_norm": 2.3414250085520942, "learning_rate": 8.803306195460898e-06, "loss": 0.1492, "step": 2659 }, { "epoch": 2.4836601307189543, "grad_norm": 2.0366373392623185, "learning_rate": 8.802324426505042e-06, "loss": 0.032, "step": 2660 }, { "epoch": 2.484593837535014, "grad_norm": 2.699072561088097, "learning_rate": 8.80134230978486e-06, "loss": 0.1625, "step": 2661 }, { "epoch": 2.485527544351074, "grad_norm": 2.0259532024746183, "learning_rate": 8.800359845390177e-06, "loss": 0.0962, "step": 2662 }, { "epoch": 2.4864612511671336, "grad_norm": 1.214616271300689, "learning_rate": 8.799377033410848e-06, "loss": 0.057, "step": 2663 }, { "epoch": 2.4873949579831933, "grad_norm": 5.783171103056298, "learning_rate": 8.798393873936766e-06, "loss": 0.1425, "step": 2664 }, { "epoch": 2.488328664799253, "grad_norm": 2.0765404826674314, "learning_rate": 8.797410367057852e-06, "loss": 0.0795, "step": 2665 }, { "epoch": 2.489262371615313, "grad_norm": 3.439635126248445, "learning_rate": 8.796426512864054e-06, "loss": 0.1192, "step": 2666 }, { "epoch": 2.4901960784313726, "grad_norm": 0.5672701205343561, "learning_rate": 8.79544231144536e-06, "loss": 0.0215, "step": 2667 }, { "epoch": 2.4911297852474323, "grad_norm": 2.05716239664848, "learning_rate": 8.794457762891785e-06, "loss": 0.0865, "step": 2668 }, { "epoch": 2.492063492063492, "grad_norm": 1.6597345464271251, "learning_rate": 8.793472867293377e-06, "loss": 0.0899, "step": 2669 }, { "epoch": 2.492997198879552, "grad_norm": 2.109688431267943, "learning_rate": 8.792487624740215e-06, "loss": 0.0634, "step": 2670 }, { "epoch": 2.4939309056956116, "grad_norm": 2.217943583788775, "learning_rate": 8.79150203532241e-06, "loss": 0.1317, "step": 2671 }, { "epoch": 2.4948646125116714, "grad_norm": 3.4923095663900194, "learning_rate": 8.790516099130106e-06, "loss": 0.1978, "step": 2672 }, { "epoch": 2.495798319327731, "grad_norm": 8.712371936030312, "learning_rate": 8.789529816253476e-06, "loss": 0.0865, "step": 2673 }, { "epoch": 2.496732026143791, "grad_norm": 2.5676963585905943, "learning_rate": 8.78854318678273e-06, "loss": 0.1751, "step": 2674 }, { "epoch": 2.4976657329598506, "grad_norm": 3.4931981821403726, "learning_rate": 8.787556210808101e-06, "loss": 0.1302, "step": 2675 }, { "epoch": 2.4985994397759104, "grad_norm": 1.7877590400293308, "learning_rate": 8.786568888419864e-06, "loss": 0.0361, "step": 2676 }, { "epoch": 2.49953314659197, "grad_norm": 2.6071609089760552, "learning_rate": 8.785581219708316e-06, "loss": 0.0385, "step": 2677 }, { "epoch": 2.50046685340803, "grad_norm": 4.075508411066314, "learning_rate": 8.78459320476379e-06, "loss": 0.054, "step": 2678 }, { "epoch": 2.5014005602240896, "grad_norm": 1.9386795762884805, "learning_rate": 8.783604843676658e-06, "loss": 0.1128, "step": 2679 }, { "epoch": 2.5023342670401494, "grad_norm": 2.545546034223646, "learning_rate": 8.782616136537308e-06, "loss": 0.2, "step": 2680 }, { "epoch": 2.503267973856209, "grad_norm": 2.162244813546239, "learning_rate": 8.781627083436172e-06, "loss": 0.0633, "step": 2681 }, { "epoch": 2.504201680672269, "grad_norm": 6.43086125002467, "learning_rate": 8.780637684463711e-06, "loss": 0.0394, "step": 2682 }, { "epoch": 2.5051353874883286, "grad_norm": 3.7608633996304053, "learning_rate": 8.779647939710413e-06, "loss": 0.2206, "step": 2683 }, { "epoch": 2.5060690943043884, "grad_norm": 1.9518998746512617, "learning_rate": 8.778657849266802e-06, "loss": 0.1171, "step": 2684 }, { "epoch": 2.507002801120448, "grad_norm": 1.855239887923212, "learning_rate": 8.777667413223436e-06, "loss": 0.0773, "step": 2685 }, { "epoch": 2.507936507936508, "grad_norm": 1.748281415447936, "learning_rate": 8.776676631670898e-06, "loss": 0.1409, "step": 2686 }, { "epoch": 2.5088702147525677, "grad_norm": 1.3318288813488581, "learning_rate": 8.775685504699805e-06, "loss": 0.0138, "step": 2687 }, { "epoch": 2.5098039215686274, "grad_norm": 1.2462786147935934, "learning_rate": 8.774694032400807e-06, "loss": 0.0363, "step": 2688 }, { "epoch": 2.510737628384687, "grad_norm": 2.251079932109214, "learning_rate": 8.773702214864587e-06, "loss": 0.1218, "step": 2689 }, { "epoch": 2.511671335200747, "grad_norm": 1.9138263889876406, "learning_rate": 8.772710052181858e-06, "loss": 0.0895, "step": 2690 }, { "epoch": 2.5126050420168067, "grad_norm": 4.080913805130336, "learning_rate": 8.77171754444336e-06, "loss": 0.0504, "step": 2691 }, { "epoch": 2.5135387488328664, "grad_norm": 2.6381616188785073, "learning_rate": 8.770724691739874e-06, "loss": 0.1011, "step": 2692 }, { "epoch": 2.514472455648926, "grad_norm": 1.5946140029878928, "learning_rate": 8.769731494162203e-06, "loss": 0.0744, "step": 2693 }, { "epoch": 2.515406162464986, "grad_norm": 2.4596785529163085, "learning_rate": 8.76873795180119e-06, "loss": 0.1404, "step": 2694 }, { "epoch": 2.5163398692810457, "grad_norm": 4.28535946850025, "learning_rate": 8.767744064747701e-06, "loss": 0.1627, "step": 2695 }, { "epoch": 2.5172735760971054, "grad_norm": 3.5722617979768776, "learning_rate": 8.76674983309264e-06, "loss": 0.1422, "step": 2696 }, { "epoch": 2.518207282913165, "grad_norm": 4.659086459368488, "learning_rate": 8.76575525692694e-06, "loss": 0.1614, "step": 2697 }, { "epoch": 2.519140989729225, "grad_norm": 2.926281699998474, "learning_rate": 8.764760336341566e-06, "loss": 0.0718, "step": 2698 }, { "epoch": 2.5200746965452847, "grad_norm": 1.395466242249608, "learning_rate": 8.763765071427515e-06, "loss": 0.0891, "step": 2699 }, { "epoch": 2.5210084033613445, "grad_norm": 1.9029593430289902, "learning_rate": 8.762769462275812e-06, "loss": 0.0845, "step": 2700 }, { "epoch": 2.521942110177404, "grad_norm": 3.0078922823500704, "learning_rate": 8.761773508977523e-06, "loss": 0.1975, "step": 2701 }, { "epoch": 2.522875816993464, "grad_norm": 2.4211759367766366, "learning_rate": 8.760777211623733e-06, "loss": 0.0475, "step": 2702 }, { "epoch": 2.5238095238095237, "grad_norm": 2.5866469247002217, "learning_rate": 8.759780570305565e-06, "loss": 0.1482, "step": 2703 }, { "epoch": 2.5247432306255835, "grad_norm": 3.0968657293612316, "learning_rate": 8.758783585114175e-06, "loss": 0.0517, "step": 2704 }, { "epoch": 2.5256769374416432, "grad_norm": 1.736589558956399, "learning_rate": 8.757786256140745e-06, "loss": 0.0978, "step": 2705 }, { "epoch": 2.526610644257703, "grad_norm": 3.150546054187943, "learning_rate": 8.756788583476497e-06, "loss": 0.1102, "step": 2706 }, { "epoch": 2.5275443510737627, "grad_norm": 2.6761995567457286, "learning_rate": 8.755790567212673e-06, "loss": 0.1022, "step": 2707 }, { "epoch": 2.5284780578898225, "grad_norm": 2.3527183526739788, "learning_rate": 8.754792207440557e-06, "loss": 0.1423, "step": 2708 }, { "epoch": 2.5294117647058822, "grad_norm": 2.700329876225771, "learning_rate": 8.753793504251458e-06, "loss": 0.1755, "step": 2709 }, { "epoch": 2.530345471521942, "grad_norm": 4.523347521418508, "learning_rate": 8.752794457736719e-06, "loss": 0.1929, "step": 2710 }, { "epoch": 2.5312791783380018, "grad_norm": 1.9780052187391899, "learning_rate": 8.751795067987712e-06, "loss": 0.1238, "step": 2711 }, { "epoch": 2.5322128851540615, "grad_norm": 0.7017302244935539, "learning_rate": 8.750795335095845e-06, "loss": 0.0221, "step": 2712 }, { "epoch": 2.5331465919701213, "grad_norm": 0.8473228266267812, "learning_rate": 8.749795259152554e-06, "loss": 0.0349, "step": 2713 }, { "epoch": 2.534080298786181, "grad_norm": 1.9961996862906857, "learning_rate": 8.748794840249306e-06, "loss": 0.153, "step": 2714 }, { "epoch": 2.5350140056022408, "grad_norm": 2.0557205744769926, "learning_rate": 8.7477940784776e-06, "loss": 0.1104, "step": 2715 }, { "epoch": 2.5359477124183005, "grad_norm": 2.311725128921488, "learning_rate": 8.746792973928968e-06, "loss": 0.0787, "step": 2716 }, { "epoch": 2.5368814192343603, "grad_norm": 2.81185697184681, "learning_rate": 8.745791526694969e-06, "loss": 0.0395, "step": 2717 }, { "epoch": 2.53781512605042, "grad_norm": 0.9215037748757307, "learning_rate": 8.7447897368672e-06, "loss": 0.0276, "step": 2718 }, { "epoch": 2.53874883286648, "grad_norm": 4.389282812071465, "learning_rate": 8.743787604537284e-06, "loss": 0.0405, "step": 2719 }, { "epoch": 2.5396825396825395, "grad_norm": 3.5185483931896853, "learning_rate": 8.742785129796875e-06, "loss": 0.1492, "step": 2720 }, { "epoch": 2.5406162464985993, "grad_norm": 3.658414975410911, "learning_rate": 8.741782312737665e-06, "loss": 0.1483, "step": 2721 }, { "epoch": 2.541549953314659, "grad_norm": 4.3020497200532555, "learning_rate": 8.740779153451369e-06, "loss": 0.2365, "step": 2722 }, { "epoch": 2.542483660130719, "grad_norm": 3.5576984671918077, "learning_rate": 8.73977565202974e-06, "loss": 0.092, "step": 2723 }, { "epoch": 2.5434173669467786, "grad_norm": 1.9563120952337212, "learning_rate": 8.738771808564555e-06, "loss": 0.1209, "step": 2724 }, { "epoch": 2.5443510737628383, "grad_norm": 2.059488981814087, "learning_rate": 8.73776762314763e-06, "loss": 0.0429, "step": 2725 }, { "epoch": 2.545284780578898, "grad_norm": 2.0391404430362137, "learning_rate": 8.736763095870809e-06, "loss": 0.0402, "step": 2726 }, { "epoch": 2.546218487394958, "grad_norm": 5.5926416865574184, "learning_rate": 8.735758226825962e-06, "loss": 0.2603, "step": 2727 }, { "epoch": 2.5471521942110176, "grad_norm": 0.8120080695003277, "learning_rate": 8.734753016105001e-06, "loss": 0.02, "step": 2728 }, { "epoch": 2.5480859010270773, "grad_norm": 2.5081661807496363, "learning_rate": 8.733747463799862e-06, "loss": 0.05, "step": 2729 }, { "epoch": 2.549019607843137, "grad_norm": 2.0275344289957205, "learning_rate": 8.732741570002512e-06, "loss": 0.0846, "step": 2730 }, { "epoch": 2.549953314659197, "grad_norm": 2.970188506601336, "learning_rate": 8.731735334804953e-06, "loss": 0.0485, "step": 2731 }, { "epoch": 2.5508870214752566, "grad_norm": 3.4266030339061437, "learning_rate": 8.730728758299217e-06, "loss": 0.1587, "step": 2732 }, { "epoch": 2.5518207282913163, "grad_norm": 3.1201058539039823, "learning_rate": 8.729721840577363e-06, "loss": 0.0852, "step": 2733 }, { "epoch": 2.552754435107376, "grad_norm": 1.8886032710178844, "learning_rate": 8.728714581731488e-06, "loss": 0.1264, "step": 2734 }, { "epoch": 2.553688141923436, "grad_norm": 4.080875511118795, "learning_rate": 8.727706981853714e-06, "loss": 0.1695, "step": 2735 }, { "epoch": 2.5546218487394956, "grad_norm": 2.425006798020095, "learning_rate": 8.7266990410362e-06, "loss": 0.1248, "step": 2736 }, { "epoch": 2.5555555555555554, "grad_norm": 1.3232001207371424, "learning_rate": 8.725690759371132e-06, "loss": 0.0881, "step": 2737 }, { "epoch": 2.556489262371615, "grad_norm": 2.051006373461358, "learning_rate": 8.724682136950728e-06, "loss": 0.0885, "step": 2738 }, { "epoch": 2.557422969187675, "grad_norm": 2.3648186152836557, "learning_rate": 8.723673173867238e-06, "loss": 0.1258, "step": 2739 }, { "epoch": 2.5583566760037346, "grad_norm": 2.0198761808401997, "learning_rate": 8.722663870212943e-06, "loss": 0.0844, "step": 2740 }, { "epoch": 2.5592903828197944, "grad_norm": 0.6385565460255537, "learning_rate": 8.721654226080154e-06, "loss": 0.0211, "step": 2741 }, { "epoch": 2.560224089635854, "grad_norm": 0.734994989554573, "learning_rate": 8.720644241561216e-06, "loss": 0.0276, "step": 2742 }, { "epoch": 2.561157796451914, "grad_norm": 1.9620336089803898, "learning_rate": 8.7196339167485e-06, "loss": 0.0256, "step": 2743 }, { "epoch": 2.5620915032679736, "grad_norm": 1.5412247362495943, "learning_rate": 8.718623251734415e-06, "loss": 0.0946, "step": 2744 }, { "epoch": 2.5630252100840334, "grad_norm": 3.01987180208893, "learning_rate": 8.717612246611395e-06, "loss": 0.0262, "step": 2745 }, { "epoch": 2.563958916900093, "grad_norm": 3.015920845510114, "learning_rate": 8.716600901471908e-06, "loss": 0.1662, "step": 2746 }, { "epoch": 2.564892623716153, "grad_norm": 0.5076841552027489, "learning_rate": 8.715589216408453e-06, "loss": 0.0101, "step": 2747 }, { "epoch": 2.5658263305322127, "grad_norm": 1.3900381537640825, "learning_rate": 8.71457719151356e-06, "loss": 0.0561, "step": 2748 }, { "epoch": 2.5667600373482724, "grad_norm": 1.6668215988556438, "learning_rate": 8.713564826879788e-06, "loss": 0.0696, "step": 2749 }, { "epoch": 2.567693744164332, "grad_norm": 1.3571337400018921, "learning_rate": 8.71255212259973e-06, "loss": 0.0789, "step": 2750 }, { "epoch": 2.568627450980392, "grad_norm": 3.226622586131166, "learning_rate": 8.71153907876601e-06, "loss": 0.1795, "step": 2751 }, { "epoch": 2.5695611577964517, "grad_norm": 3.9783840528872063, "learning_rate": 8.71052569547128e-06, "loss": 0.2436, "step": 2752 }, { "epoch": 2.5704948646125114, "grad_norm": 1.2078464278648449, "learning_rate": 8.709511972808226e-06, "loss": 0.0404, "step": 2753 }, { "epoch": 2.571428571428571, "grad_norm": 1.531828858166684, "learning_rate": 8.708497910869565e-06, "loss": 0.0733, "step": 2754 }, { "epoch": 2.572362278244631, "grad_norm": 2.7599098299259177, "learning_rate": 8.707483509748042e-06, "loss": 0.1484, "step": 2755 }, { "epoch": 2.5732959850606907, "grad_norm": 8.51890682212553, "learning_rate": 8.706468769536439e-06, "loss": 0.1945, "step": 2756 }, { "epoch": 2.5742296918767504, "grad_norm": 1.7676850891519276, "learning_rate": 8.705453690327559e-06, "loss": 0.0262, "step": 2757 }, { "epoch": 2.57516339869281, "grad_norm": 3.326518494672734, "learning_rate": 8.704438272214248e-06, "loss": 0.1912, "step": 2758 }, { "epoch": 2.57609710550887, "grad_norm": 6.031989367091717, "learning_rate": 8.703422515289374e-06, "loss": 0.2702, "step": 2759 }, { "epoch": 2.5770308123249297, "grad_norm": 8.149069607589391, "learning_rate": 8.70240641964584e-06, "loss": 0.262, "step": 2760 }, { "epoch": 2.5779645191409895, "grad_norm": 5.1676571611956215, "learning_rate": 8.701389985376578e-06, "loss": 0.1609, "step": 2761 }, { "epoch": 2.5788982259570497, "grad_norm": 3.1621943304942035, "learning_rate": 8.700373212574555e-06, "loss": 0.2459, "step": 2762 }, { "epoch": 2.5798319327731094, "grad_norm": 2.140723417812569, "learning_rate": 8.699356101332764e-06, "loss": 0.0761, "step": 2763 }, { "epoch": 2.580765639589169, "grad_norm": 2.153288587237236, "learning_rate": 8.69833865174423e-06, "loss": 0.0841, "step": 2764 }, { "epoch": 2.581699346405229, "grad_norm": 1.8919136878406502, "learning_rate": 8.69732086390201e-06, "loss": 0.0903, "step": 2765 }, { "epoch": 2.5826330532212887, "grad_norm": 4.10120681773559, "learning_rate": 8.696302737899193e-06, "loss": 0.2055, "step": 2766 }, { "epoch": 2.5835667600373484, "grad_norm": 3.9249158651141496, "learning_rate": 8.695284273828898e-06, "loss": 0.2201, "step": 2767 }, { "epoch": 2.584500466853408, "grad_norm": 2.965196135575833, "learning_rate": 8.694265471784277e-06, "loss": 0.1883, "step": 2768 }, { "epoch": 2.585434173669468, "grad_norm": 2.559620038293263, "learning_rate": 8.693246331858506e-06, "loss": 0.0939, "step": 2769 }, { "epoch": 2.5863678804855277, "grad_norm": 1.9153017517206552, "learning_rate": 8.6922268541448e-06, "loss": 0.0584, "step": 2770 }, { "epoch": 2.5873015873015874, "grad_norm": 2.329445597714225, "learning_rate": 8.691207038736397e-06, "loss": 0.0451, "step": 2771 }, { "epoch": 2.588235294117647, "grad_norm": 4.928315507533337, "learning_rate": 8.690186885726575e-06, "loss": 0.2797, "step": 2772 }, { "epoch": 2.589169000933707, "grad_norm": 2.657554854651654, "learning_rate": 8.689166395208638e-06, "loss": 0.1322, "step": 2773 }, { "epoch": 2.5901027077497667, "grad_norm": 5.240025354410724, "learning_rate": 8.688145567275917e-06, "loss": 0.2296, "step": 2774 }, { "epoch": 2.5910364145658265, "grad_norm": 2.21747176544451, "learning_rate": 8.687124402021782e-06, "loss": 0.0424, "step": 2775 }, { "epoch": 2.591970121381886, "grad_norm": 3.025293790635761, "learning_rate": 8.686102899539627e-06, "loss": 0.1166, "step": 2776 }, { "epoch": 2.592903828197946, "grad_norm": 2.830472198520306, "learning_rate": 8.685081059922884e-06, "loss": 0.237, "step": 2777 }, { "epoch": 2.5938375350140057, "grad_norm": 3.0627508836526522, "learning_rate": 8.684058883265006e-06, "loss": 0.0634, "step": 2778 }, { "epoch": 2.5947712418300655, "grad_norm": 2.349950665389256, "learning_rate": 8.683036369659486e-06, "loss": 0.0325, "step": 2779 }, { "epoch": 2.595704948646125, "grad_norm": 1.4915202207538543, "learning_rate": 8.682013519199841e-06, "loss": 0.0643, "step": 2780 }, { "epoch": 2.596638655462185, "grad_norm": 0.4586298484003383, "learning_rate": 8.680990331979625e-06, "loss": 0.0194, "step": 2781 }, { "epoch": 2.5975723622782447, "grad_norm": 1.677812561512139, "learning_rate": 8.679966808092419e-06, "loss": 0.0361, "step": 2782 }, { "epoch": 2.5985060690943045, "grad_norm": 1.1401338765303048, "learning_rate": 8.678942947631832e-06, "loss": 0.0837, "step": 2783 }, { "epoch": 2.5994397759103642, "grad_norm": 1.9608376722977578, "learning_rate": 8.677918750691514e-06, "loss": 0.0854, "step": 2784 }, { "epoch": 2.600373482726424, "grad_norm": 5.152889876580063, "learning_rate": 8.676894217365133e-06, "loss": 0.1637, "step": 2785 }, { "epoch": 2.6013071895424837, "grad_norm": 4.019275604898166, "learning_rate": 8.675869347746396e-06, "loss": 0.1893, "step": 2786 }, { "epoch": 2.6022408963585435, "grad_norm": 4.607020023387168, "learning_rate": 8.67484414192904e-06, "loss": 0.0522, "step": 2787 }, { "epoch": 2.6031746031746033, "grad_norm": 4.123655133820926, "learning_rate": 8.673818600006828e-06, "loss": 0.2496, "step": 2788 }, { "epoch": 2.604108309990663, "grad_norm": 1.0615007659455675, "learning_rate": 8.67279272207356e-06, "loss": 0.023, "step": 2789 }, { "epoch": 2.6050420168067228, "grad_norm": 2.1660766395678785, "learning_rate": 8.671766508223063e-06, "loss": 0.0542, "step": 2790 }, { "epoch": 2.6059757236227825, "grad_norm": 1.011131739550417, "learning_rate": 8.670739958549195e-06, "loss": 0.0427, "step": 2791 }, { "epoch": 2.6069094304388423, "grad_norm": 3.2191644184449553, "learning_rate": 8.669713073145845e-06, "loss": 0.1515, "step": 2792 }, { "epoch": 2.607843137254902, "grad_norm": 3.9543885473251934, "learning_rate": 8.668685852106935e-06, "loss": 0.1379, "step": 2793 }, { "epoch": 2.6087768440709618, "grad_norm": 2.959953777400402, "learning_rate": 8.667658295526414e-06, "loss": 0.1921, "step": 2794 }, { "epoch": 2.6097105508870215, "grad_norm": 7.402094817484788, "learning_rate": 8.666630403498262e-06, "loss": 0.3153, "step": 2795 }, { "epoch": 2.6106442577030813, "grad_norm": 1.3026363475200855, "learning_rate": 8.665602176116494e-06, "loss": 0.0363, "step": 2796 }, { "epoch": 2.611577964519141, "grad_norm": 1.817255430684913, "learning_rate": 8.664573613475152e-06, "loss": 0.0891, "step": 2797 }, { "epoch": 2.612511671335201, "grad_norm": 2.0509258500062217, "learning_rate": 8.663544715668306e-06, "loss": 0.0393, "step": 2798 }, { "epoch": 2.6134453781512605, "grad_norm": 1.219752992603364, "learning_rate": 8.662515482790065e-06, "loss": 0.0337, "step": 2799 }, { "epoch": 2.6143790849673203, "grad_norm": 2.68511594360568, "learning_rate": 8.66148591493456e-06, "loss": 0.1304, "step": 2800 }, { "epoch": 2.61531279178338, "grad_norm": 0.8368572176449789, "learning_rate": 8.660456012195957e-06, "loss": 0.0294, "step": 2801 }, { "epoch": 2.61624649859944, "grad_norm": 2.56397253451524, "learning_rate": 8.659425774668455e-06, "loss": 0.0966, "step": 2802 }, { "epoch": 2.6171802054154996, "grad_norm": 1.3187293818572443, "learning_rate": 8.658395202446275e-06, "loss": 0.0729, "step": 2803 }, { "epoch": 2.6181139122315593, "grad_norm": 1.2440067256506397, "learning_rate": 8.65736429562368e-06, "loss": 0.0377, "step": 2804 }, { "epoch": 2.619047619047619, "grad_norm": 4.83151920568899, "learning_rate": 8.656333054294952e-06, "loss": 0.1051, "step": 2805 }, { "epoch": 2.619981325863679, "grad_norm": 4.069684942388283, "learning_rate": 8.655301478554414e-06, "loss": 0.0868, "step": 2806 }, { "epoch": 2.6209150326797386, "grad_norm": 1.3990396161702372, "learning_rate": 8.654269568496411e-06, "loss": 0.0872, "step": 2807 }, { "epoch": 2.6218487394957983, "grad_norm": 2.114996576082049, "learning_rate": 8.653237324215327e-06, "loss": 0.1245, "step": 2808 }, { "epoch": 2.622782446311858, "grad_norm": 2.7610119347130504, "learning_rate": 8.652204745805569e-06, "loss": 0.0813, "step": 2809 }, { "epoch": 2.623716153127918, "grad_norm": 2.295305301062227, "learning_rate": 8.651171833361578e-06, "loss": 0.095, "step": 2810 }, { "epoch": 2.6246498599439776, "grad_norm": 1.9674200453250146, "learning_rate": 8.650138586977826e-06, "loss": 0.1158, "step": 2811 }, { "epoch": 2.6255835667600373, "grad_norm": 1.8427463778555628, "learning_rate": 8.649105006748815e-06, "loss": 0.0408, "step": 2812 }, { "epoch": 2.626517273576097, "grad_norm": 2.250567756297756, "learning_rate": 8.648071092769076e-06, "loss": 0.1835, "step": 2813 }, { "epoch": 2.627450980392157, "grad_norm": 5.264804632304001, "learning_rate": 8.647036845133171e-06, "loss": 0.2758, "step": 2814 }, { "epoch": 2.6283846872082166, "grad_norm": 1.667157237455291, "learning_rate": 8.646002263935695e-06, "loss": 0.0373, "step": 2815 }, { "epoch": 2.6293183940242764, "grad_norm": 1.6371224279743934, "learning_rate": 8.644967349271274e-06, "loss": 0.0551, "step": 2816 }, { "epoch": 2.630252100840336, "grad_norm": 1.515238434068441, "learning_rate": 8.643932101234558e-06, "loss": 0.1023, "step": 2817 }, { "epoch": 2.631185807656396, "grad_norm": 2.4815978868994084, "learning_rate": 8.642896519920231e-06, "loss": 0.063, "step": 2818 }, { "epoch": 2.6321195144724556, "grad_norm": 2.850507615164909, "learning_rate": 8.641860605423015e-06, "loss": 0.1717, "step": 2819 }, { "epoch": 2.6330532212885154, "grad_norm": 1.6158811712188095, "learning_rate": 8.64082435783765e-06, "loss": 0.1218, "step": 2820 }, { "epoch": 2.633986928104575, "grad_norm": 4.562097853174326, "learning_rate": 8.639787777258914e-06, "loss": 0.1889, "step": 2821 }, { "epoch": 2.634920634920635, "grad_norm": 7.076906805358433, "learning_rate": 8.638750863781614e-06, "loss": 0.2764, "step": 2822 }, { "epoch": 2.6358543417366946, "grad_norm": 3.2680291275917166, "learning_rate": 8.637713617500586e-06, "loss": 0.1024, "step": 2823 }, { "epoch": 2.6367880485527544, "grad_norm": 1.8006113867166529, "learning_rate": 8.636676038510696e-06, "loss": 0.0925, "step": 2824 }, { "epoch": 2.637721755368814, "grad_norm": 1.4015964876933313, "learning_rate": 8.635638126906847e-06, "loss": 0.0532, "step": 2825 }, { "epoch": 2.638655462184874, "grad_norm": 3.2452463384520613, "learning_rate": 8.634599882783964e-06, "loss": 0.1354, "step": 2826 }, { "epoch": 2.6395891690009337, "grad_norm": 3.228287917308904, "learning_rate": 8.633561306237006e-06, "loss": 0.2181, "step": 2827 }, { "epoch": 2.6405228758169934, "grad_norm": 1.365625185328403, "learning_rate": 8.632522397360963e-06, "loss": 0.0449, "step": 2828 }, { "epoch": 2.641456582633053, "grad_norm": 3.5821414156085476, "learning_rate": 8.631483156250855e-06, "loss": 0.1439, "step": 2829 }, { "epoch": 2.642390289449113, "grad_norm": 1.8351393299681389, "learning_rate": 8.630443583001731e-06, "loss": 0.1058, "step": 2830 }, { "epoch": 2.6433239962651727, "grad_norm": 2.701378842122812, "learning_rate": 8.62940367770867e-06, "loss": 0.1486, "step": 2831 }, { "epoch": 2.6442577030812324, "grad_norm": 1.2721931286912267, "learning_rate": 8.628363440466786e-06, "loss": 0.064, "step": 2832 }, { "epoch": 2.645191409897292, "grad_norm": 2.167699884062504, "learning_rate": 8.627322871371217e-06, "loss": 0.1032, "step": 2833 }, { "epoch": 2.646125116713352, "grad_norm": 2.5053286711922724, "learning_rate": 8.62628197051714e-06, "loss": 0.132, "step": 2834 }, { "epoch": 2.6470588235294117, "grad_norm": 3.97171109864579, "learning_rate": 8.625240737999749e-06, "loss": 0.2653, "step": 2835 }, { "epoch": 2.6479925303454714, "grad_norm": 1.186500800322956, "learning_rate": 8.62419917391428e-06, "loss": 0.0392, "step": 2836 }, { "epoch": 2.648926237161531, "grad_norm": 2.1825970589839665, "learning_rate": 8.623157278355997e-06, "loss": 0.1259, "step": 2837 }, { "epoch": 2.649859943977591, "grad_norm": 2.2828720935738933, "learning_rate": 8.62211505142019e-06, "loss": 0.1143, "step": 2838 }, { "epoch": 2.6507936507936507, "grad_norm": 1.9618807020695945, "learning_rate": 8.621072493202185e-06, "loss": 0.0461, "step": 2839 }, { "epoch": 2.6517273576097105, "grad_norm": 2.941923531210746, "learning_rate": 8.620029603797333e-06, "loss": 0.1496, "step": 2840 }, { "epoch": 2.65266106442577, "grad_norm": 0.6909288177841951, "learning_rate": 8.618986383301018e-06, "loss": 0.0167, "step": 2841 }, { "epoch": 2.65359477124183, "grad_norm": 1.684298796606744, "learning_rate": 8.617942831808654e-06, "loss": 0.064, "step": 2842 }, { "epoch": 2.6545284780578897, "grad_norm": 1.3157075243626473, "learning_rate": 8.616898949415688e-06, "loss": 0.0542, "step": 2843 }, { "epoch": 2.6554621848739495, "grad_norm": 2.8134654290869956, "learning_rate": 8.615854736217591e-06, "loss": 0.2149, "step": 2844 }, { "epoch": 2.6563958916900092, "grad_norm": 6.238908175269185, "learning_rate": 8.614810192309867e-06, "loss": 0.193, "step": 2845 }, { "epoch": 2.657329598506069, "grad_norm": 1.4994347066306424, "learning_rate": 8.613765317788057e-06, "loss": 0.0135, "step": 2846 }, { "epoch": 2.6582633053221287, "grad_norm": 2.5561608612349778, "learning_rate": 8.612720112747718e-06, "loss": 0.1007, "step": 2847 }, { "epoch": 2.6591970121381885, "grad_norm": 1.3110428074029994, "learning_rate": 8.611674577284453e-06, "loss": 0.0254, "step": 2848 }, { "epoch": 2.6601307189542482, "grad_norm": 3.349492008579997, "learning_rate": 8.610628711493883e-06, "loss": 0.2094, "step": 2849 }, { "epoch": 2.661064425770308, "grad_norm": 0.8464729095847399, "learning_rate": 8.609582515471663e-06, "loss": 0.0297, "step": 2850 }, { "epoch": 2.6619981325863677, "grad_norm": 1.8543369110606633, "learning_rate": 8.608535989313484e-06, "loss": 0.0769, "step": 2851 }, { "epoch": 2.6629318394024275, "grad_norm": 8.283043056302564, "learning_rate": 8.60748913311506e-06, "loss": 0.086, "step": 2852 }, { "epoch": 2.6638655462184873, "grad_norm": 0.8989586317383323, "learning_rate": 8.606441946972135e-06, "loss": 0.0274, "step": 2853 }, { "epoch": 2.664799253034547, "grad_norm": 2.8385792154496485, "learning_rate": 8.60539443098049e-06, "loss": 0.2195, "step": 2854 }, { "epoch": 2.6657329598506068, "grad_norm": 2.184633296899727, "learning_rate": 8.60434658523593e-06, "loss": 0.0915, "step": 2855 }, { "epoch": 2.6666666666666665, "grad_norm": 1.9292780158384562, "learning_rate": 8.60329840983429e-06, "loss": 0.1431, "step": 2856 }, { "epoch": 2.6676003734827263, "grad_norm": 0.6602244261823889, "learning_rate": 8.60224990487144e-06, "loss": 0.0168, "step": 2857 }, { "epoch": 2.668534080298786, "grad_norm": 4.253024832888847, "learning_rate": 8.601201070443274e-06, "loss": 0.2622, "step": 2858 }, { "epoch": 2.669467787114846, "grad_norm": 1.7172716746359449, "learning_rate": 8.600151906645725e-06, "loss": 0.0328, "step": 2859 }, { "epoch": 2.6704014939309055, "grad_norm": 2.0621035682445377, "learning_rate": 8.599102413574745e-06, "loss": 0.1549, "step": 2860 }, { "epoch": 2.6713352007469653, "grad_norm": 2.4334925575422677, "learning_rate": 8.598052591326325e-06, "loss": 0.2106, "step": 2861 }, { "epoch": 2.6722689075630255, "grad_norm": 0.9634962844605728, "learning_rate": 8.59700243999648e-06, "loss": 0.0492, "step": 2862 }, { "epoch": 2.6732026143790852, "grad_norm": 2.326485346380216, "learning_rate": 8.595951959681261e-06, "loss": 0.1327, "step": 2863 }, { "epoch": 2.674136321195145, "grad_norm": 0.5525740159983121, "learning_rate": 8.594901150476744e-06, "loss": 0.0183, "step": 2864 }, { "epoch": 2.6750700280112047, "grad_norm": 1.0559237111832642, "learning_rate": 8.593850012479037e-06, "loss": 0.0232, "step": 2865 }, { "epoch": 2.6760037348272645, "grad_norm": 2.403641298503032, "learning_rate": 8.59279854578428e-06, "loss": 0.1198, "step": 2866 }, { "epoch": 2.6769374416433243, "grad_norm": 3.1778965357966005, "learning_rate": 8.591746750488639e-06, "loss": 0.1651, "step": 2867 }, { "epoch": 2.677871148459384, "grad_norm": 3.2631962595112203, "learning_rate": 8.590694626688311e-06, "loss": 0.2122, "step": 2868 }, { "epoch": 2.6788048552754438, "grad_norm": 1.2274280724782565, "learning_rate": 8.589642174479529e-06, "loss": 0.0222, "step": 2869 }, { "epoch": 2.6797385620915035, "grad_norm": 0.8454554347609826, "learning_rate": 8.588589393958548e-06, "loss": 0.0463, "step": 2870 }, { "epoch": 2.6806722689075633, "grad_norm": 2.26003356166725, "learning_rate": 8.587536285221656e-06, "loss": 0.1516, "step": 2871 }, { "epoch": 2.681605975723623, "grad_norm": 1.605729349352659, "learning_rate": 8.586482848365175e-06, "loss": 0.0581, "step": 2872 }, { "epoch": 2.682539682539683, "grad_norm": 0.7891629942055498, "learning_rate": 8.585429083485446e-06, "loss": 0.0301, "step": 2873 }, { "epoch": 2.6834733893557425, "grad_norm": 2.91282713687067, "learning_rate": 8.584374990678855e-06, "loss": 0.2403, "step": 2874 }, { "epoch": 2.6844070961718023, "grad_norm": 0.9580370780230928, "learning_rate": 8.583320570041807e-06, "loss": 0.0164, "step": 2875 }, { "epoch": 2.685340802987862, "grad_norm": 3.01647054749135, "learning_rate": 8.58226582167074e-06, "loss": 0.1288, "step": 2876 }, { "epoch": 2.686274509803922, "grad_norm": 2.422219936931119, "learning_rate": 8.581210745662124e-06, "loss": 0.1244, "step": 2877 }, { "epoch": 2.6872082166199815, "grad_norm": 2.987190984448622, "learning_rate": 8.580155342112455e-06, "loss": 0.2086, "step": 2878 }, { "epoch": 2.6881419234360413, "grad_norm": 3.9775398921300926, "learning_rate": 8.579099611118264e-06, "loss": 0.0803, "step": 2879 }, { "epoch": 2.689075630252101, "grad_norm": 1.3802386979252876, "learning_rate": 8.578043552776108e-06, "loss": 0.024, "step": 2880 }, { "epoch": 2.690009337068161, "grad_norm": 0.5197530386324687, "learning_rate": 8.576987167182573e-06, "loss": 0.0229, "step": 2881 }, { "epoch": 2.6909430438842206, "grad_norm": 0.8248406630663495, "learning_rate": 8.575930454434281e-06, "loss": 0.0374, "step": 2882 }, { "epoch": 2.6918767507002803, "grad_norm": 2.33423652231396, "learning_rate": 8.574873414627879e-06, "loss": 0.1239, "step": 2883 }, { "epoch": 2.69281045751634, "grad_norm": 1.2811911943387229, "learning_rate": 8.57381604786004e-06, "loss": 0.0334, "step": 2884 }, { "epoch": 2.6937441643324, "grad_norm": 5.116160457531624, "learning_rate": 8.572758354227476e-06, "loss": 0.0808, "step": 2885 }, { "epoch": 2.6946778711484596, "grad_norm": 7.058085318213026, "learning_rate": 8.571700333826928e-06, "loss": 0.2553, "step": 2886 }, { "epoch": 2.6956115779645193, "grad_norm": 3.439130010855143, "learning_rate": 8.570641986755156e-06, "loss": 0.2414, "step": 2887 }, { "epoch": 2.696545284780579, "grad_norm": 2.385101767885277, "learning_rate": 8.569583313108965e-06, "loss": 0.127, "step": 2888 }, { "epoch": 2.697478991596639, "grad_norm": 3.061458960710098, "learning_rate": 8.568524312985178e-06, "loss": 0.0576, "step": 2889 }, { "epoch": 2.6984126984126986, "grad_norm": 2.482862454224542, "learning_rate": 8.567464986480651e-06, "loss": 0.0818, "step": 2890 }, { "epoch": 2.6993464052287583, "grad_norm": 3.1352980621020956, "learning_rate": 8.566405333692274e-06, "loss": 0.245, "step": 2891 }, { "epoch": 2.700280112044818, "grad_norm": 0.701854237165473, "learning_rate": 8.565345354716963e-06, "loss": 0.0403, "step": 2892 }, { "epoch": 2.701213818860878, "grad_norm": 1.9499839589160224, "learning_rate": 8.564285049651665e-06, "loss": 0.1381, "step": 2893 }, { "epoch": 2.7021475256769376, "grad_norm": 4.984240057814721, "learning_rate": 8.563224418593354e-06, "loss": 0.1434, "step": 2894 }, { "epoch": 2.7030812324929974, "grad_norm": 2.042849971756001, "learning_rate": 8.562163461639041e-06, "loss": 0.1808, "step": 2895 }, { "epoch": 2.704014939309057, "grad_norm": 3.629419347822366, "learning_rate": 8.561102178885758e-06, "loss": 0.1844, "step": 2896 }, { "epoch": 2.704948646125117, "grad_norm": 1.9173560662515559, "learning_rate": 8.560040570430575e-06, "loss": 0.1015, "step": 2897 }, { "epoch": 2.7058823529411766, "grad_norm": 1.361384746870623, "learning_rate": 8.558978636370583e-06, "loss": 0.0789, "step": 2898 }, { "epoch": 2.7068160597572364, "grad_norm": 1.6159899336682824, "learning_rate": 8.55791637680291e-06, "loss": 0.0825, "step": 2899 }, { "epoch": 2.707749766573296, "grad_norm": 2.5706646256374635, "learning_rate": 8.556853791824711e-06, "loss": 0.103, "step": 2900 }, { "epoch": 2.708683473389356, "grad_norm": 1.7713289708128734, "learning_rate": 8.555790881533173e-06, "loss": 0.0574, "step": 2901 }, { "epoch": 2.7096171802054156, "grad_norm": 1.3697689641067565, "learning_rate": 8.554727646025509e-06, "loss": 0.0685, "step": 2902 }, { "epoch": 2.7105508870214754, "grad_norm": 1.982809575163461, "learning_rate": 8.55366408539896e-06, "loss": 0.1096, "step": 2903 }, { "epoch": 2.711484593837535, "grad_norm": 2.542930526108789, "learning_rate": 8.552600199750808e-06, "loss": 0.0944, "step": 2904 }, { "epoch": 2.712418300653595, "grad_norm": 2.3390608539239293, "learning_rate": 8.551535989178352e-06, "loss": 0.1303, "step": 2905 }, { "epoch": 2.7133520074696547, "grad_norm": 2.8652298102689926, "learning_rate": 8.550471453778926e-06, "loss": 0.1243, "step": 2906 }, { "epoch": 2.7142857142857144, "grad_norm": 0.7289866451545226, "learning_rate": 8.549406593649893e-06, "loss": 0.026, "step": 2907 }, { "epoch": 2.715219421101774, "grad_norm": 1.7521151199613545, "learning_rate": 8.54834140888865e-06, "loss": 0.0976, "step": 2908 }, { "epoch": 2.716153127917834, "grad_norm": 1.7712533040174658, "learning_rate": 8.547275899592616e-06, "loss": 0.1154, "step": 2909 }, { "epoch": 2.7170868347338937, "grad_norm": 1.9374068735717094, "learning_rate": 8.546210065859245e-06, "loss": 0.0662, "step": 2910 }, { "epoch": 2.7180205415499534, "grad_norm": 2.2200568253438218, "learning_rate": 8.545143907786018e-06, "loss": 0.1255, "step": 2911 }, { "epoch": 2.718954248366013, "grad_norm": 0.8656989937916445, "learning_rate": 8.544077425470447e-06, "loss": 0.0225, "step": 2912 }, { "epoch": 2.719887955182073, "grad_norm": 1.8297676689957285, "learning_rate": 8.543010619010077e-06, "loss": 0.0448, "step": 2913 }, { "epoch": 2.7208216619981327, "grad_norm": 4.205015066706102, "learning_rate": 8.541943488502474e-06, "loss": 0.1556, "step": 2914 }, { "epoch": 2.7217553688141924, "grad_norm": 2.281494364030364, "learning_rate": 8.540876034045242e-06, "loss": 0.1257, "step": 2915 }, { "epoch": 2.722689075630252, "grad_norm": 3.130282297810289, "learning_rate": 8.53980825573601e-06, "loss": 0.101, "step": 2916 }, { "epoch": 2.723622782446312, "grad_norm": 1.537039589667246, "learning_rate": 8.53874015367244e-06, "loss": 0.1354, "step": 2917 }, { "epoch": 2.7245564892623717, "grad_norm": 0.49121446841610517, "learning_rate": 8.537671727952221e-06, "loss": 0.0187, "step": 2918 }, { "epoch": 2.7254901960784315, "grad_norm": 1.6122550807088758, "learning_rate": 8.53660297867307e-06, "loss": 0.0908, "step": 2919 }, { "epoch": 2.726423902894491, "grad_norm": 0.9089169446073768, "learning_rate": 8.535533905932739e-06, "loss": 0.0256, "step": 2920 }, { "epoch": 2.727357609710551, "grad_norm": 2.149147645877394, "learning_rate": 8.534464509829003e-06, "loss": 0.105, "step": 2921 }, { "epoch": 2.7282913165266107, "grad_norm": 1.5301351291140923, "learning_rate": 8.533394790459673e-06, "loss": 0.0382, "step": 2922 }, { "epoch": 2.7292250233426705, "grad_norm": 2.3938505053494743, "learning_rate": 8.532324747922586e-06, "loss": 0.1733, "step": 2923 }, { "epoch": 2.7301587301587302, "grad_norm": 1.1913166178839043, "learning_rate": 8.531254382315609e-06, "loss": 0.0256, "step": 2924 }, { "epoch": 2.73109243697479, "grad_norm": 3.444846253893788, "learning_rate": 8.530183693736638e-06, "loss": 0.1904, "step": 2925 }, { "epoch": 2.7320261437908497, "grad_norm": 2.067825838243424, "learning_rate": 8.529112682283598e-06, "loss": 0.0411, "step": 2926 }, { "epoch": 2.7329598506069095, "grad_norm": 0.9712121266473109, "learning_rate": 8.528041348054448e-06, "loss": 0.033, "step": 2927 }, { "epoch": 2.7338935574229692, "grad_norm": 2.1772827447774397, "learning_rate": 8.52696969114717e-06, "loss": 0.1704, "step": 2928 }, { "epoch": 2.734827264239029, "grad_norm": 2.5404412860484085, "learning_rate": 8.525897711659783e-06, "loss": 0.1159, "step": 2929 }, { "epoch": 2.7357609710550888, "grad_norm": 2.6558421339409923, "learning_rate": 8.524825409690328e-06, "loss": 0.0313, "step": 2930 }, { "epoch": 2.7366946778711485, "grad_norm": 1.357937027625918, "learning_rate": 8.523752785336878e-06, "loss": 0.0466, "step": 2931 }, { "epoch": 2.7376283846872083, "grad_norm": 1.7167720174292844, "learning_rate": 8.52267983869754e-06, "loss": 0.1108, "step": 2932 }, { "epoch": 2.738562091503268, "grad_norm": 0.8269734207156697, "learning_rate": 8.521606569870443e-06, "loss": 0.0237, "step": 2933 }, { "epoch": 2.7394957983193278, "grad_norm": 3.3930820589569755, "learning_rate": 8.52053297895375e-06, "loss": 0.0543, "step": 2934 }, { "epoch": 2.7404295051353875, "grad_norm": 1.2986281230239924, "learning_rate": 8.519459066045656e-06, "loss": 0.0151, "step": 2935 }, { "epoch": 2.7413632119514473, "grad_norm": 1.8708865338817728, "learning_rate": 8.518384831244378e-06, "loss": 0.0846, "step": 2936 }, { "epoch": 2.742296918767507, "grad_norm": 2.2994477284947648, "learning_rate": 8.517310274648168e-06, "loss": 0.1198, "step": 2937 }, { "epoch": 2.743230625583567, "grad_norm": 0.7490476323369785, "learning_rate": 8.516235396355307e-06, "loss": 0.0233, "step": 2938 }, { "epoch": 2.7441643323996265, "grad_norm": 0.9563241469841852, "learning_rate": 8.515160196464103e-06, "loss": 0.0438, "step": 2939 }, { "epoch": 2.7450980392156863, "grad_norm": 0.4262687399974905, "learning_rate": 8.514084675072896e-06, "loss": 0.0072, "step": 2940 }, { "epoch": 2.746031746031746, "grad_norm": 1.5290282024544763, "learning_rate": 8.513008832280054e-06, "loss": 0.035, "step": 2941 }, { "epoch": 2.746965452847806, "grad_norm": 0.6002533199272082, "learning_rate": 8.511932668183973e-06, "loss": 0.0153, "step": 2942 }, { "epoch": 2.7478991596638656, "grad_norm": 2.50608584722948, "learning_rate": 8.510856182883083e-06, "loss": 0.1046, "step": 2943 }, { "epoch": 2.7488328664799253, "grad_norm": 2.3176936194265845, "learning_rate": 8.509779376475837e-06, "loss": 0.109, "step": 2944 }, { "epoch": 2.749766573295985, "grad_norm": 2.0455120282401102, "learning_rate": 8.508702249060723e-06, "loss": 0.1028, "step": 2945 }, { "epoch": 2.750700280112045, "grad_norm": 2.7066559123437215, "learning_rate": 8.507624800736256e-06, "loss": 0.1066, "step": 2946 }, { "epoch": 2.7516339869281046, "grad_norm": 2.4957046090982415, "learning_rate": 8.50654703160098e-06, "loss": 0.1823, "step": 2947 }, { "epoch": 2.7525676937441643, "grad_norm": 1.5904758834443695, "learning_rate": 8.505468941753468e-06, "loss": 0.0544, "step": 2948 }, { "epoch": 2.753501400560224, "grad_norm": 1.394319203787727, "learning_rate": 8.504390531292326e-06, "loss": 0.0495, "step": 2949 }, { "epoch": 2.754435107376284, "grad_norm": 0.9754332526422048, "learning_rate": 8.503311800316182e-06, "loss": 0.0368, "step": 2950 }, { "epoch": 2.7553688141923436, "grad_norm": 2.7545055005657892, "learning_rate": 8.502232748923703e-06, "loss": 0.2144, "step": 2951 }, { "epoch": 2.7563025210084033, "grad_norm": 1.462181471776769, "learning_rate": 8.501153377213577e-06, "loss": 0.0569, "step": 2952 }, { "epoch": 2.757236227824463, "grad_norm": 1.7364030330681437, "learning_rate": 8.500073685284525e-06, "loss": 0.1183, "step": 2953 }, { "epoch": 2.758169934640523, "grad_norm": 1.633931812050102, "learning_rate": 8.498993673235295e-06, "loss": 0.0738, "step": 2954 }, { "epoch": 2.7591036414565826, "grad_norm": 1.6024497129328819, "learning_rate": 8.49791334116467e-06, "loss": 0.0809, "step": 2955 }, { "epoch": 2.7600373482726424, "grad_norm": 1.5448685025462034, "learning_rate": 8.496832689171453e-06, "loss": 0.1113, "step": 2956 }, { "epoch": 2.760971055088702, "grad_norm": 3.8976678386027737, "learning_rate": 8.495751717354485e-06, "loss": 0.2254, "step": 2957 }, { "epoch": 2.761904761904762, "grad_norm": 0.5496116146403932, "learning_rate": 8.494670425812632e-06, "loss": 0.0095, "step": 2958 }, { "epoch": 2.7628384687208216, "grad_norm": 0.8306882979541125, "learning_rate": 8.493588814644791e-06, "loss": 0.0311, "step": 2959 }, { "epoch": 2.7637721755368814, "grad_norm": 4.552782429910309, "learning_rate": 8.492506883949884e-06, "loss": 0.1679, "step": 2960 }, { "epoch": 2.764705882352941, "grad_norm": 1.5498837121267315, "learning_rate": 8.49142463382687e-06, "loss": 0.0883, "step": 2961 }, { "epoch": 2.765639589169001, "grad_norm": 0.9031141339957067, "learning_rate": 8.49034206437473e-06, "loss": 0.0443, "step": 2962 }, { "epoch": 2.7665732959850606, "grad_norm": 2.2155100375729604, "learning_rate": 8.489259175692477e-06, "loss": 0.1269, "step": 2963 }, { "epoch": 2.7675070028011204, "grad_norm": 2.5602650160497586, "learning_rate": 8.488175967879153e-06, "loss": 0.1622, "step": 2964 }, { "epoch": 2.76844070961718, "grad_norm": 0.8582331292178506, "learning_rate": 8.48709244103383e-06, "loss": 0.0127, "step": 2965 }, { "epoch": 2.76937441643324, "grad_norm": 1.263721150807097, "learning_rate": 8.486008595255605e-06, "loss": 0.0528, "step": 2966 }, { "epoch": 2.7703081232492996, "grad_norm": 2.9475511409244284, "learning_rate": 8.484924430643615e-06, "loss": 0.1834, "step": 2967 }, { "epoch": 2.7712418300653594, "grad_norm": 3.0509777517121326, "learning_rate": 8.483839947297013e-06, "loss": 0.151, "step": 2968 }, { "epoch": 2.772175536881419, "grad_norm": 1.6108245242520494, "learning_rate": 8.482755145314987e-06, "loss": 0.0367, "step": 2969 }, { "epoch": 2.773109243697479, "grad_norm": 1.0959812501452038, "learning_rate": 8.481670024796756e-06, "loss": 0.0313, "step": 2970 }, { "epoch": 2.7740429505135387, "grad_norm": 1.1396604125476235, "learning_rate": 8.480584585841562e-06, "loss": 0.0463, "step": 2971 }, { "epoch": 2.7749766573295984, "grad_norm": 1.3076213917338195, "learning_rate": 8.47949882854869e-06, "loss": 0.0501, "step": 2972 }, { "epoch": 2.775910364145658, "grad_norm": 0.4522277337731009, "learning_rate": 8.478412753017433e-06, "loss": 0.0143, "step": 2973 }, { "epoch": 2.776844070961718, "grad_norm": 2.252859524301234, "learning_rate": 8.477326359347132e-06, "loss": 0.0535, "step": 2974 }, { "epoch": 2.7777777777777777, "grad_norm": 0.6328842994855683, "learning_rate": 8.476239647637145e-06, "loss": 0.0262, "step": 2975 }, { "epoch": 2.7787114845938374, "grad_norm": 1.8361924983303535, "learning_rate": 8.47515261798687e-06, "loss": 0.043, "step": 2976 }, { "epoch": 2.779645191409897, "grad_norm": 3.201961895525266, "learning_rate": 8.474065270495721e-06, "loss": 0.0518, "step": 2977 }, { "epoch": 2.780578898225957, "grad_norm": 0.17534846752790403, "learning_rate": 8.472977605263151e-06, "loss": 0.0023, "step": 2978 }, { "epoch": 2.7815126050420167, "grad_norm": 4.178451137459378, "learning_rate": 8.47188962238864e-06, "loss": 0.2172, "step": 2979 }, { "epoch": 2.7824463118580764, "grad_norm": 1.4732465467661757, "learning_rate": 8.470801321971694e-06, "loss": 0.0742, "step": 2980 }, { "epoch": 2.783380018674136, "grad_norm": 0.41068928256077314, "learning_rate": 8.46971270411185e-06, "loss": 0.0055, "step": 2981 }, { "epoch": 2.784313725490196, "grad_norm": 1.5035750838569895, "learning_rate": 8.468623768908674e-06, "loss": 0.0832, "step": 2982 }, { "epoch": 2.7852474323062557, "grad_norm": 1.5460790274412413, "learning_rate": 8.467534516461763e-06, "loss": 0.1119, "step": 2983 }, { "epoch": 2.7861811391223155, "grad_norm": 2.6983829470846645, "learning_rate": 8.46644494687074e-06, "loss": 0.1701, "step": 2984 }, { "epoch": 2.787114845938375, "grad_norm": 4.22214811935003, "learning_rate": 8.465355060235258e-06, "loss": 0.1967, "step": 2985 }, { "epoch": 2.788048552754435, "grad_norm": 7.993666378949524, "learning_rate": 8.464264856654999e-06, "loss": 0.133, "step": 2986 }, { "epoch": 2.7889822595704947, "grad_norm": 4.062262477643728, "learning_rate": 8.463174336229674e-06, "loss": 0.2421, "step": 2987 }, { "epoch": 2.7899159663865545, "grad_norm": 2.1532313096870017, "learning_rate": 8.462083499059022e-06, "loss": 0.1665, "step": 2988 }, { "epoch": 2.7908496732026142, "grad_norm": 0.5592316886387376, "learning_rate": 8.460992345242815e-06, "loss": 0.0212, "step": 2989 }, { "epoch": 2.791783380018674, "grad_norm": 0.9680516841862008, "learning_rate": 8.459900874880848e-06, "loss": 0.0277, "step": 2990 }, { "epoch": 2.7927170868347337, "grad_norm": 3.331034955538546, "learning_rate": 8.458809088072952e-06, "loss": 0.0583, "step": 2991 }, { "epoch": 2.7936507936507935, "grad_norm": 1.3726394386394893, "learning_rate": 8.457716984918978e-06, "loss": 0.0118, "step": 2992 }, { "epoch": 2.7945845004668532, "grad_norm": 1.5610912487316821, "learning_rate": 8.456624565518813e-06, "loss": 0.078, "step": 2993 }, { "epoch": 2.795518207282913, "grad_norm": 4.491922732225287, "learning_rate": 8.455531829972371e-06, "loss": 0.2047, "step": 2994 }, { "epoch": 2.7964519140989728, "grad_norm": 1.8352368873553486, "learning_rate": 8.454438778379594e-06, "loss": 0.0578, "step": 2995 }, { "epoch": 2.7973856209150325, "grad_norm": 2.3441505751792726, "learning_rate": 8.453345410840455e-06, "loss": 0.1505, "step": 2996 }, { "epoch": 2.7983193277310923, "grad_norm": 2.2211784293151977, "learning_rate": 8.452251727454953e-06, "loss": 0.1214, "step": 2997 }, { "epoch": 2.799253034547152, "grad_norm": 1.1521726723330534, "learning_rate": 8.451157728323116e-06, "loss": 0.0421, "step": 2998 }, { "epoch": 2.8001867413632118, "grad_norm": 1.290403052768959, "learning_rate": 8.450063413545006e-06, "loss": 0.0617, "step": 2999 }, { "epoch": 2.8011204481792715, "grad_norm": 3.5836337613985845, "learning_rate": 8.448968783220708e-06, "loss": 0.0958, "step": 3000 }, { "epoch": 2.8020541549953313, "grad_norm": 0.7019449760085281, "learning_rate": 8.447873837450339e-06, "loss": 0.015, "step": 3001 }, { "epoch": 2.802987861811391, "grad_norm": 4.249613768464821, "learning_rate": 8.44677857633404e-06, "loss": 0.054, "step": 3002 }, { "epoch": 2.803921568627451, "grad_norm": 1.8032063195018702, "learning_rate": 8.445682999971992e-06, "loss": 0.0339, "step": 3003 }, { "epoch": 2.8048552754435105, "grad_norm": 3.0786965968468483, "learning_rate": 8.444587108464388e-06, "loss": 0.0985, "step": 3004 }, { "epoch": 2.8057889822595703, "grad_norm": 4.548180339119334, "learning_rate": 8.443490901911467e-06, "loss": 0.2557, "step": 3005 }, { "epoch": 2.80672268907563, "grad_norm": 1.8211448432024289, "learning_rate": 8.442394380413487e-06, "loss": 0.0591, "step": 3006 }, { "epoch": 2.80765639589169, "grad_norm": 0.6199058875095458, "learning_rate": 8.441297544070734e-06, "loss": 0.0122, "step": 3007 }, { "epoch": 2.8085901027077496, "grad_norm": 1.0605403286437574, "learning_rate": 8.44020039298353e-06, "loss": 0.0646, "step": 3008 }, { "epoch": 2.8095238095238093, "grad_norm": 1.8702622688690878, "learning_rate": 8.439102927252221e-06, "loss": 0.0344, "step": 3009 }, { "epoch": 2.810457516339869, "grad_norm": 0.5187062756524372, "learning_rate": 8.438005146977179e-06, "loss": 0.0176, "step": 3010 }, { "epoch": 2.811391223155929, "grad_norm": 1.5546907013668232, "learning_rate": 8.436907052258809e-06, "loss": 0.0184, "step": 3011 }, { "epoch": 2.8123249299719886, "grad_norm": 2.1694104315819365, "learning_rate": 8.435808643197547e-06, "loss": 0.1033, "step": 3012 }, { "epoch": 2.8132586367880483, "grad_norm": 3.749069321999132, "learning_rate": 8.434709919893853e-06, "loss": 0.292, "step": 3013 }, { "epoch": 2.814192343604108, "grad_norm": 3.641704192002793, "learning_rate": 8.433610882448215e-06, "loss": 0.0636, "step": 3014 }, { "epoch": 2.815126050420168, "grad_norm": 2.3767037118907646, "learning_rate": 8.432511530961154e-06, "loss": 0.0992, "step": 3015 }, { "epoch": 2.8160597572362276, "grad_norm": 0.9010908466421831, "learning_rate": 8.431411865533218e-06, "loss": 0.0201, "step": 3016 }, { "epoch": 2.8169934640522873, "grad_norm": 2.355725892329722, "learning_rate": 8.430311886264982e-06, "loss": 0.1436, "step": 3017 }, { "epoch": 2.817927170868347, "grad_norm": 2.5266601206928194, "learning_rate": 8.429211593257054e-06, "loss": 0.1742, "step": 3018 }, { "epoch": 2.818860877684407, "grad_norm": 3.819245484989854, "learning_rate": 8.428110986610064e-06, "loss": 0.2887, "step": 3019 }, { "epoch": 2.8197945845004666, "grad_norm": 2.1594554656640064, "learning_rate": 8.427010066424678e-06, "loss": 0.1343, "step": 3020 }, { "epoch": 2.8207282913165264, "grad_norm": 1.9833554004856468, "learning_rate": 8.425908832801586e-06, "loss": 0.1117, "step": 3021 }, { "epoch": 2.821661998132586, "grad_norm": 3.174246024339709, "learning_rate": 8.424807285841507e-06, "loss": 0.1217, "step": 3022 }, { "epoch": 2.822595704948646, "grad_norm": 0.8181649013427434, "learning_rate": 8.42370542564519e-06, "loss": 0.0212, "step": 3023 }, { "epoch": 2.8235294117647056, "grad_norm": 3.3188338197407545, "learning_rate": 8.422603252313413e-06, "loss": 0.0544, "step": 3024 }, { "epoch": 2.8244631185807654, "grad_norm": 0.8451053142876624, "learning_rate": 8.421500765946984e-06, "loss": 0.024, "step": 3025 }, { "epoch": 2.825396825396825, "grad_norm": 1.8647804445691487, "learning_rate": 8.420397966646732e-06, "loss": 0.0519, "step": 3026 }, { "epoch": 2.826330532212885, "grad_norm": 1.604070822251006, "learning_rate": 8.419294854513522e-06, "loss": 0.0674, "step": 3027 }, { "epoch": 2.8272642390289446, "grad_norm": 1.539281384165358, "learning_rate": 8.418191429648249e-06, "loss": 0.0465, "step": 3028 }, { "epoch": 2.828197945845005, "grad_norm": 1.5143885815748273, "learning_rate": 8.41708769215183e-06, "loss": 0.0726, "step": 3029 }, { "epoch": 2.8291316526610646, "grad_norm": 2.4907377312296806, "learning_rate": 8.415983642125215e-06, "loss": 0.1325, "step": 3030 }, { "epoch": 2.8300653594771243, "grad_norm": 1.672281344359541, "learning_rate": 8.414879279669383e-06, "loss": 0.0395, "step": 3031 }, { "epoch": 2.830999066293184, "grad_norm": 0.871239080177563, "learning_rate": 8.413774604885338e-06, "loss": 0.0318, "step": 3032 }, { "epoch": 2.831932773109244, "grad_norm": 0.7032010042211737, "learning_rate": 8.412669617874114e-06, "loss": 0.0143, "step": 3033 }, { "epoch": 2.8328664799253036, "grad_norm": 0.8884322191533818, "learning_rate": 8.411564318736778e-06, "loss": 0.0267, "step": 3034 }, { "epoch": 2.8338001867413634, "grad_norm": 2.8314968784868637, "learning_rate": 8.410458707574416e-06, "loss": 0.2306, "step": 3035 }, { "epoch": 2.834733893557423, "grad_norm": 1.521526013194695, "learning_rate": 8.409352784488155e-06, "loss": 0.0309, "step": 3036 }, { "epoch": 2.835667600373483, "grad_norm": 2.125961971063801, "learning_rate": 8.408246549579135e-06, "loss": 0.0568, "step": 3037 }, { "epoch": 2.8366013071895426, "grad_norm": 3.142034181565169, "learning_rate": 8.407140002948543e-06, "loss": 0.1518, "step": 3038 }, { "epoch": 2.8375350140056024, "grad_norm": 1.1912258004564666, "learning_rate": 8.406033144697579e-06, "loss": 0.0591, "step": 3039 }, { "epoch": 2.838468720821662, "grad_norm": 4.010974649130293, "learning_rate": 8.404925974927478e-06, "loss": 0.1836, "step": 3040 }, { "epoch": 2.839402427637722, "grad_norm": 2.869964971062339, "learning_rate": 8.403818493739505e-06, "loss": 0.1768, "step": 3041 }, { "epoch": 2.8403361344537816, "grad_norm": 1.0475632874179963, "learning_rate": 8.402710701234949e-06, "loss": 0.0627, "step": 3042 }, { "epoch": 2.8412698412698414, "grad_norm": 0.7423926799931994, "learning_rate": 8.40160259751513e-06, "loss": 0.0174, "step": 3043 }, { "epoch": 2.842203548085901, "grad_norm": 2.283930563136134, "learning_rate": 8.400494182681399e-06, "loss": 0.1029, "step": 3044 }, { "epoch": 2.843137254901961, "grad_norm": 1.3281587440756348, "learning_rate": 8.399385456835128e-06, "loss": 0.0306, "step": 3045 }, { "epoch": 2.8440709617180207, "grad_norm": 3.5172194212399757, "learning_rate": 8.398276420077726e-06, "loss": 0.1952, "step": 3046 }, { "epoch": 2.8450046685340804, "grad_norm": 1.2528610667522189, "learning_rate": 8.397167072510626e-06, "loss": 0.0689, "step": 3047 }, { "epoch": 2.84593837535014, "grad_norm": 0.404650893143994, "learning_rate": 8.39605741423529e-06, "loss": 0.0204, "step": 3048 }, { "epoch": 2.8468720821662, "grad_norm": 3.5616582123938323, "learning_rate": 8.394947445353207e-06, "loss": 0.1344, "step": 3049 }, { "epoch": 2.8478057889822597, "grad_norm": 4.612279272125597, "learning_rate": 8.393837165965896e-06, "loss": 0.2601, "step": 3050 }, { "epoch": 2.8487394957983194, "grad_norm": 3.1629047537612434, "learning_rate": 8.392726576174906e-06, "loss": 0.2268, "step": 3051 }, { "epoch": 2.849673202614379, "grad_norm": 2.8571383416199, "learning_rate": 8.39161567608181e-06, "loss": 0.1643, "step": 3052 }, { "epoch": 2.850606909430439, "grad_norm": 2.5508511693953766, "learning_rate": 8.390504465788217e-06, "loss": 0.1468, "step": 3053 }, { "epoch": 2.8515406162464987, "grad_norm": 3.7339590759990635, "learning_rate": 8.389392945395754e-06, "loss": 0.1719, "step": 3054 }, { "epoch": 2.8524743230625584, "grad_norm": 1.0664642244842175, "learning_rate": 8.388281115006083e-06, "loss": 0.0376, "step": 3055 }, { "epoch": 2.853408029878618, "grad_norm": 2.890915874225269, "learning_rate": 8.387168974720895e-06, "loss": 0.136, "step": 3056 }, { "epoch": 2.854341736694678, "grad_norm": 1.1112848489798637, "learning_rate": 8.386056524641905e-06, "loss": 0.0378, "step": 3057 }, { "epoch": 2.8552754435107377, "grad_norm": 1.2336952832977381, "learning_rate": 8.384943764870859e-06, "loss": 0.0554, "step": 3058 }, { "epoch": 2.8562091503267975, "grad_norm": 2.175807935649943, "learning_rate": 8.383830695509535e-06, "loss": 0.0592, "step": 3059 }, { "epoch": 2.857142857142857, "grad_norm": 1.675249195198841, "learning_rate": 8.38271731665973e-06, "loss": 0.0968, "step": 3060 }, { "epoch": 2.858076563958917, "grad_norm": 2.3095097088851553, "learning_rate": 8.38160362842328e-06, "loss": 0.1372, "step": 3061 }, { "epoch": 2.8590102707749767, "grad_norm": 1.9170957435420894, "learning_rate": 8.380489630902039e-06, "loss": 0.087, "step": 3062 }, { "epoch": 2.8599439775910365, "grad_norm": 2.2066946355985335, "learning_rate": 8.379375324197898e-06, "loss": 0.0319, "step": 3063 }, { "epoch": 2.860877684407096, "grad_norm": 0.5768797106896227, "learning_rate": 8.378260708412771e-06, "loss": 0.0143, "step": 3064 }, { "epoch": 2.861811391223156, "grad_norm": 0.6472874175349355, "learning_rate": 8.3771457836486e-06, "loss": 0.0155, "step": 3065 }, { "epoch": 2.8627450980392157, "grad_norm": 1.6832486681234824, "learning_rate": 8.37603055000736e-06, "loss": 0.0893, "step": 3066 }, { "epoch": 2.8636788048552755, "grad_norm": 1.5532730036199078, "learning_rate": 8.374915007591053e-06, "loss": 0.0316, "step": 3067 }, { "epoch": 2.8646125116713352, "grad_norm": 5.078266579794226, "learning_rate": 8.373799156501704e-06, "loss": 0.2129, "step": 3068 }, { "epoch": 2.865546218487395, "grad_norm": 1.3445970953764574, "learning_rate": 8.37268299684137e-06, "loss": 0.0476, "step": 3069 }, { "epoch": 2.8664799253034547, "grad_norm": 1.7409406654745292, "learning_rate": 8.371566528712138e-06, "loss": 0.1191, "step": 3070 }, { "epoch": 2.8674136321195145, "grad_norm": 1.8596796372180213, "learning_rate": 8.37044975221612e-06, "loss": 0.0863, "step": 3071 }, { "epoch": 2.8683473389355743, "grad_norm": 2.5798090168484227, "learning_rate": 8.369332667455458e-06, "loss": 0.1791, "step": 3072 }, { "epoch": 2.869281045751634, "grad_norm": 2.44626180429261, "learning_rate": 8.368215274532321e-06, "loss": 0.1652, "step": 3073 }, { "epoch": 2.8702147525676938, "grad_norm": 2.061064853332479, "learning_rate": 8.367097573548907e-06, "loss": 0.089, "step": 3074 }, { "epoch": 2.8711484593837535, "grad_norm": 0.8651250347550168, "learning_rate": 8.365979564607443e-06, "loss": 0.0311, "step": 3075 }, { "epoch": 2.8720821661998133, "grad_norm": 1.2799078052415571, "learning_rate": 8.364861247810182e-06, "loss": 0.0592, "step": 3076 }, { "epoch": 2.873015873015873, "grad_norm": 1.0337613111380206, "learning_rate": 8.363742623259407e-06, "loss": 0.0378, "step": 3077 }, { "epoch": 2.8739495798319328, "grad_norm": 2.0678633615623165, "learning_rate": 8.362623691057429e-06, "loss": 0.1407, "step": 3078 }, { "epoch": 2.8748832866479925, "grad_norm": 0.898059125780986, "learning_rate": 8.361504451306585e-06, "loss": 0.0231, "step": 3079 }, { "epoch": 2.8758169934640523, "grad_norm": 1.3797769443149726, "learning_rate": 8.360384904109243e-06, "loss": 0.0718, "step": 3080 }, { "epoch": 2.876750700280112, "grad_norm": 2.2694564897889715, "learning_rate": 8.359265049567798e-06, "loss": 0.2158, "step": 3081 }, { "epoch": 2.877684407096172, "grad_norm": 2.37309244373018, "learning_rate": 8.358144887784672e-06, "loss": 0.1661, "step": 3082 }, { "epoch": 2.8786181139122315, "grad_norm": 4.316584573124775, "learning_rate": 8.357024418862318e-06, "loss": 0.2363, "step": 3083 }, { "epoch": 2.8795518207282913, "grad_norm": 1.2319085961216931, "learning_rate": 8.355903642903213e-06, "loss": 0.0559, "step": 3084 }, { "epoch": 2.880485527544351, "grad_norm": 0.564119130833204, "learning_rate": 8.354782560009866e-06, "loss": 0.0232, "step": 3085 }, { "epoch": 2.881419234360411, "grad_norm": 1.3949910347265058, "learning_rate": 8.35366117028481e-06, "loss": 0.0305, "step": 3086 }, { "epoch": 2.8823529411764706, "grad_norm": 3.4443474999248327, "learning_rate": 8.352539473830612e-06, "loss": 0.1533, "step": 3087 }, { "epoch": 2.8832866479925303, "grad_norm": 2.3016393222960985, "learning_rate": 8.351417470749859e-06, "loss": 0.1113, "step": 3088 }, { "epoch": 2.88422035480859, "grad_norm": 1.1067060640111737, "learning_rate": 8.350295161145176e-06, "loss": 0.0354, "step": 3089 }, { "epoch": 2.88515406162465, "grad_norm": 1.7810877604888595, "learning_rate": 8.349172545119203e-06, "loss": 0.0899, "step": 3090 }, { "epoch": 2.8860877684407096, "grad_norm": 4.962020730581279, "learning_rate": 8.348049622774626e-06, "loss": 0.1717, "step": 3091 }, { "epoch": 2.8870214752567693, "grad_norm": 1.00436818525958, "learning_rate": 8.346926394214139e-06, "loss": 0.0498, "step": 3092 }, { "epoch": 2.887955182072829, "grad_norm": 3.1599928727737994, "learning_rate": 8.345802859540477e-06, "loss": 0.0263, "step": 3093 }, { "epoch": 2.888888888888889, "grad_norm": 1.9597520684896292, "learning_rate": 8.344679018856401e-06, "loss": 0.0696, "step": 3094 }, { "epoch": 2.8898225957049486, "grad_norm": 1.5440190921548043, "learning_rate": 8.343554872264697e-06, "loss": 0.083, "step": 3095 }, { "epoch": 2.8907563025210083, "grad_norm": 1.481668902273783, "learning_rate": 8.34243041986818e-06, "loss": 0.0897, "step": 3096 }, { "epoch": 2.891690009337068, "grad_norm": 1.276368041976225, "learning_rate": 8.341305661769696e-06, "loss": 0.06, "step": 3097 }, { "epoch": 2.892623716153128, "grad_norm": 2.5335048223691885, "learning_rate": 8.340180598072112e-06, "loss": 0.136, "step": 3098 }, { "epoch": 2.8935574229691876, "grad_norm": 1.461711527587458, "learning_rate": 8.339055228878333e-06, "loss": 0.0701, "step": 3099 }, { "epoch": 2.8944911297852474, "grad_norm": 2.2934557559666424, "learning_rate": 8.337929554291282e-06, "loss": 0.1025, "step": 3100 }, { "epoch": 2.895424836601307, "grad_norm": 1.919604713836472, "learning_rate": 8.336803574413919e-06, "loss": 0.1005, "step": 3101 }, { "epoch": 2.896358543417367, "grad_norm": 0.55296467467024, "learning_rate": 8.335677289349222e-06, "loss": 0.0122, "step": 3102 }, { "epoch": 2.8972922502334266, "grad_norm": 1.2892511896481926, "learning_rate": 8.334550699200204e-06, "loss": 0.0256, "step": 3103 }, { "epoch": 2.8982259570494864, "grad_norm": 2.3111087021220387, "learning_rate": 8.333423804069905e-06, "loss": 0.1198, "step": 3104 }, { "epoch": 2.899159663865546, "grad_norm": 1.4002816008543837, "learning_rate": 8.332296604061391e-06, "loss": 0.0674, "step": 3105 }, { "epoch": 2.900093370681606, "grad_norm": 1.0850436994963895, "learning_rate": 8.331169099277758e-06, "loss": 0.052, "step": 3106 }, { "epoch": 2.9010270774976656, "grad_norm": 0.5376166023809623, "learning_rate": 8.330041289822129e-06, "loss": 0.0196, "step": 3107 }, { "epoch": 2.9019607843137254, "grad_norm": 1.9443927295892014, "learning_rate": 8.328913175797651e-06, "loss": 0.0922, "step": 3108 }, { "epoch": 2.902894491129785, "grad_norm": 1.4716464150293997, "learning_rate": 8.327784757307506e-06, "loss": 0.0812, "step": 3109 }, { "epoch": 2.903828197945845, "grad_norm": 1.7706418858985564, "learning_rate": 8.326656034454899e-06, "loss": 0.0826, "step": 3110 }, { "epoch": 2.9047619047619047, "grad_norm": 0.6602766570683828, "learning_rate": 8.325527007343063e-06, "loss": 0.0077, "step": 3111 }, { "epoch": 2.9056956115779644, "grad_norm": 1.2620149568736632, "learning_rate": 8.324397676075262e-06, "loss": 0.0723, "step": 3112 }, { "epoch": 2.906629318394024, "grad_norm": 18.502536361692236, "learning_rate": 8.323268040754787e-06, "loss": 0.0446, "step": 3113 }, { "epoch": 2.907563025210084, "grad_norm": 1.5992497896860431, "learning_rate": 8.32213810148495e-06, "loss": 0.0709, "step": 3114 }, { "epoch": 2.9084967320261437, "grad_norm": 1.4071918431215367, "learning_rate": 8.3210078583691e-06, "loss": 0.0681, "step": 3115 }, { "epoch": 2.9094304388422034, "grad_norm": 1.5594558233874944, "learning_rate": 8.319877311510614e-06, "loss": 0.0472, "step": 3116 }, { "epoch": 2.910364145658263, "grad_norm": 2.3138591039388654, "learning_rate": 8.318746461012884e-06, "loss": 0.0514, "step": 3117 }, { "epoch": 2.911297852474323, "grad_norm": 1.256721700693901, "learning_rate": 8.317615306979347e-06, "loss": 0.026, "step": 3118 }, { "epoch": 2.9122315592903827, "grad_norm": 1.281367938303327, "learning_rate": 8.316483849513455e-06, "loss": 0.0378, "step": 3119 }, { "epoch": 2.9131652661064424, "grad_norm": 2.2812320699897763, "learning_rate": 8.315352088718692e-06, "loss": 0.092, "step": 3120 }, { "epoch": 2.914098972922502, "grad_norm": 2.4063611130951346, "learning_rate": 8.314220024698572e-06, "loss": 0.1363, "step": 3121 }, { "epoch": 2.915032679738562, "grad_norm": 3.43928539563177, "learning_rate": 8.313087657556633e-06, "loss": 0.1789, "step": 3122 }, { "epoch": 2.9159663865546217, "grad_norm": 4.6436747900982835, "learning_rate": 8.311954987396444e-06, "loss": 0.262, "step": 3123 }, { "epoch": 2.9169000933706815, "grad_norm": 1.73045584744296, "learning_rate": 8.3108220143216e-06, "loss": 0.0352, "step": 3124 }, { "epoch": 2.917833800186741, "grad_norm": 2.664669764264585, "learning_rate": 8.309688738435724e-06, "loss": 0.1543, "step": 3125 }, { "epoch": 2.918767507002801, "grad_norm": 0.5399372214597608, "learning_rate": 8.308555159842463e-06, "loss": 0.017, "step": 3126 }, { "epoch": 2.9197012138188607, "grad_norm": 4.443019473250218, "learning_rate": 8.307421278645502e-06, "loss": 0.2096, "step": 3127 }, { "epoch": 2.9206349206349205, "grad_norm": 2.1749254607508477, "learning_rate": 8.30628709494854e-06, "loss": 0.0531, "step": 3128 }, { "epoch": 2.9215686274509802, "grad_norm": 0.8263756688223984, "learning_rate": 8.305152608855314e-06, "loss": 0.038, "step": 3129 }, { "epoch": 2.9225023342670404, "grad_norm": 1.933779525353145, "learning_rate": 8.304017820469583e-06, "loss": 0.1043, "step": 3130 }, { "epoch": 2.9234360410831, "grad_norm": 0.7228033005172462, "learning_rate": 8.30288272989514e-06, "loss": 0.0331, "step": 3131 }, { "epoch": 2.92436974789916, "grad_norm": 1.3039960106870587, "learning_rate": 8.301747337235798e-06, "loss": 0.0362, "step": 3132 }, { "epoch": 2.9253034547152197, "grad_norm": 2.510670114283528, "learning_rate": 8.300611642595403e-06, "loss": 0.1842, "step": 3133 }, { "epoch": 2.9262371615312794, "grad_norm": 1.0830927389190297, "learning_rate": 8.299475646077824e-06, "loss": 0.0734, "step": 3134 }, { "epoch": 2.927170868347339, "grad_norm": 0.823071056884329, "learning_rate": 8.298339347786963e-06, "loss": 0.0285, "step": 3135 }, { "epoch": 2.928104575163399, "grad_norm": 1.8219234458097062, "learning_rate": 8.297202747826747e-06, "loss": 0.083, "step": 3136 }, { "epoch": 2.9290382819794587, "grad_norm": 2.2245636108950775, "learning_rate": 8.296065846301129e-06, "loss": 0.1228, "step": 3137 }, { "epoch": 2.9299719887955185, "grad_norm": 1.1774396192499321, "learning_rate": 8.29492864331409e-06, "loss": 0.0293, "step": 3138 }, { "epoch": 2.930905695611578, "grad_norm": 1.5777882580102158, "learning_rate": 8.293791138969645e-06, "loss": 0.1008, "step": 3139 }, { "epoch": 2.931839402427638, "grad_norm": 2.682028126024789, "learning_rate": 8.292653333371827e-06, "loss": 0.1131, "step": 3140 }, { "epoch": 2.9327731092436977, "grad_norm": 3.2730665698158967, "learning_rate": 8.2915152266247e-06, "loss": 0.3074, "step": 3141 }, { "epoch": 2.9337068160597575, "grad_norm": 1.5743952456153336, "learning_rate": 8.29037681883236e-06, "loss": 0.0946, "step": 3142 }, { "epoch": 2.9346405228758172, "grad_norm": 1.6484166776602245, "learning_rate": 8.289238110098922e-06, "loss": 0.1364, "step": 3143 }, { "epoch": 2.935574229691877, "grad_norm": 2.447325546548022, "learning_rate": 8.28809910052854e-06, "loss": 0.0739, "step": 3144 }, { "epoch": 2.9365079365079367, "grad_norm": 1.8572616465210041, "learning_rate": 8.286959790225382e-06, "loss": 0.1787, "step": 3145 }, { "epoch": 2.9374416433239965, "grad_norm": 0.6617094673636362, "learning_rate": 8.285820179293655e-06, "loss": 0.0111, "step": 3146 }, { "epoch": 2.9383753501400562, "grad_norm": 5.657799388220863, "learning_rate": 8.284680267837589e-06, "loss": 0.147, "step": 3147 }, { "epoch": 2.939309056956116, "grad_norm": 3.3973688358480616, "learning_rate": 8.283540055961438e-06, "loss": 0.2274, "step": 3148 }, { "epoch": 2.9402427637721757, "grad_norm": 5.046113997220839, "learning_rate": 8.28239954376949e-06, "loss": 0.1895, "step": 3149 }, { "epoch": 2.9411764705882355, "grad_norm": 2.6330532360338936, "learning_rate": 8.281258731366055e-06, "loss": 0.1531, "step": 3150 }, { "epoch": 2.9421101774042953, "grad_norm": 0.5663833653147645, "learning_rate": 8.280117618855475e-06, "loss": 0.0096, "step": 3151 }, { "epoch": 2.943043884220355, "grad_norm": 1.3743406748555753, "learning_rate": 8.278976206342118e-06, "loss": 0.0622, "step": 3152 }, { "epoch": 2.9439775910364148, "grad_norm": 1.7723518284705286, "learning_rate": 8.277834493930375e-06, "loss": 0.1293, "step": 3153 }, { "epoch": 2.9449112978524745, "grad_norm": 2.2808824462571677, "learning_rate": 8.27669248172467e-06, "loss": 0.0961, "step": 3154 }, { "epoch": 2.9458450046685343, "grad_norm": 5.734365446066498, "learning_rate": 8.275550169829454e-06, "loss": 0.3723, "step": 3155 }, { "epoch": 2.946778711484594, "grad_norm": 3.226701400487213, "learning_rate": 8.274407558349202e-06, "loss": 0.1368, "step": 3156 }, { "epoch": 2.947712418300654, "grad_norm": 2.029182752326021, "learning_rate": 8.27326464738842e-06, "loss": 0.1137, "step": 3157 }, { "epoch": 2.9486461251167135, "grad_norm": 2.0320308972058596, "learning_rate": 8.272121437051639e-06, "loss": 0.1147, "step": 3158 }, { "epoch": 2.9495798319327733, "grad_norm": 0.9963000637165156, "learning_rate": 8.270977927443418e-06, "loss": 0.0387, "step": 3159 }, { "epoch": 2.950513538748833, "grad_norm": 2.4612377565579417, "learning_rate": 8.269834118668343e-06, "loss": 0.1109, "step": 3160 }, { "epoch": 2.951447245564893, "grad_norm": 2.4056483321060904, "learning_rate": 8.26869001083103e-06, "loss": 0.1198, "step": 3161 }, { "epoch": 2.9523809523809526, "grad_norm": 1.750913107992089, "learning_rate": 8.267545604036115e-06, "loss": 0.0695, "step": 3162 }, { "epoch": 2.9533146591970123, "grad_norm": 2.0880094811907575, "learning_rate": 8.266400898388273e-06, "loss": 0.0359, "step": 3163 }, { "epoch": 2.954248366013072, "grad_norm": 2.593287829762683, "learning_rate": 8.265255893992195e-06, "loss": 0.1892, "step": 3164 }, { "epoch": 2.955182072829132, "grad_norm": 2.909266206377948, "learning_rate": 8.264110590952609e-06, "loss": 0.1344, "step": 3165 }, { "epoch": 2.9561157796451916, "grad_norm": 1.848274830801766, "learning_rate": 8.262964989374261e-06, "loss": 0.0956, "step": 3166 }, { "epoch": 2.9570494864612513, "grad_norm": 3.7544485455923353, "learning_rate": 8.261819089361932e-06, "loss": 0.1883, "step": 3167 }, { "epoch": 2.957983193277311, "grad_norm": 3.8027572295425434, "learning_rate": 8.260672891020424e-06, "loss": 0.1312, "step": 3168 }, { "epoch": 2.958916900093371, "grad_norm": 3.2255806574826273, "learning_rate": 8.259526394454574e-06, "loss": 0.0945, "step": 3169 }, { "epoch": 2.9598506069094306, "grad_norm": 1.053482170456347, "learning_rate": 8.258379599769239e-06, "loss": 0.036, "step": 3170 }, { "epoch": 2.9607843137254903, "grad_norm": 4.70078795744482, "learning_rate": 8.257232507069303e-06, "loss": 0.1433, "step": 3171 }, { "epoch": 2.96171802054155, "grad_norm": 4.30388017068285, "learning_rate": 8.256085116459686e-06, "loss": 0.1457, "step": 3172 }, { "epoch": 2.96265172735761, "grad_norm": 2.723035727522001, "learning_rate": 8.254937428045324e-06, "loss": 0.1604, "step": 3173 }, { "epoch": 2.9635854341736696, "grad_norm": 2.4766735916133036, "learning_rate": 8.25378944193119e-06, "loss": 0.1267, "step": 3174 }, { "epoch": 2.9645191409897294, "grad_norm": 4.714636963542335, "learning_rate": 8.252641158222278e-06, "loss": 0.183, "step": 3175 }, { "epoch": 2.965452847805789, "grad_norm": 3.946284080748357, "learning_rate": 8.251492577023611e-06, "loss": 0.1044, "step": 3176 }, { "epoch": 2.966386554621849, "grad_norm": 1.0951012788778494, "learning_rate": 8.250343698440241e-06, "loss": 0.0405, "step": 3177 }, { "epoch": 2.9673202614379086, "grad_norm": 6.371976536669386, "learning_rate": 8.249194522577244e-06, "loss": 0.2807, "step": 3178 }, { "epoch": 2.9682539682539684, "grad_norm": 0.6081607105593498, "learning_rate": 8.248045049539726e-06, "loss": 0.025, "step": 3179 }, { "epoch": 2.969187675070028, "grad_norm": 1.4871206012650857, "learning_rate": 8.246895279432815e-06, "loss": 0.0774, "step": 3180 }, { "epoch": 2.970121381886088, "grad_norm": 0.7763065058494584, "learning_rate": 8.245745212361676e-06, "loss": 0.0224, "step": 3181 }, { "epoch": 2.9710550887021476, "grad_norm": 2.8428947952727652, "learning_rate": 8.244594848431493e-06, "loss": 0.1307, "step": 3182 }, { "epoch": 2.9719887955182074, "grad_norm": 4.707092791759024, "learning_rate": 8.243444187747477e-06, "loss": 0.2329, "step": 3183 }, { "epoch": 2.972922502334267, "grad_norm": 1.7343851467716165, "learning_rate": 8.242293230414873e-06, "loss": 0.0701, "step": 3184 }, { "epoch": 2.973856209150327, "grad_norm": 1.2234324846396245, "learning_rate": 8.241141976538944e-06, "loss": 0.0428, "step": 3185 }, { "epoch": 2.9747899159663866, "grad_norm": 1.3009668919049306, "learning_rate": 8.239990426224987e-06, "loss": 0.0678, "step": 3186 }, { "epoch": 2.9757236227824464, "grad_norm": 5.157749952127598, "learning_rate": 8.238838579578324e-06, "loss": 0.2992, "step": 3187 }, { "epoch": 2.976657329598506, "grad_norm": 1.3902953772577955, "learning_rate": 8.237686436704301e-06, "loss": 0.0642, "step": 3188 }, { "epoch": 2.977591036414566, "grad_norm": 2.6795346840777934, "learning_rate": 8.236533997708303e-06, "loss": 0.132, "step": 3189 }, { "epoch": 2.9785247432306257, "grad_norm": 0.6962769013355081, "learning_rate": 8.235381262695722e-06, "loss": 0.0133, "step": 3190 }, { "epoch": 2.9794584500466854, "grad_norm": 2.458930228579265, "learning_rate": 8.234228231771997e-06, "loss": 0.0274, "step": 3191 }, { "epoch": 2.980392156862745, "grad_norm": 2.5494729446836315, "learning_rate": 8.23307490504258e-06, "loss": 0.1104, "step": 3192 }, { "epoch": 2.981325863678805, "grad_norm": 2.3899307554092606, "learning_rate": 8.231921282612957e-06, "loss": 0.1267, "step": 3193 }, { "epoch": 2.9822595704948647, "grad_norm": 1.5284722964551682, "learning_rate": 8.23076736458864e-06, "loss": 0.0401, "step": 3194 }, { "epoch": 2.9831932773109244, "grad_norm": 1.4628728483216136, "learning_rate": 8.229613151075166e-06, "loss": 0.0243, "step": 3195 }, { "epoch": 2.984126984126984, "grad_norm": 1.013135197898966, "learning_rate": 8.228458642178103e-06, "loss": 0.0318, "step": 3196 }, { "epoch": 2.985060690943044, "grad_norm": 0.546069878184344, "learning_rate": 8.227303838003042e-06, "loss": 0.013, "step": 3197 }, { "epoch": 2.9859943977591037, "grad_norm": 2.891256119762568, "learning_rate": 8.226148738655602e-06, "loss": 0.1005, "step": 3198 }, { "epoch": 2.9869281045751634, "grad_norm": 2.8933626857678325, "learning_rate": 8.224993344241433e-06, "loss": 0.143, "step": 3199 }, { "epoch": 2.987861811391223, "grad_norm": 1.064914997528891, "learning_rate": 8.223837654866202e-06, "loss": 0.0265, "step": 3200 }, { "epoch": 2.988795518207283, "grad_norm": 0.47071568824904186, "learning_rate": 8.222681670635615e-06, "loss": 0.0068, "step": 3201 }, { "epoch": 2.9897292250233427, "grad_norm": 2.808717549969289, "learning_rate": 8.221525391655398e-06, "loss": 0.1297, "step": 3202 }, { "epoch": 2.9906629318394025, "grad_norm": 1.1039175798601955, "learning_rate": 8.220368818031304e-06, "loss": 0.0213, "step": 3203 }, { "epoch": 2.991596638655462, "grad_norm": 1.485999912976903, "learning_rate": 8.219211949869116e-06, "loss": 0.0444, "step": 3204 }, { "epoch": 2.992530345471522, "grad_norm": 2.5648450559358085, "learning_rate": 8.21805478727464e-06, "loss": 0.0308, "step": 3205 }, { "epoch": 2.9934640522875817, "grad_norm": 2.141086608936535, "learning_rate": 8.216897330353715e-06, "loss": 0.0748, "step": 3206 }, { "epoch": 2.9943977591036415, "grad_norm": 1.2983569507878898, "learning_rate": 8.2157395792122e-06, "loss": 0.0836, "step": 3207 }, { "epoch": 2.9953314659197012, "grad_norm": 2.9394586403788576, "learning_rate": 8.214581533955986e-06, "loss": 0.1283, "step": 3208 }, { "epoch": 2.996265172735761, "grad_norm": 2.5771532203322556, "learning_rate": 8.213423194690988e-06, "loss": 0.162, "step": 3209 }, { "epoch": 2.9971988795518207, "grad_norm": 2.7574685967850296, "learning_rate": 8.21226456152315e-06, "loss": 0.1211, "step": 3210 }, { "epoch": 2.9981325863678805, "grad_norm": 5.068979858543916, "learning_rate": 8.211105634558438e-06, "loss": 0.1083, "step": 3211 }, { "epoch": 2.9990662931839402, "grad_norm": 0.9457938079287451, "learning_rate": 8.209946413902851e-06, "loss": 0.0384, "step": 3212 }, { "epoch": 3.0, "grad_norm": 1.4009828160546194, "learning_rate": 8.208786899662415e-06, "loss": 0.0874, "step": 3213 }, { "epoch": 3.0009337068160598, "grad_norm": 3.4579930624271964, "learning_rate": 8.207627091943178e-06, "loss": 0.0951, "step": 3214 }, { "epoch": 3.0018674136321195, "grad_norm": 1.0741561824936685, "learning_rate": 8.206466990851216e-06, "loss": 0.0389, "step": 3215 }, { "epoch": 3.0028011204481793, "grad_norm": 2.0712729012619255, "learning_rate": 8.205306596492633e-06, "loss": 0.0831, "step": 3216 }, { "epoch": 3.003734827264239, "grad_norm": 1.1912590557850244, "learning_rate": 8.204145908973563e-06, "loss": 0.0506, "step": 3217 }, { "epoch": 3.0046685340802988, "grad_norm": 2.578712557425825, "learning_rate": 8.202984928400163e-06, "loss": 0.0817, "step": 3218 }, { "epoch": 3.0056022408963585, "grad_norm": 2.5766402658912106, "learning_rate": 8.201823654878613e-06, "loss": 0.133, "step": 3219 }, { "epoch": 3.0065359477124183, "grad_norm": 1.287267401102676, "learning_rate": 8.200662088515128e-06, "loss": 0.0669, "step": 3220 }, { "epoch": 3.007469654528478, "grad_norm": 2.9758239982343877, "learning_rate": 8.199500229415946e-06, "loss": 0.1595, "step": 3221 }, { "epoch": 3.008403361344538, "grad_norm": 1.2195093018987613, "learning_rate": 8.19833807768733e-06, "loss": 0.054, "step": 3222 }, { "epoch": 3.0093370681605975, "grad_norm": 0.9820712190341983, "learning_rate": 8.197175633435573e-06, "loss": 0.0445, "step": 3223 }, { "epoch": 3.0102707749766573, "grad_norm": 1.799544303370564, "learning_rate": 8.196012896766994e-06, "loss": 0.0781, "step": 3224 }, { "epoch": 3.011204481792717, "grad_norm": 3.722385570860927, "learning_rate": 8.194849867787935e-06, "loss": 0.316, "step": 3225 }, { "epoch": 3.012138188608777, "grad_norm": 7.849157474895417, "learning_rate": 8.193686546604771e-06, "loss": 0.3341, "step": 3226 }, { "epoch": 3.0130718954248366, "grad_norm": 0.8768117430280389, "learning_rate": 8.192522933323899e-06, "loss": 0.0557, "step": 3227 }, { "epoch": 3.0140056022408963, "grad_norm": 2.110921377539249, "learning_rate": 8.191359028051743e-06, "loss": 0.1429, "step": 3228 }, { "epoch": 3.014939309056956, "grad_norm": 3.451688837585679, "learning_rate": 8.190194830894757e-06, "loss": 0.0385, "step": 3229 }, { "epoch": 3.015873015873016, "grad_norm": 4.331437158660285, "learning_rate": 8.189030341959418e-06, "loss": 0.1895, "step": 3230 }, { "epoch": 3.0168067226890756, "grad_norm": 3.360915229322815, "learning_rate": 8.187865561352235e-06, "loss": 0.1642, "step": 3231 }, { "epoch": 3.0177404295051353, "grad_norm": 2.7515776127805847, "learning_rate": 8.186700489179736e-06, "loss": 0.0962, "step": 3232 }, { "epoch": 3.018674136321195, "grad_norm": 1.6820878104856145, "learning_rate": 8.185535125548481e-06, "loss": 0.0787, "step": 3233 }, { "epoch": 3.019607843137255, "grad_norm": 2.7975672870542843, "learning_rate": 8.184369470565052e-06, "loss": 0.1419, "step": 3234 }, { "epoch": 3.0205415499533146, "grad_norm": 1.3854424422366534, "learning_rate": 8.183203524336067e-06, "loss": 0.0694, "step": 3235 }, { "epoch": 3.0214752567693743, "grad_norm": 1.9654319059465932, "learning_rate": 8.182037286968163e-06, "loss": 0.1094, "step": 3236 }, { "epoch": 3.022408963585434, "grad_norm": 2.583567323254728, "learning_rate": 8.180870758568e-06, "loss": 0.1335, "step": 3237 }, { "epoch": 3.023342670401494, "grad_norm": 1.9598653489190956, "learning_rate": 8.179703939242276e-06, "loss": 0.0356, "step": 3238 }, { "epoch": 3.0242763772175536, "grad_norm": 2.4013036918453046, "learning_rate": 8.178536829097707e-06, "loss": 0.1311, "step": 3239 }, { "epoch": 3.0252100840336134, "grad_norm": 0.6479555119435029, "learning_rate": 8.177369428241038e-06, "loss": 0.032, "step": 3240 }, { "epoch": 3.026143790849673, "grad_norm": 1.4241630916519776, "learning_rate": 8.176201736779042e-06, "loss": 0.0798, "step": 3241 }, { "epoch": 3.027077497665733, "grad_norm": 2.0495214751075794, "learning_rate": 8.175033754818514e-06, "loss": 0.0739, "step": 3242 }, { "epoch": 3.0280112044817926, "grad_norm": 1.9782177064991937, "learning_rate": 8.173865482466281e-06, "loss": 0.13, "step": 3243 }, { "epoch": 3.0289449112978524, "grad_norm": 5.756523656418266, "learning_rate": 8.172696919829196e-06, "loss": 0.1861, "step": 3244 }, { "epoch": 3.029878618113912, "grad_norm": 2.6604115035858547, "learning_rate": 8.171528067014134e-06, "loss": 0.2084, "step": 3245 }, { "epoch": 3.030812324929972, "grad_norm": 3.528684419082514, "learning_rate": 8.170358924128e-06, "loss": 0.2258, "step": 3246 }, { "epoch": 3.0317460317460316, "grad_norm": 2.0268928995140105, "learning_rate": 8.169189491277725e-06, "loss": 0.0928, "step": 3247 }, { "epoch": 3.0326797385620914, "grad_norm": 1.1056209270980553, "learning_rate": 8.168019768570268e-06, "loss": 0.0394, "step": 3248 }, { "epoch": 3.033613445378151, "grad_norm": 0.6386517929044427, "learning_rate": 8.166849756112612e-06, "loss": 0.0179, "step": 3249 }, { "epoch": 3.034547152194211, "grad_norm": 0.7843944036550087, "learning_rate": 8.165679454011767e-06, "loss": 0.0265, "step": 3250 }, { "epoch": 3.0354808590102706, "grad_norm": 0.555752498499991, "learning_rate": 8.164508862374769e-06, "loss": 0.0197, "step": 3251 }, { "epoch": 3.0364145658263304, "grad_norm": 2.0764838457542414, "learning_rate": 8.163337981308686e-06, "loss": 0.1094, "step": 3252 }, { "epoch": 3.03734827264239, "grad_norm": 1.50072926049814, "learning_rate": 8.162166810920602e-06, "loss": 0.0663, "step": 3253 }, { "epoch": 3.03828197945845, "grad_norm": 3.4925617329861796, "learning_rate": 8.160995351317637e-06, "loss": 0.0516, "step": 3254 }, { "epoch": 3.0392156862745097, "grad_norm": 2.8187512408719657, "learning_rate": 8.159823602606932e-06, "loss": 0.2395, "step": 3255 }, { "epoch": 3.0401493930905694, "grad_norm": 2.286122009027192, "learning_rate": 8.158651564895657e-06, "loss": 0.0829, "step": 3256 }, { "epoch": 3.041083099906629, "grad_norm": 0.6513255880224118, "learning_rate": 8.15747923829101e-06, "loss": 0.02, "step": 3257 }, { "epoch": 3.042016806722689, "grad_norm": 1.1378575073874446, "learning_rate": 8.15630662290021e-06, "loss": 0.0544, "step": 3258 }, { "epoch": 3.0429505135387487, "grad_norm": 1.1263377657202074, "learning_rate": 8.155133718830509e-06, "loss": 0.0846, "step": 3259 }, { "epoch": 3.0438842203548084, "grad_norm": 2.560658225046075, "learning_rate": 8.153960526189175e-06, "loss": 0.0912, "step": 3260 }, { "epoch": 3.044817927170868, "grad_norm": 2.860789332407131, "learning_rate": 8.152787045083517e-06, "loss": 0.2718, "step": 3261 }, { "epoch": 3.045751633986928, "grad_norm": 1.7689332977487238, "learning_rate": 8.151613275620859e-06, "loss": 0.0984, "step": 3262 }, { "epoch": 3.0466853408029877, "grad_norm": 1.7049980118479835, "learning_rate": 8.150439217908557e-06, "loss": 0.0982, "step": 3263 }, { "epoch": 3.0476190476190474, "grad_norm": 2.024283110067335, "learning_rate": 8.149264872053989e-06, "loss": 0.1189, "step": 3264 }, { "epoch": 3.048552754435107, "grad_norm": 1.5157713788279708, "learning_rate": 8.148090238164564e-06, "loss": 0.0543, "step": 3265 }, { "epoch": 3.049486461251167, "grad_norm": 2.2333327550097706, "learning_rate": 8.146915316347713e-06, "loss": 0.1426, "step": 3266 }, { "epoch": 3.0504201680672267, "grad_norm": 0.8537828990058309, "learning_rate": 8.145740106710898e-06, "loss": 0.0359, "step": 3267 }, { "epoch": 3.0513538748832865, "grad_norm": 2.0995886545218783, "learning_rate": 8.144564609361602e-06, "loss": 0.0943, "step": 3268 }, { "epoch": 3.052287581699346, "grad_norm": 3.982337140040971, "learning_rate": 8.14338882440734e-06, "loss": 0.245, "step": 3269 }, { "epoch": 3.053221288515406, "grad_norm": 0.9706491466581135, "learning_rate": 8.14221275195565e-06, "loss": 0.0238, "step": 3270 }, { "epoch": 3.0541549953314657, "grad_norm": 1.7487064932775056, "learning_rate": 8.141036392114095e-06, "loss": 0.076, "step": 3271 }, { "epoch": 3.0550887021475255, "grad_norm": 2.0036801190113573, "learning_rate": 8.139859744990266e-06, "loss": 0.0863, "step": 3272 }, { "epoch": 3.0560224089635852, "grad_norm": 1.9250046527747766, "learning_rate": 8.138682810691783e-06, "loss": 0.1069, "step": 3273 }, { "epoch": 3.056956115779645, "grad_norm": 2.8461281666925706, "learning_rate": 8.137505589326287e-06, "loss": 0.1561, "step": 3274 }, { "epoch": 3.0578898225957047, "grad_norm": 2.5439726662377558, "learning_rate": 8.136328081001448e-06, "loss": 0.139, "step": 3275 }, { "epoch": 3.0588235294117645, "grad_norm": 0.7074583323346167, "learning_rate": 8.135150285824964e-06, "loss": 0.0293, "step": 3276 }, { "epoch": 3.0597572362278243, "grad_norm": 0.6718993064578755, "learning_rate": 8.133972203904555e-06, "loss": 0.0189, "step": 3277 }, { "epoch": 3.060690943043884, "grad_norm": 2.873712384641109, "learning_rate": 8.132793835347972e-06, "loss": 0.1957, "step": 3278 }, { "epoch": 3.0616246498599438, "grad_norm": 4.8557896816879715, "learning_rate": 8.131615180262987e-06, "loss": 0.2901, "step": 3279 }, { "epoch": 3.0625583566760035, "grad_norm": 1.0556989948865048, "learning_rate": 8.130436238757403e-06, "loss": 0.058, "step": 3280 }, { "epoch": 3.0634920634920633, "grad_norm": 1.883855307093645, "learning_rate": 8.129257010939045e-06, "loss": 0.0806, "step": 3281 }, { "epoch": 3.064425770308123, "grad_norm": 1.8319057419184521, "learning_rate": 8.12807749691577e-06, "loss": 0.1067, "step": 3282 }, { "epoch": 3.065359477124183, "grad_norm": 1.1249650465609191, "learning_rate": 8.126897696795454e-06, "loss": 0.038, "step": 3283 }, { "epoch": 3.066293183940243, "grad_norm": 1.9623061416053365, "learning_rate": 8.125717610686004e-06, "loss": 0.1152, "step": 3284 }, { "epoch": 3.0672268907563027, "grad_norm": 0.9578228967553046, "learning_rate": 8.124537238695353e-06, "loss": 0.0516, "step": 3285 }, { "epoch": 3.0681605975723625, "grad_norm": 1.4497304670483124, "learning_rate": 8.123356580931456e-06, "loss": 0.0778, "step": 3286 }, { "epoch": 3.0690943043884222, "grad_norm": 1.324592007142213, "learning_rate": 8.1221756375023e-06, "loss": 0.0666, "step": 3287 }, { "epoch": 3.070028011204482, "grad_norm": 1.1886851780437637, "learning_rate": 8.120994408515892e-06, "loss": 0.0231, "step": 3288 }, { "epoch": 3.0709617180205417, "grad_norm": 1.156139469342589, "learning_rate": 8.119812894080273e-06, "loss": 0.0561, "step": 3289 }, { "epoch": 3.0718954248366015, "grad_norm": 1.4211030525755448, "learning_rate": 8.118631094303502e-06, "loss": 0.1023, "step": 3290 }, { "epoch": 3.0728291316526612, "grad_norm": 0.4328101489998332, "learning_rate": 8.117449009293668e-06, "loss": 0.0109, "step": 3291 }, { "epoch": 3.073762838468721, "grad_norm": 2.493582927503061, "learning_rate": 8.116266639158888e-06, "loss": 0.1491, "step": 3292 }, { "epoch": 3.0746965452847808, "grad_norm": 2.556403482863813, "learning_rate": 8.115083984007299e-06, "loss": 0.1312, "step": 3293 }, { "epoch": 3.0756302521008405, "grad_norm": 3.310318310145873, "learning_rate": 8.11390104394707e-06, "loss": 0.1089, "step": 3294 }, { "epoch": 3.0765639589169003, "grad_norm": 2.8601274999473523, "learning_rate": 8.112717819086394e-06, "loss": 0.1322, "step": 3295 }, { "epoch": 3.07749766573296, "grad_norm": 5.9089695473858015, "learning_rate": 8.11153430953349e-06, "loss": 0.2281, "step": 3296 }, { "epoch": 3.0784313725490198, "grad_norm": 1.0255583233947454, "learning_rate": 8.110350515396602e-06, "loss": 0.0234, "step": 3297 }, { "epoch": 3.0793650793650795, "grad_norm": 2.825452894112327, "learning_rate": 8.109166436784e-06, "loss": 0.0858, "step": 3298 }, { "epoch": 3.0802987861811393, "grad_norm": 0.5015106594888754, "learning_rate": 8.107982073803983e-06, "loss": 0.0143, "step": 3299 }, { "epoch": 3.081232492997199, "grad_norm": 1.5402627104054918, "learning_rate": 8.106797426564874e-06, "loss": 0.065, "step": 3300 }, { "epoch": 3.082166199813259, "grad_norm": 4.248038004046988, "learning_rate": 8.105612495175022e-06, "loss": 0.259, "step": 3301 }, { "epoch": 3.0830999066293185, "grad_norm": 1.6686817821758182, "learning_rate": 8.1044272797428e-06, "loss": 0.1035, "step": 3302 }, { "epoch": 3.0840336134453783, "grad_norm": 1.2471114096649363, "learning_rate": 8.103241780376612e-06, "loss": 0.0851, "step": 3303 }, { "epoch": 3.084967320261438, "grad_norm": 2.8232401354987493, "learning_rate": 8.102055997184884e-06, "loss": 0.1636, "step": 3304 }, { "epoch": 3.085901027077498, "grad_norm": 1.6732439656176643, "learning_rate": 8.100869930276065e-06, "loss": 0.0563, "step": 3305 }, { "epoch": 3.0868347338935576, "grad_norm": 3.125712611583541, "learning_rate": 8.09968357975864e-06, "loss": 0.2161, "step": 3306 }, { "epoch": 3.0877684407096173, "grad_norm": 1.498853359887284, "learning_rate": 8.098496945741108e-06, "loss": 0.0813, "step": 3307 }, { "epoch": 3.088702147525677, "grad_norm": 0.8023780493239054, "learning_rate": 8.097310028332005e-06, "loss": 0.0221, "step": 3308 }, { "epoch": 3.089635854341737, "grad_norm": 0.4445695789633342, "learning_rate": 8.096122827639883e-06, "loss": 0.0137, "step": 3309 }, { "epoch": 3.0905695611577966, "grad_norm": 1.7332272752825453, "learning_rate": 8.094935343773328e-06, "loss": 0.1065, "step": 3310 }, { "epoch": 3.0915032679738563, "grad_norm": 1.385764974450171, "learning_rate": 8.093747576840947e-06, "loss": 0.1023, "step": 3311 }, { "epoch": 3.092436974789916, "grad_norm": 3.679427075405828, "learning_rate": 8.092559526951374e-06, "loss": 0.1781, "step": 3312 }, { "epoch": 3.093370681605976, "grad_norm": 4.236324719168813, "learning_rate": 8.091371194213272e-06, "loss": 0.2914, "step": 3313 }, { "epoch": 3.0943043884220356, "grad_norm": 1.401705523754552, "learning_rate": 8.090182578735322e-06, "loss": 0.0317, "step": 3314 }, { "epoch": 3.0952380952380953, "grad_norm": 3.2300642625705813, "learning_rate": 8.088993680626238e-06, "loss": 0.1552, "step": 3315 }, { "epoch": 3.096171802054155, "grad_norm": 2.3246384586136, "learning_rate": 8.08780449999476e-06, "loss": 0.1299, "step": 3316 }, { "epoch": 3.097105508870215, "grad_norm": 1.8014456970403043, "learning_rate": 8.086615036949652e-06, "loss": 0.0266, "step": 3317 }, { "epoch": 3.0980392156862746, "grad_norm": 1.4436924450028696, "learning_rate": 8.085425291599699e-06, "loss": 0.0566, "step": 3318 }, { "epoch": 3.0989729225023344, "grad_norm": 1.044007178087339, "learning_rate": 8.08423526405372e-06, "loss": 0.0346, "step": 3319 }, { "epoch": 3.099906629318394, "grad_norm": 1.8891833935139983, "learning_rate": 8.083044954420554e-06, "loss": 0.1324, "step": 3320 }, { "epoch": 3.100840336134454, "grad_norm": 1.4758940689261435, "learning_rate": 8.081854362809069e-06, "loss": 0.0689, "step": 3321 }, { "epoch": 3.1017740429505136, "grad_norm": 2.7418745990528186, "learning_rate": 8.080663489328158e-06, "loss": 0.2028, "step": 3322 }, { "epoch": 3.1027077497665734, "grad_norm": 0.6851560270900988, "learning_rate": 8.079472334086738e-06, "loss": 0.0126, "step": 3323 }, { "epoch": 3.103641456582633, "grad_norm": 0.8637389607943834, "learning_rate": 8.078280897193756e-06, "loss": 0.0294, "step": 3324 }, { "epoch": 3.104575163398693, "grad_norm": 1.4513924461524852, "learning_rate": 8.077089178758178e-06, "loss": 0.1129, "step": 3325 }, { "epoch": 3.1055088702147526, "grad_norm": 0.8592340404490498, "learning_rate": 8.075897178889003e-06, "loss": 0.0259, "step": 3326 }, { "epoch": 3.1064425770308124, "grad_norm": 1.9548248207261287, "learning_rate": 8.074704897695252e-06, "loss": 0.1421, "step": 3327 }, { "epoch": 3.107376283846872, "grad_norm": 3.9526617910508626, "learning_rate": 8.07351233528597e-06, "loss": 0.2579, "step": 3328 }, { "epoch": 3.108309990662932, "grad_norm": 2.260889757554444, "learning_rate": 8.072319491770234e-06, "loss": 0.0779, "step": 3329 }, { "epoch": 3.1092436974789917, "grad_norm": 1.653350558113655, "learning_rate": 8.071126367257137e-06, "loss": 0.0749, "step": 3330 }, { "epoch": 3.1101774042950514, "grad_norm": 3.097963061011822, "learning_rate": 8.069932961855808e-06, "loss": 0.1302, "step": 3331 }, { "epoch": 3.111111111111111, "grad_norm": 3.6524914254601915, "learning_rate": 8.068739275675395e-06, "loss": 0.2295, "step": 3332 }, { "epoch": 3.112044817927171, "grad_norm": 3.1483119911340776, "learning_rate": 8.067545308825075e-06, "loss": 0.2161, "step": 3333 }, { "epoch": 3.1129785247432307, "grad_norm": 2.5134318475348687, "learning_rate": 8.06635106141405e-06, "loss": 0.1651, "step": 3334 }, { "epoch": 3.1139122315592904, "grad_norm": 2.3138140000425724, "learning_rate": 8.065156533551544e-06, "loss": 0.1105, "step": 3335 }, { "epoch": 3.11484593837535, "grad_norm": 2.5290839922668242, "learning_rate": 8.063961725346811e-06, "loss": 0.1542, "step": 3336 }, { "epoch": 3.11577964519141, "grad_norm": 3.7209661428258696, "learning_rate": 8.062766636909129e-06, "loss": 0.1986, "step": 3337 }, { "epoch": 3.1167133520074697, "grad_norm": 0.6345883366224367, "learning_rate": 8.061571268347805e-06, "loss": 0.0156, "step": 3338 }, { "epoch": 3.1176470588235294, "grad_norm": 1.096267146022426, "learning_rate": 8.060375619772164e-06, "loss": 0.0416, "step": 3339 }, { "epoch": 3.118580765639589, "grad_norm": 1.6904252231162453, "learning_rate": 8.059179691291566e-06, "loss": 0.0326, "step": 3340 }, { "epoch": 3.119514472455649, "grad_norm": 2.3892734684460875, "learning_rate": 8.057983483015388e-06, "loss": 0.1232, "step": 3341 }, { "epoch": 3.1204481792717087, "grad_norm": 1.2118068145637317, "learning_rate": 8.056786995053038e-06, "loss": 0.0176, "step": 3342 }, { "epoch": 3.1213818860877685, "grad_norm": 1.5193920187997958, "learning_rate": 8.055590227513945e-06, "loss": 0.0842, "step": 3343 }, { "epoch": 3.122315592903828, "grad_norm": 1.8481584139304448, "learning_rate": 8.054393180507572e-06, "loss": 0.152, "step": 3344 }, { "epoch": 3.123249299719888, "grad_norm": 0.6958007784170818, "learning_rate": 8.053195854143398e-06, "loss": 0.0376, "step": 3345 }, { "epoch": 3.1241830065359477, "grad_norm": 2.9683989845857233, "learning_rate": 8.051998248530935e-06, "loss": 0.1428, "step": 3346 }, { "epoch": 3.1251167133520075, "grad_norm": 1.7243154597864199, "learning_rate": 8.050800363779711e-06, "loss": 0.1043, "step": 3347 }, { "epoch": 3.1260504201680672, "grad_norm": 2.7523529010333587, "learning_rate": 8.049602199999294e-06, "loss": 0.1006, "step": 3348 }, { "epoch": 3.126984126984127, "grad_norm": 1.5897215842642978, "learning_rate": 8.048403757299262e-06, "loss": 0.0485, "step": 3349 }, { "epoch": 3.1279178338001867, "grad_norm": 1.2129499289820573, "learning_rate": 8.04720503578923e-06, "loss": 0.0565, "step": 3350 }, { "epoch": 3.1288515406162465, "grad_norm": 2.0109654197902924, "learning_rate": 8.046006035578832e-06, "loss": 0.118, "step": 3351 }, { "epoch": 3.1297852474323062, "grad_norm": 0.5280488301466482, "learning_rate": 8.04480675677773e-06, "loss": 0.0191, "step": 3352 }, { "epoch": 3.130718954248366, "grad_norm": 3.6783501416322055, "learning_rate": 8.043607199495615e-06, "loss": 0.1968, "step": 3353 }, { "epoch": 3.1316526610644257, "grad_norm": 2.126011920134514, "learning_rate": 8.042407363842193e-06, "loss": 0.1197, "step": 3354 }, { "epoch": 3.1325863678804855, "grad_norm": 2.662747000494536, "learning_rate": 8.04120724992721e-06, "loss": 0.0582, "step": 3355 }, { "epoch": 3.1335200746965453, "grad_norm": 8.83636998691445, "learning_rate": 8.040006857860421e-06, "loss": 0.0604, "step": 3356 }, { "epoch": 3.134453781512605, "grad_norm": 1.8482563949147275, "learning_rate": 8.038806187751622e-06, "loss": 0.1356, "step": 3357 }, { "epoch": 3.1353874883286648, "grad_norm": 1.2691384181010663, "learning_rate": 8.037605239710623e-06, "loss": 0.0342, "step": 3358 }, { "epoch": 3.1363211951447245, "grad_norm": 1.9749976899554769, "learning_rate": 8.036404013847268e-06, "loss": 0.1196, "step": 3359 }, { "epoch": 3.1372549019607843, "grad_norm": 1.0189829179900343, "learning_rate": 8.035202510271419e-06, "loss": 0.0524, "step": 3360 }, { "epoch": 3.138188608776844, "grad_norm": 1.983861254641706, "learning_rate": 8.034000729092967e-06, "loss": 0.1681, "step": 3361 }, { "epoch": 3.139122315592904, "grad_norm": 2.8758263629582506, "learning_rate": 8.03279867042183e-06, "loss": 0.1568, "step": 3362 }, { "epoch": 3.1400560224089635, "grad_norm": 2.2314267783013024, "learning_rate": 8.031596334367949e-06, "loss": 0.1049, "step": 3363 }, { "epoch": 3.1409897292250233, "grad_norm": 2.8925687053372613, "learning_rate": 8.030393721041287e-06, "loss": 0.1044, "step": 3364 }, { "epoch": 3.141923436041083, "grad_norm": 1.7485790694579462, "learning_rate": 8.029190830551843e-06, "loss": 0.135, "step": 3365 }, { "epoch": 3.142857142857143, "grad_norm": 0.5192017746767535, "learning_rate": 8.027987663009628e-06, "loss": 0.0221, "step": 3366 }, { "epoch": 3.1437908496732025, "grad_norm": 2.1895395147326138, "learning_rate": 8.02678421852469e-06, "loss": 0.1356, "step": 3367 }, { "epoch": 3.1447245564892623, "grad_norm": 1.2984818204877524, "learning_rate": 8.025580497207093e-06, "loss": 0.0435, "step": 3368 }, { "epoch": 3.145658263305322, "grad_norm": 0.9237410551994719, "learning_rate": 8.024376499166934e-06, "loss": 0.0212, "step": 3369 }, { "epoch": 3.146591970121382, "grad_norm": 1.292729701887333, "learning_rate": 8.02317222451433e-06, "loss": 0.064, "step": 3370 }, { "epoch": 3.1475256769374416, "grad_norm": 2.9990437237908742, "learning_rate": 8.021967673359426e-06, "loss": 0.0447, "step": 3371 }, { "epoch": 3.1484593837535013, "grad_norm": 1.1262489183779467, "learning_rate": 8.020762845812389e-06, "loss": 0.0609, "step": 3372 }, { "epoch": 3.149393090569561, "grad_norm": 0.9065901382967025, "learning_rate": 8.019557741983417e-06, "loss": 0.0371, "step": 3373 }, { "epoch": 3.150326797385621, "grad_norm": 2.8568897483409463, "learning_rate": 8.018352361982728e-06, "loss": 0.1233, "step": 3374 }, { "epoch": 3.1512605042016806, "grad_norm": 1.5981746786167865, "learning_rate": 8.01714670592057e-06, "loss": 0.1239, "step": 3375 }, { "epoch": 3.1521942110177403, "grad_norm": 3.513638460785753, "learning_rate": 8.015940773907209e-06, "loss": 0.0152, "step": 3376 }, { "epoch": 3.1531279178338, "grad_norm": 2.1041193522956303, "learning_rate": 8.014734566052943e-06, "loss": 0.1042, "step": 3377 }, { "epoch": 3.15406162464986, "grad_norm": 2.9814963046175325, "learning_rate": 8.013528082468094e-06, "loss": 0.2002, "step": 3378 }, { "epoch": 3.1549953314659196, "grad_norm": 0.8469333197517387, "learning_rate": 8.012321323263006e-06, "loss": 0.0438, "step": 3379 }, { "epoch": 3.1559290382819793, "grad_norm": 1.7043153851690271, "learning_rate": 8.011114288548054e-06, "loss": 0.0899, "step": 3380 }, { "epoch": 3.156862745098039, "grad_norm": 1.5591750633712413, "learning_rate": 8.00990697843363e-06, "loss": 0.0654, "step": 3381 }, { "epoch": 3.157796451914099, "grad_norm": 0.5536497258873558, "learning_rate": 8.00869939303016e-06, "loss": 0.0126, "step": 3382 }, { "epoch": 3.1587301587301586, "grad_norm": 3.02718003093718, "learning_rate": 8.007491532448087e-06, "loss": 0.1499, "step": 3383 }, { "epoch": 3.1596638655462184, "grad_norm": 1.366461928697322, "learning_rate": 8.006283396797885e-06, "loss": 0.085, "step": 3384 }, { "epoch": 3.160597572362278, "grad_norm": 2.002655092760233, "learning_rate": 8.005074986190052e-06, "loss": 0.0937, "step": 3385 }, { "epoch": 3.161531279178338, "grad_norm": 1.0406236847332693, "learning_rate": 8.00386630073511e-06, "loss": 0.0364, "step": 3386 }, { "epoch": 3.1624649859943976, "grad_norm": 1.585922427758568, "learning_rate": 8.002657340543604e-06, "loss": 0.1118, "step": 3387 }, { "epoch": 3.1633986928104574, "grad_norm": 0.9397382238244597, "learning_rate": 8.001448105726111e-06, "loss": 0.0354, "step": 3388 }, { "epoch": 3.164332399626517, "grad_norm": 0.9460552081337809, "learning_rate": 8.000238596393226e-06, "loss": 0.0163, "step": 3389 }, { "epoch": 3.165266106442577, "grad_norm": 1.1798058614105986, "learning_rate": 7.999028812655574e-06, "loss": 0.0524, "step": 3390 }, { "epoch": 3.1661998132586366, "grad_norm": 3.968583125767802, "learning_rate": 7.9978187546238e-06, "loss": 0.1163, "step": 3391 }, { "epoch": 3.1671335200746964, "grad_norm": 1.98440081730729, "learning_rate": 7.996608422408577e-06, "loss": 0.115, "step": 3392 }, { "epoch": 3.168067226890756, "grad_norm": 1.6697905642589774, "learning_rate": 7.995397816120608e-06, "loss": 0.0688, "step": 3393 }, { "epoch": 3.169000933706816, "grad_norm": 0.6698425751055824, "learning_rate": 7.994186935870614e-06, "loss": 0.0153, "step": 3394 }, { "epoch": 3.1699346405228757, "grad_norm": 1.5529213713783407, "learning_rate": 7.992975781769341e-06, "loss": 0.1398, "step": 3395 }, { "epoch": 3.1708683473389354, "grad_norm": 2.389157917675529, "learning_rate": 7.991764353927564e-06, "loss": 0.0117, "step": 3396 }, { "epoch": 3.171802054154995, "grad_norm": 0.6326163717502746, "learning_rate": 7.99055265245608e-06, "loss": 0.0204, "step": 3397 }, { "epoch": 3.172735760971055, "grad_norm": 2.6002485833053015, "learning_rate": 7.989340677465714e-06, "loss": 0.1261, "step": 3398 }, { "epoch": 3.1736694677871147, "grad_norm": 2.712493808450247, "learning_rate": 7.988128429067315e-06, "loss": 0.198, "step": 3399 }, { "epoch": 3.1746031746031744, "grad_norm": 0.9159111654751574, "learning_rate": 7.986915907371757e-06, "loss": 0.025, "step": 3400 }, { "epoch": 3.175536881419234, "grad_norm": 1.5790610266049518, "learning_rate": 7.985703112489933e-06, "loss": 0.0816, "step": 3401 }, { "epoch": 3.176470588235294, "grad_norm": 2.7020838026143044, "learning_rate": 7.984490044532773e-06, "loss": 0.0404, "step": 3402 }, { "epoch": 3.1774042950513537, "grad_norm": 2.385581623435029, "learning_rate": 7.98327670361122e-06, "loss": 0.0264, "step": 3403 }, { "epoch": 3.1783380018674134, "grad_norm": 1.812002718492679, "learning_rate": 7.982063089836253e-06, "loss": 0.0786, "step": 3404 }, { "epoch": 3.179271708683473, "grad_norm": 1.2868296732493167, "learning_rate": 7.980849203318865e-06, "loss": 0.0566, "step": 3405 }, { "epoch": 3.180205415499533, "grad_norm": 1.4198548315251966, "learning_rate": 7.979635044170085e-06, "loss": 0.0829, "step": 3406 }, { "epoch": 3.1811391223155927, "grad_norm": 2.1302510883630883, "learning_rate": 7.978420612500955e-06, "loss": 0.0822, "step": 3407 }, { "epoch": 3.1820728291316525, "grad_norm": 2.4537562179635537, "learning_rate": 7.97720590842255e-06, "loss": 0.2353, "step": 3408 }, { "epoch": 3.183006535947712, "grad_norm": 1.2649797260385423, "learning_rate": 7.97599093204597e-06, "loss": 0.0558, "step": 3409 }, { "epoch": 3.1839402427637724, "grad_norm": 0.9587628629518351, "learning_rate": 7.974775683482337e-06, "loss": 0.0176, "step": 3410 }, { "epoch": 3.184873949579832, "grad_norm": 1.972903073482285, "learning_rate": 7.973560162842799e-06, "loss": 0.0554, "step": 3411 }, { "epoch": 3.185807656395892, "grad_norm": 1.5655337817418242, "learning_rate": 7.972344370238527e-06, "loss": 0.0906, "step": 3412 }, { "epoch": 3.1867413632119517, "grad_norm": 2.613411952331527, "learning_rate": 7.971128305780722e-06, "loss": 0.1437, "step": 3413 }, { "epoch": 3.1876750700280114, "grad_norm": 0.9503425921636572, "learning_rate": 7.969911969580603e-06, "loss": 0.0139, "step": 3414 }, { "epoch": 3.188608776844071, "grad_norm": 1.5210583629226233, "learning_rate": 7.968695361749418e-06, "loss": 0.0855, "step": 3415 }, { "epoch": 3.189542483660131, "grad_norm": 1.9725096286455566, "learning_rate": 7.967478482398443e-06, "loss": 0.1483, "step": 3416 }, { "epoch": 3.1904761904761907, "grad_norm": 2.1954282057007073, "learning_rate": 7.96626133163897e-06, "loss": 0.172, "step": 3417 }, { "epoch": 3.1914098972922504, "grad_norm": 0.7567295428078056, "learning_rate": 7.965043909582322e-06, "loss": 0.0386, "step": 3418 }, { "epoch": 3.19234360410831, "grad_norm": 1.2657107621168262, "learning_rate": 7.963826216339848e-06, "loss": 0.0897, "step": 3419 }, { "epoch": 3.19327731092437, "grad_norm": 0.7534726843856189, "learning_rate": 7.962608252022914e-06, "loss": 0.0224, "step": 3420 }, { "epoch": 3.1942110177404297, "grad_norm": 2.2043394614485474, "learning_rate": 7.961390016742923e-06, "loss": 0.1167, "step": 3421 }, { "epoch": 3.1951447245564895, "grad_norm": 0.8204033083867082, "learning_rate": 7.960171510611292e-06, "loss": 0.0195, "step": 3422 }, { "epoch": 3.196078431372549, "grad_norm": 7.489034092775982, "learning_rate": 7.958952733739468e-06, "loss": 0.3096, "step": 3423 }, { "epoch": 3.197012138188609, "grad_norm": 2.36739143538466, "learning_rate": 7.957733686238921e-06, "loss": 0.1213, "step": 3424 }, { "epoch": 3.1979458450046687, "grad_norm": 2.5550369101437718, "learning_rate": 7.956514368221147e-06, "loss": 0.1179, "step": 3425 }, { "epoch": 3.1988795518207285, "grad_norm": 1.9840172762295687, "learning_rate": 7.955294779797664e-06, "loss": 0.0958, "step": 3426 }, { "epoch": 3.1998132586367882, "grad_norm": 0.6718093966215772, "learning_rate": 7.95407492108002e-06, "loss": 0.019, "step": 3427 }, { "epoch": 3.200746965452848, "grad_norm": 0.44912008756369903, "learning_rate": 7.952854792179782e-06, "loss": 0.015, "step": 3428 }, { "epoch": 3.2016806722689077, "grad_norm": 2.6334154917202865, "learning_rate": 7.951634393208545e-06, "loss": 0.1647, "step": 3429 }, { "epoch": 3.2026143790849675, "grad_norm": 0.741515826632486, "learning_rate": 7.950413724277927e-06, "loss": 0.016, "step": 3430 }, { "epoch": 3.2035480859010272, "grad_norm": 1.3628713746681536, "learning_rate": 7.949192785499573e-06, "loss": 0.1002, "step": 3431 }, { "epoch": 3.204481792717087, "grad_norm": 0.9284400510732526, "learning_rate": 7.94797157698515e-06, "loss": 0.0511, "step": 3432 }, { "epoch": 3.2054154995331468, "grad_norm": 2.2440190936613353, "learning_rate": 7.946750098846354e-06, "loss": 0.1397, "step": 3433 }, { "epoch": 3.2063492063492065, "grad_norm": 2.1254723582825923, "learning_rate": 7.945528351194898e-06, "loss": 0.0861, "step": 3434 }, { "epoch": 3.2072829131652663, "grad_norm": 1.3850184762213598, "learning_rate": 7.944306334142527e-06, "loss": 0.0301, "step": 3435 }, { "epoch": 3.208216619981326, "grad_norm": 1.0686630547957259, "learning_rate": 7.943084047801008e-06, "loss": 0.0241, "step": 3436 }, { "epoch": 3.2091503267973858, "grad_norm": 3.2315563824843423, "learning_rate": 7.94186149228213e-06, "loss": 0.1828, "step": 3437 }, { "epoch": 3.2100840336134455, "grad_norm": 1.5393653351393577, "learning_rate": 7.940638667697712e-06, "loss": 0.064, "step": 3438 }, { "epoch": 3.2110177404295053, "grad_norm": 1.1012707488545108, "learning_rate": 7.939415574159593e-06, "loss": 0.0605, "step": 3439 }, { "epoch": 3.211951447245565, "grad_norm": 2.625852320546292, "learning_rate": 7.938192211779639e-06, "loss": 0.2333, "step": 3440 }, { "epoch": 3.212885154061625, "grad_norm": 0.7721206608075368, "learning_rate": 7.93696858066974e-06, "loss": 0.0211, "step": 3441 }, { "epoch": 3.2138188608776845, "grad_norm": 0.9736664956842884, "learning_rate": 7.935744680941812e-06, "loss": 0.0296, "step": 3442 }, { "epoch": 3.2147525676937443, "grad_norm": 2.422155763638296, "learning_rate": 7.93452051270779e-06, "loss": 0.0995, "step": 3443 }, { "epoch": 3.215686274509804, "grad_norm": 1.8545056908028108, "learning_rate": 7.933296076079642e-06, "loss": 0.0907, "step": 3444 }, { "epoch": 3.216619981325864, "grad_norm": 1.124090520707117, "learning_rate": 7.932071371169353e-06, "loss": 0.0664, "step": 3445 }, { "epoch": 3.2175536881419236, "grad_norm": 1.6378145388208056, "learning_rate": 7.930846398088939e-06, "loss": 0.0902, "step": 3446 }, { "epoch": 3.2184873949579833, "grad_norm": 1.1521277069629685, "learning_rate": 7.929621156950433e-06, "loss": 0.0264, "step": 3447 }, { "epoch": 3.219421101774043, "grad_norm": 0.9499407314798032, "learning_rate": 7.928395647865899e-06, "loss": 0.0505, "step": 3448 }, { "epoch": 3.220354808590103, "grad_norm": 2.1148994170372437, "learning_rate": 7.927169870947424e-06, "loss": 0.1199, "step": 3449 }, { "epoch": 3.2212885154061626, "grad_norm": 4.7201353115259455, "learning_rate": 7.925943826307119e-06, "loss": 0.296, "step": 3450 }, { "epoch": 3.2222222222222223, "grad_norm": 1.5483038192527818, "learning_rate": 7.924717514057118e-06, "loss": 0.0735, "step": 3451 }, { "epoch": 3.223155929038282, "grad_norm": 2.278438181785662, "learning_rate": 7.92349093430958e-06, "loss": 0.1899, "step": 3452 }, { "epoch": 3.224089635854342, "grad_norm": 1.7714261064372874, "learning_rate": 7.92226408717669e-06, "loss": 0.0995, "step": 3453 }, { "epoch": 3.2250233426704016, "grad_norm": 0.998822452104237, "learning_rate": 7.921036972770658e-06, "loss": 0.0366, "step": 3454 }, { "epoch": 3.2259570494864613, "grad_norm": 0.43007296155826064, "learning_rate": 7.919809591203716e-06, "loss": 0.0101, "step": 3455 }, { "epoch": 3.226890756302521, "grad_norm": 4.073602290153445, "learning_rate": 7.91858194258812e-06, "loss": 0.0857, "step": 3456 }, { "epoch": 3.227824463118581, "grad_norm": 2.6324964413357534, "learning_rate": 7.917354027036156e-06, "loss": 0.0849, "step": 3457 }, { "epoch": 3.2287581699346406, "grad_norm": 1.9316349290941475, "learning_rate": 7.916125844660125e-06, "loss": 0.0997, "step": 3458 }, { "epoch": 3.2296918767507004, "grad_norm": 2.68758819580965, "learning_rate": 7.914897395572362e-06, "loss": 0.2451, "step": 3459 }, { "epoch": 3.23062558356676, "grad_norm": 1.5900280136180105, "learning_rate": 7.91366867988522e-06, "loss": 0.1159, "step": 3460 }, { "epoch": 3.23155929038282, "grad_norm": 2.08731260773833, "learning_rate": 7.912439697711078e-06, "loss": 0.0624, "step": 3461 }, { "epoch": 3.2324929971988796, "grad_norm": 1.4953714740598443, "learning_rate": 7.911210449162343e-06, "loss": 0.0505, "step": 3462 }, { "epoch": 3.2334267040149394, "grad_norm": 1.0980776311918607, "learning_rate": 7.909980934351442e-06, "loss": 0.0344, "step": 3463 }, { "epoch": 3.234360410830999, "grad_norm": 0.7315542189154302, "learning_rate": 7.908751153390825e-06, "loss": 0.0195, "step": 3464 }, { "epoch": 3.235294117647059, "grad_norm": 4.4828056446471924, "learning_rate": 7.907521106392972e-06, "loss": 0.0609, "step": 3465 }, { "epoch": 3.2362278244631186, "grad_norm": 1.8007226846593003, "learning_rate": 7.906290793470384e-06, "loss": 0.0523, "step": 3466 }, { "epoch": 3.2371615312791784, "grad_norm": 0.6596341729074999, "learning_rate": 7.905060214735585e-06, "loss": 0.0192, "step": 3467 }, { "epoch": 3.238095238095238, "grad_norm": 0.9299610089215252, "learning_rate": 7.903829370301125e-06, "loss": 0.0493, "step": 3468 }, { "epoch": 3.239028944911298, "grad_norm": 1.7767876614394051, "learning_rate": 7.902598260279581e-06, "loss": 0.1273, "step": 3469 }, { "epoch": 3.2399626517273576, "grad_norm": 5.438162336769103, "learning_rate": 7.90136688478355e-06, "loss": 0.1876, "step": 3470 }, { "epoch": 3.2408963585434174, "grad_norm": 1.5336904640922546, "learning_rate": 7.900135243925653e-06, "loss": 0.0284, "step": 3471 }, { "epoch": 3.241830065359477, "grad_norm": 3.4451315087449155, "learning_rate": 7.89890333781854e-06, "loss": 0.1515, "step": 3472 }, { "epoch": 3.242763772175537, "grad_norm": 3.225262612556446, "learning_rate": 7.897671166574878e-06, "loss": 0.2049, "step": 3473 }, { "epoch": 3.2436974789915967, "grad_norm": 2.238833180489575, "learning_rate": 7.896438730307369e-06, "loss": 0.1152, "step": 3474 }, { "epoch": 3.2446311858076564, "grad_norm": 5.582789823660096, "learning_rate": 7.895206029128726e-06, "loss": 0.1587, "step": 3475 }, { "epoch": 3.245564892623716, "grad_norm": 1.36266596284607, "learning_rate": 7.8939730631517e-06, "loss": 0.0382, "step": 3476 }, { "epoch": 3.246498599439776, "grad_norm": 3.276569451512059, "learning_rate": 7.892739832489054e-06, "loss": 0.0848, "step": 3477 }, { "epoch": 3.2474323062558357, "grad_norm": 2.434895356818572, "learning_rate": 7.89150633725358e-06, "loss": 0.1382, "step": 3478 }, { "epoch": 3.2483660130718954, "grad_norm": 2.1063339149348383, "learning_rate": 7.8902725775581e-06, "loss": 0.1205, "step": 3479 }, { "epoch": 3.249299719887955, "grad_norm": 2.1904635984168106, "learning_rate": 7.88903855351545e-06, "loss": 0.1148, "step": 3480 }, { "epoch": 3.250233426704015, "grad_norm": 1.4019918885552547, "learning_rate": 7.887804265238498e-06, "loss": 0.0455, "step": 3481 }, { "epoch": 3.2511671335200747, "grad_norm": 2.7576044623052414, "learning_rate": 7.886569712840132e-06, "loss": 0.1663, "step": 3482 }, { "epoch": 3.2521008403361344, "grad_norm": 3.6791885139051734, "learning_rate": 7.885334896433263e-06, "loss": 0.2236, "step": 3483 }, { "epoch": 3.253034547152194, "grad_norm": 1.2230394813152963, "learning_rate": 7.884099816130833e-06, "loss": 0.0355, "step": 3484 }, { "epoch": 3.253968253968254, "grad_norm": 2.0921874134888285, "learning_rate": 7.882864472045802e-06, "loss": 0.0847, "step": 3485 }, { "epoch": 3.2549019607843137, "grad_norm": 2.5917039904593837, "learning_rate": 7.881628864291153e-06, "loss": 0.1778, "step": 3486 }, { "epoch": 3.2558356676003735, "grad_norm": 0.8822154905474647, "learning_rate": 7.8803929929799e-06, "loss": 0.0231, "step": 3487 }, { "epoch": 3.256769374416433, "grad_norm": 2.049499527987743, "learning_rate": 7.879156858225075e-06, "loss": 0.1209, "step": 3488 }, { "epoch": 3.257703081232493, "grad_norm": 1.347525070162262, "learning_rate": 7.877920460139736e-06, "loss": 0.05, "step": 3489 }, { "epoch": 3.2586367880485527, "grad_norm": 0.520573738466213, "learning_rate": 7.876683798836966e-06, "loss": 0.0135, "step": 3490 }, { "epoch": 3.2595704948646125, "grad_norm": 1.8575882548917286, "learning_rate": 7.87544687442987e-06, "loss": 0.0534, "step": 3491 }, { "epoch": 3.2605042016806722, "grad_norm": 1.230548732444782, "learning_rate": 7.874209687031579e-06, "loss": 0.0927, "step": 3492 }, { "epoch": 3.261437908496732, "grad_norm": 1.7256303091095004, "learning_rate": 7.87297223675525e-06, "loss": 0.0475, "step": 3493 }, { "epoch": 3.2623716153127917, "grad_norm": 1.998685464648202, "learning_rate": 7.871734523714057e-06, "loss": 0.0552, "step": 3494 }, { "epoch": 3.2633053221288515, "grad_norm": 3.3235683694313303, "learning_rate": 7.870496548021202e-06, "loss": 0.1545, "step": 3495 }, { "epoch": 3.2642390289449112, "grad_norm": 2.3060056157086724, "learning_rate": 7.869258309789916e-06, "loss": 0.1667, "step": 3496 }, { "epoch": 3.265172735760971, "grad_norm": 2.6146459756575346, "learning_rate": 7.86801980913345e-06, "loss": 0.1085, "step": 3497 }, { "epoch": 3.2661064425770308, "grad_norm": 2.0932105572163855, "learning_rate": 7.866781046165071e-06, "loss": 0.1494, "step": 3498 }, { "epoch": 3.2670401493930905, "grad_norm": 5.8905547123325555, "learning_rate": 7.865542020998085e-06, "loss": 0.0284, "step": 3499 }, { "epoch": 3.2679738562091503, "grad_norm": 1.2968145852692725, "learning_rate": 7.864302733745812e-06, "loss": 0.0473, "step": 3500 }, { "epoch": 3.26890756302521, "grad_norm": 2.439158519700841, "learning_rate": 7.863063184521596e-06, "loss": 0.1235, "step": 3501 }, { "epoch": 3.2698412698412698, "grad_norm": 1.2403540455320836, "learning_rate": 7.861823373438811e-06, "loss": 0.0296, "step": 3502 }, { "epoch": 3.2707749766573295, "grad_norm": 3.015924387147375, "learning_rate": 7.860583300610849e-06, "loss": 0.0865, "step": 3503 }, { "epoch": 3.2717086834733893, "grad_norm": 2.8742202632531404, "learning_rate": 7.85934296615113e-06, "loss": 0.0726, "step": 3504 }, { "epoch": 3.272642390289449, "grad_norm": 1.0532393982360395, "learning_rate": 7.858102370173096e-06, "loss": 0.0507, "step": 3505 }, { "epoch": 3.273576097105509, "grad_norm": 1.5671558085679982, "learning_rate": 7.85686151279021e-06, "loss": 0.0565, "step": 3506 }, { "epoch": 3.2745098039215685, "grad_norm": 1.258698486482383, "learning_rate": 7.855620394115966e-06, "loss": 0.0767, "step": 3507 }, { "epoch": 3.2754435107376283, "grad_norm": 6.997907839469387, "learning_rate": 7.854379014263877e-06, "loss": 0.2721, "step": 3508 }, { "epoch": 3.276377217553688, "grad_norm": 0.5692667087900982, "learning_rate": 7.853137373347479e-06, "loss": 0.0162, "step": 3509 }, { "epoch": 3.277310924369748, "grad_norm": 0.5518452609641851, "learning_rate": 7.851895471480336e-06, "loss": 0.0174, "step": 3510 }, { "epoch": 3.2782446311858076, "grad_norm": 2.1280324682045326, "learning_rate": 7.850653308776031e-06, "loss": 0.0335, "step": 3511 }, { "epoch": 3.2791783380018673, "grad_norm": 0.8063056412740568, "learning_rate": 7.849410885348176e-06, "loss": 0.0273, "step": 3512 }, { "epoch": 3.280112044817927, "grad_norm": 2.2135661457563756, "learning_rate": 7.848168201310404e-06, "loss": 0.1247, "step": 3513 }, { "epoch": 3.281045751633987, "grad_norm": 3.7537539304886978, "learning_rate": 7.846925256776372e-06, "loss": 0.1771, "step": 3514 }, { "epoch": 3.2819794584500466, "grad_norm": 2.9985584002320396, "learning_rate": 7.845682051859757e-06, "loss": 0.0263, "step": 3515 }, { "epoch": 3.2829131652661063, "grad_norm": 3.2242878996203674, "learning_rate": 7.84443858667427e-06, "loss": 0.1054, "step": 3516 }, { "epoch": 3.283846872082166, "grad_norm": 4.141000964637038, "learning_rate": 7.843194861333637e-06, "loss": 0.1098, "step": 3517 }, { "epoch": 3.284780578898226, "grad_norm": 1.3045402237831223, "learning_rate": 7.841950875951608e-06, "loss": 0.0524, "step": 3518 }, { "epoch": 3.2857142857142856, "grad_norm": 1.2282369573527614, "learning_rate": 7.84070663064196e-06, "loss": 0.0471, "step": 3519 }, { "epoch": 3.2866479925303453, "grad_norm": 2.643123858477025, "learning_rate": 7.839462125518495e-06, "loss": 0.1608, "step": 3520 }, { "epoch": 3.287581699346405, "grad_norm": 1.1047257908752144, "learning_rate": 7.838217360695035e-06, "loss": 0.0285, "step": 3521 }, { "epoch": 3.288515406162465, "grad_norm": 2.159186106002279, "learning_rate": 7.836972336285429e-06, "loss": 0.0865, "step": 3522 }, { "epoch": 3.2894491129785246, "grad_norm": 1.6955096128204865, "learning_rate": 7.835727052403547e-06, "loss": 0.0945, "step": 3523 }, { "epoch": 3.2903828197945844, "grad_norm": 1.5254942730544736, "learning_rate": 7.834481509163282e-06, "loss": 0.0376, "step": 3524 }, { "epoch": 3.291316526610644, "grad_norm": 1.7812312611095333, "learning_rate": 7.833235706678558e-06, "loss": 0.0853, "step": 3525 }, { "epoch": 3.292250233426704, "grad_norm": 1.0395557163673914, "learning_rate": 7.831989645063313e-06, "loss": 0.0222, "step": 3526 }, { "epoch": 3.2931839402427636, "grad_norm": 1.0645677649886616, "learning_rate": 7.830743324431514e-06, "loss": 0.0564, "step": 3527 }, { "epoch": 3.2941176470588234, "grad_norm": 3.1215598345909807, "learning_rate": 7.82949674489715e-06, "loss": 0.1618, "step": 3528 }, { "epoch": 3.295051353874883, "grad_norm": 1.2695812184211603, "learning_rate": 7.828249906574235e-06, "loss": 0.0213, "step": 3529 }, { "epoch": 3.295985060690943, "grad_norm": 2.5664850767966314, "learning_rate": 7.827002809576805e-06, "loss": 0.1181, "step": 3530 }, { "epoch": 3.2969187675070026, "grad_norm": 1.071736423177485, "learning_rate": 7.825755454018924e-06, "loss": 0.0179, "step": 3531 }, { "epoch": 3.2978524743230624, "grad_norm": 2.432595623906862, "learning_rate": 7.824507840014673e-06, "loss": 0.1338, "step": 3532 }, { "epoch": 3.298786181139122, "grad_norm": 0.44721674262015365, "learning_rate": 7.82325996767816e-06, "loss": 0.0118, "step": 3533 }, { "epoch": 3.299719887955182, "grad_norm": 3.2736341386109995, "learning_rate": 7.822011837123521e-06, "loss": 0.1416, "step": 3534 }, { "epoch": 3.3006535947712417, "grad_norm": 2.6519976664724276, "learning_rate": 7.820763448464906e-06, "loss": 0.1007, "step": 3535 }, { "epoch": 3.3015873015873014, "grad_norm": 3.6356420915720684, "learning_rate": 7.819514801816497e-06, "loss": 0.2205, "step": 3536 }, { "epoch": 3.302521008403361, "grad_norm": 4.291968130997556, "learning_rate": 7.818265897292494e-06, "loss": 0.1464, "step": 3537 }, { "epoch": 3.303454715219421, "grad_norm": 2.1967555312496887, "learning_rate": 7.817016735007125e-06, "loss": 0.1441, "step": 3538 }, { "epoch": 3.3043884220354807, "grad_norm": 1.260439624197903, "learning_rate": 7.815767315074639e-06, "loss": 0.0734, "step": 3539 }, { "epoch": 3.3053221288515404, "grad_norm": 2.051572148229023, "learning_rate": 7.814517637609309e-06, "loss": 0.0903, "step": 3540 }, { "epoch": 3.3062558356676, "grad_norm": 0.8622029006779712, "learning_rate": 7.813267702725433e-06, "loss": 0.0318, "step": 3541 }, { "epoch": 3.30718954248366, "grad_norm": 1.6065609696393812, "learning_rate": 7.812017510537329e-06, "loss": 0.1081, "step": 3542 }, { "epoch": 3.3081232492997197, "grad_norm": 1.5915086030577688, "learning_rate": 7.810767061159341e-06, "loss": 0.1044, "step": 3543 }, { "epoch": 3.3090569561157794, "grad_norm": 2.273027217177013, "learning_rate": 7.809516354705839e-06, "loss": 0.1315, "step": 3544 }, { "epoch": 3.309990662931839, "grad_norm": 1.6089609821003705, "learning_rate": 7.808265391291212e-06, "loss": 0.0868, "step": 3545 }, { "epoch": 3.310924369747899, "grad_norm": 2.700737495334759, "learning_rate": 7.807014171029874e-06, "loss": 0.1178, "step": 3546 }, { "epoch": 3.3118580765639587, "grad_norm": 1.2636254643904765, "learning_rate": 7.805762694036263e-06, "loss": 0.0619, "step": 3547 }, { "epoch": 3.3127917833800185, "grad_norm": 1.1225903163357782, "learning_rate": 7.804510960424843e-06, "loss": 0.048, "step": 3548 }, { "epoch": 3.313725490196078, "grad_norm": 0.6347125966190321, "learning_rate": 7.803258970310094e-06, "loss": 0.0132, "step": 3549 }, { "epoch": 3.314659197012138, "grad_norm": 1.6833172073811102, "learning_rate": 7.802006723806529e-06, "loss": 0.1003, "step": 3550 }, { "epoch": 3.3155929038281977, "grad_norm": 3.2932395974552042, "learning_rate": 7.800754221028674e-06, "loss": 0.1877, "step": 3551 }, { "epoch": 3.3165266106442575, "grad_norm": 2.699757642003927, "learning_rate": 7.79950146209109e-06, "loss": 0.175, "step": 3552 }, { "epoch": 3.317460317460317, "grad_norm": 3.4541662764212053, "learning_rate": 7.798248447108353e-06, "loss": 0.1545, "step": 3553 }, { "epoch": 3.318394024276377, "grad_norm": 0.9138217805472547, "learning_rate": 7.796995176195064e-06, "loss": 0.0411, "step": 3554 }, { "epoch": 3.3193277310924367, "grad_norm": 1.542766916008456, "learning_rate": 7.79574164946585e-06, "loss": 0.0601, "step": 3555 }, { "epoch": 3.3202614379084965, "grad_norm": 0.9618973625962283, "learning_rate": 7.794487867035358e-06, "loss": 0.0278, "step": 3556 }, { "epoch": 3.3211951447245567, "grad_norm": 2.356653885028445, "learning_rate": 7.793233829018263e-06, "loss": 0.1381, "step": 3557 }, { "epoch": 3.3221288515406164, "grad_norm": 1.0290660632633462, "learning_rate": 7.791979535529258e-06, "loss": 0.0339, "step": 3558 }, { "epoch": 3.323062558356676, "grad_norm": 8.50535412735898, "learning_rate": 7.790724986683063e-06, "loss": 0.0234, "step": 3559 }, { "epoch": 3.323996265172736, "grad_norm": 7.884133961142762, "learning_rate": 7.78947018259442e-06, "loss": 0.2123, "step": 3560 }, { "epoch": 3.3249299719887957, "grad_norm": 4.187599570592494, "learning_rate": 7.788215123378096e-06, "loss": 0.1129, "step": 3561 }, { "epoch": 3.3258636788048555, "grad_norm": 3.422603480032168, "learning_rate": 7.786959809148878e-06, "loss": 0.2638, "step": 3562 }, { "epoch": 3.326797385620915, "grad_norm": 3.2016340158360577, "learning_rate": 7.785704240021578e-06, "loss": 0.1668, "step": 3563 }, { "epoch": 3.327731092436975, "grad_norm": 2.1730574362360837, "learning_rate": 7.784448416111035e-06, "loss": 0.1897, "step": 3564 }, { "epoch": 3.3286647992530347, "grad_norm": 2.0059528827305617, "learning_rate": 7.783192337532104e-06, "loss": 0.1604, "step": 3565 }, { "epoch": 3.3295985060690945, "grad_norm": 0.6484315095873708, "learning_rate": 7.781936004399668e-06, "loss": 0.0235, "step": 3566 }, { "epoch": 3.330532212885154, "grad_norm": 4.90697894845258, "learning_rate": 7.780679416828634e-06, "loss": 0.2678, "step": 3567 }, { "epoch": 3.331465919701214, "grad_norm": 4.376692230558764, "learning_rate": 7.77942257493393e-06, "loss": 0.2027, "step": 3568 }, { "epoch": 3.3323996265172737, "grad_norm": 0.9442625274243323, "learning_rate": 7.778165478830505e-06, "loss": 0.0271, "step": 3569 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5825435401158964, "learning_rate": 7.77690812863334e-06, "loss": 0.0245, "step": 3570 }, { "epoch": 3.3342670401493932, "grad_norm": 1.6273444690658014, "learning_rate": 7.775650524457431e-06, "loss": 0.1063, "step": 3571 }, { "epoch": 3.335200746965453, "grad_norm": 2.127575070044573, "learning_rate": 7.774392666417799e-06, "loss": 0.137, "step": 3572 }, { "epoch": 3.3361344537815127, "grad_norm": 1.0342129435046634, "learning_rate": 7.773134554629486e-06, "loss": 0.0302, "step": 3573 }, { "epoch": 3.3370681605975725, "grad_norm": 1.4932706437472578, "learning_rate": 7.771876189207567e-06, "loss": 0.041, "step": 3574 }, { "epoch": 3.3380018674136323, "grad_norm": 1.956549137536622, "learning_rate": 7.770617570267129e-06, "loss": 0.1393, "step": 3575 }, { "epoch": 3.338935574229692, "grad_norm": 2.9537523959134715, "learning_rate": 7.769358697923285e-06, "loss": 0.2185, "step": 3576 }, { "epoch": 3.3398692810457518, "grad_norm": 1.2165117783127766, "learning_rate": 7.768099572291177e-06, "loss": 0.0488, "step": 3577 }, { "epoch": 3.3408029878618115, "grad_norm": 1.2995013042170327, "learning_rate": 7.766840193485963e-06, "loss": 0.0501, "step": 3578 }, { "epoch": 3.3417366946778713, "grad_norm": 2.69419145819176, "learning_rate": 7.765580561622827e-06, "loss": 0.1727, "step": 3579 }, { "epoch": 3.342670401493931, "grad_norm": 3.5463176606084095, "learning_rate": 7.764320676816978e-06, "loss": 0.2817, "step": 3580 }, { "epoch": 3.3436041083099908, "grad_norm": 0.8487967808757004, "learning_rate": 7.763060539183645e-06, "loss": 0.0257, "step": 3581 }, { "epoch": 3.3445378151260505, "grad_norm": 0.9848833810148396, "learning_rate": 7.761800148838082e-06, "loss": 0.066, "step": 3582 }, { "epoch": 3.3454715219421103, "grad_norm": 2.5189799804910633, "learning_rate": 7.760539505895566e-06, "loss": 0.077, "step": 3583 }, { "epoch": 3.34640522875817, "grad_norm": 2.000328618511928, "learning_rate": 7.759278610471394e-06, "loss": 0.1284, "step": 3584 }, { "epoch": 3.34733893557423, "grad_norm": 1.6015874283378932, "learning_rate": 7.758017462680893e-06, "loss": 0.0685, "step": 3585 }, { "epoch": 3.3482726423902895, "grad_norm": 2.467856550315613, "learning_rate": 7.756756062639404e-06, "loss": 0.2166, "step": 3586 }, { "epoch": 3.3492063492063493, "grad_norm": 1.919923996166157, "learning_rate": 7.755494410462299e-06, "loss": 0.1401, "step": 3587 }, { "epoch": 3.350140056022409, "grad_norm": 3.865721473105582, "learning_rate": 7.754232506264969e-06, "loss": 0.1742, "step": 3588 }, { "epoch": 3.351073762838469, "grad_norm": 2.579421613216282, "learning_rate": 7.752970350162829e-06, "loss": 0.1171, "step": 3589 }, { "epoch": 3.3520074696545286, "grad_norm": 1.3726895507720698, "learning_rate": 7.751707942271317e-06, "loss": 0.1052, "step": 3590 }, { "epoch": 3.3529411764705883, "grad_norm": 1.634684298245777, "learning_rate": 7.750445282705897e-06, "loss": 0.0747, "step": 3591 }, { "epoch": 3.353874883286648, "grad_norm": 5.228276886833385, "learning_rate": 7.749182371582049e-06, "loss": 0.169, "step": 3592 }, { "epoch": 3.354808590102708, "grad_norm": 0.8425869667501511, "learning_rate": 7.74791920901528e-06, "loss": 0.0564, "step": 3593 }, { "epoch": 3.3557422969187676, "grad_norm": 0.8766063075676707, "learning_rate": 7.746655795121124e-06, "loss": 0.0211, "step": 3594 }, { "epoch": 3.3566760037348273, "grad_norm": 0.7508040719730742, "learning_rate": 7.745392130015131e-06, "loss": 0.0298, "step": 3595 }, { "epoch": 3.357609710550887, "grad_norm": 1.556666566391214, "learning_rate": 7.744128213812875e-06, "loss": 0.0292, "step": 3596 }, { "epoch": 3.358543417366947, "grad_norm": 3.2390919566822847, "learning_rate": 7.742864046629962e-06, "loss": 0.2085, "step": 3597 }, { "epoch": 3.3594771241830066, "grad_norm": 0.8786244166259928, "learning_rate": 7.741599628582006e-06, "loss": 0.0273, "step": 3598 }, { "epoch": 3.3604108309990663, "grad_norm": 2.6156233139899157, "learning_rate": 7.740334959784656e-06, "loss": 0.1456, "step": 3599 }, { "epoch": 3.361344537815126, "grad_norm": 1.4655386072327108, "learning_rate": 7.739070040353583e-06, "loss": 0.0506, "step": 3600 }, { "epoch": 3.362278244631186, "grad_norm": 0.9534871337127764, "learning_rate": 7.73780487040447e-06, "loss": 0.0404, "step": 3601 }, { "epoch": 3.3632119514472456, "grad_norm": 0.9225841158861684, "learning_rate": 7.736539450053038e-06, "loss": 0.024, "step": 3602 }, { "epoch": 3.3641456582633054, "grad_norm": 4.797386251407602, "learning_rate": 7.735273779415018e-06, "loss": 0.3166, "step": 3603 }, { "epoch": 3.365079365079365, "grad_norm": 2.5266912243420045, "learning_rate": 7.734007858606174e-06, "loss": 0.1075, "step": 3604 }, { "epoch": 3.366013071895425, "grad_norm": 3.141078096756538, "learning_rate": 7.732741687742286e-06, "loss": 0.0819, "step": 3605 }, { "epoch": 3.3669467787114846, "grad_norm": 0.7059981418195039, "learning_rate": 7.731475266939159e-06, "loss": 0.0203, "step": 3606 }, { "epoch": 3.3678804855275444, "grad_norm": 1.0444729555551067, "learning_rate": 7.730208596312622e-06, "loss": 0.0287, "step": 3607 }, { "epoch": 3.368814192343604, "grad_norm": 6.328009552352581, "learning_rate": 7.728941675978527e-06, "loss": 0.2962, "step": 3608 }, { "epoch": 3.369747899159664, "grad_norm": 0.6067847310672237, "learning_rate": 7.727674506052744e-06, "loss": 0.0326, "step": 3609 }, { "epoch": 3.3706816059757236, "grad_norm": 0.9383329566590769, "learning_rate": 7.726407086651173e-06, "loss": 0.0482, "step": 3610 }, { "epoch": 3.3716153127917834, "grad_norm": 1.6261889066553425, "learning_rate": 7.725139417889734e-06, "loss": 0.0416, "step": 3611 }, { "epoch": 3.372549019607843, "grad_norm": 1.646785737247231, "learning_rate": 7.723871499884366e-06, "loss": 0.0639, "step": 3612 }, { "epoch": 3.373482726423903, "grad_norm": 0.9979643961970517, "learning_rate": 7.722603332751035e-06, "loss": 0.0518, "step": 3613 }, { "epoch": 3.3744164332399627, "grad_norm": 2.0450921037379506, "learning_rate": 7.721334916605731e-06, "loss": 0.1319, "step": 3614 }, { "epoch": 3.3753501400560224, "grad_norm": 2.811588293445024, "learning_rate": 7.720066251564462e-06, "loss": 0.0943, "step": 3615 }, { "epoch": 3.376283846872082, "grad_norm": 0.49506225399411113, "learning_rate": 7.718797337743263e-06, "loss": 0.0231, "step": 3616 }, { "epoch": 3.377217553688142, "grad_norm": 2.5738204710601638, "learning_rate": 7.717528175258188e-06, "loss": 0.1063, "step": 3617 }, { "epoch": 3.3781512605042017, "grad_norm": 0.6723167650835375, "learning_rate": 7.71625876422532e-06, "loss": 0.0138, "step": 3618 }, { "epoch": 3.3790849673202614, "grad_norm": 0.5766085885955841, "learning_rate": 7.714989104760757e-06, "loss": 0.0152, "step": 3619 }, { "epoch": 3.380018674136321, "grad_norm": 2.5955885602577444, "learning_rate": 7.713719196980623e-06, "loss": 0.0947, "step": 3620 }, { "epoch": 3.380952380952381, "grad_norm": 1.8882497024030984, "learning_rate": 7.712449041001068e-06, "loss": 0.1465, "step": 3621 }, { "epoch": 3.3818860877684407, "grad_norm": 0.8510946503213073, "learning_rate": 7.71117863693826e-06, "loss": 0.0165, "step": 3622 }, { "epoch": 3.3828197945845004, "grad_norm": 3.2374404521378666, "learning_rate": 7.709907984908391e-06, "loss": 0.2074, "step": 3623 }, { "epoch": 3.38375350140056, "grad_norm": 1.1574300406893017, "learning_rate": 7.708637085027675e-06, "loss": 0.0643, "step": 3624 }, { "epoch": 3.38468720821662, "grad_norm": 2.2504436516206656, "learning_rate": 7.707365937412353e-06, "loss": 0.1495, "step": 3625 }, { "epoch": 3.3856209150326797, "grad_norm": 0.8522945480324036, "learning_rate": 7.706094542178684e-06, "loss": 0.0354, "step": 3626 }, { "epoch": 3.3865546218487395, "grad_norm": 1.8356536292246206, "learning_rate": 7.70482289944295e-06, "loss": 0.0814, "step": 3627 }, { "epoch": 3.387488328664799, "grad_norm": 0.5561398831203129, "learning_rate": 7.703551009321456e-06, "loss": 0.016, "step": 3628 }, { "epoch": 3.388422035480859, "grad_norm": 1.5842878350613232, "learning_rate": 7.702278871930533e-06, "loss": 0.0836, "step": 3629 }, { "epoch": 3.3893557422969187, "grad_norm": 2.1815553442680247, "learning_rate": 7.701006487386532e-06, "loss": 0.0267, "step": 3630 }, { "epoch": 3.3902894491129785, "grad_norm": 1.785349591018884, "learning_rate": 7.699733855805822e-06, "loss": 0.1587, "step": 3631 }, { "epoch": 3.3912231559290382, "grad_norm": 1.531694983262273, "learning_rate": 7.698460977304805e-06, "loss": 0.0785, "step": 3632 }, { "epoch": 3.392156862745098, "grad_norm": 1.4497305702316508, "learning_rate": 7.697187851999895e-06, "loss": 0.0484, "step": 3633 }, { "epoch": 3.3930905695611577, "grad_norm": 1.8181402966445224, "learning_rate": 7.695914480007537e-06, "loss": 0.1223, "step": 3634 }, { "epoch": 3.3940242763772175, "grad_norm": 3.2700229303271335, "learning_rate": 7.694640861444193e-06, "loss": 0.1901, "step": 3635 }, { "epoch": 3.3949579831932772, "grad_norm": 2.275215553186186, "learning_rate": 7.693366996426349e-06, "loss": 0.0732, "step": 3636 }, { "epoch": 3.395891690009337, "grad_norm": 2.267646736838687, "learning_rate": 7.692092885070514e-06, "loss": 0.1253, "step": 3637 }, { "epoch": 3.3968253968253967, "grad_norm": 1.9251962298442409, "learning_rate": 7.690818527493222e-06, "loss": 0.0903, "step": 3638 }, { "epoch": 3.3977591036414565, "grad_norm": 0.6090971636692678, "learning_rate": 7.689543923811023e-06, "loss": 0.0101, "step": 3639 }, { "epoch": 3.3986928104575163, "grad_norm": 0.8392873385455422, "learning_rate": 7.688269074140498e-06, "loss": 0.0384, "step": 3640 }, { "epoch": 3.399626517273576, "grad_norm": 1.371751296453917, "learning_rate": 7.68699397859824e-06, "loss": 0.0314, "step": 3641 }, { "epoch": 3.4005602240896358, "grad_norm": 1.4604377966634596, "learning_rate": 7.685718637300877e-06, "loss": 0.0838, "step": 3642 }, { "epoch": 3.4014939309056955, "grad_norm": 1.0282329058282131, "learning_rate": 7.684443050365048e-06, "loss": 0.0682, "step": 3643 }, { "epoch": 3.4024276377217553, "grad_norm": 1.8258024161047974, "learning_rate": 7.683167217907423e-06, "loss": 0.1245, "step": 3644 }, { "epoch": 3.403361344537815, "grad_norm": 1.1813225098174498, "learning_rate": 7.681891140044687e-06, "loss": 0.0613, "step": 3645 }, { "epoch": 3.404295051353875, "grad_norm": 0.8621539021872883, "learning_rate": 7.680614816893557e-06, "loss": 0.0307, "step": 3646 }, { "epoch": 3.4052287581699345, "grad_norm": 3.6275721364146145, "learning_rate": 7.67933824857076e-06, "loss": 0.2259, "step": 3647 }, { "epoch": 3.4061624649859943, "grad_norm": 0.6574411825787633, "learning_rate": 7.678061435193058e-06, "loss": 0.0154, "step": 3648 }, { "epoch": 3.407096171802054, "grad_norm": 0.51392045909041, "learning_rate": 7.676784376877227e-06, "loss": 0.019, "step": 3649 }, { "epoch": 3.408029878618114, "grad_norm": 1.9729570617762804, "learning_rate": 7.675507073740066e-06, "loss": 0.1235, "step": 3650 }, { "epoch": 3.4089635854341735, "grad_norm": 2.8540120025352627, "learning_rate": 7.674229525898404e-06, "loss": 0.1961, "step": 3651 }, { "epoch": 3.4098972922502333, "grad_norm": 2.084484126433453, "learning_rate": 7.67295173346908e-06, "loss": 0.1282, "step": 3652 }, { "epoch": 3.410830999066293, "grad_norm": 1.6251850696451084, "learning_rate": 7.671673696568968e-06, "loss": 0.0761, "step": 3653 }, { "epoch": 3.411764705882353, "grad_norm": 0.7574863633312704, "learning_rate": 7.670395415314953e-06, "loss": 0.0253, "step": 3654 }, { "epoch": 3.4126984126984126, "grad_norm": 0.5627920667513737, "learning_rate": 7.669116889823955e-06, "loss": 0.0266, "step": 3655 }, { "epoch": 3.4136321195144723, "grad_norm": 1.4040977286230578, "learning_rate": 7.667838120212903e-06, "loss": 0.0266, "step": 3656 }, { "epoch": 3.414565826330532, "grad_norm": 1.0525432092775981, "learning_rate": 7.666559106598756e-06, "loss": 0.045, "step": 3657 }, { "epoch": 3.415499533146592, "grad_norm": 1.4385714117966282, "learning_rate": 7.665279849098495e-06, "loss": 0.0494, "step": 3658 }, { "epoch": 3.4164332399626516, "grad_norm": 2.22565311344388, "learning_rate": 7.664000347829123e-06, "loss": 0.0858, "step": 3659 }, { "epoch": 3.4173669467787113, "grad_norm": 3.117186119179403, "learning_rate": 7.66272060290766e-06, "loss": 0.2018, "step": 3660 }, { "epoch": 3.418300653594771, "grad_norm": 1.5610799465568068, "learning_rate": 7.66144061445116e-06, "loss": 0.0705, "step": 3661 }, { "epoch": 3.419234360410831, "grad_norm": 2.4482001593014453, "learning_rate": 7.660160382576683e-06, "loss": 0.0275, "step": 3662 }, { "epoch": 3.4201680672268906, "grad_norm": 1.2610966329130169, "learning_rate": 7.658879907401329e-06, "loss": 0.0732, "step": 3663 }, { "epoch": 3.4211017740429503, "grad_norm": 2.1718573483565202, "learning_rate": 7.657599189042206e-06, "loss": 0.1123, "step": 3664 }, { "epoch": 3.42203548085901, "grad_norm": 0.905624471024667, "learning_rate": 7.656318227616451e-06, "loss": 0.0325, "step": 3665 }, { "epoch": 3.42296918767507, "grad_norm": 1.0754700572922322, "learning_rate": 7.655037023241222e-06, "loss": 0.0611, "step": 3666 }, { "epoch": 3.4239028944911296, "grad_norm": 0.7605102467639546, "learning_rate": 7.653755576033698e-06, "loss": 0.0294, "step": 3667 }, { "epoch": 3.4248366013071894, "grad_norm": 1.5266649149653195, "learning_rate": 7.652473886111086e-06, "loss": 0.0101, "step": 3668 }, { "epoch": 3.425770308123249, "grad_norm": 4.347898365140359, "learning_rate": 7.651191953590606e-06, "loss": 0.2534, "step": 3669 }, { "epoch": 3.426704014939309, "grad_norm": 0.9463814025399693, "learning_rate": 7.649909778589508e-06, "loss": 0.0327, "step": 3670 }, { "epoch": 3.4276377217553686, "grad_norm": 1.2193748165237435, "learning_rate": 7.648627361225058e-06, "loss": 0.0506, "step": 3671 }, { "epoch": 3.4285714285714284, "grad_norm": 1.60739835383783, "learning_rate": 7.64734470161455e-06, "loss": 0.075, "step": 3672 }, { "epoch": 3.429505135387488, "grad_norm": 4.252530265705451, "learning_rate": 7.646061799875294e-06, "loss": 0.3011, "step": 3673 }, { "epoch": 3.4304388422035483, "grad_norm": 0.9112670985727522, "learning_rate": 7.644778656124628e-06, "loss": 0.0357, "step": 3674 }, { "epoch": 3.431372549019608, "grad_norm": 2.260364015811775, "learning_rate": 7.643495270479907e-06, "loss": 0.047, "step": 3675 }, { "epoch": 3.432306255835668, "grad_norm": 1.9682332512524106, "learning_rate": 7.642211643058518e-06, "loss": 0.1313, "step": 3676 }, { "epoch": 3.4332399626517276, "grad_norm": 1.2022327194762692, "learning_rate": 7.640927773977852e-06, "loss": 0.0563, "step": 3677 }, { "epoch": 3.4341736694677873, "grad_norm": 5.770125806786419, "learning_rate": 7.63964366335534e-06, "loss": 0.2778, "step": 3678 }, { "epoch": 3.435107376283847, "grad_norm": 3.2746912554203638, "learning_rate": 7.638359311308426e-06, "loss": 0.1934, "step": 3679 }, { "epoch": 3.436041083099907, "grad_norm": 2.785807567103127, "learning_rate": 7.63707471795458e-06, "loss": 0.1079, "step": 3680 }, { "epoch": 3.4369747899159666, "grad_norm": 1.7232516618388483, "learning_rate": 7.63578988341129e-06, "loss": 0.0259, "step": 3681 }, { "epoch": 3.4379084967320264, "grad_norm": 0.972571713197686, "learning_rate": 7.634504807796067e-06, "loss": 0.0385, "step": 3682 }, { "epoch": 3.438842203548086, "grad_norm": 2.8559599718463216, "learning_rate": 7.63321949122645e-06, "loss": 0.1838, "step": 3683 }, { "epoch": 3.439775910364146, "grad_norm": 3.3778481489214633, "learning_rate": 7.631933933819991e-06, "loss": 0.1746, "step": 3684 }, { "epoch": 3.4407096171802056, "grad_norm": 2.8294114917045534, "learning_rate": 7.630648135694269e-06, "loss": 0.173, "step": 3685 }, { "epoch": 3.4416433239962654, "grad_norm": 2.087504693364989, "learning_rate": 7.6293620969668855e-06, "loss": 0.1702, "step": 3686 }, { "epoch": 3.442577030812325, "grad_norm": 0.5457134483590921, "learning_rate": 7.6280758177554625e-06, "loss": 0.0228, "step": 3687 }, { "epoch": 3.443510737628385, "grad_norm": 2.3479346601197757, "learning_rate": 7.6267892981776435e-06, "loss": 0.1387, "step": 3688 }, { "epoch": 3.4444444444444446, "grad_norm": 2.1632169035243187, "learning_rate": 7.6255025383510975e-06, "loss": 0.0954, "step": 3689 }, { "epoch": 3.4453781512605044, "grad_norm": 0.9797948315785007, "learning_rate": 7.62421553839351e-06, "loss": 0.0524, "step": 3690 }, { "epoch": 3.446311858076564, "grad_norm": 0.9597567073328493, "learning_rate": 7.6229282984225935e-06, "loss": 0.0169, "step": 3691 }, { "epoch": 3.447245564892624, "grad_norm": 2.4757020051247585, "learning_rate": 7.621640818556078e-06, "loss": 0.1284, "step": 3692 }, { "epoch": 3.4481792717086837, "grad_norm": 1.014580151252324, "learning_rate": 7.620353098911719e-06, "loss": 0.052, "step": 3693 }, { "epoch": 3.4491129785247434, "grad_norm": 1.2566549670458036, "learning_rate": 7.619065139607292e-06, "loss": 0.0954, "step": 3694 }, { "epoch": 3.450046685340803, "grad_norm": 2.1611672518861225, "learning_rate": 7.617776940760595e-06, "loss": 0.1995, "step": 3695 }, { "epoch": 3.450980392156863, "grad_norm": 3.1847827093616075, "learning_rate": 7.6164885024894485e-06, "loss": 0.2186, "step": 3696 }, { "epoch": 3.4519140989729227, "grad_norm": 1.4770540112358486, "learning_rate": 7.615199824911694e-06, "loss": 0.1063, "step": 3697 }, { "epoch": 3.4528478057889824, "grad_norm": 2.1410041364444723, "learning_rate": 7.613910908145197e-06, "loss": 0.1529, "step": 3698 }, { "epoch": 3.453781512605042, "grad_norm": 1.1185328029045192, "learning_rate": 7.61262175230784e-06, "loss": 0.0279, "step": 3699 }, { "epoch": 3.454715219421102, "grad_norm": 1.0430841059060239, "learning_rate": 7.61133235751753e-06, "loss": 0.0379, "step": 3700 }, { "epoch": 3.4556489262371617, "grad_norm": 2.17661249805892, "learning_rate": 7.610042723892201e-06, "loss": 0.2089, "step": 3701 }, { "epoch": 3.4565826330532214, "grad_norm": 1.8730344430188142, "learning_rate": 7.6087528515498e-06, "loss": 0.0581, "step": 3702 }, { "epoch": 3.457516339869281, "grad_norm": 1.1586160209930465, "learning_rate": 7.607462740608302e-06, "loss": 0.0609, "step": 3703 }, { "epoch": 3.458450046685341, "grad_norm": 1.2287296728395534, "learning_rate": 7.6061723911857e-06, "loss": 0.0817, "step": 3704 }, { "epoch": 3.4593837535014007, "grad_norm": 2.241183722859324, "learning_rate": 7.604881803400013e-06, "loss": 0.1179, "step": 3705 }, { "epoch": 3.4603174603174605, "grad_norm": 0.6678646782134864, "learning_rate": 7.603590977369278e-06, "loss": 0.0516, "step": 3706 }, { "epoch": 3.46125116713352, "grad_norm": 2.7567229912978806, "learning_rate": 7.602299913211554e-06, "loss": 0.1188, "step": 3707 }, { "epoch": 3.46218487394958, "grad_norm": 1.853693266905365, "learning_rate": 7.601008611044926e-06, "loss": 0.1501, "step": 3708 }, { "epoch": 3.4631185807656397, "grad_norm": 1.0259790882043647, "learning_rate": 7.599717070987495e-06, "loss": 0.0494, "step": 3709 }, { "epoch": 3.4640522875816995, "grad_norm": 3.7339944235308224, "learning_rate": 7.598425293157389e-06, "loss": 0.1955, "step": 3710 }, { "epoch": 3.4649859943977592, "grad_norm": 0.7790285250325721, "learning_rate": 7.5971332776727525e-06, "loss": 0.0392, "step": 3711 }, { "epoch": 3.465919701213819, "grad_norm": 0.4120758108771792, "learning_rate": 7.595841024651758e-06, "loss": 0.0092, "step": 3712 }, { "epoch": 3.4668534080298787, "grad_norm": 3.5112721639771545, "learning_rate": 7.594548534212592e-06, "loss": 0.192, "step": 3713 }, { "epoch": 3.4677871148459385, "grad_norm": 2.8004130621704686, "learning_rate": 7.59325580647347e-06, "loss": 0.0223, "step": 3714 }, { "epoch": 3.4687208216619982, "grad_norm": 2.876785522763442, "learning_rate": 7.591962841552627e-06, "loss": 0.1883, "step": 3715 }, { "epoch": 3.469654528478058, "grad_norm": 1.757612263912197, "learning_rate": 7.590669639568315e-06, "loss": 0.061, "step": 3716 }, { "epoch": 3.4705882352941178, "grad_norm": 1.481458229961153, "learning_rate": 7.589376200638814e-06, "loss": 0.0175, "step": 3717 }, { "epoch": 3.4715219421101775, "grad_norm": 1.866330839745126, "learning_rate": 7.588082524882423e-06, "loss": 0.1275, "step": 3718 }, { "epoch": 3.4724556489262373, "grad_norm": 2.1032455048689367, "learning_rate": 7.586788612417462e-06, "loss": 0.1119, "step": 3719 }, { "epoch": 3.473389355742297, "grad_norm": 2.631838124422578, "learning_rate": 7.5854944633622755e-06, "loss": 0.135, "step": 3720 }, { "epoch": 3.4743230625583568, "grad_norm": 1.3877728019948095, "learning_rate": 7.584200077835226e-06, "loss": 0.0435, "step": 3721 }, { "epoch": 3.4752567693744165, "grad_norm": 1.0952159361399545, "learning_rate": 7.582905455954699e-06, "loss": 0.0495, "step": 3722 }, { "epoch": 3.4761904761904763, "grad_norm": 1.3345749572900825, "learning_rate": 7.581610597839102e-06, "loss": 0.0419, "step": 3723 }, { "epoch": 3.477124183006536, "grad_norm": 0.5656831801451643, "learning_rate": 7.580315503606865e-06, "loss": 0.0095, "step": 3724 }, { "epoch": 3.478057889822596, "grad_norm": 4.309789599854847, "learning_rate": 7.579020173376439e-06, "loss": 0.1983, "step": 3725 }, { "epoch": 3.4789915966386555, "grad_norm": 0.8546780662444118, "learning_rate": 7.577724607266295e-06, "loss": 0.0151, "step": 3726 }, { "epoch": 3.4799253034547153, "grad_norm": 2.219907244501127, "learning_rate": 7.576428805394927e-06, "loss": 0.1533, "step": 3727 }, { "epoch": 3.480859010270775, "grad_norm": 2.2553485952175896, "learning_rate": 7.575132767880849e-06, "loss": 0.0396, "step": 3728 }, { "epoch": 3.481792717086835, "grad_norm": 1.775250848599541, "learning_rate": 7.573836494842601e-06, "loss": 0.0978, "step": 3729 }, { "epoch": 3.4827264239028946, "grad_norm": 1.6088068296531866, "learning_rate": 7.572539986398737e-06, "loss": 0.1246, "step": 3730 }, { "epoch": 3.4836601307189543, "grad_norm": 0.9112047273021462, "learning_rate": 7.571243242667843e-06, "loss": 0.0426, "step": 3731 }, { "epoch": 3.484593837535014, "grad_norm": 2.0475190588502667, "learning_rate": 7.569946263768515e-06, "loss": 0.1196, "step": 3732 }, { "epoch": 3.485527544351074, "grad_norm": 1.7585198091935883, "learning_rate": 7.568649049819378e-06, "loss": 0.1541, "step": 3733 }, { "epoch": 3.4864612511671336, "grad_norm": 1.9608506786410007, "learning_rate": 7.567351600939077e-06, "loss": 0.1273, "step": 3734 }, { "epoch": 3.4873949579831933, "grad_norm": 1.831274380223454, "learning_rate": 7.566053917246276e-06, "loss": 0.0852, "step": 3735 }, { "epoch": 3.488328664799253, "grad_norm": 1.0512080518768185, "learning_rate": 7.564755998859665e-06, "loss": 0.0128, "step": 3736 }, { "epoch": 3.489262371615313, "grad_norm": 4.052942115921411, "learning_rate": 7.563457845897952e-06, "loss": 0.1718, "step": 3737 }, { "epoch": 3.4901960784313726, "grad_norm": 0.6624247978726051, "learning_rate": 7.562159458479867e-06, "loss": 0.0316, "step": 3738 }, { "epoch": 3.4911297852474323, "grad_norm": 2.4130979833827535, "learning_rate": 7.5608608367241595e-06, "loss": 0.0868, "step": 3739 }, { "epoch": 3.492063492063492, "grad_norm": 1.4508899350303177, "learning_rate": 7.559561980749608e-06, "loss": 0.1109, "step": 3740 }, { "epoch": 3.492997198879552, "grad_norm": 2.6839300706608724, "learning_rate": 7.558262890675003e-06, "loss": 0.1292, "step": 3741 }, { "epoch": 3.4939309056956116, "grad_norm": 1.1630713163387238, "learning_rate": 7.556963566619161e-06, "loss": 0.0245, "step": 3742 }, { "epoch": 3.4948646125116714, "grad_norm": 2.511968539228516, "learning_rate": 7.55566400870092e-06, "loss": 0.0847, "step": 3743 }, { "epoch": 3.495798319327731, "grad_norm": 3.49485262225552, "learning_rate": 7.554364217039141e-06, "loss": 0.2415, "step": 3744 }, { "epoch": 3.496732026143791, "grad_norm": 0.48346623992532367, "learning_rate": 7.553064191752699e-06, "loss": 0.0117, "step": 3745 }, { "epoch": 3.4976657329598506, "grad_norm": 2.863032432681059, "learning_rate": 7.551763932960503e-06, "loss": 0.0838, "step": 3746 }, { "epoch": 3.4985994397759104, "grad_norm": 3.386844726620651, "learning_rate": 7.550463440781468e-06, "loss": 0.1846, "step": 3747 }, { "epoch": 3.49953314659197, "grad_norm": 1.59860988275551, "learning_rate": 7.549162715334545e-06, "loss": 0.0941, "step": 3748 }, { "epoch": 3.50046685340803, "grad_norm": 0.47029794621673876, "learning_rate": 7.547861756738696e-06, "loss": 0.0147, "step": 3749 }, { "epoch": 3.5014005602240896, "grad_norm": 1.0500281525164803, "learning_rate": 7.5465605651129085e-06, "loss": 0.0591, "step": 3750 }, { "epoch": 3.5023342670401494, "grad_norm": 2.4141229947880305, "learning_rate": 7.54525914057619e-06, "loss": 0.0461, "step": 3751 }, { "epoch": 3.503267973856209, "grad_norm": 0.9748381290229439, "learning_rate": 7.54395748324757e-06, "loss": 0.0596, "step": 3752 }, { "epoch": 3.504201680672269, "grad_norm": 1.6912006512250068, "learning_rate": 7.542655593246103e-06, "loss": 0.0288, "step": 3753 }, { "epoch": 3.5051353874883286, "grad_norm": 1.2934687398348834, "learning_rate": 7.541353470690857e-06, "loss": 0.085, "step": 3754 }, { "epoch": 3.5060690943043884, "grad_norm": 1.7876130287762266, "learning_rate": 7.540051115700928e-06, "loss": 0.0654, "step": 3755 }, { "epoch": 3.507002801120448, "grad_norm": 0.6282110262468725, "learning_rate": 7.538748528395427e-06, "loss": 0.0158, "step": 3756 }, { "epoch": 3.507936507936508, "grad_norm": 0.645541217812142, "learning_rate": 7.537445708893496e-06, "loss": 0.0254, "step": 3757 }, { "epoch": 3.5088702147525677, "grad_norm": 1.359589966932639, "learning_rate": 7.536142657314286e-06, "loss": 0.0597, "step": 3758 }, { "epoch": 3.5098039215686274, "grad_norm": 1.431014494238137, "learning_rate": 7.534839373776979e-06, "loss": 0.0624, "step": 3759 }, { "epoch": 3.510737628384687, "grad_norm": 1.9910186746143974, "learning_rate": 7.533535858400773e-06, "loss": 0.0452, "step": 3760 }, { "epoch": 3.511671335200747, "grad_norm": 1.7466780144888456, "learning_rate": 7.532232111304891e-06, "loss": 0.0646, "step": 3761 }, { "epoch": 3.5126050420168067, "grad_norm": 0.5831289587521978, "learning_rate": 7.5309281326085705e-06, "loss": 0.0225, "step": 3762 }, { "epoch": 3.5135387488328664, "grad_norm": 4.256041859850909, "learning_rate": 7.52962392243108e-06, "loss": 0.2207, "step": 3763 }, { "epoch": 3.514472455648926, "grad_norm": 0.7832189825736408, "learning_rate": 7.528319480891701e-06, "loss": 0.0237, "step": 3764 }, { "epoch": 3.515406162464986, "grad_norm": 5.277810133007541, "learning_rate": 7.527014808109739e-06, "loss": 0.1654, "step": 3765 }, { "epoch": 3.5163398692810457, "grad_norm": 2.2869859967010306, "learning_rate": 7.525709904204521e-06, "loss": 0.1174, "step": 3766 }, { "epoch": 3.5172735760971054, "grad_norm": 0.6882570478634139, "learning_rate": 7.5244047692953944e-06, "loss": 0.0201, "step": 3767 }, { "epoch": 3.518207282913165, "grad_norm": 3.3943259069910723, "learning_rate": 7.52309940350173e-06, "loss": 0.2073, "step": 3768 }, { "epoch": 3.519140989729225, "grad_norm": 0.4769759433303597, "learning_rate": 7.521793806942914e-06, "loss": 0.0178, "step": 3769 }, { "epoch": 3.5200746965452847, "grad_norm": 0.7214726260611304, "learning_rate": 7.520487979738362e-06, "loss": 0.0305, "step": 3770 }, { "epoch": 3.5210084033613445, "grad_norm": 2.827219132085801, "learning_rate": 7.519181922007503e-06, "loss": 0.1664, "step": 3771 }, { "epoch": 3.521942110177404, "grad_norm": 1.0250805036275255, "learning_rate": 7.517875633869792e-06, "loss": 0.034, "step": 3772 }, { "epoch": 3.522875816993464, "grad_norm": 4.26923257928881, "learning_rate": 7.5165691154447025e-06, "loss": 0.3254, "step": 3773 }, { "epoch": 3.5238095238095237, "grad_norm": 2.793236078960543, "learning_rate": 7.515262366851732e-06, "loss": 0.1501, "step": 3774 }, { "epoch": 3.5247432306255835, "grad_norm": 1.3381193887857328, "learning_rate": 7.513955388210394e-06, "loss": 0.0578, "step": 3775 }, { "epoch": 3.5256769374416432, "grad_norm": 1.7093335836419576, "learning_rate": 7.5126481796402276e-06, "loss": 0.0918, "step": 3776 }, { "epoch": 3.526610644257703, "grad_norm": 2.0035806311674254, "learning_rate": 7.5113407412607894e-06, "loss": 0.0238, "step": 3777 }, { "epoch": 3.5275443510737627, "grad_norm": 6.75631096392236, "learning_rate": 7.510033073191663e-06, "loss": 0.2439, "step": 3778 }, { "epoch": 3.5284780578898225, "grad_norm": 0.8840599136315926, "learning_rate": 7.508725175552446e-06, "loss": 0.0354, "step": 3779 }, { "epoch": 3.5294117647058822, "grad_norm": 2.900236259543501, "learning_rate": 7.507417048462761e-06, "loss": 0.1266, "step": 3780 }, { "epoch": 3.530345471521942, "grad_norm": 3.007738294154785, "learning_rate": 7.506108692042251e-06, "loss": 0.145, "step": 3781 }, { "epoch": 3.5312791783380018, "grad_norm": 1.4230269194980727, "learning_rate": 7.504800106410577e-06, "loss": 0.02, "step": 3782 }, { "epoch": 3.5322128851540615, "grad_norm": 0.3773153696274837, "learning_rate": 7.503491291687428e-06, "loss": 0.0111, "step": 3783 }, { "epoch": 3.5331465919701213, "grad_norm": 3.8252943624554523, "learning_rate": 7.5021822479925045e-06, "loss": 0.1795, "step": 3784 }, { "epoch": 3.534080298786181, "grad_norm": 1.4007560125419098, "learning_rate": 7.500872975445538e-06, "loss": 0.0753, "step": 3785 }, { "epoch": 3.5350140056022408, "grad_norm": 1.9967749456221235, "learning_rate": 7.499563474166271e-06, "loss": 0.128, "step": 3786 }, { "epoch": 3.5359477124183005, "grad_norm": 0.36850868538580833, "learning_rate": 7.498253744274475e-06, "loss": 0.0151, "step": 3787 }, { "epoch": 3.5368814192343603, "grad_norm": 1.9086673694057743, "learning_rate": 7.496943785889939e-06, "loss": 0.0997, "step": 3788 }, { "epoch": 3.53781512605042, "grad_norm": 2.868375655210585, "learning_rate": 7.49563359913247e-06, "loss": 0.1052, "step": 3789 }, { "epoch": 3.53874883286648, "grad_norm": 2.2943722166093967, "learning_rate": 7.494323184121903e-06, "loss": 0.1292, "step": 3790 }, { "epoch": 3.5396825396825395, "grad_norm": 1.137084047447136, "learning_rate": 7.493012540978089e-06, "loss": 0.0246, "step": 3791 }, { "epoch": 3.5406162464985993, "grad_norm": 1.342658454012794, "learning_rate": 7.491701669820897e-06, "loss": 0.04, "step": 3792 }, { "epoch": 3.541549953314659, "grad_norm": 4.74966570718982, "learning_rate": 7.4903905707702255e-06, "loss": 0.2117, "step": 3793 }, { "epoch": 3.542483660130719, "grad_norm": 4.045101613326573, "learning_rate": 7.489079243945984e-06, "loss": 0.0295, "step": 3794 }, { "epoch": 3.5434173669467786, "grad_norm": 1.7359403358746122, "learning_rate": 7.487767689468114e-06, "loss": 0.0898, "step": 3795 }, { "epoch": 3.5443510737628383, "grad_norm": 1.1710487397288936, "learning_rate": 7.4864559074565665e-06, "loss": 0.0292, "step": 3796 }, { "epoch": 3.545284780578898, "grad_norm": 2.1530275745774987, "learning_rate": 7.485143898031321e-06, "loss": 0.1569, "step": 3797 }, { "epoch": 3.546218487394958, "grad_norm": 1.2559463005555602, "learning_rate": 7.483831661312373e-06, "loss": 0.0698, "step": 3798 }, { "epoch": 3.5471521942110176, "grad_norm": 1.0852748591090065, "learning_rate": 7.482519197419742e-06, "loss": 0.056, "step": 3799 }, { "epoch": 3.5480859010270773, "grad_norm": 2.034984200179285, "learning_rate": 7.481206506473467e-06, "loss": 0.0117, "step": 3800 }, { "epoch": 3.549019607843137, "grad_norm": 0.850159464941442, "learning_rate": 7.4798935885936085e-06, "loss": 0.0474, "step": 3801 }, { "epoch": 3.549953314659197, "grad_norm": 3.425142189005676, "learning_rate": 7.478580443900247e-06, "loss": 0.0337, "step": 3802 }, { "epoch": 3.5508870214752566, "grad_norm": 1.3661365367128726, "learning_rate": 7.4772670725134845e-06, "loss": 0.0294, "step": 3803 }, { "epoch": 3.5518207282913163, "grad_norm": 1.6454009140899368, "learning_rate": 7.475953474553443e-06, "loss": 0.0641, "step": 3804 }, { "epoch": 3.552754435107376, "grad_norm": 3.623754244885235, "learning_rate": 7.474639650140265e-06, "loss": 0.1812, "step": 3805 }, { "epoch": 3.553688141923436, "grad_norm": 1.4963119708127544, "learning_rate": 7.4733255993941146e-06, "loss": 0.0794, "step": 3806 }, { "epoch": 3.5546218487394956, "grad_norm": 1.7999031912041186, "learning_rate": 7.472011322435175e-06, "loss": 0.0853, "step": 3807 }, { "epoch": 3.5555555555555554, "grad_norm": 1.455862067373577, "learning_rate": 7.470696819383654e-06, "loss": 0.0954, "step": 3808 }, { "epoch": 3.556489262371615, "grad_norm": 4.595856168567021, "learning_rate": 7.469382090359774e-06, "loss": 0.3643, "step": 3809 }, { "epoch": 3.557422969187675, "grad_norm": 6.2904963693453055, "learning_rate": 7.468067135483783e-06, "loss": 0.1654, "step": 3810 }, { "epoch": 3.5583566760037346, "grad_norm": 0.9997680522341333, "learning_rate": 7.466751954875948e-06, "loss": 0.0123, "step": 3811 }, { "epoch": 3.5592903828197944, "grad_norm": 1.6295222668133607, "learning_rate": 7.465436548656559e-06, "loss": 0.0918, "step": 3812 }, { "epoch": 3.560224089635854, "grad_norm": 1.3673320734191887, "learning_rate": 7.464120916945919e-06, "loss": 0.0465, "step": 3813 }, { "epoch": 3.561157796451914, "grad_norm": 2.2144152574518605, "learning_rate": 7.462805059864362e-06, "loss": 0.1186, "step": 3814 }, { "epoch": 3.5620915032679736, "grad_norm": 4.037271428297033, "learning_rate": 7.4614889775322355e-06, "loss": 0.095, "step": 3815 }, { "epoch": 3.5630252100840334, "grad_norm": 1.1083139544529532, "learning_rate": 7.460172670069909e-06, "loss": 0.0479, "step": 3816 }, { "epoch": 3.563958916900093, "grad_norm": 2.3809778396637094, "learning_rate": 7.458856137597775e-06, "loss": 0.1247, "step": 3817 }, { "epoch": 3.564892623716153, "grad_norm": 2.7698869771289676, "learning_rate": 7.457539380236245e-06, "loss": 0.1626, "step": 3818 }, { "epoch": 3.5658263305322127, "grad_norm": 2.319397857754381, "learning_rate": 7.456222398105748e-06, "loss": 0.1202, "step": 3819 }, { "epoch": 3.5667600373482724, "grad_norm": 2.022674054170415, "learning_rate": 7.454905191326738e-06, "loss": 0.0958, "step": 3820 }, { "epoch": 3.567693744164332, "grad_norm": 2.7702236293278792, "learning_rate": 7.453587760019691e-06, "loss": 0.1596, "step": 3821 }, { "epoch": 3.568627450980392, "grad_norm": 2.9209341850965993, "learning_rate": 7.452270104305095e-06, "loss": 0.1434, "step": 3822 }, { "epoch": 3.5695611577964517, "grad_norm": 1.3280692982179856, "learning_rate": 7.45095222430347e-06, "loss": 0.0563, "step": 3823 }, { "epoch": 3.5704948646125114, "grad_norm": 0.9909750809330571, "learning_rate": 7.449634120135345e-06, "loss": 0.0384, "step": 3824 }, { "epoch": 3.571428571428571, "grad_norm": 2.7956322039522674, "learning_rate": 7.448315791921279e-06, "loss": 0.1638, "step": 3825 }, { "epoch": 3.572362278244631, "grad_norm": 3.2872327966868213, "learning_rate": 7.446997239781846e-06, "loss": 0.3123, "step": 3826 }, { "epoch": 3.5732959850606907, "grad_norm": 1.2183197242738124, "learning_rate": 7.445678463837642e-06, "loss": 0.0827, "step": 3827 }, { "epoch": 3.5742296918767504, "grad_norm": 1.3652851092283749, "learning_rate": 7.4443594642092855e-06, "loss": 0.0351, "step": 3828 }, { "epoch": 3.57516339869281, "grad_norm": 0.7374999051224896, "learning_rate": 7.443040241017411e-06, "loss": 0.0233, "step": 3829 }, { "epoch": 3.57609710550887, "grad_norm": 0.9516418038106021, "learning_rate": 7.441720794382678e-06, "loss": 0.0463, "step": 3830 }, { "epoch": 3.5770308123249297, "grad_norm": 0.49246302259632935, "learning_rate": 7.440401124425761e-06, "loss": 0.0096, "step": 3831 }, { "epoch": 3.5779645191409895, "grad_norm": 3.903148964303019, "learning_rate": 7.439081231267362e-06, "loss": 0.3245, "step": 3832 }, { "epoch": 3.5788982259570497, "grad_norm": 3.0758840383245496, "learning_rate": 7.437761115028198e-06, "loss": 0.0402, "step": 3833 }, { "epoch": 3.5798319327731094, "grad_norm": 1.5506128773864516, "learning_rate": 7.436440775829009e-06, "loss": 0.0764, "step": 3834 }, { "epoch": 3.580765639589169, "grad_norm": 2.4537518769811553, "learning_rate": 7.435120213790553e-06, "loss": 0.1377, "step": 3835 }, { "epoch": 3.581699346405229, "grad_norm": 2.6555246406735598, "learning_rate": 7.433799429033612e-06, "loss": 0.2126, "step": 3836 }, { "epoch": 3.5826330532212887, "grad_norm": 3.750041970043384, "learning_rate": 7.432478421678983e-06, "loss": 0.0118, "step": 3837 }, { "epoch": 3.5835667600373484, "grad_norm": 2.089667866413871, "learning_rate": 7.431157191847492e-06, "loss": 0.1141, "step": 3838 }, { "epoch": 3.584500466853408, "grad_norm": 2.62116556312677, "learning_rate": 7.429835739659975e-06, "loss": 0.138, "step": 3839 }, { "epoch": 3.585434173669468, "grad_norm": 2.2353074582840717, "learning_rate": 7.428514065237295e-06, "loss": 0.1293, "step": 3840 }, { "epoch": 3.5863678804855277, "grad_norm": 3.17479461557548, "learning_rate": 7.427192168700335e-06, "loss": 0.0375, "step": 3841 }, { "epoch": 3.5873015873015874, "grad_norm": 1.2934350427901524, "learning_rate": 7.425870050169994e-06, "loss": 0.0617, "step": 3842 }, { "epoch": 3.588235294117647, "grad_norm": 0.973954376681773, "learning_rate": 7.424547709767196e-06, "loss": 0.0114, "step": 3843 }, { "epoch": 3.589169000933707, "grad_norm": 2.417666676506676, "learning_rate": 7.4232251476128844e-06, "loss": 0.1681, "step": 3844 }, { "epoch": 3.5901027077497667, "grad_norm": 2.069300663768122, "learning_rate": 7.421902363828021e-06, "loss": 0.1263, "step": 3845 }, { "epoch": 3.5910364145658265, "grad_norm": 1.8560753284861562, "learning_rate": 7.420579358533588e-06, "loss": 0.0727, "step": 3846 }, { "epoch": 3.591970121381886, "grad_norm": 2.254294088890208, "learning_rate": 7.419256131850592e-06, "loss": 0.1368, "step": 3847 }, { "epoch": 3.592903828197946, "grad_norm": 3.210134313446621, "learning_rate": 7.417932683900053e-06, "loss": 0.2112, "step": 3848 }, { "epoch": 3.5938375350140057, "grad_norm": 3.3547851472533017, "learning_rate": 7.416609014803015e-06, "loss": 0.123, "step": 3849 }, { "epoch": 3.5947712418300655, "grad_norm": 1.8269200054051333, "learning_rate": 7.415285124680545e-06, "loss": 0.0736, "step": 3850 }, { "epoch": 3.595704948646125, "grad_norm": 1.9249241804163848, "learning_rate": 7.413961013653725e-06, "loss": 0.1135, "step": 3851 }, { "epoch": 3.596638655462185, "grad_norm": 2.3329719723570497, "learning_rate": 7.412636681843661e-06, "loss": 0.1762, "step": 3852 }, { "epoch": 3.5975723622782447, "grad_norm": 2.469942784197196, "learning_rate": 7.411312129371476e-06, "loss": 0.2075, "step": 3853 }, { "epoch": 3.5985060690943045, "grad_norm": 2.1947258851063736, "learning_rate": 7.409987356358315e-06, "loss": 0.1278, "step": 3854 }, { "epoch": 3.5994397759103642, "grad_norm": 0.7187054096604495, "learning_rate": 7.408662362925344e-06, "loss": 0.0362, "step": 3855 }, { "epoch": 3.600373482726424, "grad_norm": 0.43133875831672763, "learning_rate": 7.40733714919375e-06, "loss": 0.0068, "step": 3856 }, { "epoch": 3.6013071895424837, "grad_norm": 4.103962684469627, "learning_rate": 7.406011715284734e-06, "loss": 0.2205, "step": 3857 }, { "epoch": 3.6022408963585435, "grad_norm": 2.457162962874384, "learning_rate": 7.4046860613195234e-06, "loss": 0.082, "step": 3858 }, { "epoch": 3.6031746031746033, "grad_norm": 1.694002721893177, "learning_rate": 7.403360187419365e-06, "loss": 0.111, "step": 3859 }, { "epoch": 3.604108309990663, "grad_norm": 3.5637903569591534, "learning_rate": 7.402034093705524e-06, "loss": 0.171, "step": 3860 }, { "epoch": 3.6050420168067228, "grad_norm": 1.77190233543176, "learning_rate": 7.4007077802992855e-06, "loss": 0.0972, "step": 3861 }, { "epoch": 3.6059757236227825, "grad_norm": 2.5555455303709285, "learning_rate": 7.399381247321955e-06, "loss": 0.1469, "step": 3862 }, { "epoch": 3.6069094304388423, "grad_norm": 2.1328867876081934, "learning_rate": 7.398054494894862e-06, "loss": 0.1658, "step": 3863 }, { "epoch": 3.607843137254902, "grad_norm": 1.690039409655542, "learning_rate": 7.396727523139348e-06, "loss": 0.0245, "step": 3864 }, { "epoch": 3.6087768440709618, "grad_norm": 0.37266400011071926, "learning_rate": 7.395400332176781e-06, "loss": 0.0155, "step": 3865 }, { "epoch": 3.6097105508870215, "grad_norm": 0.7231816636841495, "learning_rate": 7.394072922128548e-06, "loss": 0.018, "step": 3866 }, { "epoch": 3.6106442577030813, "grad_norm": 3.269883515858915, "learning_rate": 7.392745293116054e-06, "loss": 0.0541, "step": 3867 }, { "epoch": 3.611577964519141, "grad_norm": 3.689708794209806, "learning_rate": 7.391417445260726e-06, "loss": 0.219, "step": 3868 }, { "epoch": 3.612511671335201, "grad_norm": 0.9557272928230864, "learning_rate": 7.390089378684009e-06, "loss": 0.0192, "step": 3869 }, { "epoch": 3.6134453781512605, "grad_norm": 2.635296350779976, "learning_rate": 7.3887610935073716e-06, "loss": 0.2102, "step": 3870 }, { "epoch": 3.6143790849673203, "grad_norm": 1.7671975354412952, "learning_rate": 7.387432589852298e-06, "loss": 0.1057, "step": 3871 }, { "epoch": 3.61531279178338, "grad_norm": 1.0505315583147063, "learning_rate": 7.386103867840295e-06, "loss": 0.0707, "step": 3872 }, { "epoch": 3.61624649859944, "grad_norm": 1.2505889612063124, "learning_rate": 7.384774927592889e-06, "loss": 0.0399, "step": 3873 }, { "epoch": 3.6171802054154996, "grad_norm": 1.2034556216519223, "learning_rate": 7.383445769231628e-06, "loss": 0.0226, "step": 3874 }, { "epoch": 3.6181139122315593, "grad_norm": 1.8941318043778836, "learning_rate": 7.3821163928780735e-06, "loss": 0.0657, "step": 3875 }, { "epoch": 3.619047619047619, "grad_norm": 0.519178291106102, "learning_rate": 7.380786798653817e-06, "loss": 0.0138, "step": 3876 }, { "epoch": 3.619981325863679, "grad_norm": 1.7805436785531592, "learning_rate": 7.3794569866804595e-06, "loss": 0.0769, "step": 3877 }, { "epoch": 3.6209150326797386, "grad_norm": 1.347877724844668, "learning_rate": 7.378126957079632e-06, "loss": 0.0985, "step": 3878 }, { "epoch": 3.6218487394957983, "grad_norm": 1.552354000416011, "learning_rate": 7.376796709972975e-06, "loss": 0.074, "step": 3879 }, { "epoch": 3.622782446311858, "grad_norm": 0.727961254816025, "learning_rate": 7.375466245482159e-06, "loss": 0.0364, "step": 3880 }, { "epoch": 3.623716153127918, "grad_norm": 1.285883951541117, "learning_rate": 7.3741355637288665e-06, "loss": 0.0948, "step": 3881 }, { "epoch": 3.6246498599439776, "grad_norm": 0.6296709800929042, "learning_rate": 7.372804664834804e-06, "loss": 0.0264, "step": 3882 }, { "epoch": 3.6255835667600373, "grad_norm": 0.43158783424840413, "learning_rate": 7.371473548921697e-06, "loss": 0.0108, "step": 3883 }, { "epoch": 3.626517273576097, "grad_norm": 0.7710135630362615, "learning_rate": 7.370142216111289e-06, "loss": 0.01, "step": 3884 }, { "epoch": 3.627450980392157, "grad_norm": 1.6638061074128174, "learning_rate": 7.368810666525348e-06, "loss": 0.1125, "step": 3885 }, { "epoch": 3.6283846872082166, "grad_norm": 2.9912584536780087, "learning_rate": 7.367478900285655e-06, "loss": 0.169, "step": 3886 }, { "epoch": 3.6293183940242764, "grad_norm": 1.1526243960494555, "learning_rate": 7.366146917514019e-06, "loss": 0.0153, "step": 3887 }, { "epoch": 3.630252100840336, "grad_norm": 3.853943599753866, "learning_rate": 7.36481471833226e-06, "loss": 0.1612, "step": 3888 }, { "epoch": 3.631185807656396, "grad_norm": 0.8034624384882023, "learning_rate": 7.363482302862228e-06, "loss": 0.031, "step": 3889 }, { "epoch": 3.6321195144724556, "grad_norm": 0.958421930843647, "learning_rate": 7.36214967122578e-06, "loss": 0.0287, "step": 3890 }, { "epoch": 3.6330532212885154, "grad_norm": 3.319076180932474, "learning_rate": 7.360816823544806e-06, "loss": 0.1553, "step": 3891 }, { "epoch": 3.633986928104575, "grad_norm": 3.0578446316632792, "learning_rate": 7.359483759941206e-06, "loss": 0.1508, "step": 3892 }, { "epoch": 3.634920634920635, "grad_norm": 0.9212675934586406, "learning_rate": 7.358150480536904e-06, "loss": 0.058, "step": 3893 }, { "epoch": 3.6358543417366946, "grad_norm": 1.687255773238956, "learning_rate": 7.356816985453843e-06, "loss": 0.0995, "step": 3894 }, { "epoch": 3.6367880485527544, "grad_norm": 3.016022362802181, "learning_rate": 7.355483274813986e-06, "loss": 0.1655, "step": 3895 }, { "epoch": 3.637721755368814, "grad_norm": 0.9203692415486985, "learning_rate": 7.354149348739317e-06, "loss": 0.0406, "step": 3896 }, { "epoch": 3.638655462184874, "grad_norm": 1.5628156481068483, "learning_rate": 7.3528152073518345e-06, "loss": 0.0969, "step": 3897 }, { "epoch": 3.6395891690009337, "grad_norm": 2.0237636918578312, "learning_rate": 7.351480850773564e-06, "loss": 0.141, "step": 3898 }, { "epoch": 3.6405228758169934, "grad_norm": 1.6494908626556994, "learning_rate": 7.350146279126544e-06, "loss": 0.0842, "step": 3899 }, { "epoch": 3.641456582633053, "grad_norm": 0.8235957339722534, "learning_rate": 7.34881149253284e-06, "loss": 0.0354, "step": 3900 }, { "epoch": 3.642390289449113, "grad_norm": 0.6811953854574749, "learning_rate": 7.347476491114529e-06, "loss": 0.0189, "step": 3901 }, { "epoch": 3.6433239962651727, "grad_norm": 0.38397323280590123, "learning_rate": 7.346141274993713e-06, "loss": 0.0157, "step": 3902 }, { "epoch": 3.6442577030812324, "grad_norm": 3.697981805945686, "learning_rate": 7.344805844292512e-06, "loss": 0.0725, "step": 3903 }, { "epoch": 3.645191409897292, "grad_norm": 4.302333679878498, "learning_rate": 7.3434701991330656e-06, "loss": 0.208, "step": 3904 }, { "epoch": 3.646125116713352, "grad_norm": 2.867181418323113, "learning_rate": 7.3421343396375335e-06, "loss": 0.0962, "step": 3905 }, { "epoch": 3.6470588235294117, "grad_norm": 0.9280571617554519, "learning_rate": 7.340798265928095e-06, "loss": 0.0313, "step": 3906 }, { "epoch": 3.6479925303454714, "grad_norm": 2.75447217678634, "learning_rate": 7.339461978126947e-06, "loss": 0.0835, "step": 3907 }, { "epoch": 3.648926237161531, "grad_norm": 1.9025141734604112, "learning_rate": 7.33812547635631e-06, "loss": 0.0449, "step": 3908 }, { "epoch": 3.649859943977591, "grad_norm": 2.8094995362195894, "learning_rate": 7.336788760738421e-06, "loss": 0.0839, "step": 3909 }, { "epoch": 3.6507936507936507, "grad_norm": 1.8866863303068477, "learning_rate": 7.335451831395538e-06, "loss": 0.1225, "step": 3910 }, { "epoch": 3.6517273576097105, "grad_norm": 1.844494739725318, "learning_rate": 7.334114688449936e-06, "loss": 0.0735, "step": 3911 }, { "epoch": 3.65266106442577, "grad_norm": 1.8771598821035163, "learning_rate": 7.3327773320239124e-06, "loss": 0.1096, "step": 3912 }, { "epoch": 3.65359477124183, "grad_norm": 2.0047547681118325, "learning_rate": 7.331439762239784e-06, "loss": 0.1308, "step": 3913 }, { "epoch": 3.6545284780578897, "grad_norm": 2.2023345927824454, "learning_rate": 7.330101979219884e-06, "loss": 0.1135, "step": 3914 }, { "epoch": 3.6554621848739495, "grad_norm": 1.5637798287216569, "learning_rate": 7.32876398308657e-06, "loss": 0.0691, "step": 3915 }, { "epoch": 3.6563958916900092, "grad_norm": 2.1181129832517405, "learning_rate": 7.327425773962213e-06, "loss": 0.0576, "step": 3916 }, { "epoch": 3.657329598506069, "grad_norm": 2.9476718487865483, "learning_rate": 7.326087351969211e-06, "loss": 0.2058, "step": 3917 }, { "epoch": 3.6582633053221287, "grad_norm": 1.8447400267610012, "learning_rate": 7.3247487172299745e-06, "loss": 0.0785, "step": 3918 }, { "epoch": 3.6591970121381885, "grad_norm": 0.9675840645502761, "learning_rate": 7.32340986986694e-06, "loss": 0.043, "step": 3919 }, { "epoch": 3.6601307189542482, "grad_norm": 0.34121973645508635, "learning_rate": 7.322070810002554e-06, "loss": 0.0055, "step": 3920 }, { "epoch": 3.661064425770308, "grad_norm": 1.1990886059934582, "learning_rate": 7.3207315377592935e-06, "loss": 0.0154, "step": 3921 }, { "epoch": 3.6619981325863677, "grad_norm": 2.7963268913080523, "learning_rate": 7.319392053259645e-06, "loss": 0.1483, "step": 3922 }, { "epoch": 3.6629318394024275, "grad_norm": 0.7025445518956315, "learning_rate": 7.318052356626124e-06, "loss": 0.0167, "step": 3923 }, { "epoch": 3.6638655462184873, "grad_norm": 2.585193916223791, "learning_rate": 7.316712447981257e-06, "loss": 0.1552, "step": 3924 }, { "epoch": 3.664799253034547, "grad_norm": 1.6028217380864356, "learning_rate": 7.3153723274475955e-06, "loss": 0.0692, "step": 3925 }, { "epoch": 3.6657329598506068, "grad_norm": 3.275209388710678, "learning_rate": 7.3140319951477055e-06, "loss": 0.1623, "step": 3926 }, { "epoch": 3.6666666666666665, "grad_norm": 2.4071385031834196, "learning_rate": 7.312691451204178e-06, "loss": 0.1452, "step": 3927 }, { "epoch": 3.6676003734827263, "grad_norm": 2.7033002881913206, "learning_rate": 7.3113506957396186e-06, "loss": 0.1494, "step": 3928 }, { "epoch": 3.668534080298786, "grad_norm": 2.960423340410371, "learning_rate": 7.3100097288766546e-06, "loss": 0.1796, "step": 3929 }, { "epoch": 3.669467787114846, "grad_norm": 2.6901388510393036, "learning_rate": 7.308668550737932e-06, "loss": 0.1903, "step": 3930 }, { "epoch": 3.6704014939309055, "grad_norm": 0.6323906112167919, "learning_rate": 7.307327161446118e-06, "loss": 0.0299, "step": 3931 }, { "epoch": 3.6713352007469653, "grad_norm": 1.7759936894874486, "learning_rate": 7.305985561123894e-06, "loss": 0.0981, "step": 3932 }, { "epoch": 3.6722689075630255, "grad_norm": 0.9704028875021325, "learning_rate": 7.304643749893968e-06, "loss": 0.0455, "step": 3933 }, { "epoch": 3.6732026143790852, "grad_norm": 1.8715848381624318, "learning_rate": 7.303301727879059e-06, "loss": 0.0995, "step": 3934 }, { "epoch": 3.674136321195145, "grad_norm": 4.515715148900511, "learning_rate": 7.301959495201915e-06, "loss": 0.2764, "step": 3935 }, { "epoch": 3.6750700280112047, "grad_norm": 2.5974723082995226, "learning_rate": 7.300617051985293e-06, "loss": 0.2142, "step": 3936 }, { "epoch": 3.6760037348272645, "grad_norm": 1.6050235903224237, "learning_rate": 7.299274398351976e-06, "loss": 0.0821, "step": 3937 }, { "epoch": 3.6769374416433243, "grad_norm": 8.369693543108777, "learning_rate": 7.297931534424766e-06, "loss": 0.2129, "step": 3938 }, { "epoch": 3.677871148459384, "grad_norm": 2.4362620280984353, "learning_rate": 7.29658846032648e-06, "loss": 0.1351, "step": 3939 }, { "epoch": 3.6788048552754438, "grad_norm": 2.6769695861811, "learning_rate": 7.295245176179959e-06, "loss": 0.1215, "step": 3940 }, { "epoch": 3.6797385620915035, "grad_norm": 1.6483948456931163, "learning_rate": 7.29390168210806e-06, "loss": 0.0942, "step": 3941 }, { "epoch": 3.6806722689075633, "grad_norm": 2.8256897618422205, "learning_rate": 7.292557978233661e-06, "loss": 0.1878, "step": 3942 }, { "epoch": 3.681605975723623, "grad_norm": 2.2097023710806147, "learning_rate": 7.291214064679656e-06, "loss": 0.1286, "step": 3943 }, { "epoch": 3.682539682539683, "grad_norm": 2.2637526748936128, "learning_rate": 7.289869941568964e-06, "loss": 0.1405, "step": 3944 }, { "epoch": 3.6834733893557425, "grad_norm": 0.43462736732093515, "learning_rate": 7.288525609024518e-06, "loss": 0.0199, "step": 3945 }, { "epoch": 3.6844070961718023, "grad_norm": 0.6437133187770675, "learning_rate": 7.287181067169273e-06, "loss": 0.023, "step": 3946 }, { "epoch": 3.685340802987862, "grad_norm": 1.2037871286966735, "learning_rate": 7.285836316126202e-06, "loss": 0.0757, "step": 3947 }, { "epoch": 3.686274509803922, "grad_norm": 1.1508846800698878, "learning_rate": 7.284491356018295e-06, "loss": 0.0687, "step": 3948 }, { "epoch": 3.6872082166199815, "grad_norm": 0.7719374093536707, "learning_rate": 7.283146186968566e-06, "loss": 0.0227, "step": 3949 }, { "epoch": 3.6881419234360413, "grad_norm": 0.9880338995846991, "learning_rate": 7.281800809100045e-06, "loss": 0.0823, "step": 3950 }, { "epoch": 3.689075630252101, "grad_norm": 3.2869626297457497, "learning_rate": 7.280455222535781e-06, "loss": 0.1474, "step": 3951 }, { "epoch": 3.690009337068161, "grad_norm": 4.249260172648764, "learning_rate": 7.2791094273988415e-06, "loss": 0.1477, "step": 3952 }, { "epoch": 3.6909430438842206, "grad_norm": 1.6108143770589984, "learning_rate": 7.277763423812318e-06, "loss": 0.0798, "step": 3953 }, { "epoch": 3.6918767507002803, "grad_norm": 1.7788523042291229, "learning_rate": 7.276417211899314e-06, "loss": 0.1281, "step": 3954 }, { "epoch": 3.69281045751634, "grad_norm": 1.7112861661573484, "learning_rate": 7.275070791782955e-06, "loss": 0.1123, "step": 3955 }, { "epoch": 3.6937441643324, "grad_norm": 1.7103713466495194, "learning_rate": 7.273724163586389e-06, "loss": 0.1, "step": 3956 }, { "epoch": 3.6946778711484596, "grad_norm": 3.1618011373178003, "learning_rate": 7.272377327432777e-06, "loss": 0.1517, "step": 3957 }, { "epoch": 3.6956115779645193, "grad_norm": 1.0524886148216437, "learning_rate": 7.271030283445303e-06, "loss": 0.0599, "step": 3958 }, { "epoch": 3.696545284780579, "grad_norm": 2.96509165560197, "learning_rate": 7.26968303174717e-06, "loss": 0.164, "step": 3959 }, { "epoch": 3.697478991596639, "grad_norm": 1.2702955392203852, "learning_rate": 7.268335572461597e-06, "loss": 0.0265, "step": 3960 }, { "epoch": 3.6984126984126986, "grad_norm": 2.076507223859358, "learning_rate": 7.266987905711827e-06, "loss": 0.1044, "step": 3961 }, { "epoch": 3.6993464052287583, "grad_norm": 1.115042356000082, "learning_rate": 7.265640031621114e-06, "loss": 0.0842, "step": 3962 }, { "epoch": 3.700280112044818, "grad_norm": 0.8044162677611313, "learning_rate": 7.2642919503127405e-06, "loss": 0.0268, "step": 3963 }, { "epoch": 3.701213818860878, "grad_norm": 1.266235833610989, "learning_rate": 7.262943661910002e-06, "loss": 0.0285, "step": 3964 }, { "epoch": 3.7021475256769376, "grad_norm": 1.905117318389277, "learning_rate": 7.261595166536211e-06, "loss": 0.0852, "step": 3965 }, { "epoch": 3.7030812324929974, "grad_norm": 0.5854461120737021, "learning_rate": 7.260246464314709e-06, "loss": 0.0121, "step": 3966 }, { "epoch": 3.704014939309057, "grad_norm": 1.3604425124229855, "learning_rate": 7.258897555368844e-06, "loss": 0.0533, "step": 3967 }, { "epoch": 3.704948646125117, "grad_norm": 1.355664390138542, "learning_rate": 7.257548439821988e-06, "loss": 0.0565, "step": 3968 }, { "epoch": 3.7058823529411766, "grad_norm": 0.8619927000664404, "learning_rate": 7.256199117797538e-06, "loss": 0.0487, "step": 3969 }, { "epoch": 3.7068160597572364, "grad_norm": 1.210967933256761, "learning_rate": 7.2548495894189e-06, "loss": 0.0373, "step": 3970 }, { "epoch": 3.707749766573296, "grad_norm": 5.663038543157773, "learning_rate": 7.253499854809505e-06, "loss": 0.2528, "step": 3971 }, { "epoch": 3.708683473389356, "grad_norm": 1.6887818240724408, "learning_rate": 7.252149914092798e-06, "loss": 0.062, "step": 3972 }, { "epoch": 3.7096171802054156, "grad_norm": 0.745957137737893, "learning_rate": 7.250799767392249e-06, "loss": 0.0332, "step": 3973 }, { "epoch": 3.7105508870214754, "grad_norm": 1.439034986685065, "learning_rate": 7.2494494148313445e-06, "loss": 0.0746, "step": 3974 }, { "epoch": 3.711484593837535, "grad_norm": 1.9734024053663188, "learning_rate": 7.248098856533586e-06, "loss": 0.1006, "step": 3975 }, { "epoch": 3.712418300653595, "grad_norm": 1.1415788296447709, "learning_rate": 7.2467480926225e-06, "loss": 0.0573, "step": 3976 }, { "epoch": 3.7133520074696547, "grad_norm": 3.647272499523935, "learning_rate": 7.2453971232216244e-06, "loss": 0.114, "step": 3977 }, { "epoch": 3.7142857142857144, "grad_norm": 2.178681525410035, "learning_rate": 7.244045948454525e-06, "loss": 0.1188, "step": 3978 }, { "epoch": 3.715219421101774, "grad_norm": 1.4683311355513744, "learning_rate": 7.24269456844478e-06, "loss": 0.0277, "step": 3979 }, { "epoch": 3.716153127917834, "grad_norm": 1.6830726546218515, "learning_rate": 7.241342983315985e-06, "loss": 0.1047, "step": 3980 }, { "epoch": 3.7170868347338937, "grad_norm": 0.8740928111676383, "learning_rate": 7.239991193191762e-06, "loss": 0.0291, "step": 3981 }, { "epoch": 3.7180205415499534, "grad_norm": 2.7561840868512406, "learning_rate": 7.238639198195743e-06, "loss": 0.1649, "step": 3982 }, { "epoch": 3.718954248366013, "grad_norm": 3.199744583637102, "learning_rate": 7.237286998451586e-06, "loss": 0.1375, "step": 3983 }, { "epoch": 3.719887955182073, "grad_norm": 1.5363286855868334, "learning_rate": 7.235934594082961e-06, "loss": 0.0812, "step": 3984 }, { "epoch": 3.7208216619981327, "grad_norm": 0.4230525735937001, "learning_rate": 7.2345819852135645e-06, "loss": 0.0078, "step": 3985 }, { "epoch": 3.7217553688141924, "grad_norm": 0.634411074615553, "learning_rate": 7.233229171967103e-06, "loss": 0.0076, "step": 3986 }, { "epoch": 3.722689075630252, "grad_norm": 0.34657602688144284, "learning_rate": 7.23187615446731e-06, "loss": 0.0096, "step": 3987 }, { "epoch": 3.723622782446312, "grad_norm": 1.3569436239949033, "learning_rate": 7.230522932837931e-06, "loss": 0.0538, "step": 3988 }, { "epoch": 3.7245564892623717, "grad_norm": 1.130903490385817, "learning_rate": 7.229169507202735e-06, "loss": 0.0439, "step": 3989 }, { "epoch": 3.7254901960784315, "grad_norm": 1.823412258247706, "learning_rate": 7.227815877685507e-06, "loss": 0.0735, "step": 3990 }, { "epoch": 3.726423902894491, "grad_norm": 1.705983991672445, "learning_rate": 7.2264620444100505e-06, "loss": 0.0674, "step": 3991 }, { "epoch": 3.727357609710551, "grad_norm": 2.937978874952231, "learning_rate": 7.225108007500189e-06, "loss": 0.0985, "step": 3992 }, { "epoch": 3.7282913165266107, "grad_norm": 2.8003805642993944, "learning_rate": 7.223753767079764e-06, "loss": 0.1507, "step": 3993 }, { "epoch": 3.7292250233426705, "grad_norm": 1.6660291174103992, "learning_rate": 7.222399323272635e-06, "loss": 0.0662, "step": 3994 }, { "epoch": 3.7301587301587302, "grad_norm": 1.0479858966897144, "learning_rate": 7.221044676202683e-06, "loss": 0.0203, "step": 3995 }, { "epoch": 3.73109243697479, "grad_norm": 3.1689231429219302, "learning_rate": 7.219689825993803e-06, "loss": 0.155, "step": 3996 }, { "epoch": 3.7320261437908497, "grad_norm": 0.8249453541683947, "learning_rate": 7.218334772769912e-06, "loss": 0.0118, "step": 3997 }, { "epoch": 3.7329598506069095, "grad_norm": 4.557983656560968, "learning_rate": 7.216979516654944e-06, "loss": 0.3273, "step": 3998 }, { "epoch": 3.7338935574229692, "grad_norm": 2.2965680432090463, "learning_rate": 7.215624057772852e-06, "loss": 0.1028, "step": 3999 }, { "epoch": 3.734827264239029, "grad_norm": 1.6412160903723843, "learning_rate": 7.214268396247608e-06, "loss": 0.1165, "step": 4000 }, { "epoch": 3.7357609710550888, "grad_norm": 2.292341863028166, "learning_rate": 7.212912532203201e-06, "loss": 0.1187, "step": 4001 }, { "epoch": 3.7366946778711485, "grad_norm": 1.060419529395313, "learning_rate": 7.211556465763643e-06, "loss": 0.0342, "step": 4002 }, { "epoch": 3.7376283846872083, "grad_norm": 0.8172260797035462, "learning_rate": 7.210200197052957e-06, "loss": 0.0321, "step": 4003 }, { "epoch": 3.738562091503268, "grad_norm": 4.007575413499242, "learning_rate": 7.20884372619519e-06, "loss": 0.1157, "step": 4004 }, { "epoch": 3.7394957983193278, "grad_norm": 1.7498731624690511, "learning_rate": 7.207487053314408e-06, "loss": 0.0692, "step": 4005 }, { "epoch": 3.7404295051353875, "grad_norm": 0.9236978256958726, "learning_rate": 7.206130178534692e-06, "loss": 0.0179, "step": 4006 }, { "epoch": 3.7413632119514473, "grad_norm": 0.9679135057112633, "learning_rate": 7.204773101980142e-06, "loss": 0.0455, "step": 4007 }, { "epoch": 3.742296918767507, "grad_norm": 0.7351420239669937, "learning_rate": 7.20341582377488e-06, "loss": 0.0396, "step": 4008 }, { "epoch": 3.743230625583567, "grad_norm": 2.41576172470054, "learning_rate": 7.202058344043043e-06, "loss": 0.1386, "step": 4009 }, { "epoch": 3.7441643323996265, "grad_norm": 2.0413020749103805, "learning_rate": 7.200700662908788e-06, "loss": 0.075, "step": 4010 }, { "epoch": 3.7450980392156863, "grad_norm": 1.7992458320711038, "learning_rate": 7.199342780496289e-06, "loss": 0.0532, "step": 4011 }, { "epoch": 3.746031746031746, "grad_norm": 4.349675920342818, "learning_rate": 7.197984696929739e-06, "loss": 0.1107, "step": 4012 }, { "epoch": 3.746965452847806, "grad_norm": 7.758838104946765, "learning_rate": 7.196626412333349e-06, "loss": 0.0509, "step": 4013 }, { "epoch": 3.7478991596638656, "grad_norm": 2.2277502898571, "learning_rate": 7.195267926831352e-06, "loss": 0.1709, "step": 4014 }, { "epoch": 3.7488328664799253, "grad_norm": 1.0868890888693785, "learning_rate": 7.193909240547995e-06, "loss": 0.0398, "step": 4015 }, { "epoch": 3.749766573295985, "grad_norm": 0.6350202299831628, "learning_rate": 7.192550353607542e-06, "loss": 0.0192, "step": 4016 }, { "epoch": 3.750700280112045, "grad_norm": 5.482949483851297, "learning_rate": 7.191191266134283e-06, "loss": 0.2399, "step": 4017 }, { "epoch": 3.7516339869281046, "grad_norm": 1.9856508770026564, "learning_rate": 7.189831978252517e-06, "loss": 0.0748, "step": 4018 }, { "epoch": 3.7525676937441643, "grad_norm": 1.12766156339244, "learning_rate": 7.188472490086569e-06, "loss": 0.0402, "step": 4019 }, { "epoch": 3.753501400560224, "grad_norm": 1.8350872868122934, "learning_rate": 7.187112801760777e-06, "loss": 0.0464, "step": 4020 }, { "epoch": 3.754435107376284, "grad_norm": 2.239453219646877, "learning_rate": 7.185752913399502e-06, "loss": 0.1219, "step": 4021 }, { "epoch": 3.7553688141923436, "grad_norm": 0.5015473078651892, "learning_rate": 7.184392825127117e-06, "loss": 0.0195, "step": 4022 }, { "epoch": 3.7563025210084033, "grad_norm": 0.4848141742073792, "learning_rate": 7.1830325370680196e-06, "loss": 0.0144, "step": 4023 }, { "epoch": 3.757236227824463, "grad_norm": 4.527250344703607, "learning_rate": 7.181672049346623e-06, "loss": 0.101, "step": 4024 }, { "epoch": 3.758169934640523, "grad_norm": 3.9916582484437293, "learning_rate": 7.180311362087359e-06, "loss": 0.2287, "step": 4025 }, { "epoch": 3.7591036414565826, "grad_norm": 2.1474576649904904, "learning_rate": 7.178950475414675e-06, "loss": 0.0442, "step": 4026 }, { "epoch": 3.7600373482726424, "grad_norm": 2.426964435356465, "learning_rate": 7.177589389453042e-06, "loss": 0.0442, "step": 4027 }, { "epoch": 3.760971055088702, "grad_norm": 1.6878345198181353, "learning_rate": 7.176228104326944e-06, "loss": 0.1202, "step": 4028 }, { "epoch": 3.761904761904762, "grad_norm": 6.794194645790444, "learning_rate": 7.174866620160888e-06, "loss": 0.2683, "step": 4029 }, { "epoch": 3.7628384687208216, "grad_norm": 5.027480724536665, "learning_rate": 7.173504937079395e-06, "loss": 0.1164, "step": 4030 }, { "epoch": 3.7637721755368814, "grad_norm": 2.3350466635633036, "learning_rate": 7.172143055207005e-06, "loss": 0.1559, "step": 4031 }, { "epoch": 3.764705882352941, "grad_norm": 6.350996294726975, "learning_rate": 7.170780974668279e-06, "loss": 0.219, "step": 4032 }, { "epoch": 3.765639589169001, "grad_norm": 2.4635755600558933, "learning_rate": 7.169418695587791e-06, "loss": 0.1111, "step": 4033 }, { "epoch": 3.7665732959850606, "grad_norm": 1.7905194209132762, "learning_rate": 7.1680562180901415e-06, "loss": 0.1448, "step": 4034 }, { "epoch": 3.7675070028011204, "grad_norm": 1.7857332131116563, "learning_rate": 7.166693542299939e-06, "loss": 0.0869, "step": 4035 }, { "epoch": 3.76844070961718, "grad_norm": 3.0220532954318746, "learning_rate": 7.165330668341819e-06, "loss": 0.1555, "step": 4036 }, { "epoch": 3.76937441643324, "grad_norm": 2.6718695471826335, "learning_rate": 7.163967596340429e-06, "loss": 0.1024, "step": 4037 }, { "epoch": 3.7703081232492996, "grad_norm": 2.3065714072393337, "learning_rate": 7.162604326420437e-06, "loss": 0.0901, "step": 4038 }, { "epoch": 3.7712418300653594, "grad_norm": 1.9043487681409632, "learning_rate": 7.16124085870653e-06, "loss": 0.1246, "step": 4039 }, { "epoch": 3.772175536881419, "grad_norm": 2.2800854387972604, "learning_rate": 7.159877193323412e-06, "loss": 0.1525, "step": 4040 }, { "epoch": 3.773109243697479, "grad_norm": 2.3987813700297553, "learning_rate": 7.158513330395804e-06, "loss": 0.1757, "step": 4041 }, { "epoch": 3.7740429505135387, "grad_norm": 3.8710783375691777, "learning_rate": 7.157149270048448e-06, "loss": 0.0337, "step": 4042 }, { "epoch": 3.7749766573295984, "grad_norm": 2.0685626008869735, "learning_rate": 7.1557850124061e-06, "loss": 0.0717, "step": 4043 }, { "epoch": 3.775910364145658, "grad_norm": 3.1858811485182623, "learning_rate": 7.15442055759354e-06, "loss": 0.0814, "step": 4044 }, { "epoch": 3.776844070961718, "grad_norm": 0.9025979073058186, "learning_rate": 7.153055905735557e-06, "loss": 0.0456, "step": 4045 }, { "epoch": 3.7777777777777777, "grad_norm": 2.5215988426224016, "learning_rate": 7.15169105695697e-06, "loss": 0.1208, "step": 4046 }, { "epoch": 3.7787114845938374, "grad_norm": 1.4334468537359615, "learning_rate": 7.1503260113826035e-06, "loss": 0.0407, "step": 4047 }, { "epoch": 3.779645191409897, "grad_norm": 1.690787527247449, "learning_rate": 7.148960769137307e-06, "loss": 0.0076, "step": 4048 }, { "epoch": 3.780578898225957, "grad_norm": 1.7658307430788038, "learning_rate": 7.147595330345951e-06, "loss": 0.0581, "step": 4049 }, { "epoch": 3.7815126050420167, "grad_norm": 6.269305781301275, "learning_rate": 7.146229695133416e-06, "loss": 0.1545, "step": 4050 }, { "epoch": 3.7824463118580764, "grad_norm": 5.841891922083343, "learning_rate": 7.144863863624607e-06, "loss": 0.2676, "step": 4051 }, { "epoch": 3.783380018674136, "grad_norm": 2.7440247873043746, "learning_rate": 7.143497835944441e-06, "loss": 0.1514, "step": 4052 }, { "epoch": 3.784313725490196, "grad_norm": 2.9749688383965025, "learning_rate": 7.142131612217858e-06, "loss": 0.1456, "step": 4053 }, { "epoch": 3.7852474323062557, "grad_norm": 1.3269386642378775, "learning_rate": 7.140765192569814e-06, "loss": 0.0701, "step": 4054 }, { "epoch": 3.7861811391223155, "grad_norm": 0.7817685083849084, "learning_rate": 7.139398577125284e-06, "loss": 0.0171, "step": 4055 }, { "epoch": 3.787114845938375, "grad_norm": 1.853502220279058, "learning_rate": 7.138031766009259e-06, "loss": 0.0216, "step": 4056 }, { "epoch": 3.788048552754435, "grad_norm": 0.8661569349826399, "learning_rate": 7.1366647593467486e-06, "loss": 0.0288, "step": 4057 }, { "epoch": 3.7889822595704947, "grad_norm": 1.367519646722407, "learning_rate": 7.135297557262781e-06, "loss": 0.0654, "step": 4058 }, { "epoch": 3.7899159663865545, "grad_norm": 2.033864773968156, "learning_rate": 7.133930159882403e-06, "loss": 0.1056, "step": 4059 }, { "epoch": 3.7908496732026142, "grad_norm": 0.5722792098891192, "learning_rate": 7.132562567330675e-06, "loss": 0.0142, "step": 4060 }, { "epoch": 3.791783380018674, "grad_norm": 0.46766974705335435, "learning_rate": 7.131194779732682e-06, "loss": 0.0155, "step": 4061 }, { "epoch": 3.7927170868347337, "grad_norm": 1.9253963003184376, "learning_rate": 7.129826797213521e-06, "loss": 0.1054, "step": 4062 }, { "epoch": 3.7936507936507935, "grad_norm": 0.6617161752441661, "learning_rate": 7.128458619898309e-06, "loss": 0.0226, "step": 4063 }, { "epoch": 3.7945845004668532, "grad_norm": 0.3497105547073562, "learning_rate": 7.127090247912183e-06, "loss": 0.0045, "step": 4064 }, { "epoch": 3.795518207282913, "grad_norm": 1.4132479168476995, "learning_rate": 7.125721681380293e-06, "loss": 0.0333, "step": 4065 }, { "epoch": 3.7964519140989728, "grad_norm": 2.5814504912497505, "learning_rate": 7.12435292042781e-06, "loss": 0.1263, "step": 4066 }, { "epoch": 3.7973856209150325, "grad_norm": 2.2424052149699745, "learning_rate": 7.122983965179924e-06, "loss": 0.0938, "step": 4067 }, { "epoch": 3.7983193277310923, "grad_norm": 1.2395593612780935, "learning_rate": 7.121614815761839e-06, "loss": 0.0421, "step": 4068 }, { "epoch": 3.799253034547152, "grad_norm": 1.8079610821606427, "learning_rate": 7.120245472298779e-06, "loss": 0.1128, "step": 4069 }, { "epoch": 3.8001867413632118, "grad_norm": 0.7779508680211079, "learning_rate": 7.118875934915987e-06, "loss": 0.0375, "step": 4070 }, { "epoch": 3.8011204481792715, "grad_norm": 2.4847494324470776, "learning_rate": 7.11750620373872e-06, "loss": 0.2262, "step": 4071 }, { "epoch": 3.8020541549953313, "grad_norm": 1.36596319970721, "learning_rate": 7.116136278892257e-06, "loss": 0.059, "step": 4072 }, { "epoch": 3.802987861811391, "grad_norm": 0.7372399919852829, "learning_rate": 7.114766160501892e-06, "loss": 0.0219, "step": 4073 }, { "epoch": 3.803921568627451, "grad_norm": 1.2245871894797007, "learning_rate": 7.113395848692937e-06, "loss": 0.0273, "step": 4074 }, { "epoch": 3.8048552754435105, "grad_norm": 1.371606918689206, "learning_rate": 7.112025343590721e-06, "loss": 0.0827, "step": 4075 }, { "epoch": 3.8057889822595703, "grad_norm": 1.9663870466918816, "learning_rate": 7.110654645320595e-06, "loss": 0.0765, "step": 4076 }, { "epoch": 3.80672268907563, "grad_norm": 1.1802883484426088, "learning_rate": 7.109283754007921e-06, "loss": 0.0325, "step": 4077 }, { "epoch": 3.80765639589169, "grad_norm": 1.1071387845275589, "learning_rate": 7.107912669778084e-06, "loss": 0.0509, "step": 4078 }, { "epoch": 3.8085901027077496, "grad_norm": 0.37840764710827884, "learning_rate": 7.106541392756484e-06, "loss": 0.0123, "step": 4079 }, { "epoch": 3.8095238095238093, "grad_norm": 1.3506202436966588, "learning_rate": 7.10516992306854e-06, "loss": 0.074, "step": 4080 }, { "epoch": 3.810457516339869, "grad_norm": 0.9545684778299868, "learning_rate": 7.103798260839687e-06, "loss": 0.0416, "step": 4081 }, { "epoch": 3.811391223155929, "grad_norm": 0.8507026443615568, "learning_rate": 7.1024264061953786e-06, "loss": 0.0492, "step": 4082 }, { "epoch": 3.8123249299719886, "grad_norm": 1.313696922271759, "learning_rate": 7.101054359261086e-06, "loss": 0.0534, "step": 4083 }, { "epoch": 3.8132586367880483, "grad_norm": 1.4356106516140488, "learning_rate": 7.0996821201623e-06, "loss": 0.0791, "step": 4084 }, { "epoch": 3.814192343604108, "grad_norm": 1.669987173689035, "learning_rate": 7.098309689024525e-06, "loss": 0.0819, "step": 4085 }, { "epoch": 3.815126050420168, "grad_norm": 0.6734082492596895, "learning_rate": 7.096937065973285e-06, "loss": 0.0414, "step": 4086 }, { "epoch": 3.8160597572362276, "grad_norm": 3.4102997089489966, "learning_rate": 7.095564251134121e-06, "loss": 0.0673, "step": 4087 }, { "epoch": 3.8169934640522873, "grad_norm": 1.32121835095273, "learning_rate": 7.094191244632594e-06, "loss": 0.0342, "step": 4088 }, { "epoch": 3.817927170868347, "grad_norm": 2.0075700937988503, "learning_rate": 7.092818046594279e-06, "loss": 0.1534, "step": 4089 }, { "epoch": 3.818860877684407, "grad_norm": 0.9988699021175902, "learning_rate": 7.091444657144768e-06, "loss": 0.0447, "step": 4090 }, { "epoch": 3.8197945845004666, "grad_norm": 1.6609574865567838, "learning_rate": 7.090071076409677e-06, "loss": 0.1163, "step": 4091 }, { "epoch": 3.8207282913165264, "grad_norm": 5.543628361795882, "learning_rate": 7.088697304514633e-06, "loss": 0.2698, "step": 4092 }, { "epoch": 3.821661998132586, "grad_norm": 1.180283858005707, "learning_rate": 7.087323341585282e-06, "loss": 0.052, "step": 4093 }, { "epoch": 3.822595704948646, "grad_norm": 1.195258621665203, "learning_rate": 7.085949187747287e-06, "loss": 0.0696, "step": 4094 }, { "epoch": 3.8235294117647056, "grad_norm": 1.973961937191341, "learning_rate": 7.084574843126333e-06, "loss": 0.0838, "step": 4095 }, { "epoch": 3.8244631185807654, "grad_norm": 1.6268863460710064, "learning_rate": 7.083200307848116e-06, "loss": 0.0788, "step": 4096 }, { "epoch": 3.825396825396825, "grad_norm": 4.175357242249098, "learning_rate": 7.081825582038352e-06, "loss": 0.2349, "step": 4097 }, { "epoch": 3.826330532212885, "grad_norm": 1.6491136379205757, "learning_rate": 7.080450665822777e-06, "loss": 0.0926, "step": 4098 }, { "epoch": 3.8272642390289446, "grad_norm": 0.9625586950261017, "learning_rate": 7.07907555932714e-06, "loss": 0.0218, "step": 4099 }, { "epoch": 3.828197945845005, "grad_norm": 2.821715661206522, "learning_rate": 7.077700262677212e-06, "loss": 0.1639, "step": 4100 }, { "epoch": 3.8291316526610646, "grad_norm": 1.9219185833934405, "learning_rate": 7.0763247759987765e-06, "loss": 0.0802, "step": 4101 }, { "epoch": 3.8300653594771243, "grad_norm": 4.606233759183722, "learning_rate": 7.074949099417638e-06, "loss": 0.3401, "step": 4102 }, { "epoch": 3.830999066293184, "grad_norm": 1.2232649043601243, "learning_rate": 7.073573233059618e-06, "loss": 0.0539, "step": 4103 }, { "epoch": 3.831932773109244, "grad_norm": 0.7841026075587223, "learning_rate": 7.072197177050553e-06, "loss": 0.0356, "step": 4104 }, { "epoch": 3.8328664799253036, "grad_norm": 2.4596463803632473, "learning_rate": 7.070820931516299e-06, "loss": 0.1219, "step": 4105 }, { "epoch": 3.8338001867413634, "grad_norm": 1.4376393761046182, "learning_rate": 7.06944449658273e-06, "loss": 0.1242, "step": 4106 }, { "epoch": 3.834733893557423, "grad_norm": 0.8207878494595158, "learning_rate": 7.068067872375734e-06, "loss": 0.0164, "step": 4107 }, { "epoch": 3.835667600373483, "grad_norm": 1.8847382886541322, "learning_rate": 7.066691059021221e-06, "loss": 0.1177, "step": 4108 }, { "epoch": 3.8366013071895426, "grad_norm": 0.7314060605588721, "learning_rate": 7.065314056645113e-06, "loss": 0.0215, "step": 4109 }, { "epoch": 3.8375350140056024, "grad_norm": 1.152359298930137, "learning_rate": 7.063936865373353e-06, "loss": 0.0323, "step": 4110 }, { "epoch": 3.838468720821662, "grad_norm": 1.3907109017773784, "learning_rate": 7.0625594853319025e-06, "loss": 0.0653, "step": 4111 }, { "epoch": 3.839402427637722, "grad_norm": 4.906945703074804, "learning_rate": 7.061181916646733e-06, "loss": 0.0863, "step": 4112 }, { "epoch": 3.8403361344537816, "grad_norm": 1.266183913205298, "learning_rate": 7.059804159443844e-06, "loss": 0.0642, "step": 4113 }, { "epoch": 3.8412698412698414, "grad_norm": 3.136149732391955, "learning_rate": 7.058426213849241e-06, "loss": 0.2105, "step": 4114 }, { "epoch": 3.842203548085901, "grad_norm": 2.2726749158959674, "learning_rate": 7.0570480799889575e-06, "loss": 0.1318, "step": 4115 }, { "epoch": 3.843137254901961, "grad_norm": 1.0307321902409978, "learning_rate": 7.055669757989034e-06, "loss": 0.0381, "step": 4116 }, { "epoch": 3.8440709617180207, "grad_norm": 0.8917823274003855, "learning_rate": 7.0542912479755385e-06, "loss": 0.0289, "step": 4117 }, { "epoch": 3.8450046685340804, "grad_norm": 2.1367174141344973, "learning_rate": 7.052912550074546e-06, "loss": 0.1127, "step": 4118 }, { "epoch": 3.84593837535014, "grad_norm": 1.4788228272036197, "learning_rate": 7.051533664412157e-06, "loss": 0.1017, "step": 4119 }, { "epoch": 3.8468720821662, "grad_norm": 1.4697560808453118, "learning_rate": 7.050154591114483e-06, "loss": 0.1007, "step": 4120 }, { "epoch": 3.8478057889822597, "grad_norm": 1.5709238258068905, "learning_rate": 7.048775330307658e-06, "loss": 0.0562, "step": 4121 }, { "epoch": 3.8487394957983194, "grad_norm": 1.734081646197721, "learning_rate": 7.047395882117829e-06, "loss": 0.1236, "step": 4122 }, { "epoch": 3.849673202614379, "grad_norm": 1.1194903302577697, "learning_rate": 7.046016246671162e-06, "loss": 0.0686, "step": 4123 }, { "epoch": 3.850606909430439, "grad_norm": 1.060759628058162, "learning_rate": 7.044636424093839e-06, "loss": 0.0644, "step": 4124 }, { "epoch": 3.8515406162464987, "grad_norm": 1.540300287370233, "learning_rate": 7.043256414512061e-06, "loss": 0.0432, "step": 4125 }, { "epoch": 3.8524743230625584, "grad_norm": 0.9106386898056639, "learning_rate": 7.041876218052046e-06, "loss": 0.0523, "step": 4126 }, { "epoch": 3.853408029878618, "grad_norm": 1.3984976929917383, "learning_rate": 7.040495834840025e-06, "loss": 0.0853, "step": 4127 }, { "epoch": 3.854341736694678, "grad_norm": 1.9126393189130322, "learning_rate": 7.039115265002253e-06, "loss": 0.0848, "step": 4128 }, { "epoch": 3.8552754435107377, "grad_norm": 1.4147852777914989, "learning_rate": 7.037734508664995e-06, "loss": 0.064, "step": 4129 }, { "epoch": 3.8562091503267975, "grad_norm": 0.8007906335636767, "learning_rate": 7.036353565954539e-06, "loss": 0.0214, "step": 4130 }, { "epoch": 3.857142857142857, "grad_norm": 0.5839478887822741, "learning_rate": 7.034972436997185e-06, "loss": 0.0131, "step": 4131 }, { "epoch": 3.858076563958917, "grad_norm": 2.5814245987064717, "learning_rate": 7.0335911219192545e-06, "loss": 0.1472, "step": 4132 }, { "epoch": 3.8590102707749767, "grad_norm": 2.386041940914463, "learning_rate": 7.032209620847083e-06, "loss": 0.1745, "step": 4133 }, { "epoch": 3.8599439775910365, "grad_norm": 1.6174702901678177, "learning_rate": 7.030827933907023e-06, "loss": 0.1024, "step": 4134 }, { "epoch": 3.860877684407096, "grad_norm": 1.2925453371916047, "learning_rate": 7.0294460612254455e-06, "loss": 0.0634, "step": 4135 }, { "epoch": 3.861811391223156, "grad_norm": 1.7299116594702049, "learning_rate": 7.02806400292874e-06, "loss": 0.0268, "step": 4136 }, { "epoch": 3.8627450980392157, "grad_norm": 1.0434124487424956, "learning_rate": 7.0266817591433065e-06, "loss": 0.017, "step": 4137 }, { "epoch": 3.8636788048552755, "grad_norm": 0.6510880421947607, "learning_rate": 7.025299329995572e-06, "loss": 0.015, "step": 4138 }, { "epoch": 3.8646125116713352, "grad_norm": 0.8902362313157425, "learning_rate": 7.023916715611969e-06, "loss": 0.0315, "step": 4139 }, { "epoch": 3.865546218487395, "grad_norm": 2.0669653260147594, "learning_rate": 7.022533916118958e-06, "loss": 0.1043, "step": 4140 }, { "epoch": 3.8664799253034547, "grad_norm": 1.5147017303496695, "learning_rate": 7.021150931643007e-06, "loss": 0.0813, "step": 4141 }, { "epoch": 3.8674136321195145, "grad_norm": 2.9971357128624487, "learning_rate": 7.0197677623106085e-06, "loss": 0.0589, "step": 4142 }, { "epoch": 3.8683473389355743, "grad_norm": 2.480572808980537, "learning_rate": 7.018384408248267e-06, "loss": 0.1102, "step": 4143 }, { "epoch": 3.869281045751634, "grad_norm": 1.1713318313034746, "learning_rate": 7.017000869582504e-06, "loss": 0.0818, "step": 4144 }, { "epoch": 3.8702147525676938, "grad_norm": 0.9918318625483413, "learning_rate": 7.015617146439863e-06, "loss": 0.0557, "step": 4145 }, { "epoch": 3.8711484593837535, "grad_norm": 2.221608733506879, "learning_rate": 7.014233238946896e-06, "loss": 0.0861, "step": 4146 }, { "epoch": 3.8720821661998133, "grad_norm": 1.278554188759088, "learning_rate": 7.012849147230181e-06, "loss": 0.0497, "step": 4147 }, { "epoch": 3.873015873015873, "grad_norm": 1.0623876553006608, "learning_rate": 7.011464871416304e-06, "loss": 0.0747, "step": 4148 }, { "epoch": 3.8739495798319328, "grad_norm": 1.1005211237037862, "learning_rate": 7.010080411631876e-06, "loss": 0.0521, "step": 4149 }, { "epoch": 3.8748832866479925, "grad_norm": 1.9833154662007115, "learning_rate": 7.008695768003518e-06, "loss": 0.1362, "step": 4150 }, { "epoch": 3.8758169934640523, "grad_norm": 1.2915526274563929, "learning_rate": 7.0073109406578745e-06, "loss": 0.1172, "step": 4151 }, { "epoch": 3.876750700280112, "grad_norm": 0.3053638610930248, "learning_rate": 7.0059259297216e-06, "loss": 0.0108, "step": 4152 }, { "epoch": 3.877684407096172, "grad_norm": 0.9216176793237999, "learning_rate": 7.00454073532137e-06, "loss": 0.0469, "step": 4153 }, { "epoch": 3.8786181139122315, "grad_norm": 0.38182577680218166, "learning_rate": 7.003155357583877e-06, "loss": 0.0196, "step": 4154 }, { "epoch": 3.8795518207282913, "grad_norm": 3.731668384253367, "learning_rate": 7.001769796635827e-06, "loss": 0.1551, "step": 4155 }, { "epoch": 3.880485527544351, "grad_norm": 6.570504353108517, "learning_rate": 7.000384052603946e-06, "loss": 0.2846, "step": 4156 }, { "epoch": 3.881419234360411, "grad_norm": 2.567167556013672, "learning_rate": 6.998998125614974e-06, "loss": 0.134, "step": 4157 }, { "epoch": 3.8823529411764706, "grad_norm": 1.2209834067167114, "learning_rate": 6.99761201579567e-06, "loss": 0.059, "step": 4158 }, { "epoch": 3.8832866479925303, "grad_norm": 4.18701619829476, "learning_rate": 6.996225723272812e-06, "loss": 0.2114, "step": 4159 }, { "epoch": 3.88422035480859, "grad_norm": 0.8574746292450398, "learning_rate": 6.9948392481731865e-06, "loss": 0.0531, "step": 4160 }, { "epoch": 3.88515406162465, "grad_norm": 1.4470679397638582, "learning_rate": 6.993452590623606e-06, "loss": 0.0158, "step": 4161 }, { "epoch": 3.8860877684407096, "grad_norm": 0.5203074212464454, "learning_rate": 6.992065750750893e-06, "loss": 0.0097, "step": 4162 }, { "epoch": 3.8870214752567693, "grad_norm": 2.877373710806633, "learning_rate": 6.99067872868189e-06, "loss": 0.2373, "step": 4163 }, { "epoch": 3.887955182072829, "grad_norm": 1.6505095232765212, "learning_rate": 6.989291524543456e-06, "loss": 0.06, "step": 4164 }, { "epoch": 3.888888888888889, "grad_norm": 0.9608099111626819, "learning_rate": 6.987904138462465e-06, "loss": 0.0512, "step": 4165 }, { "epoch": 3.8898225957049486, "grad_norm": 3.246512299108542, "learning_rate": 6.98651657056581e-06, "loss": 0.2383, "step": 4166 }, { "epoch": 3.8907563025210083, "grad_norm": 2.087218118283218, "learning_rate": 6.985128820980398e-06, "loss": 0.1046, "step": 4167 }, { "epoch": 3.891690009337068, "grad_norm": 2.1152300415414595, "learning_rate": 6.983740889833156e-06, "loss": 0.0741, "step": 4168 }, { "epoch": 3.892623716153128, "grad_norm": 1.0564916596844836, "learning_rate": 6.982352777251023e-06, "loss": 0.0442, "step": 4169 }, { "epoch": 3.8935574229691876, "grad_norm": 1.0443323930326882, "learning_rate": 6.9809644833609595e-06, "loss": 0.0667, "step": 4170 }, { "epoch": 3.8944911297852474, "grad_norm": 3.6105197640136963, "learning_rate": 6.979576008289936e-06, "loss": 0.2425, "step": 4171 }, { "epoch": 3.895424836601307, "grad_norm": 0.6996475538351429, "learning_rate": 6.978187352164949e-06, "loss": 0.0269, "step": 4172 }, { "epoch": 3.896358543417367, "grad_norm": 1.4125231004733225, "learning_rate": 6.976798515113003e-06, "loss": 0.0521, "step": 4173 }, { "epoch": 3.8972922502334266, "grad_norm": 1.2836396269168437, "learning_rate": 6.975409497261125e-06, "loss": 0.0568, "step": 4174 }, { "epoch": 3.8982259570494864, "grad_norm": 2.3644515341635226, "learning_rate": 6.974020298736354e-06, "loss": 0.1036, "step": 4175 }, { "epoch": 3.899159663865546, "grad_norm": 2.3581182091772535, "learning_rate": 6.972630919665748e-06, "loss": 0.0569, "step": 4176 }, { "epoch": 3.900093370681606, "grad_norm": 1.2547080252768343, "learning_rate": 6.971241360176381e-06, "loss": 0.0475, "step": 4177 }, { "epoch": 3.9010270774976656, "grad_norm": 1.0951945267761014, "learning_rate": 6.969851620395343e-06, "loss": 0.0687, "step": 4178 }, { "epoch": 3.9019607843137254, "grad_norm": 0.8517301774052631, "learning_rate": 6.968461700449742e-06, "loss": 0.0232, "step": 4179 }, { "epoch": 3.902894491129785, "grad_norm": 2.0940370238169277, "learning_rate": 6.967071600466699e-06, "loss": 0.1539, "step": 4180 }, { "epoch": 3.903828197945845, "grad_norm": 2.5977141515273874, "learning_rate": 6.9656813205733575e-06, "loss": 0.1174, "step": 4181 }, { "epoch": 3.9047619047619047, "grad_norm": 1.5712980452167238, "learning_rate": 6.964290860896871e-06, "loss": 0.0618, "step": 4182 }, { "epoch": 3.9056956115779644, "grad_norm": 1.2718545813827837, "learning_rate": 6.962900221564415e-06, "loss": 0.0908, "step": 4183 }, { "epoch": 3.906629318394024, "grad_norm": 0.37333643625882323, "learning_rate": 6.961509402703175e-06, "loss": 0.007, "step": 4184 }, { "epoch": 3.907563025210084, "grad_norm": 2.5481292267412505, "learning_rate": 6.9601184044403605e-06, "loss": 0.1367, "step": 4185 }, { "epoch": 3.9084967320261437, "grad_norm": 0.4130309005962697, "learning_rate": 6.958727226903191e-06, "loss": 0.0241, "step": 4186 }, { "epoch": 3.9094304388422034, "grad_norm": 1.2530721194672616, "learning_rate": 6.957335870218906e-06, "loss": 0.0681, "step": 4187 }, { "epoch": 3.910364145658263, "grad_norm": 4.353646045545304, "learning_rate": 6.955944334514758e-06, "loss": 0.1078, "step": 4188 }, { "epoch": 3.911297852474323, "grad_norm": 2.422296051686996, "learning_rate": 6.954552619918023e-06, "loss": 0.1624, "step": 4189 }, { "epoch": 3.9122315592903827, "grad_norm": 1.2675696897856097, "learning_rate": 6.9531607265559855e-06, "loss": 0.0554, "step": 4190 }, { "epoch": 3.9131652661064424, "grad_norm": 1.8875178165431141, "learning_rate": 6.951768654555949e-06, "loss": 0.1353, "step": 4191 }, { "epoch": 3.914098972922502, "grad_norm": 2.1877654567542257, "learning_rate": 6.950376404045235e-06, "loss": 0.1199, "step": 4192 }, { "epoch": 3.915032679738562, "grad_norm": 0.3886390433606379, "learning_rate": 6.94898397515118e-06, "loss": 0.0145, "step": 4193 }, { "epoch": 3.9159663865546217, "grad_norm": 1.1818210450468853, "learning_rate": 6.947591368001138e-06, "loss": 0.05, "step": 4194 }, { "epoch": 3.9169000933706815, "grad_norm": 1.2719181998759221, "learning_rate": 6.946198582722474e-06, "loss": 0.0426, "step": 4195 }, { "epoch": 3.917833800186741, "grad_norm": 1.9610953171953565, "learning_rate": 6.944805619442578e-06, "loss": 0.1811, "step": 4196 }, { "epoch": 3.918767507002801, "grad_norm": 0.6981157554236173, "learning_rate": 6.943412478288851e-06, "loss": 0.0256, "step": 4197 }, { "epoch": 3.9197012138188607, "grad_norm": 1.0893144909228758, "learning_rate": 6.942019159388709e-06, "loss": 0.0484, "step": 4198 }, { "epoch": 3.9206349206349205, "grad_norm": 2.1033182539800968, "learning_rate": 6.940625662869588e-06, "loss": 0.0719, "step": 4199 }, { "epoch": 3.9215686274509802, "grad_norm": 2.371650470266384, "learning_rate": 6.939231988858938e-06, "loss": 0.1123, "step": 4200 }, { "epoch": 3.9225023342670404, "grad_norm": 1.0352560325352667, "learning_rate": 6.937838137484226e-06, "loss": 0.0278, "step": 4201 }, { "epoch": 3.9234360410831, "grad_norm": 1.8502635355763155, "learning_rate": 6.936444108872935e-06, "loss": 0.0926, "step": 4202 }, { "epoch": 3.92436974789916, "grad_norm": 1.2974180979656404, "learning_rate": 6.935049903152563e-06, "loss": 0.0592, "step": 4203 }, { "epoch": 3.9253034547152197, "grad_norm": 3.100280668583476, "learning_rate": 6.933655520450628e-06, "loss": 0.1493, "step": 4204 }, { "epoch": 3.9262371615312794, "grad_norm": 0.6871835091834079, "learning_rate": 6.932260960894659e-06, "loss": 0.0231, "step": 4205 }, { "epoch": 3.927170868347339, "grad_norm": 1.7007293739528764, "learning_rate": 6.930866224612207e-06, "loss": 0.1103, "step": 4206 }, { "epoch": 3.928104575163399, "grad_norm": 1.9496115842445851, "learning_rate": 6.929471311730832e-06, "loss": 0.1118, "step": 4207 }, { "epoch": 3.9290382819794587, "grad_norm": 1.4443555978914142, "learning_rate": 6.928076222378117e-06, "loss": 0.0674, "step": 4208 }, { "epoch": 3.9299719887955185, "grad_norm": 1.4959355549842333, "learning_rate": 6.9266809566816574e-06, "loss": 0.0588, "step": 4209 }, { "epoch": 3.930905695611578, "grad_norm": 0.40912967357127006, "learning_rate": 6.925285514769065e-06, "loss": 0.0088, "step": 4210 }, { "epoch": 3.931839402427638, "grad_norm": 1.0238599212999502, "learning_rate": 6.923889896767969e-06, "loss": 0.0587, "step": 4211 }, { "epoch": 3.9327731092436977, "grad_norm": 2.1738549522014567, "learning_rate": 6.922494102806013e-06, "loss": 0.1293, "step": 4212 }, { "epoch": 3.9337068160597575, "grad_norm": 0.3006741595684197, "learning_rate": 6.921098133010859e-06, "loss": 0.0067, "step": 4213 }, { "epoch": 3.9346405228758172, "grad_norm": 1.592156805968131, "learning_rate": 6.919701987510184e-06, "loss": 0.0861, "step": 4214 }, { "epoch": 3.935574229691877, "grad_norm": 2.7194558103985282, "learning_rate": 6.918305666431678e-06, "loss": 0.1141, "step": 4215 }, { "epoch": 3.9365079365079367, "grad_norm": 0.6010713900389253, "learning_rate": 6.9169091699030535e-06, "loss": 0.0228, "step": 4216 }, { "epoch": 3.9374416433239965, "grad_norm": 1.7699773843058935, "learning_rate": 6.915512498052033e-06, "loss": 0.084, "step": 4217 }, { "epoch": 3.9383753501400562, "grad_norm": 0.9240957078471448, "learning_rate": 6.914115651006359e-06, "loss": 0.0323, "step": 4218 }, { "epoch": 3.939309056956116, "grad_norm": 1.5141469328757298, "learning_rate": 6.912718628893788e-06, "loss": 0.083, "step": 4219 }, { "epoch": 3.9402427637721757, "grad_norm": 0.9308323244087587, "learning_rate": 6.911321431842091e-06, "loss": 0.0433, "step": 4220 }, { "epoch": 3.9411764705882355, "grad_norm": 6.279571611741908, "learning_rate": 6.909924059979061e-06, "loss": 0.2118, "step": 4221 }, { "epoch": 3.9421101774042953, "grad_norm": 1.7780565330819944, "learning_rate": 6.9085265134325e-06, "loss": 0.1128, "step": 4222 }, { "epoch": 3.943043884220355, "grad_norm": 2.817678929493537, "learning_rate": 6.90712879233023e-06, "loss": 0.1745, "step": 4223 }, { "epoch": 3.9439775910364148, "grad_norm": 0.9799914711778858, "learning_rate": 6.905730896800087e-06, "loss": 0.0641, "step": 4224 }, { "epoch": 3.9449112978524745, "grad_norm": 0.676088194746558, "learning_rate": 6.904332826969926e-06, "loss": 0.0361, "step": 4225 }, { "epoch": 3.9458450046685343, "grad_norm": 1.366787075171141, "learning_rate": 6.902934582967613e-06, "loss": 0.073, "step": 4226 }, { "epoch": 3.946778711484594, "grad_norm": 1.9708294290866928, "learning_rate": 6.901536164921037e-06, "loss": 0.1125, "step": 4227 }, { "epoch": 3.947712418300654, "grad_norm": 1.5618003323474978, "learning_rate": 6.900137572958095e-06, "loss": 0.0931, "step": 4228 }, { "epoch": 3.9486461251167135, "grad_norm": 2.3859278060092866, "learning_rate": 6.898738807206702e-06, "loss": 0.1061, "step": 4229 }, { "epoch": 3.9495798319327733, "grad_norm": 0.730221190870262, "learning_rate": 6.8973398677947955e-06, "loss": 0.0086, "step": 4230 }, { "epoch": 3.950513538748833, "grad_norm": 1.5110165344696949, "learning_rate": 6.895940754850321e-06, "loss": 0.0799, "step": 4231 }, { "epoch": 3.951447245564893, "grad_norm": 0.6135626976799005, "learning_rate": 6.8945414685012435e-06, "loss": 0.0264, "step": 4232 }, { "epoch": 3.9523809523809526, "grad_norm": 0.35567425596536034, "learning_rate": 6.8931420088755416e-06, "loss": 0.0174, "step": 4233 }, { "epoch": 3.9533146591970123, "grad_norm": 0.8661495381566972, "learning_rate": 6.891742376101214e-06, "loss": 0.0385, "step": 4234 }, { "epoch": 3.954248366013072, "grad_norm": 3.3011766135087512, "learning_rate": 6.8903425703062684e-06, "loss": 0.1726, "step": 4235 }, { "epoch": 3.955182072829132, "grad_norm": 2.971378923785035, "learning_rate": 6.888942591618737e-06, "loss": 0.134, "step": 4236 }, { "epoch": 3.9561157796451916, "grad_norm": 2.1746925105639923, "learning_rate": 6.887542440166662e-06, "loss": 0.1109, "step": 4237 }, { "epoch": 3.9570494864612513, "grad_norm": 1.6192913667050273, "learning_rate": 6.886142116078098e-06, "loss": 0.0885, "step": 4238 }, { "epoch": 3.957983193277311, "grad_norm": 1.3018029138088931, "learning_rate": 6.884741619481128e-06, "loss": 0.0944, "step": 4239 }, { "epoch": 3.958916900093371, "grad_norm": 6.363684557729228, "learning_rate": 6.883340950503836e-06, "loss": 0.1913, "step": 4240 }, { "epoch": 3.9598506069094306, "grad_norm": 5.577969371670913, "learning_rate": 6.8819401092743335e-06, "loss": 0.3159, "step": 4241 }, { "epoch": 3.9607843137254903, "grad_norm": 2.8090132309860376, "learning_rate": 6.880539095920737e-06, "loss": 0.2583, "step": 4242 }, { "epoch": 3.96171802054155, "grad_norm": 0.6602788512227319, "learning_rate": 6.879137910571191e-06, "loss": 0.0376, "step": 4243 }, { "epoch": 3.96265172735761, "grad_norm": 1.780890769651109, "learning_rate": 6.877736553353846e-06, "loss": 0.179, "step": 4244 }, { "epoch": 3.9635854341736696, "grad_norm": 1.5349832974065933, "learning_rate": 6.876335024396872e-06, "loss": 0.057, "step": 4245 }, { "epoch": 3.9645191409897294, "grad_norm": 0.47660552418401414, "learning_rate": 6.874933323828452e-06, "loss": 0.0097, "step": 4246 }, { "epoch": 3.965452847805789, "grad_norm": 0.5907570948655867, "learning_rate": 6.873531451776791e-06, "loss": 0.014, "step": 4247 }, { "epoch": 3.966386554621849, "grad_norm": 1.3204138595733907, "learning_rate": 6.872129408370103e-06, "loss": 0.0832, "step": 4248 }, { "epoch": 3.9673202614379086, "grad_norm": 1.5390438616608526, "learning_rate": 6.870727193736621e-06, "loss": 0.0822, "step": 4249 }, { "epoch": 3.9682539682539684, "grad_norm": 1.6104540524273399, "learning_rate": 6.8693248080045916e-06, "loss": 0.0348, "step": 4250 }, { "epoch": 3.969187675070028, "grad_norm": 1.3256316902302059, "learning_rate": 6.867922251302282e-06, "loss": 0.09, "step": 4251 }, { "epoch": 3.970121381886088, "grad_norm": 0.8088613664407299, "learning_rate": 6.866519523757967e-06, "loss": 0.0385, "step": 4252 }, { "epoch": 3.9710550887021476, "grad_norm": 0.9338107507259754, "learning_rate": 6.865116625499944e-06, "loss": 0.0163, "step": 4253 }, { "epoch": 3.9719887955182074, "grad_norm": 2.4888622339980344, "learning_rate": 6.863713556656521e-06, "loss": 0.1148, "step": 4254 }, { "epoch": 3.972922502334267, "grad_norm": 1.7665213912228541, "learning_rate": 6.862310317356028e-06, "loss": 0.109, "step": 4255 }, { "epoch": 3.973856209150327, "grad_norm": 3.395321402888969, "learning_rate": 6.860906907726803e-06, "loss": 0.0885, "step": 4256 }, { "epoch": 3.9747899159663866, "grad_norm": 0.870130159886189, "learning_rate": 6.859503327897207e-06, "loss": 0.0127, "step": 4257 }, { "epoch": 3.9757236227824464, "grad_norm": 1.202291684491188, "learning_rate": 6.8580995779956085e-06, "loss": 0.0439, "step": 4258 }, { "epoch": 3.976657329598506, "grad_norm": 0.6833910343460746, "learning_rate": 6.856695658150398e-06, "loss": 0.0207, "step": 4259 }, { "epoch": 3.977591036414566, "grad_norm": 0.8124870677353878, "learning_rate": 6.855291568489982e-06, "loss": 0.0281, "step": 4260 }, { "epoch": 3.9785247432306257, "grad_norm": 3.9108306129591726, "learning_rate": 6.8538873091427735e-06, "loss": 0.2154, "step": 4261 }, { "epoch": 3.9794584500466854, "grad_norm": 1.8331343925617636, "learning_rate": 6.852482880237213e-06, "loss": 0.1271, "step": 4262 }, { "epoch": 3.980392156862745, "grad_norm": 2.22252656709399, "learning_rate": 6.8510782819017485e-06, "loss": 0.1482, "step": 4263 }, { "epoch": 3.981325863678805, "grad_norm": 1.853671474056703, "learning_rate": 6.849673514264847e-06, "loss": 0.1179, "step": 4264 }, { "epoch": 3.9822595704948647, "grad_norm": 1.5482026176674843, "learning_rate": 6.84826857745499e-06, "loss": 0.0697, "step": 4265 }, { "epoch": 3.9831932773109244, "grad_norm": 1.3809288876218797, "learning_rate": 6.846863471600673e-06, "loss": 0.0539, "step": 4266 }, { "epoch": 3.984126984126984, "grad_norm": 1.805337378400516, "learning_rate": 6.845458196830408e-06, "loss": 0.0788, "step": 4267 }, { "epoch": 3.985060690943044, "grad_norm": 1.2530289183387318, "learning_rate": 6.844052753272726e-06, "loss": 0.0436, "step": 4268 }, { "epoch": 3.9859943977591037, "grad_norm": 2.0088958533945807, "learning_rate": 6.842647141056167e-06, "loss": 0.0567, "step": 4269 }, { "epoch": 3.9869281045751634, "grad_norm": 7.029691371788439, "learning_rate": 6.841241360309292e-06, "loss": 0.2506, "step": 4270 }, { "epoch": 3.987861811391223, "grad_norm": 1.586394505166291, "learning_rate": 6.839835411160674e-06, "loss": 0.1048, "step": 4271 }, { "epoch": 3.988795518207283, "grad_norm": 0.4546849229846918, "learning_rate": 6.8384292937389015e-06, "loss": 0.013, "step": 4272 }, { "epoch": 3.9897292250233427, "grad_norm": 1.8036690484246845, "learning_rate": 6.837023008172581e-06, "loss": 0.1012, "step": 4273 }, { "epoch": 3.9906629318394025, "grad_norm": 2.9463045748042145, "learning_rate": 6.835616554590332e-06, "loss": 0.1555, "step": 4274 }, { "epoch": 3.991596638655462, "grad_norm": 4.787775155510743, "learning_rate": 6.83420993312079e-06, "loss": 0.1553, "step": 4275 }, { "epoch": 3.992530345471522, "grad_norm": 3.6130227069729948, "learning_rate": 6.832803143892607e-06, "loss": 0.2229, "step": 4276 }, { "epoch": 3.9934640522875817, "grad_norm": 2.7563647279637746, "learning_rate": 6.831396187034448e-06, "loss": 0.1834, "step": 4277 }, { "epoch": 3.9943977591036415, "grad_norm": 1.1689640065973135, "learning_rate": 6.829989062674996e-06, "loss": 0.0632, "step": 4278 }, { "epoch": 3.9953314659197012, "grad_norm": 1.0146794749209078, "learning_rate": 6.828581770942946e-06, "loss": 0.0426, "step": 4279 }, { "epoch": 3.996265172735761, "grad_norm": 2.1184446104224097, "learning_rate": 6.827174311967012e-06, "loss": 0.1147, "step": 4280 }, { "epoch": 3.9971988795518207, "grad_norm": 1.114951386883382, "learning_rate": 6.825766685875922e-06, "loss": 0.0592, "step": 4281 }, { "epoch": 3.9981325863678805, "grad_norm": 0.7201422094517623, "learning_rate": 6.824358892798417e-06, "loss": 0.0265, "step": 4282 }, { "epoch": 3.9990662931839402, "grad_norm": 0.8093576667586911, "learning_rate": 6.822950932863257e-06, "loss": 0.0451, "step": 4283 }, { "epoch": 4.0, "grad_norm": 1.739196993325399, "learning_rate": 6.821542806199213e-06, "loss": 0.1151, "step": 4284 }, { "epoch": 4.00093370681606, "grad_norm": 2.3106976261600085, "learning_rate": 6.820134512935076e-06, "loss": 0.1288, "step": 4285 }, { "epoch": 4.0018674136321195, "grad_norm": 2.7322989388071948, "learning_rate": 6.818726053199649e-06, "loss": 0.1444, "step": 4286 }, { "epoch": 4.002801120448179, "grad_norm": 0.7985251894943441, "learning_rate": 6.817317427121751e-06, "loss": 0.0352, "step": 4287 }, { "epoch": 4.003734827264239, "grad_norm": 0.7614965050456443, "learning_rate": 6.815908634830216e-06, "loss": 0.0288, "step": 4288 }, { "epoch": 4.004668534080299, "grad_norm": 1.966914223872155, "learning_rate": 6.814499676453895e-06, "loss": 0.1646, "step": 4289 }, { "epoch": 4.0056022408963585, "grad_norm": 6.611707408342563, "learning_rate": 6.8130905521216505e-06, "loss": 0.2784, "step": 4290 }, { "epoch": 4.006535947712418, "grad_norm": 4.557432432674256, "learning_rate": 6.811681261962365e-06, "loss": 0.1746, "step": 4291 }, { "epoch": 4.007469654528478, "grad_norm": 2.3807481804064956, "learning_rate": 6.810271806104931e-06, "loss": 0.0918, "step": 4292 }, { "epoch": 4.008403361344538, "grad_norm": 1.6428426998490473, "learning_rate": 6.808862184678261e-06, "loss": 0.0635, "step": 4293 }, { "epoch": 4.0093370681605975, "grad_norm": 3.817159850800512, "learning_rate": 6.807452397811279e-06, "loss": 0.125, "step": 4294 }, { "epoch": 4.010270774976657, "grad_norm": 2.3464959641887386, "learning_rate": 6.806042445632925e-06, "loss": 0.103, "step": 4295 }, { "epoch": 4.011204481792717, "grad_norm": 1.440215386858898, "learning_rate": 6.804632328272156e-06, "loss": 0.0764, "step": 4296 }, { "epoch": 4.012138188608777, "grad_norm": 0.5835516044400063, "learning_rate": 6.803222045857942e-06, "loss": 0.0162, "step": 4297 }, { "epoch": 4.0130718954248366, "grad_norm": 0.7593698065426833, "learning_rate": 6.801811598519268e-06, "loss": 0.0143, "step": 4298 }, { "epoch": 4.014005602240896, "grad_norm": 1.182976058351351, "learning_rate": 6.800400986385136e-06, "loss": 0.0318, "step": 4299 }, { "epoch": 4.014939309056956, "grad_norm": 1.2891996724267527, "learning_rate": 6.798990209584562e-06, "loss": 0.0566, "step": 4300 }, { "epoch": 4.015873015873016, "grad_norm": 3.9247729201996076, "learning_rate": 6.797579268246577e-06, "loss": 0.2826, "step": 4301 }, { "epoch": 4.016806722689076, "grad_norm": 2.1135423707122367, "learning_rate": 6.796168162500228e-06, "loss": 0.1912, "step": 4302 }, { "epoch": 4.017740429505135, "grad_norm": 2.6534216769017074, "learning_rate": 6.794756892474572e-06, "loss": 0.1946, "step": 4303 }, { "epoch": 4.018674136321195, "grad_norm": 0.5460020621850932, "learning_rate": 6.79334545829869e-06, "loss": 0.0147, "step": 4304 }, { "epoch": 4.019607843137255, "grad_norm": 0.9991938750053018, "learning_rate": 6.791933860101671e-06, "loss": 0.0465, "step": 4305 }, { "epoch": 4.020541549953315, "grad_norm": 3.135722968002818, "learning_rate": 6.790522098012622e-06, "loss": 0.1953, "step": 4306 }, { "epoch": 4.021475256769374, "grad_norm": 0.7633802782546146, "learning_rate": 6.789110172160663e-06, "loss": 0.0142, "step": 4307 }, { "epoch": 4.022408963585434, "grad_norm": 2.5519794566669045, "learning_rate": 6.787698082674929e-06, "loss": 0.1095, "step": 4308 }, { "epoch": 4.023342670401494, "grad_norm": 0.4036624031047988, "learning_rate": 6.7862858296845744e-06, "loss": 0.0088, "step": 4309 }, { "epoch": 4.024276377217554, "grad_norm": 3.110022981024751, "learning_rate": 6.784873413318764e-06, "loss": 0.0741, "step": 4310 }, { "epoch": 4.025210084033613, "grad_norm": 1.5909806510022213, "learning_rate": 6.783460833706678e-06, "loss": 0.0838, "step": 4311 }, { "epoch": 4.026143790849673, "grad_norm": 1.407231081946454, "learning_rate": 6.7820480909775135e-06, "loss": 0.075, "step": 4312 }, { "epoch": 4.027077497665733, "grad_norm": 1.7534781058521982, "learning_rate": 6.780635185260479e-06, "loss": 0.0726, "step": 4313 }, { "epoch": 4.028011204481793, "grad_norm": 1.6635045486154518, "learning_rate": 6.779222116684803e-06, "loss": 0.074, "step": 4314 }, { "epoch": 4.028944911297852, "grad_norm": 1.2626401659223332, "learning_rate": 6.7778088853797245e-06, "loss": 0.0355, "step": 4315 }, { "epoch": 4.029878618113912, "grad_norm": 1.2402102025650301, "learning_rate": 6.776395491474499e-06, "loss": 0.0257, "step": 4316 }, { "epoch": 4.030812324929972, "grad_norm": 2.3768475723043445, "learning_rate": 6.774981935098398e-06, "loss": 0.0932, "step": 4317 }, { "epoch": 4.031746031746032, "grad_norm": 1.9793171722473744, "learning_rate": 6.773568216380706e-06, "loss": 0.1069, "step": 4318 }, { "epoch": 4.032679738562091, "grad_norm": 1.7169653419418904, "learning_rate": 6.772154335450723e-06, "loss": 0.101, "step": 4319 }, { "epoch": 4.033613445378151, "grad_norm": 0.42542451560140515, "learning_rate": 6.7707402924377656e-06, "loss": 0.0088, "step": 4320 }, { "epoch": 4.034547152194211, "grad_norm": 2.8513276995943673, "learning_rate": 6.769326087471161e-06, "loss": 0.0831, "step": 4321 }, { "epoch": 4.035480859010271, "grad_norm": 1.0264238493272495, "learning_rate": 6.767911720680253e-06, "loss": 0.0227, "step": 4322 }, { "epoch": 4.03641456582633, "grad_norm": 1.3217909843303386, "learning_rate": 6.766497192194407e-06, "loss": 0.0357, "step": 4323 }, { "epoch": 4.03734827264239, "grad_norm": 2.961304005912866, "learning_rate": 6.76508250214299e-06, "loss": 0.1814, "step": 4324 }, { "epoch": 4.03828197945845, "grad_norm": 0.7835722185528409, "learning_rate": 6.763667650655395e-06, "loss": 0.0181, "step": 4325 }, { "epoch": 4.03921568627451, "grad_norm": 4.724347837779808, "learning_rate": 6.762252637861024e-06, "loss": 0.336, "step": 4326 }, { "epoch": 4.040149393090569, "grad_norm": 1.797931892326214, "learning_rate": 6.760837463889297e-06, "loss": 0.046, "step": 4327 }, { "epoch": 4.041083099906629, "grad_norm": 2.3450433778746724, "learning_rate": 6.759422128869647e-06, "loss": 0.1577, "step": 4328 }, { "epoch": 4.042016806722689, "grad_norm": 0.9307106802517652, "learning_rate": 6.75800663293152e-06, "loss": 0.0405, "step": 4329 }, { "epoch": 4.042950513538749, "grad_norm": 2.8360174999757475, "learning_rate": 6.75659097620438e-06, "loss": 0.1214, "step": 4330 }, { "epoch": 4.043884220354808, "grad_norm": 2.7974441495928155, "learning_rate": 6.755175158817704e-06, "loss": 0.0516, "step": 4331 }, { "epoch": 4.044817927170868, "grad_norm": 0.5720467384789067, "learning_rate": 6.753759180900985e-06, "loss": 0.0157, "step": 4332 }, { "epoch": 4.045751633986928, "grad_norm": 0.3223500201194376, "learning_rate": 6.752343042583729e-06, "loss": 0.0123, "step": 4333 }, { "epoch": 4.046685340802988, "grad_norm": 1.6132880606266307, "learning_rate": 6.750926743995459e-06, "loss": 0.0954, "step": 4334 }, { "epoch": 4.0476190476190474, "grad_norm": 1.5671947791199803, "learning_rate": 6.749510285265707e-06, "loss": 0.0449, "step": 4335 }, { "epoch": 4.048552754435107, "grad_norm": 0.7430582035238327, "learning_rate": 6.7480936665240274e-06, "loss": 0.0265, "step": 4336 }, { "epoch": 4.049486461251167, "grad_norm": 3.4454448275721985, "learning_rate": 6.746676887899983e-06, "loss": 0.1333, "step": 4337 }, { "epoch": 4.050420168067227, "grad_norm": 0.3839358022151257, "learning_rate": 6.745259949523157e-06, "loss": 0.0063, "step": 4338 }, { "epoch": 4.0513538748832865, "grad_norm": 0.743275718666018, "learning_rate": 6.743842851523141e-06, "loss": 0.0316, "step": 4339 }, { "epoch": 4.052287581699346, "grad_norm": 1.4533091456240026, "learning_rate": 6.7424255940295465e-06, "loss": 0.064, "step": 4340 }, { "epoch": 4.053221288515406, "grad_norm": 2.0683560010133917, "learning_rate": 6.741008177171995e-06, "loss": 0.0368, "step": 4341 }, { "epoch": 4.054154995331466, "grad_norm": 2.7920430299296384, "learning_rate": 6.739590601080126e-06, "loss": 0.1593, "step": 4342 }, { "epoch": 4.0550887021475255, "grad_norm": 5.9076751448551965, "learning_rate": 6.738172865883593e-06, "loss": 0.1331, "step": 4343 }, { "epoch": 4.056022408963585, "grad_norm": 1.0418222729241051, "learning_rate": 6.7367549717120615e-06, "loss": 0.0515, "step": 4344 }, { "epoch": 4.056956115779645, "grad_norm": 2.677661118031733, "learning_rate": 6.735336918695217e-06, "loss": 0.2167, "step": 4345 }, { "epoch": 4.057889822595705, "grad_norm": 6.658231198710199, "learning_rate": 6.733918706962751e-06, "loss": 0.3128, "step": 4346 }, { "epoch": 4.0588235294117645, "grad_norm": 1.2365503960929378, "learning_rate": 6.7325003366443785e-06, "loss": 0.0313, "step": 4347 }, { "epoch": 4.059757236227824, "grad_norm": 0.3969521566204768, "learning_rate": 6.731081807869824e-06, "loss": 0.0058, "step": 4348 }, { "epoch": 4.060690943043884, "grad_norm": 2.834703410584473, "learning_rate": 6.7296631207688266e-06, "loss": 0.2415, "step": 4349 }, { "epoch": 4.061624649859944, "grad_norm": 0.45992906633754316, "learning_rate": 6.728244275471142e-06, "loss": 0.0071, "step": 4350 }, { "epoch": 4.0625583566760035, "grad_norm": 1.4020496677308225, "learning_rate": 6.726825272106539e-06, "loss": 0.0734, "step": 4351 }, { "epoch": 4.063492063492063, "grad_norm": 0.5992225664146353, "learning_rate": 6.725406110804801e-06, "loss": 0.0277, "step": 4352 }, { "epoch": 4.064425770308123, "grad_norm": 2.096273715228043, "learning_rate": 6.7239867916957244e-06, "loss": 0.1537, "step": 4353 }, { "epoch": 4.065359477124183, "grad_norm": 4.2350183070649665, "learning_rate": 6.722567314909125e-06, "loss": 0.2511, "step": 4354 }, { "epoch": 4.0662931839402425, "grad_norm": 0.5646960989468999, "learning_rate": 6.7211476805748254e-06, "loss": 0.0161, "step": 4355 }, { "epoch": 4.067226890756302, "grad_norm": 1.507565373581243, "learning_rate": 6.7197278888226695e-06, "loss": 0.1096, "step": 4356 }, { "epoch": 4.068160597572362, "grad_norm": 2.95215103824302, "learning_rate": 6.7183079397825126e-06, "loss": 0.0143, "step": 4357 }, { "epoch": 4.069094304388422, "grad_norm": 3.0031897989892435, "learning_rate": 6.716887833584223e-06, "loss": 0.1342, "step": 4358 }, { "epoch": 4.0700280112044815, "grad_norm": 4.3546880658356075, "learning_rate": 6.715467570357687e-06, "loss": 0.1064, "step": 4359 }, { "epoch": 4.070961718020541, "grad_norm": 2.398850172667676, "learning_rate": 6.714047150232803e-06, "loss": 0.0827, "step": 4360 }, { "epoch": 4.071895424836601, "grad_norm": 1.1470327111855159, "learning_rate": 6.712626573339481e-06, "loss": 0.0319, "step": 4361 }, { "epoch": 4.072829131652661, "grad_norm": 1.2413534598811033, "learning_rate": 6.711205839807653e-06, "loss": 0.0589, "step": 4362 }, { "epoch": 4.073762838468721, "grad_norm": 1.7583126582037247, "learning_rate": 6.709784949767258e-06, "loss": 0.1597, "step": 4363 }, { "epoch": 4.07469654528478, "grad_norm": 1.098235704983303, "learning_rate": 6.708363903348252e-06, "loss": 0.05, "step": 4364 }, { "epoch": 4.07563025210084, "grad_norm": 1.285464980754351, "learning_rate": 6.706942700680606e-06, "loss": 0.0447, "step": 4365 }, { "epoch": 4.0765639589169, "grad_norm": 0.6328202914318793, "learning_rate": 6.705521341894303e-06, "loss": 0.0115, "step": 4366 }, { "epoch": 4.07749766573296, "grad_norm": 2.3575075094670646, "learning_rate": 6.704099827119345e-06, "loss": 0.1575, "step": 4367 }, { "epoch": 4.078431372549019, "grad_norm": 2.300584783395815, "learning_rate": 6.702678156485742e-06, "loss": 0.0288, "step": 4368 }, { "epoch": 4.079365079365079, "grad_norm": 3.2627043753298604, "learning_rate": 6.701256330123523e-06, "loss": 0.2022, "step": 4369 }, { "epoch": 4.080298786181139, "grad_norm": 2.3966941791474987, "learning_rate": 6.6998343481627306e-06, "loss": 0.171, "step": 4370 }, { "epoch": 4.081232492997199, "grad_norm": 2.0636190295614028, "learning_rate": 6.698412210733418e-06, "loss": 0.1314, "step": 4371 }, { "epoch": 4.082166199813258, "grad_norm": 0.7599361620187075, "learning_rate": 6.696989917965658e-06, "loss": 0.0334, "step": 4372 }, { "epoch": 4.083099906629318, "grad_norm": 1.6162535151082424, "learning_rate": 6.695567469989532e-06, "loss": 0.0675, "step": 4373 }, { "epoch": 4.084033613445378, "grad_norm": 4.230659273528016, "learning_rate": 6.694144866935142e-06, "loss": 0.3022, "step": 4374 }, { "epoch": 4.084967320261438, "grad_norm": 3.4641135887296346, "learning_rate": 6.6927221089325964e-06, "loss": 0.2319, "step": 4375 }, { "epoch": 4.085901027077497, "grad_norm": 0.7650393128729415, "learning_rate": 6.691299196112027e-06, "loss": 0.0291, "step": 4376 }, { "epoch": 4.086834733893557, "grad_norm": 2.9003328195443108, "learning_rate": 6.689876128603572e-06, "loss": 0.2193, "step": 4377 }, { "epoch": 4.087768440709617, "grad_norm": 1.5906540757280838, "learning_rate": 6.688452906537385e-06, "loss": 0.0905, "step": 4378 }, { "epoch": 4.088702147525677, "grad_norm": 2.369925385348243, "learning_rate": 6.687029530043638e-06, "loss": 0.188, "step": 4379 }, { "epoch": 4.089635854341736, "grad_norm": 0.726303392484597, "learning_rate": 6.685605999252513e-06, "loss": 0.0405, "step": 4380 }, { "epoch": 4.090569561157796, "grad_norm": 3.741455818059964, "learning_rate": 6.684182314294209e-06, "loss": 0.1858, "step": 4381 }, { "epoch": 4.091503267973856, "grad_norm": 1.7127863021522884, "learning_rate": 6.682758475298935e-06, "loss": 0.0799, "step": 4382 }, { "epoch": 4.092436974789916, "grad_norm": 1.9175813855757013, "learning_rate": 6.68133448239692e-06, "loss": 0.0953, "step": 4383 }, { "epoch": 4.093370681605975, "grad_norm": 1.350655998568703, "learning_rate": 6.6799103357184e-06, "loss": 0.0639, "step": 4384 }, { "epoch": 4.094304388422035, "grad_norm": 1.0813591038646666, "learning_rate": 6.678486035393633e-06, "loss": 0.0271, "step": 4385 }, { "epoch": 4.095238095238095, "grad_norm": 1.3723548013749118, "learning_rate": 6.677061581552884e-06, "loss": 0.0277, "step": 4386 }, { "epoch": 4.096171802054155, "grad_norm": 0.7331369629121213, "learning_rate": 6.675636974326436e-06, "loss": 0.0055, "step": 4387 }, { "epoch": 4.097105508870214, "grad_norm": 1.9883052242181058, "learning_rate": 6.674212213844584e-06, "loss": 0.1239, "step": 4388 }, { "epoch": 4.098039215686274, "grad_norm": 2.448160720042863, "learning_rate": 6.67278730023764e-06, "loss": 0.1102, "step": 4389 }, { "epoch": 4.098972922502334, "grad_norm": 2.014028731999717, "learning_rate": 6.671362233635926e-06, "loss": 0.1139, "step": 4390 }, { "epoch": 4.099906629318394, "grad_norm": 3.513816914893406, "learning_rate": 6.66993701416978e-06, "loss": 0.1601, "step": 4391 }, { "epoch": 4.100840336134453, "grad_norm": 2.062274840628384, "learning_rate": 6.668511641969557e-06, "loss": 0.105, "step": 4392 }, { "epoch": 4.101774042950513, "grad_norm": 1.8010083251123645, "learning_rate": 6.66708611716562e-06, "loss": 0.0994, "step": 4393 }, { "epoch": 4.102707749766573, "grad_norm": 0.4542220067385621, "learning_rate": 6.665660439888348e-06, "loss": 0.0114, "step": 4394 }, { "epoch": 4.103641456582633, "grad_norm": 3.138296618449872, "learning_rate": 6.664234610268138e-06, "loss": 0.1449, "step": 4395 }, { "epoch": 4.104575163398692, "grad_norm": 2.082883675298177, "learning_rate": 6.6628086284353965e-06, "loss": 0.1735, "step": 4396 }, { "epoch": 4.105508870214752, "grad_norm": 0.8698806138348112, "learning_rate": 6.661382494520543e-06, "loss": 0.0288, "step": 4397 }, { "epoch": 4.106442577030812, "grad_norm": 1.7097335512832341, "learning_rate": 6.659956208654017e-06, "loss": 0.0951, "step": 4398 }, { "epoch": 4.107376283846872, "grad_norm": 0.7091909837673238, "learning_rate": 6.658529770966265e-06, "loss": 0.0189, "step": 4399 }, { "epoch": 4.1083099906629315, "grad_norm": 0.8229289347252472, "learning_rate": 6.657103181587752e-06, "loss": 0.0407, "step": 4400 }, { "epoch": 4.109243697478991, "grad_norm": 2.5950066685371294, "learning_rate": 6.655676440648954e-06, "loss": 0.1292, "step": 4401 }, { "epoch": 4.110177404295051, "grad_norm": 1.9440101044952691, "learning_rate": 6.654249548280364e-06, "loss": 0.129, "step": 4402 }, { "epoch": 4.111111111111111, "grad_norm": 1.0835192735457118, "learning_rate": 6.652822504612484e-06, "loss": 0.0384, "step": 4403 }, { "epoch": 4.1120448179271705, "grad_norm": 2.155911493382153, "learning_rate": 6.651395309775837e-06, "loss": 0.1711, "step": 4404 }, { "epoch": 4.11297852474323, "grad_norm": 2.739178483898015, "learning_rate": 6.649967963900951e-06, "loss": 0.2173, "step": 4405 }, { "epoch": 4.11391223155929, "grad_norm": 0.8192741386448045, "learning_rate": 6.648540467118375e-06, "loss": 0.0174, "step": 4406 }, { "epoch": 4.11484593837535, "grad_norm": 2.8303102602620425, "learning_rate": 6.64711281955867e-06, "loss": 0.1059, "step": 4407 }, { "epoch": 4.1157796451914095, "grad_norm": 2.6152226220701102, "learning_rate": 6.645685021352408e-06, "loss": 0.1577, "step": 4408 }, { "epoch": 4.116713352007469, "grad_norm": 1.6488542281179621, "learning_rate": 6.644257072630179e-06, "loss": 0.0951, "step": 4409 }, { "epoch": 4.117647058823529, "grad_norm": 1.7624720493293715, "learning_rate": 6.642828973522582e-06, "loss": 0.109, "step": 4410 }, { "epoch": 4.118580765639589, "grad_norm": 0.8684956086705702, "learning_rate": 6.641400724160235e-06, "loss": 0.0204, "step": 4411 }, { "epoch": 4.1195144724556485, "grad_norm": 1.7612285168365778, "learning_rate": 6.639972324673765e-06, "loss": 0.0985, "step": 4412 }, { "epoch": 4.120448179271708, "grad_norm": 2.0679703569695977, "learning_rate": 6.6385437751938165e-06, "loss": 0.1197, "step": 4413 }, { "epoch": 4.121381886087768, "grad_norm": 1.469811890163192, "learning_rate": 6.637115075851044e-06, "loss": 0.0812, "step": 4414 }, { "epoch": 4.122315592903828, "grad_norm": 0.7266838707582611, "learning_rate": 6.6356862267761215e-06, "loss": 0.0129, "step": 4415 }, { "epoch": 4.1232492997198875, "grad_norm": 1.0808332379658048, "learning_rate": 6.6342572280997276e-06, "loss": 0.0411, "step": 4416 }, { "epoch": 4.124183006535947, "grad_norm": 2.109805244650502, "learning_rate": 6.632828079952565e-06, "loss": 0.129, "step": 4417 }, { "epoch": 4.125116713352007, "grad_norm": 0.3615807060988939, "learning_rate": 6.63139878246534e-06, "loss": 0.0091, "step": 4418 }, { "epoch": 4.126050420168067, "grad_norm": 0.9878472632074182, "learning_rate": 6.629969335768782e-06, "loss": 0.043, "step": 4419 }, { "epoch": 4.1269841269841265, "grad_norm": 1.912703359013335, "learning_rate": 6.628539739993627e-06, "loss": 0.108, "step": 4420 }, { "epoch": 4.127917833800186, "grad_norm": 3.097442483767271, "learning_rate": 6.62710999527063e-06, "loss": 0.1194, "step": 4421 }, { "epoch": 4.128851540616246, "grad_norm": 0.8970514829571425, "learning_rate": 6.625680101730553e-06, "loss": 0.0398, "step": 4422 }, { "epoch": 4.129785247432307, "grad_norm": 0.6630981567567188, "learning_rate": 6.624250059504178e-06, "loss": 0.0114, "step": 4423 }, { "epoch": 4.130718954248366, "grad_norm": 1.3184866571926994, "learning_rate": 6.622819868722296e-06, "loss": 0.0403, "step": 4424 }, { "epoch": 4.131652661064426, "grad_norm": 0.9484766896456254, "learning_rate": 6.621389529515718e-06, "loss": 0.0292, "step": 4425 }, { "epoch": 4.132586367880486, "grad_norm": 1.7513312468035118, "learning_rate": 6.619959042015261e-06, "loss": 0.0579, "step": 4426 }, { "epoch": 4.133520074696546, "grad_norm": 1.2416283039835534, "learning_rate": 6.6185284063517585e-06, "loss": 0.0688, "step": 4427 }, { "epoch": 4.1344537815126055, "grad_norm": 2.336238191048203, "learning_rate": 6.6170976226560564e-06, "loss": 0.1207, "step": 4428 }, { "epoch": 4.135387488328665, "grad_norm": 1.873851192029991, "learning_rate": 6.615666691059021e-06, "loss": 0.1211, "step": 4429 }, { "epoch": 4.136321195144725, "grad_norm": 1.7870773423821271, "learning_rate": 6.614235611691522e-06, "loss": 0.1643, "step": 4430 }, { "epoch": 4.137254901960785, "grad_norm": 0.9447730141066607, "learning_rate": 6.612804384684447e-06, "loss": 0.0766, "step": 4431 }, { "epoch": 4.1381886087768445, "grad_norm": 0.5154288463428485, "learning_rate": 6.611373010168703e-06, "loss": 0.0144, "step": 4432 }, { "epoch": 4.139122315592904, "grad_norm": 1.8160694132710211, "learning_rate": 6.6099414882751965e-06, "loss": 0.0842, "step": 4433 }, { "epoch": 4.140056022408964, "grad_norm": 2.014624855948206, "learning_rate": 6.608509819134864e-06, "loss": 0.1439, "step": 4434 }, { "epoch": 4.140989729225024, "grad_norm": 2.7797658327681933, "learning_rate": 6.607078002878642e-06, "loss": 0.1176, "step": 4435 }, { "epoch": 4.1419234360410835, "grad_norm": 1.8591374655999398, "learning_rate": 6.605646039637489e-06, "loss": 0.1712, "step": 4436 }, { "epoch": 4.142857142857143, "grad_norm": 1.926400153941673, "learning_rate": 6.6042139295423715e-06, "loss": 0.0053, "step": 4437 }, { "epoch": 4.143790849673203, "grad_norm": 1.2003346552807668, "learning_rate": 6.6027816727242734e-06, "loss": 0.0607, "step": 4438 }, { "epoch": 4.144724556489263, "grad_norm": 1.8661064678176296, "learning_rate": 6.601349269314188e-06, "loss": 0.1011, "step": 4439 }, { "epoch": 4.1456582633053225, "grad_norm": 1.1142371412415661, "learning_rate": 6.599916719443128e-06, "loss": 0.0389, "step": 4440 }, { "epoch": 4.146591970121382, "grad_norm": 1.8096315576027657, "learning_rate": 6.598484023242111e-06, "loss": 0.0617, "step": 4441 }, { "epoch": 4.147525676937442, "grad_norm": 0.6718330100252786, "learning_rate": 6.597051180842178e-06, "loss": 0.0232, "step": 4442 }, { "epoch": 4.148459383753502, "grad_norm": 1.6147271958885476, "learning_rate": 6.595618192374376e-06, "loss": 0.0724, "step": 4443 }, { "epoch": 4.1493930905695615, "grad_norm": 0.5045259163500863, "learning_rate": 6.5941850579697644e-06, "loss": 0.0079, "step": 4444 }, { "epoch": 4.150326797385621, "grad_norm": 0.42380726897983956, "learning_rate": 6.592751777759424e-06, "loss": 0.0116, "step": 4445 }, { "epoch": 4.151260504201681, "grad_norm": 1.529342550463523, "learning_rate": 6.5913183518744405e-06, "loss": 0.0488, "step": 4446 }, { "epoch": 4.152194211017741, "grad_norm": 0.3419756312460029, "learning_rate": 6.5898847804459184e-06, "loss": 0.0109, "step": 4447 }, { "epoch": 4.1531279178338005, "grad_norm": 1.3363147710137526, "learning_rate": 6.5884510636049735e-06, "loss": 0.0206, "step": 4448 }, { "epoch": 4.15406162464986, "grad_norm": 1.0183430234927722, "learning_rate": 6.587017201482735e-06, "loss": 0.037, "step": 4449 }, { "epoch": 4.15499533146592, "grad_norm": 1.36658620118923, "learning_rate": 6.585583194210342e-06, "loss": 0.0529, "step": 4450 }, { "epoch": 4.15592903828198, "grad_norm": 2.2804686080802634, "learning_rate": 6.584149041918956e-06, "loss": 0.105, "step": 4451 }, { "epoch": 4.1568627450980395, "grad_norm": 0.6542448617775763, "learning_rate": 6.582714744739741e-06, "loss": 0.0208, "step": 4452 }, { "epoch": 4.157796451914099, "grad_norm": 1.2565337207819762, "learning_rate": 6.581280302803882e-06, "loss": 0.0493, "step": 4453 }, { "epoch": 4.158730158730159, "grad_norm": 0.5514514385306671, "learning_rate": 6.579845716242574e-06, "loss": 0.0239, "step": 4454 }, { "epoch": 4.159663865546219, "grad_norm": 0.49724925370466205, "learning_rate": 6.578410985187025e-06, "loss": 0.0155, "step": 4455 }, { "epoch": 4.160597572362279, "grad_norm": 4.982557317473497, "learning_rate": 6.576976109768456e-06, "loss": 0.0843, "step": 4456 }, { "epoch": 4.161531279178338, "grad_norm": 6.41395716551402, "learning_rate": 6.575541090118105e-06, "loss": 0.2321, "step": 4457 }, { "epoch": 4.162464985994398, "grad_norm": 3.2086327560500805, "learning_rate": 6.574105926367217e-06, "loss": 0.1914, "step": 4458 }, { "epoch": 4.163398692810458, "grad_norm": 1.9795198801058138, "learning_rate": 6.572670618647057e-06, "loss": 0.0695, "step": 4459 }, { "epoch": 4.164332399626518, "grad_norm": 1.4221899124893627, "learning_rate": 6.571235167088897e-06, "loss": 0.0201, "step": 4460 }, { "epoch": 4.165266106442577, "grad_norm": 3.7086726721386096, "learning_rate": 6.5697995718240246e-06, "loss": 0.1936, "step": 4461 }, { "epoch": 4.166199813258637, "grad_norm": 1.6513569288358805, "learning_rate": 6.568363832983744e-06, "loss": 0.0652, "step": 4462 }, { "epoch": 4.167133520074697, "grad_norm": 1.6544159223015633, "learning_rate": 6.566927950699365e-06, "loss": 0.0408, "step": 4463 }, { "epoch": 4.168067226890757, "grad_norm": 0.6397769872031325, "learning_rate": 6.5654919251022186e-06, "loss": 0.0228, "step": 4464 }, { "epoch": 4.169000933706816, "grad_norm": 2.586362163225137, "learning_rate": 6.5640557563236415e-06, "loss": 0.082, "step": 4465 }, { "epoch": 4.169934640522876, "grad_norm": 1.0818066816028378, "learning_rate": 6.562619444494989e-06, "loss": 0.045, "step": 4466 }, { "epoch": 4.170868347338936, "grad_norm": 2.4458260748922718, "learning_rate": 6.561182989747629e-06, "loss": 0.1191, "step": 4467 }, { "epoch": 4.171802054154996, "grad_norm": 1.555112448891998, "learning_rate": 6.559746392212939e-06, "loss": 0.0552, "step": 4468 }, { "epoch": 4.172735760971055, "grad_norm": 0.46974512320944467, "learning_rate": 6.558309652022314e-06, "loss": 0.0188, "step": 4469 }, { "epoch": 4.173669467787115, "grad_norm": 0.9618606333435568, "learning_rate": 6.556872769307154e-06, "loss": 0.0341, "step": 4470 }, { "epoch": 4.174603174603175, "grad_norm": 1.2137600484107873, "learning_rate": 6.555435744198883e-06, "loss": 0.0655, "step": 4471 }, { "epoch": 4.175536881419235, "grad_norm": 1.6443674845002034, "learning_rate": 6.553998576828934e-06, "loss": 0.088, "step": 4472 }, { "epoch": 4.176470588235294, "grad_norm": 1.0726391848271404, "learning_rate": 6.552561267328746e-06, "loss": 0.0338, "step": 4473 }, { "epoch": 4.177404295051354, "grad_norm": 2.3987581916611194, "learning_rate": 6.551123815829782e-06, "loss": 0.0698, "step": 4474 }, { "epoch": 4.178338001867414, "grad_norm": 2.2439679016203313, "learning_rate": 6.54968622246351e-06, "loss": 0.1055, "step": 4475 }, { "epoch": 4.179271708683474, "grad_norm": 1.9624344087848877, "learning_rate": 6.548248487361414e-06, "loss": 0.0936, "step": 4476 }, { "epoch": 4.180205415499533, "grad_norm": 3.0527925269357925, "learning_rate": 6.546810610654991e-06, "loss": 0.1439, "step": 4477 }, { "epoch": 4.181139122315593, "grad_norm": 3.155226623484321, "learning_rate": 6.545372592475749e-06, "loss": 0.19, "step": 4478 }, { "epoch": 4.182072829131653, "grad_norm": 2.362073510528553, "learning_rate": 6.543934432955215e-06, "loss": 0.1242, "step": 4479 }, { "epoch": 4.183006535947713, "grad_norm": 2.854175758156111, "learning_rate": 6.542496132224919e-06, "loss": 0.0676, "step": 4480 }, { "epoch": 4.183940242763772, "grad_norm": 1.148977611516831, "learning_rate": 6.5410576904164145e-06, "loss": 0.0403, "step": 4481 }, { "epoch": 4.184873949579832, "grad_norm": 1.7447644815042644, "learning_rate": 6.539619107661258e-06, "loss": 0.0817, "step": 4482 }, { "epoch": 4.185807656395892, "grad_norm": 3.7857024414925267, "learning_rate": 6.538180384091028e-06, "loss": 0.1603, "step": 4483 }, { "epoch": 4.186741363211952, "grad_norm": 1.7498317768920804, "learning_rate": 6.5367415198373064e-06, "loss": 0.059, "step": 4484 }, { "epoch": 4.187675070028011, "grad_norm": 6.864514764491585, "learning_rate": 6.535302515031698e-06, "loss": 0.2638, "step": 4485 }, { "epoch": 4.188608776844071, "grad_norm": 1.8501807749638863, "learning_rate": 6.533863369805815e-06, "loss": 0.1033, "step": 4486 }, { "epoch": 4.189542483660131, "grad_norm": 1.0830827349943215, "learning_rate": 6.532424084291281e-06, "loss": 0.0426, "step": 4487 }, { "epoch": 4.190476190476191, "grad_norm": 0.8648150677175245, "learning_rate": 6.530984658619735e-06, "loss": 0.022, "step": 4488 }, { "epoch": 4.19140989729225, "grad_norm": 2.37880213498863, "learning_rate": 6.5295450929228284e-06, "loss": 0.1038, "step": 4489 }, { "epoch": 4.19234360410831, "grad_norm": 1.088531685425764, "learning_rate": 6.528105387332226e-06, "loss": 0.0516, "step": 4490 }, { "epoch": 4.19327731092437, "grad_norm": 0.5722286038425266, "learning_rate": 6.5266655419796046e-06, "loss": 0.0158, "step": 4491 }, { "epoch": 4.19421101774043, "grad_norm": 1.2780464817365753, "learning_rate": 6.5252255569966534e-06, "loss": 0.0702, "step": 4492 }, { "epoch": 4.1951447245564895, "grad_norm": 2.9119578039711866, "learning_rate": 6.5237854325150754e-06, "loss": 0.2127, "step": 4493 }, { "epoch": 4.196078431372549, "grad_norm": 1.725253574738951, "learning_rate": 6.522345168666584e-06, "loss": 0.1092, "step": 4494 }, { "epoch": 4.197012138188609, "grad_norm": 3.049532382618324, "learning_rate": 6.520904765582909e-06, "loss": 0.1557, "step": 4495 }, { "epoch": 4.197945845004669, "grad_norm": 1.7496313063518412, "learning_rate": 6.519464223395791e-06, "loss": 0.0652, "step": 4496 }, { "epoch": 4.1988795518207285, "grad_norm": 0.6024774050409964, "learning_rate": 6.518023542236982e-06, "loss": 0.0292, "step": 4497 }, { "epoch": 4.199813258636788, "grad_norm": 1.425481879137402, "learning_rate": 6.51658272223825e-06, "loss": 0.016, "step": 4498 }, { "epoch": 4.200746965452848, "grad_norm": 2.701920746787292, "learning_rate": 6.515141763531373e-06, "loss": 0.0325, "step": 4499 }, { "epoch": 4.201680672268908, "grad_norm": 3.4973907872170718, "learning_rate": 6.513700666248141e-06, "loss": 0.0199, "step": 4500 }, { "epoch": 4.2026143790849675, "grad_norm": 0.5410090579012065, "learning_rate": 6.512259430520361e-06, "loss": 0.0156, "step": 4501 }, { "epoch": 4.203548085901027, "grad_norm": 4.683589085327264, "learning_rate": 6.510818056479848e-06, "loss": 0.2065, "step": 4502 }, { "epoch": 4.204481792717087, "grad_norm": 3.708412492615384, "learning_rate": 6.509376544258433e-06, "loss": 0.1775, "step": 4503 }, { "epoch": 4.205415499533147, "grad_norm": 1.5684079824250414, "learning_rate": 6.507934893987954e-06, "loss": 0.0902, "step": 4504 }, { "epoch": 4.2063492063492065, "grad_norm": 3.1979951308060013, "learning_rate": 6.506493105800271e-06, "loss": 0.1273, "step": 4505 }, { "epoch": 4.207282913165266, "grad_norm": 2.6786120057513063, "learning_rate": 6.505051179827249e-06, "loss": 0.1125, "step": 4506 }, { "epoch": 4.208216619981326, "grad_norm": 2.382027076829928, "learning_rate": 6.5036091162007675e-06, "loss": 0.1697, "step": 4507 }, { "epoch": 4.209150326797386, "grad_norm": 2.0006781336391093, "learning_rate": 6.50216691505272e-06, "loss": 0.0949, "step": 4508 }, { "epoch": 4.2100840336134455, "grad_norm": 0.9624350147640095, "learning_rate": 6.500724576515012e-06, "loss": 0.0379, "step": 4509 }, { "epoch": 4.211017740429505, "grad_norm": 2.9572882789361166, "learning_rate": 6.499282100719558e-06, "loss": 0.1546, "step": 4510 }, { "epoch": 4.211951447245565, "grad_norm": 2.831047890897426, "learning_rate": 6.4978394877982935e-06, "loss": 0.093, "step": 4511 }, { "epoch": 4.212885154061625, "grad_norm": 1.2617215899126601, "learning_rate": 6.496396737883157e-06, "loss": 0.0501, "step": 4512 }, { "epoch": 4.2138188608776845, "grad_norm": 1.4473779483796572, "learning_rate": 6.494953851106106e-06, "loss": 0.0569, "step": 4513 }, { "epoch": 4.214752567693744, "grad_norm": 1.4512995804181712, "learning_rate": 6.493510827599107e-06, "loss": 0.0682, "step": 4514 }, { "epoch": 4.215686274509804, "grad_norm": 0.9018138189181116, "learning_rate": 6.492067667494142e-06, "loss": 0.0236, "step": 4515 }, { "epoch": 4.216619981325864, "grad_norm": 1.0022869334810531, "learning_rate": 6.490624370923202e-06, "loss": 0.0173, "step": 4516 }, { "epoch": 4.2175536881419236, "grad_norm": 0.5820034737420142, "learning_rate": 6.489180938018293e-06, "loss": 0.014, "step": 4517 }, { "epoch": 4.218487394957983, "grad_norm": 2.4297499263900018, "learning_rate": 6.487737368911434e-06, "loss": 0.041, "step": 4518 }, { "epoch": 4.219421101774043, "grad_norm": 1.6396699200094915, "learning_rate": 6.486293663734654e-06, "loss": 0.0748, "step": 4519 }, { "epoch": 4.220354808590103, "grad_norm": 1.5474692162398938, "learning_rate": 6.484849822619995e-06, "loss": 0.0858, "step": 4520 }, { "epoch": 4.221288515406163, "grad_norm": 1.338260938241931, "learning_rate": 6.483405845699515e-06, "loss": 0.0556, "step": 4521 }, { "epoch": 4.222222222222222, "grad_norm": 2.9745772538194073, "learning_rate": 6.481961733105278e-06, "loss": 0.1574, "step": 4522 }, { "epoch": 4.223155929038282, "grad_norm": 2.5623669738531287, "learning_rate": 6.480517484969366e-06, "loss": 0.1305, "step": 4523 }, { "epoch": 4.224089635854342, "grad_norm": 4.138350148088067, "learning_rate": 6.479073101423871e-06, "loss": 0.1343, "step": 4524 }, { "epoch": 4.225023342670402, "grad_norm": 2.0190183056382844, "learning_rate": 6.477628582600896e-06, "loss": 0.1243, "step": 4525 }, { "epoch": 4.225957049486461, "grad_norm": 1.1973593317381193, "learning_rate": 6.476183928632563e-06, "loss": 0.0605, "step": 4526 }, { "epoch": 4.226890756302521, "grad_norm": 2.7340478575812015, "learning_rate": 6.474739139650996e-06, "loss": 0.1767, "step": 4527 }, { "epoch": 4.227824463118581, "grad_norm": 1.5185769480078262, "learning_rate": 6.47329421578834e-06, "loss": 0.0244, "step": 4528 }, { "epoch": 4.228758169934641, "grad_norm": 0.1873112533286103, "learning_rate": 6.471849157176747e-06, "loss": 0.0063, "step": 4529 }, { "epoch": 4.2296918767507, "grad_norm": 0.7598846350021122, "learning_rate": 6.470403963948386e-06, "loss": 0.0275, "step": 4530 }, { "epoch": 4.23062558356676, "grad_norm": 1.510127762521138, "learning_rate": 6.4689586362354325e-06, "loss": 0.0723, "step": 4531 }, { "epoch": 4.23155929038282, "grad_norm": 0.5074032995591221, "learning_rate": 6.4675131741700826e-06, "loss": 0.017, "step": 4532 }, { "epoch": 4.23249299719888, "grad_norm": 1.0383976156597763, "learning_rate": 6.466067577884534e-06, "loss": 0.0513, "step": 4533 }, { "epoch": 4.233426704014939, "grad_norm": 1.8224115337201272, "learning_rate": 6.4646218475110066e-06, "loss": 0.0206, "step": 4534 }, { "epoch": 4.234360410830999, "grad_norm": 1.5861315418452533, "learning_rate": 6.463175983181725e-06, "loss": 0.0759, "step": 4535 }, { "epoch": 4.235294117647059, "grad_norm": 4.2780297735306085, "learning_rate": 6.461729985028933e-06, "loss": 0.2362, "step": 4536 }, { "epoch": 4.236227824463119, "grad_norm": 0.8034867948060318, "learning_rate": 6.46028385318488e-06, "loss": 0.0143, "step": 4537 }, { "epoch": 4.237161531279178, "grad_norm": 2.305652463523042, "learning_rate": 6.458837587781832e-06, "loss": 0.0649, "step": 4538 }, { "epoch": 4.238095238095238, "grad_norm": 2.272009479181352, "learning_rate": 6.457391188952066e-06, "loss": 0.0706, "step": 4539 }, { "epoch": 4.239028944911298, "grad_norm": 0.45800193610226453, "learning_rate": 6.45594465682787e-06, "loss": 0.0206, "step": 4540 }, { "epoch": 4.239962651727358, "grad_norm": 0.45116497646527937, "learning_rate": 6.454497991541546e-06, "loss": 0.0109, "step": 4541 }, { "epoch": 4.240896358543417, "grad_norm": 1.0751077079905786, "learning_rate": 6.453051193225408e-06, "loss": 0.0655, "step": 4542 }, { "epoch": 4.241830065359477, "grad_norm": 3.020505012162196, "learning_rate": 6.4516042620117795e-06, "loss": 0.2037, "step": 4543 }, { "epoch": 4.242763772175537, "grad_norm": 3.0091744507703626, "learning_rate": 6.450157198033e-06, "loss": 0.1674, "step": 4544 }, { "epoch": 4.243697478991597, "grad_norm": 0.9790105915251683, "learning_rate": 6.448710001421421e-06, "loss": 0.037, "step": 4545 }, { "epoch": 4.244631185807656, "grad_norm": 1.5870632255043569, "learning_rate": 6.4472626723094e-06, "loss": 0.0793, "step": 4546 }, { "epoch": 4.245564892623716, "grad_norm": 1.128813617477998, "learning_rate": 6.445815210829316e-06, "loss": 0.0528, "step": 4547 }, { "epoch": 4.246498599439776, "grad_norm": 2.8226187685769473, "learning_rate": 6.444367617113552e-06, "loss": 0.1231, "step": 4548 }, { "epoch": 4.247432306255836, "grad_norm": 1.6292389555563438, "learning_rate": 6.442919891294507e-06, "loss": 0.083, "step": 4549 }, { "epoch": 4.248366013071895, "grad_norm": 2.1285248943679376, "learning_rate": 6.4414720335045925e-06, "loss": 0.0975, "step": 4550 }, { "epoch": 4.249299719887955, "grad_norm": 2.172966586943761, "learning_rate": 6.44002404387623e-06, "loss": 0.0892, "step": 4551 }, { "epoch": 4.250233426704015, "grad_norm": 1.5468912890390343, "learning_rate": 6.438575922541853e-06, "loss": 0.0701, "step": 4552 }, { "epoch": 4.251167133520075, "grad_norm": 1.6546322099788067, "learning_rate": 6.437127669633911e-06, "loss": 0.1012, "step": 4553 }, { "epoch": 4.2521008403361344, "grad_norm": 1.9737643107923948, "learning_rate": 6.435679285284859e-06, "loss": 0.104, "step": 4554 }, { "epoch": 4.253034547152194, "grad_norm": 1.211029760738853, "learning_rate": 6.434230769627172e-06, "loss": 0.0752, "step": 4555 }, { "epoch": 4.253968253968254, "grad_norm": 1.8870655070051892, "learning_rate": 6.43278212279333e-06, "loss": 0.0789, "step": 4556 }, { "epoch": 4.254901960784314, "grad_norm": 4.152426085370514, "learning_rate": 6.431333344915828e-06, "loss": 0.1664, "step": 4557 }, { "epoch": 4.2558356676003735, "grad_norm": 0.8028587149396655, "learning_rate": 6.429884436127172e-06, "loss": 0.0333, "step": 4558 }, { "epoch": 4.256769374416433, "grad_norm": 1.3110574647707256, "learning_rate": 6.428435396559881e-06, "loss": 0.0703, "step": 4559 }, { "epoch": 4.257703081232493, "grad_norm": 1.4064101600693544, "learning_rate": 6.4269862263464876e-06, "loss": 0.0658, "step": 4560 }, { "epoch": 4.258636788048553, "grad_norm": 1.3417164176990315, "learning_rate": 6.425536925619531e-06, "loss": 0.0525, "step": 4561 }, { "epoch": 4.2595704948646125, "grad_norm": 2.3795104423410027, "learning_rate": 6.424087494511567e-06, "loss": 0.1221, "step": 4562 }, { "epoch": 4.260504201680672, "grad_norm": 0.7566511513414205, "learning_rate": 6.4226379331551625e-06, "loss": 0.0219, "step": 4563 }, { "epoch": 4.261437908496732, "grad_norm": 3.311635989097313, "learning_rate": 6.421188241682896e-06, "loss": 0.1865, "step": 4564 }, { "epoch": 4.262371615312792, "grad_norm": 2.9119678612115836, "learning_rate": 6.419738420227357e-06, "loss": 0.1551, "step": 4565 }, { "epoch": 4.2633053221288515, "grad_norm": 5.792551501539635, "learning_rate": 6.4182884689211485e-06, "loss": 0.0527, "step": 4566 }, { "epoch": 4.264239028944911, "grad_norm": 2.559619988861493, "learning_rate": 6.416838387896883e-06, "loss": 0.0307, "step": 4567 }, { "epoch": 4.265172735760971, "grad_norm": 0.6166866266238229, "learning_rate": 6.4153881772871884e-06, "loss": 0.0158, "step": 4568 }, { "epoch": 4.266106442577031, "grad_norm": 2.476805281651247, "learning_rate": 6.4139378372247e-06, "loss": 0.2026, "step": 4569 }, { "epoch": 4.2670401493930905, "grad_norm": 5.6625587854398285, "learning_rate": 6.412487367842068e-06, "loss": 0.0728, "step": 4570 }, { "epoch": 4.26797385620915, "grad_norm": 1.9245955803588142, "learning_rate": 6.411036769271955e-06, "loss": 0.042, "step": 4571 }, { "epoch": 4.26890756302521, "grad_norm": 0.4115326431903952, "learning_rate": 6.409586041647034e-06, "loss": 0.006, "step": 4572 }, { "epoch": 4.26984126984127, "grad_norm": 1.6843016958860715, "learning_rate": 6.408135185099989e-06, "loss": 0.0583, "step": 4573 }, { "epoch": 4.2707749766573295, "grad_norm": 1.379641198585923, "learning_rate": 6.406684199763516e-06, "loss": 0.0602, "step": 4574 }, { "epoch": 4.271708683473389, "grad_norm": 0.585638828417042, "learning_rate": 6.405233085770327e-06, "loss": 0.015, "step": 4575 }, { "epoch": 4.272642390289449, "grad_norm": 2.216895422913894, "learning_rate": 6.403781843253138e-06, "loss": 0.0842, "step": 4576 }, { "epoch": 4.273576097105509, "grad_norm": 0.7379094282222501, "learning_rate": 6.402330472344684e-06, "loss": 0.0162, "step": 4577 }, { "epoch": 4.2745098039215685, "grad_norm": 2.351876040432222, "learning_rate": 6.4008789731777065e-06, "loss": 0.1049, "step": 4578 }, { "epoch": 4.275443510737628, "grad_norm": 0.8976952713830915, "learning_rate": 6.399427345884964e-06, "loss": 0.0272, "step": 4579 }, { "epoch": 4.276377217553688, "grad_norm": 2.5546113895281732, "learning_rate": 6.397975590599222e-06, "loss": 0.066, "step": 4580 }, { "epoch": 4.277310924369748, "grad_norm": 2.338500251587413, "learning_rate": 6.3965237074532586e-06, "loss": 0.0383, "step": 4581 }, { "epoch": 4.278244631185808, "grad_norm": 1.702937730673367, "learning_rate": 6.395071696579867e-06, "loss": 0.0807, "step": 4582 }, { "epoch": 4.279178338001867, "grad_norm": 6.5144472816785335, "learning_rate": 6.393619558111846e-06, "loss": 0.1434, "step": 4583 }, { "epoch": 4.280112044817927, "grad_norm": 1.755270238599362, "learning_rate": 6.392167292182012e-06, "loss": 0.0912, "step": 4584 }, { "epoch": 4.281045751633987, "grad_norm": 1.3102367261222, "learning_rate": 6.3907148989231915e-06, "loss": 0.0704, "step": 4585 }, { "epoch": 4.281979458450047, "grad_norm": 8.522761967091917, "learning_rate": 6.389262378468219e-06, "loss": 0.4554, "step": 4586 }, { "epoch": 4.282913165266106, "grad_norm": 3.7299526021945426, "learning_rate": 6.387809730949948e-06, "loss": 0.1544, "step": 4587 }, { "epoch": 4.283846872082166, "grad_norm": 0.41671026120993326, "learning_rate": 6.386356956501233e-06, "loss": 0.0134, "step": 4588 }, { "epoch": 4.284780578898226, "grad_norm": 1.4569313751550574, "learning_rate": 6.3849040552549515e-06, "loss": 0.0629, "step": 4589 }, { "epoch": 4.285714285714286, "grad_norm": 4.377998285106633, "learning_rate": 6.383451027343983e-06, "loss": 0.2016, "step": 4590 }, { "epoch": 4.286647992530345, "grad_norm": 1.3779256892964333, "learning_rate": 6.381997872901227e-06, "loss": 0.063, "step": 4591 }, { "epoch": 4.287581699346405, "grad_norm": 1.8698815334479824, "learning_rate": 6.380544592059588e-06, "loss": 0.1046, "step": 4592 }, { "epoch": 4.288515406162465, "grad_norm": 2.635205248044379, "learning_rate": 6.379091184951984e-06, "loss": 0.124, "step": 4593 }, { "epoch": 4.289449112978525, "grad_norm": 3.9029639666465545, "learning_rate": 6.377637651711347e-06, "loss": 0.109, "step": 4594 }, { "epoch": 4.290382819794584, "grad_norm": 1.301328952199619, "learning_rate": 6.376183992470616e-06, "loss": 0.0495, "step": 4595 }, { "epoch": 4.291316526610644, "grad_norm": 4.501200199115758, "learning_rate": 6.374730207362747e-06, "loss": 0.1241, "step": 4596 }, { "epoch": 4.292250233426704, "grad_norm": 4.341832263170681, "learning_rate": 6.3732762965207025e-06, "loss": 0.2263, "step": 4597 }, { "epoch": 4.293183940242764, "grad_norm": 2.496918883905534, "learning_rate": 6.371822260077459e-06, "loss": 0.0419, "step": 4598 }, { "epoch": 4.294117647058823, "grad_norm": 4.159126114742093, "learning_rate": 6.370368098166005e-06, "loss": 0.1902, "step": 4599 }, { "epoch": 4.295051353874883, "grad_norm": 3.726943094544409, "learning_rate": 6.368913810919339e-06, "loss": 0.1451, "step": 4600 }, { "epoch": 4.295985060690943, "grad_norm": 1.699326080574752, "learning_rate": 6.367459398470472e-06, "loss": 0.0051, "step": 4601 }, { "epoch": 4.296918767507003, "grad_norm": 1.3703194400960457, "learning_rate": 6.366004860952425e-06, "loss": 0.0579, "step": 4602 }, { "epoch": 4.297852474323062, "grad_norm": 7.737681725401581, "learning_rate": 6.3645501984982315e-06, "loss": 0.0779, "step": 4603 }, { "epoch": 4.298786181139122, "grad_norm": 3.734088907968462, "learning_rate": 6.363095411240937e-06, "loss": 0.1716, "step": 4604 }, { "epoch": 4.299719887955182, "grad_norm": 0.9568834956108216, "learning_rate": 6.361640499313597e-06, "loss": 0.0217, "step": 4605 }, { "epoch": 4.300653594771242, "grad_norm": 1.417012496213121, "learning_rate": 6.360185462849282e-06, "loss": 0.0295, "step": 4606 }, { "epoch": 4.301587301587301, "grad_norm": 5.418844338487423, "learning_rate": 6.358730301981066e-06, "loss": 0.1367, "step": 4607 }, { "epoch": 4.302521008403361, "grad_norm": 2.1974936417044213, "learning_rate": 6.357275016842045e-06, "loss": 0.127, "step": 4608 }, { "epoch": 4.303454715219421, "grad_norm": 4.322375260671576, "learning_rate": 6.355819607565316e-06, "loss": 0.0346, "step": 4609 }, { "epoch": 4.304388422035481, "grad_norm": 3.1101912109686958, "learning_rate": 6.354364074283995e-06, "loss": 0.0692, "step": 4610 }, { "epoch": 4.30532212885154, "grad_norm": 1.8594889114667104, "learning_rate": 6.352908417131206e-06, "loss": 0.1228, "step": 4611 }, { "epoch": 4.3062558356676, "grad_norm": 1.7155673382796004, "learning_rate": 6.3514526362400845e-06, "loss": 0.0739, "step": 4612 }, { "epoch": 4.30718954248366, "grad_norm": 6.760434504524846, "learning_rate": 6.349996731743777e-06, "loss": 0.0714, "step": 4613 }, { "epoch": 4.30812324929972, "grad_norm": 6.518687693634284, "learning_rate": 6.348540703775443e-06, "loss": 0.1888, "step": 4614 }, { "epoch": 4.309056956115779, "grad_norm": 2.0571764677425803, "learning_rate": 6.347084552468252e-06, "loss": 0.0937, "step": 4615 }, { "epoch": 4.309990662931839, "grad_norm": 2.4496206700922087, "learning_rate": 6.345628277955384e-06, "loss": 0.1753, "step": 4616 }, { "epoch": 4.310924369747899, "grad_norm": 2.1649454325436612, "learning_rate": 6.344171880370035e-06, "loss": 0.1422, "step": 4617 }, { "epoch": 4.311858076563959, "grad_norm": 0.9283589305471347, "learning_rate": 6.342715359845404e-06, "loss": 0.0336, "step": 4618 }, { "epoch": 4.3127917833800185, "grad_norm": 1.6596954729732922, "learning_rate": 6.341258716514707e-06, "loss": 0.0772, "step": 4619 }, { "epoch": 4.313725490196078, "grad_norm": 1.1840528266472765, "learning_rate": 6.33980195051117e-06, "loss": 0.0453, "step": 4620 }, { "epoch": 4.314659197012138, "grad_norm": 1.8178163847093616, "learning_rate": 6.338345061968033e-06, "loss": 0.0594, "step": 4621 }, { "epoch": 4.315592903828198, "grad_norm": 5.761137180150367, "learning_rate": 6.336888051018542e-06, "loss": 0.0697, "step": 4622 }, { "epoch": 4.3165266106442575, "grad_norm": 1.6452280446523326, "learning_rate": 6.335430917795956e-06, "loss": 0.0639, "step": 4623 }, { "epoch": 4.317460317460317, "grad_norm": 5.834201717350939, "learning_rate": 6.333973662433548e-06, "loss": 0.1158, "step": 4624 }, { "epoch": 4.318394024276377, "grad_norm": 1.3512447218922703, "learning_rate": 6.332516285064597e-06, "loss": 0.0302, "step": 4625 }, { "epoch": 4.319327731092437, "grad_norm": 0.4527656189251249, "learning_rate": 6.3310587858224005e-06, "loss": 0.0061, "step": 4626 }, { "epoch": 4.3202614379084965, "grad_norm": 1.7741015188070288, "learning_rate": 6.3296011648402574e-06, "loss": 0.0837, "step": 4627 }, { "epoch": 4.321195144724556, "grad_norm": 0.8324473666844274, "learning_rate": 6.328143422251489e-06, "loss": 0.0386, "step": 4628 }, { "epoch": 4.322128851540616, "grad_norm": 2.3538708764440064, "learning_rate": 6.326685558189416e-06, "loss": 0.0701, "step": 4629 }, { "epoch": 4.323062558356676, "grad_norm": 3.335970728106644, "learning_rate": 6.325227572787381e-06, "loss": 0.0779, "step": 4630 }, { "epoch": 4.3239962651727355, "grad_norm": 2.989811636969883, "learning_rate": 6.323769466178731e-06, "loss": 0.1938, "step": 4631 }, { "epoch": 4.324929971988795, "grad_norm": 1.2351699044898874, "learning_rate": 6.322311238496825e-06, "loss": 0.0425, "step": 4632 }, { "epoch": 4.325863678804855, "grad_norm": 2.2868258112046256, "learning_rate": 6.320852889875033e-06, "loss": 0.1387, "step": 4633 }, { "epoch": 4.326797385620915, "grad_norm": 2.2041505986008554, "learning_rate": 6.319394420446742e-06, "loss": 0.0466, "step": 4634 }, { "epoch": 4.3277310924369745, "grad_norm": 1.0584818810203018, "learning_rate": 6.3179358303453386e-06, "loss": 0.0631, "step": 4635 }, { "epoch": 4.328664799253034, "grad_norm": 1.6316283745977087, "learning_rate": 6.316477119704232e-06, "loss": 0.0284, "step": 4636 }, { "epoch": 4.329598506069094, "grad_norm": 4.178438368528419, "learning_rate": 6.315018288656833e-06, "loss": 0.1924, "step": 4637 }, { "epoch": 4.330532212885154, "grad_norm": 3.6953224606615973, "learning_rate": 6.313559337336572e-06, "loss": 0.1666, "step": 4638 }, { "epoch": 4.3314659197012135, "grad_norm": 2.0406990788056465, "learning_rate": 6.312100265876883e-06, "loss": 0.0986, "step": 4639 }, { "epoch": 4.332399626517273, "grad_norm": 0.6921069968200434, "learning_rate": 6.310641074411216e-06, "loss": 0.0263, "step": 4640 }, { "epoch": 4.333333333333333, "grad_norm": 2.5492104514691145, "learning_rate": 6.309181763073029e-06, "loss": 0.1333, "step": 4641 }, { "epoch": 4.334267040149393, "grad_norm": 4.307143921742539, "learning_rate": 6.30772233199579e-06, "loss": 0.1206, "step": 4642 }, { "epoch": 4.3352007469654525, "grad_norm": 4.207398687051919, "learning_rate": 6.306262781312985e-06, "loss": 0.1592, "step": 4643 }, { "epoch": 4.336134453781512, "grad_norm": 0.762272174633624, "learning_rate": 6.304803111158102e-06, "loss": 0.0411, "step": 4644 }, { "epoch": 4.337068160597572, "grad_norm": 1.9552594710325937, "learning_rate": 6.303343321664646e-06, "loss": 0.0842, "step": 4645 }, { "epoch": 4.338001867413632, "grad_norm": 1.3389733138645985, "learning_rate": 6.3018834129661285e-06, "loss": 0.0285, "step": 4646 }, { "epoch": 4.338935574229692, "grad_norm": 0.7863586641956041, "learning_rate": 6.300423385196077e-06, "loss": 0.0248, "step": 4647 }, { "epoch": 4.339869281045751, "grad_norm": 0.7230129810178904, "learning_rate": 6.298963238488025e-06, "loss": 0.0164, "step": 4648 }, { "epoch": 4.340802987861811, "grad_norm": 1.1174483990836332, "learning_rate": 6.29750297297552e-06, "loss": 0.0231, "step": 4649 }, { "epoch": 4.341736694677871, "grad_norm": 2.06384136264115, "learning_rate": 6.296042588792118e-06, "loss": 0.1191, "step": 4650 }, { "epoch": 4.342670401493931, "grad_norm": 2.8937284125390366, "learning_rate": 6.294582086071389e-06, "loss": 0.1129, "step": 4651 }, { "epoch": 4.34360410830999, "grad_norm": 2.8286525041676644, "learning_rate": 6.293121464946911e-06, "loss": 0.0834, "step": 4652 }, { "epoch": 4.34453781512605, "grad_norm": 2.505263786541836, "learning_rate": 6.291660725552273e-06, "loss": 0.0938, "step": 4653 }, { "epoch": 4.34547152194211, "grad_norm": 1.7573717366624186, "learning_rate": 6.290199868021077e-06, "loss": 0.112, "step": 4654 }, { "epoch": 4.34640522875817, "grad_norm": 0.29953183134877087, "learning_rate": 6.288738892486935e-06, "loss": 0.0055, "step": 4655 }, { "epoch": 4.347338935574229, "grad_norm": 2.511697184242286, "learning_rate": 6.287277799083466e-06, "loss": 0.0833, "step": 4656 }, { "epoch": 4.348272642390289, "grad_norm": 1.8675548897788277, "learning_rate": 6.285816587944309e-06, "loss": 0.0971, "step": 4657 }, { "epoch": 4.349206349206349, "grad_norm": 1.7777655418675669, "learning_rate": 6.284355259203101e-06, "loss": 0.0396, "step": 4658 }, { "epoch": 4.350140056022409, "grad_norm": 2.6094438362841443, "learning_rate": 6.2828938129935e-06, "loss": 0.0672, "step": 4659 }, { "epoch": 4.351073762838468, "grad_norm": 3.713055156853431, "learning_rate": 6.281432249449173e-06, "loss": 0.2146, "step": 4660 }, { "epoch": 4.352007469654528, "grad_norm": 1.6426622902620889, "learning_rate": 6.279970568703793e-06, "loss": 0.0628, "step": 4661 }, { "epoch": 4.352941176470588, "grad_norm": 0.8414974827526845, "learning_rate": 6.278508770891048e-06, "loss": 0.0406, "step": 4662 }, { "epoch": 4.353874883286648, "grad_norm": 1.7695997432266868, "learning_rate": 6.277046856144633e-06, "loss": 0.0754, "step": 4663 }, { "epoch": 4.354808590102707, "grad_norm": 2.8934901727324585, "learning_rate": 6.275584824598262e-06, "loss": 0.1116, "step": 4664 }, { "epoch": 4.355742296918767, "grad_norm": 1.353026975316865, "learning_rate": 6.274122676385648e-06, "loss": 0.049, "step": 4665 }, { "epoch": 4.356676003734827, "grad_norm": 0.4545205347281568, "learning_rate": 6.2726604116405235e-06, "loss": 0.0086, "step": 4666 }, { "epoch": 4.357609710550887, "grad_norm": 0.5143365836135851, "learning_rate": 6.271198030496628e-06, "loss": 0.0164, "step": 4667 }, { "epoch": 4.358543417366946, "grad_norm": 1.7899932288171945, "learning_rate": 6.269735533087712e-06, "loss": 0.1228, "step": 4668 }, { "epoch": 4.359477124183006, "grad_norm": 4.323269395680209, "learning_rate": 6.268272919547537e-06, "loss": 0.1719, "step": 4669 }, { "epoch": 4.360410830999066, "grad_norm": 2.0669877920160036, "learning_rate": 6.266810190009876e-06, "loss": 0.1441, "step": 4670 }, { "epoch": 4.361344537815126, "grad_norm": 0.4568714448831469, "learning_rate": 6.26534734460851e-06, "loss": 0.0147, "step": 4671 }, { "epoch": 4.362278244631185, "grad_norm": 0.9920600105834336, "learning_rate": 6.2638843834772345e-06, "loss": 0.0318, "step": 4672 }, { "epoch": 4.363211951447245, "grad_norm": 0.7703178919715036, "learning_rate": 6.2624213067498515e-06, "loss": 0.0112, "step": 4673 }, { "epoch": 4.364145658263305, "grad_norm": 1.7099967162093888, "learning_rate": 6.260958114560176e-06, "loss": 0.1004, "step": 4674 }, { "epoch": 4.365079365079365, "grad_norm": 0.9292511631832185, "learning_rate": 6.259494807042035e-06, "loss": 0.014, "step": 4675 }, { "epoch": 4.366013071895424, "grad_norm": 2.2291051464786453, "learning_rate": 6.25803138432926e-06, "loss": 0.0133, "step": 4676 }, { "epoch": 4.366946778711484, "grad_norm": 6.90593743960914, "learning_rate": 6.256567846555699e-06, "loss": 0.3391, "step": 4677 }, { "epoch": 4.367880485527545, "grad_norm": 1.9952774900566659, "learning_rate": 6.255104193855208e-06, "loss": 0.0785, "step": 4678 }, { "epoch": 4.368814192343605, "grad_norm": 2.639457102885081, "learning_rate": 6.2536404263616564e-06, "loss": 0.0833, "step": 4679 }, { "epoch": 4.369747899159664, "grad_norm": 0.5607260363161495, "learning_rate": 6.25217654420892e-06, "loss": 0.008, "step": 4680 }, { "epoch": 4.370681605975724, "grad_norm": 0.6198902435077879, "learning_rate": 6.250712547530887e-06, "loss": 0.0203, "step": 4681 }, { "epoch": 4.371615312791784, "grad_norm": 2.0777710428701606, "learning_rate": 6.249248436461456e-06, "loss": 0.158, "step": 4682 }, { "epoch": 4.372549019607844, "grad_norm": 1.9951340706858804, "learning_rate": 6.247784211134538e-06, "loss": 0.0881, "step": 4683 }, { "epoch": 4.373482726423903, "grad_norm": 1.1521434195539209, "learning_rate": 6.246319871684048e-06, "loss": 0.0607, "step": 4684 }, { "epoch": 4.374416433239963, "grad_norm": 0.47513673658274613, "learning_rate": 6.24485541824392e-06, "loss": 0.0183, "step": 4685 }, { "epoch": 4.375350140056023, "grad_norm": 1.430712991697142, "learning_rate": 6.243390850948092e-06, "loss": 0.0342, "step": 4686 }, { "epoch": 4.376283846872083, "grad_norm": 2.1995464242082226, "learning_rate": 6.241926169930516e-06, "loss": 0.0864, "step": 4687 }, { "epoch": 4.377217553688142, "grad_norm": 3.520251387631682, "learning_rate": 6.240461375325154e-06, "loss": 0.187, "step": 4688 }, { "epoch": 4.378151260504202, "grad_norm": 4.266018789529475, "learning_rate": 6.238996467265977e-06, "loss": 0.1708, "step": 4689 }, { "epoch": 4.379084967320262, "grad_norm": 3.5690671252902773, "learning_rate": 6.237531445886965e-06, "loss": 0.2209, "step": 4690 }, { "epoch": 4.380018674136322, "grad_norm": 0.9771502322873425, "learning_rate": 6.236066311322111e-06, "loss": 0.0381, "step": 4691 }, { "epoch": 4.380952380952381, "grad_norm": 1.7772443585895386, "learning_rate": 6.23460106370542e-06, "loss": 0.1066, "step": 4692 }, { "epoch": 4.381886087768441, "grad_norm": 2.7219492842203152, "learning_rate": 6.233135703170902e-06, "loss": 0.1576, "step": 4693 }, { "epoch": 4.382819794584501, "grad_norm": 0.883494286152523, "learning_rate": 6.231670229852581e-06, "loss": 0.0247, "step": 4694 }, { "epoch": 4.383753501400561, "grad_norm": 1.4847533830318187, "learning_rate": 6.230204643884493e-06, "loss": 0.073, "step": 4695 }, { "epoch": 4.38468720821662, "grad_norm": 2.488385339636944, "learning_rate": 6.228738945400678e-06, "loss": 0.1274, "step": 4696 }, { "epoch": 4.38562091503268, "grad_norm": 0.5479688321370714, "learning_rate": 6.227273134535196e-06, "loss": 0.0154, "step": 4697 }, { "epoch": 4.38655462184874, "grad_norm": 1.1210795416480661, "learning_rate": 6.225807211422104e-06, "loss": 0.036, "step": 4698 }, { "epoch": 4.3874883286648, "grad_norm": 0.7672802000580999, "learning_rate": 6.224341176195482e-06, "loss": 0.0253, "step": 4699 }, { "epoch": 4.388422035480859, "grad_norm": 2.530890113257069, "learning_rate": 6.222875028989415e-06, "loss": 0.1742, "step": 4700 }, { "epoch": 4.389355742296919, "grad_norm": 3.121793626699793, "learning_rate": 6.221408769937995e-06, "loss": 0.1926, "step": 4701 }, { "epoch": 4.390289449112979, "grad_norm": 4.056173064835759, "learning_rate": 6.219942399175331e-06, "loss": 0.1514, "step": 4702 }, { "epoch": 4.391223155929039, "grad_norm": 1.430163275105803, "learning_rate": 6.2184759168355355e-06, "loss": 0.0617, "step": 4703 }, { "epoch": 4.392156862745098, "grad_norm": 2.6631746775010208, "learning_rate": 6.217009323052739e-06, "loss": 0.1473, "step": 4704 }, { "epoch": 4.393090569561158, "grad_norm": 2.2162417533114103, "learning_rate": 6.2155426179610715e-06, "loss": 0.1563, "step": 4705 }, { "epoch": 4.394024276377218, "grad_norm": 0.7305676446030638, "learning_rate": 6.214075801694685e-06, "loss": 0.02, "step": 4706 }, { "epoch": 4.394957983193278, "grad_norm": 2.45607092153341, "learning_rate": 6.212608874387732e-06, "loss": 0.1086, "step": 4707 }, { "epoch": 4.395891690009337, "grad_norm": 0.3863784923578856, "learning_rate": 6.211141836174381e-06, "loss": 0.013, "step": 4708 }, { "epoch": 4.396825396825397, "grad_norm": 1.951904146562318, "learning_rate": 6.209674687188808e-06, "loss": 0.0796, "step": 4709 }, { "epoch": 4.397759103641457, "grad_norm": 3.220277631288878, "learning_rate": 6.2082074275652e-06, "loss": 0.1737, "step": 4710 }, { "epoch": 4.398692810457517, "grad_norm": 2.1128576311956606, "learning_rate": 6.206740057437754e-06, "loss": 0.0676, "step": 4711 }, { "epoch": 4.3996265172735765, "grad_norm": 3.647573896801003, "learning_rate": 6.205272576940678e-06, "loss": 0.0335, "step": 4712 }, { "epoch": 4.400560224089636, "grad_norm": 0.7699457184152793, "learning_rate": 6.203804986208189e-06, "loss": 0.0215, "step": 4713 }, { "epoch": 4.401493930905696, "grad_norm": 1.9422573431916703, "learning_rate": 6.202337285374511e-06, "loss": 0.1118, "step": 4714 }, { "epoch": 4.402427637721756, "grad_norm": 1.553863023524684, "learning_rate": 6.200869474573887e-06, "loss": 0.0671, "step": 4715 }, { "epoch": 4.4033613445378155, "grad_norm": 0.5731634108802985, "learning_rate": 6.19940155394056e-06, "loss": 0.0131, "step": 4716 }, { "epoch": 4.404295051353875, "grad_norm": 2.415388557620273, "learning_rate": 6.19793352360879e-06, "loss": 0.1223, "step": 4717 }, { "epoch": 4.405228758169935, "grad_norm": 1.6443945493313092, "learning_rate": 6.196465383712843e-06, "loss": 0.0552, "step": 4718 }, { "epoch": 4.406162464985995, "grad_norm": 1.823766895029141, "learning_rate": 6.1949971343869975e-06, "loss": 0.0461, "step": 4719 }, { "epoch": 4.4070961718020545, "grad_norm": 2.8932567201237838, "learning_rate": 6.193528775765541e-06, "loss": 0.2171, "step": 4720 }, { "epoch": 4.408029878618114, "grad_norm": 2.9449343450691465, "learning_rate": 6.192060307982769e-06, "loss": 0.1162, "step": 4721 }, { "epoch": 4.408963585434174, "grad_norm": 3.4970281789304893, "learning_rate": 6.1905917311729915e-06, "loss": 0.179, "step": 4722 }, { "epoch": 4.409897292250234, "grad_norm": 6.512757827510605, "learning_rate": 6.189123045470525e-06, "loss": 0.2117, "step": 4723 }, { "epoch": 4.4108309990662935, "grad_norm": 2.0237658840734314, "learning_rate": 6.187654251009698e-06, "loss": 0.0885, "step": 4724 }, { "epoch": 4.411764705882353, "grad_norm": 1.7399272913072121, "learning_rate": 6.186185347924846e-06, "loss": 0.0598, "step": 4725 }, { "epoch": 4.412698412698413, "grad_norm": 0.6069228021035329, "learning_rate": 6.184716336350317e-06, "loss": 0.0301, "step": 4726 }, { "epoch": 4.413632119514473, "grad_norm": 0.5062422644247014, "learning_rate": 6.1832472164204685e-06, "loss": 0.0229, "step": 4727 }, { "epoch": 4.4145658263305325, "grad_norm": 5.511581674583108, "learning_rate": 6.181777988269668e-06, "loss": 0.1231, "step": 4728 }, { "epoch": 4.415499533146592, "grad_norm": 3.2079508237016845, "learning_rate": 6.180308652032291e-06, "loss": 0.0794, "step": 4729 }, { "epoch": 4.416433239962652, "grad_norm": 2.8550683559192254, "learning_rate": 6.178839207842727e-06, "loss": 0.13, "step": 4730 }, { "epoch": 4.417366946778712, "grad_norm": 0.6361935099313614, "learning_rate": 6.17736965583537e-06, "loss": 0.0117, "step": 4731 }, { "epoch": 4.4183006535947715, "grad_norm": 4.424315534487942, "learning_rate": 6.17589999614463e-06, "loss": 0.2148, "step": 4732 }, { "epoch": 4.419234360410831, "grad_norm": 2.719749593816563, "learning_rate": 6.17443022890492e-06, "loss": 0.0963, "step": 4733 }, { "epoch": 4.420168067226891, "grad_norm": 1.801310536773372, "learning_rate": 6.1729603542506675e-06, "loss": 0.0699, "step": 4734 }, { "epoch": 4.421101774042951, "grad_norm": 1.1526835005368654, "learning_rate": 6.17149037231631e-06, "loss": 0.0458, "step": 4735 }, { "epoch": 4.4220354808590105, "grad_norm": 0.7294364654467702, "learning_rate": 6.170020283236292e-06, "loss": 0.0221, "step": 4736 }, { "epoch": 4.42296918767507, "grad_norm": 1.4641981888425655, "learning_rate": 6.168550087145071e-06, "loss": 0.0778, "step": 4737 }, { "epoch": 4.42390289449113, "grad_norm": 1.0335519513140428, "learning_rate": 6.1670797841771105e-06, "loss": 0.0448, "step": 4738 }, { "epoch": 4.42483660130719, "grad_norm": 0.7434417879270919, "learning_rate": 6.1656093744668865e-06, "loss": 0.0331, "step": 4739 }, { "epoch": 4.42577030812325, "grad_norm": 0.9783598999556974, "learning_rate": 6.164138858148886e-06, "loss": 0.0275, "step": 4740 }, { "epoch": 4.426704014939309, "grad_norm": 1.9659544869180825, "learning_rate": 6.1626682353576024e-06, "loss": 0.059, "step": 4741 }, { "epoch": 4.427637721755369, "grad_norm": 0.7875452666300785, "learning_rate": 6.161197506227539e-06, "loss": 0.0242, "step": 4742 }, { "epoch": 4.428571428571429, "grad_norm": 1.6982674198862429, "learning_rate": 6.159726670893212e-06, "loss": 0.0623, "step": 4743 }, { "epoch": 4.429505135387489, "grad_norm": 3.3626953034223224, "learning_rate": 6.158255729489144e-06, "loss": 0.0163, "step": 4744 }, { "epoch": 4.430438842203548, "grad_norm": 1.7114574101995315, "learning_rate": 6.15678468214987e-06, "loss": 0.0786, "step": 4745 }, { "epoch": 4.431372549019608, "grad_norm": 2.896835703343884, "learning_rate": 6.155313529009933e-06, "loss": 0.1829, "step": 4746 }, { "epoch": 4.432306255835668, "grad_norm": 1.8809799393299542, "learning_rate": 6.153842270203887e-06, "loss": 0.1182, "step": 4747 }, { "epoch": 4.433239962651728, "grad_norm": 1.7138337337288363, "learning_rate": 6.1523709058662915e-06, "loss": 0.1334, "step": 4748 }, { "epoch": 4.434173669467787, "grad_norm": 0.7372541360216363, "learning_rate": 6.1508994361317245e-06, "loss": 0.0259, "step": 4749 }, { "epoch": 4.435107376283847, "grad_norm": 2.2235316230251407, "learning_rate": 6.149427861134762e-06, "loss": 0.1777, "step": 4750 }, { "epoch": 4.436041083099907, "grad_norm": 0.3462685106547214, "learning_rate": 6.14795618101e-06, "loss": 0.0091, "step": 4751 }, { "epoch": 4.436974789915967, "grad_norm": 0.2515205421861913, "learning_rate": 6.146484395892037e-06, "loss": 0.008, "step": 4752 }, { "epoch": 4.437908496732026, "grad_norm": 3.2750150469750374, "learning_rate": 6.145012505915487e-06, "loss": 0.1407, "step": 4753 }, { "epoch": 4.438842203548086, "grad_norm": 3.2741194331467183, "learning_rate": 6.1435405112149675e-06, "loss": 0.1741, "step": 4754 }, { "epoch": 4.439775910364146, "grad_norm": 0.9570890331264177, "learning_rate": 6.142068411925111e-06, "loss": 0.0368, "step": 4755 }, { "epoch": 4.440709617180206, "grad_norm": 2.882090989400943, "learning_rate": 6.140596208180555e-06, "loss": 0.1031, "step": 4756 }, { "epoch": 4.441643323996265, "grad_norm": 0.9615710979872447, "learning_rate": 6.1391239001159504e-06, "loss": 0.0421, "step": 4757 }, { "epoch": 4.442577030812325, "grad_norm": 3.0187215855584086, "learning_rate": 6.137651487865955e-06, "loss": 0.1226, "step": 4758 }, { "epoch": 4.443510737628385, "grad_norm": 2.654376568212127, "learning_rate": 6.136178971565238e-06, "loss": 0.1506, "step": 4759 }, { "epoch": 4.444444444444445, "grad_norm": 0.3970031340228968, "learning_rate": 6.134706351348475e-06, "loss": 0.0109, "step": 4760 }, { "epoch": 4.445378151260504, "grad_norm": 0.6562580630411416, "learning_rate": 6.133233627350356e-06, "loss": 0.0068, "step": 4761 }, { "epoch": 4.446311858076564, "grad_norm": 1.2349486651148542, "learning_rate": 6.131760799705576e-06, "loss": 0.0423, "step": 4762 }, { "epoch": 4.447245564892624, "grad_norm": 1.4742322370316852, "learning_rate": 6.130287868548841e-06, "loss": 0.0784, "step": 4763 }, { "epoch": 4.448179271708684, "grad_norm": 1.4064238092070842, "learning_rate": 6.128814834014869e-06, "loss": 0.0353, "step": 4764 }, { "epoch": 4.449112978524743, "grad_norm": 1.2829422214565962, "learning_rate": 6.127341696238383e-06, "loss": 0.0141, "step": 4765 }, { "epoch": 4.450046685340803, "grad_norm": 0.3330026306227612, "learning_rate": 6.12586845535412e-06, "loss": 0.0056, "step": 4766 }, { "epoch": 4.450980392156863, "grad_norm": 0.6251673625520747, "learning_rate": 6.1243951114968195e-06, "loss": 0.0146, "step": 4767 }, { "epoch": 4.451914098972923, "grad_norm": 0.6449474085620055, "learning_rate": 6.12292166480124e-06, "loss": 0.0251, "step": 4768 }, { "epoch": 4.452847805788982, "grad_norm": 0.8473740876892674, "learning_rate": 6.121448115402141e-06, "loss": 0.0174, "step": 4769 }, { "epoch": 4.453781512605042, "grad_norm": 1.1390736753084503, "learning_rate": 6.119974463434298e-06, "loss": 0.0523, "step": 4770 }, { "epoch": 4.454715219421102, "grad_norm": 1.0022606968594319, "learning_rate": 6.118500709032489e-06, "loss": 0.0415, "step": 4771 }, { "epoch": 4.455648926237162, "grad_norm": 2.5314358427089587, "learning_rate": 6.117026852331508e-06, "loss": 0.1283, "step": 4772 }, { "epoch": 4.456582633053221, "grad_norm": 0.6291809862155368, "learning_rate": 6.115552893466152e-06, "loss": 0.011, "step": 4773 }, { "epoch": 4.457516339869281, "grad_norm": 1.4101901377147588, "learning_rate": 6.114078832571234e-06, "loss": 0.0681, "step": 4774 }, { "epoch": 4.458450046685341, "grad_norm": 2.7878109301499268, "learning_rate": 6.112604669781572e-06, "loss": 0.1474, "step": 4775 }, { "epoch": 4.459383753501401, "grad_norm": 1.7515867220242798, "learning_rate": 6.1111304052319954e-06, "loss": 0.0872, "step": 4776 }, { "epoch": 4.4603174603174605, "grad_norm": 2.828850606109866, "learning_rate": 6.10965603905734e-06, "loss": 0.1574, "step": 4777 }, { "epoch": 4.46125116713352, "grad_norm": 0.6384779669050568, "learning_rate": 6.108181571392453e-06, "loss": 0.0144, "step": 4778 }, { "epoch": 4.46218487394958, "grad_norm": 1.6508052804258961, "learning_rate": 6.106707002372192e-06, "loss": 0.029, "step": 4779 }, { "epoch": 4.46311858076564, "grad_norm": 1.944356675383863, "learning_rate": 6.10523233213142e-06, "loss": 0.0661, "step": 4780 }, { "epoch": 4.4640522875816995, "grad_norm": 0.40036599155413455, "learning_rate": 6.103757560805016e-06, "loss": 0.0123, "step": 4781 }, { "epoch": 4.464985994397759, "grad_norm": 3.067237639651272, "learning_rate": 6.10228268852786e-06, "loss": 0.1519, "step": 4782 }, { "epoch": 4.465919701213819, "grad_norm": 1.5967191074236609, "learning_rate": 6.1008077154348475e-06, "loss": 0.0385, "step": 4783 }, { "epoch": 4.466853408029879, "grad_norm": 1.8587293391558743, "learning_rate": 6.09933264166088e-06, "loss": 0.078, "step": 4784 }, { "epoch": 4.4677871148459385, "grad_norm": 2.0400856450296194, "learning_rate": 6.0978574673408694e-06, "loss": 0.095, "step": 4785 }, { "epoch": 4.468720821661998, "grad_norm": 0.8027376469123957, "learning_rate": 6.096382192609736e-06, "loss": 0.028, "step": 4786 }, { "epoch": 4.469654528478058, "grad_norm": 1.6536332340936104, "learning_rate": 6.094906817602413e-06, "loss": 0.0735, "step": 4787 }, { "epoch": 4.470588235294118, "grad_norm": 0.712680033615735, "learning_rate": 6.093431342453835e-06, "loss": 0.0272, "step": 4788 }, { "epoch": 4.4715219421101775, "grad_norm": 2.8220935104054514, "learning_rate": 6.091955767298954e-06, "loss": 0.139, "step": 4789 }, { "epoch": 4.472455648926237, "grad_norm": 2.0951929701775156, "learning_rate": 6.090480092272726e-06, "loss": 0.0477, "step": 4790 }, { "epoch": 4.473389355742297, "grad_norm": 1.1892228003742962, "learning_rate": 6.089004317510116e-06, "loss": 0.0592, "step": 4791 }, { "epoch": 4.474323062558357, "grad_norm": 1.396108517840923, "learning_rate": 6.087528443146104e-06, "loss": 0.0538, "step": 4792 }, { "epoch": 4.4752567693744165, "grad_norm": 1.6617802788114022, "learning_rate": 6.086052469315671e-06, "loss": 0.0491, "step": 4793 }, { "epoch": 4.476190476190476, "grad_norm": 0.6610775645324413, "learning_rate": 6.084576396153814e-06, "loss": 0.0188, "step": 4794 }, { "epoch": 4.477124183006536, "grad_norm": 0.49180188374881845, "learning_rate": 6.083100223795534e-06, "loss": 0.0093, "step": 4795 }, { "epoch": 4.478057889822596, "grad_norm": 2.436955442363202, "learning_rate": 6.081623952375844e-06, "loss": 0.1325, "step": 4796 }, { "epoch": 4.4789915966386555, "grad_norm": 1.1146054918973545, "learning_rate": 6.080147582029764e-06, "loss": 0.0244, "step": 4797 }, { "epoch": 4.479925303454715, "grad_norm": 1.9783398688605482, "learning_rate": 6.078671112892327e-06, "loss": 0.0606, "step": 4798 }, { "epoch": 4.480859010270775, "grad_norm": 2.5067149543401137, "learning_rate": 6.07719454509857e-06, "loss": 0.1422, "step": 4799 }, { "epoch": 4.481792717086835, "grad_norm": 0.8540449989607092, "learning_rate": 6.075717878783543e-06, "loss": 0.0386, "step": 4800 }, { "epoch": 4.4827264239028946, "grad_norm": 1.3903608712730768, "learning_rate": 6.074241114082301e-06, "loss": 0.0385, "step": 4801 }, { "epoch": 4.483660130718954, "grad_norm": 0.261523498019418, "learning_rate": 6.0727642511299125e-06, "loss": 0.007, "step": 4802 }, { "epoch": 4.484593837535014, "grad_norm": 0.7278984201578725, "learning_rate": 6.0712872900614515e-06, "loss": 0.0225, "step": 4803 }, { "epoch": 4.485527544351074, "grad_norm": 1.3111474645586256, "learning_rate": 6.069810231012004e-06, "loss": 0.0404, "step": 4804 }, { "epoch": 4.486461251167134, "grad_norm": 1.391961396694494, "learning_rate": 6.068333074116661e-06, "loss": 0.0573, "step": 4805 }, { "epoch": 4.487394957983193, "grad_norm": 2.4808966517466597, "learning_rate": 6.066855819510527e-06, "loss": 0.0846, "step": 4806 }, { "epoch": 4.488328664799253, "grad_norm": 1.8270126080094578, "learning_rate": 6.065378467328712e-06, "loss": 0.1106, "step": 4807 }, { "epoch": 4.489262371615313, "grad_norm": 2.0685218653146, "learning_rate": 6.063901017706335e-06, "loss": 0.0798, "step": 4808 }, { "epoch": 4.490196078431373, "grad_norm": 1.4741753635249375, "learning_rate": 6.062423470778527e-06, "loss": 0.0646, "step": 4809 }, { "epoch": 4.491129785247432, "grad_norm": 3.1365636719837178, "learning_rate": 6.060945826680426e-06, "loss": 0.0686, "step": 4810 }, { "epoch": 4.492063492063492, "grad_norm": 1.3072204973098387, "learning_rate": 6.059468085547175e-06, "loss": 0.0618, "step": 4811 }, { "epoch": 4.492997198879552, "grad_norm": 1.4039595095263078, "learning_rate": 6.057990247513935e-06, "loss": 0.0236, "step": 4812 }, { "epoch": 4.493930905695612, "grad_norm": 1.3993658104439806, "learning_rate": 6.056512312715867e-06, "loss": 0.0368, "step": 4813 }, { "epoch": 4.494864612511671, "grad_norm": 0.4558486921228447, "learning_rate": 6.055034281288144e-06, "loss": 0.0155, "step": 4814 }, { "epoch": 4.495798319327731, "grad_norm": 0.5449811713613995, "learning_rate": 6.0535561533659526e-06, "loss": 0.0236, "step": 4815 }, { "epoch": 4.496732026143791, "grad_norm": 1.2907620122215793, "learning_rate": 6.052077929084479e-06, "loss": 0.0488, "step": 4816 }, { "epoch": 4.497665732959851, "grad_norm": 5.080948376753995, "learning_rate": 6.050599608578925e-06, "loss": 0.1782, "step": 4817 }, { "epoch": 4.49859943977591, "grad_norm": 2.8345561509054367, "learning_rate": 6.0491211919844985e-06, "loss": 0.088, "step": 4818 }, { "epoch": 4.49953314659197, "grad_norm": 1.0721317025435866, "learning_rate": 6.047642679436418e-06, "loss": 0.0591, "step": 4819 }, { "epoch": 4.50046685340803, "grad_norm": 0.7196874734548707, "learning_rate": 6.0461640710699095e-06, "loss": 0.0148, "step": 4820 }, { "epoch": 4.50140056022409, "grad_norm": 1.8900487579587022, "learning_rate": 6.044685367020208e-06, "loss": 0.1373, "step": 4821 }, { "epoch": 4.502334267040149, "grad_norm": 1.3097064838467962, "learning_rate": 6.043206567422556e-06, "loss": 0.0501, "step": 4822 }, { "epoch": 4.503267973856209, "grad_norm": 2.136975166210292, "learning_rate": 6.0417276724122074e-06, "loss": 0.1234, "step": 4823 }, { "epoch": 4.504201680672269, "grad_norm": 1.2290357325503702, "learning_rate": 6.0402486821244245e-06, "loss": 0.0773, "step": 4824 }, { "epoch": 4.505135387488329, "grad_norm": 2.5668426862217038, "learning_rate": 6.038769596694473e-06, "loss": 0.1714, "step": 4825 }, { "epoch": 4.506069094304388, "grad_norm": 3.3632789153985856, "learning_rate": 6.037290416257635e-06, "loss": 0.1121, "step": 4826 }, { "epoch": 4.507002801120448, "grad_norm": 2.92571382428684, "learning_rate": 6.035811140949197e-06, "loss": 0.1259, "step": 4827 }, { "epoch": 4.507936507936508, "grad_norm": 0.47601257339414776, "learning_rate": 6.034331770904455e-06, "loss": 0.0128, "step": 4828 }, { "epoch": 4.508870214752568, "grad_norm": 2.7399365222571936, "learning_rate": 6.032852306258713e-06, "loss": 0.1735, "step": 4829 }, { "epoch": 4.509803921568627, "grad_norm": 1.4786557229154782, "learning_rate": 6.0313727471472845e-06, "loss": 0.0977, "step": 4830 }, { "epoch": 4.510737628384687, "grad_norm": 1.73559692695888, "learning_rate": 6.029893093705492e-06, "loss": 0.1096, "step": 4831 }, { "epoch": 4.511671335200747, "grad_norm": 0.4290187297499124, "learning_rate": 6.028413346068667e-06, "loss": 0.0146, "step": 4832 }, { "epoch": 4.512605042016807, "grad_norm": 1.779948031941323, "learning_rate": 6.026933504372146e-06, "loss": 0.091, "step": 4833 }, { "epoch": 4.513538748832866, "grad_norm": 6.538779334750786, "learning_rate": 6.025453568751279e-06, "loss": 0.2006, "step": 4834 }, { "epoch": 4.514472455648926, "grad_norm": 0.9600965383779134, "learning_rate": 6.0239735393414204e-06, "loss": 0.0273, "step": 4835 }, { "epoch": 4.515406162464986, "grad_norm": 0.5015255993195944, "learning_rate": 6.022493416277938e-06, "loss": 0.0145, "step": 4836 }, { "epoch": 4.516339869281046, "grad_norm": 0.4674055033912238, "learning_rate": 6.0210131996962015e-06, "loss": 0.0139, "step": 4837 }, { "epoch": 4.5172735760971054, "grad_norm": 3.737054356237901, "learning_rate": 6.0195328897315975e-06, "loss": 0.1847, "step": 4838 }, { "epoch": 4.518207282913165, "grad_norm": 2.2931231403412977, "learning_rate": 6.018052486519513e-06, "loss": 0.0961, "step": 4839 }, { "epoch": 4.519140989729225, "grad_norm": 1.685261177987835, "learning_rate": 6.016571990195348e-06, "loss": 0.1032, "step": 4840 }, { "epoch": 4.520074696545285, "grad_norm": 4.059285693052053, "learning_rate": 6.015091400894511e-06, "loss": 0.17, "step": 4841 }, { "epoch": 4.5210084033613445, "grad_norm": 2.59466925790814, "learning_rate": 6.013610718752417e-06, "loss": 0.1398, "step": 4842 }, { "epoch": 4.521942110177404, "grad_norm": 1.2221469381036438, "learning_rate": 6.012129943904493e-06, "loss": 0.0914, "step": 4843 }, { "epoch": 4.522875816993464, "grad_norm": 2.627119362328914, "learning_rate": 6.010649076486168e-06, "loss": 0.2133, "step": 4844 }, { "epoch": 4.523809523809524, "grad_norm": 1.8231131450901263, "learning_rate": 6.009168116632888e-06, "loss": 0.0905, "step": 4845 }, { "epoch": 4.5247432306255835, "grad_norm": 0.5268324481833468, "learning_rate": 6.007687064480099e-06, "loss": 0.0183, "step": 4846 }, { "epoch": 4.525676937441643, "grad_norm": 1.009643516767824, "learning_rate": 6.006205920163262e-06, "loss": 0.0369, "step": 4847 }, { "epoch": 4.526610644257703, "grad_norm": 3.3027598547603336, "learning_rate": 6.004724683817844e-06, "loss": 0.0744, "step": 4848 }, { "epoch": 4.527544351073763, "grad_norm": 2.4889754857787194, "learning_rate": 6.003243355579319e-06, "loss": 0.1253, "step": 4849 }, { "epoch": 4.5284780578898225, "grad_norm": 2.2031403745543736, "learning_rate": 6.001761935583173e-06, "loss": 0.0888, "step": 4850 }, { "epoch": 4.529411764705882, "grad_norm": 1.2938636072661327, "learning_rate": 6.000280423964895e-06, "loss": 0.035, "step": 4851 }, { "epoch": 4.530345471521942, "grad_norm": 1.141392653384006, "learning_rate": 5.998798820859988e-06, "loss": 0.0493, "step": 4852 }, { "epoch": 4.531279178338002, "grad_norm": 3.7193421225837113, "learning_rate": 5.99731712640396e-06, "loss": 0.1893, "step": 4853 }, { "epoch": 4.5322128851540615, "grad_norm": 1.7597855684044295, "learning_rate": 5.995835340732328e-06, "loss": 0.0894, "step": 4854 }, { "epoch": 4.533146591970121, "grad_norm": 1.7720568616970336, "learning_rate": 5.994353463980618e-06, "loss": 0.0513, "step": 4855 }, { "epoch": 4.534080298786181, "grad_norm": 2.9034635398133912, "learning_rate": 5.992871496284365e-06, "loss": 0.1673, "step": 4856 }, { "epoch": 4.535014005602241, "grad_norm": 1.8409641189587107, "learning_rate": 5.991389437779107e-06, "loss": 0.0923, "step": 4857 }, { "epoch": 4.5359477124183005, "grad_norm": 0.5106505148388811, "learning_rate": 5.989907288600399e-06, "loss": 0.0118, "step": 4858 }, { "epoch": 4.53688141923436, "grad_norm": 1.7274678271396704, "learning_rate": 5.988425048883799e-06, "loss": 0.0951, "step": 4859 }, { "epoch": 4.53781512605042, "grad_norm": 1.5689606394804743, "learning_rate": 5.986942718764873e-06, "loss": 0.1125, "step": 4860 }, { "epoch": 4.53874883286648, "grad_norm": 5.015884793972062, "learning_rate": 5.985460298379196e-06, "loss": 0.0738, "step": 4861 }, { "epoch": 4.5396825396825395, "grad_norm": 0.593350242266301, "learning_rate": 5.983977787862353e-06, "loss": 0.0116, "step": 4862 }, { "epoch": 4.540616246498599, "grad_norm": 1.1263973368235964, "learning_rate": 5.982495187349933e-06, "loss": 0.0474, "step": 4863 }, { "epoch": 4.541549953314659, "grad_norm": 0.373738501859831, "learning_rate": 5.981012496977542e-06, "loss": 0.0046, "step": 4864 }, { "epoch": 4.542483660130719, "grad_norm": 1.2193142436439859, "learning_rate": 5.979529716880782e-06, "loss": 0.0371, "step": 4865 }, { "epoch": 4.543417366946779, "grad_norm": 0.9420472077545599, "learning_rate": 5.978046847195274e-06, "loss": 0.0389, "step": 4866 }, { "epoch": 4.544351073762838, "grad_norm": 1.625921347143218, "learning_rate": 5.976563888056639e-06, "loss": 0.0626, "step": 4867 }, { "epoch": 4.545284780578898, "grad_norm": 0.30836163853615284, "learning_rate": 5.9750808396005136e-06, "loss": 0.0081, "step": 4868 }, { "epoch": 4.546218487394958, "grad_norm": 1.649829906466308, "learning_rate": 5.973597701962536e-06, "loss": 0.0811, "step": 4869 }, { "epoch": 4.547152194211018, "grad_norm": 0.7621710993791943, "learning_rate": 5.972114475278356e-06, "loss": 0.013, "step": 4870 }, { "epoch": 4.548085901027077, "grad_norm": 2.350708634027029, "learning_rate": 5.970631159683632e-06, "loss": 0.0726, "step": 4871 }, { "epoch": 4.549019607843137, "grad_norm": 4.258004668000941, "learning_rate": 5.96914775531403e-06, "loss": 0.3051, "step": 4872 }, { "epoch": 4.549953314659197, "grad_norm": 1.8547573891542894, "learning_rate": 5.967664262305221e-06, "loss": 0.0857, "step": 4873 }, { "epoch": 4.550887021475257, "grad_norm": 3.5278699393925965, "learning_rate": 5.966180680792889e-06, "loss": 0.2252, "step": 4874 }, { "epoch": 4.551820728291316, "grad_norm": 1.0091603888236902, "learning_rate": 5.9646970109127235e-06, "loss": 0.0466, "step": 4875 }, { "epoch": 4.552754435107376, "grad_norm": 1.396616483968444, "learning_rate": 5.963213252800422e-06, "loss": 0.0916, "step": 4876 }, { "epoch": 4.553688141923436, "grad_norm": 5.609679911391652, "learning_rate": 5.96172940659169e-06, "loss": 0.3368, "step": 4877 }, { "epoch": 4.554621848739496, "grad_norm": 0.9389223618319713, "learning_rate": 5.960245472422242e-06, "loss": 0.0289, "step": 4878 }, { "epoch": 4.555555555555555, "grad_norm": 0.5820311241576921, "learning_rate": 5.9587614504278e-06, "loss": 0.0233, "step": 4879 }, { "epoch": 4.556489262371615, "grad_norm": 1.7542519104834309, "learning_rate": 5.957277340744094e-06, "loss": 0.0929, "step": 4880 }, { "epoch": 4.557422969187675, "grad_norm": 0.35698576596221127, "learning_rate": 5.955793143506863e-06, "loss": 0.0093, "step": 4881 }, { "epoch": 4.558356676003735, "grad_norm": 0.4277782986281955, "learning_rate": 5.954308858851851e-06, "loss": 0.0156, "step": 4882 }, { "epoch": 4.559290382819794, "grad_norm": 0.8832219199361923, "learning_rate": 5.952824486914815e-06, "loss": 0.0511, "step": 4883 }, { "epoch": 4.560224089635854, "grad_norm": 2.5073489939960054, "learning_rate": 5.951340027831516e-06, "loss": 0.1942, "step": 4884 }, { "epoch": 4.561157796451914, "grad_norm": 0.956768811797079, "learning_rate": 5.949855481737723e-06, "loss": 0.0396, "step": 4885 }, { "epoch": 4.562091503267974, "grad_norm": 0.44640905419983, "learning_rate": 5.948370848769215e-06, "loss": 0.0119, "step": 4886 }, { "epoch": 4.563025210084033, "grad_norm": 2.2610446419225587, "learning_rate": 5.946886129061777e-06, "loss": 0.1846, "step": 4887 }, { "epoch": 4.563958916900093, "grad_norm": 1.4474798443753034, "learning_rate": 5.945401322751205e-06, "loss": 0.0675, "step": 4888 }, { "epoch": 4.564892623716153, "grad_norm": 1.3036259263933352, "learning_rate": 5.9439164299732995e-06, "loss": 0.057, "step": 4889 }, { "epoch": 4.565826330532213, "grad_norm": 2.3290945451829344, "learning_rate": 5.942431450863871e-06, "loss": 0.1493, "step": 4890 }, { "epoch": 4.566760037348272, "grad_norm": 3.1518493739784765, "learning_rate": 5.9409463855587345e-06, "loss": 0.1798, "step": 4891 }, { "epoch": 4.567693744164332, "grad_norm": 0.9346795400444413, "learning_rate": 5.93946123419372e-06, "loss": 0.0312, "step": 4892 }, { "epoch": 4.568627450980392, "grad_norm": 2.636774461202559, "learning_rate": 5.937975996904657e-06, "loss": 0.1341, "step": 4893 }, { "epoch": 4.569561157796452, "grad_norm": 0.9658096248177883, "learning_rate": 5.936490673827389e-06, "loss": 0.0286, "step": 4894 }, { "epoch": 4.570494864612511, "grad_norm": 0.9982325928124326, "learning_rate": 5.935005265097763e-06, "loss": 0.0494, "step": 4895 }, { "epoch": 4.571428571428571, "grad_norm": 1.010840753315714, "learning_rate": 5.933519770851638e-06, "loss": 0.0507, "step": 4896 }, { "epoch": 4.572362278244631, "grad_norm": 0.614855450490242, "learning_rate": 5.932034191224878e-06, "loss": 0.022, "step": 4897 }, { "epoch": 4.573295985060691, "grad_norm": 0.9508936187351558, "learning_rate": 5.930548526353356e-06, "loss": 0.0367, "step": 4898 }, { "epoch": 4.57422969187675, "grad_norm": 0.3467051558110438, "learning_rate": 5.9290627763729505e-06, "loss": 0.0086, "step": 4899 }, { "epoch": 4.57516339869281, "grad_norm": 1.043725527441112, "learning_rate": 5.927576941419553e-06, "loss": 0.0441, "step": 4900 }, { "epoch": 4.57609710550887, "grad_norm": 0.4102936253029203, "learning_rate": 5.926091021629056e-06, "loss": 0.0049, "step": 4901 }, { "epoch": 4.57703081232493, "grad_norm": 1.4367108077143806, "learning_rate": 5.924605017137366e-06, "loss": 0.0779, "step": 4902 }, { "epoch": 4.5779645191409895, "grad_norm": 2.0178318159450592, "learning_rate": 5.923118928080391e-06, "loss": 0.1186, "step": 4903 }, { "epoch": 4.578898225957049, "grad_norm": 1.162891699083225, "learning_rate": 5.921632754594054e-06, "loss": 0.0601, "step": 4904 }, { "epoch": 4.579831932773109, "grad_norm": 0.557322503346615, "learning_rate": 5.920146496814281e-06, "loss": 0.0076, "step": 4905 }, { "epoch": 4.580765639589169, "grad_norm": 2.6371738749901064, "learning_rate": 5.918660154877005e-06, "loss": 0.1651, "step": 4906 }, { "epoch": 4.5816993464052285, "grad_norm": 1.7170499288515229, "learning_rate": 5.917173728918169e-06, "loss": 0.0738, "step": 4907 }, { "epoch": 4.582633053221288, "grad_norm": 2.0179322847210077, "learning_rate": 5.915687219073723e-06, "loss": 0.1003, "step": 4908 }, { "epoch": 4.583566760037348, "grad_norm": 1.9594975101079537, "learning_rate": 5.914200625479625e-06, "loss": 0.0853, "step": 4909 }, { "epoch": 4.584500466853408, "grad_norm": 1.582678100894644, "learning_rate": 5.912713948271839e-06, "loss": 0.0865, "step": 4910 }, { "epoch": 4.5854341736694675, "grad_norm": 0.4747619207090325, "learning_rate": 5.911227187586339e-06, "loss": 0.0121, "step": 4911 }, { "epoch": 4.586367880485527, "grad_norm": 0.6769436066144354, "learning_rate": 5.909740343559106e-06, "loss": 0.0313, "step": 4912 }, { "epoch": 4.587301587301587, "grad_norm": 3.5424350453578617, "learning_rate": 5.908253416326128e-06, "loss": 0.1279, "step": 4913 }, { "epoch": 4.588235294117647, "grad_norm": 0.5170756009482378, "learning_rate": 5.906766406023401e-06, "loss": 0.0072, "step": 4914 }, { "epoch": 4.5891690009337065, "grad_norm": 1.0827554469245797, "learning_rate": 5.905279312786928e-06, "loss": 0.0259, "step": 4915 }, { "epoch": 4.590102707749766, "grad_norm": 0.2921925404747323, "learning_rate": 5.90379213675272e-06, "loss": 0.0117, "step": 4916 }, { "epoch": 4.591036414565826, "grad_norm": 2.038280743192256, "learning_rate": 5.902304878056795e-06, "loss": 0.1101, "step": 4917 }, { "epoch": 4.591970121381886, "grad_norm": 1.6386313814749969, "learning_rate": 5.900817536835179e-06, "loss": 0.0952, "step": 4918 }, { "epoch": 4.5929038281979455, "grad_norm": 0.7106792909472043, "learning_rate": 5.899330113223909e-06, "loss": 0.0215, "step": 4919 }, { "epoch": 4.593837535014005, "grad_norm": 1.710923110391569, "learning_rate": 5.8978426073590214e-06, "loss": 0.0499, "step": 4920 }, { "epoch": 4.594771241830065, "grad_norm": 1.267542108361443, "learning_rate": 5.896355019376568e-06, "loss": 0.0787, "step": 4921 }, { "epoch": 4.595704948646125, "grad_norm": 1.3843468805377799, "learning_rate": 5.894867349412604e-06, "loss": 0.0789, "step": 4922 }, { "epoch": 4.5966386554621845, "grad_norm": 2.4971972608729955, "learning_rate": 5.893379597603194e-06, "loss": 0.1892, "step": 4923 }, { "epoch": 4.597572362278244, "grad_norm": 0.8504041685108384, "learning_rate": 5.891891764084407e-06, "loss": 0.0254, "step": 4924 }, { "epoch": 4.598506069094304, "grad_norm": 0.4447273419729892, "learning_rate": 5.890403848992325e-06, "loss": 0.0072, "step": 4925 }, { "epoch": 4.599439775910364, "grad_norm": 1.2083034401520614, "learning_rate": 5.888915852463033e-06, "loss": 0.0449, "step": 4926 }, { "epoch": 4.6003734827264235, "grad_norm": 2.245141489506475, "learning_rate": 5.8874277746326226e-06, "loss": 0.0856, "step": 4927 }, { "epoch": 4.601307189542483, "grad_norm": 1.1976182572495404, "learning_rate": 5.885939615637198e-06, "loss": 0.0048, "step": 4928 }, { "epoch": 4.602240896358543, "grad_norm": 6.157863207520373, "learning_rate": 5.884451375612865e-06, "loss": 0.2955, "step": 4929 }, { "epoch": 4.603174603174603, "grad_norm": 0.4982087846190011, "learning_rate": 5.882963054695741e-06, "loss": 0.014, "step": 4930 }, { "epoch": 4.604108309990663, "grad_norm": 1.50420285166517, "learning_rate": 5.881474653021949e-06, "loss": 0.0192, "step": 4931 }, { "epoch": 4.605042016806722, "grad_norm": 1.187963350542107, "learning_rate": 5.879986170727621e-06, "loss": 0.0398, "step": 4932 }, { "epoch": 4.605975723622782, "grad_norm": 1.3996107727217582, "learning_rate": 5.878497607948891e-06, "loss": 0.0546, "step": 4933 }, { "epoch": 4.606909430438842, "grad_norm": 0.7867218815576349, "learning_rate": 5.877008964821909e-06, "loss": 0.0391, "step": 4934 }, { "epoch": 4.607843137254902, "grad_norm": 1.5619123132572164, "learning_rate": 5.875520241482824e-06, "loss": 0.0563, "step": 4935 }, { "epoch": 4.608776844070961, "grad_norm": 1.6399650247862227, "learning_rate": 5.8740314380678e-06, "loss": 0.0282, "step": 4936 }, { "epoch": 4.609710550887021, "grad_norm": 5.066755189176341, "learning_rate": 5.872542554713001e-06, "loss": 0.1757, "step": 4937 }, { "epoch": 4.610644257703081, "grad_norm": 0.6490041846224379, "learning_rate": 5.8710535915546045e-06, "loss": 0.0278, "step": 4938 }, { "epoch": 4.611577964519141, "grad_norm": 2.1236787373305477, "learning_rate": 5.86956454872879e-06, "loss": 0.1504, "step": 4939 }, { "epoch": 4.6125116713352, "grad_norm": 1.2923238989038417, "learning_rate": 5.868075426371748e-06, "loss": 0.0599, "step": 4940 }, { "epoch": 4.61344537815126, "grad_norm": 1.4215589661404662, "learning_rate": 5.866586224619675e-06, "loss": 0.0798, "step": 4941 }, { "epoch": 4.61437908496732, "grad_norm": 3.7522676737490146, "learning_rate": 5.865096943608775e-06, "loss": 0.1033, "step": 4942 }, { "epoch": 4.61531279178338, "grad_norm": 3.0470310694816756, "learning_rate": 5.863607583475259e-06, "loss": 0.1777, "step": 4943 }, { "epoch": 4.616246498599439, "grad_norm": 0.28892646527912846, "learning_rate": 5.862118144355345e-06, "loss": 0.0055, "step": 4944 }, { "epoch": 4.617180205415499, "grad_norm": 1.244171472420566, "learning_rate": 5.86062862638526e-06, "loss": 0.0659, "step": 4945 }, { "epoch": 4.618113912231559, "grad_norm": 1.63926089323794, "learning_rate": 5.859139029701234e-06, "loss": 0.0784, "step": 4946 }, { "epoch": 4.619047619047619, "grad_norm": 0.5142950787261514, "learning_rate": 5.85764935443951e-06, "loss": 0.013, "step": 4947 }, { "epoch": 4.619981325863678, "grad_norm": 3.047896773098991, "learning_rate": 5.856159600736333e-06, "loss": 0.1664, "step": 4948 }, { "epoch": 4.620915032679738, "grad_norm": 4.236297947669474, "learning_rate": 5.854669768727958e-06, "loss": 0.1742, "step": 4949 }, { "epoch": 4.621848739495798, "grad_norm": 1.333487070135131, "learning_rate": 5.853179858550647e-06, "loss": 0.0731, "step": 4950 }, { "epoch": 4.622782446311858, "grad_norm": 1.5279216560382707, "learning_rate": 5.851689870340669e-06, "loss": 0.066, "step": 4951 }, { "epoch": 4.623716153127917, "grad_norm": 0.2668382475349992, "learning_rate": 5.850199804234298e-06, "loss": 0.0119, "step": 4952 }, { "epoch": 4.624649859943977, "grad_norm": 0.3928593814886867, "learning_rate": 5.8487096603678184e-06, "loss": 0.0107, "step": 4953 }, { "epoch": 4.625583566760037, "grad_norm": 0.2871201395691056, "learning_rate": 5.8472194388775195e-06, "loss": 0.0085, "step": 4954 }, { "epoch": 4.626517273576097, "grad_norm": 0.7411637671612573, "learning_rate": 5.8457291398996995e-06, "loss": 0.0279, "step": 4955 }, { "epoch": 4.627450980392156, "grad_norm": 2.2026492005467517, "learning_rate": 5.844238763570662e-06, "loss": 0.1461, "step": 4956 }, { "epoch": 4.628384687208216, "grad_norm": 2.031534320686603, "learning_rate": 5.842748310026717e-06, "loss": 0.0862, "step": 4957 }, { "epoch": 4.629318394024276, "grad_norm": 0.33270182281559724, "learning_rate": 5.841257779404184e-06, "loss": 0.0037, "step": 4958 }, { "epoch": 4.630252100840336, "grad_norm": 0.7859263198451589, "learning_rate": 5.839767171839389e-06, "loss": 0.0221, "step": 4959 }, { "epoch": 4.631185807656395, "grad_norm": 0.7434340907287602, "learning_rate": 5.838276487468665e-06, "loss": 0.026, "step": 4960 }, { "epoch": 4.632119514472455, "grad_norm": 0.968432105204138, "learning_rate": 5.836785726428348e-06, "loss": 0.0426, "step": 4961 }, { "epoch": 4.633053221288515, "grad_norm": 3.636010137176984, "learning_rate": 5.835294888854787e-06, "loss": 0.1357, "step": 4962 }, { "epoch": 4.633986928104575, "grad_norm": 1.7278787477484898, "learning_rate": 5.833803974884336e-06, "loss": 0.1029, "step": 4963 }, { "epoch": 4.634920634920634, "grad_norm": 1.8975631360381897, "learning_rate": 5.8323129846533535e-06, "loss": 0.1119, "step": 4964 }, { "epoch": 4.635854341736694, "grad_norm": 0.6359849471366771, "learning_rate": 5.830821918298209e-06, "loss": 0.0249, "step": 4965 }, { "epoch": 4.636788048552754, "grad_norm": 0.895712799808988, "learning_rate": 5.829330775955275e-06, "loss": 0.0068, "step": 4966 }, { "epoch": 4.637721755368814, "grad_norm": 0.6222454865452041, "learning_rate": 5.827839557760935e-06, "loss": 0.0149, "step": 4967 }, { "epoch": 4.6386554621848735, "grad_norm": 2.455104898536604, "learning_rate": 5.826348263851576e-06, "loss": 0.1346, "step": 4968 }, { "epoch": 4.639589169000933, "grad_norm": 2.2955975198231977, "learning_rate": 5.824856894363593e-06, "loss": 0.1378, "step": 4969 }, { "epoch": 4.640522875816993, "grad_norm": 1.5323854580017904, "learning_rate": 5.823365449433389e-06, "loss": 0.0869, "step": 4970 }, { "epoch": 4.641456582633054, "grad_norm": 0.5553868099115358, "learning_rate": 5.821873929197371e-06, "loss": 0.0171, "step": 4971 }, { "epoch": 4.642390289449113, "grad_norm": 1.177266967156488, "learning_rate": 5.8203823337919576e-06, "loss": 0.0439, "step": 4972 }, { "epoch": 4.643323996265173, "grad_norm": 2.153503263030578, "learning_rate": 5.81889066335357e-06, "loss": 0.0975, "step": 4973 }, { "epoch": 4.644257703081233, "grad_norm": 0.9945898726400082, "learning_rate": 5.817398918018638e-06, "loss": 0.0493, "step": 4974 }, { "epoch": 4.645191409897293, "grad_norm": 2.4124451391332458, "learning_rate": 5.815907097923599e-06, "loss": 0.179, "step": 4975 }, { "epoch": 4.646125116713352, "grad_norm": 5.320481283549246, "learning_rate": 5.814415203204894e-06, "loss": 0.2005, "step": 4976 }, { "epoch": 4.647058823529412, "grad_norm": 0.6873390626527457, "learning_rate": 5.812923233998976e-06, "loss": 0.021, "step": 4977 }, { "epoch": 4.647992530345472, "grad_norm": 0.7342595166962229, "learning_rate": 5.8114311904423e-06, "loss": 0.0218, "step": 4978 }, { "epoch": 4.648926237161532, "grad_norm": 0.42533043721710356, "learning_rate": 5.8099390726713315e-06, "loss": 0.0202, "step": 4979 }, { "epoch": 4.649859943977591, "grad_norm": 2.657365858210832, "learning_rate": 5.808446880822539e-06, "loss": 0.1952, "step": 4980 }, { "epoch": 4.650793650793651, "grad_norm": 0.8162654411593762, "learning_rate": 5.806954615032402e-06, "loss": 0.037, "step": 4981 }, { "epoch": 4.651727357609711, "grad_norm": 3.853360890768706, "learning_rate": 5.805462275437401e-06, "loss": 0.0847, "step": 4982 }, { "epoch": 4.652661064425771, "grad_norm": 0.5044978976014608, "learning_rate": 5.803969862174031e-06, "loss": 0.0139, "step": 4983 }, { "epoch": 4.65359477124183, "grad_norm": 1.2345487150415666, "learning_rate": 5.802477375378788e-06, "loss": 0.0262, "step": 4984 }, { "epoch": 4.65452847805789, "grad_norm": 1.8498224656108384, "learning_rate": 5.8009848151881765e-06, "loss": 0.1098, "step": 4985 }, { "epoch": 4.65546218487395, "grad_norm": 1.329949871931701, "learning_rate": 5.799492181738706e-06, "loss": 0.0738, "step": 4986 }, { "epoch": 4.65639589169001, "grad_norm": 1.0618494005260595, "learning_rate": 5.797999475166897e-06, "loss": 0.0319, "step": 4987 }, { "epoch": 4.657329598506069, "grad_norm": 0.8720646705349103, "learning_rate": 5.796506695609272e-06, "loss": 0.019, "step": 4988 }, { "epoch": 4.658263305322129, "grad_norm": 1.700460857458099, "learning_rate": 5.795013843202362e-06, "loss": 0.0768, "step": 4989 }, { "epoch": 4.659197012138189, "grad_norm": 0.49948431332198157, "learning_rate": 5.793520918082705e-06, "loss": 0.0125, "step": 4990 }, { "epoch": 4.660130718954249, "grad_norm": 2.0700601532870073, "learning_rate": 5.7920279203868444e-06, "loss": 0.114, "step": 4991 }, { "epoch": 4.661064425770308, "grad_norm": 3.383054432825485, "learning_rate": 5.790534850251335e-06, "loss": 0.2257, "step": 4992 }, { "epoch": 4.661998132586368, "grad_norm": 1.0790498028991307, "learning_rate": 5.78904170781273e-06, "loss": 0.0474, "step": 4993 }, { "epoch": 4.662931839402428, "grad_norm": 1.1593378944160735, "learning_rate": 5.7875484932075965e-06, "loss": 0.0224, "step": 4994 }, { "epoch": 4.663865546218488, "grad_norm": 1.916704802822852, "learning_rate": 5.786055206572503e-06, "loss": 0.0972, "step": 4995 }, { "epoch": 4.6647992530345475, "grad_norm": 3.7394616234770486, "learning_rate": 5.784561848044031e-06, "loss": 0.195, "step": 4996 }, { "epoch": 4.665732959850607, "grad_norm": 1.4812242177647028, "learning_rate": 5.783068417758759e-06, "loss": 0.0879, "step": 4997 }, { "epoch": 4.666666666666667, "grad_norm": 1.1076143003865124, "learning_rate": 5.781574915853282e-06, "loss": 0.0337, "step": 4998 }, { "epoch": 4.667600373482727, "grad_norm": 0.31461449487297916, "learning_rate": 5.780081342464195e-06, "loss": 0.0056, "step": 4999 }, { "epoch": 4.6685340802987865, "grad_norm": 1.6579869644297467, "learning_rate": 5.778587697728103e-06, "loss": 0.0934, "step": 5000 }, { "epoch": 4.669467787114846, "grad_norm": 2.709174692937193, "learning_rate": 5.777093981781615e-06, "loss": 0.1689, "step": 5001 }, { "epoch": 4.670401493930906, "grad_norm": 0.8407431977938419, "learning_rate": 5.775600194761349e-06, "loss": 0.0452, "step": 5002 }, { "epoch": 4.671335200746966, "grad_norm": 1.4545676176112372, "learning_rate": 5.774106336803926e-06, "loss": 0.0735, "step": 5003 }, { "epoch": 4.6722689075630255, "grad_norm": 0.6469242346648886, "learning_rate": 5.772612408045979e-06, "loss": 0.0359, "step": 5004 }, { "epoch": 4.673202614379085, "grad_norm": 3.218904699025421, "learning_rate": 5.7711184086241425e-06, "loss": 0.1793, "step": 5005 }, { "epoch": 4.674136321195145, "grad_norm": 1.2566272293806129, "learning_rate": 5.769624338675057e-06, "loss": 0.0705, "step": 5006 }, { "epoch": 4.675070028011205, "grad_norm": 3.2298142568373662, "learning_rate": 5.768130198335376e-06, "loss": 0.0629, "step": 5007 }, { "epoch": 4.6760037348272645, "grad_norm": 1.4758337851120824, "learning_rate": 5.766635987741751e-06, "loss": 0.0616, "step": 5008 }, { "epoch": 4.676937441643324, "grad_norm": 0.8164229784562661, "learning_rate": 5.765141707030848e-06, "loss": 0.0236, "step": 5009 }, { "epoch": 4.677871148459384, "grad_norm": 1.6103662911127565, "learning_rate": 5.76364735633933e-06, "loss": 0.0484, "step": 5010 }, { "epoch": 4.678804855275444, "grad_norm": 2.1978326800253236, "learning_rate": 5.762152935803877e-06, "loss": 0.1268, "step": 5011 }, { "epoch": 4.6797385620915035, "grad_norm": 1.6104785407754685, "learning_rate": 5.760658445561167e-06, "loss": 0.0598, "step": 5012 }, { "epoch": 4.680672268907563, "grad_norm": 2.445424900537444, "learning_rate": 5.75916388574789e-06, "loss": 0.2104, "step": 5013 }, { "epoch": 4.681605975723623, "grad_norm": 0.5887099083262678, "learning_rate": 5.757669256500737e-06, "loss": 0.0231, "step": 5014 }, { "epoch": 4.682539682539683, "grad_norm": 0.7351172515414719, "learning_rate": 5.756174557956411e-06, "loss": 0.024, "step": 5015 }, { "epoch": 4.6834733893557425, "grad_norm": 4.748552392559098, "learning_rate": 5.754679790251616e-06, "loss": 0.1723, "step": 5016 }, { "epoch": 4.684407096171802, "grad_norm": 3.005836923403386, "learning_rate": 5.753184953523068e-06, "loss": 0.1464, "step": 5017 }, { "epoch": 4.685340802987862, "grad_norm": 1.0943565036512486, "learning_rate": 5.751690047907482e-06, "loss": 0.0655, "step": 5018 }, { "epoch": 4.686274509803922, "grad_norm": 4.180513896821383, "learning_rate": 5.750195073541589e-06, "loss": 0.1207, "step": 5019 }, { "epoch": 4.6872082166199815, "grad_norm": 1.5127205001509183, "learning_rate": 5.748700030562116e-06, "loss": 0.1093, "step": 5020 }, { "epoch": 4.688141923436041, "grad_norm": 1.4526127591260745, "learning_rate": 5.747204919105803e-06, "loss": 0.0846, "step": 5021 }, { "epoch": 4.689075630252101, "grad_norm": 2.73158823993108, "learning_rate": 5.7457097393093955e-06, "loss": 0.0896, "step": 5022 }, { "epoch": 4.690009337068161, "grad_norm": 0.6771166534232004, "learning_rate": 5.744214491309641e-06, "loss": 0.0162, "step": 5023 }, { "epoch": 4.690943043884221, "grad_norm": 1.0430227203255198, "learning_rate": 5.7427191752433e-06, "loss": 0.0452, "step": 5024 }, { "epoch": 4.69187675070028, "grad_norm": 1.6463452722466334, "learning_rate": 5.741223791247133e-06, "loss": 0.1216, "step": 5025 }, { "epoch": 4.69281045751634, "grad_norm": 1.5506778808929995, "learning_rate": 5.739728339457911e-06, "loss": 0.0831, "step": 5026 }, { "epoch": 4.6937441643324, "grad_norm": 1.8895205386515503, "learning_rate": 5.738232820012407e-06, "loss": 0.0792, "step": 5027 }, { "epoch": 4.69467787114846, "grad_norm": 0.7037748778258613, "learning_rate": 5.736737233047406e-06, "loss": 0.0246, "step": 5028 }, { "epoch": 4.695611577964519, "grad_norm": 1.263602901522549, "learning_rate": 5.735241578699693e-06, "loss": 0.0585, "step": 5029 }, { "epoch": 4.696545284780579, "grad_norm": 0.5085796135390466, "learning_rate": 5.733745857106063e-06, "loss": 0.0142, "step": 5030 }, { "epoch": 4.697478991596639, "grad_norm": 0.9201781293159229, "learning_rate": 5.732250068403316e-06, "loss": 0.0284, "step": 5031 }, { "epoch": 4.698412698412699, "grad_norm": 1.4569131076332629, "learning_rate": 5.730754212728261e-06, "loss": 0.023, "step": 5032 }, { "epoch": 4.699346405228758, "grad_norm": 1.6297258639645045, "learning_rate": 5.7292582902177066e-06, "loss": 0.0681, "step": 5033 }, { "epoch": 4.700280112044818, "grad_norm": 1.1658647011070735, "learning_rate": 5.727762301008473e-06, "loss": 0.0582, "step": 5034 }, { "epoch": 4.701213818860878, "grad_norm": 3.1725658134014263, "learning_rate": 5.726266245237384e-06, "loss": 0.139, "step": 5035 }, { "epoch": 4.702147525676938, "grad_norm": 2.084092787868231, "learning_rate": 5.7247701230412725e-06, "loss": 0.1387, "step": 5036 }, { "epoch": 4.703081232492997, "grad_norm": 1.7615091141821473, "learning_rate": 5.723273934556972e-06, "loss": 0.1091, "step": 5037 }, { "epoch": 4.704014939309057, "grad_norm": 1.3946894143520403, "learning_rate": 5.721777679921329e-06, "loss": 0.053, "step": 5038 }, { "epoch": 4.704948646125117, "grad_norm": 7.171289534541758, "learning_rate": 5.720281359271189e-06, "loss": 0.2167, "step": 5039 }, { "epoch": 4.705882352941177, "grad_norm": 0.6708068761176849, "learning_rate": 5.71878497274341e-06, "loss": 0.0325, "step": 5040 }, { "epoch": 4.706816059757236, "grad_norm": 2.626301542723536, "learning_rate": 5.717288520474849e-06, "loss": 0.1275, "step": 5041 }, { "epoch": 4.707749766573296, "grad_norm": 2.1601964964740463, "learning_rate": 5.715792002602378e-06, "loss": 0.0592, "step": 5042 }, { "epoch": 4.708683473389356, "grad_norm": 5.9459087574730765, "learning_rate": 5.714295419262867e-06, "loss": 0.2196, "step": 5043 }, { "epoch": 4.709617180205416, "grad_norm": 1.012461276396491, "learning_rate": 5.712798770593194e-06, "loss": 0.0528, "step": 5044 }, { "epoch": 4.710550887021475, "grad_norm": 1.7477368941729408, "learning_rate": 5.711302056730248e-06, "loss": 0.0933, "step": 5045 }, { "epoch": 4.711484593837535, "grad_norm": 2.352183227922767, "learning_rate": 5.709805277810915e-06, "loss": 0.171, "step": 5046 }, { "epoch": 4.712418300653595, "grad_norm": 3.010701127449384, "learning_rate": 5.708308433972096e-06, "loss": 0.1635, "step": 5047 }, { "epoch": 4.713352007469655, "grad_norm": 0.6285616823986151, "learning_rate": 5.706811525350691e-06, "loss": 0.021, "step": 5048 }, { "epoch": 4.714285714285714, "grad_norm": 1.4979201495106744, "learning_rate": 5.7053145520836094e-06, "loss": 0.0645, "step": 5049 }, { "epoch": 4.715219421101774, "grad_norm": 1.665010498401045, "learning_rate": 5.703817514307767e-06, "loss": 0.0826, "step": 5050 }, { "epoch": 4.716153127917834, "grad_norm": 1.7902540204284991, "learning_rate": 5.702320412160084e-06, "loss": 0.0275, "step": 5051 }, { "epoch": 4.717086834733894, "grad_norm": 1.2354562280447652, "learning_rate": 5.700823245777485e-06, "loss": 0.0526, "step": 5052 }, { "epoch": 4.718020541549953, "grad_norm": 2.120952572364369, "learning_rate": 5.699326015296905e-06, "loss": 0.1115, "step": 5053 }, { "epoch": 4.718954248366013, "grad_norm": 1.3798399923677642, "learning_rate": 5.697828720855282e-06, "loss": 0.0724, "step": 5054 }, { "epoch": 4.719887955182073, "grad_norm": 1.2985957347753627, "learning_rate": 5.696331362589557e-06, "loss": 0.0422, "step": 5055 }, { "epoch": 4.720821661998133, "grad_norm": 1.5223514059505356, "learning_rate": 5.694833940636682e-06, "loss": 0.0302, "step": 5056 }, { "epoch": 4.721755368814192, "grad_norm": 0.3525824363448251, "learning_rate": 5.693336455133613e-06, "loss": 0.0096, "step": 5057 }, { "epoch": 4.722689075630252, "grad_norm": 0.5298725838265852, "learning_rate": 5.691838906217311e-06, "loss": 0.0086, "step": 5058 }, { "epoch": 4.723622782446312, "grad_norm": 2.389875977840241, "learning_rate": 5.6903412940247415e-06, "loss": 0.0978, "step": 5059 }, { "epoch": 4.724556489262372, "grad_norm": 1.460880125946459, "learning_rate": 5.688843618692881e-06, "loss": 0.0704, "step": 5060 }, { "epoch": 4.7254901960784315, "grad_norm": 1.8264029212874835, "learning_rate": 5.687345880358706e-06, "loss": 0.0894, "step": 5061 }, { "epoch": 4.726423902894491, "grad_norm": 2.358295501563852, "learning_rate": 5.685848079159202e-06, "loss": 0.1633, "step": 5062 }, { "epoch": 4.727357609710551, "grad_norm": 0.7781817768053146, "learning_rate": 5.684350215231359e-06, "loss": 0.029, "step": 5063 }, { "epoch": 4.728291316526611, "grad_norm": 0.9358888094570552, "learning_rate": 5.682852288712174e-06, "loss": 0.0498, "step": 5064 }, { "epoch": 4.7292250233426705, "grad_norm": 0.3581047523070084, "learning_rate": 5.6813542997386465e-06, "loss": 0.0124, "step": 5065 }, { "epoch": 4.73015873015873, "grad_norm": 2.6832831369142065, "learning_rate": 5.679856248447787e-06, "loss": 0.1706, "step": 5066 }, { "epoch": 4.73109243697479, "grad_norm": 0.8920637671020883, "learning_rate": 5.678358134976607e-06, "loss": 0.0456, "step": 5067 }, { "epoch": 4.73202614379085, "grad_norm": 0.24704878025044347, "learning_rate": 5.676859959462127e-06, "loss": 0.0089, "step": 5068 }, { "epoch": 4.7329598506069095, "grad_norm": 1.647229020698884, "learning_rate": 5.675361722041368e-06, "loss": 0.0742, "step": 5069 }, { "epoch": 4.733893557422969, "grad_norm": 1.2007707237419794, "learning_rate": 5.673863422851364e-06, "loss": 0.0622, "step": 5070 }, { "epoch": 4.734827264239029, "grad_norm": 0.5050588388556337, "learning_rate": 5.6723650620291504e-06, "loss": 0.0127, "step": 5071 }, { "epoch": 4.735760971055089, "grad_norm": 0.40472910282240904, "learning_rate": 5.670866639711766e-06, "loss": 0.0072, "step": 5072 }, { "epoch": 4.7366946778711485, "grad_norm": 1.9664645136978878, "learning_rate": 5.669368156036263e-06, "loss": 0.0844, "step": 5073 }, { "epoch": 4.737628384687208, "grad_norm": 1.4283845144180702, "learning_rate": 5.667869611139687e-06, "loss": 0.0557, "step": 5074 }, { "epoch": 4.738562091503268, "grad_norm": 1.186061482575995, "learning_rate": 5.666371005159103e-06, "loss": 0.0442, "step": 5075 }, { "epoch": 4.739495798319328, "grad_norm": 2.955907525886251, "learning_rate": 5.664872338231572e-06, "loss": 0.1794, "step": 5076 }, { "epoch": 4.7404295051353875, "grad_norm": 6.12690945249861, "learning_rate": 5.663373610494164e-06, "loss": 0.2096, "step": 5077 }, { "epoch": 4.741363211951447, "grad_norm": 1.4861412211189935, "learning_rate": 5.661874822083955e-06, "loss": 0.073, "step": 5078 }, { "epoch": 4.742296918767507, "grad_norm": 2.15927400889889, "learning_rate": 5.660375973138022e-06, "loss": 0.1032, "step": 5079 }, { "epoch": 4.743230625583567, "grad_norm": 0.7112306287407195, "learning_rate": 5.658877063793458e-06, "loss": 0.0179, "step": 5080 }, { "epoch": 4.7441643323996265, "grad_norm": 2.1369827504440155, "learning_rate": 5.6573780941873465e-06, "loss": 0.0703, "step": 5081 }, { "epoch": 4.745098039215686, "grad_norm": 1.396420206930631, "learning_rate": 5.655879064456792e-06, "loss": 0.076, "step": 5082 }, { "epoch": 4.746031746031746, "grad_norm": 6.027439486945991, "learning_rate": 5.654379974738893e-06, "loss": 0.2689, "step": 5083 }, { "epoch": 4.746965452847806, "grad_norm": 1.5497088533355354, "learning_rate": 5.652880825170759e-06, "loss": 0.0679, "step": 5084 }, { "epoch": 4.7478991596638656, "grad_norm": 0.6010156341737388, "learning_rate": 5.651381615889504e-06, "loss": 0.019, "step": 5085 }, { "epoch": 4.748832866479925, "grad_norm": 7.366871246724638, "learning_rate": 5.649882347032246e-06, "loss": 0.1295, "step": 5086 }, { "epoch": 4.749766573295985, "grad_norm": 2.067793748220289, "learning_rate": 5.6483830187361115e-06, "loss": 0.0588, "step": 5087 }, { "epoch": 4.750700280112045, "grad_norm": 3.13477740774972, "learning_rate": 5.6468836311382295e-06, "loss": 0.1696, "step": 5088 }, { "epoch": 4.751633986928105, "grad_norm": 0.6844722915951784, "learning_rate": 5.645384184375734e-06, "loss": 0.0272, "step": 5089 }, { "epoch": 4.752567693744164, "grad_norm": 3.0693152834170245, "learning_rate": 5.643884678585768e-06, "loss": 0.1425, "step": 5090 }, { "epoch": 4.753501400560224, "grad_norm": 0.8406985580499668, "learning_rate": 5.642385113905477e-06, "loss": 0.0354, "step": 5091 }, { "epoch": 4.754435107376284, "grad_norm": 1.2527186333993594, "learning_rate": 5.640885490472013e-06, "loss": 0.0059, "step": 5092 }, { "epoch": 4.755368814192344, "grad_norm": 0.6329373811016198, "learning_rate": 5.6393858084225305e-06, "loss": 0.0159, "step": 5093 }, { "epoch": 4.756302521008403, "grad_norm": 4.426536837433164, "learning_rate": 5.637886067894196e-06, "loss": 0.2097, "step": 5094 }, { "epoch": 4.757236227824463, "grad_norm": 4.251593350170097, "learning_rate": 5.636386269024174e-06, "loss": 0.2755, "step": 5095 }, { "epoch": 4.758169934640523, "grad_norm": 1.0636465866909253, "learning_rate": 5.634886411949641e-06, "loss": 0.0187, "step": 5096 }, { "epoch": 4.759103641456583, "grad_norm": 1.85166588700958, "learning_rate": 5.6333864968077716e-06, "loss": 0.0318, "step": 5097 }, { "epoch": 4.760037348272642, "grad_norm": 1.7068042513672839, "learning_rate": 5.631886523735752e-06, "loss": 0.0997, "step": 5098 }, { "epoch": 4.760971055088702, "grad_norm": 2.7405334334254157, "learning_rate": 5.630386492870769e-06, "loss": 0.1375, "step": 5099 }, { "epoch": 4.761904761904762, "grad_norm": 0.9388490224223898, "learning_rate": 5.62888640435002e-06, "loss": 0.0372, "step": 5100 }, { "epoch": 4.762838468720822, "grad_norm": 1.5361214967773758, "learning_rate": 5.627386258310702e-06, "loss": 0.0524, "step": 5101 }, { "epoch": 4.763772175536881, "grad_norm": 2.077288584710037, "learning_rate": 5.625886054890022e-06, "loss": 0.1038, "step": 5102 }, { "epoch": 4.764705882352941, "grad_norm": 1.242218105688576, "learning_rate": 5.624385794225187e-06, "loss": 0.0379, "step": 5103 }, { "epoch": 4.765639589169001, "grad_norm": 2.025760055796497, "learning_rate": 5.6228854764534144e-06, "loss": 0.1154, "step": 5104 }, { "epoch": 4.766573295985061, "grad_norm": 0.7190473859046198, "learning_rate": 5.621385101711926e-06, "loss": 0.0439, "step": 5105 }, { "epoch": 4.76750700280112, "grad_norm": 0.9545092882132171, "learning_rate": 5.619884670137944e-06, "loss": 0.0382, "step": 5106 }, { "epoch": 4.76844070961718, "grad_norm": 3.2278216425501216, "learning_rate": 5.618384181868704e-06, "loss": 0.1886, "step": 5107 }, { "epoch": 4.76937441643324, "grad_norm": 1.3836026042572622, "learning_rate": 5.616883637041436e-06, "loss": 0.0708, "step": 5108 }, { "epoch": 4.7703081232493, "grad_norm": 3.118909289369193, "learning_rate": 5.615383035793387e-06, "loss": 0.2002, "step": 5109 }, { "epoch": 4.771241830065359, "grad_norm": 2.9138958750394433, "learning_rate": 5.6138823782618e-06, "loss": 0.1718, "step": 5110 }, { "epoch": 4.772175536881419, "grad_norm": 1.138101792523448, "learning_rate": 5.6123816645839295e-06, "loss": 0.0537, "step": 5111 }, { "epoch": 4.773109243697479, "grad_norm": 1.8095657467476138, "learning_rate": 5.6108808948970285e-06, "loss": 0.0993, "step": 5112 }, { "epoch": 4.774042950513539, "grad_norm": 1.4415137672801508, "learning_rate": 5.609380069338363e-06, "loss": 0.1249, "step": 5113 }, { "epoch": 4.774976657329598, "grad_norm": 0.475679369450147, "learning_rate": 5.607879188045196e-06, "loss": 0.0231, "step": 5114 }, { "epoch": 4.775910364145658, "grad_norm": 1.4339808627245, "learning_rate": 5.606378251154801e-06, "loss": 0.0736, "step": 5115 }, { "epoch": 4.776844070961718, "grad_norm": 1.7328390206037767, "learning_rate": 5.6048772588044575e-06, "loss": 0.0193, "step": 5116 }, { "epoch": 4.777777777777778, "grad_norm": 2.0402114347605824, "learning_rate": 5.603376211131445e-06, "loss": 0.11, "step": 5117 }, { "epoch": 4.778711484593837, "grad_norm": 0.7469127705032377, "learning_rate": 5.6018751082730525e-06, "loss": 0.0302, "step": 5118 }, { "epoch": 4.779645191409897, "grad_norm": 0.7956784091770992, "learning_rate": 5.600373950366571e-06, "loss": 0.0237, "step": 5119 }, { "epoch": 4.780578898225957, "grad_norm": 1.2827144096532506, "learning_rate": 5.598872737549299e-06, "loss": 0.0158, "step": 5120 }, { "epoch": 4.781512605042017, "grad_norm": 1.8080474943998974, "learning_rate": 5.597371469958538e-06, "loss": 0.0645, "step": 5121 }, { "epoch": 4.7824463118580764, "grad_norm": 0.27538927064121055, "learning_rate": 5.595870147731595e-06, "loss": 0.0062, "step": 5122 }, { "epoch": 4.783380018674136, "grad_norm": 2.369384885783586, "learning_rate": 5.594368771005784e-06, "loss": 0.0204, "step": 5123 }, { "epoch": 4.784313725490196, "grad_norm": 0.7523843741579953, "learning_rate": 5.592867339918422e-06, "loss": 0.0265, "step": 5124 }, { "epoch": 4.785247432306256, "grad_norm": 2.564187376155386, "learning_rate": 5.591365854606829e-06, "loss": 0.0386, "step": 5125 }, { "epoch": 4.7861811391223155, "grad_norm": 4.901922703877109, "learning_rate": 5.589864315208336e-06, "loss": 0.1792, "step": 5126 }, { "epoch": 4.787114845938375, "grad_norm": 1.5674964751669804, "learning_rate": 5.588362721860272e-06, "loss": 0.0511, "step": 5127 }, { "epoch": 4.788048552754435, "grad_norm": 1.1256073238789719, "learning_rate": 5.586861074699978e-06, "loss": 0.0307, "step": 5128 }, { "epoch": 4.788982259570495, "grad_norm": 0.9423523841210036, "learning_rate": 5.585359373864792e-06, "loss": 0.0464, "step": 5129 }, { "epoch": 4.7899159663865545, "grad_norm": 0.5380936204346907, "learning_rate": 5.583857619492062e-06, "loss": 0.0092, "step": 5130 }, { "epoch": 4.790849673202614, "grad_norm": 2.1390942678222116, "learning_rate": 5.582355811719142e-06, "loss": 0.0879, "step": 5131 }, { "epoch": 4.791783380018674, "grad_norm": 4.3372805009998245, "learning_rate": 5.580853950683388e-06, "loss": 0.1823, "step": 5132 }, { "epoch": 4.792717086834734, "grad_norm": 3.1280778646621368, "learning_rate": 5.57935203652216e-06, "loss": 0.1729, "step": 5133 }, { "epoch": 4.7936507936507935, "grad_norm": 1.3385668628434708, "learning_rate": 5.577850069372826e-06, "loss": 0.0789, "step": 5134 }, { "epoch": 4.794584500466853, "grad_norm": 1.7590623607390161, "learning_rate": 5.5763480493727584e-06, "loss": 0.0602, "step": 5135 }, { "epoch": 4.795518207282913, "grad_norm": 1.886007372461417, "learning_rate": 5.5748459766593324e-06, "loss": 0.0508, "step": 5136 }, { "epoch": 4.796451914098973, "grad_norm": 1.5461262985279827, "learning_rate": 5.573343851369928e-06, "loss": 0.0815, "step": 5137 }, { "epoch": 4.7973856209150325, "grad_norm": 2.0391915556936415, "learning_rate": 5.571841673641931e-06, "loss": 0.0871, "step": 5138 }, { "epoch": 4.798319327731092, "grad_norm": 1.1675132596108673, "learning_rate": 5.570339443612734e-06, "loss": 0.0508, "step": 5139 }, { "epoch": 4.799253034547152, "grad_norm": 0.42185632248548505, "learning_rate": 5.5688371614197315e-06, "loss": 0.0081, "step": 5140 }, { "epoch": 4.800186741363212, "grad_norm": 2.2942961838727953, "learning_rate": 5.5673348272003236e-06, "loss": 0.1483, "step": 5141 }, { "epoch": 4.8011204481792715, "grad_norm": 2.551170115040623, "learning_rate": 5.565832441091914e-06, "loss": 0.0608, "step": 5142 }, { "epoch": 4.802054154995331, "grad_norm": 1.5442778748432422, "learning_rate": 5.564330003231915e-06, "loss": 0.0809, "step": 5143 }, { "epoch": 4.802987861811391, "grad_norm": 1.3389493637443681, "learning_rate": 5.562827513757739e-06, "loss": 0.0654, "step": 5144 }, { "epoch": 4.803921568627451, "grad_norm": 1.6185803477976035, "learning_rate": 5.5613249728068065e-06, "loss": 0.071, "step": 5145 }, { "epoch": 4.8048552754435105, "grad_norm": 1.429204236991886, "learning_rate": 5.559822380516539e-06, "loss": 0.0919, "step": 5146 }, { "epoch": 4.80578898225957, "grad_norm": 2.3804082073886943, "learning_rate": 5.5583197370243705e-06, "loss": 0.1241, "step": 5147 }, { "epoch": 4.80672268907563, "grad_norm": 2.6020519690868285, "learning_rate": 5.556817042467727e-06, "loss": 0.1326, "step": 5148 }, { "epoch": 4.80765639589169, "grad_norm": 1.2537590541362165, "learning_rate": 5.5553142969840526e-06, "loss": 0.067, "step": 5149 }, { "epoch": 4.80859010270775, "grad_norm": 1.3901237810990792, "learning_rate": 5.553811500710785e-06, "loss": 0.0923, "step": 5150 }, { "epoch": 4.809523809523809, "grad_norm": 1.2995827523497125, "learning_rate": 5.552308653785375e-06, "loss": 0.0943, "step": 5151 }, { "epoch": 4.810457516339869, "grad_norm": 0.7630198776399838, "learning_rate": 5.550805756345272e-06, "loss": 0.0063, "step": 5152 }, { "epoch": 4.811391223155929, "grad_norm": 0.681734175480313, "learning_rate": 5.549302808527936e-06, "loss": 0.0104, "step": 5153 }, { "epoch": 4.812324929971989, "grad_norm": 2.153870674035537, "learning_rate": 5.547799810470823e-06, "loss": 0.0938, "step": 5154 }, { "epoch": 4.813258636788048, "grad_norm": 0.49640674373381855, "learning_rate": 5.546296762311402e-06, "loss": 0.0128, "step": 5155 }, { "epoch": 4.814192343604108, "grad_norm": 3.992421728502538, "learning_rate": 5.544793664187143e-06, "loss": 0.1877, "step": 5156 }, { "epoch": 4.815126050420168, "grad_norm": 2.5227618003405787, "learning_rate": 5.54329051623552e-06, "loss": 0.0845, "step": 5157 }, { "epoch": 4.816059757236228, "grad_norm": 3.581469907696411, "learning_rate": 5.541787318594014e-06, "loss": 0.1756, "step": 5158 }, { "epoch": 4.816993464052287, "grad_norm": 0.648823964000986, "learning_rate": 5.5402840714001075e-06, "loss": 0.0339, "step": 5159 }, { "epoch": 4.817927170868347, "grad_norm": 1.3592010094773066, "learning_rate": 5.538780774791289e-06, "loss": 0.0839, "step": 5160 }, { "epoch": 4.818860877684407, "grad_norm": 1.4086785002401754, "learning_rate": 5.5372774289050515e-06, "loss": 0.0477, "step": 5161 }, { "epoch": 4.819794584500467, "grad_norm": 1.4840402612469528, "learning_rate": 5.535774033878894e-06, "loss": 0.0626, "step": 5162 }, { "epoch": 4.820728291316526, "grad_norm": 2.0263363676461146, "learning_rate": 5.534270589850315e-06, "loss": 0.0897, "step": 5163 }, { "epoch": 4.821661998132586, "grad_norm": 2.23576622691414, "learning_rate": 5.532767096956825e-06, "loss": 0.1556, "step": 5164 }, { "epoch": 4.822595704948646, "grad_norm": 1.1545532702633787, "learning_rate": 5.5312635553359315e-06, "loss": 0.0553, "step": 5165 }, { "epoch": 4.823529411764706, "grad_norm": 0.4662798964243575, "learning_rate": 5.529759965125153e-06, "loss": 0.0175, "step": 5166 }, { "epoch": 4.824463118580765, "grad_norm": 2.258038394626211, "learning_rate": 5.528256326462007e-06, "loss": 0.1317, "step": 5167 }, { "epoch": 4.825396825396825, "grad_norm": 0.40042412304282615, "learning_rate": 5.52675263948402e-06, "loss": 0.0129, "step": 5168 }, { "epoch": 4.826330532212885, "grad_norm": 1.2187044898895698, "learning_rate": 5.525248904328717e-06, "loss": 0.0532, "step": 5169 }, { "epoch": 4.827264239028945, "grad_norm": 2.2989166465278283, "learning_rate": 5.523745121133635e-06, "loss": 0.1329, "step": 5170 }, { "epoch": 4.828197945845004, "grad_norm": 1.1672915837594355, "learning_rate": 5.52224129003631e-06, "loss": 0.0562, "step": 5171 }, { "epoch": 4.829131652661064, "grad_norm": 1.8913571905419164, "learning_rate": 5.520737411174283e-06, "loss": 0.0357, "step": 5172 }, { "epoch": 4.830065359477124, "grad_norm": 1.474083129438248, "learning_rate": 5.519233484685102e-06, "loss": 0.0568, "step": 5173 }, { "epoch": 4.830999066293184, "grad_norm": 0.4759569123345707, "learning_rate": 5.517729510706316e-06, "loss": 0.0119, "step": 5174 }, { "epoch": 4.831932773109243, "grad_norm": 1.5755971575381855, "learning_rate": 5.51622548937548e-06, "loss": 0.0542, "step": 5175 }, { "epoch": 4.832866479925303, "grad_norm": 3.9303913718295624, "learning_rate": 5.514721420830153e-06, "loss": 0.1915, "step": 5176 }, { "epoch": 4.833800186741363, "grad_norm": 1.5730132655335956, "learning_rate": 5.513217305207902e-06, "loss": 0.1398, "step": 5177 }, { "epoch": 4.834733893557423, "grad_norm": 1.7206160846543745, "learning_rate": 5.51171314264629e-06, "loss": 0.0778, "step": 5178 }, { "epoch": 4.835667600373482, "grad_norm": 0.7449397909907578, "learning_rate": 5.510208933282893e-06, "loss": 0.0196, "step": 5179 }, { "epoch": 4.836601307189542, "grad_norm": 4.6779035358858, "learning_rate": 5.508704677255284e-06, "loss": 0.0975, "step": 5180 }, { "epoch": 4.837535014005602, "grad_norm": 1.047583058552235, "learning_rate": 5.507200374701048e-06, "loss": 0.0353, "step": 5181 }, { "epoch": 4.838468720821662, "grad_norm": 1.963539288577213, "learning_rate": 5.505696025757767e-06, "loss": 0.073, "step": 5182 }, { "epoch": 4.839402427637721, "grad_norm": 1.9132517481645304, "learning_rate": 5.50419163056303e-06, "loss": 0.0652, "step": 5183 }, { "epoch": 4.840336134453781, "grad_norm": 1.3914267944876106, "learning_rate": 5.502687189254431e-06, "loss": 0.0536, "step": 5184 }, { "epoch": 4.841269841269841, "grad_norm": 1.7077980448243344, "learning_rate": 5.50118270196957e-06, "loss": 0.0607, "step": 5185 }, { "epoch": 4.842203548085901, "grad_norm": 3.8957823962880047, "learning_rate": 5.499678168846044e-06, "loss": 0.2368, "step": 5186 }, { "epoch": 4.8431372549019605, "grad_norm": 2.479643195058238, "learning_rate": 5.498173590021465e-06, "loss": 0.12, "step": 5187 }, { "epoch": 4.84407096171802, "grad_norm": 2.2571834387920116, "learning_rate": 5.496668965633439e-06, "loss": 0.1433, "step": 5188 }, { "epoch": 4.84500466853408, "grad_norm": 0.3349582252436875, "learning_rate": 5.495164295819581e-06, "loss": 0.0042, "step": 5189 }, { "epoch": 4.84593837535014, "grad_norm": 1.7612314448616049, "learning_rate": 5.49365958071751e-06, "loss": 0.0795, "step": 5190 }, { "epoch": 4.8468720821661995, "grad_norm": 0.5744375182397556, "learning_rate": 5.49215482046485e-06, "loss": 0.0108, "step": 5191 }, { "epoch": 4.847805788982259, "grad_norm": 2.626748829938737, "learning_rate": 5.490650015199226e-06, "loss": 0.0982, "step": 5192 }, { "epoch": 4.848739495798319, "grad_norm": 1.179396815545844, "learning_rate": 5.4891451650582716e-06, "loss": 0.0577, "step": 5193 }, { "epoch": 4.849673202614379, "grad_norm": 1.4872758453987243, "learning_rate": 5.487640270179617e-06, "loss": 0.0804, "step": 5194 }, { "epoch": 4.8506069094304385, "grad_norm": 0.45642930382219266, "learning_rate": 5.486135330700906e-06, "loss": 0.0132, "step": 5195 }, { "epoch": 4.851540616246498, "grad_norm": 0.4998696654495059, "learning_rate": 5.4846303467597815e-06, "loss": 0.0203, "step": 5196 }, { "epoch": 4.852474323062558, "grad_norm": 0.19528622528757036, "learning_rate": 5.483125318493888e-06, "loss": 0.0018, "step": 5197 }, { "epoch": 4.853408029878618, "grad_norm": 1.950714744656726, "learning_rate": 5.48162024604088e-06, "loss": 0.0545, "step": 5198 }, { "epoch": 4.8543417366946775, "grad_norm": 1.3841117099682214, "learning_rate": 5.480115129538409e-06, "loss": 0.0732, "step": 5199 }, { "epoch": 4.855275443510737, "grad_norm": 1.5103261324013717, "learning_rate": 5.478609969124138e-06, "loss": 0.0456, "step": 5200 }, { "epoch": 4.856209150326797, "grad_norm": 0.9323934312512216, "learning_rate": 5.477104764935729e-06, "loss": 0.0436, "step": 5201 }, { "epoch": 4.857142857142857, "grad_norm": 1.2120287215439287, "learning_rate": 5.47559951711085e-06, "loss": 0.0352, "step": 5202 }, { "epoch": 4.8580765639589165, "grad_norm": 1.2956765455496149, "learning_rate": 5.4740942257871735e-06, "loss": 0.0419, "step": 5203 }, { "epoch": 4.859010270774976, "grad_norm": 2.1957233987068787, "learning_rate": 5.47258889110237e-06, "loss": 0.0781, "step": 5204 }, { "epoch": 4.859943977591037, "grad_norm": 3.0227892146989057, "learning_rate": 5.471083513194125e-06, "loss": 0.1232, "step": 5205 }, { "epoch": 4.860877684407097, "grad_norm": 2.2984929783055468, "learning_rate": 5.469578092200118e-06, "loss": 0.0839, "step": 5206 }, { "epoch": 4.861811391223156, "grad_norm": 0.8280998175799305, "learning_rate": 5.46807262825804e-06, "loss": 0.0226, "step": 5207 }, { "epoch": 4.862745098039216, "grad_norm": 1.7479238442080722, "learning_rate": 5.466567121505577e-06, "loss": 0.0736, "step": 5208 }, { "epoch": 4.863678804855276, "grad_norm": 0.4482138429955874, "learning_rate": 5.465061572080429e-06, "loss": 0.0161, "step": 5209 }, { "epoch": 4.864612511671336, "grad_norm": 0.48533815022513715, "learning_rate": 5.463555980120291e-06, "loss": 0.0052, "step": 5210 }, { "epoch": 4.865546218487395, "grad_norm": 1.7958450584936105, "learning_rate": 5.46205034576287e-06, "loss": 0.0604, "step": 5211 }, { "epoch": 4.866479925303455, "grad_norm": 3.8255180654598964, "learning_rate": 5.460544669145867e-06, "loss": 0.1737, "step": 5212 }, { "epoch": 4.867413632119515, "grad_norm": 1.1230710891198763, "learning_rate": 5.459038950407e-06, "loss": 0.0472, "step": 5213 }, { "epoch": 4.868347338935575, "grad_norm": 4.778636253909432, "learning_rate": 5.457533189683978e-06, "loss": 0.1796, "step": 5214 }, { "epoch": 4.8692810457516345, "grad_norm": 1.3425856868515114, "learning_rate": 5.456027387114522e-06, "loss": 0.0743, "step": 5215 }, { "epoch": 4.870214752567694, "grad_norm": 1.6048503221656094, "learning_rate": 5.4545215428363515e-06, "loss": 0.0658, "step": 5216 }, { "epoch": 4.871148459383754, "grad_norm": 1.0914655643323983, "learning_rate": 5.453015656987196e-06, "loss": 0.0149, "step": 5217 }, { "epoch": 4.872082166199814, "grad_norm": 2.195139159060474, "learning_rate": 5.451509729704783e-06, "loss": 0.1027, "step": 5218 }, { "epoch": 4.8730158730158735, "grad_norm": 0.84998873465763, "learning_rate": 5.450003761126847e-06, "loss": 0.0364, "step": 5219 }, { "epoch": 4.873949579831933, "grad_norm": 0.4584452370986254, "learning_rate": 5.448497751391126e-06, "loss": 0.0116, "step": 5220 }, { "epoch": 4.874883286647993, "grad_norm": 4.016922221792272, "learning_rate": 5.446991700635359e-06, "loss": 0.1491, "step": 5221 }, { "epoch": 4.875816993464053, "grad_norm": 3.132834285093821, "learning_rate": 5.445485608997293e-06, "loss": 0.1174, "step": 5222 }, { "epoch": 4.8767507002801125, "grad_norm": 2.935223144689368, "learning_rate": 5.443979476614674e-06, "loss": 0.1178, "step": 5223 }, { "epoch": 4.877684407096172, "grad_norm": 1.3001975589685884, "learning_rate": 5.442473303625259e-06, "loss": 0.0267, "step": 5224 }, { "epoch": 4.878618113912232, "grad_norm": 0.4579493709654048, "learning_rate": 5.4409670901667985e-06, "loss": 0.0154, "step": 5225 }, { "epoch": 4.879551820728292, "grad_norm": 0.3579317923449988, "learning_rate": 5.439460836377056e-06, "loss": 0.0126, "step": 5226 }, { "epoch": 4.8804855275443515, "grad_norm": 1.0052991514317497, "learning_rate": 5.437954542393793e-06, "loss": 0.0443, "step": 5227 }, { "epoch": 4.881419234360411, "grad_norm": 4.860458439892909, "learning_rate": 5.436448208354779e-06, "loss": 0.1448, "step": 5228 }, { "epoch": 4.882352941176471, "grad_norm": 0.23019968216194314, "learning_rate": 5.434941834397782e-06, "loss": 0.0067, "step": 5229 }, { "epoch": 4.883286647992531, "grad_norm": 0.7092215594055287, "learning_rate": 5.433435420660578e-06, "loss": 0.0285, "step": 5230 }, { "epoch": 4.8842203548085905, "grad_norm": 1.6431417585990578, "learning_rate": 5.431928967280945e-06, "loss": 0.0329, "step": 5231 }, { "epoch": 4.88515406162465, "grad_norm": 3.9530386209623676, "learning_rate": 5.430422474396663e-06, "loss": 0.1858, "step": 5232 }, { "epoch": 4.88608776844071, "grad_norm": 1.344050053070475, "learning_rate": 5.4289159421455205e-06, "loss": 0.0774, "step": 5233 }, { "epoch": 4.88702147525677, "grad_norm": 2.1628584543432074, "learning_rate": 5.427409370665304e-06, "loss": 0.1214, "step": 5234 }, { "epoch": 4.8879551820728295, "grad_norm": 1.2332461127822087, "learning_rate": 5.425902760093806e-06, "loss": 0.0562, "step": 5235 }, { "epoch": 4.888888888888889, "grad_norm": 2.5680323822961357, "learning_rate": 5.424396110568826e-06, "loss": 0.1393, "step": 5236 }, { "epoch": 4.889822595704949, "grad_norm": 1.115354590644739, "learning_rate": 5.4228894222281595e-06, "loss": 0.0562, "step": 5237 }, { "epoch": 4.890756302521009, "grad_norm": 1.2260085367005171, "learning_rate": 5.42138269520961e-06, "loss": 0.0588, "step": 5238 }, { "epoch": 4.8916900093370685, "grad_norm": 0.4545685527962831, "learning_rate": 5.419875929650987e-06, "loss": 0.0031, "step": 5239 }, { "epoch": 4.892623716153128, "grad_norm": 0.7756716951479935, "learning_rate": 5.418369125690098e-06, "loss": 0.037, "step": 5240 }, { "epoch": 4.893557422969188, "grad_norm": 0.34776130717162757, "learning_rate": 5.416862283464759e-06, "loss": 0.0128, "step": 5241 }, { "epoch": 4.894491129785248, "grad_norm": 1.0486263486504661, "learning_rate": 5.415355403112785e-06, "loss": 0.0347, "step": 5242 }, { "epoch": 4.895424836601308, "grad_norm": 2.9205307813188104, "learning_rate": 5.413848484771998e-06, "loss": 0.2376, "step": 5243 }, { "epoch": 4.896358543417367, "grad_norm": 3.2902641172995173, "learning_rate": 5.412341528580222e-06, "loss": 0.2348, "step": 5244 }, { "epoch": 4.897292250233427, "grad_norm": 0.3961052189153861, "learning_rate": 5.410834534675288e-06, "loss": 0.02, "step": 5245 }, { "epoch": 4.898225957049487, "grad_norm": 0.44100820425630255, "learning_rate": 5.4093275031950195e-06, "loss": 0.0131, "step": 5246 }, { "epoch": 4.899159663865547, "grad_norm": 2.158679746021977, "learning_rate": 5.407820434277259e-06, "loss": 0.1503, "step": 5247 }, { "epoch": 4.900093370681606, "grad_norm": 2.722887174881909, "learning_rate": 5.406313328059839e-06, "loss": 0.1231, "step": 5248 }, { "epoch": 4.901027077497666, "grad_norm": 2.3929269002239355, "learning_rate": 5.404806184680604e-06, "loss": 0.1606, "step": 5249 }, { "epoch": 4.901960784313726, "grad_norm": 6.39635374122942, "learning_rate": 5.403299004277397e-06, "loss": 0.1778, "step": 5250 }, { "epoch": 4.902894491129786, "grad_norm": 2.1446545716255034, "learning_rate": 5.401791786988068e-06, "loss": 0.1909, "step": 5251 }, { "epoch": 4.903828197945845, "grad_norm": 0.5677032855861539, "learning_rate": 5.4002845329504675e-06, "loss": 0.0115, "step": 5252 }, { "epoch": 4.904761904761905, "grad_norm": 1.364778810196796, "learning_rate": 5.39877724230245e-06, "loss": 0.0857, "step": 5253 }, { "epoch": 4.905695611577965, "grad_norm": 6.461199880215163, "learning_rate": 5.397269915181875e-06, "loss": 0.2524, "step": 5254 }, { "epoch": 4.906629318394025, "grad_norm": 1.3038168297670918, "learning_rate": 5.395762551726602e-06, "loss": 0.0271, "step": 5255 }, { "epoch": 4.907563025210084, "grad_norm": 1.859607025892023, "learning_rate": 5.394255152074499e-06, "loss": 0.0925, "step": 5256 }, { "epoch": 4.908496732026144, "grad_norm": 2.1676225363936625, "learning_rate": 5.392747716363432e-06, "loss": 0.161, "step": 5257 }, { "epoch": 4.909430438842204, "grad_norm": 1.735423359952709, "learning_rate": 5.391240244731273e-06, "loss": 0.0875, "step": 5258 }, { "epoch": 4.910364145658264, "grad_norm": 0.9584919770463883, "learning_rate": 5.389732737315897e-06, "loss": 0.0525, "step": 5259 }, { "epoch": 4.911297852474323, "grad_norm": 1.3254969638243321, "learning_rate": 5.388225194255182e-06, "loss": 0.0707, "step": 5260 }, { "epoch": 4.912231559290383, "grad_norm": 4.622175607427029, "learning_rate": 5.3867176156870094e-06, "loss": 0.24, "step": 5261 }, { "epoch": 4.913165266106443, "grad_norm": 0.4936318693149093, "learning_rate": 5.385210001749265e-06, "loss": 0.0086, "step": 5262 }, { "epoch": 4.914098972922503, "grad_norm": 0.7695119659610711, "learning_rate": 5.383702352579834e-06, "loss": 0.0221, "step": 5263 }, { "epoch": 4.915032679738562, "grad_norm": 0.4836295324399577, "learning_rate": 5.38219466831661e-06, "loss": 0.019, "step": 5264 }, { "epoch": 4.915966386554622, "grad_norm": 1.2884003748025983, "learning_rate": 5.380686949097486e-06, "loss": 0.0646, "step": 5265 }, { "epoch": 4.916900093370682, "grad_norm": 3.179544235354322, "learning_rate": 5.37917919506036e-06, "loss": 0.1836, "step": 5266 }, { "epoch": 4.917833800186742, "grad_norm": 2.008720085487109, "learning_rate": 5.377671406343132e-06, "loss": 0.1088, "step": 5267 }, { "epoch": 4.918767507002801, "grad_norm": 0.3915128039860084, "learning_rate": 5.376163583083708e-06, "loss": 0.0103, "step": 5268 }, { "epoch": 4.919701213818861, "grad_norm": 1.1215685302669793, "learning_rate": 5.374655725419991e-06, "loss": 0.0576, "step": 5269 }, { "epoch": 4.920634920634921, "grad_norm": 1.8640937175816281, "learning_rate": 5.373147833489894e-06, "loss": 0.087, "step": 5270 }, { "epoch": 4.921568627450981, "grad_norm": 0.45177843415865937, "learning_rate": 5.37163990743133e-06, "loss": 0.0121, "step": 5271 }, { "epoch": 4.92250233426704, "grad_norm": 2.5819064599283266, "learning_rate": 5.370131947382215e-06, "loss": 0.0954, "step": 5272 }, { "epoch": 4.9234360410831, "grad_norm": 2.4641011542364164, "learning_rate": 5.368623953480468e-06, "loss": 0.105, "step": 5273 }, { "epoch": 4.92436974789916, "grad_norm": 1.4912568742898178, "learning_rate": 5.3671159258640125e-06, "loss": 0.0509, "step": 5274 }, { "epoch": 4.92530345471522, "grad_norm": 0.6728177434361851, "learning_rate": 5.365607864670775e-06, "loss": 0.0124, "step": 5275 }, { "epoch": 4.926237161531279, "grad_norm": 1.217403804440936, "learning_rate": 5.364099770038682e-06, "loss": 0.0927, "step": 5276 }, { "epoch": 4.927170868347339, "grad_norm": 0.6294275211013429, "learning_rate": 5.3625916421056674e-06, "loss": 0.0174, "step": 5277 }, { "epoch": 4.928104575163399, "grad_norm": 0.8055524226283814, "learning_rate": 5.3610834810096646e-06, "loss": 0.0192, "step": 5278 }, { "epoch": 4.929038281979459, "grad_norm": 2.254289529153214, "learning_rate": 5.359575286888613e-06, "loss": 0.1043, "step": 5279 }, { "epoch": 4.9299719887955185, "grad_norm": 1.1737916718218304, "learning_rate": 5.3580670598804506e-06, "loss": 0.0058, "step": 5280 }, { "epoch": 4.930905695611578, "grad_norm": 2.599387601730233, "learning_rate": 5.356558800123125e-06, "loss": 0.1455, "step": 5281 }, { "epoch": 4.931839402427638, "grad_norm": 2.3812261311966236, "learning_rate": 5.35505050775458e-06, "loss": 0.097, "step": 5282 }, { "epoch": 4.932773109243698, "grad_norm": 2.6593417361997087, "learning_rate": 5.353542182912769e-06, "loss": 0.0926, "step": 5283 }, { "epoch": 4.9337068160597575, "grad_norm": 0.9423030745045222, "learning_rate": 5.3520338257356404e-06, "loss": 0.0376, "step": 5284 }, { "epoch": 4.934640522875817, "grad_norm": 0.9459961493340838, "learning_rate": 5.350525436361154e-06, "loss": 0.0391, "step": 5285 }, { "epoch": 4.935574229691877, "grad_norm": 1.8353404230603128, "learning_rate": 5.3490170149272665e-06, "loss": 0.1214, "step": 5286 }, { "epoch": 4.936507936507937, "grad_norm": 1.977309069077685, "learning_rate": 5.3475085615719404e-06, "loss": 0.0784, "step": 5287 }, { "epoch": 4.9374416433239965, "grad_norm": 1.0525933144698694, "learning_rate": 5.34600007643314e-06, "loss": 0.0462, "step": 5288 }, { "epoch": 4.938375350140056, "grad_norm": 1.6987200335661288, "learning_rate": 5.344491559648831e-06, "loss": 0.0807, "step": 5289 }, { "epoch": 4.939309056956116, "grad_norm": 0.4677285077462791, "learning_rate": 5.342983011356988e-06, "loss": 0.0129, "step": 5290 }, { "epoch": 4.940242763772176, "grad_norm": 1.1989527907243007, "learning_rate": 5.341474431695581e-06, "loss": 0.0524, "step": 5291 }, { "epoch": 4.9411764705882355, "grad_norm": 1.9339655013217736, "learning_rate": 5.339965820802586e-06, "loss": 0.0855, "step": 5292 }, { "epoch": 4.942110177404295, "grad_norm": 2.9523618282386175, "learning_rate": 5.338457178815984e-06, "loss": 0.2109, "step": 5293 }, { "epoch": 4.943043884220355, "grad_norm": 2.470150851121129, "learning_rate": 5.336948505873757e-06, "loss": 0.1165, "step": 5294 }, { "epoch": 4.943977591036415, "grad_norm": 1.547557196043361, "learning_rate": 5.335439802113888e-06, "loss": 0.0931, "step": 5295 }, { "epoch": 4.9449112978524745, "grad_norm": 4.889202562798748, "learning_rate": 5.333931067674366e-06, "loss": 0.3429, "step": 5296 }, { "epoch": 4.945845004668534, "grad_norm": 2.36773815421399, "learning_rate": 5.3324223026931785e-06, "loss": 0.1344, "step": 5297 }, { "epoch": 4.946778711484594, "grad_norm": 1.4516511848235703, "learning_rate": 5.330913507308323e-06, "loss": 0.0313, "step": 5298 }, { "epoch": 4.947712418300654, "grad_norm": 2.7520662112648395, "learning_rate": 5.329404681657793e-06, "loss": 0.1217, "step": 5299 }, { "epoch": 4.9486461251167135, "grad_norm": 1.422885759026392, "learning_rate": 5.327895825879587e-06, "loss": 0.0733, "step": 5300 }, { "epoch": 4.949579831932773, "grad_norm": 0.5468924873372468, "learning_rate": 5.326386940111705e-06, "loss": 0.0205, "step": 5301 }, { "epoch": 4.950513538748833, "grad_norm": 2.4247131277439915, "learning_rate": 5.324878024492155e-06, "loss": 0.0876, "step": 5302 }, { "epoch": 4.951447245564893, "grad_norm": 3.7384290349720097, "learning_rate": 5.323369079158942e-06, "loss": 0.0671, "step": 5303 }, { "epoch": 4.9523809523809526, "grad_norm": 1.5211671342796167, "learning_rate": 5.321860104250074e-06, "loss": 0.1109, "step": 5304 }, { "epoch": 4.953314659197012, "grad_norm": 1.111638869403154, "learning_rate": 5.320351099903565e-06, "loss": 0.0587, "step": 5305 }, { "epoch": 4.954248366013072, "grad_norm": 0.5179953885567455, "learning_rate": 5.318842066257432e-06, "loss": 0.013, "step": 5306 }, { "epoch": 4.955182072829132, "grad_norm": 2.251637326818948, "learning_rate": 5.3173330034496875e-06, "loss": 0.0809, "step": 5307 }, { "epoch": 4.956115779645192, "grad_norm": 1.6250426570009364, "learning_rate": 5.315823911618356e-06, "loss": 0.0686, "step": 5308 }, { "epoch": 4.957049486461251, "grad_norm": 1.133214439797118, "learning_rate": 5.314314790901459e-06, "loss": 0.0553, "step": 5309 }, { "epoch": 4.957983193277311, "grad_norm": 0.5439162716896833, "learning_rate": 5.312805641437023e-06, "loss": 0.0156, "step": 5310 }, { "epoch": 4.958916900093371, "grad_norm": 0.5688091234441383, "learning_rate": 5.311296463363077e-06, "loss": 0.0104, "step": 5311 }, { "epoch": 4.959850606909431, "grad_norm": 0.8717210671560477, "learning_rate": 5.309787256817649e-06, "loss": 0.026, "step": 5312 }, { "epoch": 4.96078431372549, "grad_norm": 0.79751106942649, "learning_rate": 5.308278021938775e-06, "loss": 0.0204, "step": 5313 }, { "epoch": 4.96171802054155, "grad_norm": 1.9502969497192901, "learning_rate": 5.30676875886449e-06, "loss": 0.1136, "step": 5314 }, { "epoch": 4.96265172735761, "grad_norm": 0.11277749008654396, "learning_rate": 5.305259467732834e-06, "loss": 0.0015, "step": 5315 }, { "epoch": 4.96358543417367, "grad_norm": 0.9549168509889409, "learning_rate": 5.303750148681846e-06, "loss": 0.0433, "step": 5316 }, { "epoch": 4.964519140989729, "grad_norm": 1.0225324202818133, "learning_rate": 5.302240801849573e-06, "loss": 0.0451, "step": 5317 }, { "epoch": 4.965452847805789, "grad_norm": 1.896923353521538, "learning_rate": 5.300731427374057e-06, "loss": 0.1023, "step": 5318 }, { "epoch": 4.966386554621849, "grad_norm": 2.6723951794563607, "learning_rate": 5.299222025393352e-06, "loss": 0.121, "step": 5319 }, { "epoch": 4.967320261437909, "grad_norm": 3.6052694386687776, "learning_rate": 5.297712596045506e-06, "loss": 0.1275, "step": 5320 }, { "epoch": 4.968253968253968, "grad_norm": 1.2939823623146676, "learning_rate": 5.296203139468572e-06, "loss": 0.0701, "step": 5321 }, { "epoch": 4.969187675070028, "grad_norm": 0.559642508085657, "learning_rate": 5.294693655800609e-06, "loss": 0.0057, "step": 5322 }, { "epoch": 4.970121381886088, "grad_norm": 1.963064563751447, "learning_rate": 5.293184145179675e-06, "loss": 0.1215, "step": 5323 }, { "epoch": 4.971055088702148, "grad_norm": 0.6395581837893053, "learning_rate": 5.291674607743831e-06, "loss": 0.0275, "step": 5324 }, { "epoch": 4.971988795518207, "grad_norm": 2.3892421075647667, "learning_rate": 5.2901650436311405e-06, "loss": 0.1263, "step": 5325 }, { "epoch": 4.972922502334267, "grad_norm": 1.6428273064484358, "learning_rate": 5.288655452979671e-06, "loss": 0.0651, "step": 5326 }, { "epoch": 4.973856209150327, "grad_norm": 0.69702892457463, "learning_rate": 5.287145835927489e-06, "loss": 0.0161, "step": 5327 }, { "epoch": 4.974789915966387, "grad_norm": 1.9612612389611472, "learning_rate": 5.285636192612669e-06, "loss": 0.1328, "step": 5328 }, { "epoch": 4.975723622782446, "grad_norm": 0.9351401956165347, "learning_rate": 5.28412652317328e-06, "loss": 0.0245, "step": 5329 }, { "epoch": 4.976657329598506, "grad_norm": 1.8260672832703082, "learning_rate": 5.282616827747402e-06, "loss": 0.0716, "step": 5330 }, { "epoch": 4.977591036414566, "grad_norm": 0.6625406969350951, "learning_rate": 5.28110710647311e-06, "loss": 0.0229, "step": 5331 }, { "epoch": 4.978524743230626, "grad_norm": 1.2352982225821763, "learning_rate": 5.279597359488486e-06, "loss": 0.065, "step": 5332 }, { "epoch": 4.979458450046685, "grad_norm": 1.0966741183035489, "learning_rate": 5.278087586931614e-06, "loss": 0.0215, "step": 5333 }, { "epoch": 4.980392156862745, "grad_norm": 0.7824851331093283, "learning_rate": 5.2765777889405775e-06, "loss": 0.0229, "step": 5334 }, { "epoch": 4.981325863678805, "grad_norm": 0.7155111184079996, "learning_rate": 5.275067965653465e-06, "loss": 0.0196, "step": 5335 }, { "epoch": 4.982259570494865, "grad_norm": 0.2582294675431543, "learning_rate": 5.273558117208367e-06, "loss": 0.0046, "step": 5336 }, { "epoch": 4.983193277310924, "grad_norm": 1.1051856627836805, "learning_rate": 5.272048243743375e-06, "loss": 0.0316, "step": 5337 }, { "epoch": 4.984126984126984, "grad_norm": 1.087345207047393, "learning_rate": 5.270538345396582e-06, "loss": 0.0541, "step": 5338 }, { "epoch": 4.985060690943044, "grad_norm": 2.430784541516399, "learning_rate": 5.269028422306087e-06, "loss": 0.1686, "step": 5339 }, { "epoch": 4.985994397759104, "grad_norm": 0.6049439662923273, "learning_rate": 5.267518474609988e-06, "loss": 0.01, "step": 5340 }, { "epoch": 4.9869281045751634, "grad_norm": 1.3186652278131286, "learning_rate": 5.266008502446387e-06, "loss": 0.0261, "step": 5341 }, { "epoch": 4.987861811391223, "grad_norm": 1.7391229987880146, "learning_rate": 5.264498505953387e-06, "loss": 0.0958, "step": 5342 }, { "epoch": 4.988795518207283, "grad_norm": 3.2923653666822155, "learning_rate": 5.262988485269094e-06, "loss": 0.1961, "step": 5343 }, { "epoch": 4.989729225023343, "grad_norm": 1.3873382090803876, "learning_rate": 5.261478440531616e-06, "loss": 0.0955, "step": 5344 }, { "epoch": 4.9906629318394025, "grad_norm": 2.5869228491043654, "learning_rate": 5.259968371879062e-06, "loss": 0.0251, "step": 5345 }, { "epoch": 4.991596638655462, "grad_norm": 1.843844572181294, "learning_rate": 5.258458279449546e-06, "loss": 0.0791, "step": 5346 }, { "epoch": 4.992530345471522, "grad_norm": 2.7697629886844743, "learning_rate": 5.256948163381182e-06, "loss": 0.1032, "step": 5347 }, { "epoch": 4.993464052287582, "grad_norm": 1.453869701818874, "learning_rate": 5.255438023812087e-06, "loss": 0.0528, "step": 5348 }, { "epoch": 4.9943977591036415, "grad_norm": 1.2433743677573317, "learning_rate": 5.2539278608803786e-06, "loss": 0.0543, "step": 5349 }, { "epoch": 4.995331465919701, "grad_norm": 1.106832348972742, "learning_rate": 5.2524176747241796e-06, "loss": 0.0349, "step": 5350 }, { "epoch": 4.996265172735761, "grad_norm": 1.4074350637398332, "learning_rate": 5.250907465481611e-06, "loss": 0.028, "step": 5351 }, { "epoch": 4.997198879551821, "grad_norm": 0.8913702757189622, "learning_rate": 5.249397233290801e-06, "loss": 0.0342, "step": 5352 }, { "epoch": 4.9981325863678805, "grad_norm": 2.551440554716456, "learning_rate": 5.247886978289874e-06, "loss": 0.0727, "step": 5353 }, { "epoch": 4.99906629318394, "grad_norm": 2.6050302570973556, "learning_rate": 5.246376700616962e-06, "loss": 0.1792, "step": 5354 }, { "epoch": 5.0, "grad_norm": 1.565844409212473, "learning_rate": 5.244866400410193e-06, "loss": 0.0824, "step": 5355 }, { "epoch": 5.00093370681606, "grad_norm": 2.0612759461622603, "learning_rate": 5.243356077807704e-06, "loss": 0.0497, "step": 5356 }, { "epoch": 5.0018674136321195, "grad_norm": 1.6123349158975968, "learning_rate": 5.241845732947628e-06, "loss": 0.0899, "step": 5357 }, { "epoch": 5.002801120448179, "grad_norm": 1.4108379599758742, "learning_rate": 5.240335365968104e-06, "loss": 0.053, "step": 5358 }, { "epoch": 5.003734827264239, "grad_norm": 1.5681409805958957, "learning_rate": 5.238824977007272e-06, "loss": 0.109, "step": 5359 }, { "epoch": 5.004668534080299, "grad_norm": 1.2448447122903976, "learning_rate": 5.237314566203273e-06, "loss": 0.0298, "step": 5360 }, { "epoch": 5.0056022408963585, "grad_norm": 2.099988723735129, "learning_rate": 5.23580413369425e-06, "loss": 0.0737, "step": 5361 }, { "epoch": 5.006535947712418, "grad_norm": 2.8694386411083808, "learning_rate": 5.234293679618351e-06, "loss": 0.1243, "step": 5362 }, { "epoch": 5.007469654528478, "grad_norm": 3.4143432826013913, "learning_rate": 5.23278320411372e-06, "loss": 0.1935, "step": 5363 }, { "epoch": 5.008403361344538, "grad_norm": 0.7441903026020187, "learning_rate": 5.23127270731851e-06, "loss": 0.0287, "step": 5364 }, { "epoch": 5.0093370681605975, "grad_norm": 1.7221658291800996, "learning_rate": 5.229762189370869e-06, "loss": 0.0886, "step": 5365 }, { "epoch": 5.010270774976657, "grad_norm": 1.1448784311924758, "learning_rate": 5.228251650408955e-06, "loss": 0.0351, "step": 5366 }, { "epoch": 5.011204481792717, "grad_norm": 0.610358955192021, "learning_rate": 5.226741090570919e-06, "loss": 0.0187, "step": 5367 }, { "epoch": 5.012138188608777, "grad_norm": 1.3945220746355682, "learning_rate": 5.2252305099949215e-06, "loss": 0.079, "step": 5368 }, { "epoch": 5.0130718954248366, "grad_norm": 2.123737642093687, "learning_rate": 5.223719908819122e-06, "loss": 0.1503, "step": 5369 }, { "epoch": 5.014005602240896, "grad_norm": 2.2253302538320665, "learning_rate": 5.222209287181677e-06, "loss": 0.1521, "step": 5370 }, { "epoch": 5.014939309056956, "grad_norm": 1.8793638315215422, "learning_rate": 5.220698645220753e-06, "loss": 0.0284, "step": 5371 }, { "epoch": 5.015873015873016, "grad_norm": 4.045207682653771, "learning_rate": 5.219187983074514e-06, "loss": 0.1398, "step": 5372 }, { "epoch": 5.016806722689076, "grad_norm": 1.9872430179382419, "learning_rate": 5.217677300881128e-06, "loss": 0.1048, "step": 5373 }, { "epoch": 5.017740429505135, "grad_norm": 1.9501247997940456, "learning_rate": 5.2161665987787616e-06, "loss": 0.1409, "step": 5374 }, { "epoch": 5.018674136321195, "grad_norm": 1.4155380529723038, "learning_rate": 5.214655876905586e-06, "loss": 0.023, "step": 5375 }, { "epoch": 5.019607843137255, "grad_norm": 0.7275353503067911, "learning_rate": 5.213145135399772e-06, "loss": 0.0205, "step": 5376 }, { "epoch": 5.020541549953315, "grad_norm": 5.44073687663303, "learning_rate": 5.211634374399496e-06, "loss": 0.1263, "step": 5377 }, { "epoch": 5.021475256769374, "grad_norm": 0.6985379733717848, "learning_rate": 5.210123594042932e-06, "loss": 0.0063, "step": 5378 }, { "epoch": 5.022408963585434, "grad_norm": 0.9227635830881548, "learning_rate": 5.208612794468258e-06, "loss": 0.0367, "step": 5379 }, { "epoch": 5.023342670401494, "grad_norm": 0.7607155925502851, "learning_rate": 5.207101975813651e-06, "loss": 0.0305, "step": 5380 }, { "epoch": 5.024276377217554, "grad_norm": 1.3823805597326964, "learning_rate": 5.2055911382172966e-06, "loss": 0.0326, "step": 5381 }, { "epoch": 5.025210084033613, "grad_norm": 0.8309887111887991, "learning_rate": 5.204080281817372e-06, "loss": 0.0381, "step": 5382 }, { "epoch": 5.026143790849673, "grad_norm": 3.5662820331544403, "learning_rate": 5.202569406752066e-06, "loss": 0.0312, "step": 5383 }, { "epoch": 5.027077497665733, "grad_norm": 1.3671292443374645, "learning_rate": 5.201058513159564e-06, "loss": 0.0583, "step": 5384 }, { "epoch": 5.028011204481793, "grad_norm": 0.28299705895180977, "learning_rate": 5.199547601178051e-06, "loss": 0.0091, "step": 5385 }, { "epoch": 5.028944911297852, "grad_norm": 0.39372319499154673, "learning_rate": 5.198036670945719e-06, "loss": 0.0143, "step": 5386 }, { "epoch": 5.029878618113912, "grad_norm": 1.496918257666198, "learning_rate": 5.196525722600759e-06, "loss": 0.0826, "step": 5387 }, { "epoch": 5.030812324929972, "grad_norm": 1.6191048852799783, "learning_rate": 5.195014756281365e-06, "loss": 0.0793, "step": 5388 }, { "epoch": 5.031746031746032, "grad_norm": 4.745885715774044, "learning_rate": 5.193503772125728e-06, "loss": 0.2785, "step": 5389 }, { "epoch": 5.032679738562091, "grad_norm": 1.4188272478444472, "learning_rate": 5.191992770272046e-06, "loss": 0.0264, "step": 5390 }, { "epoch": 5.033613445378151, "grad_norm": 1.0464979268530166, "learning_rate": 5.190481750858517e-06, "loss": 0.0568, "step": 5391 }, { "epoch": 5.034547152194211, "grad_norm": 2.0697659576870437, "learning_rate": 5.18897071402334e-06, "loss": 0.0698, "step": 5392 }, { "epoch": 5.035480859010271, "grad_norm": 3.6643014278467527, "learning_rate": 5.187459659904715e-06, "loss": 0.1315, "step": 5393 }, { "epoch": 5.03641456582633, "grad_norm": 2.2943694863291872, "learning_rate": 5.185948588640848e-06, "loss": 0.0728, "step": 5394 }, { "epoch": 5.03734827264239, "grad_norm": 2.7381455713569, "learning_rate": 5.184437500369937e-06, "loss": 0.1158, "step": 5395 }, { "epoch": 5.03828197945845, "grad_norm": 5.1811731904836575, "learning_rate": 5.182926395230195e-06, "loss": 0.1525, "step": 5396 }, { "epoch": 5.03921568627451, "grad_norm": 2.0351913644052333, "learning_rate": 5.181415273359822e-06, "loss": 0.1019, "step": 5397 }, { "epoch": 5.040149393090569, "grad_norm": 1.901155213947466, "learning_rate": 5.179904134897032e-06, "loss": 0.0604, "step": 5398 }, { "epoch": 5.041083099906629, "grad_norm": 0.6000343100192707, "learning_rate": 5.178392979980032e-06, "loss": 0.0161, "step": 5399 }, { "epoch": 5.042016806722689, "grad_norm": 1.4621009922984345, "learning_rate": 5.176881808747035e-06, "loss": 0.0722, "step": 5400 }, { "epoch": 5.042950513538749, "grad_norm": 1.2082685180236656, "learning_rate": 5.175370621336253e-06, "loss": 0.0578, "step": 5401 }, { "epoch": 5.043884220354808, "grad_norm": 0.5521056516456605, "learning_rate": 5.1738594178859035e-06, "loss": 0.0339, "step": 5402 }, { "epoch": 5.044817927170868, "grad_norm": 0.3451816088342854, "learning_rate": 5.172348198534202e-06, "loss": 0.014, "step": 5403 }, { "epoch": 5.045751633986928, "grad_norm": 3.727895928138171, "learning_rate": 5.170836963419363e-06, "loss": 0.1711, "step": 5404 }, { "epoch": 5.046685340802988, "grad_norm": 2.280477091411966, "learning_rate": 5.169325712679608e-06, "loss": 0.1467, "step": 5405 }, { "epoch": 5.0476190476190474, "grad_norm": 0.6783558510828963, "learning_rate": 5.167814446453157e-06, "loss": 0.0236, "step": 5406 }, { "epoch": 5.048552754435107, "grad_norm": 0.6282775588067089, "learning_rate": 5.166303164878232e-06, "loss": 0.0232, "step": 5407 }, { "epoch": 5.049486461251167, "grad_norm": 6.217438178918744, "learning_rate": 5.164791868093056e-06, "loss": 0.1434, "step": 5408 }, { "epoch": 5.050420168067227, "grad_norm": 0.2997066428113594, "learning_rate": 5.163280556235853e-06, "loss": 0.0031, "step": 5409 }, { "epoch": 5.0513538748832865, "grad_norm": 0.36422677724081753, "learning_rate": 5.161769229444851e-06, "loss": 0.0083, "step": 5410 }, { "epoch": 5.052287581699346, "grad_norm": 2.3882529999932447, "learning_rate": 5.160257887858278e-06, "loss": 0.0989, "step": 5411 }, { "epoch": 5.053221288515406, "grad_norm": 4.630806920030181, "learning_rate": 5.158746531614358e-06, "loss": 0.2226, "step": 5412 }, { "epoch": 5.054154995331466, "grad_norm": 0.8185309499830437, "learning_rate": 5.157235160851327e-06, "loss": 0.0399, "step": 5413 }, { "epoch": 5.0550887021475255, "grad_norm": 1.4492521768853694, "learning_rate": 5.155723775707412e-06, "loss": 0.078, "step": 5414 }, { "epoch": 5.056022408963585, "grad_norm": 1.8351381020718878, "learning_rate": 5.154212376320848e-06, "loss": 0.1367, "step": 5415 }, { "epoch": 5.056956115779645, "grad_norm": 2.6979774450632914, "learning_rate": 5.152700962829868e-06, "loss": 0.1276, "step": 5416 }, { "epoch": 5.057889822595705, "grad_norm": 1.0256447685258554, "learning_rate": 5.15118953537271e-06, "loss": 0.043, "step": 5417 }, { "epoch": 5.0588235294117645, "grad_norm": 2.337772779518335, "learning_rate": 5.1496780940876065e-06, "loss": 0.072, "step": 5418 }, { "epoch": 5.059757236227824, "grad_norm": 2.6876740834914283, "learning_rate": 5.148166639112799e-06, "loss": 0.155, "step": 5419 }, { "epoch": 5.060690943043884, "grad_norm": 1.2449913950260556, "learning_rate": 5.1466551705865254e-06, "loss": 0.0871, "step": 5420 }, { "epoch": 5.061624649859944, "grad_norm": 2.182010938048928, "learning_rate": 5.145143688647025e-06, "loss": 0.083, "step": 5421 }, { "epoch": 5.0625583566760035, "grad_norm": 2.4981257736639395, "learning_rate": 5.14363219343254e-06, "loss": 0.1835, "step": 5422 }, { "epoch": 5.063492063492063, "grad_norm": 3.1455385043022215, "learning_rate": 5.142120685081313e-06, "loss": 0.178, "step": 5423 }, { "epoch": 5.064425770308123, "grad_norm": 1.082210312222245, "learning_rate": 5.1406091637315915e-06, "loss": 0.0233, "step": 5424 }, { "epoch": 5.065359477124183, "grad_norm": 3.504774371839283, "learning_rate": 5.139097629521614e-06, "loss": 0.0821, "step": 5425 }, { "epoch": 5.0662931839402425, "grad_norm": 1.5954508013924944, "learning_rate": 5.137586082589634e-06, "loss": 0.0688, "step": 5426 }, { "epoch": 5.067226890756302, "grad_norm": 1.369285277566803, "learning_rate": 5.136074523073894e-06, "loss": 0.0435, "step": 5427 }, { "epoch": 5.068160597572362, "grad_norm": 2.1763955372101305, "learning_rate": 5.134562951112645e-06, "loss": 0.0848, "step": 5428 }, { "epoch": 5.069094304388422, "grad_norm": 2.6554323178961288, "learning_rate": 5.133051366844136e-06, "loss": 0.1064, "step": 5429 }, { "epoch": 5.0700280112044815, "grad_norm": 1.3587756856511157, "learning_rate": 5.131539770406618e-06, "loss": 0.0856, "step": 5430 }, { "epoch": 5.070961718020541, "grad_norm": 2.0037198960161997, "learning_rate": 5.130028161938344e-06, "loss": 0.0947, "step": 5431 }, { "epoch": 5.071895424836601, "grad_norm": 2.932900062473239, "learning_rate": 5.128516541577567e-06, "loss": 0.1648, "step": 5432 }, { "epoch": 5.072829131652661, "grad_norm": 0.674388477808172, "learning_rate": 5.127004909462541e-06, "loss": 0.0188, "step": 5433 }, { "epoch": 5.073762838468721, "grad_norm": 0.807804538674159, "learning_rate": 5.1254932657315204e-06, "loss": 0.0271, "step": 5434 }, { "epoch": 5.07469654528478, "grad_norm": 0.6514320235340056, "learning_rate": 5.123981610522764e-06, "loss": 0.023, "step": 5435 }, { "epoch": 5.07563025210084, "grad_norm": 1.169384227309827, "learning_rate": 5.122469943974525e-06, "loss": 0.04, "step": 5436 }, { "epoch": 5.0765639589169, "grad_norm": 1.6097247632937983, "learning_rate": 5.120958266225067e-06, "loss": 0.0914, "step": 5437 }, { "epoch": 5.07749766573296, "grad_norm": 3.2972111421906996, "learning_rate": 5.119446577412646e-06, "loss": 0.134, "step": 5438 }, { "epoch": 5.078431372549019, "grad_norm": 4.769379533036258, "learning_rate": 5.117934877675525e-06, "loss": 0.0736, "step": 5439 }, { "epoch": 5.079365079365079, "grad_norm": 0.4313352917009008, "learning_rate": 5.1164231671519625e-06, "loss": 0.0138, "step": 5440 }, { "epoch": 5.080298786181139, "grad_norm": 3.9690910225793443, "learning_rate": 5.114911445980224e-06, "loss": 0.105, "step": 5441 }, { "epoch": 5.081232492997199, "grad_norm": 0.6224206966768873, "learning_rate": 5.113399714298571e-06, "loss": 0.021, "step": 5442 }, { "epoch": 5.082166199813258, "grad_norm": 0.8869952453799458, "learning_rate": 5.1118879722452695e-06, "loss": 0.0227, "step": 5443 }, { "epoch": 5.083099906629318, "grad_norm": 0.724206195045238, "learning_rate": 5.110376219958583e-06, "loss": 0.0191, "step": 5444 }, { "epoch": 5.084033613445378, "grad_norm": 2.681580491861519, "learning_rate": 5.1088644575767825e-06, "loss": 0.1318, "step": 5445 }, { "epoch": 5.084967320261438, "grad_norm": 2.7382359730118093, "learning_rate": 5.107352685238129e-06, "loss": 0.1557, "step": 5446 }, { "epoch": 5.085901027077497, "grad_norm": 3.039370273886976, "learning_rate": 5.1058409030808935e-06, "loss": 0.1516, "step": 5447 }, { "epoch": 5.086834733893557, "grad_norm": 0.709864661612812, "learning_rate": 5.104329111243345e-06, "loss": 0.0226, "step": 5448 }, { "epoch": 5.087768440709617, "grad_norm": 0.6169926437766339, "learning_rate": 5.102817309863753e-06, "loss": 0.012, "step": 5449 }, { "epoch": 5.088702147525677, "grad_norm": 0.5146386293264482, "learning_rate": 5.10130549908039e-06, "loss": 0.0169, "step": 5450 }, { "epoch": 5.089635854341736, "grad_norm": 2.5957399589921586, "learning_rate": 5.099793679031527e-06, "loss": 0.1404, "step": 5451 }, { "epoch": 5.090569561157796, "grad_norm": 0.2811589156258597, "learning_rate": 5.098281849855435e-06, "loss": 0.0021, "step": 5452 }, { "epoch": 5.091503267973856, "grad_norm": 1.972712658280943, "learning_rate": 5.096770011690389e-06, "loss": 0.1134, "step": 5453 }, { "epoch": 5.092436974789916, "grad_norm": 1.9635833979270332, "learning_rate": 5.095258164674664e-06, "loss": 0.1233, "step": 5454 }, { "epoch": 5.093370681605975, "grad_norm": 3.4164183820894243, "learning_rate": 5.093746308946531e-06, "loss": 0.2063, "step": 5455 }, { "epoch": 5.094304388422035, "grad_norm": 3.0948679696307098, "learning_rate": 5.09223444464427e-06, "loss": 0.17, "step": 5456 }, { "epoch": 5.095238095238095, "grad_norm": 2.2326996034545927, "learning_rate": 5.090722571906156e-06, "loss": 0.1211, "step": 5457 }, { "epoch": 5.096171802054155, "grad_norm": 0.45925232670893856, "learning_rate": 5.089210690870466e-06, "loss": 0.016, "step": 5458 }, { "epoch": 5.097105508870214, "grad_norm": 2.925284716615495, "learning_rate": 5.087698801675479e-06, "loss": 0.1814, "step": 5459 }, { "epoch": 5.098039215686274, "grad_norm": 2.5268530215659952, "learning_rate": 5.0861869044594735e-06, "loss": 0.1086, "step": 5460 }, { "epoch": 5.098972922502334, "grad_norm": 1.556306128134824, "learning_rate": 5.0846749993607296e-06, "loss": 0.0694, "step": 5461 }, { "epoch": 5.099906629318394, "grad_norm": 0.9327337787445894, "learning_rate": 5.083163086517526e-06, "loss": 0.05, "step": 5462 }, { "epoch": 5.100840336134453, "grad_norm": 0.40478998764706187, "learning_rate": 5.081651166068145e-06, "loss": 0.0068, "step": 5463 }, { "epoch": 5.101774042950513, "grad_norm": 2.1682488148976056, "learning_rate": 5.080139238150869e-06, "loss": 0.1257, "step": 5464 }, { "epoch": 5.102707749766573, "grad_norm": 2.14465857366685, "learning_rate": 5.078627302903979e-06, "loss": 0.1419, "step": 5465 }, { "epoch": 5.103641456582633, "grad_norm": 1.3067661623839721, "learning_rate": 5.0771153604657596e-06, "loss": 0.0632, "step": 5466 }, { "epoch": 5.104575163398692, "grad_norm": 1.196894826343126, "learning_rate": 5.075603410974492e-06, "loss": 0.0742, "step": 5467 }, { "epoch": 5.105508870214752, "grad_norm": 1.77248594770474, "learning_rate": 5.074091454568464e-06, "loss": 0.0918, "step": 5468 }, { "epoch": 5.106442577030812, "grad_norm": 1.1054379134111931, "learning_rate": 5.072579491385958e-06, "loss": 0.0684, "step": 5469 }, { "epoch": 5.107376283846872, "grad_norm": 3.515990321798876, "learning_rate": 5.07106752156526e-06, "loss": 0.1233, "step": 5470 }, { "epoch": 5.1083099906629315, "grad_norm": 0.896406975616322, "learning_rate": 5.0695555452446575e-06, "loss": 0.0265, "step": 5471 }, { "epoch": 5.109243697478991, "grad_norm": 0.9373766277697185, "learning_rate": 5.068043562562435e-06, "loss": 0.0367, "step": 5472 }, { "epoch": 5.110177404295051, "grad_norm": 1.1728210957052065, "learning_rate": 5.066531573656883e-06, "loss": 0.0489, "step": 5473 }, { "epoch": 5.111111111111111, "grad_norm": 0.1755083675160797, "learning_rate": 5.065019578666287e-06, "loss": 0.0055, "step": 5474 }, { "epoch": 5.1120448179271705, "grad_norm": 2.3439016297438497, "learning_rate": 5.063507577728938e-06, "loss": 0.1189, "step": 5475 }, { "epoch": 5.11297852474323, "grad_norm": 3.332725536889722, "learning_rate": 5.061995570983121e-06, "loss": 0.1215, "step": 5476 }, { "epoch": 5.11391223155929, "grad_norm": 0.7092402229804814, "learning_rate": 5.060483558567131e-06, "loss": 0.0247, "step": 5477 }, { "epoch": 5.11484593837535, "grad_norm": 1.3888931346088504, "learning_rate": 5.058971540619253e-06, "loss": 0.0583, "step": 5478 }, { "epoch": 5.1157796451914095, "grad_norm": 2.481049962757426, "learning_rate": 5.057459517277782e-06, "loss": 0.1407, "step": 5479 }, { "epoch": 5.116713352007469, "grad_norm": 0.8681346067166333, "learning_rate": 5.055947488681005e-06, "loss": 0.0048, "step": 5480 }, { "epoch": 5.117647058823529, "grad_norm": 1.7770788743229744, "learning_rate": 5.054435454967217e-06, "loss": 0.0082, "step": 5481 }, { "epoch": 5.118580765639589, "grad_norm": 1.849335973176322, "learning_rate": 5.052923416274709e-06, "loss": 0.0831, "step": 5482 }, { "epoch": 5.1195144724556485, "grad_norm": 2.9994739756422457, "learning_rate": 5.051411372741774e-06, "loss": 0.1306, "step": 5483 }, { "epoch": 5.120448179271708, "grad_norm": 2.0371079929395113, "learning_rate": 5.049899324506703e-06, "loss": 0.0801, "step": 5484 }, { "epoch": 5.121381886087768, "grad_norm": 1.537114089868634, "learning_rate": 5.048387271707791e-06, "loss": 0.0848, "step": 5485 }, { "epoch": 5.122315592903828, "grad_norm": 2.18394649715092, "learning_rate": 5.046875214483332e-06, "loss": 0.1368, "step": 5486 }, { "epoch": 5.1232492997198875, "grad_norm": 0.3037719468928947, "learning_rate": 5.04536315297162e-06, "loss": 0.0076, "step": 5487 }, { "epoch": 5.124183006535947, "grad_norm": 3.76759540025749, "learning_rate": 5.043851087310949e-06, "loss": 0.2084, "step": 5488 }, { "epoch": 5.125116713352007, "grad_norm": 2.4148557067689, "learning_rate": 5.042339017639614e-06, "loss": 0.1657, "step": 5489 }, { "epoch": 5.126050420168067, "grad_norm": 2.3553822646139224, "learning_rate": 5.0408269440959125e-06, "loss": 0.0786, "step": 5490 }, { "epoch": 5.1269841269841265, "grad_norm": 1.2576664764492478, "learning_rate": 5.039314866818137e-06, "loss": 0.088, "step": 5491 }, { "epoch": 5.127917833800186, "grad_norm": 0.9138232598390748, "learning_rate": 5.0378027859445854e-06, "loss": 0.0276, "step": 5492 }, { "epoch": 5.128851540616246, "grad_norm": 0.6348917929854871, "learning_rate": 5.036290701613555e-06, "loss": 0.0338, "step": 5493 }, { "epoch": 5.129785247432307, "grad_norm": 0.5320300497692539, "learning_rate": 5.034778613963341e-06, "loss": 0.0152, "step": 5494 }, { "epoch": 5.130718954248366, "grad_norm": 3.553520075046549, "learning_rate": 5.033266523132238e-06, "loss": 0.1563, "step": 5495 }, { "epoch": 5.131652661064426, "grad_norm": 1.5407851459869062, "learning_rate": 5.03175442925855e-06, "loss": 0.0639, "step": 5496 }, { "epoch": 5.132586367880486, "grad_norm": 1.0197121816815373, "learning_rate": 5.030242332480568e-06, "loss": 0.0326, "step": 5497 }, { "epoch": 5.133520074696546, "grad_norm": 0.9011219692907404, "learning_rate": 5.028730232936594e-06, "loss": 0.01, "step": 5498 }, { "epoch": 5.1344537815126055, "grad_norm": 0.18194396967683707, "learning_rate": 5.027218130764925e-06, "loss": 0.003, "step": 5499 }, { "epoch": 5.135387488328665, "grad_norm": 0.9381730467854499, "learning_rate": 5.025706026103859e-06, "loss": 0.04, "step": 5500 }, { "epoch": 5.136321195144725, "grad_norm": 2.0357321289715657, "learning_rate": 5.0241939190916945e-06, "loss": 0.1027, "step": 5501 }, { "epoch": 5.137254901960785, "grad_norm": 0.5287440769506431, "learning_rate": 5.022681809866729e-06, "loss": 0.0171, "step": 5502 }, { "epoch": 5.1381886087768445, "grad_norm": 1.041163919752738, "learning_rate": 5.021169698567264e-06, "loss": 0.043, "step": 5503 }, { "epoch": 5.139122315592904, "grad_norm": 2.5554700192034354, "learning_rate": 5.0196575853315975e-06, "loss": 0.0942, "step": 5504 }, { "epoch": 5.140056022408964, "grad_norm": 0.6173511490058754, "learning_rate": 5.018145470298029e-06, "loss": 0.0223, "step": 5505 }, { "epoch": 5.140989729225024, "grad_norm": 4.11778786431677, "learning_rate": 5.016633353604858e-06, "loss": 0.1952, "step": 5506 }, { "epoch": 5.1419234360410835, "grad_norm": 1.3967507086578121, "learning_rate": 5.0151212353903845e-06, "loss": 0.0359, "step": 5507 }, { "epoch": 5.142857142857143, "grad_norm": 0.9957388627047363, "learning_rate": 5.0136091157929066e-06, "loss": 0.0425, "step": 5508 }, { "epoch": 5.143790849673203, "grad_norm": 5.928858896921469, "learning_rate": 5.012096994950727e-06, "loss": 0.0892, "step": 5509 }, { "epoch": 5.144724556489263, "grad_norm": 3.17343864139665, "learning_rate": 5.0105848730021434e-06, "loss": 0.1309, "step": 5510 }, { "epoch": 5.1456582633053225, "grad_norm": 0.6336989704877346, "learning_rate": 5.0090727500854585e-06, "loss": 0.026, "step": 5511 }, { "epoch": 5.146591970121382, "grad_norm": 8.056118536087606, "learning_rate": 5.00756062633897e-06, "loss": 0.2731, "step": 5512 }, { "epoch": 5.147525676937442, "grad_norm": 2.749653595167315, "learning_rate": 5.0060485019009784e-06, "loss": 0.0823, "step": 5513 }, { "epoch": 5.148459383753502, "grad_norm": 1.7909424914537257, "learning_rate": 5.004536376909785e-06, "loss": 0.1129, "step": 5514 }, { "epoch": 5.1493930905695615, "grad_norm": 0.6127276020742104, "learning_rate": 5.003024251503692e-06, "loss": 0.0173, "step": 5515 }, { "epoch": 5.150326797385621, "grad_norm": 0.28702841370276533, "learning_rate": 5.001512125820996e-06, "loss": 0.0086, "step": 5516 }, { "epoch": 5.151260504201681, "grad_norm": 0.875505910787842, "learning_rate": 5e-06, "loss": 0.0188, "step": 5517 }, { "epoch": 5.152194211017741, "grad_norm": 5.035967426597356, "learning_rate": 4.9984878741790055e-06, "loss": 0.1239, "step": 5518 }, { "epoch": 5.1531279178338005, "grad_norm": 0.7439022113745272, "learning_rate": 4.996975748496311e-06, "loss": 0.0271, "step": 5519 }, { "epoch": 5.15406162464986, "grad_norm": 1.7568954449367589, "learning_rate": 4.995463623090215e-06, "loss": 0.1233, "step": 5520 }, { "epoch": 5.15499533146592, "grad_norm": 1.3453174073650123, "learning_rate": 4.993951498099022e-06, "loss": 0.0834, "step": 5521 }, { "epoch": 5.15592903828198, "grad_norm": 2.730351101039853, "learning_rate": 4.992439373661032e-06, "loss": 0.1692, "step": 5522 }, { "epoch": 5.1568627450980395, "grad_norm": 0.7134445467628082, "learning_rate": 4.990927249914544e-06, "loss": 0.0261, "step": 5523 }, { "epoch": 5.157796451914099, "grad_norm": 3.877308144400159, "learning_rate": 4.9894151269978565e-06, "loss": 0.0762, "step": 5524 }, { "epoch": 5.158730158730159, "grad_norm": 1.2569308734313847, "learning_rate": 4.9879030050492735e-06, "loss": 0.0434, "step": 5525 }, { "epoch": 5.159663865546219, "grad_norm": 1.7636024616053843, "learning_rate": 4.986390884207094e-06, "loss": 0.0877, "step": 5526 }, { "epoch": 5.160597572362279, "grad_norm": 3.1968039461193816, "learning_rate": 4.984878764609618e-06, "loss": 0.0157, "step": 5527 }, { "epoch": 5.161531279178338, "grad_norm": 1.6794270103744318, "learning_rate": 4.9833666463951425e-06, "loss": 0.0543, "step": 5528 }, { "epoch": 5.162464985994398, "grad_norm": 1.9886231426239755, "learning_rate": 4.981854529701972e-06, "loss": 0.0846, "step": 5529 }, { "epoch": 5.163398692810458, "grad_norm": 0.9277175176301795, "learning_rate": 4.980342414668404e-06, "loss": 0.0447, "step": 5530 }, { "epoch": 5.164332399626518, "grad_norm": 0.3421760397812874, "learning_rate": 4.978830301432738e-06, "loss": 0.0066, "step": 5531 }, { "epoch": 5.165266106442577, "grad_norm": 0.5416192917366407, "learning_rate": 4.977318190133271e-06, "loss": 0.0103, "step": 5532 }, { "epoch": 5.166199813258637, "grad_norm": 1.8377764687555367, "learning_rate": 4.975806080908307e-06, "loss": 0.0708, "step": 5533 }, { "epoch": 5.167133520074697, "grad_norm": 1.1153962502085457, "learning_rate": 4.974293973896143e-06, "loss": 0.0624, "step": 5534 }, { "epoch": 5.168067226890757, "grad_norm": 1.563555345379363, "learning_rate": 4.972781869235076e-06, "loss": 0.0532, "step": 5535 }, { "epoch": 5.169000933706816, "grad_norm": 0.784203042535619, "learning_rate": 4.9712697670634075e-06, "loss": 0.0311, "step": 5536 }, { "epoch": 5.169934640522876, "grad_norm": 2.340068197240273, "learning_rate": 4.969757667519433e-06, "loss": 0.0426, "step": 5537 }, { "epoch": 5.170868347338936, "grad_norm": 2.0667906457163143, "learning_rate": 4.968245570741451e-06, "loss": 0.0968, "step": 5538 }, { "epoch": 5.171802054154996, "grad_norm": 3.8833815996739025, "learning_rate": 4.9667334768677625e-06, "loss": 0.0301, "step": 5539 }, { "epoch": 5.172735760971055, "grad_norm": 0.41499322340786876, "learning_rate": 4.965221386036663e-06, "loss": 0.016, "step": 5540 }, { "epoch": 5.173669467787115, "grad_norm": 1.7324278924571814, "learning_rate": 4.963709298386448e-06, "loss": 0.0832, "step": 5541 }, { "epoch": 5.174603174603175, "grad_norm": 1.7279554944684163, "learning_rate": 4.962197214055416e-06, "loss": 0.0977, "step": 5542 }, { "epoch": 5.175536881419235, "grad_norm": 5.278610716014506, "learning_rate": 4.960685133181865e-06, "loss": 0.2384, "step": 5543 }, { "epoch": 5.176470588235294, "grad_norm": 0.2528451849542635, "learning_rate": 4.959173055904091e-06, "loss": 0.0038, "step": 5544 }, { "epoch": 5.177404295051354, "grad_norm": 3.691097288768061, "learning_rate": 4.957660982360386e-06, "loss": 0.118, "step": 5545 }, { "epoch": 5.178338001867414, "grad_norm": 2.1381680688571385, "learning_rate": 4.956148912689053e-06, "loss": 0.1512, "step": 5546 }, { "epoch": 5.179271708683474, "grad_norm": 2.1547805150920376, "learning_rate": 4.954636847028383e-06, "loss": 0.109, "step": 5547 }, { "epoch": 5.180205415499533, "grad_norm": 0.2674598637197554, "learning_rate": 4.95312478551667e-06, "loss": 0.0036, "step": 5548 }, { "epoch": 5.181139122315593, "grad_norm": 0.4657549119501235, "learning_rate": 4.95161272829221e-06, "loss": 0.0182, "step": 5549 }, { "epoch": 5.182072829131653, "grad_norm": 1.5005614811297154, "learning_rate": 4.950100675493299e-06, "loss": 0.0113, "step": 5550 }, { "epoch": 5.183006535947713, "grad_norm": 1.6641197203334925, "learning_rate": 4.948588627258228e-06, "loss": 0.0651, "step": 5551 }, { "epoch": 5.183940242763772, "grad_norm": 5.644876652486946, "learning_rate": 4.947076583725293e-06, "loss": 0.1377, "step": 5552 }, { "epoch": 5.184873949579832, "grad_norm": 1.8772973349498638, "learning_rate": 4.9455645450327846e-06, "loss": 0.0801, "step": 5553 }, { "epoch": 5.185807656395892, "grad_norm": 2.53390456696507, "learning_rate": 4.944052511318995e-06, "loss": 0.1314, "step": 5554 }, { "epoch": 5.186741363211952, "grad_norm": 1.3836288702899144, "learning_rate": 4.94254048272222e-06, "loss": 0.0346, "step": 5555 }, { "epoch": 5.187675070028011, "grad_norm": 1.5578592286254773, "learning_rate": 4.941028459380749e-06, "loss": 0.0687, "step": 5556 }, { "epoch": 5.188608776844071, "grad_norm": 3.3992637275664523, "learning_rate": 4.9395164414328725e-06, "loss": 0.0846, "step": 5557 }, { "epoch": 5.189542483660131, "grad_norm": 1.4719612648297788, "learning_rate": 4.938004429016879e-06, "loss": 0.0875, "step": 5558 }, { "epoch": 5.190476190476191, "grad_norm": 7.593229371120106, "learning_rate": 4.936492422271064e-06, "loss": 0.2517, "step": 5559 }, { "epoch": 5.19140989729225, "grad_norm": 1.996580566463959, "learning_rate": 4.934980421333714e-06, "loss": 0.1457, "step": 5560 }, { "epoch": 5.19234360410831, "grad_norm": 2.609439364221338, "learning_rate": 4.933468426343119e-06, "loss": 0.1236, "step": 5561 }, { "epoch": 5.19327731092437, "grad_norm": 0.2871599249772561, "learning_rate": 4.931956437437565e-06, "loss": 0.0043, "step": 5562 }, { "epoch": 5.19421101774043, "grad_norm": 1.2973706934067593, "learning_rate": 4.930444454755344e-06, "loss": 0.0237, "step": 5563 }, { "epoch": 5.1951447245564895, "grad_norm": 2.2809333150500355, "learning_rate": 4.928932478434742e-06, "loss": 0.1726, "step": 5564 }, { "epoch": 5.196078431372549, "grad_norm": 0.9446288884290069, "learning_rate": 4.9274205086140446e-06, "loss": 0.0317, "step": 5565 }, { "epoch": 5.197012138188609, "grad_norm": 2.3494458058258925, "learning_rate": 4.925908545431537e-06, "loss": 0.1349, "step": 5566 }, { "epoch": 5.197945845004669, "grad_norm": 0.7404503110113746, "learning_rate": 4.9243965890255084e-06, "loss": 0.0202, "step": 5567 }, { "epoch": 5.1988795518207285, "grad_norm": 2.611046833700541, "learning_rate": 4.922884639534242e-06, "loss": 0.0928, "step": 5568 }, { "epoch": 5.199813258636788, "grad_norm": 1.3291069303908236, "learning_rate": 4.921372697096023e-06, "loss": 0.0438, "step": 5569 }, { "epoch": 5.200746965452848, "grad_norm": 1.914862002307406, "learning_rate": 4.919860761849132e-06, "loss": 0.1263, "step": 5570 }, { "epoch": 5.201680672268908, "grad_norm": 1.3107677201546575, "learning_rate": 4.918348833931855e-06, "loss": 0.0425, "step": 5571 }, { "epoch": 5.2026143790849675, "grad_norm": 1.4502274255444183, "learning_rate": 4.9168369134824755e-06, "loss": 0.0536, "step": 5572 }, { "epoch": 5.203548085901027, "grad_norm": 1.7235584924133955, "learning_rate": 4.915325000639271e-06, "loss": 0.0989, "step": 5573 }, { "epoch": 5.204481792717087, "grad_norm": 0.4645145288976728, "learning_rate": 4.913813095540528e-06, "loss": 0.0096, "step": 5574 }, { "epoch": 5.205415499533147, "grad_norm": 0.8556319300915867, "learning_rate": 4.912301198324522e-06, "loss": 0.0267, "step": 5575 }, { "epoch": 5.2063492063492065, "grad_norm": 1.7432287185890807, "learning_rate": 4.910789309129536e-06, "loss": 0.0536, "step": 5576 }, { "epoch": 5.207282913165266, "grad_norm": 0.8741334480363682, "learning_rate": 4.909277428093846e-06, "loss": 0.0285, "step": 5577 }, { "epoch": 5.208216619981326, "grad_norm": 3.4643914171044345, "learning_rate": 4.907765555355732e-06, "loss": 0.1038, "step": 5578 }, { "epoch": 5.209150326797386, "grad_norm": 6.008026961090272, "learning_rate": 4.90625369105347e-06, "loss": 0.101, "step": 5579 }, { "epoch": 5.2100840336134455, "grad_norm": 0.7910435128267707, "learning_rate": 4.904741835325338e-06, "loss": 0.0168, "step": 5580 }, { "epoch": 5.211017740429505, "grad_norm": 0.9856296765431469, "learning_rate": 4.903229988309612e-06, "loss": 0.0471, "step": 5581 }, { "epoch": 5.211951447245565, "grad_norm": 1.5638970487548933, "learning_rate": 4.9017181501445665e-06, "loss": 0.0967, "step": 5582 }, { "epoch": 5.212885154061625, "grad_norm": 1.2468089057774403, "learning_rate": 4.900206320968474e-06, "loss": 0.044, "step": 5583 }, { "epoch": 5.2138188608776845, "grad_norm": 1.7131189540443899, "learning_rate": 4.89869450091961e-06, "loss": 0.1299, "step": 5584 }, { "epoch": 5.214752567693744, "grad_norm": 0.511668618690041, "learning_rate": 4.897182690136248e-06, "loss": 0.0185, "step": 5585 }, { "epoch": 5.215686274509804, "grad_norm": 1.3955571713468384, "learning_rate": 4.895670888756657e-06, "loss": 0.0204, "step": 5586 }, { "epoch": 5.216619981325864, "grad_norm": 2.200653049140935, "learning_rate": 4.894159096919109e-06, "loss": 0.1335, "step": 5587 }, { "epoch": 5.2175536881419236, "grad_norm": 1.6845920154539145, "learning_rate": 4.892647314761872e-06, "loss": 0.0882, "step": 5588 }, { "epoch": 5.218487394957983, "grad_norm": 0.4036507298071527, "learning_rate": 4.89113554242322e-06, "loss": 0.0125, "step": 5589 }, { "epoch": 5.219421101774043, "grad_norm": 1.7000076167997173, "learning_rate": 4.889623780041417e-06, "loss": 0.1074, "step": 5590 }, { "epoch": 5.220354808590103, "grad_norm": 1.0295925112060644, "learning_rate": 4.888112027754732e-06, "loss": 0.0298, "step": 5591 }, { "epoch": 5.221288515406163, "grad_norm": 1.6728246651157999, "learning_rate": 4.886600285701429e-06, "loss": 0.0401, "step": 5592 }, { "epoch": 5.222222222222222, "grad_norm": 2.807132872037675, "learning_rate": 4.885088554019777e-06, "loss": 0.0474, "step": 5593 }, { "epoch": 5.223155929038282, "grad_norm": 1.484331589781936, "learning_rate": 4.883576832848038e-06, "loss": 0.0302, "step": 5594 }, { "epoch": 5.224089635854342, "grad_norm": 1.283841477527106, "learning_rate": 4.882065122324478e-06, "loss": 0.0565, "step": 5595 }, { "epoch": 5.225023342670402, "grad_norm": 1.5419104379903712, "learning_rate": 4.880553422587354e-06, "loss": 0.0345, "step": 5596 }, { "epoch": 5.225957049486461, "grad_norm": 1.1008637020550767, "learning_rate": 4.879041733774934e-06, "loss": 0.0352, "step": 5597 }, { "epoch": 5.226890756302521, "grad_norm": 1.5506510044767694, "learning_rate": 4.8775300560254754e-06, "loss": 0.0833, "step": 5598 }, { "epoch": 5.227824463118581, "grad_norm": 3.443691970634334, "learning_rate": 4.876018389477239e-06, "loss": 0.1158, "step": 5599 }, { "epoch": 5.228758169934641, "grad_norm": 1.4551157921343172, "learning_rate": 4.874506734268482e-06, "loss": 0.0146, "step": 5600 }, { "epoch": 5.2296918767507, "grad_norm": 4.059627423985574, "learning_rate": 4.87299509053746e-06, "loss": 0.1422, "step": 5601 }, { "epoch": 5.23062558356676, "grad_norm": 2.9248799531104255, "learning_rate": 4.871483458422434e-06, "loss": 0.1029, "step": 5602 }, { "epoch": 5.23155929038282, "grad_norm": 1.856463492345226, "learning_rate": 4.869971838061657e-06, "loss": 0.0646, "step": 5603 }, { "epoch": 5.23249299719888, "grad_norm": 2.3301289725359897, "learning_rate": 4.8684602295933835e-06, "loss": 0.1093, "step": 5604 }, { "epoch": 5.233426704014939, "grad_norm": 1.5035717609268537, "learning_rate": 4.866948633155865e-06, "loss": 0.0745, "step": 5605 }, { "epoch": 5.234360410830999, "grad_norm": 1.3862478519775803, "learning_rate": 4.8654370488873565e-06, "loss": 0.0307, "step": 5606 }, { "epoch": 5.235294117647059, "grad_norm": 1.9829683708219925, "learning_rate": 4.863925476926108e-06, "loss": 0.1582, "step": 5607 }, { "epoch": 5.236227824463119, "grad_norm": 0.7596127517033802, "learning_rate": 4.862413917410368e-06, "loss": 0.0312, "step": 5608 }, { "epoch": 5.237161531279178, "grad_norm": 3.2236747903818865, "learning_rate": 4.860902370478386e-06, "loss": 0.1048, "step": 5609 }, { "epoch": 5.238095238095238, "grad_norm": 0.4166724132453204, "learning_rate": 4.859390836268411e-06, "loss": 0.0081, "step": 5610 }, { "epoch": 5.239028944911298, "grad_norm": 2.015297352246079, "learning_rate": 4.8578793149186875e-06, "loss": 0.0376, "step": 5611 }, { "epoch": 5.239962651727358, "grad_norm": 3.8296160666772208, "learning_rate": 4.856367806567461e-06, "loss": 0.1716, "step": 5612 }, { "epoch": 5.240896358543417, "grad_norm": 6.444011457150936, "learning_rate": 4.854856311352976e-06, "loss": 0.0624, "step": 5613 }, { "epoch": 5.241830065359477, "grad_norm": 1.676549250981886, "learning_rate": 4.853344829413476e-06, "loss": 0.0955, "step": 5614 }, { "epoch": 5.242763772175537, "grad_norm": 1.6835136170337073, "learning_rate": 4.8518333608872015e-06, "loss": 0.0887, "step": 5615 }, { "epoch": 5.243697478991597, "grad_norm": 1.640689834928119, "learning_rate": 4.850321905912394e-06, "loss": 0.0433, "step": 5616 }, { "epoch": 5.244631185807656, "grad_norm": 0.24694689029806818, "learning_rate": 4.848810464627292e-06, "loss": 0.0049, "step": 5617 }, { "epoch": 5.245564892623716, "grad_norm": 4.86792942768844, "learning_rate": 4.847299037170131e-06, "loss": 0.2512, "step": 5618 }, { "epoch": 5.246498599439776, "grad_norm": 1.0802091431077483, "learning_rate": 4.845787623679153e-06, "loss": 0.038, "step": 5619 }, { "epoch": 5.247432306255836, "grad_norm": 4.101562845921657, "learning_rate": 4.844276224292589e-06, "loss": 0.1933, "step": 5620 }, { "epoch": 5.248366013071895, "grad_norm": 3.394003092637959, "learning_rate": 4.842764839148676e-06, "loss": 0.134, "step": 5621 }, { "epoch": 5.249299719887955, "grad_norm": 0.39288283879346897, "learning_rate": 4.841253468385642e-06, "loss": 0.0026, "step": 5622 }, { "epoch": 5.250233426704015, "grad_norm": 1.0109115275514315, "learning_rate": 4.839742112141725e-06, "loss": 0.0263, "step": 5623 }, { "epoch": 5.251167133520075, "grad_norm": 0.7840986808547858, "learning_rate": 4.83823077055515e-06, "loss": 0.0358, "step": 5624 }, { "epoch": 5.2521008403361344, "grad_norm": 2.8718649713916635, "learning_rate": 4.836719443764149e-06, "loss": 0.1047, "step": 5625 }, { "epoch": 5.253034547152194, "grad_norm": 2.9508057790061457, "learning_rate": 4.835208131906945e-06, "loss": 0.1578, "step": 5626 }, { "epoch": 5.253968253968254, "grad_norm": 0.8553781574029424, "learning_rate": 4.83369683512177e-06, "loss": 0.0177, "step": 5627 }, { "epoch": 5.254901960784314, "grad_norm": 1.2004991757908139, "learning_rate": 4.832185553546845e-06, "loss": 0.0434, "step": 5628 }, { "epoch": 5.2558356676003735, "grad_norm": 2.7003558468800084, "learning_rate": 4.830674287320395e-06, "loss": 0.0976, "step": 5629 }, { "epoch": 5.256769374416433, "grad_norm": 1.4202209016994018, "learning_rate": 4.829163036580638e-06, "loss": 0.0554, "step": 5630 }, { "epoch": 5.257703081232493, "grad_norm": 1.161591518502759, "learning_rate": 4.8276518014658e-06, "loss": 0.0147, "step": 5631 }, { "epoch": 5.258636788048553, "grad_norm": 0.938778809329981, "learning_rate": 4.826140582114097e-06, "loss": 0.0298, "step": 5632 }, { "epoch": 5.2595704948646125, "grad_norm": 3.9782104212354166, "learning_rate": 4.824629378663748e-06, "loss": 0.0599, "step": 5633 }, { "epoch": 5.260504201680672, "grad_norm": 7.831265101698187, "learning_rate": 4.823118191252968e-06, "loss": 0.2286, "step": 5634 }, { "epoch": 5.261437908496732, "grad_norm": 2.366353554283749, "learning_rate": 4.821607020019969e-06, "loss": 0.1142, "step": 5635 }, { "epoch": 5.262371615312792, "grad_norm": 1.9735610481538741, "learning_rate": 4.82009586510297e-06, "loss": 0.0909, "step": 5636 }, { "epoch": 5.2633053221288515, "grad_norm": 1.721004967041418, "learning_rate": 4.81858472664018e-06, "loss": 0.0647, "step": 5637 }, { "epoch": 5.264239028944911, "grad_norm": 1.4148763662701576, "learning_rate": 4.8170736047698085e-06, "loss": 0.0602, "step": 5638 }, { "epoch": 5.265172735760971, "grad_norm": 1.5668643008609573, "learning_rate": 4.815562499630063e-06, "loss": 0.026, "step": 5639 }, { "epoch": 5.266106442577031, "grad_norm": 1.4326534669267572, "learning_rate": 4.814051411359154e-06, "loss": 0.0306, "step": 5640 }, { "epoch": 5.2670401493930905, "grad_norm": 2.4535242567437923, "learning_rate": 4.812540340095286e-06, "loss": 0.1166, "step": 5641 }, { "epoch": 5.26797385620915, "grad_norm": 4.4953149957102285, "learning_rate": 4.8110292859766625e-06, "loss": 0.131, "step": 5642 }, { "epoch": 5.26890756302521, "grad_norm": 3.9396149601074715, "learning_rate": 4.809518249141484e-06, "loss": 0.0303, "step": 5643 }, { "epoch": 5.26984126984127, "grad_norm": 2.1284754947416804, "learning_rate": 4.808007229727955e-06, "loss": 0.1007, "step": 5644 }, { "epoch": 5.2707749766573295, "grad_norm": 1.3916368959055998, "learning_rate": 4.806496227874274e-06, "loss": 0.0653, "step": 5645 }, { "epoch": 5.271708683473389, "grad_norm": 3.620537117534526, "learning_rate": 4.804985243718638e-06, "loss": 0.2412, "step": 5646 }, { "epoch": 5.272642390289449, "grad_norm": 0.3496429070894699, "learning_rate": 4.803474277399241e-06, "loss": 0.0095, "step": 5647 }, { "epoch": 5.273576097105509, "grad_norm": 1.7516950469839025, "learning_rate": 4.8019633290542815e-06, "loss": 0.0879, "step": 5648 }, { "epoch": 5.2745098039215685, "grad_norm": 2.4670154270790374, "learning_rate": 4.80045239882195e-06, "loss": 0.15, "step": 5649 }, { "epoch": 5.275443510737628, "grad_norm": 2.2619417704207683, "learning_rate": 4.798941486840439e-06, "loss": 0.0943, "step": 5650 }, { "epoch": 5.276377217553688, "grad_norm": 2.2208107471987026, "learning_rate": 4.797430593247935e-06, "loss": 0.1265, "step": 5651 }, { "epoch": 5.277310924369748, "grad_norm": 0.5766316439578744, "learning_rate": 4.795919718182629e-06, "loss": 0.013, "step": 5652 }, { "epoch": 5.278244631185808, "grad_norm": 0.7483108602768571, "learning_rate": 4.794408861782706e-06, "loss": 0.018, "step": 5653 }, { "epoch": 5.279178338001867, "grad_norm": 3.9740898149430586, "learning_rate": 4.7928980241863506e-06, "loss": 0.1592, "step": 5654 }, { "epoch": 5.280112044817927, "grad_norm": 0.2570000132724665, "learning_rate": 4.791387205531745e-06, "loss": 0.0038, "step": 5655 }, { "epoch": 5.281045751633987, "grad_norm": 0.7838916797001125, "learning_rate": 4.7898764059570705e-06, "loss": 0.0296, "step": 5656 }, { "epoch": 5.281979458450047, "grad_norm": 2.2475388934110585, "learning_rate": 4.788365625600505e-06, "loss": 0.1041, "step": 5657 }, { "epoch": 5.282913165266106, "grad_norm": 0.6329818165182621, "learning_rate": 4.786854864600229e-06, "loss": 0.0216, "step": 5658 }, { "epoch": 5.283846872082166, "grad_norm": 2.842545084525952, "learning_rate": 4.785344123094417e-06, "loss": 0.1652, "step": 5659 }, { "epoch": 5.284780578898226, "grad_norm": 1.1035051578132398, "learning_rate": 4.783833401221239e-06, "loss": 0.0245, "step": 5660 }, { "epoch": 5.285714285714286, "grad_norm": 1.0266443607573559, "learning_rate": 4.782322699118873e-06, "loss": 0.0383, "step": 5661 }, { "epoch": 5.286647992530345, "grad_norm": 0.2480415395348607, "learning_rate": 4.780812016925487e-06, "loss": 0.0011, "step": 5662 }, { "epoch": 5.287581699346405, "grad_norm": 0.469987808626036, "learning_rate": 4.77930135477925e-06, "loss": 0.0135, "step": 5663 }, { "epoch": 5.288515406162465, "grad_norm": 1.8582352143687384, "learning_rate": 4.777790712818324e-06, "loss": 0.048, "step": 5664 }, { "epoch": 5.289449112978525, "grad_norm": 1.2358017762598044, "learning_rate": 4.776280091180881e-06, "loss": 0.0198, "step": 5665 }, { "epoch": 5.290382819794584, "grad_norm": 1.9345480300392814, "learning_rate": 4.774769490005079e-06, "loss": 0.0959, "step": 5666 }, { "epoch": 5.291316526610644, "grad_norm": 0.4694350267608623, "learning_rate": 4.773258909429082e-06, "loss": 0.0085, "step": 5667 }, { "epoch": 5.292250233426704, "grad_norm": 1.9661030865193705, "learning_rate": 4.771748349591047e-06, "loss": 0.0534, "step": 5668 }, { "epoch": 5.293183940242764, "grad_norm": 1.6615498711702643, "learning_rate": 4.7702378106291306e-06, "loss": 0.0696, "step": 5669 }, { "epoch": 5.294117647058823, "grad_norm": 2.5775237207099173, "learning_rate": 4.7687272926814915e-06, "loss": 0.1337, "step": 5670 }, { "epoch": 5.295051353874883, "grad_norm": 1.4049940086354737, "learning_rate": 4.767216795886281e-06, "loss": 0.0801, "step": 5671 }, { "epoch": 5.295985060690943, "grad_norm": 3.3202735435312967, "learning_rate": 4.765706320381652e-06, "loss": 0.2447, "step": 5672 }, { "epoch": 5.296918767507003, "grad_norm": 3.677193371740402, "learning_rate": 4.76419586630575e-06, "loss": 0.1212, "step": 5673 }, { "epoch": 5.297852474323062, "grad_norm": 0.7953708638043167, "learning_rate": 4.762685433796728e-06, "loss": 0.0357, "step": 5674 }, { "epoch": 5.298786181139122, "grad_norm": 0.3649298356406654, "learning_rate": 4.76117502299273e-06, "loss": 0.0052, "step": 5675 }, { "epoch": 5.299719887955182, "grad_norm": 1.0253428939222347, "learning_rate": 4.759664634031897e-06, "loss": 0.0817, "step": 5676 }, { "epoch": 5.300653594771242, "grad_norm": 0.9085298678179975, "learning_rate": 4.758154267052372e-06, "loss": 0.032, "step": 5677 }, { "epoch": 5.301587301587301, "grad_norm": 0.2891958430930725, "learning_rate": 4.756643922192297e-06, "loss": 0.0089, "step": 5678 }, { "epoch": 5.302521008403361, "grad_norm": 1.8113344113169376, "learning_rate": 4.7551335995898085e-06, "loss": 0.0886, "step": 5679 }, { "epoch": 5.303454715219421, "grad_norm": 1.9612813416508215, "learning_rate": 4.753623299383041e-06, "loss": 0.1162, "step": 5680 }, { "epoch": 5.304388422035481, "grad_norm": 1.4916874895042525, "learning_rate": 4.752113021710126e-06, "loss": 0.0301, "step": 5681 }, { "epoch": 5.30532212885154, "grad_norm": 2.10318719895293, "learning_rate": 4.7506027667091995e-06, "loss": 0.1027, "step": 5682 }, { "epoch": 5.3062558356676, "grad_norm": 3.7566678673368, "learning_rate": 4.7490925345183895e-06, "loss": 0.099, "step": 5683 }, { "epoch": 5.30718954248366, "grad_norm": 0.2703958832239815, "learning_rate": 4.747582325275822e-06, "loss": 0.0064, "step": 5684 }, { "epoch": 5.30812324929972, "grad_norm": 0.3613259141413058, "learning_rate": 4.746072139119622e-06, "loss": 0.01, "step": 5685 }, { "epoch": 5.309056956115779, "grad_norm": 0.3924526722268185, "learning_rate": 4.744561976187914e-06, "loss": 0.009, "step": 5686 }, { "epoch": 5.309990662931839, "grad_norm": 3.2450942971695853, "learning_rate": 4.743051836618819e-06, "loss": 0.1663, "step": 5687 }, { "epoch": 5.310924369747899, "grad_norm": 1.141889622925391, "learning_rate": 4.741541720550455e-06, "loss": 0.0257, "step": 5688 }, { "epoch": 5.311858076563959, "grad_norm": 0.7512760569129604, "learning_rate": 4.7400316281209405e-06, "loss": 0.0167, "step": 5689 }, { "epoch": 5.3127917833800185, "grad_norm": 3.472468457583766, "learning_rate": 4.738521559468387e-06, "loss": 0.1319, "step": 5690 }, { "epoch": 5.313725490196078, "grad_norm": 1.106438078567935, "learning_rate": 4.737011514730908e-06, "loss": 0.0565, "step": 5691 }, { "epoch": 5.314659197012138, "grad_norm": 2.8072159498183344, "learning_rate": 4.735501494046615e-06, "loss": 0.0272, "step": 5692 }, { "epoch": 5.315592903828198, "grad_norm": 2.3205684422212474, "learning_rate": 4.7339914975536155e-06, "loss": 0.0796, "step": 5693 }, { "epoch": 5.3165266106442575, "grad_norm": 2.1776431909767617, "learning_rate": 4.732481525390014e-06, "loss": 0.1056, "step": 5694 }, { "epoch": 5.317460317460317, "grad_norm": 0.7678164934480615, "learning_rate": 4.730971577693914e-06, "loss": 0.0246, "step": 5695 }, { "epoch": 5.318394024276377, "grad_norm": 1.9425651636127195, "learning_rate": 4.72946165460342e-06, "loss": 0.0799, "step": 5696 }, { "epoch": 5.319327731092437, "grad_norm": 3.114101784942256, "learning_rate": 4.727951756256628e-06, "loss": 0.169, "step": 5697 }, { "epoch": 5.3202614379084965, "grad_norm": 0.9685611792166455, "learning_rate": 4.726441882791634e-06, "loss": 0.052, "step": 5698 }, { "epoch": 5.321195144724556, "grad_norm": 2.3455597493426708, "learning_rate": 4.724932034346536e-06, "loss": 0.16, "step": 5699 }, { "epoch": 5.322128851540616, "grad_norm": 1.5332248440364322, "learning_rate": 4.723422211059423e-06, "loss": 0.0752, "step": 5700 }, { "epoch": 5.323062558356676, "grad_norm": 1.8186812546825135, "learning_rate": 4.721912413068387e-06, "loss": 0.0772, "step": 5701 }, { "epoch": 5.3239962651727355, "grad_norm": 2.1161892244531764, "learning_rate": 4.7204026405115156e-06, "loss": 0.0731, "step": 5702 }, { "epoch": 5.324929971988795, "grad_norm": 0.41106194660133666, "learning_rate": 4.718892893526891e-06, "loss": 0.0181, "step": 5703 }, { "epoch": 5.325863678804855, "grad_norm": 1.7138607627701503, "learning_rate": 4.7173831722526e-06, "loss": 0.0873, "step": 5704 }, { "epoch": 5.326797385620915, "grad_norm": 1.6330139327602575, "learning_rate": 4.715873476826722e-06, "loss": 0.0534, "step": 5705 }, { "epoch": 5.3277310924369745, "grad_norm": 1.0305844008854923, "learning_rate": 4.714363807387334e-06, "loss": 0.0452, "step": 5706 }, { "epoch": 5.328664799253034, "grad_norm": 0.6180778474106172, "learning_rate": 4.7128541640725115e-06, "loss": 0.0036, "step": 5707 }, { "epoch": 5.329598506069094, "grad_norm": 0.40295340428843796, "learning_rate": 4.7113445470203305e-06, "loss": 0.0098, "step": 5708 }, { "epoch": 5.330532212885154, "grad_norm": 7.15035446838982, "learning_rate": 4.709834956368861e-06, "loss": 0.3073, "step": 5709 }, { "epoch": 5.3314659197012135, "grad_norm": 1.1732401834458823, "learning_rate": 4.708325392256172e-06, "loss": 0.0512, "step": 5710 }, { "epoch": 5.332399626517273, "grad_norm": 2.0503064633472388, "learning_rate": 4.706815854820326e-06, "loss": 0.0962, "step": 5711 }, { "epoch": 5.333333333333333, "grad_norm": 0.39677108729663263, "learning_rate": 4.705306344199392e-06, "loss": 0.0108, "step": 5712 }, { "epoch": 5.334267040149393, "grad_norm": 1.4142385902871368, "learning_rate": 4.703796860531429e-06, "loss": 0.0443, "step": 5713 }, { "epoch": 5.3352007469654525, "grad_norm": 1.5394745439050932, "learning_rate": 4.702287403954497e-06, "loss": 0.0556, "step": 5714 }, { "epoch": 5.336134453781512, "grad_norm": 3.3054458382486858, "learning_rate": 4.700777974606648e-06, "loss": 0.1567, "step": 5715 }, { "epoch": 5.337068160597572, "grad_norm": 3.3162939524682358, "learning_rate": 4.699268572625943e-06, "loss": 0.0991, "step": 5716 }, { "epoch": 5.338001867413632, "grad_norm": 3.763652841399825, "learning_rate": 4.6977591981504286e-06, "loss": 0.2495, "step": 5717 }, { "epoch": 5.338935574229692, "grad_norm": 1.8025351779396088, "learning_rate": 4.6962498513181554e-06, "loss": 0.0309, "step": 5718 }, { "epoch": 5.339869281045751, "grad_norm": 4.011829168817965, "learning_rate": 4.694740532267169e-06, "loss": 0.1727, "step": 5719 }, { "epoch": 5.340802987861811, "grad_norm": 0.6708572666346488, "learning_rate": 4.69323124113551e-06, "loss": 0.016, "step": 5720 }, { "epoch": 5.341736694677871, "grad_norm": 1.1991195126187435, "learning_rate": 4.691721978061226e-06, "loss": 0.0251, "step": 5721 }, { "epoch": 5.342670401493931, "grad_norm": 0.4984667235587339, "learning_rate": 4.690212743182352e-06, "loss": 0.0152, "step": 5722 }, { "epoch": 5.34360410830999, "grad_norm": 2.2636032519635116, "learning_rate": 4.6887035366369245e-06, "loss": 0.116, "step": 5723 }, { "epoch": 5.34453781512605, "grad_norm": 2.5137590101174645, "learning_rate": 4.687194358562977e-06, "loss": 0.1237, "step": 5724 }, { "epoch": 5.34547152194211, "grad_norm": 2.1229432439458127, "learning_rate": 4.685685209098542e-06, "loss": 0.0842, "step": 5725 }, { "epoch": 5.34640522875817, "grad_norm": 2.8405811885115932, "learning_rate": 4.684176088381645e-06, "loss": 0.1474, "step": 5726 }, { "epoch": 5.347338935574229, "grad_norm": 1.7961554913567541, "learning_rate": 4.682666996550314e-06, "loss": 0.122, "step": 5727 }, { "epoch": 5.348272642390289, "grad_norm": 0.7365630869386187, "learning_rate": 4.681157933742571e-06, "loss": 0.0286, "step": 5728 }, { "epoch": 5.349206349206349, "grad_norm": 0.8004418279002843, "learning_rate": 4.679648900096436e-06, "loss": 0.0167, "step": 5729 }, { "epoch": 5.350140056022409, "grad_norm": 1.93962982481048, "learning_rate": 4.678139895749927e-06, "loss": 0.0627, "step": 5730 }, { "epoch": 5.351073762838468, "grad_norm": 1.4468143706637446, "learning_rate": 4.67663092084106e-06, "loss": 0.0639, "step": 5731 }, { "epoch": 5.352007469654528, "grad_norm": 1.6006739040072901, "learning_rate": 4.6751219755078455e-06, "loss": 0.0687, "step": 5732 }, { "epoch": 5.352941176470588, "grad_norm": 1.1942108612751707, "learning_rate": 4.673613059888295e-06, "loss": 0.0447, "step": 5733 }, { "epoch": 5.353874883286648, "grad_norm": 1.3591289945233584, "learning_rate": 4.672104174120415e-06, "loss": 0.0378, "step": 5734 }, { "epoch": 5.354808590102707, "grad_norm": 0.8081323849534577, "learning_rate": 4.670595318342209e-06, "loss": 0.0121, "step": 5735 }, { "epoch": 5.355742296918767, "grad_norm": 1.083699327178511, "learning_rate": 4.669086492691679e-06, "loss": 0.0439, "step": 5736 }, { "epoch": 5.356676003734827, "grad_norm": 0.7140195426406192, "learning_rate": 4.667577697306821e-06, "loss": 0.0166, "step": 5737 }, { "epoch": 5.357609710550887, "grad_norm": 0.7496706267817366, "learning_rate": 4.6660689323256355e-06, "loss": 0.0114, "step": 5738 }, { "epoch": 5.358543417366946, "grad_norm": 1.7648125135303516, "learning_rate": 4.664560197886114e-06, "loss": 0.0851, "step": 5739 }, { "epoch": 5.359477124183006, "grad_norm": 2.151560253436524, "learning_rate": 4.663051494126245e-06, "loss": 0.0872, "step": 5740 }, { "epoch": 5.360410830999066, "grad_norm": 1.2207299744581992, "learning_rate": 4.661542821184016e-06, "loss": 0.0541, "step": 5741 }, { "epoch": 5.361344537815126, "grad_norm": 3.2774939800422125, "learning_rate": 4.6600341791974144e-06, "loss": 0.1602, "step": 5742 }, { "epoch": 5.362278244631185, "grad_norm": 0.8415745094417797, "learning_rate": 4.658525568304421e-06, "loss": 0.0234, "step": 5743 }, { "epoch": 5.363211951447245, "grad_norm": 1.3244152839313645, "learning_rate": 4.657016988643015e-06, "loss": 0.0573, "step": 5744 }, { "epoch": 5.364145658263305, "grad_norm": 0.5128362415194028, "learning_rate": 4.655508440351169e-06, "loss": 0.0133, "step": 5745 }, { "epoch": 5.365079365079365, "grad_norm": 0.9916676611733565, "learning_rate": 4.653999923566862e-06, "loss": 0.0265, "step": 5746 }, { "epoch": 5.366013071895424, "grad_norm": 0.5269929049509265, "learning_rate": 4.652491438428062e-06, "loss": 0.0091, "step": 5747 }, { "epoch": 5.366946778711484, "grad_norm": 1.9614029034735867, "learning_rate": 4.650982985072736e-06, "loss": 0.1219, "step": 5748 }, { "epoch": 5.367880485527545, "grad_norm": 0.7751019543345685, "learning_rate": 4.649474563638849e-06, "loss": 0.0161, "step": 5749 }, { "epoch": 5.368814192343605, "grad_norm": 1.8068767007579023, "learning_rate": 4.64796617426436e-06, "loss": 0.0569, "step": 5750 }, { "epoch": 5.369747899159664, "grad_norm": 1.4727222290166313, "learning_rate": 4.646457817087233e-06, "loss": 0.084, "step": 5751 }, { "epoch": 5.370681605975724, "grad_norm": 1.1493578544207814, "learning_rate": 4.644949492245421e-06, "loss": 0.039, "step": 5752 }, { "epoch": 5.371615312791784, "grad_norm": 2.3685972482956967, "learning_rate": 4.643441199876878e-06, "loss": 0.1483, "step": 5753 }, { "epoch": 5.372549019607844, "grad_norm": 2.135509539583834, "learning_rate": 4.641932940119549e-06, "loss": 0.1216, "step": 5754 }, { "epoch": 5.373482726423903, "grad_norm": 4.559342786347246, "learning_rate": 4.640424713111389e-06, "loss": 0.2191, "step": 5755 }, { "epoch": 5.374416433239963, "grad_norm": 0.392105568216587, "learning_rate": 4.638916518990337e-06, "loss": 0.0133, "step": 5756 }, { "epoch": 5.375350140056023, "grad_norm": 5.543527000655444, "learning_rate": 4.637408357894334e-06, "loss": 0.3309, "step": 5757 }, { "epoch": 5.376283846872083, "grad_norm": 0.3324644591788684, "learning_rate": 4.635900229961318e-06, "loss": 0.008, "step": 5758 }, { "epoch": 5.377217553688142, "grad_norm": 1.5324030347494504, "learning_rate": 4.634392135329226e-06, "loss": 0.0762, "step": 5759 }, { "epoch": 5.378151260504202, "grad_norm": 1.2596946576586547, "learning_rate": 4.632884074135988e-06, "loss": 0.0418, "step": 5760 }, { "epoch": 5.379084967320262, "grad_norm": 1.265379707123609, "learning_rate": 4.631376046519533e-06, "loss": 0.0654, "step": 5761 }, { "epoch": 5.380018674136322, "grad_norm": 1.5364703992535533, "learning_rate": 4.629868052617786e-06, "loss": 0.0677, "step": 5762 }, { "epoch": 5.380952380952381, "grad_norm": 2.594615298513343, "learning_rate": 4.628360092568672e-06, "loss": 0.1276, "step": 5763 }, { "epoch": 5.381886087768441, "grad_norm": 1.3520260779795588, "learning_rate": 4.626852166510108e-06, "loss": 0.0293, "step": 5764 }, { "epoch": 5.382819794584501, "grad_norm": 1.8412898961056188, "learning_rate": 4.625344274580011e-06, "loss": 0.0644, "step": 5765 }, { "epoch": 5.383753501400561, "grad_norm": 0.9858723839689654, "learning_rate": 4.623836416916295e-06, "loss": 0.0284, "step": 5766 }, { "epoch": 5.38468720821662, "grad_norm": 0.2982150866654619, "learning_rate": 4.6223285936568694e-06, "loss": 0.0025, "step": 5767 }, { "epoch": 5.38562091503268, "grad_norm": 1.6471314720031782, "learning_rate": 4.620820804939641e-06, "loss": 0.0869, "step": 5768 }, { "epoch": 5.38655462184874, "grad_norm": 4.549363894503057, "learning_rate": 4.619313050902515e-06, "loss": 0.2211, "step": 5769 }, { "epoch": 5.3874883286648, "grad_norm": 3.132881005882416, "learning_rate": 4.617805331683392e-06, "loss": 0.0432, "step": 5770 }, { "epoch": 5.388422035480859, "grad_norm": 2.2940337390465353, "learning_rate": 4.616297647420166e-06, "loss": 0.0848, "step": 5771 }, { "epoch": 5.389355742296919, "grad_norm": 1.1811060350639686, "learning_rate": 4.614789998250736e-06, "loss": 0.0552, "step": 5772 }, { "epoch": 5.390289449112979, "grad_norm": 0.8632447373212453, "learning_rate": 4.613282384312991e-06, "loss": 0.0195, "step": 5773 }, { "epoch": 5.391223155929039, "grad_norm": 0.743453190370038, "learning_rate": 4.611774805744819e-06, "loss": 0.0295, "step": 5774 }, { "epoch": 5.392156862745098, "grad_norm": 0.6283518549233333, "learning_rate": 4.610267262684103e-06, "loss": 0.0255, "step": 5775 }, { "epoch": 5.393090569561158, "grad_norm": 0.3436593000702192, "learning_rate": 4.608759755268728e-06, "loss": 0.0068, "step": 5776 }, { "epoch": 5.394024276377218, "grad_norm": 1.4317729947846534, "learning_rate": 4.6072522836365695e-06, "loss": 0.0358, "step": 5777 }, { "epoch": 5.394957983193278, "grad_norm": 0.5535462188666201, "learning_rate": 4.605744847925503e-06, "loss": 0.0138, "step": 5778 }, { "epoch": 5.395891690009337, "grad_norm": 1.8456766553009036, "learning_rate": 4.604237448273398e-06, "loss": 0.0944, "step": 5779 }, { "epoch": 5.396825396825397, "grad_norm": 1.9082979672117772, "learning_rate": 4.602730084818126e-06, "loss": 0.0949, "step": 5780 }, { "epoch": 5.397759103641457, "grad_norm": 4.09045166342954, "learning_rate": 4.601222757697551e-06, "loss": 0.2352, "step": 5781 }, { "epoch": 5.398692810457517, "grad_norm": 0.5733020735813774, "learning_rate": 4.599715467049534e-06, "loss": 0.0096, "step": 5782 }, { "epoch": 5.3996265172735765, "grad_norm": 1.7604031252291914, "learning_rate": 4.598208213011934e-06, "loss": 0.1175, "step": 5783 }, { "epoch": 5.400560224089636, "grad_norm": 1.9950811732565414, "learning_rate": 4.596700995722603e-06, "loss": 0.1307, "step": 5784 }, { "epoch": 5.401493930905696, "grad_norm": 3.341507152220367, "learning_rate": 4.595193815319397e-06, "loss": 0.1373, "step": 5785 }, { "epoch": 5.402427637721756, "grad_norm": 0.8997962125319104, "learning_rate": 4.593686671940163e-06, "loss": 0.0362, "step": 5786 }, { "epoch": 5.4033613445378155, "grad_norm": 1.7817645769373676, "learning_rate": 4.592179565722744e-06, "loss": 0.0757, "step": 5787 }, { "epoch": 5.404295051353875, "grad_norm": 1.3205423139157673, "learning_rate": 4.5906724968049796e-06, "loss": 0.0612, "step": 5788 }, { "epoch": 5.405228758169935, "grad_norm": 0.8133400301520577, "learning_rate": 4.589165465324714e-06, "loss": 0.0077, "step": 5789 }, { "epoch": 5.406162464985995, "grad_norm": 6.095985712667299, "learning_rate": 4.5876584714197785e-06, "loss": 0.135, "step": 5790 }, { "epoch": 5.4070961718020545, "grad_norm": 2.8429475071298467, "learning_rate": 4.586151515228003e-06, "loss": 0.1587, "step": 5791 }, { "epoch": 5.408029878618114, "grad_norm": 2.0607826671308795, "learning_rate": 4.584644596887216e-06, "loss": 0.1193, "step": 5792 }, { "epoch": 5.408963585434174, "grad_norm": 2.20727791969926, "learning_rate": 4.583137716535243e-06, "loss": 0.0723, "step": 5793 }, { "epoch": 5.409897292250234, "grad_norm": 2.1779764238877153, "learning_rate": 4.581630874309903e-06, "loss": 0.0592, "step": 5794 }, { "epoch": 5.4108309990662935, "grad_norm": 1.173724323761325, "learning_rate": 4.5801240703490155e-06, "loss": 0.0271, "step": 5795 }, { "epoch": 5.411764705882353, "grad_norm": 1.7750588405137566, "learning_rate": 4.578617304790391e-06, "loss": 0.0836, "step": 5796 }, { "epoch": 5.412698412698413, "grad_norm": 1.5909628151027735, "learning_rate": 4.577110577771842e-06, "loss": 0.0568, "step": 5797 }, { "epoch": 5.413632119514473, "grad_norm": 0.5194299246153428, "learning_rate": 4.575603889431176e-06, "loss": 0.0122, "step": 5798 }, { "epoch": 5.4145658263305325, "grad_norm": 1.4793682318824115, "learning_rate": 4.5740972399061955e-06, "loss": 0.0851, "step": 5799 }, { "epoch": 5.415499533146592, "grad_norm": 1.24096693067758, "learning_rate": 4.572590629334697e-06, "loss": 0.0513, "step": 5800 }, { "epoch": 5.416433239962652, "grad_norm": 0.6933232343572473, "learning_rate": 4.57108405785448e-06, "loss": 0.0215, "step": 5801 }, { "epoch": 5.417366946778712, "grad_norm": 1.2908789804888143, "learning_rate": 4.569577525603338e-06, "loss": 0.053, "step": 5802 }, { "epoch": 5.4183006535947715, "grad_norm": 0.7469238455455303, "learning_rate": 4.568071032719057e-06, "loss": 0.0265, "step": 5803 }, { "epoch": 5.419234360410831, "grad_norm": 1.39795706754151, "learning_rate": 4.566564579339424e-06, "loss": 0.06, "step": 5804 }, { "epoch": 5.420168067226891, "grad_norm": 7.54331283599952, "learning_rate": 4.5650581656022196e-06, "loss": 0.1688, "step": 5805 }, { "epoch": 5.421101774042951, "grad_norm": 0.7534550508798662, "learning_rate": 4.563551791645223e-06, "loss": 0.0153, "step": 5806 }, { "epoch": 5.4220354808590105, "grad_norm": 3.9073974409154766, "learning_rate": 4.562045457606208e-06, "loss": 0.1634, "step": 5807 }, { "epoch": 5.42296918767507, "grad_norm": 0.8475782423980988, "learning_rate": 4.560539163622947e-06, "loss": 0.0196, "step": 5808 }, { "epoch": 5.42390289449113, "grad_norm": 1.3883468571149353, "learning_rate": 4.559032909833203e-06, "loss": 0.0733, "step": 5809 }, { "epoch": 5.42483660130719, "grad_norm": 0.23101353337594954, "learning_rate": 4.557526696374743e-06, "loss": 0.0019, "step": 5810 }, { "epoch": 5.42577030812325, "grad_norm": 1.0183272126499572, "learning_rate": 4.556020523385326e-06, "loss": 0.0379, "step": 5811 }, { "epoch": 5.426704014939309, "grad_norm": 5.013371418303705, "learning_rate": 4.554514391002709e-06, "loss": 0.1882, "step": 5812 }, { "epoch": 5.427637721755369, "grad_norm": 1.8037267673138198, "learning_rate": 4.553008299364641e-06, "loss": 0.0885, "step": 5813 }, { "epoch": 5.428571428571429, "grad_norm": 1.5063413275557525, "learning_rate": 4.551502248608875e-06, "loss": 0.0604, "step": 5814 }, { "epoch": 5.429505135387489, "grad_norm": 4.083722992388822, "learning_rate": 4.5499962388731535e-06, "loss": 0.0815, "step": 5815 }, { "epoch": 5.430438842203548, "grad_norm": 0.8813665471117333, "learning_rate": 4.548490270295218e-06, "loss": 0.0403, "step": 5816 }, { "epoch": 5.431372549019608, "grad_norm": 1.2482404002722307, "learning_rate": 4.546984343012806e-06, "loss": 0.0361, "step": 5817 }, { "epoch": 5.432306255835668, "grad_norm": 3.375325072137186, "learning_rate": 4.5454784571636485e-06, "loss": 0.1506, "step": 5818 }, { "epoch": 5.433239962651728, "grad_norm": 2.3319139437143965, "learning_rate": 4.54397261288548e-06, "loss": 0.0776, "step": 5819 }, { "epoch": 5.434173669467787, "grad_norm": 1.6828705235065637, "learning_rate": 4.542466810316024e-06, "loss": 0.0585, "step": 5820 }, { "epoch": 5.435107376283847, "grad_norm": 5.187045470149326, "learning_rate": 4.5409610495930025e-06, "loss": 0.2051, "step": 5821 }, { "epoch": 5.436041083099907, "grad_norm": 3.058480403398023, "learning_rate": 4.539455330854133e-06, "loss": 0.1098, "step": 5822 }, { "epoch": 5.436974789915967, "grad_norm": 0.29655940021399546, "learning_rate": 4.537949654237133e-06, "loss": 0.0062, "step": 5823 }, { "epoch": 5.437908496732026, "grad_norm": 0.9423773259034051, "learning_rate": 4.536444019879711e-06, "loss": 0.0202, "step": 5824 }, { "epoch": 5.438842203548086, "grad_norm": 1.2338820271697724, "learning_rate": 4.534938427919574e-06, "loss": 0.0452, "step": 5825 }, { "epoch": 5.439775910364146, "grad_norm": 1.6383160997710753, "learning_rate": 4.533432878494423e-06, "loss": 0.0974, "step": 5826 }, { "epoch": 5.440709617180206, "grad_norm": 1.5678965864813543, "learning_rate": 4.531927371741962e-06, "loss": 0.0551, "step": 5827 }, { "epoch": 5.441643323996265, "grad_norm": 1.172798303096114, "learning_rate": 4.530421907799883e-06, "loss": 0.0357, "step": 5828 }, { "epoch": 5.442577030812325, "grad_norm": 3.839015817519638, "learning_rate": 4.528916486805876e-06, "loss": 0.2667, "step": 5829 }, { "epoch": 5.443510737628385, "grad_norm": 1.8939192369558275, "learning_rate": 4.52741110889763e-06, "loss": 0.1001, "step": 5830 }, { "epoch": 5.444444444444445, "grad_norm": 0.31898560667253895, "learning_rate": 4.525905774212829e-06, "loss": 0.0051, "step": 5831 }, { "epoch": 5.445378151260504, "grad_norm": 0.5742751327334796, "learning_rate": 4.524400482889152e-06, "loss": 0.0129, "step": 5832 }, { "epoch": 5.446311858076564, "grad_norm": 2.754333615264745, "learning_rate": 4.522895235064272e-06, "loss": 0.1737, "step": 5833 }, { "epoch": 5.447245564892624, "grad_norm": 2.3341793228266963, "learning_rate": 4.521390030875864e-06, "loss": 0.177, "step": 5834 }, { "epoch": 5.448179271708684, "grad_norm": 2.177280063185959, "learning_rate": 4.5198848704615915e-06, "loss": 0.1719, "step": 5835 }, { "epoch": 5.449112978524743, "grad_norm": 2.612829270302747, "learning_rate": 4.518379753959122e-06, "loss": 0.0763, "step": 5836 }, { "epoch": 5.450046685340803, "grad_norm": 1.1390012580396598, "learning_rate": 4.516874681506114e-06, "loss": 0.0422, "step": 5837 }, { "epoch": 5.450980392156863, "grad_norm": 1.5738915546458456, "learning_rate": 4.51536965324022e-06, "loss": 0.0745, "step": 5838 }, { "epoch": 5.451914098972923, "grad_norm": 1.0801845259803675, "learning_rate": 4.513864669299094e-06, "loss": 0.0354, "step": 5839 }, { "epoch": 5.452847805788982, "grad_norm": 2.533727083286896, "learning_rate": 4.512359729820384e-06, "loss": 0.2338, "step": 5840 }, { "epoch": 5.453781512605042, "grad_norm": 1.7979756247616152, "learning_rate": 4.51085483494173e-06, "loss": 0.0162, "step": 5841 }, { "epoch": 5.454715219421102, "grad_norm": 3.2670524067850115, "learning_rate": 4.509349984800775e-06, "loss": 0.1967, "step": 5842 }, { "epoch": 5.455648926237162, "grad_norm": 0.7160096432270373, "learning_rate": 4.507845179535151e-06, "loss": 0.0331, "step": 5843 }, { "epoch": 5.456582633053221, "grad_norm": 1.4674231019785404, "learning_rate": 4.506340419282491e-06, "loss": 0.0706, "step": 5844 }, { "epoch": 5.457516339869281, "grad_norm": 1.5665218447011022, "learning_rate": 4.50483570418042e-06, "loss": 0.0758, "step": 5845 }, { "epoch": 5.458450046685341, "grad_norm": 1.17789255254598, "learning_rate": 4.503331034366564e-06, "loss": 0.0548, "step": 5846 }, { "epoch": 5.459383753501401, "grad_norm": 0.38046722991357285, "learning_rate": 4.501826409978537e-06, "loss": 0.0145, "step": 5847 }, { "epoch": 5.4603174603174605, "grad_norm": 2.13265620865029, "learning_rate": 4.500321831153956e-06, "loss": 0.1153, "step": 5848 }, { "epoch": 5.46125116713352, "grad_norm": 2.901588437672319, "learning_rate": 4.498817298030432e-06, "loss": 0.0956, "step": 5849 }, { "epoch": 5.46218487394958, "grad_norm": 0.8070830799389206, "learning_rate": 4.49731281074557e-06, "loss": 0.0173, "step": 5850 }, { "epoch": 5.46311858076564, "grad_norm": 1.7759376170563086, "learning_rate": 4.495808369436973e-06, "loss": 0.059, "step": 5851 }, { "epoch": 5.4640522875816995, "grad_norm": 1.1980011936483526, "learning_rate": 4.494303974242234e-06, "loss": 0.0583, "step": 5852 }, { "epoch": 5.464985994397759, "grad_norm": 1.0467155441425247, "learning_rate": 4.492799625298953e-06, "loss": 0.0471, "step": 5853 }, { "epoch": 5.465919701213819, "grad_norm": 0.42463905197251567, "learning_rate": 4.491295322744717e-06, "loss": 0.0157, "step": 5854 }, { "epoch": 5.466853408029879, "grad_norm": 2.021737382376963, "learning_rate": 4.489791066717109e-06, "loss": 0.0767, "step": 5855 }, { "epoch": 5.4677871148459385, "grad_norm": 3.210030396613064, "learning_rate": 4.48828685735371e-06, "loss": 0.1556, "step": 5856 }, { "epoch": 5.468720821661998, "grad_norm": 1.816029978528517, "learning_rate": 4.4867826947920996e-06, "loss": 0.0695, "step": 5857 }, { "epoch": 5.469654528478058, "grad_norm": 0.7510500607893461, "learning_rate": 4.4852785791698474e-06, "loss": 0.0069, "step": 5858 }, { "epoch": 5.470588235294118, "grad_norm": 2.3875309670291576, "learning_rate": 4.483774510624523e-06, "loss": 0.1208, "step": 5859 }, { "epoch": 5.4715219421101775, "grad_norm": 1.3404107633708289, "learning_rate": 4.482270489293685e-06, "loss": 0.0989, "step": 5860 }, { "epoch": 5.472455648926237, "grad_norm": 1.9559678031830658, "learning_rate": 4.480766515314899e-06, "loss": 0.1264, "step": 5861 }, { "epoch": 5.473389355742297, "grad_norm": 1.3141170627595742, "learning_rate": 4.479262588825719e-06, "loss": 0.0747, "step": 5862 }, { "epoch": 5.474323062558357, "grad_norm": 1.687300916273836, "learning_rate": 4.477758709963692e-06, "loss": 0.0845, "step": 5863 }, { "epoch": 5.4752567693744165, "grad_norm": 0.5891952928274604, "learning_rate": 4.4762548788663655e-06, "loss": 0.0172, "step": 5864 }, { "epoch": 5.476190476190476, "grad_norm": 1.9010201471586987, "learning_rate": 4.474751095671283e-06, "loss": 0.067, "step": 5865 }, { "epoch": 5.477124183006536, "grad_norm": 3.467141066605716, "learning_rate": 4.4732473605159825e-06, "loss": 0.1747, "step": 5866 }, { "epoch": 5.478057889822596, "grad_norm": 0.7415233645295487, "learning_rate": 4.471743673537994e-06, "loss": 0.0177, "step": 5867 }, { "epoch": 5.4789915966386555, "grad_norm": 1.6332376934875916, "learning_rate": 4.47024003487485e-06, "loss": 0.0735, "step": 5868 }, { "epoch": 5.479925303454715, "grad_norm": 2.2774931926291897, "learning_rate": 4.4687364446640685e-06, "loss": 0.1113, "step": 5869 }, { "epoch": 5.480859010270775, "grad_norm": 0.49707656500758, "learning_rate": 4.4672329030431765e-06, "loss": 0.0077, "step": 5870 }, { "epoch": 5.481792717086835, "grad_norm": 0.2065601699756661, "learning_rate": 4.465729410149687e-06, "loss": 0.0026, "step": 5871 }, { "epoch": 5.4827264239028946, "grad_norm": 1.5047834274004437, "learning_rate": 4.4642259661211095e-06, "loss": 0.1004, "step": 5872 }, { "epoch": 5.483660130718954, "grad_norm": 2.8319222827819317, "learning_rate": 4.462722571094949e-06, "loss": 0.1291, "step": 5873 }, { "epoch": 5.484593837535014, "grad_norm": 2.0895007204908156, "learning_rate": 4.461219225208712e-06, "loss": 0.0915, "step": 5874 }, { "epoch": 5.485527544351074, "grad_norm": 3.458124739735084, "learning_rate": 4.459715928599894e-06, "loss": 0.1621, "step": 5875 }, { "epoch": 5.486461251167134, "grad_norm": 3.4323329816079995, "learning_rate": 4.458212681405988e-06, "loss": 0.0626, "step": 5876 }, { "epoch": 5.487394957983193, "grad_norm": 2.232908564396222, "learning_rate": 4.45670948376448e-06, "loss": 0.0524, "step": 5877 }, { "epoch": 5.488328664799253, "grad_norm": 2.2160294655563617, "learning_rate": 4.4552063358128585e-06, "loss": 0.1271, "step": 5878 }, { "epoch": 5.489262371615313, "grad_norm": 0.576486451289177, "learning_rate": 4.4537032376886e-06, "loss": 0.0259, "step": 5879 }, { "epoch": 5.490196078431373, "grad_norm": 0.28740925348655144, "learning_rate": 4.452200189529179e-06, "loss": 0.0093, "step": 5880 }, { "epoch": 5.491129785247432, "grad_norm": 2.619992808749876, "learning_rate": 4.450697191472068e-06, "loss": 0.0966, "step": 5881 }, { "epoch": 5.492063492063492, "grad_norm": 0.0557571000297999, "learning_rate": 4.4491942436547294e-06, "loss": 0.0005, "step": 5882 }, { "epoch": 5.492997198879552, "grad_norm": 1.4590730670774559, "learning_rate": 4.447691346214627e-06, "loss": 0.0805, "step": 5883 }, { "epoch": 5.493930905695612, "grad_norm": 2.1849725489772633, "learning_rate": 4.446188499289217e-06, "loss": 0.0691, "step": 5884 }, { "epoch": 5.494864612511671, "grad_norm": 3.5043891257897926, "learning_rate": 4.444685703015951e-06, "loss": 0.1832, "step": 5885 }, { "epoch": 5.495798319327731, "grad_norm": 0.21609613191060317, "learning_rate": 4.443182957532273e-06, "loss": 0.0032, "step": 5886 }, { "epoch": 5.496732026143791, "grad_norm": 2.9849541025692714, "learning_rate": 4.441680262975632e-06, "loss": 0.1848, "step": 5887 }, { "epoch": 5.497665732959851, "grad_norm": 4.154784985198993, "learning_rate": 4.4401776194834615e-06, "loss": 0.1398, "step": 5888 }, { "epoch": 5.49859943977591, "grad_norm": 0.7645839441485583, "learning_rate": 4.438675027193196e-06, "loss": 0.0247, "step": 5889 }, { "epoch": 5.49953314659197, "grad_norm": 2.171173204292812, "learning_rate": 4.4371724862422616e-06, "loss": 0.1329, "step": 5890 }, { "epoch": 5.50046685340803, "grad_norm": 0.8538802132735281, "learning_rate": 4.435669996768086e-06, "loss": 0.0204, "step": 5891 }, { "epoch": 5.50140056022409, "grad_norm": 1.195407336686722, "learning_rate": 4.434167558908087e-06, "loss": 0.0474, "step": 5892 }, { "epoch": 5.502334267040149, "grad_norm": 3.3373375962083083, "learning_rate": 4.43266517279968e-06, "loss": 0.1784, "step": 5893 }, { "epoch": 5.503267973856209, "grad_norm": 1.3471409188357677, "learning_rate": 4.431162838580269e-06, "loss": 0.076, "step": 5894 }, { "epoch": 5.504201680672269, "grad_norm": 0.18509735531244123, "learning_rate": 4.4296605563872675e-06, "loss": 0.0101, "step": 5895 }, { "epoch": 5.505135387488329, "grad_norm": 1.1909633874900634, "learning_rate": 4.4281583263580705e-06, "loss": 0.0247, "step": 5896 }, { "epoch": 5.506069094304388, "grad_norm": 3.229490430134321, "learning_rate": 4.426656148630075e-06, "loss": 0.1286, "step": 5897 }, { "epoch": 5.507002801120448, "grad_norm": 0.7230692765942038, "learning_rate": 4.425154023340671e-06, "loss": 0.02, "step": 5898 }, { "epoch": 5.507936507936508, "grad_norm": 0.9046140001186639, "learning_rate": 4.423651950627242e-06, "loss": 0.028, "step": 5899 }, { "epoch": 5.508870214752568, "grad_norm": 2.570380235991612, "learning_rate": 4.422149930627175e-06, "loss": 0.1393, "step": 5900 }, { "epoch": 5.509803921568627, "grad_norm": 4.35030175794801, "learning_rate": 4.420647963477842e-06, "loss": 0.1598, "step": 5901 }, { "epoch": 5.510737628384687, "grad_norm": 2.3097133858245016, "learning_rate": 4.419146049316614e-06, "loss": 0.109, "step": 5902 }, { "epoch": 5.511671335200747, "grad_norm": 1.8107389494953436, "learning_rate": 4.417644188280859e-06, "loss": 0.0846, "step": 5903 }, { "epoch": 5.512605042016807, "grad_norm": 1.1308321097875527, "learning_rate": 4.4161423805079385e-06, "loss": 0.031, "step": 5904 }, { "epoch": 5.513538748832866, "grad_norm": 2.200145827756268, "learning_rate": 4.41464062613521e-06, "loss": 0.1471, "step": 5905 }, { "epoch": 5.514472455648926, "grad_norm": 1.2891277707234299, "learning_rate": 4.413138925300025e-06, "loss": 0.0572, "step": 5906 }, { "epoch": 5.515406162464986, "grad_norm": 0.5372138148634642, "learning_rate": 4.4116372781397276e-06, "loss": 0.0178, "step": 5907 }, { "epoch": 5.516339869281046, "grad_norm": 1.7324041475567682, "learning_rate": 4.4101356847916656e-06, "loss": 0.078, "step": 5908 }, { "epoch": 5.5172735760971054, "grad_norm": 2.690548144090963, "learning_rate": 4.408634145393172e-06, "loss": 0.1489, "step": 5909 }, { "epoch": 5.518207282913165, "grad_norm": 1.2904910233933702, "learning_rate": 4.40713266008158e-06, "loss": 0.0589, "step": 5910 }, { "epoch": 5.519140989729225, "grad_norm": 0.9653405594294224, "learning_rate": 4.405631228994217e-06, "loss": 0.022, "step": 5911 }, { "epoch": 5.520074696545285, "grad_norm": 1.6085335435818213, "learning_rate": 4.4041298522684054e-06, "loss": 0.0936, "step": 5912 }, { "epoch": 5.5210084033613445, "grad_norm": 1.77891421447668, "learning_rate": 4.402628530041464e-06, "loss": 0.1126, "step": 5913 }, { "epoch": 5.521942110177404, "grad_norm": 4.436770298499618, "learning_rate": 4.401127262450702e-06, "loss": 0.1369, "step": 5914 }, { "epoch": 5.522875816993464, "grad_norm": 4.741387763324295, "learning_rate": 4.39962604963343e-06, "loss": 0.1892, "step": 5915 }, { "epoch": 5.523809523809524, "grad_norm": 1.3096093631596162, "learning_rate": 4.398124891726948e-06, "loss": 0.0349, "step": 5916 }, { "epoch": 5.5247432306255835, "grad_norm": 0.6137810092585386, "learning_rate": 4.396623788868556e-06, "loss": 0.017, "step": 5917 }, { "epoch": 5.525676937441643, "grad_norm": 1.3524970711652582, "learning_rate": 4.395122741195543e-06, "loss": 0.0449, "step": 5918 }, { "epoch": 5.526610644257703, "grad_norm": 0.8600902102273749, "learning_rate": 4.3936217488452e-06, "loss": 0.0299, "step": 5919 }, { "epoch": 5.527544351073763, "grad_norm": 0.48148075041223937, "learning_rate": 4.392120811954806e-06, "loss": 0.0129, "step": 5920 }, { "epoch": 5.5284780578898225, "grad_norm": 3.4446692840180733, "learning_rate": 4.3906199306616395e-06, "loss": 0.1535, "step": 5921 }, { "epoch": 5.529411764705882, "grad_norm": 2.557338365144592, "learning_rate": 4.389119105102973e-06, "loss": 0.1071, "step": 5922 }, { "epoch": 5.530345471521942, "grad_norm": 1.1107255769811444, "learning_rate": 4.387618335416073e-06, "loss": 0.049, "step": 5923 }, { "epoch": 5.531279178338002, "grad_norm": 0.7921163943372531, "learning_rate": 4.386117621738202e-06, "loss": 0.0133, "step": 5924 }, { "epoch": 5.5322128851540615, "grad_norm": 0.5651429425289791, "learning_rate": 4.384616964206615e-06, "loss": 0.0202, "step": 5925 }, { "epoch": 5.533146591970121, "grad_norm": 1.5805422715632167, "learning_rate": 4.3831163629585654e-06, "loss": 0.0942, "step": 5926 }, { "epoch": 5.534080298786181, "grad_norm": 1.7345808436185555, "learning_rate": 4.3816158181312995e-06, "loss": 0.0702, "step": 5927 }, { "epoch": 5.535014005602241, "grad_norm": 0.4338933912781493, "learning_rate": 4.380115329862056e-06, "loss": 0.0091, "step": 5928 }, { "epoch": 5.5359477124183005, "grad_norm": 1.7292655274759832, "learning_rate": 4.378614898288075e-06, "loss": 0.0679, "step": 5929 }, { "epoch": 5.53688141923436, "grad_norm": 1.100106379770186, "learning_rate": 4.377114523546586e-06, "loss": 0.0417, "step": 5930 }, { "epoch": 5.53781512605042, "grad_norm": 0.685503179581347, "learning_rate": 4.375614205774815e-06, "loss": 0.0308, "step": 5931 }, { "epoch": 5.53874883286648, "grad_norm": 1.7027239330607622, "learning_rate": 4.3741139451099816e-06, "loss": 0.05, "step": 5932 }, { "epoch": 5.5396825396825395, "grad_norm": 1.4980375393946364, "learning_rate": 4.3726137416892985e-06, "loss": 0.1057, "step": 5933 }, { "epoch": 5.540616246498599, "grad_norm": 1.296621374089843, "learning_rate": 4.3711135956499816e-06, "loss": 0.0447, "step": 5934 }, { "epoch": 5.541549953314659, "grad_norm": 1.1749206106880319, "learning_rate": 4.369613507129232e-06, "loss": 0.043, "step": 5935 }, { "epoch": 5.542483660130719, "grad_norm": 1.9264029359476083, "learning_rate": 4.3681134762642505e-06, "loss": 0.0674, "step": 5936 }, { "epoch": 5.543417366946779, "grad_norm": 1.0490103622752218, "learning_rate": 4.366613503192229e-06, "loss": 0.0348, "step": 5937 }, { "epoch": 5.544351073762838, "grad_norm": 2.6659119143588033, "learning_rate": 4.36511358805036e-06, "loss": 0.1451, "step": 5938 }, { "epoch": 5.545284780578898, "grad_norm": 1.4659137898838783, "learning_rate": 4.363613730975827e-06, "loss": 0.0545, "step": 5939 }, { "epoch": 5.546218487394958, "grad_norm": 0.8389582606928282, "learning_rate": 4.3621139321058065e-06, "loss": 0.0226, "step": 5940 }, { "epoch": 5.547152194211018, "grad_norm": 3.0596268729080447, "learning_rate": 4.3606141915774695e-06, "loss": 0.0108, "step": 5941 }, { "epoch": 5.548085901027077, "grad_norm": 1.4025077929439556, "learning_rate": 4.35911450952799e-06, "loss": 0.0249, "step": 5942 }, { "epoch": 5.549019607843137, "grad_norm": 1.8311002190275556, "learning_rate": 4.357614886094526e-06, "loss": 0.0672, "step": 5943 }, { "epoch": 5.549953314659197, "grad_norm": 1.5950833262182305, "learning_rate": 4.356115321414235e-06, "loss": 0.0461, "step": 5944 }, { "epoch": 5.550887021475257, "grad_norm": 2.383737811989289, "learning_rate": 4.354615815624267e-06, "loss": 0.0411, "step": 5945 }, { "epoch": 5.551820728291316, "grad_norm": 1.54583745860299, "learning_rate": 4.353116368861772e-06, "loss": 0.067, "step": 5946 }, { "epoch": 5.552754435107376, "grad_norm": 2.1834853969439134, "learning_rate": 4.351616981263889e-06, "loss": 0.0566, "step": 5947 }, { "epoch": 5.553688141923436, "grad_norm": 1.7300210614625904, "learning_rate": 4.350117652967756e-06, "loss": 0.0875, "step": 5948 }, { "epoch": 5.554621848739496, "grad_norm": 1.3052806751991102, "learning_rate": 4.348618384110497e-06, "loss": 0.0527, "step": 5949 }, { "epoch": 5.555555555555555, "grad_norm": 0.3271319671127631, "learning_rate": 4.347119174829242e-06, "loss": 0.0059, "step": 5950 }, { "epoch": 5.556489262371615, "grad_norm": 3.0027169048586813, "learning_rate": 4.345620025261108e-06, "loss": 0.0606, "step": 5951 }, { "epoch": 5.557422969187675, "grad_norm": 2.2148847336880624, "learning_rate": 4.34412093554321e-06, "loss": 0.1133, "step": 5952 }, { "epoch": 5.558356676003735, "grad_norm": 0.8929232595297066, "learning_rate": 4.342621905812654e-06, "loss": 0.0304, "step": 5953 }, { "epoch": 5.559290382819794, "grad_norm": 3.308450160277385, "learning_rate": 4.341122936206545e-06, "loss": 0.1636, "step": 5954 }, { "epoch": 5.560224089635854, "grad_norm": 1.4283115648738214, "learning_rate": 4.3396240268619785e-06, "loss": 0.0938, "step": 5955 }, { "epoch": 5.561157796451914, "grad_norm": 2.5927187332921973, "learning_rate": 4.338125177916047e-06, "loss": 0.167, "step": 5956 }, { "epoch": 5.562091503267974, "grad_norm": 0.5426170967553618, "learning_rate": 4.336626389505838e-06, "loss": 0.0135, "step": 5957 }, { "epoch": 5.563025210084033, "grad_norm": 0.34717200587799785, "learning_rate": 4.335127661768429e-06, "loss": 0.0101, "step": 5958 }, { "epoch": 5.563958916900093, "grad_norm": 0.5094180918439969, "learning_rate": 4.333628994840899e-06, "loss": 0.0082, "step": 5959 }, { "epoch": 5.564892623716153, "grad_norm": 0.963045436713249, "learning_rate": 4.332130388860314e-06, "loss": 0.0358, "step": 5960 }, { "epoch": 5.565826330532213, "grad_norm": 4.130866659931569, "learning_rate": 4.330631843963741e-06, "loss": 0.1498, "step": 5961 }, { "epoch": 5.566760037348272, "grad_norm": 1.1714430792664623, "learning_rate": 4.329133360288235e-06, "loss": 0.0331, "step": 5962 }, { "epoch": 5.567693744164332, "grad_norm": 3.9591644517027946, "learning_rate": 4.327634937970851e-06, "loss": 0.2171, "step": 5963 }, { "epoch": 5.568627450980392, "grad_norm": 4.418777266423285, "learning_rate": 4.326136577148637e-06, "loss": 0.1643, "step": 5964 }, { "epoch": 5.569561157796452, "grad_norm": 4.330565640540708, "learning_rate": 4.324638277958634e-06, "loss": 0.1768, "step": 5965 }, { "epoch": 5.570494864612511, "grad_norm": 1.2559379567478215, "learning_rate": 4.323140040537877e-06, "loss": 0.0279, "step": 5966 }, { "epoch": 5.571428571428571, "grad_norm": 1.6629861802857941, "learning_rate": 4.321641865023394e-06, "loss": 0.0198, "step": 5967 }, { "epoch": 5.572362278244631, "grad_norm": 1.7886789129264677, "learning_rate": 4.320143751552214e-06, "loss": 0.0493, "step": 5968 }, { "epoch": 5.573295985060691, "grad_norm": 1.27878457518779, "learning_rate": 4.318645700261354e-06, "loss": 0.0736, "step": 5969 }, { "epoch": 5.57422969187675, "grad_norm": 1.7172999650888645, "learning_rate": 4.317147711287829e-06, "loss": 0.1053, "step": 5970 }, { "epoch": 5.57516339869281, "grad_norm": 0.7234386793423929, "learning_rate": 4.315649784768642e-06, "loss": 0.0225, "step": 5971 }, { "epoch": 5.57609710550887, "grad_norm": 5.613503058189799, "learning_rate": 4.3141519208408e-06, "loss": 0.1077, "step": 5972 }, { "epoch": 5.57703081232493, "grad_norm": 0.9910768357067142, "learning_rate": 4.312654119641296e-06, "loss": 0.0149, "step": 5973 }, { "epoch": 5.5779645191409895, "grad_norm": 2.66045907955561, "learning_rate": 4.311156381307122e-06, "loss": 0.1243, "step": 5974 }, { "epoch": 5.578898225957049, "grad_norm": 1.8995847886125972, "learning_rate": 4.309658705975259e-06, "loss": 0.1187, "step": 5975 }, { "epoch": 5.579831932773109, "grad_norm": 5.35231521015998, "learning_rate": 4.308161093782691e-06, "loss": 0.0538, "step": 5976 }, { "epoch": 5.580765639589169, "grad_norm": 0.4929691882749936, "learning_rate": 4.30666354486639e-06, "loss": 0.0194, "step": 5977 }, { "epoch": 5.5816993464052285, "grad_norm": 4.266950060959774, "learning_rate": 4.30516605936332e-06, "loss": 0.2428, "step": 5978 }, { "epoch": 5.582633053221288, "grad_norm": 1.4318034995733244, "learning_rate": 4.303668637410444e-06, "loss": 0.0652, "step": 5979 }, { "epoch": 5.583566760037348, "grad_norm": 0.7265167671003325, "learning_rate": 4.30217127914472e-06, "loss": 0.0125, "step": 5980 }, { "epoch": 5.584500466853408, "grad_norm": 2.7631872115733045, "learning_rate": 4.300673984703096e-06, "loss": 0.0862, "step": 5981 }, { "epoch": 5.5854341736694675, "grad_norm": 3.6371121862258713, "learning_rate": 4.299176754222516e-06, "loss": 0.1441, "step": 5982 }, { "epoch": 5.586367880485527, "grad_norm": 0.7771813332776081, "learning_rate": 4.297679587839919e-06, "loss": 0.0343, "step": 5983 }, { "epoch": 5.587301587301587, "grad_norm": 4.347579381084426, "learning_rate": 4.296182485692234e-06, "loss": 0.2185, "step": 5984 }, { "epoch": 5.588235294117647, "grad_norm": 2.370452925639974, "learning_rate": 4.294685447916391e-06, "loss": 0.122, "step": 5985 }, { "epoch": 5.5891690009337065, "grad_norm": 2.6010777625178996, "learning_rate": 4.293188474649311e-06, "loss": 0.1434, "step": 5986 }, { "epoch": 5.590102707749766, "grad_norm": 1.852152074396773, "learning_rate": 4.291691566027907e-06, "loss": 0.0543, "step": 5987 }, { "epoch": 5.591036414565826, "grad_norm": 1.1646414199225457, "learning_rate": 4.290194722189086e-06, "loss": 0.0217, "step": 5988 }, { "epoch": 5.591970121381886, "grad_norm": 0.9722373397034133, "learning_rate": 4.288697943269754e-06, "loss": 0.0382, "step": 5989 }, { "epoch": 5.5929038281979455, "grad_norm": 3.05162557320261, "learning_rate": 4.287201229406807e-06, "loss": 0.0186, "step": 5990 }, { "epoch": 5.593837535014005, "grad_norm": 4.7953479807328785, "learning_rate": 4.285704580737135e-06, "loss": 0.2674, "step": 5991 }, { "epoch": 5.594771241830065, "grad_norm": 0.47621715302986894, "learning_rate": 4.284207997397623e-06, "loss": 0.0148, "step": 5992 }, { "epoch": 5.595704948646125, "grad_norm": 1.3690404087504082, "learning_rate": 4.2827114795251514e-06, "loss": 0.061, "step": 5993 }, { "epoch": 5.5966386554621845, "grad_norm": 0.5398354177371878, "learning_rate": 4.281215027256592e-06, "loss": 0.0247, "step": 5994 }, { "epoch": 5.597572362278244, "grad_norm": 2.826574861898883, "learning_rate": 4.279718640728813e-06, "loss": 0.1914, "step": 5995 }, { "epoch": 5.598506069094304, "grad_norm": 2.446805506962378, "learning_rate": 4.278222320078673e-06, "loss": 0.1243, "step": 5996 }, { "epoch": 5.599439775910364, "grad_norm": 0.22098781262947376, "learning_rate": 4.27672606544303e-06, "loss": 0.0029, "step": 5997 }, { "epoch": 5.6003734827264235, "grad_norm": 2.1075806901713903, "learning_rate": 4.275229876958729e-06, "loss": 0.0998, "step": 5998 }, { "epoch": 5.601307189542483, "grad_norm": 1.6856654928357777, "learning_rate": 4.273733754762617e-06, "loss": 0.1007, "step": 5999 }, { "epoch": 5.602240896358543, "grad_norm": 0.6371452487084975, "learning_rate": 4.27223769899153e-06, "loss": 0.0237, "step": 6000 }, { "epoch": 5.603174603174603, "grad_norm": 0.9695768416565503, "learning_rate": 4.270741709782294e-06, "loss": 0.05, "step": 6001 }, { "epoch": 5.604108309990663, "grad_norm": 1.094507952009289, "learning_rate": 4.26924578727174e-06, "loss": 0.0561, "step": 6002 }, { "epoch": 5.605042016806722, "grad_norm": 1.184470199679986, "learning_rate": 4.2677499315966845e-06, "loss": 0.0637, "step": 6003 }, { "epoch": 5.605975723622782, "grad_norm": 3.7770844509180956, "learning_rate": 4.266254142893938e-06, "loss": 0.1437, "step": 6004 }, { "epoch": 5.606909430438842, "grad_norm": 1.367475672148653, "learning_rate": 4.264758421300308e-06, "loss": 0.0605, "step": 6005 }, { "epoch": 5.607843137254902, "grad_norm": 0.5284354295712539, "learning_rate": 4.2632627669525954e-06, "loss": 0.0177, "step": 6006 }, { "epoch": 5.608776844070961, "grad_norm": 1.1307279148400313, "learning_rate": 4.261767179987595e-06, "loss": 0.0336, "step": 6007 }, { "epoch": 5.609710550887021, "grad_norm": 1.0379698106074602, "learning_rate": 4.2602716605420916e-06, "loss": 0.0385, "step": 6008 }, { "epoch": 5.610644257703081, "grad_norm": 2.6472176361320643, "learning_rate": 4.258776208752868e-06, "loss": 0.1556, "step": 6009 }, { "epoch": 5.611577964519141, "grad_norm": 2.4988069513404554, "learning_rate": 4.257280824756702e-06, "loss": 0.1076, "step": 6010 }, { "epoch": 5.6125116713352, "grad_norm": 0.9267842565576724, "learning_rate": 4.25578550869036e-06, "loss": 0.0358, "step": 6011 }, { "epoch": 5.61344537815126, "grad_norm": 0.3607154195020065, "learning_rate": 4.254290260690607e-06, "loss": 0.0073, "step": 6012 }, { "epoch": 5.61437908496732, "grad_norm": 1.260631272413052, "learning_rate": 4.2527950808941976e-06, "loss": 0.0577, "step": 6013 }, { "epoch": 5.61531279178338, "grad_norm": 1.4170899062888092, "learning_rate": 4.251299969437885e-06, "loss": 0.0685, "step": 6014 }, { "epoch": 5.616246498599439, "grad_norm": 1.9797987615902475, "learning_rate": 4.249804926458413e-06, "loss": 0.1031, "step": 6015 }, { "epoch": 5.617180205415499, "grad_norm": 3.5514475207416694, "learning_rate": 4.248309952092519e-06, "loss": 0.16, "step": 6016 }, { "epoch": 5.618113912231559, "grad_norm": 3.366852348258023, "learning_rate": 4.246815046476935e-06, "loss": 0.2118, "step": 6017 }, { "epoch": 5.619047619047619, "grad_norm": 1.6239463352671324, "learning_rate": 4.245320209748384e-06, "loss": 0.0986, "step": 6018 }, { "epoch": 5.619981325863678, "grad_norm": 0.41508223257853355, "learning_rate": 4.2438254420435905e-06, "loss": 0.0094, "step": 6019 }, { "epoch": 5.620915032679738, "grad_norm": 2.758572670883159, "learning_rate": 4.242330743499264e-06, "loss": 0.1752, "step": 6020 }, { "epoch": 5.621848739495798, "grad_norm": 2.0618407092598785, "learning_rate": 4.240836114252112e-06, "loss": 0.0699, "step": 6021 }, { "epoch": 5.622782446311858, "grad_norm": 0.44414660248261023, "learning_rate": 4.2393415544388325e-06, "loss": 0.0072, "step": 6022 }, { "epoch": 5.623716153127917, "grad_norm": 1.5213064077657117, "learning_rate": 4.237847064196124e-06, "loss": 0.0089, "step": 6023 }, { "epoch": 5.624649859943977, "grad_norm": 1.185917935396569, "learning_rate": 4.2363526436606705e-06, "loss": 0.0401, "step": 6024 }, { "epoch": 5.625583566760037, "grad_norm": 2.5033743322280646, "learning_rate": 4.234858292969155e-06, "loss": 0.1112, "step": 6025 }, { "epoch": 5.626517273576097, "grad_norm": 0.4677849650100268, "learning_rate": 4.233364012258249e-06, "loss": 0.0134, "step": 6026 }, { "epoch": 5.627450980392156, "grad_norm": 1.742217860912372, "learning_rate": 4.231869801664625e-06, "loss": 0.0885, "step": 6027 }, { "epoch": 5.628384687208216, "grad_norm": 0.29039388603375704, "learning_rate": 4.230375661324944e-06, "loss": 0.0142, "step": 6028 }, { "epoch": 5.629318394024276, "grad_norm": 2.3726701880508303, "learning_rate": 4.22888159137586e-06, "loss": 0.0797, "step": 6029 }, { "epoch": 5.630252100840336, "grad_norm": 4.598288296924474, "learning_rate": 4.2273875919540225e-06, "loss": 0.2002, "step": 6030 }, { "epoch": 5.631185807656395, "grad_norm": 0.8175329129410073, "learning_rate": 4.225893663196075e-06, "loss": 0.01, "step": 6031 }, { "epoch": 5.632119514472455, "grad_norm": 0.9030162731191768, "learning_rate": 4.224399805238653e-06, "loss": 0.0315, "step": 6032 }, { "epoch": 5.633053221288515, "grad_norm": 3.1182148723190517, "learning_rate": 4.2229060182183865e-06, "loss": 0.0676, "step": 6033 }, { "epoch": 5.633986928104575, "grad_norm": 0.7944855209498077, "learning_rate": 4.221412302271899e-06, "loss": 0.0168, "step": 6034 }, { "epoch": 5.634920634920634, "grad_norm": 2.6634608718393036, "learning_rate": 4.219918657535806e-06, "loss": 0.1722, "step": 6035 }, { "epoch": 5.635854341736694, "grad_norm": 3.249030043142306, "learning_rate": 4.21842508414672e-06, "loss": 0.1469, "step": 6036 }, { "epoch": 5.636788048552754, "grad_norm": 0.5514753188826236, "learning_rate": 4.216931582241242e-06, "loss": 0.0101, "step": 6037 }, { "epoch": 5.637721755368814, "grad_norm": 1.1741508121086321, "learning_rate": 4.215438151955972e-06, "loss": 0.0642, "step": 6038 }, { "epoch": 5.6386554621848735, "grad_norm": 0.9457853471864986, "learning_rate": 4.2139447934274964e-06, "loss": 0.0213, "step": 6039 }, { "epoch": 5.639589169000933, "grad_norm": 2.104096207351832, "learning_rate": 4.212451506792405e-06, "loss": 0.0757, "step": 6040 }, { "epoch": 5.640522875816993, "grad_norm": 2.0041571266584506, "learning_rate": 4.210958292187271e-06, "loss": 0.1013, "step": 6041 }, { "epoch": 5.641456582633054, "grad_norm": 0.7052753074808323, "learning_rate": 4.209465149748668e-06, "loss": 0.0284, "step": 6042 }, { "epoch": 5.642390289449113, "grad_norm": 0.5122296666951442, "learning_rate": 4.2079720796131555e-06, "loss": 0.0144, "step": 6043 }, { "epoch": 5.643323996265173, "grad_norm": 1.4528186301106911, "learning_rate": 4.206479081917296e-06, "loss": 0.0291, "step": 6044 }, { "epoch": 5.644257703081233, "grad_norm": 0.8413679474010133, "learning_rate": 4.2049861567976405e-06, "loss": 0.0205, "step": 6045 }, { "epoch": 5.645191409897293, "grad_norm": 2.1509518834062145, "learning_rate": 4.20349330439073e-06, "loss": 0.0922, "step": 6046 }, { "epoch": 5.646125116713352, "grad_norm": 0.3702502533930336, "learning_rate": 4.2020005248331056e-06, "loss": 0.0051, "step": 6047 }, { "epoch": 5.647058823529412, "grad_norm": 1.2932669377372568, "learning_rate": 4.2005078182612945e-06, "loss": 0.014, "step": 6048 }, { "epoch": 5.647992530345472, "grad_norm": 0.563354065135735, "learning_rate": 4.199015184811825e-06, "loss": 0.0194, "step": 6049 }, { "epoch": 5.648926237161532, "grad_norm": 2.5088882474909546, "learning_rate": 4.197522624621213e-06, "loss": 0.1233, "step": 6050 }, { "epoch": 5.649859943977591, "grad_norm": 1.5885756005865206, "learning_rate": 4.19603013782597e-06, "loss": 0.0691, "step": 6051 }, { "epoch": 5.650793650793651, "grad_norm": 1.2558449171088741, "learning_rate": 4.194537724562599e-06, "loss": 0.0523, "step": 6052 }, { "epoch": 5.651727357609711, "grad_norm": 0.397119516706868, "learning_rate": 4.1930453849676e-06, "loss": 0.0068, "step": 6053 }, { "epoch": 5.652661064425771, "grad_norm": 1.071242982484647, "learning_rate": 4.191553119177462e-06, "loss": 0.0425, "step": 6054 }, { "epoch": 5.65359477124183, "grad_norm": 3.149163601440093, "learning_rate": 4.190060927328671e-06, "loss": 0.1235, "step": 6055 }, { "epoch": 5.65452847805789, "grad_norm": 1.9151412512138895, "learning_rate": 4.188568809557701e-06, "loss": 0.081, "step": 6056 }, { "epoch": 5.65546218487395, "grad_norm": 1.3343335270007275, "learning_rate": 4.187076766001025e-06, "loss": 0.0519, "step": 6057 }, { "epoch": 5.65639589169001, "grad_norm": 1.497019182774272, "learning_rate": 4.185584796795107e-06, "loss": 0.0728, "step": 6058 }, { "epoch": 5.657329598506069, "grad_norm": 1.8364992248279979, "learning_rate": 4.184092902076403e-06, "loss": 0.0754, "step": 6059 }, { "epoch": 5.658263305322129, "grad_norm": 1.341391953738735, "learning_rate": 4.182601081981363e-06, "loss": 0.0414, "step": 6060 }, { "epoch": 5.659197012138189, "grad_norm": 4.322378178468827, "learning_rate": 4.181109336646431e-06, "loss": 0.1687, "step": 6061 }, { "epoch": 5.660130718954249, "grad_norm": 2.060233088380612, "learning_rate": 4.179617666208043e-06, "loss": 0.1293, "step": 6062 }, { "epoch": 5.661064425770308, "grad_norm": 1.345931912907466, "learning_rate": 4.17812607080263e-06, "loss": 0.0455, "step": 6063 }, { "epoch": 5.661998132586368, "grad_norm": 0.8229752714678977, "learning_rate": 4.176634550566613e-06, "loss": 0.0206, "step": 6064 }, { "epoch": 5.662931839402428, "grad_norm": 0.5767690407295724, "learning_rate": 4.175143105636408e-06, "loss": 0.011, "step": 6065 }, { "epoch": 5.663865546218488, "grad_norm": 0.31677009640724685, "learning_rate": 4.173651736148425e-06, "loss": 0.0047, "step": 6066 }, { "epoch": 5.6647992530345475, "grad_norm": 2.0294642622124375, "learning_rate": 4.1721604422390665e-06, "loss": 0.1157, "step": 6067 }, { "epoch": 5.665732959850607, "grad_norm": 1.7069784909305576, "learning_rate": 4.170669224044726e-06, "loss": 0.0559, "step": 6068 }, { "epoch": 5.666666666666667, "grad_norm": 2.1537078675516375, "learning_rate": 4.169178081701792e-06, "loss": 0.0621, "step": 6069 }, { "epoch": 5.667600373482727, "grad_norm": 0.686509342461989, "learning_rate": 4.167687015346647e-06, "loss": 0.0197, "step": 6070 }, { "epoch": 5.6685340802987865, "grad_norm": 1.1018186054270074, "learning_rate": 4.166196025115666e-06, "loss": 0.0261, "step": 6071 }, { "epoch": 5.669467787114846, "grad_norm": 3.852555511624845, "learning_rate": 4.164705111145215e-06, "loss": 0.1421, "step": 6072 }, { "epoch": 5.670401493930906, "grad_norm": 2.4767283065260774, "learning_rate": 4.163214273571654e-06, "loss": 0.0547, "step": 6073 }, { "epoch": 5.671335200746966, "grad_norm": 0.3957728405664105, "learning_rate": 4.161723512531337e-06, "loss": 0.02, "step": 6074 }, { "epoch": 5.6722689075630255, "grad_norm": 3.9966620578193, "learning_rate": 4.1602328281606115e-06, "loss": 0.0864, "step": 6075 }, { "epoch": 5.673202614379085, "grad_norm": 0.8072828497174209, "learning_rate": 4.158742220595817e-06, "loss": 0.0221, "step": 6076 }, { "epoch": 5.674136321195145, "grad_norm": 0.707585237194313, "learning_rate": 4.1572516899732845e-06, "loss": 0.0217, "step": 6077 }, { "epoch": 5.675070028011205, "grad_norm": 0.7515555403318113, "learning_rate": 4.1557612364293395e-06, "loss": 0.0159, "step": 6078 }, { "epoch": 5.6760037348272645, "grad_norm": 1.4833721894088223, "learning_rate": 4.154270860100302e-06, "loss": 0.06, "step": 6079 }, { "epoch": 5.676937441643324, "grad_norm": 0.9857123650423288, "learning_rate": 4.152780561122482e-06, "loss": 0.0414, "step": 6080 }, { "epoch": 5.677871148459384, "grad_norm": 1.2389123741395278, "learning_rate": 4.151290339632183e-06, "loss": 0.0459, "step": 6081 }, { "epoch": 5.678804855275444, "grad_norm": 1.5608891735729458, "learning_rate": 4.149800195765702e-06, "loss": 0.0592, "step": 6082 }, { "epoch": 5.6797385620915035, "grad_norm": 1.1282403898333724, "learning_rate": 4.1483101296593325e-06, "loss": 0.0221, "step": 6083 }, { "epoch": 5.680672268907563, "grad_norm": 2.1965506912722743, "learning_rate": 4.146820141449354e-06, "loss": 0.061, "step": 6084 }, { "epoch": 5.681605975723623, "grad_norm": 3.670663425907484, "learning_rate": 4.145330231272044e-06, "loss": 0.1218, "step": 6085 }, { "epoch": 5.682539682539683, "grad_norm": 1.2976040503120367, "learning_rate": 4.1438403992636675e-06, "loss": 0.0381, "step": 6086 }, { "epoch": 5.6834733893557425, "grad_norm": 1.3537321005080685, "learning_rate": 4.142350645560492e-06, "loss": 0.0566, "step": 6087 }, { "epoch": 5.684407096171802, "grad_norm": 1.031350944395374, "learning_rate": 4.140860970298767e-06, "loss": 0.0208, "step": 6088 }, { "epoch": 5.685340802987862, "grad_norm": 1.6776319089635132, "learning_rate": 4.1393713736147426e-06, "loss": 0.0421, "step": 6089 }, { "epoch": 5.686274509803922, "grad_norm": 0.7079020692234651, "learning_rate": 4.137881855644655e-06, "loss": 0.0281, "step": 6090 }, { "epoch": 5.6872082166199815, "grad_norm": 4.721270699531833, "learning_rate": 4.136392416524742e-06, "loss": 0.1437, "step": 6091 }, { "epoch": 5.688141923436041, "grad_norm": 1.9505646837035304, "learning_rate": 4.134903056391227e-06, "loss": 0.0625, "step": 6092 }, { "epoch": 5.689075630252101, "grad_norm": 2.487243645680958, "learning_rate": 4.133413775380327e-06, "loss": 0.0319, "step": 6093 }, { "epoch": 5.690009337068161, "grad_norm": 1.4971431339772403, "learning_rate": 4.131924573628252e-06, "loss": 0.0534, "step": 6094 }, { "epoch": 5.690943043884221, "grad_norm": 7.248488560739265, "learning_rate": 4.130435451271212e-06, "loss": 0.2172, "step": 6095 }, { "epoch": 5.69187675070028, "grad_norm": 1.2265037214270684, "learning_rate": 4.128946408445397e-06, "loss": 0.059, "step": 6096 }, { "epoch": 5.69281045751634, "grad_norm": 1.7642262422858965, "learning_rate": 4.1274574452870006e-06, "loss": 0.0861, "step": 6097 }, { "epoch": 5.6937441643324, "grad_norm": 5.915134534000012, "learning_rate": 4.125968561932203e-06, "loss": 0.0288, "step": 6098 }, { "epoch": 5.69467787114846, "grad_norm": 1.106321513791716, "learning_rate": 4.124479758517176e-06, "loss": 0.0633, "step": 6099 }, { "epoch": 5.695611577964519, "grad_norm": 3.440946945618362, "learning_rate": 4.122991035178093e-06, "loss": 0.0793, "step": 6100 }, { "epoch": 5.696545284780579, "grad_norm": 0.7459391658805676, "learning_rate": 4.1215023920511104e-06, "loss": 0.0275, "step": 6101 }, { "epoch": 5.697478991596639, "grad_norm": 1.599017429614903, "learning_rate": 4.1200138292723825e-06, "loss": 0.0381, "step": 6102 }, { "epoch": 5.698412698412699, "grad_norm": 0.7465040637807057, "learning_rate": 4.118525346978051e-06, "loss": 0.0292, "step": 6103 }, { "epoch": 5.699346405228758, "grad_norm": 1.932658719987216, "learning_rate": 4.11703694530426e-06, "loss": 0.065, "step": 6104 }, { "epoch": 5.700280112044818, "grad_norm": 1.2010920088127313, "learning_rate": 4.115548624387136e-06, "loss": 0.0505, "step": 6105 }, { "epoch": 5.701213818860878, "grad_norm": 1.5553108527569584, "learning_rate": 4.114060384362803e-06, "loss": 0.0878, "step": 6106 }, { "epoch": 5.702147525676938, "grad_norm": 1.0615405395828508, "learning_rate": 4.112572225367377e-06, "loss": 0.0522, "step": 6107 }, { "epoch": 5.703081232492997, "grad_norm": 0.6731758991499647, "learning_rate": 4.111084147536969e-06, "loss": 0.0247, "step": 6108 }, { "epoch": 5.704014939309057, "grad_norm": 0.6619585500135321, "learning_rate": 4.1095961510076755e-06, "loss": 0.0171, "step": 6109 }, { "epoch": 5.704948646125117, "grad_norm": 3.342869075159363, "learning_rate": 4.1081082359155935e-06, "loss": 0.1609, "step": 6110 }, { "epoch": 5.705882352941177, "grad_norm": 0.6385645511749782, "learning_rate": 4.106620402396808e-06, "loss": 0.0176, "step": 6111 }, { "epoch": 5.706816059757236, "grad_norm": 3.1514914883438667, "learning_rate": 4.105132650587398e-06, "loss": 0.1096, "step": 6112 }, { "epoch": 5.707749766573296, "grad_norm": 1.3667971852830387, "learning_rate": 4.103644980623433e-06, "loss": 0.0558, "step": 6113 }, { "epoch": 5.708683473389356, "grad_norm": 3.839796773278767, "learning_rate": 4.102157392640981e-06, "loss": 0.15, "step": 6114 }, { "epoch": 5.709617180205416, "grad_norm": 1.2724691282578031, "learning_rate": 4.100669886776094e-06, "loss": 0.0413, "step": 6115 }, { "epoch": 5.710550887021475, "grad_norm": 2.959273005959702, "learning_rate": 4.099182463164822e-06, "loss": 0.0786, "step": 6116 }, { "epoch": 5.711484593837535, "grad_norm": 1.9531738630282331, "learning_rate": 4.097695121943207e-06, "loss": 0.0729, "step": 6117 }, { "epoch": 5.712418300653595, "grad_norm": 5.257819379870838, "learning_rate": 4.096207863247282e-06, "loss": 0.1915, "step": 6118 }, { "epoch": 5.713352007469655, "grad_norm": 1.9187636293849502, "learning_rate": 4.094720687213075e-06, "loss": 0.0754, "step": 6119 }, { "epoch": 5.714285714285714, "grad_norm": 2.0696026786658672, "learning_rate": 4.0932335939766e-06, "loss": 0.0978, "step": 6120 }, { "epoch": 5.715219421101774, "grad_norm": 4.297844708386398, "learning_rate": 4.0917465836738726e-06, "loss": 0.2042, "step": 6121 }, { "epoch": 5.716153127917834, "grad_norm": 3.003454292526765, "learning_rate": 4.090259656440896e-06, "loss": 0.1272, "step": 6122 }, { "epoch": 5.717086834733894, "grad_norm": 0.6710214338238335, "learning_rate": 4.088772812413663e-06, "loss": 0.0075, "step": 6123 }, { "epoch": 5.718020541549953, "grad_norm": 1.7246413971847985, "learning_rate": 4.087286051728162e-06, "loss": 0.0735, "step": 6124 }, { "epoch": 5.718954248366013, "grad_norm": 2.6658204901886253, "learning_rate": 4.085799374520378e-06, "loss": 0.0376, "step": 6125 }, { "epoch": 5.719887955182073, "grad_norm": 1.0097698877865828, "learning_rate": 4.08431278092628e-06, "loss": 0.0453, "step": 6126 }, { "epoch": 5.720821661998133, "grad_norm": 0.6019760956379941, "learning_rate": 4.082826271081834e-06, "loss": 0.0148, "step": 6127 }, { "epoch": 5.721755368814192, "grad_norm": 1.2906590270138318, "learning_rate": 4.081339845122996e-06, "loss": 0.0683, "step": 6128 }, { "epoch": 5.722689075630252, "grad_norm": 0.4951849848460066, "learning_rate": 4.07985350318572e-06, "loss": 0.0174, "step": 6129 }, { "epoch": 5.723622782446312, "grad_norm": 1.9577923090279787, "learning_rate": 4.078367245405947e-06, "loss": 0.0795, "step": 6130 }, { "epoch": 5.724556489262372, "grad_norm": 1.285766569732554, "learning_rate": 4.07688107191961e-06, "loss": 0.035, "step": 6131 }, { "epoch": 5.7254901960784315, "grad_norm": 1.3582948130169044, "learning_rate": 4.075394982862637e-06, "loss": 0.091, "step": 6132 }, { "epoch": 5.726423902894491, "grad_norm": 2.448361693279496, "learning_rate": 4.073908978370945e-06, "loss": 0.0759, "step": 6133 }, { "epoch": 5.727357609710551, "grad_norm": 1.9486545552857621, "learning_rate": 4.0724230585804486e-06, "loss": 0.0508, "step": 6134 }, { "epoch": 5.728291316526611, "grad_norm": 1.0955350667134194, "learning_rate": 4.070937223627051e-06, "loss": 0.0559, "step": 6135 }, { "epoch": 5.7292250233426705, "grad_norm": 3.168293122216546, "learning_rate": 4.069451473646647e-06, "loss": 0.1537, "step": 6136 }, { "epoch": 5.73015873015873, "grad_norm": 1.6548743989487753, "learning_rate": 4.067965808775123e-06, "loss": 0.069, "step": 6137 }, { "epoch": 5.73109243697479, "grad_norm": 1.3955882322536546, "learning_rate": 4.066480229148364e-06, "loss": 0.058, "step": 6138 }, { "epoch": 5.73202614379085, "grad_norm": 2.479523584907818, "learning_rate": 4.064994734902239e-06, "loss": 0.1384, "step": 6139 }, { "epoch": 5.7329598506069095, "grad_norm": 1.7296000555305495, "learning_rate": 4.063509326172615e-06, "loss": 0.0889, "step": 6140 }, { "epoch": 5.733893557422969, "grad_norm": 0.9767782430130176, "learning_rate": 4.062024003095344e-06, "loss": 0.0302, "step": 6141 }, { "epoch": 5.734827264239029, "grad_norm": 1.135657784771307, "learning_rate": 4.060538765806282e-06, "loss": 0.0205, "step": 6142 }, { "epoch": 5.735760971055089, "grad_norm": 1.9255067307604847, "learning_rate": 4.059053614441266e-06, "loss": 0.0156, "step": 6143 }, { "epoch": 5.7366946778711485, "grad_norm": 1.6763222219622378, "learning_rate": 4.05756854913613e-06, "loss": 0.0767, "step": 6144 }, { "epoch": 5.737628384687208, "grad_norm": 0.3858811070741923, "learning_rate": 4.056083570026701e-06, "loss": 0.0097, "step": 6145 }, { "epoch": 5.738562091503268, "grad_norm": 0.363318953200629, "learning_rate": 4.054598677248796e-06, "loss": 0.0133, "step": 6146 }, { "epoch": 5.739495798319328, "grad_norm": 2.1763504248879997, "learning_rate": 4.053113870938224e-06, "loss": 0.0835, "step": 6147 }, { "epoch": 5.7404295051353875, "grad_norm": 0.6258071475388713, "learning_rate": 4.051629151230787e-06, "loss": 0.0123, "step": 6148 }, { "epoch": 5.741363211951447, "grad_norm": 0.7487742564083031, "learning_rate": 4.050144518262279e-06, "loss": 0.0238, "step": 6149 }, { "epoch": 5.742296918767507, "grad_norm": 3.2010203026086965, "learning_rate": 4.048659972168486e-06, "loss": 0.1484, "step": 6150 }, { "epoch": 5.743230625583567, "grad_norm": 1.4516102622464555, "learning_rate": 4.047175513085186e-06, "loss": 0.0539, "step": 6151 }, { "epoch": 5.7441643323996265, "grad_norm": 2.6983246140675767, "learning_rate": 4.0456911411481495e-06, "loss": 0.152, "step": 6152 }, { "epoch": 5.745098039215686, "grad_norm": 1.8010387409420525, "learning_rate": 4.04420685649314e-06, "loss": 0.0677, "step": 6153 }, { "epoch": 5.746031746031746, "grad_norm": 1.247577121284846, "learning_rate": 4.042722659255907e-06, "loss": 0.04, "step": 6154 }, { "epoch": 5.746965452847806, "grad_norm": 1.0622085640372225, "learning_rate": 4.041238549572202e-06, "loss": 0.0299, "step": 6155 }, { "epoch": 5.7478991596638656, "grad_norm": 5.699492745736112, "learning_rate": 4.039754527577761e-06, "loss": 0.246, "step": 6156 }, { "epoch": 5.748832866479925, "grad_norm": 2.2404794097789593, "learning_rate": 4.0382705934083125e-06, "loss": 0.1111, "step": 6157 }, { "epoch": 5.749766573295985, "grad_norm": 0.5922598167637381, "learning_rate": 4.036786747199579e-06, "loss": 0.0189, "step": 6158 }, { "epoch": 5.750700280112045, "grad_norm": 2.820535927722583, "learning_rate": 4.035302989087277e-06, "loss": 0.0989, "step": 6159 }, { "epoch": 5.751633986928105, "grad_norm": 4.522363165899933, "learning_rate": 4.033819319207112e-06, "loss": 0.2678, "step": 6160 }, { "epoch": 5.752567693744164, "grad_norm": 0.5281802213463255, "learning_rate": 4.032335737694781e-06, "loss": 0.0212, "step": 6161 }, { "epoch": 5.753501400560224, "grad_norm": 1.6771383857224635, "learning_rate": 4.03085224468597e-06, "loss": 0.0139, "step": 6162 }, { "epoch": 5.754435107376284, "grad_norm": 1.350318908633213, "learning_rate": 4.029368840316368e-06, "loss": 0.0885, "step": 6163 }, { "epoch": 5.755368814192344, "grad_norm": 2.1912853798532983, "learning_rate": 4.0278855247216446e-06, "loss": 0.0151, "step": 6164 }, { "epoch": 5.756302521008403, "grad_norm": 0.9880462379102725, "learning_rate": 4.0264022980374666e-06, "loss": 0.0372, "step": 6165 }, { "epoch": 5.757236227824463, "grad_norm": 3.1324721169571106, "learning_rate": 4.024919160399489e-06, "loss": 0.1535, "step": 6166 }, { "epoch": 5.758169934640523, "grad_norm": 0.4075000478000769, "learning_rate": 4.0234361119433615e-06, "loss": 0.0105, "step": 6167 }, { "epoch": 5.759103641456583, "grad_norm": 4.478917243955837, "learning_rate": 4.021953152804727e-06, "loss": 0.0997, "step": 6168 }, { "epoch": 5.760037348272642, "grad_norm": 0.4682621042324629, "learning_rate": 4.020470283119219e-06, "loss": 0.0088, "step": 6169 }, { "epoch": 5.760971055088702, "grad_norm": 1.0225583050291356, "learning_rate": 4.018987503022461e-06, "loss": 0.0136, "step": 6170 }, { "epoch": 5.761904761904762, "grad_norm": 1.4746227009753488, "learning_rate": 4.017504812650066e-06, "loss": 0.0574, "step": 6171 }, { "epoch": 5.762838468720822, "grad_norm": 3.1359285264891823, "learning_rate": 4.016022212137649e-06, "loss": 0.1357, "step": 6172 }, { "epoch": 5.763772175536881, "grad_norm": 1.9177354032943668, "learning_rate": 4.014539701620806e-06, "loss": 0.0648, "step": 6173 }, { "epoch": 5.764705882352941, "grad_norm": 3.3640676745013587, "learning_rate": 4.01305728123513e-06, "loss": 0.1381, "step": 6174 }, { "epoch": 5.765639589169001, "grad_norm": 2.28160314458556, "learning_rate": 4.011574951116202e-06, "loss": 0.1117, "step": 6175 }, { "epoch": 5.766573295985061, "grad_norm": 1.9551427238063823, "learning_rate": 4.010092711399602e-06, "loss": 0.0699, "step": 6176 }, { "epoch": 5.76750700280112, "grad_norm": 1.1457208489227424, "learning_rate": 4.0086105622208935e-06, "loss": 0.0369, "step": 6177 }, { "epoch": 5.76844070961718, "grad_norm": 0.6180815609708326, "learning_rate": 4.0071285037156385e-06, "loss": 0.0276, "step": 6178 }, { "epoch": 5.76937441643324, "grad_norm": 3.8978487899504106, "learning_rate": 4.005646536019383e-06, "loss": 0.2285, "step": 6179 }, { "epoch": 5.7703081232493, "grad_norm": 2.166977664619502, "learning_rate": 4.004164659267672e-06, "loss": 0.1186, "step": 6180 }, { "epoch": 5.771241830065359, "grad_norm": 1.3474498751334378, "learning_rate": 4.002682873596041e-06, "loss": 0.0706, "step": 6181 }, { "epoch": 5.772175536881419, "grad_norm": 3.561657599906794, "learning_rate": 4.001201179140014e-06, "loss": 0.185, "step": 6182 }, { "epoch": 5.773109243697479, "grad_norm": 0.5691539428077798, "learning_rate": 3.999719576035106e-06, "loss": 0.0144, "step": 6183 }, { "epoch": 5.774042950513539, "grad_norm": 1.144331505651256, "learning_rate": 3.998238064416828e-06, "loss": 0.0393, "step": 6184 }, { "epoch": 5.774976657329598, "grad_norm": 2.414375820669943, "learning_rate": 3.996756644420682e-06, "loss": 0.0866, "step": 6185 }, { "epoch": 5.775910364145658, "grad_norm": 1.8311184058862882, "learning_rate": 3.995275316182157e-06, "loss": 0.1158, "step": 6186 }, { "epoch": 5.776844070961718, "grad_norm": 1.627643422653248, "learning_rate": 3.99379407983674e-06, "loss": 0.0654, "step": 6187 }, { "epoch": 5.777777777777778, "grad_norm": 1.2027744944474246, "learning_rate": 3.992312935519903e-06, "loss": 0.0484, "step": 6188 }, { "epoch": 5.778711484593837, "grad_norm": 4.051978892569762, "learning_rate": 3.990831883367114e-06, "loss": 0.2071, "step": 6189 }, { "epoch": 5.779645191409897, "grad_norm": 3.4792613043866827, "learning_rate": 3.989350923513834e-06, "loss": 0.1099, "step": 6190 }, { "epoch": 5.780578898225957, "grad_norm": 2.268220474660374, "learning_rate": 3.98787005609551e-06, "loss": 0.0897, "step": 6191 }, { "epoch": 5.781512605042017, "grad_norm": 2.548547909891402, "learning_rate": 3.986389281247584e-06, "loss": 0.1798, "step": 6192 }, { "epoch": 5.7824463118580764, "grad_norm": 2.879762271687292, "learning_rate": 3.9849085991054905e-06, "loss": 0.1241, "step": 6193 }, { "epoch": 5.783380018674136, "grad_norm": 2.350755431998862, "learning_rate": 3.9834280098046535e-06, "loss": 0.0832, "step": 6194 }, { "epoch": 5.784313725490196, "grad_norm": 1.5460804136767516, "learning_rate": 3.981947513480489e-06, "loss": 0.086, "step": 6195 }, { "epoch": 5.785247432306256, "grad_norm": 0.5587418563438542, "learning_rate": 3.980467110268406e-06, "loss": 0.0105, "step": 6196 }, { "epoch": 5.7861811391223155, "grad_norm": 3.2879064107371376, "learning_rate": 3.9789868003037984e-06, "loss": 0.167, "step": 6197 }, { "epoch": 5.787114845938375, "grad_norm": 7.284455149991133, "learning_rate": 3.977506583722064e-06, "loss": 0.1475, "step": 6198 }, { "epoch": 5.788048552754435, "grad_norm": 2.617539875912896, "learning_rate": 3.97602646065858e-06, "loss": 0.1414, "step": 6199 }, { "epoch": 5.788982259570495, "grad_norm": 8.07314590859133, "learning_rate": 3.974546431248724e-06, "loss": 0.1597, "step": 6200 }, { "epoch": 5.7899159663865545, "grad_norm": 1.4956297690074138, "learning_rate": 3.973066495627855e-06, "loss": 0.0758, "step": 6201 }, { "epoch": 5.790849673202614, "grad_norm": 2.9031973682032444, "learning_rate": 3.971586653931335e-06, "loss": 0.0958, "step": 6202 }, { "epoch": 5.791783380018674, "grad_norm": 4.5765766453584344, "learning_rate": 3.970106906294509e-06, "loss": 0.1706, "step": 6203 }, { "epoch": 5.792717086834734, "grad_norm": 1.5917404074060635, "learning_rate": 3.968627252852717e-06, "loss": 0.0914, "step": 6204 }, { "epoch": 5.7936507936507935, "grad_norm": 1.1231940006794343, "learning_rate": 3.9671476937412874e-06, "loss": 0.0441, "step": 6205 }, { "epoch": 5.794584500466853, "grad_norm": 4.017653622655591, "learning_rate": 3.965668229095546e-06, "loss": 0.1401, "step": 6206 }, { "epoch": 5.795518207282913, "grad_norm": 0.6990495743831815, "learning_rate": 3.964188859050805e-06, "loss": 0.026, "step": 6207 }, { "epoch": 5.796451914098973, "grad_norm": 1.3815069869447556, "learning_rate": 3.962709583742367e-06, "loss": 0.0452, "step": 6208 }, { "epoch": 5.7973856209150325, "grad_norm": 2.2842391288635406, "learning_rate": 3.961230403305527e-06, "loss": 0.1027, "step": 6209 }, { "epoch": 5.798319327731092, "grad_norm": 3.4890437861411185, "learning_rate": 3.959751317875577e-06, "loss": 0.0483, "step": 6210 }, { "epoch": 5.799253034547152, "grad_norm": 0.844564871647174, "learning_rate": 3.958272327587793e-06, "loss": 0.0307, "step": 6211 }, { "epoch": 5.800186741363212, "grad_norm": 1.8228538300689296, "learning_rate": 3.956793432577445e-06, "loss": 0.0221, "step": 6212 }, { "epoch": 5.8011204481792715, "grad_norm": 0.43249228870225126, "learning_rate": 3.955314632979794e-06, "loss": 0.0102, "step": 6213 }, { "epoch": 5.802054154995331, "grad_norm": 2.0874689501416848, "learning_rate": 3.953835928930091e-06, "loss": 0.0918, "step": 6214 }, { "epoch": 5.802987861811391, "grad_norm": 5.362542810617266, "learning_rate": 3.9523573205635825e-06, "loss": 0.1539, "step": 6215 }, { "epoch": 5.803921568627451, "grad_norm": 1.1843654487484017, "learning_rate": 3.950878808015502e-06, "loss": 0.0443, "step": 6216 }, { "epoch": 5.8048552754435105, "grad_norm": 2.0272468067299596, "learning_rate": 3.949400391421076e-06, "loss": 0.0863, "step": 6217 }, { "epoch": 5.80578898225957, "grad_norm": 1.8122411942380114, "learning_rate": 3.947922070915522e-06, "loss": 0.053, "step": 6218 }, { "epoch": 5.80672268907563, "grad_norm": 0.8999386746336901, "learning_rate": 3.946443846634049e-06, "loss": 0.0208, "step": 6219 }, { "epoch": 5.80765639589169, "grad_norm": 1.7741299295776156, "learning_rate": 3.9449657187118565e-06, "loss": 0.0764, "step": 6220 }, { "epoch": 5.80859010270775, "grad_norm": 1.02001357131251, "learning_rate": 3.943487687284134e-06, "loss": 0.0315, "step": 6221 }, { "epoch": 5.809523809523809, "grad_norm": 1.1427049359720534, "learning_rate": 3.942009752486066e-06, "loss": 0.051, "step": 6222 }, { "epoch": 5.810457516339869, "grad_norm": 1.4014673829299922, "learning_rate": 3.940531914452826e-06, "loss": 0.0545, "step": 6223 }, { "epoch": 5.811391223155929, "grad_norm": 2.518709835269264, "learning_rate": 3.939054173319576e-06, "loss": 0.1112, "step": 6224 }, { "epoch": 5.812324929971989, "grad_norm": 3.9041205749841406, "learning_rate": 3.937576529221474e-06, "loss": 0.1694, "step": 6225 }, { "epoch": 5.813258636788048, "grad_norm": 2.4337695708581455, "learning_rate": 3.936098982293666e-06, "loss": 0.0851, "step": 6226 }, { "epoch": 5.814192343604108, "grad_norm": 1.9432358616793457, "learning_rate": 3.93462153267129e-06, "loss": 0.1127, "step": 6227 }, { "epoch": 5.815126050420168, "grad_norm": 1.4484823332239662, "learning_rate": 3.933144180489474e-06, "loss": 0.0556, "step": 6228 }, { "epoch": 5.816059757236228, "grad_norm": 0.5072248988350757, "learning_rate": 3.93166692588334e-06, "loss": 0.0116, "step": 6229 }, { "epoch": 5.816993464052287, "grad_norm": 2.763234429733437, "learning_rate": 3.930189768987998e-06, "loss": 0.151, "step": 6230 }, { "epoch": 5.817927170868347, "grad_norm": 1.3392368864236976, "learning_rate": 3.9287127099385485e-06, "loss": 0.0366, "step": 6231 }, { "epoch": 5.818860877684407, "grad_norm": 0.44266587304427996, "learning_rate": 3.927235748870088e-06, "loss": 0.019, "step": 6232 }, { "epoch": 5.819794584500467, "grad_norm": 0.43790742047900416, "learning_rate": 3.9257588859177005e-06, "loss": 0.0076, "step": 6233 }, { "epoch": 5.820728291316526, "grad_norm": 1.1748168485684602, "learning_rate": 3.92428212121646e-06, "loss": 0.0391, "step": 6234 }, { "epoch": 5.821661998132586, "grad_norm": 1.5058838857305215, "learning_rate": 3.922805454901431e-06, "loss": 0.061, "step": 6235 }, { "epoch": 5.822595704948646, "grad_norm": 0.9614581531362779, "learning_rate": 3.9213288871076735e-06, "loss": 0.0433, "step": 6236 }, { "epoch": 5.823529411764706, "grad_norm": 1.105177571562464, "learning_rate": 3.9198524179702365e-06, "loss": 0.0405, "step": 6237 }, { "epoch": 5.824463118580765, "grad_norm": 1.185780123939397, "learning_rate": 3.918376047624159e-06, "loss": 0.0807, "step": 6238 }, { "epoch": 5.825396825396825, "grad_norm": 0.42274845857344406, "learning_rate": 3.916899776204467e-06, "loss": 0.0046, "step": 6239 }, { "epoch": 5.826330532212885, "grad_norm": 3.1486944701274395, "learning_rate": 3.915423603846188e-06, "loss": 0.1805, "step": 6240 }, { "epoch": 5.827264239028945, "grad_norm": 4.06084867879953, "learning_rate": 3.91394753068433e-06, "loss": 0.2547, "step": 6241 }, { "epoch": 5.828197945845004, "grad_norm": 0.3610438473346778, "learning_rate": 3.912471556853898e-06, "loss": 0.0094, "step": 6242 }, { "epoch": 5.829131652661064, "grad_norm": 3.599322260048264, "learning_rate": 3.910995682489884e-06, "loss": 0.1624, "step": 6243 }, { "epoch": 5.830065359477124, "grad_norm": 0.448593592872938, "learning_rate": 3.909519907727276e-06, "loss": 0.0019, "step": 6244 }, { "epoch": 5.830999066293184, "grad_norm": 2.103058159084119, "learning_rate": 3.9080442327010486e-06, "loss": 0.0922, "step": 6245 }, { "epoch": 5.831932773109243, "grad_norm": 1.0545796618483336, "learning_rate": 3.906568657546167e-06, "loss": 0.025, "step": 6246 }, { "epoch": 5.832866479925303, "grad_norm": 1.4526659028524287, "learning_rate": 3.90509318239759e-06, "loss": 0.054, "step": 6247 }, { "epoch": 5.833800186741363, "grad_norm": 6.483352225331369, "learning_rate": 3.903617807390264e-06, "loss": 0.1086, "step": 6248 }, { "epoch": 5.834733893557423, "grad_norm": 2.2418164690942928, "learning_rate": 3.902142532659131e-06, "loss": 0.1241, "step": 6249 }, { "epoch": 5.835667600373482, "grad_norm": 2.792576697128241, "learning_rate": 3.900667358339122e-06, "loss": 0.1103, "step": 6250 }, { "epoch": 5.836601307189542, "grad_norm": 2.067725196266143, "learning_rate": 3.899192284565155e-06, "loss": 0.0353, "step": 6251 }, { "epoch": 5.837535014005602, "grad_norm": 1.4726599557498694, "learning_rate": 3.897717311472141e-06, "loss": 0.0897, "step": 6252 }, { "epoch": 5.838468720821662, "grad_norm": 1.8724197035987062, "learning_rate": 3.896242439194986e-06, "loss": 0.0806, "step": 6253 }, { "epoch": 5.839402427637721, "grad_norm": 1.6658222002433092, "learning_rate": 3.894767667868581e-06, "loss": 0.1137, "step": 6254 }, { "epoch": 5.840336134453781, "grad_norm": 1.0873797030737016, "learning_rate": 3.893292997627811e-06, "loss": 0.0517, "step": 6255 }, { "epoch": 5.841269841269841, "grad_norm": 3.4133254756340126, "learning_rate": 3.891818428607548e-06, "loss": 0.1867, "step": 6256 }, { "epoch": 5.842203548085901, "grad_norm": 1.4597447483851678, "learning_rate": 3.890343960942662e-06, "loss": 0.0741, "step": 6257 }, { "epoch": 5.8431372549019605, "grad_norm": 0.9905180911325222, "learning_rate": 3.888869594768006e-06, "loss": 0.0359, "step": 6258 }, { "epoch": 5.84407096171802, "grad_norm": 2.0646675728294954, "learning_rate": 3.887395330218429e-06, "loss": 0.1077, "step": 6259 }, { "epoch": 5.84500466853408, "grad_norm": 2.4027244184323697, "learning_rate": 3.885921167428766e-06, "loss": 0.1125, "step": 6260 }, { "epoch": 5.84593837535014, "grad_norm": 1.3530409569412798, "learning_rate": 3.884447106533849e-06, "loss": 0.0903, "step": 6261 }, { "epoch": 5.8468720821661995, "grad_norm": 3.5251676366508207, "learning_rate": 3.882973147668494e-06, "loss": 0.1367, "step": 6262 }, { "epoch": 5.847805788982259, "grad_norm": 0.918064131727665, "learning_rate": 3.881499290967512e-06, "loss": 0.0291, "step": 6263 }, { "epoch": 5.848739495798319, "grad_norm": 1.3880295623001748, "learning_rate": 3.880025536565705e-06, "loss": 0.0613, "step": 6264 }, { "epoch": 5.849673202614379, "grad_norm": 1.3638367887411695, "learning_rate": 3.8785518845978595e-06, "loss": 0.049, "step": 6265 }, { "epoch": 5.8506069094304385, "grad_norm": 1.148616842952733, "learning_rate": 3.877078335198761e-06, "loss": 0.054, "step": 6266 }, { "epoch": 5.851540616246498, "grad_norm": 0.5379439576661078, "learning_rate": 3.875604888503181e-06, "loss": 0.0138, "step": 6267 }, { "epoch": 5.852474323062558, "grad_norm": 2.619977682656093, "learning_rate": 3.874131544645883e-06, "loss": 0.1656, "step": 6268 }, { "epoch": 5.853408029878618, "grad_norm": 1.1714077407490882, "learning_rate": 3.8726583037616176e-06, "loss": 0.0571, "step": 6269 }, { "epoch": 5.8543417366946775, "grad_norm": 2.32329158751428, "learning_rate": 3.871185165985132e-06, "loss": 0.1493, "step": 6270 }, { "epoch": 5.855275443510737, "grad_norm": 0.770744317764321, "learning_rate": 3.86971213145116e-06, "loss": 0.0349, "step": 6271 }, { "epoch": 5.856209150326797, "grad_norm": 2.2305568137449505, "learning_rate": 3.8682392002944265e-06, "loss": 0.1025, "step": 6272 }, { "epoch": 5.857142857142857, "grad_norm": 1.3309441889396723, "learning_rate": 3.866766372649645e-06, "loss": 0.0596, "step": 6273 }, { "epoch": 5.8580765639589165, "grad_norm": 1.6533059450502983, "learning_rate": 3.865293648651526e-06, "loss": 0.0718, "step": 6274 }, { "epoch": 5.859010270774976, "grad_norm": 0.5867179786996262, "learning_rate": 3.863821028434764e-06, "loss": 0.0111, "step": 6275 }, { "epoch": 5.859943977591037, "grad_norm": 2.3599110692853986, "learning_rate": 3.862348512134047e-06, "loss": 0.1316, "step": 6276 }, { "epoch": 5.860877684407097, "grad_norm": 1.5338565004378326, "learning_rate": 3.86087609988405e-06, "loss": 0.0729, "step": 6277 }, { "epoch": 5.861811391223156, "grad_norm": 1.2050348165167284, "learning_rate": 3.859403791819446e-06, "loss": 0.05, "step": 6278 }, { "epoch": 5.862745098039216, "grad_norm": 4.227588488391211, "learning_rate": 3.85793158807489e-06, "loss": 0.1869, "step": 6279 }, { "epoch": 5.863678804855276, "grad_norm": 1.327774831136255, "learning_rate": 3.856459488785033e-06, "loss": 0.0548, "step": 6280 }, { "epoch": 5.864612511671336, "grad_norm": 1.563755546159343, "learning_rate": 3.854987494084515e-06, "loss": 0.0657, "step": 6281 }, { "epoch": 5.865546218487395, "grad_norm": 5.0966226430284, "learning_rate": 3.853515604107963e-06, "loss": 0.202, "step": 6282 }, { "epoch": 5.866479925303455, "grad_norm": 2.003213957373049, "learning_rate": 3.852043818990001e-06, "loss": 0.1044, "step": 6283 }, { "epoch": 5.867413632119515, "grad_norm": 0.488940917232562, "learning_rate": 3.8505721388652395e-06, "loss": 0.0103, "step": 6284 }, { "epoch": 5.868347338935575, "grad_norm": 2.0217173259892376, "learning_rate": 3.849100563868279e-06, "loss": 0.0648, "step": 6285 }, { "epoch": 5.8692810457516345, "grad_norm": 6.004184120273084, "learning_rate": 3.847629094133708e-06, "loss": 0.1722, "step": 6286 }, { "epoch": 5.870214752567694, "grad_norm": 1.7618374938475354, "learning_rate": 3.846157729796115e-06, "loss": 0.0709, "step": 6287 }, { "epoch": 5.871148459383754, "grad_norm": 1.6797179107685307, "learning_rate": 3.844686470990068e-06, "loss": 0.0511, "step": 6288 }, { "epoch": 5.872082166199814, "grad_norm": 0.4685834075478505, "learning_rate": 3.843215317850132e-06, "loss": 0.0153, "step": 6289 }, { "epoch": 5.8730158730158735, "grad_norm": 3.353963917413396, "learning_rate": 3.8417442705108564e-06, "loss": 0.1222, "step": 6290 }, { "epoch": 5.873949579831933, "grad_norm": 3.2628128424707055, "learning_rate": 3.840273329106789e-06, "loss": 0.1695, "step": 6291 }, { "epoch": 5.874883286647993, "grad_norm": 4.054808417327763, "learning_rate": 3.838802493772463e-06, "loss": 0.14, "step": 6292 }, { "epoch": 5.875816993464053, "grad_norm": 1.7591315152134595, "learning_rate": 3.8373317646424e-06, "loss": 0.0897, "step": 6293 }, { "epoch": 5.8767507002801125, "grad_norm": 1.0334611817633417, "learning_rate": 3.8358611418511135e-06, "loss": 0.0352, "step": 6294 }, { "epoch": 5.877684407096172, "grad_norm": 2.7314764230241004, "learning_rate": 3.8343906255331135e-06, "loss": 0.0937, "step": 6295 }, { "epoch": 5.878618113912232, "grad_norm": 2.8100522213740446, "learning_rate": 3.83292021582289e-06, "loss": 0.1255, "step": 6296 }, { "epoch": 5.879551820728292, "grad_norm": 2.040241480880998, "learning_rate": 3.83144991285493e-06, "loss": 0.111, "step": 6297 }, { "epoch": 5.8804855275443515, "grad_norm": 1.1635077537726382, "learning_rate": 3.829979716763709e-06, "loss": 0.0276, "step": 6298 }, { "epoch": 5.881419234360411, "grad_norm": 1.2510148904696805, "learning_rate": 3.828509627683691e-06, "loss": 0.0605, "step": 6299 }, { "epoch": 5.882352941176471, "grad_norm": 4.408924898916897, "learning_rate": 3.827039645749334e-06, "loss": 0.1038, "step": 6300 }, { "epoch": 5.883286647992531, "grad_norm": 5.346649572901279, "learning_rate": 3.825569771095082e-06, "loss": 0.0673, "step": 6301 }, { "epoch": 5.8842203548085905, "grad_norm": 0.9024998608599616, "learning_rate": 3.824100003855373e-06, "loss": 0.0095, "step": 6302 }, { "epoch": 5.88515406162465, "grad_norm": 0.7773825596378452, "learning_rate": 3.8226303441646314e-06, "loss": 0.0252, "step": 6303 }, { "epoch": 5.88608776844071, "grad_norm": 2.971836099552705, "learning_rate": 3.8211607921572745e-06, "loss": 0.1271, "step": 6304 }, { "epoch": 5.88702147525677, "grad_norm": 3.0543397791503772, "learning_rate": 3.819691347967711e-06, "loss": 0.1319, "step": 6305 }, { "epoch": 5.8879551820728295, "grad_norm": 1.7783464816246621, "learning_rate": 3.818222011730335e-06, "loss": 0.0639, "step": 6306 }, { "epoch": 5.888888888888889, "grad_norm": 0.57337690486497, "learning_rate": 3.816752783579534e-06, "loss": 0.0072, "step": 6307 }, { "epoch": 5.889822595704949, "grad_norm": 4.937073853621419, "learning_rate": 3.815283663649685e-06, "loss": 0.0917, "step": 6308 }, { "epoch": 5.890756302521009, "grad_norm": 1.0168074904314182, "learning_rate": 3.8138146520751564e-06, "loss": 0.0126, "step": 6309 }, { "epoch": 5.8916900093370685, "grad_norm": 1.3905264029557047, "learning_rate": 3.812345748990305e-06, "loss": 0.0508, "step": 6310 }, { "epoch": 5.892623716153128, "grad_norm": 2.7624800972341945, "learning_rate": 3.810876954529477e-06, "loss": 0.124, "step": 6311 }, { "epoch": 5.893557422969188, "grad_norm": 1.2411154743299353, "learning_rate": 3.809408268827009e-06, "loss": 0.0607, "step": 6312 }, { "epoch": 5.894491129785248, "grad_norm": 7.436451789160014, "learning_rate": 3.8079396920172316e-06, "loss": 0.3574, "step": 6313 }, { "epoch": 5.895424836601308, "grad_norm": 0.5943377128760006, "learning_rate": 3.8064712242344614e-06, "loss": 0.013, "step": 6314 }, { "epoch": 5.896358543417367, "grad_norm": 2.6463791305415456, "learning_rate": 3.8050028656130046e-06, "loss": 0.1004, "step": 6315 }, { "epoch": 5.897292250233427, "grad_norm": 1.7195665916743184, "learning_rate": 3.803534616287157e-06, "loss": 0.0563, "step": 6316 }, { "epoch": 5.898225957049487, "grad_norm": 1.4208085608885983, "learning_rate": 3.8020664763912106e-06, "loss": 0.0613, "step": 6317 }, { "epoch": 5.899159663865547, "grad_norm": 2.5055719251650994, "learning_rate": 3.8005984460594415e-06, "loss": 0.1217, "step": 6318 }, { "epoch": 5.900093370681606, "grad_norm": 1.6707893656202835, "learning_rate": 3.799130525426116e-06, "loss": 0.0686, "step": 6319 }, { "epoch": 5.901027077497666, "grad_norm": 0.814291594573153, "learning_rate": 3.7976627146254887e-06, "loss": 0.0169, "step": 6320 }, { "epoch": 5.901960784313726, "grad_norm": 1.3002996618559433, "learning_rate": 3.7961950137918134e-06, "loss": 0.0527, "step": 6321 }, { "epoch": 5.902894491129786, "grad_norm": 1.1433562332704272, "learning_rate": 3.794727423059324e-06, "loss": 0.0619, "step": 6322 }, { "epoch": 5.903828197945845, "grad_norm": 1.975095061261952, "learning_rate": 3.793259942562248e-06, "loss": 0.0798, "step": 6323 }, { "epoch": 5.904761904761905, "grad_norm": 1.5274208482098657, "learning_rate": 3.791792572434801e-06, "loss": 0.0559, "step": 6324 }, { "epoch": 5.905695611577965, "grad_norm": 1.747165643781639, "learning_rate": 3.790325312811194e-06, "loss": 0.0745, "step": 6325 }, { "epoch": 5.906629318394025, "grad_norm": 2.239823037030826, "learning_rate": 3.788858163825621e-06, "loss": 0.0534, "step": 6326 }, { "epoch": 5.907563025210084, "grad_norm": 3.0105177873242215, "learning_rate": 3.7873911256122704e-06, "loss": 0.102, "step": 6327 }, { "epoch": 5.908496732026144, "grad_norm": 0.17895104005831877, "learning_rate": 3.785924198305318e-06, "loss": 0.0024, "step": 6328 }, { "epoch": 5.909430438842204, "grad_norm": 1.5073635936714784, "learning_rate": 3.7844573820389285e-06, "loss": 0.0561, "step": 6329 }, { "epoch": 5.910364145658264, "grad_norm": 1.3719243604348015, "learning_rate": 3.782990676947263e-06, "loss": 0.0386, "step": 6330 }, { "epoch": 5.911297852474323, "grad_norm": 1.0631572208311795, "learning_rate": 3.781524083164465e-06, "loss": 0.0488, "step": 6331 }, { "epoch": 5.912231559290383, "grad_norm": 0.7879255654560149, "learning_rate": 3.7800576008246704e-06, "loss": 0.0167, "step": 6332 }, { "epoch": 5.913165266106443, "grad_norm": 3.462235823777661, "learning_rate": 3.7785912300620052e-06, "loss": 0.1684, "step": 6333 }, { "epoch": 5.914098972922503, "grad_norm": 0.5878901921770014, "learning_rate": 3.777124971010587e-06, "loss": 0.0144, "step": 6334 }, { "epoch": 5.915032679738562, "grad_norm": 1.0128254341978917, "learning_rate": 3.7756588238045195e-06, "loss": 0.0181, "step": 6335 }, { "epoch": 5.915966386554622, "grad_norm": 0.8421336523793091, "learning_rate": 3.7741927885778974e-06, "loss": 0.0402, "step": 6336 }, { "epoch": 5.916900093370682, "grad_norm": 2.594491606836932, "learning_rate": 3.772726865464806e-06, "loss": 0.159, "step": 6337 }, { "epoch": 5.917833800186742, "grad_norm": 0.4336557490222086, "learning_rate": 3.771261054599322e-06, "loss": 0.0136, "step": 6338 }, { "epoch": 5.918767507002801, "grad_norm": 3.144324458565403, "learning_rate": 3.7697953561155086e-06, "loss": 0.1625, "step": 6339 }, { "epoch": 5.919701213818861, "grad_norm": 1.1076899030258476, "learning_rate": 3.7683297701474203e-06, "loss": 0.0423, "step": 6340 }, { "epoch": 5.920634920634921, "grad_norm": 3.5749676308239993, "learning_rate": 3.7668642968290995e-06, "loss": 0.2065, "step": 6341 }, { "epoch": 5.921568627450981, "grad_norm": 0.4968462281915516, "learning_rate": 3.7653989362945815e-06, "loss": 0.0187, "step": 6342 }, { "epoch": 5.92250233426704, "grad_norm": 1.9449639930941545, "learning_rate": 3.76393368867789e-06, "loss": 0.0437, "step": 6343 }, { "epoch": 5.9234360410831, "grad_norm": 0.5563315360157083, "learning_rate": 3.7624685541130374e-06, "loss": 0.019, "step": 6344 }, { "epoch": 5.92436974789916, "grad_norm": 2.245208830231786, "learning_rate": 3.7610035327340262e-06, "loss": 0.063, "step": 6345 }, { "epoch": 5.92530345471522, "grad_norm": 1.1005776769248252, "learning_rate": 3.7595386246748465e-06, "loss": 0.0435, "step": 6346 }, { "epoch": 5.926237161531279, "grad_norm": 1.330620354234828, "learning_rate": 3.758073830069484e-06, "loss": 0.0565, "step": 6347 }, { "epoch": 5.927170868347339, "grad_norm": 2.8138168519099294, "learning_rate": 3.756609149051909e-06, "loss": 0.1062, "step": 6348 }, { "epoch": 5.928104575163399, "grad_norm": 1.2404116707747146, "learning_rate": 3.755144581756082e-06, "loss": 0.0658, "step": 6349 }, { "epoch": 5.929038281979459, "grad_norm": 0.4718914321352439, "learning_rate": 3.7536801283159523e-06, "loss": 0.0165, "step": 6350 }, { "epoch": 5.9299719887955185, "grad_norm": 0.8290439651477539, "learning_rate": 3.7522157888654643e-06, "loss": 0.0302, "step": 6351 }, { "epoch": 5.930905695611578, "grad_norm": 1.1254168349960423, "learning_rate": 3.750751563538545e-06, "loss": 0.0541, "step": 6352 }, { "epoch": 5.931839402427638, "grad_norm": 1.4688902150016776, "learning_rate": 3.749287452469115e-06, "loss": 0.0682, "step": 6353 }, { "epoch": 5.932773109243698, "grad_norm": 1.563983418825871, "learning_rate": 3.7478234557910805e-06, "loss": 0.0959, "step": 6354 }, { "epoch": 5.9337068160597575, "grad_norm": 0.6836885706304633, "learning_rate": 3.7463595736383444e-06, "loss": 0.0292, "step": 6355 }, { "epoch": 5.934640522875817, "grad_norm": 1.731423500860727, "learning_rate": 3.744895806144793e-06, "loss": 0.0731, "step": 6356 }, { "epoch": 5.935574229691877, "grad_norm": 0.4376921764492757, "learning_rate": 3.743432153444304e-06, "loss": 0.011, "step": 6357 }, { "epoch": 5.936507936507937, "grad_norm": 2.0018464986164206, "learning_rate": 3.7419686156707414e-06, "loss": 0.1191, "step": 6358 }, { "epoch": 5.9374416433239965, "grad_norm": 2.416222502546029, "learning_rate": 3.7405051929579674e-06, "loss": 0.1261, "step": 6359 }, { "epoch": 5.938375350140056, "grad_norm": 0.48983222888221645, "learning_rate": 3.7390418854398247e-06, "loss": 0.0113, "step": 6360 }, { "epoch": 5.939309056956116, "grad_norm": 1.012142030244447, "learning_rate": 3.73757869325015e-06, "loss": 0.0144, "step": 6361 }, { "epoch": 5.940242763772176, "grad_norm": 6.452859779419142, "learning_rate": 3.7361156165227676e-06, "loss": 0.1961, "step": 6362 }, { "epoch": 5.9411764705882355, "grad_norm": 1.8101747308792258, "learning_rate": 3.7346526553914896e-06, "loss": 0.1052, "step": 6363 }, { "epoch": 5.942110177404295, "grad_norm": 0.3200697576482205, "learning_rate": 3.7331898099901255e-06, "loss": 0.0071, "step": 6364 }, { "epoch": 5.943043884220355, "grad_norm": 4.36394247480025, "learning_rate": 3.731727080452464e-06, "loss": 0.0687, "step": 6365 }, { "epoch": 5.943977591036415, "grad_norm": 0.5530589812688463, "learning_rate": 3.7302644669122902e-06, "loss": 0.0117, "step": 6366 }, { "epoch": 5.9449112978524745, "grad_norm": 1.8374896997148848, "learning_rate": 3.7288019695033727e-06, "loss": 0.1214, "step": 6367 }, { "epoch": 5.945845004668534, "grad_norm": 0.5636197478978326, "learning_rate": 3.7273395883594777e-06, "loss": 0.0232, "step": 6368 }, { "epoch": 5.946778711484594, "grad_norm": 1.5171906382424587, "learning_rate": 3.725877323614354e-06, "loss": 0.0663, "step": 6369 }, { "epoch": 5.947712418300654, "grad_norm": 0.36120376932944454, "learning_rate": 3.724415175401741e-06, "loss": 0.0022, "step": 6370 }, { "epoch": 5.9486461251167135, "grad_norm": 0.3356313458748634, "learning_rate": 3.7229531438553666e-06, "loss": 0.0059, "step": 6371 }, { "epoch": 5.949579831932773, "grad_norm": 3.5942932565345993, "learning_rate": 3.7214912291089544e-06, "loss": 0.1689, "step": 6372 }, { "epoch": 5.950513538748833, "grad_norm": 0.44200086570727426, "learning_rate": 3.7200294312962092e-06, "loss": 0.0116, "step": 6373 }, { "epoch": 5.951447245564893, "grad_norm": 3.2608973014949916, "learning_rate": 3.718567750550829e-06, "loss": 0.1936, "step": 6374 }, { "epoch": 5.9523809523809526, "grad_norm": 5.326087980005038, "learning_rate": 3.7171061870065e-06, "loss": 0.0572, "step": 6375 }, { "epoch": 5.953314659197012, "grad_norm": 2.0809910998698347, "learning_rate": 3.7156447407969006e-06, "loss": 0.1122, "step": 6376 }, { "epoch": 5.954248366013072, "grad_norm": 1.2181504890830304, "learning_rate": 3.714183412055693e-06, "loss": 0.0591, "step": 6377 }, { "epoch": 5.955182072829132, "grad_norm": 0.7746584057475014, "learning_rate": 3.7127222009165342e-06, "loss": 0.0228, "step": 6378 }, { "epoch": 5.956115779645192, "grad_norm": 0.4496854372454901, "learning_rate": 3.7112611075130678e-06, "loss": 0.0071, "step": 6379 }, { "epoch": 5.957049486461251, "grad_norm": 1.0074759755383336, "learning_rate": 3.7098001319789246e-06, "loss": 0.0428, "step": 6380 }, { "epoch": 5.957983193277311, "grad_norm": 2.891075710002528, "learning_rate": 3.7083392744477277e-06, "loss": 0.1424, "step": 6381 }, { "epoch": 5.958916900093371, "grad_norm": 2.1353306265664775, "learning_rate": 3.706878535053091e-06, "loss": 0.0683, "step": 6382 }, { "epoch": 5.959850606909431, "grad_norm": 5.057771915519775, "learning_rate": 3.705417913928613e-06, "loss": 0.0999, "step": 6383 }, { "epoch": 5.96078431372549, "grad_norm": 2.864288641288327, "learning_rate": 3.7039574112078824e-06, "loss": 0.1753, "step": 6384 }, { "epoch": 5.96171802054155, "grad_norm": 4.612312414492947, "learning_rate": 3.702497027024481e-06, "loss": 0.1101, "step": 6385 }, { "epoch": 5.96265172735761, "grad_norm": 0.5661519987580288, "learning_rate": 3.7010367615119764e-06, "loss": 0.0152, "step": 6386 }, { "epoch": 5.96358543417367, "grad_norm": 0.6510806959977503, "learning_rate": 3.6995766148039247e-06, "loss": 0.0138, "step": 6387 }, { "epoch": 5.964519140989729, "grad_norm": 1.0810990402559744, "learning_rate": 3.6981165870338715e-06, "loss": 0.0516, "step": 6388 }, { "epoch": 5.965452847805789, "grad_norm": 1.330529698605737, "learning_rate": 3.696656678335355e-06, "loss": 0.0401, "step": 6389 }, { "epoch": 5.966386554621849, "grad_norm": 1.9492196720284856, "learning_rate": 3.6951968888418994e-06, "loss": 0.0225, "step": 6390 }, { "epoch": 5.967320261437909, "grad_norm": 2.843940168343619, "learning_rate": 3.693737218687017e-06, "loss": 0.1004, "step": 6391 }, { "epoch": 5.968253968253968, "grad_norm": 0.4855700806862229, "learning_rate": 3.6922776680042094e-06, "loss": 0.0136, "step": 6392 }, { "epoch": 5.969187675070028, "grad_norm": 0.9298503620799811, "learning_rate": 3.690818236926973e-06, "loss": 0.0228, "step": 6393 }, { "epoch": 5.970121381886088, "grad_norm": 1.2239995760502953, "learning_rate": 3.6893589255887863e-06, "loss": 0.0383, "step": 6394 }, { "epoch": 5.971055088702148, "grad_norm": 3.043003753267448, "learning_rate": 3.687899734123119e-06, "loss": 0.1415, "step": 6395 }, { "epoch": 5.971988795518207, "grad_norm": 0.43348298734400925, "learning_rate": 3.686440662663431e-06, "loss": 0.0066, "step": 6396 }, { "epoch": 5.972922502334267, "grad_norm": 5.86018439106168, "learning_rate": 3.6849817113431675e-06, "loss": 0.3613, "step": 6397 }, { "epoch": 5.973856209150327, "grad_norm": 2.1176801567344237, "learning_rate": 3.6835228802957703e-06, "loss": 0.1061, "step": 6398 }, { "epoch": 5.974789915966387, "grad_norm": 1.322951654493194, "learning_rate": 3.682064169654663e-06, "loss": 0.0619, "step": 6399 }, { "epoch": 5.975723622782446, "grad_norm": 2.229507604085081, "learning_rate": 3.6806055795532616e-06, "loss": 0.0835, "step": 6400 }, { "epoch": 5.976657329598506, "grad_norm": 0.4099983748791834, "learning_rate": 3.6791471101249666e-06, "loss": 0.007, "step": 6401 }, { "epoch": 5.977591036414566, "grad_norm": 0.7185831749322819, "learning_rate": 3.6776887615031763e-06, "loss": 0.0209, "step": 6402 }, { "epoch": 5.978524743230626, "grad_norm": 1.2905606719291085, "learning_rate": 3.676230533821271e-06, "loss": 0.0537, "step": 6403 }, { "epoch": 5.979458450046685, "grad_norm": 2.2684258183967825, "learning_rate": 3.6747724272126205e-06, "loss": 0.0917, "step": 6404 }, { "epoch": 5.980392156862745, "grad_norm": 0.8745704059504709, "learning_rate": 3.6733144418105837e-06, "loss": 0.0296, "step": 6405 }, { "epoch": 5.981325863678805, "grad_norm": 3.4448196752959563, "learning_rate": 3.6718565777485127e-06, "loss": 0.2289, "step": 6406 }, { "epoch": 5.982259570494865, "grad_norm": 3.922470437873115, "learning_rate": 3.670398835159744e-06, "loss": 0.1491, "step": 6407 }, { "epoch": 5.983193277310924, "grad_norm": 0.30616477288639027, "learning_rate": 3.668941214177603e-06, "loss": 0.0074, "step": 6408 }, { "epoch": 5.984126984126984, "grad_norm": 5.431535548170375, "learning_rate": 3.6674837149354027e-06, "loss": 0.0816, "step": 6409 }, { "epoch": 5.985060690943044, "grad_norm": 0.4837100481021608, "learning_rate": 3.666026337566454e-06, "loss": 0.0159, "step": 6410 }, { "epoch": 5.985994397759104, "grad_norm": 2.8545344991968498, "learning_rate": 3.664569082204046e-06, "loss": 0.096, "step": 6411 }, { "epoch": 5.9869281045751634, "grad_norm": 1.8894331764136139, "learning_rate": 3.6631119489814595e-06, "loss": 0.0585, "step": 6412 }, { "epoch": 5.987861811391223, "grad_norm": 6.603028888927112, "learning_rate": 3.661654938031968e-06, "loss": 0.1792, "step": 6413 }, { "epoch": 5.988795518207283, "grad_norm": 3.0053790125391235, "learning_rate": 3.66019804948883e-06, "loss": 0.1548, "step": 6414 }, { "epoch": 5.989729225023343, "grad_norm": 1.9194695190584063, "learning_rate": 3.658741283485295e-06, "loss": 0.0821, "step": 6415 }, { "epoch": 5.9906629318394025, "grad_norm": 0.1925179240376722, "learning_rate": 3.6572846401545982e-06, "loss": 0.0026, "step": 6416 }, { "epoch": 5.991596638655462, "grad_norm": 0.6476238293264, "learning_rate": 3.655828119629967e-06, "loss": 0.0149, "step": 6417 }, { "epoch": 5.992530345471522, "grad_norm": 2.544444044401352, "learning_rate": 3.654371722044616e-06, "loss": 0.1202, "step": 6418 }, { "epoch": 5.993464052287582, "grad_norm": 2.16509134615814, "learning_rate": 3.6529154475317487e-06, "loss": 0.121, "step": 6419 }, { "epoch": 5.9943977591036415, "grad_norm": 2.7449649497306017, "learning_rate": 3.651459296224558e-06, "loss": 0.169, "step": 6420 }, { "epoch": 5.995331465919701, "grad_norm": 1.7871490338859493, "learning_rate": 3.6500032682562247e-06, "loss": 0.0984, "step": 6421 }, { "epoch": 5.996265172735761, "grad_norm": 0.4260895139456973, "learning_rate": 3.6485473637599163e-06, "loss": 0.0075, "step": 6422 }, { "epoch": 5.997198879551821, "grad_norm": 0.4955631926899221, "learning_rate": 3.647091582868795e-06, "loss": 0.01, "step": 6423 }, { "epoch": 5.9981325863678805, "grad_norm": 1.6415080905793085, "learning_rate": 3.6456359257160063e-06, "loss": 0.0634, "step": 6424 }, { "epoch": 5.99906629318394, "grad_norm": 1.903888396890426, "learning_rate": 3.6441803924346857e-06, "loss": 0.1007, "step": 6425 }, { "epoch": 6.0, "grad_norm": 0.8424114868652254, "learning_rate": 3.642724983157956e-06, "loss": 0.0232, "step": 6426 }, { "epoch": 6.00093370681606, "grad_norm": 0.7663834372289686, "learning_rate": 3.641269698018934e-06, "loss": 0.0169, "step": 6427 }, { "epoch": 6.0018674136321195, "grad_norm": 1.4607404802647084, "learning_rate": 3.6398145371507197e-06, "loss": 0.0543, "step": 6428 }, { "epoch": 6.002801120448179, "grad_norm": 2.3683329493468444, "learning_rate": 3.6383595006864035e-06, "loss": 0.0665, "step": 6429 }, { "epoch": 6.003734827264239, "grad_norm": 1.792080281551047, "learning_rate": 3.6369045887590647e-06, "loss": 0.0997, "step": 6430 }, { "epoch": 6.004668534080299, "grad_norm": 3.0153600867015156, "learning_rate": 3.6354498015017693e-06, "loss": 0.0868, "step": 6431 }, { "epoch": 6.0056022408963585, "grad_norm": 2.466892723351056, "learning_rate": 3.633995139047577e-06, "loss": 0.0959, "step": 6432 }, { "epoch": 6.006535947712418, "grad_norm": 1.8916142717991802, "learning_rate": 3.63254060152953e-06, "loss": 0.0506, "step": 6433 }, { "epoch": 6.007469654528478, "grad_norm": 2.425431477648274, "learning_rate": 3.631086189080663e-06, "loss": 0.083, "step": 6434 }, { "epoch": 6.008403361344538, "grad_norm": 0.49650964539178666, "learning_rate": 3.6296319018339954e-06, "loss": 0.008, "step": 6435 }, { "epoch": 6.0093370681605975, "grad_norm": 3.143318189673654, "learning_rate": 3.6281777399225414e-06, "loss": 0.0934, "step": 6436 }, { "epoch": 6.010270774976657, "grad_norm": 5.037365272212556, "learning_rate": 3.626723703479299e-06, "loss": 0.2001, "step": 6437 }, { "epoch": 6.011204481792717, "grad_norm": 0.19050561319303538, "learning_rate": 3.6252697926372554e-06, "loss": 0.0034, "step": 6438 }, { "epoch": 6.012138188608777, "grad_norm": 2.326435031505903, "learning_rate": 3.623816007529384e-06, "loss": 0.0856, "step": 6439 }, { "epoch": 6.0130718954248366, "grad_norm": 2.005871194522197, "learning_rate": 3.6223623482886545e-06, "loss": 0.0647, "step": 6440 }, { "epoch": 6.014005602240896, "grad_norm": 1.3074687344073377, "learning_rate": 3.6209088150480177e-06, "loss": 0.041, "step": 6441 }, { "epoch": 6.014939309056956, "grad_norm": 1.8715206743740749, "learning_rate": 3.6194554079404145e-06, "loss": 0.0652, "step": 6442 }, { "epoch": 6.015873015873016, "grad_norm": 1.8694091829860118, "learning_rate": 3.618002127098773e-06, "loss": 0.0727, "step": 6443 }, { "epoch": 6.016806722689076, "grad_norm": 2.9964336310650523, "learning_rate": 3.616548972656017e-06, "loss": 0.0752, "step": 6444 }, { "epoch": 6.017740429505135, "grad_norm": 2.8486201697476385, "learning_rate": 3.6150959447450506e-06, "loss": 0.1191, "step": 6445 }, { "epoch": 6.018674136321195, "grad_norm": 3.77238953312736, "learning_rate": 3.6136430434987686e-06, "loss": 0.1946, "step": 6446 }, { "epoch": 6.019607843137255, "grad_norm": 0.32556214646717135, "learning_rate": 3.6121902690500542e-06, "loss": 0.0024, "step": 6447 }, { "epoch": 6.020541549953315, "grad_norm": 0.9760988757406199, "learning_rate": 3.6107376215317813e-06, "loss": 0.0309, "step": 6448 }, { "epoch": 6.021475256769374, "grad_norm": 1.1166576145573515, "learning_rate": 3.6092851010768106e-06, "loss": 0.0476, "step": 6449 }, { "epoch": 6.022408963585434, "grad_norm": 1.143381848844739, "learning_rate": 3.6078327078179896e-06, "loss": 0.0484, "step": 6450 }, { "epoch": 6.023342670401494, "grad_norm": 2.688531706912653, "learning_rate": 3.6063804418881555e-06, "loss": 0.1871, "step": 6451 }, { "epoch": 6.024276377217554, "grad_norm": 1.1584239208858482, "learning_rate": 3.6049283034201354e-06, "loss": 0.0455, "step": 6452 }, { "epoch": 6.025210084033613, "grad_norm": 0.3636498903544788, "learning_rate": 3.6034762925467435e-06, "loss": 0.0062, "step": 6453 }, { "epoch": 6.026143790849673, "grad_norm": 0.8730245137462017, "learning_rate": 3.60202440940078e-06, "loss": 0.0305, "step": 6454 }, { "epoch": 6.027077497665733, "grad_norm": 2.8084749705903493, "learning_rate": 3.6005726541150377e-06, "loss": 0.0817, "step": 6455 }, { "epoch": 6.028011204481793, "grad_norm": 5.67252421093872, "learning_rate": 3.5991210268222943e-06, "loss": 0.1999, "step": 6456 }, { "epoch": 6.028944911297852, "grad_norm": 3.6641247354635875, "learning_rate": 3.5976695276553176e-06, "loss": 0.1098, "step": 6457 }, { "epoch": 6.029878618113912, "grad_norm": 2.530611359034067, "learning_rate": 3.5962181567468634e-06, "loss": 0.1319, "step": 6458 }, { "epoch": 6.030812324929972, "grad_norm": 2.05355015752244, "learning_rate": 3.5947669142296754e-06, "loss": 0.0827, "step": 6459 }, { "epoch": 6.031746031746032, "grad_norm": 0.38628545631766614, "learning_rate": 3.5933158002364854e-06, "loss": 0.0046, "step": 6460 }, { "epoch": 6.032679738562091, "grad_norm": 1.6957649255287834, "learning_rate": 3.591864814900012e-06, "loss": 0.0853, "step": 6461 }, { "epoch": 6.033613445378151, "grad_norm": 1.845283276618772, "learning_rate": 3.5904139583529675e-06, "loss": 0.0845, "step": 6462 }, { "epoch": 6.034547152194211, "grad_norm": 1.077445065735897, "learning_rate": 3.5889632307280465e-06, "loss": 0.0195, "step": 6463 }, { "epoch": 6.035480859010271, "grad_norm": 2.9459416449124407, "learning_rate": 3.5875126321579336e-06, "loss": 0.0843, "step": 6464 }, { "epoch": 6.03641456582633, "grad_norm": 1.4042200501163904, "learning_rate": 3.5860621627753007e-06, "loss": 0.0564, "step": 6465 }, { "epoch": 6.03734827264239, "grad_norm": 6.505492595614317, "learning_rate": 3.5846118227128136e-06, "loss": 0.2059, "step": 6466 }, { "epoch": 6.03828197945845, "grad_norm": 3.5090046161859765, "learning_rate": 3.583161612103118e-06, "loss": 0.1915, "step": 6467 }, { "epoch": 6.03921568627451, "grad_norm": 1.3001567176334912, "learning_rate": 3.5817115310788536e-06, "loss": 0.0507, "step": 6468 }, { "epoch": 6.040149393090569, "grad_norm": 1.0178793636990853, "learning_rate": 3.5802615797726426e-06, "loss": 0.0115, "step": 6469 }, { "epoch": 6.041083099906629, "grad_norm": 0.5224099824893923, "learning_rate": 3.5788117583171046e-06, "loss": 0.0131, "step": 6470 }, { "epoch": 6.042016806722689, "grad_norm": 0.49327182621352955, "learning_rate": 3.5773620668448384e-06, "loss": 0.0153, "step": 6471 }, { "epoch": 6.042950513538749, "grad_norm": 2.576240706164246, "learning_rate": 3.5759125054884346e-06, "loss": 0.0925, "step": 6472 }, { "epoch": 6.043884220354808, "grad_norm": 4.675678395558808, "learning_rate": 3.5744630743804698e-06, "loss": 0.0171, "step": 6473 }, { "epoch": 6.044817927170868, "grad_norm": 2.615847573180402, "learning_rate": 3.573013773653514e-06, "loss": 0.1023, "step": 6474 }, { "epoch": 6.045751633986928, "grad_norm": 3.4612478603877928, "learning_rate": 3.5715646034401203e-06, "loss": 0.1904, "step": 6475 }, { "epoch": 6.046685340802988, "grad_norm": 1.0151978746499033, "learning_rate": 3.5701155638728303e-06, "loss": 0.0267, "step": 6476 }, { "epoch": 6.0476190476190474, "grad_norm": 0.8018591185016939, "learning_rate": 3.568666655084175e-06, "loss": 0.0276, "step": 6477 }, { "epoch": 6.048552754435107, "grad_norm": 1.1567139113661011, "learning_rate": 3.567217877206671e-06, "loss": 0.0315, "step": 6478 }, { "epoch": 6.049486461251167, "grad_norm": 1.352115728551865, "learning_rate": 3.5657692303728285e-06, "loss": 0.0623, "step": 6479 }, { "epoch": 6.050420168067227, "grad_norm": 1.745491378373633, "learning_rate": 3.5643207147151413e-06, "loss": 0.0598, "step": 6480 }, { "epoch": 6.0513538748832865, "grad_norm": 2.0866312227963957, "learning_rate": 3.562872330366092e-06, "loss": 0.1023, "step": 6481 }, { "epoch": 6.052287581699346, "grad_norm": 0.5223194275742588, "learning_rate": 3.5614240774581477e-06, "loss": 0.0094, "step": 6482 }, { "epoch": 6.053221288515406, "grad_norm": 2.2725473078692904, "learning_rate": 3.5599759561237725e-06, "loss": 0.0799, "step": 6483 }, { "epoch": 6.054154995331466, "grad_norm": 4.340976356963623, "learning_rate": 3.5585279664954096e-06, "loss": 0.1767, "step": 6484 }, { "epoch": 6.0550887021475255, "grad_norm": 2.252369111805773, "learning_rate": 3.5570801087054942e-06, "loss": 0.1179, "step": 6485 }, { "epoch": 6.056022408963585, "grad_norm": 1.2819307825010169, "learning_rate": 3.555632382886449e-06, "loss": 0.0527, "step": 6486 }, { "epoch": 6.056956115779645, "grad_norm": 0.7338036157411918, "learning_rate": 3.554184789170686e-06, "loss": 0.0223, "step": 6487 }, { "epoch": 6.057889822595705, "grad_norm": 0.6855328445119776, "learning_rate": 3.5527373276906007e-06, "loss": 0.0127, "step": 6488 }, { "epoch": 6.0588235294117645, "grad_norm": 2.238379598310911, "learning_rate": 3.551289998578581e-06, "loss": 0.1266, "step": 6489 }, { "epoch": 6.059757236227824, "grad_norm": 2.1299015915075423, "learning_rate": 3.549842801967e-06, "loss": 0.0807, "step": 6490 }, { "epoch": 6.060690943043884, "grad_norm": 1.1585861493076155, "learning_rate": 3.5483957379882218e-06, "loss": 0.0343, "step": 6491 }, { "epoch": 6.061624649859944, "grad_norm": 2.4337564070029285, "learning_rate": 3.546948806774594e-06, "loss": 0.1005, "step": 6492 }, { "epoch": 6.0625583566760035, "grad_norm": 1.682212360729406, "learning_rate": 3.5455020084584556e-06, "loss": 0.0636, "step": 6493 }, { "epoch": 6.063492063492063, "grad_norm": 3.8342924383377825, "learning_rate": 3.5440553431721323e-06, "loss": 0.1698, "step": 6494 }, { "epoch": 6.064425770308123, "grad_norm": 0.7291204665460614, "learning_rate": 3.542608811047936e-06, "loss": 0.0275, "step": 6495 }, { "epoch": 6.065359477124183, "grad_norm": 0.9883049295629599, "learning_rate": 3.5411624122181687e-06, "loss": 0.0368, "step": 6496 }, { "epoch": 6.0662931839402425, "grad_norm": 2.6320455813711794, "learning_rate": 3.539716146815122e-06, "loss": 0.0615, "step": 6497 }, { "epoch": 6.067226890756302, "grad_norm": 1.3817517406316102, "learning_rate": 3.538270014971069e-06, "loss": 0.0543, "step": 6498 }, { "epoch": 6.068160597572362, "grad_norm": 0.8086932144664803, "learning_rate": 3.536824016818275e-06, "loss": 0.0223, "step": 6499 }, { "epoch": 6.069094304388422, "grad_norm": 3.1106808611454606, "learning_rate": 3.535378152488995e-06, "loss": 0.1702, "step": 6500 }, { "epoch": 6.0700280112044815, "grad_norm": 0.3946041741649243, "learning_rate": 3.5339324221154674e-06, "loss": 0.0049, "step": 6501 }, { "epoch": 6.070961718020541, "grad_norm": 1.0638411833700117, "learning_rate": 3.5324868258299204e-06, "loss": 0.026, "step": 6502 }, { "epoch": 6.071895424836601, "grad_norm": 2.194026897690888, "learning_rate": 3.531041363764567e-06, "loss": 0.0775, "step": 6503 }, { "epoch": 6.072829131652661, "grad_norm": 0.5783441084505926, "learning_rate": 3.5295960360516157e-06, "loss": 0.0086, "step": 6504 }, { "epoch": 6.073762838468721, "grad_norm": 0.5242102804637663, "learning_rate": 3.5281508428232548e-06, "loss": 0.0188, "step": 6505 }, { "epoch": 6.07469654528478, "grad_norm": 3.402580435940352, "learning_rate": 3.5267057842116624e-06, "loss": 0.1637, "step": 6506 }, { "epoch": 6.07563025210084, "grad_norm": 5.222260747592386, "learning_rate": 3.5252608603490045e-06, "loss": 0.2795, "step": 6507 }, { "epoch": 6.0765639589169, "grad_norm": 5.984132505411364, "learning_rate": 3.5238160713674387e-06, "loss": 0.1648, "step": 6508 }, { "epoch": 6.07749766573296, "grad_norm": 1.280603727531008, "learning_rate": 3.522371417399104e-06, "loss": 0.0345, "step": 6509 }, { "epoch": 6.078431372549019, "grad_norm": 1.2006169188439946, "learning_rate": 3.5209268985761315e-06, "loss": 0.014, "step": 6510 }, { "epoch": 6.079365079365079, "grad_norm": 0.8818123302671625, "learning_rate": 3.5194825150306365e-06, "loss": 0.0279, "step": 6511 }, { "epoch": 6.080298786181139, "grad_norm": 1.3027492633327389, "learning_rate": 3.5180382668947227e-06, "loss": 0.0689, "step": 6512 }, { "epoch": 6.081232492997199, "grad_norm": 1.467930068189574, "learning_rate": 3.5165941543004867e-06, "loss": 0.0411, "step": 6513 }, { "epoch": 6.082166199813258, "grad_norm": 2.555985431393325, "learning_rate": 3.5151501773800055e-06, "loss": 0.0848, "step": 6514 }, { "epoch": 6.083099906629318, "grad_norm": 1.522264216280435, "learning_rate": 3.513706336265348e-06, "loss": 0.0887, "step": 6515 }, { "epoch": 6.084033613445378, "grad_norm": 0.26846014657530787, "learning_rate": 3.5122626310885665e-06, "loss": 0.0025, "step": 6516 }, { "epoch": 6.084967320261438, "grad_norm": 1.8320157405829023, "learning_rate": 3.5108190619817073e-06, "loss": 0.085, "step": 6517 }, { "epoch": 6.085901027077497, "grad_norm": 1.4502727913748628, "learning_rate": 3.5093756290767994e-06, "loss": 0.0669, "step": 6518 }, { "epoch": 6.086834733893557, "grad_norm": 1.251000739245292, "learning_rate": 3.5079323325058602e-06, "loss": 0.0541, "step": 6519 }, { "epoch": 6.087768440709617, "grad_norm": 1.8266478355079139, "learning_rate": 3.506489172400893e-06, "loss": 0.0574, "step": 6520 }, { "epoch": 6.088702147525677, "grad_norm": 1.1606339085495694, "learning_rate": 3.5050461488938955e-06, "loss": 0.0389, "step": 6521 }, { "epoch": 6.089635854341736, "grad_norm": 1.8417892743064093, "learning_rate": 3.5036032621168448e-06, "loss": 0.0959, "step": 6522 }, { "epoch": 6.090569561157796, "grad_norm": 2.6485010320912177, "learning_rate": 3.5021605122017086e-06, "loss": 0.176, "step": 6523 }, { "epoch": 6.091503267973856, "grad_norm": 1.456846549711532, "learning_rate": 3.500717899280442e-06, "loss": 0.0707, "step": 6524 }, { "epoch": 6.092436974789916, "grad_norm": 1.503954372186216, "learning_rate": 3.4992754234849903e-06, "loss": 0.0676, "step": 6525 }, { "epoch": 6.093370681605975, "grad_norm": 0.6250408761675285, "learning_rate": 3.4978330849472818e-06, "loss": 0.0136, "step": 6526 }, { "epoch": 6.094304388422035, "grad_norm": 1.4855466086232314, "learning_rate": 3.4963908837992334e-06, "loss": 0.0643, "step": 6527 }, { "epoch": 6.095238095238095, "grad_norm": 5.3711397527586096, "learning_rate": 3.4949488201727535e-06, "loss": 0.3187, "step": 6528 }, { "epoch": 6.096171802054155, "grad_norm": 1.4422502160694315, "learning_rate": 3.4935068941997303e-06, "loss": 0.0682, "step": 6529 }, { "epoch": 6.097105508870214, "grad_norm": 0.9199058918167478, "learning_rate": 3.4920651060120468e-06, "loss": 0.0277, "step": 6530 }, { "epoch": 6.098039215686274, "grad_norm": 3.0779444522343704, "learning_rate": 3.49062345574157e-06, "loss": 0.1681, "step": 6531 }, { "epoch": 6.098972922502334, "grad_norm": 2.2991535311554654, "learning_rate": 3.489181943520154e-06, "loss": 0.1289, "step": 6532 }, { "epoch": 6.099906629318394, "grad_norm": 2.8093784489463345, "learning_rate": 3.4877405694796406e-06, "loss": 0.162, "step": 6533 }, { "epoch": 6.100840336134453, "grad_norm": 1.1226979314351033, "learning_rate": 3.486299333751859e-06, "loss": 0.0625, "step": 6534 }, { "epoch": 6.101774042950513, "grad_norm": 2.8651753126320947, "learning_rate": 3.484858236468629e-06, "loss": 0.1808, "step": 6535 }, { "epoch": 6.102707749766573, "grad_norm": 0.6544402221622176, "learning_rate": 3.483417277761752e-06, "loss": 0.0157, "step": 6536 }, { "epoch": 6.103641456582633, "grad_norm": 9.050041729313973, "learning_rate": 3.4819764577630178e-06, "loss": 0.1909, "step": 6537 }, { "epoch": 6.104575163398692, "grad_norm": 1.5465885972723648, "learning_rate": 3.4805357766042103e-06, "loss": 0.0423, "step": 6538 }, { "epoch": 6.105508870214752, "grad_norm": 1.9415711674769516, "learning_rate": 3.4790952344170927e-06, "loss": 0.1036, "step": 6539 }, { "epoch": 6.106442577030812, "grad_norm": 1.13269066570727, "learning_rate": 3.477654831333418e-06, "loss": 0.0427, "step": 6540 }, { "epoch": 6.107376283846872, "grad_norm": 10.335080950549425, "learning_rate": 3.476214567484926e-06, "loss": 0.2184, "step": 6541 }, { "epoch": 6.1083099906629315, "grad_norm": 2.3525276411521694, "learning_rate": 3.4747744430033474e-06, "loss": 0.1357, "step": 6542 }, { "epoch": 6.109243697478991, "grad_norm": 1.649268215296684, "learning_rate": 3.4733344580203967e-06, "loss": 0.0725, "step": 6543 }, { "epoch": 6.110177404295051, "grad_norm": 0.49151398638171534, "learning_rate": 3.4718946126677755e-06, "loss": 0.0133, "step": 6544 }, { "epoch": 6.111111111111111, "grad_norm": 2.0441137448505837, "learning_rate": 3.4704549070771732e-06, "loss": 0.0929, "step": 6545 }, { "epoch": 6.1120448179271705, "grad_norm": 1.0783603515032318, "learning_rate": 3.469015341380266e-06, "loss": 0.0556, "step": 6546 }, { "epoch": 6.11297852474323, "grad_norm": 0.42272122000445755, "learning_rate": 3.467575915708721e-06, "loss": 0.0134, "step": 6547 }, { "epoch": 6.11391223155929, "grad_norm": 2.664594781588241, "learning_rate": 3.466136630194187e-06, "loss": 0.1184, "step": 6548 }, { "epoch": 6.11484593837535, "grad_norm": 1.2884248575568127, "learning_rate": 3.4646974849683033e-06, "loss": 0.0311, "step": 6549 }, { "epoch": 6.1157796451914095, "grad_norm": 5.0214345150727775, "learning_rate": 3.4632584801626935e-06, "loss": 0.2145, "step": 6550 }, { "epoch": 6.116713352007469, "grad_norm": 0.3126327745015109, "learning_rate": 3.4618196159089745e-06, "loss": 0.0042, "step": 6551 }, { "epoch": 6.117647058823529, "grad_norm": 0.8615020523402002, "learning_rate": 3.460380892338744e-06, "loss": 0.0427, "step": 6552 }, { "epoch": 6.118580765639589, "grad_norm": 1.302138736837939, "learning_rate": 3.4589423095835884e-06, "loss": 0.0369, "step": 6553 }, { "epoch": 6.1195144724556485, "grad_norm": 0.9472463604872654, "learning_rate": 3.4575038677750806e-06, "loss": 0.0381, "step": 6554 }, { "epoch": 6.120448179271708, "grad_norm": 0.8841501802374312, "learning_rate": 3.4560655670447864e-06, "loss": 0.0194, "step": 6555 }, { "epoch": 6.121381886087768, "grad_norm": 1.5753109228096585, "learning_rate": 3.4546274075242515e-06, "loss": 0.0615, "step": 6556 }, { "epoch": 6.122315592903828, "grad_norm": 0.5013626195423756, "learning_rate": 3.4531893893450115e-06, "loss": 0.0091, "step": 6557 }, { "epoch": 6.1232492997198875, "grad_norm": 0.9356555759278765, "learning_rate": 3.4517515126385866e-06, "loss": 0.04, "step": 6558 }, { "epoch": 6.124183006535947, "grad_norm": 0.6169154564582713, "learning_rate": 3.450313777536491e-06, "loss": 0.0138, "step": 6559 }, { "epoch": 6.125116713352007, "grad_norm": 1.972413518803508, "learning_rate": 3.448876184170219e-06, "loss": 0.0991, "step": 6560 }, { "epoch": 6.126050420168067, "grad_norm": 3.35988328791835, "learning_rate": 3.447438732671255e-06, "loss": 0.1078, "step": 6561 }, { "epoch": 6.1269841269841265, "grad_norm": 0.8615256797720068, "learning_rate": 3.4460014231710675e-06, "loss": 0.0372, "step": 6562 }, { "epoch": 6.127917833800186, "grad_norm": 2.0927093248948827, "learning_rate": 3.444564255801116e-06, "loss": 0.088, "step": 6563 }, { "epoch": 6.128851540616246, "grad_norm": 2.7078763777975583, "learning_rate": 3.4431272306928467e-06, "loss": 0.101, "step": 6564 }, { "epoch": 6.129785247432307, "grad_norm": 1.9074189087662377, "learning_rate": 3.4416903479776885e-06, "loss": 0.092, "step": 6565 }, { "epoch": 6.130718954248366, "grad_norm": 0.6208126586458974, "learning_rate": 3.4402536077870625e-06, "loss": 0.0144, "step": 6566 }, { "epoch": 6.131652661064426, "grad_norm": 0.86472967339048, "learning_rate": 3.4388170102523723e-06, "loss": 0.035, "step": 6567 }, { "epoch": 6.132586367880486, "grad_norm": 0.36261865286527795, "learning_rate": 3.4373805555050123e-06, "loss": 0.0064, "step": 6568 }, { "epoch": 6.133520074696546, "grad_norm": 1.0504777773742175, "learning_rate": 3.4359442436763597e-06, "loss": 0.0448, "step": 6569 }, { "epoch": 6.1344537815126055, "grad_norm": 1.4559158171462405, "learning_rate": 3.4345080748977844e-06, "loss": 0.0643, "step": 6570 }, { "epoch": 6.135387488328665, "grad_norm": 0.9066583752077568, "learning_rate": 3.4330720493006365e-06, "loss": 0.0232, "step": 6571 }, { "epoch": 6.136321195144725, "grad_norm": 0.6844869638532894, "learning_rate": 3.4316361670162574e-06, "loss": 0.0208, "step": 6572 }, { "epoch": 6.137254901960785, "grad_norm": 2.0085465046014104, "learning_rate": 3.430200428175976e-06, "loss": 0.0527, "step": 6573 }, { "epoch": 6.1381886087768445, "grad_norm": 3.4008507388158655, "learning_rate": 3.4287648329111055e-06, "loss": 0.1524, "step": 6574 }, { "epoch": 6.139122315592904, "grad_norm": 2.7447478026632997, "learning_rate": 3.427329381352945e-06, "loss": 0.1513, "step": 6575 }, { "epoch": 6.140056022408964, "grad_norm": 1.3981519647617704, "learning_rate": 3.4258940736327833e-06, "loss": 0.0543, "step": 6576 }, { "epoch": 6.140989729225024, "grad_norm": 1.636160871371069, "learning_rate": 3.424458909881897e-06, "loss": 0.042, "step": 6577 }, { "epoch": 6.1419234360410835, "grad_norm": 0.8233848065477725, "learning_rate": 3.423023890231546e-06, "loss": 0.0355, "step": 6578 }, { "epoch": 6.142857142857143, "grad_norm": 0.6724069350692313, "learning_rate": 3.421589014812978e-06, "loss": 0.0269, "step": 6579 }, { "epoch": 6.143790849673203, "grad_norm": 0.6948416889075689, "learning_rate": 3.4201542837574276e-06, "loss": 0.0188, "step": 6580 }, { "epoch": 6.144724556489263, "grad_norm": 0.5082374793826009, "learning_rate": 3.418719697196119e-06, "loss": 0.0098, "step": 6581 }, { "epoch": 6.1456582633053225, "grad_norm": 1.8316180993365454, "learning_rate": 3.417285255260261e-06, "loss": 0.079, "step": 6582 }, { "epoch": 6.146591970121382, "grad_norm": 0.34187320898908613, "learning_rate": 3.4158509580810466e-06, "loss": 0.0075, "step": 6583 }, { "epoch": 6.147525676937442, "grad_norm": 1.197984112391592, "learning_rate": 3.4144168057896576e-06, "loss": 0.0487, "step": 6584 }, { "epoch": 6.148459383753502, "grad_norm": 1.4385935015607998, "learning_rate": 3.412982798517267e-06, "loss": 0.0692, "step": 6585 }, { "epoch": 6.1493930905695615, "grad_norm": 1.4197786432504156, "learning_rate": 3.411548936395028e-06, "loss": 0.0619, "step": 6586 }, { "epoch": 6.150326797385621, "grad_norm": 2.3590263952911896, "learning_rate": 3.4101152195540832e-06, "loss": 0.0814, "step": 6587 }, { "epoch": 6.151260504201681, "grad_norm": 0.7922949685125196, "learning_rate": 3.4086816481255595e-06, "loss": 0.0216, "step": 6588 }, { "epoch": 6.152194211017741, "grad_norm": 0.3070618523657362, "learning_rate": 3.4072482222405772e-06, "loss": 0.0074, "step": 6589 }, { "epoch": 6.1531279178338005, "grad_norm": 0.6233269197294343, "learning_rate": 3.405814942030237e-06, "loss": 0.024, "step": 6590 }, { "epoch": 6.15406162464986, "grad_norm": 2.980171399068821, "learning_rate": 3.4043818076256276e-06, "loss": 0.1529, "step": 6591 }, { "epoch": 6.15499533146592, "grad_norm": 0.44948544529481943, "learning_rate": 3.402948819157822e-06, "loss": 0.0162, "step": 6592 }, { "epoch": 6.15592903828198, "grad_norm": 0.8001986147895754, "learning_rate": 3.4015159767578885e-06, "loss": 0.0257, "step": 6593 }, { "epoch": 6.1568627450980395, "grad_norm": 0.37419948258117547, "learning_rate": 3.400083280556874e-06, "loss": 0.0048, "step": 6594 }, { "epoch": 6.157796451914099, "grad_norm": 1.1763771977285047, "learning_rate": 3.398650730685813e-06, "loss": 0.0461, "step": 6595 }, { "epoch": 6.158730158730159, "grad_norm": 0.2900412743955415, "learning_rate": 3.397218327275729e-06, "loss": 0.0045, "step": 6596 }, { "epoch": 6.159663865546219, "grad_norm": 1.1637594348695885, "learning_rate": 3.395786070457629e-06, "loss": 0.0411, "step": 6597 }, { "epoch": 6.160597572362279, "grad_norm": 2.3886191250241033, "learning_rate": 3.3943539603625127e-06, "loss": 0.1666, "step": 6598 }, { "epoch": 6.161531279178338, "grad_norm": 0.7961541098401986, "learning_rate": 3.392921997121359e-06, "loss": 0.0218, "step": 6599 }, { "epoch": 6.162464985994398, "grad_norm": 1.9898250130741415, "learning_rate": 3.3914901808651374e-06, "loss": 0.0602, "step": 6600 }, { "epoch": 6.163398692810458, "grad_norm": 2.579975448146077, "learning_rate": 3.390058511724803e-06, "loss": 0.0815, "step": 6601 }, { "epoch": 6.164332399626518, "grad_norm": 0.9658997107611559, "learning_rate": 3.3886269898312997e-06, "loss": 0.0249, "step": 6602 }, { "epoch": 6.165266106442577, "grad_norm": 7.130219047430995, "learning_rate": 3.387195615315554e-06, "loss": 0.1094, "step": 6603 }, { "epoch": 6.166199813258637, "grad_norm": 0.4729023032422616, "learning_rate": 3.38576438830848e-06, "loss": 0.0141, "step": 6604 }, { "epoch": 6.167133520074697, "grad_norm": 0.5594241336705191, "learning_rate": 3.3843333089409804e-06, "loss": 0.0138, "step": 6605 }, { "epoch": 6.168067226890757, "grad_norm": 3.257137327023364, "learning_rate": 3.382902377343944e-06, "loss": 0.1493, "step": 6606 }, { "epoch": 6.169000933706816, "grad_norm": 1.1393464547037315, "learning_rate": 3.3814715936482435e-06, "loss": 0.0461, "step": 6607 }, { "epoch": 6.169934640522876, "grad_norm": 8.186323890043242, "learning_rate": 3.3800409579847415e-06, "loss": 0.2642, "step": 6608 }, { "epoch": 6.170868347338936, "grad_norm": 2.6032556207689646, "learning_rate": 3.3786104704842837e-06, "loss": 0.1717, "step": 6609 }, { "epoch": 6.171802054154996, "grad_norm": 3.6267361701144183, "learning_rate": 3.377180131277703e-06, "loss": 0.1287, "step": 6610 }, { "epoch": 6.172735760971055, "grad_norm": 1.2340735711933717, "learning_rate": 3.3757499404958235e-06, "loss": 0.0367, "step": 6611 }, { "epoch": 6.173669467787115, "grad_norm": 1.369778354957865, "learning_rate": 3.3743198982694492e-06, "loss": 0.0508, "step": 6612 }, { "epoch": 6.174603174603175, "grad_norm": 2.193565668073259, "learning_rate": 3.3728900047293734e-06, "loss": 0.1573, "step": 6613 }, { "epoch": 6.175536881419235, "grad_norm": 2.0243403587712128, "learning_rate": 3.3714602600063727e-06, "loss": 0.046, "step": 6614 }, { "epoch": 6.176470588235294, "grad_norm": 1.5481905591050409, "learning_rate": 3.370030664231219e-06, "loss": 0.0393, "step": 6615 }, { "epoch": 6.177404295051354, "grad_norm": 2.43283249285273, "learning_rate": 3.3686012175346615e-06, "loss": 0.0993, "step": 6616 }, { "epoch": 6.178338001867414, "grad_norm": 1.9633096744302463, "learning_rate": 3.3671719200474384e-06, "loss": 0.0406, "step": 6617 }, { "epoch": 6.179271708683474, "grad_norm": 0.4931978609531593, "learning_rate": 3.3657427719002733e-06, "loss": 0.0108, "step": 6618 }, { "epoch": 6.180205415499533, "grad_norm": 2.780396833875142, "learning_rate": 3.3643137732238806e-06, "loss": 0.1457, "step": 6619 }, { "epoch": 6.181139122315593, "grad_norm": 2.072474611215967, "learning_rate": 3.362884924148957e-06, "loss": 0.0731, "step": 6620 }, { "epoch": 6.182072829131653, "grad_norm": 2.711524668943687, "learning_rate": 3.3614562248061856e-06, "loss": 0.1649, "step": 6621 }, { "epoch": 6.183006535947713, "grad_norm": 0.16412596384586184, "learning_rate": 3.360027675326235e-06, "loss": 0.0051, "step": 6622 }, { "epoch": 6.183940242763772, "grad_norm": 2.111137628530808, "learning_rate": 3.3585992758397658e-06, "loss": 0.0621, "step": 6623 }, { "epoch": 6.184873949579832, "grad_norm": 4.053195891089663, "learning_rate": 3.3571710264774195e-06, "loss": 0.143, "step": 6624 }, { "epoch": 6.185807656395892, "grad_norm": 2.832163042604853, "learning_rate": 3.3557429273698234e-06, "loss": 0.1305, "step": 6625 }, { "epoch": 6.186741363211952, "grad_norm": 0.4435064068777494, "learning_rate": 3.3543149786475937e-06, "loss": 0.0108, "step": 6626 }, { "epoch": 6.187675070028011, "grad_norm": 2.8071858138943018, "learning_rate": 3.352887180441331e-06, "loss": 0.0945, "step": 6627 }, { "epoch": 6.188608776844071, "grad_norm": 0.7243641246296922, "learning_rate": 3.3514595328816255e-06, "loss": 0.0204, "step": 6628 }, { "epoch": 6.189542483660131, "grad_norm": 1.4317903673337387, "learning_rate": 3.3500320360990515e-06, "loss": 0.0573, "step": 6629 }, { "epoch": 6.190476190476191, "grad_norm": 2.2571475714838583, "learning_rate": 3.3486046902241663e-06, "loss": 0.0959, "step": 6630 }, { "epoch": 6.19140989729225, "grad_norm": 1.534678173487021, "learning_rate": 3.3471774953875164e-06, "loss": 0.0682, "step": 6631 }, { "epoch": 6.19234360410831, "grad_norm": 1.4118562594803699, "learning_rate": 3.3457504517196377e-06, "loss": 0.0496, "step": 6632 }, { "epoch": 6.19327731092437, "grad_norm": 1.8055007786286879, "learning_rate": 3.344323559351047e-06, "loss": 0.0808, "step": 6633 }, { "epoch": 6.19421101774043, "grad_norm": 1.1887751741610229, "learning_rate": 3.34289681841225e-06, "loss": 0.0356, "step": 6634 }, { "epoch": 6.1951447245564895, "grad_norm": 2.9511236023276513, "learning_rate": 3.341470229033735e-06, "loss": 0.0902, "step": 6635 }, { "epoch": 6.196078431372549, "grad_norm": 1.252065223878754, "learning_rate": 3.3400437913459837e-06, "loss": 0.0259, "step": 6636 }, { "epoch": 6.197012138188609, "grad_norm": 1.76458403845415, "learning_rate": 3.338617505479458e-06, "loss": 0.097, "step": 6637 }, { "epoch": 6.197945845004669, "grad_norm": 1.3395451243167207, "learning_rate": 3.3371913715646065e-06, "loss": 0.0273, "step": 6638 }, { "epoch": 6.1988795518207285, "grad_norm": 2.426195356808896, "learning_rate": 3.335765389731862e-06, "loss": 0.1045, "step": 6639 }, { "epoch": 6.199813258636788, "grad_norm": 0.8075196324533938, "learning_rate": 3.3343395601116524e-06, "loss": 0.0316, "step": 6640 }, { "epoch": 6.200746965452848, "grad_norm": 1.2142535921336086, "learning_rate": 3.332913882834382e-06, "loss": 0.0583, "step": 6641 }, { "epoch": 6.201680672268908, "grad_norm": 2.409824725029466, "learning_rate": 3.331488358030444e-06, "loss": 0.0671, "step": 6642 }, { "epoch": 6.2026143790849675, "grad_norm": 0.3743107967142233, "learning_rate": 3.33006298583022e-06, "loss": 0.01, "step": 6643 }, { "epoch": 6.203548085901027, "grad_norm": 1.7946520231441618, "learning_rate": 3.3286377663640753e-06, "loss": 0.07, "step": 6644 }, { "epoch": 6.204481792717087, "grad_norm": 2.2565413871339475, "learning_rate": 3.327212699762361e-06, "loss": 0.0678, "step": 6645 }, { "epoch": 6.205415499533147, "grad_norm": 1.0188899125338884, "learning_rate": 3.3257877861554165e-06, "loss": 0.0282, "step": 6646 }, { "epoch": 6.2063492063492065, "grad_norm": 1.9988275919854739, "learning_rate": 3.324363025673566e-06, "loss": 0.0626, "step": 6647 }, { "epoch": 6.207282913165266, "grad_norm": 0.26210108900743023, "learning_rate": 3.322938418447118e-06, "loss": 0.003, "step": 6648 }, { "epoch": 6.208216619981326, "grad_norm": 1.116339585780008, "learning_rate": 3.321513964606368e-06, "loss": 0.0205, "step": 6649 }, { "epoch": 6.209150326797386, "grad_norm": 1.4113081607964646, "learning_rate": 3.320089664281601e-06, "loss": 0.0707, "step": 6650 }, { "epoch": 6.2100840336134455, "grad_norm": 1.0009489330794954, "learning_rate": 3.318665517603083e-06, "loss": 0.0391, "step": 6651 }, { "epoch": 6.211017740429505, "grad_norm": 1.1043551055872696, "learning_rate": 3.317241524701066e-06, "loss": 0.0482, "step": 6652 }, { "epoch": 6.211951447245565, "grad_norm": 2.771660758929619, "learning_rate": 3.315817685705793e-06, "loss": 0.0442, "step": 6653 }, { "epoch": 6.212885154061625, "grad_norm": 0.32088192845348285, "learning_rate": 3.3143940007474886e-06, "loss": 0.012, "step": 6654 }, { "epoch": 6.2138188608776845, "grad_norm": 2.379299461341602, "learning_rate": 3.312970469956365e-06, "loss": 0.102, "step": 6655 }, { "epoch": 6.214752567693744, "grad_norm": 1.9803709758957155, "learning_rate": 3.311547093462616e-06, "loss": 0.1134, "step": 6656 }, { "epoch": 6.215686274509804, "grad_norm": 0.38836495053909226, "learning_rate": 3.3101238713964302e-06, "loss": 0.0088, "step": 6657 }, { "epoch": 6.216619981325864, "grad_norm": 0.7649825132092628, "learning_rate": 3.308700803887974e-06, "loss": 0.015, "step": 6658 }, { "epoch": 6.2175536881419236, "grad_norm": 0.3228687188022477, "learning_rate": 3.307277891067404e-06, "loss": 0.0091, "step": 6659 }, { "epoch": 6.218487394957983, "grad_norm": 2.0724170568184355, "learning_rate": 3.3058551330648604e-06, "loss": 0.0759, "step": 6660 }, { "epoch": 6.219421101774043, "grad_norm": 2.415846622942065, "learning_rate": 3.3044325300104678e-06, "loss": 0.1054, "step": 6661 }, { "epoch": 6.220354808590103, "grad_norm": 3.2340543809347966, "learning_rate": 3.3030100820343437e-06, "loss": 0.1619, "step": 6662 }, { "epoch": 6.221288515406163, "grad_norm": 1.179888476751292, "learning_rate": 3.301587789266583e-06, "loss": 0.0464, "step": 6663 }, { "epoch": 6.222222222222222, "grad_norm": 3.402681082734957, "learning_rate": 3.3001656518372715e-06, "loss": 0.1228, "step": 6664 }, { "epoch": 6.223155929038282, "grad_norm": 5.116704874842135, "learning_rate": 3.2987436698764764e-06, "loss": 0.2579, "step": 6665 }, { "epoch": 6.224089635854342, "grad_norm": 0.72823047050019, "learning_rate": 3.2973218435142586e-06, "loss": 0.0265, "step": 6666 }, { "epoch": 6.225023342670402, "grad_norm": 2.2529067581938187, "learning_rate": 3.2959001728806566e-06, "loss": 0.1024, "step": 6667 }, { "epoch": 6.225957049486461, "grad_norm": 1.2394066761502358, "learning_rate": 3.2944786581056985e-06, "loss": 0.0514, "step": 6668 }, { "epoch": 6.226890756302521, "grad_norm": 0.5165311191011962, "learning_rate": 3.2930572993193954e-06, "loss": 0.0176, "step": 6669 }, { "epoch": 6.227824463118581, "grad_norm": 0.3435947952700756, "learning_rate": 3.291636096651749e-06, "loss": 0.0043, "step": 6670 }, { "epoch": 6.228758169934641, "grad_norm": 2.9930304926792317, "learning_rate": 3.290215050232744e-06, "loss": 0.1222, "step": 6671 }, { "epoch": 6.2296918767507, "grad_norm": 0.34932689871818545, "learning_rate": 3.288794160192349e-06, "loss": 0.0051, "step": 6672 }, { "epoch": 6.23062558356676, "grad_norm": 2.305462430604736, "learning_rate": 3.287373426660519e-06, "loss": 0.0702, "step": 6673 }, { "epoch": 6.23155929038282, "grad_norm": 1.6128367411765294, "learning_rate": 3.2859528497671987e-06, "loss": 0.0219, "step": 6674 }, { "epoch": 6.23249299719888, "grad_norm": 0.5932074576872474, "learning_rate": 3.2845324296423143e-06, "loss": 0.0124, "step": 6675 }, { "epoch": 6.233426704014939, "grad_norm": 3.173701404090712, "learning_rate": 3.2831121664157782e-06, "loss": 0.1027, "step": 6676 }, { "epoch": 6.234360410830999, "grad_norm": 1.323858551644659, "learning_rate": 3.281692060217489e-06, "loss": 0.0374, "step": 6677 }, { "epoch": 6.235294117647059, "grad_norm": 0.3915025568788018, "learning_rate": 3.2802721111773313e-06, "loss": 0.0079, "step": 6678 }, { "epoch": 6.236227824463119, "grad_norm": 0.5210617701567133, "learning_rate": 3.278852319425176e-06, "loss": 0.0155, "step": 6679 }, { "epoch": 6.237161531279178, "grad_norm": 0.3300899172772193, "learning_rate": 3.2774326850908765e-06, "loss": 0.0085, "step": 6680 }, { "epoch": 6.238095238095238, "grad_norm": 3.641632308140405, "learning_rate": 3.2760132083042764e-06, "loss": 0.1388, "step": 6681 }, { "epoch": 6.239028944911298, "grad_norm": 0.4123764775141288, "learning_rate": 3.2745938891952006e-06, "loss": 0.0066, "step": 6682 }, { "epoch": 6.239962651727358, "grad_norm": 1.0194009831965491, "learning_rate": 3.273174727893463e-06, "loss": 0.0286, "step": 6683 }, { "epoch": 6.240896358543417, "grad_norm": 3.1956679802115553, "learning_rate": 3.271755724528859e-06, "loss": 0.0955, "step": 6684 }, { "epoch": 6.241830065359477, "grad_norm": 2.80603383255919, "learning_rate": 3.270336879231175e-06, "loss": 0.1088, "step": 6685 }, { "epoch": 6.242763772175537, "grad_norm": 0.7035722368233104, "learning_rate": 3.268918192130178e-06, "loss": 0.0135, "step": 6686 }, { "epoch": 6.243697478991597, "grad_norm": 0.8266438211361291, "learning_rate": 3.2674996633556228e-06, "loss": 0.0193, "step": 6687 }, { "epoch": 6.244631185807656, "grad_norm": 1.2713419154018983, "learning_rate": 3.266081293037251e-06, "loss": 0.0489, "step": 6688 }, { "epoch": 6.245564892623716, "grad_norm": 0.886006142506179, "learning_rate": 3.2646630813047862e-06, "loss": 0.03, "step": 6689 }, { "epoch": 6.246498599439776, "grad_norm": 2.449562621546385, "learning_rate": 3.2632450282879385e-06, "loss": 0.0962, "step": 6690 }, { "epoch": 6.247432306255836, "grad_norm": 3.044033810533948, "learning_rate": 3.261827134116409e-06, "loss": 0.1282, "step": 6691 }, { "epoch": 6.248366013071895, "grad_norm": 1.0759399444303637, "learning_rate": 3.2604093989198754e-06, "loss": 0.0178, "step": 6692 }, { "epoch": 6.249299719887955, "grad_norm": 2.8013405568547, "learning_rate": 3.258991822828007e-06, "loss": 0.0524, "step": 6693 }, { "epoch": 6.250233426704015, "grad_norm": 1.4340529851726576, "learning_rate": 3.257574405970456e-06, "loss": 0.0175, "step": 6694 }, { "epoch": 6.251167133520075, "grad_norm": 3.064532218268123, "learning_rate": 3.2561571484768595e-06, "loss": 0.1551, "step": 6695 }, { "epoch": 6.2521008403361344, "grad_norm": 2.0698146122281784, "learning_rate": 3.2547400504768446e-06, "loss": 0.0709, "step": 6696 }, { "epoch": 6.253034547152194, "grad_norm": 2.4649187577730562, "learning_rate": 3.253323112100018e-06, "loss": 0.0664, "step": 6697 }, { "epoch": 6.253968253968254, "grad_norm": 2.359863740759978, "learning_rate": 3.2519063334759755e-06, "loss": 0.106, "step": 6698 }, { "epoch": 6.254901960784314, "grad_norm": 4.797132481227892, "learning_rate": 3.250489714734294e-06, "loss": 0.0385, "step": 6699 }, { "epoch": 6.2558356676003735, "grad_norm": 3.575647330389156, "learning_rate": 3.2490732560045434e-06, "loss": 0.1271, "step": 6700 }, { "epoch": 6.256769374416433, "grad_norm": 0.6666212500598766, "learning_rate": 3.247656957416272e-06, "loss": 0.0196, "step": 6701 }, { "epoch": 6.257703081232493, "grad_norm": 1.9978870589076276, "learning_rate": 3.2462408190990168e-06, "loss": 0.088, "step": 6702 }, { "epoch": 6.258636788048553, "grad_norm": 3.42993261765866, "learning_rate": 3.244824841182296e-06, "loss": 0.0318, "step": 6703 }, { "epoch": 6.2595704948646125, "grad_norm": 2.617581156426263, "learning_rate": 3.243409023795621e-06, "loss": 0.1016, "step": 6704 }, { "epoch": 6.260504201680672, "grad_norm": 2.8155113068274606, "learning_rate": 3.2419933670684824e-06, "loss": 0.1011, "step": 6705 }, { "epoch": 6.261437908496732, "grad_norm": 3.0461370237918928, "learning_rate": 3.240577871130356e-06, "loss": 0.0948, "step": 6706 }, { "epoch": 6.262371615312792, "grad_norm": 0.1392311161491932, "learning_rate": 3.2391625361107033e-06, "loss": 0.0012, "step": 6707 }, { "epoch": 6.2633053221288515, "grad_norm": 0.7942092479766392, "learning_rate": 3.2377473621389765e-06, "loss": 0.0119, "step": 6708 }, { "epoch": 6.264239028944911, "grad_norm": 2.242544353257777, "learning_rate": 3.2363323493446062e-06, "loss": 0.1032, "step": 6709 }, { "epoch": 6.265172735760971, "grad_norm": 2.7015796254824798, "learning_rate": 3.234917497857012e-06, "loss": 0.084, "step": 6710 }, { "epoch": 6.266106442577031, "grad_norm": 0.8738363797094093, "learning_rate": 3.2335028078055963e-06, "loss": 0.0275, "step": 6711 }, { "epoch": 6.2670401493930905, "grad_norm": 0.6756197102469961, "learning_rate": 3.232088279319746e-06, "loss": 0.0109, "step": 6712 }, { "epoch": 6.26797385620915, "grad_norm": 1.2651010781710128, "learning_rate": 3.2306739125288413e-06, "loss": 0.0576, "step": 6713 }, { "epoch": 6.26890756302521, "grad_norm": 0.24865667576149533, "learning_rate": 3.2292597075622365e-06, "loss": 0.0031, "step": 6714 }, { "epoch": 6.26984126984127, "grad_norm": 0.6955251689143402, "learning_rate": 3.2278456645492775e-06, "loss": 0.0252, "step": 6715 }, { "epoch": 6.2707749766573295, "grad_norm": 1.8774534069865134, "learning_rate": 3.2264317836192945e-06, "loss": 0.0595, "step": 6716 }, { "epoch": 6.271708683473389, "grad_norm": 0.8005597551196574, "learning_rate": 3.2250180649016025e-06, "loss": 0.0106, "step": 6717 }, { "epoch": 6.272642390289449, "grad_norm": 1.0719097100996897, "learning_rate": 3.2236045085255024e-06, "loss": 0.0339, "step": 6718 }, { "epoch": 6.273576097105509, "grad_norm": 1.0729274263385935, "learning_rate": 3.2221911146202767e-06, "loss": 0.0512, "step": 6719 }, { "epoch": 6.2745098039215685, "grad_norm": 1.966245117582756, "learning_rate": 3.220777883315198e-06, "loss": 0.0767, "step": 6720 }, { "epoch": 6.275443510737628, "grad_norm": 2.4645190887027946, "learning_rate": 3.2193648147395225e-06, "loss": 0.072, "step": 6721 }, { "epoch": 6.276377217553688, "grad_norm": 0.9258223694426142, "learning_rate": 3.217951909022488e-06, "loss": 0.0359, "step": 6722 }, { "epoch": 6.277310924369748, "grad_norm": 0.7062390601787494, "learning_rate": 3.2165391662933233e-06, "loss": 0.0215, "step": 6723 }, { "epoch": 6.278244631185808, "grad_norm": 1.8019452232751372, "learning_rate": 3.215126586681237e-06, "loss": 0.0883, "step": 6724 }, { "epoch": 6.279178338001867, "grad_norm": 1.196252340789547, "learning_rate": 3.2137141703154255e-06, "loss": 0.0158, "step": 6725 }, { "epoch": 6.280112044817927, "grad_norm": 2.6837602596306764, "learning_rate": 3.2123019173250716e-06, "loss": 0.0596, "step": 6726 }, { "epoch": 6.281045751633987, "grad_norm": 1.4995203306300497, "learning_rate": 3.21088982783934e-06, "loss": 0.066, "step": 6727 }, { "epoch": 6.281979458450047, "grad_norm": 1.546059095077996, "learning_rate": 3.2094779019873814e-06, "loss": 0.0306, "step": 6728 }, { "epoch": 6.282913165266106, "grad_norm": 1.6987721679185621, "learning_rate": 3.20806613989833e-06, "loss": 0.0782, "step": 6729 }, { "epoch": 6.283846872082166, "grad_norm": 2.9442942855181244, "learning_rate": 3.206654541701312e-06, "loss": 0.0829, "step": 6730 }, { "epoch": 6.284780578898226, "grad_norm": 2.4543025874160276, "learning_rate": 3.20524310752543e-06, "loss": 0.1518, "step": 6731 }, { "epoch": 6.285714285714286, "grad_norm": 0.4220452918579268, "learning_rate": 3.2038318374997756e-06, "loss": 0.0123, "step": 6732 }, { "epoch": 6.286647992530345, "grad_norm": 3.2869085642110694, "learning_rate": 3.202420731753424e-06, "loss": 0.1332, "step": 6733 }, { "epoch": 6.287581699346405, "grad_norm": 1.456905380293212, "learning_rate": 3.2010097904154392e-06, "loss": 0.0412, "step": 6734 }, { "epoch": 6.288515406162465, "grad_norm": 2.433976817322588, "learning_rate": 3.199599013614866e-06, "loss": 0.0911, "step": 6735 }, { "epoch": 6.289449112978525, "grad_norm": 0.7481715725518201, "learning_rate": 3.198188401480734e-06, "loss": 0.0269, "step": 6736 }, { "epoch": 6.290382819794584, "grad_norm": 1.803546921492555, "learning_rate": 3.1967779541420596e-06, "loss": 0.0932, "step": 6737 }, { "epoch": 6.291316526610644, "grad_norm": 5.058538321376444, "learning_rate": 3.1953676717278453e-06, "loss": 0.2926, "step": 6738 }, { "epoch": 6.292250233426704, "grad_norm": 0.42411379624132906, "learning_rate": 3.1939575543670764e-06, "loss": 0.0077, "step": 6739 }, { "epoch": 6.293183940242764, "grad_norm": 1.9146283532721442, "learning_rate": 3.1925476021887236e-06, "loss": 0.0768, "step": 6740 }, { "epoch": 6.294117647058823, "grad_norm": 0.1774548866495474, "learning_rate": 3.1911378153217394e-06, "loss": 0.0018, "step": 6741 }, { "epoch": 6.295051353874883, "grad_norm": 0.8316640326213603, "learning_rate": 3.1897281938950693e-06, "loss": 0.0274, "step": 6742 }, { "epoch": 6.295985060690943, "grad_norm": 0.5041427614691568, "learning_rate": 3.188318738037636e-06, "loss": 0.0213, "step": 6743 }, { "epoch": 6.296918767507003, "grad_norm": 1.4192368461110898, "learning_rate": 3.1869094478783503e-06, "loss": 0.0629, "step": 6744 }, { "epoch": 6.297852474323062, "grad_norm": 2.098457738502568, "learning_rate": 3.185500323546107e-06, "loss": 0.0723, "step": 6745 }, { "epoch": 6.298786181139122, "grad_norm": 0.698014606506583, "learning_rate": 3.184091365169784e-06, "loss": 0.0409, "step": 6746 }, { "epoch": 6.299719887955182, "grad_norm": 0.145462144083267, "learning_rate": 3.1826825728782503e-06, "loss": 0.0017, "step": 6747 }, { "epoch": 6.300653594771242, "grad_norm": 4.376458407197414, "learning_rate": 3.181273946800353e-06, "loss": 0.1831, "step": 6748 }, { "epoch": 6.301587301587301, "grad_norm": 0.34295884428376283, "learning_rate": 3.179865487064926e-06, "loss": 0.0075, "step": 6749 }, { "epoch": 6.302521008403361, "grad_norm": 3.614542027507508, "learning_rate": 3.1784571938007873e-06, "loss": 0.1565, "step": 6750 }, { "epoch": 6.303454715219421, "grad_norm": 0.5144906614519239, "learning_rate": 3.177049067136745e-06, "loss": 0.0205, "step": 6751 }, { "epoch": 6.304388422035481, "grad_norm": 0.2557433789101999, "learning_rate": 3.175641107201585e-06, "loss": 0.0113, "step": 6752 }, { "epoch": 6.30532212885154, "grad_norm": 0.8218921628527047, "learning_rate": 3.1742333141240793e-06, "loss": 0.0244, "step": 6753 }, { "epoch": 6.3062558356676, "grad_norm": 3.5211099884647474, "learning_rate": 3.1728256880329885e-06, "loss": 0.1232, "step": 6754 }, { "epoch": 6.30718954248366, "grad_norm": 1.7451110503972245, "learning_rate": 3.171418229057055e-06, "loss": 0.095, "step": 6755 }, { "epoch": 6.30812324929972, "grad_norm": 2.797809742253855, "learning_rate": 3.1700109373250064e-06, "loss": 0.1169, "step": 6756 }, { "epoch": 6.309056956115779, "grad_norm": 1.9447084871161064, "learning_rate": 3.1686038129655527e-06, "loss": 0.074, "step": 6757 }, { "epoch": 6.309990662931839, "grad_norm": 2.1166889197101764, "learning_rate": 3.1671968561073953e-06, "loss": 0.0956, "step": 6758 }, { "epoch": 6.310924369747899, "grad_norm": 1.709760841198565, "learning_rate": 3.165790066879211e-06, "loss": 0.0486, "step": 6759 }, { "epoch": 6.311858076563959, "grad_norm": 1.658599619273522, "learning_rate": 3.164383445409669e-06, "loss": 0.1029, "step": 6760 }, { "epoch": 6.3127917833800185, "grad_norm": 1.4891943010633406, "learning_rate": 3.1629769918274204e-06, "loss": 0.0529, "step": 6761 }, { "epoch": 6.313725490196078, "grad_norm": 1.2887814184382829, "learning_rate": 3.1615707062611e-06, "loss": 0.0699, "step": 6762 }, { "epoch": 6.314659197012138, "grad_norm": 0.5027653969434557, "learning_rate": 3.160164588839328e-06, "loss": 0.0037, "step": 6763 }, { "epoch": 6.315592903828198, "grad_norm": 1.126666717478127, "learning_rate": 3.158758639690709e-06, "loss": 0.0483, "step": 6764 }, { "epoch": 6.3165266106442575, "grad_norm": 2.82621342910238, "learning_rate": 3.157352858943834e-06, "loss": 0.1096, "step": 6765 }, { "epoch": 6.317460317460317, "grad_norm": 1.4371641824176375, "learning_rate": 3.1559472467272763e-06, "loss": 0.0601, "step": 6766 }, { "epoch": 6.318394024276377, "grad_norm": 1.0387614383607442, "learning_rate": 3.154541803169592e-06, "loss": 0.0244, "step": 6767 }, { "epoch": 6.319327731092437, "grad_norm": 2.33434841311242, "learning_rate": 3.153136528399329e-06, "loss": 0.1006, "step": 6768 }, { "epoch": 6.3202614379084965, "grad_norm": 0.2138973336434772, "learning_rate": 3.1517314225450124e-06, "loss": 0.0014, "step": 6769 }, { "epoch": 6.321195144724556, "grad_norm": 1.4371630626765062, "learning_rate": 3.1503264857351547e-06, "loss": 0.0717, "step": 6770 }, { "epoch": 6.322128851540616, "grad_norm": 0.46309386652899104, "learning_rate": 3.148921718098252e-06, "loss": 0.0187, "step": 6771 }, { "epoch": 6.323062558356676, "grad_norm": 2.0157345282899657, "learning_rate": 3.1475171197627878e-06, "loss": 0.086, "step": 6772 }, { "epoch": 6.3239962651727355, "grad_norm": 0.5714321046414068, "learning_rate": 3.1461126908572277e-06, "loss": 0.0109, "step": 6773 }, { "epoch": 6.324929971988795, "grad_norm": 1.4062088550816079, "learning_rate": 3.1447084315100213e-06, "loss": 0.034, "step": 6774 }, { "epoch": 6.325863678804855, "grad_norm": 2.1592108862007096, "learning_rate": 3.1433043418496033e-06, "loss": 0.0811, "step": 6775 }, { "epoch": 6.326797385620915, "grad_norm": 1.131689843512844, "learning_rate": 3.141900422004392e-06, "loss": 0.0333, "step": 6776 }, { "epoch": 6.3277310924369745, "grad_norm": 2.7977519200022103, "learning_rate": 3.140496672102795e-06, "loss": 0.1162, "step": 6777 }, { "epoch": 6.328664799253034, "grad_norm": 0.3294407967153116, "learning_rate": 3.139093092273198e-06, "loss": 0.0143, "step": 6778 }, { "epoch": 6.329598506069094, "grad_norm": 0.24269600100111355, "learning_rate": 3.137689682643974e-06, "loss": 0.0052, "step": 6779 }, { "epoch": 6.330532212885154, "grad_norm": 1.4096435218970058, "learning_rate": 3.136286443343479e-06, "loss": 0.0515, "step": 6780 }, { "epoch": 6.3314659197012135, "grad_norm": 1.7888297707854963, "learning_rate": 3.1348833745000585e-06, "loss": 0.0736, "step": 6781 }, { "epoch": 6.332399626517273, "grad_norm": 2.252843320543947, "learning_rate": 3.1334804762420355e-06, "loss": 0.1277, "step": 6782 }, { "epoch": 6.333333333333333, "grad_norm": 0.39345889689699454, "learning_rate": 3.1320777486977216e-06, "loss": 0.0129, "step": 6783 }, { "epoch": 6.334267040149393, "grad_norm": 5.139248548197447, "learning_rate": 3.130675191995408e-06, "loss": 0.1609, "step": 6784 }, { "epoch": 6.3352007469654525, "grad_norm": 0.7569752854189307, "learning_rate": 3.1292728062633803e-06, "loss": 0.0223, "step": 6785 }, { "epoch": 6.336134453781512, "grad_norm": 4.842580235235151, "learning_rate": 3.127870591629899e-06, "loss": 0.1909, "step": 6786 }, { "epoch": 6.337068160597572, "grad_norm": 6.390569858751554, "learning_rate": 3.126468548223211e-06, "loss": 0.0771, "step": 6787 }, { "epoch": 6.338001867413632, "grad_norm": 1.509475220504989, "learning_rate": 3.1250666761715477e-06, "loss": 0.0779, "step": 6788 }, { "epoch": 6.338935574229692, "grad_norm": 0.18331068385885796, "learning_rate": 3.12366497560313e-06, "loss": 0.0021, "step": 6789 }, { "epoch": 6.339869281045751, "grad_norm": 1.6989195158120836, "learning_rate": 3.1222634466461556e-06, "loss": 0.0832, "step": 6790 }, { "epoch": 6.340802987861811, "grad_norm": 0.2718854557234132, "learning_rate": 3.1208620894288105e-06, "loss": 0.0066, "step": 6791 }, { "epoch": 6.341736694677871, "grad_norm": 1.1205849308589606, "learning_rate": 3.1194609040792632e-06, "loss": 0.0319, "step": 6792 }, { "epoch": 6.342670401493931, "grad_norm": 1.6378372939835393, "learning_rate": 3.1180598907256686e-06, "loss": 0.0527, "step": 6793 }, { "epoch": 6.34360410830999, "grad_norm": 3.099194966681986, "learning_rate": 3.116659049496165e-06, "loss": 0.1816, "step": 6794 }, { "epoch": 6.34453781512605, "grad_norm": 1.846294154934893, "learning_rate": 3.1152583805188736e-06, "loss": 0.1003, "step": 6795 }, { "epoch": 6.34547152194211, "grad_norm": 2.506568055202804, "learning_rate": 3.1138578839219024e-06, "loss": 0.0465, "step": 6796 }, { "epoch": 6.34640522875817, "grad_norm": 1.063410938487141, "learning_rate": 3.112457559833341e-06, "loss": 0.0118, "step": 6797 }, { "epoch": 6.347338935574229, "grad_norm": 1.4063480567681677, "learning_rate": 3.111057408381265e-06, "loss": 0.0161, "step": 6798 }, { "epoch": 6.348272642390289, "grad_norm": 2.4404992086428923, "learning_rate": 3.109657429693732e-06, "loss": 0.0744, "step": 6799 }, { "epoch": 6.349206349206349, "grad_norm": 0.7646795137652773, "learning_rate": 3.1082576238987893e-06, "loss": 0.0196, "step": 6800 }, { "epoch": 6.350140056022409, "grad_norm": 3.464295225676189, "learning_rate": 3.10685799112446e-06, "loss": 0.17, "step": 6801 }, { "epoch": 6.351073762838468, "grad_norm": 1.691882311962748, "learning_rate": 3.1054585314987586e-06, "loss": 0.0508, "step": 6802 }, { "epoch": 6.352007469654528, "grad_norm": 2.529657265852162, "learning_rate": 3.104059245149681e-06, "loss": 0.0797, "step": 6803 }, { "epoch": 6.352941176470588, "grad_norm": 3.245603154957517, "learning_rate": 3.102660132205206e-06, "loss": 0.1169, "step": 6804 }, { "epoch": 6.353874883286648, "grad_norm": 2.1490823781877513, "learning_rate": 3.1012611927932977e-06, "loss": 0.0827, "step": 6805 }, { "epoch": 6.354808590102707, "grad_norm": 1.4844071649521435, "learning_rate": 3.0998624270419077e-06, "loss": 0.0553, "step": 6806 }, { "epoch": 6.355742296918767, "grad_norm": 2.236978224164135, "learning_rate": 3.0984638350789653e-06, "loss": 0.0698, "step": 6807 }, { "epoch": 6.356676003734827, "grad_norm": 0.9261639466192195, "learning_rate": 3.0970654170323878e-06, "loss": 0.035, "step": 6808 }, { "epoch": 6.357609710550887, "grad_norm": 3.367455691223204, "learning_rate": 3.0956671730300765e-06, "loss": 0.0953, "step": 6809 }, { "epoch": 6.358543417366946, "grad_norm": 1.2840778140860563, "learning_rate": 3.094269103199913e-06, "loss": 0.0569, "step": 6810 }, { "epoch": 6.359477124183006, "grad_norm": 1.5775064433665709, "learning_rate": 3.0928712076697713e-06, "loss": 0.0739, "step": 6811 }, { "epoch": 6.360410830999066, "grad_norm": 1.3290280115408102, "learning_rate": 3.091473486567502e-06, "loss": 0.0446, "step": 6812 }, { "epoch": 6.361344537815126, "grad_norm": 6.96537190470956, "learning_rate": 3.0900759400209414e-06, "loss": 0.2081, "step": 6813 }, { "epoch": 6.362278244631185, "grad_norm": 4.484578535340101, "learning_rate": 3.088678568157909e-06, "loss": 0.1013, "step": 6814 }, { "epoch": 6.363211951447245, "grad_norm": 1.44215828056019, "learning_rate": 3.0872813711062136e-06, "loss": 0.0522, "step": 6815 }, { "epoch": 6.364145658263305, "grad_norm": 0.7752528477347843, "learning_rate": 3.085884348993643e-06, "loss": 0.0142, "step": 6816 }, { "epoch": 6.365079365079365, "grad_norm": 2.0474757914158923, "learning_rate": 3.0844875019479694e-06, "loss": 0.0916, "step": 6817 }, { "epoch": 6.366013071895424, "grad_norm": 1.7846463206203325, "learning_rate": 3.0830908300969477e-06, "loss": 0.0472, "step": 6818 }, { "epoch": 6.366946778711484, "grad_norm": 2.0282005682265805, "learning_rate": 3.081694333568323e-06, "loss": 0.0483, "step": 6819 }, { "epoch": 6.367880485527545, "grad_norm": 2.6051421898647535, "learning_rate": 3.0802980124898184e-06, "loss": 0.1376, "step": 6820 }, { "epoch": 6.368814192343605, "grad_norm": 2.919413956842892, "learning_rate": 3.078901866989143e-06, "loss": 0.1462, "step": 6821 }, { "epoch": 6.369747899159664, "grad_norm": 3.2576289345843907, "learning_rate": 3.0775058971939874e-06, "loss": 0.1569, "step": 6822 }, { "epoch": 6.370681605975724, "grad_norm": 1.2429254971493726, "learning_rate": 3.0761101032320324e-06, "loss": 0.0381, "step": 6823 }, { "epoch": 6.371615312791784, "grad_norm": 1.8728759966692226, "learning_rate": 3.0747144852309362e-06, "loss": 0.0926, "step": 6824 }, { "epoch": 6.372549019607844, "grad_norm": 5.701165854325084, "learning_rate": 3.0733190433183446e-06, "loss": 0.1642, "step": 6825 }, { "epoch": 6.373482726423903, "grad_norm": 1.4564079223734925, "learning_rate": 3.071923777621885e-06, "loss": 0.0269, "step": 6826 }, { "epoch": 6.374416433239963, "grad_norm": 1.0698669259700924, "learning_rate": 3.070528688269169e-06, "loss": 0.0309, "step": 6827 }, { "epoch": 6.375350140056023, "grad_norm": 2.0802013617227093, "learning_rate": 3.0691337753877948e-06, "loss": 0.0961, "step": 6828 }, { "epoch": 6.376283846872083, "grad_norm": 0.8877002964268019, "learning_rate": 3.067739039105342e-06, "loss": 0.036, "step": 6829 }, { "epoch": 6.377217553688142, "grad_norm": 2.6160240180288956, "learning_rate": 3.0663444795493736e-06, "loss": 0.0594, "step": 6830 }, { "epoch": 6.378151260504202, "grad_norm": 0.46841079107542877, "learning_rate": 3.064950096847437e-06, "loss": 0.0065, "step": 6831 }, { "epoch": 6.379084967320262, "grad_norm": 6.888133359016957, "learning_rate": 3.0635558911270668e-06, "loss": 0.0895, "step": 6832 }, { "epoch": 6.380018674136322, "grad_norm": 3.7830046909490838, "learning_rate": 3.062161862515775e-06, "loss": 0.0557, "step": 6833 }, { "epoch": 6.380952380952381, "grad_norm": 0.4197155215296183, "learning_rate": 3.0607680111410637e-06, "loss": 0.0109, "step": 6834 }, { "epoch": 6.381886087768441, "grad_norm": 2.106390877164942, "learning_rate": 3.059374337130413e-06, "loss": 0.0917, "step": 6835 }, { "epoch": 6.382819794584501, "grad_norm": 1.3303837573005173, "learning_rate": 3.057980840611293e-06, "loss": 0.0461, "step": 6836 }, { "epoch": 6.383753501400561, "grad_norm": 0.5430621509163392, "learning_rate": 3.0565875217111507e-06, "loss": 0.0221, "step": 6837 }, { "epoch": 6.38468720821662, "grad_norm": 2.0363062965518073, "learning_rate": 3.055194380557423e-06, "loss": 0.0843, "step": 6838 }, { "epoch": 6.38562091503268, "grad_norm": 4.35591036082471, "learning_rate": 3.0538014172775264e-06, "loss": 0.0627, "step": 6839 }, { "epoch": 6.38655462184874, "grad_norm": 2.631296311180813, "learning_rate": 3.0524086319988635e-06, "loss": 0.1294, "step": 6840 }, { "epoch": 6.3874883286648, "grad_norm": 0.4663756293405134, "learning_rate": 3.051016024848821e-06, "loss": 0.0144, "step": 6841 }, { "epoch": 6.388422035480859, "grad_norm": 1.9099775713337395, "learning_rate": 3.049623595954766e-06, "loss": 0.0684, "step": 6842 }, { "epoch": 6.389355742296919, "grad_norm": 3.5537469535148225, "learning_rate": 3.0482313454440528e-06, "loss": 0.1479, "step": 6843 }, { "epoch": 6.390289449112979, "grad_norm": 1.7742205442850838, "learning_rate": 3.0468392734440154e-06, "loss": 0.0301, "step": 6844 }, { "epoch": 6.391223155929039, "grad_norm": 0.30689422556121143, "learning_rate": 3.0454473800819784e-06, "loss": 0.0065, "step": 6845 }, { "epoch": 6.392156862745098, "grad_norm": 2.6146646184650293, "learning_rate": 3.0440556654852425e-06, "loss": 0.1056, "step": 6846 }, { "epoch": 6.393090569561158, "grad_norm": 2.461041150917357, "learning_rate": 3.0426641297810977e-06, "loss": 0.0689, "step": 6847 }, { "epoch": 6.394024276377218, "grad_norm": 1.194148425028642, "learning_rate": 3.0412727730968106e-06, "loss": 0.034, "step": 6848 }, { "epoch": 6.394957983193278, "grad_norm": 1.2086931614301715, "learning_rate": 3.0398815955596416e-06, "loss": 0.0067, "step": 6849 }, { "epoch": 6.395891690009337, "grad_norm": 0.3963515412721314, "learning_rate": 3.038490597296827e-06, "loss": 0.0021, "step": 6850 }, { "epoch": 6.396825396825397, "grad_norm": 1.027574769450663, "learning_rate": 3.0370997784355883e-06, "loss": 0.0464, "step": 6851 }, { "epoch": 6.397759103641457, "grad_norm": 1.0263327229087147, "learning_rate": 3.035709139103129e-06, "loss": 0.0419, "step": 6852 }, { "epoch": 6.398692810457517, "grad_norm": 1.4639200718889798, "learning_rate": 3.0343186794266433e-06, "loss": 0.0537, "step": 6853 }, { "epoch": 6.3996265172735765, "grad_norm": 1.737413521536219, "learning_rate": 3.032928399533302e-06, "loss": 0.0814, "step": 6854 }, { "epoch": 6.400560224089636, "grad_norm": 1.714444738998663, "learning_rate": 3.031538299550261e-06, "loss": 0.0174, "step": 6855 }, { "epoch": 6.401493930905696, "grad_norm": 1.5752273838940667, "learning_rate": 3.0301483796046573e-06, "loss": 0.082, "step": 6856 }, { "epoch": 6.402427637721756, "grad_norm": 2.250908925280016, "learning_rate": 3.0287586398236202e-06, "loss": 0.0986, "step": 6857 }, { "epoch": 6.4033613445378155, "grad_norm": 1.2635934997963416, "learning_rate": 3.0273690803342533e-06, "loss": 0.0371, "step": 6858 }, { "epoch": 6.404295051353875, "grad_norm": 1.152469348573436, "learning_rate": 3.0259797012636473e-06, "loss": 0.0468, "step": 6859 }, { "epoch": 6.405228758169935, "grad_norm": 3.4928435945873315, "learning_rate": 3.024590502738877e-06, "loss": 0.1858, "step": 6860 }, { "epoch": 6.406162464985995, "grad_norm": 1.4214491764887698, "learning_rate": 3.023201484886996e-06, "loss": 0.038, "step": 6861 }, { "epoch": 6.4070961718020545, "grad_norm": 1.3441147360051822, "learning_rate": 3.021812647835052e-06, "loss": 0.0407, "step": 6862 }, { "epoch": 6.408029878618114, "grad_norm": 1.7386252214101927, "learning_rate": 3.0204239917100654e-06, "loss": 0.0851, "step": 6863 }, { "epoch": 6.408963585434174, "grad_norm": 0.30746241044477807, "learning_rate": 3.019035516639044e-06, "loss": 0.0105, "step": 6864 }, { "epoch": 6.409897292250234, "grad_norm": 1.3080477789151015, "learning_rate": 3.017647222748978e-06, "loss": 0.0586, "step": 6865 }, { "epoch": 6.4108309990662935, "grad_norm": 1.1601314519711854, "learning_rate": 3.0162591101668455e-06, "loss": 0.0456, "step": 6866 }, { "epoch": 6.411764705882353, "grad_norm": 0.9466692307526582, "learning_rate": 3.0148711790196028e-06, "loss": 0.0275, "step": 6867 }, { "epoch": 6.412698412698413, "grad_norm": 1.3685999882813609, "learning_rate": 3.0134834294341907e-06, "loss": 0.0356, "step": 6868 }, { "epoch": 6.413632119514473, "grad_norm": 4.634296947981423, "learning_rate": 3.012095861537535e-06, "loss": 0.0386, "step": 6869 }, { "epoch": 6.4145658263305325, "grad_norm": 2.502494139032272, "learning_rate": 3.010708475456545e-06, "loss": 0.1414, "step": 6870 }, { "epoch": 6.415499533146592, "grad_norm": 0.2965174559709312, "learning_rate": 3.009321271318112e-06, "loss": 0.005, "step": 6871 }, { "epoch": 6.416433239962652, "grad_norm": 0.457599654723458, "learning_rate": 3.0079342492491087e-06, "loss": 0.0073, "step": 6872 }, { "epoch": 6.417366946778712, "grad_norm": 1.4835400926983606, "learning_rate": 3.006547409376395e-06, "loss": 0.0484, "step": 6873 }, { "epoch": 6.4183006535947715, "grad_norm": 0.3897185474629027, "learning_rate": 3.0051607518268143e-06, "loss": 0.0031, "step": 6874 }, { "epoch": 6.419234360410831, "grad_norm": 0.6094711345995231, "learning_rate": 3.0037742767271894e-06, "loss": 0.0078, "step": 6875 }, { "epoch": 6.420168067226891, "grad_norm": 1.859972024062094, "learning_rate": 3.00238798420433e-06, "loss": 0.031, "step": 6876 }, { "epoch": 6.421101774042951, "grad_norm": 0.7984709437965912, "learning_rate": 3.0010018743850277e-06, "loss": 0.0218, "step": 6877 }, { "epoch": 6.4220354808590105, "grad_norm": 0.5040120511744118, "learning_rate": 2.999615947396056e-06, "loss": 0.0087, "step": 6878 }, { "epoch": 6.42296918767507, "grad_norm": 1.3794375026771168, "learning_rate": 2.998230203364174e-06, "loss": 0.0715, "step": 6879 }, { "epoch": 6.42390289449113, "grad_norm": 0.4485452494183967, "learning_rate": 2.9968446424161246e-06, "loss": 0.0176, "step": 6880 }, { "epoch": 6.42483660130719, "grad_norm": 2.024988924743387, "learning_rate": 2.9954592646786318e-06, "loss": 0.0199, "step": 6881 }, { "epoch": 6.42577030812325, "grad_norm": 1.1632389934926035, "learning_rate": 2.9940740702784003e-06, "loss": 0.0306, "step": 6882 }, { "epoch": 6.426704014939309, "grad_norm": 1.320605021594333, "learning_rate": 2.9926890593421263e-06, "loss": 0.0298, "step": 6883 }, { "epoch": 6.427637721755369, "grad_norm": 3.111017563140031, "learning_rate": 2.991304231996483e-06, "loss": 0.0624, "step": 6884 }, { "epoch": 6.428571428571429, "grad_norm": 0.5049634559662439, "learning_rate": 2.9899195883681263e-06, "loss": 0.0115, "step": 6885 }, { "epoch": 6.429505135387489, "grad_norm": 1.6699065038065306, "learning_rate": 2.9885351285836966e-06, "loss": 0.0529, "step": 6886 }, { "epoch": 6.430438842203548, "grad_norm": 0.8006592497877542, "learning_rate": 2.9871508527698212e-06, "loss": 0.0195, "step": 6887 }, { "epoch": 6.431372549019608, "grad_norm": 1.404283536074159, "learning_rate": 2.985766761053106e-06, "loss": 0.0437, "step": 6888 }, { "epoch": 6.432306255835668, "grad_norm": 3.4471791756800254, "learning_rate": 2.98438285356014e-06, "loss": 0.103, "step": 6889 }, { "epoch": 6.433239962651728, "grad_norm": 1.4105877932839896, "learning_rate": 2.9829991304174975e-06, "loss": 0.0609, "step": 6890 }, { "epoch": 6.434173669467787, "grad_norm": 0.21123199097447032, "learning_rate": 2.9816155917517344e-06, "loss": 0.0062, "step": 6891 }, { "epoch": 6.435107376283847, "grad_norm": 1.9203355940161666, "learning_rate": 2.9802322376893923e-06, "loss": 0.0686, "step": 6892 }, { "epoch": 6.436041083099907, "grad_norm": 0.9508897155036862, "learning_rate": 2.9788490683569938e-06, "loss": 0.0375, "step": 6893 }, { "epoch": 6.436974789915967, "grad_norm": 2.992386808174623, "learning_rate": 2.9774660838810445e-06, "loss": 0.0562, "step": 6894 }, { "epoch": 6.437908496732026, "grad_norm": 3.6642572741874413, "learning_rate": 2.976083284388031e-06, "loss": 0.1872, "step": 6895 }, { "epoch": 6.438842203548086, "grad_norm": 2.051869435344987, "learning_rate": 2.97470067000443e-06, "loss": 0.0425, "step": 6896 }, { "epoch": 6.439775910364146, "grad_norm": 3.5120813956424253, "learning_rate": 2.9733182408566943e-06, "loss": 0.1137, "step": 6897 }, { "epoch": 6.440709617180206, "grad_norm": 2.2396498768252724, "learning_rate": 2.971935997071263e-06, "loss": 0.0847, "step": 6898 }, { "epoch": 6.441643323996265, "grad_norm": 2.1644632958512005, "learning_rate": 2.9705539387745553e-06, "loss": 0.0535, "step": 6899 }, { "epoch": 6.442577030812325, "grad_norm": 0.39931827424489635, "learning_rate": 2.9691720660929784e-06, "loss": 0.0062, "step": 6900 }, { "epoch": 6.443510737628385, "grad_norm": 1.084881697985663, "learning_rate": 2.9677903791529196e-06, "loss": 0.0213, "step": 6901 }, { "epoch": 6.444444444444445, "grad_norm": 0.9497360068931607, "learning_rate": 2.9664088780807475e-06, "loss": 0.0237, "step": 6902 }, { "epoch": 6.445378151260504, "grad_norm": 2.7814252770458907, "learning_rate": 2.965027563002815e-06, "loss": 0.1061, "step": 6903 }, { "epoch": 6.446311858076564, "grad_norm": 1.200245963575688, "learning_rate": 2.963646434045462e-06, "loss": 0.0407, "step": 6904 }, { "epoch": 6.447245564892624, "grad_norm": 4.243436538118973, "learning_rate": 2.9622654913350056e-06, "loss": 0.1551, "step": 6905 }, { "epoch": 6.448179271708684, "grad_norm": 3.0403497444338736, "learning_rate": 2.9608847349977487e-06, "loss": 0.1046, "step": 6906 }, { "epoch": 6.449112978524743, "grad_norm": 0.49294885754021456, "learning_rate": 2.9595041651599753e-06, "loss": 0.0078, "step": 6907 }, { "epoch": 6.450046685340803, "grad_norm": 0.9118118455684263, "learning_rate": 2.958123781947956e-06, "loss": 0.0374, "step": 6908 }, { "epoch": 6.450980392156863, "grad_norm": 0.6330516069239994, "learning_rate": 2.95674358548794e-06, "loss": 0.007, "step": 6909 }, { "epoch": 6.451914098972923, "grad_norm": 2.73632548197579, "learning_rate": 2.955363575906162e-06, "loss": 0.1389, "step": 6910 }, { "epoch": 6.452847805788982, "grad_norm": 3.187339306889911, "learning_rate": 2.953983753328841e-06, "loss": 0.1436, "step": 6911 }, { "epoch": 6.453781512605042, "grad_norm": 2.163017197225668, "learning_rate": 2.952604117882173e-06, "loss": 0.0714, "step": 6912 }, { "epoch": 6.454715219421102, "grad_norm": 0.565766339122994, "learning_rate": 2.951224669692343e-06, "loss": 0.0189, "step": 6913 }, { "epoch": 6.455648926237162, "grad_norm": 0.6965052064067737, "learning_rate": 2.9498454088855176e-06, "loss": 0.0147, "step": 6914 }, { "epoch": 6.456582633053221, "grad_norm": 0.8473056106392175, "learning_rate": 2.9484663355878453e-06, "loss": 0.0254, "step": 6915 }, { "epoch": 6.457516339869281, "grad_norm": 3.217392246157535, "learning_rate": 2.9470874499254553e-06, "loss": 0.0682, "step": 6916 }, { "epoch": 6.458450046685341, "grad_norm": 1.6244414757034789, "learning_rate": 2.945708752024463e-06, "loss": 0.0388, "step": 6917 }, { "epoch": 6.459383753501401, "grad_norm": 1.333212360625391, "learning_rate": 2.9443302420109667e-06, "loss": 0.051, "step": 6918 }, { "epoch": 6.4603174603174605, "grad_norm": 3.3870808864982265, "learning_rate": 2.942951920011045e-06, "loss": 0.1471, "step": 6919 }, { "epoch": 6.46125116713352, "grad_norm": 3.469192019712379, "learning_rate": 2.9415737861507586e-06, "loss": 0.1421, "step": 6920 }, { "epoch": 6.46218487394958, "grad_norm": 3.3539217070076743, "learning_rate": 2.940195840556158e-06, "loss": 0.1574, "step": 6921 }, { "epoch": 6.46311858076564, "grad_norm": 0.7720836684285841, "learning_rate": 2.9388180833532676e-06, "loss": 0.0185, "step": 6922 }, { "epoch": 6.4640522875816995, "grad_norm": 0.632841486466807, "learning_rate": 2.9374405146681004e-06, "loss": 0.0167, "step": 6923 }, { "epoch": 6.464985994397759, "grad_norm": 0.3089995630254576, "learning_rate": 2.936063134626649e-06, "loss": 0.0109, "step": 6924 }, { "epoch": 6.465919701213819, "grad_norm": 1.0111287406097378, "learning_rate": 2.934685943354888e-06, "loss": 0.018, "step": 6925 }, { "epoch": 6.466853408029879, "grad_norm": 0.8055999843726512, "learning_rate": 2.9333089409787806e-06, "loss": 0.0195, "step": 6926 }, { "epoch": 6.4677871148459385, "grad_norm": 3.482048489399065, "learning_rate": 2.9319321276242667e-06, "loss": 0.1221, "step": 6927 }, { "epoch": 6.468720821661998, "grad_norm": 8.894206010350898, "learning_rate": 2.9305555034172717e-06, "loss": 0.1941, "step": 6928 }, { "epoch": 6.469654528478058, "grad_norm": 4.230380410565331, "learning_rate": 2.929179068483701e-06, "loss": 0.2132, "step": 6929 }, { "epoch": 6.470588235294118, "grad_norm": 1.0760106073859654, "learning_rate": 2.927802822949448e-06, "loss": 0.025, "step": 6930 }, { "epoch": 6.4715219421101775, "grad_norm": 1.5705147971822424, "learning_rate": 2.926426766940384e-06, "loss": 0.0571, "step": 6931 }, { "epoch": 6.472455648926237, "grad_norm": 2.2707785558812006, "learning_rate": 2.9250509005823635e-06, "loss": 0.0554, "step": 6932 }, { "epoch": 6.473389355742297, "grad_norm": 2.0412309914245563, "learning_rate": 2.9236752240012244e-06, "loss": 0.0721, "step": 6933 }, { "epoch": 6.474323062558357, "grad_norm": 0.7105045568286171, "learning_rate": 2.9222997373227896e-06, "loss": 0.0102, "step": 6934 }, { "epoch": 6.4752567693744165, "grad_norm": 3.6122575358612257, "learning_rate": 2.920924440672861e-06, "loss": 0.1538, "step": 6935 }, { "epoch": 6.476190476190476, "grad_norm": 0.7765053832368662, "learning_rate": 2.9195493341772253e-06, "loss": 0.0114, "step": 6936 }, { "epoch": 6.477124183006536, "grad_norm": 1.1121085282198453, "learning_rate": 2.9181744179616493e-06, "loss": 0.0329, "step": 6937 }, { "epoch": 6.478057889822596, "grad_norm": 1.524411247853336, "learning_rate": 2.9167996921518848e-06, "loss": 0.0509, "step": 6938 }, { "epoch": 6.4789915966386555, "grad_norm": 0.7390048616273569, "learning_rate": 2.915425156873668e-06, "loss": 0.0175, "step": 6939 }, { "epoch": 6.479925303454715, "grad_norm": 1.1834787149819932, "learning_rate": 2.914050812252713e-06, "loss": 0.0414, "step": 6940 }, { "epoch": 6.480859010270775, "grad_norm": 2.777677329860453, "learning_rate": 2.91267665841472e-06, "loss": 0.1004, "step": 6941 }, { "epoch": 6.481792717086835, "grad_norm": 0.9629537205368822, "learning_rate": 2.9113026954853674e-06, "loss": 0.0379, "step": 6942 }, { "epoch": 6.4827264239028946, "grad_norm": 4.342512554862687, "learning_rate": 2.909928923590323e-06, "loss": 0.0669, "step": 6943 }, { "epoch": 6.483660130718954, "grad_norm": 3.7749511554949953, "learning_rate": 2.908555342855232e-06, "loss": 0.1639, "step": 6944 }, { "epoch": 6.484593837535014, "grad_norm": 1.5867354162055582, "learning_rate": 2.9071819534057232e-06, "loss": 0.0558, "step": 6945 }, { "epoch": 6.485527544351074, "grad_norm": 0.9539402930388583, "learning_rate": 2.9058087553674063e-06, "loss": 0.031, "step": 6946 }, { "epoch": 6.486461251167134, "grad_norm": 2.196930323628456, "learning_rate": 2.904435748865879e-06, "loss": 0.0846, "step": 6947 }, { "epoch": 6.487394957983193, "grad_norm": 3.140084905688365, "learning_rate": 2.9030629340267165e-06, "loss": 0.1804, "step": 6948 }, { "epoch": 6.488328664799253, "grad_norm": 0.9043793524130607, "learning_rate": 2.901690310975477e-06, "loss": 0.0362, "step": 6949 }, { "epoch": 6.489262371615313, "grad_norm": 2.7692709444628836, "learning_rate": 2.9003178798376997e-06, "loss": 0.1623, "step": 6950 }, { "epoch": 6.490196078431373, "grad_norm": 1.0139916599145977, "learning_rate": 2.8989456407389137e-06, "loss": 0.0058, "step": 6951 }, { "epoch": 6.491129785247432, "grad_norm": 0.5688050290305486, "learning_rate": 2.8975735938046223e-06, "loss": 0.0214, "step": 6952 }, { "epoch": 6.492063492063492, "grad_norm": 1.6438757742628765, "learning_rate": 2.8962017391603153e-06, "loss": 0.0565, "step": 6953 }, { "epoch": 6.492997198879552, "grad_norm": 0.7087212976597977, "learning_rate": 2.8948300769314606e-06, "loss": 0.0224, "step": 6954 }, { "epoch": 6.493930905695612, "grad_norm": 1.190437196365561, "learning_rate": 2.8934586072435167e-06, "loss": 0.0459, "step": 6955 }, { "epoch": 6.494864612511671, "grad_norm": 1.2331001785611089, "learning_rate": 2.8920873302219178e-06, "loss": 0.0237, "step": 6956 }, { "epoch": 6.495798319327731, "grad_norm": 2.7180415155528905, "learning_rate": 2.890716245992081e-06, "loss": 0.1375, "step": 6957 }, { "epoch": 6.496732026143791, "grad_norm": 1.203961692245124, "learning_rate": 2.889345354679408e-06, "loss": 0.0262, "step": 6958 }, { "epoch": 6.497665732959851, "grad_norm": 1.9104628918914746, "learning_rate": 2.8879746564092792e-06, "loss": 0.0808, "step": 6959 }, { "epoch": 6.49859943977591, "grad_norm": 0.9452076753834041, "learning_rate": 2.8866041513070646e-06, "loss": 0.0268, "step": 6960 }, { "epoch": 6.49953314659197, "grad_norm": 1.5213767776561935, "learning_rate": 2.88523383949811e-06, "loss": 0.0551, "step": 6961 }, { "epoch": 6.50046685340803, "grad_norm": 0.6555170492477755, "learning_rate": 2.8838637211077447e-06, "loss": 0.0122, "step": 6962 }, { "epoch": 6.50140056022409, "grad_norm": 1.6059171685978288, "learning_rate": 2.8824937962612803e-06, "loss": 0.0508, "step": 6963 }, { "epoch": 6.502334267040149, "grad_norm": 0.9370531235326128, "learning_rate": 2.8811240650840144e-06, "loss": 0.028, "step": 6964 }, { "epoch": 6.503267973856209, "grad_norm": 1.145292762204203, "learning_rate": 2.8797545277012224e-06, "loss": 0.0424, "step": 6965 }, { "epoch": 6.504201680672269, "grad_norm": 1.0880012871828566, "learning_rate": 2.878385184238163e-06, "loss": 0.0427, "step": 6966 }, { "epoch": 6.505135387488329, "grad_norm": 0.4972722056260884, "learning_rate": 2.8770160348200766e-06, "loss": 0.0082, "step": 6967 }, { "epoch": 6.506069094304388, "grad_norm": 1.0200868685914484, "learning_rate": 2.87564707957219e-06, "loss": 0.0342, "step": 6968 }, { "epoch": 6.507002801120448, "grad_norm": 3.653236775000523, "learning_rate": 2.874278318619709e-06, "loss": 0.1908, "step": 6969 }, { "epoch": 6.507936507936508, "grad_norm": 0.6242888633996156, "learning_rate": 2.872909752087819e-06, "loss": 0.019, "step": 6970 }, { "epoch": 6.508870214752568, "grad_norm": 4.598471140071483, "learning_rate": 2.8715413801016907e-06, "loss": 0.1675, "step": 6971 }, { "epoch": 6.509803921568627, "grad_norm": 2.270803680862769, "learning_rate": 2.8701732027864795e-06, "loss": 0.0618, "step": 6972 }, { "epoch": 6.510737628384687, "grad_norm": 4.215160100286391, "learning_rate": 2.8688052202673188e-06, "loss": 0.0467, "step": 6973 }, { "epoch": 6.511671335200747, "grad_norm": 1.2844705494825255, "learning_rate": 2.867437432669326e-06, "loss": 0.053, "step": 6974 }, { "epoch": 6.512605042016807, "grad_norm": 1.2512980017398032, "learning_rate": 2.8660698401176e-06, "loss": 0.0585, "step": 6975 }, { "epoch": 6.513538748832866, "grad_norm": 1.319101941491298, "learning_rate": 2.8647024427372194e-06, "loss": 0.0409, "step": 6976 }, { "epoch": 6.514472455648926, "grad_norm": 2.796453810744441, "learning_rate": 2.8633352406532527e-06, "loss": 0.0786, "step": 6977 }, { "epoch": 6.515406162464986, "grad_norm": 0.8778220677134739, "learning_rate": 2.861968233990743e-06, "loss": 0.0315, "step": 6978 }, { "epoch": 6.516339869281046, "grad_norm": 0.9106981938202476, "learning_rate": 2.8606014228747183e-06, "loss": 0.038, "step": 6979 }, { "epoch": 6.5172735760971054, "grad_norm": 2.4146177541400524, "learning_rate": 2.8592348074301863e-06, "loss": 0.1301, "step": 6980 }, { "epoch": 6.518207282913165, "grad_norm": 1.7201564812080692, "learning_rate": 2.8578683877821434e-06, "loss": 0.0509, "step": 6981 }, { "epoch": 6.519140989729225, "grad_norm": 1.9045425183305025, "learning_rate": 2.8565021640555614e-06, "loss": 0.0847, "step": 6982 }, { "epoch": 6.520074696545285, "grad_norm": 1.2481663826156475, "learning_rate": 2.855136136375396e-06, "loss": 0.0824, "step": 6983 }, { "epoch": 6.5210084033613445, "grad_norm": 0.6988831849627909, "learning_rate": 2.8537703048665845e-06, "loss": 0.0178, "step": 6984 }, { "epoch": 6.521942110177404, "grad_norm": 0.30147227492696915, "learning_rate": 2.8524046696540495e-06, "loss": 0.01, "step": 6985 }, { "epoch": 6.522875816993464, "grad_norm": 2.276218985952373, "learning_rate": 2.8510392308626934e-06, "loss": 0.0397, "step": 6986 }, { "epoch": 6.523809523809524, "grad_norm": 1.1303885454284746, "learning_rate": 2.8496739886173994e-06, "loss": 0.0322, "step": 6987 }, { "epoch": 6.5247432306255835, "grad_norm": 1.2848295176037376, "learning_rate": 2.848308943043032e-06, "loss": 0.039, "step": 6988 }, { "epoch": 6.525676937441643, "grad_norm": 1.199206743492478, "learning_rate": 2.846944094264444e-06, "loss": 0.0494, "step": 6989 }, { "epoch": 6.526610644257703, "grad_norm": 1.965578988759892, "learning_rate": 2.8455794424064632e-06, "loss": 0.0877, "step": 6990 }, { "epoch": 6.527544351073763, "grad_norm": 2.9647540097251124, "learning_rate": 2.844214987593902e-06, "loss": 0.0558, "step": 6991 }, { "epoch": 6.5284780578898225, "grad_norm": 2.993935855218878, "learning_rate": 2.8428507299515558e-06, "loss": 0.0946, "step": 6992 }, { "epoch": 6.529411764705882, "grad_norm": 1.2890085081373375, "learning_rate": 2.8414866696041976e-06, "loss": 0.0694, "step": 6993 }, { "epoch": 6.530345471521942, "grad_norm": 0.24558870407484631, "learning_rate": 2.8401228066765905e-06, "loss": 0.0027, "step": 6994 }, { "epoch": 6.531279178338002, "grad_norm": 2.611186734252634, "learning_rate": 2.838759141293473e-06, "loss": 0.0928, "step": 6995 }, { "epoch": 6.5322128851540615, "grad_norm": 0.5093643725268673, "learning_rate": 2.837395673579566e-06, "loss": 0.0175, "step": 6996 }, { "epoch": 6.533146591970121, "grad_norm": 1.3503053367668165, "learning_rate": 2.836032403659573e-06, "loss": 0.0325, "step": 6997 }, { "epoch": 6.534080298786181, "grad_norm": 2.483217601977848, "learning_rate": 2.8346693316581835e-06, "loss": 0.0691, "step": 6998 }, { "epoch": 6.535014005602241, "grad_norm": 0.7615371790669919, "learning_rate": 2.8333064577000635e-06, "loss": 0.0161, "step": 6999 }, { "epoch": 6.5359477124183005, "grad_norm": 1.8078740180771538, "learning_rate": 2.8319437819098623e-06, "loss": 0.0808, "step": 7000 }, { "epoch": 6.53688141923436, "grad_norm": 1.0883056471593804, "learning_rate": 2.83058130441221e-06, "loss": 0.0297, "step": 7001 }, { "epoch": 6.53781512605042, "grad_norm": 0.6605789951134166, "learning_rate": 2.829219025331724e-06, "loss": 0.0208, "step": 7002 }, { "epoch": 6.53874883286648, "grad_norm": 3.447626116047999, "learning_rate": 2.8278569447929975e-06, "loss": 0.1465, "step": 7003 }, { "epoch": 6.5396825396825395, "grad_norm": 3.2048779785329176, "learning_rate": 2.8264950629206067e-06, "loss": 0.1441, "step": 7004 }, { "epoch": 6.540616246498599, "grad_norm": 0.9032112307936392, "learning_rate": 2.8251333798391135e-06, "loss": 0.0359, "step": 7005 }, { "epoch": 6.541549953314659, "grad_norm": 2.965384492168368, "learning_rate": 2.8237718956730564e-06, "loss": 0.096, "step": 7006 }, { "epoch": 6.542483660130719, "grad_norm": 0.20866744122459807, "learning_rate": 2.822410610546958e-06, "loss": 0.0024, "step": 7007 }, { "epoch": 6.543417366946779, "grad_norm": 3.94530420205771, "learning_rate": 2.8210495245853254e-06, "loss": 0.0696, "step": 7008 }, { "epoch": 6.544351073762838, "grad_norm": 2.497996271733016, "learning_rate": 2.819688637912642e-06, "loss": 0.1273, "step": 7009 }, { "epoch": 6.545284780578898, "grad_norm": 0.4009540192922182, "learning_rate": 2.8183279506533776e-06, "loss": 0.0089, "step": 7010 }, { "epoch": 6.546218487394958, "grad_norm": 1.4030199556750182, "learning_rate": 2.81696746293198e-06, "loss": 0.0579, "step": 7011 }, { "epoch": 6.547152194211018, "grad_norm": 2.093795881024942, "learning_rate": 2.815607174872883e-06, "loss": 0.0794, "step": 7012 }, { "epoch": 6.548085901027077, "grad_norm": 1.6291487852468123, "learning_rate": 2.8142470866005e-06, "loss": 0.037, "step": 7013 }, { "epoch": 6.549019607843137, "grad_norm": 2.909364929299218, "learning_rate": 2.8128871982392226e-06, "loss": 0.1578, "step": 7014 }, { "epoch": 6.549953314659197, "grad_norm": 0.32059284871031274, "learning_rate": 2.811527509913431e-06, "loss": 0.0102, "step": 7015 }, { "epoch": 6.550887021475257, "grad_norm": 7.768813902047138, "learning_rate": 2.8101680217474834e-06, "loss": 0.204, "step": 7016 }, { "epoch": 6.551820728291316, "grad_norm": 0.5558501034479546, "learning_rate": 2.808808733865719e-06, "loss": 0.0144, "step": 7017 }, { "epoch": 6.552754435107376, "grad_norm": 0.6088661561605703, "learning_rate": 2.8074496463924574e-06, "loss": 0.011, "step": 7018 }, { "epoch": 6.553688141923436, "grad_norm": 1.4047135990362432, "learning_rate": 2.8060907594520057e-06, "loss": 0.0438, "step": 7019 }, { "epoch": 6.554621848739496, "grad_norm": 1.4579026109872923, "learning_rate": 2.8047320731686483e-06, "loss": 0.0536, "step": 7020 }, { "epoch": 6.555555555555555, "grad_norm": 1.6486860464143507, "learning_rate": 2.803373587666651e-06, "loss": 0.0516, "step": 7021 }, { "epoch": 6.556489262371615, "grad_norm": 2.657754928855161, "learning_rate": 2.802015303070261e-06, "loss": 0.1011, "step": 7022 }, { "epoch": 6.557422969187675, "grad_norm": 1.768478862103979, "learning_rate": 2.800657219503712e-06, "loss": 0.0361, "step": 7023 }, { "epoch": 6.558356676003735, "grad_norm": 0.7456226211392966, "learning_rate": 2.7992993370912124e-06, "loss": 0.025, "step": 7024 }, { "epoch": 6.559290382819794, "grad_norm": 1.274502242636889, "learning_rate": 2.797941655956957e-06, "loss": 0.0467, "step": 7025 }, { "epoch": 6.560224089635854, "grad_norm": 1.677040755675095, "learning_rate": 2.7965841762251213e-06, "loss": 0.0585, "step": 7026 }, { "epoch": 6.561157796451914, "grad_norm": 1.5801189310963113, "learning_rate": 2.7952268980198577e-06, "loss": 0.03, "step": 7027 }, { "epoch": 6.562091503267974, "grad_norm": 3.8676419130990287, "learning_rate": 2.7938698214653092e-06, "loss": 0.1399, "step": 7028 }, { "epoch": 6.563025210084033, "grad_norm": 0.4356376288949803, "learning_rate": 2.7925129466855937e-06, "loss": 0.0098, "step": 7029 }, { "epoch": 6.563958916900093, "grad_norm": 1.6216615234297134, "learning_rate": 2.7911562738048115e-06, "loss": 0.0284, "step": 7030 }, { "epoch": 6.564892623716153, "grad_norm": 0.28194949271126635, "learning_rate": 2.789799802947044e-06, "loss": 0.0064, "step": 7031 }, { "epoch": 6.565826330532213, "grad_norm": 3.813837614743638, "learning_rate": 2.788443534236359e-06, "loss": 0.1106, "step": 7032 }, { "epoch": 6.566760037348272, "grad_norm": 0.5530289305720167, "learning_rate": 2.7870874677967996e-06, "loss": 0.0121, "step": 7033 }, { "epoch": 6.567693744164332, "grad_norm": 2.1097101734667016, "learning_rate": 2.785731603752394e-06, "loss": 0.0615, "step": 7034 }, { "epoch": 6.568627450980392, "grad_norm": 2.178827558054128, "learning_rate": 2.784375942227149e-06, "loss": 0.0704, "step": 7035 }, { "epoch": 6.569561157796452, "grad_norm": 7.7569136290235, "learning_rate": 2.7830204833450577e-06, "loss": 0.2265, "step": 7036 }, { "epoch": 6.570494864612511, "grad_norm": 2.730229928343886, "learning_rate": 2.7816652272300895e-06, "loss": 0.1002, "step": 7037 }, { "epoch": 6.571428571428571, "grad_norm": 3.2774781263660957, "learning_rate": 2.7803101740061987e-06, "loss": 0.1266, "step": 7038 }, { "epoch": 6.572362278244631, "grad_norm": 3.537583171667882, "learning_rate": 2.7789553237973187e-06, "loss": 0.1453, "step": 7039 }, { "epoch": 6.573295985060691, "grad_norm": 1.9397679413995637, "learning_rate": 2.7776006767273644e-06, "loss": 0.0712, "step": 7040 }, { "epoch": 6.57422969187675, "grad_norm": 2.5875845463213603, "learning_rate": 2.7762462329202367e-06, "loss": 0.0468, "step": 7041 }, { "epoch": 6.57516339869281, "grad_norm": 1.4905506620636084, "learning_rate": 2.7748919924998126e-06, "loss": 0.0584, "step": 7042 }, { "epoch": 6.57609710550887, "grad_norm": 1.359946139345827, "learning_rate": 2.773537955589951e-06, "loss": 0.0215, "step": 7043 }, { "epoch": 6.57703081232493, "grad_norm": 2.4647455509758482, "learning_rate": 2.772184122314493e-06, "loss": 0.075, "step": 7044 }, { "epoch": 6.5779645191409895, "grad_norm": 3.142492983842084, "learning_rate": 2.770830492797265e-06, "loss": 0.0925, "step": 7045 }, { "epoch": 6.578898225957049, "grad_norm": 1.6339558984015052, "learning_rate": 2.76947706716207e-06, "loss": 0.0712, "step": 7046 }, { "epoch": 6.579831932773109, "grad_norm": 2.511359308884308, "learning_rate": 2.768123845532692e-06, "loss": 0.0777, "step": 7047 }, { "epoch": 6.580765639589169, "grad_norm": 0.354405362423291, "learning_rate": 2.7667708280328966e-06, "loss": 0.0049, "step": 7048 }, { "epoch": 6.5816993464052285, "grad_norm": 1.656276519128947, "learning_rate": 2.7654180147864368e-06, "loss": 0.0592, "step": 7049 }, { "epoch": 6.582633053221288, "grad_norm": 0.023298075983099405, "learning_rate": 2.7640654059170403e-06, "loss": 0.0001, "step": 7050 }, { "epoch": 6.583566760037348, "grad_norm": 2.1438688007736255, "learning_rate": 2.7627130015484164e-06, "loss": 0.0529, "step": 7051 }, { "epoch": 6.584500466853408, "grad_norm": 0.4440629552591438, "learning_rate": 2.761360801804257e-06, "loss": 0.0158, "step": 7052 }, { "epoch": 6.5854341736694675, "grad_norm": 0.958561223577932, "learning_rate": 2.760008806808239e-06, "loss": 0.0114, "step": 7053 }, { "epoch": 6.586367880485527, "grad_norm": 3.829025956175348, "learning_rate": 2.7586570166840154e-06, "loss": 0.1224, "step": 7054 }, { "epoch": 6.587301587301587, "grad_norm": 1.8576756185959533, "learning_rate": 2.7573054315552226e-06, "loss": 0.0501, "step": 7055 }, { "epoch": 6.588235294117647, "grad_norm": 1.850954997256099, "learning_rate": 2.7559540515454768e-06, "loss": 0.0631, "step": 7056 }, { "epoch": 6.5891690009337065, "grad_norm": 4.690757197888965, "learning_rate": 2.754602876778375e-06, "loss": 0.1374, "step": 7057 }, { "epoch": 6.590102707749766, "grad_norm": 1.5698464226245106, "learning_rate": 2.753251907377502e-06, "loss": 0.0615, "step": 7058 }, { "epoch": 6.591036414565826, "grad_norm": 1.4744644247118666, "learning_rate": 2.7519011434664156e-06, "loss": 0.0447, "step": 7059 }, { "epoch": 6.591970121381886, "grad_norm": 2.130491544557731, "learning_rate": 2.7505505851686576e-06, "loss": 0.091, "step": 7060 }, { "epoch": 6.5929038281979455, "grad_norm": 0.5821902886069216, "learning_rate": 2.7492002326077507e-06, "loss": 0.0114, "step": 7061 }, { "epoch": 6.593837535014005, "grad_norm": 1.2291617841454578, "learning_rate": 2.7478500859072033e-06, "loss": 0.0333, "step": 7062 }, { "epoch": 6.594771241830065, "grad_norm": 4.384238152853425, "learning_rate": 2.7465001451904984e-06, "loss": 0.2184, "step": 7063 }, { "epoch": 6.595704948646125, "grad_norm": 2.6442424012036634, "learning_rate": 2.745150410581102e-06, "loss": 0.0945, "step": 7064 }, { "epoch": 6.5966386554621845, "grad_norm": 0.3754875667157368, "learning_rate": 2.7438008822024632e-06, "loss": 0.0087, "step": 7065 }, { "epoch": 6.597572362278244, "grad_norm": 1.879524765354819, "learning_rate": 2.742451560178012e-06, "loss": 0.0906, "step": 7066 }, { "epoch": 6.598506069094304, "grad_norm": 1.5595503549131917, "learning_rate": 2.7411024446311597e-06, "loss": 0.0689, "step": 7067 }, { "epoch": 6.599439775910364, "grad_norm": 3.3055796140075246, "learning_rate": 2.7397535356852942e-06, "loss": 0.1382, "step": 7068 }, { "epoch": 6.6003734827264235, "grad_norm": 1.086391432116875, "learning_rate": 2.738404833463789e-06, "loss": 0.0329, "step": 7069 }, { "epoch": 6.601307189542483, "grad_norm": 1.5271988566021906, "learning_rate": 2.7370563380900005e-06, "loss": 0.0548, "step": 7070 }, { "epoch": 6.602240896358543, "grad_norm": 7.143931257361631, "learning_rate": 2.7357080496872624e-06, "loss": 0.1644, "step": 7071 }, { "epoch": 6.603174603174603, "grad_norm": 2.5355274150603937, "learning_rate": 2.734359968378889e-06, "loss": 0.1206, "step": 7072 }, { "epoch": 6.604108309990663, "grad_norm": 5.060712489114132, "learning_rate": 2.7330120942881775e-06, "loss": 0.1399, "step": 7073 }, { "epoch": 6.605042016806722, "grad_norm": 2.746358738344579, "learning_rate": 2.7316644275384045e-06, "loss": 0.1255, "step": 7074 }, { "epoch": 6.605975723622782, "grad_norm": 0.3420307497175987, "learning_rate": 2.7303169682528326e-06, "loss": 0.0052, "step": 7075 }, { "epoch": 6.606909430438842, "grad_norm": 3.1733737796657717, "learning_rate": 2.7289697165546993e-06, "loss": 0.1487, "step": 7076 }, { "epoch": 6.607843137254902, "grad_norm": 0.873199231742398, "learning_rate": 2.7276226725672245e-06, "loss": 0.0217, "step": 7077 }, { "epoch": 6.608776844070961, "grad_norm": 0.8842433224276075, "learning_rate": 2.7262758364136135e-06, "loss": 0.0259, "step": 7078 }, { "epoch": 6.609710550887021, "grad_norm": 1.7080569847614802, "learning_rate": 2.7249292082170463e-06, "loss": 0.0881, "step": 7079 }, { "epoch": 6.610644257703081, "grad_norm": 1.5018963339730478, "learning_rate": 2.7235827881006892e-06, "loss": 0.0388, "step": 7080 }, { "epoch": 6.611577964519141, "grad_norm": 0.34552625221171707, "learning_rate": 2.7222365761876835e-06, "loss": 0.0078, "step": 7081 }, { "epoch": 6.6125116713352, "grad_norm": 1.2454785978465948, "learning_rate": 2.7208905726011594e-06, "loss": 0.0543, "step": 7082 }, { "epoch": 6.61344537815126, "grad_norm": 0.2320391061357078, "learning_rate": 2.7195447774642216e-06, "loss": 0.0041, "step": 7083 }, { "epoch": 6.61437908496732, "grad_norm": 3.614411285838956, "learning_rate": 2.7181991908999563e-06, "loss": 0.0296, "step": 7084 }, { "epoch": 6.61531279178338, "grad_norm": 0.3497835459778829, "learning_rate": 2.716853813031435e-06, "loss": 0.0113, "step": 7085 }, { "epoch": 6.616246498599439, "grad_norm": 1.1509417233750054, "learning_rate": 2.7155086439817063e-06, "loss": 0.0456, "step": 7086 }, { "epoch": 6.617180205415499, "grad_norm": 0.8661731841298936, "learning_rate": 2.714163683873799e-06, "loss": 0.0277, "step": 7087 }, { "epoch": 6.618113912231559, "grad_norm": 0.8155273036910272, "learning_rate": 2.7128189328307273e-06, "loss": 0.012, "step": 7088 }, { "epoch": 6.619047619047619, "grad_norm": 1.8902660559537503, "learning_rate": 2.711474390975483e-06, "loss": 0.0963, "step": 7089 }, { "epoch": 6.619981325863678, "grad_norm": 2.3794389438789656, "learning_rate": 2.7101300584310374e-06, "loss": 0.0876, "step": 7090 }, { "epoch": 6.620915032679738, "grad_norm": 3.430042933364383, "learning_rate": 2.7087859353203438e-06, "loss": 0.0984, "step": 7091 }, { "epoch": 6.621848739495798, "grad_norm": 2.920326934771057, "learning_rate": 2.70744202176634e-06, "loss": 0.0625, "step": 7092 }, { "epoch": 6.622782446311858, "grad_norm": 0.7484227075181078, "learning_rate": 2.706098317891941e-06, "loss": 0.024, "step": 7093 }, { "epoch": 6.623716153127917, "grad_norm": 2.0689467039881504, "learning_rate": 2.7047548238200423e-06, "loss": 0.0785, "step": 7094 }, { "epoch": 6.624649859943977, "grad_norm": 1.6277515351561278, "learning_rate": 2.7034115396735194e-06, "loss": 0.0886, "step": 7095 }, { "epoch": 6.625583566760037, "grad_norm": 2.3617567035526, "learning_rate": 2.702068465575234e-06, "loss": 0.0796, "step": 7096 }, { "epoch": 6.626517273576097, "grad_norm": 1.0463724999843427, "learning_rate": 2.7007256016480245e-06, "loss": 0.0589, "step": 7097 }, { "epoch": 6.627450980392156, "grad_norm": 0.939563395591174, "learning_rate": 2.6993829480147082e-06, "loss": 0.0275, "step": 7098 }, { "epoch": 6.628384687208216, "grad_norm": 0.5481183363351482, "learning_rate": 2.6980405047980853e-06, "loss": 0.0151, "step": 7099 }, { "epoch": 6.629318394024276, "grad_norm": 2.0081555988414164, "learning_rate": 2.6966982721209402e-06, "loss": 0.059, "step": 7100 }, { "epoch": 6.630252100840336, "grad_norm": 0.36102352361471407, "learning_rate": 2.6953562501060332e-06, "loss": 0.0078, "step": 7101 }, { "epoch": 6.631185807656395, "grad_norm": 1.8934489510564148, "learning_rate": 2.694014438876107e-06, "loss": 0.0741, "step": 7102 }, { "epoch": 6.632119514472455, "grad_norm": 0.37421821189492527, "learning_rate": 2.6926728385538825e-06, "loss": 0.0108, "step": 7103 }, { "epoch": 6.633053221288515, "grad_norm": 0.12347467488612174, "learning_rate": 2.6913314492620678e-06, "loss": 0.0015, "step": 7104 }, { "epoch": 6.633986928104575, "grad_norm": 3.0094904739367956, "learning_rate": 2.6899902711233467e-06, "loss": 0.0824, "step": 7105 }, { "epoch": 6.634920634920634, "grad_norm": 2.68918931858937, "learning_rate": 2.6886493042603835e-06, "loss": 0.0847, "step": 7106 }, { "epoch": 6.635854341736694, "grad_norm": 0.9745192295349352, "learning_rate": 2.687308548795825e-06, "loss": 0.0388, "step": 7107 }, { "epoch": 6.636788048552754, "grad_norm": 3.0207806626552034, "learning_rate": 2.6859680048522953e-06, "loss": 0.059, "step": 7108 }, { "epoch": 6.637721755368814, "grad_norm": 3.4342704411099616, "learning_rate": 2.6846276725524066e-06, "loss": 0.1431, "step": 7109 }, { "epoch": 6.6386554621848735, "grad_norm": 3.09550522312647, "learning_rate": 2.6832875520187448e-06, "loss": 0.1066, "step": 7110 }, { "epoch": 6.639589169000933, "grad_norm": 1.1659976232411189, "learning_rate": 2.6819476433738785e-06, "loss": 0.0382, "step": 7111 }, { "epoch": 6.640522875816993, "grad_norm": 2.7200037641237245, "learning_rate": 2.6806079467403547e-06, "loss": 0.149, "step": 7112 }, { "epoch": 6.641456582633054, "grad_norm": 1.4125282854263443, "learning_rate": 2.679268462240708e-06, "loss": 0.0542, "step": 7113 }, { "epoch": 6.642390289449113, "grad_norm": 2.0986054121633604, "learning_rate": 2.6779291899974475e-06, "loss": 0.1075, "step": 7114 }, { "epoch": 6.643323996265173, "grad_norm": 2.6855726465078718, "learning_rate": 2.6765901301330633e-06, "loss": 0.1242, "step": 7115 }, { "epoch": 6.644257703081233, "grad_norm": 2.3378802513257075, "learning_rate": 2.675251282770025e-06, "loss": 0.1242, "step": 7116 }, { "epoch": 6.645191409897293, "grad_norm": 1.0951731726769849, "learning_rate": 2.6739126480307897e-06, "loss": 0.028, "step": 7117 }, { "epoch": 6.646125116713352, "grad_norm": 1.5131951484393853, "learning_rate": 2.6725742260377878e-06, "loss": 0.0427, "step": 7118 }, { "epoch": 6.647058823529412, "grad_norm": 0.4007284538915206, "learning_rate": 2.671236016913433e-06, "loss": 0.0079, "step": 7119 }, { "epoch": 6.647992530345472, "grad_norm": 2.4584296636895315, "learning_rate": 2.6698980207801166e-06, "loss": 0.0984, "step": 7120 }, { "epoch": 6.648926237161532, "grad_norm": 3.07538894154749, "learning_rate": 2.6685602377602182e-06, "loss": 0.0724, "step": 7121 }, { "epoch": 6.649859943977591, "grad_norm": 1.5815488452051798, "learning_rate": 2.6672226679760892e-06, "loss": 0.0407, "step": 7122 }, { "epoch": 6.650793650793651, "grad_norm": 0.2664467229843026, "learning_rate": 2.665885311550066e-06, "loss": 0.0037, "step": 7123 }, { "epoch": 6.651727357609711, "grad_norm": 1.1809444499209152, "learning_rate": 2.664548168604465e-06, "loss": 0.0403, "step": 7124 }, { "epoch": 6.652661064425771, "grad_norm": 4.314760719314039, "learning_rate": 2.6632112392615794e-06, "loss": 0.2772, "step": 7125 }, { "epoch": 6.65359477124183, "grad_norm": 3.3485474393121955, "learning_rate": 2.6618745236436903e-06, "loss": 0.122, "step": 7126 }, { "epoch": 6.65452847805789, "grad_norm": 1.1336853023990092, "learning_rate": 2.660538021873054e-06, "loss": 0.0347, "step": 7127 }, { "epoch": 6.65546218487395, "grad_norm": 0.9814316098102865, "learning_rate": 2.659201734071908e-06, "loss": 0.0163, "step": 7128 }, { "epoch": 6.65639589169001, "grad_norm": 0.5904197966381383, "learning_rate": 2.6578656603624674e-06, "loss": 0.0172, "step": 7129 }, { "epoch": 6.657329598506069, "grad_norm": 0.4685903345507979, "learning_rate": 2.656529800866936e-06, "loss": 0.0119, "step": 7130 }, { "epoch": 6.658263305322129, "grad_norm": 1.2186696864029827, "learning_rate": 2.65519415570749e-06, "loss": 0.0354, "step": 7131 }, { "epoch": 6.659197012138189, "grad_norm": 2.6210392562010556, "learning_rate": 2.653858725006289e-06, "loss": 0.1211, "step": 7132 }, { "epoch": 6.660130718954249, "grad_norm": 3.346348498519313, "learning_rate": 2.652523508885472e-06, "loss": 0.1413, "step": 7133 }, { "epoch": 6.661064425770308, "grad_norm": 0.239816708516486, "learning_rate": 2.651188507467161e-06, "loss": 0.0094, "step": 7134 }, { "epoch": 6.661998132586368, "grad_norm": 1.5241511846029556, "learning_rate": 2.6498537208734566e-06, "loss": 0.0649, "step": 7135 }, { "epoch": 6.662931839402428, "grad_norm": 1.4034136487319528, "learning_rate": 2.648519149226438e-06, "loss": 0.0691, "step": 7136 }, { "epoch": 6.663865546218488, "grad_norm": 2.250221605471879, "learning_rate": 2.6471847926481663e-06, "loss": 0.1073, "step": 7137 }, { "epoch": 6.6647992530345475, "grad_norm": 2.3409824883736774, "learning_rate": 2.645850651260685e-06, "loss": 0.1104, "step": 7138 }, { "epoch": 6.665732959850607, "grad_norm": 1.6713599329706041, "learning_rate": 2.644516725186016e-06, "loss": 0.0516, "step": 7139 }, { "epoch": 6.666666666666667, "grad_norm": 1.3487470847067706, "learning_rate": 2.6431830145461596e-06, "loss": 0.0563, "step": 7140 }, { "epoch": 6.667600373482727, "grad_norm": 2.6107614972795767, "learning_rate": 2.6418495194630996e-06, "loss": 0.0345, "step": 7141 }, { "epoch": 6.6685340802987865, "grad_norm": 2.0630510537400717, "learning_rate": 2.640516240058796e-06, "loss": 0.0535, "step": 7142 }, { "epoch": 6.669467787114846, "grad_norm": 0.8036453980790988, "learning_rate": 2.639183176455196e-06, "loss": 0.012, "step": 7143 }, { "epoch": 6.670401493930906, "grad_norm": 0.3445962727004618, "learning_rate": 2.637850328774222e-06, "loss": 0.012, "step": 7144 }, { "epoch": 6.671335200746966, "grad_norm": 0.4512147556056436, "learning_rate": 2.636517697137776e-06, "loss": 0.0078, "step": 7145 }, { "epoch": 6.6722689075630255, "grad_norm": 0.6911766135345384, "learning_rate": 2.6351852816677403e-06, "loss": 0.0259, "step": 7146 }, { "epoch": 6.673202614379085, "grad_norm": 1.412035254197522, "learning_rate": 2.633853082485983e-06, "loss": 0.036, "step": 7147 }, { "epoch": 6.674136321195145, "grad_norm": 4.4288981840850346, "learning_rate": 2.6325210997143467e-06, "loss": 0.2032, "step": 7148 }, { "epoch": 6.675070028011205, "grad_norm": 2.002514657751898, "learning_rate": 2.6311893334746557e-06, "loss": 0.034, "step": 7149 }, { "epoch": 6.6760037348272645, "grad_norm": 1.941605355349972, "learning_rate": 2.6298577838887123e-06, "loss": 0.0764, "step": 7150 }, { "epoch": 6.676937441643324, "grad_norm": 0.6903981452673619, "learning_rate": 2.628526451078306e-06, "loss": 0.0185, "step": 7151 }, { "epoch": 6.677871148459384, "grad_norm": 0.3835179016733044, "learning_rate": 2.6271953351651987e-06, "loss": 0.0063, "step": 7152 }, { "epoch": 6.678804855275444, "grad_norm": 2.0202842236858465, "learning_rate": 2.625864436271136e-06, "loss": 0.0722, "step": 7153 }, { "epoch": 6.6797385620915035, "grad_norm": 2.305842789032191, "learning_rate": 2.6245337545178423e-06, "loss": 0.0813, "step": 7154 }, { "epoch": 6.680672268907563, "grad_norm": 2.6437377311940775, "learning_rate": 2.6232032900270254e-06, "loss": 0.1073, "step": 7155 }, { "epoch": 6.681605975723623, "grad_norm": 0.3957299527412808, "learning_rate": 2.6218730429203704e-06, "loss": 0.0076, "step": 7156 }, { "epoch": 6.682539682539683, "grad_norm": 0.8686543648489996, "learning_rate": 2.6205430133195396e-06, "loss": 0.0113, "step": 7157 }, { "epoch": 6.6834733893557425, "grad_norm": 2.364016662096333, "learning_rate": 2.6192132013461842e-06, "loss": 0.0636, "step": 7158 }, { "epoch": 6.684407096171802, "grad_norm": 1.068589766553515, "learning_rate": 2.6178836071219273e-06, "loss": 0.0515, "step": 7159 }, { "epoch": 6.685340802987862, "grad_norm": 1.202524668780007, "learning_rate": 2.6165542307683744e-06, "loss": 0.0559, "step": 7160 }, { "epoch": 6.686274509803922, "grad_norm": 3.804870516746002, "learning_rate": 2.6152250724071106e-06, "loss": 0.2299, "step": 7161 }, { "epoch": 6.6872082166199815, "grad_norm": 1.050614156839714, "learning_rate": 2.6138961321597055e-06, "loss": 0.0289, "step": 7162 }, { "epoch": 6.688141923436041, "grad_norm": 3.5415772680279756, "learning_rate": 2.612567410147704e-06, "loss": 0.2019, "step": 7163 }, { "epoch": 6.689075630252101, "grad_norm": 1.0999890504985916, "learning_rate": 2.6112389064926293e-06, "loss": 0.0332, "step": 7164 }, { "epoch": 6.690009337068161, "grad_norm": 2.4540898027910147, "learning_rate": 2.6099106213159912e-06, "loss": 0.1243, "step": 7165 }, { "epoch": 6.690943043884221, "grad_norm": 1.684395265713144, "learning_rate": 2.608582554739276e-06, "loss": 0.0546, "step": 7166 }, { "epoch": 6.69187675070028, "grad_norm": 2.344901548663077, "learning_rate": 2.607254706883946e-06, "loss": 0.0793, "step": 7167 }, { "epoch": 6.69281045751634, "grad_norm": 1.9519125451412505, "learning_rate": 2.605927077871453e-06, "loss": 0.1255, "step": 7168 }, { "epoch": 6.6937441643324, "grad_norm": 2.6375262274018065, "learning_rate": 2.60459966782322e-06, "loss": 0.1065, "step": 7169 }, { "epoch": 6.69467787114846, "grad_norm": 2.787293323988376, "learning_rate": 2.603272476860653e-06, "loss": 0.1484, "step": 7170 }, { "epoch": 6.695611577964519, "grad_norm": 1.4703758927969233, "learning_rate": 2.6019455051051378e-06, "loss": 0.0494, "step": 7171 }, { "epoch": 6.696545284780579, "grad_norm": 1.2637197600161534, "learning_rate": 2.600618752678044e-06, "loss": 0.0462, "step": 7172 }, { "epoch": 6.697478991596639, "grad_norm": 1.1664452120122843, "learning_rate": 2.5992922197007153e-06, "loss": 0.0177, "step": 7173 }, { "epoch": 6.698412698412699, "grad_norm": 1.8766527353168192, "learning_rate": 2.5979659062944775e-06, "loss": 0.0766, "step": 7174 }, { "epoch": 6.699346405228758, "grad_norm": 0.24384570321973295, "learning_rate": 2.5966398125806363e-06, "loss": 0.0023, "step": 7175 }, { "epoch": 6.700280112044818, "grad_norm": 1.3137104606944843, "learning_rate": 2.5953139386804765e-06, "loss": 0.0425, "step": 7176 }, { "epoch": 6.701213818860878, "grad_norm": 1.8207384478522557, "learning_rate": 2.5939882847152676e-06, "loss": 0.0521, "step": 7177 }, { "epoch": 6.702147525676938, "grad_norm": 1.2397201816585657, "learning_rate": 2.5926628508062524e-06, "loss": 0.0537, "step": 7178 }, { "epoch": 6.703081232492997, "grad_norm": 2.124362910710285, "learning_rate": 2.5913376370746568e-06, "loss": 0.0751, "step": 7179 }, { "epoch": 6.704014939309057, "grad_norm": 2.8138696631882016, "learning_rate": 2.590012643641685e-06, "loss": 0.1447, "step": 7180 }, { "epoch": 6.704948646125117, "grad_norm": 3.7026660771294306, "learning_rate": 2.5886878706285253e-06, "loss": 0.0393, "step": 7181 }, { "epoch": 6.705882352941177, "grad_norm": 2.1558738730364695, "learning_rate": 2.587363318156341e-06, "loss": 0.1001, "step": 7182 }, { "epoch": 6.706816059757236, "grad_norm": 0.25678594483567696, "learning_rate": 2.5860389863462765e-06, "loss": 0.0053, "step": 7183 }, { "epoch": 6.707749766573296, "grad_norm": 1.288708062427529, "learning_rate": 2.584714875319455e-06, "loss": 0.048, "step": 7184 }, { "epoch": 6.708683473389356, "grad_norm": 0.8649998808927073, "learning_rate": 2.583390985196985e-06, "loss": 0.0197, "step": 7185 }, { "epoch": 6.709617180205416, "grad_norm": 1.2863916310181447, "learning_rate": 2.582067316099949e-06, "loss": 0.0418, "step": 7186 }, { "epoch": 6.710550887021475, "grad_norm": 1.4760510938275202, "learning_rate": 2.5807438681494103e-06, "loss": 0.061, "step": 7187 }, { "epoch": 6.711484593837535, "grad_norm": 0.6724604645782448, "learning_rate": 2.5794206414664137e-06, "loss": 0.0149, "step": 7188 }, { "epoch": 6.712418300653595, "grad_norm": 1.320992202759654, "learning_rate": 2.57809763617198e-06, "loss": 0.0405, "step": 7189 }, { "epoch": 6.713352007469655, "grad_norm": 3.162228441223114, "learning_rate": 2.576774852387116e-06, "loss": 0.0878, "step": 7190 }, { "epoch": 6.714285714285714, "grad_norm": 1.4459202300819773, "learning_rate": 2.575452290232805e-06, "loss": 0.0393, "step": 7191 }, { "epoch": 6.715219421101774, "grad_norm": 2.715062016126369, "learning_rate": 2.574129949830008e-06, "loss": 0.0952, "step": 7192 }, { "epoch": 6.716153127917834, "grad_norm": 1.9860467950778884, "learning_rate": 2.572807831299666e-06, "loss": 0.095, "step": 7193 }, { "epoch": 6.717086834733894, "grad_norm": 1.6130473010358992, "learning_rate": 2.5714859347627056e-06, "loss": 0.0517, "step": 7194 }, { "epoch": 6.718020541549953, "grad_norm": 2.2667855855249632, "learning_rate": 2.570164260340027e-06, "loss": 0.126, "step": 7195 }, { "epoch": 6.718954248366013, "grad_norm": 4.668392869747782, "learning_rate": 2.56884280815251e-06, "loss": 0.1842, "step": 7196 }, { "epoch": 6.719887955182073, "grad_norm": 0.3945976846361932, "learning_rate": 2.5675215783210163e-06, "loss": 0.0144, "step": 7197 }, { "epoch": 6.720821661998133, "grad_norm": 1.9545048810555723, "learning_rate": 2.5662005709663895e-06, "loss": 0.0765, "step": 7198 }, { "epoch": 6.721755368814192, "grad_norm": 0.6934558230752224, "learning_rate": 2.5648797862094483e-06, "loss": 0.0275, "step": 7199 }, { "epoch": 6.722689075630252, "grad_norm": 3.675095246848756, "learning_rate": 2.5635592241709933e-06, "loss": 0.0624, "step": 7200 }, { "epoch": 6.723622782446312, "grad_norm": 1.2631336453538777, "learning_rate": 2.562238884971803e-06, "loss": 0.0385, "step": 7201 }, { "epoch": 6.724556489262372, "grad_norm": 1.008484172416368, "learning_rate": 2.5609187687326393e-06, "loss": 0.0365, "step": 7202 }, { "epoch": 6.7254901960784315, "grad_norm": 3.1233640517483927, "learning_rate": 2.5595988755742408e-06, "loss": 0.1185, "step": 7203 }, { "epoch": 6.726423902894491, "grad_norm": 1.542113212178217, "learning_rate": 2.5582792056173256e-06, "loss": 0.0435, "step": 7204 }, { "epoch": 6.727357609710551, "grad_norm": 0.8087875625873937, "learning_rate": 2.556959758982592e-06, "loss": 0.0243, "step": 7205 }, { "epoch": 6.728291316526611, "grad_norm": 1.1071482892815092, "learning_rate": 2.5556405357907153e-06, "loss": 0.0366, "step": 7206 }, { "epoch": 6.7292250233426705, "grad_norm": 3.03762072162344, "learning_rate": 2.5543215361623585e-06, "loss": 0.1466, "step": 7207 }, { "epoch": 6.73015873015873, "grad_norm": 2.286127973057329, "learning_rate": 2.553002760218156e-06, "loss": 0.0892, "step": 7208 }, { "epoch": 6.73109243697479, "grad_norm": 1.036885033946472, "learning_rate": 2.5516842080787236e-06, "loss": 0.036, "step": 7209 }, { "epoch": 6.73202614379085, "grad_norm": 0.637995315525702, "learning_rate": 2.5503658798646556e-06, "loss": 0.0143, "step": 7210 }, { "epoch": 6.7329598506069095, "grad_norm": 0.3038438733097852, "learning_rate": 2.549047775696533e-06, "loss": 0.0059, "step": 7211 }, { "epoch": 6.733893557422969, "grad_norm": 1.2835105366935948, "learning_rate": 2.5477298956949066e-06, "loss": 0.0442, "step": 7212 }, { "epoch": 6.734827264239029, "grad_norm": 1.798832816590756, "learning_rate": 2.5464122399803126e-06, "loss": 0.0803, "step": 7213 }, { "epoch": 6.735760971055089, "grad_norm": 3.1144447662537806, "learning_rate": 2.545094808673262e-06, "loss": 0.1966, "step": 7214 }, { "epoch": 6.7366946778711485, "grad_norm": 1.399231265059042, "learning_rate": 2.543777601894254e-06, "loss": 0.0555, "step": 7215 }, { "epoch": 6.737628384687208, "grad_norm": 0.46037343531805164, "learning_rate": 2.542460619763758e-06, "loss": 0.0027, "step": 7216 }, { "epoch": 6.738562091503268, "grad_norm": 0.4443838444097559, "learning_rate": 2.5411438624022265e-06, "loss": 0.0063, "step": 7217 }, { "epoch": 6.739495798319328, "grad_norm": 4.084415464310313, "learning_rate": 2.5398273299300915e-06, "loss": 0.1693, "step": 7218 }, { "epoch": 6.7404295051353875, "grad_norm": 1.7531426385488702, "learning_rate": 2.5385110224677658e-06, "loss": 0.0796, "step": 7219 }, { "epoch": 6.741363211951447, "grad_norm": 2.9113551981655506, "learning_rate": 2.53719494013564e-06, "loss": 0.1058, "step": 7220 }, { "epoch": 6.742296918767507, "grad_norm": 6.96317085036632, "learning_rate": 2.5358790830540824e-06, "loss": 0.1211, "step": 7221 }, { "epoch": 6.743230625583567, "grad_norm": 3.391497869359242, "learning_rate": 2.5345634513434447e-06, "loss": 0.1693, "step": 7222 }, { "epoch": 6.7441643323996265, "grad_norm": 2.493301865802591, "learning_rate": 2.5332480451240525e-06, "loss": 0.0226, "step": 7223 }, { "epoch": 6.745098039215686, "grad_norm": 1.6124982693644418, "learning_rate": 2.5319328645162188e-06, "loss": 0.0777, "step": 7224 }, { "epoch": 6.746031746031746, "grad_norm": 1.2000960132846825, "learning_rate": 2.530617909640228e-06, "loss": 0.0204, "step": 7225 }, { "epoch": 6.746965452847806, "grad_norm": 2.0153550330702457, "learning_rate": 2.5293031806163496e-06, "loss": 0.0848, "step": 7226 }, { "epoch": 6.7478991596638656, "grad_norm": 0.33667691374431813, "learning_rate": 2.5279886775648266e-06, "loss": 0.0015, "step": 7227 }, { "epoch": 6.748832866479925, "grad_norm": 1.8567074995438562, "learning_rate": 2.5266744006058875e-06, "loss": 0.0807, "step": 7228 }, { "epoch": 6.749766573295985, "grad_norm": 2.014417646562749, "learning_rate": 2.525360349859738e-06, "loss": 0.0829, "step": 7229 }, { "epoch": 6.750700280112045, "grad_norm": 1.15519077826481, "learning_rate": 2.524046525446558e-06, "loss": 0.058, "step": 7230 }, { "epoch": 6.751633986928105, "grad_norm": 1.5342193162582998, "learning_rate": 2.5227329274865164e-06, "loss": 0.0387, "step": 7231 }, { "epoch": 6.752567693744164, "grad_norm": 5.365870205628899, "learning_rate": 2.5214195560997546e-06, "loss": 0.1288, "step": 7232 }, { "epoch": 6.753501400560224, "grad_norm": 0.6957870583954557, "learning_rate": 2.520106411406393e-06, "loss": 0.01, "step": 7233 }, { "epoch": 6.754435107376284, "grad_norm": 0.5431472148791999, "learning_rate": 2.518793493526534e-06, "loss": 0.0165, "step": 7234 }, { "epoch": 6.755368814192344, "grad_norm": 4.542228173380597, "learning_rate": 2.51748080258026e-06, "loss": 0.1817, "step": 7235 }, { "epoch": 6.756302521008403, "grad_norm": 2.2298539155131682, "learning_rate": 2.5161683386876297e-06, "loss": 0.0701, "step": 7236 }, { "epoch": 6.757236227824463, "grad_norm": 1.5481543718331658, "learning_rate": 2.51485610196868e-06, "loss": 0.0793, "step": 7237 }, { "epoch": 6.758169934640523, "grad_norm": 3.734769213881228, "learning_rate": 2.513544092543434e-06, "loss": 0.1368, "step": 7238 }, { "epoch": 6.759103641456583, "grad_norm": 2.579319964402971, "learning_rate": 2.5122323105318867e-06, "loss": 0.1192, "step": 7239 }, { "epoch": 6.760037348272642, "grad_norm": 3.1909972745192463, "learning_rate": 2.510920756054016e-06, "loss": 0.1549, "step": 7240 }, { "epoch": 6.760971055088702, "grad_norm": 3.3125783752158338, "learning_rate": 2.509609429229775e-06, "loss": 0.1336, "step": 7241 }, { "epoch": 6.761904761904762, "grad_norm": 0.5416605069476722, "learning_rate": 2.5082983301791033e-06, "loss": 0.0046, "step": 7242 }, { "epoch": 6.762838468720822, "grad_norm": 3.082788083278401, "learning_rate": 2.5069874590219134e-06, "loss": 0.1381, "step": 7243 }, { "epoch": 6.763772175536881, "grad_norm": 2.9492072178340956, "learning_rate": 2.5056768158780965e-06, "loss": 0.1227, "step": 7244 }, { "epoch": 6.764705882352941, "grad_norm": 1.3246989615242957, "learning_rate": 2.5043664008675297e-06, "loss": 0.0281, "step": 7245 }, { "epoch": 6.765639589169001, "grad_norm": 1.0953768590354325, "learning_rate": 2.503056214110062e-06, "loss": 0.0327, "step": 7246 }, { "epoch": 6.766573295985061, "grad_norm": 0.9598069547226076, "learning_rate": 2.5017462557255255e-06, "loss": 0.0381, "step": 7247 }, { "epoch": 6.76750700280112, "grad_norm": 2.9634315895501038, "learning_rate": 2.5004365258337282e-06, "loss": 0.0918, "step": 7248 }, { "epoch": 6.76844070961718, "grad_norm": 3.6032443971958132, "learning_rate": 2.4991270245544624e-06, "loss": 0.1513, "step": 7249 }, { "epoch": 6.76937441643324, "grad_norm": 0.6673068577617586, "learning_rate": 2.4978177520074955e-06, "loss": 0.0089, "step": 7250 }, { "epoch": 6.7703081232493, "grad_norm": 0.4171018744599597, "learning_rate": 2.4965087083125733e-06, "loss": 0.0091, "step": 7251 }, { "epoch": 6.771241830065359, "grad_norm": 0.8286596993886923, "learning_rate": 2.4951998935894216e-06, "loss": 0.0258, "step": 7252 }, { "epoch": 6.772175536881419, "grad_norm": 8.486571195760142, "learning_rate": 2.4938913079577498e-06, "loss": 0.1378, "step": 7253 }, { "epoch": 6.773109243697479, "grad_norm": 1.5511234913902532, "learning_rate": 2.4925829515372397e-06, "loss": 0.0399, "step": 7254 }, { "epoch": 6.774042950513539, "grad_norm": 2.0800109703174954, "learning_rate": 2.4912748244475548e-06, "loss": 0.139, "step": 7255 }, { "epoch": 6.774976657329598, "grad_norm": 0.423630083253279, "learning_rate": 2.4899669268083388e-06, "loss": 0.0057, "step": 7256 }, { "epoch": 6.775910364145658, "grad_norm": 0.39492635773933155, "learning_rate": 2.4886592587392105e-06, "loss": 0.0069, "step": 7257 }, { "epoch": 6.776844070961718, "grad_norm": 1.4406938679769767, "learning_rate": 2.487351820359774e-06, "loss": 0.0618, "step": 7258 }, { "epoch": 6.777777777777778, "grad_norm": 1.3059814855013878, "learning_rate": 2.486044611789608e-06, "loss": 0.0346, "step": 7259 }, { "epoch": 6.778711484593837, "grad_norm": 0.4861561928595996, "learning_rate": 2.4847376331482707e-06, "loss": 0.0124, "step": 7260 }, { "epoch": 6.779645191409897, "grad_norm": 1.3229494259633832, "learning_rate": 2.4834308845552975e-06, "loss": 0.0567, "step": 7261 }, { "epoch": 6.780578898225957, "grad_norm": 1.4158127673428984, "learning_rate": 2.4821243661302084e-06, "loss": 0.0551, "step": 7262 }, { "epoch": 6.781512605042017, "grad_norm": 0.8289566901562673, "learning_rate": 2.4808180779924984e-06, "loss": 0.0195, "step": 7263 }, { "epoch": 6.7824463118580764, "grad_norm": 2.242264948199272, "learning_rate": 2.47951202026164e-06, "loss": 0.1209, "step": 7264 }, { "epoch": 6.783380018674136, "grad_norm": 3.172055680726178, "learning_rate": 2.478206193057086e-06, "loss": 0.1063, "step": 7265 }, { "epoch": 6.784313725490196, "grad_norm": 0.23486347305843197, "learning_rate": 2.4769005964982718e-06, "loss": 0.0017, "step": 7266 }, { "epoch": 6.785247432306256, "grad_norm": 0.9053390269945327, "learning_rate": 2.475595230704607e-06, "loss": 0.0259, "step": 7267 }, { "epoch": 6.7861811391223155, "grad_norm": 0.6163849059667882, "learning_rate": 2.474290095795481e-06, "loss": 0.0164, "step": 7268 }, { "epoch": 6.787114845938375, "grad_norm": 0.8089862211432405, "learning_rate": 2.4729851918902613e-06, "loss": 0.0291, "step": 7269 }, { "epoch": 6.788048552754435, "grad_norm": 0.9526180541773906, "learning_rate": 2.4716805191082997e-06, "loss": 0.0303, "step": 7270 }, { "epoch": 6.788982259570495, "grad_norm": 2.6793692297362623, "learning_rate": 2.470376077568921e-06, "loss": 0.1359, "step": 7271 }, { "epoch": 6.7899159663865545, "grad_norm": 1.0684742732797678, "learning_rate": 2.4690718673914304e-06, "loss": 0.0414, "step": 7272 }, { "epoch": 6.790849673202614, "grad_norm": 1.0676859817016988, "learning_rate": 2.4677678886951115e-06, "loss": 0.0362, "step": 7273 }, { "epoch": 6.791783380018674, "grad_norm": 2.8876878412050004, "learning_rate": 2.466464141599227e-06, "loss": 0.1553, "step": 7274 }, { "epoch": 6.792717086834734, "grad_norm": 3.170930103354662, "learning_rate": 2.465160626223022e-06, "loss": 0.061, "step": 7275 }, { "epoch": 6.7936507936507935, "grad_norm": 1.5246266403687496, "learning_rate": 2.4638573426857154e-06, "loss": 0.0458, "step": 7276 }, { "epoch": 6.794584500466853, "grad_norm": 1.344758553729366, "learning_rate": 2.4625542911065066e-06, "loss": 0.0543, "step": 7277 }, { "epoch": 6.795518207282913, "grad_norm": 2.363914701522314, "learning_rate": 2.461251471604572e-06, "loss": 0.0807, "step": 7278 }, { "epoch": 6.796451914098973, "grad_norm": 2.3325164715120894, "learning_rate": 2.4599488842990736e-06, "loss": 0.1037, "step": 7279 }, { "epoch": 6.7973856209150325, "grad_norm": 0.4673434480956385, "learning_rate": 2.4586465293091443e-06, "loss": 0.0116, "step": 7280 }, { "epoch": 6.798319327731092, "grad_norm": 0.8007833317874706, "learning_rate": 2.457344406753899e-06, "loss": 0.0233, "step": 7281 }, { "epoch": 6.799253034547152, "grad_norm": 0.6849070004053809, "learning_rate": 2.4560425167524295e-06, "loss": 0.0281, "step": 7282 }, { "epoch": 6.800186741363212, "grad_norm": 0.8636408206931708, "learning_rate": 2.4547408594238113e-06, "loss": 0.0221, "step": 7283 }, { "epoch": 6.8011204481792715, "grad_norm": 1.378521045983001, "learning_rate": 2.453439434887094e-06, "loss": 0.0657, "step": 7284 }, { "epoch": 6.802054154995331, "grad_norm": 0.9404363957625264, "learning_rate": 2.452138243261307e-06, "loss": 0.0263, "step": 7285 }, { "epoch": 6.802987861811391, "grad_norm": 0.3601599850692462, "learning_rate": 2.4508372846654554e-06, "loss": 0.0039, "step": 7286 }, { "epoch": 6.803921568627451, "grad_norm": 1.092038522967593, "learning_rate": 2.4495365592185317e-06, "loss": 0.0386, "step": 7287 }, { "epoch": 6.8048552754435105, "grad_norm": 0.745860584421109, "learning_rate": 2.448236067039499e-06, "loss": 0.0281, "step": 7288 }, { "epoch": 6.80578898225957, "grad_norm": 1.6151494738427794, "learning_rate": 2.446935808247301e-06, "loss": 0.0961, "step": 7289 }, { "epoch": 6.80672268907563, "grad_norm": 0.7458838434448227, "learning_rate": 2.445635782960862e-06, "loss": 0.019, "step": 7290 }, { "epoch": 6.80765639589169, "grad_norm": 1.4189170758307337, "learning_rate": 2.4443359912990804e-06, "loss": 0.0634, "step": 7291 }, { "epoch": 6.80859010270775, "grad_norm": 0.9189187338661436, "learning_rate": 2.4430364333808405e-06, "loss": 0.019, "step": 7292 }, { "epoch": 6.809523809523809, "grad_norm": 0.6688550597081065, "learning_rate": 2.441737109325e-06, "loss": 0.0049, "step": 7293 }, { "epoch": 6.810457516339869, "grad_norm": 3.9485914477754114, "learning_rate": 2.440438019250395e-06, "loss": 0.2075, "step": 7294 }, { "epoch": 6.811391223155929, "grad_norm": 1.3770566131001207, "learning_rate": 2.439139163275841e-06, "loss": 0.0634, "step": 7295 }, { "epoch": 6.812324929971989, "grad_norm": 1.0975048083315275, "learning_rate": 2.4378405415201356e-06, "loss": 0.0268, "step": 7296 }, { "epoch": 6.813258636788048, "grad_norm": 1.054215197858087, "learning_rate": 2.4365421541020506e-06, "loss": 0.0128, "step": 7297 }, { "epoch": 6.814192343604108, "grad_norm": 0.9095814471539646, "learning_rate": 2.435244001140337e-06, "loss": 0.0243, "step": 7298 }, { "epoch": 6.815126050420168, "grad_norm": 2.109948367599155, "learning_rate": 2.433946082753725e-06, "loss": 0.0765, "step": 7299 }, { "epoch": 6.816059757236228, "grad_norm": 1.686804980189101, "learning_rate": 2.432648399060925e-06, "loss": 0.0797, "step": 7300 }, { "epoch": 6.816993464052287, "grad_norm": 0.044462969455806585, "learning_rate": 2.431350950180624e-06, "loss": 0.0001, "step": 7301 }, { "epoch": 6.817927170868347, "grad_norm": 1.3537190295372907, "learning_rate": 2.430053736231488e-06, "loss": 0.0435, "step": 7302 }, { "epoch": 6.818860877684407, "grad_norm": 3.520126818890725, "learning_rate": 2.428756757332159e-06, "loss": 0.0689, "step": 7303 }, { "epoch": 6.819794584500467, "grad_norm": 0.4273832344192805, "learning_rate": 2.427460013601264e-06, "loss": 0.0097, "step": 7304 }, { "epoch": 6.820728291316526, "grad_norm": 2.3707492648760327, "learning_rate": 2.426163505157402e-06, "loss": 0.1024, "step": 7305 }, { "epoch": 6.821661998132586, "grad_norm": 0.8671650675452287, "learning_rate": 2.4248672321191534e-06, "loss": 0.0249, "step": 7306 }, { "epoch": 6.822595704948646, "grad_norm": 0.6274576226588204, "learning_rate": 2.423571194605075e-06, "loss": 0.0258, "step": 7307 }, { "epoch": 6.823529411764706, "grad_norm": 2.0392657436093176, "learning_rate": 2.4222753927337073e-06, "loss": 0.0824, "step": 7308 }, { "epoch": 6.824463118580765, "grad_norm": 4.250236743339735, "learning_rate": 2.4209798266235633e-06, "loss": 0.07, "step": 7309 }, { "epoch": 6.825396825396825, "grad_norm": 1.3604654092717092, "learning_rate": 2.4196844963931355e-06, "loss": 0.0574, "step": 7310 }, { "epoch": 6.826330532212885, "grad_norm": 0.8444307526583946, "learning_rate": 2.418389402160899e-06, "loss": 0.0215, "step": 7311 }, { "epoch": 6.827264239028945, "grad_norm": 2.1559514454696784, "learning_rate": 2.417094544045303e-06, "loss": 0.0667, "step": 7312 }, { "epoch": 6.828197945845004, "grad_norm": 0.2181302176873207, "learning_rate": 2.4157999221647773e-06, "loss": 0.0019, "step": 7313 }, { "epoch": 6.829131652661064, "grad_norm": 0.2631856775179281, "learning_rate": 2.4145055366377257e-06, "loss": 0.0073, "step": 7314 }, { "epoch": 6.830065359477124, "grad_norm": 3.945373212942013, "learning_rate": 2.4132113875825392e-06, "loss": 0.1267, "step": 7315 }, { "epoch": 6.830999066293184, "grad_norm": 2.419978253202627, "learning_rate": 2.411917475117579e-06, "loss": 0.1025, "step": 7316 }, { "epoch": 6.831932773109243, "grad_norm": 3.1459128945216928, "learning_rate": 2.4106237993611866e-06, "loss": 0.107, "step": 7317 }, { "epoch": 6.832866479925303, "grad_norm": 1.9635109111800895, "learning_rate": 2.4093303604316865e-06, "loss": 0.0601, "step": 7318 }, { "epoch": 6.833800186741363, "grad_norm": 1.1655314735392557, "learning_rate": 2.408037158447375e-06, "loss": 0.0493, "step": 7319 }, { "epoch": 6.834733893557423, "grad_norm": 2.931878515820377, "learning_rate": 2.406744193526531e-06, "loss": 0.0988, "step": 7320 }, { "epoch": 6.835667600373482, "grad_norm": 3.8864009576511025, "learning_rate": 2.4054514657874074e-06, "loss": 0.1971, "step": 7321 }, { "epoch": 6.836601307189542, "grad_norm": 2.4997725277611007, "learning_rate": 2.404158975348243e-06, "loss": 0.1252, "step": 7322 }, { "epoch": 6.837535014005602, "grad_norm": 1.53777961035945, "learning_rate": 2.402866722327248e-06, "loss": 0.0509, "step": 7323 }, { "epoch": 6.838468720821662, "grad_norm": 2.989954923818438, "learning_rate": 2.4015747068426127e-06, "loss": 0.0806, "step": 7324 }, { "epoch": 6.839402427637721, "grad_norm": 0.9394820264906997, "learning_rate": 2.4002829290125047e-06, "loss": 0.0248, "step": 7325 }, { "epoch": 6.840336134453781, "grad_norm": 2.6379398851774325, "learning_rate": 2.398991388955074e-06, "loss": 0.0797, "step": 7326 }, { "epoch": 6.841269841269841, "grad_norm": 4.786033954474189, "learning_rate": 2.3977000867884462e-06, "loss": 0.1567, "step": 7327 }, { "epoch": 6.842203548085901, "grad_norm": 0.7628663702338083, "learning_rate": 2.396409022630724e-06, "loss": 0.0174, "step": 7328 }, { "epoch": 6.8431372549019605, "grad_norm": 2.9525251588920423, "learning_rate": 2.395118196599987e-06, "loss": 0.1064, "step": 7329 }, { "epoch": 6.84407096171802, "grad_norm": 2.4182560833982314, "learning_rate": 2.3938276088143003e-06, "loss": 0.1019, "step": 7330 }, { "epoch": 6.84500466853408, "grad_norm": 2.6745682486189066, "learning_rate": 2.3925372593916995e-06, "loss": 0.1233, "step": 7331 }, { "epoch": 6.84593837535014, "grad_norm": 1.6180781232120853, "learning_rate": 2.3912471484502014e-06, "loss": 0.0315, "step": 7332 }, { "epoch": 6.8468720821661995, "grad_norm": 0.5565822276179192, "learning_rate": 2.3899572761077994e-06, "loss": 0.0109, "step": 7333 }, { "epoch": 6.847805788982259, "grad_norm": 2.829014989515339, "learning_rate": 2.3886676424824702e-06, "loss": 0.1546, "step": 7334 }, { "epoch": 6.848739495798319, "grad_norm": 0.3058695372554035, "learning_rate": 2.387378247692162e-06, "loss": 0.004, "step": 7335 }, { "epoch": 6.849673202614379, "grad_norm": 0.9958537014196492, "learning_rate": 2.3860890918548056e-06, "loss": 0.0303, "step": 7336 }, { "epoch": 6.8506069094304385, "grad_norm": 2.1829345272389653, "learning_rate": 2.3848001750883078e-06, "loss": 0.099, "step": 7337 }, { "epoch": 6.851540616246498, "grad_norm": 1.3248466966018713, "learning_rate": 2.3835114975105523e-06, "loss": 0.0309, "step": 7338 }, { "epoch": 6.852474323062558, "grad_norm": 3.63745585789988, "learning_rate": 2.3822230592394064e-06, "loss": 0.1005, "step": 7339 }, { "epoch": 6.853408029878618, "grad_norm": 0.2965970953705578, "learning_rate": 2.38093486039271e-06, "loss": 0.0024, "step": 7340 }, { "epoch": 6.8543417366946775, "grad_norm": 0.5926345011133799, "learning_rate": 2.3796469010882835e-06, "loss": 0.0177, "step": 7341 }, { "epoch": 6.855275443510737, "grad_norm": 0.9543571804276357, "learning_rate": 2.3783591814439228e-06, "loss": 0.0253, "step": 7342 }, { "epoch": 6.856209150326797, "grad_norm": 0.5264773916527841, "learning_rate": 2.377071701577408e-06, "loss": 0.0187, "step": 7343 }, { "epoch": 6.857142857142857, "grad_norm": 2.6177525483830113, "learning_rate": 2.3757844616064913e-06, "loss": 0.1053, "step": 7344 }, { "epoch": 6.8580765639589165, "grad_norm": 0.6637436626600893, "learning_rate": 2.374497461648904e-06, "loss": 0.014, "step": 7345 }, { "epoch": 6.859010270774976, "grad_norm": 0.25601941812631385, "learning_rate": 2.373210701822356e-06, "loss": 0.0066, "step": 7346 }, { "epoch": 6.859943977591037, "grad_norm": 3.504714730940388, "learning_rate": 2.3719241822445388e-06, "loss": 0.067, "step": 7347 }, { "epoch": 6.860877684407097, "grad_norm": 3.4582521676251714, "learning_rate": 2.370637903033116e-06, "loss": 0.1425, "step": 7348 }, { "epoch": 6.861811391223156, "grad_norm": 0.3291743588126104, "learning_rate": 2.3693518643057336e-06, "loss": 0.0046, "step": 7349 }, { "epoch": 6.862745098039216, "grad_norm": 1.3063703250662009, "learning_rate": 2.3680660661800108e-06, "loss": 0.0457, "step": 7350 }, { "epoch": 6.863678804855276, "grad_norm": 1.3134130570980183, "learning_rate": 2.366780508773552e-06, "loss": 0.0391, "step": 7351 }, { "epoch": 6.864612511671336, "grad_norm": 2.6602665484323142, "learning_rate": 2.3654951922039337e-06, "loss": 0.077, "step": 7352 }, { "epoch": 6.865546218487395, "grad_norm": 3.461372124273406, "learning_rate": 2.3642101165887125e-06, "loss": 0.1324, "step": 7353 }, { "epoch": 6.866479925303455, "grad_norm": 1.3161314594919586, "learning_rate": 2.3629252820454224e-06, "loss": 0.0437, "step": 7354 }, { "epoch": 6.867413632119515, "grad_norm": 1.714009957292309, "learning_rate": 2.3616406886915738e-06, "loss": 0.0896, "step": 7355 }, { "epoch": 6.868347338935575, "grad_norm": 1.0559374131448844, "learning_rate": 2.360356336644661e-06, "loss": 0.0297, "step": 7356 }, { "epoch": 6.8692810457516345, "grad_norm": 2.263169081980917, "learning_rate": 2.3590722260221493e-06, "loss": 0.1149, "step": 7357 }, { "epoch": 6.870214752567694, "grad_norm": 3.1730025252820133, "learning_rate": 2.3577883569414854e-06, "loss": 0.0908, "step": 7358 }, { "epoch": 6.871148459383754, "grad_norm": 2.056396719749925, "learning_rate": 2.3565047295200917e-06, "loss": 0.0911, "step": 7359 }, { "epoch": 6.872082166199814, "grad_norm": 1.597408079946637, "learning_rate": 2.355221343875373e-06, "loss": 0.0239, "step": 7360 }, { "epoch": 6.8730158730158735, "grad_norm": 1.9103432400287412, "learning_rate": 2.3539382001247067e-06, "loss": 0.0808, "step": 7361 }, { "epoch": 6.873949579831933, "grad_norm": 2.4866643997828612, "learning_rate": 2.3526552983854523e-06, "loss": 0.0608, "step": 7362 }, { "epoch": 6.874883286647993, "grad_norm": 4.0701429808433796, "learning_rate": 2.351372638774942e-06, "loss": 0.1685, "step": 7363 }, { "epoch": 6.875816993464053, "grad_norm": 2.380397527300089, "learning_rate": 2.3500902214104927e-06, "loss": 0.0932, "step": 7364 }, { "epoch": 6.8767507002801125, "grad_norm": 1.5358924609547535, "learning_rate": 2.3488080464093943e-06, "loss": 0.0567, "step": 7365 }, { "epoch": 6.877684407096172, "grad_norm": 2.1182960039684082, "learning_rate": 2.347526113888915e-06, "loss": 0.1087, "step": 7366 }, { "epoch": 6.878618113912232, "grad_norm": 1.941557041809754, "learning_rate": 2.3462444239663007e-06, "loss": 0.0855, "step": 7367 }, { "epoch": 6.879551820728292, "grad_norm": 1.4579307718508339, "learning_rate": 2.3449629767587797e-06, "loss": 0.055, "step": 7368 }, { "epoch": 6.8804855275443515, "grad_norm": 2.484675423776065, "learning_rate": 2.3436817723835514e-06, "loss": 0.1016, "step": 7369 }, { "epoch": 6.881419234360411, "grad_norm": 1.628405259505894, "learning_rate": 2.3424008109577973e-06, "loss": 0.068, "step": 7370 }, { "epoch": 6.882352941176471, "grad_norm": 0.4623216935158272, "learning_rate": 2.341120092598675e-06, "loss": 0.0063, "step": 7371 }, { "epoch": 6.883286647992531, "grad_norm": 3.4266773339973557, "learning_rate": 2.339839617423318e-06, "loss": 0.1928, "step": 7372 }, { "epoch": 6.8842203548085905, "grad_norm": 2.150855506266373, "learning_rate": 2.3385593855488438e-06, "loss": 0.1014, "step": 7373 }, { "epoch": 6.88515406162465, "grad_norm": 0.3314198182490549, "learning_rate": 2.337279397092342e-06, "loss": 0.0094, "step": 7374 }, { "epoch": 6.88608776844071, "grad_norm": 2.7109673108387207, "learning_rate": 2.335999652170881e-06, "loss": 0.1216, "step": 7375 }, { "epoch": 6.88702147525677, "grad_norm": 0.5841954835491193, "learning_rate": 2.334720150901506e-06, "loss": 0.0132, "step": 7376 }, { "epoch": 6.8879551820728295, "grad_norm": 0.2623102631269735, "learning_rate": 2.333440893401246e-06, "loss": 0.0068, "step": 7377 }, { "epoch": 6.888888888888889, "grad_norm": 3.1426844029507945, "learning_rate": 2.3321618797871e-06, "loss": 0.2088, "step": 7378 }, { "epoch": 6.889822595704949, "grad_norm": 0.40676385358868217, "learning_rate": 2.330883110176049e-06, "loss": 0.01, "step": 7379 }, { "epoch": 6.890756302521009, "grad_norm": 0.8014231856520093, "learning_rate": 2.329604584685047e-06, "loss": 0.0212, "step": 7380 }, { "epoch": 6.8916900093370685, "grad_norm": 4.277715740424164, "learning_rate": 2.3283263034310343e-06, "loss": 0.2288, "step": 7381 }, { "epoch": 6.892623716153128, "grad_norm": 1.2294571023325438, "learning_rate": 2.327048266530922e-06, "loss": 0.0651, "step": 7382 }, { "epoch": 6.893557422969188, "grad_norm": 1.4237400388026633, "learning_rate": 2.3257704741016e-06, "loss": 0.0462, "step": 7383 }, { "epoch": 6.894491129785248, "grad_norm": 7.080004890446216, "learning_rate": 2.324492926259935e-06, "loss": 0.1072, "step": 7384 }, { "epoch": 6.895424836601308, "grad_norm": 1.1609057325882113, "learning_rate": 2.323215623122776e-06, "loss": 0.0411, "step": 7385 }, { "epoch": 6.896358543417367, "grad_norm": 0.9649518443441444, "learning_rate": 2.3219385648069444e-06, "loss": 0.0403, "step": 7386 }, { "epoch": 6.897292250233427, "grad_norm": 1.9480518776339917, "learning_rate": 2.3206617514292402e-06, "loss": 0.0925, "step": 7387 }, { "epoch": 6.898225957049487, "grad_norm": 0.6602964318548836, "learning_rate": 2.3193851831064453e-06, "loss": 0.0156, "step": 7388 }, { "epoch": 6.899159663865547, "grad_norm": 0.49404495312711577, "learning_rate": 2.3181088599553138e-06, "loss": 0.0156, "step": 7389 }, { "epoch": 6.900093370681606, "grad_norm": 1.4942292544152405, "learning_rate": 2.316832782092578e-06, "loss": 0.0454, "step": 7390 }, { "epoch": 6.901027077497666, "grad_norm": 1.5806313941552492, "learning_rate": 2.3155569496349523e-06, "loss": 0.0372, "step": 7391 }, { "epoch": 6.901960784313726, "grad_norm": 2.824493086679444, "learning_rate": 2.3142813626991244e-06, "loss": 0.1734, "step": 7392 }, { "epoch": 6.902894491129786, "grad_norm": 6.379111848043087, "learning_rate": 2.3130060214017607e-06, "loss": 0.1873, "step": 7393 }, { "epoch": 6.903828197945845, "grad_norm": 0.8717510703801277, "learning_rate": 2.3117309258595035e-06, "loss": 0.0301, "step": 7394 }, { "epoch": 6.904761904761905, "grad_norm": 0.9639368142764302, "learning_rate": 2.3104560761889773e-06, "loss": 0.0218, "step": 7395 }, { "epoch": 6.905695611577965, "grad_norm": 1.0737065732978488, "learning_rate": 2.3091814725067798e-06, "loss": 0.0113, "step": 7396 }, { "epoch": 6.906629318394025, "grad_norm": 1.9827227425993386, "learning_rate": 2.3079071149294845e-06, "loss": 0.1131, "step": 7397 }, { "epoch": 6.907563025210084, "grad_norm": 0.8334311505761947, "learning_rate": 2.306633003573651e-06, "loss": 0.0282, "step": 7398 }, { "epoch": 6.908496732026144, "grad_norm": 2.425915797279564, "learning_rate": 2.3053591385558075e-06, "loss": 0.1211, "step": 7399 }, { "epoch": 6.909430438842204, "grad_norm": 0.5800811780979219, "learning_rate": 2.304085519992464e-06, "loss": 0.014, "step": 7400 }, { "epoch": 6.910364145658264, "grad_norm": 0.8820148247637074, "learning_rate": 2.3028121480001044e-06, "loss": 0.0422, "step": 7401 }, { "epoch": 6.911297852474323, "grad_norm": 1.3790244463880479, "learning_rate": 2.3015390226951957e-06, "loss": 0.0583, "step": 7402 }, { "epoch": 6.912231559290383, "grad_norm": 3.3946624518203024, "learning_rate": 2.3002661441941783e-06, "loss": 0.1409, "step": 7403 }, { "epoch": 6.913165266106443, "grad_norm": 1.2338889803018736, "learning_rate": 2.2989935126134705e-06, "loss": 0.0525, "step": 7404 }, { "epoch": 6.914098972922503, "grad_norm": 0.5028368835708013, "learning_rate": 2.297721128069468e-06, "loss": 0.006, "step": 7405 }, { "epoch": 6.915032679738562, "grad_norm": 0.30147718565109227, "learning_rate": 2.2964489906785443e-06, "loss": 0.0051, "step": 7406 }, { "epoch": 6.915966386554622, "grad_norm": 1.385936495856589, "learning_rate": 2.2951771005570516e-06, "loss": 0.038, "step": 7407 }, { "epoch": 6.916900093370682, "grad_norm": 2.3702768728886596, "learning_rate": 2.2939054578213183e-06, "loss": 0.0915, "step": 7408 }, { "epoch": 6.917833800186742, "grad_norm": 1.6317607163364563, "learning_rate": 2.2926340625876486e-06, "loss": 0.0491, "step": 7409 }, { "epoch": 6.918767507002801, "grad_norm": 1.4917822499450277, "learning_rate": 2.291362914972325e-06, "loss": 0.0552, "step": 7410 }, { "epoch": 6.919701213818861, "grad_norm": 3.8455669801315664, "learning_rate": 2.2900920150916107e-06, "loss": 0.1921, "step": 7411 }, { "epoch": 6.920634920634921, "grad_norm": 1.1826630783367873, "learning_rate": 2.2888213630617416e-06, "loss": 0.0422, "step": 7412 }, { "epoch": 6.921568627450981, "grad_norm": 1.5999739744166805, "learning_rate": 2.2875509589989336e-06, "loss": 0.0806, "step": 7413 }, { "epoch": 6.92250233426704, "grad_norm": 4.482376443733875, "learning_rate": 2.2862808030193763e-06, "loss": 0.1286, "step": 7414 }, { "epoch": 6.9234360410831, "grad_norm": 0.4623901303252067, "learning_rate": 2.285010895239244e-06, "loss": 0.0084, "step": 7415 }, { "epoch": 6.92436974789916, "grad_norm": 4.682462709386527, "learning_rate": 2.2837412357746815e-06, "loss": 0.0751, "step": 7416 }, { "epoch": 6.92530345471522, "grad_norm": 1.696305677063978, "learning_rate": 2.2824718247418124e-06, "loss": 0.0628, "step": 7417 }, { "epoch": 6.926237161531279, "grad_norm": 1.4468156131777534, "learning_rate": 2.2812026622567374e-06, "loss": 0.044, "step": 7418 }, { "epoch": 6.927170868347339, "grad_norm": 1.5738013679840348, "learning_rate": 2.279933748435539e-06, "loss": 0.0765, "step": 7419 }, { "epoch": 6.928104575163399, "grad_norm": 1.4355141194978938, "learning_rate": 2.278665083394271e-06, "loss": 0.0366, "step": 7420 }, { "epoch": 6.929038281979459, "grad_norm": 5.388111976731552, "learning_rate": 2.2773966672489666e-06, "loss": 0.2202, "step": 7421 }, { "epoch": 6.9299719887955185, "grad_norm": 1.1196602887627127, "learning_rate": 2.276128500115637e-06, "loss": 0.0313, "step": 7422 }, { "epoch": 6.930905695611578, "grad_norm": 0.4678293777220299, "learning_rate": 2.274860582110268e-06, "loss": 0.0021, "step": 7423 }, { "epoch": 6.931839402427638, "grad_norm": 0.5403078876645834, "learning_rate": 2.273592913348828e-06, "loss": 0.0176, "step": 7424 }, { "epoch": 6.932773109243698, "grad_norm": 0.4145995626500601, "learning_rate": 2.272325493947257e-06, "loss": 0.0048, "step": 7425 }, { "epoch": 6.9337068160597575, "grad_norm": 3.0551380172632268, "learning_rate": 2.2710583240214763e-06, "loss": 0.147, "step": 7426 }, { "epoch": 6.934640522875817, "grad_norm": 1.2162852745271089, "learning_rate": 2.269791403687378e-06, "loss": 0.0486, "step": 7427 }, { "epoch": 6.935574229691877, "grad_norm": 3.242405691333488, "learning_rate": 2.2685247330608417e-06, "loss": 0.0655, "step": 7428 }, { "epoch": 6.936507936507937, "grad_norm": 2.952446645006514, "learning_rate": 2.2672583122577156e-06, "loss": 0.1047, "step": 7429 }, { "epoch": 6.9374416433239965, "grad_norm": 0.8598688108841979, "learning_rate": 2.265992141393828e-06, "loss": 0.0343, "step": 7430 }, { "epoch": 6.938375350140056, "grad_norm": 1.04415496051276, "learning_rate": 2.2647262205849816e-06, "loss": 0.0338, "step": 7431 }, { "epoch": 6.939309056956116, "grad_norm": 2.7894717927961277, "learning_rate": 2.2634605499469637e-06, "loss": 0.0773, "step": 7432 }, { "epoch": 6.940242763772176, "grad_norm": 2.4720464132177766, "learning_rate": 2.262195129595531e-06, "loss": 0.0447, "step": 7433 }, { "epoch": 6.9411764705882355, "grad_norm": 3.7928801958211578, "learning_rate": 2.26092995964642e-06, "loss": 0.2162, "step": 7434 }, { "epoch": 6.942110177404295, "grad_norm": 1.2352062887089095, "learning_rate": 2.2596650402153437e-06, "loss": 0.0394, "step": 7435 }, { "epoch": 6.943043884220355, "grad_norm": 1.202868603190726, "learning_rate": 2.258400371417995e-06, "loss": 0.0264, "step": 7436 }, { "epoch": 6.943977591036415, "grad_norm": 2.345094833954222, "learning_rate": 2.257135953370041e-06, "loss": 0.1205, "step": 7437 }, { "epoch": 6.9449112978524745, "grad_norm": 1.0738327009230648, "learning_rate": 2.2558717861871267e-06, "loss": 0.0466, "step": 7438 }, { "epoch": 6.945845004668534, "grad_norm": 1.5995462250044115, "learning_rate": 2.254607869984873e-06, "loss": 0.0463, "step": 7439 }, { "epoch": 6.946778711484594, "grad_norm": 1.6603216860406478, "learning_rate": 2.253344204878878e-06, "loss": 0.0778, "step": 7440 }, { "epoch": 6.947712418300654, "grad_norm": 3.868816083085827, "learning_rate": 2.2520807909847215e-06, "loss": 0.2111, "step": 7441 }, { "epoch": 6.9486461251167135, "grad_norm": 1.7374456584869886, "learning_rate": 2.250817628417954e-06, "loss": 0.081, "step": 7442 }, { "epoch": 6.949579831932773, "grad_norm": 2.386976490041582, "learning_rate": 2.2495547172941055e-06, "loss": 0.1244, "step": 7443 }, { "epoch": 6.950513538748833, "grad_norm": 1.8649482965146258, "learning_rate": 2.2482920577286826e-06, "loss": 0.0546, "step": 7444 }, { "epoch": 6.951447245564893, "grad_norm": 4.0008751482883484, "learning_rate": 2.247029649837172e-06, "loss": 0.1479, "step": 7445 }, { "epoch": 6.9523809523809526, "grad_norm": 2.0351709184658064, "learning_rate": 2.2457674937350326e-06, "loss": 0.0495, "step": 7446 }, { "epoch": 6.953314659197012, "grad_norm": 1.2712567439676425, "learning_rate": 2.244505589537704e-06, "loss": 0.041, "step": 7447 }, { "epoch": 6.954248366013072, "grad_norm": 2.487358713488361, "learning_rate": 2.2432439373605976e-06, "loss": 0.095, "step": 7448 }, { "epoch": 6.955182072829132, "grad_norm": 2.079903535114247, "learning_rate": 2.2419825373191095e-06, "loss": 0.0685, "step": 7449 }, { "epoch": 6.956115779645192, "grad_norm": 1.3649386912490016, "learning_rate": 2.2407213895286074e-06, "loss": 0.0324, "step": 7450 }, { "epoch": 6.957049486461251, "grad_norm": 5.313175918408515, "learning_rate": 2.2394604941044366e-06, "loss": 0.0791, "step": 7451 }, { "epoch": 6.957983193277311, "grad_norm": 0.6306033769988466, "learning_rate": 2.2381998511619184e-06, "loss": 0.0147, "step": 7452 }, { "epoch": 6.958916900093371, "grad_norm": 1.314217474657006, "learning_rate": 2.2369394608163557e-06, "loss": 0.0333, "step": 7453 }, { "epoch": 6.959850606909431, "grad_norm": 0.8412329712344994, "learning_rate": 2.235679323183023e-06, "loss": 0.0257, "step": 7454 }, { "epoch": 6.96078431372549, "grad_norm": 2.716021243278584, "learning_rate": 2.2344194383771746e-06, "loss": 0.1361, "step": 7455 }, { "epoch": 6.96171802054155, "grad_norm": 2.3630169969012442, "learning_rate": 2.23315980651404e-06, "loss": 0.0804, "step": 7456 }, { "epoch": 6.96265172735761, "grad_norm": 1.02720992105025, "learning_rate": 2.231900427708825e-06, "loss": 0.0022, "step": 7457 }, { "epoch": 6.96358543417367, "grad_norm": 7.221339137028436, "learning_rate": 2.2306413020767166e-06, "loss": 0.0706, "step": 7458 }, { "epoch": 6.964519140989729, "grad_norm": 1.8230443315747196, "learning_rate": 2.2293824297328744e-06, "loss": 0.0548, "step": 7459 }, { "epoch": 6.965452847805789, "grad_norm": 5.82321935636311, "learning_rate": 2.228123810792434e-06, "loss": 0.1561, "step": 7460 }, { "epoch": 6.966386554621849, "grad_norm": 1.1611650015640886, "learning_rate": 2.226865445370514e-06, "loss": 0.0304, "step": 7461 }, { "epoch": 6.967320261437909, "grad_norm": 1.4630791205664626, "learning_rate": 2.2256073335822036e-06, "loss": 0.0488, "step": 7462 }, { "epoch": 6.968253968253968, "grad_norm": 0.8254283355989447, "learning_rate": 2.2243494755425717e-06, "loss": 0.0282, "step": 7463 }, { "epoch": 6.969187675070028, "grad_norm": 0.9108257935931144, "learning_rate": 2.22309187136666e-06, "loss": 0.0261, "step": 7464 }, { "epoch": 6.970121381886088, "grad_norm": 0.5537411194298164, "learning_rate": 2.221834521169495e-06, "loss": 0.0128, "step": 7465 }, { "epoch": 6.971055088702148, "grad_norm": 4.971826800155923, "learning_rate": 2.220577425066072e-06, "loss": 0.1852, "step": 7466 }, { "epoch": 6.971988795518207, "grad_norm": 1.0661242672337339, "learning_rate": 2.2193205831713665e-06, "loss": 0.0341, "step": 7467 }, { "epoch": 6.972922502334267, "grad_norm": 0.6475375883549537, "learning_rate": 2.2180639956003324e-06, "loss": 0.0163, "step": 7468 }, { "epoch": 6.973856209150327, "grad_norm": 0.9141473736396576, "learning_rate": 2.2168076624678973e-06, "loss": 0.021, "step": 7469 }, { "epoch": 6.974789915966387, "grad_norm": 1.1996675425933565, "learning_rate": 2.215551583888965e-06, "loss": 0.0359, "step": 7470 }, { "epoch": 6.975723622782446, "grad_norm": 1.8906375342774147, "learning_rate": 2.214295759978421e-06, "loss": 0.0115, "step": 7471 }, { "epoch": 6.976657329598506, "grad_norm": 2.4083456146281166, "learning_rate": 2.213040190851122e-06, "loss": 0.0635, "step": 7472 }, { "epoch": 6.977591036414566, "grad_norm": 3.019931531069004, "learning_rate": 2.2117848766219046e-06, "loss": 0.1019, "step": 7473 }, { "epoch": 6.978524743230626, "grad_norm": 2.6543742574678437, "learning_rate": 2.2105298174055783e-06, "loss": 0.0217, "step": 7474 }, { "epoch": 6.979458450046685, "grad_norm": 5.189808364079272, "learning_rate": 2.2092750133169365e-06, "loss": 0.0891, "step": 7475 }, { "epoch": 6.980392156862745, "grad_norm": 0.4490150144792034, "learning_rate": 2.2080204644707428e-06, "loss": 0.0117, "step": 7476 }, { "epoch": 6.981325863678805, "grad_norm": 1.1394731535322782, "learning_rate": 2.2067661709817384e-06, "loss": 0.044, "step": 7477 }, { "epoch": 6.982259570494865, "grad_norm": 0.906719286619212, "learning_rate": 2.2055121329646416e-06, "loss": 0.0264, "step": 7478 }, { "epoch": 6.983193277310924, "grad_norm": 2.2814374384771083, "learning_rate": 2.204258350534151e-06, "loss": 0.1016, "step": 7479 }, { "epoch": 6.984126984126984, "grad_norm": 0.2974444851476265, "learning_rate": 2.2030048238049373e-06, "loss": 0.0085, "step": 7480 }, { "epoch": 6.985060690943044, "grad_norm": 4.30182785993737, "learning_rate": 2.2017515528916494e-06, "loss": 0.1786, "step": 7481 }, { "epoch": 6.985994397759104, "grad_norm": 2.4648526849904386, "learning_rate": 2.2004985379089104e-06, "loss": 0.1173, "step": 7482 }, { "epoch": 6.9869281045751634, "grad_norm": 2.589789918917203, "learning_rate": 2.199245778971326e-06, "loss": 0.09, "step": 7483 }, { "epoch": 6.987861811391223, "grad_norm": 0.6317822666633324, "learning_rate": 2.1979932761934735e-06, "loss": 0.0261, "step": 7484 }, { "epoch": 6.988795518207283, "grad_norm": 2.4238160276433653, "learning_rate": 2.1967410296899072e-06, "loss": 0.0719, "step": 7485 }, { "epoch": 6.989729225023343, "grad_norm": 2.0850506425659363, "learning_rate": 2.1954890395751592e-06, "loss": 0.0407, "step": 7486 }, { "epoch": 6.9906629318394025, "grad_norm": 0.7732206132751125, "learning_rate": 2.194237305963736e-06, "loss": 0.0156, "step": 7487 }, { "epoch": 6.991596638655462, "grad_norm": 2.9075579814836954, "learning_rate": 2.192985828970126e-06, "loss": 0.1506, "step": 7488 }, { "epoch": 6.992530345471522, "grad_norm": 0.8741581171729457, "learning_rate": 2.1917346087087894e-06, "loss": 0.0182, "step": 7489 }, { "epoch": 6.993464052287582, "grad_norm": 2.0806914204792855, "learning_rate": 2.1904836452941626e-06, "loss": 0.0569, "step": 7490 }, { "epoch": 6.9943977591036415, "grad_norm": 1.7854975222025278, "learning_rate": 2.1892329388406586e-06, "loss": 0.0794, "step": 7491 }, { "epoch": 6.995331465919701, "grad_norm": 2.1923925372524327, "learning_rate": 2.1879824894626722e-06, "loss": 0.1079, "step": 7492 }, { "epoch": 6.996265172735761, "grad_norm": 2.0665996941801676, "learning_rate": 2.186732297274569e-06, "loss": 0.0506, "step": 7493 }, { "epoch": 6.997198879551821, "grad_norm": 1.119986766734451, "learning_rate": 2.185482362390693e-06, "loss": 0.0318, "step": 7494 }, { "epoch": 6.9981325863678805, "grad_norm": 0.3998779484949793, "learning_rate": 2.1842326849253616e-06, "loss": 0.0095, "step": 7495 }, { "epoch": 6.99906629318394, "grad_norm": 2.8234251535381403, "learning_rate": 2.1829832649928763e-06, "loss": 0.1382, "step": 7496 }, { "epoch": 7.0, "grad_norm": 5.478816344752883, "learning_rate": 2.181734102707507e-06, "loss": 0.2257, "step": 7497 }, { "epoch": 7.00093370681606, "grad_norm": 4.508440021284837, "learning_rate": 2.180485198183506e-06, "loss": 0.1609, "step": 7498 }, { "epoch": 7.0018674136321195, "grad_norm": 0.759054452375803, "learning_rate": 2.1792365515350944e-06, "loss": 0.025, "step": 7499 }, { "epoch": 7.002801120448179, "grad_norm": 1.3634919505122696, "learning_rate": 2.1779881628764802e-06, "loss": 0.0435, "step": 7500 }, { "epoch": 7.003734827264239, "grad_norm": 0.5749234963553952, "learning_rate": 2.1767400323218397e-06, "loss": 0.0156, "step": 7501 }, { "epoch": 7.004668534080299, "grad_norm": 1.2972300731578377, "learning_rate": 2.1754921599853287e-06, "loss": 0.044, "step": 7502 }, { "epoch": 7.0056022408963585, "grad_norm": 1.9951991163293925, "learning_rate": 2.1742445459810784e-06, "loss": 0.075, "step": 7503 }, { "epoch": 7.006535947712418, "grad_norm": 1.3786756999835592, "learning_rate": 2.172997190423195e-06, "loss": 0.0325, "step": 7504 }, { "epoch": 7.007469654528478, "grad_norm": 0.39233031838318755, "learning_rate": 2.171750093425767e-06, "loss": 0.0064, "step": 7505 }, { "epoch": 7.008403361344538, "grad_norm": 1.671858969666846, "learning_rate": 2.170503255102852e-06, "loss": 0.0774, "step": 7506 }, { "epoch": 7.0093370681605975, "grad_norm": 0.6778390355693978, "learning_rate": 2.1692566755684885e-06, "loss": 0.0055, "step": 7507 }, { "epoch": 7.010270774976657, "grad_norm": 0.46577314881828247, "learning_rate": 2.1680103549366875e-06, "loss": 0.0123, "step": 7508 }, { "epoch": 7.011204481792717, "grad_norm": 1.5329766347914608, "learning_rate": 2.1667642933214428e-06, "loss": 0.0785, "step": 7509 }, { "epoch": 7.012138188608777, "grad_norm": 4.105593598359828, "learning_rate": 2.1655184908367176e-06, "loss": 0.1611, "step": 7510 }, { "epoch": 7.0130718954248366, "grad_norm": 0.8051694813395212, "learning_rate": 2.1642729475964548e-06, "loss": 0.0156, "step": 7511 }, { "epoch": 7.014005602240896, "grad_norm": 2.9132133627372574, "learning_rate": 2.1630276637145713e-06, "loss": 0.0442, "step": 7512 }, { "epoch": 7.014939309056956, "grad_norm": 0.2562895410337112, "learning_rate": 2.1617826393049653e-06, "loss": 0.0079, "step": 7513 }, { "epoch": 7.015873015873016, "grad_norm": 1.1661768724426333, "learning_rate": 2.1605378744815063e-06, "loss": 0.0303, "step": 7514 }, { "epoch": 7.016806722689076, "grad_norm": 2.7222718066905682, "learning_rate": 2.159293369358042e-06, "loss": 0.1635, "step": 7515 }, { "epoch": 7.017740429505135, "grad_norm": 3.1220180281294923, "learning_rate": 2.158049124048394e-06, "loss": 0.0934, "step": 7516 }, { "epoch": 7.018674136321195, "grad_norm": 0.06604376089132398, "learning_rate": 2.156805138666366e-06, "loss": 0.0002, "step": 7517 }, { "epoch": 7.019607843137255, "grad_norm": 0.35757599300119747, "learning_rate": 2.155561413325732e-06, "loss": 0.0061, "step": 7518 }, { "epoch": 7.020541549953315, "grad_norm": 2.688261731214183, "learning_rate": 2.1543179481402443e-06, "loss": 0.093, "step": 7519 }, { "epoch": 7.021475256769374, "grad_norm": 1.5394793549002503, "learning_rate": 2.1530747432236317e-06, "loss": 0.0597, "step": 7520 }, { "epoch": 7.022408963585434, "grad_norm": 1.4142278079550097, "learning_rate": 2.1518317986895964e-06, "loss": 0.0602, "step": 7521 }, { "epoch": 7.023342670401494, "grad_norm": 1.2731630283291533, "learning_rate": 2.1505891146518244e-06, "loss": 0.0297, "step": 7522 }, { "epoch": 7.024276377217554, "grad_norm": 1.0535693021504873, "learning_rate": 2.1493466912239703e-06, "loss": 0.0271, "step": 7523 }, { "epoch": 7.025210084033613, "grad_norm": 0.4367467709324637, "learning_rate": 2.1481045285196667e-06, "loss": 0.0107, "step": 7524 }, { "epoch": 7.026143790849673, "grad_norm": 5.2950658815848675, "learning_rate": 2.1468626266525223e-06, "loss": 0.1406, "step": 7525 }, { "epoch": 7.027077497665733, "grad_norm": 0.07017808096045829, "learning_rate": 2.145620985736125e-06, "loss": 0.0003, "step": 7526 }, { "epoch": 7.028011204481793, "grad_norm": 4.168865782654326, "learning_rate": 2.144379605884036e-06, "loss": 0.1797, "step": 7527 }, { "epoch": 7.028944911297852, "grad_norm": 0.37199117483320415, "learning_rate": 2.1431384872097925e-06, "loss": 0.0042, "step": 7528 }, { "epoch": 7.029878618113912, "grad_norm": 3.188218076333088, "learning_rate": 2.1418976298269066e-06, "loss": 0.1695, "step": 7529 }, { "epoch": 7.030812324929972, "grad_norm": 1.6393217937147586, "learning_rate": 2.140657033848872e-06, "loss": 0.0408, "step": 7530 }, { "epoch": 7.031746031746032, "grad_norm": 2.058058070808273, "learning_rate": 2.139416699389153e-06, "loss": 0.0991, "step": 7531 }, { "epoch": 7.032679738562091, "grad_norm": 1.9327264330382166, "learning_rate": 2.1381766265611918e-06, "loss": 0.0569, "step": 7532 }, { "epoch": 7.033613445378151, "grad_norm": 1.6680141980817327, "learning_rate": 2.1369368154784055e-06, "loss": 0.0257, "step": 7533 }, { "epoch": 7.034547152194211, "grad_norm": 1.3377224814478232, "learning_rate": 2.135697266254191e-06, "loss": 0.0454, "step": 7534 }, { "epoch": 7.035480859010271, "grad_norm": 1.2533757426021792, "learning_rate": 2.134457979001917e-06, "loss": 0.0373, "step": 7535 }, { "epoch": 7.03641456582633, "grad_norm": 1.1909505523149415, "learning_rate": 2.133218953834931e-06, "loss": 0.0193, "step": 7536 }, { "epoch": 7.03734827264239, "grad_norm": 2.8040619692063413, "learning_rate": 2.131980190866553e-06, "loss": 0.0995, "step": 7537 }, { "epoch": 7.03828197945845, "grad_norm": 1.1390798584000101, "learning_rate": 2.130741690210084e-06, "loss": 0.0382, "step": 7538 }, { "epoch": 7.03921568627451, "grad_norm": 0.829233857273554, "learning_rate": 2.1295034519787987e-06, "loss": 0.0167, "step": 7539 }, { "epoch": 7.040149393090569, "grad_norm": 0.4085462934800854, "learning_rate": 2.1282654762859445e-06, "loss": 0.0043, "step": 7540 }, { "epoch": 7.041083099906629, "grad_norm": 6.469059905539651, "learning_rate": 2.1270277632447524e-06, "loss": 0.2018, "step": 7541 }, { "epoch": 7.042016806722689, "grad_norm": 5.071843680278284, "learning_rate": 2.1257903129684217e-06, "loss": 0.1605, "step": 7542 }, { "epoch": 7.042950513538749, "grad_norm": 1.042554338139492, "learning_rate": 2.124553125570131e-06, "loss": 0.022, "step": 7543 }, { "epoch": 7.043884220354808, "grad_norm": 2.077771029433345, "learning_rate": 2.1233162011630342e-06, "loss": 0.0315, "step": 7544 }, { "epoch": 7.044817927170868, "grad_norm": 2.446140370288063, "learning_rate": 2.1220795398602644e-06, "loss": 0.0803, "step": 7545 }, { "epoch": 7.045751633986928, "grad_norm": 4.089606349119408, "learning_rate": 2.1208431417749265e-06, "loss": 0.0378, "step": 7546 }, { "epoch": 7.046685340802988, "grad_norm": 0.582554818955799, "learning_rate": 2.1196070070200998e-06, "loss": 0.0098, "step": 7547 }, { "epoch": 7.0476190476190474, "grad_norm": 1.7139240103281586, "learning_rate": 2.1183711357088466e-06, "loss": 0.0656, "step": 7548 }, { "epoch": 7.048552754435107, "grad_norm": 3.167718707509764, "learning_rate": 2.1171355279541994e-06, "loss": 0.0683, "step": 7549 }, { "epoch": 7.049486461251167, "grad_norm": 1.4671796199669669, "learning_rate": 2.115900183869166e-06, "loss": 0.0242, "step": 7550 }, { "epoch": 7.050420168067227, "grad_norm": 1.0493045969716233, "learning_rate": 2.1146651035667366e-06, "loss": 0.0223, "step": 7551 }, { "epoch": 7.0513538748832865, "grad_norm": 2.5825080024285723, "learning_rate": 2.1134302871598692e-06, "loss": 0.1275, "step": 7552 }, { "epoch": 7.052287581699346, "grad_norm": 2.0757747087756493, "learning_rate": 2.1121957347615036e-06, "loss": 0.067, "step": 7553 }, { "epoch": 7.053221288515406, "grad_norm": 1.9804454051251648, "learning_rate": 2.1109614464845514e-06, "loss": 0.0338, "step": 7554 }, { "epoch": 7.054154995331466, "grad_norm": 2.274863749154024, "learning_rate": 2.1097274224419e-06, "loss": 0.0031, "step": 7555 }, { "epoch": 7.0550887021475255, "grad_norm": 1.7594171413479494, "learning_rate": 2.1084936627464197e-06, "loss": 0.0808, "step": 7556 }, { "epoch": 7.056022408963585, "grad_norm": 1.416670021285085, "learning_rate": 2.107260167510948e-06, "loss": 0.0448, "step": 7557 }, { "epoch": 7.056956115779645, "grad_norm": 2.873852654081369, "learning_rate": 2.1060269368483024e-06, "loss": 0.0872, "step": 7558 }, { "epoch": 7.057889822595705, "grad_norm": 1.5641065627172308, "learning_rate": 2.1047939708712734e-06, "loss": 0.0354, "step": 7559 }, { "epoch": 7.0588235294117645, "grad_norm": 0.4306734352782171, "learning_rate": 2.103561269692632e-06, "loss": 0.0151, "step": 7560 }, { "epoch": 7.059757236227824, "grad_norm": 0.6648025131647557, "learning_rate": 2.1023288334251225e-06, "loss": 0.0182, "step": 7561 }, { "epoch": 7.060690943043884, "grad_norm": 0.25836750107456696, "learning_rate": 2.101096662181463e-06, "loss": 0.0074, "step": 7562 }, { "epoch": 7.061624649859944, "grad_norm": 3.901867730873942, "learning_rate": 2.0998647560743475e-06, "loss": 0.0574, "step": 7563 }, { "epoch": 7.0625583566760035, "grad_norm": 4.261719014619983, "learning_rate": 2.0986331152164513e-06, "loss": 0.0601, "step": 7564 }, { "epoch": 7.063492063492063, "grad_norm": 3.6221796769134382, "learning_rate": 2.09740173972042e-06, "loss": 0.1368, "step": 7565 }, { "epoch": 7.064425770308123, "grad_norm": 1.3588022831899769, "learning_rate": 2.0961706296988755e-06, "loss": 0.0373, "step": 7566 }, { "epoch": 7.065359477124183, "grad_norm": 3.1462816340897195, "learning_rate": 2.0949397852644156e-06, "loss": 0.185, "step": 7567 }, { "epoch": 7.0662931839402425, "grad_norm": 3.0587621347303107, "learning_rate": 2.093709206529617e-06, "loss": 0.0956, "step": 7568 }, { "epoch": 7.067226890756302, "grad_norm": 4.333174064154242, "learning_rate": 2.092478893607029e-06, "loss": 0.2072, "step": 7569 }, { "epoch": 7.068160597572362, "grad_norm": 1.2975770702451874, "learning_rate": 2.0912488466091763e-06, "loss": 0.0126, "step": 7570 }, { "epoch": 7.069094304388422, "grad_norm": 0.8576352046316706, "learning_rate": 2.0900190656485608e-06, "loss": 0.0326, "step": 7571 }, { "epoch": 7.0700280112044815, "grad_norm": 4.718237028116134, "learning_rate": 2.088789550837657e-06, "loss": 0.1323, "step": 7572 }, { "epoch": 7.070961718020541, "grad_norm": 1.5267391897941451, "learning_rate": 2.0875603022889224e-06, "loss": 0.0674, "step": 7573 }, { "epoch": 7.071895424836601, "grad_norm": 0.8177513456749017, "learning_rate": 2.0863313201147822e-06, "loss": 0.0136, "step": 7574 }, { "epoch": 7.072829131652661, "grad_norm": 1.1684146842645007, "learning_rate": 2.0851026044276405e-06, "loss": 0.0303, "step": 7575 }, { "epoch": 7.073762838468721, "grad_norm": 3.1909384226589332, "learning_rate": 2.0838741553398762e-06, "loss": 0.0417, "step": 7576 }, { "epoch": 7.07469654528478, "grad_norm": 1.5180827211761154, "learning_rate": 2.0826459729638465e-06, "loss": 0.0422, "step": 7577 }, { "epoch": 7.07563025210084, "grad_norm": 0.6022155645272651, "learning_rate": 2.081418057411881e-06, "loss": 0.0177, "step": 7578 }, { "epoch": 7.0765639589169, "grad_norm": 1.4440818928014367, "learning_rate": 2.080190408796286e-06, "loss": 0.0413, "step": 7579 }, { "epoch": 7.07749766573296, "grad_norm": 2.539971537022421, "learning_rate": 2.078963027229342e-06, "loss": 0.1645, "step": 7580 }, { "epoch": 7.078431372549019, "grad_norm": 1.1912388039445634, "learning_rate": 2.07773591282331e-06, "loss": 0.0284, "step": 7581 }, { "epoch": 7.079365079365079, "grad_norm": 1.1999964191276364, "learning_rate": 2.0765090656904213e-06, "loss": 0.0242, "step": 7582 }, { "epoch": 7.080298786181139, "grad_norm": 1.148491031095002, "learning_rate": 2.075282485942884e-06, "loss": 0.0579, "step": 7583 }, { "epoch": 7.081232492997199, "grad_norm": 0.590673544476861, "learning_rate": 2.074056173692881e-06, "loss": 0.0091, "step": 7584 }, { "epoch": 7.082166199813258, "grad_norm": 1.2760506110036014, "learning_rate": 2.072830129052576e-06, "loss": 0.0581, "step": 7585 }, { "epoch": 7.083099906629318, "grad_norm": 1.8753848942356495, "learning_rate": 2.0716043521341013e-06, "loss": 0.0614, "step": 7586 }, { "epoch": 7.084033613445378, "grad_norm": 0.36434077086054156, "learning_rate": 2.0703788430495685e-06, "loss": 0.0059, "step": 7587 }, { "epoch": 7.084967320261438, "grad_norm": 0.6660558197619647, "learning_rate": 2.069153601911064e-06, "loss": 0.0196, "step": 7588 }, { "epoch": 7.085901027077497, "grad_norm": 2.5803632896231123, "learning_rate": 2.0679286288306473e-06, "loss": 0.063, "step": 7589 }, { "epoch": 7.086834733893557, "grad_norm": 2.2639617025279004, "learning_rate": 2.0667039239203597e-06, "loss": 0.096, "step": 7590 }, { "epoch": 7.087768440709617, "grad_norm": 3.0036690551696754, "learning_rate": 2.0654794872922113e-06, "loss": 0.0827, "step": 7591 }, { "epoch": 7.088702147525677, "grad_norm": 0.4412152842003515, "learning_rate": 2.064255319058191e-06, "loss": 0.0107, "step": 7592 }, { "epoch": 7.089635854341736, "grad_norm": 1.4939032347205967, "learning_rate": 2.0630314193302604e-06, "loss": 0.0366, "step": 7593 }, { "epoch": 7.090569561157796, "grad_norm": 5.641067084721489, "learning_rate": 2.0618077882203617e-06, "loss": 0.1671, "step": 7594 }, { "epoch": 7.091503267973856, "grad_norm": 0.5406794875155985, "learning_rate": 2.060584425840409e-06, "loss": 0.011, "step": 7595 }, { "epoch": 7.092436974789916, "grad_norm": 1.7052416844464167, "learning_rate": 2.059361332302291e-06, "loss": 0.0649, "step": 7596 }, { "epoch": 7.093370681605975, "grad_norm": 1.5259391725573948, "learning_rate": 2.0581385077178713e-06, "loss": 0.0255, "step": 7597 }, { "epoch": 7.094304388422035, "grad_norm": 4.528171983494707, "learning_rate": 2.0569159521989947e-06, "loss": 0.1182, "step": 7598 }, { "epoch": 7.095238095238095, "grad_norm": 0.47374661679502716, "learning_rate": 2.055693665857475e-06, "loss": 0.0061, "step": 7599 }, { "epoch": 7.096171802054155, "grad_norm": 2.6124555380496015, "learning_rate": 2.054471648805104e-06, "loss": 0.0591, "step": 7600 }, { "epoch": 7.097105508870214, "grad_norm": 7.670238730946071, "learning_rate": 2.0532499011536473e-06, "loss": 0.1013, "step": 7601 }, { "epoch": 7.098039215686274, "grad_norm": 0.36510472885463857, "learning_rate": 2.05202842301485e-06, "loss": 0.0049, "step": 7602 }, { "epoch": 7.098972922502334, "grad_norm": 0.9799271377573245, "learning_rate": 2.0508072145004284e-06, "loss": 0.0209, "step": 7603 }, { "epoch": 7.099906629318394, "grad_norm": 0.7207647360508432, "learning_rate": 2.0495862757220745e-06, "loss": 0.0226, "step": 7604 }, { "epoch": 7.100840336134453, "grad_norm": 0.5294323388751913, "learning_rate": 2.0483656067914575e-06, "loss": 0.0041, "step": 7605 }, { "epoch": 7.101774042950513, "grad_norm": 0.377085963242153, "learning_rate": 2.0471452078202193e-06, "loss": 0.0104, "step": 7606 }, { "epoch": 7.102707749766573, "grad_norm": 3.5154844399134832, "learning_rate": 2.0459250789199814e-06, "loss": 0.1945, "step": 7607 }, { "epoch": 7.103641456582633, "grad_norm": 0.7900265595021103, "learning_rate": 2.044705220202337e-06, "loss": 0.0166, "step": 7608 }, { "epoch": 7.104575163398692, "grad_norm": 1.1522271900759307, "learning_rate": 2.0434856317788558e-06, "loss": 0.0196, "step": 7609 }, { "epoch": 7.105508870214752, "grad_norm": 1.7179527982630012, "learning_rate": 2.04226631376108e-06, "loss": 0.036, "step": 7610 }, { "epoch": 7.106442577030812, "grad_norm": 4.721794640353592, "learning_rate": 2.0410472662605335e-06, "loss": 0.1937, "step": 7611 }, { "epoch": 7.107376283846872, "grad_norm": 1.4226116756153262, "learning_rate": 2.03982848938871e-06, "loss": 0.0378, "step": 7612 }, { "epoch": 7.1083099906629315, "grad_norm": 3.394951461452538, "learning_rate": 2.0386099832570776e-06, "loss": 0.1124, "step": 7613 }, { "epoch": 7.109243697478991, "grad_norm": 1.522354032721145, "learning_rate": 2.037391747977086e-06, "loss": 0.0565, "step": 7614 }, { "epoch": 7.110177404295051, "grad_norm": 2.3701737992104794, "learning_rate": 2.0361737836601554e-06, "loss": 0.0506, "step": 7615 }, { "epoch": 7.111111111111111, "grad_norm": 1.2764502572690635, "learning_rate": 2.03495609041768e-06, "loss": 0.0328, "step": 7616 }, { "epoch": 7.1120448179271705, "grad_norm": 1.410303151527234, "learning_rate": 2.0337386683610315e-06, "loss": 0.061, "step": 7617 }, { "epoch": 7.11297852474323, "grad_norm": 0.3124955151166919, "learning_rate": 2.032521517601559e-06, "loss": 0.0077, "step": 7618 }, { "epoch": 7.11391223155929, "grad_norm": 0.9113844285959828, "learning_rate": 2.0313046382505824e-06, "loss": 0.0329, "step": 7619 }, { "epoch": 7.11484593837535, "grad_norm": 1.5754867920287483, "learning_rate": 2.030088030419397e-06, "loss": 0.059, "step": 7620 }, { "epoch": 7.1157796451914095, "grad_norm": 3.053038257416831, "learning_rate": 2.028871694219279e-06, "loss": 0.1073, "step": 7621 }, { "epoch": 7.116713352007469, "grad_norm": 1.05997675375165, "learning_rate": 2.0276556297614737e-06, "loss": 0.032, "step": 7622 }, { "epoch": 7.117647058823529, "grad_norm": 2.83526070183879, "learning_rate": 2.026439837157201e-06, "loss": 0.081, "step": 7623 }, { "epoch": 7.118580765639589, "grad_norm": 3.168627938532548, "learning_rate": 2.025224316517663e-06, "loss": 0.1098, "step": 7624 }, { "epoch": 7.1195144724556485, "grad_norm": 0.989335398319307, "learning_rate": 2.02400906795403e-06, "loss": 0.0211, "step": 7625 }, { "epoch": 7.120448179271708, "grad_norm": 0.0560371430272879, "learning_rate": 2.0227940915774514e-06, "loss": 0.0001, "step": 7626 }, { "epoch": 7.121381886087768, "grad_norm": 4.152138014671362, "learning_rate": 2.0215793874990465e-06, "loss": 0.1272, "step": 7627 }, { "epoch": 7.122315592903828, "grad_norm": 2.152681364784153, "learning_rate": 2.020364955829917e-06, "loss": 0.0521, "step": 7628 }, { "epoch": 7.1232492997198875, "grad_norm": 2.944724091920798, "learning_rate": 2.019150796681135e-06, "loss": 0.0524, "step": 7629 }, { "epoch": 7.124183006535947, "grad_norm": 2.3119578249186454, "learning_rate": 2.0179369101637485e-06, "loss": 0.072, "step": 7630 }, { "epoch": 7.125116713352007, "grad_norm": 4.346139058234086, "learning_rate": 2.016723296388779e-06, "loss": 0.1194, "step": 7631 }, { "epoch": 7.126050420168067, "grad_norm": 2.4934259804167063, "learning_rate": 2.015509955467228e-06, "loss": 0.1066, "step": 7632 }, { "epoch": 7.1269841269841265, "grad_norm": 4.279032620532999, "learning_rate": 2.0142968875100677e-06, "loss": 0.2141, "step": 7633 }, { "epoch": 7.127917833800186, "grad_norm": 5.046733849382585, "learning_rate": 2.0130840926282458e-06, "loss": 0.1485, "step": 7634 }, { "epoch": 7.128851540616246, "grad_norm": 0.9742169393856248, "learning_rate": 2.0118715709326864e-06, "loss": 0.042, "step": 7635 }, { "epoch": 7.129785247432307, "grad_norm": 1.8251823547638544, "learning_rate": 2.010659322534285e-06, "loss": 0.0327, "step": 7636 }, { "epoch": 7.130718954248366, "grad_norm": 1.7885674739611757, "learning_rate": 2.00944734754392e-06, "loss": 0.0404, "step": 7637 }, { "epoch": 7.131652661064426, "grad_norm": 2.1224291017072443, "learning_rate": 2.0082356460724374e-06, "loss": 0.0728, "step": 7638 }, { "epoch": 7.132586367880486, "grad_norm": 2.0694798193092145, "learning_rate": 2.0070242182306606e-06, "loss": 0.0738, "step": 7639 }, { "epoch": 7.133520074696546, "grad_norm": 1.5011096004477997, "learning_rate": 2.005813064129386e-06, "loss": 0.0417, "step": 7640 }, { "epoch": 7.1344537815126055, "grad_norm": 1.1516527423230778, "learning_rate": 2.004602183879391e-06, "loss": 0.0385, "step": 7641 }, { "epoch": 7.135387488328665, "grad_norm": 2.7402195011231827, "learning_rate": 2.003391577591422e-06, "loss": 0.1007, "step": 7642 }, { "epoch": 7.136321195144725, "grad_norm": 0.7172664698826501, "learning_rate": 2.0021812453762018e-06, "loss": 0.0116, "step": 7643 }, { "epoch": 7.137254901960785, "grad_norm": 1.2013331379832901, "learning_rate": 2.000971187344427e-06, "loss": 0.0461, "step": 7644 }, { "epoch": 7.1381886087768445, "grad_norm": 1.4116759641912697, "learning_rate": 1.9997614036067743e-06, "loss": 0.0544, "step": 7645 }, { "epoch": 7.139122315592904, "grad_norm": 4.689282675019913, "learning_rate": 1.9985518942738902e-06, "loss": 0.1032, "step": 7646 }, { "epoch": 7.140056022408964, "grad_norm": 1.3144502933062008, "learning_rate": 1.997342659456397e-06, "loss": 0.0502, "step": 7647 }, { "epoch": 7.140989729225024, "grad_norm": 0.6706154774990447, "learning_rate": 1.9961336992648915e-06, "loss": 0.0188, "step": 7648 }, { "epoch": 7.1419234360410835, "grad_norm": 2.3861479066640667, "learning_rate": 1.994925013809949e-06, "loss": 0.0901, "step": 7649 }, { "epoch": 7.142857142857143, "grad_norm": 1.7629499846299794, "learning_rate": 1.9937166032021167e-06, "loss": 0.0668, "step": 7650 }, { "epoch": 7.143790849673203, "grad_norm": 0.6915503444975478, "learning_rate": 1.992508467551915e-06, "loss": 0.0058, "step": 7651 }, { "epoch": 7.144724556489263, "grad_norm": 1.8199515881920028, "learning_rate": 1.991300606969843e-06, "loss": 0.091, "step": 7652 }, { "epoch": 7.1456582633053225, "grad_norm": 0.38297056983945266, "learning_rate": 1.9900930215663705e-06, "loss": 0.0114, "step": 7653 }, { "epoch": 7.146591970121382, "grad_norm": 1.0769020608588007, "learning_rate": 1.9888857114519476e-06, "loss": 0.0068, "step": 7654 }, { "epoch": 7.147525676937442, "grad_norm": 0.4931708205618241, "learning_rate": 1.987678676736995e-06, "loss": 0.0145, "step": 7655 }, { "epoch": 7.148459383753502, "grad_norm": 2.071673339691727, "learning_rate": 1.986471917531908e-06, "loss": 0.0921, "step": 7656 }, { "epoch": 7.1493930905695615, "grad_norm": 3.581332297679249, "learning_rate": 1.985265433947057e-06, "loss": 0.0592, "step": 7657 }, { "epoch": 7.150326797385621, "grad_norm": 1.9601047738482793, "learning_rate": 1.9840592260927928e-06, "loss": 0.0869, "step": 7658 }, { "epoch": 7.151260504201681, "grad_norm": 2.0608576408785093, "learning_rate": 1.9828532940794325e-06, "loss": 0.0932, "step": 7659 }, { "epoch": 7.152194211017741, "grad_norm": 0.42216852459730103, "learning_rate": 1.981647638017273e-06, "loss": 0.0063, "step": 7660 }, { "epoch": 7.1531279178338005, "grad_norm": 1.0046234783137054, "learning_rate": 1.980442258016584e-06, "loss": 0.0271, "step": 7661 }, { "epoch": 7.15406162464986, "grad_norm": 2.1080137251779925, "learning_rate": 1.9792371541876122e-06, "loss": 0.0774, "step": 7662 }, { "epoch": 7.15499533146592, "grad_norm": 1.5593344829102511, "learning_rate": 1.9780323266405764e-06, "loss": 0.0248, "step": 7663 }, { "epoch": 7.15592903828198, "grad_norm": 1.3725347874590483, "learning_rate": 1.9768277754856725e-06, "loss": 0.0369, "step": 7664 }, { "epoch": 7.1568627450980395, "grad_norm": 1.401868836327349, "learning_rate": 1.9756235008330665e-06, "loss": 0.0219, "step": 7665 }, { "epoch": 7.157796451914099, "grad_norm": 1.003815992392183, "learning_rate": 1.9744195027929075e-06, "loss": 0.0288, "step": 7666 }, { "epoch": 7.158730158730159, "grad_norm": 2.8594334845379046, "learning_rate": 1.973215781475312e-06, "loss": 0.104, "step": 7667 }, { "epoch": 7.159663865546219, "grad_norm": 1.8580244009986697, "learning_rate": 1.972012336990373e-06, "loss": 0.0558, "step": 7668 }, { "epoch": 7.160597572362279, "grad_norm": 0.29341823934842337, "learning_rate": 1.97080916944816e-06, "loss": 0.0054, "step": 7669 }, { "epoch": 7.161531279178338, "grad_norm": 1.4167976478660473, "learning_rate": 1.969606278958713e-06, "loss": 0.0361, "step": 7670 }, { "epoch": 7.162464985994398, "grad_norm": 2.2613994570205174, "learning_rate": 1.968403665632053e-06, "loss": 0.0602, "step": 7671 }, { "epoch": 7.163398692810458, "grad_norm": 2.7982359693208063, "learning_rate": 1.9672013295781716e-06, "loss": 0.0698, "step": 7672 }, { "epoch": 7.164332399626518, "grad_norm": 0.7615826016560632, "learning_rate": 1.9659992709070346e-06, "loss": 0.0142, "step": 7673 }, { "epoch": 7.165266106442577, "grad_norm": 4.092909491506603, "learning_rate": 1.964797489728582e-06, "loss": 0.0557, "step": 7674 }, { "epoch": 7.166199813258637, "grad_norm": 1.3803001132937918, "learning_rate": 1.9635959861527338e-06, "loss": 0.0461, "step": 7675 }, { "epoch": 7.167133520074697, "grad_norm": 2.3308168286638513, "learning_rate": 1.962394760289378e-06, "loss": 0.0915, "step": 7676 }, { "epoch": 7.168067226890757, "grad_norm": 1.426624891555715, "learning_rate": 1.961193812248381e-06, "loss": 0.0265, "step": 7677 }, { "epoch": 7.169000933706816, "grad_norm": 1.993957177347613, "learning_rate": 1.9599931421395798e-06, "loss": 0.0608, "step": 7678 }, { "epoch": 7.169934640522876, "grad_norm": 0.5473684560299167, "learning_rate": 1.958792750072793e-06, "loss": 0.023, "step": 7679 }, { "epoch": 7.170868347338936, "grad_norm": 1.0869883979686468, "learning_rate": 1.957592636157808e-06, "loss": 0.0416, "step": 7680 }, { "epoch": 7.171802054154996, "grad_norm": 0.5290008177085588, "learning_rate": 1.956392800504388e-06, "loss": 0.01, "step": 7681 }, { "epoch": 7.172735760971055, "grad_norm": 2.3335267825087187, "learning_rate": 1.9551932432222704e-06, "loss": 0.0293, "step": 7682 }, { "epoch": 7.173669467787115, "grad_norm": 2.1046112667091394, "learning_rate": 1.9539939644211697e-06, "loss": 0.0803, "step": 7683 }, { "epoch": 7.174603174603175, "grad_norm": 2.844039910364755, "learning_rate": 1.952794964210773e-06, "loss": 0.0593, "step": 7684 }, { "epoch": 7.175536881419235, "grad_norm": 1.2537057041920294, "learning_rate": 1.951596242700741e-06, "loss": 0.0323, "step": 7685 }, { "epoch": 7.176470588235294, "grad_norm": 1.8434014770912028, "learning_rate": 1.9503978000007103e-06, "loss": 0.0549, "step": 7686 }, { "epoch": 7.177404295051354, "grad_norm": 1.8641619406133205, "learning_rate": 1.94919963622029e-06, "loss": 0.0391, "step": 7687 }, { "epoch": 7.178338001867414, "grad_norm": 1.9256813697074948, "learning_rate": 1.9480017514690686e-06, "loss": 0.0361, "step": 7688 }, { "epoch": 7.179271708683474, "grad_norm": 0.4055219251580633, "learning_rate": 1.946804145856604e-06, "loss": 0.0145, "step": 7689 }, { "epoch": 7.180205415499533, "grad_norm": 3.052794318164949, "learning_rate": 1.945606819492429e-06, "loss": 0.0426, "step": 7690 }, { "epoch": 7.181139122315593, "grad_norm": 0.5935939911748328, "learning_rate": 1.9444097724860557e-06, "loss": 0.0099, "step": 7691 }, { "epoch": 7.182072829131653, "grad_norm": 0.7117999432082385, "learning_rate": 1.943213004946965e-06, "loss": 0.0178, "step": 7692 }, { "epoch": 7.183006535947713, "grad_norm": 3.166574242144002, "learning_rate": 1.9420165169846127e-06, "loss": 0.1013, "step": 7693 }, { "epoch": 7.183940242763772, "grad_norm": 0.44205987107785405, "learning_rate": 1.9408203087084347e-06, "loss": 0.0071, "step": 7694 }, { "epoch": 7.184873949579832, "grad_norm": 3.206236347248621, "learning_rate": 1.9396243802278357e-06, "loss": 0.0379, "step": 7695 }, { "epoch": 7.185807656395892, "grad_norm": 2.895733380759697, "learning_rate": 1.9384287316521964e-06, "loss": 0.0984, "step": 7696 }, { "epoch": 7.186741363211952, "grad_norm": 1.597976055101331, "learning_rate": 1.93723336309087e-06, "loss": 0.0466, "step": 7697 }, { "epoch": 7.187675070028011, "grad_norm": 0.7563553008666043, "learning_rate": 1.9360382746531898e-06, "loss": 0.0354, "step": 7698 }, { "epoch": 7.188608776844071, "grad_norm": 1.8428364932414127, "learning_rate": 1.934843466448458e-06, "loss": 0.0738, "step": 7699 }, { "epoch": 7.189542483660131, "grad_norm": 2.818747384474474, "learning_rate": 1.933648938585951e-06, "loss": 0.0908, "step": 7700 }, { "epoch": 7.190476190476191, "grad_norm": 1.2414834784452986, "learning_rate": 1.932454691174925e-06, "loss": 0.0105, "step": 7701 }, { "epoch": 7.19140989729225, "grad_norm": 0.11799038134632213, "learning_rate": 1.931260724324605e-06, "loss": 0.0011, "step": 7702 }, { "epoch": 7.19234360410831, "grad_norm": 2.017261171726941, "learning_rate": 1.9300670381441934e-06, "loss": 0.0646, "step": 7703 }, { "epoch": 7.19327731092437, "grad_norm": 1.4154046803609375, "learning_rate": 1.928873632742863e-06, "loss": 0.0679, "step": 7704 }, { "epoch": 7.19421101774043, "grad_norm": 3.473602770595021, "learning_rate": 1.9276805082297676e-06, "loss": 0.107, "step": 7705 }, { "epoch": 7.1951447245564895, "grad_norm": 1.6846707424961644, "learning_rate": 1.926487664714031e-06, "loss": 0.0493, "step": 7706 }, { "epoch": 7.196078431372549, "grad_norm": 2.006736867593656, "learning_rate": 1.9252951023047504e-06, "loss": 0.0354, "step": 7707 }, { "epoch": 7.197012138188609, "grad_norm": 2.4599783512994606, "learning_rate": 1.9241028211109976e-06, "loss": 0.0308, "step": 7708 }, { "epoch": 7.197945845004669, "grad_norm": 3.249548483091276, "learning_rate": 1.922910821241823e-06, "loss": 0.0859, "step": 7709 }, { "epoch": 7.1988795518207285, "grad_norm": 3.323752336142625, "learning_rate": 1.921719102806246e-06, "loss": 0.097, "step": 7710 }, { "epoch": 7.199813258636788, "grad_norm": 0.8913650428930476, "learning_rate": 1.9205276659132633e-06, "loss": 0.0186, "step": 7711 }, { "epoch": 7.200746965452848, "grad_norm": 0.3431225604134963, "learning_rate": 1.9193365106718433e-06, "loss": 0.0009, "step": 7712 }, { "epoch": 7.201680672268908, "grad_norm": 2.2777266160479033, "learning_rate": 1.918145637190932e-06, "loss": 0.0498, "step": 7713 }, { "epoch": 7.2026143790849675, "grad_norm": 5.053153549040129, "learning_rate": 1.916955045579448e-06, "loss": 0.1675, "step": 7714 }, { "epoch": 7.203548085901027, "grad_norm": 1.5342831278447813, "learning_rate": 1.9157647359462828e-06, "loss": 0.0568, "step": 7715 }, { "epoch": 7.204481792717087, "grad_norm": 2.498924539704555, "learning_rate": 1.914574708400302e-06, "loss": 0.085, "step": 7716 }, { "epoch": 7.205415499533147, "grad_norm": 0.37283079176646633, "learning_rate": 1.9133849630503497e-06, "loss": 0.004, "step": 7717 }, { "epoch": 7.2063492063492065, "grad_norm": 0.5294946119851542, "learning_rate": 1.91219550000524e-06, "loss": 0.0129, "step": 7718 }, { "epoch": 7.207282913165266, "grad_norm": 1.4065186875701883, "learning_rate": 1.9110063193737628e-06, "loss": 0.0297, "step": 7719 }, { "epoch": 7.208216619981326, "grad_norm": 2.3528274080291482, "learning_rate": 1.9098174212646803e-06, "loss": 0.0962, "step": 7720 }, { "epoch": 7.209150326797386, "grad_norm": 0.3434849863239745, "learning_rate": 1.90862880578673e-06, "loss": 0.0045, "step": 7721 }, { "epoch": 7.2100840336134455, "grad_norm": 2.230723316986474, "learning_rate": 1.9074404730486264e-06, "loss": 0.0653, "step": 7722 }, { "epoch": 7.211017740429505, "grad_norm": 1.7609401763809838, "learning_rate": 1.9062524231590541e-06, "loss": 0.0389, "step": 7723 }, { "epoch": 7.211951447245565, "grad_norm": 3.9968278100999513, "learning_rate": 1.9050646562266733e-06, "loss": 0.0994, "step": 7724 }, { "epoch": 7.212885154061625, "grad_norm": 0.5132070538237155, "learning_rate": 1.9038771723601168e-06, "loss": 0.0068, "step": 7725 }, { "epoch": 7.2138188608776845, "grad_norm": 1.4294524944648868, "learning_rate": 1.9026899716679964e-06, "loss": 0.0402, "step": 7726 }, { "epoch": 7.214752567693744, "grad_norm": 1.461544691037632, "learning_rate": 1.901503054258893e-06, "loss": 0.0436, "step": 7727 }, { "epoch": 7.215686274509804, "grad_norm": 2.241623098107125, "learning_rate": 1.9003164202413626e-06, "loss": 0.0554, "step": 7728 }, { "epoch": 7.216619981325864, "grad_norm": 3.062514211397874, "learning_rate": 1.899130069723935e-06, "loss": 0.0879, "step": 7729 }, { "epoch": 7.2175536881419236, "grad_norm": 9.124309688859428, "learning_rate": 1.897944002815118e-06, "loss": 0.2071, "step": 7730 }, { "epoch": 7.218487394957983, "grad_norm": 0.24377880164049348, "learning_rate": 1.896758219623389e-06, "loss": 0.0033, "step": 7731 }, { "epoch": 7.219421101774043, "grad_norm": 0.7810426757492205, "learning_rate": 1.8955727202572005e-06, "loss": 0.028, "step": 7732 }, { "epoch": 7.220354808590103, "grad_norm": 1.2569515073041369, "learning_rate": 1.8943875048249777e-06, "loss": 0.03, "step": 7733 }, { "epoch": 7.221288515406163, "grad_norm": 0.11399013745604798, "learning_rate": 1.8932025734351255e-06, "loss": 0.001, "step": 7734 }, { "epoch": 7.222222222222222, "grad_norm": 2.923781116494603, "learning_rate": 1.892017926196017e-06, "loss": 0.1161, "step": 7735 }, { "epoch": 7.223155929038282, "grad_norm": 3.0684154683824914, "learning_rate": 1.8908335632160013e-06, "loss": 0.0403, "step": 7736 }, { "epoch": 7.224089635854342, "grad_norm": 1.6031926752799657, "learning_rate": 1.8896494846034009e-06, "loss": 0.0145, "step": 7737 }, { "epoch": 7.225023342670402, "grad_norm": 2.2880885914957516, "learning_rate": 1.8884656904665117e-06, "loss": 0.0567, "step": 7738 }, { "epoch": 7.225957049486461, "grad_norm": 0.46862847490880066, "learning_rate": 1.887282180913607e-06, "loss": 0.0096, "step": 7739 }, { "epoch": 7.226890756302521, "grad_norm": 0.9215986715027474, "learning_rate": 1.8860989560529314e-06, "loss": 0.0304, "step": 7740 }, { "epoch": 7.227824463118581, "grad_norm": 3.8086599193483055, "learning_rate": 1.8849160159927033e-06, "loss": 0.1542, "step": 7741 }, { "epoch": 7.228758169934641, "grad_norm": 2.030090549316263, "learning_rate": 1.8837333608411134e-06, "loss": 0.0619, "step": 7742 }, { "epoch": 7.2296918767507, "grad_norm": 2.3343279502811294, "learning_rate": 1.8825509907063328e-06, "loss": 0.0567, "step": 7743 }, { "epoch": 7.23062558356676, "grad_norm": 2.0232571231618266, "learning_rate": 1.8813689056965e-06, "loss": 0.0751, "step": 7744 }, { "epoch": 7.23155929038282, "grad_norm": 5.617785379250772, "learning_rate": 1.880187105919729e-06, "loss": 0.1218, "step": 7745 }, { "epoch": 7.23249299719888, "grad_norm": 2.749217745079265, "learning_rate": 1.8790055914841081e-06, "loss": 0.0359, "step": 7746 }, { "epoch": 7.233426704014939, "grad_norm": 5.40205507203482, "learning_rate": 1.877824362497702e-06, "loss": 0.1136, "step": 7747 }, { "epoch": 7.234360410830999, "grad_norm": 3.478603612599028, "learning_rate": 1.8766434190685462e-06, "loss": 0.0602, "step": 7748 }, { "epoch": 7.235294117647059, "grad_norm": 2.2426473600399253, "learning_rate": 1.87546276130465e-06, "loss": 0.0445, "step": 7749 }, { "epoch": 7.236227824463119, "grad_norm": 0.9931486387372706, "learning_rate": 1.8742823893139966e-06, "loss": 0.0134, "step": 7750 }, { "epoch": 7.237161531279178, "grad_norm": 1.2785719976196928, "learning_rate": 1.873102303204547e-06, "loss": 0.0345, "step": 7751 }, { "epoch": 7.238095238095238, "grad_norm": 2.142044491452831, "learning_rate": 1.871922503084232e-06, "loss": 0.1127, "step": 7752 }, { "epoch": 7.239028944911298, "grad_norm": 2.6278295730591568, "learning_rate": 1.8707429890609562e-06, "loss": 0.0534, "step": 7753 }, { "epoch": 7.239962651727358, "grad_norm": 3.03863242755395, "learning_rate": 1.8695637612425998e-06, "loss": 0.1217, "step": 7754 }, { "epoch": 7.240896358543417, "grad_norm": 1.5974499623476914, "learning_rate": 1.8683848197370142e-06, "loss": 0.0225, "step": 7755 }, { "epoch": 7.241830065359477, "grad_norm": 1.6476559542027462, "learning_rate": 1.86720616465203e-06, "loss": 0.0479, "step": 7756 }, { "epoch": 7.242763772175537, "grad_norm": 1.6888833629513782, "learning_rate": 1.8660277960954465e-06, "loss": 0.0502, "step": 7757 }, { "epoch": 7.243697478991597, "grad_norm": 3.350620789640299, "learning_rate": 1.8648497141750388e-06, "loss": 0.0833, "step": 7758 }, { "epoch": 7.244631185807656, "grad_norm": 4.274485135347144, "learning_rate": 1.863671918998553e-06, "loss": 0.144, "step": 7759 }, { "epoch": 7.245564892623716, "grad_norm": 3.3730017830614973, "learning_rate": 1.8624944106737152e-06, "loss": 0.0869, "step": 7760 }, { "epoch": 7.246498599439776, "grad_norm": 0.6468783266302262, "learning_rate": 1.8613171893082193e-06, "loss": 0.0133, "step": 7761 }, { "epoch": 7.247432306255836, "grad_norm": 2.2522564846659057, "learning_rate": 1.8601402550097364e-06, "loss": 0.0784, "step": 7762 }, { "epoch": 7.248366013071895, "grad_norm": 3.3472672672175974, "learning_rate": 1.8589636078859068e-06, "loss": 0.1077, "step": 7763 }, { "epoch": 7.249299719887955, "grad_norm": 1.8035790258510078, "learning_rate": 1.8577872480443527e-06, "loss": 0.0551, "step": 7764 }, { "epoch": 7.250233426704015, "grad_norm": 4.454620407121915, "learning_rate": 1.8566111755926619e-06, "loss": 0.2056, "step": 7765 }, { "epoch": 7.251167133520075, "grad_norm": 2.2792352545893007, "learning_rate": 1.8554353906384004e-06, "loss": 0.0806, "step": 7766 }, { "epoch": 7.2521008403361344, "grad_norm": 1.7062811012024939, "learning_rate": 1.8542598932891043e-06, "loss": 0.0397, "step": 7767 }, { "epoch": 7.253034547152194, "grad_norm": 3.8861096928849284, "learning_rate": 1.8530846836522892e-06, "loss": 0.0504, "step": 7768 }, { "epoch": 7.253968253968254, "grad_norm": 2.894488790377667, "learning_rate": 1.8519097618354393e-06, "loss": 0.0928, "step": 7769 }, { "epoch": 7.254901960784314, "grad_norm": 1.0520945627637055, "learning_rate": 1.8507351279460123e-06, "loss": 0.0234, "step": 7770 }, { "epoch": 7.2558356676003735, "grad_norm": 1.2908814564627074, "learning_rate": 1.8495607820914451e-06, "loss": 0.0192, "step": 7771 }, { "epoch": 7.256769374416433, "grad_norm": 0.41847931743951144, "learning_rate": 1.8483867243791426e-06, "loss": 0.0111, "step": 7772 }, { "epoch": 7.257703081232493, "grad_norm": 0.26766003359051166, "learning_rate": 1.847212954916483e-06, "loss": 0.0065, "step": 7773 }, { "epoch": 7.258636788048553, "grad_norm": 0.5500921444916819, "learning_rate": 1.846039473810825e-06, "loss": 0.0175, "step": 7774 }, { "epoch": 7.2595704948646125, "grad_norm": 1.9922085429476488, "learning_rate": 1.844866281169494e-06, "loss": 0.0782, "step": 7775 }, { "epoch": 7.260504201680672, "grad_norm": 2.0930759556089193, "learning_rate": 1.843693377099791e-06, "loss": 0.1019, "step": 7776 }, { "epoch": 7.261437908496732, "grad_norm": 0.9897042853114816, "learning_rate": 1.8425207617089903e-06, "loss": 0.0381, "step": 7777 }, { "epoch": 7.262371615312792, "grad_norm": 6.600163604908669, "learning_rate": 1.8413484351043425e-06, "loss": 0.1043, "step": 7778 }, { "epoch": 7.2633053221288515, "grad_norm": 0.35573232242311786, "learning_rate": 1.8401763973930692e-06, "loss": 0.0069, "step": 7779 }, { "epoch": 7.264239028944911, "grad_norm": 1.3589384445781025, "learning_rate": 1.8390046486823638e-06, "loss": 0.0291, "step": 7780 }, { "epoch": 7.265172735760971, "grad_norm": 0.5230041982341334, "learning_rate": 1.8378331890793988e-06, "loss": 0.0121, "step": 7781 }, { "epoch": 7.266106442577031, "grad_norm": 3.631138235424111, "learning_rate": 1.8366620186913159e-06, "loss": 0.051, "step": 7782 }, { "epoch": 7.2670401493930905, "grad_norm": 0.6554824221516811, "learning_rate": 1.835491137625231e-06, "loss": 0.0069, "step": 7783 }, { "epoch": 7.26797385620915, "grad_norm": 3.899383627754448, "learning_rate": 1.8343205459882347e-06, "loss": 0.1071, "step": 7784 }, { "epoch": 7.26890756302521, "grad_norm": 0.8814073851643127, "learning_rate": 1.8331502438873883e-06, "loss": 0.0274, "step": 7785 }, { "epoch": 7.26984126984127, "grad_norm": 0.5699335749116026, "learning_rate": 1.8319802314297324e-06, "loss": 0.0122, "step": 7786 }, { "epoch": 7.2707749766573295, "grad_norm": 4.124759464956509, "learning_rate": 1.8308105087222755e-06, "loss": 0.1555, "step": 7787 }, { "epoch": 7.271708683473389, "grad_norm": 2.3735205196119913, "learning_rate": 1.8296410758720012e-06, "loss": 0.0761, "step": 7788 }, { "epoch": 7.272642390289449, "grad_norm": 0.2194422875744888, "learning_rate": 1.8284719329858663e-06, "loss": 0.0018, "step": 7789 }, { "epoch": 7.273576097105509, "grad_norm": 1.1199499778878792, "learning_rate": 1.8273030801708046e-06, "loss": 0.0215, "step": 7790 }, { "epoch": 7.2745098039215685, "grad_norm": 0.2418877858616069, "learning_rate": 1.8261345175337192e-06, "loss": 0.0021, "step": 7791 }, { "epoch": 7.275443510737628, "grad_norm": 2.059185241615324, "learning_rate": 1.824966245181487e-06, "loss": 0.0279, "step": 7792 }, { "epoch": 7.276377217553688, "grad_norm": 1.155283945446515, "learning_rate": 1.8237982632209589e-06, "loss": 0.034, "step": 7793 }, { "epoch": 7.277310924369748, "grad_norm": 0.5692050147514965, "learning_rate": 1.822630571758962e-06, "loss": 0.0093, "step": 7794 }, { "epoch": 7.278244631185808, "grad_norm": 1.7507125671594628, "learning_rate": 1.8214631709022934e-06, "loss": 0.0432, "step": 7795 }, { "epoch": 7.279178338001867, "grad_norm": 3.5618636231093292, "learning_rate": 1.8202960607577246e-06, "loss": 0.1417, "step": 7796 }, { "epoch": 7.280112044817927, "grad_norm": 1.7617601503680649, "learning_rate": 1.8191292414319995e-06, "loss": 0.0296, "step": 7797 }, { "epoch": 7.281045751633987, "grad_norm": 1.5103588869580655, "learning_rate": 1.8179627130318389e-06, "loss": 0.0323, "step": 7798 }, { "epoch": 7.281979458450047, "grad_norm": 4.823902663419216, "learning_rate": 1.8167964756639334e-06, "loss": 0.1949, "step": 7799 }, { "epoch": 7.282913165266106, "grad_norm": 3.700021096421103, "learning_rate": 1.8156305294349486e-06, "loss": 0.0786, "step": 7800 }, { "epoch": 7.283846872082166, "grad_norm": 2.5962003711731563, "learning_rate": 1.8144648744515226e-06, "loss": 0.0592, "step": 7801 }, { "epoch": 7.284780578898226, "grad_norm": 0.44113562924623706, "learning_rate": 1.8132995108202656e-06, "loss": 0.0059, "step": 7802 }, { "epoch": 7.285714285714286, "grad_norm": 1.1430427210930254, "learning_rate": 1.8121344386477663e-06, "loss": 0.0322, "step": 7803 }, { "epoch": 7.286647992530345, "grad_norm": 4.366396420924831, "learning_rate": 1.8109696580405821e-06, "loss": 0.1009, "step": 7804 }, { "epoch": 7.287581699346405, "grad_norm": 0.5680452825078477, "learning_rate": 1.809805169105245e-06, "loss": 0.006, "step": 7805 }, { "epoch": 7.288515406162465, "grad_norm": 2.1756638591282655, "learning_rate": 1.8086409719482578e-06, "loss": 0.0811, "step": 7806 }, { "epoch": 7.289449112978525, "grad_norm": 0.06607884212124888, "learning_rate": 1.8074770666761026e-06, "loss": 0.0004, "step": 7807 }, { "epoch": 7.290382819794584, "grad_norm": 2.419296435180595, "learning_rate": 1.8063134533952308e-06, "loss": 0.0624, "step": 7808 }, { "epoch": 7.291316526610644, "grad_norm": 3.4438198473463095, "learning_rate": 1.8051501322120668e-06, "loss": 0.1624, "step": 7809 }, { "epoch": 7.292250233426704, "grad_norm": 2.1364111846963723, "learning_rate": 1.8039871032330075e-06, "loss": 0.0888, "step": 7810 }, { "epoch": 7.293183940242764, "grad_norm": 0.27028618110479374, "learning_rate": 1.802824366564428e-06, "loss": 0.0038, "step": 7811 }, { "epoch": 7.294117647058823, "grad_norm": 3.7412810648041295, "learning_rate": 1.8016619223126714e-06, "loss": 0.1289, "step": 7812 }, { "epoch": 7.295051353874883, "grad_norm": 2.9289202425639567, "learning_rate": 1.8004997705840565e-06, "loss": 0.1348, "step": 7813 }, { "epoch": 7.295985060690943, "grad_norm": 1.9502846955444113, "learning_rate": 1.7993379114848724e-06, "loss": 0.0518, "step": 7814 }, { "epoch": 7.296918767507003, "grad_norm": 3.5362905736986354, "learning_rate": 1.7981763451213886e-06, "loss": 0.1834, "step": 7815 }, { "epoch": 7.297852474323062, "grad_norm": 4.365460147760578, "learning_rate": 1.7970150715998398e-06, "loss": 0.1178, "step": 7816 }, { "epoch": 7.298786181139122, "grad_norm": 0.039909608631273435, "learning_rate": 1.7958540910264383e-06, "loss": 0.0002, "step": 7817 }, { "epoch": 7.299719887955182, "grad_norm": 2.427240370127794, "learning_rate": 1.7946934035073681e-06, "loss": 0.096, "step": 7818 }, { "epoch": 7.300653594771242, "grad_norm": 0.9465188406248929, "learning_rate": 1.7935330091487856e-06, "loss": 0.0253, "step": 7819 }, { "epoch": 7.301587301587301, "grad_norm": 1.1190041436427443, "learning_rate": 1.7923729080568242e-06, "loss": 0.0133, "step": 7820 }, { "epoch": 7.302521008403361, "grad_norm": 2.4414645591131525, "learning_rate": 1.791213100337587e-06, "loss": 0.0496, "step": 7821 }, { "epoch": 7.303454715219421, "grad_norm": 1.700306119177019, "learning_rate": 1.7900535860971507e-06, "loss": 0.0737, "step": 7822 }, { "epoch": 7.304388422035481, "grad_norm": 1.6103695195534917, "learning_rate": 1.7888943654415635e-06, "loss": 0.0503, "step": 7823 }, { "epoch": 7.30532212885154, "grad_norm": 1.6878277578959695, "learning_rate": 1.7877354384768531e-06, "loss": 0.0449, "step": 7824 }, { "epoch": 7.3062558356676, "grad_norm": 9.090910579810581, "learning_rate": 1.7865768053090144e-06, "loss": 0.1883, "step": 7825 }, { "epoch": 7.30718954248366, "grad_norm": 1.9558542807452046, "learning_rate": 1.7854184660440167e-06, "loss": 0.0627, "step": 7826 }, { "epoch": 7.30812324929972, "grad_norm": 1.9103629413340448, "learning_rate": 1.7842604207878006e-06, "loss": 0.0512, "step": 7827 }, { "epoch": 7.309056956115779, "grad_norm": 0.5489598557555565, "learning_rate": 1.7831026696462867e-06, "loss": 0.0102, "step": 7828 }, { "epoch": 7.309990662931839, "grad_norm": 2.333850784201156, "learning_rate": 1.781945212725361e-06, "loss": 0.0649, "step": 7829 }, { "epoch": 7.310924369747899, "grad_norm": 1.79756762197288, "learning_rate": 1.7807880501308872e-06, "loss": 0.0167, "step": 7830 }, { "epoch": 7.311858076563959, "grad_norm": 3.2031270749774166, "learning_rate": 1.7796311819686974e-06, "loss": 0.1144, "step": 7831 }, { "epoch": 7.3127917833800185, "grad_norm": 0.5220414685683551, "learning_rate": 1.7784746083446041e-06, "loss": 0.0146, "step": 7832 }, { "epoch": 7.313725490196078, "grad_norm": 1.0119656079616626, "learning_rate": 1.777318329364387e-06, "loss": 0.0231, "step": 7833 }, { "epoch": 7.314659197012138, "grad_norm": 2.061453015901481, "learning_rate": 1.7761623451338e-06, "loss": 0.0702, "step": 7834 }, { "epoch": 7.315592903828198, "grad_norm": 0.2959160413533105, "learning_rate": 1.775006655758571e-06, "loss": 0.0037, "step": 7835 }, { "epoch": 7.3165266106442575, "grad_norm": 1.3328048367823477, "learning_rate": 1.773851261344398e-06, "loss": 0.0349, "step": 7836 }, { "epoch": 7.317460317460317, "grad_norm": 2.603230423230889, "learning_rate": 1.7726961619969596e-06, "loss": 0.1022, "step": 7837 }, { "epoch": 7.318394024276377, "grad_norm": 1.109593442500497, "learning_rate": 1.7715413578218988e-06, "loss": 0.0388, "step": 7838 }, { "epoch": 7.319327731092437, "grad_norm": 0.7814955420923704, "learning_rate": 1.770386848924836e-06, "loss": 0.0204, "step": 7839 }, { "epoch": 7.3202614379084965, "grad_norm": 0.9912910777935223, "learning_rate": 1.769232635411362e-06, "loss": 0.027, "step": 7840 }, { "epoch": 7.321195144724556, "grad_norm": 2.651323775136138, "learning_rate": 1.7680787173870456e-06, "loss": 0.0656, "step": 7841 }, { "epoch": 7.322128851540616, "grad_norm": 3.575844432010876, "learning_rate": 1.7669250949574235e-06, "loss": 0.1733, "step": 7842 }, { "epoch": 7.323062558356676, "grad_norm": 2.6498926036842967, "learning_rate": 1.7657717682280051e-06, "loss": 0.1296, "step": 7843 }, { "epoch": 7.3239962651727355, "grad_norm": 1.5425716752860639, "learning_rate": 1.764618737304279e-06, "loss": 0.0648, "step": 7844 }, { "epoch": 7.324929971988795, "grad_norm": 0.8061468802232943, "learning_rate": 1.7634660022917004e-06, "loss": 0.0197, "step": 7845 }, { "epoch": 7.325863678804855, "grad_norm": 0.7758398982154048, "learning_rate": 1.7623135632956977e-06, "loss": 0.0153, "step": 7846 }, { "epoch": 7.326797385620915, "grad_norm": 2.0125939510047663, "learning_rate": 1.7611614204216782e-06, "loss": 0.0851, "step": 7847 }, { "epoch": 7.3277310924369745, "grad_norm": 0.2795290900282688, "learning_rate": 1.7600095737750156e-06, "loss": 0.0065, "step": 7848 }, { "epoch": 7.328664799253034, "grad_norm": 1.5801642686123576, "learning_rate": 1.7588580234610592e-06, "loss": 0.0456, "step": 7849 }, { "epoch": 7.329598506069094, "grad_norm": 0.7103694097605012, "learning_rate": 1.757706769585129e-06, "loss": 0.0133, "step": 7850 }, { "epoch": 7.330532212885154, "grad_norm": 4.51136538134265, "learning_rate": 1.7565558122525234e-06, "loss": 0.1503, "step": 7851 }, { "epoch": 7.3314659197012135, "grad_norm": 2.1803648138980134, "learning_rate": 1.7554051515685083e-06, "loss": 0.0519, "step": 7852 }, { "epoch": 7.332399626517273, "grad_norm": 0.980538552709004, "learning_rate": 1.7542547876383226e-06, "loss": 0.0296, "step": 7853 }, { "epoch": 7.333333333333333, "grad_norm": 0.9865479339659862, "learning_rate": 1.7531047205671836e-06, "loss": 0.0145, "step": 7854 }, { "epoch": 7.334267040149393, "grad_norm": 0.8641703351217537, "learning_rate": 1.7519549504602751e-06, "loss": 0.0089, "step": 7855 }, { "epoch": 7.3352007469654525, "grad_norm": 2.93750629968253, "learning_rate": 1.7508054774227568e-06, "loss": 0.0942, "step": 7856 }, { "epoch": 7.336134453781512, "grad_norm": 1.8618876178143098, "learning_rate": 1.7496563015597585e-06, "loss": 0.0646, "step": 7857 }, { "epoch": 7.337068160597572, "grad_norm": 3.107673581186841, "learning_rate": 1.7485074229763881e-06, "loss": 0.1291, "step": 7858 }, { "epoch": 7.338001867413632, "grad_norm": 2.230692327349187, "learning_rate": 1.747358841777722e-06, "loss": 0.092, "step": 7859 }, { "epoch": 7.338935574229692, "grad_norm": 2.6265674895910185, "learning_rate": 1.746210558068811e-06, "loss": 0.0762, "step": 7860 }, { "epoch": 7.339869281045751, "grad_norm": 2.184248616804068, "learning_rate": 1.7450625719546755e-06, "loss": 0.0403, "step": 7861 }, { "epoch": 7.340802987861811, "grad_norm": 1.020288069390531, "learning_rate": 1.7439148835403152e-06, "loss": 0.0282, "step": 7862 }, { "epoch": 7.341736694677871, "grad_norm": 2.704380954942427, "learning_rate": 1.7427674929306977e-06, "loss": 0.1012, "step": 7863 }, { "epoch": 7.342670401493931, "grad_norm": 1.340696173019748, "learning_rate": 1.7416204002307636e-06, "loss": 0.0369, "step": 7864 }, { "epoch": 7.34360410830999, "grad_norm": 4.971577150858695, "learning_rate": 1.7404736055454259e-06, "loss": 0.0678, "step": 7865 }, { "epoch": 7.34453781512605, "grad_norm": 0.8924406928624544, "learning_rate": 1.7393271089795749e-06, "loss": 0.0223, "step": 7866 }, { "epoch": 7.34547152194211, "grad_norm": 2.13286526976856, "learning_rate": 1.7381809106380688e-06, "loss": 0.0766, "step": 7867 }, { "epoch": 7.34640522875817, "grad_norm": 2.753359274629657, "learning_rate": 1.7370350106257394e-06, "loss": 0.0831, "step": 7868 }, { "epoch": 7.347338935574229, "grad_norm": 0.7492373144594875, "learning_rate": 1.7358894090473928e-06, "loss": 0.0142, "step": 7869 }, { "epoch": 7.348272642390289, "grad_norm": 1.4411556713911586, "learning_rate": 1.7347441060078041e-06, "loss": 0.0331, "step": 7870 }, { "epoch": 7.349206349206349, "grad_norm": 0.5877851258647889, "learning_rate": 1.733599101611728e-06, "loss": 0.0101, "step": 7871 }, { "epoch": 7.350140056022409, "grad_norm": 1.5577788347305404, "learning_rate": 1.7324543959638862e-06, "loss": 0.0344, "step": 7872 }, { "epoch": 7.351073762838468, "grad_norm": 3.61462573639794, "learning_rate": 1.731309989168974e-06, "loss": 0.1156, "step": 7873 }, { "epoch": 7.352007469654528, "grad_norm": 3.473051771252458, "learning_rate": 1.730165881331658e-06, "loss": 0.1128, "step": 7874 }, { "epoch": 7.352941176470588, "grad_norm": 1.6184978258524005, "learning_rate": 1.7290220725565831e-06, "loss": 0.0339, "step": 7875 }, { "epoch": 7.353874883286648, "grad_norm": 3.9305824689161053, "learning_rate": 1.7278785629483624e-06, "loss": 0.085, "step": 7876 }, { "epoch": 7.354808590102707, "grad_norm": 1.3638964188107778, "learning_rate": 1.726735352611581e-06, "loss": 0.0314, "step": 7877 }, { "epoch": 7.355742296918767, "grad_norm": 5.728391171937414, "learning_rate": 1.7255924416507975e-06, "loss": 0.1401, "step": 7878 }, { "epoch": 7.356676003734827, "grad_norm": 0.20706028981446337, "learning_rate": 1.7244498301705464e-06, "loss": 0.0019, "step": 7879 }, { "epoch": 7.357609710550887, "grad_norm": 1.7296954168499818, "learning_rate": 1.7233075182753306e-06, "loss": 0.0546, "step": 7880 }, { "epoch": 7.358543417366946, "grad_norm": 0.4862786065586994, "learning_rate": 1.7221655060696268e-06, "loss": 0.0044, "step": 7881 }, { "epoch": 7.359477124183006, "grad_norm": 0.508836333846587, "learning_rate": 1.7210237936578832e-06, "loss": 0.0079, "step": 7882 }, { "epoch": 7.360410830999066, "grad_norm": 2.3906607713031973, "learning_rate": 1.7198823811445254e-06, "loss": 0.077, "step": 7883 }, { "epoch": 7.361344537815126, "grad_norm": 4.405799584663443, "learning_rate": 1.7187412686339454e-06, "loss": 0.1611, "step": 7884 }, { "epoch": 7.362278244631185, "grad_norm": 2.4573374704980377, "learning_rate": 1.717600456230512e-06, "loss": 0.0641, "step": 7885 }, { "epoch": 7.363211951447245, "grad_norm": 1.9893629131702357, "learning_rate": 1.7164599440385643e-06, "loss": 0.0727, "step": 7886 }, { "epoch": 7.364145658263305, "grad_norm": 1.212445578976532, "learning_rate": 1.7153197321624125e-06, "loss": 0.0254, "step": 7887 }, { "epoch": 7.365079365079365, "grad_norm": 1.3391716241209748, "learning_rate": 1.714179820706346e-06, "loss": 0.0297, "step": 7888 }, { "epoch": 7.366013071895424, "grad_norm": 2.1637842025470206, "learning_rate": 1.713040209774619e-06, "loss": 0.0307, "step": 7889 }, { "epoch": 7.366946778711484, "grad_norm": 3.590166863232055, "learning_rate": 1.711900899471463e-06, "loss": 0.0739, "step": 7890 }, { "epoch": 7.367880485527545, "grad_norm": 1.0584270997048195, "learning_rate": 1.710761889901078e-06, "loss": 0.0136, "step": 7891 }, { "epoch": 7.368814192343605, "grad_norm": 2.4530983474737265, "learning_rate": 1.7096231811676422e-06, "loss": 0.0856, "step": 7892 }, { "epoch": 7.369747899159664, "grad_norm": 1.276238279232375, "learning_rate": 1.7084847733753018e-06, "loss": 0.0223, "step": 7893 }, { "epoch": 7.370681605975724, "grad_norm": 1.614892619585706, "learning_rate": 1.7073466666281758e-06, "loss": 0.0413, "step": 7894 }, { "epoch": 7.371615312791784, "grad_norm": 2.397439695601988, "learning_rate": 1.7062088610303563e-06, "loss": 0.0464, "step": 7895 }, { "epoch": 7.372549019607844, "grad_norm": 4.571999231086947, "learning_rate": 1.70507135668591e-06, "loss": 0.1539, "step": 7896 }, { "epoch": 7.373482726423903, "grad_norm": 1.5939601552038003, "learning_rate": 1.7039341536988736e-06, "loss": 0.0424, "step": 7897 }, { "epoch": 7.374416433239963, "grad_norm": 2.1151422054248434, "learning_rate": 1.7027972521732555e-06, "loss": 0.0476, "step": 7898 }, { "epoch": 7.375350140056023, "grad_norm": 2.1097030728259623, "learning_rate": 1.7016606522130397e-06, "loss": 0.098, "step": 7899 }, { "epoch": 7.376283846872083, "grad_norm": 2.240200489691073, "learning_rate": 1.7005243539221773e-06, "loss": 0.0686, "step": 7900 }, { "epoch": 7.377217553688142, "grad_norm": 1.619094251856893, "learning_rate": 1.6993883574045999e-06, "loss": 0.0583, "step": 7901 }, { "epoch": 7.378151260504202, "grad_norm": 1.381413541956661, "learning_rate": 1.6982526627642043e-06, "loss": 0.0344, "step": 7902 }, { "epoch": 7.379084967320262, "grad_norm": 0.7616687811981457, "learning_rate": 1.6971172701048627e-06, "loss": 0.024, "step": 7903 }, { "epoch": 7.380018674136322, "grad_norm": 2.2119262931564996, "learning_rate": 1.6959821795304177e-06, "loss": 0.0815, "step": 7904 }, { "epoch": 7.380952380952381, "grad_norm": 1.6617853000563678, "learning_rate": 1.6948473911446884e-06, "loss": 0.0469, "step": 7905 }, { "epoch": 7.381886087768441, "grad_norm": 2.2292052656943735, "learning_rate": 1.6937129050514629e-06, "loss": 0.0528, "step": 7906 }, { "epoch": 7.382819794584501, "grad_norm": 3.8877847661966523, "learning_rate": 1.6925787213545014e-06, "loss": 0.1465, "step": 7907 }, { "epoch": 7.383753501400561, "grad_norm": 3.7659276852701846, "learning_rate": 1.6914448401575367e-06, "loss": 0.0534, "step": 7908 }, { "epoch": 7.38468720821662, "grad_norm": 1.1697218841262793, "learning_rate": 1.6903112615642781e-06, "loss": 0.023, "step": 7909 }, { "epoch": 7.38562091503268, "grad_norm": 3.2280383405613304, "learning_rate": 1.6891779856784013e-06, "loss": 0.0904, "step": 7910 }, { "epoch": 7.38655462184874, "grad_norm": 2.8408631286710175, "learning_rate": 1.6880450126035576e-06, "loss": 0.0692, "step": 7911 }, { "epoch": 7.3874883286648, "grad_norm": 1.692263022877777, "learning_rate": 1.686912342443367e-06, "loss": 0.052, "step": 7912 }, { "epoch": 7.388422035480859, "grad_norm": 1.2260686455898928, "learning_rate": 1.6857799753014297e-06, "loss": 0.025, "step": 7913 }, { "epoch": 7.389355742296919, "grad_norm": 1.0463849660507945, "learning_rate": 1.6846479112813102e-06, "loss": 0.0416, "step": 7914 }, { "epoch": 7.390289449112979, "grad_norm": 3.3960535334386153, "learning_rate": 1.6835161504865482e-06, "loss": 0.0784, "step": 7915 }, { "epoch": 7.391223155929039, "grad_norm": 1.5965870016197974, "learning_rate": 1.682384693020655e-06, "loss": 0.0331, "step": 7916 }, { "epoch": 7.392156862745098, "grad_norm": 2.2580872389106865, "learning_rate": 1.681253538987117e-06, "loss": 0.0721, "step": 7917 }, { "epoch": 7.393090569561158, "grad_norm": 2.8281424310837373, "learning_rate": 1.6801226884893895e-06, "loss": 0.1065, "step": 7918 }, { "epoch": 7.394024276377218, "grad_norm": 11.76019024788303, "learning_rate": 1.6789921416309008e-06, "loss": 0.1568, "step": 7919 }, { "epoch": 7.394957983193278, "grad_norm": 3.055986724235134, "learning_rate": 1.6778618985150514e-06, "loss": 0.0829, "step": 7920 }, { "epoch": 7.395891690009337, "grad_norm": 3.8412419214542957, "learning_rate": 1.6767319592452158e-06, "loss": 0.1598, "step": 7921 }, { "epoch": 7.396825396825397, "grad_norm": 0.1882167915190133, "learning_rate": 1.6756023239247393e-06, "loss": 0.0051, "step": 7922 }, { "epoch": 7.397759103641457, "grad_norm": 3.8229225482782976, "learning_rate": 1.6744729926569375e-06, "loss": 0.1276, "step": 7923 }, { "epoch": 7.398692810457517, "grad_norm": 3.6383670695342656, "learning_rate": 1.6733439655451028e-06, "loss": 0.1067, "step": 7924 }, { "epoch": 7.3996265172735765, "grad_norm": 0.07721218319129856, "learning_rate": 1.6722152426924959e-06, "loss": 0.0004, "step": 7925 }, { "epoch": 7.400560224089636, "grad_norm": 1.880228013195204, "learning_rate": 1.6710868242023499e-06, "loss": 0.0665, "step": 7926 }, { "epoch": 7.401493930905696, "grad_norm": 2.0406908425687917, "learning_rate": 1.669958710177873e-06, "loss": 0.0492, "step": 7927 }, { "epoch": 7.402427637721756, "grad_norm": 0.3856146514155842, "learning_rate": 1.6688309007222424e-06, "loss": 0.005, "step": 7928 }, { "epoch": 7.4033613445378155, "grad_norm": 0.3634066834579624, "learning_rate": 1.6677033959386096e-06, "loss": 0.0031, "step": 7929 }, { "epoch": 7.404295051353875, "grad_norm": 2.193387912074752, "learning_rate": 1.6665761959300947e-06, "loss": 0.0819, "step": 7930 }, { "epoch": 7.405228758169935, "grad_norm": 0.7162079983086922, "learning_rate": 1.6654493007997963e-06, "loss": 0.0112, "step": 7931 }, { "epoch": 7.406162464985995, "grad_norm": 0.9562979251314786, "learning_rate": 1.6643227106507791e-06, "loss": 0.0285, "step": 7932 }, { "epoch": 7.4070961718020545, "grad_norm": 3.7263895842759522, "learning_rate": 1.6631964255860833e-06, "loss": 0.1464, "step": 7933 }, { "epoch": 7.408029878618114, "grad_norm": 0.8077377113670756, "learning_rate": 1.6620704457087173e-06, "loss": 0.0121, "step": 7934 }, { "epoch": 7.408963585434174, "grad_norm": 3.2117051152937828, "learning_rate": 1.6609447711216675e-06, "loss": 0.112, "step": 7935 }, { "epoch": 7.409897292250234, "grad_norm": 1.19094256612705, "learning_rate": 1.659819401927888e-06, "loss": 0.0213, "step": 7936 }, { "epoch": 7.4108309990662935, "grad_norm": 2.5929558905558325, "learning_rate": 1.6586943382303067e-06, "loss": 0.0995, "step": 7937 }, { "epoch": 7.411764705882353, "grad_norm": 2.364979213234212, "learning_rate": 1.6575695801318203e-06, "loss": 0.0527, "step": 7938 }, { "epoch": 7.412698412698413, "grad_norm": 0.5283965088993464, "learning_rate": 1.6564451277353045e-06, "loss": 0.0119, "step": 7939 }, { "epoch": 7.413632119514473, "grad_norm": 1.0156497178055175, "learning_rate": 1.6553209811436005e-06, "loss": 0.0123, "step": 7940 }, { "epoch": 7.4145658263305325, "grad_norm": 1.952951167925409, "learning_rate": 1.6541971404595247e-06, "loss": 0.0887, "step": 7941 }, { "epoch": 7.415499533146592, "grad_norm": 0.3011713526615311, "learning_rate": 1.653073605785862e-06, "loss": 0.0068, "step": 7942 }, { "epoch": 7.416433239962652, "grad_norm": 1.5875116203689033, "learning_rate": 1.6519503772253759e-06, "loss": 0.0401, "step": 7943 }, { "epoch": 7.417366946778712, "grad_norm": 0.4526006404736155, "learning_rate": 1.6508274548807963e-06, "loss": 0.0109, "step": 7944 }, { "epoch": 7.4183006535947715, "grad_norm": 2.3368768129892716, "learning_rate": 1.6497048388548265e-06, "loss": 0.0644, "step": 7945 }, { "epoch": 7.419234360410831, "grad_norm": 3.499547974130081, "learning_rate": 1.64858252925014e-06, "loss": 0.1274, "step": 7946 }, { "epoch": 7.420168067226891, "grad_norm": 1.771762785458669, "learning_rate": 1.6474605261693893e-06, "loss": 0.0416, "step": 7947 }, { "epoch": 7.421101774042951, "grad_norm": 0.98237169692845, "learning_rate": 1.6463388297151905e-06, "loss": 0.0255, "step": 7948 }, { "epoch": 7.4220354808590105, "grad_norm": 1.9683727489233502, "learning_rate": 1.645217439990136e-06, "loss": 0.0624, "step": 7949 }, { "epoch": 7.42296918767507, "grad_norm": 1.7979391265535138, "learning_rate": 1.6440963570967888e-06, "loss": 0.0521, "step": 7950 }, { "epoch": 7.42390289449113, "grad_norm": 0.3923181496410006, "learning_rate": 1.642975581137683e-06, "loss": 0.0058, "step": 7951 }, { "epoch": 7.42483660130719, "grad_norm": 2.762066217340712, "learning_rate": 1.6418551122153286e-06, "loss": 0.0507, "step": 7952 }, { "epoch": 7.42577030812325, "grad_norm": 0.4134341471084775, "learning_rate": 1.6407349504322034e-06, "loss": 0.0058, "step": 7953 }, { "epoch": 7.426704014939309, "grad_norm": 1.9117022245449846, "learning_rate": 1.6396150958907586e-06, "loss": 0.0837, "step": 7954 }, { "epoch": 7.427637721755369, "grad_norm": 1.3891402425178874, "learning_rate": 1.6384955486934157e-06, "loss": 0.0518, "step": 7955 }, { "epoch": 7.428571428571429, "grad_norm": 0.8618612598342774, "learning_rate": 1.6373763089425726e-06, "loss": 0.0154, "step": 7956 }, { "epoch": 7.429505135387489, "grad_norm": 0.37451409109259115, "learning_rate": 1.6362573767405942e-06, "loss": 0.0033, "step": 7957 }, { "epoch": 7.430438842203548, "grad_norm": 1.788492337387736, "learning_rate": 1.6351387521898199e-06, "loss": 0.0622, "step": 7958 }, { "epoch": 7.431372549019608, "grad_norm": 0.9352000841328609, "learning_rate": 1.6340204353925581e-06, "loss": 0.014, "step": 7959 }, { "epoch": 7.432306255835668, "grad_norm": 3.847229330945966, "learning_rate": 1.632902426451094e-06, "loss": 0.1152, "step": 7960 }, { "epoch": 7.433239962651728, "grad_norm": 0.3401390866695267, "learning_rate": 1.6317847254676811e-06, "loss": 0.008, "step": 7961 }, { "epoch": 7.434173669467787, "grad_norm": 0.35961696895841366, "learning_rate": 1.6306673325445443e-06, "loss": 0.0079, "step": 7962 }, { "epoch": 7.435107376283847, "grad_norm": 2.223822361931035, "learning_rate": 1.6295502477838804e-06, "loss": 0.0535, "step": 7963 }, { "epoch": 7.436041083099907, "grad_norm": 2.188950241064723, "learning_rate": 1.628433471287863e-06, "loss": 0.0737, "step": 7964 }, { "epoch": 7.436974789915967, "grad_norm": 1.5932642742096192, "learning_rate": 1.627317003158631e-06, "loss": 0.0808, "step": 7965 }, { "epoch": 7.437908496732026, "grad_norm": 0.4243322358465084, "learning_rate": 1.6262008434982978e-06, "loss": 0.0076, "step": 7966 }, { "epoch": 7.438842203548086, "grad_norm": 2.166965421205319, "learning_rate": 1.6250849924089485e-06, "loss": 0.0488, "step": 7967 }, { "epoch": 7.439775910364146, "grad_norm": 0.09380662895930418, "learning_rate": 1.623969449992639e-06, "loss": 0.0004, "step": 7968 }, { "epoch": 7.440709617180206, "grad_norm": 2.244609990677457, "learning_rate": 1.6228542163514e-06, "loss": 0.073, "step": 7969 }, { "epoch": 7.441643323996265, "grad_norm": 3.609090603213329, "learning_rate": 1.6217392915872316e-06, "loss": 0.0891, "step": 7970 }, { "epoch": 7.442577030812325, "grad_norm": 1.5832255239029467, "learning_rate": 1.6206246758021044e-06, "loss": 0.0489, "step": 7971 }, { "epoch": 7.443510737628385, "grad_norm": 0.3011583414852472, "learning_rate": 1.6195103690979618e-06, "loss": 0.0049, "step": 7972 }, { "epoch": 7.444444444444445, "grad_norm": 3.7656558358987082, "learning_rate": 1.618396371576722e-06, "loss": 0.0966, "step": 7973 }, { "epoch": 7.445378151260504, "grad_norm": 2.694389243654563, "learning_rate": 1.617282683340271e-06, "loss": 0.0691, "step": 7974 }, { "epoch": 7.446311858076564, "grad_norm": 1.2458935065941992, "learning_rate": 1.6161693044904676e-06, "loss": 0.0202, "step": 7975 }, { "epoch": 7.447245564892624, "grad_norm": 0.30593790821011935, "learning_rate": 1.615056235129141e-06, "loss": 0.0102, "step": 7976 }, { "epoch": 7.448179271708684, "grad_norm": 6.085075757323433, "learning_rate": 1.6139434753580973e-06, "loss": 0.1756, "step": 7977 }, { "epoch": 7.449112978524743, "grad_norm": 1.2819667750461943, "learning_rate": 1.612831025279108e-06, "loss": 0.043, "step": 7978 }, { "epoch": 7.450046685340803, "grad_norm": 0.13422736937178592, "learning_rate": 1.6117188849939197e-06, "loss": 0.0014, "step": 7979 }, { "epoch": 7.450980392156863, "grad_norm": 1.1560121705367583, "learning_rate": 1.610607054604248e-06, "loss": 0.0226, "step": 7980 }, { "epoch": 7.451914098972923, "grad_norm": 2.733468792113218, "learning_rate": 1.6094955342117852e-06, "loss": 0.0966, "step": 7981 }, { "epoch": 7.452847805788982, "grad_norm": 2.636929335917472, "learning_rate": 1.6083843239181906e-06, "loss": 0.081, "step": 7982 }, { "epoch": 7.453781512605042, "grad_norm": 3.165995058612343, "learning_rate": 1.607273423825096e-06, "loss": 0.1326, "step": 7983 }, { "epoch": 7.454715219421102, "grad_norm": 5.661398891924127, "learning_rate": 1.6061628340341062e-06, "loss": 0.1631, "step": 7984 }, { "epoch": 7.455648926237162, "grad_norm": 6.28946290614974, "learning_rate": 1.6050525546467943e-06, "loss": 0.0791, "step": 7985 }, { "epoch": 7.456582633053221, "grad_norm": 0.8021349839040915, "learning_rate": 1.6039425857647117e-06, "loss": 0.0159, "step": 7986 }, { "epoch": 7.457516339869281, "grad_norm": 1.713535764877369, "learning_rate": 1.6028329274893757e-06, "loss": 0.0341, "step": 7987 }, { "epoch": 7.458450046685341, "grad_norm": 2.584748922292563, "learning_rate": 1.6017235799222752e-06, "loss": 0.103, "step": 7988 }, { "epoch": 7.459383753501401, "grad_norm": 3.1281409095557056, "learning_rate": 1.6006145431648723e-06, "loss": 0.0754, "step": 7989 }, { "epoch": 7.4603174603174605, "grad_norm": 3.3062579018753135, "learning_rate": 1.5995058173186035e-06, "loss": 0.0876, "step": 7990 }, { "epoch": 7.46125116713352, "grad_norm": 0.43537237375534427, "learning_rate": 1.5983974024848714e-06, "loss": 0.0133, "step": 7991 }, { "epoch": 7.46218487394958, "grad_norm": 1.1058983447669477, "learning_rate": 1.597289298765054e-06, "loss": 0.0261, "step": 7992 }, { "epoch": 7.46311858076564, "grad_norm": 3.0128845538549354, "learning_rate": 1.5961815062604968e-06, "loss": 0.096, "step": 7993 }, { "epoch": 7.4640522875816995, "grad_norm": 1.588316651092164, "learning_rate": 1.5950740250725233e-06, "loss": 0.0451, "step": 7994 }, { "epoch": 7.464985994397759, "grad_norm": 1.7789330136277544, "learning_rate": 1.5939668553024234e-06, "loss": 0.0579, "step": 7995 }, { "epoch": 7.465919701213819, "grad_norm": 0.8848521535451851, "learning_rate": 1.5928599970514586e-06, "loss": 0.0133, "step": 7996 }, { "epoch": 7.466853408029879, "grad_norm": 0.22459380480537236, "learning_rate": 1.5917534504208653e-06, "loss": 0.0019, "step": 7997 }, { "epoch": 7.4677871148459385, "grad_norm": 2.4113720647436456, "learning_rate": 1.5906472155118486e-06, "loss": 0.0669, "step": 7998 }, { "epoch": 7.468720821661998, "grad_norm": 0.8539784180509041, "learning_rate": 1.589541292425586e-06, "loss": 0.0102, "step": 7999 }, { "epoch": 7.469654528478058, "grad_norm": 2.167962098786289, "learning_rate": 1.588435681263224e-06, "loss": 0.0498, "step": 8000 }, { "epoch": 7.470588235294118, "grad_norm": 1.049516240655064, "learning_rate": 1.5873303821258862e-06, "loss": 0.0308, "step": 8001 }, { "epoch": 7.4715219421101775, "grad_norm": 1.1250797814718707, "learning_rate": 1.5862253951146633e-06, "loss": 0.0195, "step": 8002 }, { "epoch": 7.472455648926237, "grad_norm": 2.538221589764174, "learning_rate": 1.5851207203306167e-06, "loss": 0.0619, "step": 8003 }, { "epoch": 7.473389355742297, "grad_norm": 1.1923250468301687, "learning_rate": 1.5840163578747842e-06, "loss": 0.041, "step": 8004 }, { "epoch": 7.474323062558357, "grad_norm": 1.434592576768423, "learning_rate": 1.58291230784817e-06, "loss": 0.0213, "step": 8005 }, { "epoch": 7.4752567693744165, "grad_norm": 1.2755726995482009, "learning_rate": 1.5818085703517506e-06, "loss": 0.0241, "step": 8006 }, { "epoch": 7.476190476190476, "grad_norm": 1.3314484091476868, "learning_rate": 1.5807051454864774e-06, "loss": 0.0269, "step": 8007 }, { "epoch": 7.477124183006536, "grad_norm": 5.491022578632674, "learning_rate": 1.5796020333532696e-06, "loss": 0.1607, "step": 8008 }, { "epoch": 7.478057889822596, "grad_norm": 0.584465476636573, "learning_rate": 1.5784992340530187e-06, "loss": 0.0128, "step": 8009 }, { "epoch": 7.4789915966386555, "grad_norm": 0.8871597478863997, "learning_rate": 1.577396747686586e-06, "loss": 0.018, "step": 8010 }, { "epoch": 7.479925303454715, "grad_norm": 1.5342115322878194, "learning_rate": 1.5762945743548097e-06, "loss": 0.0401, "step": 8011 }, { "epoch": 7.480859010270775, "grad_norm": 1.1479677187929058, "learning_rate": 1.5751927141584939e-06, "loss": 0.0243, "step": 8012 }, { "epoch": 7.481792717086835, "grad_norm": 1.6482705581915293, "learning_rate": 1.5740911671984155e-06, "loss": 0.0161, "step": 8013 }, { "epoch": 7.4827264239028946, "grad_norm": 1.7642654210372932, "learning_rate": 1.5729899335753224e-06, "loss": 0.0645, "step": 8014 }, { "epoch": 7.483660130718954, "grad_norm": 1.3247173686671356, "learning_rate": 1.571889013389936e-06, "loss": 0.01, "step": 8015 }, { "epoch": 7.484593837535014, "grad_norm": 2.9512811350006287, "learning_rate": 1.5707884067429474e-06, "loss": 0.1186, "step": 8016 }, { "epoch": 7.485527544351074, "grad_norm": 0.6594641881902743, "learning_rate": 1.569688113735019e-06, "loss": 0.0066, "step": 8017 }, { "epoch": 7.486461251167134, "grad_norm": 1.4988644984492805, "learning_rate": 1.5685881344667842e-06, "loss": 0.0282, "step": 8018 }, { "epoch": 7.487394957983193, "grad_norm": 0.6547665704441745, "learning_rate": 1.5674884690388464e-06, "loss": 0.0137, "step": 8019 }, { "epoch": 7.488328664799253, "grad_norm": 0.7140835656334922, "learning_rate": 1.5663891175517864e-06, "loss": 0.0089, "step": 8020 }, { "epoch": 7.489262371615313, "grad_norm": 1.5431763877275089, "learning_rate": 1.5652900801061488e-06, "loss": 0.0336, "step": 8021 }, { "epoch": 7.490196078431373, "grad_norm": 0.5722160985244692, "learning_rate": 1.5641913568024542e-06, "loss": 0.0171, "step": 8022 }, { "epoch": 7.491129785247432, "grad_norm": 0.4260525810110259, "learning_rate": 1.5630929477411899e-06, "loss": 0.0072, "step": 8023 }, { "epoch": 7.492063492063492, "grad_norm": 2.8007213075402073, "learning_rate": 1.561994853022822e-06, "loss": 0.0724, "step": 8024 }, { "epoch": 7.492997198879552, "grad_norm": 1.4901977414075367, "learning_rate": 1.5608970727477807e-06, "loss": 0.0132, "step": 8025 }, { "epoch": 7.493930905695612, "grad_norm": 5.708781947422821, "learning_rate": 1.5597996070164706e-06, "loss": 0.1776, "step": 8026 }, { "epoch": 7.494864612511671, "grad_norm": 1.6842692182389778, "learning_rate": 1.5587024559292651e-06, "loss": 0.0563, "step": 8027 }, { "epoch": 7.495798319327731, "grad_norm": 4.1695877487496205, "learning_rate": 1.557605619586514e-06, "loss": 0.1536, "step": 8028 }, { "epoch": 7.496732026143791, "grad_norm": 3.6406854007835756, "learning_rate": 1.5565090980885334e-06, "loss": 0.0301, "step": 8029 }, { "epoch": 7.497665732959851, "grad_norm": 0.44734774082919093, "learning_rate": 1.5554128915356126e-06, "loss": 0.0069, "step": 8030 }, { "epoch": 7.49859943977591, "grad_norm": 3.7846812741071583, "learning_rate": 1.55431700002801e-06, "loss": 0.0841, "step": 8031 }, { "epoch": 7.49953314659197, "grad_norm": 0.7572799873364041, "learning_rate": 1.5532214236659592e-06, "loss": 0.0121, "step": 8032 }, { "epoch": 7.50046685340803, "grad_norm": 1.3204134615848757, "learning_rate": 1.5521261625496625e-06, "loss": 0.0487, "step": 8033 }, { "epoch": 7.50140056022409, "grad_norm": 1.073781960676786, "learning_rate": 1.551031216779293e-06, "loss": 0.0208, "step": 8034 }, { "epoch": 7.502334267040149, "grad_norm": 1.3133061160396522, "learning_rate": 1.5499365864549949e-06, "loss": 0.0177, "step": 8035 }, { "epoch": 7.503267973856209, "grad_norm": 2.3913670814968557, "learning_rate": 1.5488422716768831e-06, "loss": 0.0937, "step": 8036 }, { "epoch": 7.504201680672269, "grad_norm": 0.5342912594884552, "learning_rate": 1.5477482725450481e-06, "loss": 0.0095, "step": 8037 }, { "epoch": 7.505135387488329, "grad_norm": 4.2289437547232875, "learning_rate": 1.5466545891595464e-06, "loss": 0.089, "step": 8038 }, { "epoch": 7.506069094304388, "grad_norm": 0.38472014994503423, "learning_rate": 1.5455612216204069e-06, "loss": 0.0058, "step": 8039 }, { "epoch": 7.507002801120448, "grad_norm": 2.348584227879219, "learning_rate": 1.5444681700276293e-06, "loss": 0.0882, "step": 8040 }, { "epoch": 7.507936507936508, "grad_norm": 1.1968996011196658, "learning_rate": 1.5433754344811875e-06, "loss": 0.0255, "step": 8041 }, { "epoch": 7.508870214752568, "grad_norm": 2.0204784484228218, "learning_rate": 1.5422830150810237e-06, "loss": 0.0665, "step": 8042 }, { "epoch": 7.509803921568627, "grad_norm": 1.7798231369159967, "learning_rate": 1.5411909119270502e-06, "loss": 0.0693, "step": 8043 }, { "epoch": 7.510737628384687, "grad_norm": 2.052921967021593, "learning_rate": 1.540099125119151e-06, "loss": 0.0714, "step": 8044 }, { "epoch": 7.511671335200747, "grad_norm": 1.9582548713778616, "learning_rate": 1.5390076547571852e-06, "loss": 0.0499, "step": 8045 }, { "epoch": 7.512605042016807, "grad_norm": 2.577388884973728, "learning_rate": 1.5379165009409785e-06, "loss": 0.0128, "step": 8046 }, { "epoch": 7.513538748832866, "grad_norm": 2.5551532620988158, "learning_rate": 1.5368256637703283e-06, "loss": 0.0722, "step": 8047 }, { "epoch": 7.514472455648926, "grad_norm": 0.4541552434426827, "learning_rate": 1.5357351433450036e-06, "loss": 0.0133, "step": 8048 }, { "epoch": 7.515406162464986, "grad_norm": 0.842075806792899, "learning_rate": 1.5346449397647434e-06, "loss": 0.0191, "step": 8049 }, { "epoch": 7.516339869281046, "grad_norm": 1.6274536551029348, "learning_rate": 1.5335550531292614e-06, "loss": 0.046, "step": 8050 }, { "epoch": 7.5172735760971054, "grad_norm": 0.5916772204817135, "learning_rate": 1.5324654835382386e-06, "loss": 0.0109, "step": 8051 }, { "epoch": 7.518207282913165, "grad_norm": 3.0434380391391986, "learning_rate": 1.5313762310913277e-06, "loss": 0.0335, "step": 8052 }, { "epoch": 7.519140989729225, "grad_norm": 0.9425657552602594, "learning_rate": 1.5302872958881515e-06, "loss": 0.0243, "step": 8053 }, { "epoch": 7.520074696545285, "grad_norm": 1.9221664351819268, "learning_rate": 1.5291986780283087e-06, "loss": 0.0853, "step": 8054 }, { "epoch": 7.5210084033613445, "grad_norm": 3.6654906788850616, "learning_rate": 1.5281103776113626e-06, "loss": 0.1597, "step": 8055 }, { "epoch": 7.521942110177404, "grad_norm": 4.282232141532761, "learning_rate": 1.5270223947368512e-06, "loss": 0.1121, "step": 8056 }, { "epoch": 7.522875816993464, "grad_norm": 3.4196666551619725, "learning_rate": 1.5259347295042804e-06, "loss": 0.122, "step": 8057 }, { "epoch": 7.523809523809524, "grad_norm": 1.4606573798292648, "learning_rate": 1.5248473820131327e-06, "loss": 0.0424, "step": 8058 }, { "epoch": 7.5247432306255835, "grad_norm": 1.10900525544451, "learning_rate": 1.5237603523628559e-06, "loss": 0.0329, "step": 8059 }, { "epoch": 7.525676937441643, "grad_norm": 1.1787541669706225, "learning_rate": 1.522673640652871e-06, "loss": 0.0153, "step": 8060 }, { "epoch": 7.526610644257703, "grad_norm": 0.2471508373726914, "learning_rate": 1.5215872469825682e-06, "loss": 0.0009, "step": 8061 }, { "epoch": 7.527544351073763, "grad_norm": 2.7281121629613696, "learning_rate": 1.5205011714513136e-06, "loss": 0.1145, "step": 8062 }, { "epoch": 7.5284780578898225, "grad_norm": 1.6904921630872483, "learning_rate": 1.5194154141584384e-06, "loss": 0.0404, "step": 8063 }, { "epoch": 7.529411764705882, "grad_norm": 2.868257415338219, "learning_rate": 1.5183299752032476e-06, "loss": 0.1049, "step": 8064 }, { "epoch": 7.530345471521942, "grad_norm": 2.0425576610578284, "learning_rate": 1.5172448546850166e-06, "loss": 0.0671, "step": 8065 }, { "epoch": 7.531279178338002, "grad_norm": 0.987359412344114, "learning_rate": 1.5161600527029896e-06, "loss": 0.0294, "step": 8066 }, { "epoch": 7.5322128851540615, "grad_norm": 3.8233836783156527, "learning_rate": 1.5150755693563873e-06, "loss": 0.065, "step": 8067 }, { "epoch": 7.533146591970121, "grad_norm": 0.8351257448983104, "learning_rate": 1.5139914047443954e-06, "loss": 0.013, "step": 8068 }, { "epoch": 7.534080298786181, "grad_norm": 1.4002965535802385, "learning_rate": 1.5129075589661714e-06, "loss": 0.037, "step": 8069 }, { "epoch": 7.535014005602241, "grad_norm": 1.0617317804853539, "learning_rate": 1.5118240321208483e-06, "loss": 0.0282, "step": 8070 }, { "epoch": 7.5359477124183005, "grad_norm": 1.8904826820498613, "learning_rate": 1.5107408243075245e-06, "loss": 0.043, "step": 8071 }, { "epoch": 7.53688141923436, "grad_norm": 1.0643060579091064, "learning_rate": 1.5096579356252716e-06, "loss": 0.0401, "step": 8072 }, { "epoch": 7.53781512605042, "grad_norm": 3.1633588483794974, "learning_rate": 1.50857536617313e-06, "loss": 0.1456, "step": 8073 }, { "epoch": 7.53874883286648, "grad_norm": 0.5338753639585233, "learning_rate": 1.5074931160501155e-06, "loss": 0.0063, "step": 8074 }, { "epoch": 7.5396825396825395, "grad_norm": 3.5891966519992873, "learning_rate": 1.5064111853552104e-06, "loss": 0.0765, "step": 8075 }, { "epoch": 7.540616246498599, "grad_norm": 1.5836210487825226, "learning_rate": 1.505329574187368e-06, "loss": 0.0247, "step": 8076 }, { "epoch": 7.541549953314659, "grad_norm": 2.6975207848757776, "learning_rate": 1.5042482826455157e-06, "loss": 0.0816, "step": 8077 }, { "epoch": 7.542483660130719, "grad_norm": 1.5070946708337885, "learning_rate": 1.5031673108285489e-06, "loss": 0.0356, "step": 8078 }, { "epoch": 7.543417366946779, "grad_norm": 1.9266573185153553, "learning_rate": 1.5020866588353334e-06, "loss": 0.0594, "step": 8079 }, { "epoch": 7.544351073762838, "grad_norm": 2.8526902749116454, "learning_rate": 1.5010063267647051e-06, "loss": 0.0658, "step": 8080 }, { "epoch": 7.545284780578898, "grad_norm": 0.3039447144450061, "learning_rate": 1.4999263147154769e-06, "loss": 0.0069, "step": 8081 }, { "epoch": 7.546218487394958, "grad_norm": 1.430683802675548, "learning_rate": 1.4988466227864246e-06, "loss": 0.0337, "step": 8082 }, { "epoch": 7.547152194211018, "grad_norm": 1.2415403063606434, "learning_rate": 1.497767251076297e-06, "loss": 0.0331, "step": 8083 }, { "epoch": 7.548085901027077, "grad_norm": 5.5503536637884965, "learning_rate": 1.4966881996838172e-06, "loss": 0.1353, "step": 8084 }, { "epoch": 7.549019607843137, "grad_norm": 2.577304279358776, "learning_rate": 1.495609468707675e-06, "loss": 0.0515, "step": 8085 }, { "epoch": 7.549953314659197, "grad_norm": 3.4319142952629584, "learning_rate": 1.4945310582465328e-06, "loss": 0.1045, "step": 8086 }, { "epoch": 7.550887021475257, "grad_norm": 1.4892652603793852, "learning_rate": 1.4934529683990201e-06, "loss": 0.0356, "step": 8087 }, { "epoch": 7.551820728291316, "grad_norm": 1.6246667212066408, "learning_rate": 1.4923751992637447e-06, "loss": 0.0452, "step": 8088 }, { "epoch": 7.552754435107376, "grad_norm": 2.223870691786409, "learning_rate": 1.491297750939278e-06, "loss": 0.0611, "step": 8089 }, { "epoch": 7.553688141923436, "grad_norm": 1.6658053231691767, "learning_rate": 1.4902206235241644e-06, "loss": 0.0373, "step": 8090 }, { "epoch": 7.554621848739496, "grad_norm": 1.92631637663821, "learning_rate": 1.4891438171169182e-06, "loss": 0.0442, "step": 8091 }, { "epoch": 7.555555555555555, "grad_norm": 1.9072720897702171, "learning_rate": 1.4880673318160277e-06, "loss": 0.0502, "step": 8092 }, { "epoch": 7.556489262371615, "grad_norm": 2.402324081435901, "learning_rate": 1.4869911677199479e-06, "loss": 0.0387, "step": 8093 }, { "epoch": 7.557422969187675, "grad_norm": 2.935735819797748, "learning_rate": 1.4859153249271052e-06, "loss": 0.0599, "step": 8094 }, { "epoch": 7.558356676003735, "grad_norm": 3.6688730907141274, "learning_rate": 1.4848398035358968e-06, "loss": 0.1601, "step": 8095 }, { "epoch": 7.559290382819794, "grad_norm": 0.8137257130801239, "learning_rate": 1.4837646036446934e-06, "loss": 0.0176, "step": 8096 }, { "epoch": 7.560224089635854, "grad_norm": 2.601322199634744, "learning_rate": 1.4826897253518324e-06, "loss": 0.0805, "step": 8097 }, { "epoch": 7.561157796451914, "grad_norm": 0.5456918025522716, "learning_rate": 1.481615168755623e-06, "loss": 0.0116, "step": 8098 }, { "epoch": 7.562091503267974, "grad_norm": 0.5267313381111628, "learning_rate": 1.4805409339543459e-06, "loss": 0.0127, "step": 8099 }, { "epoch": 7.563025210084033, "grad_norm": 4.997289273729265, "learning_rate": 1.4794670210462492e-06, "loss": 0.2159, "step": 8100 }, { "epoch": 7.563958916900093, "grad_norm": 5.376154632460682, "learning_rate": 1.4783934301295577e-06, "loss": 0.2103, "step": 8101 }, { "epoch": 7.564892623716153, "grad_norm": 1.003321883471597, "learning_rate": 1.477320161302462e-06, "loss": 0.0103, "step": 8102 }, { "epoch": 7.565826330532213, "grad_norm": 1.4114008656489108, "learning_rate": 1.476247214663123e-06, "loss": 0.0523, "step": 8103 }, { "epoch": 7.566760037348272, "grad_norm": 1.6299141170072884, "learning_rate": 1.4751745903096731e-06, "loss": 0.0464, "step": 8104 }, { "epoch": 7.567693744164332, "grad_norm": 1.3915365516474723, "learning_rate": 1.4741022883402178e-06, "loss": 0.0386, "step": 8105 }, { "epoch": 7.568627450980392, "grad_norm": 2.7526190638465096, "learning_rate": 1.4730303088528298e-06, "loss": 0.0616, "step": 8106 }, { "epoch": 7.569561157796452, "grad_norm": 3.092364392700443, "learning_rate": 1.4719586519455536e-06, "loss": 0.0984, "step": 8107 }, { "epoch": 7.570494864612511, "grad_norm": 0.31778189636189663, "learning_rate": 1.4708873177164023e-06, "loss": 0.0027, "step": 8108 }, { "epoch": 7.571428571428571, "grad_norm": 1.3651351377707601, "learning_rate": 1.4698163062633637e-06, "loss": 0.0387, "step": 8109 }, { "epoch": 7.572362278244631, "grad_norm": 2.426733509097771, "learning_rate": 1.4687456176843929e-06, "loss": 0.1123, "step": 8110 }, { "epoch": 7.573295985060691, "grad_norm": 0.8574570308601885, "learning_rate": 1.4676752520774156e-06, "loss": 0.0146, "step": 8111 }, { "epoch": 7.57422969187675, "grad_norm": 0.8432384131291986, "learning_rate": 1.4666052095403272e-06, "loss": 0.017, "step": 8112 }, { "epoch": 7.57516339869281, "grad_norm": 0.4199582479634345, "learning_rate": 1.4655354901709973e-06, "loss": 0.0105, "step": 8113 }, { "epoch": 7.57609710550887, "grad_norm": 2.5852138511007476, "learning_rate": 1.4644660940672628e-06, "loss": 0.069, "step": 8114 }, { "epoch": 7.57703081232493, "grad_norm": 1.8868050525323279, "learning_rate": 1.4633970213269311e-06, "loss": 0.0516, "step": 8115 }, { "epoch": 7.5779645191409895, "grad_norm": 2.3952454990638183, "learning_rate": 1.4623282720477812e-06, "loss": 0.0628, "step": 8116 }, { "epoch": 7.578898225957049, "grad_norm": 0.5545650705840619, "learning_rate": 1.46125984632756e-06, "loss": 0.0082, "step": 8117 }, { "epoch": 7.579831932773109, "grad_norm": 1.2399491744217912, "learning_rate": 1.4601917442639902e-06, "loss": 0.0265, "step": 8118 }, { "epoch": 7.580765639589169, "grad_norm": 0.8000589627188069, "learning_rate": 1.459123965954759e-06, "loss": 0.0201, "step": 8119 }, { "epoch": 7.5816993464052285, "grad_norm": 0.8533588912892967, "learning_rate": 1.4580565114975276e-06, "loss": 0.0218, "step": 8120 }, { "epoch": 7.582633053221288, "grad_norm": 0.8764467608769808, "learning_rate": 1.4569893809899244e-06, "loss": 0.0171, "step": 8121 }, { "epoch": 7.583566760037348, "grad_norm": 5.942426744005501, "learning_rate": 1.4559225745295536e-06, "loss": 0.1855, "step": 8122 }, { "epoch": 7.584500466853408, "grad_norm": 1.3425702439349263, "learning_rate": 1.4548560922139838e-06, "loss": 0.0353, "step": 8123 }, { "epoch": 7.5854341736694675, "grad_norm": 0.05217082368299294, "learning_rate": 1.4537899341407579e-06, "loss": 0.0002, "step": 8124 }, { "epoch": 7.586367880485527, "grad_norm": 0.7969360739650522, "learning_rate": 1.452724100407385e-06, "loss": 0.0121, "step": 8125 }, { "epoch": 7.587301587301587, "grad_norm": 3.099994881395029, "learning_rate": 1.4516585911113512e-06, "loss": 0.0708, "step": 8126 }, { "epoch": 7.588235294117647, "grad_norm": 4.811612209498983, "learning_rate": 1.4505934063501077e-06, "loss": 0.136, "step": 8127 }, { "epoch": 7.5891690009337065, "grad_norm": 1.4831355653232572, "learning_rate": 1.4495285462210763e-06, "loss": 0.0526, "step": 8128 }, { "epoch": 7.590102707749766, "grad_norm": 2.056426139116385, "learning_rate": 1.4484640108216491e-06, "loss": 0.0493, "step": 8129 }, { "epoch": 7.591036414565826, "grad_norm": 0.42707634141807654, "learning_rate": 1.4473998002491935e-06, "loss": 0.0111, "step": 8130 }, { "epoch": 7.591970121381886, "grad_norm": 0.8133210746399901, "learning_rate": 1.4463359146010403e-06, "loss": 0.0082, "step": 8131 }, { "epoch": 7.5929038281979455, "grad_norm": 2.1983342753406028, "learning_rate": 1.4452723539744946e-06, "loss": 0.1089, "step": 8132 }, { "epoch": 7.593837535014005, "grad_norm": 1.483345210818886, "learning_rate": 1.4442091184668294e-06, "loss": 0.0337, "step": 8133 }, { "epoch": 7.594771241830065, "grad_norm": 1.038546697621675, "learning_rate": 1.4431462081752896e-06, "loss": 0.0341, "step": 8134 }, { "epoch": 7.595704948646125, "grad_norm": 2.879274041068373, "learning_rate": 1.4420836231970914e-06, "loss": 0.0868, "step": 8135 }, { "epoch": 7.5966386554621845, "grad_norm": 2.113993826374148, "learning_rate": 1.4410213636294195e-06, "loss": 0.055, "step": 8136 }, { "epoch": 7.597572362278244, "grad_norm": 0.7793281877965497, "learning_rate": 1.4399594295694286e-06, "loss": 0.0181, "step": 8137 }, { "epoch": 7.598506069094304, "grad_norm": 2.748757934997435, "learning_rate": 1.4388978211142424e-06, "loss": 0.0661, "step": 8138 }, { "epoch": 7.599439775910364, "grad_norm": 2.99308216520648, "learning_rate": 1.4378365383609606e-06, "loss": 0.0989, "step": 8139 }, { "epoch": 7.6003734827264235, "grad_norm": 4.828997217862348, "learning_rate": 1.436775581406647e-06, "loss": 0.1852, "step": 8140 }, { "epoch": 7.601307189542483, "grad_norm": 3.705498657431451, "learning_rate": 1.4357149503483382e-06, "loss": 0.1514, "step": 8141 }, { "epoch": 7.602240896358543, "grad_norm": 8.91694995408876, "learning_rate": 1.4346546452830379e-06, "loss": 0.1733, "step": 8142 }, { "epoch": 7.603174603174603, "grad_norm": 1.8746439774309516, "learning_rate": 1.4335946663077271e-06, "loss": 0.0675, "step": 8143 }, { "epoch": 7.604108309990663, "grad_norm": 1.2516813956874313, "learning_rate": 1.4325350135193505e-06, "loss": 0.0183, "step": 8144 }, { "epoch": 7.605042016806722, "grad_norm": 1.0372547296887988, "learning_rate": 1.431475687014825e-06, "loss": 0.0225, "step": 8145 }, { "epoch": 7.605975723622782, "grad_norm": 0.30649239589487565, "learning_rate": 1.4304166868910357e-06, "loss": 0.0058, "step": 8146 }, { "epoch": 7.606909430438842, "grad_norm": 5.009650455910757, "learning_rate": 1.4293580132448436e-06, "loss": 0.166, "step": 8147 }, { "epoch": 7.607843137254902, "grad_norm": 2.721723546204013, "learning_rate": 1.428299666173074e-06, "loss": 0.0979, "step": 8148 }, { "epoch": 7.608776844070961, "grad_norm": 1.863312348742331, "learning_rate": 1.427241645772523e-06, "loss": 0.056, "step": 8149 }, { "epoch": 7.609710550887021, "grad_norm": 3.5735674056266156, "learning_rate": 1.4261839521399612e-06, "loss": 0.1146, "step": 8150 }, { "epoch": 7.610644257703081, "grad_norm": 3.105278317594729, "learning_rate": 1.4251265853721241e-06, "loss": 0.153, "step": 8151 }, { "epoch": 7.611577964519141, "grad_norm": 3.6626150417864016, "learning_rate": 1.424069545565721e-06, "loss": 0.189, "step": 8152 }, { "epoch": 7.6125116713352, "grad_norm": 2.1205041508428155, "learning_rate": 1.423012832817427e-06, "loss": 0.0667, "step": 8153 }, { "epoch": 7.61344537815126, "grad_norm": 0.3338350392503466, "learning_rate": 1.4219564472238935e-06, "loss": 0.0051, "step": 8154 }, { "epoch": 7.61437908496732, "grad_norm": 0.5908515476834157, "learning_rate": 1.420900388881737e-06, "loss": 0.0243, "step": 8155 }, { "epoch": 7.61531279178338, "grad_norm": 0.4301618189577297, "learning_rate": 1.4198446578875447e-06, "loss": 0.0065, "step": 8156 }, { "epoch": 7.616246498599439, "grad_norm": 0.8201780344235364, "learning_rate": 1.4187892543378766e-06, "loss": 0.0141, "step": 8157 }, { "epoch": 7.617180205415499, "grad_norm": 1.996951129994786, "learning_rate": 1.4177341783292609e-06, "loss": 0.0455, "step": 8158 }, { "epoch": 7.618113912231559, "grad_norm": 1.4675682216813644, "learning_rate": 1.416679429958195e-06, "loss": 0.0383, "step": 8159 }, { "epoch": 7.619047619047619, "grad_norm": 1.6028975585529508, "learning_rate": 1.4156250093211454e-06, "loss": 0.0476, "step": 8160 }, { "epoch": 7.619981325863678, "grad_norm": 0.77518870532061, "learning_rate": 1.4145709165145538e-06, "loss": 0.0205, "step": 8161 }, { "epoch": 7.620915032679738, "grad_norm": 1.6197596129598502, "learning_rate": 1.4135171516348279e-06, "loss": 0.032, "step": 8162 }, { "epoch": 7.621848739495798, "grad_norm": 2.1292410578563743, "learning_rate": 1.4124637147783431e-06, "loss": 0.0738, "step": 8163 }, { "epoch": 7.622782446311858, "grad_norm": 1.4236042642215059, "learning_rate": 1.4114106060414524e-06, "loss": 0.0407, "step": 8164 }, { "epoch": 7.623716153127917, "grad_norm": 4.145608864827602, "learning_rate": 1.4103578255204713e-06, "loss": 0.0684, "step": 8165 }, { "epoch": 7.624649859943977, "grad_norm": 1.4969253061176746, "learning_rate": 1.4093053733116885e-06, "loss": 0.0432, "step": 8166 }, { "epoch": 7.625583566760037, "grad_norm": 3.8399694128618216, "learning_rate": 1.4082532495113627e-06, "loss": 0.1247, "step": 8167 }, { "epoch": 7.626517273576097, "grad_norm": 1.3891401968325572, "learning_rate": 1.4072014542157203e-06, "loss": 0.0623, "step": 8168 }, { "epoch": 7.627450980392156, "grad_norm": 2.087164255352949, "learning_rate": 1.4061499875209627e-06, "loss": 0.0374, "step": 8169 }, { "epoch": 7.628384687208216, "grad_norm": 1.6174995196512651, "learning_rate": 1.4050988495232566e-06, "loss": 0.0356, "step": 8170 }, { "epoch": 7.629318394024276, "grad_norm": 4.836428499893606, "learning_rate": 1.4040480403187396e-06, "loss": 0.156, "step": 8171 }, { "epoch": 7.630252100840336, "grad_norm": 0.3443106142926606, "learning_rate": 1.402997560003519e-06, "loss": 0.009, "step": 8172 }, { "epoch": 7.631185807656395, "grad_norm": 5.195781430434812, "learning_rate": 1.4019474086736757e-06, "loss": 0.1796, "step": 8173 }, { "epoch": 7.632119514472455, "grad_norm": 1.6581061614738128, "learning_rate": 1.4008975864252556e-06, "loss": 0.0492, "step": 8174 }, { "epoch": 7.633053221288515, "grad_norm": 0.9039074028302129, "learning_rate": 1.3998480933542764e-06, "loss": 0.0195, "step": 8175 }, { "epoch": 7.633986928104575, "grad_norm": 1.1288574990065026, "learning_rate": 1.3987989295567246e-06, "loss": 0.033, "step": 8176 }, { "epoch": 7.634920634920634, "grad_norm": 0.5043617510999088, "learning_rate": 1.397750095128561e-06, "loss": 0.0034, "step": 8177 }, { "epoch": 7.635854341736694, "grad_norm": 12.299682367262172, "learning_rate": 1.3967015901657111e-06, "loss": 0.1173, "step": 8178 }, { "epoch": 7.636788048552754, "grad_norm": 1.912608310481596, "learning_rate": 1.395653414764072e-06, "loss": 0.0654, "step": 8179 }, { "epoch": 7.637721755368814, "grad_norm": 2.5819211445409413, "learning_rate": 1.3946055690195099e-06, "loss": 0.0493, "step": 8180 }, { "epoch": 7.6386554621848735, "grad_norm": 0.3643294594743495, "learning_rate": 1.3935580530278647e-06, "loss": 0.0083, "step": 8181 }, { "epoch": 7.639589169000933, "grad_norm": 0.6735144789261519, "learning_rate": 1.392510866884942e-06, "loss": 0.0144, "step": 8182 }, { "epoch": 7.640522875816993, "grad_norm": 1.383682121651944, "learning_rate": 1.3914640106865173e-06, "loss": 0.0177, "step": 8183 }, { "epoch": 7.641456582633054, "grad_norm": 0.772473469516185, "learning_rate": 1.390417484528338e-06, "loss": 0.0181, "step": 8184 }, { "epoch": 7.642390289449113, "grad_norm": 5.373651952661339, "learning_rate": 1.389371288506119e-06, "loss": 0.0383, "step": 8185 }, { "epoch": 7.643323996265173, "grad_norm": 0.416628422304271, "learning_rate": 1.3883254227155495e-06, "loss": 0.0091, "step": 8186 }, { "epoch": 7.644257703081233, "grad_norm": 0.5894994420504197, "learning_rate": 1.387279887252283e-06, "loss": 0.0135, "step": 8187 }, { "epoch": 7.645191409897293, "grad_norm": 13.035747583551387, "learning_rate": 1.3862346822119465e-06, "loss": 0.1412, "step": 8188 }, { "epoch": 7.646125116713352, "grad_norm": 1.8915970877167239, "learning_rate": 1.3851898076901327e-06, "loss": 0.0477, "step": 8189 }, { "epoch": 7.647058823529412, "grad_norm": 0.7637900408810342, "learning_rate": 1.3841452637824109e-06, "loss": 0.012, "step": 8190 }, { "epoch": 7.647992530345472, "grad_norm": 1.2003267352973417, "learning_rate": 1.383101050584314e-06, "loss": 0.0348, "step": 8191 }, { "epoch": 7.648926237161532, "grad_norm": 4.381724415495786, "learning_rate": 1.3820571681913464e-06, "loss": 0.171, "step": 8192 }, { "epoch": 7.649859943977591, "grad_norm": 2.481355418007614, "learning_rate": 1.3810136166989824e-06, "loss": 0.0747, "step": 8193 }, { "epoch": 7.650793650793651, "grad_norm": 0.44907957603107135, "learning_rate": 1.3799703962026684e-06, "loss": 0.0042, "step": 8194 }, { "epoch": 7.651727357609711, "grad_norm": 1.036247826932838, "learning_rate": 1.3789275067978164e-06, "loss": 0.0109, "step": 8195 }, { "epoch": 7.652661064425771, "grad_norm": 1.9204516171144455, "learning_rate": 1.3778849485798112e-06, "loss": 0.0706, "step": 8196 }, { "epoch": 7.65359477124183, "grad_norm": 2.9991483881506102, "learning_rate": 1.376842721644005e-06, "loss": 0.0853, "step": 8197 }, { "epoch": 7.65452847805789, "grad_norm": 2.3290379033577575, "learning_rate": 1.3758008260857208e-06, "loss": 0.0684, "step": 8198 }, { "epoch": 7.65546218487395, "grad_norm": 1.0131412079274773, "learning_rate": 1.374759262000253e-06, "loss": 0.0252, "step": 8199 }, { "epoch": 7.65639589169001, "grad_norm": 0.5809314421876846, "learning_rate": 1.3737180294828634e-06, "loss": 0.0096, "step": 8200 }, { "epoch": 7.657329598506069, "grad_norm": 1.9034039862757925, "learning_rate": 1.3726771286287844e-06, "loss": 0.0501, "step": 8201 }, { "epoch": 7.658263305322129, "grad_norm": 1.1596484791000428, "learning_rate": 1.3716365595332154e-06, "loss": 0.0302, "step": 8202 }, { "epoch": 7.659197012138189, "grad_norm": 2.0823020548416524, "learning_rate": 1.370596322291331e-06, "loss": 0.0514, "step": 8203 }, { "epoch": 7.660130718954249, "grad_norm": 1.511915772138185, "learning_rate": 1.3695564169982716e-06, "loss": 0.0385, "step": 8204 }, { "epoch": 7.661064425770308, "grad_norm": 0.7582885812292463, "learning_rate": 1.3685168437491476e-06, "loss": 0.0114, "step": 8205 }, { "epoch": 7.661998132586368, "grad_norm": 2.094692783084729, "learning_rate": 1.367477602639038e-06, "loss": 0.0712, "step": 8206 }, { "epoch": 7.662931839402428, "grad_norm": 2.1076597565142463, "learning_rate": 1.3664386937629952e-06, "loss": 0.0444, "step": 8207 }, { "epoch": 7.663865546218488, "grad_norm": 2.7206098854551994, "learning_rate": 1.3654001172160374e-06, "loss": 0.089, "step": 8208 }, { "epoch": 7.6647992530345475, "grad_norm": 1.114206005266497, "learning_rate": 1.364361873093155e-06, "loss": 0.0317, "step": 8209 }, { "epoch": 7.665732959850607, "grad_norm": 1.032215131117081, "learning_rate": 1.363323961489304e-06, "loss": 0.0418, "step": 8210 }, { "epoch": 7.666666666666667, "grad_norm": 3.396912493087018, "learning_rate": 1.3622863824994166e-06, "loss": 0.0869, "step": 8211 }, { "epoch": 7.667600373482727, "grad_norm": 0.3638635713214444, "learning_rate": 1.3612491362183887e-06, "loss": 0.0117, "step": 8212 }, { "epoch": 7.6685340802987865, "grad_norm": 3.581604626220023, "learning_rate": 1.3602122227410886e-06, "loss": 0.1075, "step": 8213 }, { "epoch": 7.669467787114846, "grad_norm": 4.1672529301034364, "learning_rate": 1.3591756421623525e-06, "loss": 0.21, "step": 8214 }, { "epoch": 7.670401493930906, "grad_norm": 1.897038066993553, "learning_rate": 1.358139394576986e-06, "loss": 0.0719, "step": 8215 }, { "epoch": 7.671335200746966, "grad_norm": 2.9739973040297176, "learning_rate": 1.357103480079769e-06, "loss": 0.1729, "step": 8216 }, { "epoch": 7.6722689075630255, "grad_norm": 1.6829557730460483, "learning_rate": 1.3560678987654447e-06, "loss": 0.0492, "step": 8217 }, { "epoch": 7.673202614379085, "grad_norm": 4.597715503863674, "learning_rate": 1.3550326507287293e-06, "loss": 0.222, "step": 8218 }, { "epoch": 7.674136321195145, "grad_norm": 0.2043302172609167, "learning_rate": 1.3539977360643054e-06, "loss": 0.0029, "step": 8219 }, { "epoch": 7.675070028011205, "grad_norm": 2.5402988951964898, "learning_rate": 1.3529631548668298e-06, "loss": 0.0898, "step": 8220 }, { "epoch": 7.6760037348272645, "grad_norm": 1.7878986809126247, "learning_rate": 1.3519289072309267e-06, "loss": 0.067, "step": 8221 }, { "epoch": 7.676937441643324, "grad_norm": 1.8315535408473755, "learning_rate": 1.3508949932511878e-06, "loss": 0.0683, "step": 8222 }, { "epoch": 7.677871148459384, "grad_norm": 5.496771372833191, "learning_rate": 1.349861413022175e-06, "loss": 0.2361, "step": 8223 }, { "epoch": 7.678804855275444, "grad_norm": 3.1935330648716342, "learning_rate": 1.348828166638423e-06, "loss": 0.1219, "step": 8224 }, { "epoch": 7.6797385620915035, "grad_norm": 1.181096220002712, "learning_rate": 1.3477952541944328e-06, "loss": 0.0348, "step": 8225 }, { "epoch": 7.680672268907563, "grad_norm": 2.852633069774177, "learning_rate": 1.3467626757846735e-06, "loss": 0.1119, "step": 8226 }, { "epoch": 7.681605975723623, "grad_norm": 0.28425590224393465, "learning_rate": 1.345730431503589e-06, "loss": 0.0058, "step": 8227 }, { "epoch": 7.682539682539683, "grad_norm": 1.0626571005437193, "learning_rate": 1.3446985214455883e-06, "loss": 0.0353, "step": 8228 }, { "epoch": 7.6834733893557425, "grad_norm": 3.224671840084194, "learning_rate": 1.3436669457050482e-06, "loss": 0.1199, "step": 8229 }, { "epoch": 7.684407096171802, "grad_norm": 0.46435316348710187, "learning_rate": 1.3426357043763221e-06, "loss": 0.0092, "step": 8230 }, { "epoch": 7.685340802987862, "grad_norm": 3.2304992113187034, "learning_rate": 1.3416047975537255e-06, "loss": 0.11, "step": 8231 }, { "epoch": 7.686274509803922, "grad_norm": 2.200687580648171, "learning_rate": 1.3405742253315475e-06, "loss": 0.0526, "step": 8232 }, { "epoch": 7.6872082166199815, "grad_norm": 0.720316438413371, "learning_rate": 1.3395439878040423e-06, "loss": 0.0203, "step": 8233 }, { "epoch": 7.688141923436041, "grad_norm": 0.38837605927787583, "learning_rate": 1.3385140850654404e-06, "loss": 0.0051, "step": 8234 }, { "epoch": 7.689075630252101, "grad_norm": 2.9246507966661155, "learning_rate": 1.337484517209936e-06, "loss": 0.1488, "step": 8235 }, { "epoch": 7.690009337068161, "grad_norm": 2.389719652296306, "learning_rate": 1.336455284331693e-06, "loss": 0.0985, "step": 8236 }, { "epoch": 7.690943043884221, "grad_norm": 6.8059634531778075, "learning_rate": 1.335426386524849e-06, "loss": 0.122, "step": 8237 }, { "epoch": 7.69187675070028, "grad_norm": 0.5363167822789187, "learning_rate": 1.3343978238835065e-06, "loss": 0.0151, "step": 8238 }, { "epoch": 7.69281045751634, "grad_norm": 2.8990471890391083, "learning_rate": 1.3333695965017385e-06, "loss": 0.1201, "step": 8239 }, { "epoch": 7.6937441643324, "grad_norm": 1.1809348855035438, "learning_rate": 1.3323417044735865e-06, "loss": 0.0282, "step": 8240 }, { "epoch": 7.69467787114846, "grad_norm": 2.609278862675812, "learning_rate": 1.3313141478930652e-06, "loss": 0.1005, "step": 8241 }, { "epoch": 7.695611577964519, "grad_norm": 1.862942393740014, "learning_rate": 1.3302869268541552e-06, "loss": 0.056, "step": 8242 }, { "epoch": 7.696545284780579, "grad_norm": 1.933289728868735, "learning_rate": 1.329260041450806e-06, "loss": 0.0456, "step": 8243 }, { "epoch": 7.697478991596639, "grad_norm": 3.5383174471098355, "learning_rate": 1.3282334917769375e-06, "loss": 0.1212, "step": 8244 }, { "epoch": 7.698412698412699, "grad_norm": 0.6549991681971388, "learning_rate": 1.3272072779264406e-06, "loss": 0.0167, "step": 8245 }, { "epoch": 7.699346405228758, "grad_norm": 1.240934653984535, "learning_rate": 1.3261813999931727e-06, "loss": 0.0243, "step": 8246 }, { "epoch": 7.700280112044818, "grad_norm": 9.45307585408121, "learning_rate": 1.3251558580709623e-06, "loss": 0.0627, "step": 8247 }, { "epoch": 7.701213818860878, "grad_norm": 5.5297316582679965, "learning_rate": 1.3241306522536057e-06, "loss": 0.1967, "step": 8248 }, { "epoch": 7.702147525676938, "grad_norm": 2.9613604502570645, "learning_rate": 1.3231057826348676e-06, "loss": 0.0723, "step": 8249 }, { "epoch": 7.703081232492997, "grad_norm": 1.3445342480752083, "learning_rate": 1.3220812493084878e-06, "loss": 0.0396, "step": 8250 }, { "epoch": 7.704014939309057, "grad_norm": 0.9275691953347596, "learning_rate": 1.3210570523681682e-06, "loss": 0.0297, "step": 8251 }, { "epoch": 7.704948646125117, "grad_norm": 1.0882157731786672, "learning_rate": 1.3200331919075832e-06, "loss": 0.0128, "step": 8252 }, { "epoch": 7.705882352941177, "grad_norm": 2.6521816559477878, "learning_rate": 1.319009668020375e-06, "loss": 0.142, "step": 8253 }, { "epoch": 7.706816059757236, "grad_norm": 0.4810147556490678, "learning_rate": 1.3179864808001592e-06, "loss": 0.0057, "step": 8254 }, { "epoch": 7.707749766573296, "grad_norm": 2.572562292979241, "learning_rate": 1.3169636303405153e-06, "loss": 0.1113, "step": 8255 }, { "epoch": 7.708683473389356, "grad_norm": 1.8875228049084423, "learning_rate": 1.315941116734995e-06, "loss": 0.0852, "step": 8256 }, { "epoch": 7.709617180205416, "grad_norm": 1.6656712928512372, "learning_rate": 1.3149189400771168e-06, "loss": 0.0294, "step": 8257 }, { "epoch": 7.710550887021475, "grad_norm": 2.093579620151193, "learning_rate": 1.3138971004603722e-06, "loss": 0.0886, "step": 8258 }, { "epoch": 7.711484593837535, "grad_norm": 0.631937718498149, "learning_rate": 1.3128755979782187e-06, "loss": 0.0065, "step": 8259 }, { "epoch": 7.712418300653595, "grad_norm": 0.9194625549140938, "learning_rate": 1.311854432724084e-06, "loss": 0.0235, "step": 8260 }, { "epoch": 7.713352007469655, "grad_norm": 2.81673571108351, "learning_rate": 1.3108336047913633e-06, "loss": 0.0871, "step": 8261 }, { "epoch": 7.714285714285714, "grad_norm": 3.333196208806393, "learning_rate": 1.3098131142734255e-06, "loss": 0.0753, "step": 8262 }, { "epoch": 7.715219421101774, "grad_norm": 1.4222578139669857, "learning_rate": 1.3087929612636041e-06, "loss": 0.0463, "step": 8263 }, { "epoch": 7.716153127917834, "grad_norm": 1.125621184015679, "learning_rate": 1.3077731458552029e-06, "loss": 0.0304, "step": 8264 }, { "epoch": 7.717086834733894, "grad_norm": 3.4310170942000546, "learning_rate": 1.306753668141496e-06, "loss": 0.0844, "step": 8265 }, { "epoch": 7.718020541549953, "grad_norm": 7.803111460595282, "learning_rate": 1.3057345282157242e-06, "loss": 0.0553, "step": 8266 }, { "epoch": 7.718954248366013, "grad_norm": 3.0079100081520935, "learning_rate": 1.3047157261711018e-06, "loss": 0.0926, "step": 8267 }, { "epoch": 7.719887955182073, "grad_norm": 0.7603353824615521, "learning_rate": 1.303697262100808e-06, "loss": 0.0221, "step": 8268 }, { "epoch": 7.720821661998133, "grad_norm": 0.8934192254338398, "learning_rate": 1.3026791360979923e-06, "loss": 0.0257, "step": 8269 }, { "epoch": 7.721755368814192, "grad_norm": 0.9649234915198596, "learning_rate": 1.3016613482557721e-06, "loss": 0.0151, "step": 8270 }, { "epoch": 7.722689075630252, "grad_norm": 1.0758962579014344, "learning_rate": 1.3006438986672386e-06, "loss": 0.0321, "step": 8271 }, { "epoch": 7.723622782446312, "grad_norm": 0.5570527591630995, "learning_rate": 1.299626787425447e-06, "loss": 0.0189, "step": 8272 }, { "epoch": 7.724556489262372, "grad_norm": 3.693282159356863, "learning_rate": 1.298610014623423e-06, "loss": 0.1191, "step": 8273 }, { "epoch": 7.7254901960784315, "grad_norm": 4.643400004645711, "learning_rate": 1.297593580354161e-06, "loss": 0.1418, "step": 8274 }, { "epoch": 7.726423902894491, "grad_norm": 1.063780439823184, "learning_rate": 1.2965774847106273e-06, "loss": 0.0181, "step": 8275 }, { "epoch": 7.727357609710551, "grad_norm": 1.908472860303251, "learning_rate": 1.2955617277857536e-06, "loss": 0.0579, "step": 8276 }, { "epoch": 7.728291316526611, "grad_norm": 1.132312100849606, "learning_rate": 1.2945463096724425e-06, "loss": 0.0205, "step": 8277 }, { "epoch": 7.7292250233426705, "grad_norm": 0.6687500678698203, "learning_rate": 1.2935312304635627e-06, "loss": 0.0136, "step": 8278 }, { "epoch": 7.73015873015873, "grad_norm": 0.7820399626224596, "learning_rate": 1.2925164902519582e-06, "loss": 0.0179, "step": 8279 }, { "epoch": 7.73109243697479, "grad_norm": 3.6427444008359413, "learning_rate": 1.2915020891304364e-06, "loss": 0.1273, "step": 8280 }, { "epoch": 7.73202614379085, "grad_norm": 0.8640286079373585, "learning_rate": 1.2904880271917757e-06, "loss": 0.0174, "step": 8281 }, { "epoch": 7.7329598506069095, "grad_norm": 3.8060262880844777, "learning_rate": 1.289474304528723e-06, "loss": 0.1535, "step": 8282 }, { "epoch": 7.733893557422969, "grad_norm": 0.9462178034782464, "learning_rate": 1.288460921233992e-06, "loss": 0.0218, "step": 8283 }, { "epoch": 7.734827264239029, "grad_norm": 1.9743940882810278, "learning_rate": 1.2874478774002719e-06, "loss": 0.0569, "step": 8284 }, { "epoch": 7.735760971055089, "grad_norm": 1.235130702677417, "learning_rate": 1.2864351731202146e-06, "loss": 0.0426, "step": 8285 }, { "epoch": 7.7366946778711485, "grad_norm": 2.103765311816664, "learning_rate": 1.2854228084864433e-06, "loss": 0.0939, "step": 8286 }, { "epoch": 7.737628384687208, "grad_norm": 0.9713249039996003, "learning_rate": 1.2844107835915487e-06, "loss": 0.0215, "step": 8287 }, { "epoch": 7.738562091503268, "grad_norm": 1.6368974409761947, "learning_rate": 1.2833990985280937e-06, "loss": 0.0443, "step": 8288 }, { "epoch": 7.739495798319328, "grad_norm": 2.971384836978006, "learning_rate": 1.282387753388607e-06, "loss": 0.1205, "step": 8289 }, { "epoch": 7.7404295051353875, "grad_norm": 0.650740756381234, "learning_rate": 1.281376748265587e-06, "loss": 0.0124, "step": 8290 }, { "epoch": 7.741363211951447, "grad_norm": 1.463876791252797, "learning_rate": 1.2803660832515003e-06, "loss": 0.0451, "step": 8291 }, { "epoch": 7.742296918767507, "grad_norm": 1.0200649447842225, "learning_rate": 1.2793557584387855e-06, "loss": 0.0234, "step": 8292 }, { "epoch": 7.743230625583567, "grad_norm": 1.320525856077521, "learning_rate": 1.2783457739198473e-06, "loss": 0.0503, "step": 8293 }, { "epoch": 7.7441643323996265, "grad_norm": 1.686907111410911, "learning_rate": 1.2773361297870591e-06, "loss": 0.038, "step": 8294 }, { "epoch": 7.745098039215686, "grad_norm": 1.0921649856408733, "learning_rate": 1.2763268261327632e-06, "loss": 0.0408, "step": 8295 }, { "epoch": 7.746031746031746, "grad_norm": 3.0894065604535315, "learning_rate": 1.2753178630492736e-06, "loss": 0.1056, "step": 8296 }, { "epoch": 7.746965452847806, "grad_norm": 1.8526604852807664, "learning_rate": 1.27430924062887e-06, "loss": 0.0572, "step": 8297 }, { "epoch": 7.7478991596638656, "grad_norm": 1.2968118264329553, "learning_rate": 1.2733009589638024e-06, "loss": 0.0333, "step": 8298 }, { "epoch": 7.748832866479925, "grad_norm": 0.5148409233683674, "learning_rate": 1.2722930181462868e-06, "loss": 0.0079, "step": 8299 }, { "epoch": 7.749766573295985, "grad_norm": 0.33699344590239777, "learning_rate": 1.2712854182685142e-06, "loss": 0.0044, "step": 8300 }, { "epoch": 7.750700280112045, "grad_norm": 0.5743823692015755, "learning_rate": 1.2702781594226387e-06, "loss": 0.0099, "step": 8301 }, { "epoch": 7.751633986928105, "grad_norm": 0.4848919227376098, "learning_rate": 1.2692712417007864e-06, "loss": 0.0185, "step": 8302 }, { "epoch": 7.752567693744164, "grad_norm": 3.338512500842465, "learning_rate": 1.2682646651950476e-06, "loss": 0.112, "step": 8303 }, { "epoch": 7.753501400560224, "grad_norm": 0.792282379838475, "learning_rate": 1.2672584299974889e-06, "loss": 0.0221, "step": 8304 }, { "epoch": 7.754435107376284, "grad_norm": 1.762303792478036, "learning_rate": 1.26625253620014e-06, "loss": 0.0686, "step": 8305 }, { "epoch": 7.755368814192344, "grad_norm": 2.5352001061542233, "learning_rate": 1.2652469838949994e-06, "loss": 0.0764, "step": 8306 }, { "epoch": 7.756302521008403, "grad_norm": 0.07928869258365112, "learning_rate": 1.2642417731740387e-06, "loss": 0.0008, "step": 8307 }, { "epoch": 7.757236227824463, "grad_norm": 1.2530034027273678, "learning_rate": 1.2632369041291936e-06, "loss": 0.0303, "step": 8308 }, { "epoch": 7.758169934640523, "grad_norm": 1.151857425032, "learning_rate": 1.2622323768523698e-06, "loss": 0.0323, "step": 8309 }, { "epoch": 7.759103641456583, "grad_norm": 1.2512010058884064, "learning_rate": 1.2612281914354452e-06, "loss": 0.0306, "step": 8310 }, { "epoch": 7.760037348272642, "grad_norm": 2.896613131395767, "learning_rate": 1.2602243479702613e-06, "loss": 0.0973, "step": 8311 }, { "epoch": 7.760971055088702, "grad_norm": 3.0037662860533088, "learning_rate": 1.259220846548631e-06, "loss": 0.0828, "step": 8312 }, { "epoch": 7.761904761904762, "grad_norm": 1.72077927332327, "learning_rate": 1.2582176872623348e-06, "loss": 0.0388, "step": 8313 }, { "epoch": 7.762838468720822, "grad_norm": 3.3850131513548036, "learning_rate": 1.2572148702031246e-06, "loss": 0.0907, "step": 8314 }, { "epoch": 7.763772175536881, "grad_norm": 0.8896283032374857, "learning_rate": 1.2562123954627181e-06, "loss": 0.0141, "step": 8315 }, { "epoch": 7.764705882352941, "grad_norm": 1.8201467516269105, "learning_rate": 1.2552102631328022e-06, "loss": 0.0526, "step": 8316 }, { "epoch": 7.765639589169001, "grad_norm": 0.8141598421800931, "learning_rate": 1.2542084733050313e-06, "loss": 0.0238, "step": 8317 }, { "epoch": 7.766573295985061, "grad_norm": 5.519529298894584, "learning_rate": 1.2532070260710339e-06, "loss": 0.0238, "step": 8318 }, { "epoch": 7.76750700280112, "grad_norm": 1.1467166376423932, "learning_rate": 1.2522059215224015e-06, "loss": 0.0223, "step": 8319 }, { "epoch": 7.76844070961718, "grad_norm": 1.616342569820987, "learning_rate": 1.2512051597506958e-06, "loss": 0.0447, "step": 8320 }, { "epoch": 7.76937441643324, "grad_norm": 0.5563069853316337, "learning_rate": 1.250204740847446e-06, "loss": 0.0127, "step": 8321 }, { "epoch": 7.7703081232493, "grad_norm": 1.9567274048439387, "learning_rate": 1.2492046649041545e-06, "loss": 0.0547, "step": 8322 }, { "epoch": 7.771241830065359, "grad_norm": 1.9825990394324702, "learning_rate": 1.248204932012288e-06, "loss": 0.0542, "step": 8323 }, { "epoch": 7.772175536881419, "grad_norm": 4.0949115410764, "learning_rate": 1.2472055422632823e-06, "loss": 0.1523, "step": 8324 }, { "epoch": 7.773109243697479, "grad_norm": 2.2134839263707398, "learning_rate": 1.246206495748542e-06, "loss": 0.0826, "step": 8325 }, { "epoch": 7.774042950513539, "grad_norm": 1.9796021049261514, "learning_rate": 1.2452077925594435e-06, "loss": 0.0764, "step": 8326 }, { "epoch": 7.774976657329598, "grad_norm": 0.7407924415381951, "learning_rate": 1.2442094327873277e-06, "loss": 0.0124, "step": 8327 }, { "epoch": 7.775910364145658, "grad_norm": 0.9336987980100306, "learning_rate": 1.2432114165235048e-06, "loss": 0.0156, "step": 8328 }, { "epoch": 7.776844070961718, "grad_norm": 0.7805721622859281, "learning_rate": 1.242213743859254e-06, "loss": 0.0227, "step": 8329 }, { "epoch": 7.777777777777778, "grad_norm": 1.5682869990024042, "learning_rate": 1.2412164148858263e-06, "loss": 0.0495, "step": 8330 }, { "epoch": 7.778711484593837, "grad_norm": 1.8762972462102558, "learning_rate": 1.2402194296944364e-06, "loss": 0.083, "step": 8331 }, { "epoch": 7.779645191409897, "grad_norm": 1.3165835394784111, "learning_rate": 1.2392227883762692e-06, "loss": 0.0439, "step": 8332 }, { "epoch": 7.780578898225957, "grad_norm": 1.4803105271724135, "learning_rate": 1.2382264910224795e-06, "loss": 0.0554, "step": 8333 }, { "epoch": 7.781512605042017, "grad_norm": 6.915271842825911, "learning_rate": 1.2372305377241872e-06, "loss": 0.1987, "step": 8334 }, { "epoch": 7.7824463118580764, "grad_norm": 0.45789831628083966, "learning_rate": 1.2362349285724867e-06, "loss": 0.0165, "step": 8335 }, { "epoch": 7.783380018674136, "grad_norm": 1.6577857220000767, "learning_rate": 1.2352396636584358e-06, "loss": 0.0765, "step": 8336 }, { "epoch": 7.784313725490196, "grad_norm": 0.9713717599881873, "learning_rate": 1.2342447430730625e-06, "loss": 0.0329, "step": 8337 }, { "epoch": 7.785247432306256, "grad_norm": 2.0624835669037473, "learning_rate": 1.2332501669073616e-06, "loss": 0.0524, "step": 8338 }, { "epoch": 7.7861811391223155, "grad_norm": 3.5173879992416492, "learning_rate": 1.2322559352523006e-06, "loss": 0.0997, "step": 8339 }, { "epoch": 7.787114845938375, "grad_norm": 2.716712911268082, "learning_rate": 1.2312620481988125e-06, "loss": 0.104, "step": 8340 }, { "epoch": 7.788048552754435, "grad_norm": 1.0898924398406378, "learning_rate": 1.2302685058377978e-06, "loss": 0.0242, "step": 8341 }, { "epoch": 7.788982259570495, "grad_norm": 1.604636594185208, "learning_rate": 1.2292753082601266e-06, "loss": 0.0223, "step": 8342 }, { "epoch": 7.7899159663865545, "grad_norm": 1.5409206019139419, "learning_rate": 1.2282824555566397e-06, "loss": 0.0349, "step": 8343 }, { "epoch": 7.790849673202614, "grad_norm": 1.144126494568618, "learning_rate": 1.2272899478181438e-06, "loss": 0.0372, "step": 8344 }, { "epoch": 7.791783380018674, "grad_norm": 0.6428614881819962, "learning_rate": 1.226297785135414e-06, "loss": 0.0083, "step": 8345 }, { "epoch": 7.792717086834734, "grad_norm": 2.3305362477348126, "learning_rate": 1.2253059675991946e-06, "loss": 0.0784, "step": 8346 }, { "epoch": 7.7936507936507935, "grad_norm": 0.5519810254852381, "learning_rate": 1.2243144953001968e-06, "loss": 0.0078, "step": 8347 }, { "epoch": 7.794584500466853, "grad_norm": 1.6230440590165618, "learning_rate": 1.2233233683291046e-06, "loss": 0.0468, "step": 8348 }, { "epoch": 7.795518207282913, "grad_norm": 0.41653368635110016, "learning_rate": 1.2223325867765661e-06, "loss": 0.0089, "step": 8349 }, { "epoch": 7.796451914098973, "grad_norm": 2.6177619499672096, "learning_rate": 1.2213421507331986e-06, "loss": 0.1016, "step": 8350 }, { "epoch": 7.7973856209150325, "grad_norm": 3.670749275127132, "learning_rate": 1.220352060289588e-06, "loss": 0.098, "step": 8351 }, { "epoch": 7.798319327731092, "grad_norm": 0.8598668100105528, "learning_rate": 1.2193623155362906e-06, "loss": 0.0129, "step": 8352 }, { "epoch": 7.799253034547152, "grad_norm": 2.854182032750442, "learning_rate": 1.2183729165638286e-06, "loss": 0.1494, "step": 8353 }, { "epoch": 7.800186741363212, "grad_norm": 1.593008903137621, "learning_rate": 1.2173838634626932e-06, "loss": 0.0483, "step": 8354 }, { "epoch": 7.8011204481792715, "grad_norm": 0.5115718587658165, "learning_rate": 1.2163951563233433e-06, "loss": 0.0109, "step": 8355 }, { "epoch": 7.802054154995331, "grad_norm": 4.132780655929658, "learning_rate": 1.2154067952362091e-06, "loss": 0.1722, "step": 8356 }, { "epoch": 7.802987861811391, "grad_norm": 0.4332169786524659, "learning_rate": 1.2144187802916863e-06, "loss": 0.0089, "step": 8357 }, { "epoch": 7.803921568627451, "grad_norm": 0.5960942663678348, "learning_rate": 1.213431111580139e-06, "loss": 0.0138, "step": 8358 }, { "epoch": 7.8048552754435105, "grad_norm": 3.3317300925767017, "learning_rate": 1.2124437891918995e-06, "loss": 0.1009, "step": 8359 }, { "epoch": 7.80578898225957, "grad_norm": 1.1185389538905197, "learning_rate": 1.2114568132172721e-06, "loss": 0.0319, "step": 8360 }, { "epoch": 7.80672268907563, "grad_norm": 2.943206196208866, "learning_rate": 1.210470183746525e-06, "loss": 0.0757, "step": 8361 }, { "epoch": 7.80765639589169, "grad_norm": 2.7240264784666763, "learning_rate": 1.2094839008698966e-06, "loss": 0.083, "step": 8362 }, { "epoch": 7.80859010270775, "grad_norm": 2.3936561986741878, "learning_rate": 1.2084979646775925e-06, "loss": 0.0897, "step": 8363 }, { "epoch": 7.809523809523809, "grad_norm": 3.221303271106532, "learning_rate": 1.2075123752597867e-06, "loss": 0.1037, "step": 8364 }, { "epoch": 7.810457516339869, "grad_norm": 1.4829353244998134, "learning_rate": 1.2065271327066253e-06, "loss": 0.0247, "step": 8365 }, { "epoch": 7.811391223155929, "grad_norm": 0.5171488915977881, "learning_rate": 1.205542237108217e-06, "loss": 0.0053, "step": 8366 }, { "epoch": 7.812324929971989, "grad_norm": 2.1405160698267274, "learning_rate": 1.2045576885546424e-06, "loss": 0.0674, "step": 8367 }, { "epoch": 7.813258636788048, "grad_norm": 0.8562479692918511, "learning_rate": 1.2035734871359472e-06, "loss": 0.0301, "step": 8368 }, { "epoch": 7.814192343604108, "grad_norm": 3.3410238261610914, "learning_rate": 1.2025896329421505e-06, "loss": 0.1442, "step": 8369 }, { "epoch": 7.815126050420168, "grad_norm": 3.0607443311344875, "learning_rate": 1.201606126063235e-06, "loss": 0.08, "step": 8370 }, { "epoch": 7.816059757236228, "grad_norm": 2.5604316826359304, "learning_rate": 1.2006229665891534e-06, "loss": 0.0811, "step": 8371 }, { "epoch": 7.816993464052287, "grad_norm": 0.31132554104484494, "learning_rate": 1.1996401546098253e-06, "loss": 0.0024, "step": 8372 }, { "epoch": 7.817927170868347, "grad_norm": 5.048226336851985, "learning_rate": 1.198657690215142e-06, "loss": 0.1008, "step": 8373 }, { "epoch": 7.818860877684407, "grad_norm": 3.3889464115323733, "learning_rate": 1.197675573494959e-06, "loss": 0.1379, "step": 8374 }, { "epoch": 7.819794584500467, "grad_norm": 1.3407960106786931, "learning_rate": 1.1966938045391025e-06, "loss": 0.0162, "step": 8375 }, { "epoch": 7.820728291316526, "grad_norm": 0.7280909630953359, "learning_rate": 1.1957123834373642e-06, "loss": 0.0137, "step": 8376 }, { "epoch": 7.821661998132586, "grad_norm": 3.338603007087459, "learning_rate": 1.1947313102795088e-06, "loss": 0.1064, "step": 8377 }, { "epoch": 7.822595704948646, "grad_norm": 1.6081765301320063, "learning_rate": 1.1937505851552644e-06, "loss": 0.0506, "step": 8378 }, { "epoch": 7.823529411764706, "grad_norm": 1.442688934495686, "learning_rate": 1.1927702081543279e-06, "loss": 0.0578, "step": 8379 }, { "epoch": 7.824463118580765, "grad_norm": 0.7328385694584868, "learning_rate": 1.1917901793663682e-06, "loss": 0.0185, "step": 8380 }, { "epoch": 7.825396825396825, "grad_norm": 2.224906439126614, "learning_rate": 1.1908104988810187e-06, "loss": 0.0337, "step": 8381 }, { "epoch": 7.826330532212885, "grad_norm": 0.4779706829029244, "learning_rate": 1.189831166787882e-06, "loss": 0.0124, "step": 8382 }, { "epoch": 7.827264239028945, "grad_norm": 0.1985629630953525, "learning_rate": 1.1888521831765266e-06, "loss": 0.0074, "step": 8383 }, { "epoch": 7.828197945845004, "grad_norm": 1.0703217126920357, "learning_rate": 1.187873548136495e-06, "loss": 0.0308, "step": 8384 }, { "epoch": 7.829131652661064, "grad_norm": 1.4318511399090945, "learning_rate": 1.186895261757292e-06, "loss": 0.0345, "step": 8385 }, { "epoch": 7.830065359477124, "grad_norm": 3.8383212834195124, "learning_rate": 1.1859173241283917e-06, "loss": 0.1382, "step": 8386 }, { "epoch": 7.830999066293184, "grad_norm": 1.8941255906468255, "learning_rate": 1.18493973533924e-06, "loss": 0.0659, "step": 8387 }, { "epoch": 7.831932773109243, "grad_norm": 2.9797981217115495, "learning_rate": 1.1839624954792466e-06, "loss": 0.0638, "step": 8388 }, { "epoch": 7.832866479925303, "grad_norm": 5.494180842397206, "learning_rate": 1.1829856046377891e-06, "loss": 0.147, "step": 8389 }, { "epoch": 7.833800186741363, "grad_norm": 0.30199264942003917, "learning_rate": 1.1820090629042186e-06, "loss": 0.0047, "step": 8390 }, { "epoch": 7.834733893557423, "grad_norm": 0.7236205100254784, "learning_rate": 1.181032870367848e-06, "loss": 0.0234, "step": 8391 }, { "epoch": 7.835667600373482, "grad_norm": 3.248009823900753, "learning_rate": 1.1800570271179618e-06, "loss": 0.1275, "step": 8392 }, { "epoch": 7.836601307189542, "grad_norm": 1.498144924718248, "learning_rate": 1.1790815332438099e-06, "loss": 0.0433, "step": 8393 }, { "epoch": 7.837535014005602, "grad_norm": 1.9201526749371627, "learning_rate": 1.1781063888346145e-06, "loss": 0.0865, "step": 8394 }, { "epoch": 7.838468720821662, "grad_norm": 2.031401208922519, "learning_rate": 1.177131593979562e-06, "loss": 0.0603, "step": 8395 }, { "epoch": 7.839402427637721, "grad_norm": 0.9576874546458739, "learning_rate": 1.1761571487678075e-06, "loss": 0.0212, "step": 8396 }, { "epoch": 7.840336134453781, "grad_norm": 1.0751877150472526, "learning_rate": 1.1751830532884762e-06, "loss": 0.0449, "step": 8397 }, { "epoch": 7.841269841269841, "grad_norm": 4.2739899983576874, "learning_rate": 1.174209307630657e-06, "loss": 0.0798, "step": 8398 }, { "epoch": 7.842203548085901, "grad_norm": 4.561436969372063, "learning_rate": 1.173235911883413e-06, "loss": 0.1415, "step": 8399 }, { "epoch": 7.8431372549019605, "grad_norm": 3.335136813725714, "learning_rate": 1.172262866135771e-06, "loss": 0.1231, "step": 8400 }, { "epoch": 7.84407096171802, "grad_norm": 1.560545168392373, "learning_rate": 1.1712901704767254e-06, "loss": 0.046, "step": 8401 }, { "epoch": 7.84500466853408, "grad_norm": 1.05139101577451, "learning_rate": 1.1703178249952401e-06, "loss": 0.0332, "step": 8402 }, { "epoch": 7.84593837535014, "grad_norm": 1.7786565948277195, "learning_rate": 1.1693458297802485e-06, "loss": 0.045, "step": 8403 }, { "epoch": 7.8468720821661995, "grad_norm": 0.7517432959062728, "learning_rate": 1.1683741849206493e-06, "loss": 0.0245, "step": 8404 }, { "epoch": 7.847805788982259, "grad_norm": 3.1332209906609707, "learning_rate": 1.1674028905053097e-06, "loss": 0.0834, "step": 8405 }, { "epoch": 7.848739495798319, "grad_norm": 5.210175963463772, "learning_rate": 1.166431946623064e-06, "loss": 0.1658, "step": 8406 }, { "epoch": 7.849673202614379, "grad_norm": 1.491590116808986, "learning_rate": 1.1654613533627186e-06, "loss": 0.0378, "step": 8407 }, { "epoch": 7.8506069094304385, "grad_norm": 0.4303884342163864, "learning_rate": 1.1644911108130436e-06, "loss": 0.007, "step": 8408 }, { "epoch": 7.851540616246498, "grad_norm": 1.1605220975003692, "learning_rate": 1.1635212190627782e-06, "loss": 0.0415, "step": 8409 }, { "epoch": 7.852474323062558, "grad_norm": 0.9482132152827631, "learning_rate": 1.162551678200628e-06, "loss": 0.0261, "step": 8410 }, { "epoch": 7.853408029878618, "grad_norm": 2.7885791656888164, "learning_rate": 1.1615824883152716e-06, "loss": 0.0578, "step": 8411 }, { "epoch": 7.8543417366946775, "grad_norm": 5.959834478491362, "learning_rate": 1.1606136494953502e-06, "loss": 0.1348, "step": 8412 }, { "epoch": 7.855275443510737, "grad_norm": 9.540639113875962, "learning_rate": 1.1596451618294747e-06, "loss": 0.0605, "step": 8413 }, { "epoch": 7.856209150326797, "grad_norm": 2.174539667807313, "learning_rate": 1.158677025406224e-06, "loss": 0.051, "step": 8414 }, { "epoch": 7.857142857142857, "grad_norm": 1.7081600511204502, "learning_rate": 1.1577092403141431e-06, "loss": 0.0798, "step": 8415 }, { "epoch": 7.8580765639589165, "grad_norm": 0.5357452669673725, "learning_rate": 1.1567418066417502e-06, "loss": 0.0119, "step": 8416 }, { "epoch": 7.859010270774976, "grad_norm": 2.8456021177006705, "learning_rate": 1.1557747244775259e-06, "loss": 0.0536, "step": 8417 }, { "epoch": 7.859943977591037, "grad_norm": 1.044198922737507, "learning_rate": 1.1548079939099205e-06, "loss": 0.021, "step": 8418 }, { "epoch": 7.860877684407097, "grad_norm": 3.3107671054501364, "learning_rate": 1.1538416150273502e-06, "loss": 0.0903, "step": 8419 }, { "epoch": 7.861811391223156, "grad_norm": 0.38955582487831736, "learning_rate": 1.1528755879182045e-06, "loss": 0.0059, "step": 8420 }, { "epoch": 7.862745098039216, "grad_norm": 2.6925976853921956, "learning_rate": 1.151909912670836e-06, "loss": 0.1236, "step": 8421 }, { "epoch": 7.863678804855276, "grad_norm": 3.7118513910850535, "learning_rate": 1.1509445893735654e-06, "loss": 0.1774, "step": 8422 }, { "epoch": 7.864612511671336, "grad_norm": 1.4115778228013225, "learning_rate": 1.149979618114681e-06, "loss": 0.0203, "step": 8423 }, { "epoch": 7.865546218487395, "grad_norm": 13.209741398891728, "learning_rate": 1.1490149989824434e-06, "loss": 0.1981, "step": 8424 }, { "epoch": 7.866479925303455, "grad_norm": 1.228137575372679, "learning_rate": 1.1480507320650753e-06, "loss": 0.023, "step": 8425 }, { "epoch": 7.867413632119515, "grad_norm": 1.8158246484742002, "learning_rate": 1.1470868174507705e-06, "loss": 0.0566, "step": 8426 }, { "epoch": 7.868347338935575, "grad_norm": 0.6499547240958166, "learning_rate": 1.1461232552276874e-06, "loss": 0.0176, "step": 8427 }, { "epoch": 7.8692810457516345, "grad_norm": 6.834379652994197, "learning_rate": 1.1451600454839572e-06, "loss": 0.1985, "step": 8428 }, { "epoch": 7.870214752567694, "grad_norm": 1.2914245898400447, "learning_rate": 1.1441971883076751e-06, "loss": 0.0431, "step": 8429 }, { "epoch": 7.871148459383754, "grad_norm": 1.3638073914817126, "learning_rate": 1.1432346837869047e-06, "loss": 0.0248, "step": 8430 }, { "epoch": 7.872082166199814, "grad_norm": 2.861855266760094, "learning_rate": 1.1422725320096773e-06, "loss": 0.1034, "step": 8431 }, { "epoch": 7.8730158730158735, "grad_norm": 2.9115858927034477, "learning_rate": 1.141310733063991e-06, "loss": 0.0601, "step": 8432 }, { "epoch": 7.873949579831933, "grad_norm": 2.9393222757599116, "learning_rate": 1.1403492870378163e-06, "loss": 0.0602, "step": 8433 }, { "epoch": 7.874883286647993, "grad_norm": 2.0406942107304546, "learning_rate": 1.139388194019086e-06, "loss": 0.0335, "step": 8434 }, { "epoch": 7.875816993464053, "grad_norm": 0.45546424635231364, "learning_rate": 1.1384274540957025e-06, "loss": 0.0057, "step": 8435 }, { "epoch": 7.8767507002801125, "grad_norm": 1.6856121386971235, "learning_rate": 1.1374670673555349e-06, "loss": 0.0644, "step": 8436 }, { "epoch": 7.877684407096172, "grad_norm": 2.4374687194758806, "learning_rate": 1.1365070338864243e-06, "loss": 0.0664, "step": 8437 }, { "epoch": 7.878618113912232, "grad_norm": 0.11906657632206334, "learning_rate": 1.135547353776174e-06, "loss": 0.0006, "step": 8438 }, { "epoch": 7.879551820728292, "grad_norm": 1.0620040803693396, "learning_rate": 1.1345880271125582e-06, "loss": 0.0242, "step": 8439 }, { "epoch": 7.8804855275443515, "grad_norm": 0.5427642023235122, "learning_rate": 1.1336290539833151e-06, "loss": 0.0106, "step": 8440 }, { "epoch": 7.881419234360411, "grad_norm": 2.2333208085049954, "learning_rate": 1.132670434476158e-06, "loss": 0.0595, "step": 8441 }, { "epoch": 7.882352941176471, "grad_norm": 2.6134300543725812, "learning_rate": 1.1317121686787603e-06, "loss": 0.07, "step": 8442 }, { "epoch": 7.883286647992531, "grad_norm": 0.8803946863504776, "learning_rate": 1.130754256678766e-06, "loss": 0.0273, "step": 8443 }, { "epoch": 7.8842203548085905, "grad_norm": 1.503137316474109, "learning_rate": 1.1297966985637865e-06, "loss": 0.0463, "step": 8444 }, { "epoch": 7.88515406162465, "grad_norm": 2.606380893235626, "learning_rate": 1.1288394944214027e-06, "loss": 0.041, "step": 8445 }, { "epoch": 7.88608776844071, "grad_norm": 2.3301935162649072, "learning_rate": 1.1278826443391604e-06, "loss": 0.0651, "step": 8446 }, { "epoch": 7.88702147525677, "grad_norm": 2.916914764859182, "learning_rate": 1.126926148404574e-06, "loss": 0.0812, "step": 8447 }, { "epoch": 7.8879551820728295, "grad_norm": 0.2731765460323032, "learning_rate": 1.1259700067051254e-06, "loss": 0.0033, "step": 8448 }, { "epoch": 7.888888888888889, "grad_norm": 2.2735414579072986, "learning_rate": 1.1250142193282632e-06, "loss": 0.0151, "step": 8449 }, { "epoch": 7.889822595704949, "grad_norm": 1.8622238002059814, "learning_rate": 1.1240587863614071e-06, "loss": 0.0478, "step": 8450 }, { "epoch": 7.890756302521009, "grad_norm": 2.2193844021397906, "learning_rate": 1.123103707891941e-06, "loss": 0.085, "step": 8451 }, { "epoch": 7.8916900093370685, "grad_norm": 0.6661128841970827, "learning_rate": 1.1221489840072153e-06, "loss": 0.0164, "step": 8452 }, { "epoch": 7.892623716153128, "grad_norm": 2.9905119499501187, "learning_rate": 1.121194614794553e-06, "loss": 0.1282, "step": 8453 }, { "epoch": 7.893557422969188, "grad_norm": 2.9246119961643795, "learning_rate": 1.1202406003412408e-06, "loss": 0.0901, "step": 8454 }, { "epoch": 7.894491129785248, "grad_norm": 0.2851411031808323, "learning_rate": 1.119286940734533e-06, "loss": 0.0039, "step": 8455 }, { "epoch": 7.895424836601308, "grad_norm": 1.9103434816055582, "learning_rate": 1.1183336360616509e-06, "loss": 0.0466, "step": 8456 }, { "epoch": 7.896358543417367, "grad_norm": 2.426932740805637, "learning_rate": 1.1173806864097885e-06, "loss": 0.0887, "step": 8457 }, { "epoch": 7.897292250233427, "grad_norm": 3.9951909464073485, "learning_rate": 1.1164280918661009e-06, "loss": 0.0745, "step": 8458 }, { "epoch": 7.898225957049487, "grad_norm": 0.6505300748764657, "learning_rate": 1.1154758525177123e-06, "loss": 0.0107, "step": 8459 }, { "epoch": 7.899159663865547, "grad_norm": 1.2454641973402554, "learning_rate": 1.1145239684517189e-06, "loss": 0.0359, "step": 8460 }, { "epoch": 7.900093370681606, "grad_norm": 1.745457721758033, "learning_rate": 1.1135724397551785e-06, "loss": 0.0434, "step": 8461 }, { "epoch": 7.901027077497666, "grad_norm": 0.6967603748095442, "learning_rate": 1.1126212665151182e-06, "loss": 0.0143, "step": 8462 }, { "epoch": 7.901960784313726, "grad_norm": 3.9272566645707356, "learning_rate": 1.1116704488185358e-06, "loss": 0.0821, "step": 8463 }, { "epoch": 7.902894491129786, "grad_norm": 1.124630505253265, "learning_rate": 1.1107199867523928e-06, "loss": 0.03, "step": 8464 }, { "epoch": 7.903828197945845, "grad_norm": 4.0482673561206095, "learning_rate": 1.1097698804036184e-06, "loss": 0.0892, "step": 8465 }, { "epoch": 7.904761904761905, "grad_norm": 3.486586947691395, "learning_rate": 1.1088201298591105e-06, "loss": 0.1041, "step": 8466 }, { "epoch": 7.905695611577965, "grad_norm": 2.7112908609347808, "learning_rate": 1.107870735205736e-06, "loss": 0.1247, "step": 8467 }, { "epoch": 7.906629318394025, "grad_norm": 0.3281729977604143, "learning_rate": 1.106921696530326e-06, "loss": 0.0065, "step": 8468 }, { "epoch": 7.907563025210084, "grad_norm": 0.9676654899252788, "learning_rate": 1.1059730139196812e-06, "loss": 0.0214, "step": 8469 }, { "epoch": 7.908496732026144, "grad_norm": 3.284603309342669, "learning_rate": 1.1050246874605668e-06, "loss": 0.0931, "step": 8470 }, { "epoch": 7.909430438842204, "grad_norm": 1.4001799233379302, "learning_rate": 1.104076717239721e-06, "loss": 0.0208, "step": 8471 }, { "epoch": 7.910364145658264, "grad_norm": 0.7823182441767872, "learning_rate": 1.1031291033438446e-06, "loss": 0.0187, "step": 8472 }, { "epoch": 7.911297852474323, "grad_norm": 3.307647946994783, "learning_rate": 1.1021818458596073e-06, "loss": 0.1455, "step": 8473 }, { "epoch": 7.912231559290383, "grad_norm": 2.3735501368155187, "learning_rate": 1.1012349448736442e-06, "loss": 0.0802, "step": 8474 }, { "epoch": 7.913165266106443, "grad_norm": 0.45840934449026444, "learning_rate": 1.1002884004725638e-06, "loss": 0.0139, "step": 8475 }, { "epoch": 7.914098972922503, "grad_norm": 2.5455276887149725, "learning_rate": 1.099342212742936e-06, "loss": 0.0717, "step": 8476 }, { "epoch": 7.915032679738562, "grad_norm": 5.034378991723338, "learning_rate": 1.0983963817712996e-06, "loss": 0.0839, "step": 8477 }, { "epoch": 7.915966386554622, "grad_norm": 1.373569477779403, "learning_rate": 1.097450907644162e-06, "loss": 0.0307, "step": 8478 }, { "epoch": 7.916900093370682, "grad_norm": 0.6037254152086459, "learning_rate": 1.0965057904479953e-06, "loss": 0.0172, "step": 8479 }, { "epoch": 7.917833800186742, "grad_norm": 0.362179272347927, "learning_rate": 1.0955610302692432e-06, "loss": 0.0101, "step": 8480 }, { "epoch": 7.918767507002801, "grad_norm": 4.990530238748122, "learning_rate": 1.0946166271943144e-06, "loss": 0.1187, "step": 8481 }, { "epoch": 7.919701213818861, "grad_norm": 3.2938298783725433, "learning_rate": 1.093672581309584e-06, "loss": 0.1166, "step": 8482 }, { "epoch": 7.920634920634921, "grad_norm": 0.9554546138386625, "learning_rate": 1.0927288927013934e-06, "loss": 0.0253, "step": 8483 }, { "epoch": 7.921568627450981, "grad_norm": 1.5518487669931122, "learning_rate": 1.0917855614560568e-06, "loss": 0.0374, "step": 8484 }, { "epoch": 7.92250233426704, "grad_norm": 3.9597465473606417, "learning_rate": 1.0908425876598512e-06, "loss": 0.0955, "step": 8485 }, { "epoch": 7.9234360410831, "grad_norm": 3.307376757442006, "learning_rate": 1.0898999713990217e-06, "loss": 0.174, "step": 8486 }, { "epoch": 7.92436974789916, "grad_norm": 1.3172791954838823, "learning_rate": 1.0889577127597788e-06, "loss": 0.0314, "step": 8487 }, { "epoch": 7.92530345471522, "grad_norm": 2.485467686483587, "learning_rate": 1.0880158118283058e-06, "loss": 0.1, "step": 8488 }, { "epoch": 7.926237161531279, "grad_norm": 3.257317711897138, "learning_rate": 1.0870742686907481e-06, "loss": 0.1151, "step": 8489 }, { "epoch": 7.927170868347339, "grad_norm": 1.6062655839632836, "learning_rate": 1.0861330834332206e-06, "loss": 0.0335, "step": 8490 }, { "epoch": 7.928104575163399, "grad_norm": 1.821931613241534, "learning_rate": 1.0851922561418031e-06, "loss": 0.0719, "step": 8491 }, { "epoch": 7.929038281979459, "grad_norm": 0.9603516871433936, "learning_rate": 1.084251786902548e-06, "loss": 0.0346, "step": 8492 }, { "epoch": 7.9299719887955185, "grad_norm": 0.7654630712578991, "learning_rate": 1.08331167580147e-06, "loss": 0.0256, "step": 8493 }, { "epoch": 7.930905695611578, "grad_norm": 1.0597727638517713, "learning_rate": 1.0823719229245528e-06, "loss": 0.0241, "step": 8494 }, { "epoch": 7.931839402427638, "grad_norm": 0.4290369352531215, "learning_rate": 1.0814325283577458e-06, "loss": 0.0116, "step": 8495 }, { "epoch": 7.932773109243698, "grad_norm": 0.527894061862718, "learning_rate": 1.080493492186967e-06, "loss": 0.012, "step": 8496 }, { "epoch": 7.9337068160597575, "grad_norm": 1.6477767035766409, "learning_rate": 1.0795548144981039e-06, "loss": 0.0534, "step": 8497 }, { "epoch": 7.934640522875817, "grad_norm": 1.37815670159999, "learning_rate": 1.078616495377008e-06, "loss": 0.0392, "step": 8498 }, { "epoch": 7.935574229691877, "grad_norm": 1.6022150264838184, "learning_rate": 1.0776785349094975e-06, "loss": 0.0596, "step": 8499 }, { "epoch": 7.936507936507937, "grad_norm": 2.0492458285249686, "learning_rate": 1.0767409331813588e-06, "loss": 0.0695, "step": 8500 }, { "epoch": 7.9374416433239965, "grad_norm": 1.1474149781841567, "learning_rate": 1.0758036902783486e-06, "loss": 0.0456, "step": 8501 }, { "epoch": 7.938375350140056, "grad_norm": 0.7796954122925118, "learning_rate": 1.0748668062861868e-06, "loss": 0.0181, "step": 8502 }, { "epoch": 7.939309056956116, "grad_norm": 0.870794746727585, "learning_rate": 1.0739302812905616e-06, "loss": 0.0109, "step": 8503 }, { "epoch": 7.940242763772176, "grad_norm": 3.7870366784140366, "learning_rate": 1.0729941153771267e-06, "loss": 0.0555, "step": 8504 }, { "epoch": 7.9411764705882355, "grad_norm": 4.845127928112492, "learning_rate": 1.0720583086315078e-06, "loss": 0.1037, "step": 8505 }, { "epoch": 7.942110177404295, "grad_norm": 0.8577037165295027, "learning_rate": 1.0711228611392937e-06, "loss": 0.0205, "step": 8506 }, { "epoch": 7.943043884220355, "grad_norm": 2.7911010107365257, "learning_rate": 1.0701877729860404e-06, "loss": 0.0843, "step": 8507 }, { "epoch": 7.943977591036415, "grad_norm": 1.700601392614221, "learning_rate": 1.0692530442572719e-06, "loss": 0.0586, "step": 8508 }, { "epoch": 7.9449112978524745, "grad_norm": 2.895756304172895, "learning_rate": 1.068318675038481e-06, "loss": 0.1329, "step": 8509 }, { "epoch": 7.945845004668534, "grad_norm": 1.5389181478679606, "learning_rate": 1.0673846654151254e-06, "loss": 0.0332, "step": 8510 }, { "epoch": 7.946778711484594, "grad_norm": 1.6513228106465998, "learning_rate": 1.06645101547263e-06, "loss": 0.0511, "step": 8511 }, { "epoch": 7.947712418300654, "grad_norm": 0.9328705460353544, "learning_rate": 1.0655177252963877e-06, "loss": 0.0166, "step": 8512 }, { "epoch": 7.9486461251167135, "grad_norm": 1.8960460576736307, "learning_rate": 1.064584794971757e-06, "loss": 0.0347, "step": 8513 }, { "epoch": 7.949579831932773, "grad_norm": 2.414487269101263, "learning_rate": 1.0636522245840663e-06, "loss": 0.0296, "step": 8514 }, { "epoch": 7.950513538748833, "grad_norm": 1.2323444591233572, "learning_rate": 1.0627200142186094e-06, "loss": 0.0132, "step": 8515 }, { "epoch": 7.951447245564893, "grad_norm": 0.5219616981147737, "learning_rate": 1.0617881639606464e-06, "loss": 0.0127, "step": 8516 }, { "epoch": 7.9523809523809526, "grad_norm": 0.7858011345462245, "learning_rate": 1.060856673895404e-06, "loss": 0.0193, "step": 8517 }, { "epoch": 7.953314659197012, "grad_norm": 3.034176352329885, "learning_rate": 1.0599255441080796e-06, "loss": 0.0924, "step": 8518 }, { "epoch": 7.954248366013072, "grad_norm": 2.2323570862432556, "learning_rate": 1.0589947746838347e-06, "loss": 0.0999, "step": 8519 }, { "epoch": 7.955182072829132, "grad_norm": 0.7580449777287979, "learning_rate": 1.0580643657077978e-06, "loss": 0.0173, "step": 8520 }, { "epoch": 7.956115779645192, "grad_norm": 1.8633261029366022, "learning_rate": 1.0571343172650634e-06, "loss": 0.054, "step": 8521 }, { "epoch": 7.957049486461251, "grad_norm": 4.26523859854044, "learning_rate": 1.056204629440698e-06, "loss": 0.1079, "step": 8522 }, { "epoch": 7.957983193277311, "grad_norm": 0.5526844722199964, "learning_rate": 1.0552753023197298e-06, "loss": 0.0163, "step": 8523 }, { "epoch": 7.958916900093371, "grad_norm": 4.7868524839594295, "learning_rate": 1.054346335987157e-06, "loss": 0.0707, "step": 8524 }, { "epoch": 7.959850606909431, "grad_norm": 3.1313085916028385, "learning_rate": 1.053417730527941e-06, "loss": 0.1268, "step": 8525 }, { "epoch": 7.96078431372549, "grad_norm": 0.670539317183544, "learning_rate": 1.0524894860270168e-06, "loss": 0.0209, "step": 8526 }, { "epoch": 7.96171802054155, "grad_norm": 0.5003250613419086, "learning_rate": 1.0515616025692803e-06, "loss": 0.0108, "step": 8527 }, { "epoch": 7.96265172735761, "grad_norm": 2.0621476032648456, "learning_rate": 1.0506340802395976e-06, "loss": 0.0696, "step": 8528 }, { "epoch": 7.96358543417367, "grad_norm": 3.0539495485108907, "learning_rate": 1.0497069191227987e-06, "loss": 0.0715, "step": 8529 }, { "epoch": 7.964519140989729, "grad_norm": 0.5207272599720117, "learning_rate": 1.048780119303686e-06, "loss": 0.0128, "step": 8530 }, { "epoch": 7.965452847805789, "grad_norm": 4.094807049486718, "learning_rate": 1.0478536808670236e-06, "loss": 0.1861, "step": 8531 }, { "epoch": 7.966386554621849, "grad_norm": 0.9335380119675534, "learning_rate": 1.0469276038975434e-06, "loss": 0.0255, "step": 8532 }, { "epoch": 7.967320261437909, "grad_norm": 1.6826695595042758, "learning_rate": 1.0460018884799478e-06, "loss": 0.0544, "step": 8533 }, { "epoch": 7.968253968253968, "grad_norm": 0.3284265821823311, "learning_rate": 1.0450765346989027e-06, "loss": 0.0058, "step": 8534 }, { "epoch": 7.969187675070028, "grad_norm": 2.63143262735789, "learning_rate": 1.0441515426390413e-06, "loss": 0.0856, "step": 8535 }, { "epoch": 7.970121381886088, "grad_norm": 0.2959277966289717, "learning_rate": 1.0432269123849636e-06, "loss": 0.003, "step": 8536 }, { "epoch": 7.971055088702148, "grad_norm": 2.9718270173259524, "learning_rate": 1.0423026440212392e-06, "loss": 0.1206, "step": 8537 }, { "epoch": 7.971988795518207, "grad_norm": 3.3588774363257423, "learning_rate": 1.041378737632402e-06, "loss": 0.1671, "step": 8538 }, { "epoch": 7.972922502334267, "grad_norm": 0.8251895827225508, "learning_rate": 1.040455193302951e-06, "loss": 0.0177, "step": 8539 }, { "epoch": 7.973856209150327, "grad_norm": 1.7287644297490812, "learning_rate": 1.0395320111173578e-06, "loss": 0.0614, "step": 8540 }, { "epoch": 7.974789915966387, "grad_norm": 2.4016222779295577, "learning_rate": 1.0386091911600565e-06, "loss": 0.1064, "step": 8541 }, { "epoch": 7.975723622782446, "grad_norm": 2.5763991315703207, "learning_rate": 1.0376867335154472e-06, "loss": 0.0596, "step": 8542 }, { "epoch": 7.976657329598506, "grad_norm": 1.296997884857904, "learning_rate": 1.036764638267902e-06, "loss": 0.059, "step": 8543 }, { "epoch": 7.977591036414566, "grad_norm": 2.5614876744795865, "learning_rate": 1.035842905501755e-06, "loss": 0.0651, "step": 8544 }, { "epoch": 7.978524743230626, "grad_norm": 2.255403289756566, "learning_rate": 1.0349215353013086e-06, "loss": 0.0489, "step": 8545 }, { "epoch": 7.979458450046685, "grad_norm": 2.8915786628212348, "learning_rate": 1.0340005277508325e-06, "loss": 0.0754, "step": 8546 }, { "epoch": 7.980392156862745, "grad_norm": 1.697047706582007, "learning_rate": 1.0330798829345613e-06, "loss": 0.0667, "step": 8547 }, { "epoch": 7.981325863678805, "grad_norm": 1.0172766387645225, "learning_rate": 1.0321596009367014e-06, "loss": 0.0262, "step": 8548 }, { "epoch": 7.982259570494865, "grad_norm": 3.179708654498497, "learning_rate": 1.0312396818414205e-06, "loss": 0.144, "step": 8549 }, { "epoch": 7.983193277310924, "grad_norm": 0.9318322873764072, "learning_rate": 1.030320125732856e-06, "loss": 0.017, "step": 8550 }, { "epoch": 7.984126984126984, "grad_norm": 0.36058140457221277, "learning_rate": 1.0294009326951098e-06, "loss": 0.005, "step": 8551 }, { "epoch": 7.985060690943044, "grad_norm": 1.9760460655159207, "learning_rate": 1.028482102812255e-06, "loss": 0.0329, "step": 8552 }, { "epoch": 7.985994397759104, "grad_norm": 2.3722928008370956, "learning_rate": 1.0275636361683272e-06, "loss": 0.0884, "step": 8553 }, { "epoch": 7.9869281045751634, "grad_norm": 1.8394916849265872, "learning_rate": 1.02664553284733e-06, "loss": 0.025, "step": 8554 }, { "epoch": 7.987861811391223, "grad_norm": 0.5729410280602557, "learning_rate": 1.0257277929332332e-06, "loss": 0.0116, "step": 8555 }, { "epoch": 7.988795518207283, "grad_norm": 0.44490704071489023, "learning_rate": 1.0248104165099765e-06, "loss": 0.0042, "step": 8556 }, { "epoch": 7.989729225023343, "grad_norm": 0.6451028239605461, "learning_rate": 1.0238934036614634e-06, "loss": 0.0088, "step": 8557 }, { "epoch": 7.9906629318394025, "grad_norm": 1.788477703001157, "learning_rate": 1.0229767544715635e-06, "loss": 0.0291, "step": 8558 }, { "epoch": 7.991596638655462, "grad_norm": 3.5909776102097504, "learning_rate": 1.0220604690241142e-06, "loss": 0.1352, "step": 8559 }, { "epoch": 7.992530345471522, "grad_norm": 0.489456417600585, "learning_rate": 1.0211445474029225e-06, "loss": 0.007, "step": 8560 }, { "epoch": 7.993464052287582, "grad_norm": 3.5783048129060684, "learning_rate": 1.0202289896917578e-06, "loss": 0.1754, "step": 8561 }, { "epoch": 7.9943977591036415, "grad_norm": 1.6738371447973395, "learning_rate": 1.0193137959743576e-06, "loss": 0.0469, "step": 8562 }, { "epoch": 7.995331465919701, "grad_norm": 3.3215942099535516, "learning_rate": 1.0183989663344267e-06, "loss": 0.0842, "step": 8563 }, { "epoch": 7.996265172735761, "grad_norm": 1.3248170858431652, "learning_rate": 1.0174845008556355e-06, "loss": 0.0382, "step": 8564 }, { "epoch": 7.997198879551821, "grad_norm": 2.8823478989565485, "learning_rate": 1.0165703996216242e-06, "loss": 0.0491, "step": 8565 }, { "epoch": 7.9981325863678805, "grad_norm": 0.6687932746014761, "learning_rate": 1.0156566627159959e-06, "loss": 0.0091, "step": 8566 }, { "epoch": 7.99906629318394, "grad_norm": 0.9098783987912932, "learning_rate": 1.014743290222322e-06, "loss": 0.0227, "step": 8567 }, { "epoch": 8.0, "grad_norm": 1.867038097865735, "learning_rate": 1.0138302822241386e-06, "loss": 0.0822, "step": 8568 }, { "epoch": 8.00093370681606, "grad_norm": 2.18166948608884, "learning_rate": 1.012917638804954e-06, "loss": 0.024, "step": 8569 }, { "epoch": 8.00186741363212, "grad_norm": 0.7246464706659542, "learning_rate": 1.0120053600482372e-06, "loss": 0.0163, "step": 8570 }, { "epoch": 8.00280112044818, "grad_norm": 0.8820385649555388, "learning_rate": 1.0110934460374267e-06, "loss": 0.0372, "step": 8571 }, { "epoch": 8.003734827264239, "grad_norm": 1.446433715375072, "learning_rate": 1.0101818968559258e-06, "loss": 0.0406, "step": 8572 }, { "epoch": 8.004668534080299, "grad_norm": 2.2193847608559185, "learning_rate": 1.0092707125871076e-06, "loss": 0.0819, "step": 8573 }, { "epoch": 8.005602240896359, "grad_norm": 5.692242480598727, "learning_rate": 1.0083598933143096e-06, "loss": 0.1547, "step": 8574 }, { "epoch": 8.006535947712418, "grad_norm": 2.854669826275644, "learning_rate": 1.0074494391208351e-06, "loss": 0.1088, "step": 8575 }, { "epoch": 8.007469654528478, "grad_norm": 0.49555157413978107, "learning_rate": 1.0065393500899551e-06, "loss": 0.0076, "step": 8576 }, { "epoch": 8.008403361344538, "grad_norm": 1.2969286185146525, "learning_rate": 1.0056296263049086e-06, "loss": 0.027, "step": 8577 }, { "epoch": 8.009337068160598, "grad_norm": 1.746034704046843, "learning_rate": 1.0047202678488994e-06, "loss": 0.0438, "step": 8578 }, { "epoch": 8.010270774976657, "grad_norm": 1.083145475351539, "learning_rate": 1.0038112748050977e-06, "loss": 0.0187, "step": 8579 }, { "epoch": 8.011204481792717, "grad_norm": 0.9391434928547588, "learning_rate": 1.0029026472566412e-06, "loss": 0.0342, "step": 8580 }, { "epoch": 8.012138188608777, "grad_norm": 1.2977252882355819, "learning_rate": 1.0019943852866325e-06, "loss": 0.0365, "step": 8581 }, { "epoch": 8.013071895424837, "grad_norm": 1.491600768254034, "learning_rate": 1.0010864889781447e-06, "loss": 0.0343, "step": 8582 }, { "epoch": 8.014005602240896, "grad_norm": 1.350991743759299, "learning_rate": 1.000178958414214e-06, "loss": 0.0534, "step": 8583 }, { "epoch": 8.014939309056956, "grad_norm": 0.967677813866973, "learning_rate": 9.99271793677844e-07, "loss": 0.0366, "step": 8584 }, { "epoch": 8.015873015873016, "grad_norm": 1.2228118639905248, "learning_rate": 9.983649948520024e-07, "loss": 0.0146, "step": 8585 }, { "epoch": 8.016806722689076, "grad_norm": 1.2654235873415618, "learning_rate": 9.974585620196302e-07, "loss": 0.0307, "step": 8586 }, { "epoch": 8.017740429505135, "grad_norm": 0.9725614915512085, "learning_rate": 9.965524952636285e-07, "loss": 0.0174, "step": 8587 }, { "epoch": 8.018674136321195, "grad_norm": 2.1323001456667363, "learning_rate": 9.956467946668669e-07, "loss": 0.0633, "step": 8588 }, { "epoch": 8.019607843137255, "grad_norm": 0.940115716683279, "learning_rate": 9.947414603121802e-07, "loss": 0.0221, "step": 8589 }, { "epoch": 8.020541549953315, "grad_norm": 1.8103983117958182, "learning_rate": 9.938364922823745e-07, "loss": 0.0452, "step": 8590 }, { "epoch": 8.021475256769374, "grad_norm": 1.4298269733473357, "learning_rate": 9.929318906602176e-07, "loss": 0.0714, "step": 8591 }, { "epoch": 8.022408963585434, "grad_norm": 6.453566184698658, "learning_rate": 9.920276555284446e-07, "loss": 0.2349, "step": 8592 }, { "epoch": 8.023342670401494, "grad_norm": 1.4228408695965495, "learning_rate": 9.91123786969757e-07, "loss": 0.0653, "step": 8593 }, { "epoch": 8.024276377217554, "grad_norm": 1.568107802246713, "learning_rate": 9.902202850668263e-07, "loss": 0.0395, "step": 8594 }, { "epoch": 8.025210084033613, "grad_norm": 0.9452576429277852, "learning_rate": 9.893171499022858e-07, "loss": 0.0115, "step": 8595 }, { "epoch": 8.026143790849673, "grad_norm": 0.2902143364470834, "learning_rate": 9.884143815587378e-07, "loss": 0.0042, "step": 8596 }, { "epoch": 8.027077497665733, "grad_norm": 0.985077463560774, "learning_rate": 9.875119801187495e-07, "loss": 0.0226, "step": 8597 }, { "epoch": 8.028011204481793, "grad_norm": 0.4038652442730526, "learning_rate": 9.866099456648541e-07, "loss": 0.0078, "step": 8598 }, { "epoch": 8.028944911297852, "grad_norm": 0.5068483438735298, "learning_rate": 9.857082782795568e-07, "loss": 0.0114, "step": 8599 }, { "epoch": 8.029878618113912, "grad_norm": 0.20955678404272066, "learning_rate": 9.848069780453218e-07, "loss": 0.0024, "step": 8600 }, { "epoch": 8.030812324929972, "grad_norm": 2.7196977542915644, "learning_rate": 9.839060450445836e-07, "loss": 0.1038, "step": 8601 }, { "epoch": 8.031746031746032, "grad_norm": 3.154921182259566, "learning_rate": 9.830054793597415e-07, "loss": 0.0839, "step": 8602 }, { "epoch": 8.032679738562091, "grad_norm": 3.746986932642472, "learning_rate": 9.82105281073164e-07, "loss": 0.1102, "step": 8603 }, { "epoch": 8.033613445378151, "grad_norm": 1.9379863023511728, "learning_rate": 9.812054502671835e-07, "loss": 0.0388, "step": 8604 }, { "epoch": 8.034547152194211, "grad_norm": 0.2264591706495137, "learning_rate": 9.803059870240993e-07, "loss": 0.0021, "step": 8605 }, { "epoch": 8.03548085901027, "grad_norm": 0.6759633305185211, "learning_rate": 9.794068914261755e-07, "loss": 0.0079, "step": 8606 }, { "epoch": 8.03641456582633, "grad_norm": 0.6699331349966282, "learning_rate": 9.78508163555647e-07, "loss": 0.0158, "step": 8607 }, { "epoch": 8.03734827264239, "grad_norm": 1.5170234480338356, "learning_rate": 9.776098034947113e-07, "loss": 0.0291, "step": 8608 }, { "epoch": 8.03828197945845, "grad_norm": 2.5826562796359314, "learning_rate": 9.767118113255314e-07, "loss": 0.0996, "step": 8609 }, { "epoch": 8.03921568627451, "grad_norm": 2.3813180013919775, "learning_rate": 9.758141871302418e-07, "loss": 0.0463, "step": 8610 }, { "epoch": 8.04014939309057, "grad_norm": 6.9217417759682105, "learning_rate": 9.749169309909385e-07, "loss": 0.1974, "step": 8611 }, { "epoch": 8.04108309990663, "grad_norm": 2.852132762407287, "learning_rate": 9.74020042989684e-07, "loss": 0.1051, "step": 8612 }, { "epoch": 8.042016806722689, "grad_norm": 2.5376010187216744, "learning_rate": 9.731235232085113e-07, "loss": 0.0535, "step": 8613 }, { "epoch": 8.042950513538749, "grad_norm": 2.455464260495315, "learning_rate": 9.722273717294157e-07, "loss": 0.0849, "step": 8614 }, { "epoch": 8.043884220354808, "grad_norm": 5.41349363560968, "learning_rate": 9.713315886343605e-07, "loss": 0.1329, "step": 8615 }, { "epoch": 8.044817927170868, "grad_norm": 0.3289990957844637, "learning_rate": 9.704361740052726e-07, "loss": 0.0043, "step": 8616 }, { "epoch": 8.045751633986928, "grad_norm": 1.5571041777333008, "learning_rate": 9.695411279240507e-07, "loss": 0.028, "step": 8617 }, { "epoch": 8.046685340802988, "grad_norm": 1.8608804901666, "learning_rate": 9.686464504725557e-07, "loss": 0.05, "step": 8618 }, { "epoch": 8.047619047619047, "grad_norm": 1.626467015240134, "learning_rate": 9.67752141732614e-07, "loss": 0.0599, "step": 8619 }, { "epoch": 8.048552754435107, "grad_norm": 4.9783421357131665, "learning_rate": 9.668582017860223e-07, "loss": 0.1466, "step": 8620 }, { "epoch": 8.049486461251167, "grad_norm": 4.556745013623222, "learning_rate": 9.659646307145398e-07, "loss": 0.1316, "step": 8621 }, { "epoch": 8.050420168067227, "grad_norm": 5.273525591941384, "learning_rate": 9.650714285998941e-07, "loss": 0.1483, "step": 8622 }, { "epoch": 8.051353874883286, "grad_norm": 1.9344509673160477, "learning_rate": 9.64178595523777e-07, "loss": 0.0582, "step": 8623 }, { "epoch": 8.052287581699346, "grad_norm": 5.652223511977448, "learning_rate": 9.6328613156785e-07, "loss": 0.1468, "step": 8624 }, { "epoch": 8.053221288515406, "grad_norm": 3.6421544379908144, "learning_rate": 9.62394036813737e-07, "loss": 0.1057, "step": 8625 }, { "epoch": 8.054154995331466, "grad_norm": 1.4295392636776476, "learning_rate": 9.61502311343031e-07, "loss": 0.0064, "step": 8626 }, { "epoch": 8.055088702147525, "grad_norm": 2.0111432140405303, "learning_rate": 9.606109552372895e-07, "loss": 0.0292, "step": 8627 }, { "epoch": 8.056022408963585, "grad_norm": 1.3369164009529066, "learning_rate": 9.59719968578035e-07, "loss": 0.0362, "step": 8628 }, { "epoch": 8.056956115779645, "grad_norm": 2.6737866740477134, "learning_rate": 9.588293514467617e-07, "loss": 0.116, "step": 8629 }, { "epoch": 8.057889822595705, "grad_norm": 7.589498596503807, "learning_rate": 9.579391039249242e-07, "loss": 0.0659, "step": 8630 }, { "epoch": 8.058823529411764, "grad_norm": 3.5176403169453567, "learning_rate": 9.570492260939463e-07, "loss": 0.1044, "step": 8631 }, { "epoch": 8.059757236227824, "grad_norm": 0.6449516610376588, "learning_rate": 9.561597180352144e-07, "loss": 0.012, "step": 8632 }, { "epoch": 8.060690943043884, "grad_norm": 2.4997380740810957, "learning_rate": 9.552705798300877e-07, "loss": 0.0737, "step": 8633 }, { "epoch": 8.061624649859944, "grad_norm": 2.0447148033899305, "learning_rate": 9.543818115598857e-07, "loss": 0.1076, "step": 8634 }, { "epoch": 8.062558356676004, "grad_norm": 1.409190519129441, "learning_rate": 9.53493413305896e-07, "loss": 0.0226, "step": 8635 }, { "epoch": 8.063492063492063, "grad_norm": 0.2384836220529399, "learning_rate": 9.52605385149371e-07, "loss": 0.002, "step": 8636 }, { "epoch": 8.064425770308123, "grad_norm": 3.442309067397965, "learning_rate": 9.517177271715339e-07, "loss": 0.1072, "step": 8637 }, { "epoch": 8.065359477124183, "grad_norm": 3.650847110054473, "learning_rate": 9.508304394535689e-07, "loss": 0.0659, "step": 8638 }, { "epoch": 8.066293183940243, "grad_norm": 2.77649891567508, "learning_rate": 9.499435220766284e-07, "loss": 0.0792, "step": 8639 }, { "epoch": 8.067226890756302, "grad_norm": 1.8020428465137672, "learning_rate": 9.490569751218292e-07, "loss": 0.0122, "step": 8640 }, { "epoch": 8.068160597572362, "grad_norm": 2.115060661373784, "learning_rate": 9.481707986702587e-07, "loss": 0.0368, "step": 8641 }, { "epoch": 8.069094304388422, "grad_norm": 1.1898993172058412, "learning_rate": 9.472849928029654e-07, "loss": 0.0318, "step": 8642 }, { "epoch": 8.070028011204482, "grad_norm": 0.9727435428463989, "learning_rate": 9.463995576009672e-07, "loss": 0.0132, "step": 8643 }, { "epoch": 8.070961718020541, "grad_norm": 1.928504789391788, "learning_rate": 9.455144931452459e-07, "loss": 0.0594, "step": 8644 }, { "epoch": 8.071895424836601, "grad_norm": 0.43368507909557186, "learning_rate": 9.446297995167497e-07, "loss": 0.0101, "step": 8645 }, { "epoch": 8.07282913165266, "grad_norm": 1.490222494657697, "learning_rate": 9.437454767963955e-07, "loss": 0.0398, "step": 8646 }, { "epoch": 8.07376283846872, "grad_norm": 3.080902445516166, "learning_rate": 9.428615250650636e-07, "loss": 0.0821, "step": 8647 }, { "epoch": 8.07469654528478, "grad_norm": 3.3515180654736674, "learning_rate": 9.419779444036009e-07, "loss": 0.0699, "step": 8648 }, { "epoch": 8.07563025210084, "grad_norm": 1.0672681232615209, "learning_rate": 9.41094734892819e-07, "loss": 0.0159, "step": 8649 }, { "epoch": 8.0765639589169, "grad_norm": 1.8671181327508901, "learning_rate": 9.402118966134999e-07, "loss": 0.0377, "step": 8650 }, { "epoch": 8.07749766573296, "grad_norm": 0.23911307172578647, "learning_rate": 9.393294296463879e-07, "loss": 0.002, "step": 8651 }, { "epoch": 8.07843137254902, "grad_norm": 1.7476934888145088, "learning_rate": 9.384473340721933e-07, "loss": 0.0584, "step": 8652 }, { "epoch": 8.079365079365079, "grad_norm": 0.7151668818433238, "learning_rate": 9.375656099715935e-07, "loss": 0.0083, "step": 8653 }, { "epoch": 8.080298786181139, "grad_norm": 1.9828354745388077, "learning_rate": 9.366842574252332e-07, "loss": 0.053, "step": 8654 }, { "epoch": 8.081232492997199, "grad_norm": 0.5468255004625913, "learning_rate": 9.358032765137215e-07, "loss": 0.0119, "step": 8655 }, { "epoch": 8.082166199813258, "grad_norm": 1.2990729292275434, "learning_rate": 9.349226673176331e-07, "loss": 0.0183, "step": 8656 }, { "epoch": 8.083099906629318, "grad_norm": 0.014924331614336187, "learning_rate": 9.340424299175082e-07, "loss": 0.0001, "step": 8657 }, { "epoch": 8.084033613445378, "grad_norm": 2.6396134957383826, "learning_rate": 9.331625643938575e-07, "loss": 0.0602, "step": 8658 }, { "epoch": 8.084967320261438, "grad_norm": 1.1379419305803185, "learning_rate": 9.322830708271519e-07, "loss": 0.0203, "step": 8659 }, { "epoch": 8.085901027077497, "grad_norm": 1.0796201805598862, "learning_rate": 9.314039492978311e-07, "loss": 0.0122, "step": 8660 }, { "epoch": 8.086834733893557, "grad_norm": 1.5974504180389981, "learning_rate": 9.305251998863007e-07, "loss": 0.0122, "step": 8661 }, { "epoch": 8.087768440709617, "grad_norm": 2.3558208262435434, "learning_rate": 9.296468226729305e-07, "loss": 0.0985, "step": 8662 }, { "epoch": 8.088702147525677, "grad_norm": 2.749576879278873, "learning_rate": 9.287688177380605e-07, "loss": 0.0783, "step": 8663 }, { "epoch": 8.089635854341736, "grad_norm": 4.070967876499388, "learning_rate": 9.278911851619927e-07, "loss": 0.1246, "step": 8664 }, { "epoch": 8.090569561157796, "grad_norm": 0.6025335539520835, "learning_rate": 9.27013925024996e-07, "loss": 0.0142, "step": 8665 }, { "epoch": 8.091503267973856, "grad_norm": 1.6177714042815858, "learning_rate": 9.261370374073042e-07, "loss": 0.0449, "step": 8666 }, { "epoch": 8.092436974789916, "grad_norm": 0.46086791523454035, "learning_rate": 9.252605223891209e-07, "loss": 0.0092, "step": 8667 }, { "epoch": 8.093370681605975, "grad_norm": 2.862810346795202, "learning_rate": 9.243843800506114e-07, "loss": 0.0823, "step": 8668 }, { "epoch": 8.094304388422035, "grad_norm": 4.476420369516475, "learning_rate": 9.235086104719093e-07, "loss": 0.1166, "step": 8669 }, { "epoch": 8.095238095238095, "grad_norm": 0.7350899355840562, "learning_rate": 9.226332137331117e-07, "loss": 0.0187, "step": 8670 }, { "epoch": 8.096171802054155, "grad_norm": 4.524108984303539, "learning_rate": 9.217581899142852e-07, "loss": 0.124, "step": 8671 }, { "epoch": 8.097105508870214, "grad_norm": 2.301115021881419, "learning_rate": 9.208835390954601e-07, "loss": 0.0498, "step": 8672 }, { "epoch": 8.098039215686274, "grad_norm": 4.652072159091212, "learning_rate": 9.200092613566325e-07, "loss": 0.0895, "step": 8673 }, { "epoch": 8.098972922502334, "grad_norm": 0.6787623069891026, "learning_rate": 9.191353567777628e-07, "loss": 0.0115, "step": 8674 }, { "epoch": 8.099906629318394, "grad_norm": 3.6132912284733156, "learning_rate": 9.182618254387826e-07, "loss": 0.1296, "step": 8675 }, { "epoch": 8.100840336134453, "grad_norm": 3.617082673626977, "learning_rate": 9.173886674195837e-07, "loss": 0.0509, "step": 8676 }, { "epoch": 8.101774042950513, "grad_norm": 0.5915499115519292, "learning_rate": 9.16515882800027e-07, "loss": 0.0054, "step": 8677 }, { "epoch": 8.102707749766573, "grad_norm": 2.3859333090659836, "learning_rate": 9.156434716599378e-07, "loss": 0.0625, "step": 8678 }, { "epoch": 8.103641456582633, "grad_norm": 0.5890725302569293, "learning_rate": 9.147714340791064e-07, "loss": 0.0127, "step": 8679 }, { "epoch": 8.104575163398692, "grad_norm": 7.51049078644435, "learning_rate": 9.13899770137292e-07, "loss": 0.1831, "step": 8680 }, { "epoch": 8.105508870214752, "grad_norm": 4.914614134864501, "learning_rate": 9.130284799142181e-07, "loss": 0.134, "step": 8681 }, { "epoch": 8.106442577030812, "grad_norm": 0.1673431643566976, "learning_rate": 9.121575634895713e-07, "loss": 0.001, "step": 8682 }, { "epoch": 8.107376283846872, "grad_norm": 2.332231748474098, "learning_rate": 9.112870209430097e-07, "loss": 0.0703, "step": 8683 }, { "epoch": 8.108309990662931, "grad_norm": 0.38027766205536884, "learning_rate": 9.104168523541523e-07, "loss": 0.0025, "step": 8684 }, { "epoch": 8.109243697478991, "grad_norm": 0.901208455308179, "learning_rate": 9.095470578025844e-07, "loss": 0.0196, "step": 8685 }, { "epoch": 8.110177404295051, "grad_norm": 1.9268338985289883, "learning_rate": 9.086776373678608e-07, "loss": 0.0567, "step": 8686 }, { "epoch": 8.11111111111111, "grad_norm": 1.165361979739329, "learning_rate": 9.078085911294982e-07, "loss": 0.019, "step": 8687 }, { "epoch": 8.11204481792717, "grad_norm": 0.2104751228780285, "learning_rate": 9.069399191669809e-07, "loss": 0.0028, "step": 8688 }, { "epoch": 8.11297852474323, "grad_norm": 1.1389303321676572, "learning_rate": 9.060716215597565e-07, "loss": 0.0266, "step": 8689 }, { "epoch": 8.11391223155929, "grad_norm": 1.9714208030477367, "learning_rate": 9.052036983872431e-07, "loss": 0.0378, "step": 8690 }, { "epoch": 8.11484593837535, "grad_norm": 1.4219029457568857, "learning_rate": 9.043361497288212e-07, "loss": 0.0271, "step": 8691 }, { "epoch": 8.11577964519141, "grad_norm": 1.0920511451117862, "learning_rate": 9.034689756638354e-07, "loss": 0.0199, "step": 8692 }, { "epoch": 8.11671335200747, "grad_norm": 0.41030789912527477, "learning_rate": 9.026021762716014e-07, "loss": 0.0091, "step": 8693 }, { "epoch": 8.117647058823529, "grad_norm": 0.5464280348360891, "learning_rate": 9.017357516313962e-07, "loss": 0.0079, "step": 8694 }, { "epoch": 8.118580765639589, "grad_norm": 3.607757537567935, "learning_rate": 9.008697018224638e-07, "loss": 0.0765, "step": 8695 }, { "epoch": 8.119514472455649, "grad_norm": 2.682106008589283, "learning_rate": 9.000040269240123e-07, "loss": 0.0923, "step": 8696 }, { "epoch": 8.120448179271708, "grad_norm": 1.3430714417197882, "learning_rate": 8.991387270152202e-07, "loss": 0.0345, "step": 8697 }, { "epoch": 8.121381886087768, "grad_norm": 0.8372612024483426, "learning_rate": 8.98273802175228e-07, "loss": 0.0181, "step": 8698 }, { "epoch": 8.122315592903828, "grad_norm": 0.5098376343666176, "learning_rate": 8.974092524831412e-07, "loss": 0.0065, "step": 8699 }, { "epoch": 8.123249299719888, "grad_norm": 2.3593942743230096, "learning_rate": 8.965450780180318e-07, "loss": 0.039, "step": 8700 }, { "epoch": 8.124183006535947, "grad_norm": 2.0906740676013684, "learning_rate": 8.956812788589409e-07, "loss": 0.0523, "step": 8701 }, { "epoch": 8.125116713352007, "grad_norm": 2.164407024690491, "learning_rate": 8.948178550848702e-07, "loss": 0.0246, "step": 8702 }, { "epoch": 8.126050420168067, "grad_norm": 2.21626484981078, "learning_rate": 8.939548067747906e-07, "loss": 0.0923, "step": 8703 }, { "epoch": 8.126984126984127, "grad_norm": 3.4153546713018033, "learning_rate": 8.930921340076349e-07, "loss": 0.084, "step": 8704 }, { "epoch": 8.127917833800186, "grad_norm": 0.8139345217868167, "learning_rate": 8.922298368623072e-07, "loss": 0.0192, "step": 8705 }, { "epoch": 8.128851540616246, "grad_norm": 0.3978250344456708, "learning_rate": 8.913679154176724e-07, "loss": 0.0063, "step": 8706 }, { "epoch": 8.129785247432306, "grad_norm": 1.321416653622367, "learning_rate": 8.905063697525635e-07, "loss": 0.0355, "step": 8707 }, { "epoch": 8.130718954248366, "grad_norm": 0.7931738218170623, "learning_rate": 8.896451999457756e-07, "loss": 0.0138, "step": 8708 }, { "epoch": 8.131652661064425, "grad_norm": 0.8291921585810761, "learning_rate": 8.887844060760759e-07, "loss": 0.017, "step": 8709 }, { "epoch": 8.132586367880485, "grad_norm": 1.297576510101384, "learning_rate": 8.879239882221919e-07, "loss": 0.0293, "step": 8710 }, { "epoch": 8.133520074696545, "grad_norm": 4.234928229224382, "learning_rate": 8.870639464628178e-07, "loss": 0.1083, "step": 8711 }, { "epoch": 8.134453781512605, "grad_norm": 2.6663625096274655, "learning_rate": 8.862042808766141e-07, "loss": 0.073, "step": 8712 }, { "epoch": 8.135387488328664, "grad_norm": 0.9159855935557152, "learning_rate": 8.853449915422058e-07, "loss": 0.0232, "step": 8713 }, { "epoch": 8.136321195144724, "grad_norm": 0.716437803801885, "learning_rate": 8.844860785381865e-07, "loss": 0.0136, "step": 8714 }, { "epoch": 8.137254901960784, "grad_norm": 5.589029107802534, "learning_rate": 8.836275419431123e-07, "loss": 0.1994, "step": 8715 }, { "epoch": 8.138188608776844, "grad_norm": 1.2734331386385618, "learning_rate": 8.827693818355049e-07, "loss": 0.0218, "step": 8716 }, { "epoch": 8.139122315592903, "grad_norm": 2.002811320612155, "learning_rate": 8.819115982938519e-07, "loss": 0.0369, "step": 8717 }, { "epoch": 8.140056022408963, "grad_norm": 10.200012763789193, "learning_rate": 8.810541913966097e-07, "loss": 0.0673, "step": 8718 }, { "epoch": 8.140989729225023, "grad_norm": 0.3249170599904695, "learning_rate": 8.801971612221966e-07, "loss": 0.0039, "step": 8719 }, { "epoch": 8.141923436041083, "grad_norm": 3.364083090993746, "learning_rate": 8.793405078489963e-07, "loss": 0.1211, "step": 8720 }, { "epoch": 8.142857142857142, "grad_norm": 1.027529817271978, "learning_rate": 8.784842313553588e-07, "loss": 0.0168, "step": 8721 }, { "epoch": 8.143790849673202, "grad_norm": 0.8441555455993314, "learning_rate": 8.776283318196016e-07, "loss": 0.0059, "step": 8722 }, { "epoch": 8.144724556489262, "grad_norm": 2.5720860865459314, "learning_rate": 8.767728093200061e-07, "loss": 0.0687, "step": 8723 }, { "epoch": 8.145658263305322, "grad_norm": 0.5150120985766047, "learning_rate": 8.759176639348182e-07, "loss": 0.0053, "step": 8724 }, { "epoch": 8.146591970121381, "grad_norm": 1.519035671052807, "learning_rate": 8.750628957422496e-07, "loss": 0.0484, "step": 8725 }, { "epoch": 8.147525676937441, "grad_norm": 3.191383016097748, "learning_rate": 8.742085048204813e-07, "loss": 0.0632, "step": 8726 }, { "epoch": 8.1484593837535, "grad_norm": 0.5011103464956627, "learning_rate": 8.733544912476539e-07, "loss": 0.0045, "step": 8727 }, { "epoch": 8.14939309056956, "grad_norm": 2.929007866469812, "learning_rate": 8.725008551018777e-07, "loss": 0.0585, "step": 8728 }, { "epoch": 8.15032679738562, "grad_norm": 0.2726777200390255, "learning_rate": 8.716475964612264e-07, "loss": 0.0032, "step": 8729 }, { "epoch": 8.15126050420168, "grad_norm": 1.6685026513922423, "learning_rate": 8.707947154037389e-07, "loss": 0.0339, "step": 8730 }, { "epoch": 8.15219421101774, "grad_norm": 2.436284112134716, "learning_rate": 8.69942212007423e-07, "loss": 0.0743, "step": 8731 }, { "epoch": 8.1531279178338, "grad_norm": 0.3152745278066019, "learning_rate": 8.690900863502482e-07, "loss": 0.0037, "step": 8732 }, { "epoch": 8.15406162464986, "grad_norm": 2.3716742259056125, "learning_rate": 8.682383385101512e-07, "loss": 0.0625, "step": 8733 }, { "epoch": 8.15499533146592, "grad_norm": 3.403926436156875, "learning_rate": 8.673869685650315e-07, "loss": 0.108, "step": 8734 }, { "epoch": 8.155929038281979, "grad_norm": 8.428411440949851, "learning_rate": 8.665359765927594e-07, "loss": 0.1278, "step": 8735 }, { "epoch": 8.156862745098039, "grad_norm": 0.3120962959099265, "learning_rate": 8.656853626711664e-07, "loss": 0.0013, "step": 8736 }, { "epoch": 8.157796451914098, "grad_norm": 3.2903653816273293, "learning_rate": 8.648351268780497e-07, "loss": 0.151, "step": 8737 }, { "epoch": 8.158730158730158, "grad_norm": 1.8049077783936052, "learning_rate": 8.639852692911721e-07, "loss": 0.0267, "step": 8738 }, { "epoch": 8.159663865546218, "grad_norm": 0.8148859420770127, "learning_rate": 8.631357899882647e-07, "loss": 0.0095, "step": 8739 }, { "epoch": 8.160597572362278, "grad_norm": 0.7908227185567058, "learning_rate": 8.622866890470205e-07, "loss": 0.009, "step": 8740 }, { "epoch": 8.161531279178337, "grad_norm": 2.801753584347197, "learning_rate": 8.614379665450995e-07, "loss": 0.0821, "step": 8741 }, { "epoch": 8.162464985994397, "grad_norm": 0.021093251215862346, "learning_rate": 8.605896225601246e-07, "loss": 0.0001, "step": 8742 }, { "epoch": 8.163398692810457, "grad_norm": 8.841982027713517, "learning_rate": 8.597416571696893e-07, "loss": 0.0396, "step": 8743 }, { "epoch": 8.164332399626517, "grad_norm": 2.1979528615007977, "learning_rate": 8.588940704513477e-07, "loss": 0.0362, "step": 8744 }, { "epoch": 8.165266106442576, "grad_norm": 1.7778006761085179, "learning_rate": 8.58046862482621e-07, "loss": 0.0446, "step": 8745 }, { "epoch": 8.166199813258636, "grad_norm": 0.618896227502212, "learning_rate": 8.572000333409958e-07, "loss": 0.0152, "step": 8746 }, { "epoch": 8.167133520074696, "grad_norm": 1.039663411338182, "learning_rate": 8.563535831039227e-07, "loss": 0.0226, "step": 8747 }, { "epoch": 8.168067226890756, "grad_norm": 3.235125047762488, "learning_rate": 8.555075118488215e-07, "loss": 0.0998, "step": 8748 }, { "epoch": 8.169000933706815, "grad_norm": 2.147216127142886, "learning_rate": 8.546618196530737e-07, "loss": 0.0693, "step": 8749 }, { "epoch": 8.169934640522875, "grad_norm": 0.8781049012124049, "learning_rate": 8.538165065940263e-07, "loss": 0.0196, "step": 8750 }, { "epoch": 8.170868347338935, "grad_norm": 0.8441325235257565, "learning_rate": 8.529715727489912e-07, "loss": 0.014, "step": 8751 }, { "epoch": 8.171802054154995, "grad_norm": 3.671341207337128, "learning_rate": 8.521270181952501e-07, "loss": 0.0599, "step": 8752 }, { "epoch": 8.172735760971054, "grad_norm": 2.2416695040291867, "learning_rate": 8.512828430100456e-07, "loss": 0.0553, "step": 8753 }, { "epoch": 8.173669467787114, "grad_norm": 2.878072581176899, "learning_rate": 8.504390472705865e-07, "loss": 0.0825, "step": 8754 }, { "epoch": 8.174603174603174, "grad_norm": 0.7777067298096689, "learning_rate": 8.495956310540454e-07, "loss": 0.014, "step": 8755 }, { "epoch": 8.175536881419234, "grad_norm": 1.639422047993742, "learning_rate": 8.487525944375652e-07, "loss": 0.0346, "step": 8756 }, { "epoch": 8.176470588235293, "grad_norm": 3.575131614034447, "learning_rate": 8.479099374982491e-07, "loss": 0.0782, "step": 8757 }, { "epoch": 8.177404295051353, "grad_norm": 0.635917845974106, "learning_rate": 8.470676603131683e-07, "loss": 0.0157, "step": 8758 }, { "epoch": 8.178338001867413, "grad_norm": 2.2681579359953967, "learning_rate": 8.462257629593557e-07, "loss": 0.0542, "step": 8759 }, { "epoch": 8.179271708683473, "grad_norm": 1.7432687832852185, "learning_rate": 8.453842455138156e-07, "loss": 0.0429, "step": 8760 }, { "epoch": 8.180205415499533, "grad_norm": 3.2809065905646726, "learning_rate": 8.445431080535122e-07, "loss": 0.0714, "step": 8761 }, { "epoch": 8.181139122315592, "grad_norm": 3.37157041737175, "learning_rate": 8.437023506553755e-07, "loss": 0.0825, "step": 8762 }, { "epoch": 8.182072829131652, "grad_norm": 0.7332098853156345, "learning_rate": 8.42861973396305e-07, "loss": 0.0195, "step": 8763 }, { "epoch": 8.183006535947712, "grad_norm": 2.889857226067937, "learning_rate": 8.420219763531606e-07, "loss": 0.0624, "step": 8764 }, { "epoch": 8.183940242763772, "grad_norm": 2.9807067128116054, "learning_rate": 8.411823596027685e-07, "loss": 0.1105, "step": 8765 }, { "epoch": 8.184873949579831, "grad_norm": 2.663934385897028, "learning_rate": 8.403431232219228e-07, "loss": 0.0793, "step": 8766 }, { "epoch": 8.185807656395891, "grad_norm": 2.831312188185438, "learning_rate": 8.395042672873805e-07, "loss": 0.0789, "step": 8767 }, { "epoch": 8.18674136321195, "grad_norm": 2.944562478628154, "learning_rate": 8.386657918758634e-07, "loss": 0.1048, "step": 8768 }, { "epoch": 8.18767507002801, "grad_norm": 0.5300837433192814, "learning_rate": 8.378276970640581e-07, "loss": 0.0074, "step": 8769 }, { "epoch": 8.18860877684407, "grad_norm": 5.547821351971151, "learning_rate": 8.369899829286204e-07, "loss": 0.1413, "step": 8770 }, { "epoch": 8.18954248366013, "grad_norm": 1.666935473797252, "learning_rate": 8.36152649546167e-07, "loss": 0.0278, "step": 8771 }, { "epoch": 8.19047619047619, "grad_norm": 3.4945594780399447, "learning_rate": 8.353156969932802e-07, "loss": 0.0761, "step": 8772 }, { "epoch": 8.19140989729225, "grad_norm": 2.0275845968326127, "learning_rate": 8.344791253465107e-07, "loss": 0.0425, "step": 8773 }, { "epoch": 8.19234360410831, "grad_norm": 5.319752762920889, "learning_rate": 8.336429346823711e-07, "loss": 0.1016, "step": 8774 }, { "epoch": 8.193277310924369, "grad_norm": 1.0098548725023815, "learning_rate": 8.3280712507734e-07, "loss": 0.0204, "step": 8775 }, { "epoch": 8.194211017740429, "grad_norm": 2.21684702320035, "learning_rate": 8.319716966078623e-07, "loss": 0.0767, "step": 8776 }, { "epoch": 8.195144724556489, "grad_norm": 0.665059198642096, "learning_rate": 8.311366493503442e-07, "loss": 0.0123, "step": 8777 }, { "epoch": 8.196078431372548, "grad_norm": 1.166043230198705, "learning_rate": 8.303019833811638e-07, "loss": 0.0083, "step": 8778 }, { "epoch": 8.197012138188608, "grad_norm": 5.167237192555075, "learning_rate": 8.294676987766587e-07, "loss": 0.2421, "step": 8779 }, { "epoch": 8.197945845004668, "grad_norm": 1.5469844353330755, "learning_rate": 8.286337956131335e-07, "loss": 0.0174, "step": 8780 }, { "epoch": 8.198879551820728, "grad_norm": 3.358911638056852, "learning_rate": 8.278002739668567e-07, "loss": 0.0941, "step": 8781 }, { "epoch": 8.199813258636787, "grad_norm": 3.9087085175033507, "learning_rate": 8.26967133914065e-07, "loss": 0.1116, "step": 8782 }, { "epoch": 8.200746965452847, "grad_norm": 1.8299868508287116, "learning_rate": 8.261343755309575e-07, "loss": 0.0326, "step": 8783 }, { "epoch": 8.201680672268907, "grad_norm": 3.1545722468549275, "learning_rate": 8.253019988936989e-07, "loss": 0.0911, "step": 8784 }, { "epoch": 8.202614379084967, "grad_norm": 2.5844034255021753, "learning_rate": 8.244700040784176e-07, "loss": 0.0928, "step": 8785 }, { "epoch": 8.203548085901026, "grad_norm": 2.968078921511912, "learning_rate": 8.236383911612117e-07, "loss": 0.0657, "step": 8786 }, { "epoch": 8.204481792717086, "grad_norm": 1.1318141740601588, "learning_rate": 8.228071602181403e-07, "loss": 0.0191, "step": 8787 }, { "epoch": 8.205415499533146, "grad_norm": 1.1530048752872224, "learning_rate": 8.219763113252277e-07, "loss": 0.0206, "step": 8788 }, { "epoch": 8.206349206349206, "grad_norm": 1.6513105999535642, "learning_rate": 8.211458445584636e-07, "loss": 0.0503, "step": 8789 }, { "epoch": 8.207282913165265, "grad_norm": 1.6756783929075612, "learning_rate": 8.20315759993805e-07, "loss": 0.048, "step": 8790 }, { "epoch": 8.208216619981325, "grad_norm": 2.12562188602194, "learning_rate": 8.194860577071722e-07, "loss": 0.0183, "step": 8791 }, { "epoch": 8.209150326797385, "grad_norm": 1.535426883782885, "learning_rate": 8.186567377744497e-07, "loss": 0.055, "step": 8792 }, { "epoch": 8.210084033613445, "grad_norm": 0.5044567852101443, "learning_rate": 8.178278002714879e-07, "loss": 0.0045, "step": 8793 }, { "epoch": 8.211017740429504, "grad_norm": 1.1465475889677839, "learning_rate": 8.169992452741015e-07, "loss": 0.0102, "step": 8794 }, { "epoch": 8.211951447245564, "grad_norm": 1.719716882878331, "learning_rate": 8.161710728580735e-07, "loss": 0.0441, "step": 8795 }, { "epoch": 8.212885154061624, "grad_norm": 5.022069177665144, "learning_rate": 8.153432830991476e-07, "loss": 0.1248, "step": 8796 }, { "epoch": 8.213818860877684, "grad_norm": 3.237157932538472, "learning_rate": 8.145158760730349e-07, "loss": 0.0588, "step": 8797 }, { "epoch": 8.214752567693743, "grad_norm": 4.406627652261035, "learning_rate": 8.136888518554087e-07, "loss": 0.1072, "step": 8798 }, { "epoch": 8.215686274509803, "grad_norm": 2.183913396075576, "learning_rate": 8.128622105219125e-07, "loss": 0.0405, "step": 8799 }, { "epoch": 8.216619981325863, "grad_norm": 0.21601627092675696, "learning_rate": 8.120359521481502e-07, "loss": 0.0055, "step": 8800 }, { "epoch": 8.217553688141923, "grad_norm": 4.660979158078263, "learning_rate": 8.112100768096931e-07, "loss": 0.138, "step": 8801 }, { "epoch": 8.218487394957982, "grad_norm": 3.3111921198877035, "learning_rate": 8.10384584582074e-07, "loss": 0.0786, "step": 8802 }, { "epoch": 8.219421101774042, "grad_norm": 5.535114333021164, "learning_rate": 8.095594755407971e-07, "loss": 0.002, "step": 8803 }, { "epoch": 8.220354808590102, "grad_norm": 5.117105059490619, "learning_rate": 8.087347497613251e-07, "loss": 0.1432, "step": 8804 }, { "epoch": 8.221288515406162, "grad_norm": 6.006507288742992, "learning_rate": 8.07910407319089e-07, "loss": 0.0279, "step": 8805 }, { "epoch": 8.222222222222221, "grad_norm": 22.171713983239812, "learning_rate": 8.070864482894824e-07, "loss": 0.3615, "step": 8806 }, { "epoch": 8.223155929038281, "grad_norm": 1.9642007763335643, "learning_rate": 8.062628727478683e-07, "loss": 0.0558, "step": 8807 }, { "epoch": 8.224089635854341, "grad_norm": 0.7339378131008744, "learning_rate": 8.054396807695702e-07, "loss": 0.0155, "step": 8808 }, { "epoch": 8.2250233426704, "grad_norm": 3.1283269429483402, "learning_rate": 8.046168724298781e-07, "loss": 0.1191, "step": 8809 }, { "epoch": 8.22595704948646, "grad_norm": 1.2182781515828658, "learning_rate": 8.037944478040471e-07, "loss": 0.0199, "step": 8810 }, { "epoch": 8.22689075630252, "grad_norm": 1.514204469159294, "learning_rate": 8.02972406967295e-07, "loss": 0.007, "step": 8811 }, { "epoch": 8.22782446311858, "grad_norm": 1.9484780385734584, "learning_rate": 8.021507499948095e-07, "loss": 0.0462, "step": 8812 }, { "epoch": 8.22875816993464, "grad_norm": 3.137343397644005, "learning_rate": 8.013294769617391e-07, "loss": 0.0948, "step": 8813 }, { "epoch": 8.2296918767507, "grad_norm": 3.087490172376868, "learning_rate": 8.005085879431979e-07, "loss": 0.0696, "step": 8814 }, { "epoch": 8.23062558356676, "grad_norm": 0.9770308898601424, "learning_rate": 7.996880830142639e-07, "loss": 0.0186, "step": 8815 }, { "epoch": 8.231559290382819, "grad_norm": 2.1376724332186043, "learning_rate": 7.98867962249984e-07, "loss": 0.0793, "step": 8816 }, { "epoch": 8.232492997198879, "grad_norm": 1.1713405228207552, "learning_rate": 7.980482257253658e-07, "loss": 0.0302, "step": 8817 }, { "epoch": 8.233426704014938, "grad_norm": 0.3353196165088285, "learning_rate": 7.972288735153839e-07, "loss": 0.0047, "step": 8818 }, { "epoch": 8.234360410830998, "grad_norm": 3.3715694336599356, "learning_rate": 7.964099056949743e-07, "loss": 0.0578, "step": 8819 }, { "epoch": 8.235294117647058, "grad_norm": 3.542890219092017, "learning_rate": 7.955913223390443e-07, "loss": 0.1053, "step": 8820 }, { "epoch": 8.236227824463118, "grad_norm": 1.623420037052956, "learning_rate": 7.947731235224615e-07, "loss": 0.0443, "step": 8821 }, { "epoch": 8.237161531279177, "grad_norm": 1.587926878428432, "learning_rate": 7.93955309320058e-07, "loss": 0.0216, "step": 8822 }, { "epoch": 8.238095238095237, "grad_norm": 3.5690906871785493, "learning_rate": 7.931378798066308e-07, "loss": 0.0762, "step": 8823 }, { "epoch": 8.239028944911297, "grad_norm": 1.0821391638610358, "learning_rate": 7.923208350569461e-07, "loss": 0.0233, "step": 8824 }, { "epoch": 8.239962651727357, "grad_norm": 2.5740321779656763, "learning_rate": 7.915041751457298e-07, "loss": 0.0697, "step": 8825 }, { "epoch": 8.240896358543417, "grad_norm": 0.1553994910592843, "learning_rate": 7.906879001476747e-07, "loss": 0.0005, "step": 8826 }, { "epoch": 8.241830065359476, "grad_norm": 1.7304389859042089, "learning_rate": 7.898720101374374e-07, "loss": 0.0674, "step": 8827 }, { "epoch": 8.242763772175536, "grad_norm": 0.1684801018647497, "learning_rate": 7.890565051896399e-07, "loss": 0.0045, "step": 8828 }, { "epoch": 8.243697478991596, "grad_norm": 1.4716804819338376, "learning_rate": 7.882413853788707e-07, "loss": 0.02, "step": 8829 }, { "epoch": 8.244631185807656, "grad_norm": 1.737494513944434, "learning_rate": 7.874266507796801e-07, "loss": 0.0365, "step": 8830 }, { "epoch": 8.245564892623715, "grad_norm": 2.2466710077031298, "learning_rate": 7.866123014665855e-07, "loss": 0.0707, "step": 8831 }, { "epoch": 8.246498599439775, "grad_norm": 1.9875828487373237, "learning_rate": 7.85798337514066e-07, "loss": 0.0584, "step": 8832 }, { "epoch": 8.247432306255835, "grad_norm": 1.5439827075165224, "learning_rate": 7.849847589965703e-07, "loss": 0.0495, "step": 8833 }, { "epoch": 8.248366013071895, "grad_norm": 2.62157442755547, "learning_rate": 7.84171565988508e-07, "loss": 0.0371, "step": 8834 }, { "epoch": 8.249299719887954, "grad_norm": 2.5163638605564707, "learning_rate": 7.833587585642527e-07, "loss": 0.0871, "step": 8835 }, { "epoch": 8.250233426704014, "grad_norm": 1.9546680410424437, "learning_rate": 7.825463367981479e-07, "loss": 0.0497, "step": 8836 }, { "epoch": 8.251167133520074, "grad_norm": 0.5453422097355891, "learning_rate": 7.817343007644962e-07, "loss": 0.0129, "step": 8837 }, { "epoch": 8.252100840336134, "grad_norm": 1.2960945010864489, "learning_rate": 7.809226505375683e-07, "loss": 0.0306, "step": 8838 }, { "epoch": 8.253034547152193, "grad_norm": 2.2769884190534024, "learning_rate": 7.801113861915966e-07, "loss": 0.0408, "step": 8839 }, { "epoch": 8.253968253968253, "grad_norm": 3.3104406617068642, "learning_rate": 7.793005078007832e-07, "loss": 0.0728, "step": 8840 }, { "epoch": 8.254901960784313, "grad_norm": 1.6576686510684862, "learning_rate": 7.7849001543929e-07, "loss": 0.0312, "step": 8841 }, { "epoch": 8.255835667600373, "grad_norm": 0.9766687749721734, "learning_rate": 7.776799091812442e-07, "loss": 0.0175, "step": 8842 }, { "epoch": 8.256769374416432, "grad_norm": 1.1685454761432537, "learning_rate": 7.768701891007419e-07, "loss": 0.0202, "step": 8843 }, { "epoch": 8.257703081232492, "grad_norm": 1.7617122685676379, "learning_rate": 7.760608552718391e-07, "loss": 0.0478, "step": 8844 }, { "epoch": 8.258636788048554, "grad_norm": 2.916947897441607, "learning_rate": 7.752519077685577e-07, "loss": 0.1317, "step": 8845 }, { "epoch": 8.259570494864613, "grad_norm": 4.393331640509236, "learning_rate": 7.744433466648865e-07, "loss": 0.1179, "step": 8846 }, { "epoch": 8.260504201680673, "grad_norm": 3.267928872438554, "learning_rate": 7.736351720347762e-07, "loss": 0.1099, "step": 8847 }, { "epoch": 8.261437908496733, "grad_norm": 2.3962219934374605, "learning_rate": 7.728273839521439e-07, "loss": 0.0879, "step": 8848 }, { "epoch": 8.262371615312793, "grad_norm": 0.8980452610847771, "learning_rate": 7.720199824908692e-07, "loss": 0.0176, "step": 8849 }, { "epoch": 8.263305322128852, "grad_norm": 0.20338836188041073, "learning_rate": 7.712129677247998e-07, "loss": 0.0013, "step": 8850 }, { "epoch": 8.264239028944912, "grad_norm": 2.5273975961618897, "learning_rate": 7.704063397277456e-07, "loss": 0.0839, "step": 8851 }, { "epoch": 8.265172735760972, "grad_norm": 5.696528164529419, "learning_rate": 7.696000985734808e-07, "loss": 0.1406, "step": 8852 }, { "epoch": 8.266106442577032, "grad_norm": 0.6892433957268569, "learning_rate": 7.687942443357443e-07, "loss": 0.0128, "step": 8853 }, { "epoch": 8.267040149393091, "grad_norm": 1.4818498578735044, "learning_rate": 7.679887770882421e-07, "loss": 0.0409, "step": 8854 }, { "epoch": 8.267973856209151, "grad_norm": 2.911290977050291, "learning_rate": 7.671836969046431e-07, "loss": 0.0393, "step": 8855 }, { "epoch": 8.268907563025211, "grad_norm": 1.7882361773421318, "learning_rate": 7.663790038585794e-07, "loss": 0.0499, "step": 8856 }, { "epoch": 8.26984126984127, "grad_norm": 0.9832953220808993, "learning_rate": 7.655746980236489e-07, "loss": 0.0202, "step": 8857 }, { "epoch": 8.27077497665733, "grad_norm": 2.1415224718914683, "learning_rate": 7.647707794734155e-07, "loss": 0.0357, "step": 8858 }, { "epoch": 8.27170868347339, "grad_norm": 1.1711289923165573, "learning_rate": 7.639672482814059e-07, "loss": 0.0262, "step": 8859 }, { "epoch": 8.27264239028945, "grad_norm": 3.1656717520394726, "learning_rate": 7.631641045211119e-07, "loss": 0.0601, "step": 8860 }, { "epoch": 8.27357609710551, "grad_norm": 3.421264387729533, "learning_rate": 7.623613482659892e-07, "loss": 0.0948, "step": 8861 }, { "epoch": 8.27450980392157, "grad_norm": 1.2187393378515257, "learning_rate": 7.615589795894585e-07, "loss": 0.0206, "step": 8862 }, { "epoch": 8.27544351073763, "grad_norm": 11.873725212210665, "learning_rate": 7.607569985649066e-07, "loss": 0.4515, "step": 8863 }, { "epoch": 8.276377217553689, "grad_norm": 2.3892583341361715, "learning_rate": 7.599554052656827e-07, "loss": 0.0533, "step": 8864 }, { "epoch": 8.277310924369749, "grad_norm": 1.5918598130598818, "learning_rate": 7.591541997651014e-07, "loss": 0.0352, "step": 8865 }, { "epoch": 8.278244631185808, "grad_norm": 0.8580443337269196, "learning_rate": 7.583533821364403e-07, "loss": 0.012, "step": 8866 }, { "epoch": 8.279178338001868, "grad_norm": 4.57322710900358, "learning_rate": 7.575529524529456e-07, "loss": 0.0434, "step": 8867 }, { "epoch": 8.280112044817928, "grad_norm": 1.7302466274423172, "learning_rate": 7.56752910787824e-07, "loss": 0.0371, "step": 8868 }, { "epoch": 8.281045751633988, "grad_norm": 1.1762953878505493, "learning_rate": 7.559532572142481e-07, "loss": 0.0254, "step": 8869 }, { "epoch": 8.281979458450047, "grad_norm": 2.55056134609046, "learning_rate": 7.551539918053541e-07, "loss": 0.0293, "step": 8870 }, { "epoch": 8.282913165266107, "grad_norm": 2.5732488417373682, "learning_rate": 7.543551146342459e-07, "loss": 0.06, "step": 8871 }, { "epoch": 8.283846872082167, "grad_norm": 1.3008631451678823, "learning_rate": 7.53556625773988e-07, "loss": 0.0246, "step": 8872 }, { "epoch": 8.284780578898227, "grad_norm": 0.35904222546808123, "learning_rate": 7.527585252976111e-07, "loss": 0.004, "step": 8873 }, { "epoch": 8.285714285714286, "grad_norm": 2.205412312648578, "learning_rate": 7.519608132781097e-07, "loss": 0.0453, "step": 8874 }, { "epoch": 8.286647992530346, "grad_norm": 0.5480632166509454, "learning_rate": 7.511634897884451e-07, "loss": 0.0135, "step": 8875 }, { "epoch": 8.287581699346406, "grad_norm": 5.713598301184929, "learning_rate": 7.503665549015399e-07, "loss": 0.0681, "step": 8876 }, { "epoch": 8.288515406162466, "grad_norm": 0.9610751898033214, "learning_rate": 7.49570008690283e-07, "loss": 0.0208, "step": 8877 }, { "epoch": 8.289449112978525, "grad_norm": 1.7518287449973065, "learning_rate": 7.487738512275272e-07, "loss": 0.0504, "step": 8878 }, { "epoch": 8.290382819794585, "grad_norm": 0.7823479729910968, "learning_rate": 7.479780825860889e-07, "loss": 0.01, "step": 8879 }, { "epoch": 8.291316526610645, "grad_norm": 3.675960356070964, "learning_rate": 7.471827028387518e-07, "loss": 0.102, "step": 8880 }, { "epoch": 8.292250233426705, "grad_norm": 0.3742165061365832, "learning_rate": 7.463877120582608e-07, "loss": 0.0064, "step": 8881 }, { "epoch": 8.293183940242765, "grad_norm": 1.668727225792187, "learning_rate": 7.455931103173269e-07, "loss": 0.0282, "step": 8882 }, { "epoch": 8.294117647058824, "grad_norm": 5.93182358166431, "learning_rate": 7.447988976886244e-07, "loss": 0.1545, "step": 8883 }, { "epoch": 8.295051353874884, "grad_norm": 1.7277872717163427, "learning_rate": 7.440050742447941e-07, "loss": 0.0416, "step": 8884 }, { "epoch": 8.295985060690944, "grad_norm": 2.8440089053164304, "learning_rate": 7.432116400584394e-07, "loss": 0.063, "step": 8885 }, { "epoch": 8.296918767507004, "grad_norm": 2.7892491517708597, "learning_rate": 7.424185952021284e-07, "loss": 0.0574, "step": 8886 }, { "epoch": 8.297852474323063, "grad_norm": 2.044879491054388, "learning_rate": 7.416259397483927e-07, "loss": 0.0447, "step": 8887 }, { "epoch": 8.298786181139123, "grad_norm": 2.1457168297863065, "learning_rate": 7.408336737697314e-07, "loss": 0.0646, "step": 8888 }, { "epoch": 8.299719887955183, "grad_norm": 1.2980513872051205, "learning_rate": 7.400417973386053e-07, "loss": 0.024, "step": 8889 }, { "epoch": 8.300653594771243, "grad_norm": 0.6348441823253296, "learning_rate": 7.392503105274396e-07, "loss": 0.0076, "step": 8890 }, { "epoch": 8.301587301587302, "grad_norm": 1.3552924899727539, "learning_rate": 7.384592134086233e-07, "loss": 0.027, "step": 8891 }, { "epoch": 8.302521008403362, "grad_norm": 3.3290028109883476, "learning_rate": 7.376685060545141e-07, "loss": 0.0417, "step": 8892 }, { "epoch": 8.303454715219422, "grad_norm": 2.082637334375605, "learning_rate": 7.368781885374282e-07, "loss": 0.0223, "step": 8893 }, { "epoch": 8.304388422035482, "grad_norm": 0.766815482586414, "learning_rate": 7.360882609296504e-07, "loss": 0.0104, "step": 8894 }, { "epoch": 8.305322128851541, "grad_norm": 0.4811475471125684, "learning_rate": 7.352987233034275e-07, "loss": 0.0068, "step": 8895 }, { "epoch": 8.306255835667601, "grad_norm": 0.5646490892574079, "learning_rate": 7.345095757309701e-07, "loss": 0.0088, "step": 8896 }, { "epoch": 8.30718954248366, "grad_norm": 1.6506713135769726, "learning_rate": 7.337208182844568e-07, "loss": 0.0371, "step": 8897 }, { "epoch": 8.30812324929972, "grad_norm": 1.6416255394027768, "learning_rate": 7.329324510360269e-07, "loss": 0.0404, "step": 8898 }, { "epoch": 8.30905695611578, "grad_norm": 3.9093883895055064, "learning_rate": 7.32144474057786e-07, "loss": 0.064, "step": 8899 }, { "epoch": 8.30999066293184, "grad_norm": 1.7064108016015325, "learning_rate": 7.313568874218013e-07, "loss": 0.0348, "step": 8900 }, { "epoch": 8.3109243697479, "grad_norm": 2.6972406648674885, "learning_rate": 7.305696912001087e-07, "loss": 0.0382, "step": 8901 }, { "epoch": 8.31185807656396, "grad_norm": 0.5960570936014173, "learning_rate": 7.297828854647049e-07, "loss": 0.0034, "step": 8902 }, { "epoch": 8.31279178338002, "grad_norm": 2.7167321223298826, "learning_rate": 7.289964702875518e-07, "loss": 0.0561, "step": 8903 }, { "epoch": 8.313725490196079, "grad_norm": 4.3896993362652905, "learning_rate": 7.282104457405747e-07, "loss": 0.1322, "step": 8904 }, { "epoch": 8.314659197012139, "grad_norm": 0.648589679864823, "learning_rate": 7.274248118956667e-07, "loss": 0.0129, "step": 8905 }, { "epoch": 8.315592903828199, "grad_norm": 1.5436265777473215, "learning_rate": 7.266395688246808e-07, "loss": 0.045, "step": 8906 }, { "epoch": 8.316526610644258, "grad_norm": 10.436548041264645, "learning_rate": 7.258547165994367e-07, "loss": 0.179, "step": 8907 }, { "epoch": 8.317460317460318, "grad_norm": 2.2984960198418993, "learning_rate": 7.250702552917161e-07, "loss": 0.0914, "step": 8908 }, { "epoch": 8.318394024276378, "grad_norm": 0.03502444285354296, "learning_rate": 7.242861849732696e-07, "loss": 0.0001, "step": 8909 }, { "epoch": 8.319327731092438, "grad_norm": 2.813269018564369, "learning_rate": 7.235025057158073e-07, "loss": 0.1078, "step": 8910 }, { "epoch": 8.320261437908497, "grad_norm": 1.9809880488828704, "learning_rate": 7.227192175910058e-07, "loss": 0.0824, "step": 8911 }, { "epoch": 8.321195144724557, "grad_norm": 3.325842324557798, "learning_rate": 7.219363206705038e-07, "loss": 0.098, "step": 8912 }, { "epoch": 8.322128851540617, "grad_norm": 1.8395447027964438, "learning_rate": 7.211538150259078e-07, "loss": 0.0421, "step": 8913 }, { "epoch": 8.323062558356677, "grad_norm": 1.1963763917299892, "learning_rate": 7.203717007287864e-07, "loss": 0.0254, "step": 8914 }, { "epoch": 8.323996265172736, "grad_norm": 0.6007631661824385, "learning_rate": 7.19589977850671e-07, "loss": 0.0139, "step": 8915 }, { "epoch": 8.324929971988796, "grad_norm": 1.8129858248603277, "learning_rate": 7.1880864646306e-07, "loss": 0.0197, "step": 8916 }, { "epoch": 8.325863678804856, "grad_norm": 0.7855287762113853, "learning_rate": 7.180277066374152e-07, "loss": 0.0114, "step": 8917 }, { "epoch": 8.326797385620916, "grad_norm": 2.0485150018699487, "learning_rate": 7.172471584451612e-07, "loss": 0.0424, "step": 8918 }, { "epoch": 8.327731092436975, "grad_norm": 1.5897041169294404, "learning_rate": 7.164670019576869e-07, "loss": 0.0192, "step": 8919 }, { "epoch": 8.328664799253035, "grad_norm": 0.4791693161773505, "learning_rate": 7.15687237246348e-07, "loss": 0.0066, "step": 8920 }, { "epoch": 8.329598506069095, "grad_norm": 1.9246989228893767, "learning_rate": 7.149078643824619e-07, "loss": 0.048, "step": 8921 }, { "epoch": 8.330532212885155, "grad_norm": 1.2986935838009634, "learning_rate": 7.141288834373095e-07, "loss": 0.0218, "step": 8922 }, { "epoch": 8.331465919701214, "grad_norm": 0.5643793706920819, "learning_rate": 7.133502944821391e-07, "loss": 0.0065, "step": 8923 }, { "epoch": 8.332399626517274, "grad_norm": 0.4772683600344093, "learning_rate": 7.125720975881606e-07, "loss": 0.0099, "step": 8924 }, { "epoch": 8.333333333333334, "grad_norm": 3.414802999895655, "learning_rate": 7.117942928265487e-07, "loss": 0.076, "step": 8925 }, { "epoch": 8.334267040149394, "grad_norm": 4.32989712206595, "learning_rate": 7.110168802684408e-07, "loss": 0.0493, "step": 8926 }, { "epoch": 8.335200746965453, "grad_norm": 3.1766998179759423, "learning_rate": 7.102398599849419e-07, "loss": 0.0537, "step": 8927 }, { "epoch": 8.336134453781513, "grad_norm": 3.6683663045057586, "learning_rate": 7.094632320471179e-07, "loss": 0.0393, "step": 8928 }, { "epoch": 8.337068160597573, "grad_norm": 0.5786600093072999, "learning_rate": 7.086869965260002e-07, "loss": 0.0064, "step": 8929 }, { "epoch": 8.338001867413633, "grad_norm": 2.035310458633564, "learning_rate": 7.079111534925831e-07, "loss": 0.0371, "step": 8930 }, { "epoch": 8.338935574229692, "grad_norm": 1.1945729246395227, "learning_rate": 7.071357030178277e-07, "loss": 0.0246, "step": 8931 }, { "epoch": 8.339869281045752, "grad_norm": 4.990648680166997, "learning_rate": 7.063606451726568e-07, "loss": 0.1266, "step": 8932 }, { "epoch": 8.340802987861812, "grad_norm": 6.64573051331339, "learning_rate": 7.055859800279579e-07, "loss": 0.1485, "step": 8933 }, { "epoch": 8.341736694677872, "grad_norm": 2.8023142147296407, "learning_rate": 7.048117076545813e-07, "loss": 0.072, "step": 8934 }, { "epoch": 8.342670401493931, "grad_norm": 0.9606474589214176, "learning_rate": 7.040378281233451e-07, "loss": 0.0241, "step": 8935 }, { "epoch": 8.343604108309991, "grad_norm": 0.6506409691948676, "learning_rate": 7.032643415050278e-07, "loss": 0.0101, "step": 8936 }, { "epoch": 8.344537815126051, "grad_norm": 1.6647581289472346, "learning_rate": 7.024912478703732e-07, "loss": 0.0415, "step": 8937 }, { "epoch": 8.34547152194211, "grad_norm": 3.243726908066888, "learning_rate": 7.017185472900889e-07, "loss": 0.0751, "step": 8938 }, { "epoch": 8.34640522875817, "grad_norm": 0.8899763633372619, "learning_rate": 7.009462398348477e-07, "loss": 0.0161, "step": 8939 }, { "epoch": 8.34733893557423, "grad_norm": 0.9812625629390374, "learning_rate": 7.001743255752857e-07, "loss": 0.0265, "step": 8940 }, { "epoch": 8.34827264239029, "grad_norm": 0.378763130470519, "learning_rate": 6.994028045820023e-07, "loss": 0.0049, "step": 8941 }, { "epoch": 8.34920634920635, "grad_norm": 2.508983923787848, "learning_rate": 6.986316769255614e-07, "loss": 0.0644, "step": 8942 }, { "epoch": 8.35014005602241, "grad_norm": 1.2334446541717397, "learning_rate": 6.978609426764909e-07, "loss": 0.0274, "step": 8943 }, { "epoch": 8.35107376283847, "grad_norm": 4.451044191037729, "learning_rate": 6.970906019052842e-07, "loss": 0.1043, "step": 8944 }, { "epoch": 8.352007469654529, "grad_norm": 5.373216813172096, "learning_rate": 6.963206546823964e-07, "loss": 0.1045, "step": 8945 }, { "epoch": 8.352941176470589, "grad_norm": 0.4328319278870379, "learning_rate": 6.955511010782484e-07, "loss": 0.0027, "step": 8946 }, { "epoch": 8.353874883286649, "grad_norm": 1.5216133001400516, "learning_rate": 6.947819411632223e-07, "loss": 0.0266, "step": 8947 }, { "epoch": 8.354808590102708, "grad_norm": 3.260940193101481, "learning_rate": 6.940131750076689e-07, "loss": 0.1425, "step": 8948 }, { "epoch": 8.355742296918768, "grad_norm": 1.0123448042617817, "learning_rate": 6.932448026818994e-07, "loss": 0.0188, "step": 8949 }, { "epoch": 8.356676003734828, "grad_norm": 3.0427762324710326, "learning_rate": 6.924768242561891e-07, "loss": 0.08, "step": 8950 }, { "epoch": 8.357609710550888, "grad_norm": 0.5685284845389229, "learning_rate": 6.91709239800778e-07, "loss": 0.0108, "step": 8951 }, { "epoch": 8.358543417366947, "grad_norm": 0.4350566650472807, "learning_rate": 6.909420493858715e-07, "loss": 0.006, "step": 8952 }, { "epoch": 8.359477124183007, "grad_norm": 0.33479536158038753, "learning_rate": 6.901752530816369e-07, "loss": 0.0039, "step": 8953 }, { "epoch": 8.360410830999067, "grad_norm": 1.102947749546013, "learning_rate": 6.894088509582059e-07, "loss": 0.015, "step": 8954 }, { "epoch": 8.361344537815127, "grad_norm": 0.6513632736418532, "learning_rate": 6.886428430856734e-07, "loss": 0.0094, "step": 8955 }, { "epoch": 8.362278244631186, "grad_norm": 2.4876406941559623, "learning_rate": 6.878772295341018e-07, "loss": 0.0332, "step": 8956 }, { "epoch": 8.363211951447246, "grad_norm": 7.007550381808905, "learning_rate": 6.871120103735135e-07, "loss": 0.1541, "step": 8957 }, { "epoch": 8.364145658263306, "grad_norm": 4.117368332309015, "learning_rate": 6.863471856738957e-07, "loss": 0.1099, "step": 8958 }, { "epoch": 8.365079365079366, "grad_norm": 0.9975721005957083, "learning_rate": 6.855827555052008e-07, "loss": 0.006, "step": 8959 }, { "epoch": 8.366013071895425, "grad_norm": 1.174936598626776, "learning_rate": 6.848187199373424e-07, "loss": 0.0193, "step": 8960 }, { "epoch": 8.366946778711485, "grad_norm": 0.7194105123708391, "learning_rate": 6.840550790402029e-07, "loss": 0.0102, "step": 8961 }, { "epoch": 8.367880485527545, "grad_norm": 3.1872295332076743, "learning_rate": 6.832918328836247e-07, "loss": 0.05, "step": 8962 }, { "epoch": 8.368814192343605, "grad_norm": 2.9953357071734303, "learning_rate": 6.825289815374142e-07, "loss": 0.0872, "step": 8963 }, { "epoch": 8.369747899159664, "grad_norm": 2.4123798735837463, "learning_rate": 6.817665250713418e-07, "loss": 0.0833, "step": 8964 }, { "epoch": 8.370681605975724, "grad_norm": 1.0822714131686075, "learning_rate": 6.810044635551443e-07, "loss": 0.0263, "step": 8965 }, { "epoch": 8.371615312791784, "grad_norm": 1.8084574535712818, "learning_rate": 6.802427970585206e-07, "loss": 0.0296, "step": 8966 }, { "epoch": 8.372549019607844, "grad_norm": 1.1794794387902128, "learning_rate": 6.794815256511328e-07, "loss": 0.0263, "step": 8967 }, { "epoch": 8.373482726423903, "grad_norm": 2.113448323013571, "learning_rate": 6.787206494026066e-07, "loss": 0.0459, "step": 8968 }, { "epoch": 8.374416433239963, "grad_norm": 0.9892147700629625, "learning_rate": 6.779601683825343e-07, "loss": 0.0262, "step": 8969 }, { "epoch": 8.375350140056023, "grad_norm": 2.82513711002304, "learning_rate": 6.772000826604691e-07, "loss": 0.0972, "step": 8970 }, { "epoch": 8.376283846872083, "grad_norm": 2.2054629733070077, "learning_rate": 6.764403923059299e-07, "loss": 0.0393, "step": 8971 }, { "epoch": 8.377217553688142, "grad_norm": 0.21242939057427052, "learning_rate": 6.756810973883971e-07, "loss": 0.0021, "step": 8972 }, { "epoch": 8.378151260504202, "grad_norm": 3.2223714922200895, "learning_rate": 6.749221979773185e-07, "loss": 0.0787, "step": 8973 }, { "epoch": 8.379084967320262, "grad_norm": 0.715407015300622, "learning_rate": 6.741636941421032e-07, "loss": 0.0091, "step": 8974 }, { "epoch": 8.380018674136322, "grad_norm": 2.0230702232633786, "learning_rate": 6.734055859521249e-07, "loss": 0.0694, "step": 8975 }, { "epoch": 8.380952380952381, "grad_norm": 0.5045492301964227, "learning_rate": 6.7264787347672e-07, "loss": 0.0085, "step": 8976 }, { "epoch": 8.381886087768441, "grad_norm": 2.156218577333844, "learning_rate": 6.718905567851896e-07, "loss": 0.0348, "step": 8977 }, { "epoch": 8.3828197945845, "grad_norm": 2.4130340178482768, "learning_rate": 6.711336359468001e-07, "loss": 0.0961, "step": 8978 }, { "epoch": 8.38375350140056, "grad_norm": 2.035745007275119, "learning_rate": 6.703771110307794e-07, "loss": 0.0539, "step": 8979 }, { "epoch": 8.38468720821662, "grad_norm": 2.2957206183949364, "learning_rate": 6.696209821063199e-07, "loss": 0.065, "step": 8980 }, { "epoch": 8.38562091503268, "grad_norm": 4.152330595527751, "learning_rate": 6.688652492425768e-07, "loss": 0.0802, "step": 8981 }, { "epoch": 8.38655462184874, "grad_norm": 0.9259348791486431, "learning_rate": 6.681099125086727e-07, "loss": 0.0194, "step": 8982 }, { "epoch": 8.3874883286648, "grad_norm": 3.1590504288910926, "learning_rate": 6.673549719736899e-07, "loss": 0.0817, "step": 8983 }, { "epoch": 8.38842203548086, "grad_norm": 4.4929267067625664, "learning_rate": 6.666004277066762e-07, "loss": 0.1614, "step": 8984 }, { "epoch": 8.38935574229692, "grad_norm": 3.1458781783297955, "learning_rate": 6.658462797766413e-07, "loss": 0.0279, "step": 8985 }, { "epoch": 8.390289449112979, "grad_norm": 0.35105941440116606, "learning_rate": 6.650925282525638e-07, "loss": 0.0085, "step": 8986 }, { "epoch": 8.391223155929039, "grad_norm": 2.5445517101122532, "learning_rate": 6.643391732033805e-07, "loss": 0.0436, "step": 8987 }, { "epoch": 8.392156862745098, "grad_norm": 0.9204155254864149, "learning_rate": 6.635862146979927e-07, "loss": 0.0166, "step": 8988 }, { "epoch": 8.393090569561158, "grad_norm": 2.4809884403880043, "learning_rate": 6.628336528052693e-07, "loss": 0.0591, "step": 8989 }, { "epoch": 8.394024276377218, "grad_norm": 2.074522555757835, "learning_rate": 6.620814875940395e-07, "loss": 0.0407, "step": 8990 }, { "epoch": 8.394957983193278, "grad_norm": 1.0701393442268994, "learning_rate": 6.61329719133097e-07, "loss": 0.014, "step": 8991 }, { "epoch": 8.395891690009337, "grad_norm": 0.36732829212924434, "learning_rate": 6.605783474911976e-07, "loss": 0.0021, "step": 8992 }, { "epoch": 8.396825396825397, "grad_norm": 1.0147902505771014, "learning_rate": 6.598273727370652e-07, "loss": 0.0359, "step": 8993 }, { "epoch": 8.397759103641457, "grad_norm": 0.5828556056461879, "learning_rate": 6.59076794939384e-07, "loss": 0.0157, "step": 8994 }, { "epoch": 8.398692810457517, "grad_norm": 0.386032785603992, "learning_rate": 6.583266141668005e-07, "loss": 0.0049, "step": 8995 }, { "epoch": 8.399626517273576, "grad_norm": 1.228184369497794, "learning_rate": 6.575768304879293e-07, "loss": 0.0362, "step": 8996 }, { "epoch": 8.400560224089636, "grad_norm": 1.1627085124990828, "learning_rate": 6.568274439713463e-07, "loss": 0.0112, "step": 8997 }, { "epoch": 8.401493930905696, "grad_norm": 0.4511791871043277, "learning_rate": 6.560784546855898e-07, "loss": 0.0085, "step": 8998 }, { "epoch": 8.402427637721756, "grad_norm": 2.2398134818096804, "learning_rate": 6.553298626991628e-07, "loss": 0.0804, "step": 8999 }, { "epoch": 8.403361344537815, "grad_norm": 3.840255705585501, "learning_rate": 6.545816680805345e-07, "loss": 0.0482, "step": 9000 }, { "epoch": 8.404295051353875, "grad_norm": 0.42312562336883797, "learning_rate": 6.53833870898134e-07, "loss": 0.0105, "step": 9001 }, { "epoch": 8.405228758169935, "grad_norm": 1.1493371133540529, "learning_rate": 6.530864712203544e-07, "loss": 0.0114, "step": 9002 }, { "epoch": 8.406162464985995, "grad_norm": 0.37318006993579556, "learning_rate": 6.523394691155566e-07, "loss": 0.0027, "step": 9003 }, { "epoch": 8.407096171802054, "grad_norm": 4.3366139638306676, "learning_rate": 6.515928646520602e-07, "loss": 0.0515, "step": 9004 }, { "epoch": 8.408029878618114, "grad_norm": 0.7735388839676601, "learning_rate": 6.508466578981504e-07, "loss": 0.0169, "step": 9005 }, { "epoch": 8.408963585434174, "grad_norm": 0.38975119833400607, "learning_rate": 6.501008489220756e-07, "loss": 0.0046, "step": 9006 }, { "epoch": 8.409897292250234, "grad_norm": 0.9032871273887414, "learning_rate": 6.493554377920497e-07, "loss": 0.0175, "step": 9007 }, { "epoch": 8.410830999066294, "grad_norm": 3.6045461125180696, "learning_rate": 6.486104245762481e-07, "loss": 0.04, "step": 9008 }, { "epoch": 8.411764705882353, "grad_norm": 1.6210563285161712, "learning_rate": 6.478658093428103e-07, "loss": 0.0146, "step": 9009 }, { "epoch": 8.412698412698413, "grad_norm": 13.333831233946778, "learning_rate": 6.471215921598394e-07, "loss": 0.2924, "step": 9010 }, { "epoch": 8.413632119514473, "grad_norm": 0.7349449010245138, "learning_rate": 6.463777730954012e-07, "loss": 0.0165, "step": 9011 }, { "epoch": 8.414565826330533, "grad_norm": 2.4099358940896716, "learning_rate": 6.456343522175284e-07, "loss": 0.0649, "step": 9012 }, { "epoch": 8.415499533146592, "grad_norm": 4.610334574003243, "learning_rate": 6.448913295942144e-07, "loss": 0.1547, "step": 9013 }, { "epoch": 8.416433239962652, "grad_norm": 2.3110135400430534, "learning_rate": 6.441487052934159e-07, "loss": 0.0528, "step": 9014 }, { "epoch": 8.417366946778712, "grad_norm": 1.883730057417235, "learning_rate": 6.43406479383053e-07, "loss": 0.0559, "step": 9015 }, { "epoch": 8.418300653594772, "grad_norm": 2.1833498812118126, "learning_rate": 6.426646519310131e-07, "loss": 0.0744, "step": 9016 }, { "epoch": 8.419234360410831, "grad_norm": 1.4578171335355297, "learning_rate": 6.419232230051431e-07, "loss": 0.0547, "step": 9017 }, { "epoch": 8.420168067226891, "grad_norm": 1.3669069584810043, "learning_rate": 6.411821926732548e-07, "loss": 0.0367, "step": 9018 }, { "epoch": 8.42110177404295, "grad_norm": 2.5547920083768854, "learning_rate": 6.404415610031228e-07, "loss": 0.0823, "step": 9019 }, { "epoch": 8.42203548085901, "grad_norm": 2.427874605003227, "learning_rate": 6.397013280624881e-07, "loss": 0.0497, "step": 9020 }, { "epoch": 8.42296918767507, "grad_norm": 1.4275956707465567, "learning_rate": 6.389614939190514e-07, "loss": 0.0274, "step": 9021 }, { "epoch": 8.42390289449113, "grad_norm": 0.6679841836694315, "learning_rate": 6.382220586404797e-07, "loss": 0.0126, "step": 9022 }, { "epoch": 8.42483660130719, "grad_norm": 0.5154676185183438, "learning_rate": 6.374830222944007e-07, "loss": 0.0055, "step": 9023 }, { "epoch": 8.42577030812325, "grad_norm": 3.8276383934662492, "learning_rate": 6.367443849484095e-07, "loss": 0.1184, "step": 9024 }, { "epoch": 8.42670401493931, "grad_norm": 4.1109271196770685, "learning_rate": 6.360061466700618e-07, "loss": 0.0739, "step": 9025 }, { "epoch": 8.427637721755369, "grad_norm": 1.6343538532506658, "learning_rate": 6.352683075268773e-07, "loss": 0.047, "step": 9026 }, { "epoch": 8.428571428571429, "grad_norm": 1.9575465042805336, "learning_rate": 6.345308675863399e-07, "loss": 0.049, "step": 9027 }, { "epoch": 8.429505135387489, "grad_norm": 3.069051999791861, "learning_rate": 6.337938269158955e-07, "loss": 0.0667, "step": 9028 }, { "epoch": 8.430438842203548, "grad_norm": 0.5354949612138661, "learning_rate": 6.33057185582956e-07, "loss": 0.0122, "step": 9029 }, { "epoch": 8.431372549019608, "grad_norm": 0.8122296458236665, "learning_rate": 6.323209436548955e-07, "loss": 0.021, "step": 9030 }, { "epoch": 8.432306255835668, "grad_norm": 1.749229466588533, "learning_rate": 6.315851011990499e-07, "loss": 0.0373, "step": 9031 }, { "epoch": 8.433239962651728, "grad_norm": 0.3936463854119801, "learning_rate": 6.3084965828272e-07, "loss": 0.0061, "step": 9032 }, { "epoch": 8.434173669467787, "grad_norm": 0.47453781137446865, "learning_rate": 6.301146149731724e-07, "loss": 0.0085, "step": 9033 }, { "epoch": 8.435107376283847, "grad_norm": 4.527941309096447, "learning_rate": 6.293799713376336e-07, "loss": 0.1437, "step": 9034 }, { "epoch": 8.436041083099907, "grad_norm": 3.2462722619224285, "learning_rate": 6.286457274432944e-07, "loss": 0.1207, "step": 9035 }, { "epoch": 8.436974789915967, "grad_norm": 1.2698343919563129, "learning_rate": 6.27911883357309e-07, "loss": 0.0045, "step": 9036 }, { "epoch": 8.437908496732026, "grad_norm": 3.0542826878638882, "learning_rate": 6.271784391467967e-07, "loss": 0.0993, "step": 9037 }, { "epoch": 8.438842203548086, "grad_norm": 1.0083455770308638, "learning_rate": 6.26445394878839e-07, "loss": 0.0219, "step": 9038 }, { "epoch": 8.439775910364146, "grad_norm": 4.052365076360867, "learning_rate": 6.257127506204807e-07, "loss": 0.0666, "step": 9039 }, { "epoch": 8.440709617180206, "grad_norm": 1.367317736854726, "learning_rate": 6.249805064387287e-07, "loss": 0.0242, "step": 9040 }, { "epoch": 8.441643323996265, "grad_norm": 7.070828459130906, "learning_rate": 6.242486624005567e-07, "loss": 0.2024, "step": 9041 }, { "epoch": 8.442577030812325, "grad_norm": 2.9806012171014284, "learning_rate": 6.235172185728993e-07, "loss": 0.0549, "step": 9042 }, { "epoch": 8.443510737628385, "grad_norm": 0.5849717628546144, "learning_rate": 6.227861750226555e-07, "loss": 0.0146, "step": 9043 }, { "epoch": 8.444444444444445, "grad_norm": 0.21617441310834418, "learning_rate": 6.22055531816686e-07, "loss": 0.0022, "step": 9044 }, { "epoch": 8.445378151260504, "grad_norm": 2.386880339605251, "learning_rate": 6.213252890218163e-07, "loss": 0.0418, "step": 9045 }, { "epoch": 8.446311858076564, "grad_norm": 2.5400058604948956, "learning_rate": 6.205954467048364e-07, "loss": 0.0465, "step": 9046 }, { "epoch": 8.447245564892624, "grad_norm": 3.5113001098428187, "learning_rate": 6.198660049324978e-07, "loss": 0.1173, "step": 9047 }, { "epoch": 8.448179271708684, "grad_norm": 3.9655523719287773, "learning_rate": 6.191369637715156e-07, "loss": 0.0907, "step": 9048 }, { "epoch": 8.449112978524743, "grad_norm": 0.9978330851761262, "learning_rate": 6.184083232885679e-07, "loss": 0.0168, "step": 9049 }, { "epoch": 8.450046685340803, "grad_norm": 4.84185530341577, "learning_rate": 6.176800835502989e-07, "loss": 0.2151, "step": 9050 }, { "epoch": 8.450980392156863, "grad_norm": 4.068278913047032, "learning_rate": 6.169522446233134e-07, "loss": 0.1779, "step": 9051 }, { "epoch": 8.451914098972923, "grad_norm": 0.1706385520500436, "learning_rate": 6.162248065741794e-07, "loss": 0.0014, "step": 9052 }, { "epoch": 8.452847805788982, "grad_norm": 2.944583470794216, "learning_rate": 6.154977694694287e-07, "loss": 0.0944, "step": 9053 }, { "epoch": 8.453781512605042, "grad_norm": 0.2215359150105267, "learning_rate": 6.147711333755584e-07, "loss": 0.0018, "step": 9054 }, { "epoch": 8.454715219421102, "grad_norm": 2.216738482516598, "learning_rate": 6.140448983590275e-07, "loss": 0.0972, "step": 9055 }, { "epoch": 8.455648926237162, "grad_norm": 2.964170188134062, "learning_rate": 6.133190644862569e-07, "loss": 0.0399, "step": 9056 }, { "epoch": 8.456582633053221, "grad_norm": 0.5004081351668589, "learning_rate": 6.125936318236325e-07, "loss": 0.012, "step": 9057 }, { "epoch": 8.457516339869281, "grad_norm": 2.285942860777701, "learning_rate": 6.118686004375019e-07, "loss": 0.0786, "step": 9058 }, { "epoch": 8.458450046685341, "grad_norm": 1.3279977741938498, "learning_rate": 6.111439703941796e-07, "loss": 0.0081, "step": 9059 }, { "epoch": 8.4593837535014, "grad_norm": 2.144534441797385, "learning_rate": 6.1041974175994e-07, "loss": 0.0374, "step": 9060 }, { "epoch": 8.46031746031746, "grad_norm": 3.489061319112452, "learning_rate": 6.096959146010212e-07, "loss": 0.0905, "step": 9061 }, { "epoch": 8.46125116713352, "grad_norm": 4.477382576348249, "learning_rate": 6.089724889836246e-07, "loss": 0.079, "step": 9062 }, { "epoch": 8.46218487394958, "grad_norm": 1.2182463744839165, "learning_rate": 6.082494649739173e-07, "loss": 0.0264, "step": 9063 }, { "epoch": 8.46311858076564, "grad_norm": 2.2641059435138953, "learning_rate": 6.075268426380265e-07, "loss": 0.0386, "step": 9064 }, { "epoch": 8.4640522875817, "grad_norm": 1.9380364898503781, "learning_rate": 6.068046220420437e-07, "loss": 0.0564, "step": 9065 }, { "epoch": 8.46498599439776, "grad_norm": 0.8603611494677591, "learning_rate": 6.060828032520249e-07, "loss": 0.0249, "step": 9066 }, { "epoch": 8.465919701213819, "grad_norm": 0.1965120329022539, "learning_rate": 6.053613863339885e-07, "loss": 0.004, "step": 9067 }, { "epoch": 8.466853408029879, "grad_norm": 1.134579021575069, "learning_rate": 6.04640371353914e-07, "loss": 0.0219, "step": 9068 }, { "epoch": 8.467787114845938, "grad_norm": 0.5892771338746187, "learning_rate": 6.039197583777484e-07, "loss": 0.0068, "step": 9069 }, { "epoch": 8.468720821661998, "grad_norm": 4.33212075763651, "learning_rate": 6.031995474713992e-07, "loss": 0.0804, "step": 9070 }, { "epoch": 8.469654528478058, "grad_norm": 0.6939991790307696, "learning_rate": 6.024797387007369e-07, "loss": 0.0045, "step": 9071 }, { "epoch": 8.470588235294118, "grad_norm": 2.4760046440163745, "learning_rate": 6.017603321315951e-07, "loss": 0.0542, "step": 9072 }, { "epoch": 8.471521942110178, "grad_norm": 2.91963564767028, "learning_rate": 6.010413278297738e-07, "loss": 0.0615, "step": 9073 }, { "epoch": 8.472455648926237, "grad_norm": 1.0746156638268982, "learning_rate": 6.003227258610322e-07, "loss": 0.0143, "step": 9074 }, { "epoch": 8.473389355742297, "grad_norm": 6.109516810316124, "learning_rate": 5.996045262910944e-07, "loss": 0.1353, "step": 9075 }, { "epoch": 8.474323062558357, "grad_norm": 1.5117675471434235, "learning_rate": 5.988867291856482e-07, "loss": 0.0306, "step": 9076 }, { "epoch": 8.475256769374417, "grad_norm": 4.931513486828286, "learning_rate": 5.981693346103445e-07, "loss": 0.2018, "step": 9077 }, { "epoch": 8.476190476190476, "grad_norm": 1.92352378442341, "learning_rate": 5.974523426307954e-07, "loss": 0.0188, "step": 9078 }, { "epoch": 8.477124183006536, "grad_norm": 1.7710529310756635, "learning_rate": 5.967357533125784e-07, "loss": 0.0466, "step": 9079 }, { "epoch": 8.478057889822596, "grad_norm": 1.0034267855479804, "learning_rate": 5.960195667212338e-07, "loss": 0.0131, "step": 9080 }, { "epoch": 8.478991596638656, "grad_norm": 1.6196797591290233, "learning_rate": 5.953037829222646e-07, "loss": 0.0252, "step": 9081 }, { "epoch": 8.479925303454715, "grad_norm": 2.7665922894782944, "learning_rate": 5.945884019811371e-07, "loss": 0.0813, "step": 9082 }, { "epoch": 8.480859010270775, "grad_norm": 0.19591837546801263, "learning_rate": 5.938734239632798e-07, "loss": 0.001, "step": 9083 }, { "epoch": 8.481792717086835, "grad_norm": 2.7045869774907456, "learning_rate": 5.931588489340867e-07, "loss": 0.0687, "step": 9084 }, { "epoch": 8.482726423902895, "grad_norm": 4.014109049916366, "learning_rate": 5.924446769589132e-07, "loss": 0.0919, "step": 9085 }, { "epoch": 8.483660130718954, "grad_norm": 1.169011381936676, "learning_rate": 5.917309081030781e-07, "loss": 0.0292, "step": 9086 }, { "epoch": 8.484593837535014, "grad_norm": 0.36489371319181874, "learning_rate": 5.91017542431862e-07, "loss": 0.0069, "step": 9087 }, { "epoch": 8.485527544351074, "grad_norm": 0.9103653485879295, "learning_rate": 5.903045800105123e-07, "loss": 0.0127, "step": 9088 }, { "epoch": 8.486461251167134, "grad_norm": 2.145705964188439, "learning_rate": 5.89592020904236e-07, "loss": 0.0176, "step": 9089 }, { "epoch": 8.487394957983193, "grad_norm": 2.122923659323438, "learning_rate": 5.888798651782046e-07, "loss": 0.0641, "step": 9090 }, { "epoch": 8.488328664799253, "grad_norm": 4.747351085350967, "learning_rate": 5.881681128975525e-07, "loss": 0.0729, "step": 9091 }, { "epoch": 8.489262371615313, "grad_norm": 0.47975942850255965, "learning_rate": 5.874567641273766e-07, "loss": 0.0038, "step": 9092 }, { "epoch": 8.490196078431373, "grad_norm": 0.9172638050796541, "learning_rate": 5.867458189327397e-07, "loss": 0.0122, "step": 9093 }, { "epoch": 8.491129785247432, "grad_norm": 1.3984341236329656, "learning_rate": 5.860352773786632e-07, "loss": 0.037, "step": 9094 }, { "epoch": 8.492063492063492, "grad_norm": 0.7864332394363706, "learning_rate": 5.853251395301357e-07, "loss": 0.0207, "step": 9095 }, { "epoch": 8.492997198879552, "grad_norm": 2.21167522633416, "learning_rate": 5.846154054521047e-07, "loss": 0.0706, "step": 9096 }, { "epoch": 8.493930905695612, "grad_norm": 0.9515358584561835, "learning_rate": 5.83906075209486e-07, "loss": 0.0141, "step": 9097 }, { "epoch": 8.494864612511671, "grad_norm": 1.9139630534900465, "learning_rate": 5.831971488671545e-07, "loss": 0.0412, "step": 9098 }, { "epoch": 8.495798319327731, "grad_norm": 1.1828497216457419, "learning_rate": 5.824886264899493e-07, "loss": 0.0159, "step": 9099 }, { "epoch": 8.49673202614379, "grad_norm": 3.91273656904348, "learning_rate": 5.817805081426714e-07, "loss": 0.0636, "step": 9100 }, { "epoch": 8.49766573295985, "grad_norm": 1.16310988415199, "learning_rate": 5.810727938900879e-07, "loss": 0.0262, "step": 9101 }, { "epoch": 8.49859943977591, "grad_norm": 1.4854511698030388, "learning_rate": 5.803654837969263e-07, "loss": 0.0343, "step": 9102 }, { "epoch": 8.49953314659197, "grad_norm": 0.7015137602746547, "learning_rate": 5.796585779278779e-07, "loss": 0.0168, "step": 9103 }, { "epoch": 8.50046685340803, "grad_norm": 1.3341931205428317, "learning_rate": 5.789520763475958e-07, "loss": 0.0264, "step": 9104 }, { "epoch": 8.50140056022409, "grad_norm": 1.2510114065531517, "learning_rate": 5.782459791206996e-07, "loss": 0.0254, "step": 9105 }, { "epoch": 8.50233426704015, "grad_norm": 0.5447777525221615, "learning_rate": 5.775402863117679e-07, "loss": 0.014, "step": 9106 }, { "epoch": 8.50326797385621, "grad_norm": 3.160874465461742, "learning_rate": 5.768349979853449e-07, "loss": 0.0706, "step": 9107 }, { "epoch": 8.504201680672269, "grad_norm": 0.8005348551324613, "learning_rate": 5.761301142059366e-07, "loss": 0.0161, "step": 9108 }, { "epoch": 8.505135387488329, "grad_norm": 1.9194257870859357, "learning_rate": 5.754256350380116e-07, "loss": 0.0384, "step": 9109 }, { "epoch": 8.506069094304388, "grad_norm": 1.0748832460285922, "learning_rate": 5.747215605460044e-07, "loss": 0.0189, "step": 9110 }, { "epoch": 8.507002801120448, "grad_norm": 3.5076898308555067, "learning_rate": 5.740178907943083e-07, "loss": 0.115, "step": 9111 }, { "epoch": 8.507936507936508, "grad_norm": 3.4704218109299023, "learning_rate": 5.733146258472827e-07, "loss": 0.0191, "step": 9112 }, { "epoch": 8.508870214752568, "grad_norm": 1.3881955353417865, "learning_rate": 5.726117657692476e-07, "loss": 0.0404, "step": 9113 }, { "epoch": 8.509803921568627, "grad_norm": 0.7562030304789675, "learning_rate": 5.719093106244894e-07, "loss": 0.0121, "step": 9114 }, { "epoch": 8.510737628384687, "grad_norm": 0.7701033945455016, "learning_rate": 5.712072604772539e-07, "loss": 0.0072, "step": 9115 }, { "epoch": 8.511671335200747, "grad_norm": 2.6985849510041557, "learning_rate": 5.705056153917521e-07, "loss": 0.0517, "step": 9116 }, { "epoch": 8.512605042016807, "grad_norm": 1.7946045082217819, "learning_rate": 5.698043754321553e-07, "loss": 0.0405, "step": 9117 }, { "epoch": 8.513538748832866, "grad_norm": 3.28630210351986, "learning_rate": 5.691035406626017e-07, "loss": 0.0678, "step": 9118 }, { "epoch": 8.514472455648926, "grad_norm": 1.3626760937282931, "learning_rate": 5.684031111471893e-07, "loss": 0.0177, "step": 9119 }, { "epoch": 8.515406162464986, "grad_norm": 0.38277984400576703, "learning_rate": 5.677030869499811e-07, "loss": 0.0035, "step": 9120 }, { "epoch": 8.516339869281046, "grad_norm": 2.2093001646528285, "learning_rate": 5.670034681349995e-07, "loss": 0.0469, "step": 9121 }, { "epoch": 8.517273576097105, "grad_norm": 4.595300357454335, "learning_rate": 5.663042547662356e-07, "loss": 0.1834, "step": 9122 }, { "epoch": 8.518207282913165, "grad_norm": 1.472102038843432, "learning_rate": 5.656054469076383e-07, "loss": 0.0247, "step": 9123 }, { "epoch": 8.519140989729225, "grad_norm": 0.29164066478207495, "learning_rate": 5.649070446231219e-07, "loss": 0.0053, "step": 9124 }, { "epoch": 8.520074696545285, "grad_norm": 0.9902344158112627, "learning_rate": 5.642090479765622e-07, "loss": 0.0163, "step": 9125 }, { "epoch": 8.521008403361344, "grad_norm": 0.9002202444387994, "learning_rate": 5.635114570317979e-07, "loss": 0.031, "step": 9126 }, { "epoch": 8.521942110177404, "grad_norm": 2.1790057050995535, "learning_rate": 5.628142718526342e-07, "loss": 0.0323, "step": 9127 }, { "epoch": 8.522875816993464, "grad_norm": 1.3483185666452677, "learning_rate": 5.621174925028339e-07, "loss": 0.0342, "step": 9128 }, { "epoch": 8.523809523809524, "grad_norm": 1.440485693892482, "learning_rate": 5.614211190461267e-07, "loss": 0.0314, "step": 9129 }, { "epoch": 8.524743230625583, "grad_norm": 4.961261119452716, "learning_rate": 5.607251515462009e-07, "loss": 0.0868, "step": 9130 }, { "epoch": 8.525676937441643, "grad_norm": 2.665936887566504, "learning_rate": 5.600295900667141e-07, "loss": 0.0544, "step": 9131 }, { "epoch": 8.526610644257703, "grad_norm": 1.5698398878883775, "learning_rate": 5.593344346712803e-07, "loss": 0.0219, "step": 9132 }, { "epoch": 8.527544351073763, "grad_norm": 2.675765035988507, "learning_rate": 5.586396854234805e-07, "loss": 0.0909, "step": 9133 }, { "epoch": 8.528478057889822, "grad_norm": 2.5396780828568994, "learning_rate": 5.579453423868559e-07, "loss": 0.0718, "step": 9134 }, { "epoch": 8.529411764705882, "grad_norm": 1.6940217193271745, "learning_rate": 5.572514056249129e-07, "loss": 0.0464, "step": 9135 }, { "epoch": 8.530345471521942, "grad_norm": 4.445429556903039, "learning_rate": 5.565578752011197e-07, "loss": 0.1081, "step": 9136 }, { "epoch": 8.531279178338002, "grad_norm": 0.5615490453855544, "learning_rate": 5.558647511789067e-07, "loss": 0.0056, "step": 9137 }, { "epoch": 8.532212885154062, "grad_norm": 1.9439555872906804, "learning_rate": 5.551720336216665e-07, "loss": 0.0417, "step": 9138 }, { "epoch": 8.533146591970121, "grad_norm": 2.7867518332135215, "learning_rate": 5.544797225927584e-07, "loss": 0.0463, "step": 9139 }, { "epoch": 8.534080298786181, "grad_norm": 1.2205327876931698, "learning_rate": 5.537878181555006e-07, "loss": 0.0259, "step": 9140 }, { "epoch": 8.53501400560224, "grad_norm": 0.8445925534190019, "learning_rate": 5.530963203731754e-07, "loss": 0.0215, "step": 9141 }, { "epoch": 8.5359477124183, "grad_norm": 1.3854254338380556, "learning_rate": 5.524052293090265e-07, "loss": 0.0294, "step": 9142 }, { "epoch": 8.53688141923436, "grad_norm": 1.5505982335579358, "learning_rate": 5.517145450262639e-07, "loss": 0.0223, "step": 9143 }, { "epoch": 8.53781512605042, "grad_norm": 5.089146402086226, "learning_rate": 5.510242675880579e-07, "loss": 0.164, "step": 9144 }, { "epoch": 8.53874883286648, "grad_norm": 1.7526410030354154, "learning_rate": 5.503343970575398e-07, "loss": 0.0383, "step": 9145 }, { "epoch": 8.53968253968254, "grad_norm": 2.176633770744928, "learning_rate": 5.496449334978088e-07, "loss": 0.0673, "step": 9146 }, { "epoch": 8.5406162464986, "grad_norm": 0.6305288267231807, "learning_rate": 5.489558769719222e-07, "loss": 0.0131, "step": 9147 }, { "epoch": 8.541549953314659, "grad_norm": 2.2994508449653894, "learning_rate": 5.482672275429019e-07, "loss": 0.0333, "step": 9148 }, { "epoch": 8.542483660130719, "grad_norm": 0.8936925749384331, "learning_rate": 5.475789852737328e-07, "loss": 0.0206, "step": 9149 }, { "epoch": 8.543417366946779, "grad_norm": 1.9753578430401537, "learning_rate": 5.468911502273627e-07, "loss": 0.0721, "step": 9150 }, { "epoch": 8.544351073762838, "grad_norm": 1.4699503742470241, "learning_rate": 5.462037224667011e-07, "loss": 0.0274, "step": 9151 }, { "epoch": 8.545284780578898, "grad_norm": 5.181513740857906, "learning_rate": 5.455167020546198e-07, "loss": 0.1586, "step": 9152 }, { "epoch": 8.546218487394958, "grad_norm": 2.6914133647476266, "learning_rate": 5.448300890539559e-07, "loss": 0.1089, "step": 9153 }, { "epoch": 8.547152194211018, "grad_norm": 4.4002810273193855, "learning_rate": 5.441438835275076e-07, "loss": 0.1487, "step": 9154 }, { "epoch": 8.548085901027077, "grad_norm": 1.1759611450296004, "learning_rate": 5.434580855380344e-07, "loss": 0.0369, "step": 9155 }, { "epoch": 8.549019607843137, "grad_norm": 1.502620852011265, "learning_rate": 5.427726951482626e-07, "loss": 0.0339, "step": 9156 }, { "epoch": 8.549953314659197, "grad_norm": 0.2745674464069327, "learning_rate": 5.420877124208768e-07, "loss": 0.0013, "step": 9157 }, { "epoch": 8.550887021475257, "grad_norm": 7.711254363235238, "learning_rate": 5.414031374185264e-07, "loss": 0.1106, "step": 9158 }, { "epoch": 8.551820728291316, "grad_norm": 0.18335291019813935, "learning_rate": 5.40718970203824e-07, "loss": 0.0007, "step": 9159 }, { "epoch": 8.552754435107376, "grad_norm": 0.4310711764632676, "learning_rate": 5.400352108393425e-07, "loss": 0.004, "step": 9160 }, { "epoch": 8.553688141923436, "grad_norm": 1.2539255020532996, "learning_rate": 5.393518593876213e-07, "loss": 0.0304, "step": 9161 }, { "epoch": 8.554621848739496, "grad_norm": 0.20285159470960398, "learning_rate": 5.386689159111603e-07, "loss": 0.0043, "step": 9162 }, { "epoch": 8.555555555555555, "grad_norm": 0.8107696661191743, "learning_rate": 5.379863804724206e-07, "loss": 0.0145, "step": 9163 }, { "epoch": 8.556489262371615, "grad_norm": 0.01527081561007392, "learning_rate": 5.373042531338274e-07, "loss": 0.0001, "step": 9164 }, { "epoch": 8.557422969187675, "grad_norm": 0.9416704295459596, "learning_rate": 5.366225339577713e-07, "loss": 0.0135, "step": 9165 }, { "epoch": 8.558356676003735, "grad_norm": 7.744383799438186, "learning_rate": 5.359412230066008e-07, "loss": 0.1497, "step": 9166 }, { "epoch": 8.559290382819794, "grad_norm": 2.463563898802063, "learning_rate": 5.352603203426304e-07, "loss": 0.126, "step": 9167 }, { "epoch": 8.560224089635854, "grad_norm": 3.6353665166773466, "learning_rate": 5.345798260281343e-07, "loss": 0.0791, "step": 9168 }, { "epoch": 8.561157796451914, "grad_norm": 2.767590996804455, "learning_rate": 5.338997401253538e-07, "loss": 0.0413, "step": 9169 }, { "epoch": 8.562091503267974, "grad_norm": 0.5103563968819577, "learning_rate": 5.332200626964884e-07, "loss": 0.0091, "step": 9170 }, { "epoch": 8.563025210084033, "grad_norm": 0.4708897504381454, "learning_rate": 5.32540793803703e-07, "loss": 0.009, "step": 9171 }, { "epoch": 8.563958916900093, "grad_norm": 2.6073340810306704, "learning_rate": 5.318619335091229e-07, "loss": 0.0808, "step": 9172 }, { "epoch": 8.564892623716153, "grad_norm": 0.7877398209993983, "learning_rate": 5.31183481874839e-07, "loss": 0.017, "step": 9173 }, { "epoch": 8.565826330532213, "grad_norm": 6.432964723131199, "learning_rate": 5.305054389629022e-07, "loss": 0.2331, "step": 9174 }, { "epoch": 8.566760037348272, "grad_norm": 3.47991229583918, "learning_rate": 5.298278048353272e-07, "loss": 0.0877, "step": 9175 }, { "epoch": 8.567693744164332, "grad_norm": 0.8536317657817162, "learning_rate": 5.291505795540913e-07, "loss": 0.0235, "step": 9176 }, { "epoch": 8.568627450980392, "grad_norm": 1.2013332272108745, "learning_rate": 5.284737631811326e-07, "loss": 0.018, "step": 9177 }, { "epoch": 8.569561157796452, "grad_norm": 6.9505728851918835, "learning_rate": 5.277973557783561e-07, "loss": 0.243, "step": 9178 }, { "epoch": 8.570494864612511, "grad_norm": 0.6918671728227587, "learning_rate": 5.27121357407625e-07, "loss": 0.0077, "step": 9179 }, { "epoch": 8.571428571428571, "grad_norm": 2.136630821715075, "learning_rate": 5.264457681307672e-07, "loss": 0.0674, "step": 9180 }, { "epoch": 8.572362278244631, "grad_norm": 3.375164944698402, "learning_rate": 5.257705880095715e-07, "loss": 0.0686, "step": 9181 }, { "epoch": 8.57329598506069, "grad_norm": 2.7778209164079755, "learning_rate": 5.25095817105793e-07, "loss": 0.0799, "step": 9182 }, { "epoch": 8.57422969187675, "grad_norm": 0.7710481004708911, "learning_rate": 5.24421455481145e-07, "loss": 0.014, "step": 9183 }, { "epoch": 8.57516339869281, "grad_norm": 1.2221797718658203, "learning_rate": 5.237475031973061e-07, "loss": 0.0217, "step": 9184 }, { "epoch": 8.57609710550887, "grad_norm": 4.4150410016935515, "learning_rate": 5.230739603159151e-07, "loss": 0.1505, "step": 9185 }, { "epoch": 8.57703081232493, "grad_norm": 2.705670462431775, "learning_rate": 5.224008268985781e-07, "loss": 0.0831, "step": 9186 }, { "epoch": 8.57796451914099, "grad_norm": 1.4029355789111078, "learning_rate": 5.217281030068577e-07, "loss": 0.0271, "step": 9187 }, { "epoch": 8.57889822595705, "grad_norm": 6.928296992413982, "learning_rate": 5.210557887022833e-07, "loss": 0.1825, "step": 9188 }, { "epoch": 8.579831932773109, "grad_norm": 1.883799080639457, "learning_rate": 5.203838840463437e-07, "loss": 0.0236, "step": 9189 }, { "epoch": 8.580765639589169, "grad_norm": 0.5405368865544802, "learning_rate": 5.19712389100494e-07, "loss": 0.0111, "step": 9190 }, { "epoch": 8.581699346405228, "grad_norm": 1.4661143182582081, "learning_rate": 5.190413039261494e-07, "loss": 0.0592, "step": 9191 }, { "epoch": 8.582633053221288, "grad_norm": 1.147521346276805, "learning_rate": 5.183706285846873e-07, "loss": 0.0105, "step": 9192 }, { "epoch": 8.583566760037348, "grad_norm": 2.330808018826425, "learning_rate": 5.177003631374483e-07, "loss": 0.0301, "step": 9193 }, { "epoch": 8.584500466853408, "grad_norm": 0.3987476063956873, "learning_rate": 5.170305076457355e-07, "loss": 0.0089, "step": 9194 }, { "epoch": 8.585434173669467, "grad_norm": 0.8871216023967086, "learning_rate": 5.16361062170816e-07, "loss": 0.0108, "step": 9195 }, { "epoch": 8.586367880485527, "grad_norm": 5.3660562872269395, "learning_rate": 5.156920267739163e-07, "loss": 0.1651, "step": 9196 }, { "epoch": 8.587301587301587, "grad_norm": 4.911696891358323, "learning_rate": 5.150234015162275e-07, "loss": 0.0699, "step": 9197 }, { "epoch": 8.588235294117647, "grad_norm": 9.37535205019233, "learning_rate": 5.143551864589019e-07, "loss": 0.2545, "step": 9198 }, { "epoch": 8.589169000933706, "grad_norm": 0.31429999103870454, "learning_rate": 5.136873816630572e-07, "loss": 0.0042, "step": 9199 }, { "epoch": 8.590102707749766, "grad_norm": 5.371101087531985, "learning_rate": 5.130199871897701e-07, "loss": 0.0964, "step": 9200 }, { "epoch": 8.591036414565826, "grad_norm": 1.7804728513160692, "learning_rate": 5.123530031000818e-07, "loss": 0.0353, "step": 9201 }, { "epoch": 8.591970121381886, "grad_norm": 0.13352975494689595, "learning_rate": 5.116864294549934e-07, "loss": 0.0012, "step": 9202 }, { "epoch": 8.592903828197946, "grad_norm": 5.268076950418349, "learning_rate": 5.110202663154729e-07, "loss": 0.1518, "step": 9203 }, { "epoch": 8.593837535014005, "grad_norm": 2.0795717969019587, "learning_rate": 5.10354513742447e-07, "loss": 0.0208, "step": 9204 }, { "epoch": 8.594771241830065, "grad_norm": 2.5787521174275927, "learning_rate": 5.096891717968066e-07, "loss": 0.0475, "step": 9205 }, { "epoch": 8.595704948646125, "grad_norm": 2.2822124378781736, "learning_rate": 5.090242405394041e-07, "loss": 0.0616, "step": 9206 }, { "epoch": 8.596638655462185, "grad_norm": 1.2650433138019626, "learning_rate": 5.083597200310536e-07, "loss": 0.0362, "step": 9207 }, { "epoch": 8.597572362278244, "grad_norm": 0.8872078447466786, "learning_rate": 5.076956103325354e-07, "loss": 0.0167, "step": 9208 }, { "epoch": 8.598506069094304, "grad_norm": 2.7253953903574275, "learning_rate": 5.07031911504588e-07, "loss": 0.0919, "step": 9209 }, { "epoch": 8.599439775910364, "grad_norm": 2.639612839118171, "learning_rate": 5.063686236079141e-07, "loss": 0.0889, "step": 9210 }, { "epoch": 8.600373482726424, "grad_norm": 1.795224250182444, "learning_rate": 5.057057467031778e-07, "loss": 0.0414, "step": 9211 }, { "epoch": 8.601307189542483, "grad_norm": 1.5721326858129925, "learning_rate": 5.050432808510086e-07, "loss": 0.0129, "step": 9212 }, { "epoch": 8.602240896358543, "grad_norm": 1.6972061273618677, "learning_rate": 5.043812261119951e-07, "loss": 0.0665, "step": 9213 }, { "epoch": 8.603174603174603, "grad_norm": 2.388412782729037, "learning_rate": 5.037195825466895e-07, "loss": 0.0721, "step": 9214 }, { "epoch": 8.604108309990663, "grad_norm": 3.4162168828761468, "learning_rate": 5.030583502156056e-07, "loss": 0.1003, "step": 9215 }, { "epoch": 8.605042016806722, "grad_norm": 1.2869594950170016, "learning_rate": 5.023975291792216e-07, "loss": 0.0308, "step": 9216 }, { "epoch": 8.605975723622782, "grad_norm": 0.4853744215639531, "learning_rate": 5.01737119497977e-07, "loss": 0.0042, "step": 9217 }, { "epoch": 8.606909430438842, "grad_norm": 5.022049119533037, "learning_rate": 5.010771212322712e-07, "loss": 0.1682, "step": 9218 }, { "epoch": 8.607843137254902, "grad_norm": 1.4561664438948376, "learning_rate": 5.004175344424717e-07, "loss": 0.0378, "step": 9219 }, { "epoch": 8.608776844070961, "grad_norm": 3.2703933513858665, "learning_rate": 4.997583591889027e-07, "loss": 0.0776, "step": 9220 }, { "epoch": 8.609710550887021, "grad_norm": 2.4407184544567504, "learning_rate": 4.990995955318539e-07, "loss": 0.0666, "step": 9221 }, { "epoch": 8.61064425770308, "grad_norm": 3.714106094133137, "learning_rate": 4.98441243531575e-07, "loss": 0.1013, "step": 9222 }, { "epoch": 8.61157796451914, "grad_norm": 1.241040222737427, "learning_rate": 4.977833032482815e-07, "loss": 0.0234, "step": 9223 }, { "epoch": 8.6125116713352, "grad_norm": 1.301908252606611, "learning_rate": 4.971257747421487e-07, "loss": 0.0291, "step": 9224 }, { "epoch": 8.61344537815126, "grad_norm": 7.452174383809074, "learning_rate": 4.96468658073313e-07, "loss": 0.2205, "step": 9225 }, { "epoch": 8.61437908496732, "grad_norm": 2.3196509308629505, "learning_rate": 4.958119533018779e-07, "loss": 0.0547, "step": 9226 }, { "epoch": 8.61531279178338, "grad_norm": 1.184387643313323, "learning_rate": 4.951556604879049e-07, "loss": 0.0433, "step": 9227 }, { "epoch": 8.61624649859944, "grad_norm": 0.7223238871633014, "learning_rate": 4.944997796914175e-07, "loss": 0.0184, "step": 9228 }, { "epoch": 8.6171802054155, "grad_norm": 2.086463718389444, "learning_rate": 4.938443109724062e-07, "loss": 0.0714, "step": 9229 }, { "epoch": 8.618113912231559, "grad_norm": 3.3275212489196133, "learning_rate": 4.931892543908195e-07, "loss": 0.0914, "step": 9230 }, { "epoch": 8.619047619047619, "grad_norm": 2.389270486313645, "learning_rate": 4.925346100065692e-07, "loss": 0.0465, "step": 9231 }, { "epoch": 8.619981325863678, "grad_norm": 2.5869099435709844, "learning_rate": 4.918803778795289e-07, "loss": 0.0746, "step": 9232 }, { "epoch": 8.620915032679738, "grad_norm": 1.2606205468133138, "learning_rate": 4.912265580695374e-07, "loss": 0.0176, "step": 9233 }, { "epoch": 8.621848739495798, "grad_norm": 2.4413726751099714, "learning_rate": 4.905731506363931e-07, "loss": 0.08, "step": 9234 }, { "epoch": 8.622782446311858, "grad_norm": 0.11072738641811107, "learning_rate": 4.899201556398564e-07, "loss": 0.0009, "step": 9235 }, { "epoch": 8.623716153127917, "grad_norm": 0.5628842889518403, "learning_rate": 4.892675731396507e-07, "loss": 0.008, "step": 9236 }, { "epoch": 8.624649859943977, "grad_norm": 1.5891976437910076, "learning_rate": 4.886154031954632e-07, "loss": 0.0367, "step": 9237 }, { "epoch": 8.625583566760037, "grad_norm": 5.011049728639399, "learning_rate": 4.879636458669406e-07, "loss": 0.1591, "step": 9238 }, { "epoch": 8.626517273576097, "grad_norm": 3.219698221142072, "learning_rate": 4.873123012136949e-07, "loss": 0.0848, "step": 9239 }, { "epoch": 8.627450980392156, "grad_norm": 0.621077674843109, "learning_rate": 4.866613692952971e-07, "loss": 0.0166, "step": 9240 }, { "epoch": 8.628384687208216, "grad_norm": 2.4714239242091423, "learning_rate": 4.860108501712824e-07, "loss": 0.0537, "step": 9241 }, { "epoch": 8.629318394024276, "grad_norm": 2.6937408257364943, "learning_rate": 4.853607439011488e-07, "loss": 0.0984, "step": 9242 }, { "epoch": 8.630252100840336, "grad_norm": 2.7346347789813565, "learning_rate": 4.847110505443548e-07, "loss": 0.0579, "step": 9243 }, { "epoch": 8.631185807656395, "grad_norm": 2.4491955465810187, "learning_rate": 4.840617701603223e-07, "loss": 0.0715, "step": 9244 }, { "epoch": 8.632119514472455, "grad_norm": 0.44271635352216065, "learning_rate": 4.834129028084344e-07, "loss": 0.0066, "step": 9245 }, { "epoch": 8.633053221288515, "grad_norm": 4.168245172146899, "learning_rate": 4.827644485480393e-07, "loss": 0.0706, "step": 9246 }, { "epoch": 8.633986928104575, "grad_norm": 1.9420966447013328, "learning_rate": 4.821164074384432e-07, "loss": 0.0158, "step": 9247 }, { "epoch": 8.634920634920634, "grad_norm": 2.480768759029548, "learning_rate": 4.814687795389172e-07, "loss": 0.0998, "step": 9248 }, { "epoch": 8.635854341736694, "grad_norm": 3.3612581576953815, "learning_rate": 4.808215649086928e-07, "loss": 0.0808, "step": 9249 }, { "epoch": 8.636788048552754, "grad_norm": 3.1434442314457445, "learning_rate": 4.801747636069676e-07, "loss": 0.0509, "step": 9250 }, { "epoch": 8.637721755368814, "grad_norm": 4.346722288366961, "learning_rate": 4.795283756928965e-07, "loss": 0.1208, "step": 9251 }, { "epoch": 8.638655462184873, "grad_norm": 0.7858144581081293, "learning_rate": 4.788824012255999e-07, "loss": 0.0056, "step": 9252 }, { "epoch": 8.639589169000933, "grad_norm": 0.5160678387346229, "learning_rate": 4.782368402641574e-07, "loss": 0.0104, "step": 9253 }, { "epoch": 8.640522875816993, "grad_norm": 3.401856742805918, "learning_rate": 4.775916928676156e-07, "loss": 0.0682, "step": 9254 }, { "epoch": 8.641456582633053, "grad_norm": 1.0290162930858235, "learning_rate": 4.769469590949783e-07, "loss": 0.0274, "step": 9255 }, { "epoch": 8.642390289449112, "grad_norm": 1.771304653765163, "learning_rate": 4.763026390052139e-07, "loss": 0.025, "step": 9256 }, { "epoch": 8.643323996265172, "grad_norm": 1.7092193425030158, "learning_rate": 4.756587326572526e-07, "loss": 0.0401, "step": 9257 }, { "epoch": 8.644257703081232, "grad_norm": 1.64388562447966, "learning_rate": 4.7501524010998555e-07, "loss": 0.029, "step": 9258 }, { "epoch": 8.645191409897292, "grad_norm": 2.8572117865296214, "learning_rate": 4.743721614222696e-07, "loss": 0.0452, "step": 9259 }, { "epoch": 8.646125116713351, "grad_norm": 0.7408296768636281, "learning_rate": 4.737294966529199e-07, "loss": 0.0072, "step": 9260 }, { "epoch": 8.647058823529411, "grad_norm": 1.3632818740696715, "learning_rate": 4.73087245860715e-07, "loss": 0.025, "step": 9261 }, { "epoch": 8.647992530345471, "grad_norm": 2.3119894463163164, "learning_rate": 4.7244540910439564e-07, "loss": 0.0288, "step": 9262 }, { "epoch": 8.64892623716153, "grad_norm": 1.6551968753460322, "learning_rate": 4.7180398644266655e-07, "loss": 0.0518, "step": 9263 }, { "epoch": 8.64985994397759, "grad_norm": 1.3751660108714598, "learning_rate": 4.711629779341914e-07, "loss": 0.0221, "step": 9264 }, { "epoch": 8.65079365079365, "grad_norm": 0.555321492232898, "learning_rate": 4.705223836375977e-07, "loss": 0.0065, "step": 9265 }, { "epoch": 8.65172735760971, "grad_norm": 4.818133072057763, "learning_rate": 4.6988220361147363e-07, "loss": 0.0374, "step": 9266 }, { "epoch": 8.65266106442577, "grad_norm": 2.348276486605369, "learning_rate": 4.6924243791437343e-07, "loss": 0.0474, "step": 9267 }, { "epoch": 8.65359477124183, "grad_norm": 3.6614083195081646, "learning_rate": 4.6860308660480913e-07, "loss": 0.0473, "step": 9268 }, { "epoch": 8.65452847805789, "grad_norm": 1.3311761463756862, "learning_rate": 4.6796414974125625e-07, "loss": 0.0204, "step": 9269 }, { "epoch": 8.655462184873949, "grad_norm": 2.9918117217892917, "learning_rate": 4.6732562738215195e-07, "loss": 0.0837, "step": 9270 }, { "epoch": 8.656395891690009, "grad_norm": 2.6667960339112873, "learning_rate": 4.666875195858983e-07, "loss": 0.0953, "step": 9271 }, { "epoch": 8.657329598506069, "grad_norm": 6.791590823400647, "learning_rate": 4.6604982641085595e-07, "loss": 0.2361, "step": 9272 }, { "epoch": 8.658263305322128, "grad_norm": 1.1481926386664936, "learning_rate": 4.6541254791534876e-07, "loss": 0.0244, "step": 9273 }, { "epoch": 8.659197012138188, "grad_norm": 2.2491325135608022, "learning_rate": 4.647756841576628e-07, "loss": 0.0486, "step": 9274 }, { "epoch": 8.660130718954248, "grad_norm": 1.0220214202952917, "learning_rate": 4.641392351960455e-07, "loss": 0.005, "step": 9275 }, { "epoch": 8.661064425770308, "grad_norm": 1.9425422127867884, "learning_rate": 4.635032010887097e-07, "loss": 0.0419, "step": 9276 }, { "epoch": 8.661998132586367, "grad_norm": 2.347494825247006, "learning_rate": 4.628675818938255e-07, "loss": 0.0543, "step": 9277 }, { "epoch": 8.662931839402427, "grad_norm": 2.4134374341649374, "learning_rate": 4.622323776695281e-07, "loss": 0.0675, "step": 9278 }, { "epoch": 8.663865546218487, "grad_norm": 0.5944592891612198, "learning_rate": 4.6159758847391265e-07, "loss": 0.0055, "step": 9279 }, { "epoch": 8.664799253034547, "grad_norm": 1.6608525150280777, "learning_rate": 4.6096321436504e-07, "loss": 0.0366, "step": 9280 }, { "epoch": 8.665732959850606, "grad_norm": 0.9838810213437207, "learning_rate": 4.6032925540092864e-07, "loss": 0.0307, "step": 9281 }, { "epoch": 8.666666666666666, "grad_norm": 1.0679183839985174, "learning_rate": 4.5969571163956173e-07, "loss": 0.0076, "step": 9282 }, { "epoch": 8.667600373482726, "grad_norm": 0.09342162609371506, "learning_rate": 4.590625831388834e-07, "loss": 0.0007, "step": 9283 }, { "epoch": 8.668534080298786, "grad_norm": 0.48442474537118263, "learning_rate": 4.584298699568013e-07, "loss": 0.0086, "step": 9284 }, { "epoch": 8.669467787114845, "grad_norm": 1.5822929466909472, "learning_rate": 4.577975721511829e-07, "loss": 0.0159, "step": 9285 }, { "epoch": 8.670401493930905, "grad_norm": 3.940223398790975, "learning_rate": 4.571656897798593e-07, "loss": 0.1277, "step": 9286 }, { "epoch": 8.671335200746965, "grad_norm": 1.8696354930079577, "learning_rate": 4.56534222900622e-07, "loss": 0.0498, "step": 9287 }, { "epoch": 8.672268907563025, "grad_norm": 4.213814220524977, "learning_rate": 4.5590317157122755e-07, "loss": 0.0921, "step": 9288 }, { "epoch": 8.673202614379084, "grad_norm": 2.382366983163663, "learning_rate": 4.5527253584939147e-07, "loss": 0.0548, "step": 9289 }, { "epoch": 8.674136321195144, "grad_norm": 0.5736639352839247, "learning_rate": 4.5464231579279206e-07, "loss": 0.0034, "step": 9290 }, { "epoch": 8.675070028011204, "grad_norm": 1.4963823605423359, "learning_rate": 4.5401251145906933e-07, "loss": 0.0324, "step": 9291 }, { "epoch": 8.676003734827264, "grad_norm": 0.6364116731750684, "learning_rate": 4.533831229058278e-07, "loss": 0.0134, "step": 9292 }, { "epoch": 8.676937441643323, "grad_norm": 3.24044354361959, "learning_rate": 4.5275415019063016e-07, "loss": 0.0803, "step": 9293 }, { "epoch": 8.677871148459383, "grad_norm": 1.164057853425172, "learning_rate": 4.5212559337100394e-07, "loss": 0.0235, "step": 9294 }, { "epoch": 8.678804855275443, "grad_norm": 3.3138680124689497, "learning_rate": 4.5149745250443634e-07, "loss": 0.0692, "step": 9295 }, { "epoch": 8.679738562091503, "grad_norm": 1.4222096807937898, "learning_rate": 4.5086972764837924e-07, "loss": 0.0471, "step": 9296 }, { "epoch": 8.680672268907562, "grad_norm": 1.75217016979714, "learning_rate": 4.502424188602439e-07, "loss": 0.0492, "step": 9297 }, { "epoch": 8.681605975723622, "grad_norm": 3.1687754527743355, "learning_rate": 4.4961552619740446e-07, "loss": 0.061, "step": 9298 }, { "epoch": 8.682539682539682, "grad_norm": 2.981486803697546, "learning_rate": 4.4898904971719835e-07, "loss": 0.0664, "step": 9299 }, { "epoch": 8.683473389355742, "grad_norm": 1.0039176685211255, "learning_rate": 4.4836298947692313e-07, "loss": 0.014, "step": 9300 }, { "epoch": 8.684407096171801, "grad_norm": 2.6066116726957924, "learning_rate": 4.477373455338391e-07, "loss": 0.0552, "step": 9301 }, { "epoch": 8.685340802987861, "grad_norm": 1.0973709576539705, "learning_rate": 4.471121179451665e-07, "loss": 0.0259, "step": 9302 }, { "epoch": 8.686274509803921, "grad_norm": 0.9368128236817898, "learning_rate": 4.464873067680914e-07, "loss": 0.0156, "step": 9303 }, { "epoch": 8.68720821661998, "grad_norm": 0.5207840554073654, "learning_rate": 4.4586291205975905e-07, "loss": 0.0131, "step": 9304 }, { "epoch": 8.68814192343604, "grad_norm": 1.1635247631924504, "learning_rate": 4.4523893387727666e-07, "loss": 0.0115, "step": 9305 }, { "epoch": 8.6890756302521, "grad_norm": 2.5211900425145863, "learning_rate": 4.446153722777147e-07, "loss": 0.0888, "step": 9306 }, { "epoch": 8.69000933706816, "grad_norm": 0.45032111395183605, "learning_rate": 4.439922273181046e-07, "loss": 0.0049, "step": 9307 }, { "epoch": 8.69094304388422, "grad_norm": 0.46762904726120313, "learning_rate": 4.433694990554394e-07, "loss": 0.0023, "step": 9308 }, { "epoch": 8.69187675070028, "grad_norm": 0.4028214183586181, "learning_rate": 4.4274718754667377e-07, "loss": 0.0033, "step": 9309 }, { "epoch": 8.69281045751634, "grad_norm": 1.0806649819079233, "learning_rate": 4.421252928487263e-07, "loss": 0.0295, "step": 9310 }, { "epoch": 8.693744164332399, "grad_norm": 0.6268318500784575, "learning_rate": 4.4150381501847585e-07, "loss": 0.017, "step": 9311 }, { "epoch": 8.694677871148459, "grad_norm": 1.2807915358958182, "learning_rate": 4.4088275411276306e-07, "loss": 0.0147, "step": 9312 }, { "epoch": 8.695611577964518, "grad_norm": 3.189985247456388, "learning_rate": 4.402621101883897e-07, "loss": 0.0627, "step": 9313 }, { "epoch": 8.696545284780578, "grad_norm": 2.2130859807378447, "learning_rate": 4.3964188330212266e-07, "loss": 0.0477, "step": 9314 }, { "epoch": 8.697478991596638, "grad_norm": 3.969968972434335, "learning_rate": 4.3902207351068704e-07, "loss": 0.1298, "step": 9315 }, { "epoch": 8.698412698412698, "grad_norm": 0.36533180452308905, "learning_rate": 4.38402680870772e-07, "loss": 0.0039, "step": 9316 }, { "epoch": 8.699346405228757, "grad_norm": 2.1365923716540136, "learning_rate": 4.377837054390266e-07, "loss": 0.0394, "step": 9317 }, { "epoch": 8.700280112044817, "grad_norm": 1.6622354765404097, "learning_rate": 4.371651472720645e-07, "loss": 0.0478, "step": 9318 }, { "epoch": 8.701213818860877, "grad_norm": 3.6559030623423516, "learning_rate": 4.3654700642645877e-07, "loss": 0.0361, "step": 9319 }, { "epoch": 8.702147525676937, "grad_norm": 2.608903630997021, "learning_rate": 4.3592928295874527e-07, "loss": 0.0917, "step": 9320 }, { "epoch": 8.703081232492996, "grad_norm": 2.9717182555730135, "learning_rate": 4.3531197692542047e-07, "loss": 0.0853, "step": 9321 }, { "epoch": 8.704014939309056, "grad_norm": 4.885103405374628, "learning_rate": 4.3469508838294647e-07, "loss": 0.1146, "step": 9322 }, { "epoch": 8.704948646125116, "grad_norm": 1.611404538770104, "learning_rate": 4.340786173877426e-07, "loss": 0.0239, "step": 9323 }, { "epoch": 8.705882352941176, "grad_norm": 0.3394004585977643, "learning_rate": 4.33462563996192e-07, "loss": 0.0036, "step": 9324 }, { "epoch": 8.706816059757235, "grad_norm": 2.9176955697179876, "learning_rate": 4.3284692826464024e-07, "loss": 0.0582, "step": 9325 }, { "epoch": 8.707749766573295, "grad_norm": 0.49797227007124056, "learning_rate": 4.3223171024939224e-07, "loss": 0.008, "step": 9326 }, { "epoch": 8.708683473389355, "grad_norm": 1.8469403854581778, "learning_rate": 4.316169100067191e-07, "loss": 0.0236, "step": 9327 }, { "epoch": 8.709617180205415, "grad_norm": 10.079807730918569, "learning_rate": 4.310025275928492e-07, "loss": 0.2547, "step": 9328 }, { "epoch": 8.710550887021475, "grad_norm": 2.8403061872231645, "learning_rate": 4.3038856306397535e-07, "loss": 0.0629, "step": 9329 }, { "epoch": 8.711484593837534, "grad_norm": 2.5603967212856347, "learning_rate": 4.297750164762499e-07, "loss": 0.0547, "step": 9330 }, { "epoch": 8.712418300653594, "grad_norm": 1.3021447242497257, "learning_rate": 4.291618878857906e-07, "loss": 0.026, "step": 9331 }, { "epoch": 8.713352007469654, "grad_norm": 1.5806409365796736, "learning_rate": 4.285491773486733e-07, "loss": 0.0207, "step": 9332 }, { "epoch": 8.714285714285714, "grad_norm": 0.9657973481027269, "learning_rate": 4.279368849209381e-07, "loss": 0.0237, "step": 9333 }, { "epoch": 8.715219421101773, "grad_norm": 0.03550036362048647, "learning_rate": 4.273250106585841e-07, "loss": 0.0001, "step": 9334 }, { "epoch": 8.716153127917833, "grad_norm": 1.0141859308290688, "learning_rate": 4.2671355461757655e-07, "loss": 0.0107, "step": 9335 }, { "epoch": 8.717086834733893, "grad_norm": 0.4266494526601518, "learning_rate": 4.2610251685383796e-07, "loss": 0.0035, "step": 9336 }, { "epoch": 8.718020541549953, "grad_norm": 4.587513011158741, "learning_rate": 4.254918974232547e-07, "loss": 0.1389, "step": 9337 }, { "epoch": 8.718954248366012, "grad_norm": 0.6090359589554556, "learning_rate": 4.248816963816743e-07, "loss": 0.0078, "step": 9338 }, { "epoch": 8.719887955182072, "grad_norm": 0.7469108795525305, "learning_rate": 4.242719137849077e-07, "loss": 0.0076, "step": 9339 }, { "epoch": 8.720821661998132, "grad_norm": 3.4868044107663962, "learning_rate": 4.2366254968872535e-07, "loss": 0.0468, "step": 9340 }, { "epoch": 8.721755368814192, "grad_norm": 0.5182363767745624, "learning_rate": 4.2305360414886034e-07, "loss": 0.0086, "step": 9341 }, { "epoch": 8.722689075630251, "grad_norm": 1.475359522404973, "learning_rate": 4.2244507722100705e-07, "loss": 0.0317, "step": 9342 }, { "epoch": 8.723622782446311, "grad_norm": 3.4192952639944827, "learning_rate": 4.2183696896082207e-07, "loss": 0.0551, "step": 9343 }, { "epoch": 8.72455648926237, "grad_norm": 3.0234517161256558, "learning_rate": 4.212292794239242e-07, "loss": 0.0663, "step": 9344 }, { "epoch": 8.72549019607843, "grad_norm": 0.5390912746022575, "learning_rate": 4.20622008665893e-07, "loss": 0.0064, "step": 9345 }, { "epoch": 8.72642390289449, "grad_norm": 2.156980450208366, "learning_rate": 4.2001515674227e-07, "loss": 0.0773, "step": 9346 }, { "epoch": 8.72735760971055, "grad_norm": 3.3481918369441983, "learning_rate": 4.1940872370855814e-07, "loss": 0.1199, "step": 9347 }, { "epoch": 8.72829131652661, "grad_norm": 1.7400819987574885, "learning_rate": 4.1880270962022293e-07, "loss": 0.0383, "step": 9348 }, { "epoch": 8.72922502334267, "grad_norm": 4.544920396708011, "learning_rate": 4.1819711453269075e-07, "loss": 0.1175, "step": 9349 }, { "epoch": 8.73015873015873, "grad_norm": 1.1081419466350895, "learning_rate": 4.175919385013499e-07, "loss": 0.0357, "step": 9350 }, { "epoch": 8.731092436974789, "grad_norm": 0.9014884189586093, "learning_rate": 4.169871815815502e-07, "loss": 0.013, "step": 9351 }, { "epoch": 8.732026143790849, "grad_norm": 2.251407706590342, "learning_rate": 4.1638284382860386e-07, "loss": 0.0711, "step": 9352 }, { "epoch": 8.732959850606909, "grad_norm": 2.19705460911192, "learning_rate": 4.1577892529778396e-07, "loss": 0.0408, "step": 9353 }, { "epoch": 8.733893557422968, "grad_norm": 0.9344117319291023, "learning_rate": 4.1517542604432584e-07, "loss": 0.0271, "step": 9354 }, { "epoch": 8.73482726423903, "grad_norm": 6.987077395166072, "learning_rate": 4.145723461234252e-07, "loss": 0.1092, "step": 9355 }, { "epoch": 8.73576097105509, "grad_norm": 9.251798551984521, "learning_rate": 4.1396968559023973e-07, "loss": 0.2027, "step": 9356 }, { "epoch": 8.73669467787115, "grad_norm": 1.7279412112947183, "learning_rate": 4.1336744449989195e-07, "loss": 0.0421, "step": 9357 }, { "epoch": 8.73762838468721, "grad_norm": 1.6402460921128275, "learning_rate": 4.127656229074617e-07, "loss": 0.0505, "step": 9358 }, { "epoch": 8.738562091503269, "grad_norm": 0.7524264211104794, "learning_rate": 4.121642208679921e-07, "loss": 0.0128, "step": 9359 }, { "epoch": 8.739495798319329, "grad_norm": 1.5151527661108506, "learning_rate": 4.115632384364876e-07, "loss": 0.0138, "step": 9360 }, { "epoch": 8.740429505135388, "grad_norm": 1.1461612877910996, "learning_rate": 4.109626756679164e-07, "loss": 0.0296, "step": 9361 }, { "epoch": 8.741363211951448, "grad_norm": 0.7374227529995327, "learning_rate": 4.1036253261720507e-07, "loss": 0.0128, "step": 9362 }, { "epoch": 8.742296918767508, "grad_norm": 1.8217619192728767, "learning_rate": 4.0976280933924417e-07, "loss": 0.0551, "step": 9363 }, { "epoch": 8.743230625583568, "grad_norm": 0.6149854789959761, "learning_rate": 4.091635058888832e-07, "loss": 0.0082, "step": 9364 }, { "epoch": 8.744164332399627, "grad_norm": 0.5615660835564918, "learning_rate": 4.0856462232093717e-07, "loss": 0.0101, "step": 9365 }, { "epoch": 8.745098039215687, "grad_norm": 1.7429824379101093, "learning_rate": 4.079661586901801e-07, "loss": 0.0171, "step": 9366 }, { "epoch": 8.746031746031747, "grad_norm": 0.40582700912779696, "learning_rate": 4.07368115051347e-07, "loss": 0.0069, "step": 9367 }, { "epoch": 8.746965452847807, "grad_norm": 2.793293875897, "learning_rate": 4.067704914591358e-07, "loss": 0.0468, "step": 9368 }, { "epoch": 8.747899159663866, "grad_norm": 3.0301797037700458, "learning_rate": 4.061732879682073e-07, "loss": 0.0837, "step": 9369 }, { "epoch": 8.748832866479926, "grad_norm": 0.47251048183938144, "learning_rate": 4.0557650463318056e-07, "loss": 0.0077, "step": 9370 }, { "epoch": 8.749766573295986, "grad_norm": 1.050324989559898, "learning_rate": 4.0498014150863794e-07, "loss": 0.0135, "step": 9371 }, { "epoch": 8.750700280112046, "grad_norm": 6.315060814691499, "learning_rate": 4.0438419864912535e-07, "loss": 0.2119, "step": 9372 }, { "epoch": 8.751633986928105, "grad_norm": 1.6695526327464165, "learning_rate": 4.037886761091464e-07, "loss": 0.0488, "step": 9373 }, { "epoch": 8.752567693744165, "grad_norm": 0.9259819193906877, "learning_rate": 4.0319357394316917e-07, "loss": 0.0247, "step": 9374 }, { "epoch": 8.753501400560225, "grad_norm": 1.935843658334832, "learning_rate": 4.0259889220562066e-07, "loss": 0.0335, "step": 9375 }, { "epoch": 8.754435107376285, "grad_norm": 0.1070469786075613, "learning_rate": 4.0200463095089347e-07, "loss": 0.0006, "step": 9376 }, { "epoch": 8.755368814192344, "grad_norm": 0.24162468943192442, "learning_rate": 4.0141079023333804e-07, "loss": 0.0004, "step": 9377 }, { "epoch": 8.756302521008404, "grad_norm": 3.038746061197711, "learning_rate": 4.008173701072665e-07, "loss": 0.0994, "step": 9378 }, { "epoch": 8.757236227824464, "grad_norm": 2.77744469905594, "learning_rate": 4.0022437062695594e-07, "loss": 0.0283, "step": 9379 }, { "epoch": 8.758169934640524, "grad_norm": 0.76901067158767, "learning_rate": 3.996317918466419e-07, "loss": 0.0103, "step": 9380 }, { "epoch": 8.759103641456583, "grad_norm": 0.4707034073439644, "learning_rate": 3.9903963382052044e-07, "loss": 0.0072, "step": 9381 }, { "epoch": 8.760037348272643, "grad_norm": 1.5518943145988142, "learning_rate": 3.9844789660275327e-07, "loss": 0.0502, "step": 9382 }, { "epoch": 8.760971055088703, "grad_norm": 1.625009226689308, "learning_rate": 3.9785658024746097e-07, "loss": 0.0488, "step": 9383 }, { "epoch": 8.761904761904763, "grad_norm": 0.45996116030255935, "learning_rate": 3.9726568480872474e-07, "loss": 0.0081, "step": 9384 }, { "epoch": 8.762838468720823, "grad_norm": 11.779940243609024, "learning_rate": 3.96675210340588e-07, "loss": 0.2449, "step": 9385 }, { "epoch": 8.763772175536882, "grad_norm": 2.507983674274929, "learning_rate": 3.960851568970586e-07, "loss": 0.0701, "step": 9386 }, { "epoch": 8.764705882352942, "grad_norm": 1.3623528430507028, "learning_rate": 3.9549552453210126e-07, "loss": 0.0232, "step": 9387 }, { "epoch": 8.765639589169002, "grad_norm": 2.040433458485159, "learning_rate": 3.949063132996456e-07, "loss": 0.0162, "step": 9388 }, { "epoch": 8.766573295985062, "grad_norm": 2.3732201577528653, "learning_rate": 3.9431752325358066e-07, "loss": 0.0424, "step": 9389 }, { "epoch": 8.767507002801121, "grad_norm": 2.3557636362121253, "learning_rate": 3.9372915444775725e-07, "loss": 0.0227, "step": 9390 }, { "epoch": 8.768440709617181, "grad_norm": 3.640875716753321, "learning_rate": 3.931412069359897e-07, "loss": 0.1002, "step": 9391 }, { "epoch": 8.76937441643324, "grad_norm": 1.167560997096144, "learning_rate": 3.925536807720509e-07, "loss": 0.0317, "step": 9392 }, { "epoch": 8.7703081232493, "grad_norm": 3.82808100889781, "learning_rate": 3.9196657600967804e-07, "loss": 0.0846, "step": 9393 }, { "epoch": 8.77124183006536, "grad_norm": 0.985616198708432, "learning_rate": 3.9137989270256585e-07, "loss": 0.0198, "step": 9394 }, { "epoch": 8.77217553688142, "grad_norm": 4.13773093598865, "learning_rate": 3.907936309043753e-07, "loss": 0.0977, "step": 9395 }, { "epoch": 8.77310924369748, "grad_norm": 1.4611659227613185, "learning_rate": 3.902077906687263e-07, "loss": 0.0203, "step": 9396 }, { "epoch": 8.77404295051354, "grad_norm": 0.18872745167578936, "learning_rate": 3.8962237204919927e-07, "loss": 0.001, "step": 9397 }, { "epoch": 8.7749766573296, "grad_norm": 2.333988848216856, "learning_rate": 3.890373750993365e-07, "loss": 0.0352, "step": 9398 }, { "epoch": 8.775910364145659, "grad_norm": 2.5718102921928856, "learning_rate": 3.8845279987264506e-07, "loss": 0.0834, "step": 9399 }, { "epoch": 8.776844070961719, "grad_norm": 3.09272612065549, "learning_rate": 3.878686464225889e-07, "loss": 0.0432, "step": 9400 }, { "epoch": 8.777777777777779, "grad_norm": 2.152344850001254, "learning_rate": 3.872849148025953e-07, "loss": 0.0317, "step": 9401 }, { "epoch": 8.778711484593838, "grad_norm": 2.1673494303150247, "learning_rate": 3.867016050660527e-07, "loss": 0.0523, "step": 9402 }, { "epoch": 8.779645191409898, "grad_norm": 0.8602713619984154, "learning_rate": 3.861187172663128e-07, "loss": 0.0192, "step": 9403 }, { "epoch": 8.780578898225958, "grad_norm": 0.660548923973344, "learning_rate": 3.855362514566857e-07, "loss": 0.0136, "step": 9404 }, { "epoch": 8.781512605042018, "grad_norm": 2.0085756959594776, "learning_rate": 3.8495420769044443e-07, "loss": 0.025, "step": 9405 }, { "epoch": 8.782446311858077, "grad_norm": 1.046769384103209, "learning_rate": 3.843725860208236e-07, "loss": 0.0121, "step": 9406 }, { "epoch": 8.783380018674137, "grad_norm": 1.8453734203051806, "learning_rate": 3.837913865010179e-07, "loss": 0.0419, "step": 9407 }, { "epoch": 8.784313725490197, "grad_norm": 0.3361662248048336, "learning_rate": 3.832106091841858e-07, "loss": 0.0035, "step": 9408 }, { "epoch": 8.785247432306257, "grad_norm": 1.0852564391152246, "learning_rate": 3.826302541234456e-07, "loss": 0.0198, "step": 9409 }, { "epoch": 8.786181139122316, "grad_norm": 0.6240864130631792, "learning_rate": 3.820503213718768e-07, "loss": 0.0046, "step": 9410 }, { "epoch": 8.787114845938376, "grad_norm": 12.208073802991175, "learning_rate": 3.814708109825199e-07, "loss": 0.1443, "step": 9411 }, { "epoch": 8.788048552754436, "grad_norm": 3.78389987205509, "learning_rate": 3.808917230083786e-07, "loss": 0.1163, "step": 9412 }, { "epoch": 8.788982259570496, "grad_norm": 3.0877476397570787, "learning_rate": 3.8031305750241666e-07, "loss": 0.0533, "step": 9413 }, { "epoch": 8.789915966386555, "grad_norm": 1.1453351922282264, "learning_rate": 3.797348145175589e-07, "loss": 0.0257, "step": 9414 }, { "epoch": 8.790849673202615, "grad_norm": 0.6525899037236343, "learning_rate": 3.791569941066919e-07, "loss": 0.0087, "step": 9415 }, { "epoch": 8.791783380018675, "grad_norm": 0.22197641486984682, "learning_rate": 3.785795963226646e-07, "loss": 0.0022, "step": 9416 }, { "epoch": 8.792717086834735, "grad_norm": 3.744044435455321, "learning_rate": 3.780026212182858e-07, "loss": 0.1155, "step": 9417 }, { "epoch": 8.793650793650794, "grad_norm": 0.9834910868198038, "learning_rate": 3.7742606884632603e-07, "loss": 0.0083, "step": 9418 }, { "epoch": 8.794584500466854, "grad_norm": 2.9076628094040133, "learning_rate": 3.7684993925951653e-07, "loss": 0.0745, "step": 9419 }, { "epoch": 8.795518207282914, "grad_norm": 0.5284404558922681, "learning_rate": 3.7627423251055283e-07, "loss": 0.0053, "step": 9420 }, { "epoch": 8.796451914098974, "grad_norm": 1.175103296369686, "learning_rate": 3.756989486520884e-07, "loss": 0.0092, "step": 9421 }, { "epoch": 8.797385620915033, "grad_norm": 1.191517299622789, "learning_rate": 3.7512408773673956e-07, "loss": 0.0244, "step": 9422 }, { "epoch": 8.798319327731093, "grad_norm": 1.5415439800776114, "learning_rate": 3.745496498170831e-07, "loss": 0.0434, "step": 9423 }, { "epoch": 8.799253034547153, "grad_norm": 1.8148160100737307, "learning_rate": 3.7397563494565745e-07, "loss": 0.0406, "step": 9424 }, { "epoch": 8.800186741363213, "grad_norm": 3.1136291566426606, "learning_rate": 3.7340204317496407e-07, "loss": 0.07, "step": 9425 }, { "epoch": 8.801120448179272, "grad_norm": 3.1184980912317517, "learning_rate": 3.7282887455746317e-07, "loss": 0.07, "step": 9426 }, { "epoch": 8.802054154995332, "grad_norm": 1.5853125662513892, "learning_rate": 3.722561291455773e-07, "loss": 0.0284, "step": 9427 }, { "epoch": 8.802987861811392, "grad_norm": 0.7891991052814968, "learning_rate": 3.716838069916895e-07, "loss": 0.0163, "step": 9428 }, { "epoch": 8.803921568627452, "grad_norm": 1.6845435892151335, "learning_rate": 3.711119081481468e-07, "loss": 0.04, "step": 9429 }, { "epoch": 8.804855275443511, "grad_norm": 0.39704514952535597, "learning_rate": 3.705404326672546e-07, "loss": 0.0052, "step": 9430 }, { "epoch": 8.805788982259571, "grad_norm": 1.6913963953530617, "learning_rate": 3.6996938060128105e-07, "loss": 0.0296, "step": 9431 }, { "epoch": 8.806722689075631, "grad_norm": 4.727769074672732, "learning_rate": 3.693987520024539e-07, "loss": 0.1755, "step": 9432 }, { "epoch": 8.80765639589169, "grad_norm": 0.5231241284028519, "learning_rate": 3.6882854692296524e-07, "loss": 0.0036, "step": 9433 }, { "epoch": 8.80859010270775, "grad_norm": 3.7496138465320623, "learning_rate": 3.6825876541496494e-07, "loss": 0.0582, "step": 9434 }, { "epoch": 8.80952380952381, "grad_norm": 0.3770317706676822, "learning_rate": 3.6768940753056704e-07, "loss": 0.0095, "step": 9435 }, { "epoch": 8.81045751633987, "grad_norm": 0.3344452111058805, "learning_rate": 3.671204733218442e-07, "loss": 0.0037, "step": 9436 }, { "epoch": 8.81139122315593, "grad_norm": 1.976233884853792, "learning_rate": 3.665519628408332e-07, "loss": 0.0289, "step": 9437 }, { "epoch": 8.81232492997199, "grad_norm": 1.2944664083877968, "learning_rate": 3.6598387613952954e-07, "loss": 0.0346, "step": 9438 }, { "epoch": 8.81325863678805, "grad_norm": 2.0454962276808892, "learning_rate": 3.6541621326989183e-07, "loss": 0.0684, "step": 9439 }, { "epoch": 8.814192343604109, "grad_norm": 2.4868424389989747, "learning_rate": 3.648489742838385e-07, "loss": 0.0693, "step": 9440 }, { "epoch": 8.815126050420169, "grad_norm": 2.469584132086212, "learning_rate": 3.642821592332491e-07, "loss": 0.0454, "step": 9441 }, { "epoch": 8.816059757236228, "grad_norm": 1.1585485140970546, "learning_rate": 3.6371576816996603e-07, "loss": 0.0276, "step": 9442 }, { "epoch": 8.816993464052288, "grad_norm": 1.043532373117237, "learning_rate": 3.6314980114579236e-07, "loss": 0.0187, "step": 9443 }, { "epoch": 8.817927170868348, "grad_norm": 2.089133149670779, "learning_rate": 3.6258425821249113e-07, "loss": 0.0329, "step": 9444 }, { "epoch": 8.818860877684408, "grad_norm": 1.9405952889676514, "learning_rate": 3.620191394217876e-07, "loss": 0.0454, "step": 9445 }, { "epoch": 8.819794584500467, "grad_norm": 0.02060375863091923, "learning_rate": 3.614544448253682e-07, "loss": 0.0001, "step": 9446 }, { "epoch": 8.820728291316527, "grad_norm": 8.684842127887904, "learning_rate": 3.608901744748811e-07, "loss": 0.1167, "step": 9447 }, { "epoch": 8.821661998132587, "grad_norm": 6.044358202990968, "learning_rate": 3.603263284219327e-07, "loss": 0.1489, "step": 9448 }, { "epoch": 8.822595704948647, "grad_norm": 1.2257261731377487, "learning_rate": 3.597629067180963e-07, "loss": 0.0209, "step": 9449 }, { "epoch": 8.823529411764707, "grad_norm": 13.32916151432826, "learning_rate": 3.591999094149012e-07, "loss": 0.2052, "step": 9450 }, { "epoch": 8.824463118580766, "grad_norm": 2.514518093066206, "learning_rate": 3.586373365638385e-07, "loss": 0.0732, "step": 9451 }, { "epoch": 8.825396825396826, "grad_norm": 3.7967044909251833, "learning_rate": 3.580751882163641e-07, "loss": 0.0958, "step": 9452 }, { "epoch": 8.826330532212886, "grad_norm": 1.0275092074336614, "learning_rate": 3.5751346442389144e-07, "loss": 0.0255, "step": 9453 }, { "epoch": 8.827264239028946, "grad_norm": 0.6273261537637528, "learning_rate": 3.569521652377966e-07, "loss": 0.007, "step": 9454 }, { "epoch": 8.828197945845005, "grad_norm": 3.7134743183989642, "learning_rate": 3.5639129070941524e-07, "loss": 0.0678, "step": 9455 }, { "epoch": 8.829131652661065, "grad_norm": 4.962121405565312, "learning_rate": 3.558308408900474e-07, "loss": 0.1551, "step": 9456 }, { "epoch": 8.830065359477125, "grad_norm": 1.1047691255980976, "learning_rate": 3.55270815830952e-07, "loss": 0.0179, "step": 9457 }, { "epoch": 8.830999066293185, "grad_norm": 0.7636966169101251, "learning_rate": 3.5471121558334766e-07, "loss": 0.0173, "step": 9458 }, { "epoch": 8.831932773109244, "grad_norm": 3.548558115326594, "learning_rate": 3.5415204019841886e-07, "loss": 0.0929, "step": 9459 }, { "epoch": 8.832866479925304, "grad_norm": 2.764228420623425, "learning_rate": 3.535932897273064e-07, "loss": 0.0435, "step": 9460 }, { "epoch": 8.833800186741364, "grad_norm": 7.082418068173664, "learning_rate": 3.530349642211145e-07, "loss": 0.2417, "step": 9461 }, { "epoch": 8.834733893557424, "grad_norm": 5.3768511420437894, "learning_rate": 3.524770637309077e-07, "loss": 0.0862, "step": 9462 }, { "epoch": 8.835667600373483, "grad_norm": 4.234206842178312, "learning_rate": 3.519195883077131e-07, "loss": 0.1182, "step": 9463 }, { "epoch": 8.836601307189543, "grad_norm": 1.6859494646242785, "learning_rate": 3.5136253800251816e-07, "loss": 0.0156, "step": 9464 }, { "epoch": 8.837535014005603, "grad_norm": 2.9198262287995385, "learning_rate": 3.5080591286626996e-07, "loss": 0.082, "step": 9465 }, { "epoch": 8.838468720821663, "grad_norm": 1.2486002239800338, "learning_rate": 3.5024971294987833e-07, "loss": 0.0223, "step": 9466 }, { "epoch": 8.839402427637722, "grad_norm": 1.8934442052795304, "learning_rate": 3.4969393830421474e-07, "loss": 0.034, "step": 9467 }, { "epoch": 8.840336134453782, "grad_norm": 5.561025227415046, "learning_rate": 3.491385889801102e-07, "loss": 0.1756, "step": 9468 }, { "epoch": 8.841269841269842, "grad_norm": 2.641315503190616, "learning_rate": 3.485836650283575e-07, "loss": 0.0359, "step": 9469 }, { "epoch": 8.842203548085902, "grad_norm": 0.9843015276898002, "learning_rate": 3.4802916649970986e-07, "loss": 0.0135, "step": 9470 }, { "epoch": 8.843137254901961, "grad_norm": 1.6251407891158616, "learning_rate": 3.474750934448845e-07, "loss": 0.0445, "step": 9471 }, { "epoch": 8.844070961718021, "grad_norm": 3.069679548329617, "learning_rate": 3.4692144591455533e-07, "loss": 0.0498, "step": 9472 }, { "epoch": 8.84500466853408, "grad_norm": 1.9942232863236533, "learning_rate": 3.463682239593602e-07, "loss": 0.0321, "step": 9473 }, { "epoch": 8.84593837535014, "grad_norm": 0.9188326924152442, "learning_rate": 3.458154276298975e-07, "loss": 0.0256, "step": 9474 }, { "epoch": 8.8468720821662, "grad_norm": 2.695926352828805, "learning_rate": 3.452630569767257e-07, "loss": 0.0582, "step": 9475 }, { "epoch": 8.84780578898226, "grad_norm": 0.7146121516816246, "learning_rate": 3.4471111205036657e-07, "loss": 0.017, "step": 9476 }, { "epoch": 8.84873949579832, "grad_norm": 1.4948206933337294, "learning_rate": 3.441595929013003e-07, "loss": 0.0233, "step": 9477 }, { "epoch": 8.84967320261438, "grad_norm": 2.389909314104592, "learning_rate": 3.4360849957997045e-07, "loss": 0.0413, "step": 9478 }, { "epoch": 8.85060690943044, "grad_norm": 5.813142889973284, "learning_rate": 3.430578321367789e-07, "loss": 0.1319, "step": 9479 }, { "epoch": 8.8515406162465, "grad_norm": 3.0044815999989, "learning_rate": 3.42507590622092e-07, "loss": 0.0824, "step": 9480 }, { "epoch": 8.852474323062559, "grad_norm": 2.9992078948907834, "learning_rate": 3.4195777508623516e-07, "loss": 0.0397, "step": 9481 }, { "epoch": 8.853408029878619, "grad_norm": 2.0539389334149036, "learning_rate": 3.414083855794942e-07, "loss": 0.0437, "step": 9482 }, { "epoch": 8.854341736694678, "grad_norm": 1.0390947945159301, "learning_rate": 3.4085942215211607e-07, "loss": 0.017, "step": 9483 }, { "epoch": 8.855275443510738, "grad_norm": 2.0110613727882503, "learning_rate": 3.4031088485431186e-07, "loss": 0.0395, "step": 9484 }, { "epoch": 8.856209150326798, "grad_norm": 6.011113467530552, "learning_rate": 3.397627737362497e-07, "loss": 0.1456, "step": 9485 }, { "epoch": 8.857142857142858, "grad_norm": 1.2533711886576326, "learning_rate": 3.392150888480611e-07, "loss": 0.0178, "step": 9486 }, { "epoch": 8.858076563958917, "grad_norm": 1.770894614682885, "learning_rate": 3.386678302398372e-07, "loss": 0.0699, "step": 9487 }, { "epoch": 8.859010270774977, "grad_norm": 2.013914134955975, "learning_rate": 3.3812099796163066e-07, "loss": 0.059, "step": 9488 }, { "epoch": 8.859943977591037, "grad_norm": 5.522605367681528, "learning_rate": 3.3757459206345654e-07, "loss": 0.1056, "step": 9489 }, { "epoch": 8.860877684407097, "grad_norm": 2.7592502753889363, "learning_rate": 3.3702861259528876e-07, "loss": 0.0781, "step": 9490 }, { "epoch": 8.861811391223156, "grad_norm": 1.601224665008752, "learning_rate": 3.364830596070634e-07, "loss": 0.048, "step": 9491 }, { "epoch": 8.862745098039216, "grad_norm": 3.075201334044098, "learning_rate": 3.359379331486762e-07, "loss": 0.0792, "step": 9492 }, { "epoch": 8.863678804855276, "grad_norm": 1.9876712568434862, "learning_rate": 3.3539323326998663e-07, "loss": 0.0372, "step": 9493 }, { "epoch": 8.864612511671336, "grad_norm": 0.25444151359407524, "learning_rate": 3.348489600208127e-07, "loss": 0.0028, "step": 9494 }, { "epoch": 8.865546218487395, "grad_norm": 2.91006314257557, "learning_rate": 3.3430511345093455e-07, "loss": 0.0493, "step": 9495 }, { "epoch": 8.866479925303455, "grad_norm": 0.3674434737023112, "learning_rate": 3.3376169361009125e-07, "loss": 0.0038, "step": 9496 }, { "epoch": 8.867413632119515, "grad_norm": 0.7127965590131217, "learning_rate": 3.3321870054798697e-07, "loss": 0.0152, "step": 9497 }, { "epoch": 8.868347338935575, "grad_norm": 1.774271563186753, "learning_rate": 3.3267613431428315e-07, "loss": 0.0439, "step": 9498 }, { "epoch": 8.869281045751634, "grad_norm": 3.2246174424576286, "learning_rate": 3.321339949586039e-07, "loss": 0.0931, "step": 9499 }, { "epoch": 8.870214752567694, "grad_norm": 3.2826357890520423, "learning_rate": 3.3159228253053235e-07, "loss": 0.1061, "step": 9500 }, { "epoch": 8.871148459383754, "grad_norm": 0.7027678878056711, "learning_rate": 3.310509970796161e-07, "loss": 0.0185, "step": 9501 }, { "epoch": 8.872082166199814, "grad_norm": 2.138394341073104, "learning_rate": 3.3051013865536055e-07, "loss": 0.0828, "step": 9502 }, { "epoch": 8.873015873015873, "grad_norm": 0.9369025604509243, "learning_rate": 3.2996970730723333e-07, "loss": 0.0175, "step": 9503 }, { "epoch": 8.873949579831933, "grad_norm": 1.265745195628973, "learning_rate": 3.2942970308466273e-07, "loss": 0.0259, "step": 9504 }, { "epoch": 8.874883286647993, "grad_norm": 0.7312276236326308, "learning_rate": 3.2889012603703753e-07, "loss": 0.0295, "step": 9505 }, { "epoch": 8.875816993464053, "grad_norm": 2.393731837062083, "learning_rate": 3.2835097621370937e-07, "loss": 0.0349, "step": 9506 }, { "epoch": 8.876750700280112, "grad_norm": 0.22208047117279076, "learning_rate": 3.2781225366398885e-07, "loss": 0.0017, "step": 9507 }, { "epoch": 8.877684407096172, "grad_norm": 1.3371562145382456, "learning_rate": 3.272739584371476e-07, "loss": 0.0284, "step": 9508 }, { "epoch": 8.878618113912232, "grad_norm": 2.154915812470828, "learning_rate": 3.26736090582418e-07, "loss": 0.041, "step": 9509 }, { "epoch": 8.879551820728292, "grad_norm": 2.7659484403849444, "learning_rate": 3.2619865014899555e-07, "loss": 0.0962, "step": 9510 }, { "epoch": 8.880485527544351, "grad_norm": 2.910730791234448, "learning_rate": 3.256616371860344e-07, "loss": 0.0876, "step": 9511 }, { "epoch": 8.881419234360411, "grad_norm": 1.4352386539665443, "learning_rate": 3.2512505174265074e-07, "loss": 0.0342, "step": 9512 }, { "epoch": 8.882352941176471, "grad_norm": 1.832290909646031, "learning_rate": 3.245888938679193e-07, "loss": 0.0217, "step": 9513 }, { "epoch": 8.88328664799253, "grad_norm": 0.564306922849195, "learning_rate": 3.2405316361088014e-07, "loss": 0.0062, "step": 9514 }, { "epoch": 8.88422035480859, "grad_norm": 1.343875435173573, "learning_rate": 3.2351786102053086e-07, "loss": 0.0208, "step": 9515 }, { "epoch": 8.88515406162465, "grad_norm": 2.21581809261221, "learning_rate": 3.2298298614583004e-07, "loss": 0.0552, "step": 9516 }, { "epoch": 8.88608776844071, "grad_norm": 1.2396650057174545, "learning_rate": 3.224485390356974e-07, "loss": 0.031, "step": 9517 }, { "epoch": 8.88702147525677, "grad_norm": 1.3538340295004534, "learning_rate": 3.219145197390161e-07, "loss": 0.0221, "step": 9518 }, { "epoch": 8.88795518207283, "grad_norm": 1.5354082435872685, "learning_rate": 3.213809283046265e-07, "loss": 0.0172, "step": 9519 }, { "epoch": 8.88888888888889, "grad_norm": 1.4135258034442548, "learning_rate": 3.2084776478133184e-07, "loss": 0.0411, "step": 9520 }, { "epoch": 8.889822595704949, "grad_norm": 1.884024723974699, "learning_rate": 3.203150292178953e-07, "loss": 0.0509, "step": 9521 }, { "epoch": 8.890756302521009, "grad_norm": 0.4219363259901393, "learning_rate": 3.1978272166304224e-07, "loss": 0.0079, "step": 9522 }, { "epoch": 8.891690009337069, "grad_norm": 0.2616110860492942, "learning_rate": 3.1925084216545776e-07, "loss": 0.0023, "step": 9523 }, { "epoch": 8.892623716153128, "grad_norm": 2.2189200435878744, "learning_rate": 3.187193907737868e-07, "loss": 0.0171, "step": 9524 }, { "epoch": 8.893557422969188, "grad_norm": 1.5977855311503966, "learning_rate": 3.1818836753663817e-07, "loss": 0.0249, "step": 9525 }, { "epoch": 8.894491129785248, "grad_norm": 0.8579677819336385, "learning_rate": 3.1765777250257976e-07, "loss": 0.0098, "step": 9526 }, { "epoch": 8.895424836601308, "grad_norm": 1.1104006258638415, "learning_rate": 3.1712760572013945e-07, "loss": 0.017, "step": 9527 }, { "epoch": 8.896358543417367, "grad_norm": 0.7185885638045241, "learning_rate": 3.1659786723780615e-07, "loss": 0.0129, "step": 9528 }, { "epoch": 8.897292250233427, "grad_norm": 3.4894194656859656, "learning_rate": 3.1606855710403174e-07, "loss": 0.1172, "step": 9529 }, { "epoch": 8.898225957049487, "grad_norm": 3.9825512157578213, "learning_rate": 3.1553967536722686e-07, "loss": 0.1088, "step": 9530 }, { "epoch": 8.899159663865547, "grad_norm": 6.7083510156711155, "learning_rate": 3.150112220757634e-07, "loss": 0.2523, "step": 9531 }, { "epoch": 8.900093370681606, "grad_norm": 2.1635920320414073, "learning_rate": 3.144831972779744e-07, "loss": 0.0463, "step": 9532 }, { "epoch": 8.901027077497666, "grad_norm": 4.131827882438098, "learning_rate": 3.1395560102215395e-07, "loss": 0.0783, "step": 9533 }, { "epoch": 8.901960784313726, "grad_norm": 2.22762154935532, "learning_rate": 3.1342843335655624e-07, "loss": 0.0778, "step": 9534 }, { "epoch": 8.902894491129786, "grad_norm": 0.46451085175234635, "learning_rate": 3.1290169432939556e-07, "loss": 0.0046, "step": 9535 }, { "epoch": 8.903828197945845, "grad_norm": 3.3276038990327397, "learning_rate": 3.1237538398884937e-07, "loss": 0.0247, "step": 9536 }, { "epoch": 8.904761904761905, "grad_norm": 0.09360618083027337, "learning_rate": 3.118495023830537e-07, "loss": 0.0004, "step": 9537 }, { "epoch": 8.905695611577965, "grad_norm": 2.1893905339147577, "learning_rate": 3.1132404956010666e-07, "loss": 0.0414, "step": 9538 }, { "epoch": 8.906629318394025, "grad_norm": 1.43496101006053, "learning_rate": 3.1079902556806594e-07, "loss": 0.0145, "step": 9539 }, { "epoch": 8.907563025210084, "grad_norm": 0.07077083608005685, "learning_rate": 3.102744304549515e-07, "loss": 0.0003, "step": 9540 }, { "epoch": 8.908496732026144, "grad_norm": 1.067563955494303, "learning_rate": 3.097502642687439e-07, "loss": 0.0245, "step": 9541 }, { "epoch": 8.909430438842204, "grad_norm": 3.883275358083328, "learning_rate": 3.0922652705738243e-07, "loss": 0.0902, "step": 9542 }, { "epoch": 8.910364145658264, "grad_norm": 1.1502334116608148, "learning_rate": 3.0870321886876885e-07, "loss": 0.0075, "step": 9543 }, { "epoch": 8.911297852474323, "grad_norm": 1.0641207534746366, "learning_rate": 3.0818033975076655e-07, "loss": 0.0269, "step": 9544 }, { "epoch": 8.912231559290383, "grad_norm": 2.7779777167780226, "learning_rate": 3.076578897511978e-07, "loss": 0.053, "step": 9545 }, { "epoch": 8.913165266106443, "grad_norm": 0.791285195939105, "learning_rate": 3.071358689178466e-07, "loss": 0.019, "step": 9546 }, { "epoch": 8.914098972922503, "grad_norm": 2.6084906650839503, "learning_rate": 3.0661427729845706e-07, "loss": 0.0507, "step": 9547 }, { "epoch": 8.915032679738562, "grad_norm": 0.3467127690830704, "learning_rate": 3.060931149407348e-07, "loss": 0.0061, "step": 9548 }, { "epoch": 8.915966386554622, "grad_norm": 3.3077577513899876, "learning_rate": 3.0557238189234615e-07, "loss": 0.0892, "step": 9549 }, { "epoch": 8.916900093370682, "grad_norm": 0.7241264469341366, "learning_rate": 3.0505207820091755e-07, "loss": 0.0054, "step": 9550 }, { "epoch": 8.917833800186742, "grad_norm": 1.7026630785601442, "learning_rate": 3.0453220391403583e-07, "loss": 0.0479, "step": 9551 }, { "epoch": 8.918767507002801, "grad_norm": 2.336528018031809, "learning_rate": 3.040127590792502e-07, "loss": 0.0565, "step": 9552 }, { "epoch": 8.919701213818861, "grad_norm": 1.1438606636986093, "learning_rate": 3.0349374374406937e-07, "loss": 0.0349, "step": 9553 }, { "epoch": 8.920634920634921, "grad_norm": 1.6405845205165634, "learning_rate": 3.029751579559631e-07, "loss": 0.0307, "step": 9554 }, { "epoch": 8.92156862745098, "grad_norm": 1.5594454401391937, "learning_rate": 3.0245700176236125e-07, "loss": 0.039, "step": 9555 }, { "epoch": 8.92250233426704, "grad_norm": 1.836489804194422, "learning_rate": 3.019392752106548e-07, "loss": 0.0232, "step": 9556 }, { "epoch": 8.9234360410831, "grad_norm": 2.25405561162367, "learning_rate": 3.0142197834819643e-07, "loss": 0.0296, "step": 9557 }, { "epoch": 8.92436974789916, "grad_norm": 5.763959092834485, "learning_rate": 3.009051112222977e-07, "loss": 0.2047, "step": 9558 }, { "epoch": 8.92530345471522, "grad_norm": 1.248208970296136, "learning_rate": 3.003886738802325e-07, "loss": 0.0128, "step": 9559 }, { "epoch": 8.92623716153128, "grad_norm": 1.6886818836467994, "learning_rate": 2.998726663692336e-07, "loss": 0.0368, "step": 9560 }, { "epoch": 8.92717086834734, "grad_norm": 0.4399110462733374, "learning_rate": 2.9935708873649713e-07, "loss": 0.0069, "step": 9561 }, { "epoch": 8.928104575163399, "grad_norm": 0.49442962032447363, "learning_rate": 2.9884194102917763e-07, "loss": 0.0079, "step": 9562 }, { "epoch": 8.929038281979459, "grad_norm": 9.03444046378206, "learning_rate": 2.9832722329439015e-07, "loss": 0.187, "step": 9563 }, { "epoch": 8.929971988795518, "grad_norm": 0.6784982787686259, "learning_rate": 2.97812935579212e-07, "loss": 0.0055, "step": 9564 }, { "epoch": 8.930905695611578, "grad_norm": 3.0395150245423292, "learning_rate": 2.9729907793068114e-07, "loss": 0.0883, "step": 9565 }, { "epoch": 8.931839402427638, "grad_norm": 2.1965839551763398, "learning_rate": 2.9678565039579446e-07, "loss": 0.0639, "step": 9566 }, { "epoch": 8.932773109243698, "grad_norm": 0.8951047240206687, "learning_rate": 2.9627265302151096e-07, "loss": 0.0218, "step": 9567 }, { "epoch": 8.933706816059757, "grad_norm": 1.8356357520275994, "learning_rate": 2.9576008585474935e-07, "loss": 0.0457, "step": 9568 }, { "epoch": 8.934640522875817, "grad_norm": 0.6527132141780204, "learning_rate": 2.9524794894239093e-07, "loss": 0.0104, "step": 9569 }, { "epoch": 8.935574229691877, "grad_norm": 1.0350070389907777, "learning_rate": 2.9473624233127493e-07, "loss": 0.0159, "step": 9570 }, { "epoch": 8.936507936507937, "grad_norm": 0.29027336240063545, "learning_rate": 2.9422496606820284e-07, "loss": 0.0024, "step": 9571 }, { "epoch": 8.937441643323996, "grad_norm": 3.9291527609577526, "learning_rate": 2.9371412019993664e-07, "loss": 0.0669, "step": 9572 }, { "epoch": 8.938375350140056, "grad_norm": 3.9147169202717933, "learning_rate": 2.9320370477319835e-07, "loss": 0.1296, "step": 9573 }, { "epoch": 8.939309056956116, "grad_norm": 2.5511447863033156, "learning_rate": 2.926937198346719e-07, "loss": 0.065, "step": 9574 }, { "epoch": 8.940242763772176, "grad_norm": 1.2891409493951131, "learning_rate": 2.9218416543100036e-07, "loss": 0.0224, "step": 9575 }, { "epoch": 8.941176470588236, "grad_norm": 0.9974097760118587, "learning_rate": 2.9167504160878877e-07, "loss": 0.0128, "step": 9576 }, { "epoch": 8.942110177404295, "grad_norm": 4.573710856096433, "learning_rate": 2.9116634841460047e-07, "loss": 0.1152, "step": 9577 }, { "epoch": 8.943043884220355, "grad_norm": 0.4120437319172876, "learning_rate": 2.906580858949631e-07, "loss": 0.0061, "step": 9578 }, { "epoch": 8.943977591036415, "grad_norm": 1.2312196824283643, "learning_rate": 2.9015025409636187e-07, "loss": 0.0231, "step": 9579 }, { "epoch": 8.944911297852475, "grad_norm": 2.8491329691891223, "learning_rate": 2.896428530652434e-07, "loss": 0.0855, "step": 9580 }, { "epoch": 8.945845004668534, "grad_norm": 1.274032466659827, "learning_rate": 2.891358828480151e-07, "loss": 0.0183, "step": 9581 }, { "epoch": 8.946778711484594, "grad_norm": 0.7960892346256431, "learning_rate": 2.8862934349104534e-07, "loss": 0.0087, "step": 9582 }, { "epoch": 8.947712418300654, "grad_norm": 0.34250434388048995, "learning_rate": 2.881232350406632e-07, "loss": 0.0077, "step": 9583 }, { "epoch": 8.948646125116714, "grad_norm": 2.128328355821695, "learning_rate": 2.8761755754315666e-07, "loss": 0.0589, "step": 9584 }, { "epoch": 8.949579831932773, "grad_norm": 5.947906152095241, "learning_rate": 2.8711231104477543e-07, "loss": 0.0872, "step": 9585 }, { "epoch": 8.950513538748833, "grad_norm": 0.8652431498317532, "learning_rate": 2.866074955917314e-07, "loss": 0.0115, "step": 9586 }, { "epoch": 8.951447245564893, "grad_norm": 1.570013821748845, "learning_rate": 2.861031112301943e-07, "loss": 0.0398, "step": 9587 }, { "epoch": 8.952380952380953, "grad_norm": 1.0893236422824324, "learning_rate": 2.8559915800629555e-07, "loss": 0.0345, "step": 9588 }, { "epoch": 8.953314659197012, "grad_norm": 0.8329484562605659, "learning_rate": 2.8509563596612776e-07, "loss": 0.016, "step": 9589 }, { "epoch": 8.954248366013072, "grad_norm": 4.594039866005567, "learning_rate": 2.845925451557424e-07, "loss": 0.1447, "step": 9590 }, { "epoch": 8.955182072829132, "grad_norm": 3.5382073838179644, "learning_rate": 2.8408988562115494e-07, "loss": 0.0813, "step": 9591 }, { "epoch": 8.956115779645192, "grad_norm": 0.2769242118493171, "learning_rate": 2.8358765740833683e-07, "loss": 0.0019, "step": 9592 }, { "epoch": 8.957049486461251, "grad_norm": 2.8287390008317947, "learning_rate": 2.8308586056322365e-07, "loss": 0.0803, "step": 9593 }, { "epoch": 8.957983193277311, "grad_norm": 2.789730793454428, "learning_rate": 2.8258449513170916e-07, "loss": 0.0653, "step": 9594 }, { "epoch": 8.95891690009337, "grad_norm": 3.696007884566877, "learning_rate": 2.820835611596501e-07, "loss": 0.0528, "step": 9595 }, { "epoch": 8.95985060690943, "grad_norm": 3.6356929076095277, "learning_rate": 2.81583058692862e-07, "loss": 0.0866, "step": 9596 }, { "epoch": 8.96078431372549, "grad_norm": 0.5210255154983062, "learning_rate": 2.810829877771204e-07, "loss": 0.0116, "step": 9597 }, { "epoch": 8.96171802054155, "grad_norm": 0.6514545679929682, "learning_rate": 2.8058334845816214e-07, "loss": 0.0069, "step": 9598 }, { "epoch": 8.96265172735761, "grad_norm": 2.9903618418161813, "learning_rate": 2.800841407816868e-07, "loss": 0.0514, "step": 9599 }, { "epoch": 8.96358543417367, "grad_norm": 1.2221988957899985, "learning_rate": 2.7958536479335054e-07, "loss": 0.0319, "step": 9600 }, { "epoch": 8.96451914098973, "grad_norm": 0.1935417995793413, "learning_rate": 2.790870205387719e-07, "loss": 0.0005, "step": 9601 }, { "epoch": 8.965452847805789, "grad_norm": 5.371759583968549, "learning_rate": 2.785891080635311e-07, "loss": 0.1752, "step": 9602 }, { "epoch": 8.966386554621849, "grad_norm": 2.6116689593870666, "learning_rate": 2.7809162741316673e-07, "loss": 0.0267, "step": 9603 }, { "epoch": 8.967320261437909, "grad_norm": 1.1475061628268564, "learning_rate": 2.7759457863317907e-07, "loss": 0.0207, "step": 9604 }, { "epoch": 8.968253968253968, "grad_norm": 2.2476946455478433, "learning_rate": 2.770979617690289e-07, "loss": 0.0699, "step": 9605 }, { "epoch": 8.969187675070028, "grad_norm": 3.1626569160437943, "learning_rate": 2.766017768661378e-07, "loss": 0.0969, "step": 9606 }, { "epoch": 8.970121381886088, "grad_norm": 3.9426002659300368, "learning_rate": 2.7610602396988653e-07, "loss": 0.1612, "step": 9607 }, { "epoch": 8.971055088702148, "grad_norm": 1.6652676595459237, "learning_rate": 2.7561070312561665e-07, "loss": 0.043, "step": 9608 }, { "epoch": 8.971988795518207, "grad_norm": 0.40635115566939317, "learning_rate": 2.7511581437863146e-07, "loss": 0.0052, "step": 9609 }, { "epoch": 8.972922502334267, "grad_norm": 2.786165772923348, "learning_rate": 2.7462135777419463e-07, "loss": 0.0608, "step": 9610 }, { "epoch": 8.973856209150327, "grad_norm": 0.8281489463261523, "learning_rate": 2.7412733335752784e-07, "loss": 0.0126, "step": 9611 }, { "epoch": 8.974789915966387, "grad_norm": 4.744762343290283, "learning_rate": 2.7363374117381716e-07, "loss": 0.092, "step": 9612 }, { "epoch": 8.975723622782446, "grad_norm": 1.5827582942232634, "learning_rate": 2.731405812682053e-07, "loss": 0.0359, "step": 9613 }, { "epoch": 8.976657329598506, "grad_norm": 2.911156613105318, "learning_rate": 2.7264785368579847e-07, "loss": 0.0972, "step": 9614 }, { "epoch": 8.977591036414566, "grad_norm": 0.5866821571972913, "learning_rate": 2.721555584716601e-07, "loss": 0.007, "step": 9615 }, { "epoch": 8.978524743230626, "grad_norm": 2.1235309604788934, "learning_rate": 2.7166369567081854e-07, "loss": 0.0693, "step": 9616 }, { "epoch": 8.979458450046685, "grad_norm": 2.704765746862287, "learning_rate": 2.711722653282584e-07, "loss": 0.0744, "step": 9617 }, { "epoch": 8.980392156862745, "grad_norm": 3.226526903227265, "learning_rate": 2.706812674889264e-07, "loss": 0.0781, "step": 9618 }, { "epoch": 8.981325863678805, "grad_norm": 2.633341466011258, "learning_rate": 2.701907021977296e-07, "loss": 0.0293, "step": 9619 }, { "epoch": 8.982259570494865, "grad_norm": 1.7869275877644213, "learning_rate": 2.697005694995369e-07, "loss": 0.0271, "step": 9620 }, { "epoch": 8.983193277310924, "grad_norm": 2.7890164267177497, "learning_rate": 2.692108694391754e-07, "loss": 0.0396, "step": 9621 }, { "epoch": 8.984126984126984, "grad_norm": 1.716472409439555, "learning_rate": 2.687216020614336e-07, "loss": 0.0229, "step": 9622 }, { "epoch": 8.985060690943044, "grad_norm": 5.351872454782087, "learning_rate": 2.682327674110602e-07, "loss": 0.0997, "step": 9623 }, { "epoch": 8.985994397759104, "grad_norm": 3.1313685609447908, "learning_rate": 2.6774436553276386e-07, "loss": 0.0827, "step": 9624 }, { "epoch": 8.986928104575163, "grad_norm": 4.190847018351154, "learning_rate": 2.6725639647121605e-07, "loss": 0.0533, "step": 9625 }, { "epoch": 8.987861811391223, "grad_norm": 0.4554391534571742, "learning_rate": 2.6676886027104556e-07, "loss": 0.0077, "step": 9626 }, { "epoch": 8.988795518207283, "grad_norm": 5.060909433913674, "learning_rate": 2.6628175697684333e-07, "loss": 0.0997, "step": 9627 }, { "epoch": 8.989729225023343, "grad_norm": 0.4266293506992544, "learning_rate": 2.657950866331599e-07, "loss": 0.0024, "step": 9628 }, { "epoch": 8.990662931839402, "grad_norm": 0.5178968630010077, "learning_rate": 2.653088492845074e-07, "loss": 0.0032, "step": 9629 }, { "epoch": 8.991596638655462, "grad_norm": 0.6348920506251473, "learning_rate": 2.648230449753575e-07, "loss": 0.0183, "step": 9630 }, { "epoch": 8.992530345471522, "grad_norm": 1.2076509665289954, "learning_rate": 2.643376737501413e-07, "loss": 0.0316, "step": 9631 }, { "epoch": 8.993464052287582, "grad_norm": 2.5882330325218317, "learning_rate": 2.638527356532522e-07, "loss": 0.0594, "step": 9632 }, { "epoch": 8.994397759103641, "grad_norm": 2.417551524745364, "learning_rate": 2.6336823072904305e-07, "loss": 0.0728, "step": 9633 }, { "epoch": 8.995331465919701, "grad_norm": 0.6995417462706565, "learning_rate": 2.628841590218273e-07, "loss": 0.0093, "step": 9634 }, { "epoch": 8.996265172735761, "grad_norm": 0.571370455192888, "learning_rate": 2.624005205758784e-07, "loss": 0.0041, "step": 9635 }, { "epoch": 8.99719887955182, "grad_norm": 1.4065670503772127, "learning_rate": 2.6191731543542984e-07, "loss": 0.0223, "step": 9636 }, { "epoch": 8.99813258636788, "grad_norm": 1.0071918576095666, "learning_rate": 2.6143454364467635e-07, "loss": 0.024, "step": 9637 }, { "epoch": 8.99906629318394, "grad_norm": 1.6765405420302137, "learning_rate": 2.609522052477736e-07, "loss": 0.0234, "step": 9638 }, { "epoch": 9.0, "grad_norm": 2.587423653011179, "learning_rate": 2.604703002888359e-07, "loss": 0.1068, "step": 9639 }, { "epoch": 9.00093370681606, "grad_norm": 1.3610119444711406, "learning_rate": 2.5998882881193955e-07, "loss": 0.0393, "step": 9640 }, { "epoch": 9.00186741363212, "grad_norm": 0.896879441855551, "learning_rate": 2.595077908611182e-07, "loss": 0.0093, "step": 9641 }, { "epoch": 9.00280112044818, "grad_norm": 1.2374523257790122, "learning_rate": 2.590271864803706e-07, "loss": 0.0393, "step": 9642 }, { "epoch": 9.003734827264239, "grad_norm": 0.45332446629211304, "learning_rate": 2.585470157136527e-07, "loss": 0.0049, "step": 9643 }, { "epoch": 9.004668534080299, "grad_norm": 1.9416857904620415, "learning_rate": 2.580672786048805e-07, "loss": 0.045, "step": 9644 }, { "epoch": 9.005602240896359, "grad_norm": 3.9874035298387, "learning_rate": 2.57587975197931e-07, "loss": 0.1115, "step": 9645 }, { "epoch": 9.006535947712418, "grad_norm": 2.5036505431929625, "learning_rate": 2.571091055366437e-07, "loss": 0.0527, "step": 9646 }, { "epoch": 9.007469654528478, "grad_norm": 2.100782130704417, "learning_rate": 2.5663066966481465e-07, "loss": 0.0644, "step": 9647 }, { "epoch": 9.008403361344538, "grad_norm": 1.5677784288132173, "learning_rate": 2.5615266762620327e-07, "loss": 0.0292, "step": 9648 }, { "epoch": 9.009337068160598, "grad_norm": 2.1622490111267303, "learning_rate": 2.556750994645263e-07, "loss": 0.0385, "step": 9649 }, { "epoch": 9.010270774976657, "grad_norm": 3.745999910185221, "learning_rate": 2.5519796522346485e-07, "loss": 0.1175, "step": 9650 }, { "epoch": 9.011204481792717, "grad_norm": 2.063512608029662, "learning_rate": 2.547212649466568e-07, "loss": 0.0546, "step": 9651 }, { "epoch": 9.012138188608777, "grad_norm": 1.1607189814935341, "learning_rate": 2.542449986777024e-07, "loss": 0.0281, "step": 9652 }, { "epoch": 9.013071895424837, "grad_norm": 0.9454816505168242, "learning_rate": 2.5376916646016046e-07, "loss": 0.0138, "step": 9653 }, { "epoch": 9.014005602240896, "grad_norm": 0.7801238300379358, "learning_rate": 2.532937683375508e-07, "loss": 0.0117, "step": 9654 }, { "epoch": 9.014939309056956, "grad_norm": 2.1541425125253344, "learning_rate": 2.5281880435335526e-07, "loss": 0.0305, "step": 9655 }, { "epoch": 9.015873015873016, "grad_norm": 3.1700901265281085, "learning_rate": 2.523442745510135e-07, "loss": 0.1158, "step": 9656 }, { "epoch": 9.016806722689076, "grad_norm": 1.2462116026370156, "learning_rate": 2.5187017897392754e-07, "loss": 0.0238, "step": 9657 }, { "epoch": 9.017740429505135, "grad_norm": 2.329807542033618, "learning_rate": 2.5139651766545646e-07, "loss": 0.076, "step": 9658 }, { "epoch": 9.018674136321195, "grad_norm": 4.55463328341897, "learning_rate": 2.5092329066892404e-07, "loss": 0.042, "step": 9659 }, { "epoch": 9.019607843137255, "grad_norm": 0.9426026221438385, "learning_rate": 2.5045049802761124e-07, "loss": 0.025, "step": 9660 }, { "epoch": 9.020541549953315, "grad_norm": 7.832739197518715, "learning_rate": 2.4997813978476003e-07, "loss": 0.1241, "step": 9661 }, { "epoch": 9.021475256769374, "grad_norm": 0.6921771901371061, "learning_rate": 2.4950621598357206e-07, "loss": 0.0177, "step": 9662 }, { "epoch": 9.022408963585434, "grad_norm": 0.8407561420417966, "learning_rate": 2.490347266672116e-07, "loss": 0.0143, "step": 9663 }, { "epoch": 9.023342670401494, "grad_norm": 1.221233918923729, "learning_rate": 2.4856367187880036e-07, "loss": 0.0249, "step": 9664 }, { "epoch": 9.024276377217554, "grad_norm": 3.708433828137302, "learning_rate": 2.480930516614222e-07, "loss": 0.1075, "step": 9665 }, { "epoch": 9.025210084033613, "grad_norm": 0.7348871515575024, "learning_rate": 2.476228660581187e-07, "loss": 0.0161, "step": 9666 }, { "epoch": 9.026143790849673, "grad_norm": 2.260514290093939, "learning_rate": 2.471531151118961e-07, "loss": 0.0412, "step": 9667 }, { "epoch": 9.027077497665733, "grad_norm": 1.4134375253590892, "learning_rate": 2.466837988657167e-07, "loss": 0.0363, "step": 9668 }, { "epoch": 9.028011204481793, "grad_norm": 1.8257876494826495, "learning_rate": 2.4621491736250504e-07, "loss": 0.0514, "step": 9669 }, { "epoch": 9.028944911297852, "grad_norm": 2.8115710638531177, "learning_rate": 2.4574647064514566e-07, "loss": 0.0605, "step": 9670 }, { "epoch": 9.029878618113912, "grad_norm": 1.3655263868863738, "learning_rate": 2.4527845875648216e-07, "loss": 0.0182, "step": 9671 }, { "epoch": 9.030812324929972, "grad_norm": 4.33021385006256, "learning_rate": 2.448108817393208e-07, "loss": 0.1496, "step": 9672 }, { "epoch": 9.031746031746032, "grad_norm": 1.259305319339036, "learning_rate": 2.4434373963642566e-07, "loss": 0.0194, "step": 9673 }, { "epoch": 9.032679738562091, "grad_norm": 2.23373917572226, "learning_rate": 2.438770324905221e-07, "loss": 0.0377, "step": 9674 }, { "epoch": 9.033613445378151, "grad_norm": 1.7344607615926908, "learning_rate": 2.434107603442959e-07, "loss": 0.055, "step": 9675 }, { "epoch": 9.034547152194211, "grad_norm": 3.816896260190835, "learning_rate": 2.42944923240393e-07, "loss": 0.0508, "step": 9676 }, { "epoch": 9.03548085901027, "grad_norm": 4.6269982817673565, "learning_rate": 2.424795212214193e-07, "loss": 0.1035, "step": 9677 }, { "epoch": 9.03641456582633, "grad_norm": 4.088259615259429, "learning_rate": 2.420145543299396e-07, "loss": 0.143, "step": 9678 }, { "epoch": 9.03734827264239, "grad_norm": 2.6732021376774675, "learning_rate": 2.415500226084816e-07, "loss": 0.0263, "step": 9679 }, { "epoch": 9.03828197945845, "grad_norm": 0.06441279898767721, "learning_rate": 2.4108592609953187e-07, "loss": 0.0005, "step": 9680 }, { "epoch": 9.03921568627451, "grad_norm": 2.8290568337816406, "learning_rate": 2.406222648455364e-07, "loss": 0.0554, "step": 9681 }, { "epoch": 9.04014939309057, "grad_norm": 0.9805333524193757, "learning_rate": 2.401590388889025e-07, "loss": 0.0095, "step": 9682 }, { "epoch": 9.04108309990663, "grad_norm": 0.38873997451724596, "learning_rate": 2.396962482719978e-07, "loss": 0.0038, "step": 9683 }, { "epoch": 9.042016806722689, "grad_norm": 1.0258605987941727, "learning_rate": 2.3923389303714795e-07, "loss": 0.012, "step": 9684 }, { "epoch": 9.042950513538749, "grad_norm": 3.397680124286953, "learning_rate": 2.387719732266419e-07, "loss": 0.0748, "step": 9685 }, { "epoch": 9.043884220354808, "grad_norm": 2.642989723856263, "learning_rate": 2.3831048888272757e-07, "loss": 0.0669, "step": 9686 }, { "epoch": 9.044817927170868, "grad_norm": 1.247089974808008, "learning_rate": 2.3784944004761168e-07, "loss": 0.0174, "step": 9687 }, { "epoch": 9.045751633986928, "grad_norm": 2.6431292140693827, "learning_rate": 2.3738882676346165e-07, "loss": 0.0765, "step": 9688 }, { "epoch": 9.046685340802988, "grad_norm": 1.961992053784457, "learning_rate": 2.3692864907240765e-07, "loss": 0.063, "step": 9689 }, { "epoch": 9.047619047619047, "grad_norm": 2.2653130718848504, "learning_rate": 2.3646890701653658e-07, "loss": 0.0745, "step": 9690 }, { "epoch": 9.048552754435107, "grad_norm": 1.6298371696176415, "learning_rate": 2.3600960063789759e-07, "loss": 0.0427, "step": 9691 }, { "epoch": 9.049486461251167, "grad_norm": 2.780114183800164, "learning_rate": 2.3555072997849758e-07, "loss": 0.1064, "step": 9692 }, { "epoch": 9.050420168067227, "grad_norm": 9.863242106969063, "learning_rate": 2.3509229508030808e-07, "loss": 0.0514, "step": 9693 }, { "epoch": 9.051353874883286, "grad_norm": 1.9536474289348575, "learning_rate": 2.34634295985256e-07, "loss": 0.0364, "step": 9694 }, { "epoch": 9.052287581699346, "grad_norm": 2.257878897827951, "learning_rate": 2.3417673273523122e-07, "loss": 0.0485, "step": 9695 }, { "epoch": 9.053221288515406, "grad_norm": 6.046711204494692, "learning_rate": 2.3371960537208193e-07, "loss": 0.1752, "step": 9696 }, { "epoch": 9.054154995331466, "grad_norm": 2.8730082818857476, "learning_rate": 2.3326291393761914e-07, "loss": 0.0716, "step": 9697 }, { "epoch": 9.055088702147525, "grad_norm": 2.612998603585183, "learning_rate": 2.3280665847361107e-07, "loss": 0.0795, "step": 9698 }, { "epoch": 9.056022408963585, "grad_norm": 3.5269141643569464, "learning_rate": 2.3235083902178768e-07, "loss": 0.0834, "step": 9699 }, { "epoch": 9.056956115779645, "grad_norm": 1.2619796698104842, "learning_rate": 2.3189545562383787e-07, "loss": 0.0251, "step": 9700 }, { "epoch": 9.057889822595705, "grad_norm": 1.717114475519974, "learning_rate": 2.314405083214122e-07, "loss": 0.0194, "step": 9701 }, { "epoch": 9.058823529411764, "grad_norm": 0.5563126141038404, "learning_rate": 2.3098599715612124e-07, "loss": 0.01, "step": 9702 }, { "epoch": 9.059757236227824, "grad_norm": 0.4093411252289521, "learning_rate": 2.30531922169534e-07, "loss": 0.0108, "step": 9703 }, { "epoch": 9.060690943043884, "grad_norm": 0.6221500210450611, "learning_rate": 2.3007828340318117e-07, "loss": 0.006, "step": 9704 }, { "epoch": 9.061624649859944, "grad_norm": 2.3723856909624046, "learning_rate": 2.2962508089855173e-07, "loss": 0.0405, "step": 9705 }, { "epoch": 9.062558356676004, "grad_norm": 0.7406378638388098, "learning_rate": 2.2917231469709812e-07, "loss": 0.0059, "step": 9706 }, { "epoch": 9.063492063492063, "grad_norm": 0.865567098108885, "learning_rate": 2.287199848402294e-07, "loss": 0.0183, "step": 9707 }, { "epoch": 9.064425770308123, "grad_norm": 1.2461653527760614, "learning_rate": 2.2826809136931637e-07, "loss": 0.0211, "step": 9708 }, { "epoch": 9.065359477124183, "grad_norm": 0.6030108442977187, "learning_rate": 2.2781663432568933e-07, "loss": 0.0117, "step": 9709 }, { "epoch": 9.066293183940243, "grad_norm": 0.9054372179111411, "learning_rate": 2.273656137506397e-07, "loss": 0.0164, "step": 9710 }, { "epoch": 9.067226890756302, "grad_norm": 5.423948930104928, "learning_rate": 2.2691502968541778e-07, "loss": 0.1645, "step": 9711 }, { "epoch": 9.068160597572362, "grad_norm": 2.981904757679827, "learning_rate": 2.2646488217123507e-07, "loss": 0.0878, "step": 9712 }, { "epoch": 9.069094304388422, "grad_norm": 1.4587159351832928, "learning_rate": 2.2601517124926087e-07, "loss": 0.0327, "step": 9713 }, { "epoch": 9.070028011204482, "grad_norm": 1.6324472006175823, "learning_rate": 2.2556589696062837e-07, "loss": 0.0429, "step": 9714 }, { "epoch": 9.070961718020541, "grad_norm": 0.8919050606348111, "learning_rate": 2.2511705934642747e-07, "loss": 0.0134, "step": 9715 }, { "epoch": 9.071895424836601, "grad_norm": 2.619537345993, "learning_rate": 2.2466865844770923e-07, "loss": 0.0604, "step": 9716 }, { "epoch": 9.07282913165266, "grad_norm": 0.6126636377368224, "learning_rate": 2.2422069430548477e-07, "loss": 0.0126, "step": 9717 }, { "epoch": 9.07376283846872, "grad_norm": 3.36303734887417, "learning_rate": 2.237731669607257e-07, "loss": 0.0934, "step": 9718 }, { "epoch": 9.07469654528478, "grad_norm": 3.3541521571200845, "learning_rate": 2.2332607645436378e-07, "loss": 0.1238, "step": 9719 }, { "epoch": 9.07563025210084, "grad_norm": 1.2631967163224653, "learning_rate": 2.2287942282728902e-07, "loss": 0.0329, "step": 9720 }, { "epoch": 9.0765639589169, "grad_norm": 3.0196636160978514, "learning_rate": 2.2243320612035435e-07, "loss": 0.0354, "step": 9721 }, { "epoch": 9.07749766573296, "grad_norm": 2.357830862871937, "learning_rate": 2.2198742637436932e-07, "loss": 0.0795, "step": 9722 }, { "epoch": 9.07843137254902, "grad_norm": 1.515087431357609, "learning_rate": 2.2154208363010688e-07, "loss": 0.0526, "step": 9723 }, { "epoch": 9.079365079365079, "grad_norm": 5.740377779545578, "learning_rate": 2.210971779282983e-07, "loss": 0.0734, "step": 9724 }, { "epoch": 9.080298786181139, "grad_norm": 3.844177515584222, "learning_rate": 2.20652709309635e-07, "loss": 0.1101, "step": 9725 }, { "epoch": 9.081232492997199, "grad_norm": 0.9659273365484969, "learning_rate": 2.202086778147672e-07, "loss": 0.0138, "step": 9726 }, { "epoch": 9.082166199813258, "grad_norm": 1.8378490564623593, "learning_rate": 2.1976508348430903e-07, "loss": 0.0483, "step": 9727 }, { "epoch": 9.083099906629318, "grad_norm": 0.5805204742039921, "learning_rate": 2.1932192635883032e-07, "loss": 0.0099, "step": 9728 }, { "epoch": 9.084033613445378, "grad_norm": 6.841369464070113, "learning_rate": 2.1887920647886308e-07, "loss": 0.0515, "step": 9729 }, { "epoch": 9.084967320261438, "grad_norm": 2.029722933879788, "learning_rate": 2.1843692388489767e-07, "loss": 0.0255, "step": 9730 }, { "epoch": 9.085901027077497, "grad_norm": 1.9190612998357828, "learning_rate": 2.179950786173879e-07, "loss": 0.0253, "step": 9731 }, { "epoch": 9.086834733893557, "grad_norm": 1.6444643930017429, "learning_rate": 2.175536707167447e-07, "loss": 0.0422, "step": 9732 }, { "epoch": 9.087768440709617, "grad_norm": 2.3272529777545277, "learning_rate": 2.1711270022333862e-07, "loss": 0.0371, "step": 9733 }, { "epoch": 9.088702147525677, "grad_norm": 3.9653592227393935, "learning_rate": 2.1667216717750184e-07, "loss": 0.1321, "step": 9734 }, { "epoch": 9.089635854341736, "grad_norm": 0.566007802364223, "learning_rate": 2.1623207161952598e-07, "loss": 0.0099, "step": 9735 }, { "epoch": 9.090569561157796, "grad_norm": 1.807663475876302, "learning_rate": 2.157924135896633e-07, "loss": 0.0378, "step": 9736 }, { "epoch": 9.091503267973856, "grad_norm": 1.8166943081529336, "learning_rate": 2.1535319312812442e-07, "loss": 0.0435, "step": 9737 }, { "epoch": 9.092436974789916, "grad_norm": 2.776381004819692, "learning_rate": 2.1491441027508165e-07, "loss": 0.0439, "step": 9738 }, { "epoch": 9.093370681605975, "grad_norm": 4.017421133025338, "learning_rate": 2.1447606507066454e-07, "loss": 0.0828, "step": 9739 }, { "epoch": 9.094304388422035, "grad_norm": 5.522257936737767, "learning_rate": 2.1403815755496716e-07, "loss": 0.0758, "step": 9740 }, { "epoch": 9.095238095238095, "grad_norm": 1.0299590904126719, "learning_rate": 2.1360068776804023e-07, "loss": 0.0152, "step": 9741 }, { "epoch": 9.096171802054155, "grad_norm": 3.451437462673941, "learning_rate": 2.131636557498945e-07, "loss": 0.0652, "step": 9742 }, { "epoch": 9.097105508870214, "grad_norm": 2.363999075678538, "learning_rate": 2.1272706154050084e-07, "loss": 0.0315, "step": 9743 }, { "epoch": 9.098039215686274, "grad_norm": 0.3331602034401201, "learning_rate": 2.1229090517979222e-07, "loss": 0.0068, "step": 9744 }, { "epoch": 9.098972922502334, "grad_norm": 0.5418103557302801, "learning_rate": 2.1185518670765902e-07, "loss": 0.0115, "step": 9745 }, { "epoch": 9.099906629318394, "grad_norm": 0.6014949075158624, "learning_rate": 2.1141990616395215e-07, "loss": 0.0115, "step": 9746 }, { "epoch": 9.100840336134453, "grad_norm": 0.43692575712064424, "learning_rate": 2.1098506358848302e-07, "loss": 0.0073, "step": 9747 }, { "epoch": 9.101774042950513, "grad_norm": 2.9087748594112735, "learning_rate": 2.1055065902102378e-07, "loss": 0.0518, "step": 9748 }, { "epoch": 9.102707749766573, "grad_norm": 0.444618864300826, "learning_rate": 2.101166925013043e-07, "loss": 0.0152, "step": 9749 }, { "epoch": 9.103641456582633, "grad_norm": 1.634757158003221, "learning_rate": 2.096831640690161e-07, "loss": 0.0415, "step": 9750 }, { "epoch": 9.104575163398692, "grad_norm": 1.9290447882854473, "learning_rate": 2.0925007376380923e-07, "loss": 0.0485, "step": 9751 }, { "epoch": 9.105508870214752, "grad_norm": 3.3260474366517943, "learning_rate": 2.0881742162529584e-07, "loss": 0.0544, "step": 9752 }, { "epoch": 9.106442577030812, "grad_norm": 2.8336804082555482, "learning_rate": 2.0838520769304594e-07, "loss": 0.0908, "step": 9753 }, { "epoch": 9.107376283846872, "grad_norm": 0.48963359273273194, "learning_rate": 2.0795343200659012e-07, "loss": 0.0115, "step": 9754 }, { "epoch": 9.108309990662931, "grad_norm": 1.2375724427696069, "learning_rate": 2.0752209460542015e-07, "loss": 0.0192, "step": 9755 }, { "epoch": 9.109243697478991, "grad_norm": 1.5060293897793413, "learning_rate": 2.070911955289856e-07, "loss": 0.0383, "step": 9756 }, { "epoch": 9.110177404295051, "grad_norm": 0.6367844739340076, "learning_rate": 2.0666073481669714e-07, "loss": 0.019, "step": 9757 }, { "epoch": 9.11111111111111, "grad_norm": 3.0113744301416867, "learning_rate": 2.0623071250792492e-07, "loss": 0.0767, "step": 9758 }, { "epoch": 9.11204481792717, "grad_norm": 1.535126476943611, "learning_rate": 2.058011286419992e-07, "loss": 0.031, "step": 9759 }, { "epoch": 9.11297852474323, "grad_norm": 6.108724414730541, "learning_rate": 2.0537198325821073e-07, "loss": 0.0664, "step": 9760 }, { "epoch": 9.11391223155929, "grad_norm": 2.5136622223265044, "learning_rate": 2.0494327639580868e-07, "loss": 0.1148, "step": 9761 }, { "epoch": 9.11484593837535, "grad_norm": 0.41179410630742724, "learning_rate": 2.0451500809400338e-07, "loss": 0.0017, "step": 9762 }, { "epoch": 9.11577964519141, "grad_norm": 0.8488523018530374, "learning_rate": 2.040871783919657e-07, "loss": 0.0144, "step": 9763 }, { "epoch": 9.11671335200747, "grad_norm": 0.5820941008145419, "learning_rate": 2.036597873288232e-07, "loss": 0.0024, "step": 9764 }, { "epoch": 9.117647058823529, "grad_norm": 2.3586251790929493, "learning_rate": 2.032328349436674e-07, "loss": 0.0416, "step": 9765 }, { "epoch": 9.118580765639589, "grad_norm": 0.35364703462898894, "learning_rate": 2.0280632127554712e-07, "loss": 0.0026, "step": 9766 }, { "epoch": 9.119514472455649, "grad_norm": 0.33750995455447425, "learning_rate": 2.0238024636347164e-07, "loss": 0.0013, "step": 9767 }, { "epoch": 9.120448179271708, "grad_norm": 3.404723571963404, "learning_rate": 2.0195461024640982e-07, "loss": 0.078, "step": 9768 }, { "epoch": 9.121381886087768, "grad_norm": 0.5529864079472007, "learning_rate": 2.0152941296329165e-07, "loss": 0.0033, "step": 9769 }, { "epoch": 9.122315592903828, "grad_norm": 1.4845005374647622, "learning_rate": 2.01104654553006e-07, "loss": 0.0322, "step": 9770 }, { "epoch": 9.123249299719888, "grad_norm": 4.233124596067021, "learning_rate": 2.0068033505440066e-07, "loss": 0.1107, "step": 9771 }, { "epoch": 9.124183006535947, "grad_norm": 1.1332249002700923, "learning_rate": 2.0025645450628517e-07, "loss": 0.0223, "step": 9772 }, { "epoch": 9.125116713352007, "grad_norm": 2.2564990702989056, "learning_rate": 1.998330129474274e-07, "loss": 0.0473, "step": 9773 }, { "epoch": 9.126050420168067, "grad_norm": 1.9583332155969444, "learning_rate": 1.9941001041655695e-07, "loss": 0.0302, "step": 9774 }, { "epoch": 9.126984126984127, "grad_norm": 4.574685783612567, "learning_rate": 1.9898744695236116e-07, "loss": 0.131, "step": 9775 }, { "epoch": 9.127917833800186, "grad_norm": 2.523463997901474, "learning_rate": 1.9856532259348804e-07, "loss": 0.037, "step": 9776 }, { "epoch": 9.128851540616246, "grad_norm": 5.681724793276045, "learning_rate": 1.9814363737854502e-07, "loss": 0.0676, "step": 9777 }, { "epoch": 9.129785247432306, "grad_norm": 0.4895580834305514, "learning_rate": 1.9772239134610128e-07, "loss": 0.0023, "step": 9778 }, { "epoch": 9.130718954248366, "grad_norm": 1.779853847871281, "learning_rate": 1.9730158453468373e-07, "loss": 0.0263, "step": 9779 }, { "epoch": 9.131652661064425, "grad_norm": 7.1092072087516405, "learning_rate": 1.9688121698277995e-07, "loss": 0.1068, "step": 9780 }, { "epoch": 9.132586367880485, "grad_norm": 2.747893621338964, "learning_rate": 1.9646128872883584e-07, "loss": 0.0767, "step": 9781 }, { "epoch": 9.133520074696545, "grad_norm": 8.828840280367231, "learning_rate": 1.9604179981126014e-07, "loss": 0.1178, "step": 9782 }, { "epoch": 9.134453781512605, "grad_norm": 6.590320292527322, "learning_rate": 1.956227502684188e-07, "loss": 0.1496, "step": 9783 }, { "epoch": 9.135387488328664, "grad_norm": 0.45119845831308647, "learning_rate": 1.9520414013863898e-07, "loss": 0.0114, "step": 9784 }, { "epoch": 9.136321195144724, "grad_norm": 0.5351755794279031, "learning_rate": 1.9478596946020722e-07, "loss": 0.0077, "step": 9785 }, { "epoch": 9.137254901960784, "grad_norm": 0.925978483296507, "learning_rate": 1.9436823827136852e-07, "loss": 0.0167, "step": 9786 }, { "epoch": 9.138188608776844, "grad_norm": 3.797012492775718, "learning_rate": 1.939509466103312e-07, "loss": 0.0859, "step": 9787 }, { "epoch": 9.139122315592903, "grad_norm": 2.966571146086511, "learning_rate": 1.9353409451525974e-07, "loss": 0.1222, "step": 9788 }, { "epoch": 9.140056022408963, "grad_norm": 1.8794074729047625, "learning_rate": 1.9311768202427973e-07, "loss": 0.035, "step": 9789 }, { "epoch": 9.140989729225023, "grad_norm": 0.4692417965823863, "learning_rate": 1.927017091754768e-07, "loss": 0.01, "step": 9790 }, { "epoch": 9.141923436041083, "grad_norm": 2.8804581046136333, "learning_rate": 1.9228617600689724e-07, "loss": 0.0236, "step": 9791 }, { "epoch": 9.142857142857142, "grad_norm": 0.46442628712868583, "learning_rate": 1.9187108255654508e-07, "loss": 0.0085, "step": 9792 }, { "epoch": 9.143790849673202, "grad_norm": 1.5738804693702246, "learning_rate": 1.9145642886238548e-07, "loss": 0.0308, "step": 9793 }, { "epoch": 9.144724556489262, "grad_norm": 0.5221081223099368, "learning_rate": 1.9104221496234254e-07, "loss": 0.0037, "step": 9794 }, { "epoch": 9.145658263305322, "grad_norm": 0.39672810312024204, "learning_rate": 1.9062844089430155e-07, "loss": 0.0087, "step": 9795 }, { "epoch": 9.146591970121381, "grad_norm": 1.7632229305538532, "learning_rate": 1.9021510669610666e-07, "loss": 0.0564, "step": 9796 }, { "epoch": 9.147525676937441, "grad_norm": 1.924273563698189, "learning_rate": 1.8980221240556096e-07, "loss": 0.0349, "step": 9797 }, { "epoch": 9.1484593837535, "grad_norm": 0.9552838372596464, "learning_rate": 1.893897580604287e-07, "loss": 0.0273, "step": 9798 }, { "epoch": 9.14939309056956, "grad_norm": 0.988869359895618, "learning_rate": 1.8897774369843357e-07, "loss": 0.0062, "step": 9799 }, { "epoch": 9.15032679738562, "grad_norm": 2.8524249129206467, "learning_rate": 1.8856616935725825e-07, "loss": 0.1055, "step": 9800 }, { "epoch": 9.15126050420168, "grad_norm": 3.312044723342034, "learning_rate": 1.8815503507454647e-07, "loss": 0.0658, "step": 9801 }, { "epoch": 9.15219421101774, "grad_norm": 2.5444296027068622, "learning_rate": 1.8774434088790094e-07, "loss": 0.062, "step": 9802 }, { "epoch": 9.1531279178338, "grad_norm": 3.8454052120733544, "learning_rate": 1.8733408683488273e-07, "loss": 0.1051, "step": 9803 }, { "epoch": 9.15406162464986, "grad_norm": 2.3225940153803406, "learning_rate": 1.8692427295301574e-07, "loss": 0.0847, "step": 9804 }, { "epoch": 9.15499533146592, "grad_norm": 1.820502965706585, "learning_rate": 1.8651489927978105e-07, "loss": 0.0176, "step": 9805 }, { "epoch": 9.155929038281979, "grad_norm": 1.7936512418223782, "learning_rate": 1.8610596585262097e-07, "loss": 0.0336, "step": 9806 }, { "epoch": 9.156862745098039, "grad_norm": 1.2577297911663348, "learning_rate": 1.8569747270893613e-07, "loss": 0.0304, "step": 9807 }, { "epoch": 9.157796451914098, "grad_norm": 4.200779295491912, "learning_rate": 1.8528941988608884e-07, "loss": 0.0758, "step": 9808 }, { "epoch": 9.158730158730158, "grad_norm": 10.967150514443203, "learning_rate": 1.8488180742139983e-07, "loss": 0.1399, "step": 9809 }, { "epoch": 9.159663865546218, "grad_norm": 4.823309347486886, "learning_rate": 1.8447463535214872e-07, "loss": 0.0549, "step": 9810 }, { "epoch": 9.160597572362278, "grad_norm": 3.017717802789207, "learning_rate": 1.840679037155757e-07, "loss": 0.0729, "step": 9811 }, { "epoch": 9.161531279178337, "grad_norm": 1.219443165047917, "learning_rate": 1.836616125488827e-07, "loss": 0.0126, "step": 9812 }, { "epoch": 9.162464985994397, "grad_norm": 7.580793614490642, "learning_rate": 1.8325576188922833e-07, "loss": 0.0323, "step": 9813 }, { "epoch": 9.163398692810457, "grad_norm": 0.6801773159759774, "learning_rate": 1.8285035177373178e-07, "loss": 0.0166, "step": 9814 }, { "epoch": 9.164332399626517, "grad_norm": 1.095470832718292, "learning_rate": 1.8244538223947228e-07, "loss": 0.0226, "step": 9815 }, { "epoch": 9.165266106442576, "grad_norm": 3.7434334391222643, "learning_rate": 1.820408533234902e-07, "loss": 0.0941, "step": 9816 }, { "epoch": 9.166199813258636, "grad_norm": 0.7683670759501294, "learning_rate": 1.816367650627826e-07, "loss": 0.0135, "step": 9817 }, { "epoch": 9.167133520074696, "grad_norm": 1.8521431895272527, "learning_rate": 1.8123311749430828e-07, "loss": 0.0199, "step": 9818 }, { "epoch": 9.168067226890756, "grad_norm": 1.4333413308613152, "learning_rate": 1.808299106549849e-07, "loss": 0.0217, "step": 9819 }, { "epoch": 9.169000933706815, "grad_norm": 3.126624818843029, "learning_rate": 1.8042714458169019e-07, "loss": 0.0235, "step": 9820 }, { "epoch": 9.169934640522875, "grad_norm": 2.594830246926999, "learning_rate": 1.8002481931126248e-07, "loss": 0.0297, "step": 9821 }, { "epoch": 9.170868347338935, "grad_norm": 1.3436114034430493, "learning_rate": 1.7962293488049842e-07, "loss": 0.0266, "step": 9822 }, { "epoch": 9.171802054154995, "grad_norm": 5.247468377993993, "learning_rate": 1.792214913261542e-07, "loss": 0.1625, "step": 9823 }, { "epoch": 9.172735760971054, "grad_norm": 6.888528900070457, "learning_rate": 1.7882048868494595e-07, "loss": 0.1134, "step": 9824 }, { "epoch": 9.173669467787114, "grad_norm": 1.6743054952219665, "learning_rate": 1.7841992699355048e-07, "loss": 0.0266, "step": 9825 }, { "epoch": 9.174603174603174, "grad_norm": 2.55835544716496, "learning_rate": 1.7801980628860404e-07, "loss": 0.0456, "step": 9826 }, { "epoch": 9.175536881419234, "grad_norm": 2.320998614642498, "learning_rate": 1.776201266067007e-07, "loss": 0.045, "step": 9827 }, { "epoch": 9.176470588235293, "grad_norm": 3.354537106334845, "learning_rate": 1.772208879843973e-07, "loss": 0.0637, "step": 9828 }, { "epoch": 9.177404295051353, "grad_norm": 2.7456258615063414, "learning_rate": 1.7682209045820687e-07, "loss": 0.0372, "step": 9829 }, { "epoch": 9.178338001867413, "grad_norm": 1.7084548305947318, "learning_rate": 1.7642373406460522e-07, "loss": 0.0327, "step": 9830 }, { "epoch": 9.179271708683473, "grad_norm": 0.6260325016773944, "learning_rate": 1.760258188400249e-07, "loss": 0.0111, "step": 9831 }, { "epoch": 9.180205415499533, "grad_norm": 1.1655617333436659, "learning_rate": 1.7562834482086065e-07, "loss": 0.0235, "step": 9832 }, { "epoch": 9.181139122315592, "grad_norm": 2.4695565590844093, "learning_rate": 1.7523131204346622e-07, "loss": 0.0557, "step": 9833 }, { "epoch": 9.182072829131652, "grad_norm": 0.40637026554779515, "learning_rate": 1.748347205441536e-07, "loss": 0.0028, "step": 9834 }, { "epoch": 9.183006535947712, "grad_norm": 2.3423020819277065, "learning_rate": 1.744385703591961e-07, "loss": 0.0572, "step": 9835 }, { "epoch": 9.183940242763772, "grad_norm": 4.029751127015521, "learning_rate": 1.7404286152482573e-07, "loss": 0.0577, "step": 9836 }, { "epoch": 9.184873949579831, "grad_norm": 0.36099409981564196, "learning_rate": 1.7364759407723474e-07, "loss": 0.0129, "step": 9837 }, { "epoch": 9.185807656395891, "grad_norm": 2.556833054336472, "learning_rate": 1.7325276805257362e-07, "loss": 0.0594, "step": 9838 }, { "epoch": 9.18674136321195, "grad_norm": 3.481516332788769, "learning_rate": 1.7285838348695515e-07, "loss": 0.0853, "step": 9839 }, { "epoch": 9.18767507002801, "grad_norm": 1.016561747500692, "learning_rate": 1.724644404164494e-07, "loss": 0.016, "step": 9840 }, { "epoch": 9.18860877684407, "grad_norm": 0.7035763208698663, "learning_rate": 1.7207093887708636e-07, "loss": 0.0103, "step": 9841 }, { "epoch": 9.18954248366013, "grad_norm": 4.365808379702238, "learning_rate": 1.7167787890485675e-07, "loss": 0.1416, "step": 9842 }, { "epoch": 9.19047619047619, "grad_norm": 1.1070347353282872, "learning_rate": 1.712852605357096e-07, "loss": 0.0304, "step": 9843 }, { "epoch": 9.19140989729225, "grad_norm": 1.1980565285744575, "learning_rate": 1.7089308380555446e-07, "loss": 0.0136, "step": 9844 }, { "epoch": 9.19234360410831, "grad_norm": 3.21552206839146, "learning_rate": 1.7050134875025993e-07, "loss": 0.018, "step": 9845 }, { "epoch": 9.193277310924369, "grad_norm": 1.3887164966909968, "learning_rate": 1.701100554056556e-07, "loss": 0.0218, "step": 9846 }, { "epoch": 9.194211017740429, "grad_norm": 6.2141819043052395, "learning_rate": 1.6971920380752848e-07, "loss": 0.1401, "step": 9847 }, { "epoch": 9.195144724556489, "grad_norm": 25.79870907656801, "learning_rate": 1.6932879399162662e-07, "loss": 0.1278, "step": 9848 }, { "epoch": 9.196078431372548, "grad_norm": 4.339633963921209, "learning_rate": 1.6893882599365586e-07, "loss": 0.0408, "step": 9849 }, { "epoch": 9.197012138188608, "grad_norm": 3.5992231573017057, "learning_rate": 1.6854929984928548e-07, "loss": 0.0524, "step": 9850 }, { "epoch": 9.197945845004668, "grad_norm": 1.5869906177923792, "learning_rate": 1.6816021559414086e-07, "loss": 0.0224, "step": 9851 }, { "epoch": 9.198879551820728, "grad_norm": 3.688507737264072, "learning_rate": 1.67771573263808e-07, "loss": 0.0749, "step": 9852 }, { "epoch": 9.199813258636787, "grad_norm": 0.803046096184669, "learning_rate": 1.6738337289383177e-07, "loss": 0.0142, "step": 9853 }, { "epoch": 9.200746965452847, "grad_norm": 2.0802500921273603, "learning_rate": 1.6699561451971824e-07, "loss": 0.044, "step": 9854 }, { "epoch": 9.201680672268907, "grad_norm": 1.0723264215311645, "learning_rate": 1.6660829817693237e-07, "loss": 0.0218, "step": 9855 }, { "epoch": 9.202614379084967, "grad_norm": 2.3666061247938663, "learning_rate": 1.66221423900898e-07, "loss": 0.0599, "step": 9856 }, { "epoch": 9.203548085901026, "grad_norm": 1.7180332905472535, "learning_rate": 1.6583499172699914e-07, "loss": 0.0427, "step": 9857 }, { "epoch": 9.204481792717086, "grad_norm": 0.7260590042346071, "learning_rate": 1.6544900169057854e-07, "loss": 0.0154, "step": 9858 }, { "epoch": 9.205415499533146, "grad_norm": 4.5329757020623775, "learning_rate": 1.6506345382694022e-07, "loss": 0.1078, "step": 9859 }, { "epoch": 9.206349206349206, "grad_norm": 2.782000472339139, "learning_rate": 1.646783481713471e-07, "loss": 0.1242, "step": 9860 }, { "epoch": 9.207282913165265, "grad_norm": 1.7819257443347123, "learning_rate": 1.6429368475902042e-07, "loss": 0.0525, "step": 9861 }, { "epoch": 9.208216619981325, "grad_norm": 1.2211839726354161, "learning_rate": 1.639094636251415e-07, "loss": 0.0223, "step": 9862 }, { "epoch": 9.209150326797385, "grad_norm": 0.8862730420733717, "learning_rate": 1.6352568480485277e-07, "loss": 0.0098, "step": 9863 }, { "epoch": 9.210084033613445, "grad_norm": 0.08853666388750492, "learning_rate": 1.6314234833325448e-07, "loss": 0.0003, "step": 9864 }, { "epoch": 9.211017740429504, "grad_norm": 3.333146976685505, "learning_rate": 1.627594542454075e-07, "loss": 0.0371, "step": 9865 }, { "epoch": 9.211951447245564, "grad_norm": 1.9150634212646724, "learning_rate": 1.6237700257632993e-07, "loss": 0.0375, "step": 9866 }, { "epoch": 9.212885154061624, "grad_norm": 0.49133465328451087, "learning_rate": 1.619949933610032e-07, "loss": 0.009, "step": 9867 }, { "epoch": 9.213818860877684, "grad_norm": 0.7666922841458759, "learning_rate": 1.6161342663436552e-07, "loss": 0.0121, "step": 9868 }, { "epoch": 9.214752567693743, "grad_norm": 0.5678233870429064, "learning_rate": 1.612323024313156e-07, "loss": 0.0124, "step": 9869 }, { "epoch": 9.215686274509803, "grad_norm": 1.078332646836353, "learning_rate": 1.6085162078671058e-07, "loss": 0.0124, "step": 9870 }, { "epoch": 9.216619981325863, "grad_norm": 2.2468773445872325, "learning_rate": 1.6047138173536813e-07, "loss": 0.0363, "step": 9871 }, { "epoch": 9.217553688141923, "grad_norm": 3.85158493023847, "learning_rate": 1.6009158531206603e-07, "loss": 0.092, "step": 9872 }, { "epoch": 9.218487394957982, "grad_norm": 1.8088072834194326, "learning_rate": 1.597122315515409e-07, "loss": 0.0406, "step": 9873 }, { "epoch": 9.219421101774042, "grad_norm": 2.408411437900749, "learning_rate": 1.5933332048848838e-07, "loss": 0.0212, "step": 9874 }, { "epoch": 9.220354808590102, "grad_norm": 1.306719572787242, "learning_rate": 1.5895485215756346e-07, "loss": 0.0123, "step": 9875 }, { "epoch": 9.221288515406162, "grad_norm": 2.0782136747562348, "learning_rate": 1.585768265933818e-07, "loss": 0.0328, "step": 9876 }, { "epoch": 9.222222222222221, "grad_norm": 0.05635988491964244, "learning_rate": 1.5819924383051854e-07, "loss": 0.0004, "step": 9877 }, { "epoch": 9.223155929038281, "grad_norm": 3.333187392006616, "learning_rate": 1.5782210390350717e-07, "loss": 0.0841, "step": 9878 }, { "epoch": 9.224089635854341, "grad_norm": 2.240250678936599, "learning_rate": 1.5744540684684118e-07, "loss": 0.0297, "step": 9879 }, { "epoch": 9.2250233426704, "grad_norm": 3.5806237116128834, "learning_rate": 1.5706915269497358e-07, "loss": 0.1116, "step": 9880 }, { "epoch": 9.22595704948646, "grad_norm": 4.59969897616845, "learning_rate": 1.5669334148231796e-07, "loss": 0.1124, "step": 9881 }, { "epoch": 9.22689075630252, "grad_norm": 2.013716971195625, "learning_rate": 1.5631797324324517e-07, "loss": 0.0598, "step": 9882 }, { "epoch": 9.22782446311858, "grad_norm": 2.0707579389402255, "learning_rate": 1.559430480120866e-07, "loss": 0.0322, "step": 9883 }, { "epoch": 9.22875816993464, "grad_norm": 3.8340028294500375, "learning_rate": 1.5556856582313483e-07, "loss": 0.0662, "step": 9884 }, { "epoch": 9.2296918767507, "grad_norm": 2.235240239021059, "learning_rate": 1.5519452671063916e-07, "loss": 0.06, "step": 9885 }, { "epoch": 9.23062558356676, "grad_norm": 0.17916625641390746, "learning_rate": 1.5482093070880944e-07, "loss": 0.0004, "step": 9886 }, { "epoch": 9.231559290382819, "grad_norm": 0.8792999453564222, "learning_rate": 1.5444777785181608e-07, "loss": 0.0175, "step": 9887 }, { "epoch": 9.232492997198879, "grad_norm": 0.7390748910539496, "learning_rate": 1.5407506817378683e-07, "loss": 0.0051, "step": 9888 }, { "epoch": 9.233426704014938, "grad_norm": 2.3511181226065756, "learning_rate": 1.5370280170881102e-07, "loss": 0.0587, "step": 9889 }, { "epoch": 9.234360410830998, "grad_norm": 0.8157431755741248, "learning_rate": 1.5333097849093647e-07, "loss": 0.0133, "step": 9890 }, { "epoch": 9.235294117647058, "grad_norm": 3.5855586549679153, "learning_rate": 1.5295959855416986e-07, "loss": 0.0284, "step": 9891 }, { "epoch": 9.236227824463118, "grad_norm": 5.876957611706943, "learning_rate": 1.525886619324779e-07, "loss": 0.2229, "step": 9892 }, { "epoch": 9.237161531279177, "grad_norm": 3.508437073201916, "learning_rate": 1.522181686597879e-07, "loss": 0.0755, "step": 9893 }, { "epoch": 9.238095238095237, "grad_norm": 2.397903239761319, "learning_rate": 1.5184811876998495e-07, "loss": 0.0757, "step": 9894 }, { "epoch": 9.239028944911297, "grad_norm": 1.2690060087092312, "learning_rate": 1.514785122969137e-07, "loss": 0.0376, "step": 9895 }, { "epoch": 9.239962651727357, "grad_norm": 7.866452271799201, "learning_rate": 1.511093492743787e-07, "loss": 0.2139, "step": 9896 }, { "epoch": 9.240896358543417, "grad_norm": 1.728101615195607, "learning_rate": 1.5074062973614523e-07, "loss": 0.0231, "step": 9897 }, { "epoch": 9.241830065359476, "grad_norm": 1.9509484670111794, "learning_rate": 1.5037235371593573e-07, "loss": 0.0467, "step": 9898 }, { "epoch": 9.242763772175536, "grad_norm": 1.6737562844800276, "learning_rate": 1.5000452124743326e-07, "loss": 0.025, "step": 9899 }, { "epoch": 9.243697478991596, "grad_norm": 2.03348925501884, "learning_rate": 1.496371323642798e-07, "loss": 0.039, "step": 9900 }, { "epoch": 9.244631185807656, "grad_norm": 1.429429723130202, "learning_rate": 1.4927018710007734e-07, "loss": 0.0351, "step": 9901 }, { "epoch": 9.245564892623715, "grad_norm": 1.8110994127175088, "learning_rate": 1.489036854883874e-07, "loss": 0.0454, "step": 9902 }, { "epoch": 9.246498599439775, "grad_norm": 2.5473193817822835, "learning_rate": 1.4853762756273094e-07, "loss": 0.0927, "step": 9903 }, { "epoch": 9.247432306255835, "grad_norm": 3.240477237071935, "learning_rate": 1.4817201335658616e-07, "loss": 0.0983, "step": 9904 }, { "epoch": 9.248366013071895, "grad_norm": 4.303178135597486, "learning_rate": 1.478068429033941e-07, "loss": 0.1414, "step": 9905 }, { "epoch": 9.249299719887954, "grad_norm": 0.1548396702401375, "learning_rate": 1.4744211623655357e-07, "loss": 0.0015, "step": 9906 }, { "epoch": 9.250233426704014, "grad_norm": 0.20096621275033352, "learning_rate": 1.4707783338942183e-07, "loss": 0.0002, "step": 9907 }, { "epoch": 9.251167133520074, "grad_norm": 0.6928880125146177, "learning_rate": 1.4671399439531775e-07, "loss": 0.0065, "step": 9908 }, { "epoch": 9.252100840336134, "grad_norm": 0.9405108448502358, "learning_rate": 1.4635059928751804e-07, "loss": 0.0152, "step": 9909 }, { "epoch": 9.253034547152193, "grad_norm": 2.0836234128582842, "learning_rate": 1.4598764809925836e-07, "loss": 0.0379, "step": 9910 }, { "epoch": 9.253968253968253, "grad_norm": 2.005025572675043, "learning_rate": 1.4562514086373547e-07, "loss": 0.0305, "step": 9911 }, { "epoch": 9.254901960784313, "grad_norm": 1.3930182096633235, "learning_rate": 1.4526307761410395e-07, "loss": 0.0124, "step": 9912 }, { "epoch": 9.255835667600373, "grad_norm": 1.4584932209581387, "learning_rate": 1.4490145838347958e-07, "loss": 0.0118, "step": 9913 }, { "epoch": 9.256769374416432, "grad_norm": 0.983437200062103, "learning_rate": 1.4454028320493474e-07, "loss": 0.0245, "step": 9914 }, { "epoch": 9.257703081232492, "grad_norm": 3.262768426360356, "learning_rate": 1.441795521115047e-07, "loss": 0.0709, "step": 9915 }, { "epoch": 9.258636788048554, "grad_norm": 1.4532011930034057, "learning_rate": 1.4381926513618139e-07, "loss": 0.0335, "step": 9916 }, { "epoch": 9.259570494864613, "grad_norm": 0.770012567694707, "learning_rate": 1.434594223119168e-07, "loss": 0.0071, "step": 9917 }, { "epoch": 9.260504201680673, "grad_norm": 2.3072665211956296, "learning_rate": 1.431000236716218e-07, "loss": 0.0887, "step": 9918 }, { "epoch": 9.261437908496733, "grad_norm": 0.9554571837804875, "learning_rate": 1.4274106924816954e-07, "loss": 0.0063, "step": 9919 }, { "epoch": 9.262371615312793, "grad_norm": 2.6371903271131947, "learning_rate": 1.4238255907438882e-07, "loss": 0.061, "step": 9920 }, { "epoch": 9.263305322128852, "grad_norm": 0.5843052121016661, "learning_rate": 1.4202449318306943e-07, "loss": 0.0098, "step": 9921 }, { "epoch": 9.264239028944912, "grad_norm": 2.3665485193727163, "learning_rate": 1.4166687160696025e-07, "loss": 0.0535, "step": 9922 }, { "epoch": 9.265172735760972, "grad_norm": 5.730314529812215, "learning_rate": 1.4130969437877007e-07, "loss": 0.0885, "step": 9923 }, { "epoch": 9.266106442577032, "grad_norm": 4.258561849913068, "learning_rate": 1.409529615311672e-07, "loss": 0.098, "step": 9924 }, { "epoch": 9.267040149393091, "grad_norm": 2.7308636924549794, "learning_rate": 1.4059667309677783e-07, "loss": 0.0756, "step": 9925 }, { "epoch": 9.267973856209151, "grad_norm": 1.8589987950922264, "learning_rate": 1.402408291081886e-07, "loss": 0.017, "step": 9926 }, { "epoch": 9.268907563025211, "grad_norm": 2.900065008349881, "learning_rate": 1.3988542959794627e-07, "loss": 0.0459, "step": 9927 }, { "epoch": 9.26984126984127, "grad_norm": 3.2880691343872694, "learning_rate": 1.3953047459855483e-07, "loss": 0.0756, "step": 9928 }, { "epoch": 9.27077497665733, "grad_norm": 2.8121598594261887, "learning_rate": 1.3917596414247947e-07, "loss": 0.0367, "step": 9929 }, { "epoch": 9.27170868347339, "grad_norm": 0.8415057327673078, "learning_rate": 1.3882189826214365e-07, "loss": 0.0181, "step": 9930 }, { "epoch": 9.27264239028945, "grad_norm": 2.9299285447771415, "learning_rate": 1.3846827698993094e-07, "loss": 0.0709, "step": 9931 }, { "epoch": 9.27357609710551, "grad_norm": 0.25272834356866913, "learning_rate": 1.3811510035818433e-07, "loss": 0.0024, "step": 9932 }, { "epoch": 9.27450980392157, "grad_norm": 0.69826531038286, "learning_rate": 1.3776236839920464e-07, "loss": 0.0071, "step": 9933 }, { "epoch": 9.27544351073763, "grad_norm": 1.7843436586393218, "learning_rate": 1.3741008114525388e-07, "loss": 0.0381, "step": 9934 }, { "epoch": 9.276377217553689, "grad_norm": 0.2259359104551991, "learning_rate": 1.3705823862855183e-07, "loss": 0.0006, "step": 9935 }, { "epoch": 9.277310924369749, "grad_norm": 3.4726301808600093, "learning_rate": 1.367068408812794e-07, "loss": 0.1328, "step": 9936 }, { "epoch": 9.278244631185808, "grad_norm": 1.6342145088022575, "learning_rate": 1.363558879355753e-07, "loss": 0.0257, "step": 9937 }, { "epoch": 9.279178338001868, "grad_norm": 1.976530785446927, "learning_rate": 1.360053798235378e-07, "loss": 0.0275, "step": 9938 }, { "epoch": 9.280112044817928, "grad_norm": 3.453953771721737, "learning_rate": 1.3565531657722398e-07, "loss": 0.1064, "step": 9939 }, { "epoch": 9.281045751633988, "grad_norm": 6.176514898364429, "learning_rate": 1.353056982286527e-07, "loss": 0.0943, "step": 9940 }, { "epoch": 9.281979458450047, "grad_norm": 2.4226126267457015, "learning_rate": 1.3495652480979947e-07, "loss": 0.0743, "step": 9941 }, { "epoch": 9.282913165266107, "grad_norm": 1.2059991820654863, "learning_rate": 1.3460779635260045e-07, "loss": 0.0269, "step": 9942 }, { "epoch": 9.283846872082167, "grad_norm": 1.297098575374863, "learning_rate": 1.3425951288894955e-07, "loss": 0.0232, "step": 9943 }, { "epoch": 9.284780578898227, "grad_norm": 4.064798865080608, "learning_rate": 1.3391167445070242e-07, "loss": 0.1147, "step": 9944 }, { "epoch": 9.285714285714286, "grad_norm": 2.8476823183548436, "learning_rate": 1.3356428106967246e-07, "loss": 0.0505, "step": 9945 }, { "epoch": 9.286647992530346, "grad_norm": 0.8547913234167763, "learning_rate": 1.3321733277763205e-07, "loss": 0.0153, "step": 9946 }, { "epoch": 9.287581699346406, "grad_norm": 1.584901042347156, "learning_rate": 1.3287082960631303e-07, "loss": 0.0316, "step": 9947 }, { "epoch": 9.288515406162466, "grad_norm": 3.3829623404302107, "learning_rate": 1.325247715874084e-07, "loss": 0.0564, "step": 9948 }, { "epoch": 9.289449112978525, "grad_norm": 3.46619716563462, "learning_rate": 1.3217915875256836e-07, "loss": 0.113, "step": 9949 }, { "epoch": 9.290382819794585, "grad_norm": 0.8948302050314406, "learning_rate": 1.318339911334021e-07, "loss": 0.0084, "step": 9950 }, { "epoch": 9.291316526610645, "grad_norm": 1.095868320888407, "learning_rate": 1.314892687614805e-07, "loss": 0.0328, "step": 9951 }, { "epoch": 9.292250233426705, "grad_norm": 2.1131663721331986, "learning_rate": 1.311449916683305e-07, "loss": 0.0288, "step": 9952 }, { "epoch": 9.293183940242765, "grad_norm": 8.214935343552753, "learning_rate": 1.308011598854414e-07, "loss": 0.0491, "step": 9953 }, { "epoch": 9.294117647058824, "grad_norm": 1.8084580191053747, "learning_rate": 1.3045777344426024e-07, "loss": 0.0085, "step": 9954 }, { "epoch": 9.295051353874884, "grad_norm": 1.1297257289299516, "learning_rate": 1.3011483237619306e-07, "loss": 0.0132, "step": 9955 }, { "epoch": 9.295985060690944, "grad_norm": 2.8764174817144186, "learning_rate": 1.2977233671260525e-07, "loss": 0.0791, "step": 9956 }, { "epoch": 9.296918767507004, "grad_norm": 0.9490362854727503, "learning_rate": 1.2943028648482236e-07, "loss": 0.0303, "step": 9957 }, { "epoch": 9.297852474323063, "grad_norm": 3.5676194652094284, "learning_rate": 1.2908868172412882e-07, "loss": 0.0856, "step": 9958 }, { "epoch": 9.298786181139123, "grad_norm": 2.556052353281075, "learning_rate": 1.2874752246176792e-07, "loss": 0.031, "step": 9959 }, { "epoch": 9.299719887955183, "grad_norm": 0.6255959171986127, "learning_rate": 1.2840680872894195e-07, "loss": 0.0164, "step": 9960 }, { "epoch": 9.300653594771243, "grad_norm": 1.5284505560106627, "learning_rate": 1.280665405568138e-07, "loss": 0.0358, "step": 9961 }, { "epoch": 9.301587301587302, "grad_norm": 0.6705291619797962, "learning_rate": 1.2772671797650405e-07, "loss": 0.0104, "step": 9962 }, { "epoch": 9.302521008403362, "grad_norm": 0.4392042214849448, "learning_rate": 1.2738734101909288e-07, "loss": 0.0063, "step": 9963 }, { "epoch": 9.303454715219422, "grad_norm": 1.5454434842445908, "learning_rate": 1.2704840971562104e-07, "loss": 0.0221, "step": 9964 }, { "epoch": 9.304388422035482, "grad_norm": 1.524962132862958, "learning_rate": 1.2670992409708704e-07, "loss": 0.0135, "step": 9965 }, { "epoch": 9.305322128851541, "grad_norm": 3.4161875002665396, "learning_rate": 1.263718841944489e-07, "loss": 0.0904, "step": 9966 }, { "epoch": 9.306255835667601, "grad_norm": 1.2167713005920822, "learning_rate": 1.2603429003862467e-07, "loss": 0.041, "step": 9967 }, { "epoch": 9.30718954248366, "grad_norm": 0.7246214179262362, "learning_rate": 1.2569714166049075e-07, "loss": 0.0083, "step": 9968 }, { "epoch": 9.30812324929972, "grad_norm": 2.67447828723698, "learning_rate": 1.253604390908819e-07, "loss": 0.0246, "step": 9969 }, { "epoch": 9.30905695611578, "grad_norm": 2.9956976813957414, "learning_rate": 1.250241823605952e-07, "loss": 0.0855, "step": 9970 }, { "epoch": 9.30999066293184, "grad_norm": 0.6204487718652036, "learning_rate": 1.2468837150038438e-07, "loss": 0.0105, "step": 9971 }, { "epoch": 9.3109243697479, "grad_norm": 2.517994451720794, "learning_rate": 1.2435300654096262e-07, "loss": 0.0569, "step": 9972 }, { "epoch": 9.31185807656396, "grad_norm": 2.0871610899752815, "learning_rate": 1.2401808751300258e-07, "loss": 0.0376, "step": 9973 }, { "epoch": 9.31279178338002, "grad_norm": 0.3306854438812259, "learning_rate": 1.2368361444713706e-07, "loss": 0.002, "step": 9974 }, { "epoch": 9.313725490196079, "grad_norm": 0.28745068405097307, "learning_rate": 1.2334958737395654e-07, "loss": 0.001, "step": 9975 }, { "epoch": 9.314659197012139, "grad_norm": 2.907103737998391, "learning_rate": 1.2301600632401212e-07, "loss": 0.0456, "step": 9976 }, { "epoch": 9.315592903828199, "grad_norm": 0.7355304376305655, "learning_rate": 1.226828713278122e-07, "loss": 0.0128, "step": 9977 }, { "epoch": 9.316526610644258, "grad_norm": 2.1195239593641593, "learning_rate": 1.223501824158274e-07, "loss": 0.0442, "step": 9978 }, { "epoch": 9.317460317460318, "grad_norm": 0.8157670573029915, "learning_rate": 1.2201793961848508e-07, "loss": 0.0168, "step": 9979 }, { "epoch": 9.318394024276378, "grad_norm": 4.291188966357211, "learning_rate": 1.2168614296617198e-07, "loss": 0.1353, "step": 9980 }, { "epoch": 9.319327731092438, "grad_norm": 2.3220244839469326, "learning_rate": 1.2135479248923498e-07, "loss": 0.0544, "step": 9981 }, { "epoch": 9.320261437908497, "grad_norm": 0.9308434422940507, "learning_rate": 1.2102388821797927e-07, "loss": 0.0186, "step": 9982 }, { "epoch": 9.321195144724557, "grad_norm": 4.534457594918982, "learning_rate": 1.2069343018267065e-07, "loss": 0.1065, "step": 9983 }, { "epoch": 9.322128851540617, "grad_norm": 0.8305766831239259, "learning_rate": 1.2036341841353216e-07, "loss": 0.0218, "step": 9984 }, { "epoch": 9.323062558356677, "grad_norm": 1.0018049971353769, "learning_rate": 1.2003385294074742e-07, "loss": 0.017, "step": 9985 }, { "epoch": 9.323996265172736, "grad_norm": 0.8900585544355039, "learning_rate": 1.1970473379445902e-07, "loss": 0.0118, "step": 9986 }, { "epoch": 9.324929971988796, "grad_norm": 2.029730479593767, "learning_rate": 1.1937606100476785e-07, "loss": 0.0285, "step": 9987 }, { "epoch": 9.325863678804856, "grad_norm": 2.712539512874117, "learning_rate": 1.1904783460173541e-07, "loss": 0.0556, "step": 9988 }, { "epoch": 9.326797385620916, "grad_norm": 3.9548737574349544, "learning_rate": 1.187200546153816e-07, "loss": 0.0399, "step": 9989 }, { "epoch": 9.327731092436975, "grad_norm": 0.7304994569337352, "learning_rate": 1.1839272107568467e-07, "loss": 0.011, "step": 9990 }, { "epoch": 9.328664799253035, "grad_norm": 1.0592087402920904, "learning_rate": 1.1806583401258287e-07, "loss": 0.0158, "step": 9991 }, { "epoch": 9.329598506069095, "grad_norm": 1.4112438796724192, "learning_rate": 1.1773939345597507e-07, "loss": 0.0287, "step": 9992 }, { "epoch": 9.330532212885155, "grad_norm": 3.894704806717695, "learning_rate": 1.1741339943571683e-07, "loss": 0.0399, "step": 9993 }, { "epoch": 9.331465919701214, "grad_norm": 1.913699164303219, "learning_rate": 1.170878519816232e-07, "loss": 0.0483, "step": 9994 }, { "epoch": 9.332399626517274, "grad_norm": 3.060088355398587, "learning_rate": 1.1676275112347035e-07, "loss": 0.1073, "step": 9995 }, { "epoch": 9.333333333333334, "grad_norm": 5.766177123385977, "learning_rate": 1.1643809689099172e-07, "loss": 0.0456, "step": 9996 }, { "epoch": 9.334267040149394, "grad_norm": 4.86274748629358, "learning_rate": 1.1611388931388023e-07, "loss": 0.101, "step": 9997 }, { "epoch": 9.335200746965453, "grad_norm": 2.4932819083489397, "learning_rate": 1.1579012842178883e-07, "loss": 0.0346, "step": 9998 }, { "epoch": 9.336134453781513, "grad_norm": 6.285464483923672, "learning_rate": 1.1546681424432882e-07, "loss": 0.0565, "step": 9999 }, { "epoch": 9.337068160597573, "grad_norm": 2.111244750190557, "learning_rate": 1.1514394681107043e-07, "loss": 0.0666, "step": 10000 }, { "epoch": 9.338001867413633, "grad_norm": 0.606972315860331, "learning_rate": 1.1482152615154451e-07, "loss": 0.0065, "step": 10001 }, { "epoch": 9.338935574229692, "grad_norm": 0.5993162573852455, "learning_rate": 1.1449955229523856e-07, "loss": 0.0093, "step": 10002 }, { "epoch": 9.339869281045752, "grad_norm": 1.4718767969048159, "learning_rate": 1.1417802527160126e-07, "loss": 0.0379, "step": 10003 }, { "epoch": 9.340802987861812, "grad_norm": 0.5017794856163976, "learning_rate": 1.1385694511004019e-07, "loss": 0.0085, "step": 10004 }, { "epoch": 9.341736694677872, "grad_norm": 7.85227428501722, "learning_rate": 1.1353631183992075e-07, "loss": 0.0503, "step": 10005 }, { "epoch": 9.342670401493931, "grad_norm": 1.7177152787274006, "learning_rate": 1.1321612549056948e-07, "loss": 0.0475, "step": 10006 }, { "epoch": 9.343604108309991, "grad_norm": 3.6627097388932968, "learning_rate": 1.1289638609126963e-07, "loss": 0.095, "step": 10007 }, { "epoch": 9.344537815126051, "grad_norm": 0.2613519060471153, "learning_rate": 1.1257709367126613e-07, "loss": 0.002, "step": 10008 }, { "epoch": 9.34547152194211, "grad_norm": 1.8164791754811869, "learning_rate": 1.1225824825976117e-07, "loss": 0.0261, "step": 10009 }, { "epoch": 9.34640522875817, "grad_norm": 2.0389809731625945, "learning_rate": 1.1193984988591699e-07, "loss": 0.0548, "step": 10010 }, { "epoch": 9.34733893557423, "grad_norm": 1.4580413713110327, "learning_rate": 1.1162189857885364e-07, "loss": 0.0248, "step": 10011 }, { "epoch": 9.34827264239029, "grad_norm": 2.4040704595722517, "learning_rate": 1.1130439436765228e-07, "loss": 0.0725, "step": 10012 }, { "epoch": 9.34920634920635, "grad_norm": 1.6065256471239038, "learning_rate": 1.1098733728135247e-07, "loss": 0.0481, "step": 10013 }, { "epoch": 9.35014005602241, "grad_norm": 0.4800254918852626, "learning_rate": 1.1067072734895157e-07, "loss": 0.0054, "step": 10014 }, { "epoch": 9.35107376283847, "grad_norm": 2.9728845559487644, "learning_rate": 1.1035456459940697e-07, "loss": 0.0849, "step": 10015 }, { "epoch": 9.352007469654529, "grad_norm": 0.8326922203025282, "learning_rate": 1.1003884906163609e-07, "loss": 0.0049, "step": 10016 }, { "epoch": 9.352941176470589, "grad_norm": 1.276421903506115, "learning_rate": 1.0972358076451417e-07, "loss": 0.018, "step": 10017 }, { "epoch": 9.353874883286649, "grad_norm": 2.4649236430225807, "learning_rate": 1.0940875973687648e-07, "loss": 0.0596, "step": 10018 }, { "epoch": 9.354808590102708, "grad_norm": 1.5922489065149068, "learning_rate": 1.0909438600751555e-07, "loss": 0.0346, "step": 10019 }, { "epoch": 9.355742296918768, "grad_norm": 3.444783386662988, "learning_rate": 1.0878045960518501e-07, "loss": 0.0753, "step": 10020 }, { "epoch": 9.356676003734828, "grad_norm": 1.7838590476344158, "learning_rate": 1.084669805585975e-07, "loss": 0.0189, "step": 10021 }, { "epoch": 9.357609710550888, "grad_norm": 3.4512429893904453, "learning_rate": 1.0815394889642339e-07, "loss": 0.0758, "step": 10022 }, { "epoch": 9.358543417366947, "grad_norm": 0.7595325652386866, "learning_rate": 1.0784136464729366e-07, "loss": 0.015, "step": 10023 }, { "epoch": 9.359477124183007, "grad_norm": 1.694198740907016, "learning_rate": 1.0752922783979603e-07, "loss": 0.0345, "step": 10024 }, { "epoch": 9.360410830999067, "grad_norm": 2.515407353200865, "learning_rate": 1.0721753850247984e-07, "loss": 0.0451, "step": 10025 }, { "epoch": 9.361344537815127, "grad_norm": 3.9212170943928246, "learning_rate": 1.0690629666385288e-07, "loss": 0.1322, "step": 10026 }, { "epoch": 9.362278244631186, "grad_norm": 3.5986242272336417, "learning_rate": 1.0659550235238125e-07, "loss": 0.0951, "step": 10027 }, { "epoch": 9.363211951447246, "grad_norm": 3.9627739924050895, "learning_rate": 1.0628515559648999e-07, "loss": 0.0951, "step": 10028 }, { "epoch": 9.364145658263306, "grad_norm": 1.8183968471058527, "learning_rate": 1.0597525642456474e-07, "loss": 0.0631, "step": 10029 }, { "epoch": 9.365079365079366, "grad_norm": 3.202926236424739, "learning_rate": 1.0566580486494837e-07, "loss": 0.0429, "step": 10030 }, { "epoch": 9.366013071895425, "grad_norm": 2.8982356642265636, "learning_rate": 1.0535680094594436e-07, "loss": 0.0342, "step": 10031 }, { "epoch": 9.366946778711485, "grad_norm": 2.212650269786762, "learning_rate": 1.0504824469581287e-07, "loss": 0.0692, "step": 10032 }, { "epoch": 9.367880485527545, "grad_norm": 1.6915490265447362, "learning_rate": 1.0474013614277689e-07, "loss": 0.0111, "step": 10033 }, { "epoch": 9.368814192343605, "grad_norm": 0.7347018041074508, "learning_rate": 1.0443247531501499e-07, "loss": 0.0136, "step": 10034 }, { "epoch": 9.369747899159664, "grad_norm": 6.95514718753134, "learning_rate": 1.0412526224066687e-07, "loss": 0.1442, "step": 10035 }, { "epoch": 9.370681605975724, "grad_norm": 3.256976315104601, "learning_rate": 1.0381849694783009e-07, "loss": 0.0653, "step": 10036 }, { "epoch": 9.371615312791784, "grad_norm": 2.6727941415013428, "learning_rate": 1.0351217946456105e-07, "loss": 0.0835, "step": 10037 }, { "epoch": 9.372549019607844, "grad_norm": 1.4243653487980428, "learning_rate": 1.0320630981887736e-07, "loss": 0.0236, "step": 10038 }, { "epoch": 9.373482726423903, "grad_norm": 2.0512537644184246, "learning_rate": 1.0290088803875331e-07, "loss": 0.0365, "step": 10039 }, { "epoch": 9.374416433239963, "grad_norm": 2.5811460982168764, "learning_rate": 1.0259591415212322e-07, "loss": 0.0779, "step": 10040 }, { "epoch": 9.375350140056023, "grad_norm": 1.9615683789160863, "learning_rate": 1.0229138818687923e-07, "loss": 0.0445, "step": 10041 }, { "epoch": 9.376283846872083, "grad_norm": 2.550601306968286, "learning_rate": 1.0198731017087571e-07, "loss": 0.0494, "step": 10042 }, { "epoch": 9.377217553688142, "grad_norm": 2.6937089770725, "learning_rate": 1.0168368013192264e-07, "loss": 0.0524, "step": 10043 }, { "epoch": 9.378151260504202, "grad_norm": 0.7097337378633367, "learning_rate": 1.0138049809779061e-07, "loss": 0.0062, "step": 10044 }, { "epoch": 9.379084967320262, "grad_norm": 3.2500414803911695, "learning_rate": 1.0107776409620795e-07, "loss": 0.0834, "step": 10045 }, { "epoch": 9.380018674136322, "grad_norm": 1.2629507886119806, "learning_rate": 1.0077547815486477e-07, "loss": 0.0295, "step": 10046 }, { "epoch": 9.380952380952381, "grad_norm": 1.708071555212205, "learning_rate": 1.0047364030140727e-07, "loss": 0.0169, "step": 10047 }, { "epoch": 9.381886087768441, "grad_norm": 9.213136522369997, "learning_rate": 1.0017225056344227e-07, "loss": 0.1958, "step": 10048 }, { "epoch": 9.3828197945845, "grad_norm": 0.60880871151733, "learning_rate": 9.987130896853436e-08, "loss": 0.0082, "step": 10049 }, { "epoch": 9.38375350140056, "grad_norm": 1.0919221025994883, "learning_rate": 9.957081554420933e-08, "loss": 0.0165, "step": 10050 }, { "epoch": 9.38468720821662, "grad_norm": 1.3278674319622013, "learning_rate": 9.927077031794962e-08, "loss": 0.0144, "step": 10051 }, { "epoch": 9.38562091503268, "grad_norm": 0.6112220344167877, "learning_rate": 9.897117331719774e-08, "loss": 0.0061, "step": 10052 }, { "epoch": 9.38655462184874, "grad_norm": 9.772973630196912, "learning_rate": 9.86720245693562e-08, "loss": 0.1333, "step": 10053 }, { "epoch": 9.3874883286648, "grad_norm": 2.953755539674115, "learning_rate": 9.837332410178313e-08, "loss": 0.0379, "step": 10054 }, { "epoch": 9.38842203548086, "grad_norm": 4.4079029889640315, "learning_rate": 9.807507194180054e-08, "loss": 0.0959, "step": 10055 }, { "epoch": 9.38935574229692, "grad_norm": 1.0986425409936589, "learning_rate": 9.777726811668553e-08, "loss": 0.0121, "step": 10056 }, { "epoch": 9.390289449112979, "grad_norm": 0.7231912889388872, "learning_rate": 9.747991265367517e-08, "loss": 0.0062, "step": 10057 }, { "epoch": 9.391223155929039, "grad_norm": 3.267211565486355, "learning_rate": 9.718300557996663e-08, "loss": 0.0709, "step": 10058 }, { "epoch": 9.392156862745098, "grad_norm": 0.8727586389043044, "learning_rate": 9.688654692271537e-08, "loss": 0.0194, "step": 10059 }, { "epoch": 9.393090569561158, "grad_norm": 3.433575738847664, "learning_rate": 9.659053670903584e-08, "loss": 0.1052, "step": 10060 }, { "epoch": 9.394024276377218, "grad_norm": 1.0064778613962964, "learning_rate": 9.629497496600026e-08, "loss": 0.0182, "step": 10061 }, { "epoch": 9.394957983193278, "grad_norm": 1.8930082313171688, "learning_rate": 9.599986172064257e-08, "loss": 0.0389, "step": 10062 }, { "epoch": 9.395891690009337, "grad_norm": 0.9825285992376034, "learning_rate": 9.57051969999534e-08, "loss": 0.0115, "step": 10063 }, { "epoch": 9.396825396825397, "grad_norm": 7.779725068829614, "learning_rate": 9.541098083088229e-08, "loss": 0.066, "step": 10064 }, { "epoch": 9.397759103641457, "grad_norm": 0.4677849674094083, "learning_rate": 9.511721324033996e-08, "loss": 0.0042, "step": 10065 }, { "epoch": 9.398692810457517, "grad_norm": 2.6142187003654027, "learning_rate": 9.482389425519434e-08, "loss": 0.0632, "step": 10066 }, { "epoch": 9.399626517273576, "grad_norm": 0.37610102941333867, "learning_rate": 9.453102390227176e-08, "loss": 0.0069, "step": 10067 }, { "epoch": 9.400560224089636, "grad_norm": 0.8837969656419805, "learning_rate": 9.423860220835912e-08, "loss": 0.0081, "step": 10068 }, { "epoch": 9.401493930905696, "grad_norm": 1.027866897216119, "learning_rate": 9.394662920020225e-08, "loss": 0.0108, "step": 10069 }, { "epoch": 9.402427637721756, "grad_norm": 2.1355829637932566, "learning_rate": 9.365510490450425e-08, "loss": 0.0316, "step": 10070 }, { "epoch": 9.403361344537815, "grad_norm": 3.117879398765685, "learning_rate": 9.336402934792821e-08, "loss": 0.0643, "step": 10071 }, { "epoch": 9.404295051353875, "grad_norm": 1.3760151085336843, "learning_rate": 9.307340255709674e-08, "loss": 0.021, "step": 10072 }, { "epoch": 9.405228758169935, "grad_norm": 0.8801703497322642, "learning_rate": 9.278322455859079e-08, "loss": 0.0173, "step": 10073 }, { "epoch": 9.406162464985995, "grad_norm": 0.9439824330601359, "learning_rate": 9.249349537894969e-08, "loss": 0.0122, "step": 10074 }, { "epoch": 9.407096171802054, "grad_norm": 3.614124922068226, "learning_rate": 9.22042150446728e-08, "loss": 0.062, "step": 10075 }, { "epoch": 9.408029878618114, "grad_norm": 0.4848153167522181, "learning_rate": 9.191538358221896e-08, "loss": 0.0131, "step": 10076 }, { "epoch": 9.408963585434174, "grad_norm": 2.238875086922362, "learning_rate": 9.162700101800315e-08, "loss": 0.039, "step": 10077 }, { "epoch": 9.409897292250234, "grad_norm": 1.072639367684014, "learning_rate": 9.133906737840259e-08, "loss": 0.0378, "step": 10078 }, { "epoch": 9.410830999066294, "grad_norm": 0.3540789413872179, "learning_rate": 9.105158268975067e-08, "loss": 0.0063, "step": 10079 }, { "epoch": 9.411764705882353, "grad_norm": 0.8296413374025088, "learning_rate": 9.076454697834247e-08, "loss": 0.01, "step": 10080 }, { "epoch": 9.412698412698413, "grad_norm": 1.6178458701883056, "learning_rate": 9.047796027042922e-08, "loss": 0.023, "step": 10081 }, { "epoch": 9.413632119514473, "grad_norm": 2.196792211496898, "learning_rate": 9.01918225922238e-08, "loss": 0.0322, "step": 10082 }, { "epoch": 9.414565826330533, "grad_norm": 0.33137333494022847, "learning_rate": 8.990613396989533e-08, "loss": 0.0026, "step": 10083 }, { "epoch": 9.415499533146592, "grad_norm": 4.514825617866975, "learning_rate": 8.962089442957345e-08, "loss": 0.0957, "step": 10084 }, { "epoch": 9.416433239962652, "grad_norm": 1.880666494469353, "learning_rate": 8.933610399734727e-08, "loss": 0.0184, "step": 10085 }, { "epoch": 9.417366946778712, "grad_norm": 1.0906764371898294, "learning_rate": 8.905176269926318e-08, "loss": 0.011, "step": 10086 }, { "epoch": 9.418300653594772, "grad_norm": 3.8181943850002367, "learning_rate": 8.876787056132763e-08, "loss": 0.111, "step": 10087 }, { "epoch": 9.419234360410831, "grad_norm": 1.5764411282788804, "learning_rate": 8.848442760950537e-08, "loss": 0.0245, "step": 10088 }, { "epoch": 9.420168067226891, "grad_norm": 1.9805930938315113, "learning_rate": 8.820143386972125e-08, "loss": 0.038, "step": 10089 }, { "epoch": 9.42110177404295, "grad_norm": 2.3803827954394685, "learning_rate": 8.79188893678573e-08, "loss": 0.0122, "step": 10090 }, { "epoch": 9.42203548085901, "grad_norm": 1.535032561221263, "learning_rate": 8.763679412975568e-08, "loss": 0.0341, "step": 10091 }, { "epoch": 9.42296918767507, "grad_norm": 1.6419576927771458, "learning_rate": 8.735514818121626e-08, "loss": 0.0233, "step": 10092 }, { "epoch": 9.42390289449113, "grad_norm": 0.7233586034041145, "learning_rate": 8.707395154799958e-08, "loss": 0.0044, "step": 10093 }, { "epoch": 9.42483660130719, "grad_norm": 0.7557246881945647, "learning_rate": 8.679320425582449e-08, "loss": 0.0037, "step": 10094 }, { "epoch": 9.42577030812325, "grad_norm": 1.4826825525763987, "learning_rate": 8.651290633036768e-08, "loss": 0.0207, "step": 10095 }, { "epoch": 9.42670401493931, "grad_norm": 0.633904954405815, "learning_rate": 8.623305779726531e-08, "loss": 0.0075, "step": 10096 }, { "epoch": 9.427637721755369, "grad_norm": 1.7270579073250676, "learning_rate": 8.595365868211303e-08, "loss": 0.0196, "step": 10097 }, { "epoch": 9.428571428571429, "grad_norm": 1.0807923834565256, "learning_rate": 8.567470901046538e-08, "loss": 0.0109, "step": 10098 }, { "epoch": 9.429505135387489, "grad_norm": 1.8569806000918, "learning_rate": 8.539620880783473e-08, "loss": 0.0399, "step": 10099 }, { "epoch": 9.430438842203548, "grad_norm": 0.7167978527872134, "learning_rate": 8.511815809969293e-08, "loss": 0.0121, "step": 10100 }, { "epoch": 9.431372549019608, "grad_norm": 3.9333705925281244, "learning_rate": 8.484055691147076e-08, "loss": 0.0577, "step": 10101 }, { "epoch": 9.432306255835668, "grad_norm": 1.2169228131231582, "learning_rate": 8.456340526855788e-08, "loss": 0.0166, "step": 10102 }, { "epoch": 9.433239962651728, "grad_norm": 0.6369067664255654, "learning_rate": 8.428670319630405e-08, "loss": 0.0118, "step": 10103 }, { "epoch": 9.434173669467787, "grad_norm": 1.5214877248715455, "learning_rate": 8.401045072001513e-08, "loss": 0.0148, "step": 10104 }, { "epoch": 9.435107376283847, "grad_norm": 0.5904324292418713, "learning_rate": 8.373464786495755e-08, "loss": 0.0124, "step": 10105 }, { "epoch": 9.436041083099907, "grad_norm": 4.343967916040866, "learning_rate": 8.345929465635782e-08, "loss": 0.117, "step": 10106 }, { "epoch": 9.436974789915967, "grad_norm": 1.5357099500161342, "learning_rate": 8.318439111939913e-08, "loss": 0.0306, "step": 10107 }, { "epoch": 9.437908496732026, "grad_norm": 3.205827451057235, "learning_rate": 8.290993727922414e-08, "loss": 0.0315, "step": 10108 }, { "epoch": 9.438842203548086, "grad_norm": 2.461806748014888, "learning_rate": 8.263593316093498e-08, "loss": 0.0771, "step": 10109 }, { "epoch": 9.439775910364146, "grad_norm": 1.4088249078379742, "learning_rate": 8.236237878959275e-08, "loss": 0.0377, "step": 10110 }, { "epoch": 9.440709617180206, "grad_norm": 3.2561680574669865, "learning_rate": 8.208927419021629e-08, "loss": 0.0596, "step": 10111 }, { "epoch": 9.441643323996265, "grad_norm": 0.4984629766922547, "learning_rate": 8.181661938778451e-08, "loss": 0.0082, "step": 10112 }, { "epoch": 9.442577030812325, "grad_norm": 0.24590820010659495, "learning_rate": 8.154441440723416e-08, "loss": 0.0007, "step": 10113 }, { "epoch": 9.443510737628385, "grad_norm": 1.6212405161190662, "learning_rate": 8.12726592734625e-08, "loss": 0.0136, "step": 10114 }, { "epoch": 9.444444444444445, "grad_norm": 1.5170023378694286, "learning_rate": 8.100135401132359e-08, "loss": 0.0267, "step": 10115 }, { "epoch": 9.445378151260504, "grad_norm": 3.837264155206199, "learning_rate": 8.073049864563142e-08, "loss": 0.0876, "step": 10116 }, { "epoch": 9.446311858076564, "grad_norm": 4.054781899546413, "learning_rate": 8.046009320115844e-08, "loss": 0.0298, "step": 10117 }, { "epoch": 9.447245564892624, "grad_norm": 3.1502261174472834, "learning_rate": 8.01901377026365e-08, "loss": 0.0452, "step": 10118 }, { "epoch": 9.448179271708684, "grad_norm": 0.6543559591985575, "learning_rate": 7.992063217475587e-08, "loss": 0.0095, "step": 10119 }, { "epoch": 9.449112978524743, "grad_norm": 5.598257691127714, "learning_rate": 7.965157664216628e-08, "loss": 0.194, "step": 10120 }, { "epoch": 9.450046685340803, "grad_norm": 1.3104878034950715, "learning_rate": 7.938297112947579e-08, "loss": 0.014, "step": 10121 }, { "epoch": 9.450980392156863, "grad_norm": 3.2086489869565415, "learning_rate": 7.911481566124979e-08, "loss": 0.0638, "step": 10122 }, { "epoch": 9.451914098972923, "grad_norm": 1.0096927370746107, "learning_rate": 7.884711026201586e-08, "loss": 0.0223, "step": 10123 }, { "epoch": 9.452847805788982, "grad_norm": 1.4984371422393254, "learning_rate": 7.85798549562583e-08, "loss": 0.0102, "step": 10124 }, { "epoch": 9.453781512605042, "grad_norm": 2.2009095003938075, "learning_rate": 7.83130497684198e-08, "loss": 0.0408, "step": 10125 }, { "epoch": 9.454715219421102, "grad_norm": 1.3424494910341036, "learning_rate": 7.804669472290305e-08, "loss": 0.0191, "step": 10126 }, { "epoch": 9.455648926237162, "grad_norm": 0.8671993154765996, "learning_rate": 7.778078984406912e-08, "loss": 0.0108, "step": 10127 }, { "epoch": 9.456582633053221, "grad_norm": 0.4469680381957474, "learning_rate": 7.7515335156238e-08, "loss": 0.0055, "step": 10128 }, { "epoch": 9.457516339869281, "grad_norm": 4.7647253408623005, "learning_rate": 7.725033068368859e-08, "loss": 0.1045, "step": 10129 }, { "epoch": 9.458450046685341, "grad_norm": 3.78060404126964, "learning_rate": 7.698577645065764e-08, "loss": 0.0487, "step": 10130 }, { "epoch": 9.4593837535014, "grad_norm": 1.412141586674148, "learning_rate": 7.672167248134244e-08, "loss": 0.0248, "step": 10131 }, { "epoch": 9.46031746031746, "grad_norm": 3.5640555350579564, "learning_rate": 7.645801879989811e-08, "loss": 0.0375, "step": 10132 }, { "epoch": 9.46125116713352, "grad_norm": 0.9348117126199871, "learning_rate": 7.619481543043872e-08, "loss": 0.0109, "step": 10133 }, { "epoch": 9.46218487394958, "grad_norm": 1.297361310845321, "learning_rate": 7.59320623970361e-08, "loss": 0.014, "step": 10134 }, { "epoch": 9.46311858076564, "grad_norm": 4.634261867286366, "learning_rate": 7.56697597237227e-08, "loss": 0.1419, "step": 10135 }, { "epoch": 9.4640522875817, "grad_norm": 2.8470877080656445, "learning_rate": 7.540790743448934e-08, "loss": 0.1151, "step": 10136 }, { "epoch": 9.46498599439776, "grad_norm": 1.10675649089273, "learning_rate": 7.514650555328462e-08, "loss": 0.0154, "step": 10137 }, { "epoch": 9.465919701213819, "grad_norm": 4.066451893020586, "learning_rate": 7.488555410401666e-08, "loss": 0.093, "step": 10138 }, { "epoch": 9.466853408029879, "grad_norm": 1.2207555065388171, "learning_rate": 7.462505311055301e-08, "loss": 0.016, "step": 10139 }, { "epoch": 9.467787114845938, "grad_norm": 1.0835454218237799, "learning_rate": 7.436500259671908e-08, "loss": 0.03, "step": 10140 }, { "epoch": 9.468720821661998, "grad_norm": 2.1375036528274896, "learning_rate": 7.410540258629806e-08, "loss": 0.0544, "step": 10141 }, { "epoch": 9.469654528478058, "grad_norm": 10.891360696599486, "learning_rate": 7.38462531030354e-08, "loss": 0.0394, "step": 10142 }, { "epoch": 9.470588235294118, "grad_norm": 3.8529090169390185, "learning_rate": 7.358755417063156e-08, "loss": 0.1102, "step": 10143 }, { "epoch": 9.471521942110178, "grad_norm": 4.410887727357627, "learning_rate": 7.332930581274822e-08, "loss": 0.1457, "step": 10144 }, { "epoch": 9.472455648926237, "grad_norm": 1.0126941559350562, "learning_rate": 7.307150805300422e-08, "loss": 0.029, "step": 10145 }, { "epoch": 9.473389355742297, "grad_norm": 0.4773238187233471, "learning_rate": 7.281416091497906e-08, "loss": 0.0083, "step": 10146 }, { "epoch": 9.474323062558357, "grad_norm": 0.3562142389846509, "learning_rate": 7.255726442220889e-08, "loss": 0.0065, "step": 10147 }, { "epoch": 9.475256769374417, "grad_norm": 0.9491728223737315, "learning_rate": 7.230081859819049e-08, "loss": 0.0084, "step": 10148 }, { "epoch": 9.476190476190476, "grad_norm": 2.8657508315683824, "learning_rate": 7.204482346637842e-08, "loss": 0.0421, "step": 10149 }, { "epoch": 9.477124183006536, "grad_norm": 0.20910235271657487, "learning_rate": 7.178927905018617e-08, "loss": 0.0018, "step": 10150 }, { "epoch": 9.478057889822596, "grad_norm": 4.917864053838343, "learning_rate": 7.153418537298617e-08, "loss": 0.2163, "step": 10151 }, { "epoch": 9.478991596638656, "grad_norm": 2.2100095713746084, "learning_rate": 7.12795424581092e-08, "loss": 0.039, "step": 10152 }, { "epoch": 9.479925303454715, "grad_norm": 1.7918761890599548, "learning_rate": 7.102535032884605e-08, "loss": 0.0436, "step": 10153 }, { "epoch": 9.480859010270775, "grad_norm": 1.7296030006745657, "learning_rate": 7.077160900844426e-08, "loss": 0.0428, "step": 10154 }, { "epoch": 9.481792717086835, "grad_norm": 5.4402716500684445, "learning_rate": 7.051831852011192e-08, "loss": 0.1578, "step": 10155 }, { "epoch": 9.482726423902895, "grad_norm": 0.6779130070085155, "learning_rate": 7.026547888701495e-08, "loss": 0.0146, "step": 10156 }, { "epoch": 9.483660130718954, "grad_norm": 2.053147312369039, "learning_rate": 7.001309013227875e-08, "loss": 0.0267, "step": 10157 }, { "epoch": 9.484593837535014, "grad_norm": 2.8870374775958467, "learning_rate": 6.97611522789865e-08, "loss": 0.0665, "step": 10158 }, { "epoch": 9.485527544351074, "grad_norm": 2.0156991422998924, "learning_rate": 6.950966535018088e-08, "loss": 0.0714, "step": 10159 }, { "epoch": 9.486461251167134, "grad_norm": 2.424931405227994, "learning_rate": 6.925862936886296e-08, "loss": 0.0535, "step": 10160 }, { "epoch": 9.487394957983193, "grad_norm": 3.8479681389955935, "learning_rate": 6.900804435799324e-08, "loss": 0.029, "step": 10161 }, { "epoch": 9.488328664799253, "grad_norm": 2.433119938386064, "learning_rate": 6.875791034049063e-08, "loss": 0.0367, "step": 10162 }, { "epoch": 9.489262371615313, "grad_norm": 0.9393063210867697, "learning_rate": 6.85082273392318e-08, "loss": 0.0222, "step": 10163 }, { "epoch": 9.490196078431373, "grad_norm": 4.096488792546007, "learning_rate": 6.825899537705294e-08, "loss": 0.1055, "step": 10164 }, { "epoch": 9.491129785247432, "grad_norm": 2.3995716922276533, "learning_rate": 6.801021447674971e-08, "loss": 0.1034, "step": 10165 }, { "epoch": 9.492063492063492, "grad_norm": 2.5724214793341496, "learning_rate": 6.776188466107614e-08, "loss": 0.0346, "step": 10166 }, { "epoch": 9.492997198879552, "grad_norm": 1.022112540929988, "learning_rate": 6.751400595274405e-08, "loss": 0.022, "step": 10167 }, { "epoch": 9.493930905695612, "grad_norm": 1.0211770113033871, "learning_rate": 6.726657837442418e-08, "loss": 0.0201, "step": 10168 }, { "epoch": 9.494864612511671, "grad_norm": 3.080825411694816, "learning_rate": 6.701960194874735e-08, "loss": 0.0954, "step": 10169 }, { "epoch": 9.495798319327731, "grad_norm": 2.162665508045222, "learning_rate": 6.677307669830268e-08, "loss": 0.0328, "step": 10170 }, { "epoch": 9.49673202614379, "grad_norm": 0.8255961202545508, "learning_rate": 6.65270026456366e-08, "loss": 0.0067, "step": 10171 }, { "epoch": 9.49766573295985, "grad_norm": 1.1643907927167558, "learning_rate": 6.628137981325611e-08, "loss": 0.0095, "step": 10172 }, { "epoch": 9.49859943977591, "grad_norm": 2.6458678268309956, "learning_rate": 6.603620822362489e-08, "loss": 0.0519, "step": 10173 }, { "epoch": 9.49953314659197, "grad_norm": 1.413256767528851, "learning_rate": 6.579148789916779e-08, "loss": 0.0376, "step": 10174 }, { "epoch": 9.50046685340803, "grad_norm": 2.2200986574854213, "learning_rate": 6.55472188622669e-08, "loss": 0.0486, "step": 10175 }, { "epoch": 9.50140056022409, "grad_norm": 9.156973175232235, "learning_rate": 6.530340113526323e-08, "loss": 0.1945, "step": 10176 }, { "epoch": 9.50233426704015, "grad_norm": 1.703183691686352, "learning_rate": 6.506003474045618e-08, "loss": 0.0301, "step": 10177 }, { "epoch": 9.50326797385621, "grad_norm": 4.667166365464604, "learning_rate": 6.481711970010463e-08, "loss": 0.1166, "step": 10178 }, { "epoch": 9.504201680672269, "grad_norm": 1.156387065738042, "learning_rate": 6.457465603642577e-08, "loss": 0.029, "step": 10179 }, { "epoch": 9.505135387488329, "grad_norm": 1.7810712822309231, "learning_rate": 6.433264377159631e-08, "loss": 0.0235, "step": 10180 }, { "epoch": 9.506069094304388, "grad_norm": 5.8931997523042705, "learning_rate": 6.409108292774912e-08, "loss": 0.0995, "step": 10181 }, { "epoch": 9.507002801120448, "grad_norm": 3.562579060898415, "learning_rate": 6.38499735269793e-08, "loss": 0.0729, "step": 10182 }, { "epoch": 9.507936507936508, "grad_norm": 0.7652566803092784, "learning_rate": 6.360931559133865e-08, "loss": 0.0113, "step": 10183 }, { "epoch": 9.508870214752568, "grad_norm": 1.1372108895349156, "learning_rate": 6.336910914283734e-08, "loss": 0.0092, "step": 10184 }, { "epoch": 9.509803921568627, "grad_norm": 1.5290514380449782, "learning_rate": 6.312935420344557e-08, "loss": 0.0256, "step": 10185 }, { "epoch": 9.510737628384687, "grad_norm": 1.799511113291013, "learning_rate": 6.289005079509081e-08, "loss": 0.0314, "step": 10186 }, { "epoch": 9.511671335200747, "grad_norm": 6.385840921545514, "learning_rate": 6.265119893966054e-08, "loss": 0.0818, "step": 10187 }, { "epoch": 9.512605042016807, "grad_norm": 1.768348536460961, "learning_rate": 6.241279865900063e-08, "loss": 0.0317, "step": 10188 }, { "epoch": 9.513538748832866, "grad_norm": 2.9570292214902847, "learning_rate": 6.217484997491475e-08, "loss": 0.0941, "step": 10189 }, { "epoch": 9.514472455648926, "grad_norm": 1.1340974193411706, "learning_rate": 6.193735290916547e-08, "loss": 0.0228, "step": 10190 }, { "epoch": 9.515406162464986, "grad_norm": 2.1130501731268874, "learning_rate": 6.1700307483476e-08, "loss": 0.0364, "step": 10191 }, { "epoch": 9.516339869281046, "grad_norm": 1.8227039224523307, "learning_rate": 6.14637137195262e-08, "loss": 0.0117, "step": 10192 }, { "epoch": 9.517273576097105, "grad_norm": 2.3129204435694066, "learning_rate": 6.122757163895432e-08, "loss": 0.0411, "step": 10193 }, { "epoch": 9.518207282913165, "grad_norm": 5.338508800843321, "learning_rate": 6.099188126335919e-08, "loss": 0.0511, "step": 10194 }, { "epoch": 9.519140989729225, "grad_norm": 1.5205672850363106, "learning_rate": 6.075664261429637e-08, "loss": 0.028, "step": 10195 }, { "epoch": 9.520074696545285, "grad_norm": 2.945982518102024, "learning_rate": 6.052185571328195e-08, "loss": 0.0461, "step": 10196 }, { "epoch": 9.521008403361344, "grad_norm": 0.6001224816326216, "learning_rate": 6.028752058178933e-08, "loss": 0.0085, "step": 10197 }, { "epoch": 9.521942110177404, "grad_norm": 2.836976408387537, "learning_rate": 6.00536372412508e-08, "loss": 0.041, "step": 10198 }, { "epoch": 9.522875816993464, "grad_norm": 1.8595105543557222, "learning_rate": 5.982020571305813e-08, "loss": 0.015, "step": 10199 }, { "epoch": 9.523809523809524, "grad_norm": 0.9890694385851938, "learning_rate": 5.958722601856093e-08, "loss": 0.0127, "step": 10200 }, { "epoch": 9.524743230625583, "grad_norm": 0.4774176350566623, "learning_rate": 5.935469817906714e-08, "loss": 0.0066, "step": 10201 }, { "epoch": 9.525676937441643, "grad_norm": 3.360942900846234, "learning_rate": 5.91226222158453e-08, "loss": 0.1075, "step": 10202 }, { "epoch": 9.526610644257703, "grad_norm": 0.4294547186491917, "learning_rate": 5.8890998150119564e-08, "loss": 0.0021, "step": 10203 }, { "epoch": 9.527544351073763, "grad_norm": 3.853037040029149, "learning_rate": 5.865982600307629e-08, "loss": 0.0508, "step": 10204 }, { "epoch": 9.528478057889822, "grad_norm": 5.083361070406854, "learning_rate": 5.842910579585748e-08, "loss": 0.1322, "step": 10205 }, { "epoch": 9.529411764705882, "grad_norm": 0.7369390113403834, "learning_rate": 5.819883754956568e-08, "loss": 0.0151, "step": 10206 }, { "epoch": 9.530345471521942, "grad_norm": 4.67882395949614, "learning_rate": 5.7969021285260716e-08, "loss": 0.0409, "step": 10207 }, { "epoch": 9.531279178338002, "grad_norm": 2.231912896701052, "learning_rate": 5.773965702396245e-08, "loss": 0.0573, "step": 10208 }, { "epoch": 9.532212885154062, "grad_norm": 0.8827925273006227, "learning_rate": 5.751074478664909e-08, "loss": 0.0096, "step": 10209 }, { "epoch": 9.533146591970121, "grad_norm": 4.967673607841851, "learning_rate": 5.728228459425611e-08, "loss": 0.1536, "step": 10210 }, { "epoch": 9.534080298786181, "grad_norm": 1.857271365795667, "learning_rate": 5.705427646767958e-08, "loss": 0.0297, "step": 10211 }, { "epoch": 9.53501400560224, "grad_norm": 0.10915012577033449, "learning_rate": 5.6826720427773377e-08, "loss": 0.0004, "step": 10212 }, { "epoch": 9.5359477124183, "grad_norm": 2.7416650406368848, "learning_rate": 5.6599616495349176e-08, "loss": 0.0549, "step": 10213 }, { "epoch": 9.53688141923436, "grad_norm": 0.8038077983435054, "learning_rate": 5.63729646911787e-08, "loss": 0.0098, "step": 10214 }, { "epoch": 9.53781512605042, "grad_norm": 1.8527138090893294, "learning_rate": 5.6146765035991476e-08, "loss": 0.0235, "step": 10215 }, { "epoch": 9.53874883286648, "grad_norm": 1.5952221659583157, "learning_rate": 5.592101755047652e-08, "loss": 0.014, "step": 10216 }, { "epoch": 9.53968253968254, "grad_norm": 9.484943851328941, "learning_rate": 5.569572225528008e-08, "loss": 0.2397, "step": 10217 }, { "epoch": 9.5406162464986, "grad_norm": 4.412610462978694, "learning_rate": 5.5470879171009016e-08, "loss": 0.0997, "step": 10218 }, { "epoch": 9.541549953314659, "grad_norm": 0.4667548013831971, "learning_rate": 5.524648831822688e-08, "loss": 0.009, "step": 10219 }, { "epoch": 9.542483660130719, "grad_norm": 2.974593639697245, "learning_rate": 5.502254971745669e-08, "loss": 0.0611, "step": 10220 }, { "epoch": 9.543417366946779, "grad_norm": 3.235314113413961, "learning_rate": 5.479906338917984e-08, "loss": 0.0607, "step": 10221 }, { "epoch": 9.544351073762838, "grad_norm": 0.3621037826402774, "learning_rate": 5.457602935383777e-08, "loss": 0.0023, "step": 10222 }, { "epoch": 9.545284780578898, "grad_norm": 1.6945204241869023, "learning_rate": 5.435344763182804e-08, "loss": 0.0304, "step": 10223 }, { "epoch": 9.546218487394958, "grad_norm": 3.6032303778449206, "learning_rate": 5.4131318243508816e-08, "loss": 0.1204, "step": 10224 }, { "epoch": 9.547152194211018, "grad_norm": 2.3232869977884936, "learning_rate": 5.3909641209196616e-08, "loss": 0.0363, "step": 10225 }, { "epoch": 9.548085901027077, "grad_norm": 3.570576704615577, "learning_rate": 5.368841654916579e-08, "loss": 0.152, "step": 10226 }, { "epoch": 9.549019607843137, "grad_norm": 5.523424902301, "learning_rate": 5.346764428364959e-08, "loss": 0.0857, "step": 10227 }, { "epoch": 9.549953314659197, "grad_norm": 1.325668605265214, "learning_rate": 5.32473244328402e-08, "loss": 0.0213, "step": 10228 }, { "epoch": 9.550887021475257, "grad_norm": 1.4871309604454295, "learning_rate": 5.302745701688872e-08, "loss": 0.0134, "step": 10229 }, { "epoch": 9.551820728291316, "grad_norm": 1.2961985309286141, "learning_rate": 5.2808042055903505e-08, "loss": 0.0348, "step": 10230 }, { "epoch": 9.552754435107376, "grad_norm": 1.1638986686074333, "learning_rate": 5.258907956995407e-08, "loss": 0.0128, "step": 10231 }, { "epoch": 9.553688141923436, "grad_norm": 0.7551781728211909, "learning_rate": 5.237056957906495e-08, "loss": 0.0105, "step": 10232 }, { "epoch": 9.554621848739496, "grad_norm": 1.2536679478810953, "learning_rate": 5.215251210322236e-08, "loss": 0.0221, "step": 10233 }, { "epoch": 9.555555555555555, "grad_norm": 2.7542730061876934, "learning_rate": 5.1934907162370374e-08, "loss": 0.0727, "step": 10234 }, { "epoch": 9.556489262371615, "grad_norm": 0.3395267525166138, "learning_rate": 5.171775477641139e-08, "loss": 0.0005, "step": 10235 }, { "epoch": 9.557422969187675, "grad_norm": 1.6458405954072097, "learning_rate": 5.150105496520508e-08, "loss": 0.0187, "step": 10236 }, { "epoch": 9.558356676003735, "grad_norm": 1.6648117159358824, "learning_rate": 5.128480774857225e-08, "loss": 0.0186, "step": 10237 }, { "epoch": 9.559290382819794, "grad_norm": 6.061009327505603, "learning_rate": 5.1069013146290425e-08, "loss": 0.0684, "step": 10238 }, { "epoch": 9.560224089635854, "grad_norm": 3.5328874154340806, "learning_rate": 5.085367117809714e-08, "loss": 0.1156, "step": 10239 }, { "epoch": 9.561157796451914, "grad_norm": 1.7899903839862452, "learning_rate": 5.0638781863687204e-08, "loss": 0.0471, "step": 10240 }, { "epoch": 9.562091503267974, "grad_norm": 2.254418226901614, "learning_rate": 5.042434522271378e-08, "loss": 0.028, "step": 10241 }, { "epoch": 9.563025210084033, "grad_norm": 2.231000564909048, "learning_rate": 5.021036127479173e-08, "loss": 0.0489, "step": 10242 }, { "epoch": 9.563958916900093, "grad_norm": 5.157698980361578, "learning_rate": 4.9996830039489854e-08, "loss": 0.1368, "step": 10243 }, { "epoch": 9.564892623716153, "grad_norm": 3.907987128268807, "learning_rate": 4.978375153633974e-08, "loss": 0.0625, "step": 10244 }, { "epoch": 9.565826330532213, "grad_norm": 4.65701824532361, "learning_rate": 4.9571125784828586e-08, "loss": 0.1115, "step": 10245 }, { "epoch": 9.566760037348272, "grad_norm": 1.0420568280871254, "learning_rate": 4.9358952804403615e-08, "loss": 0.0224, "step": 10246 }, { "epoch": 9.567693744164332, "grad_norm": 1.366091440597342, "learning_rate": 4.9147232614470964e-08, "loss": 0.0214, "step": 10247 }, { "epoch": 9.568627450980392, "grad_norm": 1.6198236624858773, "learning_rate": 4.8935965234394035e-08, "loss": 0.0316, "step": 10248 }, { "epoch": 9.569561157796452, "grad_norm": 0.45455977813123544, "learning_rate": 4.87251506834957e-08, "loss": 0.0038, "step": 10249 }, { "epoch": 9.570494864612511, "grad_norm": 10.482587796385765, "learning_rate": 4.851478898105777e-08, "loss": 0.0349, "step": 10250 }, { "epoch": 9.571428571428571, "grad_norm": 1.3223967631381839, "learning_rate": 4.8304880146319287e-08, "loss": 0.0386, "step": 10251 }, { "epoch": 9.572362278244631, "grad_norm": 0.7811241920256835, "learning_rate": 4.8095424198479345e-08, "loss": 0.0093, "step": 10252 }, { "epoch": 9.57329598506069, "grad_norm": 1.325897851992757, "learning_rate": 4.788642115669539e-08, "loss": 0.0425, "step": 10253 }, { "epoch": 9.57422969187675, "grad_norm": 1.1196854130808354, "learning_rate": 4.767787104008159e-08, "loss": 0.0138, "step": 10254 }, { "epoch": 9.57516339869281, "grad_norm": 1.7820367038880545, "learning_rate": 4.7469773867713784e-08, "loss": 0.0221, "step": 10255 }, { "epoch": 9.57609710550887, "grad_norm": 1.1968042471974811, "learning_rate": 4.7262129658623426e-08, "loss": 0.0159, "step": 10256 }, { "epoch": 9.57703081232493, "grad_norm": 1.966654998658418, "learning_rate": 4.705493843180309e-08, "loss": 0.025, "step": 10257 }, { "epoch": 9.57796451914099, "grad_norm": 3.652250096859087, "learning_rate": 4.684820020620151e-08, "loss": 0.0974, "step": 10258 }, { "epoch": 9.57889822595705, "grad_norm": 1.8650540959879878, "learning_rate": 4.6641915000727453e-08, "loss": 0.0557, "step": 10259 }, { "epoch": 9.579831932773109, "grad_norm": 0.6317865023920115, "learning_rate": 4.6436082834248606e-08, "loss": 0.0106, "step": 10260 }, { "epoch": 9.580765639589169, "grad_norm": 1.2573761152828542, "learning_rate": 4.6230703725590465e-08, "loss": 0.0284, "step": 10261 }, { "epoch": 9.581699346405228, "grad_norm": 1.6006040757235938, "learning_rate": 4.602577769353633e-08, "loss": 0.0224, "step": 10262 }, { "epoch": 9.582633053221288, "grad_norm": 0.5728506378242875, "learning_rate": 4.582130475683011e-08, "loss": 0.0048, "step": 10263 }, { "epoch": 9.583566760037348, "grad_norm": 2.024999664659533, "learning_rate": 4.561728493417239e-08, "loss": 0.0284, "step": 10264 }, { "epoch": 9.584500466853408, "grad_norm": 2.4341900294538465, "learning_rate": 4.5413718244222695e-08, "loss": 0.0791, "step": 10265 }, { "epoch": 9.585434173669467, "grad_norm": 2.9391531733741227, "learning_rate": 4.521060470560057e-08, "loss": 0.0781, "step": 10266 }, { "epoch": 9.586367880485527, "grad_norm": 3.4162335404849578, "learning_rate": 4.5007944336881716e-08, "loss": 0.1155, "step": 10267 }, { "epoch": 9.587301587301587, "grad_norm": 1.3233004838931601, "learning_rate": 4.480573715660241e-08, "loss": 0.0278, "step": 10268 }, { "epoch": 9.588235294117647, "grad_norm": 5.64638142378773, "learning_rate": 4.460398318325676e-08, "loss": 0.0456, "step": 10269 }, { "epoch": 9.589169000933706, "grad_norm": 4.2963106124769554, "learning_rate": 4.4402682435296666e-08, "loss": 0.0605, "step": 10270 }, { "epoch": 9.590102707749766, "grad_norm": 1.7955772564777959, "learning_rate": 4.4201834931134056e-08, "loss": 0.0399, "step": 10271 }, { "epoch": 9.591036414565826, "grad_norm": 3.3810017124645553, "learning_rate": 4.40014406891387e-08, "loss": 0.1197, "step": 10272 }, { "epoch": 9.591970121381886, "grad_norm": 0.8280858274707242, "learning_rate": 4.380149972763814e-08, "loss": 0.0226, "step": 10273 }, { "epoch": 9.592903828197946, "grad_norm": 5.62618790069183, "learning_rate": 4.3602012064919986e-08, "loss": 0.1059, "step": 10274 }, { "epoch": 9.593837535014005, "grad_norm": 2.1402212897373585, "learning_rate": 4.3402977719228524e-08, "loss": 0.0606, "step": 10275 }, { "epoch": 9.594771241830065, "grad_norm": 1.1158895829157485, "learning_rate": 4.320439670876864e-08, "loss": 0.025, "step": 10276 }, { "epoch": 9.595704948646125, "grad_norm": 0.7286915221360266, "learning_rate": 4.3006269051702465e-08, "loss": 0.0063, "step": 10277 }, { "epoch": 9.596638655462185, "grad_norm": 1.3215483002992592, "learning_rate": 4.280859476615051e-08, "loss": 0.0339, "step": 10278 }, { "epoch": 9.597572362278244, "grad_norm": 1.8162739211520735, "learning_rate": 4.261137387019276e-08, "loss": 0.0538, "step": 10279 }, { "epoch": 9.598506069094304, "grad_norm": 0.6234052882767487, "learning_rate": 4.241460638186701e-08, "loss": 0.0057, "step": 10280 }, { "epoch": 9.599439775910364, "grad_norm": 3.923025645644976, "learning_rate": 4.2218292319169964e-08, "loss": 0.0582, "step": 10281 }, { "epoch": 9.600373482726424, "grad_norm": 2.4300943347251556, "learning_rate": 4.202243170005671e-08, "loss": 0.0506, "step": 10282 }, { "epoch": 9.601307189542483, "grad_norm": 0.8399027559744879, "learning_rate": 4.1827024542440696e-08, "loss": 0.0234, "step": 10283 }, { "epoch": 9.602240896358543, "grad_norm": 1.6890495239139696, "learning_rate": 4.163207086419374e-08, "loss": 0.0319, "step": 10284 }, { "epoch": 9.603174603174603, "grad_norm": 0.37900806995993996, "learning_rate": 4.143757068314713e-08, "loss": 0.0025, "step": 10285 }, { "epoch": 9.604108309990663, "grad_norm": 2.5501026104344775, "learning_rate": 4.124352401708998e-08, "loss": 0.0593, "step": 10286 }, { "epoch": 9.605042016806722, "grad_norm": 0.8886994023807835, "learning_rate": 4.104993088376974e-08, "loss": 0.0149, "step": 10287 }, { "epoch": 9.605975723622782, "grad_norm": 1.8114236956606113, "learning_rate": 4.0856791300892264e-08, "loss": 0.0421, "step": 10288 }, { "epoch": 9.606909430438842, "grad_norm": 1.5126022961001664, "learning_rate": 4.0664105286122856e-08, "loss": 0.031, "step": 10289 }, { "epoch": 9.607843137254902, "grad_norm": 2.2598694052730877, "learning_rate": 4.0471872857084646e-08, "loss": 0.0451, "step": 10290 }, { "epoch": 9.608776844070961, "grad_norm": 1.7650521002992785, "learning_rate": 4.0280094031359685e-08, "loss": 0.0238, "step": 10291 }, { "epoch": 9.609710550887021, "grad_norm": 9.664651487699228, "learning_rate": 4.008876882648782e-08, "loss": 0.1272, "step": 10292 }, { "epoch": 9.61064425770308, "grad_norm": 0.4405530297478893, "learning_rate": 3.9897897259967846e-08, "loss": 0.0081, "step": 10293 }, { "epoch": 9.61157796451914, "grad_norm": 0.4353067192933501, "learning_rate": 3.970747934925689e-08, "loss": 0.0084, "step": 10294 }, { "epoch": 9.6125116713352, "grad_norm": 1.219840203527593, "learning_rate": 3.9517515111771044e-08, "loss": 0.0137, "step": 10295 }, { "epoch": 9.61344537815126, "grad_norm": 2.6329223048152604, "learning_rate": 3.932800456488528e-08, "loss": 0.0888, "step": 10296 }, { "epoch": 9.61437908496732, "grad_norm": 2.3920549238454303, "learning_rate": 3.9138947725930745e-08, "loss": 0.0409, "step": 10297 }, { "epoch": 9.61531279178338, "grad_norm": 1.654423615202468, "learning_rate": 3.895034461220082e-08, "loss": 0.0339, "step": 10298 }, { "epoch": 9.61624649859944, "grad_norm": 4.139088959335237, "learning_rate": 3.876219524094338e-08, "loss": 0.1072, "step": 10299 }, { "epoch": 9.6171802054155, "grad_norm": 1.8514099548732783, "learning_rate": 3.8574499629368544e-08, "loss": 0.0312, "step": 10300 }, { "epoch": 9.618113912231559, "grad_norm": 1.8292271754775802, "learning_rate": 3.8387257794641475e-08, "loss": 0.0354, "step": 10301 }, { "epoch": 9.619047619047619, "grad_norm": 0.5582078645800052, "learning_rate": 3.820046975388847e-08, "loss": 0.0042, "step": 10302 }, { "epoch": 9.619981325863678, "grad_norm": 0.869136477482427, "learning_rate": 3.801413552419364e-08, "loss": 0.0135, "step": 10303 }, { "epoch": 9.620915032679738, "grad_norm": 3.248597296721247, "learning_rate": 3.7828255122598356e-08, "loss": 0.045, "step": 10304 }, { "epoch": 9.621848739495798, "grad_norm": 1.5813811044552124, "learning_rate": 3.7642828566104014e-08, "loss": 0.0141, "step": 10305 }, { "epoch": 9.622782446311858, "grad_norm": 0.4733217512406045, "learning_rate": 3.745785587166928e-08, "loss": 0.011, "step": 10306 }, { "epoch": 9.623716153127917, "grad_norm": 1.9229075355660348, "learning_rate": 3.7273337056213385e-08, "loss": 0.0495, "step": 10307 }, { "epoch": 9.624649859943977, "grad_norm": 0.6468574077474318, "learning_rate": 3.708927213661118e-08, "loss": 0.0098, "step": 10308 }, { "epoch": 9.625583566760037, "grad_norm": 0.7998480441036919, "learning_rate": 3.6905661129697514e-08, "loss": 0.0128, "step": 10309 }, { "epoch": 9.626517273576097, "grad_norm": 2.089715881506058, "learning_rate": 3.6722504052266186e-08, "loss": 0.0334, "step": 10310 }, { "epoch": 9.627450980392156, "grad_norm": 0.4885379732533189, "learning_rate": 3.653980092106879e-08, "loss": 0.0072, "step": 10311 }, { "epoch": 9.628384687208216, "grad_norm": 1.0697733591391738, "learning_rate": 3.635755175281586e-08, "loss": 0.0168, "step": 10312 }, { "epoch": 9.629318394024276, "grad_norm": 1.4182969168953845, "learning_rate": 3.6175756564175156e-08, "loss": 0.0374, "step": 10313 }, { "epoch": 9.630252100840336, "grad_norm": 1.6021748785353687, "learning_rate": 3.599441537177451e-08, "loss": 0.0255, "step": 10314 }, { "epoch": 9.631185807656395, "grad_norm": 0.17168849010277357, "learning_rate": 3.581352819219952e-08, "loss": 0.0018, "step": 10315 }, { "epoch": 9.632119514472455, "grad_norm": 2.387356022666681, "learning_rate": 3.563309504199475e-08, "loss": 0.0614, "step": 10316 }, { "epoch": 9.633053221288515, "grad_norm": 8.993494107368555, "learning_rate": 3.545311593766143e-08, "loss": 0.1101, "step": 10317 }, { "epoch": 9.633986928104575, "grad_norm": 4.408499369661456, "learning_rate": 3.527359089566196e-08, "loss": 0.0897, "step": 10318 }, { "epoch": 9.634920634920634, "grad_norm": 0.461505915531967, "learning_rate": 3.5094519932415417e-08, "loss": 0.0044, "step": 10319 }, { "epoch": 9.635854341736694, "grad_norm": 0.9821646245475787, "learning_rate": 3.491590306429926e-08, "loss": 0.0123, "step": 10320 }, { "epoch": 9.636788048552754, "grad_norm": 3.394054100919564, "learning_rate": 3.473774030765098e-08, "loss": 0.0609, "step": 10321 }, { "epoch": 9.637721755368814, "grad_norm": 2.607299506503014, "learning_rate": 3.456003167876476e-08, "loss": 0.0268, "step": 10322 }, { "epoch": 9.638655462184873, "grad_norm": 6.384035836755278, "learning_rate": 3.438277719389427e-08, "loss": 0.16, "step": 10323 }, { "epoch": 9.639589169000933, "grad_norm": 2.060023087758914, "learning_rate": 3.4205976869251536e-08, "loss": 0.0552, "step": 10324 }, { "epoch": 9.640522875816993, "grad_norm": 4.344782482823829, "learning_rate": 3.4029630721006954e-08, "loss": 0.107, "step": 10325 }, { "epoch": 9.641456582633053, "grad_norm": 2.2813576125639887, "learning_rate": 3.3853738765288745e-08, "loss": 0.0425, "step": 10326 }, { "epoch": 9.642390289449112, "grad_norm": 0.7959698990075035, "learning_rate": 3.367830101818459e-08, "loss": 0.0174, "step": 10327 }, { "epoch": 9.643323996265172, "grad_norm": 1.9146736194918, "learning_rate": 3.3503317495739986e-08, "loss": 0.0359, "step": 10328 }, { "epoch": 9.644257703081232, "grad_norm": 1.5574593907998877, "learning_rate": 3.332878821395935e-08, "loss": 0.0271, "step": 10329 }, { "epoch": 9.645191409897292, "grad_norm": 2.565772085017674, "learning_rate": 3.315471318880492e-08, "loss": 0.0414, "step": 10330 }, { "epoch": 9.646125116713351, "grad_norm": 3.37652394241392, "learning_rate": 3.29810924361984e-08, "loss": 0.0199, "step": 10331 }, { "epoch": 9.647058823529411, "grad_norm": 2.2622629580501887, "learning_rate": 3.2807925972018764e-08, "loss": 0.0418, "step": 10332 }, { "epoch": 9.647992530345471, "grad_norm": 4.776476799025486, "learning_rate": 3.263521381210444e-08, "loss": 0.1064, "step": 10333 }, { "epoch": 9.64892623716153, "grad_norm": 1.8748005208554614, "learning_rate": 3.246295597225169e-08, "loss": 0.0355, "step": 10334 }, { "epoch": 9.64985994397759, "grad_norm": 1.7593397277822183, "learning_rate": 3.229115246821457e-08, "loss": 0.0239, "step": 10335 }, { "epoch": 9.65079365079365, "grad_norm": 1.3802002664907573, "learning_rate": 3.2119803315707185e-08, "loss": 0.0158, "step": 10336 }, { "epoch": 9.65172735760971, "grad_norm": 5.439422160720721, "learning_rate": 3.1948908530401445e-08, "loss": 0.1236, "step": 10337 }, { "epoch": 9.65266106442577, "grad_norm": 2.2208277957257603, "learning_rate": 3.177846812792762e-08, "loss": 0.0512, "step": 10338 }, { "epoch": 9.65359477124183, "grad_norm": 2.154244218598858, "learning_rate": 3.160848212387379e-08, "loss": 0.0418, "step": 10339 }, { "epoch": 9.65452847805789, "grad_norm": 4.696663188272713, "learning_rate": 3.143895053378698e-08, "loss": 0.1253, "step": 10340 }, { "epoch": 9.655462184873949, "grad_norm": 2.1825621897243925, "learning_rate": 3.126987337317366e-08, "loss": 0.023, "step": 10341 }, { "epoch": 9.656395891690009, "grad_norm": 1.2710725014849789, "learning_rate": 3.1101250657497025e-08, "loss": 0.0333, "step": 10342 }, { "epoch": 9.657329598506069, "grad_norm": 7.96176030498925, "learning_rate": 3.093308240217918e-08, "loss": 0.0345, "step": 10343 }, { "epoch": 9.658263305322128, "grad_norm": 2.9191180627182747, "learning_rate": 3.076536862260171e-08, "loss": 0.0582, "step": 10344 }, { "epoch": 9.659197012138188, "grad_norm": 3.524589082607336, "learning_rate": 3.0598109334103456e-08, "loss": 0.0604, "step": 10345 }, { "epoch": 9.660130718954248, "grad_norm": 0.16655685287559513, "learning_rate": 3.0431304551982176e-08, "loss": 0.0012, "step": 10346 }, { "epoch": 9.661064425770308, "grad_norm": 1.001629959514637, "learning_rate": 3.0264954291494006e-08, "loss": 0.0212, "step": 10347 }, { "epoch": 9.661998132586367, "grad_norm": 3.579770348847017, "learning_rate": 3.0099058567854e-08, "loss": 0.0769, "step": 10348 }, { "epoch": 9.662931839402427, "grad_norm": 1.5019245040461926, "learning_rate": 2.993361739623446e-08, "loss": 0.0282, "step": 10349 }, { "epoch": 9.663865546218487, "grad_norm": 5.885531390853619, "learning_rate": 2.9768630791767172e-08, "loss": 0.2319, "step": 10350 }, { "epoch": 9.664799253034547, "grad_norm": 1.9869193327669106, "learning_rate": 2.9604098769541734e-08, "loss": 0.0369, "step": 10351 }, { "epoch": 9.665732959850606, "grad_norm": 0.1842855406899698, "learning_rate": 2.9440021344606108e-08, "loss": 0.0014, "step": 10352 }, { "epoch": 9.666666666666666, "grad_norm": 0.4582179038105022, "learning_rate": 2.927639853196773e-08, "loss": 0.012, "step": 10353 }, { "epoch": 9.667600373482726, "grad_norm": 1.9591856833910961, "learning_rate": 2.9113230346591858e-08, "loss": 0.0202, "step": 10354 }, { "epoch": 9.668534080298786, "grad_norm": 3.1806287767769037, "learning_rate": 2.8950516803400997e-08, "loss": 0.0703, "step": 10355 }, { "epoch": 9.669467787114845, "grad_norm": 1.3812170232241932, "learning_rate": 2.878825791727713e-08, "loss": 0.0269, "step": 10356 }, { "epoch": 9.670401493930905, "grad_norm": 2.6837680426926873, "learning_rate": 2.8626453703061718e-08, "loss": 0.074, "step": 10357 }, { "epoch": 9.671335200746965, "grad_norm": 2.0543162661085304, "learning_rate": 2.8465104175552926e-08, "loss": 0.0254, "step": 10358 }, { "epoch": 9.672268907563025, "grad_norm": 0.7535169268833602, "learning_rate": 2.8304209349507837e-08, "loss": 0.0073, "step": 10359 }, { "epoch": 9.673202614379084, "grad_norm": 4.555893186293682, "learning_rate": 2.8143769239641906e-08, "loss": 0.0902, "step": 10360 }, { "epoch": 9.674136321195144, "grad_norm": 2.1724069824786736, "learning_rate": 2.7983783860629498e-08, "loss": 0.0411, "step": 10361 }, { "epoch": 9.675070028011204, "grad_norm": 2.51310331135802, "learning_rate": 2.7824253227102804e-08, "loss": 0.0826, "step": 10362 }, { "epoch": 9.676003734827264, "grad_norm": 6.286170277722909, "learning_rate": 2.7665177353652928e-08, "loss": 0.0753, "step": 10363 }, { "epoch": 9.676937441643323, "grad_norm": 2.227592616435244, "learning_rate": 2.7506556254828787e-08, "loss": 0.0459, "step": 10364 }, { "epoch": 9.677871148459383, "grad_norm": 3.671751998098772, "learning_rate": 2.734838994513822e-08, "loss": 0.1018, "step": 10365 }, { "epoch": 9.678804855275443, "grad_norm": 0.6512032530903697, "learning_rate": 2.719067843904688e-08, "loss": 0.0105, "step": 10366 }, { "epoch": 9.679738562091503, "grad_norm": 3.4887278185857213, "learning_rate": 2.7033421750979894e-08, "loss": 0.0758, "step": 10367 }, { "epoch": 9.680672268907562, "grad_norm": 1.1430380775908957, "learning_rate": 2.6876619895319646e-08, "loss": 0.0186, "step": 10368 }, { "epoch": 9.681605975723622, "grad_norm": 0.1488655197688938, "learning_rate": 2.6720272886407993e-08, "loss": 0.0009, "step": 10369 }, { "epoch": 9.682539682539682, "grad_norm": 2.303135109676665, "learning_rate": 2.6564380738543506e-08, "loss": 0.0301, "step": 10370 }, { "epoch": 9.683473389355742, "grad_norm": 3.266834897663028, "learning_rate": 2.640894346598477e-08, "loss": 0.0973, "step": 10371 }, { "epoch": 9.684407096171801, "grad_norm": 2.4401333854846823, "learning_rate": 2.6253961082948754e-08, "loss": 0.0418, "step": 10372 }, { "epoch": 9.685340802987861, "grad_norm": 4.914124348758536, "learning_rate": 2.609943360360967e-08, "loss": 0.047, "step": 10373 }, { "epoch": 9.686274509803921, "grad_norm": 3.155347703960585, "learning_rate": 2.594536104210066e-08, "loss": 0.0901, "step": 10374 }, { "epoch": 9.68720821661998, "grad_norm": 4.269424546968033, "learning_rate": 2.579174341251378e-08, "loss": 0.092, "step": 10375 }, { "epoch": 9.68814192343604, "grad_norm": 6.581686288282003, "learning_rate": 2.563858072889891e-08, "loss": 0.0578, "step": 10376 }, { "epoch": 9.6890756302521, "grad_norm": 3.8323197169514347, "learning_rate": 2.5485873005264283e-08, "loss": 0.0864, "step": 10377 }, { "epoch": 9.69000933706816, "grad_norm": 1.4374202317909075, "learning_rate": 2.5333620255577063e-08, "loss": 0.0319, "step": 10378 }, { "epoch": 9.69094304388422, "grad_norm": 0.5594907869021578, "learning_rate": 2.518182249376222e-08, "loss": 0.005, "step": 10379 }, { "epoch": 9.69187675070028, "grad_norm": 3.856745824005609, "learning_rate": 2.5030479733703648e-08, "loss": 0.089, "step": 10380 }, { "epoch": 9.69281045751634, "grad_norm": 1.8545390150077588, "learning_rate": 2.48795919892425e-08, "loss": 0.0244, "step": 10381 }, { "epoch": 9.693744164332399, "grad_norm": 0.8721601602267489, "learning_rate": 2.4729159274179404e-08, "loss": 0.0131, "step": 10382 }, { "epoch": 9.694677871148459, "grad_norm": 2.465334831994951, "learning_rate": 2.4579181602273906e-08, "loss": 0.0366, "step": 10383 }, { "epoch": 9.695611577964518, "grad_norm": 2.91059906427783, "learning_rate": 2.4429658987242256e-08, "loss": 0.0459, "step": 10384 }, { "epoch": 9.696545284780578, "grad_norm": 2.298822566211855, "learning_rate": 2.428059144276018e-08, "loss": 0.0499, "step": 10385 }, { "epoch": 9.697478991596638, "grad_norm": 0.5360011121272765, "learning_rate": 2.4131978982461224e-08, "loss": 0.004, "step": 10386 }, { "epoch": 9.698412698412698, "grad_norm": 0.49547874440018724, "learning_rate": 2.3983821619937843e-08, "loss": 0.0077, "step": 10387 }, { "epoch": 9.699346405228757, "grad_norm": 0.8446056170796231, "learning_rate": 2.3836119368740862e-08, "loss": 0.0152, "step": 10388 }, { "epoch": 9.700280112044817, "grad_norm": 3.641247074846913, "learning_rate": 2.368887224237948e-08, "loss": 0.0597, "step": 10389 }, { "epoch": 9.701213818860877, "grad_norm": 1.0293272186242473, "learning_rate": 2.35420802543207e-08, "loss": 0.0147, "step": 10390 }, { "epoch": 9.702147525676937, "grad_norm": 2.6459218133727327, "learning_rate": 2.3395743417989892e-08, "loss": 0.0468, "step": 10391 }, { "epoch": 9.703081232492996, "grad_norm": 0.38956079338304767, "learning_rate": 2.3249861746771906e-08, "loss": 0.0036, "step": 10392 }, { "epoch": 9.704014939309056, "grad_norm": 6.274393831656778, "learning_rate": 2.3104435254008852e-08, "loss": 0.2091, "step": 10393 }, { "epoch": 9.704948646125116, "grad_norm": 2.8291499910516937, "learning_rate": 2.2959463953001748e-08, "loss": 0.0605, "step": 10394 }, { "epoch": 9.705882352941176, "grad_norm": 3.4636282515669823, "learning_rate": 2.281494785700944e-08, "loss": 0.0764, "step": 10395 }, { "epoch": 9.706816059757235, "grad_norm": 8.735252879822566, "learning_rate": 2.267088697925024e-08, "loss": 0.1871, "step": 10396 }, { "epoch": 9.707749766573295, "grad_norm": 4.891413119465712, "learning_rate": 2.2527281332899166e-08, "loss": 0.0952, "step": 10397 }, { "epoch": 9.708683473389355, "grad_norm": 0.7508166178783208, "learning_rate": 2.238413093109071e-08, "loss": 0.0129, "step": 10398 }, { "epoch": 9.709617180205415, "grad_norm": 3.132691768619022, "learning_rate": 2.2241435786918285e-08, "loss": 0.1087, "step": 10399 }, { "epoch": 9.710550887021475, "grad_norm": 0.126714906599592, "learning_rate": 2.209919591343257e-08, "loss": 0.0008, "step": 10400 }, { "epoch": 9.711484593837534, "grad_norm": 1.9458676583713, "learning_rate": 2.1957411323642595e-08, "loss": 0.0247, "step": 10401 }, { "epoch": 9.712418300653594, "grad_norm": 1.72367341610336, "learning_rate": 2.181608203051633e-08, "loss": 0.0444, "step": 10402 }, { "epoch": 9.713352007469654, "grad_norm": 1.4671496192819111, "learning_rate": 2.167520804698009e-08, "loss": 0.0298, "step": 10403 }, { "epoch": 9.714285714285714, "grad_norm": 3.3655471110904633, "learning_rate": 2.153478938591802e-08, "loss": 0.0553, "step": 10404 }, { "epoch": 9.715219421101773, "grad_norm": 0.2943792388151257, "learning_rate": 2.139482606017318e-08, "loss": 0.0019, "step": 10405 }, { "epoch": 9.716153127917833, "grad_norm": 1.668146460378729, "learning_rate": 2.1255318082546993e-08, "loss": 0.0326, "step": 10406 }, { "epoch": 9.717086834733893, "grad_norm": 3.1975018293931963, "learning_rate": 2.111626546579815e-08, "loss": 0.0766, "step": 10407 }, { "epoch": 9.718020541549953, "grad_norm": 2.320143064998455, "learning_rate": 2.0977668222645354e-08, "loss": 0.0445, "step": 10408 }, { "epoch": 9.718954248366012, "grad_norm": 4.9064325763750185, "learning_rate": 2.0839526365764584e-08, "loss": 0.107, "step": 10409 }, { "epoch": 9.719887955182072, "grad_norm": 0.5896390813072346, "learning_rate": 2.0701839907790734e-08, "loss": 0.0093, "step": 10410 }, { "epoch": 9.720821661998132, "grad_norm": 1.1903732351145402, "learning_rate": 2.0564608861315948e-08, "loss": 0.0234, "step": 10411 }, { "epoch": 9.721755368814192, "grad_norm": 1.673832438742421, "learning_rate": 2.0427833238892414e-08, "loss": 0.0267, "step": 10412 }, { "epoch": 9.722689075630251, "grad_norm": 0.6104037830916329, "learning_rate": 2.029151305302901e-08, "loss": 0.007, "step": 10413 }, { "epoch": 9.723622782446311, "grad_norm": 1.2301596774291936, "learning_rate": 2.01556483161941e-08, "loss": 0.0077, "step": 10414 }, { "epoch": 9.72455648926237, "grad_norm": 3.329740304706858, "learning_rate": 2.0020239040814405e-08, "loss": 0.0926, "step": 10415 }, { "epoch": 9.72549019607843, "grad_norm": 1.2841508810232845, "learning_rate": 1.9885285239273355e-08, "loss": 0.0115, "step": 10416 }, { "epoch": 9.72642390289449, "grad_norm": 1.2215310538133874, "learning_rate": 1.975078692391552e-08, "loss": 0.0109, "step": 10417 }, { "epoch": 9.72735760971055, "grad_norm": 7.811793757718027, "learning_rate": 1.9616744107041063e-08, "loss": 0.1449, "step": 10418 }, { "epoch": 9.72829131652661, "grad_norm": 1.0993810866637772, "learning_rate": 1.9483156800910173e-08, "loss": 0.0139, "step": 10419 }, { "epoch": 9.72922502334267, "grad_norm": 5.969199248795865, "learning_rate": 1.935002501774086e-08, "loss": 0.0184, "step": 10420 }, { "epoch": 9.73015873015873, "grad_norm": 0.69376776415653, "learning_rate": 1.9217348769709488e-08, "loss": 0.0173, "step": 10421 }, { "epoch": 9.731092436974789, "grad_norm": 1.0425464945676806, "learning_rate": 1.9085128068950242e-08, "loss": 0.0155, "step": 10422 }, { "epoch": 9.732026143790849, "grad_norm": 10.915218491284035, "learning_rate": 1.895336292755734e-08, "loss": 0.1645, "step": 10423 }, { "epoch": 9.732959850606909, "grad_norm": 2.118941178535701, "learning_rate": 1.8822053357580582e-08, "loss": 0.0458, "step": 10424 }, { "epoch": 9.733893557422968, "grad_norm": 2.281805322939473, "learning_rate": 1.869119937103092e-08, "loss": 0.0479, "step": 10425 }, { "epoch": 9.73482726423903, "grad_norm": 0.2920153156204195, "learning_rate": 1.8560800979876002e-08, "loss": 0.0009, "step": 10426 }, { "epoch": 9.73576097105509, "grad_norm": 8.66938488589981, "learning_rate": 1.84308581960424e-08, "loss": 0.1328, "step": 10427 }, { "epoch": 9.73669467787115, "grad_norm": 3.187105449597211, "learning_rate": 1.830137103141394e-08, "loss": 0.0505, "step": 10428 }, { "epoch": 9.73762838468721, "grad_norm": 2.6157465609960884, "learning_rate": 1.8172339497835033e-08, "loss": 0.0479, "step": 10429 }, { "epoch": 9.738562091503269, "grad_norm": 6.13349804010214, "learning_rate": 1.804376360710569e-08, "loss": 0.0882, "step": 10430 }, { "epoch": 9.739495798319329, "grad_norm": 0.7516596315435243, "learning_rate": 1.7915643370986503e-08, "loss": 0.0098, "step": 10431 }, { "epoch": 9.740429505135388, "grad_norm": 0.7578093769537075, "learning_rate": 1.778797880119476e-08, "loss": 0.012, "step": 10432 }, { "epoch": 9.741363211951448, "grad_norm": 1.8267249491996307, "learning_rate": 1.7660769909407237e-08, "loss": 0.0281, "step": 10433 }, { "epoch": 9.742296918767508, "grad_norm": 1.4065190440393764, "learning_rate": 1.7534016707259073e-08, "loss": 0.0201, "step": 10434 }, { "epoch": 9.743230625583568, "grad_norm": 2.3314234793704385, "learning_rate": 1.74077192063421e-08, "loss": 0.0497, "step": 10435 }, { "epoch": 9.744164332399627, "grad_norm": 1.738608572241168, "learning_rate": 1.7281877418208194e-08, "loss": 0.0318, "step": 10436 }, { "epoch": 9.745098039215687, "grad_norm": 0.8784615949106391, "learning_rate": 1.7156491354367032e-08, "loss": 0.0117, "step": 10437 }, { "epoch": 9.746031746031747, "grad_norm": 1.9139937330827401, "learning_rate": 1.703156102628667e-08, "loss": 0.0449, "step": 10438 }, { "epoch": 9.746965452847807, "grad_norm": 4.801923701548447, "learning_rate": 1.6907086445393516e-08, "loss": 0.1524, "step": 10439 }, { "epoch": 9.747899159663866, "grad_norm": 3.3140142441266804, "learning_rate": 1.678306762307125e-08, "loss": 0.0643, "step": 10440 }, { "epoch": 9.748832866479926, "grad_norm": 0.4331561172034099, "learning_rate": 1.665950457066301e-08, "loss": 0.0049, "step": 10441 }, { "epoch": 9.749766573295986, "grad_norm": 4.62260074160636, "learning_rate": 1.6536397299470875e-08, "loss": 0.0908, "step": 10442 }, { "epoch": 9.750700280112046, "grad_norm": 3.656165537933261, "learning_rate": 1.6413745820753613e-08, "loss": 0.0487, "step": 10443 }, { "epoch": 9.751633986928105, "grad_norm": 1.6988757607982123, "learning_rate": 1.629155014572892e-08, "loss": 0.0074, "step": 10444 }, { "epoch": 9.752567693744165, "grad_norm": 1.1590060448239292, "learning_rate": 1.6169810285573407e-08, "loss": 0.0352, "step": 10445 }, { "epoch": 9.753501400560225, "grad_norm": 2.7282299211812746, "learning_rate": 1.6048526251421502e-08, "loss": 0.0886, "step": 10446 }, { "epoch": 9.754435107376285, "grad_norm": 0.47016353782840714, "learning_rate": 1.5927698054365448e-08, "loss": 0.0053, "step": 10447 }, { "epoch": 9.755368814192344, "grad_norm": 2.427006003306592, "learning_rate": 1.5807325705456954e-08, "loss": 0.039, "step": 10448 }, { "epoch": 9.756302521008404, "grad_norm": 1.7758862814584062, "learning_rate": 1.5687409215704995e-08, "loss": 0.0177, "step": 10449 }, { "epoch": 9.757236227824464, "grad_norm": 9.424303422072136, "learning_rate": 1.5567948596077464e-08, "loss": 0.2272, "step": 10450 }, { "epoch": 9.758169934640524, "grad_norm": 0.6601559804244916, "learning_rate": 1.5448943857500064e-08, "loss": 0.0035, "step": 10451 }, { "epoch": 9.759103641456583, "grad_norm": 3.9951142993325073, "learning_rate": 1.5330395010857423e-08, "loss": 0.0884, "step": 10452 }, { "epoch": 9.760037348272643, "grad_norm": 2.4349752052996636, "learning_rate": 1.521230206699198e-08, "loss": 0.0453, "step": 10453 }, { "epoch": 9.760971055088703, "grad_norm": 1.5498673431015912, "learning_rate": 1.5094665036704538e-08, "loss": 0.0311, "step": 10454 }, { "epoch": 9.761904761904763, "grad_norm": 0.7876657454820467, "learning_rate": 1.4977483930754267e-08, "loss": 0.0045, "step": 10455 }, { "epoch": 9.762838468720823, "grad_norm": 1.705528880701993, "learning_rate": 1.4860758759858707e-08, "loss": 0.0205, "step": 10456 }, { "epoch": 9.763772175536882, "grad_norm": 6.313433415291317, "learning_rate": 1.4744489534693762e-08, "loss": 0.1393, "step": 10457 }, { "epoch": 9.764705882352942, "grad_norm": 1.7760201385503775, "learning_rate": 1.4628676265893705e-08, "loss": 0.0386, "step": 10458 }, { "epoch": 9.765639589169002, "grad_norm": 0.4187715766349729, "learning_rate": 1.451331896405006e-08, "loss": 0.0065, "step": 10459 }, { "epoch": 9.766573295985062, "grad_norm": 2.8811733163640456, "learning_rate": 1.4398417639714946e-08, "loss": 0.0732, "step": 10460 }, { "epoch": 9.767507002801121, "grad_norm": 1.2648703179219836, "learning_rate": 1.4283972303396065e-08, "loss": 0.0378, "step": 10461 }, { "epoch": 9.768440709617181, "grad_norm": 3.658238636194689, "learning_rate": 1.4169982965561713e-08, "loss": 0.0693, "step": 10462 }, { "epoch": 9.76937441643324, "grad_norm": 5.3887954556875695, "learning_rate": 1.4056449636636327e-08, "loss": 0.1176, "step": 10463 }, { "epoch": 9.7703081232493, "grad_norm": 2.3833719742172486, "learning_rate": 1.3943372327004933e-08, "loss": 0.0336, "step": 10464 }, { "epoch": 9.77124183006536, "grad_norm": 0.37533408772700305, "learning_rate": 1.3830751047009261e-08, "loss": 0.0061, "step": 10465 }, { "epoch": 9.77217553688142, "grad_norm": 2.562674934749141, "learning_rate": 1.3718585806949403e-08, "loss": 0.0587, "step": 10466 }, { "epoch": 9.77310924369748, "grad_norm": 2.3261716992440022, "learning_rate": 1.3606876617084374e-08, "loss": 0.0195, "step": 10467 }, { "epoch": 9.77404295051354, "grad_norm": 0.6883846568710386, "learning_rate": 1.3495623487631004e-08, "loss": 0.0102, "step": 10468 }, { "epoch": 9.7749766573296, "grad_norm": 1.525647888321298, "learning_rate": 1.3384826428765042e-08, "loss": 0.0253, "step": 10469 }, { "epoch": 9.775910364145659, "grad_norm": 2.5484480797727427, "learning_rate": 1.3274485450620045e-08, "loss": 0.0334, "step": 10470 }, { "epoch": 9.776844070961719, "grad_norm": 3.216788842110152, "learning_rate": 1.316460056328739e-08, "loss": 0.0709, "step": 10471 }, { "epoch": 9.777777777777779, "grad_norm": 0.9866881401697937, "learning_rate": 1.3055171776817921e-08, "loss": 0.0178, "step": 10472 }, { "epoch": 9.778711484593838, "grad_norm": 2.654220788228607, "learning_rate": 1.2946199101219747e-08, "loss": 0.0724, "step": 10473 }, { "epoch": 9.779645191409898, "grad_norm": 5.863604570982279, "learning_rate": 1.283768254645934e-08, "loss": 0.1467, "step": 10474 }, { "epoch": 9.780578898225958, "grad_norm": 0.3814771693991433, "learning_rate": 1.2729622122462094e-08, "loss": 0.0037, "step": 10475 }, { "epoch": 9.781512605042018, "grad_norm": 0.704913529435652, "learning_rate": 1.2622017839111766e-08, "loss": 0.0092, "step": 10476 }, { "epoch": 9.782446311858077, "grad_norm": 2.714763580237197, "learning_rate": 1.2514869706248822e-08, "loss": 0.0657, "step": 10477 }, { "epoch": 9.783380018674137, "grad_norm": 1.5246528675502498, "learning_rate": 1.2408177733674309e-08, "loss": 0.0491, "step": 10478 }, { "epoch": 9.784313725490197, "grad_norm": 3.241551112779367, "learning_rate": 1.2301941931145423e-08, "loss": 0.0766, "step": 10479 }, { "epoch": 9.785247432306257, "grad_norm": 4.357915993362448, "learning_rate": 1.2196162308379389e-08, "loss": 0.0698, "step": 10480 }, { "epoch": 9.786181139122316, "grad_norm": 4.824191009644966, "learning_rate": 1.2090838875050137e-08, "loss": 0.1002, "step": 10481 }, { "epoch": 9.787114845938376, "grad_norm": 4.581582335486214, "learning_rate": 1.1985971640791072e-08, "loss": 0.1342, "step": 10482 }, { "epoch": 9.788048552754436, "grad_norm": 3.75964444708345, "learning_rate": 1.1881560615193965e-08, "loss": 0.1015, "step": 10483 }, { "epoch": 9.788982259570496, "grad_norm": 2.6298088218936684, "learning_rate": 1.1777605807807291e-08, "loss": 0.0511, "step": 10484 }, { "epoch": 9.789915966386555, "grad_norm": 0.9065571218221217, "learning_rate": 1.1674107228139553e-08, "loss": 0.0152, "step": 10485 }, { "epoch": 9.790849673202615, "grad_norm": 3.616016932438248, "learning_rate": 1.1571064885656514e-08, "loss": 0.1057, "step": 10486 }, { "epoch": 9.791783380018675, "grad_norm": 1.2873401580969428, "learning_rate": 1.1468478789782855e-08, "loss": 0.0228, "step": 10487 }, { "epoch": 9.792717086834735, "grad_norm": 1.1540062463164025, "learning_rate": 1.1366348949900519e-08, "loss": 0.0138, "step": 10488 }, { "epoch": 9.793650793650794, "grad_norm": 1.289032987407825, "learning_rate": 1.1264675375351475e-08, "loss": 0.0232, "step": 10489 }, { "epoch": 9.794584500466854, "grad_norm": 0.7999587893850897, "learning_rate": 1.1163458075434398e-08, "loss": 0.0114, "step": 10490 }, { "epoch": 9.795518207282914, "grad_norm": 3.045532670458975, "learning_rate": 1.1062697059406324e-08, "loss": 0.0349, "step": 10491 }, { "epoch": 9.796451914098974, "grad_norm": 0.7422284867650374, "learning_rate": 1.0962392336483218e-08, "loss": 0.0023, "step": 10492 }, { "epoch": 9.797385620915033, "grad_norm": 6.855978401207725, "learning_rate": 1.0862543915839408e-08, "loss": 0.0574, "step": 10493 }, { "epoch": 9.798319327731093, "grad_norm": 3.698818086917235, "learning_rate": 1.0763151806607031e-08, "loss": 0.1159, "step": 10494 }, { "epoch": 9.799253034547153, "grad_norm": 4.022009141460908, "learning_rate": 1.066421601787604e-08, "loss": 0.0907, "step": 10495 }, { "epoch": 9.800186741363213, "grad_norm": 2.3688859541936185, "learning_rate": 1.0565736558695306e-08, "loss": 0.0289, "step": 10496 }, { "epoch": 9.801120448179272, "grad_norm": 3.776598113606073, "learning_rate": 1.0467713438072069e-08, "loss": 0.1058, "step": 10497 }, { "epoch": 9.802054154995332, "grad_norm": 0.21053028460831016, "learning_rate": 1.0370146664971937e-08, "loss": 0.0021, "step": 10498 }, { "epoch": 9.802987861811392, "grad_norm": 2.897084349735043, "learning_rate": 1.0273036248318325e-08, "loss": 0.0529, "step": 10499 }, { "epoch": 9.803921568627452, "grad_norm": 4.770892440740825, "learning_rate": 1.0176382196993017e-08, "loss": 0.091, "step": 10500 }, { "epoch": 9.804855275443511, "grad_norm": 0.7017378100163384, "learning_rate": 1.0080184519835057e-08, "loss": 0.0126, "step": 10501 }, { "epoch": 9.805788982259571, "grad_norm": 3.5299776182249913, "learning_rate": 9.984443225644625e-09, "loss": 0.1027, "step": 10502 }, { "epoch": 9.806722689075631, "grad_norm": 2.2078687877631964, "learning_rate": 9.88915832317694e-09, "loss": 0.0517, "step": 10503 }, { "epoch": 9.80765639589169, "grad_norm": 1.1908902014142162, "learning_rate": 9.794329821147252e-09, "loss": 0.0129, "step": 10504 }, { "epoch": 9.80859010270775, "grad_norm": 0.9248854973184188, "learning_rate": 9.699957728228626e-09, "loss": 0.0053, "step": 10505 }, { "epoch": 9.80952380952381, "grad_norm": 1.3048917910155664, "learning_rate": 9.606042053053044e-09, "loss": 0.0125, "step": 10506 }, { "epoch": 9.81045751633987, "grad_norm": 1.8896916975195575, "learning_rate": 9.512582804209191e-09, "loss": 0.0292, "step": 10507 }, { "epoch": 9.81139122315593, "grad_norm": 1.59686129499269, "learning_rate": 9.41957999024523e-09, "loss": 0.0252, "step": 10508 }, { "epoch": 9.81232492997199, "grad_norm": 1.653848071124569, "learning_rate": 9.327033619667692e-09, "loss": 0.029, "step": 10509 }, { "epoch": 9.81325863678805, "grad_norm": 0.7993756715246657, "learning_rate": 9.234943700940357e-09, "loss": 0.0075, "step": 10510 }, { "epoch": 9.814192343604109, "grad_norm": 0.5098358584521145, "learning_rate": 9.143310242486492e-09, "loss": 0.0081, "step": 10511 }, { "epoch": 9.815126050420169, "grad_norm": 0.8617332543164101, "learning_rate": 9.052133252686612e-09, "loss": 0.0115, "step": 10512 }, { "epoch": 9.816059757236228, "grad_norm": 0.5123095911898206, "learning_rate": 8.961412739879604e-09, "loss": 0.0077, "step": 10513 }, { "epoch": 9.816993464052288, "grad_norm": 3.9598654716088486, "learning_rate": 8.871148712363276e-09, "loss": 0.1277, "step": 10514 }, { "epoch": 9.817927170868348, "grad_norm": 1.491404260071939, "learning_rate": 8.781341178393244e-09, "loss": 0.0236, "step": 10515 }, { "epoch": 9.818860877684408, "grad_norm": 0.9699139051320417, "learning_rate": 8.691990146182937e-09, "loss": 0.0187, "step": 10516 }, { "epoch": 9.819794584500467, "grad_norm": 1.2832104723154394, "learning_rate": 8.603095623905267e-09, "loss": 0.0281, "step": 10517 }, { "epoch": 9.820728291316527, "grad_norm": 2.2905510732391376, "learning_rate": 8.514657619689837e-09, "loss": 0.0389, "step": 10518 }, { "epoch": 9.821661998132587, "grad_norm": 2.8204776013479944, "learning_rate": 8.426676141625734e-09, "loss": 0.0836, "step": 10519 }, { "epoch": 9.822595704948647, "grad_norm": 0.2366197773901707, "learning_rate": 8.339151197759854e-09, "loss": 0.0022, "step": 10520 }, { "epoch": 9.823529411764707, "grad_norm": 0.3940363808665511, "learning_rate": 8.252082796096905e-09, "loss": 0.0021, "step": 10521 }, { "epoch": 9.824463118580766, "grad_norm": 0.6268995892115912, "learning_rate": 8.165470944601073e-09, "loss": 0.0096, "step": 10522 }, { "epoch": 9.825396825396826, "grad_norm": 0.5748746227275604, "learning_rate": 8.079315651192687e-09, "loss": 0.0051, "step": 10523 }, { "epoch": 9.826330532212886, "grad_norm": 2.381647054831212, "learning_rate": 7.993616923752667e-09, "loss": 0.0328, "step": 10524 }, { "epoch": 9.827264239028946, "grad_norm": 0.6417200634952483, "learning_rate": 7.908374770118632e-09, "loss": 0.002, "step": 10525 }, { "epoch": 9.828197945845005, "grad_norm": 0.9719748108539459, "learning_rate": 7.823589198087678e-09, "loss": 0.0024, "step": 10526 }, { "epoch": 9.829131652661065, "grad_norm": 0.9671508827874481, "learning_rate": 7.739260215413048e-09, "loss": 0.0057, "step": 10527 }, { "epoch": 9.830065359477125, "grad_norm": 1.2225060080738888, "learning_rate": 7.65538782980857e-09, "loss": 0.0176, "step": 10528 }, { "epoch": 9.830999066293185, "grad_norm": 0.385844768587446, "learning_rate": 7.571972048944776e-09, "loss": 0.0096, "step": 10529 }, { "epoch": 9.831932773109244, "grad_norm": 1.7035589448934985, "learning_rate": 7.489012880451119e-09, "loss": 0.0485, "step": 10530 }, { "epoch": 9.832866479925304, "grad_norm": 12.619397367007101, "learning_rate": 7.406510331914862e-09, "loss": 0.2143, "step": 10531 }, { "epoch": 9.833800186741364, "grad_norm": 1.7134183565544654, "learning_rate": 7.324464410882748e-09, "loss": 0.0298, "step": 10532 }, { "epoch": 9.834733893557424, "grad_norm": 2.9694499707661577, "learning_rate": 7.242875124857662e-09, "loss": 0.0319, "step": 10533 }, { "epoch": 9.835667600373483, "grad_norm": 1.8202388539425651, "learning_rate": 7.161742481302525e-09, "loss": 0.0253, "step": 10534 }, { "epoch": 9.836601307189543, "grad_norm": 0.8787393890868719, "learning_rate": 7.0810664876369564e-09, "loss": 0.0141, "step": 10535 }, { "epoch": 9.837535014005603, "grad_norm": 10.591619421085195, "learning_rate": 7.000847151240608e-09, "loss": 0.0602, "step": 10536 }, { "epoch": 9.838468720821663, "grad_norm": 0.6753316759614881, "learning_rate": 6.921084479450391e-09, "loss": 0.0115, "step": 10537 }, { "epoch": 9.839402427637722, "grad_norm": 2.247375769699304, "learning_rate": 6.841778479561023e-09, "loss": 0.0436, "step": 10538 }, { "epoch": 9.840336134453782, "grad_norm": 4.53870369582243, "learning_rate": 6.762929158825593e-09, "loss": 0.0792, "step": 10539 }, { "epoch": 9.841269841269842, "grad_norm": 0.6341756915373945, "learning_rate": 6.684536524456664e-09, "loss": 0.0044, "step": 10540 }, { "epoch": 9.842203548085902, "grad_norm": 1.4973024546481415, "learning_rate": 6.6066005836235015e-09, "loss": 0.0366, "step": 10541 }, { "epoch": 9.843137254901961, "grad_norm": 3.428195101695622, "learning_rate": 6.529121343454292e-09, "loss": 0.0471, "step": 10542 }, { "epoch": 9.844070961718021, "grad_norm": 2.583054882402276, "learning_rate": 6.452098811035035e-09, "loss": 0.0552, "step": 10543 }, { "epoch": 9.84500466853408, "grad_norm": 2.340268948379156, "learning_rate": 6.375532993411204e-09, "loss": 0.0598, "step": 10544 }, { "epoch": 9.84593837535014, "grad_norm": 2.4082276283353647, "learning_rate": 6.299423897584978e-09, "loss": 0.0852, "step": 10545 }, { "epoch": 9.8468720821662, "grad_norm": 3.9670266197237147, "learning_rate": 6.223771530516898e-09, "loss": 0.0803, "step": 10546 }, { "epoch": 9.84780578898226, "grad_norm": 3.8670517185884266, "learning_rate": 6.148575899126985e-09, "loss": 0.0793, "step": 10547 }, { "epoch": 9.84873949579832, "grad_norm": 3.757825889748031, "learning_rate": 6.073837010292516e-09, "loss": 0.0313, "step": 10548 }, { "epoch": 9.84967320261438, "grad_norm": 2.6640482000932284, "learning_rate": 5.999554870849133e-09, "loss": 0.0777, "step": 10549 }, { "epoch": 9.85060690943044, "grad_norm": 2.2256028525328255, "learning_rate": 5.925729487590848e-09, "loss": 0.0501, "step": 10550 }, { "epoch": 9.8515406162465, "grad_norm": 0.8339021406377839, "learning_rate": 5.85236086726948e-09, "loss": 0.0134, "step": 10551 }, { "epoch": 9.852474323062559, "grad_norm": 6.615648048777254, "learning_rate": 5.779449016595773e-09, "loss": 0.226, "step": 10552 }, { "epoch": 9.853408029878619, "grad_norm": 1.72989229127405, "learning_rate": 5.706993942238281e-09, "loss": 0.0325, "step": 10553 }, { "epoch": 9.854341736694678, "grad_norm": 5.105988910810911, "learning_rate": 5.634995650823371e-09, "loss": 0.122, "step": 10554 }, { "epoch": 9.855275443510738, "grad_norm": 2.645037093139281, "learning_rate": 5.563454148936332e-09, "loss": 0.0315, "step": 10555 }, { "epoch": 9.856209150326798, "grad_norm": 2.1612943917691316, "learning_rate": 5.492369443120815e-09, "loss": 0.0347, "step": 10556 }, { "epoch": 9.857142857142858, "grad_norm": 0.6395993729715468, "learning_rate": 5.421741539877734e-09, "loss": 0.0077, "step": 10557 }, { "epoch": 9.858076563958917, "grad_norm": 4.233858035711414, "learning_rate": 5.35157044566692e-09, "loss": 0.0631, "step": 10558 }, { "epoch": 9.859010270774977, "grad_norm": 3.0361913507572806, "learning_rate": 5.2818561669065735e-09, "loss": 0.0613, "step": 10559 }, { "epoch": 9.859943977591037, "grad_norm": 4.336025671037106, "learning_rate": 5.21259870997215e-09, "loss": 0.1875, "step": 10560 }, { "epoch": 9.860877684407097, "grad_norm": 2.744239170049701, "learning_rate": 5.1437980811991365e-09, "loss": 0.0542, "step": 10561 }, { "epoch": 9.861811391223156, "grad_norm": 1.3015718773723324, "learning_rate": 5.075454286879167e-09, "loss": 0.0221, "step": 10562 }, { "epoch": 9.862745098039216, "grad_norm": 1.1888622902138093, "learning_rate": 5.007567333263352e-09, "loss": 0.0134, "step": 10563 }, { "epoch": 9.863678804855276, "grad_norm": 3.681847787421429, "learning_rate": 4.940137226560615e-09, "loss": 0.0924, "step": 10564 }, { "epoch": 9.864612511671336, "grad_norm": 3.019918336408173, "learning_rate": 4.873163972938244e-09, "loss": 0.1074, "step": 10565 }, { "epoch": 9.865546218487395, "grad_norm": 1.1051674420579447, "learning_rate": 4.8066475785218945e-09, "loss": 0.0144, "step": 10566 }, { "epoch": 9.866479925303455, "grad_norm": 2.003664495867263, "learning_rate": 4.740588049394479e-09, "loss": 0.0378, "step": 10567 }, { "epoch": 9.867413632119515, "grad_norm": 2.00918337528629, "learning_rate": 4.674985391598941e-09, "loss": 0.0363, "step": 10568 }, { "epoch": 9.868347338935575, "grad_norm": 0.61241972755881, "learning_rate": 4.60983961113437e-09, "loss": 0.0045, "step": 10569 }, { "epoch": 9.869281045751634, "grad_norm": 2.3169460913220528, "learning_rate": 4.545150713959889e-09, "loss": 0.0229, "step": 10570 }, { "epoch": 9.870214752567694, "grad_norm": 1.8343664178618917, "learning_rate": 4.4809187059913216e-09, "loss": 0.0464, "step": 10571 }, { "epoch": 9.871148459383754, "grad_norm": 3.7092757959200573, "learning_rate": 4.4171435931039675e-09, "loss": 0.0615, "step": 10572 }, { "epoch": 9.872082166199814, "grad_norm": 3.5641873563736257, "learning_rate": 4.353825381130383e-09, "loss": 0.081, "step": 10573 }, { "epoch": 9.873015873015873, "grad_norm": 0.6953419518706466, "learning_rate": 4.290964075862048e-09, "loss": 0.0027, "step": 10574 }, { "epoch": 9.873949579831933, "grad_norm": 1.489783279133586, "learning_rate": 4.228559683047695e-09, "loss": 0.0151, "step": 10575 }, { "epoch": 9.874883286647993, "grad_norm": 1.3314277014883433, "learning_rate": 4.166612208395537e-09, "loss": 0.0132, "step": 10576 }, { "epoch": 9.875816993464053, "grad_norm": 1.7536469052866197, "learning_rate": 4.105121657570488e-09, "loss": 0.0314, "step": 10577 }, { "epoch": 9.876750700280112, "grad_norm": 3.0331337784639247, "learning_rate": 4.044088036198046e-09, "loss": 0.128, "step": 10578 }, { "epoch": 9.877684407096172, "grad_norm": 1.5085067013005693, "learning_rate": 3.983511349858748e-09, "loss": 0.0293, "step": 10579 }, { "epoch": 9.878618113912232, "grad_norm": 0.30860556290128943, "learning_rate": 3.923391604094273e-09, "loss": 0.0058, "step": 10580 }, { "epoch": 9.879551820728292, "grad_norm": 5.151854640435238, "learning_rate": 3.863728804402445e-09, "loss": 0.0593, "step": 10581 }, { "epoch": 9.880485527544351, "grad_norm": 0.5600292185669061, "learning_rate": 3.804522956240564e-09, "loss": 0.0102, "step": 10582 }, { "epoch": 9.881419234360411, "grad_norm": 1.679439388962463, "learning_rate": 3.745774065023189e-09, "loss": 0.0078, "step": 10583 }, { "epoch": 9.882352941176471, "grad_norm": 0.7078531924884355, "learning_rate": 3.6874821361237988e-09, "loss": 0.0084, "step": 10584 }, { "epoch": 9.88328664799253, "grad_norm": 1.5042644229170186, "learning_rate": 3.62964717487424e-09, "loss": 0.0365, "step": 10585 }, { "epoch": 9.88422035480859, "grad_norm": 0.6026997070808106, "learning_rate": 3.57226918656306e-09, "loss": 0.0049, "step": 10586 }, { "epoch": 9.88515406162465, "grad_norm": 2.967166162909569, "learning_rate": 3.515348176439393e-09, "loss": 0.0467, "step": 10587 }, { "epoch": 9.88608776844071, "grad_norm": 0.19177073857335844, "learning_rate": 3.458884149708519e-09, "loss": 0.0016, "step": 10588 }, { "epoch": 9.88702147525677, "grad_norm": 3.1334978591717064, "learning_rate": 3.4028771115346416e-09, "loss": 0.0558, "step": 10589 }, { "epoch": 9.88795518207283, "grad_norm": 2.3820294865314966, "learning_rate": 3.347327067040884e-09, "loss": 0.0491, "step": 10590 }, { "epoch": 9.88888888888889, "grad_norm": 2.986009186100746, "learning_rate": 3.2922340213070724e-09, "loss": 0.0943, "step": 10591 }, { "epoch": 9.889822595704949, "grad_norm": 0.2793057595060851, "learning_rate": 3.237597979372509e-09, "loss": 0.003, "step": 10592 }, { "epoch": 9.890756302521009, "grad_norm": 1.4248534417247116, "learning_rate": 3.183418946234307e-09, "loss": 0.026, "step": 10593 }, { "epoch": 9.891690009337069, "grad_norm": 2.659701041029352, "learning_rate": 3.129696926847392e-09, "loss": 0.0621, "step": 10594 }, { "epoch": 9.892623716153128, "grad_norm": 1.6076719216988076, "learning_rate": 3.0764319261261668e-09, "loss": 0.045, "step": 10595 }, { "epoch": 9.893557422969188, "grad_norm": 0.7560863174031799, "learning_rate": 3.023623948941179e-09, "loss": 0.0183, "step": 10596 }, { "epoch": 9.894491129785248, "grad_norm": 0.5982263906280296, "learning_rate": 2.9712730001224545e-09, "loss": 0.0102, "step": 10597 }, { "epoch": 9.895424836601308, "grad_norm": 2.312362191265469, "learning_rate": 2.9193790844589398e-09, "loss": 0.0549, "step": 10598 }, { "epoch": 9.896358543417367, "grad_norm": 2.5634495840467717, "learning_rate": 2.8679422066957287e-09, "loss": 0.051, "step": 10599 }, { "epoch": 9.897292250233427, "grad_norm": 1.5391441282344003, "learning_rate": 2.8169623715379457e-09, "loss": 0.0217, "step": 10600 }, { "epoch": 9.898225957049487, "grad_norm": 1.9112107285535254, "learning_rate": 2.7664395836485282e-09, "loss": 0.0499, "step": 10601 }, { "epoch": 9.899159663865547, "grad_norm": 0.7188117517278325, "learning_rate": 2.7163738476476685e-09, "loss": 0.0013, "step": 10602 }, { "epoch": 9.900093370681606, "grad_norm": 5.069749306345149, "learning_rate": 2.6667651681150374e-09, "loss": 0.0317, "step": 10603 }, { "epoch": 9.901027077497666, "grad_norm": 0.5728080643331686, "learning_rate": 2.617613549587561e-09, "loss": 0.0046, "step": 10604 }, { "epoch": 9.901960784313726, "grad_norm": 1.9607824979778339, "learning_rate": 2.568918996560532e-09, "loss": 0.0305, "step": 10605 }, { "epoch": 9.902894491129786, "grad_norm": 2.010781842954547, "learning_rate": 2.5206815134881656e-09, "loss": 0.0689, "step": 10606 }, { "epoch": 9.903828197945845, "grad_norm": 2.1235753444636227, "learning_rate": 2.4729011047813777e-09, "loss": 0.0532, "step": 10607 }, { "epoch": 9.904761904761905, "grad_norm": 2.2088490760616475, "learning_rate": 2.4255777748111165e-09, "loss": 0.0435, "step": 10608 }, { "epoch": 9.905695611577965, "grad_norm": 1.272831195672404, "learning_rate": 2.378711527905586e-09, "loss": 0.0307, "step": 10609 }, { "epoch": 9.906629318394025, "grad_norm": 1.0018126990477314, "learning_rate": 2.3323023683508028e-09, "loss": 0.0167, "step": 10610 }, { "epoch": 9.907563025210084, "grad_norm": 2.2265646201953424, "learning_rate": 2.286350300391149e-09, "loss": 0.0127, "step": 10611 }, { "epoch": 9.908496732026144, "grad_norm": 2.8806121002294836, "learning_rate": 2.2408553282299296e-09, "loss": 0.044, "step": 10612 }, { "epoch": 9.909430438842204, "grad_norm": 2.2870706457871126, "learning_rate": 2.19581745602826e-09, "loss": 0.0735, "step": 10613 }, { "epoch": 9.910364145658264, "grad_norm": 0.9076265870097822, "learning_rate": 2.151236687905067e-09, "loss": 0.0155, "step": 10614 }, { "epoch": 9.911297852474323, "grad_norm": 3.7168149561340975, "learning_rate": 2.1071130279376462e-09, "loss": 0.0888, "step": 10615 }, { "epoch": 9.912231559290383, "grad_norm": 2.435486617812724, "learning_rate": 2.063446480161657e-09, "loss": 0.0995, "step": 10616 }, { "epoch": 9.913165266106443, "grad_norm": 1.1870642288113558, "learning_rate": 2.0202370485716828e-09, "loss": 0.0243, "step": 10617 }, { "epoch": 9.914098972922503, "grad_norm": 1.4800700586625104, "learning_rate": 1.9774847371184515e-09, "loss": 0.0252, "step": 10618 }, { "epoch": 9.915032679738562, "grad_norm": 2.49534873563213, "learning_rate": 1.935189549712724e-09, "loss": 0.0287, "step": 10619 }, { "epoch": 9.915966386554622, "grad_norm": 0.5075946927252856, "learning_rate": 1.893351490222517e-09, "loss": 0.0038, "step": 10620 }, { "epoch": 9.916900093370682, "grad_norm": 2.161740940968806, "learning_rate": 1.8519705624753247e-09, "loss": 0.0678, "step": 10621 }, { "epoch": 9.917833800186742, "grad_norm": 5.577194428032189, "learning_rate": 1.8110467702547873e-09, "loss": 0.1056, "step": 10622 }, { "epoch": 9.918767507002801, "grad_norm": 0.7107312839985851, "learning_rate": 1.7705801173045768e-09, "loss": 0.0116, "step": 10623 }, { "epoch": 9.919701213818861, "grad_norm": 3.0985174166668945, "learning_rate": 1.730570607325066e-09, "loss": 0.0919, "step": 10624 }, { "epoch": 9.920634920634921, "grad_norm": 2.8666861879561814, "learning_rate": 1.691018243976661e-09, "loss": 0.0937, "step": 10625 }, { "epoch": 9.92156862745098, "grad_norm": 3.552707279756119, "learning_rate": 1.6519230308759126e-09, "loss": 0.1178, "step": 10626 }, { "epoch": 9.92250233426704, "grad_norm": 0.6091153948124997, "learning_rate": 1.6132849715988496e-09, "loss": 0.0073, "step": 10627 }, { "epoch": 9.9234360410831, "grad_norm": 3.582254577681683, "learning_rate": 1.5751040696793118e-09, "loss": 0.0788, "step": 10628 }, { "epoch": 9.92436974789916, "grad_norm": 3.3210387165113606, "learning_rate": 1.5373803286095056e-09, "loss": 0.097, "step": 10629 }, { "epoch": 9.92530345471522, "grad_norm": 0.39902845736835246, "learning_rate": 1.5001137518394494e-09, "loss": 0.0024, "step": 10630 }, { "epoch": 9.92623716153128, "grad_norm": 4.6101105719665405, "learning_rate": 1.4633043427775273e-09, "loss": 0.0984, "step": 10631 }, { "epoch": 9.92717086834734, "grad_norm": 8.894552464550783, "learning_rate": 1.4269521047904912e-09, "loss": 0.1022, "step": 10632 }, { "epoch": 9.928104575163399, "grad_norm": 1.6397316082517468, "learning_rate": 1.3910570412034585e-09, "loss": 0.032, "step": 10633 }, { "epoch": 9.929038281979459, "grad_norm": 5.360767150175839, "learning_rate": 1.355619155298804e-09, "loss": 0.1235, "step": 10634 }, { "epoch": 9.929971988795518, "grad_norm": 2.5033760382327763, "learning_rate": 1.320638450318379e-09, "loss": 0.0486, "step": 10635 }, { "epoch": 9.930905695611578, "grad_norm": 0.49516575115160344, "learning_rate": 1.2861149294607356e-09, "loss": 0.0038, "step": 10636 }, { "epoch": 9.931839402427638, "grad_norm": 1.6619312656476959, "learning_rate": 1.2520485958844586e-09, "loss": 0.0154, "step": 10637 }, { "epoch": 9.932773109243698, "grad_norm": 2.344303039581809, "learning_rate": 1.2184394527048337e-09, "loss": 0.0275, "step": 10638 }, { "epoch": 9.933706816059757, "grad_norm": 1.5141060932708994, "learning_rate": 1.1852875029949583e-09, "loss": 0.0251, "step": 10639 }, { "epoch": 9.934640522875817, "grad_norm": 1.0650084863444103, "learning_rate": 1.152592749788517e-09, "loss": 0.008, "step": 10640 }, { "epoch": 9.935574229691877, "grad_norm": 1.7525218759516532, "learning_rate": 1.1203551960742297e-09, "loss": 0.0361, "step": 10641 }, { "epoch": 9.936507936507937, "grad_norm": 2.448953258047746, "learning_rate": 1.0885748448019596e-09, "loss": 0.0456, "step": 10642 }, { "epoch": 9.937441643323996, "grad_norm": 1.7757528859019176, "learning_rate": 1.0572516988771598e-09, "loss": 0.049, "step": 10643 }, { "epoch": 9.938375350140056, "grad_norm": 0.6097536825107648, "learning_rate": 1.026385761165871e-09, "loss": 0.0034, "step": 10644 }, { "epoch": 9.939309056956116, "grad_norm": 2.188128611118162, "learning_rate": 9.959770344902808e-10, "loss": 0.0418, "step": 10645 }, { "epoch": 9.940242763772176, "grad_norm": 0.4896817110499438, "learning_rate": 9.660255216314973e-10, "loss": 0.0047, "step": 10646 }, { "epoch": 9.941176470588236, "grad_norm": 2.1581536213885575, "learning_rate": 9.36531225329551e-10, "loss": 0.0579, "step": 10647 }, { "epoch": 9.942110177404295, "grad_norm": 1.199524000152705, "learning_rate": 9.074941482811739e-10, "loss": 0.0104, "step": 10648 }, { "epoch": 9.943043884220355, "grad_norm": 5.8596102520646305, "learning_rate": 8.789142931431294e-10, "loss": 0.1492, "step": 10649 }, { "epoch": 9.943977591036415, "grad_norm": 1.0981849780058308, "learning_rate": 8.507916625288826e-10, "loss": 0.0138, "step": 10650 }, { "epoch": 9.944911297852475, "grad_norm": 1.3565054002618964, "learning_rate": 8.231262590108202e-10, "loss": 0.0237, "step": 10651 }, { "epoch": 9.945845004668534, "grad_norm": 3.685811646881238, "learning_rate": 7.959180851185855e-10, "loss": 0.1039, "step": 10652 }, { "epoch": 9.946778711484594, "grad_norm": 0.39276156044736976, "learning_rate": 7.691671433407433e-10, "loss": 0.0032, "step": 10653 }, { "epoch": 9.947712418300654, "grad_norm": 1.5480785000404331, "learning_rate": 7.428734361247803e-10, "loss": 0.0371, "step": 10654 }, { "epoch": 9.948646125116714, "grad_norm": 7.279936235857586, "learning_rate": 7.170369658748844e-10, "loss": 0.1075, "step": 10655 }, { "epoch": 9.949579831932773, "grad_norm": 3.4015805640462364, "learning_rate": 6.916577349541653e-10, "loss": 0.0992, "step": 10656 }, { "epoch": 9.950513538748833, "grad_norm": 0.47367107251274054, "learning_rate": 6.667357456840994e-10, "loss": 0.0027, "step": 10657 }, { "epoch": 9.951447245564893, "grad_norm": 4.178438855670488, "learning_rate": 6.422710003439747e-10, "loss": 0.1092, "step": 10658 }, { "epoch": 9.952380952380953, "grad_norm": 0.6740712014147823, "learning_rate": 6.182635011708904e-10, "loss": 0.013, "step": 10659 }, { "epoch": 9.953314659197012, "grad_norm": 0.8470041916591992, "learning_rate": 5.947132503614228e-10, "loss": 0.012, "step": 10660 }, { "epoch": 9.954248366013072, "grad_norm": 1.314643121642597, "learning_rate": 5.716202500688495e-10, "loss": 0.0382, "step": 10661 }, { "epoch": 9.955182072829132, "grad_norm": 2.2895596428931473, "learning_rate": 5.489845024053698e-10, "loss": 0.0284, "step": 10662 }, { "epoch": 9.956115779645192, "grad_norm": 1.7916737328698225, "learning_rate": 5.268060094415495e-10, "loss": 0.0192, "step": 10663 }, { "epoch": 9.957049486461251, "grad_norm": 3.0749498371372326, "learning_rate": 5.050847732057662e-10, "loss": 0.0899, "step": 10664 }, { "epoch": 9.957983193277311, "grad_norm": 1.555733396388876, "learning_rate": 4.838207956842089e-10, "loss": 0.0218, "step": 10665 }, { "epoch": 9.95891690009337, "grad_norm": 2.724900739724281, "learning_rate": 4.6301407882254346e-10, "loss": 0.0343, "step": 10666 }, { "epoch": 9.95985060690943, "grad_norm": 0.045890837757999396, "learning_rate": 4.4266462452313697e-10, "loss": 0.0002, "step": 10667 }, { "epoch": 9.96078431372549, "grad_norm": 0.6790617678927541, "learning_rate": 4.2277243464727837e-10, "loss": 0.0088, "step": 10668 }, { "epoch": 9.96171802054155, "grad_norm": 1.0294020970628355, "learning_rate": 4.0333751101462315e-10, "loss": 0.0095, "step": 10669 }, { "epoch": 9.96265172735761, "grad_norm": 2.1197709045832207, "learning_rate": 3.843598554020833e-10, "loss": 0.0246, "step": 10670 }, { "epoch": 9.96358543417367, "grad_norm": 1.444934880340917, "learning_rate": 3.6583946954604764e-10, "loss": 0.0341, "step": 10671 }, { "epoch": 9.96451914098973, "grad_norm": 1.074928731951318, "learning_rate": 3.4777635514016137e-10, "loss": 0.0195, "step": 10672 }, { "epoch": 9.965452847805789, "grad_norm": 2.176224545574752, "learning_rate": 3.301705138364364e-10, "loss": 0.0282, "step": 10673 }, { "epoch": 9.966386554621849, "grad_norm": 1.4039690653283008, "learning_rate": 3.1302194724525113e-10, "loss": 0.0224, "step": 10674 }, { "epoch": 9.967320261437909, "grad_norm": 1.4485309949160314, "learning_rate": 2.963306569347957e-10, "loss": 0.0165, "step": 10675 }, { "epoch": 9.968253968253968, "grad_norm": 2.048444718244599, "learning_rate": 2.8009664443162667e-10, "loss": 0.0494, "step": 10676 }, { "epoch": 9.969187675070028, "grad_norm": 1.8537936288515837, "learning_rate": 2.6431991122066736e-10, "loss": 0.0387, "step": 10677 }, { "epoch": 9.970121381886088, "grad_norm": 1.8658951875597265, "learning_rate": 2.4900045874520775e-10, "loss": 0.0377, "step": 10678 }, { "epoch": 9.971055088702148, "grad_norm": 0.8969848454046027, "learning_rate": 2.3413828840634924e-10, "loss": 0.015, "step": 10679 }, { "epoch": 9.971988795518207, "grad_norm": 3.1473685377155536, "learning_rate": 2.1973340156244972e-10, "loss": 0.0568, "step": 10680 }, { "epoch": 9.972922502334267, "grad_norm": 1.416712345987815, "learning_rate": 2.0578579953189903e-10, "loss": 0.0299, "step": 10681 }, { "epoch": 9.973856209150327, "grad_norm": 0.08909058189738912, "learning_rate": 1.9229548359034346e-10, "loss": 0.0005, "step": 10682 }, { "epoch": 9.974789915966387, "grad_norm": 3.8154987765728503, "learning_rate": 1.7926245497179584e-10, "loss": 0.0837, "step": 10683 }, { "epoch": 9.975723622782446, "grad_norm": 6.074874441529748, "learning_rate": 1.666867148669704e-10, "loss": 0.1724, "step": 10684 }, { "epoch": 9.976657329598506, "grad_norm": 12.025227719846464, "learning_rate": 1.5456826442772354e-10, "loss": 0.128, "step": 10685 }, { "epoch": 9.977591036414566, "grad_norm": 5.62431138835346, "learning_rate": 1.4290710476150271e-10, "loss": 0.1239, "step": 10686 }, { "epoch": 9.978524743230626, "grad_norm": 10.288161871937335, "learning_rate": 1.3170323693467713e-10, "loss": 0.0217, "step": 10687 }, { "epoch": 9.979458450046685, "grad_norm": 0.9497149751487318, "learning_rate": 1.2095666197253775e-10, "loss": 0.0121, "step": 10688 }, { "epoch": 9.980392156862745, "grad_norm": 0.49498927398930365, "learning_rate": 1.1066738085763195e-10, "loss": 0.0044, "step": 10689 }, { "epoch": 9.981325863678805, "grad_norm": 2.6553564524798277, "learning_rate": 1.0083539453142888e-10, "loss": 0.0298, "step": 10690 }, { "epoch": 9.982259570494865, "grad_norm": 1.9998033587182558, "learning_rate": 9.146070389265405e-11, "loss": 0.0365, "step": 10691 }, { "epoch": 9.983193277310924, "grad_norm": 1.5849753314032753, "learning_rate": 8.254330979895475e-11, "loss": 0.0241, "step": 10692 }, { "epoch": 9.984126984126984, "grad_norm": 2.139147293471227, "learning_rate": 7.40832130657898e-11, "loss": 0.0384, "step": 10693 }, { "epoch": 9.985060690943044, "grad_norm": 1.9177805335213771, "learning_rate": 6.608041446753976e-11, "loss": 0.0516, "step": 10694 }, { "epoch": 9.985994397759104, "grad_norm": 1.2388499732594795, "learning_rate": 5.853491473528649e-11, "loss": 0.0207, "step": 10695 }, { "epoch": 9.986928104575163, "grad_norm": 3.898100147724434, "learning_rate": 5.1446714559588716e-11, "loss": 0.0693, "step": 10696 }, { "epoch": 9.987861811391223, "grad_norm": 1.6474562244613915, "learning_rate": 4.4815814588816674e-11, "loss": 0.0327, "step": 10697 }, { "epoch": 9.988795518207283, "grad_norm": 1.1760266599260008, "learning_rate": 3.864221542915214e-11, "loss": 0.0173, "step": 10698 }, { "epoch": 9.989729225023343, "grad_norm": 2.122153566566673, "learning_rate": 3.292591764514352e-11, "loss": 0.0761, "step": 10699 }, { "epoch": 9.990662931839402, "grad_norm": 3.531372122916376, "learning_rate": 2.766692176026098e-11, "loss": 0.1065, "step": 10700 }, { "epoch": 9.991596638655462, "grad_norm": 0.788779369902914, "learning_rate": 2.2865228255231075e-11, "loss": 0.0123, "step": 10701 }, { "epoch": 9.992530345471522, "grad_norm": 4.798400172156619, "learning_rate": 1.852083756859191e-11, "loss": 0.0658, "step": 10702 }, { "epoch": 9.993464052287582, "grad_norm": 3.5754490918210067, "learning_rate": 1.4633750098358436e-11, "loss": 0.0834, "step": 10703 }, { "epoch": 9.994397759103641, "grad_norm": 1.3514758528753636, "learning_rate": 1.1203966199802019e-11, "loss": 0.0219, "step": 10704 }, { "epoch": 9.995331465919701, "grad_norm": 1.424037652398029, "learning_rate": 8.231486187115779e-12, "loss": 0.0199, "step": 10705 }, { "epoch": 9.996265172735761, "grad_norm": 2.6070133283125974, "learning_rate": 5.7163103311941305e-12, "loss": 0.0448, "step": 10706 }, { "epoch": 9.99719887955182, "grad_norm": 1.8427263789514732, "learning_rate": 3.658438862963465e-12, "loss": 0.0337, "step": 10707 }, { "epoch": 9.99813258636788, "grad_norm": 5.381789931132172, "learning_rate": 2.0578719700514727e-12, "loss": 0.1616, "step": 10708 }, { "epoch": 9.99906629318394, "grad_norm": 1.4622604756886404, "learning_rate": 9.146097995627046e-13, "loss": 0.0094, "step": 10709 }, { "epoch": 10.0, "grad_norm": 1.0532155645577121, "learning_rate": 2.286524553030134e-13, "loss": 0.0288, "step": 10710 } ], "logging_steps": 1.0, "max_steps": 10710, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 63417195528192.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }