{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9949177877428999, "eval_steps": 500, "global_step": 418, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004783258594917788, "grad_norm": 0.709558174057161, "learning_rate": 9.523809523809523e-08, "loss": 1.2153, "step": 1 }, { "epoch": 0.009566517189835576, "grad_norm": 0.7136115199943648, "learning_rate": 1.9047619047619045e-07, "loss": 1.1666, "step": 2 }, { "epoch": 0.014349775784753363, "grad_norm": 0.675693723617585, "learning_rate": 2.857142857142857e-07, "loss": 1.1641, "step": 3 }, { "epoch": 0.019133034379671152, "grad_norm": 0.6920682930548318, "learning_rate": 3.809523809523809e-07, "loss": 1.1522, "step": 4 }, { "epoch": 0.02391629297458894, "grad_norm": 0.7102565482472595, "learning_rate": 4.761904761904761e-07, "loss": 1.1849, "step": 5 }, { "epoch": 0.028699551569506727, "grad_norm": 0.7063832004098226, "learning_rate": 5.714285714285714e-07, "loss": 1.1832, "step": 6 }, { "epoch": 0.03348281016442452, "grad_norm": 0.7273064919061014, "learning_rate": 6.666666666666666e-07, "loss": 1.1646, "step": 7 }, { "epoch": 0.038266068759342305, "grad_norm": 0.6515215643931184, "learning_rate": 7.619047619047618e-07, "loss": 1.1492, "step": 8 }, { "epoch": 0.04304932735426009, "grad_norm": 0.662055704210126, "learning_rate": 8.57142857142857e-07, "loss": 1.1356, "step": 9 }, { "epoch": 0.04783258594917788, "grad_norm": 0.6717882944749636, "learning_rate": 9.523809523809522e-07, "loss": 1.1373, "step": 10 }, { "epoch": 0.052615844544095666, "grad_norm": 0.5448741661939914, "learning_rate": 1.0476190476190476e-06, "loss": 1.1133, "step": 11 }, { "epoch": 0.05739910313901345, "grad_norm": 0.5462157636865493, "learning_rate": 1.1428571428571428e-06, "loss": 1.1025, "step": 12 }, { "epoch": 0.06218236173393124, "grad_norm": 0.5384420206966651, "learning_rate": 1.238095238095238e-06, "loss": 1.1102, "step": 13 }, { "epoch": 0.06696562032884903, "grad_norm": 0.5199166835309963, "learning_rate": 1.3333333333333332e-06, "loss": 1.0716, "step": 14 }, { "epoch": 0.07174887892376682, "grad_norm": 0.3132434598969462, "learning_rate": 1.4285714285714286e-06, "loss": 1.0198, "step": 15 }, { "epoch": 0.07653213751868461, "grad_norm": 0.2906366968418868, "learning_rate": 1.5238095238095236e-06, "loss": 1.0402, "step": 16 }, { "epoch": 0.08131539611360239, "grad_norm": 0.27762786913003945, "learning_rate": 1.619047619047619e-06, "loss": 1.0027, "step": 17 }, { "epoch": 0.08609865470852018, "grad_norm": 0.27399342611565175, "learning_rate": 1.714285714285714e-06, "loss": 1.0022, "step": 18 }, { "epoch": 0.09088191330343796, "grad_norm": 0.24537662923792491, "learning_rate": 1.8095238095238095e-06, "loss": 1.0272, "step": 19 }, { "epoch": 0.09566517189835576, "grad_norm": 0.20010147123002528, "learning_rate": 1.9047619047619045e-06, "loss": 0.9569, "step": 20 }, { "epoch": 0.10044843049327354, "grad_norm": 0.25587399010113915, "learning_rate": 2e-06, "loss": 0.9848, "step": 21 }, { "epoch": 0.10523168908819133, "grad_norm": 0.3007962756012024, "learning_rate": 1.9999686897547167e-06, "loss": 0.9581, "step": 22 }, { "epoch": 0.11001494768310911, "grad_norm": 0.3006848188189002, "learning_rate": 1.9998747609795305e-06, "loss": 0.9478, "step": 23 }, { "epoch": 0.1147982062780269, "grad_norm": 0.30781981272131237, "learning_rate": 1.999718219556307e-06, "loss": 0.9834, "step": 24 }, { "epoch": 0.11958146487294469, "grad_norm": 0.3118837626745979, "learning_rate": 1.999499075287747e-06, "loss": 0.9852, "step": 25 }, { "epoch": 0.12436472346786248, "grad_norm": 0.26545213805928825, "learning_rate": 1.999217341896772e-06, "loss": 0.9549, "step": 26 }, { "epoch": 0.12914798206278028, "grad_norm": 0.28369668711343804, "learning_rate": 1.998873037025665e-06, "loss": 0.9395, "step": 27 }, { "epoch": 0.13393124065769807, "grad_norm": 0.228056096803738, "learning_rate": 1.9984661822349665e-06, "loss": 0.9124, "step": 28 }, { "epoch": 0.13871449925261584, "grad_norm": 0.23393624794141885, "learning_rate": 1.997996803002123e-06, "loss": 0.9306, "step": 29 }, { "epoch": 0.14349775784753363, "grad_norm": 0.20376330319941563, "learning_rate": 1.9974649287198914e-06, "loss": 0.8882, "step": 30 }, { "epoch": 0.14828101644245142, "grad_norm": 0.19033042713450593, "learning_rate": 1.9968705926945013e-06, "loss": 0.8699, "step": 31 }, { "epoch": 0.15306427503736922, "grad_norm": 0.20517384186395837, "learning_rate": 1.9962138321435656e-06, "loss": 0.8919, "step": 32 }, { "epoch": 0.15784753363228698, "grad_norm": 0.19219397333283395, "learning_rate": 1.9954946881937524e-06, "loss": 0.8985, "step": 33 }, { "epoch": 0.16263079222720478, "grad_norm": 0.18095506989716384, "learning_rate": 1.994713205878208e-06, "loss": 0.8504, "step": 34 }, { "epoch": 0.16741405082212257, "grad_norm": 0.1722529909885032, "learning_rate": 1.9938694341337393e-06, "loss": 0.8743, "step": 35 }, { "epoch": 0.17219730941704037, "grad_norm": 0.16508567356320156, "learning_rate": 1.9929634257977467e-06, "loss": 0.857, "step": 36 }, { "epoch": 0.17698056801195813, "grad_norm": 0.15380307949646846, "learning_rate": 1.991995237604916e-06, "loss": 0.8487, "step": 37 }, { "epoch": 0.18176382660687593, "grad_norm": 0.14856130486975244, "learning_rate": 1.9909649301836674e-06, "loss": 0.8692, "step": 38 }, { "epoch": 0.18654708520179372, "grad_norm": 0.1518842900714723, "learning_rate": 1.9898725680523566e-06, "loss": 0.8679, "step": 39 }, { "epoch": 0.19133034379671152, "grad_norm": 0.1443106182213824, "learning_rate": 1.9887182196152367e-06, "loss": 0.8504, "step": 40 }, { "epoch": 0.1961136023916293, "grad_norm": 0.14664015617981188, "learning_rate": 1.9875019571581726e-06, "loss": 0.8125, "step": 41 }, { "epoch": 0.20089686098654708, "grad_norm": 0.14692793192413753, "learning_rate": 1.9862238568441165e-06, "loss": 0.8257, "step": 42 }, { "epoch": 0.20568011958146487, "grad_norm": 0.13896889627771705, "learning_rate": 1.9848839987083364e-06, "loss": 0.8329, "step": 43 }, { "epoch": 0.21046337817638266, "grad_norm": 0.14943974659921427, "learning_rate": 1.983482466653407e-06, "loss": 0.8409, "step": 44 }, { "epoch": 0.21524663677130046, "grad_norm": 0.138210028938997, "learning_rate": 1.982019348443952e-06, "loss": 0.8323, "step": 45 }, { "epoch": 0.22002989536621823, "grad_norm": 0.1250406305407292, "learning_rate": 1.9804947357011523e-06, "loss": 0.8673, "step": 46 }, { "epoch": 0.22481315396113602, "grad_norm": 0.12719252526959784, "learning_rate": 1.978908723897005e-06, "loss": 0.8192, "step": 47 }, { "epoch": 0.2295964125560538, "grad_norm": 0.10853106729801387, "learning_rate": 1.9772614123483485e-06, "loss": 0.8384, "step": 48 }, { "epoch": 0.2343796711509716, "grad_norm": 0.11375286279894396, "learning_rate": 1.9755529042106393e-06, "loss": 0.7854, "step": 49 }, { "epoch": 0.23916292974588937, "grad_norm": 0.11326113932314119, "learning_rate": 1.973783306471495e-06, "loss": 0.795, "step": 50 }, { "epoch": 0.24394618834080717, "grad_norm": 0.12664705711535487, "learning_rate": 1.971952729943994e-06, "loss": 0.783, "step": 51 }, { "epoch": 0.24872944693572496, "grad_norm": 0.11119059988645158, "learning_rate": 1.9700612892597372e-06, "loss": 0.8059, "step": 52 }, { "epoch": 0.25351270553064276, "grad_norm": 0.10545114737351395, "learning_rate": 1.9681091028616676e-06, "loss": 0.7885, "step": 53 }, { "epoch": 0.25829596412556055, "grad_norm": 0.11679452392637804, "learning_rate": 1.966096292996655e-06, "loss": 0.8031, "step": 54 }, { "epoch": 0.26307922272047835, "grad_norm": 0.11363287552532539, "learning_rate": 1.9640229857078413e-06, "loss": 0.7774, "step": 55 }, { "epoch": 0.26786248131539614, "grad_norm": 0.1164225509000403, "learning_rate": 1.9618893108267454e-06, "loss": 0.7949, "step": 56 }, { "epoch": 0.2726457399103139, "grad_norm": 0.11077425052933487, "learning_rate": 1.9596954019651354e-06, "loss": 0.7674, "step": 57 }, { "epoch": 0.27742899850523167, "grad_norm": 0.10576177825898277, "learning_rate": 1.95744139650666e-06, "loss": 0.7953, "step": 58 }, { "epoch": 0.28221225710014947, "grad_norm": 0.10359885133841641, "learning_rate": 1.955127435598247e-06, "loss": 0.7881, "step": 59 }, { "epoch": 0.28699551569506726, "grad_norm": 0.10586032252156977, "learning_rate": 1.9527536641412637e-06, "loss": 0.7984, "step": 60 }, { "epoch": 0.29177877428998505, "grad_norm": 0.10642116844371083, "learning_rate": 1.950320230782443e-06, "loss": 0.7666, "step": 61 }, { "epoch": 0.29656203288490285, "grad_norm": 0.11202675632435576, "learning_rate": 1.9478272879045763e-06, "loss": 0.7809, "step": 62 }, { "epoch": 0.30134529147982064, "grad_norm": 0.10728322195233368, "learning_rate": 1.9452749916169685e-06, "loss": 0.7948, "step": 63 }, { "epoch": 0.30612855007473844, "grad_norm": 0.10427886124668943, "learning_rate": 1.942663501745666e-06, "loss": 0.7843, "step": 64 }, { "epoch": 0.3109118086696562, "grad_norm": 0.09150641957182463, "learning_rate": 1.939992981823445e-06, "loss": 0.7713, "step": 65 }, { "epoch": 0.31569506726457397, "grad_norm": 0.10652939965487439, "learning_rate": 1.9372635990795744e-06, "loss": 0.7338, "step": 66 }, { "epoch": 0.32047832585949176, "grad_norm": 0.12224668990837938, "learning_rate": 1.934475524429339e-06, "loss": 0.7651, "step": 67 }, { "epoch": 0.32526158445440956, "grad_norm": 0.09554788331952155, "learning_rate": 1.9316289324633416e-06, "loss": 0.7743, "step": 68 }, { "epoch": 0.33004484304932735, "grad_norm": 0.10311314948775388, "learning_rate": 1.928724001436568e-06, "loss": 0.7818, "step": 69 }, { "epoch": 0.33482810164424515, "grad_norm": 0.11402809897006772, "learning_rate": 1.925760913257224e-06, "loss": 0.7738, "step": 70 }, { "epoch": 0.33961136023916294, "grad_norm": 0.10099702778225672, "learning_rate": 1.922739853475345e-06, "loss": 0.7694, "step": 71 }, { "epoch": 0.34439461883408073, "grad_norm": 0.09669133625846159, "learning_rate": 1.919661011271176e-06, "loss": 0.7695, "step": 72 }, { "epoch": 0.34917787742899853, "grad_norm": 0.10013746372306316, "learning_rate": 1.916524579443327e-06, "loss": 0.7762, "step": 73 }, { "epoch": 0.35396113602391627, "grad_norm": 0.09840254254939616, "learning_rate": 1.9133307543966972e-06, "loss": 0.7465, "step": 74 }, { "epoch": 0.35874439461883406, "grad_norm": 0.10348087475535427, "learning_rate": 1.910079736130178e-06, "loss": 0.7591, "step": 75 }, { "epoch": 0.36352765321375186, "grad_norm": 0.09831488128647803, "learning_rate": 1.9067717282241275e-06, "loss": 0.7473, "step": 76 }, { "epoch": 0.36831091180866965, "grad_norm": 0.10747256347092367, "learning_rate": 1.9034069378276248e-06, "loss": 0.7899, "step": 77 }, { "epoch": 0.37309417040358744, "grad_norm": 0.10145726153107046, "learning_rate": 1.8999855756454943e-06, "loss": 0.759, "step": 78 }, { "epoch": 0.37787742899850524, "grad_norm": 0.09521749859691808, "learning_rate": 1.8965078559251141e-06, "loss": 0.765, "step": 79 }, { "epoch": 0.38266068759342303, "grad_norm": 0.09559204768504546, "learning_rate": 1.892973996443e-06, "loss": 0.7653, "step": 80 }, { "epoch": 0.3874439461883408, "grad_norm": 0.09893961689958143, "learning_rate": 1.8893842184911652e-06, "loss": 0.7585, "step": 81 }, { "epoch": 0.3922272047832586, "grad_norm": 0.10469293200053865, "learning_rate": 1.8857387468632673e-06, "loss": 0.7396, "step": 82 }, { "epoch": 0.39701046337817636, "grad_norm": 0.09881168266263542, "learning_rate": 1.8820378098405269e-06, "loss": 0.7449, "step": 83 }, { "epoch": 0.40179372197309415, "grad_norm": 0.09472923155314936, "learning_rate": 1.878281639177437e-06, "loss": 0.7536, "step": 84 }, { "epoch": 0.40657698056801195, "grad_norm": 0.09940252508830999, "learning_rate": 1.874470470087246e-06, "loss": 0.7695, "step": 85 }, { "epoch": 0.41136023916292974, "grad_norm": 0.10835992130612712, "learning_rate": 1.8706045412272329e-06, "loss": 0.7804, "step": 86 }, { "epoch": 0.41614349775784754, "grad_norm": 0.09850260645852206, "learning_rate": 1.8666840946837588e-06, "loss": 0.7581, "step": 87 }, { "epoch": 0.42092675635276533, "grad_norm": 0.10663807706116737, "learning_rate": 1.8627093759571097e-06, "loss": 0.7486, "step": 88 }, { "epoch": 0.4257100149476831, "grad_norm": 0.09576966700987803, "learning_rate": 1.8586806339461223e-06, "loss": 0.7393, "step": 89 }, { "epoch": 0.4304932735426009, "grad_norm": 0.13616509255793824, "learning_rate": 1.8545981209325974e-06, "loss": 0.7412, "step": 90 }, { "epoch": 0.43527653213751866, "grad_norm": 0.10078747049635026, "learning_rate": 1.850462092565503e-06, "loss": 0.7522, "step": 91 }, { "epoch": 0.44005979073243645, "grad_norm": 0.09590506182617801, "learning_rate": 1.846272807844964e-06, "loss": 0.7361, "step": 92 }, { "epoch": 0.44484304932735425, "grad_norm": 0.09599938671410663, "learning_rate": 1.8420305291060453e-06, "loss": 0.7454, "step": 93 }, { "epoch": 0.44962630792227204, "grad_norm": 0.10175459960116054, "learning_rate": 1.837735522002322e-06, "loss": 0.7776, "step": 94 }, { "epoch": 0.45440956651718983, "grad_norm": 0.10921604960602464, "learning_rate": 1.8333880554892465e-06, "loss": 0.7284, "step": 95 }, { "epoch": 0.4591928251121076, "grad_norm": 0.10701793438795469, "learning_rate": 1.828988401807304e-06, "loss": 0.7275, "step": 96 }, { "epoch": 0.4639760837070254, "grad_norm": 0.10671158442373065, "learning_rate": 1.8245368364649672e-06, "loss": 0.7176, "step": 97 }, { "epoch": 0.4687593423019432, "grad_norm": 0.09323865008012455, "learning_rate": 1.8200336382214404e-06, "loss": 0.7558, "step": 98 }, { "epoch": 0.473542600896861, "grad_norm": 0.09924243426975013, "learning_rate": 1.815479089069208e-06, "loss": 0.7477, "step": 99 }, { "epoch": 0.47832585949177875, "grad_norm": 0.10034019533981096, "learning_rate": 1.8108734742163714e-06, "loss": 0.7302, "step": 100 }, { "epoch": 0.48310911808669654, "grad_norm": 0.09289950458176202, "learning_rate": 1.8062170820687923e-06, "loss": 0.7461, "step": 101 }, { "epoch": 0.48789237668161434, "grad_norm": 0.10063821105969947, "learning_rate": 1.8015102042120314e-06, "loss": 0.7374, "step": 102 }, { "epoch": 0.49267563527653213, "grad_norm": 0.10431764482912426, "learning_rate": 1.796753135393089e-06, "loss": 0.753, "step": 103 }, { "epoch": 0.4974588938714499, "grad_norm": 0.09777703419526715, "learning_rate": 1.791946173501948e-06, "loss": 0.7172, "step": 104 }, { "epoch": 0.5022421524663677, "grad_norm": 0.09880039694565383, "learning_rate": 1.7870896195529204e-06, "loss": 0.7157, "step": 105 }, { "epoch": 0.5070254110612855, "grad_norm": 0.10103523012523379, "learning_rate": 1.7821837776657967e-06, "loss": 0.7522, "step": 106 }, { "epoch": 0.5118086696562033, "grad_norm": 0.09953632352625874, "learning_rate": 1.777228955046803e-06, "loss": 0.7215, "step": 107 }, { "epoch": 0.5165919282511211, "grad_norm": 0.09448842637214858, "learning_rate": 1.7722254619693617e-06, "loss": 0.7311, "step": 108 }, { "epoch": 0.5213751868460389, "grad_norm": 0.09926544596139777, "learning_rate": 1.7671736117546643e-06, "loss": 0.7242, "step": 109 }, { "epoch": 0.5261584454409567, "grad_norm": 0.09420983432319698, "learning_rate": 1.7620737207520498e-06, "loss": 0.7302, "step": 110 }, { "epoch": 0.5309417040358745, "grad_norm": 0.09391867567605319, "learning_rate": 1.756926108319194e-06, "loss": 0.7222, "step": 111 }, { "epoch": 0.5357249626307923, "grad_norm": 0.09479652603956866, "learning_rate": 1.751731096802113e-06, "loss": 0.7361, "step": 112 }, { "epoch": 0.54050822122571, "grad_norm": 0.09440230389077435, "learning_rate": 1.7464890115149759e-06, "loss": 0.7183, "step": 113 }, { "epoch": 0.5452914798206278, "grad_norm": 0.09514244364363002, "learning_rate": 1.7412001807197361e-06, "loss": 0.7342, "step": 114 }, { "epoch": 0.5500747384155455, "grad_norm": 0.10939831006494534, "learning_rate": 1.735864935605572e-06, "loss": 0.7251, "step": 115 }, { "epoch": 0.5548579970104633, "grad_norm": 0.10066676165355973, "learning_rate": 1.7304836102681493e-06, "loss": 0.7081, "step": 116 }, { "epoch": 0.5596412556053811, "grad_norm": 0.10100361164339053, "learning_rate": 1.7250565416887015e-06, "loss": 0.742, "step": 117 }, { "epoch": 0.5644245142002989, "grad_norm": 0.09740229601345607, "learning_rate": 1.719584069712925e-06, "loss": 0.7314, "step": 118 }, { "epoch": 0.5692077727952167, "grad_norm": 0.1012821496567702, "learning_rate": 1.7140665370296992e-06, "loss": 0.7167, "step": 119 }, { "epoch": 0.5739910313901345, "grad_norm": 0.09994075838359362, "learning_rate": 1.708504289149628e-06, "loss": 0.7421, "step": 120 }, { "epoch": 0.5787742899850523, "grad_norm": 0.09513046173828367, "learning_rate": 1.702897674383402e-06, "loss": 0.7067, "step": 121 }, { "epoch": 0.5835575485799701, "grad_norm": 0.10488877885042427, "learning_rate": 1.697247043819988e-06, "loss": 0.7283, "step": 122 }, { "epoch": 0.5883408071748879, "grad_norm": 0.10017563354892535, "learning_rate": 1.6915527513046443e-06, "loss": 0.7289, "step": 123 }, { "epoch": 0.5931240657698057, "grad_norm": 0.09910676006320021, "learning_rate": 1.6858151534167616e-06, "loss": 0.7258, "step": 124 }, { "epoch": 0.5979073243647235, "grad_norm": 0.10226756484228856, "learning_rate": 1.6800346094475346e-06, "loss": 0.7294, "step": 125 }, { "epoch": 0.6026905829596413, "grad_norm": 0.0941277312513867, "learning_rate": 1.6742114813774618e-06, "loss": 0.7059, "step": 126 }, { "epoch": 0.6074738415545591, "grad_norm": 0.10468386708851042, "learning_rate": 1.6683461338536798e-06, "loss": 0.76, "step": 127 }, { "epoch": 0.6122571001494769, "grad_norm": 0.09546912003315239, "learning_rate": 1.6624389341671278e-06, "loss": 0.7199, "step": 128 }, { "epoch": 0.6170403587443947, "grad_norm": 0.09278710008849092, "learning_rate": 1.656490252229548e-06, "loss": 0.71, "step": 129 }, { "epoch": 0.6218236173393124, "grad_norm": 0.09629578223078193, "learning_rate": 1.6505004605503223e-06, "loss": 0.7297, "step": 130 }, { "epoch": 0.6266068759342301, "grad_norm": 0.10564515959559177, "learning_rate": 1.6444699342131428e-06, "loss": 0.7323, "step": 131 }, { "epoch": 0.6313901345291479, "grad_norm": 0.11359024419098725, "learning_rate": 1.638399050852528e-06, "loss": 0.7091, "step": 132 }, { "epoch": 0.6361733931240657, "grad_norm": 0.11261022540293862, "learning_rate": 1.632288190630172e-06, "loss": 0.7092, "step": 133 }, { "epoch": 0.6409566517189835, "grad_norm": 0.11356374624941931, "learning_rate": 1.6261377362111396e-06, "loss": 0.7226, "step": 134 }, { "epoch": 0.6457399103139013, "grad_norm": 0.09628738165774237, "learning_rate": 1.6199480727399032e-06, "loss": 0.7313, "step": 135 }, { "epoch": 0.6505231689088191, "grad_norm": 0.09955265729242128, "learning_rate": 1.6137195878162267e-06, "loss": 0.7264, "step": 136 }, { "epoch": 0.6553064275037369, "grad_norm": 0.10088157860044299, "learning_rate": 1.607452671470891e-06, "loss": 0.72, "step": 137 }, { "epoch": 0.6600896860986547, "grad_norm": 0.09316854100471951, "learning_rate": 1.601147716141272e-06, "loss": 0.7043, "step": 138 }, { "epoch": 0.6648729446935725, "grad_norm": 0.09866104920600266, "learning_rate": 1.5948051166467657e-06, "loss": 0.7314, "step": 139 }, { "epoch": 0.6696562032884903, "grad_norm": 0.09908667617176863, "learning_rate": 1.5884252701640634e-06, "loss": 0.7223, "step": 140 }, { "epoch": 0.6744394618834081, "grad_norm": 0.10108043693556777, "learning_rate": 1.5820085762022823e-06, "loss": 0.7145, "step": 141 }, { "epoch": 0.6792227204783259, "grad_norm": 0.09483321797525981, "learning_rate": 1.5755554365779455e-06, "loss": 0.712, "step": 142 }, { "epoch": 0.6840059790732437, "grad_norm": 0.09772063438530315, "learning_rate": 1.5690662553898222e-06, "loss": 0.7262, "step": 143 }, { "epoch": 0.6887892376681615, "grad_norm": 0.09547210509162248, "learning_rate": 1.5625414389936218e-06, "loss": 0.6881, "step": 144 }, { "epoch": 0.6935724962630793, "grad_norm": 0.10198333563773951, "learning_rate": 1.555981395976548e-06, "loss": 0.7023, "step": 145 }, { "epoch": 0.6983557548579971, "grad_norm": 0.0960216671080163, "learning_rate": 1.5493865371317123e-06, "loss": 0.7041, "step": 146 }, { "epoch": 0.7031390134529149, "grad_norm": 0.10811878950887173, "learning_rate": 1.542757275432411e-06, "loss": 0.7121, "step": 147 }, { "epoch": 0.7079222720478325, "grad_norm": 0.09745342759060693, "learning_rate": 1.5360940260062635e-06, "loss": 0.7, "step": 148 }, { "epoch": 0.7127055306427503, "grad_norm": 0.10002068890855158, "learning_rate": 1.5293972061092185e-06, "loss": 0.7174, "step": 149 }, { "epoch": 0.7174887892376681, "grad_norm": 0.094440761646848, "learning_rate": 1.522667235099422e-06, "loss": 0.6842, "step": 150 }, { "epoch": 0.7222720478325859, "grad_norm": 0.09714805521617614, "learning_rate": 1.515904534410961e-06, "loss": 0.6917, "step": 151 }, { "epoch": 0.7270553064275037, "grad_norm": 0.09206634939711936, "learning_rate": 1.5091095275274699e-06, "loss": 0.6807, "step": 152 }, { "epoch": 0.7318385650224215, "grad_norm": 0.09811924963451824, "learning_rate": 1.5022826399556133e-06, "loss": 0.6938, "step": 153 }, { "epoch": 0.7366218236173393, "grad_norm": 0.09469018906462104, "learning_rate": 1.4954242991984396e-06, "loss": 0.7262, "step": 154 }, { "epoch": 0.7414050822122571, "grad_norm": 0.09900495842570976, "learning_rate": 1.4885349347286115e-06, "loss": 0.6928, "step": 155 }, { "epoch": 0.7461883408071749, "grad_norm": 0.09813499443182924, "learning_rate": 1.4816149779615126e-06, "loss": 0.7041, "step": 156 }, { "epoch": 0.7509715994020927, "grad_norm": 0.09285509032551069, "learning_rate": 1.474664862228229e-06, "loss": 0.7157, "step": 157 }, { "epoch": 0.7557548579970105, "grad_norm": 0.09930227957877516, "learning_rate": 1.467685022748419e-06, "loss": 0.7077, "step": 158 }, { "epoch": 0.7605381165919283, "grad_norm": 0.09336816965151891, "learning_rate": 1.4606758966030534e-06, "loss": 0.6905, "step": 159 }, { "epoch": 0.7653213751868461, "grad_norm": 0.09584860785157516, "learning_rate": 1.4536379227070509e-06, "loss": 0.704, "step": 160 }, { "epoch": 0.7701046337817639, "grad_norm": 0.09906164552724124, "learning_rate": 1.4465715417817888e-06, "loss": 0.7014, "step": 161 }, { "epoch": 0.7748878923766817, "grad_norm": 0.09920929186360831, "learning_rate": 1.4394771963275076e-06, "loss": 0.6711, "step": 162 }, { "epoch": 0.7796711509715994, "grad_norm": 0.09312914704123235, "learning_rate": 1.4323553305955997e-06, "loss": 0.704, "step": 163 }, { "epoch": 0.7844544095665172, "grad_norm": 0.09380001375870357, "learning_rate": 1.4252063905607909e-06, "loss": 0.6769, "step": 164 }, { "epoch": 0.7892376681614349, "grad_norm": 0.09383108087011895, "learning_rate": 1.4180308238932135e-06, "loss": 0.6903, "step": 165 }, { "epoch": 0.7940209267563527, "grad_norm": 0.09761627284743495, "learning_rate": 1.410829079930372e-06, "loss": 0.7126, "step": 166 }, { "epoch": 0.7988041853512705, "grad_norm": 0.09591926993818495, "learning_rate": 1.4036016096490064e-06, "loss": 0.6936, "step": 167 }, { "epoch": 0.8035874439461883, "grad_norm": 0.09463907898930997, "learning_rate": 1.3963488656368517e-06, "loss": 0.6918, "step": 168 }, { "epoch": 0.8083707025411061, "grad_norm": 0.10314575539858357, "learning_rate": 1.389071302064295e-06, "loss": 0.6837, "step": 169 }, { "epoch": 0.8131539611360239, "grad_norm": 0.0964154089668258, "learning_rate": 1.381769374655938e-06, "loss": 0.7087, "step": 170 }, { "epoch": 0.8179372197309417, "grad_norm": 0.10458955759891816, "learning_rate": 1.374443540662057e-06, "loss": 0.7132, "step": 171 }, { "epoch": 0.8227204783258595, "grad_norm": 0.11118113052583456, "learning_rate": 1.3670942588299705e-06, "loss": 0.689, "step": 172 }, { "epoch": 0.8275037369207773, "grad_norm": 0.09430050647819165, "learning_rate": 1.3597219893753117e-06, "loss": 0.6669, "step": 173 }, { "epoch": 0.8322869955156951, "grad_norm": 0.10018122520539552, "learning_rate": 1.352327193953211e-06, "loss": 0.675, "step": 174 }, { "epoch": 0.8370702541106129, "grad_norm": 0.1036112926787395, "learning_rate": 1.3449103356293852e-06, "loss": 0.7151, "step": 175 }, { "epoch": 0.8418535127055307, "grad_norm": 0.09652117392718416, "learning_rate": 1.337471878851141e-06, "loss": 0.6819, "step": 176 }, { "epoch": 0.8466367713004485, "grad_norm": 0.11467070226240633, "learning_rate": 1.3300122894182909e-06, "loss": 0.7063, "step": 177 }, { "epoch": 0.8514200298953662, "grad_norm": 0.0974406950357686, "learning_rate": 1.3225320344539842e-06, "loss": 0.7154, "step": 178 }, { "epoch": 0.856203288490284, "grad_norm": 0.10056923973958724, "learning_rate": 1.315031582375457e-06, "loss": 0.7119, "step": 179 }, { "epoch": 0.8609865470852018, "grad_norm": 0.10289512917324216, "learning_rate": 1.3075114028646974e-06, "loss": 0.6872, "step": 180 }, { "epoch": 0.8657698056801196, "grad_norm": 0.10284996024746469, "learning_rate": 1.299971966839036e-06, "loss": 0.6995, "step": 181 }, { "epoch": 0.8705530642750373, "grad_norm": 0.09442402879665361, "learning_rate": 1.292413746421655e-06, "loss": 0.6788, "step": 182 }, { "epoch": 0.8753363228699551, "grad_norm": 0.09221585066528634, "learning_rate": 1.2848372149120246e-06, "loss": 0.6625, "step": 183 }, { "epoch": 0.8801195814648729, "grad_norm": 0.09614590670948946, "learning_rate": 1.2772428467562651e-06, "loss": 0.6993, "step": 184 }, { "epoch": 0.8849028400597907, "grad_norm": 0.09884964743533457, "learning_rate": 1.2696311175174357e-06, "loss": 0.6826, "step": 185 }, { "epoch": 0.8896860986547085, "grad_norm": 0.10049262287084837, "learning_rate": 1.2620025038457554e-06, "loss": 0.6875, "step": 186 }, { "epoch": 0.8944693572496263, "grad_norm": 0.0951319815934962, "learning_rate": 1.254357483448755e-06, "loss": 0.6763, "step": 187 }, { "epoch": 0.8992526158445441, "grad_norm": 0.0935897850203258, "learning_rate": 1.2466965350613615e-06, "loss": 0.7191, "step": 188 }, { "epoch": 0.9040358744394619, "grad_norm": 0.10488228598924217, "learning_rate": 1.2390201384159219e-06, "loss": 0.7031, "step": 189 }, { "epoch": 0.9088191330343797, "grad_norm": 0.09803611282531831, "learning_rate": 1.231328774212159e-06, "loss": 0.6596, "step": 190 }, { "epoch": 0.9136023916292975, "grad_norm": 0.10982924572402691, "learning_rate": 1.223622924087073e-06, "loss": 0.685, "step": 191 }, { "epoch": 0.9183856502242153, "grad_norm": 0.0990057467989385, "learning_rate": 1.215903070584779e-06, "loss": 0.6905, "step": 192 }, { "epoch": 0.923168908819133, "grad_norm": 0.09806799076875558, "learning_rate": 1.2081696971262903e-06, "loss": 0.6888, "step": 193 }, { "epoch": 0.9279521674140508, "grad_norm": 0.09725950749183558, "learning_rate": 1.2004232879792464e-06, "loss": 0.6897, "step": 194 }, { "epoch": 0.9327354260089686, "grad_norm": 0.09998658118754998, "learning_rate": 1.1926643282275882e-06, "loss": 0.6808, "step": 195 }, { "epoch": 0.9375186846038864, "grad_norm": 0.09991311679692257, "learning_rate": 1.1848933037411825e-06, "loss": 0.6721, "step": 196 }, { "epoch": 0.9423019431988042, "grad_norm": 0.09570773453199784, "learning_rate": 1.1771107011453933e-06, "loss": 0.6943, "step": 197 }, { "epoch": 0.947085201793722, "grad_norm": 0.09891331359398514, "learning_rate": 1.1693170077906143e-06, "loss": 0.6989, "step": 198 }, { "epoch": 0.9518684603886398, "grad_norm": 0.09162536810525922, "learning_rate": 1.1615127117217463e-06, "loss": 0.6705, "step": 199 }, { "epoch": 0.9566517189835575, "grad_norm": 0.08903988395053124, "learning_rate": 1.1536983016476373e-06, "loss": 0.679, "step": 200 }, { "epoch": 0.9614349775784753, "grad_norm": 0.09042806424104788, "learning_rate": 1.1458742669104803e-06, "loss": 0.6652, "step": 201 }, { "epoch": 0.9662182361733931, "grad_norm": 0.10347050843667145, "learning_rate": 1.1380410974551682e-06, "loss": 0.6891, "step": 202 }, { "epoch": 0.9710014947683109, "grad_norm": 0.0937785288147842, "learning_rate": 1.130199283798615e-06, "loss": 0.662, "step": 203 }, { "epoch": 0.9757847533632287, "grad_norm": 0.10125646071292, "learning_rate": 1.1223493169990391e-06, "loss": 0.6857, "step": 204 }, { "epoch": 0.9805680119581465, "grad_norm": 0.09552098120941739, "learning_rate": 1.1144916886252124e-06, "loss": 0.6693, "step": 205 }, { "epoch": 0.9853512705530643, "grad_norm": 0.0939464203547695, "learning_rate": 1.1066268907256782e-06, "loss": 0.689, "step": 206 }, { "epoch": 0.9901345291479821, "grad_norm": 0.1083244661837491, "learning_rate": 1.098755415797939e-06, "loss": 0.6795, "step": 207 }, { "epoch": 0.9949177877428999, "grad_norm": 0.09671011359258122, "learning_rate": 1.0908777567576168e-06, "loss": 0.697, "step": 208 }, { "epoch": 0.9997010463378176, "grad_norm": 0.09491067631505212, "learning_rate": 1.0829944069075847e-06, "loss": 0.6913, "step": 209 }, { "epoch": 1.0, "grad_norm": 0.09491067631505212, "learning_rate": 1.0751058599070781e-06, "loss": 0.0398, "step": 210 }, { "epoch": 1.0047832585949177, "grad_norm": 0.09568291564665689, "learning_rate": 1.0672126097407795e-06, "loss": 0.6558, "step": 211 }, { "epoch": 1.0095665171898356, "grad_norm": 0.0890899262566247, "learning_rate": 1.0593151506878865e-06, "loss": 0.6742, "step": 212 }, { "epoch": 1.0143497757847533, "grad_norm": 0.08951496407842846, "learning_rate": 1.0514139772911597e-06, "loss": 0.6589, "step": 213 }, { "epoch": 1.0191330343796712, "grad_norm": 0.09303979677050327, "learning_rate": 1.043509584325953e-06, "loss": 0.6526, "step": 214 }, { "epoch": 1.0239162929745889, "grad_norm": 0.10551892280989528, "learning_rate": 1.0356024667692314e-06, "loss": 0.6849, "step": 215 }, { "epoch": 1.0286995515695068, "grad_norm": 0.10560698057117009, "learning_rate": 1.0276931197685753e-06, "loss": 0.6947, "step": 216 }, { "epoch": 1.0334828101644244, "grad_norm": 0.09055248425609617, "learning_rate": 1.0197820386111737e-06, "loss": 0.6692, "step": 217 }, { "epoch": 1.0382660687593424, "grad_norm": 0.08952534903326591, "learning_rate": 1.0118697186928105e-06, "loss": 0.6481, "step": 218 }, { "epoch": 1.04304932735426, "grad_norm": 0.0949207133753394, "learning_rate": 1.0039566554868392e-06, "loss": 0.6561, "step": 219 }, { "epoch": 1.047832585949178, "grad_norm": 0.09247582314260705, "learning_rate": 9.960433445131607e-07, "loss": 0.6727, "step": 220 }, { "epoch": 1.0526158445440956, "grad_norm": 0.0922431854223743, "learning_rate": 9.881302813071896e-07, "loss": 0.6786, "step": 221 }, { "epoch": 1.0573991031390135, "grad_norm": 0.09921340856730206, "learning_rate": 9.802179613888262e-07, "loss": 0.6492, "step": 222 }, { "epoch": 1.0621823617339312, "grad_norm": 0.09405904196612806, "learning_rate": 9.723068802314246e-07, "loss": 0.6435, "step": 223 }, { "epoch": 1.0669656203288491, "grad_norm": 0.10252064804861775, "learning_rate": 9.643975332307687e-07, "loss": 0.6693, "step": 224 }, { "epoch": 1.0717488789237668, "grad_norm": 0.09137882604103069, "learning_rate": 9.564904156740471e-07, "loss": 0.6554, "step": 225 }, { "epoch": 1.0765321375186847, "grad_norm": 0.09506143141231545, "learning_rate": 9.485860227088405e-07, "loss": 0.6524, "step": 226 }, { "epoch": 1.0813153961136024, "grad_norm": 0.09471266291722098, "learning_rate": 9.406848493121134e-07, "loss": 0.6598, "step": 227 }, { "epoch": 1.08609865470852, "grad_norm": 0.09374158444399681, "learning_rate": 9.327873902592205e-07, "loss": 0.6546, "step": 228 }, { "epoch": 1.090881913303438, "grad_norm": 0.0988485463507574, "learning_rate": 9.248941400929222e-07, "loss": 0.6659, "step": 229 }, { "epoch": 1.0956651718983557, "grad_norm": 0.09989186431558944, "learning_rate": 9.17005593092415e-07, "loss": 0.6789, "step": 230 }, { "epoch": 1.1004484304932736, "grad_norm": 0.09577210416129449, "learning_rate": 9.09122243242383e-07, "loss": 0.6395, "step": 231 }, { "epoch": 1.1052316890881912, "grad_norm": 0.09417460653116495, "learning_rate": 9.01244584202061e-07, "loss": 0.6351, "step": 232 }, { "epoch": 1.1100149476831092, "grad_norm": 0.1060296134876217, "learning_rate": 8.933731092743219e-07, "loss": 0.6843, "step": 233 }, { "epoch": 1.1147982062780268, "grad_norm": 0.1015156854708665, "learning_rate": 8.855083113747875e-07, "loss": 0.6533, "step": 234 }, { "epoch": 1.1195814648729447, "grad_norm": 0.09252864648733664, "learning_rate": 8.776506830009607e-07, "loss": 0.6529, "step": 235 }, { "epoch": 1.1243647234678624, "grad_norm": 0.09810040579156247, "learning_rate": 8.698007162013849e-07, "loss": 0.6622, "step": 236 }, { "epoch": 1.1291479820627803, "grad_norm": 0.10333456832019272, "learning_rate": 8.619589025448318e-07, "loss": 0.6698, "step": 237 }, { "epoch": 1.133931240657698, "grad_norm": 0.09369526359642345, "learning_rate": 8.541257330895197e-07, "loss": 0.6397, "step": 238 }, { "epoch": 1.138714499252616, "grad_norm": 0.0934070849673633, "learning_rate": 8.463016983523627e-07, "loss": 0.6724, "step": 239 }, { "epoch": 1.1434977578475336, "grad_norm": 0.0968568071003159, "learning_rate": 8.384872882782541e-07, "loss": 0.6651, "step": 240 }, { "epoch": 1.1482810164424515, "grad_norm": 0.09218848184783551, "learning_rate": 8.306829922093857e-07, "loss": 0.6482, "step": 241 }, { "epoch": 1.1530642750373692, "grad_norm": 0.09367162146496326, "learning_rate": 8.228892988546067e-07, "loss": 0.6532, "step": 242 }, { "epoch": 1.157847533632287, "grad_norm": 0.09179870741014423, "learning_rate": 8.15106696258818e-07, "loss": 0.6458, "step": 243 }, { "epoch": 1.1626307922272048, "grad_norm": 0.10425982157218257, "learning_rate": 8.073356717724115e-07, "loss": 0.6476, "step": 244 }, { "epoch": 1.1674140508221225, "grad_norm": 0.10785978296392415, "learning_rate": 7.995767120207536e-07, "loss": 0.6542, "step": 245 }, { "epoch": 1.1721973094170404, "grad_norm": 0.09053925155843066, "learning_rate": 7.918303028737096e-07, "loss": 0.6444, "step": 246 }, { "epoch": 1.176980568011958, "grad_norm": 0.11054671698924359, "learning_rate": 7.840969294152211e-07, "loss": 0.6546, "step": 247 }, { "epoch": 1.181763826606876, "grad_norm": 0.09190168624229306, "learning_rate": 7.763770759129269e-07, "loss": 0.6483, "step": 248 }, { "epoch": 1.1865470852017936, "grad_norm": 0.10112895278117082, "learning_rate": 7.68671225787841e-07, "loss": 0.6607, "step": 249 }, { "epoch": 1.1913303437967115, "grad_norm": 0.09521368142452571, "learning_rate": 7.609798615840785e-07, "loss": 0.6632, "step": 250 }, { "epoch": 1.1961136023916292, "grad_norm": 0.09631678500828386, "learning_rate": 7.533034649386384e-07, "loss": 0.6271, "step": 251 }, { "epoch": 1.2008968609865471, "grad_norm": 0.09402110237205977, "learning_rate": 7.456425165512452e-07, "loss": 0.649, "step": 252 }, { "epoch": 1.2056801195814648, "grad_norm": 0.10452266128761932, "learning_rate": 7.379974961542447e-07, "loss": 0.6744, "step": 253 }, { "epoch": 1.2104633781763827, "grad_norm": 0.09522707743392524, "learning_rate": 7.303688824825646e-07, "loss": 0.6608, "step": 254 }, { "epoch": 1.2152466367713004, "grad_norm": 0.09573208889216732, "learning_rate": 7.227571532437349e-07, "loss": 0.652, "step": 255 }, { "epoch": 1.2200298953662183, "grad_norm": 0.08917908293059873, "learning_rate": 7.151627850879755e-07, "loss": 0.6543, "step": 256 }, { "epoch": 1.224813153961136, "grad_norm": 0.09616438435062312, "learning_rate": 7.075862535783453e-07, "loss": 0.6337, "step": 257 }, { "epoch": 1.229596412556054, "grad_norm": 0.09640367364080155, "learning_rate": 7.00028033160964e-07, "loss": 0.6839, "step": 258 }, { "epoch": 1.2343796711509716, "grad_norm": 0.09586353497663917, "learning_rate": 6.924885971353026e-07, "loss": 0.6669, "step": 259 }, { "epoch": 1.2391629297458895, "grad_norm": 0.09267059238961081, "learning_rate": 6.849684176245431e-07, "loss": 0.6314, "step": 260 }, { "epoch": 1.2439461883408072, "grad_norm": 0.09031407329588002, "learning_rate": 6.774679655460158e-07, "loss": 0.6449, "step": 261 }, { "epoch": 1.2487294469357249, "grad_norm": 0.09470627715876291, "learning_rate": 6.699877105817092e-07, "loss": 0.6502, "step": 262 }, { "epoch": 1.2535127055306428, "grad_norm": 0.10074811226580811, "learning_rate": 6.625281211488591e-07, "loss": 0.6686, "step": 263 }, { "epoch": 1.2582959641255607, "grad_norm": 0.10063396201285223, "learning_rate": 6.55089664370615e-07, "loss": 0.6695, "step": 264 }, { "epoch": 1.2630792227204783, "grad_norm": 0.0918463846096307, "learning_rate": 6.476728060467888e-07, "loss": 0.6451, "step": 265 }, { "epoch": 1.267862481315396, "grad_norm": 0.09328601851356563, "learning_rate": 6.402780106246884e-07, "loss": 0.6532, "step": 266 }, { "epoch": 1.272645739910314, "grad_norm": 0.09424847785405825, "learning_rate": 6.329057411700298e-07, "loss": 0.6673, "step": 267 }, { "epoch": 1.2774289985052316, "grad_norm": 0.10008134051501576, "learning_rate": 6.255564593379429e-07, "loss": 0.6672, "step": 268 }, { "epoch": 1.2822122571001495, "grad_norm": 0.09294984655524738, "learning_rate": 6.182306253440619e-07, "loss": 0.6395, "step": 269 }, { "epoch": 1.2869955156950672, "grad_norm": 0.10285895388747343, "learning_rate": 6.109286979357051e-07, "loss": 0.6637, "step": 270 }, { "epoch": 1.291778774289985, "grad_norm": 0.11139784795321246, "learning_rate": 6.036511343631488e-07, "loss": 0.6455, "step": 271 }, { "epoch": 1.2965620328849028, "grad_norm": 0.09212296328590026, "learning_rate": 5.963983903509935e-07, "loss": 0.6638, "step": 272 }, { "epoch": 1.3013452914798207, "grad_norm": 0.0949968377343012, "learning_rate": 5.89170920069628e-07, "loss": 0.6548, "step": 273 }, { "epoch": 1.3061285500747384, "grad_norm": 0.09690303299554558, "learning_rate": 5.819691761067865e-07, "loss": 0.6388, "step": 274 }, { "epoch": 1.310911808669656, "grad_norm": 0.09255296263795812, "learning_rate": 5.747936094392089e-07, "loss": 0.6435, "step": 275 }, { "epoch": 1.315695067264574, "grad_norm": 0.09503263182638313, "learning_rate": 5.676446694044002e-07, "loss": 0.638, "step": 276 }, { "epoch": 1.3204783258594919, "grad_norm": 0.09478054996201758, "learning_rate": 5.605228036724927e-07, "loss": 0.6502, "step": 277 }, { "epoch": 1.3252615844544096, "grad_norm": 0.0933411883471192, "learning_rate": 5.534284582182114e-07, "loss": 0.6511, "step": 278 }, { "epoch": 1.3300448430493272, "grad_norm": 0.09944351370813859, "learning_rate": 5.463620772929494e-07, "loss": 0.6325, "step": 279 }, { "epoch": 1.3348281016442451, "grad_norm": 0.10023032726854744, "learning_rate": 5.393241033969466e-07, "loss": 0.6418, "step": 280 }, { "epoch": 1.339611360239163, "grad_norm": 0.09729398494948012, "learning_rate": 5.323149772515812e-07, "loss": 0.6372, "step": 281 }, { "epoch": 1.3443946188340807, "grad_norm": 0.09323209082587747, "learning_rate": 5.253351377717706e-07, "loss": 0.6504, "step": 282 }, { "epoch": 1.3491778774289984, "grad_norm": 0.08940562070783202, "learning_rate": 5.183850220384873e-07, "loss": 0.6461, "step": 283 }, { "epoch": 1.3539611360239163, "grad_norm": 0.09092518318025446, "learning_rate": 5.114650652713884e-07, "loss": 0.6542, "step": 284 }, { "epoch": 1.358744394618834, "grad_norm": 0.0957083892879257, "learning_rate": 5.045757008015606e-07, "loss": 0.6627, "step": 285 }, { "epoch": 1.363527653213752, "grad_norm": 0.09918131125769998, "learning_rate": 4.977173600443868e-07, "loss": 0.6447, "step": 286 }, { "epoch": 1.3683109118086696, "grad_norm": 0.09079455495976413, "learning_rate": 4.908904724725299e-07, "loss": 0.651, "step": 287 }, { "epoch": 1.3730941704035875, "grad_norm": 0.09533039778556848, "learning_rate": 4.840954655890391e-07, "loss": 0.6518, "step": 288 }, { "epoch": 1.3778774289985052, "grad_norm": 0.09328409620590697, "learning_rate": 4.773327649005777e-07, "loss": 0.6712, "step": 289 }, { "epoch": 1.382660687593423, "grad_norm": 0.10546886430926707, "learning_rate": 4.7060279389078184e-07, "loss": 0.6594, "step": 290 }, { "epoch": 1.3874439461883408, "grad_norm": 0.09513157037379577, "learning_rate": 4.6390597399373644e-07, "loss": 0.6311, "step": 291 }, { "epoch": 1.3922272047832587, "grad_norm": 0.0910714399276055, "learning_rate": 4.5724272456758907e-07, "loss": 0.6524, "step": 292 }, { "epoch": 1.3970104633781764, "grad_norm": 0.08960044994197404, "learning_rate": 4.506134628682877e-07, "loss": 0.6515, "step": 293 }, { "epoch": 1.4017937219730943, "grad_norm": 0.0939439987196228, "learning_rate": 4.440186040234524e-07, "loss": 0.6487, "step": 294 }, { "epoch": 1.406576980568012, "grad_norm": 0.10645194425387064, "learning_rate": 4.3745856100637834e-07, "loss": 0.629, "step": 295 }, { "epoch": 1.4113602391629296, "grad_norm": 0.1047763121754449, "learning_rate": 4.3093374461017785e-07, "loss": 0.6466, "step": 296 }, { "epoch": 1.4161434977578475, "grad_norm": 0.09982639743024341, "learning_rate": 4.244445634220545e-07, "loss": 0.6504, "step": 297 }, { "epoch": 1.4209267563527654, "grad_norm": 0.094704337085837, "learning_rate": 4.1799142379771766e-07, "loss": 0.6675, "step": 298 }, { "epoch": 1.4257100149476831, "grad_norm": 0.09542340607816273, "learning_rate": 4.115747298359363e-07, "loss": 0.6379, "step": 299 }, { "epoch": 1.4304932735426008, "grad_norm": 0.09975848410849608, "learning_rate": 4.0519488335323415e-07, "loss": 0.6684, "step": 300 }, { "epoch": 1.4352765321375187, "grad_norm": 0.09564133208363568, "learning_rate": 3.9885228385872806e-07, "loss": 0.6345, "step": 301 }, { "epoch": 1.4400597907324364, "grad_norm": 0.0955432935737647, "learning_rate": 3.925473285291091e-07, "loss": 0.6419, "step": 302 }, { "epoch": 1.4448430493273543, "grad_norm": 0.0971708074341661, "learning_rate": 3.862804121837733e-07, "loss": 0.6568, "step": 303 }, { "epoch": 1.449626307922272, "grad_norm": 0.09654206097129785, "learning_rate": 3.8005192726009663e-07, "loss": 0.6526, "step": 304 }, { "epoch": 1.45440956651719, "grad_norm": 0.1047844291301578, "learning_rate": 3.738622637888608e-07, "loss": 0.6554, "step": 305 }, { "epoch": 1.4591928251121076, "grad_norm": 0.10495835343403974, "learning_rate": 3.677118093698278e-07, "loss": 0.639, "step": 306 }, { "epoch": 1.4639760837070255, "grad_norm": 0.09312185978330073, "learning_rate": 3.61600949147472e-07, "loss": 0.6534, "step": 307 }, { "epoch": 1.4687593423019432, "grad_norm": 0.0914400067851364, "learning_rate": 3.5553006578685706e-07, "loss": 0.6364, "step": 308 }, { "epoch": 1.473542600896861, "grad_norm": 0.10168751711517944, "learning_rate": 3.494995394496778e-07, "loss": 0.6438, "step": 309 }, { "epoch": 1.4783258594917787, "grad_norm": 0.08777082505313431, "learning_rate": 3.435097477704517e-07, "loss": 0.6159, "step": 310 }, { "epoch": 1.4831091180866967, "grad_norm": 0.0992483436164171, "learning_rate": 3.3756106583287205e-07, "loss": 0.6692, "step": 311 }, { "epoch": 1.4878923766816143, "grad_norm": 0.09763140125702534, "learning_rate": 3.316538661463204e-07, "loss": 0.6704, "step": 312 }, { "epoch": 1.492675635276532, "grad_norm": 0.103958466638517, "learning_rate": 3.2578851862253796e-07, "loss": 0.6582, "step": 313 }, { "epoch": 1.49745889387145, "grad_norm": 0.09058417960194183, "learning_rate": 3.199653905524654e-07, "loss": 0.6353, "step": 314 }, { "epoch": 1.5022421524663678, "grad_norm": 0.10131403619552605, "learning_rate": 3.1418484658323806e-07, "loss": 0.6566, "step": 315 }, { "epoch": 1.5070254110612855, "grad_norm": 0.09681513597634411, "learning_rate": 3.0844724869535577e-07, "loss": 0.6437, "step": 316 }, { "epoch": 1.5118086696562032, "grad_norm": 0.10073309195120103, "learning_rate": 3.027529561800117e-07, "loss": 0.6541, "step": 317 }, { "epoch": 1.516591928251121, "grad_norm": 0.09187767379862512, "learning_rate": 2.971023256165983e-07, "loss": 0.6429, "step": 318 }, { "epoch": 1.521375186846039, "grad_norm": 0.09322468814151724, "learning_rate": 2.9149571085037215e-07, "loss": 0.6536, "step": 319 }, { "epoch": 1.5261584454409567, "grad_norm": 0.09535864278016615, "learning_rate": 2.8593346297030073e-07, "loss": 0.6448, "step": 320 }, { "epoch": 1.5309417040358744, "grad_norm": 0.09853757658051235, "learning_rate": 2.804159302870751e-07, "loss": 0.6361, "step": 321 }, { "epoch": 1.5357249626307923, "grad_norm": 0.08652865663588583, "learning_rate": 2.7494345831129837e-07, "loss": 0.6275, "step": 322 }, { "epoch": 1.54050822122571, "grad_norm": 0.09209381258321075, "learning_rate": 2.6951638973185073e-07, "loss": 0.6528, "step": 323 }, { "epoch": 1.5452914798206279, "grad_norm": 0.09568385273192681, "learning_rate": 2.64135064394428e-07, "loss": 0.6632, "step": 324 }, { "epoch": 1.5500747384155455, "grad_norm": 0.0947277435093391, "learning_rate": 2.587998192802638e-07, "loss": 0.6306, "step": 325 }, { "epoch": 1.5548579970104632, "grad_norm": 0.0985703474276344, "learning_rate": 2.5351098848502386e-07, "loss": 0.6511, "step": 326 }, { "epoch": 1.5596412556053811, "grad_norm": 0.09427610648180619, "learning_rate": 2.482689031978872e-07, "loss": 0.6533, "step": 327 }, { "epoch": 1.564424514200299, "grad_norm": 0.09520925811802433, "learning_rate": 2.4307389168080606e-07, "loss": 0.6603, "step": 328 }, { "epoch": 1.5692077727952167, "grad_norm": 0.0907369263004915, "learning_rate": 2.3792627924795038e-07, "loss": 0.6818, "step": 329 }, { "epoch": 1.5739910313901344, "grad_norm": 0.09440279581013306, "learning_rate": 2.3282638824533529e-07, "loss": 0.6531, "step": 330 }, { "epoch": 1.5787742899850523, "grad_norm": 0.09614745051429147, "learning_rate": 2.277745380306383e-07, "loss": 0.6795, "step": 331 }, { "epoch": 1.5835575485799702, "grad_norm": 0.09778941686336041, "learning_rate": 2.227710449531971e-07, "loss": 0.6778, "step": 332 }, { "epoch": 1.588340807174888, "grad_norm": 0.09575250682717351, "learning_rate": 2.178162223342035e-07, "loss": 0.6404, "step": 333 }, { "epoch": 1.5931240657698056, "grad_norm": 0.09627217057571222, "learning_rate": 2.1291038044707965e-07, "loss": 0.6528, "step": 334 }, { "epoch": 1.5979073243647235, "grad_norm": 0.09572743591446818, "learning_rate": 2.0805382649805225e-07, "loss": 0.6461, "step": 335 }, { "epoch": 1.6026905829596414, "grad_norm": 0.09528928099830879, "learning_rate": 2.032468646069112e-07, "loss": 0.6425, "step": 336 }, { "epoch": 1.607473841554559, "grad_norm": 0.09652866769512121, "learning_rate": 1.9848979578796865e-07, "loss": 0.6548, "step": 337 }, { "epoch": 1.6122571001494768, "grad_norm": 0.0954083836089715, "learning_rate": 1.937829179312076e-07, "loss": 0.6633, "step": 338 }, { "epoch": 1.6170403587443947, "grad_norm": 0.09389212828330971, "learning_rate": 1.8912652578362853e-07, "loss": 0.653, "step": 339 }, { "epoch": 1.6218236173393124, "grad_norm": 0.09323975661872334, "learning_rate": 1.8452091093079215e-07, "loss": 0.6405, "step": 340 }, { "epoch": 1.6266068759342303, "grad_norm": 0.1030124431981675, "learning_rate": 1.7996636177855928e-07, "loss": 0.6776, "step": 341 }, { "epoch": 1.631390134529148, "grad_norm": 0.09627742650338285, "learning_rate": 1.75463163535033e-07, "loss": 0.6579, "step": 342 }, { "epoch": 1.6361733931240656, "grad_norm": 0.09724021609427144, "learning_rate": 1.7101159819269583e-07, "loss": 0.6432, "step": 343 }, { "epoch": 1.6409566517189835, "grad_norm": 0.09615121849981347, "learning_rate": 1.6661194451075345e-07, "loss": 0.6628, "step": 344 }, { "epoch": 1.6457399103139014, "grad_norm": 0.11302849698050037, "learning_rate": 1.6226447799767772e-07, "loss": 0.6306, "step": 345 }, { "epoch": 1.6505231689088191, "grad_norm": 0.10400127614773519, "learning_rate": 1.5796947089395475e-07, "loss": 0.6462, "step": 346 }, { "epoch": 1.6553064275037368, "grad_norm": 0.08798479350296001, "learning_rate": 1.5372719215503582e-07, "loss": 0.6309, "step": 347 }, { "epoch": 1.6600896860986547, "grad_norm": 0.09514870211869147, "learning_rate": 1.4953790743449702e-07, "loss": 0.6631, "step": 348 }, { "epoch": 1.6648729446935726, "grad_norm": 0.09749807157916107, "learning_rate": 1.4540187906740241e-07, "loss": 0.6285, "step": 349 }, { "epoch": 1.6696562032884903, "grad_norm": 0.0901583318721974, "learning_rate": 1.4131936605387762e-07, "loss": 0.6731, "step": 350 }, { "epoch": 1.674439461883408, "grad_norm": 0.09526536450165937, "learning_rate": 1.3729062404289017e-07, "loss": 0.6729, "step": 351 }, { "epoch": 1.6792227204783259, "grad_norm": 0.09836491336123554, "learning_rate": 1.3331590531624115e-07, "loss": 0.6515, "step": 352 }, { "epoch": 1.6840059790732438, "grad_norm": 0.10075181987095727, "learning_rate": 1.2939545877276726e-07, "loss": 0.6452, "step": 353 }, { "epoch": 1.6887892376681615, "grad_norm": 0.09365016014154177, "learning_rate": 1.25529529912754e-07, "loss": 0.6477, "step": 354 }, { "epoch": 1.6935724962630792, "grad_norm": 0.09704957910910289, "learning_rate": 1.2171836082256316e-07, "loss": 0.6678, "step": 355 }, { "epoch": 1.698355754857997, "grad_norm": 0.0902657671425916, "learning_rate": 1.1796219015947285e-07, "loss": 0.6515, "step": 356 }, { "epoch": 1.703139013452915, "grad_norm": 0.09237650202510098, "learning_rate": 1.1426125313673285e-07, "loss": 0.6645, "step": 357 }, { "epoch": 1.7079222720478326, "grad_norm": 0.09196231975892524, "learning_rate": 1.1061578150883444e-07, "loss": 0.6092, "step": 358 }, { "epoch": 1.7127055306427503, "grad_norm": 0.10378820492061246, "learning_rate": 1.070260035570002e-07, "loss": 0.6539, "step": 359 }, { "epoch": 1.717488789237668, "grad_norm": 0.09091589756400278, "learning_rate": 1.0349214407488571e-07, "loss": 0.6454, "step": 360 }, { "epoch": 1.722272047832586, "grad_norm": 0.09881444337923977, "learning_rate": 1.000144243545058e-07, "loss": 0.6486, "step": 361 }, { "epoch": 1.7270553064275038, "grad_norm": 0.09311309771551186, "learning_rate": 9.659306217237517e-08, "loss": 0.6402, "step": 362 }, { "epoch": 1.7318385650224215, "grad_norm": 0.09631340848121332, "learning_rate": 9.322827177587212e-08, "loss": 0.6469, "step": 363 }, { "epoch": 1.7366218236173392, "grad_norm": 0.08882699558772723, "learning_rate": 8.992026386982221e-08, "loss": 0.6535, "step": 364 }, { "epoch": 1.741405082212257, "grad_norm": 0.09280206311141305, "learning_rate": 8.66692456033029e-08, "loss": 0.648, "step": 365 }, { "epoch": 1.746188340807175, "grad_norm": 0.0909402496845187, "learning_rate": 8.347542055667311e-08, "loss": 0.6529, "step": 366 }, { "epoch": 1.7509715994020927, "grad_norm": 0.09512784479004122, "learning_rate": 8.033898872882394e-08, "loss": 0.6383, "step": 367 }, { "epoch": 1.7557548579970104, "grad_norm": 0.09252600518424785, "learning_rate": 7.726014652465507e-08, "loss": 0.6202, "step": 368 }, { "epoch": 1.7605381165919283, "grad_norm": 0.09450252582803388, "learning_rate": 7.423908674277579e-08, "loss": 0.6494, "step": 369 }, { "epoch": 1.7653213751868462, "grad_norm": 0.09089301547199258, "learning_rate": 7.127599856343192e-08, "loss": 0.6583, "step": 370 }, { "epoch": 1.7701046337817639, "grad_norm": 0.0917284963739844, "learning_rate": 6.837106753665823e-08, "loss": 0.666, "step": 371 }, { "epoch": 1.7748878923766815, "grad_norm": 0.09493041895710681, "learning_rate": 6.552447557066109e-08, "loss": 0.6464, "step": 372 }, { "epoch": 1.7796711509715994, "grad_norm": 0.08941486424509316, "learning_rate": 6.273640092042575e-08, "loss": 0.6367, "step": 373 }, { "epoch": 1.7844544095665174, "grad_norm": 0.08812104207206783, "learning_rate": 6.000701817655474e-08, "loss": 0.6259, "step": 374 }, { "epoch": 1.789237668161435, "grad_norm": 0.09772722276760373, "learning_rate": 5.733649825433384e-08, "loss": 0.6316, "step": 375 }, { "epoch": 1.7940209267563527, "grad_norm": 0.09550366242600927, "learning_rate": 5.47250083830314e-08, "loss": 0.6764, "step": 376 }, { "epoch": 1.7988041853512704, "grad_norm": 0.09529244067030168, "learning_rate": 5.217271209542384e-08, "loss": 0.6581, "step": 377 }, { "epoch": 1.8035874439461883, "grad_norm": 0.09484969927499808, "learning_rate": 4.967976921755679e-08, "loss": 0.6238, "step": 378 }, { "epoch": 1.8083707025411062, "grad_norm": 0.0922584352432481, "learning_rate": 4.724633585873627e-08, "loss": 0.6417, "step": 379 }, { "epoch": 1.813153961136024, "grad_norm": 0.09178466251978876, "learning_rate": 4.487256440175291e-08, "loss": 0.6563, "step": 380 }, { "epoch": 1.8179372197309416, "grad_norm": 0.0945223759439494, "learning_rate": 4.255860349334006e-08, "loss": 0.6479, "step": 381 }, { "epoch": 1.8227204783258595, "grad_norm": 0.08929357609354767, "learning_rate": 4.030459803486464e-08, "loss": 0.6378, "step": 382 }, { "epoch": 1.8275037369207774, "grad_norm": 0.08950252320624025, "learning_rate": 3.811068917325444e-08, "loss": 0.6128, "step": 383 }, { "epoch": 1.832286995515695, "grad_norm": 0.09959763380863362, "learning_rate": 3.5977014292158495e-08, "loss": 0.6493, "step": 384 }, { "epoch": 1.8370702541106128, "grad_norm": 0.09877239003895597, "learning_rate": 3.3903707003344774e-08, "loss": 0.6453, "step": 385 }, { "epoch": 1.8418535127055307, "grad_norm": 0.09253710326481404, "learning_rate": 3.189089713833226e-08, "loss": 0.6564, "step": 386 }, { "epoch": 1.8466367713004486, "grad_norm": 0.09295026609135121, "learning_rate": 2.9938710740262884e-08, "loss": 0.6286, "step": 387 }, { "epoch": 1.8514200298953662, "grad_norm": 0.0931563883337063, "learning_rate": 2.8047270056005934e-08, "loss": 0.6431, "step": 388 }, { "epoch": 1.856203288490284, "grad_norm": 0.10071203031568553, "learning_rate": 2.6216693528505195e-08, "loss": 0.6419, "step": 389 }, { "epoch": 1.8609865470852018, "grad_norm": 0.0926672982724561, "learning_rate": 2.4447095789360884e-08, "loss": 0.6426, "step": 390 }, { "epoch": 1.8657698056801197, "grad_norm": 0.10839157436286975, "learning_rate": 2.2738587651651487e-08, "loss": 0.6418, "step": 391 }, { "epoch": 1.8705530642750374, "grad_norm": 0.09452841812388145, "learning_rate": 2.109127610299466e-08, "loss": 0.6534, "step": 392 }, { "epoch": 1.875336322869955, "grad_norm": 0.09059164967961951, "learning_rate": 1.950526429884769e-08, "loss": 0.6385, "step": 393 }, { "epoch": 1.8801195814648728, "grad_norm": 0.09541292286319235, "learning_rate": 1.7980651556048e-08, "loss": 0.6533, "step": 394 }, { "epoch": 1.8849028400597907, "grad_norm": 0.09352871341544354, "learning_rate": 1.6517533346593226e-08, "loss": 0.6533, "step": 395 }, { "epoch": 1.8896860986547086, "grad_norm": 0.09830540898676399, "learning_rate": 1.5116001291663462e-08, "loss": 0.686, "step": 396 }, { "epoch": 1.8944693572496263, "grad_norm": 0.09186784336874675, "learning_rate": 1.3776143155883491e-08, "loss": 0.6265, "step": 397 }, { "epoch": 1.899252615844544, "grad_norm": 0.0903805903035563, "learning_rate": 1.2498042841827317e-08, "loss": 0.6444, "step": 398 }, { "epoch": 1.9040358744394619, "grad_norm": 0.09251729842752435, "learning_rate": 1.128178038476324e-08, "loss": 0.643, "step": 399 }, { "epoch": 1.9088191330343798, "grad_norm": 0.08909847951509034, "learning_rate": 1.0127431947643316e-08, "loss": 0.643, "step": 400 }, { "epoch": 1.9136023916292975, "grad_norm": 0.09779029431433935, "learning_rate": 9.035069816332619e-09, "loss": 0.6312, "step": 401 }, { "epoch": 1.9183856502242151, "grad_norm": 0.09602092233428558, "learning_rate": 8.004762395083963e-09, "loss": 0.629, "step": 402 }, { "epoch": 1.923168908819133, "grad_norm": 0.09003448698278545, "learning_rate": 7.036574202253343e-09, "loss": 0.6706, "step": 403 }, { "epoch": 1.927952167414051, "grad_norm": 0.09531787472090986, "learning_rate": 6.130565866260484e-09, "loss": 0.65, "step": 404 }, { "epoch": 1.9327354260089686, "grad_norm": 0.09179251340184746, "learning_rate": 5.286794121791782e-09, "loss": 0.6574, "step": 405 }, { "epoch": 1.9375186846038863, "grad_norm": 0.09493544791044316, "learning_rate": 4.5053118062478025e-09, "loss": 0.6322, "step": 406 }, { "epoch": 1.9423019431988042, "grad_norm": 0.09306468796228341, "learning_rate": 3.786167856434375e-09, "loss": 0.6634, "step": 407 }, { "epoch": 1.9470852017937221, "grad_norm": 0.09006826318963117, "learning_rate": 3.1294073054987102e-09, "loss": 0.6418, "step": 408 }, { "epoch": 1.9518684603886398, "grad_norm": 0.09638156976673805, "learning_rate": 2.5350712801084363e-09, "loss": 0.631, "step": 409 }, { "epoch": 1.9566517189835575, "grad_norm": 0.09585138354438733, "learning_rate": 2.003196997877099e-09, "loss": 0.6405, "step": 410 }, { "epoch": 1.9614349775784752, "grad_norm": 0.0982765637161277, "learning_rate": 1.5338177650332517e-09, "loss": 0.631, "step": 411 }, { "epoch": 1.966218236173393, "grad_norm": 0.0924075594922873, "learning_rate": 1.1269629743346777e-09, "loss": 0.6433, "step": 412 }, { "epoch": 1.971001494768311, "grad_norm": 0.09407079001673903, "learning_rate": 7.826581032279734e-10, "loss": 0.6422, "step": 413 }, { "epoch": 1.9757847533632287, "grad_norm": 0.09103323653600585, "learning_rate": 5.00924712252937e-10, "loss": 0.6645, "step": 414 }, { "epoch": 1.9805680119581464, "grad_norm": 0.09999729799669839, "learning_rate": 2.8178044369286945e-10, "loss": 0.6495, "step": 415 }, { "epoch": 1.9853512705530643, "grad_norm": 0.0958229669734574, "learning_rate": 1.2523902046934763e-10, "loss": 0.6238, "step": 416 }, { "epoch": 1.9901345291479822, "grad_norm": 0.08983387781419207, "learning_rate": 3.131024528302273e-11, "loss": 0.6478, "step": 417 }, { "epoch": 1.9949177877428999, "grad_norm": 0.09621386225221452, "learning_rate": 0.0, "loss": 0.6557, "step": 418 }, { "epoch": 1.9949177877428999, "step": 418, "total_flos": 862605439369216.0, "train_loss": 0.715426175948678, "train_runtime": 10328.0995, "train_samples_per_second": 5.181, "train_steps_per_second": 0.04 } ], "logging_steps": 1, "max_steps": 418, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 862605439369216.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }