{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.992511233150275, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00798801797304044, "grad_norm": 5.8922959558083035, "learning_rate": 1.26984126984127e-06, "loss": 0.9284, "step": 1 }, { "epoch": 0.01597603594608088, "grad_norm": 5.937587864934546, "learning_rate": 2.53968253968254e-06, "loss": 0.9318, "step": 2 }, { "epoch": 0.023964053919121316, "grad_norm": 5.861772382161128, "learning_rate": 3.80952380952381e-06, "loss": 0.9331, "step": 3 }, { "epoch": 0.03195207189216176, "grad_norm": 5.239005126601421, "learning_rate": 5.07936507936508e-06, "loss": 0.9119, "step": 4 }, { "epoch": 0.0399400898652022, "grad_norm": 3.629499849213271, "learning_rate": 6.349206349206349e-06, "loss": 0.8754, "step": 5 }, { "epoch": 0.04792810783824263, "grad_norm": 2.106015204146543, "learning_rate": 7.61904761904762e-06, "loss": 0.836, "step": 6 }, { "epoch": 0.05591612581128307, "grad_norm": 4.356383106407689, "learning_rate": 8.888888888888888e-06, "loss": 0.8711, "step": 7 }, { "epoch": 0.06390414378432352, "grad_norm": 4.748038669119492, "learning_rate": 1.015873015873016e-05, "loss": 0.8657, "step": 8 }, { "epoch": 0.07189216175736396, "grad_norm": 4.437164847463165, "learning_rate": 1.1428571428571429e-05, "loss": 0.8249, "step": 9 }, { "epoch": 0.0798801797304044, "grad_norm": 4.231505558889787, "learning_rate": 1.2698412698412699e-05, "loss": 0.8205, "step": 10 }, { "epoch": 0.08786819770344484, "grad_norm": 2.771780905554085, "learning_rate": 1.3968253968253968e-05, "loss": 0.8071, "step": 11 }, { "epoch": 0.09585621567648527, "grad_norm": 1.7918482116297212, "learning_rate": 1.523809523809524e-05, "loss": 0.7653, "step": 12 }, { "epoch": 0.1038442336495257, "grad_norm": 1.6236141779129738, "learning_rate": 1.6507936507936507e-05, "loss": 0.7437, "step": 13 }, { "epoch": 0.11183225162256615, "grad_norm": 1.2870146428263272, "learning_rate": 1.7777777777777777e-05, "loss": 0.736, "step": 14 }, { "epoch": 0.11982026959560658, "grad_norm": 1.0068702786417012, "learning_rate": 1.904761904761905e-05, "loss": 0.7124, "step": 15 }, { "epoch": 0.12780828756864704, "grad_norm": 1.1636059875738414, "learning_rate": 2.031746031746032e-05, "loss": 0.7004, "step": 16 }, { "epoch": 0.13579630554168748, "grad_norm": 0.8899548891950194, "learning_rate": 2.158730158730159e-05, "loss": 0.6953, "step": 17 }, { "epoch": 0.14378432351472792, "grad_norm": 0.8171634825879731, "learning_rate": 2.2857142857142858e-05, "loss": 0.6899, "step": 18 }, { "epoch": 0.15177234148776836, "grad_norm": 0.8423601505147725, "learning_rate": 2.4126984126984128e-05, "loss": 0.6759, "step": 19 }, { "epoch": 0.1597603594608088, "grad_norm": 0.9144240660639567, "learning_rate": 2.5396825396825397e-05, "loss": 0.6768, "step": 20 }, { "epoch": 0.16774837743384924, "grad_norm": 0.7527042679957461, "learning_rate": 2.6666666666666667e-05, "loss": 0.6664, "step": 21 }, { "epoch": 0.17573639540688968, "grad_norm": 0.9115589252395023, "learning_rate": 2.7936507936507936e-05, "loss": 0.6685, "step": 22 }, { "epoch": 0.18372441337993012, "grad_norm": 0.7794511419641769, "learning_rate": 2.9206349206349206e-05, "loss": 0.6476, "step": 23 }, { "epoch": 0.19171243135297053, "grad_norm": 0.8206145936410231, "learning_rate": 3.047619047619048e-05, "loss": 0.6555, "step": 24 }, { "epoch": 0.19970044932601097, "grad_norm": 0.8660748611689925, "learning_rate": 3.1746031746031745e-05, "loss": 0.6504, "step": 25 }, { "epoch": 0.2076884672990514, "grad_norm": 1.09005656089158, "learning_rate": 3.3015873015873014e-05, "loss": 0.6468, "step": 26 }, { "epoch": 0.21567648527209185, "grad_norm": 1.2233269812335474, "learning_rate": 3.4285714285714284e-05, "loss": 0.6554, "step": 27 }, { "epoch": 0.2236645032451323, "grad_norm": 0.7202441107469458, "learning_rate": 3.555555555555555e-05, "loss": 0.6351, "step": 28 }, { "epoch": 0.23165252121817273, "grad_norm": 1.549610538416556, "learning_rate": 3.682539682539683e-05, "loss": 0.6386, "step": 29 }, { "epoch": 0.23964053919121317, "grad_norm": 0.7964826077261805, "learning_rate": 3.80952380952381e-05, "loss": 0.6282, "step": 30 }, { "epoch": 0.2476285571642536, "grad_norm": 0.6903264777596222, "learning_rate": 3.936507936507937e-05, "loss": 0.6281, "step": 31 }, { "epoch": 0.2556165751372941, "grad_norm": 1.2761326884044875, "learning_rate": 4.063492063492064e-05, "loss": 0.6216, "step": 32 }, { "epoch": 0.2636045931103345, "grad_norm": 1.3286354473207003, "learning_rate": 4.190476190476191e-05, "loss": 0.6196, "step": 33 }, { "epoch": 0.27159261108337496, "grad_norm": 0.6908894226839724, "learning_rate": 4.317460317460318e-05, "loss": 0.6144, "step": 34 }, { "epoch": 0.2795806290564154, "grad_norm": 1.0386450814645398, "learning_rate": 4.444444444444445e-05, "loss": 0.6155, "step": 35 }, { "epoch": 0.28756864702945584, "grad_norm": 0.7231485406568985, "learning_rate": 4.5714285714285716e-05, "loss": 0.6081, "step": 36 }, { "epoch": 0.2955566650024963, "grad_norm": 1.0158040603959178, "learning_rate": 4.698412698412699e-05, "loss": 0.6071, "step": 37 }, { "epoch": 0.3035446829755367, "grad_norm": 1.5638712924845808, "learning_rate": 4.8253968253968255e-05, "loss": 0.6011, "step": 38 }, { "epoch": 0.31153270094857716, "grad_norm": 0.9158856622661424, "learning_rate": 4.952380952380953e-05, "loss": 0.6029, "step": 39 }, { "epoch": 0.3195207189216176, "grad_norm": 1.533932723524169, "learning_rate": 5.0793650793650794e-05, "loss": 0.6007, "step": 40 }, { "epoch": 0.32750873689465804, "grad_norm": 1.1151203871839255, "learning_rate": 5.206349206349207e-05, "loss": 0.614, "step": 41 }, { "epoch": 0.3354967548676985, "grad_norm": 1.7936006261869704, "learning_rate": 5.333333333333333e-05, "loss": 0.5964, "step": 42 }, { "epoch": 0.3434847728407389, "grad_norm": 1.6373220709210505, "learning_rate": 5.460317460317461e-05, "loss": 0.6048, "step": 43 }, { "epoch": 0.35147279081377936, "grad_norm": 1.1820238351172419, "learning_rate": 5.587301587301587e-05, "loss": 0.5983, "step": 44 }, { "epoch": 0.3594608087868198, "grad_norm": 1.0448595195163097, "learning_rate": 5.714285714285715e-05, "loss": 0.6015, "step": 45 }, { "epoch": 0.36744882675986024, "grad_norm": 0.9595564806215681, "learning_rate": 5.841269841269841e-05, "loss": 0.5845, "step": 46 }, { "epoch": 0.3754368447329007, "grad_norm": 1.5962786237575002, "learning_rate": 5.968253968253969e-05, "loss": 0.5995, "step": 47 }, { "epoch": 0.38342486270594106, "grad_norm": 1.5871239780794693, "learning_rate": 6.095238095238096e-05, "loss": 0.5884, "step": 48 }, { "epoch": 0.3914128806789815, "grad_norm": 1.1244276800474557, "learning_rate": 6.222222222222223e-05, "loss": 0.597, "step": 49 }, { "epoch": 0.39940089865202194, "grad_norm": 1.971108394067141, "learning_rate": 6.349206349206349e-05, "loss": 0.5959, "step": 50 }, { "epoch": 0.4073889166250624, "grad_norm": 1.119155483104472, "learning_rate": 6.476190476190477e-05, "loss": 0.595, "step": 51 }, { "epoch": 0.4153769345981028, "grad_norm": 2.3293959233637813, "learning_rate": 6.603174603174603e-05, "loss": 0.5968, "step": 52 }, { "epoch": 0.42336495257114326, "grad_norm": 1.7690872710201135, "learning_rate": 6.730158730158731e-05, "loss": 0.5942, "step": 53 }, { "epoch": 0.4313529705441837, "grad_norm": 1.5082563438406895, "learning_rate": 6.857142857142857e-05, "loss": 0.5929, "step": 54 }, { "epoch": 0.43934098851722414, "grad_norm": 1.705505860185178, "learning_rate": 6.984126984126985e-05, "loss": 0.5886, "step": 55 }, { "epoch": 0.4473290064902646, "grad_norm": 1.311562682930509, "learning_rate": 7.11111111111111e-05, "loss": 0.5942, "step": 56 }, { "epoch": 0.455317024463305, "grad_norm": 1.3702806631104458, "learning_rate": 7.238095238095239e-05, "loss": 0.5918, "step": 57 }, { "epoch": 0.46330504243634546, "grad_norm": 1.3735083834456305, "learning_rate": 7.365079365079366e-05, "loss": 0.5944, "step": 58 }, { "epoch": 0.4712930604093859, "grad_norm": 1.5109411814380815, "learning_rate": 7.492063492063493e-05, "loss": 0.5859, "step": 59 }, { "epoch": 0.47928107838242634, "grad_norm": 1.8414765598754854, "learning_rate": 7.61904761904762e-05, "loss": 0.5932, "step": 60 }, { "epoch": 0.4872690963554668, "grad_norm": 1.1402061244328228, "learning_rate": 7.746031746031747e-05, "loss": 0.5828, "step": 61 }, { "epoch": 0.4952571143285072, "grad_norm": 1.794539731996526, "learning_rate": 7.873015873015874e-05, "loss": 0.5792, "step": 62 }, { "epoch": 0.5032451323015477, "grad_norm": 1.4047554942240879, "learning_rate": 8e-05, "loss": 0.5804, "step": 63 }, { "epoch": 0.5112331502745882, "grad_norm": 1.3931552496353343, "learning_rate": 7.999937503459301e-05, "loss": 0.5775, "step": 64 }, { "epoch": 0.5192211682476285, "grad_norm": 1.1656900196646254, "learning_rate": 7.999750015790111e-05, "loss": 0.5909, "step": 65 }, { "epoch": 0.527209186220669, "grad_norm": 1.1493581998452567, "learning_rate": 7.999437542851095e-05, "loss": 0.5754, "step": 66 }, { "epoch": 0.5351972041937094, "grad_norm": 1.9412467459743252, "learning_rate": 7.999000094406493e-05, "loss": 0.5932, "step": 67 }, { "epoch": 0.5431852221667499, "grad_norm": 1.3410903514703634, "learning_rate": 7.998437684125812e-05, "loss": 0.5849, "step": 68 }, { "epoch": 0.5511732401397903, "grad_norm": 1.1599213167605864, "learning_rate": 7.997750329583402e-05, "loss": 0.5779, "step": 69 }, { "epoch": 0.5591612581128308, "grad_norm": 2.611492770456904, "learning_rate": 7.9969380522579e-05, "loss": 0.5936, "step": 70 }, { "epoch": 0.5671492760858712, "grad_norm": 1.5087257150690652, "learning_rate": 7.996000877531569e-05, "loss": 0.5884, "step": 71 }, { "epoch": 0.5751372940589117, "grad_norm": 2.6141462248634086, "learning_rate": 7.9949388346895e-05, "loss": 0.5951, "step": 72 }, { "epoch": 0.5831253120319521, "grad_norm": 1.9836349293790256, "learning_rate": 7.993751956918693e-05, "loss": 0.5874, "step": 73 }, { "epoch": 0.5911133300049926, "grad_norm": 1.5274699003911547, "learning_rate": 7.992440281307027e-05, "loss": 0.5962, "step": 74 }, { "epoch": 0.5991013479780329, "grad_norm": 1.2158179637702575, "learning_rate": 7.991003848842093e-05, "loss": 0.5801, "step": 75 }, { "epoch": 0.6070893659510734, "grad_norm": 1.2430162793293555, "learning_rate": 7.989442704409925e-05, "loss": 0.5757, "step": 76 }, { "epoch": 0.6150773839241138, "grad_norm": 0.9546052456533828, "learning_rate": 7.987756896793583e-05, "loss": 0.5836, "step": 77 }, { "epoch": 0.6230654018971543, "grad_norm": 1.051061984198158, "learning_rate": 7.985946478671642e-05, "loss": 0.575, "step": 78 }, { "epoch": 0.6310534198701947, "grad_norm": 1.025823090309492, "learning_rate": 7.984011506616534e-05, "loss": 0.5792, "step": 79 }, { "epoch": 0.6390414378432352, "grad_norm": 1.0879892769571216, "learning_rate": 7.981952041092792e-05, "loss": 0.575, "step": 80 }, { "epoch": 0.6470294558162756, "grad_norm": 1.3203984543837413, "learning_rate": 7.979768146455148e-05, "loss": 0.5725, "step": 81 }, { "epoch": 0.6550174737893161, "grad_norm": 0.8793400599633049, "learning_rate": 7.977459890946534e-05, "loss": 0.5643, "step": 82 }, { "epoch": 0.6630054917623565, "grad_norm": 0.9839614386276342, "learning_rate": 7.975027346695943e-05, "loss": 0.5609, "step": 83 }, { "epoch": 0.670993509735397, "grad_norm": 1.087269282291481, "learning_rate": 7.972470589716175e-05, "loss": 0.5706, "step": 84 }, { "epoch": 0.6789815277084373, "grad_norm": 0.8949957037226873, "learning_rate": 7.969789699901462e-05, "loss": 0.5718, "step": 85 }, { "epoch": 0.6869695456814778, "grad_norm": 0.5685210804043624, "learning_rate": 7.966984761024974e-05, "loss": 0.5651, "step": 86 }, { "epoch": 0.6949575636545182, "grad_norm": 0.7365421304468946, "learning_rate": 7.964055860736199e-05, "loss": 0.5625, "step": 87 }, { "epoch": 0.7029455816275587, "grad_norm": 0.6519155688073771, "learning_rate": 7.961003090558208e-05, "loss": 0.5602, "step": 88 }, { "epoch": 0.7109335996005991, "grad_norm": 0.47928031192412984, "learning_rate": 7.957826545884786e-05, "loss": 0.5549, "step": 89 }, { "epoch": 0.7189216175736396, "grad_norm": 0.7685907979864348, "learning_rate": 7.95452632597746e-05, "loss": 0.5558, "step": 90 }, { "epoch": 0.72690963554668, "grad_norm": 0.7342212353643156, "learning_rate": 7.951102533962393e-05, "loss": 0.5539, "step": 91 }, { "epoch": 0.7348976535197205, "grad_norm": 0.526378766562186, "learning_rate": 7.947555276827166e-05, "loss": 0.5604, "step": 92 }, { "epoch": 0.7428856714927609, "grad_norm": 0.763635167097638, "learning_rate": 7.94388466541743e-05, "loss": 0.5604, "step": 93 }, { "epoch": 0.7508736894658014, "grad_norm": 1.1133910887739713, "learning_rate": 7.940090814433437e-05, "loss": 0.5502, "step": 94 }, { "epoch": 0.7588617074388417, "grad_norm": 1.350450301452925, "learning_rate": 7.936173842426473e-05, "loss": 0.5607, "step": 95 }, { "epoch": 0.7668497254118821, "grad_norm": 0.47766209706502316, "learning_rate": 7.932133871795136e-05, "loss": 0.5584, "step": 96 }, { "epoch": 0.7748377433849226, "grad_norm": 0.8964819495426043, "learning_rate": 7.927971028781522e-05, "loss": 0.5533, "step": 97 }, { "epoch": 0.782825761357963, "grad_norm": 1.2844756885032345, "learning_rate": 7.923685443467275e-05, "loss": 0.5439, "step": 98 }, { "epoch": 0.7908137793310035, "grad_norm": 0.7076588316414215, "learning_rate": 7.919277249769522e-05, "loss": 0.5516, "step": 99 }, { "epoch": 0.7988017973040439, "grad_norm": 0.9548748366290979, "learning_rate": 7.914746585436692e-05, "loss": 0.5622, "step": 100 }, { "epoch": 0.8067898152770844, "grad_norm": 1.0033397557294186, "learning_rate": 7.91009359204421e-05, "loss": 0.55, "step": 101 }, { "epoch": 0.8147778332501248, "grad_norm": 0.8058153670114928, "learning_rate": 7.90531841499007e-05, "loss": 0.5472, "step": 102 }, { "epoch": 0.8227658512231653, "grad_norm": 0.7522684804995226, "learning_rate": 7.900421203490295e-05, "loss": 0.5475, "step": 103 }, { "epoch": 0.8307538691962056, "grad_norm": 0.8260701286176672, "learning_rate": 7.895402110574277e-05, "loss": 0.546, "step": 104 }, { "epoch": 0.8387418871692461, "grad_norm": 0.9294034148971123, "learning_rate": 7.890261293079985e-05, "loss": 0.5486, "step": 105 }, { "epoch": 0.8467299051422865, "grad_norm": 0.6210859012554373, "learning_rate": 7.884998911649077e-05, "loss": 0.5565, "step": 106 }, { "epoch": 0.854717923115327, "grad_norm": 0.6446646964930844, "learning_rate": 7.879615130721868e-05, "loss": 0.539, "step": 107 }, { "epoch": 0.8627059410883674, "grad_norm": 0.8996911090197094, "learning_rate": 7.8741101185322e-05, "loss": 0.5422, "step": 108 }, { "epoch": 0.8706939590614079, "grad_norm": 0.9338087827721026, "learning_rate": 7.868484047102183e-05, "loss": 0.5535, "step": 109 }, { "epoch": 0.8786819770344483, "grad_norm": 1.1026810388479344, "learning_rate": 7.862737092236818e-05, "loss": 0.5453, "step": 110 }, { "epoch": 0.8866699950074888, "grad_norm": 0.9663842431402072, "learning_rate": 7.856869433518506e-05, "loss": 0.5452, "step": 111 }, { "epoch": 0.8946580129805292, "grad_norm": 1.0210253102387117, "learning_rate": 7.850881254301432e-05, "loss": 0.5568, "step": 112 }, { "epoch": 0.9026460309535697, "grad_norm": 0.8477567856764551, "learning_rate": 7.844772741705835e-05, "loss": 0.545, "step": 113 }, { "epoch": 0.91063404892661, "grad_norm": 0.5613356829580358, "learning_rate": 7.838544086612174e-05, "loss": 0.5438, "step": 114 }, { "epoch": 0.9186220668996505, "grad_norm": 0.6248181380373118, "learning_rate": 7.832195483655144e-05, "loss": 0.5366, "step": 115 }, { "epoch": 0.9266100848726909, "grad_norm": 0.8519302343250585, "learning_rate": 7.825727131217609e-05, "loss": 0.5401, "step": 116 }, { "epoch": 0.9345981028457314, "grad_norm": 0.45919068712258837, "learning_rate": 7.81913923142439e-05, "loss": 0.5518, "step": 117 }, { "epoch": 0.9425861208187718, "grad_norm": 0.5491942320357649, "learning_rate": 7.812431990135965e-05, "loss": 0.545, "step": 118 }, { "epoch": 0.9505741387918123, "grad_norm": 0.7204970814629463, "learning_rate": 7.805605616942023e-05, "loss": 0.5502, "step": 119 }, { "epoch": 0.9585621567648527, "grad_norm": 0.624445399157028, "learning_rate": 7.798660325154917e-05, "loss": 0.5465, "step": 120 }, { "epoch": 0.9665501747378932, "grad_norm": 0.43723265221924457, "learning_rate": 7.791596331803003e-05, "loss": 0.5387, "step": 121 }, { "epoch": 0.9745381927109336, "grad_norm": 0.40296635700807665, "learning_rate": 7.784413857623856e-05, "loss": 0.5384, "step": 122 }, { "epoch": 0.982526210683974, "grad_norm": 0.4355607269982166, "learning_rate": 7.77711312705737e-05, "loss": 0.5391, "step": 123 }, { "epoch": 0.9905142286570144, "grad_norm": 0.37094543758250353, "learning_rate": 7.769694368238746e-05, "loss": 0.534, "step": 124 }, { "epoch": 0.9985022466300549, "grad_norm": 0.36678250566452825, "learning_rate": 7.762157812991369e-05, "loss": 0.535, "step": 125 }, { "epoch": 1.0064902646030953, "grad_norm": 0.7573100076200363, "learning_rate": 7.754503696819553e-05, "loss": 0.955, "step": 126 }, { "epoch": 1.0144782825761358, "grad_norm": 1.0167041671110564, "learning_rate": 7.74673225890119e-05, "loss": 0.5181, "step": 127 }, { "epoch": 1.0224663005491763, "grad_norm": 1.0181250181107355, "learning_rate": 7.738843742080269e-05, "loss": 0.5237, "step": 128 }, { "epoch": 1.0304543185222166, "grad_norm": 1.1080605772500498, "learning_rate": 7.730838392859303e-05, "loss": 0.5312, "step": 129 }, { "epoch": 1.038442336495257, "grad_norm": 0.7638562997222614, "learning_rate": 7.722716461391603e-05, "loss": 0.5338, "step": 130 }, { "epoch": 1.0464303544682976, "grad_norm": 0.8619620628236141, "learning_rate": 7.714478201473483e-05, "loss": 0.5249, "step": 131 }, { "epoch": 1.054418372441338, "grad_norm": 1.1654304124994774, "learning_rate": 7.706123870536315e-05, "loss": 0.5208, "step": 132 }, { "epoch": 1.0624063904143783, "grad_norm": 0.5330720342927018, "learning_rate": 7.697653729638489e-05, "loss": 0.5184, "step": 133 }, { "epoch": 1.0703944083874188, "grad_norm": 1.020325885284434, "learning_rate": 7.689068043457261e-05, "loss": 0.5128, "step": 134 }, { "epoch": 1.0783824263604593, "grad_norm": 0.6983781848617573, "learning_rate": 7.68036708028047e-05, "loss": 0.518, "step": 135 }, { "epoch": 1.0863704443334998, "grad_norm": 0.6057523169656847, "learning_rate": 7.671551111998169e-05, "loss": 0.5196, "step": 136 }, { "epoch": 1.09435846230654, "grad_norm": 0.5211411106516707, "learning_rate": 7.662620414094117e-05, "loss": 0.5199, "step": 137 }, { "epoch": 1.1023464802795806, "grad_norm": 0.5166573997289899, "learning_rate": 7.653575265637177e-05, "loss": 0.5154, "step": 138 }, { "epoch": 1.110334498252621, "grad_norm": 0.4470708726865469, "learning_rate": 7.644415949272591e-05, "loss": 0.5098, "step": 139 }, { "epoch": 1.1183225162256616, "grad_norm": 0.5357218094920962, "learning_rate": 7.635142751213156e-05, "loss": 0.5196, "step": 140 }, { "epoch": 1.1263105341987019, "grad_norm": 0.48982578714373154, "learning_rate": 7.62575596123027e-05, "loss": 0.5112, "step": 141 }, { "epoch": 1.1342985521717424, "grad_norm": 0.3953911478616972, "learning_rate": 7.616255872644888e-05, "loss": 0.5022, "step": 142 }, { "epoch": 1.1422865701447829, "grad_norm": 0.46599322968658796, "learning_rate": 7.60664278231834e-05, "loss": 0.5067, "step": 143 }, { "epoch": 1.1502745881178233, "grad_norm": 0.47850160868681485, "learning_rate": 7.596916990643077e-05, "loss": 0.5028, "step": 144 }, { "epoch": 1.1582626060908636, "grad_norm": 0.42978953466708475, "learning_rate": 7.587078801533262e-05, "loss": 0.5015, "step": 145 }, { "epoch": 1.1662506240639041, "grad_norm": 0.3540055333518291, "learning_rate": 7.577128522415292e-05, "loss": 0.5076, "step": 146 }, { "epoch": 1.1742386420369446, "grad_norm": 0.3351153000601574, "learning_rate": 7.567066464218178e-05, "loss": 0.4989, "step": 147 }, { "epoch": 1.182226660009985, "grad_norm": 0.3005800301999229, "learning_rate": 7.556892941363833e-05, "loss": 0.4967, "step": 148 }, { "epoch": 1.1902146779830254, "grad_norm": 0.3563502792477842, "learning_rate": 7.546608271757251e-05, "loss": 0.5107, "step": 149 }, { "epoch": 1.1982026959560659, "grad_norm": 0.38770493909399334, "learning_rate": 7.536212776776567e-05, "loss": 0.5104, "step": 150 }, { "epoch": 1.2061907139291064, "grad_norm": 0.3767151991317555, "learning_rate": 7.525706781263023e-05, "loss": 0.5102, "step": 151 }, { "epoch": 1.2141787319021469, "grad_norm": 0.4105950587040687, "learning_rate": 7.515090613510801e-05, "loss": 0.4986, "step": 152 }, { "epoch": 1.2221667498751871, "grad_norm": 0.42936249879191585, "learning_rate": 7.504364605256784e-05, "loss": 0.5035, "step": 153 }, { "epoch": 1.2301547678482276, "grad_norm": 0.4346225237944244, "learning_rate": 7.493529091670181e-05, "loss": 0.4988, "step": 154 }, { "epoch": 1.2381427858212681, "grad_norm": 0.4396844168311194, "learning_rate": 7.482584411342043e-05, "loss": 0.5077, "step": 155 }, { "epoch": 1.2461308037943086, "grad_norm": 0.431746092302867, "learning_rate": 7.471530906274704e-05, "loss": 0.4983, "step": 156 }, { "epoch": 1.254118821767349, "grad_norm": 0.5889910567664702, "learning_rate": 7.460368921871077e-05, "loss": 0.5122, "step": 157 }, { "epoch": 1.2621068397403894, "grad_norm": 0.6830133790630488, "learning_rate": 7.44909880692387e-05, "loss": 0.5073, "step": 158 }, { "epoch": 1.27009485771343, "grad_norm": 0.6354350767066138, "learning_rate": 7.437720913604681e-05, "loss": 0.5117, "step": 159 }, { "epoch": 1.2780828756864704, "grad_norm": 0.4963286720098572, "learning_rate": 7.426235597452995e-05, "loss": 0.4993, "step": 160 }, { "epoch": 1.2860708936595107, "grad_norm": 0.418831779419711, "learning_rate": 7.41464321736508e-05, "loss": 0.5021, "step": 161 }, { "epoch": 1.2940589116325512, "grad_norm": 0.4787432347277129, "learning_rate": 7.402944135582758e-05, "loss": 0.502, "step": 162 }, { "epoch": 1.3020469296055917, "grad_norm": 0.50062737801301, "learning_rate": 7.391138717682103e-05, "loss": 0.4937, "step": 163 }, { "epoch": 1.310034947578632, "grad_norm": 0.39201954318713855, "learning_rate": 7.379227332562005e-05, "loss": 0.5003, "step": 164 }, { "epoch": 1.3180229655516724, "grad_norm": 0.31007216413114186, "learning_rate": 7.367210352432645e-05, "loss": 0.502, "step": 165 }, { "epoch": 1.326010983524713, "grad_norm": 0.42076785863557453, "learning_rate": 7.355088152803866e-05, "loss": 0.501, "step": 166 }, { "epoch": 1.3339990014977534, "grad_norm": 0.4745296323176778, "learning_rate": 7.342861112473442e-05, "loss": 0.4979, "step": 167 }, { "epoch": 1.341987019470794, "grad_norm": 0.4199718916823893, "learning_rate": 7.330529613515232e-05, "loss": 0.4984, "step": 168 }, { "epoch": 1.3499750374438342, "grad_norm": 0.3814943625708202, "learning_rate": 7.318094041267253e-05, "loss": 0.4946, "step": 169 }, { "epoch": 1.3579630554168747, "grad_norm": 0.3584958844621985, "learning_rate": 7.305554784319625e-05, "loss": 0.4945, "step": 170 }, { "epoch": 1.3659510733899152, "grad_norm": 0.3258027404514737, "learning_rate": 7.29291223450244e-05, "loss": 0.4936, "step": 171 }, { "epoch": 1.3739390913629554, "grad_norm": 0.3304823682468289, "learning_rate": 7.280166786873514e-05, "loss": 0.4957, "step": 172 }, { "epoch": 1.381927109335996, "grad_norm": 0.285695277322611, "learning_rate": 7.267318839706038e-05, "loss": 0.5004, "step": 173 }, { "epoch": 1.3899151273090364, "grad_norm": 0.360711874339804, "learning_rate": 7.25436879447614e-05, "loss": 0.4946, "step": 174 }, { "epoch": 1.397903145282077, "grad_norm": 0.4690067762041838, "learning_rate": 7.241317055850336e-05, "loss": 0.4933, "step": 175 }, { "epoch": 1.4058911632551174, "grad_norm": 0.48954294072750454, "learning_rate": 7.228164031672879e-05, "loss": 0.4958, "step": 176 }, { "epoch": 1.4138791812281577, "grad_norm": 0.5871985410108085, "learning_rate": 7.214910132953027e-05, "loss": 0.495, "step": 177 }, { "epoch": 1.4218671992011982, "grad_norm": 0.720040324723498, "learning_rate": 7.201555773852189e-05, "loss": 0.4989, "step": 178 }, { "epoch": 1.4298552171742387, "grad_norm": 0.8159522745469254, "learning_rate": 7.188101371670991e-05, "loss": 0.5006, "step": 179 }, { "epoch": 1.437843235147279, "grad_norm": 0.8363865485901019, "learning_rate": 7.174547346836228e-05, "loss": 0.5069, "step": 180 }, { "epoch": 1.4458312531203195, "grad_norm": 0.7345453619769279, "learning_rate": 7.160894122887733e-05, "loss": 0.4927, "step": 181 }, { "epoch": 1.45381927109336, "grad_norm": 0.589527692471703, "learning_rate": 7.147142126465138e-05, "loss": 0.4955, "step": 182 }, { "epoch": 1.4618072890664005, "grad_norm": 0.4423587194525166, "learning_rate": 7.133291787294547e-05, "loss": 0.5094, "step": 183 }, { "epoch": 1.469795307039441, "grad_norm": 0.40340279142628255, "learning_rate": 7.119343538175102e-05, "loss": 0.4967, "step": 184 }, { "epoch": 1.4777833250124812, "grad_norm": 0.4982976531352129, "learning_rate": 7.10529781496546e-05, "loss": 0.4951, "step": 185 }, { "epoch": 1.4857713429855217, "grad_norm": 0.45741686448136076, "learning_rate": 7.09115505657018e-05, "loss": 0.4839, "step": 186 }, { "epoch": 1.4937593609585622, "grad_norm": 0.32134532426731377, "learning_rate": 7.076915704926e-05, "loss": 0.4947, "step": 187 }, { "epoch": 1.5017473789316025, "grad_norm": 0.2578730665869774, "learning_rate": 7.062580204988028e-05, "loss": 0.4885, "step": 188 }, { "epoch": 1.509735396904643, "grad_norm": 0.3424320920246288, "learning_rate": 7.048149004715843e-05, "loss": 0.4968, "step": 189 }, { "epoch": 1.5177234148776835, "grad_norm": 0.40215949965851383, "learning_rate": 7.033622555059491e-05, "loss": 0.4964, "step": 190 }, { "epoch": 1.525711432850724, "grad_norm": 0.3989533402101727, "learning_rate": 7.0190013099454e-05, "loss": 0.4993, "step": 191 }, { "epoch": 1.5336994508237645, "grad_norm": 0.2863829598271095, "learning_rate": 7.004285726262188e-05, "loss": 0.5058, "step": 192 }, { "epoch": 1.5416874687968047, "grad_norm": 0.24052248409440963, "learning_rate": 6.989476263846396e-05, "loss": 0.4861, "step": 193 }, { "epoch": 1.5496754867698452, "grad_norm": 0.4598329169035325, "learning_rate": 6.974573385468105e-05, "loss": 0.5007, "step": 194 }, { "epoch": 1.5576635047428855, "grad_norm": 0.6079055307812807, "learning_rate": 6.95957755681649e-05, "loss": 0.5008, "step": 195 }, { "epoch": 1.565651522715926, "grad_norm": 0.5580132412627938, "learning_rate": 6.944489246485257e-05, "loss": 0.4962, "step": 196 }, { "epoch": 1.5736395406889665, "grad_norm": 0.42994805656529084, "learning_rate": 6.929308925958009e-05, "loss": 0.5076, "step": 197 }, { "epoch": 1.581627558662007, "grad_norm": 0.3842832421038355, "learning_rate": 6.914037069593504e-05, "loss": 0.4924, "step": 198 }, { "epoch": 1.5896155766350475, "grad_norm": 0.32699055905703517, "learning_rate": 6.898674154610839e-05, "loss": 0.4921, "step": 199 }, { "epoch": 1.597603594608088, "grad_norm": 0.42528398283904756, "learning_rate": 6.883220661074534e-05, "loss": 0.4928, "step": 200 }, { "epoch": 1.6055916125811283, "grad_norm": 0.6183497108648602, "learning_rate": 6.867677071879535e-05, "loss": 0.4993, "step": 201 }, { "epoch": 1.6135796305541688, "grad_norm": 0.7584925576329896, "learning_rate": 6.852043872736116e-05, "loss": 0.4846, "step": 202 }, { "epoch": 1.621567648527209, "grad_norm": 0.6243564361060799, "learning_rate": 6.836321552154714e-05, "loss": 0.5007, "step": 203 }, { "epoch": 1.6295556665002495, "grad_norm": 0.3651441665883393, "learning_rate": 6.820510601430649e-05, "loss": 0.4936, "step": 204 }, { "epoch": 1.63754368447329, "grad_norm": 0.23834669483267124, "learning_rate": 6.804611514628788e-05, "loss": 0.4857, "step": 205 }, { "epoch": 1.6455317024463305, "grad_norm": 0.3073254289591667, "learning_rate": 6.78862478856809e-05, "loss": 0.4974, "step": 206 }, { "epoch": 1.653519720419371, "grad_norm": 0.3183758714531585, "learning_rate": 6.772550922806096e-05, "loss": 0.4915, "step": 207 }, { "epoch": 1.6615077383924115, "grad_norm": 0.2710320114390746, "learning_rate": 6.756390419623307e-05, "loss": 0.4901, "step": 208 }, { "epoch": 1.6694957563654518, "grad_norm": 0.27532630096114225, "learning_rate": 6.740143784007495e-05, "loss": 0.4885, "step": 209 }, { "epoch": 1.6774837743384923, "grad_norm": 0.24949516998489749, "learning_rate": 6.723811523637923e-05, "loss": 0.4948, "step": 210 }, { "epoch": 1.6854717923115325, "grad_norm": 0.27385769367337703, "learning_rate": 6.707394148869479e-05, "loss": 0.4963, "step": 211 }, { "epoch": 1.693459810284573, "grad_norm": 0.3041551075828834, "learning_rate": 6.690892172716726e-05, "loss": 0.486, "step": 212 }, { "epoch": 1.7014478282576135, "grad_norm": 0.3555632959677351, "learning_rate": 6.674306110837881e-05, "loss": 0.499, "step": 213 }, { "epoch": 1.709435846230654, "grad_norm": 0.3329437137508577, "learning_rate": 6.657636481518683e-05, "loss": 0.4949, "step": 214 }, { "epoch": 1.7174238642036945, "grad_norm": 0.3417126321251888, "learning_rate": 6.640883805656221e-05, "loss": 0.4913, "step": 215 }, { "epoch": 1.725411882176735, "grad_norm": 0.3989241732557222, "learning_rate": 6.624048606742636e-05, "loss": 0.4911, "step": 216 }, { "epoch": 1.7333999001497753, "grad_norm": 0.45014562286637283, "learning_rate": 6.607131410848777e-05, "loss": 0.4932, "step": 217 }, { "epoch": 1.7413879181228158, "grad_norm": 0.4927365755110579, "learning_rate": 6.590132746607755e-05, "loss": 0.4929, "step": 218 }, { "epoch": 1.749375936095856, "grad_norm": 0.5486106005274718, "learning_rate": 6.573053145198422e-05, "loss": 0.4924, "step": 219 }, { "epoch": 1.7573639540688966, "grad_norm": 0.5493013804791822, "learning_rate": 6.555893140328787e-05, "loss": 0.5029, "step": 220 }, { "epoch": 1.765351972041937, "grad_norm": 0.4921038998096511, "learning_rate": 6.538653268219316e-05, "loss": 0.501, "step": 221 }, { "epoch": 1.7733399900149776, "grad_norm": 0.36708379922405937, "learning_rate": 6.521334067586194e-05, "loss": 0.4912, "step": 222 }, { "epoch": 1.781328007988018, "grad_norm": 0.2934447036565008, "learning_rate": 6.503936079624486e-05, "loss": 0.4924, "step": 223 }, { "epoch": 1.7893160259610585, "grad_norm": 0.41971512428606667, "learning_rate": 6.486459847991226e-05, "loss": 0.4867, "step": 224 }, { "epoch": 1.7973040439340988, "grad_norm": 0.38954075869198324, "learning_rate": 6.46890591878842e-05, "loss": 0.4833, "step": 225 }, { "epoch": 1.8052920619071393, "grad_norm": 0.34504882506932716, "learning_rate": 6.451274840545995e-05, "loss": 0.4952, "step": 226 }, { "epoch": 1.8132800798801796, "grad_norm": 0.3115751552302506, "learning_rate": 6.433567164204652e-05, "loss": 0.4838, "step": 227 }, { "epoch": 1.82126809785322, "grad_norm": 0.3412485251072806, "learning_rate": 6.415783443098645e-05, "loss": 0.4855, "step": 228 }, { "epoch": 1.8292561158262606, "grad_norm": 0.4108218843875664, "learning_rate": 6.397924232938504e-05, "loss": 0.4911, "step": 229 }, { "epoch": 1.837244133799301, "grad_norm": 0.348838980704177, "learning_rate": 6.379990091793653e-05, "loss": 0.4924, "step": 230 }, { "epoch": 1.8452321517723416, "grad_norm": 0.2727569106903297, "learning_rate": 6.361981580074983e-05, "loss": 0.4875, "step": 231 }, { "epoch": 1.853220169745382, "grad_norm": 0.31966296310063425, "learning_rate": 6.343899260517339e-05, "loss": 0.4929, "step": 232 }, { "epoch": 1.8612081877184223, "grad_norm": 0.2973479822646696, "learning_rate": 6.325743698161927e-05, "loss": 0.4929, "step": 233 }, { "epoch": 1.8691962056914628, "grad_norm": 0.34272092476530364, "learning_rate": 6.307515460338672e-05, "loss": 0.4896, "step": 234 }, { "epoch": 1.877184223664503, "grad_norm": 0.3581061926529654, "learning_rate": 6.289215116648477e-05, "loss": 0.486, "step": 235 }, { "epoch": 1.8851722416375436, "grad_norm": 0.2528403776001991, "learning_rate": 6.270843238945426e-05, "loss": 0.4941, "step": 236 }, { "epoch": 1.893160259610584, "grad_norm": 0.2684767914087712, "learning_rate": 6.252400401318924e-05, "loss": 0.495, "step": 237 }, { "epoch": 1.9011482775836246, "grad_norm": 0.3089206948515233, "learning_rate": 6.233887180075744e-05, "loss": 0.4952, "step": 238 }, { "epoch": 1.909136295556665, "grad_norm": 0.30351254889018653, "learning_rate": 6.21530415372203e-05, "loss": 0.4846, "step": 239 }, { "epoch": 1.9171243135297056, "grad_norm": 0.4047998399516971, "learning_rate": 6.196651902945213e-05, "loss": 0.4961, "step": 240 }, { "epoch": 1.9251123315027459, "grad_norm": 0.34718079097807986, "learning_rate": 6.17793101059587e-05, "loss": 0.4784, "step": 241 }, { "epoch": 1.9331003494757864, "grad_norm": 0.23676859947641374, "learning_rate": 6.159142061669504e-05, "loss": 0.4816, "step": 242 }, { "epoch": 1.9410883674488266, "grad_norm": 0.3083982484226228, "learning_rate": 6.14028564328827e-05, "loss": 0.4846, "step": 243 }, { "epoch": 1.9490763854218671, "grad_norm": 0.23280924719224474, "learning_rate": 6.12136234468263e-05, "loss": 0.4901, "step": 244 }, { "epoch": 1.9570644033949076, "grad_norm": 0.23217318367899584, "learning_rate": 6.1023727571729334e-05, "loss": 0.4922, "step": 245 }, { "epoch": 1.965052421367948, "grad_norm": 0.3110861621844553, "learning_rate": 6.083317474150943e-05, "loss": 0.4897, "step": 246 }, { "epoch": 1.9730404393409886, "grad_norm": 0.2740981225422537, "learning_rate": 6.0641970910612966e-05, "loss": 0.4884, "step": 247 }, { "epoch": 1.981028457314029, "grad_norm": 0.30045631025591646, "learning_rate": 6.045012205382894e-05, "loss": 0.4842, "step": 248 }, { "epoch": 1.9890164752870694, "grad_norm": 0.3426504942977091, "learning_rate": 6.025763416610229e-05, "loss": 0.4805, "step": 249 }, { "epoch": 1.9970044932601099, "grad_norm": 0.2696833408525596, "learning_rate": 6.006451326234656e-05, "loss": 0.4955, "step": 250 }, { "epoch": 2.00499251123315, "grad_norm": 0.5162311215778072, "learning_rate": 5.987076537725598e-05, "loss": 0.8356, "step": 251 }, { "epoch": 2.0129805292061906, "grad_norm": 0.8755278174646857, "learning_rate": 5.9676396565116814e-05, "loss": 0.4597, "step": 252 }, { "epoch": 2.020968547179231, "grad_norm": 1.2654521868820567, "learning_rate": 5.9481412899618286e-05, "loss": 0.4832, "step": 253 }, { "epoch": 2.0289565651522716, "grad_norm": 0.7005128945439788, "learning_rate": 5.9285820473662676e-05, "loss": 0.4576, "step": 254 }, { "epoch": 2.036944583125312, "grad_norm": 0.8900852330925937, "learning_rate": 5.9089625399174975e-05, "loss": 0.4677, "step": 255 }, { "epoch": 2.0449326010983526, "grad_norm": 0.9295293387128268, "learning_rate": 5.8892833806911934e-05, "loss": 0.4581, "step": 256 }, { "epoch": 2.052920619071393, "grad_norm": 0.7632251727706844, "learning_rate": 5.869545184627041e-05, "loss": 0.4564, "step": 257 }, { "epoch": 2.060908637044433, "grad_norm": 0.606887179521497, "learning_rate": 5.849748568509529e-05, "loss": 0.4446, "step": 258 }, { "epoch": 2.0688966550174737, "grad_norm": 0.7617777810480713, "learning_rate": 5.829894150948668e-05, "loss": 0.4501, "step": 259 }, { "epoch": 2.076884672990514, "grad_norm": 0.6040763884991026, "learning_rate": 5.8099825523606675e-05, "loss": 0.4468, "step": 260 }, { "epoch": 2.0848726909635547, "grad_norm": 0.6051469481172999, "learning_rate": 5.790014394948542e-05, "loss": 0.4543, "step": 261 }, { "epoch": 2.092860708936595, "grad_norm": 0.478413783344682, "learning_rate": 5.769990302682672e-05, "loss": 0.4506, "step": 262 }, { "epoch": 2.1008487269096356, "grad_norm": 0.562558957244333, "learning_rate": 5.749910901281309e-05, "loss": 0.453, "step": 263 }, { "epoch": 2.108836744882676, "grad_norm": 0.4282466955885263, "learning_rate": 5.729776818191014e-05, "loss": 0.4545, "step": 264 }, { "epoch": 2.1168247628557166, "grad_norm": 0.5285703751553213, "learning_rate": 5.709588682567059e-05, "loss": 0.4479, "step": 265 }, { "epoch": 2.1248127808287567, "grad_norm": 0.40043659559155015, "learning_rate": 5.689347125253765e-05, "loss": 0.4442, "step": 266 }, { "epoch": 2.132800798801797, "grad_norm": 0.45748239783102446, "learning_rate": 5.6690527787647856e-05, "loss": 0.4507, "step": 267 }, { "epoch": 2.1407888167748377, "grad_norm": 0.4448537769428446, "learning_rate": 5.6487062772633455e-05, "loss": 0.4518, "step": 268 }, { "epoch": 2.148776834747878, "grad_norm": 0.3496452875829841, "learning_rate": 5.628308256542428e-05, "loss": 0.4511, "step": 269 }, { "epoch": 2.1567648527209187, "grad_norm": 0.36851827820489447, "learning_rate": 5.607859354004897e-05, "loss": 0.4475, "step": 270 }, { "epoch": 2.164752870693959, "grad_norm": 0.3581014245926748, "learning_rate": 5.5873602086435876e-05, "loss": 0.4559, "step": 271 }, { "epoch": 2.1727408886669997, "grad_norm": 0.3124251429586786, "learning_rate": 5.566811461021335e-05, "loss": 0.4507, "step": 272 }, { "epoch": 2.1807289066400397, "grad_norm": 0.363939895859037, "learning_rate": 5.5462137532509624e-05, "loss": 0.4488, "step": 273 }, { "epoch": 2.18871692461308, "grad_norm": 0.26872286843640025, "learning_rate": 5.5255677289752086e-05, "loss": 0.445, "step": 274 }, { "epoch": 2.1967049425861207, "grad_norm": 0.31188711856580686, "learning_rate": 5.504874033346623e-05, "loss": 0.4518, "step": 275 }, { "epoch": 2.204692960559161, "grad_norm": 0.27440306176835016, "learning_rate": 5.4841333130074015e-05, "loss": 0.4398, "step": 276 }, { "epoch": 2.2126809785322017, "grad_norm": 0.2443244556857597, "learning_rate": 5.4633462160691793e-05, "loss": 0.4496, "step": 277 }, { "epoch": 2.220668996505242, "grad_norm": 0.3469310336287689, "learning_rate": 5.442513392092783e-05, "loss": 0.4434, "step": 278 }, { "epoch": 2.2286570144782827, "grad_norm": 0.2103072041810048, "learning_rate": 5.4216354920679256e-05, "loss": 0.4536, "step": 279 }, { "epoch": 2.236645032451323, "grad_norm": 0.302897592656899, "learning_rate": 5.400713168392874e-05, "loss": 0.4469, "step": 280 }, { "epoch": 2.2446330504243637, "grad_norm": 0.26907620566043555, "learning_rate": 5.379747074854054e-05, "loss": 0.4429, "step": 281 }, { "epoch": 2.2526210683974037, "grad_norm": 0.242767529010096, "learning_rate": 5.358737866605624e-05, "loss": 0.4526, "step": 282 }, { "epoch": 2.260609086370444, "grad_norm": 0.24059729283753153, "learning_rate": 5.337686200149004e-05, "loss": 0.4496, "step": 283 }, { "epoch": 2.2685971043434847, "grad_norm": 0.16892626513698825, "learning_rate": 5.316592733312359e-05, "loss": 0.4444, "step": 284 }, { "epoch": 2.276585122316525, "grad_norm": 0.2428921866442825, "learning_rate": 5.2954581252300416e-05, "loss": 0.4475, "step": 285 }, { "epoch": 2.2845731402895657, "grad_norm": 0.24079102043869002, "learning_rate": 5.2742830363220014e-05, "loss": 0.4443, "step": 286 }, { "epoch": 2.292561158262606, "grad_norm": 0.1691131754858366, "learning_rate": 5.25306812827314e-05, "loss": 0.4423, "step": 287 }, { "epoch": 2.3005491762356467, "grad_norm": 0.26332279757319926, "learning_rate": 5.231814064012639e-05, "loss": 0.4482, "step": 288 }, { "epoch": 2.3085371942086867, "grad_norm": 0.30874064763423864, "learning_rate": 5.210521507693245e-05, "loss": 0.4439, "step": 289 }, { "epoch": 2.3165252121817272, "grad_norm": 0.22311973873687838, "learning_rate": 5.189191124670514e-05, "loss": 0.4402, "step": 290 }, { "epoch": 2.3245132301547677, "grad_norm": 0.1922497454060213, "learning_rate": 5.167823581482022e-05, "loss": 0.4409, "step": 291 }, { "epoch": 2.3325012481278082, "grad_norm": 0.16710905147214794, "learning_rate": 5.146419545826535e-05, "loss": 0.4471, "step": 292 }, { "epoch": 2.3404892661008487, "grad_norm": 0.18694588888380953, "learning_rate": 5.124979686543145e-05, "loss": 0.4514, "step": 293 }, { "epoch": 2.3484772840738892, "grad_norm": 0.19041976798949875, "learning_rate": 5.103504673590372e-05, "loss": 0.4385, "step": 294 }, { "epoch": 2.3564653020469297, "grad_norm": 0.20694395753288766, "learning_rate": 5.081995178025228e-05, "loss": 0.4486, "step": 295 }, { "epoch": 2.36445332001997, "grad_norm": 0.16778281147710722, "learning_rate": 5.060451871982242e-05, "loss": 0.455, "step": 296 }, { "epoch": 2.3724413379930107, "grad_norm": 0.17343940615670786, "learning_rate": 5.038875428652468e-05, "loss": 0.447, "step": 297 }, { "epoch": 2.3804293559660508, "grad_norm": 0.17734566622982126, "learning_rate": 5.0172665222624395e-05, "loss": 0.4481, "step": 298 }, { "epoch": 2.3884173739390913, "grad_norm": 0.1766718931672107, "learning_rate": 4.995625828053106e-05, "loss": 0.4524, "step": 299 }, { "epoch": 2.3964053919121318, "grad_norm": 0.19583636193380063, "learning_rate": 4.973954022258729e-05, "loss": 0.4547, "step": 300 }, { "epoch": 2.4043934098851723, "grad_norm": 0.17026857168289744, "learning_rate": 4.952251782085757e-05, "loss": 0.448, "step": 301 }, { "epoch": 2.4123814278582127, "grad_norm": 0.1394946256958487, "learning_rate": 4.930519785691657e-05, "loss": 0.4482, "step": 302 }, { "epoch": 2.4203694458312532, "grad_norm": 0.1507130531191368, "learning_rate": 4.9087587121637284e-05, "loss": 0.4489, "step": 303 }, { "epoch": 2.4283574638042937, "grad_norm": 0.19875894846238537, "learning_rate": 4.886969241497878e-05, "loss": 0.4445, "step": 304 }, { "epoch": 2.436345481777334, "grad_norm": 0.23769686285223604, "learning_rate": 4.865152054577379e-05, "loss": 0.4524, "step": 305 }, { "epoch": 2.4443334997503743, "grad_norm": 0.22560472662810682, "learning_rate": 4.843307833151583e-05, "loss": 0.4473, "step": 306 }, { "epoch": 2.452321517723415, "grad_norm": 0.15975420253786612, "learning_rate": 4.82143725981463e-05, "loss": 0.4474, "step": 307 }, { "epoch": 2.4603095356964553, "grad_norm": 0.1453747344586306, "learning_rate": 4.7995410179841065e-05, "loss": 0.4496, "step": 308 }, { "epoch": 2.4682975536694958, "grad_norm": 0.15320122247522389, "learning_rate": 4.777619791879698e-05, "loss": 0.4445, "step": 309 }, { "epoch": 2.4762855716425363, "grad_norm": 0.20898054566985402, "learning_rate": 4.755674266501802e-05, "loss": 0.4557, "step": 310 }, { "epoch": 2.4842735896155768, "grad_norm": 0.21741215675606498, "learning_rate": 4.73370512761013e-05, "loss": 0.4417, "step": 311 }, { "epoch": 2.4922616075886173, "grad_norm": 0.16889794561130403, "learning_rate": 4.711713061702274e-05, "loss": 0.4443, "step": 312 }, { "epoch": 2.5002496255616578, "grad_norm": 0.17993307076723922, "learning_rate": 4.689698755992255e-05, "loss": 0.4479, "step": 313 }, { "epoch": 2.508237643534698, "grad_norm": 0.19257453660181062, "learning_rate": 4.667662898389048e-05, "loss": 0.4491, "step": 314 }, { "epoch": 2.5162256615077383, "grad_norm": 0.1472085090976699, "learning_rate": 4.645606177475089e-05, "loss": 0.4373, "step": 315 }, { "epoch": 2.524213679480779, "grad_norm": 0.19033455613068187, "learning_rate": 4.6235292824847575e-05, "loss": 0.4544, "step": 316 }, { "epoch": 2.5322016974538193, "grad_norm": 0.18170601952075063, "learning_rate": 4.601432903282836e-05, "loss": 0.4412, "step": 317 }, { "epoch": 2.54018971542686, "grad_norm": 0.15727860647785666, "learning_rate": 4.579317730342955e-05, "loss": 0.4399, "step": 318 }, { "epoch": 2.5481777333999003, "grad_norm": 0.17970878529305648, "learning_rate": 4.5571844547260184e-05, "loss": 0.4403, "step": 319 }, { "epoch": 2.5561657513729408, "grad_norm": 0.15429718810042514, "learning_rate": 4.535033768058604e-05, "loss": 0.4485, "step": 320 }, { "epoch": 2.564153769345981, "grad_norm": 0.15715864822910056, "learning_rate": 4.512866362511361e-05, "loss": 0.4467, "step": 321 }, { "epoch": 2.5721417873190213, "grad_norm": 0.14222629722842062, "learning_rate": 4.490682930777368e-05, "loss": 0.4374, "step": 322 }, { "epoch": 2.580129805292062, "grad_norm": 0.16416055580887054, "learning_rate": 4.468484166050499e-05, "loss": 0.4429, "step": 323 }, { "epoch": 2.5881178232651023, "grad_norm": 0.1378665667643313, "learning_rate": 4.446270762003754e-05, "loss": 0.4439, "step": 324 }, { "epoch": 2.596105841238143, "grad_norm": 0.14749790568854468, "learning_rate": 4.424043412767589e-05, "loss": 0.4466, "step": 325 }, { "epoch": 2.6040938592111833, "grad_norm": 0.146540138127552, "learning_rate": 4.401802812908221e-05, "loss": 0.4419, "step": 326 }, { "epoch": 2.612081877184224, "grad_norm": 0.17339116836008553, "learning_rate": 4.379549657405928e-05, "loss": 0.4467, "step": 327 }, { "epoch": 2.620069895157264, "grad_norm": 0.18348099975421248, "learning_rate": 4.35728464163333e-05, "loss": 0.4416, "step": 328 }, { "epoch": 2.628057913130305, "grad_norm": 0.13620309620113327, "learning_rate": 4.335008461333657e-05, "loss": 0.4427, "step": 329 }, { "epoch": 2.636045931103345, "grad_norm": 0.1709480972281254, "learning_rate": 4.312721812599016e-05, "loss": 0.4414, "step": 330 }, { "epoch": 2.6440339490763853, "grad_norm": 0.16164451064940724, "learning_rate": 4.2904253918486295e-05, "loss": 0.4535, "step": 331 }, { "epoch": 2.652021967049426, "grad_norm": 0.14081917286088105, "learning_rate": 4.268119895807084e-05, "loss": 0.4429, "step": 332 }, { "epoch": 2.6600099850224663, "grad_norm": 0.18137180021156257, "learning_rate": 4.245806021482547e-05, "loss": 0.4427, "step": 333 }, { "epoch": 2.667998002995507, "grad_norm": 0.13800609298110714, "learning_rate": 4.2234844661449964e-05, "loss": 0.44, "step": 334 }, { "epoch": 2.6759860209685473, "grad_norm": 0.1551146252415665, "learning_rate": 4.20115592730443e-05, "loss": 0.4507, "step": 335 }, { "epoch": 2.683974038941588, "grad_norm": 0.15173038583107296, "learning_rate": 4.178821102689064e-05, "loss": 0.4426, "step": 336 }, { "epoch": 2.691962056914628, "grad_norm": 0.15116080328062176, "learning_rate": 4.156480690223537e-05, "loss": 0.447, "step": 337 }, { "epoch": 2.6999500748876684, "grad_norm": 0.17450805671193279, "learning_rate": 4.134135388007097e-05, "loss": 0.4469, "step": 338 }, { "epoch": 2.707938092860709, "grad_norm": 0.17281860373285934, "learning_rate": 4.111785894291789e-05, "loss": 0.4427, "step": 339 }, { "epoch": 2.7159261108337494, "grad_norm": 0.13324453353593427, "learning_rate": 4.089432907460634e-05, "loss": 0.45, "step": 340 }, { "epoch": 2.72391412880679, "grad_norm": 0.15126807617639215, "learning_rate": 4.0670771260058106e-05, "loss": 0.4486, "step": 341 }, { "epoch": 2.7319021467798303, "grad_norm": 0.16029221354477333, "learning_rate": 4.044719248506819e-05, "loss": 0.4408, "step": 342 }, { "epoch": 2.739890164752871, "grad_norm": 0.1463219695798821, "learning_rate": 4.0223599736086596e-05, "loss": 0.4479, "step": 343 }, { "epoch": 2.747878182725911, "grad_norm": 0.14595637335852438, "learning_rate": 4e-05, "loss": 0.4473, "step": 344 }, { "epoch": 2.755866200698952, "grad_norm": 0.13738467367514962, "learning_rate": 3.9776400263913404e-05, "loss": 0.4541, "step": 345 }, { "epoch": 2.763854218671992, "grad_norm": 0.1439562510526391, "learning_rate": 3.9552807514931824e-05, "loss": 0.4436, "step": 346 }, { "epoch": 2.7718422366450324, "grad_norm": 0.13006608621756496, "learning_rate": 3.93292287399419e-05, "loss": 0.4397, "step": 347 }, { "epoch": 2.779830254618073, "grad_norm": 0.14041358992697037, "learning_rate": 3.9105670925393665e-05, "loss": 0.4322, "step": 348 }, { "epoch": 2.7878182725911134, "grad_norm": 0.1495382630742624, "learning_rate": 3.8882141057082117e-05, "loss": 0.449, "step": 349 }, { "epoch": 2.795806290564154, "grad_norm": 0.13422760316245289, "learning_rate": 3.8658646119929046e-05, "loss": 0.4481, "step": 350 }, { "epoch": 2.8037943085371944, "grad_norm": 0.16641223994983959, "learning_rate": 3.843519309776464e-05, "loss": 0.4454, "step": 351 }, { "epoch": 2.811782326510235, "grad_norm": 0.12812350342466014, "learning_rate": 3.821178897310938e-05, "loss": 0.4535, "step": 352 }, { "epoch": 2.819770344483275, "grad_norm": 0.15337686560279318, "learning_rate": 3.798844072695571e-05, "loss": 0.4455, "step": 353 }, { "epoch": 2.8277583624563154, "grad_norm": 0.13561487109024523, "learning_rate": 3.776515533855004e-05, "loss": 0.4421, "step": 354 }, { "epoch": 2.835746380429356, "grad_norm": 0.12405473708728454, "learning_rate": 3.7541939785174545e-05, "loss": 0.4433, "step": 355 }, { "epoch": 2.8437343984023964, "grad_norm": 0.12633600414835006, "learning_rate": 3.731880104192917e-05, "loss": 0.4432, "step": 356 }, { "epoch": 2.851722416375437, "grad_norm": 0.1317080752956006, "learning_rate": 3.709574608151371e-05, "loss": 0.4465, "step": 357 }, { "epoch": 2.8597104343484774, "grad_norm": 0.1475249153982226, "learning_rate": 3.687278187400985e-05, "loss": 0.4401, "step": 358 }, { "epoch": 2.867698452321518, "grad_norm": 0.1458288492905671, "learning_rate": 3.664991538666344e-05, "loss": 0.4344, "step": 359 }, { "epoch": 2.875686470294558, "grad_norm": 0.11939958255100196, "learning_rate": 3.6427153583666715e-05, "loss": 0.4367, "step": 360 }, { "epoch": 2.883674488267599, "grad_norm": 0.16554239338436524, "learning_rate": 3.620450342594073e-05, "loss": 0.4418, "step": 361 }, { "epoch": 2.891662506240639, "grad_norm": 0.1187974636724584, "learning_rate": 3.59819718709178e-05, "loss": 0.45, "step": 362 }, { "epoch": 2.8996505242136794, "grad_norm": 0.15936228812392336, "learning_rate": 3.575956587232413e-05, "loss": 0.4508, "step": 363 }, { "epoch": 2.90763854218672, "grad_norm": 0.13367105463505505, "learning_rate": 3.5537292379962474e-05, "loss": 0.4465, "step": 364 }, { "epoch": 2.9156265601597604, "grad_norm": 0.14243006994077556, "learning_rate": 3.5315158339495015e-05, "loss": 0.4464, "step": 365 }, { "epoch": 2.923614578132801, "grad_norm": 0.1399001261869002, "learning_rate": 3.509317069222633e-05, "loss": 0.4502, "step": 366 }, { "epoch": 2.9316025961058414, "grad_norm": 0.13108273735056272, "learning_rate": 3.487133637488639e-05, "loss": 0.4369, "step": 367 }, { "epoch": 2.939590614078882, "grad_norm": 0.14943325684519726, "learning_rate": 3.464966231941397e-05, "loss": 0.4415, "step": 368 }, { "epoch": 2.947578632051922, "grad_norm": 0.13558373438864768, "learning_rate": 3.442815545273983e-05, "loss": 0.4382, "step": 369 }, { "epoch": 2.9555666500249624, "grad_norm": 0.12912584792295748, "learning_rate": 3.420682269657047e-05, "loss": 0.4363, "step": 370 }, { "epoch": 2.963554667998003, "grad_norm": 0.12458007215100302, "learning_rate": 3.398567096717165e-05, "loss": 0.4409, "step": 371 }, { "epoch": 2.9715426859710434, "grad_norm": 0.12840111428281253, "learning_rate": 3.376470717515244e-05, "loss": 0.4407, "step": 372 }, { "epoch": 2.979530703944084, "grad_norm": 0.13058809738960123, "learning_rate": 3.354393822524913e-05, "loss": 0.4407, "step": 373 }, { "epoch": 2.9875187219171244, "grad_norm": 0.15613845334671814, "learning_rate": 3.332337101610953e-05, "loss": 0.4473, "step": 374 }, { "epoch": 2.995506739890165, "grad_norm": 0.13389617942366203, "learning_rate": 3.310301244007747e-05, "loss": 0.4352, "step": 375 }, { "epoch": 3.0034947578632054, "grad_norm": 0.30944417126328405, "learning_rate": 3.2882869382977265e-05, "loss": 0.7723, "step": 376 }, { "epoch": 3.0114827758362455, "grad_norm": 0.29354627871039446, "learning_rate": 3.266294872389871e-05, "loss": 0.4025, "step": 377 }, { "epoch": 3.019470793809286, "grad_norm": 0.2010591684487564, "learning_rate": 3.2443257334981985e-05, "loss": 0.4024, "step": 378 }, { "epoch": 3.0274588117823265, "grad_norm": 0.27298247297612654, "learning_rate": 3.222380208120304e-05, "loss": 0.4089, "step": 379 }, { "epoch": 3.035446829755367, "grad_norm": 0.23270934832932566, "learning_rate": 3.200458982015894e-05, "loss": 0.4072, "step": 380 }, { "epoch": 3.0434348477284074, "grad_norm": 0.20268308202991778, "learning_rate": 3.178562740185372e-05, "loss": 0.4022, "step": 381 }, { "epoch": 3.051422865701448, "grad_norm": 0.20766736812021794, "learning_rate": 3.156692166848418e-05, "loss": 0.4024, "step": 382 }, { "epoch": 3.0594108836744884, "grad_norm": 0.2547479854625852, "learning_rate": 3.134847945422622e-05, "loss": 0.4072, "step": 383 }, { "epoch": 3.067398901647529, "grad_norm": 0.1969866280565691, "learning_rate": 3.113030758502123e-05, "loss": 0.4118, "step": 384 }, { "epoch": 3.075386919620569, "grad_norm": 0.23153499880928385, "learning_rate": 3.091241287836272e-05, "loss": 0.4077, "step": 385 }, { "epoch": 3.0833749375936095, "grad_norm": 0.20503882652518132, "learning_rate": 3.0694802143083436e-05, "loss": 0.4132, "step": 386 }, { "epoch": 3.09136295556665, "grad_norm": 0.17320798113782282, "learning_rate": 3.0477482179142432e-05, "loss": 0.4097, "step": 387 }, { "epoch": 3.0993509735396905, "grad_norm": 0.20168474769945824, "learning_rate": 3.026045977741272e-05, "loss": 0.3965, "step": 388 }, { "epoch": 3.107338991512731, "grad_norm": 0.19398918365065387, "learning_rate": 3.004374171946895e-05, "loss": 0.402, "step": 389 }, { "epoch": 3.1153270094857715, "grad_norm": 0.16700046485980305, "learning_rate": 2.9827334777375622e-05, "loss": 0.4136, "step": 390 }, { "epoch": 3.123315027458812, "grad_norm": 0.25279169008131713, "learning_rate": 2.9611245713475328e-05, "loss": 0.4003, "step": 391 }, { "epoch": 3.131303045431852, "grad_norm": 0.16080528287954057, "learning_rate": 2.9395481280177596e-05, "loss": 0.4011, "step": 392 }, { "epoch": 3.1392910634048925, "grad_norm": 0.22759163441812938, "learning_rate": 2.9180048219747736e-05, "loss": 0.4034, "step": 393 }, { "epoch": 3.147279081377933, "grad_norm": 0.17841534466968145, "learning_rate": 2.8964953264096277e-05, "loss": 0.4086, "step": 394 }, { "epoch": 3.1552670993509735, "grad_norm": 0.17487802806512123, "learning_rate": 2.8750203134568564e-05, "loss": 0.408, "step": 395 }, { "epoch": 3.163255117324014, "grad_norm": 0.18241625540198192, "learning_rate": 2.8535804541734663e-05, "loss": 0.4077, "step": 396 }, { "epoch": 3.1712431352970545, "grad_norm": 0.16398724549614757, "learning_rate": 2.832176418517979e-05, "loss": 0.4098, "step": 397 }, { "epoch": 3.179231153270095, "grad_norm": 0.16170229114317095, "learning_rate": 2.8108088753294864e-05, "loss": 0.4, "step": 398 }, { "epoch": 3.1872191712431355, "grad_norm": 0.14606650542275093, "learning_rate": 2.7894784923067563e-05, "loss": 0.4081, "step": 399 }, { "epoch": 3.195207189216176, "grad_norm": 0.154688060281461, "learning_rate": 2.768185935987362e-05, "loss": 0.4095, "step": 400 }, { "epoch": 3.203195207189216, "grad_norm": 0.14458385897335363, "learning_rate": 2.7469318717268622e-05, "loss": 0.4083, "step": 401 }, { "epoch": 3.2111832251622565, "grad_norm": 0.14953811526297756, "learning_rate": 2.7257169636779992e-05, "loss": 0.4082, "step": 402 }, { "epoch": 3.219171243135297, "grad_norm": 0.13312099784173914, "learning_rate": 2.704541874769958e-05, "loss": 0.4068, "step": 403 }, { "epoch": 3.2271592611083375, "grad_norm": 0.1386674411611782, "learning_rate": 2.6834072666876427e-05, "loss": 0.402, "step": 404 }, { "epoch": 3.235147279081378, "grad_norm": 0.12924251838188583, "learning_rate": 2.6623137998509964e-05, "loss": 0.4113, "step": 405 }, { "epoch": 3.2431352970544185, "grad_norm": 0.13222743176356805, "learning_rate": 2.641262133394378e-05, "loss": 0.4093, "step": 406 }, { "epoch": 3.251123315027459, "grad_norm": 0.13021912109847186, "learning_rate": 2.6202529251459475e-05, "loss": 0.4104, "step": 407 }, { "epoch": 3.259111333000499, "grad_norm": 0.13606000089551518, "learning_rate": 2.599286831607127e-05, "loss": 0.4089, "step": 408 }, { "epoch": 3.2670993509735395, "grad_norm": 0.13357003115707924, "learning_rate": 2.5783645079320757e-05, "loss": 0.4055, "step": 409 }, { "epoch": 3.27508736894658, "grad_norm": 0.1232470250676397, "learning_rate": 2.5574866079072188e-05, "loss": 0.4133, "step": 410 }, { "epoch": 3.2830753869196205, "grad_norm": 0.14061126711951444, "learning_rate": 2.5366537839308213e-05, "loss": 0.4023, "step": 411 }, { "epoch": 3.291063404892661, "grad_norm": 0.12020419683198272, "learning_rate": 2.515866686992599e-05, "loss": 0.406, "step": 412 }, { "epoch": 3.2990514228657015, "grad_norm": 0.13624018306536384, "learning_rate": 2.4951259666533778e-05, "loss": 0.4137, "step": 413 }, { "epoch": 3.307039440838742, "grad_norm": 0.13470595005125394, "learning_rate": 2.4744322710247914e-05, "loss": 0.4072, "step": 414 }, { "epoch": 3.3150274588117825, "grad_norm": 0.11406991036845995, "learning_rate": 2.4537862467490393e-05, "loss": 0.4032, "step": 415 }, { "epoch": 3.323015476784823, "grad_norm": 0.12469392558548403, "learning_rate": 2.4331885389786648e-05, "loss": 0.4061, "step": 416 }, { "epoch": 3.331003494757863, "grad_norm": 0.11240496673470576, "learning_rate": 2.4126397913564138e-05, "loss": 0.3972, "step": 417 }, { "epoch": 3.3389915127309036, "grad_norm": 0.11440176304944144, "learning_rate": 2.3921406459951038e-05, "loss": 0.401, "step": 418 }, { "epoch": 3.346979530703944, "grad_norm": 0.12061267695807164, "learning_rate": 2.371691743457573e-05, "loss": 0.4042, "step": 419 }, { "epoch": 3.3549675486769845, "grad_norm": 0.12408924452739928, "learning_rate": 2.3512937227366548e-05, "loss": 0.4042, "step": 420 }, { "epoch": 3.362955566650025, "grad_norm": 0.119324320832681, "learning_rate": 2.330947221235217e-05, "loss": 0.3999, "step": 421 }, { "epoch": 3.3709435846230655, "grad_norm": 0.11372319294009971, "learning_rate": 2.3106528747462374e-05, "loss": 0.411, "step": 422 }, { "epoch": 3.378931602596106, "grad_norm": 0.11440578627516848, "learning_rate": 2.290411317432942e-05, "loss": 0.4103, "step": 423 }, { "epoch": 3.386919620569146, "grad_norm": 0.11396557333843903, "learning_rate": 2.270223181808988e-05, "loss": 0.4056, "step": 424 }, { "epoch": 3.3949076385421866, "grad_norm": 0.1073175497389294, "learning_rate": 2.250089098718692e-05, "loss": 0.4001, "step": 425 }, { "epoch": 3.402895656515227, "grad_norm": 0.11142545752473547, "learning_rate": 2.2300096973173276e-05, "loss": 0.4013, "step": 426 }, { "epoch": 3.4108836744882676, "grad_norm": 0.11528253053702402, "learning_rate": 2.2099856050514593e-05, "loss": 0.4074, "step": 427 }, { "epoch": 3.418871692461308, "grad_norm": 0.1075239061798206, "learning_rate": 2.1900174476393335e-05, "loss": 0.4035, "step": 428 }, { "epoch": 3.4268597104343486, "grad_norm": 0.10808021553369461, "learning_rate": 2.170105849051332e-05, "loss": 0.4052, "step": 429 }, { "epoch": 3.434847728407389, "grad_norm": 0.11387661467604573, "learning_rate": 2.1502514314904723e-05, "loss": 0.4011, "step": 430 }, { "epoch": 3.442835746380429, "grad_norm": 0.10171924087995715, "learning_rate": 2.1304548153729596e-05, "loss": 0.4077, "step": 431 }, { "epoch": 3.4508237643534696, "grad_norm": 0.1285002444781682, "learning_rate": 2.1107166193088073e-05, "loss": 0.4063, "step": 432 }, { "epoch": 3.45881178232651, "grad_norm": 0.11335168282371334, "learning_rate": 2.091037460082503e-05, "loss": 0.4154, "step": 433 }, { "epoch": 3.4667998002995506, "grad_norm": 0.11343444669438019, "learning_rate": 2.0714179526337334e-05, "loss": 0.41, "step": 434 }, { "epoch": 3.474787818272591, "grad_norm": 0.1217156602130217, "learning_rate": 2.0518587100381727e-05, "loss": 0.4075, "step": 435 }, { "epoch": 3.4827758362456316, "grad_norm": 0.10725293992167916, "learning_rate": 2.0323603434883186e-05, "loss": 0.4066, "step": 436 }, { "epoch": 3.490763854218672, "grad_norm": 0.12028103178489573, "learning_rate": 2.0129234622744044e-05, "loss": 0.4103, "step": 437 }, { "epoch": 3.4987518721917126, "grad_norm": 0.1029854987347421, "learning_rate": 1.9935486737653452e-05, "loss": 0.4038, "step": 438 }, { "epoch": 3.506739890164753, "grad_norm": 0.11857347505878003, "learning_rate": 1.9742365833897733e-05, "loss": 0.4074, "step": 439 }, { "epoch": 3.514727908137793, "grad_norm": 0.1105825700379065, "learning_rate": 1.954987794617107e-05, "loss": 0.4105, "step": 440 }, { "epoch": 3.5227159261108336, "grad_norm": 0.11511596034752838, "learning_rate": 1.9358029089387034e-05, "loss": 0.4131, "step": 441 }, { "epoch": 3.530703944083874, "grad_norm": 0.11612657903144337, "learning_rate": 1.916682525849058e-05, "loss": 0.4068, "step": 442 }, { "epoch": 3.5386919620569146, "grad_norm": 0.10575599755099882, "learning_rate": 1.897627242827068e-05, "loss": 0.4038, "step": 443 }, { "epoch": 3.546679980029955, "grad_norm": 0.11088748332110426, "learning_rate": 1.878637655317372e-05, "loss": 0.4078, "step": 444 }, { "epoch": 3.5546679980029956, "grad_norm": 0.11466223345296331, "learning_rate": 1.859714356711731e-05, "loss": 0.3939, "step": 445 }, { "epoch": 3.562656015976036, "grad_norm": 0.11673865175002288, "learning_rate": 1.8408579383304985e-05, "loss": 0.4049, "step": 446 }, { "epoch": 3.570644033949076, "grad_norm": 0.11577952607867907, "learning_rate": 1.8220689894041314e-05, "loss": 0.4088, "step": 447 }, { "epoch": 3.578632051922117, "grad_norm": 0.10690091900937719, "learning_rate": 1.8033480970547872e-05, "loss": 0.4056, "step": 448 }, { "epoch": 3.586620069895157, "grad_norm": 0.11541573082426308, "learning_rate": 1.7846958462779716e-05, "loss": 0.4007, "step": 449 }, { "epoch": 3.5946080878681976, "grad_norm": 0.1100114302346526, "learning_rate": 1.7661128199242576e-05, "loss": 0.4089, "step": 450 }, { "epoch": 3.602596105841238, "grad_norm": 0.10956511339867736, "learning_rate": 1.7475995986810775e-05, "loss": 0.4018, "step": 451 }, { "epoch": 3.6105841238142786, "grad_norm": 0.10850454028936493, "learning_rate": 1.7291567610545738e-05, "loss": 0.4051, "step": 452 }, { "epoch": 3.618572141787319, "grad_norm": 0.1131878747175685, "learning_rate": 1.7107848833515244e-05, "loss": 0.4079, "step": 453 }, { "epoch": 3.6265601597603596, "grad_norm": 0.09884020665129564, "learning_rate": 1.6924845396613275e-05, "loss": 0.407, "step": 454 }, { "epoch": 3.6345481777334, "grad_norm": 0.11216709502149264, "learning_rate": 1.6742563018380734e-05, "loss": 0.4087, "step": 455 }, { "epoch": 3.64253619570644, "grad_norm": 0.0996580768122796, "learning_rate": 1.6561007394826623e-05, "loss": 0.4039, "step": 456 }, { "epoch": 3.6505242136794807, "grad_norm": 0.10651639312645377, "learning_rate": 1.638018419925018e-05, "loss": 0.3996, "step": 457 }, { "epoch": 3.658512231652521, "grad_norm": 0.09841162160967377, "learning_rate": 1.6200099082063477e-05, "loss": 0.4055, "step": 458 }, { "epoch": 3.6665002496255616, "grad_norm": 0.11559374542937897, "learning_rate": 1.602075767061497e-05, "loss": 0.4088, "step": 459 }, { "epoch": 3.674488267598602, "grad_norm": 0.11049592658320795, "learning_rate": 1.584216556901355e-05, "loss": 0.4053, "step": 460 }, { "epoch": 3.6824762855716426, "grad_norm": 0.09690459875455099, "learning_rate": 1.566432835795349e-05, "loss": 0.4052, "step": 461 }, { "epoch": 3.690464303544683, "grad_norm": 0.11084043420560455, "learning_rate": 1.5487251594540062e-05, "loss": 0.4013, "step": 462 }, { "epoch": 3.698452321517723, "grad_norm": 0.11145942008644477, "learning_rate": 1.5310940812115812e-05, "loss": 0.404, "step": 463 }, { "epoch": 3.706440339490764, "grad_norm": 0.09702858045834936, "learning_rate": 1.5135401520087757e-05, "loss": 0.4033, "step": 464 }, { "epoch": 3.714428357463804, "grad_norm": 0.10073536549329104, "learning_rate": 1.4960639203755136e-05, "loss": 0.4046, "step": 465 }, { "epoch": 3.7224163754368447, "grad_norm": 0.09948648507952308, "learning_rate": 1.4786659324138075e-05, "loss": 0.4041, "step": 466 }, { "epoch": 3.730404393409885, "grad_norm": 0.09373041246826647, "learning_rate": 1.4613467317806861e-05, "loss": 0.4075, "step": 467 }, { "epoch": 3.7383924113829257, "grad_norm": 0.10208986391007283, "learning_rate": 1.4441068596712157e-05, "loss": 0.3999, "step": 468 }, { "epoch": 3.746380429355966, "grad_norm": 0.10239549924151786, "learning_rate": 1.4269468548015785e-05, "loss": 0.3954, "step": 469 }, { "epoch": 3.7543684473290067, "grad_norm": 0.10434926470085772, "learning_rate": 1.4098672533922471e-05, "loss": 0.4103, "step": 470 }, { "epoch": 3.762356465302047, "grad_norm": 0.1022671854724037, "learning_rate": 1.3928685891512248e-05, "loss": 0.4068, "step": 471 }, { "epoch": 3.770344483275087, "grad_norm": 0.10372672313209318, "learning_rate": 1.375951393257365e-05, "loss": 0.4063, "step": 472 }, { "epoch": 3.7783325012481277, "grad_norm": 0.1001467709798247, "learning_rate": 1.35911619434378e-05, "loss": 0.3982, "step": 473 }, { "epoch": 3.786320519221168, "grad_norm": 0.10848171250475616, "learning_rate": 1.3423635184813182e-05, "loss": 0.3994, "step": 474 }, { "epoch": 3.7943085371942087, "grad_norm": 0.10297059459791853, "learning_rate": 1.3256938891621208e-05, "loss": 0.4051, "step": 475 }, { "epoch": 3.802296555167249, "grad_norm": 0.09850487513786725, "learning_rate": 1.3091078272832732e-05, "loss": 0.4039, "step": 476 }, { "epoch": 3.8102845731402897, "grad_norm": 0.09654837279347964, "learning_rate": 1.2926058511305221e-05, "loss": 0.4027, "step": 477 }, { "epoch": 3.81827259111333, "grad_norm": 0.10106233469187086, "learning_rate": 1.2761884763620773e-05, "loss": 0.4028, "step": 478 }, { "epoch": 3.8262606090863702, "grad_norm": 0.10521144963578496, "learning_rate": 1.2598562159925068e-05, "loss": 0.4047, "step": 479 }, { "epoch": 3.8342486270594107, "grad_norm": 0.10055170090272858, "learning_rate": 1.2436095803766946e-05, "loss": 0.408, "step": 480 }, { "epoch": 3.842236645032451, "grad_norm": 0.10030963723827738, "learning_rate": 1.2274490771939047e-05, "loss": 0.4139, "step": 481 }, { "epoch": 3.8502246630054917, "grad_norm": 0.11291159209305866, "learning_rate": 1.2113752114319107e-05, "loss": 0.4075, "step": 482 }, { "epoch": 3.858212680978532, "grad_norm": 0.09711749318081525, "learning_rate": 1.195388485371213e-05, "loss": 0.4008, "step": 483 }, { "epoch": 3.8662006989515727, "grad_norm": 0.09587092246627478, "learning_rate": 1.1794893985693517e-05, "loss": 0.4072, "step": 484 }, { "epoch": 3.874188716924613, "grad_norm": 0.10842252534792915, "learning_rate": 1.1636784478452872e-05, "loss": 0.3983, "step": 485 }, { "epoch": 3.8821767348976532, "grad_norm": 0.10030989962078998, "learning_rate": 1.1479561272638851e-05, "loss": 0.405, "step": 486 }, { "epoch": 3.890164752870694, "grad_norm": 0.09668292596476558, "learning_rate": 1.1323229281204667e-05, "loss": 0.4046, "step": 487 }, { "epoch": 3.8981527708437342, "grad_norm": 0.11229884300303226, "learning_rate": 1.1167793389254671e-05, "loss": 0.4077, "step": 488 }, { "epoch": 3.9061407888167747, "grad_norm": 0.1007265262970734, "learning_rate": 1.1013258453891624e-05, "loss": 0.4079, "step": 489 }, { "epoch": 3.9141288067898152, "grad_norm": 0.09800596022091544, "learning_rate": 1.0859629304064966e-05, "loss": 0.4124, "step": 490 }, { "epoch": 3.9221168247628557, "grad_norm": 0.0972237859271068, "learning_rate": 1.0706910740419927e-05, "loss": 0.3995, "step": 491 }, { "epoch": 3.930104842735896, "grad_norm": 0.09568160375704794, "learning_rate": 1.055510753514744e-05, "loss": 0.4044, "step": 492 }, { "epoch": 3.9380928607089367, "grad_norm": 0.10293942587001009, "learning_rate": 1.0404224431835127e-05, "loss": 0.3999, "step": 493 }, { "epoch": 3.946080878681977, "grad_norm": 0.09547704742606819, "learning_rate": 1.025426614531897e-05, "loss": 0.4012, "step": 494 }, { "epoch": 3.9540688966550173, "grad_norm": 0.09843903422495338, "learning_rate": 1.0105237361536058e-05, "loss": 0.4029, "step": 495 }, { "epoch": 3.9620569146280578, "grad_norm": 0.0995011244677626, "learning_rate": 9.957142737378128e-06, "loss": 0.4084, "step": 496 }, { "epoch": 3.9700449326010983, "grad_norm": 0.10559619287684664, "learning_rate": 9.809986900546011e-06, "loss": 0.4031, "step": 497 }, { "epoch": 3.9780329505741387, "grad_norm": 0.09619833393540202, "learning_rate": 9.663774449405095e-06, "loss": 0.3986, "step": 498 }, { "epoch": 3.9860209685471792, "grad_norm": 0.09183866726575214, "learning_rate": 9.518509952841586e-06, "loss": 0.4066, "step": 499 }, { "epoch": 3.9940089865202197, "grad_norm": 0.09366222741801747, "learning_rate": 9.374197950119726e-06, "loss": 0.4039, "step": 500 }, { "epoch": 4.00199700449326, "grad_norm": 0.2243873122878777, "learning_rate": 9.230842950740002e-06, "loss": 0.7111, "step": 501 }, { "epoch": 4.0099850224663, "grad_norm": 0.16901611231637365, "learning_rate": 9.088449434298204e-06, "loss": 0.3809, "step": 502 }, { "epoch": 4.017973040439341, "grad_norm": 0.13423991777929192, "learning_rate": 8.947021850345398e-06, "loss": 0.3726, "step": 503 }, { "epoch": 4.025961058412381, "grad_norm": 0.1178503561561421, "learning_rate": 8.806564618248999e-06, "loss": 0.3808, "step": 504 }, { "epoch": 4.033949076385422, "grad_norm": 0.14732236266291146, "learning_rate": 8.667082127054533e-06, "loss": 0.3832, "step": 505 }, { "epoch": 4.041937094358462, "grad_norm": 0.15778749749862814, "learning_rate": 8.52857873534862e-06, "loss": 0.3779, "step": 506 }, { "epoch": 4.049925112331502, "grad_norm": 0.1386627835810346, "learning_rate": 8.391058771122673e-06, "loss": 0.3831, "step": 507 }, { "epoch": 4.057913130304543, "grad_norm": 0.12467125366104446, "learning_rate": 8.254526531637727e-06, "loss": 0.3874, "step": 508 }, { "epoch": 4.065901148277583, "grad_norm": 0.1255433393864893, "learning_rate": 8.118986283290096e-06, "loss": 0.3873, "step": 509 }, { "epoch": 4.073889166250624, "grad_norm": 0.13259474782114336, "learning_rate": 7.984442261478108e-06, "loss": 0.3779, "step": 510 }, { "epoch": 4.081877184223664, "grad_norm": 0.1313024483917481, "learning_rate": 7.850898670469745e-06, "loss": 0.3796, "step": 511 }, { "epoch": 4.089865202196705, "grad_norm": 0.12062485435615429, "learning_rate": 7.718359683271224e-06, "loss": 0.3801, "step": 512 }, { "epoch": 4.097853220169745, "grad_norm": 0.11323572660175975, "learning_rate": 7.586829441496668e-06, "loss": 0.3692, "step": 513 }, { "epoch": 4.105841238142786, "grad_norm": 0.12334975202412422, "learning_rate": 7.456312055238606e-06, "loss": 0.3792, "step": 514 }, { "epoch": 4.113829256115826, "grad_norm": 0.12055598843637728, "learning_rate": 7.326811602939634e-06, "loss": 0.3825, "step": 515 }, { "epoch": 4.121817274088866, "grad_norm": 0.11922302158014507, "learning_rate": 7.198332131264876e-06, "loss": 0.3827, "step": 516 }, { "epoch": 4.129805292061907, "grad_norm": 0.1197396216655153, "learning_rate": 7.070877654975614e-06, "loss": 0.3858, "step": 517 }, { "epoch": 4.137793310034947, "grad_norm": 0.10303380534845168, "learning_rate": 6.944452156803763e-06, "loss": 0.3763, "step": 518 }, { "epoch": 4.145781328007988, "grad_norm": 0.10771564322360738, "learning_rate": 6.819059587327479e-06, "loss": 0.3798, "step": 519 }, { "epoch": 4.153769345981028, "grad_norm": 0.11083630377147478, "learning_rate": 6.694703864847673e-06, "loss": 0.3812, "step": 520 }, { "epoch": 4.161757363954069, "grad_norm": 0.1036678793429057, "learning_rate": 6.571388875265592e-06, "loss": 0.3804, "step": 521 }, { "epoch": 4.169745381927109, "grad_norm": 0.10290514415858039, "learning_rate": 6.449118471961342e-06, "loss": 0.3815, "step": 522 }, { "epoch": 4.177733399900149, "grad_norm": 0.09999602037947594, "learning_rate": 6.327896475673561e-06, "loss": 0.3796, "step": 523 }, { "epoch": 4.18572141787319, "grad_norm": 0.10176649553175782, "learning_rate": 6.207726674379961e-06, "loss": 0.3802, "step": 524 }, { "epoch": 4.19370943584623, "grad_norm": 0.10341756297649503, "learning_rate": 6.088612823178968e-06, "loss": 0.3752, "step": 525 }, { "epoch": 4.201697453819271, "grad_norm": 0.10010538520744762, "learning_rate": 5.970558644172424e-06, "loss": 0.3772, "step": 526 }, { "epoch": 4.209685471792311, "grad_norm": 0.09383564748055143, "learning_rate": 5.853567826349213e-06, "loss": 0.3738, "step": 527 }, { "epoch": 4.217673489765352, "grad_norm": 0.09458974198014311, "learning_rate": 5.737644025470057e-06, "loss": 0.3752, "step": 528 }, { "epoch": 4.225661507738392, "grad_norm": 0.10200091444940393, "learning_rate": 5.6227908639532045e-06, "loss": 0.3822, "step": 529 }, { "epoch": 4.233649525711433, "grad_norm": 0.09730500481091861, "learning_rate": 5.509011930761308e-06, "loss": 0.381, "step": 530 }, { "epoch": 4.241637543684473, "grad_norm": 0.09532139450671104, "learning_rate": 5.396310781289243e-06, "loss": 0.3816, "step": 531 }, { "epoch": 4.249625561657513, "grad_norm": 0.09644789239600618, "learning_rate": 5.284690937252977e-06, "loss": 0.3696, "step": 532 }, { "epoch": 4.257613579630554, "grad_norm": 0.10066108394874461, "learning_rate": 5.1741558865795906e-06, "loss": 0.3859, "step": 533 }, { "epoch": 4.265601597603594, "grad_norm": 0.09693373503450557, "learning_rate": 5.064709083298214e-06, "loss": 0.3822, "step": 534 }, { "epoch": 4.273589615576635, "grad_norm": 0.08926912859612744, "learning_rate": 4.95635394743216e-06, "loss": 0.3782, "step": 535 }, { "epoch": 4.281577633549675, "grad_norm": 0.09076499502790894, "learning_rate": 4.849093864891994e-06, "loss": 0.3822, "step": 536 }, { "epoch": 4.289565651522716, "grad_norm": 0.08773426720097247, "learning_rate": 4.7429321873697865e-06, "loss": 0.3783, "step": 537 }, { "epoch": 4.297553669495756, "grad_norm": 0.0879998661027265, "learning_rate": 4.637872232234326e-06, "loss": 0.3805, "step": 538 }, { "epoch": 4.305541687468796, "grad_norm": 0.09409764491066522, "learning_rate": 4.5339172824274955e-06, "loss": 0.3795, "step": 539 }, { "epoch": 4.313529705441837, "grad_norm": 0.089547761049764, "learning_rate": 4.4310705863616835e-06, "loss": 0.3794, "step": 540 }, { "epoch": 4.321517723414877, "grad_norm": 0.09066889486649515, "learning_rate": 4.329335357818236e-06, "loss": 0.3759, "step": 541 }, { "epoch": 4.329505741387918, "grad_norm": 0.09166858551173564, "learning_rate": 4.228714775847084e-06, "loss": 0.3877, "step": 542 }, { "epoch": 4.337493759360958, "grad_norm": 0.09606401143384108, "learning_rate": 4.129211984667385e-06, "loss": 0.3803, "step": 543 }, { "epoch": 4.345481777333999, "grad_norm": 0.08718065900580216, "learning_rate": 4.030830093569247e-06, "loss": 0.3764, "step": 544 }, { "epoch": 4.353469795307039, "grad_norm": 0.09279816372084171, "learning_rate": 3.933572176816602e-06, "loss": 0.3818, "step": 545 }, { "epoch": 4.361457813280079, "grad_norm": 0.0895802314032739, "learning_rate": 3.837441273551137e-06, "loss": 0.3749, "step": 546 }, { "epoch": 4.36944583125312, "grad_norm": 0.08925866632093443, "learning_rate": 3.7424403876972924e-06, "loss": 0.3741, "step": 547 }, { "epoch": 4.37743384922616, "grad_norm": 0.09077149176473304, "learning_rate": 3.6485724878684382e-06, "loss": 0.3889, "step": 548 }, { "epoch": 4.385421867199201, "grad_norm": 0.08624641665702638, "learning_rate": 3.555840507274093e-06, "loss": 0.3788, "step": 549 }, { "epoch": 4.393409885172241, "grad_norm": 0.09155307608035071, "learning_rate": 3.464247343628242e-06, "loss": 0.3833, "step": 550 }, { "epoch": 4.401397903145282, "grad_norm": 0.08659198159245704, "learning_rate": 3.373795859058837e-06, "loss": 0.3756, "step": 551 }, { "epoch": 4.409385921118322, "grad_norm": 0.08959149189104454, "learning_rate": 3.284488880018315e-06, "loss": 0.3809, "step": 552 }, { "epoch": 4.417373939091363, "grad_norm": 0.08570866197339067, "learning_rate": 3.196329197195307e-06, "loss": 0.379, "step": 553 }, { "epoch": 4.425361957064403, "grad_norm": 0.08585759689716206, "learning_rate": 3.1093195654274024e-06, "loss": 0.3844, "step": 554 }, { "epoch": 4.433349975037443, "grad_norm": 0.08851894364844058, "learning_rate": 3.0234627036151186e-06, "loss": 0.3754, "step": 555 }, { "epoch": 4.441337993010484, "grad_norm": 0.08546993455255109, "learning_rate": 2.9387612946368647e-06, "loss": 0.3767, "step": 556 }, { "epoch": 4.449326010983524, "grad_norm": 0.08689133858962513, "learning_rate": 2.855217985265184e-06, "loss": 0.3818, "step": 557 }, { "epoch": 4.457314028956565, "grad_norm": 0.08705508747400349, "learning_rate": 2.7728353860839763e-06, "loss": 0.3789, "step": 558 }, { "epoch": 4.465302046929605, "grad_norm": 0.08598514484683649, "learning_rate": 2.6916160714069817e-06, "loss": 0.3721, "step": 559 }, { "epoch": 4.473290064902646, "grad_norm": 0.08768951265999986, "learning_rate": 2.6115625791973155e-06, "loss": 0.3777, "step": 560 }, { "epoch": 4.481278082875686, "grad_norm": 0.08479223708104064, "learning_rate": 2.5326774109881223e-06, "loss": 0.3805, "step": 561 }, { "epoch": 4.489266100848727, "grad_norm": 0.08131123805163427, "learning_rate": 2.454963031804485e-06, "loss": 0.3746, "step": 562 }, { "epoch": 4.497254118821767, "grad_norm": 0.08329047935604311, "learning_rate": 2.378421870086314e-06, "loss": 0.3761, "step": 563 }, { "epoch": 4.5052421367948075, "grad_norm": 0.08462162107210089, "learning_rate": 2.3030563176125444e-06, "loss": 0.3738, "step": 564 }, { "epoch": 4.513230154767848, "grad_norm": 0.09812143956960612, "learning_rate": 2.228868729426319e-06, "loss": 0.3765, "step": 565 }, { "epoch": 4.521218172740888, "grad_norm": 0.08490273500457897, "learning_rate": 2.1558614237614516e-06, "loss": 0.3778, "step": 566 }, { "epoch": 4.529206190713929, "grad_norm": 0.08570430572140957, "learning_rate": 2.0840366819699788e-06, "loss": 0.3857, "step": 567 }, { "epoch": 4.537194208686969, "grad_norm": 0.08300561137308456, "learning_rate": 2.013396748450842e-06, "loss": 0.3761, "step": 568 }, { "epoch": 4.54518222666001, "grad_norm": 0.08443227783552133, "learning_rate": 1.9439438305797776e-06, "loss": 0.3756, "step": 569 }, { "epoch": 4.55317024463305, "grad_norm": 0.08135395570142633, "learning_rate": 1.8756800986403466e-06, "loss": 0.3782, "step": 570 }, { "epoch": 4.5611582626060905, "grad_norm": 0.08279967533402854, "learning_rate": 1.808607685756103e-06, "loss": 0.3776, "step": 571 }, { "epoch": 4.569146280579131, "grad_norm": 0.0834623870625263, "learning_rate": 1.7427286878239247e-06, "loss": 0.3713, "step": 572 }, { "epoch": 4.5771342985521715, "grad_norm": 0.08512591892730595, "learning_rate": 1.6780451634485606e-06, "loss": 0.3781, "step": 573 }, { "epoch": 4.585122316525212, "grad_norm": 0.08121169235017031, "learning_rate": 1.614559133878264e-06, "loss": 0.3822, "step": 574 }, { "epoch": 4.5931103344982525, "grad_norm": 0.0815454483422227, "learning_rate": 1.5522725829416474e-06, "loss": 0.3789, "step": 575 }, { "epoch": 4.601098352471293, "grad_norm": 0.0819923460505712, "learning_rate": 1.4911874569856965e-06, "loss": 0.3777, "step": 576 }, { "epoch": 4.6090863704443334, "grad_norm": 0.08276809528907374, "learning_rate": 1.4313056648149393e-06, "loss": 0.3818, "step": 577 }, { "epoch": 4.6170743884173735, "grad_norm": 0.08123407989783393, "learning_rate": 1.3726290776318175e-06, "loss": 0.3752, "step": 578 }, { "epoch": 4.625062406390414, "grad_norm": 0.08137283984240884, "learning_rate": 1.3151595289781738e-06, "loss": 0.3846, "step": 579 }, { "epoch": 4.6330504243634545, "grad_norm": 0.08150026114578374, "learning_rate": 1.2588988146780135e-06, "loss": 0.3884, "step": 580 }, { "epoch": 4.641038442336495, "grad_norm": 0.08281920320562544, "learning_rate": 1.2038486927813354e-06, "loss": 0.3841, "step": 581 }, { "epoch": 4.6490264603095355, "grad_norm": 0.08355306503400638, "learning_rate": 1.1500108835092472e-06, "loss": 0.3812, "step": 582 }, { "epoch": 4.657014478282576, "grad_norm": 0.08418060141581976, "learning_rate": 1.0973870692001554e-06, "loss": 0.3792, "step": 583 }, { "epoch": 4.6650024962556165, "grad_norm": 0.08223524263153421, "learning_rate": 1.0459788942572423e-06, "loss": 0.3843, "step": 584 }, { "epoch": 4.6729905142286565, "grad_norm": 0.08271968804038993, "learning_rate": 9.957879650970549e-07, "loss": 0.3857, "step": 585 }, { "epoch": 4.6809785322016975, "grad_norm": 0.08244656434489289, "learning_rate": 9.468158500993207e-07, "loss": 0.3874, "step": 586 }, { "epoch": 4.6889665501747375, "grad_norm": 0.0819506533129172, "learning_rate": 8.990640795579186e-07, "loss": 0.3808, "step": 587 }, { "epoch": 4.6969545681477785, "grad_norm": 0.08149745500782653, "learning_rate": 8.525341456330883e-07, "loss": 0.3727, "step": 588 }, { "epoch": 4.7049425861208185, "grad_norm": 0.08076187044142838, "learning_rate": 8.072275023047926e-07, "loss": 0.3761, "step": 589 }, { "epoch": 4.712930604093859, "grad_norm": 0.08151591065997134, "learning_rate": 7.631455653272613e-07, "loss": 0.3832, "step": 590 }, { "epoch": 4.7209186220668995, "grad_norm": 0.08462271380326744, "learning_rate": 7.202897121847852e-07, "loss": 0.3749, "step": 591 }, { "epoch": 4.72890664003994, "grad_norm": 0.08308313300815548, "learning_rate": 6.786612820486449e-07, "loss": 0.3742, "step": 592 }, { "epoch": 4.7368946580129805, "grad_norm": 0.08421663571704587, "learning_rate": 6.382615757352817e-07, "loss": 0.383, "step": 593 }, { "epoch": 4.744882675986021, "grad_norm": 0.08208417725816322, "learning_rate": 5.990918556656411e-07, "loss": 0.3802, "step": 594 }, { "epoch": 4.7528706939590615, "grad_norm": 0.08235652164981158, "learning_rate": 5.611533458257245e-07, "loss": 0.3826, "step": 595 }, { "epoch": 4.7608587119321015, "grad_norm": 0.0823525460961533, "learning_rate": 5.2444723172834e-07, "loss": 0.375, "step": 596 }, { "epoch": 4.7688467299051425, "grad_norm": 0.08291828155167397, "learning_rate": 4.889746603760693e-07, "loss": 0.3841, "step": 597 }, { "epoch": 4.7768347478781825, "grad_norm": 0.0809741018796145, "learning_rate": 4.5473674022541213e-07, "loss": 0.3753, "step": 598 }, { "epoch": 4.7848227658512235, "grad_norm": 0.08124124038278724, "learning_rate": 4.2173454115214783e-07, "loss": 0.3838, "step": 599 }, { "epoch": 4.7928107838242635, "grad_norm": 0.08103520713384339, "learning_rate": 3.899690944179257e-07, "loss": 0.3765, "step": 600 }, { "epoch": 4.8007988017973044, "grad_norm": 0.08227217638870313, "learning_rate": 3.5944139263800694e-07, "loss": 0.3834, "step": 601 }, { "epoch": 4.8087868197703445, "grad_norm": 0.07899228317121158, "learning_rate": 3.3015238975026675e-07, "loss": 0.3694, "step": 602 }, { "epoch": 4.8167748377433846, "grad_norm": 0.09227389493594652, "learning_rate": 3.021030009853876e-07, "loss": 0.3783, "step": 603 }, { "epoch": 4.8247628557164255, "grad_norm": 0.08106531182197436, "learning_rate": 2.752941028382594e-07, "loss": 0.3773, "step": 604 }, { "epoch": 4.8327508736894655, "grad_norm": 0.08015145752167932, "learning_rate": 2.4972653304057073e-07, "loss": 0.3777, "step": 605 }, { "epoch": 4.8407388916625065, "grad_norm": 0.08160453860592876, "learning_rate": 2.25401090534656e-07, "loss": 0.3808, "step": 606 }, { "epoch": 4.8487269096355465, "grad_norm": 0.07966427336497452, "learning_rate": 2.0231853544852465e-07, "loss": 0.3744, "step": 607 }, { "epoch": 4.8567149276085875, "grad_norm": 0.08123242623536424, "learning_rate": 1.8047958907209339e-07, "loss": 0.3825, "step": 608 }, { "epoch": 4.8647029455816275, "grad_norm": 0.0805412707928896, "learning_rate": 1.5988493383466198e-07, "loss": 0.3749, "step": 609 }, { "epoch": 4.872690963554668, "grad_norm": 0.08036474123731352, "learning_rate": 1.40535213283588e-07, "loss": 0.3748, "step": 610 }, { "epoch": 4.8806789815277085, "grad_norm": 0.08213950898863626, "learning_rate": 1.2243103206417418e-07, "loss": 0.3819, "step": 611 }, { "epoch": 4.888666999500749, "grad_norm": 0.07935174486804004, "learning_rate": 1.05572955900759e-07, "loss": 0.3827, "step": 612 }, { "epoch": 4.8966550174737895, "grad_norm": 0.07731873027438858, "learning_rate": 8.996151157907306e-08, "loss": 0.3674, "step": 613 }, { "epoch": 4.90464303544683, "grad_norm": 0.07905308777211134, "learning_rate": 7.559718692974116e-08, "loss": 0.3755, "step": 614 }, { "epoch": 4.9126310534198705, "grad_norm": 0.08188223266669394, "learning_rate": 6.248043081307664e-08, "loss": 0.3848, "step": 615 }, { "epoch": 4.9206190713929105, "grad_norm": 0.07960614583875532, "learning_rate": 5.0611653105003824e-08, "loss": 0.3754, "step": 616 }, { "epoch": 4.928607089365951, "grad_norm": 0.08159036451816658, "learning_rate": 3.99912246843126e-08, "loss": 0.384, "step": 617 }, { "epoch": 4.9365951073389915, "grad_norm": 0.08068916515639828, "learning_rate": 3.061947742101001e-08, "loss": 0.3797, "step": 618 }, { "epoch": 4.944583125312032, "grad_norm": 0.07945334308304049, "learning_rate": 2.2496704165995142e-08, "loss": 0.378, "step": 619 }, { "epoch": 4.9525711432850725, "grad_norm": 0.08051278989431843, "learning_rate": 1.5623158741884247e-08, "loss": 0.3804, "step": 620 }, { "epoch": 4.960559161258113, "grad_norm": 0.07952219440080063, "learning_rate": 9.999055935074887e-09, "loss": 0.3661, "step": 621 }, { "epoch": 4.9685471792311535, "grad_norm": 0.08077553977056519, "learning_rate": 5.624571489053488e-09, "loss": 0.3829, "step": 622 }, { "epoch": 4.976535197204194, "grad_norm": 0.08255739947277718, "learning_rate": 2.499842098901972e-09, "loss": 0.3842, "step": 623 }, { "epoch": 4.9845232151772345, "grad_norm": 0.08097383031020737, "learning_rate": 6.249654069989674e-10, "loss": 0.3817, "step": 624 }, { "epoch": 4.992511233150275, "grad_norm": 0.08105708331755175, "learning_rate": 0.0, "loss": 0.377, "step": 625 }, { "epoch": 4.992511233150275, "step": 625, "total_flos": 1.6083110655493669e+19, "train_loss": 0.47168621559143065, "train_runtime": 96267.4715, "train_samples_per_second": 3.329, "train_steps_per_second": 0.006 } ], "logging_steps": 1.0, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6083110655493669e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }