{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 492, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006097560975609756, "grad_norm": 13.761144051368902, "learning_rate": 4.0000000000000003e-07, "loss": 4.6438, "step": 1 }, { "epoch": 0.012195121951219513, "grad_norm": 13.183326369804801, "learning_rate": 8.000000000000001e-07, "loss": 4.8577, "step": 2 }, { "epoch": 0.018292682926829267, "grad_norm": 11.29732095469755, "learning_rate": 1.2000000000000002e-06, "loss": 4.7954, "step": 3 }, { "epoch": 0.024390243902439025, "grad_norm": 11.769209187889713, "learning_rate": 1.6000000000000001e-06, "loss": 4.7998, "step": 4 }, { "epoch": 0.03048780487804878, "grad_norm": 12.522749869416966, "learning_rate": 2.0000000000000003e-06, "loss": 4.7958, "step": 5 }, { "epoch": 0.036585365853658534, "grad_norm": 13.40825064276547, "learning_rate": 2.4000000000000003e-06, "loss": 4.5537, "step": 6 }, { "epoch": 0.042682926829268296, "grad_norm": 13.224745779198466, "learning_rate": 2.8000000000000003e-06, "loss": 4.5891, "step": 7 }, { "epoch": 0.04878048780487805, "grad_norm": 11.459531949346005, "learning_rate": 3.2000000000000003e-06, "loss": 4.8212, "step": 8 }, { "epoch": 0.054878048780487805, "grad_norm": 12.942263223893391, "learning_rate": 3.6000000000000003e-06, "loss": 4.6557, "step": 9 }, { "epoch": 0.06097560975609756, "grad_norm": 11.67916919497089, "learning_rate": 4.000000000000001e-06, "loss": 4.6643, "step": 10 }, { "epoch": 0.06707317073170732, "grad_norm": 11.256361402107963, "learning_rate": 4.4e-06, "loss": 4.6234, "step": 11 }, { "epoch": 0.07317073170731707, "grad_norm": 10.918667814241992, "learning_rate": 4.800000000000001e-06, "loss": 4.5428, "step": 12 }, { "epoch": 0.07926829268292683, "grad_norm": 11.18941706027207, "learning_rate": 5.2e-06, "loss": 4.1207, "step": 13 }, { "epoch": 0.08536585365853659, "grad_norm": 10.834833962041147, "learning_rate": 5.600000000000001e-06, "loss": 4.2727, "step": 14 }, { "epoch": 0.09146341463414634, "grad_norm": 11.193544605148698, "learning_rate": 6e-06, "loss": 4.3305, "step": 15 }, { "epoch": 0.0975609756097561, "grad_norm": 11.156213868367029, "learning_rate": 6.4000000000000006e-06, "loss": 3.855, "step": 16 }, { "epoch": 0.10365853658536585, "grad_norm": 11.03392463912042, "learning_rate": 6.800000000000001e-06, "loss": 3.8251, "step": 17 }, { "epoch": 0.10975609756097561, "grad_norm": 10.92516220698292, "learning_rate": 7.2000000000000005e-06, "loss": 3.6552, "step": 18 }, { "epoch": 0.11585365853658537, "grad_norm": 9.360934765475477, "learning_rate": 7.600000000000001e-06, "loss": 3.4844, "step": 19 }, { "epoch": 0.12195121951219512, "grad_norm": 8.698890724234088, "learning_rate": 8.000000000000001e-06, "loss": 3.4775, "step": 20 }, { "epoch": 0.12804878048780488, "grad_norm": 8.307940622795766, "learning_rate": 8.400000000000001e-06, "loss": 3.1046, "step": 21 }, { "epoch": 0.13414634146341464, "grad_norm": 7.873509354464809, "learning_rate": 8.8e-06, "loss": 2.8967, "step": 22 }, { "epoch": 0.1402439024390244, "grad_norm": 6.74058652993515, "learning_rate": 9.200000000000002e-06, "loss": 2.7398, "step": 23 }, { "epoch": 0.14634146341463414, "grad_norm": 5.6682482038936985, "learning_rate": 9.600000000000001e-06, "loss": 2.554, "step": 24 }, { "epoch": 0.1524390243902439, "grad_norm": 5.196466384583255, "learning_rate": 1e-05, "loss": 2.4104, "step": 25 }, { "epoch": 0.15853658536585366, "grad_norm": 4.379173110371979, "learning_rate": 1.04e-05, "loss": 2.0813, "step": 26 }, { "epoch": 0.16463414634146342, "grad_norm": 4.041770670660932, "learning_rate": 1.0800000000000002e-05, "loss": 2.1974, "step": 27 }, { "epoch": 0.17073170731707318, "grad_norm": 3.7890756018085083, "learning_rate": 1.1200000000000001e-05, "loss": 2.1635, "step": 28 }, { "epoch": 0.17682926829268292, "grad_norm": 2.760454794268313, "learning_rate": 1.16e-05, "loss": 1.7247, "step": 29 }, { "epoch": 0.18292682926829268, "grad_norm": 2.084883247086018, "learning_rate": 1.2e-05, "loss": 1.7248, "step": 30 }, { "epoch": 0.18902439024390244, "grad_norm": 1.8946267393458731, "learning_rate": 1.2400000000000002e-05, "loss": 1.6519, "step": 31 }, { "epoch": 0.1951219512195122, "grad_norm": 1.4683657417452, "learning_rate": 1.2800000000000001e-05, "loss": 1.5153, "step": 32 }, { "epoch": 0.20121951219512196, "grad_norm": 1.1510156850186328, "learning_rate": 1.3200000000000002e-05, "loss": 1.4752, "step": 33 }, { "epoch": 0.2073170731707317, "grad_norm": 0.9975006671404031, "learning_rate": 1.3600000000000002e-05, "loss": 1.3437, "step": 34 }, { "epoch": 0.21341463414634146, "grad_norm": 1.175487116789021, "learning_rate": 1.4e-05, "loss": 1.5412, "step": 35 }, { "epoch": 0.21951219512195122, "grad_norm": 1.0595320461986457, "learning_rate": 1.4400000000000001e-05, "loss": 1.3513, "step": 36 }, { "epoch": 0.22560975609756098, "grad_norm": 1.1536993931361366, "learning_rate": 1.48e-05, "loss": 1.387, "step": 37 }, { "epoch": 0.23170731707317074, "grad_norm": 1.166131895120981, "learning_rate": 1.5200000000000002e-05, "loss": 1.3192, "step": 38 }, { "epoch": 0.23780487804878048, "grad_norm": 1.1083262024444887, "learning_rate": 1.5600000000000003e-05, "loss": 1.3045, "step": 39 }, { "epoch": 0.24390243902439024, "grad_norm": 1.0376815768281262, "learning_rate": 1.6000000000000003e-05, "loss": 1.3058, "step": 40 }, { "epoch": 0.25, "grad_norm": 0.87640355368596, "learning_rate": 1.64e-05, "loss": 1.1079, "step": 41 }, { "epoch": 0.25609756097560976, "grad_norm": 0.8012590361351394, "learning_rate": 1.6800000000000002e-05, "loss": 1.0897, "step": 42 }, { "epoch": 0.2621951219512195, "grad_norm": 0.7274182856521663, "learning_rate": 1.72e-05, "loss": 1.1044, "step": 43 }, { "epoch": 0.2682926829268293, "grad_norm": 0.6432052930355101, "learning_rate": 1.76e-05, "loss": 1.071, "step": 44 }, { "epoch": 0.27439024390243905, "grad_norm": 0.6162901066533818, "learning_rate": 1.8e-05, "loss": 1.0739, "step": 45 }, { "epoch": 0.2804878048780488, "grad_norm": 0.4936365689365201, "learning_rate": 1.8400000000000003e-05, "loss": 0.9854, "step": 46 }, { "epoch": 0.2865853658536585, "grad_norm": 0.4413825753074836, "learning_rate": 1.88e-05, "loss": 0.9589, "step": 47 }, { "epoch": 0.2926829268292683, "grad_norm": 0.3835552689369408, "learning_rate": 1.9200000000000003e-05, "loss": 0.9157, "step": 48 }, { "epoch": 0.29878048780487804, "grad_norm": 0.36169118496626246, "learning_rate": 1.9600000000000002e-05, "loss": 0.9501, "step": 49 }, { "epoch": 0.3048780487804878, "grad_norm": 0.3201102977202935, "learning_rate": 2e-05, "loss": 0.8649, "step": 50 }, { "epoch": 0.31097560975609756, "grad_norm": 0.3301312685545821, "learning_rate": 1.9999747405795057e-05, "loss": 0.9233, "step": 51 }, { "epoch": 0.3170731707317073, "grad_norm": 0.31971021695823615, "learning_rate": 1.9998989635940996e-05, "loss": 0.8435, "step": 52 }, { "epoch": 0.3231707317073171, "grad_norm": 0.35515389044587536, "learning_rate": 1.9997726728719468e-05, "loss": 0.8589, "step": 53 }, { "epoch": 0.32926829268292684, "grad_norm": 0.36743148858881, "learning_rate": 1.9995958747931083e-05, "loss": 0.8576, "step": 54 }, { "epoch": 0.3353658536585366, "grad_norm": 0.42811375031790766, "learning_rate": 1.9993685782892184e-05, "loss": 0.9279, "step": 55 }, { "epoch": 0.34146341463414637, "grad_norm": 0.41721558479353726, "learning_rate": 1.9990907948430327e-05, "loss": 0.8907, "step": 56 }, { "epoch": 0.3475609756097561, "grad_norm": 0.35352961285727363, "learning_rate": 1.9987625384878493e-05, "loss": 0.8291, "step": 57 }, { "epoch": 0.35365853658536583, "grad_norm": 0.2804864451654209, "learning_rate": 1.998383825806799e-05, "loss": 0.7566, "step": 58 }, { "epoch": 0.3597560975609756, "grad_norm": 0.2882591085430372, "learning_rate": 1.997954675932006e-05, "loss": 0.8485, "step": 59 }, { "epoch": 0.36585365853658536, "grad_norm": 0.2607140485168168, "learning_rate": 1.9974751105436266e-05, "loss": 0.8366, "step": 60 }, { "epoch": 0.3719512195121951, "grad_norm": 0.23074985418491212, "learning_rate": 1.9969451538687474e-05, "loss": 0.8274, "step": 61 }, { "epoch": 0.3780487804878049, "grad_norm": 0.23223595784320752, "learning_rate": 1.9963648326801653e-05, "loss": 0.9039, "step": 62 }, { "epoch": 0.38414634146341464, "grad_norm": 0.17404598061817236, "learning_rate": 1.9957341762950346e-05, "loss": 0.6557, "step": 63 }, { "epoch": 0.3902439024390244, "grad_norm": 0.20998407134341585, "learning_rate": 1.9950532165733847e-05, "loss": 0.7985, "step": 64 }, { "epoch": 0.39634146341463417, "grad_norm": 0.194171035037221, "learning_rate": 1.9943219879165113e-05, "loss": 0.7393, "step": 65 }, { "epoch": 0.4024390243902439, "grad_norm": 0.19325266400118835, "learning_rate": 1.993540527265239e-05, "loss": 0.7448, "step": 66 }, { "epoch": 0.40853658536585363, "grad_norm": 0.22349087274155047, "learning_rate": 1.992708874098054e-05, "loss": 0.9037, "step": 67 }, { "epoch": 0.4146341463414634, "grad_norm": 0.1952833479528782, "learning_rate": 1.9918270704291104e-05, "loss": 0.7685, "step": 68 }, { "epoch": 0.42073170731707316, "grad_norm": 0.18405443762754753, "learning_rate": 1.9908951608061078e-05, "loss": 0.6956, "step": 69 }, { "epoch": 0.4268292682926829, "grad_norm": 0.18503738755792795, "learning_rate": 1.98991319230804e-05, "loss": 0.7063, "step": 70 }, { "epoch": 0.4329268292682927, "grad_norm": 0.19690421628538282, "learning_rate": 1.9888812145428172e-05, "loss": 0.7793, "step": 71 }, { "epoch": 0.43902439024390244, "grad_norm": 0.16284982763895423, "learning_rate": 1.9877992796447604e-05, "loss": 0.6833, "step": 72 }, { "epoch": 0.4451219512195122, "grad_norm": 0.14309181240903507, "learning_rate": 1.9866674422719666e-05, "loss": 0.6706, "step": 73 }, { "epoch": 0.45121951219512196, "grad_norm": 0.15477185844290706, "learning_rate": 1.9854857596035476e-05, "loss": 0.7312, "step": 74 }, { "epoch": 0.4573170731707317, "grad_norm": 0.1293717417561759, "learning_rate": 1.984254291336743e-05, "loss": 0.6589, "step": 75 }, { "epoch": 0.4634146341463415, "grad_norm": 0.12123882539222287, "learning_rate": 1.982973099683902e-05, "loss": 0.62, "step": 76 }, { "epoch": 0.4695121951219512, "grad_norm": 0.13924219962219428, "learning_rate": 1.9816422493693417e-05, "loss": 0.7501, "step": 77 }, { "epoch": 0.47560975609756095, "grad_norm": 0.11917935845470132, "learning_rate": 1.9802618076260784e-05, "loss": 0.6819, "step": 78 }, { "epoch": 0.4817073170731707, "grad_norm": 0.11800076531829735, "learning_rate": 1.9788318441924276e-05, "loss": 0.615, "step": 79 }, { "epoch": 0.4878048780487805, "grad_norm": 0.11971198977185014, "learning_rate": 1.9773524313084857e-05, "loss": 0.6414, "step": 80 }, { "epoch": 0.49390243902439024, "grad_norm": 0.13027819783391864, "learning_rate": 1.9758236437124768e-05, "loss": 0.6463, "step": 81 }, { "epoch": 0.5, "grad_norm": 0.12460980978522168, "learning_rate": 1.9742455586369786e-05, "loss": 0.6529, "step": 82 }, { "epoch": 0.5060975609756098, "grad_norm": 0.13756050372643888, "learning_rate": 1.972618255805019e-05, "loss": 0.7114, "step": 83 }, { "epoch": 0.5121951219512195, "grad_norm": 0.12522657328282585, "learning_rate": 1.9709418174260523e-05, "loss": 0.6289, "step": 84 }, { "epoch": 0.5182926829268293, "grad_norm": 0.12334790238705769, "learning_rate": 1.9692163281918016e-05, "loss": 0.6985, "step": 85 }, { "epoch": 0.524390243902439, "grad_norm": 0.10972400668603455, "learning_rate": 1.9674418752719835e-05, "loss": 0.6453, "step": 86 }, { "epoch": 0.5304878048780488, "grad_norm": 0.10309779127439034, "learning_rate": 1.9656185483099027e-05, "loss": 0.6347, "step": 87 }, { "epoch": 0.5365853658536586, "grad_norm": 0.10717651442117264, "learning_rate": 1.963746439417924e-05, "loss": 0.6389, "step": 88 }, { "epoch": 0.5426829268292683, "grad_norm": 0.10108518878859295, "learning_rate": 1.961825643172819e-05, "loss": 0.6449, "step": 89 }, { "epoch": 0.5487804878048781, "grad_norm": 0.10582770991792313, "learning_rate": 1.959856256610988e-05, "loss": 0.6407, "step": 90 }, { "epoch": 0.5548780487804879, "grad_norm": 0.09805540518718314, "learning_rate": 1.9578383792235573e-05, "loss": 0.6146, "step": 91 }, { "epoch": 0.5609756097560976, "grad_norm": 0.09178135236393883, "learning_rate": 1.9557721129513538e-05, "loss": 0.5477, "step": 92 }, { "epoch": 0.5670731707317073, "grad_norm": 0.09658702034736838, "learning_rate": 1.9536575621797546e-05, "loss": 0.5892, "step": 93 }, { "epoch": 0.573170731707317, "grad_norm": 0.09736462601933246, "learning_rate": 1.9514948337334144e-05, "loss": 0.6138, "step": 94 }, { "epoch": 0.5792682926829268, "grad_norm": 0.08905830955745823, "learning_rate": 1.9492840368708668e-05, "loss": 0.5399, "step": 95 }, { "epoch": 0.5853658536585366, "grad_norm": 0.09660552709108973, "learning_rate": 1.947025283279008e-05, "loss": 0.6364, "step": 96 }, { "epoch": 0.5914634146341463, "grad_norm": 0.09133915004182258, "learning_rate": 1.9447186870674505e-05, "loss": 0.5921, "step": 97 }, { "epoch": 0.5975609756097561, "grad_norm": 0.0933997331456134, "learning_rate": 1.9423643647627625e-05, "loss": 0.6915, "step": 98 }, { "epoch": 0.6036585365853658, "grad_norm": 0.08353569640772877, "learning_rate": 1.9399624353025774e-05, "loss": 0.6408, "step": 99 }, { "epoch": 0.6097560975609756, "grad_norm": 0.08634151989354441, "learning_rate": 1.937513020029588e-05, "loss": 0.5963, "step": 100 }, { "epoch": 0.6158536585365854, "grad_norm": 0.08898929542438962, "learning_rate": 1.9350162426854152e-05, "loss": 0.595, "step": 101 }, { "epoch": 0.6219512195121951, "grad_norm": 0.08305460595097429, "learning_rate": 1.932472229404356e-05, "loss": 0.5669, "step": 102 }, { "epoch": 0.6280487804878049, "grad_norm": 0.08888872021259317, "learning_rate": 1.9298811087070134e-05, "loss": 0.6165, "step": 103 }, { "epoch": 0.6341463414634146, "grad_norm": 0.08058144202586265, "learning_rate": 1.9272430114938018e-05, "loss": 0.5728, "step": 104 }, { "epoch": 0.6402439024390244, "grad_norm": 0.08217790638268045, "learning_rate": 1.9245580710383344e-05, "loss": 0.577, "step": 105 }, { "epoch": 0.6463414634146342, "grad_norm": 0.07659807407519503, "learning_rate": 1.9218264229806917e-05, "loss": 0.5881, "step": 106 }, { "epoch": 0.6524390243902439, "grad_norm": 0.07540223196226505, "learning_rate": 1.9190482053205673e-05, "loss": 0.62, "step": 107 }, { "epoch": 0.6585365853658537, "grad_norm": 0.08107411301661235, "learning_rate": 1.9162235584102973e-05, "loss": 0.6488, "step": 108 }, { "epoch": 0.6646341463414634, "grad_norm": 0.07719107791626204, "learning_rate": 1.91335262494777e-05, "loss": 0.5771, "step": 109 }, { "epoch": 0.6707317073170732, "grad_norm": 0.08173053132540807, "learning_rate": 1.9104355499692166e-05, "loss": 0.5666, "step": 110 }, { "epoch": 0.676829268292683, "grad_norm": 0.07965621160015979, "learning_rate": 1.9074724808418837e-05, "loss": 0.6113, "step": 111 }, { "epoch": 0.6829268292682927, "grad_norm": 0.08980818058271649, "learning_rate": 1.9044635672565898e-05, "loss": 0.6089, "step": 112 }, { "epoch": 0.6890243902439024, "grad_norm": 0.07194337673119468, "learning_rate": 1.9014089612201612e-05, "loss": 0.5728, "step": 113 }, { "epoch": 0.6951219512195121, "grad_norm": 0.08706992381814065, "learning_rate": 1.8983088170477556e-05, "loss": 0.7144, "step": 114 }, { "epoch": 0.7012195121951219, "grad_norm": 0.06652366663030163, "learning_rate": 1.8951632913550625e-05, "loss": 0.5026, "step": 115 }, { "epoch": 0.7073170731707317, "grad_norm": 0.07192252823061802, "learning_rate": 1.8919725430503946e-05, "loss": 0.5533, "step": 116 }, { "epoch": 0.7134146341463414, "grad_norm": 0.08014714412171235, "learning_rate": 1.888736733326658e-05, "loss": 0.6077, "step": 117 }, { "epoch": 0.7195121951219512, "grad_norm": 0.0751151009465322, "learning_rate": 1.8854560256532098e-05, "loss": 0.5554, "step": 118 }, { "epoch": 0.725609756097561, "grad_norm": 0.08384104993084439, "learning_rate": 1.8821305857675997e-05, "loss": 0.6079, "step": 119 }, { "epoch": 0.7317073170731707, "grad_norm": 0.07596092975802397, "learning_rate": 1.8787605816671956e-05, "loss": 0.6262, "step": 120 }, { "epoch": 0.7378048780487805, "grad_norm": 0.06984378368031652, "learning_rate": 1.875346183600699e-05, "loss": 0.5579, "step": 121 }, { "epoch": 0.7439024390243902, "grad_norm": 0.06972708877396938, "learning_rate": 1.8718875640595432e-05, "loss": 0.5568, "step": 122 }, { "epoch": 0.75, "grad_norm": 0.0708625905901818, "learning_rate": 1.8683848977691784e-05, "loss": 0.582, "step": 123 }, { "epoch": 0.7560975609756098, "grad_norm": 0.07712676436551813, "learning_rate": 1.864838361680247e-05, "loss": 0.5935, "step": 124 }, { "epoch": 0.7621951219512195, "grad_norm": 0.06823828416067228, "learning_rate": 1.8612481349596406e-05, "loss": 0.5503, "step": 125 }, { "epoch": 0.7682926829268293, "grad_norm": 0.07681189237975657, "learning_rate": 1.8576143989814524e-05, "loss": 0.6412, "step": 126 }, { "epoch": 0.774390243902439, "grad_norm": 0.06850293773437466, "learning_rate": 1.8539373373178126e-05, "loss": 0.5771, "step": 127 }, { "epoch": 0.7804878048780488, "grad_norm": 0.06791710528546226, "learning_rate": 1.8502171357296144e-05, "loss": 0.6076, "step": 128 }, { "epoch": 0.7865853658536586, "grad_norm": 0.06599767271998445, "learning_rate": 1.8464539821571302e-05, "loss": 0.5583, "step": 129 }, { "epoch": 0.7926829268292683, "grad_norm": 0.07021764659304032, "learning_rate": 1.8426480667105178e-05, "loss": 0.5439, "step": 130 }, { "epoch": 0.7987804878048781, "grad_norm": 0.06809097796108884, "learning_rate": 1.8387995816602137e-05, "loss": 0.5584, "step": 131 }, { "epoch": 0.8048780487804879, "grad_norm": 0.07552768082187959, "learning_rate": 1.8349087214272222e-05, "loss": 0.6235, "step": 132 }, { "epoch": 0.8109756097560976, "grad_norm": 0.07388542257010466, "learning_rate": 1.830975682573293e-05, "loss": 0.5605, "step": 133 }, { "epoch": 0.8170731707317073, "grad_norm": 0.0734139769106561, "learning_rate": 1.8270006637909907e-05, "loss": 0.4911, "step": 134 }, { "epoch": 0.823170731707317, "grad_norm": 0.06661902834297227, "learning_rate": 1.8229838658936566e-05, "loss": 0.5263, "step": 135 }, { "epoch": 0.8292682926829268, "grad_norm": 0.08000530324170357, "learning_rate": 1.818925491805265e-05, "loss": 0.6063, "step": 136 }, { "epoch": 0.8353658536585366, "grad_norm": 0.06955587209390625, "learning_rate": 1.8148257465501718e-05, "loss": 0.5664, "step": 137 }, { "epoch": 0.8414634146341463, "grad_norm": 0.06999764411415345, "learning_rate": 1.810684837242755e-05, "loss": 0.5731, "step": 138 }, { "epoch": 0.8475609756097561, "grad_norm": 0.07392487537186451, "learning_rate": 1.8065029730769534e-05, "loss": 0.5771, "step": 139 }, { "epoch": 0.8536585365853658, "grad_norm": 0.07023462293275694, "learning_rate": 1.8022803653156983e-05, "loss": 0.5586, "step": 140 }, { "epoch": 0.8597560975609756, "grad_norm": 0.0754370714846295, "learning_rate": 1.7980172272802398e-05, "loss": 0.5386, "step": 141 }, { "epoch": 0.8658536585365854, "grad_norm": 0.06014064520411485, "learning_rate": 1.7937137743393695e-05, "loss": 0.5019, "step": 142 }, { "epoch": 0.8719512195121951, "grad_norm": 0.0684039280130895, "learning_rate": 1.7893702238985433e-05, "loss": 0.5593, "step": 143 }, { "epoch": 0.8780487804878049, "grad_norm": 0.07523983909087964, "learning_rate": 1.784986795388895e-05, "loss": 0.608, "step": 144 }, { "epoch": 0.8841463414634146, "grad_norm": 0.06631906454003386, "learning_rate": 1.7805637102561516e-05, "loss": 0.5496, "step": 145 }, { "epoch": 0.8902439024390244, "grad_norm": 0.06861615150079985, "learning_rate": 1.776101191949449e-05, "loss": 0.543, "step": 146 }, { "epoch": 0.8963414634146342, "grad_norm": 0.06396979787588344, "learning_rate": 1.771599465910039e-05, "loss": 0.565, "step": 147 }, { "epoch": 0.9024390243902439, "grad_norm": 0.06363428283758014, "learning_rate": 1.7670587595599034e-05, "loss": 0.5657, "step": 148 }, { "epoch": 0.9085365853658537, "grad_norm": 0.06397911394577013, "learning_rate": 1.7624793022902648e-05, "loss": 0.5343, "step": 149 }, { "epoch": 0.9146341463414634, "grad_norm": 0.06974823526096927, "learning_rate": 1.757861325449997e-05, "loss": 0.5022, "step": 150 }, { "epoch": 0.9207317073170732, "grad_norm": 0.06210202598875651, "learning_rate": 1.753205062333937e-05, "loss": 0.486, "step": 151 }, { "epoch": 0.926829268292683, "grad_norm": 0.07077403367537935, "learning_rate": 1.7485107481711014e-05, "loss": 0.4869, "step": 152 }, { "epoch": 0.9329268292682927, "grad_norm": 0.06531115051828462, "learning_rate": 1.7437786201128003e-05, "loss": 0.5544, "step": 153 }, { "epoch": 0.9390243902439024, "grad_norm": 0.0705614281634583, "learning_rate": 1.7390089172206594e-05, "loss": 0.5951, "step": 154 }, { "epoch": 0.9451219512195121, "grad_norm": 0.08044467420017791, "learning_rate": 1.73420188045454e-05, "loss": 0.5882, "step": 155 }, { "epoch": 0.9512195121951219, "grad_norm": 0.06778031094228984, "learning_rate": 1.7293577526603684e-05, "loss": 0.5307, "step": 156 }, { "epoch": 0.9573170731707317, "grad_norm": 0.07735829065491621, "learning_rate": 1.724476778557866e-05, "loss": 0.5803, "step": 157 }, { "epoch": 0.9634146341463414, "grad_norm": 0.06740940804822154, "learning_rate": 1.719559204728188e-05, "loss": 0.517, "step": 158 }, { "epoch": 0.9695121951219512, "grad_norm": 0.07639960491412072, "learning_rate": 1.7146052796014646e-05, "loss": 0.5753, "step": 159 }, { "epoch": 0.975609756097561, "grad_norm": 0.061081285281898906, "learning_rate": 1.7096152534442515e-05, "loss": 0.4686, "step": 160 }, { "epoch": 0.9817073170731707, "grad_norm": 0.06793168030335978, "learning_rate": 1.704589378346886e-05, "loss": 0.5447, "step": 161 }, { "epoch": 0.9878048780487805, "grad_norm": 0.07395695811692952, "learning_rate": 1.6995279082107537e-05, "loss": 0.5657, "step": 162 }, { "epoch": 0.9939024390243902, "grad_norm": 0.065555783375453, "learning_rate": 1.6944310987354597e-05, "loss": 0.5449, "step": 163 }, { "epoch": 1.0, "grad_norm": 0.07050013910319028, "learning_rate": 1.689299207405911e-05, "loss": 0.5184, "step": 164 }, { "epoch": 1.0060975609756098, "grad_norm": 0.0733901169113276, "learning_rate": 1.6841324934793096e-05, "loss": 0.5226, "step": 165 }, { "epoch": 1.0121951219512195, "grad_norm": 0.06589566948346295, "learning_rate": 1.678931217972055e-05, "loss": 0.4873, "step": 166 }, { "epoch": 1.0182926829268293, "grad_norm": 0.07475734276172114, "learning_rate": 1.6736956436465573e-05, "loss": 0.4827, "step": 167 }, { "epoch": 1.024390243902439, "grad_norm": 0.058903086819527835, "learning_rate": 1.6684260349979637e-05, "loss": 0.5053, "step": 168 }, { "epoch": 1.0304878048780488, "grad_norm": 0.06353582735567607, "learning_rate": 1.6631226582407954e-05, "loss": 0.5482, "step": 169 }, { "epoch": 1.0365853658536586, "grad_norm": 0.06380787517800868, "learning_rate": 1.6577857812954994e-05, "loss": 0.5248, "step": 170 }, { "epoch": 1.0426829268292683, "grad_norm": 0.06730058327208745, "learning_rate": 1.6524156737749132e-05, "loss": 0.4964, "step": 171 }, { "epoch": 1.048780487804878, "grad_norm": 0.06293892448460658, "learning_rate": 1.6470126069706456e-05, "loss": 0.5168, "step": 172 }, { "epoch": 1.0548780487804879, "grad_norm": 0.0694624094267741, "learning_rate": 1.641576853839369e-05, "loss": 0.5526, "step": 173 }, { "epoch": 1.0609756097560976, "grad_norm": 0.06478295672497261, "learning_rate": 1.6361086889890307e-05, "loss": 0.4853, "step": 174 }, { "epoch": 1.0670731707317074, "grad_norm": 0.06608027299921394, "learning_rate": 1.6306083886649823e-05, "loss": 0.5226, "step": 175 }, { "epoch": 1.0731707317073171, "grad_norm": 0.06681662898135733, "learning_rate": 1.6250762307360206e-05, "loss": 0.537, "step": 176 }, { "epoch": 1.079268292682927, "grad_norm": 0.06053711484659685, "learning_rate": 1.6195124946803527e-05, "loss": 0.4683, "step": 177 }, { "epoch": 1.0853658536585367, "grad_norm": 0.07013371267663553, "learning_rate": 1.6139174615714753e-05, "loss": 0.5767, "step": 178 }, { "epoch": 1.0914634146341464, "grad_norm": 0.06617676868427722, "learning_rate": 1.6082914140639768e-05, "loss": 0.5357, "step": 179 }, { "epoch": 1.0975609756097562, "grad_norm": 0.06805254845313483, "learning_rate": 1.6026346363792565e-05, "loss": 0.5179, "step": 180 }, { "epoch": 1.103658536585366, "grad_norm": 0.06882718691143014, "learning_rate": 1.596947414291167e-05, "loss": 0.5665, "step": 181 }, { "epoch": 1.1097560975609757, "grad_norm": 0.06329003072823183, "learning_rate": 1.591230035111576e-05, "loss": 0.512, "step": 182 }, { "epoch": 1.1158536585365855, "grad_norm": 0.06713658217392786, "learning_rate": 1.5854827876758535e-05, "loss": 0.4958, "step": 183 }, { "epoch": 1.1219512195121952, "grad_norm": 0.0677868436901709, "learning_rate": 1.5797059623282787e-05, "loss": 0.4715, "step": 184 }, { "epoch": 1.1280487804878048, "grad_norm": 0.06613758460632664, "learning_rate": 1.573899850907373e-05, "loss": 0.4829, "step": 185 }, { "epoch": 1.1341463414634148, "grad_norm": 0.06887716201911288, "learning_rate": 1.568064746731156e-05, "loss": 0.5418, "step": 186 }, { "epoch": 1.1402439024390243, "grad_norm": 0.07682982503987941, "learning_rate": 1.5622009445823274e-05, "loss": 0.5929, "step": 187 }, { "epoch": 1.146341463414634, "grad_norm": 0.06571420924574008, "learning_rate": 1.5563087406933762e-05, "loss": 0.511, "step": 188 }, { "epoch": 1.1524390243902438, "grad_norm": 0.0664511649725902, "learning_rate": 1.550388432731613e-05, "loss": 0.4558, "step": 189 }, { "epoch": 1.1585365853658536, "grad_norm": 0.07492574855512298, "learning_rate": 1.5444403197841345e-05, "loss": 0.5396, "step": 190 }, { "epoch": 1.1646341463414633, "grad_norm": 0.07122982585751268, "learning_rate": 1.5384647023427136e-05, "loss": 0.5301, "step": 191 }, { "epoch": 1.170731707317073, "grad_norm": 0.0658921691477124, "learning_rate": 1.5324618822886167e-05, "loss": 0.4947, "step": 192 }, { "epoch": 1.1768292682926829, "grad_norm": 0.07813967262256015, "learning_rate": 1.526432162877356e-05, "loss": 0.5522, "step": 193 }, { "epoch": 1.1829268292682926, "grad_norm": 0.06731988936901052, "learning_rate": 1.5203758487233677e-05, "loss": 0.476, "step": 194 }, { "epoch": 1.1890243902439024, "grad_norm": 0.07228505085779448, "learning_rate": 1.514293245784623e-05, "loss": 0.5278, "step": 195 }, { "epoch": 1.1951219512195121, "grad_norm": 0.07452980948980172, "learning_rate": 1.5081846613471736e-05, "loss": 0.5773, "step": 196 }, { "epoch": 1.201219512195122, "grad_norm": 0.06955858620563475, "learning_rate": 1.5020504040096241e-05, "loss": 0.5147, "step": 197 }, { "epoch": 1.2073170731707317, "grad_norm": 0.07065384450910228, "learning_rate": 1.4958907836675467e-05, "loss": 0.5275, "step": 198 }, { "epoch": 1.2134146341463414, "grad_norm": 0.07110195242202547, "learning_rate": 1.489706111497821e-05, "loss": 0.4819, "step": 199 }, { "epoch": 1.2195121951219512, "grad_norm": 0.06820779355050262, "learning_rate": 1.4834966999429179e-05, "loss": 0.521, "step": 200 }, { "epoch": 1.225609756097561, "grad_norm": 0.06964782085920465, "learning_rate": 1.4772628626951114e-05, "loss": 0.5234, "step": 201 }, { "epoch": 1.2317073170731707, "grad_norm": 0.06930582742745629, "learning_rate": 1.4710049146806348e-05, "loss": 0.4911, "step": 202 }, { "epoch": 1.2378048780487805, "grad_norm": 0.06741877286597113, "learning_rate": 1.4647231720437687e-05, "loss": 0.5215, "step": 203 }, { "epoch": 1.2439024390243902, "grad_norm": 0.06792423855223992, "learning_rate": 1.4584179521308703e-05, "loss": 0.5117, "step": 204 }, { "epoch": 1.25, "grad_norm": 0.07167343063555995, "learning_rate": 1.4520895734743419e-05, "loss": 0.538, "step": 205 }, { "epoch": 1.2560975609756098, "grad_norm": 0.07256282666966574, "learning_rate": 1.4457383557765385e-05, "loss": 0.5529, "step": 206 }, { "epoch": 1.2621951219512195, "grad_norm": 0.06757284467612035, "learning_rate": 1.4393646198936169e-05, "loss": 0.4892, "step": 207 }, { "epoch": 1.2682926829268293, "grad_norm": 0.07188557520197532, "learning_rate": 1.4329686878193271e-05, "loss": 0.5602, "step": 208 }, { "epoch": 1.274390243902439, "grad_norm": 0.07747770661719462, "learning_rate": 1.4265508826687442e-05, "loss": 0.5658, "step": 209 }, { "epoch": 1.2804878048780488, "grad_norm": 0.06809716748651344, "learning_rate": 1.4201115286619464e-05, "loss": 0.4713, "step": 210 }, { "epoch": 1.2865853658536586, "grad_norm": 0.07448043410586613, "learning_rate": 1.4136509511076347e-05, "loss": 0.5311, "step": 211 }, { "epoch": 1.2926829268292683, "grad_norm": 0.08085706529770824, "learning_rate": 1.4071694763866988e-05, "loss": 0.5617, "step": 212 }, { "epoch": 1.298780487804878, "grad_norm": 0.0728263083382011, "learning_rate": 1.4006674319357298e-05, "loss": 0.4792, "step": 213 }, { "epoch": 1.3048780487804879, "grad_norm": 0.0670268791274602, "learning_rate": 1.3941451462304778e-05, "loss": 0.4675, "step": 214 }, { "epoch": 1.3109756097560976, "grad_norm": 0.08039512209656642, "learning_rate": 1.387602948769257e-05, "loss": 0.5056, "step": 215 }, { "epoch": 1.3170731707317074, "grad_norm": 0.06817800650730356, "learning_rate": 1.3810411700563005e-05, "loss": 0.4739, "step": 216 }, { "epoch": 1.3231707317073171, "grad_norm": 0.07057112884463199, "learning_rate": 1.3744601415850637e-05, "loss": 0.5573, "step": 217 }, { "epoch": 1.329268292682927, "grad_norm": 0.06981321032667759, "learning_rate": 1.3678601958214779e-05, "loss": 0.5014, "step": 218 }, { "epoch": 1.3353658536585367, "grad_norm": 0.06993357941167098, "learning_rate": 1.3612416661871532e-05, "loss": 0.524, "step": 219 }, { "epoch": 1.3414634146341464, "grad_norm": 0.06483182405879527, "learning_rate": 1.3546048870425356e-05, "loss": 0.4806, "step": 220 }, { "epoch": 1.3475609756097562, "grad_norm": 0.07027433088183081, "learning_rate": 1.3479501936700161e-05, "loss": 0.4944, "step": 221 }, { "epoch": 1.3536585365853657, "grad_norm": 0.08129330060634665, "learning_rate": 1.3412779222569907e-05, "loss": 0.5541, "step": 222 }, { "epoch": 1.3597560975609757, "grad_norm": 0.06825577692381518, "learning_rate": 1.3345884098788775e-05, "loss": 0.473, "step": 223 }, { "epoch": 1.3658536585365852, "grad_norm": 0.06613216751504289, "learning_rate": 1.3278819944820893e-05, "loss": 0.4318, "step": 224 }, { "epoch": 1.3719512195121952, "grad_norm": 0.07283827698992242, "learning_rate": 1.3211590148669586e-05, "loss": 0.5125, "step": 225 }, { "epoch": 1.3780487804878048, "grad_norm": 0.06892969965848932, "learning_rate": 1.314419810670624e-05, "loss": 0.4533, "step": 226 }, { "epoch": 1.3841463414634148, "grad_norm": 0.07855732769188979, "learning_rate": 1.3076647223498703e-05, "loss": 0.5461, "step": 227 }, { "epoch": 1.3902439024390243, "grad_norm": 0.07382325404677605, "learning_rate": 1.3008940911639302e-05, "loss": 0.4379, "step": 228 }, { "epoch": 1.3963414634146343, "grad_norm": 0.07190670195425049, "learning_rate": 1.2941082591572443e-05, "loss": 0.533, "step": 229 }, { "epoch": 1.4024390243902438, "grad_norm": 0.06888657163333817, "learning_rate": 1.2873075691421808e-05, "loss": 0.5146, "step": 230 }, { "epoch": 1.4085365853658536, "grad_norm": 0.06879068933807653, "learning_rate": 1.2804923646817169e-05, "loss": 0.542, "step": 231 }, { "epoch": 1.4146341463414633, "grad_norm": 0.06316663754312257, "learning_rate": 1.2736629900720832e-05, "loss": 0.4763, "step": 232 }, { "epoch": 1.420731707317073, "grad_norm": 0.0745177281343944, "learning_rate": 1.2668197903253694e-05, "loss": 0.5063, "step": 233 }, { "epoch": 1.4268292682926829, "grad_norm": 0.07549236104105322, "learning_rate": 1.2599631111520956e-05, "loss": 0.4871, "step": 234 }, { "epoch": 1.4329268292682926, "grad_norm": 0.07608890942436555, "learning_rate": 1.2530932989437463e-05, "loss": 0.5216, "step": 235 }, { "epoch": 1.4390243902439024, "grad_norm": 0.08852428564425496, "learning_rate": 1.2462107007552726e-05, "loss": 0.5814, "step": 236 }, { "epoch": 1.4451219512195121, "grad_norm": 0.07169927489263321, "learning_rate": 1.2393156642875579e-05, "loss": 0.5097, "step": 237 }, { "epoch": 1.451219512195122, "grad_norm": 0.0714844858843735, "learning_rate": 1.2324085378698529e-05, "loss": 0.4943, "step": 238 }, { "epoch": 1.4573170731707317, "grad_norm": 0.07303490526979263, "learning_rate": 1.2254896704421789e-05, "loss": 0.5254, "step": 239 }, { "epoch": 1.4634146341463414, "grad_norm": 0.07649861873490388, "learning_rate": 1.2185594115376991e-05, "loss": 0.4628, "step": 240 }, { "epoch": 1.4695121951219512, "grad_norm": 0.07968432654727967, "learning_rate": 1.211618111265061e-05, "loss": 0.5311, "step": 241 }, { "epoch": 1.475609756097561, "grad_norm": 0.08164340764032027, "learning_rate": 1.2046661202907101e-05, "loss": 0.5082, "step": 242 }, { "epoch": 1.4817073170731707, "grad_norm": 0.07536455518754334, "learning_rate": 1.1977037898211723e-05, "loss": 0.4963, "step": 243 }, { "epoch": 1.4878048780487805, "grad_norm": 0.07838988741024766, "learning_rate": 1.1907314715853138e-05, "loss": 0.4964, "step": 244 }, { "epoch": 1.4939024390243902, "grad_norm": 0.07681652298019655, "learning_rate": 1.1837495178165706e-05, "loss": 0.531, "step": 245 }, { "epoch": 1.5, "grad_norm": 0.07585351296872528, "learning_rate": 1.176758281235155e-05, "loss": 0.4971, "step": 246 }, { "epoch": 1.5060975609756098, "grad_norm": 0.07391818296648663, "learning_rate": 1.1697581150302362e-05, "loss": 0.5189, "step": 247 }, { "epoch": 1.5121951219512195, "grad_norm": 0.07240466805493384, "learning_rate": 1.1627493728420978e-05, "loss": 0.4696, "step": 248 }, { "epoch": 1.5182926829268293, "grad_norm": 0.07336052530563156, "learning_rate": 1.1557324087442719e-05, "loss": 0.5158, "step": 249 }, { "epoch": 1.524390243902439, "grad_norm": 0.0734922713913184, "learning_rate": 1.1487075772256517e-05, "loss": 0.5013, "step": 250 }, { "epoch": 1.5304878048780488, "grad_norm": 0.07041146771920755, "learning_rate": 1.1416752331725842e-05, "loss": 0.4925, "step": 251 }, { "epoch": 1.5365853658536586, "grad_norm": 0.07749242303496245, "learning_rate": 1.1346357318509395e-05, "loss": 0.5115, "step": 252 }, { "epoch": 1.5426829268292683, "grad_norm": 0.06688029575585173, "learning_rate": 1.1275894288881664e-05, "loss": 0.434, "step": 253 }, { "epoch": 1.548780487804878, "grad_norm": 0.07880736062509318, "learning_rate": 1.1205366802553231e-05, "loss": 0.513, "step": 254 }, { "epoch": 1.5548780487804879, "grad_norm": 0.07925969227352526, "learning_rate": 1.1134778422490971e-05, "loss": 0.5467, "step": 255 }, { "epoch": 1.5609756097560976, "grad_norm": 0.07331952124747425, "learning_rate": 1.1064132714738024e-05, "loss": 0.5394, "step": 256 }, { "epoch": 1.5670731707317072, "grad_norm": 0.06881324378789994, "learning_rate": 1.0993433248233672e-05, "loss": 0.481, "step": 257 }, { "epoch": 1.5731707317073171, "grad_norm": 0.0760805406679711, "learning_rate": 1.092268359463302e-05, "loss": 0.4998, "step": 258 }, { "epoch": 1.5792682926829267, "grad_norm": 0.0723622186584405, "learning_rate": 1.0851887328126569e-05, "loss": 0.4989, "step": 259 }, { "epoch": 1.5853658536585367, "grad_norm": 0.0691805814488331, "learning_rate": 1.0781048025259648e-05, "loss": 0.4491, "step": 260 }, { "epoch": 1.5914634146341462, "grad_norm": 0.07332736517126122, "learning_rate": 1.0710169264751733e-05, "loss": 0.4767, "step": 261 }, { "epoch": 1.5975609756097562, "grad_norm": 0.07503167590781622, "learning_rate": 1.0639254627315658e-05, "loss": 0.5108, "step": 262 }, { "epoch": 1.6036585365853657, "grad_norm": 0.07676152916231047, "learning_rate": 1.0568307695476712e-05, "loss": 0.5324, "step": 263 }, { "epoch": 1.6097560975609757, "grad_norm": 0.0834575992691424, "learning_rate": 1.049733205339167e-05, "loss": 0.5628, "step": 264 }, { "epoch": 1.6158536585365852, "grad_norm": 0.07979828063308889, "learning_rate": 1.0426331286667701e-05, "loss": 0.5017, "step": 265 }, { "epoch": 1.6219512195121952, "grad_norm": 0.07227807983837574, "learning_rate": 1.0355308982181254e-05, "loss": 0.4286, "step": 266 }, { "epoch": 1.6280487804878048, "grad_norm": 0.08033151020781615, "learning_rate": 1.0284268727896833e-05, "loss": 0.4991, "step": 267 }, { "epoch": 1.6341463414634148, "grad_norm": 0.07726754814987509, "learning_rate": 1.0213214112685747e-05, "loss": 0.5663, "step": 268 }, { "epoch": 1.6402439024390243, "grad_norm": 0.06975489529697236, "learning_rate": 1.0142148726144807e-05, "loss": 0.4509, "step": 269 }, { "epoch": 1.6463414634146343, "grad_norm": 0.08607941878366727, "learning_rate": 1.0071076158414977e-05, "loss": 0.5012, "step": 270 }, { "epoch": 1.6524390243902438, "grad_norm": 0.07924808288315173, "learning_rate": 1e-05, "loss": 0.4968, "step": 271 }, { "epoch": 1.6585365853658538, "grad_norm": 0.07521641587131223, "learning_rate": 9.928923841585025e-06, "loss": 0.5333, "step": 272 }, { "epoch": 1.6646341463414633, "grad_norm": 0.0810914472998851, "learning_rate": 9.857851273855195e-06, "loss": 0.5256, "step": 273 }, { "epoch": 1.6707317073170733, "grad_norm": 0.07570611716859411, "learning_rate": 9.786785887314255e-06, "loss": 0.4844, "step": 274 }, { "epoch": 1.6768292682926829, "grad_norm": 0.08692043191937647, "learning_rate": 9.715731272103172e-06, "loss": 0.55, "step": 275 }, { "epoch": 1.6829268292682928, "grad_norm": 0.07549433485077096, "learning_rate": 9.644691017818752e-06, "loss": 0.4599, "step": 276 }, { "epoch": 1.6890243902439024, "grad_norm": 0.07013095295478262, "learning_rate": 9.573668713332305e-06, "loss": 0.4641, "step": 277 }, { "epoch": 1.6951219512195121, "grad_norm": 0.085732485667367, "learning_rate": 9.502667946608332e-06, "loss": 0.5409, "step": 278 }, { "epoch": 1.701219512195122, "grad_norm": 0.0786732581180552, "learning_rate": 9.43169230452329e-06, "loss": 0.5047, "step": 279 }, { "epoch": 1.7073170731707317, "grad_norm": 0.06973904067642213, "learning_rate": 9.360745372684346e-06, "loss": 0.4611, "step": 280 }, { "epoch": 1.7134146341463414, "grad_norm": 0.07149507221102347, "learning_rate": 9.289830735248269e-06, "loss": 0.5249, "step": 281 }, { "epoch": 1.7195121951219512, "grad_norm": 0.07598878917991338, "learning_rate": 9.218951974740354e-06, "loss": 0.53, "step": 282 }, { "epoch": 1.725609756097561, "grad_norm": 0.07880251948989025, "learning_rate": 9.148112671873433e-06, "loss": 0.5195, "step": 283 }, { "epoch": 1.7317073170731707, "grad_norm": 0.08006942318123801, "learning_rate": 9.07731640536698e-06, "loss": 0.4935, "step": 284 }, { "epoch": 1.7378048780487805, "grad_norm": 0.07419436980709716, "learning_rate": 9.00656675176633e-06, "loss": 0.5, "step": 285 }, { "epoch": 1.7439024390243902, "grad_norm": 0.07335821062357814, "learning_rate": 8.935867285261977e-06, "loss": 0.4689, "step": 286 }, { "epoch": 1.75, "grad_norm": 0.08300432948525868, "learning_rate": 8.865221577509034e-06, "loss": 0.5499, "step": 287 }, { "epoch": 1.7560975609756098, "grad_norm": 0.07414556420580612, "learning_rate": 8.79463319744677e-06, "loss": 0.5016, "step": 288 }, { "epoch": 1.7621951219512195, "grad_norm": 0.0777082606308814, "learning_rate": 8.724105711118342e-06, "loss": 0.5094, "step": 289 }, { "epoch": 1.7682926829268293, "grad_norm": 0.07879787724881869, "learning_rate": 8.653642681490608e-06, "loss": 0.504, "step": 290 }, { "epoch": 1.774390243902439, "grad_norm": 0.07782466491724375, "learning_rate": 8.583247668274163e-06, "loss": 0.4871, "step": 291 }, { "epoch": 1.7804878048780488, "grad_norm": 0.07959753822335783, "learning_rate": 8.512924227743482e-06, "loss": 0.4637, "step": 292 }, { "epoch": 1.7865853658536586, "grad_norm": 0.08403574176326123, "learning_rate": 8.442675912557281e-06, "loss": 0.4978, "step": 293 }, { "epoch": 1.7926829268292683, "grad_norm": 0.07506838029170206, "learning_rate": 8.372506271579022e-06, "loss": 0.4801, "step": 294 }, { "epoch": 1.798780487804878, "grad_norm": 0.08007620776198685, "learning_rate": 8.30241884969764e-06, "loss": 0.5467, "step": 295 }, { "epoch": 1.8048780487804879, "grad_norm": 0.07302138656473144, "learning_rate": 8.232417187648454e-06, "loss": 0.4591, "step": 296 }, { "epoch": 1.8109756097560976, "grad_norm": 0.07968477173539414, "learning_rate": 8.162504821834296e-06, "loss": 0.4869, "step": 297 }, { "epoch": 1.8170731707317072, "grad_norm": 0.07295556591701204, "learning_rate": 8.092685284146865e-06, "loss": 0.4857, "step": 298 }, { "epoch": 1.8231707317073171, "grad_norm": 0.0687852684162483, "learning_rate": 8.02296210178828e-06, "loss": 0.4376, "step": 299 }, { "epoch": 1.8292682926829267, "grad_norm": 0.07614301377824628, "learning_rate": 7.953338797092902e-06, "loss": 0.4687, "step": 300 }, { "epoch": 1.8353658536585367, "grad_norm": 0.08065344887657697, "learning_rate": 7.883818887349391e-06, "loss": 0.558, "step": 301 }, { "epoch": 1.8414634146341462, "grad_norm": 0.07537828658738212, "learning_rate": 7.814405884623012e-06, "loss": 0.4641, "step": 302 }, { "epoch": 1.8475609756097562, "grad_norm": 0.07520932682727934, "learning_rate": 7.745103295578216e-06, "loss": 0.4807, "step": 303 }, { "epoch": 1.8536585365853657, "grad_norm": 0.08194352024084861, "learning_rate": 7.675914621301476e-06, "loss": 0.5249, "step": 304 }, { "epoch": 1.8597560975609757, "grad_norm": 0.06548230309414133, "learning_rate": 7.606843357124426e-06, "loss": 0.4296, "step": 305 }, { "epoch": 1.8658536585365852, "grad_norm": 0.07539507791922381, "learning_rate": 7.5378929924472735e-06, "loss": 0.4906, "step": 306 }, { "epoch": 1.8719512195121952, "grad_norm": 0.07757046744416946, "learning_rate": 7.469067010562538e-06, "loss": 0.4764, "step": 307 }, { "epoch": 1.8780487804878048, "grad_norm": 0.08347451677435065, "learning_rate": 7.400368888479048e-06, "loss": 0.5079, "step": 308 }, { "epoch": 1.8841463414634148, "grad_norm": 0.09459522265909277, "learning_rate": 7.331802096746309e-06, "loss": 0.5622, "step": 309 }, { "epoch": 1.8902439024390243, "grad_norm": 0.07271276680988117, "learning_rate": 7.263370099279173e-06, "loss": 0.4646, "step": 310 }, { "epoch": 1.8963414634146343, "grad_norm": 0.07270944162582102, "learning_rate": 7.195076353182834e-06, "loss": 0.4824, "step": 311 }, { "epoch": 1.9024390243902438, "grad_norm": 0.07557182291040342, "learning_rate": 7.126924308578196e-06, "loss": 0.4434, "step": 312 }, { "epoch": 1.9085365853658538, "grad_norm": 0.07838104917336293, "learning_rate": 7.058917408427559e-06, "loss": 0.4969, "step": 313 }, { "epoch": 1.9146341463414633, "grad_norm": 0.0772118612308542, "learning_rate": 6.9910590883607e-06, "loss": 0.4897, "step": 314 }, { "epoch": 1.9207317073170733, "grad_norm": 0.06733566470669253, "learning_rate": 6.923352776501302e-06, "loss": 0.4541, "step": 315 }, { "epoch": 1.9268292682926829, "grad_norm": 0.07768052936426381, "learning_rate": 6.855801893293765e-06, "loss": 0.4746, "step": 316 }, { "epoch": 1.9329268292682928, "grad_norm": 0.07601549840390559, "learning_rate": 6.788409851330419e-06, "loss": 0.5037, "step": 317 }, { "epoch": 1.9390243902439024, "grad_norm": 0.08311022242797193, "learning_rate": 6.721180055179113e-06, "loss": 0.5478, "step": 318 }, { "epoch": 1.9451219512195121, "grad_norm": 0.07527262749855876, "learning_rate": 6.654115901211229e-06, "loss": 0.4801, "step": 319 }, { "epoch": 1.951219512195122, "grad_norm": 0.0819908930356081, "learning_rate": 6.587220777430097e-06, "loss": 0.5252, "step": 320 }, { "epoch": 1.9573170731707317, "grad_norm": 0.07276145316957822, "learning_rate": 6.5204980632998394e-06, "loss": 0.411, "step": 321 }, { "epoch": 1.9634146341463414, "grad_norm": 0.06904185307886326, "learning_rate": 6.453951129574644e-06, "loss": 0.4813, "step": 322 }, { "epoch": 1.9695121951219512, "grad_norm": 0.07458141714965788, "learning_rate": 6.387583338128471e-06, "loss": 0.5033, "step": 323 }, { "epoch": 1.975609756097561, "grad_norm": 0.07431900473667878, "learning_rate": 6.321398041785225e-06, "loss": 0.4907, "step": 324 }, { "epoch": 1.9817073170731707, "grad_norm": 0.07780066087542951, "learning_rate": 6.255398584149366e-06, "loss": 0.4902, "step": 325 }, { "epoch": 1.9878048780487805, "grad_norm": 0.07639692261752619, "learning_rate": 6.189588299436997e-06, "loss": 0.4978, "step": 326 }, { "epoch": 1.9939024390243902, "grad_norm": 0.07548995093210441, "learning_rate": 6.123970512307433e-06, "loss": 0.4664, "step": 327 }, { "epoch": 2.0, "grad_norm": 0.0747178162965431, "learning_rate": 6.058548537695225e-06, "loss": 0.474, "step": 328 }, { "epoch": 2.0060975609756095, "grad_norm": 0.07256188615230309, "learning_rate": 5.9933256806427056e-06, "loss": 0.45, "step": 329 }, { "epoch": 2.0121951219512195, "grad_norm": 0.07890015744026495, "learning_rate": 5.928305236133016e-06, "loss": 0.5278, "step": 330 }, { "epoch": 2.018292682926829, "grad_norm": 0.0743608081300144, "learning_rate": 5.86349048892366e-06, "loss": 0.5151, "step": 331 }, { "epoch": 2.024390243902439, "grad_norm": 0.07744372435832222, "learning_rate": 5.798884713380542e-06, "loss": 0.4706, "step": 332 }, { "epoch": 2.0304878048780486, "grad_norm": 0.07446342377276646, "learning_rate": 5.734491173312559e-06, "loss": 0.3936, "step": 333 }, { "epoch": 2.0365853658536586, "grad_norm": 0.07700683633251168, "learning_rate": 5.67031312180673e-06, "loss": 0.4931, "step": 334 }, { "epoch": 2.042682926829268, "grad_norm": 0.07438951456585641, "learning_rate": 5.60635380106383e-06, "loss": 0.4958, "step": 335 }, { "epoch": 2.048780487804878, "grad_norm": 0.07659907187784923, "learning_rate": 5.542616442234618e-06, "loss": 0.4846, "step": 336 }, { "epoch": 2.0548780487804876, "grad_norm": 0.06905536948425527, "learning_rate": 5.479104265256583e-06, "loss": 0.4426, "step": 337 }, { "epoch": 2.0609756097560976, "grad_norm": 0.07334778911378619, "learning_rate": 5.415820478691301e-06, "loss": 0.5074, "step": 338 }, { "epoch": 2.067073170731707, "grad_norm": 0.08664025624260953, "learning_rate": 5.352768279562315e-06, "loss": 0.5383, "step": 339 }, { "epoch": 2.073170731707317, "grad_norm": 0.08278651278238408, "learning_rate": 5.2899508531936526e-06, "loss": 0.4713, "step": 340 }, { "epoch": 2.0792682926829267, "grad_norm": 0.0705392226402741, "learning_rate": 5.2273713730488886e-06, "loss": 0.403, "step": 341 }, { "epoch": 2.0853658536585367, "grad_norm": 0.0717929565778019, "learning_rate": 5.165033000570825e-06, "loss": 0.472, "step": 342 }, { "epoch": 2.091463414634146, "grad_norm": 0.07551856711067856, "learning_rate": 5.1029388850217935e-06, "loss": 0.4945, "step": 343 }, { "epoch": 2.097560975609756, "grad_norm": 0.07438201387306197, "learning_rate": 5.041092163324537e-06, "loss": 0.4939, "step": 344 }, { "epoch": 2.1036585365853657, "grad_norm": 0.07626903753672695, "learning_rate": 4.979495959903759e-06, "loss": 0.4662, "step": 345 }, { "epoch": 2.1097560975609757, "grad_norm": 0.07374673606028373, "learning_rate": 4.918153386528271e-06, "loss": 0.4792, "step": 346 }, { "epoch": 2.1158536585365852, "grad_norm": 0.08229855647674697, "learning_rate": 4.8570675421537685e-06, "loss": 0.5428, "step": 347 }, { "epoch": 2.1219512195121952, "grad_norm": 0.07472947580500576, "learning_rate": 4.7962415127663265e-06, "loss": 0.5573, "step": 348 }, { "epoch": 2.1280487804878048, "grad_norm": 0.07720860403921907, "learning_rate": 4.7356783712264405e-06, "loss": 0.5366, "step": 349 }, { "epoch": 2.1341463414634148, "grad_norm": 0.07671762679161022, "learning_rate": 4.675381177113837e-06, "loss": 0.4991, "step": 350 }, { "epoch": 2.1402439024390243, "grad_norm": 0.0698611175055525, "learning_rate": 4.615352976572867e-06, "loss": 0.463, "step": 351 }, { "epoch": 2.1463414634146343, "grad_norm": 0.08168992782435783, "learning_rate": 4.555596802158653e-06, "loss": 0.5243, "step": 352 }, { "epoch": 2.152439024390244, "grad_norm": 0.07634240659584558, "learning_rate": 4.4961156726838725e-06, "loss": 0.4832, "step": 353 }, { "epoch": 2.158536585365854, "grad_norm": 0.07065742577897564, "learning_rate": 4.436912593066241e-06, "loss": 0.5121, "step": 354 }, { "epoch": 2.1646341463414633, "grad_norm": 0.07396087251257588, "learning_rate": 4.377990554176729e-06, "loss": 0.4896, "step": 355 }, { "epoch": 2.1707317073170733, "grad_norm": 0.07474646282993128, "learning_rate": 4.319352532688444e-06, "loss": 0.4612, "step": 356 }, { "epoch": 2.176829268292683, "grad_norm": 0.08069543680226443, "learning_rate": 4.261001490926272e-06, "loss": 0.5218, "step": 357 }, { "epoch": 2.182926829268293, "grad_norm": 0.07346735875767992, "learning_rate": 4.2029403767172175e-06, "loss": 0.435, "step": 358 }, { "epoch": 2.1890243902439024, "grad_norm": 0.07522119438266486, "learning_rate": 4.14517212324147e-06, "loss": 0.4956, "step": 359 }, { "epoch": 2.1951219512195124, "grad_norm": 0.08032494668646596, "learning_rate": 4.087699648884248e-06, "loss": 0.4752, "step": 360 }, { "epoch": 2.201219512195122, "grad_norm": 0.08192204498128373, "learning_rate": 4.0305258570883336e-06, "loss": 0.5108, "step": 361 }, { "epoch": 2.207317073170732, "grad_norm": 0.08394146118190073, "learning_rate": 3.973653636207437e-06, "loss": 0.5567, "step": 362 }, { "epoch": 2.2134146341463414, "grad_norm": 0.07596087618305393, "learning_rate": 3.917085859360234e-06, "loss": 0.4685, "step": 363 }, { "epoch": 2.2195121951219514, "grad_norm": 0.07887155715773822, "learning_rate": 3.860825384285247e-06, "loss": 0.5206, "step": 364 }, { "epoch": 2.225609756097561, "grad_norm": 0.07296513823227467, "learning_rate": 3.804875053196477e-06, "loss": 0.4469, "step": 365 }, { "epoch": 2.231707317073171, "grad_norm": 0.08190684855847946, "learning_rate": 3.7492376926397966e-06, "loss": 0.5094, "step": 366 }, { "epoch": 2.2378048780487805, "grad_norm": 0.07617526219017642, "learning_rate": 3.6939161133501823e-06, "loss": 0.4479, "step": 367 }, { "epoch": 2.2439024390243905, "grad_norm": 0.08063324451878306, "learning_rate": 3.6389131101096953e-06, "loss": 0.5099, "step": 368 }, { "epoch": 2.25, "grad_norm": 0.07739599643572288, "learning_rate": 3.5842314616063134e-06, "loss": 0.491, "step": 369 }, { "epoch": 2.2560975609756095, "grad_norm": 0.07768918830309231, "learning_rate": 3.529873930293546e-06, "loss": 0.5417, "step": 370 }, { "epoch": 2.2621951219512195, "grad_norm": 0.0822012033362632, "learning_rate": 3.4758432622508677e-06, "loss": 0.5186, "step": 371 }, { "epoch": 2.2682926829268295, "grad_norm": 0.0764020181839071, "learning_rate": 3.422142187045011e-06, "loss": 0.4754, "step": 372 }, { "epoch": 2.274390243902439, "grad_norm": 0.08335314099643498, "learning_rate": 3.3687734175920505e-06, "loss": 0.5537, "step": 373 }, { "epoch": 2.2804878048780486, "grad_norm": 0.0803475394628154, "learning_rate": 3.3157396500203655e-06, "loss": 0.4212, "step": 374 }, { "epoch": 2.2865853658536586, "grad_norm": 0.06853604150629149, "learning_rate": 3.2630435635344283e-06, "loss": 0.4197, "step": 375 }, { "epoch": 2.292682926829268, "grad_norm": 0.07027233118743227, "learning_rate": 3.2106878202794513e-06, "loss": 0.426, "step": 376 }, { "epoch": 2.298780487804878, "grad_norm": 0.08035482501355977, "learning_rate": 3.1586750652069077e-06, "loss": 0.4768, "step": 377 }, { "epoch": 2.3048780487804876, "grad_norm": 0.0767033268066497, "learning_rate": 3.1070079259408934e-06, "loss": 0.4298, "step": 378 }, { "epoch": 2.3109756097560976, "grad_norm": 0.07871530893320917, "learning_rate": 3.0556890126454075e-06, "loss": 0.5194, "step": 379 }, { "epoch": 2.317073170731707, "grad_norm": 0.0694907669966186, "learning_rate": 3.004720917892464e-06, "loss": 0.4458, "step": 380 }, { "epoch": 2.323170731707317, "grad_norm": 0.07550036175573449, "learning_rate": 2.954106216531141e-06, "loss": 0.4877, "step": 381 }, { "epoch": 2.3292682926829267, "grad_norm": 0.06828766275227673, "learning_rate": 2.90384746555749e-06, "loss": 0.4694, "step": 382 }, { "epoch": 2.3353658536585367, "grad_norm": 0.07957885134154746, "learning_rate": 2.8539472039853557e-06, "loss": 0.4549, "step": 383 }, { "epoch": 2.341463414634146, "grad_norm": 0.07312612756103479, "learning_rate": 2.804407952718119e-06, "loss": 0.4717, "step": 384 }, { "epoch": 2.347560975609756, "grad_norm": 0.07462194695242244, "learning_rate": 2.7552322144213405e-06, "loss": 0.4681, "step": 385 }, { "epoch": 2.3536585365853657, "grad_norm": 0.07382029470746747, "learning_rate": 2.7064224733963197e-06, "loss": 0.4455, "step": 386 }, { "epoch": 2.3597560975609757, "grad_norm": 0.07566404170504752, "learning_rate": 2.6579811954546054e-06, "loss": 0.4601, "step": 387 }, { "epoch": 2.3658536585365852, "grad_norm": 0.06650889658374204, "learning_rate": 2.6099108277934105e-06, "loss": 0.403, "step": 388 }, { "epoch": 2.3719512195121952, "grad_norm": 0.08128051733864035, "learning_rate": 2.5622137988719985e-06, "loss": 0.5062, "step": 389 }, { "epoch": 2.3780487804878048, "grad_norm": 0.07645763895183058, "learning_rate": 2.514892518288988e-06, "loss": 0.4992, "step": 390 }, { "epoch": 2.3841463414634148, "grad_norm": 0.08185922748732076, "learning_rate": 2.46794937666063e-06, "loss": 0.4998, "step": 391 }, { "epoch": 2.3902439024390243, "grad_norm": 0.07724446363577575, "learning_rate": 2.421386745500034e-06, "loss": 0.4832, "step": 392 }, { "epoch": 2.3963414634146343, "grad_norm": 0.0719202945692499, "learning_rate": 2.375206977097353e-06, "loss": 0.4625, "step": 393 }, { "epoch": 2.402439024390244, "grad_norm": 0.07160181702178699, "learning_rate": 2.329412404400969e-06, "loss": 0.4786, "step": 394 }, { "epoch": 2.408536585365854, "grad_norm": 0.07705465155073153, "learning_rate": 2.2840053408996154e-06, "loss": 0.4873, "step": 395 }, { "epoch": 2.4146341463414633, "grad_norm": 0.06734740120536699, "learning_rate": 2.238988080505513e-06, "loss": 0.4268, "step": 396 }, { "epoch": 2.4207317073170733, "grad_norm": 0.07171806752940019, "learning_rate": 2.1943628974384858e-06, "loss": 0.4657, "step": 397 }, { "epoch": 2.426829268292683, "grad_norm": 0.06712821968746505, "learning_rate": 2.150132046111054e-06, "loss": 0.4201, "step": 398 }, { "epoch": 2.432926829268293, "grad_norm": 0.08316643198749764, "learning_rate": 2.1062977610145697e-06, "loss": 0.513, "step": 399 }, { "epoch": 2.4390243902439024, "grad_norm": 0.0730957075364869, "learning_rate": 2.0628622566063063e-06, "loss": 0.4895, "step": 400 }, { "epoch": 2.4451219512195124, "grad_norm": 0.07287061567762979, "learning_rate": 2.019827727197605e-06, "loss": 0.4306, "step": 401 }, { "epoch": 2.451219512195122, "grad_norm": 0.06700730358392487, "learning_rate": 1.977196346843019e-06, "loss": 0.4141, "step": 402 }, { "epoch": 2.457317073170732, "grad_norm": 0.07927651728219412, "learning_rate": 1.934970269230464e-06, "loss": 0.4702, "step": 403 }, { "epoch": 2.4634146341463414, "grad_norm": 0.07966939559181735, "learning_rate": 1.8931516275724527e-06, "loss": 0.4209, "step": 404 }, { "epoch": 2.4695121951219514, "grad_norm": 0.07505835152415707, "learning_rate": 1.8517425344982831e-06, "loss": 0.5004, "step": 405 }, { "epoch": 2.475609756097561, "grad_norm": 0.0792696269268693, "learning_rate": 1.8107450819473505e-06, "loss": 0.4954, "step": 406 }, { "epoch": 2.4817073170731705, "grad_norm": 0.07162945978931057, "learning_rate": 1.7701613410634367e-06, "loss": 0.496, "step": 407 }, { "epoch": 2.4878048780487805, "grad_norm": 0.07893944712223014, "learning_rate": 1.7299933620900945e-06, "loss": 0.4774, "step": 408 }, { "epoch": 2.4939024390243905, "grad_norm": 0.06827623677585598, "learning_rate": 1.690243174267071e-06, "loss": 0.4177, "step": 409 }, { "epoch": 2.5, "grad_norm": 0.0754447597879692, "learning_rate": 1.6509127857277784e-06, "loss": 0.4889, "step": 410 }, { "epoch": 2.5060975609756095, "grad_norm": 0.08857036691886101, "learning_rate": 1.6120041833978662e-06, "loss": 0.5317, "step": 411 }, { "epoch": 2.5121951219512195, "grad_norm": 0.07177001116277269, "learning_rate": 1.573519332894824e-06, "loss": 0.414, "step": 412 }, { "epoch": 2.5182926829268295, "grad_norm": 0.07831205185485109, "learning_rate": 1.535460178428697e-06, "loss": 0.5028, "step": 413 }, { "epoch": 2.524390243902439, "grad_norm": 0.07285605695660612, "learning_rate": 1.4978286427038602e-06, "loss": 0.5031, "step": 414 }, { "epoch": 2.5304878048780486, "grad_norm": 0.08720951847793187, "learning_rate": 1.4606266268218783e-06, "loss": 0.5084, "step": 415 }, { "epoch": 2.5365853658536586, "grad_norm": 0.06975140711559835, "learning_rate": 1.4238560101854815e-06, "loss": 0.4253, "step": 416 }, { "epoch": 2.5426829268292686, "grad_norm": 0.07714853710418437, "learning_rate": 1.3875186504035965e-06, "loss": 0.4744, "step": 417 }, { "epoch": 2.548780487804878, "grad_norm": 0.07565492373996721, "learning_rate": 1.3516163831975337e-06, "loss": 0.5152, "step": 418 }, { "epoch": 2.5548780487804876, "grad_norm": 0.07030057664082874, "learning_rate": 1.3161510223082152e-06, "loss": 0.4461, "step": 419 }, { "epoch": 2.5609756097560976, "grad_norm": 0.0800726969605912, "learning_rate": 1.2811243594045697e-06, "loss": 0.5135, "step": 420 }, { "epoch": 2.567073170731707, "grad_norm": 0.07816897719762364, "learning_rate": 1.246538163993013e-06, "loss": 0.4999, "step": 421 }, { "epoch": 2.573170731707317, "grad_norm": 0.07745184122312047, "learning_rate": 1.2123941833280472e-06, "loss": 0.4847, "step": 422 }, { "epoch": 2.5792682926829267, "grad_norm": 0.07419017119462436, "learning_rate": 1.1786941423240072e-06, "loss": 0.4843, "step": 423 }, { "epoch": 2.5853658536585367, "grad_norm": 0.07931455390788596, "learning_rate": 1.1454397434679022e-06, "loss": 0.4946, "step": 424 }, { "epoch": 2.591463414634146, "grad_norm": 0.07615796865199526, "learning_rate": 1.1126326667334196e-06, "loss": 0.4524, "step": 425 }, { "epoch": 2.597560975609756, "grad_norm": 0.0772418363352449, "learning_rate": 1.080274569496057e-06, "loss": 0.5152, "step": 426 }, { "epoch": 2.6036585365853657, "grad_norm": 0.07025077296325957, "learning_rate": 1.0483670864493777e-06, "loss": 0.4332, "step": 427 }, { "epoch": 2.6097560975609757, "grad_norm": 0.07566288563595869, "learning_rate": 1.0169118295224488e-06, "loss": 0.5029, "step": 428 }, { "epoch": 2.6158536585365852, "grad_norm": 0.0758658441769188, "learning_rate": 9.85910387798389e-07, "loss": 0.4573, "step": 429 }, { "epoch": 2.6219512195121952, "grad_norm": 0.07964262946952728, "learning_rate": 9.55364327434105e-07, "loss": 0.4933, "step": 430 }, { "epoch": 2.6280487804878048, "grad_norm": 0.07874589016859943, "learning_rate": 9.252751915811642e-07, "loss": 0.473, "step": 431 }, { "epoch": 2.6341463414634148, "grad_norm": 0.0766185482028681, "learning_rate": 8.956445003078351e-07, "loss": 0.5018, "step": 432 }, { "epoch": 2.6402439024390243, "grad_norm": 0.08140880877769818, "learning_rate": 8.664737505223009e-07, "loss": 0.5203, "step": 433 }, { "epoch": 2.6463414634146343, "grad_norm": 0.08015141287053174, "learning_rate": 8.377644158970277e-07, "loss": 0.5215, "step": 434 }, { "epoch": 2.652439024390244, "grad_norm": 0.07563574533469265, "learning_rate": 8.095179467943293e-07, "loss": 0.4877, "step": 435 }, { "epoch": 2.658536585365854, "grad_norm": 0.08532123143914754, "learning_rate": 7.81735770193085e-07, "loss": 0.5027, "step": 436 }, { "epoch": 2.6646341463414633, "grad_norm": 0.06972127152615569, "learning_rate": 7.544192896166569e-07, "loss": 0.4691, "step": 437 }, { "epoch": 2.6707317073170733, "grad_norm": 0.0748196016294252, "learning_rate": 7.275698850619861e-07, "loss": 0.5059, "step": 438 }, { "epoch": 2.676829268292683, "grad_norm": 0.07757312698493772, "learning_rate": 7.011889129298688e-07, "loss": 0.5559, "step": 439 }, { "epoch": 2.682926829268293, "grad_norm": 0.07577704718768018, "learning_rate": 6.752777059564431e-07, "loss": 0.4718, "step": 440 }, { "epoch": 2.6890243902439024, "grad_norm": 0.07357519669905033, "learning_rate": 6.498375731458529e-07, "loss": 0.4876, "step": 441 }, { "epoch": 2.6951219512195124, "grad_norm": 0.07445682597283106, "learning_rate": 6.248697997041219e-07, "loss": 0.4833, "step": 442 }, { "epoch": 2.701219512195122, "grad_norm": 0.07241140052205494, "learning_rate": 6.003756469742294e-07, "loss": 0.4713, "step": 443 }, { "epoch": 2.7073170731707314, "grad_norm": 0.07656055745393084, "learning_rate": 5.763563523723769e-07, "loss": 0.4525, "step": 444 }, { "epoch": 2.7134146341463414, "grad_norm": 0.07364895292854746, "learning_rate": 5.528131293254957e-07, "loss": 0.477, "step": 445 }, { "epoch": 2.7195121951219514, "grad_norm": 0.07520667622845931, "learning_rate": 5.29747167209923e-07, "loss": 0.4747, "step": 446 }, { "epoch": 2.725609756097561, "grad_norm": 0.08218001537128848, "learning_rate": 5.071596312913329e-07, "loss": 0.54, "step": 447 }, { "epoch": 2.7317073170731705, "grad_norm": 0.0867606649208939, "learning_rate": 4.850516626658585e-07, "loss": 0.5081, "step": 448 }, { "epoch": 2.7378048780487805, "grad_norm": 0.07210738314053156, "learning_rate": 4.634243782024539e-07, "loss": 0.4431, "step": 449 }, { "epoch": 2.7439024390243905, "grad_norm": 0.06582252551133536, "learning_rate": 4.4227887048646335e-07, "loss": 0.4192, "step": 450 }, { "epoch": 2.75, "grad_norm": 0.07567885015733454, "learning_rate": 4.216162077644281e-07, "loss": 0.4887, "step": 451 }, { "epoch": 2.7560975609756095, "grad_norm": 0.07689928271198733, "learning_rate": 4.014374338901206e-07, "loss": 0.4683, "step": 452 }, { "epoch": 2.7621951219512195, "grad_norm": 0.07188606850501078, "learning_rate": 3.817435682718096e-07, "loss": 0.467, "step": 453 }, { "epoch": 2.7682926829268295, "grad_norm": 0.07163831506927039, "learning_rate": 3.6253560582076075e-07, "loss": 0.4539, "step": 454 }, { "epoch": 2.774390243902439, "grad_norm": 0.07592375615552627, "learning_rate": 3.4381451690097653e-07, "loss": 0.4736, "step": 455 }, { "epoch": 2.7804878048780486, "grad_norm": 0.0814766286311381, "learning_rate": 3.255812472801689e-07, "loss": 0.532, "step": 456 }, { "epoch": 2.7865853658536586, "grad_norm": 0.0717352396757339, "learning_rate": 3.078367180819863e-07, "loss": 0.4316, "step": 457 }, { "epoch": 2.7926829268292686, "grad_norm": 0.0746931546839919, "learning_rate": 2.905818257394799e-07, "loss": 0.5206, "step": 458 }, { "epoch": 2.798780487804878, "grad_norm": 0.07272697545910387, "learning_rate": 2.7381744194980963e-07, "loss": 0.4832, "step": 459 }, { "epoch": 2.8048780487804876, "grad_norm": 0.07230252882563974, "learning_rate": 2.5754441363021854e-07, "loss": 0.4778, "step": 460 }, { "epoch": 2.8109756097560976, "grad_norm": 0.08033515297716644, "learning_rate": 2.417635628752324e-07, "loss": 0.5301, "step": 461 }, { "epoch": 2.817073170731707, "grad_norm": 0.08891143823100789, "learning_rate": 2.264756869151441e-07, "loss": 0.5255, "step": 462 }, { "epoch": 2.823170731707317, "grad_norm": 0.07069972457447841, "learning_rate": 2.1168155807572476e-07, "loss": 0.431, "step": 463 }, { "epoch": 2.8292682926829267, "grad_norm": 0.0735272098530535, "learning_rate": 1.973819237392205e-07, "loss": 0.4968, "step": 464 }, { "epoch": 2.8353658536585367, "grad_norm": 0.08455153800014176, "learning_rate": 1.8357750630658367e-07, "loss": 0.4924, "step": 465 }, { "epoch": 2.841463414634146, "grad_norm": 0.06931732980810784, "learning_rate": 1.7026900316098217e-07, "loss": 0.4309, "step": 466 }, { "epoch": 2.847560975609756, "grad_norm": 0.07200984122635784, "learning_rate": 1.5745708663257199e-07, "loss": 0.4667, "step": 467 }, { "epoch": 2.8536585365853657, "grad_norm": 0.07677994816913122, "learning_rate": 1.4514240396452438e-07, "loss": 0.4834, "step": 468 }, { "epoch": 2.8597560975609757, "grad_norm": 0.08178248437061655, "learning_rate": 1.333255772803377e-07, "loss": 0.5251, "step": 469 }, { "epoch": 2.8658536585365852, "grad_norm": 0.07923538700790901, "learning_rate": 1.2200720355239893e-07, "loss": 0.5171, "step": 470 }, { "epoch": 2.8719512195121952, "grad_norm": 0.06761152489499198, "learning_rate": 1.1118785457183034e-07, "loss": 0.4615, "step": 471 }, { "epoch": 2.8780487804878048, "grad_norm": 0.0767262597066072, "learning_rate": 1.0086807691960243e-07, "loss": 0.4976, "step": 472 }, { "epoch": 2.8841463414634148, "grad_norm": 0.08266454055489816, "learning_rate": 9.104839193892379e-08, "loss": 0.5389, "step": 473 }, { "epoch": 2.8902439024390243, "grad_norm": 0.08086361495523785, "learning_rate": 8.172929570889553e-08, "loss": 0.4929, "step": 474 }, { "epoch": 2.8963414634146343, "grad_norm": 0.0745089911333017, "learning_rate": 7.291125901946027e-08, "loss": 0.4939, "step": 475 }, { "epoch": 2.902439024390244, "grad_norm": 0.08025945659207359, "learning_rate": 6.459472734760997e-08, "loss": 0.4876, "step": 476 }, { "epoch": 2.908536585365854, "grad_norm": 0.07053947220102097, "learning_rate": 5.6780120834887264e-08, "loss": 0.4611, "step": 477 }, { "epoch": 2.9146341463414633, "grad_norm": 0.0803969476007233, "learning_rate": 4.9467834266154756e-08, "loss": 0.5419, "step": 478 }, { "epoch": 2.9207317073170733, "grad_norm": 0.07482955639907388, "learning_rate": 4.2658237049655325e-08, "loss": 0.4889, "step": 479 }, { "epoch": 2.926829268292683, "grad_norm": 0.07752759847951908, "learning_rate": 3.635167319834709e-08, "loss": 0.4749, "step": 480 }, { "epoch": 2.932926829268293, "grad_norm": 0.08359023148499886, "learning_rate": 3.054846131252731e-08, "loss": 0.5334, "step": 481 }, { "epoch": 2.9390243902439024, "grad_norm": 0.08139586618522474, "learning_rate": 2.524889456373525e-08, "loss": 0.5523, "step": 482 }, { "epoch": 2.9451219512195124, "grad_norm": 0.07935921771678194, "learning_rate": 2.045324067993959e-08, "loss": 0.4853, "step": 483 }, { "epoch": 2.951219512195122, "grad_norm": 0.07808298733873976, "learning_rate": 1.6161741932017026e-08, "loss": 0.5005, "step": 484 }, { "epoch": 2.9573170731707314, "grad_norm": 0.0649213930995393, "learning_rate": 1.2374615121508726e-08, "loss": 0.4098, "step": 485 }, { "epoch": 2.9634146341463414, "grad_norm": 0.08655284099627421, "learning_rate": 9.092051569674632e-09, "loss": 0.4856, "step": 486 }, { "epoch": 2.9695121951219514, "grad_norm": 0.07707319839718253, "learning_rate": 6.314217107817877e-09, "loss": 0.5193, "step": 487 }, { "epoch": 2.975609756097561, "grad_norm": 0.07715334723099698, "learning_rate": 4.041252068918145e-09, "loss": 0.5263, "step": 488 }, { "epoch": 2.9817073170731705, "grad_norm": 0.07908531746017289, "learning_rate": 2.273271280534006e-09, "loss": 0.4823, "step": 489 }, { "epoch": 2.9878048780487805, "grad_norm": 0.07306053297024867, "learning_rate": 1.0103640590064524e-09, "loss": 0.4543, "step": 490 }, { "epoch": 2.9939024390243905, "grad_norm": 0.08516212104693965, "learning_rate": 2.525942049436125e-10, "loss": 0.5, "step": 491 }, { "epoch": 3.0, "grad_norm": 0.07775010775380628, "learning_rate": 0.0, "loss": 0.4651, "step": 492 }, { "epoch": 3.0, "step": 492, "total_flos": 7801758436818944.0, "train_loss": 0.7506088816780385, "train_runtime": 5228.6286, "train_samples_per_second": 6.005, "train_steps_per_second": 0.094 } ], "logging_steps": 1, "max_steps": 492, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7801758436818944.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }