{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.025326474079937, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000633161851998417, "grad_norm": 6.590027568134772, "learning_rate": 0.0, "loss": 0.8514, "step": 1 }, { "epoch": 0.001266323703996834, "grad_norm": 6.685441325397761, "learning_rate": 8.438818565400844e-08, "loss": 0.8839, "step": 2 }, { "epoch": 0.0018994855559952514, "grad_norm": 6.750452455519639, "learning_rate": 1.6877637130801689e-07, "loss": 0.8734, "step": 3 }, { "epoch": 0.002532647407993668, "grad_norm": 6.660907237735488, "learning_rate": 2.5316455696202533e-07, "loss": 0.8777, "step": 4 }, { "epoch": 0.0031658092599920855, "grad_norm": 6.48194808168943, "learning_rate": 3.3755274261603377e-07, "loss": 0.8574, "step": 5 }, { "epoch": 0.0037989711119905028, "grad_norm": 6.5635712902468875, "learning_rate": 4.219409282700422e-07, "loss": 0.8654, "step": 6 }, { "epoch": 0.00443213296398892, "grad_norm": 6.595584964021003, "learning_rate": 5.063291139240507e-07, "loss": 0.8779, "step": 7 }, { "epoch": 0.005065294815987336, "grad_norm": 6.227731565199322, "learning_rate": 5.907172995780591e-07, "loss": 0.8548, "step": 8 }, { "epoch": 0.005698456667985754, "grad_norm": 5.975792182592218, "learning_rate": 6.751054852320675e-07, "loss": 0.8499, "step": 9 }, { "epoch": 0.006331618519984171, "grad_norm": 6.146193846926148, "learning_rate": 7.59493670886076e-07, "loss": 0.8575, "step": 10 }, { "epoch": 0.006964780371982588, "grad_norm": 5.755865334407744, "learning_rate": 8.438818565400844e-07, "loss": 0.8583, "step": 11 }, { "epoch": 0.0075979422239810055, "grad_norm": 4.236361497613728, "learning_rate": 9.28270042194093e-07, "loss": 0.8377, "step": 12 }, { "epoch": 0.008231104075979422, "grad_norm": 4.137767820299327, "learning_rate": 1.0126582278481013e-06, "loss": 0.8048, "step": 13 }, { "epoch": 0.00886426592797784, "grad_norm": 4.065135837918306, "learning_rate": 1.0970464135021099e-06, "loss": 0.8195, "step": 14 }, { "epoch": 0.009497427779976256, "grad_norm": 3.19578897264207, "learning_rate": 1.1814345991561182e-06, "loss": 0.8125, "step": 15 }, { "epoch": 0.010130589631974673, "grad_norm": 3.6572163219656604, "learning_rate": 1.2658227848101267e-06, "loss": 0.7831, "step": 16 }, { "epoch": 0.010763751483973091, "grad_norm": 3.3439104265152455, "learning_rate": 1.350210970464135e-06, "loss": 0.772, "step": 17 }, { "epoch": 0.011396913335971507, "grad_norm": 3.212689613688698, "learning_rate": 1.4345991561181436e-06, "loss": 0.7718, "step": 18 }, { "epoch": 0.012030075187969926, "grad_norm": 3.0806734191876894, "learning_rate": 1.518987341772152e-06, "loss": 0.7652, "step": 19 }, { "epoch": 0.012663237039968342, "grad_norm": 2.487335833951049, "learning_rate": 1.6033755274261605e-06, "loss": 0.7513, "step": 20 }, { "epoch": 0.013296398891966758, "grad_norm": 2.1926996863233943, "learning_rate": 1.6877637130801689e-06, "loss": 0.7218, "step": 21 }, { "epoch": 0.013929560743965176, "grad_norm": 2.4924787160781174, "learning_rate": 1.7721518987341774e-06, "loss": 0.7021, "step": 22 }, { "epoch": 0.014562722595963593, "grad_norm": 2.4552608902609476, "learning_rate": 1.856540084388186e-06, "loss": 0.7084, "step": 23 }, { "epoch": 0.015195884447962011, "grad_norm": 2.318133697478427, "learning_rate": 1.9409282700421943e-06, "loss": 0.7062, "step": 24 }, { "epoch": 0.015829046299960427, "grad_norm": 2.0858734803079892, "learning_rate": 2.0253164556962026e-06, "loss": 0.6827, "step": 25 }, { "epoch": 0.016462208151958844, "grad_norm": 1.933930114721539, "learning_rate": 2.1097046413502114e-06, "loss": 0.6942, "step": 26 }, { "epoch": 0.01709537000395726, "grad_norm": 1.7428795510604398, "learning_rate": 2.1940928270042197e-06, "loss": 0.6717, "step": 27 }, { "epoch": 0.01772853185595568, "grad_norm": 1.3198576578222792, "learning_rate": 2.278481012658228e-06, "loss": 0.6726, "step": 28 }, { "epoch": 0.018361693707954096, "grad_norm": 1.3387296424808421, "learning_rate": 2.3628691983122364e-06, "loss": 0.6616, "step": 29 }, { "epoch": 0.018994855559952513, "grad_norm": 1.440924238709439, "learning_rate": 2.447257383966245e-06, "loss": 0.6449, "step": 30 }, { "epoch": 0.01962801741195093, "grad_norm": 1.4143123960073807, "learning_rate": 2.5316455696202535e-06, "loss": 0.617, "step": 31 }, { "epoch": 0.020261179263949346, "grad_norm": 1.2824535614618957, "learning_rate": 2.6160337552742622e-06, "loss": 0.6308, "step": 32 }, { "epoch": 0.020894341115947766, "grad_norm": 1.0998108471371222, "learning_rate": 2.70042194092827e-06, "loss": 0.5893, "step": 33 }, { "epoch": 0.021527502967946182, "grad_norm": 0.9900147340592944, "learning_rate": 2.7848101265822785e-06, "loss": 0.613, "step": 34 }, { "epoch": 0.0221606648199446, "grad_norm": 0.9179612234726228, "learning_rate": 2.8691983122362873e-06, "loss": 0.5945, "step": 35 }, { "epoch": 0.022793826671943015, "grad_norm": 0.9160025496704113, "learning_rate": 2.9535864978902956e-06, "loss": 0.5965, "step": 36 }, { "epoch": 0.02342698852394143, "grad_norm": 0.8973142733724568, "learning_rate": 3.037974683544304e-06, "loss": 0.5759, "step": 37 }, { "epoch": 0.02406015037593985, "grad_norm": 0.8360002442510349, "learning_rate": 3.1223628691983127e-06, "loss": 0.5919, "step": 38 }, { "epoch": 0.024693312227938267, "grad_norm": 0.8123235099055627, "learning_rate": 3.206751054852321e-06, "loss": 0.5701, "step": 39 }, { "epoch": 0.025326474079936684, "grad_norm": 0.8411024225420295, "learning_rate": 3.2911392405063294e-06, "loss": 0.5665, "step": 40 }, { "epoch": 0.0259596359319351, "grad_norm": 1.1919155938672577, "learning_rate": 3.3755274261603377e-06, "loss": 0.566, "step": 41 }, { "epoch": 0.026592797783933517, "grad_norm": 0.7942180923929226, "learning_rate": 3.459915611814346e-06, "loss": 0.5835, "step": 42 }, { "epoch": 0.027225959635931937, "grad_norm": 0.8181520877736833, "learning_rate": 3.544303797468355e-06, "loss": 0.5547, "step": 43 }, { "epoch": 0.027859121487930353, "grad_norm": 0.8480195438260757, "learning_rate": 3.628691983122363e-06, "loss": 0.5511, "step": 44 }, { "epoch": 0.02849228333992877, "grad_norm": 0.8554523900913158, "learning_rate": 3.713080168776372e-06, "loss": 0.5667, "step": 45 }, { "epoch": 0.029125445191927186, "grad_norm": 0.792116858119633, "learning_rate": 3.7974683544303802e-06, "loss": 0.5432, "step": 46 }, { "epoch": 0.029758607043925602, "grad_norm": 1.1677927725266535, "learning_rate": 3.8818565400843886e-06, "loss": 0.5381, "step": 47 }, { "epoch": 0.030391768895924022, "grad_norm": 0.7768756061303338, "learning_rate": 3.9662447257383965e-06, "loss": 0.5305, "step": 48 }, { "epoch": 0.03102493074792244, "grad_norm": 0.7778557678278698, "learning_rate": 4.050632911392405e-06, "loss": 0.5632, "step": 49 }, { "epoch": 0.031658092599920855, "grad_norm": 0.732872665484128, "learning_rate": 4.135021097046414e-06, "loss": 0.5499, "step": 50 }, { "epoch": 0.03229125445191927, "grad_norm": 0.7207478004352674, "learning_rate": 4.219409282700423e-06, "loss": 0.541, "step": 51 }, { "epoch": 0.03292441630391769, "grad_norm": 0.750762132323327, "learning_rate": 4.303797468354431e-06, "loss": 0.5418, "step": 52 }, { "epoch": 0.033557578155916104, "grad_norm": 0.7434764648013086, "learning_rate": 4.3881856540084394e-06, "loss": 0.5152, "step": 53 }, { "epoch": 0.03419074000791452, "grad_norm": 0.7082508963014316, "learning_rate": 4.472573839662447e-06, "loss": 0.5148, "step": 54 }, { "epoch": 0.03482390185991294, "grad_norm": 0.7035708611318852, "learning_rate": 4.556962025316456e-06, "loss": 0.5273, "step": 55 }, { "epoch": 0.03545706371191136, "grad_norm": 0.7337225311251793, "learning_rate": 4.641350210970465e-06, "loss": 0.5285, "step": 56 }, { "epoch": 0.03609022556390978, "grad_norm": 0.6791874416149749, "learning_rate": 4.725738396624473e-06, "loss": 0.5181, "step": 57 }, { "epoch": 0.03672338741590819, "grad_norm": 0.6787102930916649, "learning_rate": 4.8101265822784815e-06, "loss": 0.5191, "step": 58 }, { "epoch": 0.03735654926790661, "grad_norm": 0.6672635626777768, "learning_rate": 4.89451476793249e-06, "loss": 0.5236, "step": 59 }, { "epoch": 0.037989711119905026, "grad_norm": 0.7155453870733437, "learning_rate": 4.978902953586498e-06, "loss": 0.5213, "step": 60 }, { "epoch": 0.03862287297190344, "grad_norm": 0.8698890454102993, "learning_rate": 5.063291139240507e-06, "loss": 0.5109, "step": 61 }, { "epoch": 0.03925603482390186, "grad_norm": 1.5864301459746957, "learning_rate": 5.147679324894516e-06, "loss": 0.4968, "step": 62 }, { "epoch": 0.039889196675900275, "grad_norm": 0.7908265706423138, "learning_rate": 5.2320675105485245e-06, "loss": 0.5112, "step": 63 }, { "epoch": 0.04052235852789869, "grad_norm": 0.6931074111026475, "learning_rate": 5.3164556962025316e-06, "loss": 0.4979, "step": 64 }, { "epoch": 0.04115552037989711, "grad_norm": 0.7092625506244769, "learning_rate": 5.40084388185654e-06, "loss": 0.5178, "step": 65 }, { "epoch": 0.04178868223189553, "grad_norm": 0.7220762952560791, "learning_rate": 5.485232067510548e-06, "loss": 0.4918, "step": 66 }, { "epoch": 0.04242184408389395, "grad_norm": 0.7120675778000228, "learning_rate": 5.569620253164557e-06, "loss": 0.4999, "step": 67 }, { "epoch": 0.043055005935892364, "grad_norm": 2.012442751489758, "learning_rate": 5.654008438818566e-06, "loss": 0.5185, "step": 68 }, { "epoch": 0.04368816778789078, "grad_norm": 0.8451094824199832, "learning_rate": 5.7383966244725745e-06, "loss": 0.5161, "step": 69 }, { "epoch": 0.0443213296398892, "grad_norm": 0.7354399193372889, "learning_rate": 5.8227848101265824e-06, "loss": 0.5073, "step": 70 }, { "epoch": 0.04495449149188761, "grad_norm": 0.7223980580165095, "learning_rate": 5.907172995780591e-06, "loss": 0.5045, "step": 71 }, { "epoch": 0.04558765334388603, "grad_norm": 0.7741801495621329, "learning_rate": 5.9915611814346e-06, "loss": 0.501, "step": 72 }, { "epoch": 0.046220815195884446, "grad_norm": 0.855413846524164, "learning_rate": 6.075949367088608e-06, "loss": 0.4892, "step": 73 }, { "epoch": 0.04685397704788286, "grad_norm": 0.7092316454326212, "learning_rate": 6.160337552742617e-06, "loss": 0.4996, "step": 74 }, { "epoch": 0.04748713889988128, "grad_norm": 0.7434911708598702, "learning_rate": 6.244725738396625e-06, "loss": 0.5056, "step": 75 }, { "epoch": 0.0481203007518797, "grad_norm": 0.7898605244828075, "learning_rate": 6.329113924050634e-06, "loss": 0.49, "step": 76 }, { "epoch": 0.04875346260387812, "grad_norm": 0.7009638487308841, "learning_rate": 6.413502109704642e-06, "loss": 0.4893, "step": 77 }, { "epoch": 0.049386624455876535, "grad_norm": 0.7973003653929035, "learning_rate": 6.49789029535865e-06, "loss": 0.4857, "step": 78 }, { "epoch": 0.05001978630787495, "grad_norm": 0.7091660861235203, "learning_rate": 6.582278481012659e-06, "loss": 0.4942, "step": 79 }, { "epoch": 0.05065294815987337, "grad_norm": 0.7029603347621632, "learning_rate": 6.666666666666667e-06, "loss": 0.4726, "step": 80 }, { "epoch": 0.051286110011871784, "grad_norm": 0.8392501054878392, "learning_rate": 6.751054852320675e-06, "loss": 0.486, "step": 81 }, { "epoch": 0.0519192718638702, "grad_norm": 0.771108194641631, "learning_rate": 6.835443037974684e-06, "loss": 0.4857, "step": 82 }, { "epoch": 0.05255243371586862, "grad_norm": 0.8167524977470255, "learning_rate": 6.919831223628692e-06, "loss": 0.4997, "step": 83 }, { "epoch": 0.05318559556786703, "grad_norm": 0.8540647035818936, "learning_rate": 7.004219409282701e-06, "loss": 0.5049, "step": 84 }, { "epoch": 0.05381875741986545, "grad_norm": 0.7203938663819929, "learning_rate": 7.08860759493671e-06, "loss": 0.4697, "step": 85 }, { "epoch": 0.05445191927186387, "grad_norm": 0.8979614398357612, "learning_rate": 7.172995780590718e-06, "loss": 0.4998, "step": 86 }, { "epoch": 0.05508508112386229, "grad_norm": 0.8983175239445339, "learning_rate": 7.257383966244726e-06, "loss": 0.5002, "step": 87 }, { "epoch": 0.055718242975860706, "grad_norm": 0.7989470140910984, "learning_rate": 7.341772151898735e-06, "loss": 0.4582, "step": 88 }, { "epoch": 0.05635140482785912, "grad_norm": 0.9016554030083122, "learning_rate": 7.426160337552744e-06, "loss": 0.4584, "step": 89 }, { "epoch": 0.05698456667985754, "grad_norm": 0.7698591346749634, "learning_rate": 7.510548523206752e-06, "loss": 0.4693, "step": 90 }, { "epoch": 0.057617728531855955, "grad_norm": 0.8451221190087225, "learning_rate": 7.5949367088607605e-06, "loss": 0.4612, "step": 91 }, { "epoch": 0.05825089038385437, "grad_norm": 0.7776488368685276, "learning_rate": 7.679324894514768e-06, "loss": 0.4774, "step": 92 }, { "epoch": 0.05888405223585279, "grad_norm": 0.8104563067870384, "learning_rate": 7.763713080168777e-06, "loss": 0.4871, "step": 93 }, { "epoch": 0.059517214087851204, "grad_norm": 0.7790149370344418, "learning_rate": 7.848101265822786e-06, "loss": 0.4775, "step": 94 }, { "epoch": 0.06015037593984962, "grad_norm": 0.8264922704494518, "learning_rate": 7.932489451476793e-06, "loss": 0.4761, "step": 95 }, { "epoch": 0.060783537791848044, "grad_norm": 0.7697311001757953, "learning_rate": 8.016877637130802e-06, "loss": 0.4627, "step": 96 }, { "epoch": 0.06141669964384646, "grad_norm": 0.8675756083396168, "learning_rate": 8.10126582278481e-06, "loss": 0.4578, "step": 97 }, { "epoch": 0.06204986149584488, "grad_norm": 0.7363960566008788, "learning_rate": 8.18565400843882e-06, "loss": 0.4945, "step": 98 }, { "epoch": 0.06268302334784329, "grad_norm": 0.9407740989678045, "learning_rate": 8.270042194092828e-06, "loss": 0.4761, "step": 99 }, { "epoch": 0.06331618519984171, "grad_norm": 0.7853206984419256, "learning_rate": 8.354430379746837e-06, "loss": 0.4812, "step": 100 }, { "epoch": 0.06394934705184013, "grad_norm": 0.8856842807910008, "learning_rate": 8.438818565400846e-06, "loss": 0.4691, "step": 101 }, { "epoch": 0.06458250890383854, "grad_norm": 0.7464280626359762, "learning_rate": 8.523206751054853e-06, "loss": 0.468, "step": 102 }, { "epoch": 0.06521567075583697, "grad_norm": 1.0401878414310537, "learning_rate": 8.607594936708861e-06, "loss": 0.47, "step": 103 }, { "epoch": 0.06584883260783538, "grad_norm": 0.7296174719562629, "learning_rate": 8.69198312236287e-06, "loss": 0.4875, "step": 104 }, { "epoch": 0.0664819944598338, "grad_norm": 0.8683995921238001, "learning_rate": 8.776371308016879e-06, "loss": 0.4557, "step": 105 }, { "epoch": 0.06711515631183221, "grad_norm": 0.7574641801758992, "learning_rate": 8.860759493670886e-06, "loss": 0.4622, "step": 106 }, { "epoch": 0.06774831816383063, "grad_norm": 0.9523240205585705, "learning_rate": 8.945147679324895e-06, "loss": 0.4636, "step": 107 }, { "epoch": 0.06838148001582904, "grad_norm": 0.8343729768855822, "learning_rate": 9.029535864978903e-06, "loss": 0.4646, "step": 108 }, { "epoch": 0.06901464186782746, "grad_norm": 0.9150408804469654, "learning_rate": 9.113924050632912e-06, "loss": 0.4747, "step": 109 }, { "epoch": 0.06964780371982587, "grad_norm": 0.8715527559993842, "learning_rate": 9.198312236286921e-06, "loss": 0.4405, "step": 110 }, { "epoch": 0.0702809655718243, "grad_norm": 0.7527145836529907, "learning_rate": 9.28270042194093e-06, "loss": 0.4525, "step": 111 }, { "epoch": 0.07091412742382272, "grad_norm": 0.9484884040826234, "learning_rate": 9.367088607594937e-06, "loss": 0.4479, "step": 112 }, { "epoch": 0.07154728927582113, "grad_norm": 0.9990589980412264, "learning_rate": 9.451476793248946e-06, "loss": 0.4691, "step": 113 }, { "epoch": 0.07218045112781955, "grad_norm": 0.8306179744332812, "learning_rate": 9.535864978902954e-06, "loss": 0.4683, "step": 114 }, { "epoch": 0.07281361297981796, "grad_norm": 0.7949905782574014, "learning_rate": 9.620253164556963e-06, "loss": 0.4754, "step": 115 }, { "epoch": 0.07344677483181639, "grad_norm": 0.7327367856279499, "learning_rate": 9.704641350210972e-06, "loss": 0.4533, "step": 116 }, { "epoch": 0.0740799366838148, "grad_norm": 0.7591028005097373, "learning_rate": 9.78902953586498e-06, "loss": 0.4601, "step": 117 }, { "epoch": 0.07471309853581322, "grad_norm": 0.7459287795346834, "learning_rate": 9.87341772151899e-06, "loss": 0.4537, "step": 118 }, { "epoch": 0.07534626038781163, "grad_norm": 0.7894209737314439, "learning_rate": 9.957805907172996e-06, "loss": 0.46, "step": 119 }, { "epoch": 0.07597942223981005, "grad_norm": 0.7443677351598359, "learning_rate": 1.0042194092827005e-05, "loss": 0.4724, "step": 120 }, { "epoch": 0.07661258409180847, "grad_norm": 0.8326050623885121, "learning_rate": 1.0126582278481014e-05, "loss": 0.4485, "step": 121 }, { "epoch": 0.07724574594380688, "grad_norm": 0.7494062244615093, "learning_rate": 1.0210970464135021e-05, "loss": 0.4745, "step": 122 }, { "epoch": 0.07787890779580531, "grad_norm": 0.8592490279754234, "learning_rate": 1.0295358649789031e-05, "loss": 0.4513, "step": 123 }, { "epoch": 0.07851206964780372, "grad_norm": 0.881251222898683, "learning_rate": 1.0379746835443039e-05, "loss": 0.4452, "step": 124 }, { "epoch": 0.07914523149980214, "grad_norm": 0.7741439991204454, "learning_rate": 1.0464135021097049e-05, "loss": 0.4455, "step": 125 }, { "epoch": 0.07977839335180055, "grad_norm": 1.0680915204869827, "learning_rate": 1.0548523206751056e-05, "loss": 0.4558, "step": 126 }, { "epoch": 0.08041155520379897, "grad_norm": 0.7496590520213242, "learning_rate": 1.0632911392405063e-05, "loss": 0.4567, "step": 127 }, { "epoch": 0.08104471705579738, "grad_norm": 0.885066774923925, "learning_rate": 1.0717299578059072e-05, "loss": 0.4384, "step": 128 }, { "epoch": 0.0816778789077958, "grad_norm": 0.8222929273229034, "learning_rate": 1.080168776371308e-05, "loss": 0.4618, "step": 129 }, { "epoch": 0.08231104075979422, "grad_norm": 0.7963939640341857, "learning_rate": 1.088607594936709e-05, "loss": 0.4667, "step": 130 }, { "epoch": 0.08294420261179264, "grad_norm": 0.8845208905283207, "learning_rate": 1.0970464135021096e-05, "loss": 0.4671, "step": 131 }, { "epoch": 0.08357736446379106, "grad_norm": 0.8559795166352561, "learning_rate": 1.1054852320675107e-05, "loss": 0.4652, "step": 132 }, { "epoch": 0.08421052631578947, "grad_norm": 0.8326856954133551, "learning_rate": 1.1139240506329114e-05, "loss": 0.4613, "step": 133 }, { "epoch": 0.0848436881677879, "grad_norm": 0.8250085305291432, "learning_rate": 1.1223628691983124e-05, "loss": 0.4576, "step": 134 }, { "epoch": 0.0854768500197863, "grad_norm": 0.7978683802994626, "learning_rate": 1.1308016877637132e-05, "loss": 0.4441, "step": 135 }, { "epoch": 0.08611001187178473, "grad_norm": 0.7671707870432439, "learning_rate": 1.139240506329114e-05, "loss": 0.4431, "step": 136 }, { "epoch": 0.08674317372378314, "grad_norm": 0.7892090474440195, "learning_rate": 1.1476793248945149e-05, "loss": 0.4464, "step": 137 }, { "epoch": 0.08737633557578156, "grad_norm": 0.8677238534147906, "learning_rate": 1.1561181434599158e-05, "loss": 0.4479, "step": 138 }, { "epoch": 0.08800949742777997, "grad_norm": 0.8377818183707961, "learning_rate": 1.1645569620253165e-05, "loss": 0.4528, "step": 139 }, { "epoch": 0.0886426592797784, "grad_norm": 0.9489900539225798, "learning_rate": 1.1729957805907175e-05, "loss": 0.4683, "step": 140 }, { "epoch": 0.08927582113177682, "grad_norm": 1.1067189293997721, "learning_rate": 1.1814345991561182e-05, "loss": 0.4598, "step": 141 }, { "epoch": 0.08990898298377523, "grad_norm": 0.9044788757792249, "learning_rate": 1.189873417721519e-05, "loss": 0.4637, "step": 142 }, { "epoch": 0.09054214483577365, "grad_norm": 0.9279775992813144, "learning_rate": 1.19831223628692e-05, "loss": 0.4494, "step": 143 }, { "epoch": 0.09117530668777206, "grad_norm": 0.8341475446818292, "learning_rate": 1.2067510548523207e-05, "loss": 0.4414, "step": 144 }, { "epoch": 0.09180846853977048, "grad_norm": 0.8952801522375003, "learning_rate": 1.2151898734177216e-05, "loss": 0.4482, "step": 145 }, { "epoch": 0.09244163039176889, "grad_norm": 0.8745235900607942, "learning_rate": 1.2236286919831224e-05, "loss": 0.4471, "step": 146 }, { "epoch": 0.09307479224376732, "grad_norm": 0.7796953665801014, "learning_rate": 1.2320675105485233e-05, "loss": 0.4768, "step": 147 }, { "epoch": 0.09370795409576572, "grad_norm": 0.9767710549896571, "learning_rate": 1.240506329113924e-05, "loss": 0.4596, "step": 148 }, { "epoch": 0.09434111594776415, "grad_norm": 0.881073359719102, "learning_rate": 1.248945147679325e-05, "loss": 0.463, "step": 149 }, { "epoch": 0.09497427779976256, "grad_norm": 1.0215955731233055, "learning_rate": 1.2573839662447258e-05, "loss": 0.4552, "step": 150 }, { "epoch": 0.09560743965176098, "grad_norm": 1.9073241536848125, "learning_rate": 1.2658227848101268e-05, "loss": 0.4549, "step": 151 }, { "epoch": 0.0962406015037594, "grad_norm": 1.1085168504233651, "learning_rate": 1.2742616033755275e-05, "loss": 0.4359, "step": 152 }, { "epoch": 0.09687376335575781, "grad_norm": 0.9059665450243155, "learning_rate": 1.2827004219409284e-05, "loss": 0.4261, "step": 153 }, { "epoch": 0.09750692520775624, "grad_norm": 1.1049295545397961, "learning_rate": 1.2911392405063293e-05, "loss": 0.4587, "step": 154 }, { "epoch": 0.09814008705975465, "grad_norm": 0.9569279997230549, "learning_rate": 1.29957805907173e-05, "loss": 0.4486, "step": 155 }, { "epoch": 0.09877324891175307, "grad_norm": 0.9512943085202468, "learning_rate": 1.3080168776371309e-05, "loss": 0.4417, "step": 156 }, { "epoch": 0.09940641076375148, "grad_norm": 0.86087638159287, "learning_rate": 1.3164556962025317e-05, "loss": 0.4645, "step": 157 }, { "epoch": 0.1000395726157499, "grad_norm": 0.9762005366967628, "learning_rate": 1.3248945147679326e-05, "loss": 0.4413, "step": 158 }, { "epoch": 0.10067273446774831, "grad_norm": 0.8777150397338164, "learning_rate": 1.3333333333333333e-05, "loss": 0.4644, "step": 159 }, { "epoch": 0.10130589631974674, "grad_norm": 0.8383717265538615, "learning_rate": 1.3417721518987344e-05, "loss": 0.4528, "step": 160 }, { "epoch": 0.10193905817174516, "grad_norm": 0.8999446891338113, "learning_rate": 1.350210970464135e-05, "loss": 0.4461, "step": 161 }, { "epoch": 0.10257222002374357, "grad_norm": 0.8731011018322569, "learning_rate": 1.358649789029536e-05, "loss": 0.4552, "step": 162 }, { "epoch": 0.10320538187574199, "grad_norm": 0.9663317026371241, "learning_rate": 1.3670886075949368e-05, "loss": 0.439, "step": 163 }, { "epoch": 0.1038385437277404, "grad_norm": 0.9803672206769637, "learning_rate": 1.3755274261603377e-05, "loss": 0.4657, "step": 164 }, { "epoch": 0.10447170557973882, "grad_norm": 0.9388477867373294, "learning_rate": 1.3839662447257384e-05, "loss": 0.4438, "step": 165 }, { "epoch": 0.10510486743173723, "grad_norm": 0.9594710668786071, "learning_rate": 1.3924050632911395e-05, "loss": 0.4506, "step": 166 }, { "epoch": 0.10573802928373566, "grad_norm": 0.8874102235118828, "learning_rate": 1.4008438818565402e-05, "loss": 0.4459, "step": 167 }, { "epoch": 0.10637119113573407, "grad_norm": 1.0234393505448927, "learning_rate": 1.4092827004219412e-05, "loss": 0.4524, "step": 168 }, { "epoch": 0.10700435298773249, "grad_norm": 1.1863836555379583, "learning_rate": 1.417721518987342e-05, "loss": 0.4368, "step": 169 }, { "epoch": 0.1076375148397309, "grad_norm": 0.8845850148719895, "learning_rate": 1.4261603375527426e-05, "loss": 0.4593, "step": 170 }, { "epoch": 0.10827067669172932, "grad_norm": 1.1963542623629069, "learning_rate": 1.4345991561181437e-05, "loss": 0.4317, "step": 171 }, { "epoch": 0.10890383854372775, "grad_norm": 0.8428696730805616, "learning_rate": 1.4430379746835444e-05, "loss": 0.4396, "step": 172 }, { "epoch": 0.10953700039572616, "grad_norm": 1.0364920441135275, "learning_rate": 1.4514767932489453e-05, "loss": 0.4503, "step": 173 }, { "epoch": 0.11017016224772458, "grad_norm": 1.2017238483069341, "learning_rate": 1.459915611814346e-05, "loss": 0.4516, "step": 174 }, { "epoch": 0.11080332409972299, "grad_norm": 1.0562840596429022, "learning_rate": 1.468354430379747e-05, "loss": 0.4588, "step": 175 }, { "epoch": 0.11143648595172141, "grad_norm": 0.7493023525633755, "learning_rate": 1.4767932489451477e-05, "loss": 0.4454, "step": 176 }, { "epoch": 0.11206964780371982, "grad_norm": 1.0166616158533066, "learning_rate": 1.4852320675105488e-05, "loss": 0.451, "step": 177 }, { "epoch": 0.11270280965571824, "grad_norm": 0.8623656183423507, "learning_rate": 1.4936708860759495e-05, "loss": 0.4527, "step": 178 }, { "epoch": 0.11333597150771665, "grad_norm": 0.9714889351671637, "learning_rate": 1.5021097046413503e-05, "loss": 0.454, "step": 179 }, { "epoch": 0.11396913335971508, "grad_norm": 0.8767546116950311, "learning_rate": 1.5105485232067512e-05, "loss": 0.4282, "step": 180 }, { "epoch": 0.1146022952117135, "grad_norm": 0.9825012182558678, "learning_rate": 1.5189873417721521e-05, "loss": 0.4316, "step": 181 }, { "epoch": 0.11523545706371191, "grad_norm": 0.9427846033250604, "learning_rate": 1.5274261603375528e-05, "loss": 0.4246, "step": 182 }, { "epoch": 0.11586861891571033, "grad_norm": 0.9944189671984397, "learning_rate": 1.5358649789029537e-05, "loss": 0.4263, "step": 183 }, { "epoch": 0.11650178076770874, "grad_norm": 0.9052886248535524, "learning_rate": 1.5443037974683546e-05, "loss": 0.4517, "step": 184 }, { "epoch": 0.11713494261970717, "grad_norm": 0.8742819886276305, "learning_rate": 1.5527426160337554e-05, "loss": 0.4371, "step": 185 }, { "epoch": 0.11776810447170558, "grad_norm": 0.8237213936494939, "learning_rate": 1.5611814345991563e-05, "loss": 0.429, "step": 186 }, { "epoch": 0.118401266323704, "grad_norm": 0.8770441498048889, "learning_rate": 1.5696202531645572e-05, "loss": 0.4434, "step": 187 }, { "epoch": 0.11903442817570241, "grad_norm": 0.8382620786014895, "learning_rate": 1.578059071729958e-05, "loss": 0.4257, "step": 188 }, { "epoch": 0.11966759002770083, "grad_norm": 0.7976777445699623, "learning_rate": 1.5864978902953586e-05, "loss": 0.4367, "step": 189 }, { "epoch": 0.12030075187969924, "grad_norm": 0.8221520858960057, "learning_rate": 1.5949367088607598e-05, "loss": 0.4452, "step": 190 }, { "epoch": 0.12093391373169766, "grad_norm": 1.3238423311357521, "learning_rate": 1.6033755274261603e-05, "loss": 0.4282, "step": 191 }, { "epoch": 0.12156707558369609, "grad_norm": 2.7363110725561492, "learning_rate": 1.6118143459915612e-05, "loss": 0.4408, "step": 192 }, { "epoch": 0.1222002374356945, "grad_norm": 0.8580716737282159, "learning_rate": 1.620253164556962e-05, "loss": 0.4412, "step": 193 }, { "epoch": 0.12283339928769292, "grad_norm": 0.7583905946727146, "learning_rate": 1.628691983122363e-05, "loss": 0.4668, "step": 194 }, { "epoch": 0.12346656113969133, "grad_norm": 0.8292909441356512, "learning_rate": 1.637130801687764e-05, "loss": 0.4313, "step": 195 }, { "epoch": 0.12409972299168975, "grad_norm": 0.8628619218707285, "learning_rate": 1.6455696202531647e-05, "loss": 0.4472, "step": 196 }, { "epoch": 0.12473288484368816, "grad_norm": 0.9358404346938707, "learning_rate": 1.6540084388185656e-05, "loss": 0.4278, "step": 197 }, { "epoch": 0.12536604669568657, "grad_norm": 0.9963457710842174, "learning_rate": 1.662447257383966e-05, "loss": 0.4441, "step": 198 }, { "epoch": 0.125999208547685, "grad_norm": 0.7309248798768899, "learning_rate": 1.6708860759493674e-05, "loss": 0.426, "step": 199 }, { "epoch": 0.12663237039968342, "grad_norm": 0.8987901495559926, "learning_rate": 1.679324894514768e-05, "loss": 0.4524, "step": 200 }, { "epoch": 0.12726553225168183, "grad_norm": 0.8068232944378104, "learning_rate": 1.687763713080169e-05, "loss": 0.439, "step": 201 }, { "epoch": 0.12789869410368027, "grad_norm": 0.8554779767967822, "learning_rate": 1.6962025316455696e-05, "loss": 0.4414, "step": 202 }, { "epoch": 0.12853185595567868, "grad_norm": 0.8919945792336013, "learning_rate": 1.7046413502109705e-05, "loss": 0.4513, "step": 203 }, { "epoch": 0.12916501780767708, "grad_norm": 7.110323751792144, "learning_rate": 1.7130801687763714e-05, "loss": 0.4384, "step": 204 }, { "epoch": 0.1297981796596755, "grad_norm": 1.22943000952675, "learning_rate": 1.7215189873417723e-05, "loss": 0.4474, "step": 205 }, { "epoch": 0.13043134151167393, "grad_norm": 0.8719666586353585, "learning_rate": 1.729957805907173e-05, "loss": 0.4366, "step": 206 }, { "epoch": 0.13106450336367234, "grad_norm": 1.0782013446708596, "learning_rate": 1.738396624472574e-05, "loss": 0.4439, "step": 207 }, { "epoch": 0.13169766521567075, "grad_norm": 1.3239137870697066, "learning_rate": 1.746835443037975e-05, "loss": 0.4481, "step": 208 }, { "epoch": 0.13233082706766916, "grad_norm": 0.9460441870150338, "learning_rate": 1.7552742616033758e-05, "loss": 0.4309, "step": 209 }, { "epoch": 0.1329639889196676, "grad_norm": 0.7742489007058667, "learning_rate": 1.7637130801687767e-05, "loss": 0.4339, "step": 210 }, { "epoch": 0.133597150771666, "grad_norm": 0.945959256372493, "learning_rate": 1.7721518987341772e-05, "loss": 0.4313, "step": 211 }, { "epoch": 0.13423031262366442, "grad_norm": 26.01569152153846, "learning_rate": 1.780590717299578e-05, "loss": 0.4418, "step": 212 }, { "epoch": 0.13486347447566285, "grad_norm": 1.723118886596464, "learning_rate": 1.789029535864979e-05, "loss": 0.4695, "step": 213 }, { "epoch": 0.13549663632766126, "grad_norm": 1.057278709491382, "learning_rate": 1.7974683544303798e-05, "loss": 0.4404, "step": 214 }, { "epoch": 0.13612979817965967, "grad_norm": 1.6295765246430132, "learning_rate": 1.8059071729957807e-05, "loss": 0.4646, "step": 215 }, { "epoch": 0.13676296003165808, "grad_norm": 1.4631898510848118, "learning_rate": 1.8143459915611816e-05, "loss": 0.4342, "step": 216 }, { "epoch": 0.13739612188365652, "grad_norm": 1.1218665541578527, "learning_rate": 1.8227848101265824e-05, "loss": 0.4385, "step": 217 }, { "epoch": 0.13802928373565493, "grad_norm": 1.3800652635891215, "learning_rate": 1.8312236286919833e-05, "loss": 0.4456, "step": 218 }, { "epoch": 0.13866244558765334, "grad_norm": 0.999064351324887, "learning_rate": 1.8396624472573842e-05, "loss": 0.4384, "step": 219 }, { "epoch": 0.13929560743965175, "grad_norm": 1.2755552708422495, "learning_rate": 1.848101265822785e-05, "loss": 0.4231, "step": 220 }, { "epoch": 0.13992876929165018, "grad_norm": 1.0708810068077466, "learning_rate": 1.856540084388186e-05, "loss": 0.4257, "step": 221 }, { "epoch": 0.1405619311436486, "grad_norm": 1.0875588842449753, "learning_rate": 1.8649789029535868e-05, "loss": 0.4467, "step": 222 }, { "epoch": 0.141195092995647, "grad_norm": 1.0625441536102713, "learning_rate": 1.8734177215189874e-05, "loss": 0.4409, "step": 223 }, { "epoch": 0.14182825484764544, "grad_norm": 1.0419816162281883, "learning_rate": 1.8818565400843886e-05, "loss": 0.4484, "step": 224 }, { "epoch": 0.14246141669964385, "grad_norm": 1.0666308925190124, "learning_rate": 1.890295358649789e-05, "loss": 0.4305, "step": 225 }, { "epoch": 0.14309457855164226, "grad_norm": 0.9428491102314237, "learning_rate": 1.89873417721519e-05, "loss": 0.4391, "step": 226 }, { "epoch": 0.14372774040364067, "grad_norm": 1.0307266124141585, "learning_rate": 1.907172995780591e-05, "loss": 0.4405, "step": 227 }, { "epoch": 0.1443609022556391, "grad_norm": 1.6403175965899186, "learning_rate": 1.9156118143459917e-05, "loss": 0.4344, "step": 228 }, { "epoch": 0.14499406410763752, "grad_norm": 1.1650742331152533, "learning_rate": 1.9240506329113926e-05, "loss": 0.4219, "step": 229 }, { "epoch": 0.14562722595963593, "grad_norm": 0.8864623159522195, "learning_rate": 1.9324894514767935e-05, "loss": 0.405, "step": 230 }, { "epoch": 0.14626038781163436, "grad_norm": 0.9440602447734575, "learning_rate": 1.9409282700421944e-05, "loss": 0.4333, "step": 231 }, { "epoch": 0.14689354966363277, "grad_norm": 0.9447418189379434, "learning_rate": 1.949367088607595e-05, "loss": 0.4369, "step": 232 }, { "epoch": 0.14752671151563118, "grad_norm": 1.091943602036306, "learning_rate": 1.957805907172996e-05, "loss": 0.4427, "step": 233 }, { "epoch": 0.1481598733676296, "grad_norm": 0.9340873947999535, "learning_rate": 1.9662447257383967e-05, "loss": 0.4301, "step": 234 }, { "epoch": 0.14879303521962803, "grad_norm": 0.9182773956174171, "learning_rate": 1.974683544303798e-05, "loss": 0.4445, "step": 235 }, { "epoch": 0.14942619707162644, "grad_norm": 0.8641515638054832, "learning_rate": 1.9831223628691984e-05, "loss": 0.43, "step": 236 }, { "epoch": 0.15005935892362485, "grad_norm": 1.440269434234952, "learning_rate": 1.9915611814345993e-05, "loss": 0.4127, "step": 237 }, { "epoch": 0.15069252077562326, "grad_norm": 0.9968298240552369, "learning_rate": 2e-05, "loss": 0.4344, "step": 238 }, { "epoch": 0.1513256826276217, "grad_norm": 1.03482531343781, "learning_rate": 1.9999997566306747e-05, "loss": 0.4339, "step": 239 }, { "epoch": 0.1519588444796201, "grad_norm": 1.0323077976016846, "learning_rate": 1.9999990265228172e-05, "loss": 0.4301, "step": 240 }, { "epoch": 0.1525920063316185, "grad_norm": 0.9089794442453314, "learning_rate": 1.9999978096767827e-05, "loss": 0.4246, "step": 241 }, { "epoch": 0.15322516818361695, "grad_norm": 0.9075811799821755, "learning_rate": 1.9999961060931635e-05, "loss": 0.4327, "step": 242 }, { "epoch": 0.15385833003561536, "grad_norm": 0.8605472176980666, "learning_rate": 1.9999939157727894e-05, "loss": 0.4302, "step": 243 }, { "epoch": 0.15449149188761377, "grad_norm": 0.879460638937236, "learning_rate": 1.999991238716726e-05, "loss": 0.4315, "step": 244 }, { "epoch": 0.15512465373961218, "grad_norm": 0.8759803770010028, "learning_rate": 1.9999880749262756e-05, "loss": 0.4495, "step": 245 }, { "epoch": 0.15575781559161062, "grad_norm": 0.853399238874743, "learning_rate": 1.9999844244029797e-05, "loss": 0.4251, "step": 246 }, { "epoch": 0.15639097744360902, "grad_norm": 0.8500366188912147, "learning_rate": 1.9999802871486138e-05, "loss": 0.4407, "step": 247 }, { "epoch": 0.15702413929560743, "grad_norm": 0.8893873926160858, "learning_rate": 1.9999756631651924e-05, "loss": 0.4194, "step": 248 }, { "epoch": 0.15765730114760584, "grad_norm": 1.0893338356005946, "learning_rate": 1.999970552454966e-05, "loss": 0.4268, "step": 249 }, { "epoch": 0.15829046299960428, "grad_norm": 0.8155520400846719, "learning_rate": 1.999964955020422e-05, "loss": 0.4359, "step": 250 }, { "epoch": 0.1589236248516027, "grad_norm": 0.8375730203388012, "learning_rate": 1.9999588708642856e-05, "loss": 0.4415, "step": 251 }, { "epoch": 0.1595567867036011, "grad_norm": 0.8492208837943337, "learning_rate": 1.9999522999895175e-05, "loss": 0.4177, "step": 252 }, { "epoch": 0.16018994855559954, "grad_norm": 0.8655511559480105, "learning_rate": 1.9999452423993156e-05, "loss": 0.4418, "step": 253 }, { "epoch": 0.16082311040759795, "grad_norm": 0.8026317170399843, "learning_rate": 1.9999376980971165e-05, "loss": 0.4234, "step": 254 }, { "epoch": 0.16145627225959636, "grad_norm": 1.0588681011478824, "learning_rate": 1.9999296670865906e-05, "loss": 0.4183, "step": 255 }, { "epoch": 0.16208943411159477, "grad_norm": 2.945893453179673, "learning_rate": 1.999921149371648e-05, "loss": 0.435, "step": 256 }, { "epoch": 0.1627225959635932, "grad_norm": 1.3793725632477518, "learning_rate": 1.9999121449564347e-05, "loss": 0.4488, "step": 257 }, { "epoch": 0.1633557578155916, "grad_norm": 0.7901105053017597, "learning_rate": 1.999902653845333e-05, "loss": 0.4086, "step": 258 }, { "epoch": 0.16398891966759002, "grad_norm": 0.8762501675630634, "learning_rate": 1.9998926760429626e-05, "loss": 0.4353, "step": 259 }, { "epoch": 0.16462208151958843, "grad_norm": 0.866697828044225, "learning_rate": 1.9998822115541802e-05, "loss": 0.4356, "step": 260 }, { "epoch": 0.16525524337158687, "grad_norm": 0.9646264082335028, "learning_rate": 1.9998712603840794e-05, "loss": 0.4384, "step": 261 }, { "epoch": 0.16588840522358528, "grad_norm": 0.7653315446174197, "learning_rate": 1.9998598225379905e-05, "loss": 0.4244, "step": 262 }, { "epoch": 0.1665215670755837, "grad_norm": 0.9133899966890516, "learning_rate": 1.9998478980214805e-05, "loss": 0.432, "step": 263 }, { "epoch": 0.16715472892758212, "grad_norm": 0.7560231555174962, "learning_rate": 1.9998354868403537e-05, "loss": 0.4345, "step": 264 }, { "epoch": 0.16778789077958053, "grad_norm": 0.7408306855082744, "learning_rate": 1.9998225890006516e-05, "loss": 0.4282, "step": 265 }, { "epoch": 0.16842105263157894, "grad_norm": 0.7668343079097253, "learning_rate": 1.999809204508651e-05, "loss": 0.4122, "step": 266 }, { "epoch": 0.16905421448357735, "grad_norm": 0.7847828543856646, "learning_rate": 1.9997953333708676e-05, "loss": 0.425, "step": 267 }, { "epoch": 0.1696873763355758, "grad_norm": 0.9700104215865575, "learning_rate": 1.9997809755940525e-05, "loss": 0.4319, "step": 268 }, { "epoch": 0.1703205381875742, "grad_norm": 0.951440908059207, "learning_rate": 1.9997661311851944e-05, "loss": 0.4322, "step": 269 }, { "epoch": 0.1709537000395726, "grad_norm": 0.8235797379014453, "learning_rate": 1.9997508001515188e-05, "loss": 0.4193, "step": 270 }, { "epoch": 0.17158686189157105, "grad_norm": 0.9015736161518481, "learning_rate": 1.9997349825004876e-05, "loss": 0.4173, "step": 271 }, { "epoch": 0.17222002374356946, "grad_norm": 1.4108874763641002, "learning_rate": 1.9997186782397998e-05, "loss": 0.4275, "step": 272 }, { "epoch": 0.17285318559556787, "grad_norm": 0.8461076517043912, "learning_rate": 1.9997018873773914e-05, "loss": 0.4303, "step": 273 }, { "epoch": 0.17348634744756627, "grad_norm": 0.7653490465865177, "learning_rate": 1.9996846099214352e-05, "loss": 0.4354, "step": 274 }, { "epoch": 0.1741195092995647, "grad_norm": 0.7776142748446964, "learning_rate": 1.9996668458803412e-05, "loss": 0.4342, "step": 275 }, { "epoch": 0.17475267115156312, "grad_norm": 1.2622258996506905, "learning_rate": 1.9996485952627554e-05, "loss": 0.4205, "step": 276 }, { "epoch": 0.17538583300356153, "grad_norm": 0.8258310733800387, "learning_rate": 1.999629858077561e-05, "loss": 0.439, "step": 277 }, { "epoch": 0.17601899485555994, "grad_norm": 0.8858596524204329, "learning_rate": 1.9996106343338787e-05, "loss": 0.4096, "step": 278 }, { "epoch": 0.17665215670755838, "grad_norm": 0.8263138350087393, "learning_rate": 1.9995909240410645e-05, "loss": 0.4165, "step": 279 }, { "epoch": 0.1772853185595568, "grad_norm": 0.8569261171721425, "learning_rate": 1.9995707272087127e-05, "loss": 0.4199, "step": 280 }, { "epoch": 0.1779184804115552, "grad_norm": 0.7195725864110927, "learning_rate": 1.9995500438466543e-05, "loss": 0.4295, "step": 281 }, { "epoch": 0.17855164226355363, "grad_norm": 0.8019915879817567, "learning_rate": 1.999528873964956e-05, "loss": 0.4366, "step": 282 }, { "epoch": 0.17918480411555204, "grad_norm": 0.977634669276342, "learning_rate": 1.9995072175739226e-05, "loss": 0.4346, "step": 283 }, { "epoch": 0.17981796596755045, "grad_norm": 0.8980639195697745, "learning_rate": 1.999485074684094e-05, "loss": 0.422, "step": 284 }, { "epoch": 0.18045112781954886, "grad_norm": 0.7403130747265382, "learning_rate": 1.9994624453062493e-05, "loss": 0.4207, "step": 285 }, { "epoch": 0.1810842896715473, "grad_norm": 0.7837366502809165, "learning_rate": 1.9994393294514024e-05, "loss": 0.4214, "step": 286 }, { "epoch": 0.1817174515235457, "grad_norm": 0.7236051808101585, "learning_rate": 1.999415727130805e-05, "loss": 0.4253, "step": 287 }, { "epoch": 0.18235061337554412, "grad_norm": 0.7574869922632548, "learning_rate": 1.999391638355945e-05, "loss": 0.4153, "step": 288 }, { "epoch": 0.18298377522754253, "grad_norm": 0.7485297562186857, "learning_rate": 1.9993670631385477e-05, "loss": 0.4186, "step": 289 }, { "epoch": 0.18361693707954096, "grad_norm": 0.7853565790442948, "learning_rate": 1.999342001490574e-05, "loss": 0.4085, "step": 290 }, { "epoch": 0.18425009893153937, "grad_norm": 0.7584131107829444, "learning_rate": 1.999316453424223e-05, "loss": 0.4454, "step": 291 }, { "epoch": 0.18488326078353778, "grad_norm": 0.8172293295492046, "learning_rate": 1.9992904189519303e-05, "loss": 0.414, "step": 292 }, { "epoch": 0.18551642263553622, "grad_norm": 0.7378855198249222, "learning_rate": 1.999263898086367e-05, "loss": 0.4288, "step": 293 }, { "epoch": 0.18614958448753463, "grad_norm": 0.8368992313193568, "learning_rate": 1.9992368908404426e-05, "loss": 0.4315, "step": 294 }, { "epoch": 0.18678274633953304, "grad_norm": 0.7450750642686522, "learning_rate": 1.999209397227302e-05, "loss": 0.4226, "step": 295 }, { "epoch": 0.18741590819153145, "grad_norm": 0.8190217672605551, "learning_rate": 1.9991814172603277e-05, "loss": 0.4192, "step": 296 }, { "epoch": 0.1880490700435299, "grad_norm": 0.7219790621996496, "learning_rate": 1.9991529509531388e-05, "loss": 0.4262, "step": 297 }, { "epoch": 0.1886822318955283, "grad_norm": 1.064778237556338, "learning_rate": 1.9991239983195902e-05, "loss": 0.4039, "step": 298 }, { "epoch": 0.1893153937475267, "grad_norm": 0.8466414538825305, "learning_rate": 1.9990945593737748e-05, "loss": 0.4173, "step": 299 }, { "epoch": 0.18994855559952512, "grad_norm": 0.8571848185304766, "learning_rate": 1.999064634130022e-05, "loss": 0.4123, "step": 300 }, { "epoch": 0.19058171745152355, "grad_norm": 0.8606053477047091, "learning_rate": 1.9990342226028972e-05, "loss": 0.417, "step": 301 }, { "epoch": 0.19121487930352196, "grad_norm": 0.8225075098650887, "learning_rate": 1.9990033248072027e-05, "loss": 0.4389, "step": 302 }, { "epoch": 0.19184804115552037, "grad_norm": 1.8852006759526447, "learning_rate": 1.998971940757978e-05, "loss": 0.4202, "step": 303 }, { "epoch": 0.1924812030075188, "grad_norm": 0.8645880498130121, "learning_rate": 1.998940070470499e-05, "loss": 0.429, "step": 304 }, { "epoch": 0.19311436485951722, "grad_norm": 0.7505653185042953, "learning_rate": 1.9989077139602778e-05, "loss": 0.4296, "step": 305 }, { "epoch": 0.19374752671151563, "grad_norm": 0.8927673338085846, "learning_rate": 1.9988748712430635e-05, "loss": 0.4276, "step": 306 }, { "epoch": 0.19438068856351404, "grad_norm": 0.8026888571163797, "learning_rate": 1.9988415423348423e-05, "loss": 0.423, "step": 307 }, { "epoch": 0.19501385041551247, "grad_norm": 0.8981546171214745, "learning_rate": 1.9988077272518366e-05, "loss": 0.4253, "step": 308 }, { "epoch": 0.19564701226751088, "grad_norm": 0.8004477758447456, "learning_rate": 1.9987734260105053e-05, "loss": 0.4073, "step": 309 }, { "epoch": 0.1962801741195093, "grad_norm": 0.8434696995999319, "learning_rate": 1.9987386386275443e-05, "loss": 0.4167, "step": 310 }, { "epoch": 0.19691333597150773, "grad_norm": 0.7232962527911898, "learning_rate": 1.998703365119886e-05, "loss": 0.4128, "step": 311 }, { "epoch": 0.19754649782350614, "grad_norm": 0.8349025397753074, "learning_rate": 1.998667605504699e-05, "loss": 0.4074, "step": 312 }, { "epoch": 0.19817965967550455, "grad_norm": 0.8137962434204904, "learning_rate": 1.9986313597993898e-05, "loss": 0.4073, "step": 313 }, { "epoch": 0.19881282152750296, "grad_norm": 1.0397166400440652, "learning_rate": 1.9985946280215996e-05, "loss": 0.4234, "step": 314 }, { "epoch": 0.1994459833795014, "grad_norm": 0.7516053934662427, "learning_rate": 1.998557410189208e-05, "loss": 0.4204, "step": 315 }, { "epoch": 0.2000791452314998, "grad_norm": 0.7960556964199649, "learning_rate": 1.9985197063203293e-05, "loss": 0.4122, "step": 316 }, { "epoch": 0.20071230708349821, "grad_norm": 0.7347604817343009, "learning_rate": 1.9984815164333163e-05, "loss": 0.4149, "step": 317 }, { "epoch": 0.20134546893549662, "grad_norm": 0.8034798072373123, "learning_rate": 1.9984428405467575e-05, "loss": 0.4327, "step": 318 }, { "epoch": 0.20197863078749506, "grad_norm": 0.7232816135761722, "learning_rate": 1.9984036786794775e-05, "loss": 0.4112, "step": 319 }, { "epoch": 0.20261179263949347, "grad_norm": 0.7525315120927546, "learning_rate": 1.998364030850538e-05, "loss": 0.4075, "step": 320 }, { "epoch": 0.20324495449149188, "grad_norm": 0.8072682718748136, "learning_rate": 1.9983238970792375e-05, "loss": 0.4286, "step": 321 }, { "epoch": 0.20387811634349032, "grad_norm": 0.8123696995207504, "learning_rate": 1.99828327738511e-05, "loss": 0.4343, "step": 322 }, { "epoch": 0.20451127819548873, "grad_norm": 0.7296213260497413, "learning_rate": 1.9982421717879272e-05, "loss": 0.4103, "step": 323 }, { "epoch": 0.20514444004748714, "grad_norm": 1.2057257427433676, "learning_rate": 1.998200580307697e-05, "loss": 0.4232, "step": 324 }, { "epoch": 0.20577760189948555, "grad_norm": 0.6912740180307089, "learning_rate": 1.9981585029646625e-05, "loss": 0.4105, "step": 325 }, { "epoch": 0.20641076375148398, "grad_norm": 0.8423429911042344, "learning_rate": 1.9981159397793054e-05, "loss": 0.4079, "step": 326 }, { "epoch": 0.2070439256034824, "grad_norm": 0.6687347217925318, "learning_rate": 1.9980728907723427e-05, "loss": 0.4287, "step": 327 }, { "epoch": 0.2076770874554808, "grad_norm": 0.6914299675905126, "learning_rate": 1.998029355964728e-05, "loss": 0.4383, "step": 328 }, { "epoch": 0.2083102493074792, "grad_norm": 0.9762304991576938, "learning_rate": 1.9979853353776507e-05, "loss": 0.4183, "step": 329 }, { "epoch": 0.20894341115947765, "grad_norm": 0.7155749314854077, "learning_rate": 1.997940829032538e-05, "loss": 0.4229, "step": 330 }, { "epoch": 0.20957657301147606, "grad_norm": 0.857184143367243, "learning_rate": 1.9978958369510532e-05, "loss": 0.4003, "step": 331 }, { "epoch": 0.21020973486347447, "grad_norm": 1.574777357513006, "learning_rate": 1.9978503591550948e-05, "loss": 0.4149, "step": 332 }, { "epoch": 0.2108428967154729, "grad_norm": 0.8218009673066347, "learning_rate": 1.997804395666799e-05, "loss": 0.4357, "step": 333 }, { "epoch": 0.21147605856747131, "grad_norm": 0.6844044665088634, "learning_rate": 1.997757946508538e-05, "loss": 0.4061, "step": 334 }, { "epoch": 0.21210922041946972, "grad_norm": 0.7768210646419832, "learning_rate": 1.9977110117029202e-05, "loss": 0.4199, "step": 335 }, { "epoch": 0.21274238227146813, "grad_norm": 0.7385018936952782, "learning_rate": 1.997663591272791e-05, "loss": 0.4092, "step": 336 }, { "epoch": 0.21337554412346657, "grad_norm": 0.7675494686256527, "learning_rate": 1.9976156852412318e-05, "loss": 0.4111, "step": 337 }, { "epoch": 0.21400870597546498, "grad_norm": 0.7638013441617211, "learning_rate": 1.9975672936315596e-05, "loss": 0.3997, "step": 338 }, { "epoch": 0.2146418678274634, "grad_norm": 0.754685472766862, "learning_rate": 1.9975184164673292e-05, "loss": 0.4076, "step": 339 }, { "epoch": 0.2152750296794618, "grad_norm": 0.6827751790136249, "learning_rate": 1.9974690537723308e-05, "loss": 0.4311, "step": 340 }, { "epoch": 0.21590819153146024, "grad_norm": 0.7448284286464442, "learning_rate": 1.997419205570591e-05, "loss": 0.4142, "step": 341 }, { "epoch": 0.21654135338345865, "grad_norm": 0.6917063504591302, "learning_rate": 1.997368871886373e-05, "loss": 0.4192, "step": 342 }, { "epoch": 0.21717451523545706, "grad_norm": 0.8139230900999184, "learning_rate": 1.9973180527441757e-05, "loss": 0.4044, "step": 343 }, { "epoch": 0.2178076770874555, "grad_norm": 0.6843924465745497, "learning_rate": 1.9972667481687355e-05, "loss": 0.418, "step": 344 }, { "epoch": 0.2184408389394539, "grad_norm": 0.8290162688078465, "learning_rate": 1.9972149581850234e-05, "loss": 0.4171, "step": 345 }, { "epoch": 0.2190740007914523, "grad_norm": 0.7063936655344202, "learning_rate": 1.997162682818248e-05, "loss": 0.4277, "step": 346 }, { "epoch": 0.21970716264345072, "grad_norm": 0.7886936712704291, "learning_rate": 1.9971099220938546e-05, "loss": 0.4088, "step": 347 }, { "epoch": 0.22034032449544916, "grad_norm": 0.7031244983481736, "learning_rate": 1.9970566760375228e-05, "loss": 0.4068, "step": 348 }, { "epoch": 0.22097348634744757, "grad_norm": 0.8402377078496283, "learning_rate": 1.9970029446751695e-05, "loss": 0.4265, "step": 349 }, { "epoch": 0.22160664819944598, "grad_norm": 0.7220807188133611, "learning_rate": 1.9969487280329488e-05, "loss": 0.3942, "step": 350 }, { "epoch": 0.22223981005144441, "grad_norm": 0.7976388468966579, "learning_rate": 1.9968940261372488e-05, "loss": 0.4172, "step": 351 }, { "epoch": 0.22287297190344282, "grad_norm": 0.7930119143797857, "learning_rate": 1.996838839014696e-05, "loss": 0.4271, "step": 352 }, { "epoch": 0.22350613375544123, "grad_norm": 0.7408499459954452, "learning_rate": 1.9967831666921515e-05, "loss": 0.4004, "step": 353 }, { "epoch": 0.22413929560743964, "grad_norm": 0.9747228090820194, "learning_rate": 1.9967270091967137e-05, "loss": 0.4167, "step": 354 }, { "epoch": 0.22477245745943808, "grad_norm": 0.795619544513496, "learning_rate": 1.996670366555716e-05, "loss": 0.4194, "step": 355 }, { "epoch": 0.2254056193114365, "grad_norm": 0.8163077230629225, "learning_rate": 1.9966132387967293e-05, "loss": 0.396, "step": 356 }, { "epoch": 0.2260387811634349, "grad_norm": 0.7870292745985996, "learning_rate": 1.996555625947559e-05, "loss": 0.4414, "step": 357 }, { "epoch": 0.2266719430154333, "grad_norm": 0.6635800630818252, "learning_rate": 1.9964975280362486e-05, "loss": 0.4205, "step": 358 }, { "epoch": 0.22730510486743175, "grad_norm": 0.8956224674540982, "learning_rate": 1.9964389450910754e-05, "loss": 0.4, "step": 359 }, { "epoch": 0.22793826671943015, "grad_norm": 0.757758963439133, "learning_rate": 1.9963798771405548e-05, "loss": 0.4022, "step": 360 }, { "epoch": 0.22857142857142856, "grad_norm": 0.6905384024219277, "learning_rate": 1.9963203242134374e-05, "loss": 0.412, "step": 361 }, { "epoch": 0.229204590423427, "grad_norm": 0.7124996578792767, "learning_rate": 1.9962602863387097e-05, "loss": 0.4083, "step": 362 }, { "epoch": 0.2298377522754254, "grad_norm": 0.9220023329448379, "learning_rate": 1.996199763545594e-05, "loss": 0.422, "step": 363 }, { "epoch": 0.23047091412742382, "grad_norm": 0.7192045923812558, "learning_rate": 1.9961387558635497e-05, "loss": 0.4413, "step": 364 }, { "epoch": 0.23110407597942223, "grad_norm": 0.7217010732554293, "learning_rate": 1.996077263322272e-05, "loss": 0.4213, "step": 365 }, { "epoch": 0.23173723783142067, "grad_norm": 0.6689137968322582, "learning_rate": 1.996015285951691e-05, "loss": 0.4325, "step": 366 }, { "epoch": 0.23237039968341908, "grad_norm": 0.7344138108658487, "learning_rate": 1.9959528237819734e-05, "loss": 0.4268, "step": 367 }, { "epoch": 0.23300356153541749, "grad_norm": 0.6751372388973893, "learning_rate": 1.9958898768435223e-05, "loss": 0.4074, "step": 368 }, { "epoch": 0.2336367233874159, "grad_norm": 0.736129533946913, "learning_rate": 1.995826445166976e-05, "loss": 0.4339, "step": 369 }, { "epoch": 0.23426988523941433, "grad_norm": 0.6971326684431621, "learning_rate": 1.9957625287832097e-05, "loss": 0.4161, "step": 370 }, { "epoch": 0.23490304709141274, "grad_norm": 0.7658091969314728, "learning_rate": 1.9956981277233342e-05, "loss": 0.4248, "step": 371 }, { "epoch": 0.23553620894341115, "grad_norm": 0.6719832934332354, "learning_rate": 1.9956332420186948e-05, "loss": 0.403, "step": 372 }, { "epoch": 0.2361693707954096, "grad_norm": 0.7478090268909814, "learning_rate": 1.995567871700875e-05, "loss": 0.3939, "step": 373 }, { "epoch": 0.236802532647408, "grad_norm": 0.7024001868839143, "learning_rate": 1.9955020168016926e-05, "loss": 0.4235, "step": 374 }, { "epoch": 0.2374356944994064, "grad_norm": 0.7179020051007703, "learning_rate": 1.995435677353202e-05, "loss": 0.4115, "step": 375 }, { "epoch": 0.23806885635140482, "grad_norm": 0.7408041495630544, "learning_rate": 1.9953688533876924e-05, "loss": 0.4086, "step": 376 }, { "epoch": 0.23870201820340325, "grad_norm": 0.7014230776826398, "learning_rate": 1.9953015449376906e-05, "loss": 0.4329, "step": 377 }, { "epoch": 0.23933518005540166, "grad_norm": 0.7081240106207342, "learning_rate": 1.9952337520359573e-05, "loss": 0.4183, "step": 378 }, { "epoch": 0.23996834190740007, "grad_norm": 0.7424288707262301, "learning_rate": 1.9951654747154906e-05, "loss": 0.4088, "step": 379 }, { "epoch": 0.24060150375939848, "grad_norm": 0.6992987540507014, "learning_rate": 1.9950967130095237e-05, "loss": 0.4215, "step": 380 }, { "epoch": 0.24123466561139692, "grad_norm": 0.7507895737486899, "learning_rate": 1.9950274669515255e-05, "loss": 0.4157, "step": 381 }, { "epoch": 0.24186782746339533, "grad_norm": 0.7508707685006646, "learning_rate": 1.9949577365752005e-05, "loss": 0.398, "step": 382 }, { "epoch": 0.24250098931539374, "grad_norm": 0.8240419474599066, "learning_rate": 1.9948875219144896e-05, "loss": 0.4093, "step": 383 }, { "epoch": 0.24313415116739218, "grad_norm": 0.7283457795440621, "learning_rate": 1.9948168230035682e-05, "loss": 0.4161, "step": 384 }, { "epoch": 0.24376731301939059, "grad_norm": 0.8022783605724343, "learning_rate": 1.9947456398768486e-05, "loss": 0.4081, "step": 385 }, { "epoch": 0.244400474871389, "grad_norm": 0.8053042481734337, "learning_rate": 1.994673972568979e-05, "loss": 0.414, "step": 386 }, { "epoch": 0.2450336367233874, "grad_norm": 0.7330759037700031, "learning_rate": 1.9946018211148418e-05, "loss": 0.4096, "step": 387 }, { "epoch": 0.24566679857538584, "grad_norm": 0.7557736220621349, "learning_rate": 1.994529185549556e-05, "loss": 0.4042, "step": 388 }, { "epoch": 0.24629996042738425, "grad_norm": 0.698871494681617, "learning_rate": 1.9944560659084765e-05, "loss": 0.4006, "step": 389 }, { "epoch": 0.24693312227938266, "grad_norm": 0.7216883406870622, "learning_rate": 1.9943824622271934e-05, "loss": 0.4199, "step": 390 }, { "epoch": 0.2475662841313811, "grad_norm": 0.8093561670631765, "learning_rate": 1.9943083745415325e-05, "loss": 0.418, "step": 391 }, { "epoch": 0.2481994459833795, "grad_norm": 0.7507641401705253, "learning_rate": 1.994233802887555e-05, "loss": 0.4227, "step": 392 }, { "epoch": 0.24883260783537792, "grad_norm": 0.7464527633854482, "learning_rate": 1.9941587473015575e-05, "loss": 0.4043, "step": 393 }, { "epoch": 0.24946576968737633, "grad_norm": 0.6498087799854716, "learning_rate": 1.9940832078200733e-05, "loss": 0.4123, "step": 394 }, { "epoch": 0.25009893153937474, "grad_norm": 0.6853381563103891, "learning_rate": 1.994007184479869e-05, "loss": 0.4071, "step": 395 }, { "epoch": 0.25073209339137315, "grad_norm": 0.6808083500849335, "learning_rate": 1.9939306773179498e-05, "loss": 0.4115, "step": 396 }, { "epoch": 0.2513652552433716, "grad_norm": 0.63789899423638, "learning_rate": 1.9938536863715533e-05, "loss": 0.424, "step": 397 }, { "epoch": 0.25199841709537, "grad_norm": 0.7290640013382641, "learning_rate": 1.9937762116781547e-05, "loss": 0.3895, "step": 398 }, { "epoch": 0.25263157894736843, "grad_norm": 0.6854271497594834, "learning_rate": 1.9936982532754637e-05, "loss": 0.4326, "step": 399 }, { "epoch": 0.25326474079936684, "grad_norm": 0.677415552421906, "learning_rate": 1.9936198112014254e-05, "loss": 0.4084, "step": 400 }, { "epoch": 0.25389790265136525, "grad_norm": 0.7814074016686253, "learning_rate": 1.9935408854942216e-05, "loss": 0.434, "step": 401 }, { "epoch": 0.25453106450336366, "grad_norm": 0.7133999085103865, "learning_rate": 1.9934614761922676e-05, "loss": 0.4153, "step": 402 }, { "epoch": 0.25516422635536207, "grad_norm": 0.6865366319182933, "learning_rate": 1.9933815833342147e-05, "loss": 0.4317, "step": 403 }, { "epoch": 0.25579738820736053, "grad_norm": 0.6805407935546268, "learning_rate": 1.9933012069589506e-05, "loss": 0.4151, "step": 404 }, { "epoch": 0.25643055005935894, "grad_norm": 0.6603802979369017, "learning_rate": 1.9932203471055974e-05, "loss": 0.4208, "step": 405 }, { "epoch": 0.25706371191135735, "grad_norm": 0.7849874188556407, "learning_rate": 1.993139003813512e-05, "loss": 0.4017, "step": 406 }, { "epoch": 0.25769687376335576, "grad_norm": 0.6354960878910955, "learning_rate": 1.9930571771222888e-05, "loss": 0.3947, "step": 407 }, { "epoch": 0.25833003561535417, "grad_norm": 0.6823911668448054, "learning_rate": 1.9929748670717545e-05, "loss": 0.4108, "step": 408 }, { "epoch": 0.2589631974673526, "grad_norm": 0.6504950248792305, "learning_rate": 1.9928920737019735e-05, "loss": 0.4124, "step": 409 }, { "epoch": 0.259596359319351, "grad_norm": 0.6848790132595115, "learning_rate": 1.9928087970532442e-05, "loss": 0.4145, "step": 410 }, { "epoch": 0.26022952117134945, "grad_norm": 0.7045111195889068, "learning_rate": 1.9927250371661006e-05, "loss": 0.4225, "step": 411 }, { "epoch": 0.26086268302334786, "grad_norm": 0.7803725296508581, "learning_rate": 1.992640794081312e-05, "loss": 0.3879, "step": 412 }, { "epoch": 0.2614958448753463, "grad_norm": 0.6594044295164018, "learning_rate": 1.9925560678398826e-05, "loss": 0.3859, "step": 413 }, { "epoch": 0.2621290067273447, "grad_norm": 0.6354719656460751, "learning_rate": 1.9924708584830525e-05, "loss": 0.3996, "step": 414 }, { "epoch": 0.2627621685793431, "grad_norm": 0.6748941618813917, "learning_rate": 1.992385166052295e-05, "loss": 0.4185, "step": 415 }, { "epoch": 0.2633953304313415, "grad_norm": 0.6411178592054179, "learning_rate": 1.9922989905893215e-05, "loss": 0.4011, "step": 416 }, { "epoch": 0.2640284922833399, "grad_norm": 0.722422931640675, "learning_rate": 1.992212332136076e-05, "loss": 0.3992, "step": 417 }, { "epoch": 0.2646616541353383, "grad_norm": 0.6366548750270229, "learning_rate": 1.9921251907347386e-05, "loss": 0.4005, "step": 418 }, { "epoch": 0.2652948159873368, "grad_norm": 0.6438878473117108, "learning_rate": 1.9920375664277245e-05, "loss": 0.4174, "step": 419 }, { "epoch": 0.2659279778393352, "grad_norm": 0.6500486283701162, "learning_rate": 1.991949459257684e-05, "loss": 0.3955, "step": 420 }, { "epoch": 0.2665611396913336, "grad_norm": 0.6816699365598933, "learning_rate": 1.991860869267502e-05, "loss": 0.3956, "step": 421 }, { "epoch": 0.267194301543332, "grad_norm": 1.2261609196265353, "learning_rate": 1.991771796500299e-05, "loss": 0.395, "step": 422 }, { "epoch": 0.2678274633953304, "grad_norm": 0.6789807105980243, "learning_rate": 1.9916822409994303e-05, "loss": 0.4019, "step": 423 }, { "epoch": 0.26846062524732883, "grad_norm": 0.8049020246408666, "learning_rate": 1.991592202808485e-05, "loss": 0.3943, "step": 424 }, { "epoch": 0.26909378709932724, "grad_norm": 0.7064948539391696, "learning_rate": 1.9915016819712894e-05, "loss": 0.4059, "step": 425 }, { "epoch": 0.2697269489513257, "grad_norm": 0.6427188978497465, "learning_rate": 1.991410678531903e-05, "loss": 0.4144, "step": 426 }, { "epoch": 0.2703601108033241, "grad_norm": 0.6776815143230073, "learning_rate": 1.99131919253462e-05, "loss": 0.4023, "step": 427 }, { "epoch": 0.2709932726553225, "grad_norm": 0.7640449350066261, "learning_rate": 1.9912272240239715e-05, "loss": 0.4119, "step": 428 }, { "epoch": 0.27162643450732094, "grad_norm": 0.7433008787102598, "learning_rate": 1.9911347730447216e-05, "loss": 0.4027, "step": 429 }, { "epoch": 0.27225959635931934, "grad_norm": 0.6986847046085851, "learning_rate": 1.9910418396418688e-05, "loss": 0.3906, "step": 430 }, { "epoch": 0.27289275821131775, "grad_norm": 0.7457267852618008, "learning_rate": 1.990948423860649e-05, "loss": 0.3999, "step": 431 }, { "epoch": 0.27352592006331616, "grad_norm": 0.6987167019593074, "learning_rate": 1.9908545257465297e-05, "loss": 0.4057, "step": 432 }, { "epoch": 0.27415908191531463, "grad_norm": 0.6985770415769124, "learning_rate": 1.9907601453452158e-05, "loss": 0.4035, "step": 433 }, { "epoch": 0.27479224376731304, "grad_norm": 0.7402920821203979, "learning_rate": 1.990665282702646e-05, "loss": 0.4031, "step": 434 }, { "epoch": 0.27542540561931145, "grad_norm": 0.669666955365112, "learning_rate": 1.9905699378649927e-05, "loss": 0.3898, "step": 435 }, { "epoch": 0.27605856747130986, "grad_norm": 0.6448427728244894, "learning_rate": 1.990474110878665e-05, "loss": 0.4018, "step": 436 }, { "epoch": 0.27669172932330827, "grad_norm": 0.6523253397537625, "learning_rate": 1.9903778017903045e-05, "loss": 0.3983, "step": 437 }, { "epoch": 0.2773248911753067, "grad_norm": 0.6642855935503792, "learning_rate": 1.990281010646789e-05, "loss": 0.4184, "step": 438 }, { "epoch": 0.2779580530273051, "grad_norm": 0.6869767110655316, "learning_rate": 1.9901837374952308e-05, "loss": 0.4155, "step": 439 }, { "epoch": 0.2785912148793035, "grad_norm": 0.6829837955723447, "learning_rate": 1.9900859823829763e-05, "loss": 0.3965, "step": 440 }, { "epoch": 0.27922437673130196, "grad_norm": 0.6319178277109848, "learning_rate": 1.9899877453576066e-05, "loss": 0.4124, "step": 441 }, { "epoch": 0.27985753858330037, "grad_norm": 0.76474559102417, "learning_rate": 1.9898890264669376e-05, "loss": 0.4051, "step": 442 }, { "epoch": 0.2804907004352988, "grad_norm": 0.6442957639772616, "learning_rate": 1.9897898257590193e-05, "loss": 0.4006, "step": 443 }, { "epoch": 0.2811238622872972, "grad_norm": 0.7105403174111444, "learning_rate": 1.989690143282137e-05, "loss": 0.4007, "step": 444 }, { "epoch": 0.2817570241392956, "grad_norm": 0.6519096058916409, "learning_rate": 1.9895899790848097e-05, "loss": 0.3942, "step": 445 }, { "epoch": 0.282390185991294, "grad_norm": 0.7866915815142667, "learning_rate": 1.989489333215791e-05, "loss": 0.403, "step": 446 }, { "epoch": 0.2830233478432924, "grad_norm": 0.709564763183452, "learning_rate": 1.9893882057240698e-05, "loss": 0.3994, "step": 447 }, { "epoch": 0.2836565096952909, "grad_norm": 0.8758413991658224, "learning_rate": 1.989286596658868e-05, "loss": 0.3959, "step": 448 }, { "epoch": 0.2842896715472893, "grad_norm": 0.7217410485417768, "learning_rate": 1.989184506069643e-05, "loss": 0.3928, "step": 449 }, { "epoch": 0.2849228333992877, "grad_norm": 0.621121242725828, "learning_rate": 1.9890819340060867e-05, "loss": 0.4069, "step": 450 }, { "epoch": 0.2855559952512861, "grad_norm": 0.6889078158370837, "learning_rate": 1.9889788805181237e-05, "loss": 0.3929, "step": 451 }, { "epoch": 0.2861891571032845, "grad_norm": 0.6048375314885988, "learning_rate": 1.9888753456559154e-05, "loss": 0.3804, "step": 452 }, { "epoch": 0.28682231895528293, "grad_norm": 1.0625094402423183, "learning_rate": 1.9887713294698553e-05, "loss": 0.3832, "step": 453 }, { "epoch": 0.28745548080728134, "grad_norm": 0.6675045672398878, "learning_rate": 1.9886668320105724e-05, "loss": 0.4044, "step": 454 }, { "epoch": 0.2880886426592798, "grad_norm": 0.6572192161658723, "learning_rate": 1.98856185332893e-05, "loss": 0.3971, "step": 455 }, { "epoch": 0.2887218045112782, "grad_norm": 0.6211810835369563, "learning_rate": 1.988456393476025e-05, "loss": 0.4126, "step": 456 }, { "epoch": 0.2893549663632766, "grad_norm": 0.6385818805170462, "learning_rate": 1.9883504525031886e-05, "loss": 0.4019, "step": 457 }, { "epoch": 0.28998812821527503, "grad_norm": 3.425157413281736, "learning_rate": 1.9882440304619866e-05, "loss": 0.4059, "step": 458 }, { "epoch": 0.29062129006727344, "grad_norm": 0.7797836398781998, "learning_rate": 1.9881371274042185e-05, "loss": 0.4022, "step": 459 }, { "epoch": 0.29125445191927185, "grad_norm": 0.6609269794905638, "learning_rate": 1.9880297433819185e-05, "loss": 0.3979, "step": 460 }, { "epoch": 0.29188761377127026, "grad_norm": 0.6271558555866004, "learning_rate": 1.987921878447354e-05, "loss": 0.404, "step": 461 }, { "epoch": 0.2925207756232687, "grad_norm": 0.6660326446690865, "learning_rate": 1.987813532653028e-05, "loss": 0.4139, "step": 462 }, { "epoch": 0.29315393747526713, "grad_norm": 0.6810516584695148, "learning_rate": 1.9877047060516755e-05, "loss": 0.398, "step": 463 }, { "epoch": 0.29378709932726554, "grad_norm": 0.7239622247254914, "learning_rate": 1.9875953986962674e-05, "loss": 0.3995, "step": 464 }, { "epoch": 0.29442026117926395, "grad_norm": 0.6463611359759586, "learning_rate": 1.9874856106400072e-05, "loss": 0.3915, "step": 465 }, { "epoch": 0.29505342303126236, "grad_norm": 0.64587905462018, "learning_rate": 1.9873753419363336e-05, "loss": 0.4067, "step": 466 }, { "epoch": 0.2956865848832608, "grad_norm": 0.665621659289357, "learning_rate": 1.9872645926389184e-05, "loss": 0.4071, "step": 467 }, { "epoch": 0.2963197467352592, "grad_norm": 0.669497328709441, "learning_rate": 1.987153362801667e-05, "loss": 0.392, "step": 468 }, { "epoch": 0.2969529085872576, "grad_norm": 0.6820994286152706, "learning_rate": 1.98704165247872e-05, "loss": 0.4015, "step": 469 }, { "epoch": 0.29758607043925606, "grad_norm": 0.6484220214257882, "learning_rate": 1.9869294617244513e-05, "loss": 0.4006, "step": 470 }, { "epoch": 0.29821923229125447, "grad_norm": 0.6846563214675042, "learning_rate": 1.9868167905934673e-05, "loss": 0.4069, "step": 471 }, { "epoch": 0.2988523941432529, "grad_norm": 0.6665443704738881, "learning_rate": 1.986703639140611e-05, "loss": 0.3888, "step": 472 }, { "epoch": 0.2994855559952513, "grad_norm": 0.6087450834233915, "learning_rate": 1.9865900074209562e-05, "loss": 0.4014, "step": 473 }, { "epoch": 0.3001187178472497, "grad_norm": 0.693299888422899, "learning_rate": 1.9864758954898126e-05, "loss": 0.3929, "step": 474 }, { "epoch": 0.3007518796992481, "grad_norm": 0.6696909755250009, "learning_rate": 1.9863613034027224e-05, "loss": 0.3845, "step": 475 }, { "epoch": 0.3013850415512465, "grad_norm": 0.6722244599005365, "learning_rate": 1.9862462312154627e-05, "loss": 0.4012, "step": 476 }, { "epoch": 0.302018203403245, "grad_norm": 0.6643094706070576, "learning_rate": 1.9861306789840428e-05, "loss": 0.4036, "step": 477 }, { "epoch": 0.3026513652552434, "grad_norm": 0.7656071014155773, "learning_rate": 1.9860146467647072e-05, "loss": 0.3902, "step": 478 }, { "epoch": 0.3032845271072418, "grad_norm": 0.6536794640350777, "learning_rate": 1.9858981346139325e-05, "loss": 0.3888, "step": 479 }, { "epoch": 0.3039176889592402, "grad_norm": 1.1237719694332617, "learning_rate": 1.9857811425884305e-05, "loss": 0.3972, "step": 480 }, { "epoch": 0.3045508508112386, "grad_norm": 0.6726617983327701, "learning_rate": 1.9856636707451446e-05, "loss": 0.3915, "step": 481 }, { "epoch": 0.305184012663237, "grad_norm": 0.6533938362449629, "learning_rate": 1.9855457191412538e-05, "loss": 0.3919, "step": 482 }, { "epoch": 0.30581717451523543, "grad_norm": 1.0968168075024003, "learning_rate": 1.9854272878341698e-05, "loss": 0.4119, "step": 483 }, { "epoch": 0.3064503363672339, "grad_norm": 0.6848687030859072, "learning_rate": 1.9853083768815372e-05, "loss": 0.4213, "step": 484 }, { "epoch": 0.3070834982192323, "grad_norm": 0.6737661089234482, "learning_rate": 1.9851889863412347e-05, "loss": 0.4015, "step": 485 }, { "epoch": 0.3077166600712307, "grad_norm": 0.664845265219969, "learning_rate": 1.9850691162713743e-05, "loss": 0.3959, "step": 486 }, { "epoch": 0.30834982192322913, "grad_norm": 0.6693607818506827, "learning_rate": 1.9849487667303013e-05, "loss": 0.393, "step": 487 }, { "epoch": 0.30898298377522754, "grad_norm": 0.7465192390006706, "learning_rate": 1.9848279377765948e-05, "loss": 0.3979, "step": 488 }, { "epoch": 0.30961614562722595, "grad_norm": 0.6642756706896186, "learning_rate": 1.9847066294690666e-05, "loss": 0.3915, "step": 489 }, { "epoch": 0.31024930747922436, "grad_norm": 0.6980042276418105, "learning_rate": 1.984584841866762e-05, "loss": 0.4023, "step": 490 }, { "epoch": 0.3108824693312228, "grad_norm": 0.7027671099394948, "learning_rate": 1.9844625750289602e-05, "loss": 0.4037, "step": 491 }, { "epoch": 0.31151563118322123, "grad_norm": 0.9496430018848955, "learning_rate": 1.984339829015173e-05, "loss": 0.4081, "step": 492 }, { "epoch": 0.31214879303521964, "grad_norm": 0.7288244141522842, "learning_rate": 1.9842166038851455e-05, "loss": 0.3884, "step": 493 }, { "epoch": 0.31278195488721805, "grad_norm": 0.628924174936853, "learning_rate": 1.9840928996988565e-05, "loss": 0.3945, "step": 494 }, { "epoch": 0.31341511673921646, "grad_norm": 0.7305817711984398, "learning_rate": 1.9839687165165174e-05, "loss": 0.3941, "step": 495 }, { "epoch": 0.31404827859121487, "grad_norm": 0.6473391639649254, "learning_rate": 1.9838440543985723e-05, "loss": 0.403, "step": 496 }, { "epoch": 0.3146814404432133, "grad_norm": 0.7544317307031616, "learning_rate": 1.9837189134057e-05, "loss": 0.4057, "step": 497 }, { "epoch": 0.3153146022952117, "grad_norm": 0.7421513985831454, "learning_rate": 1.9835932935988112e-05, "loss": 0.4009, "step": 498 }, { "epoch": 0.31594776414721015, "grad_norm": 0.9130706392578081, "learning_rate": 1.9834671950390502e-05, "loss": 0.4078, "step": 499 }, { "epoch": 0.31658092599920856, "grad_norm": 0.7504322826945532, "learning_rate": 1.983340617787793e-05, "loss": 0.4047, "step": 500 }, { "epoch": 0.31721408785120697, "grad_norm": 0.813438454346561, "learning_rate": 1.9832135619066506e-05, "loss": 0.3918, "step": 501 }, { "epoch": 0.3178472497032054, "grad_norm": 0.6747059330069289, "learning_rate": 1.983086027457466e-05, "loss": 0.3965, "step": 502 }, { "epoch": 0.3184804115552038, "grad_norm": 0.7110990028417714, "learning_rate": 1.9829580145023143e-05, "loss": 0.415, "step": 503 }, { "epoch": 0.3191135734072022, "grad_norm": 0.7052269311636193, "learning_rate": 1.9828295231035054e-05, "loss": 0.3901, "step": 504 }, { "epoch": 0.3197467352592006, "grad_norm": 0.6710874100634775, "learning_rate": 1.9827005533235804e-05, "loss": 0.3985, "step": 505 }, { "epoch": 0.3203798971111991, "grad_norm": 0.6595211171683181, "learning_rate": 1.982571105225314e-05, "loss": 0.3887, "step": 506 }, { "epoch": 0.3210130589631975, "grad_norm": 0.6074048992249659, "learning_rate": 1.9824411788717137e-05, "loss": 0.3958, "step": 507 }, { "epoch": 0.3216462208151959, "grad_norm": 0.6859193097883356, "learning_rate": 1.9823107743260196e-05, "loss": 0.4069, "step": 508 }, { "epoch": 0.3222793826671943, "grad_norm": 0.6765227409775045, "learning_rate": 1.9821798916517047e-05, "loss": 0.3901, "step": 509 }, { "epoch": 0.3229125445191927, "grad_norm": 0.6920194881295104, "learning_rate": 1.9820485309124745e-05, "loss": 0.3834, "step": 510 }, { "epoch": 0.3235457063711911, "grad_norm": 0.7214010724182123, "learning_rate": 1.981916692172267e-05, "loss": 0.393, "step": 511 }, { "epoch": 0.32417886822318953, "grad_norm": 0.6741979248186699, "learning_rate": 1.9817843754952544e-05, "loss": 0.407, "step": 512 }, { "epoch": 0.324812030075188, "grad_norm": 0.7570484195991247, "learning_rate": 1.9816515809458393e-05, "loss": 0.3992, "step": 513 }, { "epoch": 0.3254451919271864, "grad_norm": 0.6597436675888937, "learning_rate": 1.9815183085886583e-05, "loss": 0.3962, "step": 514 }, { "epoch": 0.3260783537791848, "grad_norm": 0.7974954484415969, "learning_rate": 1.98138455848858e-05, "loss": 0.4061, "step": 515 }, { "epoch": 0.3267115156311832, "grad_norm": 0.7098940541253767, "learning_rate": 1.9812503307107064e-05, "loss": 0.363, "step": 516 }, { "epoch": 0.32734467748318163, "grad_norm": 0.7253685738374493, "learning_rate": 1.9811156253203702e-05, "loss": 0.415, "step": 517 }, { "epoch": 0.32797783933518004, "grad_norm": 0.6065326182736357, "learning_rate": 1.9809804423831386e-05, "loss": 0.4043, "step": 518 }, { "epoch": 0.32861100118717845, "grad_norm": 0.6988268651257613, "learning_rate": 1.98084478196481e-05, "loss": 0.3896, "step": 519 }, { "epoch": 0.32924416303917686, "grad_norm": 0.8827781280530197, "learning_rate": 1.980708644131416e-05, "loss": 0.3983, "step": 520 }, { "epoch": 0.3298773248911753, "grad_norm": 0.6460467681015282, "learning_rate": 1.9805720289492196e-05, "loss": 0.393, "step": 521 }, { "epoch": 0.33051048674317374, "grad_norm": 0.6925358892214128, "learning_rate": 1.980434936484717e-05, "loss": 0.3929, "step": 522 }, { "epoch": 0.33114364859517215, "grad_norm": 0.6292371245852798, "learning_rate": 1.9802973668046364e-05, "loss": 0.3957, "step": 523 }, { "epoch": 0.33177681044717056, "grad_norm": 0.614656736506358, "learning_rate": 1.980159319975938e-05, "loss": 0.406, "step": 524 }, { "epoch": 0.33240997229916897, "grad_norm": 0.6821927021101496, "learning_rate": 1.9800207960658148e-05, "loss": 0.3968, "step": 525 }, { "epoch": 0.3330431341511674, "grad_norm": 0.6592630580804905, "learning_rate": 1.9798817951416914e-05, "loss": 0.4062, "step": 526 }, { "epoch": 0.3336762960031658, "grad_norm": 0.6382803206887339, "learning_rate": 1.979742317271226e-05, "loss": 0.4027, "step": 527 }, { "epoch": 0.33430945785516425, "grad_norm": 0.7272309944684486, "learning_rate": 1.9796023625223063e-05, "loss": 0.4093, "step": 528 }, { "epoch": 0.33494261970716266, "grad_norm": 0.6275131126634955, "learning_rate": 1.9794619309630546e-05, "loss": 0.3981, "step": 529 }, { "epoch": 0.33557578155916107, "grad_norm": 0.7013375411368048, "learning_rate": 1.9793210226618244e-05, "loss": 0.3919, "step": 530 }, { "epoch": 0.3362089434111595, "grad_norm": 0.6192486425330196, "learning_rate": 1.979179637687201e-05, "loss": 0.3928, "step": 531 }, { "epoch": 0.3368421052631579, "grad_norm": 0.6593450525283062, "learning_rate": 1.979037776108002e-05, "loss": 0.3767, "step": 532 }, { "epoch": 0.3374752671151563, "grad_norm": 0.6848560674671161, "learning_rate": 1.9788954379932767e-05, "loss": 0.3832, "step": 533 }, { "epoch": 0.3381084289671547, "grad_norm": 0.6055929425576437, "learning_rate": 1.978752623412307e-05, "loss": 0.3922, "step": 534 }, { "epoch": 0.33874159081915317, "grad_norm": 0.6907938825779556, "learning_rate": 1.9786093324346057e-05, "loss": 0.4012, "step": 535 }, { "epoch": 0.3393747526711516, "grad_norm": 0.6554287621805062, "learning_rate": 1.9784655651299185e-05, "loss": 0.3964, "step": 536 }, { "epoch": 0.34000791452315, "grad_norm": 0.6361617550915152, "learning_rate": 1.9783213215682225e-05, "loss": 0.4009, "step": 537 }, { "epoch": 0.3406410763751484, "grad_norm": 0.6903796556595394, "learning_rate": 1.978176601819726e-05, "loss": 0.3839, "step": 538 }, { "epoch": 0.3412742382271468, "grad_norm": 0.6951997165469059, "learning_rate": 1.978031405954871e-05, "loss": 0.3902, "step": 539 }, { "epoch": 0.3419074000791452, "grad_norm": 0.6227056534135867, "learning_rate": 1.9778857340443287e-05, "loss": 0.3911, "step": 540 }, { "epoch": 0.34254056193114363, "grad_norm": 0.7563634877235174, "learning_rate": 1.9777395861590036e-05, "loss": 0.4033, "step": 541 }, { "epoch": 0.3431737237831421, "grad_norm": 0.6747130687988845, "learning_rate": 1.9775929623700318e-05, "loss": 0.3973, "step": 542 }, { "epoch": 0.3438068856351405, "grad_norm": 0.6322093826910358, "learning_rate": 1.9774458627487805e-05, "loss": 0.3934, "step": 543 }, { "epoch": 0.3444400474871389, "grad_norm": 0.695935580644979, "learning_rate": 1.9772982873668487e-05, "loss": 0.3833, "step": 544 }, { "epoch": 0.3450732093391373, "grad_norm": 0.6214601098961653, "learning_rate": 1.9771502362960673e-05, "loss": 0.413, "step": 545 }, { "epoch": 0.34570637119113573, "grad_norm": 0.6722016447500148, "learning_rate": 1.9770017096084983e-05, "loss": 0.4112, "step": 546 }, { "epoch": 0.34633953304313414, "grad_norm": 0.7028120555162899, "learning_rate": 1.9768527073764355e-05, "loss": 0.393, "step": 547 }, { "epoch": 0.34697269489513255, "grad_norm": 1.310795022678624, "learning_rate": 1.976703229672404e-05, "loss": 0.3882, "step": 548 }, { "epoch": 0.34760585674713096, "grad_norm": 0.6917772387139676, "learning_rate": 1.9765532765691604e-05, "loss": 0.3939, "step": 549 }, { "epoch": 0.3482390185991294, "grad_norm": 0.6545279428994923, "learning_rate": 1.9764028481396928e-05, "loss": 0.4027, "step": 550 }, { "epoch": 0.34887218045112783, "grad_norm": 0.6702559519688648, "learning_rate": 1.97625194445722e-05, "loss": 0.3985, "step": 551 }, { "epoch": 0.34950534230312624, "grad_norm": 0.7253925348315586, "learning_rate": 1.976100565595193e-05, "loss": 0.3993, "step": 552 }, { "epoch": 0.35013850415512465, "grad_norm": 0.6299557255383034, "learning_rate": 1.9759487116272938e-05, "loss": 0.3892, "step": 553 }, { "epoch": 0.35077166600712306, "grad_norm": 0.7524042790238693, "learning_rate": 1.9757963826274357e-05, "loss": 0.3919, "step": 554 }, { "epoch": 0.35140482785912147, "grad_norm": 0.6801529255636802, "learning_rate": 1.975643578669763e-05, "loss": 0.3887, "step": 555 }, { "epoch": 0.3520379897111199, "grad_norm": 0.790807652876966, "learning_rate": 1.975490299828651e-05, "loss": 0.3904, "step": 556 }, { "epoch": 0.35267115156311835, "grad_norm": 0.7119191441250614, "learning_rate": 1.975336546178707e-05, "loss": 0.391, "step": 557 }, { "epoch": 0.35330431341511676, "grad_norm": 0.675277714949934, "learning_rate": 1.9751823177947686e-05, "loss": 0.3912, "step": 558 }, { "epoch": 0.35393747526711516, "grad_norm": 0.6719629813902523, "learning_rate": 1.9750276147519045e-05, "loss": 0.3924, "step": 559 }, { "epoch": 0.3545706371191136, "grad_norm": 0.6586479400294157, "learning_rate": 1.9748724371254146e-05, "loss": 0.3964, "step": 560 }, { "epoch": 0.355203798971112, "grad_norm": 0.7321821528989753, "learning_rate": 1.9747167849908305e-05, "loss": 0.3958, "step": 561 }, { "epoch": 0.3558369608231104, "grad_norm": 0.6224036249359008, "learning_rate": 1.974560658423913e-05, "loss": 0.389, "step": 562 }, { "epoch": 0.3564701226751088, "grad_norm": 0.7561348639700095, "learning_rate": 1.9744040575006563e-05, "loss": 0.4005, "step": 563 }, { "epoch": 0.35710328452710727, "grad_norm": 0.609410574182543, "learning_rate": 1.9742469822972832e-05, "loss": 0.4127, "step": 564 }, { "epoch": 0.3577364463791057, "grad_norm": 0.7683910303509521, "learning_rate": 1.9740894328902483e-05, "loss": 0.3877, "step": 565 }, { "epoch": 0.3583696082311041, "grad_norm": 0.7074193333189221, "learning_rate": 1.9739314093562373e-05, "loss": 0.3969, "step": 566 }, { "epoch": 0.3590027700831025, "grad_norm": 0.7347611638079957, "learning_rate": 1.9737729117721664e-05, "loss": 0.3777, "step": 567 }, { "epoch": 0.3596359319351009, "grad_norm": 0.6744678411209413, "learning_rate": 1.973613940215182e-05, "loss": 0.3772, "step": 568 }, { "epoch": 0.3602690937870993, "grad_norm": 4.154614364315114, "learning_rate": 1.9734544947626623e-05, "loss": 0.3914, "step": 569 }, { "epoch": 0.3609022556390977, "grad_norm": 0.6966888639833063, "learning_rate": 1.9732945754922155e-05, "loss": 0.4004, "step": 570 }, { "epoch": 0.36153541749109613, "grad_norm": 0.5951247378536516, "learning_rate": 1.97313418248168e-05, "loss": 0.3818, "step": 571 }, { "epoch": 0.3621685793430946, "grad_norm": 0.6505214950704158, "learning_rate": 1.9729733158091255e-05, "loss": 0.3922, "step": 572 }, { "epoch": 0.362801741195093, "grad_norm": 0.5874109026377015, "learning_rate": 1.972811975552852e-05, "loss": 0.3869, "step": 573 }, { "epoch": 0.3634349030470914, "grad_norm": 0.6087357796275259, "learning_rate": 1.9726501617913904e-05, "loss": 0.3794, "step": 574 }, { "epoch": 0.3640680648990898, "grad_norm": 0.616559647133867, "learning_rate": 1.9724878746035013e-05, "loss": 0.3847, "step": 575 }, { "epoch": 0.36470122675108824, "grad_norm": 0.6757752943261643, "learning_rate": 1.9723251140681768e-05, "loss": 0.3786, "step": 576 }, { "epoch": 0.36533438860308665, "grad_norm": 0.6904357039523144, "learning_rate": 1.9721618802646375e-05, "loss": 0.3963, "step": 577 }, { "epoch": 0.36596755045508506, "grad_norm": 0.6518745969804765, "learning_rate": 1.971998173272337e-05, "loss": 0.3844, "step": 578 }, { "epoch": 0.3666007123070835, "grad_norm": 0.6676891825207799, "learning_rate": 1.9718339931709568e-05, "loss": 0.389, "step": 579 }, { "epoch": 0.36723387415908193, "grad_norm": 0.6603765076936063, "learning_rate": 1.97166934004041e-05, "loss": 0.3911, "step": 580 }, { "epoch": 0.36786703601108034, "grad_norm": 0.6059597470252573, "learning_rate": 1.97150421396084e-05, "loss": 0.384, "step": 581 }, { "epoch": 0.36850019786307875, "grad_norm": 0.6109314457633623, "learning_rate": 1.9713386150126193e-05, "loss": 0.393, "step": 582 }, { "epoch": 0.36913335971507716, "grad_norm": 0.6259929140318727, "learning_rate": 1.9711725432763524e-05, "loss": 0.39, "step": 583 }, { "epoch": 0.36976652156707557, "grad_norm": 0.6368944060736598, "learning_rate": 1.971005998832872e-05, "loss": 0.3984, "step": 584 }, { "epoch": 0.370399683419074, "grad_norm": 0.592053455350519, "learning_rate": 1.9708389817632416e-05, "loss": 0.4063, "step": 585 }, { "epoch": 0.37103284527107244, "grad_norm": 0.6143254320832072, "learning_rate": 1.9706714921487554e-05, "loss": 0.3752, "step": 586 }, { "epoch": 0.37166600712307085, "grad_norm": 0.622633536636713, "learning_rate": 1.970503530070937e-05, "loss": 0.4064, "step": 587 }, { "epoch": 0.37229916897506926, "grad_norm": 0.6129225891512118, "learning_rate": 1.9703350956115393e-05, "loss": 0.3714, "step": 588 }, { "epoch": 0.37293233082706767, "grad_norm": 0.6222143219625977, "learning_rate": 1.970166188852547e-05, "loss": 0.3713, "step": 589 }, { "epoch": 0.3735654926790661, "grad_norm": 0.6105026490232913, "learning_rate": 1.9699968098761724e-05, "loss": 0.3986, "step": 590 }, { "epoch": 0.3741986545310645, "grad_norm": 0.6369217438082535, "learning_rate": 1.96982695876486e-05, "loss": 0.3815, "step": 591 }, { "epoch": 0.3748318163830629, "grad_norm": 0.6147599289586447, "learning_rate": 1.969656635601282e-05, "loss": 0.3858, "step": 592 }, { "epoch": 0.37546497823506136, "grad_norm": 0.6174658842128438, "learning_rate": 1.9694858404683416e-05, "loss": 0.3944, "step": 593 }, { "epoch": 0.3760981400870598, "grad_norm": 0.6642782958067294, "learning_rate": 1.9693145734491714e-05, "loss": 0.3794, "step": 594 }, { "epoch": 0.3767313019390582, "grad_norm": 0.6319294691241052, "learning_rate": 1.9691428346271334e-05, "loss": 0.4002, "step": 595 }, { "epoch": 0.3773644637910566, "grad_norm": 0.6137025755205253, "learning_rate": 1.96897062408582e-05, "loss": 0.4143, "step": 596 }, { "epoch": 0.377997625643055, "grad_norm": 0.7203618358496631, "learning_rate": 1.9687979419090528e-05, "loss": 0.3787, "step": 597 }, { "epoch": 0.3786307874950534, "grad_norm": 0.6210682169791237, "learning_rate": 1.968624788180882e-05, "loss": 0.3864, "step": 598 }, { "epoch": 0.3792639493470518, "grad_norm": 0.633331389833362, "learning_rate": 1.968451162985589e-05, "loss": 0.3914, "step": 599 }, { "epoch": 0.37989711119905023, "grad_norm": 0.6947894110684079, "learning_rate": 1.9682770664076834e-05, "loss": 0.3991, "step": 600 }, { "epoch": 0.3805302730510487, "grad_norm": 0.5901656283199777, "learning_rate": 1.9681024985319054e-05, "loss": 0.3949, "step": 601 }, { "epoch": 0.3811634349030471, "grad_norm": 0.6445841343429741, "learning_rate": 1.967927459443223e-05, "loss": 0.3822, "step": 602 }, { "epoch": 0.3817965967550455, "grad_norm": 0.6178725601506304, "learning_rate": 1.9677519492268357e-05, "loss": 0.3842, "step": 603 }, { "epoch": 0.3824297586070439, "grad_norm": 0.6201648492321139, "learning_rate": 1.9675759679681697e-05, "loss": 0.3935, "step": 604 }, { "epoch": 0.38306292045904233, "grad_norm": 0.6721581308947016, "learning_rate": 1.967399515752883e-05, "loss": 0.3977, "step": 605 }, { "epoch": 0.38369608231104074, "grad_norm": 0.5878300455053511, "learning_rate": 1.9672225926668616e-05, "loss": 0.3932, "step": 606 }, { "epoch": 0.38432924416303915, "grad_norm": 0.6587030180765424, "learning_rate": 1.96704519879622e-05, "loss": 0.3965, "step": 607 }, { "epoch": 0.3849624060150376, "grad_norm": 0.5945371639396212, "learning_rate": 1.9668673342273034e-05, "loss": 0.3826, "step": 608 }, { "epoch": 0.385595567867036, "grad_norm": 0.698069647900001, "learning_rate": 1.966688999046685e-05, "loss": 0.3988, "step": 609 }, { "epoch": 0.38622872971903444, "grad_norm": 0.6025615713326364, "learning_rate": 1.9665101933411682e-05, "loss": 0.4001, "step": 610 }, { "epoch": 0.38686189157103285, "grad_norm": 0.6826825675388501, "learning_rate": 1.9663309171977832e-05, "loss": 0.3867, "step": 611 }, { "epoch": 0.38749505342303125, "grad_norm": 0.7127351987945358, "learning_rate": 1.9661511707037918e-05, "loss": 0.3738, "step": 612 }, { "epoch": 0.38812821527502966, "grad_norm": 0.6450554915083905, "learning_rate": 1.9659709539466833e-05, "loss": 0.3873, "step": 613 }, { "epoch": 0.3887613771270281, "grad_norm": 0.7374473227769845, "learning_rate": 1.965790267014176e-05, "loss": 0.3969, "step": 614 }, { "epoch": 0.38939453897902654, "grad_norm": 0.7060144132563971, "learning_rate": 1.965609109994217e-05, "loss": 0.3955, "step": 615 }, { "epoch": 0.39002770083102495, "grad_norm": 0.6419252767914948, "learning_rate": 1.9654274829749828e-05, "loss": 0.3758, "step": 616 }, { "epoch": 0.39066086268302336, "grad_norm": 0.6798616538156798, "learning_rate": 1.9652453860448784e-05, "loss": 0.3924, "step": 617 }, { "epoch": 0.39129402453502177, "grad_norm": 0.6143708609928734, "learning_rate": 1.9650628192925372e-05, "loss": 0.3983, "step": 618 }, { "epoch": 0.3919271863870202, "grad_norm": 0.6509590221593116, "learning_rate": 1.9648797828068214e-05, "loss": 0.3826, "step": 619 }, { "epoch": 0.3925603482390186, "grad_norm": 0.6988814909304661, "learning_rate": 1.964696276676822e-05, "loss": 0.3914, "step": 620 }, { "epoch": 0.393193510091017, "grad_norm": 0.5790037525688576, "learning_rate": 1.964512300991859e-05, "loss": 0.3946, "step": 621 }, { "epoch": 0.39382667194301546, "grad_norm": 0.6983695559367518, "learning_rate": 1.9643278558414794e-05, "loss": 0.3889, "step": 622 }, { "epoch": 0.39445983379501387, "grad_norm": 0.6252243983565687, "learning_rate": 1.964142941315461e-05, "loss": 0.3866, "step": 623 }, { "epoch": 0.3950929956470123, "grad_norm": 0.8032577358695853, "learning_rate": 1.9639575575038073e-05, "loss": 0.4051, "step": 624 }, { "epoch": 0.3957261574990107, "grad_norm": 0.6446400253671033, "learning_rate": 1.9637717044967536e-05, "loss": 0.3823, "step": 625 }, { "epoch": 0.3963593193510091, "grad_norm": 0.6505141252623248, "learning_rate": 1.9635853823847607e-05, "loss": 0.3833, "step": 626 }, { "epoch": 0.3969924812030075, "grad_norm": 1.0195496852657617, "learning_rate": 1.963398591258519e-05, "loss": 0.3669, "step": 627 }, { "epoch": 0.3976256430550059, "grad_norm": 0.6139389191485172, "learning_rate": 1.9632113312089466e-05, "loss": 0.3869, "step": 628 }, { "epoch": 0.3982588049070043, "grad_norm": 0.6286914122132403, "learning_rate": 1.9630236023271907e-05, "loss": 0.3944, "step": 629 }, { "epoch": 0.3988919667590028, "grad_norm": 0.5849687075032328, "learning_rate": 1.962835404704626e-05, "loss": 0.3776, "step": 630 }, { "epoch": 0.3995251286110012, "grad_norm": 0.9889197999258952, "learning_rate": 1.9626467384328558e-05, "loss": 0.3779, "step": 631 }, { "epoch": 0.4001582904629996, "grad_norm": 0.5804977729148556, "learning_rate": 1.9624576036037108e-05, "loss": 0.3899, "step": 632 }, { "epoch": 0.400791452314998, "grad_norm": 0.9561662194706809, "learning_rate": 1.9622680003092503e-05, "loss": 0.38, "step": 633 }, { "epoch": 0.40142461416699643, "grad_norm": 0.6688549577029577, "learning_rate": 1.962077928641762e-05, "loss": 0.3931, "step": 634 }, { "epoch": 0.40205777601899484, "grad_norm": 0.8018781488101914, "learning_rate": 1.961887388693761e-05, "loss": 0.3855, "step": 635 }, { "epoch": 0.40269093787099325, "grad_norm": 0.6258089509669941, "learning_rate": 1.96169638055799e-05, "loss": 0.3948, "step": 636 }, { "epoch": 0.4033240997229917, "grad_norm": 0.6193285995150213, "learning_rate": 1.9615049043274207e-05, "loss": 0.3885, "step": 637 }, { "epoch": 0.4039572615749901, "grad_norm": 0.596419891995283, "learning_rate": 1.9613129600952515e-05, "loss": 0.4009, "step": 638 }, { "epoch": 0.40459042342698853, "grad_norm": 0.6035131788798132, "learning_rate": 1.961120547954909e-05, "loss": 0.3896, "step": 639 }, { "epoch": 0.40522358527898694, "grad_norm": 0.6267581246408791, "learning_rate": 1.960927668000048e-05, "loss": 0.392, "step": 640 }, { "epoch": 0.40585674713098535, "grad_norm": 0.5812389552200335, "learning_rate": 1.96073432032455e-05, "loss": 0.3787, "step": 641 }, { "epoch": 0.40648990898298376, "grad_norm": 0.6256452400991089, "learning_rate": 1.960540505022526e-05, "loss": 0.3838, "step": 642 }, { "epoch": 0.40712307083498217, "grad_norm": 0.5836558535521179, "learning_rate": 1.9603462221883122e-05, "loss": 0.4002, "step": 643 }, { "epoch": 0.40775623268698064, "grad_norm": 0.5604620400456402, "learning_rate": 1.9601514719164742e-05, "loss": 0.4084, "step": 644 }, { "epoch": 0.40838939453897904, "grad_norm": 0.6754805571779712, "learning_rate": 1.9599562543018044e-05, "loss": 0.4024, "step": 645 }, { "epoch": 0.40902255639097745, "grad_norm": 0.6584953517608986, "learning_rate": 1.9597605694393223e-05, "loss": 0.3899, "step": 646 }, { "epoch": 0.40965571824297586, "grad_norm": 0.5883685134601473, "learning_rate": 1.9595644174242758e-05, "loss": 0.3767, "step": 647 }, { "epoch": 0.4102888800949743, "grad_norm": 0.5878376162717172, "learning_rate": 1.9593677983521396e-05, "loss": 0.3991, "step": 648 }, { "epoch": 0.4109220419469727, "grad_norm": 0.6504400987577321, "learning_rate": 1.9591707123186156e-05, "loss": 0.3789, "step": 649 }, { "epoch": 0.4115552037989711, "grad_norm": 0.6319011267925412, "learning_rate": 1.9589731594196334e-05, "loss": 0.391, "step": 650 }, { "epoch": 0.4121883656509695, "grad_norm": 0.5514024270303453, "learning_rate": 1.9587751397513492e-05, "loss": 0.3777, "step": 651 }, { "epoch": 0.41282152750296797, "grad_norm": 0.6321751109573278, "learning_rate": 1.9585766534101475e-05, "loss": 0.3813, "step": 652 }, { "epoch": 0.4134546893549664, "grad_norm": 0.6002175236102909, "learning_rate": 1.958377700492639e-05, "loss": 0.4038, "step": 653 }, { "epoch": 0.4140878512069648, "grad_norm": 0.6075249468093722, "learning_rate": 1.9581782810956615e-05, "loss": 0.3901, "step": 654 }, { "epoch": 0.4147210130589632, "grad_norm": 0.6372066364757273, "learning_rate": 1.9579783953162803e-05, "loss": 0.3869, "step": 655 }, { "epoch": 0.4153541749109616, "grad_norm": 0.6121155551163825, "learning_rate": 1.957778043251788e-05, "loss": 0.3814, "step": 656 }, { "epoch": 0.41598733676296, "grad_norm": 0.6050517991711154, "learning_rate": 1.9575772249997028e-05, "loss": 0.3716, "step": 657 }, { "epoch": 0.4166204986149584, "grad_norm": 0.5719329994350025, "learning_rate": 1.9573759406577712e-05, "loss": 0.3987, "step": 658 }, { "epoch": 0.4172536604669569, "grad_norm": 0.6043892276592988, "learning_rate": 1.957174190323966e-05, "loss": 0.3934, "step": 659 }, { "epoch": 0.4178868223189553, "grad_norm": 0.7626248804937626, "learning_rate": 1.956971974096487e-05, "loss": 0.3909, "step": 660 }, { "epoch": 0.4185199841709537, "grad_norm": 0.638500175275569, "learning_rate": 1.9567692920737608e-05, "loss": 0.3884, "step": 661 }, { "epoch": 0.4191531460229521, "grad_norm": 0.6003406089436863, "learning_rate": 1.9565661443544403e-05, "loss": 0.3841, "step": 662 }, { "epoch": 0.4197863078749505, "grad_norm": 0.6475314568683005, "learning_rate": 1.9563625310374053e-05, "loss": 0.3968, "step": 663 }, { "epoch": 0.42041946972694894, "grad_norm": 0.61276891134888, "learning_rate": 1.9561584522217624e-05, "loss": 0.3923, "step": 664 }, { "epoch": 0.42105263157894735, "grad_norm": 0.7097112039520415, "learning_rate": 1.9559539080068447e-05, "loss": 0.3758, "step": 665 }, { "epoch": 0.4216857934309458, "grad_norm": 0.611709793368116, "learning_rate": 1.9557488984922112e-05, "loss": 0.3841, "step": 666 }, { "epoch": 0.4223189552829442, "grad_norm": 0.5923217518588361, "learning_rate": 1.955543423777649e-05, "loss": 0.3877, "step": 667 }, { "epoch": 0.42295211713494263, "grad_norm": 0.5989196182182053, "learning_rate": 1.95533748396317e-05, "loss": 0.3941, "step": 668 }, { "epoch": 0.42358527898694104, "grad_norm": 0.6281828067898233, "learning_rate": 1.9551310791490125e-05, "loss": 0.3841, "step": 669 }, { "epoch": 0.42421844083893945, "grad_norm": 0.6528303683170389, "learning_rate": 1.954924209435643e-05, "loss": 0.3784, "step": 670 }, { "epoch": 0.42485160269093786, "grad_norm": 0.6636868211082034, "learning_rate": 1.954716874923752e-05, "loss": 0.3851, "step": 671 }, { "epoch": 0.42548476454293627, "grad_norm": 0.5970415793580687, "learning_rate": 1.9545090757142575e-05, "loss": 0.3913, "step": 672 }, { "epoch": 0.42611792639493473, "grad_norm": 0.6116582879380595, "learning_rate": 1.9543008119083035e-05, "loss": 0.3771, "step": 673 }, { "epoch": 0.42675108824693314, "grad_norm": 0.6418435352803964, "learning_rate": 1.9540920836072598e-05, "loss": 0.3882, "step": 674 }, { "epoch": 0.42738425009893155, "grad_norm": 0.6029223078675872, "learning_rate": 1.953882890912723e-05, "loss": 0.392, "step": 675 }, { "epoch": 0.42801741195092996, "grad_norm": 0.6244860233732157, "learning_rate": 1.9536732339265145e-05, "loss": 0.3896, "step": 676 }, { "epoch": 0.42865057380292837, "grad_norm": 0.6196949195988627, "learning_rate": 1.9534631127506833e-05, "loss": 0.3849, "step": 677 }, { "epoch": 0.4292837356549268, "grad_norm": 0.6526112259659119, "learning_rate": 1.953252527487503e-05, "loss": 0.3977, "step": 678 }, { "epoch": 0.4299168975069252, "grad_norm": 0.7116379336619582, "learning_rate": 1.9530414782394735e-05, "loss": 0.374, "step": 679 }, { "epoch": 0.4305500593589236, "grad_norm": 0.6406725649887782, "learning_rate": 1.952829965109321e-05, "loss": 0.3785, "step": 680 }, { "epoch": 0.43118322121092206, "grad_norm": 0.6415648616420178, "learning_rate": 1.952617988199997e-05, "loss": 0.386, "step": 681 }, { "epoch": 0.4318163830629205, "grad_norm": 0.6404339755692455, "learning_rate": 1.952405547614679e-05, "loss": 0.4125, "step": 682 }, { "epoch": 0.4324495449149189, "grad_norm": 1.017495110402015, "learning_rate": 1.9521926434567694e-05, "loss": 0.3855, "step": 683 }, { "epoch": 0.4330827067669173, "grad_norm": 0.6879765033104195, "learning_rate": 1.9519792758298976e-05, "loss": 0.3961, "step": 684 }, { "epoch": 0.4337158686189157, "grad_norm": 0.5683839697555428, "learning_rate": 1.9517654448379173e-05, "loss": 0.3581, "step": 685 }, { "epoch": 0.4343490304709141, "grad_norm": 0.6569814149747841, "learning_rate": 1.951551150584909e-05, "loss": 0.3937, "step": 686 }, { "epoch": 0.4349821923229125, "grad_norm": 0.6062541261600132, "learning_rate": 1.9513363931751774e-05, "loss": 0.3842, "step": 687 }, { "epoch": 0.435615354174911, "grad_norm": 0.6469009935378092, "learning_rate": 1.9511211727132534e-05, "loss": 0.3891, "step": 688 }, { "epoch": 0.4362485160269094, "grad_norm": 0.5953894755599018, "learning_rate": 1.9509054893038932e-05, "loss": 0.3789, "step": 689 }, { "epoch": 0.4368816778789078, "grad_norm": 0.684845918339056, "learning_rate": 1.950689343052078e-05, "loss": 0.3904, "step": 690 }, { "epoch": 0.4375148397309062, "grad_norm": 0.6282410927318971, "learning_rate": 1.950472734063015e-05, "loss": 0.3961, "step": 691 }, { "epoch": 0.4381480015829046, "grad_norm": 0.6648785939419672, "learning_rate": 1.9502556624421357e-05, "loss": 0.392, "step": 692 }, { "epoch": 0.43878116343490303, "grad_norm": 0.588674349395492, "learning_rate": 1.9500381282950975e-05, "loss": 0.3913, "step": 693 }, { "epoch": 0.43941432528690144, "grad_norm": 0.7229253622286608, "learning_rate": 1.949820131727783e-05, "loss": 0.3928, "step": 694 }, { "epoch": 0.4400474871388999, "grad_norm": 0.5861719305485964, "learning_rate": 1.9496016728462986e-05, "loss": 0.3896, "step": 695 }, { "epoch": 0.4406806489908983, "grad_norm": 0.7563930647266753, "learning_rate": 1.9493827517569776e-05, "loss": 0.3768, "step": 696 }, { "epoch": 0.4413138108428967, "grad_norm": 0.5983039123308066, "learning_rate": 1.949163368566377e-05, "loss": 0.3857, "step": 697 }, { "epoch": 0.44194697269489513, "grad_norm": 0.667957126069025, "learning_rate": 1.948943523381279e-05, "loss": 0.3742, "step": 698 }, { "epoch": 0.44258013454689354, "grad_norm": 0.5595464742673795, "learning_rate": 1.948723216308691e-05, "loss": 0.3919, "step": 699 }, { "epoch": 0.44321329639889195, "grad_norm": 0.6378902851155042, "learning_rate": 1.9485024474558445e-05, "loss": 0.3949, "step": 700 }, { "epoch": 0.44384645825089036, "grad_norm": 1.1122470808011184, "learning_rate": 1.948281216930197e-05, "loss": 0.3907, "step": 701 }, { "epoch": 0.44447962010288883, "grad_norm": 0.6280226548113214, "learning_rate": 1.9480595248394293e-05, "loss": 0.3742, "step": 702 }, { "epoch": 0.44511278195488724, "grad_norm": 0.6088925240151578, "learning_rate": 1.9478373712914475e-05, "loss": 0.3854, "step": 703 }, { "epoch": 0.44574594380688565, "grad_norm": 0.6115518637320669, "learning_rate": 1.9476147563943826e-05, "loss": 0.3856, "step": 704 }, { "epoch": 0.44637910565888406, "grad_norm": 0.5910793035332751, "learning_rate": 1.94739168025659e-05, "loss": 0.3783, "step": 705 }, { "epoch": 0.44701226751088247, "grad_norm": 0.6119751585480656, "learning_rate": 1.947168142986649e-05, "loss": 0.3935, "step": 706 }, { "epoch": 0.4476454293628809, "grad_norm": 0.6185621094876682, "learning_rate": 1.9469441446933644e-05, "loss": 0.387, "step": 707 }, { "epoch": 0.4482785912148793, "grad_norm": 0.5997664105008051, "learning_rate": 1.9467196854857642e-05, "loss": 0.3784, "step": 708 }, { "epoch": 0.4489117530668777, "grad_norm": 0.7690092075305839, "learning_rate": 1.946494765473102e-05, "loss": 0.376, "step": 709 }, { "epoch": 0.44954491491887616, "grad_norm": 0.6406470318855368, "learning_rate": 1.9462693847648544e-05, "loss": 0.3781, "step": 710 }, { "epoch": 0.45017807677087457, "grad_norm": 0.6156008921470735, "learning_rate": 1.9460435434707236e-05, "loss": 0.386, "step": 711 }, { "epoch": 0.450811238622873, "grad_norm": 0.552633128928631, "learning_rate": 1.9458172417006347e-05, "loss": 0.389, "step": 712 }, { "epoch": 0.4514444004748714, "grad_norm": 0.6558380531136188, "learning_rate": 1.945590479564738e-05, "loss": 0.3594, "step": 713 }, { "epoch": 0.4520775623268698, "grad_norm": 0.5546668696422218, "learning_rate": 1.945363257173407e-05, "loss": 0.3908, "step": 714 }, { "epoch": 0.4527107241788682, "grad_norm": 0.921065950547674, "learning_rate": 1.94513557463724e-05, "loss": 0.3782, "step": 715 }, { "epoch": 0.4533438860308666, "grad_norm": 0.6053846108729959, "learning_rate": 1.9449074320670586e-05, "loss": 0.3994, "step": 716 }, { "epoch": 0.4539770478828651, "grad_norm": 0.7163465110865092, "learning_rate": 1.9446788295739086e-05, "loss": 0.3908, "step": 717 }, { "epoch": 0.4546102097348635, "grad_norm": 0.6220917095995803, "learning_rate": 1.94444976726906e-05, "loss": 0.3774, "step": 718 }, { "epoch": 0.4552433715868619, "grad_norm": 0.6977477357938803, "learning_rate": 1.944220245264006e-05, "loss": 0.3906, "step": 719 }, { "epoch": 0.4558765334388603, "grad_norm": 0.6130449590193233, "learning_rate": 1.9439902636704633e-05, "loss": 0.3862, "step": 720 }, { "epoch": 0.4565096952908587, "grad_norm": 0.6456938157710966, "learning_rate": 1.9437598226003738e-05, "loss": 0.3816, "step": 721 }, { "epoch": 0.45714285714285713, "grad_norm": 0.6797801856071881, "learning_rate": 1.943528922165902e-05, "loss": 0.3872, "step": 722 }, { "epoch": 0.45777601899485554, "grad_norm": 0.6756550965644136, "learning_rate": 1.943297562479435e-05, "loss": 0.3937, "step": 723 }, { "epoch": 0.458409180846854, "grad_norm": 0.6343693011031392, "learning_rate": 1.9430657436535857e-05, "loss": 0.3826, "step": 724 }, { "epoch": 0.4590423426988524, "grad_norm": 0.618642418805738, "learning_rate": 1.9428334658011888e-05, "loss": 0.3965, "step": 725 }, { "epoch": 0.4596755045508508, "grad_norm": 0.6402970531073218, "learning_rate": 1.9426007290353027e-05, "loss": 0.3926, "step": 726 }, { "epoch": 0.46030866640284923, "grad_norm": 0.6450341523397419, "learning_rate": 1.94236753346921e-05, "loss": 0.3849, "step": 727 }, { "epoch": 0.46094182825484764, "grad_norm": 0.6776125066788707, "learning_rate": 1.9421338792164152e-05, "loss": 0.3939, "step": 728 }, { "epoch": 0.46157499010684605, "grad_norm": 0.6464418044376081, "learning_rate": 1.9418997663906474e-05, "loss": 0.3763, "step": 729 }, { "epoch": 0.46220815195884446, "grad_norm": 0.6685292654981129, "learning_rate": 1.9416651951058584e-05, "loss": 0.3892, "step": 730 }, { "epoch": 0.46284131381084287, "grad_norm": 0.5780899018841831, "learning_rate": 1.9414301654762226e-05, "loss": 0.4008, "step": 731 }, { "epoch": 0.46347447566284133, "grad_norm": 0.6150532306309426, "learning_rate": 1.9411946776161388e-05, "loss": 0.3829, "step": 732 }, { "epoch": 0.46410763751483974, "grad_norm": 0.6243723922930852, "learning_rate": 1.9409587316402272e-05, "loss": 0.3787, "step": 733 }, { "epoch": 0.46474079936683815, "grad_norm": 0.5816661546973081, "learning_rate": 1.9407223276633323e-05, "loss": 0.3794, "step": 734 }, { "epoch": 0.46537396121883656, "grad_norm": 0.6223531126885381, "learning_rate": 1.9404854658005208e-05, "loss": 0.3808, "step": 735 }, { "epoch": 0.46600712307083497, "grad_norm": 0.6272217460981503, "learning_rate": 1.940248146167083e-05, "loss": 0.3815, "step": 736 }, { "epoch": 0.4666402849228334, "grad_norm": 0.5700564861668677, "learning_rate": 1.940010368878531e-05, "loss": 0.3926, "step": 737 }, { "epoch": 0.4672734467748318, "grad_norm": 0.6135982381689913, "learning_rate": 1.9397721340506004e-05, "loss": 0.4023, "step": 738 }, { "epoch": 0.46790660862683026, "grad_norm": 0.6192972998845908, "learning_rate": 1.9395334417992494e-05, "loss": 0.3689, "step": 739 }, { "epoch": 0.46853977047882867, "grad_norm": 1.2351974034787825, "learning_rate": 1.9392942922406588e-05, "loss": 0.3871, "step": 740 }, { "epoch": 0.4691729323308271, "grad_norm": 0.5897580950916, "learning_rate": 1.9390546854912315e-05, "loss": 0.3855, "step": 741 }, { "epoch": 0.4698060941828255, "grad_norm": 0.6430834765028304, "learning_rate": 1.9388146216675938e-05, "loss": 0.3738, "step": 742 }, { "epoch": 0.4704392560348239, "grad_norm": 0.5949061581367806, "learning_rate": 1.938574100886594e-05, "loss": 0.3788, "step": 743 }, { "epoch": 0.4710724178868223, "grad_norm": 0.580443025600538, "learning_rate": 1.9383331232653028e-05, "loss": 0.41, "step": 744 }, { "epoch": 0.4717055797388207, "grad_norm": 0.6286650005712141, "learning_rate": 1.938091688921013e-05, "loss": 0.3797, "step": 745 }, { "epoch": 0.4723387415908192, "grad_norm": 0.5585735306846475, "learning_rate": 1.9378497979712404e-05, "loss": 0.3741, "step": 746 }, { "epoch": 0.4729719034428176, "grad_norm": 0.6401606296120204, "learning_rate": 1.9376074505337225e-05, "loss": 0.3867, "step": 747 }, { "epoch": 0.473605065294816, "grad_norm": 0.60073063109369, "learning_rate": 1.9373646467264193e-05, "loss": 0.396, "step": 748 }, { "epoch": 0.4742382271468144, "grad_norm": 0.6051695873073375, "learning_rate": 1.9371213866675127e-05, "loss": 0.3755, "step": 749 }, { "epoch": 0.4748713889988128, "grad_norm": 0.5982780023249037, "learning_rate": 1.936877670475407e-05, "loss": 0.3664, "step": 750 }, { "epoch": 0.4755045508508112, "grad_norm": 0.6234353651371994, "learning_rate": 1.936633498268728e-05, "loss": 0.3824, "step": 751 }, { "epoch": 0.47613771270280963, "grad_norm": 0.6394145747918111, "learning_rate": 1.9363888701663242e-05, "loss": 0.3729, "step": 752 }, { "epoch": 0.4767708745548081, "grad_norm": 0.6199818300995419, "learning_rate": 1.9361437862872648e-05, "loss": 0.3802, "step": 753 }, { "epoch": 0.4774040364068065, "grad_norm": 0.6497309571785671, "learning_rate": 1.935898246750842e-05, "loss": 0.3904, "step": 754 }, { "epoch": 0.4780371982588049, "grad_norm": 0.5474597052191333, "learning_rate": 1.9356522516765694e-05, "loss": 0.3986, "step": 755 }, { "epoch": 0.47867036011080333, "grad_norm": 0.6413742229530999, "learning_rate": 1.9354058011841826e-05, "loss": 0.4032, "step": 756 }, { "epoch": 0.47930352196280174, "grad_norm": 0.5915730583669101, "learning_rate": 1.935158895393638e-05, "loss": 0.3728, "step": 757 }, { "epoch": 0.47993668381480015, "grad_norm": 0.5788368185001334, "learning_rate": 1.9349115344251142e-05, "loss": 0.3858, "step": 758 }, { "epoch": 0.48056984566679856, "grad_norm": 0.6476437265250036, "learning_rate": 1.934663718399012e-05, "loss": 0.3848, "step": 759 }, { "epoch": 0.48120300751879697, "grad_norm": 0.6489988638986021, "learning_rate": 1.934415447435953e-05, "loss": 0.3861, "step": 760 }, { "epoch": 0.48183616937079543, "grad_norm": 0.6448020238476446, "learning_rate": 1.9341667216567793e-05, "loss": 0.3962, "step": 761 }, { "epoch": 0.48246933122279384, "grad_norm": 0.631188682888142, "learning_rate": 1.9339175411825563e-05, "loss": 0.399, "step": 762 }, { "epoch": 0.48310249307479225, "grad_norm": 0.6074347662869253, "learning_rate": 1.9336679061345694e-05, "loss": 0.372, "step": 763 }, { "epoch": 0.48373565492679066, "grad_norm": 0.5898568427156446, "learning_rate": 1.9334178166343256e-05, "loss": 0.3675, "step": 764 }, { "epoch": 0.48436881677878907, "grad_norm": 0.6315120509013651, "learning_rate": 1.9331672728035535e-05, "loss": 0.3815, "step": 765 }, { "epoch": 0.4850019786307875, "grad_norm": 0.6169746273230262, "learning_rate": 1.9329162747642017e-05, "loss": 0.3707, "step": 766 }, { "epoch": 0.4856351404827859, "grad_norm": 0.5735532045130952, "learning_rate": 1.9326648226384412e-05, "loss": 0.3666, "step": 767 }, { "epoch": 0.48626830233478435, "grad_norm": 0.6255677145768495, "learning_rate": 1.9324129165486636e-05, "loss": 0.3975, "step": 768 }, { "epoch": 0.48690146418678276, "grad_norm": 0.6084230743764972, "learning_rate": 1.9321605566174808e-05, "loss": 0.3847, "step": 769 }, { "epoch": 0.48753462603878117, "grad_norm": 0.6432415723964373, "learning_rate": 1.931907742967727e-05, "loss": 0.393, "step": 770 }, { "epoch": 0.4881677878907796, "grad_norm": 0.6170608462069967, "learning_rate": 1.9316544757224552e-05, "loss": 0.395, "step": 771 }, { "epoch": 0.488800949742778, "grad_norm": 0.6463946032341855, "learning_rate": 1.931400755004941e-05, "loss": 0.3933, "step": 772 }, { "epoch": 0.4894341115947764, "grad_norm": 0.6361667306167477, "learning_rate": 1.9311465809386806e-05, "loss": 0.3808, "step": 773 }, { "epoch": 0.4900672734467748, "grad_norm": 0.5891547784362421, "learning_rate": 1.9308919536473893e-05, "loss": 0.3894, "step": 774 }, { "epoch": 0.4907004352987733, "grad_norm": 0.6417369763280599, "learning_rate": 1.9306368732550045e-05, "loss": 0.3732, "step": 775 }, { "epoch": 0.4913335971507717, "grad_norm": 0.6288972435708676, "learning_rate": 1.9303813398856843e-05, "loss": 0.3791, "step": 776 }, { "epoch": 0.4919667590027701, "grad_norm": 0.6363681815442639, "learning_rate": 1.9301253536638058e-05, "loss": 0.37, "step": 777 }, { "epoch": 0.4925999208547685, "grad_norm": 0.6626446844164291, "learning_rate": 1.9298689147139674e-05, "loss": 0.4002, "step": 778 }, { "epoch": 0.4932330827067669, "grad_norm": 0.6159913480814496, "learning_rate": 1.9296120231609885e-05, "loss": 0.3931, "step": 779 }, { "epoch": 0.4938662445587653, "grad_norm": 0.6008216482987156, "learning_rate": 1.9293546791299077e-05, "loss": 0.389, "step": 780 }, { "epoch": 0.49449940641076373, "grad_norm": 0.6157405924129086, "learning_rate": 1.9290968827459843e-05, "loss": 0.3674, "step": 781 }, { "epoch": 0.4951325682627622, "grad_norm": 0.599308992865699, "learning_rate": 1.9288386341346978e-05, "loss": 0.3969, "step": 782 }, { "epoch": 0.4957657301147606, "grad_norm": 0.6156654266234558, "learning_rate": 1.9285799334217478e-05, "loss": 0.3612, "step": 783 }, { "epoch": 0.496398891966759, "grad_norm": 0.7878898980057697, "learning_rate": 1.928320780733054e-05, "loss": 0.3703, "step": 784 }, { "epoch": 0.4970320538187574, "grad_norm": 0.6325584862441239, "learning_rate": 1.928061176194756e-05, "loss": 0.375, "step": 785 }, { "epoch": 0.49766521567075583, "grad_norm": 0.6627481930860277, "learning_rate": 1.9278011199332132e-05, "loss": 0.3531, "step": 786 }, { "epoch": 0.49829837752275424, "grad_norm": 0.5959925828281317, "learning_rate": 1.927540612075005e-05, "loss": 0.3707, "step": 787 }, { "epoch": 0.49893153937475265, "grad_norm": 0.5447896988237994, "learning_rate": 1.9272796527469312e-05, "loss": 0.3978, "step": 788 }, { "epoch": 0.49956470122675106, "grad_norm": 0.5502348090575993, "learning_rate": 1.9270182420760104e-05, "loss": 0.3861, "step": 789 }, { "epoch": 0.5001978630787495, "grad_norm": 0.5671257629237411, "learning_rate": 1.926756380189481e-05, "loss": 0.4016, "step": 790 }, { "epoch": 0.5008310249307479, "grad_norm": 0.5701013899093985, "learning_rate": 1.9264940672148018e-05, "loss": 0.3748, "step": 791 }, { "epoch": 0.5014641867827463, "grad_norm": 0.576115254057117, "learning_rate": 1.9262313032796505e-05, "loss": 0.3788, "step": 792 }, { "epoch": 0.5020973486347448, "grad_norm": 0.5586882714483794, "learning_rate": 1.925968088511924e-05, "loss": 0.3955, "step": 793 }, { "epoch": 0.5027305104867432, "grad_norm": 0.6071516129639434, "learning_rate": 1.9257044230397403e-05, "loss": 0.3963, "step": 794 }, { "epoch": 0.5033636723387416, "grad_norm": 0.5698324579498927, "learning_rate": 1.925440306991434e-05, "loss": 0.3859, "step": 795 }, { "epoch": 0.50399683419074, "grad_norm": 0.6040877299509833, "learning_rate": 1.9251757404955617e-05, "loss": 0.3767, "step": 796 }, { "epoch": 0.5046299960427384, "grad_norm": 0.5872451725434343, "learning_rate": 1.924910723680898e-05, "loss": 0.3685, "step": 797 }, { "epoch": 0.5052631578947369, "grad_norm": 0.5549775436030533, "learning_rate": 1.9246452566764363e-05, "loss": 0.3752, "step": 798 }, { "epoch": 0.5058963197467352, "grad_norm": 0.607498492502315, "learning_rate": 1.9243793396113902e-05, "loss": 0.3773, "step": 799 }, { "epoch": 0.5065294815987337, "grad_norm": 0.5981087427607222, "learning_rate": 1.9241129726151917e-05, "loss": 0.3873, "step": 800 }, { "epoch": 0.5071626434507321, "grad_norm": 0.5929015333837331, "learning_rate": 1.9238461558174918e-05, "loss": 0.3812, "step": 801 }, { "epoch": 0.5077958053027305, "grad_norm": 0.5725581202557256, "learning_rate": 1.9235788893481602e-05, "loss": 0.3816, "step": 802 }, { "epoch": 0.508428967154729, "grad_norm": 0.6135879586454235, "learning_rate": 1.9233111733372863e-05, "loss": 0.3923, "step": 803 }, { "epoch": 0.5090621290067273, "grad_norm": 0.587586789647547, "learning_rate": 1.923043007915178e-05, "loss": 0.3751, "step": 804 }, { "epoch": 0.5096952908587258, "grad_norm": 0.622200215210228, "learning_rate": 1.9227743932123614e-05, "loss": 0.3773, "step": 805 }, { "epoch": 0.5103284527107241, "grad_norm": 0.5730511662540984, "learning_rate": 1.9225053293595817e-05, "loss": 0.365, "step": 806 }, { "epoch": 0.5109616145627226, "grad_norm": 0.6017826477779117, "learning_rate": 1.922235816487803e-05, "loss": 0.3885, "step": 807 }, { "epoch": 0.5115947764147211, "grad_norm": 0.573277662320179, "learning_rate": 1.921965854728207e-05, "loss": 0.377, "step": 808 }, { "epoch": 0.5122279382667194, "grad_norm": 0.7719802537751814, "learning_rate": 1.921695444212195e-05, "loss": 0.379, "step": 809 }, { "epoch": 0.5128611001187179, "grad_norm": 0.5504068457403586, "learning_rate": 1.921424585071386e-05, "loss": 0.3871, "step": 810 }, { "epoch": 0.5134942619707162, "grad_norm": 0.6032049403280552, "learning_rate": 1.9211532774376177e-05, "loss": 0.3839, "step": 811 }, { "epoch": 0.5141274238227147, "grad_norm": 0.5870704534677721, "learning_rate": 1.9208815214429463e-05, "loss": 0.3885, "step": 812 }, { "epoch": 0.5147605856747131, "grad_norm": 0.6354343454749245, "learning_rate": 1.9206093172196455e-05, "loss": 0.3974, "step": 813 }, { "epoch": 0.5153937475267115, "grad_norm": 0.5957614623247504, "learning_rate": 1.9203366649002082e-05, "loss": 0.3844, "step": 814 }, { "epoch": 0.51602690937871, "grad_norm": 0.5807519044405636, "learning_rate": 1.920063564617344e-05, "loss": 0.3801, "step": 815 }, { "epoch": 0.5166600712307083, "grad_norm": 0.6464575019712087, "learning_rate": 1.919790016503982e-05, "loss": 0.3915, "step": 816 }, { "epoch": 0.5172932330827068, "grad_norm": 0.5468197963551252, "learning_rate": 1.9195160206932683e-05, "loss": 0.3558, "step": 817 }, { "epoch": 0.5179263949347052, "grad_norm": 0.6396562227623129, "learning_rate": 1.9192415773185678e-05, "loss": 0.3794, "step": 818 }, { "epoch": 0.5185595567867036, "grad_norm": 0.6443719076806651, "learning_rate": 1.918966686513462e-05, "loss": 0.3841, "step": 819 }, { "epoch": 0.519192718638702, "grad_norm": 0.608055115735397, "learning_rate": 1.918691348411751e-05, "loss": 0.3763, "step": 820 }, { "epoch": 0.5198258804907004, "grad_norm": 0.5759761568413364, "learning_rate": 1.918415563147453e-05, "loss": 0.3783, "step": 821 }, { "epoch": 0.5204590423426989, "grad_norm": 0.6176586731407528, "learning_rate": 1.9181393308548027e-05, "loss": 0.3775, "step": 822 }, { "epoch": 0.5210922041946973, "grad_norm": 0.6401421774866891, "learning_rate": 1.9178626516682536e-05, "loss": 0.3739, "step": 823 }, { "epoch": 0.5217253660466957, "grad_norm": 0.590228911224434, "learning_rate": 1.9175855257224757e-05, "loss": 0.3695, "step": 824 }, { "epoch": 0.5223585278986941, "grad_norm": 0.5974478677624872, "learning_rate": 1.9173079531523576e-05, "loss": 0.3752, "step": 825 }, { "epoch": 0.5229916897506925, "grad_norm": 0.5343863827284595, "learning_rate": 1.9170299340930035e-05, "loss": 0.3782, "step": 826 }, { "epoch": 0.5236248516026909, "grad_norm": 0.7171724865376762, "learning_rate": 1.916751468679737e-05, "loss": 0.3735, "step": 827 }, { "epoch": 0.5242580134546894, "grad_norm": 0.5876072629864583, "learning_rate": 1.9164725570480974e-05, "loss": 0.3844, "step": 828 }, { "epoch": 0.5248911753066877, "grad_norm": 0.6311861818757037, "learning_rate": 1.9161931993338422e-05, "loss": 0.3584, "step": 829 }, { "epoch": 0.5255243371586862, "grad_norm": 0.576530064842874, "learning_rate": 1.9159133956729455e-05, "loss": 0.3752, "step": 830 }, { "epoch": 0.5261574990106846, "grad_norm": 0.6063704843315594, "learning_rate": 1.915633146201598e-05, "loss": 0.3825, "step": 831 }, { "epoch": 0.526790660862683, "grad_norm": 0.6972969797753689, "learning_rate": 1.9153524510562087e-05, "loss": 0.374, "step": 832 }, { "epoch": 0.5274238227146815, "grad_norm": 0.5721052400359432, "learning_rate": 1.9150713103734023e-05, "loss": 0.3778, "step": 833 }, { "epoch": 0.5280569845666798, "grad_norm": 0.6062572352732217, "learning_rate": 1.914789724290021e-05, "loss": 0.3713, "step": 834 }, { "epoch": 0.5286901464186783, "grad_norm": 0.5648368050283548, "learning_rate": 1.914507692943124e-05, "loss": 0.3885, "step": 835 }, { "epoch": 0.5293233082706766, "grad_norm": 0.628558926623308, "learning_rate": 1.914225216469986e-05, "loss": 0.3717, "step": 836 }, { "epoch": 0.5299564701226751, "grad_norm": 0.5396018630091787, "learning_rate": 1.9139422950081e-05, "loss": 0.3739, "step": 837 }, { "epoch": 0.5305896319746736, "grad_norm": 0.577187056748745, "learning_rate": 1.9136589286951742e-05, "loss": 0.3645, "step": 838 }, { "epoch": 0.5312227938266719, "grad_norm": 0.5464151672524324, "learning_rate": 1.9133751176691346e-05, "loss": 0.3643, "step": 839 }, { "epoch": 0.5318559556786704, "grad_norm": 0.5856109933809379, "learning_rate": 1.9130908620681226e-05, "loss": 0.3604, "step": 840 }, { "epoch": 0.5324891175306687, "grad_norm": 0.6634842231507074, "learning_rate": 1.9128061620304962e-05, "loss": 0.363, "step": 841 }, { "epoch": 0.5331222793826672, "grad_norm": 0.6890588925957415, "learning_rate": 1.9125210176948305e-05, "loss": 0.3767, "step": 842 }, { "epoch": 0.5337554412346656, "grad_norm": 0.5438411353960032, "learning_rate": 1.9122354291999158e-05, "loss": 0.3925, "step": 843 }, { "epoch": 0.534388603086664, "grad_norm": 0.5944764421475889, "learning_rate": 1.911949396684759e-05, "loss": 0.3859, "step": 844 }, { "epoch": 0.5350217649386625, "grad_norm": 0.5539691647454598, "learning_rate": 1.9116629202885836e-05, "loss": 0.3926, "step": 845 }, { "epoch": 0.5356549267906608, "grad_norm": 0.6131923479821462, "learning_rate": 1.911376000150828e-05, "loss": 0.3877, "step": 846 }, { "epoch": 0.5362880886426593, "grad_norm": 0.5813596161942604, "learning_rate": 1.9110886364111483e-05, "loss": 0.3859, "step": 847 }, { "epoch": 0.5369212504946577, "grad_norm": 0.6045062995732741, "learning_rate": 1.9108008292094148e-05, "loss": 0.3766, "step": 848 }, { "epoch": 0.5375544123466561, "grad_norm": 0.5326860575088431, "learning_rate": 1.9105125786857148e-05, "loss": 0.3699, "step": 849 }, { "epoch": 0.5381875741986545, "grad_norm": 0.6133776984629188, "learning_rate": 1.9102238849803506e-05, "loss": 0.3592, "step": 850 }, { "epoch": 0.538820736050653, "grad_norm": 0.6048453457359698, "learning_rate": 1.909934748233841e-05, "loss": 0.3788, "step": 851 }, { "epoch": 0.5394538979026514, "grad_norm": 0.5882248303379229, "learning_rate": 1.90964516858692e-05, "loss": 0.3835, "step": 852 }, { "epoch": 0.5400870597546498, "grad_norm": 0.5818740446540904, "learning_rate": 1.9093551461805367e-05, "loss": 0.3851, "step": 853 }, { "epoch": 0.5407202216066482, "grad_norm": 0.5412205384838983, "learning_rate": 1.9090646811558566e-05, "loss": 0.3764, "step": 854 }, { "epoch": 0.5413533834586466, "grad_norm": 0.5738982595459403, "learning_rate": 1.9087737736542603e-05, "loss": 0.3698, "step": 855 }, { "epoch": 0.541986545310645, "grad_norm": 0.5738079364865236, "learning_rate": 1.9084824238173433e-05, "loss": 0.3786, "step": 856 }, { "epoch": 0.5426197071626434, "grad_norm": 0.5860979508574945, "learning_rate": 1.9081906317869175e-05, "loss": 0.3873, "step": 857 }, { "epoch": 0.5432528690146419, "grad_norm": 0.6356800099349725, "learning_rate": 1.907898397705009e-05, "loss": 0.3712, "step": 858 }, { "epoch": 0.5438860308666403, "grad_norm": 0.5563872099439217, "learning_rate": 1.9076057217138594e-05, "loss": 0.3844, "step": 859 }, { "epoch": 0.5445191927186387, "grad_norm": 0.5767077229495315, "learning_rate": 1.9073126039559254e-05, "loss": 0.3748, "step": 860 }, { "epoch": 0.5451523545706372, "grad_norm": 0.5338439409525176, "learning_rate": 1.907019044573879e-05, "loss": 0.3553, "step": 861 }, { "epoch": 0.5457855164226355, "grad_norm": 0.5794156560905626, "learning_rate": 1.9067250437106063e-05, "loss": 0.3693, "step": 862 }, { "epoch": 0.546418678274634, "grad_norm": 0.8570361272688745, "learning_rate": 1.9064306015092097e-05, "loss": 0.3804, "step": 863 }, { "epoch": 0.5470518401266323, "grad_norm": 0.5693748259611771, "learning_rate": 1.906135718113005e-05, "loss": 0.3799, "step": 864 }, { "epoch": 0.5476850019786308, "grad_norm": 0.5408092882430787, "learning_rate": 1.9058403936655235e-05, "loss": 0.3692, "step": 865 }, { "epoch": 0.5483181638306293, "grad_norm": 0.5490572396539402, "learning_rate": 1.905544628310511e-05, "loss": 0.3723, "step": 866 }, { "epoch": 0.5489513256826276, "grad_norm": 0.5616019949590524, "learning_rate": 1.905248422191928e-05, "loss": 0.3837, "step": 867 }, { "epoch": 0.5495844875346261, "grad_norm": 0.5228833807279253, "learning_rate": 1.904951775453949e-05, "loss": 0.378, "step": 868 }, { "epoch": 0.5502176493866244, "grad_norm": 0.5433759338296861, "learning_rate": 1.9046546882409648e-05, "loss": 0.3651, "step": 869 }, { "epoch": 0.5508508112386229, "grad_norm": 0.5724571144221501, "learning_rate": 1.9043571606975776e-05, "loss": 0.3891, "step": 870 }, { "epoch": 0.5514839730906212, "grad_norm": 0.5967834904218984, "learning_rate": 1.9040591929686064e-05, "loss": 0.3721, "step": 871 }, { "epoch": 0.5521171349426197, "grad_norm": 0.576588221219, "learning_rate": 1.903760785199084e-05, "loss": 0.3758, "step": 872 }, { "epoch": 0.5527502967946182, "grad_norm": 0.5645164379739741, "learning_rate": 1.9034619375342557e-05, "loss": 0.3718, "step": 873 }, { "epoch": 0.5533834586466165, "grad_norm": 0.5888886654593243, "learning_rate": 1.9031626501195833e-05, "loss": 0.3981, "step": 874 }, { "epoch": 0.554016620498615, "grad_norm": 0.5588831958067282, "learning_rate": 1.902862923100741e-05, "loss": 0.3676, "step": 875 }, { "epoch": 0.5546497823506134, "grad_norm": 0.5202038741416652, "learning_rate": 1.9025627566236182e-05, "loss": 0.3801, "step": 876 }, { "epoch": 0.5552829442026118, "grad_norm": 0.5746970871285484, "learning_rate": 1.9022621508343168e-05, "loss": 0.3785, "step": 877 }, { "epoch": 0.5559161060546102, "grad_norm": 0.5635345359389081, "learning_rate": 1.9019611058791534e-05, "loss": 0.4016, "step": 878 }, { "epoch": 0.5565492679066086, "grad_norm": 0.6806772202867333, "learning_rate": 1.9016596219046585e-05, "loss": 0.3838, "step": 879 }, { "epoch": 0.557182429758607, "grad_norm": 0.9506905999634354, "learning_rate": 1.901357699057576e-05, "loss": 0.3603, "step": 880 }, { "epoch": 0.5578155916106055, "grad_norm": 0.5962371929053172, "learning_rate": 1.901055337484863e-05, "loss": 0.3763, "step": 881 }, { "epoch": 0.5584487534626039, "grad_norm": 0.5618232243971054, "learning_rate": 1.900752537333691e-05, "loss": 0.3741, "step": 882 }, { "epoch": 0.5590819153146023, "grad_norm": 0.6495618916291064, "learning_rate": 1.900449298751444e-05, "loss": 0.3589, "step": 883 }, { "epoch": 0.5597150771666007, "grad_norm": 0.5672614937904914, "learning_rate": 1.9001456218857207e-05, "loss": 0.3805, "step": 884 }, { "epoch": 0.5603482390185991, "grad_norm": 0.5602164511712144, "learning_rate": 1.899841506884332e-05, "loss": 0.3819, "step": 885 }, { "epoch": 0.5609814008705976, "grad_norm": 0.5619457685919987, "learning_rate": 1.8995369538953024e-05, "loss": 0.3746, "step": 886 }, { "epoch": 0.5616145627225959, "grad_norm": 0.5970548287326287, "learning_rate": 1.8992319630668694e-05, "loss": 0.376, "step": 887 }, { "epoch": 0.5622477245745944, "grad_norm": 0.5964452544805111, "learning_rate": 1.898926534547484e-05, "loss": 0.3691, "step": 888 }, { "epoch": 0.5628808864265928, "grad_norm": 0.6044172463986606, "learning_rate": 1.8986206684858106e-05, "loss": 0.3789, "step": 889 }, { "epoch": 0.5635140482785912, "grad_norm": 0.5688687314729329, "learning_rate": 1.898314365030725e-05, "loss": 0.3656, "step": 890 }, { "epoch": 0.5641472101305897, "grad_norm": 0.5533140589312702, "learning_rate": 1.8980076243313178e-05, "loss": 0.3682, "step": 891 }, { "epoch": 0.564780371982588, "grad_norm": 0.5783055390969336, "learning_rate": 1.897700446536891e-05, "loss": 0.3718, "step": 892 }, { "epoch": 0.5654135338345865, "grad_norm": 0.6574212484405774, "learning_rate": 1.8973928317969604e-05, "loss": 0.3713, "step": 893 }, { "epoch": 0.5660466956865848, "grad_norm": 0.6069627984986965, "learning_rate": 1.8970847802612534e-05, "loss": 0.3867, "step": 894 }, { "epoch": 0.5666798575385833, "grad_norm": 0.5751254774100125, "learning_rate": 1.8967762920797107e-05, "loss": 0.3771, "step": 895 }, { "epoch": 0.5673130193905818, "grad_norm": 0.6196596366440378, "learning_rate": 1.896467367402486e-05, "loss": 0.3861, "step": 896 }, { "epoch": 0.5679461812425801, "grad_norm": 0.604645917608119, "learning_rate": 1.8961580063799447e-05, "loss": 0.3648, "step": 897 }, { "epoch": 0.5685793430945786, "grad_norm": 0.6276836698080156, "learning_rate": 1.8958482091626643e-05, "loss": 0.384, "step": 898 }, { "epoch": 0.5692125049465769, "grad_norm": 0.6282606295358962, "learning_rate": 1.8955379759014353e-05, "loss": 0.3717, "step": 899 }, { "epoch": 0.5698456667985754, "grad_norm": 0.5634027726280951, "learning_rate": 1.8952273067472602e-05, "loss": 0.3883, "step": 900 }, { "epoch": 0.5704788286505738, "grad_norm": 0.6860689991014751, "learning_rate": 1.894916201851354e-05, "loss": 0.3655, "step": 901 }, { "epoch": 0.5711119905025722, "grad_norm": 0.6010535108633372, "learning_rate": 1.894604661365143e-05, "loss": 0.3652, "step": 902 }, { "epoch": 0.5717451523545707, "grad_norm": 0.7169531012053655, "learning_rate": 1.894292685440266e-05, "loss": 0.3668, "step": 903 }, { "epoch": 0.572378314206569, "grad_norm": 0.6132062593003731, "learning_rate": 1.8939802742285746e-05, "loss": 0.3788, "step": 904 }, { "epoch": 0.5730114760585675, "grad_norm": 0.5564291644219105, "learning_rate": 1.89366742788213e-05, "loss": 0.3669, "step": 905 }, { "epoch": 0.5736446379105659, "grad_norm": 0.5698328334312798, "learning_rate": 1.893354146553208e-05, "loss": 0.3767, "step": 906 }, { "epoch": 0.5742777997625643, "grad_norm": 0.6181660636471362, "learning_rate": 1.893040430394294e-05, "loss": 0.3724, "step": 907 }, { "epoch": 0.5749109616145627, "grad_norm": 0.921253899918026, "learning_rate": 1.892726279558086e-05, "loss": 0.3732, "step": 908 }, { "epoch": 0.5755441234665611, "grad_norm": 0.6149881600723035, "learning_rate": 1.892411694197493e-05, "loss": 0.3795, "step": 909 }, { "epoch": 0.5761772853185596, "grad_norm": 0.6490794553426771, "learning_rate": 1.8920966744656362e-05, "loss": 0.3696, "step": 910 }, { "epoch": 0.576810447170558, "grad_norm": 0.6209234982232468, "learning_rate": 1.8917812205158478e-05, "loss": 0.3689, "step": 911 }, { "epoch": 0.5774436090225564, "grad_norm": 0.6178930490311936, "learning_rate": 1.891465332501671e-05, "loss": 0.3658, "step": 912 }, { "epoch": 0.5780767708745548, "grad_norm": 0.5801749303611217, "learning_rate": 1.8911490105768617e-05, "loss": 0.3846, "step": 913 }, { "epoch": 0.5787099327265532, "grad_norm": 0.6226191487153271, "learning_rate": 1.8908322548953854e-05, "loss": 0.3654, "step": 914 }, { "epoch": 0.5793430945785516, "grad_norm": 0.5735682584036507, "learning_rate": 1.890515065611419e-05, "loss": 0.3763, "step": 915 }, { "epoch": 0.5799762564305501, "grad_norm": 0.6567462602635294, "learning_rate": 1.890197442879351e-05, "loss": 0.3743, "step": 916 }, { "epoch": 0.5806094182825485, "grad_norm": 0.6203200777526909, "learning_rate": 1.8898793868537816e-05, "loss": 0.3759, "step": 917 }, { "epoch": 0.5812425801345469, "grad_norm": 0.6051319321674671, "learning_rate": 1.8895608976895198e-05, "loss": 0.3587, "step": 918 }, { "epoch": 0.5818757419865453, "grad_norm": 0.6202500266360572, "learning_rate": 1.8892419755415866e-05, "loss": 0.3717, "step": 919 }, { "epoch": 0.5825089038385437, "grad_norm": 0.6446772664297106, "learning_rate": 1.8889226205652145e-05, "loss": 0.3842, "step": 920 }, { "epoch": 0.5831420656905422, "grad_norm": 1.242003884988728, "learning_rate": 1.8886028329158452e-05, "loss": 0.3655, "step": 921 }, { "epoch": 0.5837752275425405, "grad_norm": 0.7459009171255029, "learning_rate": 1.888282612749132e-05, "loss": 0.3731, "step": 922 }, { "epoch": 0.584408389394539, "grad_norm": 0.5879131637844501, "learning_rate": 1.8879619602209387e-05, "loss": 0.3585, "step": 923 }, { "epoch": 0.5850415512465375, "grad_norm": 0.9925621365607818, "learning_rate": 1.8876408754873387e-05, "loss": 0.3779, "step": 924 }, { "epoch": 0.5856747130985358, "grad_norm": 0.5489089850639363, "learning_rate": 1.887319358704617e-05, "loss": 0.3597, "step": 925 }, { "epoch": 0.5863078749505343, "grad_norm": 0.6300148443565987, "learning_rate": 1.8869974100292675e-05, "loss": 0.3804, "step": 926 }, { "epoch": 0.5869410368025326, "grad_norm": 0.7830510939143385, "learning_rate": 1.8866750296179958e-05, "loss": 0.3698, "step": 927 }, { "epoch": 0.5875741986545311, "grad_norm": 1.0814957515519932, "learning_rate": 1.8863522176277165e-05, "loss": 0.3773, "step": 928 }, { "epoch": 0.5882073605065294, "grad_norm": 0.5977588571026216, "learning_rate": 1.8860289742155547e-05, "loss": 0.3701, "step": 929 }, { "epoch": 0.5888405223585279, "grad_norm": 0.578662181262387, "learning_rate": 1.8857052995388455e-05, "loss": 0.3838, "step": 930 }, { "epoch": 0.5894736842105263, "grad_norm": 0.5770853950776934, "learning_rate": 1.885381193755134e-05, "loss": 0.3662, "step": 931 }, { "epoch": 0.5901068460625247, "grad_norm": 0.5368367229368581, "learning_rate": 1.8850566570221752e-05, "loss": 0.385, "step": 932 }, { "epoch": 0.5907400079145232, "grad_norm": 0.5838043369241286, "learning_rate": 1.8847316894979327e-05, "loss": 0.3879, "step": 933 }, { "epoch": 0.5913731697665215, "grad_norm": 0.5505000855600694, "learning_rate": 1.884406291340582e-05, "loss": 0.3919, "step": 934 }, { "epoch": 0.59200633161852, "grad_norm": 0.6875151242232145, "learning_rate": 1.884080462708506e-05, "loss": 0.3775, "step": 935 }, { "epoch": 0.5926394934705184, "grad_norm": 0.6059811976641593, "learning_rate": 1.8837542037602986e-05, "loss": 0.3689, "step": 936 }, { "epoch": 0.5932726553225168, "grad_norm": 0.5542178014022192, "learning_rate": 1.883427514654763e-05, "loss": 0.3632, "step": 937 }, { "epoch": 0.5939058171745152, "grad_norm": 0.5700522383879693, "learning_rate": 1.88310039555091e-05, "loss": 0.3858, "step": 938 }, { "epoch": 0.5945389790265136, "grad_norm": 0.5723948493613408, "learning_rate": 1.8827728466079628e-05, "loss": 0.3697, "step": 939 }, { "epoch": 0.5951721408785121, "grad_norm": 0.5894746983347907, "learning_rate": 1.882444867985351e-05, "loss": 0.3873, "step": 940 }, { "epoch": 0.5958053027305105, "grad_norm": 0.5309146162805508, "learning_rate": 1.8821164598427148e-05, "loss": 0.3825, "step": 941 }, { "epoch": 0.5964384645825089, "grad_norm": 0.5390258926254504, "learning_rate": 1.8817876223399034e-05, "loss": 0.3729, "step": 942 }, { "epoch": 0.5970716264345073, "grad_norm": 0.5456572667703432, "learning_rate": 1.8814583556369744e-05, "loss": 0.3745, "step": 943 }, { "epoch": 0.5977047882865058, "grad_norm": 0.654620026500585, "learning_rate": 1.8811286598941946e-05, "loss": 0.3831, "step": 944 }, { "epoch": 0.5983379501385041, "grad_norm": 0.5692414938525828, "learning_rate": 1.88079853527204e-05, "loss": 0.38, "step": 945 }, { "epoch": 0.5989711119905026, "grad_norm": 0.5767520557259066, "learning_rate": 1.8804679819311946e-05, "loss": 0.3603, "step": 946 }, { "epoch": 0.599604273842501, "grad_norm": 0.540231743231449, "learning_rate": 1.880137000032552e-05, "loss": 0.3837, "step": 947 }, { "epoch": 0.6002374356944994, "grad_norm": 0.6795001487505217, "learning_rate": 1.8798055897372135e-05, "loss": 0.3688, "step": 948 }, { "epoch": 0.6008705975464979, "grad_norm": 0.5641248818228579, "learning_rate": 1.879473751206489e-05, "loss": 0.3586, "step": 949 }, { "epoch": 0.6015037593984962, "grad_norm": 0.5441273985336715, "learning_rate": 1.879141484601898e-05, "loss": 0.3862, "step": 950 }, { "epoch": 0.6021369212504947, "grad_norm": 0.5815200280379915, "learning_rate": 1.878808790085167e-05, "loss": 0.3694, "step": 951 }, { "epoch": 0.602770083102493, "grad_norm": 0.6062253398232755, "learning_rate": 1.878475667818231e-05, "loss": 0.3639, "step": 952 }, { "epoch": 0.6034032449544915, "grad_norm": 0.612940403843925, "learning_rate": 1.878142117963234e-05, "loss": 0.3584, "step": 953 }, { "epoch": 0.60403640680649, "grad_norm": 0.5743987441841462, "learning_rate": 1.8778081406825273e-05, "loss": 0.3589, "step": 954 }, { "epoch": 0.6046695686584883, "grad_norm": 0.6092847107451647, "learning_rate": 1.8774737361386705e-05, "loss": 0.3731, "step": 955 }, { "epoch": 0.6053027305104868, "grad_norm": 0.5587987899751716, "learning_rate": 1.8771389044944315e-05, "loss": 0.3748, "step": 956 }, { "epoch": 0.6059358923624851, "grad_norm": 0.650997849807343, "learning_rate": 1.876803645912786e-05, "loss": 0.3641, "step": 957 }, { "epoch": 0.6065690542144836, "grad_norm": 0.6055454065937111, "learning_rate": 1.8764679605569163e-05, "loss": 0.3624, "step": 958 }, { "epoch": 0.607202216066482, "grad_norm": 0.5917593066741887, "learning_rate": 1.8761318485902146e-05, "loss": 0.3631, "step": 959 }, { "epoch": 0.6078353779184804, "grad_norm": 0.587967305650764, "learning_rate": 1.8757953101762786e-05, "loss": 0.3624, "step": 960 }, { "epoch": 0.6084685397704789, "grad_norm": 0.6421378838350997, "learning_rate": 1.8754583454789155e-05, "loss": 0.3699, "step": 961 }, { "epoch": 0.6091017016224772, "grad_norm": 0.5451558105020571, "learning_rate": 1.875120954662138e-05, "loss": 0.3686, "step": 962 }, { "epoch": 0.6097348634744757, "grad_norm": 0.5587972554852858, "learning_rate": 1.8747831378901685e-05, "loss": 0.3772, "step": 963 }, { "epoch": 0.610368025326474, "grad_norm": 0.5783519135298565, "learning_rate": 1.874444895327435e-05, "loss": 0.3797, "step": 964 }, { "epoch": 0.6110011871784725, "grad_norm": 0.6066665662569757, "learning_rate": 1.8741062271385723e-05, "loss": 0.3596, "step": 965 }, { "epoch": 0.6116343490304709, "grad_norm": 0.5770023271495253, "learning_rate": 1.8737671334884243e-05, "loss": 0.361, "step": 966 }, { "epoch": 0.6122675108824693, "grad_norm": 0.5751020441681123, "learning_rate": 1.8734276145420408e-05, "loss": 0.3813, "step": 967 }, { "epoch": 0.6129006727344678, "grad_norm": 0.5698617129210538, "learning_rate": 1.8730876704646788e-05, "loss": 0.368, "step": 968 }, { "epoch": 0.6135338345864662, "grad_norm": 0.5855848224866121, "learning_rate": 1.872747301421802e-05, "loss": 0.3607, "step": 969 }, { "epoch": 0.6141669964384646, "grad_norm": 0.6076634183273567, "learning_rate": 1.8724065075790814e-05, "loss": 0.38, "step": 970 }, { "epoch": 0.614800158290463, "grad_norm": 0.5988638353219566, "learning_rate": 1.8720652891023943e-05, "loss": 0.3694, "step": 971 }, { "epoch": 0.6154333201424614, "grad_norm": 0.6177702811416718, "learning_rate": 1.8717236461578254e-05, "loss": 0.3837, "step": 972 }, { "epoch": 0.6160664819944598, "grad_norm": 0.6749339482066227, "learning_rate": 1.871381578911665e-05, "loss": 0.3698, "step": 973 }, { "epoch": 0.6166996438464583, "grad_norm": 0.5960636568363931, "learning_rate": 1.8710390875304104e-05, "loss": 0.357, "step": 974 }, { "epoch": 0.6173328056984567, "grad_norm": 0.5747176662930341, "learning_rate": 1.870696172180766e-05, "loss": 0.3626, "step": 975 }, { "epoch": 0.6179659675504551, "grad_norm": 0.6563669674227133, "learning_rate": 1.8703528330296413e-05, "loss": 0.3757, "step": 976 }, { "epoch": 0.6185991294024535, "grad_norm": 0.58008798755606, "learning_rate": 1.870009070244153e-05, "loss": 0.37, "step": 977 }, { "epoch": 0.6192322912544519, "grad_norm": 0.5630016600308326, "learning_rate": 1.8696648839916242e-05, "loss": 0.3628, "step": 978 }, { "epoch": 0.6198654531064504, "grad_norm": 0.5617699658988163, "learning_rate": 1.869320274439583e-05, "loss": 0.3625, "step": 979 }, { "epoch": 0.6204986149584487, "grad_norm": 0.6693890287765789, "learning_rate": 1.868975241755764e-05, "loss": 0.3803, "step": 980 }, { "epoch": 0.6211317768104472, "grad_norm": 0.6185179308076324, "learning_rate": 1.8686297861081086e-05, "loss": 0.3606, "step": 981 }, { "epoch": 0.6217649386624456, "grad_norm": 0.580545192957941, "learning_rate": 1.8682839076647635e-05, "loss": 0.3715, "step": 982 }, { "epoch": 0.622398100514444, "grad_norm": 0.6304698673429182, "learning_rate": 1.86793760659408e-05, "loss": 0.3555, "step": 983 }, { "epoch": 0.6230312623664425, "grad_norm": 0.5586081430530137, "learning_rate": 1.8675908830646177e-05, "loss": 0.3969, "step": 984 }, { "epoch": 0.6236644242184408, "grad_norm": 0.5931190784599077, "learning_rate": 1.8672437372451394e-05, "loss": 0.3786, "step": 985 }, { "epoch": 0.6242975860704393, "grad_norm": 1.3413557902565925, "learning_rate": 1.8668961693046146e-05, "loss": 0.3874, "step": 986 }, { "epoch": 0.6249307479224376, "grad_norm": 0.6802016145707958, "learning_rate": 1.866548179412218e-05, "loss": 0.3599, "step": 987 }, { "epoch": 0.6255639097744361, "grad_norm": 0.5820884326461698, "learning_rate": 1.86619976773733e-05, "loss": 0.3749, "step": 988 }, { "epoch": 0.6261970716264345, "grad_norm": 0.5811521922636456, "learning_rate": 1.8658509344495356e-05, "loss": 0.3711, "step": 989 }, { "epoch": 0.6268302334784329, "grad_norm": 0.5338421444247617, "learning_rate": 1.8655016797186255e-05, "loss": 0.3753, "step": 990 }, { "epoch": 0.6274633953304314, "grad_norm": 1.0339819504286307, "learning_rate": 1.865152003714596e-05, "loss": 0.3989, "step": 991 }, { "epoch": 0.6280965571824297, "grad_norm": 0.6126286200745733, "learning_rate": 1.864801906607647e-05, "loss": 0.3782, "step": 992 }, { "epoch": 0.6287297190344282, "grad_norm": 0.6046485659264246, "learning_rate": 1.8644513885681858e-05, "loss": 0.367, "step": 993 }, { "epoch": 0.6293628808864266, "grad_norm": 0.6606065322819764, "learning_rate": 1.8641004497668215e-05, "loss": 0.3671, "step": 994 }, { "epoch": 0.629996042738425, "grad_norm": 0.5981525510748904, "learning_rate": 1.8637490903743702e-05, "loss": 0.3613, "step": 995 }, { "epoch": 0.6306292045904234, "grad_norm": 0.6338952134192926, "learning_rate": 1.8633973105618525e-05, "loss": 0.3721, "step": 996 }, { "epoch": 0.6312623664424218, "grad_norm": 0.5625325957388463, "learning_rate": 1.8630451105004926e-05, "loss": 0.3648, "step": 997 }, { "epoch": 0.6318955282944203, "grad_norm": 0.6120069580187096, "learning_rate": 1.86269249036172e-05, "loss": 0.3636, "step": 998 }, { "epoch": 0.6325286901464187, "grad_norm": 0.644387318686003, "learning_rate": 1.862339450317169e-05, "loss": 0.3755, "step": 999 }, { "epoch": 0.6331618519984171, "grad_norm": 0.6127760521442106, "learning_rate": 1.8619859905386774e-05, "loss": 0.3717, "step": 1000 }, { "epoch": 0.6337950138504155, "grad_norm": 0.5979216376481588, "learning_rate": 1.8616321111982876e-05, "loss": 0.3759, "step": 1001 }, { "epoch": 0.6344281757024139, "grad_norm": 0.5883447871088159, "learning_rate": 1.8612778124682466e-05, "loss": 0.3701, "step": 1002 }, { "epoch": 0.6350613375544123, "grad_norm": 0.5917140314262573, "learning_rate": 1.8609230945210056e-05, "loss": 0.3803, "step": 1003 }, { "epoch": 0.6356944994064108, "grad_norm": 0.6614826154860304, "learning_rate": 1.860567957529219e-05, "loss": 0.3623, "step": 1004 }, { "epoch": 0.6363276612584092, "grad_norm": 0.6375933862721477, "learning_rate": 1.860212401665746e-05, "loss": 0.3695, "step": 1005 }, { "epoch": 0.6369608231104076, "grad_norm": 0.612596385415155, "learning_rate": 1.8598564271036493e-05, "loss": 0.394, "step": 1006 }, { "epoch": 0.637593984962406, "grad_norm": 0.6162177816987678, "learning_rate": 1.8595000340161954e-05, "loss": 0.3784, "step": 1007 }, { "epoch": 0.6382271468144044, "grad_norm": 0.5906964052146128, "learning_rate": 1.859143222576855e-05, "loss": 0.3847, "step": 1008 }, { "epoch": 0.6388603086664029, "grad_norm": 0.7238524190248031, "learning_rate": 1.858785992959301e-05, "loss": 0.3552, "step": 1009 }, { "epoch": 0.6394934705184012, "grad_norm": 0.5880568514499974, "learning_rate": 1.8584283453374122e-05, "loss": 0.3657, "step": 1010 }, { "epoch": 0.6401266323703997, "grad_norm": 0.6632856318526638, "learning_rate": 1.858070279885269e-05, "loss": 0.37, "step": 1011 }, { "epoch": 0.6407597942223981, "grad_norm": 0.6016692075624585, "learning_rate": 1.8577117967771553e-05, "loss": 0.3726, "step": 1012 }, { "epoch": 0.6413929560743965, "grad_norm": 0.5688709322373783, "learning_rate": 1.8573528961875586e-05, "loss": 0.3669, "step": 1013 }, { "epoch": 0.642026117926395, "grad_norm": 0.59376784737875, "learning_rate": 1.85699357829117e-05, "loss": 0.3693, "step": 1014 }, { "epoch": 0.6426592797783933, "grad_norm": 0.5861666443561204, "learning_rate": 1.856633843262884e-05, "loss": 0.3867, "step": 1015 }, { "epoch": 0.6432924416303918, "grad_norm": 0.6920264930405463, "learning_rate": 1.8562736912777967e-05, "loss": 0.351, "step": 1016 }, { "epoch": 0.6439256034823901, "grad_norm": 0.6232499472987663, "learning_rate": 1.8559131225112085e-05, "loss": 0.3723, "step": 1017 }, { "epoch": 0.6445587653343886, "grad_norm": 0.5636187374776724, "learning_rate": 1.8555521371386217e-05, "loss": 0.3761, "step": 1018 }, { "epoch": 0.6451919271863871, "grad_norm": 0.6149930144703821, "learning_rate": 1.8551907353357423e-05, "loss": 0.3692, "step": 1019 }, { "epoch": 0.6458250890383854, "grad_norm": 0.6445982682620227, "learning_rate": 1.854828917278478e-05, "loss": 0.3815, "step": 1020 }, { "epoch": 0.6464582508903839, "grad_norm": 0.5741457266230504, "learning_rate": 1.8544666831429404e-05, "loss": 0.3846, "step": 1021 }, { "epoch": 0.6470914127423822, "grad_norm": 0.5596112525820324, "learning_rate": 1.8541040331054422e-05, "loss": 0.3893, "step": 1022 }, { "epoch": 0.6477245745943807, "grad_norm": 0.892474197293613, "learning_rate": 1.853740967342499e-05, "loss": 0.3635, "step": 1023 }, { "epoch": 0.6483577364463791, "grad_norm": 0.6420711157985864, "learning_rate": 1.8533774860308296e-05, "loss": 0.3545, "step": 1024 }, { "epoch": 0.6489908982983775, "grad_norm": 0.5877950991956167, "learning_rate": 1.853013589347354e-05, "loss": 0.3702, "step": 1025 }, { "epoch": 0.649624060150376, "grad_norm": 0.534694439446179, "learning_rate": 1.852649277469195e-05, "loss": 0.3868, "step": 1026 }, { "epoch": 0.6502572220023743, "grad_norm": 0.5833181094015213, "learning_rate": 1.8522845505736774e-05, "loss": 0.3744, "step": 1027 }, { "epoch": 0.6508903838543728, "grad_norm": 0.5806286554265573, "learning_rate": 1.851919408838327e-05, "loss": 0.3726, "step": 1028 }, { "epoch": 0.6515235457063712, "grad_norm": 0.6213919671608455, "learning_rate": 1.8515538524408736e-05, "loss": 0.3546, "step": 1029 }, { "epoch": 0.6521567075583696, "grad_norm": 0.5723483918023697, "learning_rate": 1.851187881559247e-05, "loss": 0.3613, "step": 1030 }, { "epoch": 0.652789869410368, "grad_norm": 0.5738829404982974, "learning_rate": 1.8508214963715793e-05, "loss": 0.3737, "step": 1031 }, { "epoch": 0.6534230312623664, "grad_norm": 0.6450894170267297, "learning_rate": 1.8504546970562045e-05, "loss": 0.3788, "step": 1032 }, { "epoch": 0.6540561931143649, "grad_norm": 0.6471354133430754, "learning_rate": 1.850087483791658e-05, "loss": 0.3787, "step": 1033 }, { "epoch": 0.6546893549663633, "grad_norm": 0.569704447138437, "learning_rate": 1.8497198567566767e-05, "loss": 0.3834, "step": 1034 }, { "epoch": 0.6553225168183617, "grad_norm": 0.5902284924454306, "learning_rate": 1.849351816130199e-05, "loss": 0.356, "step": 1035 }, { "epoch": 0.6559556786703601, "grad_norm": 0.5892268357883347, "learning_rate": 1.8489833620913644e-05, "loss": 0.3596, "step": 1036 }, { "epoch": 0.6565888405223586, "grad_norm": 0.6187582782336997, "learning_rate": 1.8486144948195132e-05, "loss": 0.3738, "step": 1037 }, { "epoch": 0.6572220023743569, "grad_norm": 0.6405801539206627, "learning_rate": 1.848245214494188e-05, "loss": 0.3774, "step": 1038 }, { "epoch": 0.6578551642263554, "grad_norm": 0.5567362440243985, "learning_rate": 1.8478755212951317e-05, "loss": 0.3668, "step": 1039 }, { "epoch": 0.6584883260783537, "grad_norm": 0.5957991056122862, "learning_rate": 1.847505415402288e-05, "loss": 0.3698, "step": 1040 }, { "epoch": 0.6591214879303522, "grad_norm": 0.5499148980998202, "learning_rate": 1.8471348969958026e-05, "loss": 0.3654, "step": 1041 }, { "epoch": 0.6597546497823507, "grad_norm": 0.6239468682689138, "learning_rate": 1.84676396625602e-05, "loss": 0.3649, "step": 1042 }, { "epoch": 0.660387811634349, "grad_norm": 0.547848438206227, "learning_rate": 1.846392623363487e-05, "loss": 0.378, "step": 1043 }, { "epoch": 0.6610209734863475, "grad_norm": 0.6303941110696487, "learning_rate": 1.8460208684989505e-05, "loss": 0.3892, "step": 1044 }, { "epoch": 0.6616541353383458, "grad_norm": 0.6305160443793808, "learning_rate": 1.8456487018433585e-05, "loss": 0.3498, "step": 1045 }, { "epoch": 0.6622872971903443, "grad_norm": 0.5974579560292679, "learning_rate": 1.8452761235778577e-05, "loss": 0.3688, "step": 1046 }, { "epoch": 0.6629204590423426, "grad_norm": 0.6273243083158165, "learning_rate": 1.8449031338837974e-05, "loss": 0.3595, "step": 1047 }, { "epoch": 0.6635536208943411, "grad_norm": 0.5902598364235484, "learning_rate": 1.8445297329427257e-05, "loss": 0.3539, "step": 1048 }, { "epoch": 0.6641867827463396, "grad_norm": 0.5768444794295022, "learning_rate": 1.8441559209363916e-05, "loss": 0.367, "step": 1049 }, { "epoch": 0.6648199445983379, "grad_norm": 0.5655784893333454, "learning_rate": 1.8437816980467432e-05, "loss": 0.3674, "step": 1050 }, { "epoch": 0.6654531064503364, "grad_norm": 0.5573147992361733, "learning_rate": 1.8434070644559298e-05, "loss": 0.3587, "step": 1051 }, { "epoch": 0.6660862683023347, "grad_norm": 0.6013301767265987, "learning_rate": 1.8430320203462996e-05, "loss": 0.344, "step": 1052 }, { "epoch": 0.6667194301543332, "grad_norm": 0.5601328004416374, "learning_rate": 1.8426565659004015e-05, "loss": 0.3804, "step": 1053 }, { "epoch": 0.6673525920063316, "grad_norm": 0.5752362202661687, "learning_rate": 1.8422807013009835e-05, "loss": 0.3605, "step": 1054 }, { "epoch": 0.66798575385833, "grad_norm": 0.5852918359205099, "learning_rate": 1.841904426730994e-05, "loss": 0.353, "step": 1055 }, { "epoch": 0.6686189157103285, "grad_norm": 0.7118586459866062, "learning_rate": 1.8415277423735788e-05, "loss": 0.3712, "step": 1056 }, { "epoch": 0.6692520775623269, "grad_norm": 0.5420538217585712, "learning_rate": 1.8411506484120862e-05, "loss": 0.3767, "step": 1057 }, { "epoch": 0.6698852394143253, "grad_norm": 0.5779741447922296, "learning_rate": 1.8407731450300622e-05, "loss": 0.3766, "step": 1058 }, { "epoch": 0.6705184012663237, "grad_norm": 0.5625290044108457, "learning_rate": 1.8403952324112516e-05, "loss": 0.3697, "step": 1059 }, { "epoch": 0.6711515631183221, "grad_norm": 0.5178645171838488, "learning_rate": 1.8400169107396e-05, "loss": 0.373, "step": 1060 }, { "epoch": 0.6717847249703205, "grad_norm": 0.5899339601151495, "learning_rate": 1.8396381801992506e-05, "loss": 0.3617, "step": 1061 }, { "epoch": 0.672417886822319, "grad_norm": 0.542074747883639, "learning_rate": 1.839259040974546e-05, "loss": 0.3624, "step": 1062 }, { "epoch": 0.6730510486743174, "grad_norm": 0.5746440837045141, "learning_rate": 1.8388794932500285e-05, "loss": 0.3773, "step": 1063 }, { "epoch": 0.6736842105263158, "grad_norm": 0.5370881313169931, "learning_rate": 1.8384995372104383e-05, "loss": 0.3818, "step": 1064 }, { "epoch": 0.6743173723783142, "grad_norm": 0.5767603576698904, "learning_rate": 1.8381191730407147e-05, "loss": 0.3676, "step": 1065 }, { "epoch": 0.6749505342303126, "grad_norm": 0.5212059240442241, "learning_rate": 1.8377384009259958e-05, "loss": 0.365, "step": 1066 }, { "epoch": 0.6755836960823111, "grad_norm": 0.601512063957476, "learning_rate": 1.8373572210516178e-05, "loss": 0.3584, "step": 1067 }, { "epoch": 0.6762168579343094, "grad_norm": 0.5435310368869701, "learning_rate": 1.836975633603116e-05, "loss": 0.3561, "step": 1068 }, { "epoch": 0.6768500197863079, "grad_norm": 0.5984172923279657, "learning_rate": 1.8365936387662238e-05, "loss": 0.3815, "step": 1069 }, { "epoch": 0.6774831816383063, "grad_norm": 0.5938847373217014, "learning_rate": 1.8362112367268723e-05, "loss": 0.3602, "step": 1070 }, { "epoch": 0.6781163434903047, "grad_norm": 0.5909648294948189, "learning_rate": 1.835828427671192e-05, "loss": 0.3916, "step": 1071 }, { "epoch": 0.6787495053423032, "grad_norm": 0.6029572059883065, "learning_rate": 1.8354452117855108e-05, "loss": 0.3688, "step": 1072 }, { "epoch": 0.6793826671943015, "grad_norm": 0.6170836183755596, "learning_rate": 1.835061589256354e-05, "loss": 0.36, "step": 1073 }, { "epoch": 0.6800158290463, "grad_norm": 0.556509922410444, "learning_rate": 1.8346775602704464e-05, "loss": 0.3829, "step": 1074 }, { "epoch": 0.6806489908982983, "grad_norm": 0.6153441630476257, "learning_rate": 1.834293125014709e-05, "loss": 0.3699, "step": 1075 }, { "epoch": 0.6812821527502968, "grad_norm": 0.6144441317038124, "learning_rate": 1.8339082836762618e-05, "loss": 0.3519, "step": 1076 }, { "epoch": 0.6819153146022953, "grad_norm": 0.566994588995616, "learning_rate": 1.833523036442422e-05, "loss": 0.391, "step": 1077 }, { "epoch": 0.6825484764542936, "grad_norm": 0.600404932194796, "learning_rate": 1.833137383500704e-05, "loss": 0.3708, "step": 1078 }, { "epoch": 0.6831816383062921, "grad_norm": 0.5156993776027072, "learning_rate": 1.83275132503882e-05, "loss": 0.3779, "step": 1079 }, { "epoch": 0.6838148001582904, "grad_norm": 0.5352137559124773, "learning_rate": 1.83236486124468e-05, "loss": 0.3688, "step": 1080 }, { "epoch": 0.6844479620102889, "grad_norm": 0.5474830884094319, "learning_rate": 1.83197799230639e-05, "loss": 0.3559, "step": 1081 }, { "epoch": 0.6850811238622873, "grad_norm": 0.5710255781288813, "learning_rate": 1.831590718412255e-05, "loss": 0.3594, "step": 1082 }, { "epoch": 0.6857142857142857, "grad_norm": 0.5701814787002943, "learning_rate": 1.8312030397507757e-05, "loss": 0.382, "step": 1083 }, { "epoch": 0.6863474475662842, "grad_norm": 0.6833078285251714, "learning_rate": 1.8308149565106507e-05, "loss": 0.3609, "step": 1084 }, { "epoch": 0.6869806094182825, "grad_norm": 0.5511534369973602, "learning_rate": 1.8304264688807743e-05, "loss": 0.3714, "step": 1085 }, { "epoch": 0.687613771270281, "grad_norm": 0.6189309713521197, "learning_rate": 1.830037577050239e-05, "loss": 0.3424, "step": 1086 }, { "epoch": 0.6882469331222794, "grad_norm": 0.5650354869943955, "learning_rate": 1.8296482812083335e-05, "loss": 0.37, "step": 1087 }, { "epoch": 0.6888800949742778, "grad_norm": 0.5472993573504756, "learning_rate": 1.8292585815445432e-05, "loss": 0.3989, "step": 1088 }, { "epoch": 0.6895132568262762, "grad_norm": 0.579183358351138, "learning_rate": 1.82886847824855e-05, "loss": 0.3772, "step": 1089 }, { "epoch": 0.6901464186782746, "grad_norm": 0.6176780644067786, "learning_rate": 1.8284779715102315e-05, "loss": 0.3772, "step": 1090 }, { "epoch": 0.690779580530273, "grad_norm": 0.5907994214334336, "learning_rate": 1.8280870615196632e-05, "loss": 0.3728, "step": 1091 }, { "epoch": 0.6914127423822715, "grad_norm": 0.5565805346019972, "learning_rate": 1.8276957484671162e-05, "loss": 0.3688, "step": 1092 }, { "epoch": 0.6920459042342699, "grad_norm": 0.5777799037391069, "learning_rate": 1.8273040325430575e-05, "loss": 0.3608, "step": 1093 }, { "epoch": 0.6926790660862683, "grad_norm": 0.5686189732659468, "learning_rate": 1.82691191393815e-05, "loss": 0.3509, "step": 1094 }, { "epoch": 0.6933122279382667, "grad_norm": 0.6297929297441917, "learning_rate": 1.8265193928432536e-05, "loss": 0.3729, "step": 1095 }, { "epoch": 0.6939453897902651, "grad_norm": 0.5892217905953299, "learning_rate": 1.8261264694494225e-05, "loss": 0.3814, "step": 1096 }, { "epoch": 0.6945785516422636, "grad_norm": 0.5851710001357096, "learning_rate": 1.8257331439479088e-05, "loss": 0.3753, "step": 1097 }, { "epoch": 0.6952117134942619, "grad_norm": 0.6463548154433715, "learning_rate": 1.8253394165301587e-05, "loss": 0.3785, "step": 1098 }, { "epoch": 0.6958448753462604, "grad_norm": 0.567742161300617, "learning_rate": 1.8249452873878146e-05, "loss": 0.3502, "step": 1099 }, { "epoch": 0.6964780371982588, "grad_norm": 0.6587888062831951, "learning_rate": 1.8245507567127148e-05, "loss": 0.3633, "step": 1100 }, { "epoch": 0.6971111990502572, "grad_norm": 0.5885400411967872, "learning_rate": 1.824155824696892e-05, "loss": 0.3672, "step": 1101 }, { "epoch": 0.6977443609022557, "grad_norm": 0.618314699415007, "learning_rate": 1.823760491532575e-05, "loss": 0.3765, "step": 1102 }, { "epoch": 0.698377522754254, "grad_norm": 0.5551776991894982, "learning_rate": 1.823364757412188e-05, "loss": 0.3548, "step": 1103 }, { "epoch": 0.6990106846062525, "grad_norm": 0.6754643798238982, "learning_rate": 1.8229686225283497e-05, "loss": 0.3451, "step": 1104 }, { "epoch": 0.6996438464582508, "grad_norm": 0.6188850353543128, "learning_rate": 1.8225720870738745e-05, "loss": 0.3748, "step": 1105 }, { "epoch": 0.7002770083102493, "grad_norm": 0.6129018679270483, "learning_rate": 1.822175151241772e-05, "loss": 0.3722, "step": 1106 }, { "epoch": 0.7009101701622478, "grad_norm": 0.612152663847087, "learning_rate": 1.821777815225245e-05, "loss": 0.3762, "step": 1107 }, { "epoch": 0.7015433320142461, "grad_norm": 0.5906470969261947, "learning_rate": 1.821380079217694e-05, "loss": 0.3673, "step": 1108 }, { "epoch": 0.7021764938662446, "grad_norm": 0.5255822446618104, "learning_rate": 1.8209819434127108e-05, "loss": 0.38, "step": 1109 }, { "epoch": 0.7028096557182429, "grad_norm": 0.5649628591630739, "learning_rate": 1.8205834080040847e-05, "loss": 0.3792, "step": 1110 }, { "epoch": 0.7034428175702414, "grad_norm": 0.5177105632297271, "learning_rate": 1.8201844731857977e-05, "loss": 0.3679, "step": 1111 }, { "epoch": 0.7040759794222398, "grad_norm": 0.6130902176203834, "learning_rate": 1.8197851391520265e-05, "loss": 0.3587, "step": 1112 }, { "epoch": 0.7047091412742382, "grad_norm": 0.5084913105129977, "learning_rate": 1.8193854060971433e-05, "loss": 0.3716, "step": 1113 }, { "epoch": 0.7053423031262367, "grad_norm": 0.5318993120057826, "learning_rate": 1.8189852742157125e-05, "loss": 0.3678, "step": 1114 }, { "epoch": 0.705975464978235, "grad_norm": 0.5428472034432253, "learning_rate": 1.818584743702495e-05, "loss": 0.3611, "step": 1115 }, { "epoch": 0.7066086268302335, "grad_norm": 0.5879093406676648, "learning_rate": 1.818183814752444e-05, "loss": 0.3577, "step": 1116 }, { "epoch": 0.7072417886822319, "grad_norm": 0.5793208926662043, "learning_rate": 1.8177824875607065e-05, "loss": 0.3791, "step": 1117 }, { "epoch": 0.7078749505342303, "grad_norm": 0.612343800064264, "learning_rate": 1.8173807623226244e-05, "loss": 0.36, "step": 1118 }, { "epoch": 0.7085081123862287, "grad_norm": 0.5849079814040952, "learning_rate": 1.8169786392337335e-05, "loss": 0.3686, "step": 1119 }, { "epoch": 0.7091412742382271, "grad_norm": 0.5913130536920814, "learning_rate": 1.8165761184897617e-05, "loss": 0.3785, "step": 1120 }, { "epoch": 0.7097744360902256, "grad_norm": 0.578236177314469, "learning_rate": 1.8161732002866316e-05, "loss": 0.3741, "step": 1121 }, { "epoch": 0.710407597942224, "grad_norm": 0.5405235319964413, "learning_rate": 1.8157698848204596e-05, "loss": 0.368, "step": 1122 }, { "epoch": 0.7110407597942224, "grad_norm": 0.5763551890101115, "learning_rate": 1.8153661722875548e-05, "loss": 0.3775, "step": 1123 }, { "epoch": 0.7116739216462208, "grad_norm": 0.5271260773555898, "learning_rate": 1.814962062884419e-05, "loss": 0.3635, "step": 1124 }, { "epoch": 0.7123070834982193, "grad_norm": 0.548612384845434, "learning_rate": 1.8145575568077486e-05, "loss": 0.3584, "step": 1125 }, { "epoch": 0.7129402453502176, "grad_norm": 0.5698401438331759, "learning_rate": 1.814152654254432e-05, "loss": 0.3621, "step": 1126 }, { "epoch": 0.7135734072022161, "grad_norm": 0.6661623219774246, "learning_rate": 1.8137473554215507e-05, "loss": 0.3694, "step": 1127 }, { "epoch": 0.7142065690542145, "grad_norm": 0.5411189480226077, "learning_rate": 1.8133416605063802e-05, "loss": 0.3708, "step": 1128 }, { "epoch": 0.7148397309062129, "grad_norm": 0.5671168954191186, "learning_rate": 1.812935569706387e-05, "loss": 0.3785, "step": 1129 }, { "epoch": 0.7154728927582114, "grad_norm": 0.5534797894147735, "learning_rate": 1.8125290832192317e-05, "loss": 0.3724, "step": 1130 }, { "epoch": 0.7161060546102097, "grad_norm": 0.5485968312142845, "learning_rate": 1.8121222012427666e-05, "loss": 0.3606, "step": 1131 }, { "epoch": 0.7167392164622082, "grad_norm": 0.562157058651462, "learning_rate": 1.811714923975037e-05, "loss": 0.3624, "step": 1132 }, { "epoch": 0.7173723783142065, "grad_norm": 0.6015682160470311, "learning_rate": 1.8113072516142807e-05, "loss": 0.3461, "step": 1133 }, { "epoch": 0.718005540166205, "grad_norm": 0.5440116198781026, "learning_rate": 1.8108991843589275e-05, "loss": 0.3631, "step": 1134 }, { "epoch": 0.7186387020182035, "grad_norm": 0.5935764822397143, "learning_rate": 1.8104907224075992e-05, "loss": 0.3632, "step": 1135 }, { "epoch": 0.7192718638702018, "grad_norm": 0.5640872785955315, "learning_rate": 1.8100818659591106e-05, "loss": 0.3546, "step": 1136 }, { "epoch": 0.7199050257222003, "grad_norm": 0.5446474906474759, "learning_rate": 1.809672615212467e-05, "loss": 0.3573, "step": 1137 }, { "epoch": 0.7205381875741986, "grad_norm": 0.5395983694863076, "learning_rate": 1.8092629703668677e-05, "loss": 0.3711, "step": 1138 }, { "epoch": 0.7211713494261971, "grad_norm": 0.5562053553078519, "learning_rate": 1.8088529316217024e-05, "loss": 0.3766, "step": 1139 }, { "epoch": 0.7218045112781954, "grad_norm": 0.5779909905025135, "learning_rate": 1.8084424991765523e-05, "loss": 0.3634, "step": 1140 }, { "epoch": 0.7224376731301939, "grad_norm": 0.5761470697029769, "learning_rate": 1.8080316732311907e-05, "loss": 0.3452, "step": 1141 }, { "epoch": 0.7230708349821923, "grad_norm": 0.5530690159965241, "learning_rate": 1.807620453985583e-05, "loss": 0.3797, "step": 1142 }, { "epoch": 0.7237039968341907, "grad_norm": 0.5454764319308514, "learning_rate": 1.8072088416398852e-05, "loss": 0.3824, "step": 1143 }, { "epoch": 0.7243371586861892, "grad_norm": 0.5474969313752484, "learning_rate": 1.806796836394445e-05, "loss": 0.372, "step": 1144 }, { "epoch": 0.7249703205381876, "grad_norm": 0.5557545620190125, "learning_rate": 1.8063844384498015e-05, "loss": 0.355, "step": 1145 }, { "epoch": 0.725603482390186, "grad_norm": 0.5422863145765651, "learning_rate": 1.805971648006684e-05, "loss": 0.3748, "step": 1146 }, { "epoch": 0.7262366442421844, "grad_norm": 0.5219476415544394, "learning_rate": 1.8055584652660143e-05, "loss": 0.3753, "step": 1147 }, { "epoch": 0.7268698060941828, "grad_norm": 0.5550028776780925, "learning_rate": 1.8051448904289043e-05, "loss": 0.3727, "step": 1148 }, { "epoch": 0.7275029679461812, "grad_norm": 0.5912106326886157, "learning_rate": 1.8047309236966565e-05, "loss": 0.3687, "step": 1149 }, { "epoch": 0.7281361297981797, "grad_norm": 1.2936960032005786, "learning_rate": 1.804316565270765e-05, "loss": 0.357, "step": 1150 }, { "epoch": 0.7287692916501781, "grad_norm": 0.6156098045820575, "learning_rate": 1.8039018153529137e-05, "loss": 0.3564, "step": 1151 }, { "epoch": 0.7294024535021765, "grad_norm": 0.5216291959844793, "learning_rate": 1.803486674144977e-05, "loss": 0.3836, "step": 1152 }, { "epoch": 0.7300356153541749, "grad_norm": 0.6078112695628359, "learning_rate": 1.8030711418490214e-05, "loss": 0.3727, "step": 1153 }, { "epoch": 0.7306687772061733, "grad_norm": 0.5668002419036349, "learning_rate": 1.8026552186673014e-05, "loss": 0.3543, "step": 1154 }, { "epoch": 0.7313019390581718, "grad_norm": 0.6062596934286766, "learning_rate": 1.8022389048022633e-05, "loss": 0.3566, "step": 1155 }, { "epoch": 0.7319351009101701, "grad_norm": 0.5309306315735529, "learning_rate": 1.8018222004565436e-05, "loss": 0.3622, "step": 1156 }, { "epoch": 0.7325682627621686, "grad_norm": 0.6348157181883808, "learning_rate": 1.8014051058329674e-05, "loss": 0.3616, "step": 1157 }, { "epoch": 0.733201424614167, "grad_norm": 0.5664129037337716, "learning_rate": 1.8009876211345518e-05, "loss": 0.3672, "step": 1158 }, { "epoch": 0.7338345864661654, "grad_norm": 0.5899186656457893, "learning_rate": 1.8005697465645017e-05, "loss": 0.3677, "step": 1159 }, { "epoch": 0.7344677483181639, "grad_norm": 0.545642683851767, "learning_rate": 1.800151482326214e-05, "loss": 0.3614, "step": 1160 }, { "epoch": 0.7351009101701622, "grad_norm": 0.5837270102764961, "learning_rate": 1.799732828623273e-05, "loss": 0.3663, "step": 1161 }, { "epoch": 0.7357340720221607, "grad_norm": 0.6033159037179214, "learning_rate": 1.799313785659454e-05, "loss": 0.3405, "step": 1162 }, { "epoch": 0.736367233874159, "grad_norm": 0.5561625052661069, "learning_rate": 1.7988943536387216e-05, "loss": 0.3824, "step": 1163 }, { "epoch": 0.7370003957261575, "grad_norm": 0.5296026549110102, "learning_rate": 1.7984745327652294e-05, "loss": 0.3654, "step": 1164 }, { "epoch": 0.737633557578156, "grad_norm": 0.5680254260331294, "learning_rate": 1.798054323243321e-05, "loss": 0.3597, "step": 1165 }, { "epoch": 0.7382667194301543, "grad_norm": 0.5445105390501249, "learning_rate": 1.797633725277528e-05, "loss": 0.3483, "step": 1166 }, { "epoch": 0.7388998812821528, "grad_norm": 0.5981479769553163, "learning_rate": 1.7972127390725715e-05, "loss": 0.3518, "step": 1167 }, { "epoch": 0.7395330431341511, "grad_norm": 0.6813057174089686, "learning_rate": 1.796791364833362e-05, "loss": 0.3774, "step": 1168 }, { "epoch": 0.7401662049861496, "grad_norm": 0.5910331256272656, "learning_rate": 1.796369602764999e-05, "loss": 0.3712, "step": 1169 }, { "epoch": 0.740799366838148, "grad_norm": 0.5263135427588711, "learning_rate": 1.7959474530727696e-05, "loss": 0.3783, "step": 1170 }, { "epoch": 0.7414325286901464, "grad_norm": 0.6039842989594478, "learning_rate": 1.7955249159621514e-05, "loss": 0.3805, "step": 1171 }, { "epoch": 0.7420656905421449, "grad_norm": 0.513150574731169, "learning_rate": 1.795101991638809e-05, "loss": 0.3779, "step": 1172 }, { "epoch": 0.7426988523941432, "grad_norm": 0.5357171461487978, "learning_rate": 1.7946786803085955e-05, "loss": 0.357, "step": 1173 }, { "epoch": 0.7433320142461417, "grad_norm": 0.5619538517071481, "learning_rate": 1.7942549821775537e-05, "loss": 0.3557, "step": 1174 }, { "epoch": 0.7439651760981401, "grad_norm": 0.5431959334344694, "learning_rate": 1.793830897451914e-05, "loss": 0.3705, "step": 1175 }, { "epoch": 0.7445983379501385, "grad_norm": 0.5612292856751678, "learning_rate": 1.793406426338094e-05, "loss": 0.3615, "step": 1176 }, { "epoch": 0.7452314998021369, "grad_norm": 0.5453504635768504, "learning_rate": 1.7929815690427004e-05, "loss": 0.366, "step": 1177 }, { "epoch": 0.7458646616541353, "grad_norm": 0.5388730784939249, "learning_rate": 1.792556325772528e-05, "loss": 0.3664, "step": 1178 }, { "epoch": 0.7464978235061338, "grad_norm": 0.532701665567647, "learning_rate": 1.7921306967345592e-05, "loss": 0.3589, "step": 1179 }, { "epoch": 0.7471309853581322, "grad_norm": 0.564538993456631, "learning_rate": 1.7917046821359637e-05, "loss": 0.3438, "step": 1180 }, { "epoch": 0.7477641472101306, "grad_norm": 0.601948486600775, "learning_rate": 1.7912782821840995e-05, "loss": 0.367, "step": 1181 }, { "epoch": 0.748397309062129, "grad_norm": 0.6243257278796703, "learning_rate": 1.790851497086512e-05, "loss": 0.3908, "step": 1182 }, { "epoch": 0.7490304709141274, "grad_norm": 0.5682708228806582, "learning_rate": 1.7904243270509338e-05, "loss": 0.3617, "step": 1183 }, { "epoch": 0.7496636327661258, "grad_norm": 0.5637438074339497, "learning_rate": 1.7899967722852853e-05, "loss": 0.3601, "step": 1184 }, { "epoch": 0.7502967946181243, "grad_norm": 0.5806643453350805, "learning_rate": 1.7895688329976737e-05, "loss": 0.3653, "step": 1185 }, { "epoch": 0.7509299564701227, "grad_norm": 0.6183655092587411, "learning_rate": 1.789140509396394e-05, "loss": 0.3435, "step": 1186 }, { "epoch": 0.7515631183221211, "grad_norm": 0.5436751026719984, "learning_rate": 1.7887118016899272e-05, "loss": 0.3673, "step": 1187 }, { "epoch": 0.7521962801741195, "grad_norm": 0.6757929133045946, "learning_rate": 1.788282710086942e-05, "loss": 0.3713, "step": 1188 }, { "epoch": 0.7528294420261179, "grad_norm": 0.6349465466303442, "learning_rate": 1.7878532347962947e-05, "loss": 0.3571, "step": 1189 }, { "epoch": 0.7534626038781164, "grad_norm": 0.5903583968754788, "learning_rate": 1.7874233760270264e-05, "loss": 0.3621, "step": 1190 }, { "epoch": 0.7540957657301147, "grad_norm": 0.63583382190417, "learning_rate": 1.786993133988367e-05, "loss": 0.3641, "step": 1191 }, { "epoch": 0.7547289275821132, "grad_norm": 0.5706075774716195, "learning_rate": 1.7865625088897313e-05, "loss": 0.3693, "step": 1192 }, { "epoch": 0.7553620894341116, "grad_norm": 0.593740795944009, "learning_rate": 1.7861315009407215e-05, "loss": 0.3674, "step": 1193 }, { "epoch": 0.75599525128611, "grad_norm": 0.5858558170294981, "learning_rate": 1.7857001103511256e-05, "loss": 0.3732, "step": 1194 }, { "epoch": 0.7566284131381085, "grad_norm": 0.6767195949546776, "learning_rate": 1.7852683373309178e-05, "loss": 0.3568, "step": 1195 }, { "epoch": 0.7572615749901068, "grad_norm": 0.5957117657018506, "learning_rate": 1.7848361820902594e-05, "loss": 0.3593, "step": 1196 }, { "epoch": 0.7578947368421053, "grad_norm": 0.5740706817899217, "learning_rate": 1.7844036448394965e-05, "loss": 0.3852, "step": 1197 }, { "epoch": 0.7585278986941036, "grad_norm": 0.5806324280909475, "learning_rate": 1.7839707257891622e-05, "loss": 0.3732, "step": 1198 }, { "epoch": 0.7591610605461021, "grad_norm": 0.5060088809581653, "learning_rate": 1.7835374251499743e-05, "loss": 0.358, "step": 1199 }, { "epoch": 0.7597942223981005, "grad_norm": 0.6841469565550979, "learning_rate": 1.7831037431328378e-05, "loss": 0.3554, "step": 1200 }, { "epoch": 0.7604273842500989, "grad_norm": 0.5224591633573585, "learning_rate": 1.7826696799488418e-05, "loss": 0.375, "step": 1201 }, { "epoch": 0.7610605461020974, "grad_norm": 0.5059887076098845, "learning_rate": 1.7822352358092614e-05, "loss": 0.3527, "step": 1202 }, { "epoch": 0.7616937079540957, "grad_norm": 0.6049944829822, "learning_rate": 1.7818004109255584e-05, "loss": 0.3585, "step": 1203 }, { "epoch": 0.7623268698060942, "grad_norm": 0.5959399508225727, "learning_rate": 1.7813652055093778e-05, "loss": 0.3756, "step": 1204 }, { "epoch": 0.7629600316580926, "grad_norm": 0.8834790615011671, "learning_rate": 1.7809296197725518e-05, "loss": 0.3662, "step": 1205 }, { "epoch": 0.763593193510091, "grad_norm": 0.5318238845847583, "learning_rate": 1.780493653927096e-05, "loss": 0.3639, "step": 1206 }, { "epoch": 0.7642263553620894, "grad_norm": 0.5585229765961075, "learning_rate": 1.7800573081852124e-05, "loss": 0.3652, "step": 1207 }, { "epoch": 0.7648595172140878, "grad_norm": 0.5891244866012901, "learning_rate": 1.779620582759287e-05, "loss": 0.3411, "step": 1208 }, { "epoch": 0.7654926790660863, "grad_norm": 0.553036985543611, "learning_rate": 1.7791834778618914e-05, "loss": 0.3563, "step": 1209 }, { "epoch": 0.7661258409180847, "grad_norm": 0.5241416365385405, "learning_rate": 1.7787459937057808e-05, "loss": 0.3766, "step": 1210 }, { "epoch": 0.7667590027700831, "grad_norm": 0.5677650076891827, "learning_rate": 1.7783081305038964e-05, "loss": 0.3826, "step": 1211 }, { "epoch": 0.7673921646220815, "grad_norm": 0.5252593725683831, "learning_rate": 1.7778698884693622e-05, "loss": 0.347, "step": 1212 }, { "epoch": 0.76802532647408, "grad_norm": 0.5306235343961452, "learning_rate": 1.7774312678154886e-05, "loss": 0.3743, "step": 1213 }, { "epoch": 0.7686584883260783, "grad_norm": 0.5482900872548632, "learning_rate": 1.7769922687557685e-05, "loss": 0.366, "step": 1214 }, { "epoch": 0.7692916501780768, "grad_norm": 0.5489634007612059, "learning_rate": 1.7765528915038798e-05, "loss": 0.3698, "step": 1215 }, { "epoch": 0.7699248120300752, "grad_norm": 0.5442404628456465, "learning_rate": 1.7761131362736845e-05, "loss": 0.352, "step": 1216 }, { "epoch": 0.7705579738820736, "grad_norm": 0.5320553098791828, "learning_rate": 1.7756730032792285e-05, "loss": 0.3562, "step": 1217 }, { "epoch": 0.771191135734072, "grad_norm": 0.5742906797540702, "learning_rate": 1.7752324927347414e-05, "loss": 0.3834, "step": 1218 }, { "epoch": 0.7718242975860704, "grad_norm": 0.5718542644244723, "learning_rate": 1.7747916048546372e-05, "loss": 0.3528, "step": 1219 }, { "epoch": 0.7724574594380689, "grad_norm": 0.6069667059661923, "learning_rate": 1.7743503398535123e-05, "loss": 0.3535, "step": 1220 }, { "epoch": 0.7730906212900672, "grad_norm": 0.5529875351647033, "learning_rate": 1.773908697946148e-05, "loss": 0.3656, "step": 1221 }, { "epoch": 0.7737237831420657, "grad_norm": 0.7090371559580917, "learning_rate": 1.7734666793475083e-05, "loss": 0.3575, "step": 1222 }, { "epoch": 0.7743569449940642, "grad_norm": 0.5819488106435573, "learning_rate": 1.7730242842727404e-05, "loss": 0.3634, "step": 1223 }, { "epoch": 0.7749901068460625, "grad_norm": 0.6186883624336381, "learning_rate": 1.7725815129371757e-05, "loss": 0.3797, "step": 1224 }, { "epoch": 0.775623268698061, "grad_norm": 0.6522636149527434, "learning_rate": 1.772138365556328e-05, "loss": 0.3676, "step": 1225 }, { "epoch": 0.7762564305500593, "grad_norm": 0.5875569350947746, "learning_rate": 1.771694842345894e-05, "loss": 0.3651, "step": 1226 }, { "epoch": 0.7768895924020578, "grad_norm": 0.5416672267995631, "learning_rate": 1.7712509435217534e-05, "loss": 0.3534, "step": 1227 }, { "epoch": 0.7775227542540561, "grad_norm": 0.5935920171670677, "learning_rate": 1.77080666929997e-05, "loss": 0.3425, "step": 1228 }, { "epoch": 0.7781559161060546, "grad_norm": 0.5570728463317237, "learning_rate": 1.7703620198967876e-05, "loss": 0.3501, "step": 1229 }, { "epoch": 0.7787890779580531, "grad_norm": 0.5321626307347082, "learning_rate": 1.7699169955286353e-05, "loss": 0.3749, "step": 1230 }, { "epoch": 0.7794222398100514, "grad_norm": 0.8054428273459621, "learning_rate": 1.7694715964121235e-05, "loss": 0.3754, "step": 1231 }, { "epoch": 0.7800554016620499, "grad_norm": 0.607136461317216, "learning_rate": 1.769025822764045e-05, "loss": 0.3524, "step": 1232 }, { "epoch": 0.7806885635140483, "grad_norm": 0.5440165017853458, "learning_rate": 1.7685796748013754e-05, "loss": 0.365, "step": 1233 }, { "epoch": 0.7813217253660467, "grad_norm": 0.7079944069271772, "learning_rate": 1.768133152741272e-05, "loss": 0.3552, "step": 1234 }, { "epoch": 0.7819548872180451, "grad_norm": 0.5343430139658615, "learning_rate": 1.7676862568010743e-05, "loss": 0.3513, "step": 1235 }, { "epoch": 0.7825880490700435, "grad_norm": 0.6070665378255286, "learning_rate": 1.7672389871983035e-05, "loss": 0.3858, "step": 1236 }, { "epoch": 0.783221210922042, "grad_norm": 0.6042816778916471, "learning_rate": 1.7667913441506633e-05, "loss": 0.3624, "step": 1237 }, { "epoch": 0.7838543727740404, "grad_norm": 0.5664479158435761, "learning_rate": 1.766343327876039e-05, "loss": 0.3594, "step": 1238 }, { "epoch": 0.7844875346260388, "grad_norm": 0.6293692895802439, "learning_rate": 1.7658949385924974e-05, "loss": 0.3554, "step": 1239 }, { "epoch": 0.7851206964780372, "grad_norm": 0.5603044656869642, "learning_rate": 1.7654461765182868e-05, "loss": 0.3835, "step": 1240 }, { "epoch": 0.7857538583300356, "grad_norm": 0.6424293996776493, "learning_rate": 1.764997041871837e-05, "loss": 0.3555, "step": 1241 }, { "epoch": 0.786387020182034, "grad_norm": 0.6104538142437049, "learning_rate": 1.7645475348717593e-05, "loss": 0.373, "step": 1242 }, { "epoch": 0.7870201820340325, "grad_norm": 0.6535408006588908, "learning_rate": 1.7640976557368462e-05, "loss": 0.3533, "step": 1243 }, { "epoch": 0.7876533438860309, "grad_norm": 0.5878392048082343, "learning_rate": 1.763647404686071e-05, "loss": 0.3783, "step": 1244 }, { "epoch": 0.7882865057380293, "grad_norm": 0.6149090893213592, "learning_rate": 1.7631967819385883e-05, "loss": 0.3475, "step": 1245 }, { "epoch": 0.7889196675900277, "grad_norm": 0.5610399680203129, "learning_rate": 1.762745787713734e-05, "loss": 0.3417, "step": 1246 }, { "epoch": 0.7895528294420261, "grad_norm": 0.610294715976349, "learning_rate": 1.7622944222310242e-05, "loss": 0.3597, "step": 1247 }, { "epoch": 0.7901859912940246, "grad_norm": 0.602878324745947, "learning_rate": 1.7618426857101555e-05, "loss": 0.3669, "step": 1248 }, { "epoch": 0.7908191531460229, "grad_norm": 0.7385260417553897, "learning_rate": 1.7613905783710063e-05, "loss": 0.3596, "step": 1249 }, { "epoch": 0.7914523149980214, "grad_norm": 0.6239580588384648, "learning_rate": 1.7609381004336344e-05, "loss": 0.3656, "step": 1250 }, { "epoch": 0.7920854768500197, "grad_norm": 0.6186415207942213, "learning_rate": 1.760485252118278e-05, "loss": 0.3776, "step": 1251 }, { "epoch": 0.7927186387020182, "grad_norm": 0.6131018504281605, "learning_rate": 1.760032033645356e-05, "loss": 0.3594, "step": 1252 }, { "epoch": 0.7933518005540167, "grad_norm": 0.6258488471974673, "learning_rate": 1.7595784452354677e-05, "loss": 0.3721, "step": 1253 }, { "epoch": 0.793984962406015, "grad_norm": 0.5912012556136351, "learning_rate": 1.759124487109392e-05, "loss": 0.3558, "step": 1254 }, { "epoch": 0.7946181242580135, "grad_norm": 0.5598180311576123, "learning_rate": 1.7586701594880873e-05, "loss": 0.3636, "step": 1255 }, { "epoch": 0.7952512861100118, "grad_norm": 0.5655856767429678, "learning_rate": 1.758215462592693e-05, "loss": 0.3689, "step": 1256 }, { "epoch": 0.7958844479620103, "grad_norm": 0.5409227311270224, "learning_rate": 1.7577603966445278e-05, "loss": 0.3618, "step": 1257 }, { "epoch": 0.7965176098140087, "grad_norm": 0.5966173073570545, "learning_rate": 1.7573049618650893e-05, "loss": 0.3659, "step": 1258 }, { "epoch": 0.7971507716660071, "grad_norm": 0.5739464723975908, "learning_rate": 1.7568491584760554e-05, "loss": 0.348, "step": 1259 }, { "epoch": 0.7977839335180056, "grad_norm": 0.5371380213708293, "learning_rate": 1.7563929866992837e-05, "loss": 0.3694, "step": 1260 }, { "epoch": 0.7984170953700039, "grad_norm": 0.5595959567653257, "learning_rate": 1.75593644675681e-05, "loss": 0.3586, "step": 1261 }, { "epoch": 0.7990502572220024, "grad_norm": 0.5913989330126729, "learning_rate": 1.75547953887085e-05, "loss": 0.3532, "step": 1262 }, { "epoch": 0.7996834190740008, "grad_norm": 0.5836563641852137, "learning_rate": 1.7550222632637992e-05, "loss": 0.3462, "step": 1263 }, { "epoch": 0.8003165809259992, "grad_norm": 0.5622052376667421, "learning_rate": 1.7545646201582304e-05, "loss": 0.3762, "step": 1264 }, { "epoch": 0.8009497427779976, "grad_norm": 0.5701354796652008, "learning_rate": 1.7541066097768965e-05, "loss": 0.358, "step": 1265 }, { "epoch": 0.801582904629996, "grad_norm": 0.5653036265658661, "learning_rate": 1.7536482323427288e-05, "loss": 0.3721, "step": 1266 }, { "epoch": 0.8022160664819945, "grad_norm": 0.6286759869908067, "learning_rate": 1.7531894880788373e-05, "loss": 0.3606, "step": 1267 }, { "epoch": 0.8028492283339929, "grad_norm": 0.5374556150478305, "learning_rate": 1.752730377208511e-05, "loss": 0.3576, "step": 1268 }, { "epoch": 0.8034823901859913, "grad_norm": 0.718072238583419, "learning_rate": 1.7522708999552166e-05, "loss": 0.3663, "step": 1269 }, { "epoch": 0.8041155520379897, "grad_norm": 0.5688715435016897, "learning_rate": 1.7518110565425993e-05, "loss": 0.3483, "step": 1270 }, { "epoch": 0.8047487138899881, "grad_norm": 0.5609026602869661, "learning_rate": 1.7513508471944823e-05, "loss": 0.3585, "step": 1271 }, { "epoch": 0.8053818757419865, "grad_norm": 0.5295908968157189, "learning_rate": 1.750890272134868e-05, "loss": 0.3712, "step": 1272 }, { "epoch": 0.806015037593985, "grad_norm": 0.5354417088764817, "learning_rate": 1.7504293315879364e-05, "loss": 0.3666, "step": 1273 }, { "epoch": 0.8066481994459834, "grad_norm": 0.55985203545222, "learning_rate": 1.7499680257780437e-05, "loss": 0.3743, "step": 1274 }, { "epoch": 0.8072813612979818, "grad_norm": 0.5440874717117571, "learning_rate": 1.7495063549297268e-05, "loss": 0.3731, "step": 1275 }, { "epoch": 0.8079145231499802, "grad_norm": 0.5531877831432632, "learning_rate": 1.7490443192676972e-05, "loss": 0.3722, "step": 1276 }, { "epoch": 0.8085476850019786, "grad_norm": 0.5402160578033471, "learning_rate": 1.7485819190168468e-05, "loss": 0.3613, "step": 1277 }, { "epoch": 0.8091808468539771, "grad_norm": 0.5321454167801649, "learning_rate": 1.7481191544022435e-05, "loss": 0.3534, "step": 1278 }, { "epoch": 0.8098140087059754, "grad_norm": 0.5213111841290136, "learning_rate": 1.747656025649132e-05, "loss": 0.3665, "step": 1279 }, { "epoch": 0.8104471705579739, "grad_norm": 0.6292845425879516, "learning_rate": 1.7471925329829354e-05, "loss": 0.376, "step": 1280 }, { "epoch": 0.8110803324099723, "grad_norm": 0.5415191986403443, "learning_rate": 1.7467286766292536e-05, "loss": 0.3501, "step": 1281 }, { "epoch": 0.8117134942619707, "grad_norm": 0.5028422601500859, "learning_rate": 1.746264456813863e-05, "loss": 0.3675, "step": 1282 }, { "epoch": 0.8123466561139692, "grad_norm": 0.6023523908626135, "learning_rate": 1.7457998737627183e-05, "loss": 0.3431, "step": 1283 }, { "epoch": 0.8129798179659675, "grad_norm": 0.5715095319734433, "learning_rate": 1.7453349277019488e-05, "loss": 0.3634, "step": 1284 }, { "epoch": 0.813612979817966, "grad_norm": 0.5933331871480145, "learning_rate": 1.7448696188578625e-05, "loss": 0.3529, "step": 1285 }, { "epoch": 0.8142461416699643, "grad_norm": 0.5418568298273543, "learning_rate": 1.744403947456943e-05, "loss": 0.3548, "step": 1286 }, { "epoch": 0.8148793035219628, "grad_norm": 1.4194393012015905, "learning_rate": 1.7439379137258505e-05, "loss": 0.3621, "step": 1287 }, { "epoch": 0.8155124653739613, "grad_norm": 0.6012310705367893, "learning_rate": 1.7434715178914214e-05, "loss": 0.3442, "step": 1288 }, { "epoch": 0.8161456272259596, "grad_norm": 0.544913642332669, "learning_rate": 1.7430047601806693e-05, "loss": 0.3562, "step": 1289 }, { "epoch": 0.8167787890779581, "grad_norm": 0.5537217723232346, "learning_rate": 1.7425376408207822e-05, "loss": 0.3508, "step": 1290 }, { "epoch": 0.8174119509299564, "grad_norm": 0.5965250941850238, "learning_rate": 1.742070160039126e-05, "loss": 0.3631, "step": 1291 }, { "epoch": 0.8180451127819549, "grad_norm": 0.529973822764971, "learning_rate": 1.7416023180632416e-05, "loss": 0.351, "step": 1292 }, { "epoch": 0.8186782746339533, "grad_norm": 0.5810969863314652, "learning_rate": 1.7411341151208455e-05, "loss": 0.3528, "step": 1293 }, { "epoch": 0.8193114364859517, "grad_norm": 0.5355685006308123, "learning_rate": 1.7406655514398302e-05, "loss": 0.3578, "step": 1294 }, { "epoch": 0.8199445983379502, "grad_norm": 0.5103080449938027, "learning_rate": 1.7401966272482636e-05, "loss": 0.3629, "step": 1295 }, { "epoch": 0.8205777601899485, "grad_norm": 0.5756857883235594, "learning_rate": 1.7397273427743896e-05, "loss": 0.3847, "step": 1296 }, { "epoch": 0.821210922041947, "grad_norm": 0.5560091348508187, "learning_rate": 1.7392576982466266e-05, "loss": 0.3347, "step": 1297 }, { "epoch": 0.8218440838939454, "grad_norm": 0.7857518443190126, "learning_rate": 1.7387876938935694e-05, "loss": 0.3472, "step": 1298 }, { "epoch": 0.8224772457459438, "grad_norm": 0.5201526028949909, "learning_rate": 1.7383173299439868e-05, "loss": 0.3626, "step": 1299 }, { "epoch": 0.8231104075979422, "grad_norm": 0.5080059181944958, "learning_rate": 1.737846606626823e-05, "loss": 0.3686, "step": 1300 }, { "epoch": 0.8237435694499406, "grad_norm": 0.5279128376939227, "learning_rate": 1.7373755241711978e-05, "loss": 0.3642, "step": 1301 }, { "epoch": 0.824376731301939, "grad_norm": 0.5358438457020414, "learning_rate": 1.7369040828064046e-05, "loss": 0.3641, "step": 1302 }, { "epoch": 0.8250098931539375, "grad_norm": 0.6532459525188425, "learning_rate": 1.7364322827619128e-05, "loss": 0.3569, "step": 1303 }, { "epoch": 0.8256430550059359, "grad_norm": 0.5080530250020676, "learning_rate": 1.7359601242673654e-05, "loss": 0.3658, "step": 1304 }, { "epoch": 0.8262762168579343, "grad_norm": 0.5333567193667007, "learning_rate": 1.7354876075525798e-05, "loss": 0.3661, "step": 1305 }, { "epoch": 0.8269093787099328, "grad_norm": 0.5292288615785768, "learning_rate": 1.7350147328475488e-05, "loss": 0.3576, "step": 1306 }, { "epoch": 0.8275425405619311, "grad_norm": 0.5573493338040751, "learning_rate": 1.734541500382438e-05, "loss": 0.369, "step": 1307 }, { "epoch": 0.8281757024139296, "grad_norm": 0.5276197704848026, "learning_rate": 1.734067910387589e-05, "loss": 0.3516, "step": 1308 }, { "epoch": 0.8288088642659279, "grad_norm": 0.4999778788904999, "learning_rate": 1.7335939630935155e-05, "loss": 0.3683, "step": 1309 }, { "epoch": 0.8294420261179264, "grad_norm": 0.5482357892009277, "learning_rate": 1.7331196587309064e-05, "loss": 0.3487, "step": 1310 }, { "epoch": 0.8300751879699249, "grad_norm": 0.49818930421122143, "learning_rate": 1.7326449975306236e-05, "loss": 0.3511, "step": 1311 }, { "epoch": 0.8307083498219232, "grad_norm": 0.5671289969263509, "learning_rate": 1.7321699797237033e-05, "loss": 0.3507, "step": 1312 }, { "epoch": 0.8313415116739217, "grad_norm": 0.5516598780949005, "learning_rate": 1.7316946055413552e-05, "loss": 0.3598, "step": 1313 }, { "epoch": 0.83197467352592, "grad_norm": 0.537424021457692, "learning_rate": 1.7312188752149618e-05, "loss": 0.3606, "step": 1314 }, { "epoch": 0.8326078353779185, "grad_norm": 0.5405033634695997, "learning_rate": 1.73074278897608e-05, "loss": 0.3618, "step": 1315 }, { "epoch": 0.8332409972299168, "grad_norm": 0.6074487719165839, "learning_rate": 1.730266347056439e-05, "loss": 0.3681, "step": 1316 }, { "epoch": 0.8338741590819153, "grad_norm": 0.5189474708631582, "learning_rate": 1.7297895496879412e-05, "loss": 0.3641, "step": 1317 }, { "epoch": 0.8345073209339138, "grad_norm": 0.5873563283133082, "learning_rate": 1.729312397102663e-05, "loss": 0.3488, "step": 1318 }, { "epoch": 0.8351404827859121, "grad_norm": 0.5387816437305283, "learning_rate": 1.728834889532853e-05, "loss": 0.3585, "step": 1319 }, { "epoch": 0.8357736446379106, "grad_norm": 0.5126072222811893, "learning_rate": 1.7283570272109317e-05, "loss": 0.3478, "step": 1320 }, { "epoch": 0.836406806489909, "grad_norm": 0.5132047513488159, "learning_rate": 1.7278788103694944e-05, "loss": 0.3524, "step": 1321 }, { "epoch": 0.8370399683419074, "grad_norm": 0.5355402392635255, "learning_rate": 1.7274002392413068e-05, "loss": 0.3487, "step": 1322 }, { "epoch": 0.8376731301939058, "grad_norm": 0.5769167106496998, "learning_rate": 1.726921314059308e-05, "loss": 0.3642, "step": 1323 }, { "epoch": 0.8383062920459042, "grad_norm": 0.5711832730430235, "learning_rate": 1.7264420350566098e-05, "loss": 0.3477, "step": 1324 }, { "epoch": 0.8389394538979027, "grad_norm": 0.5225689976293666, "learning_rate": 1.7259624024664955e-05, "loss": 0.3597, "step": 1325 }, { "epoch": 0.839572615749901, "grad_norm": 0.5361197107555687, "learning_rate": 1.725482416522421e-05, "loss": 0.3646, "step": 1326 }, { "epoch": 0.8402057776018995, "grad_norm": 0.5387096184073223, "learning_rate": 1.725002077458014e-05, "loss": 0.3756, "step": 1327 }, { "epoch": 0.8408389394538979, "grad_norm": 0.5338626052001899, "learning_rate": 1.7245213855070746e-05, "loss": 0.3578, "step": 1328 }, { "epoch": 0.8414721013058963, "grad_norm": 0.563332842672782, "learning_rate": 1.724040340903573e-05, "loss": 0.3546, "step": 1329 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5443763063084469, "learning_rate": 1.7235589438816533e-05, "loss": 0.339, "step": 1330 }, { "epoch": 0.8427384250098932, "grad_norm": 0.5583225597211818, "learning_rate": 1.723077194675629e-05, "loss": 0.357, "step": 1331 }, { "epoch": 0.8433715868618916, "grad_norm": 0.5497051721587372, "learning_rate": 1.7225950935199872e-05, "loss": 0.3476, "step": 1332 }, { "epoch": 0.84400474871389, "grad_norm": 0.6142993580943642, "learning_rate": 1.722112640649384e-05, "loss": 0.3798, "step": 1333 }, { "epoch": 0.8446379105658884, "grad_norm": 0.5169243124580558, "learning_rate": 1.7216298362986487e-05, "loss": 0.3557, "step": 1334 }, { "epoch": 0.8452710724178868, "grad_norm": 0.5460898358304719, "learning_rate": 1.7211466807027805e-05, "loss": 0.3649, "step": 1335 }, { "epoch": 0.8459042342698853, "grad_norm": 0.5333722179250027, "learning_rate": 1.7206631740969505e-05, "loss": 0.3758, "step": 1336 }, { "epoch": 0.8465373961218836, "grad_norm": 0.5511560220803958, "learning_rate": 1.7201793167164987e-05, "loss": 0.387, "step": 1337 }, { "epoch": 0.8471705579738821, "grad_norm": 0.5441019568717654, "learning_rate": 1.7196951087969387e-05, "loss": 0.3531, "step": 1338 }, { "epoch": 0.8478037198258805, "grad_norm": 0.5935372003873354, "learning_rate": 1.719210550573952e-05, "loss": 0.3572, "step": 1339 }, { "epoch": 0.8484368816778789, "grad_norm": 0.5858535352197185, "learning_rate": 1.7187256422833928e-05, "loss": 0.3671, "step": 1340 }, { "epoch": 0.8490700435298774, "grad_norm": 0.5837793526635819, "learning_rate": 1.718240384161284e-05, "loss": 0.368, "step": 1341 }, { "epoch": 0.8497032053818757, "grad_norm": 0.6240688610067643, "learning_rate": 1.7177547764438197e-05, "loss": 0.3543, "step": 1342 }, { "epoch": 0.8503363672338742, "grad_norm": 0.5900570535663626, "learning_rate": 1.7172688193673643e-05, "loss": 0.363, "step": 1343 }, { "epoch": 0.8509695290858725, "grad_norm": 0.5744259135729016, "learning_rate": 1.7167825131684516e-05, "loss": 0.3606, "step": 1344 }, { "epoch": 0.851602690937871, "grad_norm": 0.6165937415540265, "learning_rate": 1.716295858083785e-05, "loss": 0.3625, "step": 1345 }, { "epoch": 0.8522358527898695, "grad_norm": 0.569433638805146, "learning_rate": 1.7158088543502395e-05, "loss": 0.3609, "step": 1346 }, { "epoch": 0.8528690146418678, "grad_norm": 0.5524559283994417, "learning_rate": 1.715321502204858e-05, "loss": 0.371, "step": 1347 }, { "epoch": 0.8535021764938663, "grad_norm": 0.5483661615352118, "learning_rate": 1.7148338018848533e-05, "loss": 0.3472, "step": 1348 }, { "epoch": 0.8541353383458646, "grad_norm": 0.5622849395544336, "learning_rate": 1.714345753627609e-05, "loss": 0.3693, "step": 1349 }, { "epoch": 0.8547685001978631, "grad_norm": 0.5571382577222526, "learning_rate": 1.713857357670676e-05, "loss": 0.3473, "step": 1350 }, { "epoch": 0.8554016620498615, "grad_norm": 0.7734493825542383, "learning_rate": 1.7133686142517762e-05, "loss": 0.3738, "step": 1351 }, { "epoch": 0.8560348239018599, "grad_norm": 0.5881149469285148, "learning_rate": 1.7128795236087995e-05, "loss": 0.3669, "step": 1352 }, { "epoch": 0.8566679857538584, "grad_norm": 0.5442559719532788, "learning_rate": 1.7123900859798056e-05, "loss": 0.3676, "step": 1353 }, { "epoch": 0.8573011476058567, "grad_norm": 0.5325224957198177, "learning_rate": 1.711900301603022e-05, "loss": 0.3588, "step": 1354 }, { "epoch": 0.8579343094578552, "grad_norm": 0.8190759587767968, "learning_rate": 1.7114101707168464e-05, "loss": 0.3567, "step": 1355 }, { "epoch": 0.8585674713098536, "grad_norm": 0.7609116549011383, "learning_rate": 1.7109196935598446e-05, "loss": 0.3381, "step": 1356 }, { "epoch": 0.859200633161852, "grad_norm": 0.6911400580591313, "learning_rate": 1.71042887037075e-05, "loss": 0.362, "step": 1357 }, { "epoch": 0.8598337950138504, "grad_norm": 0.5757976047719633, "learning_rate": 1.7099377013884657e-05, "loss": 0.3648, "step": 1358 }, { "epoch": 0.8604669568658488, "grad_norm": 0.5372947377780105, "learning_rate": 1.7094461868520625e-05, "loss": 0.3621, "step": 1359 }, { "epoch": 0.8611001187178472, "grad_norm": 0.6348899003900317, "learning_rate": 1.7089543270007795e-05, "loss": 0.3635, "step": 1360 }, { "epoch": 0.8617332805698457, "grad_norm": 0.5281600282042763, "learning_rate": 1.7084621220740242e-05, "loss": 0.3516, "step": 1361 }, { "epoch": 0.8623664424218441, "grad_norm": 0.8680536151511893, "learning_rate": 1.7079695723113716e-05, "loss": 0.3541, "step": 1362 }, { "epoch": 0.8629996042738425, "grad_norm": 0.7334925306818796, "learning_rate": 1.7074766779525643e-05, "loss": 0.3514, "step": 1363 }, { "epoch": 0.863632766125841, "grad_norm": 0.5363751640054986, "learning_rate": 1.7069834392375138e-05, "loss": 0.3676, "step": 1364 }, { "epoch": 0.8642659279778393, "grad_norm": 0.6027623560753451, "learning_rate": 1.7064898564062975e-05, "loss": 0.3574, "step": 1365 }, { "epoch": 0.8648990898298378, "grad_norm": 0.5605769771042229, "learning_rate": 1.7059959296991622e-05, "loss": 0.3556, "step": 1366 }, { "epoch": 0.8655322516818361, "grad_norm": 0.6088630281078109, "learning_rate": 1.7055016593565204e-05, "loss": 0.3558, "step": 1367 }, { "epoch": 0.8661654135338346, "grad_norm": 0.5798589110886473, "learning_rate": 1.705007045618953e-05, "loss": 0.3641, "step": 1368 }, { "epoch": 0.866798575385833, "grad_norm": 0.6212725742750159, "learning_rate": 1.7045120887272076e-05, "loss": 0.3719, "step": 1369 }, { "epoch": 0.8674317372378314, "grad_norm": 0.6477022874651548, "learning_rate": 1.7040167889221984e-05, "loss": 0.3623, "step": 1370 }, { "epoch": 0.8680648990898299, "grad_norm": 0.5352975278324448, "learning_rate": 1.7035211464450075e-05, "loss": 0.3607, "step": 1371 }, { "epoch": 0.8686980609418282, "grad_norm": 0.5367097542487239, "learning_rate": 1.703025161536883e-05, "loss": 0.3708, "step": 1372 }, { "epoch": 0.8693312227938267, "grad_norm": 0.6067864229055177, "learning_rate": 1.70252883443924e-05, "loss": 0.3644, "step": 1373 }, { "epoch": 0.869964384645825, "grad_norm": 0.5824211253219527, "learning_rate": 1.7020321653936602e-05, "loss": 0.3656, "step": 1374 }, { "epoch": 0.8705975464978235, "grad_norm": 0.5724718113628382, "learning_rate": 1.7015351546418914e-05, "loss": 0.3703, "step": 1375 }, { "epoch": 0.871230708349822, "grad_norm": 0.6403221277769461, "learning_rate": 1.701037802425848e-05, "loss": 0.3491, "step": 1376 }, { "epoch": 0.8718638702018203, "grad_norm": 0.5658776881229636, "learning_rate": 1.7005401089876102e-05, "loss": 0.3669, "step": 1377 }, { "epoch": 0.8724970320538188, "grad_norm": 0.6364255702044269, "learning_rate": 1.7000420745694256e-05, "loss": 0.3444, "step": 1378 }, { "epoch": 0.8731301939058171, "grad_norm": 0.6019110091570978, "learning_rate": 1.699543699413706e-05, "loss": 0.3569, "step": 1379 }, { "epoch": 0.8737633557578156, "grad_norm": 0.5535226251817275, "learning_rate": 1.6990449837630295e-05, "loss": 0.3407, "step": 1380 }, { "epoch": 0.874396517609814, "grad_norm": 0.60148688038489, "learning_rate": 1.6985459278601415e-05, "loss": 0.3655, "step": 1381 }, { "epoch": 0.8750296794618124, "grad_norm": 1.1868235072107587, "learning_rate": 1.6980465319479508e-05, "loss": 0.3526, "step": 1382 }, { "epoch": 0.8756628413138109, "grad_norm": 0.7171225735755002, "learning_rate": 1.6975467962695334e-05, "loss": 0.348, "step": 1383 }, { "epoch": 0.8762960031658092, "grad_norm": 0.5818576882229176, "learning_rate": 1.697046721068129e-05, "loss": 0.3513, "step": 1384 }, { "epoch": 0.8769291650178077, "grad_norm": 0.546708830456928, "learning_rate": 1.696546306587145e-05, "loss": 0.3562, "step": 1385 }, { "epoch": 0.8775623268698061, "grad_norm": 0.6007425447212293, "learning_rate": 1.6960455530701505e-05, "loss": 0.3457, "step": 1386 }, { "epoch": 0.8781954887218045, "grad_norm": 0.6259900042804004, "learning_rate": 1.6955444607608833e-05, "loss": 0.3571, "step": 1387 }, { "epoch": 0.8788286505738029, "grad_norm": 0.6356355366926645, "learning_rate": 1.6950430299032437e-05, "loss": 0.3605, "step": 1388 }, { "epoch": 0.8794618124258013, "grad_norm": 0.5771510564850232, "learning_rate": 1.6945412607412976e-05, "loss": 0.3512, "step": 1389 }, { "epoch": 0.8800949742777998, "grad_norm": 0.5464098918325994, "learning_rate": 1.6940391535192753e-05, "loss": 0.3666, "step": 1390 }, { "epoch": 0.8807281361297982, "grad_norm": 0.5975714292589436, "learning_rate": 1.693536708481572e-05, "loss": 0.3611, "step": 1391 }, { "epoch": 0.8813612979817966, "grad_norm": 0.5349581374558515, "learning_rate": 1.6930339258727467e-05, "loss": 0.3557, "step": 1392 }, { "epoch": 0.881994459833795, "grad_norm": 0.598128470109058, "learning_rate": 1.6925308059375235e-05, "loss": 0.3723, "step": 1393 }, { "epoch": 0.8826276216857935, "grad_norm": 0.6404944716999759, "learning_rate": 1.6920273489207905e-05, "loss": 0.3651, "step": 1394 }, { "epoch": 0.8832607835377918, "grad_norm": 0.5912271461769796, "learning_rate": 1.6915235550675997e-05, "loss": 0.3457, "step": 1395 }, { "epoch": 0.8838939453897903, "grad_norm": 0.6117715086712051, "learning_rate": 1.691019424623166e-05, "loss": 0.3557, "step": 1396 }, { "epoch": 0.8845271072417887, "grad_norm": 0.5362255836455389, "learning_rate": 1.6905149578328705e-05, "loss": 0.3394, "step": 1397 }, { "epoch": 0.8851602690937871, "grad_norm": 0.5801498056014345, "learning_rate": 1.690010154942256e-05, "loss": 0.3441, "step": 1398 }, { "epoch": 0.8857934309457856, "grad_norm": 0.5944759051712878, "learning_rate": 1.6895050161970298e-05, "loss": 0.3713, "step": 1399 }, { "epoch": 0.8864265927977839, "grad_norm": 0.5935729357736694, "learning_rate": 1.6889995418430626e-05, "loss": 0.3644, "step": 1400 }, { "epoch": 0.8870597546497824, "grad_norm": 0.534039815322947, "learning_rate": 1.6884937321263875e-05, "loss": 0.3467, "step": 1401 }, { "epoch": 0.8876929165017807, "grad_norm": 0.5009169577513948, "learning_rate": 1.6879875872932026e-05, "loss": 0.3643, "step": 1402 }, { "epoch": 0.8883260783537792, "grad_norm": 0.5985189346568609, "learning_rate": 1.6874811075898675e-05, "loss": 0.3606, "step": 1403 }, { "epoch": 0.8889592402057777, "grad_norm": 0.5833091986357978, "learning_rate": 1.686974293262906e-05, "loss": 0.3536, "step": 1404 }, { "epoch": 0.889592402057776, "grad_norm": 0.5702410676367516, "learning_rate": 1.686467144559004e-05, "loss": 0.359, "step": 1405 }, { "epoch": 0.8902255639097745, "grad_norm": 0.5505874731231469, "learning_rate": 1.68595966172501e-05, "loss": 0.3667, "step": 1406 }, { "epoch": 0.8908587257617728, "grad_norm": 0.6249806403112185, "learning_rate": 1.6854518450079358e-05, "loss": 0.3563, "step": 1407 }, { "epoch": 0.8914918876137713, "grad_norm": 0.533199551146853, "learning_rate": 1.6849436946549554e-05, "loss": 0.3507, "step": 1408 }, { "epoch": 0.8921250494657696, "grad_norm": 0.6003861254259432, "learning_rate": 1.6844352109134056e-05, "loss": 0.3566, "step": 1409 }, { "epoch": 0.8927582113177681, "grad_norm": 0.6049535012152102, "learning_rate": 1.6839263940307845e-05, "loss": 0.3747, "step": 1410 }, { "epoch": 0.8933913731697665, "grad_norm": 0.5427665698784325, "learning_rate": 1.6834172442547534e-05, "loss": 0.3623, "step": 1411 }, { "epoch": 0.8940245350217649, "grad_norm": 0.5669040818380792, "learning_rate": 1.682907761833135e-05, "loss": 0.3699, "step": 1412 }, { "epoch": 0.8946576968737634, "grad_norm": 0.484569211209836, "learning_rate": 1.6823979470139142e-05, "loss": 0.3517, "step": 1413 }, { "epoch": 0.8952908587257618, "grad_norm": 0.5285498890779118, "learning_rate": 1.681887800045237e-05, "loss": 0.3758, "step": 1414 }, { "epoch": 0.8959240205777602, "grad_norm": 0.5387412441910527, "learning_rate": 1.681377321175412e-05, "loss": 0.3665, "step": 1415 }, { "epoch": 0.8965571824297586, "grad_norm": 0.5455558922960363, "learning_rate": 1.6808665106529096e-05, "loss": 0.3564, "step": 1416 }, { "epoch": 0.897190344281757, "grad_norm": 0.5411474223603138, "learning_rate": 1.68035536872636e-05, "loss": 0.3421, "step": 1417 }, { "epoch": 0.8978235061337554, "grad_norm": 0.5273354008925656, "learning_rate": 1.6798438956445564e-05, "loss": 0.3544, "step": 1418 }, { "epoch": 0.8984566679857539, "grad_norm": 0.788640246409335, "learning_rate": 1.6793320916564525e-05, "loss": 0.3575, "step": 1419 }, { "epoch": 0.8990898298377523, "grad_norm": 0.4874504612526772, "learning_rate": 1.678819957011163e-05, "loss": 0.3439, "step": 1420 }, { "epoch": 0.8997229916897507, "grad_norm": 0.5511657830461996, "learning_rate": 1.6783074919579633e-05, "loss": 0.3539, "step": 1421 }, { "epoch": 0.9003561535417491, "grad_norm": 0.5396459057328201, "learning_rate": 1.6777946967462902e-05, "loss": 0.3559, "step": 1422 }, { "epoch": 0.9009893153937475, "grad_norm": 0.5001204118265156, "learning_rate": 1.6772815716257414e-05, "loss": 0.347, "step": 1423 }, { "epoch": 0.901622477245746, "grad_norm": 0.6478675863896038, "learning_rate": 1.676768116846074e-05, "loss": 0.355, "step": 1424 }, { "epoch": 0.9022556390977443, "grad_norm": 0.5467975749036478, "learning_rate": 1.6762543326572064e-05, "loss": 0.3513, "step": 1425 }, { "epoch": 0.9028888009497428, "grad_norm": 0.7398538224178438, "learning_rate": 1.6757402193092174e-05, "loss": 0.3672, "step": 1426 }, { "epoch": 0.9035219628017412, "grad_norm": 0.514742598213152, "learning_rate": 1.6752257770523457e-05, "loss": 0.3471, "step": 1427 }, { "epoch": 0.9041551246537396, "grad_norm": 0.5984087591493219, "learning_rate": 1.674711006136991e-05, "loss": 0.3434, "step": 1428 }, { "epoch": 0.9047882865057381, "grad_norm": 0.5504796273473361, "learning_rate": 1.674195906813711e-05, "loss": 0.3432, "step": 1429 }, { "epoch": 0.9054214483577364, "grad_norm": 0.5750752752712424, "learning_rate": 1.6736804793332254e-05, "loss": 0.3737, "step": 1430 }, { "epoch": 0.9060546102097349, "grad_norm": 0.5302691513334066, "learning_rate": 1.673164723946412e-05, "loss": 0.3594, "step": 1431 }, { "epoch": 0.9066877720617332, "grad_norm": 0.5117325427894713, "learning_rate": 1.6726486409043094e-05, "loss": 0.3709, "step": 1432 }, { "epoch": 0.9073209339137317, "grad_norm": 0.645718554514302, "learning_rate": 1.6721322304581148e-05, "loss": 0.3518, "step": 1433 }, { "epoch": 0.9079540957657302, "grad_norm": 0.562179001110847, "learning_rate": 1.671615492859185e-05, "loss": 0.3645, "step": 1434 }, { "epoch": 0.9085872576177285, "grad_norm": 0.5946881723330236, "learning_rate": 1.671098428359037e-05, "loss": 0.3598, "step": 1435 }, { "epoch": 0.909220419469727, "grad_norm": 0.6380819469875718, "learning_rate": 1.6705810372093448e-05, "loss": 0.3642, "step": 1436 }, { "epoch": 0.9098535813217253, "grad_norm": 0.6610006203262878, "learning_rate": 1.670063319661944e-05, "loss": 0.3572, "step": 1437 }, { "epoch": 0.9104867431737238, "grad_norm": 0.5382915381368958, "learning_rate": 1.669545275968827e-05, "loss": 0.349, "step": 1438 }, { "epoch": 0.9111199050257222, "grad_norm": 0.6103723034271316, "learning_rate": 1.6690269063821456e-05, "loss": 0.3502, "step": 1439 }, { "epoch": 0.9117530668777206, "grad_norm": 0.5383065578581019, "learning_rate": 1.6685082111542104e-05, "loss": 0.3529, "step": 1440 }, { "epoch": 0.9123862287297191, "grad_norm": 0.5538995136992032, "learning_rate": 1.6679891905374908e-05, "loss": 0.348, "step": 1441 }, { "epoch": 0.9130193905817174, "grad_norm": 0.634309414761355, "learning_rate": 1.667469844784614e-05, "loss": 0.3532, "step": 1442 }, { "epoch": 0.9136525524337159, "grad_norm": 0.5275000293577013, "learning_rate": 1.6669501741483653e-05, "loss": 0.3485, "step": 1443 }, { "epoch": 0.9142857142857143, "grad_norm": 0.8674523443472858, "learning_rate": 1.666430178881689e-05, "loss": 0.3617, "step": 1444 }, { "epoch": 0.9149188761377127, "grad_norm": 0.5257492329277836, "learning_rate": 1.6659098592376865e-05, "loss": 0.3649, "step": 1445 }, { "epoch": 0.9155520379897111, "grad_norm": 0.5170547763696566, "learning_rate": 1.6653892154696173e-05, "loss": 0.3657, "step": 1446 }, { "epoch": 0.9161851998417095, "grad_norm": 0.5535372005173214, "learning_rate": 1.6648682478308998e-05, "loss": 0.3564, "step": 1447 }, { "epoch": 0.916818361693708, "grad_norm": 0.5451933742156146, "learning_rate": 1.664346956575108e-05, "loss": 0.3446, "step": 1448 }, { "epoch": 0.9174515235457064, "grad_norm": 0.5397103655308947, "learning_rate": 1.663825341955975e-05, "loss": 0.3743, "step": 1449 }, { "epoch": 0.9180846853977048, "grad_norm": 0.5585707696416667, "learning_rate": 1.663303404227391e-05, "loss": 0.3536, "step": 1450 }, { "epoch": 0.9187178472497032, "grad_norm": 0.5377059989824293, "learning_rate": 1.6627811436434028e-05, "loss": 0.3572, "step": 1451 }, { "epoch": 0.9193510091017016, "grad_norm": 0.564010194339442, "learning_rate": 1.6622585604582154e-05, "loss": 0.3517, "step": 1452 }, { "epoch": 0.9199841709537, "grad_norm": 0.5371025195178196, "learning_rate": 1.6617356549261897e-05, "loss": 0.3592, "step": 1453 }, { "epoch": 0.9206173328056985, "grad_norm": 0.5785213094085655, "learning_rate": 1.661212427301844e-05, "loss": 0.3711, "step": 1454 }, { "epoch": 0.9212504946576969, "grad_norm": 0.5017511868885878, "learning_rate": 1.6606888778398534e-05, "loss": 0.3716, "step": 1455 }, { "epoch": 0.9218836565096953, "grad_norm": 0.5667923953826056, "learning_rate": 1.6601650067950505e-05, "loss": 0.3582, "step": 1456 }, { "epoch": 0.9225168183616937, "grad_norm": 0.5371792211406206, "learning_rate": 1.6596408144224226e-05, "loss": 0.3424, "step": 1457 }, { "epoch": 0.9231499802136921, "grad_norm": 0.5599038832746341, "learning_rate": 1.659116300977115e-05, "loss": 0.3504, "step": 1458 }, { "epoch": 0.9237831420656906, "grad_norm": 0.5435628976769811, "learning_rate": 1.658591466714428e-05, "loss": 0.3706, "step": 1459 }, { "epoch": 0.9244163039176889, "grad_norm": 0.524295626166247, "learning_rate": 1.6580663118898195e-05, "loss": 0.3452, "step": 1460 }, { "epoch": 0.9250494657696874, "grad_norm": 0.549061270623463, "learning_rate": 1.6575408367589022e-05, "loss": 0.3581, "step": 1461 }, { "epoch": 0.9256826276216857, "grad_norm": 0.7610815199076647, "learning_rate": 1.657015041577445e-05, "loss": 0.3696, "step": 1462 }, { "epoch": 0.9263157894736842, "grad_norm": 0.5169254588557712, "learning_rate": 1.656488926601373e-05, "loss": 0.3638, "step": 1463 }, { "epoch": 0.9269489513256827, "grad_norm": 0.5415671019322525, "learning_rate": 1.6559624920867673e-05, "loss": 0.3559, "step": 1464 }, { "epoch": 0.927582113177681, "grad_norm": 0.5510429414411339, "learning_rate": 1.6554357382898626e-05, "loss": 0.3551, "step": 1465 }, { "epoch": 0.9282152750296795, "grad_norm": 0.5756919044801848, "learning_rate": 1.6549086654670514e-05, "loss": 0.3675, "step": 1466 }, { "epoch": 0.9288484368816778, "grad_norm": 0.5847689870963212, "learning_rate": 1.6543812738748796e-05, "loss": 0.361, "step": 1467 }, { "epoch": 0.9294815987336763, "grad_norm": 0.7306185253380817, "learning_rate": 1.6538535637700498e-05, "loss": 0.3594, "step": 1468 }, { "epoch": 0.9301147605856747, "grad_norm": 0.6105546540729567, "learning_rate": 1.653325535409419e-05, "loss": 0.3492, "step": 1469 }, { "epoch": 0.9307479224376731, "grad_norm": 0.5572647604530035, "learning_rate": 1.652797189049998e-05, "loss": 0.3541, "step": 1470 }, { "epoch": 0.9313810842896716, "grad_norm": 0.5920290602476642, "learning_rate": 1.6522685249489542e-05, "loss": 0.3599, "step": 1471 }, { "epoch": 0.9320142461416699, "grad_norm": 0.5101035121570746, "learning_rate": 1.651739543363609e-05, "loss": 0.3548, "step": 1472 }, { "epoch": 0.9326474079936684, "grad_norm": 0.5656099425597685, "learning_rate": 1.6512102445514376e-05, "loss": 0.3567, "step": 1473 }, { "epoch": 0.9332805698456668, "grad_norm": 0.5547139100436607, "learning_rate": 1.6506806287700703e-05, "loss": 0.3548, "step": 1474 }, { "epoch": 0.9339137316976652, "grad_norm": 0.55095628203667, "learning_rate": 1.650150696277292e-05, "loss": 0.3802, "step": 1475 }, { "epoch": 0.9345468935496636, "grad_norm": 0.5488721044045144, "learning_rate": 1.6496204473310407e-05, "loss": 0.353, "step": 1476 }, { "epoch": 0.935180055401662, "grad_norm": 0.5719298708773775, "learning_rate": 1.6490898821894096e-05, "loss": 0.3528, "step": 1477 }, { "epoch": 0.9358132172536605, "grad_norm": 0.5543225675612831, "learning_rate": 1.648559001110645e-05, "loss": 0.3713, "step": 1478 }, { "epoch": 0.9364463791056589, "grad_norm": 0.5783828832754306, "learning_rate": 1.6480278043531478e-05, "loss": 0.3477, "step": 1479 }, { "epoch": 0.9370795409576573, "grad_norm": 0.6199811073578709, "learning_rate": 1.6474962921754708e-05, "loss": 0.3556, "step": 1480 }, { "epoch": 0.9377127028096557, "grad_norm": 0.551619401986749, "learning_rate": 1.646964464836323e-05, "loss": 0.3516, "step": 1481 }, { "epoch": 0.9383458646616541, "grad_norm": 0.614519703901086, "learning_rate": 1.646432322594564e-05, "loss": 0.3527, "step": 1482 }, { "epoch": 0.9389790265136525, "grad_norm": 0.5655049016456524, "learning_rate": 1.6458998657092084e-05, "loss": 0.3641, "step": 1483 }, { "epoch": 0.939612188365651, "grad_norm": 0.5277273167149684, "learning_rate": 1.645367094439424e-05, "loss": 0.3447, "step": 1484 }, { "epoch": 0.9402453502176494, "grad_norm": 0.5545906842998654, "learning_rate": 1.6448340090445306e-05, "loss": 0.3485, "step": 1485 }, { "epoch": 0.9408785120696478, "grad_norm": 0.5308380057941534, "learning_rate": 1.6443006097840018e-05, "loss": 0.35, "step": 1486 }, { "epoch": 0.9415116739216463, "grad_norm": 0.5591289826144603, "learning_rate": 1.6437668969174637e-05, "loss": 0.3617, "step": 1487 }, { "epoch": 0.9421448357736446, "grad_norm": 0.5133238616262041, "learning_rate": 1.6432328707046948e-05, "loss": 0.3356, "step": 1488 }, { "epoch": 0.9427779976256431, "grad_norm": 0.5207708456947564, "learning_rate": 1.6426985314056262e-05, "loss": 0.362, "step": 1489 }, { "epoch": 0.9434111594776414, "grad_norm": 0.5336906177349104, "learning_rate": 1.6421638792803417e-05, "loss": 0.3591, "step": 1490 }, { "epoch": 0.9440443213296399, "grad_norm": 0.513455650947661, "learning_rate": 1.641628914589077e-05, "loss": 0.3698, "step": 1491 }, { "epoch": 0.9446774831816384, "grad_norm": 0.5876748110977121, "learning_rate": 1.64109363759222e-05, "loss": 0.3803, "step": 1492 }, { "epoch": 0.9453106450336367, "grad_norm": 0.564419221615585, "learning_rate": 1.6405580485503113e-05, "loss": 0.3441, "step": 1493 }, { "epoch": 0.9459438068856352, "grad_norm": 0.5807263806781042, "learning_rate": 1.6400221477240423e-05, "loss": 0.3529, "step": 1494 }, { "epoch": 0.9465769687376335, "grad_norm": 0.5577746357331776, "learning_rate": 1.6394859353742564e-05, "loss": 0.3523, "step": 1495 }, { "epoch": 0.947210130589632, "grad_norm": 0.5126790601355213, "learning_rate": 1.6389494117619493e-05, "loss": 0.3551, "step": 1496 }, { "epoch": 0.9478432924416303, "grad_norm": 0.545221643364061, "learning_rate": 1.6384125771482677e-05, "loss": 0.3802, "step": 1497 }, { "epoch": 0.9484764542936288, "grad_norm": 0.534508821098197, "learning_rate": 1.6378754317945096e-05, "loss": 0.3479, "step": 1498 }, { "epoch": 0.9491096161456273, "grad_norm": 0.5605508013505168, "learning_rate": 1.637337975962125e-05, "loss": 0.3549, "step": 1499 }, { "epoch": 0.9497427779976256, "grad_norm": 0.6252146636699506, "learning_rate": 1.6368002099127133e-05, "loss": 0.3467, "step": 1500 }, { "epoch": 0.9503759398496241, "grad_norm": 0.4993344081081089, "learning_rate": 1.6362621339080268e-05, "loss": 0.3554, "step": 1501 }, { "epoch": 0.9510091017016225, "grad_norm": 0.5533826404548183, "learning_rate": 1.6357237482099682e-05, "loss": 0.3435, "step": 1502 }, { "epoch": 0.9516422635536209, "grad_norm": 0.5225973354095481, "learning_rate": 1.63518505308059e-05, "loss": 0.3675, "step": 1503 }, { "epoch": 0.9522754254056193, "grad_norm": 0.5440957058650285, "learning_rate": 1.634646048782096e-05, "loss": 0.3633, "step": 1504 }, { "epoch": 0.9529085872576177, "grad_norm": 0.5401827011089391, "learning_rate": 1.63410673557684e-05, "loss": 0.3517, "step": 1505 }, { "epoch": 0.9535417491096162, "grad_norm": 0.5768148330263898, "learning_rate": 1.6335671137273277e-05, "loss": 0.3718, "step": 1506 }, { "epoch": 0.9541749109616146, "grad_norm": 0.5308390782020305, "learning_rate": 1.6330271834962137e-05, "loss": 0.3625, "step": 1507 }, { "epoch": 0.954808072813613, "grad_norm": 0.4986825656366938, "learning_rate": 1.632486945146302e-05, "loss": 0.3604, "step": 1508 }, { "epoch": 0.9554412346656114, "grad_norm": 0.5782482035023937, "learning_rate": 1.6319463989405485e-05, "loss": 0.365, "step": 1509 }, { "epoch": 0.9560743965176098, "grad_norm": 0.5531797533844198, "learning_rate": 1.631405545142057e-05, "loss": 0.3499, "step": 1510 }, { "epoch": 0.9567075583696082, "grad_norm": 0.573630891706363, "learning_rate": 1.630864384014083e-05, "loss": 0.3627, "step": 1511 }, { "epoch": 0.9573407202216067, "grad_norm": 0.5156537493848483, "learning_rate": 1.6303229158200292e-05, "loss": 0.3557, "step": 1512 }, { "epoch": 0.9579738820736051, "grad_norm": 0.594759837657408, "learning_rate": 1.6297811408234504e-05, "loss": 0.3495, "step": 1513 }, { "epoch": 0.9586070439256035, "grad_norm": 0.5347294933262127, "learning_rate": 1.6292390592880484e-05, "loss": 0.3647, "step": 1514 }, { "epoch": 0.9592402057776019, "grad_norm": 0.6117231248504826, "learning_rate": 1.628696671477676e-05, "loss": 0.3431, "step": 1515 }, { "epoch": 0.9598733676296003, "grad_norm": 1.06184247832104, "learning_rate": 1.628153977656334e-05, "loss": 0.3481, "step": 1516 }, { "epoch": 0.9605065294815988, "grad_norm": 0.5167541658804945, "learning_rate": 1.6276109780881727e-05, "loss": 0.3662, "step": 1517 }, { "epoch": 0.9611396913335971, "grad_norm": 0.531997359006542, "learning_rate": 1.6270676730374905e-05, "loss": 0.3591, "step": 1518 }, { "epoch": 0.9617728531855956, "grad_norm": 0.6730578273719964, "learning_rate": 1.626524062768735e-05, "loss": 0.3584, "step": 1519 }, { "epoch": 0.9624060150375939, "grad_norm": 0.5623038997969004, "learning_rate": 1.625980147546503e-05, "loss": 0.3599, "step": 1520 }, { "epoch": 0.9630391768895924, "grad_norm": 0.5630088981541459, "learning_rate": 1.6254359276355388e-05, "loss": 0.3599, "step": 1521 }, { "epoch": 0.9636723387415909, "grad_norm": 0.5801208373751424, "learning_rate": 1.6248914033007347e-05, "loss": 0.3502, "step": 1522 }, { "epoch": 0.9643055005935892, "grad_norm": 0.5476551756619953, "learning_rate": 1.6243465748071322e-05, "loss": 0.351, "step": 1523 }, { "epoch": 0.9649386624455877, "grad_norm": 0.5451793977494847, "learning_rate": 1.6238014424199204e-05, "loss": 0.3485, "step": 1524 }, { "epoch": 0.965571824297586, "grad_norm": 0.5854712999969912, "learning_rate": 1.623256006404436e-05, "loss": 0.362, "step": 1525 }, { "epoch": 0.9662049861495845, "grad_norm": 0.549392969536954, "learning_rate": 1.622710267026164e-05, "loss": 0.3414, "step": 1526 }, { "epoch": 0.9668381480015829, "grad_norm": 0.5477283846165786, "learning_rate": 1.622164224550737e-05, "loss": 0.3403, "step": 1527 }, { "epoch": 0.9674713098535813, "grad_norm": 0.5054992834785093, "learning_rate": 1.621617879243935e-05, "loss": 0.3635, "step": 1528 }, { "epoch": 0.9681044717055798, "grad_norm": 0.5284953507346505, "learning_rate": 1.621071231371685e-05, "loss": 0.3543, "step": 1529 }, { "epoch": 0.9687376335575781, "grad_norm": 0.5549972732406752, "learning_rate": 1.620524281200062e-05, "loss": 0.336, "step": 1530 }, { "epoch": 0.9693707954095766, "grad_norm": 0.5454524080351257, "learning_rate": 1.6199770289952878e-05, "loss": 0.3702, "step": 1531 }, { "epoch": 0.970003957261575, "grad_norm": 0.6196784032732169, "learning_rate": 1.6194294750237304e-05, "loss": 0.3533, "step": 1532 }, { "epoch": 0.9706371191135734, "grad_norm": 0.5577232185275285, "learning_rate": 1.6188816195519067e-05, "loss": 0.3621, "step": 1533 }, { "epoch": 0.9712702809655718, "grad_norm": 0.5442218506356646, "learning_rate": 1.6183334628464783e-05, "loss": 0.3669, "step": 1534 }, { "epoch": 0.9719034428175702, "grad_norm": 0.5247971507009683, "learning_rate": 1.6177850051742545e-05, "loss": 0.346, "step": 1535 }, { "epoch": 0.9725366046695687, "grad_norm": 0.5221901856283838, "learning_rate": 1.617236246802191e-05, "loss": 0.3629, "step": 1536 }, { "epoch": 0.9731697665215671, "grad_norm": 0.47624719944635263, "learning_rate": 1.6166871879973896e-05, "loss": 0.3711, "step": 1537 }, { "epoch": 0.9738029283735655, "grad_norm": 0.5434365255894814, "learning_rate": 1.6161378290270982e-05, "loss": 0.348, "step": 1538 }, { "epoch": 0.9744360902255639, "grad_norm": 0.882555681842936, "learning_rate": 1.6155881701587114e-05, "loss": 0.3619, "step": 1539 }, { "epoch": 0.9750692520775623, "grad_norm": 0.5175885257617971, "learning_rate": 1.6150382116597695e-05, "loss": 0.352, "step": 1540 }, { "epoch": 0.9757024139295607, "grad_norm": 0.5797395980544892, "learning_rate": 1.614487953797958e-05, "loss": 0.3551, "step": 1541 }, { "epoch": 0.9763355757815592, "grad_norm": 0.5957166381769974, "learning_rate": 1.613937396841109e-05, "loss": 0.3459, "step": 1542 }, { "epoch": 0.9769687376335576, "grad_norm": 0.5057203832079166, "learning_rate": 1.6133865410572e-05, "loss": 0.3455, "step": 1543 }, { "epoch": 0.977601899485556, "grad_norm": 0.5178655353461926, "learning_rate": 1.6128353867143537e-05, "loss": 0.3531, "step": 1544 }, { "epoch": 0.9782350613375544, "grad_norm": 0.5567098031591869, "learning_rate": 1.612283934080838e-05, "loss": 0.3483, "step": 1545 }, { "epoch": 0.9788682231895528, "grad_norm": 0.5541207554733665, "learning_rate": 1.6117321834250664e-05, "loss": 0.3326, "step": 1546 }, { "epoch": 0.9795013850415513, "grad_norm": 0.562495986125744, "learning_rate": 1.6111801350155973e-05, "loss": 0.3619, "step": 1547 }, { "epoch": 0.9801345468935496, "grad_norm": 0.5440812638535829, "learning_rate": 1.610627789121134e-05, "loss": 0.3604, "step": 1548 }, { "epoch": 0.9807677087455481, "grad_norm": 0.5844882680676428, "learning_rate": 1.6100751460105244e-05, "loss": 0.3285, "step": 1549 }, { "epoch": 0.9814008705975465, "grad_norm": 0.5407015904457508, "learning_rate": 1.6095222059527617e-05, "loss": 0.3604, "step": 1550 }, { "epoch": 0.9820340324495449, "grad_norm": 0.5592128997701301, "learning_rate": 1.6089689692169828e-05, "loss": 0.364, "step": 1551 }, { "epoch": 0.9826671943015434, "grad_norm": 0.5384931970338068, "learning_rate": 1.6084154360724693e-05, "loss": 0.3646, "step": 1552 }, { "epoch": 0.9833003561535417, "grad_norm": 0.531994214730322, "learning_rate": 1.6078616067886478e-05, "loss": 0.3422, "step": 1553 }, { "epoch": 0.9839335180055402, "grad_norm": 0.5459992124485059, "learning_rate": 1.6073074816350875e-05, "loss": 0.3652, "step": 1554 }, { "epoch": 0.9845666798575385, "grad_norm": 0.5675608764109349, "learning_rate": 1.6067530608815035e-05, "loss": 0.3598, "step": 1555 }, { "epoch": 0.985199841709537, "grad_norm": 0.5256931132333134, "learning_rate": 1.6061983447977528e-05, "loss": 0.3518, "step": 1556 }, { "epoch": 0.9858330035615355, "grad_norm": 0.5918801047013993, "learning_rate": 1.605643333653838e-05, "loss": 0.3451, "step": 1557 }, { "epoch": 0.9864661654135338, "grad_norm": 0.5723935694453879, "learning_rate": 1.605088027719904e-05, "loss": 0.3496, "step": 1558 }, { "epoch": 0.9870993272655323, "grad_norm": 0.5375325262817346, "learning_rate": 1.6045324272662402e-05, "loss": 0.3685, "step": 1559 }, { "epoch": 0.9877324891175306, "grad_norm": 0.5338369255079475, "learning_rate": 1.6039765325632783e-05, "loss": 0.3558, "step": 1560 }, { "epoch": 0.9883656509695291, "grad_norm": 0.5108114142364815, "learning_rate": 1.603420343881594e-05, "loss": 0.3572, "step": 1561 }, { "epoch": 0.9889988128215275, "grad_norm": 0.6123978610330247, "learning_rate": 1.6028638614919057e-05, "loss": 0.3577, "step": 1562 }, { "epoch": 0.9896319746735259, "grad_norm": 0.5710922267516406, "learning_rate": 1.6023070856650746e-05, "loss": 0.3495, "step": 1563 }, { "epoch": 0.9902651365255244, "grad_norm": 0.5475591742523868, "learning_rate": 1.6017500166721055e-05, "loss": 0.3413, "step": 1564 }, { "epoch": 0.9908982983775227, "grad_norm": 0.5646919340217523, "learning_rate": 1.601192654784145e-05, "loss": 0.3517, "step": 1565 }, { "epoch": 0.9915314602295212, "grad_norm": 0.5362572042554152, "learning_rate": 1.6006350002724833e-05, "loss": 0.3412, "step": 1566 }, { "epoch": 0.9921646220815196, "grad_norm": 0.6534805253869033, "learning_rate": 1.6000770534085518e-05, "loss": 0.3612, "step": 1567 }, { "epoch": 0.992797783933518, "grad_norm": 0.5345490603632949, "learning_rate": 1.599518814463925e-05, "loss": 0.3572, "step": 1568 }, { "epoch": 0.9934309457855164, "grad_norm": 0.5069879512272716, "learning_rate": 1.5989602837103196e-05, "loss": 0.3554, "step": 1569 }, { "epoch": 0.9940641076375148, "grad_norm": 0.5882102638673978, "learning_rate": 1.5984014614195936e-05, "loss": 0.3497, "step": 1570 }, { "epoch": 0.9946972694895132, "grad_norm": 0.5215481239644884, "learning_rate": 1.597842347863748e-05, "loss": 0.3629, "step": 1571 }, { "epoch": 0.9953304313415117, "grad_norm": 0.511068936988775, "learning_rate": 1.5972829433149244e-05, "loss": 0.3691, "step": 1572 }, { "epoch": 0.9959635931935101, "grad_norm": 0.737000739780695, "learning_rate": 1.5967232480454075e-05, "loss": 0.3505, "step": 1573 }, { "epoch": 0.9965967550455085, "grad_norm": 0.5237051048569414, "learning_rate": 1.5961632623276208e-05, "loss": 0.3565, "step": 1574 }, { "epoch": 0.997229916897507, "grad_norm": 0.5594440499105641, "learning_rate": 1.595602986434133e-05, "loss": 0.3566, "step": 1575 }, { "epoch": 0.9978630787495053, "grad_norm": 0.5684216546898295, "learning_rate": 1.5950424206376513e-05, "loss": 0.3721, "step": 1576 }, { "epoch": 0.9984962406015038, "grad_norm": 0.531601107452361, "learning_rate": 1.5944815652110244e-05, "loss": 0.3594, "step": 1577 }, { "epoch": 0.9991294024535021, "grad_norm": 0.531258635609034, "learning_rate": 1.5939204204272426e-05, "loss": 0.3485, "step": 1578 }, { "epoch": 0.9997625643055006, "grad_norm": 0.6015149043111591, "learning_rate": 1.5933589865594373e-05, "loss": 0.3321, "step": 1579 }, { "epoch": 1.0, "grad_norm": 0.7186668599247568, "learning_rate": 1.5927972638808787e-05, "loss": 0.3375, "step": 1580 }, { "epoch": 1.0006331618519984, "grad_norm": 0.6224829683223114, "learning_rate": 1.5922352526649803e-05, "loss": 0.3018, "step": 1581 }, { "epoch": 1.001266323703997, "grad_norm": 0.54346433327986, "learning_rate": 1.591672953185294e-05, "loss": 0.3231, "step": 1582 }, { "epoch": 1.0018994855559953, "grad_norm": 0.5610796371491107, "learning_rate": 1.5911103657155122e-05, "loss": 0.3069, "step": 1583 }, { "epoch": 1.0025326474079936, "grad_norm": 0.6093499248737604, "learning_rate": 1.5905474905294697e-05, "loss": 0.3094, "step": 1584 }, { "epoch": 1.003165809259992, "grad_norm": 0.9194436674314784, "learning_rate": 1.5899843279011376e-05, "loss": 0.3307, "step": 1585 }, { "epoch": 1.0037989711119906, "grad_norm": 0.5415785407026014, "learning_rate": 1.5894208781046302e-05, "loss": 0.3101, "step": 1586 }, { "epoch": 1.004432132963989, "grad_norm": 0.5486210321718129, "learning_rate": 1.5888571414141997e-05, "loss": 0.3157, "step": 1587 }, { "epoch": 1.0050652948159873, "grad_norm": 0.5566243036807956, "learning_rate": 1.5882931181042388e-05, "loss": 0.3043, "step": 1588 }, { "epoch": 1.0056984566679859, "grad_norm": 0.5510102227706268, "learning_rate": 1.5877288084492795e-05, "loss": 0.3209, "step": 1589 }, { "epoch": 1.0063316185199842, "grad_norm": 1.0070705460274163, "learning_rate": 1.5871642127239928e-05, "loss": 0.3236, "step": 1590 }, { "epoch": 1.0069647803719826, "grad_norm": 0.6047901520210086, "learning_rate": 1.5865993312031896e-05, "loss": 0.3167, "step": 1591 }, { "epoch": 1.007597942223981, "grad_norm": 0.5556578336856317, "learning_rate": 1.5860341641618194e-05, "loss": 0.3244, "step": 1592 }, { "epoch": 1.0082311040759795, "grad_norm": 0.6460862252584328, "learning_rate": 1.5854687118749706e-05, "loss": 0.3099, "step": 1593 }, { "epoch": 1.0088642659279778, "grad_norm": 0.5259714119152519, "learning_rate": 1.5849029746178716e-05, "loss": 0.3176, "step": 1594 }, { "epoch": 1.0094974277799762, "grad_norm": 0.6096015669076793, "learning_rate": 1.5843369526658876e-05, "loss": 0.2993, "step": 1595 }, { "epoch": 1.0101305896319748, "grad_norm": 0.6277480963629019, "learning_rate": 1.5837706462945236e-05, "loss": 0.3142, "step": 1596 }, { "epoch": 1.0107637514839731, "grad_norm": 0.5404101781905225, "learning_rate": 1.583204055779423e-05, "loss": 0.3192, "step": 1597 }, { "epoch": 1.0113969133359715, "grad_norm": 0.6254473358943915, "learning_rate": 1.5826371813963675e-05, "loss": 0.3141, "step": 1598 }, { "epoch": 1.0120300751879698, "grad_norm": 0.6030425028397175, "learning_rate": 1.582070023421276e-05, "loss": 0.294, "step": 1599 }, { "epoch": 1.0126632370399684, "grad_norm": 0.6019813023233817, "learning_rate": 1.581502582130207e-05, "loss": 0.3018, "step": 1600 }, { "epoch": 1.0132963988919668, "grad_norm": 0.5557455941513597, "learning_rate": 1.5809348577993558e-05, "loss": 0.3057, "step": 1601 }, { "epoch": 1.0139295607439651, "grad_norm": 0.5816053440967955, "learning_rate": 1.5803668507050556e-05, "loss": 0.3067, "step": 1602 }, { "epoch": 1.0145627225959637, "grad_norm": 0.555287630106854, "learning_rate": 1.5797985611237775e-05, "loss": 0.3276, "step": 1603 }, { "epoch": 1.015195884447962, "grad_norm": 0.5591836024765549, "learning_rate": 1.5792299893321304e-05, "loss": 0.3187, "step": 1604 }, { "epoch": 1.0158290462999604, "grad_norm": 0.6053715079095348, "learning_rate": 1.5786611356068594e-05, "loss": 0.3172, "step": 1605 }, { "epoch": 1.0164622081519588, "grad_norm": 0.6213101262771158, "learning_rate": 1.5780920002248484e-05, "loss": 0.3022, "step": 1606 }, { "epoch": 1.0170953700039573, "grad_norm": 0.5785756736403161, "learning_rate": 1.577522583463117e-05, "loss": 0.3153, "step": 1607 }, { "epoch": 1.0177285318559557, "grad_norm": 0.5589335814136088, "learning_rate": 1.576952885598823e-05, "loss": 0.3148, "step": 1608 }, { "epoch": 1.018361693707954, "grad_norm": 0.6302938104796401, "learning_rate": 1.57638290690926e-05, "loss": 0.3119, "step": 1609 }, { "epoch": 1.0189948555599526, "grad_norm": 0.612425256388834, "learning_rate": 1.575812647671858e-05, "loss": 0.2863, "step": 1610 }, { "epoch": 1.019628017411951, "grad_norm": 0.559177481614129, "learning_rate": 1.5752421081641853e-05, "loss": 0.3181, "step": 1611 }, { "epoch": 1.0202611792639493, "grad_norm": 0.6907218659065544, "learning_rate": 1.5746712886639448e-05, "loss": 0.3208, "step": 1612 }, { "epoch": 1.0208943411159477, "grad_norm": 0.5680744410085439, "learning_rate": 1.5741001894489767e-05, "loss": 0.3177, "step": 1613 }, { "epoch": 1.0215275029679463, "grad_norm": 0.5536694506192956, "learning_rate": 1.573528810797257e-05, "loss": 0.3055, "step": 1614 }, { "epoch": 1.0221606648199446, "grad_norm": 0.5537310985192532, "learning_rate": 1.572957152986898e-05, "loss": 0.2986, "step": 1615 }, { "epoch": 1.022793826671943, "grad_norm": 0.5572456849345103, "learning_rate": 1.572385216296147e-05, "loss": 0.3094, "step": 1616 }, { "epoch": 1.0234269885239415, "grad_norm": 0.5651952570211274, "learning_rate": 1.5718130010033888e-05, "loss": 0.2902, "step": 1617 }, { "epoch": 1.02406015037594, "grad_norm": 0.5746485863119033, "learning_rate": 1.5712405073871416e-05, "loss": 0.3312, "step": 1618 }, { "epoch": 1.0246933122279382, "grad_norm": 0.5097897363168385, "learning_rate": 1.5706677357260608e-05, "loss": 0.318, "step": 1619 }, { "epoch": 1.0253264740799366, "grad_norm": 0.5845426757448843, "learning_rate": 1.570094686298936e-05, "loss": 0.3269, "step": 1620 }, { "epoch": 1.0259596359319352, "grad_norm": 0.5579945037166182, "learning_rate": 1.5695213593846933e-05, "loss": 0.3205, "step": 1621 }, { "epoch": 1.0265927977839335, "grad_norm": 0.512336192633164, "learning_rate": 1.5689477552623926e-05, "loss": 0.3133, "step": 1622 }, { "epoch": 1.0272259596359319, "grad_norm": 0.5658692885807624, "learning_rate": 1.5683738742112285e-05, "loss": 0.3093, "step": 1623 }, { "epoch": 1.0278591214879305, "grad_norm": 0.5153658636618506, "learning_rate": 1.5677997165105322e-05, "loss": 0.3213, "step": 1624 }, { "epoch": 1.0284922833399288, "grad_norm": 0.5291945166390537, "learning_rate": 1.5672252824397683e-05, "loss": 0.3196, "step": 1625 }, { "epoch": 1.0291254451919272, "grad_norm": 0.5394740338397687, "learning_rate": 1.5666505722785354e-05, "loss": 0.3124, "step": 1626 }, { "epoch": 1.0297586070439255, "grad_norm": 0.528684657962499, "learning_rate": 1.5660755863065676e-05, "loss": 0.2992, "step": 1627 }, { "epoch": 1.030391768895924, "grad_norm": 0.5644373494878028, "learning_rate": 1.5655003248037325e-05, "loss": 0.3105, "step": 1628 }, { "epoch": 1.0310249307479225, "grad_norm": 0.5554235240143603, "learning_rate": 1.5649247880500328e-05, "loss": 0.3055, "step": 1629 }, { "epoch": 1.0316580925999208, "grad_norm": 0.4983801062252052, "learning_rate": 1.564348976325604e-05, "loss": 0.2911, "step": 1630 }, { "epoch": 1.0322912544519194, "grad_norm": 0.5501533133479412, "learning_rate": 1.5637728899107155e-05, "loss": 0.3142, "step": 1631 }, { "epoch": 1.0329244163039177, "grad_norm": 0.5575698895791086, "learning_rate": 1.5631965290857717e-05, "loss": 0.3079, "step": 1632 }, { "epoch": 1.033557578155916, "grad_norm": 0.5557318334068071, "learning_rate": 1.5626198941313092e-05, "loss": 0.3041, "step": 1633 }, { "epoch": 1.0341907400079144, "grad_norm": 0.5070194043974294, "learning_rate": 1.5620429853279984e-05, "loss": 0.3279, "step": 1634 }, { "epoch": 1.034823901859913, "grad_norm": 0.6030305905504468, "learning_rate": 1.5614658029566434e-05, "loss": 0.302, "step": 1635 }, { "epoch": 1.0354570637119114, "grad_norm": 0.5830825316899253, "learning_rate": 1.560888347298181e-05, "loss": 0.2878, "step": 1636 }, { "epoch": 1.0360902255639097, "grad_norm": 0.5401903597579097, "learning_rate": 1.5603106186336812e-05, "loss": 0.305, "step": 1637 }, { "epoch": 1.036723387415908, "grad_norm": 0.5244387444339795, "learning_rate": 1.5597326172443472e-05, "loss": 0.2991, "step": 1638 }, { "epoch": 1.0373565492679067, "grad_norm": 0.5779516419962033, "learning_rate": 1.559154343411514e-05, "loss": 0.3159, "step": 1639 }, { "epoch": 1.037989711119905, "grad_norm": 0.5350678637421342, "learning_rate": 1.5585757974166506e-05, "loss": 0.3137, "step": 1640 }, { "epoch": 1.0386228729719034, "grad_norm": 0.583664718802466, "learning_rate": 1.557996979541357e-05, "loss": 0.2939, "step": 1641 }, { "epoch": 1.039256034823902, "grad_norm": 0.5315055731379972, "learning_rate": 1.557417890067366e-05, "loss": 0.319, "step": 1642 }, { "epoch": 1.0398891966759003, "grad_norm": 0.5231913091334326, "learning_rate": 1.556838529276544e-05, "loss": 0.3172, "step": 1643 }, { "epoch": 1.0405223585278986, "grad_norm": 0.5811505706218036, "learning_rate": 1.556258897450887e-05, "loss": 0.3086, "step": 1644 }, { "epoch": 1.041155520379897, "grad_norm": 0.5282180040316893, "learning_rate": 1.5556789948725253e-05, "loss": 0.3243, "step": 1645 }, { "epoch": 1.0417886822318956, "grad_norm": 0.5355738660674311, "learning_rate": 1.5550988218237196e-05, "loss": 0.3252, "step": 1646 }, { "epoch": 1.042421844083894, "grad_norm": 0.6197132176754204, "learning_rate": 1.554518378586862e-05, "loss": 0.3056, "step": 1647 }, { "epoch": 1.0430550059358923, "grad_norm": 0.5782986752709515, "learning_rate": 1.553937665444477e-05, "loss": 0.3085, "step": 1648 }, { "epoch": 1.0436881677878909, "grad_norm": 0.5269127951679125, "learning_rate": 1.55335668267922e-05, "loss": 0.3169, "step": 1649 }, { "epoch": 1.0443213296398892, "grad_norm": 0.5867491720688888, "learning_rate": 1.5527754305738783e-05, "loss": 0.3004, "step": 1650 }, { "epoch": 1.0449544914918876, "grad_norm": 0.5273758356073489, "learning_rate": 1.5521939094113693e-05, "loss": 0.3148, "step": 1651 }, { "epoch": 1.045587653343886, "grad_norm": 0.5731397597427851, "learning_rate": 1.551612119474742e-05, "loss": 0.3135, "step": 1652 }, { "epoch": 1.0462208151958845, "grad_norm": 0.5131791578892756, "learning_rate": 1.5510300610471752e-05, "loss": 0.3209, "step": 1653 }, { "epoch": 1.0468539770478829, "grad_norm": 0.4939013702740331, "learning_rate": 1.5504477344119807e-05, "loss": 0.3084, "step": 1654 }, { "epoch": 1.0474871388998812, "grad_norm": 0.5237071672443402, "learning_rate": 1.5498651398525987e-05, "loss": 0.3238, "step": 1655 }, { "epoch": 1.0481203007518798, "grad_norm": 0.5319610371503214, "learning_rate": 1.5492822776526005e-05, "loss": 0.2948, "step": 1656 }, { "epoch": 1.0487534626038781, "grad_norm": 0.5003722027674912, "learning_rate": 1.5486991480956876e-05, "loss": 0.3201, "step": 1657 }, { "epoch": 1.0493866244558765, "grad_norm": 0.5778749320663582, "learning_rate": 1.5481157514656913e-05, "loss": 0.3075, "step": 1658 }, { "epoch": 1.0500197863078748, "grad_norm": 0.5495148332830458, "learning_rate": 1.547532088046574e-05, "loss": 0.3046, "step": 1659 }, { "epoch": 1.0506529481598734, "grad_norm": 0.6632605927257332, "learning_rate": 1.5469481581224274e-05, "loss": 0.2996, "step": 1660 }, { "epoch": 1.0512861100118718, "grad_norm": 0.5404615862811342, "learning_rate": 1.5463639619774714e-05, "loss": 0.3213, "step": 1661 }, { "epoch": 1.0519192718638701, "grad_norm": 0.5358337080400306, "learning_rate": 1.545779499896058e-05, "loss": 0.3202, "step": 1662 }, { "epoch": 1.0525524337158687, "grad_norm": 0.6111533336886587, "learning_rate": 1.5451947721626676e-05, "loss": 0.3138, "step": 1663 }, { "epoch": 1.053185595567867, "grad_norm": 0.5314585904505876, "learning_rate": 1.5446097790619096e-05, "loss": 0.2993, "step": 1664 }, { "epoch": 1.0538187574198654, "grad_norm": 0.5432014273481025, "learning_rate": 1.544024520878522e-05, "loss": 0.3208, "step": 1665 }, { "epoch": 1.0544519192718638, "grad_norm": 0.5875950539302465, "learning_rate": 1.5434389978973734e-05, "loss": 0.3283, "step": 1666 }, { "epoch": 1.0550850811238623, "grad_norm": 0.5586247196365928, "learning_rate": 1.5428532104034607e-05, "loss": 0.305, "step": 1667 }, { "epoch": 1.0557182429758607, "grad_norm": 0.5841136199222252, "learning_rate": 1.5422671586819084e-05, "loss": 0.3004, "step": 1668 }, { "epoch": 1.056351404827859, "grad_norm": 0.5721656966571029, "learning_rate": 1.541680843017971e-05, "loss": 0.3161, "step": 1669 }, { "epoch": 1.0569845666798576, "grad_norm": 0.5629167206687111, "learning_rate": 1.5410942636970308e-05, "loss": 0.3018, "step": 1670 }, { "epoch": 1.057617728531856, "grad_norm": 0.5562684912385024, "learning_rate": 1.540507421004599e-05, "loss": 0.31, "step": 1671 }, { "epoch": 1.0582508903838543, "grad_norm": 0.559913743082349, "learning_rate": 1.539920315226314e-05, "loss": 0.324, "step": 1672 }, { "epoch": 1.0588840522358527, "grad_norm": 0.5609693204479762, "learning_rate": 1.5393329466479438e-05, "loss": 0.2948, "step": 1673 }, { "epoch": 1.0595172140878513, "grad_norm": 0.6263460209539395, "learning_rate": 1.538745315555383e-05, "loss": 0.3078, "step": 1674 }, { "epoch": 1.0601503759398496, "grad_norm": 0.5598144087501957, "learning_rate": 1.5381574222346536e-05, "loss": 0.3042, "step": 1675 }, { "epoch": 1.060783537791848, "grad_norm": 0.6004910259650965, "learning_rate": 1.5375692669719072e-05, "loss": 0.3142, "step": 1676 }, { "epoch": 1.0614166996438466, "grad_norm": 0.592171272339301, "learning_rate": 1.5369808500534206e-05, "loss": 0.2971, "step": 1677 }, { "epoch": 1.062049861495845, "grad_norm": 0.5669238433582039, "learning_rate": 1.5363921717656004e-05, "loss": 0.2998, "step": 1678 }, { "epoch": 1.0626830233478433, "grad_norm": 0.6317875168893555, "learning_rate": 1.5358032323949775e-05, "loss": 0.3134, "step": 1679 }, { "epoch": 1.0633161851998416, "grad_norm": 0.5977441898003506, "learning_rate": 1.5352140322282126e-05, "loss": 0.3028, "step": 1680 }, { "epoch": 1.0639493470518402, "grad_norm": 0.5722647250286581, "learning_rate": 1.5346245715520916e-05, "loss": 0.3084, "step": 1681 }, { "epoch": 1.0645825089038385, "grad_norm": 0.6148071896890843, "learning_rate": 1.534034850653528e-05, "loss": 0.3162, "step": 1682 }, { "epoch": 1.065215670755837, "grad_norm": 0.5789063210954523, "learning_rate": 1.533444869819562e-05, "loss": 0.3166, "step": 1683 }, { "epoch": 1.0658488326078355, "grad_norm": 0.6068894022694753, "learning_rate": 1.53285462933736e-05, "loss": 0.3069, "step": 1684 }, { "epoch": 1.0664819944598338, "grad_norm": 0.5483759008649539, "learning_rate": 1.5322641294942144e-05, "loss": 0.3267, "step": 1685 }, { "epoch": 1.0671151563118322, "grad_norm": 0.6177565045643831, "learning_rate": 1.5316733705775447e-05, "loss": 0.297, "step": 1686 }, { "epoch": 1.0677483181638305, "grad_norm": 0.5648505246310863, "learning_rate": 1.5310823528748962e-05, "loss": 0.309, "step": 1687 }, { "epoch": 1.068381480015829, "grad_norm": 0.5409276894094246, "learning_rate": 1.5304910766739395e-05, "loss": 0.3213, "step": 1688 }, { "epoch": 1.0690146418678275, "grad_norm": 0.5470992703657542, "learning_rate": 1.5298995422624717e-05, "loss": 0.3057, "step": 1689 }, { "epoch": 1.0696478037198258, "grad_norm": 0.5824081439693279, "learning_rate": 1.5293077499284162e-05, "loss": 0.2976, "step": 1690 }, { "epoch": 1.0702809655718244, "grad_norm": 0.5476618968219095, "learning_rate": 1.528715699959821e-05, "loss": 0.3039, "step": 1691 }, { "epoch": 1.0709141274238227, "grad_norm": 0.5461672906858451, "learning_rate": 1.5281233926448584e-05, "loss": 0.3171, "step": 1692 }, { "epoch": 1.071547289275821, "grad_norm": 0.5273395924367978, "learning_rate": 1.527530828271829e-05, "loss": 0.302, "step": 1693 }, { "epoch": 1.0721804511278195, "grad_norm": 0.6154753690833024, "learning_rate": 1.5269380071291556e-05, "loss": 0.3093, "step": 1694 }, { "epoch": 1.072813612979818, "grad_norm": 0.5575276418294947, "learning_rate": 1.5263449295053882e-05, "loss": 0.3062, "step": 1695 }, { "epoch": 1.0734467748318164, "grad_norm": 0.5647064404551192, "learning_rate": 1.5257515956891996e-05, "loss": 0.2953, "step": 1696 }, { "epoch": 1.0740799366838147, "grad_norm": 0.5625185997913422, "learning_rate": 1.525158005969389e-05, "loss": 0.3142, "step": 1697 }, { "epoch": 1.0747130985358133, "grad_norm": 0.5604675074178819, "learning_rate": 1.5245641606348788e-05, "loss": 0.3248, "step": 1698 }, { "epoch": 1.0753462603878117, "grad_norm": 0.5602971168992987, "learning_rate": 1.5239700599747171e-05, "loss": 0.3296, "step": 1699 }, { "epoch": 1.07597942223981, "grad_norm": 0.645563729769898, "learning_rate": 1.5233757042780753e-05, "loss": 0.3101, "step": 1700 }, { "epoch": 1.0766125840918084, "grad_norm": 0.5497279759990575, "learning_rate": 1.5227810938342493e-05, "loss": 0.3146, "step": 1701 }, { "epoch": 1.077245745943807, "grad_norm": 0.5633348111465203, "learning_rate": 1.5221862289326592e-05, "loss": 0.3013, "step": 1702 }, { "epoch": 1.0778789077958053, "grad_norm": 0.5987364159120888, "learning_rate": 1.5215911098628487e-05, "loss": 0.3042, "step": 1703 }, { "epoch": 1.0785120696478037, "grad_norm": 0.5773795152203233, "learning_rate": 1.520995736914485e-05, "loss": 0.3075, "step": 1704 }, { "epoch": 1.0791452314998022, "grad_norm": 0.6113489086952053, "learning_rate": 1.5204001103773594e-05, "loss": 0.3131, "step": 1705 }, { "epoch": 1.0797783933518006, "grad_norm": 0.9200587447228236, "learning_rate": 1.5198042305413862e-05, "loss": 0.3061, "step": 1706 }, { "epoch": 1.080411555203799, "grad_norm": 0.5517751715267424, "learning_rate": 1.5192080976966033e-05, "loss": 0.3127, "step": 1707 }, { "epoch": 1.0810447170557973, "grad_norm": 0.7539180087660785, "learning_rate": 1.5186117121331717e-05, "loss": 0.3001, "step": 1708 }, { "epoch": 1.0816778789077959, "grad_norm": 0.5922199453113002, "learning_rate": 1.5180150741413747e-05, "loss": 0.317, "step": 1709 }, { "epoch": 1.0823110407597942, "grad_norm": 0.5146327431912135, "learning_rate": 1.5174181840116196e-05, "loss": 0.3058, "step": 1710 }, { "epoch": 1.0829442026117926, "grad_norm": 0.533771225212532, "learning_rate": 1.5168210420344358e-05, "loss": 0.3106, "step": 1711 }, { "epoch": 1.0835773644637912, "grad_norm": 0.5833120266700127, "learning_rate": 1.516223648500475e-05, "loss": 0.3117, "step": 1712 }, { "epoch": 1.0842105263157895, "grad_norm": 0.5033859974228944, "learning_rate": 1.5156260037005126e-05, "loss": 0.3048, "step": 1713 }, { "epoch": 1.0848436881677879, "grad_norm": 0.5496406269644025, "learning_rate": 1.5150281079254448e-05, "loss": 0.305, "step": 1714 }, { "epoch": 1.0854768500197862, "grad_norm": 0.5734178632335484, "learning_rate": 1.5144299614662905e-05, "loss": 0.3022, "step": 1715 }, { "epoch": 1.0861100118717848, "grad_norm": 0.5200603502216231, "learning_rate": 1.513831564614191e-05, "loss": 0.321, "step": 1716 }, { "epoch": 1.0867431737237832, "grad_norm": 0.528250546719799, "learning_rate": 1.5132329176604091e-05, "loss": 0.2983, "step": 1717 }, { "epoch": 1.0873763355757815, "grad_norm": 0.544269340592588, "learning_rate": 1.5126340208963293e-05, "loss": 0.3106, "step": 1718 }, { "epoch": 1.0880094974277799, "grad_norm": 0.5305411838484645, "learning_rate": 1.5120348746134579e-05, "loss": 0.3168, "step": 1719 }, { "epoch": 1.0886426592797784, "grad_norm": 0.5037973013421765, "learning_rate": 1.5114354791034225e-05, "loss": 0.3304, "step": 1720 }, { "epoch": 1.0892758211317768, "grad_norm": 0.6190550227800878, "learning_rate": 1.5108358346579718e-05, "loss": 0.2921, "step": 1721 }, { "epoch": 1.0899089829837751, "grad_norm": 0.552357725533165, "learning_rate": 1.5102359415689765e-05, "loss": 0.3123, "step": 1722 }, { "epoch": 1.0905421448357737, "grad_norm": 0.5239164205755068, "learning_rate": 1.5096358001284275e-05, "loss": 0.3173, "step": 1723 }, { "epoch": 1.091175306687772, "grad_norm": 0.5563865099083752, "learning_rate": 1.5090354106284369e-05, "loss": 0.3105, "step": 1724 }, { "epoch": 1.0918084685397704, "grad_norm": 0.4989753867145873, "learning_rate": 1.508434773361237e-05, "loss": 0.3011, "step": 1725 }, { "epoch": 1.092441630391769, "grad_norm": 0.562023907598398, "learning_rate": 1.5078338886191819e-05, "loss": 0.3158, "step": 1726 }, { "epoch": 1.0930747922437674, "grad_norm": 0.5383604746330817, "learning_rate": 1.507232756694745e-05, "loss": 0.3113, "step": 1727 }, { "epoch": 1.0937079540957657, "grad_norm": 0.5862192781483457, "learning_rate": 1.5066313778805205e-05, "loss": 0.2905, "step": 1728 }, { "epoch": 1.094341115947764, "grad_norm": 0.5802880386221689, "learning_rate": 1.5060297524692227e-05, "loss": 0.3154, "step": 1729 }, { "epoch": 1.0949742777997626, "grad_norm": 0.5051226434566322, "learning_rate": 1.5054278807536857e-05, "loss": 0.3187, "step": 1730 }, { "epoch": 1.095607439651761, "grad_norm": 0.5490226376812527, "learning_rate": 1.5048257630268643e-05, "loss": 0.3057, "step": 1731 }, { "epoch": 1.0962406015037593, "grad_norm": 0.5609521724439424, "learning_rate": 1.5042233995818323e-05, "loss": 0.3191, "step": 1732 }, { "epoch": 1.0968737633557577, "grad_norm": 0.5191547004649467, "learning_rate": 1.5036207907117828e-05, "loss": 0.3064, "step": 1733 }, { "epoch": 1.0975069252077563, "grad_norm": 0.5013824575923301, "learning_rate": 1.5030179367100295e-05, "loss": 0.292, "step": 1734 }, { "epoch": 1.0981400870597546, "grad_norm": 0.4937009041786699, "learning_rate": 1.502414837870004e-05, "loss": 0.3058, "step": 1735 }, { "epoch": 1.098773248911753, "grad_norm": 0.527148191098657, "learning_rate": 1.5018114944852587e-05, "loss": 0.3201, "step": 1736 }, { "epoch": 1.0994064107637516, "grad_norm": 0.5425581555897361, "learning_rate": 1.5012079068494636e-05, "loss": 0.3064, "step": 1737 }, { "epoch": 1.10003957261575, "grad_norm": 0.5462996405483178, "learning_rate": 1.5006040752564079e-05, "loss": 0.3036, "step": 1738 }, { "epoch": 1.1006727344677483, "grad_norm": 0.5038115672082406, "learning_rate": 1.5000000000000002e-05, "loss": 0.3189, "step": 1739 }, { "epoch": 1.1013058963197468, "grad_norm": 0.5538285353897318, "learning_rate": 1.4993956813742672e-05, "loss": 0.3175, "step": 1740 }, { "epoch": 1.1019390581717452, "grad_norm": 3.9104470877004456, "learning_rate": 1.4987911196733537e-05, "loss": 0.3034, "step": 1741 }, { "epoch": 1.1025722200237436, "grad_norm": 0.6249402217830983, "learning_rate": 1.498186315191524e-05, "loss": 0.3013, "step": 1742 }, { "epoch": 1.103205381875742, "grad_norm": 0.5328341633432763, "learning_rate": 1.4975812682231599e-05, "loss": 0.3155, "step": 1743 }, { "epoch": 1.1038385437277405, "grad_norm": 0.5684576527976585, "learning_rate": 1.49697597906276e-05, "loss": 0.3259, "step": 1744 }, { "epoch": 1.1044717055797388, "grad_norm": 0.5524994132552267, "learning_rate": 1.4963704480049431e-05, "loss": 0.3024, "step": 1745 }, { "epoch": 1.1051048674317372, "grad_norm": 0.5007462403651529, "learning_rate": 1.4957646753444438e-05, "loss": 0.3147, "step": 1746 }, { "epoch": 1.1057380292837355, "grad_norm": 0.5365447926890649, "learning_rate": 1.4951586613761154e-05, "loss": 0.3202, "step": 1747 }, { "epoch": 1.1063711911357341, "grad_norm": 0.5643076516797405, "learning_rate": 1.4945524063949283e-05, "loss": 0.3139, "step": 1748 }, { "epoch": 1.1070043529877325, "grad_norm": 0.5083811929723338, "learning_rate": 1.4939459106959701e-05, "loss": 0.3125, "step": 1749 }, { "epoch": 1.1076375148397308, "grad_norm": 0.5341800015258049, "learning_rate": 1.4933391745744462e-05, "loss": 0.3226, "step": 1750 }, { "epoch": 1.1082706766917294, "grad_norm": 0.6178276716380817, "learning_rate": 1.4927321983256778e-05, "loss": 0.3136, "step": 1751 }, { "epoch": 1.1089038385437278, "grad_norm": 0.598068666770699, "learning_rate": 1.4921249822451037e-05, "loss": 0.3145, "step": 1752 }, { "epoch": 1.1095370003957261, "grad_norm": 0.6438807547781815, "learning_rate": 1.4915175266282804e-05, "loss": 0.3084, "step": 1753 }, { "epoch": 1.1101701622477247, "grad_norm": 0.5724915187697044, "learning_rate": 1.4909098317708793e-05, "loss": 0.3067, "step": 1754 }, { "epoch": 1.110803324099723, "grad_norm": 0.5583196864779123, "learning_rate": 1.490301897968689e-05, "loss": 0.3085, "step": 1755 }, { "epoch": 1.1114364859517214, "grad_norm": 0.5599075125195394, "learning_rate": 1.4896937255176142e-05, "loss": 0.3054, "step": 1756 }, { "epoch": 1.1120696478037198, "grad_norm": 0.5187830310970398, "learning_rate": 1.4890853147136763e-05, "loss": 0.3144, "step": 1757 }, { "epoch": 1.1127028096557183, "grad_norm": 0.508776577866178, "learning_rate": 1.4884766658530126e-05, "loss": 0.3049, "step": 1758 }, { "epoch": 1.1133359715077167, "grad_norm": 0.5550008827610642, "learning_rate": 1.4878677792318751e-05, "loss": 0.3125, "step": 1759 }, { "epoch": 1.113969133359715, "grad_norm": 0.56542465832283, "learning_rate": 1.4872586551466335e-05, "loss": 0.3105, "step": 1760 }, { "epoch": 1.1146022952117134, "grad_norm": 0.6354202830223854, "learning_rate": 1.4866492938937713e-05, "loss": 0.3163, "step": 1761 }, { "epoch": 1.115235457063712, "grad_norm": 0.5614217123584127, "learning_rate": 1.4860396957698884e-05, "loss": 0.326, "step": 1762 }, { "epoch": 1.1158686189157103, "grad_norm": 0.5396578674410697, "learning_rate": 1.4854298610717e-05, "loss": 0.2922, "step": 1763 }, { "epoch": 1.1165017807677087, "grad_norm": 0.5104726937243509, "learning_rate": 1.484819790096036e-05, "loss": 0.3031, "step": 1764 }, { "epoch": 1.1171349426197072, "grad_norm": 0.5820428832129815, "learning_rate": 1.4842094831398413e-05, "loss": 0.3015, "step": 1765 }, { "epoch": 1.1177681044717056, "grad_norm": 0.5436540147739202, "learning_rate": 1.4835989405001763e-05, "loss": 0.3149, "step": 1766 }, { "epoch": 1.118401266323704, "grad_norm": 0.541363242492549, "learning_rate": 1.4829881624742154e-05, "loss": 0.2969, "step": 1767 }, { "epoch": 1.1190344281757023, "grad_norm": 0.518543594614504, "learning_rate": 1.4823771493592483e-05, "loss": 0.3388, "step": 1768 }, { "epoch": 1.1196675900277009, "grad_norm": 0.5401350719399879, "learning_rate": 1.4817659014526781e-05, "loss": 0.3336, "step": 1769 }, { "epoch": 1.1203007518796992, "grad_norm": 0.49503602486117204, "learning_rate": 1.481154419052023e-05, "loss": 0.3111, "step": 1770 }, { "epoch": 1.1209339137316976, "grad_norm": 0.7135448039243888, "learning_rate": 1.4805427024549156e-05, "loss": 0.3114, "step": 1771 }, { "epoch": 1.1215670755836962, "grad_norm": 0.5742916662935447, "learning_rate": 1.4799307519591013e-05, "loss": 0.3092, "step": 1772 }, { "epoch": 1.1222002374356945, "grad_norm": 0.5664472257719487, "learning_rate": 1.4793185678624401e-05, "loss": 0.2919, "step": 1773 }, { "epoch": 1.1228333992876929, "grad_norm": 0.5396997581565317, "learning_rate": 1.4787061504629066e-05, "loss": 0.296, "step": 1774 }, { "epoch": 1.1234665611396912, "grad_norm": 0.5710878436843616, "learning_rate": 1.4780935000585867e-05, "loss": 0.3125, "step": 1775 }, { "epoch": 1.1240997229916898, "grad_norm": 0.5672649526587157, "learning_rate": 1.4774806169476821e-05, "loss": 0.2943, "step": 1776 }, { "epoch": 1.1247328848436882, "grad_norm": 0.6311387543895239, "learning_rate": 1.4768675014285063e-05, "loss": 0.3111, "step": 1777 }, { "epoch": 1.1253660466956865, "grad_norm": 0.601157642495307, "learning_rate": 1.476254153799486e-05, "loss": 0.2964, "step": 1778 }, { "epoch": 1.125999208547685, "grad_norm": 0.518222353364912, "learning_rate": 1.4756405743591617e-05, "loss": 0.3107, "step": 1779 }, { "epoch": 1.1266323703996834, "grad_norm": 0.5641056019157865, "learning_rate": 1.4750267634061859e-05, "loss": 0.3176, "step": 1780 }, { "epoch": 1.1272655322516818, "grad_norm": 0.5614603800536253, "learning_rate": 1.4744127212393243e-05, "loss": 0.3104, "step": 1781 }, { "epoch": 1.1278986941036804, "grad_norm": 0.5423578043803612, "learning_rate": 1.4737984481574548e-05, "loss": 0.3001, "step": 1782 }, { "epoch": 1.1285318559556787, "grad_norm": 0.5060971779436744, "learning_rate": 1.4731839444595681e-05, "loss": 0.3108, "step": 1783 }, { "epoch": 1.129165017807677, "grad_norm": 0.5682612824594523, "learning_rate": 1.4725692104447667e-05, "loss": 0.3149, "step": 1784 }, { "epoch": 1.1297981796596754, "grad_norm": 0.5241403345405216, "learning_rate": 1.4719542464122656e-05, "loss": 0.3055, "step": 1785 }, { "epoch": 1.130431341511674, "grad_norm": 0.5459665090118921, "learning_rate": 1.4713390526613908e-05, "loss": 0.3063, "step": 1786 }, { "epoch": 1.1310645033636724, "grad_norm": 0.5607773399393053, "learning_rate": 1.470723629491582e-05, "loss": 0.3003, "step": 1787 }, { "epoch": 1.1316976652156707, "grad_norm": 0.506548524252695, "learning_rate": 1.470107977202389e-05, "loss": 0.3189, "step": 1788 }, { "epoch": 1.132330827067669, "grad_norm": 0.6013273914275514, "learning_rate": 1.4694920960934735e-05, "loss": 0.3144, "step": 1789 }, { "epoch": 1.1329639889196677, "grad_norm": 0.489534668463401, "learning_rate": 1.4688759864646083e-05, "loss": 0.3232, "step": 1790 }, { "epoch": 1.133597150771666, "grad_norm": 0.6695221887725161, "learning_rate": 1.4682596486156782e-05, "loss": 0.3202, "step": 1791 }, { "epoch": 1.1342303126236644, "grad_norm": 0.5210979652430595, "learning_rate": 1.4676430828466787e-05, "loss": 0.3283, "step": 1792 }, { "epoch": 1.134863474475663, "grad_norm": 0.5042564689486951, "learning_rate": 1.4670262894577161e-05, "loss": 0.302, "step": 1793 }, { "epoch": 1.1354966363276613, "grad_norm": 0.7198297524829662, "learning_rate": 1.4664092687490072e-05, "loss": 0.312, "step": 1794 }, { "epoch": 1.1361297981796596, "grad_norm": 0.6786813322581771, "learning_rate": 1.4657920210208807e-05, "loss": 0.3084, "step": 1795 }, { "epoch": 1.136762960031658, "grad_norm": 0.5586594924846497, "learning_rate": 1.465174546573774e-05, "loss": 0.3125, "step": 1796 }, { "epoch": 1.1373961218836566, "grad_norm": 0.5974054101850985, "learning_rate": 1.4645568457082362e-05, "loss": 0.3236, "step": 1797 }, { "epoch": 1.138029283735655, "grad_norm": 0.6308736059890548, "learning_rate": 1.4639389187249263e-05, "loss": 0.3205, "step": 1798 }, { "epoch": 1.1386624455876533, "grad_norm": 0.5544784076178716, "learning_rate": 1.4633207659246129e-05, "loss": 0.3008, "step": 1799 }, { "epoch": 1.1392956074396516, "grad_norm": 0.5944627568144315, "learning_rate": 1.462702387608175e-05, "loss": 0.2895, "step": 1800 }, { "epoch": 1.1399287692916502, "grad_norm": 0.5683461553496921, "learning_rate": 1.4620837840766015e-05, "loss": 0.3104, "step": 1801 }, { "epoch": 1.1405619311436486, "grad_norm": 0.577725831005889, "learning_rate": 1.4614649556309903e-05, "loss": 0.3013, "step": 1802 }, { "epoch": 1.141195092995647, "grad_norm": 0.53896706487561, "learning_rate": 1.4608459025725492e-05, "loss": 0.3039, "step": 1803 }, { "epoch": 1.1418282548476455, "grad_norm": 0.5208124291092108, "learning_rate": 1.4602266252025954e-05, "loss": 0.3038, "step": 1804 }, { "epoch": 1.1424614166996439, "grad_norm": 0.563230605549027, "learning_rate": 1.4596071238225551e-05, "loss": 0.3038, "step": 1805 }, { "epoch": 1.1430945785516422, "grad_norm": 0.5786673364829151, "learning_rate": 1.4589873987339635e-05, "loss": 0.2948, "step": 1806 }, { "epoch": 1.1437277404036408, "grad_norm": 0.5420770431274006, "learning_rate": 1.4583674502384648e-05, "loss": 0.2955, "step": 1807 }, { "epoch": 1.1443609022556391, "grad_norm": 0.545338634863411, "learning_rate": 1.4577472786378118e-05, "loss": 0.3054, "step": 1808 }, { "epoch": 1.1449940641076375, "grad_norm": 0.5277930242581004, "learning_rate": 1.457126884233866e-05, "loss": 0.3054, "step": 1809 }, { "epoch": 1.1456272259596358, "grad_norm": 0.5332603778091792, "learning_rate": 1.4565062673285973e-05, "loss": 0.2943, "step": 1810 }, { "epoch": 1.1462603878116344, "grad_norm": 0.5000377916342283, "learning_rate": 1.4558854282240839e-05, "loss": 0.3047, "step": 1811 }, { "epoch": 1.1468935496636328, "grad_norm": 0.5493800334909296, "learning_rate": 1.4552643672225125e-05, "loss": 0.3118, "step": 1812 }, { "epoch": 1.1475267115156311, "grad_norm": 0.5248037920959219, "learning_rate": 1.4546430846261772e-05, "loss": 0.3172, "step": 1813 }, { "epoch": 1.1481598733676295, "grad_norm": 0.5710841439806423, "learning_rate": 1.4540215807374805e-05, "loss": 0.3021, "step": 1814 }, { "epoch": 1.148793035219628, "grad_norm": 0.5464042444772632, "learning_rate": 1.4533998558589319e-05, "loss": 0.3049, "step": 1815 }, { "epoch": 1.1494261970716264, "grad_norm": 0.5344820419820885, "learning_rate": 1.4527779102931496e-05, "loss": 0.308, "step": 1816 }, { "epoch": 1.1500593589236248, "grad_norm": 0.5113871532924099, "learning_rate": 1.452155744342858e-05, "loss": 0.3157, "step": 1817 }, { "epoch": 1.1506925207756233, "grad_norm": 0.585879258080287, "learning_rate": 1.4515333583108896e-05, "loss": 0.3161, "step": 1818 }, { "epoch": 1.1513256826276217, "grad_norm": 0.48608452355068804, "learning_rate": 1.4509107525001836e-05, "loss": 0.3078, "step": 1819 }, { "epoch": 1.15195884447962, "grad_norm": 0.5580677517522848, "learning_rate": 1.450287927213786e-05, "loss": 0.3071, "step": 1820 }, { "epoch": 1.1525920063316186, "grad_norm": 0.530381430950991, "learning_rate": 1.4496648827548505e-05, "loss": 0.3307, "step": 1821 }, { "epoch": 1.153225168183617, "grad_norm": 0.5466636361388336, "learning_rate": 1.4490416194266371e-05, "loss": 0.3063, "step": 1822 }, { "epoch": 1.1538583300356153, "grad_norm": 0.53518053466448, "learning_rate": 1.4484181375325112e-05, "loss": 0.3154, "step": 1823 }, { "epoch": 1.1544914918876137, "grad_norm": 0.5633882640520078, "learning_rate": 1.4477944373759466e-05, "loss": 0.3097, "step": 1824 }, { "epoch": 1.1551246537396123, "grad_norm": 0.5595064814439863, "learning_rate": 1.4471705192605212e-05, "loss": 0.286, "step": 1825 }, { "epoch": 1.1557578155916106, "grad_norm": 0.5831648651552059, "learning_rate": 1.446546383489921e-05, "loss": 0.3053, "step": 1826 }, { "epoch": 1.156390977443609, "grad_norm": 0.5464629327295855, "learning_rate": 1.4459220303679367e-05, "loss": 0.3072, "step": 1827 }, { "epoch": 1.1570241392956073, "grad_norm": 0.578422802603394, "learning_rate": 1.4452974601984648e-05, "loss": 0.3084, "step": 1828 }, { "epoch": 1.157657301147606, "grad_norm": 0.5592267629770608, "learning_rate": 1.444672673285508e-05, "loss": 0.3046, "step": 1829 }, { "epoch": 1.1582904629996043, "grad_norm": 0.9295360299514029, "learning_rate": 1.4440476699331742e-05, "loss": 0.3216, "step": 1830 }, { "epoch": 1.1589236248516026, "grad_norm": 0.5693398659903085, "learning_rate": 1.4434224504456766e-05, "loss": 0.3125, "step": 1831 }, { "epoch": 1.1595567867036012, "grad_norm": 0.5318679815504553, "learning_rate": 1.4427970151273338e-05, "loss": 0.2985, "step": 1832 }, { "epoch": 1.1601899485555995, "grad_norm": 0.5352424674125277, "learning_rate": 1.4421713642825694e-05, "loss": 0.3183, "step": 1833 }, { "epoch": 1.160823110407598, "grad_norm": 0.5287121864139575, "learning_rate": 1.4415454982159121e-05, "loss": 0.3065, "step": 1834 }, { "epoch": 1.1614562722595965, "grad_norm": 0.5048364398632921, "learning_rate": 1.4409194172319945e-05, "loss": 0.3063, "step": 1835 }, { "epoch": 1.1620894341115948, "grad_norm": 0.5232197588929182, "learning_rate": 1.4402931216355544e-05, "loss": 0.3252, "step": 1836 }, { "epoch": 1.1627225959635932, "grad_norm": 0.5590467308784596, "learning_rate": 1.4396666117314349e-05, "loss": 0.3119, "step": 1837 }, { "epoch": 1.1633557578155915, "grad_norm": 0.5529657487210894, "learning_rate": 1.4390398878245816e-05, "loss": 0.3212, "step": 1838 }, { "epoch": 1.16398891966759, "grad_norm": 0.5068058794900825, "learning_rate": 1.4384129502200462e-05, "loss": 0.3244, "step": 1839 }, { "epoch": 1.1646220815195885, "grad_norm": 0.5347982755045098, "learning_rate": 1.4377857992229825e-05, "loss": 0.3199, "step": 1840 }, { "epoch": 1.1652552433715868, "grad_norm": 0.5594042940948376, "learning_rate": 1.4371584351386496e-05, "loss": 0.3087, "step": 1841 }, { "epoch": 1.1658884052235852, "grad_norm": 0.5399467455949095, "learning_rate": 1.43653085827241e-05, "loss": 0.2982, "step": 1842 }, { "epoch": 1.1665215670755837, "grad_norm": 0.515977185317248, "learning_rate": 1.4359030689297296e-05, "loss": 0.3013, "step": 1843 }, { "epoch": 1.167154728927582, "grad_norm": 0.513535436352586, "learning_rate": 1.4352750674161774e-05, "loss": 0.3152, "step": 1844 }, { "epoch": 1.1677878907795805, "grad_norm": 0.5464921596516767, "learning_rate": 1.4346468540374264e-05, "loss": 0.3125, "step": 1845 }, { "epoch": 1.168421052631579, "grad_norm": 0.5171018805916527, "learning_rate": 1.4340184290992518e-05, "loss": 0.3024, "step": 1846 }, { "epoch": 1.1690542144835774, "grad_norm": 0.5324828033235169, "learning_rate": 1.4333897929075329e-05, "loss": 0.3069, "step": 1847 }, { "epoch": 1.1696873763355757, "grad_norm": 0.551660952169537, "learning_rate": 1.432760945768251e-05, "loss": 0.3312, "step": 1848 }, { "epoch": 1.1703205381875743, "grad_norm": 0.5479504096324408, "learning_rate": 1.4321318879874899e-05, "loss": 0.2901, "step": 1849 }, { "epoch": 1.1709537000395727, "grad_norm": 0.5073825234665386, "learning_rate": 1.431502619871437e-05, "loss": 0.3374, "step": 1850 }, { "epoch": 1.171586861891571, "grad_norm": 0.5230460702998075, "learning_rate": 1.4308731417263812e-05, "loss": 0.3085, "step": 1851 }, { "epoch": 1.1722200237435694, "grad_norm": 0.5409716037196278, "learning_rate": 1.4302434538587138e-05, "loss": 0.3235, "step": 1852 }, { "epoch": 1.172853185595568, "grad_norm": 0.5087738287145734, "learning_rate": 1.429613556574928e-05, "loss": 0.3287, "step": 1853 }, { "epoch": 1.1734863474475663, "grad_norm": 0.5480139143534769, "learning_rate": 1.4289834501816194e-05, "loss": 0.3096, "step": 1854 }, { "epoch": 1.1741195092995647, "grad_norm": 0.542232383338522, "learning_rate": 1.4283531349854853e-05, "loss": 0.3198, "step": 1855 }, { "epoch": 1.174752671151563, "grad_norm": 0.5564293187119811, "learning_rate": 1.4277226112933242e-05, "loss": 0.3194, "step": 1856 }, { "epoch": 1.1753858330035616, "grad_norm": 0.5395825956970696, "learning_rate": 1.4270918794120363e-05, "loss": 0.2938, "step": 1857 }, { "epoch": 1.17601899485556, "grad_norm": 0.5510657373124017, "learning_rate": 1.4264609396486233e-05, "loss": 0.3146, "step": 1858 }, { "epoch": 1.1766521567075583, "grad_norm": 0.5437364984140245, "learning_rate": 1.425829792310188e-05, "loss": 0.3148, "step": 1859 }, { "epoch": 1.1772853185595569, "grad_norm": 0.5724403537780681, "learning_rate": 1.425198437703934e-05, "loss": 0.3069, "step": 1860 }, { "epoch": 1.1779184804115552, "grad_norm": 0.5428739351436541, "learning_rate": 1.4245668761371666e-05, "loss": 0.3051, "step": 1861 }, { "epoch": 1.1785516422635536, "grad_norm": 0.5082116241799514, "learning_rate": 1.42393510791729e-05, "loss": 0.3149, "step": 1862 }, { "epoch": 1.1791848041155522, "grad_norm": 0.5188764884747145, "learning_rate": 1.4233031333518117e-05, "loss": 0.2907, "step": 1863 }, { "epoch": 1.1798179659675505, "grad_norm": 0.8565837854682765, "learning_rate": 1.4226709527483373e-05, "loss": 0.308, "step": 1864 }, { "epoch": 1.1804511278195489, "grad_norm": 0.5697835279600966, "learning_rate": 1.4220385664145734e-05, "loss": 0.3106, "step": 1865 }, { "epoch": 1.1810842896715472, "grad_norm": 0.5059836521755822, "learning_rate": 1.4214059746583276e-05, "loss": 0.3007, "step": 1866 }, { "epoch": 1.1817174515235458, "grad_norm": 0.4883052941443527, "learning_rate": 1.4207731777875058e-05, "loss": 0.2912, "step": 1867 }, { "epoch": 1.1823506133755441, "grad_norm": 0.5915745335383517, "learning_rate": 1.4201401761101155e-05, "loss": 0.3032, "step": 1868 }, { "epoch": 1.1829837752275425, "grad_norm": 0.62728186718479, "learning_rate": 1.4195069699342625e-05, "loss": 0.325, "step": 1869 }, { "epoch": 1.1836169370795409, "grad_norm": 0.5222018897448786, "learning_rate": 1.4188735595681534e-05, "loss": 0.3041, "step": 1870 }, { "epoch": 1.1842500989315394, "grad_norm": 0.5102100732940141, "learning_rate": 1.4182399453200928e-05, "loss": 0.337, "step": 1871 }, { "epoch": 1.1848832607835378, "grad_norm": 0.5305889015400258, "learning_rate": 1.4176061274984858e-05, "loss": 0.3173, "step": 1872 }, { "epoch": 1.1855164226355361, "grad_norm": 0.4876798083768718, "learning_rate": 1.4169721064118353e-05, "loss": 0.3344, "step": 1873 }, { "epoch": 1.1861495844875347, "grad_norm": 0.5590313867168931, "learning_rate": 1.4163378823687447e-05, "loss": 0.3016, "step": 1874 }, { "epoch": 1.186782746339533, "grad_norm": 0.5147068277440804, "learning_rate": 1.415703455677915e-05, "loss": 0.3195, "step": 1875 }, { "epoch": 1.1874159081915314, "grad_norm": 0.5336654981056389, "learning_rate": 1.4150688266481463e-05, "loss": 0.2989, "step": 1876 }, { "epoch": 1.18804907004353, "grad_norm": 0.5539834180951126, "learning_rate": 1.414433995588337e-05, "loss": 0.3057, "step": 1877 }, { "epoch": 1.1886822318955284, "grad_norm": 0.5042858479934469, "learning_rate": 1.4137989628074835e-05, "loss": 0.3003, "step": 1878 }, { "epoch": 1.1893153937475267, "grad_norm": 0.5852151044300166, "learning_rate": 1.4131637286146815e-05, "loss": 0.3013, "step": 1879 }, { "epoch": 1.189948555599525, "grad_norm": 0.5347150172844027, "learning_rate": 1.4125282933191237e-05, "loss": 0.3159, "step": 1880 }, { "epoch": 1.1905817174515236, "grad_norm": 0.5402446585392207, "learning_rate": 1.4118926572301008e-05, "loss": 0.3144, "step": 1881 }, { "epoch": 1.191214879303522, "grad_norm": 0.5930401958220756, "learning_rate": 1.4112568206570018e-05, "loss": 0.3047, "step": 1882 }, { "epoch": 1.1918480411555203, "grad_norm": 0.5300518395401854, "learning_rate": 1.410620783909313e-05, "loss": 0.3068, "step": 1883 }, { "epoch": 1.1924812030075187, "grad_norm": 0.5439923475377881, "learning_rate": 1.4099845472966179e-05, "loss": 0.336, "step": 1884 }, { "epoch": 1.1931143648595173, "grad_norm": 0.5580416825818032, "learning_rate": 1.4093481111285973e-05, "loss": 0.3075, "step": 1885 }, { "epoch": 1.1937475267115156, "grad_norm": 0.5277831496132138, "learning_rate": 1.4087114757150293e-05, "loss": 0.3159, "step": 1886 }, { "epoch": 1.194380688563514, "grad_norm": 0.5264771946741991, "learning_rate": 1.4080746413657896e-05, "loss": 0.2931, "step": 1887 }, { "epoch": 1.1950138504155126, "grad_norm": 0.5729685331073776, "learning_rate": 1.407437608390849e-05, "loss": 0.3267, "step": 1888 }, { "epoch": 1.195647012267511, "grad_norm": 0.49980650693393214, "learning_rate": 1.4068003771002766e-05, "loss": 0.3084, "step": 1889 }, { "epoch": 1.1962801741195093, "grad_norm": 0.5829189207357429, "learning_rate": 1.406162947804238e-05, "loss": 0.3187, "step": 1890 }, { "epoch": 1.1969133359715078, "grad_norm": 0.5198796904254427, "learning_rate": 1.405525320812994e-05, "loss": 0.32, "step": 1891 }, { "epoch": 1.1975464978235062, "grad_norm": 0.5702494913944562, "learning_rate": 1.4048874964369026e-05, "loss": 0.295, "step": 1892 }, { "epoch": 1.1981796596755045, "grad_norm": 0.554406274615292, "learning_rate": 1.4042494749864175e-05, "loss": 0.2959, "step": 1893 }, { "epoch": 1.198812821527503, "grad_norm": 0.5534832073617877, "learning_rate": 1.4036112567720882e-05, "loss": 0.3034, "step": 1894 }, { "epoch": 1.1994459833795015, "grad_norm": 0.6056089446556913, "learning_rate": 1.4029728421045608e-05, "loss": 0.3037, "step": 1895 }, { "epoch": 1.2000791452314998, "grad_norm": 0.5739828099094273, "learning_rate": 1.4023342312945757e-05, "loss": 0.3099, "step": 1896 }, { "epoch": 1.2007123070834982, "grad_norm": 0.595687008361321, "learning_rate": 1.4016954246529697e-05, "loss": 0.3084, "step": 1897 }, { "epoch": 1.2013454689354965, "grad_norm": 0.5226798645282734, "learning_rate": 1.4010564224906749e-05, "loss": 0.3098, "step": 1898 }, { "epoch": 1.2019786307874951, "grad_norm": 0.6749228215638818, "learning_rate": 1.4004172251187176e-05, "loss": 0.3078, "step": 1899 }, { "epoch": 1.2026117926394935, "grad_norm": 0.5471934199567136, "learning_rate": 1.399777832848221e-05, "loss": 0.3256, "step": 1900 }, { "epoch": 1.2032449544914918, "grad_norm": 0.5251682706072197, "learning_rate": 1.3991382459904013e-05, "loss": 0.3088, "step": 1901 }, { "epoch": 1.2038781163434904, "grad_norm": 0.5446685250006449, "learning_rate": 1.3984984648565703e-05, "loss": 0.3037, "step": 1902 }, { "epoch": 1.2045112781954888, "grad_norm": 0.4966635012797021, "learning_rate": 1.3978584897581344e-05, "loss": 0.329, "step": 1903 }, { "epoch": 1.205144440047487, "grad_norm": 0.6144164626095695, "learning_rate": 1.3972183210065938e-05, "loss": 0.3105, "step": 1904 }, { "epoch": 1.2057776018994855, "grad_norm": 0.6403236553837127, "learning_rate": 1.3965779589135435e-05, "loss": 0.2905, "step": 1905 }, { "epoch": 1.206410763751484, "grad_norm": 0.5378968316931688, "learning_rate": 1.395937403790673e-05, "loss": 0.3122, "step": 1906 }, { "epoch": 1.2070439256034824, "grad_norm": 0.5025801758473524, "learning_rate": 1.3952966559497644e-05, "loss": 0.3166, "step": 1907 }, { "epoch": 1.2076770874554807, "grad_norm": 0.5390481565466195, "learning_rate": 1.3946557157026953e-05, "loss": 0.3046, "step": 1908 }, { "epoch": 1.208310249307479, "grad_norm": 0.5246989301444472, "learning_rate": 1.3940145833614352e-05, "loss": 0.3187, "step": 1909 }, { "epoch": 1.2089434111594777, "grad_norm": 0.5046128811057833, "learning_rate": 1.3933732592380485e-05, "loss": 0.3119, "step": 1910 }, { "epoch": 1.209576573011476, "grad_norm": 0.5131717442755815, "learning_rate": 1.3927317436446926e-05, "loss": 0.3243, "step": 1911 }, { "epoch": 1.2102097348634744, "grad_norm": 0.5189477496541555, "learning_rate": 1.3920900368936175e-05, "loss": 0.3153, "step": 1912 }, { "epoch": 1.210842896715473, "grad_norm": 0.4838651885546405, "learning_rate": 1.3914481392971671e-05, "loss": 0.3146, "step": 1913 }, { "epoch": 1.2114760585674713, "grad_norm": 0.531841807873919, "learning_rate": 1.3908060511677774e-05, "loss": 0.3085, "step": 1914 }, { "epoch": 1.2121092204194697, "grad_norm": 0.4845135000283538, "learning_rate": 1.3901637728179778e-05, "loss": 0.2984, "step": 1915 }, { "epoch": 1.2127423822714682, "grad_norm": 0.4971621417367585, "learning_rate": 1.3895213045603897e-05, "loss": 0.3071, "step": 1916 }, { "epoch": 1.2133755441234666, "grad_norm": 0.5216365391784803, "learning_rate": 1.3888786467077276e-05, "loss": 0.2868, "step": 1917 }, { "epoch": 1.214008705975465, "grad_norm": 0.5003038353098972, "learning_rate": 1.3882357995727975e-05, "loss": 0.3019, "step": 1918 }, { "epoch": 1.2146418678274633, "grad_norm": 0.5474654185536523, "learning_rate": 1.3875927634684983e-05, "loss": 0.3128, "step": 1919 }, { "epoch": 1.2152750296794619, "grad_norm": 0.5227325110222275, "learning_rate": 1.3869495387078202e-05, "loss": 0.2937, "step": 1920 }, { "epoch": 1.2159081915314602, "grad_norm": 0.498698451661663, "learning_rate": 1.386306125603846e-05, "loss": 0.307, "step": 1921 }, { "epoch": 1.2165413533834586, "grad_norm": 0.5325298853259209, "learning_rate": 1.3856625244697495e-05, "loss": 0.3068, "step": 1922 }, { "epoch": 1.217174515235457, "grad_norm": 0.4901026343007112, "learning_rate": 1.385018735618796e-05, "loss": 0.2955, "step": 1923 }, { "epoch": 1.2178076770874555, "grad_norm": 0.5586126422144708, "learning_rate": 1.3843747593643429e-05, "loss": 0.2994, "step": 1924 }, { "epoch": 1.2184408389394539, "grad_norm": 0.5473879742722589, "learning_rate": 1.3837305960198378e-05, "loss": 0.3134, "step": 1925 }, { "epoch": 1.2190740007914522, "grad_norm": 0.5025537708617508, "learning_rate": 1.3830862458988204e-05, "loss": 0.3086, "step": 1926 }, { "epoch": 1.2197071626434508, "grad_norm": 0.6353767115333829, "learning_rate": 1.3824417093149205e-05, "loss": 0.2982, "step": 1927 }, { "epoch": 1.2203403244954492, "grad_norm": 0.5955982750936427, "learning_rate": 1.3817969865818585e-05, "loss": 0.3144, "step": 1928 }, { "epoch": 1.2209734863474475, "grad_norm": 0.5466123052298251, "learning_rate": 1.3811520780134471e-05, "loss": 0.2984, "step": 1929 }, { "epoch": 1.221606648199446, "grad_norm": 0.5852812926366472, "learning_rate": 1.3805069839235875e-05, "loss": 0.3149, "step": 1930 }, { "epoch": 1.2222398100514444, "grad_norm": 0.5525439063033442, "learning_rate": 1.3798617046262717e-05, "loss": 0.3064, "step": 1931 }, { "epoch": 1.2228729719034428, "grad_norm": 0.5112409546713541, "learning_rate": 1.3792162404355825e-05, "loss": 0.3069, "step": 1932 }, { "epoch": 1.2235061337554411, "grad_norm": 0.5600525936483683, "learning_rate": 1.3785705916656919e-05, "loss": 0.2997, "step": 1933 }, { "epoch": 1.2241392956074397, "grad_norm": 0.5084522007071355, "learning_rate": 1.3779247586308626e-05, "loss": 0.2949, "step": 1934 }, { "epoch": 1.224772457459438, "grad_norm": 0.612527440414467, "learning_rate": 1.3772787416454461e-05, "loss": 0.3184, "step": 1935 }, { "epoch": 1.2254056193114364, "grad_norm": 0.5283643618946093, "learning_rate": 1.3766325410238838e-05, "loss": 0.318, "step": 1936 }, { "epoch": 1.2260387811634348, "grad_norm": 0.5465871410658216, "learning_rate": 1.3759861570807069e-05, "loss": 0.31, "step": 1937 }, { "epoch": 1.2266719430154334, "grad_norm": 0.5346413344588923, "learning_rate": 1.3753395901305351e-05, "loss": 0.2968, "step": 1938 }, { "epoch": 1.2273051048674317, "grad_norm": 0.5189966929675964, "learning_rate": 1.3746928404880777e-05, "loss": 0.3217, "step": 1939 }, { "epoch": 1.22793826671943, "grad_norm": 0.5032162016267205, "learning_rate": 1.3740459084681327e-05, "loss": 0.3063, "step": 1940 }, { "epoch": 1.2285714285714286, "grad_norm": 0.5269527341480218, "learning_rate": 1.373398794385587e-05, "loss": 0.3266, "step": 1941 }, { "epoch": 1.229204590423427, "grad_norm": 0.5249654645731845, "learning_rate": 1.3727514985554158e-05, "loss": 0.3189, "step": 1942 }, { "epoch": 1.2298377522754254, "grad_norm": 0.5581891651273634, "learning_rate": 1.3721040212926833e-05, "loss": 0.3075, "step": 1943 }, { "epoch": 1.230470914127424, "grad_norm": 0.5348279503116771, "learning_rate": 1.3714563629125412e-05, "loss": 0.3336, "step": 1944 }, { "epoch": 1.2311040759794223, "grad_norm": 0.5903504231201901, "learning_rate": 1.3708085237302307e-05, "loss": 0.3033, "step": 1945 }, { "epoch": 1.2317372378314206, "grad_norm": 0.5254614069271176, "learning_rate": 1.3701605040610795e-05, "loss": 0.307, "step": 1946 }, { "epoch": 1.232370399683419, "grad_norm": 0.5191552517074542, "learning_rate": 1.3695123042205038e-05, "loss": 0.3128, "step": 1947 }, { "epoch": 1.2330035615354176, "grad_norm": 0.5293221958786294, "learning_rate": 1.3688639245240078e-05, "loss": 0.3149, "step": 1948 }, { "epoch": 1.233636723387416, "grad_norm": 0.5331380123029925, "learning_rate": 1.368215365287183e-05, "loss": 0.3066, "step": 1949 }, { "epoch": 1.2342698852394143, "grad_norm": 0.5296633924276604, "learning_rate": 1.3675666268257083e-05, "loss": 0.3211, "step": 1950 }, { "epoch": 1.2349030470914126, "grad_norm": 0.5408187825406171, "learning_rate": 1.3669177094553497e-05, "loss": 0.3197, "step": 1951 }, { "epoch": 1.2355362089434112, "grad_norm": 0.5895725572155263, "learning_rate": 1.3662686134919598e-05, "loss": 0.3014, "step": 1952 }, { "epoch": 1.2361693707954096, "grad_norm": 0.5565065295628924, "learning_rate": 1.3656193392514796e-05, "loss": 0.3082, "step": 1953 }, { "epoch": 1.236802532647408, "grad_norm": 0.5194081206172105, "learning_rate": 1.3649698870499353e-05, "loss": 0.3088, "step": 1954 }, { "epoch": 1.2374356944994065, "grad_norm": 0.5692690350984452, "learning_rate": 1.3643202572034406e-05, "loss": 0.3205, "step": 1955 }, { "epoch": 1.2380688563514048, "grad_norm": 0.47814292440506484, "learning_rate": 1.3636704500281957e-05, "loss": 0.3051, "step": 1956 }, { "epoch": 1.2387020182034032, "grad_norm": 0.5410118100851087, "learning_rate": 1.3630204658404864e-05, "loss": 0.319, "step": 1957 }, { "epoch": 1.2393351800554018, "grad_norm": 0.5310666468178249, "learning_rate": 1.3623703049566855e-05, "loss": 0.2981, "step": 1958 }, { "epoch": 1.2399683419074001, "grad_norm": 0.5432390743671851, "learning_rate": 1.3617199676932514e-05, "loss": 0.3025, "step": 1959 }, { "epoch": 1.2406015037593985, "grad_norm": 0.5256519182555329, "learning_rate": 1.3610694543667283e-05, "loss": 0.3256, "step": 1960 }, { "epoch": 1.2412346656113968, "grad_norm": 0.5840448109200881, "learning_rate": 1.360418765293746e-05, "loss": 0.309, "step": 1961 }, { "epoch": 1.2418678274633954, "grad_norm": 0.5138028154242975, "learning_rate": 1.3597679007910203e-05, "loss": 0.3238, "step": 1962 }, { "epoch": 1.2425009893153938, "grad_norm": 0.5545425953234864, "learning_rate": 1.359116861175352e-05, "loss": 0.3289, "step": 1963 }, { "epoch": 1.2431341511673921, "grad_norm": 0.5111932827579836, "learning_rate": 1.3584656467636275e-05, "loss": 0.2993, "step": 1964 }, { "epoch": 1.2437673130193905, "grad_norm": 0.47896118846191804, "learning_rate": 1.3578142578728173e-05, "loss": 0.2893, "step": 1965 }, { "epoch": 1.244400474871389, "grad_norm": 0.5387691490229412, "learning_rate": 1.3571626948199783e-05, "loss": 0.3115, "step": 1966 }, { "epoch": 1.2450336367233874, "grad_norm": 0.5186989430101787, "learning_rate": 1.3565109579222511e-05, "loss": 0.3049, "step": 1967 }, { "epoch": 1.2456667985753858, "grad_norm": 0.475654595178711, "learning_rate": 1.3558590474968611e-05, "loss": 0.3065, "step": 1968 }, { "epoch": 1.2462999604273843, "grad_norm": 0.5095022775904415, "learning_rate": 1.3552069638611189e-05, "loss": 0.3115, "step": 1969 }, { "epoch": 1.2469331222793827, "grad_norm": 0.5285965832485278, "learning_rate": 1.354554707332418e-05, "loss": 0.2969, "step": 1970 }, { "epoch": 1.247566284131381, "grad_norm": 0.5210217047454638, "learning_rate": 1.3539022782282376e-05, "loss": 0.3206, "step": 1971 }, { "epoch": 1.2481994459833796, "grad_norm": 0.5228455435380135, "learning_rate": 1.3532496768661397e-05, "loss": 0.3072, "step": 1972 }, { "epoch": 1.248832607835378, "grad_norm": 0.5124167985012986, "learning_rate": 1.3525969035637704e-05, "loss": 0.3054, "step": 1973 }, { "epoch": 1.2494657696873763, "grad_norm": 0.49955685472886713, "learning_rate": 1.3519439586388605e-05, "loss": 0.3209, "step": 1974 }, { "epoch": 1.2500989315393747, "grad_norm": 0.5004150251956284, "learning_rate": 1.3512908424092228e-05, "loss": 0.3013, "step": 1975 }, { "epoch": 1.250732093391373, "grad_norm": 0.49409935586036896, "learning_rate": 1.3506375551927546e-05, "loss": 0.3238, "step": 1976 }, { "epoch": 1.2513652552433716, "grad_norm": 0.5111116207703327, "learning_rate": 1.3499840973074359e-05, "loss": 0.313, "step": 1977 }, { "epoch": 1.25199841709537, "grad_norm": 0.5599795668080154, "learning_rate": 1.3493304690713296e-05, "loss": 0.2994, "step": 1978 }, { "epoch": 1.2526315789473683, "grad_norm": 0.5289481796549842, "learning_rate": 1.3486766708025823e-05, "loss": 0.3193, "step": 1979 }, { "epoch": 1.253264740799367, "grad_norm": 0.6104907088272329, "learning_rate": 1.3480227028194231e-05, "loss": 0.3017, "step": 1980 }, { "epoch": 1.2538979026513652, "grad_norm": 0.5866906088887796, "learning_rate": 1.3473685654401624e-05, "loss": 0.2999, "step": 1981 }, { "epoch": 1.2545310645033636, "grad_norm": 0.5003824939353901, "learning_rate": 1.3467142589831954e-05, "loss": 0.3076, "step": 1982 }, { "epoch": 1.2551642263553622, "grad_norm": 0.5339432374115344, "learning_rate": 1.3460597837669975e-05, "loss": 0.3273, "step": 1983 }, { "epoch": 1.2557973882073605, "grad_norm": 0.5827233334169565, "learning_rate": 1.3454051401101278e-05, "loss": 0.3189, "step": 1984 }, { "epoch": 1.2564305500593589, "grad_norm": 0.6676812578914499, "learning_rate": 1.3447503283312263e-05, "loss": 0.3089, "step": 1985 }, { "epoch": 1.2570637119113575, "grad_norm": 0.5251655616662864, "learning_rate": 1.3440953487490145e-05, "loss": 0.3121, "step": 1986 }, { "epoch": 1.2576968737633558, "grad_norm": 0.5654790897690088, "learning_rate": 1.3434402016822974e-05, "loss": 0.3045, "step": 1987 }, { "epoch": 1.2583300356153542, "grad_norm": 0.5128602003797815, "learning_rate": 1.3427848874499598e-05, "loss": 0.3086, "step": 1988 }, { "epoch": 1.2589631974673525, "grad_norm": 0.5103411385490139, "learning_rate": 1.3421294063709686e-05, "loss": 0.3104, "step": 1989 }, { "epoch": 1.2595963593193509, "grad_norm": 0.5396148301116697, "learning_rate": 1.341473758764372e-05, "loss": 0.3143, "step": 1990 }, { "epoch": 1.2602295211713495, "grad_norm": 0.49702997126763726, "learning_rate": 1.3408179449492984e-05, "loss": 0.3169, "step": 1991 }, { "epoch": 1.2608626830233478, "grad_norm": 0.5639202643880502, "learning_rate": 1.3401619652449587e-05, "loss": 0.3077, "step": 1992 }, { "epoch": 1.2614958448753462, "grad_norm": 0.5344021706652423, "learning_rate": 1.3395058199706428e-05, "loss": 0.3031, "step": 1993 }, { "epoch": 1.2621290067273447, "grad_norm": 0.5527348501275611, "learning_rate": 1.3388495094457218e-05, "loss": 0.2972, "step": 1994 }, { "epoch": 1.262762168579343, "grad_norm": 0.7365050978381765, "learning_rate": 1.3381930339896483e-05, "loss": 0.3152, "step": 1995 }, { "epoch": 1.2633953304313414, "grad_norm": 0.6318873965315452, "learning_rate": 1.3375363939219535e-05, "loss": 0.2974, "step": 1996 }, { "epoch": 1.26402849228334, "grad_norm": 0.4934696280038517, "learning_rate": 1.3368795895622497e-05, "loss": 0.3099, "step": 1997 }, { "epoch": 1.2646616541353384, "grad_norm": 0.5568285201928311, "learning_rate": 1.3362226212302291e-05, "loss": 0.2975, "step": 1998 }, { "epoch": 1.2652948159873367, "grad_norm": 0.5130250627185204, "learning_rate": 1.3355654892456635e-05, "loss": 0.3011, "step": 1999 }, { "epoch": 1.2659279778393353, "grad_norm": 0.5310849377910202, "learning_rate": 1.3349081939284048e-05, "loss": 0.2974, "step": 2000 }, { "epoch": 1.2665611396913337, "grad_norm": 0.6211881285889058, "learning_rate": 1.3342507355983833e-05, "loss": 0.2853, "step": 2001 }, { "epoch": 1.267194301543332, "grad_norm": 0.48847010410531766, "learning_rate": 1.3335931145756098e-05, "loss": 0.3238, "step": 2002 }, { "epoch": 1.2678274633953304, "grad_norm": 0.5056639848423882, "learning_rate": 1.332935331180174e-05, "loss": 0.3098, "step": 2003 }, { "epoch": 1.2684606252473287, "grad_norm": 0.5469967441659502, "learning_rate": 1.332277385732244e-05, "loss": 0.3062, "step": 2004 }, { "epoch": 1.2690937870993273, "grad_norm": 0.5039239977659813, "learning_rate": 1.331619278552068e-05, "loss": 0.3039, "step": 2005 }, { "epoch": 1.2697269489513257, "grad_norm": 0.5356940301792565, "learning_rate": 1.3309610099599717e-05, "loss": 0.3063, "step": 2006 }, { "epoch": 1.270360110803324, "grad_norm": 0.5101376278353787, "learning_rate": 1.3303025802763598e-05, "loss": 0.2967, "step": 2007 }, { "epoch": 1.2709932726553226, "grad_norm": 0.7470602528734264, "learning_rate": 1.3296439898217158e-05, "loss": 0.3197, "step": 2008 }, { "epoch": 1.271626434507321, "grad_norm": 0.5619617136907031, "learning_rate": 1.328985238916601e-05, "loss": 0.301, "step": 2009 }, { "epoch": 1.2722595963593193, "grad_norm": 0.6645758869158365, "learning_rate": 1.3283263278816547e-05, "loss": 0.3004, "step": 2010 }, { "epoch": 1.2728927582113179, "grad_norm": 0.5213347824040034, "learning_rate": 1.3276672570375948e-05, "loss": 0.297, "step": 2011 }, { "epoch": 1.2735259200633162, "grad_norm": 0.5122740729621287, "learning_rate": 1.3270080267052164e-05, "loss": 0.3011, "step": 2012 }, { "epoch": 1.2741590819153146, "grad_norm": 0.510831923140091, "learning_rate": 1.3263486372053919e-05, "loss": 0.3155, "step": 2013 }, { "epoch": 1.2747922437673131, "grad_norm": 0.5853359871690026, "learning_rate": 1.3256890888590728e-05, "loss": 0.3254, "step": 2014 }, { "epoch": 1.2754254056193115, "grad_norm": 0.5797741982128813, "learning_rate": 1.3250293819872853e-05, "loss": 0.2908, "step": 2015 }, { "epoch": 1.2760585674713099, "grad_norm": 0.520609049694649, "learning_rate": 1.3243695169111354e-05, "loss": 0.305, "step": 2016 }, { "epoch": 1.2766917293233082, "grad_norm": 0.5508183896881301, "learning_rate": 1.3237094939518043e-05, "loss": 0.3307, "step": 2017 }, { "epoch": 1.2773248911753066, "grad_norm": 0.5337497824391709, "learning_rate": 1.323049313430551e-05, "loss": 0.3042, "step": 2018 }, { "epoch": 1.2779580530273051, "grad_norm": 0.5661239038962325, "learning_rate": 1.3223889756687107e-05, "loss": 0.3226, "step": 2019 }, { "epoch": 1.2785912148793035, "grad_norm": 0.540452910507493, "learning_rate": 1.3217284809876955e-05, "loss": 0.3171, "step": 2020 }, { "epoch": 1.2792243767313018, "grad_norm": 0.6001325963041826, "learning_rate": 1.3210678297089935e-05, "loss": 0.3089, "step": 2021 }, { "epoch": 1.2798575385833004, "grad_norm": 0.7050604115499063, "learning_rate": 1.3204070221541693e-05, "loss": 0.3202, "step": 2022 }, { "epoch": 1.2804907004352988, "grad_norm": 0.5040802351076826, "learning_rate": 1.3197460586448635e-05, "loss": 0.3061, "step": 2023 }, { "epoch": 1.2811238622872971, "grad_norm": 0.5148644626620136, "learning_rate": 1.3190849395027926e-05, "loss": 0.2989, "step": 2024 }, { "epoch": 1.2817570241392957, "grad_norm": 0.52276719559017, "learning_rate": 1.3184236650497488e-05, "loss": 0.3189, "step": 2025 }, { "epoch": 1.282390185991294, "grad_norm": 0.5743278129497319, "learning_rate": 1.3177622356076e-05, "loss": 0.3101, "step": 2026 }, { "epoch": 1.2830233478432924, "grad_norm": 0.6082337376928285, "learning_rate": 1.3171006514982889e-05, "loss": 0.3032, "step": 2027 }, { "epoch": 1.283656509695291, "grad_norm": 0.5451278860689441, "learning_rate": 1.3164389130438351e-05, "loss": 0.3009, "step": 2028 }, { "epoch": 1.2842896715472893, "grad_norm": 0.6130831580348542, "learning_rate": 1.3157770205663314e-05, "loss": 0.3022, "step": 2029 }, { "epoch": 1.2849228333992877, "grad_norm": 0.5011622193286174, "learning_rate": 1.315114974387947e-05, "loss": 0.3026, "step": 2030 }, { "epoch": 1.285555995251286, "grad_norm": 0.4964668439184743, "learning_rate": 1.314452774830925e-05, "loss": 0.3082, "step": 2031 }, { "epoch": 1.2861891571032844, "grad_norm": 0.5662110226478205, "learning_rate": 1.3137904222175837e-05, "loss": 0.2964, "step": 2032 }, { "epoch": 1.286822318955283, "grad_norm": 0.5388171145594792, "learning_rate": 1.3131279168703159e-05, "loss": 0.3194, "step": 2033 }, { "epoch": 1.2874554808072813, "grad_norm": 0.5043061879910501, "learning_rate": 1.3124652591115881e-05, "loss": 0.2968, "step": 2034 }, { "epoch": 1.2880886426592797, "grad_norm": 0.8466473022107489, "learning_rate": 1.3118024492639418e-05, "loss": 0.317, "step": 2035 }, { "epoch": 1.2887218045112783, "grad_norm": 0.5457501413609704, "learning_rate": 1.3111394876499919e-05, "loss": 0.2954, "step": 2036 }, { "epoch": 1.2893549663632766, "grad_norm": 0.48949307991118396, "learning_rate": 1.3104763745924279e-05, "loss": 0.2971, "step": 2037 }, { "epoch": 1.289988128215275, "grad_norm": 0.5958940799099451, "learning_rate": 1.309813110414012e-05, "loss": 0.3289, "step": 2038 }, { "epoch": 1.2906212900672736, "grad_norm": 0.4963703041939979, "learning_rate": 1.3091496954375807e-05, "loss": 0.3013, "step": 2039 }, { "epoch": 1.291254451919272, "grad_norm": 0.4833177354539366, "learning_rate": 1.308486129986044e-05, "loss": 0.3128, "step": 2040 }, { "epoch": 1.2918876137712703, "grad_norm": 0.48341631122140727, "learning_rate": 1.3078224143823848e-05, "loss": 0.3232, "step": 2041 }, { "epoch": 1.2925207756232688, "grad_norm": 0.5177438741324304, "learning_rate": 1.3071585489496585e-05, "loss": 0.3071, "step": 2042 }, { "epoch": 1.2931539374752672, "grad_norm": 0.5116361572123075, "learning_rate": 1.306494534010995e-05, "loss": 0.3088, "step": 2043 }, { "epoch": 1.2937870993272655, "grad_norm": 0.5439931159631366, "learning_rate": 1.3058303698895953e-05, "loss": 0.3081, "step": 2044 }, { "epoch": 1.294420261179264, "grad_norm": 0.5670811126157259, "learning_rate": 1.305166056908734e-05, "loss": 0.3051, "step": 2045 }, { "epoch": 1.2950534230312623, "grad_norm": 0.5185488584957247, "learning_rate": 1.3045015953917584e-05, "loss": 0.2915, "step": 2046 }, { "epoch": 1.2956865848832608, "grad_norm": 0.5106915695249135, "learning_rate": 1.3038369856620863e-05, "loss": 0.3028, "step": 2047 }, { "epoch": 1.2963197467352592, "grad_norm": 0.6085403778055183, "learning_rate": 1.3031722280432102e-05, "loss": 0.3172, "step": 2048 }, { "epoch": 1.2969529085872575, "grad_norm": 0.5595769510425291, "learning_rate": 1.3025073228586931e-05, "loss": 0.3026, "step": 2049 }, { "epoch": 1.2975860704392561, "grad_norm": 0.5030632222337882, "learning_rate": 1.3018422704321695e-05, "loss": 0.2955, "step": 2050 }, { "epoch": 1.2982192322912545, "grad_norm": 0.7392797541475464, "learning_rate": 1.3011770710873468e-05, "loss": 0.3179, "step": 2051 }, { "epoch": 1.2988523941432528, "grad_norm": 0.5170067822131723, "learning_rate": 1.3005117251480023e-05, "loss": 0.3068, "step": 2052 }, { "epoch": 1.2994855559952514, "grad_norm": 0.5184437533696546, "learning_rate": 1.2998462329379868e-05, "loss": 0.3346, "step": 2053 }, { "epoch": 1.3001187178472497, "grad_norm": 0.5030577940997448, "learning_rate": 1.2991805947812204e-05, "loss": 0.3112, "step": 2054 }, { "epoch": 1.300751879699248, "grad_norm": 0.47765923045603986, "learning_rate": 1.2985148110016947e-05, "loss": 0.3082, "step": 2055 }, { "epoch": 1.3013850415512465, "grad_norm": 0.520952198551959, "learning_rate": 1.2978488819234727e-05, "loss": 0.3051, "step": 2056 }, { "epoch": 1.302018203403245, "grad_norm": 0.5156304278704044, "learning_rate": 1.297182807870688e-05, "loss": 0.336, "step": 2057 }, { "epoch": 1.3026513652552434, "grad_norm": 0.522505149829915, "learning_rate": 1.2965165891675443e-05, "loss": 0.3137, "step": 2058 }, { "epoch": 1.3032845271072417, "grad_norm": 0.5118433043672135, "learning_rate": 1.2958502261383161e-05, "loss": 0.3076, "step": 2059 }, { "epoch": 1.30391768895924, "grad_norm": 0.5376843658253144, "learning_rate": 1.295183719107348e-05, "loss": 0.329, "step": 2060 }, { "epoch": 1.3045508508112387, "grad_norm": 0.5202176354966411, "learning_rate": 1.2945170683990549e-05, "loss": 0.326, "step": 2061 }, { "epoch": 1.305184012663237, "grad_norm": 0.526756617004494, "learning_rate": 1.2938502743379212e-05, "loss": 0.3157, "step": 2062 }, { "epoch": 1.3058171745152354, "grad_norm": 0.7490062944159325, "learning_rate": 1.2931833372485013e-05, "loss": 0.3005, "step": 2063 }, { "epoch": 1.306450336367234, "grad_norm": 0.5108234472135248, "learning_rate": 1.2925162574554197e-05, "loss": 0.3138, "step": 2064 }, { "epoch": 1.3070834982192323, "grad_norm": 0.6604242867402123, "learning_rate": 1.2918490352833693e-05, "loss": 0.3271, "step": 2065 }, { "epoch": 1.3077166600712307, "grad_norm": 0.5703230120269843, "learning_rate": 1.2911816710571134e-05, "loss": 0.2977, "step": 2066 }, { "epoch": 1.3083498219232292, "grad_norm": 0.5048160766502902, "learning_rate": 1.290514165101484e-05, "loss": 0.3005, "step": 2067 }, { "epoch": 1.3089829837752276, "grad_norm": 0.5354385984255657, "learning_rate": 1.2898465177413817e-05, "loss": 0.3094, "step": 2068 }, { "epoch": 1.309616145627226, "grad_norm": 0.5010288106116242, "learning_rate": 1.2891787293017765e-05, "loss": 0.2938, "step": 2069 }, { "epoch": 1.3102493074792243, "grad_norm": 0.5159764747445823, "learning_rate": 1.2885108001077068e-05, "loss": 0.3207, "step": 2070 }, { "epoch": 1.3108824693312229, "grad_norm": 0.5437130618637441, "learning_rate": 1.2878427304842796e-05, "loss": 0.3047, "step": 2071 }, { "epoch": 1.3115156311832212, "grad_norm": 0.5038117065915236, "learning_rate": 1.2871745207566702e-05, "loss": 0.3174, "step": 2072 }, { "epoch": 1.3121487930352196, "grad_norm": 0.5662636860443037, "learning_rate": 1.286506171250122e-05, "loss": 0.2947, "step": 2073 }, { "epoch": 1.312781954887218, "grad_norm": 0.48544982347124577, "learning_rate": 1.2858376822899466e-05, "loss": 0.3099, "step": 2074 }, { "epoch": 1.3134151167392165, "grad_norm": 0.5047605523364255, "learning_rate": 1.2851690542015235e-05, "loss": 0.3068, "step": 2075 }, { "epoch": 1.3140482785912149, "grad_norm": 0.4912806374260507, "learning_rate": 1.2845002873102992e-05, "loss": 0.3061, "step": 2076 }, { "epoch": 1.3146814404432132, "grad_norm": 0.5377523892353245, "learning_rate": 1.2838313819417894e-05, "loss": 0.3026, "step": 2077 }, { "epoch": 1.3153146022952118, "grad_norm": 0.5141094946077666, "learning_rate": 1.2831623384215756e-05, "loss": 0.2961, "step": 2078 }, { "epoch": 1.3159477641472102, "grad_norm": 0.5237037438050209, "learning_rate": 1.2824931570753071e-05, "loss": 0.3155, "step": 2079 }, { "epoch": 1.3165809259992085, "grad_norm": 0.5100807786506902, "learning_rate": 1.2818238382287009e-05, "loss": 0.3029, "step": 2080 }, { "epoch": 1.317214087851207, "grad_norm": 0.4906060045397783, "learning_rate": 1.2811543822075396e-05, "loss": 0.3025, "step": 2081 }, { "epoch": 1.3178472497032054, "grad_norm": 0.5406526017627172, "learning_rate": 1.2804847893376739e-05, "loss": 0.3106, "step": 2082 }, { "epoch": 1.3184804115552038, "grad_norm": 0.5097832985427191, "learning_rate": 1.2798150599450203e-05, "loss": 0.3109, "step": 2083 }, { "epoch": 1.3191135734072021, "grad_norm": 0.5292440496602397, "learning_rate": 1.279145194355562e-05, "loss": 0.3123, "step": 2084 }, { "epoch": 1.3197467352592005, "grad_norm": 0.5336557860904271, "learning_rate": 1.2784751928953486e-05, "loss": 0.3023, "step": 2085 }, { "epoch": 1.320379897111199, "grad_norm": 0.5449699809812125, "learning_rate": 1.2778050558904948e-05, "loss": 0.3154, "step": 2086 }, { "epoch": 1.3210130589631974, "grad_norm": 0.5148325934036395, "learning_rate": 1.2771347836671835e-05, "loss": 0.3096, "step": 2087 }, { "epoch": 1.3216462208151958, "grad_norm": 0.4842818535494918, "learning_rate": 1.2764643765516616e-05, "loss": 0.3073, "step": 2088 }, { "epoch": 1.3222793826671944, "grad_norm": 0.4980425886132198, "learning_rate": 1.2757938348702418e-05, "loss": 0.3087, "step": 2089 }, { "epoch": 1.3229125445191927, "grad_norm": 0.5092480200558666, "learning_rate": 1.2751231589493034e-05, "loss": 0.2929, "step": 2090 }, { "epoch": 1.323545706371191, "grad_norm": 0.5124478611127777, "learning_rate": 1.2744523491152896e-05, "loss": 0.2834, "step": 2091 }, { "epoch": 1.3241788682231896, "grad_norm": 0.5306603961949581, "learning_rate": 1.2737814056947096e-05, "loss": 0.3071, "step": 2092 }, { "epoch": 1.324812030075188, "grad_norm": 0.5386975021332497, "learning_rate": 1.2731103290141377e-05, "loss": 0.3115, "step": 2093 }, { "epoch": 1.3254451919271864, "grad_norm": 0.5378225159452267, "learning_rate": 1.2724391194002126e-05, "loss": 0.2981, "step": 2094 }, { "epoch": 1.326078353779185, "grad_norm": 0.5366475874431949, "learning_rate": 1.2717677771796385e-05, "loss": 0.3123, "step": 2095 }, { "epoch": 1.3267115156311833, "grad_norm": 0.5425308726687119, "learning_rate": 1.2710963026791828e-05, "loss": 0.2962, "step": 2096 }, { "epoch": 1.3273446774831816, "grad_norm": 0.5429504180451302, "learning_rate": 1.2704246962256788e-05, "loss": 0.3203, "step": 2097 }, { "epoch": 1.32797783933518, "grad_norm": 0.5156509425814237, "learning_rate": 1.2697529581460228e-05, "loss": 0.3012, "step": 2098 }, { "epoch": 1.3286110011871783, "grad_norm": 0.529330661056799, "learning_rate": 1.2690810887671763e-05, "loss": 0.3122, "step": 2099 }, { "epoch": 1.329244163039177, "grad_norm": 0.533872983376628, "learning_rate": 1.2684090884161636e-05, "loss": 0.3094, "step": 2100 }, { "epoch": 1.3298773248911753, "grad_norm": 0.49109820413955696, "learning_rate": 1.2677369574200733e-05, "loss": 0.3136, "step": 2101 }, { "epoch": 1.3305104867431736, "grad_norm": 0.540174863375538, "learning_rate": 1.2670646961060575e-05, "loss": 0.3078, "step": 2102 }, { "epoch": 1.3311436485951722, "grad_norm": 0.5526314788131246, "learning_rate": 1.266392304801332e-05, "loss": 0.3044, "step": 2103 }, { "epoch": 1.3317768104471706, "grad_norm": 0.530996770981021, "learning_rate": 1.2657197838331755e-05, "loss": 0.2984, "step": 2104 }, { "epoch": 1.332409972299169, "grad_norm": 0.5489020559247387, "learning_rate": 1.2650471335289297e-05, "loss": 0.3182, "step": 2105 }, { "epoch": 1.3330431341511675, "grad_norm": 0.5331187248321614, "learning_rate": 1.264374354216e-05, "loss": 0.3159, "step": 2106 }, { "epoch": 1.3336762960031658, "grad_norm": 0.5064494981276145, "learning_rate": 1.263701446221854e-05, "loss": 0.2798, "step": 2107 }, { "epoch": 1.3343094578551642, "grad_norm": 0.5049131418237175, "learning_rate": 1.2630284098740218e-05, "loss": 0.3151, "step": 2108 }, { "epoch": 1.3349426197071628, "grad_norm": 0.567845042562525, "learning_rate": 1.2623552455000962e-05, "loss": 0.2962, "step": 2109 }, { "epoch": 1.3355757815591611, "grad_norm": 0.516254494764179, "learning_rate": 1.2616819534277322e-05, "loss": 0.3023, "step": 2110 }, { "epoch": 1.3362089434111595, "grad_norm": 0.5395082830200709, "learning_rate": 1.2610085339846476e-05, "loss": 0.3031, "step": 2111 }, { "epoch": 1.3368421052631578, "grad_norm": 0.5128755714358751, "learning_rate": 1.2603349874986214e-05, "loss": 0.3264, "step": 2112 }, { "epoch": 1.3374752671151562, "grad_norm": 0.5139286197166235, "learning_rate": 1.2596613142974944e-05, "loss": 0.3182, "step": 2113 }, { "epoch": 1.3381084289671548, "grad_norm": 0.7242919303042885, "learning_rate": 1.2589875147091699e-05, "loss": 0.3384, "step": 2114 }, { "epoch": 1.3387415908191531, "grad_norm": 0.5822996640709417, "learning_rate": 1.2583135890616117e-05, "loss": 0.3003, "step": 2115 }, { "epoch": 1.3393747526711515, "grad_norm": 0.5046646107957791, "learning_rate": 1.2576395376828456e-05, "loss": 0.3186, "step": 2116 }, { "epoch": 1.34000791452315, "grad_norm": 0.5811335278449148, "learning_rate": 1.2569653609009588e-05, "loss": 0.3146, "step": 2117 }, { "epoch": 1.3406410763751484, "grad_norm": 0.536124138959363, "learning_rate": 1.256291059044099e-05, "loss": 0.304, "step": 2118 }, { "epoch": 1.3412742382271468, "grad_norm": 0.5622671190725099, "learning_rate": 1.2556166324404747e-05, "loss": 0.305, "step": 2119 }, { "epoch": 1.3419074000791453, "grad_norm": 0.5209942542653869, "learning_rate": 1.254942081418356e-05, "loss": 0.3053, "step": 2120 }, { "epoch": 1.3425405619311437, "grad_norm": 0.4930559926178725, "learning_rate": 1.2542674063060722e-05, "loss": 0.3188, "step": 2121 }, { "epoch": 1.343173723783142, "grad_norm": 0.5063401008694383, "learning_rate": 1.2535926074320144e-05, "loss": 0.3066, "step": 2122 }, { "epoch": 1.3438068856351406, "grad_norm": 0.5287055421719494, "learning_rate": 1.2529176851246328e-05, "loss": 0.3106, "step": 2123 }, { "epoch": 1.344440047487139, "grad_norm": 0.4917474634270442, "learning_rate": 1.2522426397124387e-05, "loss": 0.3193, "step": 2124 }, { "epoch": 1.3450732093391373, "grad_norm": 0.522287743927119, "learning_rate": 1.2515674715240023e-05, "loss": 0.2871, "step": 2125 }, { "epoch": 1.3457063711911357, "grad_norm": 0.5156386342369904, "learning_rate": 1.2508921808879541e-05, "loss": 0.3164, "step": 2126 }, { "epoch": 1.346339533043134, "grad_norm": 0.5438920129806385, "learning_rate": 1.2502167681329847e-05, "loss": 0.2895, "step": 2127 }, { "epoch": 1.3469726948951326, "grad_norm": 0.49679381515126986, "learning_rate": 1.249541233587843e-05, "loss": 0.3083, "step": 2128 }, { "epoch": 1.347605856747131, "grad_norm": 0.5869994500892063, "learning_rate": 1.2488655775813378e-05, "loss": 0.3053, "step": 2129 }, { "epoch": 1.3482390185991293, "grad_norm": 0.5277465053805417, "learning_rate": 1.2481898004423373e-05, "loss": 0.3046, "step": 2130 }, { "epoch": 1.348872180451128, "grad_norm": 0.49259643838402656, "learning_rate": 1.2475139024997684e-05, "loss": 0.3104, "step": 2131 }, { "epoch": 1.3495053423031262, "grad_norm": 0.5158441770974327, "learning_rate": 1.2468378840826165e-05, "loss": 0.3153, "step": 2132 }, { "epoch": 1.3501385041551246, "grad_norm": 0.5270032689308449, "learning_rate": 1.2461617455199259e-05, "loss": 0.3104, "step": 2133 }, { "epoch": 1.3507716660071232, "grad_norm": 0.5158746118815986, "learning_rate": 1.2454854871407993e-05, "loss": 0.3183, "step": 2134 }, { "epoch": 1.3514048278591215, "grad_norm": 0.5108338085008713, "learning_rate": 1.244809109274398e-05, "loss": 0.2926, "step": 2135 }, { "epoch": 1.3520379897111199, "grad_norm": 0.5171420156965572, "learning_rate": 1.2441326122499413e-05, "loss": 0.3012, "step": 2136 }, { "epoch": 1.3526711515631185, "grad_norm": 0.48585236895367084, "learning_rate": 1.2434559963967061e-05, "loss": 0.3015, "step": 2137 }, { "epoch": 1.3533043134151168, "grad_norm": 0.4949478599318545, "learning_rate": 1.242779262044028e-05, "loss": 0.3017, "step": 2138 }, { "epoch": 1.3539374752671152, "grad_norm": 0.5021012812952262, "learning_rate": 1.2421024095212992e-05, "loss": 0.305, "step": 2139 }, { "epoch": 1.3545706371191135, "grad_norm": 0.4928603402734078, "learning_rate": 1.2414254391579706e-05, "loss": 0.3173, "step": 2140 }, { "epoch": 1.3552037989711119, "grad_norm": 0.5251219995490337, "learning_rate": 1.2407483512835493e-05, "loss": 0.3213, "step": 2141 }, { "epoch": 1.3558369608231104, "grad_norm": 0.5360177972828049, "learning_rate": 1.2400711462276e-05, "loss": 0.3139, "step": 2142 }, { "epoch": 1.3564701226751088, "grad_norm": 0.5329929296091915, "learning_rate": 1.2393938243197454e-05, "loss": 0.3193, "step": 2143 }, { "epoch": 1.3571032845271072, "grad_norm": 0.4938935783655904, "learning_rate": 1.2387163858896636e-05, "loss": 0.3005, "step": 2144 }, { "epoch": 1.3577364463791057, "grad_norm": 0.515362817017569, "learning_rate": 1.2380388312670901e-05, "loss": 0.3073, "step": 2145 }, { "epoch": 1.358369608231104, "grad_norm": 0.5049544551894564, "learning_rate": 1.2373611607818173e-05, "loss": 0.3083, "step": 2146 }, { "epoch": 1.3590027700831024, "grad_norm": 0.539461745441947, "learning_rate": 1.2366833747636931e-05, "loss": 0.2907, "step": 2147 }, { "epoch": 1.359635931935101, "grad_norm": 0.5015767470473758, "learning_rate": 1.2360054735426226e-05, "loss": 0.315, "step": 2148 }, { "epoch": 1.3602690937870994, "grad_norm": 0.4984395601161067, "learning_rate": 1.2353274574485666e-05, "loss": 0.3147, "step": 2149 }, { "epoch": 1.3609022556390977, "grad_norm": 0.5277167036212743, "learning_rate": 1.2346493268115412e-05, "loss": 0.3141, "step": 2150 }, { "epoch": 1.361535417491096, "grad_norm": 0.490244155493904, "learning_rate": 1.233971081961619e-05, "loss": 0.3075, "step": 2151 }, { "epoch": 1.3621685793430947, "grad_norm": 0.4883835553773454, "learning_rate": 1.233292723228928e-05, "loss": 0.3078, "step": 2152 }, { "epoch": 1.362801741195093, "grad_norm": 0.5410600594458204, "learning_rate": 1.2326142509436521e-05, "loss": 0.3174, "step": 2153 }, { "epoch": 1.3634349030470914, "grad_norm": 0.48795131140053793, "learning_rate": 1.2319356654360295e-05, "loss": 0.3081, "step": 2154 }, { "epoch": 1.3640680648990897, "grad_norm": 0.5154423942028353, "learning_rate": 1.2312569670363535e-05, "loss": 0.2948, "step": 2155 }, { "epoch": 1.3647012267510883, "grad_norm": 0.5726226601958391, "learning_rate": 1.230578156074974e-05, "loss": 0.3042, "step": 2156 }, { "epoch": 1.3653343886030866, "grad_norm": 0.5369583091350484, "learning_rate": 1.2298992328822937e-05, "loss": 0.3151, "step": 2157 }, { "epoch": 1.365967550455085, "grad_norm": 0.5845938517745328, "learning_rate": 1.2292201977887708e-05, "loss": 0.3057, "step": 2158 }, { "epoch": 1.3666007123070836, "grad_norm": 0.4871519325444316, "learning_rate": 1.2285410511249185e-05, "loss": 0.3096, "step": 2159 }, { "epoch": 1.367233874159082, "grad_norm": 0.5771162173065699, "learning_rate": 1.2278617932213031e-05, "loss": 0.283, "step": 2160 }, { "epoch": 1.3678670360110803, "grad_norm": 0.5441012643473975, "learning_rate": 1.227182424408546e-05, "loss": 0.2963, "step": 2161 }, { "epoch": 1.3685001978630789, "grad_norm": 0.6264210625872687, "learning_rate": 1.2265029450173222e-05, "loss": 0.3034, "step": 2162 }, { "epoch": 1.3691333597150772, "grad_norm": 0.5459211121254169, "learning_rate": 1.2258233553783604e-05, "loss": 0.3153, "step": 2163 }, { "epoch": 1.3697665215670756, "grad_norm": 0.5374879566695552, "learning_rate": 1.2251436558224436e-05, "loss": 0.3087, "step": 2164 }, { "epoch": 1.370399683419074, "grad_norm": 0.5117818864476469, "learning_rate": 1.2244638466804072e-05, "loss": 0.3018, "step": 2165 }, { "epoch": 1.3710328452710725, "grad_norm": 0.7136107710394957, "learning_rate": 1.2237839282831411e-05, "loss": 0.2935, "step": 2166 }, { "epoch": 1.3716660071230709, "grad_norm": 0.4977569930512717, "learning_rate": 1.223103900961588e-05, "loss": 0.2937, "step": 2167 }, { "epoch": 1.3722991689750692, "grad_norm": 0.5175299822724457, "learning_rate": 1.2224237650467427e-05, "loss": 0.3263, "step": 2168 }, { "epoch": 1.3729323308270676, "grad_norm": 0.5721854971415898, "learning_rate": 1.2217435208696545e-05, "loss": 0.2961, "step": 2169 }, { "epoch": 1.3735654926790661, "grad_norm": 0.49369749511011296, "learning_rate": 1.2210631687614241e-05, "loss": 0.3245, "step": 2170 }, { "epoch": 1.3741986545310645, "grad_norm": 0.48083429173492653, "learning_rate": 1.220382709053205e-05, "loss": 0.306, "step": 2171 }, { "epoch": 1.3748318163830628, "grad_norm": 0.5153630634261792, "learning_rate": 1.2197021420762037e-05, "loss": 0.3208, "step": 2172 }, { "epoch": 1.3754649782350614, "grad_norm": 0.5088898941738057, "learning_rate": 1.2190214681616778e-05, "loss": 0.3096, "step": 2173 }, { "epoch": 1.3760981400870598, "grad_norm": 0.5279409647113117, "learning_rate": 1.2183406876409385e-05, "loss": 0.3086, "step": 2174 }, { "epoch": 1.3767313019390581, "grad_norm": 0.7797120905205243, "learning_rate": 1.2176598008453471e-05, "loss": 0.3311, "step": 2175 }, { "epoch": 1.3773644637910567, "grad_norm": 0.4902159548948818, "learning_rate": 1.2169788081063181e-05, "loss": 0.2895, "step": 2176 }, { "epoch": 1.377997625643055, "grad_norm": 0.5044386955303853, "learning_rate": 1.2162977097553168e-05, "loss": 0.3035, "step": 2177 }, { "epoch": 1.3786307874950534, "grad_norm": 0.5332480987235597, "learning_rate": 1.2156165061238603e-05, "loss": 0.3108, "step": 2178 }, { "epoch": 1.3792639493470518, "grad_norm": 0.5679342577011992, "learning_rate": 1.2149351975435165e-05, "loss": 0.3077, "step": 2179 }, { "epoch": 1.3798971111990501, "grad_norm": 0.4982328501003848, "learning_rate": 1.2142537843459046e-05, "loss": 0.2977, "step": 2180 }, { "epoch": 1.3805302730510487, "grad_norm": 0.532170205632394, "learning_rate": 1.2135722668626946e-05, "loss": 0.2918, "step": 2181 }, { "epoch": 1.381163434903047, "grad_norm": 0.5186561998717973, "learning_rate": 1.2128906454256078e-05, "loss": 0.2957, "step": 2182 }, { "epoch": 1.3817965967550454, "grad_norm": 0.47526150507665826, "learning_rate": 1.2122089203664156e-05, "loss": 0.3078, "step": 2183 }, { "epoch": 1.382429758607044, "grad_norm": 0.5347989408332311, "learning_rate": 1.2115270920169396e-05, "loss": 0.3003, "step": 2184 }, { "epoch": 1.3830629204590423, "grad_norm": 0.5319457510136246, "learning_rate": 1.2108451607090521e-05, "loss": 0.3211, "step": 2185 }, { "epoch": 1.3836960823110407, "grad_norm": 0.491790119886463, "learning_rate": 1.2101631267746758e-05, "loss": 0.3247, "step": 2186 }, { "epoch": 1.3843292441630393, "grad_norm": 0.5325226258018615, "learning_rate": 1.2094809905457827e-05, "loss": 0.3221, "step": 2187 }, { "epoch": 1.3849624060150376, "grad_norm": 0.6559164399721211, "learning_rate": 1.2087987523543951e-05, "loss": 0.3005, "step": 2188 }, { "epoch": 1.385595567867036, "grad_norm": 0.5126079360077832, "learning_rate": 1.2081164125325842e-05, "loss": 0.3151, "step": 2189 }, { "epoch": 1.3862287297190345, "grad_norm": 0.5814289890283406, "learning_rate": 1.2074339714124718e-05, "loss": 0.296, "step": 2190 }, { "epoch": 1.386861891571033, "grad_norm": 0.5259015549713274, "learning_rate": 1.206751429326228e-05, "loss": 0.3092, "step": 2191 }, { "epoch": 1.3874950534230313, "grad_norm": 0.5110434508088889, "learning_rate": 1.2060687866060723e-05, "loss": 0.3094, "step": 2192 }, { "epoch": 1.3881282152750296, "grad_norm": 0.5361150082683287, "learning_rate": 1.2053860435842736e-05, "loss": 0.318, "step": 2193 }, { "epoch": 1.388761377127028, "grad_norm": 0.4889038807887889, "learning_rate": 1.204703200593149e-05, "loss": 0.3051, "step": 2194 }, { "epoch": 1.3893945389790265, "grad_norm": 0.5447205712195846, "learning_rate": 1.2040202579650649e-05, "loss": 0.3045, "step": 2195 }, { "epoch": 1.390027700831025, "grad_norm": 0.5380806614914077, "learning_rate": 1.2033372160324354e-05, "loss": 0.2849, "step": 2196 }, { "epoch": 1.3906608626830232, "grad_norm": 0.5309656122387654, "learning_rate": 1.202654075127724e-05, "loss": 0.2933, "step": 2197 }, { "epoch": 1.3912940245350218, "grad_norm": 0.5599308168184097, "learning_rate": 1.2019708355834416e-05, "loss": 0.3162, "step": 2198 }, { "epoch": 1.3919271863870202, "grad_norm": 0.5460923038310861, "learning_rate": 1.2012874977321474e-05, "loss": 0.3086, "step": 2199 }, { "epoch": 1.3925603482390185, "grad_norm": 0.5203234593298743, "learning_rate": 1.2006040619064476e-05, "loss": 0.2873, "step": 2200 }, { "epoch": 1.393193510091017, "grad_norm": 4.77460943071337, "learning_rate": 1.199920528438998e-05, "loss": 0.2961, "step": 2201 }, { "epoch": 1.3938266719430155, "grad_norm": 0.6542134868555622, "learning_rate": 1.1992368976624995e-05, "loss": 0.3124, "step": 2202 }, { "epoch": 1.3944598337950138, "grad_norm": 0.5146492700995893, "learning_rate": 1.1985531699097029e-05, "loss": 0.2956, "step": 2203 }, { "epoch": 1.3950929956470124, "grad_norm": 0.5816407159930638, "learning_rate": 1.1978693455134037e-05, "loss": 0.2916, "step": 2204 }, { "epoch": 1.3957261574990107, "grad_norm": 0.5133813105971943, "learning_rate": 1.1971854248064463e-05, "loss": 0.3182, "step": 2205 }, { "epoch": 1.396359319351009, "grad_norm": 0.5297381167006124, "learning_rate": 1.1965014081217217e-05, "loss": 0.3009, "step": 2206 }, { "epoch": 1.3969924812030075, "grad_norm": 0.9208264933067604, "learning_rate": 1.1958172957921668e-05, "loss": 0.2895, "step": 2207 }, { "epoch": 1.3976256430550058, "grad_norm": 0.5120706698142755, "learning_rate": 1.1951330881507653e-05, "loss": 0.2971, "step": 2208 }, { "epoch": 1.3982588049070044, "grad_norm": 0.5092805670007917, "learning_rate": 1.1944487855305478e-05, "loss": 0.2968, "step": 2209 }, { "epoch": 1.3988919667590027, "grad_norm": 0.5722197953374303, "learning_rate": 1.1937643882645909e-05, "loss": 0.2945, "step": 2210 }, { "epoch": 1.399525128611001, "grad_norm": 0.5624069154786089, "learning_rate": 1.1930798966860171e-05, "loss": 0.2942, "step": 2211 }, { "epoch": 1.4001582904629997, "grad_norm": 0.5119861899511939, "learning_rate": 1.192395311127995e-05, "loss": 0.3099, "step": 2212 }, { "epoch": 1.400791452314998, "grad_norm": 0.5321138425583972, "learning_rate": 1.1917106319237386e-05, "loss": 0.3196, "step": 2213 }, { "epoch": 1.4014246141669964, "grad_norm": 0.552498357593841, "learning_rate": 1.1910258594065079e-05, "loss": 0.315, "step": 2214 }, { "epoch": 1.402057776018995, "grad_norm": 0.511523643280222, "learning_rate": 1.1903409939096082e-05, "loss": 0.313, "step": 2215 }, { "epoch": 1.4026909378709933, "grad_norm": 0.5159580451961597, "learning_rate": 1.1896560357663898e-05, "loss": 0.3189, "step": 2216 }, { "epoch": 1.4033240997229917, "grad_norm": 0.5191395034146692, "learning_rate": 1.188970985310249e-05, "loss": 0.3007, "step": 2217 }, { "epoch": 1.4039572615749902, "grad_norm": 0.6764725406915285, "learning_rate": 1.1882858428746251e-05, "loss": 0.3232, "step": 2218 }, { "epoch": 1.4045904234269886, "grad_norm": 0.5500466190007467, "learning_rate": 1.1876006087930046e-05, "loss": 0.2997, "step": 2219 }, { "epoch": 1.405223585278987, "grad_norm": 0.5116508513944666, "learning_rate": 1.1869152833989165e-05, "loss": 0.3088, "step": 2220 }, { "epoch": 1.4058567471309853, "grad_norm": 0.5329290060923375, "learning_rate": 1.1862298670259358e-05, "loss": 0.2944, "step": 2221 }, { "epoch": 1.4064899089829836, "grad_norm": 0.4807002000116782, "learning_rate": 1.185544360007681e-05, "loss": 0.2901, "step": 2222 }, { "epoch": 1.4071230708349822, "grad_norm": 0.5013858088844957, "learning_rate": 1.1848587626778145e-05, "loss": 0.3075, "step": 2223 }, { "epoch": 1.4077562326869806, "grad_norm": 0.5476622832809722, "learning_rate": 1.1841730753700434e-05, "loss": 0.3109, "step": 2224 }, { "epoch": 1.408389394538979, "grad_norm": 0.5219965945505398, "learning_rate": 1.183487298418118e-05, "loss": 0.2961, "step": 2225 }, { "epoch": 1.4090225563909775, "grad_norm": 0.5243432265174681, "learning_rate": 1.1828014321558328e-05, "loss": 0.3063, "step": 2226 }, { "epoch": 1.4096557182429759, "grad_norm": 0.5861210583902836, "learning_rate": 1.182115476917025e-05, "loss": 0.3213, "step": 2227 }, { "epoch": 1.4102888800949742, "grad_norm": 0.6149925344839572, "learning_rate": 1.1814294330355758e-05, "loss": 0.3054, "step": 2228 }, { "epoch": 1.4109220419469728, "grad_norm": 0.5130509018048842, "learning_rate": 1.180743300845409e-05, "loss": 0.314, "step": 2229 }, { "epoch": 1.4115552037989711, "grad_norm": 0.569101503770727, "learning_rate": 1.1800570806804921e-05, "loss": 0.3101, "step": 2230 }, { "epoch": 1.4121883656509695, "grad_norm": 0.5284002809469651, "learning_rate": 1.1793707728748345e-05, "loss": 0.3032, "step": 2231 }, { "epoch": 1.412821527502968, "grad_norm": 0.5150817663301451, "learning_rate": 1.178684377762489e-05, "loss": 0.2996, "step": 2232 }, { "epoch": 1.4134546893549664, "grad_norm": 0.5781054128663621, "learning_rate": 1.1779978956775507e-05, "loss": 0.3083, "step": 2233 }, { "epoch": 1.4140878512069648, "grad_norm": 0.5009287228736811, "learning_rate": 1.1773113269541562e-05, "loss": 0.3301, "step": 2234 }, { "epoch": 1.4147210130589631, "grad_norm": 0.5062440336252122, "learning_rate": 1.1766246719264863e-05, "loss": 0.3073, "step": 2235 }, { "epoch": 1.4153541749109615, "grad_norm": 0.550095270235638, "learning_rate": 1.1759379309287617e-05, "loss": 0.3114, "step": 2236 }, { "epoch": 1.41598733676296, "grad_norm": 0.5209670385881033, "learning_rate": 1.175251104295246e-05, "loss": 0.302, "step": 2237 }, { "epoch": 1.4166204986149584, "grad_norm": 0.4714516076175211, "learning_rate": 1.1745641923602444e-05, "loss": 0.3149, "step": 2238 }, { "epoch": 1.4172536604669568, "grad_norm": 0.5003399117969153, "learning_rate": 1.1738771954581031e-05, "loss": 0.3099, "step": 2239 }, { "epoch": 1.4178868223189554, "grad_norm": 0.4996765299974471, "learning_rate": 1.1731901139232104e-05, "loss": 0.2951, "step": 2240 }, { "epoch": 1.4185199841709537, "grad_norm": 0.4951325921722887, "learning_rate": 1.1725029480899957e-05, "loss": 0.3061, "step": 2241 }, { "epoch": 1.419153146022952, "grad_norm": 0.4974636575421966, "learning_rate": 1.1718156982929282e-05, "loss": 0.3161, "step": 2242 }, { "epoch": 1.4197863078749506, "grad_norm": 0.5206799376204339, "learning_rate": 1.17112836486652e-05, "loss": 0.2904, "step": 2243 }, { "epoch": 1.420419469726949, "grad_norm": 0.5572339143312107, "learning_rate": 1.1704409481453221e-05, "loss": 0.3001, "step": 2244 }, { "epoch": 1.4210526315789473, "grad_norm": 0.5030995837125919, "learning_rate": 1.169753448463927e-05, "loss": 0.3093, "step": 2245 }, { "epoch": 1.421685793430946, "grad_norm": 0.4851731104033203, "learning_rate": 1.1690658661569678e-05, "loss": 0.3099, "step": 2246 }, { "epoch": 1.4223189552829443, "grad_norm": 0.500701235381827, "learning_rate": 1.1683782015591167e-05, "loss": 0.3235, "step": 2247 }, { "epoch": 1.4229521171349426, "grad_norm": 0.5233944628037964, "learning_rate": 1.1676904550050874e-05, "loss": 0.3097, "step": 2248 }, { "epoch": 1.423585278986941, "grad_norm": 0.5102308269088879, "learning_rate": 1.167002626829632e-05, "loss": 0.3041, "step": 2249 }, { "epoch": 1.4242184408389393, "grad_norm": 0.5412025135567805, "learning_rate": 1.1663147173675433e-05, "loss": 0.3206, "step": 2250 }, { "epoch": 1.424851602690938, "grad_norm": 0.5529517822716626, "learning_rate": 1.1656267269536536e-05, "loss": 0.3003, "step": 2251 }, { "epoch": 1.4254847645429363, "grad_norm": 0.5876788548077253, "learning_rate": 1.1649386559228342e-05, "loss": 0.3039, "step": 2252 }, { "epoch": 1.4261179263949346, "grad_norm": 0.5478568852053406, "learning_rate": 1.164250504609996e-05, "loss": 0.3053, "step": 2253 }, { "epoch": 1.4267510882469332, "grad_norm": 0.5146628876403878, "learning_rate": 1.1635622733500889e-05, "loss": 0.305, "step": 2254 }, { "epoch": 1.4273842500989316, "grad_norm": 0.5955710447225966, "learning_rate": 1.1628739624781013e-05, "loss": 0.3034, "step": 2255 }, { "epoch": 1.42801741195093, "grad_norm": 0.47797578448248434, "learning_rate": 1.1621855723290614e-05, "loss": 0.3001, "step": 2256 }, { "epoch": 1.4286505738029285, "grad_norm": 0.5022546839221843, "learning_rate": 1.1614971032380347e-05, "loss": 0.3054, "step": 2257 }, { "epoch": 1.4292837356549268, "grad_norm": 0.5117808347108539, "learning_rate": 1.1608085555401256e-05, "loss": 0.2981, "step": 2258 }, { "epoch": 1.4299168975069252, "grad_norm": 0.512512316111564, "learning_rate": 1.1601199295704774e-05, "loss": 0.3029, "step": 2259 }, { "epoch": 1.4305500593589235, "grad_norm": 0.5313024104563739, "learning_rate": 1.1594312256642709e-05, "loss": 0.2958, "step": 2260 }, { "epoch": 1.4311832212109221, "grad_norm": 0.5307881034632446, "learning_rate": 1.1587424441567245e-05, "loss": 0.3015, "step": 2261 }, { "epoch": 1.4318163830629205, "grad_norm": 0.46939503085740414, "learning_rate": 1.1580535853830952e-05, "loss": 0.3365, "step": 2262 }, { "epoch": 1.4324495449149188, "grad_norm": 0.5803271695475768, "learning_rate": 1.1573646496786766e-05, "loss": 0.3059, "step": 2263 }, { "epoch": 1.4330827067669172, "grad_norm": 0.5528654908126317, "learning_rate": 1.156675637378801e-05, "loss": 0.3054, "step": 2264 }, { "epoch": 1.4337158686189158, "grad_norm": 0.4690641660061206, "learning_rate": 1.1559865488188373e-05, "loss": 0.2976, "step": 2265 }, { "epoch": 1.434349030470914, "grad_norm": 0.48637043943781194, "learning_rate": 1.155297384334191e-05, "loss": 0.3142, "step": 2266 }, { "epoch": 1.4349821923229125, "grad_norm": 0.4934607703941815, "learning_rate": 1.1546081442603056e-05, "loss": 0.2904, "step": 2267 }, { "epoch": 1.435615354174911, "grad_norm": 0.5756419590405887, "learning_rate": 1.1539188289326604e-05, "loss": 0.3104, "step": 2268 }, { "epoch": 1.4362485160269094, "grad_norm": 0.563098514274801, "learning_rate": 1.1532294386867725e-05, "loss": 0.3148, "step": 2269 }, { "epoch": 1.4368816778789077, "grad_norm": 0.5644837296392526, "learning_rate": 1.152539973858194e-05, "loss": 0.3108, "step": 2270 }, { "epoch": 1.4375148397309063, "grad_norm": 0.5615035631972248, "learning_rate": 1.1518504347825146e-05, "loss": 0.3144, "step": 2271 }, { "epoch": 1.4381480015829047, "grad_norm": 0.5421783384365187, "learning_rate": 1.1511608217953595e-05, "loss": 0.2999, "step": 2272 }, { "epoch": 1.438781163434903, "grad_norm": 0.5700945860576938, "learning_rate": 1.1504711352323898e-05, "loss": 0.2979, "step": 2273 }, { "epoch": 1.4394143252869014, "grad_norm": 0.47751569290496043, "learning_rate": 1.1497813754293028e-05, "loss": 0.3038, "step": 2274 }, { "epoch": 1.4400474871389, "grad_norm": 0.5799004824856543, "learning_rate": 1.1490915427218314e-05, "loss": 0.3234, "step": 2275 }, { "epoch": 1.4406806489908983, "grad_norm": 0.5458473670068389, "learning_rate": 1.1484016374457434e-05, "loss": 0.2934, "step": 2276 }, { "epoch": 1.4413138108428967, "grad_norm": 0.475127206989108, "learning_rate": 1.147711659936843e-05, "loss": 0.2976, "step": 2277 }, { "epoch": 1.441946972694895, "grad_norm": 0.5630562113909174, "learning_rate": 1.1470216105309683e-05, "loss": 0.2963, "step": 2278 }, { "epoch": 1.4425801345468936, "grad_norm": 0.5421397662440371, "learning_rate": 1.1463314895639932e-05, "loss": 0.3222, "step": 2279 }, { "epoch": 1.443213296398892, "grad_norm": 0.5134156189506656, "learning_rate": 1.1456412973718265e-05, "loss": 0.2997, "step": 2280 }, { "epoch": 1.4438464582508903, "grad_norm": 0.5186938819630209, "learning_rate": 1.1449510342904111e-05, "loss": 0.3028, "step": 2281 }, { "epoch": 1.4444796201028889, "grad_norm": 0.5005457062115097, "learning_rate": 1.144260700655725e-05, "loss": 0.3029, "step": 2282 }, { "epoch": 1.4451127819548872, "grad_norm": 0.4933615740734623, "learning_rate": 1.1435702968037799e-05, "loss": 0.3061, "step": 2283 }, { "epoch": 1.4457459438068856, "grad_norm": 0.4855056388792427, "learning_rate": 1.1428798230706222e-05, "loss": 0.326, "step": 2284 }, { "epoch": 1.4463791056588842, "grad_norm": 0.5732400132235127, "learning_rate": 1.1421892797923327e-05, "loss": 0.3116, "step": 2285 }, { "epoch": 1.4470122675108825, "grad_norm": 0.5120153078584387, "learning_rate": 1.1414986673050245e-05, "loss": 0.3167, "step": 2286 }, { "epoch": 1.4476454293628809, "grad_norm": 0.5028695144854811, "learning_rate": 1.1408079859448462e-05, "loss": 0.3181, "step": 2287 }, { "epoch": 1.4482785912148792, "grad_norm": 0.519359552306337, "learning_rate": 1.1401172360479789e-05, "loss": 0.3036, "step": 2288 }, { "epoch": 1.4489117530668776, "grad_norm": 0.5044995305074104, "learning_rate": 1.1394264179506367e-05, "loss": 0.31, "step": 2289 }, { "epoch": 1.4495449149188762, "grad_norm": 0.4808889736930479, "learning_rate": 1.1387355319890685e-05, "loss": 0.3114, "step": 2290 }, { "epoch": 1.4501780767708745, "grad_norm": 0.49302330154625235, "learning_rate": 1.1380445784995544e-05, "loss": 0.2996, "step": 2291 }, { "epoch": 1.4508112386228729, "grad_norm": 0.5044146362368477, "learning_rate": 1.1373535578184083e-05, "loss": 0.3055, "step": 2292 }, { "epoch": 1.4514444004748714, "grad_norm": 0.4912341375766709, "learning_rate": 1.1366624702819768e-05, "loss": 0.2928, "step": 2293 }, { "epoch": 1.4520775623268698, "grad_norm": 0.5227326978378124, "learning_rate": 1.1359713162266393e-05, "loss": 0.3, "step": 2294 }, { "epoch": 1.4527107241788682, "grad_norm": 0.5746751001637284, "learning_rate": 1.1352800959888064e-05, "loss": 0.3077, "step": 2295 }, { "epoch": 1.4533438860308667, "grad_norm": 0.5242523660567165, "learning_rate": 1.1345888099049222e-05, "loss": 0.2914, "step": 2296 }, { "epoch": 1.453977047882865, "grad_norm": 0.5778534607850676, "learning_rate": 1.1338974583114623e-05, "loss": 0.2981, "step": 2297 }, { "epoch": 1.4546102097348634, "grad_norm": 0.5611384286609097, "learning_rate": 1.1332060415449344e-05, "loss": 0.2948, "step": 2298 }, { "epoch": 1.455243371586862, "grad_norm": 0.5312805114011852, "learning_rate": 1.1325145599418775e-05, "loss": 0.2882, "step": 2299 }, { "epoch": 1.4558765334388604, "grad_norm": 0.5690246829825721, "learning_rate": 1.1318230138388624e-05, "loss": 0.3078, "step": 2300 }, { "epoch": 1.4565096952908587, "grad_norm": 0.5789326364667462, "learning_rate": 1.1311314035724913e-05, "loss": 0.3213, "step": 2301 }, { "epoch": 1.457142857142857, "grad_norm": 0.5224156704303662, "learning_rate": 1.1304397294793978e-05, "loss": 0.2794, "step": 2302 }, { "epoch": 1.4577760189948554, "grad_norm": 0.5877094829425575, "learning_rate": 1.1297479918962466e-05, "loss": 0.3147, "step": 2303 }, { "epoch": 1.458409180846854, "grad_norm": 0.5439730437191946, "learning_rate": 1.1290561911597329e-05, "loss": 0.2846, "step": 2304 }, { "epoch": 1.4590423426988524, "grad_norm": 0.6174371958037435, "learning_rate": 1.1283643276065826e-05, "loss": 0.3022, "step": 2305 }, { "epoch": 1.4596755045508507, "grad_norm": 0.587258682509549, "learning_rate": 1.1276724015735527e-05, "loss": 0.3106, "step": 2306 }, { "epoch": 1.4603086664028493, "grad_norm": 0.5875617773856048, "learning_rate": 1.1269804133974306e-05, "loss": 0.3136, "step": 2307 }, { "epoch": 1.4609418282548476, "grad_norm": 0.9601671682007903, "learning_rate": 1.1262883634150332e-05, "loss": 0.2959, "step": 2308 }, { "epoch": 1.461574990106846, "grad_norm": 0.4988670054872239, "learning_rate": 1.1255962519632082e-05, "loss": 0.2975, "step": 2309 }, { "epoch": 1.4622081519588446, "grad_norm": 0.4859628218296625, "learning_rate": 1.124904079378833e-05, "loss": 0.3079, "step": 2310 }, { "epoch": 1.462841313810843, "grad_norm": 0.5674332721755223, "learning_rate": 1.124211845998815e-05, "loss": 0.2956, "step": 2311 }, { "epoch": 1.4634744756628413, "grad_norm": 0.49330920120368593, "learning_rate": 1.1235195521600904e-05, "loss": 0.3102, "step": 2312 }, { "epoch": 1.4641076375148399, "grad_norm": 0.5305332455020654, "learning_rate": 1.1228271981996253e-05, "loss": 0.3005, "step": 2313 }, { "epoch": 1.4647407993668382, "grad_norm": 0.4922812928533072, "learning_rate": 1.1221347844544158e-05, "loss": 0.309, "step": 2314 }, { "epoch": 1.4653739612188366, "grad_norm": 0.5049144459947655, "learning_rate": 1.1214423112614865e-05, "loss": 0.3164, "step": 2315 }, { "epoch": 1.466007123070835, "grad_norm": 0.5814051745875654, "learning_rate": 1.12074977895789e-05, "loss": 0.3127, "step": 2316 }, { "epoch": 1.4666402849228333, "grad_norm": 0.5162137052127854, "learning_rate": 1.1200571878807094e-05, "loss": 0.2959, "step": 2317 }, { "epoch": 1.4672734467748318, "grad_norm": 0.5826199316100331, "learning_rate": 1.1193645383670548e-05, "loss": 0.3031, "step": 2318 }, { "epoch": 1.4679066086268302, "grad_norm": 0.47371057254081017, "learning_rate": 1.118671830754066e-05, "loss": 0.314, "step": 2319 }, { "epoch": 1.4685397704788286, "grad_norm": 0.5199769732475947, "learning_rate": 1.1179790653789106e-05, "loss": 0.3046, "step": 2320 }, { "epoch": 1.4691729323308271, "grad_norm": 0.5369176830540715, "learning_rate": 1.1172862425787839e-05, "loss": 0.3131, "step": 2321 }, { "epoch": 1.4698060941828255, "grad_norm": 0.6674880231954912, "learning_rate": 1.1165933626909096e-05, "loss": 0.3068, "step": 2322 }, { "epoch": 1.4704392560348238, "grad_norm": 0.4905382793240836, "learning_rate": 1.1159004260525395e-05, "loss": 0.3112, "step": 2323 }, { "epoch": 1.4710724178868224, "grad_norm": 0.5167845030423941, "learning_rate": 1.1152074330009522e-05, "loss": 0.2977, "step": 2324 }, { "epoch": 1.4717055797388208, "grad_norm": 0.5402425403038791, "learning_rate": 1.1145143838734547e-05, "loss": 0.3069, "step": 2325 }, { "epoch": 1.4723387415908191, "grad_norm": 0.4908572198007735, "learning_rate": 1.11382127900738e-05, "loss": 0.3227, "step": 2326 }, { "epoch": 1.4729719034428177, "grad_norm": 0.5354704009586886, "learning_rate": 1.1131281187400902e-05, "loss": 0.2991, "step": 2327 }, { "epoch": 1.473605065294816, "grad_norm": 0.5150280498042916, "learning_rate": 1.1124349034089724e-05, "loss": 0.316, "step": 2328 }, { "epoch": 1.4742382271468144, "grad_norm": 0.5437170121896965, "learning_rate": 1.111741633351441e-05, "loss": 0.3011, "step": 2329 }, { "epoch": 1.4748713889988128, "grad_norm": 0.509772446213831, "learning_rate": 1.1110483089049382e-05, "loss": 0.3082, "step": 2330 }, { "epoch": 1.4755045508508111, "grad_norm": 0.5211908173455077, "learning_rate": 1.1103549304069309e-05, "loss": 0.288, "step": 2331 }, { "epoch": 1.4761377127028097, "grad_norm": 0.4810455809563629, "learning_rate": 1.109661498194914e-05, "loss": 0.3241, "step": 2332 }, { "epoch": 1.476770874554808, "grad_norm": 0.47382415127949384, "learning_rate": 1.1089680126064075e-05, "loss": 0.3149, "step": 2333 }, { "epoch": 1.4774040364068064, "grad_norm": 0.503874249117806, "learning_rate": 1.1082744739789576e-05, "loss": 0.3074, "step": 2334 }, { "epoch": 1.478037198258805, "grad_norm": 0.6493124742936199, "learning_rate": 1.107580882650136e-05, "loss": 0.2975, "step": 2335 }, { "epoch": 1.4786703601108033, "grad_norm": 0.4903750841447042, "learning_rate": 1.106887238957541e-05, "loss": 0.3015, "step": 2336 }, { "epoch": 1.4793035219628017, "grad_norm": 0.47480428843956585, "learning_rate": 1.1061935432387953e-05, "loss": 0.3029, "step": 2337 }, { "epoch": 1.4799366838148003, "grad_norm": 0.5118206181407761, "learning_rate": 1.1054997958315479e-05, "loss": 0.3108, "step": 2338 }, { "epoch": 1.4805698456667986, "grad_norm": 0.5379699926815201, "learning_rate": 1.1048059970734724e-05, "loss": 0.3027, "step": 2339 }, { "epoch": 1.481203007518797, "grad_norm": 0.5079087233744595, "learning_rate": 1.1041121473022669e-05, "loss": 0.2997, "step": 2340 }, { "epoch": 1.4818361693707955, "grad_norm": 0.4989709475843989, "learning_rate": 1.1034182468556555e-05, "loss": 0.3077, "step": 2341 }, { "epoch": 1.482469331222794, "grad_norm": 0.5126652063745442, "learning_rate": 1.1027242960713859e-05, "loss": 0.3106, "step": 2342 }, { "epoch": 1.4831024930747922, "grad_norm": 0.5241315288603835, "learning_rate": 1.1020302952872313e-05, "loss": 0.3147, "step": 2343 }, { "epoch": 1.4837356549267906, "grad_norm": 0.5032235598944814, "learning_rate": 1.1013362448409884e-05, "loss": 0.3126, "step": 2344 }, { "epoch": 1.484368816778789, "grad_norm": 0.5512008407501351, "learning_rate": 1.1006421450704782e-05, "loss": 0.3146, "step": 2345 }, { "epoch": 1.4850019786307875, "grad_norm": 0.5470287559587713, "learning_rate": 1.0999479963135465e-05, "loss": 0.3018, "step": 2346 }, { "epoch": 1.4856351404827859, "grad_norm": 0.5284049405727667, "learning_rate": 1.0992537989080618e-05, "loss": 0.3222, "step": 2347 }, { "epoch": 1.4862683023347842, "grad_norm": 0.5080714783312925, "learning_rate": 1.0985595531919169e-05, "loss": 0.3046, "step": 2348 }, { "epoch": 1.4869014641867828, "grad_norm": 0.542560609383286, "learning_rate": 1.0978652595030282e-05, "loss": 0.3022, "step": 2349 }, { "epoch": 1.4875346260387812, "grad_norm": 0.4847681774466619, "learning_rate": 1.0971709181793349e-05, "loss": 0.2885, "step": 2350 }, { "epoch": 1.4881677878907795, "grad_norm": 0.6397783596259035, "learning_rate": 1.0964765295588e-05, "loss": 0.2947, "step": 2351 }, { "epoch": 1.488800949742778, "grad_norm": 0.5488407985551894, "learning_rate": 1.0957820939794092e-05, "loss": 0.3203, "step": 2352 }, { "epoch": 1.4894341115947765, "grad_norm": 0.5019959911775371, "learning_rate": 1.0950876117791713e-05, "loss": 0.3065, "step": 2353 }, { "epoch": 1.4900672734467748, "grad_norm": 0.5673344014476935, "learning_rate": 1.0943930832961178e-05, "loss": 0.2924, "step": 2354 }, { "epoch": 1.4907004352987734, "grad_norm": 0.5307499010748175, "learning_rate": 1.0936985088683023e-05, "loss": 0.2709, "step": 2355 }, { "epoch": 1.4913335971507717, "grad_norm": 0.6792338879046098, "learning_rate": 1.0930038888338008e-05, "loss": 0.2984, "step": 2356 }, { "epoch": 1.49196675900277, "grad_norm": 0.5265625238009554, "learning_rate": 1.092309223530712e-05, "loss": 0.3069, "step": 2357 }, { "epoch": 1.4925999208547684, "grad_norm": 0.5138956242791857, "learning_rate": 1.0916145132971562e-05, "loss": 0.3108, "step": 2358 }, { "epoch": 1.4932330827067668, "grad_norm": 0.5061900971040192, "learning_rate": 1.0909197584712762e-05, "loss": 0.3192, "step": 2359 }, { "epoch": 1.4938662445587654, "grad_norm": 0.501435615927025, "learning_rate": 1.0902249593912353e-05, "loss": 0.3155, "step": 2360 }, { "epoch": 1.4944994064107637, "grad_norm": 0.5149524365642576, "learning_rate": 1.0895301163952192e-05, "loss": 0.3051, "step": 2361 }, { "epoch": 1.495132568262762, "grad_norm": 0.5251333160254544, "learning_rate": 1.0888352298214354e-05, "loss": 0.2851, "step": 2362 }, { "epoch": 1.4957657301147607, "grad_norm": 0.6122785685141459, "learning_rate": 1.0881403000081117e-05, "loss": 0.3041, "step": 2363 }, { "epoch": 1.496398891966759, "grad_norm": 0.5455063529051069, "learning_rate": 1.087445327293497e-05, "loss": 0.3082, "step": 2364 }, { "epoch": 1.4970320538187574, "grad_norm": 0.5070218422079469, "learning_rate": 1.086750312015862e-05, "loss": 0.3057, "step": 2365 }, { "epoch": 1.497665215670756, "grad_norm": 0.5460869971823944, "learning_rate": 1.086055254513497e-05, "loss": 0.3046, "step": 2366 }, { "epoch": 1.4982983775227543, "grad_norm": 0.5002563669596142, "learning_rate": 1.0853601551247138e-05, "loss": 0.2942, "step": 2367 }, { "epoch": 1.4989315393747527, "grad_norm": 0.526691586685249, "learning_rate": 1.0846650141878436e-05, "loss": 0.2984, "step": 2368 }, { "epoch": 1.499564701226751, "grad_norm": 0.4907763329837287, "learning_rate": 1.0839698320412386e-05, "loss": 0.3004, "step": 2369 }, { "epoch": 1.5001978630787494, "grad_norm": 0.46054802656996285, "learning_rate": 1.083274609023271e-05, "loss": 0.3255, "step": 2370 }, { "epoch": 1.500831024930748, "grad_norm": 0.5015652694466495, "learning_rate": 1.0825793454723325e-05, "loss": 0.2946, "step": 2371 }, { "epoch": 1.5014641867827463, "grad_norm": 0.5062472419667304, "learning_rate": 1.0818840417268345e-05, "loss": 0.3159, "step": 2372 }, { "epoch": 1.5020973486347446, "grad_norm": 0.5025431311654432, "learning_rate": 1.0811886981252091e-05, "loss": 0.2872, "step": 2373 }, { "epoch": 1.5027305104867432, "grad_norm": 0.5473957221595287, "learning_rate": 1.080493315005906e-05, "loss": 0.3034, "step": 2374 }, { "epoch": 1.5033636723387416, "grad_norm": 0.4774683889432156, "learning_rate": 1.0797978927073956e-05, "loss": 0.2993, "step": 2375 }, { "epoch": 1.50399683419074, "grad_norm": 0.47676805150884854, "learning_rate": 1.0791024315681664e-05, "loss": 0.3092, "step": 2376 }, { "epoch": 1.5046299960427385, "grad_norm": 0.5163212081249985, "learning_rate": 1.0784069319267263e-05, "loss": 0.3065, "step": 2377 }, { "epoch": 1.5052631578947369, "grad_norm": 0.49690761785809634, "learning_rate": 1.0777113941216022e-05, "loss": 0.3033, "step": 2378 }, { "epoch": 1.5058963197467352, "grad_norm": 0.5543561901299843, "learning_rate": 1.0770158184913388e-05, "loss": 0.2976, "step": 2379 }, { "epoch": 1.5065294815987338, "grad_norm": 0.49408410471253605, "learning_rate": 1.0763202053744999e-05, "loss": 0.3092, "step": 2380 }, { "epoch": 1.5071626434507321, "grad_norm": 0.5112978673156093, "learning_rate": 1.0756245551096673e-05, "loss": 0.3107, "step": 2381 }, { "epoch": 1.5077958053027305, "grad_norm": 1.133029407827138, "learning_rate": 1.0749288680354405e-05, "loss": 0.3084, "step": 2382 }, { "epoch": 1.508428967154729, "grad_norm": 0.7018574297559078, "learning_rate": 1.0742331444904378e-05, "loss": 0.301, "step": 2383 }, { "epoch": 1.5090621290067272, "grad_norm": 0.5156979895622339, "learning_rate": 1.0735373848132943e-05, "loss": 0.3017, "step": 2384 }, { "epoch": 1.5096952908587258, "grad_norm": 0.5081050530740547, "learning_rate": 1.0728415893426636e-05, "loss": 0.2945, "step": 2385 }, { "epoch": 1.5103284527107241, "grad_norm": 0.5776413226287923, "learning_rate": 1.0721457584172159e-05, "loss": 0.2937, "step": 2386 }, { "epoch": 1.5109616145627225, "grad_norm": 0.4947516764149312, "learning_rate": 1.0714498923756392e-05, "loss": 0.308, "step": 2387 }, { "epoch": 1.511594776414721, "grad_norm": 0.4982908118041058, "learning_rate": 1.070753991556638e-05, "loss": 0.3055, "step": 2388 }, { "epoch": 1.5122279382667194, "grad_norm": 0.5305825572247981, "learning_rate": 1.0700580562989348e-05, "loss": 0.2985, "step": 2389 }, { "epoch": 1.5128611001187178, "grad_norm": 0.4853808821367819, "learning_rate": 1.0693620869412676e-05, "loss": 0.3016, "step": 2390 }, { "epoch": 1.5134942619707163, "grad_norm": 0.5209653923544731, "learning_rate": 1.0686660838223914e-05, "loss": 0.2999, "step": 2391 }, { "epoch": 1.5141274238227147, "grad_norm": 0.5459268561345596, "learning_rate": 1.0679700472810788e-05, "loss": 0.3058, "step": 2392 }, { "epoch": 1.514760585674713, "grad_norm": 0.47553451018121573, "learning_rate": 1.0672739776561167e-05, "loss": 0.3175, "step": 2393 }, { "epoch": 1.5153937475267116, "grad_norm": 0.5512489736725628, "learning_rate": 1.0665778752863099e-05, "loss": 0.3012, "step": 2394 }, { "epoch": 1.51602690937871, "grad_norm": 0.5629289175453964, "learning_rate": 1.0658817405104776e-05, "loss": 0.2983, "step": 2395 }, { "epoch": 1.5166600712307083, "grad_norm": 0.5067410124225232, "learning_rate": 1.0651855736674562e-05, "loss": 0.3217, "step": 2396 }, { "epoch": 1.517293233082707, "grad_norm": 0.5219688502772429, "learning_rate": 1.0644893750960962e-05, "loss": 0.305, "step": 2397 }, { "epoch": 1.517926394934705, "grad_norm": 0.5481021681746725, "learning_rate": 1.063793145135265e-05, "loss": 0.318, "step": 2398 }, { "epoch": 1.5185595567867036, "grad_norm": 0.4874661795081353, "learning_rate": 1.0630968841238445e-05, "loss": 0.2979, "step": 2399 }, { "epoch": 1.519192718638702, "grad_norm": 0.469298506883839, "learning_rate": 1.0624005924007314e-05, "loss": 0.3011, "step": 2400 }, { "epoch": 1.5198258804907003, "grad_norm": 0.5196665205645815, "learning_rate": 1.0617042703048381e-05, "loss": 0.318, "step": 2401 }, { "epoch": 1.520459042342699, "grad_norm": 0.5281956050077724, "learning_rate": 1.061007918175092e-05, "loss": 0.2879, "step": 2402 }, { "epoch": 1.5210922041946973, "grad_norm": 0.5059399338872894, "learning_rate": 1.0603115363504338e-05, "loss": 0.3198, "step": 2403 }, { "epoch": 1.5217253660466956, "grad_norm": 0.534460136528347, "learning_rate": 1.05961512516982e-05, "loss": 0.2892, "step": 2404 }, { "epoch": 1.5223585278986942, "grad_norm": 0.5354592953493942, "learning_rate": 1.0589186849722206e-05, "loss": 0.289, "step": 2405 }, { "epoch": 1.5229916897506925, "grad_norm": 0.4945502757715682, "learning_rate": 1.0582222160966198e-05, "loss": 0.3177, "step": 2406 }, { "epoch": 1.523624851602691, "grad_norm": 0.5224124766098509, "learning_rate": 1.0575257188820162e-05, "loss": 0.2997, "step": 2407 }, { "epoch": 1.5242580134546895, "grad_norm": 0.5886272945639497, "learning_rate": 1.0568291936674218e-05, "loss": 0.3142, "step": 2408 }, { "epoch": 1.5248911753066876, "grad_norm": 0.5108559346480956, "learning_rate": 1.0561326407918624e-05, "loss": 0.3098, "step": 2409 }, { "epoch": 1.5255243371586862, "grad_norm": 0.5015766999816065, "learning_rate": 1.0554360605943771e-05, "loss": 0.2924, "step": 2410 }, { "epoch": 1.5261574990106848, "grad_norm": 0.5030879431968445, "learning_rate": 1.0547394534140183e-05, "loss": 0.281, "step": 2411 }, { "epoch": 1.526790660862683, "grad_norm": 0.5198243432523418, "learning_rate": 1.054042819589852e-05, "loss": 0.2948, "step": 2412 }, { "epoch": 1.5274238227146815, "grad_norm": 0.5088390767246997, "learning_rate": 1.0533461594609567e-05, "loss": 0.3054, "step": 2413 }, { "epoch": 1.5280569845666798, "grad_norm": 0.48072436478801034, "learning_rate": 1.0526494733664235e-05, "loss": 0.2907, "step": 2414 }, { "epoch": 1.5286901464186782, "grad_norm": 0.4602120484586448, "learning_rate": 1.051952761645357e-05, "loss": 0.3071, "step": 2415 }, { "epoch": 1.5293233082706768, "grad_norm": 0.5040860479761987, "learning_rate": 1.051256024636873e-05, "loss": 0.3178, "step": 2416 }, { "epoch": 1.529956470122675, "grad_norm": 0.5030935415403729, "learning_rate": 1.0505592626801011e-05, "loss": 0.2945, "step": 2417 }, { "epoch": 1.5305896319746735, "grad_norm": 0.4546659385070338, "learning_rate": 1.0498624761141818e-05, "loss": 0.2985, "step": 2418 }, { "epoch": 1.531222793826672, "grad_norm": 0.5051352515071088, "learning_rate": 1.0491656652782678e-05, "loss": 0.3083, "step": 2419 }, { "epoch": 1.5318559556786704, "grad_norm": 0.5336858136227338, "learning_rate": 1.0484688305115247e-05, "loss": 0.3092, "step": 2420 }, { "epoch": 1.5324891175306687, "grad_norm": 0.4518973174445372, "learning_rate": 1.047771972153128e-05, "loss": 0.2951, "step": 2421 }, { "epoch": 1.5331222793826673, "grad_norm": 0.5062792506919492, "learning_rate": 1.0470750905422662e-05, "loss": 0.2975, "step": 2422 }, { "epoch": 1.5337554412346655, "grad_norm": 0.5107029933835948, "learning_rate": 1.0463781860181385e-05, "loss": 0.2804, "step": 2423 }, { "epoch": 1.534388603086664, "grad_norm": 0.4758693284077281, "learning_rate": 1.045681258919955e-05, "loss": 0.2996, "step": 2424 }, { "epoch": 1.5350217649386626, "grad_norm": 0.5408301700952003, "learning_rate": 1.0449843095869371e-05, "loss": 0.3029, "step": 2425 }, { "epoch": 1.5356549267906607, "grad_norm": 0.5264122715800541, "learning_rate": 1.0442873383583174e-05, "loss": 0.3004, "step": 2426 }, { "epoch": 1.5362880886426593, "grad_norm": 0.5628750139344998, "learning_rate": 1.0435903455733381e-05, "loss": 0.2907, "step": 2427 }, { "epoch": 1.5369212504946577, "grad_norm": 0.4702268161664269, "learning_rate": 1.0428933315712528e-05, "loss": 0.3103, "step": 2428 }, { "epoch": 1.537554412346656, "grad_norm": 0.49682670635071624, "learning_rate": 1.0421962966913253e-05, "loss": 0.3086, "step": 2429 }, { "epoch": 1.5381875741986546, "grad_norm": 0.5178179192196143, "learning_rate": 1.0414992412728294e-05, "loss": 0.3036, "step": 2430 }, { "epoch": 1.538820736050653, "grad_norm": 0.5172629661785249, "learning_rate": 1.0408021656550482e-05, "loss": 0.2928, "step": 2431 }, { "epoch": 1.5394538979026513, "grad_norm": 0.4868254616026313, "learning_rate": 1.0401050701772764e-05, "loss": 0.3, "step": 2432 }, { "epoch": 1.5400870597546499, "grad_norm": 0.49240425385903325, "learning_rate": 1.0394079551788167e-05, "loss": 0.3093, "step": 2433 }, { "epoch": 1.5407202216066482, "grad_norm": 0.5303860965356233, "learning_rate": 1.0387108209989825e-05, "loss": 0.3098, "step": 2434 }, { "epoch": 1.5413533834586466, "grad_norm": 0.5312519344245589, "learning_rate": 1.038013667977095e-05, "loss": 0.2844, "step": 2435 }, { "epoch": 1.5419865453106452, "grad_norm": 0.4975620016032502, "learning_rate": 1.0373164964524863e-05, "loss": 0.2907, "step": 2436 }, { "epoch": 1.5426197071626433, "grad_norm": 0.5564867311274675, "learning_rate": 1.0366193067644963e-05, "loss": 0.2943, "step": 2437 }, { "epoch": 1.5432528690146419, "grad_norm": 0.5063492495026072, "learning_rate": 1.0359220992524746e-05, "loss": 0.2954, "step": 2438 }, { "epoch": 1.5438860308666404, "grad_norm": 0.5006333281332944, "learning_rate": 1.0352248742557786e-05, "loss": 0.2969, "step": 2439 }, { "epoch": 1.5445191927186386, "grad_norm": 0.4910111186110252, "learning_rate": 1.0345276321137748e-05, "loss": 0.298, "step": 2440 }, { "epoch": 1.5451523545706372, "grad_norm": 0.6115381219288648, "learning_rate": 1.0338303731658377e-05, "loss": 0.2932, "step": 2441 }, { "epoch": 1.5457855164226355, "grad_norm": 0.5109126866573048, "learning_rate": 1.033133097751351e-05, "loss": 0.29, "step": 2442 }, { "epoch": 1.5464186782746339, "grad_norm": 0.535913346142768, "learning_rate": 1.0324358062097045e-05, "loss": 0.2998, "step": 2443 }, { "epoch": 1.5470518401266324, "grad_norm": 0.5656131478652755, "learning_rate": 1.0317384988802976e-05, "loss": 0.3134, "step": 2444 }, { "epoch": 1.5476850019786308, "grad_norm": 0.5476166910823212, "learning_rate": 1.0310411761025365e-05, "loss": 0.3049, "step": 2445 }, { "epoch": 1.5483181638306291, "grad_norm": 0.47990008219483016, "learning_rate": 1.0303438382158355e-05, "loss": 0.2976, "step": 2446 }, { "epoch": 1.5489513256826277, "grad_norm": 0.5047998829981523, "learning_rate": 1.0296464855596158e-05, "loss": 0.3075, "step": 2447 }, { "epoch": 1.549584487534626, "grad_norm": 0.6259186962882757, "learning_rate": 1.0289491184733052e-05, "loss": 0.2982, "step": 2448 }, { "epoch": 1.5502176493866244, "grad_norm": 0.6538331273471668, "learning_rate": 1.0282517372963401e-05, "loss": 0.3051, "step": 2449 }, { "epoch": 1.550850811238623, "grad_norm": 0.5228627713452794, "learning_rate": 1.0275543423681622e-05, "loss": 0.3089, "step": 2450 }, { "epoch": 1.5514839730906211, "grad_norm": 0.5711810945165543, "learning_rate": 1.0268569340282209e-05, "loss": 0.3092, "step": 2451 }, { "epoch": 1.5521171349426197, "grad_norm": 0.5077026000530331, "learning_rate": 1.026159512615972e-05, "loss": 0.3012, "step": 2452 }, { "epoch": 1.5527502967946183, "grad_norm": 0.5114800010722054, "learning_rate": 1.025462078470877e-05, "loss": 0.3022, "step": 2453 }, { "epoch": 1.5533834586466164, "grad_norm": 0.5590354569848227, "learning_rate": 1.0247646319324044e-05, "loss": 0.2995, "step": 2454 }, { "epoch": 1.554016620498615, "grad_norm": 0.5628263616071913, "learning_rate": 1.0240671733400285e-05, "loss": 0.3018, "step": 2455 }, { "epoch": 1.5546497823506134, "grad_norm": 0.491856238716419, "learning_rate": 1.0233697030332287e-05, "loss": 0.2993, "step": 2456 }, { "epoch": 1.5552829442026117, "grad_norm": 0.5203879703333487, "learning_rate": 1.0226722213514915e-05, "loss": 0.2941, "step": 2457 }, { "epoch": 1.5559161060546103, "grad_norm": 0.4802387771428654, "learning_rate": 1.0219747286343076e-05, "loss": 0.3173, "step": 2458 }, { "epoch": 1.5565492679066086, "grad_norm": 0.49464815427291425, "learning_rate": 1.0212772252211743e-05, "loss": 0.2966, "step": 2459 }, { "epoch": 1.557182429758607, "grad_norm": 0.5020668255081392, "learning_rate": 1.0205797114515927e-05, "loss": 0.3175, "step": 2460 }, { "epoch": 1.5578155916106056, "grad_norm": 0.49882841068147965, "learning_rate": 1.0198821876650702e-05, "loss": 0.288, "step": 2461 }, { "epoch": 1.558448753462604, "grad_norm": 0.511967199743277, "learning_rate": 1.0191846542011186e-05, "loss": 0.2914, "step": 2462 }, { "epoch": 1.5590819153146023, "grad_norm": 0.5150674617181225, "learning_rate": 1.0184871113992544e-05, "loss": 0.3057, "step": 2463 }, { "epoch": 1.5597150771666008, "grad_norm": 0.4906693408989332, "learning_rate": 1.0177895595989984e-05, "loss": 0.3185, "step": 2464 }, { "epoch": 1.560348239018599, "grad_norm": 0.49767586599257213, "learning_rate": 1.0170919991398762e-05, "loss": 0.2995, "step": 2465 }, { "epoch": 1.5609814008705976, "grad_norm": 0.5257761761603471, "learning_rate": 1.0163944303614176e-05, "loss": 0.2952, "step": 2466 }, { "epoch": 1.561614562722596, "grad_norm": 0.5059273446068783, "learning_rate": 1.0156968536031557e-05, "loss": 0.2888, "step": 2467 }, { "epoch": 1.5622477245745943, "grad_norm": 0.470679152437, "learning_rate": 1.0149992692046288e-05, "loss": 0.31, "step": 2468 }, { "epoch": 1.5628808864265928, "grad_norm": 0.5204528108256181, "learning_rate": 1.0143016775053776e-05, "loss": 0.3017, "step": 2469 }, { "epoch": 1.5635140482785912, "grad_norm": 0.4614958302269678, "learning_rate": 1.0136040788449472e-05, "loss": 0.2905, "step": 2470 }, { "epoch": 1.5641472101305895, "grad_norm": 0.4991785235011585, "learning_rate": 1.0129064735628859e-05, "loss": 0.3121, "step": 2471 }, { "epoch": 1.5647803719825881, "grad_norm": 0.5127094926727119, "learning_rate": 1.0122088619987451e-05, "loss": 0.2822, "step": 2472 }, { "epoch": 1.5654135338345865, "grad_norm": 0.5060009247448805, "learning_rate": 1.0115112444920793e-05, "loss": 0.3175, "step": 2473 }, { "epoch": 1.5660466956865848, "grad_norm": 0.46878001301163114, "learning_rate": 1.0108136213824458e-05, "loss": 0.3041, "step": 2474 }, { "epoch": 1.5666798575385834, "grad_norm": 0.47732732952503004, "learning_rate": 1.010115993009405e-05, "loss": 0.2974, "step": 2475 }, { "epoch": 1.5673130193905818, "grad_norm": 0.49645416642798834, "learning_rate": 1.0094183597125193e-05, "loss": 0.3094, "step": 2476 }, { "epoch": 1.5679461812425801, "grad_norm": 0.4852950147143347, "learning_rate": 1.008720721831354e-05, "loss": 0.3023, "step": 2477 }, { "epoch": 1.5685793430945787, "grad_norm": 0.49582932375963473, "learning_rate": 1.0080230797054762e-05, "loss": 0.3022, "step": 2478 }, { "epoch": 1.5692125049465768, "grad_norm": 0.5270801647132398, "learning_rate": 1.0073254336744553e-05, "loss": 0.3001, "step": 2479 }, { "epoch": 1.5698456667985754, "grad_norm": 0.5185391325149944, "learning_rate": 1.0066277840778626e-05, "loss": 0.2805, "step": 2480 }, { "epoch": 1.5704788286505738, "grad_norm": 0.4933134420447475, "learning_rate": 1.0059301312552716e-05, "loss": 0.3028, "step": 2481 }, { "epoch": 1.571111990502572, "grad_norm": 0.48934954266408015, "learning_rate": 1.0052324755462564e-05, "loss": 0.3062, "step": 2482 }, { "epoch": 1.5717451523545707, "grad_norm": 0.5454749389717589, "learning_rate": 1.004534817290393e-05, "loss": 0.3126, "step": 2483 }, { "epoch": 1.572378314206569, "grad_norm": 0.5209305053259349, "learning_rate": 1.003837156827259e-05, "loss": 0.3155, "step": 2484 }, { "epoch": 1.5730114760585674, "grad_norm": 0.5118533921080375, "learning_rate": 1.003139494496432e-05, "loss": 0.295, "step": 2485 }, { "epoch": 1.573644637910566, "grad_norm": 0.4965600837438929, "learning_rate": 1.0024418306374922e-05, "loss": 0.3138, "step": 2486 }, { "epoch": 1.5742777997625643, "grad_norm": 0.5399439494322202, "learning_rate": 1.0017441655900187e-05, "loss": 0.2979, "step": 2487 }, { "epoch": 1.5749109616145627, "grad_norm": 0.535177597740596, "learning_rate": 1.0010464996935927e-05, "loss": 0.3028, "step": 2488 }, { "epoch": 1.5755441234665613, "grad_norm": 0.5124273634079823, "learning_rate": 1.0003488332877947e-05, "loss": 0.3042, "step": 2489 }, { "epoch": 1.5761772853185596, "grad_norm": 0.5378173797339794, "learning_rate": 9.996511667122058e-06, "loss": 0.2979, "step": 2490 }, { "epoch": 1.576810447170558, "grad_norm": 0.528433114604215, "learning_rate": 9.989535003064076e-06, "loss": 0.2946, "step": 2491 }, { "epoch": 1.5774436090225565, "grad_norm": 0.5113629184272386, "learning_rate": 9.982558344099817e-06, "loss": 0.3002, "step": 2492 }, { "epoch": 1.5780767708745547, "grad_norm": 0.501170319267892, "learning_rate": 9.975581693625082e-06, "loss": 0.2782, "step": 2493 }, { "epoch": 1.5787099327265532, "grad_norm": 0.5270803540364494, "learning_rate": 9.968605055035685e-06, "loss": 0.2926, "step": 2494 }, { "epoch": 1.5793430945785516, "grad_norm": 0.6979306780014392, "learning_rate": 9.961628431727414e-06, "loss": 0.3147, "step": 2495 }, { "epoch": 1.57997625643055, "grad_norm": 0.5058814805325951, "learning_rate": 9.954651827096071e-06, "loss": 0.2816, "step": 2496 }, { "epoch": 1.5806094182825485, "grad_norm": 0.5064734261798008, "learning_rate": 9.94767524453744e-06, "loss": 0.2948, "step": 2497 }, { "epoch": 1.5812425801345469, "grad_norm": 0.5126439425333328, "learning_rate": 9.940698687447284e-06, "loss": 0.2917, "step": 2498 }, { "epoch": 1.5818757419865452, "grad_norm": 0.53259911526845, "learning_rate": 9.933722159221375e-06, "loss": 0.2937, "step": 2499 }, { "epoch": 1.5825089038385438, "grad_norm": 0.4965125608024925, "learning_rate": 9.92674566325545e-06, "loss": 0.3121, "step": 2500 }, { "epoch": 1.5831420656905422, "grad_norm": 0.5082096663403334, "learning_rate": 9.919769202945243e-06, "loss": 0.3015, "step": 2501 }, { "epoch": 1.5837752275425405, "grad_norm": 0.5026375035382545, "learning_rate": 9.912792781686464e-06, "loss": 0.2905, "step": 2502 }, { "epoch": 1.584408389394539, "grad_norm": 0.5159482460403626, "learning_rate": 9.905816402874812e-06, "loss": 0.2809, "step": 2503 }, { "epoch": 1.5850415512465375, "grad_norm": 0.4963144376066531, "learning_rate": 9.898840069905953e-06, "loss": 0.3139, "step": 2504 }, { "epoch": 1.5856747130985358, "grad_norm": 0.508485131131669, "learning_rate": 9.891863786175542e-06, "loss": 0.2903, "step": 2505 }, { "epoch": 1.5863078749505344, "grad_norm": 0.4897704220694141, "learning_rate": 9.884887555079208e-06, "loss": 0.3141, "step": 2506 }, { "epoch": 1.5869410368025325, "grad_norm": 0.5201232505278174, "learning_rate": 9.877911380012549e-06, "loss": 0.297, "step": 2507 }, { "epoch": 1.587574198654531, "grad_norm": 0.5072590806558264, "learning_rate": 9.870935264371143e-06, "loss": 0.304, "step": 2508 }, { "epoch": 1.5882073605065294, "grad_norm": 0.506146717145843, "learning_rate": 9.863959211550528e-06, "loss": 0.2919, "step": 2509 }, { "epoch": 1.5888405223585278, "grad_norm": 0.5347399947660091, "learning_rate": 9.856983224946229e-06, "loss": 0.2733, "step": 2510 }, { "epoch": 1.5894736842105264, "grad_norm": 0.5076416784918278, "learning_rate": 9.850007307953713e-06, "loss": 0.3006, "step": 2511 }, { "epoch": 1.5901068460625247, "grad_norm": 0.4635116612061907, "learning_rate": 9.843031463968447e-06, "loss": 0.2986, "step": 2512 }, { "epoch": 1.590740007914523, "grad_norm": 0.4963861728128413, "learning_rate": 9.83605569638583e-06, "loss": 0.2937, "step": 2513 }, { "epoch": 1.5913731697665217, "grad_norm": 0.5202828122496459, "learning_rate": 9.829080008601241e-06, "loss": 0.2928, "step": 2514 }, { "epoch": 1.59200633161852, "grad_norm": 0.5010373034773627, "learning_rate": 9.82210440401002e-06, "loss": 0.2911, "step": 2515 }, { "epoch": 1.5926394934705184, "grad_norm": 0.4940736805980018, "learning_rate": 9.815128886007458e-06, "loss": 0.2986, "step": 2516 }, { "epoch": 1.593272655322517, "grad_norm": 0.5075735973494016, "learning_rate": 9.808153457988816e-06, "loss": 0.3105, "step": 2517 }, { "epoch": 1.593905817174515, "grad_norm": 0.5235602341198905, "learning_rate": 9.801178123349298e-06, "loss": 0.3245, "step": 2518 }, { "epoch": 1.5945389790265136, "grad_norm": 0.5387975861579761, "learning_rate": 9.794202885484076e-06, "loss": 0.3098, "step": 2519 }, { "epoch": 1.5951721408785122, "grad_norm": 0.4974186923330067, "learning_rate": 9.78722774778826e-06, "loss": 0.2887, "step": 2520 }, { "epoch": 1.5958053027305104, "grad_norm": 0.5351633067705774, "learning_rate": 9.780252713656927e-06, "loss": 0.2874, "step": 2521 }, { "epoch": 1.596438464582509, "grad_norm": 0.49596570663522405, "learning_rate": 9.773277786485088e-06, "loss": 0.324, "step": 2522 }, { "epoch": 1.5970716264345073, "grad_norm": 0.48646537521834926, "learning_rate": 9.766302969667718e-06, "loss": 0.3009, "step": 2523 }, { "epoch": 1.5977047882865056, "grad_norm": 0.5347014116807616, "learning_rate": 9.759328266599719e-06, "loss": 0.29, "step": 2524 }, { "epoch": 1.5983379501385042, "grad_norm": 0.4891502395562979, "learning_rate": 9.752353680675956e-06, "loss": 0.3076, "step": 2525 }, { "epoch": 1.5989711119905026, "grad_norm": 0.4793513812025177, "learning_rate": 9.745379215291232e-06, "loss": 0.3122, "step": 2526 }, { "epoch": 1.599604273842501, "grad_norm": 0.4823545014276905, "learning_rate": 9.738404873840282e-06, "loss": 0.2898, "step": 2527 }, { "epoch": 1.6002374356944995, "grad_norm": 0.48371128578123845, "learning_rate": 9.731430659717794e-06, "loss": 0.2909, "step": 2528 }, { "epoch": 1.6008705975464979, "grad_norm": 0.47766492578531927, "learning_rate": 9.724456576318383e-06, "loss": 0.3001, "step": 2529 }, { "epoch": 1.6015037593984962, "grad_norm": 0.48505473279174266, "learning_rate": 9.717482627036606e-06, "loss": 0.3026, "step": 2530 }, { "epoch": 1.6021369212504948, "grad_norm": 0.5737523006760608, "learning_rate": 9.710508815266952e-06, "loss": 0.2853, "step": 2531 }, { "epoch": 1.602770083102493, "grad_norm": 0.49106850892697945, "learning_rate": 9.703535144403849e-06, "loss": 0.3169, "step": 2532 }, { "epoch": 1.6034032449544915, "grad_norm": 0.4675883037692483, "learning_rate": 9.696561617841647e-06, "loss": 0.3, "step": 2533 }, { "epoch": 1.60403640680649, "grad_norm": 0.48239748359588397, "learning_rate": 9.689588238974634e-06, "loss": 0.2926, "step": 2534 }, { "epoch": 1.6046695686584882, "grad_norm": 0.5163141021317659, "learning_rate": 9.682615011197026e-06, "loss": 0.3105, "step": 2535 }, { "epoch": 1.6053027305104868, "grad_norm": 0.46069045326534735, "learning_rate": 9.675641937902957e-06, "loss": 0.2801, "step": 2536 }, { "epoch": 1.6059358923624851, "grad_norm": 0.4753409302735789, "learning_rate": 9.668669022486495e-06, "loss": 0.2985, "step": 2537 }, { "epoch": 1.6065690542144835, "grad_norm": 0.4984102023553579, "learning_rate": 9.661696268341623e-06, "loss": 0.2929, "step": 2538 }, { "epoch": 1.607202216066482, "grad_norm": 0.4581120151423792, "learning_rate": 9.654723678862257e-06, "loss": 0.3213, "step": 2539 }, { "epoch": 1.6078353779184804, "grad_norm": 0.48302339704816544, "learning_rate": 9.647751257442217e-06, "loss": 0.2955, "step": 2540 }, { "epoch": 1.6084685397704788, "grad_norm": 0.48446212495759944, "learning_rate": 9.64077900747526e-06, "loss": 0.2937, "step": 2541 }, { "epoch": 1.6091017016224773, "grad_norm": 0.49442640260510967, "learning_rate": 9.63380693235504e-06, "loss": 0.2886, "step": 2542 }, { "epoch": 1.6097348634744757, "grad_norm": 0.45730380807729365, "learning_rate": 9.626835035475142e-06, "loss": 0.3068, "step": 2543 }, { "epoch": 1.610368025326474, "grad_norm": 0.49112465795197396, "learning_rate": 9.619863320229054e-06, "loss": 0.2824, "step": 2544 }, { "epoch": 1.6110011871784726, "grad_norm": 0.4930546199411795, "learning_rate": 9.612891790010179e-06, "loss": 0.2922, "step": 2545 }, { "epoch": 1.6116343490304708, "grad_norm": 0.4757992885978526, "learning_rate": 9.605920448211835e-06, "loss": 0.301, "step": 2546 }, { "epoch": 1.6122675108824693, "grad_norm": 0.47439389226363443, "learning_rate": 9.598949298227237e-06, "loss": 0.2931, "step": 2547 }, { "epoch": 1.612900672734468, "grad_norm": 0.49211184672989644, "learning_rate": 9.59197834344952e-06, "loss": 0.3042, "step": 2548 }, { "epoch": 1.613533834586466, "grad_norm": 0.4963766469404122, "learning_rate": 9.585007587271711e-06, "loss": 0.3006, "step": 2549 }, { "epoch": 1.6141669964384646, "grad_norm": 0.49577963278047144, "learning_rate": 9.578037033086752e-06, "loss": 0.3074, "step": 2550 }, { "epoch": 1.614800158290463, "grad_norm": 0.45576557640777676, "learning_rate": 9.571066684287473e-06, "loss": 0.3122, "step": 2551 }, { "epoch": 1.6154333201424613, "grad_norm": 0.5069938889158025, "learning_rate": 9.564096544266624e-06, "loss": 0.2867, "step": 2552 }, { "epoch": 1.61606648199446, "grad_norm": 0.48070649650560593, "learning_rate": 9.557126616416828e-06, "loss": 0.3075, "step": 2553 }, { "epoch": 1.6166996438464583, "grad_norm": 0.5093088545189948, "learning_rate": 9.550156904130627e-06, "loss": 0.3007, "step": 2554 }, { "epoch": 1.6173328056984566, "grad_norm": 0.5018780814873479, "learning_rate": 9.543187410800452e-06, "loss": 0.3181, "step": 2555 }, { "epoch": 1.6179659675504552, "grad_norm": 0.5103321759398136, "learning_rate": 9.536218139818615e-06, "loss": 0.3077, "step": 2556 }, { "epoch": 1.6185991294024535, "grad_norm": 0.49656785982404794, "learning_rate": 9.529249094577342e-06, "loss": 0.3118, "step": 2557 }, { "epoch": 1.619232291254452, "grad_norm": 0.5427882271682262, "learning_rate": 9.522280278468721e-06, "loss": 0.3083, "step": 2558 }, { "epoch": 1.6198654531064505, "grad_norm": 0.4891328055511099, "learning_rate": 9.515311694884758e-06, "loss": 0.2802, "step": 2559 }, { "epoch": 1.6204986149584486, "grad_norm": 0.5082817862231437, "learning_rate": 9.508343347217324e-06, "loss": 0.305, "step": 2560 }, { "epoch": 1.6211317768104472, "grad_norm": 0.525202956683103, "learning_rate": 9.501375238858189e-06, "loss": 0.2985, "step": 2561 }, { "epoch": 1.6217649386624458, "grad_norm": 0.6010561877722064, "learning_rate": 9.494407373198992e-06, "loss": 0.3193, "step": 2562 }, { "epoch": 1.6223981005144439, "grad_norm": 0.4659241450685036, "learning_rate": 9.487439753631273e-06, "loss": 0.2879, "step": 2563 }, { "epoch": 1.6230312623664425, "grad_norm": 0.49014796939842525, "learning_rate": 9.480472383546434e-06, "loss": 0.2986, "step": 2564 }, { "epoch": 1.6236644242184408, "grad_norm": 0.5035950398207917, "learning_rate": 9.473505266335765e-06, "loss": 0.2951, "step": 2565 }, { "epoch": 1.6242975860704392, "grad_norm": 0.4773509862592553, "learning_rate": 9.466538405390435e-06, "loss": 0.2968, "step": 2566 }, { "epoch": 1.6249307479224377, "grad_norm": 1.3201667113749673, "learning_rate": 9.459571804101481e-06, "loss": 0.2832, "step": 2567 }, { "epoch": 1.625563909774436, "grad_norm": 0.5130985743852796, "learning_rate": 9.45260546585982e-06, "loss": 0.3065, "step": 2568 }, { "epoch": 1.6261970716264345, "grad_norm": 0.5138798228236557, "learning_rate": 9.44563939405623e-06, "loss": 0.2986, "step": 2569 }, { "epoch": 1.626830233478433, "grad_norm": 0.46917172746303126, "learning_rate": 9.438673592081381e-06, "loss": 0.2868, "step": 2570 }, { "epoch": 1.6274633953304314, "grad_norm": 0.47034989650566156, "learning_rate": 9.431708063325786e-06, "loss": 0.2973, "step": 2571 }, { "epoch": 1.6280965571824297, "grad_norm": 0.5099259633909538, "learning_rate": 9.424742811179843e-06, "loss": 0.3104, "step": 2572 }, { "epoch": 1.6287297190344283, "grad_norm": 0.481667867409196, "learning_rate": 9.417777839033806e-06, "loss": 0.2903, "step": 2573 }, { "epoch": 1.6293628808864264, "grad_norm": 0.4856351178736194, "learning_rate": 9.410813150277797e-06, "loss": 0.2961, "step": 2574 }, { "epoch": 1.629996042738425, "grad_norm": 0.46103164920573525, "learning_rate": 9.403848748301802e-06, "loss": 0.3044, "step": 2575 }, { "epoch": 1.6306292045904234, "grad_norm": 0.6174426329505374, "learning_rate": 9.396884636495662e-06, "loss": 0.3094, "step": 2576 }, { "epoch": 1.6312623664424217, "grad_norm": 0.45887500721342483, "learning_rate": 9.389920818249083e-06, "loss": 0.2894, "step": 2577 }, { "epoch": 1.6318955282944203, "grad_norm": 0.47568843303026365, "learning_rate": 9.382957296951619e-06, "loss": 0.3096, "step": 2578 }, { "epoch": 1.6325286901464187, "grad_norm": 0.4913629273903809, "learning_rate": 9.375994075992689e-06, "loss": 0.2957, "step": 2579 }, { "epoch": 1.633161851998417, "grad_norm": 0.48331959278171877, "learning_rate": 9.369031158761558e-06, "loss": 0.3141, "step": 2580 }, { "epoch": 1.6337950138504156, "grad_norm": 0.48293726899721995, "learning_rate": 9.362068548647355e-06, "loss": 0.2951, "step": 2581 }, { "epoch": 1.634428175702414, "grad_norm": 0.45235545017118045, "learning_rate": 9.35510624903904e-06, "loss": 0.308, "step": 2582 }, { "epoch": 1.6350613375544123, "grad_norm": 0.5158529544361448, "learning_rate": 9.348144263325445e-06, "loss": 0.2918, "step": 2583 }, { "epoch": 1.6356944994064109, "grad_norm": 0.46304767504252675, "learning_rate": 9.341182594895226e-06, "loss": 0.3158, "step": 2584 }, { "epoch": 1.6363276612584092, "grad_norm": 0.4607233514048589, "learning_rate": 9.334221247136901e-06, "loss": 0.3002, "step": 2585 }, { "epoch": 1.6369608231104076, "grad_norm": 0.5403913662438093, "learning_rate": 9.327260223438835e-06, "loss": 0.2977, "step": 2586 }, { "epoch": 1.6375939849624062, "grad_norm": 0.5303515363707206, "learning_rate": 9.320299527189214e-06, "loss": 0.3007, "step": 2587 }, { "epoch": 1.6382271468144043, "grad_norm": 0.7982382871374352, "learning_rate": 9.313339161776087e-06, "loss": 0.2969, "step": 2588 }, { "epoch": 1.6388603086664029, "grad_norm": 0.4769340612397734, "learning_rate": 9.306379130587329e-06, "loss": 0.2985, "step": 2589 }, { "epoch": 1.6394934705184012, "grad_norm": 0.46574429892437047, "learning_rate": 9.299419437010657e-06, "loss": 0.3118, "step": 2590 }, { "epoch": 1.6401266323703996, "grad_norm": 0.45301278733340067, "learning_rate": 9.292460084433622e-06, "loss": 0.307, "step": 2591 }, { "epoch": 1.6407597942223981, "grad_norm": 0.48065947600944703, "learning_rate": 9.285501076243613e-06, "loss": 0.2874, "step": 2592 }, { "epoch": 1.6413929560743965, "grad_norm": 0.4538501235199759, "learning_rate": 9.278542415827843e-06, "loss": 0.3047, "step": 2593 }, { "epoch": 1.6420261179263949, "grad_norm": 0.5419422743984068, "learning_rate": 9.271584106573364e-06, "loss": 0.314, "step": 2594 }, { "epoch": 1.6426592797783934, "grad_norm": 0.47924549167520747, "learning_rate": 9.264626151867058e-06, "loss": 0.2956, "step": 2595 }, { "epoch": 1.6432924416303918, "grad_norm": 0.48766576635313547, "learning_rate": 9.257668555095624e-06, "loss": 0.2943, "step": 2596 }, { "epoch": 1.6439256034823901, "grad_norm": 0.48628210415437917, "learning_rate": 9.250711319645599e-06, "loss": 0.2969, "step": 2597 }, { "epoch": 1.6445587653343887, "grad_norm": 0.5077521974588602, "learning_rate": 9.243754448903329e-06, "loss": 0.2998, "step": 2598 }, { "epoch": 1.645191927186387, "grad_norm": 0.5618055051320409, "learning_rate": 9.236797946255005e-06, "loss": 0.2915, "step": 2599 }, { "epoch": 1.6458250890383854, "grad_norm": 0.494471484982773, "learning_rate": 9.229841815086615e-06, "loss": 0.2945, "step": 2600 }, { "epoch": 1.646458250890384, "grad_norm": 0.5115425260486157, "learning_rate": 9.222886058783983e-06, "loss": 0.2974, "step": 2601 }, { "epoch": 1.6470914127423821, "grad_norm": 0.4853115690127187, "learning_rate": 9.215930680732739e-06, "loss": 0.2964, "step": 2602 }, { "epoch": 1.6477245745943807, "grad_norm": 0.47316609793324643, "learning_rate": 9.208975684318338e-06, "loss": 0.3014, "step": 2603 }, { "epoch": 1.648357736446379, "grad_norm": 0.5182576017712874, "learning_rate": 9.202021072926047e-06, "loss": 0.2838, "step": 2604 }, { "epoch": 1.6489908982983774, "grad_norm": 0.47650110498843057, "learning_rate": 9.19506684994094e-06, "loss": 0.2948, "step": 2605 }, { "epoch": 1.649624060150376, "grad_norm": 0.4968989853559405, "learning_rate": 9.18811301874791e-06, "loss": 0.3026, "step": 2606 }, { "epoch": 1.6502572220023743, "grad_norm": 0.5850743714433655, "learning_rate": 9.181159582731653e-06, "loss": 0.2916, "step": 2607 }, { "epoch": 1.6508903838543727, "grad_norm": 0.6951150062915318, "learning_rate": 9.174206545276678e-06, "loss": 0.3125, "step": 2608 }, { "epoch": 1.6515235457063713, "grad_norm": 0.5403344802841848, "learning_rate": 9.167253909767291e-06, "loss": 0.2878, "step": 2609 }, { "epoch": 1.6521567075583696, "grad_norm": 0.5258531986863229, "learning_rate": 9.160301679587619e-06, "loss": 0.2664, "step": 2610 }, { "epoch": 1.652789869410368, "grad_norm": 0.4911372873525715, "learning_rate": 9.153349858121566e-06, "loss": 0.3054, "step": 2611 }, { "epoch": 1.6534230312623666, "grad_norm": 0.5018937695786589, "learning_rate": 9.146398448752869e-06, "loss": 0.2819, "step": 2612 }, { "epoch": 1.654056193114365, "grad_norm": 0.5193481190425472, "learning_rate": 9.139447454865034e-06, "loss": 0.3078, "step": 2613 }, { "epoch": 1.6546893549663633, "grad_norm": 0.5643028766835785, "learning_rate": 9.132496879841381e-06, "loss": 0.3009, "step": 2614 }, { "epoch": 1.6553225168183618, "grad_norm": 0.46516232596543383, "learning_rate": 9.125546727065033e-06, "loss": 0.3046, "step": 2615 }, { "epoch": 1.65595567867036, "grad_norm": 0.5304270412937848, "learning_rate": 9.118596999918888e-06, "loss": 0.2968, "step": 2616 }, { "epoch": 1.6565888405223586, "grad_norm": 0.4932077180614443, "learning_rate": 9.11164770178565e-06, "loss": 0.3221, "step": 2617 }, { "epoch": 1.657222002374357, "grad_norm": 0.5034585478262317, "learning_rate": 9.10469883604781e-06, "loss": 0.3001, "step": 2618 }, { "epoch": 1.6578551642263553, "grad_norm": 0.4904868565409983, "learning_rate": 9.097750406087652e-06, "loss": 0.2859, "step": 2619 }, { "epoch": 1.6584883260783538, "grad_norm": 0.4913440172626629, "learning_rate": 9.090802415287244e-06, "loss": 0.3097, "step": 2620 }, { "epoch": 1.6591214879303522, "grad_norm": 0.4995164858400611, "learning_rate": 9.083854867028441e-06, "loss": 0.2906, "step": 2621 }, { "epoch": 1.6597546497823505, "grad_norm": 0.4878591340991132, "learning_rate": 9.076907764692883e-06, "loss": 0.2939, "step": 2622 }, { "epoch": 1.6603878116343491, "grad_norm": 0.5575457651633747, "learning_rate": 9.069961111661993e-06, "loss": 0.3011, "step": 2623 }, { "epoch": 1.6610209734863475, "grad_norm": 0.5398292105645723, "learning_rate": 9.063014911316979e-06, "loss": 0.3152, "step": 2624 }, { "epoch": 1.6616541353383458, "grad_norm": 0.4941529203704135, "learning_rate": 9.056069167038822e-06, "loss": 0.3043, "step": 2625 }, { "epoch": 1.6622872971903444, "grad_norm": 0.5521684191718857, "learning_rate": 9.049123882208288e-06, "loss": 0.286, "step": 2626 }, { "epoch": 1.6629204590423425, "grad_norm": 0.49735902643567426, "learning_rate": 9.042179060205906e-06, "loss": 0.2898, "step": 2627 }, { "epoch": 1.6635536208943411, "grad_norm": 0.48729725112630473, "learning_rate": 9.035234704412005e-06, "loss": 0.2817, "step": 2628 }, { "epoch": 1.6641867827463397, "grad_norm": 0.5206422186927887, "learning_rate": 9.028290818206655e-06, "loss": 0.2935, "step": 2629 }, { "epoch": 1.6648199445983378, "grad_norm": 0.5212049914938093, "learning_rate": 9.021347404969725e-06, "loss": 0.3224, "step": 2630 }, { "epoch": 1.6654531064503364, "grad_norm": 0.5850227976807543, "learning_rate": 9.014404468080836e-06, "loss": 0.296, "step": 2631 }, { "epoch": 1.6660862683023347, "grad_norm": 0.4679784383955725, "learning_rate": 9.007462010919387e-06, "loss": 0.3179, "step": 2632 }, { "epoch": 1.666719430154333, "grad_norm": 0.5031279999276709, "learning_rate": 9.000520036864537e-06, "loss": 0.3058, "step": 2633 }, { "epoch": 1.6673525920063317, "grad_norm": 0.5349387187151382, "learning_rate": 8.993578549295218e-06, "loss": 0.2927, "step": 2634 }, { "epoch": 1.66798575385833, "grad_norm": 0.5820532610583375, "learning_rate": 8.986637551590118e-06, "loss": 0.2937, "step": 2635 }, { "epoch": 1.6686189157103284, "grad_norm": 0.4912084827448723, "learning_rate": 8.979697047127688e-06, "loss": 0.2909, "step": 2636 }, { "epoch": 1.669252077562327, "grad_norm": 0.48246563699403416, "learning_rate": 8.972757039286143e-06, "loss": 0.3089, "step": 2637 }, { "epoch": 1.6698852394143253, "grad_norm": 0.6209198516831401, "learning_rate": 8.965817531443448e-06, "loss": 0.312, "step": 2638 }, { "epoch": 1.6705184012663237, "grad_norm": 0.5139336331436708, "learning_rate": 8.958878526977336e-06, "loss": 0.292, "step": 2639 }, { "epoch": 1.6711515631183222, "grad_norm": 0.5447218722136853, "learning_rate": 8.951940029265279e-06, "loss": 0.2862, "step": 2640 }, { "epoch": 1.6717847249703204, "grad_norm": 0.5101032314290153, "learning_rate": 8.945002041684525e-06, "loss": 0.2917, "step": 2641 }, { "epoch": 1.672417886822319, "grad_norm": 0.49656866517979, "learning_rate": 8.93806456761205e-06, "loss": 0.3026, "step": 2642 }, { "epoch": 1.6730510486743175, "grad_norm": 0.9545815394541698, "learning_rate": 8.931127610424592e-06, "loss": 0.2957, "step": 2643 }, { "epoch": 1.6736842105263157, "grad_norm": 0.6220795951059112, "learning_rate": 8.924191173498643e-06, "loss": 0.2927, "step": 2644 }, { "epoch": 1.6743173723783142, "grad_norm": 0.5330317471202818, "learning_rate": 8.917255260210428e-06, "loss": 0.297, "step": 2645 }, { "epoch": 1.6749505342303126, "grad_norm": 0.5075066562452419, "learning_rate": 8.910319873935929e-06, "loss": 0.2925, "step": 2646 }, { "epoch": 1.675583696082311, "grad_norm": 0.4873883666616107, "learning_rate": 8.90338501805086e-06, "loss": 0.306, "step": 2647 }, { "epoch": 1.6762168579343095, "grad_norm": 0.6191411397395262, "learning_rate": 8.896450695930693e-06, "loss": 0.3128, "step": 2648 }, { "epoch": 1.6768500197863079, "grad_norm": 0.4942938807309572, "learning_rate": 8.889516910950622e-06, "loss": 0.2925, "step": 2649 }, { "epoch": 1.6774831816383062, "grad_norm": 0.47594078309341725, "learning_rate": 8.882583666485593e-06, "loss": 0.2847, "step": 2650 }, { "epoch": 1.6781163434903048, "grad_norm": 0.6855223468716567, "learning_rate": 8.87565096591028e-06, "loss": 0.2959, "step": 2651 }, { "epoch": 1.6787495053423032, "grad_norm": 0.5098275731540953, "learning_rate": 8.868718812599103e-06, "loss": 0.2903, "step": 2652 }, { "epoch": 1.6793826671943015, "grad_norm": 0.49965093838318253, "learning_rate": 8.861787209926201e-06, "loss": 0.3018, "step": 2653 }, { "epoch": 1.6800158290463, "grad_norm": 0.49576506118723845, "learning_rate": 8.854856161265454e-06, "loss": 0.2868, "step": 2654 }, { "epoch": 1.6806489908982982, "grad_norm": 0.4877926396071697, "learning_rate": 8.84792566999048e-06, "loss": 0.2951, "step": 2655 }, { "epoch": 1.6812821527502968, "grad_norm": 0.4893413597236197, "learning_rate": 8.840995739474608e-06, "loss": 0.3006, "step": 2656 }, { "epoch": 1.6819153146022954, "grad_norm": 0.50817212041842, "learning_rate": 8.834066373090908e-06, "loss": 0.2852, "step": 2657 }, { "epoch": 1.6825484764542935, "grad_norm": 0.5972836704432322, "learning_rate": 8.827137574212165e-06, "loss": 0.307, "step": 2658 }, { "epoch": 1.683181638306292, "grad_norm": 0.5160350629580982, "learning_rate": 8.820209346210899e-06, "loss": 0.2888, "step": 2659 }, { "epoch": 1.6838148001582904, "grad_norm": 0.4974317871629267, "learning_rate": 8.813281692459342e-06, "loss": 0.295, "step": 2660 }, { "epoch": 1.6844479620102888, "grad_norm": 0.528086217833931, "learning_rate": 8.806354616329455e-06, "loss": 0.2789, "step": 2661 }, { "epoch": 1.6850811238622874, "grad_norm": 0.5279992147900172, "learning_rate": 8.79942812119291e-06, "loss": 0.2845, "step": 2662 }, { "epoch": 1.6857142857142857, "grad_norm": 0.4813564382089557, "learning_rate": 8.7925022104211e-06, "loss": 0.302, "step": 2663 }, { "epoch": 1.686347447566284, "grad_norm": 0.5220173206185597, "learning_rate": 8.785576887385138e-06, "loss": 0.3003, "step": 2664 }, { "epoch": 1.6869806094182827, "grad_norm": 0.6101978195707497, "learning_rate": 8.77865215545584e-06, "loss": 0.3033, "step": 2665 }, { "epoch": 1.687613771270281, "grad_norm": 0.49505633507939495, "learning_rate": 8.771728018003749e-06, "loss": 0.3176, "step": 2666 }, { "epoch": 1.6882469331222794, "grad_norm": 0.4875700338008648, "learning_rate": 8.7648044783991e-06, "loss": 0.2949, "step": 2667 }, { "epoch": 1.688880094974278, "grad_norm": 0.4668342015525576, "learning_rate": 8.757881540011857e-06, "loss": 0.2972, "step": 2668 }, { "epoch": 1.689513256826276, "grad_norm": 0.4743793075890778, "learning_rate": 8.750959206211673e-06, "loss": 0.2677, "step": 2669 }, { "epoch": 1.6901464186782746, "grad_norm": 0.47505457105652765, "learning_rate": 8.744037480367922e-06, "loss": 0.2779, "step": 2670 }, { "epoch": 1.690779580530273, "grad_norm": 0.5104346317854433, "learning_rate": 8.737116365849671e-06, "loss": 0.3142, "step": 2671 }, { "epoch": 1.6914127423822714, "grad_norm": 0.5384605828049946, "learning_rate": 8.730195866025697e-06, "loss": 0.3028, "step": 2672 }, { "epoch": 1.69204590423427, "grad_norm": 0.48358046384955733, "learning_rate": 8.723275984264475e-06, "loss": 0.3098, "step": 2673 }, { "epoch": 1.6926790660862683, "grad_norm": 0.6534422194309233, "learning_rate": 8.716356723934176e-06, "loss": 0.3061, "step": 2674 }, { "epoch": 1.6933122279382666, "grad_norm": 0.5781483059546279, "learning_rate": 8.709438088402674e-06, "loss": 0.3104, "step": 2675 }, { "epoch": 1.6939453897902652, "grad_norm": 0.5054210812990828, "learning_rate": 8.702520081037536e-06, "loss": 0.2779, "step": 2676 }, { "epoch": 1.6945785516422636, "grad_norm": 0.5043673219157555, "learning_rate": 8.695602705206024e-06, "loss": 0.3058, "step": 2677 }, { "epoch": 1.695211713494262, "grad_norm": 0.5115947355999715, "learning_rate": 8.688685964275088e-06, "loss": 0.2889, "step": 2678 }, { "epoch": 1.6958448753462605, "grad_norm": 0.4857497272557646, "learning_rate": 8.681769861611383e-06, "loss": 0.3087, "step": 2679 }, { "epoch": 1.6964780371982588, "grad_norm": 0.6136753746000727, "learning_rate": 8.674854400581227e-06, "loss": 0.3064, "step": 2680 }, { "epoch": 1.6971111990502572, "grad_norm": 0.5161830158657976, "learning_rate": 8.667939584550661e-06, "loss": 0.2961, "step": 2681 }, { "epoch": 1.6977443609022558, "grad_norm": 0.5119475833231281, "learning_rate": 8.661025416885379e-06, "loss": 0.2874, "step": 2682 }, { "epoch": 1.698377522754254, "grad_norm": 0.5073964237812294, "learning_rate": 8.654111900950776e-06, "loss": 0.2998, "step": 2683 }, { "epoch": 1.6990106846062525, "grad_norm": 0.5600906859232286, "learning_rate": 8.64719904011194e-06, "loss": 0.313, "step": 2684 }, { "epoch": 1.6996438464582508, "grad_norm": 0.4892432232046277, "learning_rate": 8.64028683773361e-06, "loss": 0.2903, "step": 2685 }, { "epoch": 1.7002770083102492, "grad_norm": 0.4648437016720367, "learning_rate": 8.633375297180235e-06, "loss": 0.3109, "step": 2686 }, { "epoch": 1.7009101701622478, "grad_norm": 0.5114434040415352, "learning_rate": 8.626464421815919e-06, "loss": 0.3113, "step": 2687 }, { "epoch": 1.7015433320142461, "grad_norm": 0.5933972354447778, "learning_rate": 8.61955421500446e-06, "loss": 0.3141, "step": 2688 }, { "epoch": 1.7021764938662445, "grad_norm": 0.8607333899843255, "learning_rate": 8.61264468010932e-06, "loss": 0.2891, "step": 2689 }, { "epoch": 1.702809655718243, "grad_norm": 0.5016012350630952, "learning_rate": 8.605735820493635e-06, "loss": 0.2909, "step": 2690 }, { "epoch": 1.7034428175702414, "grad_norm": 0.5691535109697687, "learning_rate": 8.598827639520216e-06, "loss": 0.2977, "step": 2691 }, { "epoch": 1.7040759794222398, "grad_norm": 0.5002526720732648, "learning_rate": 8.59192014055154e-06, "loss": 0.3001, "step": 2692 }, { "epoch": 1.7047091412742383, "grad_norm": 0.47955686574464723, "learning_rate": 8.585013326949756e-06, "loss": 0.3152, "step": 2693 }, { "epoch": 1.7053423031262367, "grad_norm": 0.5131441054780455, "learning_rate": 8.578107202076675e-06, "loss": 0.2944, "step": 2694 }, { "epoch": 1.705975464978235, "grad_norm": 0.5580732194365101, "learning_rate": 8.571201769293779e-06, "loss": 0.2823, "step": 2695 }, { "epoch": 1.7066086268302336, "grad_norm": 0.9479166689396746, "learning_rate": 8.564297031962201e-06, "loss": 0.2965, "step": 2696 }, { "epoch": 1.7072417886822318, "grad_norm": 0.5089696345697066, "learning_rate": 8.557392993442755e-06, "loss": 0.3073, "step": 2697 }, { "epoch": 1.7078749505342303, "grad_norm": 0.5154559706124695, "learning_rate": 8.550489657095892e-06, "loss": 0.3074, "step": 2698 }, { "epoch": 1.7085081123862287, "grad_norm": 0.4972655967459355, "learning_rate": 8.543587026281738e-06, "loss": 0.2945, "step": 2699 }, { "epoch": 1.709141274238227, "grad_norm": 2.022282724448696, "learning_rate": 8.536685104360071e-06, "loss": 0.3032, "step": 2700 }, { "epoch": 1.7097744360902256, "grad_norm": 0.5569551601560329, "learning_rate": 8.529783894690322e-06, "loss": 0.2929, "step": 2701 }, { "epoch": 1.710407597942224, "grad_norm": 0.4817609927417, "learning_rate": 8.522883400631574e-06, "loss": 0.2985, "step": 2702 }, { "epoch": 1.7110407597942223, "grad_norm": 0.5142100836316311, "learning_rate": 8.515983625542566e-06, "loss": 0.2902, "step": 2703 }, { "epoch": 1.711673921646221, "grad_norm": 0.4941717748977838, "learning_rate": 8.50908457278169e-06, "loss": 0.3004, "step": 2704 }, { "epoch": 1.7123070834982193, "grad_norm": 0.48185078738867204, "learning_rate": 8.502186245706973e-06, "loss": 0.3222, "step": 2705 }, { "epoch": 1.7129402453502176, "grad_norm": 0.725898890182137, "learning_rate": 8.495288647676105e-06, "loss": 0.2936, "step": 2706 }, { "epoch": 1.7135734072022162, "grad_norm": 0.4697092614161719, "learning_rate": 8.488391782046408e-06, "loss": 0.3036, "step": 2707 }, { "epoch": 1.7142065690542145, "grad_norm": 0.7088261063723792, "learning_rate": 8.481495652174859e-06, "loss": 0.3173, "step": 2708 }, { "epoch": 1.714839730906213, "grad_norm": 0.476746870476688, "learning_rate": 8.474600261418063e-06, "loss": 0.2891, "step": 2709 }, { "epoch": 1.7154728927582115, "grad_norm": 0.7409754544575144, "learning_rate": 8.46770561313228e-06, "loss": 0.3013, "step": 2710 }, { "epoch": 1.7161060546102096, "grad_norm": 0.5037540937786119, "learning_rate": 8.4608117106734e-06, "loss": 0.2981, "step": 2711 }, { "epoch": 1.7167392164622082, "grad_norm": 0.5228397513947896, "learning_rate": 8.453918557396946e-06, "loss": 0.3004, "step": 2712 }, { "epoch": 1.7173723783142065, "grad_norm": 0.5103081647941748, "learning_rate": 8.447026156658093e-06, "loss": 0.299, "step": 2713 }, { "epoch": 1.7180055401662049, "grad_norm": 0.5510416637289044, "learning_rate": 8.44013451181163e-06, "loss": 0.2834, "step": 2714 }, { "epoch": 1.7186387020182035, "grad_norm": 1.8593444254890714, "learning_rate": 8.433243626211992e-06, "loss": 0.3015, "step": 2715 }, { "epoch": 1.7192718638702018, "grad_norm": 0.5028997648589041, "learning_rate": 8.426353503213235e-06, "loss": 0.3066, "step": 2716 }, { "epoch": 1.7199050257222002, "grad_norm": 0.5064816368589454, "learning_rate": 8.419464146169053e-06, "loss": 0.2992, "step": 2717 }, { "epoch": 1.7205381875741987, "grad_norm": 0.5134336626851119, "learning_rate": 8.412575558432759e-06, "loss": 0.3131, "step": 2718 }, { "epoch": 1.721171349426197, "grad_norm": 0.7199627698985737, "learning_rate": 8.405687743357296e-06, "loss": 0.2977, "step": 2719 }, { "epoch": 1.7218045112781954, "grad_norm": 0.5003447728725594, "learning_rate": 8.398800704295227e-06, "loss": 0.2837, "step": 2720 }, { "epoch": 1.722437673130194, "grad_norm": 0.4722267557911499, "learning_rate": 8.391914444598748e-06, "loss": 0.304, "step": 2721 }, { "epoch": 1.7230708349821922, "grad_norm": 0.4824384731113557, "learning_rate": 8.385028967619656e-06, "loss": 0.2896, "step": 2722 }, { "epoch": 1.7237039968341907, "grad_norm": 0.517303965019785, "learning_rate": 8.378144276709388e-06, "loss": 0.2928, "step": 2723 }, { "epoch": 1.7243371586861893, "grad_norm": 0.5311101679673534, "learning_rate": 8.37126037521899e-06, "loss": 0.3024, "step": 2724 }, { "epoch": 1.7249703205381874, "grad_norm": 0.4825291987115084, "learning_rate": 8.364377266499111e-06, "loss": 0.3016, "step": 2725 }, { "epoch": 1.725603482390186, "grad_norm": 0.49015829829414026, "learning_rate": 8.357494953900044e-06, "loss": 0.2929, "step": 2726 }, { "epoch": 1.7262366442421844, "grad_norm": 0.4879027420993279, "learning_rate": 8.350613440771661e-06, "loss": 0.2891, "step": 2727 }, { "epoch": 1.7268698060941827, "grad_norm": 0.5492056639904805, "learning_rate": 8.343732730463469e-06, "loss": 0.2791, "step": 2728 }, { "epoch": 1.7275029679461813, "grad_norm": 0.4901155520217472, "learning_rate": 8.336852826324569e-06, "loss": 0.3086, "step": 2729 }, { "epoch": 1.7281361297981797, "grad_norm": 0.505823686918469, "learning_rate": 8.329973731703685e-06, "loss": 0.297, "step": 2730 }, { "epoch": 1.728769291650178, "grad_norm": 0.4916247883204254, "learning_rate": 8.32309544994913e-06, "loss": 0.3075, "step": 2731 }, { "epoch": 1.7294024535021766, "grad_norm": 0.5087256896388145, "learning_rate": 8.316217984408833e-06, "loss": 0.3049, "step": 2732 }, { "epoch": 1.730035615354175, "grad_norm": 0.5955100636838804, "learning_rate": 8.309341338430324e-06, "loss": 0.2956, "step": 2733 }, { "epoch": 1.7306687772061733, "grad_norm": 0.4790885784579694, "learning_rate": 8.30246551536073e-06, "loss": 0.2928, "step": 2734 }, { "epoch": 1.7313019390581719, "grad_norm": 0.4968496606153347, "learning_rate": 8.295590518546782e-06, "loss": 0.278, "step": 2735 }, { "epoch": 1.73193510091017, "grad_norm": 0.4946188395158144, "learning_rate": 8.288716351334802e-06, "loss": 0.2977, "step": 2736 }, { "epoch": 1.7325682627621686, "grad_norm": 0.47761667210155795, "learning_rate": 8.281843017070723e-06, "loss": 0.2855, "step": 2737 }, { "epoch": 1.7332014246141672, "grad_norm": 0.5629105686117533, "learning_rate": 8.274970519100048e-06, "loss": 0.2948, "step": 2738 }, { "epoch": 1.7338345864661653, "grad_norm": 0.5120536542460793, "learning_rate": 8.268098860767899e-06, "loss": 0.2874, "step": 2739 }, { "epoch": 1.7344677483181639, "grad_norm": 0.5146850946141476, "learning_rate": 8.261228045418972e-06, "loss": 0.299, "step": 2740 }, { "epoch": 1.7351009101701622, "grad_norm": 0.48273369435633673, "learning_rate": 8.254358076397558e-06, "loss": 0.3074, "step": 2741 }, { "epoch": 1.7357340720221606, "grad_norm": 0.5021165300657863, "learning_rate": 8.247488957047542e-06, "loss": 0.2923, "step": 2742 }, { "epoch": 1.7363672338741591, "grad_norm": 0.4824996216209961, "learning_rate": 8.240620690712385e-06, "loss": 0.2904, "step": 2743 }, { "epoch": 1.7370003957261575, "grad_norm": 0.6335010125290406, "learning_rate": 8.23375328073514e-06, "loss": 0.2879, "step": 2744 }, { "epoch": 1.7376335575781559, "grad_norm": 0.4802247921860606, "learning_rate": 8.22688673045844e-06, "loss": 0.3131, "step": 2745 }, { "epoch": 1.7382667194301544, "grad_norm": 0.45732396902410366, "learning_rate": 8.2200210432245e-06, "loss": 0.2973, "step": 2746 }, { "epoch": 1.7388998812821528, "grad_norm": 0.5105776880574181, "learning_rate": 8.213156222375113e-06, "loss": 0.2878, "step": 2747 }, { "epoch": 1.7395330431341511, "grad_norm": 0.5307353628174362, "learning_rate": 8.206292271251659e-06, "loss": 0.2932, "step": 2748 }, { "epoch": 1.7401662049861497, "grad_norm": 0.488075182990819, "learning_rate": 8.199429193195082e-06, "loss": 0.2981, "step": 2749 }, { "epoch": 1.7407993668381478, "grad_norm": 0.5212759078424901, "learning_rate": 8.192566991545915e-06, "loss": 0.2883, "step": 2750 }, { "epoch": 1.7414325286901464, "grad_norm": 0.5125315946438628, "learning_rate": 8.185705669644243e-06, "loss": 0.2946, "step": 2751 }, { "epoch": 1.742065690542145, "grad_norm": 0.5317868972874079, "learning_rate": 8.178845230829751e-06, "loss": 0.2933, "step": 2752 }, { "epoch": 1.7426988523941431, "grad_norm": 0.4890079857204619, "learning_rate": 8.171985678441676e-06, "loss": 0.289, "step": 2753 }, { "epoch": 1.7433320142461417, "grad_norm": 0.5176988108081804, "learning_rate": 8.16512701581882e-06, "loss": 0.2824, "step": 2754 }, { "epoch": 1.74396517609814, "grad_norm": 0.5048332430071022, "learning_rate": 8.158269246299571e-06, "loss": 0.2969, "step": 2755 }, { "epoch": 1.7445983379501384, "grad_norm": 0.5179507853302707, "learning_rate": 8.151412373221857e-06, "loss": 0.2849, "step": 2756 }, { "epoch": 1.745231499802137, "grad_norm": 0.5854536668829404, "learning_rate": 8.144556399923195e-06, "loss": 0.2908, "step": 2757 }, { "epoch": 1.7458646616541353, "grad_norm": 0.538181805024099, "learning_rate": 8.137701329740646e-06, "loss": 0.2937, "step": 2758 }, { "epoch": 1.7464978235061337, "grad_norm": 0.4835924608163045, "learning_rate": 8.130847166010838e-06, "loss": 0.2939, "step": 2759 }, { "epoch": 1.7471309853581323, "grad_norm": 0.6010803515091938, "learning_rate": 8.123993912069958e-06, "loss": 0.289, "step": 2760 }, { "epoch": 1.7477641472101306, "grad_norm": 0.5109657234658643, "learning_rate": 8.11714157125375e-06, "loss": 0.2987, "step": 2761 }, { "epoch": 1.748397309062129, "grad_norm": 0.5350164329273455, "learning_rate": 8.110290146897514e-06, "loss": 0.2879, "step": 2762 }, { "epoch": 1.7490304709141276, "grad_norm": 0.48840777969407206, "learning_rate": 8.103439642336102e-06, "loss": 0.3075, "step": 2763 }, { "epoch": 1.7496636327661257, "grad_norm": 0.5392701672196939, "learning_rate": 8.096590060903921e-06, "loss": 0.2932, "step": 2764 }, { "epoch": 1.7502967946181243, "grad_norm": 0.49785889058012095, "learning_rate": 8.089741405934923e-06, "loss": 0.3056, "step": 2765 }, { "epoch": 1.7509299564701228, "grad_norm": 0.4706828989855841, "learning_rate": 8.082893680762619e-06, "loss": 0.2959, "step": 2766 }, { "epoch": 1.751563118322121, "grad_norm": 0.4822877059202935, "learning_rate": 8.076046888720053e-06, "loss": 0.3006, "step": 2767 }, { "epoch": 1.7521962801741195, "grad_norm": 0.626644933354952, "learning_rate": 8.069201033139834e-06, "loss": 0.295, "step": 2768 }, { "epoch": 1.752829442026118, "grad_norm": 0.4899390899584427, "learning_rate": 8.062356117354095e-06, "loss": 0.2903, "step": 2769 }, { "epoch": 1.7534626038781163, "grad_norm": 0.4965109670163355, "learning_rate": 8.055512144694526e-06, "loss": 0.2949, "step": 2770 }, { "epoch": 1.7540957657301148, "grad_norm": 0.5222806615061127, "learning_rate": 8.04866911849235e-06, "loss": 0.2869, "step": 2771 }, { "epoch": 1.7547289275821132, "grad_norm": 0.5578558846663617, "learning_rate": 8.041827042078336e-06, "loss": 0.3, "step": 2772 }, { "epoch": 1.7553620894341115, "grad_norm": 0.49708329972652443, "learning_rate": 8.034985918782786e-06, "loss": 0.2856, "step": 2773 }, { "epoch": 1.7559952512861101, "grad_norm": 0.49802090505435515, "learning_rate": 8.028145751935537e-06, "loss": 0.3015, "step": 2774 }, { "epoch": 1.7566284131381085, "grad_norm": 0.5305754213666004, "learning_rate": 8.021306544865966e-06, "loss": 0.294, "step": 2775 }, { "epoch": 1.7572615749901068, "grad_norm": 0.5409939767772841, "learning_rate": 8.014468300902976e-06, "loss": 0.2999, "step": 2776 }, { "epoch": 1.7578947368421054, "grad_norm": 0.48791245619484724, "learning_rate": 8.007631023375008e-06, "loss": 0.2838, "step": 2777 }, { "epoch": 1.7585278986941035, "grad_norm": 0.48116964666901657, "learning_rate": 8.000794715610024e-06, "loss": 0.3029, "step": 2778 }, { "epoch": 1.759161060546102, "grad_norm": 0.48888746962248525, "learning_rate": 7.993959380935527e-06, "loss": 0.3137, "step": 2779 }, { "epoch": 1.7597942223981005, "grad_norm": 0.5626000362770531, "learning_rate": 7.98712502267853e-06, "loss": 0.2936, "step": 2780 }, { "epoch": 1.7604273842500988, "grad_norm": 0.4889867688548964, "learning_rate": 7.980291644165582e-06, "loss": 0.2911, "step": 2781 }, { "epoch": 1.7610605461020974, "grad_norm": 0.5047779902557404, "learning_rate": 7.973459248722762e-06, "loss": 0.2901, "step": 2782 }, { "epoch": 1.7616937079540957, "grad_norm": 0.47224084659264437, "learning_rate": 7.966627839675644e-06, "loss": 0.2932, "step": 2783 }, { "epoch": 1.762326869806094, "grad_norm": 0.4804274672030338, "learning_rate": 7.959797420349356e-06, "loss": 0.3036, "step": 2784 }, { "epoch": 1.7629600316580927, "grad_norm": 0.5406972218822158, "learning_rate": 7.952967994068512e-06, "loss": 0.3135, "step": 2785 }, { "epoch": 1.763593193510091, "grad_norm": 0.8697265393856263, "learning_rate": 7.946139564157269e-06, "loss": 0.2878, "step": 2786 }, { "epoch": 1.7642263553620894, "grad_norm": 0.48057959930276806, "learning_rate": 7.93931213393928e-06, "loss": 0.2933, "step": 2787 }, { "epoch": 1.764859517214088, "grad_norm": 0.49847719623843806, "learning_rate": 7.932485706737725e-06, "loss": 0.2958, "step": 2788 }, { "epoch": 1.7654926790660863, "grad_norm": 0.502464758021978, "learning_rate": 7.925660285875285e-06, "loss": 0.2923, "step": 2789 }, { "epoch": 1.7661258409180847, "grad_norm": 0.5185875198437899, "learning_rate": 7.91883587467416e-06, "loss": 0.2801, "step": 2790 }, { "epoch": 1.7667590027700832, "grad_norm": 0.48409075429086107, "learning_rate": 7.912012476456052e-06, "loss": 0.3128, "step": 2791 }, { "epoch": 1.7673921646220814, "grad_norm": 0.570736160913194, "learning_rate": 7.905190094542173e-06, "loss": 0.2957, "step": 2792 }, { "epoch": 1.76802532647408, "grad_norm": 0.5696844524534719, "learning_rate": 7.898368732253243e-06, "loss": 0.2704, "step": 2793 }, { "epoch": 1.7686584883260783, "grad_norm": 0.5010813926386034, "learning_rate": 7.891548392909479e-06, "loss": 0.2913, "step": 2794 }, { "epoch": 1.7692916501780767, "grad_norm": 0.4930380180634304, "learning_rate": 7.884729079830611e-06, "loss": 0.3072, "step": 2795 }, { "epoch": 1.7699248120300752, "grad_norm": 0.5229260404958118, "learning_rate": 7.877910796335849e-06, "loss": 0.2906, "step": 2796 }, { "epoch": 1.7705579738820736, "grad_norm": 0.4824392619807417, "learning_rate": 7.871093545743927e-06, "loss": 0.2825, "step": 2797 }, { "epoch": 1.771191135734072, "grad_norm": 0.5034444374322955, "learning_rate": 7.864277331373057e-06, "loss": 0.2936, "step": 2798 }, { "epoch": 1.7718242975860705, "grad_norm": 0.5023539397808806, "learning_rate": 7.85746215654096e-06, "loss": 0.2837, "step": 2799 }, { "epoch": 1.7724574594380689, "grad_norm": 0.5628491850528401, "learning_rate": 7.85064802456484e-06, "loss": 0.3023, "step": 2800 }, { "epoch": 1.7730906212900672, "grad_norm": 0.49204008575457225, "learning_rate": 7.843834938761398e-06, "loss": 0.2962, "step": 2801 }, { "epoch": 1.7737237831420658, "grad_norm": 0.507391075510825, "learning_rate": 7.837022902446834e-06, "loss": 0.302, "step": 2802 }, { "epoch": 1.7743569449940642, "grad_norm": 0.5405415126184835, "learning_rate": 7.83021191893682e-06, "loss": 0.2922, "step": 2803 }, { "epoch": 1.7749901068460625, "grad_norm": 0.47189849411658263, "learning_rate": 7.82340199154653e-06, "loss": 0.3065, "step": 2804 }, { "epoch": 1.775623268698061, "grad_norm": 0.4960198195939216, "learning_rate": 7.816593123590618e-06, "loss": 0.2971, "step": 2805 }, { "epoch": 1.7762564305500592, "grad_norm": 0.5283791146039991, "learning_rate": 7.809785318383224e-06, "loss": 0.2897, "step": 2806 }, { "epoch": 1.7768895924020578, "grad_norm": 0.5165638681316848, "learning_rate": 7.802978579237966e-06, "loss": 0.3026, "step": 2807 }, { "epoch": 1.7775227542540561, "grad_norm": 0.5492393438725026, "learning_rate": 7.796172909467956e-06, "loss": 0.2722, "step": 2808 }, { "epoch": 1.7781559161060545, "grad_norm": 0.5549138743729279, "learning_rate": 7.789368312385762e-06, "loss": 0.3019, "step": 2809 }, { "epoch": 1.778789077958053, "grad_norm": 0.531505023402321, "learning_rate": 7.782564791303457e-06, "loss": 0.3047, "step": 2810 }, { "epoch": 1.7794222398100514, "grad_norm": 0.5122158285858687, "learning_rate": 7.775762349532576e-06, "loss": 0.298, "step": 2811 }, { "epoch": 1.7800554016620498, "grad_norm": 0.46367728832421357, "learning_rate": 7.76896099038412e-06, "loss": 0.3125, "step": 2812 }, { "epoch": 1.7806885635140484, "grad_norm": 0.4843039762036562, "learning_rate": 7.76216071716859e-06, "loss": 0.2891, "step": 2813 }, { "epoch": 1.7813217253660467, "grad_norm": 0.49892779266645326, "learning_rate": 7.75536153319593e-06, "loss": 0.3112, "step": 2814 }, { "epoch": 1.781954887218045, "grad_norm": 0.4947762173395908, "learning_rate": 7.748563441775568e-06, "loss": 0.299, "step": 2815 }, { "epoch": 1.7825880490700436, "grad_norm": 0.47800590205206006, "learning_rate": 7.741766446216397e-06, "loss": 0.3046, "step": 2816 }, { "epoch": 1.783221210922042, "grad_norm": 0.5418785126637622, "learning_rate": 7.734970549826782e-06, "loss": 0.2953, "step": 2817 }, { "epoch": 1.7838543727740404, "grad_norm": 0.4885073988965421, "learning_rate": 7.728175755914542e-06, "loss": 0.2979, "step": 2818 }, { "epoch": 1.784487534626039, "grad_norm": 0.47293098538775963, "learning_rate": 7.721382067786972e-06, "loss": 0.2849, "step": 2819 }, { "epoch": 1.785120696478037, "grad_norm": 0.5270709427435345, "learning_rate": 7.714589488750818e-06, "loss": 0.3103, "step": 2820 }, { "epoch": 1.7857538583300356, "grad_norm": 0.475545768018705, "learning_rate": 7.707798022112292e-06, "loss": 0.2904, "step": 2821 }, { "epoch": 1.786387020182034, "grad_norm": 0.6756148587036431, "learning_rate": 7.701007671177066e-06, "loss": 0.2942, "step": 2822 }, { "epoch": 1.7870201820340323, "grad_norm": 0.5928320870846046, "learning_rate": 7.694218439250262e-06, "loss": 0.3078, "step": 2823 }, { "epoch": 1.787653343886031, "grad_norm": 0.522085332550572, "learning_rate": 7.687430329636469e-06, "loss": 0.2823, "step": 2824 }, { "epoch": 1.7882865057380293, "grad_norm": 0.48457907961454355, "learning_rate": 7.680643345639709e-06, "loss": 0.2817, "step": 2825 }, { "epoch": 1.7889196675900276, "grad_norm": 0.49260958418894557, "learning_rate": 7.673857490563484e-06, "loss": 0.3107, "step": 2826 }, { "epoch": 1.7895528294420262, "grad_norm": 0.5134358354792504, "learning_rate": 7.667072767710722e-06, "loss": 0.3017, "step": 2827 }, { "epoch": 1.7901859912940246, "grad_norm": 0.5612880602678826, "learning_rate": 7.660289180383815e-06, "loss": 0.2939, "step": 2828 }, { "epoch": 1.790819153146023, "grad_norm": 0.4861484629155393, "learning_rate": 7.653506731884592e-06, "loss": 0.3101, "step": 2829 }, { "epoch": 1.7914523149980215, "grad_norm": 0.52716356491976, "learning_rate": 7.646725425514337e-06, "loss": 0.3013, "step": 2830 }, { "epoch": 1.7920854768500196, "grad_norm": 0.5488146169859704, "learning_rate": 7.639945264573776e-06, "loss": 0.3048, "step": 2831 }, { "epoch": 1.7927186387020182, "grad_norm": 0.4962614856205639, "learning_rate": 7.63316625236307e-06, "loss": 0.3038, "step": 2832 }, { "epoch": 1.7933518005540168, "grad_norm": 0.547931910709016, "learning_rate": 7.626388392181831e-06, "loss": 0.2937, "step": 2833 }, { "epoch": 1.793984962406015, "grad_norm": 0.5089191545585205, "learning_rate": 7.619611687329101e-06, "loss": 0.2962, "step": 2834 }, { "epoch": 1.7946181242580135, "grad_norm": 0.501934170446511, "learning_rate": 7.612836141103368e-06, "loss": 0.3026, "step": 2835 }, { "epoch": 1.7952512861100118, "grad_norm": 0.513779691404734, "learning_rate": 7.606061756802548e-06, "loss": 0.2981, "step": 2836 }, { "epoch": 1.7958844479620102, "grad_norm": 0.5119717082958984, "learning_rate": 7.599288537724004e-06, "loss": 0.293, "step": 2837 }, { "epoch": 1.7965176098140088, "grad_norm": 0.5431770143803639, "learning_rate": 7.592516487164511e-06, "loss": 0.2961, "step": 2838 }, { "epoch": 1.7971507716660071, "grad_norm": 0.519576921860991, "learning_rate": 7.585745608420301e-06, "loss": 0.3077, "step": 2839 }, { "epoch": 1.7977839335180055, "grad_norm": 0.506739915413889, "learning_rate": 7.578975904787012e-06, "loss": 0.3215, "step": 2840 }, { "epoch": 1.798417095370004, "grad_norm": 0.5159959621585232, "learning_rate": 7.572207379559722e-06, "loss": 0.3005, "step": 2841 }, { "epoch": 1.7990502572220024, "grad_norm": 0.47963954569933237, "learning_rate": 7.5654400360329415e-06, "loss": 0.2895, "step": 2842 }, { "epoch": 1.7996834190740008, "grad_norm": 0.489433929337491, "learning_rate": 7.558673877500591e-06, "loss": 0.3143, "step": 2843 }, { "epoch": 1.8003165809259993, "grad_norm": 0.477559418660064, "learning_rate": 7.551908907256024e-06, "loss": 0.3155, "step": 2844 }, { "epoch": 1.8009497427779975, "grad_norm": 0.515054417784757, "learning_rate": 7.545145128592009e-06, "loss": 0.2898, "step": 2845 }, { "epoch": 1.801582904629996, "grad_norm": 0.48570667678954216, "learning_rate": 7.538382544800745e-06, "loss": 0.2924, "step": 2846 }, { "epoch": 1.8022160664819946, "grad_norm": 0.5442460130638463, "learning_rate": 7.5316211591738385e-06, "loss": 0.3022, "step": 2847 }, { "epoch": 1.8028492283339927, "grad_norm": 0.4678355437767317, "learning_rate": 7.524860975002319e-06, "loss": 0.2929, "step": 2848 }, { "epoch": 1.8034823901859913, "grad_norm": 0.5000327589134759, "learning_rate": 7.518101995576628e-06, "loss": 0.3141, "step": 2849 }, { "epoch": 1.8041155520379897, "grad_norm": 0.481584869785803, "learning_rate": 7.511344224186622e-06, "loss": 0.3033, "step": 2850 }, { "epoch": 1.804748713889988, "grad_norm": 0.4896308022525431, "learning_rate": 7.504587664121572e-06, "loss": 0.2927, "step": 2851 }, { "epoch": 1.8053818757419866, "grad_norm": 0.4692050392474647, "learning_rate": 7.497832318670155e-06, "loss": 0.2705, "step": 2852 }, { "epoch": 1.806015037593985, "grad_norm": 0.5182407225338511, "learning_rate": 7.491078191120462e-06, "loss": 0.2792, "step": 2853 }, { "epoch": 1.8066481994459833, "grad_norm": 0.5104902208400345, "learning_rate": 7.484325284759979e-06, "loss": 0.2846, "step": 2854 }, { "epoch": 1.807281361297982, "grad_norm": 0.4940173911015563, "learning_rate": 7.4775736028756186e-06, "loss": 0.2926, "step": 2855 }, { "epoch": 1.8079145231499802, "grad_norm": 0.49705492789779776, "learning_rate": 7.470823148753674e-06, "loss": 0.2965, "step": 2856 }, { "epoch": 1.8085476850019786, "grad_norm": 0.49255722062348833, "learning_rate": 7.46407392567986e-06, "loss": 0.3033, "step": 2857 }, { "epoch": 1.8091808468539772, "grad_norm": 0.4882545467421992, "learning_rate": 7.4573259369392806e-06, "loss": 0.294, "step": 2858 }, { "epoch": 1.8098140087059753, "grad_norm": 0.505464291128993, "learning_rate": 7.4505791858164445e-06, "loss": 0.2977, "step": 2859 }, { "epoch": 1.8104471705579739, "grad_norm": 0.5197770566965518, "learning_rate": 7.443833675595254e-06, "loss": 0.2832, "step": 2860 }, { "epoch": 1.8110803324099725, "grad_norm": 0.5104284035874076, "learning_rate": 7.437089409559012e-06, "loss": 0.2934, "step": 2861 }, { "epoch": 1.8117134942619706, "grad_norm": 0.4988136674389589, "learning_rate": 7.430346390990414e-06, "loss": 0.3048, "step": 2862 }, { "epoch": 1.8123466561139692, "grad_norm": 0.6625250852298367, "learning_rate": 7.423604623171544e-06, "loss": 0.2874, "step": 2863 }, { "epoch": 1.8129798179659675, "grad_norm": 0.4862626804839132, "learning_rate": 7.416864109383886e-06, "loss": 0.2955, "step": 2864 }, { "epoch": 1.8136129798179659, "grad_norm": 0.49134623136851024, "learning_rate": 7.410124852908305e-06, "loss": 0.3056, "step": 2865 }, { "epoch": 1.8142461416699645, "grad_norm": 0.49255624267839787, "learning_rate": 7.403386857025061e-06, "loss": 0.2903, "step": 2866 }, { "epoch": 1.8148793035219628, "grad_norm": 0.6982684500478253, "learning_rate": 7.396650125013789e-06, "loss": 0.3007, "step": 2867 }, { "epoch": 1.8155124653739612, "grad_norm": 0.4968812968656214, "learning_rate": 7.389914660153528e-06, "loss": 0.28, "step": 2868 }, { "epoch": 1.8161456272259597, "grad_norm": 0.5348133427315775, "learning_rate": 7.383180465722679e-06, "loss": 0.3166, "step": 2869 }, { "epoch": 1.816778789077958, "grad_norm": 0.5007178823316623, "learning_rate": 7.376447544999039e-06, "loss": 0.2899, "step": 2870 }, { "epoch": 1.8174119509299564, "grad_norm": 0.5025668246232424, "learning_rate": 7.369715901259787e-06, "loss": 0.2897, "step": 2871 }, { "epoch": 1.818045112781955, "grad_norm": 0.5323722536147965, "learning_rate": 7.362985537781462e-06, "loss": 0.2901, "step": 2872 }, { "epoch": 1.8186782746339532, "grad_norm": 0.4853042265496475, "learning_rate": 7.356256457840001e-06, "loss": 0.2788, "step": 2873 }, { "epoch": 1.8193114364859517, "grad_norm": 0.5394487553510539, "learning_rate": 7.349528664710703e-06, "loss": 0.2858, "step": 2874 }, { "epoch": 1.8199445983379503, "grad_norm": 0.5197180043867436, "learning_rate": 7.342802161668249e-06, "loss": 0.2964, "step": 2875 }, { "epoch": 1.8205777601899484, "grad_norm": 0.5414762938247014, "learning_rate": 7.336076951986682e-06, "loss": 0.3048, "step": 2876 }, { "epoch": 1.821210922041947, "grad_norm": 0.572956773463285, "learning_rate": 7.329353038939429e-06, "loss": 0.3042, "step": 2877 }, { "epoch": 1.8218440838939454, "grad_norm": 0.4954031326631155, "learning_rate": 7.322630425799271e-06, "loss": 0.2833, "step": 2878 }, { "epoch": 1.8224772457459437, "grad_norm": 0.4414857995796335, "learning_rate": 7.315909115838367e-06, "loss": 0.2868, "step": 2879 }, { "epoch": 1.8231104075979423, "grad_norm": 0.5311052333083538, "learning_rate": 7.3091891123282385e-06, "loss": 0.2996, "step": 2880 }, { "epoch": 1.8237435694499406, "grad_norm": 0.4906696594727194, "learning_rate": 7.302470418539771e-06, "loss": 0.2994, "step": 2881 }, { "epoch": 1.824376731301939, "grad_norm": 0.4873060802135116, "learning_rate": 7.295753037743216e-06, "loss": 0.2852, "step": 2882 }, { "epoch": 1.8250098931539376, "grad_norm": 0.46101387473825123, "learning_rate": 7.289036973208172e-06, "loss": 0.3004, "step": 2883 }, { "epoch": 1.825643055005936, "grad_norm": 0.48344284808137733, "learning_rate": 7.282322228203621e-06, "loss": 0.2962, "step": 2884 }, { "epoch": 1.8262762168579343, "grad_norm": 0.4892542348829714, "learning_rate": 7.275608805997876e-06, "loss": 0.2895, "step": 2885 }, { "epoch": 1.8269093787099329, "grad_norm": 0.5228247762002172, "learning_rate": 7.268896709858629e-06, "loss": 0.2896, "step": 2886 }, { "epoch": 1.827542540561931, "grad_norm": 0.5739605797332784, "learning_rate": 7.262185943052907e-06, "loss": 0.3052, "step": 2887 }, { "epoch": 1.8281757024139296, "grad_norm": 0.4743667578191603, "learning_rate": 7.25547650884711e-06, "loss": 0.3078, "step": 2888 }, { "epoch": 1.828808864265928, "grad_norm": 0.5253051670573011, "learning_rate": 7.24876841050697e-06, "loss": 0.3032, "step": 2889 }, { "epoch": 1.8294420261179263, "grad_norm": 0.5538947910432338, "learning_rate": 7.2420616512975826e-06, "loss": 0.294, "step": 2890 }, { "epoch": 1.8300751879699249, "grad_norm": 0.4780251500431824, "learning_rate": 7.235356234483386e-06, "loss": 0.2947, "step": 2891 }, { "epoch": 1.8307083498219232, "grad_norm": 0.4889306172893017, "learning_rate": 7.2286521633281644e-06, "loss": 0.2913, "step": 2892 }, { "epoch": 1.8313415116739216, "grad_norm": 0.5501226627504825, "learning_rate": 7.221949441095053e-06, "loss": 0.2946, "step": 2893 }, { "epoch": 1.8319746735259201, "grad_norm": 0.46716531199388234, "learning_rate": 7.215248071046519e-06, "loss": 0.3038, "step": 2894 }, { "epoch": 1.8326078353779185, "grad_norm": 0.4979542407326539, "learning_rate": 7.208548056444385e-06, "loss": 0.3, "step": 2895 }, { "epoch": 1.8332409972299168, "grad_norm": 0.5443255404614814, "learning_rate": 7.201849400549799e-06, "loss": 0.3085, "step": 2896 }, { "epoch": 1.8338741590819154, "grad_norm": 0.4792681008401427, "learning_rate": 7.1951521066232645e-06, "loss": 0.2902, "step": 2897 }, { "epoch": 1.8345073209339138, "grad_norm": 0.4609642189439054, "learning_rate": 7.1884561779246055e-06, "loss": 0.3095, "step": 2898 }, { "epoch": 1.8351404827859121, "grad_norm": 0.5086710881693124, "learning_rate": 7.181761617712991e-06, "loss": 0.298, "step": 2899 }, { "epoch": 1.8357736446379107, "grad_norm": 0.5317689453481118, "learning_rate": 7.1750684292469294e-06, "loss": 0.288, "step": 2900 }, { "epoch": 1.8364068064899088, "grad_norm": 0.4993841484321399, "learning_rate": 7.168376615784247e-06, "loss": 0.2823, "step": 2901 }, { "epoch": 1.8370399683419074, "grad_norm": 0.5604211419823297, "learning_rate": 7.1616861805821105e-06, "loss": 0.3053, "step": 2902 }, { "epoch": 1.8376731301939058, "grad_norm": 0.6058212540422364, "learning_rate": 7.15499712689701e-06, "loss": 0.2953, "step": 2903 }, { "epoch": 1.8383062920459041, "grad_norm": 0.46790300898603604, "learning_rate": 7.148309457984772e-06, "loss": 0.2993, "step": 2904 }, { "epoch": 1.8389394538979027, "grad_norm": 0.5408340448133266, "learning_rate": 7.141623177100539e-06, "loss": 0.2866, "step": 2905 }, { "epoch": 1.839572615749901, "grad_norm": 0.567456743094386, "learning_rate": 7.134938287498785e-06, "loss": 0.2945, "step": 2906 }, { "epoch": 1.8402057776018994, "grad_norm": 0.4805647088935192, "learning_rate": 7.1282547924333e-06, "loss": 0.2837, "step": 2907 }, { "epoch": 1.840838939453898, "grad_norm": 0.472301667658622, "learning_rate": 7.121572695157209e-06, "loss": 0.3027, "step": 2908 }, { "epoch": 1.8414721013058963, "grad_norm": 0.5149774064864513, "learning_rate": 7.1148919989229324e-06, "loss": 0.2848, "step": 2909 }, { "epoch": 1.8421052631578947, "grad_norm": 0.4907370529336978, "learning_rate": 7.108212706982235e-06, "loss": 0.288, "step": 2910 }, { "epoch": 1.8427384250098933, "grad_norm": 0.5206052984932173, "learning_rate": 7.101534822586187e-06, "loss": 0.285, "step": 2911 }, { "epoch": 1.8433715868618916, "grad_norm": 0.5407062543515547, "learning_rate": 7.094858348985161e-06, "loss": 0.2772, "step": 2912 }, { "epoch": 1.84400474871389, "grad_norm": 0.4935860100003553, "learning_rate": 7.088183289428868e-06, "loss": 0.2913, "step": 2913 }, { "epoch": 1.8446379105658885, "grad_norm": 0.49005863017193796, "learning_rate": 7.081509647166309e-06, "loss": 0.3031, "step": 2914 }, { "epoch": 1.8452710724178867, "grad_norm": 0.5219694766221394, "learning_rate": 7.074837425445809e-06, "loss": 0.2996, "step": 2915 }, { "epoch": 1.8459042342698853, "grad_norm": 0.4628873813101813, "learning_rate": 7.06816662751499e-06, "loss": 0.2918, "step": 2916 }, { "epoch": 1.8465373961218836, "grad_norm": 0.5107454597468507, "learning_rate": 7.061497256620793e-06, "loss": 0.2734, "step": 2917 }, { "epoch": 1.847170557973882, "grad_norm": 0.510435346861972, "learning_rate": 7.054829316009455e-06, "loss": 0.278, "step": 2918 }, { "epoch": 1.8478037198258805, "grad_norm": 0.49897743475827055, "learning_rate": 7.048162808926522e-06, "loss": 0.3023, "step": 2919 }, { "epoch": 1.848436881677879, "grad_norm": 0.5007552339229829, "learning_rate": 7.041497738616842e-06, "loss": 0.2858, "step": 2920 }, { "epoch": 1.8490700435298772, "grad_norm": 0.5212823366554409, "learning_rate": 7.034834108324557e-06, "loss": 0.2788, "step": 2921 }, { "epoch": 1.8497032053818758, "grad_norm": 0.5252594449914827, "learning_rate": 7.028171921293122e-06, "loss": 0.3022, "step": 2922 }, { "epoch": 1.8503363672338742, "grad_norm": 0.47446118604586457, "learning_rate": 7.021511180765272e-06, "loss": 0.287, "step": 2923 }, { "epoch": 1.8509695290858725, "grad_norm": 0.5632283417755006, "learning_rate": 7.014851889983058e-06, "loss": 0.2879, "step": 2924 }, { "epoch": 1.851602690937871, "grad_norm": 0.517266487096142, "learning_rate": 7.0081940521877975e-06, "loss": 0.2851, "step": 2925 }, { "epoch": 1.8522358527898695, "grad_norm": 0.5152596122950329, "learning_rate": 7.001537670620135e-06, "loss": 0.2836, "step": 2926 }, { "epoch": 1.8528690146418678, "grad_norm": 0.5101156434470101, "learning_rate": 6.994882748519978e-06, "loss": 0.3014, "step": 2927 }, { "epoch": 1.8535021764938664, "grad_norm": 0.5273371523687046, "learning_rate": 6.988229289126533e-06, "loss": 0.2934, "step": 2928 }, { "epoch": 1.8541353383458645, "grad_norm": 0.5359428683472941, "learning_rate": 6.981577295678308e-06, "loss": 0.2919, "step": 2929 }, { "epoch": 1.854768500197863, "grad_norm": 0.507657361478674, "learning_rate": 6.974926771413072e-06, "loss": 0.2884, "step": 2930 }, { "epoch": 1.8554016620498615, "grad_norm": 0.4684401608570081, "learning_rate": 6.9682777195679e-06, "loss": 0.293, "step": 2931 }, { "epoch": 1.8560348239018598, "grad_norm": 0.5071122677799285, "learning_rate": 6.961630143379138e-06, "loss": 0.2939, "step": 2932 }, { "epoch": 1.8566679857538584, "grad_norm": 0.48942536329051195, "learning_rate": 6.9549840460824225e-06, "loss": 0.2846, "step": 2933 }, { "epoch": 1.8573011476058567, "grad_norm": 0.4817048095288351, "learning_rate": 6.94833943091266e-06, "loss": 0.2914, "step": 2934 }, { "epoch": 1.857934309457855, "grad_norm": 0.5108552844050132, "learning_rate": 6.94169630110405e-06, "loss": 0.2895, "step": 2935 }, { "epoch": 1.8585674713098537, "grad_norm": 0.5192912173923017, "learning_rate": 6.935054659890053e-06, "loss": 0.2947, "step": 2936 }, { "epoch": 1.859200633161852, "grad_norm": 0.5162252556417952, "learning_rate": 6.928414510503419e-06, "loss": 0.2964, "step": 2937 }, { "epoch": 1.8598337950138504, "grad_norm": 0.5016247458050843, "learning_rate": 6.921775856176154e-06, "loss": 0.2949, "step": 2938 }, { "epoch": 1.860466956865849, "grad_norm": 0.48229561426272843, "learning_rate": 6.915138700139558e-06, "loss": 0.295, "step": 2939 }, { "epoch": 1.861100118717847, "grad_norm": 0.49171660343732554, "learning_rate": 6.9085030456241955e-06, "loss": 0.3054, "step": 2940 }, { "epoch": 1.8617332805698457, "grad_norm": 0.4884299055561689, "learning_rate": 6.90186889585988e-06, "loss": 0.2887, "step": 2941 }, { "epoch": 1.8623664424218442, "grad_norm": 0.5004435249986317, "learning_rate": 6.895236254075726e-06, "loss": 0.2775, "step": 2942 }, { "epoch": 1.8629996042738424, "grad_norm": 0.5396187824896082, "learning_rate": 6.888605123500083e-06, "loss": 0.2955, "step": 2943 }, { "epoch": 1.863632766125841, "grad_norm": 0.4806192734726037, "learning_rate": 6.881975507360587e-06, "loss": 0.2926, "step": 2944 }, { "epoch": 1.8642659279778393, "grad_norm": 0.4835338893396246, "learning_rate": 6.875347408884122e-06, "loss": 0.2944, "step": 2945 }, { "epoch": 1.8648990898298377, "grad_norm": 0.5308072962964233, "learning_rate": 6.868720831296846e-06, "loss": 0.3004, "step": 2946 }, { "epoch": 1.8655322516818362, "grad_norm": 0.787678749003492, "learning_rate": 6.862095777824165e-06, "loss": 0.2845, "step": 2947 }, { "epoch": 1.8661654135338346, "grad_norm": 0.49943056501676664, "learning_rate": 6.855472251690751e-06, "loss": 0.2713, "step": 2948 }, { "epoch": 1.866798575385833, "grad_norm": 0.4926249852906571, "learning_rate": 6.848850256120534e-06, "loss": 0.2926, "step": 2949 }, { "epoch": 1.8674317372378315, "grad_norm": 0.4756903929457278, "learning_rate": 6.842229794336688e-06, "loss": 0.2925, "step": 2950 }, { "epoch": 1.8680648990898299, "grad_norm": 0.4859906634525945, "learning_rate": 6.835610869561653e-06, "loss": 0.2919, "step": 2951 }, { "epoch": 1.8686980609418282, "grad_norm": 0.507848810722276, "learning_rate": 6.828993485017111e-06, "loss": 0.2954, "step": 2952 }, { "epoch": 1.8693312227938268, "grad_norm": 0.48465411193783725, "learning_rate": 6.8223776439240075e-06, "loss": 0.2907, "step": 2953 }, { "epoch": 1.869964384645825, "grad_norm": 0.5684427010101877, "learning_rate": 6.815763349502514e-06, "loss": 0.2936, "step": 2954 }, { "epoch": 1.8705975464978235, "grad_norm": 0.526704051397893, "learning_rate": 6.809150604972079e-06, "loss": 0.2843, "step": 2955 }, { "epoch": 1.871230708349822, "grad_norm": 0.4674502736419178, "learning_rate": 6.802539413551368e-06, "loss": 0.2922, "step": 2956 }, { "epoch": 1.8718638702018202, "grad_norm": 0.5066624888927391, "learning_rate": 6.79592977845831e-06, "loss": 0.2999, "step": 2957 }, { "epoch": 1.8724970320538188, "grad_norm": 0.5240348319985704, "learning_rate": 6.789321702910068e-06, "loss": 0.3005, "step": 2958 }, { "epoch": 1.8731301939058171, "grad_norm": 0.483549516548595, "learning_rate": 6.782715190123046e-06, "loss": 0.2884, "step": 2959 }, { "epoch": 1.8737633557578155, "grad_norm": 0.49361731564213823, "learning_rate": 6.776110243312894e-06, "loss": 0.3027, "step": 2960 }, { "epoch": 1.874396517609814, "grad_norm": 0.5690861203105568, "learning_rate": 6.769506865694492e-06, "loss": 0.2884, "step": 2961 }, { "epoch": 1.8750296794618124, "grad_norm": 0.4837272773584176, "learning_rate": 6.7629050604819594e-06, "loss": 0.2816, "step": 2962 }, { "epoch": 1.8756628413138108, "grad_norm": 0.4944185647561937, "learning_rate": 6.756304830888649e-06, "loss": 0.2886, "step": 2963 }, { "epoch": 1.8762960031658094, "grad_norm": 0.5874378980266073, "learning_rate": 6.749706180127149e-06, "loss": 0.2988, "step": 2964 }, { "epoch": 1.8769291650178077, "grad_norm": 0.47644549919302825, "learning_rate": 6.7431091114092766e-06, "loss": 0.2928, "step": 2965 }, { "epoch": 1.877562326869806, "grad_norm": 0.4848563333873003, "learning_rate": 6.736513627946084e-06, "loss": 0.2817, "step": 2966 }, { "epoch": 1.8781954887218046, "grad_norm": 0.5672689839472054, "learning_rate": 6.729919732947838e-06, "loss": 0.2907, "step": 2967 }, { "epoch": 1.8788286505738028, "grad_norm": 0.5851853608350981, "learning_rate": 6.723327429624052e-06, "loss": 0.2935, "step": 2968 }, { "epoch": 1.8794618124258013, "grad_norm": 0.9308815714401991, "learning_rate": 6.7167367211834564e-06, "loss": 0.283, "step": 2969 }, { "epoch": 1.8800949742778, "grad_norm": 0.48110432051798185, "learning_rate": 6.710147610833994e-06, "loss": 0.2924, "step": 2970 }, { "epoch": 1.880728136129798, "grad_norm": 0.5234274009828539, "learning_rate": 6.703560101782847e-06, "loss": 0.2922, "step": 2971 }, { "epoch": 1.8813612979817966, "grad_norm": 0.5011310188754544, "learning_rate": 6.696974197236404e-06, "loss": 0.288, "step": 2972 }, { "epoch": 1.881994459833795, "grad_norm": 0.5142111341798532, "learning_rate": 6.690389900400287e-06, "loss": 0.2871, "step": 2973 }, { "epoch": 1.8826276216857933, "grad_norm": 0.49138124792036403, "learning_rate": 6.683807214479323e-06, "loss": 0.2852, "step": 2974 }, { "epoch": 1.883260783537792, "grad_norm": 0.452109164875722, "learning_rate": 6.677226142677562e-06, "loss": 0.2875, "step": 2975 }, { "epoch": 1.8838939453897903, "grad_norm": 0.5205726329931555, "learning_rate": 6.670646688198263e-06, "loss": 0.2998, "step": 2976 }, { "epoch": 1.8845271072417886, "grad_norm": 0.7592532298970365, "learning_rate": 6.664068854243905e-06, "loss": 0.3004, "step": 2977 }, { "epoch": 1.8851602690937872, "grad_norm": 0.49360541377508094, "learning_rate": 6.657492644016169e-06, "loss": 0.2974, "step": 2978 }, { "epoch": 1.8857934309457856, "grad_norm": 0.48856259880529734, "learning_rate": 6.650918060715954e-06, "loss": 0.2818, "step": 2979 }, { "epoch": 1.886426592797784, "grad_norm": 0.5652552655509276, "learning_rate": 6.644345107543366e-06, "loss": 0.2992, "step": 2980 }, { "epoch": 1.8870597546497825, "grad_norm": 0.500577452166533, "learning_rate": 6.637773787697708e-06, "loss": 0.3066, "step": 2981 }, { "epoch": 1.8876929165017806, "grad_norm": 0.5264342477119236, "learning_rate": 6.631204104377506e-06, "loss": 0.3025, "step": 2982 }, { "epoch": 1.8883260783537792, "grad_norm": 0.5134510157099833, "learning_rate": 6.624636060780467e-06, "loss": 0.309, "step": 2983 }, { "epoch": 1.8889592402057778, "grad_norm": 0.48422501100375703, "learning_rate": 6.618069660103521e-06, "loss": 0.2971, "step": 2984 }, { "epoch": 1.889592402057776, "grad_norm": 0.47734562409197306, "learning_rate": 6.6115049055427826e-06, "loss": 0.2927, "step": 2985 }, { "epoch": 1.8902255639097745, "grad_norm": 0.4808011507550808, "learning_rate": 6.6049418002935775e-06, "loss": 0.2927, "step": 2986 }, { "epoch": 1.8908587257617728, "grad_norm": 0.48056876432020174, "learning_rate": 6.5983803475504174e-06, "loss": 0.2899, "step": 2987 }, { "epoch": 1.8914918876137712, "grad_norm": 0.5569175102644183, "learning_rate": 6.591820550507016e-06, "loss": 0.2808, "step": 2988 }, { "epoch": 1.8921250494657698, "grad_norm": 0.4868255589807058, "learning_rate": 6.585262412356284e-06, "loss": 0.2719, "step": 2989 }, { "epoch": 1.8927582113177681, "grad_norm": 0.49790839907899304, "learning_rate": 6.578705936290315e-06, "loss": 0.2726, "step": 2990 }, { "epoch": 1.8933913731697665, "grad_norm": 0.4782235988903479, "learning_rate": 6.572151125500403e-06, "loss": 0.2869, "step": 2991 }, { "epoch": 1.894024535021765, "grad_norm": 0.5118551400118899, "learning_rate": 6.565597983177028e-06, "loss": 0.2954, "step": 2992 }, { "epoch": 1.8946576968737634, "grad_norm": 0.480689227470801, "learning_rate": 6.559046512509859e-06, "loss": 0.3012, "step": 2993 }, { "epoch": 1.8952908587257618, "grad_norm": 0.47859059873717075, "learning_rate": 6.5524967166877415e-06, "loss": 0.2946, "step": 2994 }, { "epoch": 1.8959240205777603, "grad_norm": 0.46325053482566064, "learning_rate": 6.545948598898727e-06, "loss": 0.2983, "step": 2995 }, { "epoch": 1.8965571824297585, "grad_norm": 0.4925291919234712, "learning_rate": 6.5394021623300265e-06, "loss": 0.2878, "step": 2996 }, { "epoch": 1.897190344281757, "grad_norm": 0.5010144526644932, "learning_rate": 6.5328574101680455e-06, "loss": 0.2872, "step": 2997 }, { "epoch": 1.8978235061337554, "grad_norm": 0.48304131362705516, "learning_rate": 6.526314345598377e-06, "loss": 0.282, "step": 2998 }, { "epoch": 1.8984566679857537, "grad_norm": 0.49870338045775114, "learning_rate": 6.519772971805773e-06, "loss": 0.2977, "step": 2999 }, { "epoch": 1.8990898298377523, "grad_norm": 0.6279727267914506, "learning_rate": 6.513233291974179e-06, "loss": 0.2864, "step": 3000 }, { "epoch": 1.8997229916897507, "grad_norm": 0.48767229262524664, "learning_rate": 6.506695309286706e-06, "loss": 0.2981, "step": 3001 }, { "epoch": 1.900356153541749, "grad_norm": 0.48639266444245016, "learning_rate": 6.5001590269256455e-06, "loss": 0.2845, "step": 3002 }, { "epoch": 1.9009893153937476, "grad_norm": 0.5089385645755086, "learning_rate": 6.4936244480724575e-06, "loss": 0.2995, "step": 3003 }, { "epoch": 1.901622477245746, "grad_norm": 0.5364573194320085, "learning_rate": 6.487091575907776e-06, "loss": 0.2794, "step": 3004 }, { "epoch": 1.9022556390977443, "grad_norm": 0.5207919465196396, "learning_rate": 6.480560413611397e-06, "loss": 0.2989, "step": 3005 }, { "epoch": 1.9028888009497429, "grad_norm": 0.4920600113295705, "learning_rate": 6.474030964362299e-06, "loss": 0.2857, "step": 3006 }, { "epoch": 1.9035219628017412, "grad_norm": 0.4946952082765538, "learning_rate": 6.4675032313386055e-06, "loss": 0.2758, "step": 3007 }, { "epoch": 1.9041551246537396, "grad_norm": 0.4859450015378218, "learning_rate": 6.460977217717625e-06, "loss": 0.2971, "step": 3008 }, { "epoch": 1.9047882865057382, "grad_norm": 0.7358520471982651, "learning_rate": 6.4544529266758225e-06, "loss": 0.2796, "step": 3009 }, { "epoch": 1.9054214483577363, "grad_norm": 0.5215250367731078, "learning_rate": 6.447930361388812e-06, "loss": 0.2931, "step": 3010 }, { "epoch": 1.9060546102097349, "grad_norm": 0.6050320469030894, "learning_rate": 6.44140952503139e-06, "loss": 0.2933, "step": 3011 }, { "epoch": 1.9066877720617332, "grad_norm": 0.4976067263953108, "learning_rate": 6.434890420777491e-06, "loss": 0.303, "step": 3012 }, { "epoch": 1.9073209339137316, "grad_norm": 0.4863159611631033, "learning_rate": 6.428373051800221e-06, "loss": 0.2801, "step": 3013 }, { "epoch": 1.9079540957657302, "grad_norm": 0.4829084688983045, "learning_rate": 6.421857421271829e-06, "loss": 0.2807, "step": 3014 }, { "epoch": 1.9085872576177285, "grad_norm": 0.4760767076027145, "learning_rate": 6.4153435323637305e-06, "loss": 0.2917, "step": 3015 }, { "epoch": 1.9092204194697269, "grad_norm": 0.48302117561632407, "learning_rate": 6.408831388246482e-06, "loss": 0.3046, "step": 3016 }, { "epoch": 1.9098535813217254, "grad_norm": 0.49504565038815707, "learning_rate": 6.402320992089799e-06, "loss": 0.2867, "step": 3017 }, { "epoch": 1.9104867431737238, "grad_norm": 0.4954554662079657, "learning_rate": 6.395812347062542e-06, "loss": 0.2923, "step": 3018 }, { "epoch": 1.9111199050257222, "grad_norm": 0.4925098741190485, "learning_rate": 6.38930545633272e-06, "loss": 0.2866, "step": 3019 }, { "epoch": 1.9117530668777207, "grad_norm": 0.4815531062287046, "learning_rate": 6.382800323067489e-06, "loss": 0.2792, "step": 3020 }, { "epoch": 1.912386228729719, "grad_norm": 0.49626870076078666, "learning_rate": 6.376296950433145e-06, "loss": 0.2918, "step": 3021 }, { "epoch": 1.9130193905817174, "grad_norm": 0.4927891869237997, "learning_rate": 6.36979534159514e-06, "loss": 0.2913, "step": 3022 }, { "epoch": 1.913652552433716, "grad_norm": 0.48529352034576795, "learning_rate": 6.3632954997180455e-06, "loss": 0.2945, "step": 3023 }, { "epoch": 1.9142857142857141, "grad_norm": 0.44844957646835176, "learning_rate": 6.356797427965599e-06, "loss": 0.2808, "step": 3024 }, { "epoch": 1.9149188761377127, "grad_norm": 0.4761089823530423, "learning_rate": 6.350301129500651e-06, "loss": 0.3056, "step": 3025 }, { "epoch": 1.915552037989711, "grad_norm": 0.4537724134870015, "learning_rate": 6.34380660748521e-06, "loss": 0.2979, "step": 3026 }, { "epoch": 1.9161851998417094, "grad_norm": 0.5557295982209598, "learning_rate": 6.3373138650804055e-06, "loss": 0.3068, "step": 3027 }, { "epoch": 1.916818361693708, "grad_norm": 0.47947944570858503, "learning_rate": 6.3308229054465075e-06, "loss": 0.2851, "step": 3028 }, { "epoch": 1.9174515235457064, "grad_norm": 0.9659943891981814, "learning_rate": 6.32433373174292e-06, "loss": 0.3152, "step": 3029 }, { "epoch": 1.9180846853977047, "grad_norm": 0.4558304602899076, "learning_rate": 6.31784634712817e-06, "loss": 0.3055, "step": 3030 }, { "epoch": 1.9187178472497033, "grad_norm": 0.47527818681807144, "learning_rate": 6.311360754759923e-06, "loss": 0.2871, "step": 3031 }, { "epoch": 1.9193510091017016, "grad_norm": 0.9329705177656343, "learning_rate": 6.304876957794963e-06, "loss": 0.3, "step": 3032 }, { "epoch": 1.9199841709537, "grad_norm": 0.4622010628364194, "learning_rate": 6.298394959389209e-06, "loss": 0.2908, "step": 3033 }, { "epoch": 1.9206173328056986, "grad_norm": 0.5391329713848786, "learning_rate": 6.291914762697695e-06, "loss": 0.2959, "step": 3034 }, { "epoch": 1.921250494657697, "grad_norm": 0.4665768557210641, "learning_rate": 6.285436370874592e-06, "loss": 0.2932, "step": 3035 }, { "epoch": 1.9218836565096953, "grad_norm": 0.4748918721102723, "learning_rate": 6.27895978707317e-06, "loss": 0.2916, "step": 3036 }, { "epoch": 1.9225168183616939, "grad_norm": 0.49026421061504966, "learning_rate": 6.272485014445844e-06, "loss": 0.2993, "step": 3037 }, { "epoch": 1.923149980213692, "grad_norm": 0.5143395628313862, "learning_rate": 6.266012056144135e-06, "loss": 0.2942, "step": 3038 }, { "epoch": 1.9237831420656906, "grad_norm": 0.5932476279290999, "learning_rate": 6.2595409153186735e-06, "loss": 0.2918, "step": 3039 }, { "epoch": 1.924416303917689, "grad_norm": 0.5088158313655105, "learning_rate": 6.2530715951192265e-06, "loss": 0.2918, "step": 3040 }, { "epoch": 1.9250494657696873, "grad_norm": 0.585554696270967, "learning_rate": 6.246604098694651e-06, "loss": 0.2856, "step": 3041 }, { "epoch": 1.9256826276216858, "grad_norm": 1.5984311294086968, "learning_rate": 6.240138429192935e-06, "loss": 0.285, "step": 3042 }, { "epoch": 1.9263157894736842, "grad_norm": 0.5072495731428683, "learning_rate": 6.233674589761163e-06, "loss": 0.2899, "step": 3043 }, { "epoch": 1.9269489513256826, "grad_norm": 0.5032646856457613, "learning_rate": 6.227212583545543e-06, "loss": 0.2844, "step": 3044 }, { "epoch": 1.9275821131776811, "grad_norm": 0.5595398631416102, "learning_rate": 6.220752413691377e-06, "loss": 0.2759, "step": 3045 }, { "epoch": 1.9282152750296795, "grad_norm": 1.304427712917841, "learning_rate": 6.214294083343083e-06, "loss": 0.2922, "step": 3046 }, { "epoch": 1.9288484368816778, "grad_norm": 0.5055270395886119, "learning_rate": 6.207837595644177e-06, "loss": 0.2873, "step": 3047 }, { "epoch": 1.9294815987336764, "grad_norm": 0.5167642109205753, "learning_rate": 6.201382953737284e-06, "loss": 0.294, "step": 3048 }, { "epoch": 1.9301147605856745, "grad_norm": 0.48840746170279536, "learning_rate": 6.194930160764128e-06, "loss": 0.2839, "step": 3049 }, { "epoch": 1.9307479224376731, "grad_norm": 0.5075719745925344, "learning_rate": 6.188479219865529e-06, "loss": 0.28, "step": 3050 }, { "epoch": 1.9313810842896717, "grad_norm": 0.49407579059102197, "learning_rate": 6.182030134181416e-06, "loss": 0.3013, "step": 3051 }, { "epoch": 1.9320142461416698, "grad_norm": 0.525246679862127, "learning_rate": 6.175582906850797e-06, "loss": 0.2854, "step": 3052 }, { "epoch": 1.9326474079936684, "grad_norm": 0.4688264579174492, "learning_rate": 6.169137541011801e-06, "loss": 0.2831, "step": 3053 }, { "epoch": 1.9332805698456668, "grad_norm": 0.5180253391523882, "learning_rate": 6.162694039801625e-06, "loss": 0.2898, "step": 3054 }, { "epoch": 1.9339137316976651, "grad_norm": 0.518769408060389, "learning_rate": 6.156252406356576e-06, "loss": 0.3009, "step": 3055 }, { "epoch": 1.9345468935496637, "grad_norm": 0.5409523400023079, "learning_rate": 6.149812643812042e-06, "loss": 0.3043, "step": 3056 }, { "epoch": 1.935180055401662, "grad_norm": 0.5060861794485431, "learning_rate": 6.143374755302507e-06, "loss": 0.2888, "step": 3057 }, { "epoch": 1.9358132172536604, "grad_norm": 0.5042843885983861, "learning_rate": 6.136938743961543e-06, "loss": 0.2877, "step": 3058 }, { "epoch": 1.936446379105659, "grad_norm": 0.525656198639174, "learning_rate": 6.130504612921798e-06, "loss": 0.3033, "step": 3059 }, { "epoch": 1.9370795409576573, "grad_norm": 0.47778140052031737, "learning_rate": 6.12407236531502e-06, "loss": 0.2874, "step": 3060 }, { "epoch": 1.9377127028096557, "grad_norm": 0.48670457092904595, "learning_rate": 6.117642004272026e-06, "loss": 0.2824, "step": 3061 }, { "epoch": 1.9383458646616543, "grad_norm": 0.4756182682300285, "learning_rate": 6.1112135329227285e-06, "loss": 0.2792, "step": 3062 }, { "epoch": 1.9389790265136524, "grad_norm": 0.4875069987110465, "learning_rate": 6.104786954396105e-06, "loss": 0.3056, "step": 3063 }, { "epoch": 1.939612188365651, "grad_norm": 0.48839054173783825, "learning_rate": 6.0983622718202286e-06, "loss": 0.2848, "step": 3064 }, { "epoch": 1.9402453502176495, "grad_norm": 0.4888849792049703, "learning_rate": 6.0919394883222284e-06, "loss": 0.2825, "step": 3065 }, { "epoch": 1.9408785120696477, "grad_norm": 0.4975025493180563, "learning_rate": 6.085518607028331e-06, "loss": 0.2828, "step": 3066 }, { "epoch": 1.9415116739216463, "grad_norm": 0.5012960337449449, "learning_rate": 6.07909963106383e-06, "loss": 0.2911, "step": 3067 }, { "epoch": 1.9421448357736446, "grad_norm": 0.6818539633718428, "learning_rate": 6.072682563553076e-06, "loss": 0.2696, "step": 3068 }, { "epoch": 1.942777997625643, "grad_norm": 0.4854741828835486, "learning_rate": 6.06626740761952e-06, "loss": 0.2799, "step": 3069 }, { "epoch": 1.9434111594776415, "grad_norm": 0.4847976799491527, "learning_rate": 6.059854166385653e-06, "loss": 0.3032, "step": 3070 }, { "epoch": 1.94404432132964, "grad_norm": 0.5548884675259181, "learning_rate": 6.053442842973054e-06, "loss": 0.2853, "step": 3071 }, { "epoch": 1.9446774831816382, "grad_norm": 0.4823962969619776, "learning_rate": 6.047033440502359e-06, "loss": 0.2874, "step": 3072 }, { "epoch": 1.9453106450336368, "grad_norm": 0.4597544335961013, "learning_rate": 6.040625962093275e-06, "loss": 0.3081, "step": 3073 }, { "epoch": 1.9459438068856352, "grad_norm": 0.48079655090662, "learning_rate": 6.034220410864566e-06, "loss": 0.2863, "step": 3074 }, { "epoch": 1.9465769687376335, "grad_norm": 0.48265097395436973, "learning_rate": 6.027816789934066e-06, "loss": 0.2825, "step": 3075 }, { "epoch": 1.947210130589632, "grad_norm": 0.5177023799755496, "learning_rate": 6.021415102418659e-06, "loss": 0.2919, "step": 3076 }, { "epoch": 1.9478432924416302, "grad_norm": 0.5686393832688907, "learning_rate": 6.015015351434298e-06, "loss": 0.2776, "step": 3077 }, { "epoch": 1.9484764542936288, "grad_norm": 0.49485511579926866, "learning_rate": 6.008617540095989e-06, "loss": 0.3095, "step": 3078 }, { "epoch": 1.9491096161456274, "grad_norm": 0.6684911256899195, "learning_rate": 6.0022216715177915e-06, "loss": 0.2955, "step": 3079 }, { "epoch": 1.9497427779976255, "grad_norm": 0.6019599937311744, "learning_rate": 5.995827748812826e-06, "loss": 0.2783, "step": 3080 }, { "epoch": 1.950375939849624, "grad_norm": 0.4727223433286092, "learning_rate": 5.9894357750932554e-06, "loss": 0.2825, "step": 3081 }, { "epoch": 1.9510091017016225, "grad_norm": 0.5098209172092001, "learning_rate": 5.983045753470308e-06, "loss": 0.2914, "step": 3082 }, { "epoch": 1.9516422635536208, "grad_norm": 0.5012516406749246, "learning_rate": 5.9766576870542485e-06, "loss": 0.2685, "step": 3083 }, { "epoch": 1.9522754254056194, "grad_norm": 0.4881401363554876, "learning_rate": 5.970271578954397e-06, "loss": 0.2885, "step": 3084 }, { "epoch": 1.9529085872576177, "grad_norm": 0.5042602708872271, "learning_rate": 5.963887432279119e-06, "loss": 0.2897, "step": 3085 }, { "epoch": 1.953541749109616, "grad_norm": 0.5153388362482242, "learning_rate": 5.957505250135826e-06, "loss": 0.2874, "step": 3086 }, { "epoch": 1.9541749109616147, "grad_norm": 0.5133092839327783, "learning_rate": 5.951125035630977e-06, "loss": 0.2945, "step": 3087 }, { "epoch": 1.954808072813613, "grad_norm": 0.514008249734469, "learning_rate": 5.944746791870062e-06, "loss": 0.2928, "step": 3088 }, { "epoch": 1.9554412346656114, "grad_norm": 0.504251183370781, "learning_rate": 5.938370521957622e-06, "loss": 0.3072, "step": 3089 }, { "epoch": 1.95607439651761, "grad_norm": 0.5189961476123208, "learning_rate": 5.931996228997234e-06, "loss": 0.293, "step": 3090 }, { "epoch": 1.956707558369608, "grad_norm": 0.48361357989013204, "learning_rate": 5.925623916091514e-06, "loss": 0.293, "step": 3091 }, { "epoch": 1.9573407202216067, "grad_norm": 0.4861394227463543, "learning_rate": 5.919253586342108e-06, "loss": 0.2967, "step": 3092 }, { "epoch": 1.9579738820736052, "grad_norm": 0.5003493388704748, "learning_rate": 5.912885242849711e-06, "loss": 0.2934, "step": 3093 }, { "epoch": 1.9586070439256034, "grad_norm": 0.5117671840699132, "learning_rate": 5.906518888714029e-06, "loss": 0.2809, "step": 3094 }, { "epoch": 1.959240205777602, "grad_norm": 0.4813172612395912, "learning_rate": 5.900154527033826e-06, "loss": 0.264, "step": 3095 }, { "epoch": 1.9598733676296003, "grad_norm": 0.47047172059840126, "learning_rate": 5.893792160906873e-06, "loss": 0.292, "step": 3096 }, { "epoch": 1.9605065294815986, "grad_norm": 0.4586993226528238, "learning_rate": 5.88743179342998e-06, "loss": 0.2981, "step": 3097 }, { "epoch": 1.9611396913335972, "grad_norm": 0.5300656287852303, "learning_rate": 5.881073427698994e-06, "loss": 0.2938, "step": 3098 }, { "epoch": 1.9617728531855956, "grad_norm": 0.5930662187567449, "learning_rate": 5.874717066808766e-06, "loss": 0.304, "step": 3099 }, { "epoch": 1.962406015037594, "grad_norm": 0.5878683132110596, "learning_rate": 5.8683627138531885e-06, "loss": 0.3137, "step": 3100 }, { "epoch": 1.9630391768895925, "grad_norm": 0.49536551703802106, "learning_rate": 5.862010371925166e-06, "loss": 0.2946, "step": 3101 }, { "epoch": 1.9636723387415909, "grad_norm": 0.4562209613774983, "learning_rate": 5.855660044116636e-06, "loss": 0.2899, "step": 3102 }, { "epoch": 1.9643055005935892, "grad_norm": 0.4683801484425886, "learning_rate": 5.849311733518539e-06, "loss": 0.2952, "step": 3103 }, { "epoch": 1.9649386624455878, "grad_norm": 0.49792420088386263, "learning_rate": 5.842965443220852e-06, "loss": 0.292, "step": 3104 }, { "epoch": 1.965571824297586, "grad_norm": 0.5632075318114719, "learning_rate": 5.836621176312553e-06, "loss": 0.2757, "step": 3105 }, { "epoch": 1.9662049861495845, "grad_norm": 0.45073740490803227, "learning_rate": 5.830278935881644e-06, "loss": 0.2801, "step": 3106 }, { "epoch": 1.9668381480015829, "grad_norm": 0.5422812867402417, "learning_rate": 5.823938725015148e-06, "loss": 0.2902, "step": 3107 }, { "epoch": 1.9674713098535812, "grad_norm": 0.4833486947806537, "learning_rate": 5.817600546799074e-06, "loss": 0.2967, "step": 3108 }, { "epoch": 1.9681044717055798, "grad_norm": 0.505998494899373, "learning_rate": 5.811264404318468e-06, "loss": 0.2773, "step": 3109 }, { "epoch": 1.9687376335575781, "grad_norm": 0.4754441627308182, "learning_rate": 5.804930300657377e-06, "loss": 0.2957, "step": 3110 }, { "epoch": 1.9693707954095765, "grad_norm": 0.49327908656860325, "learning_rate": 5.798598238898849e-06, "loss": 0.2723, "step": 3111 }, { "epoch": 1.970003957261575, "grad_norm": 0.538544581933528, "learning_rate": 5.792268222124943e-06, "loss": 0.28, "step": 3112 }, { "epoch": 1.9706371191135734, "grad_norm": 0.4688693084549079, "learning_rate": 5.7859402534167285e-06, "loss": 0.3061, "step": 3113 }, { "epoch": 1.9712702809655718, "grad_norm": 0.4650361191427734, "learning_rate": 5.779614335854269e-06, "loss": 0.2903, "step": 3114 }, { "epoch": 1.9719034428175704, "grad_norm": 0.4806431161417957, "learning_rate": 5.773290472516631e-06, "loss": 0.2982, "step": 3115 }, { "epoch": 1.9725366046695687, "grad_norm": 0.48970326758211513, "learning_rate": 5.7669686664818835e-06, "loss": 0.302, "step": 3116 }, { "epoch": 1.973169766521567, "grad_norm": 0.5976108386743594, "learning_rate": 5.760648920827099e-06, "loss": 0.2818, "step": 3117 }, { "epoch": 1.9738029283735656, "grad_norm": 0.4851924955826684, "learning_rate": 5.754331238628339e-06, "loss": 0.2928, "step": 3118 }, { "epoch": 1.9744360902255638, "grad_norm": 0.45457334353214635, "learning_rate": 5.748015622960663e-06, "loss": 0.2895, "step": 3119 }, { "epoch": 1.9750692520775623, "grad_norm": 0.4959842598654033, "learning_rate": 5.741702076898126e-06, "loss": 0.2852, "step": 3120 }, { "epoch": 1.9757024139295607, "grad_norm": 0.536477850305765, "learning_rate": 5.735390603513769e-06, "loss": 0.3019, "step": 3121 }, { "epoch": 1.976335575781559, "grad_norm": 0.49241883800902597, "learning_rate": 5.7290812058796406e-06, "loss": 0.2959, "step": 3122 }, { "epoch": 1.9769687376335576, "grad_norm": 0.4873334774707114, "learning_rate": 5.722773887066763e-06, "loss": 0.2919, "step": 3123 }, { "epoch": 1.977601899485556, "grad_norm": 0.5021694247157638, "learning_rate": 5.716468650145152e-06, "loss": 0.2965, "step": 3124 }, { "epoch": 1.9782350613375543, "grad_norm": 0.47920118465607336, "learning_rate": 5.710165498183806e-06, "loss": 0.2873, "step": 3125 }, { "epoch": 1.978868223189553, "grad_norm": 0.485869538994702, "learning_rate": 5.703864434250721e-06, "loss": 0.293, "step": 3126 }, { "epoch": 1.9795013850415513, "grad_norm": 0.5024454258641641, "learning_rate": 5.697565461412866e-06, "loss": 0.2967, "step": 3127 }, { "epoch": 1.9801345468935496, "grad_norm": 0.6192220626737207, "learning_rate": 5.691268582736188e-06, "loss": 0.2904, "step": 3128 }, { "epoch": 1.9807677087455482, "grad_norm": 0.5182403401801062, "learning_rate": 5.6849738012856304e-06, "loss": 0.2977, "step": 3129 }, { "epoch": 1.9814008705975465, "grad_norm": 0.5258851615235178, "learning_rate": 5.678681120125102e-06, "loss": 0.2864, "step": 3130 }, { "epoch": 1.982034032449545, "grad_norm": 0.4901896564259341, "learning_rate": 5.672390542317495e-06, "loss": 0.294, "step": 3131 }, { "epoch": 1.9826671943015435, "grad_norm": 0.5072572763197355, "learning_rate": 5.666102070924676e-06, "loss": 0.2985, "step": 3132 }, { "epoch": 1.9833003561535416, "grad_norm": 0.6144481376772083, "learning_rate": 5.6598157090074875e-06, "loss": 0.2874, "step": 3133 }, { "epoch": 1.9839335180055402, "grad_norm": 0.490517687206397, "learning_rate": 5.6535314596257405e-06, "loss": 0.2993, "step": 3134 }, { "epoch": 1.9845666798575385, "grad_norm": 0.5230140107775197, "learning_rate": 5.64724932583823e-06, "loss": 0.2953, "step": 3135 }, { "epoch": 1.985199841709537, "grad_norm": 0.5074415057089711, "learning_rate": 5.640969310702708e-06, "loss": 0.2886, "step": 3136 }, { "epoch": 1.9858330035615355, "grad_norm": 0.49813294438613753, "learning_rate": 5.634691417275901e-06, "loss": 0.2831, "step": 3137 }, { "epoch": 1.9864661654135338, "grad_norm": 0.47544160521571394, "learning_rate": 5.628415648613505e-06, "loss": 0.2999, "step": 3138 }, { "epoch": 1.9870993272655322, "grad_norm": 0.5038452999917699, "learning_rate": 5.6221420077701795e-06, "loss": 0.279, "step": 3139 }, { "epoch": 1.9877324891175308, "grad_norm": 9.796253857940629, "learning_rate": 5.615870497799545e-06, "loss": 0.2901, "step": 3140 }, { "epoch": 1.988365650969529, "grad_norm": 0.5188079276827732, "learning_rate": 5.609601121754184e-06, "loss": 0.2722, "step": 3141 }, { "epoch": 1.9889988128215275, "grad_norm": 0.5775135895891499, "learning_rate": 5.603333882685655e-06, "loss": 0.2907, "step": 3142 }, { "epoch": 1.989631974673526, "grad_norm": 0.5152641549971518, "learning_rate": 5.597068783644457e-06, "loss": 0.3032, "step": 3143 }, { "epoch": 1.9902651365255244, "grad_norm": 0.5043326579650839, "learning_rate": 5.590805827680061e-06, "loss": 0.2842, "step": 3144 }, { "epoch": 1.9908982983775227, "grad_norm": 0.7090459327787975, "learning_rate": 5.584545017840886e-06, "loss": 0.2784, "step": 3145 }, { "epoch": 1.9915314602295213, "grad_norm": 0.4856715039386869, "learning_rate": 5.578286357174305e-06, "loss": 0.2755, "step": 3146 }, { "epoch": 1.9921646220815195, "grad_norm": 0.6486312201069803, "learning_rate": 5.572029848726664e-06, "loss": 0.301, "step": 3147 }, { "epoch": 1.992797783933518, "grad_norm": 0.5127682621896326, "learning_rate": 5.565775495543238e-06, "loss": 0.2715, "step": 3148 }, { "epoch": 1.9934309457855164, "grad_norm": 0.48979140742385885, "learning_rate": 5.559523300668263e-06, "loss": 0.2933, "step": 3149 }, { "epoch": 1.9940641076375147, "grad_norm": 0.4708308532421127, "learning_rate": 5.553273267144923e-06, "loss": 0.2788, "step": 3150 }, { "epoch": 1.9946972694895133, "grad_norm": 0.4655384914223205, "learning_rate": 5.5470253980153555e-06, "loss": 0.2888, "step": 3151 }, { "epoch": 1.9953304313415117, "grad_norm": 0.5212283829273494, "learning_rate": 5.540779696320639e-06, "loss": 0.2878, "step": 3152 }, { "epoch": 1.99596359319351, "grad_norm": 0.5259051760801778, "learning_rate": 5.534536165100794e-06, "loss": 0.2996, "step": 3153 }, { "epoch": 1.9965967550455086, "grad_norm": 0.5043630288557979, "learning_rate": 5.528294807394788e-06, "loss": 0.2843, "step": 3154 }, { "epoch": 1.997229916897507, "grad_norm": 0.47286752150973704, "learning_rate": 5.5220556262405385e-06, "loss": 0.2852, "step": 3155 }, { "epoch": 1.9978630787495053, "grad_norm": 0.5222194728703525, "learning_rate": 5.51581862467489e-06, "loss": 0.3075, "step": 3156 }, { "epoch": 1.9984962406015039, "grad_norm": 0.4745354270643103, "learning_rate": 5.509583805733631e-06, "loss": 0.2761, "step": 3157 }, { "epoch": 1.999129402453502, "grad_norm": 0.5792420868656574, "learning_rate": 5.503351172451499e-06, "loss": 0.2935, "step": 3158 }, { "epoch": 1.9997625643055006, "grad_norm": 0.4750039468758609, "learning_rate": 5.49712072786214e-06, "loss": 0.2784, "step": 3159 }, { "epoch": 2.0, "grad_norm": 0.719673490753275, "learning_rate": 5.490892474998169e-06, "loss": 0.269, "step": 3160 }, { "epoch": 2.0006331618519986, "grad_norm": 0.6143204217988607, "learning_rate": 5.484666416891109e-06, "loss": 0.2679, "step": 3161 }, { "epoch": 2.0012663237039967, "grad_norm": 0.5580744541363075, "learning_rate": 5.478442556571425e-06, "loss": 0.2364, "step": 3162 }, { "epoch": 2.0018994855559953, "grad_norm": 0.48183124919989806, "learning_rate": 5.472220897068505e-06, "loss": 0.2579, "step": 3163 }, { "epoch": 2.002532647407994, "grad_norm": 0.6499092874450255, "learning_rate": 5.4660014414106825e-06, "loss": 0.2585, "step": 3164 }, { "epoch": 2.003165809259992, "grad_norm": 0.4974234394605588, "learning_rate": 5.459784192625199e-06, "loss": 0.2742, "step": 3165 }, { "epoch": 2.0037989711119906, "grad_norm": 0.5657720188149497, "learning_rate": 5.453569153738227e-06, "loss": 0.2567, "step": 3166 }, { "epoch": 2.0044321329639887, "grad_norm": 0.5680399336984316, "learning_rate": 5.447356327774876e-06, "loss": 0.2564, "step": 3167 }, { "epoch": 2.0050652948159873, "grad_norm": 0.5882023192065432, "learning_rate": 5.4411457177591635e-06, "loss": 0.2484, "step": 3168 }, { "epoch": 2.005698456667986, "grad_norm": 0.6528626186065208, "learning_rate": 5.434937326714031e-06, "loss": 0.2498, "step": 3169 }, { "epoch": 2.006331618519984, "grad_norm": 0.5291956933527988, "learning_rate": 5.428731157661342e-06, "loss": 0.2314, "step": 3170 }, { "epoch": 2.0069647803719826, "grad_norm": 0.5323301312995494, "learning_rate": 5.422527213621889e-06, "loss": 0.25, "step": 3171 }, { "epoch": 2.007597942223981, "grad_norm": 0.5084308553207658, "learning_rate": 5.416325497615356e-06, "loss": 0.236, "step": 3172 }, { "epoch": 2.0082311040759793, "grad_norm": 0.5327151706926155, "learning_rate": 5.410126012660368e-06, "loss": 0.247, "step": 3173 }, { "epoch": 2.008864265927978, "grad_norm": 0.4976069416285746, "learning_rate": 5.403928761774453e-06, "loss": 0.2406, "step": 3174 }, { "epoch": 2.0094974277799764, "grad_norm": 0.494474849164022, "learning_rate": 5.397733747974045e-06, "loss": 0.2535, "step": 3175 }, { "epoch": 2.0101305896319746, "grad_norm": 0.48025490932793646, "learning_rate": 5.3915409742745096e-06, "loss": 0.2421, "step": 3176 }, { "epoch": 2.010763751483973, "grad_norm": 0.49633305215259527, "learning_rate": 5.385350443690101e-06, "loss": 0.2545, "step": 3177 }, { "epoch": 2.0113969133359717, "grad_norm": 0.48951376206892167, "learning_rate": 5.379162159233989e-06, "loss": 0.2502, "step": 3178 }, { "epoch": 2.01203007518797, "grad_norm": 0.5418644381209676, "learning_rate": 5.372976123918251e-06, "loss": 0.2489, "step": 3179 }, { "epoch": 2.0126632370399684, "grad_norm": 0.4831832183042368, "learning_rate": 5.366792340753874e-06, "loss": 0.2514, "step": 3180 }, { "epoch": 2.0132963988919665, "grad_norm": 0.5754917376300394, "learning_rate": 5.360610812750742e-06, "loss": 0.2649, "step": 3181 }, { "epoch": 2.013929560743965, "grad_norm": 0.47131139183976023, "learning_rate": 5.354431542917642e-06, "loss": 0.2394, "step": 3182 }, { "epoch": 2.0145627225959637, "grad_norm": 0.47003622609615303, "learning_rate": 5.348254534262262e-06, "loss": 0.2498, "step": 3183 }, { "epoch": 2.015195884447962, "grad_norm": 0.4904375120879024, "learning_rate": 5.3420797897912e-06, "loss": 0.2434, "step": 3184 }, { "epoch": 2.0158290462999604, "grad_norm": 0.48434325940332673, "learning_rate": 5.335907312509929e-06, "loss": 0.2533, "step": 3185 }, { "epoch": 2.016462208151959, "grad_norm": 0.5181455792807361, "learning_rate": 5.32973710542284e-06, "loss": 0.2414, "step": 3186 }, { "epoch": 2.017095370003957, "grad_norm": 0.4705092046351119, "learning_rate": 5.3235691715332185e-06, "loss": 0.2482, "step": 3187 }, { "epoch": 2.0177285318559557, "grad_norm": 0.46874963027045136, "learning_rate": 5.317403513843219e-06, "loss": 0.2654, "step": 3188 }, { "epoch": 2.0183616937079543, "grad_norm": 0.4802263485101575, "learning_rate": 5.31124013535392e-06, "loss": 0.255, "step": 3189 }, { "epoch": 2.0189948555599524, "grad_norm": 0.4581070032442674, "learning_rate": 5.305079039065271e-06, "loss": 0.2622, "step": 3190 }, { "epoch": 2.019628017411951, "grad_norm": 0.4752943644452432, "learning_rate": 5.298920227976114e-06, "loss": 0.2549, "step": 3191 }, { "epoch": 2.0202611792639495, "grad_norm": 0.47854215984520665, "learning_rate": 5.292763705084181e-06, "loss": 0.2439, "step": 3192 }, { "epoch": 2.0208943411159477, "grad_norm": 0.5504053799110693, "learning_rate": 5.286609473386093e-06, "loss": 0.2556, "step": 3193 }, { "epoch": 2.0215275029679463, "grad_norm": 0.4847467474480546, "learning_rate": 5.28045753587735e-06, "loss": 0.2468, "step": 3194 }, { "epoch": 2.0221606648199444, "grad_norm": 0.47646016433856386, "learning_rate": 5.274307895552334e-06, "loss": 0.2427, "step": 3195 }, { "epoch": 2.022793826671943, "grad_norm": 1.1126315391946824, "learning_rate": 5.26816055540432e-06, "loss": 0.258, "step": 3196 }, { "epoch": 2.0234269885239415, "grad_norm": 0.4981906116293257, "learning_rate": 5.262015518425454e-06, "loss": 0.25, "step": 3197 }, { "epoch": 2.0240601503759397, "grad_norm": 0.5461923835615902, "learning_rate": 5.255872787606761e-06, "loss": 0.238, "step": 3198 }, { "epoch": 2.0246933122279382, "grad_norm": 0.6638756092301555, "learning_rate": 5.249732365938142e-06, "loss": 0.2502, "step": 3199 }, { "epoch": 2.025326474079937, "grad_norm": 0.4781211002022049, "learning_rate": 5.243594256408389e-06, "loss": 0.2631, "step": 3200 } ], "logging_steps": 1, "max_steps": 4740, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6983586149826560.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }