{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1896, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 8.808255195617676, "learning_rate": 8.421052631578947e-08, "loss": 2.1962406635284424, "step": 2 }, { "epoch": 0.008438818565400843, "grad_norm": 11.09923267364502, "learning_rate": 2.526315789473684e-07, "loss": 1.780366063117981, "step": 4 }, { "epoch": 0.012658227848101266, "grad_norm": 2.419753313064575, "learning_rate": 4.2105263157894733e-07, "loss": 1.93634033203125, "step": 6 }, { "epoch": 0.016877637130801686, "grad_norm": 3.9049134254455566, "learning_rate": 5.894736842105262e-07, "loss": 1.943023443222046, "step": 8 }, { "epoch": 0.02109704641350211, "grad_norm": 6.477982044219971, "learning_rate": 7.578947368421053e-07, "loss": 1.8409148454666138, "step": 10 }, { "epoch": 0.02531645569620253, "grad_norm": 1.0134243965148926, "learning_rate": 9.263157894736841e-07, "loss": 1.3077127933502197, "step": 12 }, { "epoch": 0.029535864978902954, "grad_norm": 3.247878074645996, "learning_rate": 1.0947368421052632e-06, "loss": 1.6219741106033325, "step": 14 }, { "epoch": 0.03375527426160337, "grad_norm": 7.898465156555176, "learning_rate": 1.263157894736842e-06, "loss": 2.037022113800049, "step": 16 }, { "epoch": 0.0379746835443038, "grad_norm": 1.3195950984954834, "learning_rate": 1.431578947368421e-06, "loss": 1.7350990772247314, "step": 18 }, { "epoch": 0.04219409282700422, "grad_norm": 1.8259081840515137, "learning_rate": 1.6e-06, "loss": 1.8126976490020752, "step": 20 }, { "epoch": 0.046413502109704644, "grad_norm": 1.4393107891082764, "learning_rate": 1.768421052631579e-06, "loss": 1.4626768827438354, "step": 22 }, { "epoch": 0.05063291139240506, "grad_norm": 2.4846441745758057, "learning_rate": 1.936842105263158e-06, "loss": 1.600361704826355, "step": 24 }, { "epoch": 0.05485232067510549, "grad_norm": 2.291980743408203, "learning_rate": 2.1052631578947366e-06, "loss": 1.3303472995758057, "step": 26 }, { "epoch": 0.05907172995780591, "grad_norm": 1.7172958850860596, "learning_rate": 2.273684210526316e-06, "loss": 1.7306660413742065, "step": 28 }, { "epoch": 0.06329113924050633, "grad_norm": 2.336642026901245, "learning_rate": 2.4421052631578946e-06, "loss": 1.3191598653793335, "step": 30 }, { "epoch": 0.06751054852320675, "grad_norm": 1.4607104063034058, "learning_rate": 2.6105263157894738e-06, "loss": 1.550937533378601, "step": 32 }, { "epoch": 0.07172995780590717, "grad_norm": 3.0410056114196777, "learning_rate": 2.7789473684210525e-06, "loss": 1.0037612915039062, "step": 34 }, { "epoch": 0.0759493670886076, "grad_norm": 1.8816224336624146, "learning_rate": 2.9473684210526313e-06, "loss": 1.5629675388336182, "step": 36 }, { "epoch": 0.08016877637130802, "grad_norm": 2.050464391708374, "learning_rate": 3.1157894736842105e-06, "loss": 1.1124558448791504, "step": 38 }, { "epoch": 0.08438818565400844, "grad_norm": 1.7500700950622559, "learning_rate": 3.2842105263157892e-06, "loss": 1.5181232690811157, "step": 40 }, { "epoch": 0.08860759493670886, "grad_norm": 3.6279757022857666, "learning_rate": 3.4526315789473684e-06, "loss": 1.064979076385498, "step": 42 }, { "epoch": 0.09282700421940929, "grad_norm": 3.692965507507324, "learning_rate": 3.621052631578947e-06, "loss": 1.7742705345153809, "step": 44 }, { "epoch": 0.0970464135021097, "grad_norm": 2.689681053161621, "learning_rate": 3.789473684210526e-06, "loss": 1.652271032333374, "step": 46 }, { "epoch": 0.10126582278481013, "grad_norm": 1.1244308948516846, "learning_rate": 3.957894736842105e-06, "loss": 1.5283629894256592, "step": 48 }, { "epoch": 0.10548523206751055, "grad_norm": 1.6453142166137695, "learning_rate": 4.126315789473685e-06, "loss": 0.7807677388191223, "step": 50 }, { "epoch": 0.10970464135021098, "grad_norm": 1.3963041305541992, "learning_rate": 4.294736842105263e-06, "loss": 0.8492421507835388, "step": 52 }, { "epoch": 0.11392405063291139, "grad_norm": 1.8241719007492065, "learning_rate": 4.463157894736841e-06, "loss": 0.7646088600158691, "step": 54 }, { "epoch": 0.11814345991561181, "grad_norm": 5.430877208709717, "learning_rate": 4.631578947368421e-06, "loss": 1.385468602180481, "step": 56 }, { "epoch": 0.12236286919831224, "grad_norm": 4.216091632843018, "learning_rate": 4.8e-06, "loss": 0.6626491546630859, "step": 58 }, { "epoch": 0.12658227848101267, "grad_norm": 2.5527150630950928, "learning_rate": 4.968421052631579e-06, "loss": 1.4430313110351562, "step": 60 }, { "epoch": 0.1308016877637131, "grad_norm": 2.92517352104187, "learning_rate": 5.136842105263157e-06, "loss": 1.4682120084762573, "step": 62 }, { "epoch": 0.1350210970464135, "grad_norm": 0.9485960602760315, "learning_rate": 5.305263157894736e-06, "loss": 1.0755484104156494, "step": 64 }, { "epoch": 0.13924050632911392, "grad_norm": 3.126896619796753, "learning_rate": 5.473684210526316e-06, "loss": 0.8076987266540527, "step": 66 }, { "epoch": 0.14345991561181434, "grad_norm": 1.318830966949463, "learning_rate": 5.6421052631578944e-06, "loss": 1.0927525758743286, "step": 68 }, { "epoch": 0.14767932489451477, "grad_norm": 3.404849052429199, "learning_rate": 5.810526315789474e-06, "loss": 0.9728879332542419, "step": 70 }, { "epoch": 0.1518987341772152, "grad_norm": 3.532927989959717, "learning_rate": 5.978947368421052e-06, "loss": 1.265703797340393, "step": 72 }, { "epoch": 0.15611814345991562, "grad_norm": 2.622828245162964, "learning_rate": 6.147368421052631e-06, "loss": 1.0082859992980957, "step": 74 }, { "epoch": 0.16033755274261605, "grad_norm": 3.0084891319274902, "learning_rate": 6.31578947368421e-06, "loss": 0.8589051365852356, "step": 76 }, { "epoch": 0.16455696202531644, "grad_norm": 1.5682191848754883, "learning_rate": 6.484210526315789e-06, "loss": 1.3428035974502563, "step": 78 }, { "epoch": 0.16877637130801687, "grad_norm": 1.149340033531189, "learning_rate": 6.652631578947369e-06, "loss": 1.3348019123077393, "step": 80 }, { "epoch": 0.1729957805907173, "grad_norm": 1.4813284873962402, "learning_rate": 6.821052631578947e-06, "loss": 1.3795466423034668, "step": 82 }, { "epoch": 0.17721518987341772, "grad_norm": 2.05072283744812, "learning_rate": 6.989473684210526e-06, "loss": 0.6971267461776733, "step": 84 }, { "epoch": 0.18143459915611815, "grad_norm": 1.8534572124481201, "learning_rate": 7.157894736842105e-06, "loss": 1.113297700881958, "step": 86 }, { "epoch": 0.18565400843881857, "grad_norm": 1.253941535949707, "learning_rate": 7.326315789473684e-06, "loss": 1.2875044345855713, "step": 88 }, { "epoch": 0.189873417721519, "grad_norm": 1.6247971057891846, "learning_rate": 7.494736842105263e-06, "loss": 0.6056119203567505, "step": 90 }, { "epoch": 0.1940928270042194, "grad_norm": 1.780320167541504, "learning_rate": 7.663157894736842e-06, "loss": 1.676588773727417, "step": 92 }, { "epoch": 0.19831223628691982, "grad_norm": 2.9397106170654297, "learning_rate": 7.831578947368421e-06, "loss": 0.941127359867096, "step": 94 }, { "epoch": 0.20253164556962025, "grad_norm": 1.2082825899124146, "learning_rate": 8e-06, "loss": 1.4084690809249878, "step": 96 }, { "epoch": 0.20675105485232068, "grad_norm": 1.0113117694854736, "learning_rate": 7.999978091917096e-06, "loss": 1.4652810096740723, "step": 98 }, { "epoch": 0.2109704641350211, "grad_norm": 3.00728178024292, "learning_rate": 7.999912367935033e-06, "loss": 1.2376593351364136, "step": 100 }, { "epoch": 0.21518987341772153, "grad_norm": 2.393007278442383, "learning_rate": 7.999802828853748e-06, "loss": 1.051348090171814, "step": 102 }, { "epoch": 0.21940928270042195, "grad_norm": 1.1730682849884033, "learning_rate": 7.999649476006456e-06, "loss": 0.9889463782310486, "step": 104 }, { "epoch": 0.22362869198312235, "grad_norm": 3.6534433364868164, "learning_rate": 7.99945231125964e-06, "loss": 0.9653201699256897, "step": 106 }, { "epoch": 0.22784810126582278, "grad_norm": 1.5248578786849976, "learning_rate": 7.999211337013023e-06, "loss": 0.970741331577301, "step": 108 }, { "epoch": 0.2320675105485232, "grad_norm": 1.0831537246704102, "learning_rate": 7.998926556199543e-06, "loss": 1.0245221853256226, "step": 110 }, { "epoch": 0.23628691983122363, "grad_norm": 1.1785749197006226, "learning_rate": 7.998597972285308e-06, "loss": 1.3712621927261353, "step": 112 }, { "epoch": 0.24050632911392406, "grad_norm": 1.3017958402633667, "learning_rate": 7.998225589269567e-06, "loss": 1.2707055807113647, "step": 114 }, { "epoch": 0.24472573839662448, "grad_norm": 2.281855344772339, "learning_rate": 7.997809411684651e-06, "loss": 1.5997581481933594, "step": 116 }, { "epoch": 0.2489451476793249, "grad_norm": 0.8561908602714539, "learning_rate": 7.997349444595921e-06, "loss": 1.2587316036224365, "step": 118 }, { "epoch": 0.25316455696202533, "grad_norm": 0.5100749135017395, "learning_rate": 7.996845693601713e-06, "loss": 0.9957402348518372, "step": 120 }, { "epoch": 0.25738396624472576, "grad_norm": 3.094721794128418, "learning_rate": 7.996298164833256e-06, "loss": 1.258643627166748, "step": 122 }, { "epoch": 0.2616033755274262, "grad_norm": 1.02621328830719, "learning_rate": 7.995706864954613e-06, "loss": 1.2669998407363892, "step": 124 }, { "epoch": 0.26582278481012656, "grad_norm": 0.9123193621635437, "learning_rate": 7.995071801162584e-06, "loss": 1.3570575714111328, "step": 126 }, { "epoch": 0.270042194092827, "grad_norm": 1.731139898300171, "learning_rate": 7.99439298118663e-06, "loss": 0.8451032042503357, "step": 128 }, { "epoch": 0.2742616033755274, "grad_norm": 1.6111301183700562, "learning_rate": 7.993670413288777e-06, "loss": 1.1453604698181152, "step": 130 }, { "epoch": 0.27848101265822783, "grad_norm": 2.9632482528686523, "learning_rate": 7.992904106263512e-06, "loss": 1.2021801471710205, "step": 132 }, { "epoch": 0.28270042194092826, "grad_norm": 3.683976411819458, "learning_rate": 7.992094069437679e-06, "loss": 0.8209038972854614, "step": 134 }, { "epoch": 0.2869198312236287, "grad_norm": 0.8348782658576965, "learning_rate": 7.991240312670361e-06, "loss": 0.8820058703422546, "step": 136 }, { "epoch": 0.2911392405063291, "grad_norm": 2.7286171913146973, "learning_rate": 7.99034284635277e-06, "loss": 1.361273169517517, "step": 138 }, { "epoch": 0.29535864978902954, "grad_norm": 4.104984283447266, "learning_rate": 7.989401681408107e-06, "loss": 0.9259814023971558, "step": 140 }, { "epoch": 0.29957805907172996, "grad_norm": 1.7529696226119995, "learning_rate": 7.988416829291437e-06, "loss": 1.2620042562484741, "step": 142 }, { "epoch": 0.3037974683544304, "grad_norm": 3.8300654888153076, "learning_rate": 7.987388301989553e-06, "loss": 1.0700979232788086, "step": 144 }, { "epoch": 0.3080168776371308, "grad_norm": 1.7690354585647583, "learning_rate": 7.986316112020821e-06, "loss": 1.3733104467391968, "step": 146 }, { "epoch": 0.31223628691983124, "grad_norm": 1.0495412349700928, "learning_rate": 7.985200272435035e-06, "loss": 1.3526469469070435, "step": 148 }, { "epoch": 0.31645569620253167, "grad_norm": 1.3418861627578735, "learning_rate": 7.984040796813251e-06, "loss": 1.2077337503433228, "step": 150 }, { "epoch": 0.3206751054852321, "grad_norm": 1.4774577617645264, "learning_rate": 7.982837699267632e-06, "loss": 1.2690041065216064, "step": 152 }, { "epoch": 0.32489451476793246, "grad_norm": 1.621232032775879, "learning_rate": 7.981590994441264e-06, "loss": 1.4557234048843384, "step": 154 }, { "epoch": 0.3291139240506329, "grad_norm": 9.58849811553955, "learning_rate": 7.98030069750799e-06, "loss": 1.2517377138137817, "step": 156 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9245334267616272, "learning_rate": 7.978966824172219e-06, "loss": 1.312515377998352, "step": 158 }, { "epoch": 0.33755274261603374, "grad_norm": 0.9059025049209595, "learning_rate": 7.977589390668727e-06, "loss": 1.2920206785202026, "step": 160 }, { "epoch": 0.34177215189873417, "grad_norm": 4.1672210693359375, "learning_rate": 7.976168413762478e-06, "loss": 0.8602538108825684, "step": 162 }, { "epoch": 0.3459915611814346, "grad_norm": 3.4024016857147217, "learning_rate": 7.974703910748405e-06, "loss": 1.214678168296814, "step": 164 }, { "epoch": 0.350210970464135, "grad_norm": 2.5402605533599854, "learning_rate": 7.973195899451203e-06, "loss": 0.809662401676178, "step": 166 }, { "epoch": 0.35443037974683544, "grad_norm": 2.9653215408325195, "learning_rate": 7.971644398225114e-06, "loss": 1.2221626043319702, "step": 168 }, { "epoch": 0.35864978902953587, "grad_norm": 1.4855773448944092, "learning_rate": 7.970049425953705e-06, "loss": 1.3168489933013916, "step": 170 }, { "epoch": 0.3628691983122363, "grad_norm": 2.3102357387542725, "learning_rate": 7.968411002049635e-06, "loss": 1.1956959962844849, "step": 172 }, { "epoch": 0.3670886075949367, "grad_norm": 0.9286043643951416, "learning_rate": 7.966729146454414e-06, "loss": 1.0124504566192627, "step": 174 }, { "epoch": 0.37130801687763715, "grad_norm": 2.3038041591644287, "learning_rate": 7.965003879638177e-06, "loss": 1.30778169631958, "step": 176 }, { "epoch": 0.3755274261603376, "grad_norm": 1.806934118270874, "learning_rate": 7.963235222599414e-06, "loss": 1.307655930519104, "step": 178 }, { "epoch": 0.379746835443038, "grad_norm": 1.0257319211959839, "learning_rate": 7.961423196864727e-06, "loss": 0.990490198135376, "step": 180 }, { "epoch": 0.38396624472573837, "grad_norm": 0.9438347220420837, "learning_rate": 7.95956782448857e-06, "loss": 1.2663373947143555, "step": 182 }, { "epoch": 0.3881856540084388, "grad_norm": 0.7051662802696228, "learning_rate": 7.957669128052967e-06, "loss": 1.264948844909668, "step": 184 }, { "epoch": 0.3924050632911392, "grad_norm": 1.4901031255722046, "learning_rate": 7.955727130667254e-06, "loss": 0.5198807120323181, "step": 186 }, { "epoch": 0.39662447257383965, "grad_norm": 1.6583565473556519, "learning_rate": 7.953741855967786e-06, "loss": 0.9574030041694641, "step": 188 }, { "epoch": 0.4008438818565401, "grad_norm": 0.9092651009559631, "learning_rate": 7.951713328117653e-06, "loss": 1.0500378608703613, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 1.398674726486206, "learning_rate": 7.949641571806384e-06, "loss": 1.2852396965026855, "step": 192 }, { "epoch": 0.4092827004219409, "grad_norm": 1.3172980546951294, "learning_rate": 7.947526612249655e-06, "loss": 1.063001036643982, "step": 194 }, { "epoch": 0.41350210970464135, "grad_norm": 1.0889501571655273, "learning_rate": 7.945368475188967e-06, "loss": 1.2641280889511108, "step": 196 }, { "epoch": 0.4177215189873418, "grad_norm": 1.0665010213851929, "learning_rate": 7.943167186891349e-06, "loss": 1.0218112468719482, "step": 198 }, { "epoch": 0.4219409282700422, "grad_norm": 3.58223032951355, "learning_rate": 7.940922774149026e-06, "loss": 1.0860857963562012, "step": 200 }, { "epoch": 0.42616033755274263, "grad_norm": 1.089058756828308, "learning_rate": 7.938635264279095e-06, "loss": 1.11153244972229, "step": 202 }, { "epoch": 0.43037974683544306, "grad_norm": 0.7927507162094116, "learning_rate": 7.9363046851232e-06, "loss": 0.9039996862411499, "step": 204 }, { "epoch": 0.4345991561181435, "grad_norm": 1.210415005683899, "learning_rate": 7.933931065047189e-06, "loss": 1.3628909587860107, "step": 206 }, { "epoch": 0.4388185654008439, "grad_norm": 2.830822706222534, "learning_rate": 7.931514432940762e-06, "loss": 0.7663958668708801, "step": 208 }, { "epoch": 0.4430379746835443, "grad_norm": 0.9303013682365417, "learning_rate": 7.92905481821713e-06, "loss": 1.2602308988571167, "step": 210 }, { "epoch": 0.4472573839662447, "grad_norm": 2.2079782485961914, "learning_rate": 7.926552250812647e-06, "loss": 0.7922911047935486, "step": 212 }, { "epoch": 0.45147679324894513, "grad_norm": 1.6323434114456177, "learning_rate": 7.92400676118646e-06, "loss": 1.1895182132720947, "step": 214 }, { "epoch": 0.45569620253164556, "grad_norm": 1.2990820407867432, "learning_rate": 7.921418380320117e-06, "loss": 1.236521601676941, "step": 216 }, { "epoch": 0.459915611814346, "grad_norm": 1.2548811435699463, "learning_rate": 7.918787139717211e-06, "loss": 1.2785851955413818, "step": 218 }, { "epoch": 0.4641350210970464, "grad_norm": 1.9813899993896484, "learning_rate": 7.916113071402986e-06, "loss": 1.230564832687378, "step": 220 }, { "epoch": 0.46835443037974683, "grad_norm": 3.8140616416931152, "learning_rate": 7.913396207923946e-06, "loss": 1.2052173614501953, "step": 222 }, { "epoch": 0.47257383966244726, "grad_norm": 1.2737183570861816, "learning_rate": 7.910636582347466e-06, "loss": 1.0253933668136597, "step": 224 }, { "epoch": 0.4767932489451477, "grad_norm": 2.2479248046875, "learning_rate": 7.907834228261378e-06, "loss": 1.259740948677063, "step": 226 }, { "epoch": 0.4810126582278481, "grad_norm": 2.846742630004883, "learning_rate": 7.90498917977358e-06, "loss": 0.782292366027832, "step": 228 }, { "epoch": 0.48523206751054854, "grad_norm": 1.540499210357666, "learning_rate": 7.9021014715116e-06, "loss": 0.9024060368537903, "step": 230 }, { "epoch": 0.48945147679324896, "grad_norm": 4.1563334465026855, "learning_rate": 7.89917113862219e-06, "loss": 0.8967229723930359, "step": 232 }, { "epoch": 0.4936708860759494, "grad_norm": 1.8965283632278442, "learning_rate": 7.896198216770892e-06, "loss": 1.2712947130203247, "step": 234 }, { "epoch": 0.4978902953586498, "grad_norm": 1.1842418909072876, "learning_rate": 7.893182742141606e-06, "loss": 1.312996506690979, "step": 236 }, { "epoch": 0.5021097046413502, "grad_norm": 5.636916637420654, "learning_rate": 7.890124751436146e-06, "loss": 0.9852314591407776, "step": 238 }, { "epoch": 0.5063291139240507, "grad_norm": 2.3782975673675537, "learning_rate": 7.887024281873801e-06, "loss": 0.5027163028717041, "step": 240 }, { "epoch": 0.510548523206751, "grad_norm": 2.305345296859741, "learning_rate": 7.88388137119087e-06, "loss": 1.4169656038284302, "step": 242 }, { "epoch": 0.5147679324894515, "grad_norm": 3.1710622310638428, "learning_rate": 7.880696057640214e-06, "loss": 0.8661314845085144, "step": 244 }, { "epoch": 0.5189873417721519, "grad_norm": 1.3543529510498047, "learning_rate": 7.877468379990784e-06, "loss": 1.4801579713821411, "step": 246 }, { "epoch": 0.5232067510548524, "grad_norm": 1.236095666885376, "learning_rate": 7.874198377527153e-06, "loss": 1.1857268810272217, "step": 248 }, { "epoch": 0.5274261603375527, "grad_norm": 1.6078490018844604, "learning_rate": 7.870886090049034e-06, "loss": 1.301129698753357, "step": 250 }, { "epoch": 0.5316455696202531, "grad_norm": 2.838106632232666, "learning_rate": 7.867531557870802e-06, "loss": 1.1084915399551392, "step": 252 }, { "epoch": 0.5358649789029536, "grad_norm": 0.7895591855049133, "learning_rate": 7.864134821820989e-06, "loss": 1.2187299728393555, "step": 254 }, { "epoch": 0.540084388185654, "grad_norm": 0.9526500701904297, "learning_rate": 7.860695923241808e-06, "loss": 1.1880545616149902, "step": 256 }, { "epoch": 0.5443037974683544, "grad_norm": 5.490752220153809, "learning_rate": 7.857214903988633e-06, "loss": 1.0243443250656128, "step": 258 }, { "epoch": 0.5485232067510548, "grad_norm": 5.083505153656006, "learning_rate": 7.853691806429497e-06, "loss": 0.9623079299926758, "step": 260 }, { "epoch": 0.5527426160337553, "grad_norm": 0.914940357208252, "learning_rate": 7.850126673444574e-06, "loss": 1.2602107524871826, "step": 262 }, { "epoch": 0.5569620253164557, "grad_norm": 1.6456938982009888, "learning_rate": 7.846519548425655e-06, "loss": 1.2723337411880493, "step": 264 }, { "epoch": 0.5611814345991561, "grad_norm": 1.0772303342819214, "learning_rate": 7.84287047527563e-06, "loss": 1.2261412143707275, "step": 266 }, { "epoch": 0.5654008438818565, "grad_norm": 1.5974887609481812, "learning_rate": 7.839179498407939e-06, "loss": 1.5006755590438843, "step": 268 }, { "epoch": 0.569620253164557, "grad_norm": 3.4865124225616455, "learning_rate": 7.835446662746043e-06, "loss": 1.0508859157562256, "step": 270 }, { "epoch": 0.5738396624472574, "grad_norm": 1.8510380983352661, "learning_rate": 7.831672013722869e-06, "loss": 1.303536057472229, "step": 272 }, { "epoch": 0.5780590717299579, "grad_norm": 1.9769766330718994, "learning_rate": 7.827855597280267e-06, "loss": 1.1014729738235474, "step": 274 }, { "epoch": 0.5822784810126582, "grad_norm": 1.2615407705307007, "learning_rate": 7.82399745986844e-06, "loss": 1.247708797454834, "step": 276 }, { "epoch": 0.5864978902953587, "grad_norm": 1.0950466394424438, "learning_rate": 7.820097648445383e-06, "loss": 1.3929113149642944, "step": 278 }, { "epoch": 0.5907172995780591, "grad_norm": 0.7477395534515381, "learning_rate": 7.816156210476316e-06, "loss": 0.9548027515411377, "step": 280 }, { "epoch": 0.5949367088607594, "grad_norm": 0.8824933171272278, "learning_rate": 7.812173193933098e-06, "loss": 1.321789264678955, "step": 282 }, { "epoch": 0.5991561181434599, "grad_norm": 2.1495587825775146, "learning_rate": 7.808148647293651e-06, "loss": 1.5318031311035156, "step": 284 }, { "epoch": 0.6033755274261603, "grad_norm": 0.366915225982666, "learning_rate": 7.804082619541366e-06, "loss": 1.1145987510681152, "step": 286 }, { "epoch": 0.6075949367088608, "grad_norm": 0.9638784527778625, "learning_rate": 7.799975160164503e-06, "loss": 1.244531512260437, "step": 288 }, { "epoch": 0.6118143459915611, "grad_norm": 1.1089756488800049, "learning_rate": 7.795826319155596e-06, "loss": 0.8029107451438904, "step": 290 }, { "epoch": 0.6160337552742616, "grad_norm": 1.2907359600067139, "learning_rate": 7.791636147010842e-06, "loss": 0.660262942314148, "step": 292 }, { "epoch": 0.620253164556962, "grad_norm": 1.7545690536499023, "learning_rate": 7.787404694729485e-06, "loss": 1.2182437181472778, "step": 294 }, { "epoch": 0.6244725738396625, "grad_norm": 2.1209237575531006, "learning_rate": 7.783132013813194e-06, "loss": 0.698481559753418, "step": 296 }, { "epoch": 0.6286919831223629, "grad_norm": 15.612320899963379, "learning_rate": 7.778818156265443e-06, "loss": 0.6525253057479858, "step": 298 }, { "epoch": 0.6329113924050633, "grad_norm": 2.1628663539886475, "learning_rate": 7.774463174590867e-06, "loss": 1.7705399990081787, "step": 300 }, { "epoch": 0.6371308016877637, "grad_norm": 0.9601294994354248, "learning_rate": 7.770067121794634e-06, "loss": 1.2569221258163452, "step": 302 }, { "epoch": 0.6413502109704642, "grad_norm": 6.321805953979492, "learning_rate": 7.765630051381794e-06, "loss": 0.8693046569824219, "step": 304 }, { "epoch": 0.6455696202531646, "grad_norm": 1.1084986925125122, "learning_rate": 7.761152017356627e-06, "loss": 0.9949886798858643, "step": 306 }, { "epoch": 0.6497890295358649, "grad_norm": 0.5674229860305786, "learning_rate": 7.75663307422199e-06, "loss": 1.2497336864471436, "step": 308 }, { "epoch": 0.6540084388185654, "grad_norm": 0.785914421081543, "learning_rate": 7.75207327697865e-06, "loss": 1.197920560836792, "step": 310 }, { "epoch": 0.6582278481012658, "grad_norm": 1.9101108312606812, "learning_rate": 7.747472681124616e-06, "loss": 1.0015456676483154, "step": 312 }, { "epoch": 0.6624472573839663, "grad_norm": 1.1794395446777344, "learning_rate": 7.742831342654461e-06, "loss": 1.199405312538147, "step": 314 }, { "epoch": 0.6666666666666666, "grad_norm": 0.8022367358207703, "learning_rate": 7.738149318058648e-06, "loss": 1.1004928350448608, "step": 316 }, { "epoch": 0.6708860759493671, "grad_norm": 0.7416080236434937, "learning_rate": 7.733426664322834e-06, "loss": 1.0781973600387573, "step": 318 }, { "epoch": 0.6751054852320675, "grad_norm": 1.5531445741653442, "learning_rate": 7.728663438927177e-06, "loss": 1.0812546014785767, "step": 320 }, { "epoch": 0.679324894514768, "grad_norm": 1.3288339376449585, "learning_rate": 7.723859699845645e-06, "loss": 0.8775804042816162, "step": 322 }, { "epoch": 0.6835443037974683, "grad_norm": 1.0390150547027588, "learning_rate": 7.7190155055453e-06, "loss": 0.8802211284637451, "step": 324 }, { "epoch": 0.6877637130801688, "grad_norm": 0.8296968936920166, "learning_rate": 7.714130914985593e-06, "loss": 0.8521700501441956, "step": 326 }, { "epoch": 0.6919831223628692, "grad_norm": 0.742783784866333, "learning_rate": 7.709205987617642e-06, "loss": 0.8648751974105835, "step": 328 }, { "epoch": 0.6962025316455697, "grad_norm": 2.3347666263580322, "learning_rate": 7.704240783383513e-06, "loss": 1.5764340162277222, "step": 330 }, { "epoch": 0.70042194092827, "grad_norm": 1.7489248514175415, "learning_rate": 7.699235362715488e-06, "loss": 1.0720549821853638, "step": 332 }, { "epoch": 0.7046413502109705, "grad_norm": 1.7539390325546265, "learning_rate": 7.694189786535325e-06, "loss": 1.3112399578094482, "step": 334 }, { "epoch": 0.7088607594936709, "grad_norm": 2.141007900238037, "learning_rate": 7.689104116253529e-06, "loss": 1.2512861490249634, "step": 336 }, { "epoch": 0.7130801687763713, "grad_norm": 1.4252289533615112, "learning_rate": 7.683978413768591e-06, "loss": 1.2772711515426636, "step": 338 }, { "epoch": 0.7172995780590717, "grad_norm": 0.931917667388916, "learning_rate": 7.678812741466241e-06, "loss": 1.2473686933517456, "step": 340 }, { "epoch": 0.7215189873417721, "grad_norm": 0.674299418926239, "learning_rate": 7.673607162218688e-06, "loss": 1.2429455518722534, "step": 342 }, { "epoch": 0.7257383966244726, "grad_norm": 3.6710023880004883, "learning_rate": 7.668361739383856e-06, "loss": 1.0689202547073364, "step": 344 }, { "epoch": 0.729957805907173, "grad_norm": 2.2613823413848877, "learning_rate": 7.66307653680461e-06, "loss": 0.8741579651832581, "step": 346 }, { "epoch": 0.7341772151898734, "grad_norm": 0.8934361338615417, "learning_rate": 7.657751618807982e-06, "loss": 1.234643578529358, "step": 348 }, { "epoch": 0.7383966244725738, "grad_norm": 2.2448158264160156, "learning_rate": 7.652387050204386e-06, "loss": 1.1629151105880737, "step": 350 }, { "epoch": 0.7426160337552743, "grad_norm": 2.3110733032226562, "learning_rate": 7.64698289628683e-06, "loss": 1.074580192565918, "step": 352 }, { "epoch": 0.7468354430379747, "grad_norm": 2.6933298110961914, "learning_rate": 7.641539222830117e-06, "loss": 0.6495164036750793, "step": 354 }, { "epoch": 0.7510548523206751, "grad_norm": 8.03363037109375, "learning_rate": 7.63605609609006e-06, "loss": 1.0515384674072266, "step": 356 }, { "epoch": 0.7552742616033755, "grad_norm": 1.2727802991867065, "learning_rate": 7.630533582802647e-06, "loss": 1.0052093267440796, "step": 358 }, { "epoch": 0.759493670886076, "grad_norm": 0.8008536100387573, "learning_rate": 7.6249717501832616e-06, "loss": 1.208338975906372, "step": 360 }, { "epoch": 0.7637130801687764, "grad_norm": 1.467780351638794, "learning_rate": 7.619370665925842e-06, "loss": 0.9765693545341492, "step": 362 }, { "epoch": 0.7679324894514767, "grad_norm": 1.806458830833435, "learning_rate": 7.613730398202061e-06, "loss": 1.5730071067810059, "step": 364 }, { "epoch": 0.7721518987341772, "grad_norm": 2.95654034614563, "learning_rate": 7.608051015660508e-06, "loss": 0.6979476809501648, "step": 366 }, { "epoch": 0.7763713080168776, "grad_norm": 1.1918550729751587, "learning_rate": 7.60233258742584e-06, "loss": 1.1616028547286987, "step": 368 }, { "epoch": 0.7805907172995781, "grad_norm": 2.9640629291534424, "learning_rate": 7.596575183097943e-06, "loss": 1.4773938655853271, "step": 370 }, { "epoch": 0.7848101265822784, "grad_norm": 3.126946210861206, "learning_rate": 7.590778872751091e-06, "loss": 0.9821402430534363, "step": 372 }, { "epoch": 0.7890295358649789, "grad_norm": 1.5098716020584106, "learning_rate": 7.58494372693309e-06, "loss": 1.0515096187591553, "step": 374 }, { "epoch": 0.7932489451476793, "grad_norm": 1.3018288612365723, "learning_rate": 7.579069816664417e-06, "loss": 1.1510859727859497, "step": 376 }, { "epoch": 0.7974683544303798, "grad_norm": 0.4790183901786804, "learning_rate": 7.573157213437353e-06, "loss": 1.1152373552322388, "step": 378 }, { "epoch": 0.8016877637130801, "grad_norm": 1.7124649286270142, "learning_rate": 7.567205989215126e-06, "loss": 0.729989230632782, "step": 380 }, { "epoch": 0.8059071729957806, "grad_norm": 1.813360333442688, "learning_rate": 7.5612162164310196e-06, "loss": 1.2611396312713623, "step": 382 }, { "epoch": 0.810126582278481, "grad_norm": 3.6355385780334473, "learning_rate": 7.555187967987499e-06, "loss": 0.9938818216323853, "step": 384 }, { "epoch": 0.8143459915611815, "grad_norm": 1.1984269618988037, "learning_rate": 7.549121317255322e-06, "loss": 1.2364702224731445, "step": 386 }, { "epoch": 0.8185654008438819, "grad_norm": 1.193095326423645, "learning_rate": 7.543016338072653e-06, "loss": 0.9437189102172852, "step": 388 }, { "epoch": 0.8227848101265823, "grad_norm": 0.9609726071357727, "learning_rate": 7.5368731047441486e-06, "loss": 1.2113581895828247, "step": 390 }, { "epoch": 0.8270042194092827, "grad_norm": 2.6752779483795166, "learning_rate": 7.530691692040069e-06, "loss": 0.9650623798370361, "step": 392 }, { "epoch": 0.8312236286919831, "grad_norm": 1.805607795715332, "learning_rate": 7.52447217519536e-06, "loss": 0.9127689003944397, "step": 394 }, { "epoch": 0.8354430379746836, "grad_norm": 1.9785407781600952, "learning_rate": 7.5182146299087375e-06, "loss": 1.2258358001708984, "step": 396 }, { "epoch": 0.8396624472573839, "grad_norm": 0.2685816287994385, "learning_rate": 7.51191913234177e-06, "loss": 1.0780510902404785, "step": 398 }, { "epoch": 0.8438818565400844, "grad_norm": 0.8069247007369995, "learning_rate": 7.505585759117947e-06, "loss": 0.9565885663032532, "step": 400 }, { "epoch": 0.8481012658227848, "grad_norm": 2.5125067234039307, "learning_rate": 7.499214587321749e-06, "loss": 0.7042322754859924, "step": 402 }, { "epoch": 0.8523206751054853, "grad_norm": 1.6951823234558105, "learning_rate": 7.49280569449771e-06, "loss": 1.1971807479858398, "step": 404 }, { "epoch": 0.8565400843881856, "grad_norm": 0.7480567097663879, "learning_rate": 7.486359158649471e-06, "loss": 0.9361912608146667, "step": 406 }, { "epoch": 0.8607594936708861, "grad_norm": 1.4761838912963867, "learning_rate": 7.4798750582388354e-06, "loss": 0.9626801609992981, "step": 408 }, { "epoch": 0.8649789029535865, "grad_norm": 0.9039321541786194, "learning_rate": 7.473353472184806e-06, "loss": 1.2230124473571777, "step": 410 }, { "epoch": 0.869198312236287, "grad_norm": 1.8411026000976562, "learning_rate": 7.466794479862632e-06, "loss": 0.838551938533783, "step": 412 }, { "epoch": 0.8734177215189873, "grad_norm": 1.6176677942276, "learning_rate": 7.460198161102841e-06, "loss": 1.2056636810302734, "step": 414 }, { "epoch": 0.8776371308016878, "grad_norm": 0.796684205532074, "learning_rate": 7.453564596190265e-06, "loss": 1.0609307289123535, "step": 416 }, { "epoch": 0.8818565400843882, "grad_norm": 3.51039457321167, "learning_rate": 7.446893865863063e-06, "loss": 1.1577751636505127, "step": 418 }, { "epoch": 0.8860759493670886, "grad_norm": 1.3933305740356445, "learning_rate": 7.440186051311744e-06, "loss": 0.9417897462844849, "step": 420 }, { "epoch": 0.890295358649789, "grad_norm": 1.077215313911438, "learning_rate": 7.433441234178174e-06, "loss": 1.333181619644165, "step": 422 }, { "epoch": 0.8945147679324894, "grad_norm": 3.3169748783111572, "learning_rate": 7.426659496554582e-06, "loss": 0.9721631407737732, "step": 424 }, { "epoch": 0.8987341772151899, "grad_norm": 1.1560137271881104, "learning_rate": 7.4198409209825615e-06, "loss": 1.1756271123886108, "step": 426 }, { "epoch": 0.9029535864978903, "grad_norm": 0.8857593536376953, "learning_rate": 7.412985590452066e-06, "loss": 1.037049651145935, "step": 428 }, { "epoch": 0.9071729957805907, "grad_norm": 1.4016187191009521, "learning_rate": 7.4060935884004045e-06, "loss": 1.0027376413345337, "step": 430 }, { "epoch": 0.9113924050632911, "grad_norm": 1.6030839681625366, "learning_rate": 7.399164998711215e-06, "loss": 1.0767489671707153, "step": 432 }, { "epoch": 0.9156118143459916, "grad_norm": 1.2388867139816284, "learning_rate": 7.392199905713454e-06, "loss": 1.241571307182312, "step": 434 }, { "epoch": 0.919831223628692, "grad_norm": 1.3980982303619385, "learning_rate": 7.385198394180359e-06, "loss": 0.8756187558174133, "step": 436 }, { "epoch": 0.9240506329113924, "grad_norm": 1.7992874383926392, "learning_rate": 7.378160549328429e-06, "loss": 1.196347713470459, "step": 438 }, { "epoch": 0.9282700421940928, "grad_norm": 0.9916715025901794, "learning_rate": 7.371086456816381e-06, "loss": 0.9922671318054199, "step": 440 }, { "epoch": 0.9324894514767933, "grad_norm": 2.890634536743164, "learning_rate": 7.363976202744106e-06, "loss": 0.9319839477539062, "step": 442 }, { "epoch": 0.9367088607594937, "grad_norm": 5.665074348449707, "learning_rate": 7.356829873651623e-06, "loss": 1.0942474603652954, "step": 444 }, { "epoch": 0.9409282700421941, "grad_norm": 1.669215440750122, "learning_rate": 7.3496475565180284e-06, "loss": 1.2984267473220825, "step": 446 }, { "epoch": 0.9451476793248945, "grad_norm": 0.8537882566452026, "learning_rate": 7.342429338760431e-06, "loss": 0.9971826076507568, "step": 448 }, { "epoch": 0.9493670886075949, "grad_norm": 3.3685364723205566, "learning_rate": 7.3351753082328946e-06, "loss": 0.9323700666427612, "step": 450 }, { "epoch": 0.9535864978902954, "grad_norm": 1.2475708723068237, "learning_rate": 7.327885553225365e-06, "loss": 1.2786669731140137, "step": 452 }, { "epoch": 0.9578059071729957, "grad_norm": 1.7981699705123901, "learning_rate": 7.320560162462594e-06, "loss": 0.9830716848373413, "step": 454 }, { "epoch": 0.9620253164556962, "grad_norm": 1.964571237564087, "learning_rate": 7.313199225103068e-06, "loss": 1.1577880382537842, "step": 456 }, { "epoch": 0.9662447257383966, "grad_norm": 0.8031247854232788, "learning_rate": 7.3058028307379104e-06, "loss": 0.8746158480644226, "step": 458 }, { "epoch": 0.9704641350210971, "grad_norm": 6.945025444030762, "learning_rate": 7.298371069389798e-06, "loss": 0.6917670369148254, "step": 460 }, { "epoch": 0.9746835443037974, "grad_norm": 1.1903830766677856, "learning_rate": 7.290904031511867e-06, "loss": 0.8951276540756226, "step": 462 }, { "epoch": 0.9789029535864979, "grad_norm": 1.7528347969055176, "learning_rate": 7.28340180798661e-06, "loss": 1.1649961471557617, "step": 464 }, { "epoch": 0.9831223628691983, "grad_norm": 2.7463033199310303, "learning_rate": 7.275864490124769e-06, "loss": 0.7191216349601746, "step": 466 }, { "epoch": 0.9873417721518988, "grad_norm": 1.2754981517791748, "learning_rate": 7.268292169664222e-06, "loss": 1.3055366277694702, "step": 468 }, { "epoch": 0.9915611814345991, "grad_norm": 1.402946949005127, "learning_rate": 7.260684938768874e-06, "loss": 0.8869744539260864, "step": 470 }, { "epoch": 0.9957805907172996, "grad_norm": 1.225201964378357, "learning_rate": 7.253042890027527e-06, "loss": 1.202407956123352, "step": 472 }, { "epoch": 1.0, "grad_norm": 1.0377144813537598, "learning_rate": 7.2453661164527565e-06, "loss": 1.249975562095642, "step": 474 }, { "epoch": 1.0042194092827004, "grad_norm": 2.005743980407715, "learning_rate": 7.237654711479781e-06, "loss": 0.9949838519096375, "step": 476 }, { "epoch": 1.0084388185654007, "grad_norm": 1.0849618911743164, "learning_rate": 7.2299087689653224e-06, "loss": 1.1602823734283447, "step": 478 }, { "epoch": 1.0126582278481013, "grad_norm": 1.2653388977050781, "learning_rate": 7.222128383186464e-06, "loss": 1.13376784324646, "step": 480 }, { "epoch": 1.0168776371308017, "grad_norm": 1.4933967590332031, "learning_rate": 7.214313648839504e-06, "loss": 0.8098440766334534, "step": 482 }, { "epoch": 1.021097046413502, "grad_norm": 1.3467376232147217, "learning_rate": 7.206464661038802e-06, "loss": 1.058078408241272, "step": 484 }, { "epoch": 1.0253164556962024, "grad_norm": 1.572613000869751, "learning_rate": 7.198581515315622e-06, "loss": 0.46203434467315674, "step": 486 }, { "epoch": 1.029535864978903, "grad_norm": 1.1707247495651245, "learning_rate": 7.1906643076169736e-06, "loss": 0.952732264995575, "step": 488 }, { "epoch": 1.0337552742616034, "grad_norm": 0.9213857650756836, "learning_rate": 7.182713134304431e-06, "loss": 0.8125715851783752, "step": 490 }, { "epoch": 1.0379746835443038, "grad_norm": 1.2644870281219482, "learning_rate": 7.174728092152975e-06, "loss": 1.1190340518951416, "step": 492 }, { "epoch": 1.0421940928270041, "grad_norm": 2.924903392791748, "learning_rate": 7.1667092783498105e-06, "loss": 0.8107770085334778, "step": 494 }, { "epoch": 1.0464135021097047, "grad_norm": 1.9948618412017822, "learning_rate": 7.15865679049318e-06, "loss": 0.9311866164207458, "step": 496 }, { "epoch": 1.0506329113924051, "grad_norm": 0.8843758702278137, "learning_rate": 7.150570726591178e-06, "loss": 1.216412901878357, "step": 498 }, { "epoch": 1.0548523206751055, "grad_norm": 1.182377815246582, "learning_rate": 7.14245118506056e-06, "loss": 1.0732381343841553, "step": 500 }, { "epoch": 1.0590717299578059, "grad_norm": 1.3736239671707153, "learning_rate": 7.134298264725542e-06, "loss": 1.0816361904144287, "step": 502 }, { "epoch": 1.0632911392405062, "grad_norm": 0.9931637644767761, "learning_rate": 7.126112064816598e-06, "loss": 1.20469331741333, "step": 504 }, { "epoch": 1.0675105485232068, "grad_norm": 0.99040687084198, "learning_rate": 7.117892684969255e-06, "loss": 0.7193590402603149, "step": 506 }, { "epoch": 1.0717299578059072, "grad_norm": 1.0574781894683838, "learning_rate": 7.109640225222874e-06, "loss": 1.030031442642212, "step": 508 }, { "epoch": 1.0759493670886076, "grad_norm": 0.9034721255302429, "learning_rate": 7.101354786019443e-06, "loss": 1.0760937929153442, "step": 510 }, { "epoch": 1.080168776371308, "grad_norm": 2.8123185634613037, "learning_rate": 7.0930364682023446e-06, "loss": 1.0546125173568726, "step": 512 }, { "epoch": 1.0843881856540085, "grad_norm": 1.0355448722839355, "learning_rate": 7.084685373015131e-06, "loss": 1.060817003250122, "step": 514 }, { "epoch": 1.0886075949367089, "grad_norm": 0.6953267455101013, "learning_rate": 7.076301602100294e-06, "loss": 1.0389786958694458, "step": 516 }, { "epoch": 1.0928270042194093, "grad_norm": 1.1498650312423706, "learning_rate": 7.067885257498027e-06, "loss": 0.9518197774887085, "step": 518 }, { "epoch": 1.0970464135021096, "grad_norm": 1.411102533340454, "learning_rate": 7.059436441644984e-06, "loss": 0.8960402011871338, "step": 520 }, { "epoch": 1.1012658227848102, "grad_norm": 2.03861927986145, "learning_rate": 7.0509552573730305e-06, "loss": 0.8347494602203369, "step": 522 }, { "epoch": 1.1054852320675106, "grad_norm": 1.5810351371765137, "learning_rate": 7.0424418079079925e-06, "loss": 0.9857693314552307, "step": 524 }, { "epoch": 1.109704641350211, "grad_norm": 2.000030040740967, "learning_rate": 7.033896196868403e-06, "loss": 0.8366687893867493, "step": 526 }, { "epoch": 1.1139240506329113, "grad_norm": 1.2057420015335083, "learning_rate": 7.025318528264234e-06, "loss": 1.3332631587982178, "step": 528 }, { "epoch": 1.1181434599156117, "grad_norm": 0.9624310731887817, "learning_rate": 7.016708906495641e-06, "loss": 1.2037197351455688, "step": 530 }, { "epoch": 1.1223628691983123, "grad_norm": 1.497033715248108, "learning_rate": 7.008067436351683e-06, "loss": 0.6526771783828735, "step": 532 }, { "epoch": 1.1265822784810127, "grad_norm": 2.6542580127716064, "learning_rate": 6.999394223009052e-06, "loss": 0.8994975090026855, "step": 534 }, { "epoch": 1.130801687763713, "grad_norm": 0.8712950944900513, "learning_rate": 6.9906893720307895e-06, "loss": 1.0523709058761597, "step": 536 }, { "epoch": 1.1350210970464134, "grad_norm": 1.1105852127075195, "learning_rate": 6.981952989365005e-06, "loss": 0.8754544854164124, "step": 538 }, { "epoch": 1.139240506329114, "grad_norm": 0.9426999688148499, "learning_rate": 6.973185181343585e-06, "loss": 0.7062304019927979, "step": 540 }, { "epoch": 1.1434599156118144, "grad_norm": 0.9875909090042114, "learning_rate": 6.9643860546809e-06, "loss": 0.7558496594429016, "step": 542 }, { "epoch": 1.1476793248945147, "grad_norm": 1.1727705001831055, "learning_rate": 6.955555716472503e-06, "loss": 0.7818480730056763, "step": 544 }, { "epoch": 1.1518987341772151, "grad_norm": 1.8272697925567627, "learning_rate": 6.9466942741938275e-06, "loss": 1.0798598527908325, "step": 546 }, { "epoch": 1.1561181434599157, "grad_norm": 1.3200129270553589, "learning_rate": 6.93780183569888e-06, "loss": 1.1049277782440186, "step": 548 }, { "epoch": 1.160337552742616, "grad_norm": 12.516786575317383, "learning_rate": 6.928878509218929e-06, "loss": 0.8925328850746155, "step": 550 }, { "epoch": 1.1645569620253164, "grad_norm": 1.5449568033218384, "learning_rate": 6.919924403361182e-06, "loss": 1.3479973077774048, "step": 552 }, { "epoch": 1.1687763713080168, "grad_norm": 0.9759451746940613, "learning_rate": 6.910939627107469e-06, "loss": 1.0944254398345947, "step": 554 }, { "epoch": 1.1729957805907172, "grad_norm": 3.6769256591796875, "learning_rate": 6.901924289812913e-06, "loss": 0.6496379375457764, "step": 556 }, { "epoch": 1.1772151898734178, "grad_norm": 1.0708627700805664, "learning_rate": 6.892878501204603e-06, "loss": 0.9399113059043884, "step": 558 }, { "epoch": 1.1814345991561181, "grad_norm": 3.548515558242798, "learning_rate": 6.883802371380252e-06, "loss": 0.6334307789802551, "step": 560 }, { "epoch": 1.1856540084388185, "grad_norm": 0.7087482810020447, "learning_rate": 6.874696010806865e-06, "loss": 0.6812013983726501, "step": 562 }, { "epoch": 1.189873417721519, "grad_norm": 1.6790183782577515, "learning_rate": 6.865559530319386e-06, "loss": 1.2819935083389282, "step": 564 }, { "epoch": 1.1940928270042195, "grad_norm": 2.5965490341186523, "learning_rate": 6.8563930411193535e-06, "loss": 1.02937912940979, "step": 566 }, { "epoch": 1.1983122362869199, "grad_norm": 3.7219197750091553, "learning_rate": 6.847196654773552e-06, "loss": 0.7903206944465637, "step": 568 }, { "epoch": 1.2025316455696202, "grad_norm": 0.9391790628433228, "learning_rate": 6.837970483212643e-06, "loss": 1.0360606908798218, "step": 570 }, { "epoch": 1.2067510548523206, "grad_norm": 20.603315353393555, "learning_rate": 6.828714638729809e-06, "loss": 1.0591099262237549, "step": 572 }, { "epoch": 1.2109704641350212, "grad_norm": 2.5088610649108887, "learning_rate": 6.81942923397939e-06, "loss": 1.0366530418395996, "step": 574 }, { "epoch": 1.2151898734177216, "grad_norm": 0.9826826453208923, "learning_rate": 6.810114381975507e-06, "loss": 0.9062384963035583, "step": 576 }, { "epoch": 1.219409282700422, "grad_norm": 1.3147906064987183, "learning_rate": 6.800770196090688e-06, "loss": 0.6110230684280396, "step": 578 }, { "epoch": 1.2236286919831223, "grad_norm": 0.8988205194473267, "learning_rate": 6.791396790054484e-06, "loss": 0.910240888595581, "step": 580 }, { "epoch": 1.2278481012658227, "grad_norm": 2.201284170150757, "learning_rate": 6.781994277952099e-06, "loss": 0.8457823395729065, "step": 582 }, { "epoch": 1.2320675105485233, "grad_norm": 3.1297316551208496, "learning_rate": 6.7725627742229815e-06, "loss": 0.8808956146240234, "step": 584 }, { "epoch": 1.2362869198312236, "grad_norm": 5.279428482055664, "learning_rate": 6.763102393659446e-06, "loss": 0.9118282198905945, "step": 586 }, { "epoch": 1.240506329113924, "grad_norm": 1.449725866317749, "learning_rate": 6.753613251405274e-06, "loss": 0.8038244247436523, "step": 588 }, { "epoch": 1.2447257383966246, "grad_norm": 1.0893408060073853, "learning_rate": 6.744095462954303e-06, "loss": 1.065926194190979, "step": 590 }, { "epoch": 1.248945147679325, "grad_norm": 11.18133544921875, "learning_rate": 6.734549144149036e-06, "loss": 0.6128525733947754, "step": 592 }, { "epoch": 1.2531645569620253, "grad_norm": 0.5239539742469788, "learning_rate": 6.724974411179218e-06, "loss": 0.8248177766799927, "step": 594 }, { "epoch": 1.2573839662447257, "grad_norm": 3.62746000289917, "learning_rate": 6.7153713805804285e-06, "loss": 0.6825551986694336, "step": 596 }, { "epoch": 1.261603375527426, "grad_norm": 3.413501501083374, "learning_rate": 6.7057401692326625e-06, "loss": 0.567305862903595, "step": 598 }, { "epoch": 1.2658227848101267, "grad_norm": 0.6996157169342041, "learning_rate": 6.696080894358908e-06, "loss": 0.8849403262138367, "step": 600 }, { "epoch": 1.270042194092827, "grad_norm": 6.248124599456787, "learning_rate": 6.686393673523715e-06, "loss": 1.3093706369400024, "step": 602 }, { "epoch": 1.2742616033755274, "grad_norm": 0.9306197166442871, "learning_rate": 6.6766786246317726e-06, "loss": 1.0244123935699463, "step": 604 }, { "epoch": 1.2784810126582278, "grad_norm": 2.1768555641174316, "learning_rate": 6.666935865926468e-06, "loss": 0.8419608473777771, "step": 606 }, { "epoch": 1.2827004219409281, "grad_norm": 1.509337306022644, "learning_rate": 6.65716551598845e-06, "loss": 0.8019965291023254, "step": 608 }, { "epoch": 1.2869198312236287, "grad_norm": 5.7914323806762695, "learning_rate": 6.647367693734181e-06, "loss": 0.8274118900299072, "step": 610 }, { "epoch": 1.2911392405063291, "grad_norm": 3.3554115295410156, "learning_rate": 6.637542518414495e-06, "loss": 0.5377339124679565, "step": 612 }, { "epoch": 1.2953586497890295, "grad_norm": 0.9977070093154907, "learning_rate": 6.627690109613147e-06, "loss": 0.6412088871002197, "step": 614 }, { "epoch": 1.29957805907173, "grad_norm": 5.793771743774414, "learning_rate": 6.617810587245352e-06, "loss": 1.0477070808410645, "step": 616 }, { "epoch": 1.3037974683544304, "grad_norm": 1.5624624490737915, "learning_rate": 6.607904071556331e-06, "loss": 1.0696133375167847, "step": 618 }, { "epoch": 1.3080168776371308, "grad_norm": 0.9112898111343384, "learning_rate": 6.597970683119841e-06, "loss": 0.6664775609970093, "step": 620 }, { "epoch": 1.3122362869198312, "grad_norm": 1.806433081626892, "learning_rate": 6.588010542836715e-06, "loss": 0.7590267062187195, "step": 622 }, { "epoch": 1.3164556962025316, "grad_norm": 1.8049041032791138, "learning_rate": 6.578023771933387e-06, "loss": 0.8482476472854614, "step": 624 }, { "epoch": 1.3206751054852321, "grad_norm": 0.6221198439598083, "learning_rate": 6.568010491960412e-06, "loss": 1.0443530082702637, "step": 626 }, { "epoch": 1.3248945147679325, "grad_norm": 1.562410593032837, "learning_rate": 6.557970824790997e-06, "loss": 1.539845585823059, "step": 628 }, { "epoch": 1.3291139240506329, "grad_norm": 0.906804084777832, "learning_rate": 6.5479048926195106e-06, "loss": 0.9005885124206543, "step": 630 }, { "epoch": 1.3333333333333333, "grad_norm": 0.909907341003418, "learning_rate": 6.53781281795999e-06, "loss": 0.8963803648948669, "step": 632 }, { "epoch": 1.3375527426160336, "grad_norm": 0.4755572974681854, "learning_rate": 6.527694723644668e-06, "loss": 0.766118049621582, "step": 634 }, { "epoch": 1.3417721518987342, "grad_norm": 1.1491714715957642, "learning_rate": 6.517550732822457e-06, "loss": 0.611838161945343, "step": 636 }, { "epoch": 1.3459915611814346, "grad_norm": 2.129890203475952, "learning_rate": 6.507380968957463e-06, "loss": 0.7923972606658936, "step": 638 }, { "epoch": 1.350210970464135, "grad_norm": 0.800757646560669, "learning_rate": 6.497185555827484e-06, "loss": 1.1963096857070923, "step": 640 }, { "epoch": 1.3544303797468356, "grad_norm": 1.0547887086868286, "learning_rate": 6.486964617522494e-06, "loss": 0.7548023462295532, "step": 642 }, { "epoch": 1.358649789029536, "grad_norm": 1.0978026390075684, "learning_rate": 6.476718278443137e-06, "loss": 1.230237603187561, "step": 644 }, { "epoch": 1.3628691983122363, "grad_norm": 0.8420884609222412, "learning_rate": 6.4664466632992195e-06, "loss": 1.0555733442306519, "step": 646 }, { "epoch": 1.3670886075949367, "grad_norm": 13.5333890914917, "learning_rate": 6.456149897108182e-06, "loss": 0.8676448464393616, "step": 648 }, { "epoch": 1.371308016877637, "grad_norm": 0.9475505352020264, "learning_rate": 6.445828105193586e-06, "loss": 1.2682842016220093, "step": 650 }, { "epoch": 1.3755274261603376, "grad_norm": 0.9018502831459045, "learning_rate": 6.4354814131835815e-06, "loss": 1.0565340518951416, "step": 652 }, { "epoch": 1.379746835443038, "grad_norm": 0.54316645860672, "learning_rate": 6.425109947009384e-06, "loss": 0.7839528322219849, "step": 654 }, { "epoch": 1.3839662447257384, "grad_norm": 0.874775767326355, "learning_rate": 6.414713832903737e-06, "loss": 1.1698050498962402, "step": 656 }, { "epoch": 1.3881856540084387, "grad_norm": 4.575730323791504, "learning_rate": 6.404293197399381e-06, "loss": 0.5863835215568542, "step": 658 }, { "epoch": 1.3924050632911391, "grad_norm": 0.7375707030296326, "learning_rate": 6.393848167327507e-06, "loss": 1.086789608001709, "step": 660 }, { "epoch": 1.3966244725738397, "grad_norm": 2.6595211029052734, "learning_rate": 6.3833788698162205e-06, "loss": 0.7023826241493225, "step": 662 }, { "epoch": 1.40084388185654, "grad_norm": 1.631126046180725, "learning_rate": 6.372885432288982e-06, "loss": 1.0789552927017212, "step": 664 }, { "epoch": 1.4050632911392404, "grad_norm": 0.8146964907646179, "learning_rate": 6.362367982463073e-06, "loss": 0.6926907300949097, "step": 666 }, { "epoch": 1.409282700421941, "grad_norm": 0.9691137075424194, "learning_rate": 6.351826648348027e-06, "loss": 1.0910325050354004, "step": 668 }, { "epoch": 1.4135021097046414, "grad_norm": 0.9395283460617065, "learning_rate": 6.341261558244079e-06, "loss": 1.0995792150497437, "step": 670 }, { "epoch": 1.4177215189873418, "grad_norm": 1.9160293340682983, "learning_rate": 6.3306728407406015e-06, "loss": 1.3626757860183716, "step": 672 }, { "epoch": 1.4219409282700421, "grad_norm": 1.089788556098938, "learning_rate": 6.320060624714535e-06, "loss": 0.6344588994979858, "step": 674 }, { "epoch": 1.4261603375527425, "grad_norm": 0.6872738003730774, "learning_rate": 6.309425039328834e-06, "loss": 0.5873957872390747, "step": 676 }, { "epoch": 1.4303797468354431, "grad_norm": 2.0739026069641113, "learning_rate": 6.298766214030878e-06, "loss": 0.9192869067192078, "step": 678 }, { "epoch": 1.4345991561181435, "grad_norm": 1.781925082206726, "learning_rate": 6.288084278550905e-06, "loss": 1.0239968299865723, "step": 680 }, { "epoch": 1.4388185654008439, "grad_norm": 1.0395771265029907, "learning_rate": 6.2773793629004305e-06, "loss": 0.7735893726348877, "step": 682 }, { "epoch": 1.4430379746835442, "grad_norm": 2.4246017932891846, "learning_rate": 6.2666515973706635e-06, "loss": 1.135629415512085, "step": 684 }, { "epoch": 1.4472573839662446, "grad_norm": 0.7276327013969421, "learning_rate": 6.255901112530928e-06, "loss": 0.7381588816642761, "step": 686 }, { "epoch": 1.4514767932489452, "grad_norm": 1.8231748342514038, "learning_rate": 6.245128039227063e-06, "loss": 0.8623338341712952, "step": 688 }, { "epoch": 1.4556962025316456, "grad_norm": 2.5722429752349854, "learning_rate": 6.234332508579835e-06, "loss": 1.0199339389801025, "step": 690 }, { "epoch": 1.459915611814346, "grad_norm": 1.6927199363708496, "learning_rate": 6.2235146519833465e-06, "loss": 0.5960026383399963, "step": 692 }, { "epoch": 1.4641350210970465, "grad_norm": 1.3126322031021118, "learning_rate": 6.21267460110343e-06, "loss": 1.2402201890945435, "step": 694 }, { "epoch": 1.4683544303797469, "grad_norm": 2.016200304031372, "learning_rate": 6.201812487876048e-06, "loss": 0.5972878932952881, "step": 696 }, { "epoch": 1.4725738396624473, "grad_norm": 0.913316011428833, "learning_rate": 6.1909284445056886e-06, "loss": 1.0932003259658813, "step": 698 }, { "epoch": 1.4767932489451476, "grad_norm": 0.6828026175498962, "learning_rate": 6.1800226034637514e-06, "loss": 1.1358331441879272, "step": 700 }, { "epoch": 1.481012658227848, "grad_norm": 2.209768295288086, "learning_rate": 6.169095097486947e-06, "loss": 0.9575251340866089, "step": 702 }, { "epoch": 1.4852320675105486, "grad_norm": 1.6049399375915527, "learning_rate": 6.158146059575663e-06, "loss": 0.7674723863601685, "step": 704 }, { "epoch": 1.489451476793249, "grad_norm": 3.575786828994751, "learning_rate": 6.147175622992363e-06, "loss": 1.086501121520996, "step": 706 }, { "epoch": 1.4936708860759493, "grad_norm": 0.9102724194526672, "learning_rate": 6.136183921259956e-06, "loss": 1.1395413875579834, "step": 708 }, { "epoch": 1.49789029535865, "grad_norm": 0.5558487176895142, "learning_rate": 6.125171088160168e-06, "loss": 0.9195235371589661, "step": 710 }, { "epoch": 1.50210970464135, "grad_norm": 0.9477531313896179, "learning_rate": 6.114137257731925e-06, "loss": 0.4785539209842682, "step": 712 }, { "epoch": 1.5063291139240507, "grad_norm": 0.8040223717689514, "learning_rate": 6.10308256426971e-06, "loss": 1.0396082401275635, "step": 714 }, { "epoch": 1.510548523206751, "grad_norm": 0.9315183162689209, "learning_rate": 6.092007142321932e-06, "loss": 1.043006181716919, "step": 716 }, { "epoch": 1.5147679324894514, "grad_norm": 0.7686951756477356, "learning_rate": 6.080911126689296e-06, "loss": 1.0344305038452148, "step": 718 }, { "epoch": 1.518987341772152, "grad_norm": 2.7792484760284424, "learning_rate": 6.069794652423152e-06, "loss": 1.009570598602295, "step": 720 }, { "epoch": 1.5232067510548524, "grad_norm": 2.2277615070343018, "learning_rate": 6.058657854823854e-06, "loss": 1.0374475717544556, "step": 722 }, { "epoch": 1.5274261603375527, "grad_norm": 1.6987587213516235, "learning_rate": 6.047500869439114e-06, "loss": 1.1916974782943726, "step": 724 }, { "epoch": 1.5316455696202531, "grad_norm": 4.649923801422119, "learning_rate": 6.036323832062359e-06, "loss": 0.5684564113616943, "step": 726 }, { "epoch": 1.5358649789029535, "grad_norm": 2.5940101146698, "learning_rate": 6.025126878731064e-06, "loss": 0.3716410994529724, "step": 728 }, { "epoch": 1.540084388185654, "grad_norm": 0.8527175784111023, "learning_rate": 6.013910145725112e-06, "loss": 0.8164302706718445, "step": 730 }, { "epoch": 1.5443037974683544, "grad_norm": 0.9312422275543213, "learning_rate": 6.002673769565118e-06, "loss": 0.9368805885314941, "step": 732 }, { "epoch": 1.5485232067510548, "grad_norm": 0.9247412085533142, "learning_rate": 5.991417887010786e-06, "loss": 1.1238614320755005, "step": 734 }, { "epoch": 1.5527426160337554, "grad_norm": 0.723078727722168, "learning_rate": 5.98014263505923e-06, "loss": 0.8048302531242371, "step": 736 }, { "epoch": 1.5569620253164556, "grad_norm": 1.92336106300354, "learning_rate": 5.968848150943314e-06, "loss": 0.8754326105117798, "step": 738 }, { "epoch": 1.5611814345991561, "grad_norm": 1.0468974113464355, "learning_rate": 5.957534572129979e-06, "loss": 0.9829418659210205, "step": 740 }, { "epoch": 1.5654008438818565, "grad_norm": 0.782278835773468, "learning_rate": 5.946202036318572e-06, "loss": 0.6887242197990417, "step": 742 }, { "epoch": 1.5696202531645569, "grad_norm": 1.8223977088928223, "learning_rate": 5.934850681439166e-06, "loss": 0.5122029185295105, "step": 744 }, { "epoch": 1.5738396624472575, "grad_norm": 1.1448414325714111, "learning_rate": 5.923480645650887e-06, "loss": 0.6803614497184753, "step": 746 }, { "epoch": 1.5780590717299579, "grad_norm": 4.745306491851807, "learning_rate": 5.912092067340226e-06, "loss": 0.6753883361816406, "step": 748 }, { "epoch": 1.5822784810126582, "grad_norm": 0.8308981657028198, "learning_rate": 5.900685085119361e-06, "loss": 1.0774937868118286, "step": 750 }, { "epoch": 1.5864978902953588, "grad_norm": 1.0071589946746826, "learning_rate": 5.889259837824464e-06, "loss": 0.5942963361740112, "step": 752 }, { "epoch": 1.590717299578059, "grad_norm": 0.7977795600891113, "learning_rate": 5.8778164645140155e-06, "loss": 0.644191563129425, "step": 754 }, { "epoch": 1.5949367088607596, "grad_norm": 0.7821984887123108, "learning_rate": 5.8663551044671125e-06, "loss": 0.601950466632843, "step": 756 }, { "epoch": 1.59915611814346, "grad_norm": 1.1435626745224, "learning_rate": 5.854875897181766e-06, "loss": 0.8324768543243408, "step": 758 }, { "epoch": 1.6033755274261603, "grad_norm": 0.794941246509552, "learning_rate": 5.843378982373218e-06, "loss": 1.0321424007415771, "step": 760 }, { "epoch": 1.6075949367088609, "grad_norm": 0.4165087938308716, "learning_rate": 5.8318644999722194e-06, "loss": 0.6179360747337341, "step": 762 }, { "epoch": 1.611814345991561, "grad_norm": 1.0744069814682007, "learning_rate": 5.820332590123348e-06, "loss": 1.0869427919387817, "step": 764 }, { "epoch": 1.6160337552742616, "grad_norm": 2.4255731105804443, "learning_rate": 5.80878339318329e-06, "loss": 0.9976139664649963, "step": 766 }, { "epoch": 1.620253164556962, "grad_norm": 0.3937893807888031, "learning_rate": 5.797217049719138e-06, "loss": 0.8773806095123291, "step": 768 }, { "epoch": 1.6244725738396624, "grad_norm": 1.528141975402832, "learning_rate": 5.785633700506676e-06, "loss": 1.0529608726501465, "step": 770 }, { "epoch": 1.628691983122363, "grad_norm": 1.1761523485183716, "learning_rate": 5.774033486528666e-06, "loss": 1.1696523427963257, "step": 772 }, { "epoch": 1.6329113924050633, "grad_norm": 0.6605724096298218, "learning_rate": 5.762416548973137e-06, "loss": 1.06764554977417, "step": 774 }, { "epoch": 1.6371308016877637, "grad_norm": 1.0275272130966187, "learning_rate": 5.750783029231662e-06, "loss": 1.0699821710586548, "step": 776 }, { "epoch": 1.6413502109704643, "grad_norm": 0.9171205759048462, "learning_rate": 5.739133068897638e-06, "loss": 0.7903687953948975, "step": 778 }, { "epoch": 1.6455696202531644, "grad_norm": 2.0880374908447266, "learning_rate": 5.727466809764562e-06, "loss": 0.372045636177063, "step": 780 }, { "epoch": 1.649789029535865, "grad_norm": 3.6843972206115723, "learning_rate": 5.715784393824309e-06, "loss": 1.0749914646148682, "step": 782 }, { "epoch": 1.6540084388185654, "grad_norm": 1.0832284688949585, "learning_rate": 5.7040859632653985e-06, "loss": 0.9234107136726379, "step": 784 }, { "epoch": 1.6582278481012658, "grad_norm": 1.366377353668213, "learning_rate": 5.692371660471269e-06, "loss": 1.0691020488739014, "step": 786 }, { "epoch": 1.6624472573839664, "grad_norm": 1.1804683208465576, "learning_rate": 5.680641628018539e-06, "loss": 0.5163772702217102, "step": 788 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0049868822097778, "learning_rate": 5.6688960086752775e-06, "loss": 1.0653210878372192, "step": 790 }, { "epoch": 1.6708860759493671, "grad_norm": 2.6457467079162598, "learning_rate": 5.657134945399265e-06, "loss": 0.6419547200202942, "step": 792 }, { "epoch": 1.6751054852320675, "grad_norm": 0.2932789921760559, "learning_rate": 5.645358581336249e-06, "loss": 0.8718560338020325, "step": 794 }, { "epoch": 1.6793248945147679, "grad_norm": 0.8630058169364929, "learning_rate": 5.633567059818208e-06, "loss": 1.0517830848693848, "step": 796 }, { "epoch": 1.6835443037974684, "grad_norm": 4.652339935302734, "learning_rate": 5.621760524361605e-06, "loss": 0.8228880167007446, "step": 798 }, { "epoch": 1.6877637130801688, "grad_norm": 1.278536081314087, "learning_rate": 5.6099391186656375e-06, "loss": 1.1810134649276733, "step": 800 }, { "epoch": 1.6919831223628692, "grad_norm": 5.43027925491333, "learning_rate": 5.598102986610493e-06, "loss": 0.5597525238990784, "step": 802 }, { "epoch": 1.6962025316455698, "grad_norm": 4.56681489944458, "learning_rate": 5.586252272255595e-06, "loss": 1.1962707042694092, "step": 804 }, { "epoch": 1.70042194092827, "grad_norm": 0.887153148651123, "learning_rate": 5.574387119837848e-06, "loss": 1.0536723136901855, "step": 806 }, { "epoch": 1.7046413502109705, "grad_norm": 8.7942533493042, "learning_rate": 5.562507673769889e-06, "loss": 0.7714130282402039, "step": 808 }, { "epoch": 1.7088607594936709, "grad_norm": 2.3065688610076904, "learning_rate": 5.550614078638324e-06, "loss": 0.8722562193870544, "step": 810 }, { "epoch": 1.7130801687763713, "grad_norm": 1.0666223764419556, "learning_rate": 5.5387064792019686e-06, "loss": 1.1128357648849487, "step": 812 }, { "epoch": 1.7172995780590719, "grad_norm": 3.0997695922851562, "learning_rate": 5.526785020390084e-06, "loss": 1.4282304048538208, "step": 814 }, { "epoch": 1.721518987341772, "grad_norm": 1.0092432498931885, "learning_rate": 5.514849847300622e-06, "loss": 1.1036298274993896, "step": 816 }, { "epoch": 1.7257383966244726, "grad_norm": 3.5967748165130615, "learning_rate": 5.502901105198449e-06, "loss": 0.7901860475540161, "step": 818 }, { "epoch": 1.729957805907173, "grad_norm": 2.40335750579834, "learning_rate": 5.490938939513584e-06, "loss": 0.3646574020385742, "step": 820 }, { "epoch": 1.7341772151898733, "grad_norm": 3.5133466720581055, "learning_rate": 5.478963495839425e-06, "loss": 0.5445467233657837, "step": 822 }, { "epoch": 1.738396624472574, "grad_norm": 0.9658949375152588, "learning_rate": 5.466974919930979e-06, "loss": 0.7141355276107788, "step": 824 }, { "epoch": 1.7426160337552743, "grad_norm": 1.7418462038040161, "learning_rate": 5.454973357703087e-06, "loss": 0.8929092884063721, "step": 826 }, { "epoch": 1.7468354430379747, "grad_norm": 8.529016494750977, "learning_rate": 5.442958955228649e-06, "loss": 0.9267692565917969, "step": 828 }, { "epoch": 1.7510548523206753, "grad_norm": 0.9777578115463257, "learning_rate": 5.430931858736848e-06, "loss": 1.0351005792617798, "step": 830 }, { "epoch": 1.7552742616033754, "grad_norm": 1.306839108467102, "learning_rate": 5.418892214611364e-06, "loss": 1.0336472988128662, "step": 832 }, { "epoch": 1.759493670886076, "grad_norm": 0.9158060550689697, "learning_rate": 5.406840169388598e-06, "loss": 0.8349417448043823, "step": 834 }, { "epoch": 1.7637130801687764, "grad_norm": 4.128747940063477, "learning_rate": 5.394775869755888e-06, "loss": 1.078331470489502, "step": 836 }, { "epoch": 1.7679324894514767, "grad_norm": 2.023729085922241, "learning_rate": 5.3826994625497186e-06, "loss": 0.8993400931358337, "step": 838 }, { "epoch": 1.7721518987341773, "grad_norm": 1.9145705699920654, "learning_rate": 5.370611094753943e-06, "loss": 0.756892740726471, "step": 840 }, { "epoch": 1.7763713080168775, "grad_norm": 4.3195695877075195, "learning_rate": 5.358510913497981e-06, "loss": 0.8908122777938843, "step": 842 }, { "epoch": 1.780590717299578, "grad_norm": 0.7751283645629883, "learning_rate": 5.346399066055044e-06, "loss": 0.4248788058757782, "step": 844 }, { "epoch": 1.7848101265822784, "grad_norm": 0.7409003973007202, "learning_rate": 5.33427569984033e-06, "loss": 0.650154173374176, "step": 846 }, { "epoch": 1.7890295358649788, "grad_norm": 1.8226172924041748, "learning_rate": 5.322140962409236e-06, "loss": 0.59881591796875, "step": 848 }, { "epoch": 1.7932489451476794, "grad_norm": 1.4619311094284058, "learning_rate": 5.3099950014555554e-06, "loss": 0.7507359981536865, "step": 850 }, { "epoch": 1.7974683544303798, "grad_norm": 1.0151058435440063, "learning_rate": 5.29783796480969e-06, "loss": 1.127907633781433, "step": 852 }, { "epoch": 1.8016877637130801, "grad_norm": 2.056638240814209, "learning_rate": 5.2856700004368425e-06, "loss": 1.3744020462036133, "step": 854 }, { "epoch": 1.8059071729957807, "grad_norm": 0.30007457733154297, "learning_rate": 5.273491256435222e-06, "loss": 0.8465395569801331, "step": 856 }, { "epoch": 1.810126582278481, "grad_norm": 2.211362361907959, "learning_rate": 5.2613018810342314e-06, "loss": 0.9668091535568237, "step": 858 }, { "epoch": 1.8143459915611815, "grad_norm": 0.9358858466148376, "learning_rate": 5.24910202259268e-06, "loss": 0.664305567741394, "step": 860 }, { "epoch": 1.8185654008438819, "grad_norm": 1.0162758827209473, "learning_rate": 5.236891829596958e-06, "loss": 1.0983484983444214, "step": 862 }, { "epoch": 1.8227848101265822, "grad_norm": 0.8416312336921692, "learning_rate": 5.2246714506592454e-06, "loss": 0.9112118482589722, "step": 864 }, { "epoch": 1.8270042194092828, "grad_norm": 0.833525538444519, "learning_rate": 5.212441034515695e-06, "loss": 0.9819576740264893, "step": 866 }, { "epoch": 1.831223628691983, "grad_norm": 21.00221824645996, "learning_rate": 5.200200730024622e-06, "loss": 0.9238821268081665, "step": 868 }, { "epoch": 1.8354430379746836, "grad_norm": 0.7736468315124512, "learning_rate": 5.187950686164699e-06, "loss": 0.9560548663139343, "step": 870 }, { "epoch": 1.839662447257384, "grad_norm": 2.4590296745300293, "learning_rate": 5.175691052033133e-06, "loss": 0.7176443338394165, "step": 872 }, { "epoch": 1.8438818565400843, "grad_norm": 14.438817977905273, "learning_rate": 5.163421976843859e-06, "loss": 0.9139724373817444, "step": 874 }, { "epoch": 1.8481012658227849, "grad_norm": 0.7402142286300659, "learning_rate": 5.151143609925718e-06, "loss": 1.0629327297210693, "step": 876 }, { "epoch": 1.8523206751054853, "grad_norm": 2.3395838737487793, "learning_rate": 5.138856100720645e-06, "loss": 0.7460686564445496, "step": 878 }, { "epoch": 1.8565400843881856, "grad_norm": 1.0752966403961182, "learning_rate": 5.126559598781845e-06, "loss": 0.6765896677970886, "step": 880 }, { "epoch": 1.8607594936708862, "grad_norm": 1.11525559425354, "learning_rate": 5.114254253771977e-06, "loss": 0.8317286968231201, "step": 882 }, { "epoch": 1.8649789029535864, "grad_norm": 1.3351662158966064, "learning_rate": 5.1019402154613264e-06, "loss": 0.6764845252037048, "step": 884 }, { "epoch": 1.869198312236287, "grad_norm": 1.5033775568008423, "learning_rate": 5.089617633725992e-06, "loss": 0.7203776240348816, "step": 886 }, { "epoch": 1.8734177215189873, "grad_norm": 0.8415127992630005, "learning_rate": 5.07728665854605e-06, "loss": 1.027212142944336, "step": 888 }, { "epoch": 1.8776371308016877, "grad_norm": 6.813464641571045, "learning_rate": 5.064947440003741e-06, "loss": 0.3982529640197754, "step": 890 }, { "epoch": 1.8818565400843883, "grad_norm": 1.178280234336853, "learning_rate": 5.0526001282816285e-06, "loss": 0.6589434146881104, "step": 892 }, { "epoch": 1.8860759493670884, "grad_norm": 1.1725980043411255, "learning_rate": 5.0402448736607874e-06, "loss": 1.087322473526001, "step": 894 }, { "epoch": 1.890295358649789, "grad_norm": 0.9317290782928467, "learning_rate": 5.027881826518963e-06, "loss": 1.2050056457519531, "step": 896 }, { "epoch": 1.8945147679324894, "grad_norm": 3.785717010498047, "learning_rate": 5.015511137328743e-06, "loss": 0.7949274182319641, "step": 898 }, { "epoch": 1.8987341772151898, "grad_norm": 1.9153603315353394, "learning_rate": 5.003132956655735e-06, "loss": 0.9485737085342407, "step": 900 }, { "epoch": 1.9029535864978904, "grad_norm": 1.1202062368392944, "learning_rate": 4.990747435156715e-06, "loss": 1.1542925834655762, "step": 902 }, { "epoch": 1.9071729957805907, "grad_norm": 5.2798967361450195, "learning_rate": 4.978354723577818e-06, "loss": 0.9438016414642334, "step": 904 }, { "epoch": 1.9113924050632911, "grad_norm": 1.9839028120040894, "learning_rate": 4.965954972752677e-06, "loss": 1.31730055809021, "step": 906 }, { "epoch": 1.9156118143459917, "grad_norm": 3.2454068660736084, "learning_rate": 4.953548333600616e-06, "loss": 0.43834638595581055, "step": 908 }, { "epoch": 1.9198312236286919, "grad_norm": 0.7044057250022888, "learning_rate": 4.9411349571247845e-06, "loss": 1.0278995037078857, "step": 910 }, { "epoch": 1.9240506329113924, "grad_norm": 4.6270246505737305, "learning_rate": 4.928714994410341e-06, "loss": 0.7902883887290955, "step": 912 }, { "epoch": 1.9282700421940928, "grad_norm": 1.1006572246551514, "learning_rate": 4.9162885966226035e-06, "loss": 1.0976777076721191, "step": 914 }, { "epoch": 1.9324894514767932, "grad_norm": 1.0053693056106567, "learning_rate": 4.903855915005212e-06, "loss": 0.7121254205703735, "step": 916 }, { "epoch": 1.9367088607594938, "grad_norm": 0.7003014087677002, "learning_rate": 4.8914171008782885e-06, "loss": 1.054925560951233, "step": 918 }, { "epoch": 1.9409282700421941, "grad_norm": 1.9592584371566772, "learning_rate": 4.878972305636595e-06, "loss": 0.46024253964424133, "step": 920 }, { "epoch": 1.9451476793248945, "grad_norm": 0.7745406627655029, "learning_rate": 4.86652168074769e-06, "loss": 1.0552338361740112, "step": 922 }, { "epoch": 1.9493670886075949, "grad_norm": 1.3688344955444336, "learning_rate": 4.8540653777500865e-06, "loss": 1.0862473249435425, "step": 924 }, { "epoch": 1.9535864978902953, "grad_norm": 2.7827236652374268, "learning_rate": 4.841603548251406e-06, "loss": 0.8950420022010803, "step": 926 }, { "epoch": 1.9578059071729959, "grad_norm": 0.7784646153450012, "learning_rate": 4.829136343926532e-06, "loss": 0.720669686794281, "step": 928 }, { "epoch": 1.9620253164556962, "grad_norm": 1.3589555025100708, "learning_rate": 4.816663916515772e-06, "loss": 1.1043243408203125, "step": 930 }, { "epoch": 1.9662447257383966, "grad_norm": 3.5423665046691895, "learning_rate": 4.804186417822995e-06, "loss": 0.870411217212677, "step": 932 }, { "epoch": 1.9704641350210972, "grad_norm": 0.7861785292625427, "learning_rate": 4.791703999713803e-06, "loss": 1.1790004968643188, "step": 934 }, { "epoch": 1.9746835443037973, "grad_norm": 0.7902828454971313, "learning_rate": 4.779216814113667e-06, "loss": 1.103920340538025, "step": 936 }, { "epoch": 1.978902953586498, "grad_norm": 0.6530998945236206, "learning_rate": 4.766725013006085e-06, "loss": 0.7225710153579712, "step": 938 }, { "epoch": 1.9831223628691983, "grad_norm": 1.588179588317871, "learning_rate": 4.754228748430731e-06, "loss": 1.0604408979415894, "step": 940 }, { "epoch": 1.9873417721518987, "grad_norm": 1.7913926839828491, "learning_rate": 4.741728172481607e-06, "loss": 0.8651899099349976, "step": 942 }, { "epoch": 1.9915611814345993, "grad_norm": 0.891537070274353, "learning_rate": 4.729223437305187e-06, "loss": 0.6996287107467651, "step": 944 }, { "epoch": 1.9957805907172996, "grad_norm": 1.4173448085784912, "learning_rate": 4.716714695098568e-06, "loss": 1.0344507694244385, "step": 946 }, { "epoch": 2.0, "grad_norm": 3.7296454906463623, "learning_rate": 4.7042020981076185e-06, "loss": 0.5512294173240662, "step": 948 }, { "epoch": 2.0042194092827006, "grad_norm": 2.249424457550049, "learning_rate": 4.69168579862512e-06, "loss": 0.8092342615127563, "step": 950 }, { "epoch": 2.0084388185654007, "grad_norm": 2.6464383602142334, "learning_rate": 4.679165948988924e-06, "loss": 0.47413283586502075, "step": 952 }, { "epoch": 2.0126582278481013, "grad_norm": 1.5369104146957397, "learning_rate": 4.666642701580086e-06, "loss": 0.7702062129974365, "step": 954 }, { "epoch": 2.0168776371308015, "grad_norm": 1.0920283794403076, "learning_rate": 4.65411620882102e-06, "loss": 0.8473414182662964, "step": 956 }, { "epoch": 2.021097046413502, "grad_norm": 20.295406341552734, "learning_rate": 4.6415866231736375e-06, "loss": 0.6457698345184326, "step": 958 }, { "epoch": 2.0253164556962027, "grad_norm": 3.8915340900421143, "learning_rate": 4.629054097137493e-06, "loss": 0.7031627893447876, "step": 960 }, { "epoch": 2.029535864978903, "grad_norm": 1.0874841213226318, "learning_rate": 4.616518783247934e-06, "loss": 1.0022499561309814, "step": 962 }, { "epoch": 2.0337552742616034, "grad_norm": 5.714715480804443, "learning_rate": 4.603980834074232e-06, "loss": 0.7056564688682556, "step": 964 }, { "epoch": 2.037974683544304, "grad_norm": 0.8951921463012695, "learning_rate": 4.591440402217741e-06, "loss": 0.5630991458892822, "step": 966 }, { "epoch": 2.042194092827004, "grad_norm": 4.608378887176514, "learning_rate": 4.578897640310025e-06, "loss": 0.6585802435874939, "step": 968 }, { "epoch": 2.0464135021097047, "grad_norm": 1.6705124378204346, "learning_rate": 4.566352701011013e-06, "loss": 0.9024470448493958, "step": 970 }, { "epoch": 2.050632911392405, "grad_norm": 2.591546058654785, "learning_rate": 4.5538057370071315e-06, "loss": 0.7236870527267456, "step": 972 }, { "epoch": 2.0548523206751055, "grad_norm": 1.0205042362213135, "learning_rate": 4.541256901009451e-06, "loss": 0.7728800177574158, "step": 974 }, { "epoch": 2.059071729957806, "grad_norm": 2.32804799079895, "learning_rate": 4.528706345751826e-06, "loss": 0.6220592856407166, "step": 976 }, { "epoch": 2.0632911392405062, "grad_norm": 0.9847302436828613, "learning_rate": 4.516154223989039e-06, "loss": 0.6414508819580078, "step": 978 }, { "epoch": 2.067510548523207, "grad_norm": 1.0494519472122192, "learning_rate": 4.503600688494938e-06, "loss": 0.5687150359153748, "step": 980 }, { "epoch": 2.071729957805907, "grad_norm": 1.0996086597442627, "learning_rate": 4.491045892060573e-06, "loss": 0.9595503211021423, "step": 982 }, { "epoch": 2.0759493670886076, "grad_norm": 1.6307997703552246, "learning_rate": 4.478489987492346e-06, "loss": 0.8499625325202942, "step": 984 }, { "epoch": 2.080168776371308, "grad_norm": 1.1343793869018555, "learning_rate": 4.465933127610145e-06, "loss": 0.8802004456520081, "step": 986 }, { "epoch": 2.0843881856540083, "grad_norm": 0.8233914375305176, "learning_rate": 4.453375465245486e-06, "loss": 0.8876461982727051, "step": 988 }, { "epoch": 2.088607594936709, "grad_norm": 3.605290651321411, "learning_rate": 4.44081715323965e-06, "loss": 0.47245436906814575, "step": 990 }, { "epoch": 2.0928270042194095, "grad_norm": 1.4245373010635376, "learning_rate": 4.428258344441826e-06, "loss": 0.4930482804775238, "step": 992 }, { "epoch": 2.0970464135021096, "grad_norm": 1.0939189195632935, "learning_rate": 4.415699191707251e-06, "loss": 0.9832253456115723, "step": 994 }, { "epoch": 2.1012658227848102, "grad_norm": 1.3786028623580933, "learning_rate": 4.403139847895348e-06, "loss": 0.8831475377082825, "step": 996 }, { "epoch": 2.1054852320675104, "grad_norm": 0.33124950528144836, "learning_rate": 4.39058046586786e-06, "loss": 0.5398452877998352, "step": 998 }, { "epoch": 2.109704641350211, "grad_norm": 2.1223366260528564, "learning_rate": 4.3780211984870044e-06, "loss": 1.0190367698669434, "step": 1000 }, { "epoch": 2.1139240506329116, "grad_norm": 2.0882437229156494, "learning_rate": 4.365462198613595e-06, "loss": 0.8691745400428772, "step": 1002 }, { "epoch": 2.1181434599156117, "grad_norm": 0.9551434516906738, "learning_rate": 4.352903619105196e-06, "loss": 0.8893840909004211, "step": 1004 }, { "epoch": 2.1223628691983123, "grad_norm": 0.49108386039733887, "learning_rate": 4.340345612814251e-06, "loss": 0.5169594287872314, "step": 1006 }, { "epoch": 2.1265822784810124, "grad_norm": 0.9406089186668396, "learning_rate": 4.327788332586227e-06, "loss": 0.5989170074462891, "step": 1008 }, { "epoch": 2.130801687763713, "grad_norm": 1.099560022354126, "learning_rate": 4.315231931257758e-06, "loss": 0.5996731519699097, "step": 1010 }, { "epoch": 2.1350210970464136, "grad_norm": 12.219691276550293, "learning_rate": 4.302676561654775e-06, "loss": 0.8513282537460327, "step": 1012 }, { "epoch": 2.1392405063291138, "grad_norm": 2.0376791954040527, "learning_rate": 4.290122376590656e-06, "loss": 0.9961199164390564, "step": 1014 }, { "epoch": 2.1434599156118144, "grad_norm": 1.4444695711135864, "learning_rate": 4.2775695288643615e-06, "loss": 0.4728237986564636, "step": 1016 }, { "epoch": 2.147679324894515, "grad_norm": 1.0163081884384155, "learning_rate": 4.2650181712585735e-06, "loss": 0.7495555281639099, "step": 1018 }, { "epoch": 2.151898734177215, "grad_norm": 1.1818724870681763, "learning_rate": 4.252468456537838e-06, "loss": 0.6457207202911377, "step": 1020 }, { "epoch": 2.1561181434599157, "grad_norm": 2.961237907409668, "learning_rate": 4.239920537446705e-06, "loss": 0.7249948978424072, "step": 1022 }, { "epoch": 2.160337552742616, "grad_norm": 2.8546791076660156, "learning_rate": 4.227374566707871e-06, "loss": 0.6750069856643677, "step": 1024 }, { "epoch": 2.1645569620253164, "grad_norm": 1.0282621383666992, "learning_rate": 4.214830697020316e-06, "loss": 0.9150334000587463, "step": 1026 }, { "epoch": 2.168776371308017, "grad_norm": 0.8248642086982727, "learning_rate": 4.202289081057452e-06, "loss": 0.9421663284301758, "step": 1028 }, { "epoch": 2.172995780590717, "grad_norm": 0.9548051953315735, "learning_rate": 4.189749871465253e-06, "loss": 0.8729570508003235, "step": 1030 }, { "epoch": 2.1772151898734178, "grad_norm": 0.8367507457733154, "learning_rate": 4.177213220860416e-06, "loss": 0.8981440663337708, "step": 1032 }, { "epoch": 2.181434599156118, "grad_norm": 1.4248055219650269, "learning_rate": 4.164679281828482e-06, "loss": 0.8822668194770813, "step": 1034 }, { "epoch": 2.1856540084388185, "grad_norm": 0.9020785689353943, "learning_rate": 4.152148206921995e-06, "loss": 0.8814399838447571, "step": 1036 }, { "epoch": 2.189873417721519, "grad_norm": 1.4970018863677979, "learning_rate": 4.139620148658634e-06, "loss": 0.8485023379325867, "step": 1038 }, { "epoch": 2.1940928270042193, "grad_norm": 1.1914066076278687, "learning_rate": 4.127095259519368e-06, "loss": 1.0057520866394043, "step": 1040 }, { "epoch": 2.19831223628692, "grad_norm": 5.138652324676514, "learning_rate": 4.114573691946591e-06, "loss": 0.26296478509902954, "step": 1042 }, { "epoch": 2.2025316455696204, "grad_norm": 1.1444544792175293, "learning_rate": 4.102055598342269e-06, "loss": 0.8880115747451782, "step": 1044 }, { "epoch": 2.2067510548523206, "grad_norm": 1.740729808807373, "learning_rate": 4.089541131066086e-06, "loss": 0.5347674489021301, "step": 1046 }, { "epoch": 2.210970464135021, "grad_norm": 1.3183239698410034, "learning_rate": 4.077030442433593e-06, "loss": 0.790450930595398, "step": 1048 }, { "epoch": 2.2151898734177213, "grad_norm": 1.1291550397872925, "learning_rate": 4.064523684714344e-06, "loss": 0.8988840579986572, "step": 1050 }, { "epoch": 2.219409282700422, "grad_norm": 2.9497318267822266, "learning_rate": 4.052021010130056e-06, "loss": 0.7755071520805359, "step": 1052 }, { "epoch": 2.2236286919831225, "grad_norm": 2.4455068111419678, "learning_rate": 4.039522570852745e-06, "loss": 0.7849942445755005, "step": 1054 }, { "epoch": 2.2278481012658227, "grad_norm": 0.9835525751113892, "learning_rate": 4.0270285190028794e-06, "loss": 0.7088072896003723, "step": 1056 }, { "epoch": 2.2320675105485233, "grad_norm": 20.216365814208984, "learning_rate": 4.014539006647528e-06, "loss": 0.42411160469055176, "step": 1058 }, { "epoch": 2.2362869198312234, "grad_norm": 0.8427597284317017, "learning_rate": 4.002054185798509e-06, "loss": 0.8620681762695312, "step": 1060 }, { "epoch": 2.240506329113924, "grad_norm": 0.3895626366138458, "learning_rate": 3.98957420841054e-06, "loss": 0.6363852024078369, "step": 1062 }, { "epoch": 2.2447257383966246, "grad_norm": 1.1307460069656372, "learning_rate": 3.977099226379386e-06, "loss": 0.4475446343421936, "step": 1064 }, { "epoch": 2.2489451476793247, "grad_norm": 1.3451250791549683, "learning_rate": 3.9646293915400145e-06, "loss": 0.8441832661628723, "step": 1066 }, { "epoch": 2.2531645569620253, "grad_norm": 1.8237205743789673, "learning_rate": 3.952164855664745e-06, "loss": 1.0592007637023926, "step": 1068 }, { "epoch": 2.257383966244726, "grad_norm": 1.1085244417190552, "learning_rate": 3.939705770461403e-06, "loss": 1.0274057388305664, "step": 1070 }, { "epoch": 2.261603375527426, "grad_norm": 1.4007558822631836, "learning_rate": 3.927252287571472e-06, "loss": 0.8607990145683289, "step": 1072 }, { "epoch": 2.2658227848101267, "grad_norm": 3.7572860717773438, "learning_rate": 3.914804558568251e-06, "loss": 1.1480568647384644, "step": 1074 }, { "epoch": 2.270042194092827, "grad_norm": 0.819203794002533, "learning_rate": 3.902362734955003e-06, "loss": 0.8235105872154236, "step": 1076 }, { "epoch": 2.2742616033755274, "grad_norm": 0.528959333896637, "learning_rate": 3.889926968163123e-06, "loss": 0.5926033854484558, "step": 1078 }, { "epoch": 2.278481012658228, "grad_norm": 1.5626213550567627, "learning_rate": 3.877497409550281e-06, "loss": 0.7218382358551025, "step": 1080 }, { "epoch": 2.282700421940928, "grad_norm": 1.657475233078003, "learning_rate": 3.8650742103985865e-06, "loss": 0.33192554116249084, "step": 1082 }, { "epoch": 2.2869198312236287, "grad_norm": 1.3998394012451172, "learning_rate": 3.852657521912752e-06, "loss": 0.5696985721588135, "step": 1084 }, { "epoch": 2.291139240506329, "grad_norm": 0.8090922832489014, "learning_rate": 3.840247495218242e-06, "loss": 0.4131937325000763, "step": 1086 }, { "epoch": 2.2953586497890295, "grad_norm": 1.96702241897583, "learning_rate": 3.827844281359444e-06, "loss": 0.5371357202529907, "step": 1088 }, { "epoch": 2.29957805907173, "grad_norm": 0.4463783800601959, "learning_rate": 3.815448031297822e-06, "loss": 0.48086562752723694, "step": 1090 }, { "epoch": 2.3037974683544302, "grad_norm": 2.2645716667175293, "learning_rate": 3.8030588959100845e-06, "loss": 0.759406328201294, "step": 1092 }, { "epoch": 2.308016877637131, "grad_norm": 0.9995399117469788, "learning_rate": 3.790677025986345e-06, "loss": 0.5466501116752625, "step": 1094 }, { "epoch": 2.3122362869198314, "grad_norm": 2.6267566680908203, "learning_rate": 3.7783025722282897e-06, "loss": 0.35581734776496887, "step": 1096 }, { "epoch": 2.3164556962025316, "grad_norm": 2.8866639137268066, "learning_rate": 3.765935685247338e-06, "loss": 0.8641759157180786, "step": 1098 }, { "epoch": 2.320675105485232, "grad_norm": 1.3129066228866577, "learning_rate": 3.753576515562816e-06, "loss": 0.7505000233650208, "step": 1100 }, { "epoch": 2.3248945147679323, "grad_norm": 1.0732929706573486, "learning_rate": 3.7412252136001213e-06, "loss": 0.8979564905166626, "step": 1102 }, { "epoch": 2.329113924050633, "grad_norm": 0.7349892854690552, "learning_rate": 3.7288819296888898e-06, "loss": 1.1566518545150757, "step": 1104 }, { "epoch": 2.3333333333333335, "grad_norm": 1.2569828033447266, "learning_rate": 3.716546814061171e-06, "loss": 0.6977556347846985, "step": 1106 }, { "epoch": 2.3375527426160336, "grad_norm": 2.377737522125244, "learning_rate": 3.7042200168495946e-06, "loss": 0.44933831691741943, "step": 1108 }, { "epoch": 2.3417721518987342, "grad_norm": 1.586523413658142, "learning_rate": 3.691901688085548e-06, "loss": 0.8763599395751953, "step": 1110 }, { "epoch": 2.3459915611814344, "grad_norm": 1.1108835935592651, "learning_rate": 3.6795919776973473e-06, "loss": 0.9540433287620544, "step": 1112 }, { "epoch": 2.350210970464135, "grad_norm": 0.9403799176216125, "learning_rate": 3.667291035508411e-06, "loss": 1.034621000289917, "step": 1114 }, { "epoch": 2.3544303797468356, "grad_norm": 0.43867227435112, "learning_rate": 3.65499901123544e-06, "loss": 0.4901280999183655, "step": 1116 }, { "epoch": 2.3586497890295357, "grad_norm": 2.578577995300293, "learning_rate": 3.642716054486595e-06, "loss": 0.7974634170532227, "step": 1118 }, { "epoch": 2.3628691983122363, "grad_norm": 1.1387748718261719, "learning_rate": 3.630442314759671e-06, "loss": 0.5818929672241211, "step": 1120 }, { "epoch": 2.367088607594937, "grad_norm": 2.1316514015197754, "learning_rate": 3.618177941440285e-06, "loss": 0.7703042030334473, "step": 1122 }, { "epoch": 2.371308016877637, "grad_norm": 0.8024750351905823, "learning_rate": 3.605923083800051e-06, "loss": 0.5044012069702148, "step": 1124 }, { "epoch": 2.3755274261603376, "grad_norm": 1.283586025238037, "learning_rate": 3.593677890994768e-06, "loss": 0.663129448890686, "step": 1126 }, { "epoch": 2.379746835443038, "grad_norm": 1.0305255651474, "learning_rate": 3.581442512062602e-06, "loss": 0.8820338249206543, "step": 1128 }, { "epoch": 2.3839662447257384, "grad_norm": 2.735337972640991, "learning_rate": 3.5692170959222735e-06, "loss": 0.42376741766929626, "step": 1130 }, { "epoch": 2.388185654008439, "grad_norm": 4.083228588104248, "learning_rate": 3.5570017913712438e-06, "loss": 0.3104958236217499, "step": 1132 }, { "epoch": 2.392405063291139, "grad_norm": 1.5016670227050781, "learning_rate": 3.5447967470839038e-06, "loss": 0.34900638461112976, "step": 1134 }, { "epoch": 2.3966244725738397, "grad_norm": 1.8445940017700195, "learning_rate": 3.5326021116097655e-06, "loss": 0.5472123026847839, "step": 1136 }, { "epoch": 2.40084388185654, "grad_norm": 1.0861736536026, "learning_rate": 3.520418033371655e-06, "loss": 0.9556151628494263, "step": 1138 }, { "epoch": 2.4050632911392404, "grad_norm": 3.4282290935516357, "learning_rate": 3.5082446606639014e-06, "loss": 0.7003535032272339, "step": 1140 }, { "epoch": 2.409282700421941, "grad_norm": 3.9306039810180664, "learning_rate": 3.4960821416505406e-06, "loss": 0.24855707585811615, "step": 1142 }, { "epoch": 2.413502109704641, "grad_norm": 0.9033668041229248, "learning_rate": 3.4839306243635003e-06, "loss": 0.7160732746124268, "step": 1144 }, { "epoch": 2.4177215189873418, "grad_norm": 1.116635799407959, "learning_rate": 3.4717902567008086e-06, "loss": 0.9801563620567322, "step": 1146 }, { "epoch": 2.4219409282700424, "grad_norm": 4.026218891143799, "learning_rate": 3.459661186424787e-06, "loss": 0.7096956968307495, "step": 1148 }, { "epoch": 2.4261603375527425, "grad_norm": 1.311513066291809, "learning_rate": 3.447543561160258e-06, "loss": 0.9519820809364319, "step": 1150 }, { "epoch": 2.430379746835443, "grad_norm": 3.234283208847046, "learning_rate": 3.435437528392741e-06, "loss": 0.6188116073608398, "step": 1152 }, { "epoch": 2.4345991561181437, "grad_norm": 0.9476786851882935, "learning_rate": 3.4233432354666666e-06, "loss": 1.0032005310058594, "step": 1154 }, { "epoch": 2.438818565400844, "grad_norm": 1.0260239839553833, "learning_rate": 3.4112608295835718e-06, "loss": 0.3281160891056061, "step": 1156 }, { "epoch": 2.4430379746835444, "grad_norm": 0.7956026196479797, "learning_rate": 3.3991904578003182e-06, "loss": 0.627183735370636, "step": 1158 }, { "epoch": 2.4472573839662446, "grad_norm": 0.9774817824363708, "learning_rate": 3.3871322670273e-06, "loss": 0.9342701435089111, "step": 1160 }, { "epoch": 2.451476793248945, "grad_norm": 1.73080313205719, "learning_rate": 3.3750864040266497e-06, "loss": 0.5555570721626282, "step": 1162 }, { "epoch": 2.4556962025316453, "grad_norm": 1.2167036533355713, "learning_rate": 3.3630530154104603e-06, "loss": 0.8571757674217224, "step": 1164 }, { "epoch": 2.459915611814346, "grad_norm": 0.8792468905448914, "learning_rate": 3.3510322476389953e-06, "loss": 0.8499954342842102, "step": 1166 }, { "epoch": 2.4641350210970465, "grad_norm": 0.3647661805152893, "learning_rate": 3.33902424701891e-06, "loss": 0.4817237854003906, "step": 1168 }, { "epoch": 2.4683544303797467, "grad_norm": 1.5427345037460327, "learning_rate": 3.327029159701465e-06, "loss": 0.8259966373443604, "step": 1170 }, { "epoch": 2.4725738396624473, "grad_norm": 0.9573671221733093, "learning_rate": 3.315047131680755e-06, "loss": 0.9262470006942749, "step": 1172 }, { "epoch": 2.476793248945148, "grad_norm": 0.8954631686210632, "learning_rate": 3.3030783087919253e-06, "loss": 0.8667972087860107, "step": 1174 }, { "epoch": 2.481012658227848, "grad_norm": 0.998231828212738, "learning_rate": 3.291122836709402e-06, "loss": 0.6898888349533081, "step": 1176 }, { "epoch": 2.4852320675105486, "grad_norm": 3.1478688716888428, "learning_rate": 3.2791808609451125e-06, "loss": 0.3274869918823242, "step": 1178 }, { "epoch": 2.489451476793249, "grad_norm": 11.714877128601074, "learning_rate": 3.2672525268467225e-06, "loss": 0.6489510536193848, "step": 1180 }, { "epoch": 2.4936708860759493, "grad_norm": 1.9469349384307861, "learning_rate": 3.2553379795958604e-06, "loss": 0.6815069913864136, "step": 1182 }, { "epoch": 2.49789029535865, "grad_norm": 2.3261117935180664, "learning_rate": 3.2434373642063522e-06, "loss": 0.3795571029186249, "step": 1184 }, { "epoch": 2.50210970464135, "grad_norm": 2.7311949729919434, "learning_rate": 3.2315508255224613e-06, "loss": 0.3261902630329132, "step": 1186 }, { "epoch": 2.5063291139240507, "grad_norm": 2.2631030082702637, "learning_rate": 3.2196785082171147e-06, "loss": 0.5865919589996338, "step": 1188 }, { "epoch": 2.510548523206751, "grad_norm": 0.8359600305557251, "learning_rate": 3.207820556790155e-06, "loss": 0.8902769088745117, "step": 1190 }, { "epoch": 2.5147679324894514, "grad_norm": 2.3550963401794434, "learning_rate": 3.1959771155665715e-06, "loss": 0.4082001745700836, "step": 1192 }, { "epoch": 2.518987341772152, "grad_norm": 4.461960315704346, "learning_rate": 3.184148328694748e-06, "loss": 1.1846554279327393, "step": 1194 }, { "epoch": 2.523206751054852, "grad_norm": 1.4942057132720947, "learning_rate": 3.1723343401447107e-06, "loss": 0.9881184697151184, "step": 1196 }, { "epoch": 2.5274261603375527, "grad_norm": 2.0736021995544434, "learning_rate": 3.160535293706369e-06, "loss": 0.9017194509506226, "step": 1198 }, { "epoch": 2.5316455696202533, "grad_norm": 3.7537925243377686, "learning_rate": 3.148751332987772e-06, "loss": 0.5090019106864929, "step": 1200 }, { "epoch": 2.5358649789029535, "grad_norm": 1.3264377117156982, "learning_rate": 3.1369826014133594e-06, "loss": 0.67947918176651, "step": 1202 }, { "epoch": 2.540084388185654, "grad_norm": 3.953713893890381, "learning_rate": 3.125229242222211e-06, "loss": 0.5951077342033386, "step": 1204 }, { "epoch": 2.5443037974683547, "grad_norm": 0.990692675113678, "learning_rate": 3.1134913984663093e-06, "loss": 0.8030409812927246, "step": 1206 }, { "epoch": 2.548523206751055, "grad_norm": 3.0001838207244873, "learning_rate": 3.101769213008796e-06, "loss": 0.6891695261001587, "step": 1208 }, { "epoch": 2.5527426160337554, "grad_norm": 1.335438847541809, "learning_rate": 3.0900628285222307e-06, "loss": 0.9814665913581848, "step": 1210 }, { "epoch": 2.5569620253164556, "grad_norm": 1.2493577003479004, "learning_rate": 3.078372387486861e-06, "loss": 0.9131478667259216, "step": 1212 }, { "epoch": 2.561181434599156, "grad_norm": 2.756460428237915, "learning_rate": 3.0666980321888823e-06, "loss": 0.27317380905151367, "step": 1214 }, { "epoch": 2.5654008438818563, "grad_norm": 3.6866559982299805, "learning_rate": 3.055039904718706e-06, "loss": 0.6986894011497498, "step": 1216 }, { "epoch": 2.569620253164557, "grad_norm": 0.7736930847167969, "learning_rate": 3.0433981469692346e-06, "loss": 0.8533654808998108, "step": 1218 }, { "epoch": 2.5738396624472575, "grad_norm": 6.2710161209106445, "learning_rate": 3.0317729006341315e-06, "loss": 0.5412061214447021, "step": 1220 }, { "epoch": 2.5780590717299576, "grad_norm": 2.4914796352386475, "learning_rate": 3.0201643072060964e-06, "loss": 0.7507292628288269, "step": 1222 }, { "epoch": 2.5822784810126582, "grad_norm": 4.1669840812683105, "learning_rate": 3.0085725079751465e-06, "loss": 0.599193274974823, "step": 1224 }, { "epoch": 2.586497890295359, "grad_norm": 1.4165141582489014, "learning_rate": 2.996997644026889e-06, "loss": 0.542171835899353, "step": 1226 }, { "epoch": 2.590717299578059, "grad_norm": 1.2593107223510742, "learning_rate": 2.9854398562408144e-06, "loss": 0.8244262933731079, "step": 1228 }, { "epoch": 2.5949367088607596, "grad_norm": 1.6781362295150757, "learning_rate": 2.9738992852885742e-06, "loss": 1.0771939754486084, "step": 1230 }, { "epoch": 2.59915611814346, "grad_norm": 1.0754374265670776, "learning_rate": 2.9623760716322706e-06, "loss": 0.7803739309310913, "step": 1232 }, { "epoch": 2.6033755274261603, "grad_norm": 4.246564865112305, "learning_rate": 2.950870355522748e-06, "loss": 0.2662976384162903, "step": 1234 }, { "epoch": 2.607594936708861, "grad_norm": 1.650658369064331, "learning_rate": 2.939382276997886e-06, "loss": 0.9140543937683105, "step": 1236 }, { "epoch": 2.611814345991561, "grad_norm": 5.929245471954346, "learning_rate": 2.9279119758808942e-06, "loss": 1.1032469272613525, "step": 1238 }, { "epoch": 2.6160337552742616, "grad_norm": 1.0307083129882812, "learning_rate": 2.9164595917786088e-06, "loss": 0.6352362632751465, "step": 1240 }, { "epoch": 2.620253164556962, "grad_norm": 1.3630961179733276, "learning_rate": 2.905025264079799e-06, "loss": 0.8276194334030151, "step": 1242 }, { "epoch": 2.6244725738396624, "grad_norm": 2.032569408416748, "learning_rate": 2.8936091319534617e-06, "loss": 0.4083612859249115, "step": 1244 }, { "epoch": 2.628691983122363, "grad_norm": 0.8530462384223938, "learning_rate": 2.8822113343471365e-06, "loss": 0.6202731132507324, "step": 1246 }, { "epoch": 2.632911392405063, "grad_norm": 1.9822677373886108, "learning_rate": 2.8708320099852108e-06, "loss": 1.1646617650985718, "step": 1248 }, { "epoch": 2.6371308016877637, "grad_norm": 0.7690547108650208, "learning_rate": 2.8594712973672276e-06, "loss": 0.8482010364532471, "step": 1250 }, { "epoch": 2.6413502109704643, "grad_norm": 8.547155380249023, "learning_rate": 2.8481293347662067e-06, "loss": 0.904060959815979, "step": 1252 }, { "epoch": 2.6455696202531644, "grad_norm": 2.017336368560791, "learning_rate": 2.8368062602269573e-06, "loss": 0.3393191993236542, "step": 1254 }, { "epoch": 2.649789029535865, "grad_norm": 1.945145845413208, "learning_rate": 2.8255022115644017e-06, "loss": 0.39150819182395935, "step": 1256 }, { "epoch": 2.6540084388185656, "grad_norm": 1.3301414251327515, "learning_rate": 2.8142173263618877e-06, "loss": 0.7564312815666199, "step": 1258 }, { "epoch": 2.6582278481012658, "grad_norm": 0.9791122078895569, "learning_rate": 2.8029517419695303e-06, "loss": 0.8787249326705933, "step": 1260 }, { "epoch": 2.6624472573839664, "grad_norm": 1.0031580924987793, "learning_rate": 2.7917055955025285e-06, "loss": 0.8559532165527344, "step": 1262 }, { "epoch": 2.6666666666666665, "grad_norm": 0.7568211555480957, "learning_rate": 2.7804790238394958e-06, "loss": 0.5114046931266785, "step": 1264 }, { "epoch": 2.670886075949367, "grad_norm": 1.7229481935501099, "learning_rate": 2.7692721636208013e-06, "loss": 0.8251296281814575, "step": 1266 }, { "epoch": 2.6751054852320673, "grad_norm": 0.9991238713264465, "learning_rate": 2.7580851512469024e-06, "loss": 0.6419144868850708, "step": 1268 }, { "epoch": 2.679324894514768, "grad_norm": 1.1213876008987427, "learning_rate": 2.746918122876686e-06, "loss": 0.36948972940444946, "step": 1270 }, { "epoch": 2.6835443037974684, "grad_norm": 1.1551014184951782, "learning_rate": 2.7357712144258074e-06, "loss": 0.8657974004745483, "step": 1272 }, { "epoch": 2.6877637130801686, "grad_norm": 7.327043533325195, "learning_rate": 2.724644561565042e-06, "loss": 0.6017997860908508, "step": 1274 }, { "epoch": 2.691983122362869, "grad_norm": 3.296600818634033, "learning_rate": 2.713538299718631e-06, "loss": 0.6844916343688965, "step": 1276 }, { "epoch": 2.6962025316455698, "grad_norm": 0.31361812353134155, "learning_rate": 2.702452564062635e-06, "loss": 0.2726902365684509, "step": 1278 }, { "epoch": 2.70042194092827, "grad_norm": 1.6500128507614136, "learning_rate": 2.69138748952328e-06, "loss": 0.8048746585845947, "step": 1280 }, { "epoch": 2.7046413502109705, "grad_norm": 1.1757248640060425, "learning_rate": 2.680343210775331e-06, "loss": 0.9176240563392639, "step": 1282 }, { "epoch": 2.708860759493671, "grad_norm": 2.345834493637085, "learning_rate": 2.6693198622404403e-06, "loss": 0.4069772958755493, "step": 1284 }, { "epoch": 2.7130801687763713, "grad_norm": 5.173031330108643, "learning_rate": 2.658317578085514e-06, "loss": 0.4281209409236908, "step": 1286 }, { "epoch": 2.717299578059072, "grad_norm": 0.6406076550483704, "learning_rate": 2.647336492221082e-06, "loss": 0.4584686756134033, "step": 1288 }, { "epoch": 2.721518987341772, "grad_norm": 0.30545204877853394, "learning_rate": 2.636376738299666e-06, "loss": 0.7299985289573669, "step": 1290 }, { "epoch": 2.7257383966244726, "grad_norm": 2.3275787830352783, "learning_rate": 2.6254384497141563e-06, "loss": 0.8682552576065063, "step": 1292 }, { "epoch": 2.7299578059071727, "grad_norm": 1.1502134799957275, "learning_rate": 2.6145217595961786e-06, "loss": 0.36897793412208557, "step": 1294 }, { "epoch": 2.7341772151898733, "grad_norm": 0.9601994752883911, "learning_rate": 2.603626800814486e-06, "loss": 0.8473520278930664, "step": 1296 }, { "epoch": 2.738396624472574, "grad_norm": 0.9873552322387695, "learning_rate": 2.5927537059733337e-06, "loss": 0.9228261113166809, "step": 1298 }, { "epoch": 2.742616033755274, "grad_norm": 0.5264573097229004, "learning_rate": 2.5819026074108695e-06, "loss": 0.6119830012321472, "step": 1300 }, { "epoch": 2.7468354430379747, "grad_norm": 0.9602957963943481, "learning_rate": 2.5710736371975165e-06, "loss": 0.9762548208236694, "step": 1302 }, { "epoch": 2.7510548523206753, "grad_norm": 0.9380753040313721, "learning_rate": 2.560266927134375e-06, "loss": 0.5131715536117554, "step": 1304 }, { "epoch": 2.7552742616033754, "grad_norm": 1.438719630241394, "learning_rate": 2.549482608751613e-06, "loss": 1.091052532196045, "step": 1306 }, { "epoch": 2.759493670886076, "grad_norm": 1.7355360984802246, "learning_rate": 2.5387208133068613e-06, "loss": 0.9066473245620728, "step": 1308 }, { "epoch": 2.7637130801687766, "grad_norm": 2.98097825050354, "learning_rate": 2.5279816717836256e-06, "loss": 0.7622301578521729, "step": 1310 }, { "epoch": 2.7679324894514767, "grad_norm": 0.885686993598938, "learning_rate": 2.5172653148896842e-06, "loss": 0.9722012877464294, "step": 1312 }, { "epoch": 2.7721518987341773, "grad_norm": 1.3240593671798706, "learning_rate": 2.5065718730555033e-06, "loss": 0.9415172338485718, "step": 1314 }, { "epoch": 2.7763713080168775, "grad_norm": 1.9628123044967651, "learning_rate": 2.4959014764326415e-06, "loss": 0.6243242025375366, "step": 1316 }, { "epoch": 2.780590717299578, "grad_norm": 3.583494186401367, "learning_rate": 2.4852542548921747e-06, "loss": 0.4649869501590729, "step": 1318 }, { "epoch": 2.7848101265822782, "grad_norm": 0.94072425365448, "learning_rate": 2.4746303380231085e-06, "loss": 0.9694103002548218, "step": 1320 }, { "epoch": 2.789029535864979, "grad_norm": 0.6174410581588745, "learning_rate": 2.4640298551308073e-06, "loss": 0.5571610331535339, "step": 1322 }, { "epoch": 2.7932489451476794, "grad_norm": 2.0068700313568115, "learning_rate": 2.453452935235412e-06, "loss": 1.0208598375320435, "step": 1324 }, { "epoch": 2.7974683544303796, "grad_norm": 1.8920451402664185, "learning_rate": 2.442899707070277e-06, "loss": 0.7713922262191772, "step": 1326 }, { "epoch": 2.80168776371308, "grad_norm": 0.9682056903839111, "learning_rate": 2.432370299080402e-06, "loss": 0.5502282977104187, "step": 1328 }, { "epoch": 2.8059071729957807, "grad_norm": 0.9725003838539124, "learning_rate": 2.4218648394208675e-06, "loss": 0.8966948986053467, "step": 1330 }, { "epoch": 2.810126582278481, "grad_norm": 1.1623132228851318, "learning_rate": 2.4113834559552725e-06, "loss": 0.7290566563606262, "step": 1332 }, { "epoch": 2.8143459915611815, "grad_norm": 1.2533057928085327, "learning_rate": 2.4009262762541812e-06, "loss": 0.4873872697353363, "step": 1334 }, { "epoch": 2.818565400843882, "grad_norm": 0.42495617270469666, "learning_rate": 2.3904934275935742e-06, "loss": 0.6868776082992554, "step": 1336 }, { "epoch": 2.8227848101265822, "grad_norm": 1.3464299440383911, "learning_rate": 2.3800850369532913e-06, "loss": 0.792182207107544, "step": 1338 }, { "epoch": 2.827004219409283, "grad_norm": 1.2492246627807617, "learning_rate": 2.3697012310154895e-06, "loss": 0.8120459318161011, "step": 1340 }, { "epoch": 2.831223628691983, "grad_norm": 1.79072105884552, "learning_rate": 2.3593421361631063e-06, "loss": 0.8677684664726257, "step": 1342 }, { "epoch": 2.8354430379746836, "grad_norm": 1.2441151142120361, "learning_rate": 2.3490078784783088e-06, "loss": 1.0221854448318481, "step": 1344 }, { "epoch": 2.8396624472573837, "grad_norm": 2.060967206954956, "learning_rate": 2.3386985837409736e-06, "loss": 0.6457461714744568, "step": 1346 }, { "epoch": 2.8438818565400843, "grad_norm": 0.8780367970466614, "learning_rate": 2.328414377427148e-06, "loss": 0.514173686504364, "step": 1348 }, { "epoch": 2.848101265822785, "grad_norm": 0.9615793228149414, "learning_rate": 2.318155384707524e-06, "loss": 0.9813417792320251, "step": 1350 }, { "epoch": 2.852320675105485, "grad_norm": 0.7979256510734558, "learning_rate": 2.3079217304459114e-06, "loss": 0.6034799218177795, "step": 1352 }, { "epoch": 2.8565400843881856, "grad_norm": 2.0170516967773438, "learning_rate": 2.2977135391977264e-06, "loss": 0.6767147779464722, "step": 1354 }, { "epoch": 2.8607594936708862, "grad_norm": 2.4936254024505615, "learning_rate": 2.287530935208469e-06, "loss": 0.5042116045951843, "step": 1356 }, { "epoch": 2.8649789029535864, "grad_norm": 1.2325421571731567, "learning_rate": 2.277374042412214e-06, "loss": 0.9337244033813477, "step": 1358 }, { "epoch": 2.869198312236287, "grad_norm": 2.9698169231414795, "learning_rate": 2.2672429844300972e-06, "loss": 0.7304012179374695, "step": 1360 }, { "epoch": 2.8734177215189876, "grad_norm": 1.5197981595993042, "learning_rate": 2.257137884568819e-06, "loss": 0.5767084956169128, "step": 1362 }, { "epoch": 2.8776371308016877, "grad_norm": 2.374297857284546, "learning_rate": 2.24705886581914e-06, "loss": 0.9020572304725647, "step": 1364 }, { "epoch": 2.8818565400843883, "grad_norm": 1.3976613283157349, "learning_rate": 2.237006050854378e-06, "loss": 0.8876560926437378, "step": 1366 }, { "epoch": 2.8860759493670884, "grad_norm": 1.1186343431472778, "learning_rate": 2.2269795620289255e-06, "loss": 0.9599936008453369, "step": 1368 }, { "epoch": 2.890295358649789, "grad_norm": 2.704097270965576, "learning_rate": 2.2169795213767533e-06, "loss": 0.8696321249008179, "step": 1370 }, { "epoch": 2.894514767932489, "grad_norm": 7.440235614776611, "learning_rate": 2.207006050609931e-06, "loss": 0.3180171847343445, "step": 1372 }, { "epoch": 2.8987341772151898, "grad_norm": 0.950478196144104, "learning_rate": 2.1970592711171343e-06, "loss": 0.6180795431137085, "step": 1374 }, { "epoch": 2.9029535864978904, "grad_norm": 1.206428050994873, "learning_rate": 2.1871393039621813e-06, "loss": 0.8911280035972595, "step": 1376 }, { "epoch": 2.9071729957805905, "grad_norm": 3.0545897483825684, "learning_rate": 2.177246269882552e-06, "loss": 0.752612292766571, "step": 1378 }, { "epoch": 2.911392405063291, "grad_norm": 1.6597026586532593, "learning_rate": 2.1673802892879202e-06, "loss": 1.0073306560516357, "step": 1380 }, { "epoch": 2.9156118143459917, "grad_norm": 2.8480212688446045, "learning_rate": 2.1575414822586834e-06, "loss": 0.49533841013908386, "step": 1382 }, { "epoch": 2.919831223628692, "grad_norm": 2.9914588928222656, "learning_rate": 2.1477299685445093e-06, "loss": 0.6439518332481384, "step": 1384 }, { "epoch": 2.9240506329113924, "grad_norm": 1.6400901079177856, "learning_rate": 2.1379458675628758e-06, "loss": 0.5329881906509399, "step": 1386 }, { "epoch": 2.928270042194093, "grad_norm": 0.9584951400756836, "learning_rate": 2.128189298397611e-06, "loss": 0.9460800290107727, "step": 1388 }, { "epoch": 2.932489451476793, "grad_norm": 1.2493575811386108, "learning_rate": 2.118460379797452e-06, "loss": 0.7834473848342896, "step": 1390 }, { "epoch": 2.9367088607594938, "grad_norm": 1.484129548072815, "learning_rate": 2.1087592301745965e-06, "loss": 0.4930620491504669, "step": 1392 }, { "epoch": 2.9409282700421944, "grad_norm": 1.0145891904830933, "learning_rate": 2.0990859676032623e-06, "loss": 0.4643522799015045, "step": 1394 }, { "epoch": 2.9451476793248945, "grad_norm": 0.9809361696243286, "learning_rate": 2.0894407098182474e-06, "loss": 0.8622637987136841, "step": 1396 }, { "epoch": 2.9493670886075947, "grad_norm": 3.8030622005462646, "learning_rate": 2.0798235742134995e-06, "loss": 0.6468316316604614, "step": 1398 }, { "epoch": 2.9535864978902953, "grad_norm": 3.291412830352783, "learning_rate": 2.0702346778406887e-06, "loss": 0.871576726436615, "step": 1400 }, { "epoch": 2.957805907172996, "grad_norm": 2.847675085067749, "learning_rate": 2.0606741374077804e-06, "loss": 0.6290037631988525, "step": 1402 }, { "epoch": 2.962025316455696, "grad_norm": 0.8518403172492981, "learning_rate": 2.0511420692776135e-06, "loss": 0.8591277003288269, "step": 1404 }, { "epoch": 2.9662447257383966, "grad_norm": 2.023810386657715, "learning_rate": 2.041638589466487e-06, "loss": 0.8211725354194641, "step": 1406 }, { "epoch": 2.970464135021097, "grad_norm": 2.9551258087158203, "learning_rate": 2.0321638136427495e-06, "loss": 0.46553725004196167, "step": 1408 }, { "epoch": 2.9746835443037973, "grad_norm": 3.8522558212280273, "learning_rate": 2.0227178571253846e-06, "loss": 0.7728868126869202, "step": 1410 }, { "epoch": 2.978902953586498, "grad_norm": 0.8442367911338806, "learning_rate": 2.013300834882615e-06, "loss": 0.9526476860046387, "step": 1412 }, { "epoch": 2.9831223628691985, "grad_norm": 2.8707711696624756, "learning_rate": 2.0039128615304967e-06, "loss": 0.6912641525268555, "step": 1414 }, { "epoch": 2.9873417721518987, "grad_norm": 0.9124540686607361, "learning_rate": 1.994554051331532e-06, "loss": 0.7677329778671265, "step": 1416 }, { "epoch": 2.9915611814345993, "grad_norm": 0.7803240418434143, "learning_rate": 1.9852245181932674e-06, "loss": 0.8512239456176758, "step": 1418 }, { "epoch": 2.9957805907173, "grad_norm": 3.4592530727386475, "learning_rate": 1.975924375666918e-06, "loss": 0.8197758197784424, "step": 1420 }, { "epoch": 3.0, "grad_norm": 1.0075371265411377, "learning_rate": 1.9666537369459813e-06, "loss": 0.26588016748428345, "step": 1422 }, { "epoch": 3.0042194092827006, "grad_norm": 0.8261951208114624, "learning_rate": 1.9574127148648586e-06, "loss": 0.4992481768131256, "step": 1424 }, { "epoch": 3.0084388185654007, "grad_norm": 1.8350886106491089, "learning_rate": 1.94820142189748e-06, "loss": 0.4615590572357178, "step": 1426 }, { "epoch": 3.0126582278481013, "grad_norm": 7.030728816986084, "learning_rate": 1.9390199701559407e-06, "loss": 0.5607567429542542, "step": 1428 }, { "epoch": 3.0168776371308015, "grad_norm": 1.843036413192749, "learning_rate": 1.929868471389133e-06, "loss": 0.1959325075149536, "step": 1430 }, { "epoch": 3.021097046413502, "grad_norm": 1.2027599811553955, "learning_rate": 1.920747036981388e-06, "loss": 0.8035475611686707, "step": 1432 }, { "epoch": 3.0253164556962027, "grad_norm": 1.0378309488296509, "learning_rate": 1.9116557779511153e-06, "loss": 0.7113970518112183, "step": 1434 }, { "epoch": 3.029535864978903, "grad_norm": 1.079108715057373, "learning_rate": 1.9025948049494587e-06, "loss": 0.8759698271751404, "step": 1436 }, { "epoch": 3.0337552742616034, "grad_norm": 1.387281060218811, "learning_rate": 1.8935642282589452e-06, "loss": 0.4212711453437805, "step": 1438 }, { "epoch": 3.037974683544304, "grad_norm": 1.6048085689544678, "learning_rate": 1.884564157792141e-06, "loss": 0.7371959090232849, "step": 1440 }, { "epoch": 3.042194092827004, "grad_norm": 0.33521798253059387, "learning_rate": 1.87559470309032e-06, "loss": 0.5267896056175232, "step": 1442 }, { "epoch": 3.0464135021097047, "grad_norm": 1.3722892999649048, "learning_rate": 1.8666559733221244e-06, "loss": 0.657349169254303, "step": 1444 }, { "epoch": 3.050632911392405, "grad_norm": 1.0858877897262573, "learning_rate": 1.8577480772822405e-06, "loss": 0.8311367034912109, "step": 1446 }, { "epoch": 3.0548523206751055, "grad_norm": 5.020367622375488, "learning_rate": 1.8488711233900686e-06, "loss": 0.5246130228042603, "step": 1448 }, { "epoch": 3.059071729957806, "grad_norm": 3.7570173740386963, "learning_rate": 1.8400252196884106e-06, "loss": 0.6080931425094604, "step": 1450 }, { "epoch": 3.0632911392405062, "grad_norm": 1.1105659008026123, "learning_rate": 1.8312104738421518e-06, "loss": 0.8224632740020752, "step": 1452 }, { "epoch": 3.067510548523207, "grad_norm": 3.6815249919891357, "learning_rate": 1.8224269931369494e-06, "loss": 0.6160001158714294, "step": 1454 }, { "epoch": 3.071729957805907, "grad_norm": 9.295499801635742, "learning_rate": 1.8136748844779257e-06, "loss": 0.49316591024398804, "step": 1456 }, { "epoch": 3.0759493670886076, "grad_norm": 4.4355974197387695, "learning_rate": 1.8049542543883718e-06, "loss": 0.6495121121406555, "step": 1458 }, { "epoch": 3.080168776371308, "grad_norm": 2.505272626876831, "learning_rate": 1.7962652090084483e-06, "loss": 0.4862138032913208, "step": 1460 }, { "epoch": 3.0843881856540083, "grad_norm": 0.9544802904129028, "learning_rate": 1.7876078540938897e-06, "loss": 0.7817291021347046, "step": 1462 }, { "epoch": 3.088607594936709, "grad_norm": 0.9137688875198364, "learning_rate": 1.778982295014725e-06, "loss": 0.7803807258605957, "step": 1464 }, { "epoch": 3.0928270042194095, "grad_norm": 0.9232447743415833, "learning_rate": 1.7703886367539886e-06, "loss": 0.7208024859428406, "step": 1466 }, { "epoch": 3.0970464135021096, "grad_norm": 2.5386898517608643, "learning_rate": 1.7618269839064476e-06, "loss": 0.535610556602478, "step": 1468 }, { "epoch": 3.1012658227848102, "grad_norm": 2.476505756378174, "learning_rate": 1.7532974406773215e-06, "loss": 0.11650805175304413, "step": 1470 }, { "epoch": 3.1054852320675104, "grad_norm": 3.4205284118652344, "learning_rate": 1.744800110881024e-06, "loss": 0.9236214756965637, "step": 1472 }, { "epoch": 3.109704641350211, "grad_norm": 0.38351741433143616, "learning_rate": 1.7363350979398904e-06, "loss": 0.3822326362133026, "step": 1474 }, { "epoch": 3.1139240506329116, "grad_norm": 1.7231391668319702, "learning_rate": 1.7279025048829247e-06, "loss": 0.8056196570396423, "step": 1476 }, { "epoch": 3.1181434599156117, "grad_norm": 1.3952598571777344, "learning_rate": 1.7195024343445406e-06, "loss": 0.8253889679908752, "step": 1478 }, { "epoch": 3.1223628691983123, "grad_norm": 1.235793113708496, "learning_rate": 1.711134988563318e-06, "loss": 0.7869700193405151, "step": 1480 }, { "epoch": 3.1265822784810124, "grad_norm": 1.5086437463760376, "learning_rate": 1.7028002693807553e-06, "loss": 0.74970543384552, "step": 1482 }, { "epoch": 3.130801687763713, "grad_norm": 1.1958047151565552, "learning_rate": 1.694498378240028e-06, "loss": 0.7713515758514404, "step": 1484 }, { "epoch": 3.1350210970464136, "grad_norm": 0.9930305480957031, "learning_rate": 1.6862294161847582e-06, "loss": 0.4803518056869507, "step": 1486 }, { "epoch": 3.1392405063291138, "grad_norm": 1.338038444519043, "learning_rate": 1.6779934838577833e-06, "loss": 0.4478246569633484, "step": 1488 }, { "epoch": 3.1434599156118144, "grad_norm": 1.8812412023544312, "learning_rate": 1.6697906814999316e-06, "loss": 0.8487708568572998, "step": 1490 }, { "epoch": 3.147679324894515, "grad_norm": 1.079730749130249, "learning_rate": 1.6616211089487968e-06, "loss": 0.4909372329711914, "step": 1492 }, { "epoch": 3.151898734177215, "grad_norm": 3.950795888900757, "learning_rate": 1.653484865637532e-06, "loss": 0.6456606388092041, "step": 1494 }, { "epoch": 3.1561181434599157, "grad_norm": 0.8888868093490601, "learning_rate": 1.645382050593633e-06, "loss": 0.5738848447799683, "step": 1496 }, { "epoch": 3.160337552742616, "grad_norm": 0.8062717318534851, "learning_rate": 1.6373127624377361e-06, "loss": 0.3924991488456726, "step": 1498 }, { "epoch": 3.1645569620253164, "grad_norm": 1.1965993642807007, "learning_rate": 1.6292770993824138e-06, "loss": 0.4241105318069458, "step": 1500 }, { "epoch": 3.168776371308017, "grad_norm": 1.7078224420547485, "learning_rate": 1.621275159230986e-06, "loss": 0.7920833230018616, "step": 1502 }, { "epoch": 3.172995780590717, "grad_norm": 3.2493438720703125, "learning_rate": 1.6133070393763222e-06, "loss": 0.7387109994888306, "step": 1504 }, { "epoch": 3.1772151898734178, "grad_norm": 1.1433643102645874, "learning_rate": 1.605372836799664e-06, "loss": 0.8177753686904907, "step": 1506 }, { "epoch": 3.181434599156118, "grad_norm": 1.1686694622039795, "learning_rate": 1.5974726480694356e-06, "loss": 0.810562014579773, "step": 1508 }, { "epoch": 3.1856540084388185, "grad_norm": 1.6440011262893677, "learning_rate": 1.589606569340076e-06, "loss": 0.8004451394081116, "step": 1510 }, { "epoch": 3.189873417721519, "grad_norm": 3.572957754135132, "learning_rate": 1.5817746963508675e-06, "loss": 0.19780634343624115, "step": 1512 }, { "epoch": 3.1940928270042193, "grad_norm": 1.8729281425476074, "learning_rate": 1.5739771244247647e-06, "loss": 0.8508098721504211, "step": 1514 }, { "epoch": 3.19831223628692, "grad_norm": 0.22832605242729187, "learning_rate": 1.5662139484672423e-06, "loss": 0.5102086663246155, "step": 1516 }, { "epoch": 3.2025316455696204, "grad_norm": 1.493944764137268, "learning_rate": 1.558485262965135e-06, "loss": 0.8561201691627502, "step": 1518 }, { "epoch": 3.2067510548523206, "grad_norm": 2.02929949760437, "learning_rate": 1.55079116198549e-06, "loss": 0.7038779258728027, "step": 1520 }, { "epoch": 3.210970464135021, "grad_norm": 2.459091901779175, "learning_rate": 1.5431317391744167e-06, "loss": 0.2252277433872223, "step": 1522 }, { "epoch": 3.2151898734177213, "grad_norm": 2.103160858154297, "learning_rate": 1.535507087755956e-06, "loss": 0.548999011516571, "step": 1524 }, { "epoch": 3.219409282700422, "grad_norm": 1.064772129058838, "learning_rate": 1.527917300530938e-06, "loss": 0.7090752124786377, "step": 1526 }, { "epoch": 3.2236286919831225, "grad_norm": 0.5920833945274353, "learning_rate": 1.5203624698758573e-06, "loss": 0.28943130373954773, "step": 1528 }, { "epoch": 3.2278481012658227, "grad_norm": 2.5098395347595215, "learning_rate": 1.5128426877417428e-06, "loss": 0.5822982788085938, "step": 1530 }, { "epoch": 3.2320675105485233, "grad_norm": 0.6460347175598145, "learning_rate": 1.5053580456530459e-06, "loss": 0.15637226402759552, "step": 1532 }, { "epoch": 3.2362869198312234, "grad_norm": 1.804608702659607, "learning_rate": 1.4979086347065225e-06, "loss": 0.7296754121780396, "step": 1534 }, { "epoch": 3.240506329113924, "grad_norm": 1.5082496404647827, "learning_rate": 1.4904945455701232e-06, "loss": 0.7508465647697449, "step": 1536 }, { "epoch": 3.2447257383966246, "grad_norm": 1.1056941747665405, "learning_rate": 1.4831158684818917e-06, "loss": 0.6265556812286377, "step": 1538 }, { "epoch": 3.2489451476793247, "grad_norm": 2.1995933055877686, "learning_rate": 1.4757726932488672e-06, "loss": 0.5779432058334351, "step": 1540 }, { "epoch": 3.2531645569620253, "grad_norm": 2.594663619995117, "learning_rate": 1.4684651092459906e-06, "loss": 0.4649961590766907, "step": 1542 }, { "epoch": 3.257383966244726, "grad_norm": 2.5885109901428223, "learning_rate": 1.4611932054150132e-06, "loss": 0.5126054883003235, "step": 1544 }, { "epoch": 3.261603375527426, "grad_norm": 2.8481526374816895, "learning_rate": 1.4539570702634208e-06, "loss": 0.49317800998687744, "step": 1546 }, { "epoch": 3.2658227848101267, "grad_norm": 1.6855295896530151, "learning_rate": 1.446756791863351e-06, "loss": 0.6522631049156189, "step": 1548 }, { "epoch": 3.270042194092827, "grad_norm": 2.981158971786499, "learning_rate": 1.4395924578505253e-06, "loss": 0.20762769877910614, "step": 1550 }, { "epoch": 3.2742616033755274, "grad_norm": 0.8789273500442505, "learning_rate": 1.4324641554231767e-06, "loss": 0.2234586775302887, "step": 1552 }, { "epoch": 3.278481012658228, "grad_norm": 5.3056182861328125, "learning_rate": 1.4253719713409958e-06, "loss": 0.40713340044021606, "step": 1554 }, { "epoch": 3.282700421940928, "grad_norm": 0.8367089033126831, "learning_rate": 1.41831599192407e-06, "loss": 0.7326263189315796, "step": 1556 }, { "epoch": 3.2869198312236287, "grad_norm": 1.1955314874649048, "learning_rate": 1.4112963030518329e-06, "loss": 0.5510862469673157, "step": 1558 }, { "epoch": 3.291139240506329, "grad_norm": 1.1264405250549316, "learning_rate": 1.4043129901620198e-06, "loss": 0.44987189769744873, "step": 1560 }, { "epoch": 3.2953586497890295, "grad_norm": 2.407663345336914, "learning_rate": 1.397366138249633e-06, "loss": 0.42221248149871826, "step": 1562 }, { "epoch": 3.29957805907173, "grad_norm": 2.001704692840576, "learning_rate": 1.3904558318658964e-06, "loss": 0.7191241383552551, "step": 1564 }, { "epoch": 3.3037974683544302, "grad_norm": 2.9357941150665283, "learning_rate": 1.3835821551172352e-06, "loss": 0.5609620809555054, "step": 1566 }, { "epoch": 3.308016877637131, "grad_norm": 0.1518426090478897, "learning_rate": 1.3767451916642502e-06, "loss": 0.3671785891056061, "step": 1568 }, { "epoch": 3.3122362869198314, "grad_norm": 2.9103848934173584, "learning_rate": 1.3699450247206987e-06, "loss": 0.3877882659435272, "step": 1570 }, { "epoch": 3.3164556962025316, "grad_norm": 1.832383394241333, "learning_rate": 1.363181737052479e-06, "loss": 0.38887959718704224, "step": 1572 }, { "epoch": 3.320675105485232, "grad_norm": 1.458479404449463, "learning_rate": 1.3564554109766303e-06, "loss": 0.87562096118927, "step": 1574 }, { "epoch": 3.3248945147679323, "grad_norm": 1.4098705053329468, "learning_rate": 1.3497661283603241e-06, "loss": 0.618715763092041, "step": 1576 }, { "epoch": 3.329113924050633, "grad_norm": 0.9463833570480347, "learning_rate": 1.3431139706198703e-06, "loss": 0.7363364100456238, "step": 1578 }, { "epoch": 3.3333333333333335, "grad_norm": 1.1084620952606201, "learning_rate": 1.336499018719726e-06, "loss": 0.46182820200920105, "step": 1580 }, { "epoch": 3.3375527426160336, "grad_norm": 2.6807730197906494, "learning_rate": 1.3299213531715104e-06, "loss": 0.4027124345302582, "step": 1582 }, { "epoch": 3.3417721518987342, "grad_norm": 3.307328939437866, "learning_rate": 1.3233810540330258e-06, "loss": 0.7045289278030396, "step": 1584 }, { "epoch": 3.3459915611814344, "grad_norm": 3.4561619758605957, "learning_rate": 1.3168782009072792e-06, "loss": 0.5450237989425659, "step": 1586 }, { "epoch": 3.350210970464135, "grad_norm": 0.9626051783561707, "learning_rate": 1.3104128729415191e-06, "loss": 0.29501575231552124, "step": 1588 }, { "epoch": 3.3544303797468356, "grad_norm": 0.9624335169792175, "learning_rate": 1.3039851488262682e-06, "loss": 0.7472168207168579, "step": 1590 }, { "epoch": 3.3586497890295357, "grad_norm": 1.6804134845733643, "learning_rate": 1.2975951067943673e-06, "loss": 0.7001281976699829, "step": 1592 }, { "epoch": 3.3628691983122363, "grad_norm": 0.3944559097290039, "learning_rate": 1.2912428246200215e-06, "loss": 0.45443102717399597, "step": 1594 }, { "epoch": 3.367088607594937, "grad_norm": 3.5907063484191895, "learning_rate": 1.2849283796178554e-06, "loss": 0.32309669256210327, "step": 1596 }, { "epoch": 3.371308016877637, "grad_norm": 1.1190893650054932, "learning_rate": 1.2786518486419726e-06, "loss": 0.369854599237442, "step": 1598 }, { "epoch": 3.3755274261603376, "grad_norm": 5.47310733795166, "learning_rate": 1.2724133080850176e-06, "loss": 0.5572913289070129, "step": 1600 }, { "epoch": 3.379746835443038, "grad_norm": 1.1562130451202393, "learning_rate": 1.266212833877248e-06, "loss": 0.4165474772453308, "step": 1602 }, { "epoch": 3.3839662447257384, "grad_norm": 0.32527777552604675, "learning_rate": 1.2600505014856088e-06, "loss": 0.3750830888748169, "step": 1604 }, { "epoch": 3.388185654008439, "grad_norm": 1.6657167673110962, "learning_rate": 1.253926385912818e-06, "loss": 0.8115463852882385, "step": 1606 }, { "epoch": 3.392405063291139, "grad_norm": 1.3920835256576538, "learning_rate": 1.2478405616964485e-06, "loss": 0.4179677963256836, "step": 1608 }, { "epoch": 3.3966244725738397, "grad_norm": 1.1664825677871704, "learning_rate": 1.2417931029080215e-06, "loss": 0.41709059476852417, "step": 1610 }, { "epoch": 3.40084388185654, "grad_norm": 1.5139544010162354, "learning_rate": 1.23578408315211e-06, "loss": 0.7101098299026489, "step": 1612 }, { "epoch": 3.4050632911392404, "grad_norm": 0.8697150945663452, "learning_rate": 1.2298135755654378e-06, "loss": 0.20523357391357422, "step": 1614 }, { "epoch": 3.409282700421941, "grad_norm": 2.3192219734191895, "learning_rate": 1.2238816528159904e-06, "loss": 0.6002774238586426, "step": 1616 }, { "epoch": 3.413502109704641, "grad_norm": 1.1426221132278442, "learning_rate": 1.2179883871021322e-06, "loss": 0.8457775712013245, "step": 1618 }, { "epoch": 3.4177215189873418, "grad_norm": 0.9432998895645142, "learning_rate": 1.2121338501517264e-06, "loss": 0.7718835473060608, "step": 1620 }, { "epoch": 3.4219409282700424, "grad_norm": 0.8319332599639893, "learning_rate": 1.2063181132212632e-06, "loss": 0.43066444993019104, "step": 1622 }, { "epoch": 3.4261603375527425, "grad_norm": 6.067880630493164, "learning_rate": 1.200541247094989e-06, "loss": 0.23788659274578094, "step": 1624 }, { "epoch": 3.430379746835443, "grad_norm": 5.315701007843018, "learning_rate": 1.1948033220840512e-06, "loss": 0.17813172936439514, "step": 1626 }, { "epoch": 3.4345991561181437, "grad_norm": 1.0699946880340576, "learning_rate": 1.1891044080256355e-06, "loss": 0.67367023229599, "step": 1628 }, { "epoch": 3.438818565400844, "grad_norm": 1.1057825088500977, "learning_rate": 1.1834445742821226e-06, "loss": 0.27095526456832886, "step": 1630 }, { "epoch": 3.4430379746835444, "grad_norm": 1.340883493423462, "learning_rate": 1.1778238897402362e-06, "loss": 0.8471240401268005, "step": 1632 }, { "epoch": 3.4472573839662446, "grad_norm": 0.34447231888771057, "learning_rate": 1.1722424228102123e-06, "loss": 0.4438764452934265, "step": 1634 }, { "epoch": 3.451476793248945, "grad_norm": 3.183422327041626, "learning_rate": 1.1667002414249631e-06, "loss": 0.7752975225448608, "step": 1636 }, { "epoch": 3.4556962025316453, "grad_norm": 2.9104697704315186, "learning_rate": 1.1611974130392475e-06, "loss": 0.9540504813194275, "step": 1638 }, { "epoch": 3.459915611814346, "grad_norm": 1.6280553340911865, "learning_rate": 1.1557340046288554e-06, "loss": 0.8632485270500183, "step": 1640 }, { "epoch": 3.4641350210970465, "grad_norm": 2.387031078338623, "learning_rate": 1.1503100826897889e-06, "loss": 0.8144734501838684, "step": 1642 }, { "epoch": 3.4683544303797467, "grad_norm": 0.9215964674949646, "learning_rate": 1.144925713237456e-06, "loss": 0.20231464505195618, "step": 1644 }, { "epoch": 3.4725738396624473, "grad_norm": 1.58251953125, "learning_rate": 1.1395809618058614e-06, "loss": 0.5774148106575012, "step": 1646 }, { "epoch": 3.476793248945148, "grad_norm": 2.2536582946777344, "learning_rate": 1.1342758934468158e-06, "loss": 0.6982643604278564, "step": 1648 }, { "epoch": 3.481012658227848, "grad_norm": 1.4097844362258911, "learning_rate": 1.12901057272914e-06, "loss": 0.38915500044822693, "step": 1650 }, { "epoch": 3.4852320675105486, "grad_norm": 1.30046546459198, "learning_rate": 1.1237850637378808e-06, "loss": 0.6481969356536865, "step": 1652 }, { "epoch": 3.489451476793249, "grad_norm": 0.18971386551856995, "learning_rate": 1.1185994300735278e-06, "loss": 0.3767941892147064, "step": 1654 }, { "epoch": 3.4936708860759493, "grad_norm": 0.3824913203716278, "learning_rate": 1.1134537348512443e-06, "loss": 0.5739644169807434, "step": 1656 }, { "epoch": 3.49789029535865, "grad_norm": 2.9707915782928467, "learning_rate": 1.1083480407000954e-06, "loss": 0.609894335269928, "step": 1658 }, { "epoch": 3.50210970464135, "grad_norm": 1.3457541465759277, "learning_rate": 1.103282409762287e-06, "loss": 0.6929283142089844, "step": 1660 }, { "epoch": 3.5063291139240507, "grad_norm": 2.39221453666687, "learning_rate": 1.0982569036924092e-06, "loss": 0.8087446093559265, "step": 1662 }, { "epoch": 3.510548523206751, "grad_norm": 5.895007610321045, "learning_rate": 1.0932715836566866e-06, "loss": 0.3411268889904022, "step": 1664 }, { "epoch": 3.5147679324894514, "grad_norm": 1.3041728734970093, "learning_rate": 1.0883265103322333e-06, "loss": 0.8067029714584351, "step": 1666 }, { "epoch": 3.518987341772152, "grad_norm": 1.6455022096633911, "learning_rate": 1.083421743906313e-06, "loss": 0.4951574504375458, "step": 1668 }, { "epoch": 3.523206751054852, "grad_norm": 1.431204915046692, "learning_rate": 1.0785573440756093e-06, "loss": 0.7452267408370972, "step": 1670 }, { "epoch": 3.5274261603375527, "grad_norm": 7.941998481750488, "learning_rate": 1.0737333700454966e-06, "loss": 0.2036304473876953, "step": 1672 }, { "epoch": 3.5316455696202533, "grad_norm": 1.081209659576416, "learning_rate": 1.068949880529322e-06, "loss": 0.4741116166114807, "step": 1674 }, { "epoch": 3.5358649789029535, "grad_norm": 3.1109554767608643, "learning_rate": 1.0642069337476872e-06, "loss": 0.5494669675827026, "step": 1676 }, { "epoch": 3.540084388185654, "grad_norm": 3.2354819774627686, "learning_rate": 1.0595045874277425e-06, "loss": 0.4578985571861267, "step": 1678 }, { "epoch": 3.5443037974683547, "grad_norm": 1.4328290224075317, "learning_rate": 1.0548428988024858e-06, "loss": 0.7518556714057922, "step": 1680 }, { "epoch": 3.548523206751055, "grad_norm": 1.069136619567871, "learning_rate": 1.050221924610061e-06, "loss": 0.567197859287262, "step": 1682 }, { "epoch": 3.5527426160337554, "grad_norm": 21.512428283691406, "learning_rate": 1.045641721093071e-06, "loss": 0.6879177093505859, "step": 1684 }, { "epoch": 3.5569620253164556, "grad_norm": 3.211840867996216, "learning_rate": 1.041102343997893e-06, "loss": 0.23187503218650818, "step": 1686 }, { "epoch": 3.561181434599156, "grad_norm": 0.7154665589332581, "learning_rate": 1.0366038485739996e-06, "loss": 0.4495694935321808, "step": 1688 }, { "epoch": 3.5654008438818563, "grad_norm": 1.3137481212615967, "learning_rate": 1.032146289573284e-06, "loss": 0.7427676320075989, "step": 1690 }, { "epoch": 3.569620253164557, "grad_norm": 4.688238620758057, "learning_rate": 1.027729721249399e-06, "loss": 0.16239574551582336, "step": 1692 }, { "epoch": 3.5738396624472575, "grad_norm": 0.26294824481010437, "learning_rate": 1.023354197357091e-06, "loss": 0.6016992926597595, "step": 1694 }, { "epoch": 3.5780590717299576, "grad_norm": 2.513110637664795, "learning_rate": 1.0190197711515498e-06, "loss": 0.20142441987991333, "step": 1696 }, { "epoch": 3.5822784810126582, "grad_norm": 1.3879189491271973, "learning_rate": 1.014726495387757e-06, "loss": 0.5553002953529358, "step": 1698 }, { "epoch": 3.586497890295359, "grad_norm": 1.3632709980010986, "learning_rate": 1.0104744223198471e-06, "loss": 0.4727664589881897, "step": 1700 }, { "epoch": 3.590717299578059, "grad_norm": 1.0121846199035645, "learning_rate": 1.0062636037004696e-06, "loss": 0.3748111128807068, "step": 1702 }, { "epoch": 3.5949367088607596, "grad_norm": 1.831874132156372, "learning_rate": 1.0020940907801604e-06, "loss": 0.869547963142395, "step": 1704 }, { "epoch": 3.59915611814346, "grad_norm": 7.198643684387207, "learning_rate": 9.979659343067154e-07, "loss": 0.5534847974777222, "step": 1706 }, { "epoch": 3.6033755274261603, "grad_norm": 2.4725635051727295, "learning_rate": 9.938791845245768e-07, "loss": 0.5149208307266235, "step": 1708 }, { "epoch": 3.607594936708861, "grad_norm": 0.5918768048286438, "learning_rate": 9.898338911742186e-07, "loss": 0.364676296710968, "step": 1710 }, { "epoch": 3.611814345991561, "grad_norm": 1.779348611831665, "learning_rate": 9.85830103491541e-07, "loss": 0.7533677816390991, "step": 1712 }, { "epoch": 3.6160337552742616, "grad_norm": 3.70202374458313, "learning_rate": 9.818678702072734e-07, "loss": 0.9169490933418274, "step": 1714 }, { "epoch": 3.620253164556962, "grad_norm": 1.195534110069275, "learning_rate": 9.779472395463802e-07, "loss": 0.39904284477233887, "step": 1716 }, { "epoch": 3.6244725738396624, "grad_norm": 1.971677303314209, "learning_rate": 9.740682592274744e-07, "loss": 0.3311789035797119, "step": 1718 }, { "epoch": 3.628691983122363, "grad_norm": 1.8470239639282227, "learning_rate": 9.702309764622328e-07, "loss": 0.1799009144306183, "step": 1720 }, { "epoch": 3.632911392405063, "grad_norm": 2.5582504272460938, "learning_rate": 9.664354379548284e-07, "loss": 0.8046585321426392, "step": 1722 }, { "epoch": 3.6371308016877637, "grad_norm": 3.1312508583068848, "learning_rate": 9.62681689901357e-07, "loss": 0.3371848165988922, "step": 1724 }, { "epoch": 3.6413502109704643, "grad_norm": 2.6263599395751953, "learning_rate": 9.589697779892765e-07, "loss": 0.2725059986114502, "step": 1726 }, { "epoch": 3.6455696202531644, "grad_norm": 1.8412586450576782, "learning_rate": 9.552997473968485e-07, "loss": 0.8444567918777466, "step": 1728 }, { "epoch": 3.649789029535865, "grad_norm": 2.1324095726013184, "learning_rate": 9.516716427925936e-07, "loss": 0.15560747683048248, "step": 1730 }, { "epoch": 3.6540084388185656, "grad_norm": 3.671393394470215, "learning_rate": 9.480855083347428e-07, "loss": 0.7069560289382935, "step": 1732 }, { "epoch": 3.6582278481012658, "grad_norm": 2.5802621841430664, "learning_rate": 9.445413876707028e-07, "loss": 0.2358541190624237, "step": 1734 }, { "epoch": 3.6624472573839664, "grad_norm": 1.5859323740005493, "learning_rate": 9.41039323936522e-07, "loss": 0.20277546346187592, "step": 1736 }, { "epoch": 3.6666666666666665, "grad_norm": 0.9495890736579895, "learning_rate": 9.375793597563692e-07, "loss": 0.5327252745628357, "step": 1738 }, { "epoch": 3.670886075949367, "grad_norm": 5.723246097564697, "learning_rate": 9.341615372420126e-07, "loss": 0.2760300636291504, "step": 1740 }, { "epoch": 3.6751054852320673, "grad_norm": 1.0034908056259155, "learning_rate": 9.307858979923064e-07, "loss": 0.905087411403656, "step": 1742 }, { "epoch": 3.679324894514768, "grad_norm": 0.9102334976196289, "learning_rate": 9.274524830926866e-07, "loss": 0.40605294704437256, "step": 1744 }, { "epoch": 3.6835443037974684, "grad_norm": 1.6663873195648193, "learning_rate": 9.241613331146703e-07, "loss": 0.4531800448894501, "step": 1746 }, { "epoch": 3.6877637130801686, "grad_norm": 0.9356587529182434, "learning_rate": 9.209124881153613e-07, "loss": 0.8058107495307922, "step": 1748 }, { "epoch": 3.691983122362869, "grad_norm": 1.130460500717163, "learning_rate": 9.177059876369619e-07, "loss": 0.5929072499275208, "step": 1750 }, { "epoch": 3.6962025316455698, "grad_norm": 1.5055688619613647, "learning_rate": 9.145418707062941e-07, "loss": 0.7090030908584595, "step": 1752 }, { "epoch": 3.70042194092827, "grad_norm": 1.6110928058624268, "learning_rate": 9.114201758343216e-07, "loss": 0.8376182913780212, "step": 1754 }, { "epoch": 3.7046413502109705, "grad_norm": 2.095933198928833, "learning_rate": 9.083409410156845e-07, "loss": 0.6055005788803101, "step": 1756 }, { "epoch": 3.708860759493671, "grad_norm": 4.733712673187256, "learning_rate": 9.053042037282327e-07, "loss": 0.6132983565330505, "step": 1758 }, { "epoch": 3.7130801687763713, "grad_norm": 5.780431270599365, "learning_rate": 9.023100009325733e-07, "loss": 0.5792241096496582, "step": 1760 }, { "epoch": 3.717299578059072, "grad_norm": 2.4617018699645996, "learning_rate": 8.993583690716196e-07, "loss": 0.16029909253120422, "step": 1762 }, { "epoch": 3.721518987341772, "grad_norm": 22.150638580322266, "learning_rate": 8.964493440701455e-07, "loss": 0.41341426968574524, "step": 1764 }, { "epoch": 3.7257383966244726, "grad_norm": 1.3757541179656982, "learning_rate": 8.935829613343528e-07, "loss": 0.6639930605888367, "step": 1766 }, { "epoch": 3.7299578059071727, "grad_norm": 2.44677472114563, "learning_rate": 8.907592557514363e-07, "loss": 0.404757022857666, "step": 1768 }, { "epoch": 3.7341772151898733, "grad_norm": 1.3542742729187012, "learning_rate": 8.8797826168916e-07, "loss": 0.539573073387146, "step": 1770 }, { "epoch": 3.738396624472574, "grad_norm": 1.3471025228500366, "learning_rate": 8.852400129954396e-07, "loss": 0.7064318656921387, "step": 1772 }, { "epoch": 3.742616033755274, "grad_norm": 0.8148087859153748, "learning_rate": 8.825445429979306e-07, "loss": 0.22752483189105988, "step": 1774 }, { "epoch": 3.7468354430379747, "grad_norm": 0.2493860125541687, "learning_rate": 8.798918845036217e-07, "loss": 0.4672152101993561, "step": 1776 }, { "epoch": 3.7510548523206753, "grad_norm": 1.1036282777786255, "learning_rate": 8.772820697984369e-07, "loss": 0.6906728148460388, "step": 1778 }, { "epoch": 3.7552742616033754, "grad_norm": 1.202394962310791, "learning_rate": 8.747151306468404e-07, "loss": 0.689781904220581, "step": 1780 }, { "epoch": 3.759493670886076, "grad_norm": 1.5368152856826782, "learning_rate": 8.721910982914527e-07, "loss": 0.6156559586524963, "step": 1782 }, { "epoch": 3.7637130801687766, "grad_norm": 2.000227928161621, "learning_rate": 8.697100034526685e-07, "loss": 0.6539533734321594, "step": 1784 }, { "epoch": 3.7679324894514767, "grad_norm": 1.3653666973114014, "learning_rate": 8.672718763282814e-07, "loss": 0.7773669362068176, "step": 1786 }, { "epoch": 3.7721518987341773, "grad_norm": 4.830440521240234, "learning_rate": 8.648767465931215e-07, "loss": 0.11648596078157425, "step": 1788 }, { "epoch": 3.7763713080168775, "grad_norm": 1.1531578302383423, "learning_rate": 8.625246433986894e-07, "loss": 0.3612111806869507, "step": 1790 }, { "epoch": 3.780590717299578, "grad_norm": 1.085537075996399, "learning_rate": 8.602155953728014e-07, "loss": 0.7319397330284119, "step": 1792 }, { "epoch": 3.7848101265822782, "grad_norm": 2.1902003288269043, "learning_rate": 8.579496306192452e-07, "loss": 0.42418360710144043, "step": 1794 }, { "epoch": 3.789029535864979, "grad_norm": 2.2132256031036377, "learning_rate": 8.557267767174329e-07, "loss": 0.6966800093650818, "step": 1796 }, { "epoch": 3.7932489451476794, "grad_norm": 15.147263526916504, "learning_rate": 8.535470607220696e-07, "loss": 0.7651135325431824, "step": 1798 }, { "epoch": 3.7974683544303796, "grad_norm": 1.363494634628296, "learning_rate": 8.514105091628205e-07, "loss": 0.6677999496459961, "step": 1800 }, { "epoch": 3.80168776371308, "grad_norm": 2.6623036861419678, "learning_rate": 8.493171480439908e-07, "loss": 0.8932458758354187, "step": 1802 }, { "epoch": 3.8059071729957807, "grad_norm": 1.0123828649520874, "learning_rate": 8.47267002844208e-07, "loss": 0.3895692527294159, "step": 1804 }, { "epoch": 3.810126582278481, "grad_norm": 7.874610900878906, "learning_rate": 8.452600985161112e-07, "loss": 0.0816773921251297, "step": 1806 }, { "epoch": 3.8143459915611815, "grad_norm": 2.1802990436553955, "learning_rate": 8.432964594860478e-07, "loss": 0.7556171417236328, "step": 1808 }, { "epoch": 3.818565400843882, "grad_norm": 1.1918423175811768, "learning_rate": 8.413761096537786e-07, "loss": 0.6875542402267456, "step": 1810 }, { "epoch": 3.8227848101265822, "grad_norm": 1.7377078533172607, "learning_rate": 8.394990723921816e-07, "loss": 0.29866987466812134, "step": 1812 }, { "epoch": 3.827004219409283, "grad_norm": 1.0950947999954224, "learning_rate": 8.376653705469733e-07, "loss": 0.7598391771316528, "step": 1814 }, { "epoch": 3.831223628691983, "grad_norm": 2.3216395378112793, "learning_rate": 8.358750264364267e-07, "loss": 0.7117894291877747, "step": 1816 }, { "epoch": 3.8354430379746836, "grad_norm": 4.284765720367432, "learning_rate": 8.341280618511016e-07, "loss": 0.6586706042289734, "step": 1818 }, { "epoch": 3.8396624472573837, "grad_norm": 2.1526107788085938, "learning_rate": 8.324244980535782e-07, "loss": 0.5206190347671509, "step": 1820 }, { "epoch": 3.8438818565400843, "grad_norm": 1.1617799997329712, "learning_rate": 8.307643557781994e-07, "loss": 0.7454214692115784, "step": 1822 }, { "epoch": 3.848101265822785, "grad_norm": 1.9797450304031372, "learning_rate": 8.291476552308179e-07, "loss": 0.6207857728004456, "step": 1824 }, { "epoch": 3.852320675105485, "grad_norm": 1.9322015047073364, "learning_rate": 8.275744160885501e-07, "loss": 0.685775876045227, "step": 1826 }, { "epoch": 3.8565400843881856, "grad_norm": 2.0633387565612793, "learning_rate": 8.260446574995363e-07, "loss": 0.7667111754417419, "step": 1828 }, { "epoch": 3.8607594936708862, "grad_norm": 1.2368988990783691, "learning_rate": 8.245583980827098e-07, "loss": 0.6670578718185425, "step": 1830 }, { "epoch": 3.8649789029535864, "grad_norm": 2.3721048831939697, "learning_rate": 8.231156559275666e-07, "loss": 0.15816515684127808, "step": 1832 }, { "epoch": 3.869198312236287, "grad_norm": 3.6182823181152344, "learning_rate": 8.217164485939484e-07, "loss": 0.4539300501346588, "step": 1834 }, { "epoch": 3.8734177215189876, "grad_norm": 4.013774394989014, "learning_rate": 8.203607931118281e-07, "loss": 0.5095362663269043, "step": 1836 }, { "epoch": 3.8776371308016877, "grad_norm": 2.4649147987365723, "learning_rate": 8.190487059811013e-07, "loss": 0.4961618483066559, "step": 1838 }, { "epoch": 3.8818565400843883, "grad_norm": 4.491702079772949, "learning_rate": 8.177802031713863e-07, "loss": 0.7962309122085571, "step": 1840 }, { "epoch": 3.8860759493670884, "grad_norm": 0.9100720286369324, "learning_rate": 8.165553001218308e-07, "loss": 0.4460848867893219, "step": 1842 }, { "epoch": 3.890295358649789, "grad_norm": 0.9674596786499023, "learning_rate": 8.153740117409218e-07, "loss": 0.44232675433158875, "step": 1844 }, { "epoch": 3.894514767932489, "grad_norm": 1.0640747547149658, "learning_rate": 8.142363524063067e-07, "loss": 0.7083509564399719, "step": 1846 }, { "epoch": 3.8987341772151898, "grad_norm": 7.521092414855957, "learning_rate": 8.131423359646147e-07, "loss": 0.309792697429657, "step": 1848 }, { "epoch": 3.9029535864978904, "grad_norm": 1.9627479314804077, "learning_rate": 8.120919757312934e-07, "loss": 0.7434027194976807, "step": 1850 }, { "epoch": 3.9071729957805905, "grad_norm": 1.8858873844146729, "learning_rate": 8.110852844904411e-07, "loss": 0.7783426642417908, "step": 1852 }, { "epoch": 3.911392405063291, "grad_norm": 1.578383445739746, "learning_rate": 8.101222744946554e-07, "loss": 0.7528443336486816, "step": 1854 }, { "epoch": 3.9156118143459917, "grad_norm": 1.0352789163589478, "learning_rate": 8.092029574648825e-07, "loss": 0.6360561847686768, "step": 1856 }, { "epoch": 3.919831223628692, "grad_norm": 1.9537928104400635, "learning_rate": 8.08327344590275e-07, "loss": 0.7542226314544678, "step": 1858 }, { "epoch": 3.9240506329113924, "grad_norm": 1.5767334699630737, "learning_rate": 8.074954465280533e-07, "loss": 0.7059440016746521, "step": 1860 }, { "epoch": 3.928270042194093, "grad_norm": 1.4566371440887451, "learning_rate": 8.067072734033808e-07, "loss": 0.44404223561286926, "step": 1862 }, { "epoch": 3.932489451476793, "grad_norm": 1.6387444734573364, "learning_rate": 8.05962834809236e-07, "loss": 0.4295271039009094, "step": 1864 }, { "epoch": 3.9367088607594938, "grad_norm": 4.41506290435791, "learning_rate": 8.052621398062982e-07, "loss": 0.9274621605873108, "step": 1866 }, { "epoch": 3.9409282700421944, "grad_norm": 2.0459539890289307, "learning_rate": 8.046051969228362e-07, "loss": 0.6663318872451782, "step": 1868 }, { "epoch": 3.9451476793248945, "grad_norm": 1.6559040546417236, "learning_rate": 8.039920141546053e-07, "loss": 0.5702696442604065, "step": 1870 }, { "epoch": 3.9493670886075947, "grad_norm": 0.035610347986221313, "learning_rate": 8.034225989647494e-07, "loss": 0.2956307530403137, "step": 1872 }, { "epoch": 3.9535864978902953, "grad_norm": 5.27208948135376, "learning_rate": 8.028969582837097e-07, "loss": 0.22891630232334137, "step": 1874 }, { "epoch": 3.957805907172996, "grad_norm": 1.6149741411209106, "learning_rate": 8.024150985091419e-07, "loss": 0.5240350961685181, "step": 1876 }, { "epoch": 3.962025316455696, "grad_norm": 3.051912307739258, "learning_rate": 8.019770255058373e-07, "loss": 0.6355645060539246, "step": 1878 }, { "epoch": 3.9662447257383966, "grad_norm": 1.413213849067688, "learning_rate": 8.015827446056511e-07, "loss": 0.4071570634841919, "step": 1880 }, { "epoch": 3.970464135021097, "grad_norm": 1.0149176120758057, "learning_rate": 8.012322606074381e-07, "loss": 0.6791200637817383, "step": 1882 }, { "epoch": 3.9746835443037973, "grad_norm": 6.349963188171387, "learning_rate": 8.009255777769939e-07, "loss": 0.1739484965801239, "step": 1884 }, { "epoch": 3.978902953586498, "grad_norm": 1.85853111743927, "learning_rate": 8.006626998470039e-07, "loss": 0.6670107245445251, "step": 1886 }, { "epoch": 3.9831223628691985, "grad_norm": 1.893870234489441, "learning_rate": 8.004436300169959e-07, "loss": 0.5138272047042847, "step": 1888 }, { "epoch": 3.9873417721518987, "grad_norm": 3.016247272491455, "learning_rate": 8.002683709533043e-07, "loss": 0.7126239538192749, "step": 1890 }, { "epoch": 3.9915611814345993, "grad_norm": 2.8701586723327637, "learning_rate": 8.001369247890338e-07, "loss": 0.4470701813697815, "step": 1892 }, { "epoch": 3.9957805907173, "grad_norm": 1.1190752983093262, "learning_rate": 8.00049293124037e-07, "loss": 0.6980884075164795, "step": 1894 }, { "epoch": 4.0, "grad_norm": 1.7359868288040161, "learning_rate": 8.000054770248921e-07, "loss": 0.6384545564651489, "step": 1896 }, { "epoch": 4.0, "step": 1896, "total_flos": 3.5948540672197263e+18, "train_loss": 0.8366947202287017, "train_runtime": 8313.2571, "train_samples_per_second": 6.842, "train_steps_per_second": 0.228 } ], "logging_steps": 2, "max_steps": 1896, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5948540672197263e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }