{ "best_global_step": 638, "best_metric": 0.313894122838974, "best_model_checkpoint": "outputs/checkpoint-638", "epoch": 5.0, "eval_steps": 500, "global_step": 1595, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003144036156415799, "grad_norm": 2.18092679977417, "learning_rate": 0.0, "loss": 1.3508, "step": 1 }, { "epoch": 0.006288072312831598, "grad_norm": 2.135878562927246, "learning_rate": 2e-05, "loss": 1.277, "step": 2 }, { "epoch": 0.009432108469247396, "grad_norm": 1.8312653303146362, "learning_rate": 4e-05, "loss": 1.2121, "step": 3 }, { "epoch": 0.012576144625663196, "grad_norm": 1.2045841217041016, "learning_rate": 6e-05, "loss": 1.1445, "step": 4 }, { "epoch": 0.015720180782078996, "grad_norm": 0.8778926730155945, "learning_rate": 8e-05, "loss": 1.0126, "step": 5 }, { "epoch": 0.018864216938494792, "grad_norm": 0.9060773849487305, "learning_rate": 0.0001, "loss": 0.8732, "step": 6 }, { "epoch": 0.02200825309491059, "grad_norm": 0.8855406045913696, "learning_rate": 9.995497523638001e-05, "loss": 0.7976, "step": 7 }, { "epoch": 0.02515228925132639, "grad_norm": 0.8502305746078491, "learning_rate": 9.990995047276002e-05, "loss": 0.6693, "step": 8 }, { "epoch": 0.028296325407742188, "grad_norm": 2.569504737854004, "learning_rate": 9.986492570914003e-05, "loss": 0.5713, "step": 9 }, { "epoch": 0.03144036156415799, "grad_norm": 0.60772705078125, "learning_rate": 9.981990094552004e-05, "loss": 0.5188, "step": 10 }, { "epoch": 0.034584397720573784, "grad_norm": 0.3977140784263611, "learning_rate": 9.977487618190005e-05, "loss": 0.5009, "step": 11 }, { "epoch": 0.037728433876989584, "grad_norm": 0.4928475022315979, "learning_rate": 9.972985141828006e-05, "loss": 0.4721, "step": 12 }, { "epoch": 0.040872470033405384, "grad_norm": 0.3859867453575134, "learning_rate": 9.968482665466006e-05, "loss": 0.4744, "step": 13 }, { "epoch": 0.04401650618982118, "grad_norm": 0.24680837988853455, "learning_rate": 9.963980189104007e-05, "loss": 0.4696, "step": 14 }, { "epoch": 0.04716054234623698, "grad_norm": 0.23483239114284515, "learning_rate": 9.95947771274201e-05, "loss": 0.4319, "step": 15 }, { "epoch": 0.05030457850265278, "grad_norm": 0.21595372259616852, "learning_rate": 9.954975236380009e-05, "loss": 0.4489, "step": 16 }, { "epoch": 0.053448614659068576, "grad_norm": 0.5875914096832275, "learning_rate": 9.95047276001801e-05, "loss": 0.4522, "step": 17 }, { "epoch": 0.056592650815484376, "grad_norm": 0.22523947060108185, "learning_rate": 9.945970283656011e-05, "loss": 0.4459, "step": 18 }, { "epoch": 0.059736686971900176, "grad_norm": 0.20356932282447815, "learning_rate": 9.941467807294013e-05, "loss": 0.4596, "step": 19 }, { "epoch": 0.06288072312831598, "grad_norm": 1.1203888654708862, "learning_rate": 9.936965330932014e-05, "loss": 0.446, "step": 20 }, { "epoch": 0.06602475928473177, "grad_norm": 0.2615947723388672, "learning_rate": 9.932462854570013e-05, "loss": 0.4723, "step": 21 }, { "epoch": 0.06916879544114757, "grad_norm": 0.21939712762832642, "learning_rate": 9.927960378208014e-05, "loss": 0.4203, "step": 22 }, { "epoch": 0.07231283159756337, "grad_norm": 0.18068519234657288, "learning_rate": 9.923457901846016e-05, "loss": 0.4117, "step": 23 }, { "epoch": 0.07545686775397917, "grad_norm": 0.1733531355857849, "learning_rate": 9.918955425484017e-05, "loss": 0.4386, "step": 24 }, { "epoch": 0.07860090391039497, "grad_norm": 0.18101659417152405, "learning_rate": 9.914452949122017e-05, "loss": 0.4291, "step": 25 }, { "epoch": 0.08174494006681077, "grad_norm": 0.18338626623153687, "learning_rate": 9.909950472760019e-05, "loss": 0.4189, "step": 26 }, { "epoch": 0.08488897622322657, "grad_norm": 0.17645250260829926, "learning_rate": 9.90544799639802e-05, "loss": 0.3872, "step": 27 }, { "epoch": 0.08803301237964237, "grad_norm": 0.1902536153793335, "learning_rate": 9.900945520036021e-05, "loss": 0.4055, "step": 28 }, { "epoch": 0.09117704853605817, "grad_norm": 0.18971717357635498, "learning_rate": 9.89644304367402e-05, "loss": 0.4148, "step": 29 }, { "epoch": 0.09432108469247397, "grad_norm": 0.1757958084344864, "learning_rate": 9.891940567312022e-05, "loss": 0.4019, "step": 30 }, { "epoch": 0.09746512084888977, "grad_norm": 0.18116620182991028, "learning_rate": 9.887438090950023e-05, "loss": 0.409, "step": 31 }, { "epoch": 0.10060915700530557, "grad_norm": 0.16721461713314056, "learning_rate": 9.882935614588024e-05, "loss": 0.4118, "step": 32 }, { "epoch": 0.10375319316172137, "grad_norm": 0.18521425127983093, "learning_rate": 9.878433138226025e-05, "loss": 0.4358, "step": 33 }, { "epoch": 0.10689722931813715, "grad_norm": 0.18263505399227142, "learning_rate": 9.873930661864026e-05, "loss": 0.3956, "step": 34 }, { "epoch": 0.11004126547455295, "grad_norm": 0.1665913313627243, "learning_rate": 9.869428185502027e-05, "loss": 0.3826, "step": 35 }, { "epoch": 0.11318530163096875, "grad_norm": 0.16498151421546936, "learning_rate": 9.864925709140028e-05, "loss": 0.3556, "step": 36 }, { "epoch": 0.11632933778738455, "grad_norm": 0.17468655109405518, "learning_rate": 9.860423232778028e-05, "loss": 0.3702, "step": 37 }, { "epoch": 0.11947337394380035, "grad_norm": 0.1699349582195282, "learning_rate": 9.855920756416029e-05, "loss": 0.3763, "step": 38 }, { "epoch": 0.12261741010021615, "grad_norm": 0.17979387938976288, "learning_rate": 9.85141828005403e-05, "loss": 0.4059, "step": 39 }, { "epoch": 0.12576144625663196, "grad_norm": 0.17460626363754272, "learning_rate": 9.846915803692031e-05, "loss": 0.4032, "step": 40 }, { "epoch": 0.12890548241304775, "grad_norm": 0.1785130649805069, "learning_rate": 9.842413327330032e-05, "loss": 0.3939, "step": 41 }, { "epoch": 0.13204951856946354, "grad_norm": 0.1852668821811676, "learning_rate": 9.837910850968033e-05, "loss": 0.3746, "step": 42 }, { "epoch": 0.13519355472587935, "grad_norm": 0.18407292664051056, "learning_rate": 9.833408374606034e-05, "loss": 0.3851, "step": 43 }, { "epoch": 0.13833759088229514, "grad_norm": 0.18785783648490906, "learning_rate": 9.828905898244036e-05, "loss": 0.3895, "step": 44 }, { "epoch": 0.14148162703871095, "grad_norm": 0.18965557217597961, "learning_rate": 9.824403421882035e-05, "loss": 0.3746, "step": 45 }, { "epoch": 0.14462566319512674, "grad_norm": 0.1771431416273117, "learning_rate": 9.819900945520036e-05, "loss": 0.3483, "step": 46 }, { "epoch": 0.14776969935154255, "grad_norm": 0.18872253596782684, "learning_rate": 9.815398469158037e-05, "loss": 0.3533, "step": 47 }, { "epoch": 0.15091373550795834, "grad_norm": 0.18482953310012817, "learning_rate": 9.810895992796039e-05, "loss": 0.3758, "step": 48 }, { "epoch": 0.15405777166437415, "grad_norm": 0.18910518288612366, "learning_rate": 9.806393516434039e-05, "loss": 0.3804, "step": 49 }, { "epoch": 0.15720180782078993, "grad_norm": 0.187296524643898, "learning_rate": 9.80189104007204e-05, "loss": 0.3676, "step": 50 }, { "epoch": 0.16034584397720575, "grad_norm": 0.19214150309562683, "learning_rate": 9.797388563710042e-05, "loss": 0.3921, "step": 51 }, { "epoch": 0.16348988013362153, "grad_norm": 0.19029422104358673, "learning_rate": 9.792886087348043e-05, "loss": 0.3778, "step": 52 }, { "epoch": 0.16663391629003735, "grad_norm": 0.1980220377445221, "learning_rate": 9.788383610986042e-05, "loss": 0.3917, "step": 53 }, { "epoch": 0.16977795244645313, "grad_norm": 0.19083669781684875, "learning_rate": 9.783881134624043e-05, "loss": 0.3798, "step": 54 }, { "epoch": 0.17292198860286892, "grad_norm": 0.25795647501945496, "learning_rate": 9.779378658262045e-05, "loss": 0.3877, "step": 55 }, { "epoch": 0.17606602475928473, "grad_norm": 0.19090382754802704, "learning_rate": 9.774876181900046e-05, "loss": 0.3696, "step": 56 }, { "epoch": 0.17921006091570052, "grad_norm": 0.19982369244098663, "learning_rate": 9.770373705538046e-05, "loss": 0.3734, "step": 57 }, { "epoch": 0.18235409707211633, "grad_norm": 0.1944751739501953, "learning_rate": 9.765871229176046e-05, "loss": 0.3572, "step": 58 }, { "epoch": 0.18549813322853212, "grad_norm": 0.1942175179719925, "learning_rate": 9.761368752814049e-05, "loss": 0.4059, "step": 59 }, { "epoch": 0.18864216938494793, "grad_norm": 0.177927166223526, "learning_rate": 9.75686627645205e-05, "loss": 0.3645, "step": 60 }, { "epoch": 0.19178620554136372, "grad_norm": 0.18761321902275085, "learning_rate": 9.752363800090049e-05, "loss": 0.3735, "step": 61 }, { "epoch": 0.19493024169777953, "grad_norm": 0.21108420193195343, "learning_rate": 9.747861323728051e-05, "loss": 0.3603, "step": 62 }, { "epoch": 0.19807427785419532, "grad_norm": 0.18813803791999817, "learning_rate": 9.743358847366052e-05, "loss": 0.366, "step": 63 }, { "epoch": 0.20121831401061113, "grad_norm": 0.1801685392856598, "learning_rate": 9.738856371004053e-05, "loss": 0.3585, "step": 64 }, { "epoch": 0.20436235016702692, "grad_norm": 0.1869877278804779, "learning_rate": 9.734353894642053e-05, "loss": 0.3787, "step": 65 }, { "epoch": 0.20750638632344273, "grad_norm": 0.18504877388477325, "learning_rate": 9.729851418280055e-05, "loss": 0.3442, "step": 66 }, { "epoch": 0.21065042247985852, "grad_norm": 0.19591134786605835, "learning_rate": 9.725348941918056e-05, "loss": 0.3876, "step": 67 }, { "epoch": 0.2137944586362743, "grad_norm": 0.1981891542673111, "learning_rate": 9.720846465556056e-05, "loss": 0.3507, "step": 68 }, { "epoch": 0.21693849479269012, "grad_norm": 0.20417073369026184, "learning_rate": 9.716343989194057e-05, "loss": 0.3667, "step": 69 }, { "epoch": 0.2200825309491059, "grad_norm": 0.19462363421916962, "learning_rate": 9.711841512832058e-05, "loss": 0.3595, "step": 70 }, { "epoch": 0.22322656710552172, "grad_norm": 0.17222774028778076, "learning_rate": 9.707339036470059e-05, "loss": 0.3451, "step": 71 }, { "epoch": 0.2263706032619375, "grad_norm": 0.1774955689907074, "learning_rate": 9.70283656010806e-05, "loss": 0.3386, "step": 72 }, { "epoch": 0.22951463941835332, "grad_norm": 0.189998060464859, "learning_rate": 9.698334083746061e-05, "loss": 0.3522, "step": 73 }, { "epoch": 0.2326586755747691, "grad_norm": 0.1920982450246811, "learning_rate": 9.693831607384062e-05, "loss": 0.3733, "step": 74 }, { "epoch": 0.23580271173118492, "grad_norm": 0.1971607357263565, "learning_rate": 9.689329131022062e-05, "loss": 0.3504, "step": 75 }, { "epoch": 0.2389467478876007, "grad_norm": 0.20512360334396362, "learning_rate": 9.684826654660063e-05, "loss": 0.3464, "step": 76 }, { "epoch": 0.24209078404401652, "grad_norm": 0.2119520902633667, "learning_rate": 9.680324178298064e-05, "loss": 0.3686, "step": 77 }, { "epoch": 0.2452348202004323, "grad_norm": 0.22858689725399017, "learning_rate": 9.675821701936065e-05, "loss": 0.3438, "step": 78 }, { "epoch": 0.24837885635684812, "grad_norm": 0.1901649832725525, "learning_rate": 9.671319225574066e-05, "loss": 0.3607, "step": 79 }, { "epoch": 0.25152289251326393, "grad_norm": 0.1896492838859558, "learning_rate": 9.666816749212068e-05, "loss": 0.3728, "step": 80 }, { "epoch": 0.2546669286696797, "grad_norm": 0.18334272503852844, "learning_rate": 9.662314272850068e-05, "loss": 0.3617, "step": 81 }, { "epoch": 0.2578109648260955, "grad_norm": 0.20095829665660858, "learning_rate": 9.657811796488068e-05, "loss": 0.3632, "step": 82 }, { "epoch": 0.2609550009825113, "grad_norm": 0.17583882808685303, "learning_rate": 9.653309320126069e-05, "loss": 0.3582, "step": 83 }, { "epoch": 0.2640990371389271, "grad_norm": 0.19473253190517426, "learning_rate": 9.648806843764072e-05, "loss": 0.3643, "step": 84 }, { "epoch": 0.2672430732953429, "grad_norm": 0.1956205517053604, "learning_rate": 9.644304367402071e-05, "loss": 0.336, "step": 85 }, { "epoch": 0.2703871094517587, "grad_norm": 0.19884824752807617, "learning_rate": 9.639801891040072e-05, "loss": 0.3626, "step": 86 }, { "epoch": 0.2735311456081745, "grad_norm": 0.18607290089130402, "learning_rate": 9.635299414678074e-05, "loss": 0.3274, "step": 87 }, { "epoch": 0.27667518176459027, "grad_norm": 0.18494442105293274, "learning_rate": 9.630796938316075e-05, "loss": 0.3437, "step": 88 }, { "epoch": 0.2798192179210061, "grad_norm": 0.18413978815078735, "learning_rate": 9.626294461954074e-05, "loss": 0.3604, "step": 89 }, { "epoch": 0.2829632540774219, "grad_norm": 0.19610458612442017, "learning_rate": 9.621791985592075e-05, "loss": 0.3729, "step": 90 }, { "epoch": 0.2861072902338377, "grad_norm": 0.19458866119384766, "learning_rate": 9.617289509230078e-05, "loss": 0.3493, "step": 91 }, { "epoch": 0.28925132639025347, "grad_norm": 0.20431379973888397, "learning_rate": 9.612787032868078e-05, "loss": 0.3464, "step": 92 }, { "epoch": 0.2923953625466693, "grad_norm": 0.1833576112985611, "learning_rate": 9.608284556506079e-05, "loss": 0.3434, "step": 93 }, { "epoch": 0.2955393987030851, "grad_norm": 0.18712273240089417, "learning_rate": 9.603782080144079e-05, "loss": 0.3497, "step": 94 }, { "epoch": 0.2986834348595009, "grad_norm": 0.19049568474292755, "learning_rate": 9.599279603782081e-05, "loss": 0.3579, "step": 95 }, { "epoch": 0.30182747101591667, "grad_norm": 0.18482261896133423, "learning_rate": 9.594777127420082e-05, "loss": 0.3714, "step": 96 }, { "epoch": 0.30497150717233246, "grad_norm": 0.23206727206707, "learning_rate": 9.590274651058083e-05, "loss": 0.3457, "step": 97 }, { "epoch": 0.3081155433287483, "grad_norm": 0.18018409609794617, "learning_rate": 9.585772174696084e-05, "loss": 0.3507, "step": 98 }, { "epoch": 0.3112595794851641, "grad_norm": 0.1778680980205536, "learning_rate": 9.581269698334084e-05, "loss": 0.3455, "step": 99 }, { "epoch": 0.31440361564157987, "grad_norm": 0.19341802597045898, "learning_rate": 9.576767221972085e-05, "loss": 0.3522, "step": 100 }, { "epoch": 0.31754765179799566, "grad_norm": 0.18757164478302002, "learning_rate": 9.572264745610086e-05, "loss": 0.331, "step": 101 }, { "epoch": 0.3206916879544115, "grad_norm": 0.1873527616262436, "learning_rate": 9.567762269248087e-05, "loss": 0.3341, "step": 102 }, { "epoch": 0.3238357241108273, "grad_norm": 0.19451723992824554, "learning_rate": 9.563259792886088e-05, "loss": 0.3726, "step": 103 }, { "epoch": 0.32697976026724307, "grad_norm": 0.19554930925369263, "learning_rate": 9.558757316524089e-05, "loss": 0.3568, "step": 104 }, { "epoch": 0.33012379642365886, "grad_norm": 0.18672047555446625, "learning_rate": 9.55425484016209e-05, "loss": 0.3666, "step": 105 }, { "epoch": 0.3332678325800747, "grad_norm": 0.18587937951087952, "learning_rate": 9.54975236380009e-05, "loss": 0.3239, "step": 106 }, { "epoch": 0.3364118687364905, "grad_norm": 0.18024438619613647, "learning_rate": 9.545249887438091e-05, "loss": 0.3492, "step": 107 }, { "epoch": 0.33955590489290627, "grad_norm": 0.18240226805210114, "learning_rate": 9.540747411076092e-05, "loss": 0.3351, "step": 108 }, { "epoch": 0.34269994104932205, "grad_norm": 0.16186107695102692, "learning_rate": 9.536244934714093e-05, "loss": 0.3139, "step": 109 }, { "epoch": 0.34584397720573784, "grad_norm": 0.16864165663719177, "learning_rate": 9.531742458352094e-05, "loss": 0.3321, "step": 110 }, { "epoch": 0.3489880133621537, "grad_norm": 0.1825931966304779, "learning_rate": 9.527239981990095e-05, "loss": 0.3531, "step": 111 }, { "epoch": 0.35213204951856947, "grad_norm": 0.17394313216209412, "learning_rate": 9.522737505628096e-05, "loss": 0.3626, "step": 112 }, { "epoch": 0.35527608567498525, "grad_norm": 0.18193919956684113, "learning_rate": 9.518235029266098e-05, "loss": 0.342, "step": 113 }, { "epoch": 0.35842012183140104, "grad_norm": 0.18491008877754211, "learning_rate": 9.513732552904097e-05, "loss": 0.348, "step": 114 }, { "epoch": 0.3615641579878169, "grad_norm": 0.19338466227054596, "learning_rate": 9.509230076542098e-05, "loss": 0.3531, "step": 115 }, { "epoch": 0.36470819414423267, "grad_norm": 0.18233619630336761, "learning_rate": 9.5047276001801e-05, "loss": 0.3147, "step": 116 }, { "epoch": 0.36785223030064845, "grad_norm": 0.1774706244468689, "learning_rate": 9.500225123818101e-05, "loss": 0.3442, "step": 117 }, { "epoch": 0.37099626645706424, "grad_norm": 0.1780499666929245, "learning_rate": 9.495722647456101e-05, "loss": 0.3368, "step": 118 }, { "epoch": 0.3741403026134801, "grad_norm": 0.18548406660556793, "learning_rate": 9.491220171094102e-05, "loss": 0.3429, "step": 119 }, { "epoch": 0.37728433876989587, "grad_norm": 0.18113106489181519, "learning_rate": 9.486717694732104e-05, "loss": 0.3278, "step": 120 }, { "epoch": 0.38042837492631165, "grad_norm": 0.18168263137340546, "learning_rate": 9.482215218370105e-05, "loss": 0.3127, "step": 121 }, { "epoch": 0.38357241108272744, "grad_norm": 0.17432525753974915, "learning_rate": 9.477712742008104e-05, "loss": 0.3594, "step": 122 }, { "epoch": 0.3867164472391432, "grad_norm": 0.187408447265625, "learning_rate": 9.473210265646106e-05, "loss": 0.355, "step": 123 }, { "epoch": 0.38986048339555907, "grad_norm": 0.17972330749034882, "learning_rate": 9.468707789284107e-05, "loss": 0.3493, "step": 124 }, { "epoch": 0.39300451955197485, "grad_norm": 0.17262862622737885, "learning_rate": 9.464205312922108e-05, "loss": 0.323, "step": 125 }, { "epoch": 0.39614855570839064, "grad_norm": 0.26467645168304443, "learning_rate": 9.459702836560108e-05, "loss": 0.3643, "step": 126 }, { "epoch": 0.3992925918648064, "grad_norm": 0.24252085387706757, "learning_rate": 9.45520036019811e-05, "loss": 0.3573, "step": 127 }, { "epoch": 0.40243662802122226, "grad_norm": 0.18881508708000183, "learning_rate": 9.45069788383611e-05, "loss": 0.325, "step": 128 }, { "epoch": 0.40558066417763805, "grad_norm": 0.22236384451389313, "learning_rate": 9.446195407474112e-05, "loss": 0.3806, "step": 129 }, { "epoch": 0.40872470033405384, "grad_norm": 0.19904322922229767, "learning_rate": 9.441692931112111e-05, "loss": 0.4056, "step": 130 }, { "epoch": 0.4118687364904696, "grad_norm": 0.18705110251903534, "learning_rate": 9.437190454750113e-05, "loss": 0.3346, "step": 131 }, { "epoch": 0.41501277264688546, "grad_norm": 0.18179073929786682, "learning_rate": 9.432687978388114e-05, "loss": 0.3136, "step": 132 }, { "epoch": 0.41815680880330125, "grad_norm": 0.17651726305484772, "learning_rate": 9.428185502026115e-05, "loss": 0.3267, "step": 133 }, { "epoch": 0.42130084495971704, "grad_norm": 0.1757514625787735, "learning_rate": 9.423683025664116e-05, "loss": 0.3442, "step": 134 }, { "epoch": 0.4244448811161328, "grad_norm": 0.18630896508693695, "learning_rate": 9.419180549302117e-05, "loss": 0.3357, "step": 135 }, { "epoch": 0.4275889172725486, "grad_norm": 0.17533083260059357, "learning_rate": 9.414678072940118e-05, "loss": 0.3253, "step": 136 }, { "epoch": 0.43073295342896445, "grad_norm": 0.17959101498126984, "learning_rate": 9.410175596578118e-05, "loss": 0.3185, "step": 137 }, { "epoch": 0.43387698958538023, "grad_norm": 0.1812899112701416, "learning_rate": 9.405673120216119e-05, "loss": 0.3502, "step": 138 }, { "epoch": 0.437021025741796, "grad_norm": 0.18919287621974945, "learning_rate": 9.40117064385412e-05, "loss": 0.3326, "step": 139 }, { "epoch": 0.4401650618982118, "grad_norm": 0.19101746380329132, "learning_rate": 9.396668167492121e-05, "loss": 0.2964, "step": 140 }, { "epoch": 0.44330909805462765, "grad_norm": 7.04909086227417, "learning_rate": 9.392165691130123e-05, "loss": 0.366, "step": 141 }, { "epoch": 0.44645313421104343, "grad_norm": 0.23961827158927917, "learning_rate": 9.387663214768123e-05, "loss": 0.3251, "step": 142 }, { "epoch": 0.4495971703674592, "grad_norm": 0.18671870231628418, "learning_rate": 9.383160738406124e-05, "loss": 0.3513, "step": 143 }, { "epoch": 0.452741206523875, "grad_norm": 0.25953730940818787, "learning_rate": 9.378658262044124e-05, "loss": 0.3189, "step": 144 }, { "epoch": 0.45588524268029085, "grad_norm": 0.17804424464702606, "learning_rate": 9.374155785682127e-05, "loss": 0.3426, "step": 145 }, { "epoch": 0.45902927883670663, "grad_norm": 0.19183290004730225, "learning_rate": 9.369653309320126e-05, "loss": 0.3388, "step": 146 }, { "epoch": 0.4621733149931224, "grad_norm": 0.1751260757446289, "learning_rate": 9.365150832958127e-05, "loss": 0.3465, "step": 147 }, { "epoch": 0.4653173511495382, "grad_norm": 0.17152872681617737, "learning_rate": 9.360648356596128e-05, "loss": 0.3009, "step": 148 }, { "epoch": 0.468461387305954, "grad_norm": 0.17340736091136932, "learning_rate": 9.35614588023413e-05, "loss": 0.3377, "step": 149 }, { "epoch": 0.47160542346236983, "grad_norm": 0.17497164011001587, "learning_rate": 9.35164340387213e-05, "loss": 0.3512, "step": 150 }, { "epoch": 0.4747494596187856, "grad_norm": 0.18566282093524933, "learning_rate": 9.34714092751013e-05, "loss": 0.3208, "step": 151 }, { "epoch": 0.4778934957752014, "grad_norm": 0.20263151824474335, "learning_rate": 9.342638451148133e-05, "loss": 0.3543, "step": 152 }, { "epoch": 0.4810375319316172, "grad_norm": 0.19179081916809082, "learning_rate": 9.338135974786133e-05, "loss": 0.3387, "step": 153 }, { "epoch": 0.48418156808803303, "grad_norm": 0.19308720529079437, "learning_rate": 9.333633498424133e-05, "loss": 0.3679, "step": 154 }, { "epoch": 0.4873256042444488, "grad_norm": 0.1667911857366562, "learning_rate": 9.329131022062134e-05, "loss": 0.3243, "step": 155 }, { "epoch": 0.4904696404008646, "grad_norm": 0.17789964377880096, "learning_rate": 9.324628545700136e-05, "loss": 0.3291, "step": 156 }, { "epoch": 0.4936136765572804, "grad_norm": 0.17497336864471436, "learning_rate": 9.320126069338137e-05, "loss": 0.3347, "step": 157 }, { "epoch": 0.49675771271369623, "grad_norm": 0.1668512523174286, "learning_rate": 9.315623592976136e-05, "loss": 0.3074, "step": 158 }, { "epoch": 0.499901748870112, "grad_norm": 0.18032796680927277, "learning_rate": 9.311121116614139e-05, "loss": 0.3242, "step": 159 }, { "epoch": 0.5030457850265279, "grad_norm": 0.19095478951931, "learning_rate": 9.30661864025214e-05, "loss": 0.3309, "step": 160 }, { "epoch": 0.5061898211829436, "grad_norm": 0.17513571679592133, "learning_rate": 9.30211616389014e-05, "loss": 0.354, "step": 161 }, { "epoch": 0.5093338573393594, "grad_norm": 0.17440561950206757, "learning_rate": 9.29761368752814e-05, "loss": 0.3447, "step": 162 }, { "epoch": 0.5124778934957752, "grad_norm": 0.17587585747241974, "learning_rate": 9.293111211166142e-05, "loss": 0.347, "step": 163 }, { "epoch": 0.515621929652191, "grad_norm": 0.17777486145496368, "learning_rate": 9.288608734804143e-05, "loss": 0.3216, "step": 164 }, { "epoch": 0.5187659658086068, "grad_norm": 0.17235027253627777, "learning_rate": 9.284106258442144e-05, "loss": 0.342, "step": 165 }, { "epoch": 0.5219100019650226, "grad_norm": 0.17032384872436523, "learning_rate": 9.279603782080145e-05, "loss": 0.3235, "step": 166 }, { "epoch": 0.5250540381214384, "grad_norm": 0.1659417450428009, "learning_rate": 9.275101305718146e-05, "loss": 0.3294, "step": 167 }, { "epoch": 0.5281980742778541, "grad_norm": 0.1650734841823578, "learning_rate": 9.270598829356146e-05, "loss": 0.3179, "step": 168 }, { "epoch": 0.53134211043427, "grad_norm": 0.1897146999835968, "learning_rate": 9.266096352994147e-05, "loss": 0.3436, "step": 169 }, { "epoch": 0.5344861465906858, "grad_norm": 0.18100985884666443, "learning_rate": 9.261593876632148e-05, "loss": 0.3378, "step": 170 }, { "epoch": 0.5376301827471016, "grad_norm": 0.18976901471614838, "learning_rate": 9.257091400270149e-05, "loss": 0.3329, "step": 171 }, { "epoch": 0.5407742189035174, "grad_norm": 0.18210701644420624, "learning_rate": 9.25258892390815e-05, "loss": 0.3566, "step": 172 }, { "epoch": 0.5439182550599332, "grad_norm": 0.1779012680053711, "learning_rate": 9.24808644754615e-05, "loss": 0.3063, "step": 173 }, { "epoch": 0.547062291216349, "grad_norm": 0.16529639065265656, "learning_rate": 9.243583971184152e-05, "loss": 0.3234, "step": 174 }, { "epoch": 0.5502063273727648, "grad_norm": 0.24405060708522797, "learning_rate": 9.239081494822152e-05, "loss": 0.3321, "step": 175 }, { "epoch": 0.5533503635291805, "grad_norm": 0.16497737169265747, "learning_rate": 9.234579018460153e-05, "loss": 0.3337, "step": 176 }, { "epoch": 0.5564943996855963, "grad_norm": 0.47097891569137573, "learning_rate": 9.230076542098155e-05, "loss": 0.3378, "step": 177 }, { "epoch": 0.5596384358420122, "grad_norm": 0.18612946569919586, "learning_rate": 9.225574065736155e-05, "loss": 0.3134, "step": 178 }, { "epoch": 0.562782471998428, "grad_norm": 0.2161218822002411, "learning_rate": 9.221071589374156e-05, "loss": 0.3345, "step": 179 }, { "epoch": 0.5659265081548438, "grad_norm": 0.19805237650871277, "learning_rate": 9.216569113012157e-05, "loss": 0.3342, "step": 180 }, { "epoch": 0.5690705443112596, "grad_norm": 0.17592518031597137, "learning_rate": 9.212066636650159e-05, "loss": 0.3454, "step": 181 }, { "epoch": 0.5722145804676754, "grad_norm": 0.18876737356185913, "learning_rate": 9.207564160288158e-05, "loss": 0.344, "step": 182 }, { "epoch": 0.5753586166240912, "grad_norm": 0.18281705677509308, "learning_rate": 9.203061683926159e-05, "loss": 0.3277, "step": 183 }, { "epoch": 0.5785026527805069, "grad_norm": 0.18671815097332, "learning_rate": 9.19855920756416e-05, "loss": 0.319, "step": 184 }, { "epoch": 0.5816466889369227, "grad_norm": 0.1737174689769745, "learning_rate": 9.194056731202162e-05, "loss": 0.3554, "step": 185 }, { "epoch": 0.5847907250933386, "grad_norm": 0.16264449059963226, "learning_rate": 9.189554254840163e-05, "loss": 0.3404, "step": 186 }, { "epoch": 0.5879347612497544, "grad_norm": 0.16205957531929016, "learning_rate": 9.185051778478163e-05, "loss": 0.3345, "step": 187 }, { "epoch": 0.5910787974061702, "grad_norm": 0.16299399733543396, "learning_rate": 9.180549302116165e-05, "loss": 0.3406, "step": 188 }, { "epoch": 0.594222833562586, "grad_norm": 0.16929860413074493, "learning_rate": 9.176046825754166e-05, "loss": 0.3351, "step": 189 }, { "epoch": 0.5973668697190018, "grad_norm": 0.18242709338665009, "learning_rate": 9.171544349392167e-05, "loss": 0.3277, "step": 190 }, { "epoch": 0.6005109058754176, "grad_norm": 0.1715114861726761, "learning_rate": 9.167041873030166e-05, "loss": 0.3341, "step": 191 }, { "epoch": 0.6036549420318333, "grad_norm": 0.1673378199338913, "learning_rate": 9.162539396668168e-05, "loss": 0.316, "step": 192 }, { "epoch": 0.6067989781882491, "grad_norm": 0.1861652284860611, "learning_rate": 9.158036920306169e-05, "loss": 0.3425, "step": 193 }, { "epoch": 0.6099430143446649, "grad_norm": 0.170218825340271, "learning_rate": 9.15353444394417e-05, "loss": 0.3495, "step": 194 }, { "epoch": 0.6130870505010808, "grad_norm": 0.16409920156002045, "learning_rate": 9.149031967582171e-05, "loss": 0.3216, "step": 195 }, { "epoch": 0.6162310866574966, "grad_norm": 0.1930875927209854, "learning_rate": 9.144529491220172e-05, "loss": 0.3221, "step": 196 }, { "epoch": 0.6193751228139124, "grad_norm": 0.168474480509758, "learning_rate": 9.140027014858173e-05, "loss": 0.3483, "step": 197 }, { "epoch": 0.6225191589703282, "grad_norm": 0.1649659276008606, "learning_rate": 9.135524538496173e-05, "loss": 0.3281, "step": 198 }, { "epoch": 0.625663195126744, "grad_norm": 0.16725848615169525, "learning_rate": 9.131022062134174e-05, "loss": 0.349, "step": 199 }, { "epoch": 0.6288072312831597, "grad_norm": 0.16848574578762054, "learning_rate": 9.126519585772175e-05, "loss": 0.3467, "step": 200 }, { "epoch": 0.6319512674395755, "grad_norm": 0.17817632853984833, "learning_rate": 9.122017109410176e-05, "loss": 0.3468, "step": 201 }, { "epoch": 0.6350953035959913, "grad_norm": 0.16884905099868774, "learning_rate": 9.117514633048177e-05, "loss": 0.3197, "step": 202 }, { "epoch": 0.6382393397524071, "grad_norm": 0.16829445958137512, "learning_rate": 9.113012156686178e-05, "loss": 0.3495, "step": 203 }, { "epoch": 0.641383375908823, "grad_norm": 0.1753387451171875, "learning_rate": 9.108509680324179e-05, "loss": 0.3549, "step": 204 }, { "epoch": 0.6445274120652388, "grad_norm": 0.17498289048671722, "learning_rate": 9.10400720396218e-05, "loss": 0.3169, "step": 205 }, { "epoch": 0.6476714482216546, "grad_norm": 0.17499548196792603, "learning_rate": 9.09950472760018e-05, "loss": 0.3226, "step": 206 }, { "epoch": 0.6508154843780704, "grad_norm": 0.17783628404140472, "learning_rate": 9.095002251238181e-05, "loss": 0.3355, "step": 207 }, { "epoch": 0.6539595205344861, "grad_norm": 0.16701580584049225, "learning_rate": 9.090499774876182e-05, "loss": 0.3348, "step": 208 }, { "epoch": 0.6571035566909019, "grad_norm": 0.1692950576543808, "learning_rate": 9.085997298514183e-05, "loss": 0.3117, "step": 209 }, { "epoch": 0.6602475928473177, "grad_norm": 0.17733407020568848, "learning_rate": 9.081494822152185e-05, "loss": 0.3254, "step": 210 }, { "epoch": 0.6633916290037335, "grad_norm": 0.18444949388504028, "learning_rate": 9.076992345790185e-05, "loss": 0.3243, "step": 211 }, { "epoch": 0.6665356651601494, "grad_norm": 0.1709858626127243, "learning_rate": 9.072489869428186e-05, "loss": 0.3437, "step": 212 }, { "epoch": 0.6696797013165652, "grad_norm": 0.16070497035980225, "learning_rate": 9.067987393066188e-05, "loss": 0.3205, "step": 213 }, { "epoch": 0.672823737472981, "grad_norm": 0.16293945908546448, "learning_rate": 9.063484916704189e-05, "loss": 0.3178, "step": 214 }, { "epoch": 0.6759677736293968, "grad_norm": 0.17348802089691162, "learning_rate": 9.058982440342188e-05, "loss": 0.3434, "step": 215 }, { "epoch": 0.6791118097858125, "grad_norm": 0.16067078709602356, "learning_rate": 9.054479963980189e-05, "loss": 0.3051, "step": 216 }, { "epoch": 0.6822558459422283, "grad_norm": 0.1788797527551651, "learning_rate": 9.049977487618191e-05, "loss": 0.3311, "step": 217 }, { "epoch": 0.6853998820986441, "grad_norm": 0.17016440629959106, "learning_rate": 9.045475011256192e-05, "loss": 0.3248, "step": 218 }, { "epoch": 0.6885439182550599, "grad_norm": 0.18454566597938538, "learning_rate": 9.040972534894192e-05, "loss": 0.3144, "step": 219 }, { "epoch": 0.6916879544114757, "grad_norm": 0.1694164127111435, "learning_rate": 9.036470058532192e-05, "loss": 0.3263, "step": 220 }, { "epoch": 0.6948319905678916, "grad_norm": 0.1772613525390625, "learning_rate": 9.031967582170195e-05, "loss": 0.3468, "step": 221 }, { "epoch": 0.6979760267243074, "grad_norm": 0.16401882469654083, "learning_rate": 9.027465105808195e-05, "loss": 0.342, "step": 222 }, { "epoch": 0.7011200628807231, "grad_norm": 0.16261254251003265, "learning_rate": 9.022962629446195e-05, "loss": 0.3249, "step": 223 }, { "epoch": 0.7042640990371389, "grad_norm": 0.17154066264629364, "learning_rate": 9.018460153084197e-05, "loss": 0.3631, "step": 224 }, { "epoch": 0.7074081351935547, "grad_norm": 0.18076153099536896, "learning_rate": 9.013957676722198e-05, "loss": 0.3282, "step": 225 }, { "epoch": 0.7105521713499705, "grad_norm": 0.15930242836475372, "learning_rate": 9.009455200360199e-05, "loss": 0.3233, "step": 226 }, { "epoch": 0.7136962075063863, "grad_norm": 0.16669179499149323, "learning_rate": 9.004952723998198e-05, "loss": 0.307, "step": 227 }, { "epoch": 0.7168402436628021, "grad_norm": 0.18358565866947174, "learning_rate": 9.0004502476362e-05, "loss": 0.3719, "step": 228 }, { "epoch": 0.719984279819218, "grad_norm": 0.16769863665103912, "learning_rate": 8.995947771274201e-05, "loss": 0.3081, "step": 229 }, { "epoch": 0.7231283159756338, "grad_norm": 0.1651238203048706, "learning_rate": 8.991445294912202e-05, "loss": 0.3229, "step": 230 }, { "epoch": 0.7262723521320495, "grad_norm": 0.18452374637126923, "learning_rate": 8.986942818550203e-05, "loss": 0.3249, "step": 231 }, { "epoch": 0.7294163882884653, "grad_norm": 0.17209681868553162, "learning_rate": 8.982440342188204e-05, "loss": 0.3444, "step": 232 }, { "epoch": 0.7325604244448811, "grad_norm": 0.17528848350048065, "learning_rate": 8.977937865826205e-05, "loss": 0.3217, "step": 233 }, { "epoch": 0.7357044606012969, "grad_norm": 0.16508957743644714, "learning_rate": 8.973435389464206e-05, "loss": 0.3098, "step": 234 }, { "epoch": 0.7388484967577127, "grad_norm": 0.171140655875206, "learning_rate": 8.968932913102207e-05, "loss": 0.367, "step": 235 }, { "epoch": 0.7419925329141285, "grad_norm": 0.16529837250709534, "learning_rate": 8.964430436740207e-05, "loss": 0.2911, "step": 236 }, { "epoch": 0.7451365690705443, "grad_norm": 0.1798229068517685, "learning_rate": 8.959927960378208e-05, "loss": 0.3264, "step": 237 }, { "epoch": 0.7482806052269602, "grad_norm": 0.17085868120193481, "learning_rate": 8.955425484016209e-05, "loss": 0.3109, "step": 238 }, { "epoch": 0.751424641383376, "grad_norm": 0.17515264451503754, "learning_rate": 8.95092300765421e-05, "loss": 0.317, "step": 239 }, { "epoch": 0.7545686775397917, "grad_norm": 0.18475565314292908, "learning_rate": 8.946420531292211e-05, "loss": 0.3683, "step": 240 }, { "epoch": 0.7577127136962075, "grad_norm": 0.16714327037334442, "learning_rate": 8.941918054930212e-05, "loss": 0.3429, "step": 241 }, { "epoch": 0.7608567498526233, "grad_norm": 0.15969350934028625, "learning_rate": 8.937415578568214e-05, "loss": 0.3331, "step": 242 }, { "epoch": 0.7640007860090391, "grad_norm": 0.16738007962703705, "learning_rate": 8.932913102206213e-05, "loss": 0.3332, "step": 243 }, { "epoch": 0.7671448221654549, "grad_norm": 0.15596827864646912, "learning_rate": 8.928410625844214e-05, "loss": 0.2864, "step": 244 }, { "epoch": 0.7702888583218707, "grad_norm": 0.16769914329051971, "learning_rate": 8.923908149482215e-05, "loss": 0.3411, "step": 245 }, { "epoch": 0.7734328944782864, "grad_norm": 0.1581619828939438, "learning_rate": 8.919405673120217e-05, "loss": 0.3271, "step": 246 }, { "epoch": 0.7765769306347023, "grad_norm": 0.18669439852237701, "learning_rate": 8.914903196758217e-05, "loss": 0.3363, "step": 247 }, { "epoch": 0.7797209667911181, "grad_norm": 0.1833750307559967, "learning_rate": 8.910400720396218e-05, "loss": 0.3425, "step": 248 }, { "epoch": 0.7828650029475339, "grad_norm": 0.16842873394489288, "learning_rate": 8.90589824403422e-05, "loss": 0.3019, "step": 249 }, { "epoch": 0.7860090391039497, "grad_norm": 0.1643659919500351, "learning_rate": 8.901395767672221e-05, "loss": 0.3301, "step": 250 }, { "epoch": 0.7891530752603655, "grad_norm": 0.17208907008171082, "learning_rate": 8.89689329131022e-05, "loss": 0.3469, "step": 251 }, { "epoch": 0.7922971114167813, "grad_norm": 0.16336563229560852, "learning_rate": 8.892390814948221e-05, "loss": 0.325, "step": 252 }, { "epoch": 0.7954411475731971, "grad_norm": 0.17350764572620392, "learning_rate": 8.887888338586223e-05, "loss": 0.3486, "step": 253 }, { "epoch": 0.7985851837296128, "grad_norm": 0.15856927633285522, "learning_rate": 8.883385862224224e-05, "loss": 0.3223, "step": 254 }, { "epoch": 0.8017292198860287, "grad_norm": 0.16306869685649872, "learning_rate": 8.878883385862224e-05, "loss": 0.3164, "step": 255 }, { "epoch": 0.8048732560424445, "grad_norm": 0.1610950231552124, "learning_rate": 8.874380909500225e-05, "loss": 0.3146, "step": 256 }, { "epoch": 0.8080172921988603, "grad_norm": 0.18235592544078827, "learning_rate": 8.869878433138227e-05, "loss": 0.3259, "step": 257 }, { "epoch": 0.8111613283552761, "grad_norm": 0.1566954404115677, "learning_rate": 8.865375956776228e-05, "loss": 0.29, "step": 258 }, { "epoch": 0.8143053645116919, "grad_norm": 0.17046710848808289, "learning_rate": 8.860873480414229e-05, "loss": 0.3419, "step": 259 }, { "epoch": 0.8174494006681077, "grad_norm": 0.1749659776687622, "learning_rate": 8.85637100405223e-05, "loss": 0.3038, "step": 260 }, { "epoch": 0.8205934368245235, "grad_norm": 0.1782928705215454, "learning_rate": 8.85186852769023e-05, "loss": 0.3123, "step": 261 }, { "epoch": 0.8237374729809392, "grad_norm": 0.16543257236480713, "learning_rate": 8.847366051328231e-05, "loss": 0.2995, "step": 262 }, { "epoch": 0.826881509137355, "grad_norm": 0.17038169503211975, "learning_rate": 8.842863574966232e-05, "loss": 0.3437, "step": 263 }, { "epoch": 0.8300255452937709, "grad_norm": 0.16956864297389984, "learning_rate": 8.838361098604233e-05, "loss": 0.3208, "step": 264 }, { "epoch": 0.8331695814501867, "grad_norm": 0.16261757910251617, "learning_rate": 8.833858622242234e-05, "loss": 0.323, "step": 265 }, { "epoch": 0.8363136176066025, "grad_norm": 0.1713312268257141, "learning_rate": 8.829356145880235e-05, "loss": 0.3261, "step": 266 }, { "epoch": 0.8394576537630183, "grad_norm": 0.16358059644699097, "learning_rate": 8.824853669518235e-05, "loss": 0.3207, "step": 267 }, { "epoch": 0.8426016899194341, "grad_norm": 0.1711219847202301, "learning_rate": 8.820351193156236e-05, "loss": 0.3343, "step": 268 }, { "epoch": 0.8457457260758499, "grad_norm": 0.15430651605129242, "learning_rate": 8.815848716794237e-05, "loss": 0.3004, "step": 269 }, { "epoch": 0.8488897622322656, "grad_norm": 0.1770448386669159, "learning_rate": 8.811346240432238e-05, "loss": 0.3483, "step": 270 }, { "epoch": 0.8520337983886814, "grad_norm": 0.17468826472759247, "learning_rate": 8.806843764070239e-05, "loss": 0.3382, "step": 271 }, { "epoch": 0.8551778345450972, "grad_norm": 0.16027683019638062, "learning_rate": 8.80234128770824e-05, "loss": 0.3178, "step": 272 }, { "epoch": 0.8583218707015131, "grad_norm": 0.1797255426645279, "learning_rate": 8.79783881134624e-05, "loss": 0.339, "step": 273 }, { "epoch": 0.8614659068579289, "grad_norm": 0.17427705228328705, "learning_rate": 8.793336334984241e-05, "loss": 0.334, "step": 274 }, { "epoch": 0.8646099430143447, "grad_norm": 0.16854874789714813, "learning_rate": 8.788833858622242e-05, "loss": 0.3512, "step": 275 }, { "epoch": 0.8677539791707605, "grad_norm": 0.1548936814069748, "learning_rate": 8.784331382260243e-05, "loss": 0.2863, "step": 276 }, { "epoch": 0.8708980153271763, "grad_norm": 0.17063087224960327, "learning_rate": 8.779828905898244e-05, "loss": 0.3299, "step": 277 }, { "epoch": 0.874042051483592, "grad_norm": 0.17423272132873535, "learning_rate": 8.775326429536246e-05, "loss": 0.3161, "step": 278 }, { "epoch": 0.8771860876400078, "grad_norm": 0.16254863142967224, "learning_rate": 8.770823953174246e-05, "loss": 0.3236, "step": 279 }, { "epoch": 0.8803301237964236, "grad_norm": 0.16803030669689178, "learning_rate": 8.766321476812247e-05, "loss": 0.3288, "step": 280 }, { "epoch": 0.8834741599528395, "grad_norm": 0.1748911589384079, "learning_rate": 8.761819000450247e-05, "loss": 0.2984, "step": 281 }, { "epoch": 0.8866181961092553, "grad_norm": 0.16277071833610535, "learning_rate": 8.75731652408825e-05, "loss": 0.2983, "step": 282 }, { "epoch": 0.8897622322656711, "grad_norm": 0.16375455260276794, "learning_rate": 8.75281404772625e-05, "loss": 0.3339, "step": 283 }, { "epoch": 0.8929062684220869, "grad_norm": 0.15943491458892822, "learning_rate": 8.74831157136425e-05, "loss": 0.3414, "step": 284 }, { "epoch": 0.8960503045785027, "grad_norm": 0.16314157843589783, "learning_rate": 8.743809095002252e-05, "loss": 0.3207, "step": 285 }, { "epoch": 0.8991943407349184, "grad_norm": 0.16236723959445953, "learning_rate": 8.739306618640253e-05, "loss": 0.3349, "step": 286 }, { "epoch": 0.9023383768913342, "grad_norm": 0.17521819472312927, "learning_rate": 8.734804142278254e-05, "loss": 0.3275, "step": 287 }, { "epoch": 0.90548241304775, "grad_norm": 0.2164747565984726, "learning_rate": 8.730301665916253e-05, "loss": 0.3288, "step": 288 }, { "epoch": 0.9086264492041658, "grad_norm": 0.16042940318584442, "learning_rate": 8.725799189554256e-05, "loss": 0.3214, "step": 289 }, { "epoch": 0.9117704853605817, "grad_norm": 0.16976606845855713, "learning_rate": 8.721296713192257e-05, "loss": 0.3112, "step": 290 }, { "epoch": 0.9149145215169975, "grad_norm": 0.17349150776863098, "learning_rate": 8.716794236830257e-05, "loss": 0.2979, "step": 291 }, { "epoch": 0.9180585576734133, "grad_norm": 0.15189234912395477, "learning_rate": 8.712291760468257e-05, "loss": 0.3083, "step": 292 }, { "epoch": 0.921202593829829, "grad_norm": 0.15766362845897675, "learning_rate": 8.707789284106259e-05, "loss": 0.3337, "step": 293 }, { "epoch": 0.9243466299862448, "grad_norm": 0.15773652493953705, "learning_rate": 8.70328680774426e-05, "loss": 0.3161, "step": 294 }, { "epoch": 0.9274906661426606, "grad_norm": 0.15952229499816895, "learning_rate": 8.698784331382261e-05, "loss": 0.3131, "step": 295 }, { "epoch": 0.9306347022990764, "grad_norm": 0.16705040633678436, "learning_rate": 8.694281855020262e-05, "loss": 0.321, "step": 296 }, { "epoch": 0.9337787384554922, "grad_norm": 0.16729433834552765, "learning_rate": 8.689779378658263e-05, "loss": 0.2895, "step": 297 }, { "epoch": 0.936922774611908, "grad_norm": 0.17739711701869965, "learning_rate": 8.685276902296263e-05, "loss": 0.3236, "step": 298 }, { "epoch": 0.9400668107683239, "grad_norm": 0.16125445067882538, "learning_rate": 8.680774425934264e-05, "loss": 0.3227, "step": 299 }, { "epoch": 0.9432108469247397, "grad_norm": 0.19061018526554108, "learning_rate": 8.676271949572265e-05, "loss": 0.3327, "step": 300 }, { "epoch": 0.9463548830811555, "grad_norm": 0.17478956282138824, "learning_rate": 8.671769473210266e-05, "loss": 0.3325, "step": 301 }, { "epoch": 0.9494989192375712, "grad_norm": 0.1599021852016449, "learning_rate": 8.667266996848267e-05, "loss": 0.3091, "step": 302 }, { "epoch": 0.952642955393987, "grad_norm": 0.16696953773498535, "learning_rate": 8.662764520486268e-05, "loss": 0.3211, "step": 303 }, { "epoch": 0.9557869915504028, "grad_norm": 0.16814808547496796, "learning_rate": 8.658262044124269e-05, "loss": 0.3284, "step": 304 }, { "epoch": 0.9589310277068186, "grad_norm": 0.15857313573360443, "learning_rate": 8.65375956776227e-05, "loss": 0.3224, "step": 305 }, { "epoch": 0.9620750638632344, "grad_norm": 0.15295511484146118, "learning_rate": 8.64925709140027e-05, "loss": 0.3315, "step": 306 }, { "epoch": 0.9652191000196503, "grad_norm": 0.21399492025375366, "learning_rate": 8.644754615038273e-05, "loss": 0.2922, "step": 307 }, { "epoch": 0.9683631361760661, "grad_norm": 0.17268632352352142, "learning_rate": 8.640252138676272e-05, "loss": 0.3564, "step": 308 }, { "epoch": 0.9715071723324818, "grad_norm": 0.17499002814292908, "learning_rate": 8.635749662314273e-05, "loss": 0.3385, "step": 309 }, { "epoch": 0.9746512084888976, "grad_norm": 0.170021191239357, "learning_rate": 8.631247185952274e-05, "loss": 0.3305, "step": 310 }, { "epoch": 0.9777952446453134, "grad_norm": 0.17455638945102692, "learning_rate": 8.626744709590276e-05, "loss": 0.3107, "step": 311 }, { "epoch": 0.9809392808017292, "grad_norm": 0.16129587590694427, "learning_rate": 8.622242233228275e-05, "loss": 0.3182, "step": 312 }, { "epoch": 0.984083316958145, "grad_norm": 0.161848783493042, "learning_rate": 8.617739756866276e-05, "loss": 0.3368, "step": 313 }, { "epoch": 0.9872273531145608, "grad_norm": 0.14891745150089264, "learning_rate": 8.613237280504279e-05, "loss": 0.2923, "step": 314 }, { "epoch": 0.9903713892709766, "grad_norm": 0.1604168862104416, "learning_rate": 8.60873480414228e-05, "loss": 0.3197, "step": 315 }, { "epoch": 0.9935154254273925, "grad_norm": 0.15211592614650726, "learning_rate": 8.604232327780279e-05, "loss": 0.3197, "step": 316 }, { "epoch": 0.9966594615838082, "grad_norm": 0.1654754877090454, "learning_rate": 8.59972985141828e-05, "loss": 0.3108, "step": 317 }, { "epoch": 0.999803497740224, "grad_norm": 0.1642957627773285, "learning_rate": 8.595227375056282e-05, "loss": 0.3337, "step": 318 }, { "epoch": 1.0, "grad_norm": 0.6600321531295776, "learning_rate": 8.590724898694283e-05, "loss": 0.3815, "step": 319 }, { "epoch": 1.0, "eval_loss": 0.3256175220012665, "eval_runtime": 102.4846, "eval_samples_per_second": 12.412, "eval_steps_per_second": 12.412, "step": 319 }, { "epoch": 1.003144036156416, "grad_norm": 0.15727423131465912, "learning_rate": 8.586222422332282e-05, "loss": 0.2867, "step": 320 }, { "epoch": 1.0062880723128316, "grad_norm": 0.17558008432388306, "learning_rate": 8.581719945970285e-05, "loss": 0.3426, "step": 321 }, { "epoch": 1.0094321084692475, "grad_norm": 0.1863006204366684, "learning_rate": 8.577217469608285e-05, "loss": 0.2946, "step": 322 }, { "epoch": 1.0125761446256631, "grad_norm": 0.20207758247852325, "learning_rate": 8.572714993246286e-05, "loss": 0.2872, "step": 323 }, { "epoch": 1.015720180782079, "grad_norm": 0.17089968919754028, "learning_rate": 8.568212516884286e-05, "loss": 0.3098, "step": 324 }, { "epoch": 1.0188642169384947, "grad_norm": 0.18078500032424927, "learning_rate": 8.563710040522288e-05, "loss": 0.2901, "step": 325 }, { "epoch": 1.0220082530949106, "grad_norm": 0.18292267620563507, "learning_rate": 8.559207564160289e-05, "loss": 0.332, "step": 326 }, { "epoch": 1.0251522892513263, "grad_norm": 0.16480115056037903, "learning_rate": 8.55470508779829e-05, "loss": 0.3042, "step": 327 }, { "epoch": 1.0282963254077422, "grad_norm": 0.16814446449279785, "learning_rate": 8.550202611436289e-05, "loss": 0.319, "step": 328 }, { "epoch": 1.031440361564158, "grad_norm": 0.16136884689331055, "learning_rate": 8.545700135074291e-05, "loss": 0.291, "step": 329 }, { "epoch": 1.0345843977205738, "grad_norm": 0.17144669592380524, "learning_rate": 8.541197658712292e-05, "loss": 0.2852, "step": 330 }, { "epoch": 1.0377284338769897, "grad_norm": 0.1635693609714508, "learning_rate": 8.536695182350293e-05, "loss": 0.2684, "step": 331 }, { "epoch": 1.0408724700334053, "grad_norm": 0.1604490727186203, "learning_rate": 8.532192705988294e-05, "loss": 0.3093, "step": 332 }, { "epoch": 1.0440165061898212, "grad_norm": 0.15917396545410156, "learning_rate": 8.527690229626295e-05, "loss": 0.2697, "step": 333 }, { "epoch": 1.047160542346237, "grad_norm": 0.17525093257427216, "learning_rate": 8.523187753264296e-05, "loss": 0.3084, "step": 334 }, { "epoch": 1.0503045785026528, "grad_norm": 0.1677919328212738, "learning_rate": 8.518685276902297e-05, "loss": 0.2974, "step": 335 }, { "epoch": 1.0534486146590685, "grad_norm": 0.17789426445960999, "learning_rate": 8.514182800540297e-05, "loss": 0.3062, "step": 336 }, { "epoch": 1.0565926508154844, "grad_norm": 0.16536547243595123, "learning_rate": 8.509680324178298e-05, "loss": 0.2872, "step": 337 }, { "epoch": 1.0597366869719003, "grad_norm": 0.17476080358028412, "learning_rate": 8.505177847816299e-05, "loss": 0.306, "step": 338 }, { "epoch": 1.062880723128316, "grad_norm": 0.1762908548116684, "learning_rate": 8.500675371454301e-05, "loss": 0.2878, "step": 339 }, { "epoch": 1.0660247592847318, "grad_norm": 0.17144866287708282, "learning_rate": 8.496172895092301e-05, "loss": 0.2915, "step": 340 }, { "epoch": 1.0691687954411475, "grad_norm": 0.16622525453567505, "learning_rate": 8.491670418730302e-05, "loss": 0.3169, "step": 341 }, { "epoch": 1.0723128315975634, "grad_norm": 0.17786164581775665, "learning_rate": 8.487167942368303e-05, "loss": 0.2731, "step": 342 }, { "epoch": 1.075456867753979, "grad_norm": 0.17761558294296265, "learning_rate": 8.482665466006305e-05, "loss": 0.2967, "step": 343 }, { "epoch": 1.078600903910395, "grad_norm": 0.16161416471004486, "learning_rate": 8.478162989644304e-05, "loss": 0.2887, "step": 344 }, { "epoch": 1.0817449400668107, "grad_norm": 0.1788141131401062, "learning_rate": 8.473660513282305e-05, "loss": 0.3456, "step": 345 }, { "epoch": 1.0848889762232266, "grad_norm": 0.17762236297130585, "learning_rate": 8.469158036920306e-05, "loss": 0.3078, "step": 346 }, { "epoch": 1.0880330123796425, "grad_norm": 0.16463209688663483, "learning_rate": 8.464655560558308e-05, "loss": 0.285, "step": 347 }, { "epoch": 1.0911770485360581, "grad_norm": 0.17661692202091217, "learning_rate": 8.460153084196308e-05, "loss": 0.3139, "step": 348 }, { "epoch": 1.094321084692474, "grad_norm": 0.170676589012146, "learning_rate": 8.455650607834309e-05, "loss": 0.3009, "step": 349 }, { "epoch": 1.0974651208488897, "grad_norm": 0.17777417600154877, "learning_rate": 8.451148131472311e-05, "loss": 0.2889, "step": 350 }, { "epoch": 1.1006091570053056, "grad_norm": 0.17952531576156616, "learning_rate": 8.446645655110312e-05, "loss": 0.3181, "step": 351 }, { "epoch": 1.1037531931617213, "grad_norm": 0.17377127707004547, "learning_rate": 8.442143178748311e-05, "loss": 0.3003, "step": 352 }, { "epoch": 1.1068972293181372, "grad_norm": 0.17013375461101532, "learning_rate": 8.437640702386312e-05, "loss": 0.3086, "step": 353 }, { "epoch": 1.1100412654745528, "grad_norm": 0.17330169677734375, "learning_rate": 8.433138226024314e-05, "loss": 0.314, "step": 354 }, { "epoch": 1.1131853016309687, "grad_norm": 0.17327344417572021, "learning_rate": 8.428635749662315e-05, "loss": 0.3227, "step": 355 }, { "epoch": 1.1163293377873846, "grad_norm": 0.17580825090408325, "learning_rate": 8.424133273300316e-05, "loss": 0.3104, "step": 356 }, { "epoch": 1.1194733739438003, "grad_norm": 0.17389941215515137, "learning_rate": 8.419630796938317e-05, "loss": 0.285, "step": 357 }, { "epoch": 1.1226174101002162, "grad_norm": 0.1775561273097992, "learning_rate": 8.415128320576318e-05, "loss": 0.2969, "step": 358 }, { "epoch": 1.1257614462566319, "grad_norm": 0.18656259775161743, "learning_rate": 8.410625844214319e-05, "loss": 0.3193, "step": 359 }, { "epoch": 1.1289054824130478, "grad_norm": 0.1777326762676239, "learning_rate": 8.40612336785232e-05, "loss": 0.3024, "step": 360 }, { "epoch": 1.1320495185694635, "grad_norm": 0.17319585382938385, "learning_rate": 8.40162089149032e-05, "loss": 0.2978, "step": 361 }, { "epoch": 1.1351935547258794, "grad_norm": 0.17653490602970123, "learning_rate": 8.397118415128321e-05, "loss": 0.308, "step": 362 }, { "epoch": 1.138337590882295, "grad_norm": 0.160200297832489, "learning_rate": 8.392615938766322e-05, "loss": 0.2694, "step": 363 }, { "epoch": 1.141481627038711, "grad_norm": 0.16492871940135956, "learning_rate": 8.388113462404323e-05, "loss": 0.2824, "step": 364 }, { "epoch": 1.1446256631951268, "grad_norm": 0.17109255492687225, "learning_rate": 8.383610986042324e-05, "loss": 0.2841, "step": 365 }, { "epoch": 1.1477696993515425, "grad_norm": 0.23076315224170685, "learning_rate": 8.379108509680325e-05, "loss": 0.314, "step": 366 }, { "epoch": 1.1509137355079584, "grad_norm": 0.1704353392124176, "learning_rate": 8.374606033318325e-05, "loss": 0.2962, "step": 367 }, { "epoch": 1.154057771664374, "grad_norm": 0.17220115661621094, "learning_rate": 8.370103556956326e-05, "loss": 0.2904, "step": 368 }, { "epoch": 1.15720180782079, "grad_norm": 0.17528584599494934, "learning_rate": 8.365601080594327e-05, "loss": 0.3263, "step": 369 }, { "epoch": 1.1603458439772059, "grad_norm": 0.18602944910526276, "learning_rate": 8.361098604232328e-05, "loss": 0.2989, "step": 370 }, { "epoch": 1.1634898801336215, "grad_norm": 0.18739493191242218, "learning_rate": 8.356596127870329e-05, "loss": 0.3148, "step": 371 }, { "epoch": 1.1666339162900374, "grad_norm": 0.1813725382089615, "learning_rate": 8.35209365150833e-05, "loss": 0.2989, "step": 372 }, { "epoch": 1.169777952446453, "grad_norm": 0.1674114614725113, "learning_rate": 8.34759117514633e-05, "loss": 0.2668, "step": 373 }, { "epoch": 1.172921988602869, "grad_norm": 0.1844543069601059, "learning_rate": 8.343088698784331e-05, "loss": 0.3003, "step": 374 }, { "epoch": 1.1760660247592847, "grad_norm": 0.17155998945236206, "learning_rate": 8.338586222422334e-05, "loss": 0.2931, "step": 375 }, { "epoch": 1.1792100609157006, "grad_norm": 0.1664140224456787, "learning_rate": 8.334083746060334e-05, "loss": 0.2896, "step": 376 }, { "epoch": 1.1823540970721163, "grad_norm": 0.18443046510219574, "learning_rate": 8.329581269698334e-05, "loss": 0.3098, "step": 377 }, { "epoch": 1.1854981332285321, "grad_norm": 0.16364677250385284, "learning_rate": 8.325078793336335e-05, "loss": 0.2849, "step": 378 }, { "epoch": 1.188642169384948, "grad_norm": 0.1778181493282318, "learning_rate": 8.320576316974337e-05, "loss": 0.2951, "step": 379 }, { "epoch": 1.1917862055413637, "grad_norm": 0.17129847407341003, "learning_rate": 8.316073840612338e-05, "loss": 0.2917, "step": 380 }, { "epoch": 1.1949302416977796, "grad_norm": 0.17360500991344452, "learning_rate": 8.311571364250337e-05, "loss": 0.3078, "step": 381 }, { "epoch": 1.1980742778541953, "grad_norm": 0.17020374536514282, "learning_rate": 8.307068887888338e-05, "loss": 0.2966, "step": 382 }, { "epoch": 1.2012183140106112, "grad_norm": 0.1838023066520691, "learning_rate": 8.30256641152634e-05, "loss": 0.2991, "step": 383 }, { "epoch": 1.2043623501670269, "grad_norm": 0.18513008952140808, "learning_rate": 8.298063935164341e-05, "loss": 0.3455, "step": 384 }, { "epoch": 1.2075063863234428, "grad_norm": 0.17663338780403137, "learning_rate": 8.293561458802341e-05, "loss": 0.3059, "step": 385 }, { "epoch": 1.2106504224798584, "grad_norm": 0.17648449540138245, "learning_rate": 8.289058982440343e-05, "loss": 0.2991, "step": 386 }, { "epoch": 1.2137944586362743, "grad_norm": 0.18601331114768982, "learning_rate": 8.284556506078344e-05, "loss": 0.2936, "step": 387 }, { "epoch": 1.2169384947926902, "grad_norm": 0.18048390746116638, "learning_rate": 8.280054029716345e-05, "loss": 0.2837, "step": 388 }, { "epoch": 1.220082530949106, "grad_norm": 0.17065560817718506, "learning_rate": 8.275551553354344e-05, "loss": 0.283, "step": 389 }, { "epoch": 1.2232265671055218, "grad_norm": 0.1708894670009613, "learning_rate": 8.271049076992347e-05, "loss": 0.2718, "step": 390 }, { "epoch": 1.2263706032619375, "grad_norm": 0.17479634284973145, "learning_rate": 8.266546600630347e-05, "loss": 0.3003, "step": 391 }, { "epoch": 1.2295146394183534, "grad_norm": 0.22091898322105408, "learning_rate": 8.262044124268348e-05, "loss": 0.3324, "step": 392 }, { "epoch": 1.232658675574769, "grad_norm": 0.17981559038162231, "learning_rate": 8.257541647906349e-05, "loss": 0.3042, "step": 393 }, { "epoch": 1.235802711731185, "grad_norm": 0.16127324104309082, "learning_rate": 8.25303917154435e-05, "loss": 0.2662, "step": 394 }, { "epoch": 1.2389467478876006, "grad_norm": 0.18422247469425201, "learning_rate": 8.248536695182351e-05, "loss": 0.31, "step": 395 }, { "epoch": 1.2420907840440165, "grad_norm": 0.18198904395103455, "learning_rate": 8.244034218820352e-05, "loss": 0.3242, "step": 396 }, { "epoch": 1.2452348202004324, "grad_norm": 0.17157980799674988, "learning_rate": 8.239531742458353e-05, "loss": 0.3025, "step": 397 }, { "epoch": 1.248378856356848, "grad_norm": 0.17674268782138824, "learning_rate": 8.235029266096353e-05, "loss": 0.3072, "step": 398 }, { "epoch": 1.251522892513264, "grad_norm": 0.18540705740451813, "learning_rate": 8.230526789734354e-05, "loss": 0.3159, "step": 399 }, { "epoch": 1.2546669286696797, "grad_norm": 0.18816250562667847, "learning_rate": 8.226024313372355e-05, "loss": 0.328, "step": 400 }, { "epoch": 1.2578109648260956, "grad_norm": 0.1927611082792282, "learning_rate": 8.221521837010356e-05, "loss": 0.3032, "step": 401 }, { "epoch": 1.2609550009825112, "grad_norm": 0.18845967948436737, "learning_rate": 8.217019360648357e-05, "loss": 0.2793, "step": 402 }, { "epoch": 1.2640990371389271, "grad_norm": 0.19096308946609497, "learning_rate": 8.212516884286358e-05, "loss": 0.3369, "step": 403 }, { "epoch": 1.2672430732953428, "grad_norm": 0.16917437314987183, "learning_rate": 8.20801440792436e-05, "loss": 0.2785, "step": 404 }, { "epoch": 1.2703871094517587, "grad_norm": 0.16734306514263153, "learning_rate": 8.20351193156236e-05, "loss": 0.2914, "step": 405 }, { "epoch": 1.2735311456081746, "grad_norm": 0.17491504549980164, "learning_rate": 8.19900945520036e-05, "loss": 0.2905, "step": 406 }, { "epoch": 1.2766751817645903, "grad_norm": 0.18531963229179382, "learning_rate": 8.194506978838361e-05, "loss": 0.3365, "step": 407 }, { "epoch": 1.2798192179210062, "grad_norm": 0.1812233179807663, "learning_rate": 8.190004502476363e-05, "loss": 0.3051, "step": 408 }, { "epoch": 1.2829632540774218, "grad_norm": 0.17402727901935577, "learning_rate": 8.185502026114363e-05, "loss": 0.2906, "step": 409 }, { "epoch": 1.2861072902338377, "grad_norm": 0.18160969018936157, "learning_rate": 8.180999549752364e-05, "loss": 0.295, "step": 410 }, { "epoch": 1.2892513263902534, "grad_norm": 0.17364852130413055, "learning_rate": 8.176497073390366e-05, "loss": 0.2789, "step": 411 }, { "epoch": 1.2923953625466693, "grad_norm": 0.17983028292655945, "learning_rate": 8.171994597028367e-05, "loss": 0.2976, "step": 412 }, { "epoch": 1.295539398703085, "grad_norm": 0.18376639485359192, "learning_rate": 8.167492120666366e-05, "loss": 0.2984, "step": 413 }, { "epoch": 1.2986834348595009, "grad_norm": 0.16966019570827484, "learning_rate": 8.162989644304367e-05, "loss": 0.2825, "step": 414 }, { "epoch": 1.3018274710159168, "grad_norm": 0.18048398196697235, "learning_rate": 8.15848716794237e-05, "loss": 0.3105, "step": 415 }, { "epoch": 1.3049715071723325, "grad_norm": 0.1738966405391693, "learning_rate": 8.15398469158037e-05, "loss": 0.27, "step": 416 }, { "epoch": 1.3081155433287484, "grad_norm": 0.22127372026443481, "learning_rate": 8.14948221521837e-05, "loss": 0.3156, "step": 417 }, { "epoch": 1.311259579485164, "grad_norm": 0.17313317954540253, "learning_rate": 8.14497973885637e-05, "loss": 0.2921, "step": 418 }, { "epoch": 1.31440361564158, "grad_norm": 0.17622841894626617, "learning_rate": 8.140477262494373e-05, "loss": 0.3026, "step": 419 }, { "epoch": 1.3175476517979956, "grad_norm": 0.17847168445587158, "learning_rate": 8.135974786132374e-05, "loss": 0.3021, "step": 420 }, { "epoch": 1.3206916879544115, "grad_norm": 0.18637776374816895, "learning_rate": 8.131472309770373e-05, "loss": 0.3186, "step": 421 }, { "epoch": 1.3238357241108272, "grad_norm": 0.16532807052135468, "learning_rate": 8.126969833408375e-05, "loss": 0.2878, "step": 422 }, { "epoch": 1.326979760267243, "grad_norm": 0.16804370284080505, "learning_rate": 8.122467357046376e-05, "loss": 0.2868, "step": 423 }, { "epoch": 1.330123796423659, "grad_norm": 0.1693575531244278, "learning_rate": 8.117964880684377e-05, "loss": 0.2898, "step": 424 }, { "epoch": 1.3332678325800746, "grad_norm": 0.17773057520389557, "learning_rate": 8.113462404322378e-05, "loss": 0.2741, "step": 425 }, { "epoch": 1.3364118687364905, "grad_norm": 0.1866486817598343, "learning_rate": 8.108959927960379e-05, "loss": 0.298, "step": 426 }, { "epoch": 1.3395559048929062, "grad_norm": 0.18073201179504395, "learning_rate": 8.10445745159838e-05, "loss": 0.2933, "step": 427 }, { "epoch": 1.342699941049322, "grad_norm": 0.17505986988544464, "learning_rate": 8.09995497523638e-05, "loss": 0.2936, "step": 428 }, { "epoch": 1.3458439772057378, "grad_norm": 0.17242185771465302, "learning_rate": 8.095452498874381e-05, "loss": 0.2827, "step": 429 }, { "epoch": 1.3489880133621537, "grad_norm": 0.16698665916919708, "learning_rate": 8.090950022512382e-05, "loss": 0.2742, "step": 430 }, { "epoch": 1.3521320495185694, "grad_norm": 0.18763364851474762, "learning_rate": 8.086447546150383e-05, "loss": 0.3192, "step": 431 }, { "epoch": 1.3552760856749853, "grad_norm": 0.18754689395427704, "learning_rate": 8.081945069788384e-05, "loss": 0.2932, "step": 432 }, { "epoch": 1.3584201218314012, "grad_norm": 0.18708984553813934, "learning_rate": 8.077442593426385e-05, "loss": 0.328, "step": 433 }, { "epoch": 1.3615641579878168, "grad_norm": 0.18035311996936798, "learning_rate": 8.072940117064386e-05, "loss": 0.2699, "step": 434 }, { "epoch": 1.3647081941442327, "grad_norm": 0.17291460931301117, "learning_rate": 8.068437640702387e-05, "loss": 0.2729, "step": 435 }, { "epoch": 1.3678522303006484, "grad_norm": 0.1894587129354477, "learning_rate": 8.063935164340387e-05, "loss": 0.2792, "step": 436 }, { "epoch": 1.3709962664570643, "grad_norm": 0.17740470170974731, "learning_rate": 8.059432687978388e-05, "loss": 0.3022, "step": 437 }, { "epoch": 1.3741403026134802, "grad_norm": 0.17968855798244476, "learning_rate": 8.054930211616389e-05, "loss": 0.3095, "step": 438 }, { "epoch": 1.3772843387698959, "grad_norm": 0.1771247237920761, "learning_rate": 8.05042773525439e-05, "loss": 0.3052, "step": 439 }, { "epoch": 1.3804283749263115, "grad_norm": 0.164938822388649, "learning_rate": 8.045925258892392e-05, "loss": 0.2912, "step": 440 }, { "epoch": 1.3835724110827274, "grad_norm": 0.17572474479675293, "learning_rate": 8.041422782530392e-05, "loss": 0.2942, "step": 441 }, { "epoch": 1.3867164472391433, "grad_norm": 0.16663512587547302, "learning_rate": 8.036920306168393e-05, "loss": 0.2838, "step": 442 }, { "epoch": 1.389860483395559, "grad_norm": 0.1684209108352661, "learning_rate": 8.032417829806393e-05, "loss": 0.3024, "step": 443 }, { "epoch": 1.393004519551975, "grad_norm": 0.1704261749982834, "learning_rate": 8.027915353444396e-05, "loss": 0.3055, "step": 444 }, { "epoch": 1.3961485557083906, "grad_norm": 0.17855525016784668, "learning_rate": 8.023412877082395e-05, "loss": 0.3047, "step": 445 }, { "epoch": 1.3992925918648065, "grad_norm": 0.16438795626163483, "learning_rate": 8.018910400720396e-05, "loss": 0.2833, "step": 446 }, { "epoch": 1.4024366280212224, "grad_norm": 0.1803821176290512, "learning_rate": 8.014407924358398e-05, "loss": 0.2693, "step": 447 }, { "epoch": 1.405580664177638, "grad_norm": 0.17037837207317352, "learning_rate": 8.009905447996399e-05, "loss": 0.2931, "step": 448 }, { "epoch": 1.4087247003340537, "grad_norm": 0.17838133871555328, "learning_rate": 8.0054029716344e-05, "loss": 0.3126, "step": 449 }, { "epoch": 1.4118687364904696, "grad_norm": 0.17596563696861267, "learning_rate": 8.0009004952724e-05, "loss": 0.3124, "step": 450 }, { "epoch": 1.4150127726468855, "grad_norm": 0.17477372288703918, "learning_rate": 7.996398018910402e-05, "loss": 0.2846, "step": 451 }, { "epoch": 1.4181568088033012, "grad_norm": 0.16831114888191223, "learning_rate": 7.991895542548402e-05, "loss": 0.2841, "step": 452 }, { "epoch": 1.421300844959717, "grad_norm": 0.16885237395763397, "learning_rate": 7.987393066186403e-05, "loss": 0.2882, "step": 453 }, { "epoch": 1.4244448811161328, "grad_norm": 0.1732211410999298, "learning_rate": 7.982890589824403e-05, "loss": 0.3065, "step": 454 }, { "epoch": 1.4275889172725487, "grad_norm": 0.17489729821681976, "learning_rate": 7.978388113462405e-05, "loss": 0.2937, "step": 455 }, { "epoch": 1.4307329534289646, "grad_norm": 0.1771242618560791, "learning_rate": 7.973885637100406e-05, "loss": 0.2963, "step": 456 }, { "epoch": 1.4338769895853802, "grad_norm": 0.19036780297756195, "learning_rate": 7.969383160738407e-05, "loss": 0.3104, "step": 457 }, { "epoch": 1.437021025741796, "grad_norm": 0.1863013058900833, "learning_rate": 7.964880684376408e-05, "loss": 0.3047, "step": 458 }, { "epoch": 1.4401650618982118, "grad_norm": 0.1722109168767929, "learning_rate": 7.960378208014408e-05, "loss": 0.2828, "step": 459 }, { "epoch": 1.4433090980546277, "grad_norm": 0.1802283078432083, "learning_rate": 7.95587573165241e-05, "loss": 0.2978, "step": 460 }, { "epoch": 1.4464531342110434, "grad_norm": 0.17628727853298187, "learning_rate": 7.95137325529041e-05, "loss": 0.2963, "step": 461 }, { "epoch": 1.4495971703674593, "grad_norm": 0.17598123848438263, "learning_rate": 7.946870778928411e-05, "loss": 0.3106, "step": 462 }, { "epoch": 1.452741206523875, "grad_norm": 0.17388591170310974, "learning_rate": 7.942368302566412e-05, "loss": 0.3067, "step": 463 }, { "epoch": 1.4558852426802908, "grad_norm": 0.17893949151039124, "learning_rate": 7.937865826204413e-05, "loss": 0.2756, "step": 464 }, { "epoch": 1.4590292788367067, "grad_norm": 0.16779755055904388, "learning_rate": 7.933363349842414e-05, "loss": 0.2911, "step": 465 }, { "epoch": 1.4621733149931224, "grad_norm": 0.19151651859283447, "learning_rate": 7.928860873480414e-05, "loss": 0.2919, "step": 466 }, { "epoch": 1.465317351149538, "grad_norm": 0.17654001712799072, "learning_rate": 7.924358397118415e-05, "loss": 0.3021, "step": 467 }, { "epoch": 1.468461387305954, "grad_norm": 0.17647038400173187, "learning_rate": 7.919855920756416e-05, "loss": 0.271, "step": 468 }, { "epoch": 1.4716054234623699, "grad_norm": 0.17813007533550262, "learning_rate": 7.915353444394417e-05, "loss": 0.3031, "step": 469 }, { "epoch": 1.4747494596187856, "grad_norm": 0.19432079792022705, "learning_rate": 7.910850968032418e-05, "loss": 0.3529, "step": 470 }, { "epoch": 1.4778934957752015, "grad_norm": 0.18345120549201965, "learning_rate": 7.906348491670419e-05, "loss": 0.2832, "step": 471 }, { "epoch": 1.4810375319316171, "grad_norm": 0.1722515970468521, "learning_rate": 7.90184601530842e-05, "loss": 0.3294, "step": 472 }, { "epoch": 1.484181568088033, "grad_norm": 0.1815156191587448, "learning_rate": 7.897343538946422e-05, "loss": 0.3025, "step": 473 }, { "epoch": 1.487325604244449, "grad_norm": 0.17528071999549866, "learning_rate": 7.892841062584421e-05, "loss": 0.3167, "step": 474 }, { "epoch": 1.4904696404008646, "grad_norm": 0.1877971738576889, "learning_rate": 7.888338586222422e-05, "loss": 0.3088, "step": 475 }, { "epoch": 1.4936136765572803, "grad_norm": 0.16935402154922485, "learning_rate": 7.883836109860424e-05, "loss": 0.3003, "step": 476 }, { "epoch": 1.4967577127136962, "grad_norm": 0.1625109314918518, "learning_rate": 7.879333633498425e-05, "loss": 0.2853, "step": 477 }, { "epoch": 1.499901748870112, "grad_norm": 0.17674805223941803, "learning_rate": 7.874831157136425e-05, "loss": 0.3047, "step": 478 }, { "epoch": 1.503045785026528, "grad_norm": 0.168808251619339, "learning_rate": 7.870328680774426e-05, "loss": 0.301, "step": 479 }, { "epoch": 1.5061898211829436, "grad_norm": 0.1753881871700287, "learning_rate": 7.865826204412428e-05, "loss": 0.2939, "step": 480 }, { "epoch": 1.5093338573393593, "grad_norm": 0.16852906346321106, "learning_rate": 7.861323728050429e-05, "loss": 0.2989, "step": 481 }, { "epoch": 1.5124778934957752, "grad_norm": 0.16612806916236877, "learning_rate": 7.856821251688428e-05, "loss": 0.2731, "step": 482 }, { "epoch": 1.5156219296521911, "grad_norm": 0.17498096823692322, "learning_rate": 7.85231877532643e-05, "loss": 0.2903, "step": 483 }, { "epoch": 1.5187659658086068, "grad_norm": 0.1843009740114212, "learning_rate": 7.847816298964431e-05, "loss": 0.3168, "step": 484 }, { "epoch": 1.5219100019650225, "grad_norm": 0.17858386039733887, "learning_rate": 7.843313822602432e-05, "loss": 0.2954, "step": 485 }, { "epoch": 1.5250540381214384, "grad_norm": 0.18993936479091644, "learning_rate": 7.838811346240432e-05, "loss": 0.3264, "step": 486 }, { "epoch": 1.5281980742778543, "grad_norm": 0.1731633096933365, "learning_rate": 7.834308869878434e-05, "loss": 0.2982, "step": 487 }, { "epoch": 1.5313421104342702, "grad_norm": 0.17727167904376984, "learning_rate": 7.829806393516435e-05, "loss": 0.2854, "step": 488 }, { "epoch": 1.5344861465906858, "grad_norm": 0.17536379396915436, "learning_rate": 7.825303917154436e-05, "loss": 0.2788, "step": 489 }, { "epoch": 1.5376301827471015, "grad_norm": 0.1785167157649994, "learning_rate": 7.820801440792435e-05, "loss": 0.3007, "step": 490 }, { "epoch": 1.5407742189035174, "grad_norm": 0.1738578975200653, "learning_rate": 7.816298964430437e-05, "loss": 0.3015, "step": 491 }, { "epoch": 1.5439182550599333, "grad_norm": 0.1737809181213379, "learning_rate": 7.811796488068438e-05, "loss": 0.3031, "step": 492 }, { "epoch": 1.547062291216349, "grad_norm": 0.17526312172412872, "learning_rate": 7.807294011706439e-05, "loss": 0.3072, "step": 493 }, { "epoch": 1.5502063273727646, "grad_norm": 0.17959162592887878, "learning_rate": 7.80279153534444e-05, "loss": 0.2699, "step": 494 }, { "epoch": 1.5533503635291805, "grad_norm": 0.17218153178691864, "learning_rate": 7.798289058982441e-05, "loss": 0.2734, "step": 495 }, { "epoch": 1.5564943996855964, "grad_norm": 0.17062252759933472, "learning_rate": 7.793786582620442e-05, "loss": 0.2426, "step": 496 }, { "epoch": 1.5596384358420123, "grad_norm": 0.19795489311218262, "learning_rate": 7.789284106258442e-05, "loss": 0.3396, "step": 497 }, { "epoch": 1.562782471998428, "grad_norm": 0.18899548053741455, "learning_rate": 7.784781629896443e-05, "loss": 0.295, "step": 498 }, { "epoch": 1.5659265081548437, "grad_norm": 0.18889367580413818, "learning_rate": 7.780279153534444e-05, "loss": 0.2996, "step": 499 }, { "epoch": 1.5690705443112596, "grad_norm": 0.184955894947052, "learning_rate": 7.775776677172445e-05, "loss": 0.2833, "step": 500 }, { "epoch": 1.5722145804676755, "grad_norm": 0.16244037449359894, "learning_rate": 7.771274200810447e-05, "loss": 0.2677, "step": 501 }, { "epoch": 1.5753586166240912, "grad_norm": 0.19440148770809174, "learning_rate": 7.766771724448447e-05, "loss": 0.3052, "step": 502 }, { "epoch": 1.5785026527805068, "grad_norm": 0.1759510636329651, "learning_rate": 7.762269248086448e-05, "loss": 0.2824, "step": 503 }, { "epoch": 1.5816466889369227, "grad_norm": 0.17166948318481445, "learning_rate": 7.757766771724448e-05, "loss": 0.2911, "step": 504 }, { "epoch": 1.5847907250933386, "grad_norm": 0.17509418725967407, "learning_rate": 7.753264295362451e-05, "loss": 0.2845, "step": 505 }, { "epoch": 1.5879347612497545, "grad_norm": 0.16338001191616058, "learning_rate": 7.74876181900045e-05, "loss": 0.2759, "step": 506 }, { "epoch": 1.5910787974061702, "grad_norm": 0.1770390421152115, "learning_rate": 7.744259342638451e-05, "loss": 0.3137, "step": 507 }, { "epoch": 1.5942228335625859, "grad_norm": 0.17159558832645416, "learning_rate": 7.739756866276452e-05, "loss": 0.2736, "step": 508 }, { "epoch": 1.5973668697190018, "grad_norm": 0.18849338591098785, "learning_rate": 7.735254389914454e-05, "loss": 0.2775, "step": 509 }, { "epoch": 1.6005109058754177, "grad_norm": 0.18084058165550232, "learning_rate": 7.730751913552454e-05, "loss": 0.2883, "step": 510 }, { "epoch": 1.6036549420318333, "grad_norm": 0.1859467774629593, "learning_rate": 7.726249437190454e-05, "loss": 0.3053, "step": 511 }, { "epoch": 1.606798978188249, "grad_norm": 0.18158085644245148, "learning_rate": 7.721746960828457e-05, "loss": 0.2923, "step": 512 }, { "epoch": 1.609943014344665, "grad_norm": 0.18600253760814667, "learning_rate": 7.717244484466458e-05, "loss": 0.3107, "step": 513 }, { "epoch": 1.6130870505010808, "grad_norm": 0.1869710236787796, "learning_rate": 7.712742008104457e-05, "loss": 0.2821, "step": 514 }, { "epoch": 1.6162310866574967, "grad_norm": 0.1755673587322235, "learning_rate": 7.708239531742458e-05, "loss": 0.2621, "step": 515 }, { "epoch": 1.6193751228139124, "grad_norm": 0.17789125442504883, "learning_rate": 7.70373705538046e-05, "loss": 0.3073, "step": 516 }, { "epoch": 1.622519158970328, "grad_norm": 0.16756756603717804, "learning_rate": 7.699234579018461e-05, "loss": 0.3078, "step": 517 }, { "epoch": 1.625663195126744, "grad_norm": 0.17822512984275818, "learning_rate": 7.69473210265646e-05, "loss": 0.2998, "step": 518 }, { "epoch": 1.6288072312831599, "grad_norm": 0.16880451142787933, "learning_rate": 7.690229626294463e-05, "loss": 0.2918, "step": 519 }, { "epoch": 1.6319512674395755, "grad_norm": 0.1791965365409851, "learning_rate": 7.685727149932464e-05, "loss": 0.2898, "step": 520 }, { "epoch": 1.6350953035959912, "grad_norm": 0.17452813684940338, "learning_rate": 7.681224673570464e-05, "loss": 0.2885, "step": 521 }, { "epoch": 1.638239339752407, "grad_norm": 0.18743397295475006, "learning_rate": 7.676722197208465e-05, "loss": 0.3007, "step": 522 }, { "epoch": 1.641383375908823, "grad_norm": 0.18785692751407623, "learning_rate": 7.672219720846466e-05, "loss": 0.2928, "step": 523 }, { "epoch": 1.644527412065239, "grad_norm": 0.19505468010902405, "learning_rate": 7.667717244484467e-05, "loss": 0.3206, "step": 524 }, { "epoch": 1.6476714482216546, "grad_norm": 0.1750132143497467, "learning_rate": 7.663214768122468e-05, "loss": 0.2764, "step": 525 }, { "epoch": 1.6508154843780702, "grad_norm": 0.18247836828231812, "learning_rate": 7.658712291760469e-05, "loss": 0.314, "step": 526 }, { "epoch": 1.6539595205344861, "grad_norm": 0.1866837590932846, "learning_rate": 7.65420981539847e-05, "loss": 0.3158, "step": 527 }, { "epoch": 1.657103556690902, "grad_norm": 0.17475096881389618, "learning_rate": 7.64970733903647e-05, "loss": 0.2964, "step": 528 }, { "epoch": 1.6602475928473177, "grad_norm": 0.1679716855287552, "learning_rate": 7.645204862674471e-05, "loss": 0.3122, "step": 529 }, { "epoch": 1.6633916290037334, "grad_norm": 0.16546200215816498, "learning_rate": 7.640702386312472e-05, "loss": 0.3125, "step": 530 }, { "epoch": 1.6665356651601493, "grad_norm": 0.16651305556297302, "learning_rate": 7.636199909950473e-05, "loss": 0.2636, "step": 531 }, { "epoch": 1.6696797013165652, "grad_norm": 0.16956521570682526, "learning_rate": 7.631697433588474e-05, "loss": 0.2956, "step": 532 }, { "epoch": 1.672823737472981, "grad_norm": 0.17262689769268036, "learning_rate": 7.627194957226475e-05, "loss": 0.2889, "step": 533 }, { "epoch": 1.6759677736293968, "grad_norm": 0.17842979729175568, "learning_rate": 7.622692480864476e-05, "loss": 0.3175, "step": 534 }, { "epoch": 1.6791118097858124, "grad_norm": 0.18716371059417725, "learning_rate": 7.618190004502476e-05, "loss": 0.294, "step": 535 }, { "epoch": 1.6822558459422283, "grad_norm": 0.17072086036205292, "learning_rate": 7.613687528140477e-05, "loss": 0.3001, "step": 536 }, { "epoch": 1.6853998820986442, "grad_norm": 0.16700303554534912, "learning_rate": 7.60918505177848e-05, "loss": 0.2576, "step": 537 }, { "epoch": 1.68854391825506, "grad_norm": 0.17436909675598145, "learning_rate": 7.604682575416479e-05, "loss": 0.2986, "step": 538 }, { "epoch": 1.6916879544114756, "grad_norm": 0.1712087094783783, "learning_rate": 7.60018009905448e-05, "loss": 0.2824, "step": 539 }, { "epoch": 1.6948319905678915, "grad_norm": 0.17220038175582886, "learning_rate": 7.595677622692481e-05, "loss": 0.2901, "step": 540 }, { "epoch": 1.6979760267243074, "grad_norm": 0.18637694418430328, "learning_rate": 7.591175146330483e-05, "loss": 0.3375, "step": 541 }, { "epoch": 1.7011200628807233, "grad_norm": 0.1691576987504959, "learning_rate": 7.586672669968482e-05, "loss": 0.2638, "step": 542 }, { "epoch": 1.704264099037139, "grad_norm": 0.17768289148807526, "learning_rate": 7.582170193606483e-05, "loss": 0.2879, "step": 543 }, { "epoch": 1.7074081351935546, "grad_norm": 0.1812208741903305, "learning_rate": 7.577667717244484e-05, "loss": 0.3129, "step": 544 }, { "epoch": 1.7105521713499705, "grad_norm": 0.18346074223518372, "learning_rate": 7.573165240882486e-05, "loss": 0.302, "step": 545 }, { "epoch": 1.7136962075063864, "grad_norm": 0.17309945821762085, "learning_rate": 7.568662764520487e-05, "loss": 0.255, "step": 546 }, { "epoch": 1.716840243662802, "grad_norm": 0.1879347264766693, "learning_rate": 7.564160288158487e-05, "loss": 0.3124, "step": 547 }, { "epoch": 1.719984279819218, "grad_norm": 0.1695443093776703, "learning_rate": 7.559657811796489e-05, "loss": 0.2809, "step": 548 }, { "epoch": 1.7231283159756337, "grad_norm": 0.17476417124271393, "learning_rate": 7.55515533543449e-05, "loss": 0.3043, "step": 549 }, { "epoch": 1.7262723521320495, "grad_norm": 0.1775609701871872, "learning_rate": 7.550652859072491e-05, "loss": 0.289, "step": 550 }, { "epoch": 1.7294163882884654, "grad_norm": 0.17453855276107788, "learning_rate": 7.54615038271049e-05, "loss": 0.2914, "step": 551 }, { "epoch": 1.7325604244448811, "grad_norm": 0.18414853513240814, "learning_rate": 7.541647906348492e-05, "loss": 0.3184, "step": 552 }, { "epoch": 1.7357044606012968, "grad_norm": 0.18060451745986938, "learning_rate": 7.537145429986493e-05, "loss": 0.2998, "step": 553 }, { "epoch": 1.7388484967577127, "grad_norm": 0.1735735535621643, "learning_rate": 7.532642953624494e-05, "loss": 0.3192, "step": 554 }, { "epoch": 1.7419925329141286, "grad_norm": 0.17077748477458954, "learning_rate": 7.528140477262495e-05, "loss": 0.2912, "step": 555 }, { "epoch": 1.7451365690705443, "grad_norm": 0.16513197124004364, "learning_rate": 7.523638000900496e-05, "loss": 0.3024, "step": 556 }, { "epoch": 1.7482806052269602, "grad_norm": 0.1681637018918991, "learning_rate": 7.519135524538497e-05, "loss": 0.2764, "step": 557 }, { "epoch": 1.7514246413833758, "grad_norm": 0.17060600221157074, "learning_rate": 7.514633048176498e-05, "loss": 0.272, "step": 558 }, { "epoch": 1.7545686775397917, "grad_norm": 0.1727294623851776, "learning_rate": 7.510130571814498e-05, "loss": 0.2938, "step": 559 }, { "epoch": 1.7577127136962076, "grad_norm": 0.16411182284355164, "learning_rate": 7.505628095452499e-05, "loss": 0.2864, "step": 560 }, { "epoch": 1.7608567498526233, "grad_norm": 0.16701269149780273, "learning_rate": 7.5011256190905e-05, "loss": 0.276, "step": 561 }, { "epoch": 1.764000786009039, "grad_norm": 0.16412830352783203, "learning_rate": 7.496623142728501e-05, "loss": 0.2836, "step": 562 }, { "epoch": 1.7671448221654549, "grad_norm": 0.17730842530727386, "learning_rate": 7.492120666366502e-05, "loss": 0.2812, "step": 563 }, { "epoch": 1.7702888583218708, "grad_norm": 0.16831046342849731, "learning_rate": 7.487618190004503e-05, "loss": 0.2832, "step": 564 }, { "epoch": 1.7734328944782864, "grad_norm": 0.17002396285533905, "learning_rate": 7.483115713642504e-05, "loss": 0.2884, "step": 565 }, { "epoch": 1.7765769306347023, "grad_norm": 0.181968092918396, "learning_rate": 7.478613237280504e-05, "loss": 0.32, "step": 566 }, { "epoch": 1.779720966791118, "grad_norm": 0.18976394832134247, "learning_rate": 7.474110760918505e-05, "loss": 0.2993, "step": 567 }, { "epoch": 1.782865002947534, "grad_norm": 0.1806926429271698, "learning_rate": 7.469608284556506e-05, "loss": 0.2914, "step": 568 }, { "epoch": 1.7860090391039498, "grad_norm": 0.17822052538394928, "learning_rate": 7.465105808194507e-05, "loss": 0.2843, "step": 569 }, { "epoch": 1.7891530752603655, "grad_norm": 0.18080289661884308, "learning_rate": 7.460603331832509e-05, "loss": 0.3121, "step": 570 }, { "epoch": 1.7922971114167812, "grad_norm": 0.17676854133605957, "learning_rate": 7.456100855470509e-05, "loss": 0.2901, "step": 571 }, { "epoch": 1.795441147573197, "grad_norm": 0.16959191858768463, "learning_rate": 7.45159837910851e-05, "loss": 0.3058, "step": 572 }, { "epoch": 1.798585183729613, "grad_norm": 0.16757243871688843, "learning_rate": 7.447095902746512e-05, "loss": 0.2982, "step": 573 }, { "epoch": 1.8017292198860289, "grad_norm": 0.1798073798418045, "learning_rate": 7.442593426384513e-05, "loss": 0.3097, "step": 574 }, { "epoch": 1.8048732560424445, "grad_norm": 0.16888341307640076, "learning_rate": 7.438090950022512e-05, "loss": 0.2763, "step": 575 }, { "epoch": 1.8080172921988602, "grad_norm": 0.17195682227611542, "learning_rate": 7.433588473660513e-05, "loss": 0.2778, "step": 576 }, { "epoch": 1.811161328355276, "grad_norm": 0.17291922867298126, "learning_rate": 7.429085997298515e-05, "loss": 0.2894, "step": 577 }, { "epoch": 1.814305364511692, "grad_norm": 0.17213907837867737, "learning_rate": 7.424583520936516e-05, "loss": 0.2998, "step": 578 }, { "epoch": 1.8174494006681077, "grad_norm": 0.16969838738441467, "learning_rate": 7.420081044574516e-05, "loss": 0.2953, "step": 579 }, { "epoch": 1.8205934368245233, "grad_norm": 0.16950733959674835, "learning_rate": 7.415578568212516e-05, "loss": 0.2774, "step": 580 }, { "epoch": 1.8237374729809392, "grad_norm": 0.1866762787103653, "learning_rate": 7.411076091850519e-05, "loss": 0.3004, "step": 581 }, { "epoch": 1.8268815091373551, "grad_norm": 0.18050317466259003, "learning_rate": 7.40657361548852e-05, "loss": 0.2867, "step": 582 }, { "epoch": 1.830025545293771, "grad_norm": 0.19073279201984406, "learning_rate": 7.402071139126519e-05, "loss": 0.2874, "step": 583 }, { "epoch": 1.8331695814501867, "grad_norm": 0.18162357807159424, "learning_rate": 7.397568662764521e-05, "loss": 0.3211, "step": 584 }, { "epoch": 1.8363136176066024, "grad_norm": 0.17108604311943054, "learning_rate": 7.393066186402522e-05, "loss": 0.3021, "step": 585 }, { "epoch": 1.8394576537630183, "grad_norm": 0.17849913239479065, "learning_rate": 7.388563710040523e-05, "loss": 0.3127, "step": 586 }, { "epoch": 1.8426016899194342, "grad_norm": 0.16922686994075775, "learning_rate": 7.384061233678522e-05, "loss": 0.2732, "step": 587 }, { "epoch": 1.8457457260758499, "grad_norm": 0.17308250069618225, "learning_rate": 7.379558757316525e-05, "loss": 0.2932, "step": 588 }, { "epoch": 1.8488897622322655, "grad_norm": 0.18480746448040009, "learning_rate": 7.375056280954526e-05, "loss": 0.3036, "step": 589 }, { "epoch": 1.8520337983886814, "grad_norm": 0.16831083595752716, "learning_rate": 7.370553804592526e-05, "loss": 0.2913, "step": 590 }, { "epoch": 1.8551778345450973, "grad_norm": 0.1726708710193634, "learning_rate": 7.366051328230527e-05, "loss": 0.2683, "step": 591 }, { "epoch": 1.8583218707015132, "grad_norm": 0.17040051519870758, "learning_rate": 7.361548851868528e-05, "loss": 0.2681, "step": 592 }, { "epoch": 1.861465906857929, "grad_norm": 0.18175894021987915, "learning_rate": 7.357046375506529e-05, "loss": 0.2643, "step": 593 }, { "epoch": 1.8646099430143446, "grad_norm": 0.18901702761650085, "learning_rate": 7.35254389914453e-05, "loss": 0.2859, "step": 594 }, { "epoch": 1.8677539791707605, "grad_norm": 0.18690907955169678, "learning_rate": 7.348041422782531e-05, "loss": 0.3001, "step": 595 }, { "epoch": 1.8708980153271764, "grad_norm": 0.16587451100349426, "learning_rate": 7.343538946420532e-05, "loss": 0.2833, "step": 596 }, { "epoch": 1.874042051483592, "grad_norm": 0.170462504029274, "learning_rate": 7.339036470058532e-05, "loss": 0.2754, "step": 597 }, { "epoch": 1.8771860876400077, "grad_norm": 0.17350532114505768, "learning_rate": 7.334533993696533e-05, "loss": 0.2956, "step": 598 }, { "epoch": 1.8803301237964236, "grad_norm": 0.1863803118467331, "learning_rate": 7.330031517334534e-05, "loss": 0.3071, "step": 599 }, { "epoch": 1.8834741599528395, "grad_norm": 0.17055153846740723, "learning_rate": 7.325529040972535e-05, "loss": 0.3082, "step": 600 }, { "epoch": 1.8866181961092554, "grad_norm": 0.17581762373447418, "learning_rate": 7.321026564610536e-05, "loss": 0.2947, "step": 601 }, { "epoch": 1.889762232265671, "grad_norm": 0.18630677461624146, "learning_rate": 7.316524088248538e-05, "loss": 0.315, "step": 602 }, { "epoch": 1.8929062684220868, "grad_norm": 0.18152126669883728, "learning_rate": 7.312021611886538e-05, "loss": 0.312, "step": 603 }, { "epoch": 1.8960503045785027, "grad_norm": 0.1875506043434143, "learning_rate": 7.307519135524538e-05, "loss": 0.3224, "step": 604 }, { "epoch": 1.8991943407349186, "grad_norm": 0.18474234640598297, "learning_rate": 7.303016659162539e-05, "loss": 0.2944, "step": 605 }, { "epoch": 1.9023383768913342, "grad_norm": 0.18096047639846802, "learning_rate": 7.298514182800542e-05, "loss": 0.3152, "step": 606 }, { "epoch": 1.90548241304775, "grad_norm": 0.16774339973926544, "learning_rate": 7.294011706438541e-05, "loss": 0.2588, "step": 607 }, { "epoch": 1.9086264492041658, "grad_norm": 0.17628465592861176, "learning_rate": 7.289509230076542e-05, "loss": 0.2923, "step": 608 }, { "epoch": 1.9117704853605817, "grad_norm": 0.17404650151729584, "learning_rate": 7.285006753714544e-05, "loss": 0.2852, "step": 609 }, { "epoch": 1.9149145215169976, "grad_norm": 0.1805901676416397, "learning_rate": 7.280504277352545e-05, "loss": 0.2866, "step": 610 }, { "epoch": 1.9180585576734133, "grad_norm": 0.18428674340248108, "learning_rate": 7.276001800990544e-05, "loss": 0.3207, "step": 611 }, { "epoch": 1.921202593829829, "grad_norm": 0.1807202845811844, "learning_rate": 7.271499324628545e-05, "loss": 0.2933, "step": 612 }, { "epoch": 1.9243466299862448, "grad_norm": 0.17590177059173584, "learning_rate": 7.266996848266548e-05, "loss": 0.2732, "step": 613 }, { "epoch": 1.9274906661426607, "grad_norm": 0.16720589995384216, "learning_rate": 7.262494371904548e-05, "loss": 0.3146, "step": 614 }, { "epoch": 1.9306347022990764, "grad_norm": 0.1786167472600937, "learning_rate": 7.257991895542548e-05, "loss": 0.2892, "step": 615 }, { "epoch": 1.933778738455492, "grad_norm": 0.17377831041812897, "learning_rate": 7.253489419180549e-05, "loss": 0.3105, "step": 616 }, { "epoch": 1.936922774611908, "grad_norm": 0.18173402547836304, "learning_rate": 7.248986942818551e-05, "loss": 0.3436, "step": 617 }, { "epoch": 1.9400668107683239, "grad_norm": 0.17383264005184174, "learning_rate": 7.244484466456552e-05, "loss": 0.2823, "step": 618 }, { "epoch": 1.9432108469247398, "grad_norm": 0.18473853170871735, "learning_rate": 7.239981990094553e-05, "loss": 0.2867, "step": 619 }, { "epoch": 1.9463548830811555, "grad_norm": 0.17817547917366028, "learning_rate": 7.235479513732554e-05, "loss": 0.3056, "step": 620 }, { "epoch": 1.9494989192375711, "grad_norm": 0.17514194548130035, "learning_rate": 7.230977037370554e-05, "loss": 0.2951, "step": 621 }, { "epoch": 1.952642955393987, "grad_norm": 0.17744790017604828, "learning_rate": 7.226474561008555e-05, "loss": 0.293, "step": 622 }, { "epoch": 1.955786991550403, "grad_norm": 0.1766396164894104, "learning_rate": 7.221972084646556e-05, "loss": 0.2775, "step": 623 }, { "epoch": 1.9589310277068186, "grad_norm": 0.17238113284111023, "learning_rate": 7.217469608284557e-05, "loss": 0.2873, "step": 624 }, { "epoch": 1.9620750638632343, "grad_norm": 0.17035745084285736, "learning_rate": 7.212967131922558e-05, "loss": 0.2552, "step": 625 }, { "epoch": 1.9652191000196502, "grad_norm": 0.17209386825561523, "learning_rate": 7.208464655560559e-05, "loss": 0.2826, "step": 626 }, { "epoch": 1.968363136176066, "grad_norm": 0.17958694696426392, "learning_rate": 7.20396217919856e-05, "loss": 0.3108, "step": 627 }, { "epoch": 1.971507172332482, "grad_norm": 0.18314975500106812, "learning_rate": 7.19945970283656e-05, "loss": 0.3155, "step": 628 }, { "epoch": 1.9746512084888976, "grad_norm": 0.17581366002559662, "learning_rate": 7.194957226474561e-05, "loss": 0.2829, "step": 629 }, { "epoch": 1.9777952446453133, "grad_norm": 0.1770240068435669, "learning_rate": 7.190454750112562e-05, "loss": 0.2861, "step": 630 }, { "epoch": 1.9809392808017292, "grad_norm": 0.17571915686130524, "learning_rate": 7.185952273750563e-05, "loss": 0.2827, "step": 631 }, { "epoch": 1.984083316958145, "grad_norm": 0.18270526826381683, "learning_rate": 7.181449797388564e-05, "loss": 0.325, "step": 632 }, { "epoch": 1.9872273531145608, "grad_norm": 0.18204954266548157, "learning_rate": 7.176947321026565e-05, "loss": 0.3038, "step": 633 }, { "epoch": 1.9903713892709765, "grad_norm": 0.16646772623062134, "learning_rate": 7.172444844664566e-05, "loss": 0.2859, "step": 634 }, { "epoch": 1.9935154254273924, "grad_norm": 0.1777997761964798, "learning_rate": 7.167942368302566e-05, "loss": 0.2793, "step": 635 }, { "epoch": 1.9966594615838082, "grad_norm": 0.1707630306482315, "learning_rate": 7.163439891940567e-05, "loss": 0.2845, "step": 636 }, { "epoch": 1.9998034977402241, "grad_norm": 0.17496661841869354, "learning_rate": 7.158937415578568e-05, "loss": 0.2853, "step": 637 }, { "epoch": 2.0, "grad_norm": 0.7054563760757446, "learning_rate": 7.15443493921657e-05, "loss": 0.3121, "step": 638 }, { "epoch": 2.0, "eval_loss": 0.313894122838974, "eval_runtime": 102.2414, "eval_samples_per_second": 12.441, "eval_steps_per_second": 12.441, "step": 638 }, { "epoch": 2.003144036156416, "grad_norm": 0.1561952829360962, "learning_rate": 7.149932462854571e-05, "loss": 0.2603, "step": 639 }, { "epoch": 2.006288072312832, "grad_norm": 0.16964316368103027, "learning_rate": 7.145429986492571e-05, "loss": 0.2682, "step": 640 }, { "epoch": 2.0094321084692472, "grad_norm": 0.18406444787979126, "learning_rate": 7.140927510130572e-05, "loss": 0.2757, "step": 641 }, { "epoch": 2.012576144625663, "grad_norm": 0.17398463189601898, "learning_rate": 7.136425033768574e-05, "loss": 0.2402, "step": 642 }, { "epoch": 2.015720180782079, "grad_norm": 0.17903786897659302, "learning_rate": 7.131922557406575e-05, "loss": 0.2781, "step": 643 }, { "epoch": 2.018864216938495, "grad_norm": 0.18108315765857697, "learning_rate": 7.127420081044574e-05, "loss": 0.2352, "step": 644 }, { "epoch": 2.0220082530949104, "grad_norm": 0.1897296905517578, "learning_rate": 7.122917604682576e-05, "loss": 0.2766, "step": 645 }, { "epoch": 2.0251522892513263, "grad_norm": 0.18837666511535645, "learning_rate": 7.118415128320577e-05, "loss": 0.2524, "step": 646 }, { "epoch": 2.028296325407742, "grad_norm": 0.19135642051696777, "learning_rate": 7.113912651958578e-05, "loss": 0.2351, "step": 647 }, { "epoch": 2.031440361564158, "grad_norm": 0.18856996297836304, "learning_rate": 7.109410175596578e-05, "loss": 0.2508, "step": 648 }, { "epoch": 2.034584397720574, "grad_norm": 0.1961357444524765, "learning_rate": 7.10490769923458e-05, "loss": 0.2412, "step": 649 }, { "epoch": 2.0377284338769894, "grad_norm": 0.18814240396022797, "learning_rate": 7.10040522287258e-05, "loss": 0.2582, "step": 650 }, { "epoch": 2.0408724700334053, "grad_norm": 0.20037615299224854, "learning_rate": 7.095902746510582e-05, "loss": 0.2685, "step": 651 }, { "epoch": 2.0440165061898212, "grad_norm": 0.19964700937271118, "learning_rate": 7.091400270148581e-05, "loss": 0.2637, "step": 652 }, { "epoch": 2.047160542346237, "grad_norm": 0.20531342923641205, "learning_rate": 7.086897793786583e-05, "loss": 0.2754, "step": 653 }, { "epoch": 2.0503045785026526, "grad_norm": 0.18540841341018677, "learning_rate": 7.082395317424584e-05, "loss": 0.2543, "step": 654 }, { "epoch": 2.0534486146590685, "grad_norm": 0.19029483199119568, "learning_rate": 7.077892841062585e-05, "loss": 0.2546, "step": 655 }, { "epoch": 2.0565926508154844, "grad_norm": 0.20083504915237427, "learning_rate": 7.073390364700586e-05, "loss": 0.2479, "step": 656 }, { "epoch": 2.0597366869719003, "grad_norm": 0.2063308209180832, "learning_rate": 7.068887888338587e-05, "loss": 0.2647, "step": 657 }, { "epoch": 2.062880723128316, "grad_norm": 0.202437624335289, "learning_rate": 7.064385411976588e-05, "loss": 0.2562, "step": 658 }, { "epoch": 2.0660247592847316, "grad_norm": 0.20782361924648285, "learning_rate": 7.059882935614588e-05, "loss": 0.2695, "step": 659 }, { "epoch": 2.0691687954411475, "grad_norm": 0.19810637831687927, "learning_rate": 7.055380459252589e-05, "loss": 0.2596, "step": 660 }, { "epoch": 2.0723128315975634, "grad_norm": 0.20872469246387482, "learning_rate": 7.05087798289059e-05, "loss": 0.269, "step": 661 }, { "epoch": 2.0754568677539793, "grad_norm": 0.20939649641513824, "learning_rate": 7.046375506528591e-05, "loss": 0.2731, "step": 662 }, { "epoch": 2.0786009039103948, "grad_norm": 0.18877120316028595, "learning_rate": 7.041873030166593e-05, "loss": 0.263, "step": 663 }, { "epoch": 2.0817449400668107, "grad_norm": 0.20508892834186554, "learning_rate": 7.037370553804593e-05, "loss": 0.2521, "step": 664 }, { "epoch": 2.0848889762232266, "grad_norm": 0.20418784022331238, "learning_rate": 7.032868077442594e-05, "loss": 0.2873, "step": 665 }, { "epoch": 2.0880330123796425, "grad_norm": 0.1922852247953415, "learning_rate": 7.028365601080594e-05, "loss": 0.2547, "step": 666 }, { "epoch": 2.0911770485360583, "grad_norm": 0.1984114795923233, "learning_rate": 7.023863124718597e-05, "loss": 0.2516, "step": 667 }, { "epoch": 2.094321084692474, "grad_norm": 0.19552253186702728, "learning_rate": 7.019360648356596e-05, "loss": 0.2659, "step": 668 }, { "epoch": 2.0974651208488897, "grad_norm": 0.20418553054332733, "learning_rate": 7.014858171994597e-05, "loss": 0.2431, "step": 669 }, { "epoch": 2.1006091570053056, "grad_norm": 0.20504607260227203, "learning_rate": 7.010355695632598e-05, "loss": 0.2355, "step": 670 }, { "epoch": 2.1037531931617215, "grad_norm": 0.2173071652650833, "learning_rate": 7.0058532192706e-05, "loss": 0.2924, "step": 671 }, { "epoch": 2.106897229318137, "grad_norm": 0.20269042253494263, "learning_rate": 7.0013507429086e-05, "loss": 0.2558, "step": 672 }, { "epoch": 2.110041265474553, "grad_norm": 0.19754748046398163, "learning_rate": 6.9968482665466e-05, "loss": 0.2597, "step": 673 }, { "epoch": 2.1131853016309687, "grad_norm": 0.19913309812545776, "learning_rate": 6.992345790184603e-05, "loss": 0.2341, "step": 674 }, { "epoch": 2.1163293377873846, "grad_norm": 0.20122374594211578, "learning_rate": 6.987843313822603e-05, "loss": 0.2556, "step": 675 }, { "epoch": 2.1194733739438005, "grad_norm": 0.20415376126766205, "learning_rate": 6.983340837460603e-05, "loss": 0.235, "step": 676 }, { "epoch": 2.122617410100216, "grad_norm": 0.20989355444908142, "learning_rate": 6.978838361098604e-05, "loss": 0.2517, "step": 677 }, { "epoch": 2.125761446256632, "grad_norm": 0.1990608125925064, "learning_rate": 6.974335884736606e-05, "loss": 0.2662, "step": 678 }, { "epoch": 2.128905482413048, "grad_norm": 0.21925538778305054, "learning_rate": 6.969833408374607e-05, "loss": 0.2653, "step": 679 }, { "epoch": 2.1320495185694637, "grad_norm": 0.1933649778366089, "learning_rate": 6.965330932012606e-05, "loss": 0.2548, "step": 680 }, { "epoch": 2.1351935547258796, "grad_norm": 0.21430613100528717, "learning_rate": 6.960828455650609e-05, "loss": 0.2774, "step": 681 }, { "epoch": 2.138337590882295, "grad_norm": 0.2014111578464508, "learning_rate": 6.95632597928861e-05, "loss": 0.246, "step": 682 }, { "epoch": 2.141481627038711, "grad_norm": 0.19965550303459167, "learning_rate": 6.95182350292661e-05, "loss": 0.2795, "step": 683 }, { "epoch": 2.144625663195127, "grad_norm": 0.2065405547618866, "learning_rate": 6.94732102656461e-05, "loss": 0.2861, "step": 684 }, { "epoch": 2.1477696993515427, "grad_norm": 0.19733914732933044, "learning_rate": 6.942818550202612e-05, "loss": 0.2575, "step": 685 }, { "epoch": 2.150913735507958, "grad_norm": 0.20192170143127441, "learning_rate": 6.938316073840613e-05, "loss": 0.2458, "step": 686 }, { "epoch": 2.154057771664374, "grad_norm": 0.19670715928077698, "learning_rate": 6.933813597478614e-05, "loss": 0.2596, "step": 687 }, { "epoch": 2.15720180782079, "grad_norm": 0.2009287029504776, "learning_rate": 6.929311121116615e-05, "loss": 0.2431, "step": 688 }, { "epoch": 2.160345843977206, "grad_norm": 0.20156782865524292, "learning_rate": 6.924808644754615e-05, "loss": 0.2431, "step": 689 }, { "epoch": 2.1634898801336213, "grad_norm": 0.20345966517925262, "learning_rate": 6.920306168392616e-05, "loss": 0.2886, "step": 690 }, { "epoch": 2.166633916290037, "grad_norm": 0.1945163756608963, "learning_rate": 6.915803692030617e-05, "loss": 0.2599, "step": 691 }, { "epoch": 2.169777952446453, "grad_norm": 0.1998612880706787, "learning_rate": 6.911301215668618e-05, "loss": 0.2673, "step": 692 }, { "epoch": 2.172921988602869, "grad_norm": 0.1975962072610855, "learning_rate": 6.906798739306619e-05, "loss": 0.264, "step": 693 }, { "epoch": 2.176066024759285, "grad_norm": 0.20633172988891602, "learning_rate": 6.90229626294462e-05, "loss": 0.2584, "step": 694 }, { "epoch": 2.1792100609157004, "grad_norm": 0.19151711463928223, "learning_rate": 6.89779378658262e-05, "loss": 0.2692, "step": 695 }, { "epoch": 2.1823540970721163, "grad_norm": 0.19535675644874573, "learning_rate": 6.893291310220622e-05, "loss": 0.2424, "step": 696 }, { "epoch": 2.185498133228532, "grad_norm": 0.2010478973388672, "learning_rate": 6.888788833858622e-05, "loss": 0.2686, "step": 697 }, { "epoch": 2.188642169384948, "grad_norm": 0.21084745228290558, "learning_rate": 6.884286357496623e-05, "loss": 0.2603, "step": 698 }, { "epoch": 2.191786205541364, "grad_norm": 0.19947238266468048, "learning_rate": 6.879783881134625e-05, "loss": 0.2536, "step": 699 }, { "epoch": 2.1949302416977794, "grad_norm": 0.2031436562538147, "learning_rate": 6.875281404772625e-05, "loss": 0.2437, "step": 700 }, { "epoch": 2.1980742778541953, "grad_norm": 0.20883992314338684, "learning_rate": 6.870778928410626e-05, "loss": 0.2734, "step": 701 }, { "epoch": 2.201218314010611, "grad_norm": 0.20566269755363464, "learning_rate": 6.866276452048627e-05, "loss": 0.2556, "step": 702 }, { "epoch": 2.204362350167027, "grad_norm": 0.2203208953142166, "learning_rate": 6.861773975686629e-05, "loss": 0.26, "step": 703 }, { "epoch": 2.2075063863234425, "grad_norm": 0.21616657078266144, "learning_rate": 6.857271499324628e-05, "loss": 0.2535, "step": 704 }, { "epoch": 2.2106504224798584, "grad_norm": 0.21879667043685913, "learning_rate": 6.852769022962629e-05, "loss": 0.272, "step": 705 }, { "epoch": 2.2137944586362743, "grad_norm": 0.21559832990169525, "learning_rate": 6.84826654660063e-05, "loss": 0.2759, "step": 706 }, { "epoch": 2.2169384947926902, "grad_norm": 0.20863446593284607, "learning_rate": 6.843764070238632e-05, "loss": 0.244, "step": 707 }, { "epoch": 2.2200825309491057, "grad_norm": 0.21461093425750732, "learning_rate": 6.839261593876632e-05, "loss": 0.2756, "step": 708 }, { "epoch": 2.2232265671055216, "grad_norm": 0.210508331656456, "learning_rate": 6.834759117514633e-05, "loss": 0.2656, "step": 709 }, { "epoch": 2.2263706032619375, "grad_norm": 0.20859602093696594, "learning_rate": 6.830256641152635e-05, "loss": 0.2484, "step": 710 }, { "epoch": 2.2295146394183534, "grad_norm": 0.20836792886257172, "learning_rate": 6.825754164790636e-05, "loss": 0.2805, "step": 711 }, { "epoch": 2.2326586755747693, "grad_norm": 0.19477733969688416, "learning_rate": 6.821251688428637e-05, "loss": 0.2451, "step": 712 }, { "epoch": 2.2358027117311847, "grad_norm": 0.20550581812858582, "learning_rate": 6.816749212066636e-05, "loss": 0.2596, "step": 713 }, { "epoch": 2.2389467478876006, "grad_norm": 0.2021128386259079, "learning_rate": 6.812246735704638e-05, "loss": 0.253, "step": 714 }, { "epoch": 2.2420907840440165, "grad_norm": 0.217298686504364, "learning_rate": 6.807744259342639e-05, "loss": 0.2658, "step": 715 }, { "epoch": 2.2452348202004324, "grad_norm": 0.20543943345546722, "learning_rate": 6.80324178298064e-05, "loss": 0.2693, "step": 716 }, { "epoch": 2.2483788563568483, "grad_norm": 0.2067873626947403, "learning_rate": 6.798739306618641e-05, "loss": 0.2551, "step": 717 }, { "epoch": 2.2515228925132638, "grad_norm": 0.21177925169467926, "learning_rate": 6.794236830256642e-05, "loss": 0.2657, "step": 718 }, { "epoch": 2.2546669286696797, "grad_norm": 0.21894124150276184, "learning_rate": 6.789734353894643e-05, "loss": 0.2449, "step": 719 }, { "epoch": 2.2578109648260956, "grad_norm": 0.21589498221874237, "learning_rate": 6.785231877532643e-05, "loss": 0.2792, "step": 720 }, { "epoch": 2.2609550009825115, "grad_norm": 0.20770463347434998, "learning_rate": 6.780729401170644e-05, "loss": 0.2534, "step": 721 }, { "epoch": 2.264099037138927, "grad_norm": 0.207777738571167, "learning_rate": 6.776226924808645e-05, "loss": 0.2632, "step": 722 }, { "epoch": 2.267243073295343, "grad_norm": 0.2114667147397995, "learning_rate": 6.771724448446646e-05, "loss": 0.2513, "step": 723 }, { "epoch": 2.2703871094517587, "grad_norm": 0.21462954580783844, "learning_rate": 6.767221972084647e-05, "loss": 0.259, "step": 724 }, { "epoch": 2.2735311456081746, "grad_norm": 0.2036915272474289, "learning_rate": 6.762719495722648e-05, "loss": 0.2611, "step": 725 }, { "epoch": 2.27667518176459, "grad_norm": 0.2013128101825714, "learning_rate": 6.758217019360649e-05, "loss": 0.257, "step": 726 }, { "epoch": 2.279819217921006, "grad_norm": 0.2161940485239029, "learning_rate": 6.75371454299865e-05, "loss": 0.2583, "step": 727 }, { "epoch": 2.282963254077422, "grad_norm": 0.21858982741832733, "learning_rate": 6.74921206663665e-05, "loss": 0.2761, "step": 728 }, { "epoch": 2.2861072902338377, "grad_norm": 0.21288833022117615, "learning_rate": 6.744709590274651e-05, "loss": 0.271, "step": 729 }, { "epoch": 2.2892513263902536, "grad_norm": 0.19840721786022186, "learning_rate": 6.740207113912652e-05, "loss": 0.237, "step": 730 }, { "epoch": 2.292395362546669, "grad_norm": 0.21009394526481628, "learning_rate": 6.735704637550653e-05, "loss": 0.2719, "step": 731 }, { "epoch": 2.295539398703085, "grad_norm": 0.21607272326946259, "learning_rate": 6.731202161188654e-05, "loss": 0.2613, "step": 732 }, { "epoch": 2.298683434859501, "grad_norm": 0.20696361362934113, "learning_rate": 6.726699684826655e-05, "loss": 0.2812, "step": 733 }, { "epoch": 2.301827471015917, "grad_norm": 0.21792151033878326, "learning_rate": 6.722197208464655e-05, "loss": 0.2574, "step": 734 }, { "epoch": 2.3049715071723327, "grad_norm": 0.2094501405954361, "learning_rate": 6.717694732102658e-05, "loss": 0.2438, "step": 735 }, { "epoch": 2.308115543328748, "grad_norm": 0.21923451125621796, "learning_rate": 6.713192255740659e-05, "loss": 0.259, "step": 736 }, { "epoch": 2.311259579485164, "grad_norm": 0.21065974235534668, "learning_rate": 6.708689779378658e-05, "loss": 0.255, "step": 737 }, { "epoch": 2.31440361564158, "grad_norm": 0.22192592918872833, "learning_rate": 6.704187303016659e-05, "loss": 0.2606, "step": 738 }, { "epoch": 2.317547651797996, "grad_norm": 0.21515235304832458, "learning_rate": 6.699684826654661e-05, "loss": 0.2552, "step": 739 }, { "epoch": 2.3206916879544117, "grad_norm": 0.19961953163146973, "learning_rate": 6.695182350292662e-05, "loss": 0.2333, "step": 740 }, { "epoch": 2.323835724110827, "grad_norm": 0.21117697656154633, "learning_rate": 6.690679873930662e-05, "loss": 0.2663, "step": 741 }, { "epoch": 2.326979760267243, "grad_norm": 0.1987060159444809, "learning_rate": 6.686177397568662e-05, "loss": 0.2336, "step": 742 }, { "epoch": 2.330123796423659, "grad_norm": 0.20169393718242645, "learning_rate": 6.681674921206665e-05, "loss": 0.231, "step": 743 }, { "epoch": 2.333267832580075, "grad_norm": 0.21280111372470856, "learning_rate": 6.677172444844665e-05, "loss": 0.2781, "step": 744 }, { "epoch": 2.3364118687364903, "grad_norm": 0.22040395438671112, "learning_rate": 6.672669968482665e-05, "loss": 0.2681, "step": 745 }, { "epoch": 2.339555904892906, "grad_norm": 0.20575925707817078, "learning_rate": 6.668167492120667e-05, "loss": 0.2537, "step": 746 }, { "epoch": 2.342699941049322, "grad_norm": 0.20729082822799683, "learning_rate": 6.663665015758668e-05, "loss": 0.2511, "step": 747 }, { "epoch": 2.345843977205738, "grad_norm": 0.21347106993198395, "learning_rate": 6.659162539396669e-05, "loss": 0.2593, "step": 748 }, { "epoch": 2.3489880133621535, "grad_norm": 0.21103815734386444, "learning_rate": 6.654660063034668e-05, "loss": 0.2707, "step": 749 }, { "epoch": 2.3521320495185694, "grad_norm": 0.21059168875217438, "learning_rate": 6.65015758667267e-05, "loss": 0.2511, "step": 750 }, { "epoch": 2.3552760856749853, "grad_norm": 0.20155653357505798, "learning_rate": 6.645655110310671e-05, "loss": 0.2628, "step": 751 }, { "epoch": 2.358420121831401, "grad_norm": 0.22449065744876862, "learning_rate": 6.641152633948672e-05, "loss": 0.2741, "step": 752 }, { "epoch": 2.361564157987817, "grad_norm": 0.21507477760314941, "learning_rate": 6.636650157586673e-05, "loss": 0.2633, "step": 753 }, { "epoch": 2.3647081941442325, "grad_norm": 0.20926739275455475, "learning_rate": 6.632147681224674e-05, "loss": 0.2602, "step": 754 }, { "epoch": 2.3678522303006484, "grad_norm": 0.19953890144824982, "learning_rate": 6.627645204862675e-05, "loss": 0.2475, "step": 755 }, { "epoch": 2.3709962664570643, "grad_norm": 0.19762159883975983, "learning_rate": 6.623142728500676e-05, "loss": 0.2594, "step": 756 }, { "epoch": 2.37414030261348, "grad_norm": 0.21025808155536652, "learning_rate": 6.618640252138677e-05, "loss": 0.2518, "step": 757 }, { "epoch": 2.377284338769896, "grad_norm": 0.21766838431358337, "learning_rate": 6.614137775776677e-05, "loss": 0.2686, "step": 758 }, { "epoch": 2.3804283749263115, "grad_norm": 0.2110193520784378, "learning_rate": 6.609635299414678e-05, "loss": 0.2683, "step": 759 }, { "epoch": 2.3835724110827274, "grad_norm": 0.20737890899181366, "learning_rate": 6.605132823052679e-05, "loss": 0.2594, "step": 760 }, { "epoch": 2.3867164472391433, "grad_norm": 0.21457286179065704, "learning_rate": 6.60063034669068e-05, "loss": 0.2645, "step": 761 }, { "epoch": 2.3898604833955592, "grad_norm": 0.21056179702281952, "learning_rate": 6.596127870328681e-05, "loss": 0.2516, "step": 762 }, { "epoch": 2.3930045195519747, "grad_norm": 0.21386046707630157, "learning_rate": 6.591625393966682e-05, "loss": 0.2555, "step": 763 }, { "epoch": 2.3961485557083906, "grad_norm": 0.21433734893798828, "learning_rate": 6.587122917604684e-05, "loss": 0.2636, "step": 764 }, { "epoch": 2.3992925918648065, "grad_norm": 0.20941971242427826, "learning_rate": 6.582620441242683e-05, "loss": 0.2604, "step": 765 }, { "epoch": 2.4024366280212224, "grad_norm": 0.2234574407339096, "learning_rate": 6.578117964880684e-05, "loss": 0.2545, "step": 766 }, { "epoch": 2.405580664177638, "grad_norm": 0.22144284844398499, "learning_rate": 6.573615488518685e-05, "loss": 0.2771, "step": 767 }, { "epoch": 2.4087247003340537, "grad_norm": 0.20735910534858704, "learning_rate": 6.569113012156687e-05, "loss": 0.2549, "step": 768 }, { "epoch": 2.4118687364904696, "grad_norm": 0.19960278272628784, "learning_rate": 6.564610535794687e-05, "loss": 0.2504, "step": 769 }, { "epoch": 2.4150127726468855, "grad_norm": 0.2132129669189453, "learning_rate": 6.560108059432688e-05, "loss": 0.261, "step": 770 }, { "epoch": 2.4181568088033014, "grad_norm": 0.20775727927684784, "learning_rate": 6.55560558307069e-05, "loss": 0.2419, "step": 771 }, { "epoch": 2.421300844959717, "grad_norm": 0.2030857801437378, "learning_rate": 6.551103106708691e-05, "loss": 0.2381, "step": 772 }, { "epoch": 2.4244448811161328, "grad_norm": 0.21294380724430084, "learning_rate": 6.54660063034669e-05, "loss": 0.2797, "step": 773 }, { "epoch": 2.4275889172725487, "grad_norm": 0.21088573336601257, "learning_rate": 6.542098153984691e-05, "loss": 0.2487, "step": 774 }, { "epoch": 2.4307329534289646, "grad_norm": 0.20481328666210175, "learning_rate": 6.537595677622693e-05, "loss": 0.2326, "step": 775 }, { "epoch": 2.4338769895853805, "grad_norm": 0.21915322542190552, "learning_rate": 6.533093201260694e-05, "loss": 0.2649, "step": 776 }, { "epoch": 2.437021025741796, "grad_norm": 0.20623372495174408, "learning_rate": 6.528590724898694e-05, "loss": 0.2343, "step": 777 }, { "epoch": 2.440165061898212, "grad_norm": 0.2215745896100998, "learning_rate": 6.524088248536695e-05, "loss": 0.2733, "step": 778 }, { "epoch": 2.4433090980546277, "grad_norm": 0.25559526681900024, "learning_rate": 6.519585772174697e-05, "loss": 0.2718, "step": 779 }, { "epoch": 2.4464531342110436, "grad_norm": 0.21796360611915588, "learning_rate": 6.515083295812698e-05, "loss": 0.2649, "step": 780 }, { "epoch": 2.449597170367459, "grad_norm": 0.2073902189731598, "learning_rate": 6.510580819450697e-05, "loss": 0.2668, "step": 781 }, { "epoch": 2.452741206523875, "grad_norm": 0.20899632573127747, "learning_rate": 6.5060783430887e-05, "loss": 0.2606, "step": 782 }, { "epoch": 2.455885242680291, "grad_norm": 0.20637986063957214, "learning_rate": 6.5015758667267e-05, "loss": 0.2493, "step": 783 }, { "epoch": 2.4590292788367067, "grad_norm": 0.22849562764167786, "learning_rate": 6.497073390364701e-05, "loss": 0.2531, "step": 784 }, { "epoch": 2.462173314993122, "grad_norm": 0.21141007542610168, "learning_rate": 6.492570914002702e-05, "loss": 0.2769, "step": 785 }, { "epoch": 2.465317351149538, "grad_norm": 0.21039460599422455, "learning_rate": 6.488068437640703e-05, "loss": 0.2656, "step": 786 }, { "epoch": 2.468461387305954, "grad_norm": 0.2262180894613266, "learning_rate": 6.483565961278704e-05, "loss": 0.2542, "step": 787 }, { "epoch": 2.47160542346237, "grad_norm": 0.20708444714546204, "learning_rate": 6.479063484916705e-05, "loss": 0.2652, "step": 788 }, { "epoch": 2.474749459618786, "grad_norm": 0.21185430884361267, "learning_rate": 6.474561008554705e-05, "loss": 0.2652, "step": 789 }, { "epoch": 2.4778934957752012, "grad_norm": 0.21837091445922852, "learning_rate": 6.470058532192706e-05, "loss": 0.2773, "step": 790 }, { "epoch": 2.481037531931617, "grad_norm": 0.2129461169242859, "learning_rate": 6.465556055830707e-05, "loss": 0.2706, "step": 791 }, { "epoch": 2.484181568088033, "grad_norm": 0.20862016081809998, "learning_rate": 6.461053579468708e-05, "loss": 0.2463, "step": 792 }, { "epoch": 2.487325604244449, "grad_norm": 0.21938055753707886, "learning_rate": 6.456551103106709e-05, "loss": 0.2958, "step": 793 }, { "epoch": 2.490469640400865, "grad_norm": 0.20615948736667633, "learning_rate": 6.45204862674471e-05, "loss": 0.2258, "step": 794 }, { "epoch": 2.4936136765572803, "grad_norm": 0.2173532098531723, "learning_rate": 6.44754615038271e-05, "loss": 0.2368, "step": 795 }, { "epoch": 2.496757712713696, "grad_norm": 0.21336568892002106, "learning_rate": 6.443043674020711e-05, "loss": 0.2466, "step": 796 }, { "epoch": 2.499901748870112, "grad_norm": 0.2175597995519638, "learning_rate": 6.438541197658712e-05, "loss": 0.2475, "step": 797 }, { "epoch": 2.503045785026528, "grad_norm": 0.21101081371307373, "learning_rate": 6.434038721296713e-05, "loss": 0.2652, "step": 798 }, { "epoch": 2.506189821182944, "grad_norm": 0.21488317847251892, "learning_rate": 6.429536244934714e-05, "loss": 0.2536, "step": 799 }, { "epoch": 2.5093338573393593, "grad_norm": 0.22166606783866882, "learning_rate": 6.425033768572716e-05, "loss": 0.2862, "step": 800 }, { "epoch": 2.512477893495775, "grad_norm": 0.21679571270942688, "learning_rate": 6.420531292210716e-05, "loss": 0.257, "step": 801 }, { "epoch": 2.515621929652191, "grad_norm": 0.20331613719463348, "learning_rate": 6.416028815848717e-05, "loss": 0.228, "step": 802 }, { "epoch": 2.5187659658086066, "grad_norm": 0.21109510958194733, "learning_rate": 6.411526339486717e-05, "loss": 0.2563, "step": 803 }, { "epoch": 2.5219100019650225, "grad_norm": 0.2095702886581421, "learning_rate": 6.40702386312472e-05, "loss": 0.2546, "step": 804 }, { "epoch": 2.5250540381214384, "grad_norm": 0.2165631204843521, "learning_rate": 6.402521386762719e-05, "loss": 0.2549, "step": 805 }, { "epoch": 2.5281980742778543, "grad_norm": 0.20962166786193848, "learning_rate": 6.39801891040072e-05, "loss": 0.2529, "step": 806 }, { "epoch": 2.53134211043427, "grad_norm": 0.2025025188922882, "learning_rate": 6.393516434038722e-05, "loss": 0.2514, "step": 807 }, { "epoch": 2.5344861465906856, "grad_norm": 0.21611244976520538, "learning_rate": 6.389013957676723e-05, "loss": 0.2576, "step": 808 }, { "epoch": 2.5376301827471015, "grad_norm": 0.21303138136863708, "learning_rate": 6.384511481314724e-05, "loss": 0.2636, "step": 809 }, { "epoch": 2.5407742189035174, "grad_norm": 0.21392260491847992, "learning_rate": 6.380009004952723e-05, "loss": 0.258, "step": 810 }, { "epoch": 2.5439182550599333, "grad_norm": 0.20991139113903046, "learning_rate": 6.375506528590726e-05, "loss": 0.2723, "step": 811 }, { "epoch": 2.547062291216349, "grad_norm": 0.20843861997127533, "learning_rate": 6.371004052228727e-05, "loss": 0.2814, "step": 812 }, { "epoch": 2.5502063273727646, "grad_norm": 0.2059059739112854, "learning_rate": 6.366501575866727e-05, "loss": 0.2531, "step": 813 }, { "epoch": 2.5533503635291805, "grad_norm": 0.2146722972393036, "learning_rate": 6.361999099504727e-05, "loss": 0.2945, "step": 814 }, { "epoch": 2.5564943996855964, "grad_norm": 0.21372562646865845, "learning_rate": 6.357496623142729e-05, "loss": 0.2536, "step": 815 }, { "epoch": 2.5596384358420123, "grad_norm": 0.20308613777160645, "learning_rate": 6.35299414678073e-05, "loss": 0.2428, "step": 816 }, { "epoch": 2.5627824719984282, "grad_norm": 0.21821540594100952, "learning_rate": 6.348491670418731e-05, "loss": 0.2682, "step": 817 }, { "epoch": 2.5659265081548437, "grad_norm": 0.21356253325939178, "learning_rate": 6.343989194056732e-05, "loss": 0.2426, "step": 818 }, { "epoch": 2.5690705443112596, "grad_norm": 0.21461884677410126, "learning_rate": 6.339486717694733e-05, "loss": 0.2713, "step": 819 }, { "epoch": 2.5722145804676755, "grad_norm": 0.2067326307296753, "learning_rate": 6.334984241332733e-05, "loss": 0.2424, "step": 820 }, { "epoch": 2.575358616624091, "grad_norm": 0.20385001599788666, "learning_rate": 6.330481764970734e-05, "loss": 0.2466, "step": 821 }, { "epoch": 2.578502652780507, "grad_norm": 0.22466441988945007, "learning_rate": 6.325979288608735e-05, "loss": 0.2643, "step": 822 }, { "epoch": 2.5816466889369227, "grad_norm": 0.19516023993492126, "learning_rate": 6.321476812246736e-05, "loss": 0.2154, "step": 823 }, { "epoch": 2.5847907250933386, "grad_norm": 0.22316451370716095, "learning_rate": 6.316974335884737e-05, "loss": 0.2745, "step": 824 }, { "epoch": 2.5879347612497545, "grad_norm": 0.21437634527683258, "learning_rate": 6.312471859522738e-05, "loss": 0.2553, "step": 825 }, { "epoch": 2.59107879740617, "grad_norm": 0.21494808793067932, "learning_rate": 6.307969383160739e-05, "loss": 0.2684, "step": 826 }, { "epoch": 2.594222833562586, "grad_norm": 0.21400584280490875, "learning_rate": 6.30346690679874e-05, "loss": 0.2531, "step": 827 }, { "epoch": 2.5973668697190018, "grad_norm": 0.2113097906112671, "learning_rate": 6.29896443043674e-05, "loss": 0.2587, "step": 828 }, { "epoch": 2.6005109058754177, "grad_norm": 0.2207076996564865, "learning_rate": 6.294461954074741e-05, "loss": 0.2748, "step": 829 }, { "epoch": 2.6036549420318336, "grad_norm": 0.2082621455192566, "learning_rate": 6.289959477712742e-05, "loss": 0.259, "step": 830 }, { "epoch": 2.606798978188249, "grad_norm": 0.21487262845039368, "learning_rate": 6.285457001350743e-05, "loss": 0.2682, "step": 831 }, { "epoch": 2.609943014344665, "grad_norm": 0.1992974430322647, "learning_rate": 6.280954524988744e-05, "loss": 0.2511, "step": 832 }, { "epoch": 2.613087050501081, "grad_norm": 0.21342509984970093, "learning_rate": 6.276452048626746e-05, "loss": 0.2503, "step": 833 }, { "epoch": 2.6162310866574967, "grad_norm": 0.2101677805185318, "learning_rate": 6.271949572264745e-05, "loss": 0.2457, "step": 834 }, { "epoch": 2.6193751228139126, "grad_norm": 0.21574127674102783, "learning_rate": 6.267447095902746e-05, "loss": 0.2457, "step": 835 }, { "epoch": 2.622519158970328, "grad_norm": 0.21575531363487244, "learning_rate": 6.262944619540749e-05, "loss": 0.261, "step": 836 }, { "epoch": 2.625663195126744, "grad_norm": 0.22505593299865723, "learning_rate": 6.25844214317875e-05, "loss": 0.2758, "step": 837 }, { "epoch": 2.62880723128316, "grad_norm": 0.20997416973114014, "learning_rate": 6.253939666816749e-05, "loss": 0.2594, "step": 838 }, { "epoch": 2.6319512674395753, "grad_norm": 0.21567827463150024, "learning_rate": 6.24943719045475e-05, "loss": 0.2706, "step": 839 }, { "epoch": 2.635095303595991, "grad_norm": 0.2084456980228424, "learning_rate": 6.244934714092752e-05, "loss": 0.2672, "step": 840 }, { "epoch": 2.638239339752407, "grad_norm": 0.2244175523519516, "learning_rate": 6.240432237730753e-05, "loss": 0.2786, "step": 841 }, { "epoch": 2.641383375908823, "grad_norm": 0.21015183627605438, "learning_rate": 6.235929761368752e-05, "loss": 0.2631, "step": 842 }, { "epoch": 2.644527412065239, "grad_norm": 0.21380454301834106, "learning_rate": 6.231427285006755e-05, "loss": 0.2676, "step": 843 }, { "epoch": 2.6476714482216543, "grad_norm": 0.21570055186748505, "learning_rate": 6.226924808644755e-05, "loss": 0.2419, "step": 844 }, { "epoch": 2.6508154843780702, "grad_norm": 0.2170439213514328, "learning_rate": 6.222422332282756e-05, "loss": 0.2664, "step": 845 }, { "epoch": 2.653959520534486, "grad_norm": 0.26516497135162354, "learning_rate": 6.217919855920756e-05, "loss": 0.2429, "step": 846 }, { "epoch": 2.657103556690902, "grad_norm": 0.22887665033340454, "learning_rate": 6.213417379558758e-05, "loss": 0.2459, "step": 847 }, { "epoch": 2.660247592847318, "grad_norm": 0.22019223868846893, "learning_rate": 6.208914903196759e-05, "loss": 0.2619, "step": 848 }, { "epoch": 2.6633916290037334, "grad_norm": 0.20383231341838837, "learning_rate": 6.20441242683476e-05, "loss": 0.2568, "step": 849 }, { "epoch": 2.6665356651601493, "grad_norm": 0.20949026942253113, "learning_rate": 6.199909950472759e-05, "loss": 0.2622, "step": 850 }, { "epoch": 2.669679701316565, "grad_norm": 0.2074085772037506, "learning_rate": 6.195407474110761e-05, "loss": 0.2602, "step": 851 }, { "epoch": 2.672823737472981, "grad_norm": 0.2055203765630722, "learning_rate": 6.190904997748762e-05, "loss": 0.265, "step": 852 }, { "epoch": 2.675967773629397, "grad_norm": 0.2122073769569397, "learning_rate": 6.186402521386763e-05, "loss": 0.2614, "step": 853 }, { "epoch": 2.6791118097858124, "grad_norm": 0.21181835234165192, "learning_rate": 6.181900045024764e-05, "loss": 0.2605, "step": 854 }, { "epoch": 2.6822558459422283, "grad_norm": 0.21882998943328857, "learning_rate": 6.177397568662765e-05, "loss": 0.2654, "step": 855 }, { "epoch": 2.685399882098644, "grad_norm": 0.20726241171360016, "learning_rate": 6.172895092300766e-05, "loss": 0.2543, "step": 856 }, { "epoch": 2.6885439182550597, "grad_norm": 0.21267525851726532, "learning_rate": 6.168392615938767e-05, "loss": 0.2582, "step": 857 }, { "epoch": 2.6916879544114756, "grad_norm": 0.21388453245162964, "learning_rate": 6.163890139576767e-05, "loss": 0.2449, "step": 858 }, { "epoch": 2.6948319905678915, "grad_norm": 0.22963562607765198, "learning_rate": 6.159387663214768e-05, "loss": 0.2714, "step": 859 }, { "epoch": 2.6979760267243074, "grad_norm": 0.23032361268997192, "learning_rate": 6.154885186852769e-05, "loss": 0.2701, "step": 860 }, { "epoch": 2.7011200628807233, "grad_norm": 0.22304952144622803, "learning_rate": 6.150382710490771e-05, "loss": 0.2464, "step": 861 }, { "epoch": 2.7042640990371387, "grad_norm": 0.22591029107570648, "learning_rate": 6.145880234128771e-05, "loss": 0.2689, "step": 862 }, { "epoch": 2.7074081351935546, "grad_norm": 0.2028559446334839, "learning_rate": 6.141377757766772e-05, "loss": 0.2344, "step": 863 }, { "epoch": 2.7105521713499705, "grad_norm": 0.21405014395713806, "learning_rate": 6.136875281404773e-05, "loss": 0.2661, "step": 864 }, { "epoch": 2.7136962075063864, "grad_norm": 0.21614673733711243, "learning_rate": 6.132372805042775e-05, "loss": 0.2765, "step": 865 }, { "epoch": 2.7168402436628023, "grad_norm": 0.20479866862297058, "learning_rate": 6.127870328680774e-05, "loss": 0.248, "step": 866 }, { "epoch": 2.7199842798192178, "grad_norm": 0.20060454308986664, "learning_rate": 6.123367852318775e-05, "loss": 0.251, "step": 867 }, { "epoch": 2.7231283159756337, "grad_norm": 0.19836781919002533, "learning_rate": 6.118865375956776e-05, "loss": 0.2495, "step": 868 }, { "epoch": 2.7262723521320495, "grad_norm": 0.20714689791202545, "learning_rate": 6.114362899594778e-05, "loss": 0.2593, "step": 869 }, { "epoch": 2.7294163882884654, "grad_norm": 0.21739421784877777, "learning_rate": 6.109860423232778e-05, "loss": 0.2951, "step": 870 }, { "epoch": 2.7325604244448813, "grad_norm": 0.20889277756214142, "learning_rate": 6.105357946870779e-05, "loss": 0.2567, "step": 871 }, { "epoch": 2.735704460601297, "grad_norm": 0.20689308643341064, "learning_rate": 6.100855470508781e-05, "loss": 0.2679, "step": 872 }, { "epoch": 2.7388484967577127, "grad_norm": 0.22083355486392975, "learning_rate": 6.096352994146781e-05, "loss": 0.2825, "step": 873 }, { "epoch": 2.7419925329141286, "grad_norm": 0.2199823260307312, "learning_rate": 6.091850517784782e-05, "loss": 0.2573, "step": 874 }, { "epoch": 2.745136569070544, "grad_norm": 0.21396394073963165, "learning_rate": 6.087348041422783e-05, "loss": 0.214, "step": 875 }, { "epoch": 2.7482806052269604, "grad_norm": 0.2129971981048584, "learning_rate": 6.082845565060784e-05, "loss": 0.2635, "step": 876 }, { "epoch": 2.751424641383376, "grad_norm": 0.21062715351581573, "learning_rate": 6.0783430886987844e-05, "loss": 0.2647, "step": 877 }, { "epoch": 2.7545686775397917, "grad_norm": 0.2033323347568512, "learning_rate": 6.073840612336785e-05, "loss": 0.2413, "step": 878 }, { "epoch": 2.7577127136962076, "grad_norm": 0.21737009286880493, "learning_rate": 6.069338135974787e-05, "loss": 0.2784, "step": 879 }, { "epoch": 2.760856749852623, "grad_norm": 0.20817922055721283, "learning_rate": 6.0648356596127876e-05, "loss": 0.2589, "step": 880 }, { "epoch": 2.764000786009039, "grad_norm": 0.20610374212265015, "learning_rate": 6.060333183250788e-05, "loss": 0.2576, "step": 881 }, { "epoch": 2.767144822165455, "grad_norm": 0.22060273587703705, "learning_rate": 6.055830706888789e-05, "loss": 0.2645, "step": 882 }, { "epoch": 2.7702888583218708, "grad_norm": 0.225420743227005, "learning_rate": 6.05132823052679e-05, "loss": 0.2616, "step": 883 }, { "epoch": 2.7734328944782867, "grad_norm": 0.2128961831331253, "learning_rate": 6.046825754164791e-05, "loss": 0.2461, "step": 884 }, { "epoch": 2.776576930634702, "grad_norm": 0.21983756124973297, "learning_rate": 6.042323277802792e-05, "loss": 0.2624, "step": 885 }, { "epoch": 2.779720966791118, "grad_norm": 0.20498870313167572, "learning_rate": 6.037820801440792e-05, "loss": 0.2296, "step": 886 }, { "epoch": 2.782865002947534, "grad_norm": 0.2215169221162796, "learning_rate": 6.033318325078794e-05, "loss": 0.2718, "step": 887 }, { "epoch": 2.78600903910395, "grad_norm": 0.2262987494468689, "learning_rate": 6.0288158487167945e-05, "loss": 0.3, "step": 888 }, { "epoch": 2.7891530752603657, "grad_norm": 0.2125198245048523, "learning_rate": 6.0243133723547954e-05, "loss": 0.2688, "step": 889 }, { "epoch": 2.792297111416781, "grad_norm": 0.21152637898921967, "learning_rate": 6.019810895992797e-05, "loss": 0.2588, "step": 890 }, { "epoch": 2.795441147573197, "grad_norm": 0.20861785113811493, "learning_rate": 6.015308419630797e-05, "loss": 0.265, "step": 891 }, { "epoch": 2.798585183729613, "grad_norm": 0.2091401219367981, "learning_rate": 6.010805943268798e-05, "loss": 0.2684, "step": 892 }, { "epoch": 2.801729219886029, "grad_norm": 0.20484274625778198, "learning_rate": 6.006303466906799e-05, "loss": 0.2544, "step": 893 }, { "epoch": 2.8048732560424448, "grad_norm": 0.2059127688407898, "learning_rate": 6.0018009905448004e-05, "loss": 0.2743, "step": 894 }, { "epoch": 2.80801729219886, "grad_norm": 0.20695185661315918, "learning_rate": 5.9972985141828005e-05, "loss": 0.2698, "step": 895 }, { "epoch": 2.811161328355276, "grad_norm": 0.21439561247825623, "learning_rate": 5.9927960378208014e-05, "loss": 0.2646, "step": 896 }, { "epoch": 2.814305364511692, "grad_norm": 0.21806564927101135, "learning_rate": 5.988293561458803e-05, "loss": 0.2555, "step": 897 }, { "epoch": 2.8174494006681075, "grad_norm": 0.22359566390514374, "learning_rate": 5.983791085096804e-05, "loss": 0.2573, "step": 898 }, { "epoch": 2.8205934368245233, "grad_norm": 0.20134207606315613, "learning_rate": 5.9792886087348046e-05, "loss": 0.24, "step": 899 }, { "epoch": 2.8237374729809392, "grad_norm": 0.23836348950862885, "learning_rate": 5.974786132372805e-05, "loss": 0.2942, "step": 900 }, { "epoch": 2.826881509137355, "grad_norm": 0.20495203137397766, "learning_rate": 5.9702836560108064e-05, "loss": 0.2547, "step": 901 }, { "epoch": 2.830025545293771, "grad_norm": 0.21878692507743835, "learning_rate": 5.965781179648807e-05, "loss": 0.2861, "step": 902 }, { "epoch": 2.8331695814501865, "grad_norm": 0.20971405506134033, "learning_rate": 5.961278703286808e-05, "loss": 0.2519, "step": 903 }, { "epoch": 2.8363136176066024, "grad_norm": 0.22699595987796783, "learning_rate": 5.956776226924808e-05, "loss": 0.276, "step": 904 }, { "epoch": 2.8394576537630183, "grad_norm": 0.20432929694652557, "learning_rate": 5.95227375056281e-05, "loss": 0.2627, "step": 905 }, { "epoch": 2.842601689919434, "grad_norm": 0.21798695623874664, "learning_rate": 5.9477712742008107e-05, "loss": 0.2341, "step": 906 }, { "epoch": 2.84574572607585, "grad_norm": 0.20063690841197968, "learning_rate": 5.9432687978388115e-05, "loss": 0.2444, "step": 907 }, { "epoch": 2.8488897622322655, "grad_norm": 0.21176105737686157, "learning_rate": 5.938766321476813e-05, "loss": 0.2756, "step": 908 }, { "epoch": 2.8520337983886814, "grad_norm": 0.19662748277187347, "learning_rate": 5.934263845114814e-05, "loss": 0.229, "step": 909 }, { "epoch": 2.8551778345450973, "grad_norm": 0.20191557705402374, "learning_rate": 5.929761368752814e-05, "loss": 0.2446, "step": 910 }, { "epoch": 2.8583218707015132, "grad_norm": 0.2074964940547943, "learning_rate": 5.925258892390815e-05, "loss": 0.2624, "step": 911 }, { "epoch": 2.861465906857929, "grad_norm": 0.21854685246944427, "learning_rate": 5.9207564160288165e-05, "loss": 0.2601, "step": 912 }, { "epoch": 2.8646099430143446, "grad_norm": 0.2077440321445465, "learning_rate": 5.9162539396668173e-05, "loss": 0.2793, "step": 913 }, { "epoch": 2.8677539791707605, "grad_norm": 0.21336384117603302, "learning_rate": 5.9117514633048175e-05, "loss": 0.2419, "step": 914 }, { "epoch": 2.8708980153271764, "grad_norm": 0.20829536020755768, "learning_rate": 5.907248986942819e-05, "loss": 0.2685, "step": 915 }, { "epoch": 2.874042051483592, "grad_norm": 0.21437959372997284, "learning_rate": 5.90274651058082e-05, "loss": 0.264, "step": 916 }, { "epoch": 2.8771860876400077, "grad_norm": 0.22627568244934082, "learning_rate": 5.898244034218821e-05, "loss": 0.2765, "step": 917 }, { "epoch": 2.8803301237964236, "grad_norm": 0.2127591371536255, "learning_rate": 5.893741557856821e-05, "loss": 0.259, "step": 918 }, { "epoch": 2.8834741599528395, "grad_norm": 0.20785318315029144, "learning_rate": 5.889239081494823e-05, "loss": 0.2505, "step": 919 }, { "epoch": 2.8866181961092554, "grad_norm": 0.21895797550678253, "learning_rate": 5.8847366051328234e-05, "loss": 0.2596, "step": 920 }, { "epoch": 2.889762232265671, "grad_norm": 0.22070826590061188, "learning_rate": 5.880234128770824e-05, "loss": 0.2674, "step": 921 }, { "epoch": 2.8929062684220868, "grad_norm": 0.21603362262248993, "learning_rate": 5.8757316524088244e-05, "loss": 0.2397, "step": 922 }, { "epoch": 2.8960503045785027, "grad_norm": 0.22455249726772308, "learning_rate": 5.8712291760468266e-05, "loss": 0.2538, "step": 923 }, { "epoch": 2.8991943407349186, "grad_norm": 0.20244206488132477, "learning_rate": 5.866726699684827e-05, "loss": 0.235, "step": 924 }, { "epoch": 2.9023383768913344, "grad_norm": 0.21344028413295746, "learning_rate": 5.8622242233228277e-05, "loss": 0.2937, "step": 925 }, { "epoch": 2.90548241304775, "grad_norm": 0.2114301323890686, "learning_rate": 5.857721746960829e-05, "loss": 0.2638, "step": 926 }, { "epoch": 2.908626449204166, "grad_norm": 0.21806597709655762, "learning_rate": 5.85321927059883e-05, "loss": 0.2666, "step": 927 }, { "epoch": 2.9117704853605817, "grad_norm": 0.219473198056221, "learning_rate": 5.84871679423683e-05, "loss": 0.273, "step": 928 }, { "epoch": 2.9149145215169976, "grad_norm": 0.20743805170059204, "learning_rate": 5.844214317874831e-05, "loss": 0.2631, "step": 929 }, { "epoch": 2.9180585576734135, "grad_norm": 0.21817666292190552, "learning_rate": 5.8397118415128326e-05, "loss": 0.2839, "step": 930 }, { "epoch": 2.921202593829829, "grad_norm": 0.21776226162910461, "learning_rate": 5.8352093651508335e-05, "loss": 0.2756, "step": 931 }, { "epoch": 2.924346629986245, "grad_norm": 0.1976674348115921, "learning_rate": 5.830706888788834e-05, "loss": 0.2479, "step": 932 }, { "epoch": 2.9274906661426607, "grad_norm": 0.22051428258419037, "learning_rate": 5.826204412426836e-05, "loss": 0.2673, "step": 933 }, { "epoch": 2.930634702299076, "grad_norm": 0.21151353418827057, "learning_rate": 5.821701936064836e-05, "loss": 0.272, "step": 934 }, { "epoch": 2.933778738455492, "grad_norm": 0.21933844685554504, "learning_rate": 5.817199459702837e-05, "loss": 0.2893, "step": 935 }, { "epoch": 2.936922774611908, "grad_norm": 0.21245354413986206, "learning_rate": 5.812696983340837e-05, "loss": 0.255, "step": 936 }, { "epoch": 2.940066810768324, "grad_norm": 0.20733119547367096, "learning_rate": 5.808194506978839e-05, "loss": 0.2435, "step": 937 }, { "epoch": 2.9432108469247398, "grad_norm": 0.22125717997550964, "learning_rate": 5.8036920306168395e-05, "loss": 0.2692, "step": 938 }, { "epoch": 2.9463548830811552, "grad_norm": 0.21045711636543274, "learning_rate": 5.7991895542548404e-05, "loss": 0.2505, "step": 939 }, { "epoch": 2.949498919237571, "grad_norm": 0.20696495473384857, "learning_rate": 5.7946870778928405e-05, "loss": 0.2343, "step": 940 }, { "epoch": 2.952642955393987, "grad_norm": 0.1989220827817917, "learning_rate": 5.790184601530843e-05, "loss": 0.2235, "step": 941 }, { "epoch": 2.955786991550403, "grad_norm": 0.22250813245773315, "learning_rate": 5.785682125168843e-05, "loss": 0.2861, "step": 942 }, { "epoch": 2.958931027706819, "grad_norm": 0.22199483215808868, "learning_rate": 5.781179648806844e-05, "loss": 0.2432, "step": 943 }, { "epoch": 2.9620750638632343, "grad_norm": 0.2106807678937912, "learning_rate": 5.776677172444845e-05, "loss": 0.2427, "step": 944 }, { "epoch": 2.96521910001965, "grad_norm": 0.23767822980880737, "learning_rate": 5.772174696082846e-05, "loss": 0.2757, "step": 945 }, { "epoch": 2.968363136176066, "grad_norm": 0.21560463309288025, "learning_rate": 5.7676722197208464e-05, "loss": 0.2608, "step": 946 }, { "epoch": 2.971507172332482, "grad_norm": 0.21380501985549927, "learning_rate": 5.763169743358847e-05, "loss": 0.2745, "step": 947 }, { "epoch": 2.974651208488898, "grad_norm": 0.2076747864484787, "learning_rate": 5.758667266996849e-05, "loss": 0.2528, "step": 948 }, { "epoch": 2.9777952446453133, "grad_norm": 0.21542812883853912, "learning_rate": 5.7541647906348496e-05, "loss": 0.2559, "step": 949 }, { "epoch": 2.980939280801729, "grad_norm": 0.21929873526096344, "learning_rate": 5.74966231427285e-05, "loss": 0.2566, "step": 950 }, { "epoch": 2.984083316958145, "grad_norm": 0.21829146146774292, "learning_rate": 5.745159837910852e-05, "loss": 0.2719, "step": 951 }, { "epoch": 2.9872273531145606, "grad_norm": 0.2140122950077057, "learning_rate": 5.740657361548852e-05, "loss": 0.2464, "step": 952 }, { "epoch": 2.9903713892709765, "grad_norm": 0.20136047899723053, "learning_rate": 5.736154885186853e-05, "loss": 0.2552, "step": 953 }, { "epoch": 2.9935154254273924, "grad_norm": 0.20880834758281708, "learning_rate": 5.731652408824853e-05, "loss": 0.2601, "step": 954 }, { "epoch": 2.9966594615838082, "grad_norm": 0.21881428360939026, "learning_rate": 5.7271499324628554e-05, "loss": 0.2613, "step": 955 }, { "epoch": 2.999803497740224, "grad_norm": 0.21611551940441132, "learning_rate": 5.7226474561008556e-05, "loss": 0.2651, "step": 956 }, { "epoch": 3.0, "grad_norm": 1.0032835006713867, "learning_rate": 5.7181449797388565e-05, "loss": 0.3132, "step": 957 }, { "epoch": 3.0, "eval_loss": 0.31562528014183044, "eval_runtime": 102.01, "eval_samples_per_second": 12.469, "eval_steps_per_second": 12.469, "step": 957 }, { "epoch": 3.003144036156416, "grad_norm": 0.21193626523017883, "learning_rate": 5.7136425033768573e-05, "loss": 0.2177, "step": 958 }, { "epoch": 3.006288072312832, "grad_norm": 0.2002457082271576, "learning_rate": 5.709140027014859e-05, "loss": 0.2191, "step": 959 }, { "epoch": 3.0094321084692472, "grad_norm": 0.20423737168312073, "learning_rate": 5.704637550652859e-05, "loss": 0.2356, "step": 960 }, { "epoch": 3.012576144625663, "grad_norm": 0.24370385706424713, "learning_rate": 5.70013507429086e-05, "loss": 0.2504, "step": 961 }, { "epoch": 3.015720180782079, "grad_norm": 0.22390393912792206, "learning_rate": 5.6956325979288615e-05, "loss": 0.1943, "step": 962 }, { "epoch": 3.018864216938495, "grad_norm": 0.2506263256072998, "learning_rate": 5.691130121566862e-05, "loss": 0.2223, "step": 963 }, { "epoch": 3.0220082530949104, "grad_norm": 0.27031680941581726, "learning_rate": 5.6866276452048625e-05, "loss": 0.2133, "step": 964 }, { "epoch": 3.0251522892513263, "grad_norm": 0.2697952091693878, "learning_rate": 5.6821251688428634e-05, "loss": 0.2067, "step": 965 }, { "epoch": 3.028296325407742, "grad_norm": 0.2612900137901306, "learning_rate": 5.677622692480865e-05, "loss": 0.219, "step": 966 }, { "epoch": 3.031440361564158, "grad_norm": 0.2544502317905426, "learning_rate": 5.673120216118866e-05, "loss": 0.228, "step": 967 }, { "epoch": 3.034584397720574, "grad_norm": 0.2696615755558014, "learning_rate": 5.6686177397568666e-05, "loss": 0.2272, "step": 968 }, { "epoch": 3.0377284338769894, "grad_norm": 0.2606545090675354, "learning_rate": 5.664115263394868e-05, "loss": 0.2447, "step": 969 }, { "epoch": 3.0408724700334053, "grad_norm": 0.23320814967155457, "learning_rate": 5.659612787032868e-05, "loss": 0.1956, "step": 970 }, { "epoch": 3.0440165061898212, "grad_norm": 0.22663918137550354, "learning_rate": 5.655110310670869e-05, "loss": 0.1847, "step": 971 }, { "epoch": 3.047160542346237, "grad_norm": 0.2516002357006073, "learning_rate": 5.65060783430887e-05, "loss": 0.2081, "step": 972 }, { "epoch": 3.0503045785026526, "grad_norm": 0.23996178805828094, "learning_rate": 5.6461053579468716e-05, "loss": 0.2228, "step": 973 }, { "epoch": 3.0534486146590685, "grad_norm": 0.2666967511177063, "learning_rate": 5.641602881584872e-05, "loss": 0.2132, "step": 974 }, { "epoch": 3.0565926508154844, "grad_norm": 0.25362634658813477, "learning_rate": 5.6371004052228726e-05, "loss": 0.2038, "step": 975 }, { "epoch": 3.0597366869719003, "grad_norm": 0.25967440009117126, "learning_rate": 5.6325979288608735e-05, "loss": 0.2058, "step": 976 }, { "epoch": 3.062880723128316, "grad_norm": 0.2553247809410095, "learning_rate": 5.628095452498875e-05, "loss": 0.2181, "step": 977 }, { "epoch": 3.0660247592847316, "grad_norm": 0.24881340563297272, "learning_rate": 5.623592976136875e-05, "loss": 0.1871, "step": 978 }, { "epoch": 3.0691687954411475, "grad_norm": 0.24760021269321442, "learning_rate": 5.619090499774876e-05, "loss": 0.199, "step": 979 }, { "epoch": 3.0723128315975634, "grad_norm": 0.2702149450778961, "learning_rate": 5.6145880234128776e-05, "loss": 0.214, "step": 980 }, { "epoch": 3.0754568677539793, "grad_norm": 0.26589730381965637, "learning_rate": 5.6100855470508785e-05, "loss": 0.2302, "step": 981 }, { "epoch": 3.0786009039103948, "grad_norm": 0.2599831819534302, "learning_rate": 5.605583070688879e-05, "loss": 0.214, "step": 982 }, { "epoch": 3.0817449400668107, "grad_norm": 0.2595042586326599, "learning_rate": 5.6010805943268795e-05, "loss": 0.2375, "step": 983 }, { "epoch": 3.0848889762232266, "grad_norm": 0.2543151378631592, "learning_rate": 5.596578117964881e-05, "loss": 0.2081, "step": 984 }, { "epoch": 3.0880330123796425, "grad_norm": 0.2398238182067871, "learning_rate": 5.592075641602882e-05, "loss": 0.2057, "step": 985 }, { "epoch": 3.0911770485360583, "grad_norm": 0.26113200187683105, "learning_rate": 5.587573165240883e-05, "loss": 0.245, "step": 986 }, { "epoch": 3.094321084692474, "grad_norm": 0.2740868926048279, "learning_rate": 5.583070688878884e-05, "loss": 0.2249, "step": 987 }, { "epoch": 3.0974651208488897, "grad_norm": 0.25202661752700806, "learning_rate": 5.5785682125168845e-05, "loss": 0.1952, "step": 988 }, { "epoch": 3.1006091570053056, "grad_norm": 0.25760969519615173, "learning_rate": 5.574065736154885e-05, "loss": 0.2199, "step": 989 }, { "epoch": 3.1037531931617215, "grad_norm": 0.27904078364372253, "learning_rate": 5.569563259792886e-05, "loss": 0.2188, "step": 990 }, { "epoch": 3.106897229318137, "grad_norm": 0.26383012533187866, "learning_rate": 5.565060783430888e-05, "loss": 0.2238, "step": 991 }, { "epoch": 3.110041265474553, "grad_norm": 0.27171590924263, "learning_rate": 5.5605583070688886e-05, "loss": 0.2286, "step": 992 }, { "epoch": 3.1131853016309687, "grad_norm": 0.26922065019607544, "learning_rate": 5.556055830706889e-05, "loss": 0.2229, "step": 993 }, { "epoch": 3.1163293377873846, "grad_norm": 0.2586776316165924, "learning_rate": 5.5515533543448896e-05, "loss": 0.2232, "step": 994 }, { "epoch": 3.1194733739438005, "grad_norm": 0.2527298033237457, "learning_rate": 5.547050877982891e-05, "loss": 0.1762, "step": 995 }, { "epoch": 3.122617410100216, "grad_norm": 0.24694758653640747, "learning_rate": 5.542548401620892e-05, "loss": 0.2008, "step": 996 }, { "epoch": 3.125761446256632, "grad_norm": 0.254830926656723, "learning_rate": 5.538045925258892e-05, "loss": 0.2187, "step": 997 }, { "epoch": 3.128905482413048, "grad_norm": 0.2638319730758667, "learning_rate": 5.533543448896894e-05, "loss": 0.218, "step": 998 }, { "epoch": 3.1320495185694637, "grad_norm": 0.24225632846355438, "learning_rate": 5.5290409725348946e-05, "loss": 0.212, "step": 999 }, { "epoch": 3.1351935547258796, "grad_norm": 0.25262516736984253, "learning_rate": 5.5245384961728954e-05, "loss": 0.2089, "step": 1000 }, { "epoch": 3.138337590882295, "grad_norm": 0.2696562707424164, "learning_rate": 5.5200360198108956e-05, "loss": 0.2152, "step": 1001 }, { "epoch": 3.141481627038711, "grad_norm": 0.28393542766571045, "learning_rate": 5.515533543448897e-05, "loss": 0.2312, "step": 1002 }, { "epoch": 3.144625663195127, "grad_norm": 0.2509925663471222, "learning_rate": 5.511031067086898e-05, "loss": 0.1969, "step": 1003 }, { "epoch": 3.1477696993515427, "grad_norm": 0.25587406754493713, "learning_rate": 5.506528590724899e-05, "loss": 0.2054, "step": 1004 }, { "epoch": 3.150913735507958, "grad_norm": 0.2488221526145935, "learning_rate": 5.5020261143629004e-05, "loss": 0.206, "step": 1005 }, { "epoch": 3.154057771664374, "grad_norm": 0.2701071500778198, "learning_rate": 5.497523638000901e-05, "loss": 0.2237, "step": 1006 }, { "epoch": 3.15720180782079, "grad_norm": 0.26221317052841187, "learning_rate": 5.4930211616389015e-05, "loss": 0.2052, "step": 1007 }, { "epoch": 3.160345843977206, "grad_norm": 0.2596054971218109, "learning_rate": 5.488518685276902e-05, "loss": 0.2179, "step": 1008 }, { "epoch": 3.1634898801336213, "grad_norm": 0.2726416289806366, "learning_rate": 5.484016208914904e-05, "loss": 0.2545, "step": 1009 }, { "epoch": 3.166633916290037, "grad_norm": 0.2557564675807953, "learning_rate": 5.479513732552905e-05, "loss": 0.1975, "step": 1010 }, { "epoch": 3.169777952446453, "grad_norm": 0.23446181416511536, "learning_rate": 5.475011256190905e-05, "loss": 0.2037, "step": 1011 }, { "epoch": 3.172921988602869, "grad_norm": 0.2543303966522217, "learning_rate": 5.470508779828906e-05, "loss": 0.2329, "step": 1012 }, { "epoch": 3.176066024759285, "grad_norm": 0.25099271535873413, "learning_rate": 5.466006303466907e-05, "loss": 0.2209, "step": 1013 }, { "epoch": 3.1792100609157004, "grad_norm": 0.2652290463447571, "learning_rate": 5.461503827104908e-05, "loss": 0.2282, "step": 1014 }, { "epoch": 3.1823540970721163, "grad_norm": 0.2743479311466217, "learning_rate": 5.457001350742908e-05, "loss": 0.2223, "step": 1015 }, { "epoch": 3.185498133228532, "grad_norm": 0.26626113057136536, "learning_rate": 5.4524988743809105e-05, "loss": 0.2102, "step": 1016 }, { "epoch": 3.188642169384948, "grad_norm": 0.2583649754524231, "learning_rate": 5.447996398018911e-05, "loss": 0.1925, "step": 1017 }, { "epoch": 3.191786205541364, "grad_norm": 0.26990553736686707, "learning_rate": 5.4434939216569116e-05, "loss": 0.2016, "step": 1018 }, { "epoch": 3.1949302416977794, "grad_norm": 0.27624157071113586, "learning_rate": 5.438991445294912e-05, "loss": 0.2197, "step": 1019 }, { "epoch": 3.1980742778541953, "grad_norm": 0.26293328404426575, "learning_rate": 5.434488968932914e-05, "loss": 0.2031, "step": 1020 }, { "epoch": 3.201218314010611, "grad_norm": 0.26402950286865234, "learning_rate": 5.429986492570914e-05, "loss": 0.1939, "step": 1021 }, { "epoch": 3.204362350167027, "grad_norm": 0.2643432021141052, "learning_rate": 5.425484016208915e-05, "loss": 0.2118, "step": 1022 }, { "epoch": 3.2075063863234425, "grad_norm": 0.2646975815296173, "learning_rate": 5.4209815398469166e-05, "loss": 0.2234, "step": 1023 }, { "epoch": 3.2106504224798584, "grad_norm": 0.254766583442688, "learning_rate": 5.4164790634849174e-05, "loss": 0.2056, "step": 1024 }, { "epoch": 3.2137944586362743, "grad_norm": 0.25292158126831055, "learning_rate": 5.4119765871229176e-05, "loss": 0.1927, "step": 1025 }, { "epoch": 3.2169384947926902, "grad_norm": 0.25392162799835205, "learning_rate": 5.4074741107609185e-05, "loss": 0.2184, "step": 1026 }, { "epoch": 3.2200825309491057, "grad_norm": 0.26445361971855164, "learning_rate": 5.40297163439892e-05, "loss": 0.2199, "step": 1027 }, { "epoch": 3.2232265671055216, "grad_norm": 0.26420584321022034, "learning_rate": 5.398469158036921e-05, "loss": 0.2217, "step": 1028 }, { "epoch": 3.2263706032619375, "grad_norm": 0.2616407871246338, "learning_rate": 5.393966681674921e-05, "loss": 0.232, "step": 1029 }, { "epoch": 3.2295146394183534, "grad_norm": 0.2585027515888214, "learning_rate": 5.389464205312922e-05, "loss": 0.2198, "step": 1030 }, { "epoch": 3.2326586755747693, "grad_norm": 0.27603375911712646, "learning_rate": 5.3849617289509234e-05, "loss": 0.2468, "step": 1031 }, { "epoch": 3.2358027117311847, "grad_norm": 0.2667909860610962, "learning_rate": 5.380459252588924e-05, "loss": 0.2204, "step": 1032 }, { "epoch": 3.2389467478876006, "grad_norm": 0.2541857063770294, "learning_rate": 5.3759567762269245e-05, "loss": 0.2094, "step": 1033 }, { "epoch": 3.2420907840440165, "grad_norm": 0.2743613123893738, "learning_rate": 5.371454299864927e-05, "loss": 0.2287, "step": 1034 }, { "epoch": 3.2452348202004324, "grad_norm": 0.2608887851238251, "learning_rate": 5.366951823502927e-05, "loss": 0.2131, "step": 1035 }, { "epoch": 3.2483788563568483, "grad_norm": 0.2794705331325531, "learning_rate": 5.362449347140928e-05, "loss": 0.2494, "step": 1036 }, { "epoch": 3.2515228925132638, "grad_norm": 0.268891304731369, "learning_rate": 5.357946870778928e-05, "loss": 0.2355, "step": 1037 }, { "epoch": 3.2546669286696797, "grad_norm": 0.2710425853729248, "learning_rate": 5.35344439441693e-05, "loss": 0.2035, "step": 1038 }, { "epoch": 3.2578109648260956, "grad_norm": 0.25012078881263733, "learning_rate": 5.34894191805493e-05, "loss": 0.1982, "step": 1039 }, { "epoch": 3.2609550009825115, "grad_norm": 0.2515929639339447, "learning_rate": 5.344439441692931e-05, "loss": 0.2132, "step": 1040 }, { "epoch": 3.264099037138927, "grad_norm": 0.25890979170799255, "learning_rate": 5.339936965330933e-05, "loss": 0.2207, "step": 1041 }, { "epoch": 3.267243073295343, "grad_norm": 0.25978735089302063, "learning_rate": 5.3354344889689335e-05, "loss": 0.2059, "step": 1042 }, { "epoch": 3.2703871094517587, "grad_norm": 0.277829647064209, "learning_rate": 5.330932012606934e-05, "loss": 0.2201, "step": 1043 }, { "epoch": 3.2735311456081746, "grad_norm": 0.2831403315067291, "learning_rate": 5.3264295362449346e-05, "loss": 0.2328, "step": 1044 }, { "epoch": 3.27667518176459, "grad_norm": 0.268534392118454, "learning_rate": 5.321927059882936e-05, "loss": 0.2147, "step": 1045 }, { "epoch": 3.279819217921006, "grad_norm": 0.25797542929649353, "learning_rate": 5.317424583520937e-05, "loss": 0.2329, "step": 1046 }, { "epoch": 3.282963254077422, "grad_norm": 0.2753913998603821, "learning_rate": 5.312922107158937e-05, "loss": 0.2252, "step": 1047 }, { "epoch": 3.2861072902338377, "grad_norm": 0.27206990122795105, "learning_rate": 5.308419630796938e-05, "loss": 0.2167, "step": 1048 }, { "epoch": 3.2892513263902536, "grad_norm": 0.2592896819114685, "learning_rate": 5.3039171544349396e-05, "loss": 0.2141, "step": 1049 }, { "epoch": 3.292395362546669, "grad_norm": 0.25398609042167664, "learning_rate": 5.2994146780729404e-05, "loss": 0.1978, "step": 1050 }, { "epoch": 3.295539398703085, "grad_norm": 0.2543790638446808, "learning_rate": 5.2949122017109406e-05, "loss": 0.2027, "step": 1051 }, { "epoch": 3.298683434859501, "grad_norm": 0.27710023522377014, "learning_rate": 5.290409725348943e-05, "loss": 0.2124, "step": 1052 }, { "epoch": 3.301827471015917, "grad_norm": 0.28544339537620544, "learning_rate": 5.285907248986943e-05, "loss": 0.2265, "step": 1053 }, { "epoch": 3.3049715071723327, "grad_norm": 0.27211806178092957, "learning_rate": 5.281404772624944e-05, "loss": 0.2329, "step": 1054 }, { "epoch": 3.308115543328748, "grad_norm": 0.2547142207622528, "learning_rate": 5.276902296262945e-05, "loss": 0.196, "step": 1055 }, { "epoch": 3.311259579485164, "grad_norm": 0.2728598415851593, "learning_rate": 5.272399819900946e-05, "loss": 0.2417, "step": 1056 }, { "epoch": 3.31440361564158, "grad_norm": 0.29068490862846375, "learning_rate": 5.2678973435389464e-05, "loss": 0.2432, "step": 1057 }, { "epoch": 3.317547651797996, "grad_norm": 0.25328272581100464, "learning_rate": 5.263394867176947e-05, "loss": 0.2254, "step": 1058 }, { "epoch": 3.3206916879544117, "grad_norm": 0.25226205587387085, "learning_rate": 5.258892390814949e-05, "loss": 0.2098, "step": 1059 }, { "epoch": 3.323835724110827, "grad_norm": 0.24329794943332672, "learning_rate": 5.25438991445295e-05, "loss": 0.2003, "step": 1060 }, { "epoch": 3.326979760267243, "grad_norm": 0.261622816324234, "learning_rate": 5.24988743809095e-05, "loss": 0.2362, "step": 1061 }, { "epoch": 3.330123796423659, "grad_norm": 0.24722333252429962, "learning_rate": 5.245384961728951e-05, "loss": 0.2133, "step": 1062 }, { "epoch": 3.333267832580075, "grad_norm": 0.26223739981651306, "learning_rate": 5.240882485366952e-05, "loss": 0.2089, "step": 1063 }, { "epoch": 3.3364118687364903, "grad_norm": 0.26595374941825867, "learning_rate": 5.236380009004953e-05, "loss": 0.2198, "step": 1064 }, { "epoch": 3.339555904892906, "grad_norm": 0.2527714669704437, "learning_rate": 5.231877532642954e-05, "loss": 0.2279, "step": 1065 }, { "epoch": 3.342699941049322, "grad_norm": 0.2748038172721863, "learning_rate": 5.227375056280954e-05, "loss": 0.2184, "step": 1066 }, { "epoch": 3.345843977205738, "grad_norm": 0.27844253182411194, "learning_rate": 5.222872579918956e-05, "loss": 0.2286, "step": 1067 }, { "epoch": 3.3489880133621535, "grad_norm": 0.2651500105857849, "learning_rate": 5.2183701035569566e-05, "loss": 0.2111, "step": 1068 }, { "epoch": 3.3521320495185694, "grad_norm": 0.28038156032562256, "learning_rate": 5.2138676271949574e-05, "loss": 0.2437, "step": 1069 }, { "epoch": 3.3552760856749853, "grad_norm": 0.2683061361312866, "learning_rate": 5.209365150832959e-05, "loss": 0.2126, "step": 1070 }, { "epoch": 3.358420121831401, "grad_norm": 0.2660753130912781, "learning_rate": 5.204862674470959e-05, "loss": 0.2074, "step": 1071 }, { "epoch": 3.361564157987817, "grad_norm": 0.2589535117149353, "learning_rate": 5.20036019810896e-05, "loss": 0.2091, "step": 1072 }, { "epoch": 3.3647081941442325, "grad_norm": 0.2742885947227478, "learning_rate": 5.195857721746961e-05, "loss": 0.2318, "step": 1073 }, { "epoch": 3.3678522303006484, "grad_norm": 0.26752617955207825, "learning_rate": 5.1913552453849624e-05, "loss": 0.2071, "step": 1074 }, { "epoch": 3.3709962664570643, "grad_norm": 0.26302826404571533, "learning_rate": 5.186852769022963e-05, "loss": 0.2159, "step": 1075 }, { "epoch": 3.37414030261348, "grad_norm": 0.2693713903427124, "learning_rate": 5.1823502926609634e-05, "loss": 0.2219, "step": 1076 }, { "epoch": 3.377284338769896, "grad_norm": 0.2816333472728729, "learning_rate": 5.177847816298965e-05, "loss": 0.2398, "step": 1077 }, { "epoch": 3.3804283749263115, "grad_norm": 0.26509806513786316, "learning_rate": 5.173345339936966e-05, "loss": 0.2179, "step": 1078 }, { "epoch": 3.3835724110827274, "grad_norm": 0.26126453280448914, "learning_rate": 5.168842863574967e-05, "loss": 0.2123, "step": 1079 }, { "epoch": 3.3867164472391433, "grad_norm": 0.26438355445861816, "learning_rate": 5.164340387212967e-05, "loss": 0.2038, "step": 1080 }, { "epoch": 3.3898604833955592, "grad_norm": 0.27625972032546997, "learning_rate": 5.1598379108509684e-05, "loss": 0.2293, "step": 1081 }, { "epoch": 3.3930045195519747, "grad_norm": 0.2709941565990448, "learning_rate": 5.155335434488969e-05, "loss": 0.2233, "step": 1082 }, { "epoch": 3.3961485557083906, "grad_norm": 0.2658209800720215, "learning_rate": 5.15083295812697e-05, "loss": 0.2131, "step": 1083 }, { "epoch": 3.3992925918648065, "grad_norm": 0.26393112540245056, "learning_rate": 5.14633048176497e-05, "loss": 0.2362, "step": 1084 }, { "epoch": 3.4024366280212224, "grad_norm": 0.24963341653347015, "learning_rate": 5.141828005402972e-05, "loss": 0.2324, "step": 1085 }, { "epoch": 3.405580664177638, "grad_norm": 0.2534169554710388, "learning_rate": 5.137325529040973e-05, "loss": 0.2113, "step": 1086 }, { "epoch": 3.4087247003340537, "grad_norm": 0.27559417486190796, "learning_rate": 5.1328230526789736e-05, "loss": 0.2184, "step": 1087 }, { "epoch": 3.4118687364904696, "grad_norm": 0.25605255365371704, "learning_rate": 5.128320576316975e-05, "loss": 0.2098, "step": 1088 }, { "epoch": 3.4150127726468855, "grad_norm": 0.27292224764823914, "learning_rate": 5.123818099954976e-05, "loss": 0.228, "step": 1089 }, { "epoch": 3.4181568088033014, "grad_norm": 0.28134986758232117, "learning_rate": 5.119315623592976e-05, "loss": 0.2203, "step": 1090 }, { "epoch": 3.421300844959717, "grad_norm": 0.2759127914905548, "learning_rate": 5.114813147230977e-05, "loss": 0.2225, "step": 1091 }, { "epoch": 3.4244448811161328, "grad_norm": 0.2747078537940979, "learning_rate": 5.1103106708689785e-05, "loss": 0.218, "step": 1092 }, { "epoch": 3.4275889172725487, "grad_norm": 0.2804114520549774, "learning_rate": 5.1058081945069794e-05, "loss": 0.2378, "step": 1093 }, { "epoch": 3.4307329534289646, "grad_norm": 0.27649229764938354, "learning_rate": 5.1013057181449796e-05, "loss": 0.2284, "step": 1094 }, { "epoch": 3.4338769895853805, "grad_norm": 0.27862462401390076, "learning_rate": 5.096803241782981e-05, "loss": 0.2362, "step": 1095 }, { "epoch": 3.437021025741796, "grad_norm": 0.26648539304733276, "learning_rate": 5.092300765420982e-05, "loss": 0.2332, "step": 1096 }, { "epoch": 3.440165061898212, "grad_norm": 0.2648904621601105, "learning_rate": 5.087798289058983e-05, "loss": 0.2054, "step": 1097 }, { "epoch": 3.4433090980546277, "grad_norm": 0.2688305377960205, "learning_rate": 5.083295812696983e-05, "loss": 0.2271, "step": 1098 }, { "epoch": 3.4464531342110436, "grad_norm": 0.2604798972606659, "learning_rate": 5.078793336334985e-05, "loss": 0.2202, "step": 1099 }, { "epoch": 3.449597170367459, "grad_norm": 0.25735729932785034, "learning_rate": 5.0742908599729854e-05, "loss": 0.2135, "step": 1100 }, { "epoch": 3.452741206523875, "grad_norm": 0.2655589282512665, "learning_rate": 5.069788383610986e-05, "loss": 0.2215, "step": 1101 }, { "epoch": 3.455885242680291, "grad_norm": 0.27245640754699707, "learning_rate": 5.0652859072489864e-05, "loss": 0.2058, "step": 1102 }, { "epoch": 3.4590292788367067, "grad_norm": 0.26700568199157715, "learning_rate": 5.0607834308869886e-05, "loss": 0.2209, "step": 1103 }, { "epoch": 3.462173314993122, "grad_norm": 0.28836384415626526, "learning_rate": 5.056280954524989e-05, "loss": 0.219, "step": 1104 }, { "epoch": 3.465317351149538, "grad_norm": 0.25990161299705505, "learning_rate": 5.05177847816299e-05, "loss": 0.2087, "step": 1105 }, { "epoch": 3.468461387305954, "grad_norm": 0.26433536410331726, "learning_rate": 5.047276001800991e-05, "loss": 0.2169, "step": 1106 }, { "epoch": 3.47160542346237, "grad_norm": 0.2681952118873596, "learning_rate": 5.042773525438992e-05, "loss": 0.226, "step": 1107 }, { "epoch": 3.474749459618786, "grad_norm": 0.26302868127822876, "learning_rate": 5.038271049076992e-05, "loss": 0.2167, "step": 1108 }, { "epoch": 3.4778934957752012, "grad_norm": 0.24854496121406555, "learning_rate": 5.033768572714993e-05, "loss": 0.2044, "step": 1109 }, { "epoch": 3.481037531931617, "grad_norm": 0.26921898126602173, "learning_rate": 5.0292660963529947e-05, "loss": 0.2189, "step": 1110 }, { "epoch": 3.484181568088033, "grad_norm": 0.2762416899204254, "learning_rate": 5.0247636199909955e-05, "loss": 0.2324, "step": 1111 }, { "epoch": 3.487325604244449, "grad_norm": 0.2657710611820221, "learning_rate": 5.020261143628996e-05, "loss": 0.2095, "step": 1112 }, { "epoch": 3.490469640400865, "grad_norm": 0.2780584990978241, "learning_rate": 5.015758667266998e-05, "loss": 0.2193, "step": 1113 }, { "epoch": 3.4936136765572803, "grad_norm": 0.27477025985717773, "learning_rate": 5.011256190904998e-05, "loss": 0.2113, "step": 1114 }, { "epoch": 3.496757712713696, "grad_norm": 0.2667027413845062, "learning_rate": 5.006753714542999e-05, "loss": 0.2132, "step": 1115 }, { "epoch": 3.499901748870112, "grad_norm": 0.26994743943214417, "learning_rate": 5.002251238180999e-05, "loss": 0.2166, "step": 1116 }, { "epoch": 3.503045785026528, "grad_norm": 0.2543928325176239, "learning_rate": 4.997748761819001e-05, "loss": 0.1925, "step": 1117 }, { "epoch": 3.506189821182944, "grad_norm": 0.26968643069267273, "learning_rate": 4.9932462854570015e-05, "loss": 0.2244, "step": 1118 }, { "epoch": 3.5093338573393593, "grad_norm": 0.26054245233535767, "learning_rate": 4.9887438090950024e-05, "loss": 0.2196, "step": 1119 }, { "epoch": 3.512477893495775, "grad_norm": 0.2509452700614929, "learning_rate": 4.984241332733003e-05, "loss": 0.2132, "step": 1120 }, { "epoch": 3.515621929652191, "grad_norm": 0.27598923444747925, "learning_rate": 4.979738856371005e-05, "loss": 0.2218, "step": 1121 }, { "epoch": 3.5187659658086066, "grad_norm": 0.25189635157585144, "learning_rate": 4.975236380009005e-05, "loss": 0.1974, "step": 1122 }, { "epoch": 3.5219100019650225, "grad_norm": 0.24723222851753235, "learning_rate": 4.9707339036470065e-05, "loss": 0.2056, "step": 1123 }, { "epoch": 3.5250540381214384, "grad_norm": 0.2754393517971039, "learning_rate": 4.966231427285007e-05, "loss": 0.233, "step": 1124 }, { "epoch": 3.5281980742778543, "grad_norm": 0.28501662611961365, "learning_rate": 4.961728950923008e-05, "loss": 0.222, "step": 1125 }, { "epoch": 3.53134211043427, "grad_norm": 0.29177454113960266, "learning_rate": 4.9572264745610084e-05, "loss": 0.2388, "step": 1126 }, { "epoch": 3.5344861465906856, "grad_norm": 0.28808361291885376, "learning_rate": 4.95272399819901e-05, "loss": 0.2181, "step": 1127 }, { "epoch": 3.5376301827471015, "grad_norm": 0.28812456130981445, "learning_rate": 4.94822152183701e-05, "loss": 0.244, "step": 1128 }, { "epoch": 3.5407742189035174, "grad_norm": 0.2853076756000519, "learning_rate": 4.9437190454750117e-05, "loss": 0.2333, "step": 1129 }, { "epoch": 3.5439182550599333, "grad_norm": 0.3009452819824219, "learning_rate": 4.9392165691130125e-05, "loss": 0.2324, "step": 1130 }, { "epoch": 3.547062291216349, "grad_norm": 0.2839139401912689, "learning_rate": 4.9347140927510134e-05, "loss": 0.2334, "step": 1131 }, { "epoch": 3.5502063273727646, "grad_norm": 0.2858094573020935, "learning_rate": 4.930211616389014e-05, "loss": 0.2348, "step": 1132 }, { "epoch": 3.5533503635291805, "grad_norm": 0.2753075957298279, "learning_rate": 4.925709140027015e-05, "loss": 0.2563, "step": 1133 }, { "epoch": 3.5564943996855964, "grad_norm": 0.27349305152893066, "learning_rate": 4.921206663665016e-05, "loss": 0.2144, "step": 1134 }, { "epoch": 3.5596384358420123, "grad_norm": 0.28366997838020325, "learning_rate": 4.916704187303017e-05, "loss": 0.2134, "step": 1135 }, { "epoch": 3.5627824719984282, "grad_norm": 0.26744216680526733, "learning_rate": 4.912201710941018e-05, "loss": 0.2364, "step": 1136 }, { "epoch": 3.5659265081548437, "grad_norm": 0.26940158009529114, "learning_rate": 4.9076992345790185e-05, "loss": 0.2255, "step": 1137 }, { "epoch": 3.5690705443112596, "grad_norm": 0.27945220470428467, "learning_rate": 4.9031967582170194e-05, "loss": 0.2396, "step": 1138 }, { "epoch": 3.5722145804676755, "grad_norm": 0.2586713433265686, "learning_rate": 4.898694281855021e-05, "loss": 0.2191, "step": 1139 }, { "epoch": 3.575358616624091, "grad_norm": 0.25625908374786377, "learning_rate": 4.894191805493021e-05, "loss": 0.2207, "step": 1140 }, { "epoch": 3.578502652780507, "grad_norm": 0.2548212707042694, "learning_rate": 4.8896893291310226e-05, "loss": 0.2149, "step": 1141 }, { "epoch": 3.5816466889369227, "grad_norm": 0.25424882769584656, "learning_rate": 4.885186852769023e-05, "loss": 0.1972, "step": 1142 }, { "epoch": 3.5847907250933386, "grad_norm": 0.2918783128261566, "learning_rate": 4.8806843764070244e-05, "loss": 0.2223, "step": 1143 }, { "epoch": 3.5879347612497545, "grad_norm": 0.2688378691673279, "learning_rate": 4.8761819000450245e-05, "loss": 0.2224, "step": 1144 }, { "epoch": 3.59107879740617, "grad_norm": 0.2554541230201721, "learning_rate": 4.871679423683026e-05, "loss": 0.2054, "step": 1145 }, { "epoch": 3.594222833562586, "grad_norm": 0.2563970386981964, "learning_rate": 4.867176947321026e-05, "loss": 0.2142, "step": 1146 }, { "epoch": 3.5973668697190018, "grad_norm": 0.2660623788833618, "learning_rate": 4.862674470959028e-05, "loss": 0.2299, "step": 1147 }, { "epoch": 3.6005109058754177, "grad_norm": 0.2669471800327301, "learning_rate": 4.8581719945970286e-05, "loss": 0.2178, "step": 1148 }, { "epoch": 3.6036549420318336, "grad_norm": 0.26192253828048706, "learning_rate": 4.8536695182350295e-05, "loss": 0.2085, "step": 1149 }, { "epoch": 3.606798978188249, "grad_norm": 0.28995901346206665, "learning_rate": 4.8491670418730304e-05, "loss": 0.2243, "step": 1150 }, { "epoch": 3.609943014344665, "grad_norm": 0.27581891417503357, "learning_rate": 4.844664565511031e-05, "loss": 0.2264, "step": 1151 }, { "epoch": 3.613087050501081, "grad_norm": 0.26228320598602295, "learning_rate": 4.840162089149032e-05, "loss": 0.1932, "step": 1152 }, { "epoch": 3.6162310866574967, "grad_norm": 0.25524231791496277, "learning_rate": 4.835659612787033e-05, "loss": 0.193, "step": 1153 }, { "epoch": 3.6193751228139126, "grad_norm": 0.27221062779426575, "learning_rate": 4.831157136425034e-05, "loss": 0.2255, "step": 1154 }, { "epoch": 3.622519158970328, "grad_norm": 0.2663659155368805, "learning_rate": 4.8266546600630347e-05, "loss": 0.2115, "step": 1155 }, { "epoch": 3.625663195126744, "grad_norm": 0.2815920412540436, "learning_rate": 4.8221521837010355e-05, "loss": 0.2362, "step": 1156 }, { "epoch": 3.62880723128316, "grad_norm": 0.2612375020980835, "learning_rate": 4.817649707339037e-05, "loss": 0.2125, "step": 1157 }, { "epoch": 3.6319512674395753, "grad_norm": 0.278981477022171, "learning_rate": 4.813147230977037e-05, "loss": 0.2335, "step": 1158 }, { "epoch": 3.635095303595991, "grad_norm": 0.26510703563690186, "learning_rate": 4.808644754615039e-05, "loss": 0.2108, "step": 1159 }, { "epoch": 3.638239339752407, "grad_norm": 0.2480216771364212, "learning_rate": 4.8041422782530396e-05, "loss": 0.1921, "step": 1160 }, { "epoch": 3.641383375908823, "grad_norm": 0.27259308099746704, "learning_rate": 4.7996398018910405e-05, "loss": 0.2316, "step": 1161 }, { "epoch": 3.644527412065239, "grad_norm": 0.2570619583129883, "learning_rate": 4.7951373255290413e-05, "loss": 0.2157, "step": 1162 }, { "epoch": 3.6476714482216543, "grad_norm": 0.27665191888809204, "learning_rate": 4.790634849167042e-05, "loss": 0.2066, "step": 1163 }, { "epoch": 3.6508154843780702, "grad_norm": 0.274823933839798, "learning_rate": 4.786132372805043e-05, "loss": 0.2353, "step": 1164 }, { "epoch": 3.653959520534486, "grad_norm": 0.29660841822624207, "learning_rate": 4.781629896443044e-05, "loss": 0.2176, "step": 1165 }, { "epoch": 3.657103556690902, "grad_norm": 0.2792792022228241, "learning_rate": 4.777127420081045e-05, "loss": 0.2407, "step": 1166 }, { "epoch": 3.660247592847318, "grad_norm": 0.2719855308532715, "learning_rate": 4.7726249437190456e-05, "loss": 0.2113, "step": 1167 }, { "epoch": 3.6633916290037334, "grad_norm": 0.26791417598724365, "learning_rate": 4.7681224673570465e-05, "loss": 0.2306, "step": 1168 }, { "epoch": 3.6665356651601493, "grad_norm": 0.259931355714798, "learning_rate": 4.7636199909950474e-05, "loss": 0.2081, "step": 1169 }, { "epoch": 3.669679701316565, "grad_norm": 0.2692759037017822, "learning_rate": 4.759117514633049e-05, "loss": 0.2142, "step": 1170 }, { "epoch": 3.672823737472981, "grad_norm": 0.25810664892196655, "learning_rate": 4.754615038271049e-05, "loss": 0.2072, "step": 1171 }, { "epoch": 3.675967773629397, "grad_norm": 0.27101725339889526, "learning_rate": 4.7501125619090506e-05, "loss": 0.2115, "step": 1172 }, { "epoch": 3.6791118097858124, "grad_norm": 0.2799369990825653, "learning_rate": 4.745610085547051e-05, "loss": 0.2227, "step": 1173 }, { "epoch": 3.6822558459422283, "grad_norm": 0.2759827673435211, "learning_rate": 4.741107609185052e-05, "loss": 0.2156, "step": 1174 }, { "epoch": 3.685399882098644, "grad_norm": 0.27416959404945374, "learning_rate": 4.736605132823053e-05, "loss": 0.2147, "step": 1175 }, { "epoch": 3.6885439182550597, "grad_norm": 0.26900628209114075, "learning_rate": 4.732102656461054e-05, "loss": 0.1928, "step": 1176 }, { "epoch": 3.6916879544114756, "grad_norm": 0.2725447416305542, "learning_rate": 4.727600180099055e-05, "loss": 0.2174, "step": 1177 }, { "epoch": 3.6948319905678915, "grad_norm": 0.2719217836856842, "learning_rate": 4.723097703737056e-05, "loss": 0.221, "step": 1178 }, { "epoch": 3.6979760267243074, "grad_norm": 0.2676503658294678, "learning_rate": 4.7185952273750566e-05, "loss": 0.221, "step": 1179 }, { "epoch": 3.7011200628807233, "grad_norm": 0.2713521420955658, "learning_rate": 4.7140927510130575e-05, "loss": 0.1981, "step": 1180 }, { "epoch": 3.7042640990371387, "grad_norm": 0.26955726742744446, "learning_rate": 4.7095902746510583e-05, "loss": 0.2241, "step": 1181 }, { "epoch": 3.7074081351935546, "grad_norm": 0.2783749997615814, "learning_rate": 4.705087798289059e-05, "loss": 0.23, "step": 1182 }, { "epoch": 3.7105521713499705, "grad_norm": 0.2713487446308136, "learning_rate": 4.70058532192706e-05, "loss": 0.2364, "step": 1183 }, { "epoch": 3.7136962075063864, "grad_norm": 0.26244601607322693, "learning_rate": 4.6960828455650616e-05, "loss": 0.2208, "step": 1184 }, { "epoch": 3.7168402436628023, "grad_norm": 0.2538108825683594, "learning_rate": 4.691580369203062e-05, "loss": 0.1999, "step": 1185 }, { "epoch": 3.7199842798192178, "grad_norm": 0.2600885033607483, "learning_rate": 4.687077892841063e-05, "loss": 0.2126, "step": 1186 }, { "epoch": 3.7231283159756337, "grad_norm": 0.27090543508529663, "learning_rate": 4.6825754164790635e-05, "loss": 0.2109, "step": 1187 }, { "epoch": 3.7262723521320495, "grad_norm": 0.28523942828178406, "learning_rate": 4.678072940117065e-05, "loss": 0.2082, "step": 1188 }, { "epoch": 3.7294163882884654, "grad_norm": 0.2704015374183655, "learning_rate": 4.673570463755065e-05, "loss": 0.2116, "step": 1189 }, { "epoch": 3.7325604244448813, "grad_norm": 0.25477057695388794, "learning_rate": 4.669067987393067e-05, "loss": 0.1868, "step": 1190 }, { "epoch": 3.735704460601297, "grad_norm": 0.29762375354766846, "learning_rate": 4.664565511031067e-05, "loss": 0.2407, "step": 1191 }, { "epoch": 3.7388484967577127, "grad_norm": 0.2843954563140869, "learning_rate": 4.6600630346690685e-05, "loss": 0.2145, "step": 1192 }, { "epoch": 3.7419925329141286, "grad_norm": 0.2750847339630127, "learning_rate": 4.655560558307069e-05, "loss": 0.2272, "step": 1193 }, { "epoch": 3.745136569070544, "grad_norm": 0.28124505281448364, "learning_rate": 4.65105808194507e-05, "loss": 0.2008, "step": 1194 }, { "epoch": 3.7482806052269604, "grad_norm": 0.2918599247932434, "learning_rate": 4.646555605583071e-05, "loss": 0.2137, "step": 1195 }, { "epoch": 3.751424641383376, "grad_norm": 0.2769639492034912, "learning_rate": 4.642053129221072e-05, "loss": 0.2218, "step": 1196 }, { "epoch": 3.7545686775397917, "grad_norm": 0.27769285440444946, "learning_rate": 4.637550652859073e-05, "loss": 0.2288, "step": 1197 }, { "epoch": 3.7577127136962076, "grad_norm": 0.27250128984451294, "learning_rate": 4.6330481764970736e-05, "loss": 0.2321, "step": 1198 }, { "epoch": 3.760856749852623, "grad_norm": 0.2540079355239868, "learning_rate": 4.6285457001350745e-05, "loss": 0.2056, "step": 1199 }, { "epoch": 3.764000786009039, "grad_norm": 0.27811941504478455, "learning_rate": 4.624043223773075e-05, "loss": 0.2296, "step": 1200 }, { "epoch": 3.767144822165455, "grad_norm": 0.27619028091430664, "learning_rate": 4.619540747411076e-05, "loss": 0.2222, "step": 1201 }, { "epoch": 3.7702888583218708, "grad_norm": 0.2599914073944092, "learning_rate": 4.615038271049078e-05, "loss": 0.2035, "step": 1202 }, { "epoch": 3.7734328944782867, "grad_norm": 0.27861905097961426, "learning_rate": 4.610535794687078e-05, "loss": 0.2413, "step": 1203 }, { "epoch": 3.776576930634702, "grad_norm": 0.2873072028160095, "learning_rate": 4.6060333183250794e-05, "loss": 0.2281, "step": 1204 }, { "epoch": 3.779720966791118, "grad_norm": 0.2840508818626404, "learning_rate": 4.6015308419630796e-05, "loss": 0.2295, "step": 1205 }, { "epoch": 3.782865002947534, "grad_norm": 0.2830098271369934, "learning_rate": 4.597028365601081e-05, "loss": 0.23, "step": 1206 }, { "epoch": 3.78600903910395, "grad_norm": 0.2736683487892151, "learning_rate": 4.5925258892390813e-05, "loss": 0.2041, "step": 1207 }, { "epoch": 3.7891530752603657, "grad_norm": 0.2993437647819519, "learning_rate": 4.588023412877083e-05, "loss": 0.256, "step": 1208 }, { "epoch": 3.792297111416781, "grad_norm": 0.279880553483963, "learning_rate": 4.583520936515083e-05, "loss": 0.2123, "step": 1209 }, { "epoch": 3.795441147573197, "grad_norm": 0.27665743231773376, "learning_rate": 4.5790184601530846e-05, "loss": 0.2178, "step": 1210 }, { "epoch": 3.798585183729613, "grad_norm": 0.28740230202674866, "learning_rate": 4.5745159837910855e-05, "loss": 0.2184, "step": 1211 }, { "epoch": 3.801729219886029, "grad_norm": 0.27269086241722107, "learning_rate": 4.570013507429086e-05, "loss": 0.2365, "step": 1212 }, { "epoch": 3.8048732560424448, "grad_norm": 0.28101804852485657, "learning_rate": 4.565511031067087e-05, "loss": 0.2203, "step": 1213 }, { "epoch": 3.80801729219886, "grad_norm": 0.2693193256855011, "learning_rate": 4.561008554705088e-05, "loss": 0.2177, "step": 1214 }, { "epoch": 3.811161328355276, "grad_norm": 0.27970948815345764, "learning_rate": 4.556506078343089e-05, "loss": 0.2394, "step": 1215 }, { "epoch": 3.814305364511692, "grad_norm": 0.2619451880455017, "learning_rate": 4.55200360198109e-05, "loss": 0.2038, "step": 1216 }, { "epoch": 3.8174494006681075, "grad_norm": 0.27357161045074463, "learning_rate": 4.5475011256190906e-05, "loss": 0.2249, "step": 1217 }, { "epoch": 3.8205934368245233, "grad_norm": 0.2673509120941162, "learning_rate": 4.5429986492570915e-05, "loss": 0.2157, "step": 1218 }, { "epoch": 3.8237374729809392, "grad_norm": 0.2881917357444763, "learning_rate": 4.538496172895092e-05, "loss": 0.2417, "step": 1219 }, { "epoch": 3.826881509137355, "grad_norm": 0.26993855834007263, "learning_rate": 4.533993696533094e-05, "loss": 0.2053, "step": 1220 }, { "epoch": 3.830025545293771, "grad_norm": 0.2821698486804962, "learning_rate": 4.529491220171094e-05, "loss": 0.2189, "step": 1221 }, { "epoch": 3.8331695814501865, "grad_norm": 0.26443150639533997, "learning_rate": 4.5249887438090956e-05, "loss": 0.2033, "step": 1222 }, { "epoch": 3.8363136176066024, "grad_norm": 0.2921953499317169, "learning_rate": 4.520486267447096e-05, "loss": 0.2151, "step": 1223 }, { "epoch": 3.8394576537630183, "grad_norm": 0.2694004774093628, "learning_rate": 4.515983791085097e-05, "loss": 0.2002, "step": 1224 }, { "epoch": 3.842601689919434, "grad_norm": 0.2816608250141144, "learning_rate": 4.5114813147230975e-05, "loss": 0.2136, "step": 1225 }, { "epoch": 3.84574572607585, "grad_norm": 0.2641552984714508, "learning_rate": 4.506978838361099e-05, "loss": 0.2171, "step": 1226 }, { "epoch": 3.8488897622322655, "grad_norm": 0.2562899589538574, "learning_rate": 4.502476361999099e-05, "loss": 0.2042, "step": 1227 }, { "epoch": 3.8520337983886814, "grad_norm": 0.25968772172927856, "learning_rate": 4.497973885637101e-05, "loss": 0.1902, "step": 1228 }, { "epoch": 3.8551778345450973, "grad_norm": 0.30985110998153687, "learning_rate": 4.4934714092751016e-05, "loss": 0.2361, "step": 1229 }, { "epoch": 3.8583218707015132, "grad_norm": 0.25821468234062195, "learning_rate": 4.4889689329131025e-05, "loss": 0.214, "step": 1230 }, { "epoch": 3.861465906857929, "grad_norm": 0.29691803455352783, "learning_rate": 4.484466456551103e-05, "loss": 0.2414, "step": 1231 }, { "epoch": 3.8646099430143446, "grad_norm": 0.2855352759361267, "learning_rate": 4.479963980189104e-05, "loss": 0.2424, "step": 1232 }, { "epoch": 3.8677539791707605, "grad_norm": 0.27892571687698364, "learning_rate": 4.475461503827105e-05, "loss": 0.2132, "step": 1233 }, { "epoch": 3.8708980153271764, "grad_norm": 0.25937420129776, "learning_rate": 4.470959027465106e-05, "loss": 0.2019, "step": 1234 }, { "epoch": 3.874042051483592, "grad_norm": 0.25848716497421265, "learning_rate": 4.466456551103107e-05, "loss": 0.2085, "step": 1235 }, { "epoch": 3.8771860876400077, "grad_norm": 0.2610270082950592, "learning_rate": 4.4619540747411076e-05, "loss": 0.2028, "step": 1236 }, { "epoch": 3.8803301237964236, "grad_norm": 0.27180153131484985, "learning_rate": 4.4574515983791085e-05, "loss": 0.2218, "step": 1237 }, { "epoch": 3.8834741599528395, "grad_norm": 0.2885635793209076, "learning_rate": 4.45294912201711e-05, "loss": 0.2686, "step": 1238 }, { "epoch": 3.8866181961092554, "grad_norm": 0.2911549210548401, "learning_rate": 4.44844664565511e-05, "loss": 0.2309, "step": 1239 }, { "epoch": 3.889762232265671, "grad_norm": 0.27582037448883057, "learning_rate": 4.443944169293112e-05, "loss": 0.2039, "step": 1240 }, { "epoch": 3.8929062684220868, "grad_norm": 0.27571436762809753, "learning_rate": 4.439441692931112e-05, "loss": 0.2257, "step": 1241 }, { "epoch": 3.8960503045785027, "grad_norm": 0.2915656268596649, "learning_rate": 4.4349392165691134e-05, "loss": 0.2028, "step": 1242 }, { "epoch": 3.8991943407349186, "grad_norm": 0.2878768742084503, "learning_rate": 4.430436740207114e-05, "loss": 0.2178, "step": 1243 }, { "epoch": 3.9023383768913344, "grad_norm": 0.27826157212257385, "learning_rate": 4.425934263845115e-05, "loss": 0.2253, "step": 1244 }, { "epoch": 3.90548241304775, "grad_norm": 0.2802741825580597, "learning_rate": 4.421431787483116e-05, "loss": 0.2417, "step": 1245 }, { "epoch": 3.908626449204166, "grad_norm": 0.28083693981170654, "learning_rate": 4.416929311121117e-05, "loss": 0.2349, "step": 1246 }, { "epoch": 3.9117704853605817, "grad_norm": 0.27791112661361694, "learning_rate": 4.412426834759118e-05, "loss": 0.2333, "step": 1247 }, { "epoch": 3.9149145215169976, "grad_norm": 0.26817572116851807, "learning_rate": 4.4079243583971186e-05, "loss": 0.2158, "step": 1248 }, { "epoch": 3.9180585576734135, "grad_norm": 0.2688419818878174, "learning_rate": 4.4034218820351194e-05, "loss": 0.2069, "step": 1249 }, { "epoch": 3.921202593829829, "grad_norm": 0.2617838978767395, "learning_rate": 4.39891940567312e-05, "loss": 0.2158, "step": 1250 }, { "epoch": 3.924346629986245, "grad_norm": 0.26919645071029663, "learning_rate": 4.394416929311121e-05, "loss": 0.1995, "step": 1251 }, { "epoch": 3.9274906661426607, "grad_norm": 0.27416127920150757, "learning_rate": 4.389914452949122e-05, "loss": 0.2081, "step": 1252 }, { "epoch": 3.930634702299076, "grad_norm": 0.2692427635192871, "learning_rate": 4.385411976587123e-05, "loss": 0.1912, "step": 1253 }, { "epoch": 3.933778738455492, "grad_norm": 0.2657999098300934, "learning_rate": 4.380909500225124e-05, "loss": 0.2066, "step": 1254 }, { "epoch": 3.936922774611908, "grad_norm": 0.2760981023311615, "learning_rate": 4.376407023863125e-05, "loss": 0.2132, "step": 1255 }, { "epoch": 3.940066810768324, "grad_norm": 0.29359671473503113, "learning_rate": 4.371904547501126e-05, "loss": 0.2355, "step": 1256 }, { "epoch": 3.9432108469247398, "grad_norm": 0.2690160870552063, "learning_rate": 4.367402071139127e-05, "loss": 0.2313, "step": 1257 }, { "epoch": 3.9463548830811552, "grad_norm": 0.2844559848308563, "learning_rate": 4.362899594777128e-05, "loss": 0.2386, "step": 1258 }, { "epoch": 3.949498919237571, "grad_norm": 0.27324485778808594, "learning_rate": 4.358397118415129e-05, "loss": 0.2204, "step": 1259 }, { "epoch": 3.952642955393987, "grad_norm": 0.2731996774673462, "learning_rate": 4.3538946420531296e-05, "loss": 0.2171, "step": 1260 }, { "epoch": 3.955786991550403, "grad_norm": 0.2917567789554596, "learning_rate": 4.3493921656911304e-05, "loss": 0.2152, "step": 1261 }, { "epoch": 3.958931027706819, "grad_norm": 0.27139395475387573, "learning_rate": 4.344889689329131e-05, "loss": 0.2096, "step": 1262 }, { "epoch": 3.9620750638632343, "grad_norm": 0.2586003541946411, "learning_rate": 4.340387212967132e-05, "loss": 0.215, "step": 1263 }, { "epoch": 3.96521910001965, "grad_norm": 0.2656058073043823, "learning_rate": 4.335884736605133e-05, "loss": 0.2196, "step": 1264 }, { "epoch": 3.968363136176066, "grad_norm": 0.27002981305122375, "learning_rate": 4.331382260243134e-05, "loss": 0.2134, "step": 1265 }, { "epoch": 3.971507172332482, "grad_norm": 0.2708611786365509, "learning_rate": 4.326879783881135e-05, "loss": 0.2139, "step": 1266 }, { "epoch": 3.974651208488898, "grad_norm": 0.2748422920703888, "learning_rate": 4.322377307519136e-05, "loss": 0.2165, "step": 1267 }, { "epoch": 3.9777952446453133, "grad_norm": 0.26929381489753723, "learning_rate": 4.3178748311571364e-05, "loss": 0.2324, "step": 1268 }, { "epoch": 3.980939280801729, "grad_norm": 0.2929108440876007, "learning_rate": 4.313372354795138e-05, "loss": 0.2331, "step": 1269 }, { "epoch": 3.984083316958145, "grad_norm": 0.26749187707901, "learning_rate": 4.308869878433138e-05, "loss": 0.2179, "step": 1270 }, { "epoch": 3.9872273531145606, "grad_norm": 0.28992959856987, "learning_rate": 4.30436740207114e-05, "loss": 0.2245, "step": 1271 }, { "epoch": 3.9903713892709765, "grad_norm": 0.273666650056839, "learning_rate": 4.29986492570914e-05, "loss": 0.2493, "step": 1272 }, { "epoch": 3.9935154254273924, "grad_norm": 0.26677224040031433, "learning_rate": 4.2953624493471414e-05, "loss": 0.2085, "step": 1273 }, { "epoch": 3.9966594615838082, "grad_norm": 0.3334199786186218, "learning_rate": 4.290859972985142e-05, "loss": 0.2242, "step": 1274 }, { "epoch": 3.999803497740224, "grad_norm": 0.2708599865436554, "learning_rate": 4.286357496623143e-05, "loss": 0.2151, "step": 1275 }, { "epoch": 4.0, "grad_norm": 1.1953833103179932, "learning_rate": 4.281855020261144e-05, "loss": 0.2157, "step": 1276 }, { "epoch": 4.0, "eval_loss": 0.3284938931465149, "eval_runtime": 102.853, "eval_samples_per_second": 12.367, "eval_steps_per_second": 12.367, "step": 1276 }, { "epoch": 4.0031440361564155, "grad_norm": 0.24413202702999115, "learning_rate": 4.277352543899145e-05, "loss": 0.1776, "step": 1277 }, { "epoch": 4.006288072312832, "grad_norm": 0.2580040395259857, "learning_rate": 4.272850067537146e-05, "loss": 0.1825, "step": 1278 }, { "epoch": 4.009432108469247, "grad_norm": 0.2598809599876404, "learning_rate": 4.2683475911751466e-05, "loss": 0.1835, "step": 1279 }, { "epoch": 4.012576144625664, "grad_norm": 0.2667539119720459, "learning_rate": 4.2638451148131474e-05, "loss": 0.1679, "step": 1280 }, { "epoch": 4.015720180782079, "grad_norm": 0.2849894165992737, "learning_rate": 4.259342638451148e-05, "loss": 0.1712, "step": 1281 }, { "epoch": 4.0188642169384945, "grad_norm": 0.30847135186195374, "learning_rate": 4.254840162089149e-05, "loss": 0.1662, "step": 1282 }, { "epoch": 4.022008253094911, "grad_norm": 0.34202155470848083, "learning_rate": 4.250337685727151e-05, "loss": 0.1536, "step": 1283 }, { "epoch": 4.025152289251326, "grad_norm": 0.40676772594451904, "learning_rate": 4.245835209365151e-05, "loss": 0.1919, "step": 1284 }, { "epoch": 4.028296325407743, "grad_norm": 0.39432772994041443, "learning_rate": 4.2413327330031524e-05, "loss": 0.1763, "step": 1285 }, { "epoch": 4.031440361564158, "grad_norm": 0.3515225350856781, "learning_rate": 4.2368302566411526e-05, "loss": 0.1852, "step": 1286 }, { "epoch": 4.0345843977205735, "grad_norm": 0.33073797821998596, "learning_rate": 4.232327780279154e-05, "loss": 0.1705, "step": 1287 }, { "epoch": 4.03772843387699, "grad_norm": 0.31567326188087463, "learning_rate": 4.227825303917154e-05, "loss": 0.1651, "step": 1288 }, { "epoch": 4.040872470033405, "grad_norm": 0.30833232402801514, "learning_rate": 4.223322827555156e-05, "loss": 0.1699, "step": 1289 }, { "epoch": 4.044016506189821, "grad_norm": 0.3001779615879059, "learning_rate": 4.218820351193156e-05, "loss": 0.1713, "step": 1290 }, { "epoch": 4.047160542346237, "grad_norm": 0.3179980218410492, "learning_rate": 4.2143178748311576e-05, "loss": 0.173, "step": 1291 }, { "epoch": 4.050304578502653, "grad_norm": 0.3188309967517853, "learning_rate": 4.2098153984691584e-05, "loss": 0.1631, "step": 1292 }, { "epoch": 4.053448614659069, "grad_norm": 0.29104533791542053, "learning_rate": 4.205312922107159e-05, "loss": 0.1674, "step": 1293 }, { "epoch": 4.056592650815484, "grad_norm": 0.3018805682659149, "learning_rate": 4.20081044574516e-05, "loss": 0.1661, "step": 1294 }, { "epoch": 4.0597366869719, "grad_norm": 0.31885239481925964, "learning_rate": 4.196307969383161e-05, "loss": 0.1682, "step": 1295 }, { "epoch": 4.062880723128316, "grad_norm": 0.35104960203170776, "learning_rate": 4.191805493021162e-05, "loss": 0.1843, "step": 1296 }, { "epoch": 4.066024759284732, "grad_norm": 0.3519931733608246, "learning_rate": 4.187303016659163e-05, "loss": 0.1588, "step": 1297 }, { "epoch": 4.069168795441148, "grad_norm": 0.3455098867416382, "learning_rate": 4.1828005402971636e-05, "loss": 0.1797, "step": 1298 }, { "epoch": 4.072312831597563, "grad_norm": 0.33985161781311035, "learning_rate": 4.1782980639351644e-05, "loss": 0.1746, "step": 1299 }, { "epoch": 4.075456867753979, "grad_norm": 0.34039297699928284, "learning_rate": 4.173795587573165e-05, "loss": 0.1832, "step": 1300 }, { "epoch": 4.078600903910395, "grad_norm": 0.3311939537525177, "learning_rate": 4.169293111211167e-05, "loss": 0.1716, "step": 1301 }, { "epoch": 4.081744940066811, "grad_norm": 0.31825950741767883, "learning_rate": 4.164790634849167e-05, "loss": 0.1693, "step": 1302 }, { "epoch": 4.084888976223227, "grad_norm": 0.31489965319633484, "learning_rate": 4.1602881584871685e-05, "loss": 0.1611, "step": 1303 }, { "epoch": 4.0880330123796425, "grad_norm": 0.3245725929737091, "learning_rate": 4.155785682125169e-05, "loss": 0.1733, "step": 1304 }, { "epoch": 4.091177048536058, "grad_norm": 0.33683744072914124, "learning_rate": 4.15128320576317e-05, "loss": 0.1706, "step": 1305 }, { "epoch": 4.094321084692474, "grad_norm": 0.3254200518131256, "learning_rate": 4.1467807294011704e-05, "loss": 0.1581, "step": 1306 }, { "epoch": 4.09746512084889, "grad_norm": 0.3455100953578949, "learning_rate": 4.142278253039172e-05, "loss": 0.1761, "step": 1307 }, { "epoch": 4.100609157005305, "grad_norm": 0.38987427949905396, "learning_rate": 4.137775776677172e-05, "loss": 0.1782, "step": 1308 }, { "epoch": 4.1037531931617215, "grad_norm": 0.34706345200538635, "learning_rate": 4.133273300315174e-05, "loss": 0.181, "step": 1309 }, { "epoch": 4.106897229318137, "grad_norm": 0.3558022379875183, "learning_rate": 4.1287708239531745e-05, "loss": 0.1895, "step": 1310 }, { "epoch": 4.110041265474553, "grad_norm": 0.3826228082180023, "learning_rate": 4.1242683475911754e-05, "loss": 0.1952, "step": 1311 }, { "epoch": 4.113185301630969, "grad_norm": 0.37599122524261475, "learning_rate": 4.119765871229176e-05, "loss": 0.1915, "step": 1312 }, { "epoch": 4.116329337787384, "grad_norm": 0.33296701312065125, "learning_rate": 4.115263394867177e-05, "loss": 0.1747, "step": 1313 }, { "epoch": 4.1194733739438005, "grad_norm": 0.3420018255710602, "learning_rate": 4.110760918505178e-05, "loss": 0.191, "step": 1314 }, { "epoch": 4.122617410100216, "grad_norm": 0.3105049133300781, "learning_rate": 4.106258442143179e-05, "loss": 0.1731, "step": 1315 }, { "epoch": 4.125761446256632, "grad_norm": 0.30032503604888916, "learning_rate": 4.10175596578118e-05, "loss": 0.1689, "step": 1316 }, { "epoch": 4.128905482413048, "grad_norm": 0.322191447019577, "learning_rate": 4.0972534894191806e-05, "loss": 0.167, "step": 1317 }, { "epoch": 4.132049518569463, "grad_norm": 0.33827921748161316, "learning_rate": 4.0927510130571814e-05, "loss": 0.1601, "step": 1318 }, { "epoch": 4.13519355472588, "grad_norm": 0.3245418667793274, "learning_rate": 4.088248536695183e-05, "loss": 0.1719, "step": 1319 }, { "epoch": 4.138337590882295, "grad_norm": 0.32474806904792786, "learning_rate": 4.083746060333183e-05, "loss": 0.168, "step": 1320 }, { "epoch": 4.141481627038711, "grad_norm": 0.33745747804641724, "learning_rate": 4.079243583971185e-05, "loss": 0.1885, "step": 1321 }, { "epoch": 4.144625663195127, "grad_norm": 0.3697684407234192, "learning_rate": 4.074741107609185e-05, "loss": 0.1855, "step": 1322 }, { "epoch": 4.147769699351542, "grad_norm": 0.35057878494262695, "learning_rate": 4.0702386312471864e-05, "loss": 0.1846, "step": 1323 }, { "epoch": 4.150913735507959, "grad_norm": 0.36322712898254395, "learning_rate": 4.0657361548851866e-05, "loss": 0.1794, "step": 1324 }, { "epoch": 4.154057771664374, "grad_norm": 0.3216482102870941, "learning_rate": 4.061233678523188e-05, "loss": 0.1629, "step": 1325 }, { "epoch": 4.1572018078207895, "grad_norm": 0.3450748920440674, "learning_rate": 4.056731202161189e-05, "loss": 0.1826, "step": 1326 }, { "epoch": 4.160345843977206, "grad_norm": 0.443024218082428, "learning_rate": 4.05222872579919e-05, "loss": 0.1833, "step": 1327 }, { "epoch": 4.163489880133621, "grad_norm": 0.3316139578819275, "learning_rate": 4.047726249437191e-05, "loss": 0.1798, "step": 1328 }, { "epoch": 4.166633916290038, "grad_norm": 0.32531842589378357, "learning_rate": 4.0432237730751915e-05, "loss": 0.1688, "step": 1329 }, { "epoch": 4.169777952446453, "grad_norm": 0.32947710156440735, "learning_rate": 4.0387212967131924e-05, "loss": 0.1872, "step": 1330 }, { "epoch": 4.172921988602869, "grad_norm": 0.31240200996398926, "learning_rate": 4.034218820351193e-05, "loss": 0.1798, "step": 1331 }, { "epoch": 4.176066024759285, "grad_norm": 0.346986323595047, "learning_rate": 4.029716343989194e-05, "loss": 0.1676, "step": 1332 }, { "epoch": 4.1792100609157, "grad_norm": 0.3199343979358673, "learning_rate": 4.025213867627195e-05, "loss": 0.1615, "step": 1333 }, { "epoch": 4.182354097072117, "grad_norm": 0.3335675001144409, "learning_rate": 4.020711391265196e-05, "loss": 0.1537, "step": 1334 }, { "epoch": 4.185498133228532, "grad_norm": 0.3093056380748749, "learning_rate": 4.016208914903197e-05, "loss": 0.1568, "step": 1335 }, { "epoch": 4.188642169384948, "grad_norm": 0.3667335510253906, "learning_rate": 4.0117064385411976e-05, "loss": 0.1809, "step": 1336 }, { "epoch": 4.191786205541364, "grad_norm": 0.3497050702571869, "learning_rate": 4.007203962179199e-05, "loss": 0.1775, "step": 1337 }, { "epoch": 4.194930241697779, "grad_norm": 0.3513116240501404, "learning_rate": 4.0027014858172e-05, "loss": 0.1749, "step": 1338 }, { "epoch": 4.198074277854196, "grad_norm": 0.38039329648017883, "learning_rate": 3.998199009455201e-05, "loss": 0.1622, "step": 1339 }, { "epoch": 4.201218314010611, "grad_norm": 0.3401157557964325, "learning_rate": 3.993696533093202e-05, "loss": 0.1829, "step": 1340 }, { "epoch": 4.204362350167027, "grad_norm": 0.36430948972702026, "learning_rate": 3.9891940567312025e-05, "loss": 0.183, "step": 1341 }, { "epoch": 4.207506386323443, "grad_norm": 0.32911059260368347, "learning_rate": 3.9846915803692034e-05, "loss": 0.1643, "step": 1342 }, { "epoch": 4.210650422479858, "grad_norm": 0.34291791915893555, "learning_rate": 3.980189104007204e-05, "loss": 0.1709, "step": 1343 }, { "epoch": 4.213794458636274, "grad_norm": 0.33970776200294495, "learning_rate": 3.975686627645205e-05, "loss": 0.1735, "step": 1344 }, { "epoch": 4.21693849479269, "grad_norm": 0.3342609703540802, "learning_rate": 3.971184151283206e-05, "loss": 0.1625, "step": 1345 }, { "epoch": 4.220082530949106, "grad_norm": 0.310596764087677, "learning_rate": 3.966681674921207e-05, "loss": 0.1679, "step": 1346 }, { "epoch": 4.223226567105522, "grad_norm": 0.33155351877212524, "learning_rate": 3.962179198559208e-05, "loss": 0.1726, "step": 1347 }, { "epoch": 4.2263706032619375, "grad_norm": 0.33284807205200195, "learning_rate": 3.9576767221972085e-05, "loss": 0.1608, "step": 1348 }, { "epoch": 4.229514639418353, "grad_norm": 0.3194194436073303, "learning_rate": 3.9531742458352094e-05, "loss": 0.1592, "step": 1349 }, { "epoch": 4.232658675574769, "grad_norm": 0.33438998460769653, "learning_rate": 3.948671769473211e-05, "loss": 0.1792, "step": 1350 }, { "epoch": 4.235802711731185, "grad_norm": 0.3347371220588684, "learning_rate": 3.944169293111211e-05, "loss": 0.1625, "step": 1351 }, { "epoch": 4.238946747887601, "grad_norm": 0.356064110994339, "learning_rate": 3.9396668167492126e-05, "loss": 0.1709, "step": 1352 }, { "epoch": 4.2420907840440165, "grad_norm": 0.3434184193611145, "learning_rate": 3.935164340387213e-05, "loss": 0.1714, "step": 1353 }, { "epoch": 4.245234820200432, "grad_norm": 0.3638344407081604, "learning_rate": 3.9306618640252144e-05, "loss": 0.1837, "step": 1354 }, { "epoch": 4.248378856356848, "grad_norm": 0.3447159230709076, "learning_rate": 3.926159387663215e-05, "loss": 0.1902, "step": 1355 }, { "epoch": 4.251522892513264, "grad_norm": 0.3268422484397888, "learning_rate": 3.921656911301216e-05, "loss": 0.1668, "step": 1356 }, { "epoch": 4.25466692866968, "grad_norm": 0.31544023752212524, "learning_rate": 3.917154434939217e-05, "loss": 0.1588, "step": 1357 }, { "epoch": 4.257810964826096, "grad_norm": 0.3119179308414459, "learning_rate": 3.912651958577218e-05, "loss": 0.1553, "step": 1358 }, { "epoch": 4.260955000982511, "grad_norm": 0.33523574471473694, "learning_rate": 3.9081494822152187e-05, "loss": 0.1804, "step": 1359 }, { "epoch": 4.264099037138927, "grad_norm": 0.36602863669395447, "learning_rate": 3.9036470058532195e-05, "loss": 0.1914, "step": 1360 }, { "epoch": 4.267243073295343, "grad_norm": 0.3590531051158905, "learning_rate": 3.8991445294912204e-05, "loss": 0.1713, "step": 1361 }, { "epoch": 4.270387109451759, "grad_norm": 0.32857000827789307, "learning_rate": 3.894642053129221e-05, "loss": 0.1747, "step": 1362 }, { "epoch": 4.273531145608175, "grad_norm": 0.35596147179603577, "learning_rate": 3.890139576767222e-05, "loss": 0.1849, "step": 1363 }, { "epoch": 4.27667518176459, "grad_norm": 0.35360538959503174, "learning_rate": 3.8856371004052236e-05, "loss": 0.1868, "step": 1364 }, { "epoch": 4.279819217921006, "grad_norm": 0.34246009588241577, "learning_rate": 3.881134624043224e-05, "loss": 0.1642, "step": 1365 }, { "epoch": 4.282963254077422, "grad_norm": 0.3634006977081299, "learning_rate": 3.8766321476812253e-05, "loss": 0.175, "step": 1366 }, { "epoch": 4.286107290233837, "grad_norm": 0.330964058637619, "learning_rate": 3.8721296713192255e-05, "loss": 0.1681, "step": 1367 }, { "epoch": 4.289251326390254, "grad_norm": 0.3637744188308716, "learning_rate": 3.867627194957227e-05, "loss": 0.1906, "step": 1368 }, { "epoch": 4.292395362546669, "grad_norm": 0.3509615659713745, "learning_rate": 3.863124718595227e-05, "loss": 0.1694, "step": 1369 }, { "epoch": 4.295539398703085, "grad_norm": 0.34948980808258057, "learning_rate": 3.858622242233229e-05, "loss": 0.1987, "step": 1370 }, { "epoch": 4.298683434859501, "grad_norm": 0.33693766593933105, "learning_rate": 3.854119765871229e-05, "loss": 0.1681, "step": 1371 }, { "epoch": 4.301827471015916, "grad_norm": 0.32878193259239197, "learning_rate": 3.8496172895092305e-05, "loss": 0.1792, "step": 1372 }, { "epoch": 4.304971507172333, "grad_norm": 0.34040093421936035, "learning_rate": 3.8451148131472314e-05, "loss": 0.172, "step": 1373 }, { "epoch": 4.308115543328748, "grad_norm": 0.3261936604976654, "learning_rate": 3.840612336785232e-05, "loss": 0.1639, "step": 1374 }, { "epoch": 4.3112595794851645, "grad_norm": 0.3431290090084076, "learning_rate": 3.836109860423233e-05, "loss": 0.1643, "step": 1375 }, { "epoch": 4.31440361564158, "grad_norm": 0.34705156087875366, "learning_rate": 3.831607384061234e-05, "loss": 0.1853, "step": 1376 }, { "epoch": 4.317547651797995, "grad_norm": 0.35218438506126404, "learning_rate": 3.827104907699235e-05, "loss": 0.1782, "step": 1377 }, { "epoch": 4.320691687954412, "grad_norm": 0.3469063639640808, "learning_rate": 3.8226024313372357e-05, "loss": 0.1766, "step": 1378 }, { "epoch": 4.323835724110827, "grad_norm": 0.3514147698879242, "learning_rate": 3.8180999549752365e-05, "loss": 0.1638, "step": 1379 }, { "epoch": 4.326979760267243, "grad_norm": 0.3342735767364502, "learning_rate": 3.8135974786132374e-05, "loss": 0.1664, "step": 1380 }, { "epoch": 4.330123796423659, "grad_norm": 0.3489379584789276, "learning_rate": 3.809095002251238e-05, "loss": 0.1938, "step": 1381 }, { "epoch": 4.333267832580074, "grad_norm": 0.3389243185520172, "learning_rate": 3.80459252588924e-05, "loss": 0.1652, "step": 1382 }, { "epoch": 4.336411868736491, "grad_norm": 0.33191007375717163, "learning_rate": 3.80009004952724e-05, "loss": 0.1646, "step": 1383 }, { "epoch": 4.339555904892906, "grad_norm": 0.31757664680480957, "learning_rate": 3.7955875731652415e-05, "loss": 0.1708, "step": 1384 }, { "epoch": 4.342699941049322, "grad_norm": 0.34814366698265076, "learning_rate": 3.791085096803242e-05, "loss": 0.1904, "step": 1385 }, { "epoch": 4.345843977205738, "grad_norm": 0.32272517681121826, "learning_rate": 3.786582620441243e-05, "loss": 0.1757, "step": 1386 }, { "epoch": 4.3489880133621535, "grad_norm": 0.32649651169776917, "learning_rate": 3.7820801440792434e-05, "loss": 0.1583, "step": 1387 }, { "epoch": 4.35213204951857, "grad_norm": 0.33990156650543213, "learning_rate": 3.777577667717245e-05, "loss": 0.1524, "step": 1388 }, { "epoch": 4.355276085674985, "grad_norm": 0.34945619106292725, "learning_rate": 3.773075191355245e-05, "loss": 0.1917, "step": 1389 }, { "epoch": 4.358420121831401, "grad_norm": 0.33834320306777954, "learning_rate": 3.7685727149932466e-05, "loss": 0.1695, "step": 1390 }, { "epoch": 4.361564157987817, "grad_norm": 0.3227016031742096, "learning_rate": 3.7640702386312475e-05, "loss": 0.1797, "step": 1391 }, { "epoch": 4.3647081941442325, "grad_norm": 0.32759568095207214, "learning_rate": 3.7595677622692484e-05, "loss": 0.1533, "step": 1392 }, { "epoch": 4.367852230300649, "grad_norm": 0.32721182703971863, "learning_rate": 3.755065285907249e-05, "loss": 0.1826, "step": 1393 }, { "epoch": 4.370996266457064, "grad_norm": 0.3367503583431244, "learning_rate": 3.75056280954525e-05, "loss": 0.1929, "step": 1394 }, { "epoch": 4.37414030261348, "grad_norm": 0.3433319330215454, "learning_rate": 3.746060333183251e-05, "loss": 0.1858, "step": 1395 }, { "epoch": 4.377284338769896, "grad_norm": 0.3500429689884186, "learning_rate": 3.741557856821252e-05, "loss": 0.1688, "step": 1396 }, { "epoch": 4.3804283749263115, "grad_norm": 0.35479190945625305, "learning_rate": 3.7370553804592526e-05, "loss": 0.169, "step": 1397 }, { "epoch": 4.383572411082728, "grad_norm": 0.3375284969806671, "learning_rate": 3.7325529040972535e-05, "loss": 0.1627, "step": 1398 }, { "epoch": 4.386716447239143, "grad_norm": 0.34040164947509766, "learning_rate": 3.7280504277352544e-05, "loss": 0.1751, "step": 1399 }, { "epoch": 4.389860483395559, "grad_norm": 0.3353491425514221, "learning_rate": 3.723547951373256e-05, "loss": 0.1802, "step": 1400 }, { "epoch": 4.393004519551975, "grad_norm": 0.34556058049201965, "learning_rate": 3.719045475011256e-05, "loss": 0.1819, "step": 1401 }, { "epoch": 4.396148555708391, "grad_norm": 0.3369559049606323, "learning_rate": 3.7145429986492576e-05, "loss": 0.1901, "step": 1402 }, { "epoch": 4.399292591864806, "grad_norm": 0.3365342915058136, "learning_rate": 3.710040522287258e-05, "loss": 0.1767, "step": 1403 }, { "epoch": 4.402436628021222, "grad_norm": 0.31761398911476135, "learning_rate": 3.705538045925259e-05, "loss": 0.1568, "step": 1404 }, { "epoch": 4.405580664177638, "grad_norm": 0.33362120389938354, "learning_rate": 3.7010355695632595e-05, "loss": 0.1751, "step": 1405 }, { "epoch": 4.408724700334054, "grad_norm": 0.3224197030067444, "learning_rate": 3.696533093201261e-05, "loss": 0.1523, "step": 1406 }, { "epoch": 4.41186873649047, "grad_norm": 0.34683409333229065, "learning_rate": 3.692030616839261e-05, "loss": 0.1771, "step": 1407 }, { "epoch": 4.415012772646885, "grad_norm": 0.365283727645874, "learning_rate": 3.687528140477263e-05, "loss": 0.172, "step": 1408 }, { "epoch": 4.418156808803301, "grad_norm": 0.34443703293800354, "learning_rate": 3.6830256641152636e-05, "loss": 0.1673, "step": 1409 }, { "epoch": 4.421300844959717, "grad_norm": 0.35013824701309204, "learning_rate": 3.6785231877532645e-05, "loss": 0.1955, "step": 1410 }, { "epoch": 4.424444881116133, "grad_norm": 0.33194807171821594, "learning_rate": 3.6740207113912653e-05, "loss": 0.1798, "step": 1411 }, { "epoch": 4.427588917272549, "grad_norm": 0.34917956590652466, "learning_rate": 3.669518235029266e-05, "loss": 0.1886, "step": 1412 }, { "epoch": 4.430732953428964, "grad_norm": 0.34957945346832275, "learning_rate": 3.665015758667267e-05, "loss": 0.2023, "step": 1413 }, { "epoch": 4.4338769895853805, "grad_norm": 0.4198627769947052, "learning_rate": 3.660513282305268e-05, "loss": 0.1689, "step": 1414 }, { "epoch": 4.437021025741796, "grad_norm": 0.3367863893508911, "learning_rate": 3.656010805943269e-05, "loss": 0.1763, "step": 1415 }, { "epoch": 4.440165061898211, "grad_norm": 0.3114827871322632, "learning_rate": 3.6515083295812696e-05, "loss": 0.1748, "step": 1416 }, { "epoch": 4.443309098054628, "grad_norm": 0.3186023533344269, "learning_rate": 3.6470058532192705e-05, "loss": 0.1659, "step": 1417 }, { "epoch": 4.446453134211043, "grad_norm": 0.3368983864784241, "learning_rate": 3.642503376857272e-05, "loss": 0.1573, "step": 1418 }, { "epoch": 4.4495971703674595, "grad_norm": 0.327280193567276, "learning_rate": 3.638000900495272e-05, "loss": 0.1772, "step": 1419 }, { "epoch": 4.452741206523875, "grad_norm": 0.35328540205955505, "learning_rate": 3.633498424133274e-05, "loss": 0.175, "step": 1420 }, { "epoch": 4.45588524268029, "grad_norm": 0.3691512942314148, "learning_rate": 3.628995947771274e-05, "loss": 0.1563, "step": 1421 }, { "epoch": 4.459029278836707, "grad_norm": 0.32719704508781433, "learning_rate": 3.6244934714092755e-05, "loss": 0.1692, "step": 1422 }, { "epoch": 4.462173314993122, "grad_norm": 0.3479040563106537, "learning_rate": 3.619990995047276e-05, "loss": 0.1773, "step": 1423 }, { "epoch": 4.4653173511495385, "grad_norm": 0.36871057748794556, "learning_rate": 3.615488518685277e-05, "loss": 0.1625, "step": 1424 }, { "epoch": 4.468461387305954, "grad_norm": 0.35252922773361206, "learning_rate": 3.610986042323278e-05, "loss": 0.1658, "step": 1425 }, { "epoch": 4.4716054234623694, "grad_norm": 0.3504349887371063, "learning_rate": 3.606483565961279e-05, "loss": 0.1679, "step": 1426 }, { "epoch": 4.474749459618786, "grad_norm": 0.36664140224456787, "learning_rate": 3.60198108959928e-05, "loss": 0.1937, "step": 1427 }, { "epoch": 4.477893495775201, "grad_norm": 0.3537529706954956, "learning_rate": 3.5974786132372806e-05, "loss": 0.1777, "step": 1428 }, { "epoch": 4.481037531931618, "grad_norm": 0.3579857647418976, "learning_rate": 3.5929761368752815e-05, "loss": 0.2022, "step": 1429 }, { "epoch": 4.484181568088033, "grad_norm": 0.3417457044124603, "learning_rate": 3.5884736605132823e-05, "loss": 0.1825, "step": 1430 }, { "epoch": 4.4873256042444485, "grad_norm": 0.3508750796318054, "learning_rate": 3.583971184151283e-05, "loss": 0.1867, "step": 1431 }, { "epoch": 4.490469640400865, "grad_norm": 0.316987544298172, "learning_rate": 3.579468707789284e-05, "loss": 0.174, "step": 1432 }, { "epoch": 4.49361367655728, "grad_norm": 0.3204813599586487, "learning_rate": 3.5749662314272856e-05, "loss": 0.1658, "step": 1433 }, { "epoch": 4.496757712713697, "grad_norm": 0.33144694566726685, "learning_rate": 3.570463755065286e-05, "loss": 0.1876, "step": 1434 }, { "epoch": 4.499901748870112, "grad_norm": 0.3246394395828247, "learning_rate": 3.565961278703287e-05, "loss": 0.1685, "step": 1435 }, { "epoch": 4.5030457850265275, "grad_norm": 0.3174656927585602, "learning_rate": 3.561458802341288e-05, "loss": 0.1628, "step": 1436 }, { "epoch": 4.506189821182944, "grad_norm": 0.3583329916000366, "learning_rate": 3.556956325979289e-05, "loss": 0.1693, "step": 1437 }, { "epoch": 4.509333857339359, "grad_norm": 0.37494146823883057, "learning_rate": 3.55245384961729e-05, "loss": 0.2148, "step": 1438 }, { "epoch": 4.512477893495776, "grad_norm": 0.35661935806274414, "learning_rate": 3.547951373255291e-05, "loss": 0.1855, "step": 1439 }, { "epoch": 4.515621929652191, "grad_norm": 0.35309916734695435, "learning_rate": 3.5434488968932916e-05, "loss": 0.1951, "step": 1440 }, { "epoch": 4.518765965808607, "grad_norm": 0.34192830324172974, "learning_rate": 3.5389464205312925e-05, "loss": 0.161, "step": 1441 }, { "epoch": 4.521910001965023, "grad_norm": 0.34285855293273926, "learning_rate": 3.534443944169293e-05, "loss": 0.1691, "step": 1442 }, { "epoch": 4.525054038121438, "grad_norm": 0.3496817648410797, "learning_rate": 3.529941467807294e-05, "loss": 0.1768, "step": 1443 }, { "epoch": 4.528198074277854, "grad_norm": 0.3484686315059662, "learning_rate": 3.525438991445295e-05, "loss": 0.1752, "step": 1444 }, { "epoch": 4.53134211043427, "grad_norm": 0.33376917243003845, "learning_rate": 3.5209365150832966e-05, "loss": 0.178, "step": 1445 }, { "epoch": 4.534486146590686, "grad_norm": 0.34997811913490295, "learning_rate": 3.516434038721297e-05, "loss": 0.1791, "step": 1446 }, { "epoch": 4.537630182747102, "grad_norm": 0.3474102318286896, "learning_rate": 3.511931562359298e-05, "loss": 0.1813, "step": 1447 }, { "epoch": 4.540774218903517, "grad_norm": 0.36707478761672974, "learning_rate": 3.5074290859972985e-05, "loss": 0.1935, "step": 1448 }, { "epoch": 4.543918255059933, "grad_norm": 0.3474053144454956, "learning_rate": 3.5029266096353e-05, "loss": 0.1728, "step": 1449 }, { "epoch": 4.547062291216349, "grad_norm": 0.3290421664714813, "learning_rate": 3.4984241332733e-05, "loss": 0.1758, "step": 1450 }, { "epoch": 4.550206327372765, "grad_norm": 0.3392203748226166, "learning_rate": 3.493921656911302e-05, "loss": 0.1635, "step": 1451 }, { "epoch": 4.55335036352918, "grad_norm": 0.3409757614135742, "learning_rate": 3.489419180549302e-05, "loss": 0.1724, "step": 1452 }, { "epoch": 4.556494399685596, "grad_norm": 0.3525085151195526, "learning_rate": 3.4849167041873035e-05, "loss": 0.1717, "step": 1453 }, { "epoch": 4.559638435842012, "grad_norm": 0.34035542607307434, "learning_rate": 3.480414227825304e-05, "loss": 0.1697, "step": 1454 }, { "epoch": 4.562782471998428, "grad_norm": 0.3816874325275421, "learning_rate": 3.475911751463305e-05, "loss": 0.172, "step": 1455 }, { "epoch": 4.565926508154844, "grad_norm": 0.3580548167228699, "learning_rate": 3.471409275101306e-05, "loss": 0.1687, "step": 1456 }, { "epoch": 4.569070544311259, "grad_norm": 0.35063621401786804, "learning_rate": 3.466906798739307e-05, "loss": 0.1679, "step": 1457 }, { "epoch": 4.5722145804676755, "grad_norm": 0.32499825954437256, "learning_rate": 3.462404322377308e-05, "loss": 0.1668, "step": 1458 }, { "epoch": 4.575358616624091, "grad_norm": 0.34060755372047424, "learning_rate": 3.4579018460153086e-05, "loss": 0.1744, "step": 1459 }, { "epoch": 4.578502652780507, "grad_norm": 0.35474133491516113, "learning_rate": 3.4533993696533095e-05, "loss": 0.1847, "step": 1460 }, { "epoch": 4.581646688936923, "grad_norm": 0.35434579849243164, "learning_rate": 3.44889689329131e-05, "loss": 0.179, "step": 1461 }, { "epoch": 4.584790725093338, "grad_norm": 0.35998302698135376, "learning_rate": 3.444394416929311e-05, "loss": 0.1734, "step": 1462 }, { "epoch": 4.5879347612497545, "grad_norm": 0.34586137533187866, "learning_rate": 3.439891940567313e-05, "loss": 0.1514, "step": 1463 }, { "epoch": 4.59107879740617, "grad_norm": 0.35507819056510925, "learning_rate": 3.435389464205313e-05, "loss": 0.161, "step": 1464 }, { "epoch": 4.594222833562586, "grad_norm": 0.31969332695007324, "learning_rate": 3.4308869878433144e-05, "loss": 0.1599, "step": 1465 }, { "epoch": 4.597366869719002, "grad_norm": 0.3506850302219391, "learning_rate": 3.4263845114813146e-05, "loss": 0.1746, "step": 1466 }, { "epoch": 4.600510905875417, "grad_norm": 0.359840452671051, "learning_rate": 3.421882035119316e-05, "loss": 0.1748, "step": 1467 }, { "epoch": 4.603654942031834, "grad_norm": 0.3507457673549652, "learning_rate": 3.417379558757316e-05, "loss": 0.1573, "step": 1468 }, { "epoch": 4.606798978188249, "grad_norm": 0.3225368559360504, "learning_rate": 3.412877082395318e-05, "loss": 0.1653, "step": 1469 }, { "epoch": 4.609943014344665, "grad_norm": 0.31924569606781006, "learning_rate": 3.408374606033318e-05, "loss": 0.1705, "step": 1470 }, { "epoch": 4.613087050501081, "grad_norm": 0.3486468195915222, "learning_rate": 3.4038721296713196e-05, "loss": 0.1692, "step": 1471 }, { "epoch": 4.616231086657496, "grad_norm": 0.3565337359905243, "learning_rate": 3.3993696533093204e-05, "loss": 0.1883, "step": 1472 }, { "epoch": 4.619375122813913, "grad_norm": 0.3359141945838928, "learning_rate": 3.394867176947321e-05, "loss": 0.162, "step": 1473 }, { "epoch": 4.622519158970328, "grad_norm": 0.3412599563598633, "learning_rate": 3.390364700585322e-05, "loss": 0.167, "step": 1474 }, { "epoch": 4.625663195126744, "grad_norm": 0.38506564497947693, "learning_rate": 3.385862224223323e-05, "loss": 0.2185, "step": 1475 }, { "epoch": 4.62880723128316, "grad_norm": 0.34747371077537537, "learning_rate": 3.381359747861324e-05, "loss": 0.1881, "step": 1476 }, { "epoch": 4.631951267439575, "grad_norm": 0.3209041357040405, "learning_rate": 3.376857271499325e-05, "loss": 0.1697, "step": 1477 }, { "epoch": 4.635095303595992, "grad_norm": 0.320462167263031, "learning_rate": 3.3723547951373256e-05, "loss": 0.1682, "step": 1478 }, { "epoch": 4.638239339752407, "grad_norm": 0.33537420630455017, "learning_rate": 3.3678523187753265e-05, "loss": 0.1647, "step": 1479 }, { "epoch": 4.641383375908823, "grad_norm": 0.34121811389923096, "learning_rate": 3.363349842413327e-05, "loss": 0.1953, "step": 1480 }, { "epoch": 4.644527412065239, "grad_norm": 0.3303282856941223, "learning_rate": 3.358847366051329e-05, "loss": 0.1692, "step": 1481 }, { "epoch": 4.647671448221654, "grad_norm": 0.36466050148010254, "learning_rate": 3.354344889689329e-05, "loss": 0.1844, "step": 1482 }, { "epoch": 4.650815484378071, "grad_norm": 0.3210047781467438, "learning_rate": 3.3498424133273306e-05, "loss": 0.1645, "step": 1483 }, { "epoch": 4.653959520534486, "grad_norm": 0.3620697259902954, "learning_rate": 3.345339936965331e-05, "loss": 0.1813, "step": 1484 }, { "epoch": 4.657103556690902, "grad_norm": 0.3314252495765686, "learning_rate": 3.340837460603332e-05, "loss": 0.1713, "step": 1485 }, { "epoch": 4.660247592847318, "grad_norm": 0.33339688181877136, "learning_rate": 3.3363349842413325e-05, "loss": 0.1697, "step": 1486 }, { "epoch": 4.663391629003733, "grad_norm": 0.339979887008667, "learning_rate": 3.331832507879334e-05, "loss": 0.1822, "step": 1487 }, { "epoch": 4.66653566516015, "grad_norm": 0.3323740065097809, "learning_rate": 3.327330031517334e-05, "loss": 0.1739, "step": 1488 }, { "epoch": 4.669679701316565, "grad_norm": 0.34336036443710327, "learning_rate": 3.322827555155336e-05, "loss": 0.1735, "step": 1489 }, { "epoch": 4.672823737472981, "grad_norm": 0.3499758839607239, "learning_rate": 3.3183250787933366e-05, "loss": 0.1791, "step": 1490 }, { "epoch": 4.675967773629397, "grad_norm": 0.33729472756385803, "learning_rate": 3.3138226024313374e-05, "loss": 0.1752, "step": 1491 }, { "epoch": 4.679111809785812, "grad_norm": 0.3310723900794983, "learning_rate": 3.309320126069338e-05, "loss": 0.1912, "step": 1492 }, { "epoch": 4.682255845942228, "grad_norm": 0.3452661335468292, "learning_rate": 3.304817649707339e-05, "loss": 0.1687, "step": 1493 }, { "epoch": 4.685399882098644, "grad_norm": 0.36222660541534424, "learning_rate": 3.30031517334534e-05, "loss": 0.1823, "step": 1494 }, { "epoch": 4.68854391825506, "grad_norm": 0.3454042375087738, "learning_rate": 3.295812696983341e-05, "loss": 0.1691, "step": 1495 }, { "epoch": 4.691687954411476, "grad_norm": 0.3679512143135071, "learning_rate": 3.291310220621342e-05, "loss": 0.1816, "step": 1496 }, { "epoch": 4.6948319905678915, "grad_norm": 0.33819204568862915, "learning_rate": 3.2868077442593426e-05, "loss": 0.1783, "step": 1497 }, { "epoch": 4.697976026724307, "grad_norm": 0.32415708899497986, "learning_rate": 3.2823052678973435e-05, "loss": 0.1608, "step": 1498 }, { "epoch": 4.701120062880723, "grad_norm": 0.3342552185058594, "learning_rate": 3.277802791535345e-05, "loss": 0.1764, "step": 1499 }, { "epoch": 4.704264099037139, "grad_norm": 0.343462198972702, "learning_rate": 3.273300315173345e-05, "loss": 0.1859, "step": 1500 }, { "epoch": 4.707408135193555, "grad_norm": 0.3578430712223053, "learning_rate": 3.268797838811347e-05, "loss": 0.183, "step": 1501 }, { "epoch": 4.7105521713499705, "grad_norm": 0.3466953635215759, "learning_rate": 3.264295362449347e-05, "loss": 0.1789, "step": 1502 }, { "epoch": 4.713696207506386, "grad_norm": 0.35577669739723206, "learning_rate": 3.2597928860873484e-05, "loss": 0.1874, "step": 1503 }, { "epoch": 4.716840243662802, "grad_norm": 0.3529647886753082, "learning_rate": 3.2552904097253486e-05, "loss": 0.1949, "step": 1504 }, { "epoch": 4.719984279819218, "grad_norm": 0.36124175786972046, "learning_rate": 3.25078793336335e-05, "loss": 0.1679, "step": 1505 }, { "epoch": 4.723128315975634, "grad_norm": 0.34850648045539856, "learning_rate": 3.246285457001351e-05, "loss": 0.1591, "step": 1506 }, { "epoch": 4.7262723521320495, "grad_norm": 0.34418416023254395, "learning_rate": 3.241782980639352e-05, "loss": 0.1765, "step": 1507 }, { "epoch": 4.729416388288465, "grad_norm": 0.3683808743953705, "learning_rate": 3.237280504277353e-05, "loss": 0.172, "step": 1508 }, { "epoch": 4.732560424444881, "grad_norm": 0.3373892903327942, "learning_rate": 3.2327780279153536e-05, "loss": 0.1718, "step": 1509 }, { "epoch": 4.735704460601297, "grad_norm": 0.3219720125198364, "learning_rate": 3.2282755515533544e-05, "loss": 0.1572, "step": 1510 }, { "epoch": 4.738848496757713, "grad_norm": 0.3357372581958771, "learning_rate": 3.223773075191355e-05, "loss": 0.1628, "step": 1511 }, { "epoch": 4.741992532914129, "grad_norm": 0.3820708692073822, "learning_rate": 3.219270598829356e-05, "loss": 0.1901, "step": 1512 }, { "epoch": 4.745136569070544, "grad_norm": 0.3662216365337372, "learning_rate": 3.214768122467357e-05, "loss": 0.1833, "step": 1513 }, { "epoch": 4.74828060522696, "grad_norm": 0.3251596689224243, "learning_rate": 3.210265646105358e-05, "loss": 0.165, "step": 1514 }, { "epoch": 4.751424641383376, "grad_norm": 0.3528580963611603, "learning_rate": 3.205763169743359e-05, "loss": 0.1931, "step": 1515 }, { "epoch": 4.754568677539792, "grad_norm": 0.3584713637828827, "learning_rate": 3.2012606933813596e-05, "loss": 0.1858, "step": 1516 }, { "epoch": 4.757712713696208, "grad_norm": 0.36356058716773987, "learning_rate": 3.196758217019361e-05, "loss": 0.1985, "step": 1517 }, { "epoch": 4.760856749852623, "grad_norm": 0.35836127400398254, "learning_rate": 3.192255740657362e-05, "loss": 0.1862, "step": 1518 }, { "epoch": 4.764000786009039, "grad_norm": 0.3375653922557831, "learning_rate": 3.187753264295363e-05, "loss": 0.1647, "step": 1519 }, { "epoch": 4.767144822165455, "grad_norm": 0.37332043051719666, "learning_rate": 3.183250787933364e-05, "loss": 0.1822, "step": 1520 }, { "epoch": 4.77028885832187, "grad_norm": 0.3493974804878235, "learning_rate": 3.1787483115713646e-05, "loss": 0.1709, "step": 1521 }, { "epoch": 4.773432894478287, "grad_norm": 0.3353089690208435, "learning_rate": 3.1742458352093654e-05, "loss": 0.1601, "step": 1522 }, { "epoch": 4.776576930634702, "grad_norm": 0.3554600179195404, "learning_rate": 3.169743358847366e-05, "loss": 0.1815, "step": 1523 }, { "epoch": 4.7797209667911185, "grad_norm": 0.3451012372970581, "learning_rate": 3.165240882485367e-05, "loss": 0.1887, "step": 1524 }, { "epoch": 4.782865002947534, "grad_norm": 0.36190125346183777, "learning_rate": 3.160738406123368e-05, "loss": 0.179, "step": 1525 }, { "epoch": 4.786009039103949, "grad_norm": 0.31984999775886536, "learning_rate": 3.156235929761369e-05, "loss": 0.1746, "step": 1526 }, { "epoch": 4.789153075260366, "grad_norm": 0.347707599401474, "learning_rate": 3.15173345339937e-05, "loss": 0.1723, "step": 1527 }, { "epoch": 4.792297111416781, "grad_norm": 0.32577282190322876, "learning_rate": 3.1472309770373706e-05, "loss": 0.1717, "step": 1528 }, { "epoch": 4.795441147573197, "grad_norm": 0.3438778519630432, "learning_rate": 3.1427285006753714e-05, "loss": 0.1664, "step": 1529 }, { "epoch": 4.798585183729613, "grad_norm": 0.3667009770870209, "learning_rate": 3.138226024313373e-05, "loss": 0.1696, "step": 1530 }, { "epoch": 4.801729219886028, "grad_norm": 0.3698095977306366, "learning_rate": 3.133723547951373e-05, "loss": 0.1929, "step": 1531 }, { "epoch": 4.804873256042445, "grad_norm": 0.3590872585773468, "learning_rate": 3.129221071589375e-05, "loss": 0.1749, "step": 1532 }, { "epoch": 4.80801729219886, "grad_norm": 0.3810647130012512, "learning_rate": 3.124718595227375e-05, "loss": 0.2119, "step": 1533 }, { "epoch": 4.811161328355276, "grad_norm": 0.3602610230445862, "learning_rate": 3.1202161188653764e-05, "loss": 0.1705, "step": 1534 }, { "epoch": 4.814305364511692, "grad_norm": 0.3463843762874603, "learning_rate": 3.115713642503377e-05, "loss": 0.1774, "step": 1535 }, { "epoch": 4.8174494006681075, "grad_norm": 0.3391202390193939, "learning_rate": 3.111211166141378e-05, "loss": 0.1619, "step": 1536 }, { "epoch": 4.820593436824524, "grad_norm": 0.3725079596042633, "learning_rate": 3.106708689779379e-05, "loss": 0.1765, "step": 1537 }, { "epoch": 4.823737472980939, "grad_norm": 0.3331317603588104, "learning_rate": 3.10220621341738e-05, "loss": 0.1537, "step": 1538 }, { "epoch": 4.826881509137355, "grad_norm": 0.3413841128349304, "learning_rate": 3.097703737055381e-05, "loss": 0.1795, "step": 1539 }, { "epoch": 4.830025545293771, "grad_norm": 0.3438486158847809, "learning_rate": 3.0932012606933816e-05, "loss": 0.1677, "step": 1540 }, { "epoch": 4.8331695814501865, "grad_norm": 0.352260023355484, "learning_rate": 3.0886987843313824e-05, "loss": 0.1705, "step": 1541 }, { "epoch": 4.836313617606603, "grad_norm": 0.3582271337509155, "learning_rate": 3.084196307969383e-05, "loss": 0.1806, "step": 1542 }, { "epoch": 4.839457653763018, "grad_norm": 0.32175520062446594, "learning_rate": 3.079693831607384e-05, "loss": 0.1612, "step": 1543 }, { "epoch": 4.842601689919434, "grad_norm": 0.3652719557285309, "learning_rate": 3.075191355245386e-05, "loss": 0.1765, "step": 1544 }, { "epoch": 4.84574572607585, "grad_norm": 0.3469483554363251, "learning_rate": 3.070688878883386e-05, "loss": 0.1679, "step": 1545 }, { "epoch": 4.8488897622322655, "grad_norm": 0.3534870743751526, "learning_rate": 3.0661864025213874e-05, "loss": 0.1595, "step": 1546 }, { "epoch": 4.852033798388682, "grad_norm": 0.3385668098926544, "learning_rate": 3.0616839261593876e-05, "loss": 0.1654, "step": 1547 }, { "epoch": 4.855177834545097, "grad_norm": 0.3592233955860138, "learning_rate": 3.057181449797389e-05, "loss": 0.1998, "step": 1548 }, { "epoch": 4.858321870701513, "grad_norm": 0.3432442843914032, "learning_rate": 3.052678973435389e-05, "loss": 0.1633, "step": 1549 }, { "epoch": 4.861465906857929, "grad_norm": 0.34492307901382446, "learning_rate": 3.0481764970733905e-05, "loss": 0.1737, "step": 1550 }, { "epoch": 4.864609943014345, "grad_norm": 0.364162415266037, "learning_rate": 3.0436740207113913e-05, "loss": 0.1859, "step": 1551 }, { "epoch": 4.867753979170761, "grad_norm": 0.36672598123550415, "learning_rate": 3.0391715443493922e-05, "loss": 0.1737, "step": 1552 }, { "epoch": 4.870898015327176, "grad_norm": 0.3315171003341675, "learning_rate": 3.0346690679873934e-05, "loss": 0.1772, "step": 1553 }, { "epoch": 4.874042051483592, "grad_norm": 0.37914609909057617, "learning_rate": 3.030166591625394e-05, "loss": 0.1872, "step": 1554 }, { "epoch": 4.877186087640008, "grad_norm": 0.3529629707336426, "learning_rate": 3.025664115263395e-05, "loss": 0.1806, "step": 1555 }, { "epoch": 4.880330123796424, "grad_norm": 0.3450305759906769, "learning_rate": 3.021161638901396e-05, "loss": 0.1864, "step": 1556 }, { "epoch": 4.88347415995284, "grad_norm": 0.3295477330684662, "learning_rate": 3.016659162539397e-05, "loss": 0.1637, "step": 1557 }, { "epoch": 4.886618196109255, "grad_norm": 0.3463036119937897, "learning_rate": 3.0121566861773977e-05, "loss": 0.1794, "step": 1558 }, { "epoch": 4.889762232265671, "grad_norm": 0.3440949022769928, "learning_rate": 3.0076542098153985e-05, "loss": 0.1742, "step": 1559 }, { "epoch": 4.892906268422087, "grad_norm": 0.35209280252456665, "learning_rate": 3.0031517334533994e-05, "loss": 0.184, "step": 1560 }, { "epoch": 4.896050304578503, "grad_norm": 0.3333894908428192, "learning_rate": 2.9986492570914003e-05, "loss": 0.1867, "step": 1561 }, { "epoch": 4.899194340734918, "grad_norm": 0.32986724376678467, "learning_rate": 2.9941467807294015e-05, "loss": 0.1659, "step": 1562 }, { "epoch": 4.9023383768913344, "grad_norm": 0.35760498046875, "learning_rate": 2.9896443043674023e-05, "loss": 0.188, "step": 1563 }, { "epoch": 4.90548241304775, "grad_norm": 0.34434768557548523, "learning_rate": 2.9851418280054032e-05, "loss": 0.1759, "step": 1564 }, { "epoch": 4.908626449204165, "grad_norm": 0.35044562816619873, "learning_rate": 2.980639351643404e-05, "loss": 0.1841, "step": 1565 }, { "epoch": 4.911770485360582, "grad_norm": 0.3745686113834381, "learning_rate": 2.976136875281405e-05, "loss": 0.1863, "step": 1566 }, { "epoch": 4.914914521516997, "grad_norm": 0.3614986836910248, "learning_rate": 2.9716343989194058e-05, "loss": 0.1832, "step": 1567 }, { "epoch": 4.9180585576734135, "grad_norm": 0.32271724939346313, "learning_rate": 2.967131922557407e-05, "loss": 0.1708, "step": 1568 }, { "epoch": 4.921202593829829, "grad_norm": 0.392841637134552, "learning_rate": 2.9626294461954075e-05, "loss": 0.1924, "step": 1569 }, { "epoch": 4.924346629986244, "grad_norm": 0.3539784252643585, "learning_rate": 2.9581269698334087e-05, "loss": 0.17, "step": 1570 }, { "epoch": 4.927490666142661, "grad_norm": 0.3550097346305847, "learning_rate": 2.9536244934714095e-05, "loss": 0.1923, "step": 1571 }, { "epoch": 4.930634702299076, "grad_norm": 0.3477020263671875, "learning_rate": 2.9491220171094104e-05, "loss": 0.1564, "step": 1572 }, { "epoch": 4.9337787384554925, "grad_norm": 0.3535844385623932, "learning_rate": 2.9446195407474116e-05, "loss": 0.1814, "step": 1573 }, { "epoch": 4.936922774611908, "grad_norm": 0.3288232982158661, "learning_rate": 2.940117064385412e-05, "loss": 0.1784, "step": 1574 }, { "epoch": 4.940066810768323, "grad_norm": 0.3535751700401306, "learning_rate": 2.9356145880234133e-05, "loss": 0.1875, "step": 1575 }, { "epoch": 4.94321084692474, "grad_norm": 0.33178603649139404, "learning_rate": 2.9311121116614138e-05, "loss": 0.1764, "step": 1576 }, { "epoch": 4.946354883081155, "grad_norm": 0.33566486835479736, "learning_rate": 2.926609635299415e-05, "loss": 0.1795, "step": 1577 }, { "epoch": 4.949498919237572, "grad_norm": 0.3447016775608063, "learning_rate": 2.9221071589374155e-05, "loss": 0.1697, "step": 1578 }, { "epoch": 4.952642955393987, "grad_norm": 0.3438349962234497, "learning_rate": 2.9176046825754167e-05, "loss": 0.1687, "step": 1579 }, { "epoch": 4.9557869915504025, "grad_norm": 0.35665637254714966, "learning_rate": 2.913102206213418e-05, "loss": 0.1672, "step": 1580 }, { "epoch": 4.958931027706819, "grad_norm": 0.36878228187561035, "learning_rate": 2.9085997298514185e-05, "loss": 0.1884, "step": 1581 }, { "epoch": 4.962075063863234, "grad_norm": 0.3471169173717499, "learning_rate": 2.9040972534894197e-05, "loss": 0.1799, "step": 1582 }, { "epoch": 4.965219100019651, "grad_norm": 0.337471067905426, "learning_rate": 2.8995947771274202e-05, "loss": 0.1762, "step": 1583 }, { "epoch": 4.968363136176066, "grad_norm": 0.3572784662246704, "learning_rate": 2.8950923007654214e-05, "loss": 0.1813, "step": 1584 }, { "epoch": 4.9715071723324815, "grad_norm": 0.3627830147743225, "learning_rate": 2.890589824403422e-05, "loss": 0.1638, "step": 1585 }, { "epoch": 4.974651208488898, "grad_norm": 0.3592737317085266, "learning_rate": 2.886087348041423e-05, "loss": 0.1847, "step": 1586 }, { "epoch": 4.977795244645313, "grad_norm": 0.3623504042625427, "learning_rate": 2.8815848716794236e-05, "loss": 0.1833, "step": 1587 }, { "epoch": 4.98093928080173, "grad_norm": 0.3580358922481537, "learning_rate": 2.8770823953174248e-05, "loss": 0.1587, "step": 1588 }, { "epoch": 4.984083316958145, "grad_norm": 0.35607844591140747, "learning_rate": 2.872579918955426e-05, "loss": 0.1801, "step": 1589 }, { "epoch": 4.987227353114561, "grad_norm": 0.3519856035709381, "learning_rate": 2.8680774425934265e-05, "loss": 0.1724, "step": 1590 }, { "epoch": 4.990371389270977, "grad_norm": 0.34639349579811096, "learning_rate": 2.8635749662314277e-05, "loss": 0.171, "step": 1591 }, { "epoch": 4.993515425427392, "grad_norm": 0.3384530246257782, "learning_rate": 2.8590724898694282e-05, "loss": 0.1838, "step": 1592 }, { "epoch": 4.996659461583809, "grad_norm": 0.34964290261268616, "learning_rate": 2.8545700135074294e-05, "loss": 0.18, "step": 1593 }, { "epoch": 4.999803497740224, "grad_norm": 0.35439789295196533, "learning_rate": 2.85006753714543e-05, "loss": 0.1771, "step": 1594 }, { "epoch": 5.0, "grad_norm": 1.2393293380737305, "learning_rate": 2.845565060783431e-05, "loss": 0.2431, "step": 1595 }, { "epoch": 5.0, "eval_loss": 0.3554837703704834, "eval_runtime": 99.0054, "eval_samples_per_second": 12.848, "eval_steps_per_second": 12.848, "step": 1595 } ], "logging_steps": 1, "max_steps": 2226, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.220436445700735e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }